08c3a6
commit 82a707aeb74f23bb1783af0f9e93790a2038ff7e
08c3a6
Author: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
08c3a6
Date:   Mon Jun 6 12:17:43 2022 -0700
08c3a6
08c3a6
    x86_64: Add strstr function with 512-bit EVEX
08c3a6
    
08c3a6
    Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
08c3a6
    
08c3a6
    (1) We spend a few cycles at the begining to peek into the needle. We
08c3a6
    locate an edge in the needle (first occurance of 2 consequent distinct
08c3a6
    characters) and also store the first 64-bytes into a zmm register.
08c3a6
    
08c3a6
    (2) We search for the edge in the haystack by looking into one cache
08c3a6
    line of the haystack at a time. This avoids having to read past a page
08c3a6
    boundary which can cause a seg fault.
08c3a6
    
08c3a6
    (3) If an edge is found in the haystack we first compare the first
08c3a6
    64-bytes of the needle (already stored in a zmm register) before we
08c3a6
    proceed with a full string compare performed byte by byte.
08c3a6
    
08c3a6
    Benchmarking results: (old = strstr_sse2_unaligned, new = strstr_avx512)
08c3a6
    
08c3a6
    Geometric mean of all benchmarks: new / old =  0.66
08c3a6
    
08c3a6
    Difficult skiptable(0) : new / old =  0.02
08c3a6
    Difficult skiptable(1) : new / old =  0.01
08c3a6
    Difficult 2-way : new / old =  0.25
08c3a6
    Difficult testing first 2 : new / old =  1.26
08c3a6
    Difficult skiptable(0) : new / old =  0.05
08c3a6
    Difficult skiptable(1) : new / old =  0.06
08c3a6
    Difficult 2-way : new / old =  0.26
08c3a6
    Difficult testing first 2 : new / old =  1.05
08c3a6
    Difficult skiptable(0) : new / old =  0.42
08c3a6
    Difficult skiptable(1) : new / old =  0.24
08c3a6
    Difficult 2-way : new / old =  0.21
08c3a6
    Difficult testing first 2 : new / old =  1.04
08c3a6
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
08c3a6
    
08c3a6
    (cherry picked from commit 5082a287d5e9a1f9cb98b7c982a708a3684f1d5c)
08c3a6
    
08c3a6
    x86: Remove __mmask intrinsics in strstr-avx512.c
08c3a6
    
08c3a6
    The intrinsics are not available before GCC7 and using standard
08c3a6
    operators generates code of equivalent or better quality.
08c3a6
    
08c3a6
    Removed:
08c3a6
        _cvtmask64_u64
08c3a6
        _kshiftri_mask64
08c3a6
        _kand_mask64
08c3a6
    
08c3a6
    Geometric Mean of 5 Runs of Full Benchmark Suite New / Old: 0.958
08c3a6
    
08c3a6
    (cherry picked from commit f2698954ff9c2f9626d4bcb5a30eb5729714e0b0)
08c3a6
08c3a6
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
08c3a6
index 4d4ad2a3686b5bc3..0e39e63ef6be6a86 100644
08c3a6
--- a/sysdeps/x86_64/multiarch/Makefile
08c3a6
+++ b/sysdeps/x86_64/multiarch/Makefile
08c3a6
@@ -126,6 +126,7 @@ sysdep_routines += \
08c3a6
   strrchr-sse2 \
08c3a6
   strspn-c \
08c3a6
   strspn-sse2 \
08c3a6
+  strstr-avx512 \
08c3a6
   strstr-sse2-unaligned \
08c3a6
   varshift \
08c3a6
 # sysdep_routines
08c3a6
@@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4
08c3a6
 CFLAGS-strcspn-c.c += -msse4
08c3a6
 CFLAGS-strpbrk-c.c += -msse4
08c3a6
 CFLAGS-strspn-c.c += -msse4
08c3a6
+CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
08c3a6
 endif
08c3a6
 
08c3a6
 ifeq ($(subdir),wcsmbs)
08c3a6
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
08c3a6
index 6b75a7106e174bce..043821278fdb6d8f 100644
08c3a6
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
08c3a6
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
08c3a6
@@ -633,6 +633,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
08c3a6
 
08c3a6
   /* Support sysdeps/x86_64/multiarch/strstr.c.  */
08c3a6
   IFUNC_IMPL (i, name, strstr,
08c3a6
+              IFUNC_IMPL_ADD (array, i, strstr,
08c3a6
+                              (CPU_FEATURE_USABLE (AVX512VL)
08c3a6
+                               && CPU_FEATURE_USABLE (AVX512BW)
08c3a6
+                               && CPU_FEATURE_USABLE (AVX512DQ)
08c3a6
+                               && CPU_FEATURE_USABLE (BMI2)),
08c3a6
+                              __strstr_avx512)
08c3a6
 	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
08c3a6
 	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
08c3a6
 
08c3a6
diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
08c3a6
new file mode 100644
08c3a6
index 0000000000000000..e44c1a05dc0007e5
08c3a6
--- /dev/null
08c3a6
+++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
08c3a6
@@ -0,0 +1,218 @@
08c3a6
+/* strstr optimized with 512-bit AVX-512 instructions
08c3a6
+   Copyright (C) 2022 Free Software Foundation, Inc.
08c3a6
+   This file is part of the GNU C Library.
08c3a6
+
08c3a6
+   The GNU C Library is free software; you can redistribute it and/or
08c3a6
+   modify it under the terms of the GNU Lesser General Public
08c3a6
+   License as published by the Free Software Foundation; either
08c3a6
+   version 2.1 of the License, or (at your option) any later version.
08c3a6
+
08c3a6
+   The GNU C Library is distributed in the hope that it will be useful,
08c3a6
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
08c3a6
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
08c3a6
+   Lesser General Public License for more details.
08c3a6
+
08c3a6
+   You should have received a copy of the GNU Lesser General Public
08c3a6
+   License along with the GNU C Library; if not, see
08c3a6
+   <https://www.gnu.org/licenses/>.  */
08c3a6
+
08c3a6
+#include <immintrin.h>
08c3a6
+#include <inttypes.h>
08c3a6
+#include <stdbool.h>
08c3a6
+#include <string.h>
08c3a6
+
08c3a6
+#define FULL_MMASK64 0xffffffffffffffff
08c3a6
+#define ONE_64BIT 0x1ull
08c3a6
+#define ZMM_SIZE_IN_BYTES 64
08c3a6
+#define PAGESIZE 4096
08c3a6
+
08c3a6
+#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__)
08c3a6
+#define kshiftri_mask64(x, y) ((x) >> (y))
08c3a6
+#define kand_mask64(x, y) ((x) & (y))
08c3a6
+
08c3a6
+/*
08c3a6
+ Returns the index of the first edge within the needle, returns 0 if no edge
08c3a6
+ is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
08c3a6
+ */
08c3a6
+static inline size_t
08c3a6
+find_edge_in_needle (const char *ned)
08c3a6
+{
08c3a6
+  size_t ind = 0;
08c3a6
+  while (ned[ind + 1] != '\0')
08c3a6
+    {
08c3a6
+      if (ned[ind] != ned[ind + 1])
08c3a6
+        return ind;
08c3a6
+      else
08c3a6
+        ind = ind + 1;
08c3a6
+    }
08c3a6
+  return 0;
08c3a6
+}
08c3a6
+
08c3a6
+/*
08c3a6
+ Compare needle with haystack byte by byte at specified location
08c3a6
+ */
08c3a6
+static inline bool
08c3a6
+verify_string_match (const char *hay, const size_t hay_index, const char *ned,
08c3a6
+                     size_t ind)
08c3a6
+{
08c3a6
+  while (ned[ind] != '\0')
08c3a6
+    {
08c3a6
+      if (ned[ind] != hay[hay_index + ind])
08c3a6
+        return false;
08c3a6
+      ind = ind + 1;
08c3a6
+    }
08c3a6
+  return true;
08c3a6
+}
08c3a6
+
08c3a6
+/*
08c3a6
+ Compare needle with haystack at specified location. The first 64 bytes are
08c3a6
+ compared using a ZMM register.
08c3a6
+ */
08c3a6
+static inline bool
08c3a6
+verify_string_match_avx512 (const char *hay, const size_t hay_index,
08c3a6
+                            const char *ned, const __mmask64 ned_mask,
08c3a6
+                            const __m512i ned_zmm)
08c3a6
+{
08c3a6
+  /* check first 64 bytes using zmm and then scalar */
08c3a6
+  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
08c3a6
+  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
08c3a6
+  if (match != 0x0) // failed the first few chars
08c3a6
+    return false;
08c3a6
+  else if (ned_mask == FULL_MMASK64)
08c3a6
+    return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
08c3a6
+  return true;
08c3a6
+}
08c3a6
+
08c3a6
+char *
08c3a6
+__strstr_avx512 (const char *haystack, const char *ned)
08c3a6
+{
08c3a6
+  char first = ned[0];
08c3a6
+  if (first == '\0')
08c3a6
+    return (char *)haystack;
08c3a6
+  if (ned[1] == '\0')
08c3a6
+    return (char *)strchr (haystack, ned[0]);
08c3a6
+
08c3a6
+  size_t edge = find_edge_in_needle (ned);
08c3a6
+
08c3a6
+  /* ensure haystack is as long as the pos of edge in needle */
08c3a6
+  for (int ii = 0; ii < edge; ++ii)
08c3a6
+    {
08c3a6
+      if (haystack[ii] == '\0')
08c3a6
+        return NULL;
08c3a6
+    }
08c3a6
+
08c3a6
+  /*
08c3a6
+   Load 64 bytes of the needle and save it to a zmm register
08c3a6
+   Read one cache line at a time to avoid loading across a page boundary
08c3a6
+   */
08c3a6
+  __mmask64 ned_load_mask = _bzhi_u64 (
08c3a6
+      FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63));
08c3a6
+  __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
08c3a6
+  __mmask64 ned_nullmask
08c3a6
+      = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm);
08c3a6
+
08c3a6
+  if (__glibc_unlikely (ned_nullmask == 0x0))
08c3a6
+    {
08c3a6
+      ned_zmm = _mm512_loadu_si512 (ned);
08c3a6
+      ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm);
08c3a6
+      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
08c3a6
+      if (ned_nullmask != 0x0)
08c3a6
+        ned_load_mask = ned_load_mask >> 1;
08c3a6
+    }
08c3a6
+  else
08c3a6
+    {
08c3a6
+      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
08c3a6
+      ned_load_mask = ned_load_mask >> 1;
08c3a6
+    }
08c3a6
+  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
08c3a6
+  const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
08c3a6
+
08c3a6
+  /*
08c3a6
+   Read the bytes of haystack in the current cache line
08c3a6
+   */
08c3a6
+  size_t hay_index = edge;
08c3a6
+  __mmask64 loadmask = _bzhi_u64 (
08c3a6
+      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
08c3a6
+  /* First load is a partial cache line */
08c3a6
+  __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
08c3a6
+  /* Search for NULL and compare only till null char */
08c3a6
+  uint64_t nullmask
08c3a6
+      = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
08c3a6
+  uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
08c3a6
+  cmpmask = cmpmask & cvtmask64_u64 (loadmask);
08c3a6
+  /* Search for the 2 charaters of needle */
08c3a6
+  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
08c3a6
+  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
08c3a6
+  k1 = kshiftri_mask64 (k1, 1);
08c3a6
+  /* k2 masks tell us if both chars from needle match */
08c3a6
+  uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
08c3a6
+  /* For every match, search for the entire needle for a full match */
08c3a6
+  while (k2)
08c3a6
+    {
08c3a6
+      uint64_t bitcount = _tzcnt_u64 (k2);
08c3a6
+      k2 = _blsr_u64 (k2);
08c3a6
+      size_t match_pos = hay_index + bitcount - edge;
08c3a6
+      if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
08c3a6
+          < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
08c3a6
+        {
08c3a6
+          /*
08c3a6
+           * Use vector compare as long as you are not crossing a page
08c3a6
+           */
08c3a6
+          if (verify_string_match_avx512 (haystack, match_pos, ned,
08c3a6
+                                          ned_load_mask, ned_zmm))
08c3a6
+            return (char *)haystack + match_pos;
08c3a6
+        }
08c3a6
+      else
08c3a6
+        {
08c3a6
+          if (verify_string_match (haystack, match_pos, ned, 0))
08c3a6
+            return (char *)haystack + match_pos;
08c3a6
+        }
08c3a6
+    }
08c3a6
+  /* We haven't checked for potential match at the last char yet */
08c3a6
+  haystack = (const char *)(((uintptr_t) (haystack + hay_index) | 63));
08c3a6
+  hay_index = 0;
08c3a6
+
08c3a6
+  /*
08c3a6
+   Loop over one cache line at a time to prevent reading over page
08c3a6
+   boundary
08c3a6
+   */
08c3a6
+  __m512i hay1;
08c3a6
+  while (nullmask == 0)
08c3a6
+    {
08c3a6
+      hay0 = _mm512_loadu_si512 (haystack + hay_index);
08c3a6
+      hay1 = _mm512_load_si512 (haystack + hay_index
08c3a6
+                                + 1); // Always 64 byte aligned
08c3a6
+      nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
08c3a6
+      /* Compare only till null char */
08c3a6
+      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
08c3a6
+      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
08c3a6
+      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
08c3a6
+      /* k2 masks tell us if both chars from needle match */
08c3a6
+      k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
08c3a6
+      /* For every match, compare full strings for potential match */
08c3a6
+      while (k2)
08c3a6
+        {
08c3a6
+          uint64_t bitcount = _tzcnt_u64 (k2);
08c3a6
+          k2 = _blsr_u64 (k2);
08c3a6
+          size_t match_pos = hay_index + bitcount - edge;
08c3a6
+          if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
08c3a6
+              < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
08c3a6
+            {
08c3a6
+              /*
08c3a6
+               * Use vector compare as long as you are not crossing a page
08c3a6
+               */
08c3a6
+              if (verify_string_match_avx512 (haystack, match_pos, ned,
08c3a6
+                                              ned_load_mask, ned_zmm))
08c3a6
+                return (char *)haystack + match_pos;
08c3a6
+            }
08c3a6
+          else
08c3a6
+            {
08c3a6
+              /* Compare byte by byte */
08c3a6
+              if (verify_string_match (haystack, match_pos, ned, 0))
08c3a6
+                return (char *)haystack + match_pos;
08c3a6
+            }
08c3a6
+        }
08c3a6
+      hay_index += ZMM_SIZE_IN_BYTES;
08c3a6
+    }
08c3a6
+  return NULL;
08c3a6
+}
08c3a6
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
08c3a6
index 848601bde7583ca3..9474d6234e9b62d3 100644
08c3a6
--- a/sysdeps/x86_64/multiarch/strstr.c
08c3a6
+++ b/sysdeps/x86_64/multiarch/strstr.c
08c3a6
@@ -35,16 +35,32 @@
08c3a6
 
08c3a6
 extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
08c3a6
 extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
08c3a6
+extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
08c3a6
 
08c3a6
 #include "init-arch.h"
08c3a6
 
08c3a6
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
08c3a6
    ifunc symbol properly.  */
08c3a6
 extern __typeof (__redirect_strstr) __libc_strstr;
08c3a6
-libc_ifunc (__libc_strstr,
08c3a6
-	    HAS_ARCH_FEATURE (Fast_Unaligned_Load)
08c3a6
-	    ? __strstr_sse2_unaligned
08c3a6
-	    : __strstr_sse2)
08c3a6
 
08c3a6
+static inline void *
08c3a6
+IFUNC_SELECTOR (void)
08c3a6
+{
08c3a6
+  const struct cpu_features *cpu_features = __get_cpu_features ();
08c3a6
+
08c3a6
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
08c3a6
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
08c3a6
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
08c3a6
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
08c3a6
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
08c3a6
+    return __strstr_avx512;
08c3a6
+
08c3a6
+  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
08c3a6
+    return __strstr_sse2_unaligned;
08c3a6
+
08c3a6
+  return __strstr_sse2;
08c3a6
+}
08c3a6
+
08c3a6
+libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
08c3a6
 #undef strstr
08c3a6
 strong_alias (__libc_strstr, strstr)