Blame SOURCES/ia-upd-256bit-evex-instr-1.patch

190885
From 22a1b88414d40b700c84689d08a6026e3fdee874 Mon Sep 17 00:00:00 2001
190885
From: "H.J. Lu" <hjl.tools@gmail.com>
190885
Date: Fri, 5 Mar 2021 06:24:52 -0800
190885
Subject: [PATCH] x86-64: Add ifunc-avx2.h functions with 256-bit EVEX
190885
190885
Update ifunc-avx2.h, strchr.c, strcmp.c, strncmp.c and wcsnlen.c to
190885
select the function optimized with 256-bit EVEX instructions using
190885
YMM16-YMM31 registers to avoid RTM abort with usable AVX512VL, AVX512BW
190885
and BMI2 since VZEROUPPER isn't needed at function exit.
190885
190885
For strcmp/strncmp, prefer AVX2 strcmp/strncmp if Prefer_AVX2_STRCMP
190885
is set.
190885
190885
(cherry picked from commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77)
190885
---
190885
 sysdeps/x86_64/multiarch/Makefile          |   21 +-
190885
 sysdeps/x86_64/multiarch/ifunc-avx2.h      |   14 +-
190885
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |   81 ++
190885
 sysdeps/x86_64/multiarch/memchr-evex.S     |  381 +++++++
190885
 sysdeps/x86_64/multiarch/memrchr-evex.S    |  337 +++++++
190885
 sysdeps/x86_64/multiarch/rawmemchr-evex.S  |    4 +
190885
 sysdeps/x86_64/multiarch/strchr-evex.S     |  335 +++++++
190885
 sysdeps/x86_64/multiarch/strchr.c          |   14 +-
190885
 sysdeps/x86_64/multiarch/strchrnul-evex.S  |    3 +
190885
 sysdeps/x86_64/multiarch/strcmp-evex.S     | 1043 ++++++++++++++++++++
190885
 sysdeps/x86_64/multiarch/strcmp.c          |   15 +-
190885
 sysdeps/x86_64/multiarch/strlen-evex.S     |  436 ++++++++
190885
 sysdeps/x86_64/multiarch/strncmp-evex.S    |    3 +
190885
 sysdeps/x86_64/multiarch/strncmp.c         |   15 +-
190885
 sysdeps/x86_64/multiarch/strnlen-evex.S    |    4 +
190885
 sysdeps/x86_64/multiarch/strrchr-evex.S    |  265 +++++
190885
 sysdeps/x86_64/multiarch/wcschr-evex.S     |    3 +
190885
 sysdeps/x86_64/multiarch/wcscmp-evex.S     |    4 +
190885
 sysdeps/x86_64/multiarch/wcslen-evex.S     |    4 +
190885
 sysdeps/x86_64/multiarch/wcsncmp-evex.S    |    5 +
190885
 sysdeps/x86_64/multiarch/wcsnlen-evex.S    |    5 +
190885
 sysdeps/x86_64/multiarch/wcsnlen.c         |   14 +-
190885
 sysdeps/x86_64/multiarch/wcsrchr-evex.S    |    3 +
190885
 sysdeps/x86_64/multiarch/wmemchr-evex.S    |    4 +
190885
 24 files changed, 2996 insertions(+), 17 deletions(-)
190885
 create mode 100644 sysdeps/x86_64/multiarch/memchr-evex.S
190885
 create mode 100644 sysdeps/x86_64/multiarch/memrchr-evex.S
190885
 create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex.S
190885
 create mode 100644 sysdeps/x86_64/multiarch/strchr-evex.S
190885
 create mode 100644 sysdeps/x86_64/multiarch/strchrnul-evex.S
190885
 create mode 100644 sysdeps/x86_64/multiarch/strcmp-evex.S
190885
 create mode 100644 sysdeps/x86_64/multiarch/strlen-evex.S
190885
 create mode 100644 sysdeps/x86_64/multiarch/strncmp-evex.S
190885
 create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex.S
190885
 create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex.S
190885
 create mode 100644 sysdeps/x86_64/multiarch/wcschr-evex.S
190885
 create mode 100644 sysdeps/x86_64/multiarch/wcscmp-evex.S
190885
 create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex.S
190885
 create mode 100644 sysdeps/x86_64/multiarch/wcsncmp-evex.S
190885
 create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex.S
190885
 create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex.S
190885
 create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex.S
190885
190885
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
190885
index 9477538a..5ce85882 100644
190885
--- a/sysdeps/x86_64/multiarch/Makefile
190885
+++ b/sysdeps/x86_64/multiarch/Makefile
190885
@@ -39,7 +39,17 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
190885
 		   memmove-avx512-unaligned-erms \
190885
 		   memset-sse2-unaligned-erms \
190885
 		   memset-avx2-unaligned-erms \
190885
-		   memset-avx512-unaligned-erms
190885
+		   memset-avx512-unaligned-erms \
190885
+		   memchr-evex \
190885
+		   memrchr-evex \
190885
+		   rawmemchr-evex \
190885
+		   strchr-evex \
190885
+		   strchrnul-evex \
190885
+		   strcmp-evex \
190885
+		   strlen-evex \
190885
+		   strncmp-evex \
190885
+		   strnlen-evex \
190885
+		   strrchr-evex
190885
 CFLAGS-varshift.c += -msse4
190885
 CFLAGS-strcspn-c.c += -msse4
190885
 CFLAGS-strpbrk-c.c += -msse4
190885
@@ -56,7 +66,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
190885
 		   wcschr-sse2 wcschr-avx2 \
190885
 		   wcsrchr-sse2 wcsrchr-avx2 \
190885
 		   wcsnlen-sse4_1 wcsnlen-c \
190885
-		   wcslen-sse2 wcslen-avx2 wcsnlen-avx2
190885
+		   wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
190885
+		   wcschr-evex \
190885
+		   wcscmp-evex \
190885
+		   wcslen-evex \
190885
+		   wcsncmp-evex \
190885
+		   wcsnlen-evex \
190885
+		   wcsrchr-evex \
190885
+		   wmemchr-evex
190885
 endif
190885
 
190885
 ifeq ($(subdir),debug)
190885
diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
190885
index 5c88640a..7081b0c9 100644
190885
--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
190885
+++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
190885
@@ -21,16 +21,24 @@
190885
 
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
190885
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
190885
 
190885
 static inline void *
190885
 IFUNC_SELECTOR (void)
190885
 {
190885
   const struct cpu_features* cpu_features = __get_cpu_features ();
190885
 
190885
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
190885
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
190885
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
190885
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
190885
-    return OPTIMIZE (avx2);
190885
+    {
190885
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
190885
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
190885
+	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
190885
+	return OPTIMIZE (evex);
190885
+
190885
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
190885
+	return OPTIMIZE (avx2);
190885
+    }
190885
 
190885
   return OPTIMIZE (sse2);
190885
 }
190885
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
190885
index fe13505c..bd7d9f19 100644
190885
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
190885
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
190885
@@ -43,6 +43,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, memchr,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __memchr_avx2)
190885
+	      IFUNC_IMPL_ADD (array, i, memchr,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)
190885
+			       && CPU_FEATURE_USABLE (BMI2)),
190885
+			      __memchr_evex)
190885
 	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
190885
 
190885
   /* Support sysdeps/x86_64/multiarch/memcmp.c.  */
190885
@@ -121,6 +126,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, memrchr,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __memrchr_avx2)
190885
+	      IFUNC_IMPL_ADD (array, i, memrchr,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)),
190885
+			      __memrchr_evex)
190885
+
190885
 	      IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2))
190885
 
190885
 #ifdef SHARED
190885
@@ -179,6 +189,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __rawmemchr_avx2)
190885
+	      IFUNC_IMPL_ADD (array, i, rawmemchr,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)
190885
+			       && CPU_FEATURE_USABLE (BMI2)),
190885
+			      __rawmemchr_evex)
190885
 	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
190885
 
190885
   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
190885
@@ -186,6 +201,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, strlen,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __strlen_avx2)
190885
+	      IFUNC_IMPL_ADD (array, i, strlen,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)),
190885
+			      __strlen_evex)
190885
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
190885
 
190885
   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
190885
@@ -193,6 +212,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, strnlen,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __strnlen_avx2)
190885
+	      IFUNC_IMPL_ADD (array, i, strnlen,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)),
190885
+			      __strnlen_evex)
190885
 	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
190885
 
190885
   /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
190885
@@ -255,6 +278,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, strchr,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __strchr_avx2)
190885
+	      IFUNC_IMPL_ADD (array, i, strchr,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)
190885
+			       && CPU_FEATURE_USABLE (BMI2)),
190885
+			      __strchr_evex)
190885
 	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
190885
 	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
190885
 
190885
@@ -263,6 +291,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, strchrnul,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __strchrnul_avx2)
190885
+	      IFUNC_IMPL_ADD (array, i, strchrnul,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)
190885
+			       && CPU_FEATURE_USABLE (BMI2)),
190885
+			      __strchrnul_evex)
190885
 	      IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2))
190885
 
190885
   /* Support sysdeps/x86_64/multiarch/strrchr.c.  */
190885
@@ -270,6 +303,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, strrchr,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __strrchr_avx2)
190885
+	      IFUNC_IMPL_ADD (array, i, strrchr,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)),
190885
+			      __strrchr_evex)
190885
 	      IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2))
190885
 
190885
   /* Support sysdeps/x86_64/multiarch/strcmp.c.  */
190885
@@ -277,6 +314,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, strcmp,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __strcmp_avx2)
190885
+	      IFUNC_IMPL_ADD (array, i, strcmp,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)
190885
+			       && CPU_FEATURE_USABLE (BMI2)),
190885
+			      __strcmp_evex)
190885
 	      IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2),
190885
 			      __strcmp_sse42)
190885
 	      IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3),
190885
@@ -370,6 +412,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, wcschr,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __wcschr_avx2)
190885
+	      IFUNC_IMPL_ADD (array, i, wcschr,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)
190885
+			       && CPU_FEATURE_USABLE (BMI2)),
190885
+			      __wcschr_evex)
190885
 	      IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2))
190885
 
190885
   /* Support sysdeps/x86_64/multiarch/wcsrchr.c.  */
190885
@@ -377,6 +424,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, wcsrchr,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __wcsrchr_avx2)
190885
+	      IFUNC_IMPL_ADD (array, i, wcsrchr,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)
190885
+			       && CPU_FEATURE_USABLE (BMI2)),
190885
+			      __wcsrchr_evex)
190885
 	      IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2))
190885
 
190885
   /* Support sysdeps/x86_64/multiarch/wcscmp.c.  */
190885
@@ -384,6 +436,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, wcscmp,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __wcscmp_avx2)
190885
+	      IFUNC_IMPL_ADD (array, i, wcscmp,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)
190885
+			       && CPU_FEATURE_USABLE (BMI2)),
190885
+			      __wcscmp_evex)
190885
 	      IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_sse2))
190885
 
190885
   /* Support sysdeps/x86_64/multiarch/wcsncmp.c.  */
190885
@@ -391,6 +448,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, wcsncmp,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __wcsncmp_avx2)
190885
+	      IFUNC_IMPL_ADD (array, i, wcsncmp,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)
190885
+			       && CPU_FEATURE_USABLE (BMI2)),
190885
+			      __wcsncmp_evex)
190885
 	      IFUNC_IMPL_ADD (array, i, wcsncmp, 1, __wcsncmp_sse2))
190885
 
190885
   /* Support sysdeps/x86_64/multiarch/wcscpy.c.  */
190885
@@ -404,6 +466,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, wcslen,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __wcslen_avx2)
190885
+	      IFUNC_IMPL_ADD (array, i, wcslen,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)
190885
+			       && CPU_FEATURE_USABLE (BMI2)),
190885
+			      __wcslen_evex)
190885
 	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
190885
 
190885
   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
190885
@@ -411,6 +478,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __wcsnlen_avx2)
190885
+	      IFUNC_IMPL_ADD (array, i, wcsnlen,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)
190885
+			       && CPU_FEATURE_USABLE (BMI2)),
190885
+			      __wcsnlen_evex)
190885
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
190885
 			      CPU_FEATURE_USABLE (SSE4_1),
190885
 			      __wcsnlen_sse4_1)
190885
@@ -421,6 +493,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, wmemchr,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __wmemchr_avx2)
190885
+	      IFUNC_IMPL_ADD (array, i, wmemchr,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)
190885
+			       && CPU_FEATURE_USABLE (BMI2)),
190885
+			      __wmemchr_evex)
190885
 	      IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
190885
 
190885
   /* Support sysdeps/x86_64/multiarch/wmemcmp.c.  */
190885
@@ -568,6 +645,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 	      IFUNC_IMPL_ADD (array, i, strncmp,
190885
 			      CPU_FEATURE_USABLE (AVX2),
190885
 			      __strncmp_avx2)
190885
+	      IFUNC_IMPL_ADD (array, i, strncmp,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)),
190885
+			      __strncmp_evex)
190885
 	      IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2),
190885
 			      __strncmp_sse42)
190885
 	      IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3),
190885
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
190885
new file mode 100644
190885
index 00000000..6dd5d67b
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
190885
@@ -0,0 +1,381 @@
190885
+/* memchr/wmemchr optimized with 256-bit EVEX instructions.
190885
+   Copyright (C) 2021 Free Software Foundation, Inc.
190885
+   This file is part of the GNU C Library.
190885
+
190885
+   The GNU C Library is free software; you can redistribute it and/or
190885
+   modify it under the terms of the GNU Lesser General Public
190885
+   License as published by the Free Software Foundation; either
190885
+   version 2.1 of the License, or (at your option) any later version.
190885
+
190885
+   The GNU C Library is distributed in the hope that it will be useful,
190885
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
190885
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
190885
+   Lesser General Public License for more details.
190885
+
190885
+   You should have received a copy of the GNU Lesser General Public
190885
+   License along with the GNU C Library; if not, see
190885
+   <https://www.gnu.org/licenses/>.  */
190885
+
190885
+#if IS_IN (libc)
190885
+
190885
+# include <sysdep.h>
190885
+
190885
+# ifndef MEMCHR
190885
+#  define MEMCHR	__memchr_evex
190885
+# endif
190885
+
190885
+# ifdef USE_AS_WMEMCHR
190885
+#  define VPBROADCAST	vpbroadcastd
190885
+#  define VPCMP		vpcmpd
190885
+#  define SHIFT_REG	r8d
190885
+# else
190885
+#  define VPBROADCAST	vpbroadcastb
190885
+#  define VPCMP		vpcmpb
190885
+#  define SHIFT_REG	ecx
190885
+# endif
190885
+
190885
+# define XMMMATCH	xmm16
190885
+# define YMMMATCH	ymm16
190885
+# define YMM1		ymm17
190885
+# define YMM2		ymm18
190885
+# define YMM3		ymm19
190885
+# define YMM4		ymm20
190885
+# define YMM5		ymm21
190885
+# define YMM6		ymm22
190885
+
190885
+# define VEC_SIZE 32
190885
+
190885
+	.section .text.evex,"ax",@progbits
190885
+ENTRY (MEMCHR)
190885
+# ifndef USE_AS_RAWMEMCHR
190885
+	/* Check for zero length.  */
190885
+	test	%RDX_LP, %RDX_LP
190885
+	jz	L(zero)
190885
+# endif
190885
+	movl	%edi, %ecx
190885
+# ifdef USE_AS_WMEMCHR
190885
+	shl	$2, %RDX_LP
190885
+# else
190885
+#  ifdef __ILP32__
190885
+	/* Clear the upper 32 bits.  */
190885
+	movl	%edx, %edx
190885
+#  endif
190885
+# endif
190885
+	/* Broadcast CHAR to YMMMATCH.  */
190885
+	VPBROADCAST %esi, %YMMMATCH
190885
+	/* Check if we may cross page boundary with one vector load.  */
190885
+	andl	$(2 * VEC_SIZE - 1), %ecx
190885
+	cmpl	$VEC_SIZE, %ecx
190885
+	ja	L(cros_page_boundary)
190885
+
190885
+	/* Check the first VEC_SIZE bytes.  */
190885
+	VPCMP	$0, (%rdi), %YMMMATCH, %k1
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+
190885
+# ifndef USE_AS_RAWMEMCHR
190885
+	jnz	L(first_vec_x0_check)
190885
+	/* Adjust length and check the end of data.  */
190885
+	subq	$VEC_SIZE, %rdx
190885
+	jbe	L(zero)
190885
+# else
190885
+	jnz	L(first_vec_x0)
190885
+# endif
190885
+
190885
+	/* Align data for aligned loads in the loop.  */
190885
+	addq	$VEC_SIZE, %rdi
190885
+	andl	$(VEC_SIZE - 1), %ecx
190885
+	andq	$-VEC_SIZE, %rdi
190885
+
190885
+# ifndef USE_AS_RAWMEMCHR
190885
+	/* Adjust length.  */
190885
+	addq	%rcx, %rdx
190885
+
190885
+	subq	$(VEC_SIZE * 4), %rdx
190885
+	jbe	L(last_4x_vec_or_less)
190885
+# endif
190885
+	jmp	L(more_4x_vec)
190885
+
190885
+	.p2align 4
190885
+L(cros_page_boundary):
190885
+	andl	$(VEC_SIZE - 1), %ecx
190885
+# ifdef USE_AS_WMEMCHR
190885
+	/* NB: Divide shift count by 4 since each bit in K1 represent 4
190885
+	   bytes.  */
190885
+	movl	%ecx, %SHIFT_REG
190885
+	sarl	$2, %SHIFT_REG
190885
+# endif
190885
+	andq	$-VEC_SIZE, %rdi
190885
+	VPCMP	$0, (%rdi), %YMMMATCH, %k1
190885
+	kmovd	%k1, %eax
190885
+	/* Remove the leading bytes.  */
190885
+	sarxl	%SHIFT_REG, %eax, %eax
190885
+	testl	%eax, %eax
190885
+	jz	L(aligned_more)
190885
+	tzcntl	%eax, %eax
190885
+# ifdef USE_AS_WMEMCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %eax
190885
+# endif
190885
+# ifndef USE_AS_RAWMEMCHR
190885
+	/* Check the end of data.  */
190885
+	cmpq	%rax, %rdx
190885
+	jbe	L(zero)
190885
+# endif
190885
+	addq	%rdi, %rax
190885
+	addq	%rcx, %rax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(aligned_more):
190885
+# ifndef USE_AS_RAWMEMCHR
190885
+        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
190885
+	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
190885
+	   overflow.  */
190885
+	negq	%rcx
190885
+	addq	$VEC_SIZE, %rcx
190885
+
190885
+	/* Check the end of data.  */
190885
+	subq	%rcx, %rdx
190885
+	jbe	L(zero)
190885
+# endif
190885
+
190885
+	addq	$VEC_SIZE, %rdi
190885
+
190885
+# ifndef USE_AS_RAWMEMCHR
190885
+	subq	$(VEC_SIZE * 4), %rdx
190885
+	jbe	L(last_4x_vec_or_less)
190885
+# endif
190885
+
190885
+L(more_4x_vec):
190885
+	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
190885
+	   since data is only aligned to VEC_SIZE.  */
190885
+	VPCMP	$0, (%rdi), %YMMMATCH, %k1
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x0)
190885
+
190885
+	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x1)
190885
+
190885
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x2)
190885
+
190885
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x3)
190885
+
190885
+	addq	$(VEC_SIZE * 4), %rdi
190885
+
190885
+# ifndef USE_AS_RAWMEMCHR
190885
+	subq	$(VEC_SIZE * 4), %rdx
190885
+	jbe	L(last_4x_vec_or_less)
190885
+# endif
190885
+
190885
+	/* Align data to 4 * VEC_SIZE.  */
190885
+	movq	%rdi, %rcx
190885
+	andl	$(4 * VEC_SIZE - 1), %ecx
190885
+	andq	$-(4 * VEC_SIZE), %rdi
190885
+
190885
+# ifndef USE_AS_RAWMEMCHR
190885
+	/* Adjust length.  */
190885
+	addq	%rcx, %rdx
190885
+# endif
190885
+
190885
+	.p2align 4
190885
+L(loop_4x_vec):
190885
+	/* Compare 4 * VEC at a time forward.  */
190885
+	VPCMP	$0, (%rdi), %YMMMATCH, %k1
190885
+	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
190885
+	kord	%k1, %k2, %k5
190885
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
190885
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
190885
+
190885
+	kord	%k3, %k4, %k6
190885
+	kortestd %k5, %k6
190885
+	jnz	L(4x_vec_end)
190885
+
190885
+	addq	$(VEC_SIZE * 4), %rdi
190885
+
190885
+# ifdef USE_AS_RAWMEMCHR
190885
+	jmp	L(loop_4x_vec)
190885
+# else
190885
+	subq	$(VEC_SIZE * 4), %rdx
190885
+	ja	L(loop_4x_vec)
190885
+
190885
+L(last_4x_vec_or_less):
190885
+	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
190885
+	addl	$(VEC_SIZE * 2), %edx
190885
+	jle	L(last_2x_vec)
190885
+
190885
+	VPCMP	$0, (%rdi), %YMMMATCH, %k1
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x0)
190885
+
190885
+	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x1)
190885
+
190885
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+
190885
+	jnz	L(first_vec_x2_check)
190885
+	subl	$VEC_SIZE, %edx
190885
+	jle	L(zero)
190885
+
190885
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+
190885
+	jnz	L(first_vec_x3_check)
190885
+	xorl	%eax, %eax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(last_2x_vec):
190885
+	addl	$(VEC_SIZE * 2), %edx
190885
+	VPCMP	$0, (%rdi), %YMMMATCH, %k1
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+
190885
+	jnz	L(first_vec_x0_check)
190885
+	subl	$VEC_SIZE, %edx
190885
+	jle	L(zero)
190885
+
190885
+	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x1_check)
190885
+	xorl	%eax, %eax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(first_vec_x0_check):
190885
+	tzcntl	%eax, %eax
190885
+# ifdef USE_AS_WMEMCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %eax
190885
+# endif
190885
+	/* Check the end of data.  */
190885
+	cmpq	%rax, %rdx
190885
+	jbe	L(zero)
190885
+	addq	%rdi, %rax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(first_vec_x1_check):
190885
+	tzcntl	%eax, %eax
190885
+# ifdef USE_AS_WMEMCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %eax
190885
+# endif
190885
+	/* Check the end of data.  */
190885
+	cmpq	%rax, %rdx
190885
+	jbe	L(zero)
190885
+	addq	$VEC_SIZE, %rax
190885
+	addq	%rdi, %rax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(first_vec_x2_check):
190885
+	tzcntl	%eax, %eax
190885
+# ifdef USE_AS_WMEMCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %eax
190885
+# endif
190885
+	/* Check the end of data.  */
190885
+	cmpq	%rax, %rdx
190885
+	jbe	L(zero)
190885
+	addq	$(VEC_SIZE * 2), %rax
190885
+	addq	%rdi, %rax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(first_vec_x3_check):
190885
+	tzcntl	%eax, %eax
190885
+# ifdef USE_AS_WMEMCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %eax
190885
+# endif
190885
+	/* Check the end of data.  */
190885
+	cmpq	%rax, %rdx
190885
+	jbe	L(zero)
190885
+	addq	$(VEC_SIZE * 3), %rax
190885
+	addq	%rdi, %rax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(zero):
190885
+	xorl	%eax, %eax
190885
+	ret
190885
+# endif
190885
+
190885
+	.p2align 4
190885
+L(first_vec_x0):
190885
+	tzcntl	%eax, %eax
190885
+# ifdef USE_AS_WMEMCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	leaq	(%rdi, %rax, 4), %rax
190885
+# else
190885
+	addq	%rdi, %rax
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(first_vec_x1):
190885
+	tzcntl	%eax, %eax
190885
+# ifdef USE_AS_WMEMCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
190885
+# else
190885
+	addq	$VEC_SIZE, %rax
190885
+	addq	%rdi, %rax
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(first_vec_x2):
190885
+	tzcntl	%eax, %eax
190885
+# ifdef USE_AS_WMEMCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
190885
+# else
190885
+	addq	$(VEC_SIZE * 2), %rax
190885
+	addq	%rdi, %rax
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(4x_vec_end):
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x0)
190885
+	kmovd	%k2, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x1)
190885
+	kmovd	%k3, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x2)
190885
+	kmovd	%k4, %eax
190885
+	testl	%eax, %eax
190885
+L(first_vec_x3):
190885
+	tzcntl	%eax, %eax
190885
+# ifdef USE_AS_WMEMCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
190885
+# else
190885
+	addq	$(VEC_SIZE * 3), %rax
190885
+	addq	%rdi, %rax
190885
+# endif
190885
+	ret
190885
+
190885
+END (MEMCHR)
190885
+#endif
190885
diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
190885
new file mode 100644
190885
index 00000000..16bf8e02
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
190885
@@ -0,0 +1,337 @@
190885
+/* memrchr optimized with 256-bit EVEX instructions.
190885
+   Copyright (C) 2021 Free Software Foundation, Inc.
190885
+   This file is part of the GNU C Library.
190885
+
190885
+   The GNU C Library is free software; you can redistribute it and/or
190885
+   modify it under the terms of the GNU Lesser General Public
190885
+   License as published by the Free Software Foundation; either
190885
+   version 2.1 of the License, or (at your option) any later version.
190885
+
190885
+   The GNU C Library is distributed in the hope that it will be useful,
190885
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
190885
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
190885
+   Lesser General Public License for more details.
190885
+
190885
+   You should have received a copy of the GNU Lesser General Public
190885
+   License along with the GNU C Library; if not, see
190885
+   <https://www.gnu.org/licenses/>.  */
190885
+
190885
+#if IS_IN (libc)
190885
+
190885
+# include <sysdep.h>
190885
+
190885
+# define VMOVA		vmovdqa64
190885
+
190885
+# define YMMMATCH	ymm16
190885
+
190885
+# define VEC_SIZE 32
190885
+
190885
+	.section .text.evex,"ax",@progbits
190885
+ENTRY (__memrchr_evex)
190885
+	/* Broadcast CHAR to YMMMATCH.  */
190885
+	vpbroadcastb %esi, %YMMMATCH
190885
+
190885
+	sub	$VEC_SIZE, %RDX_LP
190885
+	jbe	L(last_vec_or_less)
190885
+
190885
+	add	%RDX_LP, %RDI_LP
190885
+
190885
+	/* Check the last VEC_SIZE bytes.  */
190885
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(last_vec_x0)
190885
+
190885
+	subq	$(VEC_SIZE * 4), %rdi
190885
+	movl	%edi, %ecx
190885
+	andl	$(VEC_SIZE - 1), %ecx
190885
+	jz	L(aligned_more)
190885
+
190885
+	/* Align data for aligned loads in the loop.  */
190885
+	addq	$VEC_SIZE, %rdi
190885
+	addq	$VEC_SIZE, %rdx
190885
+	andq	$-VEC_SIZE, %rdi
190885
+	subq	%rcx, %rdx
190885
+
190885
+	.p2align 4
190885
+L(aligned_more):
190885
+	subq	$(VEC_SIZE * 4), %rdx
190885
+	jbe	L(last_4x_vec_or_less)
190885
+
190885
+	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
190885
+	   since data is only aligned to VEC_SIZE.  */
190885
+	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(last_vec_x3)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
190885
+	kmovd	%k2, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(last_vec_x2)
190885
+
190885
+	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
190885
+	kmovd	%k3, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(last_vec_x1)
190885
+
190885
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
190885
+	kmovd	%k4, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(last_vec_x0)
190885
+
190885
+	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
190885
+	   There are some overlaps with above if data isn't aligned
190885
+	   to 4 * VEC_SIZE.  */
190885
+	movl	%edi, %ecx
190885
+	andl	$(VEC_SIZE * 4 - 1), %ecx
190885
+	jz	L(loop_4x_vec)
190885
+
190885
+	addq	$(VEC_SIZE * 4), %rdi
190885
+	addq	$(VEC_SIZE * 4), %rdx
190885
+	andq	$-(VEC_SIZE * 4), %rdi
190885
+	subq	%rcx, %rdx
190885
+
190885
+	.p2align 4
190885
+L(loop_4x_vec):
190885
+	/* Compare 4 * VEC at a time forward.  */
190885
+	subq	$(VEC_SIZE * 4), %rdi
190885
+	subq	$(VEC_SIZE * 4), %rdx
190885
+	jbe	L(last_4x_vec_or_less)
190885
+
190885
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
190885
+	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
190885
+	kord	%k1, %k2, %k5
190885
+	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
190885
+	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
190885
+
190885
+	kord	%k3, %k4, %k6
190885
+	kortestd %k5, %k6
190885
+	jz	L(loop_4x_vec)
190885
+
190885
+	/* There is a match.  */
190885
+	kmovd	%k4, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(last_vec_x3)
190885
+
190885
+	kmovd	%k3, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(last_vec_x2)
190885
+
190885
+	kmovd	%k2, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(last_vec_x1)
190885
+
190885
+	kmovd	%k1, %eax
190885
+	bsrl	%eax, %eax
190885
+	addq	%rdi, %rax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(last_4x_vec_or_less):
190885
+	addl	$(VEC_SIZE * 4), %edx
190885
+	cmpl	$(VEC_SIZE * 2), %edx
190885
+	jbe	L(last_2x_vec)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(last_vec_x3)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
190885
+	kmovd	%k2, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(last_vec_x2)
190885
+
190885
+	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
190885
+	kmovd	%k3, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(last_vec_x1_check)
190885
+	cmpl	$(VEC_SIZE * 3), %edx
190885
+	jbe	L(zero)
190885
+
190885
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
190885
+	kmovd	%k4, %eax
190885
+	testl	%eax, %eax
190885
+	jz	L(zero)
190885
+	bsrl	%eax, %eax
190885
+	subq	$(VEC_SIZE * 4), %rdx
190885
+	addq	%rax, %rdx
190885
+	jl	L(zero)
190885
+	addq	%rdi, %rax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(last_2x_vec):
190885
+	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(last_vec_x3_check)
190885
+	cmpl	$VEC_SIZE, %edx
190885
+	jbe	L(zero)
190885
+
190885
+	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+	jz	L(zero)
190885
+	bsrl	%eax, %eax
190885
+	subq	$(VEC_SIZE * 2), %rdx
190885
+	addq	%rax, %rdx
190885
+	jl	L(zero)
190885
+	addl	$(VEC_SIZE * 2), %eax
190885
+	addq	%rdi, %rax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(last_vec_x0):
190885
+	bsrl	%eax, %eax
190885
+	addq	%rdi, %rax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(last_vec_x1):
190885
+	bsrl	%eax, %eax
190885
+	addl	$VEC_SIZE, %eax
190885
+	addq	%rdi, %rax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(last_vec_x2):
190885
+	bsrl	%eax, %eax
190885
+	addl	$(VEC_SIZE * 2), %eax
190885
+	addq	%rdi, %rax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(last_vec_x3):
190885
+	bsrl	%eax, %eax
190885
+	addl	$(VEC_SIZE * 3), %eax
190885
+	addq	%rdi, %rax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(last_vec_x1_check):
190885
+	bsrl	%eax, %eax
190885
+	subq	$(VEC_SIZE * 3), %rdx
190885
+	addq	%rax, %rdx
190885
+	jl	L(zero)
190885
+	addl	$VEC_SIZE, %eax
190885
+	addq	%rdi, %rax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(last_vec_x3_check):
190885
+	bsrl	%eax, %eax
190885
+	subq	$VEC_SIZE, %rdx
190885
+	addq	%rax, %rdx
190885
+	jl	L(zero)
190885
+	addl	$(VEC_SIZE * 3), %eax
190885
+	addq	%rdi, %rax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(zero):
190885
+	xorl	%eax, %eax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(last_vec_or_less_aligned):
190885
+	movl	%edx, %ecx
190885
+
190885
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
190885
+
190885
+	movl	$1, %edx
190885
+	/* Support rdx << 32.  */
190885
+	salq	%cl, %rdx
190885
+	subq	$1, %rdx
190885
+
190885
+	kmovd	%k1, %eax
190885
+
190885
+	/* Remove the trailing bytes.  */
190885
+	andl	%edx, %eax
190885
+	testl	%eax, %eax
190885
+	jz	L(zero)
190885
+
190885
+	bsrl	%eax, %eax
190885
+	addq	%rdi, %rax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(last_vec_or_less):
190885
+	addl	$VEC_SIZE, %edx
190885
+
190885
+	/* Check for zero length.  */
190885
+	testl	%edx, %edx
190885
+	jz	L(zero)
190885
+
190885
+	movl	%edi, %ecx
190885
+	andl	$(VEC_SIZE - 1), %ecx
190885
+	jz	L(last_vec_or_less_aligned)
190885
+
190885
+	movl	%ecx, %esi
190885
+	movl	%ecx, %r8d
190885
+	addl	%edx, %esi
190885
+	andq	$-VEC_SIZE, %rdi
190885
+
190885
+	subl	$VEC_SIZE, %esi
190885
+	ja	L(last_vec_2x_aligned)
190885
+
190885
+	/* Check the last VEC.  */
190885
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
190885
+	kmovd	%k1, %eax
190885
+
190885
+	/* Remove the leading and trailing bytes.  */
190885
+	sarl	%cl, %eax
190885
+	movl	%edx, %ecx
190885
+
190885
+	movl	$1, %edx
190885
+	sall	%cl, %edx
190885
+	subl	$1, %edx
190885
+
190885
+	andl	%edx, %eax
190885
+	testl	%eax, %eax
190885
+	jz	L(zero)
190885
+
190885
+	bsrl	%eax, %eax
190885
+	addq	%rdi, %rax
190885
+	addq	%r8, %rax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(last_vec_2x_aligned):
190885
+	movl	%esi, %ecx
190885
+
190885
+	/* Check the last VEC.  */
190885
+	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
190885
+
190885
+	movl	$1, %edx
190885
+	sall	%cl, %edx
190885
+	subl	$1, %edx
190885
+
190885
+	kmovd	%k1, %eax
190885
+
190885
+	/* Remove the trailing bytes.  */
190885
+	andl	%edx, %eax
190885
+
190885
+	testl	%eax, %eax
190885
+	jnz	L(last_vec_x1)
190885
+
190885
+	/* Check the second last VEC.  */
190885
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
190885
+
190885
+	movl	%r8d, %ecx
190885
+
190885
+	kmovd	%k1, %eax
190885
+
190885
+	/* Remove the leading bytes.  Must use unsigned right shift for
190885
+	   bsrl below.  */
190885
+	shrl	%cl, %eax
190885
+	testl	%eax, %eax
190885
+	jz	L(zero)
190885
+
190885
+	bsrl	%eax, %eax
190885
+	addq	%rdi, %rax
190885
+	addq	%r8, %rax
190885
+	ret
190885
+END (__memrchr_evex)
190885
+#endif
190885
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
190885
new file mode 100644
190885
index 00000000..ec942b77
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
190885
@@ -0,0 +1,4 @@
190885
+#define MEMCHR __rawmemchr_evex
190885
+#define USE_AS_RAWMEMCHR 1
190885
+
190885
+#include "memchr-evex.S"
190885
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
190885
new file mode 100644
190885
index 00000000..ddc86a70
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
190885
@@ -0,0 +1,335 @@
190885
+/* strchr/strchrnul optimized with 256-bit EVEX instructions.
190885
+   Copyright (C) 2021 Free Software Foundation, Inc.
190885
+   This file is part of the GNU C Library.
190885
+
190885
+   The GNU C Library is free software; you can redistribute it and/or
190885
+   modify it under the terms of the GNU Lesser General Public
190885
+   License as published by the Free Software Foundation; either
190885
+   version 2.1 of the License, or (at your option) any later version.
190885
+
190885
+   The GNU C Library is distributed in the hope that it will be useful,
190885
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
190885
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
190885
+   Lesser General Public License for more details.
190885
+
190885
+   You should have received a copy of the GNU Lesser General Public
190885
+   License along with the GNU C Library; if not, see
190885
+   <https://www.gnu.org/licenses/>.  */
190885
+
190885
+#if IS_IN (libc)
190885
+
190885
+# include <sysdep.h>
190885
+
190885
+# ifndef STRCHR
190885
+#  define STRCHR	__strchr_evex
190885
+# endif
190885
+
190885
+# define VMOVU		vmovdqu64
190885
+# define VMOVA		vmovdqa64
190885
+
190885
+# ifdef USE_AS_WCSCHR
190885
+#  define VPBROADCAST	vpbroadcastd
190885
+#  define VPCMP		vpcmpd
190885
+#  define VPMINU	vpminud
190885
+#  define CHAR_REG	esi
190885
+#  define SHIFT_REG	r8d
190885
+# else
190885
+#  define VPBROADCAST	vpbroadcastb
190885
+#  define VPCMP		vpcmpb
190885
+#  define VPMINU	vpminub
190885
+#  define CHAR_REG	sil
190885
+#  define SHIFT_REG	ecx
190885
+# endif
190885
+
190885
+# define XMMZERO	xmm16
190885
+
190885
+# define YMMZERO	ymm16
190885
+# define YMM0		ymm17
190885
+# define YMM1		ymm18
190885
+# define YMM2		ymm19
190885
+# define YMM3		ymm20
190885
+# define YMM4		ymm21
190885
+# define YMM5		ymm22
190885
+# define YMM6		ymm23
190885
+# define YMM7		ymm24
190885
+# define YMM8		ymm25
190885
+
190885
+# define VEC_SIZE 32
190885
+# define PAGE_SIZE 4096
190885
+
190885
+	.section .text.evex,"ax",@progbits
190885
+ENTRY (STRCHR)
190885
+	movl	%edi, %ecx
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	xorl	%edx, %edx
190885
+# endif
190885
+
190885
+	/* Broadcast CHAR to YMM0.	*/
190885
+	VPBROADCAST %esi, %YMM0
190885
+
190885
+	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
190885
+
190885
+	/* Check if we cross page boundary with one vector load.  */
190885
+	andl	$(PAGE_SIZE - 1), %ecx
190885
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
190885
+	ja  L(cross_page_boundary)
190885
+
190885
+	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
190885
+	   null bytes.  */
190885
+	VMOVU	(%rdi), %YMM1
190885
+
190885
+	/* Leaves only CHARS matching esi as 0.  */
190885
+	vpxorq	%YMM1, %YMM0, %YMM2
190885
+	VPMINU	%YMM2, %YMM1, %YMM2
190885
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
190885
+	ktestd	%k0, %k0
190885
+	jz	L(more_vecs)
190885
+	kmovd	%k0, %eax
190885
+	tzcntl	%eax, %eax
190885
+	/* Found CHAR or the null byte.	 */
190885
+# ifdef USE_AS_WCSCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	leaq	(%rdi, %rax, 4), %rax
190885
+# else
190885
+	addq	%rdi, %rax
190885
+# endif
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	cmp (%rax), %CHAR_REG
190885
+	cmovne	%rdx, %rax
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(more_vecs):
190885
+	/* Align data for aligned loads in the loop.  */
190885
+	andq	$-VEC_SIZE, %rdi
190885
+L(aligned_more):
190885
+
190885
+	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
190885
+	   since data is only aligned to VEC_SIZE.	*/
190885
+	VMOVA	VEC_SIZE(%rdi), %YMM1
190885
+	addq	$VEC_SIZE, %rdi
190885
+
190885
+	/* Leaves only CHARS matching esi as 0.  */
190885
+	vpxorq	%YMM1, %YMM0, %YMM2
190885
+	VPMINU	%YMM2, %YMM1, %YMM2
190885
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x0)
190885
+
190885
+	VMOVA	VEC_SIZE(%rdi), %YMM1
190885
+	/* Leaves only CHARS matching esi as 0.  */
190885
+	vpxorq	%YMM1, %YMM0, %YMM2
190885
+	VPMINU	%YMM2, %YMM1, %YMM2
190885
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x1)
190885
+
190885
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
190885
+	/* Leaves only CHARS matching esi as 0.  */
190885
+	vpxorq	%YMM1, %YMM0, %YMM2
190885
+	VPMINU	%YMM2, %YMM1, %YMM2
190885
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x2)
190885
+
190885
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
190885
+	/* Leaves only CHARS matching esi as 0.  */
190885
+	vpxorq	%YMM1, %YMM0, %YMM2
190885
+	VPMINU	%YMM2, %YMM1, %YMM2
190885
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
190885
+	ktestd	%k0, %k0
190885
+	jz	L(prep_loop_4x)
190885
+
190885
+	kmovd	%k0, %eax
190885
+	tzcntl	%eax, %eax
190885
+	/* Found CHAR or the null byte.	 */
190885
+# ifdef USE_AS_WCSCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
190885
+# else
190885
+	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
190885
+# endif
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	cmp (%rax), %CHAR_REG
190885
+	cmovne	%rdx, %rax
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(first_vec_x0):
190885
+	tzcntl	%eax, %eax
190885
+	/* Found CHAR or the null byte.	 */
190885
+# ifdef USE_AS_WCSCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	leaq	(%rdi, %rax, 4), %rax
190885
+# else
190885
+	addq	%rdi, %rax
190885
+# endif
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	cmp (%rax), %CHAR_REG
190885
+	cmovne	%rdx, %rax
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(first_vec_x1):
190885
+	tzcntl	%eax, %eax
190885
+	/* Found CHAR or the null byte.	 */
190885
+# ifdef USE_AS_WCSCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
190885
+# else
190885
+	leaq	VEC_SIZE(%rdi, %rax), %rax
190885
+# endif
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	cmp (%rax), %CHAR_REG
190885
+	cmovne	%rdx, %rax
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(first_vec_x2):
190885
+	tzcntl	%eax, %eax
190885
+	/* Found CHAR or the null byte.	 */
190885
+# ifdef USE_AS_WCSCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
190885
+# else
190885
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
190885
+# endif
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	cmp (%rax), %CHAR_REG
190885
+	cmovne	%rdx, %rax
190885
+# endif
190885
+	ret
190885
+
190885
+L(prep_loop_4x):
190885
+	/* Align data to 4 * VEC_SIZE.	*/
190885
+	andq	$-(VEC_SIZE * 4), %rdi
190885
+
190885
+	.p2align 4
190885
+L(loop_4x_vec):
190885
+	/* Compare 4 * VEC at a time forward.  */
190885
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
190885
+	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
190885
+	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
190885
+	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
190885
+
190885
+	/* Leaves only CHARS matching esi as 0.  */
190885
+	vpxorq	%YMM1, %YMM0, %YMM5
190885
+	vpxorq	%YMM2, %YMM0, %YMM6
190885
+	vpxorq	%YMM3, %YMM0, %YMM7
190885
+	vpxorq	%YMM4, %YMM0, %YMM8
190885
+
190885
+	VPMINU	%YMM5, %YMM1, %YMM5
190885
+	VPMINU	%YMM6, %YMM2, %YMM6
190885
+	VPMINU	%YMM7, %YMM3, %YMM7
190885
+	VPMINU	%YMM8, %YMM4, %YMM8
190885
+
190885
+	VPMINU	%YMM5, %YMM6, %YMM1
190885
+	VPMINU	%YMM7, %YMM8, %YMM2
190885
+
190885
+	VPMINU	%YMM1, %YMM2, %YMM1
190885
+
190885
+	/* Each bit in K0 represents a CHAR or a null byte.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM1, %k0
190885
+
190885
+	addq	$(VEC_SIZE * 4), %rdi
190885
+
190885
+	ktestd	%k0, %k0
190885
+	jz	L(loop_4x_vec)
190885
+
190885
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM5, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x0)
190885
+
190885
+	/* Each bit in K1 represents a CHAR or a null byte in YMM2.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM6, %k1
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x1)
190885
+
190885
+	/* Each bit in K2 represents a CHAR or a null byte in YMM3.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM7, %k2
190885
+	/* Each bit in K3 represents a CHAR or a null byte in YMM4.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM8, %k3
190885
+
190885
+# ifdef USE_AS_WCSCHR
190885
+	/* NB: Each bit in K2/K3 represents 4-byte element.  */
190885
+	kshiftlw $8, %k3, %k1
190885
+# else
190885
+	kshiftlq $32, %k3, %k1
190885
+# endif
190885
+
190885
+	/* Each bit in K1 represents a NULL or a mismatch.  */
190885
+	korq	%k1, %k2, %k1
190885
+	kmovq	%k1, %rax
190885
+
190885
+	tzcntq  %rax, %rax
190885
+# ifdef USE_AS_WCSCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
190885
+# else
190885
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
190885
+# endif
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	cmp (%rax), %CHAR_REG
190885
+	cmovne	%rdx, %rax
190885
+# endif
190885
+	ret
190885
+
190885
+	/* Cold case for crossing page with first load.	 */
190885
+	.p2align 4
190885
+L(cross_page_boundary):
190885
+	andq	$-VEC_SIZE, %rdi
190885
+	andl	$(VEC_SIZE - 1), %ecx
190885
+
190885
+	VMOVA	(%rdi), %YMM1
190885
+
190885
+	/* Leaves only CHARS matching esi as 0.  */
190885
+	vpxorq	%YMM1, %YMM0, %YMM2
190885
+	VPMINU	%YMM2, %YMM1, %YMM2
190885
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+
190885
+# ifdef USE_AS_WCSCHR
190885
+	/* NB: Divide shift count by 4 since each bit in K1 represent 4
190885
+	   bytes.  */
190885
+	movl	%ecx, %SHIFT_REG
190885
+	sarl    $2, %SHIFT_REG
190885
+# endif
190885
+
190885
+	/* Remove the leading bits.	 */
190885
+	sarxl	%SHIFT_REG, %eax, %eax
190885
+	testl	%eax, %eax
190885
+
190885
+	jz	L(aligned_more)
190885
+	tzcntl	%eax, %eax
190885
+	addq	%rcx, %rdi
190885
+# ifdef USE_AS_WCSCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	leaq	(%rdi, %rax, 4), %rax
190885
+# else
190885
+	addq	%rdi, %rax
190885
+# endif
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	cmp (%rax), %CHAR_REG
190885
+	cmovne	%rdx, %rax
190885
+# endif
190885
+	ret
190885
+
190885
+END (STRCHR)
190885
+# endif
190885
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
190885
index 32954713..be05e197 100644
190885
--- a/sysdeps/x86_64/multiarch/strchr.c
190885
+++ b/sysdeps/x86_64/multiarch/strchr.c
190885
@@ -29,16 +29,24 @@
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
190885
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
190885
 
190885
 static inline void *
190885
 IFUNC_SELECTOR (void)
190885
 {
190885
   const struct cpu_features* cpu_features = __get_cpu_features ();
190885
 
190885
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
190885
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
190885
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
190885
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
190885
-    return OPTIMIZE (avx2);
190885
+    {
190885
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
190885
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
190885
+	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
190885
+	return OPTIMIZE (evex);
190885
+
190885
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
190885
+	return OPTIMIZE (avx2);
190885
+    }
190885
 
190885
   if (CPU_FEATURES_ARCH_P (cpu_features, Slow_BSF))
190885
     return OPTIMIZE (sse2_no_bsf);
190885
diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex.S b/sysdeps/x86_64/multiarch/strchrnul-evex.S
190885
new file mode 100644
190885
index 00000000..064fe7ca
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/strchrnul-evex.S
190885
@@ -0,0 +1,3 @@
190885
+#define STRCHR __strchrnul_evex
190885
+#define USE_AS_STRCHRNUL 1
190885
+#include "strchr-evex.S"
190885
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
190885
new file mode 100644
190885
index 00000000..459eeed0
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
190885
@@ -0,0 +1,1043 @@
190885
+/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
190885
+   Copyright (C) 2021 Free Software Foundation, Inc.
190885
+   This file is part of the GNU C Library.
190885
+
190885
+   The GNU C Library is free software; you can redistribute it and/or
190885
+   modify it under the terms of the GNU Lesser General Public
190885
+   License as published by the Free Software Foundation; either
190885
+   version 2.1 of the License, or (at your option) any later version.
190885
+
190885
+   The GNU C Library is distributed in the hope that it will be useful,
190885
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
190885
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
190885
+   Lesser General Public License for more details.
190885
+
190885
+   You should have received a copy of the GNU Lesser General Public
190885
+   License along with the GNU C Library; if not, see
190885
+   <https://www.gnu.org/licenses/>.  */
190885
+
190885
+#if IS_IN (libc)
190885
+
190885
+# include <sysdep.h>
190885
+
190885
+# ifndef STRCMP
190885
+#  define STRCMP	__strcmp_evex
190885
+# endif
190885
+
190885
+# define PAGE_SIZE	4096
190885
+
190885
+/* VEC_SIZE = Number of bytes in a ymm register */
190885
+# define VEC_SIZE	32
190885
+
190885
+/* Shift for dividing by (VEC_SIZE * 4).  */
190885
+# define DIVIDE_BY_VEC_4_SHIFT	7
190885
+# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
190885
+#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
190885
+# endif
190885
+
190885
+# define VMOVU		vmovdqu64
190885
+# define VMOVA		vmovdqa64
190885
+
190885
+# ifdef USE_AS_WCSCMP
190885
+/* Compare packed dwords.  */
190885
+#  define VPCMP		vpcmpd
190885
+#  define SHIFT_REG32	r8d
190885
+#  define SHIFT_REG64	r8
190885
+/* 1 dword char == 4 bytes.  */
190885
+#  define SIZE_OF_CHAR	4
190885
+# else
190885
+/* Compare packed bytes.  */
190885
+#  define VPCMP		vpcmpb
190885
+#  define SHIFT_REG32	ecx
190885
+#  define SHIFT_REG64	rcx
190885
+/* 1 byte char == 1 byte.  */
190885
+#  define SIZE_OF_CHAR	1
190885
+# endif
190885
+
190885
+# define XMMZERO	xmm16
190885
+# define XMM0		xmm17
190885
+# define XMM1		xmm18
190885
+
190885
+# define YMMZERO	ymm16
190885
+# define YMM0		ymm17
190885
+# define YMM1		ymm18
190885
+# define YMM2		ymm19
190885
+# define YMM3		ymm20
190885
+# define YMM4		ymm21
190885
+# define YMM5		ymm22
190885
+# define YMM6		ymm23
190885
+# define YMM7		ymm24
190885
+
190885
+/* Warning!
190885
+           wcscmp/wcsncmp have to use SIGNED comparison for elements.
190885
+           strcmp/strncmp have to use UNSIGNED comparison for elements.
190885
+*/
190885
+
190885
+/* The main idea of the string comparison (byte or dword) using 256-bit
190885
+   EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
190885
+   latter can be on either packed bytes or dwords depending on
190885
+   USE_AS_WCSCMP. In order to check the null char, algorithm keeps the
190885
+   matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
190885
+   KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes)
190885
+   are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
190885
+   instructions.  Main loop (away from from page boundary) compares 4
190885
+   vectors are a time, effectively comparing 4 x VEC_SIZE bytes (128
190885
+   bytes) on each loop.
190885
+
190885
+   The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic
190885
+   is the same as strcmp, except that an a maximum offset is tracked.  If
190885
+   the maximum offset is reached before a difference is found, zero is
190885
+   returned.  */
190885
+
190885
+	.section .text.evex,"ax",@progbits
190885
+ENTRY (STRCMP)
190885
+# ifdef USE_AS_STRNCMP
190885
+	/* Check for simple cases (0 or 1) in offset.  */
190885
+	cmp	$1, %RDX_LP
190885
+	je	L(char0)
190885
+	jb	L(zero)
190885
+#  ifdef USE_AS_WCSCMP
190885
+	/* Convert units: from wide to byte char.  */
190885
+	shl	$2, %RDX_LP
190885
+#  endif
190885
+	/* Register %r11 tracks the maximum offset.  */
190885
+	mov	%RDX_LP, %R11_LP
190885
+# endif
190885
+	movl	%edi, %eax
190885
+	xorl	%edx, %edx
190885
+	/* Make %XMMZERO (%YMMZERO) all zeros in this function.  */
190885
+	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
190885
+	orl	%esi, %eax
190885
+	andl	$(PAGE_SIZE - 1), %eax
190885
+	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
190885
+	jg	L(cross_page)
190885
+	/* Start comparing 4 vectors.  */
190885
+	VMOVU	(%rdi), %YMM0
190885
+	VMOVU	(%rsi), %YMM1
190885
+
190885
+	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
190885
+	VPCMP	$4, %YMM0, %YMM1, %k0
190885
+
190885
+	/* Check for NULL in YMM0.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM0, %k1
190885
+	/* Check for NULL in YMM1.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM1, %k2
190885
+	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
190885
+	kord	%k1, %k2, %k1
190885
+
190885
+	/* Each bit in K1 represents:
190885
+	   1. A mismatch in YMM0 and YMM1.  Or
190885
+	   2. A NULL in YMM0 or YMM1.
190885
+	 */
190885
+	kord	%k0, %k1, %k1
190885
+
190885
+	ktestd	%k1, %k1
190885
+	je	L(next_3_vectors)
190885
+	kmovd	%k1, %ecx
190885
+	tzcntl	%ecx, %edx
190885
+# ifdef USE_AS_WCSCMP
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %edx
190885
+# endif
190885
+# ifdef USE_AS_STRNCMP
190885
+	/* Return 0 if the mismatched index (%rdx) is after the maximum
190885
+	   offset (%r11).   */
190885
+	cmpq	%r11, %rdx
190885
+	jae	L(zero)
190885
+# endif
190885
+# ifdef USE_AS_WCSCMP
190885
+	xorl	%eax, %eax
190885
+	movl	(%rdi, %rdx), %ecx
190885
+	cmpl	(%rsi, %rdx), %ecx
190885
+	je	L(return)
190885
+L(wcscmp_return):
190885
+	setl	%al
190885
+	negl	%eax
190885
+	orl	$1, %eax
190885
+L(return):
190885
+# else
190885
+	movzbl	(%rdi, %rdx), %eax
190885
+	movzbl	(%rsi, %rdx), %edx
190885
+	subl	%edx, %eax
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(return_vec_size):
190885
+	kmovd	%k1, %ecx
190885
+	tzcntl	%ecx, %edx
190885
+# ifdef USE_AS_WCSCMP
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %edx
190885
+# endif
190885
+# ifdef USE_AS_STRNCMP
190885
+	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
190885
+	   the maximum offset (%r11).  */
190885
+	addq	$VEC_SIZE, %rdx
190885
+	cmpq	%r11, %rdx
190885
+	jae	L(zero)
190885
+#  ifdef USE_AS_WCSCMP
190885
+	xorl	%eax, %eax
190885
+	movl	(%rdi, %rdx), %ecx
190885
+	cmpl	(%rsi, %rdx), %ecx
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	(%rdi, %rdx), %eax
190885
+	movzbl	(%rsi, %rdx), %edx
190885
+	subl	%edx, %eax
190885
+#  endif
190885
+# else
190885
+#  ifdef USE_AS_WCSCMP
190885
+	xorl	%eax, %eax
190885
+	movl	VEC_SIZE(%rdi, %rdx), %ecx
190885
+	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	VEC_SIZE(%rdi, %rdx), %eax
190885
+	movzbl	VEC_SIZE(%rsi, %rdx), %edx
190885
+	subl	%edx, %eax
190885
+#  endif
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(return_2_vec_size):
190885
+	kmovd	%k1, %ecx
190885
+	tzcntl	%ecx, %edx
190885
+# ifdef USE_AS_WCSCMP
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %edx
190885
+# endif
190885
+# ifdef USE_AS_STRNCMP
190885
+	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
190885
+	   after the maximum offset (%r11).  */
190885
+	addq	$(VEC_SIZE * 2), %rdx
190885
+	cmpq	%r11, %rdx
190885
+	jae	L(zero)
190885
+#  ifdef USE_AS_WCSCMP
190885
+	xorl	%eax, %eax
190885
+	movl	(%rdi, %rdx), %ecx
190885
+	cmpl	(%rsi, %rdx), %ecx
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	(%rdi, %rdx), %eax
190885
+	movzbl	(%rsi, %rdx), %edx
190885
+	subl	%edx, %eax
190885
+#  endif
190885
+# else
190885
+#  ifdef USE_AS_WCSCMP
190885
+	xorl	%eax, %eax
190885
+	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
190885
+	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
190885
+	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
190885
+	subl	%edx, %eax
190885
+#  endif
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(return_3_vec_size):
190885
+	kmovd	%k1, %ecx
190885
+	tzcntl	%ecx, %edx
190885
+# ifdef USE_AS_WCSCMP
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %edx
190885
+# endif
190885
+# ifdef USE_AS_STRNCMP
190885
+	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
190885
+	   after the maximum offset (%r11).  */
190885
+	addq	$(VEC_SIZE * 3), %rdx
190885
+	cmpq	%r11, %rdx
190885
+	jae	L(zero)
190885
+#  ifdef USE_AS_WCSCMP
190885
+	xorl	%eax, %eax
190885
+	movl	(%rdi, %rdx), %ecx
190885
+	cmpl	(%rsi, %rdx), %ecx
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	(%rdi, %rdx), %eax
190885
+	movzbl	(%rsi, %rdx), %edx
190885
+	subl	%edx, %eax
190885
+#  endif
190885
+# else
190885
+#  ifdef USE_AS_WCSCMP
190885
+	xorl	%eax, %eax
190885
+	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
190885
+	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
190885
+	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
190885
+	subl	%edx, %eax
190885
+#  endif
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(next_3_vectors):
190885
+	VMOVU	VEC_SIZE(%rdi), %YMM0
190885
+	VMOVU	VEC_SIZE(%rsi), %YMM1
190885
+	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
190885
+	VPCMP	$4, %YMM0, %YMM1, %k0
190885
+	VPCMP	$0, %YMMZERO, %YMM0, %k1
190885
+	VPCMP	$0, %YMMZERO, %YMM1, %k2
190885
+	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
190885
+	kord	%k1, %k2, %k1
190885
+	/* Each bit in K1 represents a NULL or a mismatch.  */
190885
+	kord	%k0, %k1, %k1
190885
+	ktestd	%k1, %k1
190885
+	jne	L(return_vec_size)
190885
+
190885
+	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM2
190885
+	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM3
190885
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM4
190885
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM5
190885
+
190885
+	/* Each bit in K0 represents a mismatch in YMM2 and YMM4.  */
190885
+	VPCMP	$4, %YMM2, %YMM4, %k0
190885
+	VPCMP	$0, %YMMZERO, %YMM2, %k1
190885
+	VPCMP	$0, %YMMZERO, %YMM4, %k2
190885
+	/* Each bit in K1 represents a NULL in YMM2 or YMM4.  */
190885
+	kord	%k1, %k2, %k1
190885
+	/* Each bit in K1 represents a NULL or a mismatch.  */
190885
+	kord	%k0, %k1, %k1
190885
+	ktestd	%k1, %k1
190885
+	jne	L(return_2_vec_size)
190885
+
190885
+	/* Each bit in K0 represents a mismatch in YMM3 and YMM5.  */
190885
+	VPCMP	$4, %YMM3, %YMM5, %k0
190885
+	VPCMP	$0, %YMMZERO, %YMM3, %k1
190885
+	VPCMP	$0, %YMMZERO, %YMM5, %k2
190885
+	/* Each bit in K1 represents a NULL in YMM3 or YMM5.  */
190885
+	kord	%k1, %k2, %k1
190885
+	/* Each bit in K1 represents a NULL or a mismatch.  */
190885
+	kord	%k0, %k1, %k1
190885
+	ktestd	%k1, %k1
190885
+	jne	L(return_3_vec_size)
190885
+L(main_loop_header):
190885
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
190885
+	movl	$PAGE_SIZE, %ecx
190885
+	/* Align load via RAX.  */
190885
+	andq	$-(VEC_SIZE * 4), %rdx
190885
+	subq	%rdi, %rdx
190885
+	leaq	(%rdi, %rdx), %rax
190885
+# ifdef USE_AS_STRNCMP
190885
+	/* Starting from this point, the maximum offset, or simply the
190885
+	   'offset', DECREASES by the same amount when base pointers are
190885
+	   moved forward.  Return 0 when:
190885
+	     1) On match: offset <= the matched vector index.
190885
+	     2) On mismatch, offset is before the mismatched index.
190885
+	 */
190885
+	subq	%rdx, %r11
190885
+	jbe	L(zero)
190885
+# endif
190885
+	addq	%rsi, %rdx
190885
+	movq	%rdx, %rsi
190885
+	andl	$(PAGE_SIZE - 1), %esi
190885
+	/* Number of bytes before page crossing.  */
190885
+	subq	%rsi, %rcx
190885
+	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
190885
+	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
190885
+	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
190885
+	movl	%ecx, %esi
190885
+	jmp	L(loop_start)
190885
+
190885
+	.p2align 4
190885
+L(loop):
190885
+# ifdef USE_AS_STRNCMP
190885
+	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
190885
+	   the maximum offset (%r11) by the same amount.  */
190885
+	subq	$(VEC_SIZE * 4), %r11
190885
+	jbe	L(zero)
190885
+# endif
190885
+	addq	$(VEC_SIZE * 4), %rax
190885
+	addq	$(VEC_SIZE * 4), %rdx
190885
+L(loop_start):
190885
+	testl	%esi, %esi
190885
+	leal	-1(%esi), %esi
190885
+	je	L(loop_cross_page)
190885
+L(back_to_loop):
190885
+	/* Main loop, comparing 4 vectors at a time.  */
190885
+	VMOVA	(%rax), %YMM0
190885
+	VMOVA	VEC_SIZE(%rax), %YMM2
190885
+	VMOVA	(VEC_SIZE * 2)(%rax), %YMM4
190885
+	VMOVA	(VEC_SIZE * 3)(%rax), %YMM6
190885
+	VMOVU	(%rdx), %YMM1
190885
+	VMOVU	VEC_SIZE(%rdx), %YMM3
190885
+	VMOVU	(VEC_SIZE * 2)(%rdx), %YMM5
190885
+	VMOVU	(VEC_SIZE * 3)(%rdx), %YMM7
190885
+
190885
+	VPCMP	$4, %YMM0, %YMM1, %k0
190885
+	VPCMP	$0, %YMMZERO, %YMM0, %k1
190885
+	VPCMP	$0, %YMMZERO, %YMM1, %k2
190885
+	kord	%k1, %k2, %k1
190885
+	/* Each bit in K4 represents a NULL or a mismatch in YMM0 and
190885
+	   YMM1.  */
190885
+	kord	%k0, %k1, %k4
190885
+
190885
+	VPCMP	$4, %YMM2, %YMM3, %k0
190885
+	VPCMP	$0, %YMMZERO, %YMM2, %k1
190885
+	VPCMP	$0, %YMMZERO, %YMM3, %k2
190885
+	kord	%k1, %k2, %k1
190885
+	/* Each bit in K5 represents a NULL or a mismatch in YMM2 and
190885
+	   YMM3.  */
190885
+	kord	%k0, %k1, %k5
190885
+
190885
+	VPCMP	$4, %YMM4, %YMM5, %k0
190885
+	VPCMP	$0, %YMMZERO, %YMM4, %k1
190885
+	VPCMP	$0, %YMMZERO, %YMM5, %k2
190885
+	kord	%k1, %k2, %k1
190885
+	/* Each bit in K6 represents a NULL or a mismatch in YMM4 and
190885
+	   YMM5.  */
190885
+	kord	%k0, %k1, %k6
190885
+
190885
+	VPCMP	$4, %YMM6, %YMM7, %k0
190885
+	VPCMP	$0, %YMMZERO, %YMM6, %k1
190885
+	VPCMP	$0, %YMMZERO, %YMM7, %k2
190885
+	kord	%k1, %k2, %k1
190885
+	/* Each bit in K7 represents a NULL or a mismatch in YMM6 and
190885
+	   YMM7.  */
190885
+	kord	%k0, %k1, %k7
190885
+
190885
+	kord	%k4, %k5, %k0
190885
+	kord	%k6, %k7, %k1
190885
+
190885
+	/* Test each mask (32 bits) individually because for VEC_SIZE
190885
+	   == 32 is not possible to OR the four masks and keep all bits
190885
+	   in a 64-bit integer register, differing from SSE2 strcmp
190885
+	   where ORing is possible.  */
190885
+	kortestd %k0, %k1
190885
+	je	L(loop)
190885
+	ktestd	%k4, %k4
190885
+	je	L(test_vec)
190885
+	kmovd	%k4, %edi
190885
+	tzcntl	%edi, %ecx
190885
+# ifdef USE_AS_WCSCMP
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %ecx
190885
+# endif
190885
+# ifdef USE_AS_STRNCMP
190885
+	cmpq	%rcx, %r11
190885
+	jbe	L(zero)
190885
+#  ifdef USE_AS_WCSCMP
190885
+	movq	%rax, %rsi
190885
+	xorl	%eax, %eax
190885
+	movl	(%rsi, %rcx), %edi
190885
+	cmpl	(%rdx, %rcx), %edi
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	(%rax, %rcx), %eax
190885
+	movzbl	(%rdx, %rcx), %edx
190885
+	subl	%edx, %eax
190885
+#  endif
190885
+# else
190885
+#  ifdef USE_AS_WCSCMP
190885
+	movq	%rax, %rsi
190885
+	xorl	%eax, %eax
190885
+	movl	(%rsi, %rcx), %edi
190885
+	cmpl	(%rdx, %rcx), %edi
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	(%rax, %rcx), %eax
190885
+	movzbl	(%rdx, %rcx), %edx
190885
+	subl	%edx, %eax
190885
+#  endif
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(test_vec):
190885
+# ifdef USE_AS_STRNCMP
190885
+	/* The first vector matched.  Return 0 if the maximum offset
190885
+	   (%r11) <= VEC_SIZE.  */
190885
+	cmpq	$VEC_SIZE, %r11
190885
+	jbe	L(zero)
190885
+# endif
190885
+	ktestd	%k5, %k5
190885
+	je	L(test_2_vec)
190885
+	kmovd	%k5, %ecx
190885
+	tzcntl	%ecx, %edi
190885
+# ifdef USE_AS_WCSCMP
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %edi
190885
+# endif
190885
+# ifdef USE_AS_STRNCMP
190885
+	addq	$VEC_SIZE, %rdi
190885
+	cmpq	%rdi, %r11
190885
+	jbe	L(zero)
190885
+#  ifdef USE_AS_WCSCMP
190885
+	movq	%rax, %rsi
190885
+	xorl	%eax, %eax
190885
+	movl	(%rsi, %rdi), %ecx
190885
+	cmpl	(%rdx, %rdi), %ecx
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	(%rax, %rdi), %eax
190885
+	movzbl	(%rdx, %rdi), %edx
190885
+	subl	%edx, %eax
190885
+#  endif
190885
+# else
190885
+#  ifdef USE_AS_WCSCMP
190885
+	movq	%rax, %rsi
190885
+	xorl	%eax, %eax
190885
+	movl	VEC_SIZE(%rsi, %rdi), %ecx
190885
+	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	VEC_SIZE(%rax, %rdi), %eax
190885
+	movzbl	VEC_SIZE(%rdx, %rdi), %edx
190885
+	subl	%edx, %eax
190885
+#  endif
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(test_2_vec):
190885
+# ifdef USE_AS_STRNCMP
190885
+	/* The first 2 vectors matched.  Return 0 if the maximum offset
190885
+	   (%r11) <= 2 * VEC_SIZE.  */
190885
+	cmpq	$(VEC_SIZE * 2), %r11
190885
+	jbe	L(zero)
190885
+# endif
190885
+	ktestd	%k6, %k6
190885
+	je	L(test_3_vec)
190885
+	kmovd	%k6, %ecx
190885
+	tzcntl	%ecx, %edi
190885
+# ifdef USE_AS_WCSCMP
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %edi
190885
+# endif
190885
+# ifdef USE_AS_STRNCMP
190885
+	addq	$(VEC_SIZE * 2), %rdi
190885
+	cmpq	%rdi, %r11
190885
+	jbe	L(zero)
190885
+#  ifdef USE_AS_WCSCMP
190885
+	movq	%rax, %rsi
190885
+	xorl	%eax, %eax
190885
+	movl	(%rsi, %rdi), %ecx
190885
+	cmpl	(%rdx, %rdi), %ecx
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	(%rax, %rdi), %eax
190885
+	movzbl	(%rdx, %rdi), %edx
190885
+	subl	%edx, %eax
190885
+#  endif
190885
+# else
190885
+#  ifdef USE_AS_WCSCMP
190885
+	movq	%rax, %rsi
190885
+	xorl	%eax, %eax
190885
+	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
190885
+	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
190885
+	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
190885
+	subl	%edx, %eax
190885
+#  endif
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(test_3_vec):
190885
+# ifdef USE_AS_STRNCMP
190885
+	/* The first 3 vectors matched.  Return 0 if the maximum offset
190885
+	   (%r11) <= 3 * VEC_SIZE.  */
190885
+	cmpq	$(VEC_SIZE * 3), %r11
190885
+	jbe	L(zero)
190885
+# endif
190885
+	kmovd	%k7, %esi
190885
+	tzcntl	%esi, %ecx
190885
+# ifdef USE_AS_WCSCMP
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %ecx
190885
+# endif
190885
+# ifdef USE_AS_STRNCMP
190885
+	addq	$(VEC_SIZE * 3), %rcx
190885
+	cmpq	%rcx, %r11
190885
+	jbe	L(zero)
190885
+#  ifdef USE_AS_WCSCMP
190885
+	movq	%rax, %rsi
190885
+	xorl	%eax, %eax
190885
+	movl	(%rsi, %rcx), %esi
190885
+	cmpl	(%rdx, %rcx), %esi
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	(%rax, %rcx), %eax
190885
+	movzbl	(%rdx, %rcx), %edx
190885
+	subl	%edx, %eax
190885
+#  endif
190885
+# else
190885
+#  ifdef USE_AS_WCSCMP
190885
+	movq	%rax, %rsi
190885
+	xorl	%eax, %eax
190885
+	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
190885
+	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
190885
+	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
190885
+	subl	%edx, %eax
190885
+#  endif
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(loop_cross_page):
190885
+	xorl	%r10d, %r10d
190885
+	movq	%rdx, %rcx
190885
+	/* Align load via RDX.  We load the extra ECX bytes which should
190885
+	   be ignored.  */
190885
+	andl	$((VEC_SIZE * 4) - 1), %ecx
190885
+	/* R10 is -RCX.  */
190885
+	subq	%rcx, %r10
190885
+
190885
+	/* This works only if VEC_SIZE * 2 == 64. */
190885
+# if (VEC_SIZE * 2) != 64
190885
+#  error (VEC_SIZE * 2) != 64
190885
+# endif
190885
+
190885
+	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
190885
+	cmpl	$(VEC_SIZE * 2), %ecx
190885
+	jge	L(loop_cross_page_2_vec)
190885
+
190885
+	VMOVU	(%rax, %r10), %YMM2
190885
+	VMOVU	VEC_SIZE(%rax, %r10), %YMM3
190885
+	VMOVU	(%rdx, %r10), %YMM4
190885
+	VMOVU	VEC_SIZE(%rdx, %r10), %YMM5
190885
+
190885
+	VPCMP	$4, %YMM4, %YMM2, %k0
190885
+	VPCMP	$0, %YMMZERO, %YMM2, %k1
190885
+	VPCMP	$0, %YMMZERO, %YMM4, %k2
190885
+	kord	%k1, %k2, %k1
190885
+	/* Each bit in K1 represents a NULL or a mismatch in YMM2 and
190885
+	   YMM4.  */
190885
+	kord	%k0, %k1, %k1
190885
+
190885
+	VPCMP	$4, %YMM5, %YMM3, %k3
190885
+	VPCMP	$0, %YMMZERO, %YMM3, %k4
190885
+	VPCMP	$0, %YMMZERO, %YMM5, %k5
190885
+	kord	%k4, %k5, %k4
190885
+	/* Each bit in K3 represents a NULL or a mismatch in YMM3 and
190885
+	   YMM5.  */
190885
+	kord	%k3, %k4, %k3
190885
+
190885
+# ifdef USE_AS_WCSCMP
190885
+	/* NB: Each bit in K1/K3 represents 4-byte element.  */
190885
+	kshiftlw $8, %k3, %k2
190885
+	/* NB: Divide shift count by 4 since each bit in K1 represent 4
190885
+	   bytes.  */
190885
+	movl	%ecx, %SHIFT_REG32
190885
+	sarl	$2, %SHIFT_REG32
190885
+# else
190885
+	kshiftlq $32, %k3, %k2
190885
+# endif
190885
+
190885
+	/* Each bit in K1 represents a NULL or a mismatch.  */
190885
+	korq	%k1, %k2, %k1
190885
+	kmovq	%k1, %rdi
190885
+
190885
+	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
190885
+	shrxq	%SHIFT_REG64, %rdi, %rdi
190885
+	testq	%rdi, %rdi
190885
+	je	L(loop_cross_page_2_vec)
190885
+	tzcntq	%rdi, %rcx
190885
+# ifdef USE_AS_WCSCMP
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %ecx
190885
+# endif
190885
+# ifdef USE_AS_STRNCMP
190885
+	cmpq	%rcx, %r11
190885
+	jbe	L(zero)
190885
+#  ifdef USE_AS_WCSCMP
190885
+	movq	%rax, %rsi
190885
+	xorl	%eax, %eax
190885
+	movl	(%rsi, %rcx), %edi
190885
+	cmpl	(%rdx, %rcx), %edi
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	(%rax, %rcx), %eax
190885
+	movzbl	(%rdx, %rcx), %edx
190885
+	subl	%edx, %eax
190885
+#  endif
190885
+# else
190885
+#  ifdef USE_AS_WCSCMP
190885
+	movq	%rax, %rsi
190885
+	xorl	%eax, %eax
190885
+	movl	(%rsi, %rcx), %edi
190885
+	cmpl	(%rdx, %rcx), %edi
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	(%rax, %rcx), %eax
190885
+	movzbl	(%rdx, %rcx), %edx
190885
+	subl	%edx, %eax
190885
+#  endif
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(loop_cross_page_2_vec):
190885
+	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
190885
+	VMOVU	(VEC_SIZE * 2)(%rax, %r10), %YMM0
190885
+	VMOVU	(VEC_SIZE * 3)(%rax, %r10), %YMM1
190885
+	VMOVU	(VEC_SIZE * 2)(%rdx, %r10), %YMM2
190885
+	VMOVU	(VEC_SIZE * 3)(%rdx, %r10), %YMM3
190885
+
190885
+	VPCMP	$4, %YMM0, %YMM2, %k0
190885
+	VPCMP	$0, %YMMZERO, %YMM0, %k1
190885
+	VPCMP	$0, %YMMZERO, %YMM2, %k2
190885
+	kord	%k1, %k2, %k1
190885
+	/* Each bit in K1 represents a NULL or a mismatch in YMM0 and
190885
+	   YMM2.  */
190885
+	kord	%k0, %k1, %k1
190885
+
190885
+	VPCMP	$4, %YMM1, %YMM3, %k3
190885
+	VPCMP	$0, %YMMZERO, %YMM1, %k4
190885
+	VPCMP	$0, %YMMZERO, %YMM3, %k5
190885
+	kord	%k4, %k5, %k4
190885
+	/* Each bit in K3 represents a NULL or a mismatch in YMM1 and
190885
+	   YMM3.  */
190885
+	kord	%k3, %k4, %k3
190885
+
190885
+# ifdef USE_AS_WCSCMP
190885
+	/* NB: Each bit in K1/K3 represents 4-byte element.  */
190885
+	kshiftlw $8, %k3, %k2
190885
+# else
190885
+	kshiftlq $32, %k3, %k2
190885
+# endif
190885
+
190885
+	/* Each bit in K1 represents a NULL or a mismatch.  */
190885
+	korq	%k1, %k2, %k1
190885
+	kmovq	%k1, %rdi
190885
+
190885
+	xorl	%r8d, %r8d
190885
+	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
190885
+	subl	$(VEC_SIZE * 2), %ecx
190885
+	jle	1f
190885
+	/* R8 has number of bytes skipped.  */
190885
+	movl	%ecx, %r8d
190885
+# ifdef USE_AS_WCSCMP
190885
+	/* NB: Divide shift count by 4 since each bit in K1 represent 4
190885
+	   bytes.  */
190885
+	sarl	$2, %ecx
190885
+# endif
190885
+	/* Skip ECX bytes.  */
190885
+	shrq	%cl, %rdi
190885
+1:
190885
+	/* Before jumping back to the loop, set ESI to the number of
190885
+	   VEC_SIZE * 4 blocks before page crossing.  */
190885
+	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
190885
+
190885
+	testq	%rdi, %rdi
190885
+# ifdef USE_AS_STRNCMP
190885
+	/* At this point, if %rdi value is 0, it already tested
190885
+	   VEC_SIZE*4+%r10 bytes starting from %rax. This label
190885
+	   checks whether strncmp maximum offset reached or not.  */
190885
+	je	L(string_nbyte_offset_check)
190885
+# else
190885
+	je	L(back_to_loop)
190885
+# endif
190885
+	tzcntq	%rdi, %rcx
190885
+# ifdef USE_AS_WCSCMP
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %ecx
190885
+# endif
190885
+	addq	%r10, %rcx
190885
+	/* Adjust for number of bytes skipped.  */
190885
+	addq	%r8, %rcx
190885
+# ifdef USE_AS_STRNCMP
190885
+	addq	$(VEC_SIZE * 2), %rcx
190885
+	subq	%rcx, %r11
190885
+	jbe	L(zero)
190885
+#  ifdef USE_AS_WCSCMP
190885
+	movq	%rax, %rsi
190885
+	xorl	%eax, %eax
190885
+	movl	(%rsi, %rcx), %edi
190885
+	cmpl	(%rdx, %rcx), %edi
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	(%rax, %rcx), %eax
190885
+	movzbl	(%rdx, %rcx), %edx
190885
+	subl	%edx, %eax
190885
+#  endif
190885
+# else
190885
+#  ifdef USE_AS_WCSCMP
190885
+	movq	%rax, %rsi
190885
+	xorl	%eax, %eax
190885
+	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
190885
+	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
190885
+	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
190885
+	subl	%edx, %eax
190885
+#  endif
190885
+# endif
190885
+	ret
190885
+
190885
+# ifdef USE_AS_STRNCMP
190885
+L(string_nbyte_offset_check):
190885
+	leaq	(VEC_SIZE * 4)(%r10), %r10
190885
+	cmpq	%r10, %r11
190885
+	jbe	L(zero)
190885
+	jmp	L(back_to_loop)
190885
+# endif
190885
+
190885
+	.p2align 4
190885
+L(cross_page_loop):
190885
+	/* Check one byte/dword at a time.  */
190885
+# ifdef USE_AS_WCSCMP
190885
+	cmpl	%ecx, %eax
190885
+# else
190885
+	subl	%ecx, %eax
190885
+# endif
190885
+	jne	L(different)
190885
+	addl	$SIZE_OF_CHAR, %edx
190885
+	cmpl	$(VEC_SIZE * 4), %edx
190885
+	je	L(main_loop_header)
190885
+# ifdef USE_AS_STRNCMP
190885
+	cmpq	%r11, %rdx
190885
+	jae	L(zero)
190885
+# endif
190885
+# ifdef USE_AS_WCSCMP
190885
+	movl	(%rdi, %rdx), %eax
190885
+	movl	(%rsi, %rdx), %ecx
190885
+# else
190885
+	movzbl	(%rdi, %rdx), %eax
190885
+	movzbl	(%rsi, %rdx), %ecx
190885
+# endif
190885
+	/* Check null char.  */
190885
+	testl	%eax, %eax
190885
+	jne	L(cross_page_loop)
190885
+	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
190885
+	   comparisons.  */
190885
+	subl	%ecx, %eax
190885
+# ifndef USE_AS_WCSCMP
190885
+L(different):
190885
+# endif
190885
+	ret
190885
+
190885
+# ifdef USE_AS_WCSCMP
190885
+	.p2align 4
190885
+L(different):
190885
+	/* Use movl to avoid modifying EFLAGS.  */
190885
+	movl	$0, %eax
190885
+	setl	%al
190885
+	negl	%eax
190885
+	orl	$1, %eax
190885
+	ret
190885
+# endif
190885
+
190885
+# ifdef USE_AS_STRNCMP
190885
+	.p2align 4
190885
+L(zero):
190885
+	xorl	%eax, %eax
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(char0):
190885
+#  ifdef USE_AS_WCSCMP
190885
+	xorl	%eax, %eax
190885
+	movl	(%rdi), %ecx
190885
+	cmpl	(%rsi), %ecx
190885
+	jne	L(wcscmp_return)
190885
+#  else
190885
+	movzbl	(%rsi), %ecx
190885
+	movzbl	(%rdi), %eax
190885
+	subl	%ecx, %eax
190885
+#  endif
190885
+	ret
190885
+# endif
190885
+
190885
+	.p2align 4
190885
+L(last_vector):
190885
+	addq	%rdx, %rdi
190885
+	addq	%rdx, %rsi
190885
+# ifdef USE_AS_STRNCMP
190885
+	subq	%rdx, %r11
190885
+# endif
190885
+	tzcntl	%ecx, %edx
190885
+# ifdef USE_AS_WCSCMP
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %edx
190885
+# endif
190885
+# ifdef USE_AS_STRNCMP
190885
+	cmpq	%r11, %rdx
190885
+	jae	L(zero)
190885
+# endif
190885
+# ifdef USE_AS_WCSCMP
190885
+	xorl	%eax, %eax
190885
+	movl	(%rdi, %rdx), %ecx
190885
+	cmpl	(%rsi, %rdx), %ecx
190885
+	jne	L(wcscmp_return)
190885
+# else
190885
+	movzbl	(%rdi, %rdx), %eax
190885
+	movzbl	(%rsi, %rdx), %edx
190885
+	subl	%edx, %eax
190885
+# endif
190885
+	ret
190885
+
190885
+	/* Comparing on page boundary region requires special treatment:
190885
+	   It must be done one vector at a time, starting with the wider
190885
+	   ymm vector if possible, if not, with xmm. If fetching 16 bytes
190885
+	   (xmm) still passes the boundary, byte comparison must be done.
190885
+	 */
190885
+	.p2align 4
190885
+L(cross_page):
190885
+	/* Try one ymm vector at a time.  */
190885
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
190885
+	jg	L(cross_page_1_vector)
190885
+L(loop_1_vector):
190885
+	VMOVU	(%rdi, %rdx), %YMM0
190885
+	VMOVU	(%rsi, %rdx), %YMM1
190885
+
190885
+	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
190885
+	VPCMP	$4, %YMM0, %YMM1, %k0
190885
+	VPCMP	$0, %YMMZERO, %YMM0, %k1
190885
+	VPCMP	$0, %YMMZERO, %YMM1, %k2
190885
+	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
190885
+	kord	%k1, %k2, %k1
190885
+	/* Each bit in K1 represents a NULL or a mismatch.  */
190885
+	kord	%k0, %k1, %k1
190885
+	kmovd	%k1, %ecx
190885
+	testl	%ecx, %ecx
190885
+	jne	L(last_vector)
190885
+
190885
+	addl	$VEC_SIZE, %edx
190885
+
190885
+	addl	$VEC_SIZE, %eax
190885
+# ifdef USE_AS_STRNCMP
190885
+	/* Return 0 if the current offset (%rdx) >= the maximum offset
190885
+	   (%r11).  */
190885
+	cmpq	%r11, %rdx
190885
+	jae	L(zero)
190885
+# endif
190885
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
190885
+	jle	L(loop_1_vector)
190885
+L(cross_page_1_vector):
190885
+	/* Less than 32 bytes to check, try one xmm vector.  */
190885
+	cmpl	$(PAGE_SIZE - 16), %eax
190885
+	jg	L(cross_page_1_xmm)
190885
+	VMOVU	(%rdi, %rdx), %XMM0
190885
+	VMOVU	(%rsi, %rdx), %XMM1
190885
+
190885
+	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
190885
+	VPCMP	$4, %XMM0, %XMM1, %k0
190885
+	VPCMP	$0, %XMMZERO, %XMM0, %k1
190885
+	VPCMP	$0, %XMMZERO, %XMM1, %k2
190885
+	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
190885
+	korw	%k1, %k2, %k1
190885
+	/* Each bit in K1 represents a NULL or a mismatch.  */
190885
+	korw	%k0, %k1, %k1
190885
+	kmovw	%k1, %ecx
190885
+	testl	%ecx, %ecx
190885
+	jne	L(last_vector)
190885
+
190885
+	addl	$16, %edx
190885
+# ifndef USE_AS_WCSCMP
190885
+	addl	$16, %eax
190885
+# endif
190885
+# ifdef USE_AS_STRNCMP
190885
+	/* Return 0 if the current offset (%rdx) >= the maximum offset
190885
+	   (%r11).  */
190885
+	cmpq	%r11, %rdx
190885
+	jae	L(zero)
190885
+# endif
190885
+
190885
+L(cross_page_1_xmm):
190885
+# ifndef USE_AS_WCSCMP
190885
+	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
190885
+	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
190885
+	cmpl	$(PAGE_SIZE - 8), %eax
190885
+	jg	L(cross_page_8bytes)
190885
+	vmovq	(%rdi, %rdx), %XMM0
190885
+	vmovq	(%rsi, %rdx), %XMM1
190885
+
190885
+	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
190885
+	VPCMP	$4, %XMM0, %XMM1, %k0
190885
+	VPCMP	$0, %XMMZERO, %XMM0, %k1
190885
+	VPCMP	$0, %XMMZERO, %XMM1, %k2
190885
+	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
190885
+	kord	%k1, %k2, %k1
190885
+	/* Each bit in K1 represents a NULL or a mismatch.  */
190885
+	kord	%k0, %k1, %k1
190885
+	kmovd	%k1, %ecx
190885
+
190885
+# ifdef USE_AS_WCSCMP
190885
+	/* Only last 2 bits are valid.  */
190885
+	andl	$0x3, %ecx
190885
+# else
190885
+	/* Only last 8 bits are valid.  */
190885
+	andl	$0xff, %ecx
190885
+# endif
190885
+
190885
+	testl	%ecx, %ecx
190885
+	jne	L(last_vector)
190885
+
190885
+	addl	$8, %edx
190885
+	addl	$8, %eax
190885
+#  ifdef USE_AS_STRNCMP
190885
+	/* Return 0 if the current offset (%rdx) >= the maximum offset
190885
+	   (%r11).  */
190885
+	cmpq	%r11, %rdx
190885
+	jae	L(zero)
190885
+#  endif
190885
+
190885
+L(cross_page_8bytes):
190885
+	/* Less than 8 bytes to check, try 4 byte vector.  */
190885
+	cmpl	$(PAGE_SIZE - 4), %eax
190885
+	jg	L(cross_page_4bytes)
190885
+	vmovd	(%rdi, %rdx), %XMM0
190885
+	vmovd	(%rsi, %rdx), %XMM1
190885
+
190885
+	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
190885
+	VPCMP	$4, %XMM0, %XMM1, %k0
190885
+	VPCMP	$0, %XMMZERO, %XMM0, %k1
190885
+	VPCMP	$0, %XMMZERO, %XMM1, %k2
190885
+	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
190885
+	kord	%k1, %k2, %k1
190885
+	/* Each bit in K1 represents a NULL or a mismatch.  */
190885
+	kord	%k0, %k1, %k1
190885
+	kmovd	%k1, %ecx
190885
+
190885
+# ifdef USE_AS_WCSCMP
190885
+	/* Only the last bit is valid.  */
190885
+	andl	$0x1, %ecx
190885
+# else
190885
+	/* Only last 4 bits are valid.  */
190885
+	andl	$0xf, %ecx
190885
+# endif
190885
+
190885
+	testl	%ecx, %ecx
190885
+	jne	L(last_vector)
190885
+
190885
+	addl	$4, %edx
190885
+#  ifdef USE_AS_STRNCMP
190885
+	/* Return 0 if the current offset (%rdx) >= the maximum offset
190885
+	   (%r11).  */
190885
+	cmpq	%r11, %rdx
190885
+	jae	L(zero)
190885
+#  endif
190885
+
190885
+L(cross_page_4bytes):
190885
+# endif
190885
+	/* Less than 4 bytes to check, try one byte/dword at a time.  */
190885
+# ifdef USE_AS_STRNCMP
190885
+	cmpq	%r11, %rdx
190885
+	jae	L(zero)
190885
+# endif
190885
+# ifdef USE_AS_WCSCMP
190885
+	movl	(%rdi, %rdx), %eax
190885
+	movl	(%rsi, %rdx), %ecx
190885
+# else
190885
+	movzbl	(%rdi, %rdx), %eax
190885
+	movzbl	(%rsi, %rdx), %ecx
190885
+# endif
190885
+	testl	%eax, %eax
190885
+	jne	L(cross_page_loop)
190885
+	subl	%ecx, %eax
190885
+	ret
190885
+END (STRCMP)
190885
+#endif
190885
diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
190885
index 3f433fbc..c5f38510 100644
190885
--- a/sysdeps/x86_64/multiarch/strcmp.c
190885
+++ b/sysdeps/x86_64/multiarch/strcmp.c
190885
@@ -30,16 +30,25 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
190885
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
190885
 
190885
 static inline void *
190885
 IFUNC_SELECTOR (void)
190885
 {
190885
   const struct cpu_features* cpu_features = __get_cpu_features ();
190885
 
190885
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
190885
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
190885
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
190885
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
190885
-    return OPTIMIZE (avx2);
190885
+    {
190885
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
190885
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
190885
+	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
190885
+	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
190885
+	return OPTIMIZE (evex);
190885
+
190885
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
190885
+	return OPTIMIZE (avx2);
190885
+    }
190885
 
190885
   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
190885
     return OPTIMIZE (sse2_unaligned);
190885
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
190885
new file mode 100644
190885
index 00000000..cd022509
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
190885
@@ -0,0 +1,436 @@
190885
+/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
190885
+   Copyright (C) 2021 Free Software Foundation, Inc.
190885
+   This file is part of the GNU C Library.
190885
+
190885
+   The GNU C Library is free software; you can redistribute it and/or
190885
+   modify it under the terms of the GNU Lesser General Public
190885
+   License as published by the Free Software Foundation; either
190885
+   version 2.1 of the License, or (at your option) any later version.
190885
+
190885
+   The GNU C Library is distributed in the hope that it will be useful,
190885
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
190885
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
190885
+   Lesser General Public License for more details.
190885
+
190885
+   You should have received a copy of the GNU Lesser General Public
190885
+   License along with the GNU C Library; if not, see
190885
+   <https://www.gnu.org/licenses/>.  */
190885
+
190885
+#if IS_IN (libc)
190885
+
190885
+# include <sysdep.h>
190885
+
190885
+# ifndef STRLEN
190885
+#  define STRLEN	__strlen_evex
190885
+# endif
190885
+
190885
+# define VMOVA		vmovdqa64
190885
+
190885
+# ifdef USE_AS_WCSLEN
190885
+#  define VPCMP		vpcmpd
190885
+#  define VPMINU	vpminud
190885
+#  define SHIFT_REG	r9d
190885
+# else
190885
+#  define VPCMP		vpcmpb
190885
+#  define VPMINU	vpminub
190885
+#  define SHIFT_REG	ecx
190885
+# endif
190885
+
190885
+# define XMMZERO	xmm16
190885
+# define YMMZERO	ymm16
190885
+# define YMM1		ymm17
190885
+# define YMM2		ymm18
190885
+# define YMM3		ymm19
190885
+# define YMM4		ymm20
190885
+# define YMM5		ymm21
190885
+# define YMM6		ymm22
190885
+
190885
+# define VEC_SIZE 32
190885
+
190885
+	.section .text.evex,"ax",@progbits
190885
+ENTRY (STRLEN)
190885
+# ifdef USE_AS_STRNLEN
190885
+	/* Check for zero length.  */
190885
+	test	%RSI_LP, %RSI_LP
190885
+	jz	L(zero)
190885
+#  ifdef USE_AS_WCSLEN
190885
+	shl	$2, %RSI_LP
190885
+#  elif defined __ILP32__
190885
+	/* Clear the upper 32 bits.  */
190885
+	movl	%esi, %esi
190885
+#  endif
190885
+	mov	%RSI_LP, %R8_LP
190885
+# endif
190885
+	movl	%edi, %ecx
190885
+	movq	%rdi, %rdx
190885
+	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
190885
+
190885
+	/* Check if we may cross page boundary with one vector load.  */
190885
+	andl	$(2 * VEC_SIZE - 1), %ecx
190885
+	cmpl	$VEC_SIZE, %ecx
190885
+	ja	L(cros_page_boundary)
190885
+
190885
+	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
190885
+	   null byte.  */
190885
+	VPCMP	$0, (%rdi), %YMMZERO, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+
190885
+# ifdef USE_AS_STRNLEN
190885
+	jnz	L(first_vec_x0_check)
190885
+	/* Adjust length and check the end of data.  */
190885
+	subq	$VEC_SIZE, %rsi
190885
+	jbe	L(max)
190885
+# else
190885
+	jnz	L(first_vec_x0)
190885
+# endif
190885
+
190885
+	/* Align data for aligned loads in the loop.  */
190885
+	addq	$VEC_SIZE, %rdi
190885
+	andl	$(VEC_SIZE - 1), %ecx
190885
+	andq	$-VEC_SIZE, %rdi
190885
+
190885
+# ifdef USE_AS_STRNLEN
190885
+	/* Adjust length.  */
190885
+	addq	%rcx, %rsi
190885
+
190885
+	subq	$(VEC_SIZE * 4), %rsi
190885
+	jbe	L(last_4x_vec_or_less)
190885
+# endif
190885
+	jmp	L(more_4x_vec)
190885
+
190885
+	.p2align 4
190885
+L(cros_page_boundary):
190885
+	andl	$(VEC_SIZE - 1), %ecx
190885
+	andq	$-VEC_SIZE, %rdi
190885
+
190885
+# ifdef USE_AS_WCSLEN
190885
+	/* NB: Divide shift count by 4 since each bit in K0 represent 4
190885
+	   bytes.  */
190885
+	movl	%ecx, %SHIFT_REG
190885
+	sarl	$2, %SHIFT_REG
190885
+# endif
190885
+	VPCMP	$0, (%rdi), %YMMZERO, %k0
190885
+	kmovd	%k0, %eax
190885
+
190885
+	/* Remove the leading bytes.  */
190885
+	sarxl	%SHIFT_REG, %eax, %eax
190885
+	testl	%eax, %eax
190885
+	jz	L(aligned_more)
190885
+	tzcntl	%eax, %eax
190885
+# ifdef USE_AS_WCSLEN
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %eax
190885
+# endif
190885
+# ifdef USE_AS_STRNLEN
190885
+	/* Check the end of data.  */
190885
+	cmpq	%rax, %rsi
190885
+	jbe	L(max)
190885
+# endif
190885
+	addq	%rdi, %rax
190885
+	addq	%rcx, %rax
190885
+	subq	%rdx, %rax
190885
+# ifdef USE_AS_WCSLEN
190885
+	shrq	$2, %rax
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(aligned_more):
190885
+# ifdef USE_AS_STRNLEN
190885
+        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
190885
+	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
190885
+	    to void possible addition overflow.  */
190885
+	negq	%rcx
190885
+	addq	$VEC_SIZE, %rcx
190885
+
190885
+	/* Check the end of data.  */
190885
+	subq	%rcx, %rsi
190885
+	jbe	L(max)
190885
+# endif
190885
+
190885
+	addq	$VEC_SIZE, %rdi
190885
+
190885
+# ifdef USE_AS_STRNLEN
190885
+	subq	$(VEC_SIZE * 4), %rsi
190885
+	jbe	L(last_4x_vec_or_less)
190885
+# endif
190885
+
190885
+L(more_4x_vec):
190885
+	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
190885
+	   since data is only aligned to VEC_SIZE.  */
190885
+	VPCMP	$0, (%rdi), %YMMZERO, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x0)
190885
+
190885
+	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x1)
190885
+
190885
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x2)
190885
+
190885
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x3)
190885
+
190885
+	addq	$(VEC_SIZE * 4), %rdi
190885
+
190885
+# ifdef USE_AS_STRNLEN
190885
+	subq	$(VEC_SIZE * 4), %rsi
190885
+	jbe	L(last_4x_vec_or_less)
190885
+# endif
190885
+
190885
+	/* Align data to 4 * VEC_SIZE.  */
190885
+	movq	%rdi, %rcx
190885
+	andl	$(4 * VEC_SIZE - 1), %ecx
190885
+	andq	$-(4 * VEC_SIZE), %rdi
190885
+
190885
+# ifdef USE_AS_STRNLEN
190885
+	/* Adjust length.  */
190885
+	addq	%rcx, %rsi
190885
+# endif
190885
+
190885
+	.p2align 4
190885
+L(loop_4x_vec):
190885
+	/* Compare 4 * VEC at a time forward.  */
190885
+	VMOVA	(%rdi), %YMM1
190885
+	VMOVA	VEC_SIZE(%rdi), %YMM2
190885
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM3
190885
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM4
190885
+
190885
+	VPMINU	%YMM1, %YMM2, %YMM5
190885
+	VPMINU	%YMM3, %YMM4, %YMM6
190885
+
190885
+	VPMINU	%YMM5, %YMM6, %YMM5
190885
+	VPCMP	$0, %YMM5, %YMMZERO, %k0
190885
+	ktestd	%k0, %k0
190885
+	jnz	L(4x_vec_end)
190885
+
190885
+	addq	$(VEC_SIZE * 4), %rdi
190885
+
190885
+# ifndef USE_AS_STRNLEN
190885
+	jmp	L(loop_4x_vec)
190885
+# else
190885
+	subq	$(VEC_SIZE * 4), %rsi
190885
+	ja	L(loop_4x_vec)
190885
+
190885
+L(last_4x_vec_or_less):
190885
+	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
190885
+	addl	$(VEC_SIZE * 2), %esi
190885
+	jle	L(last_2x_vec)
190885
+
190885
+	VPCMP	$0, (%rdi), %YMMZERO, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x0)
190885
+
190885
+	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x1)
190885
+
190885
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x2_check)
190885
+	subl	$VEC_SIZE, %esi
190885
+	jle	L(max)
190885
+
190885
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x3_check)
190885
+	movq	%r8, %rax
190885
+#  ifdef USE_AS_WCSLEN
190885
+	shrq	$2, %rax
190885
+#  endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(last_2x_vec):
190885
+	addl	$(VEC_SIZE * 2), %esi
190885
+
190885
+	VPCMP	$0, (%rdi), %YMMZERO, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x0_check)
190885
+	subl	$VEC_SIZE, %esi
190885
+	jle	L(max)
190885
+
190885
+	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x1_check)
190885
+	movq	%r8, %rax
190885
+#  ifdef USE_AS_WCSLEN
190885
+	shrq	$2, %rax
190885
+#  endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(first_vec_x0_check):
190885
+	tzcntl	%eax, %eax
190885
+# ifdef USE_AS_WCSLEN
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %eax
190885
+# endif
190885
+	/* Check the end of data.  */
190885
+	cmpq	%rax, %rsi
190885
+	jbe	L(max)
190885
+	addq	%rdi, %rax
190885
+	subq	%rdx, %rax
190885
+#  ifdef USE_AS_WCSLEN
190885
+	shrq	$2, %rax
190885
+#  endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(first_vec_x1_check):
190885
+	tzcntl	%eax, %eax
190885
+# ifdef USE_AS_WCSLEN
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %eax
190885
+# endif
190885
+	/* Check the end of data.  */
190885
+	cmpq	%rax, %rsi
190885
+	jbe	L(max)
190885
+	addq	$VEC_SIZE, %rax
190885
+	addq	%rdi, %rax
190885
+	subq	%rdx, %rax
190885
+#  ifdef USE_AS_WCSLEN
190885
+	shrq	$2, %rax
190885
+#  endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(first_vec_x2_check):
190885
+	tzcntl	%eax, %eax
190885
+# ifdef USE_AS_WCSLEN
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %eax
190885
+# endif
190885
+	/* Check the end of data.  */
190885
+	cmpq	%rax, %rsi
190885
+	jbe	L(max)
190885
+	addq	$(VEC_SIZE * 2), %rax
190885
+	addq	%rdi, %rax
190885
+	subq	%rdx, %rax
190885
+#  ifdef USE_AS_WCSLEN
190885
+	shrq	$2, %rax
190885
+#  endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(first_vec_x3_check):
190885
+	tzcntl	%eax, %eax
190885
+# ifdef USE_AS_WCSLEN
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %eax
190885
+# endif
190885
+	/* Check the end of data.  */
190885
+	cmpq	%rax, %rsi
190885
+	jbe	L(max)
190885
+	addq	$(VEC_SIZE * 3), %rax
190885
+	addq	%rdi, %rax
190885
+	subq	%rdx, %rax
190885
+#  ifdef USE_AS_WCSLEN
190885
+	shrq	$2, %rax
190885
+#  endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(max):
190885
+	movq	%r8, %rax
190885
+#  ifdef USE_AS_WCSLEN
190885
+	shrq	$2, %rax
190885
+#  endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(zero):
190885
+	xorl	%eax, %eax
190885
+	ret
190885
+# endif
190885
+
190885
+	.p2align 4
190885
+L(first_vec_x0):
190885
+	tzcntl	%eax, %eax
190885
+# ifdef USE_AS_WCSLEN
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %eax
190885
+# endif
190885
+	addq	%rdi, %rax
190885
+	subq	%rdx, %rax
190885
+# ifdef USE_AS_WCSLEN
190885
+	shrq	$2, %rax
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(first_vec_x1):
190885
+	tzcntl	%eax, %eax
190885
+# ifdef USE_AS_WCSLEN
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %eax
190885
+# endif
190885
+	addq	$VEC_SIZE, %rax
190885
+	addq	%rdi, %rax
190885
+	subq	%rdx, %rax
190885
+# ifdef USE_AS_WCSLEN
190885
+	shrq	$2, %rax
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(first_vec_x2):
190885
+	tzcntl	%eax, %eax
190885
+# ifdef USE_AS_WCSLEN
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %eax
190885
+# endif
190885
+	addq	$(VEC_SIZE * 2), %rax
190885
+	addq	%rdi, %rax
190885
+	subq	%rdx, %rax
190885
+# ifdef USE_AS_WCSLEN
190885
+	shrq	$2, %rax
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(4x_vec_end):
190885
+	VPCMP	$0, %YMM1, %YMMZERO, %k0
190885
+	kmovd	%k0, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x0)
190885
+	VPCMP	$0, %YMM2, %YMMZERO, %k1
190885
+	kmovd	%k1, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x1)
190885
+	VPCMP	$0, %YMM3, %YMMZERO, %k2
190885
+	kmovd	%k2, %eax
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec_x2)
190885
+	VPCMP	$0, %YMM4, %YMMZERO, %k3
190885
+	kmovd	%k3, %eax
190885
+L(first_vec_x3):
190885
+	tzcntl	%eax, %eax
190885
+# ifdef USE_AS_WCSLEN
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	sall	$2, %eax
190885
+# endif
190885
+	addq	$(VEC_SIZE * 3), %rax
190885
+	addq	%rdi, %rax
190885
+	subq	%rdx, %rax
190885
+# ifdef USE_AS_WCSLEN
190885
+	shrq	$2, %rax
190885
+# endif
190885
+	ret
190885
+
190885
+END (STRLEN)
190885
+#endif
190885
diff --git a/sysdeps/x86_64/multiarch/strncmp-evex.S b/sysdeps/x86_64/multiarch/strncmp-evex.S
190885
new file mode 100644
190885
index 00000000..a1d53e8c
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/strncmp-evex.S
190885
@@ -0,0 +1,3 @@
190885
+#define STRCMP	__strncmp_evex
190885
+#define USE_AS_STRNCMP 1
190885
+#include "strcmp-evex.S"
190885
diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
190885
index 686d654f..4c15542f 100644
190885
--- a/sysdeps/x86_64/multiarch/strncmp.c
190885
+++ b/sysdeps/x86_64/multiarch/strncmp.c
190885
@@ -30,16 +30,25 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
190885
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
190885
 
190885
 static inline void *
190885
 IFUNC_SELECTOR (void)
190885
 {
190885
   const struct cpu_features* cpu_features = __get_cpu_features ();
190885
 
190885
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
190885
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
190885
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
190885
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
190885
-    return OPTIMIZE (avx2);
190885
+    {
190885
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
190885
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
190885
+	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
190885
+	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
190885
+	return OPTIMIZE (evex);
190885
+
190885
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
190885
+	return OPTIMIZE (avx2);
190885
+    }
190885
 
190885
   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
190885
       && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
190885
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
190885
new file mode 100644
190885
index 00000000..722022f3
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
190885
@@ -0,0 +1,4 @@
190885
+#define STRLEN __strnlen_evex
190885
+#define USE_AS_STRNLEN 1
190885
+
190885
+#include "strlen-evex.S"
190885
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
190885
new file mode 100644
190885
index 00000000..f920b5a5
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
190885
@@ -0,0 +1,265 @@
190885
+/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
190885
+   Copyright (C) 2021 Free Software Foundation, Inc.
190885
+   This file is part of the GNU C Library.
190885
+
190885
+   The GNU C Library is free software; you can redistribute it and/or
190885
+   modify it under the terms of the GNU Lesser General Public
190885
+   License as published by the Free Software Foundation; either
190885
+   version 2.1 of the License, or (at your option) any later version.
190885
+
190885
+   The GNU C Library is distributed in the hope that it will be useful,
190885
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
190885
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
190885
+   Lesser General Public License for more details.
190885
+
190885
+   You should have received a copy of the GNU Lesser General Public
190885
+   License along with the GNU C Library; if not, see
190885
+   <https://www.gnu.org/licenses/>.  */
190885
+
190885
+#if IS_IN (libc)
190885
+
190885
+# include <sysdep.h>
190885
+
190885
+# ifndef STRRCHR
190885
+#  define STRRCHR	__strrchr_evex
190885
+# endif
190885
+
190885
+# define VMOVU		vmovdqu64
190885
+# define VMOVA		vmovdqa64
190885
+
190885
+# ifdef USE_AS_WCSRCHR
190885
+#  define VPBROADCAST	vpbroadcastd
190885
+#  define VPCMP		vpcmpd
190885
+#  define SHIFT_REG	r8d
190885
+# else
190885
+#  define VPBROADCAST	vpbroadcastb
190885
+#  define VPCMP		vpcmpb
190885
+#  define SHIFT_REG	ecx
190885
+# endif
190885
+
190885
+# define XMMZERO	xmm16
190885
+# define YMMZERO	ymm16
190885
+# define YMMMATCH	ymm17
190885
+# define YMM1		ymm18
190885
+
190885
+# define VEC_SIZE	32
190885
+
190885
+	.section .text.evex,"ax",@progbits
190885
+ENTRY (STRRCHR)
190885
+	movl	%edi, %ecx
190885
+	/* Broadcast CHAR to YMMMATCH.  */
190885
+	VPBROADCAST %esi, %YMMMATCH
190885
+
190885
+	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
190885
+
190885
+	/* Check if we may cross page boundary with one vector load.  */
190885
+	andl	$(2 * VEC_SIZE - 1), %ecx
190885
+	cmpl	$VEC_SIZE, %ecx
190885
+	ja	L(cros_page_boundary)
190885
+
190885
+	VMOVU	(%rdi), %YMM1
190885
+
190885
+	/* Each bit in K0 represents a null byte in YMM1.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM1, %k0
190885
+	/* Each bit in K1 represents a CHAR in YMM1.  */
190885
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
190885
+	kmovd	%k0, %ecx
190885
+	kmovd	%k1, %eax
190885
+
190885
+	addq	$VEC_SIZE, %rdi
190885
+
190885
+	testl	%eax, %eax
190885
+	jnz	L(first_vec)
190885
+
190885
+	testl	%ecx, %ecx
190885
+	jnz	L(return_null)
190885
+
190885
+	andq	$-VEC_SIZE, %rdi
190885
+	xorl	%edx, %edx
190885
+	jmp	L(aligned_loop)
190885
+
190885
+	.p2align 4
190885
+L(first_vec):
190885
+	/* Check if there is a null byte.  */
190885
+	testl	%ecx, %ecx
190885
+	jnz	L(char_and_nul_in_first_vec)
190885
+
190885
+	/* Remember the match and keep searching.  */
190885
+	movl	%eax, %edx
190885
+	movq	%rdi, %rsi
190885
+	andq	$-VEC_SIZE, %rdi
190885
+	jmp	L(aligned_loop)
190885
+
190885
+	.p2align 4
190885
+L(cros_page_boundary):
190885
+	andl	$(VEC_SIZE - 1), %ecx
190885
+	andq	$-VEC_SIZE, %rdi
190885
+
190885
+# ifdef USE_AS_WCSRCHR
190885
+	/* NB: Divide shift count by 4 since each bit in K1 represent 4
190885
+	   bytes.  */
190885
+	movl	%ecx, %SHIFT_REG
190885
+	sarl	$2, %SHIFT_REG
190885
+# endif
190885
+
190885
+	VMOVA	(%rdi), %YMM1
190885
+
190885
+	/* Each bit in K0 represents a null byte in YMM1.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM1, %k0
190885
+	/* Each bit in K1 represents a CHAR in YMM1.  */
190885
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
190885
+	kmovd	%k0, %edx
190885
+	kmovd	%k1, %eax
190885
+
190885
+	shrxl	%SHIFT_REG, %edx, %edx
190885
+	shrxl	%SHIFT_REG, %eax, %eax
190885
+	addq	$VEC_SIZE, %rdi
190885
+
190885
+	/* Check if there is a CHAR.  */
190885
+	testl	%eax, %eax
190885
+	jnz	L(found_char)
190885
+
190885
+	testl	%edx, %edx
190885
+	jnz	L(return_null)
190885
+
190885
+	jmp	L(aligned_loop)
190885
+
190885
+	.p2align 4
190885
+L(found_char):
190885
+	testl	%edx, %edx
190885
+	jnz	L(char_and_nul)
190885
+
190885
+	/* Remember the match and keep searching.  */
190885
+	movl	%eax, %edx
190885
+	leaq	(%rdi, %rcx), %rsi
190885
+
190885
+	.p2align 4
190885
+L(aligned_loop):
190885
+	VMOVA	(%rdi), %YMM1
190885
+	addq	$VEC_SIZE, %rdi
190885
+
190885
+	/* Each bit in K0 represents a null byte in YMM1.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM1, %k0
190885
+	/* Each bit in K1 represents a CHAR in YMM1.  */
190885
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
190885
+	kmovd	%k0, %ecx
190885
+	kmovd	%k1, %eax
190885
+	orl	%eax, %ecx
190885
+	jnz	L(char_nor_null)
190885
+
190885
+	VMOVA	(%rdi), %YMM1
190885
+	add	$VEC_SIZE, %rdi
190885
+
190885
+	/* Each bit in K0 represents a null byte in YMM1.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM1, %k0
190885
+	/* Each bit in K1 represents a CHAR in YMM1.  */
190885
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
190885
+	kmovd	%k0, %ecx
190885
+	kmovd	%k1, %eax
190885
+	orl	%eax, %ecx
190885
+	jnz	L(char_nor_null)
190885
+
190885
+	VMOVA	(%rdi), %YMM1
190885
+	addq	$VEC_SIZE, %rdi
190885
+
190885
+	/* Each bit in K0 represents a null byte in YMM1.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM1, %k0
190885
+	/* Each bit in K1 represents a CHAR in YMM1.  */
190885
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
190885
+	kmovd	%k0, %ecx
190885
+	kmovd	%k1, %eax
190885
+	orl	%eax, %ecx
190885
+	jnz	L(char_nor_null)
190885
+
190885
+	VMOVA	(%rdi), %YMM1
190885
+	addq	$VEC_SIZE, %rdi
190885
+
190885
+	/* Each bit in K0 represents a null byte in YMM1.  */
190885
+	VPCMP	$0, %YMMZERO, %YMM1, %k0
190885
+	/* Each bit in K1 represents a CHAR in YMM1.  */
190885
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
190885
+	kmovd	%k0, %ecx
190885
+	kmovd	%k1, %eax
190885
+	orl	%eax, %ecx
190885
+	jz	L(aligned_loop)
190885
+
190885
+	.p2align 4
190885
+L(char_nor_null):
190885
+	/* Find a CHAR or a null byte in a loop.  */
190885
+	testl	%eax, %eax
190885
+	jnz	L(match)
190885
+L(return_value):
190885
+	testl	%edx, %edx
190885
+	jz	L(return_null)
190885
+	movl	%edx, %eax
190885
+	movq	%rsi, %rdi
190885
+	bsrl	%eax, %eax
190885
+# ifdef USE_AS_WCSRCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
190885
+# else
190885
+	leaq	-VEC_SIZE(%rdi, %rax), %rax
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(match):
190885
+	/* Find a CHAR.  Check if there is a null byte.  */
190885
+	kmovd	%k0, %ecx
190885
+	testl	%ecx, %ecx
190885
+	jnz	L(find_nul)
190885
+
190885
+	/* Remember the match and keep searching.  */
190885
+	movl	%eax, %edx
190885
+	movq	%rdi, %rsi
190885
+	jmp	L(aligned_loop)
190885
+
190885
+	.p2align 4
190885
+L(find_nul):
190885
+	/* Mask out any matching bits after the null byte.  */
190885
+	movl	%ecx, %r8d
190885
+	subl	$1, %r8d
190885
+	xorl	%ecx, %r8d
190885
+	andl	%r8d, %eax
190885
+	testl	%eax, %eax
190885
+	/* If there is no CHAR here, return the remembered one.  */
190885
+	jz	L(return_value)
190885
+	bsrl	%eax, %eax
190885
+# ifdef USE_AS_WCSRCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
190885
+# else
190885
+	leaq	-VEC_SIZE(%rdi, %rax), %rax
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(char_and_nul):
190885
+	/* Find both a CHAR and a null byte.  */
190885
+	addq	%rcx, %rdi
190885
+	movl	%edx, %ecx
190885
+L(char_and_nul_in_first_vec):
190885
+	/* Mask out any matching bits after the null byte.  */
190885
+	movl	%ecx, %r8d
190885
+	subl	$1, %r8d
190885
+	xorl	%ecx, %r8d
190885
+	andl	%r8d, %eax
190885
+	testl	%eax, %eax
190885
+	/* Return null pointer if the null byte comes first.  */
190885
+	jz	L(return_null)
190885
+	bsrl	%eax, %eax
190885
+# ifdef USE_AS_WCSRCHR
190885
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
190885
+	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
190885
+# else
190885
+	leaq	-VEC_SIZE(%rdi, %rax), %rax
190885
+# endif
190885
+	ret
190885
+
190885
+	.p2align 4
190885
+L(return_null):
190885
+	xorl	%eax, %eax
190885
+	ret
190885
+
190885
+END (STRRCHR)
190885
+#endif
190885
diff --git a/sysdeps/x86_64/multiarch/wcschr-evex.S b/sysdeps/x86_64/multiarch/wcschr-evex.S
190885
new file mode 100644
190885
index 00000000..7cb8f1e4
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/wcschr-evex.S
190885
@@ -0,0 +1,3 @@
190885
+#define STRCHR __wcschr_evex
190885
+#define USE_AS_WCSCHR 1
190885
+#include "strchr-evex.S"
190885
diff --git a/sysdeps/x86_64/multiarch/wcscmp-evex.S b/sysdeps/x86_64/multiarch/wcscmp-evex.S
190885
new file mode 100644
190885
index 00000000..42e73e51
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/wcscmp-evex.S
190885
@@ -0,0 +1,4 @@
190885
+#define STRCMP __wcscmp_evex
190885
+#define USE_AS_WCSCMP 1
190885
+
190885
+#include "strcmp-evex.S"
190885
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex.S b/sysdeps/x86_64/multiarch/wcslen-evex.S
190885
new file mode 100644
190885
index 00000000..bdafa83b
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/wcslen-evex.S
190885
@@ -0,0 +1,4 @@
190885
+#define STRLEN __wcslen_evex
190885
+#define USE_AS_WCSLEN 1
190885
+
190885
+#include "strlen-evex.S"
190885
diff --git a/sysdeps/x86_64/multiarch/wcsncmp-evex.S b/sysdeps/x86_64/multiarch/wcsncmp-evex.S
190885
new file mode 100644
190885
index 00000000..8a8e3107
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/wcsncmp-evex.S
190885
@@ -0,0 +1,5 @@
190885
+#define STRCMP __wcsncmp_evex
190885
+#define USE_AS_STRNCMP 1
190885
+#define USE_AS_WCSCMP 1
190885
+
190885
+#include "strcmp-evex.S"
190885
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
190885
new file mode 100644
190885
index 00000000..24773bb4
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
190885
@@ -0,0 +1,5 @@
190885
+#define STRLEN __wcsnlen_evex
190885
+#define USE_AS_WCSLEN 1
190885
+#define USE_AS_STRNLEN 1
190885
+
190885
+#include "strlen-evex.S"
190885
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
190885
index b3144c93..84254b83 100644
190885
--- a/sysdeps/x86_64/multiarch/wcsnlen.c
190885
+++ b/sysdeps/x86_64/multiarch/wcsnlen.c
190885
@@ -29,16 +29,24 @@
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
190885
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
190885
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
190885
 
190885
 static inline void *
190885
 IFUNC_SELECTOR (void)
190885
 {
190885
   const struct cpu_features* cpu_features = __get_cpu_features ();
190885
 
190885
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
190885
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
190885
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
190885
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
190885
-    return OPTIMIZE (avx2);
190885
+    {
190885
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
190885
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
190885
+	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
190885
+	return OPTIMIZE (evex);
190885
+
190885
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
190885
+	return OPTIMIZE (avx2);
190885
+    }
190885
 
190885
   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
190885
     return OPTIMIZE (sse4_1);
190885
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
190885
new file mode 100644
190885
index 00000000..c64602f7
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
190885
@@ -0,0 +1,3 @@
190885
+#define STRRCHR __wcsrchr_evex
190885
+#define USE_AS_WCSRCHR 1
190885
+#include "strrchr-evex.S"
190885
diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S
190885
new file mode 100644
190885
index 00000000..06cd0f9f
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S
190885
@@ -0,0 +1,4 @@
190885
+#define MEMCHR __wmemchr_evex
190885
+#define USE_AS_WMEMCHR 1
190885
+
190885
+#include "memchr-evex.S"
190885
-- 
190885
GitLab
190885