190885
From adf509678af2cf861f3e6d34aba8d062cfc27bf5 Mon Sep 17 00:00:00 2001
190885
From: Noah Goldstein <goldstein.w.n@gmail.com>
190885
Date: Tue, 4 May 2021 19:02:40 -0400
190885
Subject: [PATCH] x86: Add EVEX optimized memchr family not safe for RTM
190885
190885
No bug.
190885
190885
This commit adds a new implementation for EVEX memchr that is not safe
190885
for RTM because it uses vzeroupper. The benefit is that by using
190885
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
190885
faster than the RTM safe version which cannot use vpcmpeq because
190885
there is no EVEX encoding for the instruction. All parts of the
190885
implementation aside from the 4x loop are the same for the two
190885
versions and the optimization is only relevant for large sizes.
190885
190885
Tigerlake:
190885
size  , algn  , Pos   , Cur T , New T , Win     , Dif
190885
512   , 6     , 192   , 9.2   , 9.04  , no-RTM  , 0.16
190885
512   , 7     , 224   , 9.19  , 8.98  , no-RTM  , 0.21
190885
2048  , 0     , 256   , 10.74 , 10.54 , no-RTM  , 0.2
190885
2048  , 0     , 512   , 14.81 , 14.87 , RTM     , 0.06
190885
2048  , 0     , 1024  , 22.97 , 22.57 , no-RTM  , 0.4
190885
2048  , 0     , 2048  , 37.49 , 34.51 , no-RTM  , 2.98   <--
190885
190885
Icelake:
190885
size  , algn  , Pos   , Cur T , New T , Win     , Dif
190885
512   , 6     , 192   , 7.6   , 7.3   , no-RTM  , 0.3
190885
512   , 7     , 224   , 7.63  , 7.27  , no-RTM  , 0.36
190885
2048  , 0     , 256   , 8.48  , 8.38  , no-RTM  , 0.1
190885
2048  , 0     , 512   , 11.57 , 11.42 , no-RTM  , 0.15
190885
2048  , 0     , 1024  , 17.92 , 17.38 , no-RTM  , 0.54
190885
2048  , 0     , 2048  , 30.37 , 27.34 , no-RTM  , 3.03   <--
190885
190885
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
190885
190885
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
190885
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
190885
(cherry picked from commit 104c7b1967c3e78435c6f7eab5e225a7eddf9c6e)
190885
---
190885
 sysdeps/x86_64/multiarch/Makefile             |   7 +-
190885
 sysdeps/x86_64/multiarch/ifunc-evex.h         |  55 ++++++
190885
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  15 ++
190885
 sysdeps/x86_64/multiarch/memchr-evex-rtm.S    |   8 +
190885
 sysdeps/x86_64/multiarch/memchr-evex.S        | 161 ++++++++++++++----
190885
 sysdeps/x86_64/multiarch/memchr.c             |   2 +-
190885
 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S |   3 +
190885
 sysdeps/x86_64/multiarch/rawmemchr.c          |   2 +-
190885
 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S   |   3 +
190885
 sysdeps/x86_64/multiarch/wmemchr.c            |   2 +-
190885
 10 files changed, 217 insertions(+), 41 deletions(-)
190885
 create mode 100644 sysdeps/x86_64/multiarch/ifunc-evex.h
190885
 create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-rtm.S
190885
 create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
190885
 create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
190885
190885
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
190885
index 65fde4eb..26be4095 100644
190885
--- a/sysdeps/x86_64/multiarch/Makefile
190885
+++ b/sysdeps/x86_64/multiarch/Makefile
190885
@@ -77,7 +77,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
190885
 		   strncmp-evex \
190885
 		   strncpy-evex \
190885
 		   strnlen-evex \
190885
-		   strrchr-evex
190885
+		   strrchr-evex \
190885
+		   memchr-evex-rtm \
190885
+		   rawmemchr-evex-rtm
190885
 CFLAGS-varshift.c += -msse4
190885
 CFLAGS-strcspn-c.c += -msse4
190885
 CFLAGS-strpbrk-c.c += -msse4
190885
@@ -110,7 +112,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
190885
 		   wcsnlen-evex \
190885
 		   wcsrchr-evex \
190885
 		   wmemchr-evex \
190885
-		   wmemcmp-evex-movbe
190885
+		   wmemcmp-evex-movbe \
190885
+		   wmemchr-evex-rtm
190885
 endif
190885
 
190885
 ifeq ($(subdir),debug)
190885
diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h
190885
new file mode 100644
190885
index 00000000..fc391edb
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/ifunc-evex.h
190885
@@ -0,0 +1,55 @@
190885
+/* Common definition for ifunc selection optimized with EVEX.
190885
+   All versions must be listed in ifunc-impl-list.c.
190885
+   Copyright (C) 2017-2021 Free Software Foundation, Inc.
190885
+   This file is part of the GNU C Library.
190885
+
190885
+   The GNU C Library is free software; you can redistribute it and/or
190885
+   modify it under the terms of the GNU Lesser General Public
190885
+   License as published by the Free Software Foundation; either
190885
+   version 2.1 of the License, or (at your option) any later version.
190885
+
190885
+   The GNU C Library is distributed in the hope that it will be useful,
190885
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
190885
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
190885
+   Lesser General Public License for more details.
190885
+
190885
+   You should have received a copy of the GNU Lesser General Public
190885
+   License along with the GNU C Library; if not, see
190885
+   <https://www.gnu.org/licenses/>.  */
190885
+
190885
+#include <init-arch.h>
190885
+
190885
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
190885
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
190885
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
190885
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
190885
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden;
190885
+
190885
+
190885
+static inline void *
190885
+IFUNC_SELECTOR (void)
190885
+{
190885
+  const struct cpu_features* cpu_features = __get_cpu_features ();
190885
+
190885
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
190885
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
190885
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
190885
+    {
190885
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
190885
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
190885
+	{
190885
+	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
190885
+	    return OPTIMIZE (evex_rtm);
190885
+
190885
+	  return OPTIMIZE (evex);
190885
+	}
190885
+
190885
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
190885
+	return OPTIMIZE (avx2_rtm);
190885
+
190885
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
190885
+	return OPTIMIZE (avx2);
190885
+    }
190885
+
190885
+  return OPTIMIZE (sse2);
190885
+}
190885
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
190885
index d59d65f8..ac097e8d 100644
190885
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
190885
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
190885
@@ -52,6 +52,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 			       && CPU_FEATURE_USABLE (AVX512BW)
190885
 			       && CPU_FEATURE_USABLE (BMI2)),
190885
 			      __memchr_evex)
190885
+	      IFUNC_IMPL_ADD (array, i, memchr,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)
190885
+			       && CPU_FEATURE_USABLE (BMI2)),
190885
+			      __memchr_evex_rtm)
190885
 	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
190885
 
190885
   /* Support sysdeps/x86_64/multiarch/memcmp.c.  */
190885
@@ -288,6 +293,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 			       && CPU_FEATURE_USABLE (AVX512BW)
190885
 			       && CPU_FEATURE_USABLE (BMI2)),
190885
 			      __rawmemchr_evex)
190885
+	      IFUNC_IMPL_ADD (array, i, rawmemchr,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)
190885
+			       && CPU_FEATURE_USABLE (BMI2)),
190885
+			      __rawmemchr_evex_rtm)
190885
 	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
190885
 
190885
   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
190885
@@ -711,6 +721,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
190885
 			       && CPU_FEATURE_USABLE (AVX512BW)
190885
 			       && CPU_FEATURE_USABLE (BMI2)),
190885
 			      __wmemchr_evex)
190885
+	      IFUNC_IMPL_ADD (array, i, wmemchr,
190885
+			      (CPU_FEATURE_USABLE (AVX512VL)
190885
+			       && CPU_FEATURE_USABLE (AVX512BW)
190885
+			       && CPU_FEATURE_USABLE (BMI2)),
190885
+			      __wmemchr_evex_rtm)
190885
 	      IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
190885
 
190885
   /* Support sysdeps/x86_64/multiarch/wmemcmp.c.  */
190885
diff --git a/sysdeps/x86_64/multiarch/memchr-evex-rtm.S b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
190885
new file mode 100644
190885
index 00000000..19871882
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
190885
@@ -0,0 +1,8 @@
190885
+#ifndef MEMCHR
190885
+# define MEMCHR __memchr_evex_rtm
190885
+#endif
190885
+
190885
+#define USE_IN_RTM 1
190885
+#define SECTION(p) p##.evex.rtm
190885
+
190885
+#include "memchr-evex.S"
190885
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
190885
index f3fdad4f..4d0ed6d1 100644
190885
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
190885
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
190885
@@ -38,10 +38,32 @@
190885
 #  define CHAR_SIZE	1
190885
 # endif
190885
 
190885
+	/* In the 4x loop the RTM and non-RTM versions have data pointer
190885
+	   off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
190885
+	   This is represented by BASE_OFFSET. As well because the RTM
190885
+	   version uses vpcmp which stores a bit per element compared where
190885
+	   the non-RTM version uses vpcmpeq which stores a bit per byte
190885
+	   compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
190885
+	   version.  */
190885
+# ifdef USE_IN_RTM
190885
+#  define VZEROUPPER
190885
+#  define BASE_OFFSET	(VEC_SIZE * 4)
190885
+#  define RET_SCALE	CHAR_SIZE
190885
+# else
190885
+#  define VZEROUPPER	vzeroupper
190885
+#  define BASE_OFFSET	0
190885
+#  define RET_SCALE	1
190885
+# endif
190885
+
190885
+	/* In the return from 4x loop memchr and rawmemchr versions have
190885
+	   data pointers off by VEC_SIZE * 4 with memchr version being
190885
+	   VEC_SIZE * 4 greater.  */
190885
 # ifdef USE_AS_RAWMEMCHR
190885
+#  define RET_OFFSET	(BASE_OFFSET - (VEC_SIZE * 4))
190885
 #  define RAW_PTR_REG	rcx
190885
 #  define ALGN_PTR_REG	rdi
190885
 # else
190885
+#  define RET_OFFSET	BASE_OFFSET
190885
 #  define RAW_PTR_REG	rdi
190885
 #  define ALGN_PTR_REG	rcx
190885
 # endif
190885
@@ -57,11 +79,15 @@
190885
 # define YMM5		ymm21
190885
 # define YMM6		ymm22
190885
 
190885
+# ifndef SECTION
190885
+#  define SECTION(p)	p##.evex
190885
+# endif
190885
+
190885
 # define VEC_SIZE 32
190885
 # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
190885
 # define PAGE_SIZE 4096
190885
 
190885
-	.section .text.evex,"ax",@progbits
190885
+	.section SECTION(.text),"ax",@progbits
190885
 ENTRY (MEMCHR)
190885
 # ifndef USE_AS_RAWMEMCHR
190885
 	/* Check for zero length.  */
190885
@@ -237,14 +263,15 @@ L(cross_page_continue):
190885
 	/* Check if at last CHAR_PER_VEC * 4 length.  */
190885
 	subq	$(CHAR_PER_VEC * 4), %rdx
190885
 	jbe	L(last_4x_vec_or_less_cmpeq)
190885
-	addq	$VEC_SIZE, %rdi
190885
+	/* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
190885
+	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
190885
 
190885
 	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
190885
 	 */
190885
 #  ifdef USE_AS_WMEMCHR
190885
 	movl	%edi, %ecx
190885
 	andq	$-(4 * VEC_SIZE), %rdi
190885
-	andl	$(VEC_SIZE * 4 - 1), %ecx
190885
+	subl	%edi, %ecx
190885
 	/* NB: Divide bytes by 4 to get the wchar_t count.  */
190885
 	sarl	$2, %ecx
190885
 	addq	%rcx, %rdx
190885
@@ -254,15 +281,28 @@ L(cross_page_continue):
190885
 	subq	%rdi, %rdx
190885
 #  endif
190885
 # else
190885
-	addq	$VEC_SIZE, %rdi
190885
+	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
190885
 	andq	$-(4 * VEC_SIZE), %rdi
190885
 # endif
190885
-
190885
+# ifdef USE_IN_RTM
190885
 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
190885
+# else
190885
+	/* Copy YMMMATCH to ymm0 so we can use vpcmpeq which is not
190885
+	   encodable with EVEX registers (ymm16-ymm31).  */
190885
+	vmovdqa64 %YMMMATCH, %ymm0
190885
+# endif
190885
 
190885
 	/* Compare 4 * VEC at a time forward.  */
190885
 	.p2align 4
190885
 L(loop_4x_vec):
190885
+	/* Two versions of the loop. One that does not require
190885
+	   vzeroupper by not using ymm0-ymm15 and another that does require
190885
+	   vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
190885
+	   is used at all is because there is no EVEX encoding vpcmpeq and
190885
+	   with vpcmpeq this loop can be performed more efficiently. The
190885
+	   non-vzeroupper version is safe for RTM while the vzeroupper
190885
+	   version should be preferred if RTM is not supported.  */
190885
+# ifdef USE_IN_RTM
190885
 	/* It would be possible to save some instructions using 4x VPCMP
190885
 	   but bottleneck on port 5 makes it not woth it.  */
190885
 	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
190885
@@ -273,12 +313,55 @@ L(loop_4x_vec):
190885
 	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
190885
 	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
190885
 	VPCMP	$0, %YMM3, %YMMZERO, %k2
190885
+# else
190885
+	/* Since vptern can only take 3x vectors fastest to do 1 vec
190885
+	   separately with EVEX vpcmp.  */
190885
+#  ifdef USE_AS_WMEMCHR
190885
+	/* vptern can only accept masks for epi32/epi64 so can only save
190885
+	   instruction using not equals mask on vptern with wmemchr.  */
190885
+	VPCMP	$4, (%rdi), %YMMMATCH, %k1
190885
+#  else
190885
+	VPCMP	$0, (%rdi), %YMMMATCH, %k1
190885
+#  endif
190885
+	/* Compare 3x with vpcmpeq and or them all together with vptern.
190885
+	 */
190885
+	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm2
190885
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
190885
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
190885
+#  ifdef USE_AS_WMEMCHR
190885
+	/* This takes the not of or between ymm2, ymm3, ymm4 as well as
190885
+	   combines result from VEC0 with zero mask.  */
190885
+	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
190885
+	vpmovmskb %ymm4, %ecx
190885
+#  else
190885
+	/* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4.  */
190885
+	vpternlogd $254, %ymm2, %ymm3, %ymm4
190885
+	vpmovmskb %ymm4, %ecx
190885
+	kmovd	%k1, %eax
190885
+#  endif
190885
+# endif
190885
+
190885
 # ifdef USE_AS_RAWMEMCHR
190885
 	subq	$-(VEC_SIZE * 4), %rdi
190885
+# endif
190885
+# ifdef USE_IN_RTM
190885
 	kortestd %k2, %k3
190885
+# else
190885
+#  ifdef USE_AS_WMEMCHR
190885
+	/* ecx contains not of matches. All 1s means no matches. incl will
190885
+	   overflow and set zeroflag if that is the case.  */
190885
+	incl	%ecx
190885
+#  else
190885
+	/* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
190885
+	   to ecx is not an issue because if eax is non-zero it will be
190885
+	   used for returning the match. If it is zero the add does
190885
+	   nothing.  */
190885
+	addq	%rax, %rcx
190885
+#  endif
190885
+# endif
190885
+# ifdef USE_AS_RAWMEMCHR
190885
 	jz	L(loop_4x_vec)
190885
 # else
190885
-	kortestd %k2, %k3
190885
 	jnz	L(loop_4x_vec_end)
190885
 
190885
 	subq	$-(VEC_SIZE * 4), %rdi
190885
@@ -288,10 +371,11 @@ L(loop_4x_vec):
190885
 
190885
 	/* Fall through into less than 4 remaining vectors of length case.
190885
 	 */
190885
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
190885
+	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
190885
+	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
190885
 	kmovd	%k0, %eax
190885
-	addq	$(VEC_SIZE * 3), %rdi
190885
-	.p2align 4
190885
+	VZEROUPPER
190885
+
190885
 L(last_4x_vec_or_less):
190885
 	/* Check if first VEC contained match.  */
190885
 	testl	%eax, %eax
190885
@@ -338,73 +422,78 @@ L(loop_4x_vec_end):
190885
 	/* rawmemchr will fall through into this if match was found in
190885
 	   loop.  */
190885
 
190885
+# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
190885
 	/* k1 has not of matches with VEC1.  */
190885
 	kmovd	%k1, %eax
190885
-# ifdef USE_AS_WMEMCHR
190885
+#  ifdef USE_AS_WMEMCHR
190885
 	subl	$((1 << CHAR_PER_VEC) - 1), %eax
190885
-# else
190885
+#  else
190885
 	incl	%eax
190885
+#  endif
190885
+# else
190885
+	/* eax already has matches for VEC1.  */
190885
+	testl	%eax, %eax
190885
 # endif
190885
 	jnz	L(last_vec_x1_return)
190885
 
190885
+# ifdef USE_IN_RTM
190885
 	VPCMP	$0, %YMM2, %YMMZERO, %k0
190885
 	kmovd	%k0, %eax
190885
+# else
190885
+	vpmovmskb %ymm2, %eax
190885
+# endif
190885
 	testl	%eax, %eax
190885
 	jnz	L(last_vec_x2_return)
190885
 
190885
+# ifdef USE_IN_RTM
190885
 	kmovd	%k2, %eax
190885
 	testl	%eax, %eax
190885
 	jnz	L(last_vec_x3_return)
190885
 
190885
 	kmovd	%k3, %eax
190885
 	tzcntl	%eax, %eax
190885
-# ifdef USE_AS_RAWMEMCHR
190885
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
190885
+	leaq	(VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
190885
 # else
190885
-	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
190885
+	vpmovmskb %ymm3, %eax
190885
+	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
190885
+	salq	$VEC_SIZE, %rcx
190885
+	orq	%rcx, %rax
190885
+	tzcntq	%rax, %rax
190885
+	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
190885
+	VZEROUPPER
190885
 # endif
190885
 	ret
190885
 
190885
 	.p2align 4
190885
 L(last_vec_x1_return):
190885
 	tzcntl	%eax, %eax
190885
-# ifdef USE_AS_RAWMEMCHR
190885
-#  ifdef USE_AS_WMEMCHR
190885
+# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
190885
 	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
190885
-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
190885
-#  else
190885
-	addq	%rdi, %rax
190885
-#  endif
190885
+	leaq	RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
190885
 # else
190885
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
190885
-	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
190885
+	addq	%rdi, %rax
190885
 # endif
190885
+	VZEROUPPER
190885
 	ret
190885
 
190885
 	.p2align 4
190885
 L(last_vec_x2_return):
190885
 	tzcntl	%eax, %eax
190885
-# ifdef USE_AS_RAWMEMCHR
190885
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
190885
-	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
190885
-# else
190885
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
190885
-	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
190885
-# endif
190885
+	/* NB: Multiply bytes by RET_SCALE to get the wchar_t count
190885
+	   if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHR and
190885
+	   USE_IN_RTM are both defined). Otherwise RET_SCALE = 1.  */
190885
+	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
190885
+	VZEROUPPER
190885
 	ret
190885
 
190885
+# ifdef USE_IN_RTM
190885
 	.p2align 4
190885
 L(last_vec_x3_return):
190885
 	tzcntl	%eax, %eax
190885
-# ifdef USE_AS_RAWMEMCHR
190885
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
190885
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
190885
-# else
190885
 	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
190885
-	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
190885
-# endif
190885
+	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
190885
 	ret
190885
-
190885
+# endif
190885
 
190885
 # ifndef USE_AS_RAWMEMCHR
190885
 L(last_4x_vec_or_less_cmpeq):
190885
diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c
190885
index 016f5784..f28aea77 100644
190885
--- a/sysdeps/x86_64/multiarch/memchr.c
190885
+++ b/sysdeps/x86_64/multiarch/memchr.c
190885
@@ -24,7 +24,7 @@
190885
 # undef memchr
190885
 
190885
 # define SYMBOL_NAME memchr
190885
-# include "ifunc-avx2.h"
190885
+# include "ifunc-evex.h"
190885
 
190885
 libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ());
190885
 strong_alias (memchr, __memchr)
190885
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
190885
new file mode 100644
190885
index 00000000..deda1ca3
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
190885
@@ -0,0 +1,3 @@
190885
+#define MEMCHR __rawmemchr_evex_rtm
190885
+#define USE_AS_RAWMEMCHR 1
190885
+#include "memchr-evex-rtm.S"
190885
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c
190885
index 8a0bc313..1f764f35 100644
190885
--- a/sysdeps/x86_64/multiarch/rawmemchr.c
190885
+++ b/sysdeps/x86_64/multiarch/rawmemchr.c
190885
@@ -26,7 +26,7 @@
190885
 # undef __rawmemchr
190885
 
190885
 # define SYMBOL_NAME rawmemchr
190885
-# include "ifunc-avx2.h"
190885
+# include "ifunc-evex.h"
190885
 
190885
 libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr,
190885
 		       IFUNC_SELECTOR ());
190885
diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
190885
new file mode 100644
190885
index 00000000..a346cd35
190885
--- /dev/null
190885
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
190885
@@ -0,0 +1,3 @@
190885
+#define MEMCHR __wmemchr_evex_rtm
190885
+#define USE_AS_WMEMCHR 1
190885
+#include "memchr-evex-rtm.S"
190885
diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c
190885
index 6d833702..f9c91915 100644
190885
--- a/sysdeps/x86_64/multiarch/wmemchr.c
190885
+++ b/sysdeps/x86_64/multiarch/wmemchr.c
190885
@@ -26,7 +26,7 @@
190885
 # undef __wmemchr
190885
 
190885
 # define SYMBOL_NAME wmemchr
190885
-# include "ifunc-avx2.h"
190885
+# include "ifunc-evex.h"
190885
 
190885
 libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ());
190885
 weak_alias (__wmemchr, wmemchr)
190885
-- 
190885
GitLab
190885