From adf509678af2cf861f3e6d34aba8d062cfc27bf5 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue, 4 May 2021 19:02:40 -0400
Subject: [PATCH] x86: Add EVEX optimized memchr family not safe for RTM

No bug.

This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.

Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--

Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--

test-memchr, test-wmemchr, and test-rawmemchr are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 104c7b1967c3e78435c6f7eab5e225a7eddf9c6e)
---
 sysdeps/x86_64/multiarch/Makefile             |   7 +-
 sysdeps/x86_64/multiarch/ifunc-evex.h         |  55 ++++++
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  15 ++
 sysdeps/x86_64/multiarch/memchr-evex-rtm.S    |   8 +
 sysdeps/x86_64/multiarch/memchr-evex.S        | 161 ++++++++++++++----
 sysdeps/x86_64/multiarch/memchr.c             |   2 +-
 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S |   3 +
 sysdeps/x86_64/multiarch/rawmemchr.c          |   2 +-
 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S   |   3 +
 sysdeps/x86_64/multiarch/wmemchr.c            |   2 +-
 10 files changed, 217 insertions(+), 41 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/ifunc-evex.h
 create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 65fde4eb..26be4095 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -77,7 +77,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
strncmp-evex \
strncpy-evex \
strnlen-evex \
- strrchr-evex
+ strrchr-evex \
+ memchr-evex-rtm \
+ rawmemchr-evex-rtm
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
@@ -110,7 +112,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wcsnlen-evex \
wcsrchr-evex \
wmemchr-evex \
- wmemcmp-evex-movbe
+ wmemcmp-evex-movbe \
+ wmemchr-evex-rtm
endif

ifeq ($(subdir),debug)
diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h
new file mode 100644
index 00000000..fc391edb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-evex.h
@@ -0,0 +1,55 @@
+/* Common definition for ifunc selection optimized with EVEX.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017-2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden;
+
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+ const struct cpu_features* cpu_features = __get_cpu_features ();
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ return OPTIMIZE (evex_rtm);
+
+ return OPTIMIZE (evex);
+ }
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
+
+ return OPTIMIZE (sse2);
+}
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d59d65f8..ac097e8d 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -52,6 +52,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__memchr_evex)
+ IFUNC_IMPL_ADD (array, i, memchr,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __memchr_evex_rtm)
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))

/* Support sysdeps/x86_64/multiarch/memcmp.c. */
@@ -288,6 +293,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__rawmemchr_evex)
+ IFUNC_IMPL_ADD (array, i, rawmemchr,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __rawmemchr_evex_rtm)
IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))

/* Support sysdeps/x86_64/multiarch/strlen.c. */
@@ -711,6 +721,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wmemchr_evex)
+ IFUNC_IMPL_ADD (array, i, wmemchr,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __wmemchr_evex_rtm)
IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))

/* Support sysdeps/x86_64/multiarch/wmemcmp.c. */
diff --git a/sysdeps/x86_64/multiarch/memchr-evex-rtm.S b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
new file mode 100644
index 00000000..19871882
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
@@ -0,0 +1,8 @@
+#ifndef MEMCHR
+# define MEMCHR __memchr_evex_rtm
+#endif
+
+#define USE_IN_RTM 1
+#define SECTION(p) p##.evex.rtm
+
+#include "memchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index f3fdad4f..4d0ed6d1 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -38,10 +38,32 @@
# define CHAR_SIZE 1
# endif

+ /* In the 4x loop the RTM and non-RTM versions have data pointer
+ off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
+ This is represented by BASE_OFFSET. As well because the RTM
+ version uses vpcmp which stores a bit per element compared where
+ the non-RTM version uses vpcmpeq which stores a bit per byte
+ compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
+ version. */
+# ifdef USE_IN_RTM
+# define VZEROUPPER
+# define BASE_OFFSET (VEC_SIZE * 4)
+# define RET_SCALE CHAR_SIZE
+# else
+# define VZEROUPPER vzeroupper
+# define BASE_OFFSET 0
+# define RET_SCALE 1
+# endif
+
+ /* In the return from 4x loop memchr and rawmemchr versions have
+ data pointers off by VEC_SIZE * 4 with memchr version being
+ VEC_SIZE * 4 greater. */
# ifdef USE_AS_RAWMEMCHR
+# define RET_OFFSET (BASE_OFFSET - (VEC_SIZE * 4))
# define RAW_PTR_REG rcx
# define ALGN_PTR_REG rdi
# else
+# define RET_OFFSET BASE_OFFSET
# define RAW_PTR_REG rdi
# define ALGN_PTR_REG rcx
# endif
@@ -57,11 +79,15 @@
# define YMM5 ymm21
# define YMM6 ymm22

+# ifndef SECTION
+# define SECTION(p) p##.evex
+# endif
+
# define VEC_SIZE 32
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
# define PAGE_SIZE 4096

- .section .text.evex,"ax",@progbits
+ .section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
@@ -237,14 +263,15 @@ L(cross_page_continue):
/* Check if at last CHAR_PER_VEC * 4 length. */
subq $(CHAR_PER_VEC * 4), %rdx
jbe L(last_4x_vec_or_less_cmpeq)
- addq $VEC_SIZE, %rdi
+ /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5. */
+ addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi

/* Align data to VEC_SIZE * 4 for the loop and readjust length.
*/
# ifdef USE_AS_WMEMCHR
movl %edi, %ecx
andq $-(4 * VEC_SIZE), %rdi
- andl $(VEC_SIZE * 4 - 1), %ecx
+ subl %edi, %ecx
/* NB: Divide bytes by 4 to get the wchar_t count. */
sarl $2, %ecx
addq %rcx, %rdx
@@ -254,15 +281,28 @@ L(cross_page_continue):
subq %rdi, %rdx
# endif
# else
- addq $VEC_SIZE, %rdi
+ addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
andq $-(4 * VEC_SIZE), %rdi
# endif
-
+# ifdef USE_IN_RTM
vpxorq %XMMZERO, %XMMZERO, %XMMZERO
+# else
+ /* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
+ encodable with EVEX registers (ymm16-ymm31). */
+ vmovdqa64 %YMMMATCH, %ymm0
+# endif

/* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
+ /* Two versions of the loop. One that does not require
+ vzeroupper by not using ymm0-ymm15 and another does that require
+ vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
+ is used at all is because there is no EVEX encoding vpcmpeq and
+ with vpcmpeq this loop can be performed more efficiently. The
+ non-vzeroupper version is safe for RTM while the vzeroupper
+ version should be prefered if RTM are not supported. */
+# ifdef USE_IN_RTM
/* It would be possible to save some instructions using 4x VPCMP
but bottleneck on port 5 makes it not woth it. */
VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
@@ -273,12 +313,55 @@ L(loop_4x_vec):
/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
VPMINU %YMM2, %YMM3, %YMM3{%k1}{z}
VPCMP $0, %YMM3, %YMMZERO, %k2
+# else
+ /* Since vptern can only take 3x vectors fastest to do 1 vec
+ seperately with EVEX vpcmp. */
+# ifdef USE_AS_WMEMCHR
+ /* vptern can only accept masks for epi32/epi64 so can only save
+ instruction using not equals mask on vptern with wmemchr. */
+ VPCMP $4, (%rdi), %YMMMATCH, %k1
+# else
+ VPCMP $0, (%rdi), %YMMMATCH, %k1
+# endif
+ /* Compare 3x with vpcmpeq and or them all together with vptern.
+ */
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+# ifdef USE_AS_WMEMCHR
+ /* This takes the not of or between ymm2, ymm3, ymm4 as well as
+ combines result from VEC0 with zero mask. */
+ vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
+ vpmovmskb %ymm4, %ecx
+# else
+ /* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4. */
+ vpternlogd $254, %ymm2, %ymm3, %ymm4
+ vpmovmskb %ymm4, %ecx
+ kmovd %k1, %eax
+# endif
+# endif
+
# ifdef USE_AS_RAWMEMCHR
subq $-(VEC_SIZE * 4), %rdi
+# endif
+# ifdef USE_IN_RTM
kortestd %k2, %k3
+# else
+# ifdef USE_AS_WMEMCHR
+ /* ecx contains not of matches. All 1s means no matches. incl will
+ overflow and set zeroflag if that is the case. */
+ incl %ecx
+# else
+ /* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
+ to ecx is not an issue because if eax is non-zero it will be
+ used for returning the match. If it is zero the add does
+ nothing. */
+ addq %rax, %rcx
+# endif
+# endif
+# ifdef USE_AS_RAWMEMCHR
jz L(loop_4x_vec)
# else
- kortestd %k2, %k3
jnz L(loop_4x_vec_end)

subq $-(VEC_SIZE * 4), %rdi
@@ -288,10 +371,11 @@ L(loop_4x_vec):

/* Fall through into less than 4 remaining vectors of length case.
*/
- VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ VPCMP $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
+ addq $(BASE_OFFSET - VEC_SIZE), %rdi
kmovd %k0, %eax
- addq $(VEC_SIZE * 3), %rdi
- .p2align 4
+ VZEROUPPER
+
L(last_4x_vec_or_less):
/* Check if first VEC contained match. */
testl %eax, %eax
@@ -338,73 +422,78 @@ L(loop_4x_vec_end):
/* rawmemchr will fall through into this if match was found in
loop. */

+# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
/* k1 has not of matches with VEC1. */
kmovd %k1, %eax
-# ifdef USE_AS_WMEMCHR
+# ifdef USE_AS_WMEMCHR
subl $((1 << CHAR_PER_VEC) - 1), %eax
-# else
+# else
incl %eax
+# endif
+# else
+ /* eax already has matches for VEC1. */
+ testl %eax, %eax
# endif
jnz L(last_vec_x1_return)

+# ifdef USE_IN_RTM
VPCMP $0, %YMM2, %YMMZERO, %k0
kmovd %k0, %eax
+# else
+ vpmovmskb %ymm2, %eax
+# endif
testl %eax, %eax
jnz L(last_vec_x2_return)

+# ifdef USE_IN_RTM
kmovd %k2, %eax
testl %eax, %eax
jnz L(last_vec_x3_return)

kmovd %k3, %eax
tzcntl %eax, %eax
-# ifdef USE_AS_RAWMEMCHR
- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ leaq (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
# else
- leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
+ vpmovmskb %ymm3, %eax
+ /* Combine matches in VEC3 (eax) with matches in VEC4 (ecx). */
+ salq $VEC_SIZE, %rcx
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+ leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
+ VZEROUPPER
# endif
ret

.p2align 4
L(last_vec_x1_return):
tzcntl %eax, %eax
-# ifdef USE_AS_RAWMEMCHR
-# ifdef USE_AS_WMEMCHR
+# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
- leaq (%rdi, %rax, CHAR_SIZE), %rax
-# else
- addq %rdi, %rax
-# endif
+ leaq RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
# else
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
- leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ addq %rdi, %rax
# endif
+ VZEROUPPER
ret

.p2align 4
L(last_vec_x2_return):
tzcntl %eax, %eax
-# ifdef USE_AS_RAWMEMCHR
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
- leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
-# else
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
- leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
-# endif
+ /* NB: Multiply bytes by RET_SCALE to get the wchar_t count
+ if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
+ USE_IN_RTM are both defined. Otherwise RET_SCALE = 1. */
+ leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
+ VZEROUPPER
ret

+# ifdef USE_IN_RTM
.p2align 4
L(last_vec_x3_return):
tzcntl %eax, %eax
-# ifdef USE_AS_RAWMEMCHR
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
-# else
/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
- leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
-# endif
+ leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
ret
-
+# endif

# ifndef USE_AS_RAWMEMCHR
L(last_4x_vec_or_less_cmpeq):
diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c
index 016f5784..f28aea77 100644
--- a/sysdeps/x86_64/multiarch/memchr.c
+++ b/sysdeps/x86_64/multiarch/memchr.c
@@ -24,7 +24,7 @@
# undef memchr

# define SYMBOL_NAME memchr
-# include "ifunc-avx2.h"
+# include "ifunc-evex.h"

libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ());
strong_alias (memchr, __memchr)
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
new file mode 100644
index 00000000..deda1ca3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
@@ -0,0 +1,3 @@
+#define MEMCHR __rawmemchr_evex_rtm
+#define USE_AS_RAWMEMCHR 1
+#include "memchr-evex-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c
index 8a0bc313..1f764f35 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr.c
+++ b/sysdeps/x86_64/multiarch/rawmemchr.c
@@ -26,7 +26,7 @@
# undef __rawmemchr

# define SYMBOL_NAME rawmemchr
-# include "ifunc-avx2.h"
+# include "ifunc-evex.h"

libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr,
IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
new file mode 100644
index 00000000..a346cd35
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
@@ -0,0 +1,3 @@
+#define MEMCHR __wmemchr_evex_rtm
+#define USE_AS_WMEMCHR 1
+#include "memchr-evex-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c
index 6d833702..f9c91915 100644
--- a/sysdeps/x86_64/multiarch/wmemchr.c
+++ b/sysdeps/x86_64/multiarch/wmemchr.c
@@ -26,7 +26,7 @@
# undef __wmemchr

# define SYMBOL_NAME wmemchr
-# include "ifunc-avx2.h"
+# include "ifunc-evex.h"

libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ());
weak_alias (__wmemchr, wmemchr)
--
GitLab