| From 22a1b88414d40b700c84689d08a6026e3fdee874 Mon Sep 17 00:00:00 2001 |
| From: "H.J. Lu" <hjl.tools@gmail.com> |
| Date: Fri, 5 Mar 2021 06:24:52 -0800 |
| Subject: [PATCH] x86-64: Add ifunc-avx2.h functions with 256-bit EVEX |
| |
| Update ifunc-avx2.h, strchr.c, strcmp.c, strncmp.c and wcsnlen.c to |
| select the function optimized with 256-bit EVEX instructions using |
| the YMM16-YMM31 registers when AVX512VL, AVX512BW and BMI2 are usable. |
| Because these registers don't require VZEROUPPER at function exit, the |
| EVEX versions avoid the RTM abort that VZEROUPPER would cause. |
| |
| For strcmp/strncmp, prefer AVX2 strcmp/strncmp if Prefer_AVX2_STRCMP |
| is set. |
| |
| (cherry picked from commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77) |
| --- |
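| The per-function ifunc selectors changed below all share one shape: prefer |
| the EVEX variant when AVX512VL, AVX512BW and BMI2 are usable, otherwise |
| fall back to AVX2 unless Prefer_No_VZEROUPPER is set, and finally to SSE2. |
| As a condensed sketch only (it restates the ifunc-avx2.h hunk below and is |
| not an additional change), using the glibc-internal feature macros: |
| |
|     static inline void * |
|     IFUNC_SELECTOR (void) |
|     { |
|       const struct cpu_features *cpu_features = __get_cpu_features (); |
| |
|       if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) |
|           && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) |
|         { |
|           /* EVEX uses YMM16-YMM31, so no VZEROUPPER and no RTM abort.  */ |
|           if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) |
|               && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) |
|               && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) |
|             return OPTIMIZE (evex); |
| |
|           /* AVX2 only where VZEROUPPER at function exit is acceptable.  */ |
|           if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) |
|             return OPTIMIZE (avx2); |
|         } |
| |
|       return OPTIMIZE (sse2); |
|     } |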
| sysdeps/x86_64/multiarch/Makefile | 21 +- |
| sysdeps/x86_64/multiarch/ifunc-avx2.h | 14 +- |
| sysdeps/x86_64/multiarch/ifunc-impl-list.c | 81 ++ |
| sysdeps/x86_64/multiarch/memchr-evex.S | 381 +++++++ |
| sysdeps/x86_64/multiarch/memrchr-evex.S | 337 +++++++ |
| sysdeps/x86_64/multiarch/rawmemchr-evex.S | 4 + |
| sysdeps/x86_64/multiarch/strchr-evex.S | 335 +++++++ |
| sysdeps/x86_64/multiarch/strchr.c | 14 +- |
| sysdeps/x86_64/multiarch/strchrnul-evex.S | 3 + |
| sysdeps/x86_64/multiarch/strcmp-evex.S | 1043 ++++++++++++++++++++ |
| sysdeps/x86_64/multiarch/strcmp.c | 15 +- |
| sysdeps/x86_64/multiarch/strlen-evex.S | 436 ++++++++ |
| sysdeps/x86_64/multiarch/strncmp-evex.S | 3 + |
| sysdeps/x86_64/multiarch/strncmp.c | 15 +- |
| sysdeps/x86_64/multiarch/strnlen-evex.S | 4 + |
| sysdeps/x86_64/multiarch/strrchr-evex.S | 265 +++++ |
| sysdeps/x86_64/multiarch/wcschr-evex.S | 3 + |
| sysdeps/x86_64/multiarch/wcscmp-evex.S | 4 + |
| sysdeps/x86_64/multiarch/wcslen-evex.S | 4 + |
| sysdeps/x86_64/multiarch/wcsncmp-evex.S | 5 + |
| sysdeps/x86_64/multiarch/wcsnlen-evex.S | 5 + |
| sysdeps/x86_64/multiarch/wcsnlen.c | 14 +- |
| sysdeps/x86_64/multiarch/wcsrchr-evex.S | 3 + |
| sysdeps/x86_64/multiarch/wmemchr-evex.S | 4 + |
| 24 files changed, 2996 insertions(+), 17 deletions(-) |
| create mode 100644 sysdeps/x86_64/multiarch/memchr-evex.S |
| create mode 100644 sysdeps/x86_64/multiarch/memrchr-evex.S |
| create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex.S |
| create mode 100644 sysdeps/x86_64/multiarch/strchr-evex.S |
| create mode 100644 sysdeps/x86_64/multiarch/strchrnul-evex.S |
| create mode 100644 sysdeps/x86_64/multiarch/strcmp-evex.S |
| create mode 100644 sysdeps/x86_64/multiarch/strlen-evex.S |
| create mode 100644 sysdeps/x86_64/multiarch/strncmp-evex.S |
| create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex.S |
| create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex.S |
| create mode 100644 sysdeps/x86_64/multiarch/wcschr-evex.S |
| create mode 100644 sysdeps/x86_64/multiarch/wcscmp-evex.S |
| create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex.S |
| create mode 100644 sysdeps/x86_64/multiarch/wcsncmp-evex.S |
| create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex.S |
| create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex.S |
| create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex.S |
| |
| diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile |
| index 9477538a..5ce85882 100644 |
| --- a/sysdeps/x86_64/multiarch/Makefile |
| +++ b/sysdeps/x86_64/multiarch/Makefile |
| @@ -39,7 +39,17 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ |
| memmove-avx512-unaligned-erms \ |
| memset-sse2-unaligned-erms \ |
| memset-avx2-unaligned-erms \ |
| - memset-avx512-unaligned-erms |
| + memset-avx512-unaligned-erms \ |
| + memchr-evex \ |
| + memrchr-evex \ |
| + rawmemchr-evex \ |
| + strchr-evex \ |
| + strchrnul-evex \ |
| + strcmp-evex \ |
| + strlen-evex \ |
| + strncmp-evex \ |
| + strnlen-evex \ |
| + strrchr-evex |
| CFLAGS-varshift.c += -msse4 |
| CFLAGS-strcspn-c.c += -msse4 |
| CFLAGS-strpbrk-c.c += -msse4 |
| @@ -56,7 +66,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ |
| wcschr-sse2 wcschr-avx2 \ |
| wcsrchr-sse2 wcsrchr-avx2 \ |
| wcsnlen-sse4_1 wcsnlen-c \ |
| - wcslen-sse2 wcslen-avx2 wcsnlen-avx2 |
| + wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \ |
| + wcschr-evex \ |
| + wcscmp-evex \ |
| + wcslen-evex \ |
| + wcsncmp-evex \ |
| + wcsnlen-evex \ |
| + wcsrchr-evex \ |
| + wmemchr-evex |
| endif |
| |
| ifeq ($(subdir),debug) |
| diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h |
| index 5c88640a..7081b0c9 100644 |
| --- a/sysdeps/x86_64/multiarch/ifunc-avx2.h |
| +++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h |
| @@ -21,16 +21,24 @@ |
| |
| extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; |
| extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; |
| +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; |
| |
| static inline void * |
| IFUNC_SELECTOR (void) |
| { |
| const struct cpu_features* cpu_features = __get_cpu_features (); |
| |
| - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) |
| - && CPU_FEATURE_USABLE_P (cpu_features, AVX2) |
| + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) |
| && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) |
| - return OPTIMIZE (avx2); |
| + { |
| + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) |
| + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) |
| + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) |
| + return OPTIMIZE (evex); |
| + |
| + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) |
| + return OPTIMIZE (avx2); |
| + } |
| |
| return OPTIMIZE (sse2); |
| } |
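| |
| The strcmp.c and strncmp.c selectors (their hunks come later in this |
| patch, beyond this excerpt) follow the same structure but additionally |
| honor Prefer_AVX2_STRCMP, per the commit message. A hedged sketch of that |
| variant; the authoritative condition is in those later hunks: |
| |
|       if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) |
|           && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) |
|         { |
|           /* Keep AVX2 strcmp/strncmp when Prefer_AVX2_STRCMP is set.  */ |
|           if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) |
|               && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) |
|               && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) |
|             return OPTIMIZE (evex); |
| |
|           if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) |
|             return OPTIMIZE (avx2); |
|         } |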
| diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c |
| index fe13505c..bd7d9f19 100644 |
| --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c |
| +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c |
| @@ -43,6 +43,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| IFUNC_IMPL_ADD (array, i, memchr, |
| CPU_FEATURE_USABLE (AVX2), |
| __memchr_avx2) |
| + IFUNC_IMPL_ADD (array, i, memchr, |
| + (CPU_FEATURE_USABLE (AVX512VL) |
| + && CPU_FEATURE_USABLE (AVX512BW) |
| + && CPU_FEATURE_USABLE (BMI2)), |
| + __memchr_evex) |
| IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) |
| |
| /* Support sysdeps/x86_64/multiarch/memcmp.c. */ |
| @@ -121,6 +126,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| IFUNC_IMPL_ADD (array, i, memrchr, |
| CPU_FEATURE_USABLE (AVX2), |
| __memrchr_avx2) |
| + IFUNC_IMPL_ADD (array, i, memrchr, |
| + (CPU_FEATURE_USABLE (AVX512VL) |
| + && CPU_FEATURE_USABLE (AVX512BW)), |
| + __memrchr_evex) |
| + |
| IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2)) |
| |
| #ifdef SHARED |
| @@ -179,6 +189,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| IFUNC_IMPL_ADD (array, i, rawmemchr, |
| CPU_FEATURE_USABLE (AVX2), |
| __rawmemchr_avx2) |
| + IFUNC_IMPL_ADD (array, i, rawmemchr, |
| + (CPU_FEATURE_USABLE (AVX512VL) |
| + && CPU_FEATURE_USABLE (AVX512BW) |
| + && CPU_FEATURE_USABLE (BMI2)), |
| + __rawmemchr_evex) |
| IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) |
| |
| /* Support sysdeps/x86_64/multiarch/strlen.c. */ |
| @@ -186,6 +201,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| IFUNC_IMPL_ADD (array, i, strlen, |
| CPU_FEATURE_USABLE (AVX2), |
| __strlen_avx2) |
| + IFUNC_IMPL_ADD (array, i, strlen, |
| + (CPU_FEATURE_USABLE (AVX512VL) |
| + && CPU_FEATURE_USABLE (AVX512BW)), |
| + __strlen_evex) |
| IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) |
| |
| /* Support sysdeps/x86_64/multiarch/strnlen.c. */ |
| @@ -193,6 +212,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| IFUNC_IMPL_ADD (array, i, strnlen, |
| CPU_FEATURE_USABLE (AVX2), |
| __strnlen_avx2) |
| + IFUNC_IMPL_ADD (array, i, strnlen, |
| + (CPU_FEATURE_USABLE (AVX512VL) |
| + && CPU_FEATURE_USABLE (AVX512BW)), |
| + __strnlen_evex) |
| IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) |
| |
| /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ |
| @@ -255,6 +278,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| IFUNC_IMPL_ADD (array, i, strchr, |
| CPU_FEATURE_USABLE (AVX2), |
| __strchr_avx2) |
| + IFUNC_IMPL_ADD (array, i, strchr, |
| + (CPU_FEATURE_USABLE (AVX512VL) |
| + && CPU_FEATURE_USABLE (AVX512BW) |
| + && CPU_FEATURE_USABLE (BMI2)), |
| + __strchr_evex) |
| IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf) |
| IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2)) |
| |
| @@ -263,6 +291,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| IFUNC_IMPL_ADD (array, i, strchrnul, |
| CPU_FEATURE_USABLE (AVX2), |
| __strchrnul_avx2) |
| + IFUNC_IMPL_ADD (array, i, strchrnul, |
| + (CPU_FEATURE_USABLE (AVX512VL) |
| + && CPU_FEATURE_USABLE (AVX512BW) |
| + && CPU_FEATURE_USABLE (BMI2)), |
| + __strchrnul_evex) |
| IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2)) |
| |
| /* Support sysdeps/x86_64/multiarch/strrchr.c. */ |
| @@ -270,6 +303,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| IFUNC_IMPL_ADD (array, i, strrchr, |
| CPU_FEATURE_USABLE (AVX2), |
| __strrchr_avx2) |
| + IFUNC_IMPL_ADD (array, i, strrchr, |
| + (CPU_FEATURE_USABLE (AVX512VL) |
| + && CPU_FEATURE_USABLE (AVX512BW)), |
| + __strrchr_evex) |
| IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2)) |
| |
| /* Support sysdeps/x86_64/multiarch/strcmp.c. */ |
| @@ -277,6 +314,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| IFUNC_IMPL_ADD (array, i, strcmp, |
| CPU_FEATURE_USABLE (AVX2), |
| __strcmp_avx2) |
| + IFUNC_IMPL_ADD (array, i, strcmp, |
| + (CPU_FEATURE_USABLE (AVX512VL) |
| + && CPU_FEATURE_USABLE (AVX512BW) |
| + && CPU_FEATURE_USABLE (BMI2)), |
| + __strcmp_evex) |
| IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2), |
| __strcmp_sse42) |
| IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3), |
| @@ -370,6 +412,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| IFUNC_IMPL_ADD (array, i, wcschr, |
| CPU_FEATURE_USABLE (AVX2), |
| __wcschr_avx2) |
| + IFUNC_IMPL_ADD (array, i, wcschr, |
| + (CPU_FEATURE_USABLE (AVX512VL) |
| + && CPU_FEATURE_USABLE (AVX512BW) |
| + && CPU_FEATURE_USABLE (BMI2)), |
| + __wcschr_evex) |
| IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2)) |
| |
| /* Support sysdeps/x86_64/multiarch/wcsrchr.c. */ |
| @@ -377,6 +424,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| IFUNC_IMPL_ADD (array, i, wcsrchr, |
| CPU_FEATURE_USABLE (AVX2), |
| __wcsrchr_avx2) |
| + IFUNC_IMPL_ADD (array, i, wcsrchr, |
| + (CPU_FEATURE_USABLE (AVX512VL) |
| + && CPU_FEATURE_USABLE (AVX512BW) |
| + && CPU_FEATURE_USABLE (BMI2)), |
| + __wcsrchr_evex) |
| IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2)) |
| |
| /* Support sysdeps/x86_64/multiarch/wcscmp.c. */ |
| @@ -384,6 +436,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| IFUNC_IMPL_ADD (array, i, wcscmp, |
| CPU_FEATURE_USABLE (AVX2), |
| __wcscmp_avx2) |
| + IFUNC_IMPL_ADD (array, i, wcscmp, |
| + (CPU_FEATURE_USABLE (AVX512VL) |
| + && CPU_FEATURE_USABLE (AVX512BW) |
| + && CPU_FEATURE_USABLE (BMI2)), |
| + __wcscmp_evex) |
| IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_sse2)) |
| |
| /* Support sysdeps/x86_64/multiarch/wcsncmp.c. */ |
| @@ -391,6 +448,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| IFUNC_IMPL_ADD (array, i, wcsncmp, |
| CPU_FEATURE_USABLE (AVX2), |
| __wcsncmp_avx2) |
| + IFUNC_IMPL_ADD (array, i, wcsncmp, |
| + (CPU_FEATURE_USABLE (AVX512VL) |
| + && CPU_FEATURE_USABLE (AVX512BW) |
| + && CPU_FEATURE_USABLE (BMI2)), |
| + __wcsncmp_evex) |
| IFUNC_IMPL_ADD (array, i, wcsncmp, 1, __wcsncmp_sse2)) |
| |
| /* Support sysdeps/x86_64/multiarch/wcscpy.c. */ |
| @@ -404,6 +466,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| IFUNC_IMPL_ADD (array, i, wcslen, |
| CPU_FEATURE_USABLE (AVX2), |
| __wcslen_avx2) |
| + IFUNC_IMPL_ADD (array, i, wcslen, |
| + (CPU_FEATURE_USABLE (AVX512VL) |
| + && CPU_FEATURE_USABLE (AVX512BW) |
| + && CPU_FEATURE_USABLE (BMI2)), |
| + __wcslen_evex) |
| IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) |
| |
| /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ |
| @@ -411,6 +478,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| IFUNC_IMPL_ADD (array, i, wcsnlen, |
| CPU_FEATURE_USABLE (AVX2), |
| __wcsnlen_avx2) |
| + IFUNC_IMPL_ADD (array, i, wcsnlen, |
| + (CPU_FEATURE_USABLE (AVX512VL) |
| + && CPU_FEATURE_USABLE (AVX512BW) |
| + && CPU_FEATURE_USABLE (BMI2)), |
| + __wcsnlen_evex) |
| IFUNC_IMPL_ADD (array, i, wcsnlen, |
| CPU_FEATURE_USABLE (SSE4_1), |
| __wcsnlen_sse4_1) |
| @@ -421,6 +493,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| IFUNC_IMPL_ADD (array, i, wmemchr, |
| CPU_FEATURE_USABLE (AVX2), |
| __wmemchr_avx2) |
| + IFUNC_IMPL_ADD (array, i, wmemchr, |
| + (CPU_FEATURE_USABLE (AVX512VL) |
| + && CPU_FEATURE_USABLE (AVX512BW) |
| + && CPU_FEATURE_USABLE (BMI2)), |
| + __wmemchr_evex) |
| IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) |
| |
| /* Support sysdeps/x86_64/multiarch/wmemcmp.c. */ |
| @@ -568,6 +645,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| IFUNC_IMPL_ADD (array, i, strncmp, |
| CPU_FEATURE_USABLE (AVX2), |
| __strncmp_avx2) |
| + IFUNC_IMPL_ADD (array, i, strncmp, |
| + (CPU_FEATURE_USABLE (AVX512VL) |
| + && CPU_FEATURE_USABLE (AVX512BW)), |
| + __strncmp_evex) |
| IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2), |
| __strncmp_sse42) |
| IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3), |
| diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S |
| new file mode 100644 |
| index 00000000..6dd5d67b |
| --- /dev/null |
| +++ b/sysdeps/x86_64/multiarch/memchr-evex.S |
| @@ -0,0 +1,381 @@ |
| +/* memchr/wmemchr optimized with 256-bit EVEX instructions. |
| + Copyright (C) 2021 Free Software Foundation, Inc. |
| + This file is part of the GNU C Library. |
| + |
| + The GNU C Library is free software; you can redistribute it and/or |
| + modify it under the terms of the GNU Lesser General Public |
| + License as published by the Free Software Foundation; either |
| + version 2.1 of the License, or (at your option) any later version. |
| + |
| + The GNU C Library is distributed in the hope that it will be useful, |
| + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| + Lesser General Public License for more details. |
| + |
| + You should have received a copy of the GNU Lesser General Public |
| + License along with the GNU C Library; if not, see |
| + <https://www.gnu.org/licenses/>. */ |
| + |
| +#if IS_IN (libc) |
| + |
| +# include <sysdep.h> |
| + |
| +# ifndef MEMCHR |
| +# define MEMCHR __memchr_evex |
| +# endif |
| + |
| +# ifdef USE_AS_WMEMCHR |
| +# define VPBROADCAST vpbroadcastd |
| +# define VPCMP vpcmpd |
| +# define SHIFT_REG r8d |
| +# else |
| +# define VPBROADCAST vpbroadcastb |
| +# define VPCMP vpcmpb |
| +# define SHIFT_REG ecx |
| +# endif |
| + |
| +# define XMMMATCH xmm16 |
| +# define YMMMATCH ymm16 |
| +# define YMM1 ymm17 |
| +# define YMM2 ymm18 |
| +# define YMM3 ymm19 |
| +# define YMM4 ymm20 |
| +# define YMM5 ymm21 |
| +# define YMM6 ymm22 |
| + |
| +# define VEC_SIZE 32 |
| + |
| + .section .text.evex,"ax",@progbits |
| +ENTRY (MEMCHR) |
| +# ifndef USE_AS_RAWMEMCHR |
| + /* Check for zero length. */ |
| + test %RDX_LP, %RDX_LP |
| + jz L(zero) |
| +# endif |
| + movl %edi, %ecx |
| +# ifdef USE_AS_WMEMCHR |
| + shl $2, %RDX_LP |
| +# else |
| +# ifdef __ILP32__ |
| + /* Clear the upper 32 bits. */ |
| + movl %edx, %edx |
| +# endif |
| +# endif |
| + /* Broadcast CHAR to YMMMATCH. */ |
| + VPBROADCAST %esi, %YMMMATCH |
| + /* Check if we may cross page boundary with one vector load. */ |
| + andl $(2 * VEC_SIZE - 1), %ecx |
| + cmpl $VEC_SIZE, %ecx |
| + ja L(cros_page_boundary) |
| + |
| + /* Check the first VEC_SIZE bytes. */ |
| + VPCMP $0, (%rdi), %YMMMATCH, %k1 |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + |
| +# ifndef USE_AS_RAWMEMCHR |
| + jnz L(first_vec_x0_check) |
| + /* Adjust length and check the end of data. */ |
| + subq $VEC_SIZE, %rdx |
| + jbe L(zero) |
| +# else |
| + jnz L(first_vec_x0) |
| +# endif |
| + |
| + /* Align data for aligned loads in the loop. */ |
| + addq $VEC_SIZE, %rdi |
| + andl $(VEC_SIZE - 1), %ecx |
| + andq $-VEC_SIZE, %rdi |
| + |
| +# ifndef USE_AS_RAWMEMCHR |
| + /* Adjust length. */ |
| + addq %rcx, %rdx |
| + |
| + subq $(VEC_SIZE * 4), %rdx |
| + jbe L(last_4x_vec_or_less) |
| +# endif |
| + jmp L(more_4x_vec) |
| + |
| + .p2align 4 |
| +L(cros_page_boundary): |
| + andl $(VEC_SIZE - 1), %ecx |
| +# ifdef USE_AS_WMEMCHR |
| + /* NB: Divide shift count by 4 since each bit in K1 represents 4 |
| + bytes. */ |
| + movl %ecx, %SHIFT_REG |
| + sarl $2, %SHIFT_REG |
| +# endif |
| + andq $-VEC_SIZE, %rdi |
| + VPCMP $0, (%rdi), %YMMMATCH, %k1 |
| + kmovd %k1, %eax |
| + /* Remove the leading bytes. */ |
| + sarxl %SHIFT_REG, %eax, %eax |
| + testl %eax, %eax |
| + jz L(aligned_more) |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WMEMCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %eax |
| +# endif |
| +# ifndef USE_AS_RAWMEMCHR |
| + /* Check the end of data. */ |
| + cmpq %rax, %rdx |
| + jbe L(zero) |
| +# endif |
| + addq %rdi, %rax |
| + addq %rcx, %rax |
| + ret |
| + |
| + .p2align 4 |
| +L(aligned_more): |
| +# ifndef USE_AS_RAWMEMCHR |
| + /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" |
| + instead of "(rdx + rcx) - VEC_SIZE" to avoid possible addition |
| + overflow. */ |
| + negq %rcx |
| + addq $VEC_SIZE, %rcx |
| + |
| + /* Check the end of data. */ |
| + subq %rcx, %rdx |
| + jbe L(zero) |
| +# endif |
| + |
| + addq $VEC_SIZE, %rdi |
| + |
| +# ifndef USE_AS_RAWMEMCHR |
| + subq $(VEC_SIZE * 4), %rdx |
| + jbe L(last_4x_vec_or_less) |
| +# endif |
| + |
| +L(more_4x_vec): |
| + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time |
| + since data is only aligned to VEC_SIZE. */ |
| + VPCMP $0, (%rdi), %YMMMATCH, %k1 |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x0) |
| + |
| + VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x1) |
| + |
| + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x2) |
| + |
| + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x3) |
| + |
| + addq $(VEC_SIZE * 4), %rdi |
| + |
| +# ifndef USE_AS_RAWMEMCHR |
| + subq $(VEC_SIZE * 4), %rdx |
| + jbe L(last_4x_vec_or_less) |
| +# endif |
| + |
| + /* Align data to 4 * VEC_SIZE. */ |
| + movq %rdi, %rcx |
| + andl $(4 * VEC_SIZE - 1), %ecx |
| + andq $-(4 * VEC_SIZE), %rdi |
| + |
| +# ifndef USE_AS_RAWMEMCHR |
| + /* Adjust length. */ |
| + addq %rcx, %rdx |
| +# endif |
| + |
| + .p2align 4 |
| +L(loop_4x_vec): |
| + /* Compare 4 * VEC at a time forward. */ |
| + VPCMP $0, (%rdi), %YMMMATCH, %k1 |
| + VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 |
| + kord %k1, %k2, %k5 |
| + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 |
| + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 |
| + |
| + kord %k3, %k4, %k6 |
| + kortestd %k5, %k6 |
| + jnz L(4x_vec_end) |
| + |
| + addq $(VEC_SIZE * 4), %rdi |
| + |
| +# ifdef USE_AS_RAWMEMCHR |
| + jmp L(loop_4x_vec) |
| +# else |
| + subq $(VEC_SIZE * 4), %rdx |
| + ja L(loop_4x_vec) |
| + |
| +L(last_4x_vec_or_less): |
| + /* Less than 4 * VEC and aligned to VEC_SIZE. */ |
| + addl $(VEC_SIZE * 2), %edx |
| + jle L(last_2x_vec) |
| + |
| + VPCMP $0, (%rdi), %YMMMATCH, %k1 |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x0) |
| + |
| + VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x1) |
| + |
| + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + |
| + jnz L(first_vec_x2_check) |
| + subl $VEC_SIZE, %edx |
| + jle L(zero) |
| + |
| + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + |
| + jnz L(first_vec_x3_check) |
| + xorl %eax, %eax |
| + ret |
| + |
| + .p2align 4 |
| +L(last_2x_vec): |
| + addl $(VEC_SIZE * 2), %edx |
| + VPCMP $0, (%rdi), %YMMMATCH, %k1 |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + |
| + jnz L(first_vec_x0_check) |
| + subl $VEC_SIZE, %edx |
| + jle L(zero) |
| + |
| + VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x1_check) |
| + xorl %eax, %eax |
| + ret |
| + |
| + .p2align 4 |
| +L(first_vec_x0_check): |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WMEMCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %eax |
| +# endif |
| + /* Check the end of data. */ |
| + cmpq %rax, %rdx |
| + jbe L(zero) |
| + addq %rdi, %rax |
| + ret |
| + |
| + .p2align 4 |
| +L(first_vec_x1_check): |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WMEMCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %eax |
| +# endif |
| + /* Check the end of data. */ |
| + cmpq %rax, %rdx |
| + jbe L(zero) |
| + addq $VEC_SIZE, %rax |
| + addq %rdi, %rax |
| + ret |
| + |
| + .p2align 4 |
| +L(first_vec_x2_check): |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WMEMCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %eax |
| +# endif |
| + /* Check the end of data. */ |
| + cmpq %rax, %rdx |
| + jbe L(zero) |
| + addq $(VEC_SIZE * 2), %rax |
| + addq %rdi, %rax |
| + ret |
| + |
| + .p2align 4 |
| +L(first_vec_x3_check): |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WMEMCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %eax |
| +# endif |
| + /* Check the end of data. */ |
| + cmpq %rax, %rdx |
| + jbe L(zero) |
| + addq $(VEC_SIZE * 3), %rax |
| + addq %rdi, %rax |
| + ret |
| + |
| + .p2align 4 |
| +L(zero): |
| + xorl %eax, %eax |
| + ret |
| +# endif |
| + |
| + .p2align 4 |
| +L(first_vec_x0): |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WMEMCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + leaq (%rdi, %rax, 4), %rax |
| +# else |
| + addq %rdi, %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(first_vec_x1): |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WMEMCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + leaq VEC_SIZE(%rdi, %rax, 4), %rax |
| +# else |
| + addq $VEC_SIZE, %rax |
| + addq %rdi, %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(first_vec_x2): |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WMEMCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax |
| +# else |
| + addq $(VEC_SIZE * 2), %rax |
| + addq %rdi, %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(4x_vec_end): |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x0) |
| + kmovd %k2, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x1) |
| + kmovd %k3, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x2) |
| + kmovd %k4, %eax |
| + testl %eax, %eax |
| +L(first_vec_x3): |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WMEMCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax |
| +# else |
| + addq $(VEC_SIZE * 3), %rax |
| + addq %rdi, %rax |
| +# endif |
| + ret |
| + |
| +END (MEMCHR) |
| +#endif |
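| |
| For readers checking the vector logic above against the intended |
| behavior: the semantics being implemented are simply those of memchr |
| (and, via the wrapper stubs, rawmemchr and wmemchr). A plain-C reference |
| model, for illustration only; the ref_ name is made up and is not part of |
| the patch: |
| |
|     #include <stddef.h> |
| |
|     /* memchr: scan at most N bytes for byte C; NULL if absent.  With |
|        USE_AS_RAWMEMCHR the length checks go away (a match is assumed |
|        to exist); with USE_AS_WMEMCHR the unit is a 4-byte wchar_t.  */ |
|     static void * |
|     ref_memchr (const void *s, int c, size_t n) |
|     { |
|       const unsigned char *p = s; |
|       for (; n != 0; --n, ++p) |
|         if (*p == (unsigned char) c) |
|           return (void *) p; |
|       return NULL; |
|     } |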
| diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S |
| new file mode 100644 |
| index 00000000..16bf8e02 |
| --- /dev/null |
| +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S |
| @@ -0,0 +1,337 @@ |
| +/* memrchr optimized with 256-bit EVEX instructions. |
| + Copyright (C) 2021 Free Software Foundation, Inc. |
| + This file is part of the GNU C Library. |
| + |
| + The GNU C Library is free software; you can redistribute it and/or |
| + modify it under the terms of the GNU Lesser General Public |
| + License as published by the Free Software Foundation; either |
| + version 2.1 of the License, or (at your option) any later version. |
| + |
| + The GNU C Library is distributed in the hope that it will be useful, |
| + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| + Lesser General Public License for more details. |
| + |
| + You should have received a copy of the GNU Lesser General Public |
| + License along with the GNU C Library; if not, see |
| + <https://www.gnu.org/licenses/>. */ |
| + |
| +#if IS_IN (libc) |
| + |
| +# include <sysdep.h> |
| + |
| +# define VMOVA vmovdqa64 |
| + |
| +# define YMMMATCH ymm16 |
| + |
| +# define VEC_SIZE 32 |
| + |
| + .section .text.evex,"ax",@progbits |
| +ENTRY (__memrchr_evex) |
| + /* Broadcast CHAR to YMMMATCH. */ |
| + vpbroadcastb %esi, %YMMMATCH |
| + |
| + sub $VEC_SIZE, %RDX_LP |
| + jbe L(last_vec_or_less) |
| + |
| + add %RDX_LP, %RDI_LP |
| + |
| + /* Check the last VEC_SIZE bytes. */ |
| + vpcmpb $0, (%rdi), %YMMMATCH, %k1 |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + jnz L(last_vec_x0) |
| + |
| + subq $(VEC_SIZE * 4), %rdi |
| + movl %edi, %ecx |
| + andl $(VEC_SIZE - 1), %ecx |
| + jz L(aligned_more) |
| + |
| + /* Align data for aligned loads in the loop. */ |
| + addq $VEC_SIZE, %rdi |
| + addq $VEC_SIZE, %rdx |
| + andq $-VEC_SIZE, %rdi |
| + subq %rcx, %rdx |
| + |
| + .p2align 4 |
| +L(aligned_more): |
| + subq $(VEC_SIZE * 4), %rdx |
| + jbe L(last_4x_vec_or_less) |
| + |
| + /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time |
| + since data is only aligned to VEC_SIZE. */ |
| + vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + jnz L(last_vec_x3) |
| + |
| + vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 |
| + kmovd %k2, %eax |
| + testl %eax, %eax |
| + jnz L(last_vec_x2) |
| + |
| + vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 |
| + kmovd %k3, %eax |
| + testl %eax, %eax |
| + jnz L(last_vec_x1) |
| + |
| + vpcmpb $0, (%rdi), %YMMMATCH, %k4 |
| + kmovd %k4, %eax |
| + testl %eax, %eax |
| + jnz L(last_vec_x0) |
| + |
| + /* Align data to 4 * VEC_SIZE for loop with fewer branches. |
| + There are some overlaps with above if data isn't aligned |
| + to 4 * VEC_SIZE. */ |
| + movl %edi, %ecx |
| + andl $(VEC_SIZE * 4 - 1), %ecx |
| + jz L(loop_4x_vec) |
| + |
| + addq $(VEC_SIZE * 4), %rdi |
| + addq $(VEC_SIZE * 4), %rdx |
| + andq $-(VEC_SIZE * 4), %rdi |
| + subq %rcx, %rdx |
| + |
| + .p2align 4 |
| +L(loop_4x_vec): |
| + /* Compare 4 * VEC at a time forward. */ |
| + subq $(VEC_SIZE * 4), %rdi |
| + subq $(VEC_SIZE * 4), %rdx |
| + jbe L(last_4x_vec_or_less) |
| + |
| + vpcmpb $0, (%rdi), %YMMMATCH, %k1 |
| + vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 |
| + kord %k1, %k2, %k5 |
| + vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 |
| + vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 |
| + |
| + kord %k3, %k4, %k6 |
| + kortestd %k5, %k6 |
| + jz L(loop_4x_vec) |
| + |
| + /* There is a match. */ |
| + kmovd %k4, %eax |
| + testl %eax, %eax |
| + jnz L(last_vec_x3) |
| + |
| + kmovd %k3, %eax |
| + testl %eax, %eax |
| + jnz L(last_vec_x2) |
| + |
| + kmovd %k2, %eax |
| + testl %eax, %eax |
| + jnz L(last_vec_x1) |
| + |
| + kmovd %k1, %eax |
| + bsrl %eax, %eax |
| + addq %rdi, %rax |
| + ret |
| + |
| + .p2align 4 |
| +L(last_4x_vec_or_less): |
| + addl $(VEC_SIZE * 4), %edx |
| + cmpl $(VEC_SIZE * 2), %edx |
| + jbe L(last_2x_vec) |
| + |
| + vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + jnz L(last_vec_x3) |
| + |
| + vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 |
| + kmovd %k2, %eax |
| + testl %eax, %eax |
| + jnz L(last_vec_x2) |
| + |
| + vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 |
| + kmovd %k3, %eax |
| + testl %eax, %eax |
| + jnz L(last_vec_x1_check) |
| + cmpl $(VEC_SIZE * 3), %edx |
| + jbe L(zero) |
| + |
| + vpcmpb $0, (%rdi), %YMMMATCH, %k4 |
| + kmovd %k4, %eax |
| + testl %eax, %eax |
| + jz L(zero) |
| + bsrl %eax, %eax |
| + subq $(VEC_SIZE * 4), %rdx |
| + addq %rax, %rdx |
| + jl L(zero) |
| + addq %rdi, %rax |
| + ret |
| + |
| + .p2align 4 |
| +L(last_2x_vec): |
| + vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + jnz L(last_vec_x3_check) |
| + cmpl $VEC_SIZE, %edx |
| + jbe L(zero) |
| + |
| + vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + jz L(zero) |
| + bsrl %eax, %eax |
| + subq $(VEC_SIZE * 2), %rdx |
| + addq %rax, %rdx |
| + jl L(zero) |
| + addl $(VEC_SIZE * 2), %eax |
| + addq %rdi, %rax |
| + ret |
| + |
| + .p2align 4 |
| +L(last_vec_x0): |
| + bsrl %eax, %eax |
| + addq %rdi, %rax |
| + ret |
| + |
| + .p2align 4 |
| +L(last_vec_x1): |
| + bsrl %eax, %eax |
| + addl $VEC_SIZE, %eax |
| + addq %rdi, %rax |
| + ret |
| + |
| + .p2align 4 |
| +L(last_vec_x2): |
| + bsrl %eax, %eax |
| + addl $(VEC_SIZE * 2), %eax |
| + addq %rdi, %rax |
| + ret |
| + |
| + .p2align 4 |
| +L(last_vec_x3): |
| + bsrl %eax, %eax |
| + addl $(VEC_SIZE * 3), %eax |
| + addq %rdi, %rax |
| + ret |
| + |
| + .p2align 4 |
| +L(last_vec_x1_check): |
| + bsrl %eax, %eax |
| + subq $(VEC_SIZE * 3), %rdx |
| + addq %rax, %rdx |
| + jl L(zero) |
| + addl $VEC_SIZE, %eax |
| + addq %rdi, %rax |
| + ret |
| + |
| + .p2align 4 |
| +L(last_vec_x3_check): |
| + bsrl %eax, %eax |
| + subq $VEC_SIZE, %rdx |
| + addq %rax, %rdx |
| + jl L(zero) |
| + addl $(VEC_SIZE * 3), %eax |
| + addq %rdi, %rax |
| + ret |
| + |
| + .p2align 4 |
| +L(zero): |
| + xorl %eax, %eax |
| + ret |
| + |
| + .p2align 4 |
| +L(last_vec_or_less_aligned): |
| + movl %edx, %ecx |
| + |
| + vpcmpb $0, (%rdi), %YMMMATCH, %k1 |
| + |
| + movl $1, %edx |
| + /* Support rdx << 32. */ |
| + salq %cl, %rdx |
| + subq $1, %rdx |
| + |
| + kmovd %k1, %eax |
| + |
| + /* Remove the trailing bytes. */ |
| + andl %edx, %eax |
| + testl %eax, %eax |
| + jz L(zero) |
| + |
| + bsrl %eax, %eax |
| + addq %rdi, %rax |
| + ret |
| + |
| + .p2align 4 |
| +L(last_vec_or_less): |
| + addl $VEC_SIZE, %edx |
| + |
| + /* Check for zero length. */ |
| + testl %edx, %edx |
| + jz L(zero) |
| + |
| + movl %edi, %ecx |
| + andl $(VEC_SIZE - 1), %ecx |
| + jz L(last_vec_or_less_aligned) |
| + |
| + movl %ecx, %esi |
| + movl %ecx, %r8d |
| + addl %edx, %esi |
| + andq $-VEC_SIZE, %rdi |
| + |
| + subl $VEC_SIZE, %esi |
| + ja L(last_vec_2x_aligned) |
| + |
| + /* Check the last VEC. */ |
| + vpcmpb $0, (%rdi), %YMMMATCH, %k1 |
| + kmovd %k1, %eax |
| + |
| + /* Remove the leading and trailing bytes. */ |
| + sarl %cl, %eax |
| + movl %edx, %ecx |
| + |
| + movl $1, %edx |
| + sall %cl, %edx |
| + subl $1, %edx |
| + |
| + andl %edx, %eax |
| + testl %eax, %eax |
| + jz L(zero) |
| + |
| + bsrl %eax, %eax |
| + addq %rdi, %rax |
| + addq %r8, %rax |
| + ret |
| + |
| + .p2align 4 |
| +L(last_vec_2x_aligned): |
| + movl %esi, %ecx |
| + |
| + /* Check the last VEC. */ |
| + vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 |
| + |
| + movl $1, %edx |
| + sall %cl, %edx |
| + subl $1, %edx |
| + |
| + kmovd %k1, %eax |
| + |
| + /* Remove the trailing bytes. */ |
| + andl %edx, %eax |
| + |
| + testl %eax, %eax |
| + jnz L(last_vec_x1) |
| + |
| + /* Check the second last VEC. */ |
| + vpcmpb $0, (%rdi), %YMMMATCH, %k1 |
| + |
| + movl %r8d, %ecx |
| + |
| + kmovd %k1, %eax |
| + |
| + /* Remove the leading bytes. Must use unsigned right shift for |
| + bsrl below. */ |
| + shrl %cl, %eax |
| + testl %eax, %eax |
| + jz L(zero) |
| + |
| + bsrl %eax, %eax |
| + addq %rdi, %rax |
| + addq %r8, %rax |
| + ret |
| +END (__memrchr_evex) |
| +#endif |
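| |
| memrchr scans backwards, which is why the code above walks %rdi down |
| through the buffer and uses bsrl (highest set bit) rather than tzcntl. |
| Its plain-C semantics, as a checking aid only (ref_memrchr is an |
| illustrative name, not patch content): |
| |
|     #include <stddef.h> |
| |
|     /* memrchr (GNU extension): last occurrence of C within the first |
|        N bytes of S, or NULL.  */ |
|     static void * |
|     ref_memrchr (const void *s, int c, size_t n) |
|     { |
|       const unsigned char *p = (const unsigned char *) s + n; |
|       while (n-- != 0) |
|         if (*--p == (unsigned char) c) |
|           return (void *) p; |
|       return NULL; |
|     } |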
| diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S |
| new file mode 100644 |
| index 00000000..ec942b77 |
| --- /dev/null |
| +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S |
| @@ -0,0 +1,4 @@ |
| +#define MEMCHR __rawmemchr_evex |
| +#define USE_AS_RAWMEMCHR 1 |
| + |
| +#include "memchr-evex.S" |
| diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S |
| new file mode 100644 |
| index 00000000..ddc86a70 |
| --- /dev/null |
| +++ b/sysdeps/x86_64/multiarch/strchr-evex.S |
| @@ -0,0 +1,335 @@ |
| +/* strchr/strchrnul optimized with 256-bit EVEX instructions. |
| + Copyright (C) 2021 Free Software Foundation, Inc. |
| + This file is part of the GNU C Library. |
| + |
| + The GNU C Library is free software; you can redistribute it and/or |
| + modify it under the terms of the GNU Lesser General Public |
| + License as published by the Free Software Foundation; either |
| + version 2.1 of the License, or (at your option) any later version. |
| + |
| + The GNU C Library is distributed in the hope that it will be useful, |
| + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| + Lesser General Public License for more details. |
| + |
| + You should have received a copy of the GNU Lesser General Public |
| + License along with the GNU C Library; if not, see |
| + <https://www.gnu.org/licenses/>. */ |
| + |
| +#if IS_IN (libc) |
| + |
| +# include <sysdep.h> |
| + |
| +# ifndef STRCHR |
| +# define STRCHR __strchr_evex |
| +# endif |
| + |
| +# define VMOVU vmovdqu64 |
| +# define VMOVA vmovdqa64 |
| + |
| +# ifdef USE_AS_WCSCHR |
| +# define VPBROADCAST vpbroadcastd |
| +# define VPCMP vpcmpd |
| +# define VPMINU vpminud |
| +# define CHAR_REG esi |
| +# define SHIFT_REG r8d |
| +# else |
| +# define VPBROADCAST vpbroadcastb |
| +# define VPCMP vpcmpb |
| +# define VPMINU vpminub |
| +# define CHAR_REG sil |
| +# define SHIFT_REG ecx |
| +# endif |
| + |
| +# define XMMZERO xmm16 |
| + |
| +# define YMMZERO ymm16 |
| +# define YMM0 ymm17 |
| +# define YMM1 ymm18 |
| +# define YMM2 ymm19 |
| +# define YMM3 ymm20 |
| +# define YMM4 ymm21 |
| +# define YMM5 ymm22 |
| +# define YMM6 ymm23 |
| +# define YMM7 ymm24 |
| +# define YMM8 ymm25 |
| + |
| +# define VEC_SIZE 32 |
| +# define PAGE_SIZE 4096 |
| + |
| + .section .text.evex,"ax",@progbits |
| +ENTRY (STRCHR) |
| + movl %edi, %ecx |
| +# ifndef USE_AS_STRCHRNUL |
| + xorl %edx, %edx |
| +# endif |
| + |
| + /* Broadcast CHAR to YMM0. */ |
| + VPBROADCAST %esi, %YMM0 |
| + |
| + vpxorq %XMMZERO, %XMMZERO, %XMMZERO |
| + |
| + /* Check if we cross page boundary with one vector load. */ |
| + andl $(PAGE_SIZE - 1), %ecx |
| + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx |
| + ja L(cross_page_boundary) |
| + |
| + /* Check the first VEC_SIZE bytes. Search for both CHAR and the |
| + null bytes. */ |
| + VMOVU (%rdi), %YMM1 |
| + |
| + /* Leaves only CHARS matching esi as 0. */ |
| + vpxorq %YMM1, %YMM0, %YMM2 |
| + VPMINU %YMM2, %YMM1, %YMM2 |
| + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ |
| + VPCMP $0, %YMMZERO, %YMM2, %k0 |
| + ktestd %k0, %k0 |
| + jz L(more_vecs) |
| + kmovd %k0, %eax |
| + tzcntl %eax, %eax |
| + /* Found CHAR or the null byte. */ |
| +# ifdef USE_AS_WCSCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + leaq (%rdi, %rax, 4), %rax |
| +# else |
| + addq %rdi, %rax |
| +# endif |
| +# ifndef USE_AS_STRCHRNUL |
| + cmp (%rax), %CHAR_REG |
| + cmovne %rdx, %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(more_vecs): |
| + /* Align data for aligned loads in the loop. */ |
| + andq $-VEC_SIZE, %rdi |
| +L(aligned_more): |
| + |
| + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time |
| + since data is only aligned to VEC_SIZE. */ |
| + VMOVA VEC_SIZE(%rdi), %YMM1 |
| + addq $VEC_SIZE, %rdi |
| + |
| + /* Leaves only CHARS matching esi as 0. */ |
| + vpxorq %YMM1, %YMM0, %YMM2 |
| + VPMINU %YMM2, %YMM1, %YMM2 |
| + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ |
| + VPCMP $0, %YMMZERO, %YMM2, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x0) |
| + |
| + VMOVA VEC_SIZE(%rdi), %YMM1 |
| + /* Leaves only CHARS matching esi as 0. */ |
| + vpxorq %YMM1, %YMM0, %YMM2 |
| + VPMINU %YMM2, %YMM1, %YMM2 |
| + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ |
| + VPCMP $0, %YMMZERO, %YMM2, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x1) |
| + |
| + VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 |
| + /* Leaves only CHARS matching esi as 0. */ |
| + vpxorq %YMM1, %YMM0, %YMM2 |
| + VPMINU %YMM2, %YMM1, %YMM2 |
| + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ |
| + VPCMP $0, %YMMZERO, %YMM2, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x2) |
| + |
| + VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 |
| + /* Leaves only CHARS matching esi as 0. */ |
| + vpxorq %YMM1, %YMM0, %YMM2 |
| + VPMINU %YMM2, %YMM1, %YMM2 |
| + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ |
| + VPCMP $0, %YMMZERO, %YMM2, %k0 |
| + ktestd %k0, %k0 |
| + jz L(prep_loop_4x) |
| + |
| + kmovd %k0, %eax |
| + tzcntl %eax, %eax |
| + /* Found CHAR or the null byte. */ |
| +# ifdef USE_AS_WCSCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax |
| +# else |
| + leaq (VEC_SIZE * 3)(%rdi, %rax), %rax |
| +# endif |
| +# ifndef USE_AS_STRCHRNUL |
| + cmp (%rax), %CHAR_REG |
| + cmovne %rdx, %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(first_vec_x0): |
| + tzcntl %eax, %eax |
| + /* Found CHAR or the null byte. */ |
| +# ifdef USE_AS_WCSCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + leaq (%rdi, %rax, 4), %rax |
| +# else |
| + addq %rdi, %rax |
| +# endif |
| +# ifndef USE_AS_STRCHRNUL |
| + cmp (%rax), %CHAR_REG |
| + cmovne %rdx, %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(first_vec_x1): |
| + tzcntl %eax, %eax |
| + /* Found CHAR or the null byte. */ |
| +# ifdef USE_AS_WCSCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + leaq VEC_SIZE(%rdi, %rax, 4), %rax |
| +# else |
| + leaq VEC_SIZE(%rdi, %rax), %rax |
| +# endif |
| +# ifndef USE_AS_STRCHRNUL |
| + cmp (%rax), %CHAR_REG |
| + cmovne %rdx, %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(first_vec_x2): |
| + tzcntl %eax, %eax |
| + /* Found CHAR or the null byte. */ |
| +# ifdef USE_AS_WCSCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax |
| +# else |
| + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax |
| +# endif |
| +# ifndef USE_AS_STRCHRNUL |
| + cmp (%rax), %CHAR_REG |
| + cmovne %rdx, %rax |
| +# endif |
| + ret |
| + |
| +L(prep_loop_4x): |
| + /* Align data to 4 * VEC_SIZE. */ |
| + andq $-(VEC_SIZE * 4), %rdi |
| + |
| + .p2align 4 |
| +L(loop_4x_vec): |
| + /* Compare 4 * VEC at a time forward. */ |
| + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 |
| + VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 |
| + VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 |
| + VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 |
| + |
| + /* Leaves only CHARS matching esi as 0. */ |
| + vpxorq %YMM1, %YMM0, %YMM5 |
| + vpxorq %YMM2, %YMM0, %YMM6 |
| + vpxorq %YMM3, %YMM0, %YMM7 |
| + vpxorq %YMM4, %YMM0, %YMM8 |
| + |
| + VPMINU %YMM5, %YMM1, %YMM5 |
| + VPMINU %YMM6, %YMM2, %YMM6 |
| + VPMINU %YMM7, %YMM3, %YMM7 |
| + VPMINU %YMM8, %YMM4, %YMM8 |
| + |
| + VPMINU %YMM5, %YMM6, %YMM1 |
| + VPMINU %YMM7, %YMM8, %YMM2 |
| + |
| + VPMINU %YMM1, %YMM2, %YMM1 |
| + |
| + /* Each bit in K0 represents a CHAR or a null byte. */ |
| + VPCMP $0, %YMMZERO, %YMM1, %k0 |
| + |
| + addq $(VEC_SIZE * 4), %rdi |
| + |
| + ktestd %k0, %k0 |
| + jz L(loop_4x_vec) |
| + |
| + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ |
| + VPCMP $0, %YMMZERO, %YMM5, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x0) |
| + |
| + /* Each bit in K1 represents a CHAR or a null byte in YMM2. */ |
| + VPCMP $0, %YMMZERO, %YMM6, %k1 |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x1) |
| + |
| + /* Each bit in K2 represents a CHAR or a null byte in YMM3. */ |
| + VPCMP $0, %YMMZERO, %YMM7, %k2 |
| + /* Each bit in K3 represents a CHAR or a null byte in YMM4. */ |
| + VPCMP $0, %YMMZERO, %YMM8, %k3 |
| + |
| +# ifdef USE_AS_WCSCHR |
| + /* NB: Each bit in K2/K3 represents a 4-byte element. */ |
| + kshiftlw $8, %k3, %k1 |
| +# else |
| + kshiftlq $32, %k3, %k1 |
| +# endif |
| + |
| + /* Each bit in K1 represents a NULL or a mismatch. */ |
| + korq %k1, %k2, %k1 |
| + kmovq %k1, %rax |
| + |
| + tzcntq %rax, %rax |
| +# ifdef USE_AS_WCSCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax |
| +# else |
| + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax |
| +# endif |
| +# ifndef USE_AS_STRCHRNUL |
| + cmp (%rax), %CHAR_REG |
| + cmovne %rdx, %rax |
| +# endif |
| + ret |
| + |
| + /* Cold case for crossing page with first load. */ |
| + .p2align 4 |
| +L(cross_page_boundary): |
| + andq $-VEC_SIZE, %rdi |
| + andl $(VEC_SIZE - 1), %ecx |
| + |
| + VMOVA (%rdi), %YMM1 |
| + |
| + /* Leaves only CHARS matching esi as 0. */ |
| + vpxorq %YMM1, %YMM0, %YMM2 |
| + VPMINU %YMM2, %YMM1, %YMM2 |
| + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ |
| + VPCMP $0, %YMMZERO, %YMM2, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + |
| +# ifdef USE_AS_WCSCHR |
| + /* NB: Divide shift count by 4 since each bit in K1 represents 4 |
| + bytes. */ |
| + movl %ecx, %SHIFT_REG |
| + sarl $2, %SHIFT_REG |
| +# endif |
| + |
| + /* Remove the leading bits. */ |
| + sarxl %SHIFT_REG, %eax, %eax |
| + testl %eax, %eax |
| + |
| + jz L(aligned_more) |
| + tzcntl %eax, %eax |
| + addq %rcx, %rdi |
| +# ifdef USE_AS_WCSCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + leaq (%rdi, %rax, 4), %rax |
| +# else |
| + addq %rdi, %rax |
| +# endif |
| +# ifndef USE_AS_STRCHRNUL |
| + cmp (%rax), %CHAR_REG |
| + cmovne %rdx, %rax |
| +# endif |
| + ret |
| + |
| +END (STRCHR) |
| +# endif |
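| |
| strchr-evex.S above provides strchr and, through the strchrnul-evex.S |
| stub below and wcschr-evex.S later in the patch, strchrnul and wcschr. |
| The byte-string semantics it implements, in plain C for reference (the |
| ref_ helpers are illustrative, not part of the patch): |
| |
|     #include <stddef.h> |
| |
|     /* strchrnul: pointer to the first occurrence of C, or to the |
|        terminating '\0' if C does not occur.  */ |
|     static char * |
|     ref_strchrnul (const char *s, int c) |
|     { |
|       while (*s != (char) c && *s != '\0') |
|         ++s; |
|       return (char *) s; |
|     } |
| |
|     /* strchr: like strchrnul, but returns NULL when C is absent.  Note |
|        that strchr (s, '\0') must return the terminator, as it does here.  */ |
|     static char * |
|     ref_strchr (const char *s, int c) |
|     { |
|       char *p = ref_strchrnul (s, c); |
|       return *p == (char) c ? p : NULL; |
|     } |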
| diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c |
| index 32954713..be05e197 100644 |
| --- a/sysdeps/x86_64/multiarch/strchr.c |
| +++ b/sysdeps/x86_64/multiarch/strchr.c |
| @@ -29,16 +29,24 @@ |
| extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; |
| extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden; |
| extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; |
| +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; |
| |
| static inline void * |
| IFUNC_SELECTOR (void) |
| { |
| const struct cpu_features* cpu_features = __get_cpu_features (); |
| |
| - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) |
| - && CPU_FEATURE_USABLE_P (cpu_features, AVX2) |
| + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) |
| && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) |
| - return OPTIMIZE (avx2); |
| + { |
| + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) |
| + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) |
| + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) |
| + return OPTIMIZE (evex); |
| + |
| + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) |
| + return OPTIMIZE (avx2); |
| + } |
| |
| if (CPU_FEATURES_ARCH_P (cpu_features, Slow_BSF)) |
| return OPTIMIZE (sse2_no_bsf); |
| diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex.S b/sysdeps/x86_64/multiarch/strchrnul-evex.S |
| new file mode 100644 |
| index 00000000..064fe7ca |
| --- /dev/null |
| +++ b/sysdeps/x86_64/multiarch/strchrnul-evex.S |
| @@ -0,0 +1,3 @@ |
| +#define STRCHR __strchrnul_evex |
| +#define USE_AS_STRCHRNUL 1 |
| +#include "strchr-evex.S" |
| diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S |
| new file mode 100644 |
| index 00000000..459eeed0 |
| --- /dev/null |
| +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S |
| @@ -0,0 +1,1043 @@ |
| +/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions. |
| + Copyright (C) 2021 Free Software Foundation, Inc. |
| + This file is part of the GNU C Library. |
| + |
| + The GNU C Library is free software; you can redistribute it and/or |
| + modify it under the terms of the GNU Lesser General Public |
| + License as published by the Free Software Foundation; either |
| + version 2.1 of the License, or (at your option) any later version. |
| + |
| + The GNU C Library is distributed in the hope that it will be useful, |
| + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| + Lesser General Public License for more details. |
| + |
| + You should have received a copy of the GNU Lesser General Public |
| + License along with the GNU C Library; if not, see |
| + <https://www.gnu.org/licenses/>. */ |
| + |
| +#if IS_IN (libc) |
| + |
| +# include <sysdep.h> |
| + |
| +# ifndef STRCMP |
| +# define STRCMP __strcmp_evex |
| +# endif |
| + |
| +# define PAGE_SIZE 4096 |
| + |
| +/* VEC_SIZE = Number of bytes in a ymm register */ |
| +# define VEC_SIZE 32 |
| + |
| +/* Shift for dividing by (VEC_SIZE * 4). */ |
| +# define DIVIDE_BY_VEC_4_SHIFT 7 |
| +# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) |
| +# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) |
| +# endif |
| + |
| +# define VMOVU vmovdqu64 |
| +# define VMOVA vmovdqa64 |
| + |
| +# ifdef USE_AS_WCSCMP |
| +/* Compare packed dwords. */ |
| +# define VPCMP vpcmpd |
| +# define SHIFT_REG32 r8d |
| +# define SHIFT_REG64 r8 |
| +/* 1 dword char == 4 bytes. */ |
| +# define SIZE_OF_CHAR 4 |
| +# else |
| +/* Compare packed bytes. */ |
| +# define VPCMP vpcmpb |
| +# define SHIFT_REG32 ecx |
| +# define SHIFT_REG64 rcx |
| +/* 1 byte char == 1 byte. */ |
| +# define SIZE_OF_CHAR 1 |
| +# endif |
| + |
| +# define XMMZERO xmm16 |
| +# define XMM0 xmm17 |
| +# define XMM1 xmm18 |
| + |
| +# define YMMZERO ymm16 |
| +# define YMM0 ymm17 |
| +# define YMM1 ymm18 |
| +# define YMM2 ymm19 |
| +# define YMM3 ymm20 |
| +# define YMM4 ymm21 |
| +# define YMM5 ymm22 |
| +# define YMM6 ymm23 |
| +# define YMM7 ymm24 |
| + |
| +/* Warning! |
| + wcscmp/wcsncmp have to use SIGNED comparison for elements. |
| + strcmp/strncmp have to use UNSIGNED comparison for elements. |
| +*/ |
| + |
| +/* The main idea of the string comparison (byte or dword) using 256-bit |
| + EVEX instructions consists of comparing (VPCMP) two ymm vectors. The |
| + latter can operate on either packed bytes or dwords depending on |
| + USE_AS_WCSCMP. In order to check the null char, the algorithm keeps the |
| + matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2 |
| + KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes) |
| + are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd |
| + instructions. The main loop (away from the page boundary) compares 4 |
| + vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128 |
| + bytes) on each loop. |
| + |
| + The strncmp/wcsncmp routines (enabled by defining USE_AS_STRNCMP) use |
| + the same logic as strcmp, except that a maximum offset is tracked. If |
| + the maximum offset is reached before a difference is found, zero is |
| + returned. */ |
| + |
| + .section .text.evex,"ax",@progbits |
| +ENTRY (STRCMP) |
| +# ifdef USE_AS_STRNCMP |
| + /* Check for simple cases (0 or 1) in offset. */ |
| + cmp $1, %RDX_LP |
| + je L(char0) |
| + jb L(zero) |
| +# ifdef USE_AS_WCSCMP |
| + /* Convert units: from wide to byte char. */ |
| + shl $2, %RDX_LP |
| +# endif |
| + /* Register %r11 tracks the maximum offset. */ |
| + mov %RDX_LP, %R11_LP |
| +# endif |
| + movl %edi, %eax |
| + xorl %edx, %edx |
| + /* Make %XMMZERO (%YMMZERO) all zeros in this function. */ |
| + vpxorq %XMMZERO, %XMMZERO, %XMMZERO |
| + orl %esi, %eax |
| + andl $(PAGE_SIZE - 1), %eax |
| + cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax |
| + jg L(cross_page) |
| + /* Start comparing 4 vectors. */ |
| + VMOVU (%rdi), %YMM0 |
| + VMOVU (%rsi), %YMM1 |
| + |
| + /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ |
| + VPCMP $4, %YMM0, %YMM1, %k0 |
| + |
| + /* Check for NULL in YMM0. */ |
| + VPCMP $0, %YMMZERO, %YMM0, %k1 |
| + /* Check for NULL in YMM1. */ |
| + VPCMP $0, %YMMZERO, %YMM1, %k2 |
| + /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ |
| + kord %k1, %k2, %k1 |
| + |
| + /* Each bit in K1 represents: |
| + 1. A mismatch in YMM0 and YMM1. Or |
| + 2. A NULL in YMM0 or YMM1. |
| + */ |
| + kord %k0, %k1, %k1 |
| + |
| + ktestd %k1, %k1 |
| + je L(next_3_vectors) |
| + kmovd %k1, %ecx |
| + tzcntl %ecx, %edx |
| +# ifdef USE_AS_WCSCMP |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %edx |
| +# endif |
| +# ifdef USE_AS_STRNCMP |
| + /* Return 0 if the mismatched index (%rdx) is after the maximum |
| + offset (%r11). */ |
| + cmpq %r11, %rdx |
| + jae L(zero) |
| +# endif |
| +# ifdef USE_AS_WCSCMP |
| + xorl %eax, %eax |
| + movl (%rdi, %rdx), %ecx |
| + cmpl (%rsi, %rdx), %ecx |
| + je L(return) |
| +L(wcscmp_return): |
| + setl %al |
| + negl %eax |
| + orl $1, %eax |
| +L(return): |
| +# else |
| + movzbl (%rdi, %rdx), %eax |
| + movzbl (%rsi, %rdx), %edx |
| + subl %edx, %eax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(return_vec_size): |
| + kmovd %k1, %ecx |
| + tzcntl %ecx, %edx |
| +# ifdef USE_AS_WCSCMP |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %edx |
| +# endif |
| +# ifdef USE_AS_STRNCMP |
| + /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after |
| + the maximum offset (%r11). */ |
| + addq $VEC_SIZE, %rdx |
| + cmpq %r11, %rdx |
| + jae L(zero) |
| +# ifdef USE_AS_WCSCMP |
| + xorl %eax, %eax |
| + movl (%rdi, %rdx), %ecx |
| + cmpl (%rsi, %rdx), %ecx |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl (%rdi, %rdx), %eax |
| + movzbl (%rsi, %rdx), %edx |
| + subl %edx, %eax |
| +# endif |
| +# else |
| +# ifdef USE_AS_WCSCMP |
| + xorl %eax, %eax |
| + movl VEC_SIZE(%rdi, %rdx), %ecx |
| + cmpl VEC_SIZE(%rsi, %rdx), %ecx |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl VEC_SIZE(%rdi, %rdx), %eax |
| + movzbl VEC_SIZE(%rsi, %rdx), %edx |
| + subl %edx, %eax |
| +# endif |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(return_2_vec_size): |
| + kmovd %k1, %ecx |
| + tzcntl %ecx, %edx |
| +# ifdef USE_AS_WCSCMP |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %edx |
| +# endif |
| +# ifdef USE_AS_STRNCMP |
| + /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is |
| + after the maximum offset (%r11). */ |
| + addq $(VEC_SIZE * 2), %rdx |
| + cmpq %r11, %rdx |
| + jae L(zero) |
| +# ifdef USE_AS_WCSCMP |
| + xorl %eax, %eax |
| + movl (%rdi, %rdx), %ecx |
| + cmpl (%rsi, %rdx), %ecx |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl (%rdi, %rdx), %eax |
| + movzbl (%rsi, %rdx), %edx |
| + subl %edx, %eax |
| +# endif |
| +# else |
| +# ifdef USE_AS_WCSCMP |
| + xorl %eax, %eax |
| + movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx |
| + cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax |
| + movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx |
| + subl %edx, %eax |
| +# endif |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(return_3_vec_size): |
| + kmovd %k1, %ecx |
| + tzcntl %ecx, %edx |
| +# ifdef USE_AS_WCSCMP |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %edx |
| +# endif |
| +# ifdef USE_AS_STRNCMP |
| + /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is |
| + after the maximum offset (%r11). */ |
| + addq $(VEC_SIZE * 3), %rdx |
| + cmpq %r11, %rdx |
| + jae L(zero) |
| +# ifdef USE_AS_WCSCMP |
| + xorl %eax, %eax |
| + movl (%rdi, %rdx), %ecx |
| + cmpl (%rsi, %rdx), %ecx |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl (%rdi, %rdx), %eax |
| + movzbl (%rsi, %rdx), %edx |
| + subl %edx, %eax |
| +# endif |
| +# else |
| +# ifdef USE_AS_WCSCMP |
| + xorl %eax, %eax |
| + movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx |
| + cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax |
| + movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx |
| + subl %edx, %eax |
| +# endif |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(next_3_vectors): |
| + VMOVU VEC_SIZE(%rdi), %YMM0 |
| + VMOVU VEC_SIZE(%rsi), %YMM1 |
| + /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ |
| + VPCMP $4, %YMM0, %YMM1, %k0 |
| + VPCMP $0, %YMMZERO, %YMM0, %k1 |
| + VPCMP $0, %YMMZERO, %YMM1, %k2 |
| + /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ |
| + kord %k1, %k2, %k1 |
| + /* Each bit in K1 represents a NULL or a mismatch. */ |
| + kord %k0, %k1, %k1 |
| + ktestd %k1, %k1 |
| + jne L(return_vec_size) |
| + |
| + VMOVU (VEC_SIZE * 2)(%rdi), %YMM2 |
| + VMOVU (VEC_SIZE * 3)(%rdi), %YMM3 |
| + VMOVU (VEC_SIZE * 2)(%rsi), %YMM4 |
| + VMOVU (VEC_SIZE * 3)(%rsi), %YMM5 |
| + |
| + /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */ |
| + VPCMP $4, %YMM2, %YMM4, %k0 |
| + VPCMP $0, %YMMZERO, %YMM2, %k1 |
| + VPCMP $0, %YMMZERO, %YMM4, %k2 |
| + /* Each bit in K1 represents a NULL in YMM2 or YMM4. */ |
| + kord %k1, %k2, %k1 |
| + /* Each bit in K1 represents a NULL or a mismatch. */ |
| + kord %k0, %k1, %k1 |
| + ktestd %k1, %k1 |
| + jne L(return_2_vec_size) |
| + |
| + /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */ |
| + VPCMP $4, %YMM3, %YMM5, %k0 |
| + VPCMP $0, %YMMZERO, %YMM3, %k1 |
| + VPCMP $0, %YMMZERO, %YMM5, %k2 |
| + /* Each bit in K1 represents a NULL in YMM3 or YMM5. */ |
| + kord %k1, %k2, %k1 |
| + /* Each bit in K1 represents a NULL or a mismatch. */ |
| + kord %k0, %k1, %k1 |
| + ktestd %k1, %k1 |
| + jne L(return_3_vec_size) |
| +L(main_loop_header): |
| + leaq (VEC_SIZE * 4)(%rdi), %rdx |
| + movl $PAGE_SIZE, %ecx |
| + /* Align load via RAX. */ |
| + andq $-(VEC_SIZE * 4), %rdx |
| + subq %rdi, %rdx |
| + leaq (%rdi, %rdx), %rax |
| +# ifdef USE_AS_STRNCMP |
| + /* Starting from this point, the maximum offset, or simply the |
| + 'offset', DECREASES by the same amount when base pointers are |
| + moved forward. Return 0 when: |
| + 1) On match: offset <= the matched vector index. |
| + 2) On mismatch: offset is before the mismatched index. |
| + */ |
| + subq %rdx, %r11 |
| + jbe L(zero) |
| +# endif |
| + addq %rsi, %rdx |
| + movq %rdx, %rsi |
| + andl $(PAGE_SIZE - 1), %esi |
| + /* Number of bytes before page crossing. */ |
| + subq %rsi, %rcx |
| + /* Number of VEC_SIZE * 4 blocks before page crossing. */ |
| + shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx |
| + /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ |
| + movl %ecx, %esi |
| + jmp L(loop_start) |
| + |
| + .p2align 4 |
| +L(loop): |
| +# ifdef USE_AS_STRNCMP |
| + /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease |
| + the maximum offset (%r11) by the same amount. */ |
| + subq $(VEC_SIZE * 4), %r11 |
| + jbe L(zero) |
| +# endif |
| + addq $(VEC_SIZE * 4), %rax |
| + addq $(VEC_SIZE * 4), %rdx |
| +L(loop_start): |
| + testl %esi, %esi |
| + leal -1(%esi), %esi |
| + je L(loop_cross_page) |
| +L(back_to_loop): |
| + /* Main loop, comparing 4 vectors at a time. */ |
| + VMOVA (%rax), %YMM0 |
| + VMOVA VEC_SIZE(%rax), %YMM2 |
| + VMOVA (VEC_SIZE * 2)(%rax), %YMM4 |
| + VMOVA (VEC_SIZE * 3)(%rax), %YMM6 |
| + VMOVU (%rdx), %YMM1 |
| + VMOVU VEC_SIZE(%rdx), %YMM3 |
| + VMOVU (VEC_SIZE * 2)(%rdx), %YMM5 |
| + VMOVU (VEC_SIZE * 3)(%rdx), %YMM7 |
| + |
| + VPCMP $4, %YMM0, %YMM1, %k0 |
| + VPCMP $0, %YMMZERO, %YMM0, %k1 |
| + VPCMP $0, %YMMZERO, %YMM1, %k2 |
| + kord %k1, %k2, %k1 |
| + /* Each bit in K4 represents a NULL or a mismatch in YMM0 and |
| + YMM1. */ |
| + kord %k0, %k1, %k4 |
| + |
| + VPCMP $4, %YMM2, %YMM3, %k0 |
| + VPCMP $0, %YMMZERO, %YMM2, %k1 |
| + VPCMP $0, %YMMZERO, %YMM3, %k2 |
| + kord %k1, %k2, %k1 |
| + /* Each bit in K5 represents a NULL or a mismatch in YMM2 and |
| + YMM3. */ |
| + kord %k0, %k1, %k5 |
| + |
| + VPCMP $4, %YMM4, %YMM5, %k0 |
| + VPCMP $0, %YMMZERO, %YMM4, %k1 |
| + VPCMP $0, %YMMZERO, %YMM5, %k2 |
| + kord %k1, %k2, %k1 |
| + /* Each bit in K6 represents a NULL or a mismatch in YMM4 and |
| + YMM5. */ |
| + kord %k0, %k1, %k6 |
| + |
| + VPCMP $4, %YMM6, %YMM7, %k0 |
| + VPCMP $0, %YMMZERO, %YMM6, %k1 |
| + VPCMP $0, %YMMZERO, %YMM7, %k2 |
| + kord %k1, %k2, %k1 |
| + /* Each bit in K7 represents a NULL or a mismatch in YMM6 and |
| + YMM7. */ |
| + kord %k0, %k1, %k7 |
| + |
| + kord %k4, %k5, %k0 |
| + kord %k6, %k7, %k1 |
| + |
| + /* Test each mask (32 bits) individually because for VEC_SIZE |
| + == 32 it is not possible to OR the four masks and keep all bits |
| + in a 64-bit integer register, differing from SSE2 strcmp |
| + where ORing is possible. */ |
| + kortestd %k0, %k1 |
| + je L(loop) |
| + ktestd %k4, %k4 |
| + je L(test_vec) |
| + kmovd %k4, %edi |
| + tzcntl %edi, %ecx |
| +# ifdef USE_AS_WCSCMP |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %ecx |
| +# endif |
| +# ifdef USE_AS_STRNCMP |
| + cmpq %rcx, %r11 |
| + jbe L(zero) |
| +# ifdef USE_AS_WCSCMP |
| + movq %rax, %rsi |
| + xorl %eax, %eax |
| + movl (%rsi, %rcx), %edi |
| + cmpl (%rdx, %rcx), %edi |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl (%rax, %rcx), %eax |
| + movzbl (%rdx, %rcx), %edx |
| + subl %edx, %eax |
| +# endif |
| +# else |
| +# ifdef USE_AS_WCSCMP |
| + movq %rax, %rsi |
| + xorl %eax, %eax |
| + movl (%rsi, %rcx), %edi |
| + cmpl (%rdx, %rcx), %edi |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl (%rax, %rcx), %eax |
| + movzbl (%rdx, %rcx), %edx |
| + subl %edx, %eax |
| +# endif |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(test_vec): |
| +# ifdef USE_AS_STRNCMP |
| + /* The first vector matched. Return 0 if the maximum offset |
| + (%r11) <= VEC_SIZE. */ |
| + cmpq $VEC_SIZE, %r11 |
| + jbe L(zero) |
| +# endif |
| + ktestd %k5, %k5 |
| + je L(test_2_vec) |
| + kmovd %k5, %ecx |
| + tzcntl %ecx, %edi |
| +# ifdef USE_AS_WCSCMP |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %edi |
| +# endif |
| +# ifdef USE_AS_STRNCMP |
| + addq $VEC_SIZE, %rdi |
| + cmpq %rdi, %r11 |
| + jbe L(zero) |
| +# ifdef USE_AS_WCSCMP |
| + movq %rax, %rsi |
| + xorl %eax, %eax |
| + movl (%rsi, %rdi), %ecx |
| + cmpl (%rdx, %rdi), %ecx |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl (%rax, %rdi), %eax |
| + movzbl (%rdx, %rdi), %edx |
| + subl %edx, %eax |
| +# endif |
| +# else |
| +# ifdef USE_AS_WCSCMP |
| + movq %rax, %rsi |
| + xorl %eax, %eax |
| + movl VEC_SIZE(%rsi, %rdi), %ecx |
| + cmpl VEC_SIZE(%rdx, %rdi), %ecx |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl VEC_SIZE(%rax, %rdi), %eax |
| + movzbl VEC_SIZE(%rdx, %rdi), %edx |
| + subl %edx, %eax |
| +# endif |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(test_2_vec): |
| +# ifdef USE_AS_STRNCMP |
| + /* The first 2 vectors matched. Return 0 if the maximum offset |
| + (%r11) <= 2 * VEC_SIZE. */ |
| + cmpq $(VEC_SIZE * 2), %r11 |
| + jbe L(zero) |
| +# endif |
| + ktestd %k6, %k6 |
| + je L(test_3_vec) |
| + kmovd %k6, %ecx |
| + tzcntl %ecx, %edi |
| +# ifdef USE_AS_WCSCMP |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %edi |
| +# endif |
| +# ifdef USE_AS_STRNCMP |
| + addq $(VEC_SIZE * 2), %rdi |
| + cmpq %rdi, %r11 |
| + jbe L(zero) |
| +# ifdef USE_AS_WCSCMP |
| + movq %rax, %rsi |
| + xorl %eax, %eax |
| + movl (%rsi, %rdi), %ecx |
| + cmpl (%rdx, %rdi), %ecx |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl (%rax, %rdi), %eax |
| + movzbl (%rdx, %rdi), %edx |
| + subl %edx, %eax |
| +# endif |
| +# else |
| +# ifdef USE_AS_WCSCMP |
| + movq %rax, %rsi |
| + xorl %eax, %eax |
| + movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx |
| + cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax |
| + movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx |
| + subl %edx, %eax |
| +# endif |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(test_3_vec): |
| +# ifdef USE_AS_STRNCMP |
| + /* The first 3 vectors matched. Return 0 if the maximum offset |
| + (%r11) <= 3 * VEC_SIZE. */ |
| + cmpq $(VEC_SIZE * 3), %r11 |
| + jbe L(zero) |
| +# endif |
| + kmovd %k7, %esi |
| + tzcntl %esi, %ecx |
| +# ifdef USE_AS_WCSCMP |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %ecx |
| +# endif |
| +# ifdef USE_AS_STRNCMP |
| + addq $(VEC_SIZE * 3), %rcx |
| + cmpq %rcx, %r11 |
| + jbe L(zero) |
| +# ifdef USE_AS_WCSCMP |
| + movq %rax, %rsi |
| + xorl %eax, %eax |
| + movl (%rsi, %rcx), %esi |
| + cmpl (%rdx, %rcx), %esi |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl (%rax, %rcx), %eax |
| + movzbl (%rdx, %rcx), %edx |
| + subl %edx, %eax |
| +# endif |
| +# else |
| +# ifdef USE_AS_WCSCMP |
| + movq %rax, %rsi |
| + xorl %eax, %eax |
| + movl (VEC_SIZE * 3)(%rsi, %rcx), %esi |
| + cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax |
| + movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx |
| + subl %edx, %eax |
| +# endif |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(loop_cross_page): |
| + xorl %r10d, %r10d |
| + movq %rdx, %rcx |
| + /* Align load via RDX. We load the extra ECX bytes which should |
| + be ignored. */ |
| + andl $((VEC_SIZE * 4) - 1), %ecx |
| + /* R10 is -RCX. */ |
| + subq %rcx, %r10 |
| + |
| + /* This works only if VEC_SIZE * 2 == 64. */ |
| +# if (VEC_SIZE * 2) != 64 |
| +# error (VEC_SIZE * 2) != 64 |
| +# endif |
| + |
| + /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ |
| + cmpl $(VEC_SIZE * 2), %ecx |
| + jge L(loop_cross_page_2_vec) |
| + |
| + VMOVU (%rax, %r10), %YMM2 |
| + VMOVU VEC_SIZE(%rax, %r10), %YMM3 |
| + VMOVU (%rdx, %r10), %YMM4 |
| + VMOVU VEC_SIZE(%rdx, %r10), %YMM5 |
| + |
| + VPCMP $4, %YMM4, %YMM2, %k0 |
| + VPCMP $0, %YMMZERO, %YMM2, %k1 |
| + VPCMP $0, %YMMZERO, %YMM4, %k2 |
| + kord %k1, %k2, %k1 |
| + /* Each bit in K1 represents a NULL or a mismatch in YMM2 and |
| + YMM4. */ |
| + kord %k0, %k1, %k1 |
| + |
| + VPCMP $4, %YMM5, %YMM3, %k3 |
| + VPCMP $0, %YMMZERO, %YMM3, %k4 |
| + VPCMP $0, %YMMZERO, %YMM5, %k5 |
| + kord %k4, %k5, %k4 |
| + /* Each bit in K3 represents a NULL or a mismatch in YMM3 and |
| + YMM5. */ |
| + kord %k3, %k4, %k3 |
| + |
| +# ifdef USE_AS_WCSCMP |
| +        /* NB: Each bit in K1/K3 represents a 4-byte element. */ |
| + kshiftlw $8, %k3, %k2 |
| +        /* NB: Divide shift count by 4 since each bit in K1 represents |
| +           4 bytes. */ |
| + movl %ecx, %SHIFT_REG32 |
| + sarl $2, %SHIFT_REG32 |
| +# else |
| + kshiftlq $32, %k3, %k2 |
| +# endif |
| + |
| + /* Each bit in K1 represents a NULL or a mismatch. */ |
| + korq %k1, %k2, %k1 |
| + kmovq %k1, %rdi |
| + |
| + /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ |
| + shrxq %SHIFT_REG64, %rdi, %rdi |
| + testq %rdi, %rdi |
| + je L(loop_cross_page_2_vec) |
| + tzcntq %rdi, %rcx |
| +# ifdef USE_AS_WCSCMP |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %ecx |
| +# endif |
| +# ifdef USE_AS_STRNCMP |
| + cmpq %rcx, %r11 |
| + jbe L(zero) |
| +# ifdef USE_AS_WCSCMP |
| + movq %rax, %rsi |
| + xorl %eax, %eax |
| + movl (%rsi, %rcx), %edi |
| + cmpl (%rdx, %rcx), %edi |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl (%rax, %rcx), %eax |
| + movzbl (%rdx, %rcx), %edx |
| + subl %edx, %eax |
| +# endif |
| +# else |
| +# ifdef USE_AS_WCSCMP |
| + movq %rax, %rsi |
| + xorl %eax, %eax |
| + movl (%rsi, %rcx), %edi |
| + cmpl (%rdx, %rcx), %edi |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl (%rax, %rcx), %eax |
| + movzbl (%rdx, %rcx), %edx |
| + subl %edx, %eax |
| +# endif |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(loop_cross_page_2_vec): |
| + /* The first VEC_SIZE * 2 bytes match or are ignored. */ |
| + VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 |
| + VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 |
| + VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2 |
| + VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3 |
| + |
| + VPCMP $4, %YMM0, %YMM2, %k0 |
| + VPCMP $0, %YMMZERO, %YMM0, %k1 |
| + VPCMP $0, %YMMZERO, %YMM2, %k2 |
| + kord %k1, %k2, %k1 |
| + /* Each bit in K1 represents a NULL or a mismatch in YMM0 and |
| + YMM2. */ |
| + kord %k0, %k1, %k1 |
| + |
| + VPCMP $4, %YMM1, %YMM3, %k3 |
| + VPCMP $0, %YMMZERO, %YMM1, %k4 |
| + VPCMP $0, %YMMZERO, %YMM3, %k5 |
| + kord %k4, %k5, %k4 |
| + /* Each bit in K3 represents a NULL or a mismatch in YMM1 and |
| + YMM3. */ |
| + kord %k3, %k4, %k3 |
| + |
| +# ifdef USE_AS_WCSCMP |
| +        /* NB: Each bit in K1/K3 represents a 4-byte element. */ |
| + kshiftlw $8, %k3, %k2 |
| +# else |
| + kshiftlq $32, %k3, %k2 |
| +# endif |
| + |
| + /* Each bit in K1 represents a NULL or a mismatch. */ |
| + korq %k1, %k2, %k1 |
| + kmovq %k1, %rdi |
| + |
| + xorl %r8d, %r8d |
| + /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ |
| + subl $(VEC_SIZE * 2), %ecx |
| + jle 1f |
| + /* R8 has number of bytes skipped. */ |
| + movl %ecx, %r8d |
| +# ifdef USE_AS_WCSCMP |
| +        /* NB: Divide shift count by 4 since each bit in K1 represents |
| +           4 bytes. */ |
| + sarl $2, %ecx |
| +# endif |
| + /* Skip ECX bytes. */ |
| + shrq %cl, %rdi |
| +1: |
| + /* Before jumping back to the loop, set ESI to the number of |
| + VEC_SIZE * 4 blocks before page crossing. */ |
| + movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi |
| + |
| + testq %rdi, %rdi |
| +# ifdef USE_AS_STRNCMP |
| +        /* At this point, if %rdi is 0, VEC_SIZE * 4 + %r10 bytes |
| +           starting from %rax have already been tested.  The target |
| +           label checks whether the strncmp maximum offset has been |
| +           reached. */ |
| + je L(string_nbyte_offset_check) |
| +# else |
| + je L(back_to_loop) |
| +# endif |
| + tzcntq %rdi, %rcx |
| +# ifdef USE_AS_WCSCMP |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %ecx |
| +# endif |
| + addq %r10, %rcx |
| + /* Adjust for number of bytes skipped. */ |
| + addq %r8, %rcx |
| +# ifdef USE_AS_STRNCMP |
| + addq $(VEC_SIZE * 2), %rcx |
| + subq %rcx, %r11 |
| + jbe L(zero) |
| +# ifdef USE_AS_WCSCMP |
| + movq %rax, %rsi |
| + xorl %eax, %eax |
| + movl (%rsi, %rcx), %edi |
| + cmpl (%rdx, %rcx), %edi |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl (%rax, %rcx), %eax |
| + movzbl (%rdx, %rcx), %edx |
| + subl %edx, %eax |
| +# endif |
| +# else |
| +# ifdef USE_AS_WCSCMP |
| + movq %rax, %rsi |
| + xorl %eax, %eax |
| + movl (VEC_SIZE * 2)(%rsi, %rcx), %edi |
| + cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax |
| + movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx |
| + subl %edx, %eax |
| +# endif |
| +# endif |
| + ret |
| + |
| +# ifdef USE_AS_STRNCMP |
| +L(string_nbyte_offset_check): |
| + leaq (VEC_SIZE * 4)(%r10), %r10 |
| + cmpq %r10, %r11 |
| + jbe L(zero) |
| + jmp L(back_to_loop) |
| +# endif |
| + |
| + .p2align 4 |
| +L(cross_page_loop): |
| + /* Check one byte/dword at a time. */ |
| +# ifdef USE_AS_WCSCMP |
| + cmpl %ecx, %eax |
| +# else |
| + subl %ecx, %eax |
| +# endif |
| + jne L(different) |
| + addl $SIZE_OF_CHAR, %edx |
| + cmpl $(VEC_SIZE * 4), %edx |
| + je L(main_loop_header) |
| +# ifdef USE_AS_STRNCMP |
| + cmpq %r11, %rdx |
| + jae L(zero) |
| +# endif |
| +# ifdef USE_AS_WCSCMP |
| + movl (%rdi, %rdx), %eax |
| + movl (%rsi, %rdx), %ecx |
| +# else |
| + movzbl (%rdi, %rdx), %eax |
| + movzbl (%rsi, %rdx), %ecx |
| +# endif |
| + /* Check null char. */ |
| + testl %eax, %eax |
| + jne L(cross_page_loop) |
| + /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED |
| + comparisons. */ |
| + subl %ecx, %eax |
| +# ifndef USE_AS_WCSCMP |
| +L(different): |
| +# endif |
| + ret |
| + |
| +# ifdef USE_AS_WCSCMP |
| + .p2align 4 |
| +L(different): |
| + /* Use movl to avoid modifying EFLAGS. */ |
| + movl $0, %eax |
| + setl %al |
| + negl %eax |
| + orl $1, %eax |
| + ret |
| +# endif |
| + |
| +# ifdef USE_AS_STRNCMP |
| + .p2align 4 |
| +L(zero): |
| + xorl %eax, %eax |
| + ret |
| + |
| + .p2align 4 |
| +L(char0): |
| +# ifdef USE_AS_WCSCMP |
| + xorl %eax, %eax |
| + movl (%rdi), %ecx |
| + cmpl (%rsi), %ecx |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl (%rsi), %ecx |
| + movzbl (%rdi), %eax |
| + subl %ecx, %eax |
| +# endif |
| + ret |
| +# endif |
| + |
| + .p2align 4 |
| +L(last_vector): |
| + addq %rdx, %rdi |
| + addq %rdx, %rsi |
| +# ifdef USE_AS_STRNCMP |
| + subq %rdx, %r11 |
| +# endif |
| + tzcntl %ecx, %edx |
| +# ifdef USE_AS_WCSCMP |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %edx |
| +# endif |
| +# ifdef USE_AS_STRNCMP |
| + cmpq %r11, %rdx |
| + jae L(zero) |
| +# endif |
| +# ifdef USE_AS_WCSCMP |
| + xorl %eax, %eax |
| + movl (%rdi, %rdx), %ecx |
| + cmpl (%rsi, %rdx), %ecx |
| + jne L(wcscmp_return) |
| +# else |
| + movzbl (%rdi, %rdx), %eax |
| + movzbl (%rsi, %rdx), %edx |
| + subl %edx, %eax |
| +# endif |
| + ret |
| + |
| +        /* Comparing in the page boundary region requires special |
| +           treatment: it must be done one vector at a time, starting |
| +           with the wider ymm vector if possible and falling back to |
| +           xmm otherwise.  If fetching 16 bytes (xmm) still crosses |
| +           the boundary, byte comparison must be done. */ |
| + .p2align 4 |
| +L(cross_page): |
| + /* Try one ymm vector at a time. */ |
| + cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
| + jg L(cross_page_1_vector) |
| +L(loop_1_vector): |
| + VMOVU (%rdi, %rdx), %YMM0 |
| + VMOVU (%rsi, %rdx), %YMM1 |
| + |
| + /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ |
| + VPCMP $4, %YMM0, %YMM1, %k0 |
| + VPCMP $0, %YMMZERO, %YMM0, %k1 |
| + VPCMP $0, %YMMZERO, %YMM1, %k2 |
| + /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ |
| + kord %k1, %k2, %k1 |
| + /* Each bit in K1 represents a NULL or a mismatch. */ |
| + kord %k0, %k1, %k1 |
| + kmovd %k1, %ecx |
| + testl %ecx, %ecx |
| + jne L(last_vector) |
| + |
| + addl $VEC_SIZE, %edx |
| + |
| + addl $VEC_SIZE, %eax |
| +# ifdef USE_AS_STRNCMP |
| + /* Return 0 if the current offset (%rdx) >= the maximum offset |
| + (%r11). */ |
| + cmpq %r11, %rdx |
| + jae L(zero) |
| +# endif |
| + cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
| + jle L(loop_1_vector) |
| +L(cross_page_1_vector): |
| + /* Less than 32 bytes to check, try one xmm vector. */ |
| + cmpl $(PAGE_SIZE - 16), %eax |
| + jg L(cross_page_1_xmm) |
| + VMOVU (%rdi, %rdx), %XMM0 |
| + VMOVU (%rsi, %rdx), %XMM1 |
| + |
| + /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ |
| + VPCMP $4, %XMM0, %XMM1, %k0 |
| + VPCMP $0, %XMMZERO, %XMM0, %k1 |
| + VPCMP $0, %XMMZERO, %XMM1, %k2 |
| + /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ |
| + korw %k1, %k2, %k1 |
| + /* Each bit in K1 represents a NULL or a mismatch. */ |
| + korw %k0, %k1, %k1 |
| + kmovw %k1, %ecx |
| + testl %ecx, %ecx |
| + jne L(last_vector) |
| + |
| + addl $16, %edx |
| +# ifndef USE_AS_WCSCMP |
| + addl $16, %eax |
| +# endif |
| +# ifdef USE_AS_STRNCMP |
| + /* Return 0 if the current offset (%rdx) >= the maximum offset |
| + (%r11). */ |
| + cmpq %r11, %rdx |
| + jae L(zero) |
| +# endif |
| + |
| +L(cross_page_1_xmm): |
| +# ifndef USE_AS_WCSCMP |
| +        /* Less than 16 bytes to check, try an 8-byte vector.  NB: Not |
| +           needed for wcscmp or wcsncmp since a wide char is 4 bytes. */ |
| + cmpl $(PAGE_SIZE - 8), %eax |
| + jg L(cross_page_8bytes) |
| + vmovq (%rdi, %rdx), %XMM0 |
| + vmovq (%rsi, %rdx), %XMM1 |
| + |
| + /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ |
| + VPCMP $4, %XMM0, %XMM1, %k0 |
| + VPCMP $0, %XMMZERO, %XMM0, %k1 |
| + VPCMP $0, %XMMZERO, %XMM1, %k2 |
| + /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ |
| + kord %k1, %k2, %k1 |
| + /* Each bit in K1 represents a NULL or a mismatch. */ |
| + kord %k0, %k1, %k1 |
| + kmovd %k1, %ecx |
| + |
| +# ifdef USE_AS_WCSCMP |
| +        /* Only the last 2 bits are valid. */ |
| + andl $0x3, %ecx |
| +# else |
| +        /* Only the last 8 bits are valid. */ |
| + andl $0xff, %ecx |
| +# endif |
| + |
| + testl %ecx, %ecx |
| + jne L(last_vector) |
| + |
| + addl $8, %edx |
| + addl $8, %eax |
| +# ifdef USE_AS_STRNCMP |
| + /* Return 0 if the current offset (%rdx) >= the maximum offset |
| + (%r11). */ |
| + cmpq %r11, %rdx |
| + jae L(zero) |
| +# endif |
| + |
| +L(cross_page_8bytes): |
| +        /* Less than 8 bytes to check, try a 4-byte vector. */ |
| + cmpl $(PAGE_SIZE - 4), %eax |
| + jg L(cross_page_4bytes) |
| + vmovd (%rdi, %rdx), %XMM0 |
| + vmovd (%rsi, %rdx), %XMM1 |
| + |
| + /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ |
| + VPCMP $4, %XMM0, %XMM1, %k0 |
| + VPCMP $0, %XMMZERO, %XMM0, %k1 |
| + VPCMP $0, %XMMZERO, %XMM1, %k2 |
| + /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ |
| + kord %k1, %k2, %k1 |
| + /* Each bit in K1 represents a NULL or a mismatch. */ |
| + kord %k0, %k1, %k1 |
| + kmovd %k1, %ecx |
| + |
| +# ifdef USE_AS_WCSCMP |
| + /* Only the last bit is valid. */ |
| + andl $0x1, %ecx |
| +# else |
| +        /* Only the last 4 bits are valid. */ |
| + andl $0xf, %ecx |
| +# endif |
| + |
| + testl %ecx, %ecx |
| + jne L(last_vector) |
| + |
| + addl $4, %edx |
| +# ifdef USE_AS_STRNCMP |
| + /* Return 0 if the current offset (%rdx) >= the maximum offset |
| + (%r11). */ |
| + cmpq %r11, %rdx |
| + jae L(zero) |
| +# endif |
| + |
| +L(cross_page_4bytes): |
| +# endif |
| + /* Less than 4 bytes to check, try one byte/dword at a time. */ |
| +# ifdef USE_AS_STRNCMP |
| + cmpq %r11, %rdx |
| + jae L(zero) |
| +# endif |
| +# ifdef USE_AS_WCSCMP |
| + movl (%rdi, %rdx), %eax |
| + movl (%rsi, %rdx), %ecx |
| +# else |
| + movzbl (%rdi, %rdx), %eax |
| + movzbl (%rsi, %rdx), %ecx |
| +# endif |
| + testl %eax, %eax |
| + jne L(cross_page_loop) |
| + subl %ecx, %eax |
| + ret |
| +END (STRCMP) |
| +#endif |
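| |
| A minimal, hypothetical C sketch of the k-mask idiom used throughout |
| strcmp-evex.S above: compare 32 bytes for inequality, compare both inputs |
| against zero, and OR the masks so the lowest set bit marks the first |
| mismatch or NUL, mirroring the VPCMP/kord/tzcnt sequence in the main loop. |
| The helper name and fixed 32-byte block are illustrative assumptions, not |
| part of the patch. |
| |
| #include <immintrin.h> |
| |
| /* Sketch only: one 32-byte step of the mismatch/NUL detection using |
|    256-bit EVEX compares into k-mask registers.  Compile with |
|    -mavx512vl -mavx512bw -mbmi.  Returns the byte index of the first |
|    mismatch or NUL, or -1 if the 32 bytes are equal and NUL-free.  */ |
| static inline int |
| first_diff_or_nul (const char *s1, const char *s2) |
| { |
|   __m256i a = _mm256_loadu_si256 ((const __m256i *) s1); |
|   __m256i b = _mm256_loadu_si256 ((const __m256i *) s2); |
|   __m256i zero = _mm256_setzero_si256 (); |
|   /* Like VPCMP $4: byte positions where s1 and s2 differ.  */ |
|   __mmask32 neq = _mm256_cmpneq_epi8_mask (a, b); |
|   /* Like VPCMP $0 against zero: NUL bytes in either input.  */ |
|   __mmask32 nul = _mm256_cmpeq_epi8_mask (a, zero) |
|                   | _mm256_cmpeq_epi8_mask (b, zero); |
|   __mmask32 m = neq | nul; |
|   return m ? (int) _tzcnt_u32 (m) : -1; |
| } |
| |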
| diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c |
| index 3f433fbc..c5f38510 100644 |
| --- a/sysdeps/x86_64/multiarch/strcmp.c |
| +++ b/sysdeps/x86_64/multiarch/strcmp.c |
| @@ -30,16 +30,25 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; |
| extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; |
| extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; |
| extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; |
| +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; |
| |
| static inline void * |
| IFUNC_SELECTOR (void) |
| { |
| const struct cpu_features* cpu_features = __get_cpu_features (); |
| |
| - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) |
| - && CPU_FEATURE_USABLE_P (cpu_features, AVX2) |
| + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) |
| && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) |
| - return OPTIMIZE (avx2); |
| + { |
| + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) |
| + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) |
| + && CPU_FEATURE_USABLE_P (cpu_features, BMI2) |
| + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) |
| + return OPTIMIZE (evex); |
| + |
| + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) |
| + return OPTIMIZE (avx2); |
| + } |
| |
| if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) |
| return OPTIMIZE (sse2_unaligned); |
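| |
| The selector above only returns the EVEX variant when AVX512VL, AVX512BW |
| and BMI2 are all usable and Prefer_AVX2_STRCMP is not set.  As a rough, |
| stand-alone illustration of whether a given CPU would qualify (glibc |
| itself uses its internal cpu_features data, not this GCC builtin), one |
| might compile a sketch like the following: |
| |
| #include <stdio.h> |
| |
| /* Rough approximation of the feature checks above; it does not model |
|    tunables such as Prefer_AVX2_STRCMP or Prefer_No_VZEROUPPER.  */ |
| int |
| main (void) |
| { |
|   int evex = __builtin_cpu_supports ("avx512vl") |
|              && __builtin_cpu_supports ("avx512bw") |
|              && __builtin_cpu_supports ("bmi2"); |
|   int avx2 = __builtin_cpu_supports ("avx2"); |
|   printf ("EVEX strcmp candidate: %s\n", evex ? "yes" : "no"); |
|   printf ("AVX2 strcmp candidate: %s\n", avx2 ? "yes" : "no"); |
|   return 0; |
| } |
| |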
| diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S |
| new file mode 100644 |
| index 00000000..cd022509 |
| --- /dev/null |
| +++ b/sysdeps/x86_64/multiarch/strlen-evex.S |
| @@ -0,0 +1,436 @@ |
| +/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions. |
| + Copyright (C) 2021 Free Software Foundation, Inc. |
| + This file is part of the GNU C Library. |
| + |
| + The GNU C Library is free software; you can redistribute it and/or |
| + modify it under the terms of the GNU Lesser General Public |
| + License as published by the Free Software Foundation; either |
| + version 2.1 of the License, or (at your option) any later version. |
| + |
| + The GNU C Library is distributed in the hope that it will be useful, |
| + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| + Lesser General Public License for more details. |
| + |
| + You should have received a copy of the GNU Lesser General Public |
| + License along with the GNU C Library; if not, see |
| + <https://www.gnu.org/licenses/>. */ |
| + |
| +#if IS_IN (libc) |
| + |
| +# include <sysdep.h> |
| + |
| +# ifndef STRLEN |
| +# define STRLEN __strlen_evex |
| +# endif |
| + |
| +# define VMOVA vmovdqa64 |
| + |
| +# ifdef USE_AS_WCSLEN |
| +# define VPCMP vpcmpd |
| +# define VPMINU vpminud |
| +# define SHIFT_REG r9d |
| +# else |
| +# define VPCMP vpcmpb |
| +# define VPMINU vpminub |
| +# define SHIFT_REG ecx |
| +# endif |
| + |
| +# define XMMZERO xmm16 |
| +# define YMMZERO ymm16 |
| +# define YMM1 ymm17 |
| +# define YMM2 ymm18 |
| +# define YMM3 ymm19 |
| +# define YMM4 ymm20 |
| +# define YMM5 ymm21 |
| +# define YMM6 ymm22 |
| + |
| +# define VEC_SIZE 32 |
| + |
| + .section .text.evex,"ax",@progbits |
| +ENTRY (STRLEN) |
| +# ifdef USE_AS_STRNLEN |
| + /* Check for zero length. */ |
| + test %RSI_LP, %RSI_LP |
| + jz L(zero) |
| +# ifdef USE_AS_WCSLEN |
| + shl $2, %RSI_LP |
| +# elif defined __ILP32__ |
| + /* Clear the upper 32 bits. */ |
| + movl %esi, %esi |
| +# endif |
| + mov %RSI_LP, %R8_LP |
| +# endif |
| + movl %edi, %ecx |
| + movq %rdi, %rdx |
| + vpxorq %XMMZERO, %XMMZERO, %XMMZERO |
| + |
| +        /* Check if we may cross a page boundary with one vector load. */ |
| + andl $(2 * VEC_SIZE - 1), %ecx |
| + cmpl $VEC_SIZE, %ecx |
| + ja L(cros_page_boundary) |
| + |
| + /* Check the first VEC_SIZE bytes. Each bit in K0 represents a |
| + null byte. */ |
| + VPCMP $0, (%rdi), %YMMZERO, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + |
| +# ifdef USE_AS_STRNLEN |
| + jnz L(first_vec_x0_check) |
| + /* Adjust length and check the end of data. */ |
| + subq $VEC_SIZE, %rsi |
| + jbe L(max) |
| +# else |
| + jnz L(first_vec_x0) |
| +# endif |
| + |
| + /* Align data for aligned loads in the loop. */ |
| + addq $VEC_SIZE, %rdi |
| + andl $(VEC_SIZE - 1), %ecx |
| + andq $-VEC_SIZE, %rdi |
| + |
| +# ifdef USE_AS_STRNLEN |
| + /* Adjust length. */ |
| + addq %rcx, %rsi |
| + |
| + subq $(VEC_SIZE * 4), %rsi |
| + jbe L(last_4x_vec_or_less) |
| +# endif |
| + jmp L(more_4x_vec) |
| + |
| + .p2align 4 |
| +L(cros_page_boundary): |
| + andl $(VEC_SIZE - 1), %ecx |
| + andq $-VEC_SIZE, %rdi |
| + |
| +# ifdef USE_AS_WCSLEN |
| +        /* NB: Divide shift count by 4 since each bit in K0 represents |
| +           4 bytes. */ |
| + movl %ecx, %SHIFT_REG |
| + sarl $2, %SHIFT_REG |
| +# endif |
| + VPCMP $0, (%rdi), %YMMZERO, %k0 |
| + kmovd %k0, %eax |
| + |
| + /* Remove the leading bytes. */ |
| + sarxl %SHIFT_REG, %eax, %eax |
| + testl %eax, %eax |
| + jz L(aligned_more) |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WCSLEN |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %eax |
| +# endif |
| +# ifdef USE_AS_STRNLEN |
| + /* Check the end of data. */ |
| + cmpq %rax, %rsi |
| + jbe L(max) |
| +# endif |
| + addq %rdi, %rax |
| + addq %rcx, %rax |
| + subq %rdx, %rax |
| +# ifdef USE_AS_WCSLEN |
| + shrq $2, %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(aligned_more): |
| +# ifdef USE_AS_STRNLEN |
| +        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE" |
| +           with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" |
| +           to avoid possible addition overflow. */ |
| + negq %rcx |
| + addq $VEC_SIZE, %rcx |
| + |
| + /* Check the end of data. */ |
| + subq %rcx, %rsi |
| + jbe L(max) |
| +# endif |
| + |
| + addq $VEC_SIZE, %rdi |
| + |
| +# ifdef USE_AS_STRNLEN |
| + subq $(VEC_SIZE * 4), %rsi |
| + jbe L(last_4x_vec_or_less) |
| +# endif |
| + |
| +L(more_4x_vec): |
| +        /* Check the first 4 * VEC_SIZE bytes, one VEC_SIZE at a time, |
| +           since data is only aligned to VEC_SIZE. */ |
| + VPCMP $0, (%rdi), %YMMZERO, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x0) |
| + |
| + VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x1) |
| + |
| + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x2) |
| + |
| + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x3) |
| + |
| + addq $(VEC_SIZE * 4), %rdi |
| + |
| +# ifdef USE_AS_STRNLEN |
| + subq $(VEC_SIZE * 4), %rsi |
| + jbe L(last_4x_vec_or_less) |
| +# endif |
| + |
| + /* Align data to 4 * VEC_SIZE. */ |
| + movq %rdi, %rcx |
| + andl $(4 * VEC_SIZE - 1), %ecx |
| + andq $-(4 * VEC_SIZE), %rdi |
| + |
| +# ifdef USE_AS_STRNLEN |
| + /* Adjust length. */ |
| + addq %rcx, %rsi |
| +# endif |
| + |
| + .p2align 4 |
| +L(loop_4x_vec): |
| + /* Compare 4 * VEC at a time forward. */ |
| + VMOVA (%rdi), %YMM1 |
| + VMOVA VEC_SIZE(%rdi), %YMM2 |
| + VMOVA (VEC_SIZE * 2)(%rdi), %YMM3 |
| + VMOVA (VEC_SIZE * 3)(%rdi), %YMM4 |
| + |
| + VPMINU %YMM1, %YMM2, %YMM5 |
| + VPMINU %YMM3, %YMM4, %YMM6 |
| + |
| + VPMINU %YMM5, %YMM6, %YMM5 |
| + VPCMP $0, %YMM5, %YMMZERO, %k0 |
| + ktestd %k0, %k0 |
| + jnz L(4x_vec_end) |
| + |
| + addq $(VEC_SIZE * 4), %rdi |
| + |
| +# ifndef USE_AS_STRNLEN |
| + jmp L(loop_4x_vec) |
| +# else |
| + subq $(VEC_SIZE * 4), %rsi |
| + ja L(loop_4x_vec) |
| + |
| +L(last_4x_vec_or_less): |
| + /* Less than 4 * VEC and aligned to VEC_SIZE. */ |
| + addl $(VEC_SIZE * 2), %esi |
| + jle L(last_2x_vec) |
| + |
| + VPCMP $0, (%rdi), %YMMZERO, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x0) |
| + |
| + VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x1) |
| + |
| + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x2_check) |
| + subl $VEC_SIZE, %esi |
| + jle L(max) |
| + |
| + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x3_check) |
| + movq %r8, %rax |
| +# ifdef USE_AS_WCSLEN |
| + shrq $2, %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(last_2x_vec): |
| + addl $(VEC_SIZE * 2), %esi |
| + |
| + VPCMP $0, (%rdi), %YMMZERO, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x0_check) |
| + subl $VEC_SIZE, %esi |
| + jle L(max) |
| + |
| + VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x1_check) |
| + movq %r8, %rax |
| +# ifdef USE_AS_WCSLEN |
| + shrq $2, %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(first_vec_x0_check): |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WCSLEN |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %eax |
| +# endif |
| + /* Check the end of data. */ |
| + cmpq %rax, %rsi |
| + jbe L(max) |
| + addq %rdi, %rax |
| + subq %rdx, %rax |
| +# ifdef USE_AS_WCSLEN |
| + shrq $2, %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(first_vec_x1_check): |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WCSLEN |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %eax |
| +# endif |
| + /* Check the end of data. */ |
| + cmpq %rax, %rsi |
| + jbe L(max) |
| + addq $VEC_SIZE, %rax |
| + addq %rdi, %rax |
| + subq %rdx, %rax |
| +# ifdef USE_AS_WCSLEN |
| + shrq $2, %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(first_vec_x2_check): |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WCSLEN |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %eax |
| +# endif |
| + /* Check the end of data. */ |
| + cmpq %rax, %rsi |
| + jbe L(max) |
| + addq $(VEC_SIZE * 2), %rax |
| + addq %rdi, %rax |
| + subq %rdx, %rax |
| +# ifdef USE_AS_WCSLEN |
| + shrq $2, %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(first_vec_x3_check): |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WCSLEN |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %eax |
| +# endif |
| + /* Check the end of data. */ |
| + cmpq %rax, %rsi |
| + jbe L(max) |
| + addq $(VEC_SIZE * 3), %rax |
| + addq %rdi, %rax |
| + subq %rdx, %rax |
| +# ifdef USE_AS_WCSLEN |
| + shrq $2, %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(max): |
| + movq %r8, %rax |
| +# ifdef USE_AS_WCSLEN |
| + shrq $2, %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(zero): |
| + xorl %eax, %eax |
| + ret |
| +# endif |
| + |
| + .p2align 4 |
| +L(first_vec_x0): |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WCSLEN |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %eax |
| +# endif |
| + addq %rdi, %rax |
| + subq %rdx, %rax |
| +# ifdef USE_AS_WCSLEN |
| + shrq $2, %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(first_vec_x1): |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WCSLEN |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %eax |
| +# endif |
| + addq $VEC_SIZE, %rax |
| + addq %rdi, %rax |
| + subq %rdx, %rax |
| +# ifdef USE_AS_WCSLEN |
| + shrq $2, %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(first_vec_x2): |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WCSLEN |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %eax |
| +# endif |
| + addq $(VEC_SIZE * 2), %rax |
| + addq %rdi, %rax |
| + subq %rdx, %rax |
| +# ifdef USE_AS_WCSLEN |
| + shrq $2, %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(4x_vec_end): |
| + VPCMP $0, %YMM1, %YMMZERO, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x0) |
| + VPCMP $0, %YMM2, %YMMZERO, %k1 |
| + kmovd %k1, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x1) |
| + VPCMP $0, %YMM3, %YMMZERO, %k2 |
| + kmovd %k2, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x2) |
| + VPCMP $0, %YMM4, %YMMZERO, %k3 |
| + kmovd %k3, %eax |
| +L(first_vec_x3): |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WCSLEN |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + sall $2, %eax |
| +# endif |
| + addq $(VEC_SIZE * 3), %rax |
| + addq %rdi, %rax |
| + subq %rdx, %rax |
| +# ifdef USE_AS_WCSLEN |
| + shrq $2, %rax |
| +# endif |
| + ret |
| + |
| +END (STRLEN) |
| +#endif |
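| |
| The main loop of strlen-evex.S above folds four vectors with VPMINU and |
| compares the single result against zero, so one k-mask test covers the |
| whole 4 * VEC_SIZE block.  A minimal C model of that step, assuming |
| AVX2/AVX512VL/AVX512BW intrinsics and a hypothetical helper name: |
| |
| #include <immintrin.h> |
| |
| /* Sketch of L(loop_4x_vec): a zero byte survives the unsigned-byte |
|    minimum, so a single compare against zero detects a NUL anywhere in |
|    the 4 x 32-byte block (compile with -mavx512vl -mavx512bw).  */ |
| static inline int |
| block_has_nul (const unsigned char *p) |
| { |
|   __m256i v0 = _mm256_loadu_si256 ((const __m256i *) p); |
|   __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (p + 32)); |
|   __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (p + 64)); |
|   __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (p + 96)); |
|   __m256i min = _mm256_min_epu8 (_mm256_min_epu8 (v0, v1), |
|                                  _mm256_min_epu8 (v2, v3)); |
|   return _mm256_cmpeq_epi8_mask (min, _mm256_setzero_si256 ()) != 0; |
| } |
| |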
| diff --git a/sysdeps/x86_64/multiarch/strncmp-evex.S b/sysdeps/x86_64/multiarch/strncmp-evex.S |
| new file mode 100644 |
| index 00000000..a1d53e8c |
| --- /dev/null |
| +++ b/sysdeps/x86_64/multiarch/strncmp-evex.S |
| @@ -0,0 +1,3 @@ |
| +#define STRCMP __strncmp_evex |
| +#define USE_AS_STRNCMP 1 |
| +#include "strcmp-evex.S" |
| diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c |
| index 686d654f..4c15542f 100644 |
| --- a/sysdeps/x86_64/multiarch/strncmp.c |
| +++ b/sysdeps/x86_64/multiarch/strncmp.c |
| @@ -30,16 +30,25 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; |
| extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; |
| extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; |
| extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; |
| +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; |
| |
| static inline void * |
| IFUNC_SELECTOR (void) |
| { |
| const struct cpu_features* cpu_features = __get_cpu_features (); |
| |
| - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) |
| - && CPU_FEATURE_USABLE_P (cpu_features, AVX2) |
| + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) |
| && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) |
| - return OPTIMIZE (avx2); |
| + { |
| + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) |
| + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) |
| + && CPU_FEATURE_USABLE_P (cpu_features, BMI2) |
| + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) |
| + return OPTIMIZE (evex); |
| + |
| + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) |
| + return OPTIMIZE (avx2); |
| + } |
| |
| if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2) |
| && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) |
| diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S |
| new file mode 100644 |
| index 00000000..722022f3 |
| --- /dev/null |
| +++ b/sysdeps/x86_64/multiarch/strnlen-evex.S |
| @@ -0,0 +1,4 @@ |
| +#define STRLEN __strnlen_evex |
| +#define USE_AS_STRNLEN 1 |
| + |
| +#include "strlen-evex.S" |
| diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S |
| new file mode 100644 |
| index 00000000..f920b5a5 |
| --- /dev/null |
| +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S |
| @@ -0,0 +1,265 @@ |
| +/* strrchr/wcsrchr optimized with 256-bit EVEX instructions. |
| + Copyright (C) 2021 Free Software Foundation, Inc. |
| + This file is part of the GNU C Library. |
| + |
| + The GNU C Library is free software; you can redistribute it and/or |
| + modify it under the terms of the GNU Lesser General Public |
| + License as published by the Free Software Foundation; either |
| + version 2.1 of the License, or (at your option) any later version. |
| + |
| + The GNU C Library is distributed in the hope that it will be useful, |
| + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| + Lesser General Public License for more details. |
| + |
| + You should have received a copy of the GNU Lesser General Public |
| + License along with the GNU C Library; if not, see |
| + <https://www.gnu.org/licenses/>. */ |
| + |
| +#if IS_IN (libc) |
| + |
| +# include <sysdep.h> |
| + |
| +# ifndef STRRCHR |
| +# define STRRCHR __strrchr_evex |
| +# endif |
| + |
| +# define VMOVU vmovdqu64 |
| +# define VMOVA vmovdqa64 |
| + |
| +# ifdef USE_AS_WCSRCHR |
| +# define VPBROADCAST vpbroadcastd |
| +# define VPCMP vpcmpd |
| +# define SHIFT_REG r8d |
| +# else |
| +# define VPBROADCAST vpbroadcastb |
| +# define VPCMP vpcmpb |
| +# define SHIFT_REG ecx |
| +# endif |
| + |
| +# define XMMZERO xmm16 |
| +# define YMMZERO ymm16 |
| +# define YMMMATCH ymm17 |
| +# define YMM1 ymm18 |
| + |
| +# define VEC_SIZE 32 |
| + |
| + .section .text.evex,"ax",@progbits |
| +ENTRY (STRRCHR) |
| + movl %edi, %ecx |
| + /* Broadcast CHAR to YMMMATCH. */ |
| + VPBROADCAST %esi, %YMMMATCH |
| + |
| + vpxorq %XMMZERO, %XMMZERO, %XMMZERO |
| + |
| +        /* Check if we may cross a page boundary with one vector load. */ |
| + andl $(2 * VEC_SIZE - 1), %ecx |
| + cmpl $VEC_SIZE, %ecx |
| + ja L(cros_page_boundary) |
| + |
| + VMOVU (%rdi), %YMM1 |
| + |
| + /* Each bit in K0 represents a null byte in YMM1. */ |
| + VPCMP $0, %YMMZERO, %YMM1, %k0 |
| + /* Each bit in K1 represents a CHAR in YMM1. */ |
| + VPCMP $0, %YMMMATCH, %YMM1, %k1 |
| + kmovd %k0, %ecx |
| + kmovd %k1, %eax |
| + |
| + addq $VEC_SIZE, %rdi |
| + |
| + testl %eax, %eax |
| + jnz L(first_vec) |
| + |
| + testl %ecx, %ecx |
| + jnz L(return_null) |
| + |
| + andq $-VEC_SIZE, %rdi |
| + xorl %edx, %edx |
| + jmp L(aligned_loop) |
| + |
| + .p2align 4 |
| +L(first_vec): |
| + /* Check if there is a null byte. */ |
| + testl %ecx, %ecx |
| + jnz L(char_and_nul_in_first_vec) |
| + |
| + /* Remember the match and keep searching. */ |
| + movl %eax, %edx |
| + movq %rdi, %rsi |
| + andq $-VEC_SIZE, %rdi |
| + jmp L(aligned_loop) |
| + |
| + .p2align 4 |
| +L(cros_page_boundary): |
| + andl $(VEC_SIZE - 1), %ecx |
| + andq $-VEC_SIZE, %rdi |
| + |
| +# ifdef USE_AS_WCSRCHR |
| +        /* NB: Divide shift count by 4 since each bit in K1 represents |
| +           4 bytes. */ |
| + movl %ecx, %SHIFT_REG |
| + sarl $2, %SHIFT_REG |
| +# endif |
| + |
| + VMOVA (%rdi), %YMM1 |
| + |
| + /* Each bit in K0 represents a null byte in YMM1. */ |
| + VPCMP $0, %YMMZERO, %YMM1, %k0 |
| + /* Each bit in K1 represents a CHAR in YMM1. */ |
| + VPCMP $0, %YMMMATCH, %YMM1, %k1 |
| + kmovd %k0, %edx |
| + kmovd %k1, %eax |
| + |
| + shrxl %SHIFT_REG, %edx, %edx |
| + shrxl %SHIFT_REG, %eax, %eax |
| + addq $VEC_SIZE, %rdi |
| + |
| + /* Check if there is a CHAR. */ |
| + testl %eax, %eax |
| + jnz L(found_char) |
| + |
| + testl %edx, %edx |
| + jnz L(return_null) |
| + |
| + jmp L(aligned_loop) |
| + |
| + .p2align 4 |
| +L(found_char): |
| + testl %edx, %edx |
| + jnz L(char_and_nul) |
| + |
| + /* Remember the match and keep searching. */ |
| + movl %eax, %edx |
| + leaq (%rdi, %rcx), %rsi |
| + |
| + .p2align 4 |
| +L(aligned_loop): |
| + VMOVA (%rdi), %YMM1 |
| + addq $VEC_SIZE, %rdi |
| + |
| + /* Each bit in K0 represents a null byte in YMM1. */ |
| + VPCMP $0, %YMMZERO, %YMM1, %k0 |
| + /* Each bit in K1 represents a CHAR in YMM1. */ |
| + VPCMP $0, %YMMMATCH, %YMM1, %k1 |
| + kmovd %k0, %ecx |
| + kmovd %k1, %eax |
| + orl %eax, %ecx |
| + jnz L(char_nor_null) |
| + |
| + VMOVA (%rdi), %YMM1 |
| + add $VEC_SIZE, %rdi |
| + |
| + /* Each bit in K0 represents a null byte in YMM1. */ |
| + VPCMP $0, %YMMZERO, %YMM1, %k0 |
| + /* Each bit in K1 represents a CHAR in YMM1. */ |
| + VPCMP $0, %YMMMATCH, %YMM1, %k1 |
| + kmovd %k0, %ecx |
| + kmovd %k1, %eax |
| + orl %eax, %ecx |
| + jnz L(char_nor_null) |
| + |
| + VMOVA (%rdi), %YMM1 |
| + addq $VEC_SIZE, %rdi |
| + |
| + /* Each bit in K0 represents a null byte in YMM1. */ |
| + VPCMP $0, %YMMZERO, %YMM1, %k0 |
| + /* Each bit in K1 represents a CHAR in YMM1. */ |
| + VPCMP $0, %YMMMATCH, %YMM1, %k1 |
| + kmovd %k0, %ecx |
| + kmovd %k1, %eax |
| + orl %eax, %ecx |
| + jnz L(char_nor_null) |
| + |
| + VMOVA (%rdi), %YMM1 |
| + addq $VEC_SIZE, %rdi |
| + |
| + /* Each bit in K0 represents a null byte in YMM1. */ |
| + VPCMP $0, %YMMZERO, %YMM1, %k0 |
| + /* Each bit in K1 represents a CHAR in YMM1. */ |
| + VPCMP $0, %YMMMATCH, %YMM1, %k1 |
| + kmovd %k0, %ecx |
| + kmovd %k1, %eax |
| + orl %eax, %ecx |
| + jz L(aligned_loop) |
| + |
| + .p2align 4 |
| +L(char_nor_null): |
| + /* Find a CHAR or a null byte in a loop. */ |
| + testl %eax, %eax |
| + jnz L(match) |
| +L(return_value): |
| + testl %edx, %edx |
| + jz L(return_null) |
| + movl %edx, %eax |
| + movq %rsi, %rdi |
| + bsrl %eax, %eax |
| +# ifdef USE_AS_WCSRCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + leaq -VEC_SIZE(%rdi, %rax, 4), %rax |
| +# else |
| + leaq -VEC_SIZE(%rdi, %rax), %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(match): |
| + /* Find a CHAR. Check if there is a null byte. */ |
| + kmovd %k0, %ecx |
| + testl %ecx, %ecx |
| + jnz L(find_nul) |
| + |
| + /* Remember the match and keep searching. */ |
| + movl %eax, %edx |
| + movq %rdi, %rsi |
| + jmp L(aligned_loop) |
| + |
| + .p2align 4 |
| +L(find_nul): |
| + /* Mask out any matching bits after the null byte. */ |
| + movl %ecx, %r8d |
| + subl $1, %r8d |
| + xorl %ecx, %r8d |
| + andl %r8d, %eax |
| + testl %eax, %eax |
| + /* If there is no CHAR here, return the remembered one. */ |
| + jz L(return_value) |
| + bsrl %eax, %eax |
| +# ifdef USE_AS_WCSRCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + leaq -VEC_SIZE(%rdi, %rax, 4), %rax |
| +# else |
| + leaq -VEC_SIZE(%rdi, %rax), %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(char_and_nul): |
| + /* Find both a CHAR and a null byte. */ |
| + addq %rcx, %rdi |
| + movl %edx, %ecx |
| +L(char_and_nul_in_first_vec): |
| + /* Mask out any matching bits after the null byte. */ |
| + movl %ecx, %r8d |
| + subl $1, %r8d |
| + xorl %ecx, %r8d |
| + andl %r8d, %eax |
| + testl %eax, %eax |
| + /* Return null pointer if the null byte comes first. */ |
| + jz L(return_null) |
| + bsrl %eax, %eax |
| +# ifdef USE_AS_WCSRCHR |
| + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| + leaq -VEC_SIZE(%rdi, %rax, 4), %rax |
| +# else |
| + leaq -VEC_SIZE(%rdi, %rax), %rax |
| +# endif |
| + ret |
| + |
| + .p2align 4 |
| +L(return_null): |
| + xorl %eax, %eax |
| + ret |
| + |
| +END (STRRCHR) |
| +#endif |
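| |
| strrchr-evex.S above remembers the last CHAR match and, once a NUL shows |
| up, masks out any match bits after it before taking the highest surviving |
| bit.  The bit trick around L(find_nul) can be modelled on plain 32-bit |
| masks; the helper below is an illustrative sketch, not glibc code: |
| |
| #include <stdint.h> |
| |
| /* (x - 1) ^ x sets every bit up to and including the lowest set bit of |
|    x, i.e. up to the first NUL position.  Returns the byte index of the |
|    last CHAR at or before the first NUL, or -1 if there is none.  */ |
| static inline int |
| last_match_before_nul (uint32_t match_mask, uint32_t nul_mask) |
| { |
|   if (nul_mask != 0) |
|     match_mask &= (nul_mask - 1) ^ nul_mask; |
|   if (match_mask == 0) |
|     return -1; |
|   return 31 - __builtin_clz (match_mask);   /* like BSR */ |
| } |
| |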
| diff --git a/sysdeps/x86_64/multiarch/wcschr-evex.S b/sysdeps/x86_64/multiarch/wcschr-evex.S |
| new file mode 100644 |
| index 00000000..7cb8f1e4 |
| --- /dev/null |
| +++ b/sysdeps/x86_64/multiarch/wcschr-evex.S |
| @@ -0,0 +1,3 @@ |
| +#define STRCHR __wcschr_evex |
| +#define USE_AS_WCSCHR 1 |
| +#include "strchr-evex.S" |
| diff --git a/sysdeps/x86_64/multiarch/wcscmp-evex.S b/sysdeps/x86_64/multiarch/wcscmp-evex.S |
| new file mode 100644 |
| index 00000000..42e73e51 |
| --- /dev/null |
| +++ b/sysdeps/x86_64/multiarch/wcscmp-evex.S |
| @@ -0,0 +1,4 @@ |
| +#define STRCMP __wcscmp_evex |
| +#define USE_AS_WCSCMP 1 |
| + |
| +#include "strcmp-evex.S" |
| diff --git a/sysdeps/x86_64/multiarch/wcslen-evex.S b/sysdeps/x86_64/multiarch/wcslen-evex.S |
| new file mode 100644 |
| index 00000000..bdafa83b |
| --- /dev/null |
| +++ b/sysdeps/x86_64/multiarch/wcslen-evex.S |
| @@ -0,0 +1,4 @@ |
| +#define STRLEN __wcslen_evex |
| +#define USE_AS_WCSLEN 1 |
| + |
| +#include "strlen-evex.S" |
| diff --git a/sysdeps/x86_64/multiarch/wcsncmp-evex.S b/sysdeps/x86_64/multiarch/wcsncmp-evex.S |
| new file mode 100644 |
| index 00000000..8a8e3107 |
| --- /dev/null |
| +++ b/sysdeps/x86_64/multiarch/wcsncmp-evex.S |
| @@ -0,0 +1,5 @@ |
| +#define STRCMP __wcsncmp_evex |
| +#define USE_AS_STRNCMP 1 |
| +#define USE_AS_WCSCMP 1 |
| + |
| +#include "strcmp-evex.S" |
| diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S |
| new file mode 100644 |
| index 00000000..24773bb4 |
| --- /dev/null |
| +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S |
| @@ -0,0 +1,5 @@ |
| +#define STRLEN __wcsnlen_evex |
| +#define USE_AS_WCSLEN 1 |
| +#define USE_AS_STRNLEN 1 |
| + |
| +#include "strlen-evex.S" |
| diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c |
| index b3144c93..84254b83 100644 |
| --- a/sysdeps/x86_64/multiarch/wcsnlen.c |
| +++ b/sysdeps/x86_64/multiarch/wcsnlen.c |
| @@ -29,16 +29,24 @@ |
| extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; |
| extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; |
| extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; |
| +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; |
| |
| static inline void * |
| IFUNC_SELECTOR (void) |
| { |
| const struct cpu_features* cpu_features = __get_cpu_features (); |
| |
| - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) |
| - && CPU_FEATURE_USABLE_P (cpu_features, AVX2) |
| + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) |
| && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) |
| - return OPTIMIZE (avx2); |
| + { |
| + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) |
| + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) |
| + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) |
| + return OPTIMIZE (evex); |
| + |
| + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) |
| + return OPTIMIZE (avx2); |
| + } |
| |
| if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) |
| return OPTIMIZE (sse4_1); |
| diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S |
| new file mode 100644 |
| index 00000000..c64602f7 |
| --- /dev/null |
| +++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S |
| @@ -0,0 +1,3 @@ |
| +#define STRRCHR __wcsrchr_evex |
| +#define USE_AS_WCSRCHR 1 |
| +#include "strrchr-evex.S" |
| diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S |
| new file mode 100644 |
| index 00000000..06cd0f9f |
| --- /dev/null |
| +++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S |
| @@ -0,0 +1,4 @@ |
| +#define MEMCHR __wmemchr_evex |
| +#define USE_AS_WMEMCHR 1 |
| + |
| +#include "memchr-evex.S" |
| -- |
| GitLab |
| |