diff --git a/SOURCES/glibc-sw28895.patch b/SOURCES/glibc-sw28895.patch new file mode 100644 index 0000000..617ed93 --- /dev/null +++ b/SOURCES/glibc-sw28895.patch @@ -0,0 +1,91 @@ +From 7aeaaad7fb98f1c68b77bd9dee0ad0e6e92cd0ff Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 15 Feb 2022 20:27:21 -0600 +Subject: [PATCH] x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895] + +Logic can read before the start of `s1` / `s2` if both `s1` and `s2` +are near the start of a page. To avoid having the result contimated by +these comparisons the `strcmp` variants would mask off these +comparisons. This was missing in the `strncmp` variants causing +the bug. This commit adds the masking to `strncmp` so that out of +range comparisons don't affect the result. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass as +well a full xcheck on x86_64 linux. +Reviewed-by: H.J. Lu + +(cherry picked from commit e108c02a5e23c8c88ce66d8705d4a24bb6b9a8bf) +--- + string/test-strncmp.c | 23 +++++++++++++++++++++++ + sysdeps/x86_64/multiarch/strcmp-avx2.S | 1 + + sysdeps/x86_64/multiarch/strcmp-evex.S | 1 + + 3 files changed, 25 insertions(+) + +diff --git a/string/test-strncmp.c b/string/test-strncmp.c +index 927a6daa..e61fffd9 100644 +--- a/string/test-strncmp.c ++++ b/string/test-strncmp.c +@@ -403,6 +403,28 @@ check2 (void) + free (s2); + } + ++static void ++check4 (void) ++{ ++ /* To trigger bug 28895; We need 1) both s1 and s2 to be within 32 bytes of ++ the end of the page. 2) For there to be no mismatch/null byte before the ++ first page cross. 3) For length (`n`) to be large enough for one string to ++ cross the page. And 4) for there to be either mismatch/null bytes before ++ the start of the strings. */ ++ ++ size_t size = 10; ++ size_t addr_mask = (getpagesize () - 1) ^ (sizeof (CHAR) - 1); ++ CHAR *s1 = (CHAR *)(buf1 + (addr_mask & 0xffa)); ++ CHAR *s2 = (CHAR *)(buf2 + (addr_mask & 0xfed)); ++ int exp_result; ++ ++ STRCPY (s1, L ("tst-tlsmod%")); ++ STRCPY (s2, L ("tst-tls-manydynamic73mod")); ++ exp_result = SIMPLE_STRNCMP (s1, s2, size); ++ FOR_EACH_IMPL (impl, 0) ++ check_result (impl, s1, s2, size, exp_result); ++} ++ + static void + check3 (void) + { +@@ -445,6 +467,7 @@ test_main (void) + check1 (); + check2 (); + check3 (); ++ check4 (); + + printf ("%23s", ""); + FOR_EACH_IMPL (impl, 0) +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 04675aa4..179cc0e3 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -661,6 +661,7 @@ L(ret8): + # ifdef USE_AS_STRNCMP + .p2align 4,, 10 + L(return_page_cross_end_check): ++ andl %r10d, %ecx + tzcntl %ecx, %ecx + leal -VEC_SIZE(%rax, %rcx), %ecx + cmpl %ecx, %edx +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index ed56af8e..0dfa62bd 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -689,6 +689,7 @@ L(ret8): + # ifdef USE_AS_STRNCMP + .p2align 4,, 10 + L(return_page_cross_end_check): ++ andl %r10d, %ecx + tzcntl %ecx, %ecx + leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx + # ifdef USE_AS_WCSCMP +-- +GitLab + diff --git a/SOURCES/glibc-sw28896-2.patch b/SOURCES/glibc-sw28896-2.patch new file mode 100644 index 0000000..eb64cc6 --- /dev/null +++ b/SOURCES/glibc-sw28896-2.patch @@ -0,0 +1,147 @@ +From 50b1abfc917024905d84d261ba94682460193220 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 18 Feb 2022 14:19:15 -0600 +Subject: 
[PATCH] x86: Test wcscmp RTM in the wcsncmp overflow case [BZ #28896] + +In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would +call strcmp-avx2 and wcscmp-avx2 respectively. This would have +not checks around vzeroupper and would trigger spurious +aborts. This commit fixes that. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on +AVX2 machines with and without RTM. +Reviewed-by: H.J. Lu + +(cherry picked from commit 7835d611af0854e69a0c71e3806f8fe379282d6f) +--- + sysdeps/x86/Makefile | 5 ++++- + sysdeps/x86/tst-strncmp-rtm.c | 32 +++++++++++++++++++++++--------- + sysdeps/x86/tst-wcsncmp-rtm.c | 21 +++++++++++++++++++++ + 3 files changed, 48 insertions(+), 10 deletions(-) + create mode 100644 sysdeps/x86/tst-wcsncmp-rtm.c + +diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile +index 2d814915..c2111f49 100644 +--- a/sysdeps/x86/Makefile ++++ b/sysdeps/x86/Makefile +@@ -28,7 +28,9 @@ tests += \ + tst-strcpy-rtm \ + tst-strlen-rtm \ + tst-strncmp-rtm \ +- tst-strrchr-rtm ++ tst-strrchr-rtm \ ++ tst-wcsncmp-rtm \ ++# tests + + CFLAGS-tst-memchr-rtm.c += -mrtm + CFLAGS-tst-memcmp-rtm.c += -mrtm +@@ -40,6 +42,7 @@ CFLAGS-tst-strcpy-rtm.c += -mrtm + CFLAGS-tst-strlen-rtm.c += -mrtm + CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error + CFLAGS-tst-strrchr-rtm.c += -mrtm ++CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error + endif + + ifneq ($(enable-cet),no) +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index 4d0004b5..4e9f094f 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -19,18 +19,32 @@ + #include + #include + ++#ifdef WIDE ++# define CHAR wchar_t ++# define MEMSET wmemset ++# define STRNCMP wcsncmp ++# define TEST_NAME wcsncmp ++#else /* !WIDE */ ++# define CHAR char ++# define MEMSET memset ++# define STRNCMP strncmp ++# define TEST_NAME strncmp ++#endif /* !WIDE */ ++ ++ ++ + #define LOOP 3000 + #define STRING_SIZE 1024 +-char string1[STRING_SIZE]; +-char string2[STRING_SIZE]; ++CHAR string1[STRING_SIZE]; ++CHAR string2[STRING_SIZE]; + + __attribute__ ((noinline, noclone)) + static int + prepare (void) + { +- memset (string1, 'a', STRING_SIZE - 1); +- memset (string2, 'a', STRING_SIZE - 1); +- if (strncmp (string1, string2, STRING_SIZE) == 0) ++ MEMSET (string1, 'a', STRING_SIZE - 1); ++ MEMSET (string2, 'a', STRING_SIZE - 1); ++ if (STRNCMP (string1, string2, STRING_SIZE) == 0) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +@@ -40,7 +54,7 @@ __attribute__ ((noinline, noclone)) + static int + function (void) + { +- if (strncmp (string1, string2, STRING_SIZE) == 0) ++ if (STRNCMP (string1, string2, STRING_SIZE) == 0) + return 0; + else + return 1; +@@ -50,7 +64,7 @@ __attribute__ ((noinline, noclone)) + static int + function_overflow (void) + { +- if (strncmp (string1, string2, SIZE_MAX) == 0) ++ if (STRNCMP (string1, string2, SIZE_MAX) == 0) + return 0; + else + return 1; +@@ -59,9 +73,9 @@ function_overflow (void) + static int + do_test (void) + { +- int status = do_test_1 ("strncmp", LOOP, prepare, function); ++ int status = do_test_1 (TEST_NAME, LOOP, prepare, function); + if (status != EXIT_SUCCESS) + return status; +- status = do_test_1 ("strncmp", LOOP, prepare, function_overflow); ++ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow); + return status; + } +diff --git a/sysdeps/x86/tst-wcsncmp-rtm.c b/sysdeps/x86/tst-wcsncmp-rtm.c +new file mode 100644 +index 00000000..bad3b863 +--- /dev/null ++++ b/sysdeps/x86/tst-wcsncmp-rtm.c +@@ -0,0 +1,21 @@ ++/* Test case for 
wcsncmp inside a transactionally executing RTM region. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include ++#include "tst-strncmp-rtm.c" +-- +GitLab + diff --git a/SOURCES/glibc-sw28896-3.patch b/SOURCES/glibc-sw28896-3.patch new file mode 100644 index 0000000..6b47e24 --- /dev/null +++ b/SOURCES/glibc-sw28896-3.patch @@ -0,0 +1,78 @@ +From fd7108a4c9dc46113f4041e7575c1f1217c0d57d Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 24 Mar 2022 15:50:33 -0500 +Subject: [PATCH] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ + #28896] + +Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not +__wcscmp_avx2. + +commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87 +Author: Noah Goldstein +Date: Sun Jan 9 16:02:21 2022 -0600 + + x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755] + +Set the wrong fallback function for `__wcsncmp_avx2_rtm`. It was set +to fallback on to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm` which +can cause spurious aborts. + +This change will need to be backported. + +All string/memory tests pass. +Reviewed-by: H.J. Lu + +(cherry picked from commit 9fef7039a7d04947bc89296ee0d187bc8d89b772) +--- + sysdeps/x86/tst-strncmp-rtm.c | 15 +++++++++++++++ + sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +- + 2 files changed, 16 insertions(+), 1 deletion(-) + +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index aef9866c..ba6543be 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -70,6 +70,16 @@ function_overflow (void) + return 1; + } + ++__attribute__ ((noinline, noclone)) ++static int ++function_overflow2 (void) ++{ ++ if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0) ++ return 0; ++ else ++ return 1; ++} ++ + static int + do_test (void) + { +@@ -77,5 +87,10 @@ do_test (void) + if (status != EXIT_SUCCESS) + return status; + status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow); ++ if (status != EXIT_SUCCESS) ++ return status; ++ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2); ++ if (status != EXIT_SUCCESS) ++ return status; + return status; + } +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 179cc0e3..782f9472 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -122,7 +122,7 @@ ENTRY(STRCMP) + are cases where length is large enough that it can never be a + bound on valid memory so just use wcscmp. 
*/ + shrq $56, %rcx +- jnz __wcscmp_avx2 ++ jnz OVERFLOW_STRCMP + + leaq (, %rdx, 4), %rdx + # endif +-- +GitLab + diff --git a/SOURCES/glibc-sw29127.patch b/SOURCES/glibc-sw29127.patch new file mode 100644 index 0000000..f1a7a76 --- /dev/null +++ b/SOURCES/glibc-sw29127.patch @@ -0,0 +1,57 @@ +From 0ad216b458445251e6f98d74382faf3606569731 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 15 Feb 2022 08:18:15 -0600 +Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ + #29127] + +Re-cherry-pick commit c627209832 for strcmp-avx2.S change which was +omitted in intial cherry pick because at the time this bug was not +present on release branch. + +Fixes BZ #29127. + +In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would +call strcmp-avx2 and wcscmp-avx2 respectively. This would have +not checks around vzeroupper and would trigger spurious +aborts. This commit fixes that. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on +AVX2 machines with and without RTM. + +Co-authored-by: H.J. Lu +(cherry picked from commit c6272098323153db373f2986c67786ea8c85f1cf) +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 8 ++------ + 1 file changed, 2 insertions(+), 6 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 28cc98b6..e267c6cb 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -345,10 +345,10 @@ L(one_or_less): + movq %LOCALE_REG, %rdx + # endif + jb L(ret_zero) +-# ifdef USE_AS_WCSCMP + /* 'nbe' covers the case where length is negative (large + unsigned). */ +- jnbe __wcscmp_avx2 ++ jnbe OVERFLOW_STRCMP ++# ifdef USE_AS_WCSCMP + movl (%rdi), %edx + xorl %eax, %eax + cmpl (%rsi), %edx +@@ -357,10 +357,6 @@ L(one_or_less): + negl %eax + orl $1, %eax + # else +- /* 'nbe' covers the case where length is negative (large +- unsigned). */ +- +- jnbe __strcmp_avx2 + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + TOLOWER_gpr (%rax, %eax) +-- +GitLab + diff --git a/SOURCES/ia-add-backoff-spinlock.patch b/SOURCES/ia-add-backoff-spinlock.patch new file mode 100644 index 0000000..27bf481 --- /dev/null +++ b/SOURCES/ia-add-backoff-spinlock.patch @@ -0,0 +1,219 @@ +From d4b1ecdf48cfe0e711ec201533811b7d823d1b7d Mon Sep 17 00:00:00 2001 +From: Wangyang Guo +Date: Fri, 6 May 2022 01:50:10 +0000 +Subject: [PATCH] nptl: Add backoff mechanism to spinlock loop + +When mutiple threads waiting for lock at the same time, once lock owner +releases the lock, waiters will see lock available and all try to lock, +which may cause an expensive CAS storm. + +Binary exponential backoff with random jitter is introduced. As try-lock +attempt increases, there is more likely that a larger number threads +compete for adaptive mutex lock, so increase wait time in exponential. +A random jitter is also added to avoid synchronous try-lock from other +threads. + +v2: Remove read-check before try-lock for performance. + +v3: +1. Restore read-check since it works well in some platform. +2. Make backoff arch dependent, and enable it for x86_64. +3. Limit max backoff to reduce latency in large critical section. + +v4: Fix strict-prototypes error in sysdeps/nptl/pthread_mutex_backoff.h + +v5: Commit log updated for regression in large critical section. 
+ +Result of pthread-mutex-locks bench + +Test Platform: Xeon 8280L (2 socket, 112 CPUs in total) +First Row: thread number +First Col: critical section length +Values: backoff vs upstream, time based, low is better + +non-critical-length: 1 + 1 2 4 8 16 32 64 112 140 +0 0.99 0.58 0.52 0.49 0.43 0.44 0.46 0.52 0.54 +1 0.98 0.43 0.56 0.50 0.44 0.45 0.50 0.56 0.57 +2 0.99 0.41 0.57 0.51 0.45 0.47 0.48 0.60 0.61 +4 0.99 0.45 0.59 0.53 0.48 0.49 0.52 0.64 0.65 +8 1.00 0.66 0.71 0.63 0.56 0.59 0.66 0.72 0.71 +16 0.97 0.78 0.91 0.73 0.67 0.70 0.79 0.80 0.80 +32 0.95 1.17 0.98 0.87 0.82 0.86 0.89 0.90 0.90 +64 0.96 0.95 1.01 1.01 0.98 1.00 1.03 0.99 0.99 +128 0.99 1.01 1.01 1.17 1.08 1.12 1.02 0.97 1.02 + +non-critical-length: 32 + 1 2 4 8 16 32 64 112 140 +0 1.03 0.97 0.75 0.65 0.58 0.58 0.56 0.70 0.70 +1 0.94 0.95 0.76 0.65 0.58 0.58 0.61 0.71 0.72 +2 0.97 0.96 0.77 0.66 0.58 0.59 0.62 0.74 0.74 +4 0.99 0.96 0.78 0.66 0.60 0.61 0.66 0.76 0.77 +8 0.99 0.99 0.84 0.70 0.64 0.66 0.71 0.80 0.80 +16 0.98 0.97 0.95 0.76 0.70 0.73 0.81 0.85 0.84 +32 1.04 1.12 1.04 0.89 0.82 0.86 0.93 0.91 0.91 +64 0.99 1.15 1.07 1.00 0.99 1.01 1.05 0.99 0.99 +128 1.00 1.21 1.20 1.22 1.25 1.31 1.12 1.10 0.99 + +non-critical-length: 128 + 1 2 4 8 16 32 64 112 140 +0 1.02 1.00 0.99 0.67 0.61 0.61 0.61 0.74 0.73 +1 0.95 0.99 1.00 0.68 0.61 0.60 0.60 0.74 0.74 +2 1.00 1.04 1.00 0.68 0.59 0.61 0.65 0.76 0.76 +4 1.00 0.96 0.98 0.70 0.63 0.63 0.67 0.78 0.77 +8 1.01 1.02 0.89 0.73 0.65 0.67 0.71 0.81 0.80 +16 0.99 0.96 0.96 0.79 0.71 0.73 0.80 0.84 0.84 +32 0.99 0.95 1.05 0.89 0.84 0.85 0.94 0.92 0.91 +64 1.00 0.99 1.16 1.04 1.00 1.02 1.06 0.99 0.99 +128 1.00 1.06 0.98 1.14 1.39 1.26 1.08 1.02 0.98 + +There is regression in large critical section. But adaptive mutex is +aimed for "quick" locks. Small critical section is more common when +users choose to use adaptive pthread_mutex. + +Signed-off-by: Wangyang Guo +Reviewed-by: H.J. Lu +(cherry picked from commit 8162147872491bb5b48e91543b19c49a29ae6b6d) +--- + nptl/pthreadP.h | 1 + + nptl/pthread_mutex_lock.c | 16 +++++++-- + sysdeps/nptl/pthread_mutex_backoff.h | 35 ++++++++++++++++++ + sysdeps/x86_64/nptl/pthread_mutex_backoff.h | 39 +++++++++++++++++++++ + 4 files changed, 89 insertions(+), 2 deletions(-) + create mode 100644 sysdeps/nptl/pthread_mutex_backoff.h + create mode 100644 sysdeps/x86_64/nptl/pthread_mutex_backoff.h + +diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h +index 7ddc166c..1550e3b6 100644 +--- a/nptl/pthreadP.h ++++ b/nptl/pthreadP.h +@@ -33,6 +33,7 @@ + #include + #include + #include ++#include + + + /* Atomic operations on TLS memory. */ +diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c +index d96a9933..c7770fc9 100644 +--- a/nptl/pthread_mutex_lock.c ++++ b/nptl/pthread_mutex_lock.c +@@ -133,14 +133,26 @@ __pthread_mutex_lock (pthread_mutex_t *mutex) + int cnt = 0; + int max_cnt = MIN (MAX_ADAPTIVE_COUNT, + mutex->__data.__spins * 2 + 10); ++ int spin_count, exp_backoff = 1; ++ unsigned int jitter = get_jitter (); + do + { +- if (cnt++ >= max_cnt) ++ /* In each loop, spin count is exponential backoff plus ++ random jitter, random range is [0, exp_backoff-1]. */ ++ spin_count = exp_backoff + (jitter & (exp_backoff - 1)); ++ cnt += spin_count; ++ if (cnt >= max_cnt) + { ++ /* If cnt exceeds max spin count, just go to wait ++ queue. */ + LLL_MUTEX_LOCK (mutex); + break; + } +- atomic_spin_nop (); ++ do ++ atomic_spin_nop (); ++ while (--spin_count > 0); ++ /* Prepare for next loop. 
*/ ++ exp_backoff = get_next_backoff (exp_backoff); + } + while (LLL_MUTEX_READ_LOCK (mutex) != 0 + || LLL_MUTEX_TRYLOCK (mutex) != 0); +diff --git a/sysdeps/nptl/pthread_mutex_backoff.h b/sysdeps/nptl/pthread_mutex_backoff.h +new file mode 100644 +index 00000000..5b26c22a +--- /dev/null ++++ b/sysdeps/nptl/pthread_mutex_backoff.h +@@ -0,0 +1,35 @@ ++/* Pthread mutex backoff configuration. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++#ifndef _PTHREAD_MUTEX_BACKOFF_H ++#define _PTHREAD_MUTEX_BACKOFF_H 1 ++ ++static inline unsigned int ++get_jitter (void) ++{ ++ /* Arch dependent random jitter, return 0 disables random. */ ++ return 0; ++} ++ ++static inline int ++get_next_backoff (int backoff) ++{ ++ /* Next backoff, return 1 disables mutex backoff. */ ++ return 1; ++} ++ ++#endif +diff --git a/sysdeps/x86_64/nptl/pthread_mutex_backoff.h b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h +new file mode 100644 +index 00000000..ec74c3d9 +--- /dev/null ++++ b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h +@@ -0,0 +1,39 @@ ++/* Pthread mutex backoff configuration. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++#ifndef _PTHREAD_MUTEX_BACKOFF_H ++#define _PTHREAD_MUTEX_BACKOFF_H 1 ++ ++#include ++ ++static inline unsigned int ++get_jitter (void) ++{ ++ return get_fast_jitter (); ++} ++ ++#define MAX_BACKOFF 16 ++ ++static inline int ++get_next_backoff (int backoff) ++{ ++ /* Binary expontial backoff. Limiting max backoff ++ can reduce latency in large critical section. */ ++ return (backoff < MAX_BACKOFF) ? backoff << 1 : backoff; ++} ++ ++#endif +-- +GitLab + diff --git a/SOURCES/ia-add-fast-jitter.patch b/SOURCES/ia-add-fast-jitter.patch new file mode 100644 index 0000000..51b07d7 --- /dev/null +++ b/SOURCES/ia-add-fast-jitter.patch @@ -0,0 +1,74 @@ +From 1edbd8aad68ea5ddd729e1cbd307c84b1386459a Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 27 Apr 2022 15:13:02 -0500 +Subject: [PATCH] sysdeps: Add 'get_fast_jitter' interace in fast-jitter.h + +'get_fast_jitter' is meant to be used purely for performance +purposes. 
In all cases it's used it should be acceptable to get no +randomness (see default case). An example use case is in setting +jitter for retries between threads at a lock. There is a +performance benefit to having jitter, but only if the jitter can +be generated very quickly and ultimately there is no serious issue +if no jitter is generated. + +The implementation generally uses 'HP_TIMING_NOW' iff it is +inlined (avoid any potential syscall paths). +Reviewed-by: H.J. Lu + +(cherry picked from commit 911c63a51c690dd1a97dfc587097277029baf00f) +--- + sysdeps/generic/fast-jitter.h | 42 +++++++++++++++++++++++++++++++++++ + 1 file changed, 42 insertions(+) + create mode 100644 sysdeps/generic/fast-jitter.h + +diff --git a/sysdeps/generic/fast-jitter.h b/sysdeps/generic/fast-jitter.h +new file mode 100644 +index 00000000..4dd53e34 +--- /dev/null ++++ b/sysdeps/generic/fast-jitter.h +@@ -0,0 +1,42 @@ ++/* Fallback for fast jitter just return 0. ++ Copyright (C) 2019-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef _FAST_JITTER_H ++# define _FAST_JITTER_H ++ ++# include ++# include ++ ++/* Baseline just return 0. We could create jitter using a clock or ++ 'random_bits' but that may imply a syscall and the goal of ++ 'get_fast_jitter' is minimal overhead "randomness" when such ++ randomness helps performance. Adding high overhead the function ++ defeats the purpose. */ ++static inline uint32_t ++get_fast_jitter (void) ++{ ++# if HP_TIMING_INLINE ++ hp_timing_t jitter; ++ HP_TIMING_NOW (jitter); ++ return (uint32_t) jitter; ++# else ++ return 0; ++# endif ++} ++ ++#endif +-- +GitLab + diff --git a/SOURCES/ia-code_cleanup-memcmp-avx2-movbe.patch b/SOURCES/ia-code_cleanup-memcmp-avx2-movbe.patch new file mode 100644 index 0000000..2f04515 --- /dev/null +++ b/SOURCES/ia-code_cleanup-memcmp-avx2-movbe.patch @@ -0,0 +1,264 @@ +From 4619b6dbf13c17a13be2d2a0bdc9fcc2640b0f86 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 15 Apr 2022 12:28:01 -0500 +Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S + +Old code was both inefficient and wasted code size. New code (-62 +bytes) and comparable or better performance in the page cross case. 
+ +geometric_mean(N=20) of page cross cases New / Original: 0.960 + +size, align0, align1, ret, New Time/Old Time + 1, 4095, 0, 0, 1.001 + 1, 4095, 0, 1, 0.999 + 1, 4095, 0, -1, 1.0 + 2, 4094, 0, 0, 1.0 + 2, 4094, 0, 1, 1.0 + 2, 4094, 0, -1, 1.0 + 3, 4093, 0, 0, 1.0 + 3, 4093, 0, 1, 1.0 + 3, 4093, 0, -1, 1.0 + 4, 4092, 0, 0, 0.987 + 4, 4092, 0, 1, 1.0 + 4, 4092, 0, -1, 1.0 + 5, 4091, 0, 0, 0.984 + 5, 4091, 0, 1, 1.002 + 5, 4091, 0, -1, 1.005 + 6, 4090, 0, 0, 0.993 + 6, 4090, 0, 1, 1.001 + 6, 4090, 0, -1, 1.003 + 7, 4089, 0, 0, 0.991 + 7, 4089, 0, 1, 1.0 + 7, 4089, 0, -1, 1.001 + 8, 4088, 0, 0, 0.875 + 8, 4088, 0, 1, 0.881 + 8, 4088, 0, -1, 0.888 + 9, 4087, 0, 0, 0.872 + 9, 4087, 0, 1, 0.879 + 9, 4087, 0, -1, 0.883 + 10, 4086, 0, 0, 0.878 + 10, 4086, 0, 1, 0.886 + 10, 4086, 0, -1, 0.873 + 11, 4085, 0, 0, 0.878 + 11, 4085, 0, 1, 0.881 + 11, 4085, 0, -1, 0.879 + 12, 4084, 0, 0, 0.873 + 12, 4084, 0, 1, 0.889 + 12, 4084, 0, -1, 0.875 + 13, 4083, 0, 0, 0.873 + 13, 4083, 0, 1, 0.863 + 13, 4083, 0, -1, 0.863 + 14, 4082, 0, 0, 0.838 + 14, 4082, 0, 1, 0.869 + 14, 4082, 0, -1, 0.877 + 15, 4081, 0, 0, 0.841 + 15, 4081, 0, 1, 0.869 + 15, 4081, 0, -1, 0.876 + 16, 4080, 0, 0, 0.988 + 16, 4080, 0, 1, 0.99 + 16, 4080, 0, -1, 0.989 + 17, 4079, 0, 0, 0.978 + 17, 4079, 0, 1, 0.981 + 17, 4079, 0, -1, 0.98 + 18, 4078, 0, 0, 0.981 + 18, 4078, 0, 1, 0.98 + 18, 4078, 0, -1, 0.985 + 19, 4077, 0, 0, 0.977 + 19, 4077, 0, 1, 0.979 + 19, 4077, 0, -1, 0.986 + 20, 4076, 0, 0, 0.977 + 20, 4076, 0, 1, 0.986 + 20, 4076, 0, -1, 0.984 + 21, 4075, 0, 0, 0.977 + 21, 4075, 0, 1, 0.983 + 21, 4075, 0, -1, 0.988 + 22, 4074, 0, 0, 0.983 + 22, 4074, 0, 1, 0.994 + 22, 4074, 0, -1, 0.993 + 23, 4073, 0, 0, 0.98 + 23, 4073, 0, 1, 0.992 + 23, 4073, 0, -1, 0.995 + 24, 4072, 0, 0, 0.989 + 24, 4072, 0, 1, 0.989 + 24, 4072, 0, -1, 0.991 + 25, 4071, 0, 0, 0.99 + 25, 4071, 0, 1, 0.999 + 25, 4071, 0, -1, 0.996 + 26, 4070, 0, 0, 0.993 + 26, 4070, 0, 1, 0.995 + 26, 4070, 0, -1, 0.998 + 27, 4069, 0, 0, 0.993 + 27, 4069, 0, 1, 0.999 + 27, 4069, 0, -1, 1.0 + 28, 4068, 0, 0, 0.997 + 28, 4068, 0, 1, 1.0 + 28, 4068, 0, -1, 0.999 + 29, 4067, 0, 0, 0.996 + 29, 4067, 0, 1, 0.999 + 29, 4067, 0, -1, 0.999 + 30, 4066, 0, 0, 0.991 + 30, 4066, 0, 1, 1.001 + 30, 4066, 0, -1, 0.999 + 31, 4065, 0, 0, 0.988 + 31, 4065, 0, 1, 0.998 + 31, 4065, 0, -1, 0.998 +Reviewed-by: H.J. Lu + +(cherry picked from commit 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f) +--- + sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++-------- + 1 file changed, 61 insertions(+), 37 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +index 16fc673e..99258cf5 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +@@ -429,22 +429,21 @@ L(page_cross_less_vec): + # ifndef USE_AS_WMEMCMP + cmpl $8, %edx + jae L(between_8_15) ++ /* Fall through for [4, 7]. */ + cmpl $4, %edx +- jae L(between_4_7) ++ jb L(between_2_3) + +- /* Load as big endian to avoid branches. */ +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- /* Subtraction is okay because the upper 8 bits are zero. 
*/ +- subl %ecx, %eax ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ /* Fast path for return zero. */ ++ jnz L(ret_nonzero) + /* No ymm register was touched. */ + ret + +@@ -457,9 +456,33 @@ L(one_or_less): + /* No ymm register was touched. */ + ret + ++ .p2align 4,, 5 ++L(ret_nonzero): ++ sbbl %eax, %eax ++ orl $1, %eax ++ /* No ymm register was touched. */ ++ ret ++ ++ .p2align 4,, 2 ++L(zero): ++ xorl %eax, %eax ++ /* No ymm register was touched. */ ++ ret ++ + .p2align 4 + L(between_8_15): +-# endif ++ movbe (%rdi), %rax ++ movbe (%rsi), %rcx ++ subq %rcx, %rax ++ jnz L(ret_nonzero) ++ movbe -8(%rdi, %rdx), %rax ++ movbe -8(%rsi, %rdx), %rcx ++ subq %rcx, %rax ++ /* Fast path for return zero. */ ++ jnz L(ret_nonzero) ++ /* No ymm register was touched. */ ++ ret ++# else + /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 +@@ -475,16 +498,13 @@ L(between_8_15): + VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax ++ /* Fast path for return zero. */ + jnz L(return_vec_0) + /* No ymm register was touched. */ + ret ++# endif + +- .p2align 4 +-L(zero): +- xorl %eax, %eax +- ret +- +- .p2align 4 ++ .p2align 4,, 10 + L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + vmovdqu (%rsi), %xmm2 +@@ -501,11 +521,17 @@ L(between_16_31): + VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax ++ /* Fast path for return zero. */ + jnz L(return_vec_0) + /* No ymm register was touched. */ + ret + + # ifdef USE_AS_WMEMCMP ++ .p2align 4,, 2 ++L(zero): ++ xorl %eax, %eax ++ ret ++ + .p2align 4 + L(one_or_less): + jb L(zero) +@@ -520,22 +546,20 @@ L(one_or_less): + # else + + .p2align 4 +-L(between_4_7): +- /* Load as big endian with overlapping movbe to avoid branches. +- */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- jz L(zero_4_7) +- sbbl %eax, %eax +- orl $1, %eax +-L(zero_4_7): ++L(between_2_3): ++ /* Load as big endian to avoid branches. */ ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ bswap %eax ++ bswap %ecx ++ shrl %eax ++ shrl %ecx ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx ++ /* Subtraction is okay because the upper bit is zero. */ ++ subl %ecx, %eax + /* No ymm register was touched. */ + ret + # endif +-- +GitLab + diff --git a/SOURCES/ia-code_cleanup-strchr-avx2.patch b/SOURCES/ia-code_cleanup-strchr-avx2.patch new file mode 100644 index 0000000..3d1111b --- /dev/null +++ b/SOURCES/ia-code_cleanup-strchr-avx2.patch @@ -0,0 +1,373 @@ +From 1da58a6d12719da8cc2035c2f6f9928d2ad61a20 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:16 -0500 +Subject: [PATCH] x86: Code cleanup in strchr-avx2 and comment justifying + branch + +Small code cleanup for size: -53 bytes. + +Add comment justifying using a branch to do NULL/non-null return. + +All string/memory tests pass and no regressions in benchtests. + +geometric_mean(N=20) of all benchmarks Original / New: 1.00 +Reviewed-by: H.J. 
Lu + +(cherry picked from commit a6fbf4d51e9ba8063c4f8331564892ead9c67344) +--- + sysdeps/x86_64/multiarch/strchr-avx2.S | 204 +++++++++++++------------ + 1 file changed, 107 insertions(+), 97 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S +index 5884726b..89dd2bf7 100644 +--- a/sysdeps/x86_64/multiarch/strchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S +@@ -48,13 +48,13 @@ + # define PAGE_SIZE 4096 + + .section SECTION(.text),"ax",@progbits +-ENTRY (STRCHR) ++ENTRY_P2ALIGN (STRCHR, 5) + /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + VPBROADCAST %xmm0, %ymm0 +- vpxor %xmm9, %xmm9, %xmm9 ++ vpxor %xmm1, %xmm1, %xmm1 + + /* Check if we cross page boundary with one vector load. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax +@@ -62,37 +62,29 @@ ENTRY (STRCHR) + + /* Check the first VEC_SIZE bytes. Search for both CHAR and the + null byte. */ +- vmovdqu (%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqu (%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax + # ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ +- cmp (%rdi, %rax), %CHAR_REG +- jne L(zero) +-# endif +- addq %rdi, %rax +- VZEROUPPER_RETURN +- +- /* .p2align 5 helps keep performance more consistent if ENTRY() +- alignment % 32 was either 16 or 0. As well this makes the +- alignment % 32 of the loop_4x_vec fixed which makes tuning it +- easier. */ +- .p2align 5 +-L(first_vec_x4): +- tzcntl %eax, %eax +- addq $(VEC_SIZE * 3 + 1), %rdi +-# ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ ++ /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG ++ /* NB: Use a branch instead of cmovcc here. The expectation is ++ that with strchr the user will branch based on input being ++ null. Since this branch will be 100% predictive of the user ++ branch a branch miss here should save what otherwise would ++ be branch miss in the user code. Otherwise using a branch 1) ++ saves code size and 2) is faster in highly predictable ++ environments. */ + jne L(zero) + # endif + addq %rdi, %rax +- VZEROUPPER_RETURN ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + # ifndef USE_AS_STRCHRNUL + L(zero): +@@ -103,7 +95,8 @@ L(zero): + + .p2align 4 + L(first_vec_x1): +- tzcntl %eax, %eax ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax + incq %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +@@ -113,9 +106,10 @@ L(first_vec_x1): + addq %rdi, %rax + VZEROUPPER_RETURN + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x2): +- tzcntl %eax, %eax ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax + addq $(VEC_SIZE + 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +@@ -125,9 +119,10 @@ L(first_vec_x2): + addq %rdi, %rax + VZEROUPPER_RETURN + +- .p2align 4 ++ .p2align 4,, 8 + L(first_vec_x3): +- tzcntl %eax, %eax ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax + addq $(VEC_SIZE * 2 + 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +@@ -137,6 +132,21 @@ L(first_vec_x3): + addq %rdi, %rax + VZEROUPPER_RETURN + ++ .p2align 4,, 10 ++L(first_vec_x4): ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax ++ addq $(VEC_SIZE * 3 + 1), %rdi ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. 
*/ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) ++# endif ++ addq %rdi, %rax ++ VZEROUPPER_RETURN ++ ++ ++ + .p2align 4 + L(aligned_more): + /* Align data to VEC_SIZE - 1. This is the same number of +@@ -146,90 +156,92 @@ L(aligned_more): + L(cross_page_continue): + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ +- vmovdqa 1(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa 1(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x1) + +- vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x2) + +- vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x3) + +- vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x4) +- /* Align data to VEC_SIZE * 4 - 1. */ +- addq $(VEC_SIZE * 4 + 1), %rdi +- andq $-(VEC_SIZE * 4), %rdi ++ /* Align data to VEC_SIZE * 4 - 1. */ ++ incq %rdi ++ orq $(VEC_SIZE * 4 - 1), %rdi + .p2align 4 + L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ +- vmovdqa (%rdi), %ymm5 +- vmovdqa (VEC_SIZE)(%rdi), %ymm6 +- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 +- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 ++ vmovdqa 1(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7 + + /* Leaves only CHARS matching esi as 0. 
*/ +- vpxor %ymm5, %ymm0, %ymm1 + vpxor %ymm6, %ymm0, %ymm2 + vpxor %ymm7, %ymm0, %ymm3 +- vpxor %ymm8, %ymm0, %ymm4 + +- VPMINU %ymm1, %ymm5, %ymm1 + VPMINU %ymm2, %ymm6, %ymm2 + VPMINU %ymm3, %ymm7, %ymm3 +- VPMINU %ymm4, %ymm8, %ymm4 + +- VPMINU %ymm1, %ymm2, %ymm5 +- VPMINU %ymm3, %ymm4, %ymm6 ++ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7 ++ ++ vpxor %ymm6, %ymm0, %ymm4 ++ vpxor %ymm7, %ymm0, %ymm5 ++ ++ VPMINU %ymm4, %ymm6, %ymm4 ++ VPMINU %ymm5, %ymm7, %ymm5 + +- VPMINU %ymm5, %ymm6, %ymm6 ++ VPMINU %ymm2, %ymm3, %ymm6 ++ VPMINU %ymm4, %ymm5, %ymm7 + +- VPCMPEQ %ymm6, %ymm9, %ymm6 +- vpmovmskb %ymm6, %ecx ++ VPMINU %ymm6, %ymm7, %ymm7 ++ ++ VPCMPEQ %ymm7, %ymm1, %ymm7 ++ vpmovmskb %ymm7, %ecx + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) + +- +- VPCMPEQ %ymm1, %ymm9, %ymm1 +- vpmovmskb %ymm1, %eax ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(last_vec_x0) + + +- VPCMPEQ %ymm5, %ymm9, %ymm2 +- vpmovmskb %ymm2, %eax ++ VPCMPEQ %ymm3, %ymm1, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(last_vec_x1) + +- VPCMPEQ %ymm3, %ymm9, %ymm3 +- vpmovmskb %ymm3, %eax ++ VPCMPEQ %ymm4, %ymm1, %ymm4 ++ vpmovmskb %ymm4, %eax + /* rcx has combined result from all 4 VEC. It will only be used + if the first 3 other VEC all did not contain a match. */ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax +- subq $(VEC_SIZE * 2), %rdi ++ subq $(VEC_SIZE * 2 - 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG +@@ -239,10 +251,11 @@ L(loop_4x_vec): + VZEROUPPER_RETURN + + +- .p2align 4 ++ .p2align 4,, 10 + L(last_vec_x0): +- tzcntl %eax, %eax +- addq $-(VEC_SIZE * 4), %rdi ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax ++ addq $-(VEC_SIZE * 4 - 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG +@@ -251,16 +264,11 @@ L(last_vec_x0): + addq %rdi, %rax + VZEROUPPER_RETURN + +-# ifndef USE_AS_STRCHRNUL +-L(zero_end): +- xorl %eax, %eax +- VZEROUPPER_RETURN +-# endif + +- .p2align 4 ++ .p2align 4,, 10 + L(last_vec_x1): + tzcntl %eax, %eax +- subq $(VEC_SIZE * 3), %rdi ++ subq $(VEC_SIZE * 3 - 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG +@@ -269,18 +277,23 @@ L(last_vec_x1): + addq %rdi, %rax + VZEROUPPER_RETURN + ++# ifndef USE_AS_STRCHRNUL ++L(zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN ++# endif + + /* Cold case for crossing page with first load. */ +- .p2align 4 ++ .p2align 4,, 8 + L(cross_page_boundary): + movq %rdi, %rdx + /* Align rdi to VEC_SIZE - 1. */ + orq $(VEC_SIZE - 1), %rdi +- vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT + so no need to manually mod edx. */ + sarxl %edx, %eax, %eax +@@ -291,13 +304,10 @@ L(cross_page_boundary): + xorl %ecx, %ecx + /* Found CHAR or the null byte. 
*/ + cmp (%rdx, %rax), %CHAR_REG +- leaq (%rdx, %rax), %rax +- cmovne %rcx, %rax +-# else +- addq %rdx, %rax ++ jne L(zero_end) + # endif +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ addq %rdx, %rax ++ VZEROUPPER_RETURN + + END (STRCHR) +-# endif ++#endif +-- +GitLab + diff --git a/SOURCES/ia-code_cleanup-strchr-evex.patch b/SOURCES/ia-code_cleanup-strchr-evex.patch new file mode 100644 index 0000000..933ef0b --- /dev/null +++ b/SOURCES/ia-code_cleanup-strchr-evex.patch @@ -0,0 +1,344 @@ +From 23371c15467ef5d3225018278d6691e110119e4b Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:18 -0500 +Subject: [PATCH] x86: Code cleanup in strchr-evex and comment justifying + branch + +Small code cleanup for size: -81 bytes. + +Add comment justifying using a branch to do NULL/non-null return. + +All string/memory tests pass and no regressions in benchtests. + +geometric_mean(N=20) of all benchmarks New / Original: .985 +Reviewed-by: H.J. Lu + +(cherry picked from commit ec285ea90415458225623ddc0492ae3f705af043) +--- + sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++----------- + 1 file changed, 80 insertions(+), 66 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S +index 7f9d4ee4..0b49e0ac 100644 +--- a/sysdeps/x86_64/multiarch/strchr-evex.S ++++ b/sysdeps/x86_64/multiarch/strchr-evex.S +@@ -30,6 +30,7 @@ + # ifdef USE_AS_WCSCHR + # define VPBROADCAST vpbroadcastd + # define VPCMP vpcmpd ++# define VPTESTN vptestnmd + # define VPMINU vpminud + # define CHAR_REG esi + # define SHIFT_REG ecx +@@ -37,6 +38,7 @@ + # else + # define VPBROADCAST vpbroadcastb + # define VPCMP vpcmpb ++# define VPTESTN vptestnmb + # define VPMINU vpminub + # define CHAR_REG sil + # define SHIFT_REG edx +@@ -61,13 +63,11 @@ + # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section .text.evex,"ax",@progbits +-ENTRY (STRCHR) ++ENTRY_P2ALIGN (STRCHR, 5) + /* Broadcast CHAR to YMM0. */ + VPBROADCAST %esi, %YMM0 + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO +- + /* Check if we cross page boundary with one vector load. + Otherwise it is safe to use an unaligned load. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax +@@ -81,49 +81,35 @@ ENTRY (STRCHR) + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ /* NB: Use a branch instead of cmovcc here. The expectation is ++ that with strchr the user will branch based on input being ++ null. Since this branch will be 100% predictive of the user ++ branch a branch miss here should save what otherwise would ++ be branch miss in the user code. Otherwise using a branch 1) ++ saves code size and 2) is faster in highly predictable ++ environments. */ ++ jne L(zero) ++# endif + # ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. + */ + leaq (%rdi, %rax, CHAR_SIZE), %rax + # else + addq %rdi, %rax +-# endif +-# ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ +- cmp (%rax), %CHAR_REG +- jne L(zero) + # endif + ret + +- /* .p2align 5 helps keep performance more consistent if ENTRY() +- alignment % 32 was either 16 or 0. 
As well this makes the +- alignment % 32 of the loop_4x_vec fixed which makes tuning it +- easier. */ +- .p2align 5 +-L(first_vec_x3): +- tzcntl %eax, %eax +-# ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ +- cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG +- jne L(zero) +-# endif +- /* NB: Multiply sizeof char type (1 or 4) to get the number of +- bytes. */ +- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax +- ret + +-# ifndef USE_AS_STRCHRNUL +-L(zero): +- xorl %eax, %eax +- ret +-# endif + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x4): + # ifndef USE_AS_STRCHRNUL + /* Check to see if first match was CHAR (k0) or null (k1). */ +@@ -144,9 +130,18 @@ L(first_vec_x4): + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax + ret + ++# ifndef USE_AS_STRCHRNUL ++L(zero): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ + .p2align 4 + L(first_vec_x1): +- tzcntl %eax, %eax ++ /* Use bsf here to save 1-byte keeping keeping the block in 1x ++ fetch block. eax guranteed non-zero. */ ++ bsfl %eax, %eax + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG +@@ -158,7 +153,7 @@ L(first_vec_x1): + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x2): + # ifndef USE_AS_STRCHRNUL + /* Check to see if first match was CHAR (k0) or null (k1). */ +@@ -179,6 +174,21 @@ L(first_vec_x2): + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + ++ .p2align 4,, 10 ++L(first_vec_x3): ++ /* Use bsf here to save 1-byte keeping keeping the block in 1x ++ fetch block. eax guranteed non-zero. */ ++ bsfl %eax, %eax ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero) ++# endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ + .p2align 4 + L(aligned_more): + /* Align data to VEC_SIZE. */ +@@ -195,7 +205,7 @@ L(cross_page_continue): + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) +@@ -206,7 +216,7 @@ L(cross_page_continue): + /* Each bit in K0 represents a CHAR in YMM1. */ + VPCMP $0, %YMM1, %YMM0, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMM1, %YMMZERO, %k1 ++ VPTESTN %YMM1, %YMM1, %k1 + kortestd %k0, %k1 + jnz L(first_vec_x2) + +@@ -215,7 +225,7 @@ L(cross_page_continue): + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3) +@@ -224,7 +234,7 @@ L(cross_page_continue): + /* Each bit in K0 represents a CHAR in YMM1. */ + VPCMP $0, %YMM1, %YMM0, %k0 + /* Each bit in K1 represents a CHAR in YMM1. 
*/ +- VPCMP $0, %YMM1, %YMMZERO, %k1 ++ VPTESTN %YMM1, %YMM1, %k1 + kortestd %k0, %k1 + jnz L(first_vec_x4) + +@@ -265,33 +275,33 @@ L(loop_4x_vec): + VPMINU %YMM3, %YMM4, %YMM4 + VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} + +- VPCMP $0, %YMMZERO, %YMM4, %k1 ++ VPTESTN %YMM4, %YMM4, %k1 + kmovd %k1, %ecx + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) + +- VPCMP $0, %YMMZERO, %YMM1, %k0 ++ VPTESTN %YMM1, %YMM1, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x1) + +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x2) + +- VPCMP $0, %YMMZERO, %YMM3, %k0 ++ VPTESTN %YMM3, %YMM3, %k0 + kmovd %k0, %eax + /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ + # ifdef USE_AS_WCSCHR + sall $8, %ecx + orl %ecx, %eax +- tzcntl %eax, %eax ++ bsfl %eax, %eax + # else + salq $32, %rcx + orq %rcx, %rax +- tzcntq %rax, %rax ++ bsfq %rax, %rax + # endif + # ifndef USE_AS_STRCHRNUL + /* Check if match was CHAR or null. */ +@@ -303,28 +313,28 @@ L(loop_4x_vec): + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + +-# ifndef USE_AS_STRCHRNUL +-L(zero_end): +- xorl %eax, %eax +- ret ++ .p2align 4,, 8 ++L(last_vec_x1): ++ bsfl %eax, %eax ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. ++ */ ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++# else ++ addq %rdi, %rax + # endif + +- .p2align 4 +-L(last_vec_x1): +- tzcntl %eax, %eax + # ifndef USE_AS_STRCHRNUL + /* Check if match was null. */ +- cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ cmp (%rax), %CHAR_REG + jne L(zero_end) + # endif +- /* NB: Multiply sizeof char type (1 or 4) to get the number of +- bytes. */ +- leaq (%rdi, %rax, CHAR_SIZE), %rax ++ + ret + +- .p2align 4 ++ .p2align 4,, 8 + L(last_vec_x2): +- tzcntl %eax, %eax ++ bsfl %eax, %eax + # ifndef USE_AS_STRCHRNUL + /* Check if match was null. */ + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG +@@ -336,7 +346,7 @@ L(last_vec_x2): + ret + + /* Cold case for crossing page with first load. */ +- .p2align 4 ++ .p2align 4,, 8 + L(cross_page_boundary): + movq %rdi, %rdx + /* Align rdi. */ +@@ -346,9 +356,9 @@ L(cross_page_boundary): + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax +- /* Remove the leading bits. */ ++ /* Remove the leading bits. */ + # ifdef USE_AS_WCSCHR + movl %edx, %SHIFT_REG + /* NB: Divide shift count by 4 since each bit in K1 represent 4 +@@ -360,20 +370,24 @@ L(cross_page_boundary): + /* If eax is zero continue. */ + testl %eax, %eax + jz L(cross_page_continue) +- tzcntl %eax, %eax +-# ifndef USE_AS_STRCHRNUL +- /* Check to see if match was CHAR or null. */ +- cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG +- jne L(zero_end) +-# endif ++ bsfl %eax, %eax ++ + # ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of + bytes. */ + leaq (%rdx, %rax, CHAR_SIZE), %rax + # else + addq %rdx, %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ /* Check to see if match was CHAR or null. 
*/ ++ cmp (%rax), %CHAR_REG ++ je L(cross_page_ret) ++L(zero_end): ++ xorl %eax, %eax ++L(cross_page_ret): + # endif + ret + + END (STRCHR) +-# endif ++#endif +-- +GitLab + diff --git a/SOURCES/ia-imp-l.patch b/SOURCES/ia-imp-l.patch new file mode 100644 index 0000000..3648932 --- /dev/null +++ b/SOURCES/ia-imp-l.patch @@ -0,0 +1,27 @@ +From bc402884eb392fa9c3c5813fca95c0b37d9879a6 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Sat, 5 Feb 2022 11:06:01 -0800 +Subject: [PATCH] x86: Improve L to support L(XXX_SYMBOL (YYY, ZZZ)) + +(cherry picked from commit 1283948f236f209b7d3f44b69a42b96806fa6da0) +--- + sysdeps/x86/sysdep.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h +index a70bb3a2..49b0efe2 100644 +--- a/sysdeps/x86/sysdep.h ++++ b/sysdeps/x86/sysdep.h +@@ -111,7 +111,8 @@ enum cf_protection_level + /* Local label name for asm code. */ + #ifndef L + /* ELF-like local names start with `.L'. */ +-# define L(name) .L##name ++# define LOCAL_LABEL(name) .L##name ++# define L(name) LOCAL_LABEL(name) + #endif + + #define atom_text_section .section ".text.atom", "ax" +-- +GitLab + diff --git a/SOURCES/ia-imp-vec_gen-memset-vec-unaligned-erms.patch b/SOURCES/ia-imp-vec_gen-memset-vec-unaligned-erms.patch new file mode 100644 index 0000000..28aeb5e --- /dev/null +++ b/SOURCES/ia-imp-vec_gen-memset-vec-unaligned-erms.patch @@ -0,0 +1,460 @@ +From 413e4abc92aeb12fb4c188aa53f0425ceac0ef15 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sun, 6 Feb 2022 00:54:18 -0600 +Subject: [PATCH] x86: Improve vec generation in memset-vec-unaligned-erms.S + +No bug. + +Split vec generation into multiple steps. This allows the +broadcast in AVX2 to use 'xmm' registers for the L(less_vec) +case. This saves an expensive lane-cross instruction and removes +the need for 'vzeroupper'. + +For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for +byte broadcast. + +Results for memset-avx2 small (geomean of N = 20 benchset runs). + +size, New Time, Old Time, New / Old + 0, 4.100, 3.831, 0.934 + 1, 5.074, 4.399, 0.867 + 2, 4.433, 4.411, 0.995 + 4, 4.487, 4.415, 0.984 + 8, 4.454, 4.396, 0.987 + 16, 4.502, 4.443, 0.987 + +All relevant string/wcsmbs tests are passing. +Reviewed-by: H.J. 
Lu + +(cherry picked from commit b62ace2740a106222e124cc86956448fa07abf4d) +--- + sysdeps/x86_64/memset.S | 21 ++- + .../multiarch/memset-avx2-unaligned-erms.S | 18 +- + .../multiarch/memset-avx512-unaligned-erms.S | 18 +- + .../multiarch/memset-evex-unaligned-erms.S | 18 +- + .../multiarch/memset-vec-unaligned-erms.S | 164 +++++++++++------- + 5 files changed, 152 insertions(+), 87 deletions(-) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 8672b030..27debd2b 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -28,17 +28,22 @@ + #define VMOVU movups + #define VMOVA movaps + +-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- movq r, %rax; \ +- punpcklbw %xmm0, %xmm0; \ +- punpcklwd %xmm0, %xmm0; \ +- pshufd $0, %xmm0, %xmm0 ++ pxor %xmm1, %xmm1; \ ++ pshufb %xmm1, %xmm0; \ ++ movq r, %rax + +-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- movq r, %rax; \ +- pshufd $0, %xmm0, %xmm0 ++ pshufd $0, %xmm0, %xmm0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + #define SECTION(p) p + +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index 1af668af..c0bf2875 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -10,15 +10,18 @@ + # define VMOVU vmovdqu + # define VMOVA vmovdqa + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ +- movq r, %rax; \ +- vpbroadcastb %xmm0, %ymm0 ++ movq r, %rax; + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- vmovd d, %xmm0; \ +- movq r, %rax; \ +- vpbroadcastd %xmm0, %ymm0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ MEMSET_SET_VEC0_AND_SET_RETURN(d, r) ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0 ++# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0 ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0 ++# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0 + + # ifndef SECTION + # define SECTION(p) p##.avx +@@ -30,5 +33,6 @@ + # define WMEMSET_SYMBOL(p,s) p##_avx2_##s + # endif + ++# define USE_XMM_LESS_VEC + # include "memset-vec-unaligned-erms.S" + #endif +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index f14d6f84..5241216a 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -15,13 +15,19 @@ + + # define VZEROUPPER + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastb d, %VEC0 ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastb d, %VEC0; \ ++ movq r, %rax + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastd d, %VEC0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastd d, %VEC0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + # define SECTION(p) p##.evex512 + # define MEMSET_SYMBOL(p,s) p##_avx512_##s +diff --git 
a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 64b09e77..63700215 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -15,13 +15,19 @@ + + # define VZEROUPPER + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastb d, %VEC0 ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastb d, %VEC0; \ ++ movq r, %rax + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastd d, %VEC0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastd d, %VEC0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + # define SECTION(p) p##.evex + # define MEMSET_SYMBOL(p,s) p##_evex_##s +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index f08b7323..a67f9833 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -58,8 +58,10 @@ + #ifndef MOVQ + # if VEC_SIZE > 16 + # define MOVQ vmovq ++# define MOVD vmovd + # else + # define MOVQ movq ++# define MOVD movd + # endif + #endif + +@@ -72,9 +74,17 @@ + #if defined USE_WITH_EVEX || defined USE_WITH_AVX512 + # define END_REG rcx + # define LOOP_REG rdi ++# define LESS_VEC_REG rax + #else + # define END_REG rdi + # define LOOP_REG rdx ++# define LESS_VEC_REG rdi ++#endif ++ ++#ifdef USE_XMM_LESS_VEC ++# define XMM_SMALL 1 ++#else ++# define XMM_SMALL 0 + #endif + + #define PAGE_SIZE 4096 +@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) + + ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + shl $2, %RDX_LP +- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) +- jmp L(entry_from_bzero) ++ WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) ++ WMEMSET_VDUP_TO_VEC0_LOW() ++ cmpq $VEC_SIZE, %rdx ++ jb L(less_vec_no_vdup) ++ WMEMSET_VDUP_TO_VEC0_HIGH() ++ jmp L(entry_from_wmemset) + END (WMEMSET_SYMBOL (__wmemset, unaligned)) + #endif + +@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + #endif + + ENTRY (MEMSET_SYMBOL (__memset, unaligned)) +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) ++ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx +@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) ++ MEMSET_VDUP_TO_VEC0_HIGH() ++L(entry_from_wmemset): + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ +@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + # endif + + ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6) +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) ++ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx + # endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) ++ MEMSET_VDUP_TO_VEC0_HIGH () + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(stosb_more_2x_vec) +- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. +- */ +- VMOVU %VEC(0), (%rax) +- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. 
*/ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) + VZEROUPPER_RETURN + #endif + +- .p2align 4,, 10 ++ .p2align 4,, 4 + L(last_2x_vec): + #ifdef USE_LESS_VEC_MASK_STORE +- VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx) +- VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx) ++ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) + #else + VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi) + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi) +@@ -212,6 +228,7 @@ L(last_2x_vec): + #ifdef USE_LESS_VEC_MASK_STORE + .p2align 4,, 10 + L(less_vec): ++L(less_vec_no_vdup): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! +@@ -262,28 +279,18 @@ L(stosb_more_2x_vec): + /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x] + and (4x, 8x] jump to target. */ + L(more_2x_vec): +- +- /* Two different methods of setting up pointers / compare. The +- two methods are based on the fact that EVEX/AVX512 mov +- instructions take more bytes then AVX2/SSE2 mov instructions. As +- well that EVEX/AVX512 machines also have fast LEA_BID. Both +- setup and END_REG to avoid complex address mode. For EVEX/AVX512 +- this saves code size and keeps a few targets in one fetch block. +- For AVX2/SSE2 this helps prevent AGU bottlenecks. */ +-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 +- /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + +- LOOP_4X_OFFSET) with LEA_BID. */ +- +- /* END_REG is rcx for EVEX/AVX512. */ +- leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG +-#endif +- +- /* Stores to first 2x VEC before cmp as any path forward will +- require it. */ +- VMOVU %VEC(0), (%rax) +- VMOVU %VEC(0), VEC_SIZE(%rax) ++ /* Store next 2x vec regardless. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi) + + ++ /* Two different methods of setting up pointers / compare. The two ++ methods are based on the fact that EVEX/AVX512 mov instructions take ++ more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512 ++ machines also have fast LEA_BID. Both setup and END_REG to avoid complex ++ address mode. For EVEX/AVX512 this saves code size and keeps a few ++ targets in one fetch block. For AVX2/SSE2 this helps prevent AGU ++ bottlenecks. */ + #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) + /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */ + addq %rdx, %END_REG +@@ -292,6 +299,15 @@ L(more_2x_vec): + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_2x_vec) + ++ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with ++ LEA_BID. */ ++ ++ /* END_REG is rcx for EVEX/AVX512. */ ++ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG ++#endif ++ + /* Store next 2x vec regardless. */ + VMOVU %VEC(0), (VEC_SIZE * 2)(%rax) + VMOVU %VEC(0), (VEC_SIZE * 3)(%rax) +@@ -355,65 +371,93 @@ L(stosb_local): + /* Define L(less_vec) only if not otherwise defined. */ + .p2align 4 + L(less_vec): ++ /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to ++ xmm). This is only does anything for AVX2. 
*/ ++ MEMSET_VDUP_TO_VEC0_LOW () ++L(less_vec_no_vdup): + #endif + L(cross_page): + #if VEC_SIZE > 32 + cmpl $32, %edx +- jae L(between_32_63) ++ jge L(between_32_63) + #endif + #if VEC_SIZE > 16 + cmpl $16, %edx +- jae L(between_16_31) ++ jge L(between_16_31) ++#endif ++#ifndef USE_XMM_LESS_VEC ++ MOVQ %XMM0, %rcx + #endif +- MOVQ %XMM0, %rdi + cmpl $8, %edx +- jae L(between_8_15) ++ jge L(between_8_15) + cmpl $4, %edx +- jae L(between_4_7) ++ jge L(between_4_7) + cmpl $1, %edx +- ja L(between_2_3) +- jb L(return) +- movb %sil, (%rax) +- VZEROUPPER_RETURN ++ jg L(between_2_3) ++ jl L(between_0_0) ++ movb %sil, (%LESS_VEC_REG) ++L(between_0_0): ++ ret + +- /* Align small targets only if not doing so would cross a fetch +- line. */ ++ /* Align small targets only if not doing so would cross a fetch line. ++ */ + #if VEC_SIZE > 32 + .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): +- VMOVU %YMM0, (%rax) +- VMOVU %YMM0, -32(%rax, %rdx) ++ VMOVU %YMM0, (%LESS_VEC_REG) ++ VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx) + VZEROUPPER_RETURN + #endif + + #if VEC_SIZE >= 32 +- .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) ++ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1) + L(between_16_31): + /* From 16 to 31. No branch when size == 16. */ +- VMOVU %XMM0, (%rax) +- VMOVU %XMM0, -16(%rax, %rdx) +- VZEROUPPER_RETURN ++ VMOVU %XMM0, (%LESS_VEC_REG) ++ VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx) ++ ret + #endif + +- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) ++ /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. ++ */ ++ .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1) + L(between_8_15): + /* From 8 to 15. No branch when size == 8. */ +- movq %rdi, (%rax) +- movq %rdi, -8(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ MOVQ %XMM0, (%rdi) ++ MOVQ %XMM0, -8(%rdi, %rdx) ++#else ++ movq %rcx, (%LESS_VEC_REG) ++ movq %rcx, -8(%LESS_VEC_REG, %rdx) ++#endif ++ ret + +- .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE) ++ /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. ++ */ ++ .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1) + L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ +- movl %edi, (%rax) +- movl %edi, -4(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ MOVD %XMM0, (%rdi) ++ MOVD %XMM0, -4(%rdi, %rdx) ++#else ++ movl %ecx, (%LESS_VEC_REG) ++ movl %ecx, -4(%LESS_VEC_REG, %rdx) ++#endif ++ ret + +- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) ++ /* 4 * XMM_SMALL for the third mov for AVX2. */ ++ .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1) + L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ +- movw %di, (%rax) +- movb %dil, -1(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ movb %sil, (%rdi) ++ movb %sil, 1(%rdi) ++ movb %sil, -1(%rdi, %rdx) ++#else ++ movw %cx, (%LESS_VEC_REG) ++ movb %sil, -1(%LESS_VEC_REG, %rdx) ++#endif ++ ret + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +-- +GitLab + diff --git a/SOURCES/ia-imp-wcslen.patch b/SOURCES/ia-imp-wcslen.patch new file mode 100644 index 0000000..7f2b720 --- /dev/null +++ b/SOURCES/ia-imp-wcslen.patch @@ -0,0 +1,258 @@ +From ac23759b655ceac1bd18b71f45bb1743826f0bed Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 25 Mar 2022 17:13:33 -0500 +Subject: [PATCH] x86: Small improvements for wcslen + +Just a few QOL changes. + 1. Prefer `add` > `lea` as it has high execution units it can run + on. + 2. Don't break macro-fusion between `test` and `jcc` + 3. 
Reduce code size by removing gratuitous padding bytes (-90 + bytes). + +geometric_mean(N=20) of all benchmarks New / Original: 0.959 + +All string/memory tests pass. +Reviewed-by: H.J. Lu + +(cherry picked from commit 244b415d386487521882debb845a040a4758cb18) +--- + sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++--------------------- + 1 file changed, 41 insertions(+), 45 deletions(-) + +diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S +index 9f5f7232..254bb030 100644 +--- a/sysdeps/x86_64/wcslen.S ++++ b/sysdeps/x86_64/wcslen.S +@@ -41,82 +41,82 @@ ENTRY (__wcslen) + pxor %xmm0, %xmm0 + + lea 32(%rdi), %rax +- lea 16(%rdi), %rcx ++ addq $16, %rdi + and $-16, %rax + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + and $-0x40, %rax +@@ -133,104 +133,100 @@ L(aligned_64_loop): + pminub %xmm0, %xmm2 + pcmpeqd %xmm3, %xmm2 + pmovmskb %xmm2, %edx ++ addq $64, %rax + test %edx, %edx +- lea 64(%rax), %rax + jz L(aligned_64_loop) + + pcmpeqd -64(%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $48, %rdi + test %edx, %edx +- lea 48(%rcx), %rcx + jnz L(exit) + + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm3, %edx ++ addq $-16, %rdi + test %edx, %edx +- lea -16(%rcx), %rcx + jnz L(exit) + + pcmpeqd -32(%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $-16, %rdi + test %edx, %edx +- lea -16(%rcx), %rcx + jnz L(exit) + + pcmpeqd %xmm6, %xmm3 + pmovmskb %xmm3, %edx ++ addq $-16, %rdi + test %edx, %edx +- lea -16(%rcx), %rcx +- jnz L(exit) +- +- jmp L(aligned_64_loop) ++ jz L(aligned_64_loop) + + .p2align 4 + L(exit): +- sub %rcx, %rax ++ sub %rdi, %rax + shr $2, %rax + test %dl, %dl + jz L(exit_high) + +- mov %dl, %cl +- and $15, %cl ++ andl $15, %edx + jz L(exit_1) + ret + +- .p2align 4 ++ /* No align here. Naturally aligned % 16 == 1. 
*/ + L(exit_high): +- mov %dh, %ch +- and $15, %ch ++ andl $(15 << 8), %edx + jz L(exit_3) + add $2, %rax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_1): + add $1, %rax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_3): + add $3, %rax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail0): +- xor %rax, %rax ++ xorl %eax, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail1): +- mov $1, %rax ++ movl $1, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail2): +- mov $2, %rax ++ movl $2, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail3): +- mov $3, %rax ++ movl $3, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail4): +- mov $4, %rax ++ movl $4, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail5): +- mov $5, %rax ++ movl $5, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail6): +- mov $6, %rax ++ movl $6, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail7): +- mov $7, %rax ++ movl $7, %eax + ret + + END (__wcslen) +-- +GitLab + diff --git a/SOURCES/ia-march-srt-sysdep_routines.patch b/SOURCES/ia-march-srt-sysdep_routines.patch new file mode 100644 index 0000000..2525200 --- /dev/null +++ b/SOURCES/ia-march-srt-sysdep_routines.patch @@ -0,0 +1,104 @@ +From 18bfa4d2f8ce8a33367e93ab2eaab90be1133c86 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Sat, 5 Feb 2022 11:52:33 -0800 +Subject: [PATCH] x86_64/multiarch: Sort sysdep_routines and put one entry per + line + +(cherry picked from commit c328d0152d4b14cca58407ec68143894c8863004) +--- + sysdeps/x86_64/multiarch/Makefile | 78 +++++++++++++++++++------------ + 1 file changed, 48 insertions(+), 30 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 37d8d6f0..8c9e7812 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -132,37 +132,55 @@ CFLAGS-strspn-c.c += -msse4 + endif + + ifeq ($(subdir),wcsmbs) +-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ +- wmemcmp-avx2-movbe \ +- wmemchr-sse2 wmemchr-avx2 \ +- wcscmp-sse2 wcscmp-avx2 \ +- wcsncmp-sse2 wcsncmp-avx2 \ +- wcscpy-ssse3 wcscpy-c \ +- wcschr-sse2 wcschr-avx2 \ +- wcsrchr-sse2 wcsrchr-avx2 \ +- wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \ +- wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \ +- wcschr-avx2-rtm \ +- wcscmp-avx2-rtm \ +- wcslen-avx2-rtm \ +- wcsncmp-avx2-rtm \ +- wcsnlen-avx2-rtm \ +- wcsrchr-avx2-rtm \ +- wmemchr-avx2-rtm \ +- wmemcmp-avx2-movbe-rtm \ +- wcschr-evex \ +- wcscmp-evex \ +- wcslen-evex \ +- wcsncmp-evex \ +- wcsnlen-evex \ +- wcsrchr-evex \ +- wmemchr-evex \ +- wmemcmp-evex-movbe \ +- wmemchr-evex-rtm ++sysdep_routines += \ ++ wcschr-avx2 \ ++ wcschr-avx2-rtm \ ++ wcschr-evex \ ++ wcschr-sse2 \ ++ wcscmp-avx2 \ ++ wcscmp-avx2-rtm \ ++ wcscmp-evex \ ++ wcscmp-sse2 \ ++ wcscpy-c \ ++ wcscpy-ssse3 \ ++ wcslen-avx2 \ ++ wcslen-avx2-rtm \ ++ wcslen-evex \ ++ wcslen-sse2 \ ++ wcslen-sse4_1 \ ++ wcsncmp-avx2 \ ++ wcsncmp-avx2-rtm \ ++ wcsncmp-evex \ ++ wcsncmp-sse2 \ ++ wcsnlen-avx2 \ ++ wcsnlen-avx2-rtm \ ++ wcsnlen-c \ ++ wcsnlen-evex \ ++ wcsnlen-sse4_1 \ ++ wcsrchr-avx2 \ ++ wcsrchr-avx2-rtm \ ++ wcsrchr-evex \ ++ wcsrchr-sse2 \ ++ wmemchr-avx2 \ ++ wmemchr-avx2-rtm \ ++ wmemchr-evex \ ++ wmemchr-evex-rtm \ ++ wmemchr-sse2 \ ++ wmemcmp-avx2-movbe \ ++ wmemcmp-avx2-movbe-rtm \ ++ wmemcmp-c \ ++ wmemcmp-evex-movbe \ ++ wmemcmp-sse4 \ ++ wmemcmp-ssse3 \ ++# sysdep_routines + endif + + ifeq ($(subdir),debug) +-sysdep_routines += memcpy_chk-nonshared mempcpy_chk-nonshared \ +- memmove_chk-nonshared memset_chk-nonshared \ +- wmemset_chk-nonshared 
++sysdep_routines += \ ++ memcpy_chk-nonshared \ ++ memmove_chk-nonshared \ ++ mempcpy_chk-nonshared \ ++ memset_chk-nonshared \ ++ wmemset_chk-nonshared \ ++# sysdep_routines + endif +-- +GitLab + diff --git a/SOURCES/ia-opt-bzero.patch b/SOURCES/ia-opt-bzero.patch new file mode 100644 index 0000000..e844865 --- /dev/null +++ b/SOURCES/ia-opt-bzero.patch @@ -0,0 +1,750 @@ +From 94783c6e57638122cefe4e02342c7fafc3cf09f0 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 7 Feb 2022 05:55:15 -0800 +Subject: [PATCH] x86-64: Optimize bzero + +Add OPTIMIZE1 and OPTIMIZE2 in ifunc-init.h file. +Remove memcmpeq implementation from Makefile. + +memset with zero as the value to set is by far the majority value (99%+ +for Python3 and GCC). + +bzero can be slightly more optimized for this case by using a zero-idiom +xor for broadcasting the set value to a register (vector or GPR). + +Co-developed-by: Noah Goldstein +(cherry picked from commit 3d9f171bfb5325bd5f427e9fc386453358c6e840) +--- + sysdeps/generic/ifunc-init.h | 5 +- + sysdeps/x86_64/memset.S | 8 + + sysdeps/x86_64/multiarch/Makefile | 205 +++++++++++------- + sysdeps/x86_64/multiarch/bzero.c | 106 +++++++++ + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 42 ++++ + .../memset-avx2-unaligned-erms-rtm.S | 1 + + .../multiarch/memset-avx2-unaligned-erms.S | 6 + + .../multiarch/memset-avx512-unaligned-erms.S | 3 + + .../multiarch/memset-evex-unaligned-erms.S | 3 + + .../multiarch/memset-sse2-unaligned-erms.S | 1 + + .../multiarch/memset-vec-unaligned-erms.S | 110 +++++++--- + 11 files changed, 384 insertions(+), 106 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/bzero.c + +diff --git a/sysdeps/generic/ifunc-init.h b/sysdeps/generic/ifunc-init.h +index 241e4161..f7a72375 100644 +--- a/sysdeps/generic/ifunc-init.h ++++ b/sysdeps/generic/ifunc-init.h +@@ -50,5 +50,8 @@ + '___' as the optimized implementation and + '_ifunc_selector' as the IFUNC selector. */ + #define REDIRECT_NAME EVALUATOR1 (__redirect, SYMBOL_NAME) +-#define OPTIMIZE(name) EVALUATOR2 (SYMBOL_NAME, name) ++#define OPTIMIZE1(name) EVALUATOR1 (SYMBOL_NAME, name) ++#define OPTIMIZE2(name) EVALUATOR2 (SYMBOL_NAME, name) ++/* Default is to use OPTIMIZE2. 
*/ ++#define OPTIMIZE(name) OPTIMIZE2(name) + #define IFUNC_SELECTOR EVALUATOR1 (SYMBOL_NAME, ifunc_selector) +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 4cb4aa71..a1353f89 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -35,6 +35,9 @@ + punpcklwd %xmm0, %xmm0; \ + pshufd $0, %xmm0, %xmm0 + ++# define BZERO_ZERO_VEC0() \ ++ pxor %xmm0, %xmm0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ + pshufd $0, %xmm0, %xmm0; \ +@@ -53,6 +56,10 @@ + # define MEMSET_SYMBOL(p,s) memset + #endif + ++#ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) __bzero ++#endif ++ + #ifndef WMEMSET_SYMBOL + # define WMEMSET_CHK_SYMBOL(p,s) p + # define WMEMSET_SYMBOL(p,s) __wmemset +@@ -63,6 +70,7 @@ + libc_hidden_builtin_def (memset) + + #if IS_IN (libc) ++weak_alias (__bzero, bzero) + libc_hidden_def (__wmemset) + weak_alias (__wmemset, wmemset) + libc_hidden_weak (wmemset) +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 26be4095..37d8d6f0 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -1,85 +1,130 @@ + ifeq ($(subdir),string) + +-sysdep_routines += strncat-c stpncpy-c strncpy-c \ +- strcmp-sse2 strcmp-sse2-unaligned strcmp-ssse3 \ +- strcmp-sse4_2 strcmp-avx2 \ +- strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 strncmp-avx2 \ +- memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \ +- memrchr-sse2 memrchr-avx2 \ +- memcmp-sse2 \ +- memcmp-avx2-movbe \ +- memcmp-sse4 memcpy-ssse3 \ +- memmove-ssse3 \ +- memcpy-ssse3-back \ +- memmove-ssse3-back \ +- memmove-avx512-no-vzeroupper \ +- strcasecmp_l-sse2 strcasecmp_l-ssse3 \ +- strcasecmp_l-sse4_2 strcasecmp_l-avx \ +- strncase_l-sse2 strncase_l-ssse3 \ +- strncase_l-sse4_2 strncase_l-avx \ +- strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \ +- strrchr-sse2 strrchr-avx2 \ +- strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \ +- strcat-avx2 strncat-avx2 \ +- strcat-ssse3 strncat-ssse3\ +- strcpy-avx2 strncpy-avx2 \ +- strcpy-sse2 stpcpy-sse2 \ +- strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ +- strcpy-sse2-unaligned strncpy-sse2-unaligned \ +- stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ +- stpcpy-avx2 stpncpy-avx2 \ +- strcat-sse2 \ +- strcat-sse2-unaligned strncat-sse2-unaligned \ +- strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ +- strcspn-sse2 strpbrk-sse2 strspn-sse2 \ +- strcspn-c strpbrk-c strspn-c varshift \ +- memset-avx512-no-vzeroupper \ +- memmove-sse2-unaligned-erms \ +- memmove-avx-unaligned-erms \ +- memmove-avx512-unaligned-erms \ +- memset-sse2-unaligned-erms \ +- memset-avx2-unaligned-erms \ +- memset-avx512-unaligned-erms \ +- memchr-avx2-rtm \ +- memcmp-avx2-movbe-rtm \ +- memmove-avx-unaligned-erms-rtm \ +- memrchr-avx2-rtm \ +- memset-avx2-unaligned-erms-rtm \ +- rawmemchr-avx2-rtm \ +- strchr-avx2-rtm \ +- strcmp-avx2-rtm \ +- strchrnul-avx2-rtm \ +- stpcpy-avx2-rtm \ +- stpncpy-avx2-rtm \ +- strcat-avx2-rtm \ +- strcpy-avx2-rtm \ +- strlen-avx2-rtm \ +- strncat-avx2-rtm \ +- strncmp-avx2-rtm \ +- strncpy-avx2-rtm \ +- strnlen-avx2-rtm \ +- strrchr-avx2-rtm \ +- memchr-evex \ +- memcmp-evex-movbe \ +- memmove-evex-unaligned-erms \ +- memrchr-evex \ +- memset-evex-unaligned-erms \ +- rawmemchr-evex \ +- stpcpy-evex \ +- stpncpy-evex \ +- strcat-evex \ +- strchr-evex \ +- strchrnul-evex \ +- strcmp-evex \ +- strcpy-evex \ +- strlen-evex \ +- strncat-evex \ +- strncmp-evex \ +- strncpy-evex \ +- strnlen-evex \ +- strrchr-evex \ +- memchr-evex-rtm \ +- 
rawmemchr-evex-rtm ++sysdep_routines += \ ++ bzero \ ++ memchr-avx2 \ ++ memchr-avx2-rtm \ ++ memchr-evex \ ++ memchr-evex-rtm \ ++ memchr-sse2 \ ++ memcmp-avx2-movbe \ ++ memcmp-avx2-movbe-rtm \ ++ memcmp-evex-movbe \ ++ memcmp-sse2 \ ++ memcmp-sse4 \ ++ memcmp-ssse3 \ ++ memcpy-ssse3 \ ++ memcpy-ssse3-back \ ++ memmove-avx-unaligned-erms \ ++ memmove-avx-unaligned-erms-rtm \ ++ memmove-avx512-no-vzeroupper \ ++ memmove-avx512-unaligned-erms \ ++ memmove-evex-unaligned-erms \ ++ memmove-sse2-unaligned-erms \ ++ memmove-ssse3 \ ++ memmove-ssse3-back \ ++ memrchr-avx2 \ ++ memrchr-avx2-rtm \ ++ memrchr-evex \ ++ memrchr-sse2 \ ++ memset-avx2-unaligned-erms \ ++ memset-avx2-unaligned-erms-rtm \ ++ memset-avx512-no-vzeroupper \ ++ memset-avx512-unaligned-erms \ ++ memset-evex-unaligned-erms \ ++ memset-sse2-unaligned-erms \ ++ rawmemchr-avx2 \ ++ rawmemchr-avx2-rtm \ ++ rawmemchr-evex \ ++ rawmemchr-evex-rtm \ ++ rawmemchr-sse2 \ ++ stpcpy-avx2 \ ++ stpcpy-avx2-rtm \ ++ stpcpy-evex \ ++ stpcpy-sse2 \ ++ stpcpy-sse2-unaligned \ ++ stpcpy-ssse3 \ ++ stpncpy-avx2 \ ++ stpncpy-avx2-rtm \ ++ stpncpy-c \ ++ stpncpy-evex \ ++ stpncpy-sse2-unaligned \ ++ stpncpy-ssse3 \ ++ strcasecmp_l-avx \ ++ strcasecmp_l-sse2 \ ++ strcasecmp_l-sse4_2 \ ++ strcasecmp_l-ssse3 \ ++ strcat-avx2 \ ++ strcat-avx2-rtm \ ++ strcat-evex \ ++ strcat-sse2 \ ++ strcat-sse2-unaligned \ ++ strcat-ssse3 \ ++ strchr-avx2 \ ++ strchr-avx2-rtm \ ++ strchr-evex \ ++ strchr-sse2 \ ++ strchr-sse2-no-bsf \ ++ strchrnul-avx2 \ ++ strchrnul-avx2-rtm \ ++ strchrnul-evex \ ++ strchrnul-sse2 \ ++ strcmp-avx2 \ ++ strcmp-avx2-rtm \ ++ strcmp-evex \ ++ strcmp-sse2 \ ++ strcmp-sse2-unaligned \ ++ strcmp-sse4_2 \ ++ strcmp-ssse3 \ ++ strcpy-avx2 \ ++ strcpy-avx2-rtm \ ++ strcpy-evex \ ++ strcpy-sse2 \ ++ strcpy-sse2-unaligned \ ++ strcpy-ssse3 \ ++ strcspn-c \ ++ strcspn-sse2 \ ++ strlen-avx2 \ ++ strlen-avx2-rtm \ ++ strlen-evex \ ++ strlen-sse2 \ ++ strncase_l-avx \ ++ strncase_l-sse2 \ ++ strncase_l-sse4_2 \ ++ strncase_l-ssse3 \ ++ strncat-avx2 \ ++ strncat-avx2-rtm \ ++ strncat-c \ ++ strncat-evex \ ++ strncat-sse2-unaligned \ ++ strncat-ssse3 \ ++ strncmp-avx2 \ ++ strncmp-avx2-rtm \ ++ strncmp-evex \ ++ strncmp-sse2 \ ++ strncmp-sse4_2 \ ++ strncmp-ssse3 \ ++ strncpy-avx2 \ ++ strncpy-avx2-rtm \ ++ strncpy-c \ ++ strncpy-evex \ ++ strncpy-sse2-unaligned \ ++ strncpy-ssse3 \ ++ strnlen-avx2 \ ++ strnlen-avx2-rtm \ ++ strnlen-evex \ ++ strnlen-sse2 \ ++ strpbrk-c \ ++ strpbrk-sse2 \ ++ strrchr-avx2 \ ++ strrchr-avx2-rtm \ ++ strrchr-evex \ ++ strrchr-sse2 \ ++ strspn-c \ ++ strspn-sse2 \ ++ strstr-sse2-unaligned \ ++ varshift \ ++# sysdep_routines + CFLAGS-varshift.c += -msse4 + CFLAGS-strcspn-c.c += -msse4 + CFLAGS-strpbrk-c.c += -msse4 +diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c +new file mode 100644 +index 00000000..58a14b2c +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/bzero.c +@@ -0,0 +1,106 @@ ++/* Multiple versions of bzero. ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++/* Define multiple versions only for the definition in libc. */ ++#if IS_IN (libc) ++# define __bzero __redirect___bzero ++# include ++# undef __bzero ++ ++# define SYMBOL_NAME __bzero ++# include ++ ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms) ++ attribute_hidden; ++ ++static inline void * ++IFUNC_SELECTOR (void) ++{ ++ const struct cpu_features* cpu_features = __get_cpu_features (); ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx512_unaligned_erms); ++ ++ return OPTIMIZE1 (avx512_unaligned); ++ } ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (evex_unaligned_erms); ++ ++ return OPTIMIZE1 (evex_unaligned); ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx2_unaligned_erms_rtm); ++ ++ return OPTIMIZE1 (avx2_unaligned_rtm); ++ } ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx2_unaligned_erms); ++ ++ return OPTIMIZE1 (avx2_unaligned); ++ } ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (sse2_unaligned_erms); ++ ++ return OPTIMIZE1 (sse2_unaligned); ++} ++ ++libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ()); ++ ++weak_alias (__bzero, bzero) ++#endif +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 8be0d78a..c963d391 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -282,6 +282,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __memset_avx512_no_vzeroupper) + ) + ++ /* Support sysdeps/x86_64/multiarch/bzero.c. 
*/ ++ IFUNC_IMPL (i, name, bzero, ++ IFUNC_IMPL_ADD (array, i, bzero, 1, ++ __bzero_sse2_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, 1, ++ __bzero_sse2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ CPU_FEATURE_USABLE (AVX2), ++ __bzero_avx2_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ CPU_FEATURE_USABLE (AVX2), ++ __bzero_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __bzero_avx2_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __bzero_avx2_unaligned_erms_rtm) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_evex_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_avx512_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_avx512_unaligned) ++ ) ++ + /* Support sysdeps/x86_64/multiarch/rawmemchr.c. */ + IFUNC_IMPL (i, name, rawmemchr, + IFUNC_IMPL_ADD (array, i, rawmemchr, +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +index 8ac3e479..5a5ee6f6 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +@@ -5,6 +5,7 @@ + + #define SECTION(p) p##.avx.rtm + #define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm ++#define BZERO_SYMBOL(p,s) p##_avx2_##s##_rtm + #define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm + + #include "memset-avx2-unaligned-erms.S" +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index c0bf2875..a093a283 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -14,6 +14,9 @@ + vmovd d, %xmm0; \ + movq r, %rax; + ++# define BZERO_ZERO_VEC0() \ ++ vpxor %xmm0, %xmm0, %xmm0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + MEMSET_SET_VEC0_AND_SET_RETURN(d, r) + +@@ -29,6 +32,9 @@ + # ifndef MEMSET_SYMBOL + # define MEMSET_SYMBOL(p,s) p##_avx2_##s + # endif ++# ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) p##_avx2_##s ++# endif + # ifndef WMEMSET_SYMBOL + # define WMEMSET_SYMBOL(p,s) p##_avx2_##s + # endif +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 5241216a..727c9213 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -19,6 +19,9 @@ + vpbroadcastb d, %VEC0; \ + movq r, %rax + ++# define BZERO_ZERO_VEC0() \ ++ vpxorq %XMM0, %XMM0, %XMM0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 63700215..5d8fa78f 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -19,6 +19,9 @@ + vpbroadcastb d, %VEC0; \ 
+ movq r, %rax + ++# define BZERO_ZERO_VEC0() \ ++ vpxorq %XMM0, %XMM0, %XMM0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax +diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +index 56b81f5c..8f579ad6 100644 +--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +@@ -22,6 +22,7 @@ + + #if IS_IN (libc) + # define MEMSET_SYMBOL(p,s) p##_sse2_##s ++# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) + # define WMEMSET_SYMBOL(p,s) p##_sse2_##s + + # ifdef SHARED +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index a67f9833..06f5f5d7 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -26,6 +26,10 @@ + + #include + ++#ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) ++#endif ++ + #ifndef MEMSET_CHK_SYMBOL + # define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s) + #endif +@@ -87,6 +91,18 @@ + # define XMM_SMALL 0 + #endif + ++#ifdef USE_LESS_VEC_MASK_STORE ++# define SET_REG64 rcx ++# define SET_REG32 ecx ++# define SET_REG16 cx ++# define SET_REG8 cl ++#else ++# define SET_REG64 rsi ++# define SET_REG32 esi ++# define SET_REG16 si ++# define SET_REG8 sil ++#endif ++ + #define PAGE_SIZE 4096 + + /* Macro to calculate size of small memset block for aligning +@@ -96,18 +112,6 @@ + + #ifndef SECTION + # error SECTION is not defined! +-#endif +- +- .section SECTION(.text),"ax",@progbits +-#if VEC_SIZE == 16 && IS_IN (libc) +-ENTRY (__bzero) +- mov %RDI_LP, %RAX_LP /* Set return value. */ +- mov %RSI_LP, %RDX_LP /* Set n. */ +- xorl %esi, %esi +- pxor %XMM0, %XMM0 +- jmp L(entry_from_bzero) +-END (__bzero) +-weak_alias (__bzero, bzero) + #endif + + #if IS_IN (libc) +@@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + WMEMSET_VDUP_TO_VEC0_LOW() + cmpq $VEC_SIZE, %rdx +- jb L(less_vec_no_vdup) ++ jb L(less_vec_from_wmemset) + WMEMSET_VDUP_TO_VEC0_HIGH() + jmp L(entry_from_wmemset) + END (WMEMSET_SYMBOL (__wmemset, unaligned)) + #endif + ++ENTRY (BZERO_SYMBOL(__bzero, unaligned)) ++#if VEC_SIZE > 16 ++ BZERO_ZERO_VEC0 () ++#endif ++ mov %RDI_LP, %RAX_LP ++ mov %RSI_LP, %RDX_LP ++#ifndef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++#endif ++ cmp $VEC_SIZE, %RDX_LP ++ jb L(less_vec_no_vdup) ++#ifdef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++#endif ++#if VEC_SIZE <= 16 ++ BZERO_ZERO_VEC0 () ++#endif ++ cmp $(VEC_SIZE * 2), %RDX_LP ++ ja L(more_2x_vec) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) ++ VZEROUPPER_RETURN ++END (BZERO_SYMBOL(__bzero, unaligned)) ++ + #if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + cmp %RDX_LP, %RCX_LP +@@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + /* Clear the upper 32 bits. 
*/ + mov %edx, %edx + # endif +-L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + MEMSET_VDUP_TO_VEC0_HIGH() +@@ -187,6 +215,31 @@ END (__memset_erms) + END (MEMSET_SYMBOL (__memset, erms)) + # endif + ++ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6) ++# if VEC_SIZE > 16 ++ BZERO_ZERO_VEC0 () ++# endif ++ mov %RDI_LP, %RAX_LP ++ mov %RSI_LP, %RDX_LP ++# ifndef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++# endif ++ cmp $VEC_SIZE, %RDX_LP ++ jb L(less_vec_no_vdup) ++# ifdef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++# endif ++# if VEC_SIZE <= 16 ++ BZERO_ZERO_VEC0 () ++# endif ++ cmp $(VEC_SIZE * 2), %RDX_LP ++ ja L(stosb_more_2x_vec) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) ++ VZEROUPPER_RETURN ++END (BZERO_SYMBOL(__bzero, unaligned_erms)) ++ + # if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + cmp %RDX_LP, %RCX_LP +@@ -229,6 +282,7 @@ L(last_2x_vec): + .p2align 4,, 10 + L(less_vec): + L(less_vec_no_vdup): ++L(less_vec_from_wmemset): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! +@@ -374,8 +428,11 @@ L(less_vec): + /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to + xmm). This is only does anything for AVX2. */ + MEMSET_VDUP_TO_VEC0_LOW () ++L(less_vec_from_wmemset): ++#if VEC_SIZE > 16 + L(less_vec_no_vdup): + #endif ++#endif + L(cross_page): + #if VEC_SIZE > 32 + cmpl $32, %edx +@@ -386,7 +443,10 @@ L(cross_page): + jge L(between_16_31) + #endif + #ifndef USE_XMM_LESS_VEC +- MOVQ %XMM0, %rcx ++ MOVQ %XMM0, %SET_REG64 ++#endif ++#if VEC_SIZE <= 16 ++L(less_vec_no_vdup): + #endif + cmpl $8, %edx + jge L(between_8_15) +@@ -395,7 +455,7 @@ L(cross_page): + cmpl $1, %edx + jg L(between_2_3) + jl L(between_0_0) +- movb %sil, (%LESS_VEC_REG) ++ movb %SET_REG8, (%LESS_VEC_REG) + L(between_0_0): + ret + +@@ -428,8 +488,8 @@ L(between_8_15): + MOVQ %XMM0, (%rdi) + MOVQ %XMM0, -8(%rdi, %rdx) + #else +- movq %rcx, (%LESS_VEC_REG) +- movq %rcx, -8(%LESS_VEC_REG, %rdx) ++ movq %SET_REG64, (%LESS_VEC_REG) ++ movq %SET_REG64, -8(%LESS_VEC_REG, %rdx) + #endif + ret + +@@ -442,8 +502,8 @@ L(between_4_7): + MOVD %XMM0, (%rdi) + MOVD %XMM0, -4(%rdi, %rdx) + #else +- movl %ecx, (%LESS_VEC_REG) +- movl %ecx, -4(%LESS_VEC_REG, %rdx) ++ movl %SET_REG32, (%LESS_VEC_REG) ++ movl %SET_REG32, -4(%LESS_VEC_REG, %rdx) + #endif + ret + +@@ -452,12 +512,12 @@ L(between_4_7): + L(between_2_3): + /* From 2 to 3. No branch when size == 2. 
*/ + #ifdef USE_XMM_LESS_VEC +- movb %sil, (%rdi) +- movb %sil, 1(%rdi) +- movb %sil, -1(%rdi, %rdx) ++ movb %SET_REG8, (%rdi) ++ movb %SET_REG8, 1(%rdi) ++ movb %SET_REG8, -1(%rdi, %rdx) + #else +- movw %cx, (%LESS_VEC_REG) +- movb %sil, -1(%LESS_VEC_REG, %rdx) ++ movw %SET_REG16, (%LESS_VEC_REG) ++ movb %SET_REG8, -1(%LESS_VEC_REG, %rdx) + #endif + ret + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +-- +GitLab + diff --git a/SOURCES/ia-opt-str-wcs_rchr-avx2.patch b/SOURCES/ia-opt-str-wcs_rchr-avx2.patch new file mode 100644 index 0000000..3bb015a --- /dev/null +++ b/SOURCES/ia-opt-str-wcs_rchr-avx2.patch @@ -0,0 +1,502 @@ +From 0566d7c3c34685183e4f17f209651b0fba646df8 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 21 Apr 2022 20:52:29 -0500 +Subject: [PATCH] x86: Optimize {str|wcs}rchr-avx2 + +The new code unrolls the main loop slightly without adding too much +overhead and minimizes the comparisons for the search CHAR. + +Geometric Mean of all benchmarks New / Old: 0.832 +See email for all results. + +Full xcheck passes on x86_64 with and without multiarch enabled. +Reviewed-by: H.J. Lu + +(cherry picked from commit df7e295d18ffa34f629578c0017a9881af7620f6) +--- + sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++--------- + 1 file changed, 269 insertions(+), 157 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S +index c949410b..3d26fad4 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S +@@ -27,9 +27,13 @@ + # ifdef USE_AS_WCSRCHR + # define VPBROADCAST vpbroadcastd + # define VPCMPEQ vpcmpeqd ++# define VPMIN vpminud ++# define CHAR_SIZE 4 + # else + # define VPBROADCAST vpbroadcastb + # define VPCMPEQ vpcmpeqb ++# define VPMIN vpminub ++# define CHAR_SIZE 1 + # endif + + # ifndef VZEROUPPER +@@ -41,196 +45,304 @@ + # endif + + # define VEC_SIZE 32 ++# define PAGE_SIZE 4096 + +- .section SECTION(.text),"ax",@progbits +-ENTRY (STRRCHR) +- movd %esi, %xmm4 +- movl %edi, %ecx ++ .section SECTION(.text), "ax", @progbits ++ENTRY(STRRCHR) ++ movd %esi, %xmm7 ++ movl %edi, %eax + /* Broadcast CHAR to YMM4. */ +- VPBROADCAST %xmm4, %ymm4 ++ VPBROADCAST %xmm7, %ymm7 + vpxor %xmm0, %xmm0, %xmm0 + +- /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ /* Shift here instead of `andl` to save code size (saves a fetch ++ block). */ ++ sall $20, %eax ++ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax ++ ja L(cross_page) + ++L(page_cross_continue): + vmovdqu (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- addq $VEC_SIZE, %rdi ++ /* Check end of string match. */ ++ VPCMPEQ %ymm1, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ testl %ecx, %ecx ++ jz L(aligned_more) ++ ++ /* Only check match with search CHAR if needed. */ ++ VPCMPEQ %ymm1, %ymm7, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Check if match before first zero. */ ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jz L(ret0) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If ++ search CHAR is zero we are correct. Either way `andq ++ -CHAR_SIZE, %rax` gets the correct result. */ ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++L(ret0): ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++ ++ /* Returns for first vec x1/x2 have hard coded backward search ++ path for earlier matches. 
*/ ++ .p2align 4,, 10 ++L(first_vec_x1): ++ VPCMPEQ %ymm2, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jnz L(first_vec_x1_return) ++ ++ .p2align 4,, 4 ++L(first_vec_x0_test): ++ VPCMPEQ %ymm1, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ testl %eax, %eax ++ jz L(ret1) ++ bsrl %eax, %eax ++ addq %r8, %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++L(ret1): ++ VZEROUPPER_RETURN + ++ .p2align 4,, 10 ++L(first_vec_x0_x1_test): ++ VPCMPEQ %ymm2, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ /* Check ymm2 for search CHAR match. If no match then check ymm1 ++ before returning. */ + testl %eax, %eax +- jnz L(first_vec) ++ jz L(first_vec_x0_test) ++ .p2align 4,, 4 ++L(first_vec_x1_return): ++ bsrl %eax, %eax ++ leaq 1(%rdi, %rax), %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++ VZEROUPPER_RETURN + +- testl %ecx, %ecx +- jnz L(return_null) + +- andq $-VEC_SIZE, %rdi +- xorl %edx, %edx +- jmp L(aligned_loop) ++ .p2align 4,, 10 ++L(first_vec_x2): ++ VPCMPEQ %ymm3, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ blsmskl %ecx, %ecx ++ /* If no in-range search CHAR match in ymm3 then need to check ++ ymm1/ymm2 for an earlier match (we delay checking search ++ CHAR matches until needed). */ ++ andl %ecx, %eax ++ jz L(first_vec_x0_x1_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE + 1)(%rdi, %rax), %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++ VZEROUPPER_RETURN ++ + + .p2align 4 +-L(first_vec): +- /* Check if there is a nul CHAR. */ ++L(aligned_more): ++ /* Save original pointer if match was in VEC 0. */ ++ movq %rdi, %r8 ++ ++ /* Align src. */ ++ orq $(VEC_SIZE - 1), %rdi ++ vmovdqu 1(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx + testl %ecx, %ecx +- jnz L(char_and_nul_in_first_vec) ++ jnz L(first_vec_x1) + +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- movq %rdi, %rsi +- andq $-VEC_SIZE, %rdi +- jmp L(aligned_loop) ++ vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3 ++ VPCMPEQ %ymm3, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x2) + ++ /* Save pointer again before realigning. */ ++ movq %rdi, %rsi ++ addq $(VEC_SIZE + 1), %rdi ++ andq $-(VEC_SIZE * 2), %rdi + .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %edx +- vpmovmskb %ymm3, %eax +- shrl %cl, %edx +- shrl %cl, %eax +- addq $VEC_SIZE, %rdi +- +- /* Check if there is a CHAR. */ ++L(first_aligned_loop): ++ /* Do 2x VEC at a time. Any more and the cost of finding the ++ match outweights loop benefit. */ ++ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 ++ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 ++ ++ VPCMPEQ %ymm4, %ymm7, %ymm6 ++ VPMIN %ymm4, %ymm5, %ymm8 ++ VPCMPEQ %ymm5, %ymm7, %ymm10 ++ vpor %ymm6, %ymm10, %ymm5 ++ VPCMPEQ %ymm8, %ymm0, %ymm8 ++ vpor %ymm5, %ymm8, %ymm9 ++ ++ vpmovmskb %ymm9, %eax ++ addq $(VEC_SIZE * 2), %rdi ++ /* No zero or search CHAR. */ + testl %eax, %eax +- jnz L(found_char) +- +- testl %edx, %edx +- jnz L(return_null) ++ jz L(first_aligned_loop) + +- jmp L(aligned_loop) +- +- .p2align 4 +-L(found_char): +- testl %edx, %edx +- jnz L(char_and_nul) ++ /* If no zero CHAR then go to second loop (this allows us to ++ throw away all prior work). */ ++ vpmovmskb %ymm8, %ecx ++ testl %ecx, %ecx ++ jz L(second_aligned_loop_prep) + +- /* Remember the match and keep searching. 
*/ +- movl %eax, %edx +- leaq (%rdi, %rcx), %rsi ++ /* Search char could be zero so we need to get the true match. ++ */ ++ vpmovmskb %ymm5, %eax ++ testl %eax, %eax ++ jnz L(first_aligned_loop_return) + +- .p2align 4 +-L(aligned_loop): +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) +- +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- add $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx ++ .p2align 4,, 4 ++L(first_vec_x1_or_x2): ++ VPCMPEQ %ymm3, %ymm7, %ymm3 ++ VPCMPEQ %ymm2, %ymm7, %ymm2 + vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) +- +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) +- +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jz L(aligned_loop) +- +- .p2align 4 +-L(char_nor_null): +- /* Find a CHAR or a nul CHAR in a loop. */ +- testl %eax, %eax +- jnz L(match) +-L(return_value): +- testl %edx, %edx +- jz L(return_null) +- movl %edx, %eax +- movq %rsi, %rdi ++ vpmovmskb %ymm2, %edx ++ /* Use add for macro-fusion. */ ++ addq %rax, %rdx ++ jz L(first_vec_x0_test) ++ /* NB: We could move this shift to before the branch and save a ++ bit of code size / performance on the fall through. The ++ branch leads to the null case which generally seems hotter ++ than char in first 3x VEC. */ ++ salq $32, %rax ++ addq %rdx, %rax ++ bsrq %rax, %rax ++ leaq 1(%rsi, %rax), %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++ VZEROUPPER_RETURN + ++ .p2align 4,, 8 ++L(first_aligned_loop_return): ++ VPCMPEQ %ymm4, %ymm0, %ymm4 ++ vpmovmskb %ymm4, %edx ++ salq $32, %rcx ++ orq %rdx, %rcx ++ ++ vpmovmskb %ymm10, %eax ++ vpmovmskb %ymm6, %edx ++ salq $32, %rax ++ orq %rdx, %rax ++ blsmskq %rcx, %rcx ++ andq %rcx, %rax ++ jz L(first_vec_x1_or_x2) ++ ++ bsrq %rax, %rax ++ leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax + # ifdef USE_AS_WCSRCHR +- /* Keep the first bit for each matching CHAR for bsr. */ +- andl $0x11111111, %eax ++ andq $-CHAR_SIZE, %rax + # endif +- bsrl %eax, %eax +- leaq -VEC_SIZE(%rdi, %rax), %rax +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ VZEROUPPER_RETURN + ++ /* Search char cannot be zero. */ + .p2align 4 +-L(match): +- /* Find a CHAR. Check if there is a nul CHAR. */ +- vpmovmskb %ymm2, %ecx +- testl %ecx, %ecx +- jnz L(find_nul) +- +- /* Remember the match and keep searching. */ +- movl %eax, %edx ++L(second_aligned_loop_set_furthest_match): ++ /* Save VEC and pointer from most recent match. */ ++L(second_aligned_loop_prep): + movq %rdi, %rsi +- jmp L(aligned_loop) ++ vmovdqu %ymm6, %ymm2 ++ vmovdqu %ymm10, %ymm3 + + .p2align 4 +-L(find_nul): +-# ifdef USE_AS_WCSRCHR +- /* Keep the first bit for each matching CHAR for bsr. */ +- andl $0x11111111, %ecx +- andl $0x11111111, %eax +-# endif +- /* Mask out any matching bits after the nul CHAR. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax ++L(second_aligned_loop): ++ /* Search 2x at at time. 
*/ ++ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 ++ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 ++ ++ VPCMPEQ %ymm4, %ymm7, %ymm6 ++ VPMIN %ymm4, %ymm5, %ymm1 ++ VPCMPEQ %ymm5, %ymm7, %ymm10 ++ vpor %ymm6, %ymm10, %ymm5 ++ VPCMPEQ %ymm1, %ymm0, %ymm1 ++ vpor %ymm5, %ymm1, %ymm9 ++ ++ vpmovmskb %ymm9, %eax ++ addq $(VEC_SIZE * 2), %rdi + testl %eax, %eax +- /* If there is no CHAR here, return the remembered one. */ +- jz L(return_value) +- bsrl %eax, %eax +- leaq -VEC_SIZE(%rdi, %rax), %rax +- VZEROUPPER_RETURN +- +- .p2align 4 +-L(char_and_nul): +- /* Find both a CHAR and a nul CHAR. */ +- addq %rcx, %rdi +- movl %edx, %ecx +-L(char_and_nul_in_first_vec): +-# ifdef USE_AS_WCSRCHR +- /* Keep the first bit for each matching CHAR for bsr. */ +- andl $0x11111111, %ecx +- andl $0x11111111, %eax +-# endif +- /* Mask out any matching bits after the nul CHAR. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax ++ jz L(second_aligned_loop) ++ vpmovmskb %ymm1, %ecx ++ testl %ecx, %ecx ++ jz L(second_aligned_loop_set_furthest_match) ++ vpmovmskb %ymm5, %eax + testl %eax, %eax +- /* Return null pointer if the nul CHAR comes first. */ +- jz L(return_null) +- bsrl %eax, %eax +- leaq -VEC_SIZE(%rdi, %rax), %rax ++ jnz L(return_new_match) ++ ++ /* This is the hot patch. We know CHAR is inbounds and that ++ ymm3/ymm2 have latest match. */ ++ .p2align 4,, 4 ++L(return_old_match): ++ vpmovmskb %ymm3, %eax ++ vpmovmskb %ymm2, %edx ++ salq $32, %rax ++ orq %rdx, %rax ++ bsrq %rax, %rax ++ /* Search char cannot be zero so safe to just use lea for ++ wcsrchr. */ ++ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax + VZEROUPPER_RETURN + +- .p2align 4 +-L(return_null): +- xorl %eax, %eax ++ /* Last iteration also potentially has a match. */ ++ .p2align 4,, 8 ++L(return_new_match): ++ VPCMPEQ %ymm4, %ymm0, %ymm4 ++ vpmovmskb %ymm4, %edx ++ salq $32, %rcx ++ orq %rdx, %rcx ++ ++ vpmovmskb %ymm10, %eax ++ vpmovmskb %ymm6, %edx ++ salq $32, %rax ++ orq %rdx, %rax ++ blsmskq %rcx, %rcx ++ andq %rcx, %rax ++ jz L(return_old_match) ++ bsrq %rax, %rax ++ /* Search char cannot be zero so safe to just use lea for ++ wcsrchr. */ ++ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax + VZEROUPPER_RETURN + +-END (STRRCHR) ++ .p2align 4,, 4 ++L(cross_page): ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rsi ++ vmovdqu (%rsi), %ymm1 ++ VPCMPEQ %ymm1, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ /* Shift out zero CHAR matches that are before the begining of ++ src (rdi). */ ++ shrxl %edi, %ecx, %ecx ++ testl %ecx, %ecx ++ jz L(page_cross_continue) ++ VPCMPEQ %ymm1, %ymm7, %ymm1 ++ vpmovmskb %ymm1, %eax ++ ++ /* Shift out search CHAR matches that are before the begining of ++ src (rdi). */ ++ shrxl %edi, %eax, %eax ++ blsmskl %ecx, %ecx ++ /* Check if any search CHAR match in range. */ ++ andl %ecx, %eax ++ jz L(ret2) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++L(ret2): ++ VZEROUPPER_RETURN ++END(STRRCHR) + #endif +-- +GitLab + diff --git a/SOURCES/ia-opt-str-wcs_rchr-evex.patch b/SOURCES/ia-opt-str-wcs_rchr-evex.patch new file mode 100644 index 0000000..e130e3e --- /dev/null +++ b/SOURCES/ia-opt-str-wcs_rchr-evex.patch @@ -0,0 +1,559 @@ +From 9ef733cbe224b1cc12e4c8acac09627ccb3a00d8 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 21 Apr 2022 20:52:30 -0500 +Subject: [PATCH] x86: Optimize {str|wcs}rchr-evex + +The new code unrolls the main loop slightly without adding too much +overhead and minimizes the comparisons for the search CHAR. 
+ +Geometric Mean of all benchmarks New / Old: 0.755 +See email for all results. + +Full xcheck passes on x86_64 with and without multiarch enabled. +Reviewed-by: H.J. Lu + +(cherry picked from commit c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d) +--- + sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++--------- + 1 file changed, 290 insertions(+), 181 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S +index f920b5a5..f5b6d755 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-evex.S ++++ b/sysdeps/x86_64/multiarch/strrchr-evex.S +@@ -24,242 +24,351 @@ + # define STRRCHR __strrchr_evex + # endif + +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSRCHR ++# define SHIFT_REG esi ++ ++# define kunpck kunpckbw ++# define kmov_2x kmovd ++# define maskz_2x ecx ++# define maskm_2x eax ++# define CHAR_SIZE 4 ++# define VPMIN vpminud ++# define VPTESTN vptestnmd + # define VPBROADCAST vpbroadcastd +-# define VPCMP vpcmpd +-# define SHIFT_REG r8d ++# define VPCMP vpcmpd + # else ++# define SHIFT_REG edi ++ ++# define kunpck kunpckdq ++# define kmov_2x kmovq ++# define maskz_2x rcx ++# define maskm_2x rax ++ ++# define CHAR_SIZE 1 ++# define VPMIN vpminub ++# define VPTESTN vptestnmb + # define VPBROADCAST vpbroadcastb +-# define VPCMP vpcmpb +-# define SHIFT_REG ecx ++# define VPCMP vpcmpb + # endif + + # define XMMZERO xmm16 + # define YMMZERO ymm16 + # define YMMMATCH ymm17 +-# define YMM1 ymm18 ++# define YMMSAVE ymm18 ++ ++# define YMM1 ymm19 ++# define YMM2 ymm20 ++# define YMM3 ymm21 ++# define YMM4 ymm22 ++# define YMM5 ymm23 ++# define YMM6 ymm24 ++# define YMM7 ymm25 ++# define YMM8 ymm26 + +-# define VEC_SIZE 32 + +- .section .text.evex,"ax",@progbits +-ENTRY (STRRCHR) +- movl %edi, %ecx ++# define VEC_SIZE 32 ++# define PAGE_SIZE 4096 ++ .section .text.evex, "ax", @progbits ++ENTRY(STRRCHR) ++ movl %edi, %eax + /* Broadcast CHAR to YMMMATCH. */ + VPBROADCAST %esi, %YMMMATCH + +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO +- +- /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(cross_page_boundary) + ++L(page_cross_continue): + VMOVU (%rdi), %YMM1 +- +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ /* k0 has a 1 for each zero CHAR in YMM1. */ ++ VPTESTN %YMM1, %YMM1, %k0 + kmovd %k0, %ecx +- kmovd %k1, %eax +- +- addq $VEC_SIZE, %rdi +- +- testl %eax, %eax +- jnz L(first_vec) +- + testl %ecx, %ecx +- jnz L(return_null) +- +- andq $-VEC_SIZE, %rdi +- xorl %edx, %edx +- jmp L(aligned_loop) +- +- .p2align 4 +-L(first_vec): +- /* Check if there is a null byte. */ +- testl %ecx, %ecx +- jnz L(char_and_nul_in_first_vec) +- +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- movq %rdi, %rsi +- andq $-VEC_SIZE, %rdi +- jmp L(aligned_loop) +- +- .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi ++ jz L(aligned_more) ++ /* fallthrough: zero CHAR in first VEC. */ + ++ /* K1 has a 1 for each search CHAR match in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k1, %eax ++ /* Build mask up until first zero CHAR (used to mask of ++ potential search CHAR matches past the end of the string). 
++ */ ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jz L(ret0) ++ /* Get last match (the `andl` removed any out of bounds ++ matches). */ ++ bsrl %eax, %eax + # ifdef USE_AS_WCSRCHR +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 +- bytes. */ +- movl %ecx, %SHIFT_REG +- sarl $2, %SHIFT_REG ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++# else ++ addq %rdi, %rax + # endif ++L(ret0): ++ ret + +- VMOVA (%rdi), %YMM1 +- +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ ++ /* Returns for first vec x1/x2/x3 have hard coded backward ++ search path for earlier matches. */ ++ .p2align 4,, 6 ++L(first_vec_x1): ++ VPCMP $0, %YMMMATCH, %YMM2, %k1 ++ kmovd %k1, %eax ++ blsmskl %ecx, %ecx ++ /* eax non-zero if search CHAR in range. */ ++ andl %ecx, %eax ++ jnz L(first_vec_x1_return) ++ ++ /* fallthrough: no match in YMM2 then need to check for earlier ++ matches (in YMM1). */ ++ .p2align 4,, 4 ++L(first_vec_x0_test): + VPCMP $0, %YMMMATCH, %YMM1, %k1 +- kmovd %k0, %edx + kmovd %k1, %eax +- +- shrxl %SHIFT_REG, %edx, %edx +- shrxl %SHIFT_REG, %eax, %eax +- addq $VEC_SIZE, %rdi +- +- /* Check if there is a CHAR. */ + testl %eax, %eax +- jnz L(found_char) +- +- testl %edx, %edx +- jnz L(return_null) +- +- jmp L(aligned_loop) +- +- .p2align 4 +-L(found_char): +- testl %edx, %edx +- jnz L(char_and_nul) +- +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- leaq (%rdi, %rcx), %rsi ++ jz L(ret1) ++ bsrl %eax, %eax ++# ifdef USE_AS_WCSRCHR ++ leaq (%rsi, %rax, CHAR_SIZE), %rax ++# else ++ addq %rsi, %rax ++# endif ++L(ret1): ++ ret + +- .p2align 4 +-L(aligned_loop): +- VMOVA (%rdi), %YMM1 +- addq $VEC_SIZE, %rdi ++ .p2align 4,, 10 ++L(first_vec_x1_or_x2): ++ VPCMP $0, %YMM3, %YMMMATCH, %k3 ++ VPCMP $0, %YMM2, %YMMMATCH, %k2 ++ /* K2 and K3 have 1 for any search CHAR match. Test if any ++ matches between either of them. Otherwise check YMM1. */ ++ kortestd %k2, %k3 ++ jz L(first_vec_x0_test) ++ ++ /* Guranteed that YMM2 and YMM3 are within range so merge the ++ two bitmasks then get last result. */ ++ kunpck %k2, %k3, %k3 ++ kmovq %k3, %rax ++ bsrq %rax, %rax ++ leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax ++ ret + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 +- kmovd %k0, %ecx ++ .p2align 4,, 6 ++L(first_vec_x3): ++ VPCMP $0, %YMMMATCH, %YMM4, %k1 + kmovd %k1, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) ++ blsmskl %ecx, %ecx ++ /* If no search CHAR match in range check YMM1/YMM2/YMM3. */ ++ andl %ecx, %eax ++ jz L(first_vec_x1_or_x2) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- VMOVA (%rdi), %YMM1 +- add $VEC_SIZE, %rdi ++ .p2align 4,, 6 ++L(first_vec_x0_x1_test): ++ VPCMP $0, %YMMMATCH, %YMM2, %k1 ++ kmovd %k1, %eax ++ /* Check YMM2 for last match first. If no match try YMM1. */ ++ testl %eax, %eax ++ jz L(first_vec_x0_test) ++ .p2align 4,, 4 ++L(first_vec_x1_return): ++ bsrl %eax, %eax ++ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 +- kmovd %k0, %ecx ++ .p2align 4,, 10 ++L(first_vec_x2): ++ VPCMP $0, %YMMMATCH, %YMM3, %k1 + kmovd %k1, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) ++ blsmskl %ecx, %ecx ++ /* Check YMM3 for last match first. 
If no match try YMM2/YMM1. ++ */ ++ andl %ecx, %eax ++ jz L(first_vec_x0_x1_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- VMOVA (%rdi), %YMM1 +- addq $VEC_SIZE, %rdi + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ .p2align 4 ++L(aligned_more): ++ /* Need to keep original pointer incase YMM1 has last match. */ ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rdi ++ VMOVU VEC_SIZE(%rdi), %YMM2 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %ecx +- kmovd %k1, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) ++ testl %ecx, %ecx ++ jnz L(first_vec_x1) + +- VMOVA (%rdi), %YMM1 +- addq $VEC_SIZE, %rdi ++ VMOVU (VEC_SIZE * 2)(%rdi), %YMM3 ++ VPTESTN %YMM3, %YMM3, %k0 ++ kmovd %k0, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x2) + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ VMOVU (VEC_SIZE * 3)(%rdi), %YMM4 ++ VPTESTN %YMM4, %YMM4, %k0 + kmovd %k0, %ecx +- kmovd %k1, %eax +- orl %eax, %ecx +- jz L(aligned_loop) ++ movq %rdi, %r8 ++ testl %ecx, %ecx ++ jnz L(first_vec_x3) + ++ andq $-(VEC_SIZE * 2), %rdi + .p2align 4 +-L(char_nor_null): +- /* Find a CHAR or a null byte in a loop. */ ++L(first_aligned_loop): ++ /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee ++ they don't store a match. */ ++ VMOVA (VEC_SIZE * 4)(%rdi), %YMM5 ++ VMOVA (VEC_SIZE * 5)(%rdi), %YMM6 ++ ++ VPCMP $0, %YMM5, %YMMMATCH, %k2 ++ vpxord %YMM6, %YMMMATCH, %YMM7 ++ ++ VPMIN %YMM5, %YMM6, %YMM8 ++ VPMIN %YMM8, %YMM7, %YMM7 ++ ++ VPTESTN %YMM7, %YMM7, %k1 ++ subq $(VEC_SIZE * -2), %rdi ++ kortestd %k1, %k2 ++ jz L(first_aligned_loop) ++ ++ VPCMP $0, %YMM6, %YMMMATCH, %k3 ++ VPTESTN %YMM8, %YMM8, %k1 ++ ktestd %k1, %k1 ++ jz L(second_aligned_loop_prep) ++ ++ kortestd %k2, %k3 ++ jnz L(return_first_aligned_loop) ++ ++ .p2align 4,, 6 ++L(first_vec_x1_or_x2_or_x3): ++ VPCMP $0, %YMM4, %YMMMATCH, %k4 ++ kmovd %k4, %eax + testl %eax, %eax +- jnz L(match) +-L(return_value): +- testl %edx, %edx +- jz L(return_null) +- movl %edx, %eax +- movq %rsi, %rdi ++ jz L(first_vec_x1_or_x2) + bsrl %eax, %eax +-# ifdef USE_AS_WCSRCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq -VEC_SIZE(%rdi, %rax, 4), %rax +-# else +- leaq -VEC_SIZE(%rdi, %rax), %rax +-# endif ++ leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax + ret + +- .p2align 4 +-L(match): +- /* Find a CHAR. Check if there is a null byte. */ +- kmovd %k0, %ecx +- testl %ecx, %ecx +- jnz L(find_nul) ++ .p2align 4,, 8 ++L(return_first_aligned_loop): ++ VPTESTN %YMM5, %YMM5, %k0 ++ kunpck %k0, %k1, %k0 ++ kmov_2x %k0, %maskz_2x ++ ++ blsmsk %maskz_2x, %maskz_2x ++ kunpck %k2, %k3, %k3 ++ kmov_2x %k3, %maskm_2x ++ and %maskz_2x, %maskm_2x ++ jz L(first_vec_x1_or_x2_or_x3) + +- /* Remember the match and keep searching. */ +- movl %eax, %edx ++ bsr %maskm_2x, %maskm_2x ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ ++ .p2align 4 ++ /* We can throw away the work done for the first 4x checks here ++ as we have a later match. This is the 'fast' path persay. ++ */ ++L(second_aligned_loop_prep): ++L(second_aligned_loop_set_furthest_match): + movq %rdi, %rsi +- jmp L(aligned_loop) ++ kunpck %k2, %k3, %k4 + + .p2align 4 +-L(find_nul): +- /* Mask out any matching bits after the null byte. 
*/ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax +- testl %eax, %eax +- /* If there is no CHAR here, return the remembered one. */ +- jz L(return_value) +- bsrl %eax, %eax ++L(second_aligned_loop): ++ VMOVU (VEC_SIZE * 4)(%rdi), %YMM1 ++ VMOVU (VEC_SIZE * 5)(%rdi), %YMM2 ++ ++ VPCMP $0, %YMM1, %YMMMATCH, %k2 ++ vpxord %YMM2, %YMMMATCH, %YMM3 ++ ++ VPMIN %YMM1, %YMM2, %YMM4 ++ VPMIN %YMM3, %YMM4, %YMM3 ++ ++ VPTESTN %YMM3, %YMM3, %k1 ++ subq $(VEC_SIZE * -2), %rdi ++ kortestd %k1, %k2 ++ jz L(second_aligned_loop) ++ ++ VPCMP $0, %YMM2, %YMMMATCH, %k3 ++ VPTESTN %YMM4, %YMM4, %k1 ++ ktestd %k1, %k1 ++ jz L(second_aligned_loop_set_furthest_match) ++ ++ kortestd %k2, %k3 ++ /* branch here because there is a significant advantage interms ++ of output dependency chance in using edx. */ ++ jnz L(return_new_match) ++L(return_old_match): ++ kmovq %k4, %rax ++ bsrq %rax, %rax ++ leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax ++ ret ++ ++L(return_new_match): ++ VPTESTN %YMM1, %YMM1, %k0 ++ kunpck %k0, %k1, %k0 ++ kmov_2x %k0, %maskz_2x ++ ++ blsmsk %maskz_2x, %maskz_2x ++ kunpck %k2, %k3, %k3 ++ kmov_2x %k3, %maskm_2x ++ and %maskz_2x, %maskm_2x ++ jz L(return_old_match) ++ ++ bsr %maskm_2x, %maskm_2x ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ ++L(cross_page_boundary): ++ /* eax contains all the page offset bits of src (rdi). `xor rdi, ++ rax` sets pointer will all page offset bits cleared so ++ offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC ++ before page cross (guranteed to be safe to read). Doing this ++ as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves ++ a bit of code size. */ ++ xorq %rdi, %rax ++ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1 ++ VPTESTN %YMM1, %YMM1, %k0 ++ kmovd %k0, %ecx ++ ++ /* Shift out zero CHAR matches that are before the begining of ++ src (rdi). */ + # ifdef USE_AS_WCSRCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq -VEC_SIZE(%rdi, %rax, 4), %rax +-# else +- leaq -VEC_SIZE(%rdi, %rax), %rax ++ movl %edi, %esi ++ andl $(VEC_SIZE - 1), %esi ++ shrl $2, %esi + # endif +- ret ++ shrxl %SHIFT_REG, %ecx, %ecx + +- .p2align 4 +-L(char_and_nul): +- /* Find both a CHAR and a null byte. */ +- addq %rcx, %rdi +- movl %edx, %ecx +-L(char_and_nul_in_first_vec): +- /* Mask out any matching bits after the null byte. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax +- testl %eax, %eax +- /* Return null pointer if the null byte comes first. */ +- jz L(return_null) ++ testl %ecx, %ecx ++ jz L(page_cross_continue) ++ ++ /* Found zero CHAR so need to test for search CHAR. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k1, %eax ++ /* Shift out search CHAR matches that are before the begining of ++ src (rdi). */ ++ shrxl %SHIFT_REG, %eax, %eax ++ ++ /* Check if any search CHAR match in range. */ ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jz L(ret3) + bsrl %eax, %eax + # ifdef USE_AS_WCSRCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- leaq -VEC_SIZE(%rdi, %rax, 4), %rax ++ leaq (%rdi, %rax, CHAR_SIZE), %rax + # else +- leaq -VEC_SIZE(%rdi, %rax), %rax ++ addq %rdi, %rax + # endif ++L(ret3): + ret + +- .p2align 4 +-L(return_null): +- xorl %eax, %eax +- ret +- +-END (STRRCHR) ++END(STRRCHR) + #endif +-- +GitLab + diff --git a/SOURCES/ia-opt-str-wcs_rchr-sse2.patch b/SOURCES/ia-opt-str-wcs_rchr-sse2.patch new file mode 100644 index 0000000..054edd6 --- /dev/null +++ b/SOURCES/ia-opt-str-wcs_rchr-sse2.patch @@ -0,0 +1,873 @@ +From 70016c060a99e8534469cdeb847eabe60bff2b54 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 21 Apr 2022 20:52:28 -0500 +Subject: [PATCH] x86: Optimize {str|wcs}rchr-sse2 + +The new code unrolls the main loop slightly without adding too much +overhead and minimizes the comparisons for the search CHAR. + +Geometric Mean of all benchmarks New / Old: 0.741 +See email for all results. + +Full xcheck passes on x86_64 with and without multiarch enabled. +Reviewed-by: H.J. Lu + +(cherry picked from commit 5307aa9c1800f36a64c183c091c9af392c1fa75c) +--- + sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +- + sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +- + sysdeps/x86_64/strrchr.S | 510 +++++++++++++++--------- + sysdeps/x86_64/wcsrchr.S | 266 +----------- + 4 files changed, 338 insertions(+), 443 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S +index 0ec76fe9..6bb1284b 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S ++++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S +@@ -17,7 +17,7 @@ + . */ + + #if IS_IN (libc) +-# define strrchr __strrchr_sse2 ++# define STRRCHR __strrchr_sse2 + + # undef weak_alias + # define weak_alias(strrchr, rindex) +diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S +index d015e953..f26d53b5 100644 +--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S ++++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S +@@ -17,7 +17,6 @@ + . 
*/ + + #if IS_IN (libc) +-# define wcsrchr __wcsrchr_sse2 ++# define STRRCHR __wcsrchr_sse2 + #endif +- + #include "../wcsrchr.S" +diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S +index aca98e7e..a58cc220 100644 +--- a/sysdeps/x86_64/strrchr.S ++++ b/sysdeps/x86_64/strrchr.S +@@ -19,210 +19,360 @@ + + #include + ++#ifndef STRRCHR ++# define STRRCHR strrchr ++#endif ++ ++#ifdef USE_AS_WCSRCHR ++# define PCMPEQ pcmpeqd ++# define CHAR_SIZE 4 ++# define PMINU pminud ++#else ++# define PCMPEQ pcmpeqb ++# define CHAR_SIZE 1 ++# define PMINU pminub ++#endif ++ ++#define PAGE_SIZE 4096 ++#define VEC_SIZE 16 ++ + .text +-ENTRY (strrchr) +- movd %esi, %xmm1 ++ENTRY(STRRCHR) ++ movd %esi, %xmm0 + movq %rdi, %rax +- andl $4095, %eax +- punpcklbw %xmm1, %xmm1 +- cmpq $4032, %rax +- punpcklwd %xmm1, %xmm1 +- pshufd $0, %xmm1, %xmm1 ++ andl $(PAGE_SIZE - 1), %eax ++#ifndef USE_AS_WCSRCHR ++ punpcklbw %xmm0, %xmm0 ++ punpcklwd %xmm0, %xmm0 ++#endif ++ pshufd $0, %xmm0, %xmm0 ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page) +- movdqu (%rdi), %xmm0 ++ ++L(cross_page_continue): ++ movups (%rdi), %xmm1 + pxor %xmm2, %xmm2 +- movdqa %xmm0, %xmm3 +- pcmpeqb %xmm1, %xmm0 +- pcmpeqb %xmm2, %xmm3 +- pmovmskb %xmm0, %ecx +- pmovmskb %xmm3, %edx +- testq %rdx, %rdx +- je L(next_48_bytes) +- leaq -1(%rdx), %rax +- xorq %rdx, %rax +- andq %rcx, %rax +- je L(exit) +- bsrq %rax, %rax ++ PCMPEQ %xmm1, %xmm2 ++ pmovmskb %xmm2, %ecx ++ testl %ecx, %ecx ++ jz L(aligned_more) ++ ++ PCMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ leal -1(%rcx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(ret0) ++ bsrl %eax, %eax + addq %rdi, %rax ++ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If ++ search CHAR is zero we are correct. Either way `andq ++ -CHAR_SIZE, %rax` gets the correct result. */ ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++L(ret0): + ret + ++ /* Returns for first vec x1/x2 have hard coded backward search ++ path for earlier matches. 
*/ + .p2align 4 +-L(next_48_bytes): +- movdqu 16(%rdi), %xmm4 +- movdqa %xmm4, %xmm5 +- movdqu 32(%rdi), %xmm3 +- pcmpeqb %xmm1, %xmm4 +- pcmpeqb %xmm2, %xmm5 +- movdqu 48(%rdi), %xmm0 +- pmovmskb %xmm5, %edx +- movdqa %xmm3, %xmm5 +- pcmpeqb %xmm1, %xmm3 +- pcmpeqb %xmm2, %xmm5 +- pcmpeqb %xmm0, %xmm2 +- salq $16, %rdx +- pmovmskb %xmm3, %r8d +- pmovmskb %xmm5, %eax +- pmovmskb %xmm2, %esi +- salq $32, %r8 +- salq $32, %rax +- pcmpeqb %xmm1, %xmm0 +- orq %rdx, %rax +- movq %rsi, %rdx +- pmovmskb %xmm4, %esi +- salq $48, %rdx +- salq $16, %rsi +- orq %r8, %rsi +- orq %rcx, %rsi +- pmovmskb %xmm0, %ecx +- salq $48, %rcx +- orq %rcx, %rsi +- orq %rdx, %rax +- je L(loop_header2) +- leaq -1(%rax), %rcx +- xorq %rax, %rcx +- andq %rcx, %rsi +- je L(exit) +- bsrq %rsi, %rsi +- leaq (%rdi,%rsi), %rax ++L(first_vec_x0_test): ++ PCMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ testl %eax, %eax ++ jz L(ret0) ++ bsrl %eax, %eax ++ addq %r8, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif + ret + + .p2align 4 +-L(loop_header2): +- testq %rsi, %rsi +- movq %rdi, %rcx +- je L(no_c_found) +-L(loop_header): +- addq $64, %rdi +- pxor %xmm7, %xmm7 +- andq $-64, %rdi +- jmp L(loop_entry) ++L(first_vec_x1): ++ PCMPEQ %xmm0, %xmm2 ++ pmovmskb %xmm2, %eax ++ leal -1(%rcx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(first_vec_x0_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE)(%rdi, %rax), %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret + + .p2align 4 +-L(loop64): +- testq %rdx, %rdx +- cmovne %rdx, %rsi +- cmovne %rdi, %rcx +- addq $64, %rdi +-L(loop_entry): +- movdqa 32(%rdi), %xmm3 +- pxor %xmm6, %xmm6 +- movdqa 48(%rdi), %xmm2 +- movdqa %xmm3, %xmm0 +- movdqa 16(%rdi), %xmm4 +- pminub %xmm2, %xmm0 +- movdqa (%rdi), %xmm5 +- pminub %xmm4, %xmm0 +- pminub %xmm5, %xmm0 +- pcmpeqb %xmm7, %xmm0 +- pmovmskb %xmm0, %eax +- movdqa %xmm5, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- pmovmskb %xmm0, %r9d +- movdqa %xmm4, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- pmovmskb %xmm0, %edx +- movdqa %xmm3, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- salq $16, %rdx +- pmovmskb %xmm0, %r10d +- movdqa %xmm2, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- salq $32, %r10 +- orq %r10, %rdx +- pmovmskb %xmm0, %r8d +- orq %r9, %rdx +- salq $48, %r8 +- orq %r8, %rdx ++L(first_vec_x1_test): ++ PCMPEQ %xmm0, %xmm2 ++ pmovmskb %xmm2, %eax + testl %eax, %eax +- je L(loop64) +- pcmpeqb %xmm6, %xmm4 +- pcmpeqb %xmm6, %xmm3 +- pcmpeqb %xmm6, %xmm5 +- pmovmskb %xmm4, %eax +- pmovmskb %xmm3, %r10d +- pcmpeqb %xmm6, %xmm2 +- pmovmskb %xmm5, %r9d +- salq $32, %r10 +- salq $16, %rax +- pmovmskb %xmm2, %r8d +- orq %r10, %rax +- orq %r9, %rax +- salq $48, %r8 +- orq %r8, %rax +- leaq -1(%rax), %r8 +- xorq %rax, %r8 +- andq %r8, %rdx +- cmovne %rdi, %rcx +- cmovne %rdx, %rsi +- bsrq %rsi, %rsi +- leaq (%rcx,%rsi), %rax ++ jz L(first_vec_x0_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE)(%rdi, %rax), %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ PCMPEQ %xmm0, %xmm3 ++ pmovmskb %xmm3, %eax ++ leal -1(%rcx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(first_vec_x1_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4 ++L(aligned_more): ++ /* Save original pointer if match was in VEC 0. 
*/ ++ movq %rdi, %r8 ++ andq $-VEC_SIZE, %rdi ++ ++ movaps VEC_SIZE(%rdi), %xmm2 ++ pxor %xmm3, %xmm3 ++ PCMPEQ %xmm2, %xmm3 ++ pmovmskb %xmm3, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x1) ++ ++ movaps (VEC_SIZE * 2)(%rdi), %xmm3 ++ pxor %xmm4, %xmm4 ++ PCMPEQ %xmm3, %xmm4 ++ pmovmskb %xmm4, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x2) ++ ++ addq $VEC_SIZE, %rdi ++ /* Save pointer again before realigning. */ ++ movq %rdi, %rsi ++ andq $-(VEC_SIZE * 2), %rdi ++ .p2align 4 ++L(first_loop): ++ /* Do 2x VEC at a time. */ ++ movaps (VEC_SIZE * 2)(%rdi), %xmm4 ++ movaps (VEC_SIZE * 3)(%rdi), %xmm5 ++ /* Since SSE2 no pminud so wcsrchr needs seperate logic for ++ detecting zero. Note if this is found to be a bottleneck it ++ may be worth adding an SSE4.1 wcsrchr implementation. */ ++#ifdef USE_AS_WCSRCHR ++ movaps %xmm5, %xmm6 ++ pxor %xmm8, %xmm8 ++ ++ PCMPEQ %xmm8, %xmm5 ++ PCMPEQ %xmm4, %xmm8 ++ por %xmm5, %xmm8 ++#else ++ movaps %xmm5, %xmm6 ++ PMINU %xmm4, %xmm5 ++#endif ++ ++ movaps %xmm4, %xmm9 ++ PCMPEQ %xmm0, %xmm4 ++ PCMPEQ %xmm0, %xmm6 ++ movaps %xmm6, %xmm7 ++ por %xmm4, %xmm6 ++#ifndef USE_AS_WCSRCHR ++ pxor %xmm8, %xmm8 ++ PCMPEQ %xmm5, %xmm8 ++#endif ++ pmovmskb %xmm8, %ecx ++ pmovmskb %xmm6, %eax ++ ++ addq $(VEC_SIZE * 2), %rdi ++ /* Use `addl` 1) so we can undo it with `subl` and 2) it can ++ macro-fuse with `jz`. */ ++ addl %ecx, %eax ++ jz L(first_loop) ++ ++ /* Check if there is zero match. */ ++ testl %ecx, %ecx ++ jz L(second_loop_match) ++ ++ /* Check if there was a match in last iteration. */ ++ subl %ecx, %eax ++ jnz L(new_match) ++ ++L(first_loop_old_match): ++ PCMPEQ %xmm0, %xmm2 ++ PCMPEQ %xmm0, %xmm3 ++ pmovmskb %xmm2, %ecx ++ pmovmskb %xmm3, %eax ++ addl %eax, %ecx ++ jz L(first_vec_x0_test) ++ /* NB: We could move this shift to before the branch and save a ++ bit of code size / performance on the fall through. The ++ branch leads to the null case which generally seems hotter ++ than char in first 3x VEC. */ ++ sall $16, %eax ++ orl %ecx, %eax ++ ++ bsrl %eax, %eax ++ addq %rsi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4 ++L(new_match): ++ pxor %xmm6, %xmm6 ++ PCMPEQ %xmm9, %xmm6 ++ pmovmskb %xmm6, %eax ++ sall $16, %ecx ++ orl %eax, %ecx ++ ++ /* We can't reuse either of the old comparisons as since we mask ++ of zeros after first zero (instead of using the full ++ comparison) we can't gurantee no interference between match ++ after end of string and valid match. */ ++ pmovmskb %xmm4, %eax ++ pmovmskb %xmm7, %edx ++ sall $16, %edx ++ orl %edx, %eax ++ ++ leal -1(%ecx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(first_loop_old_match) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif + ret + ++ /* Save minimum state for getting most recent match. We can ++ throw out all previous work. */ + .p2align 4 +-L(no_c_found): +- movl $1, %esi +- xorl %ecx, %ecx +- jmp L(loop_header) ++L(second_loop_match): ++ movq %rdi, %rsi ++ movaps %xmm4, %xmm2 ++ movaps %xmm7, %xmm3 + + .p2align 4 +-L(exit): +- xorl %eax, %eax ++L(second_loop): ++ movaps (VEC_SIZE * 2)(%rdi), %xmm4 ++ movaps (VEC_SIZE * 3)(%rdi), %xmm5 ++ /* Since SSE2 no pminud so wcsrchr needs seperate logic for ++ detecting zero. Note if this is found to be a bottleneck it ++ may be worth adding an SSE4.1 wcsrchr implementation. 
*/ ++#ifdef USE_AS_WCSRCHR ++ movaps %xmm5, %xmm6 ++ pxor %xmm8, %xmm8 ++ ++ PCMPEQ %xmm8, %xmm5 ++ PCMPEQ %xmm4, %xmm8 ++ por %xmm5, %xmm8 ++#else ++ movaps %xmm5, %xmm6 ++ PMINU %xmm4, %xmm5 ++#endif ++ ++ movaps %xmm4, %xmm9 ++ PCMPEQ %xmm0, %xmm4 ++ PCMPEQ %xmm0, %xmm6 ++ movaps %xmm6, %xmm7 ++ por %xmm4, %xmm6 ++#ifndef USE_AS_WCSRCHR ++ pxor %xmm8, %xmm8 ++ PCMPEQ %xmm5, %xmm8 ++#endif ++ ++ pmovmskb %xmm8, %ecx ++ pmovmskb %xmm6, %eax ++ ++ addq $(VEC_SIZE * 2), %rdi ++ /* Either null term or new occurence of CHAR. */ ++ addl %ecx, %eax ++ jz L(second_loop) ++ ++ /* No null term so much be new occurence of CHAR. */ ++ testl %ecx, %ecx ++ jz L(second_loop_match) ++ ++ ++ subl %ecx, %eax ++ jnz L(second_loop_new_match) ++ ++L(second_loop_old_match): ++ pmovmskb %xmm2, %ecx ++ pmovmskb %xmm3, %eax ++ sall $16, %eax ++ orl %ecx, %eax ++ bsrl %eax, %eax ++ addq %rsi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif + ret + + .p2align 4 ++L(second_loop_new_match): ++ pxor %xmm6, %xmm6 ++ PCMPEQ %xmm9, %xmm6 ++ pmovmskb %xmm6, %eax ++ sall $16, %ecx ++ orl %eax, %ecx ++ ++ /* We can't reuse either of the old comparisons as since we mask ++ of zeros after first zero (instead of using the full ++ comparison) we can't gurantee no interference between match ++ after end of string and valid match. */ ++ pmovmskb %xmm4, %eax ++ pmovmskb %xmm7, %edx ++ sall $16, %edx ++ orl %edx, %eax ++ ++ leal -1(%ecx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(second_loop_old_match) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4,, 4 + L(cross_page): +- movq %rdi, %rax +- pxor %xmm0, %xmm0 +- andq $-64, %rax +- movdqu (%rax), %xmm5 +- movdqa %xmm5, %xmm6 +- movdqu 16(%rax), %xmm4 +- pcmpeqb %xmm1, %xmm5 +- pcmpeqb %xmm0, %xmm6 +- movdqu 32(%rax), %xmm3 +- pmovmskb %xmm6, %esi +- movdqa %xmm4, %xmm6 +- movdqu 48(%rax), %xmm2 +- pcmpeqb %xmm1, %xmm4 +- pcmpeqb %xmm0, %xmm6 +- pmovmskb %xmm6, %edx +- movdqa %xmm3, %xmm6 +- pcmpeqb %xmm1, %xmm3 +- pcmpeqb %xmm0, %xmm6 +- pcmpeqb %xmm2, %xmm0 +- salq $16, %rdx +- pmovmskb %xmm3, %r9d +- pmovmskb %xmm6, %r8d +- pmovmskb %xmm0, %ecx +- salq $32, %r9 +- salq $32, %r8 +- pcmpeqb %xmm1, %xmm2 +- orq %r8, %rdx +- salq $48, %rcx +- pmovmskb %xmm5, %r8d +- orq %rsi, %rdx +- pmovmskb %xmm4, %esi +- orq %rcx, %rdx +- pmovmskb %xmm2, %ecx +- salq $16, %rsi +- salq $48, %rcx +- orq %r9, %rsi +- orq %r8, %rsi +- orq %rcx, %rsi ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rsi ++ movaps (%rsi), %xmm1 ++ pxor %xmm2, %xmm2 ++ PCMPEQ %xmm1, %xmm2 ++ pmovmskb %xmm2, %edx + movl %edi, %ecx +- subl %eax, %ecx +- shrq %cl, %rdx +- shrq %cl, %rsi +- testq %rdx, %rdx +- je L(loop_header2) +- leaq -1(%rdx), %rax +- xorq %rdx, %rax +- andq %rax, %rsi +- je L(exit) +- bsrq %rsi, %rax ++ andl $(VEC_SIZE - 1), %ecx ++ sarl %cl, %edx ++ jz L(cross_page_continue) ++ PCMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ sarl %cl, %eax ++ leal -1(%rdx), %ecx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(ret1) ++ bsrl %eax, %eax + addq %rdi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++L(ret1): + ret +-END (strrchr) ++END(STRRCHR) + +-weak_alias (strrchr, rindex) +-libc_hidden_builtin_def (strrchr) ++#ifndef USE_AS_WCSRCHR ++ weak_alias (STRRCHR, rindex) ++ libc_hidden_builtin_def (STRRCHR) ++#endif +diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S +index 2f388537..ae3cfa7d 100644 +--- a/sysdeps/x86_64/wcsrchr.S ++++ b/sysdeps/x86_64/wcsrchr.S +@@ -17,266 +17,12 @@ + License 
along with the GNU C Library; if not, see + . */ + +-#include + +- .text +-ENTRY (wcsrchr) ++#define USE_AS_WCSRCHR 1 ++#define NO_PMINU 1 + +- movd %rsi, %xmm1 +- mov %rdi, %rcx +- punpckldq %xmm1, %xmm1 +- pxor %xmm2, %xmm2 +- punpckldq %xmm1, %xmm1 +- and $63, %rcx +- cmp $48, %rcx +- ja L(crosscache) ++#ifndef STRRCHR ++# define STRRCHR wcsrchr ++#endif + +- movdqu (%rdi), %xmm0 +- pcmpeqd %xmm0, %xmm2 +- pcmpeqd %xmm1, %xmm0 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm0, %rax +- add $16, %rdi +- +- test %rax, %rax +- jnz L(unaligned_match1) +- +- test %rcx, %rcx +- jnz L(return_null) +- +- and $-16, %rdi +- xor %r8, %r8 +- jmp L(loop) +- +- .p2align 4 +-L(unaligned_match1): +- test %rcx, %rcx +- jnz L(prolog_find_zero_1) +- +- mov %rax, %r8 +- mov %rdi, %rsi +- and $-16, %rdi +- jmp L(loop) +- +- .p2align 4 +-L(crosscache): +- and $15, %rcx +- and $-16, %rdi +- pxor %xmm3, %xmm3 +- movdqa (%rdi), %xmm0 +- pcmpeqd %xmm0, %xmm3 +- pcmpeqd %xmm1, %xmm0 +- pmovmskb %xmm3, %rdx +- pmovmskb %xmm0, %rax +- shr %cl, %rdx +- shr %cl, %rax +- add $16, %rdi +- +- test %rax, %rax +- jnz L(unaligned_match) +- +- test %rdx, %rdx +- jnz L(return_null) +- +- xor %r8, %r8 +- jmp L(loop) +- +- .p2align 4 +-L(unaligned_match): +- test %rdx, %rdx +- jnz L(prolog_find_zero) +- +- mov %rax, %r8 +- lea (%rdi, %rcx), %rsi +- +-/* Loop start on aligned string. */ +- .p2align 4 +-L(loop): +- movdqa (%rdi), %xmm0 +- pcmpeqd %xmm0, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm0 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm0, %rax +- or %rax, %rcx +- jnz L(matches) +- +- movdqa (%rdi), %xmm3 +- pcmpeqd %xmm3, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm3 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm3, %rax +- or %rax, %rcx +- jnz L(matches) +- +- movdqa (%rdi), %xmm4 +- pcmpeqd %xmm4, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm4 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm4, %rax +- or %rax, %rcx +- jnz L(matches) +- +- movdqa (%rdi), %xmm5 +- pcmpeqd %xmm5, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm5 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm5, %rax +- or %rax, %rcx +- jz L(loop) +- +- .p2align 4 +-L(matches): +- test %rax, %rax +- jnz L(match) +-L(return_value): +- test %r8, %r8 +- jz L(return_null) +- mov %r8, %rax +- mov %rsi, %rdi +- +- test $15 << 4, %ah +- jnz L(match_fourth_wchar) +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(match): +- pmovmskb %xmm2, %rcx +- test %rcx, %rcx +- jnz L(find_zero) +- mov %rax, %r8 +- mov %rdi, %rsi +- jmp L(loop) +- +- .p2align 4 +-L(find_zero): +- test $15, %cl +- jnz L(find_zero_in_first_wchar) +- test %cl, %cl +- jnz L(find_zero_in_second_wchar) +- test $15, %ch +- jnz L(find_zero_in_third_wchar) +- +- and $1 << 13 - 1, %rax +- jz L(return_value) +- +- test $15 << 4, %ah +- jnz L(match_fourth_wchar) +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(find_zero_in_first_wchar): +- test $1, %rax +- jz L(return_value) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(find_zero_in_second_wchar): +- and $1 << 5 - 1, %rax +- jz L(return_value) +- +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(find_zero_in_third_wchar): +- and $1 << 9 - 1, %rax +- jz L(return_value) +- +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero): +- add %rcx, 
%rdi +- mov %rdx, %rcx +-L(prolog_find_zero_1): +- test $15, %cl +- jnz L(prolog_find_zero_in_first_wchar) +- test %cl, %cl +- jnz L(prolog_find_zero_in_second_wchar) +- test $15, %ch +- jnz L(prolog_find_zero_in_third_wchar) +- +- and $1 << 13 - 1, %rax +- jz L(return_null) +- +- test $15 << 4, %ah +- jnz L(match_fourth_wchar) +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero_in_first_wchar): +- test $1, %rax +- jz L(return_null) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero_in_second_wchar): +- and $1 << 5 - 1, %rax +- jz L(return_null) +- +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero_in_third_wchar): +- and $1 << 9 - 1, %rax +- jz L(return_null) +- +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(match_second_wchar): +- lea -12(%rdi), %rax +- ret +- +- .p2align 4 +-L(match_third_wchar): +- lea -8(%rdi), %rax +- ret +- +- .p2align 4 +-L(match_fourth_wchar): +- lea -4(%rdi), %rax +- ret +- +- .p2align 4 +-L(return_null): +- xor %rax, %rax +- ret +- +-END (wcsrchr) ++#include "../strrchr.S" +-- +GitLab + diff --git a/SOURCES/ia-opt-strcmp-avx2.patch b/SOURCES/ia-opt-strcmp-avx2.patch new file mode 100644 index 0000000..9067242 --- /dev/null +++ b/SOURCES/ia-opt-strcmp-avx2.patch @@ -0,0 +1,1794 @@ +From 5e9c6a33e767576c063e1fc0077b3a749518e8f0 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 10 Jan 2022 15:35:38 -0600 +Subject: [PATCH] x86: Optimize strcmp-avx2.S + +Optimization are primarily to the loop logic and how the page cross +logic interacts with the loop. + +The page cross logic is at times more expensive for short strings near +the end of a page but not crossing the page. This is done to retest +the page cross conditions with a non-faulty check and to improve the +logic for entering the loop afterwards. This is only particular cases, +however, and is general made up for by more than 10x improvements on +the transition from the page cross -> loop case. + +The non-page cross cases are improved most for smaller sizes [0, 128] +and go about even for (128, 4096]. The loop page cross logic is +improved so some more significant speedup is seen there as well. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. + +Signed-off-by: Noah Goldstein +(cherry picked from commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45) +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 1592 ++++++++++++++---------- + 1 file changed, 940 insertions(+), 652 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 70d8499b..554ffe4c 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -26,35 +26,57 @@ + + # define PAGE_SIZE 4096 + +-/* VEC_SIZE = Number of bytes in a ymm register */ ++ /* VEC_SIZE = Number of bytes in a ymm register. */ + # define VEC_SIZE 32 + +-/* Shift for dividing by (VEC_SIZE * 4). */ +-# define DIVIDE_BY_VEC_4_SHIFT 7 +-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# endif ++# define VMOVU vmovdqu ++# define VMOVA vmovdqa + + # ifdef USE_AS_WCSCMP +-/* Compare packed dwords. */ ++ /* Compare packed dwords. */ + # define VPCMPEQ vpcmpeqd +-/* Compare packed dwords and store minimum. 
*/ ++ /* Compare packed dwords and store minimum. */ + # define VPMINU vpminud +-/* 1 dword char == 4 bytes. */ ++ /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else +-/* Compare packed bytes. */ ++ /* Compare packed bytes. */ + # define VPCMPEQ vpcmpeqb +-/* Compare packed bytes and store minimum. */ ++ /* Compare packed bytes and store minimum. */ + # define VPMINU vpminub +-/* 1 byte char == 1 byte. */ ++ /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif + ++# ifdef USE_AS_STRNCMP ++# define LOOP_REG r9d ++# define LOOP_REG64 r9 ++ ++# define OFFSET_REG8 r9b ++# define OFFSET_REG r9d ++# define OFFSET_REG64 r9 ++# else ++# define LOOP_REG edx ++# define LOOP_REG64 rdx ++ ++# define OFFSET_REG8 dl ++# define OFFSET_REG edx ++# define OFFSET_REG64 rdx ++# endif ++ + # ifndef VZEROUPPER + # define VZEROUPPER vzeroupper + # endif + ++# if defined USE_AS_STRNCMP ++# define VEC_OFFSET 0 ++# else ++# define VEC_OFFSET (-VEC_SIZE) ++# endif ++ ++# define xmmZERO xmm15 ++# define ymmZERO ymm15 ++ + # ifndef SECTION + # define SECTION(p) p##.avx + # endif +@@ -79,783 +101,1049 @@ + the maximum offset is reached before a difference is found, zero is + returned. */ + +- .section SECTION(.text),"ax",@progbits +-ENTRY (STRCMP) ++ .section SECTION(.text), "ax", @progbits ++ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP +- /* Check for simple cases (0 or 1) in offset. */ ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %rdx ++# endif + cmp $1, %RDX_LP +- je L(char0) +- jb L(zero) ++ /* Signed comparison intentional. We use this branch to also ++ test cases where length >= 2^63. These very large sizes can be ++ handled with strcmp as there is no way for that length to ++ actually bound the buffer. */ ++ jle L(one_or_less) + # ifdef USE_AS_WCSCMP +-# ifndef __ILP32__ + movq %rdx, %rcx +- /* Check if length could overflow when multiplied by +- sizeof(wchar_t). Checking top 8 bits will cover all potential +- overflow cases as well as redirect cases where its impossible to +- length to bound a valid memory region. In these cases just use +- 'wcscmp'. */ ++ ++ /* Multiplying length by sizeof(wchar_t) can result in overflow. ++ Check if that is possible. All cases where overflow are possible ++ are cases where length is large enough that it can never be a ++ bound on valid memory so just use wcscmp. */ + shrq $56, %rcx +- jnz OVERFLOW_STRCMP +-# endif +- /* Convert units: from wide to byte char. */ +- shl $2, %RDX_LP ++ jnz __wcscmp_avx2 ++ ++ leaq (, %rdx, 4), %rdx + # endif +- /* Register %r11 tracks the maximum offset. */ +- mov %RDX_LP, %R11_LP + # endif ++ vpxor %xmmZERO, %xmmZERO, %xmmZERO + movl %edi, %eax +- xorl %edx, %edx +- /* Make %xmm7 (%ymm7) all zeros in this function. */ +- vpxor %xmm7, %xmm7, %xmm7 + orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax +- jg L(cross_page) +- /* Start comparing 4 vectors. */ +- vmovdqu (%rdi), %ymm1 +- VPCMPEQ (%rsi), %ymm1, %ymm0 +- VPMINU %ymm1, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- je L(next_3_vectors) +- tzcntl %ecx, %edx ++ sall $20, %eax ++ /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ ++ cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax ++ ja L(page_cross) ++ ++L(no_page_cross): ++ /* Safe to compare 4x vectors. */ ++ VMOVU (%rdi), %ymm0 ++ /* 1s where s1 and s2 equal. */ ++ VPCMPEQ (%rsi), %ymm0, %ymm1 ++ /* 1s at null CHAR. 
*/ ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ /* 1s where s1 and s2 equal AND not null CHAR. */ ++ vpandn %ymm1, %ymm2, %ymm1 ++ ++ /* All 1s -> keep going, any 0s -> return. */ ++ vpmovmskb %ymm1, %ecx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx) is after the maximum +- offset (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $VEC_SIZE, %rdx ++ jbe L(vec_0_test_len) + # endif ++ ++ /* All 1s represents all equals. incl will overflow to zero in ++ all equals case. Otherwise 1s will carry until position of first ++ mismatch. */ ++ incl %ecx ++ jz L(more_3x_vec) ++ ++ .p2align 4,, 4 ++L(return_vec_0): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- je L(return) +-L(wcscmp_return): ++ cmpl (%rsi, %rcx), %edx ++ je L(ret0) + setl %al + negl %eax + orl $1, %eax +-L(return): + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret0): + L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + +- .p2align 4 +-L(return_vec_size): +- tzcntl %ecx, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after +- the maximum offset (%r11). */ +- addq $VEC_SIZE, %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ .p2align 4,, 8 ++L(vec_0_test_len): ++ notl %ecx ++ bzhil %edx, %ecx, %eax ++ jnz L(return_vec_0) ++ /* Align if will cross fetch block. */ ++ .p2align 4,, 2 ++L(ret_zero): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif +-# else ++ VZEROUPPER_RETURN ++ ++ .p2align 4,, 5 ++L(one_or_less): ++ jb L(ret_zero) + # ifdef USE_AS_WCSCMP ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ jnbe __wcscmp_avx2 ++ movl (%rdi), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rdx), %ecx +- cmpl VEC_SIZE(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (%rsi), %edx ++ je L(ret1) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl VEC_SIZE(%rdi, %rdx), %eax +- movzbl VEC_SIZE(%rsi, %rdx), %edx +- subl %edx, %eax ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ ++ jnbe __strcmp_avx2 ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ subl %ecx, %eax + # endif ++L(ret1): ++ ret + # endif +- VZEROUPPER_RETURN + +- .p2align 4 +-L(return_2_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 ++L(return_vec_1): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 2), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ /* rdx must be > CHAR_PER_VEC so save to subtract w.o fear of ++ overflow. 
*/ ++ addq $-VEC_SIZE, %rdx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_SIZE(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_SIZE(%rsi, %rcx), %edx ++ je L(ret2) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret2): + VZEROUPPER_RETURN + +- .p2align 4 +-L(return_3_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 3), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++L(return_vec_3): ++ salq $32, %rcx ++# endif ++ ++L(return_vec_2): ++# ifndef USE_AS_STRNCMP ++ tzcntl %ecx, %ecx ++# else ++ tzcntq %rcx, %rcx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ je L(ret3) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++L(ret3): ++ VZEROUPPER_RETURN ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_3): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx ++ je L(ret4) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret4): + VZEROUPPER_RETURN ++# endif ++ ++ .p2align 4,, 10 ++L(more_3x_vec): ++ /* Safe to compare 4x vectors. 
*/ ++ VMOVU VEC_SIZE(%rdi), %ymm0 ++ VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_1) ++ ++# ifdef USE_AS_STRNCMP ++ subq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ VMOVU (VEC_SIZE * 2)(%rdi), %ymm0 ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_2) ++ ++ VMOVU (VEC_SIZE * 3)(%rdi), %ymm0 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_3) + +- .p2align 4 +-L(next_3_vectors): +- vmovdqu VEC_SIZE(%rdi), %ymm6 +- VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3 +- VPMINU %ymm6, %ymm3, %ymm3 +- VPCMPEQ %ymm7, %ymm3, %ymm3 +- vpmovmskb %ymm3, %ecx +- testl %ecx, %ecx +- jne L(return_vec_size) +- vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5 +- vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4 +- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2 +- VPMINU %ymm5, %ymm2, %ymm2 +- VPCMPEQ %ymm4, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm2, %ymm2 +- vpmovmskb %ymm2, %ecx +- testl %ecx, %ecx +- jne L(return_2_vec_size) +- VPMINU %ymm4, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- jne L(return_3_vec_size) +-L(main_loop_header): +- leaq (VEC_SIZE * 4)(%rdi), %rdx +- movl $PAGE_SIZE, %ecx +- /* Align load via RAX. */ +- andq $-(VEC_SIZE * 4), %rdx +- subq %rdi, %rdx +- leaq (%rdi, %rdx), %rax + # ifdef USE_AS_STRNCMP +- /* Starting from this point, the maximum offset, or simply the +- 'offset', DECREASES by the same amount when base pointers are +- moved forward. Return 0 when: +- 1) On match: offset <= the matched vector index. +- 2) On mistmach, offset is before the mistmatched index. ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ /* any non-zero positive value that doesn't inference with 0x1. + */ +- subq %rdx, %r11 +- jbe L(zero) +-# endif +- addq %rsi, %rdx +- movq %rdx, %rsi +- andl $(PAGE_SIZE - 1), %esi +- /* Number of bytes before page crossing. */ +- subq %rsi, %rcx +- /* Number of VEC_SIZE * 4 blocks before page crossing. */ +- shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx +- /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ +- movl %ecx, %esi +- jmp L(loop_start) ++ movl $2, %r8d + ++# else ++ xorl %r8d, %r8d ++# endif ++ ++ /* The prepare labels are various entry points from the page ++ cross logic. */ ++L(prepare_loop): ++ ++# ifdef USE_AS_STRNCMP ++ /* Store N + (VEC_SIZE * 4) and place check at the begining of ++ the loop. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx ++# endif ++L(prepare_loop_no_len): ++ ++ /* Align s1 and adjust s2 accordingly. */ ++ subq %rdi, %rsi ++ andq $-(VEC_SIZE * 4), %rdi ++ addq %rdi, %rsi ++ ++# ifdef USE_AS_STRNCMP ++ subq %rdi, %rdx ++# endif ++ ++L(prepare_loop_aligned): ++ /* eax stores distance from rsi to next page cross. These cases ++ need to be handled specially as the 4x loop could potentially ++ read memory past the length of s1 or s2 and across a page ++ boundary. */ ++ movl $-(VEC_SIZE * 4), %eax ++ subl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ ++ /* Loop 4x comparisons at a time. */ + .p2align 4 + L(loop): ++ ++ /* End condition for strncmp. */ + # ifdef USE_AS_STRNCMP +- /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease +- the maximum offset (%r11) by the same amount. 
*/ +- subq $(VEC_SIZE * 4), %r11 +- jbe L(zero) +-# endif +- addq $(VEC_SIZE * 4), %rax +- addq $(VEC_SIZE * 4), %rdx +-L(loop_start): +- testl %esi, %esi +- leal -1(%esi), %esi +- je L(loop_cross_page) +-L(back_to_loop): +- /* Main loop, comparing 4 vectors are a time. */ +- vmovdqa (%rax), %ymm0 +- vmovdqa VEC_SIZE(%rax), %ymm3 +- VPCMPEQ (%rdx), %ymm0, %ymm4 +- VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1 +- VPMINU %ymm0, %ymm4, %ymm4 +- VPMINU %ymm3, %ymm1, %ymm1 +- vmovdqa (VEC_SIZE * 2)(%rax), %ymm2 +- VPMINU %ymm1, %ymm4, %ymm0 +- vmovdqa (VEC_SIZE * 3)(%rax), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6 +- VPMINU %ymm2, %ymm5, %ymm5 +- VPMINU %ymm3, %ymm6, %ymm6 +- VPMINU %ymm5, %ymm0, %ymm0 +- VPMINU %ymm6, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- +- /* Test each mask (32 bits) individually because for VEC_SIZE +- == 32 is not possible to OR the four masks and keep all bits +- in a 64-bit integer register, differing from SSE2 strcmp +- where ORing is possible. */ +- vpmovmskb %ymm0, %ecx ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ ++ /* Check if rsi loads will cross a page boundary. */ ++ addl $-(VEC_SIZE * 4), %eax ++ jnb L(page_cross_during_loop) ++ ++ /* Loop entry after handling page cross during loop. */ ++L(loop_skip_page_cross_check): ++ VMOVA (VEC_SIZE * 0)(%rdi), %ymm0 ++ VMOVA (VEC_SIZE * 1)(%rdi), %ymm2 ++ VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 ++ ++ /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */ ++ VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1 ++ ++ VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ ++ ++ /* If any mismatches or null CHAR then 0 CHAR, otherwise non- ++ zero. */ ++ vpand %ymm0, %ymm1, %ymm1 ++ ++ ++ vpand %ymm2, %ymm3, %ymm3 ++ vpand %ymm4, %ymm5, %ymm5 ++ vpand %ymm6, %ymm7, %ymm7 ++ ++ VPMINU %ymm1, %ymm3, %ymm3 ++ VPMINU %ymm5, %ymm7, %ymm7 ++ ++ /* Reduce all 0 CHARs for the 4x VEC into ymm7. */ ++ VPMINU %ymm3, %ymm7, %ymm7 ++ ++ /* If any 0 CHAR then done. */ ++ VPCMPEQ %ymm7, %ymmZERO, %ymm7 ++ vpmovmskb %ymm7, %LOOP_REG ++ testl %LOOP_REG, %LOOP_REG ++ jz L(loop) ++ ++ /* Find which VEC has the mismatch of end of string. */ ++ VPCMPEQ %ymm1, %ymmZERO, %ymm1 ++ vpmovmskb %ymm1, %ecx + testl %ecx, %ecx +- je L(loop) +- VPCMPEQ %ymm7, %ymm4, %ymm0 +- vpmovmskb %ymm0, %edi +- testl %edi, %edi +- je L(test_vec) +- tzcntl %edi, %ecx ++ jnz L(return_vec_0_end) ++ ++ ++ VPCMPEQ %ymm3, %ymmZERO, %ymm3 ++ vpmovmskb %ymm3, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_1_end) ++ ++L(return_vec_2_3_end): + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ subq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++ VPCMPEQ %ymm5, %ymmZERO, %ymm5 ++ vpmovmskb %ymm5, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_2_end) ++ ++ /* LOOP_REG contains matches for null/mismatch from the loop. If ++ VEC 0,1,and 2 all have no null and no mismatches then mismatch ++ must entirely be from VEC 3 which is fully represented by ++ LOOP_REG. 
*/ ++ tzcntl %LOOP_REG, %LOOP_REG ++ ++# ifdef USE_AS_STRNCMP ++ subl $-(VEC_SIZE), %LOOP_REG ++ cmpq %LOOP_REG64, %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ je L(ret5) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax ++ movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret5): + VZEROUPPER_RETURN + +- .p2align 4 +-L(test_vec): + # ifdef USE_AS_STRNCMP +- /* The first vector matched. Return 0 if the maximum offset +- (%r11) <= VEC_SIZE. */ +- cmpq $VEC_SIZE, %r11 +- jbe L(zero) ++ .p2align 4,, 2 ++L(ret_zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN + # endif +- VPCMPEQ %ymm7, %ymm1, %ymm1 +- vpmovmskb %ymm1, %ecx +- testl %ecx, %ecx +- je L(test_2_vec) +- tzcntl %ecx, %edi ++ ++ ++ /* The L(return_vec_N_end) differ from L(return_vec_N) in that ++ they use the value of `r8` to negate the return value. This is ++ because the page cross logic can swap `rdi` and `rsi`. */ ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- addq $VEC_SIZE, %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++L(return_vec_1_end): ++ salq $32, %rcx ++# endif ++L(return_vec_0_end): ++# ifndef USE_AS_STRNCMP ++ tzcntl %ecx, %ecx ++# else ++ tzcntq %rcx, %rcx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ cmpl (%rsi, %rcx), %edx ++ je L(ret6) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax ++# endif ++L(ret6): ++ VZEROUPPER_RETURN ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_1_end): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_SIZE(%rdi, %rcx), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rsi, %rdi), %ecx +- cmpl VEC_SIZE(%rdx, %rdi), %ecx +- jne L(wcscmp_return) ++ cmpl VEC_SIZE(%rsi, %rcx), %edx ++ je L(ret7) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +- movzbl VEC_SIZE(%rax, %rdi), %eax +- movzbl VEC_SIZE(%rdx, %rdi), %edx +- subl %edx, %eax ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +-# endif ++L(ret7): + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(test_2_vec): ++ .p2align 4,, 10 ++L(return_vec_2_end): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +- /* The first 2 vectors matched. Return 0 if the maximum offset +- (%r11) <= 2 * VEC_SIZE. 
*/ +- cmpq $(VEC_SIZE * 2), %r11 +- jbe L(zero) ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_page_cross) + # endif +- VPCMPEQ %ymm7, %ymm5, %ymm5 +- vpmovmskb %ymm5, %ecx +- testl %ecx, %ecx +- je L(test_3_vec) +- tzcntl %ecx, %edi +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ je L(ret11) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx +- cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret11): + VZEROUPPER_RETURN + +- .p2align 4 +-L(test_3_vec): ++ ++ /* Page cross in rsi in next 4x VEC. */ ++ ++ /* TODO: Improve logic here. */ ++ .p2align 4,, 10 ++L(page_cross_during_loop): ++ /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ ++ ++ /* Optimistically rsi and rdi and both aligned inwhich case we ++ don't need any logic here. */ ++ cmpl $-(VEC_SIZE * 4), %eax ++ /* Don't adjust eax before jumping back to loop and we will ++ never hit page cross case again. */ ++ je L(loop_skip_page_cross_check) ++ ++ /* Check if we can safely load a VEC. */ ++ cmpl $-(VEC_SIZE * 3), %eax ++ jle L(less_1x_vec_till_page_cross) ++ ++ VMOVA (%rdi), %ymm0 ++ VPCMPEQ (%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_0_end) ++ ++ /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ ++ cmpl $-(VEC_SIZE * 2), %eax ++ jg L(more_2x_vec_till_page_cross) ++ ++ .p2align 4,, 4 ++L(less_1x_vec_till_page_cross): ++ subl $-(VEC_SIZE * 4), %eax ++ /* Guranteed safe to read from rdi - VEC_SIZE here. The only ++ concerning case is first iteration if incoming s1 was near start ++ of a page and s2 near end. If s1 was near the start of the page ++ we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe ++ to read back -VEC_SIZE. If rdi is truly at the start of a page ++ here, it means the previous page (rdi - VEC_SIZE) has already ++ been loaded earlier so must be valid. */ ++ VMOVU -VEC_SIZE(%rdi, %rax), %ymm0 ++ VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ ++ /* Mask of potentially valid bits. The lower bits can be out of ++ range comparisons (but safe regarding page crosses). */ ++ movl $-1, %r10d ++ shlxl %esi, %r10d, %r10d ++ notl %ecx ++ + # ifdef USE_AS_STRNCMP +- /* The first 3 vectors matched. Return 0 if the maximum offset +- (%r11) <= 3 * VEC_SIZE. 
*/ +- cmpq $(VEC_SIZE * 3), %r11 +- jbe L(zero) +-# endif +- VPCMPEQ %ymm7, %ymm6, %ymm6 +- vpmovmskb %ymm6, %esi +- tzcntl %esi, %ecx ++ cmpq %rax, %rdx ++ jbe L(return_page_cross_end_check) ++# endif ++ movl %eax, %OFFSET_REG ++ addl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ ++ andl %r10d, %ecx ++ jz L(loop_skip_page_cross_check) ++ ++ .p2align 4,, 3 ++L(return_page_cross_end): ++ tzcntl %ecx, %ecx ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 3), %rcx +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %esi +- cmpl (%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ leal -VEC_SIZE(%OFFSET_REG64, %rcx), %ecx ++L(return_page_cross_cmp_mem): + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ addl %OFFSET_REG, %ecx ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rsi, %rcx), %esi +- cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret8) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret8): + VZEROUPPER_RETURN + +- .p2align 4 +-L(loop_cross_page): +- xorl %r10d, %r10d +- movq %rdx, %rcx +- /* Align load via RDX. We load the extra ECX bytes which should +- be ignored. */ +- andl $((VEC_SIZE * 4) - 1), %ecx +- /* R10 is -RCX. */ +- subq %rcx, %r10 +- +- /* This works only if VEC_SIZE * 2 == 64. */ +-# if (VEC_SIZE * 2) != 64 +-# error (VEC_SIZE * 2) != 64 +-# endif +- +- /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ +- cmpl $(VEC_SIZE * 2), %ecx +- jge L(loop_cross_page_2_vec) +- +- vmovdqu (%rax, %r10), %ymm2 +- vmovdqu VEC_SIZE(%rax, %r10), %ymm3 +- VPCMPEQ (%rdx, %r10), %ymm2, %ymm0 +- VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1 +- VPMINU %ymm2, %ymm0, %ymm0 +- VPMINU %ymm3, %ymm1, %ymm1 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm1, %ymm1 +- +- vpmovmskb %ymm0, %edi +- vpmovmskb %ymm1, %esi +- +- salq $32, %rsi +- xorq %rsi, %rdi +- +- /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ +- shrq %cl, %rdi +- +- testq %rdi, %rdi +- je L(loop_cross_page_2_vec) +- tzcntq %rdi, %rcx + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ .p2align 4,, 10 ++L(return_page_cross_end_check): ++ tzcntl %ecx, %ecx ++ leal -VEC_SIZE(%rax, %rcx), %ecx ++ cmpl %ecx, %edx ++ ja L(return_page_cross_cmp_mem) + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# endif + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(loop_cross_page_2_vec): +- /* The first VEC_SIZE * 2 bytes match or are ignored. 
*/ +- vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2 +- vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5 +- VPMINU %ymm2, %ymm5, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6 +- VPCMPEQ %ymm7, %ymm5, %ymm5 +- VPMINU %ymm3, %ymm6, %ymm6 +- VPCMPEQ %ymm7, %ymm6, %ymm6 +- +- vpmovmskb %ymm5, %edi +- vpmovmskb %ymm6, %esi +- +- salq $32, %rsi +- xorq %rsi, %rdi + +- xorl %r8d, %r8d +- /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ +- subl $(VEC_SIZE * 2), %ecx +- jle 1f +- /* Skip ECX bytes. */ +- shrq %cl, %rdi +- /* R8 has number of bytes skipped. */ +- movl %ecx, %r8d +-1: +- /* Before jumping back to the loop, set ESI to the number of +- VEC_SIZE * 4 blocks before page crossing. */ +- movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi +- +- testq %rdi, %rdi ++ .p2align 4,, 10 ++L(more_2x_vec_till_page_cross): ++ /* If more 2x vec till cross we will complete a full loop ++ iteration here. */ ++ ++ VMOVU VEC_SIZE(%rdi), %ymm0 ++ VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_1_end) ++ + # ifdef USE_AS_STRNCMP +- /* At this point, if %rdi value is 0, it already tested +- VEC_SIZE*4+%r10 byte starting from %rax. This label +- checks whether strncmp maximum offset reached or not. */ +- je L(string_nbyte_offset_check) +-# else +- je L(back_to_loop) ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero_in_loop_page_cross) + # endif +- tzcntq %rdi, %rcx +- addq %r10, %rcx +- /* Adjust for number of bytes skipped. */ +- addq %r8, %rcx ++ ++ subl $-(VEC_SIZE * 4), %eax ++ ++ /* Safe to include comparisons from lower bytes. */ ++ VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0 ++ VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_page_cross_0) ++ ++ VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0 ++ VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_page_cross_1) ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rcx +- subq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ /* Must check length here as length might proclude reading next ++ page. */ ++ cmpq %rax, %rdx ++ jbe L(ret_zero_in_loop_page_cross) ++# endif ++ ++ /* Finish the loop. */ ++ VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 ++ ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ vpand %ymm4, %ymm5, %ymm5 ++ vpand %ymm6, %ymm7, %ymm7 ++ VPMINU %ymm5, %ymm7, %ymm7 ++ VPCMPEQ %ymm7, %ymmZERO, %ymm7 ++ vpmovmskb %ymm7, %LOOP_REG ++ testl %LOOP_REG, %LOOP_REG ++ jnz L(return_vec_2_3_end) ++ ++ /* Best for code size to include ucond-jmp here. Would be faster ++ if this case is hot to duplicate the L(return_vec_2_3_end) code ++ as fall-through and have jump back to loop on mismatch ++ comparison. 
*/ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $(PAGE_SIZE - VEC_SIZE * 8), %eax ++# ifdef USE_AS_STRNCMP ++ subq $(VEC_SIZE * 4), %rdx ++ ja L(loop_skip_page_cross_check) ++L(ret_zero_in_loop_page_cross): + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ VZEROUPPER_RETURN + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rcx), %edi +- cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ jmp L(loop_skip_page_cross_check) + # endif +- VZEROUPPER_RETURN + ++ ++ .p2align 4,, 10 ++L(return_vec_page_cross_0): ++ addl $-VEC_SIZE, %eax ++L(return_vec_page_cross_1): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +-L(string_nbyte_offset_check): +- leaq (VEC_SIZE * 4)(%r10), %r10 +- cmpq %r10, %r11 +- jbe L(zero) +- jmp L(back_to_loop) ++ leal -VEC_SIZE(%rax, %rcx), %ecx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_in_loop_page_cross) ++# else ++ addl %eax, %ecx + # endif + +- .p2align 4 +-L(cross_page_loop): +- /* Check one byte/dword at a time. */ + # ifdef USE_AS_WCSCMP +- cmpl %ecx, %eax ++ movl VEC_OFFSET(%rdi, %rcx), %edx ++ xorl %eax, %eax ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret9) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx + subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +- jne L(different) +- addl $SIZE_OF_CHAR, %edx +- cmpl $(VEC_SIZE * 4), %edx +- je L(main_loop_header) +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++L(ret9): ++ VZEROUPPER_RETURN ++ ++ ++ .p2align 4,, 10 ++L(page_cross): ++# ifndef USE_AS_STRNCMP ++ /* If both are VEC aligned we don't need any special logic here. ++ Only valid for strcmp where stop condition is guranteed to be ++ reachable by just reading memory. */ ++ testl $((VEC_SIZE - 1) << 20), %eax ++ jz L(no_page_cross) + # endif ++ ++ movl %edi, %eax ++ movl %esi, %ecx ++ andl $(PAGE_SIZE - 1), %eax ++ andl $(PAGE_SIZE - 1), %ecx ++ ++ xorl %OFFSET_REG, %OFFSET_REG ++ ++ /* Check which is closer to page cross, s1 or s2. */ ++ cmpl %eax, %ecx ++ jg L(page_cross_s2) ++ ++ /* The previous page cross check has false positives. Check for ++ true positive as page cross logic is very expensive. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ jbe L(no_page_cross) ++ ++ /* Set r8 to not interfere with normal return value (rdi and rsi ++ did not swap). */ + # ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx ++ xorl %r8d, %r8d + # endif +- /* Check null char. */ +- testl %eax, %eax +- jne L(cross_page_loop) +- /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +- comparisons. */ +- subl %ecx, %eax +-# ifndef USE_AS_WCSCMP +-L(different): ++ ++ /* Check if less than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jg L(less_1x_vec_till_page) ++ ++ /* If more than 1x VEC till page cross, loop throuh safely ++ loadable memory until within 1x VEC of page cross. 
*/ ++ ++ .p2align 4,, 10 ++L(page_cross_loop): ++ ++ VMOVU (%rdi, %OFFSET_REG64), %ymm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ ++ jnz L(check_ret_vec_page_cross) ++ addl $VEC_SIZE, %OFFSET_REG ++# ifdef USE_AS_STRNCMP ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross) + # endif +- VZEROUPPER_RETURN ++ addl $VEC_SIZE, %eax ++ jl L(page_cross_loop) ++ ++ subl %eax, %OFFSET_REG ++ /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed ++ to not cross page so is safe to load. Since we have already ++ loaded at least 1 VEC from rsi it is also guranteed to be safe. ++ */ ++ ++ VMOVU (%rdi, %OFFSET_REG64), %ymm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ ++# ifdef USE_AS_STRNCMP ++ leal VEC_SIZE(%OFFSET_REG64), %eax ++ cmpq %rax, %rdx ++ jbe L(check_ret_vec_page_cross2) ++ addq %rdi, %rdx ++# endif ++ incl %ecx ++ jz L(prepare_loop_no_len) + ++ .p2align 4,, 4 ++L(ret_vec_page_cross): ++# ifndef USE_AS_STRNCMP ++L(check_ret_vec_page_cross): ++# endif ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++L(ret_vec_page_cross_cont): + # ifdef USE_AS_WCSCMP +- .p2align 4 +-L(different): +- /* Use movl to avoid modifying EFLAGS. */ +- movl $0, %eax ++ movl (%rdi, %rcx), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx), %edx ++ je L(ret12) + setl %al + negl %eax +- orl $1, %eax +- VZEROUPPER_RETURN ++ xorl %r8d, %eax ++# else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret12): ++ VZEROUPPER_RETURN + + # ifdef USE_AS_STRNCMP +- .p2align 4 +-L(zero): ++ .p2align 4,, 10 ++L(check_ret_vec_page_cross2): ++ incl %ecx ++L(check_ret_vec_page_cross): ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++ cmpq %rcx, %rdx ++ ja L(ret_vec_page_cross_cont) ++ .p2align 4,, 2 ++L(ret_zero_page_cross): + xorl %eax, %eax + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(char0): +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +-# endif +- VZEROUPPER_RETURN ++ .p2align 4,, 4 ++L(page_cross_s2): ++ /* Ensure this is a true page cross. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx ++ jbe L(no_page_cross) ++ ++ ++ movl %ecx, %eax ++ movq %rdi, %rcx ++ movq %rsi, %rdi ++ movq %rcx, %rsi ++ ++ /* set r8 to negate return value as rdi and rsi swapped. */ ++# ifdef USE_AS_WCSCMP ++ movl $-4, %r8d ++# else ++ movl $-1, %r8d + # endif ++ xorl %OFFSET_REG, %OFFSET_REG + +- .p2align 4 +-L(last_vector): +- addq %rdx, %rdi +- addq %rdx, %rsi ++ /* Check if more than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jle L(page_cross_loop) ++ ++ .p2align 4,, 6 ++L(less_1x_vec_till_page): ++ /* Find largest load size we can use. */ ++ cmpl $16, %eax ++ ja L(less_16_till_page) ++ ++ VMOVU (%rdi), %xmm0 ++ VPCMPEQ (%rsi), %xmm0, %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incw %cx ++ jnz L(check_ret_vec_page_cross) ++ movl $16, %OFFSET_REG + # ifdef USE_AS_STRNCMP +- subq %rdx, %r11 ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subl %eax, %OFFSET_REG ++# else ++ /* Explicit check for 16 byte alignment. 
*/ ++ subl %eax, %OFFSET_REG ++ jz L(prepare_loop) + # endif +- tzcntl %ecx, %edx ++ ++ VMOVU (%rdi, %OFFSET_REG64), %xmm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incw %cx ++ jnz L(check_ret_vec_page_cross) ++ + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ addl $16, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(VEC_SIZE * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi + # endif +-# ifdef USE_AS_WCSCMP ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case0): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ ret + # endif +- VZEROUPPER_RETURN + +- /* Comparing on page boundary region requires special treatment: +- It must done one vector at the time, starting with the wider +- ymm vector if possible, if not, with xmm. If fetching 16 bytes +- (xmm) still passes the boundary, byte comparison must be done. +- */ +- .p2align 4 +-L(cross_page): +- /* Try one ymm vector at a time. */ +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(cross_page_1_vector) +-L(loop_1_vector): +- vmovdqu (%rdi, %rdx), %ymm1 +- VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0 +- VPMINU %ymm1, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- jne L(last_vector) + +- addl $VEC_SIZE, %edx ++ .p2align 4,, 10 ++L(less_16_till_page): ++ /* Find largest load size we can use. */ ++ cmpl $24, %eax ++ ja L(less_8_till_page) + +- addl $VEC_SIZE, %eax +-# ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) +-# endif +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jle L(loop_1_vector) +-L(cross_page_1_vector): +- /* Less than 32 bytes to check, try one xmm vector. */ +- cmpl $(PAGE_SIZE - 16), %eax +- jg L(cross_page_1_xmm) +- vmovdqu (%rdi, %rdx), %xmm1 +- VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ vmovq (%rdi), %xmm0 ++ vmovq (%rsi), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incb %cl ++ jnz L(check_ret_vec_page_cross) + +- addl $16, %edx +-# ifndef USE_AS_WCSCMP +- addl $16, %eax ++ ++# ifdef USE_AS_STRNCMP ++ cmpq $8, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) + # endif ++ movl $24, %OFFSET_REG ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG ++ ++ ++ ++ vmovq (%rdi, %OFFSET_REG64), %xmm0 ++ vmovq (%rsi, %OFFSET_REG64), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incb %cl ++ jnz L(check_ret_vec_page_cross) ++ + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) +-# endif +- +-L(cross_page_1_xmm): +-# ifndef USE_AS_WCSCMP +- /* Less than 16 bytes to check, try 8 byte vector. NB: No need +- for wcscmp nor wcsncmp since wide char is 4 bytes. 
*/ +- cmpl $(PAGE_SIZE - 8), %eax +- jg L(cross_page_8bytes) +- vmovq (%rdi, %rdx), %xmm1 +- vmovq (%rsi, %rdx), %xmm0 +- VPCMPEQ %xmm0, %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- /* Only last 8 bits are valid. */ +- andl $0xff, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ addl $8, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(VEC_SIZE * 4), %rdx + +- addl $8, %edx +- addl $8, %eax ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++ ++ .p2align 4,, 10 ++L(less_8_till_page): ++# ifdef USE_AS_WCSCMP ++ /* If using wchar then this is the only check before we reach ++ the page boundary. */ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ cmpl %ecx, %eax ++ jnz L(ret_less_8_wcs) + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ addq %rdi, %rdx ++ /* We already checked for len <= 1 so cannot hit that case here. ++ */ + # endif ++ testl %eax, %eax ++ jnz L(prepare_loop_no_len) ++ ret + +-L(cross_page_8bytes): +- /* Less than 8 bytes to check, try 4 byte vector. */ +- cmpl $(PAGE_SIZE - 4), %eax +- jg L(cross_page_4bytes) +- vmovd (%rdi, %rdx), %xmm1 +- vmovd (%rsi, %rdx), %xmm0 +- VPCMPEQ %xmm0, %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- /* Only last 4 bits are valid. */ +- andl $0xf, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ .p2align 4,, 8 ++L(ret_less_8_wcs): ++ setl %OFFSET_REG8 ++ negl %OFFSET_REG ++ movl %OFFSET_REG, %eax ++ xorl %r8d, %eax ++ ret ++ ++# else ++ ++ /* Find largest load size we can use. */ ++ cmpl $28, %eax ++ ja L(less_4_till_page) ++ ++ vmovd (%rdi), %xmm0 ++ vmovd (%rsi), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) + +- addl $4, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $4, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) + # endif ++ movl $28, %OFFSET_REG ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG + +-L(cross_page_4bytes): +-# endif +- /* Less than 4 bytes to check, try one byte/dword at a time. 
*/ +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) +-# endif +-# ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx +-# endif +- testl %eax, %eax +- jne L(cross_page_loop) ++ ++ ++ vmovd (%rdi, %OFFSET_REG64), %xmm0 ++ vmovd (%rsi, %OFFSET_REG64), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) ++ ++# ifdef USE_AS_STRNCMP ++ addl $4, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) ++ subq $-(VEC_SIZE * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case1): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ .p2align 4,, 10 ++L(less_4_till_page): ++ subq %rdi, %rsi ++ /* Extremely slow byte comparison loop. */ ++L(less_4_loop): ++ movzbl (%rdi), %eax ++ movzbl (%rsi, %rdi), %ecx + subl %ecx, %eax +- VZEROUPPER_RETURN +-END (STRCMP) ++ jnz L(ret_less_4_loop) ++ testl %ecx, %ecx ++ jz L(ret_zero_4_loop) ++# ifdef USE_AS_STRNCMP ++ decq %rdx ++ jz L(ret_zero_4_loop) ++# endif ++ incq %rdi ++ /* end condition is reach page boundary (rdi is aligned). */ ++ testl $31, %edi ++ jnz L(less_4_loop) ++ leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi ++ addq $-(VEC_SIZE * 4), %rdi ++# ifdef USE_AS_STRNCMP ++ subq $-(VEC_SIZE * 4), %rdx ++# endif ++ jmp L(prepare_loop_aligned) ++ ++L(ret_zero_4_loop): ++ xorl %eax, %eax ++ ret ++L(ret_less_4_loop): ++ xorl %r8d, %eax ++ subl %r8d, %eax ++ ret ++# endif ++END(STRCMP) + #endif +-- +GitLab + diff --git a/SOURCES/ia-opt-strcmp-evex.patch b/SOURCES/ia-opt-strcmp-evex.patch new file mode 100644 index 0000000..f5019bf --- /dev/null +++ b/SOURCES/ia-opt-strcmp-evex.patch @@ -0,0 +1,1992 @@ +From d16c728bff5a92a254d7078d1766a4f3070acd66 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 10 Jan 2022 15:35:39 -0600 +Subject: [PATCH] x86: Optimize strcmp-evex.S + +Optimization are primarily to the loop logic and how the page cross +logic interacts with the loop. + +The page cross logic is at times more expensive for short strings near +the end of a page but not crossing the page. This is done to retest +the page cross conditions with a non-faulty check and to improve the +logic for entering the loop afterwards. This is only particular cases, +however, and is general made up for by more than 10x improvements on +the transition from the page cross -> loop case. + +The non-page cross cases as well are nearly universally improved. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. + +Signed-off-by: Noah Goldstein +(cherry picked from commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9) +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 1712 +++++++++++++----------- + 1 file changed, 919 insertions(+), 793 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 6f5c4bf9..99d8409a 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -26,54 +26,69 @@ + + # define PAGE_SIZE 4096 + +-/* VEC_SIZE = Number of bytes in a ymm register */ ++ /* VEC_SIZE = Number of bytes in a ymm register. 
*/ + # define VEC_SIZE 32 ++# define CHAR_PER_VEC (VEC_SIZE / SIZE_OF_CHAR) + +-/* Shift for dividing by (VEC_SIZE * 4). */ +-# define DIVIDE_BY_VEC_4_SHIFT 7 +-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# endif +- +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSCMP +-/* Compare packed dwords. */ +-# define VPCMP vpcmpd ++# define TESTEQ subl $0xff, ++ /* Compare packed dwords. */ ++# define VPCMP vpcmpd + # define VPMINU vpminud + # define VPTESTM vptestmd +-# define SHIFT_REG32 r8d +-# define SHIFT_REG64 r8 +-/* 1 dword char == 4 bytes. */ ++ /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else +-/* Compare packed bytes. */ +-# define VPCMP vpcmpb ++# define TESTEQ incl ++ /* Compare packed bytes. */ ++# define VPCMP vpcmpb + # define VPMINU vpminub + # define VPTESTM vptestmb +-# define SHIFT_REG32 ecx +-# define SHIFT_REG64 rcx +-/* 1 byte char == 1 byte. */ ++ /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif + ++# ifdef USE_AS_STRNCMP ++# define LOOP_REG r9d ++# define LOOP_REG64 r9 ++ ++# define OFFSET_REG8 r9b ++# define OFFSET_REG r9d ++# define OFFSET_REG64 r9 ++# else ++# define LOOP_REG edx ++# define LOOP_REG64 rdx ++ ++# define OFFSET_REG8 dl ++# define OFFSET_REG edx ++# define OFFSET_REG64 rdx ++# endif ++ ++# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP ++# define VEC_OFFSET 0 ++# else ++# define VEC_OFFSET (-VEC_SIZE) ++# endif ++ + # define XMMZERO xmm16 +-# define XMM0 xmm17 +-# define XMM1 xmm18 ++# define XMM0 xmm17 ++# define XMM1 xmm18 + + # define YMMZERO ymm16 +-# define YMM0 ymm17 +-# define YMM1 ymm18 +-# define YMM2 ymm19 +-# define YMM3 ymm20 +-# define YMM4 ymm21 +-# define YMM5 ymm22 +-# define YMM6 ymm23 +-# define YMM7 ymm24 +-# define YMM8 ymm25 +-# define YMM9 ymm26 +-# define YMM10 ymm27 ++# define YMM0 ymm17 ++# define YMM1 ymm18 ++# define YMM2 ymm19 ++# define YMM3 ymm20 ++# define YMM4 ymm21 ++# define YMM5 ymm22 ++# define YMM6 ymm23 ++# define YMM7 ymm24 ++# define YMM8 ymm25 ++# define YMM9 ymm26 ++# define YMM10 ymm27 + + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. +@@ -96,985 +111,1096 @@ + the maximum offset is reached before a difference is found, zero is + returned. */ + +- .section .text.evex,"ax",@progbits +-ENTRY (STRCMP) ++ .section .text.evex, "ax", @progbits ++ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP +- /* Check for simple cases (0 or 1) in offset. */ +- cmp $1, %RDX_LP +- je L(char0) +- jb L(zero) +-# ifdef USE_AS_WCSCMP +-# ifndef __ILP32__ +- movq %rdx, %rcx +- /* Check if length could overflow when multiplied by +- sizeof(wchar_t). Checking top 8 bits will cover all potential +- overflow cases as well as redirect cases where its impossible to +- length to bound a valid memory region. In these cases just use +- 'wcscmp'. */ +- shrq $56, %rcx +- jnz __wcscmp_evex +-# endif +- /* Convert units: from wide to byte char. */ +- shl $2, %RDX_LP ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %rdx + # endif +- /* Register %r11 tracks the maximum offset. */ +- mov %RDX_LP, %R11_LP ++ cmp $1, %RDX_LP ++ /* Signed comparison intentional. We use this branch to also ++ test cases where length >= 2^63. These very large sizes can be ++ handled with strcmp as there is no way for that length to ++ actually bound the buffer. 
*/ ++ jle L(one_or_less) + # endif + movl %edi, %eax +- xorl %edx, %edx +- /* Make %XMMZERO (%YMMZERO) all zeros in this function. */ +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO + orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax +- jg L(cross_page) +- /* Start comparing 4 vectors. */ ++ /* Shift out the bits irrelivant to page boundary ([63:12]). */ ++ sall $20, %eax ++ /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ ++ cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax ++ ja L(page_cross) ++ ++L(no_page_cross): ++ /* Safe to compare 4x vectors. */ + VMOVU (%rdi), %YMM0 +- +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- + /* Each bit cleared in K1 represents a mismatch or a null CHAR + in YMM0 and 32 bytes at (%rsi). */ + VPCMP $0, (%rsi), %YMM0, %k1{%k2} +- + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(next_3_vectors) +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx) is after the maximum +- offset (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $CHAR_PER_VEC, %rdx ++ jbe L(vec_0_test_len) + # endif ++ ++ /* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for ++ wcscmp/wcsncmp. */ ++ ++ /* All 1s represents all equals. TESTEQ will overflow to zero in ++ all equals case. Otherwise 1s will carry until position of first ++ mismatch. */ ++ TESTEQ %ecx ++ jz L(more_3x_vec) ++ ++ .p2align 4,, 4 ++L(return_vec_0): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- je L(return) +-L(wcscmp_return): ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret0) + setl %al + negl %eax + orl $1, %eax +-L(return): + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret0): + ret + +-L(return_vec_size): +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after +- the maximum offset (%r11). */ +- addq $VEC_SIZE, %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ .p2align 4,, 4 ++L(vec_0_test_len): ++ notl %ecx ++ bzhil %edx, %ecx, %eax ++ jnz L(return_vec_0) ++ /* Align if will cross fetch block. */ ++ .p2align 4,, 2 ++L(ret_zero): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif +-# else ++ ret ++ ++ .p2align 4,, 5 ++L(one_or_less): ++ jb L(ret_zero) + # ifdef USE_AS_WCSCMP ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ jnbe __wcscmp_evex ++ movl (%rdi), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rdx), %ecx +- cmpl VEC_SIZE(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (%rsi), %edx ++ je L(ret1) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl VEC_SIZE(%rdi, %rdx), %eax +- movzbl VEC_SIZE(%rsi, %rdx), %edx +- subl %edx, %eax ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). 
*/ ++ jnbe __strcmp_evex ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret1): + ret ++# endif + +-L(return_2_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 ++L(return_vec_1): ++ tzcntl %ecx, %ecx ++# ifdef USE_AS_STRNCMP ++ /* rdx must be > CHAR_PER_VEC so its safe to subtract without ++ worrying about underflow. */ ++ addq $-CHAR_PER_VEC, %rdx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx ++ movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret2) ++ setl %al ++ negl %eax ++ orl $1, %eax ++# else ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret2): ++ ret ++ ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 2), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++L(return_vec_3): ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %ecx + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ salq $CHAR_PER_VEC, %rcx + # endif ++# endif ++L(return_vec_2): ++# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) ++ tzcntl %ecx, %ecx + # else +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ tzcntq %rcx, %rcx + # endif +- ret + +-L(return_3_vec_size): +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is +- after the maximum offset (%r11). 
*/ +- addq $(VEC_SIZE * 3), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret3) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++L(ret3): ++ ret ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_3): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret4) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret4): + ret ++# endif + +- .p2align 4 +-L(next_3_vectors): +- VMOVU VEC_SIZE(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ /* 32 byte align here ensures the main loop is ideally aligned ++ for DSB. */ ++ .p2align 5 ++L(more_3x_vec): ++ /* Safe to compare 4x vectors. */ ++ VMOVU (VEC_SIZE)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at VEC_SIZE(%rsi). */ +- VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_1) ++ ++# ifdef USE_AS_STRNCMP ++ subq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero) + # endif +- jne L(return_vec_size) + + VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ + VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- jne L(return_2_vec_size) ++ TESTEQ %ecx ++ jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ + VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_3) ++ ++# ifdef USE_AS_STRNCMP ++ cmpq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d ++ + # else +- incl %ecx ++ xorl %r8d, %r8d + # endif +- jne L(return_3_vec_size) +-L(main_loop_header): +- leaq (VEC_SIZE * 4)(%rdi), %rdx +- movl $PAGE_SIZE, %ecx +- /* Align load via RAX. */ +- andq $-(VEC_SIZE * 4), %rdx +- subq %rdi, %rdx +- leaq (%rdi, %rdx), %rax ++ ++ /* The prepare labels are various entry points from the page ++ cross logic. 
*/ ++L(prepare_loop): ++ + # ifdef USE_AS_STRNCMP +- /* Starting from this point, the maximum offset, or simply the +- 'offset', DECREASES by the same amount when base pointers are +- moved forward. Return 0 when: +- 1) On match: offset <= the matched vector index. +- 2) On mistmach, offset is before the mistmatched index. +- */ +- subq %rdx, %r11 +- jbe L(zero) ++# ifdef USE_AS_WCSCMP ++L(prepare_loop_no_len): ++ movl %edi, %ecx ++ andl $(VEC_SIZE * 4 - 1), %ecx ++ shrl $2, %ecx ++ leaq (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx ++# else ++ /* Store N + (VEC_SIZE * 4) and place check at the begining of ++ the loop. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx ++L(prepare_loop_no_len): ++# endif ++# else ++L(prepare_loop_no_len): + # endif +- addq %rsi, %rdx +- movq %rdx, %rsi +- andl $(PAGE_SIZE - 1), %esi +- /* Number of bytes before page crossing. */ +- subq %rsi, %rcx +- /* Number of VEC_SIZE * 4 blocks before page crossing. */ +- shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx +- /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ +- movl %ecx, %esi +- jmp L(loop_start) + ++ /* Align s1 and adjust s2 accordingly. */ ++ subq %rdi, %rsi ++ andq $-(VEC_SIZE * 4), %rdi ++L(prepare_loop_readj): ++ addq %rdi, %rsi ++# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP) ++ subq %rdi, %rdx ++# endif ++ ++L(prepare_loop_aligned): ++ /* eax stores distance from rsi to next page cross. These cases ++ need to be handled specially as the 4x loop could potentially ++ read memory past the length of s1 or s2 and across a page ++ boundary. */ ++ movl $-(VEC_SIZE * 4), %eax ++ subl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ ++ vpxorq %YMMZERO, %YMMZERO, %YMMZERO ++ ++ /* Loop 4x comparisons at a time. */ + .p2align 4 + L(loop): ++ ++ /* End condition for strncmp. */ + # ifdef USE_AS_STRNCMP +- /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease +- the maximum offset (%r11) by the same amount. */ +- subq $(VEC_SIZE * 4), %r11 +- jbe L(zero) ++ subq $(CHAR_PER_VEC * 4), %rdx ++ jbe L(ret_zero) + # endif +- addq $(VEC_SIZE * 4), %rax +- addq $(VEC_SIZE * 4), %rdx +-L(loop_start): +- testl %esi, %esi +- leal -1(%esi), %esi +- je L(loop_cross_page) +-L(back_to_loop): +- /* Main loop, comparing 4 vectors are a time. */ +- VMOVA (%rax), %YMM0 +- VMOVA VEC_SIZE(%rax), %YMM2 +- VMOVA (VEC_SIZE * 2)(%rax), %YMM4 +- VMOVA (VEC_SIZE * 3)(%rax), %YMM6 ++ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ ++ /* Check if rsi loads will cross a page boundary. */ ++ addl $-(VEC_SIZE * 4), %eax ++ jnb L(page_cross_during_loop) ++ ++ /* Loop entry after handling page cross during loop. */ ++L(loop_skip_page_cross_check): ++ VMOVA (VEC_SIZE * 0)(%rdi), %YMM0 ++ VMOVA (VEC_SIZE * 1)(%rdi), %YMM2 ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 + + VPMINU %YMM0, %YMM2, %YMM8 + VPMINU %YMM4, %YMM6, %YMM9 + +- /* A zero CHAR in YMM8 means that there is a null CHAR. */ +- VPMINU %YMM8, %YMM9, %YMM8 ++ /* A zero CHAR in YMM9 means that there is a null CHAR. */ ++ VPMINU %YMM8, %YMM9, %YMM9 + + /* Each bit set in K1 represents a non-null CHAR in YMM8. */ +- VPTESTM %YMM8, %YMM8, %k1 ++ VPTESTM %YMM9, %YMM9, %k1 + +- /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. 
*/ +- vpxorq (%rdx), %YMM0, %YMM1 +- vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3 +- vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5 +- vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7 ++ vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1 ++ vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3 ++ vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 ++ /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while ++ oring with YMM1. Result is stored in YMM6. */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6 + +- vporq %YMM1, %YMM3, %YMM9 +- vporq %YMM5, %YMM7, %YMM10 ++ /* Or together YMM3, YMM5, and YMM6. */ ++ vpternlogd $0xfe, %YMM3, %YMM5, %YMM6 + +- /* A non-zero CHAR in YMM9 represents a mismatch. */ +- vporq %YMM9, %YMM10, %YMM9 + +- /* Each bit cleared in K0 represents a mismatch or a null CHAR. */ +- VPCMP $0, %YMMZERO, %YMM9, %k0{%k1} +- kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(loop) ++ /* A non-zero CHAR in YMM6 represents a mismatch. */ ++ VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ kmovd %k0, %LOOP_REG + +- /* Each bit set in K1 represents a non-null CHAR in YMM0. */ ++ TESTEQ %LOOP_REG ++ jz L(loop) ++ ++ ++ /* Find which VEC has the mismatch of end of string. */ + VPTESTM %YMM0, %YMM0, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM0 and (%rdx). */ + VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(test_vec) +- tzcntl %ecx, %ecx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +-# ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# endif +- ret ++ TESTEQ %ecx ++ jnz L(return_vec_0_end) + +- .p2align 4 +-L(test_vec): +-# ifdef USE_AS_STRNCMP +- /* The first vector matched. Return 0 if the maximum offset +- (%r11) <= VEC_SIZE. */ +- cmpq $VEC_SIZE, %r11 +- jbe L(zero) +-# endif +- /* Each bit set in K1 represents a non-null CHAR in YMM2. */ + VPTESTM %YMM2, %YMM2, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM2 and VEC_SIZE(%rdx). */ + VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(test_2_vec) +- tzcntl %ecx, %edi +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- sall $2, %edi +-# endif +-# ifdef USE_AS_STRNCMP +- addq $VEC_SIZE, %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl VEC_SIZE(%rsi, %rdi), %ecx +- cmpl VEC_SIZE(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl VEC_SIZE(%rax, %rdi), %eax +- movzbl VEC_SIZE(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif +-# endif +- ret ++ TESTEQ %ecx ++ jnz L(return_vec_1_end) + +- .p2align 4 +-L(test_2_vec): ++ ++ /* Handle VEC 2 and 3 without branches. */ ++L(return_vec_2_3_end): + # ifdef USE_AS_STRNCMP +- /* The first 2 vectors matched. Return 0 if the maximum offset +- (%r11) <= 2 * VEC_SIZE. */ +- cmpq $(VEC_SIZE * 2), %r11 +- jbe L(zero) ++ subq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero_end) + # endif +- /* Each bit set in K1 represents a non-null CHAR in YMM4. */ ++ + VPTESTM %YMM4, %YMM4, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM4 and (VEC_SIZE * 2)(%rdx). */ + VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ TESTEQ %ecx ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %LOOP_REG ++ orl %ecx, %LOOP_REG + # else +- incl %ecx ++ salq $CHAR_PER_VEC, %LOOP_REG64 ++ orq %rcx, %LOOP_REG64 ++# endif ++L(return_vec_3_end): ++ /* LOOP_REG contains matches for null/mismatch from the loop. If ++ VEC 0,1,and 2 all have no null and no mismatches then mismatch ++ must entirely be from VEC 3 which is fully represented by ++ LOOP_REG. */ ++# if CHAR_PER_VEC <= 16 ++ tzcntl %LOOP_REG, %LOOP_REG ++# else ++ tzcntq %LOOP_REG64, %LOOP_REG64 ++# endif ++# ifdef USE_AS_STRNCMP ++ cmpq %LOOP_REG64, %rdx ++ jbe L(ret_zero_end) + # endif +- je L(test_3_vec) +- tzcntl %ecx, %edi ++ + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edi ++ movl (VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx ++ xorl %eax, %eax ++ cmpl (VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx ++ je L(ret5) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret5): ++ ret ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ .p2align 4,, 2 ++L(ret_zero_end): + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) ++ ret ++# endif ++ ++ ++ /* The L(return_vec_N_end) differ from L(return_vec_N) in that ++ they use the value of `r8` to negate the return value. This is ++ because the page cross logic can swap `rdi` and `rsi`. 
*/ ++ .p2align 4,, 10 ++# ifdef USE_AS_STRNCMP ++L(return_vec_1_end): ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %ecx + # else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax ++ salq $CHAR_PER_VEC, %rcx + # endif ++# endif ++L(return_vec_0_end): ++# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) ++ tzcntl %ecx, %ecx + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx +- cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ tzcntq %rcx, %rcx + # endif +- ret + +- .p2align 4 +-L(test_3_vec): + # ifdef USE_AS_STRNCMP +- /* The first 3 vectors matched. Return 0 if the maximum offset +- (%r11) <= 3 * VEC_SIZE. */ +- cmpq $(VEC_SIZE * 3), %r11 +- jbe L(zero) ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_end) + # endif +- /* Each bit set in K1 represents a non-null CHAR in YMM6. */ +- VPTESTM %YMM6, %YMM6, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM6 and (VEC_SIZE * 3)(%rdx). */ +- VPCMP $0, %YMMZERO, %YMM7, %k0{%k1} +- kmovd %k0, %ecx ++ + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret6) ++ setl %al ++ negl %eax ++ /* This is the non-zero case for `eax` so just xorl with `r8d` ++ flip is `rdi` and `rsi` where swapped. */ ++ xorl %r8d, %eax + # else +- incl %ecx ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ /* Flip `eax` if `rdi` and `rsi` where swapped in page cross ++ logic. Subtract `r8d` after xor for zero case. */ ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret6): ++ ret ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_1_end): + tzcntl %ecx, %ecx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 3), %rcx +- cmpq %rcx, %r11 +- jbe L(zero) + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %esi +- cmpl (%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rsi, %rcx), %esi +- cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi +- jne L(wcscmp_return) ++ cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret7) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +- movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx +- subl %edx, %eax ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +-# endif ++L(ret7): + ret +- +- .p2align 4 +-L(loop_cross_page): +- xorl %r10d, %r10d +- movq %rdx, %rcx +- /* Align load via RDX. We load the extra ECX bytes which should +- be ignored. */ +- andl $((VEC_SIZE * 4) - 1), %ecx +- /* R10 is -RCX. */ +- subq %rcx, %r10 +- +- /* This works only if VEC_SIZE * 2 == 64. */ +-# if (VEC_SIZE * 2) != 64 +-# error (VEC_SIZE * 2) != 64 + # endif + +- /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ +- cmpl $(VEC_SIZE * 2), %ecx +- jge L(loop_cross_page_2_vec) + +- VMOVU (%rax, %r10), %YMM2 +- VMOVU VEC_SIZE(%rax, %r10), %YMM3 ++ /* Page cross in rsi in next 4x VEC. 
*/ + +- /* Each bit set in K2 represents a non-null CHAR in YMM2. */ +- VPTESTM %YMM2, %YMM2, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM2 and 32 bytes at (%rdx, %r10). */ +- VPCMP $0, (%rdx, %r10), %YMM2, %k1{%k2} +- kmovd %k1, %r9d +- /* Don't use subl since it is the lower 16/32 bits of RDI +- below. */ +- notl %r9d +-# ifdef USE_AS_WCSCMP +- /* Only last 8 bits are valid. */ +- andl $0xff, %r9d +-# endif ++ /* TODO: Improve logic here. */ ++ .p2align 4,, 10 ++L(page_cross_during_loop): ++ /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ + +- /* Each bit set in K4 represents a non-null CHAR in YMM3. */ +- VPTESTM %YMM3, %YMM3, %k4 +- /* Each bit cleared in K3 represents a mismatch or a null CHAR +- in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */ +- VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4} +- kmovd %k3, %edi +- /* Must use notl %edi here as lower bits are for CHAR +- comparisons potentially out of range thus can be 0 without +- indicating mismatch. */ +- notl %edi +-# ifdef USE_AS_WCSCMP +- /* Don't use subl since it is the upper 8 bits of EDI below. */ +- andl $0xff, %edi +-# endif ++ /* Optimistically rsi and rdi and both aligned in which case we ++ don't need any logic here. */ ++ cmpl $-(VEC_SIZE * 4), %eax ++ /* Don't adjust eax before jumping back to loop and we will ++ never hit page cross case again. */ ++ je L(loop_skip_page_cross_check) + +-# ifdef USE_AS_WCSCMP +- /* NB: Each bit in EDI/R9D represents 4-byte element. */ +- sall $8, %edi +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 +- bytes. */ +- movl %ecx, %SHIFT_REG32 +- sarl $2, %SHIFT_REG32 +- +- /* Each bit in EDI represents a null CHAR or a mismatch. */ +- orl %r9d, %edi +-# else +- salq $32, %rdi ++ /* Check if we can safely load a VEC. */ ++ cmpl $-(VEC_SIZE * 3), %eax ++ jle L(less_1x_vec_till_page_cross) + +- /* Each bit in RDI represents a null CHAR or a mismatch. */ +- orq %r9, %rdi +-# endif ++ VMOVA (%rdi), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_0_end) ++ ++ /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ ++ cmpl $-(VEC_SIZE * 2), %eax ++ jg L(more_2x_vec_till_page_cross) ++ ++ .p2align 4,, 4 ++L(less_1x_vec_till_page_cross): ++ subl $-(VEC_SIZE * 4), %eax ++ /* Guranteed safe to read from rdi - VEC_SIZE here. The only ++ concerning case is first iteration if incoming s1 was near start ++ of a page and s2 near end. If s1 was near the start of the page ++ we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe ++ to read back -VEC_SIZE. If rdi is truly at the start of a page ++ here, it means the previous page (rdi - VEC_SIZE) has already ++ been loaded earlier so must be valid. */ ++ VMOVU -VEC_SIZE(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2} ++ ++ /* Mask of potentially valid bits. The lower bits can be out of ++ range comparisons (but safe regarding page crosses). */ + +- /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ +- shrxq %SHIFT_REG64, %rdi, %rdi +- testq %rdi, %rdi +- je L(loop_cross_page_2_vec) +- tzcntq %rdi, %rcx + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- sall $2, %ecx ++ movl $-1, %r10d ++ movl %esi, %ecx ++ andl $(VEC_SIZE - 1), %ecx ++ shrl $2, %ecx ++ shlxl %ecx, %r10d, %ecx ++ movzbl %cl, %r10d ++# else ++ movl $-1, %ecx ++ shlxl %esi, %ecx, %r10d + # endif ++ ++ kmovd %k1, %ecx ++ notl %ecx ++ ++ + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) ++ movl %eax, %r11d ++ shrl $2, %r11d ++ cmpq %r11, %rdx + # else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax ++ cmpq %rax, %rdx + # endif ++ jbe L(return_page_cross_end_check) ++# endif ++ movl %eax, %OFFSET_REG ++ ++ /* Readjust eax before potentially returning to the loop. */ ++ addl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ ++ andl %r10d, %ecx ++ jz L(loop_skip_page_cross_check) ++ ++ .p2align 4,, 3 ++L(return_page_cross_end): ++ tzcntl %ecx, %ecx ++ ++# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP) ++ leal -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx ++L(return_page_cross_cmp_mem): + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ addl %OFFSET_REG, %ecx ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret8) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret8): + ret + +- .p2align 4 +-L(loop_cross_page_2_vec): +- /* The first VEC_SIZE * 2 bytes match or are ignored. */ +- VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 +- VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_page_cross_end_check): ++ tzcntl %ecx, %ecx ++ leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx ++# ifdef USE_AS_WCSCMP ++ sall $2, %edx ++# endif ++ cmpl %ecx, %edx ++ ja L(return_page_cross_cmp_mem) ++ xorl %eax, %eax ++ ret ++# endif ++ + ++ .p2align 4,, 10 ++L(more_2x_vec_till_page_cross): ++ /* If more 2x vec till cross we will complete a full loop ++ iteration here. */ ++ ++ VMOVA VEC_SIZE(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10). */ +- VPCMP $0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2} +- kmovd %k1, %r9d +- /* Don't use subl since it is the lower 16/32 bits of RDI +- below. */ +- notl %r9d +-# ifdef USE_AS_WCSCMP +- /* Only last 8 bits are valid. */ +- andl $0xff, %r9d +-# endif ++ VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_1_end) + +- VPTESTM %YMM1, %YMM1, %k4 +- /* Each bit cleared in K3 represents a mismatch or a null CHAR +- in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */ +- VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4} +- kmovd %k3, %edi +- /* Must use notl %edi here as lower bits are for CHAR +- comparisons potentially out of range thus can be 0 without +- indicating mismatch. */ +- notl %edi +-# ifdef USE_AS_WCSCMP +- /* Don't use subl since it is the upper 8 bits of EDI below. 
*/ +- andl $0xff, %edi ++# ifdef USE_AS_STRNCMP ++ cmpq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero_in_loop_page_cross) + # endif + +-# ifdef USE_AS_WCSCMP +- /* NB: Each bit in EDI/R9D represents 4-byte element. */ +- sall $8, %edi ++ subl $-(VEC_SIZE * 4), %eax + +- /* Each bit in EDI represents a null CHAR or a mismatch. */ +- orl %r9d, %edi +-# else +- salq $32, %rdi ++ /* Safe to include comparisons from lower bytes. */ ++ VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_page_cross_0) ++ ++ VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_page_cross_1) + +- /* Each bit in RDI represents a null CHAR or a mismatch. */ +- orq %r9, %rdi ++# ifdef USE_AS_STRNCMP ++ /* Must check length here as length might proclude reading next ++ page. */ ++# ifdef USE_AS_WCSCMP ++ movl %eax, %r11d ++ shrl $2, %r11d ++ cmpq %r11, %rdx ++# else ++ cmpq %rax, %rdx ++# endif ++ jbe L(ret_zero_in_loop_page_cross) + # endif + +- xorl %r8d, %r8d +- /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ +- subl $(VEC_SIZE * 2), %ecx +- jle 1f +- /* R8 has number of bytes skipped. */ +- movl %ecx, %r8d +-# ifdef USE_AS_WCSCMP +- /* NB: Divide shift count by 4 since each bit in RDI represent 4 +- bytes. */ +- sarl $2, %ecx +- /* Skip ECX bytes. */ +- shrl %cl, %edi ++ /* Finish the loop. */ ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 ++ VPMINU %YMM4, %YMM6, %YMM9 ++ VPTESTM %YMM9, %YMM9, %k1 ++ ++ vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 ++ /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6 ++ ++ VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ kmovd %k0, %LOOP_REG ++ TESTEQ %LOOP_REG ++ jnz L(return_vec_2_3_end) ++ ++ /* Best for code size to include ucond-jmp here. Would be faster ++ if this case is hot to duplicate the L(return_vec_2_3_end) code ++ as fall-through and have jump back to loop on mismatch ++ comparison. */ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $(PAGE_SIZE - VEC_SIZE * 8), %eax ++# ifdef USE_AS_STRNCMP ++ subq $(CHAR_PER_VEC * 4), %rdx ++ ja L(loop_skip_page_cross_check) ++L(ret_zero_in_loop_page_cross): ++ xorl %eax, %eax ++ ret + # else +- /* Skip ECX bytes. */ +- shrq %cl, %rdi ++ jmp L(loop_skip_page_cross_check) + # endif +-1: +- /* Before jumping back to the loop, set ESI to the number of +- VEC_SIZE * 4 blocks before page crossing. */ +- movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi + +- testq %rdi, %rdi +-# ifdef USE_AS_STRNCMP +- /* At this point, if %rdi value is 0, it already tested +- VEC_SIZE*4+%r10 byte starting from %rax. This label +- checks whether strncmp maximum offset reached or not. */ +- je L(string_nbyte_offset_check) ++ ++ .p2align 4,, 10 ++L(return_vec_page_cross_0): ++ addl $-VEC_SIZE, %eax ++L(return_vec_page_cross_1): ++ tzcntl %ecx, %ecx ++# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP ++ leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx ++# ifdef USE_AS_STRNCMP ++# ifdef USE_AS_WCSCMP ++ /* Must divide ecx instead of multiply rdx due to overflow. 
*/ ++ movl %ecx, %eax ++ shrl $2, %eax ++ cmpq %rax, %rdx ++# else ++ cmpq %rcx, %rdx ++# endif ++ jbe L(ret_zero_in_loop_page_cross) ++# endif + # else +- je L(back_to_loop) ++ addl %eax, %ecx + # endif +- tzcntq %rdi, %rcx ++ + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +- addq %r10, %rcx +- /* Adjust for number of bytes skipped. */ +- addq %r8, %rcx +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rcx +- subq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret9) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rcx), %edi +- cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret9): + ret + +-# ifdef USE_AS_STRNCMP +-L(string_nbyte_offset_check): +- leaq (VEC_SIZE * 4)(%r10), %r10 +- cmpq %r10, %r11 +- jbe L(zero) +- jmp L(back_to_loop) ++ ++ .p2align 4,, 10 ++L(page_cross): ++# ifndef USE_AS_STRNCMP ++ /* If both are VEC aligned we don't need any special logic here. ++ Only valid for strcmp where stop condition is guranteed to be ++ reachable by just reading memory. */ ++ testl $((VEC_SIZE - 1) << 20), %eax ++ jz L(no_page_cross) + # endif + +- .p2align 4 +-L(cross_page_loop): +- /* Check one byte/dword at a time. */ ++ movl %edi, %eax ++ movl %esi, %ecx ++ andl $(PAGE_SIZE - 1), %eax ++ andl $(PAGE_SIZE - 1), %ecx ++ ++ xorl %OFFSET_REG, %OFFSET_REG ++ ++ /* Check which is closer to page cross, s1 or s2. */ ++ cmpl %eax, %ecx ++ jg L(page_cross_s2) ++ ++ /* The previous page cross check has false positives. Check for ++ true positive as page cross logic is very expensive. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ jbe L(no_page_cross) ++ ++ ++ /* Set r8 to not interfere with normal return value (rdi and rsi ++ did not swap). */ + # ifdef USE_AS_WCSCMP +- cmpl %ecx, %eax ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d + # else +- subl %ecx, %eax ++ xorl %r8d, %r8d + # endif +- jne L(different) +- addl $SIZE_OF_CHAR, %edx +- cmpl $(VEC_SIZE * 4), %edx +- je L(main_loop_header) ++ ++ /* Check if less than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jg L(less_1x_vec_till_page) ++ ++ ++ /* If more than 1x VEC till page cross, loop throuh safely ++ loadable memory until within 1x VEC of page cross. 
*/ ++ .p2align 4,, 8 ++L(page_cross_loop): ++ VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(check_ret_vec_page_cross) ++ addl $CHAR_PER_VEC, %OFFSET_REG + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross) + # endif ++ addl $VEC_SIZE, %eax ++ jl L(page_cross_loop) ++ + # ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx ++ shrl $2, %eax + # endif +- /* Check null CHAR. */ +- testl %eax, %eax +- jne L(cross_page_loop) +- /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +- comparisons. */ +- subl %ecx, %eax +-# ifndef USE_AS_WCSCMP +-L(different): ++ ++ ++ subl %eax, %OFFSET_REG ++ /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed ++ to not cross page so is safe to load. Since we have already ++ loaded at least 1 VEC from rsi it is also guranteed to be safe. ++ */ ++ VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ ++ kmovd %k1, %ecx ++# ifdef USE_AS_STRNCMP ++ leal CHAR_PER_VEC(%OFFSET_REG64), %eax ++ cmpq %rax, %rdx ++ jbe L(check_ret_vec_page_cross2) ++# ifdef USE_AS_WCSCMP ++ addq $-(CHAR_PER_VEC * 2), %rdx ++# else ++ addq %rdi, %rdx ++# endif + # endif +- ret ++ TESTEQ %ecx ++ jz L(prepare_loop_no_len) + ++ .p2align 4,, 4 ++L(ret_vec_page_cross): ++# ifndef USE_AS_STRNCMP ++L(check_ret_vec_page_cross): ++# endif ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++L(ret_vec_page_cross_cont): + # ifdef USE_AS_WCSCMP +- .p2align 4 +-L(different): +- /* Use movl to avoid modifying EFLAGS. */ +- movl $0, %eax ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret12) + setl %al + negl %eax +- orl $1, %eax +- ret ++ xorl %r8d, %eax ++# else ++ movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax ++ movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret12): ++ ret ++ + + # ifdef USE_AS_STRNCMP +- .p2align 4 +-L(zero): ++ .p2align 4,, 10 ++L(check_ret_vec_page_cross2): ++ TESTEQ %ecx ++L(check_ret_vec_page_cross): ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++ cmpq %rcx, %rdx ++ ja L(ret_vec_page_cross_cont) ++ .p2align 4,, 2 ++L(ret_zero_page_cross): + xorl %eax, %eax + ret ++# endif + +- .p2align 4 +-L(char0): +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +-# endif +- ret ++ .p2align 4,, 4 ++L(page_cross_s2): ++ /* Ensure this is a true page cross. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx ++ jbe L(no_page_cross) ++ ++ ++ movl %ecx, %eax ++ movq %rdi, %rcx ++ movq %rsi, %rdi ++ movq %rcx, %rsi ++ ++ /* set r8 to negate return value as rdi and rsi swapped. */ ++# ifdef USE_AS_WCSCMP ++ movl $-4, %r8d ++# else ++ movl $-1, %r8d + # endif ++ xorl %OFFSET_REG, %OFFSET_REG + +- .p2align 4 +-L(last_vector): +- addq %rdx, %rdi +- addq %rdx, %rsi +-# ifdef USE_AS_STRNCMP +- subq %rdx, %r11 ++ /* Check if more than 1x VEC till page cross. 
*/ ++ subl $(VEC_SIZE * 3), %eax ++ jle L(page_cross_loop) ++ ++ .p2align 4,, 6 ++L(less_1x_vec_till_page): ++# ifdef USE_AS_WCSCMP ++ shrl $2, %eax + # endif +- tzcntl %ecx, %edx ++ /* Find largest load size we can use. */ ++ cmpl $(16 / SIZE_OF_CHAR), %eax ++ ja L(less_16_till_page) ++ ++ /* Use 16 byte comparison. */ ++ vmovdqu (%rdi), %xmm0 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, (%rsi), %xmm0, %k1{%k2} ++ kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx ++ subl $0xf, %ecx ++# else ++ incw %cx + # endif ++ jnz L(check_ret_vec_page_cross) ++ movl $(16 / SIZE_OF_CHAR), %OFFSET_REG + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subl %eax, %OFFSET_REG ++# else ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG ++ jz L(prepare_loop) + # endif ++ vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2} ++ kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ subl $0xf, %ecx + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ incw %cx + # endif ++ jnz L(check_ret_vec_page_cross) ++# ifdef USE_AS_STRNCMP ++ addl $(16 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case0): ++ xorl %eax, %eax + ret ++# endif + +- /* Comparing on page boundary region requires special treatment: +- It must done one vector at the time, starting with the wider +- ymm vector if possible, if not, with xmm. If fetching 16 bytes +- (xmm) still passes the boundary, byte comparison must be done. +- */ +- .p2align 4 +-L(cross_page): +- /* Try one ymm vector at a time. */ +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(cross_page_1_vector) +-L(loop_1_vector): +- VMOVU (%rdi, %rdx), %YMM0 + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (%rsi, %rdx). */ +- VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2} ++ .p2align 4,, 10 ++L(less_16_till_page): ++ cmpl $(24 / SIZE_OF_CHAR), %eax ++ ja L(less_8_till_page) ++ ++ /* Use 8 byte comparison. */ ++ vmovq (%rdi), %xmm0 ++ vmovq (%rsi), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ subl $0x3, %ecx + # else +- incl %ecx ++ incb %cl + # endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) + +- addl $VEC_SIZE, %edx + +- addl $VEC_SIZE, %eax + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $(8 / SIZE_OF_CHAR), %rdx ++ jbe L(ret_zero_page_cross_slow_case0) + # endif +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jle L(loop_1_vector) +-L(cross_page_1_vector): +- /* Less than 32 bytes to check, try one xmm vector. 
*/ +- cmpl $(PAGE_SIZE - 16), %eax +- jg L(cross_page_1_xmm) +- VMOVU (%rdi, %rdx), %XMM0 ++ movl $(24 / SIZE_OF_CHAR), %OFFSET_REG ++ subl %eax, %OFFSET_REG + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and 16 bytes at (%rsi, %rdx). */ +- VPCMP $0, (%rsi, %rdx), %XMM0, %k1{%k2} ++ vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- subl $0xf, %ecx ++ subl $0x3, %ecx + # else +- subl $0xffff, %ecx ++ incb %cl + # endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) ++ + +- addl $16, %edx +-# ifndef USE_AS_WCSCMP +- addl $16, %eax +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ addl $(8 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi + # endif ++ jmp L(prepare_loop_aligned) + +-L(cross_page_1_xmm): +-# ifndef USE_AS_WCSCMP +- /* Less than 16 bytes to check, try 8 byte vector. NB: No need +- for wcscmp nor wcsncmp since wide char is 4 bytes. */ +- cmpl $(PAGE_SIZE - 8), %eax +- jg L(cross_page_8bytes) +- vmovq (%rdi, %rdx), %XMM0 +- vmovq (%rsi, %rdx), %XMM1 + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and XMM1. */ +- VPCMP $0, %XMM1, %XMM0, %k1{%k2} +- kmovb %k1, %ecx ++ ++ ++ .p2align 4,, 10 ++L(less_8_till_page): + # ifdef USE_AS_WCSCMP +- subl $0x3, %ecx ++ /* If using wchar then this is the only check before we reach ++ the page boundary. */ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ cmpl %ecx, %eax ++ jnz L(ret_less_8_wcs) ++# ifdef USE_AS_STRNCMP ++ addq $-(CHAR_PER_VEC * 2), %rdx ++ /* We already checked for len <= 1 so cannot hit that case here. ++ */ ++# endif ++ testl %eax, %eax ++ jnz L(prepare_loop) ++ ret ++ ++ .p2align 4,, 8 ++L(ret_less_8_wcs): ++ setl %OFFSET_REG8 ++ negl %OFFSET_REG ++ movl %OFFSET_REG, %eax ++ xorl %r8d, %eax ++ ret ++ + # else +- subl $0xff, %ecx +-# endif +- jne L(last_vector) ++ cmpl $28, %eax ++ ja L(less_4_till_page) ++ ++ vmovd (%rdi), %xmm0 ++ vmovd (%rsi), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ kmovd %k1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) + +- addl $8, %edx +- addl $8, %eax + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $4, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) + # endif ++ movl $(28 / SIZE_OF_CHAR), %OFFSET_REG ++ subl %eax, %OFFSET_REG + +-L(cross_page_8bytes): +- /* Less than 8 bytes to check, try 4 byte vector. */ +- cmpl $(PAGE_SIZE - 4), %eax +- jg L(cross_page_4bytes) +- vmovd (%rdi, %rdx), %XMM0 +- vmovd (%rsi, %rdx), %XMM1 +- +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and XMM1. 
*/ +- VPCMP $0, %XMM1, %XMM0, %k1{%k2} ++ vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0x1, %ecx +-# else + subl $0xf, %ecx +-# endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) ++# ifdef USE_AS_STRNCMP ++ addl $(4 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ + +- addl $4, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case1): ++ xorl %eax, %eax ++ ret + # endif + +-L(cross_page_4bytes): +-# endif +- /* Less than 4 bytes to check, try one byte/dword at a time. */ +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) +-# endif +-# ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx +-# endif +- testl %eax, %eax +- jne L(cross_page_loop) ++ .p2align 4,, 10 ++L(less_4_till_page): ++ subq %rdi, %rsi ++ /* Extremely slow byte comparison loop. */ ++L(less_4_loop): ++ movzbl (%rdi), %eax ++ movzbl (%rsi, %rdi), %ecx + subl %ecx, %eax ++ jnz L(ret_less_4_loop) ++ testl %ecx, %ecx ++ jz L(ret_zero_4_loop) ++# ifdef USE_AS_STRNCMP ++ decq %rdx ++ jz L(ret_zero_4_loop) ++# endif ++ incq %rdi ++ /* end condition is reach page boundary (rdi is aligned). */ ++ testl $31, %edi ++ jnz L(less_4_loop) ++ leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi ++ addq $-(VEC_SIZE * 4), %rdi ++# ifdef USE_AS_STRNCMP ++ subq $-(CHAR_PER_VEC * 4), %rdx ++# endif ++ jmp L(prepare_loop_aligned) ++ ++L(ret_zero_4_loop): ++ xorl %eax, %eax ++ ret ++L(ret_less_4_loop): ++ xorl %r8d, %eax ++ subl %r8d, %eax + ret +-END (STRCMP) ++# endif ++END(STRCMP) + #endif +-- +GitLab + diff --git a/SOURCES/ia-opt-strcspn_strpbrk-strcspn-c.patch b/SOURCES/ia-opt-strcspn_strpbrk-strcspn-c.patch new file mode 100644 index 0000000..0fb63f7 --- /dev/null +++ b/SOURCES/ia-opt-strcspn_strpbrk-strcspn-c.patch @@ -0,0 +1,148 @@ +From 36926710d4ddab6f7d5fa9559cd5e70ccc95e13a Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:22 -0500 +Subject: [PATCH] x86: Optimize strcspn and strpbrk in strcspn-c.c + +Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of +_mm_cmpistri. Also change offset to unsigned to avoid unnecessary +sign extensions. + +geometric_mean(N=20) of all benchmarks that dont fallback on +sse2/strlen; New / Original: .928 + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu + +(cherry picked from commit 30d627d477d7255345a4b713cf352ac32d644d61) +--- + sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++--------------- + 1 file changed, 37 insertions(+), 46 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c +index 857af104..6cce4296 100644 +--- a/sysdeps/x86_64/multiarch/strcspn-c.c ++++ b/sysdeps/x86_64/multiarch/strcspn-c.c +@@ -85,83 +85,74 @@ STRCSPN_SSE42 (const char *s, const char *a) + RETURN (NULL, strlen (s)); + + const char *aligned; +- __m128i mask; +- int offset = (int) ((size_t) a & 15); ++ __m128i mask, maskz, zero; ++ unsigned int maskz_bits; ++ unsigned int offset = (unsigned int) ((size_t) a & 15); ++ zero = _mm_set1_epi8 (0); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); +- +- mask = __m128i_shift_right (mask0, offset); ++ maskz = _mm_cmpeq_epi8 (mask0, zero); + + /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16 - offset) +- { +- /* There is no NULL terminator. */ +- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); +- int index = _mm_cmpistri (mask1, mask1, 0x3a); +- length += index; +- +- /* Don't use SSE4.2 if the length of A > 16. */ +- if (length > 16) +- return STRCSPN_SSE2 (s, a); +- +- if (index != 0) +- { +- /* Combine mask0 and mask1. We could play games with +- palignr, but frankly this data should be in L1 now +- so do the merge via an unaligned load. */ +- mask = _mm_loadu_si128 ((__m128i *) a); +- } +- } ++ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; ++ if (maskz_bits != 0) ++ { ++ mask = __m128i_shift_right (mask0, offset); ++ offset = (unsigned int) ((size_t) s & 15); ++ if (offset) ++ goto start_unaligned; ++ ++ aligned = s; ++ goto start_loop; ++ } + } +- else +- { +- /* A is aligned. */ +- mask = _mm_load_si128 ((__m128i *) a); + +- /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16) +- { +- /* There is no NULL terminator. Don't use SSE4.2 if the length +- of A > 16. */ +- if (a[16] != 0) +- return STRCSPN_SSE2 (s, a); +- } ++ /* A is aligned. */ ++ mask = _mm_loadu_si128 ((__m128i *) a); ++ /* Find where the NULL terminator is. */ ++ maskz = _mm_cmpeq_epi8 (mask, zero); ++ maskz_bits = _mm_movemask_epi8 (maskz); ++ if (maskz_bits == 0) ++ { ++ /* There is no NULL terminator. Don't use SSE4.2 if the length ++ of A > 16. */ ++ if (a[16] != 0) ++ return STRCSPN_SSE2 (s, a); + } + +- offset = (int) ((size_t) s & 15); ++ aligned = s; ++ offset = (unsigned int) ((size_t) s & 15); + if (offset != 0) + { ++ start_unaligned: + /* Check partial string. */ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + value = __m128i_shift_right (value, offset); + +- int length = _mm_cmpistri (mask, value, 0x2); ++ unsigned int length = _mm_cmpistri (mask, value, 0x2); + /* No need to check ZFlag since ZFlag is always 1. */ +- int cflag = _mm_cmpistrc (mask, value, 0x2); ++ unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); + if (cflag) + RETURN ((char *) (s + length), length); + /* Find where the NULL terminator is. 
*/ +- int index = _mm_cmpistri (value, value, 0x3a); ++ unsigned int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + RETURN (NULL, index); + aligned += 16; + } +- else +- aligned = s; + ++start_loop: + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); +- int index = _mm_cmpistri (mask, value, 0x2); +- int cflag = _mm_cmpistrc (mask, value, 0x2); +- int zflag = _mm_cmpistrz (mask, value, 0x2); ++ unsigned int index = _mm_cmpistri (mask, value, 0x2); ++ unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); ++ unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); + if (cflag) + RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); + if (zflag) +-- +GitLab + diff --git a/SOURCES/ia-opt-strspn-strspn-c.patch b/SOURCES/ia-opt-strspn-strspn-c.patch new file mode 100644 index 0000000..aec44d3 --- /dev/null +++ b/SOURCES/ia-opt-strspn-strspn-c.patch @@ -0,0 +1,148 @@ +From cdcf8794677acba1fc38ac101bcf52deee23d91d Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:24 -0500 +Subject: [PATCH] x86: Optimize strspn in strspn-c.c + +Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of +_mm_cmpistri. Also change offset to unsigned to avoid unnecessary +sign extensions. + +geometric_mean(N=20) of all benchmarks that dont fallback on +sse2; New / Original: .901 + +All string/memory tests pass. +Reviewed-by: H.J. Lu + +(cherry picked from commit 412d10343168b05b8cf6c3683457cf9711d28046) +--- + sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++---------------- + 1 file changed, 39 insertions(+), 47 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c +index 4554cff0..87c5e4bf 100644 +--- a/sysdeps/x86_64/multiarch/strspn-c.c ++++ b/sysdeps/x86_64/multiarch/strspn-c.c +@@ -63,81 +63,73 @@ __strspn_sse42 (const char *s, const char *a) + return 0; + + const char *aligned; +- __m128i mask; +- int offset = (int) ((size_t) a & 15); ++ __m128i mask, maskz, zero; ++ unsigned int maskz_bits; ++ unsigned int offset = (int) ((size_t) a & 15); ++ zero = _mm_set1_epi8 (0); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); +- +- mask = __m128i_shift_right (mask0, offset); ++ maskz = _mm_cmpeq_epi8 (mask0, zero); + + /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16 - offset) +- { +- /* There is no NULL terminator. */ +- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); +- int index = _mm_cmpistri (mask1, mask1, 0x3a); +- length += index; +- +- /* Don't use SSE4.2 if the length of A > 16. */ +- if (length > 16) +- return __strspn_sse2 (s, a); +- +- if (index != 0) +- { +- /* Combine mask0 and mask1. We could play games with +- palignr, but frankly this data should be in L1 now +- so do the merge via an unaligned load. */ +- mask = _mm_loadu_si128 ((__m128i *) a); +- } +- } ++ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; ++ if (maskz_bits != 0) ++ { ++ mask = __m128i_shift_right (mask0, offset); ++ offset = (unsigned int) ((size_t) s & 15); ++ if (offset) ++ goto start_unaligned; ++ ++ aligned = s; ++ goto start_loop; ++ } + } +- else +- { +- /* A is aligned. */ +- mask = _mm_load_si128 ((__m128i *) a); + +- /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16) +- { +- /* There is no NULL terminator. Don't use SSE4.2 if the length +- of A > 16. 
*/ +- if (a[16] != 0) +- return __strspn_sse2 (s, a); +- } ++ /* A is aligned. */ ++ mask = _mm_loadu_si128 ((__m128i *) a); ++ ++ /* Find where the NULL terminator is. */ ++ maskz = _mm_cmpeq_epi8 (mask, zero); ++ maskz_bits = _mm_movemask_epi8 (maskz); ++ if (maskz_bits == 0) ++ { ++ /* There is no NULL terminator. Don't use SSE4.2 if the length ++ of A > 16. */ ++ if (a[16] != 0) ++ return __strspn_sse2 (s, a); + } ++ aligned = s; ++ offset = (unsigned int) ((size_t) s & 15); + +- offset = (int) ((size_t) s & 15); + if (offset != 0) + { ++ start_unaligned: + /* Check partial string. */ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); ++ __m128i adj_value = __m128i_shift_right (value, offset); + +- value = __m128i_shift_right (value, offset); +- +- int length = _mm_cmpistri (mask, value, 0x12); ++ unsigned int length = _mm_cmpistri (mask, adj_value, 0x12); + /* No need to check CFlag since it is always 1. */ + if (length < 16 - offset) + return length; + /* Find where the NULL terminator is. */ +- int index = _mm_cmpistri (value, value, 0x3a); +- if (index < 16 - offset) ++ maskz = _mm_cmpeq_epi8 (value, zero); ++ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; ++ if (maskz_bits != 0) + return length; + aligned += 16; + } +- else +- aligned = s; + ++start_loop: + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); +- int index = _mm_cmpistri (mask, value, 0x12); +- int cflag = _mm_cmpistrc (mask, value, 0x12); ++ unsigned int index = _mm_cmpistri (mask, value, 0x12); ++ unsigned int cflag = _mm_cmpistrc (mask, value, 0x12); + if (cflag) + return (size_t) (aligned + index - s); + aligned += 16; +-- +GitLab + diff --git a/SOURCES/ia-opt-strxcasecmp-avx2.patch b/SOURCES/ia-opt-strxcasecmp-avx2.patch new file mode 100644 index 0000000..27cca42 --- /dev/null +++ b/SOURCES/ia-opt-strxcasecmp-avx2.patch @@ -0,0 +1,760 @@ +From 92783628b724089230e9b4ecab872de807652efe Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 24 Mar 2022 18:56:12 -0500 +Subject: [PATCH] x86: Add AVX2 optimized str{n}casecmp + +geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702 + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu + +(cherry picked from commit bbf81222343fed5cd704001a2ae0d86c71544151) +--- + sysdeps/x86_64/multiarch/Makefile | 4 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 +++ + sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 12 + + .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S | 15 ++ + sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S | 23 ++ + sysdeps/x86_64/multiarch/strcmp-avx2.S | 237 +++++++++++++++--- + .../x86_64/multiarch/strncase_l-avx2-rtm.S | 16 ++ + sysdeps/x86_64/multiarch/strncase_l-avx2.S | 27 ++ + 8 files changed, 331 insertions(+), 31 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S + create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 8c9e7812..711ecf2e 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -51,6 +51,8 @@ sysdep_routines += \ + stpncpy-sse2-unaligned \ + stpncpy-ssse3 \ + strcasecmp_l-avx \ ++ strcasecmp_l-avx2 \ ++ strcasecmp_l-avx2-rtm \ + strcasecmp_l-sse2 \ + strcasecmp_l-sse4_2 \ + strcasecmp_l-ssse3 \ +@@ -89,6 +91,8 @@ sysdep_routines += \ + strlen-evex \ + strlen-sse2 \ + strncase_l-avx \ ++ strncase_l-avx2 \ ++ strncase_l-avx2-rtm \ + strncase_l-sse2 \ + strncase_l-sse4_2 \ + strncase_l-ssse3 \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index c963d391..d873e1be 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -418,6 +418,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ + IFUNC_IMPL (i, name, strcasecmp, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strcasecmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcasecmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX), + __strcasecmp_avx) +@@ -431,6 +438,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ + IFUNC_IMPL (i, name, strcasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strcasecmp_l_avx2) ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcasecmp_l_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + CPU_FEATURE_USABLE (AVX), + __strcasecmp_l_avx) +@@ -558,6 +572,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strncasecmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncasecmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX), + __strncasecmp_avx) +@@ -572,6 +593,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. 
*/ + IFUNC_IMPL (i, name, strncasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strncasecmp_l_avx2) ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncasecmp_l_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + CPU_FEATURE_USABLE (AVX), + __strncasecmp_l_avx) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +index 6a4bb078..926508c4 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } ++ + if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) + return OPTIMIZE (avx); + +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S +new file mode 100644 +index 00000000..09957fc3 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S +@@ -0,0 +1,15 @@ ++#ifndef STRCMP ++# define STRCMP __strcasecmp_l_avx2_rtm ++#endif ++ ++#define _GLABEL(x) x ## _rtm ++#define GLABEL(x) _GLABEL(x) ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strcasecmp_l-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S +new file mode 100644 +index 00000000..e2762f2a +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S +@@ -0,0 +1,23 @@ ++/* strcasecmp_l optimized with AVX2. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#ifndef STRCMP ++# define STRCMP __strcasecmp_l_avx2 ++#endif ++#define USE_AS_STRCASECMP_L ++#include "strcmp-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 782f9472..28cc98b6 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -20,6 +20,10 @@ + + # include + ++# if defined USE_AS_STRCASECMP_L ++# include "locale-defines.h" ++# endif ++ + # ifndef STRCMP + # define STRCMP __strcmp_avx2 + # endif +@@ -74,13 +78,88 @@ + # define VEC_OFFSET (-VEC_SIZE) + # endif + ++# ifdef USE_AS_STRCASECMP_L ++# define BYTE_LOOP_REG OFFSET_REG ++# else ++# define BYTE_LOOP_REG ecx ++# endif ++ ++# ifdef USE_AS_STRCASECMP_L ++# ifdef USE_AS_STRNCMP ++# define STRCASECMP __strncasecmp_avx2 ++# define LOCALE_REG rcx ++# define LOCALE_REG_LP RCX_LP ++# define STRCASECMP_NONASCII __strncasecmp_l_nonascii ++# else ++# define STRCASECMP __strcasecmp_avx2 ++# define LOCALE_REG rdx ++# define LOCALE_REG_LP RDX_LP ++# define STRCASECMP_NONASCII __strcasecmp_l_nonascii ++# endif ++# endif ++ + # define xmmZERO xmm15 + # define ymmZERO ymm15 + ++# define LCASE_MIN_ymm %ymm10 ++# define LCASE_MAX_ymm %ymm11 ++# define CASE_ADD_ymm %ymm12 ++ ++# define LCASE_MIN_xmm %xmm10 ++# define LCASE_MAX_xmm %xmm11 ++# define CASE_ADD_xmm %xmm12 ++ ++ /* r11 is never use elsewhere so this is safe to maintain. */ ++# define TOLOWER_BASE %r11 ++ + # ifndef SECTION + # define SECTION(p) p##.avx + # endif + ++# ifdef USE_AS_STRCASECMP_L ++# define REG(x, y) x ## y ++# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \ ++ vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \ ++ vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \ ++ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \ ++ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \ ++ vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \ ++ vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \ ++ vpaddb REG(%ext, 8), reg1_in, reg1_out; \ ++ vpaddb REG(%ext, 9), reg2_in, reg2_out ++ ++# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst ++# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm) ++# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm) ++ ++# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \ ++ TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \ ++ VPCMPEQ scratch_reg, s2_reg, reg_out ++ ++# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \ ++ VMOVU s2_mem, reg_out; \ ++ CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext) ++ ++# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm) ++# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm) ++ ++# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm) ++# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm) ++ ++# else ++# define TOLOWER_gpr(...) ++# define TOLOWER_ymm(...) ++# define TOLOWER_xmm(...) ++ ++# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \ ++ VPCMPEQ s2_reg, s1_reg, reg_out ++ ++# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__) ++ ++# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__) ++# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__) ++# endif ++ + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. + strcmp/strncmp have to use UNSIGNED comparison for elements. +@@ -102,8 +181,49 @@ + returned. 
*/ + + .section SECTION(.text), "ax", @progbits +-ENTRY(STRCMP) ++ .align 16 ++ .type STRCMP, @function ++ .globl STRCMP ++ .hidden STRCMP ++ ++# ifndef GLABEL ++# define GLABEL(...) __VA_ARGS__ ++# endif ++ ++# ifdef USE_AS_STRCASECMP_L ++ENTRY (GLABEL(STRCASECMP)) ++ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax ++ mov %fs:(%rax), %LOCALE_REG_LP ++ ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 ++END (GLABEL(STRCASECMP)) ++ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */ ++# endif ++ ++ .p2align 4 ++STRCMP: ++ cfi_startproc ++ _CET_ENDBR ++ CALL_MCOUNT ++ ++# if defined USE_AS_STRCASECMP_L ++ /* We have to fall back on the C implementation for locales with ++ encodings not matching ASCII for single bytes. */ ++# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 ++ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP ++# else ++ mov (%LOCALE_REG), %RAX_LP ++# endif ++ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) ++ jne STRCASECMP_NONASCII ++ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE ++# endif ++ + # ifdef USE_AS_STRNCMP ++ /* Don't overwrite LOCALE_REG (rcx) until we have pass ++ L(one_or_less). Otherwise we might use the wrong locale in ++ the OVERFLOW_STRCMP (strcasecmp_l). */ + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -128,6 +248,30 @@ ENTRY(STRCMP) + # endif + # endif + vpxor %xmmZERO, %xmmZERO, %xmmZERO ++# if defined USE_AS_STRCASECMP_L ++ .section .rodata.cst32, "aM", @progbits, 32 ++ .align 32 ++L(lcase_min): ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++L(lcase_max): ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++L(case_add): ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .previous ++ ++ vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm ++ vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm ++ vmovdqa L(case_add)(%rip), CASE_ADD_ymm ++# endif + movl %edi, %eax + orl %esi, %eax + sall $20, %eax +@@ -138,8 +282,10 @@ ENTRY(STRCMP) + L(no_page_cross): + /* Safe to compare 4x vectors. */ + VMOVU (%rdi), %ymm0 +- /* 1s where s1 and s2 equal. */ +- VPCMPEQ (%rsi), %ymm0, %ymm1 ++ /* 1s where s1 and s2 equal. Just VPCMPEQ if its not strcasecmp. ++ Otherwise converts ymm0 and load from rsi to lower. ymm2 is ++ scratch and ymm1 is the return. */ ++ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) + /* 1s at null CHAR. */ + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + /* 1s where s1 and s2 equal AND not null CHAR. */ +@@ -172,6 +318,8 @@ L(return_vec_0): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret0): +@@ -192,6 +340,10 @@ L(ret_zero): + + .p2align 4,, 5 + L(one_or_less): ++# ifdef USE_AS_STRCASECMP_L ++ /* Set locale argument for strcasecmp. 
*/ ++ movq %LOCALE_REG, %rdx ++# endif + jb L(ret_zero) + # ifdef USE_AS_WCSCMP + /* 'nbe' covers the case where length is negative (large +@@ -211,6 +363,8 @@ L(one_or_less): + jnbe __strcmp_avx2 + movzbl (%rdi), %eax + movzbl (%rsi), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret1): +@@ -238,6 +392,8 @@ L(return_vec_1): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret2): +@@ -269,6 +425,8 @@ L(return_vec_2): + # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret3): +@@ -289,6 +447,8 @@ L(return_vec_3): + # else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret4): +@@ -299,7 +459,7 @@ L(ret4): + L(more_3x_vec): + /* Safe to compare 4x vectors. */ + VMOVU VEC_SIZE(%rdi), %ymm0 +- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -312,7 +472,7 @@ L(more_3x_vec): + # endif + + VMOVU (VEC_SIZE * 2)(%rdi), %ymm0 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -320,7 +480,7 @@ L(more_3x_vec): + jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %ymm0 +- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -395,12 +555,10 @@ L(loop_skip_page_cross_check): + VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 + + /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */ +- VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1 +- +- VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 +- ++ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1) ++ CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3) ++ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) ++ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) + + /* If any mismatches or null CHAR then 0 CHAR, otherwise non- + zero. 
*/ +@@ -469,6 +627,8 @@ L(return_vec_2_3_end): + # else + movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax + movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -512,6 +672,8 @@ L(return_vec_0_end): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -534,6 +696,8 @@ L(return_vec_1_end): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -560,6 +724,8 @@ L(return_vec_2_end): + # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -587,7 +753,7 @@ L(page_cross_during_loop): + jle L(less_1x_vec_till_page_cross) + + VMOVA (%rdi), %ymm0 +- VPCMPEQ (%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -609,7 +775,7 @@ L(less_1x_vec_till_page_cross): + here, it means the previous page (rdi - VEC_SIZE) has already + been loaded earlier so must be valid. */ + VMOVU -VEC_SIZE(%rdi, %rax), %ymm0 +- VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -651,6 +817,8 @@ L(return_page_cross_cmp_mem): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -677,7 +845,7 @@ L(more_2x_vec_till_page_cross): + iteration here. */ + + VMOVU VEC_SIZE(%rdi), %ymm0 +- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -693,7 +861,7 @@ L(more_2x_vec_till_page_cross): + + /* Safe to include comparisons from lower bytes. 
*/ + VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0 +- VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -701,7 +869,7 @@ L(more_2x_vec_till_page_cross): + jnz L(return_vec_page_cross_0) + + VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0 +- VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -719,8 +887,8 @@ L(more_2x_vec_till_page_cross): + VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 + VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 + +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) ++ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) + vpand %ymm4, %ymm5, %ymm5 + vpand %ymm6, %ymm7, %ymm7 + VPMINU %ymm5, %ymm7, %ymm7 +@@ -771,6 +939,8 @@ L(return_vec_page_cross_1): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -826,7 +996,7 @@ L(page_cross): + L(page_cross_loop): + + VMOVU (%rdi, %OFFSET_REG64), %ymm0 +- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -844,11 +1014,11 @@ L(page_cross_loop): + subl %eax, %OFFSET_REG + /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed + to not cross page so is safe to load. Since we have already +- loaded at least 1 VEC from rsi it is also guranteed to be safe. +- */ ++ loaded at least 1 VEC from rsi it is also guranteed to be ++ safe. 
*/ + + VMOVU (%rdi, %OFFSET_REG64), %ymm0 +- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -881,6 +1051,8 @@ L(ret_vec_page_cross_cont): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -934,7 +1106,7 @@ L(less_1x_vec_till_page): + ja L(less_16_till_page) + + VMOVU (%rdi), %xmm0 +- VPCMPEQ (%rsi), %xmm0, %xmm1 ++ CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1) + VPCMPEQ %xmm0, %xmmZERO, %xmm2 + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx +@@ -952,7 +1124,7 @@ L(less_1x_vec_till_page): + # endif + + VMOVU (%rdi, %OFFSET_REG64), %xmm0 +- VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1 ++ CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1) + VPCMPEQ %xmm0, %xmmZERO, %xmm2 + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx +@@ -990,7 +1162,7 @@ L(less_16_till_page): + vmovq (%rdi), %xmm0 + vmovq (%rsi), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + incb %cl +@@ -1010,7 +1182,7 @@ L(less_16_till_page): + vmovq (%rdi, %OFFSET_REG64), %xmm0 + vmovq (%rsi, %OFFSET_REG64), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + incb %cl +@@ -1066,7 +1238,7 @@ L(ret_less_8_wcs): + vmovd (%rdi), %xmm0 + vmovd (%rsi), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + subl $0xf, %ecx +@@ -1085,7 +1257,7 @@ L(ret_less_8_wcs): + vmovd (%rdi, %OFFSET_REG64), %xmm0 + vmovd (%rsi, %OFFSET_REG64), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + subl $0xf, %ecx +@@ -1119,7 +1291,9 @@ L(less_4_till_page): + L(less_4_loop): + movzbl (%rdi), %eax + movzbl (%rsi, %rdi), %ecx +- subl %ecx, %eax ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) ++ subl %BYTE_LOOP_REG, %eax + jnz L(ret_less_4_loop) + testl %ecx, %ecx + jz L(ret_zero_4_loop) +@@ -1146,5 +1320,6 @@ L(ret_less_4_loop): + subl %r8d, %eax + ret + # endif +-END(STRCMP) ++ cfi_endproc ++ .size STRCMP, .-STRCMP + #endif +diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S +new file mode 100644 +index 00000000..58c05dcf +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S +@@ -0,0 +1,16 @@ ++#ifndef STRCMP ++# define STRCMP __strncasecmp_l_avx2_rtm ++#endif ++ ++#define _GLABEL(x) x ## _rtm ++#define GLABEL(x) _GLABEL(x) ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++#define OVERFLOW_STRCMP __strcasecmp_l_avx2_rtm ++ ++#include "strncase_l-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S +new file mode 100644 +index 00000000..48c0aa21 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S +@@ -0,0 +1,27 @@ ++/* strncasecmp_l optimized with AVX2. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. 
++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef STRCMP ++# define STRCMP __strncasecmp_l_avx2 ++#endif ++#define USE_AS_STRCASECMP_L ++#define USE_AS_STRNCMP ++#ifndef OVERFLOW_STRCMP ++# define OVERFLOW_STRCMP __strcasecmp_l_avx2 ++#endif ++#include "strcmp-avx2.S" +-- +GitLab + diff --git a/SOURCES/ia-opt-strxcasecmp-evex.patch b/SOURCES/ia-opt-strxcasecmp-evex.patch new file mode 100644 index 0000000..f60685f --- /dev/null +++ b/SOURCES/ia-opt-strxcasecmp-evex.patch @@ -0,0 +1,815 @@ +From 01a9cf0e384dc66504f88663cef26f52925d4c50 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 24 Mar 2022 18:56:13 -0500 +Subject: [PATCH] x86: Add EVEX optimized str{n}casecmp + +geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621 + +All string/memory tests pass. +Reviewed-by: H.J. Lu + +(cherry picked from commit 84e7c46df4086873eae28a1fb87d2cf5388b1e16) +--- + sysdeps/x86_64/multiarch/Makefile | 2 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 + + sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 5 + + sysdeps/x86_64/multiarch/strcasecmp_l-evex.S | 23 ++ + sysdeps/x86_64/multiarch/strcmp-evex.S | 290 ++++++++++++++++--- + sysdeps/x86_64/multiarch/strncase_l-evex.S | 25 ++ + 6 files changed, 321 insertions(+), 40 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 711ecf2e..359712c1 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -53,6 +53,7 @@ sysdep_routines += \ + strcasecmp_l-avx \ + strcasecmp_l-avx2 \ + strcasecmp_l-avx2-rtm \ ++ strcasecmp_l-evex \ + strcasecmp_l-sse2 \ + strcasecmp_l-sse4_2 \ + strcasecmp_l-ssse3 \ +@@ -93,6 +94,7 @@ sysdep_routines += \ + strncase_l-avx \ + strncase_l-avx2 \ + strncase_l-avx2-rtm \ ++ strncase_l-evex \ + strncase_l-sse2 \ + strncase_l-sse4_2 \ + strncase_l-ssse3 \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index d873e1be..1dedc637 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -418,6 +418,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ + IFUNC_IMPL (i, name, strcasecmp, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strcasecmp_evex) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX2), + __strcasecmp_avx2) +@@ -438,6 +442,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. 
*/ + IFUNC_IMPL (i, name, strcasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strcasecmp_l_evex) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX2), + __strcasecmp_l_avx2) +@@ -572,6 +580,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncasecmp_evex) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX2), + __strncasecmp_avx2) +@@ -593,6 +605,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncasecmp_l_evex) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX2), + __strncasecmp_l_avx2) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +index 926508c4..6dd49a21 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) +@@ -34,6 +35,10 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ return OPTIMIZE (evex); ++ + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S +new file mode 100644 +index 00000000..58642db7 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S +@@ -0,0 +1,23 @@ ++/* strcasecmp_l optimized with EVEX. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#ifndef STRCMP ++# define STRCMP __strcasecmp_l_evex ++#endif ++#define USE_AS_STRCASECMP_L ++#include "strcmp-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 0dfa62bd..b81b5775 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -19,6 +19,9 @@ + #if IS_IN (libc) + + # include ++# if defined USE_AS_STRCASECMP_L ++# include "locale-defines.h" ++# endif + + # ifndef STRCMP + # define STRCMP __strcmp_evex +@@ -34,19 +37,29 @@ + # define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSCMP +-# define TESTEQ subl $0xff, ++# ifndef OVERFLOW_STRCMP ++# define OVERFLOW_STRCMP __wcscmp_evex ++# endif ++ ++# define TESTEQ subl $0xff, + /* Compare packed dwords. */ + # define VPCMP vpcmpd + # define VPMINU vpminud + # define VPTESTM vptestmd ++# define VPTESTNM vptestnmd + /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else ++# ifndef OVERFLOW_STRCMP ++# define OVERFLOW_STRCMP __strcmp_evex ++# endif ++ + # define TESTEQ incl + /* Compare packed bytes. */ + # define VPCMP vpcmpb + # define VPMINU vpminub + # define VPTESTM vptestmb ++# define VPTESTNM vptestnmb + /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif +@@ -73,11 +86,16 @@ + # define VEC_OFFSET (-VEC_SIZE) + # endif + +-# define XMMZERO xmm16 + # define XMM0 xmm17 + # define XMM1 xmm18 + +-# define YMMZERO ymm16 ++# define XMM10 xmm27 ++# define XMM11 xmm28 ++# define XMM12 xmm29 ++# define XMM13 xmm30 ++# define XMM14 xmm31 ++ ++ + # define YMM0 ymm17 + # define YMM1 ymm18 + # define YMM2 ymm19 +@@ -89,6 +107,87 @@ + # define YMM8 ymm25 + # define YMM9 ymm26 + # define YMM10 ymm27 ++# define YMM11 ymm28 ++# define YMM12 ymm29 ++# define YMM13 ymm30 ++# define YMM14 ymm31 ++ ++# ifdef USE_AS_STRCASECMP_L ++# define BYTE_LOOP_REG OFFSET_REG ++# else ++# define BYTE_LOOP_REG ecx ++# endif ++ ++# ifdef USE_AS_STRCASECMP_L ++# ifdef USE_AS_STRNCMP ++# define STRCASECMP __strncasecmp_evex ++# define LOCALE_REG rcx ++# define LOCALE_REG_LP RCX_LP ++# define STRCASECMP_NONASCII __strncasecmp_l_nonascii ++# else ++# define STRCASECMP __strcasecmp_evex ++# define LOCALE_REG rdx ++# define LOCALE_REG_LP RDX_LP ++# define STRCASECMP_NONASCII __strcasecmp_l_nonascii ++# endif ++# endif ++ ++# define LCASE_MIN_YMM %YMM12 ++# define LCASE_MAX_YMM %YMM13 ++# define CASE_ADD_YMM %YMM14 ++ ++# define LCASE_MIN_XMM %XMM12 ++# define LCASE_MAX_XMM %XMM13 ++# define CASE_ADD_XMM %XMM14 ++ ++ /* NB: wcsncmp uses r11 but strcasecmp is never used in ++ conjunction with wcscmp. */ ++# define TOLOWER_BASE %r11 ++ ++# ifdef USE_AS_STRCASECMP_L ++# define _REG(x, y) x ## y ++# define REG(x, y) _REG(x, y) ++# define TOLOWER(reg1, reg2, ext) \ ++ vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \ ++ vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \ ++ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \ ++ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \ ++ vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \ ++ vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6} ++ ++# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst ++# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM) ++# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM) ++ ++# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \ ++ TOLOWER (s1_reg, s2_reg, ext); \ ++ VPCMP $0, s1_reg, s2_reg, reg_out ++ ++# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \ ++ VMOVU s2_mem, s2_reg; \ ++ CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) ++ ++# define CMP_R1_R2_YMM(...) 
CMP_R1_R2(__VA_ARGS__, YMM) ++# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM) ++ ++# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM) ++# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM) ++ ++# else ++# define TOLOWER_gpr(...) ++# define TOLOWER_YMM(...) ++# define TOLOWER_XMM(...) ++ ++# define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \ ++ VPCMP $0, s2_reg, s1_reg, reg_out ++ ++# define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__) ++ ++# define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \ ++ VPCMP $0, s2_mem, s1_reg, reg_out ++ ++# define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__) ++# endif + + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. +@@ -112,8 +211,45 @@ + returned. */ + + .section .text.evex, "ax", @progbits +-ENTRY(STRCMP) ++ .align 16 ++ .type STRCMP, @function ++ .globl STRCMP ++ .hidden STRCMP ++ ++# ifdef USE_AS_STRCASECMP_L ++ENTRY (STRCASECMP) ++ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax ++ mov %fs:(%rax), %LOCALE_REG_LP ++ ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 ++END (STRCASECMP) ++ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */ ++# endif ++ ++ .p2align 4 ++STRCMP: ++ cfi_startproc ++ _CET_ENDBR ++ CALL_MCOUNT ++ ++# if defined USE_AS_STRCASECMP_L ++ /* We have to fall back on the C implementation for locales with ++ encodings not matching ASCII for single bytes. */ ++# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 ++ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP ++# else ++ mov (%LOCALE_REG), %RAX_LP ++# endif ++ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) ++ jne STRCASECMP_NONASCII ++ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE ++# endif ++ + # ifdef USE_AS_STRNCMP ++ /* Don't overwrite LOCALE_REG (rcx) until we have pass ++ L(one_or_less). Otherwise we might use the wrong locale in ++ the OVERFLOW_STRCMP (strcasecmp_l). */ + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -125,6 +261,32 @@ ENTRY(STRCMP) + actually bound the buffer. */ + jle L(one_or_less) + # endif ++ ++# if defined USE_AS_STRCASECMP_L ++ .section .rodata.cst32, "aM", @progbits, 32 ++ .align 32 ++L(lcase_min): ++ .quad 0x4141414141414141 ++ .quad 0x4141414141414141 ++ .quad 0x4141414141414141 ++ .quad 0x4141414141414141 ++L(lcase_max): ++ .quad 0x1a1a1a1a1a1a1a1a ++ .quad 0x1a1a1a1a1a1a1a1a ++ .quad 0x1a1a1a1a1a1a1a1a ++ .quad 0x1a1a1a1a1a1a1a1a ++L(case_add): ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .previous ++ ++ vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM ++ vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM ++ vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM ++# endif ++ + movl %edi, %eax + orl %esi, %eax + /* Shift out the bits irrelivant to page boundary ([63:12]). */ +@@ -139,7 +301,7 @@ L(no_page_cross): + VPTESTM %YMM0, %YMM0, %k2 + /* Each bit cleared in K1 represents a mismatch or a null CHAR + in YMM0 and 32 bytes at (%rsi). */ +- VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_STRNCMP + cmpq $CHAR_PER_VEC, %rdx +@@ -169,6 +331,8 @@ L(return_vec_0): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret0): +@@ -188,11 +352,15 @@ L(ret_zero): + + .p2align 4,, 5 + L(one_or_less): ++# ifdef USE_AS_STRCASECMP_L ++ /* Set locale argument for strcasecmp. 
*/ ++ movq %LOCALE_REG, %rdx ++# endif + jb L(ret_zero) +-# ifdef USE_AS_WCSCMP + /* 'nbe' covers the case where length is negative (large + unsigned). */ +- jnbe __wcscmp_evex ++ jnbe OVERFLOW_STRCMP ++# ifdef USE_AS_WCSCMP + movl (%rdi), %edx + xorl %eax, %eax + cmpl (%rsi), %edx +@@ -201,11 +369,10 @@ L(one_or_less): + negl %eax + orl $1, %eax + # else +- /* 'nbe' covers the case where length is negative (large +- unsigned). */ +- jnbe __strcmp_evex + movzbl (%rdi), %eax + movzbl (%rsi), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret1): +@@ -233,6 +400,8 @@ L(return_vec_1): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret2): +@@ -270,6 +439,8 @@ L(return_vec_2): + # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret3): +@@ -290,6 +461,8 @@ L(return_vec_3): + # else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret4): +@@ -303,7 +476,7 @@ L(more_3x_vec): + /* Safe to compare 4x vectors. */ + VMOVU (VEC_SIZE)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_1) +@@ -315,14 +488,14 @@ L(more_3x_vec): + + VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_3) +@@ -381,7 +554,6 @@ L(prepare_loop_aligned): + subl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + +- vpxorq %YMMZERO, %YMMZERO, %YMMZERO + + /* Loop 4x comparisons at a time. */ + .p2align 4 +@@ -413,22 +585,35 @@ L(loop_skip_page_cross_check): + /* A zero CHAR in YMM9 means that there is a null CHAR. */ + VPMINU %YMM8, %YMM9, %YMM9 + +- /* Each bit set in K1 represents a non-null CHAR in YMM8. */ ++ /* Each bit set in K1 represents a non-null CHAR in YMM9. */ + VPTESTM %YMM9, %YMM9, %k1 +- ++# ifndef USE_AS_STRCASECMP_L + vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1 + vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3 + vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 + /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while + oring with YMM1. Result is stored in YMM6. */ + vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6 +- ++# else ++ VMOVU (VEC_SIZE * 0)(%rsi), %YMM1 ++ TOLOWER_YMM (%YMM0, %YMM1) ++ VMOVU (VEC_SIZE * 1)(%rsi), %YMM3 ++ TOLOWER_YMM (%YMM2, %YMM3) ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5 ++ TOLOWER_YMM (%YMM4, %YMM5) ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7 ++ TOLOWER_YMM (%YMM6, %YMM7) ++ vpxorq %YMM0, %YMM1, %YMM1 ++ vpxorq %YMM2, %YMM3, %YMM3 ++ vpxorq %YMM4, %YMM5, %YMM5 ++ vpternlogd $0xde, %YMM7, %YMM1, %YMM6 ++# endif + /* Or together YMM3, YMM5, and YMM6. */ + vpternlogd $0xfe, %YMM3, %YMM5, %YMM6 + + + /* A non-zero CHAR in YMM6 represents a mismatch. 
*/ +- VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ VPTESTNM %YMM6, %YMM6, %k0{%k1} + kmovd %k0, %LOOP_REG + + TESTEQ %LOOP_REG +@@ -437,13 +622,13 @@ L(loop_skip_page_cross_check): + + /* Find which VEC has the mismatch of end of string. */ + VPTESTM %YMM0, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} ++ VPTESTNM %YMM1, %YMM1, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + jnz L(return_vec_0_end) + + VPTESTM %YMM2, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} ++ VPTESTNM %YMM3, %YMM3, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + jnz L(return_vec_1_end) +@@ -457,7 +642,7 @@ L(return_vec_2_3_end): + # endif + + VPTESTM %YMM4, %YMM4, %k1 +- VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} ++ VPTESTNM %YMM5, %YMM5, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + # if CHAR_PER_VEC <= 16 +@@ -493,6 +678,8 @@ L(return_vec_3_end): + # else + movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax + movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -545,6 +732,8 @@ L(return_vec_0_end): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + /* Flip `eax` if `rdi` and `rsi` where swapped in page cross + logic. Subtract `r8d` after xor for zero case. */ +@@ -569,6 +758,8 @@ L(return_vec_1_end): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -598,7 +789,7 @@ L(page_cross_during_loop): + + VMOVA (%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_0_end) +@@ -619,8 +810,7 @@ L(less_1x_vec_till_page_cross): + been loaded earlier so must be valid. */ + VMOVU -VEC_SIZE(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2} +- ++ CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2} + /* Mask of potentially valid bits. The lower bits can be out of + range comparisons (but safe regarding page crosses). */ + +@@ -642,6 +832,8 @@ L(less_1x_vec_till_page_cross): + + # ifdef USE_AS_STRNCMP + # ifdef USE_AS_WCSCMP ++ /* NB: strcasecmp not used with WCSCMP so this access to r11 is ++ safe. */ + movl %eax, %r11d + shrl $2, %r11d + cmpq %r11, %rdx +@@ -679,6 +871,8 @@ L(return_page_cross_cmp_mem): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -709,7 +903,7 @@ L(more_2x_vec_till_page_cross): + + VMOVA VEC_SIZE(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_1_end) +@@ -724,14 +918,14 @@ L(more_2x_vec_till_page_cross): + /* Safe to include comparisons from lower bytes. 
*/ + VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_page_cross_0) + + VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_page_cross_1) +@@ -740,6 +934,8 @@ L(more_2x_vec_till_page_cross): + /* Must check length here as length might proclude reading next + page. */ + # ifdef USE_AS_WCSCMP ++ /* NB: strcasecmp not used with WCSCMP so this access to r11 is ++ safe. */ + movl %eax, %r11d + shrl $2, %r11d + cmpq %r11, %rdx +@@ -754,12 +950,19 @@ L(more_2x_vec_till_page_cross): + VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 + VPMINU %YMM4, %YMM6, %YMM9 + VPTESTM %YMM9, %YMM9, %k1 +- ++# ifndef USE_AS_STRCASECMP_L + vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 + /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */ + vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6 +- +- VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++# else ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5 ++ TOLOWER_YMM (%YMM4, %YMM5) ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7 ++ TOLOWER_YMM (%YMM6, %YMM7) ++ vpxorq %YMM4, %YMM5, %YMM5 ++ vpternlogd $0xde, %YMM7, %YMM5, %YMM6 ++# endif ++ VPTESTNM %YMM6, %YMM6, %k0{%k1} + kmovd %k0, %LOOP_REG + TESTEQ %LOOP_REG + jnz L(return_vec_2_3_end) +@@ -815,6 +1018,8 @@ L(return_vec_page_cross_1): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -871,7 +1076,7 @@ L(page_cross): + L(page_cross_loop): + VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(check_ret_vec_page_cross) +@@ -895,7 +1100,7 @@ L(page_cross_loop): + */ + VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2} + + kmovd %k1, %ecx + # ifdef USE_AS_STRNCMP +@@ -930,6 +1135,8 @@ L(ret_vec_page_cross_cont): + # else + movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax + movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -989,7 +1196,7 @@ L(less_1x_vec_till_page): + /* Use 16 byte comparison. 
*/ + vmovdqu (%rdi), %xmm0 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, (%rsi), %xmm0, %k1{%k2} ++ CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0xf, %ecx +@@ -1009,7 +1216,7 @@ L(less_1x_vec_till_page): + # endif + vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2} ++ CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0xf, %ecx +@@ -1048,7 +1255,7 @@ L(less_16_till_page): + vmovq (%rdi), %xmm0 + vmovq (%rsi), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0x3, %ecx +@@ -1068,7 +1275,7 @@ L(less_16_till_page): + vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0x3, %ecx +@@ -1128,7 +1335,7 @@ L(ret_less_8_wcs): + vmovd (%rdi), %xmm0 + vmovd (%rsi), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + subl $0xf, %ecx + jnz L(check_ret_vec_page_cross) +@@ -1143,7 +1350,7 @@ L(ret_less_8_wcs): + vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + subl $0xf, %ecx + jnz L(check_ret_vec_page_cross) +@@ -1176,7 +1383,9 @@ L(less_4_till_page): + L(less_4_loop): + movzbl (%rdi), %eax + movzbl (%rsi, %rdi), %ecx +- subl %ecx, %eax ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) ++ subl %BYTE_LOOP_REG, %eax + jnz L(ret_less_4_loop) + testl %ecx, %ecx + jz L(ret_zero_4_loop) +@@ -1203,5 +1412,6 @@ L(ret_less_4_loop): + subl %r8d, %eax + ret + # endif +-END(STRCMP) ++ cfi_endproc ++ .size STRCMP, .-STRCMP + #endif +diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S +new file mode 100644 +index 00000000..8a5af369 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S +@@ -0,0 +1,25 @@ ++/* strncasecmp_l optimized with EVEX. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#ifndef STRCMP ++# define STRCMP __strncasecmp_l_evex ++#endif ++#define OVERFLOW_STRCMP __strcasecmp_l_evex ++#define USE_AS_STRCASECMP_L ++#define USE_AS_STRNCMP ++#include "strcmp-evex.S" +-- +GitLab + diff --git a/SOURCES/ia-opt-strxcasecmp-srtcmp-sse42.patch b/SOURCES/ia-opt-strxcasecmp-srtcmp-sse42.patch new file mode 100644 index 0000000..cadff3e --- /dev/null +++ b/SOURCES/ia-opt-strxcasecmp-srtcmp-sse42.patch @@ -0,0 +1,144 @@ +From 371154789e234ff53a97adfc92355a3871f66847 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:38 -0500 +Subject: [PATCH] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S + +Slightly faster method of doing TOLOWER that saves an +instruction. + +Also replace the hard coded 5-byte no with .p2align 4. On builds with +CET enabled this misaligned entry to strcasecmp. + +geometric_mean(N=40) of all benchmarks New / Original: .920 + +All string/memory tests pass. +Reviewed-by: H.J. Lu + +(cherry picked from commit d154758e618ec9324f5d339c46db0aa27e8b1226) +--- + sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++-------------- + 1 file changed, 35 insertions(+), 48 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index d8fdeb3a..59e8ddfc 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -89,9 +89,8 @@ ENTRY (GLABEL(__strcasecmp)) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END (GLABEL(__strcasecmp)) + /* FALLTHROUGH to strcasecmp_l. */ + #endif +@@ -100,9 +99,8 @@ ENTRY (GLABEL(__strncasecmp)) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END (GLABEL(__strncasecmp)) + /* FALLTHROUGH to strncasecmp_l. 
*/ + #endif +@@ -170,27 +168,22 @@ STRCMP_SSE42: + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +-LABEL(belowupper): +- .quad 0x4040404040404040 +- .quad 0x4040404040404040 +-LABEL(topupper): +-# ifdef USE_AVX +- .quad 0x5a5a5a5a5a5a5a5a +- .quad 0x5a5a5a5a5a5a5a5a +-# else +- .quad 0x5b5b5b5b5b5b5b5b +- .quad 0x5b5b5b5b5b5b5b5b +-# endif +-LABEL(touppermask): ++LABEL(lcase_min): ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++LABEL(lcase_max): ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++LABEL(case_add): + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous +- movdqa LABEL(belowupper)(%rip), %xmm4 +-# define UCLOW_reg %xmm4 +- movdqa LABEL(topupper)(%rip), %xmm5 +-# define UCHIGH_reg %xmm5 +- movdqa LABEL(touppermask)(%rip), %xmm6 +-# define LCQWORD_reg %xmm6 ++ movdqa LABEL(lcase_min)(%rip), %xmm4 ++# define LCASE_MIN_reg %xmm4 ++ movdqa LABEL(lcase_max)(%rip), %xmm5 ++# define LCASE_MAX_reg %xmm5 ++ movdqa LABEL(case_add)(%rip), %xmm6 ++# define CASE_ADD_reg %xmm6 + #endif + cmp $0x30, %ecx + ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ +@@ -201,32 +194,26 @@ LABEL(touppermask): + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + # ifdef USE_AVX + # define TOLOWER(reg1, reg2) \ +- vpcmpgtb UCLOW_reg, reg1, %xmm7; \ +- vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ +- vpcmpgtb UCLOW_reg, reg2, %xmm9; \ +- vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ +- vpandn %xmm7, %xmm8, %xmm8; \ +- vpandn %xmm9, %xmm10, %xmm10; \ +- vpand LCQWORD_reg, %xmm8, %xmm8; \ +- vpand LCQWORD_reg, %xmm10, %xmm10; \ +- vpor reg1, %xmm8, reg1; \ +- vpor reg2, %xmm10, reg2 ++ vpaddb LCASE_MIN_reg, reg1, %xmm7; \ ++ vpaddb LCASE_MIN_reg, reg2, %xmm8; \ ++ vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \ ++ vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \ ++ vpandn CASE_ADD_reg, %xmm7, %xmm7; \ ++ vpandn CASE_ADD_reg, %xmm8, %xmm8; \ ++ vpaddb %xmm7, reg1, reg1; \ ++ vpaddb %xmm8, reg2, reg2 + # else + # define TOLOWER(reg1, reg2) \ +- movdqa reg1, %xmm7; \ +- movdqa UCHIGH_reg, %xmm8; \ +- movdqa reg2, %xmm9; \ +- movdqa UCHIGH_reg, %xmm10; \ +- pcmpgtb UCLOW_reg, %xmm7; \ +- pcmpgtb reg1, %xmm8; \ +- pcmpgtb UCLOW_reg, %xmm9; \ +- pcmpgtb reg2, %xmm10; \ +- pand %xmm8, %xmm7; \ +- pand %xmm10, %xmm9; \ +- pand LCQWORD_reg, %xmm7; \ +- pand LCQWORD_reg, %xmm9; \ +- por %xmm7, reg1; \ +- por %xmm9, reg2 ++ movdqa LCASE_MIN_reg, %xmm7; \ ++ movdqa LCASE_MIN_reg, %xmm8; \ ++ paddb reg1, %xmm7; \ ++ paddb reg2, %xmm8; \ ++ pcmpgtb LCASE_MAX_reg, %xmm7; \ ++ pcmpgtb LCASE_MAX_reg, %xmm8; \ ++ pandn CASE_ADD_reg, %xmm7; \ ++ pandn CASE_ADD_reg, %xmm8; \ ++ paddb %xmm7, reg1; \ ++ paddb %xmm8, reg2 + # endif + TOLOWER (%xmm1, %xmm2) + #else +-- +GitLab + diff --git a/SOURCES/ia-opt-strxcasecmp-srtcmp.patch b/SOURCES/ia-opt-strxcasecmp-srtcmp.patch new file mode 100644 index 0000000..f9fa6fa --- /dev/null +++ b/SOURCES/ia-opt-strxcasecmp-srtcmp.patch @@ -0,0 +1,123 @@ +From 017773f93b0e41f3b164e5db86d0c7b7f75675e9 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:36 -0500 +Subject: [PATCH] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S + +Slightly faster method of doing TOLOWER that saves an +instruction. + +Also replace the hard coded 5-byte no with .p2align 4. On builds with +CET enabled this misaligned entry to strcasecmp. + +geometric_mean(N=40) of all benchmarks New / Original: .894 + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu + +(cherry picked from commit 670b54bc585ea4a94f3b2e9272ba44aa6b730b73) +--- + sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++---------------------- + 1 file changed, 29 insertions(+), 35 deletions(-) + +diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S +index aa6df898..f454ce5b 100644 +--- a/sysdeps/x86_64/strcmp.S ++++ b/sysdeps/x86_64/strcmp.S +@@ -78,9 +78,8 @@ ENTRY2 (__strcasecmp) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END2 (__strcasecmp) + # ifndef NO_NOLOCALE_ALIAS + weak_alias (__strcasecmp, strcasecmp) +@@ -97,9 +96,8 @@ ENTRY2 (__strncasecmp) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END2 (__strncasecmp) + # ifndef NO_NOLOCALE_ALIAS + weak_alias (__strncasecmp, strncasecmp) +@@ -149,22 +147,22 @@ ENTRY (STRCMP) + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +-.Lbelowupper: +- .quad 0x4040404040404040 +- .quad 0x4040404040404040 +-.Ltopupper: +- .quad 0x5b5b5b5b5b5b5b5b +- .quad 0x5b5b5b5b5b5b5b5b +-.Ltouppermask: ++.Llcase_min: ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++.Llcase_max: ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++.Lcase_add: + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous +- movdqa .Lbelowupper(%rip), %xmm5 +-# define UCLOW_reg %xmm5 +- movdqa .Ltopupper(%rip), %xmm6 +-# define UCHIGH_reg %xmm6 +- movdqa .Ltouppermask(%rip), %xmm7 +-# define LCQWORD_reg %xmm7 ++ movdqa .Llcase_min(%rip), %xmm5 ++# define LCASE_MIN_reg %xmm5 ++ movdqa .Llcase_max(%rip), %xmm6 ++# define LCASE_MAX_reg %xmm6 ++ movdqa .Lcase_add(%rip), %xmm7 ++# define CASE_ADD_reg %xmm7 + #endif + cmp $0x30, %ecx + ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ +@@ -175,22 +173,18 @@ ENTRY (STRCMP) + movhpd 8(%rdi), %xmm1 + movhpd 8(%rsi), %xmm2 + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +-# define TOLOWER(reg1, reg2) \ +- movdqa reg1, %xmm8; \ +- movdqa UCHIGH_reg, %xmm9; \ +- movdqa reg2, %xmm10; \ +- movdqa UCHIGH_reg, %xmm11; \ +- pcmpgtb UCLOW_reg, %xmm8; \ +- pcmpgtb reg1, %xmm9; \ +- pcmpgtb UCLOW_reg, %xmm10; \ +- pcmpgtb reg2, %xmm11; \ +- pand %xmm9, %xmm8; \ +- pand %xmm11, %xmm10; \ +- pand LCQWORD_reg, %xmm8; \ +- pand LCQWORD_reg, %xmm10; \ +- por %xmm8, reg1; \ +- por %xmm10, reg2 +- TOLOWER (%xmm1, %xmm2) ++# define TOLOWER(reg1, reg2) \ ++ movdqa LCASE_MIN_reg, %xmm8; \ ++ movdqa LCASE_MIN_reg, %xmm9; \ ++ paddb reg1, %xmm8; \ ++ paddb reg2, %xmm9; \ ++ pcmpgtb LCASE_MAX_reg, %xmm8; \ ++ pcmpgtb LCASE_MAX_reg, %xmm9; \ ++ pandn CASE_ADD_reg, %xmm8; \ ++ pandn CASE_ADD_reg, %xmm9; \ ++ paddb %xmm8, reg1; \ ++ paddb %xmm9, reg2 ++ TOLOWER (%xmm1, %xmm2) + #else + # define TOLOWER(reg1, reg2) + #endif +-- +GitLab + diff --git a/SOURCES/ia-rmv-bcopy-opt.patch b/SOURCES/ia-rmv-bcopy-opt.patch new file mode 100644 index 0000000..9a01168 --- /dev/null +++ b/SOURCES/ia-rmv-bcopy-opt.patch @@ -0,0 +1,30 @@ +From 14483cebfb833ff520e921f46b78d53f46a86df0 Mon Sep 17 00:00:00 2001 +From: Adhemerval Zanella +Date: Thu, 10 Feb 2022 11:23:24 -0300 +Subject: [PATCH] x86_64: Remove bcopy optimizations + +The 
symbols is not present in current POSIX specification and compiler +already generates memmove call. + +(cherry picked from commit bf92893a14ebc161b08b28acc24fa06ae6be19cb) +--- + sysdeps/x86_64/multiarch/bcopy.S | 7 ------- + 1 file changed, 7 deletions(-) + delete mode 100644 sysdeps/x86_64/multiarch/bcopy.S + +diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S +deleted file mode 100644 +index 639f02bd..00000000 +--- a/sysdeps/x86_64/multiarch/bcopy.S ++++ /dev/null +@@ -1,7 +0,0 @@ +-#include +- +- .text +-ENTRY(bcopy) +- xchg %rdi, %rsi +- jmp __libc_memmove /* Branch to IFUNC memmove. */ +-END(bcopy) +-- +GitLab + diff --git a/SOURCES/ia-rmv-memcmp-sse4.patch b/SOURCES/ia-rmv-memcmp-sse4.patch new file mode 100644 index 0000000..91381c7 --- /dev/null +++ b/SOURCES/ia-rmv-memcmp-sse4.patch @@ -0,0 +1,965 @@ +From f5078f5cabb6e330506c2cad6ad89476438aafcb Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 15 Apr 2022 12:28:00 -0500 +Subject: [PATCH] x86: Remove memcmp-sse4.S + +Code didn't actually use any sse4 instructions since `ptest` was +removed in: + +commit 2f9062d7171850451e6044ef78d91ff8c017b9c0 +Author: Noah Goldstein +Date: Wed Nov 10 16:18:56 2021 -0600 + + x86: Shrink memcmp-sse4.S code size + +The new memcmp-sse2 implementation is also faster. + +geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905 + +Note there are two regressions preferring SSE2 for Size = 1 and Size = +65. + +Size = 1: +size, align0, align1, ret, New Time/Old Time + 1, 1, 1, 0, 1.2 + 1, 1, 1, 1, 1.197 + 1, 1, 1, -1, 1.2 + +This is intentional. Size == 1 is significantly less hot based on +profiles of GCC11 and Python3 than sizes [4, 8] (which is made +hotter). + +Python3 Size = 1 -> 13.64% +Python3 Size = [4, 8] -> 60.92% + +GCC11 Size = 1 -> 1.29% +GCC11 Size = [4, 8] -> 33.86% + +size, align0, align1, ret, New Time/Old Time + 4, 4, 4, 0, 0.622 + 4, 4, 4, 1, 0.797 + 4, 4, 4, -1, 0.805 + 5, 5, 5, 0, 0.623 + 5, 5, 5, 1, 0.777 + 5, 5, 5, -1, 0.802 + 6, 6, 6, 0, 0.625 + 6, 6, 6, 1, 0.813 + 6, 6, 6, -1, 0.788 + 7, 7, 7, 0, 0.625 + 7, 7, 7, 1, 0.799 + 7, 7, 7, -1, 0.795 + 8, 8, 8, 0, 0.625 + 8, 8, 8, 1, 0.848 + 8, 8, 8, -1, 0.914 + 9, 9, 9, 0, 0.625 + +Size = 65: +size, align0, align1, ret, New Time/Old Time + 65, 0, 0, 0, 1.103 + 65, 0, 0, 1, 1.216 + 65, 0, 0, -1, 1.227 + 65, 65, 0, 0, 1.091 + 65, 0, 65, 1, 1.19 + 65, 65, 65, -1, 1.215 + +This is because A) the checks in range [65, 96] are now unrolled 2x +and B) because smaller values <= 16 are now given a hotter path. By +contrast the SSE4 version has a branch for Size = 80. The unrolled +version has get better performance for returns which need both +comparisons. + +size, align0, align1, ret, New Time/Old Time + 128, 4, 8, 0, 0.858 + 128, 4, 8, 1, 0.879 + 128, 4, 8, -1, 0.888 + +As well, out of microbenchmark environments that are not full +predictable the branch will have a real-cost. +Reviewed-by: H.J. 
Lu + +(cherry picked from commit 7cbc03d03091d5664060924789afe46d30a5477e) +--- + sysdeps/x86_64/multiarch/Makefile | 2 - + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 - + sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 - + sysdeps/x86_64/multiarch/memcmp-sse4.S | 804 --------------------- + 4 files changed, 814 deletions(-) + delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index bca82e38..b503e4b8 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -11,7 +11,6 @@ sysdep_routines += \ + memcmp-avx2-movbe-rtm \ + memcmp-evex-movbe \ + memcmp-sse2 \ +- memcmp-sse4 \ + memcmp-ssse3 \ + memcpy-ssse3 \ + memcpy-ssse3-back \ +@@ -174,7 +173,6 @@ sysdep_routines += \ + wmemcmp-avx2-movbe-rtm \ + wmemcmp-c \ + wmemcmp-evex-movbe \ +- wmemcmp-sse4 \ + wmemcmp-ssse3 \ + # sysdep_routines + endif +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 14314367..450a2917 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -78,8 +78,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_evex_movbe) +- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), +- __memcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), + __memcmp_ssse3) + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) +@@ -824,8 +822,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_evex_movbe) +- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), +- __wmemcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), + __wmemcmp_ssse3) + IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +index 690dffe8..0bc47a7f 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +@@ -21,7 +21,6 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; +@@ -47,9 +46,6 @@ IFUNC_SELECTOR (void) + return OPTIMIZE (avx2_movbe); + } + +- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) +- return OPTIMIZE (sse4_1); +- + if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) + return OPTIMIZE (ssse3); + +diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S +deleted file mode 100644 +index 50060006..00000000 +--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S ++++ /dev/null +@@ -1,804 +0,0 @@ +-/* memcmp with SSE4.1, wmemcmp with SSE4.1 +- Copyright (C) 2010-2018 Free Software Foundation, Inc. +- Contributed by Intel Corporation. +- This file is part of the GNU C Library. 
+- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#if IS_IN (libc) +- +-# include +- +-# ifndef MEMCMP +-# define MEMCMP __memcmp_sse4_1 +-# endif +- +-#ifdef USE_AS_WMEMCMP +-# define CMPEQ pcmpeqd +-# define CHAR_SIZE 4 +-#else +-# define CMPEQ pcmpeqb +-# define CHAR_SIZE 1 +-#endif +- +- +-/* Warning! +- wmemcmp has to use SIGNED comparison for elements. +- memcmp has to use UNSIGNED comparison for elemnts. +-*/ +- +- .section .text.sse4.1,"ax",@progbits +-ENTRY (MEMCMP) +-# ifdef USE_AS_WMEMCMP +- shl $2, %RDX_LP +-# elif defined __ILP32__ +- /* Clear the upper 32 bits. */ +- mov %edx, %edx +-# endif +- cmp $79, %RDX_LP +- ja L(79bytesormore) +- +- cmp $CHAR_SIZE, %RDX_LP +- jbe L(firstbyte) +- +- /* N in (CHAR_SIZE, 79) bytes. */ +- cmpl $32, %edx +- ja L(more_32_bytes) +- +- cmpl $16, %edx +- jae L(16_to_32_bytes) +- +-# ifndef USE_AS_WMEMCMP +- cmpl $8, %edx +- jae L(8_to_16_bytes) +- +- cmpl $4, %edx +- jb L(2_to_3_bytes) +- +- movl (%rdi), %eax +- movl (%rsi), %ecx +- +- bswap %eax +- bswap %ecx +- +- shlq $32, %rax +- shlq $32, %rcx +- +- movl -4(%rdi, %rdx), %edi +- movl -4(%rsi, %rdx), %esi +- +- bswap %edi +- bswap %esi +- +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- cmovne %edx, %eax +- sbbl %ecx, %ecx +- orl %ecx, %eax +- ret +- +- .p2align 4,, 8 +-L(2_to_3_bytes): +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- subl %ecx, %eax +- ret +- +- .p2align 4,, 8 +-L(8_to_16_bytes): +- movq (%rdi), %rax +- movq (%rsi), %rcx +- +- bswap %rax +- bswap %rcx +- +- subq %rcx, %rax +- jne L(8_to_16_bytes_done) +- +- movq -8(%rdi, %rdx), %rax +- movq -8(%rsi, %rdx), %rcx +- +- bswap %rax +- bswap %rcx +- +- subq %rcx, %rax +- +-L(8_to_16_bytes_done): +- cmovne %edx, %eax +- sbbl %ecx, %ecx +- orl %ecx, %eax +- ret +-# else +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(8_to_16_bytes_done) +- movl 4(%rdi), %ecx +- cmpl 4(%rsi), %ecx +- jne L(8_to_16_bytes_done) +- movl -4(%rdi, %rdx), %ecx +- cmpl -4(%rsi, %rdx), %ecx +- jne L(8_to_16_bytes_done) +- ret +-# endif +- +- .p2align 4,, 3 +-L(ret_zero): +- xorl %eax, %eax +-L(zero): +- ret +- +- .p2align 4,, 8 +-L(firstbyte): +- jb L(ret_zero) +-# ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- je L(zero) +-L(8_to_16_bytes_done): +- setg %al +- leal -1(%rax, %rax), %eax +-# else +- movzbl (%rdi), %eax +- movzbl (%rsi), %ecx +- sub %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(vec_return_begin_48): +- addq $16, %rdi +- addq $16, %rsi +-L(vec_return_begin_32): +- bsfl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl 32(%rdi, %rax), %ecx +- xorl %edx, %edx +- cmpl 32(%rsi, %rax), %ecx +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl 32(%rsi, %rax), %ecx +- movzbl 32(%rdi, %rax), %eax +- subl %ecx, %eax 
+-# endif +- ret +- +- .p2align 4 +-L(vec_return_begin_16): +- addq $16, %rdi +- addq $16, %rsi +-L(vec_return_begin): +- bsfl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rax), %ecx +- xorl %edx, %edx +- cmpl (%rsi, %rax), %ecx +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rsi, %rax), %ecx +- movzbl (%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(vec_return_end_16): +- subl $16, %edx +-L(vec_return_end): +- bsfl %eax, %eax +- addl %edx, %eax +-# ifdef USE_AS_WMEMCMP +- movl -16(%rdi, %rax), %ecx +- xorl %edx, %edx +- cmpl -16(%rsi, %rax), %ecx +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl -16(%rsi, %rax), %ecx +- movzbl -16(%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4,, 8 +-L(more_32_bytes): +- movdqu (%rdi), %xmm0 +- movdqu (%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm0 +- movdqu 16(%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- cmpl $64, %edx +- jbe L(32_to_64_bytes) +- movdqu 32(%rdi), %xmm0 +- movdqu 32(%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- .p2align 4,, 6 +-L(32_to_64_bytes): +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(16_to_32_bytes): +- movdqu (%rdi), %xmm0 +- movdqu (%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- +- .p2align 4 +-L(79bytesormore): +- movdqu (%rdi), %xmm0 +- movdqu (%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- +- mov %rsi, %rcx +- and $-16, %rsi +- add $16, %rsi +- sub %rsi, %rcx +- +- sub %rcx, %rdi +- add %rcx, %rdx +- test $0xf, %rdi +- jz L(2aligned) +- +- cmp $128, %rdx +- ja L(128bytesormore) +- +- .p2align 4,, 6 +-L(less128bytes): +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqu 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- cmp $96, %rdx +- jb L(32_to_64_bytes) +- +- addq $64, %rdi +- addq $64, %rsi +- subq $64, %rdx +- +- .p2align 4,, 6 +-L(last_64_bytes): +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(128bytesormore): +- cmp 
$256, %rdx +- ja L(unaligned_loop) +-L(less256bytes): +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqu 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $64, %rdi +- addq $64, %rsi +- +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqu 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $-128, %rdx +- subq $-64, %rsi +- subq $-64, %rdi +- +- cmp $64, %rdx +- ja L(less128bytes) +- +- cmp $32, %rdx +- ja L(last_64_bytes) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(unaligned_loop): +-# ifdef DATA_CACHE_SIZE_HALF +- mov $DATA_CACHE_SIZE_HALF, %R8_LP +-# else +- mov __x86_data_cache_size_half(%rip), %R8_LP +-# endif +- movq %r8, %r9 +- addq %r8, %r8 +- addq %r9, %r8 +- cmpq %r8, %rdx +- ja L(L2_L3_cache_unaligned) +- sub $64, %rdx +- .p2align 4 +-L(64bytesormore_loop): +- movdqu (%rdi), %xmm0 +- movdqu 16(%rdi), %xmm1 +- movdqu 32(%rdi), %xmm2 +- movdqu 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- ja L(64bytesormore_loop) +- +- .p2align 4,, 6 +-L(loop_tail): +- addq %rdx, %rdi +- movdqu (%rdi), %xmm0 +- movdqu 16(%rdi), %xmm1 +- movdqu 32(%rdi), %xmm2 +- movdqu 48(%rdi), %xmm3 +- +- addq %rdx, %rsi +- movdqu (%rsi), %xmm4 +- movdqu 16(%rsi), %xmm5 +- movdqu 32(%rsi), %xmm6 +- movdqu 48(%rsi), %xmm7 +- +- CMPEQ %xmm4, %xmm0 +- CMPEQ %xmm5, %xmm1 +- CMPEQ %xmm6, %xmm2 +- CMPEQ %xmm7, %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- ret +- +-L(L2_L3_cache_unaligned): +- subq $64, %rdx +- .p2align 4 +-L(L2_L3_unaligned_128bytes_loop): +- prefetchnta 0x1c0(%rdi) +- prefetchnta 0x1c0(%rsi) +- +- movdqu (%rdi), %xmm0 +- movdqu 16(%rdi), %xmm1 +- movdqu 32(%rdi), %xmm2 +- movdqu 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- ja L(L2_L3_unaligned_128bytes_loop) +- jmp L(loop_tail) +- +- +- /* This case is for machines which are sensitive for unaligned +- * instructions. 
*/ +- .p2align 4 +-L(2aligned): +- cmp $128, %rdx +- ja L(128bytesormorein2aligned) +-L(less128bytesin2aligned): +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqa 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqa 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- cmp $96, %rdx +- jb L(32_to_64_bytes) +- +- addq $64, %rdi +- addq $64, %rsi +- subq $64, %rdx +- +- .p2align 4,, 6 +-L(aligned_last_64_bytes): +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(128bytesormorein2aligned): +- cmp $256, %rdx +- ja L(aligned_loop) +-L(less256bytesin2alinged): +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqa 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqa 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $64, %rdi +- addq $64, %rsi +- +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqa 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqa 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $-128, %rdx +- subq $-64, %rsi +- subq $-64, %rdi +- +- cmp $64, %rdx +- ja L(less128bytesin2aligned) +- +- cmp $32, %rdx +- ja L(aligned_last_64_bytes) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(aligned_loop): +-# ifdef DATA_CACHE_SIZE_HALF +- mov $DATA_CACHE_SIZE_HALF, %R8_LP +-# else +- mov __x86_data_cache_size_half(%rip), %R8_LP +-# endif +- movq %r8, %r9 +- addq %r8, %r8 +- addq %r9, %r8 +- cmpq %r8, %rdx +- ja L(L2_L3_cache_aligned) +- +- sub $64, %rdx +- .p2align 4 +-L(64bytesormore_loopin2aligned): +- movdqa (%rdi), %xmm0 +- movdqa 16(%rdi), %xmm1 +- movdqa 32(%rdi), %xmm2 +- movdqa 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- add $64, %rsi +- 
add $64, %rdi +- sub $64, %rdx +- ja L(64bytesormore_loopin2aligned) +- jmp L(loop_tail) +- +-L(L2_L3_cache_aligned): +- subq $64, %rdx +- .p2align 4 +-L(L2_L3_aligned_128bytes_loop): +- prefetchnta 0x1c0(%rdi) +- prefetchnta 0x1c0(%rsi) +- movdqa (%rdi), %xmm0 +- movdqa 16(%rdi), %xmm1 +- movdqa 32(%rdi), %xmm2 +- movdqa 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- +- addq $64, %rsi +- addq $64, %rdi +- subq $64, %rdx +- ja L(L2_L3_aligned_128bytes_loop) +- jmp L(loop_tail) +- +- .p2align 4 +-L(64bytesormore_loop_end): +- pmovmskb %xmm0, %ecx +- incw %cx +- jnz L(loop_end_ret) +- +- pmovmskb %xmm1, %ecx +- notw %cx +- sall $16, %ecx +- jnz L(loop_end_ret) +- +- pmovmskb %xmm2, %ecx +- notw %cx +- shlq $32, %rcx +- jnz L(loop_end_ret) +- +- addq $48, %rdi +- addq $48, %rsi +- movq %rax, %rcx +- +- .p2align 4,, 6 +-L(loop_end_ret): +- bsfq %rcx, %rcx +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rcx), %eax +- xorl %edx, %edx +- cmpl (%rsi, %rcx), %eax +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rdi, %rcx), %eax +- movzbl (%rsi, %rcx), %ecx +- subl %ecx, %eax +-# endif +- ret +-END (MEMCMP) +-#endif +-- +GitLab + diff --git a/SOURCES/ia-rmv-set-memset-vec-unaligned-erms.patch b/SOURCES/ia-rmv-set-memset-vec-unaligned-erms.patch new file mode 100644 index 0000000..86e8c06 --- /dev/null +++ b/SOURCES/ia-rmv-set-memset-vec-unaligned-erms.patch @@ -0,0 +1,34 @@ +From 3149dab571d1739a507ef8185af0d72d269b5ee3 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sat, 12 Feb 2022 00:45:00 -0600 +Subject: [PATCH] x86: Set .text section in memset-vec-unaligned-erms + +commit 3d9f171bfb5325bd5f427e9fc386453358c6e840 +Author: H.J. Lu +Date: Mon Feb 7 05:55:15 2022 -0800 + + x86-64: Optimize bzero + +Remove setting the .text section for the code. This commit +adds that back. + +(cherry picked from commit 7912236f4a597deb092650ca79f33504ddb4af28) +--- + sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 06f5f5d7..4fb475c0 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -114,6 +114,7 @@ + # error SECTION is not defined! + #endif + ++ .section SECTION(.text), "ax", @progbits + #if IS_IN (libc) + # if defined SHARED + ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) +-- +GitLab + diff --git a/SOURCES/ia-rmv-ssse3_inst-memset.patch b/SOURCES/ia-rmv-ssse3_inst-memset.patch new file mode 100644 index 0000000..5288574 --- /dev/null +++ b/SOURCES/ia-rmv-ssse3_inst-memset.patch @@ -0,0 +1,41 @@ +From 4412c21a9027bbe6546b2a329a741d26c2477136 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 7 Feb 2022 00:32:23 -0600 +Subject: [PATCH] x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 + Only) + +commit b62ace2740a106222e124cc86956448fa07abf4d +Author: Noah Goldstein +Date: Sun Feb 6 00:54:18 2022 -0600 + + x86: Improve vec generation in memset-vec-unaligned-erms.S + +Revert usage of 'pshufb' in broadcast logic as it is an SSSE3 +instruction and memset.S is restricted to only SSE2 instructions. 
+ +(cherry picked from commit 1b0c60f95bbe2eded80b2bb5be75c0e45b11cde1) +--- + sysdeps/x86_64/memset.S | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 27debd2b..4cb4aa71 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -30,9 +30,10 @@ + + # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- pxor %xmm1, %xmm1; \ +- pshufb %xmm1, %xmm0; \ +- movq r, %rax ++ movq r, %rax; \ ++ punpcklbw %xmm0, %xmm0; \ ++ punpcklwd %xmm0, %xmm0; \ ++ pshufd $0, %xmm0, %xmm0 + + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +-- +GitLab + diff --git a/SOURCES/ia-rmv-strcspn-sse2.patch b/SOURCES/ia-rmv-strcspn-sse2.patch new file mode 100644 index 0000000..7292b95 --- /dev/null +++ b/SOURCES/ia-rmv-strcspn-sse2.patch @@ -0,0 +1,172 @@ +From bb034f8ae84535c1263032311594f229fd3ad1a9 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:26 -0500 +Subject: [PATCH] x86: Remove strcspn-sse2.S and use the generic implementation + +The generic implementation is faster. + +geometric_mean(N=20) of all benchmarks New / Original: .678 + +All string/memory tests pass. +Reviewed-by: H.J. Lu + +(cherry picked from commit fe28e7d9d9535ebab4081d195c553b4fbf39d9ae) +--- + .../{strcspn-sse2.S => strcspn-sse2.c} | 6 +- + sysdeps/x86_64/strcspn.S | 122 ------------------ + 2 files changed, 3 insertions(+), 125 deletions(-) + rename sysdeps/x86_64/multiarch/{strcspn-sse2.S => strcspn-sse2.c} (89%) + delete mode 100644 sysdeps/x86_64/strcspn.S + +diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.S b/sysdeps/x86_64/multiarch/strcspn-sse2.c +similarity index 89% +rename from sysdeps/x86_64/multiarch/strcspn-sse2.S +rename to sysdeps/x86_64/multiarch/strcspn-sse2.c +index 8a0c69d7..32debee4 100644 +--- a/sysdeps/x86_64/multiarch/strcspn-sse2.S ++++ b/sysdeps/x86_64/multiarch/strcspn-sse2.c +@@ -19,10 +19,10 @@ + #if IS_IN (libc) + + # include +-# define strcspn __strcspn_sse2 ++# define STRCSPN __strcspn_sse2 + + # undef libc_hidden_builtin_def +-# define libc_hidden_builtin_def(strcspn) ++# define libc_hidden_builtin_def(STRCSPN) + #endif + +-#include ++#include +diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S +deleted file mode 100644 +index 7f9202d6..00000000 +--- a/sysdeps/x86_64/strcspn.S ++++ /dev/null +@@ -1,122 +0,0 @@ +-/* strcspn (str, ss) -- Return the length of the initial segment of STR +- which contains no characters from SS. +- For AMD x86-64. +- Copyright (C) 1994-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- Contributed by Ulrich Drepper . +- Bug fixes by Alan Modra . +- Adopted for x86-64 by Andreas Jaeger . +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#include +-#include "asm-syntax.h" +- +- .text +-ENTRY (strcspn) +- +- movq %rdi, %rdx /* Save SRC. 
*/ +- +- /* First we create a table with flags for all possible characters. +- For the ASCII (7bit/8bit) or ISO-8859-X character sets which are +- supported by the C string functions we have 256 characters. +- Before inserting marks for the stop characters we clear the whole +- table. */ +- movq %rdi, %r8 /* Save value. */ +- subq $256, %rsp /* Make space for 256 bytes. */ +- cfi_adjust_cfa_offset(256) +- movl $32, %ecx /* 32*8 bytes = 256 bytes. */ +- movq %rsp, %rdi +- xorl %eax, %eax /* We store 0s. */ +- cld +- rep +- stosq +- +- movq %rsi, %rax /* Setup skipset. */ +- +-/* For understanding the following code remember that %rcx == 0 now. +- Although all the following instruction only modify %cl we always +- have a correct zero-extended 64-bit value in %rcx. */ +- +- .p2align 4 +-L(2): movb (%rax), %cl /* get byte from skipset */ +- testb %cl, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- +- movb 1(%rax), %cl /* get byte from skipset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- +- movb 2(%rax), %cl /* get byte from skipset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- +- movb 3(%rax), %cl /* get byte from skipset */ +- addq $4, %rax /* increment skipset pointer */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- testb $0xff, %cl /* is NUL char? */ +- jnz L(2) /* no => process next dword from skipset */ +- +-L(1): leaq -4(%rdx), %rax /* prepare loop */ +- +- /* We use a neat trick for the following loop. Normally we would +- have to test for two termination conditions +- 1. a character in the skipset was found +- and +- 2. the end of the string was found +- But as a sign that the character is in the skipset we store its +- value in the table. But the value of NUL is NUL so the loop +- terminates for NUL in every case. */ +- +- .p2align 4 +-L(3): addq $4, %rax /* adjust pointer for full loop round */ +- +- movb (%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- je L(4) /* yes => return */ +- +- movb 1(%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- je L(5) /* yes => return */ +- +- movb 2(%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(6) /* yes => return */ +- +- movb 3(%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jne L(3) /* no => start loop again */ +- +- incq %rax /* adjust pointer */ +-L(6): incq %rax +-L(5): incq %rax +- +-L(4): addq $256, %rsp /* remove skipset */ +- cfi_adjust_cfa_offset(-256) +-#ifdef USE_AS_STRPBRK +- xorl %edx,%edx +- orb %cl, %cl /* was last character NUL? 
*/ +- cmovzq %rdx, %rax /* Yes: return NULL */ +-#else +- subq %rdx, %rax /* we have to return the number of valid +- characters, so compute distance to first +- non-valid character */ +-#endif +- ret +-END (strcspn) +-libc_hidden_builtin_def (strcspn) +-- +GitLab + diff --git a/SOURCES/ia-rmv-strpbrk-sse2.patch b/SOURCES/ia-rmv-strpbrk-sse2.patch new file mode 100644 index 0000000..e180808 --- /dev/null +++ b/SOURCES/ia-rmv-strpbrk-sse2.patch @@ -0,0 +1,52 @@ +From 1620feaf35e282954d1261dd2a7beaf09306659b Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:27 -0500 +Subject: [PATCH] x86: Remove strpbrk-sse2.S and use the generic implementation + +The generic implementation is faster (see strcspn commit). + +All string/memory tests pass. +Reviewed-by: H.J. Lu + +(cherry picked from commit 653358535280a599382cb6c77538a187dac6a87f) +--- + .../x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} | 7 +++---- + sysdeps/x86_64/strpbrk.S | 3 --- + 2 files changed, 3 insertions(+), 7 deletions(-) + rename sysdeps/x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} (87%) + delete mode 100644 sysdeps/x86_64/strpbrk.S + +diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.S b/sysdeps/x86_64/multiarch/strpbrk-sse2.c +similarity index 87% +rename from sysdeps/x86_64/multiarch/strpbrk-sse2.S +rename to sysdeps/x86_64/multiarch/strpbrk-sse2.c +index 3c6a74db..ec0b6fda 100644 +--- a/sysdeps/x86_64/multiarch/strpbrk-sse2.S ++++ b/sysdeps/x86_64/multiarch/strpbrk-sse2.c +@@ -19,11 +19,10 @@ + #if IS_IN (libc) + + # include +-# define strcspn __strpbrk_sse2 ++# define STRPBRK __strpbrk_sse2 + + # undef libc_hidden_builtin_def +-# define libc_hidden_builtin_def(strpbrk) ++# define libc_hidden_builtin_def(STRPBRK) + #endif + +-#define USE_AS_STRPBRK +-#include ++#include +diff --git a/sysdeps/x86_64/strpbrk.S b/sysdeps/x86_64/strpbrk.S +deleted file mode 100644 +index 21888a5b..00000000 +--- a/sysdeps/x86_64/strpbrk.S ++++ /dev/null +@@ -1,3 +0,0 @@ +-#define strcspn strpbrk +-#define USE_AS_STRPBRK +-#include +-- +GitLab + diff --git a/SOURCES/ia-rmv-strspn-sse2.patch b/SOURCES/ia-rmv-strspn-sse2.patch new file mode 100644 index 0000000..a6c90c1 --- /dev/null +++ b/SOURCES/ia-rmv-strspn-sse2.patch @@ -0,0 +1,165 @@ +From 9356e90aa423ef4335404da233617ee85c3a05e4 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:29 -0500 +Subject: [PATCH] x86: Remove strspn-sse2.S and use the generic implementation + +The generic implementation is faster. + +geometric_mean(N=20) of all benchmarks New / Original: .710 + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu + +(cherry picked from commit 9c8a6ad620b49a27120ecdd7049c26bf05900397) +--- + .../{strspn-sse2.S => strspn-sse2.c} | 6 +- + sysdeps/x86_64/strspn.S | 115 ------------------ + 2 files changed, 3 insertions(+), 118 deletions(-) + rename sysdeps/x86_64/multiarch/{strspn-sse2.S => strspn-sse2.c} (89%) + delete mode 100644 sysdeps/x86_64/strspn.S + +diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.S b/sysdeps/x86_64/multiarch/strspn-sse2.c +similarity index 89% +rename from sysdeps/x86_64/multiarch/strspn-sse2.S +rename to sysdeps/x86_64/multiarch/strspn-sse2.c +index 4686cdd5..ab0dae40 100644 +--- a/sysdeps/x86_64/multiarch/strspn-sse2.S ++++ b/sysdeps/x86_64/multiarch/strspn-sse2.c +@@ -19,10 +19,10 @@ + #if IS_IN (libc) + + # include +-# define strspn __strspn_sse2 ++# define STRSPN __strspn_sse2 + + # undef libc_hidden_builtin_def +-# define libc_hidden_builtin_def(strspn) ++# define libc_hidden_builtin_def(STRSPN) + #endif + +-#include ++#include +diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S +deleted file mode 100644 +index 635f1bc6..00000000 +--- a/sysdeps/x86_64/strspn.S ++++ /dev/null +@@ -1,115 +0,0 @@ +-/* strspn (str, ss) -- Return the length of the initial segment of STR +- which contains only characters from SS. +- For AMD x86-64. +- Copyright (C) 1994-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- Contributed by Ulrich Drepper . +- Bug fixes by Alan Modra . +- Adopted for x86-64 by Andreas Jaeger . +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#include +- +- .text +-ENTRY (strspn) +- +- movq %rdi, %rdx /* Save SRC. */ +- +- /* First we create a table with flags for all possible characters. +- For the ASCII (7bit/8bit) or ISO-8859-X character sets which are +- supported by the C string functions we have 256 characters. +- Before inserting marks for the stop characters we clear the whole +- table. */ +- movq %rdi, %r8 /* Save value. */ +- subq $256, %rsp /* Make space for 256 bytes. */ +- cfi_adjust_cfa_offset(256) +- movl $32, %ecx /* 32*8 bytes = 256 bytes. */ +- movq %rsp, %rdi +- xorl %eax, %eax /* We store 0s. */ +- cld +- rep +- stosq +- +- movq %rsi, %rax /* Setup stopset. */ +- +-/* For understanding the following code remember that %rcx == 0 now. +- Although all the following instruction only modify %cl we always +- have a correct zero-extended 64-bit value in %rcx. */ +- +- .p2align 4 +-L(2): movb (%rax), %cl /* get byte from stopset */ +- testb %cl, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- +- movb 1(%rax), %cl /* get byte from stopset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- +- movb 2(%rax), %cl /* get byte from stopset */ +- testb $0xff, %cl /* is NUL char? 
*/ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- +- movb 3(%rax), %cl /* get byte from stopset */ +- addq $4, %rax /* increment stopset pointer */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- testb $0xff, %cl /* is NUL char? */ +- jnz L(2) /* no => process next dword from stopset */ +- +-L(1): leaq -4(%rdx), %rax /* prepare loop */ +- +- /* We use a neat trick for the following loop. Normally we would +- have to test for two termination conditions +- 1. a character in the stopset was found +- and +- 2. the end of the string was found +- But as a sign that the character is in the stopset we store its +- value in the table. But the value of NUL is NUL so the loop +- terminates for NUL in every case. */ +- +- .p2align 4 +-L(3): addq $4, %rax /* adjust pointer for full loop round */ +- +- movb (%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(4) /* no => return */ +- +- movb 1(%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(5) /* no => return */ +- +- movb 2(%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(6) /* no => return */ +- +- movb 3(%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jnz L(3) /* yes => start loop again */ +- +- incq %rax /* adjust pointer */ +-L(6): incq %rax +-L(5): incq %rax +- +-L(4): addq $256, %rsp /* remove stopset */ +- cfi_adjust_cfa_offset(-256) +- subq %rdx, %rax /* we have to return the number of valid +- characters, so compute distance to first +- non-valid character */ +- ret +-END (strspn) +-libc_hidden_builtin_def (strspn) +-- +GitLab + diff --git a/SOURCES/ia-rmv-strxcasecmp-avx.patch b/SOURCES/ia-rmv-strxcasecmp-avx.patch new file mode 100644 index 0000000..ba3f140 --- /dev/null +++ b/SOURCES/ia-rmv-strxcasecmp-avx.patch @@ -0,0 +1,914 @@ +From ea4c320faffe618f70854985887c7ca08a1dcf4b Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:46 -0500 +Subject: [PATCH] x86: Remove AVX str{n}casecmp + +The rational is: + +1. SSE42 has nearly identical logic so any benefit is minimal (3.4% + regression on Tigerlake using SSE42 versus AVX across the + benchtest suite). +2. AVX2 version covers the majority of targets that previously + prefered it. +3. The targets where AVX would still be best (SnB and IVB) are + becoming outdated. + +All in all the saving the code size is worth it. + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu + +(cherry picked from commit 305769b2a15c2e96f9e1b5195d3c4e0d6f0f4b68) +--- + sysdeps/x86_64/multiarch/Makefile | 2 - + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 - + sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 - + sysdeps/x86_64/multiarch/strcasecmp_l-avx.S | 22 -- + sysdeps/x86_64/multiarch/strcmp-sse42.S | 240 +++++++++----------- + sysdeps/x86_64/multiarch/strncase_l-avx.S | 22 -- + 6 files changed, 105 insertions(+), 197 deletions(-) + delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx.S + delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 359712c1..bca82e38 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -50,7 +50,6 @@ sysdep_routines += \ + stpncpy-evex \ + stpncpy-sse2-unaligned \ + stpncpy-ssse3 \ +- strcasecmp_l-avx \ + strcasecmp_l-avx2 \ + strcasecmp_l-avx2-rtm \ + strcasecmp_l-evex \ +@@ -91,7 +90,6 @@ sysdep_routines += \ + strlen-avx2-rtm \ + strlen-evex \ + strlen-sse2 \ +- strncase_l-avx \ + strncase_l-avx2 \ + strncase_l-avx2-rtm \ + strncase_l-evex \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 1dedc637..14314367 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -429,9 +429,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcasecmp_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strcasecmp, +- CPU_FEATURE_USABLE (AVX), +- __strcasecmp_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (SSE4_2), + __strcasecmp_sse42) +@@ -453,9 +450,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcasecmp_l_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strcasecmp_l, +- CPU_FEATURE_USABLE (AVX), +- __strcasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + CPU_FEATURE_USABLE (SSE4_2), + __strcasecmp_l_sse42) +@@ -591,9 +585,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncasecmp_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strncasecmp, +- CPU_FEATURE_USABLE (AVX), +- __strncasecmp_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (SSE4_2), + __strncasecmp_sse42) +@@ -616,9 +607,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncasecmp_l_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strncasecmp_l, +- CPU_FEATURE_USABLE (AVX), +- __strncasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + CPU_FEATURE_USABLE (SSE4_2), + __strncasecmp_l_sse42) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +index 6dd49a21..34cfbb8f 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +@@ -22,7 +22,6 @@ + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + 
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; +@@ -46,9 +45,6 @@ IFUNC_SELECTOR (void) + return OPTIMIZE (avx2); + } + +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) +- return OPTIMIZE (avx); +- + if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2) + && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) + return OPTIMIZE (sse42); +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S +deleted file mode 100644 +index 56a03547..00000000 +--- a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S ++++ /dev/null +@@ -1,22 +0,0 @@ +-/* strcasecmp_l optimized with AVX. +- Copyright (C) 2017-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#define STRCMP_SSE42 __strcasecmp_l_avx +-#define USE_AVX 1 +-#define USE_AS_STRCASECMP_L +-#include "strcmp-sse42.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index 59e8ddfc..0a42b7a4 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -42,13 +42,8 @@ + # define UPDATE_STRNCMP_COUNTER + #endif + +-#ifdef USE_AVX +-# define SECTION avx +-# define GLABEL(l) l##_avx +-#else +-# define SECTION sse4.2 +-# define GLABEL(l) l##_sse42 +-#endif ++#define SECTION sse4.2 ++#define GLABEL(l) l##_sse42 + + #define LABEL(l) .L##l + +@@ -106,21 +101,7 @@ END (GLABEL(__strncasecmp)) + #endif + + +-#ifdef USE_AVX +-# define movdqa vmovdqa +-# define movdqu vmovdqu +-# define pmovmskb vpmovmskb +-# define pcmpistri vpcmpistri +-# define psubb vpsubb +-# define pcmpeqb vpcmpeqb +-# define psrldq vpsrldq +-# define pslldq vpslldq +-# define palignr vpalignr +-# define pxor vpxor +-# define D(arg) arg, arg +-#else +-# define D(arg) arg +-#endif ++#define arg arg + + STRCMP_SSE42: + cfi_startproc +@@ -192,18 +173,7 @@ LABEL(case_add): + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +-# ifdef USE_AVX +-# define TOLOWER(reg1, reg2) \ +- vpaddb LCASE_MIN_reg, reg1, %xmm7; \ +- vpaddb LCASE_MIN_reg, reg2, %xmm8; \ +- vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \ +- vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \ +- vpandn CASE_ADD_reg, %xmm7, %xmm7; \ +- vpandn CASE_ADD_reg, %xmm8, %xmm8; \ +- vpaddb %xmm7, reg1, reg1; \ +- vpaddb %xmm8, reg2, reg2 +-# else +-# define TOLOWER(reg1, reg2) \ ++# define TOLOWER(reg1, reg2) \ + movdqa LCASE_MIN_reg, %xmm7; \ + movdqa LCASE_MIN_reg, %xmm8; \ + paddb reg1, %xmm7; \ +@@ -214,15 +184,15 @@ LABEL(case_add): + pandn CASE_ADD_reg, %xmm8; \ + paddb %xmm7, reg1; \ + paddb %xmm8, reg2 +-# endif ++ + TOLOWER (%xmm1, %xmm2) + #else + # define TOLOWER(reg1, reg2) + #endif +- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */ +- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? 
*/ +- pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */ +- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ ++ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ ++ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ ++ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ ++ psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes)/* If not, find different value or null char */ +@@ -246,7 +216,7 @@ LABEL(crosscache): + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ +- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */ ++ pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) +@@ -260,7 +230,7 @@ LABEL(bigger): + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 +- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ ++ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + lea (%r10, %r9), %r10 + _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ + +@@ -273,15 +243,15 @@ LABEL(bigger): + LABEL(ashr_0): + + movdqa (%rsi), %xmm1 +- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ ++ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L +- pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */ ++ pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ + #else + movdqa (%rdi), %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */ ++ pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ + #endif +- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ ++ psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ +@@ -361,10 +331,10 @@ LABEL(ashr_0_exit_use): + */ + .p2align 4 + LABEL(ashr_1): +- pslldq $15, D(%xmm2) /* shift first string to align with second */ ++ pslldq $15, %xmm2 /* shift first string to align with second */ + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ +- psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/ ++ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ ++ psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ +@@ -392,7 +362,7 @@ LABEL(loop_ashr_1_use): + + LABEL(nibble_ashr_1_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $1, -16(%rdi, %rdx), D(%xmm0) ++ palignr $1, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -411,7 +381,7 @@ LABEL(nibble_ashr_1_restart_use): + jg LABEL(nibble_ashr_1_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $1, -16(%rdi, %rdx), D(%xmm0) ++ palignr $1, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -431,7 +401,7 @@ LABEL(nibble_ashr_1_restart_use): + LABEL(nibble_ashr_1_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $1, D(%xmm0) ++ psrldq $1, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -449,10 +419,10 @@ LABEL(nibble_ashr_1_use): + */ + .p2align 4 + 
LABEL(ashr_2): +- pslldq $14, D(%xmm2) ++ pslldq $14, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -480,7 +450,7 @@ LABEL(loop_ashr_2_use): + + LABEL(nibble_ashr_2_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $2, -16(%rdi, %rdx), D(%xmm0) ++ palignr $2, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -499,7 +469,7 @@ LABEL(nibble_ashr_2_restart_use): + jg LABEL(nibble_ashr_2_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $2, -16(%rdi, %rdx), D(%xmm0) ++ palignr $2, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -519,7 +489,7 @@ LABEL(nibble_ashr_2_restart_use): + LABEL(nibble_ashr_2_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $2, D(%xmm0) ++ psrldq $2, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -537,10 +507,10 @@ LABEL(nibble_ashr_2_use): + */ + .p2align 4 + LABEL(ashr_3): +- pslldq $13, D(%xmm2) ++ pslldq $13, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -568,7 +538,7 @@ LABEL(loop_ashr_3_use): + + LABEL(nibble_ashr_3_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $3, -16(%rdi, %rdx), D(%xmm0) ++ palignr $3, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -587,7 +557,7 @@ LABEL(nibble_ashr_3_restart_use): + jg LABEL(nibble_ashr_3_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $3, -16(%rdi, %rdx), D(%xmm0) ++ palignr $3, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -607,7 +577,7 @@ LABEL(nibble_ashr_3_restart_use): + LABEL(nibble_ashr_3_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $3, D(%xmm0) ++ psrldq $3, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -625,10 +595,10 @@ LABEL(nibble_ashr_3_use): + */ + .p2align 4 + LABEL(ashr_4): +- pslldq $12, D(%xmm2) ++ pslldq $12, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -657,7 +627,7 @@ LABEL(loop_ashr_4_use): + + LABEL(nibble_ashr_4_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $4, -16(%rdi, %rdx), D(%xmm0) ++ palignr $4, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -676,7 +646,7 @@ LABEL(nibble_ashr_4_restart_use): + jg LABEL(nibble_ashr_4_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $4, -16(%rdi, %rdx), D(%xmm0) ++ palignr $4, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -696,7 +666,7 @@ LABEL(nibble_ashr_4_restart_use): + LABEL(nibble_ashr_4_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $4, D(%xmm0) ++ psrldq $4, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ 
-714,10 +684,10 @@ LABEL(nibble_ashr_4_use): + */ + .p2align 4 + LABEL(ashr_5): +- pslldq $11, D(%xmm2) ++ pslldq $11, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -746,7 +716,7 @@ LABEL(loop_ashr_5_use): + + LABEL(nibble_ashr_5_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $5, -16(%rdi, %rdx), D(%xmm0) ++ palignr $5, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -766,7 +736,7 @@ LABEL(nibble_ashr_5_restart_use): + + movdqa (%rdi, %rdx), %xmm0 + +- palignr $5, -16(%rdi, %rdx), D(%xmm0) ++ palignr $5, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -786,7 +756,7 @@ LABEL(nibble_ashr_5_restart_use): + LABEL(nibble_ashr_5_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $5, D(%xmm0) ++ psrldq $5, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -804,10 +774,10 @@ LABEL(nibble_ashr_5_use): + */ + .p2align 4 + LABEL(ashr_6): +- pslldq $10, D(%xmm2) ++ pslldq $10, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -836,7 +806,7 @@ LABEL(loop_ashr_6_use): + + LABEL(nibble_ashr_6_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $6, -16(%rdi, %rdx), D(%xmm0) ++ palignr $6, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -855,7 +825,7 @@ LABEL(nibble_ashr_6_restart_use): + jg LABEL(nibble_ashr_6_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $6, -16(%rdi, %rdx), D(%xmm0) ++ palignr $6, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -875,7 +845,7 @@ LABEL(nibble_ashr_6_restart_use): + LABEL(nibble_ashr_6_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $6, D(%xmm0) ++ psrldq $6, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -893,10 +863,10 @@ LABEL(nibble_ashr_6_use): + */ + .p2align 4 + LABEL(ashr_7): +- pslldq $9, D(%xmm2) ++ pslldq $9, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -925,7 +895,7 @@ LABEL(loop_ashr_7_use): + + LABEL(nibble_ashr_7_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $7, -16(%rdi, %rdx), D(%xmm0) ++ palignr $7, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -944,7 +914,7 @@ LABEL(nibble_ashr_7_restart_use): + jg LABEL(nibble_ashr_7_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $7, -16(%rdi, %rdx), D(%xmm0) ++ palignr $7, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -964,7 +934,7 @@ LABEL(nibble_ashr_7_restart_use): + LABEL(nibble_ashr_7_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $7, D(%xmm0) ++ psrldq $7, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined 
USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -982,10 +952,10 @@ LABEL(nibble_ashr_7_use): + */ + .p2align 4 + LABEL(ashr_8): +- pslldq $8, D(%xmm2) ++ pslldq $8, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1014,7 +984,7 @@ LABEL(loop_ashr_8_use): + + LABEL(nibble_ashr_8_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $8, -16(%rdi, %rdx), D(%xmm0) ++ palignr $8, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1033,7 +1003,7 @@ LABEL(nibble_ashr_8_restart_use): + jg LABEL(nibble_ashr_8_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $8, -16(%rdi, %rdx), D(%xmm0) ++ palignr $8, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1053,7 +1023,7 @@ LABEL(nibble_ashr_8_restart_use): + LABEL(nibble_ashr_8_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $8, D(%xmm0) ++ psrldq $8, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1071,10 +1041,10 @@ LABEL(nibble_ashr_8_use): + */ + .p2align 4 + LABEL(ashr_9): +- pslldq $7, D(%xmm2) ++ pslldq $7, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1104,7 +1074,7 @@ LABEL(loop_ashr_9_use): + LABEL(nibble_ashr_9_restart_use): + movdqa (%rdi, %rdx), %xmm0 + +- palignr $9, -16(%rdi, %rdx), D(%xmm0) ++ palignr $9, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1123,7 +1093,7 @@ LABEL(nibble_ashr_9_restart_use): + jg LABEL(nibble_ashr_9_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $9, -16(%rdi, %rdx), D(%xmm0) ++ palignr $9, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1143,7 +1113,7 @@ LABEL(nibble_ashr_9_restart_use): + LABEL(nibble_ashr_9_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $9, D(%xmm0) ++ psrldq $9, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1161,10 +1131,10 @@ LABEL(nibble_ashr_9_use): + */ + .p2align 4 + LABEL(ashr_10): +- pslldq $6, D(%xmm2) ++ pslldq $6, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1193,7 +1163,7 @@ LABEL(loop_ashr_10_use): + + LABEL(nibble_ashr_10_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $10, -16(%rdi, %rdx), D(%xmm0) ++ palignr $10, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1212,7 +1182,7 @@ LABEL(nibble_ashr_10_restart_use): + jg LABEL(nibble_ashr_10_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $10, -16(%rdi, %rdx), D(%xmm0) ++ palignr $10, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1232,7 +1202,7 @@ LABEL(nibble_ashr_10_restart_use): + LABEL(nibble_ashr_10_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $10, 
D(%xmm0) ++ psrldq $10, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1250,10 +1220,10 @@ LABEL(nibble_ashr_10_use): + */ + .p2align 4 + LABEL(ashr_11): +- pslldq $5, D(%xmm2) ++ pslldq $5, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1282,7 +1252,7 @@ LABEL(loop_ashr_11_use): + + LABEL(nibble_ashr_11_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $11, -16(%rdi, %rdx), D(%xmm0) ++ palignr $11, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1301,7 +1271,7 @@ LABEL(nibble_ashr_11_restart_use): + jg LABEL(nibble_ashr_11_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $11, -16(%rdi, %rdx), D(%xmm0) ++ palignr $11, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1321,7 +1291,7 @@ LABEL(nibble_ashr_11_restart_use): + LABEL(nibble_ashr_11_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $11, D(%xmm0) ++ psrldq $11, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1339,10 +1309,10 @@ LABEL(nibble_ashr_11_use): + */ + .p2align 4 + LABEL(ashr_12): +- pslldq $4, D(%xmm2) ++ pslldq $4, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1371,7 +1341,7 @@ LABEL(loop_ashr_12_use): + + LABEL(nibble_ashr_12_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $12, -16(%rdi, %rdx), D(%xmm0) ++ palignr $12, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1390,7 +1360,7 @@ LABEL(nibble_ashr_12_restart_use): + jg LABEL(nibble_ashr_12_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $12, -16(%rdi, %rdx), D(%xmm0) ++ palignr $12, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1410,7 +1380,7 @@ LABEL(nibble_ashr_12_restart_use): + LABEL(nibble_ashr_12_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $12, D(%xmm0) ++ psrldq $12, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1428,10 +1398,10 @@ LABEL(nibble_ashr_12_use): + */ + .p2align 4 + LABEL(ashr_13): +- pslldq $3, D(%xmm2) ++ pslldq $3, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1461,7 +1431,7 @@ LABEL(loop_ashr_13_use): + + LABEL(nibble_ashr_13_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $13, -16(%rdi, %rdx), D(%xmm0) ++ palignr $13, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1480,7 +1450,7 @@ LABEL(nibble_ashr_13_restart_use): + jg LABEL(nibble_ashr_13_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $13, -16(%rdi, %rdx), D(%xmm0) ++ palignr $13, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1500,7 +1470,7 @@ 
LABEL(nibble_ashr_13_restart_use): + LABEL(nibble_ashr_13_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $13, D(%xmm0) ++ psrldq $13, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1518,10 +1488,10 @@ LABEL(nibble_ashr_13_use): + */ + .p2align 4 + LABEL(ashr_14): +- pslldq $2, D(%xmm2) ++ pslldq $2, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1551,7 +1521,7 @@ LABEL(loop_ashr_14_use): + + LABEL(nibble_ashr_14_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $14, -16(%rdi, %rdx), D(%xmm0) ++ palignr $14, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1570,7 +1540,7 @@ LABEL(nibble_ashr_14_restart_use): + jg LABEL(nibble_ashr_14_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $14, -16(%rdi, %rdx), D(%xmm0) ++ palignr $14, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1590,7 +1560,7 @@ LABEL(nibble_ashr_14_restart_use): + LABEL(nibble_ashr_14_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $14, D(%xmm0) ++ psrldq $14, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1608,10 +1578,10 @@ LABEL(nibble_ashr_14_use): + */ + .p2align 4 + LABEL(ashr_15): +- pslldq $1, D(%xmm2) ++ pslldq $1, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1643,7 +1613,7 @@ LABEL(loop_ashr_15_use): + + LABEL(nibble_ashr_15_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $15, -16(%rdi, %rdx), D(%xmm0) ++ palignr $15, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1662,7 +1632,7 @@ LABEL(nibble_ashr_15_restart_use): + jg LABEL(nibble_ashr_15_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $15, -16(%rdi, %rdx), D(%xmm0) ++ palignr $15, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1682,7 +1652,7 @@ LABEL(nibble_ashr_15_restart_use): + LABEL(nibble_ashr_15_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $15, D(%xmm0) ++ psrldq $15, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx.S b/sysdeps/x86_64/multiarch/strncase_l-avx.S +deleted file mode 100644 +index 0c4e525b..00000000 +--- a/sysdeps/x86_64/multiarch/strncase_l-avx.S ++++ /dev/null +@@ -1,22 +0,0 @@ +-/* strncasecmp_l optimized with AVX. +- Copyright (C) 2017-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. 
+-
+- The GNU C Library is distributed in the hope that it will be useful,
+- but WITHOUT ANY WARRANTY; without even the implied warranty of
+- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+- Lesser General Public License for more details.
+-
+- You should have received a copy of the GNU Lesser General Public
+- License along with the GNU C Library; if not, see
+- . */
+-
+-#define STRCMP_SSE42 __strncasecmp_l_avx
+-#define USE_AVX 1
+-#define USE_AS_STRNCASECMP_L
+-#include "strcmp-sse42.S"
+--
+GitLab
+
diff --git a/SOURCES/ia-rmv-weak_alias-memset-sse2.patch b/SOURCES/ia-rmv-weak_alias-memset-sse2.patch
new file mode 100644
index 0000000..64859a2
--- /dev/null
+++ b/SOURCES/ia-rmv-weak_alias-memset-sse2.patch
@@ -0,0 +1,37 @@
+From 4e487ee2fe85385052ac7b18d8e6686e79f78d14 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu"
+Date: Thu, 10 Feb 2022 11:52:50 -0800
+Subject: [PATCH] x86-64: Remove bzero weak alias in SSE2 memset
+
+commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
+Author: H.J. Lu
+Date: Mon Feb 7 05:55:15 2022 -0800
+
+ x86-64: Optimize bzero
+
+added the optimized bzero. Remove bzero weak alias in SSE2 memset to
+avoid undefined __bzero in memset-sse2-unaligned-erms.
+
+(cherry picked from commit 0fb8800029d230b3711bf722b2a47db92d0e273f)
+---
+ sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+index 8f579ad6..af51362b 100644
+--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+@@ -31,9 +31,7 @@
+ # endif
+
+ # undef weak_alias
+-# define weak_alias(original, alias) \
+- .weak bzero; bzero = __bzero
+-
++# define weak_alias(original, alias)
+ # undef strong_alias
+ # define strong_alias(ignored1, ignored2)
+ #endif
+--
+GitLab
+
diff --git a/SOURCES/ia-strcmp-avx2-fix.patch b/SOURCES/ia-strcmp-avx2-fix.patch
new file mode 100644
index 0000000..d256c37
--- /dev/null
+++ b/SOURCES/ia-strcmp-avx2-fix.patch
@@ -0,0 +1,34 @@
+From d7ca99114ba67e94c99aac230ac025720cf30918 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu"
+Date: Fri, 4 Feb 2022 11:09:10 -0800
+Subject: [PATCH] x86-64: Fix strcmp-avx2.S
+
+Change "movl %edx, %rdx" to "movl %edx, %edx" in:
+
+commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45
+Author: Noah Goldstein
+Date: Mon Jan 10 15:35:38 2022 -0600
+
+ x86: Optimize strcmp-avx2.S
+
+(cherry picked from commit c15efd011cea3d8f0494269eb539583215a1feed)
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 554ffe4c..04675aa4 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -106,7 +106,7 @@ ENTRY(STRCMP)
+ # ifdef USE_AS_STRNCMP
+ # ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+- movl %edx, %rdx
++ movl %edx, %edx
+ # endif
+ cmp $1, %RDX_LP
+ /* Signed comparison intentional. We use this branch to also
+--
+GitLab
+
diff --git a/SOURCES/ia-strcmp-evex-fix.patch b/SOURCES/ia-strcmp-evex-fix.patch
new file mode 100644
index 0000000..66e55cc
--- /dev/null
+++ b/SOURCES/ia-strcmp-evex-fix.patch
@@ -0,0 +1,34 @@
+From f9a6857a55fb5586d4c34baa46700d8428e02273 Mon Sep 17 00:00:00 2001
+From: "H.J. 
Lu" +Date: Fri, 4 Feb 2022 11:11:08 -0800 +Subject: [PATCH] x86-64: Fix strcmp-evex.S + +Change "movl %edx, %rdx" to "movl %edx, %edx" in: + +commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9 +Author: Noah Goldstein +Date: Mon Jan 10 15:35:39 2022 -0600 + + x86: Optimize strcmp-evex.S + +(cherry picked from commit 0e0199a9e02ebe42e2b36958964d63f03573c382) +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 99d8409a..ed56af8e 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -116,7 +116,7 @@ ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ +- movl %edx, %rdx ++ movl %edx, %edx + # endif + cmp $1, %RDX_LP + /* Signed comparison intentional. We use this branch to also +-- +GitLab + diff --git a/SOURCES/ia-test_name-string-tst-strncmp-rtm.patch b/SOURCES/ia-test_name-string-tst-strncmp-rtm.patch new file mode 100644 index 0000000..0e8df73 --- /dev/null +++ b/SOURCES/ia-test_name-string-tst-strncmp-rtm.patch @@ -0,0 +1,37 @@ +From 985c2ec17c739b506311000a3932a8f8491b4001 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 18 Feb 2022 17:00:25 -0600 +Subject: [PATCH] x86: Fix TEST_NAME to make it a string in tst-strncmp-rtm.c + +Previously TEST_NAME was passing a function pointer. This didn't fail +because of the -Wno-error flag (to allow for overflow sizes passed +to strncmp/wcsncmp) + +Reviewed-by: H.J. Lu +(cherry picked from commit b98d0bbf747f39770e0caba7e984ce9f8f900330) +--- + sysdeps/x86/tst-strncmp-rtm.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index 4e9f094f..aef9866c 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -23,12 +23,12 @@ + # define CHAR wchar_t + # define MEMSET wmemset + # define STRNCMP wcsncmp +-# define TEST_NAME wcsncmp ++# define TEST_NAME "wcsncmp" + #else /* !WIDE */ + # define CHAR char + # define MEMSET memset + # define STRNCMP strncmp +-# define TEST_NAME strncmp ++# define TEST_NAME "strncmp" + #endif /* !WIDE */ + + +-- +GitLab + diff --git a/SPECS/glibc.spec b/SPECS/glibc.spec index bd945a7..436d4d7 100644 --- a/SPECS/glibc.spec +++ b/SPECS/glibc.spec @@ -1,6 +1,6 @@ %define glibcsrcdir glibc-2.28 %define glibcversion 2.28 -%define glibcrelease 204%{?dist} +%define glibcrelease 205%{?dist} # Pre-release tarballs are pulled in from git using a command that is # effectively: # @@ -975,7 +975,43 @@ Patch10073: glibc-sw28646.patch Patch10074: ia-no-index_arch_prefer_no_avx512-avx-vnni.patch Patch10075: ia-opt-less_vec-memcmp-evex-movb.patch Patch10076: glibc-sw28537-4.patch - +Patch10077: glibc-sw28896-2.patch +Patch10078: ia-test_name-string-tst-strncmp-rtm.patch +Patch10079: ia-opt-strcmp-avx2.patch +Patch10080: ia-opt-strcmp-evex.patch +Patch10081: ia-strcmp-avx2-fix.patch +Patch10082: ia-strcmp-evex-fix.patch +Patch10083: ia-imp-vec_gen-memset-vec-unaligned-erms.patch +Patch10084: ia-rmv-ssse3_inst-memset.patch +Patch10085: ia-opt-bzero.patch +Patch10086: ia-rmv-set-memset-vec-unaligned-erms.patch +Patch10087: glibc-sw28895.patch +Patch10088: glibc-sw28896-3.patch +Patch10089: ia-imp-l.patch +Patch10090: ia-march-srt-sysdep_routines.patch +Patch10091: ia-rmv-weak_alias-memset-sse2.patch +Patch10092: ia-rmv-bcopy-opt.patch +Patch10093: ia-code_cleanup-strchr-avx2.patch +Patch10094: 
ia-code_cleanup-strchr-evex.patch +Patch10095: ia-opt-strcspn_strpbrk-strcspn-c.patch +Patch10096: ia-opt-strspn-strspn-c.patch +Patch10097: ia-rmv-strcspn-sse2.patch +Patch10098: ia-rmv-strpbrk-sse2.patch +Patch10099: ia-rmv-strspn-sse2.patch +Patch10100: ia-opt-strxcasecmp-srtcmp.patch +Patch10101: ia-opt-strxcasecmp-srtcmp-sse42.patch +Patch10102: ia-opt-strxcasecmp-avx2.patch +Patch10103: ia-opt-strxcasecmp-evex.patch +Patch10104: ia-rmv-strxcasecmp-avx.patch +Patch10105: ia-imp-wcslen.patch +Patch10106: ia-rmv-memcmp-sse4.patch +Patch10107: ia-code_cleanup-memcmp-avx2-movbe.patch +Patch10108: ia-opt-str-wcs_rchr-sse2.patch +Patch10109: ia-opt-str-wcs_rchr-avx2.patch +Patch10110: ia-opt-str-wcs_rchr-evex.patch +Patch10111: ia-add-fast-jitter.patch +Patch10112: ia-add-backoff-spinlock.patch +Patch10113: glibc-sw29127.patch ############################################################################## # Continued list of core "glibc" package information: @@ -2806,8 +2842,11 @@ fi %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared %changelog +* Thu Jun 09 2022 Ali Erdinc Koroglu - 2.28.205 +- Intel glibc optimizations + * Fri May 20 2022 Ali Erdinc Koroglu - 2.28.204 -- Intel architecture optimizations +- Intel glibc optimizations * Tue May 17 2022 Patsy Griffin - 2.28-203 - 390x: Add support for IBM z16. (#2077835)