diff --git a/SOURCES/glibc-sw28895.patch b/SOURCES/glibc-sw28895.patch new file mode 100644 index 0000000..617ed93 --- /dev/null +++ b/SOURCES/glibc-sw28895.patch @@ -0,0 +1,91 @@ +From 7aeaaad7fb98f1c68b77bd9dee0ad0e6e92cd0ff Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 15 Feb 2022 20:27:21 -0600 +Subject: [PATCH] x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895] + +Logic can read before the start of `s1` / `s2` if both `s1` and `s2` +are near the start of a page. To avoid having the result contimated by +these comparisons the `strcmp` variants would mask off these +comparisons. This was missing in the `strncmp` variants causing +the bug. This commit adds the masking to `strncmp` so that out of +range comparisons don't affect the result. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass as +well a full xcheck on x86_64 linux. +Reviewed-by: H.J. Lu + +(cherry picked from commit e108c02a5e23c8c88ce66d8705d4a24bb6b9a8bf) +--- + string/test-strncmp.c | 23 +++++++++++++++++++++++ + sysdeps/x86_64/multiarch/strcmp-avx2.S | 1 + + sysdeps/x86_64/multiarch/strcmp-evex.S | 1 + + 3 files changed, 25 insertions(+) + +diff --git a/string/test-strncmp.c b/string/test-strncmp.c +index 927a6daa..e61fffd9 100644 +--- a/string/test-strncmp.c ++++ b/string/test-strncmp.c +@@ -403,6 +403,28 @@ check2 (void) + free (s2); + } + ++static void ++check4 (void) ++{ ++ /* To trigger bug 28895; We need 1) both s1 and s2 to be within 32 bytes of ++ the end of the page. 2) For there to be no mismatch/null byte before the ++ first page cross. 3) For length (`n`) to be large enough for one string to ++ cross the page. And 4) for there to be either mismatch/null bytes before ++ the start of the strings. */ ++ ++ size_t size = 10; ++ size_t addr_mask = (getpagesize () - 1) ^ (sizeof (CHAR) - 1); ++ CHAR *s1 = (CHAR *)(buf1 + (addr_mask & 0xffa)); ++ CHAR *s2 = (CHAR *)(buf2 + (addr_mask & 0xfed)); ++ int exp_result; ++ ++ STRCPY (s1, L ("tst-tlsmod%")); ++ STRCPY (s2, L ("tst-tls-manydynamic73mod")); ++ exp_result = SIMPLE_STRNCMP (s1, s2, size); ++ FOR_EACH_IMPL (impl, 0) ++ check_result (impl, s1, s2, size, exp_result); ++} ++ + static void + check3 (void) + { +@@ -445,6 +467,7 @@ test_main (void) + check1 (); + check2 (); + check3 (); ++ check4 (); + + printf ("%23s", ""); + FOR_EACH_IMPL (impl, 0) +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 04675aa4..179cc0e3 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -661,6 +661,7 @@ L(ret8): + # ifdef USE_AS_STRNCMP + .p2align 4,, 10 + L(return_page_cross_end_check): ++ andl %r10d, %ecx + tzcntl %ecx, %ecx + leal -VEC_SIZE(%rax, %rcx), %ecx + cmpl %ecx, %edx +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index ed56af8e..0dfa62bd 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -689,6 +689,7 @@ L(ret8): + # ifdef USE_AS_STRNCMP + .p2align 4,, 10 + L(return_page_cross_end_check): ++ andl %r10d, %ecx + tzcntl %ecx, %ecx + leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx + # ifdef USE_AS_WCSCMP +-- +GitLab + diff --git a/SOURCES/glibc-sw28896-2.patch b/SOURCES/glibc-sw28896-2.patch new file mode 100644 index 0000000..eb64cc6 --- /dev/null +++ b/SOURCES/glibc-sw28896-2.patch @@ -0,0 +1,147 @@ +From 50b1abfc917024905d84d261ba94682460193220 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 18 Feb 2022 14:19:15 -0600 +Subject: 
[PATCH] x86: Test wcscmp RTM in the wcsncmp overflow case [BZ #28896] + +In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would +call strcmp-avx2 and wcscmp-avx2 respectively. This would have +not checks around vzeroupper and would trigger spurious +aborts. This commit fixes that. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on +AVX2 machines with and without RTM. +Reviewed-by: H.J. Lu + +(cherry picked from commit 7835d611af0854e69a0c71e3806f8fe379282d6f) +--- + sysdeps/x86/Makefile | 5 ++++- + sysdeps/x86/tst-strncmp-rtm.c | 32 +++++++++++++++++++++++--------- + sysdeps/x86/tst-wcsncmp-rtm.c | 21 +++++++++++++++++++++ + 3 files changed, 48 insertions(+), 10 deletions(-) + create mode 100644 sysdeps/x86/tst-wcsncmp-rtm.c + +diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile +index 2d814915..c2111f49 100644 +--- a/sysdeps/x86/Makefile ++++ b/sysdeps/x86/Makefile +@@ -28,7 +28,9 @@ tests += \ + tst-strcpy-rtm \ + tst-strlen-rtm \ + tst-strncmp-rtm \ +- tst-strrchr-rtm ++ tst-strrchr-rtm \ ++ tst-wcsncmp-rtm \ ++# tests + + CFLAGS-tst-memchr-rtm.c += -mrtm + CFLAGS-tst-memcmp-rtm.c += -mrtm +@@ -40,6 +42,7 @@ CFLAGS-tst-strcpy-rtm.c += -mrtm + CFLAGS-tst-strlen-rtm.c += -mrtm + CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error + CFLAGS-tst-strrchr-rtm.c += -mrtm ++CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error + endif + + ifneq ($(enable-cet),no) +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index 4d0004b5..4e9f094f 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -19,18 +19,32 @@ + #include + #include + ++#ifdef WIDE ++# define CHAR wchar_t ++# define MEMSET wmemset ++# define STRNCMP wcsncmp ++# define TEST_NAME wcsncmp ++#else /* !WIDE */ ++# define CHAR char ++# define MEMSET memset ++# define STRNCMP strncmp ++# define TEST_NAME strncmp ++#endif /* !WIDE */ ++ ++ ++ + #define LOOP 3000 + #define STRING_SIZE 1024 +-char string1[STRING_SIZE]; +-char string2[STRING_SIZE]; ++CHAR string1[STRING_SIZE]; ++CHAR string2[STRING_SIZE]; + + __attribute__ ((noinline, noclone)) + static int + prepare (void) + { +- memset (string1, 'a', STRING_SIZE - 1); +- memset (string2, 'a', STRING_SIZE - 1); +- if (strncmp (string1, string2, STRING_SIZE) == 0) ++ MEMSET (string1, 'a', STRING_SIZE - 1); ++ MEMSET (string2, 'a', STRING_SIZE - 1); ++ if (STRNCMP (string1, string2, STRING_SIZE) == 0) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +@@ -40,7 +54,7 @@ __attribute__ ((noinline, noclone)) + static int + function (void) + { +- if (strncmp (string1, string2, STRING_SIZE) == 0) ++ if (STRNCMP (string1, string2, STRING_SIZE) == 0) + return 0; + else + return 1; +@@ -50,7 +64,7 @@ __attribute__ ((noinline, noclone)) + static int + function_overflow (void) + { +- if (strncmp (string1, string2, SIZE_MAX) == 0) ++ if (STRNCMP (string1, string2, SIZE_MAX) == 0) + return 0; + else + return 1; +@@ -59,9 +73,9 @@ function_overflow (void) + static int + do_test (void) + { +- int status = do_test_1 ("strncmp", LOOP, prepare, function); ++ int status = do_test_1 (TEST_NAME, LOOP, prepare, function); + if (status != EXIT_SUCCESS) + return status; +- status = do_test_1 ("strncmp", LOOP, prepare, function_overflow); ++ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow); + return status; + } +diff --git a/sysdeps/x86/tst-wcsncmp-rtm.c b/sysdeps/x86/tst-wcsncmp-rtm.c +new file mode 100644 +index 00000000..bad3b863 +--- /dev/null ++++ b/sysdeps/x86/tst-wcsncmp-rtm.c +@@ -0,0 +1,21 @@ ++/* Test case for 
wcsncmp inside a transactionally executing RTM region. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include ++#include "tst-strncmp-rtm.c" +-- +GitLab + diff --git a/SOURCES/glibc-sw28896-3.patch b/SOURCES/glibc-sw28896-3.patch new file mode 100644 index 0000000..6b47e24 --- /dev/null +++ b/SOURCES/glibc-sw28896-3.patch @@ -0,0 +1,78 @@ +From fd7108a4c9dc46113f4041e7575c1f1217c0d57d Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 24 Mar 2022 15:50:33 -0500 +Subject: [PATCH] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ + #28896] + +Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not +__wcscmp_avx2. + +commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87 +Author: Noah Goldstein +Date: Sun Jan 9 16:02:21 2022 -0600 + + x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755] + +Set the wrong fallback function for `__wcsncmp_avx2_rtm`. It was set +to fallback on to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm` which +can cause spurious aborts. + +This change will need to be backported. + +All string/memory tests pass. +Reviewed-by: H.J. Lu + +(cherry picked from commit 9fef7039a7d04947bc89296ee0d187bc8d89b772) +--- + sysdeps/x86/tst-strncmp-rtm.c | 15 +++++++++++++++ + sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +- + 2 files changed, 16 insertions(+), 1 deletion(-) + +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index aef9866c..ba6543be 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -70,6 +70,16 @@ function_overflow (void) + return 1; + } + ++__attribute__ ((noinline, noclone)) ++static int ++function_overflow2 (void) ++{ ++ if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0) ++ return 0; ++ else ++ return 1; ++} ++ + static int + do_test (void) + { +@@ -77,5 +87,10 @@ do_test (void) + if (status != EXIT_SUCCESS) + return status; + status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow); ++ if (status != EXIT_SUCCESS) ++ return status; ++ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2); ++ if (status != EXIT_SUCCESS) ++ return status; + return status; + } +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 179cc0e3..782f9472 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -122,7 +122,7 @@ ENTRY(STRCMP) + are cases where length is large enough that it can never be a + bound on valid memory so just use wcscmp. 
*/ + shrq $56, %rcx +- jnz __wcscmp_avx2 ++ jnz OVERFLOW_STRCMP + + leaq (, %rdx, 4), %rdx + # endif +-- +GitLab + diff --git a/SOURCES/glibc-sw29127.patch b/SOURCES/glibc-sw29127.patch new file mode 100644 index 0000000..f1a7a76 --- /dev/null +++ b/SOURCES/glibc-sw29127.patch @@ -0,0 +1,57 @@ +From 0ad216b458445251e6f98d74382faf3606569731 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 15 Feb 2022 08:18:15 -0600 +Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ + #29127] + +Re-cherry-pick commit c627209832 for strcmp-avx2.S change which was +omitted in intial cherry pick because at the time this bug was not +present on release branch. + +Fixes BZ #29127. + +In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would +call strcmp-avx2 and wcscmp-avx2 respectively. This would have +not checks around vzeroupper and would trigger spurious +aborts. This commit fixes that. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on +AVX2 machines with and without RTM. + +Co-authored-by: H.J. Lu +(cherry picked from commit c6272098323153db373f2986c67786ea8c85f1cf) +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 8 ++------ + 1 file changed, 2 insertions(+), 6 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 28cc98b6..e267c6cb 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -345,10 +345,10 @@ L(one_or_less): + movq %LOCALE_REG, %rdx + # endif + jb L(ret_zero) +-# ifdef USE_AS_WCSCMP + /* 'nbe' covers the case where length is negative (large + unsigned). */ +- jnbe __wcscmp_avx2 ++ jnbe OVERFLOW_STRCMP ++# ifdef USE_AS_WCSCMP + movl (%rdi), %edx + xorl %eax, %eax + cmpl (%rsi), %edx +@@ -357,10 +357,6 @@ L(one_or_less): + negl %eax + orl $1, %eax + # else +- /* 'nbe' covers the case where length is negative (large +- unsigned). */ +- +- jnbe __strcmp_avx2 + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + TOLOWER_gpr (%rax, %eax) +-- +GitLab + diff --git a/SOURCES/ia-add-backoff-spinlock.patch b/SOURCES/ia-add-backoff-spinlock.patch new file mode 100644 index 0000000..27bf481 --- /dev/null +++ b/SOURCES/ia-add-backoff-spinlock.patch @@ -0,0 +1,219 @@ +From d4b1ecdf48cfe0e711ec201533811b7d823d1b7d Mon Sep 17 00:00:00 2001 +From: Wangyang Guo +Date: Fri, 6 May 2022 01:50:10 +0000 +Subject: [PATCH] nptl: Add backoff mechanism to spinlock loop + +When mutiple threads waiting for lock at the same time, once lock owner +releases the lock, waiters will see lock available and all try to lock, +which may cause an expensive CAS storm. + +Binary exponential backoff with random jitter is introduced. As try-lock +attempt increases, there is more likely that a larger number threads +compete for adaptive mutex lock, so increase wait time in exponential. +A random jitter is also added to avoid synchronous try-lock from other +threads. + +v2: Remove read-check before try-lock for performance. + +v3: +1. Restore read-check since it works well in some platform. +2. Make backoff arch dependent, and enable it for x86_64. +3. Limit max backoff to reduce latency in large critical section. + +v4: Fix strict-prototypes error in sysdeps/nptl/pthread_mutex_backoff.h + +v5: Commit log updated for regression in large critical section. 
+ +Result of pthread-mutex-locks bench + +Test Platform: Xeon 8280L (2 socket, 112 CPUs in total) +First Row: thread number +First Col: critical section length +Values: backoff vs upstream, time based, low is better + +non-critical-length: 1 + 1 2 4 8 16 32 64 112 140 +0 0.99 0.58 0.52 0.49 0.43 0.44 0.46 0.52 0.54 +1 0.98 0.43 0.56 0.50 0.44 0.45 0.50 0.56 0.57 +2 0.99 0.41 0.57 0.51 0.45 0.47 0.48 0.60 0.61 +4 0.99 0.45 0.59 0.53 0.48 0.49 0.52 0.64 0.65 +8 1.00 0.66 0.71 0.63 0.56 0.59 0.66 0.72 0.71 +16 0.97 0.78 0.91 0.73 0.67 0.70 0.79 0.80 0.80 +32 0.95 1.17 0.98 0.87 0.82 0.86 0.89 0.90 0.90 +64 0.96 0.95 1.01 1.01 0.98 1.00 1.03 0.99 0.99 +128 0.99 1.01 1.01 1.17 1.08 1.12 1.02 0.97 1.02 + +non-critical-length: 32 + 1 2 4 8 16 32 64 112 140 +0 1.03 0.97 0.75 0.65 0.58 0.58 0.56 0.70 0.70 +1 0.94 0.95 0.76 0.65 0.58 0.58 0.61 0.71 0.72 +2 0.97 0.96 0.77 0.66 0.58 0.59 0.62 0.74 0.74 +4 0.99 0.96 0.78 0.66 0.60 0.61 0.66 0.76 0.77 +8 0.99 0.99 0.84 0.70 0.64 0.66 0.71 0.80 0.80 +16 0.98 0.97 0.95 0.76 0.70 0.73 0.81 0.85 0.84 +32 1.04 1.12 1.04 0.89 0.82 0.86 0.93 0.91 0.91 +64 0.99 1.15 1.07 1.00 0.99 1.01 1.05 0.99 0.99 +128 1.00 1.21 1.20 1.22 1.25 1.31 1.12 1.10 0.99 + +non-critical-length: 128 + 1 2 4 8 16 32 64 112 140 +0 1.02 1.00 0.99 0.67 0.61 0.61 0.61 0.74 0.73 +1 0.95 0.99 1.00 0.68 0.61 0.60 0.60 0.74 0.74 +2 1.00 1.04 1.00 0.68 0.59 0.61 0.65 0.76 0.76 +4 1.00 0.96 0.98 0.70 0.63 0.63 0.67 0.78 0.77 +8 1.01 1.02 0.89 0.73 0.65 0.67 0.71 0.81 0.80 +16 0.99 0.96 0.96 0.79 0.71 0.73 0.80 0.84 0.84 +32 0.99 0.95 1.05 0.89 0.84 0.85 0.94 0.92 0.91 +64 1.00 0.99 1.16 1.04 1.00 1.02 1.06 0.99 0.99 +128 1.00 1.06 0.98 1.14 1.39 1.26 1.08 1.02 0.98 + +There is regression in large critical section. But adaptive mutex is +aimed for "quick" locks. Small critical section is more common when +users choose to use adaptive pthread_mutex. + +Signed-off-by: Wangyang Guo +Reviewed-by: H.J. Lu +(cherry picked from commit 8162147872491bb5b48e91543b19c49a29ae6b6d) +--- + nptl/pthreadP.h | 1 + + nptl/pthread_mutex_lock.c | 16 +++++++-- + sysdeps/nptl/pthread_mutex_backoff.h | 35 ++++++++++++++++++ + sysdeps/x86_64/nptl/pthread_mutex_backoff.h | 39 +++++++++++++++++++++ + 4 files changed, 89 insertions(+), 2 deletions(-) + create mode 100644 sysdeps/nptl/pthread_mutex_backoff.h + create mode 100644 sysdeps/x86_64/nptl/pthread_mutex_backoff.h + +diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h +index 7ddc166c..1550e3b6 100644 +--- a/nptl/pthreadP.h ++++ b/nptl/pthreadP.h +@@ -33,6 +33,7 @@ + #include + #include + #include ++#include + + + /* Atomic operations on TLS memory. */ +diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c +index d96a9933..c7770fc9 100644 +--- a/nptl/pthread_mutex_lock.c ++++ b/nptl/pthread_mutex_lock.c +@@ -133,14 +133,26 @@ __pthread_mutex_lock (pthread_mutex_t *mutex) + int cnt = 0; + int max_cnt = MIN (MAX_ADAPTIVE_COUNT, + mutex->__data.__spins * 2 + 10); ++ int spin_count, exp_backoff = 1; ++ unsigned int jitter = get_jitter (); + do + { +- if (cnt++ >= max_cnt) ++ /* In each loop, spin count is exponential backoff plus ++ random jitter, random range is [0, exp_backoff-1]. */ ++ spin_count = exp_backoff + (jitter & (exp_backoff - 1)); ++ cnt += spin_count; ++ if (cnt >= max_cnt) + { ++ /* If cnt exceeds max spin count, just go to wait ++ queue. */ + LLL_MUTEX_LOCK (mutex); + break; + } +- atomic_spin_nop (); ++ do ++ atomic_spin_nop (); ++ while (--spin_count > 0); ++ /* Prepare for next loop. 
*/ ++ exp_backoff = get_next_backoff (exp_backoff); + } + while (LLL_MUTEX_READ_LOCK (mutex) != 0 + || LLL_MUTEX_TRYLOCK (mutex) != 0); +diff --git a/sysdeps/nptl/pthread_mutex_backoff.h b/sysdeps/nptl/pthread_mutex_backoff.h +new file mode 100644 +index 00000000..5b26c22a +--- /dev/null ++++ b/sysdeps/nptl/pthread_mutex_backoff.h +@@ -0,0 +1,35 @@ ++/* Pthread mutex backoff configuration. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++#ifndef _PTHREAD_MUTEX_BACKOFF_H ++#define _PTHREAD_MUTEX_BACKOFF_H 1 ++ ++static inline unsigned int ++get_jitter (void) ++{ ++ /* Arch dependent random jitter, return 0 disables random. */ ++ return 0; ++} ++ ++static inline int ++get_next_backoff (int backoff) ++{ ++ /* Next backoff, return 1 disables mutex backoff. */ ++ return 1; ++} ++ ++#endif +diff --git a/sysdeps/x86_64/nptl/pthread_mutex_backoff.h b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h +new file mode 100644 +index 00000000..ec74c3d9 +--- /dev/null ++++ b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h +@@ -0,0 +1,39 @@ ++/* Pthread mutex backoff configuration. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++#ifndef _PTHREAD_MUTEX_BACKOFF_H ++#define _PTHREAD_MUTEX_BACKOFF_H 1 ++ ++#include ++ ++static inline unsigned int ++get_jitter (void) ++{ ++ return get_fast_jitter (); ++} ++ ++#define MAX_BACKOFF 16 ++ ++static inline int ++get_next_backoff (int backoff) ++{ ++ /* Binary expontial backoff. Limiting max backoff ++ can reduce latency in large critical section. */ ++ return (backoff < MAX_BACKOFF) ? backoff << 1 : backoff; ++} ++ ++#endif +-- +GitLab + diff --git a/SOURCES/ia-add-fast-jitter.patch b/SOURCES/ia-add-fast-jitter.patch new file mode 100644 index 0000000..51b07d7 --- /dev/null +++ b/SOURCES/ia-add-fast-jitter.patch @@ -0,0 +1,74 @@ +From 1edbd8aad68ea5ddd729e1cbd307c84b1386459a Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 27 Apr 2022 15:13:02 -0500 +Subject: [PATCH] sysdeps: Add 'get_fast_jitter' interace in fast-jitter.h + +'get_fast_jitter' is meant to be used purely for performance +purposes. 
In all cases it's used it should be acceptable to get no +randomness (see default case). An example use case is in setting +jitter for retries between threads at a lock. There is a +performance benefit to having jitter, but only if the jitter can +be generated very quickly and ultimately there is no serious issue +if no jitter is generated. + +The implementation generally uses 'HP_TIMING_NOW' iff it is +inlined (avoid any potential syscall paths). +Reviewed-by: H.J. Lu + +(cherry picked from commit 911c63a51c690dd1a97dfc587097277029baf00f) +--- + sysdeps/generic/fast-jitter.h | 42 +++++++++++++++++++++++++++++++++++ + 1 file changed, 42 insertions(+) + create mode 100644 sysdeps/generic/fast-jitter.h + +diff --git a/sysdeps/generic/fast-jitter.h b/sysdeps/generic/fast-jitter.h +new file mode 100644 +index 00000000..4dd53e34 +--- /dev/null ++++ b/sysdeps/generic/fast-jitter.h +@@ -0,0 +1,42 @@ ++/* Fallback for fast jitter just return 0. ++ Copyright (C) 2019-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef _FAST_JITTER_H ++# define _FAST_JITTER_H ++ ++# include ++# include ++ ++/* Baseline just return 0. We could create jitter using a clock or ++ 'random_bits' but that may imply a syscall and the goal of ++ 'get_fast_jitter' is minimal overhead "randomness" when such ++ randomness helps performance. Adding high overhead the function ++ defeats the purpose. */ ++static inline uint32_t ++get_fast_jitter (void) ++{ ++# if HP_TIMING_INLINE ++ hp_timing_t jitter; ++ HP_TIMING_NOW (jitter); ++ return (uint32_t) jitter; ++# else ++ return 0; ++# endif ++} ++ ++#endif +-- +GitLab + diff --git a/SOURCES/ia-code_cleanup-memcmp-avx2-movbe.patch b/SOURCES/ia-code_cleanup-memcmp-avx2-movbe.patch new file mode 100644 index 0000000..2f04515 --- /dev/null +++ b/SOURCES/ia-code_cleanup-memcmp-avx2-movbe.patch @@ -0,0 +1,264 @@ +From 4619b6dbf13c17a13be2d2a0bdc9fcc2640b0f86 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 15 Apr 2022 12:28:01 -0500 +Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S + +Old code was both inefficient and wasted code size. New code (-62 +bytes) and comparable or better performance in the page cross case. 
+ +geometric_mean(N=20) of page cross cases New / Original: 0.960 + +size, align0, align1, ret, New Time/Old Time + 1, 4095, 0, 0, 1.001 + 1, 4095, 0, 1, 0.999 + 1, 4095, 0, -1, 1.0 + 2, 4094, 0, 0, 1.0 + 2, 4094, 0, 1, 1.0 + 2, 4094, 0, -1, 1.0 + 3, 4093, 0, 0, 1.0 + 3, 4093, 0, 1, 1.0 + 3, 4093, 0, -1, 1.0 + 4, 4092, 0, 0, 0.987 + 4, 4092, 0, 1, 1.0 + 4, 4092, 0, -1, 1.0 + 5, 4091, 0, 0, 0.984 + 5, 4091, 0, 1, 1.002 + 5, 4091, 0, -1, 1.005 + 6, 4090, 0, 0, 0.993 + 6, 4090, 0, 1, 1.001 + 6, 4090, 0, -1, 1.003 + 7, 4089, 0, 0, 0.991 + 7, 4089, 0, 1, 1.0 + 7, 4089, 0, -1, 1.001 + 8, 4088, 0, 0, 0.875 + 8, 4088, 0, 1, 0.881 + 8, 4088, 0, -1, 0.888 + 9, 4087, 0, 0, 0.872 + 9, 4087, 0, 1, 0.879 + 9, 4087, 0, -1, 0.883 + 10, 4086, 0, 0, 0.878 + 10, 4086, 0, 1, 0.886 + 10, 4086, 0, -1, 0.873 + 11, 4085, 0, 0, 0.878 + 11, 4085, 0, 1, 0.881 + 11, 4085, 0, -1, 0.879 + 12, 4084, 0, 0, 0.873 + 12, 4084, 0, 1, 0.889 + 12, 4084, 0, -1, 0.875 + 13, 4083, 0, 0, 0.873 + 13, 4083, 0, 1, 0.863 + 13, 4083, 0, -1, 0.863 + 14, 4082, 0, 0, 0.838 + 14, 4082, 0, 1, 0.869 + 14, 4082, 0, -1, 0.877 + 15, 4081, 0, 0, 0.841 + 15, 4081, 0, 1, 0.869 + 15, 4081, 0, -1, 0.876 + 16, 4080, 0, 0, 0.988 + 16, 4080, 0, 1, 0.99 + 16, 4080, 0, -1, 0.989 + 17, 4079, 0, 0, 0.978 + 17, 4079, 0, 1, 0.981 + 17, 4079, 0, -1, 0.98 + 18, 4078, 0, 0, 0.981 + 18, 4078, 0, 1, 0.98 + 18, 4078, 0, -1, 0.985 + 19, 4077, 0, 0, 0.977 + 19, 4077, 0, 1, 0.979 + 19, 4077, 0, -1, 0.986 + 20, 4076, 0, 0, 0.977 + 20, 4076, 0, 1, 0.986 + 20, 4076, 0, -1, 0.984 + 21, 4075, 0, 0, 0.977 + 21, 4075, 0, 1, 0.983 + 21, 4075, 0, -1, 0.988 + 22, 4074, 0, 0, 0.983 + 22, 4074, 0, 1, 0.994 + 22, 4074, 0, -1, 0.993 + 23, 4073, 0, 0, 0.98 + 23, 4073, 0, 1, 0.992 + 23, 4073, 0, -1, 0.995 + 24, 4072, 0, 0, 0.989 + 24, 4072, 0, 1, 0.989 + 24, 4072, 0, -1, 0.991 + 25, 4071, 0, 0, 0.99 + 25, 4071, 0, 1, 0.999 + 25, 4071, 0, -1, 0.996 + 26, 4070, 0, 0, 0.993 + 26, 4070, 0, 1, 0.995 + 26, 4070, 0, -1, 0.998 + 27, 4069, 0, 0, 0.993 + 27, 4069, 0, 1, 0.999 + 27, 4069, 0, -1, 1.0 + 28, 4068, 0, 0, 0.997 + 28, 4068, 0, 1, 1.0 + 28, 4068, 0, -1, 0.999 + 29, 4067, 0, 0, 0.996 + 29, 4067, 0, 1, 0.999 + 29, 4067, 0, -1, 0.999 + 30, 4066, 0, 0, 0.991 + 30, 4066, 0, 1, 1.001 + 30, 4066, 0, -1, 0.999 + 31, 4065, 0, 0, 0.988 + 31, 4065, 0, 1, 0.998 + 31, 4065, 0, -1, 0.998 +Reviewed-by: H.J. Lu + +(cherry picked from commit 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f) +--- + sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++-------- + 1 file changed, 61 insertions(+), 37 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +index 16fc673e..99258cf5 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +@@ -429,22 +429,21 @@ L(page_cross_less_vec): + # ifndef USE_AS_WMEMCMP + cmpl $8, %edx + jae L(between_8_15) ++ /* Fall through for [4, 7]. */ + cmpl $4, %edx +- jae L(between_4_7) ++ jb L(between_2_3) + +- /* Load as big endian to avoid branches. */ +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- /* Subtraction is okay because the upper 8 bits are zero. 
*/ +- subl %ecx, %eax ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ /* Fast path for return zero. */ ++ jnz L(ret_nonzero) + /* No ymm register was touched. */ + ret + +@@ -457,9 +456,33 @@ L(one_or_less): + /* No ymm register was touched. */ + ret + ++ .p2align 4,, 5 ++L(ret_nonzero): ++ sbbl %eax, %eax ++ orl $1, %eax ++ /* No ymm register was touched. */ ++ ret ++ ++ .p2align 4,, 2 ++L(zero): ++ xorl %eax, %eax ++ /* No ymm register was touched. */ ++ ret ++ + .p2align 4 + L(between_8_15): +-# endif ++ movbe (%rdi), %rax ++ movbe (%rsi), %rcx ++ subq %rcx, %rax ++ jnz L(ret_nonzero) ++ movbe -8(%rdi, %rdx), %rax ++ movbe -8(%rsi, %rdx), %rcx ++ subq %rcx, %rax ++ /* Fast path for return zero. */ ++ jnz L(ret_nonzero) ++ /* No ymm register was touched. */ ++ ret ++# else + /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 +@@ -475,16 +498,13 @@ L(between_8_15): + VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax ++ /* Fast path for return zero. */ + jnz L(return_vec_0) + /* No ymm register was touched. */ + ret ++# endif + +- .p2align 4 +-L(zero): +- xorl %eax, %eax +- ret +- +- .p2align 4 ++ .p2align 4,, 10 + L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + vmovdqu (%rsi), %xmm2 +@@ -501,11 +521,17 @@ L(between_16_31): + VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax ++ /* Fast path for return zero. */ + jnz L(return_vec_0) + /* No ymm register was touched. */ + ret + + # ifdef USE_AS_WMEMCMP ++ .p2align 4,, 2 ++L(zero): ++ xorl %eax, %eax ++ ret ++ + .p2align 4 + L(one_or_less): + jb L(zero) +@@ -520,22 +546,20 @@ L(one_or_less): + # else + + .p2align 4 +-L(between_4_7): +- /* Load as big endian with overlapping movbe to avoid branches. +- */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- jz L(zero_4_7) +- sbbl %eax, %eax +- orl $1, %eax +-L(zero_4_7): ++L(between_2_3): ++ /* Load as big endian to avoid branches. */ ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ bswap %eax ++ bswap %ecx ++ shrl %eax ++ shrl %ecx ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx ++ /* Subtraction is okay because the upper bit is zero. */ ++ subl %ecx, %eax + /* No ymm register was touched. */ + ret + # endif +-- +GitLab + diff --git a/SOURCES/ia-code_cleanup-strchr-avx2.patch b/SOURCES/ia-code_cleanup-strchr-avx2.patch new file mode 100644 index 0000000..3d1111b --- /dev/null +++ b/SOURCES/ia-code_cleanup-strchr-avx2.patch @@ -0,0 +1,373 @@ +From 1da58a6d12719da8cc2035c2f6f9928d2ad61a20 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:16 -0500 +Subject: [PATCH] x86: Code cleanup in strchr-avx2 and comment justifying + branch + +Small code cleanup for size: -53 bytes. + +Add comment justifying using a branch to do NULL/non-null return. + +All string/memory tests pass and no regressions in benchtests. + +geometric_mean(N=20) of all benchmarks Original / New: 1.00 +Reviewed-by: H.J. 
Lu + +(cherry picked from commit a6fbf4d51e9ba8063c4f8331564892ead9c67344) +--- + sysdeps/x86_64/multiarch/strchr-avx2.S | 204 +++++++++++++------------ + 1 file changed, 107 insertions(+), 97 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S +index 5884726b..89dd2bf7 100644 +--- a/sysdeps/x86_64/multiarch/strchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S +@@ -48,13 +48,13 @@ + # define PAGE_SIZE 4096 + + .section SECTION(.text),"ax",@progbits +-ENTRY (STRCHR) ++ENTRY_P2ALIGN (STRCHR, 5) + /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + VPBROADCAST %xmm0, %ymm0 +- vpxor %xmm9, %xmm9, %xmm9 ++ vpxor %xmm1, %xmm1, %xmm1 + + /* Check if we cross page boundary with one vector load. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax +@@ -62,37 +62,29 @@ ENTRY (STRCHR) + + /* Check the first VEC_SIZE bytes. Search for both CHAR and the + null byte. */ +- vmovdqu (%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqu (%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax + # ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ +- cmp (%rdi, %rax), %CHAR_REG +- jne L(zero) +-# endif +- addq %rdi, %rax +- VZEROUPPER_RETURN +- +- /* .p2align 5 helps keep performance more consistent if ENTRY() +- alignment % 32 was either 16 or 0. As well this makes the +- alignment % 32 of the loop_4x_vec fixed which makes tuning it +- easier. */ +- .p2align 5 +-L(first_vec_x4): +- tzcntl %eax, %eax +- addq $(VEC_SIZE * 3 + 1), %rdi +-# ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ ++ /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG ++ /* NB: Use a branch instead of cmovcc here. The expectation is ++ that with strchr the user will branch based on input being ++ null. Since this branch will be 100% predictive of the user ++ branch a branch miss here should save what otherwise would ++ be branch miss in the user code. Otherwise using a branch 1) ++ saves code size and 2) is faster in highly predictable ++ environments. */ + jne L(zero) + # endif + addq %rdi, %rax +- VZEROUPPER_RETURN ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + # ifndef USE_AS_STRCHRNUL + L(zero): +@@ -103,7 +95,8 @@ L(zero): + + .p2align 4 + L(first_vec_x1): +- tzcntl %eax, %eax ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax + incq %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +@@ -113,9 +106,10 @@ L(first_vec_x1): + addq %rdi, %rax + VZEROUPPER_RETURN + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x2): +- tzcntl %eax, %eax ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax + addq $(VEC_SIZE + 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +@@ -125,9 +119,10 @@ L(first_vec_x2): + addq %rdi, %rax + VZEROUPPER_RETURN + +- .p2align 4 ++ .p2align 4,, 8 + L(first_vec_x3): +- tzcntl %eax, %eax ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax + addq $(VEC_SIZE * 2 + 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +@@ -137,6 +132,21 @@ L(first_vec_x3): + addq %rdi, %rax + VZEROUPPER_RETURN + ++ .p2align 4,, 10 ++L(first_vec_x4): ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax ++ addq $(VEC_SIZE * 3 + 1), %rdi ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. 
*/ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) ++# endif ++ addq %rdi, %rax ++ VZEROUPPER_RETURN ++ ++ ++ + .p2align 4 + L(aligned_more): + /* Align data to VEC_SIZE - 1. This is the same number of +@@ -146,90 +156,92 @@ L(aligned_more): + L(cross_page_continue): + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ +- vmovdqa 1(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa 1(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x1) + +- vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x2) + +- vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x3) + +- vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x4) +- /* Align data to VEC_SIZE * 4 - 1. */ +- addq $(VEC_SIZE * 4 + 1), %rdi +- andq $-(VEC_SIZE * 4), %rdi ++ /* Align data to VEC_SIZE * 4 - 1. */ ++ incq %rdi ++ orq $(VEC_SIZE * 4 - 1), %rdi + .p2align 4 + L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ +- vmovdqa (%rdi), %ymm5 +- vmovdqa (VEC_SIZE)(%rdi), %ymm6 +- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 +- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 ++ vmovdqa 1(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7 + + /* Leaves only CHARS matching esi as 0. 
*/ +- vpxor %ymm5, %ymm0, %ymm1 + vpxor %ymm6, %ymm0, %ymm2 + vpxor %ymm7, %ymm0, %ymm3 +- vpxor %ymm8, %ymm0, %ymm4 + +- VPMINU %ymm1, %ymm5, %ymm1 + VPMINU %ymm2, %ymm6, %ymm2 + VPMINU %ymm3, %ymm7, %ymm3 +- VPMINU %ymm4, %ymm8, %ymm4 + +- VPMINU %ymm1, %ymm2, %ymm5 +- VPMINU %ymm3, %ymm4, %ymm6 ++ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7 ++ ++ vpxor %ymm6, %ymm0, %ymm4 ++ vpxor %ymm7, %ymm0, %ymm5 ++ ++ VPMINU %ymm4, %ymm6, %ymm4 ++ VPMINU %ymm5, %ymm7, %ymm5 + +- VPMINU %ymm5, %ymm6, %ymm6 ++ VPMINU %ymm2, %ymm3, %ymm6 ++ VPMINU %ymm4, %ymm5, %ymm7 + +- VPCMPEQ %ymm6, %ymm9, %ymm6 +- vpmovmskb %ymm6, %ecx ++ VPMINU %ymm6, %ymm7, %ymm7 ++ ++ VPCMPEQ %ymm7, %ymm1, %ymm7 ++ vpmovmskb %ymm7, %ecx + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) + +- +- VPCMPEQ %ymm1, %ymm9, %ymm1 +- vpmovmskb %ymm1, %eax ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(last_vec_x0) + + +- VPCMPEQ %ymm5, %ymm9, %ymm2 +- vpmovmskb %ymm2, %eax ++ VPCMPEQ %ymm3, %ymm1, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(last_vec_x1) + +- VPCMPEQ %ymm3, %ymm9, %ymm3 +- vpmovmskb %ymm3, %eax ++ VPCMPEQ %ymm4, %ymm1, %ymm4 ++ vpmovmskb %ymm4, %eax + /* rcx has combined result from all 4 VEC. It will only be used + if the first 3 other VEC all did not contain a match. */ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax +- subq $(VEC_SIZE * 2), %rdi ++ subq $(VEC_SIZE * 2 - 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG +@@ -239,10 +251,11 @@ L(loop_4x_vec): + VZEROUPPER_RETURN + + +- .p2align 4 ++ .p2align 4,, 10 + L(last_vec_x0): +- tzcntl %eax, %eax +- addq $-(VEC_SIZE * 4), %rdi ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax ++ addq $-(VEC_SIZE * 4 - 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG +@@ -251,16 +264,11 @@ L(last_vec_x0): + addq %rdi, %rax + VZEROUPPER_RETURN + +-# ifndef USE_AS_STRCHRNUL +-L(zero_end): +- xorl %eax, %eax +- VZEROUPPER_RETURN +-# endif + +- .p2align 4 ++ .p2align 4,, 10 + L(last_vec_x1): + tzcntl %eax, %eax +- subq $(VEC_SIZE * 3), %rdi ++ subq $(VEC_SIZE * 3 - 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG +@@ -269,18 +277,23 @@ L(last_vec_x1): + addq %rdi, %rax + VZEROUPPER_RETURN + ++# ifndef USE_AS_STRCHRNUL ++L(zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN ++# endif + + /* Cold case for crossing page with first load. */ +- .p2align 4 ++ .p2align 4,, 8 + L(cross_page_boundary): + movq %rdi, %rdx + /* Align rdi to VEC_SIZE - 1. */ + orq $(VEC_SIZE - 1), %rdi +- vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT + so no need to manually mod edx. */ + sarxl %edx, %eax, %eax +@@ -291,13 +304,10 @@ L(cross_page_boundary): + xorl %ecx, %ecx + /* Found CHAR or the null byte. 
*/ + cmp (%rdx, %rax), %CHAR_REG +- leaq (%rdx, %rax), %rax +- cmovne %rcx, %rax +-# else +- addq %rdx, %rax ++ jne L(zero_end) + # endif +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ addq %rdx, %rax ++ VZEROUPPER_RETURN + + END (STRCHR) +-# endif ++#endif +-- +GitLab + diff --git a/SOURCES/ia-code_cleanup-strchr-evex.patch b/SOURCES/ia-code_cleanup-strchr-evex.patch new file mode 100644 index 0000000..933ef0b --- /dev/null +++ b/SOURCES/ia-code_cleanup-strchr-evex.patch @@ -0,0 +1,344 @@ +From 23371c15467ef5d3225018278d6691e110119e4b Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:18 -0500 +Subject: [PATCH] x86: Code cleanup in strchr-evex and comment justifying + branch + +Small code cleanup for size: -81 bytes. + +Add comment justifying using a branch to do NULL/non-null return. + +All string/memory tests pass and no regressions in benchtests. + +geometric_mean(N=20) of all benchmarks New / Original: .985 +Reviewed-by: H.J. Lu + +(cherry picked from commit ec285ea90415458225623ddc0492ae3f705af043) +--- + sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++----------- + 1 file changed, 80 insertions(+), 66 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S +index 7f9d4ee4..0b49e0ac 100644 +--- a/sysdeps/x86_64/multiarch/strchr-evex.S ++++ b/sysdeps/x86_64/multiarch/strchr-evex.S +@@ -30,6 +30,7 @@ + # ifdef USE_AS_WCSCHR + # define VPBROADCAST vpbroadcastd + # define VPCMP vpcmpd ++# define VPTESTN vptestnmd + # define VPMINU vpminud + # define CHAR_REG esi + # define SHIFT_REG ecx +@@ -37,6 +38,7 @@ + # else + # define VPBROADCAST vpbroadcastb + # define VPCMP vpcmpb ++# define VPTESTN vptestnmb + # define VPMINU vpminub + # define CHAR_REG sil + # define SHIFT_REG edx +@@ -61,13 +63,11 @@ + # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section .text.evex,"ax",@progbits +-ENTRY (STRCHR) ++ENTRY_P2ALIGN (STRCHR, 5) + /* Broadcast CHAR to YMM0. */ + VPBROADCAST %esi, %YMM0 + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO +- + /* Check if we cross page boundary with one vector load. + Otherwise it is safe to use an unaligned load. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax +@@ -81,49 +81,35 @@ ENTRY (STRCHR) + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ /* NB: Use a branch instead of cmovcc here. The expectation is ++ that with strchr the user will branch based on input being ++ null. Since this branch will be 100% predictive of the user ++ branch a branch miss here should save what otherwise would ++ be branch miss in the user code. Otherwise using a branch 1) ++ saves code size and 2) is faster in highly predictable ++ environments. */ ++ jne L(zero) ++# endif + # ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. + */ + leaq (%rdi, %rax, CHAR_SIZE), %rax + # else + addq %rdi, %rax +-# endif +-# ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ +- cmp (%rax), %CHAR_REG +- jne L(zero) + # endif + ret + +- /* .p2align 5 helps keep performance more consistent if ENTRY() +- alignment % 32 was either 16 or 0. 
As well this makes the +- alignment % 32 of the loop_4x_vec fixed which makes tuning it +- easier. */ +- .p2align 5 +-L(first_vec_x3): +- tzcntl %eax, %eax +-# ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ +- cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG +- jne L(zero) +-# endif +- /* NB: Multiply sizeof char type (1 or 4) to get the number of +- bytes. */ +- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax +- ret + +-# ifndef USE_AS_STRCHRNUL +-L(zero): +- xorl %eax, %eax +- ret +-# endif + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x4): + # ifndef USE_AS_STRCHRNUL + /* Check to see if first match was CHAR (k0) or null (k1). */ +@@ -144,9 +130,18 @@ L(first_vec_x4): + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax + ret + ++# ifndef USE_AS_STRCHRNUL ++L(zero): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ + .p2align 4 + L(first_vec_x1): +- tzcntl %eax, %eax ++ /* Use bsf here to save 1-byte keeping keeping the block in 1x ++ fetch block. eax guranteed non-zero. */ ++ bsfl %eax, %eax + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG +@@ -158,7 +153,7 @@ L(first_vec_x1): + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x2): + # ifndef USE_AS_STRCHRNUL + /* Check to see if first match was CHAR (k0) or null (k1). */ +@@ -179,6 +174,21 @@ L(first_vec_x2): + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + ++ .p2align 4,, 10 ++L(first_vec_x3): ++ /* Use bsf here to save 1-byte keeping keeping the block in 1x ++ fetch block. eax guranteed non-zero. */ ++ bsfl %eax, %eax ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero) ++# endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ + .p2align 4 + L(aligned_more): + /* Align data to VEC_SIZE. */ +@@ -195,7 +205,7 @@ L(cross_page_continue): + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) +@@ -206,7 +216,7 @@ L(cross_page_continue): + /* Each bit in K0 represents a CHAR in YMM1. */ + VPCMP $0, %YMM1, %YMM0, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMM1, %YMMZERO, %k1 ++ VPTESTN %YMM1, %YMM1, %k1 + kortestd %k0, %k1 + jnz L(first_vec_x2) + +@@ -215,7 +225,7 @@ L(cross_page_continue): + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3) +@@ -224,7 +234,7 @@ L(cross_page_continue): + /* Each bit in K0 represents a CHAR in YMM1. */ + VPCMP $0, %YMM1, %YMM0, %k0 + /* Each bit in K1 represents a CHAR in YMM1. 
*/ +- VPCMP $0, %YMM1, %YMMZERO, %k1 ++ VPTESTN %YMM1, %YMM1, %k1 + kortestd %k0, %k1 + jnz L(first_vec_x4) + +@@ -265,33 +275,33 @@ L(loop_4x_vec): + VPMINU %YMM3, %YMM4, %YMM4 + VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} + +- VPCMP $0, %YMMZERO, %YMM4, %k1 ++ VPTESTN %YMM4, %YMM4, %k1 + kmovd %k1, %ecx + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) + +- VPCMP $0, %YMMZERO, %YMM1, %k0 ++ VPTESTN %YMM1, %YMM1, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x1) + +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x2) + +- VPCMP $0, %YMMZERO, %YMM3, %k0 ++ VPTESTN %YMM3, %YMM3, %k0 + kmovd %k0, %eax + /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ + # ifdef USE_AS_WCSCHR + sall $8, %ecx + orl %ecx, %eax +- tzcntl %eax, %eax ++ bsfl %eax, %eax + # else + salq $32, %rcx + orq %rcx, %rax +- tzcntq %rax, %rax ++ bsfq %rax, %rax + # endif + # ifndef USE_AS_STRCHRNUL + /* Check if match was CHAR or null. */ +@@ -303,28 +313,28 @@ L(loop_4x_vec): + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + +-# ifndef USE_AS_STRCHRNUL +-L(zero_end): +- xorl %eax, %eax +- ret ++ .p2align 4,, 8 ++L(last_vec_x1): ++ bsfl %eax, %eax ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. ++ */ ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++# else ++ addq %rdi, %rax + # endif + +- .p2align 4 +-L(last_vec_x1): +- tzcntl %eax, %eax + # ifndef USE_AS_STRCHRNUL + /* Check if match was null. */ +- cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ cmp (%rax), %CHAR_REG + jne L(zero_end) + # endif +- /* NB: Multiply sizeof char type (1 or 4) to get the number of +- bytes. */ +- leaq (%rdi, %rax, CHAR_SIZE), %rax ++ + ret + +- .p2align 4 ++ .p2align 4,, 8 + L(last_vec_x2): +- tzcntl %eax, %eax ++ bsfl %eax, %eax + # ifndef USE_AS_STRCHRNUL + /* Check if match was null. */ + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG +@@ -336,7 +346,7 @@ L(last_vec_x2): + ret + + /* Cold case for crossing page with first load. */ +- .p2align 4 ++ .p2align 4,, 8 + L(cross_page_boundary): + movq %rdi, %rdx + /* Align rdi. */ +@@ -346,9 +356,9 @@ L(cross_page_boundary): + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax +- /* Remove the leading bits. */ ++ /* Remove the leading bits. */ + # ifdef USE_AS_WCSCHR + movl %edx, %SHIFT_REG + /* NB: Divide shift count by 4 since each bit in K1 represent 4 +@@ -360,20 +370,24 @@ L(cross_page_boundary): + /* If eax is zero continue. */ + testl %eax, %eax + jz L(cross_page_continue) +- tzcntl %eax, %eax +-# ifndef USE_AS_STRCHRNUL +- /* Check to see if match was CHAR or null. */ +- cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG +- jne L(zero_end) +-# endif ++ bsfl %eax, %eax ++ + # ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of + bytes. */ + leaq (%rdx, %rax, CHAR_SIZE), %rax + # else + addq %rdx, %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ /* Check to see if match was CHAR or null. 
*/ ++ cmp (%rax), %CHAR_REG ++ je L(cross_page_ret) ++L(zero_end): ++ xorl %eax, %eax ++L(cross_page_ret): + # endif + ret + + END (STRCHR) +-# endif ++#endif +-- +GitLab + diff --git a/SOURCES/ia-imp-l.patch b/SOURCES/ia-imp-l.patch new file mode 100644 index 0000000..3648932 --- /dev/null +++ b/SOURCES/ia-imp-l.patch @@ -0,0 +1,27 @@ +From bc402884eb392fa9c3c5813fca95c0b37d9879a6 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Sat, 5 Feb 2022 11:06:01 -0800 +Subject: [PATCH] x86: Improve L to support L(XXX_SYMBOL (YYY, ZZZ)) + +(cherry picked from commit 1283948f236f209b7d3f44b69a42b96806fa6da0) +--- + sysdeps/x86/sysdep.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h +index a70bb3a2..49b0efe2 100644 +--- a/sysdeps/x86/sysdep.h ++++ b/sysdeps/x86/sysdep.h +@@ -111,7 +111,8 @@ enum cf_protection_level + /* Local label name for asm code. */ + #ifndef L + /* ELF-like local names start with `.L'. */ +-# define L(name) .L##name ++# define LOCAL_LABEL(name) .L##name ++# define L(name) LOCAL_LABEL(name) + #endif + + #define atom_text_section .section ".text.atom", "ax" +-- +GitLab + diff --git a/SOURCES/ia-imp-vec_gen-memset-vec-unaligned-erms.patch b/SOURCES/ia-imp-vec_gen-memset-vec-unaligned-erms.patch new file mode 100644 index 0000000..28aeb5e --- /dev/null +++ b/SOURCES/ia-imp-vec_gen-memset-vec-unaligned-erms.patch @@ -0,0 +1,460 @@ +From 413e4abc92aeb12fb4c188aa53f0425ceac0ef15 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sun, 6 Feb 2022 00:54:18 -0600 +Subject: [PATCH] x86: Improve vec generation in memset-vec-unaligned-erms.S + +No bug. + +Split vec generation into multiple steps. This allows the +broadcast in AVX2 to use 'xmm' registers for the L(less_vec) +case. This saves an expensive lane-cross instruction and removes +the need for 'vzeroupper'. + +For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for +byte broadcast. + +Results for memset-avx2 small (geomean of N = 20 benchset runs). + +size, New Time, Old Time, New / Old + 0, 4.100, 3.831, 0.934 + 1, 5.074, 4.399, 0.867 + 2, 4.433, 4.411, 0.995 + 4, 4.487, 4.415, 0.984 + 8, 4.454, 4.396, 0.987 + 16, 4.502, 4.443, 0.987 + +All relevant string/wcsmbs tests are passing. +Reviewed-by: H.J. 
Lu + +(cherry picked from commit b62ace2740a106222e124cc86956448fa07abf4d) +--- + sysdeps/x86_64/memset.S | 21 ++- + .../multiarch/memset-avx2-unaligned-erms.S | 18 +- + .../multiarch/memset-avx512-unaligned-erms.S | 18 +- + .../multiarch/memset-evex-unaligned-erms.S | 18 +- + .../multiarch/memset-vec-unaligned-erms.S | 164 +++++++++++------- + 5 files changed, 152 insertions(+), 87 deletions(-) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 8672b030..27debd2b 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -28,17 +28,22 @@ + #define VMOVU movups + #define VMOVA movaps + +-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- movq r, %rax; \ +- punpcklbw %xmm0, %xmm0; \ +- punpcklwd %xmm0, %xmm0; \ +- pshufd $0, %xmm0, %xmm0 ++ pxor %xmm1, %xmm1; \ ++ pshufb %xmm1, %xmm0; \ ++ movq r, %rax + +-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- movq r, %rax; \ +- pshufd $0, %xmm0, %xmm0 ++ pshufd $0, %xmm0, %xmm0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + #define SECTION(p) p + +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index 1af668af..c0bf2875 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -10,15 +10,18 @@ + # define VMOVU vmovdqu + # define VMOVA vmovdqa + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ +- movq r, %rax; \ +- vpbroadcastb %xmm0, %ymm0 ++ movq r, %rax; + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- vmovd d, %xmm0; \ +- movq r, %rax; \ +- vpbroadcastd %xmm0, %ymm0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ MEMSET_SET_VEC0_AND_SET_RETURN(d, r) ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0 ++# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0 ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0 ++# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0 + + # ifndef SECTION + # define SECTION(p) p##.avx +@@ -30,5 +33,6 @@ + # define WMEMSET_SYMBOL(p,s) p##_avx2_##s + # endif + ++# define USE_XMM_LESS_VEC + # include "memset-vec-unaligned-erms.S" + #endif +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index f14d6f84..5241216a 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -15,13 +15,19 @@ + + # define VZEROUPPER + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastb d, %VEC0 ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastb d, %VEC0; \ ++ movq r, %rax + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastd d, %VEC0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastd d, %VEC0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + # define SECTION(p) p##.evex512 + # define MEMSET_SYMBOL(p,s) p##_avx512_##s +diff --git 
a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 64b09e77..63700215 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -15,13 +15,19 @@ + + # define VZEROUPPER + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastb d, %VEC0 ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastb d, %VEC0; \ ++ movq r, %rax + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastd d, %VEC0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastd d, %VEC0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + # define SECTION(p) p##.evex + # define MEMSET_SYMBOL(p,s) p##_evex_##s +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index f08b7323..a67f9833 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -58,8 +58,10 @@ + #ifndef MOVQ + # if VEC_SIZE > 16 + # define MOVQ vmovq ++# define MOVD vmovd + # else + # define MOVQ movq ++# define MOVD movd + # endif + #endif + +@@ -72,9 +74,17 @@ + #if defined USE_WITH_EVEX || defined USE_WITH_AVX512 + # define END_REG rcx + # define LOOP_REG rdi ++# define LESS_VEC_REG rax + #else + # define END_REG rdi + # define LOOP_REG rdx ++# define LESS_VEC_REG rdi ++#endif ++ ++#ifdef USE_XMM_LESS_VEC ++# define XMM_SMALL 1 ++#else ++# define XMM_SMALL 0 + #endif + + #define PAGE_SIZE 4096 +@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) + + ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + shl $2, %RDX_LP +- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) +- jmp L(entry_from_bzero) ++ WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) ++ WMEMSET_VDUP_TO_VEC0_LOW() ++ cmpq $VEC_SIZE, %rdx ++ jb L(less_vec_no_vdup) ++ WMEMSET_VDUP_TO_VEC0_HIGH() ++ jmp L(entry_from_wmemset) + END (WMEMSET_SYMBOL (__wmemset, unaligned)) + #endif + +@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + #endif + + ENTRY (MEMSET_SYMBOL (__memset, unaligned)) +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) ++ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx +@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) ++ MEMSET_VDUP_TO_VEC0_HIGH() ++L(entry_from_wmemset): + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ +@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + # endif + + ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6) +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) ++ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx + # endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) ++ MEMSET_VDUP_TO_VEC0_HIGH () + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(stosb_more_2x_vec) +- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. +- */ +- VMOVU %VEC(0), (%rax) +- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. 
*/ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) + VZEROUPPER_RETURN + #endif + +- .p2align 4,, 10 ++ .p2align 4,, 4 + L(last_2x_vec): + #ifdef USE_LESS_VEC_MASK_STORE +- VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx) +- VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx) ++ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) + #else + VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi) + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi) +@@ -212,6 +228,7 @@ L(last_2x_vec): + #ifdef USE_LESS_VEC_MASK_STORE + .p2align 4,, 10 + L(less_vec): ++L(less_vec_no_vdup): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! +@@ -262,28 +279,18 @@ L(stosb_more_2x_vec): + /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x] + and (4x, 8x] jump to target. */ + L(more_2x_vec): +- +- /* Two different methods of setting up pointers / compare. The +- two methods are based on the fact that EVEX/AVX512 mov +- instructions take more bytes then AVX2/SSE2 mov instructions. As +- well that EVEX/AVX512 machines also have fast LEA_BID. Both +- setup and END_REG to avoid complex address mode. For EVEX/AVX512 +- this saves code size and keeps a few targets in one fetch block. +- For AVX2/SSE2 this helps prevent AGU bottlenecks. */ +-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 +- /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + +- LOOP_4X_OFFSET) with LEA_BID. */ +- +- /* END_REG is rcx for EVEX/AVX512. */ +- leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG +-#endif +- +- /* Stores to first 2x VEC before cmp as any path forward will +- require it. */ +- VMOVU %VEC(0), (%rax) +- VMOVU %VEC(0), VEC_SIZE(%rax) ++ /* Store next 2x vec regardless. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi) + + ++ /* Two different methods of setting up pointers / compare. The two ++ methods are based on the fact that EVEX/AVX512 mov instructions take ++ more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512 ++ machines also have fast LEA_BID. Both setup and END_REG to avoid complex ++ address mode. For EVEX/AVX512 this saves code size and keeps a few ++ targets in one fetch block. For AVX2/SSE2 this helps prevent AGU ++ bottlenecks. */ + #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) + /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */ + addq %rdx, %END_REG +@@ -292,6 +299,15 @@ L(more_2x_vec): + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_2x_vec) + ++ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with ++ LEA_BID. */ ++ ++ /* END_REG is rcx for EVEX/AVX512. */ ++ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG ++#endif ++ + /* Store next 2x vec regardless. */ + VMOVU %VEC(0), (VEC_SIZE * 2)(%rax) + VMOVU %VEC(0), (VEC_SIZE * 3)(%rax) +@@ -355,65 +371,93 @@ L(stosb_local): + /* Define L(less_vec) only if not otherwise defined. */ + .p2align 4 + L(less_vec): ++ /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to ++ xmm). This is only does anything for AVX2. 
*/ ++ MEMSET_VDUP_TO_VEC0_LOW () ++L(less_vec_no_vdup): + #endif + L(cross_page): + #if VEC_SIZE > 32 + cmpl $32, %edx +- jae L(between_32_63) ++ jge L(between_32_63) + #endif + #if VEC_SIZE > 16 + cmpl $16, %edx +- jae L(between_16_31) ++ jge L(between_16_31) ++#endif ++#ifndef USE_XMM_LESS_VEC ++ MOVQ %XMM0, %rcx + #endif +- MOVQ %XMM0, %rdi + cmpl $8, %edx +- jae L(between_8_15) ++ jge L(between_8_15) + cmpl $4, %edx +- jae L(between_4_7) ++ jge L(between_4_7) + cmpl $1, %edx +- ja L(between_2_3) +- jb L(return) +- movb %sil, (%rax) +- VZEROUPPER_RETURN ++ jg L(between_2_3) ++ jl L(between_0_0) ++ movb %sil, (%LESS_VEC_REG) ++L(between_0_0): ++ ret + +- /* Align small targets only if not doing so would cross a fetch +- line. */ ++ /* Align small targets only if not doing so would cross a fetch line. ++ */ + #if VEC_SIZE > 32 + .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): +- VMOVU %YMM0, (%rax) +- VMOVU %YMM0, -32(%rax, %rdx) ++ VMOVU %YMM0, (%LESS_VEC_REG) ++ VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx) + VZEROUPPER_RETURN + #endif + + #if VEC_SIZE >= 32 +- .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) ++ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1) + L(between_16_31): + /* From 16 to 31. No branch when size == 16. */ +- VMOVU %XMM0, (%rax) +- VMOVU %XMM0, -16(%rax, %rdx) +- VZEROUPPER_RETURN ++ VMOVU %XMM0, (%LESS_VEC_REG) ++ VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx) ++ ret + #endif + +- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) ++ /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. ++ */ ++ .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1) + L(between_8_15): + /* From 8 to 15. No branch when size == 8. */ +- movq %rdi, (%rax) +- movq %rdi, -8(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ MOVQ %XMM0, (%rdi) ++ MOVQ %XMM0, -8(%rdi, %rdx) ++#else ++ movq %rcx, (%LESS_VEC_REG) ++ movq %rcx, -8(%LESS_VEC_REG, %rdx) ++#endif ++ ret + +- .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE) ++ /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. ++ */ ++ .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1) + L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ +- movl %edi, (%rax) +- movl %edi, -4(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ MOVD %XMM0, (%rdi) ++ MOVD %XMM0, -4(%rdi, %rdx) ++#else ++ movl %ecx, (%LESS_VEC_REG) ++ movl %ecx, -4(%LESS_VEC_REG, %rdx) ++#endif ++ ret + +- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) ++ /* 4 * XMM_SMALL for the third mov for AVX2. */ ++ .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1) + L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ +- movw %di, (%rax) +- movb %dil, -1(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ movb %sil, (%rdi) ++ movb %sil, 1(%rdi) ++ movb %sil, -1(%rdi, %rdx) ++#else ++ movw %cx, (%LESS_VEC_REG) ++ movb %sil, -1(%LESS_VEC_REG, %rdx) ++#endif ++ ret + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +-- +GitLab + diff --git a/SOURCES/ia-imp-wcslen.patch b/SOURCES/ia-imp-wcslen.patch new file mode 100644 index 0000000..7f2b720 --- /dev/null +++ b/SOURCES/ia-imp-wcslen.patch @@ -0,0 +1,258 @@ +From ac23759b655ceac1bd18b71f45bb1743826f0bed Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 25 Mar 2022 17:13:33 -0500 +Subject: [PATCH] x86: Small improvements for wcslen + +Just a few QOL changes. + 1. Prefer `add` > `lea` as it has high execution units it can run + on. + 2. Don't break macro-fusion between `test` and `jcc` + 3. 
Reduce code size by removing gratuitous padding bytes (-90 + bytes). + +geometric_mean(N=20) of all benchmarks New / Original: 0.959 + +All string/memory tests pass. +Reviewed-by: H.J. Lu + +(cherry picked from commit 244b415d386487521882debb845a040a4758cb18) +--- + sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++--------------------- + 1 file changed, 41 insertions(+), 45 deletions(-) + +diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S +index 9f5f7232..254bb030 100644 +--- a/sysdeps/x86_64/wcslen.S ++++ b/sysdeps/x86_64/wcslen.S +@@ -41,82 +41,82 @@ ENTRY (__wcslen) + pxor %xmm0, %xmm0 + + lea 32(%rdi), %rax +- lea 16(%rdi), %rcx ++ addq $16, %rdi + and $-16, %rax + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + and $-0x40, %rax +@@ -133,104 +133,100 @@ L(aligned_64_loop): + pminub %xmm0, %xmm2 + pcmpeqd %xmm3, %xmm2 + pmovmskb %xmm2, %edx ++ addq $64, %rax + test %edx, %edx +- lea 64(%rax), %rax + jz L(aligned_64_loop) + + pcmpeqd -64(%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $48, %rdi + test %edx, %edx +- lea 48(%rcx), %rcx + jnz L(exit) + + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm3, %edx ++ addq $-16, %rdi + test %edx, %edx +- lea -16(%rcx), %rcx + jnz L(exit) + + pcmpeqd -32(%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $-16, %rdi + test %edx, %edx +- lea -16(%rcx), %rcx + jnz L(exit) + + pcmpeqd %xmm6, %xmm3 + pmovmskb %xmm3, %edx ++ addq $-16, %rdi + test %edx, %edx +- lea -16(%rcx), %rcx +- jnz L(exit) +- +- jmp L(aligned_64_loop) ++ jz L(aligned_64_loop) + + .p2align 4 + L(exit): +- sub %rcx, %rax ++ sub %rdi, %rax + shr $2, %rax + test %dl, %dl + jz L(exit_high) + +- mov %dl, %cl +- and $15, %cl ++ andl $15, %edx + jz L(exit_1) + ret + +- .p2align 4 ++ /* No align here. Naturally aligned % 16 == 1. 
*/ + L(exit_high): +- mov %dh, %ch +- and $15, %ch ++ andl $(15 << 8), %edx + jz L(exit_3) + add $2, %rax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_1): + add $1, %rax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_3): + add $3, %rax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail0): +- xor %rax, %rax ++ xorl %eax, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail1): +- mov $1, %rax ++ movl $1, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail2): +- mov $2, %rax ++ movl $2, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail3): +- mov $3, %rax ++ movl $3, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail4): +- mov $4, %rax ++ movl $4, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail5): +- mov $5, %rax ++ movl $5, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail6): +- mov $6, %rax ++ movl $6, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail7): +- mov $7, %rax ++ movl $7, %eax + ret + + END (__wcslen) +-- +GitLab + diff --git a/SOURCES/ia-march-srt-sysdep_routines.patch b/SOURCES/ia-march-srt-sysdep_routines.patch new file mode 100644 index 0000000..2525200 --- /dev/null +++ b/SOURCES/ia-march-srt-sysdep_routines.patch @@ -0,0 +1,104 @@ +From 18bfa4d2f8ce8a33367e93ab2eaab90be1133c86 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Sat, 5 Feb 2022 11:52:33 -0800 +Subject: [PATCH] x86_64/multiarch: Sort sysdep_routines and put one entry per + line + +(cherry picked from commit c328d0152d4b14cca58407ec68143894c8863004) +--- + sysdeps/x86_64/multiarch/Makefile | 78 +++++++++++++++++++------------ + 1 file changed, 48 insertions(+), 30 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 37d8d6f0..8c9e7812 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -132,37 +132,55 @@ CFLAGS-strspn-c.c += -msse4 + endif + + ifeq ($(subdir),wcsmbs) +-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ +- wmemcmp-avx2-movbe \ +- wmemchr-sse2 wmemchr-avx2 \ +- wcscmp-sse2 wcscmp-avx2 \ +- wcsncmp-sse2 wcsncmp-avx2 \ +- wcscpy-ssse3 wcscpy-c \ +- wcschr-sse2 wcschr-avx2 \ +- wcsrchr-sse2 wcsrchr-avx2 \ +- wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \ +- wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \ +- wcschr-avx2-rtm \ +- wcscmp-avx2-rtm \ +- wcslen-avx2-rtm \ +- wcsncmp-avx2-rtm \ +- wcsnlen-avx2-rtm \ +- wcsrchr-avx2-rtm \ +- wmemchr-avx2-rtm \ +- wmemcmp-avx2-movbe-rtm \ +- wcschr-evex \ +- wcscmp-evex \ +- wcslen-evex \ +- wcsncmp-evex \ +- wcsnlen-evex \ +- wcsrchr-evex \ +- wmemchr-evex \ +- wmemcmp-evex-movbe \ +- wmemchr-evex-rtm ++sysdep_routines += \ ++ wcschr-avx2 \ ++ wcschr-avx2-rtm \ ++ wcschr-evex \ ++ wcschr-sse2 \ ++ wcscmp-avx2 \ ++ wcscmp-avx2-rtm \ ++ wcscmp-evex \ ++ wcscmp-sse2 \ ++ wcscpy-c \ ++ wcscpy-ssse3 \ ++ wcslen-avx2 \ ++ wcslen-avx2-rtm \ ++ wcslen-evex \ ++ wcslen-sse2 \ ++ wcslen-sse4_1 \ ++ wcsncmp-avx2 \ ++ wcsncmp-avx2-rtm \ ++ wcsncmp-evex \ ++ wcsncmp-sse2 \ ++ wcsnlen-avx2 \ ++ wcsnlen-avx2-rtm \ ++ wcsnlen-c \ ++ wcsnlen-evex \ ++ wcsnlen-sse4_1 \ ++ wcsrchr-avx2 \ ++ wcsrchr-avx2-rtm \ ++ wcsrchr-evex \ ++ wcsrchr-sse2 \ ++ wmemchr-avx2 \ ++ wmemchr-avx2-rtm \ ++ wmemchr-evex \ ++ wmemchr-evex-rtm \ ++ wmemchr-sse2 \ ++ wmemcmp-avx2-movbe \ ++ wmemcmp-avx2-movbe-rtm \ ++ wmemcmp-c \ ++ wmemcmp-evex-movbe \ ++ wmemcmp-sse4 \ ++ wmemcmp-ssse3 \ ++# sysdep_routines + endif + + ifeq ($(subdir),debug) +-sysdep_routines += memcpy_chk-nonshared mempcpy_chk-nonshared \ +- memmove_chk-nonshared memset_chk-nonshared \ +- wmemset_chk-nonshared 
++sysdep_routines += \ ++ memcpy_chk-nonshared \ ++ memmove_chk-nonshared \ ++ mempcpy_chk-nonshared \ ++ memset_chk-nonshared \ ++ wmemset_chk-nonshared \ ++# sysdep_routines + endif +-- +GitLab + diff --git a/SOURCES/ia-opt-bzero.patch b/SOURCES/ia-opt-bzero.patch new file mode 100644 index 0000000..e844865 --- /dev/null +++ b/SOURCES/ia-opt-bzero.patch @@ -0,0 +1,750 @@ +From 94783c6e57638122cefe4e02342c7fafc3cf09f0 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 7 Feb 2022 05:55:15 -0800 +Subject: [PATCH] x86-64: Optimize bzero + +Add OPTIMIZE1 and OPTIMIZE2 in ifunc-init.h file. +Remove memcmpeq implementation from Makefile. + +memset with zero as the value to set is by far the majority value (99%+ +for Python3 and GCC). + +bzero can be slightly more optimized for this case by using a zero-idiom +xor for broadcasting the set value to a register (vector or GPR). + +Co-developed-by: Noah Goldstein +(cherry picked from commit 3d9f171bfb5325bd5f427e9fc386453358c6e840) +--- + sysdeps/generic/ifunc-init.h | 5 +- + sysdeps/x86_64/memset.S | 8 + + sysdeps/x86_64/multiarch/Makefile | 205 +++++++++++------- + sysdeps/x86_64/multiarch/bzero.c | 106 +++++++++ + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 42 ++++ + .../memset-avx2-unaligned-erms-rtm.S | 1 + + .../multiarch/memset-avx2-unaligned-erms.S | 6 + + .../multiarch/memset-avx512-unaligned-erms.S | 3 + + .../multiarch/memset-evex-unaligned-erms.S | 3 + + .../multiarch/memset-sse2-unaligned-erms.S | 1 + + .../multiarch/memset-vec-unaligned-erms.S | 110 +++++++--- + 11 files changed, 384 insertions(+), 106 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/bzero.c + +diff --git a/sysdeps/generic/ifunc-init.h b/sysdeps/generic/ifunc-init.h +index 241e4161..f7a72375 100644 +--- a/sysdeps/generic/ifunc-init.h ++++ b/sysdeps/generic/ifunc-init.h +@@ -50,5 +50,8 @@ + '___' as the optimized implementation and + '_ifunc_selector' as the IFUNC selector. */ + #define REDIRECT_NAME EVALUATOR1 (__redirect, SYMBOL_NAME) +-#define OPTIMIZE(name) EVALUATOR2 (SYMBOL_NAME, name) ++#define OPTIMIZE1(name) EVALUATOR1 (SYMBOL_NAME, name) ++#define OPTIMIZE2(name) EVALUATOR2 (SYMBOL_NAME, name) ++/* Default is to use OPTIMIZE2. 
*/ ++#define OPTIMIZE(name) OPTIMIZE2(name) + #define IFUNC_SELECTOR EVALUATOR1 (SYMBOL_NAME, ifunc_selector) +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 4cb4aa71..a1353f89 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -35,6 +35,9 @@ + punpcklwd %xmm0, %xmm0; \ + pshufd $0, %xmm0, %xmm0 + ++# define BZERO_ZERO_VEC0() \ ++ pxor %xmm0, %xmm0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ + pshufd $0, %xmm0, %xmm0; \ +@@ -53,6 +56,10 @@ + # define MEMSET_SYMBOL(p,s) memset + #endif + ++#ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) __bzero ++#endif ++ + #ifndef WMEMSET_SYMBOL + # define WMEMSET_CHK_SYMBOL(p,s) p + # define WMEMSET_SYMBOL(p,s) __wmemset +@@ -63,6 +70,7 @@ + libc_hidden_builtin_def (memset) + + #if IS_IN (libc) ++weak_alias (__bzero, bzero) + libc_hidden_def (__wmemset) + weak_alias (__wmemset, wmemset) + libc_hidden_weak (wmemset) +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 26be4095..37d8d6f0 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -1,85 +1,130 @@ + ifeq ($(subdir),string) + +-sysdep_routines += strncat-c stpncpy-c strncpy-c \ +- strcmp-sse2 strcmp-sse2-unaligned strcmp-ssse3 \ +- strcmp-sse4_2 strcmp-avx2 \ +- strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 strncmp-avx2 \ +- memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \ +- memrchr-sse2 memrchr-avx2 \ +- memcmp-sse2 \ +- memcmp-avx2-movbe \ +- memcmp-sse4 memcpy-ssse3 \ +- memmove-ssse3 \ +- memcpy-ssse3-back \ +- memmove-ssse3-back \ +- memmove-avx512-no-vzeroupper \ +- strcasecmp_l-sse2 strcasecmp_l-ssse3 \ +- strcasecmp_l-sse4_2 strcasecmp_l-avx \ +- strncase_l-sse2 strncase_l-ssse3 \ +- strncase_l-sse4_2 strncase_l-avx \ +- strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \ +- strrchr-sse2 strrchr-avx2 \ +- strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \ +- strcat-avx2 strncat-avx2 \ +- strcat-ssse3 strncat-ssse3\ +- strcpy-avx2 strncpy-avx2 \ +- strcpy-sse2 stpcpy-sse2 \ +- strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ +- strcpy-sse2-unaligned strncpy-sse2-unaligned \ +- stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ +- stpcpy-avx2 stpncpy-avx2 \ +- strcat-sse2 \ +- strcat-sse2-unaligned strncat-sse2-unaligned \ +- strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ +- strcspn-sse2 strpbrk-sse2 strspn-sse2 \ +- strcspn-c strpbrk-c strspn-c varshift \ +- memset-avx512-no-vzeroupper \ +- memmove-sse2-unaligned-erms \ +- memmove-avx-unaligned-erms \ +- memmove-avx512-unaligned-erms \ +- memset-sse2-unaligned-erms \ +- memset-avx2-unaligned-erms \ +- memset-avx512-unaligned-erms \ +- memchr-avx2-rtm \ +- memcmp-avx2-movbe-rtm \ +- memmove-avx-unaligned-erms-rtm \ +- memrchr-avx2-rtm \ +- memset-avx2-unaligned-erms-rtm \ +- rawmemchr-avx2-rtm \ +- strchr-avx2-rtm \ +- strcmp-avx2-rtm \ +- strchrnul-avx2-rtm \ +- stpcpy-avx2-rtm \ +- stpncpy-avx2-rtm \ +- strcat-avx2-rtm \ +- strcpy-avx2-rtm \ +- strlen-avx2-rtm \ +- strncat-avx2-rtm \ +- strncmp-avx2-rtm \ +- strncpy-avx2-rtm \ +- strnlen-avx2-rtm \ +- strrchr-avx2-rtm \ +- memchr-evex \ +- memcmp-evex-movbe \ +- memmove-evex-unaligned-erms \ +- memrchr-evex \ +- memset-evex-unaligned-erms \ +- rawmemchr-evex \ +- stpcpy-evex \ +- stpncpy-evex \ +- strcat-evex \ +- strchr-evex \ +- strchrnul-evex \ +- strcmp-evex \ +- strcpy-evex \ +- strlen-evex \ +- strncat-evex \ +- strncmp-evex \ +- strncpy-evex \ +- strnlen-evex \ +- strrchr-evex \ +- memchr-evex-rtm \ +- 
rawmemchr-evex-rtm ++sysdep_routines += \ ++ bzero \ ++ memchr-avx2 \ ++ memchr-avx2-rtm \ ++ memchr-evex \ ++ memchr-evex-rtm \ ++ memchr-sse2 \ ++ memcmp-avx2-movbe \ ++ memcmp-avx2-movbe-rtm \ ++ memcmp-evex-movbe \ ++ memcmp-sse2 \ ++ memcmp-sse4 \ ++ memcmp-ssse3 \ ++ memcpy-ssse3 \ ++ memcpy-ssse3-back \ ++ memmove-avx-unaligned-erms \ ++ memmove-avx-unaligned-erms-rtm \ ++ memmove-avx512-no-vzeroupper \ ++ memmove-avx512-unaligned-erms \ ++ memmove-evex-unaligned-erms \ ++ memmove-sse2-unaligned-erms \ ++ memmove-ssse3 \ ++ memmove-ssse3-back \ ++ memrchr-avx2 \ ++ memrchr-avx2-rtm \ ++ memrchr-evex \ ++ memrchr-sse2 \ ++ memset-avx2-unaligned-erms \ ++ memset-avx2-unaligned-erms-rtm \ ++ memset-avx512-no-vzeroupper \ ++ memset-avx512-unaligned-erms \ ++ memset-evex-unaligned-erms \ ++ memset-sse2-unaligned-erms \ ++ rawmemchr-avx2 \ ++ rawmemchr-avx2-rtm \ ++ rawmemchr-evex \ ++ rawmemchr-evex-rtm \ ++ rawmemchr-sse2 \ ++ stpcpy-avx2 \ ++ stpcpy-avx2-rtm \ ++ stpcpy-evex \ ++ stpcpy-sse2 \ ++ stpcpy-sse2-unaligned \ ++ stpcpy-ssse3 \ ++ stpncpy-avx2 \ ++ stpncpy-avx2-rtm \ ++ stpncpy-c \ ++ stpncpy-evex \ ++ stpncpy-sse2-unaligned \ ++ stpncpy-ssse3 \ ++ strcasecmp_l-avx \ ++ strcasecmp_l-sse2 \ ++ strcasecmp_l-sse4_2 \ ++ strcasecmp_l-ssse3 \ ++ strcat-avx2 \ ++ strcat-avx2-rtm \ ++ strcat-evex \ ++ strcat-sse2 \ ++ strcat-sse2-unaligned \ ++ strcat-ssse3 \ ++ strchr-avx2 \ ++ strchr-avx2-rtm \ ++ strchr-evex \ ++ strchr-sse2 \ ++ strchr-sse2-no-bsf \ ++ strchrnul-avx2 \ ++ strchrnul-avx2-rtm \ ++ strchrnul-evex \ ++ strchrnul-sse2 \ ++ strcmp-avx2 \ ++ strcmp-avx2-rtm \ ++ strcmp-evex \ ++ strcmp-sse2 \ ++ strcmp-sse2-unaligned \ ++ strcmp-sse4_2 \ ++ strcmp-ssse3 \ ++ strcpy-avx2 \ ++ strcpy-avx2-rtm \ ++ strcpy-evex \ ++ strcpy-sse2 \ ++ strcpy-sse2-unaligned \ ++ strcpy-ssse3 \ ++ strcspn-c \ ++ strcspn-sse2 \ ++ strlen-avx2 \ ++ strlen-avx2-rtm \ ++ strlen-evex \ ++ strlen-sse2 \ ++ strncase_l-avx \ ++ strncase_l-sse2 \ ++ strncase_l-sse4_2 \ ++ strncase_l-ssse3 \ ++ strncat-avx2 \ ++ strncat-avx2-rtm \ ++ strncat-c \ ++ strncat-evex \ ++ strncat-sse2-unaligned \ ++ strncat-ssse3 \ ++ strncmp-avx2 \ ++ strncmp-avx2-rtm \ ++ strncmp-evex \ ++ strncmp-sse2 \ ++ strncmp-sse4_2 \ ++ strncmp-ssse3 \ ++ strncpy-avx2 \ ++ strncpy-avx2-rtm \ ++ strncpy-c \ ++ strncpy-evex \ ++ strncpy-sse2-unaligned \ ++ strncpy-ssse3 \ ++ strnlen-avx2 \ ++ strnlen-avx2-rtm \ ++ strnlen-evex \ ++ strnlen-sse2 \ ++ strpbrk-c \ ++ strpbrk-sse2 \ ++ strrchr-avx2 \ ++ strrchr-avx2-rtm \ ++ strrchr-evex \ ++ strrchr-sse2 \ ++ strspn-c \ ++ strspn-sse2 \ ++ strstr-sse2-unaligned \ ++ varshift \ ++# sysdep_routines + CFLAGS-varshift.c += -msse4 + CFLAGS-strcspn-c.c += -msse4 + CFLAGS-strpbrk-c.c += -msse4 +diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c +new file mode 100644 +index 00000000..58a14b2c +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/bzero.c +@@ -0,0 +1,106 @@ ++/* Multiple versions of bzero. ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++/* Define multiple versions only for the definition in libc. */ ++#if IS_IN (libc) ++# define __bzero __redirect___bzero ++# include ++# undef __bzero ++ ++# define SYMBOL_NAME __bzero ++# include ++ ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms) ++ attribute_hidden; ++ ++static inline void * ++IFUNC_SELECTOR (void) ++{ ++ const struct cpu_features* cpu_features = __get_cpu_features (); ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx512_unaligned_erms); ++ ++ return OPTIMIZE1 (avx512_unaligned); ++ } ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (evex_unaligned_erms); ++ ++ return OPTIMIZE1 (evex_unaligned); ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx2_unaligned_erms_rtm); ++ ++ return OPTIMIZE1 (avx2_unaligned_rtm); ++ } ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx2_unaligned_erms); ++ ++ return OPTIMIZE1 (avx2_unaligned); ++ } ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (sse2_unaligned_erms); ++ ++ return OPTIMIZE1 (sse2_unaligned); ++} ++ ++libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ()); ++ ++weak_alias (__bzero, bzero) ++#endif +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 8be0d78a..c963d391 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -282,6 +282,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __memset_avx512_no_vzeroupper) + ) + ++ /* Support sysdeps/x86_64/multiarch/bzero.c. 
*/ ++ IFUNC_IMPL (i, name, bzero, ++ IFUNC_IMPL_ADD (array, i, bzero, 1, ++ __bzero_sse2_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, 1, ++ __bzero_sse2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ CPU_FEATURE_USABLE (AVX2), ++ __bzero_avx2_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ CPU_FEATURE_USABLE (AVX2), ++ __bzero_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __bzero_avx2_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __bzero_avx2_unaligned_erms_rtm) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_evex_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_avx512_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_avx512_unaligned) ++ ) ++ + /* Support sysdeps/x86_64/multiarch/rawmemchr.c. */ + IFUNC_IMPL (i, name, rawmemchr, + IFUNC_IMPL_ADD (array, i, rawmemchr, +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +index 8ac3e479..5a5ee6f6 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +@@ -5,6 +5,7 @@ + + #define SECTION(p) p##.avx.rtm + #define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm ++#define BZERO_SYMBOL(p,s) p##_avx2_##s##_rtm + #define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm + + #include "memset-avx2-unaligned-erms.S" +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index c0bf2875..a093a283 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -14,6 +14,9 @@ + vmovd d, %xmm0; \ + movq r, %rax; + ++# define BZERO_ZERO_VEC0() \ ++ vpxor %xmm0, %xmm0, %xmm0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + MEMSET_SET_VEC0_AND_SET_RETURN(d, r) + +@@ -29,6 +32,9 @@ + # ifndef MEMSET_SYMBOL + # define MEMSET_SYMBOL(p,s) p##_avx2_##s + # endif ++# ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) p##_avx2_##s ++# endif + # ifndef WMEMSET_SYMBOL + # define WMEMSET_SYMBOL(p,s) p##_avx2_##s + # endif +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 5241216a..727c9213 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -19,6 +19,9 @@ + vpbroadcastb d, %VEC0; \ + movq r, %rax + ++# define BZERO_ZERO_VEC0() \ ++ vpxorq %XMM0, %XMM0, %XMM0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 63700215..5d8fa78f 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -19,6 +19,9 @@ + vpbroadcastb d, %VEC0; \ 
+ movq r, %rax + ++# define BZERO_ZERO_VEC0() \ ++ vpxorq %XMM0, %XMM0, %XMM0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax +diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +index 56b81f5c..8f579ad6 100644 +--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +@@ -22,6 +22,7 @@ + + #if IS_IN (libc) + # define MEMSET_SYMBOL(p,s) p##_sse2_##s ++# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) + # define WMEMSET_SYMBOL(p,s) p##_sse2_##s + + # ifdef SHARED +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index a67f9833..06f5f5d7 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -26,6 +26,10 @@ + + #include + ++#ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) ++#endif ++ + #ifndef MEMSET_CHK_SYMBOL + # define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s) + #endif +@@ -87,6 +91,18 @@ + # define XMM_SMALL 0 + #endif + ++#ifdef USE_LESS_VEC_MASK_STORE ++# define SET_REG64 rcx ++# define SET_REG32 ecx ++# define SET_REG16 cx ++# define SET_REG8 cl ++#else ++# define SET_REG64 rsi ++# define SET_REG32 esi ++# define SET_REG16 si ++# define SET_REG8 sil ++#endif ++ + #define PAGE_SIZE 4096 + + /* Macro to calculate size of small memset block for aligning +@@ -96,18 +112,6 @@ + + #ifndef SECTION + # error SECTION is not defined! +-#endif +- +- .section SECTION(.text),"ax",@progbits +-#if VEC_SIZE == 16 && IS_IN (libc) +-ENTRY (__bzero) +- mov %RDI_LP, %RAX_LP /* Set return value. */ +- mov %RSI_LP, %RDX_LP /* Set n. */ +- xorl %esi, %esi +- pxor %XMM0, %XMM0 +- jmp L(entry_from_bzero) +-END (__bzero) +-weak_alias (__bzero, bzero) + #endif + + #if IS_IN (libc) +@@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + WMEMSET_VDUP_TO_VEC0_LOW() + cmpq $VEC_SIZE, %rdx +- jb L(less_vec_no_vdup) ++ jb L(less_vec_from_wmemset) + WMEMSET_VDUP_TO_VEC0_HIGH() + jmp L(entry_from_wmemset) + END (WMEMSET_SYMBOL (__wmemset, unaligned)) + #endif + ++ENTRY (BZERO_SYMBOL(__bzero, unaligned)) ++#if VEC_SIZE > 16 ++ BZERO_ZERO_VEC0 () ++#endif ++ mov %RDI_LP, %RAX_LP ++ mov %RSI_LP, %RDX_LP ++#ifndef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++#endif ++ cmp $VEC_SIZE, %RDX_LP ++ jb L(less_vec_no_vdup) ++#ifdef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++#endif ++#if VEC_SIZE <= 16 ++ BZERO_ZERO_VEC0 () ++#endif ++ cmp $(VEC_SIZE * 2), %RDX_LP ++ ja L(more_2x_vec) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) ++ VZEROUPPER_RETURN ++END (BZERO_SYMBOL(__bzero, unaligned)) ++ + #if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + cmp %RDX_LP, %RCX_LP +@@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + /* Clear the upper 32 bits. 
*/ + mov %edx, %edx + # endif +-L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + MEMSET_VDUP_TO_VEC0_HIGH() +@@ -187,6 +215,31 @@ END (__memset_erms) + END (MEMSET_SYMBOL (__memset, erms)) + # endif + ++ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6) ++# if VEC_SIZE > 16 ++ BZERO_ZERO_VEC0 () ++# endif ++ mov %RDI_LP, %RAX_LP ++ mov %RSI_LP, %RDX_LP ++# ifndef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++# endif ++ cmp $VEC_SIZE, %RDX_LP ++ jb L(less_vec_no_vdup) ++# ifdef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++# endif ++# if VEC_SIZE <= 16 ++ BZERO_ZERO_VEC0 () ++# endif ++ cmp $(VEC_SIZE * 2), %RDX_LP ++ ja L(stosb_more_2x_vec) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) ++ VZEROUPPER_RETURN ++END (BZERO_SYMBOL(__bzero, unaligned_erms)) ++ + # if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + cmp %RDX_LP, %RCX_LP +@@ -229,6 +282,7 @@ L(last_2x_vec): + .p2align 4,, 10 + L(less_vec): + L(less_vec_no_vdup): ++L(less_vec_from_wmemset): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! +@@ -374,8 +428,11 @@ L(less_vec): + /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to + xmm). This is only does anything for AVX2. */ + MEMSET_VDUP_TO_VEC0_LOW () ++L(less_vec_from_wmemset): ++#if VEC_SIZE > 16 + L(less_vec_no_vdup): + #endif ++#endif + L(cross_page): + #if VEC_SIZE > 32 + cmpl $32, %edx +@@ -386,7 +443,10 @@ L(cross_page): + jge L(between_16_31) + #endif + #ifndef USE_XMM_LESS_VEC +- MOVQ %XMM0, %rcx ++ MOVQ %XMM0, %SET_REG64 ++#endif ++#if VEC_SIZE <= 16 ++L(less_vec_no_vdup): + #endif + cmpl $8, %edx + jge L(between_8_15) +@@ -395,7 +455,7 @@ L(cross_page): + cmpl $1, %edx + jg L(between_2_3) + jl L(between_0_0) +- movb %sil, (%LESS_VEC_REG) ++ movb %SET_REG8, (%LESS_VEC_REG) + L(between_0_0): + ret + +@@ -428,8 +488,8 @@ L(between_8_15): + MOVQ %XMM0, (%rdi) + MOVQ %XMM0, -8(%rdi, %rdx) + #else +- movq %rcx, (%LESS_VEC_REG) +- movq %rcx, -8(%LESS_VEC_REG, %rdx) ++ movq %SET_REG64, (%LESS_VEC_REG) ++ movq %SET_REG64, -8(%LESS_VEC_REG, %rdx) + #endif + ret + +@@ -442,8 +502,8 @@ L(between_4_7): + MOVD %XMM0, (%rdi) + MOVD %XMM0, -4(%rdi, %rdx) + #else +- movl %ecx, (%LESS_VEC_REG) +- movl %ecx, -4(%LESS_VEC_REG, %rdx) ++ movl %SET_REG32, (%LESS_VEC_REG) ++ movl %SET_REG32, -4(%LESS_VEC_REG, %rdx) + #endif + ret + +@@ -452,12 +512,12 @@ L(between_4_7): + L(between_2_3): + /* From 2 to 3. No branch when size == 2. 
*/ + #ifdef USE_XMM_LESS_VEC +- movb %sil, (%rdi) +- movb %sil, 1(%rdi) +- movb %sil, -1(%rdi, %rdx) ++ movb %SET_REG8, (%rdi) ++ movb %SET_REG8, 1(%rdi) ++ movb %SET_REG8, -1(%rdi, %rdx) + #else +- movw %cx, (%LESS_VEC_REG) +- movb %sil, -1(%LESS_VEC_REG, %rdx) ++ movw %SET_REG16, (%LESS_VEC_REG) ++ movb %SET_REG8, -1(%LESS_VEC_REG, %rdx) + #endif + ret + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +-- +GitLab + diff --git a/SOURCES/ia-opt-str-wcs_rchr-avx2.patch b/SOURCES/ia-opt-str-wcs_rchr-avx2.patch new file mode 100644 index 0000000..3bb015a --- /dev/null +++ b/SOURCES/ia-opt-str-wcs_rchr-avx2.patch @@ -0,0 +1,502 @@ +From 0566d7c3c34685183e4f17f209651b0fba646df8 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 21 Apr 2022 20:52:29 -0500 +Subject: [PATCH] x86: Optimize {str|wcs}rchr-avx2 + +The new code unrolls the main loop slightly without adding too much +overhead and minimizes the comparisons for the search CHAR. + +Geometric Mean of all benchmarks New / Old: 0.832 +See email for all results. + +Full xcheck passes on x86_64 with and without multiarch enabled. +Reviewed-by: H.J. Lu + +(cherry picked from commit df7e295d18ffa34f629578c0017a9881af7620f6) +--- + sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++--------- + 1 file changed, 269 insertions(+), 157 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S +index c949410b..3d26fad4 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S +@@ -27,9 +27,13 @@ + # ifdef USE_AS_WCSRCHR + # define VPBROADCAST vpbroadcastd + # define VPCMPEQ vpcmpeqd ++# define VPMIN vpminud ++# define CHAR_SIZE 4 + # else + # define VPBROADCAST vpbroadcastb + # define VPCMPEQ vpcmpeqb ++# define VPMIN vpminub ++# define CHAR_SIZE 1 + # endif + + # ifndef VZEROUPPER +@@ -41,196 +45,304 @@ + # endif + + # define VEC_SIZE 32 ++# define PAGE_SIZE 4096 + +- .section SECTION(.text),"ax",@progbits +-ENTRY (STRRCHR) +- movd %esi, %xmm4 +- movl %edi, %ecx ++ .section SECTION(.text), "ax", @progbits ++ENTRY(STRRCHR) ++ movd %esi, %xmm7 ++ movl %edi, %eax + /* Broadcast CHAR to YMM4. */ +- VPBROADCAST %xmm4, %ymm4 ++ VPBROADCAST %xmm7, %ymm7 + vpxor %xmm0, %xmm0, %xmm0 + +- /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ /* Shift here instead of `andl` to save code size (saves a fetch ++ block). */ ++ sall $20, %eax ++ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax ++ ja L(cross_page) + ++L(page_cross_continue): + vmovdqu (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- addq $VEC_SIZE, %rdi ++ /* Check end of string match. */ ++ VPCMPEQ %ymm1, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ testl %ecx, %ecx ++ jz L(aligned_more) ++ ++ /* Only check match with search CHAR if needed. */ ++ VPCMPEQ %ymm1, %ymm7, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Check if match before first zero. */ ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jz L(ret0) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If ++ search CHAR is zero we are correct. Either way `andq ++ -CHAR_SIZE, %rax` gets the correct result. */ ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++L(ret0): ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++ ++ /* Returns for first vec x1/x2 have hard coded backward search ++ path for earlier matches. 
*/ ++ .p2align 4,, 10 ++L(first_vec_x1): ++ VPCMPEQ %ymm2, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jnz L(first_vec_x1_return) ++ ++ .p2align 4,, 4 ++L(first_vec_x0_test): ++ VPCMPEQ %ymm1, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ testl %eax, %eax ++ jz L(ret1) ++ bsrl %eax, %eax ++ addq %r8, %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++L(ret1): ++ VZEROUPPER_RETURN + ++ .p2align 4,, 10 ++L(first_vec_x0_x1_test): ++ VPCMPEQ %ymm2, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ /* Check ymm2 for search CHAR match. If no match then check ymm1 ++ before returning. */ + testl %eax, %eax +- jnz L(first_vec) ++ jz L(first_vec_x0_test) ++ .p2align 4,, 4 ++L(first_vec_x1_return): ++ bsrl %eax, %eax ++ leaq 1(%rdi, %rax), %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++ VZEROUPPER_RETURN + +- testl %ecx, %ecx +- jnz L(return_null) + +- andq $-VEC_SIZE, %rdi +- xorl %edx, %edx +- jmp L(aligned_loop) ++ .p2align 4,, 10 ++L(first_vec_x2): ++ VPCMPEQ %ymm3, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ blsmskl %ecx, %ecx ++ /* If no in-range search CHAR match in ymm3 then need to check ++ ymm1/ymm2 for an earlier match (we delay checking search ++ CHAR matches until needed). */ ++ andl %ecx, %eax ++ jz L(first_vec_x0_x1_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE + 1)(%rdi, %rax), %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++ VZEROUPPER_RETURN ++ + + .p2align 4 +-L(first_vec): +- /* Check if there is a nul CHAR. */ ++L(aligned_more): ++ /* Save original pointer if match was in VEC 0. */ ++ movq %rdi, %r8 ++ ++ /* Align src. */ ++ orq $(VEC_SIZE - 1), %rdi ++ vmovdqu 1(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx + testl %ecx, %ecx +- jnz L(char_and_nul_in_first_vec) ++ jnz L(first_vec_x1) + +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- movq %rdi, %rsi +- andq $-VEC_SIZE, %rdi +- jmp L(aligned_loop) ++ vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3 ++ VPCMPEQ %ymm3, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x2) + ++ /* Save pointer again before realigning. */ ++ movq %rdi, %rsi ++ addq $(VEC_SIZE + 1), %rdi ++ andq $-(VEC_SIZE * 2), %rdi + .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %edx +- vpmovmskb %ymm3, %eax +- shrl %cl, %edx +- shrl %cl, %eax +- addq $VEC_SIZE, %rdi +- +- /* Check if there is a CHAR. */ ++L(first_aligned_loop): ++ /* Do 2x VEC at a time. Any more and the cost of finding the ++ match outweights loop benefit. */ ++ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 ++ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 ++ ++ VPCMPEQ %ymm4, %ymm7, %ymm6 ++ VPMIN %ymm4, %ymm5, %ymm8 ++ VPCMPEQ %ymm5, %ymm7, %ymm10 ++ vpor %ymm6, %ymm10, %ymm5 ++ VPCMPEQ %ymm8, %ymm0, %ymm8 ++ vpor %ymm5, %ymm8, %ymm9 ++ ++ vpmovmskb %ymm9, %eax ++ addq $(VEC_SIZE * 2), %rdi ++ /* No zero or search CHAR. */ + testl %eax, %eax +- jnz L(found_char) +- +- testl %edx, %edx +- jnz L(return_null) ++ jz L(first_aligned_loop) + +- jmp L(aligned_loop) +- +- .p2align 4 +-L(found_char): +- testl %edx, %edx +- jnz L(char_and_nul) ++ /* If no zero CHAR then go to second loop (this allows us to ++ throw away all prior work). */ ++ vpmovmskb %ymm8, %ecx ++ testl %ecx, %ecx ++ jz L(second_aligned_loop_prep) + +- /* Remember the match and keep searching. 
*/ +- movl %eax, %edx +- leaq (%rdi, %rcx), %rsi ++ /* Search char could be zero so we need to get the true match. ++ */ ++ vpmovmskb %ymm5, %eax ++ testl %eax, %eax ++ jnz L(first_aligned_loop_return) + +- .p2align 4 +-L(aligned_loop): +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) +- +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- add $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx ++ .p2align 4,, 4 ++L(first_vec_x1_or_x2): ++ VPCMPEQ %ymm3, %ymm7, %ymm3 ++ VPCMPEQ %ymm2, %ymm7, %ymm2 + vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) +- +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) +- +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jz L(aligned_loop) +- +- .p2align 4 +-L(char_nor_null): +- /* Find a CHAR or a nul CHAR in a loop. */ +- testl %eax, %eax +- jnz L(match) +-L(return_value): +- testl %edx, %edx +- jz L(return_null) +- movl %edx, %eax +- movq %rsi, %rdi ++ vpmovmskb %ymm2, %edx ++ /* Use add for macro-fusion. */ ++ addq %rax, %rdx ++ jz L(first_vec_x0_test) ++ /* NB: We could move this shift to before the branch and save a ++ bit of code size / performance on the fall through. The ++ branch leads to the null case which generally seems hotter ++ than char in first 3x VEC. */ ++ salq $32, %rax ++ addq %rdx, %rax ++ bsrq %rax, %rax ++ leaq 1(%rsi, %rax), %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++ VZEROUPPER_RETURN + ++ .p2align 4,, 8 ++L(first_aligned_loop_return): ++ VPCMPEQ %ymm4, %ymm0, %ymm4 ++ vpmovmskb %ymm4, %edx ++ salq $32, %rcx ++ orq %rdx, %rcx ++ ++ vpmovmskb %ymm10, %eax ++ vpmovmskb %ymm6, %edx ++ salq $32, %rax ++ orq %rdx, %rax ++ blsmskq %rcx, %rcx ++ andq %rcx, %rax ++ jz L(first_vec_x1_or_x2) ++ ++ bsrq %rax, %rax ++ leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax + # ifdef USE_AS_WCSRCHR +- /* Keep the first bit for each matching CHAR for bsr. */ +- andl $0x11111111, %eax ++ andq $-CHAR_SIZE, %rax + # endif +- bsrl %eax, %eax +- leaq -VEC_SIZE(%rdi, %rax), %rax +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ VZEROUPPER_RETURN + ++ /* Search char cannot be zero. */ + .p2align 4 +-L(match): +- /* Find a CHAR. Check if there is a nul CHAR. */ +- vpmovmskb %ymm2, %ecx +- testl %ecx, %ecx +- jnz L(find_nul) +- +- /* Remember the match and keep searching. */ +- movl %eax, %edx ++L(second_aligned_loop_set_furthest_match): ++ /* Save VEC and pointer from most recent match. */ ++L(second_aligned_loop_prep): + movq %rdi, %rsi +- jmp L(aligned_loop) ++ vmovdqu %ymm6, %ymm2 ++ vmovdqu %ymm10, %ymm3 + + .p2align 4 +-L(find_nul): +-# ifdef USE_AS_WCSRCHR +- /* Keep the first bit for each matching CHAR for bsr. */ +- andl $0x11111111, %ecx +- andl $0x11111111, %eax +-# endif +- /* Mask out any matching bits after the nul CHAR. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax ++L(second_aligned_loop): ++ /* Search 2x at at time. 
*/ ++ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 ++ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 ++ ++ VPCMPEQ %ymm4, %ymm7, %ymm6 ++ VPMIN %ymm4, %ymm5, %ymm1 ++ VPCMPEQ %ymm5, %ymm7, %ymm10 ++ vpor %ymm6, %ymm10, %ymm5 ++ VPCMPEQ %ymm1, %ymm0, %ymm1 ++ vpor %ymm5, %ymm1, %ymm9 ++ ++ vpmovmskb %ymm9, %eax ++ addq $(VEC_SIZE * 2), %rdi + testl %eax, %eax +- /* If there is no CHAR here, return the remembered one. */ +- jz L(return_value) +- bsrl %eax, %eax +- leaq -VEC_SIZE(%rdi, %rax), %rax +- VZEROUPPER_RETURN +- +- .p2align 4 +-L(char_and_nul): +- /* Find both a CHAR and a nul CHAR. */ +- addq %rcx, %rdi +- movl %edx, %ecx +-L(char_and_nul_in_first_vec): +-# ifdef USE_AS_WCSRCHR +- /* Keep the first bit for each matching CHAR for bsr. */ +- andl $0x11111111, %ecx +- andl $0x11111111, %eax +-# endif +- /* Mask out any matching bits after the nul CHAR. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax ++ jz L(second_aligned_loop) ++ vpmovmskb %ymm1, %ecx ++ testl %ecx, %ecx ++ jz L(second_aligned_loop_set_furthest_match) ++ vpmovmskb %ymm5, %eax + testl %eax, %eax +- /* Return null pointer if the nul CHAR comes first. */ +- jz L(return_null) +- bsrl %eax, %eax +- leaq -VEC_SIZE(%rdi, %rax), %rax ++ jnz L(return_new_match) ++ ++ /* This is the hot patch. We know CHAR is inbounds and that ++ ymm3/ymm2 have latest match. */ ++ .p2align 4,, 4 ++L(return_old_match): ++ vpmovmskb %ymm3, %eax ++ vpmovmskb %ymm2, %edx ++ salq $32, %rax ++ orq %rdx, %rax ++ bsrq %rax, %rax ++ /* Search char cannot be zero so safe to just use lea for ++ wcsrchr. */ ++ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax + VZEROUPPER_RETURN + +- .p2align 4 +-L(return_null): +- xorl %eax, %eax ++ /* Last iteration also potentially has a match. */ ++ .p2align 4,, 8 ++L(return_new_match): ++ VPCMPEQ %ymm4, %ymm0, %ymm4 ++ vpmovmskb %ymm4, %edx ++ salq $32, %rcx ++ orq %rdx, %rcx ++ ++ vpmovmskb %ymm10, %eax ++ vpmovmskb %ymm6, %edx ++ salq $32, %rax ++ orq %rdx, %rax ++ blsmskq %rcx, %rcx ++ andq %rcx, %rax ++ jz L(return_old_match) ++ bsrq %rax, %rax ++ /* Search char cannot be zero so safe to just use lea for ++ wcsrchr. */ ++ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax + VZEROUPPER_RETURN + +-END (STRRCHR) ++ .p2align 4,, 4 ++L(cross_page): ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rsi ++ vmovdqu (%rsi), %ymm1 ++ VPCMPEQ %ymm1, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ /* Shift out zero CHAR matches that are before the begining of ++ src (rdi). */ ++ shrxl %edi, %ecx, %ecx ++ testl %ecx, %ecx ++ jz L(page_cross_continue) ++ VPCMPEQ %ymm1, %ymm7, %ymm1 ++ vpmovmskb %ymm1, %eax ++ ++ /* Shift out search CHAR matches that are before the begining of ++ src (rdi). */ ++ shrxl %edi, %eax, %eax ++ blsmskl %ecx, %ecx ++ /* Check if any search CHAR match in range. */ ++ andl %ecx, %eax ++ jz L(ret2) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++L(ret2): ++ VZEROUPPER_RETURN ++END(STRRCHR) + #endif +-- +GitLab + diff --git a/SOURCES/ia-opt-str-wcs_rchr-evex.patch b/SOURCES/ia-opt-str-wcs_rchr-evex.patch new file mode 100644 index 0000000..e130e3e --- /dev/null +++ b/SOURCES/ia-opt-str-wcs_rchr-evex.patch @@ -0,0 +1,559 @@ +From 9ef733cbe224b1cc12e4c8acac09627ccb3a00d8 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 21 Apr 2022 20:52:30 -0500 +Subject: [PATCH] x86: Optimize {str|wcs}rchr-evex + +The new code unrolls the main loop slightly without adding too much +overhead and minimizes the comparisons for the search CHAR. 
+ +Geometric Mean of all benchmarks New / Old: 0.755 +See email for all results. + +Full xcheck passes on x86_64 with and without multiarch enabled. +Reviewed-by: H.J. Lu + +(cherry picked from commit c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d) +--- + sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++--------- + 1 file changed, 290 insertions(+), 181 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S +index f920b5a5..f5b6d755 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-evex.S ++++ b/sysdeps/x86_64/multiarch/strrchr-evex.S +@@ -24,242 +24,351 @@ + # define STRRCHR __strrchr_evex + # endif + +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSRCHR ++# define SHIFT_REG esi ++ ++# define kunpck kunpckbw ++# define kmov_2x kmovd ++# define maskz_2x ecx ++# define maskm_2x eax ++# define CHAR_SIZE 4 ++# define VPMIN vpminud ++# define VPTESTN vptestnmd + # define VPBROADCAST vpbroadcastd +-# define VPCMP vpcmpd +-# define SHIFT_REG r8d ++# define VPCMP vpcmpd + # else ++# define SHIFT_REG edi ++ ++# define kunpck kunpckdq ++# define kmov_2x kmovq ++# define maskz_2x rcx ++# define maskm_2x rax ++ ++# define CHAR_SIZE 1 ++# define VPMIN vpminub ++# define VPTESTN vptestnmb + # define VPBROADCAST vpbroadcastb +-# define VPCMP vpcmpb +-# define SHIFT_REG ecx ++# define VPCMP vpcmpb + # endif + + # define XMMZERO xmm16 + # define YMMZERO ymm16 + # define YMMMATCH ymm17 +-# define YMM1 ymm18 ++# define YMMSAVE ymm18 ++ ++# define YMM1 ymm19 ++# define YMM2 ymm20 ++# define YMM3 ymm21 ++# define YMM4 ymm22 ++# define YMM5 ymm23 ++# define YMM6 ymm24 ++# define YMM7 ymm25 ++# define YMM8 ymm26 + +-# define VEC_SIZE 32 + +- .section .text.evex,"ax",@progbits +-ENTRY (STRRCHR) +- movl %edi, %ecx ++# define VEC_SIZE 32 ++# define PAGE_SIZE 4096 ++ .section .text.evex, "ax", @progbits ++ENTRY(STRRCHR) ++ movl %edi, %eax + /* Broadcast CHAR to YMMMATCH. */ + VPBROADCAST %esi, %YMMMATCH + +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO +- +- /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(cross_page_boundary) + ++L(page_cross_continue): + VMOVU (%rdi), %YMM1 +- +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ /* k0 has a 1 for each zero CHAR in YMM1. */ ++ VPTESTN %YMM1, %YMM1, %k0 + kmovd %k0, %ecx +- kmovd %k1, %eax +- +- addq $VEC_SIZE, %rdi +- +- testl %eax, %eax +- jnz L(first_vec) +- + testl %ecx, %ecx +- jnz L(return_null) +- +- andq $-VEC_SIZE, %rdi +- xorl %edx, %edx +- jmp L(aligned_loop) +- +- .p2align 4 +-L(first_vec): +- /* Check if there is a null byte. */ +- testl %ecx, %ecx +- jnz L(char_and_nul_in_first_vec) +- +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- movq %rdi, %rsi +- andq $-VEC_SIZE, %rdi +- jmp L(aligned_loop) +- +- .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi ++ jz L(aligned_more) ++ /* fallthrough: zero CHAR in first VEC. */ + ++ /* K1 has a 1 for each search CHAR match in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k1, %eax ++ /* Build mask up until first zero CHAR (used to mask of ++ potential search CHAR matches past the end of the string). 
++ */ ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jz L(ret0) ++ /* Get last match (the `andl` removed any out of bounds ++ matches). */ ++ bsrl %eax, %eax + # ifdef USE_AS_WCSRCHR +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 +- bytes. */ +- movl %ecx, %SHIFT_REG +- sarl $2, %SHIFT_REG ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++# else ++ addq %rdi, %rax + # endif ++L(ret0): ++ ret + +- VMOVA (%rdi), %YMM1 +- +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ ++ /* Returns for first vec x1/x2/x3 have hard coded backward ++ search path for earlier matches. */ ++ .p2align 4,, 6 ++L(first_vec_x1): ++ VPCMP $0, %YMMMATCH, %YMM2, %k1 ++ kmovd %k1, %eax ++ blsmskl %ecx, %ecx ++ /* eax non-zero if search CHAR in range. */ ++ andl %ecx, %eax ++ jnz L(first_vec_x1_return) ++ ++ /* fallthrough: no match in YMM2 then need to check for earlier ++ matches (in YMM1). */ ++ .p2align 4,, 4 ++L(first_vec_x0_test): + VPCMP $0, %YMMMATCH, %YMM1, %k1 +- kmovd %k0, %edx + kmovd %k1, %eax +- +- shrxl %SHIFT_REG, %edx, %edx +- shrxl %SHIFT_REG, %eax, %eax +- addq $VEC_SIZE, %rdi +- +- /* Check if there is a CHAR. */ + testl %eax, %eax +- jnz L(found_char) +- +- testl %edx, %edx +- jnz L(return_null) +- +- jmp L(aligned_loop) +- +- .p2align 4 +-L(found_char): +- testl %edx, %edx +- jnz L(char_and_nul) +- +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- leaq (%rdi, %rcx), %rsi ++ jz L(ret1) ++ bsrl %eax, %eax ++# ifdef USE_AS_WCSRCHR ++ leaq (%rsi, %rax, CHAR_SIZE), %rax ++# else ++ addq %rsi, %rax ++# endif ++L(ret1): ++ ret + +- .p2align 4 +-L(aligned_loop): +- VMOVA (%rdi), %YMM1 +- addq $VEC_SIZE, %rdi ++ .p2align 4,, 10 ++L(first_vec_x1_or_x2): ++ VPCMP $0, %YMM3, %YMMMATCH, %k3 ++ VPCMP $0, %YMM2, %YMMMATCH, %k2 ++ /* K2 and K3 have 1 for any search CHAR match. Test if any ++ matches between either of them. Otherwise check YMM1. */ ++ kortestd %k2, %k3 ++ jz L(first_vec_x0_test) ++ ++ /* Guranteed that YMM2 and YMM3 are within range so merge the ++ two bitmasks then get last result. */ ++ kunpck %k2, %k3, %k3 ++ kmovq %k3, %rax ++ bsrq %rax, %rax ++ leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax ++ ret + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 +- kmovd %k0, %ecx ++ .p2align 4,, 6 ++L(first_vec_x3): ++ VPCMP $0, %YMMMATCH, %YMM4, %k1 + kmovd %k1, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) ++ blsmskl %ecx, %ecx ++ /* If no search CHAR match in range check YMM1/YMM2/YMM3. */ ++ andl %ecx, %eax ++ jz L(first_vec_x1_or_x2) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- VMOVA (%rdi), %YMM1 +- add $VEC_SIZE, %rdi ++ .p2align 4,, 6 ++L(first_vec_x0_x1_test): ++ VPCMP $0, %YMMMATCH, %YMM2, %k1 ++ kmovd %k1, %eax ++ /* Check YMM2 for last match first. If no match try YMM1. */ ++ testl %eax, %eax ++ jz L(first_vec_x0_test) ++ .p2align 4,, 4 ++L(first_vec_x1_return): ++ bsrl %eax, %eax ++ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 +- kmovd %k0, %ecx ++ .p2align 4,, 10 ++L(first_vec_x2): ++ VPCMP $0, %YMMMATCH, %YMM3, %k1 + kmovd %k1, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) ++ blsmskl %ecx, %ecx ++ /* Check YMM3 for last match first. 
If no match try YMM2/YMM1. ++ */ ++ andl %ecx, %eax ++ jz L(first_vec_x0_x1_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- VMOVA (%rdi), %YMM1 +- addq $VEC_SIZE, %rdi + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ .p2align 4 ++L(aligned_more): ++ /* Need to keep original pointer incase YMM1 has last match. */ ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rdi ++ VMOVU VEC_SIZE(%rdi), %YMM2 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %ecx +- kmovd %k1, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) ++ testl %ecx, %ecx ++ jnz L(first_vec_x1) + +- VMOVA (%rdi), %YMM1 +- addq $VEC_SIZE, %rdi ++ VMOVU (VEC_SIZE * 2)(%rdi), %YMM3 ++ VPTESTN %YMM3, %YMM3, %k0 ++ kmovd %k0, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x2) + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ VMOVU (VEC_SIZE * 3)(%rdi), %YMM4 ++ VPTESTN %YMM4, %YMM4, %k0 + kmovd %k0, %ecx +- kmovd %k1, %eax +- orl %eax, %ecx +- jz L(aligned_loop) ++ movq %rdi, %r8 ++ testl %ecx, %ecx ++ jnz L(first_vec_x3) + ++ andq $-(VEC_SIZE * 2), %rdi + .p2align 4 +-L(char_nor_null): +- /* Find a CHAR or a null byte in a loop. */ ++L(first_aligned_loop): ++ /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee ++ they don't store a match. */ ++ VMOVA (VEC_SIZE * 4)(%rdi), %YMM5 ++ VMOVA (VEC_SIZE * 5)(%rdi), %YMM6 ++ ++ VPCMP $0, %YMM5, %YMMMATCH, %k2 ++ vpxord %YMM6, %YMMMATCH, %YMM7 ++ ++ VPMIN %YMM5, %YMM6, %YMM8 ++ VPMIN %YMM8, %YMM7, %YMM7 ++ ++ VPTESTN %YMM7, %YMM7, %k1 ++ subq $(VEC_SIZE * -2), %rdi ++ kortestd %k1, %k2 ++ jz L(first_aligned_loop) ++ ++ VPCMP $0, %YMM6, %YMMMATCH, %k3 ++ VPTESTN %YMM8, %YMM8, %k1 ++ ktestd %k1, %k1 ++ jz L(second_aligned_loop_prep) ++ ++ kortestd %k2, %k3 ++ jnz L(return_first_aligned_loop) ++ ++ .p2align 4,, 6 ++L(first_vec_x1_or_x2_or_x3): ++ VPCMP $0, %YMM4, %YMMMATCH, %k4 ++ kmovd %k4, %eax + testl %eax, %eax +- jnz L(match) +-L(return_value): +- testl %edx, %edx +- jz L(return_null) +- movl %edx, %eax +- movq %rsi, %rdi ++ jz L(first_vec_x1_or_x2) + bsrl %eax, %eax +-# ifdef USE_AS_WCSRCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq -VEC_SIZE(%rdi, %rax, 4), %rax +-# else +- leaq -VEC_SIZE(%rdi, %rax), %rax +-# endif ++ leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax + ret + +- .p2align 4 +-L(match): +- /* Find a CHAR. Check if there is a null byte. */ +- kmovd %k0, %ecx +- testl %ecx, %ecx +- jnz L(find_nul) ++ .p2align 4,, 8 ++L(return_first_aligned_loop): ++ VPTESTN %YMM5, %YMM5, %k0 ++ kunpck %k0, %k1, %k0 ++ kmov_2x %k0, %maskz_2x ++ ++ blsmsk %maskz_2x, %maskz_2x ++ kunpck %k2, %k3, %k3 ++ kmov_2x %k3, %maskm_2x ++ and %maskz_2x, %maskm_2x ++ jz L(first_vec_x1_or_x2_or_x3) + +- /* Remember the match and keep searching. */ +- movl %eax, %edx ++ bsr %maskm_2x, %maskm_2x ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ ++ .p2align 4 ++ /* We can throw away the work done for the first 4x checks here ++ as we have a later match. This is the 'fast' path persay. ++ */ ++L(second_aligned_loop_prep): ++L(second_aligned_loop_set_furthest_match): + movq %rdi, %rsi +- jmp L(aligned_loop) ++ kunpck %k2, %k3, %k4 + + .p2align 4 +-L(find_nul): +- /* Mask out any matching bits after the null byte. 
*/ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax +- testl %eax, %eax +- /* If there is no CHAR here, return the remembered one. */ +- jz L(return_value) +- bsrl %eax, %eax ++L(second_aligned_loop): ++ VMOVU (VEC_SIZE * 4)(%rdi), %YMM1 ++ VMOVU (VEC_SIZE * 5)(%rdi), %YMM2 ++ ++ VPCMP $0, %YMM1, %YMMMATCH, %k2 ++ vpxord %YMM2, %YMMMATCH, %YMM3 ++ ++ VPMIN %YMM1, %YMM2, %YMM4 ++ VPMIN %YMM3, %YMM4, %YMM3 ++ ++ VPTESTN %YMM3, %YMM3, %k1 ++ subq $(VEC_SIZE * -2), %rdi ++ kortestd %k1, %k2 ++ jz L(second_aligned_loop) ++ ++ VPCMP $0, %YMM2, %YMMMATCH, %k3 ++ VPTESTN %YMM4, %YMM4, %k1 ++ ktestd %k1, %k1 ++ jz L(second_aligned_loop_set_furthest_match) ++ ++ kortestd %k2, %k3 ++ /* branch here because there is a significant advantage interms ++ of output dependency chance in using edx. */ ++ jnz L(return_new_match) ++L(return_old_match): ++ kmovq %k4, %rax ++ bsrq %rax, %rax ++ leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax ++ ret ++ ++L(return_new_match): ++ VPTESTN %YMM1, %YMM1, %k0 ++ kunpck %k0, %k1, %k0 ++ kmov_2x %k0, %maskz_2x ++ ++ blsmsk %maskz_2x, %maskz_2x ++ kunpck %k2, %k3, %k3 ++ kmov_2x %k3, %maskm_2x ++ and %maskz_2x, %maskm_2x ++ jz L(return_old_match) ++ ++ bsr %maskm_2x, %maskm_2x ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ ++L(cross_page_boundary): ++ /* eax contains all the page offset bits of src (rdi). `xor rdi, ++ rax` sets pointer will all page offset bits cleared so ++ offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC ++ before page cross (guranteed to be safe to read). Doing this ++ as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves ++ a bit of code size. */ ++ xorq %rdi, %rax ++ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1 ++ VPTESTN %YMM1, %YMM1, %k0 ++ kmovd %k0, %ecx ++ ++ /* Shift out zero CHAR matches that are before the begining of ++ src (rdi). */ + # ifdef USE_AS_WCSRCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq -VEC_SIZE(%rdi, %rax, 4), %rax +-# else +- leaq -VEC_SIZE(%rdi, %rax), %rax ++ movl %edi, %esi ++ andl $(VEC_SIZE - 1), %esi ++ shrl $2, %esi + # endif +- ret ++ shrxl %SHIFT_REG, %ecx, %ecx + +- .p2align 4 +-L(char_and_nul): +- /* Find both a CHAR and a null byte. */ +- addq %rcx, %rdi +- movl %edx, %ecx +-L(char_and_nul_in_first_vec): +- /* Mask out any matching bits after the null byte. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax +- testl %eax, %eax +- /* Return null pointer if the null byte comes first. */ +- jz L(return_null) ++ testl %ecx, %ecx ++ jz L(page_cross_continue) ++ ++ /* Found zero CHAR so need to test for search CHAR. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k1, %eax ++ /* Shift out search CHAR matches that are before the begining of ++ src (rdi). */ ++ shrxl %SHIFT_REG, %eax, %eax ++ ++ /* Check if any search CHAR match in range. */ ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jz L(ret3) + bsrl %eax, %eax + # ifdef USE_AS_WCSRCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- leaq -VEC_SIZE(%rdi, %rax, 4), %rax ++ leaq (%rdi, %rax, CHAR_SIZE), %rax + # else +- leaq -VEC_SIZE(%rdi, %rax), %rax ++ addq %rdi, %rax + # endif ++L(ret3): + ret + +- .p2align 4 +-L(return_null): +- xorl %eax, %eax +- ret +- +-END (STRRCHR) ++END(STRRCHR) + #endif +-- +GitLab + diff --git a/SOURCES/ia-opt-str-wcs_rchr-sse2.patch b/SOURCES/ia-opt-str-wcs_rchr-sse2.patch new file mode 100644 index 0000000..054edd6 --- /dev/null +++ b/SOURCES/ia-opt-str-wcs_rchr-sse2.patch @@ -0,0 +1,873 @@ +From 70016c060a99e8534469cdeb847eabe60bff2b54 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 21 Apr 2022 20:52:28 -0500 +Subject: [PATCH] x86: Optimize {str|wcs}rchr-sse2 + +The new code unrolls the main loop slightly without adding too much +overhead and minimizes the comparisons for the search CHAR. + +Geometric Mean of all benchmarks New / Old: 0.741 +See email for all results. + +Full xcheck passes on x86_64 with and without multiarch enabled. +Reviewed-by: H.J. Lu + +(cherry picked from commit 5307aa9c1800f36a64c183c091c9af392c1fa75c) +--- + sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +- + sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +- + sysdeps/x86_64/strrchr.S | 510 +++++++++++++++--------- + sysdeps/x86_64/wcsrchr.S | 266 +----------- + 4 files changed, 338 insertions(+), 443 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S +index 0ec76fe9..6bb1284b 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S ++++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S +@@ -17,7 +17,7 @@ + . */ + + #if IS_IN (libc) +-# define strrchr __strrchr_sse2 ++# define STRRCHR __strrchr_sse2 + + # undef weak_alias + # define weak_alias(strrchr, rindex) +diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S +index d015e953..f26d53b5 100644 +--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S ++++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S +@@ -17,7 +17,6 @@ + . 
*/ + + #if IS_IN (libc) +-# define wcsrchr __wcsrchr_sse2 ++# define STRRCHR __wcsrchr_sse2 + #endif +- + #include "../wcsrchr.S" +diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S +index aca98e7e..a58cc220 100644 +--- a/sysdeps/x86_64/strrchr.S ++++ b/sysdeps/x86_64/strrchr.S +@@ -19,210 +19,360 @@ + + #include + ++#ifndef STRRCHR ++# define STRRCHR strrchr ++#endif ++ ++#ifdef USE_AS_WCSRCHR ++# define PCMPEQ pcmpeqd ++# define CHAR_SIZE 4 ++# define PMINU pminud ++#else ++# define PCMPEQ pcmpeqb ++# define CHAR_SIZE 1 ++# define PMINU pminub ++#endif ++ ++#define PAGE_SIZE 4096 ++#define VEC_SIZE 16 ++ + .text +-ENTRY (strrchr) +- movd %esi, %xmm1 ++ENTRY(STRRCHR) ++ movd %esi, %xmm0 + movq %rdi, %rax +- andl $4095, %eax +- punpcklbw %xmm1, %xmm1 +- cmpq $4032, %rax +- punpcklwd %xmm1, %xmm1 +- pshufd $0, %xmm1, %xmm1 ++ andl $(PAGE_SIZE - 1), %eax ++#ifndef USE_AS_WCSRCHR ++ punpcklbw %xmm0, %xmm0 ++ punpcklwd %xmm0, %xmm0 ++#endif ++ pshufd $0, %xmm0, %xmm0 ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page) +- movdqu (%rdi), %xmm0 ++ ++L(cross_page_continue): ++ movups (%rdi), %xmm1 + pxor %xmm2, %xmm2 +- movdqa %xmm0, %xmm3 +- pcmpeqb %xmm1, %xmm0 +- pcmpeqb %xmm2, %xmm3 +- pmovmskb %xmm0, %ecx +- pmovmskb %xmm3, %edx +- testq %rdx, %rdx +- je L(next_48_bytes) +- leaq -1(%rdx), %rax +- xorq %rdx, %rax +- andq %rcx, %rax +- je L(exit) +- bsrq %rax, %rax ++ PCMPEQ %xmm1, %xmm2 ++ pmovmskb %xmm2, %ecx ++ testl %ecx, %ecx ++ jz L(aligned_more) ++ ++ PCMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ leal -1(%rcx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(ret0) ++ bsrl %eax, %eax + addq %rdi, %rax ++ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If ++ search CHAR is zero we are correct. Either way `andq ++ -CHAR_SIZE, %rax` gets the correct result. */ ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++L(ret0): + ret + ++ /* Returns for first vec x1/x2 have hard coded backward search ++ path for earlier matches. 
*/ + .p2align 4 +-L(next_48_bytes): +- movdqu 16(%rdi), %xmm4 +- movdqa %xmm4, %xmm5 +- movdqu 32(%rdi), %xmm3 +- pcmpeqb %xmm1, %xmm4 +- pcmpeqb %xmm2, %xmm5 +- movdqu 48(%rdi), %xmm0 +- pmovmskb %xmm5, %edx +- movdqa %xmm3, %xmm5 +- pcmpeqb %xmm1, %xmm3 +- pcmpeqb %xmm2, %xmm5 +- pcmpeqb %xmm0, %xmm2 +- salq $16, %rdx +- pmovmskb %xmm3, %r8d +- pmovmskb %xmm5, %eax +- pmovmskb %xmm2, %esi +- salq $32, %r8 +- salq $32, %rax +- pcmpeqb %xmm1, %xmm0 +- orq %rdx, %rax +- movq %rsi, %rdx +- pmovmskb %xmm4, %esi +- salq $48, %rdx +- salq $16, %rsi +- orq %r8, %rsi +- orq %rcx, %rsi +- pmovmskb %xmm0, %ecx +- salq $48, %rcx +- orq %rcx, %rsi +- orq %rdx, %rax +- je L(loop_header2) +- leaq -1(%rax), %rcx +- xorq %rax, %rcx +- andq %rcx, %rsi +- je L(exit) +- bsrq %rsi, %rsi +- leaq (%rdi,%rsi), %rax ++L(first_vec_x0_test): ++ PCMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ testl %eax, %eax ++ jz L(ret0) ++ bsrl %eax, %eax ++ addq %r8, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif + ret + + .p2align 4 +-L(loop_header2): +- testq %rsi, %rsi +- movq %rdi, %rcx +- je L(no_c_found) +-L(loop_header): +- addq $64, %rdi +- pxor %xmm7, %xmm7 +- andq $-64, %rdi +- jmp L(loop_entry) ++L(first_vec_x1): ++ PCMPEQ %xmm0, %xmm2 ++ pmovmskb %xmm2, %eax ++ leal -1(%rcx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(first_vec_x0_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE)(%rdi, %rax), %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret + + .p2align 4 +-L(loop64): +- testq %rdx, %rdx +- cmovne %rdx, %rsi +- cmovne %rdi, %rcx +- addq $64, %rdi +-L(loop_entry): +- movdqa 32(%rdi), %xmm3 +- pxor %xmm6, %xmm6 +- movdqa 48(%rdi), %xmm2 +- movdqa %xmm3, %xmm0 +- movdqa 16(%rdi), %xmm4 +- pminub %xmm2, %xmm0 +- movdqa (%rdi), %xmm5 +- pminub %xmm4, %xmm0 +- pminub %xmm5, %xmm0 +- pcmpeqb %xmm7, %xmm0 +- pmovmskb %xmm0, %eax +- movdqa %xmm5, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- pmovmskb %xmm0, %r9d +- movdqa %xmm4, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- pmovmskb %xmm0, %edx +- movdqa %xmm3, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- salq $16, %rdx +- pmovmskb %xmm0, %r10d +- movdqa %xmm2, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- salq $32, %r10 +- orq %r10, %rdx +- pmovmskb %xmm0, %r8d +- orq %r9, %rdx +- salq $48, %r8 +- orq %r8, %rdx ++L(first_vec_x1_test): ++ PCMPEQ %xmm0, %xmm2 ++ pmovmskb %xmm2, %eax + testl %eax, %eax +- je L(loop64) +- pcmpeqb %xmm6, %xmm4 +- pcmpeqb %xmm6, %xmm3 +- pcmpeqb %xmm6, %xmm5 +- pmovmskb %xmm4, %eax +- pmovmskb %xmm3, %r10d +- pcmpeqb %xmm6, %xmm2 +- pmovmskb %xmm5, %r9d +- salq $32, %r10 +- salq $16, %rax +- pmovmskb %xmm2, %r8d +- orq %r10, %rax +- orq %r9, %rax +- salq $48, %r8 +- orq %r8, %rax +- leaq -1(%rax), %r8 +- xorq %rax, %r8 +- andq %r8, %rdx +- cmovne %rdi, %rcx +- cmovne %rdx, %rsi +- bsrq %rsi, %rsi +- leaq (%rcx,%rsi), %rax ++ jz L(first_vec_x0_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE)(%rdi, %rax), %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ PCMPEQ %xmm0, %xmm3 ++ pmovmskb %xmm3, %eax ++ leal -1(%rcx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(first_vec_x1_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4 ++L(aligned_more): ++ /* Save original pointer if match was in VEC 0. 
*/ ++ movq %rdi, %r8 ++ andq $-VEC_SIZE, %rdi ++ ++ movaps VEC_SIZE(%rdi), %xmm2 ++ pxor %xmm3, %xmm3 ++ PCMPEQ %xmm2, %xmm3 ++ pmovmskb %xmm3, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x1) ++ ++ movaps (VEC_SIZE * 2)(%rdi), %xmm3 ++ pxor %xmm4, %xmm4 ++ PCMPEQ %xmm3, %xmm4 ++ pmovmskb %xmm4, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x2) ++ ++ addq $VEC_SIZE, %rdi ++ /* Save pointer again before realigning. */ ++ movq %rdi, %rsi ++ andq $-(VEC_SIZE * 2), %rdi ++ .p2align 4 ++L(first_loop): ++ /* Do 2x VEC at a time. */ ++ movaps (VEC_SIZE * 2)(%rdi), %xmm4 ++ movaps (VEC_SIZE * 3)(%rdi), %xmm5 ++ /* Since SSE2 no pminud so wcsrchr needs seperate logic for ++ detecting zero. Note if this is found to be a bottleneck it ++ may be worth adding an SSE4.1 wcsrchr implementation. */ ++#ifdef USE_AS_WCSRCHR ++ movaps %xmm5, %xmm6 ++ pxor %xmm8, %xmm8 ++ ++ PCMPEQ %xmm8, %xmm5 ++ PCMPEQ %xmm4, %xmm8 ++ por %xmm5, %xmm8 ++#else ++ movaps %xmm5, %xmm6 ++ PMINU %xmm4, %xmm5 ++#endif ++ ++ movaps %xmm4, %xmm9 ++ PCMPEQ %xmm0, %xmm4 ++ PCMPEQ %xmm0, %xmm6 ++ movaps %xmm6, %xmm7 ++ por %xmm4, %xmm6 ++#ifndef USE_AS_WCSRCHR ++ pxor %xmm8, %xmm8 ++ PCMPEQ %xmm5, %xmm8 ++#endif ++ pmovmskb %xmm8, %ecx ++ pmovmskb %xmm6, %eax ++ ++ addq $(VEC_SIZE * 2), %rdi ++ /* Use `addl` 1) so we can undo it with `subl` and 2) it can ++ macro-fuse with `jz`. */ ++ addl %ecx, %eax ++ jz L(first_loop) ++ ++ /* Check if there is zero match. */ ++ testl %ecx, %ecx ++ jz L(second_loop_match) ++ ++ /* Check if there was a match in last iteration. */ ++ subl %ecx, %eax ++ jnz L(new_match) ++ ++L(first_loop_old_match): ++ PCMPEQ %xmm0, %xmm2 ++ PCMPEQ %xmm0, %xmm3 ++ pmovmskb %xmm2, %ecx ++ pmovmskb %xmm3, %eax ++ addl %eax, %ecx ++ jz L(first_vec_x0_test) ++ /* NB: We could move this shift to before the branch and save a ++ bit of code size / performance on the fall through. The ++ branch leads to the null case which generally seems hotter ++ than char in first 3x VEC. */ ++ sall $16, %eax ++ orl %ecx, %eax ++ ++ bsrl %eax, %eax ++ addq %rsi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4 ++L(new_match): ++ pxor %xmm6, %xmm6 ++ PCMPEQ %xmm9, %xmm6 ++ pmovmskb %xmm6, %eax ++ sall $16, %ecx ++ orl %eax, %ecx ++ ++ /* We can't reuse either of the old comparisons as since we mask ++ of zeros after first zero (instead of using the full ++ comparison) we can't gurantee no interference between match ++ after end of string and valid match. */ ++ pmovmskb %xmm4, %eax ++ pmovmskb %xmm7, %edx ++ sall $16, %edx ++ orl %edx, %eax ++ ++ leal -1(%ecx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(first_loop_old_match) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif + ret + ++ /* Save minimum state for getting most recent match. We can ++ throw out all previous work. */ + .p2align 4 +-L(no_c_found): +- movl $1, %esi +- xorl %ecx, %ecx +- jmp L(loop_header) ++L(second_loop_match): ++ movq %rdi, %rsi ++ movaps %xmm4, %xmm2 ++ movaps %xmm7, %xmm3 + + .p2align 4 +-L(exit): +- xorl %eax, %eax ++L(second_loop): ++ movaps (VEC_SIZE * 2)(%rdi), %xmm4 ++ movaps (VEC_SIZE * 3)(%rdi), %xmm5 ++ /* Since SSE2 no pminud so wcsrchr needs seperate logic for ++ detecting zero. Note if this is found to be a bottleneck it ++ may be worth adding an SSE4.1 wcsrchr implementation. 
*/ ++#ifdef USE_AS_WCSRCHR ++ movaps %xmm5, %xmm6 ++ pxor %xmm8, %xmm8 ++ ++ PCMPEQ %xmm8, %xmm5 ++ PCMPEQ %xmm4, %xmm8 ++ por %xmm5, %xmm8 ++#else ++ movaps %xmm5, %xmm6 ++ PMINU %xmm4, %xmm5 ++#endif ++ ++ movaps %xmm4, %xmm9 ++ PCMPEQ %xmm0, %xmm4 ++ PCMPEQ %xmm0, %xmm6 ++ movaps %xmm6, %xmm7 ++ por %xmm4, %xmm6 ++#ifndef USE_AS_WCSRCHR ++ pxor %xmm8, %xmm8 ++ PCMPEQ %xmm5, %xmm8 ++#endif ++ ++ pmovmskb %xmm8, %ecx ++ pmovmskb %xmm6, %eax ++ ++ addq $(VEC_SIZE * 2), %rdi ++ /* Either null term or new occurence of CHAR. */ ++ addl %ecx, %eax ++ jz L(second_loop) ++ ++ /* No null term so much be new occurence of CHAR. */ ++ testl %ecx, %ecx ++ jz L(second_loop_match) ++ ++ ++ subl %ecx, %eax ++ jnz L(second_loop_new_match) ++ ++L(second_loop_old_match): ++ pmovmskb %xmm2, %ecx ++ pmovmskb %xmm3, %eax ++ sall $16, %eax ++ orl %ecx, %eax ++ bsrl %eax, %eax ++ addq %rsi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif + ret + + .p2align 4 ++L(second_loop_new_match): ++ pxor %xmm6, %xmm6 ++ PCMPEQ %xmm9, %xmm6 ++ pmovmskb %xmm6, %eax ++ sall $16, %ecx ++ orl %eax, %ecx ++ ++ /* We can't reuse either of the old comparisons as since we mask ++ of zeros after first zero (instead of using the full ++ comparison) we can't gurantee no interference between match ++ after end of string and valid match. */ ++ pmovmskb %xmm4, %eax ++ pmovmskb %xmm7, %edx ++ sall $16, %edx ++ orl %edx, %eax ++ ++ leal -1(%ecx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(second_loop_old_match) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4,, 4 + L(cross_page): +- movq %rdi, %rax +- pxor %xmm0, %xmm0 +- andq $-64, %rax +- movdqu (%rax), %xmm5 +- movdqa %xmm5, %xmm6 +- movdqu 16(%rax), %xmm4 +- pcmpeqb %xmm1, %xmm5 +- pcmpeqb %xmm0, %xmm6 +- movdqu 32(%rax), %xmm3 +- pmovmskb %xmm6, %esi +- movdqa %xmm4, %xmm6 +- movdqu 48(%rax), %xmm2 +- pcmpeqb %xmm1, %xmm4 +- pcmpeqb %xmm0, %xmm6 +- pmovmskb %xmm6, %edx +- movdqa %xmm3, %xmm6 +- pcmpeqb %xmm1, %xmm3 +- pcmpeqb %xmm0, %xmm6 +- pcmpeqb %xmm2, %xmm0 +- salq $16, %rdx +- pmovmskb %xmm3, %r9d +- pmovmskb %xmm6, %r8d +- pmovmskb %xmm0, %ecx +- salq $32, %r9 +- salq $32, %r8 +- pcmpeqb %xmm1, %xmm2 +- orq %r8, %rdx +- salq $48, %rcx +- pmovmskb %xmm5, %r8d +- orq %rsi, %rdx +- pmovmskb %xmm4, %esi +- orq %rcx, %rdx +- pmovmskb %xmm2, %ecx +- salq $16, %rsi +- salq $48, %rcx +- orq %r9, %rsi +- orq %r8, %rsi +- orq %rcx, %rsi ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rsi ++ movaps (%rsi), %xmm1 ++ pxor %xmm2, %xmm2 ++ PCMPEQ %xmm1, %xmm2 ++ pmovmskb %xmm2, %edx + movl %edi, %ecx +- subl %eax, %ecx +- shrq %cl, %rdx +- shrq %cl, %rsi +- testq %rdx, %rdx +- je L(loop_header2) +- leaq -1(%rdx), %rax +- xorq %rdx, %rax +- andq %rax, %rsi +- je L(exit) +- bsrq %rsi, %rax ++ andl $(VEC_SIZE - 1), %ecx ++ sarl %cl, %edx ++ jz L(cross_page_continue) ++ PCMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ sarl %cl, %eax ++ leal -1(%rdx), %ecx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(ret1) ++ bsrl %eax, %eax + addq %rdi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++L(ret1): + ret +-END (strrchr) ++END(STRRCHR) + +-weak_alias (strrchr, rindex) +-libc_hidden_builtin_def (strrchr) ++#ifndef USE_AS_WCSRCHR ++ weak_alias (STRRCHR, rindex) ++ libc_hidden_builtin_def (STRRCHR) ++#endif +diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S +index 2f388537..ae3cfa7d 100644 +--- a/sysdeps/x86_64/wcsrchr.S ++++ b/sysdeps/x86_64/wcsrchr.S +@@ -17,266 +17,12 @@ + License 
along with the GNU C Library; if not, see + . */ + +-#include + +- .text +-ENTRY (wcsrchr) ++#define USE_AS_WCSRCHR 1 ++#define NO_PMINU 1 + +- movd %rsi, %xmm1 +- mov %rdi, %rcx +- punpckldq %xmm1, %xmm1 +- pxor %xmm2, %xmm2 +- punpckldq %xmm1, %xmm1 +- and $63, %rcx +- cmp $48, %rcx +- ja L(crosscache) ++#ifndef STRRCHR ++# define STRRCHR wcsrchr ++#endif + +- movdqu (%rdi), %xmm0 +- pcmpeqd %xmm0, %xmm2 +- pcmpeqd %xmm1, %xmm0 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm0, %rax +- add $16, %rdi +- +- test %rax, %rax +- jnz L(unaligned_match1) +- +- test %rcx, %rcx +- jnz L(return_null) +- +- and $-16, %rdi +- xor %r8, %r8 +- jmp L(loop) +- +- .p2align 4 +-L(unaligned_match1): +- test %rcx, %rcx +- jnz L(prolog_find_zero_1) +- +- mov %rax, %r8 +- mov %rdi, %rsi +- and $-16, %rdi +- jmp L(loop) +- +- .p2align 4 +-L(crosscache): +- and $15, %rcx +- and $-16, %rdi +- pxor %xmm3, %xmm3 +- movdqa (%rdi), %xmm0 +- pcmpeqd %xmm0, %xmm3 +- pcmpeqd %xmm1, %xmm0 +- pmovmskb %xmm3, %rdx +- pmovmskb %xmm0, %rax +- shr %cl, %rdx +- shr %cl, %rax +- add $16, %rdi +- +- test %rax, %rax +- jnz L(unaligned_match) +- +- test %rdx, %rdx +- jnz L(return_null) +- +- xor %r8, %r8 +- jmp L(loop) +- +- .p2align 4 +-L(unaligned_match): +- test %rdx, %rdx +- jnz L(prolog_find_zero) +- +- mov %rax, %r8 +- lea (%rdi, %rcx), %rsi +- +-/* Loop start on aligned string. */ +- .p2align 4 +-L(loop): +- movdqa (%rdi), %xmm0 +- pcmpeqd %xmm0, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm0 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm0, %rax +- or %rax, %rcx +- jnz L(matches) +- +- movdqa (%rdi), %xmm3 +- pcmpeqd %xmm3, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm3 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm3, %rax +- or %rax, %rcx +- jnz L(matches) +- +- movdqa (%rdi), %xmm4 +- pcmpeqd %xmm4, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm4 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm4, %rax +- or %rax, %rcx +- jnz L(matches) +- +- movdqa (%rdi), %xmm5 +- pcmpeqd %xmm5, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm5 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm5, %rax +- or %rax, %rcx +- jz L(loop) +- +- .p2align 4 +-L(matches): +- test %rax, %rax +- jnz L(match) +-L(return_value): +- test %r8, %r8 +- jz L(return_null) +- mov %r8, %rax +- mov %rsi, %rdi +- +- test $15 << 4, %ah +- jnz L(match_fourth_wchar) +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(match): +- pmovmskb %xmm2, %rcx +- test %rcx, %rcx +- jnz L(find_zero) +- mov %rax, %r8 +- mov %rdi, %rsi +- jmp L(loop) +- +- .p2align 4 +-L(find_zero): +- test $15, %cl +- jnz L(find_zero_in_first_wchar) +- test %cl, %cl +- jnz L(find_zero_in_second_wchar) +- test $15, %ch +- jnz L(find_zero_in_third_wchar) +- +- and $1 << 13 - 1, %rax +- jz L(return_value) +- +- test $15 << 4, %ah +- jnz L(match_fourth_wchar) +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(find_zero_in_first_wchar): +- test $1, %rax +- jz L(return_value) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(find_zero_in_second_wchar): +- and $1 << 5 - 1, %rax +- jz L(return_value) +- +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(find_zero_in_third_wchar): +- and $1 << 9 - 1, %rax +- jz L(return_value) +- +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero): +- add %rcx, 
%rdi +- mov %rdx, %rcx +-L(prolog_find_zero_1): +- test $15, %cl +- jnz L(prolog_find_zero_in_first_wchar) +- test %cl, %cl +- jnz L(prolog_find_zero_in_second_wchar) +- test $15, %ch +- jnz L(prolog_find_zero_in_third_wchar) +- +- and $1 << 13 - 1, %rax +- jz L(return_null) +- +- test $15 << 4, %ah +- jnz L(match_fourth_wchar) +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero_in_first_wchar): +- test $1, %rax +- jz L(return_null) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero_in_second_wchar): +- and $1 << 5 - 1, %rax +- jz L(return_null) +- +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero_in_third_wchar): +- and $1 << 9 - 1, %rax +- jz L(return_null) +- +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(match_second_wchar): +- lea -12(%rdi), %rax +- ret +- +- .p2align 4 +-L(match_third_wchar): +- lea -8(%rdi), %rax +- ret +- +- .p2align 4 +-L(match_fourth_wchar): +- lea -4(%rdi), %rax +- ret +- +- .p2align 4 +-L(return_null): +- xor %rax, %rax +- ret +- +-END (wcsrchr) ++#include "../strrchr.S" +-- +GitLab + diff --git a/SOURCES/ia-opt-strcmp-avx2.patch b/SOURCES/ia-opt-strcmp-avx2.patch new file mode 100644 index 0000000..9067242 --- /dev/null +++ b/SOURCES/ia-opt-strcmp-avx2.patch @@ -0,0 +1,1794 @@ +From 5e9c6a33e767576c063e1fc0077b3a749518e8f0 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 10 Jan 2022 15:35:38 -0600 +Subject: [PATCH] x86: Optimize strcmp-avx2.S + +Optimization are primarily to the loop logic and how the page cross +logic interacts with the loop. + +The page cross logic is at times more expensive for short strings near +the end of a page but not crossing the page. This is done to retest +the page cross conditions with a non-faulty check and to improve the +logic for entering the loop afterwards. This is only particular cases, +however, and is general made up for by more than 10x improvements on +the transition from the page cross -> loop case. + +The non-page cross cases are improved most for smaller sizes [0, 128] +and go about even for (128, 4096]. The loop page cross logic is +improved so some more significant speedup is seen there as well. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. + +Signed-off-by: Noah Goldstein +(cherry picked from commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45) +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 1592 ++++++++++++++---------- + 1 file changed, 940 insertions(+), 652 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 70d8499b..554ffe4c 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -26,35 +26,57 @@ + + # define PAGE_SIZE 4096 + +-/* VEC_SIZE = Number of bytes in a ymm register */ ++ /* VEC_SIZE = Number of bytes in a ymm register. */ + # define VEC_SIZE 32 + +-/* Shift for dividing by (VEC_SIZE * 4). */ +-# define DIVIDE_BY_VEC_4_SHIFT 7 +-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# endif ++# define VMOVU vmovdqu ++# define VMOVA vmovdqa + + # ifdef USE_AS_WCSCMP +-/* Compare packed dwords. */ ++ /* Compare packed dwords. */ + # define VPCMPEQ vpcmpeqd +-/* Compare packed dwords and store minimum. 
*/ ++ /* Compare packed dwords and store minimum. */ + # define VPMINU vpminud +-/* 1 dword char == 4 bytes. */ ++ /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else +-/* Compare packed bytes. */ ++ /* Compare packed bytes. */ + # define VPCMPEQ vpcmpeqb +-/* Compare packed bytes and store minimum. */ ++ /* Compare packed bytes and store minimum. */ + # define VPMINU vpminub +-/* 1 byte char == 1 byte. */ ++ /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif + ++# ifdef USE_AS_STRNCMP ++# define LOOP_REG r9d ++# define LOOP_REG64 r9 ++ ++# define OFFSET_REG8 r9b ++# define OFFSET_REG r9d ++# define OFFSET_REG64 r9 ++# else ++# define LOOP_REG edx ++# define LOOP_REG64 rdx ++ ++# define OFFSET_REG8 dl ++# define OFFSET_REG edx ++# define OFFSET_REG64 rdx ++# endif ++ + # ifndef VZEROUPPER + # define VZEROUPPER vzeroupper + # endif + ++# if defined USE_AS_STRNCMP ++# define VEC_OFFSET 0 ++# else ++# define VEC_OFFSET (-VEC_SIZE) ++# endif ++ ++# define xmmZERO xmm15 ++# define ymmZERO ymm15 ++ + # ifndef SECTION + # define SECTION(p) p##.avx + # endif +@@ -79,783 +101,1049 @@ + the maximum offset is reached before a difference is found, zero is + returned. */ + +- .section SECTION(.text),"ax",@progbits +-ENTRY (STRCMP) ++ .section SECTION(.text), "ax", @progbits ++ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP +- /* Check for simple cases (0 or 1) in offset. */ ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %rdx ++# endif + cmp $1, %RDX_LP +- je L(char0) +- jb L(zero) ++ /* Signed comparison intentional. We use this branch to also ++ test cases where length >= 2^63. These very large sizes can be ++ handled with strcmp as there is no way for that length to ++ actually bound the buffer. */ ++ jle L(one_or_less) + # ifdef USE_AS_WCSCMP +-# ifndef __ILP32__ + movq %rdx, %rcx +- /* Check if length could overflow when multiplied by +- sizeof(wchar_t). Checking top 8 bits will cover all potential +- overflow cases as well as redirect cases where its impossible to +- length to bound a valid memory region. In these cases just use +- 'wcscmp'. */ ++ ++ /* Multiplying length by sizeof(wchar_t) can result in overflow. ++ Check if that is possible. All cases where overflow are possible ++ are cases where length is large enough that it can never be a ++ bound on valid memory so just use wcscmp. */ + shrq $56, %rcx +- jnz OVERFLOW_STRCMP +-# endif +- /* Convert units: from wide to byte char. */ +- shl $2, %RDX_LP ++ jnz __wcscmp_avx2 ++ ++ leaq (, %rdx, 4), %rdx + # endif +- /* Register %r11 tracks the maximum offset. */ +- mov %RDX_LP, %R11_LP + # endif ++ vpxor %xmmZERO, %xmmZERO, %xmmZERO + movl %edi, %eax +- xorl %edx, %edx +- /* Make %xmm7 (%ymm7) all zeros in this function. */ +- vpxor %xmm7, %xmm7, %xmm7 + orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax +- jg L(cross_page) +- /* Start comparing 4 vectors. */ +- vmovdqu (%rdi), %ymm1 +- VPCMPEQ (%rsi), %ymm1, %ymm0 +- VPMINU %ymm1, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- je L(next_3_vectors) +- tzcntl %ecx, %edx ++ sall $20, %eax ++ /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ ++ cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax ++ ja L(page_cross) ++ ++L(no_page_cross): ++ /* Safe to compare 4x vectors. */ ++ VMOVU (%rdi), %ymm0 ++ /* 1s where s1 and s2 equal. */ ++ VPCMPEQ (%rsi), %ymm0, %ymm1 ++ /* 1s at null CHAR. 
*/ ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ /* 1s where s1 and s2 equal AND not null CHAR. */ ++ vpandn %ymm1, %ymm2, %ymm1 ++ ++ /* All 1s -> keep going, any 0s -> return. */ ++ vpmovmskb %ymm1, %ecx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx) is after the maximum +- offset (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $VEC_SIZE, %rdx ++ jbe L(vec_0_test_len) + # endif ++ ++ /* All 1s represents all equals. incl will overflow to zero in ++ all equals case. Otherwise 1s will carry until position of first ++ mismatch. */ ++ incl %ecx ++ jz L(more_3x_vec) ++ ++ .p2align 4,, 4 ++L(return_vec_0): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- je L(return) +-L(wcscmp_return): ++ cmpl (%rsi, %rcx), %edx ++ je L(ret0) + setl %al + negl %eax + orl $1, %eax +-L(return): + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret0): + L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + +- .p2align 4 +-L(return_vec_size): +- tzcntl %ecx, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after +- the maximum offset (%r11). */ +- addq $VEC_SIZE, %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ .p2align 4,, 8 ++L(vec_0_test_len): ++ notl %ecx ++ bzhil %edx, %ecx, %eax ++ jnz L(return_vec_0) ++ /* Align if will cross fetch block. */ ++ .p2align 4,, 2 ++L(ret_zero): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif +-# else ++ VZEROUPPER_RETURN ++ ++ .p2align 4,, 5 ++L(one_or_less): ++ jb L(ret_zero) + # ifdef USE_AS_WCSCMP ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ jnbe __wcscmp_avx2 ++ movl (%rdi), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rdx), %ecx +- cmpl VEC_SIZE(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (%rsi), %edx ++ je L(ret1) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl VEC_SIZE(%rdi, %rdx), %eax +- movzbl VEC_SIZE(%rsi, %rdx), %edx +- subl %edx, %eax ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ ++ jnbe __strcmp_avx2 ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ subl %ecx, %eax + # endif ++L(ret1): ++ ret + # endif +- VZEROUPPER_RETURN + +- .p2align 4 +-L(return_2_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 ++L(return_vec_1): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 2), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ /* rdx must be > CHAR_PER_VEC so save to subtract w.o fear of ++ overflow. 
*/ ++ addq $-VEC_SIZE, %rdx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_SIZE(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_SIZE(%rsi, %rcx), %edx ++ je L(ret2) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret2): + VZEROUPPER_RETURN + +- .p2align 4 +-L(return_3_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 3), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++L(return_vec_3): ++ salq $32, %rcx ++# endif ++ ++L(return_vec_2): ++# ifndef USE_AS_STRNCMP ++ tzcntl %ecx, %ecx ++# else ++ tzcntq %rcx, %rcx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ je L(ret3) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++L(ret3): ++ VZEROUPPER_RETURN ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_3): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx ++ je L(ret4) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret4): + VZEROUPPER_RETURN ++# endif ++ ++ .p2align 4,, 10 ++L(more_3x_vec): ++ /* Safe to compare 4x vectors. 
*/ ++ VMOVU VEC_SIZE(%rdi), %ymm0 ++ VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_1) ++ ++# ifdef USE_AS_STRNCMP ++ subq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ VMOVU (VEC_SIZE * 2)(%rdi), %ymm0 ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_2) ++ ++ VMOVU (VEC_SIZE * 3)(%rdi), %ymm0 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_3) + +- .p2align 4 +-L(next_3_vectors): +- vmovdqu VEC_SIZE(%rdi), %ymm6 +- VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3 +- VPMINU %ymm6, %ymm3, %ymm3 +- VPCMPEQ %ymm7, %ymm3, %ymm3 +- vpmovmskb %ymm3, %ecx +- testl %ecx, %ecx +- jne L(return_vec_size) +- vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5 +- vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4 +- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2 +- VPMINU %ymm5, %ymm2, %ymm2 +- VPCMPEQ %ymm4, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm2, %ymm2 +- vpmovmskb %ymm2, %ecx +- testl %ecx, %ecx +- jne L(return_2_vec_size) +- VPMINU %ymm4, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- jne L(return_3_vec_size) +-L(main_loop_header): +- leaq (VEC_SIZE * 4)(%rdi), %rdx +- movl $PAGE_SIZE, %ecx +- /* Align load via RAX. */ +- andq $-(VEC_SIZE * 4), %rdx +- subq %rdi, %rdx +- leaq (%rdi, %rdx), %rax + # ifdef USE_AS_STRNCMP +- /* Starting from this point, the maximum offset, or simply the +- 'offset', DECREASES by the same amount when base pointers are +- moved forward. Return 0 when: +- 1) On match: offset <= the matched vector index. +- 2) On mistmach, offset is before the mistmatched index. ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ /* any non-zero positive value that doesn't inference with 0x1. + */ +- subq %rdx, %r11 +- jbe L(zero) +-# endif +- addq %rsi, %rdx +- movq %rdx, %rsi +- andl $(PAGE_SIZE - 1), %esi +- /* Number of bytes before page crossing. */ +- subq %rsi, %rcx +- /* Number of VEC_SIZE * 4 blocks before page crossing. */ +- shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx +- /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ +- movl %ecx, %esi +- jmp L(loop_start) ++ movl $2, %r8d + ++# else ++ xorl %r8d, %r8d ++# endif ++ ++ /* The prepare labels are various entry points from the page ++ cross logic. */ ++L(prepare_loop): ++ ++# ifdef USE_AS_STRNCMP ++ /* Store N + (VEC_SIZE * 4) and place check at the begining of ++ the loop. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx ++# endif ++L(prepare_loop_no_len): ++ ++ /* Align s1 and adjust s2 accordingly. */ ++ subq %rdi, %rsi ++ andq $-(VEC_SIZE * 4), %rdi ++ addq %rdi, %rsi ++ ++# ifdef USE_AS_STRNCMP ++ subq %rdi, %rdx ++# endif ++ ++L(prepare_loop_aligned): ++ /* eax stores distance from rsi to next page cross. These cases ++ need to be handled specially as the 4x loop could potentially ++ read memory past the length of s1 or s2 and across a page ++ boundary. */ ++ movl $-(VEC_SIZE * 4), %eax ++ subl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ ++ /* Loop 4x comparisons at a time. */ + .p2align 4 + L(loop): ++ ++ /* End condition for strncmp. */ + # ifdef USE_AS_STRNCMP +- /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease +- the maximum offset (%r11) by the same amount. 
*/ +- subq $(VEC_SIZE * 4), %r11 +- jbe L(zero) +-# endif +- addq $(VEC_SIZE * 4), %rax +- addq $(VEC_SIZE * 4), %rdx +-L(loop_start): +- testl %esi, %esi +- leal -1(%esi), %esi +- je L(loop_cross_page) +-L(back_to_loop): +- /* Main loop, comparing 4 vectors are a time. */ +- vmovdqa (%rax), %ymm0 +- vmovdqa VEC_SIZE(%rax), %ymm3 +- VPCMPEQ (%rdx), %ymm0, %ymm4 +- VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1 +- VPMINU %ymm0, %ymm4, %ymm4 +- VPMINU %ymm3, %ymm1, %ymm1 +- vmovdqa (VEC_SIZE * 2)(%rax), %ymm2 +- VPMINU %ymm1, %ymm4, %ymm0 +- vmovdqa (VEC_SIZE * 3)(%rax), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6 +- VPMINU %ymm2, %ymm5, %ymm5 +- VPMINU %ymm3, %ymm6, %ymm6 +- VPMINU %ymm5, %ymm0, %ymm0 +- VPMINU %ymm6, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- +- /* Test each mask (32 bits) individually because for VEC_SIZE +- == 32 is not possible to OR the four masks and keep all bits +- in a 64-bit integer register, differing from SSE2 strcmp +- where ORing is possible. */ +- vpmovmskb %ymm0, %ecx ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ ++ /* Check if rsi loads will cross a page boundary. */ ++ addl $-(VEC_SIZE * 4), %eax ++ jnb L(page_cross_during_loop) ++ ++ /* Loop entry after handling page cross during loop. */ ++L(loop_skip_page_cross_check): ++ VMOVA (VEC_SIZE * 0)(%rdi), %ymm0 ++ VMOVA (VEC_SIZE * 1)(%rdi), %ymm2 ++ VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 ++ ++ /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */ ++ VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1 ++ ++ VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ ++ ++ /* If any mismatches or null CHAR then 0 CHAR, otherwise non- ++ zero. */ ++ vpand %ymm0, %ymm1, %ymm1 ++ ++ ++ vpand %ymm2, %ymm3, %ymm3 ++ vpand %ymm4, %ymm5, %ymm5 ++ vpand %ymm6, %ymm7, %ymm7 ++ ++ VPMINU %ymm1, %ymm3, %ymm3 ++ VPMINU %ymm5, %ymm7, %ymm7 ++ ++ /* Reduce all 0 CHARs for the 4x VEC into ymm7. */ ++ VPMINU %ymm3, %ymm7, %ymm7 ++ ++ /* If any 0 CHAR then done. */ ++ VPCMPEQ %ymm7, %ymmZERO, %ymm7 ++ vpmovmskb %ymm7, %LOOP_REG ++ testl %LOOP_REG, %LOOP_REG ++ jz L(loop) ++ ++ /* Find which VEC has the mismatch of end of string. */ ++ VPCMPEQ %ymm1, %ymmZERO, %ymm1 ++ vpmovmskb %ymm1, %ecx + testl %ecx, %ecx +- je L(loop) +- VPCMPEQ %ymm7, %ymm4, %ymm0 +- vpmovmskb %ymm0, %edi +- testl %edi, %edi +- je L(test_vec) +- tzcntl %edi, %ecx ++ jnz L(return_vec_0_end) ++ ++ ++ VPCMPEQ %ymm3, %ymmZERO, %ymm3 ++ vpmovmskb %ymm3, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_1_end) ++ ++L(return_vec_2_3_end): + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ subq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++ VPCMPEQ %ymm5, %ymmZERO, %ymm5 ++ vpmovmskb %ymm5, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_2_end) ++ ++ /* LOOP_REG contains matches for null/mismatch from the loop. If ++ VEC 0,1,and 2 all have no null and no mismatches then mismatch ++ must entirely be from VEC 3 which is fully represented by ++ LOOP_REG. 
*/ ++ tzcntl %LOOP_REG, %LOOP_REG ++ ++# ifdef USE_AS_STRNCMP ++ subl $-(VEC_SIZE), %LOOP_REG ++ cmpq %LOOP_REG64, %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ je L(ret5) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax ++ movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret5): + VZEROUPPER_RETURN + +- .p2align 4 +-L(test_vec): + # ifdef USE_AS_STRNCMP +- /* The first vector matched. Return 0 if the maximum offset +- (%r11) <= VEC_SIZE. */ +- cmpq $VEC_SIZE, %r11 +- jbe L(zero) ++ .p2align 4,, 2 ++L(ret_zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN + # endif +- VPCMPEQ %ymm7, %ymm1, %ymm1 +- vpmovmskb %ymm1, %ecx +- testl %ecx, %ecx +- je L(test_2_vec) +- tzcntl %ecx, %edi ++ ++ ++ /* The L(return_vec_N_end) differ from L(return_vec_N) in that ++ they use the value of `r8` to negate the return value. This is ++ because the page cross logic can swap `rdi` and `rsi`. */ ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- addq $VEC_SIZE, %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++L(return_vec_1_end): ++ salq $32, %rcx ++# endif ++L(return_vec_0_end): ++# ifndef USE_AS_STRNCMP ++ tzcntl %ecx, %ecx ++# else ++ tzcntq %rcx, %rcx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ cmpl (%rsi, %rcx), %edx ++ je L(ret6) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax ++# endif ++L(ret6): ++ VZEROUPPER_RETURN ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_1_end): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_SIZE(%rdi, %rcx), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rsi, %rdi), %ecx +- cmpl VEC_SIZE(%rdx, %rdi), %ecx +- jne L(wcscmp_return) ++ cmpl VEC_SIZE(%rsi, %rcx), %edx ++ je L(ret7) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +- movzbl VEC_SIZE(%rax, %rdi), %eax +- movzbl VEC_SIZE(%rdx, %rdi), %edx +- subl %edx, %eax ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +-# endif ++L(ret7): + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(test_2_vec): ++ .p2align 4,, 10 ++L(return_vec_2_end): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +- /* The first 2 vectors matched. Return 0 if the maximum offset +- (%r11) <= 2 * VEC_SIZE. 
*/ +- cmpq $(VEC_SIZE * 2), %r11 +- jbe L(zero) ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_page_cross) + # endif +- VPCMPEQ %ymm7, %ymm5, %ymm5 +- vpmovmskb %ymm5, %ecx +- testl %ecx, %ecx +- je L(test_3_vec) +- tzcntl %ecx, %edi +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ je L(ret11) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx +- cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret11): + VZEROUPPER_RETURN + +- .p2align 4 +-L(test_3_vec): ++ ++ /* Page cross in rsi in next 4x VEC. */ ++ ++ /* TODO: Improve logic here. */ ++ .p2align 4,, 10 ++L(page_cross_during_loop): ++ /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ ++ ++ /* Optimistically rsi and rdi and both aligned inwhich case we ++ don't need any logic here. */ ++ cmpl $-(VEC_SIZE * 4), %eax ++ /* Don't adjust eax before jumping back to loop and we will ++ never hit page cross case again. */ ++ je L(loop_skip_page_cross_check) ++ ++ /* Check if we can safely load a VEC. */ ++ cmpl $-(VEC_SIZE * 3), %eax ++ jle L(less_1x_vec_till_page_cross) ++ ++ VMOVA (%rdi), %ymm0 ++ VPCMPEQ (%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_0_end) ++ ++ /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ ++ cmpl $-(VEC_SIZE * 2), %eax ++ jg L(more_2x_vec_till_page_cross) ++ ++ .p2align 4,, 4 ++L(less_1x_vec_till_page_cross): ++ subl $-(VEC_SIZE * 4), %eax ++ /* Guranteed safe to read from rdi - VEC_SIZE here. The only ++ concerning case is first iteration if incoming s1 was near start ++ of a page and s2 near end. If s1 was near the start of the page ++ we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe ++ to read back -VEC_SIZE. If rdi is truly at the start of a page ++ here, it means the previous page (rdi - VEC_SIZE) has already ++ been loaded earlier so must be valid. */ ++ VMOVU -VEC_SIZE(%rdi, %rax), %ymm0 ++ VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ ++ /* Mask of potentially valid bits. The lower bits can be out of ++ range comparisons (but safe regarding page crosses). */ ++ movl $-1, %r10d ++ shlxl %esi, %r10d, %r10d ++ notl %ecx ++ + # ifdef USE_AS_STRNCMP +- /* The first 3 vectors matched. Return 0 if the maximum offset +- (%r11) <= 3 * VEC_SIZE. 
*/ +- cmpq $(VEC_SIZE * 3), %r11 +- jbe L(zero) +-# endif +- VPCMPEQ %ymm7, %ymm6, %ymm6 +- vpmovmskb %ymm6, %esi +- tzcntl %esi, %ecx ++ cmpq %rax, %rdx ++ jbe L(return_page_cross_end_check) ++# endif ++ movl %eax, %OFFSET_REG ++ addl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ ++ andl %r10d, %ecx ++ jz L(loop_skip_page_cross_check) ++ ++ .p2align 4,, 3 ++L(return_page_cross_end): ++ tzcntl %ecx, %ecx ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 3), %rcx +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %esi +- cmpl (%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ leal -VEC_SIZE(%OFFSET_REG64, %rcx), %ecx ++L(return_page_cross_cmp_mem): + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ addl %OFFSET_REG, %ecx ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rsi, %rcx), %esi +- cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret8) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret8): + VZEROUPPER_RETURN + +- .p2align 4 +-L(loop_cross_page): +- xorl %r10d, %r10d +- movq %rdx, %rcx +- /* Align load via RDX. We load the extra ECX bytes which should +- be ignored. */ +- andl $((VEC_SIZE * 4) - 1), %ecx +- /* R10 is -RCX. */ +- subq %rcx, %r10 +- +- /* This works only if VEC_SIZE * 2 == 64. */ +-# if (VEC_SIZE * 2) != 64 +-# error (VEC_SIZE * 2) != 64 +-# endif +- +- /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ +- cmpl $(VEC_SIZE * 2), %ecx +- jge L(loop_cross_page_2_vec) +- +- vmovdqu (%rax, %r10), %ymm2 +- vmovdqu VEC_SIZE(%rax, %r10), %ymm3 +- VPCMPEQ (%rdx, %r10), %ymm2, %ymm0 +- VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1 +- VPMINU %ymm2, %ymm0, %ymm0 +- VPMINU %ymm3, %ymm1, %ymm1 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm1, %ymm1 +- +- vpmovmskb %ymm0, %edi +- vpmovmskb %ymm1, %esi +- +- salq $32, %rsi +- xorq %rsi, %rdi +- +- /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ +- shrq %cl, %rdi +- +- testq %rdi, %rdi +- je L(loop_cross_page_2_vec) +- tzcntq %rdi, %rcx + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ .p2align 4,, 10 ++L(return_page_cross_end_check): ++ tzcntl %ecx, %ecx ++ leal -VEC_SIZE(%rax, %rcx), %ecx ++ cmpl %ecx, %edx ++ ja L(return_page_cross_cmp_mem) + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# endif + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(loop_cross_page_2_vec): +- /* The first VEC_SIZE * 2 bytes match or are ignored. 
*/ +- vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2 +- vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5 +- VPMINU %ymm2, %ymm5, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6 +- VPCMPEQ %ymm7, %ymm5, %ymm5 +- VPMINU %ymm3, %ymm6, %ymm6 +- VPCMPEQ %ymm7, %ymm6, %ymm6 +- +- vpmovmskb %ymm5, %edi +- vpmovmskb %ymm6, %esi +- +- salq $32, %rsi +- xorq %rsi, %rdi + +- xorl %r8d, %r8d +- /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ +- subl $(VEC_SIZE * 2), %ecx +- jle 1f +- /* Skip ECX bytes. */ +- shrq %cl, %rdi +- /* R8 has number of bytes skipped. */ +- movl %ecx, %r8d +-1: +- /* Before jumping back to the loop, set ESI to the number of +- VEC_SIZE * 4 blocks before page crossing. */ +- movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi +- +- testq %rdi, %rdi ++ .p2align 4,, 10 ++L(more_2x_vec_till_page_cross): ++ /* If more 2x vec till cross we will complete a full loop ++ iteration here. */ ++ ++ VMOVU VEC_SIZE(%rdi), %ymm0 ++ VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_1_end) ++ + # ifdef USE_AS_STRNCMP +- /* At this point, if %rdi value is 0, it already tested +- VEC_SIZE*4+%r10 byte starting from %rax. This label +- checks whether strncmp maximum offset reached or not. */ +- je L(string_nbyte_offset_check) +-# else +- je L(back_to_loop) ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero_in_loop_page_cross) + # endif +- tzcntq %rdi, %rcx +- addq %r10, %rcx +- /* Adjust for number of bytes skipped. */ +- addq %r8, %rcx ++ ++ subl $-(VEC_SIZE * 4), %eax ++ ++ /* Safe to include comparisons from lower bytes. */ ++ VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0 ++ VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_page_cross_0) ++ ++ VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0 ++ VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_page_cross_1) ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rcx +- subq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ /* Must check length here as length might proclude reading next ++ page. */ ++ cmpq %rax, %rdx ++ jbe L(ret_zero_in_loop_page_cross) ++# endif ++ ++ /* Finish the loop. */ ++ VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 ++ ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ vpand %ymm4, %ymm5, %ymm5 ++ vpand %ymm6, %ymm7, %ymm7 ++ VPMINU %ymm5, %ymm7, %ymm7 ++ VPCMPEQ %ymm7, %ymmZERO, %ymm7 ++ vpmovmskb %ymm7, %LOOP_REG ++ testl %LOOP_REG, %LOOP_REG ++ jnz L(return_vec_2_3_end) ++ ++ /* Best for code size to include ucond-jmp here. Would be faster ++ if this case is hot to duplicate the L(return_vec_2_3_end) code ++ as fall-through and have jump back to loop on mismatch ++ comparison. 
*/ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $(PAGE_SIZE - VEC_SIZE * 8), %eax ++# ifdef USE_AS_STRNCMP ++ subq $(VEC_SIZE * 4), %rdx ++ ja L(loop_skip_page_cross_check) ++L(ret_zero_in_loop_page_cross): + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ VZEROUPPER_RETURN + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rcx), %edi +- cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ jmp L(loop_skip_page_cross_check) + # endif +- VZEROUPPER_RETURN + ++ ++ .p2align 4,, 10 ++L(return_vec_page_cross_0): ++ addl $-VEC_SIZE, %eax ++L(return_vec_page_cross_1): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +-L(string_nbyte_offset_check): +- leaq (VEC_SIZE * 4)(%r10), %r10 +- cmpq %r10, %r11 +- jbe L(zero) +- jmp L(back_to_loop) ++ leal -VEC_SIZE(%rax, %rcx), %ecx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_in_loop_page_cross) ++# else ++ addl %eax, %ecx + # endif + +- .p2align 4 +-L(cross_page_loop): +- /* Check one byte/dword at a time. */ + # ifdef USE_AS_WCSCMP +- cmpl %ecx, %eax ++ movl VEC_OFFSET(%rdi, %rcx), %edx ++ xorl %eax, %eax ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret9) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx + subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +- jne L(different) +- addl $SIZE_OF_CHAR, %edx +- cmpl $(VEC_SIZE * 4), %edx +- je L(main_loop_header) +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++L(ret9): ++ VZEROUPPER_RETURN ++ ++ ++ .p2align 4,, 10 ++L(page_cross): ++# ifndef USE_AS_STRNCMP ++ /* If both are VEC aligned we don't need any special logic here. ++ Only valid for strcmp where stop condition is guranteed to be ++ reachable by just reading memory. */ ++ testl $((VEC_SIZE - 1) << 20), %eax ++ jz L(no_page_cross) + # endif ++ ++ movl %edi, %eax ++ movl %esi, %ecx ++ andl $(PAGE_SIZE - 1), %eax ++ andl $(PAGE_SIZE - 1), %ecx ++ ++ xorl %OFFSET_REG, %OFFSET_REG ++ ++ /* Check which is closer to page cross, s1 or s2. */ ++ cmpl %eax, %ecx ++ jg L(page_cross_s2) ++ ++ /* The previous page cross check has false positives. Check for ++ true positive as page cross logic is very expensive. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ jbe L(no_page_cross) ++ ++ /* Set r8 to not interfere with normal return value (rdi and rsi ++ did not swap). */ + # ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx ++ xorl %r8d, %r8d + # endif +- /* Check null char. */ +- testl %eax, %eax +- jne L(cross_page_loop) +- /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +- comparisons. */ +- subl %ecx, %eax +-# ifndef USE_AS_WCSCMP +-L(different): ++ ++ /* Check if less than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jg L(less_1x_vec_till_page) ++ ++ /* If more than 1x VEC till page cross, loop throuh safely ++ loadable memory until within 1x VEC of page cross. 
*/ ++ ++ .p2align 4,, 10 ++L(page_cross_loop): ++ ++ VMOVU (%rdi, %OFFSET_REG64), %ymm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ ++ jnz L(check_ret_vec_page_cross) ++ addl $VEC_SIZE, %OFFSET_REG ++# ifdef USE_AS_STRNCMP ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross) + # endif +- VZEROUPPER_RETURN ++ addl $VEC_SIZE, %eax ++ jl L(page_cross_loop) ++ ++ subl %eax, %OFFSET_REG ++ /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed ++ to not cross page so is safe to load. Since we have already ++ loaded at least 1 VEC from rsi it is also guranteed to be safe. ++ */ ++ ++ VMOVU (%rdi, %OFFSET_REG64), %ymm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ ++# ifdef USE_AS_STRNCMP ++ leal VEC_SIZE(%OFFSET_REG64), %eax ++ cmpq %rax, %rdx ++ jbe L(check_ret_vec_page_cross2) ++ addq %rdi, %rdx ++# endif ++ incl %ecx ++ jz L(prepare_loop_no_len) + ++ .p2align 4,, 4 ++L(ret_vec_page_cross): ++# ifndef USE_AS_STRNCMP ++L(check_ret_vec_page_cross): ++# endif ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++L(ret_vec_page_cross_cont): + # ifdef USE_AS_WCSCMP +- .p2align 4 +-L(different): +- /* Use movl to avoid modifying EFLAGS. */ +- movl $0, %eax ++ movl (%rdi, %rcx), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx), %edx ++ je L(ret12) + setl %al + negl %eax +- orl $1, %eax +- VZEROUPPER_RETURN ++ xorl %r8d, %eax ++# else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret12): ++ VZEROUPPER_RETURN + + # ifdef USE_AS_STRNCMP +- .p2align 4 +-L(zero): ++ .p2align 4,, 10 ++L(check_ret_vec_page_cross2): ++ incl %ecx ++L(check_ret_vec_page_cross): ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++ cmpq %rcx, %rdx ++ ja L(ret_vec_page_cross_cont) ++ .p2align 4,, 2 ++L(ret_zero_page_cross): + xorl %eax, %eax + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(char0): +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +-# endif +- VZEROUPPER_RETURN ++ .p2align 4,, 4 ++L(page_cross_s2): ++ /* Ensure this is a true page cross. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx ++ jbe L(no_page_cross) ++ ++ ++ movl %ecx, %eax ++ movq %rdi, %rcx ++ movq %rsi, %rdi ++ movq %rcx, %rsi ++ ++ /* set r8 to negate return value as rdi and rsi swapped. */ ++# ifdef USE_AS_WCSCMP ++ movl $-4, %r8d ++# else ++ movl $-1, %r8d + # endif ++ xorl %OFFSET_REG, %OFFSET_REG + +- .p2align 4 +-L(last_vector): +- addq %rdx, %rdi +- addq %rdx, %rsi ++ /* Check if more than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jle L(page_cross_loop) ++ ++ .p2align 4,, 6 ++L(less_1x_vec_till_page): ++ /* Find largest load size we can use. */ ++ cmpl $16, %eax ++ ja L(less_16_till_page) ++ ++ VMOVU (%rdi), %xmm0 ++ VPCMPEQ (%rsi), %xmm0, %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incw %cx ++ jnz L(check_ret_vec_page_cross) ++ movl $16, %OFFSET_REG + # ifdef USE_AS_STRNCMP +- subq %rdx, %r11 ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subl %eax, %OFFSET_REG ++# else ++ /* Explicit check for 16 byte alignment. 
*/ ++ subl %eax, %OFFSET_REG ++ jz L(prepare_loop) + # endif +- tzcntl %ecx, %edx ++ ++ VMOVU (%rdi, %OFFSET_REG64), %xmm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incw %cx ++ jnz L(check_ret_vec_page_cross) ++ + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ addl $16, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(VEC_SIZE * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi + # endif +-# ifdef USE_AS_WCSCMP ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case0): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ ret + # endif +- VZEROUPPER_RETURN + +- /* Comparing on page boundary region requires special treatment: +- It must done one vector at the time, starting with the wider +- ymm vector if possible, if not, with xmm. If fetching 16 bytes +- (xmm) still passes the boundary, byte comparison must be done. +- */ +- .p2align 4 +-L(cross_page): +- /* Try one ymm vector at a time. */ +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(cross_page_1_vector) +-L(loop_1_vector): +- vmovdqu (%rdi, %rdx), %ymm1 +- VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0 +- VPMINU %ymm1, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- jne L(last_vector) + +- addl $VEC_SIZE, %edx ++ .p2align 4,, 10 ++L(less_16_till_page): ++ /* Find largest load size we can use. */ ++ cmpl $24, %eax ++ ja L(less_8_till_page) + +- addl $VEC_SIZE, %eax +-# ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) +-# endif +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jle L(loop_1_vector) +-L(cross_page_1_vector): +- /* Less than 32 bytes to check, try one xmm vector. */ +- cmpl $(PAGE_SIZE - 16), %eax +- jg L(cross_page_1_xmm) +- vmovdqu (%rdi, %rdx), %xmm1 +- VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ vmovq (%rdi), %xmm0 ++ vmovq (%rsi), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incb %cl ++ jnz L(check_ret_vec_page_cross) + +- addl $16, %edx +-# ifndef USE_AS_WCSCMP +- addl $16, %eax ++ ++# ifdef USE_AS_STRNCMP ++ cmpq $8, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) + # endif ++ movl $24, %OFFSET_REG ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG ++ ++ ++ ++ vmovq (%rdi, %OFFSET_REG64), %xmm0 ++ vmovq (%rsi, %OFFSET_REG64), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incb %cl ++ jnz L(check_ret_vec_page_cross) ++ + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) +-# endif +- +-L(cross_page_1_xmm): +-# ifndef USE_AS_WCSCMP +- /* Less than 16 bytes to check, try 8 byte vector. NB: No need +- for wcscmp nor wcsncmp since wide char is 4 bytes. 
*/ +- cmpl $(PAGE_SIZE - 8), %eax +- jg L(cross_page_8bytes) +- vmovq (%rdi, %rdx), %xmm1 +- vmovq (%rsi, %rdx), %xmm0 +- VPCMPEQ %xmm0, %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- /* Only last 8 bits are valid. */ +- andl $0xff, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ addl $8, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(VEC_SIZE * 4), %rdx + +- addl $8, %edx +- addl $8, %eax ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++ ++ .p2align 4,, 10 ++L(less_8_till_page): ++# ifdef USE_AS_WCSCMP ++ /* If using wchar then this is the only check before we reach ++ the page boundary. */ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ cmpl %ecx, %eax ++ jnz L(ret_less_8_wcs) + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ addq %rdi, %rdx ++ /* We already checked for len <= 1 so cannot hit that case here. ++ */ + # endif ++ testl %eax, %eax ++ jnz L(prepare_loop_no_len) ++ ret + +-L(cross_page_8bytes): +- /* Less than 8 bytes to check, try 4 byte vector. */ +- cmpl $(PAGE_SIZE - 4), %eax +- jg L(cross_page_4bytes) +- vmovd (%rdi, %rdx), %xmm1 +- vmovd (%rsi, %rdx), %xmm0 +- VPCMPEQ %xmm0, %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- /* Only last 4 bits are valid. */ +- andl $0xf, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ .p2align 4,, 8 ++L(ret_less_8_wcs): ++ setl %OFFSET_REG8 ++ negl %OFFSET_REG ++ movl %OFFSET_REG, %eax ++ xorl %r8d, %eax ++ ret ++ ++# else ++ ++ /* Find largest load size we can use. */ ++ cmpl $28, %eax ++ ja L(less_4_till_page) ++ ++ vmovd (%rdi), %xmm0 ++ vmovd (%rsi), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) + +- addl $4, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $4, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) + # endif ++ movl $28, %OFFSET_REG ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG + +-L(cross_page_4bytes): +-# endif +- /* Less than 4 bytes to check, try one byte/dword at a time. 
*/ +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) +-# endif +-# ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx +-# endif +- testl %eax, %eax +- jne L(cross_page_loop) ++ ++ ++ vmovd (%rdi, %OFFSET_REG64), %xmm0 ++ vmovd (%rsi, %OFFSET_REG64), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) ++ ++# ifdef USE_AS_STRNCMP ++ addl $4, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) ++ subq $-(VEC_SIZE * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case1): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ .p2align 4,, 10 ++L(less_4_till_page): ++ subq %rdi, %rsi ++ /* Extremely slow byte comparison loop. */ ++L(less_4_loop): ++ movzbl (%rdi), %eax ++ movzbl (%rsi, %rdi), %ecx + subl %ecx, %eax +- VZEROUPPER_RETURN +-END (STRCMP) ++ jnz L(ret_less_4_loop) ++ testl %ecx, %ecx ++ jz L(ret_zero_4_loop) ++# ifdef USE_AS_STRNCMP ++ decq %rdx ++ jz L(ret_zero_4_loop) ++# endif ++ incq %rdi ++ /* end condition is reach page boundary (rdi is aligned). */ ++ testl $31, %edi ++ jnz L(less_4_loop) ++ leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi ++ addq $-(VEC_SIZE * 4), %rdi ++# ifdef USE_AS_STRNCMP ++ subq $-(VEC_SIZE * 4), %rdx ++# endif ++ jmp L(prepare_loop_aligned) ++ ++L(ret_zero_4_loop): ++ xorl %eax, %eax ++ ret ++L(ret_less_4_loop): ++ xorl %r8d, %eax ++ subl %r8d, %eax ++ ret ++# endif ++END(STRCMP) + #endif +-- +GitLab + diff --git a/SOURCES/ia-opt-strcmp-evex.patch b/SOURCES/ia-opt-strcmp-evex.patch new file mode 100644 index 0000000..f5019bf --- /dev/null +++ b/SOURCES/ia-opt-strcmp-evex.patch @@ -0,0 +1,1992 @@ +From d16c728bff5a92a254d7078d1766a4f3070acd66 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 10 Jan 2022 15:35:39 -0600 +Subject: [PATCH] x86: Optimize strcmp-evex.S + +Optimization are primarily to the loop logic and how the page cross +logic interacts with the loop. + +The page cross logic is at times more expensive for short strings near +the end of a page but not crossing the page. This is done to retest +the page cross conditions with a non-faulty check and to improve the +logic for entering the loop afterwards. This is only particular cases, +however, and is general made up for by more than 10x improvements on +the transition from the page cross -> loop case. + +The non-page cross cases as well are nearly universally improved. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. + +Signed-off-by: Noah Goldstein +(cherry picked from commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9) +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 1712 +++++++++++++----------- + 1 file changed, 919 insertions(+), 793 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 6f5c4bf9..99d8409a 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -26,54 +26,69 @@ + + # define PAGE_SIZE 4096 + +-/* VEC_SIZE = Number of bytes in a ymm register */ ++ /* VEC_SIZE = Number of bytes in a ymm register. 
*/ + # define VEC_SIZE 32 ++# define CHAR_PER_VEC (VEC_SIZE / SIZE_OF_CHAR) + +-/* Shift for dividing by (VEC_SIZE * 4). */ +-# define DIVIDE_BY_VEC_4_SHIFT 7 +-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# endif +- +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSCMP +-/* Compare packed dwords. */ +-# define VPCMP vpcmpd ++# define TESTEQ subl $0xff, ++ /* Compare packed dwords. */ ++# define VPCMP vpcmpd + # define VPMINU vpminud + # define VPTESTM vptestmd +-# define SHIFT_REG32 r8d +-# define SHIFT_REG64 r8 +-/* 1 dword char == 4 bytes. */ ++ /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else +-/* Compare packed bytes. */ +-# define VPCMP vpcmpb ++# define TESTEQ incl ++ /* Compare packed bytes. */ ++# define VPCMP vpcmpb + # define VPMINU vpminub + # define VPTESTM vptestmb +-# define SHIFT_REG32 ecx +-# define SHIFT_REG64 rcx +-/* 1 byte char == 1 byte. */ ++ /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif + ++# ifdef USE_AS_STRNCMP ++# define LOOP_REG r9d ++# define LOOP_REG64 r9 ++ ++# define OFFSET_REG8 r9b ++# define OFFSET_REG r9d ++# define OFFSET_REG64 r9 ++# else ++# define LOOP_REG edx ++# define LOOP_REG64 rdx ++ ++# define OFFSET_REG8 dl ++# define OFFSET_REG edx ++# define OFFSET_REG64 rdx ++# endif ++ ++# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP ++# define VEC_OFFSET 0 ++# else ++# define VEC_OFFSET (-VEC_SIZE) ++# endif ++ + # define XMMZERO xmm16 +-# define XMM0 xmm17 +-# define XMM1 xmm18 ++# define XMM0 xmm17 ++# define XMM1 xmm18 + + # define YMMZERO ymm16 +-# define YMM0 ymm17 +-# define YMM1 ymm18 +-# define YMM2 ymm19 +-# define YMM3 ymm20 +-# define YMM4 ymm21 +-# define YMM5 ymm22 +-# define YMM6 ymm23 +-# define YMM7 ymm24 +-# define YMM8 ymm25 +-# define YMM9 ymm26 +-# define YMM10 ymm27 ++# define YMM0 ymm17 ++# define YMM1 ymm18 ++# define YMM2 ymm19 ++# define YMM3 ymm20 ++# define YMM4 ymm21 ++# define YMM5 ymm22 ++# define YMM6 ymm23 ++# define YMM7 ymm24 ++# define YMM8 ymm25 ++# define YMM9 ymm26 ++# define YMM10 ymm27 + + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. +@@ -96,985 +111,1096 @@ + the maximum offset is reached before a difference is found, zero is + returned. */ + +- .section .text.evex,"ax",@progbits +-ENTRY (STRCMP) ++ .section .text.evex, "ax", @progbits ++ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP +- /* Check for simple cases (0 or 1) in offset. */ +- cmp $1, %RDX_LP +- je L(char0) +- jb L(zero) +-# ifdef USE_AS_WCSCMP +-# ifndef __ILP32__ +- movq %rdx, %rcx +- /* Check if length could overflow when multiplied by +- sizeof(wchar_t). Checking top 8 bits will cover all potential +- overflow cases as well as redirect cases where its impossible to +- length to bound a valid memory region. In these cases just use +- 'wcscmp'. */ +- shrq $56, %rcx +- jnz __wcscmp_evex +-# endif +- /* Convert units: from wide to byte char. */ +- shl $2, %RDX_LP ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %rdx + # endif +- /* Register %r11 tracks the maximum offset. */ +- mov %RDX_LP, %R11_LP ++ cmp $1, %RDX_LP ++ /* Signed comparison intentional. We use this branch to also ++ test cases where length >= 2^63. These very large sizes can be ++ handled with strcmp as there is no way for that length to ++ actually bound the buffer. 
*/ ++ jle L(one_or_less) + # endif + movl %edi, %eax +- xorl %edx, %edx +- /* Make %XMMZERO (%YMMZERO) all zeros in this function. */ +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO + orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax +- jg L(cross_page) +- /* Start comparing 4 vectors. */ ++ /* Shift out the bits irrelivant to page boundary ([63:12]). */ ++ sall $20, %eax ++ /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ ++ cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax ++ ja L(page_cross) ++ ++L(no_page_cross): ++ /* Safe to compare 4x vectors. */ + VMOVU (%rdi), %YMM0 +- +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- + /* Each bit cleared in K1 represents a mismatch or a null CHAR + in YMM0 and 32 bytes at (%rsi). */ + VPCMP $0, (%rsi), %YMM0, %k1{%k2} +- + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(next_3_vectors) +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx) is after the maximum +- offset (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $CHAR_PER_VEC, %rdx ++ jbe L(vec_0_test_len) + # endif ++ ++ /* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for ++ wcscmp/wcsncmp. */ ++ ++ /* All 1s represents all equals. TESTEQ will overflow to zero in ++ all equals case. Otherwise 1s will carry until position of first ++ mismatch. */ ++ TESTEQ %ecx ++ jz L(more_3x_vec) ++ ++ .p2align 4,, 4 ++L(return_vec_0): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- je L(return) +-L(wcscmp_return): ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret0) + setl %al + negl %eax + orl $1, %eax +-L(return): + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret0): + ret + +-L(return_vec_size): +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after +- the maximum offset (%r11). */ +- addq $VEC_SIZE, %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ .p2align 4,, 4 ++L(vec_0_test_len): ++ notl %ecx ++ bzhil %edx, %ecx, %eax ++ jnz L(return_vec_0) ++ /* Align if will cross fetch block. */ ++ .p2align 4,, 2 ++L(ret_zero): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif +-# else ++ ret ++ ++ .p2align 4,, 5 ++L(one_or_less): ++ jb L(ret_zero) + # ifdef USE_AS_WCSCMP ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ jnbe __wcscmp_evex ++ movl (%rdi), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rdx), %ecx +- cmpl VEC_SIZE(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (%rsi), %edx ++ je L(ret1) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl VEC_SIZE(%rdi, %rdx), %eax +- movzbl VEC_SIZE(%rsi, %rdx), %edx +- subl %edx, %eax ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). 
*/ ++ jnbe __strcmp_evex ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret1): + ret ++# endif + +-L(return_2_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 ++L(return_vec_1): ++ tzcntl %ecx, %ecx ++# ifdef USE_AS_STRNCMP ++ /* rdx must be > CHAR_PER_VEC so its safe to subtract without ++ worrying about underflow. */ ++ addq $-CHAR_PER_VEC, %rdx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx ++ movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret2) ++ setl %al ++ negl %eax ++ orl $1, %eax ++# else ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret2): ++ ret ++ ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 2), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++L(return_vec_3): ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %ecx + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ salq $CHAR_PER_VEC, %rcx + # endif ++# endif ++L(return_vec_2): ++# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) ++ tzcntl %ecx, %ecx + # else +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ tzcntq %rcx, %rcx + # endif +- ret + +-L(return_3_vec_size): +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is +- after the maximum offset (%r11). 
*/ +- addq $(VEC_SIZE * 3), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret3) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++L(ret3): ++ ret ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_3): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret4) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret4): + ret ++# endif + +- .p2align 4 +-L(next_3_vectors): +- VMOVU VEC_SIZE(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ /* 32 byte align here ensures the main loop is ideally aligned ++ for DSB. */ ++ .p2align 5 ++L(more_3x_vec): ++ /* Safe to compare 4x vectors. */ ++ VMOVU (VEC_SIZE)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at VEC_SIZE(%rsi). */ +- VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_1) ++ ++# ifdef USE_AS_STRNCMP ++ subq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero) + # endif +- jne L(return_vec_size) + + VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ + VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- jne L(return_2_vec_size) ++ TESTEQ %ecx ++ jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ + VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_3) ++ ++# ifdef USE_AS_STRNCMP ++ cmpq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d ++ + # else +- incl %ecx ++ xorl %r8d, %r8d + # endif +- jne L(return_3_vec_size) +-L(main_loop_header): +- leaq (VEC_SIZE * 4)(%rdi), %rdx +- movl $PAGE_SIZE, %ecx +- /* Align load via RAX. */ +- andq $-(VEC_SIZE * 4), %rdx +- subq %rdi, %rdx +- leaq (%rdi, %rdx), %rax ++ ++ /* The prepare labels are various entry points from the page ++ cross logic. 
*/ ++L(prepare_loop): ++ + # ifdef USE_AS_STRNCMP +- /* Starting from this point, the maximum offset, or simply the +- 'offset', DECREASES by the same amount when base pointers are +- moved forward. Return 0 when: +- 1) On match: offset <= the matched vector index. +- 2) On mistmach, offset is before the mistmatched index. +- */ +- subq %rdx, %r11 +- jbe L(zero) ++# ifdef USE_AS_WCSCMP ++L(prepare_loop_no_len): ++ movl %edi, %ecx ++ andl $(VEC_SIZE * 4 - 1), %ecx ++ shrl $2, %ecx ++ leaq (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx ++# else ++ /* Store N + (VEC_SIZE * 4) and place check at the begining of ++ the loop. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx ++L(prepare_loop_no_len): ++# endif ++# else ++L(prepare_loop_no_len): + # endif +- addq %rsi, %rdx +- movq %rdx, %rsi +- andl $(PAGE_SIZE - 1), %esi +- /* Number of bytes before page crossing. */ +- subq %rsi, %rcx +- /* Number of VEC_SIZE * 4 blocks before page crossing. */ +- shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx +- /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ +- movl %ecx, %esi +- jmp L(loop_start) + ++ /* Align s1 and adjust s2 accordingly. */ ++ subq %rdi, %rsi ++ andq $-(VEC_SIZE * 4), %rdi ++L(prepare_loop_readj): ++ addq %rdi, %rsi ++# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP) ++ subq %rdi, %rdx ++# endif ++ ++L(prepare_loop_aligned): ++ /* eax stores distance from rsi to next page cross. These cases ++ need to be handled specially as the 4x loop could potentially ++ read memory past the length of s1 or s2 and across a page ++ boundary. */ ++ movl $-(VEC_SIZE * 4), %eax ++ subl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ ++ vpxorq %YMMZERO, %YMMZERO, %YMMZERO ++ ++ /* Loop 4x comparisons at a time. */ + .p2align 4 + L(loop): ++ ++ /* End condition for strncmp. */ + # ifdef USE_AS_STRNCMP +- /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease +- the maximum offset (%r11) by the same amount. */ +- subq $(VEC_SIZE * 4), %r11 +- jbe L(zero) ++ subq $(CHAR_PER_VEC * 4), %rdx ++ jbe L(ret_zero) + # endif +- addq $(VEC_SIZE * 4), %rax +- addq $(VEC_SIZE * 4), %rdx +-L(loop_start): +- testl %esi, %esi +- leal -1(%esi), %esi +- je L(loop_cross_page) +-L(back_to_loop): +- /* Main loop, comparing 4 vectors are a time. */ +- VMOVA (%rax), %YMM0 +- VMOVA VEC_SIZE(%rax), %YMM2 +- VMOVA (VEC_SIZE * 2)(%rax), %YMM4 +- VMOVA (VEC_SIZE * 3)(%rax), %YMM6 ++ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ ++ /* Check if rsi loads will cross a page boundary. */ ++ addl $-(VEC_SIZE * 4), %eax ++ jnb L(page_cross_during_loop) ++ ++ /* Loop entry after handling page cross during loop. */ ++L(loop_skip_page_cross_check): ++ VMOVA (VEC_SIZE * 0)(%rdi), %YMM0 ++ VMOVA (VEC_SIZE * 1)(%rdi), %YMM2 ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 + + VPMINU %YMM0, %YMM2, %YMM8 + VPMINU %YMM4, %YMM6, %YMM9 + +- /* A zero CHAR in YMM8 means that there is a null CHAR. */ +- VPMINU %YMM8, %YMM9, %YMM8 ++ /* A zero CHAR in YMM9 means that there is a null CHAR. */ ++ VPMINU %YMM8, %YMM9, %YMM9 + + /* Each bit set in K1 represents a non-null CHAR in YMM8. */ +- VPTESTM %YMM8, %YMM8, %k1 ++ VPTESTM %YMM9, %YMM9, %k1 + +- /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. 
*/ +- vpxorq (%rdx), %YMM0, %YMM1 +- vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3 +- vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5 +- vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7 ++ vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1 ++ vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3 ++ vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 ++ /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while ++ oring with YMM1. Result is stored in YMM6. */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6 + +- vporq %YMM1, %YMM3, %YMM9 +- vporq %YMM5, %YMM7, %YMM10 ++ /* Or together YMM3, YMM5, and YMM6. */ ++ vpternlogd $0xfe, %YMM3, %YMM5, %YMM6 + +- /* A non-zero CHAR in YMM9 represents a mismatch. */ +- vporq %YMM9, %YMM10, %YMM9 + +- /* Each bit cleared in K0 represents a mismatch or a null CHAR. */ +- VPCMP $0, %YMMZERO, %YMM9, %k0{%k1} +- kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(loop) ++ /* A non-zero CHAR in YMM6 represents a mismatch. */ ++ VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ kmovd %k0, %LOOP_REG + +- /* Each bit set in K1 represents a non-null CHAR in YMM0. */ ++ TESTEQ %LOOP_REG ++ jz L(loop) ++ ++ ++ /* Find which VEC has the mismatch of end of string. */ + VPTESTM %YMM0, %YMM0, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM0 and (%rdx). */ + VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(test_vec) +- tzcntl %ecx, %ecx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +-# ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# endif +- ret ++ TESTEQ %ecx ++ jnz L(return_vec_0_end) + +- .p2align 4 +-L(test_vec): +-# ifdef USE_AS_STRNCMP +- /* The first vector matched. Return 0 if the maximum offset +- (%r11) <= VEC_SIZE. */ +- cmpq $VEC_SIZE, %r11 +- jbe L(zero) +-# endif +- /* Each bit set in K1 represents a non-null CHAR in YMM2. */ + VPTESTM %YMM2, %YMM2, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM2 and VEC_SIZE(%rdx). */ + VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(test_2_vec) +- tzcntl %ecx, %edi +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- sall $2, %edi +-# endif +-# ifdef USE_AS_STRNCMP +- addq $VEC_SIZE, %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl VEC_SIZE(%rsi, %rdi), %ecx +- cmpl VEC_SIZE(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl VEC_SIZE(%rax, %rdi), %eax +- movzbl VEC_SIZE(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif +-# endif +- ret ++ TESTEQ %ecx ++ jnz L(return_vec_1_end) + +- .p2align 4 +-L(test_2_vec): ++ ++ /* Handle VEC 2 and 3 without branches. */ ++L(return_vec_2_3_end): + # ifdef USE_AS_STRNCMP +- /* The first 2 vectors matched. Return 0 if the maximum offset +- (%r11) <= 2 * VEC_SIZE. */ +- cmpq $(VEC_SIZE * 2), %r11 +- jbe L(zero) ++ subq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero_end) + # endif +- /* Each bit set in K1 represents a non-null CHAR in YMM4. */ ++ + VPTESTM %YMM4, %YMM4, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM4 and (VEC_SIZE * 2)(%rdx). */ + VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ TESTEQ %ecx ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %LOOP_REG ++ orl %ecx, %LOOP_REG + # else +- incl %ecx ++ salq $CHAR_PER_VEC, %LOOP_REG64 ++ orq %rcx, %LOOP_REG64 ++# endif ++L(return_vec_3_end): ++ /* LOOP_REG contains matches for null/mismatch from the loop. If ++ VEC 0,1,and 2 all have no null and no mismatches then mismatch ++ must entirely be from VEC 3 which is fully represented by ++ LOOP_REG. */ ++# if CHAR_PER_VEC <= 16 ++ tzcntl %LOOP_REG, %LOOP_REG ++# else ++ tzcntq %LOOP_REG64, %LOOP_REG64 ++# endif ++# ifdef USE_AS_STRNCMP ++ cmpq %LOOP_REG64, %rdx ++ jbe L(ret_zero_end) + # endif +- je L(test_3_vec) +- tzcntl %ecx, %edi ++ + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edi ++ movl (VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx ++ xorl %eax, %eax ++ cmpl (VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx ++ je L(ret5) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret5): ++ ret ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ .p2align 4,, 2 ++L(ret_zero_end): + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) ++ ret ++# endif ++ ++ ++ /* The L(return_vec_N_end) differ from L(return_vec_N) in that ++ they use the value of `r8` to negate the return value. This is ++ because the page cross logic can swap `rdi` and `rsi`. 
*/ ++ .p2align 4,, 10 ++# ifdef USE_AS_STRNCMP ++L(return_vec_1_end): ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %ecx + # else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax ++ salq $CHAR_PER_VEC, %rcx + # endif ++# endif ++L(return_vec_0_end): ++# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) ++ tzcntl %ecx, %ecx + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx +- cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ tzcntq %rcx, %rcx + # endif +- ret + +- .p2align 4 +-L(test_3_vec): + # ifdef USE_AS_STRNCMP +- /* The first 3 vectors matched. Return 0 if the maximum offset +- (%r11) <= 3 * VEC_SIZE. */ +- cmpq $(VEC_SIZE * 3), %r11 +- jbe L(zero) ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_end) + # endif +- /* Each bit set in K1 represents a non-null CHAR in YMM6. */ +- VPTESTM %YMM6, %YMM6, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM6 and (VEC_SIZE * 3)(%rdx). */ +- VPCMP $0, %YMMZERO, %YMM7, %k0{%k1} +- kmovd %k0, %ecx ++ + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret6) ++ setl %al ++ negl %eax ++ /* This is the non-zero case for `eax` so just xorl with `r8d` ++ flip is `rdi` and `rsi` where swapped. */ ++ xorl %r8d, %eax + # else +- incl %ecx ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ /* Flip `eax` if `rdi` and `rsi` where swapped in page cross ++ logic. Subtract `r8d` after xor for zero case. */ ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret6): ++ ret ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_1_end): + tzcntl %ecx, %ecx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 3), %rcx +- cmpq %rcx, %r11 +- jbe L(zero) + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %esi +- cmpl (%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rsi, %rcx), %esi +- cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi +- jne L(wcscmp_return) ++ cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret7) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +- movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx +- subl %edx, %eax ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +-# endif ++L(ret7): + ret +- +- .p2align 4 +-L(loop_cross_page): +- xorl %r10d, %r10d +- movq %rdx, %rcx +- /* Align load via RDX. We load the extra ECX bytes which should +- be ignored. */ +- andl $((VEC_SIZE * 4) - 1), %ecx +- /* R10 is -RCX. */ +- subq %rcx, %r10 +- +- /* This works only if VEC_SIZE * 2 == 64. */ +-# if (VEC_SIZE * 2) != 64 +-# error (VEC_SIZE * 2) != 64 + # endif + +- /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ +- cmpl $(VEC_SIZE * 2), %ecx +- jge L(loop_cross_page_2_vec) + +- VMOVU (%rax, %r10), %YMM2 +- VMOVU VEC_SIZE(%rax, %r10), %YMM3 ++ /* Page cross in rsi in next 4x VEC. 
*/ + +- /* Each bit set in K2 represents a non-null CHAR in YMM2. */ +- VPTESTM %YMM2, %YMM2, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM2 and 32 bytes at (%rdx, %r10). */ +- VPCMP $0, (%rdx, %r10), %YMM2, %k1{%k2} +- kmovd %k1, %r9d +- /* Don't use subl since it is the lower 16/32 bits of RDI +- below. */ +- notl %r9d +-# ifdef USE_AS_WCSCMP +- /* Only last 8 bits are valid. */ +- andl $0xff, %r9d +-# endif ++ /* TODO: Improve logic here. */ ++ .p2align 4,, 10 ++L(page_cross_during_loop): ++ /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ + +- /* Each bit set in K4 represents a non-null CHAR in YMM3. */ +- VPTESTM %YMM3, %YMM3, %k4 +- /* Each bit cleared in K3 represents a mismatch or a null CHAR +- in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */ +- VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4} +- kmovd %k3, %edi +- /* Must use notl %edi here as lower bits are for CHAR +- comparisons potentially out of range thus can be 0 without +- indicating mismatch. */ +- notl %edi +-# ifdef USE_AS_WCSCMP +- /* Don't use subl since it is the upper 8 bits of EDI below. */ +- andl $0xff, %edi +-# endif ++ /* Optimistically rsi and rdi and both aligned in which case we ++ don't need any logic here. */ ++ cmpl $-(VEC_SIZE * 4), %eax ++ /* Don't adjust eax before jumping back to loop and we will ++ never hit page cross case again. */ ++ je L(loop_skip_page_cross_check) + +-# ifdef USE_AS_WCSCMP +- /* NB: Each bit in EDI/R9D represents 4-byte element. */ +- sall $8, %edi +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 +- bytes. */ +- movl %ecx, %SHIFT_REG32 +- sarl $2, %SHIFT_REG32 +- +- /* Each bit in EDI represents a null CHAR or a mismatch. */ +- orl %r9d, %edi +-# else +- salq $32, %rdi ++ /* Check if we can safely load a VEC. */ ++ cmpl $-(VEC_SIZE * 3), %eax ++ jle L(less_1x_vec_till_page_cross) + +- /* Each bit in RDI represents a null CHAR or a mismatch. */ +- orq %r9, %rdi +-# endif ++ VMOVA (%rdi), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_0_end) ++ ++ /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ ++ cmpl $-(VEC_SIZE * 2), %eax ++ jg L(more_2x_vec_till_page_cross) ++ ++ .p2align 4,, 4 ++L(less_1x_vec_till_page_cross): ++ subl $-(VEC_SIZE * 4), %eax ++ /* Guranteed safe to read from rdi - VEC_SIZE here. The only ++ concerning case is first iteration if incoming s1 was near start ++ of a page and s2 near end. If s1 was near the start of the page ++ we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe ++ to read back -VEC_SIZE. If rdi is truly at the start of a page ++ here, it means the previous page (rdi - VEC_SIZE) has already ++ been loaded earlier so must be valid. */ ++ VMOVU -VEC_SIZE(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2} ++ ++ /* Mask of potentially valid bits. The lower bits can be out of ++ range comparisons (but safe regarding page crosses). */ + +- /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ +- shrxq %SHIFT_REG64, %rdi, %rdi +- testq %rdi, %rdi +- je L(loop_cross_page_2_vec) +- tzcntq %rdi, %rcx + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- sall $2, %ecx ++ movl $-1, %r10d ++ movl %esi, %ecx ++ andl $(VEC_SIZE - 1), %ecx ++ shrl $2, %ecx ++ shlxl %ecx, %r10d, %ecx ++ movzbl %cl, %r10d ++# else ++ movl $-1, %ecx ++ shlxl %esi, %ecx, %r10d + # endif ++ ++ kmovd %k1, %ecx ++ notl %ecx ++ ++ + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) ++ movl %eax, %r11d ++ shrl $2, %r11d ++ cmpq %r11, %rdx + # else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax ++ cmpq %rax, %rdx + # endif ++ jbe L(return_page_cross_end_check) ++# endif ++ movl %eax, %OFFSET_REG ++ ++ /* Readjust eax before potentially returning to the loop. */ ++ addl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ ++ andl %r10d, %ecx ++ jz L(loop_skip_page_cross_check) ++ ++ .p2align 4,, 3 ++L(return_page_cross_end): ++ tzcntl %ecx, %ecx ++ ++# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP) ++ leal -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx ++L(return_page_cross_cmp_mem): + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ addl %OFFSET_REG, %ecx ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret8) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret8): + ret + +- .p2align 4 +-L(loop_cross_page_2_vec): +- /* The first VEC_SIZE * 2 bytes match or are ignored. */ +- VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 +- VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_page_cross_end_check): ++ tzcntl %ecx, %ecx ++ leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx ++# ifdef USE_AS_WCSCMP ++ sall $2, %edx ++# endif ++ cmpl %ecx, %edx ++ ja L(return_page_cross_cmp_mem) ++ xorl %eax, %eax ++ ret ++# endif ++ + ++ .p2align 4,, 10 ++L(more_2x_vec_till_page_cross): ++ /* If more 2x vec till cross we will complete a full loop ++ iteration here. */ ++ ++ VMOVA VEC_SIZE(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10). */ +- VPCMP $0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2} +- kmovd %k1, %r9d +- /* Don't use subl since it is the lower 16/32 bits of RDI +- below. */ +- notl %r9d +-# ifdef USE_AS_WCSCMP +- /* Only last 8 bits are valid. */ +- andl $0xff, %r9d +-# endif ++ VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_1_end) + +- VPTESTM %YMM1, %YMM1, %k4 +- /* Each bit cleared in K3 represents a mismatch or a null CHAR +- in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */ +- VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4} +- kmovd %k3, %edi +- /* Must use notl %edi here as lower bits are for CHAR +- comparisons potentially out of range thus can be 0 without +- indicating mismatch. */ +- notl %edi +-# ifdef USE_AS_WCSCMP +- /* Don't use subl since it is the upper 8 bits of EDI below. 
*/ +- andl $0xff, %edi ++# ifdef USE_AS_STRNCMP ++ cmpq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero_in_loop_page_cross) + # endif + +-# ifdef USE_AS_WCSCMP +- /* NB: Each bit in EDI/R9D represents 4-byte element. */ +- sall $8, %edi ++ subl $-(VEC_SIZE * 4), %eax + +- /* Each bit in EDI represents a null CHAR or a mismatch. */ +- orl %r9d, %edi +-# else +- salq $32, %rdi ++ /* Safe to include comparisons from lower bytes. */ ++ VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_page_cross_0) ++ ++ VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_page_cross_1) + +- /* Each bit in RDI represents a null CHAR or a mismatch. */ +- orq %r9, %rdi ++# ifdef USE_AS_STRNCMP ++ /* Must check length here as length might proclude reading next ++ page. */ ++# ifdef USE_AS_WCSCMP ++ movl %eax, %r11d ++ shrl $2, %r11d ++ cmpq %r11, %rdx ++# else ++ cmpq %rax, %rdx ++# endif ++ jbe L(ret_zero_in_loop_page_cross) + # endif + +- xorl %r8d, %r8d +- /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ +- subl $(VEC_SIZE * 2), %ecx +- jle 1f +- /* R8 has number of bytes skipped. */ +- movl %ecx, %r8d +-# ifdef USE_AS_WCSCMP +- /* NB: Divide shift count by 4 since each bit in RDI represent 4 +- bytes. */ +- sarl $2, %ecx +- /* Skip ECX bytes. */ +- shrl %cl, %edi ++ /* Finish the loop. */ ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 ++ VPMINU %YMM4, %YMM6, %YMM9 ++ VPTESTM %YMM9, %YMM9, %k1 ++ ++ vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 ++ /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6 ++ ++ VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ kmovd %k0, %LOOP_REG ++ TESTEQ %LOOP_REG ++ jnz L(return_vec_2_3_end) ++ ++ /* Best for code size to include ucond-jmp here. Would be faster ++ if this case is hot to duplicate the L(return_vec_2_3_end) code ++ as fall-through and have jump back to loop on mismatch ++ comparison. */ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $(PAGE_SIZE - VEC_SIZE * 8), %eax ++# ifdef USE_AS_STRNCMP ++ subq $(CHAR_PER_VEC * 4), %rdx ++ ja L(loop_skip_page_cross_check) ++L(ret_zero_in_loop_page_cross): ++ xorl %eax, %eax ++ ret + # else +- /* Skip ECX bytes. */ +- shrq %cl, %rdi ++ jmp L(loop_skip_page_cross_check) + # endif +-1: +- /* Before jumping back to the loop, set ESI to the number of +- VEC_SIZE * 4 blocks before page crossing. */ +- movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi + +- testq %rdi, %rdi +-# ifdef USE_AS_STRNCMP +- /* At this point, if %rdi value is 0, it already tested +- VEC_SIZE*4+%r10 byte starting from %rax. This label +- checks whether strncmp maximum offset reached or not. */ +- je L(string_nbyte_offset_check) ++ ++ .p2align 4,, 10 ++L(return_vec_page_cross_0): ++ addl $-VEC_SIZE, %eax ++L(return_vec_page_cross_1): ++ tzcntl %ecx, %ecx ++# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP ++ leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx ++# ifdef USE_AS_STRNCMP ++# ifdef USE_AS_WCSCMP ++ /* Must divide ecx instead of multiply rdx due to overflow. 
*/ ++ movl %ecx, %eax ++ shrl $2, %eax ++ cmpq %rax, %rdx ++# else ++ cmpq %rcx, %rdx ++# endif ++ jbe L(ret_zero_in_loop_page_cross) ++# endif + # else +- je L(back_to_loop) ++ addl %eax, %ecx + # endif +- tzcntq %rdi, %rcx ++ + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +- addq %r10, %rcx +- /* Adjust for number of bytes skipped. */ +- addq %r8, %rcx +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rcx +- subq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret9) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rcx), %edi +- cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret9): + ret + +-# ifdef USE_AS_STRNCMP +-L(string_nbyte_offset_check): +- leaq (VEC_SIZE * 4)(%r10), %r10 +- cmpq %r10, %r11 +- jbe L(zero) +- jmp L(back_to_loop) ++ ++ .p2align 4,, 10 ++L(page_cross): ++# ifndef USE_AS_STRNCMP ++ /* If both are VEC aligned we don't need any special logic here. ++ Only valid for strcmp where stop condition is guranteed to be ++ reachable by just reading memory. */ ++ testl $((VEC_SIZE - 1) << 20), %eax ++ jz L(no_page_cross) + # endif + +- .p2align 4 +-L(cross_page_loop): +- /* Check one byte/dword at a time. */ ++ movl %edi, %eax ++ movl %esi, %ecx ++ andl $(PAGE_SIZE - 1), %eax ++ andl $(PAGE_SIZE - 1), %ecx ++ ++ xorl %OFFSET_REG, %OFFSET_REG ++ ++ /* Check which is closer to page cross, s1 or s2. */ ++ cmpl %eax, %ecx ++ jg L(page_cross_s2) ++ ++ /* The previous page cross check has false positives. Check for ++ true positive as page cross logic is very expensive. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ jbe L(no_page_cross) ++ ++ ++ /* Set r8 to not interfere with normal return value (rdi and rsi ++ did not swap). */ + # ifdef USE_AS_WCSCMP +- cmpl %ecx, %eax ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d + # else +- subl %ecx, %eax ++ xorl %r8d, %r8d + # endif +- jne L(different) +- addl $SIZE_OF_CHAR, %edx +- cmpl $(VEC_SIZE * 4), %edx +- je L(main_loop_header) ++ ++ /* Check if less than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jg L(less_1x_vec_till_page) ++ ++ ++ /* If more than 1x VEC till page cross, loop throuh safely ++ loadable memory until within 1x VEC of page cross. 
*/ ++ .p2align 4,, 8 ++L(page_cross_loop): ++ VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(check_ret_vec_page_cross) ++ addl $CHAR_PER_VEC, %OFFSET_REG + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross) + # endif ++ addl $VEC_SIZE, %eax ++ jl L(page_cross_loop) ++ + # ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx ++ shrl $2, %eax + # endif +- /* Check null CHAR. */ +- testl %eax, %eax +- jne L(cross_page_loop) +- /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +- comparisons. */ +- subl %ecx, %eax +-# ifndef USE_AS_WCSCMP +-L(different): ++ ++ ++ subl %eax, %OFFSET_REG ++ /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed ++ to not cross page so is safe to load. Since we have already ++ loaded at least 1 VEC from rsi it is also guranteed to be safe. ++ */ ++ VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ ++ kmovd %k1, %ecx ++# ifdef USE_AS_STRNCMP ++ leal CHAR_PER_VEC(%OFFSET_REG64), %eax ++ cmpq %rax, %rdx ++ jbe L(check_ret_vec_page_cross2) ++# ifdef USE_AS_WCSCMP ++ addq $-(CHAR_PER_VEC * 2), %rdx ++# else ++ addq %rdi, %rdx ++# endif + # endif +- ret ++ TESTEQ %ecx ++ jz L(prepare_loop_no_len) + ++ .p2align 4,, 4 ++L(ret_vec_page_cross): ++# ifndef USE_AS_STRNCMP ++L(check_ret_vec_page_cross): ++# endif ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++L(ret_vec_page_cross_cont): + # ifdef USE_AS_WCSCMP +- .p2align 4 +-L(different): +- /* Use movl to avoid modifying EFLAGS. */ +- movl $0, %eax ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret12) + setl %al + negl %eax +- orl $1, %eax +- ret ++ xorl %r8d, %eax ++# else ++ movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax ++ movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret12): ++ ret ++ + + # ifdef USE_AS_STRNCMP +- .p2align 4 +-L(zero): ++ .p2align 4,, 10 ++L(check_ret_vec_page_cross2): ++ TESTEQ %ecx ++L(check_ret_vec_page_cross): ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++ cmpq %rcx, %rdx ++ ja L(ret_vec_page_cross_cont) ++ .p2align 4,, 2 ++L(ret_zero_page_cross): + xorl %eax, %eax + ret ++# endif + +- .p2align 4 +-L(char0): +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +-# endif +- ret ++ .p2align 4,, 4 ++L(page_cross_s2): ++ /* Ensure this is a true page cross. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx ++ jbe L(no_page_cross) ++ ++ ++ movl %ecx, %eax ++ movq %rdi, %rcx ++ movq %rsi, %rdi ++ movq %rcx, %rsi ++ ++ /* set r8 to negate return value as rdi and rsi swapped. */ ++# ifdef USE_AS_WCSCMP ++ movl $-4, %r8d ++# else ++ movl $-1, %r8d + # endif ++ xorl %OFFSET_REG, %OFFSET_REG + +- .p2align 4 +-L(last_vector): +- addq %rdx, %rdi +- addq %rdx, %rsi +-# ifdef USE_AS_STRNCMP +- subq %rdx, %r11 ++ /* Check if more than 1x VEC till page cross. 
*/ ++ subl $(VEC_SIZE * 3), %eax ++ jle L(page_cross_loop) ++ ++ .p2align 4,, 6 ++L(less_1x_vec_till_page): ++# ifdef USE_AS_WCSCMP ++ shrl $2, %eax + # endif +- tzcntl %ecx, %edx ++ /* Find largest load size we can use. */ ++ cmpl $(16 / SIZE_OF_CHAR), %eax ++ ja L(less_16_till_page) ++ ++ /* Use 16 byte comparison. */ ++ vmovdqu (%rdi), %xmm0 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, (%rsi), %xmm0, %k1{%k2} ++ kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx ++ subl $0xf, %ecx ++# else ++ incw %cx + # endif ++ jnz L(check_ret_vec_page_cross) ++ movl $(16 / SIZE_OF_CHAR), %OFFSET_REG + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subl %eax, %OFFSET_REG ++# else ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG ++ jz L(prepare_loop) + # endif ++ vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2} ++ kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ subl $0xf, %ecx + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ incw %cx + # endif ++ jnz L(check_ret_vec_page_cross) ++# ifdef USE_AS_STRNCMP ++ addl $(16 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case0): ++ xorl %eax, %eax + ret ++# endif + +- /* Comparing on page boundary region requires special treatment: +- It must done one vector at the time, starting with the wider +- ymm vector if possible, if not, with xmm. If fetching 16 bytes +- (xmm) still passes the boundary, byte comparison must be done. +- */ +- .p2align 4 +-L(cross_page): +- /* Try one ymm vector at a time. */ +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(cross_page_1_vector) +-L(loop_1_vector): +- VMOVU (%rdi, %rdx), %YMM0 + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (%rsi, %rdx). */ +- VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2} ++ .p2align 4,, 10 ++L(less_16_till_page): ++ cmpl $(24 / SIZE_OF_CHAR), %eax ++ ja L(less_8_till_page) ++ ++ /* Use 8 byte comparison. */ ++ vmovq (%rdi), %xmm0 ++ vmovq (%rsi), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ subl $0x3, %ecx + # else +- incl %ecx ++ incb %cl + # endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) + +- addl $VEC_SIZE, %edx + +- addl $VEC_SIZE, %eax + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $(8 / SIZE_OF_CHAR), %rdx ++ jbe L(ret_zero_page_cross_slow_case0) + # endif +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jle L(loop_1_vector) +-L(cross_page_1_vector): +- /* Less than 32 bytes to check, try one xmm vector. 
*/ +- cmpl $(PAGE_SIZE - 16), %eax +- jg L(cross_page_1_xmm) +- VMOVU (%rdi, %rdx), %XMM0 ++ movl $(24 / SIZE_OF_CHAR), %OFFSET_REG ++ subl %eax, %OFFSET_REG + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and 16 bytes at (%rsi, %rdx). */ +- VPCMP $0, (%rsi, %rdx), %XMM0, %k1{%k2} ++ vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- subl $0xf, %ecx ++ subl $0x3, %ecx + # else +- subl $0xffff, %ecx ++ incb %cl + # endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) ++ + +- addl $16, %edx +-# ifndef USE_AS_WCSCMP +- addl $16, %eax +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ addl $(8 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi + # endif ++ jmp L(prepare_loop_aligned) + +-L(cross_page_1_xmm): +-# ifndef USE_AS_WCSCMP +- /* Less than 16 bytes to check, try 8 byte vector. NB: No need +- for wcscmp nor wcsncmp since wide char is 4 bytes. */ +- cmpl $(PAGE_SIZE - 8), %eax +- jg L(cross_page_8bytes) +- vmovq (%rdi, %rdx), %XMM0 +- vmovq (%rsi, %rdx), %XMM1 + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and XMM1. */ +- VPCMP $0, %XMM1, %XMM0, %k1{%k2} +- kmovb %k1, %ecx ++ ++ ++ .p2align 4,, 10 ++L(less_8_till_page): + # ifdef USE_AS_WCSCMP +- subl $0x3, %ecx ++ /* If using wchar then this is the only check before we reach ++ the page boundary. */ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ cmpl %ecx, %eax ++ jnz L(ret_less_8_wcs) ++# ifdef USE_AS_STRNCMP ++ addq $-(CHAR_PER_VEC * 2), %rdx ++ /* We already checked for len <= 1 so cannot hit that case here. ++ */ ++# endif ++ testl %eax, %eax ++ jnz L(prepare_loop) ++ ret ++ ++ .p2align 4,, 8 ++L(ret_less_8_wcs): ++ setl %OFFSET_REG8 ++ negl %OFFSET_REG ++ movl %OFFSET_REG, %eax ++ xorl %r8d, %eax ++ ret ++ + # else +- subl $0xff, %ecx +-# endif +- jne L(last_vector) ++ cmpl $28, %eax ++ ja L(less_4_till_page) ++ ++ vmovd (%rdi), %xmm0 ++ vmovd (%rsi), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ kmovd %k1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) + +- addl $8, %edx +- addl $8, %eax + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $4, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) + # endif ++ movl $(28 / SIZE_OF_CHAR), %OFFSET_REG ++ subl %eax, %OFFSET_REG + +-L(cross_page_8bytes): +- /* Less than 8 bytes to check, try 4 byte vector. */ +- cmpl $(PAGE_SIZE - 4), %eax +- jg L(cross_page_4bytes) +- vmovd (%rdi, %rdx), %XMM0 +- vmovd (%rsi, %rdx), %XMM1 +- +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and XMM1. 
*/ +- VPCMP $0, %XMM1, %XMM0, %k1{%k2} ++ vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0x1, %ecx +-# else + subl $0xf, %ecx +-# endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) ++# ifdef USE_AS_STRNCMP ++ addl $(4 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ + +- addl $4, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case1): ++ xorl %eax, %eax ++ ret + # endif + +-L(cross_page_4bytes): +-# endif +- /* Less than 4 bytes to check, try one byte/dword at a time. */ +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) +-# endif +-# ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx +-# endif +- testl %eax, %eax +- jne L(cross_page_loop) ++ .p2align 4,, 10 ++L(less_4_till_page): ++ subq %rdi, %rsi ++ /* Extremely slow byte comparison loop. */ ++L(less_4_loop): ++ movzbl (%rdi), %eax ++ movzbl (%rsi, %rdi), %ecx + subl %ecx, %eax ++ jnz L(ret_less_4_loop) ++ testl %ecx, %ecx ++ jz L(ret_zero_4_loop) ++# ifdef USE_AS_STRNCMP ++ decq %rdx ++ jz L(ret_zero_4_loop) ++# endif ++ incq %rdi ++ /* end condition is reach page boundary (rdi is aligned). */ ++ testl $31, %edi ++ jnz L(less_4_loop) ++ leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi ++ addq $-(VEC_SIZE * 4), %rdi ++# ifdef USE_AS_STRNCMP ++ subq $-(CHAR_PER_VEC * 4), %rdx ++# endif ++ jmp L(prepare_loop_aligned) ++ ++L(ret_zero_4_loop): ++ xorl %eax, %eax ++ ret ++L(ret_less_4_loop): ++ xorl %r8d, %eax ++ subl %r8d, %eax + ret +-END (STRCMP) ++# endif ++END(STRCMP) + #endif +-- +GitLab + diff --git a/SOURCES/ia-opt-strcspn_strpbrk-strcspn-c.patch b/SOURCES/ia-opt-strcspn_strpbrk-strcspn-c.patch new file mode 100644 index 0000000..0fb63f7 --- /dev/null +++ b/SOURCES/ia-opt-strcspn_strpbrk-strcspn-c.patch @@ -0,0 +1,148 @@ +From 36926710d4ddab6f7d5fa9559cd5e70ccc95e13a Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:22 -0500 +Subject: [PATCH] x86: Optimize strcspn and strpbrk in strcspn-c.c + +Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of +_mm_cmpistri. Also change offset to unsigned to avoid unnecessary +sign extensions. + +geometric_mean(N=20) of all benchmarks that dont fallback on +sse2/strlen; New / Original: .928 + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu + +(cherry picked from commit 30d627d477d7255345a4b713cf352ac32d644d61) +--- + sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++--------------- + 1 file changed, 37 insertions(+), 46 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c +index 857af104..6cce4296 100644 +--- a/sysdeps/x86_64/multiarch/strcspn-c.c ++++ b/sysdeps/x86_64/multiarch/strcspn-c.c +@@ -85,83 +85,74 @@ STRCSPN_SSE42 (const char *s, const char *a) + RETURN (NULL, strlen (s)); + + const char *aligned; +- __m128i mask; +- int offset = (int) ((size_t) a & 15); ++ __m128i mask, maskz, zero; ++ unsigned int maskz_bits; ++ unsigned int offset = (unsigned int) ((size_t) a & 15); ++ zero = _mm_set1_epi8 (0); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); +- +- mask = __m128i_shift_right (mask0, offset); ++ maskz = _mm_cmpeq_epi8 (mask0, zero); + + /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16 - offset) +- { +- /* There is no NULL terminator. */ +- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); +- int index = _mm_cmpistri (mask1, mask1, 0x3a); +- length += index; +- +- /* Don't use SSE4.2 if the length of A > 16. */ +- if (length > 16) +- return STRCSPN_SSE2 (s, a); +- +- if (index != 0) +- { +- /* Combine mask0 and mask1. We could play games with +- palignr, but frankly this data should be in L1 now +- so do the merge via an unaligned load. */ +- mask = _mm_loadu_si128 ((__m128i *) a); +- } +- } ++ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; ++ if (maskz_bits != 0) ++ { ++ mask = __m128i_shift_right (mask0, offset); ++ offset = (unsigned int) ((size_t) s & 15); ++ if (offset) ++ goto start_unaligned; ++ ++ aligned = s; ++ goto start_loop; ++ } + } +- else +- { +- /* A is aligned. */ +- mask = _mm_load_si128 ((__m128i *) a); + +- /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16) +- { +- /* There is no NULL terminator. Don't use SSE4.2 if the length +- of A > 16. */ +- if (a[16] != 0) +- return STRCSPN_SSE2 (s, a); +- } ++ /* A is aligned. */ ++ mask = _mm_loadu_si128 ((__m128i *) a); ++ /* Find where the NULL terminator is. */ ++ maskz = _mm_cmpeq_epi8 (mask, zero); ++ maskz_bits = _mm_movemask_epi8 (maskz); ++ if (maskz_bits == 0) ++ { ++ /* There is no NULL terminator. Don't use SSE4.2 if the length ++ of A > 16. */ ++ if (a[16] != 0) ++ return STRCSPN_SSE2 (s, a); + } + +- offset = (int) ((size_t) s & 15); ++ aligned = s; ++ offset = (unsigned int) ((size_t) s & 15); + if (offset != 0) + { ++ start_unaligned: + /* Check partial string. */ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + value = __m128i_shift_right (value, offset); + +- int length = _mm_cmpistri (mask, value, 0x2); ++ unsigned int length = _mm_cmpistri (mask, value, 0x2); + /* No need to check ZFlag since ZFlag is always 1. */ +- int cflag = _mm_cmpistrc (mask, value, 0x2); ++ unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); + if (cflag) + RETURN ((char *) (s + length), length); + /* Find where the NULL terminator is. 
*/ +- int index = _mm_cmpistri (value, value, 0x3a); ++ unsigned int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + RETURN (NULL, index); + aligned += 16; + } +- else +- aligned = s; + ++start_loop: + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); +- int index = _mm_cmpistri (mask, value, 0x2); +- int cflag = _mm_cmpistrc (mask, value, 0x2); +- int zflag = _mm_cmpistrz (mask, value, 0x2); ++ unsigned int index = _mm_cmpistri (mask, value, 0x2); ++ unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); ++ unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); + if (cflag) + RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); + if (zflag) +-- +GitLab + diff --git a/SOURCES/ia-opt-strspn-strspn-c.patch b/SOURCES/ia-opt-strspn-strspn-c.patch new file mode 100644 index 0000000..aec44d3 --- /dev/null +++ b/SOURCES/ia-opt-strspn-strspn-c.patch @@ -0,0 +1,148 @@ +From cdcf8794677acba1fc38ac101bcf52deee23d91d Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:24 -0500 +Subject: [PATCH] x86: Optimize strspn in strspn-c.c + +Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of +_mm_cmpistri. Also change offset to unsigned to avoid unnecessary +sign extensions. + +geometric_mean(N=20) of all benchmarks that dont fallback on +sse2; New / Original: .901 + +All string/memory tests pass. +Reviewed-by: H.J. Lu + +(cherry picked from commit 412d10343168b05b8cf6c3683457cf9711d28046) +--- + sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++---------------- + 1 file changed, 39 insertions(+), 47 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c +index 4554cff0..87c5e4bf 100644 +--- a/sysdeps/x86_64/multiarch/strspn-c.c ++++ b/sysdeps/x86_64/multiarch/strspn-c.c +@@ -63,81 +63,73 @@ __strspn_sse42 (const char *s, const char *a) + return 0; + + const char *aligned; +- __m128i mask; +- int offset = (int) ((size_t) a & 15); ++ __m128i mask, maskz, zero; ++ unsigned int maskz_bits; ++ unsigned int offset = (int) ((size_t) a & 15); ++ zero = _mm_set1_epi8 (0); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); +- +- mask = __m128i_shift_right (mask0, offset); ++ maskz = _mm_cmpeq_epi8 (mask0, zero); + + /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16 - offset) +- { +- /* There is no NULL terminator. */ +- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); +- int index = _mm_cmpistri (mask1, mask1, 0x3a); +- length += index; +- +- /* Don't use SSE4.2 if the length of A > 16. */ +- if (length > 16) +- return __strspn_sse2 (s, a); +- +- if (index != 0) +- { +- /* Combine mask0 and mask1. We could play games with +- palignr, but frankly this data should be in L1 now +- so do the merge via an unaligned load. */ +- mask = _mm_loadu_si128 ((__m128i *) a); +- } +- } ++ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; ++ if (maskz_bits != 0) ++ { ++ mask = __m128i_shift_right (mask0, offset); ++ offset = (unsigned int) ((size_t) s & 15); ++ if (offset) ++ goto start_unaligned; ++ ++ aligned = s; ++ goto start_loop; ++ } + } +- else +- { +- /* A is aligned. */ +- mask = _mm_load_si128 ((__m128i *) a); + +- /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16) +- { +- /* There is no NULL terminator. Don't use SSE4.2 if the length +- of A > 16. 
*/ +- if (a[16] != 0) +- return __strspn_sse2 (s, a); +- } ++ /* A is aligned. */ ++ mask = _mm_loadu_si128 ((__m128i *) a); ++ ++ /* Find where the NULL terminator is. */ ++ maskz = _mm_cmpeq_epi8 (mask, zero); ++ maskz_bits = _mm_movemask_epi8 (maskz); ++ if (maskz_bits == 0) ++ { ++ /* There is no NULL terminator. Don't use SSE4.2 if the length ++ of A > 16. */ ++ if (a[16] != 0) ++ return __strspn_sse2 (s, a); + } ++ aligned = s; ++ offset = (unsigned int) ((size_t) s & 15); + +- offset = (int) ((size_t) s & 15); + if (offset != 0) + { ++ start_unaligned: + /* Check partial string. */ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); ++ __m128i adj_value = __m128i_shift_right (value, offset); + +- value = __m128i_shift_right (value, offset); +- +- int length = _mm_cmpistri (mask, value, 0x12); ++ unsigned int length = _mm_cmpistri (mask, adj_value, 0x12); + /* No need to check CFlag since it is always 1. */ + if (length < 16 - offset) + return length; + /* Find where the NULL terminator is. */ +- int index = _mm_cmpistri (value, value, 0x3a); +- if (index < 16 - offset) ++ maskz = _mm_cmpeq_epi8 (value, zero); ++ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; ++ if (maskz_bits != 0) + return length; + aligned += 16; + } +- else +- aligned = s; + ++start_loop: + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); +- int index = _mm_cmpistri (mask, value, 0x12); +- int cflag = _mm_cmpistrc (mask, value, 0x12); ++ unsigned int index = _mm_cmpistri (mask, value, 0x12); ++ unsigned int cflag = _mm_cmpistrc (mask, value, 0x12); + if (cflag) + return (size_t) (aligned + index - s); + aligned += 16; +-- +GitLab + diff --git a/SOURCES/ia-opt-strxcasecmp-avx2.patch b/SOURCES/ia-opt-strxcasecmp-avx2.patch new file mode 100644 index 0000000..27cca42 --- /dev/null +++ b/SOURCES/ia-opt-strxcasecmp-avx2.patch @@ -0,0 +1,760 @@ +From 92783628b724089230e9b4ecab872de807652efe Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 24 Mar 2022 18:56:12 -0500 +Subject: [PATCH] x86: Add AVX2 optimized str{n}casecmp + +geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702 + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu + +(cherry picked from commit bbf81222343fed5cd704001a2ae0d86c71544151) +--- + sysdeps/x86_64/multiarch/Makefile | 4 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 +++ + sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 12 + + .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S | 15 ++ + sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S | 23 ++ + sysdeps/x86_64/multiarch/strcmp-avx2.S | 237 +++++++++++++++--- + .../x86_64/multiarch/strncase_l-avx2-rtm.S | 16 ++ + sysdeps/x86_64/multiarch/strncase_l-avx2.S | 27 ++ + 8 files changed, 331 insertions(+), 31 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S + create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 8c9e7812..711ecf2e 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -51,6 +51,8 @@ sysdep_routines += \ + stpncpy-sse2-unaligned \ + stpncpy-ssse3 \ + strcasecmp_l-avx \ ++ strcasecmp_l-avx2 \ ++ strcasecmp_l-avx2-rtm \ + strcasecmp_l-sse2 \ + strcasecmp_l-sse4_2 \ + strcasecmp_l-ssse3 \ +@@ -89,6 +91,8 @@ sysdep_routines += \ + strlen-evex \ + strlen-sse2 \ + strncase_l-avx \ ++ strncase_l-avx2 \ ++ strncase_l-avx2-rtm \ + strncase_l-sse2 \ + strncase_l-sse4_2 \ + strncase_l-ssse3 \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index c963d391..d873e1be 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -418,6 +418,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ + IFUNC_IMPL (i, name, strcasecmp, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strcasecmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcasecmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX), + __strcasecmp_avx) +@@ -431,6 +438,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ + IFUNC_IMPL (i, name, strcasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strcasecmp_l_avx2) ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcasecmp_l_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + CPU_FEATURE_USABLE (AVX), + __strcasecmp_l_avx) +@@ -558,6 +572,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strncasecmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncasecmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX), + __strncasecmp_avx) +@@ -572,6 +593,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. 
*/ + IFUNC_IMPL (i, name, strncasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strncasecmp_l_avx2) ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncasecmp_l_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + CPU_FEATURE_USABLE (AVX), + __strncasecmp_l_avx) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +index 6a4bb078..926508c4 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } ++ + if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) + return OPTIMIZE (avx); + +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S +new file mode 100644 +index 00000000..09957fc3 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S +@@ -0,0 +1,15 @@ ++#ifndef STRCMP ++# define STRCMP __strcasecmp_l_avx2_rtm ++#endif ++ ++#define _GLABEL(x) x ## _rtm ++#define GLABEL(x) _GLABEL(x) ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strcasecmp_l-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S +new file mode 100644 +index 00000000..e2762f2a +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S +@@ -0,0 +1,23 @@ ++/* strcasecmp_l optimized with AVX2. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#ifndef STRCMP ++# define STRCMP __strcasecmp_l_avx2 ++#endif ++#define USE_AS_STRCASECMP_L ++#include "strcmp-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 782f9472..28cc98b6 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -20,6 +20,10 @@ + + # include + ++# if defined USE_AS_STRCASECMP_L ++# include "locale-defines.h" ++# endif ++ + # ifndef STRCMP + # define STRCMP __strcmp_avx2 + # endif +@@ -74,13 +78,88 @@ + # define VEC_OFFSET (-VEC_SIZE) + # endif + ++# ifdef USE_AS_STRCASECMP_L ++# define BYTE_LOOP_REG OFFSET_REG ++# else ++# define BYTE_LOOP_REG ecx ++# endif ++ ++# ifdef USE_AS_STRCASECMP_L ++# ifdef USE_AS_STRNCMP ++# define STRCASECMP __strncasecmp_avx2 ++# define LOCALE_REG rcx ++# define LOCALE_REG_LP RCX_LP ++# define STRCASECMP_NONASCII __strncasecmp_l_nonascii ++# else ++# define STRCASECMP __strcasecmp_avx2 ++# define LOCALE_REG rdx ++# define LOCALE_REG_LP RDX_LP ++# define STRCASECMP_NONASCII __strcasecmp_l_nonascii ++# endif ++# endif ++ + # define xmmZERO xmm15 + # define ymmZERO ymm15 + ++# define LCASE_MIN_ymm %ymm10 ++# define LCASE_MAX_ymm %ymm11 ++# define CASE_ADD_ymm %ymm12 ++ ++# define LCASE_MIN_xmm %xmm10 ++# define LCASE_MAX_xmm %xmm11 ++# define CASE_ADD_xmm %xmm12 ++ ++ /* r11 is never use elsewhere so this is safe to maintain. */ ++# define TOLOWER_BASE %r11 ++ + # ifndef SECTION + # define SECTION(p) p##.avx + # endif + ++# ifdef USE_AS_STRCASECMP_L ++# define REG(x, y) x ## y ++# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \ ++ vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \ ++ vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \ ++ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \ ++ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \ ++ vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \ ++ vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \ ++ vpaddb REG(%ext, 8), reg1_in, reg1_out; \ ++ vpaddb REG(%ext, 9), reg2_in, reg2_out ++ ++# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst ++# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm) ++# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm) ++ ++# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \ ++ TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \ ++ VPCMPEQ scratch_reg, s2_reg, reg_out ++ ++# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \ ++ VMOVU s2_mem, reg_out; \ ++ CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext) ++ ++# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm) ++# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm) ++ ++# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm) ++# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm) ++ ++# else ++# define TOLOWER_gpr(...) ++# define TOLOWER_ymm(...) ++# define TOLOWER_xmm(...) ++ ++# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \ ++ VPCMPEQ s2_reg, s1_reg, reg_out ++ ++# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__) ++ ++# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__) ++# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__) ++# endif ++ + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. + strcmp/strncmp have to use UNSIGNED comparison for elements. +@@ -102,8 +181,49 @@ + returned. 
*/ + + .section SECTION(.text), "ax", @progbits +-ENTRY(STRCMP) ++ .align 16 ++ .type STRCMP, @function ++ .globl STRCMP ++ .hidden STRCMP ++ ++# ifndef GLABEL ++# define GLABEL(...) __VA_ARGS__ ++# endif ++ ++# ifdef USE_AS_STRCASECMP_L ++ENTRY (GLABEL(STRCASECMP)) ++ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax ++ mov %fs:(%rax), %LOCALE_REG_LP ++ ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 ++END (GLABEL(STRCASECMP)) ++ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */ ++# endif ++ ++ .p2align 4 ++STRCMP: ++ cfi_startproc ++ _CET_ENDBR ++ CALL_MCOUNT ++ ++# if defined USE_AS_STRCASECMP_L ++ /* We have to fall back on the C implementation for locales with ++ encodings not matching ASCII for single bytes. */ ++# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 ++ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP ++# else ++ mov (%LOCALE_REG), %RAX_LP ++# endif ++ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) ++ jne STRCASECMP_NONASCII ++ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE ++# endif ++ + # ifdef USE_AS_STRNCMP ++ /* Don't overwrite LOCALE_REG (rcx) until we have pass ++ L(one_or_less). Otherwise we might use the wrong locale in ++ the OVERFLOW_STRCMP (strcasecmp_l). */ + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -128,6 +248,30 @@ ENTRY(STRCMP) + # endif + # endif + vpxor %xmmZERO, %xmmZERO, %xmmZERO ++# if defined USE_AS_STRCASECMP_L ++ .section .rodata.cst32, "aM", @progbits, 32 ++ .align 32 ++L(lcase_min): ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++L(lcase_max): ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++L(case_add): ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .previous ++ ++ vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm ++ vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm ++ vmovdqa L(case_add)(%rip), CASE_ADD_ymm ++# endif + movl %edi, %eax + orl %esi, %eax + sall $20, %eax +@@ -138,8 +282,10 @@ ENTRY(STRCMP) + L(no_page_cross): + /* Safe to compare 4x vectors. */ + VMOVU (%rdi), %ymm0 +- /* 1s where s1 and s2 equal. */ +- VPCMPEQ (%rsi), %ymm0, %ymm1 ++ /* 1s where s1 and s2 equal. Just VPCMPEQ if its not strcasecmp. ++ Otherwise converts ymm0 and load from rsi to lower. ymm2 is ++ scratch and ymm1 is the return. */ ++ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) + /* 1s at null CHAR. */ + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + /* 1s where s1 and s2 equal AND not null CHAR. */ +@@ -172,6 +318,8 @@ L(return_vec_0): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret0): +@@ -192,6 +340,10 @@ L(ret_zero): + + .p2align 4,, 5 + L(one_or_less): ++# ifdef USE_AS_STRCASECMP_L ++ /* Set locale argument for strcasecmp. 
*/ ++ movq %LOCALE_REG, %rdx ++# endif + jb L(ret_zero) + # ifdef USE_AS_WCSCMP + /* 'nbe' covers the case where length is negative (large +@@ -211,6 +363,8 @@ L(one_or_less): + jnbe __strcmp_avx2 + movzbl (%rdi), %eax + movzbl (%rsi), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret1): +@@ -238,6 +392,8 @@ L(return_vec_1): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret2): +@@ -269,6 +425,8 @@ L(return_vec_2): + # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret3): +@@ -289,6 +447,8 @@ L(return_vec_3): + # else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret4): +@@ -299,7 +459,7 @@ L(ret4): + L(more_3x_vec): + /* Safe to compare 4x vectors. */ + VMOVU VEC_SIZE(%rdi), %ymm0 +- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -312,7 +472,7 @@ L(more_3x_vec): + # endif + + VMOVU (VEC_SIZE * 2)(%rdi), %ymm0 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -320,7 +480,7 @@ L(more_3x_vec): + jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %ymm0 +- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -395,12 +555,10 @@ L(loop_skip_page_cross_check): + VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 + + /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */ +- VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1 +- +- VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 +- ++ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1) ++ CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3) ++ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) ++ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) + + /* If any mismatches or null CHAR then 0 CHAR, otherwise non- + zero. 
*/ +@@ -469,6 +627,8 @@ L(return_vec_2_3_end): + # else + movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax + movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -512,6 +672,8 @@ L(return_vec_0_end): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -534,6 +696,8 @@ L(return_vec_1_end): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -560,6 +724,8 @@ L(return_vec_2_end): + # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -587,7 +753,7 @@ L(page_cross_during_loop): + jle L(less_1x_vec_till_page_cross) + + VMOVA (%rdi), %ymm0 +- VPCMPEQ (%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -609,7 +775,7 @@ L(less_1x_vec_till_page_cross): + here, it means the previous page (rdi - VEC_SIZE) has already + been loaded earlier so must be valid. */ + VMOVU -VEC_SIZE(%rdi, %rax), %ymm0 +- VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -651,6 +817,8 @@ L(return_page_cross_cmp_mem): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -677,7 +845,7 @@ L(more_2x_vec_till_page_cross): + iteration here. */ + + VMOVU VEC_SIZE(%rdi), %ymm0 +- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -693,7 +861,7 @@ L(more_2x_vec_till_page_cross): + + /* Safe to include comparisons from lower bytes. 
*/ + VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0 +- VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -701,7 +869,7 @@ L(more_2x_vec_till_page_cross): + jnz L(return_vec_page_cross_0) + + VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0 +- VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -719,8 +887,8 @@ L(more_2x_vec_till_page_cross): + VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 + VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 + +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) ++ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) + vpand %ymm4, %ymm5, %ymm5 + vpand %ymm6, %ymm7, %ymm7 + VPMINU %ymm5, %ymm7, %ymm7 +@@ -771,6 +939,8 @@ L(return_vec_page_cross_1): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -826,7 +996,7 @@ L(page_cross): + L(page_cross_loop): + + VMOVU (%rdi, %OFFSET_REG64), %ymm0 +- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -844,11 +1014,11 @@ L(page_cross_loop): + subl %eax, %OFFSET_REG + /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed + to not cross page so is safe to load. Since we have already +- loaded at least 1 VEC from rsi it is also guranteed to be safe. +- */ ++ loaded at least 1 VEC from rsi it is also guranteed to be ++ safe. 
*/ + + VMOVU (%rdi, %OFFSET_REG64), %ymm0 +- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -881,6 +1051,8 @@ L(ret_vec_page_cross_cont): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -934,7 +1106,7 @@ L(less_1x_vec_till_page): + ja L(less_16_till_page) + + VMOVU (%rdi), %xmm0 +- VPCMPEQ (%rsi), %xmm0, %xmm1 ++ CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1) + VPCMPEQ %xmm0, %xmmZERO, %xmm2 + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx +@@ -952,7 +1124,7 @@ L(less_1x_vec_till_page): + # endif + + VMOVU (%rdi, %OFFSET_REG64), %xmm0 +- VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1 ++ CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1) + VPCMPEQ %xmm0, %xmmZERO, %xmm2 + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx +@@ -990,7 +1162,7 @@ L(less_16_till_page): + vmovq (%rdi), %xmm0 + vmovq (%rsi), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + incb %cl +@@ -1010,7 +1182,7 @@ L(less_16_till_page): + vmovq (%rdi, %OFFSET_REG64), %xmm0 + vmovq (%rsi, %OFFSET_REG64), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + incb %cl +@@ -1066,7 +1238,7 @@ L(ret_less_8_wcs): + vmovd (%rdi), %xmm0 + vmovd (%rsi), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + subl $0xf, %ecx +@@ -1085,7 +1257,7 @@ L(ret_less_8_wcs): + vmovd (%rdi, %OFFSET_REG64), %xmm0 + vmovd (%rsi, %OFFSET_REG64), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + subl $0xf, %ecx +@@ -1119,7 +1291,9 @@ L(less_4_till_page): + L(less_4_loop): + movzbl (%rdi), %eax + movzbl (%rsi, %rdi), %ecx +- subl %ecx, %eax ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) ++ subl %BYTE_LOOP_REG, %eax + jnz L(ret_less_4_loop) + testl %ecx, %ecx + jz L(ret_zero_4_loop) +@@ -1146,5 +1320,6 @@ L(ret_less_4_loop): + subl %r8d, %eax + ret + # endif +-END(STRCMP) ++ cfi_endproc ++ .size STRCMP, .-STRCMP + #endif +diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S +new file mode 100644 +index 00000000..58c05dcf +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S +@@ -0,0 +1,16 @@ ++#ifndef STRCMP ++# define STRCMP __strncasecmp_l_avx2_rtm ++#endif ++ ++#define _GLABEL(x) x ## _rtm ++#define GLABEL(x) _GLABEL(x) ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++#define OVERFLOW_STRCMP __strcasecmp_l_avx2_rtm ++ ++#include "strncase_l-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S +new file mode 100644 +index 00000000..48c0aa21 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S +@@ -0,0 +1,27 @@ ++/* strncasecmp_l optimized with AVX2. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. 
++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef STRCMP ++# define STRCMP __strncasecmp_l_avx2 ++#endif ++#define USE_AS_STRCASECMP_L ++#define USE_AS_STRNCMP ++#ifndef OVERFLOW_STRCMP ++# define OVERFLOW_STRCMP __strcasecmp_l_avx2 ++#endif ++#include "strcmp-avx2.S" +-- +GitLab + diff --git a/SOURCES/ia-opt-strxcasecmp-evex.patch b/SOURCES/ia-opt-strxcasecmp-evex.patch new file mode 100644 index 0000000..f60685f --- /dev/null +++ b/SOURCES/ia-opt-strxcasecmp-evex.patch @@ -0,0 +1,815 @@ +From 01a9cf0e384dc66504f88663cef26f52925d4c50 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 24 Mar 2022 18:56:13 -0500 +Subject: [PATCH] x86: Add EVEX optimized str{n}casecmp + +geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621 + +All string/memory tests pass. +Reviewed-by: H.J. Lu + +(cherry picked from commit 84e7c46df4086873eae28a1fb87d2cf5388b1e16) +--- + sysdeps/x86_64/multiarch/Makefile | 2 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 + + sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 5 + + sysdeps/x86_64/multiarch/strcasecmp_l-evex.S | 23 ++ + sysdeps/x86_64/multiarch/strcmp-evex.S | 290 ++++++++++++++++--- + sysdeps/x86_64/multiarch/strncase_l-evex.S | 25 ++ + 6 files changed, 321 insertions(+), 40 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 711ecf2e..359712c1 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -53,6 +53,7 @@ sysdep_routines += \ + strcasecmp_l-avx \ + strcasecmp_l-avx2 \ + strcasecmp_l-avx2-rtm \ ++ strcasecmp_l-evex \ + strcasecmp_l-sse2 \ + strcasecmp_l-sse4_2 \ + strcasecmp_l-ssse3 \ +@@ -93,6 +94,7 @@ sysdep_routines += \ + strncase_l-avx \ + strncase_l-avx2 \ + strncase_l-avx2-rtm \ ++ strncase_l-evex \ + strncase_l-sse2 \ + strncase_l-sse4_2 \ + strncase_l-ssse3 \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index d873e1be..1dedc637 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -418,6 +418,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ + IFUNC_IMPL (i, name, strcasecmp, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strcasecmp_evex) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX2), + __strcasecmp_avx2) +@@ -438,6 +442,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. 
*/ + IFUNC_IMPL (i, name, strcasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strcasecmp_l_evex) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX2), + __strcasecmp_l_avx2) +@@ -572,6 +580,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncasecmp_evex) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX2), + __strncasecmp_avx2) +@@ -593,6 +605,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncasecmp_l_evex) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX2), + __strncasecmp_l_avx2) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +index 926508c4..6dd49a21 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) +@@ -34,6 +35,10 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ return OPTIMIZE (evex); ++ + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S +new file mode 100644 +index 00000000..58642db7 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S +@@ -0,0 +1,23 @@ ++/* strcasecmp_l optimized with EVEX. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#ifndef STRCMP ++# define STRCMP __strcasecmp_l_evex ++#endif ++#define USE_AS_STRCASECMP_L ++#include "strcmp-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 0dfa62bd..b81b5775 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -19,6 +19,9 @@ + #if IS_IN (libc) + + # include ++# if defined USE_AS_STRCASECMP_L ++# include "locale-defines.h" ++# endif + + # ifndef STRCMP + # define STRCMP __strcmp_evex +@@ -34,19 +37,29 @@ + # define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSCMP +-# define TESTEQ subl $0xff, ++# ifndef OVERFLOW_STRCMP ++# define OVERFLOW_STRCMP __wcscmp_evex ++# endif ++ ++# define TESTEQ subl $0xff, + /* Compare packed dwords. */ + # define VPCMP vpcmpd + # define VPMINU vpminud + # define VPTESTM vptestmd ++# define VPTESTNM vptestnmd + /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else ++# ifndef OVERFLOW_STRCMP ++# define OVERFLOW_STRCMP __strcmp_evex ++# endif ++ + # define TESTEQ incl + /* Compare packed bytes. */ + # define VPCMP vpcmpb + # define VPMINU vpminub + # define VPTESTM vptestmb ++# define VPTESTNM vptestnmb + /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif +@@ -73,11 +86,16 @@ + # define VEC_OFFSET (-VEC_SIZE) + # endif + +-# define XMMZERO xmm16 + # define XMM0 xmm17 + # define XMM1 xmm18 + +-# define YMMZERO ymm16 ++# define XMM10 xmm27 ++# define XMM11 xmm28 ++# define XMM12 xmm29 ++# define XMM13 xmm30 ++# define XMM14 xmm31 ++ ++ + # define YMM0 ymm17 + # define YMM1 ymm18 + # define YMM2 ymm19 +@@ -89,6 +107,87 @@ + # define YMM8 ymm25 + # define YMM9 ymm26 + # define YMM10 ymm27 ++# define YMM11 ymm28 ++# define YMM12 ymm29 ++# define YMM13 ymm30 ++# define YMM14 ymm31 ++ ++# ifdef USE_AS_STRCASECMP_L ++# define BYTE_LOOP_REG OFFSET_REG ++# else ++# define BYTE_LOOP_REG ecx ++# endif ++ ++# ifdef USE_AS_STRCASECMP_L ++# ifdef USE_AS_STRNCMP ++# define STRCASECMP __strncasecmp_evex ++# define LOCALE_REG rcx ++# define LOCALE_REG_LP RCX_LP ++# define STRCASECMP_NONASCII __strncasecmp_l_nonascii ++# else ++# define STRCASECMP __strcasecmp_evex ++# define LOCALE_REG rdx ++# define LOCALE_REG_LP RDX_LP ++# define STRCASECMP_NONASCII __strcasecmp_l_nonascii ++# endif ++# endif ++ ++# define LCASE_MIN_YMM %YMM12 ++# define LCASE_MAX_YMM %YMM13 ++# define CASE_ADD_YMM %YMM14 ++ ++# define LCASE_MIN_XMM %XMM12 ++# define LCASE_MAX_XMM %XMM13 ++# define CASE_ADD_XMM %XMM14 ++ ++ /* NB: wcsncmp uses r11 but strcasecmp is never used in ++ conjunction with wcscmp. */ ++# define TOLOWER_BASE %r11 ++ ++# ifdef USE_AS_STRCASECMP_L ++# define _REG(x, y) x ## y ++# define REG(x, y) _REG(x, y) ++# define TOLOWER(reg1, reg2, ext) \ ++ vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \ ++ vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \ ++ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \ ++ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \ ++ vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \ ++ vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6} ++ ++# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst ++# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM) ++# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM) ++ ++# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \ ++ TOLOWER (s1_reg, s2_reg, ext); \ ++ VPCMP $0, s1_reg, s2_reg, reg_out ++ ++# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \ ++ VMOVU s2_mem, s2_reg; \ ++ CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) ++ ++# define CMP_R1_R2_YMM(...) 
CMP_R1_R2(__VA_ARGS__, YMM) ++# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM) ++ ++# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM) ++# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM) ++ ++# else ++# define TOLOWER_gpr(...) ++# define TOLOWER_YMM(...) ++# define TOLOWER_XMM(...) ++ ++# define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \ ++ VPCMP $0, s2_reg, s1_reg, reg_out ++ ++# define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__) ++ ++# define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \ ++ VPCMP $0, s2_mem, s1_reg, reg_out ++ ++# define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__) ++# endif + + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. +@@ -112,8 +211,45 @@ + returned. */ + + .section .text.evex, "ax", @progbits +-ENTRY(STRCMP) ++ .align 16 ++ .type STRCMP, @function ++ .globl STRCMP ++ .hidden STRCMP ++ ++# ifdef USE_AS_STRCASECMP_L ++ENTRY (STRCASECMP) ++ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax ++ mov %fs:(%rax), %LOCALE_REG_LP ++ ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 ++END (STRCASECMP) ++ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */ ++# endif ++ ++ .p2align 4 ++STRCMP: ++ cfi_startproc ++ _CET_ENDBR ++ CALL_MCOUNT ++ ++# if defined USE_AS_STRCASECMP_L ++ /* We have to fall back on the C implementation for locales with ++ encodings not matching ASCII for single bytes. */ ++# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 ++ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP ++# else ++ mov (%LOCALE_REG), %RAX_LP ++# endif ++ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) ++ jne STRCASECMP_NONASCII ++ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE ++# endif ++ + # ifdef USE_AS_STRNCMP ++ /* Don't overwrite LOCALE_REG (rcx) until we have pass ++ L(one_or_less). Otherwise we might use the wrong locale in ++ the OVERFLOW_STRCMP (strcasecmp_l). */ + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -125,6 +261,32 @@ ENTRY(STRCMP) + actually bound the buffer. */ + jle L(one_or_less) + # endif ++ ++# if defined USE_AS_STRCASECMP_L ++ .section .rodata.cst32, "aM", @progbits, 32 ++ .align 32 ++L(lcase_min): ++ .quad 0x4141414141414141 ++ .quad 0x4141414141414141 ++ .quad 0x4141414141414141 ++ .quad 0x4141414141414141 ++L(lcase_max): ++ .quad 0x1a1a1a1a1a1a1a1a ++ .quad 0x1a1a1a1a1a1a1a1a ++ .quad 0x1a1a1a1a1a1a1a1a ++ .quad 0x1a1a1a1a1a1a1a1a ++L(case_add): ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .previous ++ ++ vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM ++ vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM ++ vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM ++# endif ++ + movl %edi, %eax + orl %esi, %eax + /* Shift out the bits irrelivant to page boundary ([63:12]). */ +@@ -139,7 +301,7 @@ L(no_page_cross): + VPTESTM %YMM0, %YMM0, %k2 + /* Each bit cleared in K1 represents a mismatch or a null CHAR + in YMM0 and 32 bytes at (%rsi). */ +- VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_STRNCMP + cmpq $CHAR_PER_VEC, %rdx +@@ -169,6 +331,8 @@ L(return_vec_0): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret0): +@@ -188,11 +352,15 @@ L(ret_zero): + + .p2align 4,, 5 + L(one_or_less): ++# ifdef USE_AS_STRCASECMP_L ++ /* Set locale argument for strcasecmp. 
*/ ++ movq %LOCALE_REG, %rdx ++# endif + jb L(ret_zero) +-# ifdef USE_AS_WCSCMP + /* 'nbe' covers the case where length is negative (large + unsigned). */ +- jnbe __wcscmp_evex ++ jnbe OVERFLOW_STRCMP ++# ifdef USE_AS_WCSCMP + movl (%rdi), %edx + xorl %eax, %eax + cmpl (%rsi), %edx +@@ -201,11 +369,10 @@ L(one_or_less): + negl %eax + orl $1, %eax + # else +- /* 'nbe' covers the case where length is negative (large +- unsigned). */ +- jnbe __strcmp_evex + movzbl (%rdi), %eax + movzbl (%rsi), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret1): +@@ -233,6 +400,8 @@ L(return_vec_1): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret2): +@@ -270,6 +439,8 @@ L(return_vec_2): + # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret3): +@@ -290,6 +461,8 @@ L(return_vec_3): + # else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret4): +@@ -303,7 +476,7 @@ L(more_3x_vec): + /* Safe to compare 4x vectors. */ + VMOVU (VEC_SIZE)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_1) +@@ -315,14 +488,14 @@ L(more_3x_vec): + + VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_3) +@@ -381,7 +554,6 @@ L(prepare_loop_aligned): + subl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + +- vpxorq %YMMZERO, %YMMZERO, %YMMZERO + + /* Loop 4x comparisons at a time. */ + .p2align 4 +@@ -413,22 +585,35 @@ L(loop_skip_page_cross_check): + /* A zero CHAR in YMM9 means that there is a null CHAR. */ + VPMINU %YMM8, %YMM9, %YMM9 + +- /* Each bit set in K1 represents a non-null CHAR in YMM8. */ ++ /* Each bit set in K1 represents a non-null CHAR in YMM9. */ + VPTESTM %YMM9, %YMM9, %k1 +- ++# ifndef USE_AS_STRCASECMP_L + vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1 + vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3 + vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 + /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while + oring with YMM1. Result is stored in YMM6. */ + vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6 +- ++# else ++ VMOVU (VEC_SIZE * 0)(%rsi), %YMM1 ++ TOLOWER_YMM (%YMM0, %YMM1) ++ VMOVU (VEC_SIZE * 1)(%rsi), %YMM3 ++ TOLOWER_YMM (%YMM2, %YMM3) ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5 ++ TOLOWER_YMM (%YMM4, %YMM5) ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7 ++ TOLOWER_YMM (%YMM6, %YMM7) ++ vpxorq %YMM0, %YMM1, %YMM1 ++ vpxorq %YMM2, %YMM3, %YMM3 ++ vpxorq %YMM4, %YMM5, %YMM5 ++ vpternlogd $0xde, %YMM7, %YMM1, %YMM6 ++# endif + /* Or together YMM3, YMM5, and YMM6. */ + vpternlogd $0xfe, %YMM3, %YMM5, %YMM6 + + + /* A non-zero CHAR in YMM6 represents a mismatch. 
*/ +- VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ VPTESTNM %YMM6, %YMM6, %k0{%k1} + kmovd %k0, %LOOP_REG + + TESTEQ %LOOP_REG +@@ -437,13 +622,13 @@ L(loop_skip_page_cross_check): + + /* Find which VEC has the mismatch of end of string. */ + VPTESTM %YMM0, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} ++ VPTESTNM %YMM1, %YMM1, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + jnz L(return_vec_0_end) + + VPTESTM %YMM2, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} ++ VPTESTNM %YMM3, %YMM3, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + jnz L(return_vec_1_end) +@@ -457,7 +642,7 @@ L(return_vec_2_3_end): + # endif + + VPTESTM %YMM4, %YMM4, %k1 +- VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} ++ VPTESTNM %YMM5, %YMM5, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + # if CHAR_PER_VEC <= 16 +@@ -493,6 +678,8 @@ L(return_vec_3_end): + # else + movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax + movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -545,6 +732,8 @@ L(return_vec_0_end): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + /* Flip `eax` if `rdi` and `rsi` where swapped in page cross + logic. Subtract `r8d` after xor for zero case. */ +@@ -569,6 +758,8 @@ L(return_vec_1_end): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -598,7 +789,7 @@ L(page_cross_during_loop): + + VMOVA (%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_0_end) +@@ -619,8 +810,7 @@ L(less_1x_vec_till_page_cross): + been loaded earlier so must be valid. */ + VMOVU -VEC_SIZE(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2} +- ++ CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2} + /* Mask of potentially valid bits. The lower bits can be out of + range comparisons (but safe regarding page crosses). */ + +@@ -642,6 +832,8 @@ L(less_1x_vec_till_page_cross): + + # ifdef USE_AS_STRNCMP + # ifdef USE_AS_WCSCMP ++ /* NB: strcasecmp not used with WCSCMP so this access to r11 is ++ safe. */ + movl %eax, %r11d + shrl $2, %r11d + cmpq %r11, %rdx +@@ -679,6 +871,8 @@ L(return_page_cross_cmp_mem): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -709,7 +903,7 @@ L(more_2x_vec_till_page_cross): + + VMOVA VEC_SIZE(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_1_end) +@@ -724,14 +918,14 @@ L(more_2x_vec_till_page_cross): + /* Safe to include comparisons from lower bytes. 
*/ + VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_page_cross_0) + + VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_page_cross_1) +@@ -740,6 +934,8 @@ L(more_2x_vec_till_page_cross): + /* Must check length here as length might proclude reading next + page. */ + # ifdef USE_AS_WCSCMP ++ /* NB: strcasecmp not used with WCSCMP so this access to r11 is ++ safe. */ + movl %eax, %r11d + shrl $2, %r11d + cmpq %r11, %rdx +@@ -754,12 +950,19 @@ L(more_2x_vec_till_page_cross): + VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 + VPMINU %YMM4, %YMM6, %YMM9 + VPTESTM %YMM9, %YMM9, %k1 +- ++# ifndef USE_AS_STRCASECMP_L + vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 + /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */ + vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6 +- +- VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++# else ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5 ++ TOLOWER_YMM (%YMM4, %YMM5) ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7 ++ TOLOWER_YMM (%YMM6, %YMM7) ++ vpxorq %YMM4, %YMM5, %YMM5 ++ vpternlogd $0xde, %YMM7, %YMM5, %YMM6 ++# endif ++ VPTESTNM %YMM6, %YMM6, %k0{%k1} + kmovd %k0, %LOOP_REG + TESTEQ %LOOP_REG + jnz L(return_vec_2_3_end) +@@ -815,6 +1018,8 @@ L(return_vec_page_cross_1): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -871,7 +1076,7 @@ L(page_cross): + L(page_cross_loop): + VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(check_ret_vec_page_cross) +@@ -895,7 +1100,7 @@ L(page_cross_loop): + */ + VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2} + + kmovd %k1, %ecx + # ifdef USE_AS_STRNCMP +@@ -930,6 +1135,8 @@ L(ret_vec_page_cross_cont): + # else + movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax + movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -989,7 +1196,7 @@ L(less_1x_vec_till_page): + /* Use 16 byte comparison. 
*/ + vmovdqu (%rdi), %xmm0 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, (%rsi), %xmm0, %k1{%k2} ++ CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0xf, %ecx +@@ -1009,7 +1216,7 @@ L(less_1x_vec_till_page): + # endif + vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2} ++ CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0xf, %ecx +@@ -1048,7 +1255,7 @@ L(less_16_till_page): + vmovq (%rdi), %xmm0 + vmovq (%rsi), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0x3, %ecx +@@ -1068,7 +1275,7 @@ L(less_16_till_page): + vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0x3, %ecx +@@ -1128,7 +1335,7 @@ L(ret_less_8_wcs): + vmovd (%rdi), %xmm0 + vmovd (%rsi), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + subl $0xf, %ecx + jnz L(check_ret_vec_page_cross) +@@ -1143,7 +1350,7 @@ L(ret_less_8_wcs): + vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + subl $0xf, %ecx + jnz L(check_ret_vec_page_cross) +@@ -1176,7 +1383,9 @@ L(less_4_till_page): + L(less_4_loop): + movzbl (%rdi), %eax + movzbl (%rsi, %rdi), %ecx +- subl %ecx, %eax ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) ++ subl %BYTE_LOOP_REG, %eax + jnz L(ret_less_4_loop) + testl %ecx, %ecx + jz L(ret_zero_4_loop) +@@ -1203,5 +1412,6 @@ L(ret_less_4_loop): + subl %r8d, %eax + ret + # endif +-END(STRCMP) ++ cfi_endproc ++ .size STRCMP, .-STRCMP + #endif +diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S +new file mode 100644 +index 00000000..8a5af369 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S +@@ -0,0 +1,25 @@ ++/* strncasecmp_l optimized with EVEX. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#ifndef STRCMP ++# define STRCMP __strncasecmp_l_evex ++#endif ++#define OVERFLOW_STRCMP __strcasecmp_l_evex ++#define USE_AS_STRCASECMP_L ++#define USE_AS_STRNCMP ++#include "strcmp-evex.S" +-- +GitLab + diff --git a/SOURCES/ia-opt-strxcasecmp-srtcmp-sse42.patch b/SOURCES/ia-opt-strxcasecmp-srtcmp-sse42.patch new file mode 100644 index 0000000..cadff3e --- /dev/null +++ b/SOURCES/ia-opt-strxcasecmp-srtcmp-sse42.patch @@ -0,0 +1,144 @@ +From 371154789e234ff53a97adfc92355a3871f66847 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:38 -0500 +Subject: [PATCH] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S + +Slightly faster method of doing TOLOWER that saves an +instruction. + +Also replace the hard coded 5-byte no with .p2align 4. On builds with +CET enabled this misaligned entry to strcasecmp. + +geometric_mean(N=40) of all benchmarks New / Original: .920 + +All string/memory tests pass. +Reviewed-by: H.J. Lu + +(cherry picked from commit d154758e618ec9324f5d339c46db0aa27e8b1226) +--- + sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++-------------- + 1 file changed, 35 insertions(+), 48 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index d8fdeb3a..59e8ddfc 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -89,9 +89,8 @@ ENTRY (GLABEL(__strcasecmp)) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END (GLABEL(__strcasecmp)) + /* FALLTHROUGH to strcasecmp_l. */ + #endif +@@ -100,9 +99,8 @@ ENTRY (GLABEL(__strncasecmp)) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END (GLABEL(__strncasecmp)) + /* FALLTHROUGH to strncasecmp_l. 
*/ + #endif +@@ -170,27 +168,22 @@ STRCMP_SSE42: + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +-LABEL(belowupper): +- .quad 0x4040404040404040 +- .quad 0x4040404040404040 +-LABEL(topupper): +-# ifdef USE_AVX +- .quad 0x5a5a5a5a5a5a5a5a +- .quad 0x5a5a5a5a5a5a5a5a +-# else +- .quad 0x5b5b5b5b5b5b5b5b +- .quad 0x5b5b5b5b5b5b5b5b +-# endif +-LABEL(touppermask): ++LABEL(lcase_min): ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++LABEL(lcase_max): ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++LABEL(case_add): + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous +- movdqa LABEL(belowupper)(%rip), %xmm4 +-# define UCLOW_reg %xmm4 +- movdqa LABEL(topupper)(%rip), %xmm5 +-# define UCHIGH_reg %xmm5 +- movdqa LABEL(touppermask)(%rip), %xmm6 +-# define LCQWORD_reg %xmm6 ++ movdqa LABEL(lcase_min)(%rip), %xmm4 ++# define LCASE_MIN_reg %xmm4 ++ movdqa LABEL(lcase_max)(%rip), %xmm5 ++# define LCASE_MAX_reg %xmm5 ++ movdqa LABEL(case_add)(%rip), %xmm6 ++# define CASE_ADD_reg %xmm6 + #endif + cmp $0x30, %ecx + ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ +@@ -201,32 +194,26 @@ LABEL(touppermask): + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + # ifdef USE_AVX + # define TOLOWER(reg1, reg2) \ +- vpcmpgtb UCLOW_reg, reg1, %xmm7; \ +- vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ +- vpcmpgtb UCLOW_reg, reg2, %xmm9; \ +- vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ +- vpandn %xmm7, %xmm8, %xmm8; \ +- vpandn %xmm9, %xmm10, %xmm10; \ +- vpand LCQWORD_reg, %xmm8, %xmm8; \ +- vpand LCQWORD_reg, %xmm10, %xmm10; \ +- vpor reg1, %xmm8, reg1; \ +- vpor reg2, %xmm10, reg2 ++ vpaddb LCASE_MIN_reg, reg1, %xmm7; \ ++ vpaddb LCASE_MIN_reg, reg2, %xmm8; \ ++ vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \ ++ vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \ ++ vpandn CASE_ADD_reg, %xmm7, %xmm7; \ ++ vpandn CASE_ADD_reg, %xmm8, %xmm8; \ ++ vpaddb %xmm7, reg1, reg1; \ ++ vpaddb %xmm8, reg2, reg2 + # else + # define TOLOWER(reg1, reg2) \ +- movdqa reg1, %xmm7; \ +- movdqa UCHIGH_reg, %xmm8; \ +- movdqa reg2, %xmm9; \ +- movdqa UCHIGH_reg, %xmm10; \ +- pcmpgtb UCLOW_reg, %xmm7; \ +- pcmpgtb reg1, %xmm8; \ +- pcmpgtb UCLOW_reg, %xmm9; \ +- pcmpgtb reg2, %xmm10; \ +- pand %xmm8, %xmm7; \ +- pand %xmm10, %xmm9; \ +- pand LCQWORD_reg, %xmm7; \ +- pand LCQWORD_reg, %xmm9; \ +- por %xmm7, reg1; \ +- por %xmm9, reg2 ++ movdqa LCASE_MIN_reg, %xmm7; \ ++ movdqa LCASE_MIN_reg, %xmm8; \ ++ paddb reg1, %xmm7; \ ++ paddb reg2, %xmm8; \ ++ pcmpgtb LCASE_MAX_reg, %xmm7; \ ++ pcmpgtb LCASE_MAX_reg, %xmm8; \ ++ pandn CASE_ADD_reg, %xmm7; \ ++ pandn CASE_ADD_reg, %xmm8; \ ++ paddb %xmm7, reg1; \ ++ paddb %xmm8, reg2 + # endif + TOLOWER (%xmm1, %xmm2) + #else +-- +GitLab + diff --git a/SOURCES/ia-opt-strxcasecmp-srtcmp.patch b/SOURCES/ia-opt-strxcasecmp-srtcmp.patch new file mode 100644 index 0000000..f9fa6fa --- /dev/null +++ b/SOURCES/ia-opt-strxcasecmp-srtcmp.patch @@ -0,0 +1,123 @@ +From 017773f93b0e41f3b164e5db86d0c7b7f75675e9 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:36 -0500 +Subject: [PATCH] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S + +Slightly faster method of doing TOLOWER that saves an +instruction. + +Also replace the hard coded 5-byte no with .p2align 4. On builds with +CET enabled this misaligned entry to strcasecmp. + +geometric_mean(N=40) of all benchmarks New / Original: .894 + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu + +(cherry picked from commit 670b54bc585ea4a94f3b2e9272ba44aa6b730b73) +--- + sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++---------------------- + 1 file changed, 29 insertions(+), 35 deletions(-) + +diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S +index aa6df898..f454ce5b 100644 +--- a/sysdeps/x86_64/strcmp.S ++++ b/sysdeps/x86_64/strcmp.S +@@ -78,9 +78,8 @@ ENTRY2 (__strcasecmp) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END2 (__strcasecmp) + # ifndef NO_NOLOCALE_ALIAS + weak_alias (__strcasecmp, strcasecmp) +@@ -97,9 +96,8 @@ ENTRY2 (__strncasecmp) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END2 (__strncasecmp) + # ifndef NO_NOLOCALE_ALIAS + weak_alias (__strncasecmp, strncasecmp) +@@ -149,22 +147,22 @@ ENTRY (STRCMP) + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +-.Lbelowupper: +- .quad 0x4040404040404040 +- .quad 0x4040404040404040 +-.Ltopupper: +- .quad 0x5b5b5b5b5b5b5b5b +- .quad 0x5b5b5b5b5b5b5b5b +-.Ltouppermask: ++.Llcase_min: ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++.Llcase_max: ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++.Lcase_add: + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous +- movdqa .Lbelowupper(%rip), %xmm5 +-# define UCLOW_reg %xmm5 +- movdqa .Ltopupper(%rip), %xmm6 +-# define UCHIGH_reg %xmm6 +- movdqa .Ltouppermask(%rip), %xmm7 +-# define LCQWORD_reg %xmm7 ++ movdqa .Llcase_min(%rip), %xmm5 ++# define LCASE_MIN_reg %xmm5 ++ movdqa .Llcase_max(%rip), %xmm6 ++# define LCASE_MAX_reg %xmm6 ++ movdqa .Lcase_add(%rip), %xmm7 ++# define CASE_ADD_reg %xmm7 + #endif + cmp $0x30, %ecx + ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ +@@ -175,22 +173,18 @@ ENTRY (STRCMP) + movhpd 8(%rdi), %xmm1 + movhpd 8(%rsi), %xmm2 + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +-# define TOLOWER(reg1, reg2) \ +- movdqa reg1, %xmm8; \ +- movdqa UCHIGH_reg, %xmm9; \ +- movdqa reg2, %xmm10; \ +- movdqa UCHIGH_reg, %xmm11; \ +- pcmpgtb UCLOW_reg, %xmm8; \ +- pcmpgtb reg1, %xmm9; \ +- pcmpgtb UCLOW_reg, %xmm10; \ +- pcmpgtb reg2, %xmm11; \ +- pand %xmm9, %xmm8; \ +- pand %xmm11, %xmm10; \ +- pand LCQWORD_reg, %xmm8; \ +- pand LCQWORD_reg, %xmm10; \ +- por %xmm8, reg1; \ +- por %xmm10, reg2 +- TOLOWER (%xmm1, %xmm2) ++# define TOLOWER(reg1, reg2) \ ++ movdqa LCASE_MIN_reg, %xmm8; \ ++ movdqa LCASE_MIN_reg, %xmm9; \ ++ paddb reg1, %xmm8; \ ++ paddb reg2, %xmm9; \ ++ pcmpgtb LCASE_MAX_reg, %xmm8; \ ++ pcmpgtb LCASE_MAX_reg, %xmm9; \ ++ pandn CASE_ADD_reg, %xmm8; \ ++ pandn CASE_ADD_reg, %xmm9; \ ++ paddb %xmm8, reg1; \ ++ paddb %xmm9, reg2 ++ TOLOWER (%xmm1, %xmm2) + #else + # define TOLOWER(reg1, reg2) + #endif +-- +GitLab + diff --git a/SOURCES/ia-rmv-bcopy-opt.patch b/SOURCES/ia-rmv-bcopy-opt.patch new file mode 100644 index 0000000..9a01168 --- /dev/null +++ b/SOURCES/ia-rmv-bcopy-opt.patch @@ -0,0 +1,30 @@ +From 14483cebfb833ff520e921f46b78d53f46a86df0 Mon Sep 17 00:00:00 2001 +From: Adhemerval Zanella +Date: Thu, 10 Feb 2022 11:23:24 -0300 +Subject: [PATCH] x86_64: Remove bcopy optimizations + +The 
symbols is not present in current POSIX specification and compiler +already generates memmove call. + +(cherry picked from commit bf92893a14ebc161b08b28acc24fa06ae6be19cb) +--- + sysdeps/x86_64/multiarch/bcopy.S | 7 ------- + 1 file changed, 7 deletions(-) + delete mode 100644 sysdeps/x86_64/multiarch/bcopy.S + +diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S +deleted file mode 100644 +index 639f02bd..00000000 +--- a/sysdeps/x86_64/multiarch/bcopy.S ++++ /dev/null +@@ -1,7 +0,0 @@ +-#include +- +- .text +-ENTRY(bcopy) +- xchg %rdi, %rsi +- jmp __libc_memmove /* Branch to IFUNC memmove. */ +-END(bcopy) +-- +GitLab + diff --git a/SOURCES/ia-rmv-memcmp-sse4.patch b/SOURCES/ia-rmv-memcmp-sse4.patch new file mode 100644 index 0000000..91381c7 --- /dev/null +++ b/SOURCES/ia-rmv-memcmp-sse4.patch @@ -0,0 +1,965 @@ +From f5078f5cabb6e330506c2cad6ad89476438aafcb Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 15 Apr 2022 12:28:00 -0500 +Subject: [PATCH] x86: Remove memcmp-sse4.S + +Code didn't actually use any sse4 instructions since `ptest` was +removed in: + +commit 2f9062d7171850451e6044ef78d91ff8c017b9c0 +Author: Noah Goldstein +Date: Wed Nov 10 16:18:56 2021 -0600 + + x86: Shrink memcmp-sse4.S code size + +The new memcmp-sse2 implementation is also faster. + +geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905 + +Note there are two regressions preferring SSE2 for Size = 1 and Size = +65. + +Size = 1: +size, align0, align1, ret, New Time/Old Time + 1, 1, 1, 0, 1.2 + 1, 1, 1, 1, 1.197 + 1, 1, 1, -1, 1.2 + +This is intentional. Size == 1 is significantly less hot based on +profiles of GCC11 and Python3 than sizes [4, 8] (which is made +hotter). + +Python3 Size = 1 -> 13.64% +Python3 Size = [4, 8] -> 60.92% + +GCC11 Size = 1 -> 1.29% +GCC11 Size = [4, 8] -> 33.86% + +size, align0, align1, ret, New Time/Old Time + 4, 4, 4, 0, 0.622 + 4, 4, 4, 1, 0.797 + 4, 4, 4, -1, 0.805 + 5, 5, 5, 0, 0.623 + 5, 5, 5, 1, 0.777 + 5, 5, 5, -1, 0.802 + 6, 6, 6, 0, 0.625 + 6, 6, 6, 1, 0.813 + 6, 6, 6, -1, 0.788 + 7, 7, 7, 0, 0.625 + 7, 7, 7, 1, 0.799 + 7, 7, 7, -1, 0.795 + 8, 8, 8, 0, 0.625 + 8, 8, 8, 1, 0.848 + 8, 8, 8, -1, 0.914 + 9, 9, 9, 0, 0.625 + +Size = 65: +size, align0, align1, ret, New Time/Old Time + 65, 0, 0, 0, 1.103 + 65, 0, 0, 1, 1.216 + 65, 0, 0, -1, 1.227 + 65, 65, 0, 0, 1.091 + 65, 0, 65, 1, 1.19 + 65, 65, 65, -1, 1.215 + +This is because A) the checks in range [65, 96] are now unrolled 2x +and B) because smaller values <= 16 are now given a hotter path. By +contrast the SSE4 version has a branch for Size = 80. The unrolled +version has get better performance for returns which need both +comparisons. + +size, align0, align1, ret, New Time/Old Time + 128, 4, 8, 0, 0.858 + 128, 4, 8, 1, 0.879 + 128, 4, 8, -1, 0.888 + +As well, out of microbenchmark environments that are not full +predictable the branch will have a real-cost. +Reviewed-by: H.J. 
Lu + +(cherry picked from commit 7cbc03d03091d5664060924789afe46d30a5477e) +--- + sysdeps/x86_64/multiarch/Makefile | 2 - + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 - + sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 - + sysdeps/x86_64/multiarch/memcmp-sse4.S | 804 --------------------- + 4 files changed, 814 deletions(-) + delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index bca82e38..b503e4b8 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -11,7 +11,6 @@ sysdep_routines += \ + memcmp-avx2-movbe-rtm \ + memcmp-evex-movbe \ + memcmp-sse2 \ +- memcmp-sse4 \ + memcmp-ssse3 \ + memcpy-ssse3 \ + memcpy-ssse3-back \ +@@ -174,7 +173,6 @@ sysdep_routines += \ + wmemcmp-avx2-movbe-rtm \ + wmemcmp-c \ + wmemcmp-evex-movbe \ +- wmemcmp-sse4 \ + wmemcmp-ssse3 \ + # sysdep_routines + endif +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 14314367..450a2917 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -78,8 +78,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_evex_movbe) +- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), +- __memcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), + __memcmp_ssse3) + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) +@@ -824,8 +822,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_evex_movbe) +- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), +- __wmemcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), + __wmemcmp_ssse3) + IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +index 690dffe8..0bc47a7f 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +@@ -21,7 +21,6 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; +@@ -47,9 +46,6 @@ IFUNC_SELECTOR (void) + return OPTIMIZE (avx2_movbe); + } + +- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) +- return OPTIMIZE (sse4_1); +- + if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) + return OPTIMIZE (ssse3); + +diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S +deleted file mode 100644 +index 50060006..00000000 +--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S ++++ /dev/null +@@ -1,804 +0,0 @@ +-/* memcmp with SSE4.1, wmemcmp with SSE4.1 +- Copyright (C) 2010-2018 Free Software Foundation, Inc. +- Contributed by Intel Corporation. +- This file is part of the GNU C Library. 
+- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#if IS_IN (libc) +- +-# include +- +-# ifndef MEMCMP +-# define MEMCMP __memcmp_sse4_1 +-# endif +- +-#ifdef USE_AS_WMEMCMP +-# define CMPEQ pcmpeqd +-# define CHAR_SIZE 4 +-#else +-# define CMPEQ pcmpeqb +-# define CHAR_SIZE 1 +-#endif +- +- +-/* Warning! +- wmemcmp has to use SIGNED comparison for elements. +- memcmp has to use UNSIGNED comparison for elemnts. +-*/ +- +- .section .text.sse4.1,"ax",@progbits +-ENTRY (MEMCMP) +-# ifdef USE_AS_WMEMCMP +- shl $2, %RDX_LP +-# elif defined __ILP32__ +- /* Clear the upper 32 bits. */ +- mov %edx, %edx +-# endif +- cmp $79, %RDX_LP +- ja L(79bytesormore) +- +- cmp $CHAR_SIZE, %RDX_LP +- jbe L(firstbyte) +- +- /* N in (CHAR_SIZE, 79) bytes. */ +- cmpl $32, %edx +- ja L(more_32_bytes) +- +- cmpl $16, %edx +- jae L(16_to_32_bytes) +- +-# ifndef USE_AS_WMEMCMP +- cmpl $8, %edx +- jae L(8_to_16_bytes) +- +- cmpl $4, %edx +- jb L(2_to_3_bytes) +- +- movl (%rdi), %eax +- movl (%rsi), %ecx +- +- bswap %eax +- bswap %ecx +- +- shlq $32, %rax +- shlq $32, %rcx +- +- movl -4(%rdi, %rdx), %edi +- movl -4(%rsi, %rdx), %esi +- +- bswap %edi +- bswap %esi +- +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- cmovne %edx, %eax +- sbbl %ecx, %ecx +- orl %ecx, %eax +- ret +- +- .p2align 4,, 8 +-L(2_to_3_bytes): +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- subl %ecx, %eax +- ret +- +- .p2align 4,, 8 +-L(8_to_16_bytes): +- movq (%rdi), %rax +- movq (%rsi), %rcx +- +- bswap %rax +- bswap %rcx +- +- subq %rcx, %rax +- jne L(8_to_16_bytes_done) +- +- movq -8(%rdi, %rdx), %rax +- movq -8(%rsi, %rdx), %rcx +- +- bswap %rax +- bswap %rcx +- +- subq %rcx, %rax +- +-L(8_to_16_bytes_done): +- cmovne %edx, %eax +- sbbl %ecx, %ecx +- orl %ecx, %eax +- ret +-# else +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(8_to_16_bytes_done) +- movl 4(%rdi), %ecx +- cmpl 4(%rsi), %ecx +- jne L(8_to_16_bytes_done) +- movl -4(%rdi, %rdx), %ecx +- cmpl -4(%rsi, %rdx), %ecx +- jne L(8_to_16_bytes_done) +- ret +-# endif +- +- .p2align 4,, 3 +-L(ret_zero): +- xorl %eax, %eax +-L(zero): +- ret +- +- .p2align 4,, 8 +-L(firstbyte): +- jb L(ret_zero) +-# ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- je L(zero) +-L(8_to_16_bytes_done): +- setg %al +- leal -1(%rax, %rax), %eax +-# else +- movzbl (%rdi), %eax +- movzbl (%rsi), %ecx +- sub %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(vec_return_begin_48): +- addq $16, %rdi +- addq $16, %rsi +-L(vec_return_begin_32): +- bsfl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl 32(%rdi, %rax), %ecx +- xorl %edx, %edx +- cmpl 32(%rsi, %rax), %ecx +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl 32(%rsi, %rax), %ecx +- movzbl 32(%rdi, %rax), %eax +- subl %ecx, %eax 
+-# endif +- ret +- +- .p2align 4 +-L(vec_return_begin_16): +- addq $16, %rdi +- addq $16, %rsi +-L(vec_return_begin): +- bsfl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rax), %ecx +- xorl %edx, %edx +- cmpl (%rsi, %rax), %ecx +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rsi, %rax), %ecx +- movzbl (%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(vec_return_end_16): +- subl $16, %edx +-L(vec_return_end): +- bsfl %eax, %eax +- addl %edx, %eax +-# ifdef USE_AS_WMEMCMP +- movl -16(%rdi, %rax), %ecx +- xorl %edx, %edx +- cmpl -16(%rsi, %rax), %ecx +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl -16(%rsi, %rax), %ecx +- movzbl -16(%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4,, 8 +-L(more_32_bytes): +- movdqu (%rdi), %xmm0 +- movdqu (%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm0 +- movdqu 16(%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- cmpl $64, %edx +- jbe L(32_to_64_bytes) +- movdqu 32(%rdi), %xmm0 +- movdqu 32(%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- .p2align 4,, 6 +-L(32_to_64_bytes): +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(16_to_32_bytes): +- movdqu (%rdi), %xmm0 +- movdqu (%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- +- .p2align 4 +-L(79bytesormore): +- movdqu (%rdi), %xmm0 +- movdqu (%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- +- mov %rsi, %rcx +- and $-16, %rsi +- add $16, %rsi +- sub %rsi, %rcx +- +- sub %rcx, %rdi +- add %rcx, %rdx +- test $0xf, %rdi +- jz L(2aligned) +- +- cmp $128, %rdx +- ja L(128bytesormore) +- +- .p2align 4,, 6 +-L(less128bytes): +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqu 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- cmp $96, %rdx +- jb L(32_to_64_bytes) +- +- addq $64, %rdi +- addq $64, %rsi +- subq $64, %rdx +- +- .p2align 4,, 6 +-L(last_64_bytes): +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(128bytesormore): +- cmp 
$256, %rdx +- ja L(unaligned_loop) +-L(less256bytes): +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqu 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $64, %rdi +- addq $64, %rsi +- +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqu 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $-128, %rdx +- subq $-64, %rsi +- subq $-64, %rdi +- +- cmp $64, %rdx +- ja L(less128bytes) +- +- cmp $32, %rdx +- ja L(last_64_bytes) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(unaligned_loop): +-# ifdef DATA_CACHE_SIZE_HALF +- mov $DATA_CACHE_SIZE_HALF, %R8_LP +-# else +- mov __x86_data_cache_size_half(%rip), %R8_LP +-# endif +- movq %r8, %r9 +- addq %r8, %r8 +- addq %r9, %r8 +- cmpq %r8, %rdx +- ja L(L2_L3_cache_unaligned) +- sub $64, %rdx +- .p2align 4 +-L(64bytesormore_loop): +- movdqu (%rdi), %xmm0 +- movdqu 16(%rdi), %xmm1 +- movdqu 32(%rdi), %xmm2 +- movdqu 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- ja L(64bytesormore_loop) +- +- .p2align 4,, 6 +-L(loop_tail): +- addq %rdx, %rdi +- movdqu (%rdi), %xmm0 +- movdqu 16(%rdi), %xmm1 +- movdqu 32(%rdi), %xmm2 +- movdqu 48(%rdi), %xmm3 +- +- addq %rdx, %rsi +- movdqu (%rsi), %xmm4 +- movdqu 16(%rsi), %xmm5 +- movdqu 32(%rsi), %xmm6 +- movdqu 48(%rsi), %xmm7 +- +- CMPEQ %xmm4, %xmm0 +- CMPEQ %xmm5, %xmm1 +- CMPEQ %xmm6, %xmm2 +- CMPEQ %xmm7, %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- ret +- +-L(L2_L3_cache_unaligned): +- subq $64, %rdx +- .p2align 4 +-L(L2_L3_unaligned_128bytes_loop): +- prefetchnta 0x1c0(%rdi) +- prefetchnta 0x1c0(%rsi) +- +- movdqu (%rdi), %xmm0 +- movdqu 16(%rdi), %xmm1 +- movdqu 32(%rdi), %xmm2 +- movdqu 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- ja L(L2_L3_unaligned_128bytes_loop) +- jmp L(loop_tail) +- +- +- /* This case is for machines which are sensitive for unaligned +- * instructions. 
*/ +- .p2align 4 +-L(2aligned): +- cmp $128, %rdx +- ja L(128bytesormorein2aligned) +-L(less128bytesin2aligned): +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqa 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqa 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- cmp $96, %rdx +- jb L(32_to_64_bytes) +- +- addq $64, %rdi +- addq $64, %rsi +- subq $64, %rdx +- +- .p2align 4,, 6 +-L(aligned_last_64_bytes): +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(128bytesormorein2aligned): +- cmp $256, %rdx +- ja L(aligned_loop) +-L(less256bytesin2alinged): +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqa 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqa 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $64, %rdi +- addq $64, %rsi +- +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqa 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqa 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $-128, %rdx +- subq $-64, %rsi +- subq $-64, %rdi +- +- cmp $64, %rdx +- ja L(less128bytesin2aligned) +- +- cmp $32, %rdx +- ja L(aligned_last_64_bytes) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(aligned_loop): +-# ifdef DATA_CACHE_SIZE_HALF +- mov $DATA_CACHE_SIZE_HALF, %R8_LP +-# else +- mov __x86_data_cache_size_half(%rip), %R8_LP +-# endif +- movq %r8, %r9 +- addq %r8, %r8 +- addq %r9, %r8 +- cmpq %r8, %rdx +- ja L(L2_L3_cache_aligned) +- +- sub $64, %rdx +- .p2align 4 +-L(64bytesormore_loopin2aligned): +- movdqa (%rdi), %xmm0 +- movdqa 16(%rdi), %xmm1 +- movdqa 32(%rdi), %xmm2 +- movdqa 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- add $64, %rsi +- 
add $64, %rdi +- sub $64, %rdx +- ja L(64bytesormore_loopin2aligned) +- jmp L(loop_tail) +- +-L(L2_L3_cache_aligned): +- subq $64, %rdx +- .p2align 4 +-L(L2_L3_aligned_128bytes_loop): +- prefetchnta 0x1c0(%rdi) +- prefetchnta 0x1c0(%rsi) +- movdqa (%rdi), %xmm0 +- movdqa 16(%rdi), %xmm1 +- movdqa 32(%rdi), %xmm2 +- movdqa 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- +- addq $64, %rsi +- addq $64, %rdi +- subq $64, %rdx +- ja L(L2_L3_aligned_128bytes_loop) +- jmp L(loop_tail) +- +- .p2align 4 +-L(64bytesormore_loop_end): +- pmovmskb %xmm0, %ecx +- incw %cx +- jnz L(loop_end_ret) +- +- pmovmskb %xmm1, %ecx +- notw %cx +- sall $16, %ecx +- jnz L(loop_end_ret) +- +- pmovmskb %xmm2, %ecx +- notw %cx +- shlq $32, %rcx +- jnz L(loop_end_ret) +- +- addq $48, %rdi +- addq $48, %rsi +- movq %rax, %rcx +- +- .p2align 4,, 6 +-L(loop_end_ret): +- bsfq %rcx, %rcx +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rcx), %eax +- xorl %edx, %edx +- cmpl (%rsi, %rcx), %eax +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rdi, %rcx), %eax +- movzbl (%rsi, %rcx), %ecx +- subl %ecx, %eax +-# endif +- ret +-END (MEMCMP) +-#endif +-- +GitLab + diff --git a/SOURCES/ia-rmv-set-memset-vec-unaligned-erms.patch b/SOURCES/ia-rmv-set-memset-vec-unaligned-erms.patch new file mode 100644 index 0000000..86e8c06 --- /dev/null +++ b/SOURCES/ia-rmv-set-memset-vec-unaligned-erms.patch @@ -0,0 +1,34 @@ +From 3149dab571d1739a507ef8185af0d72d269b5ee3 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sat, 12 Feb 2022 00:45:00 -0600 +Subject: [PATCH] x86: Set .text section in memset-vec-unaligned-erms + +commit 3d9f171bfb5325bd5f427e9fc386453358c6e840 +Author: H.J. Lu +Date: Mon Feb 7 05:55:15 2022 -0800 + + x86-64: Optimize bzero + +Remove setting the .text section for the code. This commit +adds that back. + +(cherry picked from commit 7912236f4a597deb092650ca79f33504ddb4af28) +--- + sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 06f5f5d7..4fb475c0 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -114,6 +114,7 @@ + # error SECTION is not defined! + #endif + ++ .section SECTION(.text), "ax", @progbits + #if IS_IN (libc) + # if defined SHARED + ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) +-- +GitLab + diff --git a/SOURCES/ia-rmv-ssse3_inst-memset.patch b/SOURCES/ia-rmv-ssse3_inst-memset.patch new file mode 100644 index 0000000..5288574 --- /dev/null +++ b/SOURCES/ia-rmv-ssse3_inst-memset.patch @@ -0,0 +1,41 @@ +From 4412c21a9027bbe6546b2a329a741d26c2477136 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 7 Feb 2022 00:32:23 -0600 +Subject: [PATCH] x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 + Only) + +commit b62ace2740a106222e124cc86956448fa07abf4d +Author: Noah Goldstein +Date: Sun Feb 6 00:54:18 2022 -0600 + + x86: Improve vec generation in memset-vec-unaligned-erms.S + +Revert usage of 'pshufb' in broadcast logic as it is an SSSE3 +instruction and memset.S is restricted to only SSE2 instructions. 
+ +(cherry picked from commit 1b0c60f95bbe2eded80b2bb5be75c0e45b11cde1) +--- + sysdeps/x86_64/memset.S | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 27debd2b..4cb4aa71 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -30,9 +30,10 @@ + + # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- pxor %xmm1, %xmm1; \ +- pshufb %xmm1, %xmm0; \ +- movq r, %rax ++ movq r, %rax; \ ++ punpcklbw %xmm0, %xmm0; \ ++ punpcklwd %xmm0, %xmm0; \ ++ pshufd $0, %xmm0, %xmm0 + + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +-- +GitLab + diff --git a/SOURCES/ia-rmv-strcspn-sse2.patch b/SOURCES/ia-rmv-strcspn-sse2.patch new file mode 100644 index 0000000..7292b95 --- /dev/null +++ b/SOURCES/ia-rmv-strcspn-sse2.patch @@ -0,0 +1,172 @@ +From bb034f8ae84535c1263032311594f229fd3ad1a9 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:26 -0500 +Subject: [PATCH] x86: Remove strcspn-sse2.S and use the generic implementation + +The generic implementation is faster. + +geometric_mean(N=20) of all benchmarks New / Original: .678 + +All string/memory tests pass. +Reviewed-by: H.J. Lu + +(cherry picked from commit fe28e7d9d9535ebab4081d195c553b4fbf39d9ae) +--- + .../{strcspn-sse2.S => strcspn-sse2.c} | 6 +- + sysdeps/x86_64/strcspn.S | 122 ------------------ + 2 files changed, 3 insertions(+), 125 deletions(-) + rename sysdeps/x86_64/multiarch/{strcspn-sse2.S => strcspn-sse2.c} (89%) + delete mode 100644 sysdeps/x86_64/strcspn.S + +diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.S b/sysdeps/x86_64/multiarch/strcspn-sse2.c +similarity index 89% +rename from sysdeps/x86_64/multiarch/strcspn-sse2.S +rename to sysdeps/x86_64/multiarch/strcspn-sse2.c +index 8a0c69d7..32debee4 100644 +--- a/sysdeps/x86_64/multiarch/strcspn-sse2.S ++++ b/sysdeps/x86_64/multiarch/strcspn-sse2.c +@@ -19,10 +19,10 @@ + #if IS_IN (libc) + + # include +-# define strcspn __strcspn_sse2 ++# define STRCSPN __strcspn_sse2 + + # undef libc_hidden_builtin_def +-# define libc_hidden_builtin_def(strcspn) ++# define libc_hidden_builtin_def(STRCSPN) + #endif + +-#include ++#include +diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S +deleted file mode 100644 +index 7f9202d6..00000000 +--- a/sysdeps/x86_64/strcspn.S ++++ /dev/null +@@ -1,122 +0,0 @@ +-/* strcspn (str, ss) -- Return the length of the initial segment of STR +- which contains no characters from SS. +- For AMD x86-64. +- Copyright (C) 1994-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- Contributed by Ulrich Drepper . +- Bug fixes by Alan Modra . +- Adopted for x86-64 by Andreas Jaeger . +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#include +-#include "asm-syntax.h" +- +- .text +-ENTRY (strcspn) +- +- movq %rdi, %rdx /* Save SRC. 
*/ +- +- /* First we create a table with flags for all possible characters. +- For the ASCII (7bit/8bit) or ISO-8859-X character sets which are +- supported by the C string functions we have 256 characters. +- Before inserting marks for the stop characters we clear the whole +- table. */ +- movq %rdi, %r8 /* Save value. */ +- subq $256, %rsp /* Make space for 256 bytes. */ +- cfi_adjust_cfa_offset(256) +- movl $32, %ecx /* 32*8 bytes = 256 bytes. */ +- movq %rsp, %rdi +- xorl %eax, %eax /* We store 0s. */ +- cld +- rep +- stosq +- +- movq %rsi, %rax /* Setup skipset. */ +- +-/* For understanding the following code remember that %rcx == 0 now. +- Although all the following instruction only modify %cl we always +- have a correct zero-extended 64-bit value in %rcx. */ +- +- .p2align 4 +-L(2): movb (%rax), %cl /* get byte from skipset */ +- testb %cl, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- +- movb 1(%rax), %cl /* get byte from skipset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- +- movb 2(%rax), %cl /* get byte from skipset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- +- movb 3(%rax), %cl /* get byte from skipset */ +- addq $4, %rax /* increment skipset pointer */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- testb $0xff, %cl /* is NUL char? */ +- jnz L(2) /* no => process next dword from skipset */ +- +-L(1): leaq -4(%rdx), %rax /* prepare loop */ +- +- /* We use a neat trick for the following loop. Normally we would +- have to test for two termination conditions +- 1. a character in the skipset was found +- and +- 2. the end of the string was found +- But as a sign that the character is in the skipset we store its +- value in the table. But the value of NUL is NUL so the loop +- terminates for NUL in every case. */ +- +- .p2align 4 +-L(3): addq $4, %rax /* adjust pointer for full loop round */ +- +- movb (%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- je L(4) /* yes => return */ +- +- movb 1(%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- je L(5) /* yes => return */ +- +- movb 2(%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(6) /* yes => return */ +- +- movb 3(%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jne L(3) /* no => start loop again */ +- +- incq %rax /* adjust pointer */ +-L(6): incq %rax +-L(5): incq %rax +- +-L(4): addq $256, %rsp /* remove skipset */ +- cfi_adjust_cfa_offset(-256) +-#ifdef USE_AS_STRPBRK +- xorl %edx,%edx +- orb %cl, %cl /* was last character NUL? 
*/ +- cmovzq %rdx, %rax /* Yes: return NULL */ +-#else +- subq %rdx, %rax /* we have to return the number of valid +- characters, so compute distance to first +- non-valid character */ +-#endif +- ret +-END (strcspn) +-libc_hidden_builtin_def (strcspn) +-- +GitLab + diff --git a/SOURCES/ia-rmv-strpbrk-sse2.patch b/SOURCES/ia-rmv-strpbrk-sse2.patch new file mode 100644 index 0000000..e180808 --- /dev/null +++ b/SOURCES/ia-rmv-strpbrk-sse2.patch @@ -0,0 +1,52 @@ +From 1620feaf35e282954d1261dd2a7beaf09306659b Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:27 -0500 +Subject: [PATCH] x86: Remove strpbrk-sse2.S and use the generic implementation + +The generic implementation is faster (see strcspn commit). + +All string/memory tests pass. +Reviewed-by: H.J. Lu + +(cherry picked from commit 653358535280a599382cb6c77538a187dac6a87f) +--- + .../x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} | 7 +++---- + sysdeps/x86_64/strpbrk.S | 3 --- + 2 files changed, 3 insertions(+), 7 deletions(-) + rename sysdeps/x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} (87%) + delete mode 100644 sysdeps/x86_64/strpbrk.S + +diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.S b/sysdeps/x86_64/multiarch/strpbrk-sse2.c +similarity index 87% +rename from sysdeps/x86_64/multiarch/strpbrk-sse2.S +rename to sysdeps/x86_64/multiarch/strpbrk-sse2.c +index 3c6a74db..ec0b6fda 100644 +--- a/sysdeps/x86_64/multiarch/strpbrk-sse2.S ++++ b/sysdeps/x86_64/multiarch/strpbrk-sse2.c +@@ -19,11 +19,10 @@ + #if IS_IN (libc) + + # include +-# define strcspn __strpbrk_sse2 ++# define STRPBRK __strpbrk_sse2 + + # undef libc_hidden_builtin_def +-# define libc_hidden_builtin_def(strpbrk) ++# define libc_hidden_builtin_def(STRPBRK) + #endif + +-#define USE_AS_STRPBRK +-#include ++#include +diff --git a/sysdeps/x86_64/strpbrk.S b/sysdeps/x86_64/strpbrk.S +deleted file mode 100644 +index 21888a5b..00000000 +--- a/sysdeps/x86_64/strpbrk.S ++++ /dev/null +@@ -1,3 +0,0 @@ +-#define strcspn strpbrk +-#define USE_AS_STRPBRK +-#include +-- +GitLab + diff --git a/SOURCES/ia-rmv-strspn-sse2.patch b/SOURCES/ia-rmv-strspn-sse2.patch new file mode 100644 index 0000000..a6c90c1 --- /dev/null +++ b/SOURCES/ia-rmv-strspn-sse2.patch @@ -0,0 +1,165 @@ +From 9356e90aa423ef4335404da233617ee85c3a05e4 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:29 -0500 +Subject: [PATCH] x86: Remove strspn-sse2.S and use the generic implementation + +The generic implementation is faster. + +geometric_mean(N=20) of all benchmarks New / Original: .710 + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu + +(cherry picked from commit 9c8a6ad620b49a27120ecdd7049c26bf05900397) +--- + .../{strspn-sse2.S => strspn-sse2.c} | 6 +- + sysdeps/x86_64/strspn.S | 115 ------------------ + 2 files changed, 3 insertions(+), 118 deletions(-) + rename sysdeps/x86_64/multiarch/{strspn-sse2.S => strspn-sse2.c} (89%) + delete mode 100644 sysdeps/x86_64/strspn.S + +diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.S b/sysdeps/x86_64/multiarch/strspn-sse2.c +similarity index 89% +rename from sysdeps/x86_64/multiarch/strspn-sse2.S +rename to sysdeps/x86_64/multiarch/strspn-sse2.c +index 4686cdd5..ab0dae40 100644 +--- a/sysdeps/x86_64/multiarch/strspn-sse2.S ++++ b/sysdeps/x86_64/multiarch/strspn-sse2.c +@@ -19,10 +19,10 @@ + #if IS_IN (libc) + + # include +-# define strspn __strspn_sse2 ++# define STRSPN __strspn_sse2 + + # undef libc_hidden_builtin_def +-# define libc_hidden_builtin_def(strspn) ++# define libc_hidden_builtin_def(STRSPN) + #endif + +-#include ++#include +diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S +deleted file mode 100644 +index 635f1bc6..00000000 +--- a/sysdeps/x86_64/strspn.S ++++ /dev/null +@@ -1,115 +0,0 @@ +-/* strspn (str, ss) -- Return the length of the initial segment of STR +- which contains only characters from SS. +- For AMD x86-64. +- Copyright (C) 1994-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- Contributed by Ulrich Drepper . +- Bug fixes by Alan Modra . +- Adopted for x86-64 by Andreas Jaeger . +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#include +- +- .text +-ENTRY (strspn) +- +- movq %rdi, %rdx /* Save SRC. */ +- +- /* First we create a table with flags for all possible characters. +- For the ASCII (7bit/8bit) or ISO-8859-X character sets which are +- supported by the C string functions we have 256 characters. +- Before inserting marks for the stop characters we clear the whole +- table. */ +- movq %rdi, %r8 /* Save value. */ +- subq $256, %rsp /* Make space for 256 bytes. */ +- cfi_adjust_cfa_offset(256) +- movl $32, %ecx /* 32*8 bytes = 256 bytes. */ +- movq %rsp, %rdi +- xorl %eax, %eax /* We store 0s. */ +- cld +- rep +- stosq +- +- movq %rsi, %rax /* Setup stopset. */ +- +-/* For understanding the following code remember that %rcx == 0 now. +- Although all the following instruction only modify %cl we always +- have a correct zero-extended 64-bit value in %rcx. */ +- +- .p2align 4 +-L(2): movb (%rax), %cl /* get byte from stopset */ +- testb %cl, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- +- movb 1(%rax), %cl /* get byte from stopset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- +- movb 2(%rax), %cl /* get byte from stopset */ +- testb $0xff, %cl /* is NUL char? 
*/ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- +- movb 3(%rax), %cl /* get byte from stopset */ +- addq $4, %rax /* increment stopset pointer */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- testb $0xff, %cl /* is NUL char? */ +- jnz L(2) /* no => process next dword from stopset */ +- +-L(1): leaq -4(%rdx), %rax /* prepare loop */ +- +- /* We use a neat trick for the following loop. Normally we would +- have to test for two termination conditions +- 1. a character in the stopset was found +- and +- 2. the end of the string was found +- But as a sign that the character is in the stopset we store its +- value in the table. But the value of NUL is NUL so the loop +- terminates for NUL in every case. */ +- +- .p2align 4 +-L(3): addq $4, %rax /* adjust pointer for full loop round */ +- +- movb (%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(4) /* no => return */ +- +- movb 1(%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(5) /* no => return */ +- +- movb 2(%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(6) /* no => return */ +- +- movb 3(%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jnz L(3) /* yes => start loop again */ +- +- incq %rax /* adjust pointer */ +-L(6): incq %rax +-L(5): incq %rax +- +-L(4): addq $256, %rsp /* remove stopset */ +- cfi_adjust_cfa_offset(-256) +- subq %rdx, %rax /* we have to return the number of valid +- characters, so compute distance to first +- non-valid character */ +- ret +-END (strspn) +-libc_hidden_builtin_def (strspn) +-- +GitLab + diff --git a/SOURCES/ia-rmv-strxcasecmp-avx.patch b/SOURCES/ia-rmv-strxcasecmp-avx.patch new file mode 100644 index 0000000..ba3f140 --- /dev/null +++ b/SOURCES/ia-rmv-strxcasecmp-avx.patch @@ -0,0 +1,914 @@ +From ea4c320faffe618f70854985887c7ca08a1dcf4b Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:46 -0500 +Subject: [PATCH] x86: Remove AVX str{n}casecmp + +The rational is: + +1. SSE42 has nearly identical logic so any benefit is minimal (3.4% + regression on Tigerlake using SSE42 versus AVX across the + benchtest suite). +2. AVX2 version covers the majority of targets that previously + prefered it. +3. The targets where AVX would still be best (SnB and IVB) are + becoming outdated. + +All in all the saving the code size is worth it. + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu + +(cherry picked from commit 305769b2a15c2e96f9e1b5195d3c4e0d6f0f4b68) +--- + sysdeps/x86_64/multiarch/Makefile | 2 - + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 - + sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 - + sysdeps/x86_64/multiarch/strcasecmp_l-avx.S | 22 -- + sysdeps/x86_64/multiarch/strcmp-sse42.S | 240 +++++++++----------- + sysdeps/x86_64/multiarch/strncase_l-avx.S | 22 -- + 6 files changed, 105 insertions(+), 197 deletions(-) + delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx.S + delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 359712c1..bca82e38 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -50,7 +50,6 @@ sysdep_routines += \ + stpncpy-evex \ + stpncpy-sse2-unaligned \ + stpncpy-ssse3 \ +- strcasecmp_l-avx \ + strcasecmp_l-avx2 \ + strcasecmp_l-avx2-rtm \ + strcasecmp_l-evex \ +@@ -91,7 +90,6 @@ sysdep_routines += \ + strlen-avx2-rtm \ + strlen-evex \ + strlen-sse2 \ +- strncase_l-avx \ + strncase_l-avx2 \ + strncase_l-avx2-rtm \ + strncase_l-evex \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 1dedc637..14314367 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -429,9 +429,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcasecmp_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strcasecmp, +- CPU_FEATURE_USABLE (AVX), +- __strcasecmp_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (SSE4_2), + __strcasecmp_sse42) +@@ -453,9 +450,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcasecmp_l_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strcasecmp_l, +- CPU_FEATURE_USABLE (AVX), +- __strcasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + CPU_FEATURE_USABLE (SSE4_2), + __strcasecmp_l_sse42) +@@ -591,9 +585,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncasecmp_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strncasecmp, +- CPU_FEATURE_USABLE (AVX), +- __strncasecmp_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (SSE4_2), + __strncasecmp_sse42) +@@ -616,9 +607,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncasecmp_l_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strncasecmp_l, +- CPU_FEATURE_USABLE (AVX), +- __strncasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + CPU_FEATURE_USABLE (SSE4_2), + __strncasecmp_l_sse42) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +index 6dd49a21..34cfbb8f 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +@@ -22,7 +22,6 @@ + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + 
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; +@@ -46,9 +45,6 @@ IFUNC_SELECTOR (void) + return OPTIMIZE (avx2); + } + +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) +- return OPTIMIZE (avx); +- + if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2) + && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) + return OPTIMIZE (sse42); +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S +deleted file mode 100644 +index 56a03547..00000000 +--- a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S ++++ /dev/null +@@ -1,22 +0,0 @@ +-/* strcasecmp_l optimized with AVX. +- Copyright (C) 2017-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#define STRCMP_SSE42 __strcasecmp_l_avx +-#define USE_AVX 1 +-#define USE_AS_STRCASECMP_L +-#include "strcmp-sse42.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index 59e8ddfc..0a42b7a4 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -42,13 +42,8 @@ + # define UPDATE_STRNCMP_COUNTER + #endif + +-#ifdef USE_AVX +-# define SECTION avx +-# define GLABEL(l) l##_avx +-#else +-# define SECTION sse4.2 +-# define GLABEL(l) l##_sse42 +-#endif ++#define SECTION sse4.2 ++#define GLABEL(l) l##_sse42 + + #define LABEL(l) .L##l + +@@ -106,21 +101,7 @@ END (GLABEL(__strncasecmp)) + #endif + + +-#ifdef USE_AVX +-# define movdqa vmovdqa +-# define movdqu vmovdqu +-# define pmovmskb vpmovmskb +-# define pcmpistri vpcmpistri +-# define psubb vpsubb +-# define pcmpeqb vpcmpeqb +-# define psrldq vpsrldq +-# define pslldq vpslldq +-# define palignr vpalignr +-# define pxor vpxor +-# define D(arg) arg, arg +-#else +-# define D(arg) arg +-#endif ++#define arg arg + + STRCMP_SSE42: + cfi_startproc +@@ -192,18 +173,7 @@ LABEL(case_add): + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +-# ifdef USE_AVX +-# define TOLOWER(reg1, reg2) \ +- vpaddb LCASE_MIN_reg, reg1, %xmm7; \ +- vpaddb LCASE_MIN_reg, reg2, %xmm8; \ +- vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \ +- vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \ +- vpandn CASE_ADD_reg, %xmm7, %xmm7; \ +- vpandn CASE_ADD_reg, %xmm8, %xmm8; \ +- vpaddb %xmm7, reg1, reg1; \ +- vpaddb %xmm8, reg2, reg2 +-# else +-# define TOLOWER(reg1, reg2) \ ++# define TOLOWER(reg1, reg2) \ + movdqa LCASE_MIN_reg, %xmm7; \ + movdqa LCASE_MIN_reg, %xmm8; \ + paddb reg1, %xmm7; \ +@@ -214,15 +184,15 @@ LABEL(case_add): + pandn CASE_ADD_reg, %xmm8; \ + paddb %xmm7, reg1; \ + paddb %xmm8, reg2 +-# endif ++ + TOLOWER (%xmm1, %xmm2) + #else + # define TOLOWER(reg1, reg2) + #endif +- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */ +- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? 
*/ +- pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */ +- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ ++ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ ++ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ ++ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ ++ psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes)/* If not, find different value or null char */ +@@ -246,7 +216,7 @@ LABEL(crosscache): + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ +- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */ ++ pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) +@@ -260,7 +230,7 @@ LABEL(bigger): + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 +- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ ++ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + lea (%r10, %r9), %r10 + _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ + +@@ -273,15 +243,15 @@ LABEL(bigger): + LABEL(ashr_0): + + movdqa (%rsi), %xmm1 +- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ ++ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L +- pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */ ++ pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ + #else + movdqa (%rdi), %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */ ++ pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ + #endif +- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ ++ psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ +@@ -361,10 +331,10 @@ LABEL(ashr_0_exit_use): + */ + .p2align 4 + LABEL(ashr_1): +- pslldq $15, D(%xmm2) /* shift first string to align with second */ ++ pslldq $15, %xmm2 /* shift first string to align with second */ + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ +- psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/ ++ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ ++ psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ +@@ -392,7 +362,7 @@ LABEL(loop_ashr_1_use): + + LABEL(nibble_ashr_1_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $1, -16(%rdi, %rdx), D(%xmm0) ++ palignr $1, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -411,7 +381,7 @@ LABEL(nibble_ashr_1_restart_use): + jg LABEL(nibble_ashr_1_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $1, -16(%rdi, %rdx), D(%xmm0) ++ palignr $1, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -431,7 +401,7 @@ LABEL(nibble_ashr_1_restart_use): + LABEL(nibble_ashr_1_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $1, D(%xmm0) ++ psrldq $1, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -449,10 +419,10 @@ LABEL(nibble_ashr_1_use): + */ + .p2align 4 + 
LABEL(ashr_2): +- pslldq $14, D(%xmm2) ++ pslldq $14, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -480,7 +450,7 @@ LABEL(loop_ashr_2_use): + + LABEL(nibble_ashr_2_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $2, -16(%rdi, %rdx), D(%xmm0) ++ palignr $2, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -499,7 +469,7 @@ LABEL(nibble_ashr_2_restart_use): + jg LABEL(nibble_ashr_2_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $2, -16(%rdi, %rdx), D(%xmm0) ++ palignr $2, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -519,7 +489,7 @@ LABEL(nibble_ashr_2_restart_use): + LABEL(nibble_ashr_2_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $2, D(%xmm0) ++ psrldq $2, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -537,10 +507,10 @@ LABEL(nibble_ashr_2_use): + */ + .p2align 4 + LABEL(ashr_3): +- pslldq $13, D(%xmm2) ++ pslldq $13, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -568,7 +538,7 @@ LABEL(loop_ashr_3_use): + + LABEL(nibble_ashr_3_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $3, -16(%rdi, %rdx), D(%xmm0) ++ palignr $3, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -587,7 +557,7 @@ LABEL(nibble_ashr_3_restart_use): + jg LABEL(nibble_ashr_3_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $3, -16(%rdi, %rdx), D(%xmm0) ++ palignr $3, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -607,7 +577,7 @@ LABEL(nibble_ashr_3_restart_use): + LABEL(nibble_ashr_3_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $3, D(%xmm0) ++ psrldq $3, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -625,10 +595,10 @@ LABEL(nibble_ashr_3_use): + */ + .p2align 4 + LABEL(ashr_4): +- pslldq $12, D(%xmm2) ++ pslldq $12, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -657,7 +627,7 @@ LABEL(loop_ashr_4_use): + + LABEL(nibble_ashr_4_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $4, -16(%rdi, %rdx), D(%xmm0) ++ palignr $4, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -676,7 +646,7 @@ LABEL(nibble_ashr_4_restart_use): + jg LABEL(nibble_ashr_4_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $4, -16(%rdi, %rdx), D(%xmm0) ++ palignr $4, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -696,7 +666,7 @@ LABEL(nibble_ashr_4_restart_use): + LABEL(nibble_ashr_4_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $4, D(%xmm0) ++ psrldq $4, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ 
-714,10 +684,10 @@ LABEL(nibble_ashr_4_use): + */ + .p2align 4 + LABEL(ashr_5): +- pslldq $11, D(%xmm2) ++ pslldq $11, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -746,7 +716,7 @@ LABEL(loop_ashr_5_use): + + LABEL(nibble_ashr_5_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $5, -16(%rdi, %rdx), D(%xmm0) ++ palignr $5, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -766,7 +736,7 @@ LABEL(nibble_ashr_5_restart_use): + + movdqa (%rdi, %rdx), %xmm0 + +- palignr $5, -16(%rdi, %rdx), D(%xmm0) ++ palignr $5, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -786,7 +756,7 @@ LABEL(nibble_ashr_5_restart_use): + LABEL(nibble_ashr_5_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $5, D(%xmm0) ++ psrldq $5, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -804,10 +774,10 @@ LABEL(nibble_ashr_5_use): + */ + .p2align 4 + LABEL(ashr_6): +- pslldq $10, D(%xmm2) ++ pslldq $10, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -836,7 +806,7 @@ LABEL(loop_ashr_6_use): + + LABEL(nibble_ashr_6_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $6, -16(%rdi, %rdx), D(%xmm0) ++ palignr $6, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -855,7 +825,7 @@ LABEL(nibble_ashr_6_restart_use): + jg LABEL(nibble_ashr_6_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $6, -16(%rdi, %rdx), D(%xmm0) ++ palignr $6, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -875,7 +845,7 @@ LABEL(nibble_ashr_6_restart_use): + LABEL(nibble_ashr_6_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $6, D(%xmm0) ++ psrldq $6, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -893,10 +863,10 @@ LABEL(nibble_ashr_6_use): + */ + .p2align 4 + LABEL(ashr_7): +- pslldq $9, D(%xmm2) ++ pslldq $9, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -925,7 +895,7 @@ LABEL(loop_ashr_7_use): + + LABEL(nibble_ashr_7_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $7, -16(%rdi, %rdx), D(%xmm0) ++ palignr $7, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -944,7 +914,7 @@ LABEL(nibble_ashr_7_restart_use): + jg LABEL(nibble_ashr_7_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $7, -16(%rdi, %rdx), D(%xmm0) ++ palignr $7, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -964,7 +934,7 @@ LABEL(nibble_ashr_7_restart_use): + LABEL(nibble_ashr_7_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $7, D(%xmm0) ++ psrldq $7, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined 
USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -982,10 +952,10 @@ LABEL(nibble_ashr_7_use): + */ + .p2align 4 + LABEL(ashr_8): +- pslldq $8, D(%xmm2) ++ pslldq $8, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1014,7 +984,7 @@ LABEL(loop_ashr_8_use): + + LABEL(nibble_ashr_8_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $8, -16(%rdi, %rdx), D(%xmm0) ++ palignr $8, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1033,7 +1003,7 @@ LABEL(nibble_ashr_8_restart_use): + jg LABEL(nibble_ashr_8_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $8, -16(%rdi, %rdx), D(%xmm0) ++ palignr $8, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1053,7 +1023,7 @@ LABEL(nibble_ashr_8_restart_use): + LABEL(nibble_ashr_8_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $8, D(%xmm0) ++ psrldq $8, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1071,10 +1041,10 @@ LABEL(nibble_ashr_8_use): + */ + .p2align 4 + LABEL(ashr_9): +- pslldq $7, D(%xmm2) ++ pslldq $7, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1104,7 +1074,7 @@ LABEL(loop_ashr_9_use): + LABEL(nibble_ashr_9_restart_use): + movdqa (%rdi, %rdx), %xmm0 + +- palignr $9, -16(%rdi, %rdx), D(%xmm0) ++ palignr $9, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1123,7 +1093,7 @@ LABEL(nibble_ashr_9_restart_use): + jg LABEL(nibble_ashr_9_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $9, -16(%rdi, %rdx), D(%xmm0) ++ palignr $9, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1143,7 +1113,7 @@ LABEL(nibble_ashr_9_restart_use): + LABEL(nibble_ashr_9_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $9, D(%xmm0) ++ psrldq $9, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1161,10 +1131,10 @@ LABEL(nibble_ashr_9_use): + */ + .p2align 4 + LABEL(ashr_10): +- pslldq $6, D(%xmm2) ++ pslldq $6, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1193,7 +1163,7 @@ LABEL(loop_ashr_10_use): + + LABEL(nibble_ashr_10_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $10, -16(%rdi, %rdx), D(%xmm0) ++ palignr $10, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1212,7 +1182,7 @@ LABEL(nibble_ashr_10_restart_use): + jg LABEL(nibble_ashr_10_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $10, -16(%rdi, %rdx), D(%xmm0) ++ palignr $10, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1232,7 +1202,7 @@ LABEL(nibble_ashr_10_restart_use): + LABEL(nibble_ashr_10_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $10, 
D(%xmm0) ++ psrldq $10, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1250,10 +1220,10 @@ LABEL(nibble_ashr_10_use): + */ + .p2align 4 + LABEL(ashr_11): +- pslldq $5, D(%xmm2) ++ pslldq $5, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1282,7 +1252,7 @@ LABEL(loop_ashr_11_use): + + LABEL(nibble_ashr_11_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $11, -16(%rdi, %rdx), D(%xmm0) ++ palignr $11, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1301,7 +1271,7 @@ LABEL(nibble_ashr_11_restart_use): + jg LABEL(nibble_ashr_11_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $11, -16(%rdi, %rdx), D(%xmm0) ++ palignr $11, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1321,7 +1291,7 @@ LABEL(nibble_ashr_11_restart_use): + LABEL(nibble_ashr_11_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $11, D(%xmm0) ++ psrldq $11, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1339,10 +1309,10 @@ LABEL(nibble_ashr_11_use): + */ + .p2align 4 + LABEL(ashr_12): +- pslldq $4, D(%xmm2) ++ pslldq $4, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1371,7 +1341,7 @@ LABEL(loop_ashr_12_use): + + LABEL(nibble_ashr_12_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $12, -16(%rdi, %rdx), D(%xmm0) ++ palignr $12, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1390,7 +1360,7 @@ LABEL(nibble_ashr_12_restart_use): + jg LABEL(nibble_ashr_12_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $12, -16(%rdi, %rdx), D(%xmm0) ++ palignr $12, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1410,7 +1380,7 @@ LABEL(nibble_ashr_12_restart_use): + LABEL(nibble_ashr_12_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $12, D(%xmm0) ++ psrldq $12, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1428,10 +1398,10 @@ LABEL(nibble_ashr_12_use): + */ + .p2align 4 + LABEL(ashr_13): +- pslldq $3, D(%xmm2) ++ pslldq $3, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1461,7 +1431,7 @@ LABEL(loop_ashr_13_use): + + LABEL(nibble_ashr_13_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $13, -16(%rdi, %rdx), D(%xmm0) ++ palignr $13, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1480,7 +1450,7 @@ LABEL(nibble_ashr_13_restart_use): + jg LABEL(nibble_ashr_13_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $13, -16(%rdi, %rdx), D(%xmm0) ++ palignr $13, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1500,7 +1470,7 @@ 
LABEL(nibble_ashr_13_restart_use): + LABEL(nibble_ashr_13_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $13, D(%xmm0) ++ psrldq $13, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1518,10 +1488,10 @@ LABEL(nibble_ashr_13_use): + */ + .p2align 4 + LABEL(ashr_14): +- pslldq $2, D(%xmm2) ++ pslldq $2, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1551,7 +1521,7 @@ LABEL(loop_ashr_14_use): + + LABEL(nibble_ashr_14_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $14, -16(%rdi, %rdx), D(%xmm0) ++ palignr $14, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1570,7 +1540,7 @@ LABEL(nibble_ashr_14_restart_use): + jg LABEL(nibble_ashr_14_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $14, -16(%rdi, %rdx), D(%xmm0) ++ palignr $14, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1590,7 +1560,7 @@ LABEL(nibble_ashr_14_restart_use): + LABEL(nibble_ashr_14_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $14, D(%xmm0) ++ psrldq $14, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1608,10 +1578,10 @@ LABEL(nibble_ashr_14_use): + */ + .p2align 4 + LABEL(ashr_15): +- pslldq $1, D(%xmm2) ++ pslldq $1, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1643,7 +1613,7 @@ LABEL(loop_ashr_15_use): + + LABEL(nibble_ashr_15_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $15, -16(%rdi, %rdx), D(%xmm0) ++ palignr $15, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1662,7 +1632,7 @@ LABEL(nibble_ashr_15_restart_use): + jg LABEL(nibble_ashr_15_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $15, -16(%rdi, %rdx), D(%xmm0) ++ palignr $15, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1682,7 +1652,7 @@ LABEL(nibble_ashr_15_restart_use): + LABEL(nibble_ashr_15_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $15, D(%xmm0) ++ psrldq $15, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx.S b/sysdeps/x86_64/multiarch/strncase_l-avx.S +deleted file mode 100644 +index 0c4e525b..00000000 +--- a/sysdeps/x86_64/multiarch/strncase_l-avx.S ++++ /dev/null +@@ -1,22 +0,0 @@ +-/* strncasecmp_l optimized with AVX. +- Copyright (C) 2017-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. 
+-
+- The GNU C Library is distributed in the hope that it will be useful,
+- but WITHOUT ANY WARRANTY; without even the implied warranty of
+- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+- Lesser General Public License for more details.
+-
+- You should have received a copy of the GNU Lesser General Public
+- License along with the GNU C Library; if not, see
+- . */
+-
+-#define STRCMP_SSE42 __strncasecmp_l_avx
+-#define USE_AVX 1
+-#define USE_AS_STRNCASECMP_L
+-#include "strcmp-sse42.S"
+--
+GitLab
+
diff --git a/SOURCES/ia-rmv-weak_alias-memset-sse2.patch b/SOURCES/ia-rmv-weak_alias-memset-sse2.patch
new file mode 100644
index 0000000..64859a2
--- /dev/null
+++ b/SOURCES/ia-rmv-weak_alias-memset-sse2.patch
@@ -0,0 +1,37 @@
+From 4e487ee2fe85385052ac7b18d8e6686e79f78d14 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu"
+Date: Thu, 10 Feb 2022 11:52:50 -0800
+Subject: [PATCH] x86-64: Remove bzero weak alias in SSE2 memset
+
+commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
+Author: H.J. Lu
+Date: Mon Feb 7 05:55:15 2022 -0800
+
+ x86-64: Optimize bzero
+
+added the optimized bzero. Remove bzero weak alias in SSE2 memset to
+avoid undefined __bzero in memset-sse2-unaligned-erms.
+
+(cherry picked from commit 0fb8800029d230b3711bf722b2a47db92d0e273f)
+---
+ sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+index 8f579ad6..af51362b 100644
+--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+@@ -31,9 +31,7 @@
+ # endif
+
+ # undef weak_alias
+-# define weak_alias(original, alias) \
+- .weak bzero; bzero = __bzero
+-
++# define weak_alias(original, alias)
+ # undef strong_alias
+ # define strong_alias(ignored1, ignored2)
+ #endif
+--
+GitLab
+
diff --git a/SOURCES/ia-strcmp-avx2-fix.patch b/SOURCES/ia-strcmp-avx2-fix.patch
new file mode 100644
index 0000000..d256c37
--- /dev/null
+++ b/SOURCES/ia-strcmp-avx2-fix.patch
@@ -0,0 +1,34 @@
+From d7ca99114ba67e94c99aac230ac025720cf30918 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu"
+Date: Fri, 4 Feb 2022 11:09:10 -0800
+Subject: [PATCH] x86-64: Fix strcmp-avx2.S
+
+Change "movl %edx, %rdx" to "movl %edx, %edx" in:
+
+commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45
+Author: Noah Goldstein
+Date: Mon Jan 10 15:35:38 2022 -0600
+
+ x86: Optimize strcmp-avx2.S
+
+(cherry picked from commit c15efd011cea3d8f0494269eb539583215a1feed)
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 554ffe4c..04675aa4 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -106,7 +106,7 @@ ENTRY(STRCMP)
+ # ifdef USE_AS_STRNCMP
+ # ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+- movl %edx, %rdx
++ movl %edx, %edx
+ # endif
+ cmp $1, %RDX_LP
+ /* Signed comparison intentional. We use this branch to also
+--
+GitLab
+
diff --git a/SOURCES/ia-strcmp-evex-fix.patch b/SOURCES/ia-strcmp-evex-fix.patch
new file mode 100644
index 0000000..66e55cc
--- /dev/null
+++ b/SOURCES/ia-strcmp-evex-fix.patch
@@ -0,0 +1,34 @@
+From f9a6857a55fb5586d4c34baa46700d8428e02273 Mon Sep 17 00:00:00 2001
+From: "H.J. 
Lu" +Date: Fri, 4 Feb 2022 11:11:08 -0800 +Subject: [PATCH] x86-64: Fix strcmp-evex.S + +Change "movl %edx, %rdx" to "movl %edx, %edx" in: + +commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9 +Author: Noah Goldstein +Date: Mon Jan 10 15:35:39 2022 -0600 + + x86: Optimize strcmp-evex.S + +(cherry picked from commit 0e0199a9e02ebe42e2b36958964d63f03573c382) +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 99d8409a..ed56af8e 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -116,7 +116,7 @@ ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ +- movl %edx, %rdx ++ movl %edx, %edx + # endif + cmp $1, %RDX_LP + /* Signed comparison intentional. We use this branch to also +-- +GitLab + diff --git a/SOURCES/ia-test_name-string-tst-strncmp-rtm.patch b/SOURCES/ia-test_name-string-tst-strncmp-rtm.patch new file mode 100644 index 0000000..0e8df73 --- /dev/null +++ b/SOURCES/ia-test_name-string-tst-strncmp-rtm.patch @@ -0,0 +1,37 @@ +From 985c2ec17c739b506311000a3932a8f8491b4001 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 18 Feb 2022 17:00:25 -0600 +Subject: [PATCH] x86: Fix TEST_NAME to make it a string in tst-strncmp-rtm.c + +Previously TEST_NAME was passing a function pointer. This didn't fail +because of the -Wno-error flag (to allow for overflow sizes passed +to strncmp/wcsncmp) + +Reviewed-by: H.J. Lu +(cherry picked from commit b98d0bbf747f39770e0caba7e984ce9f8f900330) +--- + sysdeps/x86/tst-strncmp-rtm.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index 4e9f094f..aef9866c 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -23,12 +23,12 @@ + # define CHAR wchar_t + # define MEMSET wmemset + # define STRNCMP wcsncmp +-# define TEST_NAME wcsncmp ++# define TEST_NAME "wcsncmp" + #else /* !WIDE */ + # define CHAR char + # define MEMSET memset + # define STRNCMP strncmp +-# define TEST_NAME strncmp ++# define TEST_NAME "strncmp" + #endif /* !WIDE */ + + +-- +GitLab + diff --git a/SPECS/glibc.spec b/SPECS/glibc.spec index bd945a7..436d4d7 100644 --- a/SPECS/glibc.spec +++ b/SPECS/glibc.spec @@ -1,6 +1,6 @@ %define glibcsrcdir glibc-2.28 %define glibcversion 2.28 -%define glibcrelease 204%{?dist} +%define glibcrelease 205%{?dist} # Pre-release tarballs are pulled in from git using a command that is # effectively: # @@ -975,7 +975,43 @@ Patch10073: glibc-sw28646.patch Patch10074: ia-no-index_arch_prefer_no_avx512-avx-vnni.patch Patch10075: ia-opt-less_vec-memcmp-evex-movb.patch Patch10076: glibc-sw28537-4.patch - +Patch10077: glibc-sw28896-2.patch +Patch10078: ia-test_name-string-tst-strncmp-rtm.patch +Patch10079: ia-opt-strcmp-avx2.patch +Patch10080: ia-opt-strcmp-evex.patch +Patch10081: ia-strcmp-avx2-fix.patch +Patch10082: ia-strcmp-evex-fix.patch +Patch10083: ia-imp-vec_gen-memset-vec-unaligned-erms.patch +Patch10084: ia-rmv-ssse3_inst-memset.patch +Patch10085: ia-opt-bzero.patch +Patch10086: ia-rmv-set-memset-vec-unaligned-erms.patch +Patch10087: glibc-sw28895.patch +Patch10088: glibc-sw28896-3.patch +Patch10089: ia-imp-l.patch +Patch10090: ia-march-srt-sysdep_routines.patch +Patch10091: ia-rmv-weak_alias-memset-sse2.patch +Patch10092: ia-rmv-bcopy-opt.patch +Patch10093: ia-code_cleanup-strchr-avx2.patch +Patch10094: 
ia-code_cleanup-strchr-evex.patch +Patch10095: ia-opt-strcspn_strpbrk-strcspn-c.patch +Patch10096: ia-opt-strspn-strspn-c.patch +Patch10097: ia-rmv-strcspn-sse2.patch +Patch10098: ia-rmv-strpbrk-sse2.patch +Patch10099: ia-rmv-strspn-sse2.patch +Patch10100: ia-opt-strxcasecmp-srtcmp.patch +Patch10101: ia-opt-strxcasecmp-srtcmp-sse42.patch +Patch10102: ia-opt-strxcasecmp-avx2.patch +Patch10103: ia-opt-strxcasecmp-evex.patch +Patch10104: ia-rmv-strxcasecmp-avx.patch +Patch10105: ia-imp-wcslen.patch +Patch10106: ia-rmv-memcmp-sse4.patch +Patch10107: ia-code_cleanup-memcmp-avx2-movbe.patch +Patch10108: ia-opt-str-wcs_rchr-sse2.patch +Patch10109: ia-opt-str-wcs_rchr-avx2.patch +Patch10110: ia-opt-str-wcs_rchr-evex.patch +Patch10111: ia-add-fast-jitter.patch +Patch10112: ia-add-backoff-spinlock.patch +Patch10113: glibc-sw29127.patch ############################################################################## # Continued list of core "glibc" package information: @@ -2806,8 +2842,11 @@ fi %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared %changelog +* Thu Jun 09 2022 Ali Erdinc Koroglu - 2.28.205 +- Intel glibc optimizations + * Fri May 20 2022 Ali Erdinc Koroglu - 2.28.204 -- Intel architecture optimizations +- Intel glibc optimizations * Tue May 17 2022 Patsy Griffin - 2.28-203 - 390x: Add support for IBM z16. (#2077835)