diff --git a/SOURCES/glibc-rh1961109.patch b/SOURCES/glibc-rh1961109.patch
new file mode 100644
index 0000000..e47c4d7
--- /dev/null
+++ b/SOURCES/glibc-rh1961109.patch
@@ -0,0 +1,165 @@
+commit f17164bd51db31f47fbbdae826c63b6d78184c45
+Author: Florian Weimer <fweimer@redhat.com>
+Date:   Tue May 18 07:21:33 2021 +0200
+
+    localedata: Use U+00AF MACRON in more EBCDIC charsets [BZ #27882]
+    
+    This updates IBM256, IBM277, IBM278, IBM280, IBM284, IBM297, IBM424
+    in the same way that IBM273 was updated for bug 23290.
+    
+    IBM256 and IBM424 still have holes after this change, so HAS_HOLES
+    is not updated.
+    
+    Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
+
+diff --git a/iconvdata/ibm277.c b/iconvdata/ibm277.c
+index f93ca2acb8718dd5..0e337dbbdc06a02f 100644
+--- a/iconvdata/ibm277.c
++++ b/iconvdata/ibm277.c
+@@ -23,6 +23,6 @@
+ #define TABLES <ibm277.h>
+ 
+ #define CHARSET_NAME	"IBM277//"
+-#define HAS_HOLES	1	/* Not all 256 character are defined.  */
++#define HAS_HOLES	0
+ 
+ #include <8bit-gap.c>
+diff --git a/iconvdata/ibm278.c b/iconvdata/ibm278.c
+index 4263000760472913..7450fb8e5b846101 100644
+--- a/iconvdata/ibm278.c
++++ b/iconvdata/ibm278.c
+@@ -23,6 +23,6 @@
+ #define TABLES <ibm278.h>
+ 
+ #define CHARSET_NAME	"IBM278//"
+-#define HAS_HOLES	1	/* Not all 256 character are defined.  */
++#define HAS_HOLES	0
+ 
+ #include <8bit-gap.c>
+diff --git a/iconvdata/ibm280.c b/iconvdata/ibm280.c
+index 3efddd7dec2728d9..2ea5478e4e0d7007 100644
+--- a/iconvdata/ibm280.c
++++ b/iconvdata/ibm280.c
+@@ -23,6 +23,6 @@
+ #define TABLES <ibm280.h>
+ 
+ #define CHARSET_NAME	"IBM280//"
+-#define HAS_HOLES	1	/* Not all 256 character are defined.  */
++#define HAS_HOLES	0
+ 
+ #include <8bit-gap.c>
+diff --git a/iconvdata/ibm284.c b/iconvdata/ibm284.c
+index 57dab27d0cec4a33..8dbbc6344d18528f 100644
+--- a/iconvdata/ibm284.c
++++ b/iconvdata/ibm284.c
+@@ -23,6 +23,6 @@
+ #define TABLES <ibm284.h>
+ 
+ #define CHARSET_NAME	"IBM284//"
+-#define HAS_HOLES	1	/* Not all 256 character are defined.  */
++#define HAS_HOLES	0
+ 
+ #include <8bit-gap.c>
+diff --git a/iconvdata/ibm297.c b/iconvdata/ibm297.c
+index f355659afd4b4502..81e63ba1f28f1548 100644
+--- a/iconvdata/ibm297.c
++++ b/iconvdata/ibm297.c
+@@ -23,6 +23,6 @@
+ #define TABLES <ibm297.h>
+ 
+ #define CHARSET_NAME	"IBM297//"
+-#define HAS_HOLES	1	/* Not all 256 character are defined.  */
++#define HAS_HOLES	0
+ 
+ #include <8bit-gap.c>
+diff --git a/localedata/charmaps/IBM256 b/localedata/charmaps/IBM256
+index 5cfd2db5f436cd07..bdc1abf0ade3bfc4 100644
+--- a/localedata/charmaps/IBM256
++++ b/localedata/charmaps/IBM256
+@@ -194,7 +194,7 @@ CHARMAP
+ <U00BE>     /xb9         VULGAR FRACTION THREE QUARTERS
+ <U00AC>     /xba         NOT SIGN
+ <U007C>     /xbb         VERTICAL LINE
+-<U203E>     /xbc         OVERLINE
++<U00AF>     /xbc         MACRON
+ <U00A8>     /xbd         DIAERESIS
+ <U00B4>     /xbe         ACUTE ACCENT
+ <U2017>     /xbf         DOUBLE LOW LINE
+diff --git a/localedata/charmaps/IBM277 b/localedata/charmaps/IBM277
+index 1c0b5cb9fb659364..2f6e3992109a2b33 100644
+--- a/localedata/charmaps/IBM277
++++ b/localedata/charmaps/IBM277
+@@ -195,7 +195,7 @@ CHARMAP
+ <U00BE>     /xb9         VULGAR FRACTION THREE QUARTERS
+ <U00AC>     /xba         NOT SIGN
+ <U007C>     /xbb         VERTICAL LINE
+-<U203E>     /xbc         OVERLINE
++<U00AF>     /xbc         MACRON
+ <U00A8>     /xbd         DIAERESIS
+ <U00B4>     /xbe         ACUTE ACCENT
+ <U00D7>     /xbf         MULTIPLICATION SIGN
+diff --git a/localedata/charmaps/IBM278 b/localedata/charmaps/IBM278
+index 646961501c74c4df..bdfae7621028f003 100644
+--- a/localedata/charmaps/IBM278
++++ b/localedata/charmaps/IBM278
+@@ -196,7 +196,7 @@ CHARMAP
+ <U00BE>     /xb9         VULGAR FRACTION THREE QUARTERS
+ <U00AC>     /xba         NOT SIGN
+ <U007C>     /xbb         VERTICAL LINE
+-<U203E>     /xbc         OVERLINE
++<U00AF>     /xbc         MACRON
+ <U00A8>     /xbd         DIAERESIS
+ <U00B4>     /xbe         ACUTE ACCENT
+ <U00D7>     /xbf         MULTIPLICATION SIGN
+diff --git a/localedata/charmaps/IBM280 b/localedata/charmaps/IBM280
+index 5de3b3e7b96796c0..4c31242806b0ac19 100644
+--- a/localedata/charmaps/IBM280
++++ b/localedata/charmaps/IBM280
+@@ -195,7 +195,7 @@ CHARMAP
+ <U00BE>     /xb9         VULGAR FRACTION THREE QUARTERS
+ <U00AC>     /xba         NOT SIGN
+ <U007C>     /xbb         VERTICAL LINE
+-<U203E>     /xbc         OVERLINE
++<U00AF>     /xbc         MACRON
+ <U00A8>     /xbd         DIAERESIS
+ <U00B4>     /xbe         ACUTE ACCENT
+ <U00D7>     /xbf         MULTIPLICATION SIGN
+diff --git a/localedata/charmaps/IBM284 b/localedata/charmaps/IBM284
+index c64b2a65ab748540..46a8737a715e4e56 100644
+--- a/localedata/charmaps/IBM284
++++ b/localedata/charmaps/IBM284
+@@ -195,7 +195,7 @@ CHARMAP
+ <U00BE>     /xb9         VULGAR FRACTION THREE QUARTERS
+ <U005E>     /xba         CIRCUMFLEX ACCENT
+ <U0021>     /xbb         EXCLAMATION MARK
+-<U203E>     /xbc         OVERLINE
++<U00AF>     /xbc         MACRON
+ <U007E>     /xbd         TILDE
+ <U00B4>     /xbe         ACUTE ACCENT
+ <U00D7>     /xbf         MULTIPLICATION SIGN
+diff --git a/localedata/charmaps/IBM297 b/localedata/charmaps/IBM297
+index 33b74eee437241aa..14361ad418cf1bc7 100644
+--- a/localedata/charmaps/IBM297
++++ b/localedata/charmaps/IBM297
+@@ -195,7 +195,7 @@ CHARMAP
+ <U00BE>     /xb9         VULGAR FRACTION THREE QUARTERS
+ <U00AC>     /xba         NOT SIGN
+ <U007C>     /xbb         VERTICAL LINE
+-<U203E>     /xbc         OVERLINE
++<U00AF>     /xbc         MACRON
+ <U007E>     /xbd         TILDE
+ <U00B4>     /xbe         ACUTE ACCENT
+ <U00D7>     /xbf         MULTIPLICATION SIGN
+diff --git a/localedata/charmaps/IBM424 b/localedata/charmaps/IBM424
+index 883e43b8ae04ee4c..deca11e1b18ec0a6 100644
+--- a/localedata/charmaps/IBM424
++++ b/localedata/charmaps/IBM424
+@@ -175,7 +175,7 @@ CHARMAP
+ <U00BE>     /xb9         VULGAR FRACTION THREE QUARTERS
+ <U005B>     /xba         LEFT SQUARE BRACKET
+ <U005D>     /xbb         RIGHT SQUARE BRACKET
+-<U203E>     /xbc         OVERLINE
++<U00AF>     /xbc         MACRON
+ <U00A8>     /xbd         DIAERESIS
+ <U00B4>     /xbe         ACUTE ACCENT
+ <U00D7>     /xbf         MULTIPLICATION SIGN
diff --git a/SOURCES/glibc-rh1982608.patch b/SOURCES/glibc-rh1982608.patch
new file mode 100644
index 0000000..6f67ed6
--- /dev/null
+++ b/SOURCES/glibc-rh1982608.patch
@@ -0,0 +1,2216 @@
+This is a rebase of posix/glob.c from upstream (gnulib->glibc->rhel).
+
+Relevant upstream commits:
+
+7c477b57a31487eda516db02b9e04f22d1a6e6af posix/glob.c: update from gnulib
+  (This is the master commit to which we're syncing)
+
+gnulib commit 98f034a0c2ba8917c96f363de1a8d66244e411da
+  (This is the gnulib commit to which glibc upstream synced)
+
+Additional glibc upstream commits of note:
+84f7ce84474c1648ce96884f1c91ca7b97ca3fc2 posix: Add glob64 with 64-bit time_t support
+  (just posix/glob.c and sysdeps/gnu/glob64-lstat-compat.c)
+9a7ab0769b295cbf5232140401742a8f34bda3de hurd: Fix glob lstat compatibility
+4883360415f1ed772ba44decc501d59deb17bdf0 posix: Sync glob code with gnulib
+04986243d1af37ac0177ed2f9db0a066ebd2b212 Remove internal usage of extensible stat functions
+ddc650e9b3dc916eab417ce9f79e67337b05035c Fix use-after-free in glob when expanding ~user (bug 25414)
+
+
+diff -rup a/posix/glob-lstat-compat.c b/posix/glob-lstat-compat.c
+--- a/posix/glob-lstat-compat.c	2018-08-01 01:10:47.000000000 -0400
++++ b/posix/glob-lstat-compat.c	2022-05-02 22:49:06.504676711 -0400
+@@ -28,7 +28,8 @@
+ # define GLOB_ATTRIBUTE attribute_compat_text_section
+ 
+ /* Avoid calling gl_lstat with GLOB_ALTDIRFUNC.  */
+-# define GLOB_NO_LSTAT
++# define GLOB_LSTAT   gl_stat
++# define GLOB_LSTAT64 __stat64
+ 
+ # include <posix/glob.c>
+ 
+diff -rup a/posix/glob.c b/posix/glob.c
+--- a/posix/glob.c	2022-05-03 14:37:52.959042051 -0400
++++ b/posix/glob.c	2022-05-02 22:49:18.655134696 -0400
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 1991-2018 Free Software Foundation, Inc.
++/* Copyright (C) 1991-2022 Free Software Foundation, Inc.
+    This file is part of the GNU C Library.
+ 
+    The GNU C Library is free software; you can redistribute it and/or
+@@ -13,11 +13,22 @@
+ 
+    You should have received a copy of the GNU Lesser General Public
+    License along with the GNU C Library; if not, see
+-   <http://www.gnu.org/licenses/>.  */
++   <https://www.gnu.org/licenses/>.  */
++
++#ifndef _LIBC
++
++/* Don't use __attribute__ __nonnull__ in this compilation unit.  Otherwise gcc
++   optimizes away the pattern == NULL test below.  */
++# define _GL_ARG_NONNULL(params)
++
++# include <libc-config.h>
++
++#endif
+ 
+ #include <glob.h>
+ 
+ #include <errno.h>
++#include <fcntl.h>
+ #include <sys/types.h>
+ #include <sys/stat.h>
+ #include <stdbool.h>
+@@ -26,7 +37,7 @@
+ #include <assert.h>
+ #include <unistd.h>
+ 
+-#if (defined _WIN32 || defined __WIN32__) && ! defined __CYGWIN__
++#if defined _WIN32 && ! defined __CYGWIN__
+ # define WINDOWS32
+ #endif
+ 
+@@ -46,30 +57,38 @@
+ # define sysconf(id) __sysconf (id)
+ # define closedir(dir) __closedir (dir)
+ # define opendir(name) __opendir (name)
++# undef dirfd
++# define dirfd(str) __dirfd (str)
+ # define readdir(str) __readdir64 (str)
+ # define getpwnam_r(name, bufp, buf, len, res) \
+     __getpwnam_r (name, bufp, buf, len, res)
+-# ifndef __lstat64
+-#  define __lstat64(fname, buf) __lxstat64 (_STAT_VER, fname, buf)
++# define FLEXIBLE_ARRAY_MEMBER
++# ifndef struct_stat
++#  define struct_stat           struct stat
+ # endif
+-# ifndef __stat64
+-#  define __stat64(fname, buf) __xstat64 (_STAT_VER, fname, buf)
++# ifndef struct_stat64
++#  define struct_stat64         struct stat64
++# endif
++# ifndef GLOB_LSTAT
++#  define GLOB_LSTAT            gl_lstat
++# endif
++# ifndef GLOB_FSTATAT64
++#  define GLOB_FSTATAT64        __fstatat64
+ # endif
+-# define struct_stat64		struct stat64
+-# define FLEXIBLE_ARRAY_MEMBER
+ # include <shlib-compat.h>
+ #else /* !_LIBC */
+ # define __glob                 glob
+ # define __getlogin_r(buf, len) getlogin_r (buf, len)
+-# define __lstat64(fname, buf)  lstat (fname, buf)
+-# define __stat64(fname, buf)   stat (fname, buf)
+ # define __fxstatat64(_, d, f, st, flag) fstatat (d, f, st, flag)
+-# define struct_stat64          struct stat
+ # ifndef __MVS__
+ #  define __alloca              alloca
+ # endif
+ # define __readdir              readdir
+ # define COMPILE_GLOB64
++# define struct_stat            struct stat
++# define struct_stat64          struct stat
++# define GLOB_LSTAT             gl_lstat
++# define GLOB_FSTATAT64         fstatat
+ #endif /* _LIBC */
+ 
+ #include <fnmatch.h>
+@@ -80,7 +99,9 @@
+ 
+ static const char *next_brace_sub (const char *begin, int flags) __THROWNL;
+ 
+-typedef uint_fast8_t dirent_type;
++/* The type of ((struct dirent *) 0)->d_type is 'unsigned char' on most
++   platforms, but 'unsigned int' in the mingw from mingw.org.  */
++typedef uint_fast32_t dirent_type;
+ 
+ #if !defined _LIBC && !defined HAVE_STRUCT_DIRENT_D_TYPE
+ /* Any distinct values will do here.
+@@ -119,9 +140,9 @@ readdir_result_type (struct readdir_resu
+ /* Construct an initializer for a struct readdir_result object from a
+    struct dirent *.  No copy of the name is made.  */
+ #define READDIR_RESULT_INITIALIZER(source) \
+-  {					   \
+-    source->d_name,			   \
+-    D_TYPE_TO_RESULT (source)		   \
++  {                                        \
++    source->d_name,                        \
++    D_TYPE_TO_RESULT (source)              \
+   }
+ 
+ /* Call gl_readdir on STREAM.  This macro can be overridden to reduce
+@@ -186,22 +207,15 @@ glob_lstat (glob_t *pglob, int flags, co
+ {
+ /* Use on glob-lstat-compat.c to provide a compat symbol which does not
+    use lstat / gl_lstat.  */
+-#ifdef GLOB_NO_LSTAT
+-# define GL_LSTAT gl_stat
+-# define LSTAT64 __stat64
+-#else
+-# define GL_LSTAT gl_lstat
+-# define LSTAT64 __lstat64
+-#endif
+-
+   union
+   {
+-    struct stat st;
++    struct_stat st;
+     struct_stat64 st64;
+   } ust;
+   return (__glibc_unlikely (flags & GLOB_ALTDIRFUNC)
+-          ? pglob->GL_LSTAT (fullname, &ust.st)
+-          : LSTAT64 (fullname, &ust.st64));
++          ? pglob->GLOB_LSTAT (fullname, &ust.st)
++          : GLOB_FSTATAT64 (AT_FDCWD, fullname, &ust.st64,
++                            AT_SYMLINK_NOFOLLOW));
+ }
+ 
+ /* Set *R = A + B.  Return true if the answer is mathematically
+@@ -211,7 +225,7 @@ glob_lstat (glob_t *pglob, int flags, co
+ static bool
+ size_add_wrapv (size_t a, size_t b, size_t *r)
+ {
+-#if 5 <= __GNUC__ && !defined __ICC
++#if 7 <= __GNUC__ && !defined __ICC
+   return __builtin_add_overflow (a, b, r);
+ #else
+   *r = a + b;
+@@ -228,8 +242,8 @@ glob_use_alloca (size_t alloca_used, siz
+ }
+ 
+ static int glob_in_dir (const char *pattern, const char *directory,
+-			int flags, int (*errfunc) (const char *, int),
+-			glob_t *pglob, size_t alloca_used);
++                        int flags, int (*errfunc) (const char *, int),
++                        glob_t *pglob, size_t alloca_used);
+ static int prefix_array (const char *prefix, char **array, size_t n) __THROWNL;
+ static int collated_compare (const void *, const void *) __THROWNL;
+ 
+@@ -239,11 +253,12 @@ static int collated_compare (const void
+ static bool
+ is_dir (char const *filename, int flags, glob_t const *pglob)
+ {
+-  struct stat st;
++  struct_stat st;
+   struct_stat64 st64;
+   return (__glibc_unlikely (flags & GLOB_ALTDIRFUNC)
+           ? pglob->gl_stat (filename, &st) == 0 && S_ISDIR (st.st_mode)
+-          : __stat64 (filename, &st64) == 0 && S_ISDIR (st64.st_mode));
++          : (GLOB_FSTATAT64 (AT_FDCWD, filename, &st64, 0) == 0
++             && S_ISDIR (st64.st_mode)));
+ }
+ 
+ /* Find the end of the sub-pattern in a brace expression.  */
+@@ -254,17 +269,17 @@ next_brace_sub (const char *cp, int flag
+   while (*cp != '\0')
+     if ((flags & GLOB_NOESCAPE) == 0 && *cp == '\\')
+       {
+-	if (*++cp == '\0')
+-	  break;
+-	++cp;
++        if (*++cp == '\0')
++          break;
++        ++cp;
+       }
+     else
+       {
+-	if ((*cp == '}' && depth-- == 0) || (*cp == ',' && depth == 0))
+-	  break;
++        if ((*cp == '}' && depth-- == 0) || (*cp == ',' && depth == 0))
++          break;
+ 
+-	if (*cp++ == '{')
+-	  depth++;
++        if (*cp++ == '{')
++          depth++;
+       }
+ 
+   return *cp != '\0' ? cp : NULL;
+@@ -285,7 +300,7 @@ next_brace_sub (const char *cp, int flag
+ int
+ GLOB_ATTRIBUTE
+ __glob (const char *pattern, int flags, int (*errfunc) (const char *, int),
+-	glob_t *pglob)
++        glob_t *pglob)
+ {
+   const char *filename;
+   char *dirname = NULL;
+@@ -319,22 +334,22 @@ __glob (const char *pattern, int flags,
+     {
+       pglob->gl_pathc = 0;
+       if (!(flags & GLOB_DOOFFS))
+-	pglob->gl_pathv = NULL;
++        pglob->gl_pathv = NULL;
+       else
+-	{
+-	  size_t i;
++        {
++          size_t i;
+ 
+-	  if (pglob->gl_offs >= ~((size_t) 0) / sizeof (char *))
+-	    return GLOB_NOSPACE;
++          if (pglob->gl_offs >= ~((size_t) 0) / sizeof (char *))
++            return GLOB_NOSPACE;
+ 
+-	  pglob->gl_pathv = (char **) malloc ((pglob->gl_offs + 1)
+-					      * sizeof (char *));
+-	  if (pglob->gl_pathv == NULL)
+-	    return GLOB_NOSPACE;
+-
+-	  for (i = 0; i <= pglob->gl_offs; ++i)
+-	    pglob->gl_pathv[i] = NULL;
+-	}
++          pglob->gl_pathv = (char **) malloc ((pglob->gl_offs + 1)
++                                              * sizeof (char *));
++          if (pglob->gl_pathv == NULL)
++            return GLOB_NOSPACE;
++
++          for (i = 0; i <= pglob->gl_offs; ++i)
++            pglob->gl_pathv[i] = NULL;
++        }
+     }
+ 
+   if (flags & GLOB_BRACE)
+@@ -342,129 +357,129 @@ __glob (const char *pattern, int flags,
+       const char *begin;
+ 
+       if (flags & GLOB_NOESCAPE)
+-	begin = strchr (pattern, '{');
++        begin = strchr (pattern, '{');
+       else
+-	{
+-	  begin = pattern;
+-	  while (1)
+-	    {
+-	      if (*begin == '\0')
+-		{
+-		  begin = NULL;
+-		  break;
+-		}
+-
+-	      if (*begin == '\\' && begin[1] != '\0')
+-		++begin;
+-	      else if (*begin == '{')
+-		break;
+-
+-	      ++begin;
+-	    }
+-	}
++        {
++          begin = pattern;
++          while (1)
++            {
++              if (*begin == '\0')
++                {
++                  begin = NULL;
++                  break;
++                }
++
++              if (*begin == '\\' && begin[1] != '\0')
++                ++begin;
++              else if (*begin == '{')
++                break;
++
++              ++begin;
++            }
++        }
+ 
+       if (begin != NULL)
+-	{
+-	  /* Allocate working buffer large enough for our work.  Note that
+-	    we have at least an opening and closing brace.  */
+-	  size_t firstc;
+-	  char *alt_start;
+-	  const char *p;
+-	  const char *next;
+-	  const char *rest;
+-	  size_t rest_len;
+-	  char *onealt;
+-	  size_t pattern_len = strlen (pattern) - 1;
+-	  int alloca_onealt = glob_use_alloca (alloca_used, pattern_len);
+-	  if (alloca_onealt)
+-	    onealt = alloca_account (pattern_len, alloca_used);
+-	  else
+-	    {
+-	      onealt = malloc (pattern_len);
+-	      if (onealt == NULL)
+-		return GLOB_NOSPACE;
+-	    }
+-
+-	  /* We know the prefix for all sub-patterns.  */
+-	  alt_start = mempcpy (onealt, pattern, begin - pattern);
+-
+-	  /* Find the first sub-pattern and at the same time find the
+-	     rest after the closing brace.  */
+-	  next = next_brace_sub (begin + 1, flags);
+-	  if (next == NULL)
+-	    {
+-	      /* It is an invalid expression.  */
+-	    illegal_brace:
+-	      if (__glibc_unlikely (!alloca_onealt))
+-		free (onealt);
+-	      flags &= ~GLOB_BRACE;
+-	      goto no_brace;
+-	    }
+-
+-	  /* Now find the end of the whole brace expression.  */
+-	  rest = next;
+-	  while (*rest != '}')
+-	    {
+-	      rest = next_brace_sub (rest + 1, flags);
+-	      if (rest == NULL)
+-		/* It is an illegal expression.  */
+-		goto illegal_brace;
+-	    }
+-	  /* Please note that we now can be sure the brace expression
+-	     is well-formed.  */
+-	  rest_len = strlen (++rest) + 1;
+-
+-	  /* We have a brace expression.  BEGIN points to the opening {,
+-	     NEXT points past the terminator of the first element, and END
+-	     points past the final }.  We will accumulate result names from
+-	     recursive runs for each brace alternative in the buffer using
+-	     GLOB_APPEND.  */
+-	  firstc = pglob->gl_pathc;
+-
+-	  p = begin + 1;
+-	  while (1)
+-	    {
+-	      int result;
+-
+-	      /* Construct the new glob expression.  */
+-	      mempcpy (mempcpy (alt_start, p, next - p), rest, rest_len);
+-
+-	      result = __glob (onealt,
+-			       ((flags & ~(GLOB_NOCHECK | GLOB_NOMAGIC))
+-				| GLOB_APPEND),
+-			       errfunc, pglob);
+-
+-	      /* If we got an error, return it.  */
+-	      if (result && result != GLOB_NOMATCH)
+-		{
+-		  if (__glibc_unlikely (!alloca_onealt))
+-		    free (onealt);
+-		  if (!(flags & GLOB_APPEND))
+-		    {
+-		      globfree (pglob);
+-		      pglob->gl_pathc = 0;
+-		    }
+-		  return result;
+-		}
+-
+-	      if (*next == '}')
+-		/* We saw the last entry.  */
+-		break;
+-
+-	      p = next + 1;
+-	      next = next_brace_sub (p, flags);
+-	      assert (next != NULL);
+-	    }
+-
+-	  if (__glibc_unlikely (!alloca_onealt))
+-	    free (onealt);
+-
+-	  if (pglob->gl_pathc != firstc)
+-	    /* We found some entries.  */
+-	    return 0;
+-	  else if (!(flags & (GLOB_NOCHECK|GLOB_NOMAGIC)))
+-	    return GLOB_NOMATCH;
+-	}
++        {
++          /* Allocate working buffer large enough for our work.  Note that
++             we have at least an opening and closing brace.  */
++          size_t firstc;
++          char *alt_start;
++          const char *p;
++          const char *next;
++          const char *rest;
++          size_t rest_len;
++          char *onealt;
++          size_t pattern_len = strlen (pattern) - 1;
++          int alloca_onealt = glob_use_alloca (alloca_used, pattern_len);
++          if (alloca_onealt)
++            onealt = alloca_account (pattern_len, alloca_used);
++          else
++            {
++              onealt = malloc (pattern_len);
++              if (onealt == NULL)
++                return GLOB_NOSPACE;
++            }
++
++          /* We know the prefix for all sub-patterns.  */
++          alt_start = mempcpy (onealt, pattern, begin - pattern);
++
++          /* Find the first sub-pattern and at the same time find the
++             rest after the closing brace.  */
++          next = next_brace_sub (begin + 1, flags);
++          if (next == NULL)
++            {
++              /* It is an invalid expression.  */
++            illegal_brace:
++              if (__glibc_unlikely (!alloca_onealt))
++                free (onealt);
++              flags &= ~GLOB_BRACE;
++              goto no_brace;
++            }
++
++          /* Now find the end of the whole brace expression.  */
++          rest = next;
++          while (*rest != '}')
++            {
++              rest = next_brace_sub (rest + 1, flags);
++              if (rest == NULL)
++                /* It is an illegal expression.  */
++                goto illegal_brace;
++            }
++          /* Please note that we now can be sure the brace expression
++             is well-formed.  */
++          rest_len = strlen (++rest) + 1;
++
++          /* We have a brace expression.  BEGIN points to the opening {,
++             NEXT points past the terminator of the first element, and END
++             points past the final }.  We will accumulate result names from
++             recursive runs for each brace alternative in the buffer using
++             GLOB_APPEND.  */
++          firstc = pglob->gl_pathc;
++
++          p = begin + 1;
++          while (1)
++            {
++              int result;
++
++              /* Construct the new glob expression.  */
++              mempcpy (mempcpy (alt_start, p, next - p), rest, rest_len);
++
++              result = __glob (onealt,
++                               ((flags & ~(GLOB_NOCHECK | GLOB_NOMAGIC))
++                                | GLOB_APPEND),
++                               errfunc, pglob);
++
++              /* If we got an error, return it.  */
++              if (result && result != GLOB_NOMATCH)
++                {
++                  if (__glibc_unlikely (!alloca_onealt))
++                    free (onealt);
++                  if (!(flags & GLOB_APPEND))
++                    {
++                      globfree (pglob);
++                      pglob->gl_pathc = 0;
++                    }
++                  return result;
++                }
++
++              if (*next == '}')
++                /* We saw the last entry.  */
++                break;
++
++              p = next + 1;
++              next = next_brace_sub (p, flags);
++              assert (next != NULL);
++            }
++
++          if (__glibc_unlikely (!alloca_onealt))
++            free (onealt);
++
++          if (pglob->gl_pathc != firstc)
++            /* We found some entries.  */
++            return 0;
++          else if (!(flags & (GLOB_NOCHECK|GLOB_NOMAGIC)))
++            return GLOB_NOMATCH;
++        }
+     }
+ 
+  no_brace:
+@@ -486,33 +501,33 @@ __glob (const char *pattern, int flags,
+   if (filename == NULL)
+     {
+       /* This can mean two things: a simple name or "~name".  The latter
+-	 case is nothing but a notation for a directory.  */
++         case is nothing but a notation for a directory.  */
+       if ((flags & (GLOB_TILDE|GLOB_TILDE_CHECK)) && pattern[0] == '~')
+-	{
+-	  dirname = (char *) pattern;
+-	  dirlen = strlen (pattern);
+-
+-	  /* Set FILENAME to NULL as a special flag.  This is ugly but
+-	     other solutions would require much more code.  We test for
+-	     this special case below.  */
+-	  filename = NULL;
+-	}
++        {
++          dirname = (char *) pattern;
++          dirlen = strlen (pattern);
++
++          /* Set FILENAME to NULL as a special flag.  This is ugly but
++             other solutions would require much more code.  We test for
++             this special case below.  */
++          filename = NULL;
++        }
+       else
+-	{
+-	  if (__glibc_unlikely (pattern[0] == '\0'))
+-	    {
+-	      dirs.gl_pathv = NULL;
+-	      goto no_matches;
+-	    }
+-
+-	  filename = pattern;
+-	  dirname = (char *) ".";
+-	  dirlen = 0;
+-	}
++        {
++          if (__glibc_unlikely (pattern[0] == '\0'))
++            {
++              dirs.gl_pathv = NULL;
++              goto no_matches;
++            }
++
++          filename = pattern;
++          dirname = (char *) ".";
++          dirlen = 0;
++        }
+     }
+   else if (filename == pattern
+-	   || (filename == pattern + 1 && pattern[0] == '\\'
+-	       && (flags & GLOB_NOESCAPE) == 0))
++           || (filename == pattern + 1 && pattern[0] == '\\'
++               && (flags & GLOB_NOESCAPE) == 0))
+     {
+       /* "/pattern" or "\\/pattern".  */
+       dirname = (char *) "/";
+@@ -525,32 +540,32 @@ __glob (const char *pattern, int flags,
+       dirlen = filename - pattern;
+ #if defined __MSDOS__ || defined WINDOWS32
+       if (*filename == ':'
+-	  || (filename > pattern + 1 && filename[-1] == ':'))
+-	{
+-	  char *drive_spec;
+-
+-	  ++dirlen;
+-	  drive_spec = __alloca (dirlen + 1);
+-	  *((char *) mempcpy (drive_spec, pattern, dirlen)) = '\0';
+-	  /* For now, disallow wildcards in the drive spec, to
+-	     prevent infinite recursion in glob.  */
+-	  if (__glob_pattern_p (drive_spec, !(flags & GLOB_NOESCAPE)))
+-	    return GLOB_NOMATCH;
+-	  /* If this is "d:pattern", we need to copy ':' to DIRNAME
+-	     as well.  If it's "d:/pattern", don't remove the slash
+-	     from "d:/", since "d:" and "d:/" are not the same.*/
+-	}
++          || (filename > pattern + 1 && filename[-1] == ':'))
++        {
++          char *drive_spec;
++
++          ++dirlen;
++          drive_spec = __alloca (dirlen + 1);
++          *((char *) mempcpy (drive_spec, pattern, dirlen)) = '\0';
++          /* For now, disallow wildcards in the drive spec, to
++             prevent infinite recursion in glob.  */
++          if (__glob_pattern_p (drive_spec, !(flags & GLOB_NOESCAPE)))
++            return GLOB_NOMATCH;
++          /* If this is "d:pattern", we need to copy ':' to DIRNAME
++             as well.  If it's "d:/pattern", don't remove the slash
++             from "d:/", since "d:" and "d:/" are not the same.*/
++        }
+ #endif
+ 
+       if (glob_use_alloca (alloca_used, dirlen + 1))
+-	newp = alloca_account (dirlen + 1, alloca_used);
++        newp = alloca_account (dirlen + 1, alloca_used);
+       else
+-	{
+-	  newp = malloc (dirlen + 1);
+-	  if (newp == NULL)
+-	    return GLOB_NOSPACE;
+-	  malloc_dirname = 1;
+-	}
++        {
++          newp = malloc (dirlen + 1);
++          if (newp == NULL)
++            return GLOB_NOSPACE;
++          malloc_dirname = 1;
++        }
+       *((char *) mempcpy (newp, pattern, dirlen)) = '\0';
+       dirname = newp;
+       ++filename;
+@@ -566,363 +581,383 @@ __glob (const char *pattern, int flags,
+ 
+       if (filename[0] == '\0' && dirlen > 1 && !drive_root)
+         /* "pattern/".  Expand "pattern", appending slashes.  */
+-	{
+-	  int orig_flags = flags;
+-	  if (!(flags & GLOB_NOESCAPE) && dirname[dirlen - 1] == '\\')
+-	    {
+-	      /* "pattern\\/".  Remove the final backslash if it hasn't
+-		 been quoted.  */
+-	      char *p = (char *) &dirname[dirlen - 1];
+-
+-	      while (p > dirname && p[-1] == '\\') --p;
+-	      if ((&dirname[dirlen] - p) & 1)
+-		{
+-		  *(char *) &dirname[--dirlen] = '\0';
+-		  flags &= ~(GLOB_NOCHECK | GLOB_NOMAGIC);
+-		}
+-	    }
+-	  int val = __glob (dirname, flags | GLOB_MARK, errfunc, pglob);
+-	  if (val == 0)
+-	    pglob->gl_flags = ((pglob->gl_flags & ~GLOB_MARK)
+-			       | (flags & GLOB_MARK));
+-	  else if (val == GLOB_NOMATCH && flags != orig_flags)
+-	    {
+-	      /* Make sure globfree (&dirs); is a nop.  */
+-	      dirs.gl_pathv = NULL;
+-	      flags = orig_flags;
+-	      oldcount = pglob->gl_pathc + pglob->gl_offs;
+-	      goto no_matches;
+-	    }
+-	  retval = val;
+-	  goto out;
+-	}
++        {
++          int orig_flags = flags;
++          if (!(flags & GLOB_NOESCAPE) && dirname[dirlen - 1] == '\\')
++            {
++              /* "pattern\\/".  Remove the final backslash if it hasn't
++                 been quoted.  */
++              char *p = (char *) &dirname[dirlen - 1];
++
++              while (p > dirname && p[-1] == '\\') --p;
++              if ((&dirname[dirlen] - p) & 1)
++                {
++                  *(char *) &dirname[--dirlen] = '\0';
++                  flags &= ~(GLOB_NOCHECK | GLOB_NOMAGIC);
++                }
++            }
++          int val = __glob (dirname, flags | GLOB_MARK, errfunc, pglob);
++          if (val == 0)
++            pglob->gl_flags = ((pglob->gl_flags & ~GLOB_MARK)
++                               | (flags & GLOB_MARK));
++          else if (val == GLOB_NOMATCH && flags != orig_flags)
++            {
++              /* Make sure globfree (&dirs); is a nop.  */
++              dirs.gl_pathv = NULL;
++              flags = orig_flags;
++              oldcount = pglob->gl_pathc + pglob->gl_offs;
++              goto no_matches;
++            }
++          retval = val;
++          goto out;
++        }
+     }
+ 
+   if ((flags & (GLOB_TILDE|GLOB_TILDE_CHECK)) && dirname[0] == '~')
+     {
+       if (dirname[1] == '\0' || dirname[1] == '/'
+-	  || (!(flags & GLOB_NOESCAPE) && dirname[1] == '\\'
+-	      && (dirname[2] == '\0' || dirname[2] == '/')))
+-	{
+-	  /* Look up home directory.  */
+-	  char *home_dir = getenv ("HOME");
+-	  int malloc_home_dir = 0;
+-	  if (home_dir == NULL || home_dir[0] == '\0')
+-	    {
++          || (!(flags & GLOB_NOESCAPE) && dirname[1] == '\\'
++              && (dirname[2] == '\0' || dirname[2] == '/')))
++        {
++          /* Look up home directory.  */
++          char *home_dir = getenv ("HOME");
++          int malloc_home_dir = 0;
++          if (home_dir == NULL || home_dir[0] == '\0')
++            {
+ #ifdef WINDOWS32
+-	      /* Windows NT defines HOMEDRIVE and HOMEPATH.  But give
+-		 preference to HOME, because the user can change HOME.  */
+-	      const char *home_drive = getenv ("HOMEDRIVE");
+-	      const char *home_path = getenv ("HOMEPATH");
+-
+-	      if (home_drive != NULL && home_path != NULL)
+-		{
+-		  size_t home_drive_len = strlen (home_drive);
+-		  size_t home_path_len = strlen (home_path);
+-		  char *mem = alloca (home_drive_len + home_path_len + 1);
+-
+-		  memcpy (mem, home_drive, home_drive_len);
+-		  memcpy (mem + home_drive_len, home_path, home_path_len + 1);
+-		  home_dir = mem;
+-		}
+-	      else
+-		home_dir = "c:/users/default"; /* poor default */
++              /* Windows NT defines HOMEDRIVE and HOMEPATH.  But give
++                 preference to HOME, because the user can change HOME.  */
++              const char *home_drive = getenv ("HOMEDRIVE");
++              const char *home_path = getenv ("HOMEPATH");
++
++              if (home_drive != NULL && home_path != NULL)
++                {
++                  size_t home_drive_len = strlen (home_drive);
++                  size_t home_path_len = strlen (home_path);
++                  char *mem = alloca (home_drive_len + home_path_len + 1);
++
++                  memcpy (mem, home_drive, home_drive_len);
++                  memcpy (mem + home_drive_len, home_path, home_path_len + 1);
++                  home_dir = mem;
++                }
++              else
++                home_dir = "c:/users/default"; /* poor default */
+ #else
+-	      int err;
+-	      struct passwd *p;
+-	      struct passwd pwbuf;
+-	      struct scratch_buffer s;
+-	      scratch_buffer_init (&s);
+-	      while (true)
+-		{
+-		  p = NULL;
+-		  err = __getlogin_r (s.data, s.length);
+-		  if (err == 0)
+-		    {
++              int err;
++              struct passwd *p;
++              struct passwd pwbuf;
++              struct scratch_buffer s;
++              scratch_buffer_init (&s);
++              while (true)
++                {
++                  p = NULL;
++                  err = __getlogin_r (s.data, s.length);
++                  if (err == 0)
++                    {
+ # if defined HAVE_GETPWNAM_R || defined _LIBC
+-		      size_t ssize = strlen (s.data) + 1;
+-		      char *sdata = s.data;
+-		      err = getpwnam_r (sdata, &pwbuf, sdata + ssize,
+-					s.length - ssize, &p);
++                      size_t ssize = strlen (s.data) + 1;
++                      char *sdata = s.data;
++                      err = getpwnam_r (sdata, &pwbuf, sdata + ssize,
++                                        s.length - ssize, &p);
+ # else
+-		      p = getpwnam (s.data);
+-		      if (p == NULL)
+-			err = errno;
++                      p = getpwnam (s.data);
++                      if (p == NULL)
++                        err = errno;
+ # endif
+-		    }
+-		  if (err != ERANGE)
+-		    break;
+-		  if (!scratch_buffer_grow (&s))
+-		    {
+-		      retval = GLOB_NOSPACE;
+-		      goto out;
+-		    }
+-		}
+-	      if (err == 0)
+-		{
+-		  home_dir = strdup (p->pw_dir);
+-		  malloc_home_dir = 1;
+-		}
+-	      scratch_buffer_free (&s);
+-	      if (err == 0 && home_dir == NULL)
+-		{
+-		  retval = GLOB_NOSPACE;
+-		  goto out;
+-		}
++                    }
++                  if (err != ERANGE)
++                    break;
++                  if (!scratch_buffer_grow (&s))
++                    {
++                      retval = GLOB_NOSPACE;
++                      goto out;
++                    }
++                }
++              if (err == 0)
++                {
++                  home_dir = strdup (p->pw_dir);
++                  malloc_home_dir = 1;
++                }
++              scratch_buffer_free (&s);
++              if (err == 0 && home_dir == NULL)
++                {
++                  retval = GLOB_NOSPACE;
++                  goto out;
++                }
+ #endif /* WINDOWS32 */
+-	    }
+-	  if (home_dir == NULL || home_dir[0] == '\0')
+-	    {
+-	      if (__glibc_unlikely (malloc_home_dir))
+-		free (home_dir);
+-	      if (flags & GLOB_TILDE_CHECK)
+-		{
+-		  retval = GLOB_NOMATCH;
+-		  goto out;
+-		}
+-	      else
+-		{
+-		  home_dir = (char *) "~"; /* No luck.  */
+-		  malloc_home_dir = 0;
+-		}
+-	    }
+-	  /* Now construct the full directory.  */
+-	  if (dirname[1] == '\0')
+-	    {
+-	      if (__glibc_unlikely (malloc_dirname))
+-		free (dirname);
+-
+-	      dirname = home_dir;
+-	      dirlen = strlen (dirname);
+-	      malloc_dirname = malloc_home_dir;
+-	    }
+-	  else
+-	    {
+-	      char *newp;
+-	      size_t home_len = strlen (home_dir);
+-	      int use_alloca = glob_use_alloca (alloca_used, home_len + dirlen);
+-	      if (use_alloca)
+-		newp = alloca_account (home_len + dirlen, alloca_used);
+-	      else
+-		{
+-		  newp = malloc (home_len + dirlen);
+-		  if (newp == NULL)
+-		    {
+-		      if (__glibc_unlikely (malloc_home_dir))
+-			free (home_dir);
+-		      retval = GLOB_NOSPACE;
+-		      goto out;
+-		    }
+-		}
+-
+-	      mempcpy (mempcpy (newp, home_dir, home_len),
+-		       &dirname[1], dirlen);
+-
+-	      if (__glibc_unlikely (malloc_dirname))
+-		free (dirname);
+-
+-	      dirname = newp;
+-	      dirlen += home_len - 1;
+-	      malloc_dirname = !use_alloca;
+-
+-	      if (__glibc_unlikely (malloc_home_dir))
+-		free (home_dir);
+-	    }
+-	  dirname_modified = 1;
+-	}
++            }
++          if (home_dir == NULL || home_dir[0] == '\0')
++            {
++              if (__glibc_unlikely (malloc_home_dir))
++                free (home_dir);
++              if (flags & GLOB_TILDE_CHECK)
++                {
++                  retval = GLOB_NOMATCH;
++                  goto out;
++                }
++              else
++                {
++                  home_dir = (char *) "~"; /* No luck.  */
++                  malloc_home_dir = 0;
++                }
++            }
++          /* Now construct the full directory.  */
++          if (dirname[1] == '\0')
++            {
++              if (__glibc_unlikely (malloc_dirname))
++                free (dirname);
++
++              dirname = home_dir;
++              dirlen = strlen (dirname);
++              malloc_dirname = malloc_home_dir;
++            }
++          else
++            {
++              char *newp;
++              size_t home_len = strlen (home_dir);
++              int use_alloca = glob_use_alloca (alloca_used, home_len + dirlen);
++              if (use_alloca)
++                newp = alloca_account (home_len + dirlen, alloca_used);
++              else
++                {
++                  newp = malloc (home_len + dirlen);
++                  if (newp == NULL)
++                    {
++                      if (__glibc_unlikely (malloc_home_dir))
++                        free (home_dir);
++                      retval = GLOB_NOSPACE;
++                      goto out;
++                    }
++                }
++
++              mempcpy (mempcpy (newp, home_dir, home_len),
++                       &dirname[1], dirlen);
++
++              if (__glibc_unlikely (malloc_dirname))
++                free (dirname);
++
++              dirname = newp;
++              dirlen += home_len - 1;
++              malloc_dirname = !use_alloca;
++
++              if (__glibc_unlikely (malloc_home_dir))
++                free (home_dir);
++            }
++          dirname_modified = 1;
++        }
+       else
+-	{
++        {
+ #ifndef WINDOWS32
+-	  char *end_name = strchr (dirname, '/');
+-	  char *user_name;
+-	  int malloc_user_name = 0;
+-	  char *unescape = NULL;
+-
+-	  if (!(flags & GLOB_NOESCAPE))
+-	    {
+-	      if (end_name == NULL)
+-		{
+-		  unescape = strchr (dirname, '\\');
+-		  if (unescape)
+-		    end_name = strchr (unescape, '\0');
+-		}
+-	      else
+-		unescape = memchr (dirname, '\\', end_name - dirname);
+-	    }
+-	  if (end_name == NULL)
+-	    user_name = dirname + 1;
+-	  else
+-	    {
+-	      char *newp;
+-	      if (glob_use_alloca (alloca_used, end_name - dirname))
+-		newp = alloca_account (end_name - dirname, alloca_used);
+-	      else
+-		{
+-		  newp = malloc (end_name - dirname);
+-		  if (newp == NULL)
+-		    {
+-		      retval = GLOB_NOSPACE;
+-		      goto out;
+-		    }
+-		  malloc_user_name = 1;
+-		}
+-	      if (unescape != NULL)
+-		{
+-		  char *p = mempcpy (newp, dirname + 1,
+-				     unescape - dirname - 1);
+-		  char *q = unescape;
+-		  while (q != end_name)
+-		    {
+-		      if (*q == '\\')
+-			{
+-			  if (q + 1 == end_name)
+-			    {
+-			      /* "~fo\\o\\" unescape to user_name "foo\\",
+-				 but "~fo\\o\\/" unescape to user_name
+-				 "foo".  */
+-			      if (filename == NULL)
+-				*p++ = '\\';
+-			      break;
+-			    }
+-			  ++q;
+-			}
+-		      *p++ = *q++;
+-		    }
+-		  *p = '\0';
+-		}
+-	      else
+-		*((char *) mempcpy (newp, dirname + 1, end_name - dirname - 1))
+-		  = '\0';
+-	      user_name = newp;
+-	    }
+-
+-	  /* Look up specific user's home directory.  */
+-	  {
+-	    struct passwd *p;
+-	    struct scratch_buffer pwtmpbuf;
+-	    scratch_buffer_init (&pwtmpbuf);
++          /* Recognize ~user as a shorthand for the specified user's home
++             directory.  */
++          char *end_name = strchr (dirname, '/');
++          char *user_name;
++          int malloc_user_name = 0;
++          char *unescape = NULL;
++
++          if (!(flags & GLOB_NOESCAPE))
++            {
++              if (end_name == NULL)
++                {
++                  unescape = strchr (dirname, '\\');
++                  if (unescape)
++                    end_name = strchr (unescape, '\0');
++                }
++              else
++                unescape = memchr (dirname, '\\', end_name - dirname);
++            }
++          if (end_name == NULL)
++            user_name = dirname + 1;
++          else
++            {
++              char *newp;
++              if (glob_use_alloca (alloca_used, end_name - dirname))
++                newp = alloca_account (end_name - dirname, alloca_used);
++              else
++                {
++                  newp = malloc (end_name - dirname);
++                  if (newp == NULL)
++                    {
++                      retval = GLOB_NOSPACE;
++                      goto out;
++                    }
++                  malloc_user_name = 1;
++                }
++              if (unescape != NULL)
++                {
++                  char *p = mempcpy (newp, dirname + 1,
++                                     unescape - dirname - 1);
++                  char *q = unescape;
++                  while (q != end_name)
++                    {
++                      if (*q == '\\')
++                        {
++                          if (q + 1 == end_name)
++                            {
++                              /* "~fo\\o\\" unescape to user_name "foo\\",
++                                 but "~fo\\o\\/" unescape to user_name
++                                 "foo".  */
++                              if (filename == NULL)
++                                *p++ = '\\';
++                              break;
++                            }
++                          ++q;
++                        }
++                      *p++ = *q++;
++                    }
++                  *p = '\0';
++                }
++              else
++                *((char *) mempcpy (newp, dirname + 1, end_name - dirname - 1))
++                  = '\0';
++              user_name = newp;
++            }
++
++          /* Look up specific user's home directory.  */
++          {
++            struct passwd *p;
++            struct scratch_buffer pwtmpbuf;
++            scratch_buffer_init (&pwtmpbuf);
+ 
+ #  if defined HAVE_GETPWNAM_R || defined _LIBC
+-	    struct passwd pwbuf;
++            struct passwd pwbuf;
+ 
+-	    while (getpwnam_r (user_name, &pwbuf,
+-			       pwtmpbuf.data, pwtmpbuf.length, &p)
+-		   == ERANGE)
+-	      {
+-		if (!scratch_buffer_grow (&pwtmpbuf))
+-		  {
+-		    retval = GLOB_NOSPACE;
+-		    goto out;
+-		  }
+-	      }
++            while (getpwnam_r (user_name, &pwbuf,
++                               pwtmpbuf.data, pwtmpbuf.length, &p)
++                   == ERANGE)
++              {
++                if (!scratch_buffer_grow (&pwtmpbuf))
++                  {
++                    retval = GLOB_NOSPACE;
++                    goto out;
++                  }
++              }
+ #  else
+-	    p = getpwnam (user_name);
++            p = getpwnam (user_name);
+ #  endif
+ 
+-	    if (__glibc_unlikely (malloc_user_name))
+-	      free (user_name);
++            if (__glibc_unlikely (malloc_user_name))
++              free (user_name);
+ 
+-	    /* If we found a home directory use this.  */
+-	    if (p != NULL)
+-	      {
+-		size_t home_len = strlen (p->pw_dir);
+-		size_t rest_len = end_name == NULL ? 0 : strlen (end_name);
+-		char *d, *newp;
+-		bool use_alloca = glob_use_alloca (alloca_used,
+-						   home_len + rest_len + 1);
+-
+-		if (use_alloca)
+-		  newp = alloca_account (home_len + rest_len + 1, alloca_used);
+-		else
+-		  {
+-		    newp = malloc (home_len + rest_len + 1);
+-		    if (newp == NULL)
+-		      {
+-			scratch_buffer_free (&pwtmpbuf);
+-			retval = GLOB_NOSPACE;
+-			goto out;
+-		      }
+-		  }
+-		d = mempcpy (newp, p->pw_dir, home_len);
+-		if (end_name != NULL)
+-		  d = mempcpy (d, end_name, rest_len);
+-		*d = '\0';
+-
+-		if (__glibc_unlikely (malloc_dirname))
+-		  free (dirname);
+-		dirname = newp;
+-		malloc_dirname = !use_alloca;
+-
+-		dirlen = home_len + rest_len;
+-		dirname_modified = 1;
+-	      }
+-	    else
+-	      {
+-		if (flags & GLOB_TILDE_CHECK)
+-		  {
+-		    /* We have to regard it as an error if we cannot find the
+-		       home directory.  */
+-		    retval = GLOB_NOMATCH;
+-		    goto out;
+-		  }
+-	      }
+-	    scratch_buffer_free (&pwtmpbuf);
+-	  }
+-#endif /* !WINDOWS32 */
+-	}
++            /* If we found a home directory use this.  */
++            if (p != NULL)
++              {
++                size_t home_len = strlen (p->pw_dir);
++                size_t rest_len = end_name == NULL ? 0 : strlen (end_name);
++                /* dirname contains end_name; we can't free it now.  */
++                char *prev_dirname =
++                  (__glibc_unlikely (malloc_dirname) ? dirname : NULL);
++                char *d;
++
++                malloc_dirname = 0;
++
++                if (glob_use_alloca (alloca_used, home_len + rest_len + 1))
++                  dirname = alloca_account (home_len + rest_len + 1,
++                                            alloca_used);
++                else
++                  {
++                    dirname = malloc (home_len + rest_len + 1);
++                    if (dirname == NULL)
++                      {
++                        free (prev_dirname);
++                        scratch_buffer_free (&pwtmpbuf);
++                        retval = GLOB_NOSPACE;
++                        goto out;
++                      }
++                    malloc_dirname = 1;
++                  }
++                d = mempcpy (dirname, p->pw_dir, home_len);
++                if (end_name != NULL)
++                  d = mempcpy (d, end_name, rest_len);
++                *d = '\0';
++
++                free (prev_dirname);
++
++                dirlen = home_len + rest_len;
++                dirname_modified = 1;
++              }
++            else
++              {
++                if (flags & GLOB_TILDE_CHECK)
++                  {
++                    /* We have to regard it as an error if we cannot find the
++                       home directory.  */
++                    retval = GLOB_NOMATCH;
++                    goto out;
++                  }
++              }
++            scratch_buffer_free (&pwtmpbuf);
++          }
++#else /* WINDOWS32 */
++          /* On native Windows, access to a user's home directory
++             (via GetUserProfileDirectory) or to a user's environment
++             variables (via ExpandEnvironmentStringsForUser) requires
++             the credentials of the user.  Therefore we cannot support
++             the ~user syntax on this platform.
++             Handling ~user specially (and treat it like plain ~) if
++             user is getenv ("USERNAME") would not be a good idea,
++             since it would make people think that ~user is supported
++             in general.  */
++          if (flags & GLOB_TILDE_CHECK)
++            {
++              retval = GLOB_NOMATCH;
++              goto out;
++            }
++#endif /* WINDOWS32 */
++        }
+     }
+ 
+   /* Now test whether we looked for "~" or "~NAME".  In this case we
+      can give the answer now.  */
+   if (filename == NULL)
+     {
+-	size_t newcount = pglob->gl_pathc + pglob->gl_offs;
+-	char **new_gl_pathv;
++      size_t newcount = pglob->gl_pathc + pglob->gl_offs;
++      char **new_gl_pathv;
++
++      if (newcount > SIZE_MAX / sizeof (char *) - 2)
++        {
++        nospace:
++          free (pglob->gl_pathv);
++          pglob->gl_pathv = NULL;
++          pglob->gl_pathc = 0;
++          retval = GLOB_NOSPACE;
++          goto out;
++        }
+ 
+-	if (newcount > SIZE_MAX / sizeof (char *) - 2)
+-	  {
+-	  nospace:
+-	    free (pglob->gl_pathv);
+-	    pglob->gl_pathv = NULL;
+-	    pglob->gl_pathc = 0;
+-	    retval = GLOB_NOSPACE;
+-	    goto out;
+-	  }
+-
+-	new_gl_pathv = realloc (pglob->gl_pathv,
+-				(newcount + 2) * sizeof (char *));
+-	if (new_gl_pathv == NULL)
+-	  goto nospace;
+-	pglob->gl_pathv = new_gl_pathv;
+-
+-	if (flags & GLOB_MARK && is_dir (dirname, flags, pglob))
+-	  {
+-	    char *p;
+-	    pglob->gl_pathv[newcount] = malloc (dirlen + 2);
+-	    if (pglob->gl_pathv[newcount] == NULL)
+-	      goto nospace;
+-	    p = mempcpy (pglob->gl_pathv[newcount], dirname, dirlen);
+-	    p[0] = '/';
+-	    p[1] = '\0';
+-	    if (__glibc_unlikely (malloc_dirname))
+-	      free (dirname);
+-	  }
+-	else
+-	  {
+-	    if (__glibc_unlikely (malloc_dirname))
+-	      pglob->gl_pathv[newcount] = dirname;
+-	    else
+-	      {
+-		pglob->gl_pathv[newcount] = strdup (dirname);
+-		if (pglob->gl_pathv[newcount] == NULL)
+-		  goto nospace;
+-	      }
+-	  }
+-	pglob->gl_pathv[++newcount] = NULL;
+-	++pglob->gl_pathc;
+-	pglob->gl_flags = flags;
++      new_gl_pathv = realloc (pglob->gl_pathv,
++                              (newcount + 2) * sizeof (char *));
++      if (new_gl_pathv == NULL)
++        goto nospace;
++      pglob->gl_pathv = new_gl_pathv;
++
++      if (flags & GLOB_MARK && is_dir (dirname, flags, pglob))
++        {
++          char *p;
++          pglob->gl_pathv[newcount] = malloc (dirlen + 2);
++          if (pglob->gl_pathv[newcount] == NULL)
++            goto nospace;
++          p = mempcpy (pglob->gl_pathv[newcount], dirname, dirlen);
++          p[0] = '/';
++          p[1] = '\0';
++          if (__glibc_unlikely (malloc_dirname))
++            free (dirname);
++        }
++      else
++        {
++          if (__glibc_unlikely (malloc_dirname))
++            pglob->gl_pathv[newcount] = dirname;
++          else
++            {
++              pglob->gl_pathv[newcount] = strdup (dirname);
++              if (pglob->gl_pathv[newcount] == NULL)
++                goto nospace;
++            }
++        }
++      pglob->gl_pathv[++newcount] = NULL;
++      ++pglob->gl_pathc;
++      pglob->gl_flags = flags;
+ 
+-	return 0;
++      return 0;
+     }
+ 
+   meta = __glob_pattern_type (dirname, !(flags & GLOB_NOESCAPE));
+@@ -934,135 +969,135 @@ __glob (const char *pattern, int flags,
+   if (meta & (GLOBPAT_SPECIAL | GLOBPAT_BRACKET))
+     {
+       /* The directory name contains metacharacters, so we
+-	 have to glob for the directory, and then glob for
+-	 the pattern in each directory found.  */
++         have to glob for the directory, and then glob for
++         the pattern in each directory found.  */
+       size_t i;
+ 
+       if (!(flags & GLOB_NOESCAPE) && dirlen > 0 && dirname[dirlen - 1] == '\\')
+-	{
+-	  /* "foo\\/bar".  Remove the final backslash from dirname
+-	     if it has not been quoted.  */
+-	  char *p = (char *) &dirname[dirlen - 1];
+-
+-	  while (p > dirname && p[-1] == '\\') --p;
+-	  if ((&dirname[dirlen] - p) & 1)
+-	    *(char *) &dirname[--dirlen] = '\0';
+-	}
++        {
++          /* "foo\\/bar".  Remove the final backslash from dirname
++             if it has not been quoted.  */
++          char *p = (char *) &dirname[dirlen - 1];
++
++          while (p > dirname && p[-1] == '\\') --p;
++          if ((&dirname[dirlen] - p) & 1)
++            *(char *) &dirname[--dirlen] = '\0';
++        }
+ 
+       if (__glibc_unlikely ((flags & GLOB_ALTDIRFUNC) != 0))
+-	{
+-	  /* Use the alternative access functions also in the recursive
+-	     call.  */
+-	  dirs.gl_opendir = pglob->gl_opendir;
+-	  dirs.gl_readdir = pglob->gl_readdir;
+-	  dirs.gl_closedir = pglob->gl_closedir;
+-	  dirs.gl_stat = pglob->gl_stat;
+-	  dirs.gl_lstat = pglob->gl_lstat;
+-	}
++        {
++          /* Use the alternative access functions also in the recursive
++             call.  */
++          dirs.gl_opendir = pglob->gl_opendir;
++          dirs.gl_readdir = pglob->gl_readdir;
++          dirs.gl_closedir = pglob->gl_closedir;
++          dirs.gl_stat = pglob->gl_stat;
++          dirs.gl_lstat = pglob->gl_lstat;
++        }
+ 
+       status = __glob (dirname,
+-		       ((flags & (GLOB_ERR | GLOB_NOESCAPE | GLOB_ALTDIRFUNC))
+-			| GLOB_NOSORT | GLOB_ONLYDIR),
+-		       errfunc, &dirs);
++                       ((flags & (GLOB_ERR | GLOB_NOESCAPE | GLOB_ALTDIRFUNC))
++                        | GLOB_NOSORT | GLOB_ONLYDIR),
++                       errfunc, &dirs);
+       if (status != 0)
+-	{
+-	  if ((flags & GLOB_NOCHECK) == 0 || status != GLOB_NOMATCH)
+-	    {
+-	      retval = status;
+-	      goto out;
+-	    }
+-	  goto no_matches;
+-	}
++        {
++          if ((flags & GLOB_NOCHECK) == 0 || status != GLOB_NOMATCH)
++            {
++              retval = status;
++              goto out;
++            }
++          goto no_matches;
++        }
+ 
+       /* We have successfully globbed the preceding directory name.
+-	 For each name we found, call glob_in_dir on it and FILENAME,
+-	 appending the results to PGLOB.  */
++         For each name we found, call glob_in_dir on it and FILENAME,
++         appending the results to PGLOB.  */
+       for (i = 0; i < dirs.gl_pathc; ++i)
+-	{
+-	  size_t old_pathc;
++        {
++          size_t old_pathc;
+ 
+-	  old_pathc = pglob->gl_pathc;
+-	  status = glob_in_dir (filename, dirs.gl_pathv[i],
+-				((flags | GLOB_APPEND)
+-				 & ~(GLOB_NOCHECK | GLOB_NOMAGIC)),
+-				errfunc, pglob, alloca_used);
+-	  if (status == GLOB_NOMATCH)
+-	    /* No matches in this directory.  Try the next.  */
+-	    continue;
+-
+-	  if (status != 0)
+-	    {
+-	      globfree (&dirs);
+-	      globfree (pglob);
+-	      pglob->gl_pathc = 0;
+-	      retval = status;
+-	      goto out;
+-	    }
+-
+-	  /* Stick the directory on the front of each name.  */
+-	  if (prefix_array (dirs.gl_pathv[i],
+-			    &pglob->gl_pathv[old_pathc + pglob->gl_offs],
+-			    pglob->gl_pathc - old_pathc))
+-	    {
+-	      globfree (&dirs);
+-	      globfree (pglob);
+-	      pglob->gl_pathc = 0;
+-	      retval = GLOB_NOSPACE;
+-	      goto out;
+-	    }
+-	}
++          old_pathc = pglob->gl_pathc;
++          status = glob_in_dir (filename, dirs.gl_pathv[i],
++                                ((flags | GLOB_APPEND)
++                                 & ~(GLOB_NOCHECK | GLOB_NOMAGIC)),
++                                errfunc, pglob, alloca_used);
++          if (status == GLOB_NOMATCH)
++            /* No matches in this directory.  Try the next.  */
++            continue;
++
++          if (status != 0)
++            {
++              globfree (&dirs);
++              globfree (pglob);
++              pglob->gl_pathc = 0;
++              retval = status;
++              goto out;
++            }
++
++          /* Stick the directory on the front of each name.  */
++          if (prefix_array (dirs.gl_pathv[i],
++                            &pglob->gl_pathv[old_pathc + pglob->gl_offs],
++                            pglob->gl_pathc - old_pathc))
++            {
++              globfree (&dirs);
++              globfree (pglob);
++              pglob->gl_pathc = 0;
++              retval = GLOB_NOSPACE;
++              goto out;
++            }
++        }
+ 
+       flags |= GLOB_MAGCHAR;
+ 
+       /* We have ignored the GLOB_NOCHECK flag in the 'glob_in_dir' calls.
+-	 But if we have not found any matching entry and the GLOB_NOCHECK
+-	 flag was set we must return the input pattern itself.  */
++         But if we have not found any matching entry and the GLOB_NOCHECK
++         flag was set we must return the input pattern itself.  */
+       if (pglob->gl_pathc + pglob->gl_offs == oldcount)
+-	{
+-	no_matches:
+-	  /* No matches.  */
+-	  if (flags & GLOB_NOCHECK)
+-	    {
+-	      size_t newcount = pglob->gl_pathc + pglob->gl_offs;
+-	      char **new_gl_pathv;
+-
+-	      if (newcount > SIZE_MAX / sizeof (char *) - 2)
+-		{
+-		nospace2:
+-		  globfree (&dirs);
+-		  retval = GLOB_NOSPACE;
+-		  goto out;
+-		}
+-
+-	      new_gl_pathv = realloc (pglob->gl_pathv,
+-				      (newcount + 2) * sizeof (char *));
+-	      if (new_gl_pathv == NULL)
+-		goto nospace2;
+-	      pglob->gl_pathv = new_gl_pathv;
+-
+-	      pglob->gl_pathv[newcount] = strdup (pattern);
+-	      if (pglob->gl_pathv[newcount] == NULL)
+-		{
+-		  globfree (&dirs);
+-		  globfree (pglob);
+-		  pglob->gl_pathc = 0;
+-		  retval = GLOB_NOSPACE;
+-		  goto out;
+-		}
+-
+-	      ++pglob->gl_pathc;
+-	      ++newcount;
+-
+-	      pglob->gl_pathv[newcount] = NULL;
+-	      pglob->gl_flags = flags;
+-	    }
+-	  else
+-	    {
+-	      globfree (&dirs);
+-	      retval = GLOB_NOMATCH;
+-	      goto out;
+-	    }
+-	}
++        {
++        no_matches:
++          /* No matches.  */
++          if (flags & GLOB_NOCHECK)
++            {
++              size_t newcount = pglob->gl_pathc + pglob->gl_offs;
++              char **new_gl_pathv;
++
++              if (newcount > SIZE_MAX / sizeof (char *) - 2)
++                {
++                nospace2:
++                  globfree (&dirs);
++                  retval = GLOB_NOSPACE;
++                  goto out;
++                }
++
++              new_gl_pathv = realloc (pglob->gl_pathv,
++                                      (newcount + 2) * sizeof (char *));
++              if (new_gl_pathv == NULL)
++                goto nospace2;
++              pglob->gl_pathv = new_gl_pathv;
++
++              pglob->gl_pathv[newcount] = strdup (pattern);
++              if (pglob->gl_pathv[newcount] == NULL)
++                {
++                  globfree (&dirs);
++                  globfree (pglob);
++                  pglob->gl_pathc = 0;
++                  retval = GLOB_NOSPACE;
++                  goto out;
++                }
++
++              ++pglob->gl_pathc;
++              ++newcount;
++
++              pglob->gl_pathv[newcount] = NULL;
++              pglob->gl_flags = flags;
++            }
++          else
++            {
++              globfree (&dirs);
++              retval = GLOB_NOMATCH;
++              goto out;
++            }
++        }
+ 
+       globfree (&dirs);
+     }
+@@ -1072,57 +1107,57 @@ __glob (const char *pattern, int flags,
+       int orig_flags = flags;
+ 
+       if (meta & GLOBPAT_BACKSLASH)
+-	{
+-	  char *p = strchr (dirname, '\\'), *q;
+-	  /* We need to unescape the dirname string.  It is certainly
+-	     allocated by alloca, as otherwise filename would be NULL
+-	     or dirname wouldn't contain backslashes.  */
+-	  q = p;
+-	  do
+-	    {
+-	      if (*p == '\\')
+-		{
+-		  *q = *++p;
+-		  --dirlen;
+-		}
+-	      else
+-		*q = *p;
+-	      ++q;
+-	    }
+-	  while (*p++ != '\0');
+-	  dirname_modified = 1;
+-	}
++        {
++          char *p = strchr (dirname, '\\'), *q;
++          /* We need to unescape the dirname string.  It is certainly
++             allocated by alloca, as otherwise filename would be NULL
++             or dirname wouldn't contain backslashes.  */
++          q = p;
++          do
++            {
++              if (*p == '\\')
++                {
++                  *q = *++p;
++                  --dirlen;
++                }
++              else
++                *q = *p;
++              ++q;
++            }
++          while (*p++ != '\0');
++          dirname_modified = 1;
++        }
+       if (dirname_modified)
+-	flags &= ~(GLOB_NOCHECK | GLOB_NOMAGIC);
++        flags &= ~(GLOB_NOCHECK | GLOB_NOMAGIC);
+       status = glob_in_dir (filename, dirname, flags, errfunc, pglob,
+-			    alloca_used);
++                            alloca_used);
+       if (status != 0)
+-	{
+-	  if (status == GLOB_NOMATCH && flags != orig_flags
+-	      && pglob->gl_pathc + pglob->gl_offs == oldcount)
+-	    {
+-	      /* Make sure globfree (&dirs); is a nop.  */
+-	      dirs.gl_pathv = NULL;
+-	      flags = orig_flags;
+-	      goto no_matches;
+-	    }
+-	  retval = status;
+-	  goto out;
+-	}
++        {
++          if (status == GLOB_NOMATCH && flags != orig_flags
++              && pglob->gl_pathc + pglob->gl_offs == oldcount)
++            {
++              /* Make sure globfree (&dirs); is a nop.  */
++              dirs.gl_pathv = NULL;
++              flags = orig_flags;
++              goto no_matches;
++            }
++          retval = status;
++          goto out;
++        }
+ 
+       if (dirlen > 0)
+-	{
+-	  /* Stick the directory on the front of each name.  */
+-	  if (prefix_array (dirname,
+-			    &pglob->gl_pathv[old_pathc + pglob->gl_offs],
+-			    pglob->gl_pathc - old_pathc))
+-	    {
+-	      globfree (pglob);
+-	      pglob->gl_pathc = 0;
+-	      retval = GLOB_NOSPACE;
+-	      goto out;
+-	    }
+-	}
++        {
++          /* Stick the directory on the front of each name.  */
++          if (prefix_array (dirname,
++                            &pglob->gl_pathv[old_pathc + pglob->gl_offs],
++                            pglob->gl_pathc - old_pathc))
++            {
++              globfree (pglob);
++              pglob->gl_pathc = 0;
++              retval = GLOB_NOSPACE;
++              goto out;
++            }
++        }
+     }
+ 
+   if (flags & GLOB_MARK)
+@@ -1131,28 +1166,28 @@ __glob (const char *pattern, int flags,
+       size_t i;
+ 
+       for (i = oldcount; i < pglob->gl_pathc + pglob->gl_offs; ++i)
+-	if (is_dir (pglob->gl_pathv[i], flags, pglob))
+-	  {
+-	    size_t len = strlen (pglob->gl_pathv[i]) + 2;
+-	    char *new = realloc (pglob->gl_pathv[i], len);
+-	    if (new == NULL)
+-	      {
+-		globfree (pglob);
+-		pglob->gl_pathc = 0;
+-		retval = GLOB_NOSPACE;
+-		goto out;
+-	      }
+-	    strcpy (&new[len - 2], "/");
+-	    pglob->gl_pathv[i] = new;
+-	  }
++        if (is_dir (pglob->gl_pathv[i], flags, pglob))
++          {
++            size_t len = strlen (pglob->gl_pathv[i]) + 2;
++            char *new = realloc (pglob->gl_pathv[i], len);
++            if (new == NULL)
++              {
++                globfree (pglob);
++                pglob->gl_pathc = 0;
++                retval = GLOB_NOSPACE;
++                goto out;
++              }
++            strcpy (&new[len - 2], "/");
++            pglob->gl_pathv[i] = new;
++          }
+     }
+ 
+   if (!(flags & GLOB_NOSORT))
+     {
+       /* Sort the vector.  */
+       qsort (&pglob->gl_pathv[oldcount],
+-	     pglob->gl_pathc + pglob->gl_offs - oldcount,
+-	     sizeof (char *), collated_compare);
++             pglob->gl_pathc + pglob->gl_offs - oldcount,
++             sizeof (char *), collated_compare);
+     }
+ 
+  out:
+@@ -1204,14 +1239,14 @@ prefix_array (const char *dirname, char
+   if (dirlen > 1)
+     {
+       if (dirname[dirlen - 1] == '/' && dirname[dirlen - 2] == ':')
+-	/* DIRNAME is "d:/".  Don't prepend the slash from DIRNAME.  */
+-	--dirlen;
++        /* DIRNAME is "d:/".  Don't prepend the slash from DIRNAME.  */
++        --dirlen;
+       else if (dirname[dirlen - 1] == ':')
+-	{
+-	  /* DIRNAME is "d:".  Use ':' instead of '/'.  */
+-	  --dirlen;
+-	  dirsep_char = ':';
+-	}
++        {
++          /* DIRNAME is "d:".  Use ':' instead of '/'.  */
++          --dirlen;
++          dirsep_char = ':';
++        }
+     }
+ #endif
+ 
+@@ -1220,16 +1255,16 @@ prefix_array (const char *dirname, char
+       size_t eltlen = strlen (array[i]) + 1;
+       char *new = malloc (dirlen + 1 + eltlen);
+       if (new == NULL)
+-	{
+-	  while (i > 0)
+-	    free (array[--i]);
+-	  return 1;
+-	}
++        {
++          while (i > 0)
++            free (array[--i]);
++          return 1;
++        }
+ 
+       {
+-	char *endp = mempcpy (new, dirname, dirlen);
+-	*endp++ = dirsep_char;
+-	mempcpy (endp, array[i], eltlen);
++        char *endp = mempcpy (new, dirname, dirlen);
++        *endp++ = dirsep_char;
++        mempcpy (endp, array[i], eltlen);
+       }
+       free (array[i]);
+       array[i] = new;
+@@ -1244,11 +1279,13 @@ prefix_array (const char *dirname, char
+    The GLOB_APPEND flag is assumed to be set (always appends).  */
+ static int
+ glob_in_dir (const char *pattern, const char *directory, int flags,
+-	     int (*errfunc) (const char *, int),
+-	     glob_t *pglob, size_t alloca_used)
++             int (*errfunc) (const char *, int),
++             glob_t *pglob, size_t alloca_used)
+ {
+   size_t dirlen = strlen (directory);
+   void *stream = NULL;
++  struct scratch_buffer s;
++  scratch_buffer_init (&s);
+ # define GLOBNAMES_MEMBERS(nnames) \
+     struct globnames *next; size_t count; char *name[nnames];
+   struct globnames { GLOBNAMES_MEMBERS (FLEXIBLE_ARRAY_MEMBER) };
+@@ -1273,8 +1310,8 @@ glob_in_dir (const char *pattern, const
+   if (meta == GLOBPAT_NONE && (flags & (GLOB_NOCHECK|GLOB_NOMAGIC)))
+     {
+       /* We need not do any tests.  The PATTERN contains no meta
+-	 characters and we must not return an error therefore the
+-	 result will always contain exactly one name.  */
++         characters and we must not return an error therefore the
++         result will always contain exactly one name.  */
+       flags |= GLOB_NOCHECK;
+     }
+   else if (meta == GLOBPAT_NONE)
+@@ -1288,102 +1325,127 @@ glob_in_dir (const char *pattern, const
+       if (alloca_fullname)
+         fullname = alloca_account (fullsize, alloca_used);
+       else
+-	{
+-	  fullname = malloc (fullsize);
+-	  if (fullname == NULL)
+-	    return GLOB_NOSPACE;
+-	}
++        {
++          fullname = malloc (fullsize);
++          if (fullname == NULL)
++            return GLOB_NOSPACE;
++        }
+ 
+       mempcpy (mempcpy (mempcpy (fullname, directory, dirlen),
+-			"/", 1),
+-	       pattern, patlen + 1);
++                        "/", 1),
++               pattern, patlen + 1);
+       if (glob_lstat (pglob, flags, fullname) == 0
+-	  || errno == EOVERFLOW)
+-	/* We found this file to be existing.  Now tell the rest
+-	   of the function to copy this name into the result.  */
+-	flags |= GLOB_NOCHECK;
++          || errno == EOVERFLOW)
++        /* We found this file to be existing.  Now tell the rest
++           of the function to copy this name into the result.  */
++        flags |= GLOB_NOCHECK;
+ 
+       if (__glibc_unlikely (!alloca_fullname))
+-	free (fullname);
++        free (fullname);
+     }
+   else
+     {
+       stream = (__builtin_expect (flags & GLOB_ALTDIRFUNC, 0)
+-		? (*pglob->gl_opendir) (directory)
+-		: opendir (directory));
++                ? (*pglob->gl_opendir) (directory)
++                : opendir (directory));
+       if (stream == NULL)
+-	{
+-	  if (errno != ENOTDIR
+-	      && ((errfunc != NULL && (*errfunc) (directory, errno))
+-		  || (flags & GLOB_ERR)))
+-	    return GLOB_ABORTED;
+-	}
++        {
++          if (errno != ENOTDIR
++              && ((errfunc != NULL && (*errfunc) (directory, errno))
++                  || (flags & GLOB_ERR)))
++            return GLOB_ABORTED;
++        }
+       else
+-	{
+-	  int fnm_flags = ((!(flags & GLOB_PERIOD) ? FNM_PERIOD : 0)
+-			   | ((flags & GLOB_NOESCAPE) ? FNM_NOESCAPE : 0));
+-	  flags |= GLOB_MAGCHAR;
+-
+-	  while (1)
+-	    {
+-	      struct readdir_result d;
+-	      {
+-		if (__builtin_expect (flags & GLOB_ALTDIRFUNC, 0))
+-		  d = convert_dirent (GL_READDIR (pglob, stream));
+-		else
+-		  {
++        {
++          int dfd = dirfd (stream);
++          int fnm_flags = ((!(flags & GLOB_PERIOD) ? FNM_PERIOD : 0)
++                           | ((flags & GLOB_NOESCAPE) ? FNM_NOESCAPE : 0));
++          flags |= GLOB_MAGCHAR;
++
++          while (1)
++            {
++              struct readdir_result d;
++              {
++                if (__builtin_expect (flags & GLOB_ALTDIRFUNC, 0))
++                  d = convert_dirent (GL_READDIR (pglob, stream));
++                else
++                  {
+ #ifdef COMPILE_GLOB64
+-		    d = convert_dirent (__readdir (stream));
++                    d = convert_dirent (__readdir (stream));
+ #else
+-		    d = convert_dirent64 (__readdir64 (stream));
++                    d = convert_dirent64 (__readdir64 (stream));
+ #endif
+-		  }
+-	      }
+-	      if (d.name == NULL)
+-		break;
+-
+-	      /* If we shall match only directories use the information
+-		 provided by the dirent call if possible.  */
+-	      if (flags & GLOB_ONLYDIR)
+-		switch (readdir_result_type (d))
+-		  {
+-		  case DT_DIR: case DT_LNK: case DT_UNKNOWN: break;
+-		  default: continue;
+-		  }
+-
+-	      if (fnmatch (pattern, d.name, fnm_flags) == 0)
+-		{
+-		  if (cur == names->count)
+-		    {
+-		      struct globnames *newnames;
+-		      size_t count = names->count * 2;
+-		      size_t nameoff = offsetof (struct globnames, name);
+-		      size_t size = FLEXSIZEOF (struct globnames, name,
+-						count * sizeof (char *));
+-		      if ((SIZE_MAX - nameoff) / 2 / sizeof (char *)
+-			  < names->count)
+-			goto memory_error;
+-		      if (glob_use_alloca (alloca_used, size))
+-			newnames = names_alloca
+-			  = alloca_account (size, alloca_used);
+-		      else if ((newnames = malloc (size))
+-			       == NULL)
+-			goto memory_error;
+-		      newnames->count = count;
+-		      newnames->next = names;
+-		      names = newnames;
+-		      cur = 0;
+-		    }
+-		  names->name[cur] = strdup (d.name);
+-		  if (names->name[cur] == NULL)
+-		    goto memory_error;
+-		  ++cur;
+-		  ++nfound;
+-		  if (SIZE_MAX - pglob->gl_offs <= nfound)
+-		    goto memory_error;
+-		}
+-	    }
+-	}
++                  }
++              }
++              if (d.name == NULL)
++                break;
++
++              /* If we shall match only directories use the information
++                 provided by the dirent call if possible.  */
++              if (flags & GLOB_ONLYDIR)
++                switch (readdir_result_type (d))
++                  {
++                  default: continue;
++                  case DT_DIR: break;
++                  case DT_LNK: case DT_UNKNOWN:
++                    /* The filesystem was too lazy to give us a hint,
++                       so we have to do it the hard way.  */
++                    if (__glibc_unlikely (dfd < 0 || flags & GLOB_ALTDIRFUNC))
++                      {
++                        size_t namelen = strlen (d.name);
++                        size_t need = dirlen + 1 + namelen + 1;
++                        if (s.length < need
++                            && !scratch_buffer_set_array_size (&s, need, 1))
++                          goto memory_error;
++                        char *p = mempcpy (s.data, directory, dirlen);
++                        *p = '/';
++                        p += p[-1] != '/';
++                        memcpy (p, d.name, namelen + 1);
++                        if (! is_dir (s.data, flags, pglob))
++                          continue;
++                      }
++                    else
++                      {
++                        struct_stat64 st64;
++                        if (! (GLOB_FSTATAT64 (dfd, d.name, &st64, 0) == 0
++                               && S_ISDIR (st64.st_mode)))
++                          continue;
++                      }
++                  }
++
++              if (fnmatch (pattern, d.name, fnm_flags) == 0)
++                {
++                  if (cur == names->count)
++                    {
++                      struct globnames *newnames;
++                      size_t count = names->count * 2;
++                      size_t nameoff = offsetof (struct globnames, name);
++                      size_t size = FLEXSIZEOF (struct globnames, name,
++                                                count * sizeof (char *));
++                      if ((SIZE_MAX - nameoff) / 2 / sizeof (char *)
++                          < names->count)
++                        goto memory_error;
++                      if (glob_use_alloca (alloca_used, size))
++                        newnames = names_alloca
++                          = alloca_account (size, alloca_used);
++                      else if ((newnames = malloc (size))
++                               == NULL)
++                        goto memory_error;
++                      newnames->count = count;
++                      newnames->next = names;
++                      names = newnames;
++                      cur = 0;
++                    }
++                  names->name[cur] = strdup (d.name);
++                  if (names->name[cur] == NULL)
++                    goto memory_error;
++                  ++cur;
++                  ++nfound;
++                  if (SIZE_MAX - pglob->gl_offs <= nfound)
++                    goto memory_error;
++                }
++            }
++        }
+     }
+ 
+   if (nfound == 0 && (flags & GLOB_NOCHECK))
+@@ -1392,7 +1454,7 @@ glob_in_dir (const char *pattern, const
+       nfound = 1;
+       names->name[cur] = malloc (len + 1);
+       if (names->name[cur] == NULL)
+-	goto memory_error;
++        goto memory_error;
+       *((char *) mempcpy (names->name[cur++], pattern, len)) = '\0';
+     }
+ 
+@@ -1403,82 +1465,83 @@ glob_in_dir (const char *pattern, const
+       result = 0;
+ 
+       if (SIZE_MAX / sizeof (char *) - pglob->gl_pathc
+-	  < pglob->gl_offs + nfound + 1)
+-	goto memory_error;
++          < pglob->gl_offs + nfound + 1)
++        goto memory_error;
+ 
+       new_gl_pathv
+-	= realloc (pglob->gl_pathv,
+-		   (pglob->gl_pathc + pglob->gl_offs + nfound + 1)
+-		    * sizeof (char *));
++        = realloc (pglob->gl_pathv,
++                   (pglob->gl_pathc + pglob->gl_offs + nfound + 1)
++                    * sizeof (char *));
+ 
+       if (new_gl_pathv == NULL)
+-	{
+-	memory_error:
+-	  while (1)
+-	    {
+-	      struct globnames *old = names;
+-	      for (size_t i = 0; i < cur; ++i)
+-		free (names->name[i]);
+-	      names = names->next;
+-	      /* NB: we will not leak memory here if we exit without
+-		 freeing the current block assigned to OLD.  At least
+-		 the very first block is always allocated on the stack
+-		 and this is the block assigned to OLD here.  */
+-	      if (names == NULL)
+-		{
+-		  assert (old == init_names);
+-		  break;
+-		}
+-	      cur = names->count;
+-	      if (old == names_alloca)
+-		names_alloca = names;
+-	      else
+-		free (old);
+-	    }
+-	  result = GLOB_NOSPACE;
+-	}
++        {
++        memory_error:
++          while (1)
++            {
++              struct globnames *old = names;
++              for (size_t i = 0; i < cur; ++i)
++                free (names->name[i]);
++              names = names->next;
++              /* NB: we will not leak memory here if we exit without
++                 freeing the current block assigned to OLD.  At least
++                 the very first block is always allocated on the stack
++                 and this is the block assigned to OLD here.  */
++              if (names == NULL)
++                {
++                  assert (old == init_names);
++                  break;
++                }
++              cur = names->count;
++              if (old == names_alloca)
++                names_alloca = names;
++              else
++                free (old);
++            }
++          result = GLOB_NOSPACE;
++        }
+       else
+-	{
+-	  while (1)
+-	    {
+-	      struct globnames *old = names;
+-	      for (size_t i = 0; i < cur; ++i)
+-		new_gl_pathv[pglob->gl_offs + pglob->gl_pathc++]
+-		  = names->name[i];
+-	      names = names->next;
+-	      /* NB: we will not leak memory here if we exit without
+-		 freeing the current block assigned to OLD.  At least
+-		 the very first block is always allocated on the stack
+-		 and this is the block assigned to OLD here.  */
+-	      if (names == NULL)
+-		{
+-		  assert (old == init_names);
+-		  break;
+-		}
+-	      cur = names->count;
+-	      if (old == names_alloca)
+-		names_alloca = names;
+-	      else
+-		free (old);
+-	    }
++        {
++          while (1)
++            {
++              struct globnames *old = names;
++              for (size_t i = 0; i < cur; ++i)
++                new_gl_pathv[pglob->gl_offs + pglob->gl_pathc++]
++                  = names->name[i];
++              names = names->next;
++              /* NB: we will not leak memory here if we exit without
++                 freeing the current block assigned to OLD.  At least
++                 the very first block is always allocated on the stack
++                 and this is the block assigned to OLD here.  */
++              if (names == NULL)
++                {
++                  assert (old == init_names);
++                  break;
++                }
++              cur = names->count;
++              if (old == names_alloca)
++                names_alloca = names;
++              else
++                free (old);
++            }
+ 
+-	  pglob->gl_pathv = new_gl_pathv;
++          pglob->gl_pathv = new_gl_pathv;
+ 
+-	  pglob->gl_pathv[pglob->gl_offs + pglob->gl_pathc] = NULL;
++          pglob->gl_pathv[pglob->gl_offs + pglob->gl_pathc] = NULL;
+ 
+-	  pglob->gl_flags = flags;
+-	}
++          pglob->gl_flags = flags;
++        }
+     }
+ 
+   if (stream != NULL)
+     {
+       save = errno;
+       if (__glibc_unlikely (flags & GLOB_ALTDIRFUNC))
+-	(*pglob->gl_closedir) (stream);
++        (*pglob->gl_closedir) (stream);
+       else
+-	closedir (stream);
++        closedir (stream);
+       __set_errno (save);
+     }
+ 
++  scratch_buffer_free (&s);
+   return result;
+ }
+diff -rup a/sysdeps/gnu/glob-lstat-compat.c b/sysdeps/gnu/glob-lstat-compat.c
+--- a/sysdeps/gnu/glob-lstat-compat.c	2018-08-01 01:10:47.000000000 -0400
++++ b/sysdeps/gnu/glob-lstat-compat.c	2022-05-02 17:51:04.167557574 -0400
+@@ -29,7 +29,8 @@
+ #define GLOB_ATTRIBUTE attribute_compat_text_section
+ 
+ /* Avoid calling gl_lstat with GLOB_ALTDIRFUNC.  */
+-#define GLOB_NO_LSTAT
++#define GLOB_LSTAT   gl_stat
++#define GLOB_LSTAT64 __stat64
+ 
+ #include <posix/glob.c>
+ 
+diff -rup a/sysdeps/unix/sysv/linux/glob-lstat-compat.c b/sysdeps/unix/sysv/linux/glob-lstat-compat.c
+--- a/sysdeps/unix/sysv/linux/glob-lstat-compat.c	2018-08-01 01:10:47.000000000 -0400
++++ b/sysdeps/unix/sysv/linux/glob-lstat-compat.c	2022-05-02 23:05:45.197297341 -0400
+@@ -30,7 +30,12 @@
+ #define GLOB_ATTRIBUTE attribute_compat_text_section
+ 
+ /* Avoid calling gl_lstat with GLOB_ALTDIRFUNC.  */
+-#define GLOB_NO_LSTAT
++# define COMPILE_GLOB64	1
++# define struct_stat    struct stat
++# define struct_stat64  struct stat64
++# define GLOB_LSTAT     gl_stat
++# define GLOB_STAT64    __stat64
++# define GLOB_LSTAT64   __stat64
+ 
+ #include <posix/glob.c>
+ 
diff --git a/SOURCES/glibc-rh2077835.patch b/SOURCES/glibc-rh2077835.patch
new file mode 100644
index 0000000..7323d49
--- /dev/null
+++ b/SOURCES/glibc-rh2077835.patch
@@ -0,0 +1,211 @@
+commit 2376944b9e5c0364b9fb473e4d8dabca31b57167
+Author: Stefan Liebler <stli@linux.ibm.com>
+Date:   Wed Apr 13 14:36:09 2022 +0200
+
+    S390: Add new s390 platform z16.
+
+    The new IBM z16 is added to platform string array.
+    The macro _DL_PLATFORMS_COUNT is incremented.
+
+    _dl_hwcaps_subdirs is extended by "z16" if HWCAP_S390_VXRS_PDE2
+    is set. HWCAP_S390_NNPA is not tested in _dl_hwcaps_subdirs_active
+    as those instructions may be replaced or removed in future.
+
+    tst-glibc-hwcaps.c is extended in order to test z16 via new marker5.
+
+    A fatal glibc error is dumped if glibc was built with architecture
+    level set for z16, but run on an older machine. (See dl-hwcap-check.h)
+
+Reworked for RHEL 8.7.0
+
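+As a rough illustration (not from the original commit message; paths follow
+the test layout used below): on z16 hardware the dynamic loader now prefers
+a DSO from the new glibc-hwcaps subdirectory over the older subdirectories
+and over the baseline file, roughly in this priority order:
+
+    $L/glibc-hwcaps/z16/libmarkermod5.so    # chosen when HWCAP_S390_VXRS_PDE2 is set
+    $L/glibc-hwcaps/z15/libmarkermod5.so    # chosen on z15
+    $L/libmarkermod5.so                     # baseline fallback
+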
+diff -Nrup a/elf/Makefile b/elf/Makefile
+--- a/elf/Makefile	2022-05-16 21:48:11.267916411 -0400
++++ b/elf/Makefile	2022-05-16 21:48:56.106095151 -0400
+@@ -347,7 +347,8 @@ modules-names = testobj1 testobj2 testob
+ 		libmarkermod2-1 libmarkermod2-2 \
+ 		libmarkermod3-1 libmarkermod3-2 libmarkermod3-3 \
+ 		libmarkermod4-1 libmarkermod4-2 libmarkermod4-3 libmarkermod4-4 \
+-		tst-tls20mod-bad tst-tls21mod \
++		libmarkermod5-1 libmarkermod5-2 libmarkermod5-3 libmarkermod5-4 \
++		libmarkermod5-5 tst-tls20mod-bad tst-tls21mod \
+ 
+ # Most modules build with _ISOMAC defined, but those filtered out
+ # depend on internal headers.
+@@ -1782,6 +1783,7 @@ LDFLAGS-libmarkermod1-1.so += -Wl,-sonam
+ LDFLAGS-libmarkermod2-1.so += -Wl,-soname,libmarkermod2.so
+ LDFLAGS-libmarkermod3-1.so += -Wl,-soname,libmarkermod3.so
+ LDFLAGS-libmarkermod4-1.so += -Wl,-soname,libmarkermod4.so
++LDFLAGS-libmarkermod5-1.so += -Wl,-soname,libmarkermod5.so
+ $(objpfx)libmarkermod%.os : markermodMARKER-VALUE.c
+ 	$(compile-command.c) \
+ 	  -DMARKER=marker$(firstword $(subst -, ,$*)) \
+@@ -1794,6 +1796,8 @@ $(objpfx)libmarkermod3.so: $(objpfx)libm
+ 	cp $< $@
+ $(objpfx)libmarkermod4.so: $(objpfx)libmarkermod4-1.so
+ 	cp $< $@
++$(objpfx)libmarkermod5.so: $(objpfx)libmarkermod5-1.so
++	cp $< $@
+ 
+ # tst-glibc-hwcaps-prepend checks that --glibc-hwcaps-prepend is
+ # preferred over auto-detected subdirectories.
+diff -Nrup a/elf/tst-glibc-hwcaps-cache.script b/elf/tst-glibc-hwcaps-cache.script
+--- a/elf/tst-glibc-hwcaps-cache.script	2022-05-16 21:48:11.053915558 -0400
++++ b/elf/tst-glibc-hwcaps-cache.script	2022-05-16 21:48:56.107095155 -0400
+@@ -4,6 +4,7 @@
+ cp $B/elf/libmarkermod2-1.so $L/libmarkermod2.so
+ cp $B/elf/libmarkermod3-1.so $L/libmarkermod3.so
+ cp $B/elf/libmarkermod4-1.so $L/libmarkermod4.so
++cp $B/elf/libmarkermod5-1.so $L/libmarkermod5.so
+ 
+ mkdirp 0770 $L/glibc-hwcaps/power9
+ cp $B/elf/libmarkermod2-2.so $L/glibc-hwcaps/power9/libmarkermod2.so
+@@ -20,6 +21,11 @@ mkdirp 0770 $L/glibc-hwcaps/z15
+ cp $B/elf/libmarkermod4-2.so $L/glibc-hwcaps/z13/libmarkermod4.so
+ cp $B/elf/libmarkermod4-3.so $L/glibc-hwcaps/z14/libmarkermod4.so
+ cp $B/elf/libmarkermod4-4.so $L/glibc-hwcaps/z15/libmarkermod4.so
++mkdirp 0770 $L/glibc-hwcaps/z16
++cp $B/elf/libmarkermod5-2.so $L/glibc-hwcaps/z13/libmarkermod5.so
++cp $B/elf/libmarkermod5-3.so $L/glibc-hwcaps/z14/libmarkermod5.so
++cp $B/elf/libmarkermod5-4.so $L/glibc-hwcaps/z15/libmarkermod5.so
++cp $B/elf/libmarkermod5-5.so $L/glibc-hwcaps/z16/libmarkermod5.so
+ 
+ mkdirp 0770 $L/glibc-hwcaps/x86-64-v2
+ cp $B/elf/libmarkermod2-2.so $L/glibc-hwcaps/x86-64-v2/libmarkermod2.so
+diff -Nrup a/sysdeps/s390/dl-procinfo.c b/sysdeps/s390/dl-procinfo.c
+--- a/sysdeps/s390/dl-procinfo.c	2022-05-16 21:48:11.250916343 -0400
++++ b/sysdeps/s390/dl-procinfo.c	2022-05-16 21:48:56.107095155 -0400
+@@ -64,11 +64,12 @@ PROCINFO_CLASS const char _dl_s390_cap_f
+ #if !defined PROCINFO_DECL && defined SHARED
+   ._dl_s390_platforms
+ #else
+-PROCINFO_CLASS const char _dl_s390_platforms[10][7]
++PROCINFO_CLASS const char _dl_s390_platforms[11][7]
+ #endif
+ #ifndef PROCINFO_DECL
+ = {
+-    "g5", "z900", "z990", "z9-109", "z10", "z196", "zEC12", "z13", "z14", "z15"
++    "g5", "z900", "z990", "z9-109", "z10", "z196", "zEC12", "z13", "z14", "z15",
++    "z16"
+   }
+ #endif
+ #if !defined SHARED || defined PROCINFO_DECL
+diff -Nrup a/sysdeps/s390/dl-procinfo.h b/sysdeps/s390/dl-procinfo.h
+--- a/sysdeps/s390/dl-procinfo.h	2022-05-16 21:48:11.250916343 -0400
++++ b/sysdeps/s390/dl-procinfo.h	2022-05-16 21:48:56.107095155 -0400
+@@ -23,7 +23,7 @@
+ 
+ #define _DL_HWCAP_COUNT 23
+ 
+-#define _DL_PLATFORMS_COUNT	10
++#define _DL_PLATFORMS_COUNT	11
+ 
+ /* The kernel provides up to 32 capability bits with elf_hwcap.  */
+ #define _DL_FIRST_PLATFORM	32
+diff -Nrup a/sysdeps/s390/s390-64/dl-hwcaps-subdirs.c b/sysdeps/s390/s390-64/dl-hwcaps-subdirs.c
+--- a/sysdeps/s390/s390-64/dl-hwcaps-subdirs.c	2022-05-16 21:48:11.053915558 -0400
++++ b/sysdeps/s390/s390-64/dl-hwcaps-subdirs.c	2022-05-16 21:58:02.840301911 -0400
+@@ -19,8 +19,8 @@
+ #include <dl-hwcaps.h>
+ #include <ldsodefs.h>
+ 
+-const char _dl_hwcaps_subdirs[] = "z15:z14:z13";
+-enum { subdirs_count = 3 }; /* Number of components in _dl_hwcaps_subdirs.  */
++const char _dl_hwcaps_subdirs[] = "z16:z15:z14:z13";
++enum { subdirs_count = 4 }; /* Number of components in _dl_hwcaps_subdirs.  */
+ 
+ uint32_t
+ _dl_hwcaps_subdirs_active (void)
+@@ -50,5 +50,12 @@ _dl_hwcaps_subdirs_active (void)
+     return _dl_hwcaps_subdirs_build_bitmask (subdirs_count, active);
+   ++active;
+ 
++  /* z16.
++   Note: We do not list HWCAP_S390_NNPA here as, according to the Principles of
++   Operation, those instructions may be replaced or removed in future.  */
++  if (!(GLRO (dl_hwcap) & HWCAP_S390_VXRS_PDE2))
++    return _dl_hwcaps_subdirs_build_bitmask (subdirs_count, active);
++  ++active;
++
+   return _dl_hwcaps_subdirs_build_bitmask (subdirs_count, active);
+ }
+diff -Nrup a/sysdeps/s390/s390-64/Makefile b/sysdeps/s390/s390-64/Makefile
+--- a/sysdeps/s390/s390-64/Makefile	2022-05-16 21:48:11.053915558 -0400
++++ b/sysdeps/s390/s390-64/Makefile	2022-05-16 21:54:08.832355745 -0400
+@@ -7,8 +7,11 @@ CFLAGS-rtld.c += -Wno-uninitialized -Wno
+ CFLAGS-dl-load.c += -Wno-unused
+ CFLAGS-dl-reloc.c += -Wno-unused
+ 
+-$(objpfx)tst-glibc-hwcaps: $(objpfx)libmarkermod2-1.so \
+-  $(objpfx)libmarkermod3-1.so $(objpfx)libmarkermod4-1.so
++$(objpfx)tst-glibc-hwcaps: \
++    $(objpfx)libmarkermod2-1.so \
++    $(objpfx)libmarkermod3-1.so \
++    $(objpfx)libmarkermod4-1.so \
++    $(objpfx)libmarkermod5-1.so
+ $(objpfx)tst-glibc-hwcaps.out: \
+   $(objpfx)libmarkermod2.so \
+     $(objpfx)glibc-hwcaps/z13/libmarkermod2.so \
+@@ -19,6 +22,11 @@ $(objpfx)tst-glibc-hwcaps.out: \
+     $(objpfx)glibc-hwcaps/z13/libmarkermod4.so \
+     $(objpfx)glibc-hwcaps/z14/libmarkermod4.so \
+     $(objpfx)glibc-hwcaps/z15/libmarkermod4.so \
++  $(objpfx)libmarkermod5.so \
++    $(objpfx)glibc-hwcaps/z13/libmarkermod5.so \
++    $(objpfx)glibc-hwcaps/z14/libmarkermod5.so \
++    $(objpfx)glibc-hwcaps/z15/libmarkermod5.so \
++    $(objpfx)glibc-hwcaps/z16/libmarkermod5.so
+ 
+ $(objpfx)glibc-hwcaps/z13/libmarkermod2.so: $(objpfx)libmarkermod2-2.so
+ 	$(make-target-directory)
+@@ -38,6 +46,18 @@ $(objpfx)glibc-hwcaps/z14/libmarkermod4.
+ $(objpfx)glibc-hwcaps/z15/libmarkermod4.so: $(objpfx)libmarkermod4-4.so
+ 	$(make-target-directory)
+ 	cp $< $@
++$(objpfx)glibc-hwcaps/z13/libmarkermod5.so: $(objpfx)libmarkermod5-2.so
++	$(make-target-directory)
++	cp $< $@
++$(objpfx)glibc-hwcaps/z14/libmarkermod5.so: $(objpfx)libmarkermod5-3.so
++	$(make-target-directory)
++	cp $< $@
++$(objpfx)glibc-hwcaps/z15/libmarkermod5.so: $(objpfx)libmarkermod5-4.so
++	$(make-target-directory)
++	cp $< $@
++$(objpfx)glibc-hwcaps/z16/libmarkermod5.so: $(objpfx)libmarkermod5-5.so
++	$(make-target-directory)
++	cp $< $@
+ 
+ ifeq (no,$(build-hardcoded-path-in-tests))
+ # This is an ld.so.cache test, and RPATH/RUNPATH in the executable
+diff -Nrup a/sysdeps/s390/s390-64/tst-glibc-hwcaps.c b/sysdeps/s390/s390-64/tst-glibc-hwcaps.c
+--- a/sysdeps/s390/s390-64/tst-glibc-hwcaps.c	2022-05-16 21:48:11.053915558 -0400
++++ b/sysdeps/s390/s390-64/tst-glibc-hwcaps.c	2022-05-16 21:48:56.107095155 -0400
+@@ -25,6 +25,7 @@
+ extern int marker2 (void);
+ extern int marker3 (void);
+ extern int marker4 (void);
++extern int marker5 (void);
+ 
+ /* Return the arch level, 10 for the baseline libmarkermod*.so's.  */
+ static int
+@@ -63,9 +64,11 @@ compute_level (void)
+     return 12;
+   if (strcmp (platform, "z15") == 0)
+     return 13;
++  if (strcmp (platform, "z16") == 0)
++    return 14;
+   printf ("warning: unrecognized AT_PLATFORM value: %s\n", platform);
+-  /* Assume that the new platform supports z15.  */
+-  return 13;
++  /* Assume that the new platform supports z16.  */
++  return 14;
+ }
+ 
+ static int
+@@ -76,6 +81,7 @@ do_test (void)
+   TEST_COMPARE (marker2 (), MIN (level - 9, 2));
+   TEST_COMPARE (marker3 (), MIN (level - 9, 3));
+   TEST_COMPARE (marker4 (), MIN (level - 9, 4));
++  TEST_COMPARE (marker5 (), MIN (level - 9, 5));
+   return 0;
+ }
+ 
diff --git a/SOURCES/glibc-rh2086853.patch b/SOURCES/glibc-rh2086853.patch
new file mode 100644
index 0000000..d11e4cb
--- /dev/null
+++ b/SOURCES/glibc-rh2086853.patch
@@ -0,0 +1,30 @@
+commit 61a87530108ec9181e1b18a9b727ec3cc3ba7532
+Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
+Date:   Fri May 13 10:01:47 2022 +0530
+
+    fortify: Ensure that __glibc_fortify condition is a constant [BZ #29141]
+    
+    The fix c8ee1c85 introduced a -1 check for object size without also
+    checking that object size is a constant.  Because of this, the tree
+    optimizer passes in gcc fail to fold away one of the branches in
+    __glibc_fortify and trips on a spurious Wstringop-overflow.  The warning
+    itself is incorrect and the branch does go away eventually in DCE in the
+    rtl passes in gcc, but the constant check is a helpful hint to simplify
+    code early, so add it in.
+    
+    Resolves: BZ #29141
+    Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
+
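+A minimal sketch of the idea (hypothetical macro name, simplified from the
+actual glibc macro):
+
+    /* Treat OBJSZ as "size unknown" only when that fact is itself a
+       compile-time constant.  Otherwise the tree optimizers cannot fold
+       the branch away early and GCC emits the spurious
+       -Wstringop-overflow warning described above.  */
+    #define safe_or_unknown_len(len, objsz)              \
+      ((__builtin_constant_p (objsz)                     \
+        && (objsz) == (__SIZE_TYPE__) -1)                \
+       || (len) <= (objsz))
+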
+diff --git a/misc/sys/cdefs.h b/misc/sys/cdefs.h
+index 404496c7d6da4fb3..f3d7efdd2a9320f7 100644
+--- a/misc/sys/cdefs.h
++++ b/misc/sys/cdefs.h
+@@ -145,7 +145,7 @@
+ /* Length is known to be safe at compile time if the __L * __S <= __OBJSZ
+    condition can be folded to a constant and if it is true, or unknown (-1) */
+ #define __glibc_safe_or_unknown_len(__l, __s, __osz) \
+-  ((__osz) == (__SIZE_TYPE__) -1					      \
++  ((__builtin_constant_p (__osz) && (__osz) == (__SIZE_TYPE__) -1)	      \
+    || (__glibc_unsigned_or_positive (__l)				      \
+        && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l), \
+ 						       (__s), (__osz)))	      \
diff --git a/SOURCES/glibc-sw24097-1.patch b/SOURCES/glibc-sw24097-1.patch
new file mode 100644
index 0000000..0894a24
--- /dev/null
+++ b/SOURCES/glibc-sw24097-1.patch
@@ -0,0 +1,254 @@
+From ea31ff23cc0f5577d6947a717a2d733d7963f5c6 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 1 Feb 2019 12:17:09 -0800
+Subject: [PATCH] x86-64 memchr/wmemchr: Properly handle the length parameter
+ [BZ #24097]
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memchr/wmemchr for x32.  Tested on x86-64 and x32.  On
+x86-64, libc.so is the same with and without the fix.
+
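+A rough C illustration of the fix (assumes the x32 ABI; the helper name is
+made up and is not part of this commit):
+
+    /* On x32 the size_t argument occupies only the low 32 bits of %rdx;
+       the upper half may hold stale data.  Writing the 32-bit subregister
+       (e.g. "movl %edx, %edx") zero-extends into the full register, which
+       is all the sanitising the fixed assembly needs before using %rdx as
+       a 64-bit length.  */
+    static inline unsigned long long
+    sanitize_x32_length (unsigned long long rdx)
+    {
+      return (unsigned int) rdx;   /* keep bits 31:0, clear bits 63:32 */
+    }
+
+The new tst-size_t-* tests added below provoke this case: parameter_t packs
+the length into the low half of a 64-bit register whose other half carries a
+pointer, so the length can reach the implementation with non-zero upper bits.
+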
+	[BZ #24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/memchr.S: Use RDX_LP for length.  Clear the
+	upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and
+	tst-size_t-wmemchr.
+	* sysdeps/x86_64/x32/test-size_t.h: New file.
+	* sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise.
+	* sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise.
+
+(cherry picked from commit 97700a34f36721b11a754cf37a1cc40695ece1fd)
+---
+ sysdeps/x86_64/memchr.S                 | 10 ++--
+ sysdeps/x86_64/multiarch/memchr-avx2.S  |  8 ++-
+ sysdeps/x86_64/x32/Makefile             |  8 +++
+ sysdeps/x86_64/x32/test-size_t.h        | 35 ++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-memchr.c  | 72 +++++++++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20 +++++++
+ 6 files changed, 148 insertions(+), 5 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/test-size_t.h
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memchr.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemchr.c
+
+diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
+index feef5d4f..cb320257 100644
+--- a/sysdeps/x86_64/memchr.S
++++ b/sysdeps/x86_64/memchr.S
+@@ -34,12 +34,16 @@ ENTRY(MEMCHR)
+ 	mov	%edi, %ecx
+ 
+ #ifdef USE_AS_WMEMCHR
+-	test	%rdx, %rdx
++	test	%RDX_LP, %RDX_LP
+ 	jz	L(return_null)
+-	shl	$2, %rdx
++	shl	$2, %RDX_LP
+ #else
++# ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %edx
++# endif
+ 	punpcklbw %xmm1, %xmm1
+-	test	%rdx, %rdx
++	test	%RDX_LP, %RDX_LP
+ 	jz	L(return_null)
+ 	punpcklbw %xmm1, %xmm1
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
+index 5f5e7725..c81da19b 100644
+--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
+@@ -40,16 +40,20 @@
+ ENTRY (MEMCHR)
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check for zero length.  */
+-	testq	%rdx, %rdx
++	test	%RDX_LP, %RDX_LP
+ 	jz	L(null)
+ # endif
+ 	movl	%edi, %ecx
+ 	/* Broadcast CHAR to YMM0.  */
+ 	vmovd	%esi, %xmm0
+ # ifdef USE_AS_WMEMCHR
+-	shl	$2, %rdx
++	shl	$2, %RDX_LP
+ 	vpbroadcastd %xmm0, %ymm0
+ # else
++#  ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %edx
++#  endif
+ 	vpbroadcastb %xmm0, %ymm0
+ # endif
+ 	/* Check if we may cross page boundary with one vector load.  */
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index f2ebc24f..7d528889 100644
+--- a/sysdeps/x86_64/x32/Makefile
++++ b/sysdeps/x86_64/x32/Makefile
+@@ -4,3 +4,11 @@ ifeq ($(subdir),math)
+ # 64-bit llround.  Add -fno-builtin-lround to silence the compiler.
+ CFLAGS-s_llround.c += -fno-builtin-lround
+ endif
++
++ifeq ($(subdir),string)
++tests += tst-size_t-memchr
++endif
++
++ifeq ($(subdir),wcsmbs)
++tests += tst-size_t-wmemchr
++endif
+diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h
+new file mode 100644
+index 00000000..78a94086
+--- /dev/null
++++ b/sysdeps/x86_64/x32/test-size_t.h
+@@ -0,0 +1,35 @@
++/* Test string/memory functions with size_t in the lower 32 bits of
++   64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define TEST_MAIN
++#include <string/test-string.h>
++
++/* On x32, parameter_t may be passed in a 64-bit register with the LEN
++   field in the lower 32 bits.  When the LEN field of 64-bit register
++   is passed to string/memory function as the size_t parameter, only
++   the lower 32 bits can be used.  */
++typedef struct
++{
++  union
++    {
++      size_t len;
++      void (*fn) (void);
++    };
++  void *p;
++} parameter_t;
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c
+new file mode 100644
+index 00000000..29a3daf1
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c
+@@ -0,0 +1,72 @@
++/* Test memchr with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#ifndef WIDE
++# define TEST_NAME "memchr"
++#else
++# define TEST_NAME "wmemchr"
++#endif /* WIDE */
++#include "test-size_t.h"
++
++#ifndef WIDE
++# define MEMCHR memchr
++# define CHAR char
++# define UCHAR unsigned char
++#else
++# include <wchar.h>
++# define MEMCHR wmemchr
++# define CHAR wchar_t
++# define UCHAR wchar_t
++#endif /* WIDE */
++
++IMPL (MEMCHR, 1)
++
++typedef CHAR * (*proto_t) (const CHAR*, int, size_t);
++
++static CHAR *
++__attribute__ ((noinline, noclone))
++do_memchr (parameter_t a, parameter_t b)
++{
++  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
++  parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      c.fn = impl->fn;
++      CHAR *res = do_memchr (src, c);
++      if (res)
++	{
++	  error (0, 0, "Wrong result in function %s: %p != NULL",
++		 impl->name, res);
++	  ret = 1;
++	}
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
+new file mode 100644
+index 00000000..877801d6
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
+@@ -0,0 +1,20 @@
++/* Test wmemchr with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define WIDE 1
++#include "tst-size_t-memchr.c"
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw24097-2.patch b/SOURCES/glibc-sw24097-2.patch
new file mode 100644
index 0000000..937d393
--- /dev/null
+++ b/SOURCES/glibc-sw24097-2.patch
@@ -0,0 +1,227 @@
+From a107bd201f836b1d82e1a9086a2a48625ac1c0dd Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 2 Mar 2022 13:52:42 -0800
+Subject: [PATCH] x86-64 memcmp/wmemcmp: Properly handle the length parameter
+ [BZ #24097]
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memcmp/wmemcmp for x32.  Tested on x86-64 and x32.  On
+x86-64, libc.so is the same with and without the fix.
+
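+A small C sketch of the wide-character case (hypothetical helper, not part
+of this commit):
+
+    #include <wchar.h>
+
+    /* wmemcmp receives its count in wchar_t units, so the byte length is
+       count * sizeof (wchar_t).  Doing that scaling on the 32-bit
+       subregister ("shl $2, %edx", i.e. RDX_LP on x32) multiplies by four
+       and zero-extends at the same time, which is why the wide paths need
+       no separate "movl %edx, %edx".  */
+    static unsigned long long
+    wide_byte_length (unsigned int count)   /* x32 size_t is 32 bits */
+    {
+      return (unsigned long long) count * sizeof (wchar_t);
+    }
+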
+	[BZ #24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for
+	length.  Clear the upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise.
+	* sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and
+	tst-size_t-wmemcmp.
+	* sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise.
+
+(cherry picked from commit b304fc201d2f6baf52ea790df8643e99772243cd)
+---
+ sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S |  7 +-
+ sysdeps/x86_64/multiarch/memcmp-sse4.S       |  9 ++-
+ sysdeps/x86_64/multiarch/memcmp-ssse3.S      |  7 +-
+ sysdeps/x86_64/x32/Makefile                  |  4 +-
+ sysdeps/x86_64/x32/tst-size_t-memcmp.c       | 76 ++++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wmemcmp.c      | 20 ++++++
+ 6 files changed, 114 insertions(+), 9 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+index 30f764c3..e3a35b89 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+@@ -58,9 +58,12 @@
+ 	.section .text.avx,"ax",@progbits
+ ENTRY (MEMCMP)
+ # ifdef USE_AS_WMEMCMP
+-	shl	$2, %rdx
++	shl	$2, %RDX_LP
++# elif defined __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %edx
+ # endif
+-	cmpq	$VEC_SIZE, %rdx
++	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+ 
+ 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
+index 8e164f2c..302900f5 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
++++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
+@@ -42,13 +42,16 @@
+ 	.section .text.sse4.1,"ax",@progbits
+ ENTRY (MEMCMP)
+ # ifdef USE_AS_WMEMCMP
+-	shl	$2, %rdx
++	shl	$2, %RDX_LP
++# elif defined __ILP32__
++	/* Clear the upper 32 bits.  */
++	mov	%edx, %edx
+ # endif
+ 	pxor	%xmm0, %xmm0
+-	cmp	$79, %rdx
++	cmp	$79, %RDX_LP
+ 	ja	L(79bytesormore)
+ # ifndef USE_AS_WMEMCMP
+-	cmp	$1, %rdx
++	cmp	$1, %RDX_LP
+ 	je	L(firstbyte)
+ # endif
+ 	add	%rdx, %rsi
+diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+index 6f76c641..69d030fc 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
++++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+@@ -33,9 +33,12 @@
+ 	atom_text_section
+ ENTRY (MEMCMP)
+ # ifdef USE_AS_WMEMCMP
+-	shl	$2, %rdx
+-	test	%rdx, %rdx
++	shl	$2, %RDX_LP
++	test	%RDX_LP, %RDX_LP
+ 	jz	L(equal)
++# elif defined __ILP32__
++	/* Clear the upper 32 bits.  */
++	mov	%edx, %edx
+ # endif
+ 	mov	%rdx, %rcx
+ 	mov	%rdi, %rdx
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index 7d528889..ddec7f04 100644
+--- a/sysdeps/x86_64/x32/Makefile
++++ b/sysdeps/x86_64/x32/Makefile
+@@ -6,9 +6,9 @@ CFLAGS-s_llround.c += -fno-builtin-lround
+ endif
+ 
+ ifeq ($(subdir),string)
+-tests += tst-size_t-memchr
++tests += tst-size_t-memchr tst-size_t-memcmp
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+-tests += tst-size_t-wmemchr
++tests += tst-size_t-wmemchr tst-size_t-wmemcmp
+ endif
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
+new file mode 100644
+index 00000000..9bd6fdb4
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
+@@ -0,0 +1,76 @@
++/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define TEST_MAIN
++#ifdef WIDE
++# define TEST_NAME "wmemcmp"
++#else
++# define TEST_NAME "memcmp"
++#endif
++
++#include "test-size_t.h"
++
++#ifdef WIDE
++# include <inttypes.h>
++# include <wchar.h>
++
++# define MEMCMP wmemcmp
++# define CHAR wchar_t
++#else
++# define MEMCMP memcmp
++# define CHAR char
++#endif
++
++IMPL (MEMCMP, 1)
++
++typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
++
++static int
++__attribute__ ((noinline, noclone))
++do_memcmp (parameter_t a, parameter_t b)
++{
++  return CALL (&b, a.p, b.p, a.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
++  parameter_t src = { { 0 }, buf2 };
++
++  memcpy (buf1, buf2, page_size);
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      src.fn = impl->fn;
++      int res = do_memcmp (dest, src);
++      if (res)
++	{
++	  error (0, 0, "Wrong result in function %s: %i != 0",
++		 impl->name, res);
++	  ret = 1;
++	}
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
+new file mode 100644
+index 00000000..e8b5ffd0
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
+@@ -0,0 +1,20 @@
++/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define WIDE 1
++#include "tst-size_t-memcmp.c"
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw24097-3.patch b/SOURCES/glibc-sw24097-3.patch
new file mode 100644
index 0000000..45c3c04
--- /dev/null
+++ b/SOURCES/glibc-sw24097-3.patch
@@ -0,0 +1,393 @@
+From e6e209eb05545a36283335b07a1db9471cbccd1a Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 2 Mar 2022 13:56:00 -0800
+Subject: [PATCH] x86-64 memcpy: Properly handle the length parameter [BZ
+ #24097]
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memcpy for x32.  Tested on x86-64 and x32.  On x86-64,
+libc.so is the same with and without the fix.
+
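+A short C sketch of what the *_chk entry points guard (hypothetical helper,
+simplified from the assembly changed below):
+
+    #include <string.h>
+
+    /* __mempcpy_chk compares the length against the destination object
+       size before copying and returns dest + len.  On x32 both values
+       are 32-bit, so the comparison and the dest + len computation must
+       use the 32-bit register width (RDX_LP/RCX_LP), or stale upper bits
+       could change the outcome of the bounds check.  */
+    static void *
+    mempcpy_chk_sketch (void *dst, const void *src,
+                        unsigned int len, unsigned int dst_size)
+    {
+      if (dst_size < len)
+        return NULL;               /* the real code calls __chk_fail ()  */
+      return (char *) memcpy (dst, src, len) + len;
+    }
+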
+	[BZ #24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for
+	length.  Clear the upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise.
+	* sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S:
+	Likewise.
+	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
+	Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy.
+	tst-size_t-wmemchr.
+	* sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file.
+
+(cherry picked from commit 231c56760c1e2ded21ad96bbb860b1f08c556c7a)
+---
+ sysdeps/x86_64/multiarch/memcpy-ssse3-back.S  | 17 ++++--
+ sysdeps/x86_64/multiarch/memcpy-ssse3.S       | 17 ++++--
+ .../multiarch/memmove-avx512-no-vzeroupper.S  | 16 +++--
+ .../multiarch/memmove-vec-unaligned-erms.S    | 54 +++++++++--------
+ sysdeps/x86_64/x32/Makefile                   |  2 +-
+ sysdeps/x86_64/x32/tst-size_t-memcpy.c        | 58 +++++++++++++++++++
+ 6 files changed, 122 insertions(+), 42 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcpy.c
+
+diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+index 3cd11233..568eebd3 100644
+--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
++++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+@@ -45,28 +45,33 @@
+ 	.section .text.ssse3,"ax",@progbits
+ #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ ENTRY (MEMPCPY_CHK)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMPCPY_CHK)
+ 
+ ENTRY (MEMPCPY)
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
++	mov	%RDI_LP, %RAX_LP
++	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start)
+ END (MEMPCPY)
+ #endif
+ 
+ #if !defined USE_AS_BCOPY
+ ENTRY (MEMCPY_CHK)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMCPY_CHK)
+ #endif
+ 
+ ENTRY (MEMCPY)
+-	mov	%rdi, %rax
++	mov	%RDI_LP, %RAX_LP
+ #ifdef USE_AS_MEMPCPY
+-	add	%rdx, %rax
++	add	%RDX_LP, %RAX_LP
++#endif
++
++#ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	mov	%edx, %edx
+ #endif
+ 
+ #ifdef USE_AS_MEMMOVE
+diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+index 0240bfa3..0bd5ee99 100644
+--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
++++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+@@ -45,28 +45,33 @@
+ 	.section .text.ssse3,"ax",@progbits
+ #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ ENTRY (MEMPCPY_CHK)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMPCPY_CHK)
+ 
+ ENTRY (MEMPCPY)
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
++	mov	%RDI_LP, %RAX_LP
++	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start)
+ END (MEMPCPY)
+ #endif
+ 
+ #if !defined USE_AS_BCOPY
+ ENTRY (MEMCPY_CHK)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMCPY_CHK)
+ #endif
+ 
+ ENTRY (MEMCPY)
+-	mov	%rdi, %rax
++	mov	%RDI_LP, %RAX_LP
+ #ifdef USE_AS_MEMPCPY
+-	add	%rdx, %rax
++	add	%RDX_LP, %RAX_LP
++#endif
++
++#ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	mov	%edx, %edx
+ #endif
+ 
+ #ifdef USE_AS_MEMMOVE
+diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+index effc3ac2..6ca2bbc9 100644
+--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
++++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+@@ -24,27 +24,31 @@
+ 
+ 	.section .text.avx512,"ax",@progbits
+ ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__mempcpy_chk_avx512_no_vzeroupper)
+ 
+ ENTRY (__mempcpy_avx512_no_vzeroupper)
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
++	mov	%RDI_LP, %RAX_LP
++	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start)
+ END (__mempcpy_avx512_no_vzeroupper)
+ 
+ ENTRY (__memmove_chk_avx512_no_vzeroupper)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__memmove_chk_avx512_no_vzeroupper)
+ 
+ ENTRY (__memmove_avx512_no_vzeroupper)
+-	mov	%rdi, %rax
++	mov	%RDI_LP, %RAX_LP
+ # ifdef USE_AS_MEMPCPY
+-	add	%rdx, %rax
++	add	%RDX_LP, %RAX_LP
+ # endif
+ L(start):
++# ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	mov	%edx, %edx
++# endif
+ 	lea	(%rsi, %rdx), %rcx
+ 	lea	(%rdi, %rdx), %r9
+ 	cmp	$512, %rdx
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index c952576c..274aa1c7 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -95,20 +95,20 @@
+ 	.section SECTION(.text),"ax",@progbits
+ #if defined SHARED && IS_IN (libc)
+ ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
+ #endif
+ 
+ ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
++	mov	%RDI_LP, %RAX_LP
++	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start)
+ END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
+ 
+ #if defined SHARED && IS_IN (libc)
+ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
+ #endif
+@@ -116,9 +116,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
+ ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
+ 	movq	%rdi, %rax
+ L(start):
+-	cmpq	$VEC_SIZE, %rdx
++# ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %edx
++# endif
++	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
++	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(more_2x_vec)
+ #if !defined USE_MULTIARCH || !IS_IN (libc)
+ L(last_2x_vec):
+@@ -138,38 +142,38 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned))
+ 
+ # if VEC_SIZE == 16
+ ENTRY (__mempcpy_chk_erms)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__mempcpy_chk_erms)
+ 
+ /* Only used to measure performance of REP MOVSB.  */
+ ENTRY (__mempcpy_erms)
+-	movq	%rdi, %rax
++	mov	%RDI_LP, %RAX_LP
+ 	/* Skip zero length.  */
+-	testq	%rdx, %rdx
++	test	%RDX_LP, %RDX_LP
+ 	jz	2f
+-	addq	%rdx, %rax
++	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start_movsb)
+ END (__mempcpy_erms)
+ 
+ ENTRY (__memmove_chk_erms)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__memmove_chk_erms)
+ 
+ ENTRY (__memmove_erms)
+ 	movq	%rdi, %rax
+ 	/* Skip zero length.  */
+-	testq	%rdx, %rdx
++	test	%RDX_LP, %RDX_LP
+ 	jz	2f
+ L(start_movsb):
+-	movq	%rdx, %rcx
+-	cmpq	%rsi, %rdi
++	mov	%RDX_LP, %RCX_LP
++	cmp	%RSI_LP, %RDI_LP
+ 	jb	1f
+ 	/* Source == destination is less common.  */
+ 	je	2f
+-	leaq	(%rsi,%rcx), %rdx
+-	cmpq	%rdx, %rdi
++	lea	(%rsi,%rcx), %RDX_LP
++	cmp	%RDX_LP, %RDI_LP
+ 	jb	L(movsb_backward)
+ 1:
+ 	rep movsb
+@@ -189,20 +193,20 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
+ 
+ # ifdef SHARED
+ ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
+ # endif
+ 
+ ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
++	mov	%RDI_LP, %RAX_LP
++	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start_erms)
+ END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+ 
+ # ifdef SHARED
+ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
+ # endif
+@@ -210,9 +214,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
+ ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ 	movq	%rdi, %rax
+ L(start_erms):
+-	cmpq	$VEC_SIZE, %rdx
++# ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %edx
++# endif
++	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
++	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(movsb_more_2x_vec)
+ L(last_2x_vec):
+ 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
+@@ -236,7 +244,7 @@ L(movsb):
+ 	/* Avoid slow backward REP MOVSB.  */
+ 	jb	L(more_8x_vec_backward)
+ 1:
+-	movq	%rdx, %rcx
++	mov	%RDX_LP, %RCX_LP
+ 	rep movsb
+ L(nop):
+ 	ret
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index ddec7f04..2fe1e5ac 100644
+--- a/sysdeps/x86_64/x32/Makefile
++++ b/sysdeps/x86_64/x32/Makefile
+@@ -6,7 +6,7 @@ CFLAGS-s_llround.c += -fno-builtin-lround
+ endif
+ 
+ ifeq ($(subdir),string)
+-tests += tst-size_t-memchr tst-size_t-memcmp
++tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
+new file mode 100644
+index 00000000..66b71e17
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
+@@ -0,0 +1,58 @@
++/* Test memcpy with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define TEST_NAME "memcpy"
++#include "test-size_t.h"
++
++IMPL (memcpy, 1)
++
++typedef void *(*proto_t) (void *, const void *, size_t);
++
++static void *
++__attribute__ ((noinline, noclone))
++do_memcpy (parameter_t a, parameter_t b)
++{
++  return CALL (&b, a.p, b.p, a.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  parameter_t dest = { { page_size }, buf1 };
++  parameter_t src = { { 0 }, buf2 };
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      src.fn = impl->fn;
++      do_memcpy (dest, src);
++      int res = memcmp (dest.p, src.p, dest.len);
++      if (res)
++	{
++	  error (0, 0, "Wrong result in function %s: %i != 0",
++		 impl->name, res);
++	  ret = 1;
++	}
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+-- 
+GitLab
+
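As background for the tst-size_t-* tests added above, here is a minimal standalone C sketch of the x32 hazard they exercise. It is illustrative only: the struct layout and names are invented here and are not copied from glibc's test-size_t.h, and the mismatch it hunts for only shows up when built with -mx32 against an unfixed libc.

/* Illustrative sketch only, not glibc code.  Under -mx32 this 8-byte
   struct travels in a single 64-bit register: the low 32 bits hold
   len and the high 32 bits hold the pointer p, so an assembly memcpy
   that trusts the full 64-bit length register sees a huge size.  */
#include <stdio.h>
#include <string.h>

struct arg
{
  size_t len;   /* 4 bytes under -mx32 */
  void *p;      /* 4 bytes under -mx32, lands in the upper half */
};

static void *
__attribute__ ((noinline, noclone))
do_copy (struct arg dst, struct arg src)
{
  /* A correct memcpy must honour only the low 32 bits of the length.  */
  return memcpy (dst.p, src.p, dst.len);
}

int
main (void)
{
  static char out[64], in[64] = "x32 length sanity check";
  struct arg dst = { sizeof in, out };
  struct arg src = { 0, in };

  do_copy (dst, src);
  puts (memcmp (out, in, sizeof in) == 0 ? "ok" : "mismatch");
  return 0;
}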
diff --git a/SOURCES/glibc-sw24097-4.patch b/SOURCES/glibc-sw24097-4.patch
new file mode 100644
index 0000000..ed6ef9e
--- /dev/null
+++ b/SOURCES/glibc-sw24097-4.patch
@@ -0,0 +1,148 @@
+From c62e1cf68a70de8ac4cc8aebae760784009777c5 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 1 Feb 2019 12:20:54 -0800
+Subject: [PATCH] x86-64 memrchr: Properly handle the length parameter [BZ
+ #24097]
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memrchr for x32.  Tested on x86-64 and x32.  On x86-64,
+libc.so is the same with and without the fix.
+
+	[BZ #24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/memrchr.S: Use RDX_LP for length.
+	* sysdeps/x86_64/multiarch/memrchr-avx2.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memrchr.
+	* sysdeps/x86_64/x32/tst-size_t-memrchr.c: New file.
+
+(cherry picked from commit ecd8b842cf37ea112e59cd9085ff1f1b6e208ae0)
+---
+ sysdeps/x86_64/memrchr.S                |  4 +-
+ sysdeps/x86_64/multiarch/memrchr-avx2.S |  4 +-
+ sysdeps/x86_64/x32/Makefile             |  3 +-
+ sysdeps/x86_64/x32/tst-size_t-memrchr.c | 57 +++++++++++++++++++++++++
+ 4 files changed, 63 insertions(+), 5 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memrchr.c
+
+diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
+index b8e3fa1d..dc82f8f7 100644
+--- a/sysdeps/x86_64/memrchr.S
++++ b/sysdeps/x86_64/memrchr.S
+@@ -24,13 +24,13 @@
+ ENTRY (__memrchr)
+ 	movd	%esi, %xmm1
+ 
+-	sub	$16, %rdx
++	sub	$16, %RDX_LP
+ 	jbe	L(length_less16)
+ 
+ 	punpcklbw	%xmm1, %xmm1
+ 	punpcklbw	%xmm1, %xmm1
+ 
+-	add	%rdx, %rdi
++	add	%RDX_LP, %RDI_LP
+ 	pshufd	$0, %xmm1, %xmm1
+ 
+ 	movdqu	(%rdi), %xmm0
+diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
+index b41a58bc..ce488dd9 100644
+--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
+@@ -32,10 +32,10 @@ ENTRY (__memrchr_avx2)
+ 	vmovd	%esi, %xmm0
+ 	vpbroadcastb %xmm0, %ymm0
+ 
+-	subq	$VEC_SIZE, %rdx
++	sub	$VEC_SIZE, %RDX_LP
+ 	jbe	L(last_vec_or_less)
+ 
+-	addq	%rdx, %rdi
++	add	%RDX_LP, %RDI_LP
+ 
+ 	/* Check the last VEC_SIZE bytes.  */
+ 	vpcmpeqb (%rdi), %ymm0, %ymm1
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index 2fe1e5ac..e99dbd7c 100644
+--- a/sysdeps/x86_64/x32/Makefile
++++ b/sysdeps/x86_64/x32/Makefile
+@@ -6,7 +6,8 @@ CFLAGS-s_llround.c += -fno-builtin-lround
+ endif
+ 
+ ifeq ($(subdir),string)
+-tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
++tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
++	 tst-size_t-memrchr
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memrchr.c b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
+new file mode 100644
+index 00000000..c83699c0
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
+@@ -0,0 +1,57 @@
++/* Test memrchr with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define TEST_NAME "memrchr"
++#include "test-size_t.h"
++
++IMPL (memchr, 1)
++
++typedef void * (*proto_t) (const void *, int, size_t);
++
++static void *
++__attribute__ ((noinline, noclone))
++do_memrchr (parameter_t a, parameter_t b)
++{
++  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  parameter_t src = { { page_size }, buf2 };
++  parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      c.fn = impl->fn;
++      void * res = do_memrchr (src, c);
++      if (res)
++	{
++	  error (0, 0, "Wrong result in function %s: %p != NULL",
++		 impl->name, res);
++	  ret = 1;
++	}
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw24097-5.patch b/SOURCES/glibc-sw24097-5.patch
new file mode 100644
index 0000000..8c51514
--- /dev/null
+++ b/SOURCES/glibc-sw24097-5.patch
@@ -0,0 +1,287 @@
+From bd6061191c63fb24025f3bb4cfc88519024f2b7a Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 2 Mar 2022 14:03:51 -0800
+Subject: [PATCH] x86-64 memset/wmemset: Properly handle the length parameter
+ [BZ #24097]
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memset/wmemset for x32.  Tested on x86-64 and x32.  On
+x86-64, libc.so is the same with and without the fix.
+
+	[BZ #24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: Use
+	RDX_LP for length.  Clear the upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-wmemset.
+	* sysdeps/x86_64/x32/tst-size_t-memset.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-wmemset.c: Likewise.
+
+(cherry picked from commit 82d0b4a4d76db554eb6757acb790fcea30b19965)
+---
+ .../multiarch/memset-avx512-no-vzeroupper.S   |  6 +-
+ .../multiarch/memset-vec-unaligned-erms.S     | 34 +++++----
+ sysdeps/x86_64/x32/Makefile                   |  4 +-
+ sysdeps/x86_64/x32/tst-size_t-memset.c        | 73 +++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wmemset.c       | 20 +++++
+ 5 files changed, 121 insertions(+), 16 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memset.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemset.c
+
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
+index 689cc119..99e25519 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
++++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
+@@ -29,12 +29,16 @@
+ 	.section .text.avx512,"ax",@progbits
+ #if defined PIC
+ ENTRY (MEMSET_CHK)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMSET_CHK)
+ #endif
+ 
+ ENTRY (MEMSET)
++# ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	mov	%edx, %edx
++# endif
+ 	vpxor	%xmm0, %xmm0, %xmm0
+ 	vmovd	%esi, %xmm1
+ 	lea	(%rdi, %rdx), %rsi
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 270a1d49..9a0fd818 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -65,8 +65,8 @@
+ 	.section SECTION(.text),"ax",@progbits
+ #if VEC_SIZE == 16 && IS_IN (libc)
+ ENTRY (__bzero)
+-	movq	%rdi, %rax /* Set return value.  */
+-	movq	%rsi, %rdx /* Set n.  */
++	mov	%RDI_LP, %RAX_LP /* Set return value.  */
++	mov	%RSI_LP, %RDX_LP /* Set n.  */
+ 	pxor	%xmm0, %xmm0
+ 	jmp	L(entry_from_bzero)
+ END (__bzero)
+@@ -76,13 +76,13 @@ weak_alias (__bzero, bzero)
+ #if IS_IN (libc)
+ # if defined SHARED
+ ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+ # endif
+ 
+ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
+-	shlq	$2, %rdx
++	shl	$2, %RDX_LP
+ 	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ 	jmp	L(entry_from_bzero)
+ END (WMEMSET_SYMBOL (__wmemset, unaligned))
+@@ -90,13 +90,17 @@ END (WMEMSET_SYMBOL (__wmemset, unaligned))
+ 
+ #if defined SHARED && IS_IN (libc)
+ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
+ #endif
+ 
+ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+ 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
++# ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	mov	%edx, %edx
++# endif
+ L(entry_from_bzero):
+ 	cmpq	$VEC_SIZE, %rdx
+ 	jb	L(less_vec)
+@@ -112,14 +116,14 @@ END (MEMSET_SYMBOL (__memset, unaligned))
+ 
+ # if VEC_SIZE == 16
+ ENTRY (__memset_chk_erms)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__memset_chk_erms)
+ 
+ /* Only used to measure performance of REP STOSB.  */
+ ENTRY (__memset_erms)
+ 	/* Skip zero length.  */
+-	testq	%rdx, %rdx
++	test	%RDX_LP, %RDX_LP
+ 	jnz	 L(stosb)
+ 	movq	%rdi, %rax
+ 	ret
+@@ -131,11 +135,11 @@ ENTRY (MEMSET_SYMBOL (__memset, erms))
+ L(stosb):
+ 	/* Issue vzeroupper before rep stosb.  */
+ 	VZEROUPPER
+-	movq	%rdx, %rcx
++	mov	%RDX_LP, %RCX_LP
+ 	movzbl	%sil, %eax
+-	movq	%rdi, %rdx
++	mov	%RDI_LP, %RDX_LP
+ 	rep stosb
+-	movq	%rdx, %rax
++	mov	%RDX_LP, %RAX_LP
+ 	ret
+ # if VEC_SIZE == 16
+ END (__memset_erms)
+@@ -145,16 +149,20 @@ END (MEMSET_SYMBOL (__memset, erms))
+ 
+ # if defined SHARED && IS_IN (libc)
+ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+ # endif
+ 
+ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+-	cmpq	$VEC_SIZE, %rdx
++# ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	mov	%edx, %edx
++# endif
++	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
++	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(stosb_more_2x_vec)
+ 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+ 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index e99dbd7c..98bd9ae9 100644
+--- a/sysdeps/x86_64/x32/Makefile
++++ b/sysdeps/x86_64/x32/Makefile
+@@ -7,9 +7,9 @@ endif
+ 
+ ifeq ($(subdir),string)
+ tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+-	 tst-size_t-memrchr
++	 tst-size_t-memrchr tst-size_t-memset
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+-tests += tst-size_t-wmemchr tst-size_t-wmemcmp
++tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
+ endif
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memset.c b/sysdeps/x86_64/x32/tst-size_t-memset.c
+new file mode 100644
+index 00000000..2c367af6
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-memset.c
+@@ -0,0 +1,73 @@
++/* Test memset with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#ifdef WIDE
++# define TEST_NAME "wmemset"
++#else
++# define TEST_NAME "memset"
++#endif /* WIDE */
++
++#include "test-size_t.h"
++
++#ifdef WIDE
++# include <wchar.h>
++# define MEMSET wmemset
++# define CHAR wchar_t
++#else
++# define MEMSET memset
++# define CHAR char
++#endif /* WIDE */
++
++IMPL (MEMSET, 1)
++
++typedef CHAR *(*proto_t) (CHAR *, int, size_t);
++
++static void *
++__attribute__ ((noinline, noclone))
++do_memset (parameter_t a, parameter_t b)
++{
++  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  CHAR ch = 0x23;
++  parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
++  parameter_t c = { { 0 }, (void *) (uintptr_t) ch };
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      c.fn = impl->fn;
++      CHAR *p = (CHAR *) do_memset (src, c);
++      size_t i;
++      for (i = 0; i < src.len; i++)
++	if (p[i] != ch)
++	  {
++	    error (0, 0, "Wrong result in function %s", impl->name);
++	    ret = 1;
++	  }
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemset.c b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
+new file mode 100644
+index 00000000..955eb488
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
+@@ -0,0 +1,20 @@
++/* Test wmemset with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define WIDE 1
++#include "tst-size_t-memset.c"
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw24097-6.patch b/SOURCES/glibc-sw24097-6.patch
new file mode 100644
index 0000000..6884642
--- /dev/null
+++ b/SOURCES/glibc-sw24097-6.patch
@@ -0,0 +1,297 @@
+From 323c698927f67a153e278cc99870998a9c7c87af Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 1 Feb 2019 12:22:33 -0800
+Subject: [PATCH] x86-64 strncmp family: Properly handle the length parameter
+ [BZ #24097]
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes the strncmp family for x32.  Tested on x86-64 and x32.
+On x86-64, libc.so is the same with and without the fix.
+
+	[BZ #24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/strcmp-avx2.S: Use RDX_LP for length.
+	* sysdeps/x86_64/multiarch/strcmp-sse42.S: Likewise.
+	* sysdeps/x86_64/strcmp.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncasecmp,
+	tst-size_t-strncmp and tst-size_t-wcsncmp.
+	* sysdeps/x86_64/x32/tst-size_t-strncasecmp.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-strncmp.c: Likewise.
+	* sysdeps/x86_64/x32/tst-size_t-wcsncmp.c: Likewise.
+
+(cherry picked from commit ee915088a0231cd421054dbd8abab7aadf331153)
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S      |  6 +-
+ sysdeps/x86_64/multiarch/strcmp-sse42.S     |  6 +-
+ sysdeps/x86_64/strcmp.S                     |  6 +-
+ sysdeps/x86_64/x32/Makefile                 |  6 +-
+ sysdeps/x86_64/x32/tst-size_t-strncasecmp.c | 59 ++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-strncmp.c     | 78 +++++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wcsncmp.c     | 20 ++++++
+ 7 files changed, 170 insertions(+), 11 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncmp.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 327e3d87..156c1949 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -79,15 +79,15 @@
+ ENTRY (STRCMP)
+ # ifdef USE_AS_STRNCMP
+ 	/* Check for simple cases (0 or 1) in offset.  */
+-	cmp	$1, %rdx
++	cmp	$1, %RDX_LP
+ 	je	L(char0)
+ 	jb	L(zero)
+ #  ifdef USE_AS_WCSCMP
+ 	/* Convert units: from wide to byte char.  */
+-	shl	$2, %rdx
++	shl	$2, %RDX_LP
+ #  endif
+ 	/* Register %r11 tracks the maximum offset.  */
+-	movq	%rdx, %r11
++	mov	%RDX_LP, %R11_LP
+ # endif
+ 	movl	%edi, %eax
+ 	xorl	%edx, %edx
+diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+index d3c07bd2..a1ebea46 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+@@ -156,11 +156,11 @@ STRCMP_SSE42:
+ #endif
+ 
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+-	test	%rdx, %rdx
++	test	%RDX_LP, %RDX_LP
+ 	je	LABEL(strcmp_exitz)
+-	cmp	$1, %rdx
++	cmp	$1, %RDX_LP
+ 	je	LABEL(Byte0)
+-	mov	%rdx, %r11
++	mov	%RDX_LP, %R11_LP
+ #endif
+ 	mov	%esi, %ecx
+ 	mov	%edi, %eax
+diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
+index e16945b9..f47c8ad4 100644
+--- a/sysdeps/x86_64/strcmp.S
++++ b/sysdeps/x86_64/strcmp.S
+@@ -135,11 +135,11 @@ ENTRY (STRCMP)
+  * This implementation uses SSE to compare up to 16 bytes at a time.
+  */
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+-	test	%rdx, %rdx
++	test	%RDX_LP, %RDX_LP
+ 	je	LABEL(strcmp_exitz)
+-	cmp	$1, %rdx
++	cmp	$1, %RDX_LP
+ 	je	LABEL(Byte0)
+-	mov	%rdx, %r11
++	mov	%RDX_LP, %R11_LP
+ #endif
+ 	mov	%esi, %ecx
+ 	mov	%edi, %eax
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index 98bd9ae9..db302839 100644
+--- a/sysdeps/x86_64/x32/Makefile
++++ b/sysdeps/x86_64/x32/Makefile
+@@ -7,9 +7,11 @@ endif
+ 
+ ifeq ($(subdir),string)
+ tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+-	 tst-size_t-memrchr tst-size_t-memset
++	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
++	 tst-size_t-strncmp
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+-tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
++tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
++	 tst-size_t-wcsncmp
+ endif
+diff --git a/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
+new file mode 100644
+index 00000000..86233593
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
+@@ -0,0 +1,59 @@
++/* Test strncasecmp with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define TEST_NAME "strncasecmp"
++#include "test-size_t.h"
++
++IMPL (strncasecmp, 1)
++
++typedef int (*proto_t) (const char *, const char *, size_t);
++
++static int
++__attribute__ ((noinline, noclone))
++do_strncasecmp (parameter_t a, parameter_t b)
++{
++  return CALL (&b, a.p, b.p, a.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  parameter_t dest = { { page_size }, buf1 };
++  parameter_t src = { { 0 }, buf2 };
++
++  strncpy ((char *) buf1, (const char *) buf2, page_size);
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      src.fn = impl->fn;
++      int res = do_strncasecmp (dest, src);
++      if (res)
++	{
++	  error (0, 0, "Wrong result in function %s: %i != 0",
++		 impl->name, res);
++	  ret = 1;
++	}
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-strncmp.c b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
+new file mode 100644
+index 00000000..54e6bd83
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
+@@ -0,0 +1,78 @@
++/* Test strncmp with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#ifdef WIDE
++# define TEST_NAME "wcsncmp"
++#else
++# define TEST_NAME "strncmp"
++#endif
++
++#include "test-size_t.h"
++
++#ifdef WIDE
++# include <wchar.h>
++
++# define STRNCMP wcsncmp
++# define STRNCPY wcsncpy
++# define CHAR wchar_t
++#else
++# define STRNCMP strncmp
++# define STRNCPY strncpy
++# define CHAR char
++#endif
++
++IMPL (STRNCMP, 1)
++
++typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
++
++
++static int
++__attribute__ ((noinline, noclone))
++do_strncmp (parameter_t a, parameter_t b)
++{
++  return CALL (&b, a.p, b.p, a.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  size_t size = page_size / sizeof (CHAR);
++  parameter_t dest = { { size }, buf1 };
++  parameter_t src = { { 0 }, buf2 };
++
++  STRNCPY ((CHAR *) buf1, (const CHAR *) buf2, size);
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      src.fn = impl->fn;
++      int res = do_strncmp (dest, src);
++      if (res)
++	{
++	  error (0, 0, "Wrong result in function %s: %i != 0",
++		 impl->name, res);
++	  ret = 1;
++	}
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
+new file mode 100644
+index 00000000..4829647c
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
+@@ -0,0 +1,20 @@
++/* Test wcsncmp with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define WIDE 1
++#include "tst-size_t-strncmp.c"
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw24097-7.patch b/SOURCES/glibc-sw24097-7.patch
new file mode 100644
index 0000000..2cff09e
--- /dev/null
+++ b/SOURCES/glibc-sw24097-7.patch
@@ -0,0 +1,148 @@
+From 4f97e8708512462e87d2f1f1bdeb8cf55ce10e5a Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 1 Feb 2019 12:23:23 -0800
+Subject: [PATCH] x86-64 strncpy: Properly handle the length parameter [BZ
+ #24097]
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes strncpy for x32.  Tested on x86-64 and x32.  On x86-64,
+libc.so is the same with and without the fix.
+
+	[BZ #24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Use RDX_LP
+	for length.
+	* sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncpy.
+	* sysdeps/x86_64/x32/tst-size_t-strncpy.c: New file.
+
+(cherry picked from commit c7c54f65b080affb87a1513dee449c8ad6143c8b)
+---
+ .../x86_64/multiarch/strcpy-sse2-unaligned.S  |  4 +-
+ sysdeps/x86_64/multiarch/strcpy-ssse3.S       |  6 +-
+ sysdeps/x86_64/x32/Makefile                   |  2 +-
+ sysdeps/x86_64/x32/tst-size_t-strncpy.c       | 58 +++++++++++++++++++
+ 4 files changed, 64 insertions(+), 6 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncpy.c
+
+diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+index 72bf7e85..50aca22d 100644
+--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
++++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+@@ -40,8 +40,8 @@
+ .text
+ ENTRY (STRCPY)
+ #  ifdef USE_AS_STRNCPY
+-	mov	%rdx, %r8
+-	test	%r8, %r8
++	mov	%RDX_LP, %R8_LP
++	test	%R8_LP, %R8_LP
+ 	jz	L(ExitZero)
+ #  endif
+ 	mov	%rsi, %rcx
+diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+index 9858d0c4..0a62814a 100644
+--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
++++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+@@ -31,13 +31,13 @@ ENTRY (STRCPY)
+ 
+ 	mov	%rsi, %rcx
+ #  ifdef USE_AS_STRNCPY
+-	mov	%rdx, %r8
++	mov	%RDX_LP, %R8_LP
+ #  endif
+ 	mov	%rdi, %rdx
+ #  ifdef USE_AS_STRNCPY
+-	test	%r8, %r8
++	test	%R8_LP, %R8_LP
+ 	jz	L(Exit0)
+-	cmp	$8, %r8
++	cmp	$8, %R8_LP
+ 	jbe	L(StrncpyExit8Bytes)
+ # endif
+ 	cmpb	$0, (%rcx)
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index db302839..2a9e20a9 100644
+--- a/sysdeps/x86_64/x32/Makefile
++++ b/sysdeps/x86_64/x32/Makefile
+@@ -8,7 +8,7 @@ endif
+ ifeq ($(subdir),string)
+ tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+ 	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
+-	 tst-size_t-strncmp
++	 tst-size_t-strncmp tst-size_t-strncpy
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+diff --git a/sysdeps/x86_64/x32/tst-size_t-strncpy.c b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
+new file mode 100644
+index 00000000..4dec71e6
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
+@@ -0,0 +1,58 @@
++/* Test strncpy with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define TEST_NAME "strncpy"
++#include "test-size_t.h"
++
++IMPL (strncpy, 1)
++
++typedef char *(*proto_t) (char *, const char*, size_t);
++
++static void *
++__attribute__ ((noinline, noclone))
++do_strncpy (parameter_t a, parameter_t b)
++{
++  return CALL (&b, a.p, b.p, a.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  parameter_t dest = { { page_size }, buf1 };
++  parameter_t src = { { 0 }, buf2 };
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      src.fn = impl->fn;
++      do_strncpy (dest, src);
++      int res = strncmp (dest.p, src.p, dest.len);
++      if (res)
++	{
++	  error (0, 0, "Wrong result in function %s: %i != 0",
++		 impl->name, res);
++	  ret = 1;
++	}
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw24097-8.patch b/SOURCES/glibc-sw24097-8.patch
new file mode 100644
index 0000000..49f7327
--- /dev/null
+++ b/SOURCES/glibc-sw24097-8.patch
@@ -0,0 +1,215 @@
+From 3e2556a4407e551a9c4086e67f46ef8d235cfe0d Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 1 Feb 2019 12:24:08 -0800
+Subject: [PATCH] x86-64 strnlen/wcsnlen: Properly handle the length parameter
+ [BZ #24097]
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes strnlen/wcsnlen for x32.  Tested on x86-64 and x32.  On
+x86-64, libc.so is the same with and without the fix.
+
+	[BZ #24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/strlen-avx2.S: Use RSI_LP for length.
+	Clear the upper 32 bits of RSI register.
+	* sysdeps/x86_64/strlen.S: Use RSI_LP for length.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strnlen
+	and tst-size_t-wcsnlen.
+	* sysdeps/x86_64/x32/tst-size_t-strnlen.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-wcsnlen.c: Likewise.
+
+(cherry picked from commit 5165de69c0908e28a380cbd4bb054e55ea4abc95)
+---
+ sysdeps/x86_64/multiarch/strlen-avx2.S  |  9 ++--
+ sysdeps/x86_64/strlen.S                 | 12 ++---
+ sysdeps/x86_64/x32/Makefile             |  4 +-
+ sysdeps/x86_64/x32/tst-size_t-strnlen.c | 72 +++++++++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wcsnlen.c | 20 +++++++
+ 5 files changed, 106 insertions(+), 11 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-strnlen.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
+index fb2418cd..645e0446 100644
+--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
+@@ -42,12 +42,15 @@
+ ENTRY (STRLEN)
+ # ifdef USE_AS_STRNLEN
+ 	/* Check for zero length.  */
+-	testq	%rsi, %rsi
++	test	%RSI_LP, %RSI_LP
+ 	jz	L(zero)
+ #  ifdef USE_AS_WCSLEN
+-	shl	$2, %rsi
++	shl	$2, %RSI_LP
++#  elif defined __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%esi, %esi
+ #  endif
+-	movq	%rsi, %r8
++	mov	%RSI_LP, %R8_LP
+ # endif
+ 	movl	%edi, %ecx
+ 	movq	%rdi, %rdx
+diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
+index 01cb5fa8..f845f3d4 100644
+--- a/sysdeps/x86_64/strlen.S
++++ b/sysdeps/x86_64/strlen.S
+@@ -59,21 +59,21 @@ ENTRY(strlen)
+ 
+ #ifdef AS_STRNLEN
+ /* Do not read anything when n==0.  */
+-	test	%rsi, %rsi
++	test	%RSI_LP, %RSI_LP
+ 	jne	L(n_nonzero)
+ 	xor	%rax, %rax
+ 	ret
+ L(n_nonzero):
+ # ifdef AS_WCSLEN
+-	shlq	$2, %rsi
++	shl	$2, %RSI_LP
+ # endif
+ 
+ /* Initialize long lived registers.  */
+ 
+-	add	%rdi, %rsi
+-	mov	%rsi, %r10
+-	and	$-64, %r10
+-	mov	%rsi, %r11
++	add	%RDI_LP, %RSI_LP
++	mov	%RSI_LP, %R10_LP
++	and	$-64, %R10_LP
++	mov	%RSI_LP, %R11_LP
+ #endif
+ 
+ 	pxor	%xmm0, %xmm0
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index 2a9e20a9..1557724b 100644
+--- a/sysdeps/x86_64/x32/Makefile
++++ b/sysdeps/x86_64/x32/Makefile
+@@ -8,10 +8,10 @@ endif
+ ifeq ($(subdir),string)
+ tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+ 	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
+-	 tst-size_t-strncmp tst-size_t-strncpy
++	 tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+ tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
+-	 tst-size_t-wcsncmp
++	 tst-size_t-wcsncmp tst-size_t-wcsnlen
+ endif
+diff --git a/sysdeps/x86_64/x32/tst-size_t-strnlen.c b/sysdeps/x86_64/x32/tst-size_t-strnlen.c
+new file mode 100644
+index 00000000..690a4a8a
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-strnlen.c
+@@ -0,0 +1,72 @@
++/* Test strnlen with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#ifdef WIDE
++# define TEST_NAME "wcsnlen"
++#else
++# define TEST_NAME "strnlen"
++#endif /* WIDE */
++
++#include "test-size_t.h"
++
++#ifdef WIDE
++# include <wchar.h>
++# define STRNLEN wcsnlen
++# define CHAR wchar_t
++#else
++# define STRNLEN strnlen
++# define CHAR char
++#endif /* WIDE */
++
++IMPL (STRNLEN, 1)
++
++typedef size_t (*proto_t) (const CHAR *, size_t);
++
++static size_t
++__attribute__ ((noinline, noclone))
++do_strnlen (parameter_t a, parameter_t b)
++{
++  return CALL (&a, a.p, b.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  size_t size = page_size / sizeof (CHAR);
++  parameter_t src = { { 0 }, buf2 };
++  parameter_t c = { { size }, (void *) (uintptr_t) 'a' };
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      src.fn = impl->fn;
++      size_t res = do_strnlen (src, c);
++      if (res != size)
++	{
++	  error (0, 0, "Wrong result in function %s: 0x%x != 0x%x",
++		 impl->name, res, size);
++	  ret = 1;
++	}
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
+new file mode 100644
+index 00000000..093b4bbe
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
+@@ -0,0 +1,20 @@
++/* Test wcsnlen with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define WIDE 1
++#include "tst-size_t-strnlen.c"
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw24155.patch b/SOURCES/glibc-sw24155.patch
new file mode 100644
index 0000000..49005d6
--- /dev/null
+++ b/SOURCES/glibc-sw24155.patch
@@ -0,0 +1,201 @@
+From 3cef261bb97c00473cb41ae4d7401a46e3d96ce7 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 4 Feb 2019 08:55:52 -0800
+Subject: [PATCH] x86-64 memcmp: Use unsigned Jcc instructions on size [BZ
+ #24155]
+
+Since the size argument is unsigned, we should use unsigned Jcc
+instructions, instead of signed, to check size.
+
+Tested on x86-64 and x32, with and without --disable-multi-arch.
+
+	[BZ #24155]
+	CVE-2019-7309
+	* NEWS: Updated for CVE-2019-7309.
+	* sysdeps/x86_64/memcmp.S: Use RDX_LP for size.  Clear the
+	upper 32 bits of RDX register for x32.  Use unsigned Jcc
+	instructions, instead of signed.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp-2.
+	* sysdeps/x86_64/x32/tst-size_t-memcmp-2.c: New test.
+
+(cherry picked from commit 3f635fb43389b54f682fc9ed2acc0b2aaf4a923d)
+---
+ sysdeps/x86_64/memcmp.S                  | 20 +++---
+ sysdeps/x86_64/x32/Makefile              |  3 +-
+ sysdeps/x86_64/x32/tst-size_t-memcmp-2.c | 79 ++++++++++++++++++++++++
+ 3 files changed, 93 insertions(+), 9 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
+
+diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
+index bcb4a2e8..45918d37 100644
+--- a/sysdeps/x86_64/memcmp.S
++++ b/sysdeps/x86_64/memcmp.S
+@@ -21,14 +21,18 @@
+ 
+ 	.text
+ ENTRY (memcmp)
+-	test	%rdx, %rdx
++#ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %edx
++#endif
++	test	%RDX_LP, %RDX_LP
+ 	jz	L(finz)
+ 	cmpq	$1, %rdx
+-	jle	L(finr1b)
++	jbe	L(finr1b)
+ 	subq	%rdi, %rsi
+ 	movq	%rdx, %r10
+ 	cmpq	$32, %r10
+-	jge	L(gt32)
++	jae	L(gt32)
+ 	/* Handle small chunks and last block of less than 32 bytes.  */
+ L(small):
+ 	testq	$1, %r10
+@@ -156,7 +160,7 @@ L(A32):
+ 	movq	%r11, %r10
+ 	andq	$-32, %r10
+ 	cmpq	%r10, %rdi
+-        jge	L(mt16)
++        jae	L(mt16)
+ 	/* Pre-unroll to be ready for unrolled 64B loop.  */
+ 	testq	$32, %rdi
+ 	jz	L(A64)
+@@ -178,7 +182,7 @@ L(A64):
+ 	movq	%r11, %r10
+ 	andq	$-64, %r10
+ 	cmpq	%r10, %rdi
+-        jge	L(mt32)
++        jae	L(mt32)
+ 
+ L(A64main):
+ 	movdqu    (%rdi,%rsi), %xmm0
+@@ -216,7 +220,7 @@ L(mt32):
+ 	movq	%r11, %r10
+ 	andq	$-32, %r10
+ 	cmpq	%r10, %rdi
+-        jge	L(mt16)
++        jae	L(mt16)
+ 
+ L(A32main):
+ 	movdqu    (%rdi,%rsi), %xmm0
+@@ -254,7 +258,7 @@ L(ATR):
+ 	movq	%r11, %r10
+ 	andq	$-32, %r10
+ 	cmpq	%r10, %rdi
+-        jge	L(mt16)
++        jae	L(mt16)
+ 	testq	$16, %rdi
+ 	jz	L(ATR32)
+ 
+@@ -325,7 +329,7 @@ L(ATR64main):
+ 	movq	%r11, %r10
+ 	andq	$-32, %r10
+ 	cmpq	%r10, %rdi
+-        jge	L(mt16)
++        jae	L(mt16)
+ 
+ L(ATR32res):
+ 	movdqa    (%rdi,%rsi), %xmm0
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index 1557724b..87489565 100644
+--- a/sysdeps/x86_64/x32/Makefile
++++ b/sysdeps/x86_64/x32/Makefile
+@@ -8,7 +8,8 @@ endif
+ ifeq ($(subdir),string)
+ tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+ 	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
+-	 tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen
++	 tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen \
++	 tst-size_t-memcmp-2
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
+new file mode 100644
+index 00000000..d8ae1a08
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
+@@ -0,0 +1,79 @@
++/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define TEST_MAIN
++#ifdef WIDE
++# define TEST_NAME "wmemcmp"
++#else
++# define TEST_NAME "memcmp"
++#endif
++
++#include "test-size_t.h"
++
++#ifdef WIDE
++# include <inttypes.h>
++# include <wchar.h>
++
++# define MEMCMP wmemcmp
++# define CHAR wchar_t
++#else
++# define MEMCMP memcmp
++# define CHAR char
++#endif
++
++IMPL (MEMCMP, 1)
++
++typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
++
++static int
++__attribute__ ((noinline, noclone))
++do_memcmp (parameter_t a, parameter_t b)
++{
++  return CALL (&b, a.p, b.p, a.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
++  parameter_t src = { { 0 }, buf2 };
++
++  memcpy (buf1, buf2, page_size);
++
++  CHAR *p = (CHAR *) buf1;
++  p[page_size / sizeof (CHAR) - 1] = (CHAR) 1;
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      src.fn = impl->fn;
++      int res = do_memcmp (dest, src);
++      if (res >= 0)
++	{
++	  error (0, 0, "Wrong result in function %s: %i >= 0",
++		 impl->name, res);
++	  ret = 1;
++	}
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+-- 
+GitLab
+
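The jle-to-jbe and jge-to-jae changes in the patch above come down to treating the size as unsigned. The small C example below is not glibc code; it just shows why a signed comparison misclassifies sizes above SSIZE_MAX, which is the class of bug tracked as BZ #24155.

/* Illustrative only: a signed "size <= 1" test wrongly succeeds for a
   huge unsigned size, while the unsigned test behaves as intended.  */
#include <stdio.h>
#include <stdint.h>

int
main (void)
{
  uint64_t size = 0x8000000000000000ULL;	/* larger than SSIZE_MAX */

  /* What the old code effectively asked (signed compare, jle/jge).
     Converting to int64_t wraps to a negative value on x86-64.  */
  int signed_small = ((int64_t) size <= 1);
  /* What the fixed code asks (unsigned compare, jbe/jae).  */
  int unsigned_small = (size <= 1);

  printf ("signed compare says size <= 1: %d (wrong)\n", signed_small);
  printf ("unsigned compare says size <= 1: %d (right)\n", unsigned_small);
  return 0;
}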
diff --git a/SOURCES/glibc-sw25966.patch b/SOURCES/glibc-sw25966.patch
new file mode 100644
index 0000000..11c9df8
--- /dev/null
+++ b/SOURCES/glibc-sw25966.patch
@@ -0,0 +1,52 @@
+From c640bf1ea99d9598f8e3afc922567d8b3623d23e Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 2 Mar 2022 14:57:54 -0800
+Subject: [PATCH] x86-64: Use RDX_LP on __x86_shared_non_temporal_threshold [BZ
+ #25966]
+
+Since __x86_shared_non_temporal_threshold is defined as
+
+long int __x86_shared_non_temporal_threshold;
+
+and long int is 4 bytes for x32, use RDX_LP to compare against
+__x86_shared_non_temporal_threshold in assembly code.
+
+(cherry picked from commit 55c7bcc71b84123d5d4bd2814366a6b05fcf8ebd)
+---
+ sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 71f5954d..673b73aa 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -245,7 +245,7 @@ L(return):
+ #endif
+ 
+ L(movsb):
+-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
++	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ 	jae	L(more_8x_vec)
+ 	cmpq	%rsi, %rdi
+ 	jb	1f
+@@ -397,7 +397,7 @@ L(more_8x_vec):
+ 	addq	%r8, %rdx
+ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+ 	/* Check non-temporal store threshold.  */
+-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
++	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ 	ja	L(large_forward)
+ #endif
+ L(loop_4x_vec_forward):
+@@ -448,7 +448,7 @@ L(more_8x_vec_backward):
+ 	subq	%r8, %rdx
+ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+ 	/* Check non-temporal store threshold.  */
+-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
++	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ 	ja	L(large_backward)
+ #endif
+ L(loop_4x_vec_backward):
+-- 
+GitLab
+
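The fix above is needed because RDX_LP names the 32-bit register on x32, matching the 4-byte long variable, while a 64-bit cmpq would read 8 bytes from a 4-byte object. A quick illustrative check of the type sizes involved, not part of the patch set:

/* Prints 8 and 8 on x86-64 (-m64), but 4 and 4 when built with -mx32,
   which is why the threshold comparison must use a 32-bit operand
   there.  Illustrative only.  */
#include <stdio.h>

int
main (void)
{
  printf ("sizeof (long) = %zu, sizeof (void *) = %zu\n",
          sizeof (long), sizeof (void *));
  return 0;
}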
diff --git a/SOURCES/glibc-sw27130.patch b/SOURCES/glibc-sw27130.patch
new file mode 100644
index 0000000..9065525
--- /dev/null
+++ b/SOURCES/glibc-sw27130.patch
@@ -0,0 +1,68 @@
+From b4cf48b235d64d0eab5739e6d496717c0023a32e Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 2 Mar 2022 15:17:29 -0800
+Subject: [PATCH] x86-64: Avoid rep movsb with short distance [BZ #27130]
+
+When copying with "rep movsb", if the distance between source and
+destination is N*4GB + [1..63] with N >= 0, performance may be very
+slow.  This patch updates memmove-vec-unaligned-erms.S for AVX and
+AVX512 versions with the distance in RCX:
+
+	cmpl	$63, %ecx
+	// Don't use "rep movsb" if ECX <= 63
+	jbe	L(Don't use rep movsb")
+	Use "rep movsb"
+
+Benchtest data from bench-memcpy, bench-memcpy-large, bench-memcpy-random
+and bench-memcpy-walk on Skylake, Ice Lake and Tiger Lake show that the
+performance impact is within the noise range, as "rep movsb" is only used
+for data sizes >= 4KB.
+
+(cherry picked from commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5)
+---
+ .../multiarch/memmove-vec-unaligned-erms.S    | 21 +++++++++++++++++++
+ 1 file changed, 21 insertions(+)
+
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 673b73aa..c475fed4 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -64,6 +64,13 @@
+ # endif
+ #endif
+ 
++/* Avoid short distance rep movsb only with non-SSE vector.  */
++#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
++# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
++#else
++# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
++#endif
++
+ #ifndef PREFETCH
+ # define PREFETCH(addr) prefetcht0 addr
+ #endif
+@@ -255,7 +262,21 @@ L(movsb):
+ 	cmpq	%r9, %rdi
+ 	/* Avoid slow backward REP MOVSB.  */
+ 	jb	L(more_8x_vec_backward)
++# if AVOID_SHORT_DISTANCE_REP_MOVSB
++	movq	%rdi, %rcx
++	subq	%rsi, %rcx
++	jmp	2f
++# endif
+ 1:
++# if AVOID_SHORT_DISTANCE_REP_MOVSB
++	movq	%rsi, %rcx
++	subq	%rdi, %rcx
++2:
++/* Avoid "rep movsb" if RCX, the distance between source and destination,
++   is N*4GB + [1..63] with N >= 0.  */
++	cmpl	$63, %ecx
++	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
++# endif
+ 	mov	%RDX_LP, %RCX_LP
+ 	rep movsb
+ L(nop):
+-- 
+GitLab
+
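In C terms, the guard added above roughly corresponds to checking the low 32 bits of the distance between destination and source. The sketch below is a hedged paraphrase, not the actual implementation (which stays in assembly and computes the distance in the direction of the copy):

/* Illustrative paraphrase of the BZ #27130 guard: skip "rep movsb"
   when the low 32 bits of the distance are 63 or less, which covers
   the slow "N*4GB + [1..63]" cases from the commit message.  */
#include <stdint.h>
#include <stdio.h>

static int
rep_movsb_ok (const char *dst, const char *src)
{
  uint32_t low32 = (uint32_t) ((uintptr_t) dst - (uintptr_t) src);
  return low32 > 63;   /* mirrors "cmpl $63, %ecx; jbe ..." */
}

int
main (void)
{
  static char buf[4096];
  printf ("%d\n", rep_movsb_ok (buf + 32, buf));   /* 0: use vector loop */
  printf ("%d\n", rep_movsb_ok (buf + 2048, buf)); /* 1: rep movsb is fine */
  return 0;
}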
diff --git a/SOURCES/glibc-sw27974-1.patch b/SOURCES/glibc-sw27974-1.patch
new file mode 100644
index 0000000..d584422
--- /dev/null
+++ b/SOURCES/glibc-sw27974-1.patch
@@ -0,0 +1,388 @@
+From d8d5c44ed7636fdd2b736e152f8207ca063da386 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 9 Jun 2021 16:25:32 -0400
+Subject: [PATCH] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ
+ #27974]
+
+This commit fixes the bug mentioned in the previous commit.
+
+The previous implementations of wmemchr in these files relied on
+n * sizeof(wchar_t) not overflowing, which the standard does not guarantee.
+
+The new overflow tests added in the previous commit now
+pass (As well as all the other tests).
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit 645a158978f9520e74074e8c14047503be4db0f0)
+---
+ sysdeps/x86_64/memchr.S                | 77 +++++++++++++++++++-------
+ sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
+ 2 files changed, 98 insertions(+), 37 deletions(-)
+
+diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
+index cb320257..24f9a0c5 100644
+--- a/sysdeps/x86_64/memchr.S
++++ b/sysdeps/x86_64/memchr.S
+@@ -21,9 +21,11 @@
+ #ifdef USE_AS_WMEMCHR
+ # define MEMCHR		wmemchr
+ # define PCMPEQ		pcmpeqd
++# define CHAR_PER_VEC	4
+ #else
+ # define MEMCHR		memchr
+ # define PCMPEQ		pcmpeqb
++# define CHAR_PER_VEC	16
+ #endif
+ 
+ /* fast SSE2 version with using pmaxub and 64 byte loop */
+@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
+ 	movd	%esi, %xmm1
+ 	mov	%edi, %ecx
+ 
++#ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %edx
++#endif
+ #ifdef USE_AS_WMEMCHR
+ 	test	%RDX_LP, %RDX_LP
+ 	jz	L(return_null)
+-	shl	$2, %RDX_LP
+ #else
+-# ifdef __ILP32__
+-	/* Clear the upper 32 bits.  */
+-	movl	%edx, %edx
+-# endif
+ 	punpcklbw %xmm1, %xmm1
+ 	test	%RDX_LP, %RDX_LP
+ 	jz	L(return_null)
+@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
+ 	test	%eax, %eax
+ 
+ 	jnz	L(matches_1)
+-	sub	$16, %rdx
++	sub	$CHAR_PER_VEC, %rdx
+ 	jbe	L(return_null)
+ 	add	$16, %rdi
+ 	and	$15, %ecx
+ 	and	$-16, %rdi
++#ifdef USE_AS_WMEMCHR
++	shr	$2, %ecx
++#endif
+ 	add	%rcx, %rdx
+-	sub	$64, %rdx
++	sub	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(exit_loop)
+ 	jmp	L(loop_prolog)
+ 
+@@ -77,16 +81,21 @@ L(crosscache):
+ 	movdqa	(%rdi), %xmm0
+ 
+ 	PCMPEQ	%xmm1, %xmm0
+-/* Check if there is a match.  */
++	/* Check if there is a match.  */
+ 	pmovmskb %xmm0, %eax
+-/* Remove the leading bytes.  */
++	/* Remove the leading bytes.  */
+ 	sar	%cl, %eax
+ 	test	%eax, %eax
+ 	je	L(unaligned_no_match)
+-/* Check which byte is a match.  */
++	/* Check which byte is a match.  */
+ 	bsf	%eax, %eax
+-
++#ifdef USE_AS_WMEMCHR
++	mov	%eax, %esi
++	shr	$2, %esi
++	sub	%rsi, %rdx
++#else
+ 	sub	%rax, %rdx
++#endif
+ 	jbe	L(return_null)
+ 	add	%rdi, %rax
+ 	add	%rcx, %rax
+@@ -94,15 +103,18 @@ L(crosscache):
+ 
+ 	.p2align 4
+ L(unaligned_no_match):
+-        /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
++	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
+ 	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
+ 	   possible addition overflow.  */
+ 	neg	%rcx
+ 	add	$16, %rcx
++#ifdef USE_AS_WMEMCHR
++	shr	$2, %ecx
++#endif
+ 	sub	%rcx, %rdx
+ 	jbe	L(return_null)
+ 	add	$16, %rdi
+-	sub	$64, %rdx
++	sub	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(exit_loop)
+ 
+ 	.p2align 4
+@@ -135,7 +147,7 @@ L(loop_prolog):
+ 	test	$0x3f, %rdi
+ 	jz	L(align64_loop)
+ 
+-	sub	$64, %rdx
++	sub	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(exit_loop)
+ 
+ 	movdqa	(%rdi), %xmm0
+@@ -167,11 +179,14 @@ L(loop_prolog):
+ 	mov	%rdi, %rcx
+ 	and	$-64, %rdi
+ 	and	$63, %ecx
++#ifdef USE_AS_WMEMCHR
++	shr	$2, %ecx
++#endif
+ 	add	%rcx, %rdx
+ 
+ 	.p2align 4
+ L(align64_loop):
+-	sub	$64, %rdx
++	sub	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(exit_loop)
+ 	movdqa	(%rdi), %xmm0
+ 	movdqa	16(%rdi), %xmm2
+@@ -218,7 +233,7 @@ L(align64_loop):
+ 
+ 	.p2align 4
+ L(exit_loop):
+-	add	$32, %edx
++	add	$(CHAR_PER_VEC * 2), %edx
+ 	jle	L(exit_loop_32)
+ 
+ 	movdqa	(%rdi), %xmm0
+@@ -238,7 +253,7 @@ L(exit_loop):
+ 	pmovmskb %xmm3, %eax
+ 	test	%eax, %eax
+ 	jnz	L(matches32_1)
+-	sub	$16, %edx
++	sub	$CHAR_PER_VEC, %edx
+ 	jle	L(return_null)
+ 
+ 	PCMPEQ	48(%rdi), %xmm1
+@@ -250,13 +265,13 @@ L(exit_loop):
+ 
+ 	.p2align 4
+ L(exit_loop_32):
+-	add	$32, %edx
++	add	$(CHAR_PER_VEC * 2), %edx
+ 	movdqa	(%rdi), %xmm0
+ 	PCMPEQ	%xmm1, %xmm0
+ 	pmovmskb %xmm0, %eax
+ 	test	%eax, %eax
+ 	jnz	L(matches_1)
+-	sub	$16, %edx
++	sub	$CHAR_PER_VEC, %edx
+ 	jbe	L(return_null)
+ 
+ 	PCMPEQ	16(%rdi), %xmm1
+@@ -293,7 +308,13 @@ L(matches32):
+ 	.p2align 4
+ L(matches_1):
+ 	bsf	%eax, %eax
++#ifdef USE_AS_WMEMCHR
++	mov	%eax, %esi
++	shr	$2, %esi
++	sub	%rsi, %rdx
++#else
+ 	sub	%rax, %rdx
++#endif
+ 	jbe	L(return_null)
+ 	add	%rdi, %rax
+ 	ret
+@@ -301,7 +322,13 @@ L(matches_1):
+ 	.p2align 4
+ L(matches16_1):
+ 	bsf	%eax, %eax
++#ifdef USE_AS_WMEMCHR
++	mov	%eax, %esi
++	shr	$2, %esi
++	sub	%rsi, %rdx
++#else
+ 	sub	%rax, %rdx
++#endif
+ 	jbe	L(return_null)
+ 	lea	16(%rdi, %rax), %rax
+ 	ret
+@@ -309,7 +336,13 @@ L(matches16_1):
+ 	.p2align 4
+ L(matches32_1):
+ 	bsf	%eax, %eax
++#ifdef USE_AS_WMEMCHR
++	mov	%eax, %esi
++	shr	$2, %esi
++	sub	%rsi, %rdx
++#else
+ 	sub	%rax, %rdx
++#endif
+ 	jbe	L(return_null)
+ 	lea	32(%rdi, %rax), %rax
+ 	ret
+@@ -317,7 +350,13 @@ L(matches32_1):
+ 	.p2align 4
+ L(matches48_1):
+ 	bsf	%eax, %eax
++#ifdef USE_AS_WMEMCHR
++	mov	%eax, %esi
++	shr	$2, %esi
++	sub	%rsi, %rdx
++#else
+ 	sub	%rax, %rdx
++#endif
+ 	jbe	L(return_null)
+ 	lea	48(%rdi, %rax), %rax
+ 	ret
+diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
+index b377f22e..16027abb 100644
+--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
+@@ -54,21 +54,19 @@
+ 
+ # define VEC_SIZE 32
+ # define PAGE_SIZE 4096
++# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (MEMCHR)
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check for zero length.  */
+-	test	%RDX_LP, %RDX_LP
+-	jz	L(null)
+-# endif
+-# ifdef USE_AS_WMEMCHR
+-	shl	$2, %RDX_LP
+-# else
+ #  ifdef __ILP32__
+-	/* Clear the upper 32 bits.  */
+-	movl	%edx, %edx
++	/* Clear upper bits.  */
++	and	%RDX_LP, %RDX_LP
++#  else
++	test	%RDX_LP, %RDX_LP
+ #  endif
++	jz	L(null)
+ # endif
+ 	/* Broadcast CHAR to YMMMATCH.  */
+ 	vmovd	%esi, %xmm0
+@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
+ 	vpmovmskb %ymm1, %eax
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* If length < CHAR_PER_VEC handle special.  */
+-	cmpq	$VEC_SIZE, %rdx
++	cmpq	$CHAR_PER_VEC, %rdx
+ 	jbe	L(first_vec_x0)
+ # endif
+ 	testl	%eax, %eax
+@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
+ L(first_vec_x0):
+ 	/* Check if first match was before length.  */
+ 	tzcntl	%eax, %eax
++#  ifdef USE_AS_WMEMCHR
++	/* NB: Multiply length by 4 to get byte count.  */
++	sall	$2, %edx
++#  endif
+ 	xorl	%ecx, %ecx
+ 	cmpl	%eax, %edx
+ 	leaq	(%rdi, %rax), %rax
+@@ -110,12 +112,12 @@ L(null):
+ # endif
+ 	.p2align 4
+ L(cross_page_boundary):
+-	/* Save pointer before aligning as its original value is necessary
+-	   for computer return address if byte is found or adjusting length
+-	   if it is not and this is memchr.  */
++	/* Save pointer before aligning as its original value is
++	   necessary for computer return address if byte is found or
++	   adjusting length if it is not and this is memchr.  */
+ 	movq	%rdi, %rcx
+-	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
+-	   rdi for rawmemchr.  */
++	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
++	   and rdi for rawmemchr.  */
+ 	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
+ 	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+@@ -124,6 +126,10 @@ L(cross_page_boundary):
+ 	   match).  */
+ 	leaq	1(%ALGN_PTR_REG), %rsi
+ 	subq	%RRAW_PTR_REG, %rsi
++#  ifdef USE_AS_WMEMCHR
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
++	shrl	$2, %esi
++#  endif
+ # endif
+ 	/* Remove the leading bytes.  */
+ 	sarxl	%ERAW_PTR_REG, %eax, %eax
+@@ -181,6 +187,10 @@ L(cross_page_continue):
+ 	orq	$(VEC_SIZE - 1), %rdi
+ 	/* esi is for adjusting length to see if near the end.  */
+ 	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
++#  ifdef USE_AS_WMEMCHR
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %esi
++#  endif
+ # else
+ 	orq	$(VEC_SIZE - 1), %rdi
+ L(cross_page_continue):
+@@ -213,7 +223,7 @@ L(cross_page_continue):
+ 
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check if at last VEC_SIZE * 4 length.  */
+-	subq	$(VEC_SIZE * 4), %rdx
++	subq	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(last_4x_vec_or_less_cmpeq)
+ 	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
+ 	   length.  */
+@@ -221,6 +231,10 @@ L(cross_page_continue):
+ 	movl	%edi, %ecx
+ 	orq	$(VEC_SIZE * 4 - 1), %rdi
+ 	andl	$(VEC_SIZE * 4 - 1), %ecx
++#  ifdef USE_AS_WMEMCHR
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %ecx
++#  endif
+ 	addq	%rcx, %rdx
+ # else
+ 	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
+@@ -250,15 +264,19 @@ L(loop_4x_vec):
+ 
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ 
+-	subq	$(VEC_SIZE * 4), %rdx
++	subq	$(CHAR_PER_VEC * 4), %rdx
+ 	ja	L(loop_4x_vec)
+ 
+-	/* Fall through into less than 4 remaining vectors of length case.
+-	 */
++	/* Fall through into less than 4 remaining vectors of length
++	   case.  */
+ 	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	.p2align 4
+ L(last_4x_vec_or_less):
++#  ifdef USE_AS_WMEMCHR
++	/* NB: Multiply length by 4 to get byte count.  */
++	sall	$2, %edx
++#  endif
+ 	/* Check if first VEC contained match.  */
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1_check)
+@@ -355,6 +373,10 @@ L(last_vec_x2_return):
+ L(last_4x_vec_or_less_cmpeq):
+ 	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
++#  ifdef USE_AS_WMEMCHR
++	/* NB: Multiply length by 4 to get byte count.  */
++	sall	$2, %edx
++#  endif
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ 	/* Check first VEC regardless.  */
+ 	testl	%eax, %eax
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw27974-2.patch b/SOURCES/glibc-sw27974-2.patch
new file mode 100644
index 0000000..96edc19
--- /dev/null
+++ b/SOURCES/glibc-sw27974-2.patch
@@ -0,0 +1,497 @@
+From 78ff769ceac455cb6749f64effe77d178216f0b0 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Jun 2021 01:56:29 -0400
+Subject: [PATCH] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ
+ #27974]
+
+This commit fixes the bug mentioned in the previous commit.
+
+The previous implementations of wcsnlen in these files relied on
+maxlen * sizeof(wchar_t), which the standard does not guarantee to be
+free of overflow.
+
+The new overflow tests added in the previous commit now pass, as do
+all the other tests.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit a775a7a3eb1e85b54af0b4ee5ff4dcf66772a1fb)
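+
+In C terms the added guard is roughly the following sketch (the public
+wcslen/wcsnlen stand in for the internal SSE4.1/AVX2 entry points; the
+real check is the sar $62 sequence added to strlen-vec.S below):
+
+    #define _GNU_SOURCE
+    #include <wchar.h>
+
+    static size_t
+    wcsnlen_sketch (const wchar_t *s, size_t maxlen)
+    {
+      /* If maxlen * sizeof (wchar_t) would overflow 64 bits, the call
+         is only well defined when a null terminator exists in valid
+         memory, so the unbounded scan returns the same result.  */
+      if (maxlen >> 62 != 0)
+        return wcslen (s);
+      return wcsnlen (s, maxlen);
+    }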
+---
+ sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
+ sysdeps/x86_64/multiarch/strlen-vec.S  |  15 ++-
+ 2 files changed, 107 insertions(+), 38 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
+index be8a5db5..37688966 100644
+--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
+@@ -44,21 +44,21 @@
+ 
+ # define VEC_SIZE 32
+ # define PAGE_SIZE 4096
++# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRLEN)
+ # ifdef USE_AS_STRNLEN
+ 	/* Check zero length.  */
++#  ifdef __ILP32__
++	/* Clear upper bits.  */
++	and	%RSI_LP, %RSI_LP
++#  else
+ 	test	%RSI_LP, %RSI_LP
++#  endif
+ 	jz	L(zero)
+ 	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
+ 	mov	%RSI_LP, %R8_LP
+-#  ifdef USE_AS_WCSLEN
+-	shl	$2, %RSI_LP
+-#  elif defined __ILP32__
+-	/* Clear the upper 32 bits.  */
+-	movl	%esi, %esi
+-#  endif
+ # endif
+ 	movl	%edi, %eax
+ 	movq	%rdi, %rdx
+@@ -72,10 +72,10 @@ ENTRY (STRLEN)
+ 
+ 	/* Check the first VEC_SIZE bytes.  */
+ 	VPCMPEQ	(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ # ifdef USE_AS_STRNLEN
+ 	/* If length < VEC_SIZE handle special.  */
+-	cmpq	$VEC_SIZE, %rsi
++	cmpq	$CHAR_PER_VEC, %rsi
+ 	jbe	L(first_vec_x0)
+ # endif
+ 	/* If empty continue to aligned_more. Otherwise return bit
+@@ -84,6 +84,7 @@ ENTRY (STRLEN)
+ 	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
+ # ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -97,9 +98,14 @@ L(zero):
+ L(first_vec_x0):
+ 	/* Set bit for max len so that tzcnt will return min of max len
+ 	   and position of first match.  */
++#  ifdef USE_AS_WCSLEN
++	/* NB: Multiply length by 4 to get byte count.  */
++	sall	$2, %esi
++#  endif
+ 	btsq	%rsi, %rax
+ 	tzcntl	%eax, %eax
+ #  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -113,14 +119,19 @@ L(first_vec_x1):
+ # ifdef USE_AS_STRNLEN
+ 	/* Use ecx which was computed earlier to compute correct value.
+ 	 */
++#  ifdef USE_AS_WCSLEN
++	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
++#  else
+ 	subl	$(VEC_SIZE * 4 + 1), %ecx
+ 	addl	%ecx, %eax
++#  endif
+ # else
+ 	subl	%edx, %edi
+ 	incl	%edi
+ 	addl	%edi, %eax
+ # endif
+ # ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -133,14 +144,19 @@ L(first_vec_x2):
+ # ifdef USE_AS_STRNLEN
+ 	/* Use ecx which was computed earlier to compute correct value.
+ 	 */
++#  ifdef USE_AS_WCSLEN
++	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
++#  else
+ 	subl	$(VEC_SIZE * 3 + 1), %ecx
+ 	addl	%ecx, %eax
++#  endif
+ # else
+ 	subl	%edx, %edi
+ 	addl	$(VEC_SIZE + 1), %edi
+ 	addl	%edi, %eax
+ # endif
+ # ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -153,14 +169,19 @@ L(first_vec_x3):
+ # ifdef USE_AS_STRNLEN
+ 	/* Use ecx which was computed earlier to compute correct value.
+ 	 */
++#  ifdef USE_AS_WCSLEN
++	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
++#  else
+ 	subl	$(VEC_SIZE * 2 + 1), %ecx
+ 	addl	%ecx, %eax
++#  endif
+ # else
+ 	subl	%edx, %edi
+ 	addl	$(VEC_SIZE * 2 + 1), %edi
+ 	addl	%edi, %eax
+ # endif
+ # ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -173,14 +194,19 @@ L(first_vec_x4):
+ # ifdef USE_AS_STRNLEN
+ 	/* Use ecx which was computed earlier to compute correct value.
+ 	 */
++#  ifdef USE_AS_WCSLEN
++	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
++#  else
+ 	subl	$(VEC_SIZE + 1), %ecx
+ 	addl	%ecx, %eax
++#  endif
+ # else
+ 	subl	%edx, %edi
+ 	addl	$(VEC_SIZE * 3 + 1), %edi
+ 	addl	%edi, %eax
+ # endif
+ # ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -195,10 +221,14 @@ L(cross_page_continue):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+ # ifdef USE_AS_STRNLEN
+-	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
+-	   it simplies the logic in last_4x_vec_or_less.  */
++	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
++	   because it simplies the logic in last_4x_vec_or_less.  */
+ 	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
+ 	subq	%rdx, %rcx
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %ecx
++#  endif
+ # endif
+ 	/* Load first VEC regardless.  */
+ 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+@@ -207,34 +237,38 @@ L(cross_page_continue):
+ 	subq	%rcx, %rsi
+ 	jb	L(last_4x_vec_or_less)
+ # endif
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+ 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+ 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+ 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x4)
+ 
+ 	/* Align data to VEC_SIZE * 4 - 1.  */
+ # ifdef USE_AS_STRNLEN
+ 	/* Before adjusting length check if at last VEC_SIZE * 4.  */
+-	cmpq	$(VEC_SIZE * 4 - 1), %rsi
++	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
+ 	jbe	L(last_4x_vec_or_less_load)
+ 	incq	%rdi
+ 	movl	%edi, %ecx
+ 	orq	$(VEC_SIZE * 4 - 1), %rdi
+ 	andl	$(VEC_SIZE * 4 - 1), %ecx
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %ecx
++#  endif
+ 	/* Readjust length.  */
+ 	addq	%rcx, %rsi
+ # else
+@@ -246,13 +280,13 @@ L(cross_page_continue):
+ L(loop_4x_vec):
+ # ifdef USE_AS_STRNLEN
+ 	/* Break if at end of length.  */
+-	subq	$(VEC_SIZE * 4), %rsi
++	subq	$(CHAR_PER_VEC * 4), %rsi
+ 	jb	L(last_4x_vec_or_less_cmpeq)
+ # endif
+-	/* Save some code size by microfusing VPMINU with the load. Since
+-	   the matches in ymm2/ymm4 can only be returned if there where no
+-	   matches in ymm1/ymm3 respectively there is no issue with overlap.
+-	 */
++	/* Save some code size by microfusing VPMINU with the load.
++	   Since the matches in ymm2/ymm4 can only be returned if there
++	   where no matches in ymm1/ymm3 respectively there is no issue
++	   with overlap.  */
+ 	vmovdqa	1(%rdi), %ymm1
+ 	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
+ 	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
+@@ -260,7 +294,7 @@ L(loop_4x_vec):
+ 
+ 	VPMINU	%ymm2, %ymm4, %ymm5
+ 	VPCMPEQ	%ymm5, %ymm0, %ymm5
+-	vpmovmskb	%ymm5, %ecx
++	vpmovmskb %ymm5, %ecx
+ 
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ 	testl	%ecx, %ecx
+@@ -268,27 +302,28 @@ L(loop_4x_vec):
+ 
+ 
+ 	VPCMPEQ	%ymm1, %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	subq	%rdx, %rdi
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_return_x0)
+ 
+ 	VPCMPEQ	%ymm2, %ymm0, %ymm2
+-	vpmovmskb	%ymm2, %eax
++	vpmovmskb %ymm2, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_return_x1)
+ 
+ 	/* Combine last 2 VEC.  */
+ 	VPCMPEQ	%ymm3, %ymm0, %ymm3
+-	vpmovmskb	%ymm3, %eax
+-	/* rcx has combined result from all 4 VEC. It will only be used if
+-	   the first 3 other VEC all did not contain a match.  */
++	vpmovmskb %ymm3, %eax
++	/* rcx has combined result from all 4 VEC. It will only be used
++	   if the first 3 other VEC all did not contain a match.  */
+ 	salq	$32, %rcx
+ 	orq	%rcx, %rax
+ 	tzcntq	%rax, %rax
+ 	subq	$(VEC_SIZE * 2 - 1), %rdi
+ 	addq	%rdi, %rax
+ # ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -297,15 +332,19 @@ L(loop_4x_vec):
+ # ifdef USE_AS_STRNLEN
+ 	.p2align 4
+ L(last_4x_vec_or_less_load):
+-	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
++	/* Depending on entry adjust rdi / prepare first VEC in ymm1.
++	 */
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ L(last_4x_vec_or_less_cmpeq):
+ 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+ L(last_4x_vec_or_less):
+-
+-	vpmovmskb	%ymm1, %eax
+-	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
+-	   VEC_SIZE * 4.  */
++#  ifdef USE_AS_WCSLEN
++	/* NB: Multiply length by 4 to get byte count.  */
++	sall	$2, %esi
++#  endif
++	vpmovmskb %ymm1, %eax
++	/* If remaining length > VEC_SIZE * 2. This works if esi is off
++	   by VEC_SIZE * 4.  */
+ 	testl	$(VEC_SIZE * 2), %esi
+ 	jnz	L(last_4x_vec)
+ 
+@@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
+ 	jb	L(max)
+ 
+ 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	tzcntl	%eax, %eax
+ 	/* Check the end of data.  */
+ 	cmpl	%eax, %esi
+@@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
+ 	addl	$(VEC_SIZE + 1), %eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -340,6 +380,7 @@ L(last_vec_return_x0):
+ 	subq	$(VEC_SIZE * 4 - 1), %rdi
+ 	addq	%rdi, %rax
+ # ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -350,6 +391,7 @@ L(last_vec_return_x1):
+ 	subq	$(VEC_SIZE * 3 - 1), %rdi
+ 	addq	%rdi, %rax
+ # ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -366,6 +408,7 @@ L(last_vec_x1_check):
+ 	incl	%eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -381,14 +424,14 @@ L(last_4x_vec):
+ 	jnz	L(last_vec_x1)
+ 
+ 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x2)
+ 
+ 	/* Normalize length.  */
+ 	andl	$(VEC_SIZE * 4 - 1), %esi
+ 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x3)
+ 
+@@ -396,7 +439,7 @@ L(last_4x_vec):
+ 	jb	L(max)
+ 
+ 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	tzcntl	%eax, %eax
+ 	/* Check the end of data.  */
+ 	cmpl	%eax, %esi
+@@ -405,6 +448,7 @@ L(last_4x_vec):
+ 	addl	$(VEC_SIZE * 3 + 1), %eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -419,6 +463,7 @@ L(last_vec_x1):
+ 	incl	%eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -432,6 +477,7 @@ L(last_vec_x2):
+ 	addl	$(VEC_SIZE + 1), %eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -447,6 +493,7 @@ L(last_vec_x3):
+ 	addl	$(VEC_SIZE * 2 + 1), %eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -455,13 +502,13 @@ L(max_end):
+ 	VZEROUPPER_RETURN
+ # endif
+ 
+-	/* Cold case for crossing page with first load.	 */
++	/* Cold case for crossing page with first load.  */
+ 	.p2align 4
+ L(cross_page_boundary):
+ 	/* Align data to VEC_SIZE - 1.  */
+ 	orq	$(VEC_SIZE - 1), %rdi
+ 	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+ 	   so no need to manually mod rdx.  */
+ 	sarxl	%edx, %eax, %eax
+@@ -470,6 +517,10 @@ L(cross_page_boundary):
+ 	jnz	L(cross_page_less_vec)
+ 	leaq	1(%rdi), %rcx
+ 	subq	%rdx, %rcx
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
++	shrl	$2, %ecx
++#  endif
+ 	/* Check length.  */
+ 	cmpq	%rsi, %rcx
+ 	jb	L(cross_page_continue)
+@@ -479,6 +530,7 @@ L(cross_page_boundary):
+ 	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+ #  ifdef USE_AS_WCSLEN
++	/* NB: Divide length by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ #  endif
+ # endif
+@@ -489,6 +541,10 @@ L(return_vzeroupper):
+ 	.p2align 4
+ L(cross_page_less_vec):
+ 	tzcntl	%eax, %eax
++#  ifdef USE_AS_WCSLEN
++	/* NB: Multiply length by 4 to get byte count.  */
++	sall	$2, %esi
++#  endif
+ 	cmpq	%rax, %rsi
+ 	cmovb	%esi, %eax
+ #  ifdef USE_AS_WCSLEN
+diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
+index 8f660bb9..439e486a 100644
+--- a/sysdeps/x86_64/multiarch/strlen-vec.S
++++ b/sysdeps/x86_64/multiarch/strlen-vec.S
+@@ -65,12 +65,25 @@ ENTRY(strlen)
+ 	ret
+ L(n_nonzero):
+ # ifdef AS_WCSLEN
+-	shl	$2, %RSI_LP
++/* Check for overflow from maxlen * sizeof(wchar_t). If it would
++   overflow the only way this program doesn't have undefined behavior 
++   is if there is a null terminator in valid memory so wcslen will 
++   suffice.  */
++	mov	%RSI_LP, %R10_LP
++	sar	$62, %R10_LP
++	test	%R10_LP, %R10_LP
++	jnz	__wcslen_sse4_1
++	sal	$2, %RSI_LP
+ # endif
+ 
++
+ /* Initialize long lived registers.  */
+ 
+ 	add	%RDI_LP, %RSI_LP
++# ifdef AS_WCSLEN
++/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
++	jbe	__wcslen_sse4_1
++# endif
+ 	mov	%RSI_LP, %R10_LP
+ 	and	$-64, %R10_LP
+ 	mov	%RSI_LP, %R11_LP
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw28033.patch b/SOURCES/glibc-sw28033.patch
new file mode 100644
index 0000000..d6b677c
--- /dev/null
+++ b/SOURCES/glibc-sw28033.patch
@@ -0,0 +1,153 @@
+From 81174145d5f7b76744c8f635165e88735a944fca Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 2 Mar 2022 14:45:02 -0800
+Subject: [PATCH] x86: Check RTM_ALWAYS_ABORT for RTM [BZ #28033]
+
+From
+
+https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
+
+* Intel TSX will be disabled by default.
+* The processor will force abort all Restricted Transactional Memory (RTM)
+  transactions by default.
+* A new CPUID bit CPUID.07H.0H.EDX[11](RTM_ALWAYS_ABORT) will be enumerated,
+  which is set to indicate to updated software that the loaded microcode is
+  forcing RTM abort.
+* On processors that enumerate support for RTM, the CPUID enumeration bits
+  for Intel TSX (CPUID.07H.0H.EBX[11] and CPUID.07H.0H.EBX[4]) continue to
+  be set by default after microcode update.
+* Workloads that benefited from Intel TSX might experience a change
+  in performance.
+* System software may use a new bit in Model-Specific Register (MSR) 0x10F
+  TSX_FORCE_ABORT[TSX_CPUID_CLEAR] functionality to clear the Hardware Lock
+  Elision (HLE) and RTM bits to indicate to software that Intel TSX is
+  disabled.
+
+1. Add RTM_ALWAYS_ABORT to CPUID features.
+2. Set RTM usable only if RTM_ALWAYS_ABORT isn't set.  This skips the
+string/tst-memchr-rtm etc. testcases on the affected processors, which
+always fail after a microcode update.
+3. Check RTM feature, instead of usability, against /proc/cpuinfo.
+
+This fixes BZ #28033.
+
+(cherry picked from commit ea8e465a6b8d0f26c72bcbe453a854de3abf68ec)
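+
+The resulting runtime decision can be pictured with a stand-alone CPUID
+probe (a sketch only, using GCC's <cpuid.h> rather than glibc's
+cpu-features machinery; the bit positions are the ones listed above):
+
+    #include <cpuid.h>
+    #include <stdio.h>
+
+    int
+    main (void)
+    {
+      unsigned int eax, ebx, ecx, edx;
+      if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
+        return 1;
+      int rtm = (ebx >> 11) & 1;               /* CPUID.07H.0H.EBX[11] */
+      int rtm_always_abort = (edx >> 11) & 1;  /* CPUID.07H.0H.EDX[11] */
+      /* RTM is reported usable only when enumerated and not forced to
+         abort.  */
+      printf ("RTM usable: %d\n", rtm && !rtm_always_abort);
+      return 0;
+    }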
+---
+ manual/platform.texi                    | 3 +++
+ sysdeps/x86/cpu-features.c              | 5 ++++-
+ sysdeps/x86/sys/platform/x86.h          | 6 +++---
+ sysdeps/x86/tst-cpu-features-supports.c | 2 +-
+ sysdeps/x86/tst-get-cpu-features.c      | 2 ++
+ 5 files changed, 13 insertions(+), 5 deletions(-)
+
+diff --git a/manual/platform.texi b/manual/platform.texi
+index 8fec2933..b7e8aef7 100644
+--- a/manual/platform.texi
++++ b/manual/platform.texi
+@@ -510,6 +510,9 @@ capability.
+ @item
+ @code{RTM} -- RTM instruction extensions.
+ 
++@item
++@code{RTM_ALWAYS_ABORT} -- Transactions always abort, making RTM unusable.
++
+ @item
+ @code{SDBG} -- IA32_DEBUG_INTERFACE MSR for silicon debug.
+ 
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index 3610ee5c..4889f062 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -74,7 +74,6 @@ update_usable (struct cpu_features *cpu_features)
+   CPU_FEATURE_SET_USABLE (cpu_features, HLE);
+   CPU_FEATURE_SET_USABLE (cpu_features, BMI2);
+   CPU_FEATURE_SET_USABLE (cpu_features, ERMS);
+-  CPU_FEATURE_SET_USABLE (cpu_features, RTM);
+   CPU_FEATURE_SET_USABLE (cpu_features, RDSEED);
+   CPU_FEATURE_SET_USABLE (cpu_features, ADX);
+   CPU_FEATURE_SET_USABLE (cpu_features, CLFLUSHOPT);
+@@ -90,6 +89,7 @@ update_usable (struct cpu_features *cpu_features)
+   CPU_FEATURE_SET_USABLE (cpu_features, MOVDIRI);
+   CPU_FEATURE_SET_USABLE (cpu_features, MOVDIR64B);
+   CPU_FEATURE_SET_USABLE (cpu_features, FSRM);
++  CPU_FEATURE_SET_USABLE (cpu_features, RTM_ALWAYS_ABORT);
+   CPU_FEATURE_SET_USABLE (cpu_features, SERIALIZE);
+   CPU_FEATURE_SET_USABLE (cpu_features, TSXLDTRK);
+   CPU_FEATURE_SET_USABLE (cpu_features, LAHF64_SAHF64);
+@@ -779,6 +779,9 @@ no_cpuid:
+     GLRO(dl_platform) = "i586";
+ #endif
+ 
++  if (!CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT))
++    CPU_FEATURE_SET_USABLE (cpu_features, RTM);
++
+ #if CET_ENABLED
+ # if HAVE_TUNABLES
+   TUNABLE_GET (x86_ibt, tunable_val_t *,
+diff --git a/sysdeps/x86/sys/platform/x86.h b/sysdeps/x86/sys/platform/x86.h
+index e5cc7c68..7a434926 100644
+--- a/sysdeps/x86/sys/platform/x86.h
++++ b/sysdeps/x86/sys/platform/x86.h
+@@ -247,7 +247,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
+ #define bit_cpu_AVX512_VP2INTERSECT (1u << 8)
+ #define bit_cpu_INDEX_7_EDX_9	(1u << 9)
+ #define bit_cpu_MD_CLEAR	(1u << 10)
+-#define bit_cpu_INDEX_7_EDX_11	(1u << 11)
++#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11)
+ #define bit_cpu_INDEX_7_EDX_12	(1u << 12)
+ #define bit_cpu_INDEX_7_EDX_13	(1u << 13)
+ #define bit_cpu_SERIALIZE	(1u << 14)
+@@ -471,7 +471,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
+ #define index_cpu_AVX512_VP2INTERSECT COMMON_CPUID_INDEX_7
+ #define index_cpu_INDEX_7_EDX_9	COMMON_CPUID_INDEX_7
+ #define index_cpu_MD_CLEAR	COMMON_CPUID_INDEX_7
+-#define index_cpu_INDEX_7_EDX_11 COMMON_CPUID_INDEX_7
++#define index_cpu_RTM_ALWAYS_ABORT COMMON_CPUID_INDEX_7
+ #define index_cpu_INDEX_7_EDX_12 COMMON_CPUID_INDEX_7
+ #define index_cpu_INDEX_7_EDX_13 COMMON_CPUID_INDEX_7
+ #define index_cpu_SERIALIZE	COMMON_CPUID_INDEX_7
+@@ -695,7 +695,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
+ #define reg_AVX512_VP2INTERSECT	edx
+ #define reg_INDEX_7_EDX_9	edx
+ #define reg_MD_CLEAR		edx
+-#define reg_INDEX_7_EDX_11	edx
++#define reg_RTM_ALWAYS_ABORT	edx
+ #define reg_INDEX_7_EDX_12	edx
+ #define reg_INDEX_7_EDX_13	edx
+ #define reg_SERIALIZE		edx
+diff --git a/sysdeps/x86/tst-cpu-features-supports.c b/sysdeps/x86/tst-cpu-features-supports.c
+index 287cf01f..8100a319 100644
+--- a/sysdeps/x86/tst-cpu-features-supports.c
++++ b/sysdeps/x86/tst-cpu-features-supports.c
+@@ -152,7 +152,7 @@ do_test (int argc, char **argv)
+   fails += CHECK_SUPPORTS (rdpid, RDPID);
+   fails += CHECK_SUPPORTS (rdrnd, RDRAND);
+   fails += CHECK_SUPPORTS (rdseed, RDSEED);
+-  fails += CHECK_SUPPORTS (rtm, RTM);
++  fails += CHECK_CPU_SUPPORTS (rtm, RTM);
+   fails += CHECK_SUPPORTS (serialize, SERIALIZE);
+   fails += CHECK_SUPPORTS (sha, SHA);
+   fails += CHECK_CPU_SUPPORTS (shstk, SHSTK);
+diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c
+index 2763deb6..0717e5d8 100644
+--- a/sysdeps/x86/tst-get-cpu-features.c
++++ b/sysdeps/x86/tst-get-cpu-features.c
+@@ -183,6 +183,7 @@ do_test (void)
+   CHECK_CPU_FEATURE (UINTR);
+   CHECK_CPU_FEATURE (AVX512_VP2INTERSECT);
+   CHECK_CPU_FEATURE (MD_CLEAR);
++  CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT);
+   CHECK_CPU_FEATURE (SERIALIZE);
+   CHECK_CPU_FEATURE (HYBRID);
+   CHECK_CPU_FEATURE (TSXLDTRK);
+@@ -344,6 +345,7 @@ do_test (void)
+   CHECK_CPU_FEATURE_USABLE (FSRM);
+   CHECK_CPU_FEATURE_USABLE (AVX512_VP2INTERSECT);
+   CHECK_CPU_FEATURE_USABLE (MD_CLEAR);
++  CHECK_CPU_FEATURE_USABLE (RTM_ALWAYS_ABORT);
+   CHECK_CPU_FEATURE_USABLE (SERIALIZE);
+   CHECK_CPU_FEATURE_USABLE (HYBRID);
+   CHECK_CPU_FEATURE_USABLE (TSXLDTRK);
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw28064.patch b/SOURCES/glibc-sw28064.patch
new file mode 100644
index 0000000..edc3b0d
--- /dev/null
+++ b/SOURCES/glibc-sw28064.patch
@@ -0,0 +1,51 @@
+From 88405b75deedca35e9fbccc8a39368fcf0e58783 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 8 Jul 2021 16:13:19 -0400
+Subject: [PATCH] x86: Remove wcsnlen-sse4_1 from wcslen ifunc-impl-list [BZ
+ #28064]
+
+The following commit
+
+commit 6f573a27b6c8b4236445810a44660612323f5a73
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Wed Jun 23 01:19:34 2021 -0400
+
+    x86-64: Add wcslen optimize for sse4.1
+
+Added wcsnlen-sse4.1 to the wcslen ifunc implementation list and did
+not add wcslen-sse4.1 to wcslen ifunc implementation list. This commit
+fixes that by removing wcsnlen-sse4.1 from the wcslen ifunc
+implementation list and adding wcslen-sse4.1 to the ifunc
+implementation list.
+
+Testing:
+test-wcslen.c, test-rsi-wcslen.c, and test-rsi-strlen.c are passing as
+well as all other tests in wcsmbs and string.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit 0679442defedf7e52a94264975880ab8674736b2)
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 580913ca..695cdba6 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -657,9 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcslen_evex)
+-	      IFUNC_IMPL_ADD (array, i, wcsnlen,
++	      IFUNC_IMPL_ADD (array, i, wcslen,
+ 			      CPU_FEATURE_USABLE (SSE4_1),
+-			      __wcsnlen_sse4_1)
++			      __wcslen_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw28252.patch b/SOURCES/glibc-sw28252.patch
new file mode 100644
index 0000000..0632632
--- /dev/null
+++ b/SOURCES/glibc-sw28252.patch
@@ -0,0 +1,269 @@
+From c166f44e4488af4f4af035645775fe44b12bab13 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 20 Aug 2021 06:42:24 -0700
+Subject: [PATCH] x86-64: Optimize load of all bits set into ZMM register [BZ
+ #28252]
+
+Optimize loads of all bits set into ZMM register in AVX512 SVML codes
+by replacing
+
+	vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
+
+and
+
+	vmovups   .L_2il0floatpacket.13(%rip), %zmmX
+
+with
+	vpternlogd $0xff, %zmmX, %zmmX, %zmmX
+
+This fixes BZ #28252.
+
+(cherry picked from commit 78c9ec9000f873abe7a15a91b87080a2e4308260)
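+
+Why vpternlogd $0xff yields all-ones, modeled bit-by-bit in C
+(illustrative only: the 8-bit immediate is the truth table of a 3-input
+boolean function, and 0xff is the constant-true function, so the result
+is independent of the source registers):
+
+    #include <stdint.h>
+
+    static uint32_t
+    ternlog32 (uint32_t a, uint32_t b, uint32_t c, uint8_t imm)
+    {
+      uint32_t r = 0;
+      for (int i = 0; i < 32; i++)
+        {
+          unsigned int idx = (((a >> i) & 1) << 2)
+                             | (((b >> i) & 1) << 1)
+                             | ((c >> i) & 1);
+          r |= (uint32_t) ((imm >> idx) & 1) << i;
+        }
+      /* ternlog32 (x, y, z, 0xff) == 0xffffffff for any x, y, z.  */
+      return r;
+    }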
+---
+ .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S   |  7 +------
+ .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S   |  7 +------
+ .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S   |  7 +------
+ .../fpu/multiarch/svml_d_sincos8_core_avx512.S       |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
+ .../fpu/multiarch/svml_s_sincosf16_core_avx512.S     |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S |  7 +------
+ 10 files changed, 11 insertions(+), 64 deletions(-)
+
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
+index 24e3b363..07dfed85 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
+@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
+         vmovaps   %zmm0, %zmm8
+ 
+ /* Check for large arguments path */
+-        vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
++        vpternlogd $0xff, %zmm2, %zmm2, %zmm2
+ 
+ /*
+   ARGUMENT RANGE REDUCTION:
+@@ -456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN8v_cos_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.16:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.16,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
+index ae8af8d8..ddb60e5b 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
+@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
+ 
+ /* preserve mantissa, set input exponent to 2^(-10) */
+         vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
+-        vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
++        vpternlogd $0xff, %zmm1, %zmm1, %zmm1
+         vpsrlq    $32, %zmm4, %zmm6
+ 
+ /* reciprocal approximation good to at least 11 bits */
+@@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN8v_log_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.12:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.12,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
+index 2d4b14fd..529c454a 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
+@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
+         andq      $-64, %rsp
+         subq      $1280, %rsp
+         movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
+-        vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
++        vpternlogd $0xff, %zmm1, %zmm1, %zmm14
+         vmovups __dAbsMask(%rax), %zmm7
+         vmovups __dInvPI(%rax), %zmm2
+         vmovups __dRShifter(%rax), %zmm1
+@@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN8v_sin_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.14:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.14,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+index 2df626c0..e501a53a 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
+ 
+ /* SinPoly = SinR*SinPoly */
+         vfmadd213pd %zmm5, %zmm5, %zmm4
+-        vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
++        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
+ 
+ /* Update Cos result's sign */
+         vxorpd    %zmm2, %zmm1, %zmm1
+@@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl)
+ ENTRY (_ZGVeN8vvv_sincos_skx)
+ WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
+ END (_ZGVeN8vvv_sincos_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.15:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.15,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
+index 6ea1137b..377af394 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
+@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
+   X = X - Y*PI1 - Y*PI2 - Y*PI3
+  */
+         vmovaps   %zmm0, %zmm6
+-        vmovups   .L_2il0floatpacket.13(%rip), %zmm12
++        vpternlogd $0xff, %zmm12, %zmm12, %zmm12
+         vmovups __sRShifter(%rax), %zmm3
+         vmovups __sPI1_FMA(%rax), %zmm5
+         vmovups __sA9_FMA(%rax), %zmm9
+@@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN16v_cosf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.13:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.13,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+index 89ba0df2..46f33d46 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
+         vmovaps   %zmm0, %zmm7
+ 
+ /* compare against threshold */
+-        vmovups   .L_2il0floatpacket.13(%rip), %zmm3
++        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
+         vmovups __sInvLn2(%rax), %zmm4
+         vmovups __sShifter(%rax), %zmm1
+         vmovups __sLn2hi(%rax), %zmm6
+@@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
+ 
+ #endif
+ END (_ZGVeN16v_expf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.13:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.13,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
+index 4cf0a96f..9e254956 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
+@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
+         andq      $-64, %rsp
+         subq      $1280, %rsp
+         movq      __svml_slog_data@GOTPCREL(%rip), %rax
+-        vmovups   .L_2il0floatpacket.7(%rip), %zmm6
++        vpternlogd $0xff, %zmm6, %zmm6, %zmm6
+         vmovups _iBrkValue(%rax), %zmm4
+         vmovups _sPoly_7(%rax), %zmm8
+ 
+@@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
+ 
+ #endif
+ END (_ZGVeN16v_logf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.7:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.7,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
+index bdcd50af..e8331ba1 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
+@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
+         vpsrlq    $32, %zmm3, %zmm2
+         vpmovqd   %zmm2, %ymm11
+         vcvtps2pd %ymm14, %zmm13
+-        vmovups   .L_2il0floatpacket.23(%rip), %zmm14
++        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
+         vmovaps   %zmm14, %zmm26
+         vpandd _ABSMASK(%rax), %zmm1, %zmm8
+         vpcmpd    $1, _INF(%rax), %zmm8, %k2
+@@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
+         vpmovqd   %zmm11, %ymm5
+         vpxord    %zmm10, %zmm10, %zmm10
+         vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
+-        vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
++        vpternlogd $0xff, %zmm4, %zmm4, %zmm4
+         vpxord    %zmm11, %zmm11, %zmm11
+         vcvtdq2pd %ymm7, %zmm7
+         vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
+@@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN16vv_powf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.23:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.23,@object
+-.L_2il0floatpacket.24:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.24,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+index 5fa4bc41..1f46f334 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
+ 
+ /* Result sign calculations */
+         vpternlogd $150, %zmm0, %zmm14, %zmm1
+-        vmovups   .L_2il0floatpacket.13(%rip), %zmm14
++        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
+ 
+ /* Add correction term 0.5 for cos() part */
+         vaddps    %zmm8, %zmm5, %zmm15
+@@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl)
+ ENTRY (_ZGVeN16vvv_sincosf_skx)
+ WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
+ END (_ZGVeN16vvv_sincosf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.13:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.13,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
+index 141f747e..1fc9308a 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
+@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
+         movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
+ 
+ /* Check for large and special values */
+-        vmovups   .L_2il0floatpacket.11(%rip), %zmm14
++        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
+         vmovups __sAbsMask(%rax), %zmm5
+         vmovups __sInvPI(%rax), %zmm1
+         vmovups __sRShifter(%rax), %zmm2
+@@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN16v_sinf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.11:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.11,@object
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw28537-1.patch b/SOURCES/glibc-sw28537-1.patch
new file mode 100644
index 0000000..31fdf34
--- /dev/null
+++ b/SOURCES/glibc-sw28537-1.patch
@@ -0,0 +1,39 @@
+From ca37f880abe1c8e3464fd646fd8b89171d8fca0c Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Thu, 11 Nov 2021 06:31:51 -0800
+Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_lock_full [BZ
+ #28537]
+
+Replace boolean CAS with value CAS to avoid the extra load.
+
+Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
+(cherry picked from commit 0b82747dc48d5bf0871bdc6da8cb6eec1256355f)
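+
+The difference can be sketched outside glibc with GCC's __sync builtins
+(an analogy only; the real code uses the internal
+atomic_compare_and_exchange_{bool,val}_acq macros shown in the hunk):
+
+    static int lock_word;
+
+    /* Boolean CAS: a failed attempt forces a separate re-read.  */
+    static int
+    refresh_oldval_bool (int oldval, int newval)
+    {
+      if (!__sync_bool_compare_and_swap (&lock_word, oldval, newval))
+        oldval = lock_word;        /* the extra load this patch removes */
+      return oldval;
+    }
+
+    /* Value CAS: the observed value comes back from the CAS itself.  */
+    static int
+    refresh_oldval_val (int oldval, int newval)
+    {
+      return __sync_val_compare_and_swap (&lock_word, oldval, newval);
+    }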
+---
+ nptl/pthread_mutex_lock.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
+index 29cc143e..60ada70d 100644
+--- a/nptl/pthread_mutex_lock.c
++++ b/nptl/pthread_mutex_lock.c
+@@ -292,12 +292,12 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
+ 	     meantime.  */
+ 	  if ((oldval & FUTEX_WAITERS) == 0)
+ 	    {
+-	      if (atomic_compare_and_exchange_bool_acq (&mutex->__data.__lock,
+-							oldval | FUTEX_WAITERS,
+-							oldval)
+-		  != 0)
++	      int val;
++	      if ((val = atomic_compare_and_exchange_val_acq
++		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
++		    oldval)) != oldval)
+ 		{
+-		  oldval = mutex->__data.__lock;
++		  oldval = val;
+ 		  continue;
+ 		}
+ 	      oldval |= FUTEX_WAITERS;
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw28537-2.patch b/SOURCES/glibc-sw28537-2.patch
new file mode 100644
index 0000000..1e477b9
--- /dev/null
+++ b/SOURCES/glibc-sw28537-2.patch
@@ -0,0 +1,39 @@
+From 56212a35fe258f45f242f4bb80866e4980e4ec0e Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Thu, 11 Nov 2021 06:54:01 -0800
+Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_clocklock_common
+ [BZ #28537]
+
+Replace boolean CAS with value CAS to avoid the extra load.
+
+Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
+(cherry picked from commit 49302b8fdf9103b6fc0a398678668a22fa19574c)
+---
+ nptl/pthread_mutex_timedlock.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
+index 888c12fe..c4627ef6 100644
+--- a/nptl/pthread_mutex_timedlock.c
++++ b/nptl/pthread_mutex_timedlock.c
+@@ -269,12 +269,12 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex,
+ 	     meantime.  */
+ 	  if ((oldval & FUTEX_WAITERS) == 0)
+ 	    {
+-	      if (atomic_compare_and_exchange_bool_acq (&mutex->__data.__lock,
+-							oldval | FUTEX_WAITERS,
+-							oldval)
+-		  != 0)
++	      int val;
++	      if ((val = atomic_compare_and_exchange_val_acq
++		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
++		    oldval)) != oldval)
+ 		{
+-		  oldval = mutex->__data.__lock;
++		  oldval = val;
+ 		  continue;
+ 		}
+ 	      oldval |= FUTEX_WAITERS;
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw28537-3.patch b/SOURCES/glibc-sw28537-3.patch
new file mode 100644
index 0000000..422dcc9
--- /dev/null
+++ b/SOURCES/glibc-sw28537-3.patch
@@ -0,0 +1,51 @@
+From 2fa2ea233bec906b682fc82376649a1a6e18e9df Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Tue, 2 Nov 2021 18:33:07 -0700
+Subject: [PATCH] Add LLL_MUTEX_READ_LOCK [BZ #28537]
+
+CAS instruction is expensive.  From the x86 CPU's point of view, getting
+a cache line for writing is more expensive than reading.  See Appendix
+A.2 Spinlock in:
+
+https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf
+
+The full compare and swap will grab the cache line exclusive and cause
+excessive cache line bouncing.
+
+Add LLL_MUTEX_READ_LOCK to do an atomic load and skip CAS in spinlock
+loop if compare may fail to reduce cache line bouncing on contended locks.
+
+Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
+(cherry picked from commit d672a98a1af106bd68deb15576710cd61363f7a6)
+---
+ nptl/pthread_mutex_lock.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
+index 60ada70d..eb4d8baa 100644
+--- a/nptl/pthread_mutex_lock.c
++++ b/nptl/pthread_mutex_lock.c
+@@ -56,6 +56,11 @@
+ #define FORCE_ELISION(m, s)
+ #endif
+ 
++#ifndef LLL_MUTEX_READ_LOCK
++# define LLL_MUTEX_READ_LOCK(mutex) \
++  atomic_load_relaxed (&(mutex)->__data.__lock)
++#endif
++
+ static int __pthread_mutex_lock_full (pthread_mutex_t *mutex)
+      __attribute_noinline__;
+ 
+@@ -136,6 +141,8 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
+ 		  break;
+ 		}
+ 	      atomic_spin_nop ();
++	      if (LLL_MUTEX_READ_LOCK (mutex) != 0)
++		continue;
+ 	    }
+ 	  while (LLL_MUTEX_TRYLOCK (mutex) != 0);
+ 
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw28537-4.patch b/SOURCES/glibc-sw28537-4.patch
new file mode 100644
index 0000000..08457cb
--- /dev/null
+++ b/SOURCES/glibc-sw28537-4.patch
@@ -0,0 +1,43 @@
+From bae6e60ec32acdbc5e61d94d6e222e456b796054 Mon Sep 17 00:00:00 2001
+From: Jangwoong Kim <6812skiii@gmail.com>
+Date: Tue, 14 Dec 2021 21:30:51 +0900
+Subject: [PATCH] nptl: Effectively skip CAS in spinlock loop
+
+The commit:
+"Add LLL_MUTEX_READ_LOCK [BZ #28537]"
+SHA1: d672a98a1af106bd68deb15576710cd61363f7a6
+
+introduced LLL_MUTEX_READ_LOCK to skip the CAS in the spinlock loop
+if the atomic load fails. But "continue" inside a do-while loop
+does not skip the evaluation of the controlling expression, so the
+CAS is not skipped.
+
+Replace do-while with while and skip LLL_MUTEX_TRYLOCK if
+LLL_MUTEX_READ_LOCK fails.
+
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit 6b8dbbd03ac88f169b65b5c7d7278576a11d2e44)
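+
+The resulting loop shape, modeled as stand-alone C (a sketch with GCC
+builtins in place of LLL_MUTEX_READ_LOCK / LLL_MUTEX_TRYLOCK; the point
+is that the write-intent CAS only runs once a plain load has seen the
+lock free):
+
+    static int lock;
+
+    static void
+    spin_lock_sketch (void)
+    {
+      while (__atomic_load_n (&lock, __ATOMIC_RELAXED) != 0
+             || !__sync_bool_compare_and_swap (&lock, 0, 1))
+        __builtin_ia32_pause ();   /* plays the role of atomic_spin_nop */
+    }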
+---
+ nptl/pthread_mutex_lock.c | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
+index a633d95e..d96a9933 100644
+--- a/nptl/pthread_mutex_lock.c
++++ b/nptl/pthread_mutex_lock.c
+@@ -141,10 +141,9 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
+ 		  break;
+ 		}
+ 	      atomic_spin_nop ();
+-	      if (LLL_MUTEX_READ_LOCK (mutex) != 0)
+-		continue;
+ 	    }
+-	  while (LLL_MUTEX_TRYLOCK (mutex) != 0);
++	  while (LLL_MUTEX_READ_LOCK (mutex) != 0
++		 || LLL_MUTEX_TRYLOCK (mutex) != 0);
+ 
+ 	  mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8;
+ 	}
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw28646.patch b/SOURCES/glibc-sw28646.patch
new file mode 100644
index 0000000..c0e2f6f
--- /dev/null
+++ b/SOURCES/glibc-sw28646.patch
@@ -0,0 +1,56 @@
+From 3c819146751bedbca46f5f089a786d7b5595baa8 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 2 Mar 2022 16:30:33 -0800
+Subject: [PATCH] x86-64: Use notl in EVEX strcmp [BZ #28646]
+
+Must use notl %edi here as lower bits are for CHAR comparisons
+potentially out of range thus can be 0 without indicating mismatch.
+This fixes BZ #28646.
+
+Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02)
+---
+ sysdeps/x86_64/multiarch/strcmp-evex.S | 14 ++++++++------
+ 1 file changed, 8 insertions(+), 6 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index 82f12ac8..6f5c4bf9 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -656,12 +656,13 @@ L(loop_cross_page):
+ 	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
+ 	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
+ 	kmovd	%k3, %edi
++    /* Must use notl %edi here as lower bits are for CHAR
++	   comparisons potentially out of range thus can be 0 without
++	   indicating mismatch.  */
++	notl	%edi
+ # ifdef USE_AS_WCSCMP
+ 	/* Don't use subl since it is the upper 8 bits of EDI below.  */
+-	notl	%edi
+ 	andl	$0xff, %edi
+-# else
+-	incl	%edi
+ # endif
+ 
+ # ifdef USE_AS_WCSCMP
+@@ -743,12 +744,13 @@ L(loop_cross_page_2_vec):
+ 	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
+ 	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
+ 	kmovd	%k3, %edi
++	/* Must use notl %edi here as lower bits are for CHAR
++	   comparisons potentially out of range thus can be 0 without
++	   indicating mismatch.  */
++	notl	%edi
+ # ifdef USE_AS_WCSCMP
+ 	/* Don't use subl since it is the upper 8 bits of EDI below.  */
+-	notl	%edi
+ 	andl	$0xff, %edi
+-# else
+-	incl	%edi
+ # endif
+ 
+ # ifdef USE_AS_WCSCMP
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw28755-1.patch b/SOURCES/glibc-sw28755-1.patch
new file mode 100644
index 0000000..5b9e539
--- /dev/null
+++ b/SOURCES/glibc-sw28755-1.patch
@@ -0,0 +1,41 @@
+From 62bbd92e0ffb25a1e31b3d6b0f03a307109320c1 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 2 Mar 2022 12:42:18 -0800
+Subject: [PATCH] x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
+
+Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
+__wcscmp_avx2. For x86_64 this covers the entire address range so any
+length larger could not possibly be used to bound `s1` or `s2`.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+(cherry picked from commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87)
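+
+The guard corresponds to roughly this C logic (illustrative; the public
+wcscmp stands in for the internal __wcscmp_avx2 target of the jump):
+
+    #include <wchar.h>
+
+    static int
+    wcsncmp_sketch (const wchar_t *s1, const wchar_t *s2, size_t n)
+    {
+      /* A length with any of the top 8 bits set cannot bound a real
+         object on x86_64, and n * sizeof (wchar_t) could wrap, so fall
+         back to the unbounded comparison.  */
+      if (n >> 56 != 0)
+        return wcscmp (s1, s2);
+      return wcsncmp (s1, s2, n);
+    }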
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 156c1949..8fb8eedc 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -83,6 +83,16 @@ ENTRY (STRCMP)
+ 	je	L(char0)
+ 	jb	L(zero)
+ #  ifdef USE_AS_WCSCMP
++#  ifndef __ILP32__
++	movq	%rdx, %rcx
++	/* Check if length could overflow when multiplied by
++	   sizeof(wchar_t). Checking top 8 bits will cover all potential
++	   overflow cases as well as redirect cases where its impossible to
++	   length to bound a valid memory region. In these cases just use
++	   'wcscmp'.  */
++	shrq	$56, %rcx
++	jnz	__wcscmp_avx2
++#  endif
+ 	/* Convert units: from wide to byte char.  */
+ 	shl	$2, %RDX_LP
+ #  endif
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw28755-2.patch b/SOURCES/glibc-sw28755-2.patch
new file mode 100644
index 0000000..4b87cb6
--- /dev/null
+++ b/SOURCES/glibc-sw28755-2.patch
@@ -0,0 +1,41 @@
+From edb82a1c8cb1e29dae2fa897013aa5f74edc537e Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sun, 9 Jan 2022 16:02:28 -0600
+Subject: [PATCH] x86: Fix __wcsncmp_evex in strcmp-evex.S [BZ# 28755]
+
+Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
+__wcscmp_evex. For x86_64 this covers the entire address range so any
+length larger could not possibly be used to bound `s1` or `s2`.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+(cherry picked from commit 7e08db3359c86c94918feb33a1182cd0ff3bb10b)
+---
+ sysdeps/x86_64/multiarch/strcmp-evex.S | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index 459eeed0..d5aa6daa 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -97,6 +97,16 @@ ENTRY (STRCMP)
+ 	je	L(char0)
+ 	jb	L(zero)
+ #  ifdef USE_AS_WCSCMP
++#  ifndef __ILP32__
++	movq	%rdx, %rcx
++	/* Check if length could overflow when multiplied by
++	   sizeof(wchar_t). Checking top 8 bits will cover all potential
++	   overflow cases as well as redirect cases where its impossible to
++	   length to bound a valid memory region. In these cases just use
++	   'wcscmp'.  */
++	shrq	$56, %rcx
++	jnz	__wcscmp_evex
++#  endif
+ 	/* Convert units: from wide to byte char.  */
+ 	shl	$2, %RDX_LP
+ #  endif
+-- 
+GitLab
+
diff --git a/SOURCES/glibc-sw28896.patch b/SOURCES/glibc-sw28896.patch
new file mode 100644
index 0000000..051c7a9
--- /dev/null
+++ b/SOURCES/glibc-sw28896.patch
@@ -0,0 +1,132 @@
+From 8f2267e67ea924c949ca4a70a0d2c484d81a8fc9 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 15 Feb 2022 08:18:15 -0600
+Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ
+ #28896]
+
+In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
+call strcmp-avx2 and wcscmp-avx2 respectively. This would have
+not checks around vzeroupper and would trigger spurious
+aborts. This commit fixes that.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
+AVX2 machines with and without RTM.
+
+Co-authored-by: H.J. Lu <hjl.tools@gmail.com>
+
+(cherry picked from commit c6272098323153db373f2986c67786ea8c85f1cf)
+---
+ sysdeps/x86/Makefile                        |  2 +-
+ sysdeps/x86/tst-strncmp-rtm.c               | 17 ++++++++++++++++-
+ sysdeps/x86_64/multiarch/strcmp-avx2.S      |  2 +-
+ sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S |  1 +
+ sysdeps/x86_64/multiarch/strncmp-avx2.S     |  1 +
+ sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S |  2 +-
+ sysdeps/x86_64/multiarch/wcsncmp-avx2.S     |  2 +-
+ 7 files changed, 22 insertions(+), 5 deletions(-)
+
+diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
+index 5be71ada..2d814915 100644
+--- a/sysdeps/x86/Makefile
++++ b/sysdeps/x86/Makefile
+@@ -38,7 +38,7 @@ CFLAGS-tst-memset-rtm.c += -mrtm
+ CFLAGS-tst-strchr-rtm.c += -mrtm
+ CFLAGS-tst-strcpy-rtm.c += -mrtm
+ CFLAGS-tst-strlen-rtm.c += -mrtm
+-CFLAGS-tst-strncmp-rtm.c += -mrtm
++CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
+ CFLAGS-tst-strrchr-rtm.c += -mrtm
+ endif
+ 
+diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
+index 236ad951..4d0004b5 100644
+--- a/sysdeps/x86/tst-strncmp-rtm.c
++++ b/sysdeps/x86/tst-strncmp-rtm.c
+@@ -16,6 +16,7 @@
+    License along with the GNU C Library; if not, see
+    <https://www.gnu.org/licenses/>.  */
+ 
++#include <stdint.h>
+ #include <tst-string-rtm.h>
+ 
+ #define LOOP 3000
+@@ -45,8 +46,22 @@ function (void)
+     return 1;
+ }
+ 
++__attribute__ ((noinline, noclone))
++static int
++function_overflow (void)
++{
++  if (strncmp (string1, string2, SIZE_MAX) == 0)
++    return 0;
++  else
++    return 1;
++}
++
+ static int
+ do_test (void)
+ {
+-  return do_test_1 ("strncmp", LOOP, prepare, function);
++  int status = do_test_1 ("strncmp", LOOP, prepare, function);
++  if (status != EXIT_SUCCESS)
++    return status;
++  status = do_test_1 ("strncmp", LOOP, prepare, function_overflow);
++  return status;
+ }
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 5d1c9d90..433ae047 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -95,7 +95,7 @@ ENTRY (STRCMP)
+ 	   length to bound a valid memory region. In these cases just use
+ 	   'wcscmp'.  */
+ 	shrq	$56, %rcx
+-	jnz	__wcscmp_avx2
++	jnz	OVERFLOW_STRCMP
+ #  endif
+ 	/* Convert units: from wide to byte char.  */
+ 	shl	$2, %RDX_LP
+diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
+index 37d1224b..68bad365 100644
+--- a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
++++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
+@@ -1,3 +1,4 @@
+ #define STRCMP	__strncmp_avx2_rtm
+ #define USE_AS_STRNCMP 1
++#define OVERFLOW_STRCMP	__strcmp_avx2_rtm
+ #include "strcmp-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S
+index 1678bcc2..f138e9f1 100644
+--- a/sysdeps/x86_64/multiarch/strncmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S
+@@ -1,3 +1,4 @@
+ #define STRCMP	__strncmp_avx2
+ #define USE_AS_STRNCMP 1
++#define OVERFLOW_STRCMP __strcmp_avx2
+ #include "strcmp-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
+index 4e88c70c..f467582c 100644
+--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
++++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
+@@ -1,5 +1,5 @@
+ #define STRCMP __wcsncmp_avx2_rtm
+ #define USE_AS_STRNCMP 1
+ #define USE_AS_WCSCMP 1
+-
++#define OVERFLOW_STRCMP	__wcscmp_avx2_rtm
+ #include "strcmp-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
+index 4fa1de4d..e9ede522 100644
+--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
+@@ -1,5 +1,5 @@
+ #define STRCMP __wcsncmp_avx2
+ #define USE_AS_STRNCMP 1
+ #define USE_AS_WCSCMP 1
+-
++#define OVERFLOW_STRCMP	__wcscmp_avx2
+ #include "strcmp-avx2.S"
+-- 
+GitLab
+
diff --git a/SOURCES/ia-avoid_short_distance_rep_movsb.patch b/SOURCES/ia-avoid_short_distance_rep_movsb.patch
new file mode 100644
index 0000000..a7d087b
--- /dev/null
+++ b/SOURCES/ia-avoid_short_distance_rep_movsb.patch
@@ -0,0 +1,136 @@
+From 869a7106bfc301aa021a77a9bcede85eddd17da1 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 2 Mar 2022 15:33:52 -0800
+Subject: [PATCH] x86-64: Add Avoid_Short_Distance_REP_MOVSB
+
+commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Sat Jan 25 14:19:40 2020 -0800
+
+    x86-64: Avoid rep movsb with short distance [BZ #27130]
+
+introduced some regressions on Intel processors without Fast Short REP
+MOV (FSRM).  Add Avoid_Short_Distance_REP_MOVSB to avoid rep movsb with
+short distance only on Intel processors with FSRM.  bench-memmove-large
+on Skylake server shows that cycles of __memmove_evex_unaligned_erms
+improves for the following data size:
+
+                                  before    after    Improvement
+length=4127, align1=3, align2=0:  479.38    349.25      27%
+length=4223, align1=9, align2=5:  405.62    333.25      18%
+length=8223, align1=3, align2=0:  786.12    496.38      37%
+length=8319, align1=9, align2=5:  727.50    501.38      31%
+length=16415, align1=3, align2=0: 1436.88   840.00      41%
+length=16511, align1=9, align2=5: 1375.50   836.38      39%
+length=32799, align1=3, align2=0: 2890.00   1860.12     36%
+length=32895, align1=9, align2=5: 2891.38   1931.88     33%
+
+(cherry picked from commit 91cc803d27bda34919717b496b53cf279e44a922)
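+
+The runtime gate added to memmove-vec-unaligned-erms.S amounts to
+roughly this C logic (a loose rendering with local stand-ins for the
+glibc-internal __x86_string_control tunable and its bit macro; the
+63-byte figure matches the existing short-distance check):
+
+    /* Stand-in for the flag set up from CPU features at startup.  */
+    static int x86_string_control;
+    #define AVOID_SHORT_DISTANCE_REP_MOVSB (1 << 0)
+
+    static int
+    want_rep_movsb (const char *dst, const char *src)
+    {
+      if ((x86_string_control & AVOID_SHORT_DISTANCE_REP_MOVSB)
+          && (unsigned int) (dst - src) <= 63)
+        return 0;                  /* use the vector-copy path instead */
+      return 1;
+    }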
+---
+ sysdeps/x86/cacheinfo.h                                  | 9 +++++++++
+ sysdeps/x86/cpu-features.c                               | 5 +++++
+ .../include/cpu-features-preferred_feature_index_1.def   | 1 +
+ sysdeps/x86/sysdep.h                                     | 5 +++++
+ sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S    | 5 +++++
+ 5 files changed, 25 insertions(+)
+
+diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
+index b982982f..f72f634a 100644
+--- a/sysdeps/x86/cacheinfo.h
++++ b/sysdeps/x86/cacheinfo.h
+@@ -48,6 +48,11 @@ long int __x86_rep_stosb_threshold attribute_hidden = 2048;
+ /* Threshold to stop using Enhanced REP MOVSB.  */
+ long int __x86_rep_movsb_stop_threshold attribute_hidden;
+ 
++/* A bit-wise OR of string/memory requirements for optimal performance
++   e.g. X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB.  These bits
++   are used at runtime to tune implementation behavior.  */
++int __x86_string_control attribute_hidden;
++
+ static void
+ get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
+ 		       long int core)
+@@ -435,6 +440,10 @@ init_cacheinfo (void)
+   if (cpu_features->basic.kind != arch_kind_amd)
+     __x86_rep_movsb_stop_threshold = __x86_shared_non_temporal_threshold;
+ 
++  if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_Short_Distance_REP_MOVSB))
++    __x86_string_control
++      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
++
+ # if HAVE_TUNABLES
+   __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
+ # endif
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index 4889f062..8885b48e 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -580,6 +580,11 @@ init_cpu_features (struct cpu_features *cpu_features)
+ 	      &= ~bit_arch_AVX_Fast_Unaligned_Load;
+ 	  }
+ 	}
++
++      /* Avoid short distance REP MOVSB on processors with FSRM.  */
++      if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
++	cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
++	  |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
+     }
+   /* This spells out "CentaurHauls" or " Shanghai ".  */
+   else if ((ebx == 0x746e6543 && ecx == 0x736c7561 && edx == 0x48727561)
+diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+index 4ca70b40..f2340624 100644
+--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
++++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+@@ -33,3 +33,4 @@ BIT (Prefer_FSRM)
+ BIT (Prefer_No_AVX512)
+ BIT (MathVec_Prefer_No_AVX512)
+ BIT (Prefer_AVX2_STRCMP)
++BIT (Avoid_Short_Distance_REP_MOVSB)
+diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
+index f41f4ebd..01bac0f6 100644
+--- a/sysdeps/x86/sysdep.h
++++ b/sysdeps/x86/sysdep.h
+@@ -57,6 +57,11 @@ enum cf_protection_level
+ #define STATE_SAVE_MASK \
+   ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
+ 
++/* Constants for bits in __x86_string_control:  */
++
++/* Avoid short distance REP MOVSB.  */
++#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB	(1 << 0)
++
+ #ifdef	__ASSEMBLER__
+ 
+ /* Syntactic details of assembler.  */
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 620ce3a8..0469bf99 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -325,12 +325,16 @@ L(movsb):
+ 	/* Avoid slow backward REP MOVSB.  */
+ 	jb	L(more_8x_vec_backward)
+ # if AVOID_SHORT_DISTANCE_REP_MOVSB
++	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
++	jz	3f
+ 	movq	%rdi, %rcx
+ 	subq	%rsi, %rcx
+ 	jmp	2f
+ # endif
+ 1:
+ # if AVOID_SHORT_DISTANCE_REP_MOVSB
++	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
++	jz	3f
+ 	movq	%rsi, %rcx
+ 	subq	%rdi, %rcx
+ 2:
+@@ -338,6 +342,7 @@ L(movsb):
+    is N*4GB + [1..63] with N >= 0.  */
+ 	cmpl	$63, %ecx
+ 	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
++3:
+ # endif
+ 	mov	%RDX_LP, %RCX_LP
+ 	rep movsb
+-- 
+GitLab
+
diff --git a/SOURCES/ia-avx-opt-funct-rtm.patch b/SOURCES/ia-avx-opt-funct-rtm.patch
new file mode 100644
index 0000000..5101b4c
--- /dev/null
+++ b/SOURCES/ia-avx-opt-funct-rtm.patch
@@ -0,0 +1,2564 @@
+From 041fc69e69905aa9193178c49b44dce7bb8b5d6d Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 2 Mar 2022 14:25:33 -0800
+Subject: [PATCH]  x86-64: Add AVX optimized string/memory functions for RTM
+
+Since VZEROUPPER triggers RTM abort while VZEROALL won't, select AVX
+optimized string/memory functions with
+
+	xtest
+	jz	1f
+	vzeroall
+	ret
+1:
+	vzeroupper
+	ret
+
+at function exit on processors with usable RTM, but without 256-bit EVEX
+instructions to avoid VZEROUPPER inside a transactionally executing RTM
+region.
+
+(cherry picked from commit 7ebba91361badf7531d4e75050627a88d424872f)
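+
+The resulting dispatch priority in the ifunc selectors can be summarized by
+the following C sketch (illustrative only; the function and its parameters
+are hypothetical, with the CPU_FEATURE_USABLE_P tests collapsed into
+booleans — the real logic is in the sysdeps/x86_64/multiarch/ifunc-*.h
+changes below):
+
+	/* Assumes AVX2 itself is already known to be usable.  */
+	static const void *
+	select_impl (int evex_usable, int rtm_usable,
+		     int prefer_no_vzeroupper,
+		     const void *evex, const void *avx2_rtm,
+		     const void *avx2, const void *fallback)
+	{
+	  if (evex_usable)
+	    return evex;	/* 256-bit EVEX never needs VZEROUPPER.  */
+	  if (rtm_usable)
+	    return avx2_rtm;	/* AVX2 with the xtest-guarded exit above.  */
+	  if (!prefer_no_vzeroupper)
+	    return avx2;	/* Plain AVX2, VZEROUPPER at exit.  */
+	  return fallback;	/* SSE2 variants.  */
+	}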
+---
+ sysdeps/x86_64/multiarch/Makefile             |  27 +++
+ sysdeps/x86_64/multiarch/ifunc-avx2.h         |   4 +
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 170 ++++++++++++++++++
+ sysdeps/x86_64/multiarch/ifunc-memcmp.h       |   4 +
+ sysdeps/x86_64/multiarch/ifunc-memmove.h      |  12 ++
+ sysdeps/x86_64/multiarch/ifunc-memset.h       |  12 ++
+ sysdeps/x86_64/multiarch/ifunc-strcpy.h       |   4 +
+ sysdeps/x86_64/multiarch/ifunc-wmemset.h      |   5 +
+ sysdeps/x86_64/multiarch/memchr-avx2-rtm.S    |  12 ++
+ sysdeps/x86_64/multiarch/memchr-avx2.S        |  45 +++--
+ .../x86_64/multiarch/memcmp-avx2-movbe-rtm.S  |  12 ++
+ sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S  |  28 ++-
+ .../memmove-avx-unaligned-erms-rtm.S          |  17 ++
+ .../multiarch/memmove-vec-unaligned-erms.S    |  33 ++--
+ sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S   |  12 ++
+ sysdeps/x86_64/multiarch/memrchr-avx2.S       |  53 +++---
+ .../memset-avx2-unaligned-erms-rtm.S          |  10 ++
+ .../multiarch/memset-avx2-unaligned-erms.S    |  12 +-
+ .../multiarch/memset-vec-unaligned-erms.S     |  41 ++---
+ sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S |   4 +
+ sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S    |   3 +
+ sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S   |   4 +
+ sysdeps/x86_64/multiarch/strcat-avx2-rtm.S    |  12 ++
+ sysdeps/x86_64/multiarch/strcat-avx2.S        |   6 +-
+ sysdeps/x86_64/multiarch/strchr-avx2-rtm.S    |  12 ++
+ sysdeps/x86_64/multiarch/strchr-avx2.S        |  22 +--
+ sysdeps/x86_64/multiarch/strchr.c             |   4 +
+ sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S |   3 +
+ sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S    |  12 ++
+ sysdeps/x86_64/multiarch/strcmp-avx2.S        |  55 +++---
+ sysdeps/x86_64/multiarch/strcmp.c             |   4 +
+ sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S    |  12 ++
+ sysdeps/x86_64/multiarch/strcpy-avx2.S        |  85 ++++-----
+ sysdeps/x86_64/multiarch/strlen-avx2-rtm.S    |  12 ++
+ sysdeps/x86_64/multiarch/strlen-avx2.S        |  43 ++---
+ sysdeps/x86_64/multiarch/strncat-avx2-rtm.S   |   3 +
+ sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S   |   3 +
+ sysdeps/x86_64/multiarch/strncmp.c            |   4 +
+ sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S   |   3 +
+ sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S   |   4 +
+ sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S   |  12 ++
+ sysdeps/x86_64/multiarch/strrchr-avx2.S       |  19 +-
+ sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S    |   3 +
+ sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S    |   4 +
+ sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S    |   4 +
+ sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S   |   5 +
+ sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S   |   5 +
+ sysdeps/x86_64/multiarch/wcsnlen.c            |   4 +
+ sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S   |   3 +
+ sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S   |   4 +
+ .../x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S |   4 +
+ sysdeps/x86_64/sysdep.h                       |  22 +++
+ 52 files changed, 668 insertions(+), 244 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strchr-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strlen-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 9d79b138..491c7698 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -40,6 +40,25 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   memset-sse2-unaligned-erms \
+ 		   memset-avx2-unaligned-erms \
+ 		   memset-avx512-unaligned-erms \
++		   memchr-avx2-rtm \
++		   memcmp-avx2-movbe-rtm \
++		   memmove-avx-unaligned-erms-rtm \
++		   memrchr-avx2-rtm \
++		   memset-avx2-unaligned-erms-rtm \
++		   rawmemchr-avx2-rtm \
++		   strchr-avx2-rtm \
++		   strcmp-avx2-rtm \
++		   strchrnul-avx2-rtm \
++		   stpcpy-avx2-rtm \
++		   stpncpy-avx2-rtm \
++		   strcat-avx2-rtm \
++		   strcpy-avx2-rtm \
++		   strlen-avx2-rtm \
++		   strncat-avx2-rtm \
++		   strncmp-avx2-rtm \
++		   strncpy-avx2-rtm \
++		   strnlen-avx2-rtm \
++		   strrchr-avx2-rtm \
+ 		   memchr-evex \
+ 		   memcmp-evex-movbe \
+ 		   memmove-evex-unaligned-erms \
+@@ -76,6 +95,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ 		   wcsrchr-sse2 wcsrchr-avx2 \
+ 		   wcsnlen-sse4_1 wcsnlen-c \
+ 		   wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
++		   wcschr-avx2-rtm \
++		   wcscmp-avx2-rtm \
++		   wcslen-avx2-rtm \
++		   wcsncmp-avx2-rtm \
++		   wcsnlen-avx2-rtm \
++		   wcsrchr-avx2-rtm \
++		   wmemchr-avx2-rtm \
++		   wmemcmp-avx2-movbe-rtm \
+ 		   wcschr-evex \
+ 		   wcscmp-evex \
+ 		   wcslen-evex \
+diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
+index 7081b0c9..e0f30e61 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
++++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
+@@ -21,6 +21,7 @@
+ 
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+@@ -36,6 +37,9 @@ IFUNC_SELECTOR (void)
+ 	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ 	return OPTIMIZE (evex);
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_rtm);
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2);
+     }
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index c8da910e..c1efeec0 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -43,6 +43,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, memchr,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memchr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memchr,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -56,6 +60,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __memcmp_avx2_movbe)
++	      IFUNC_IMPL_ADD (array, i, memcmp,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (MOVBE)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memcmp_avx2_movbe_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -85,6 +94,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memmove_chk_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memmove_chk_avx_unaligned_rtm)
++	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memmove_chk_avx_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ 			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_chk_evex_unaligned)
+@@ -113,6 +130,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memmove_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, memmove,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memmove_avx_unaligned_rtm)
++	      IFUNC_IMPL_ADD (array, i, memmove,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memmove_avx_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+ 			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_evex_unaligned)
+@@ -143,6 +168,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memrchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memrchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, memrchr,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memrchr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memrchr,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -165,6 +194,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memset_chk_avx2_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, __memset_chk,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memset_chk_avx2_unaligned_rtm)
++	      IFUNC_IMPL_ADD (array, i, __memset_chk,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memset_chk_avx2_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -198,6 +235,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memset_avx2_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, memset,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memset_avx2_unaligned_rtm)
++	      IFUNC_IMPL_ADD (array, i, memset,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memset_avx2_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -222,6 +267,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __rawmemchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, rawmemchr,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __rawmemchr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -234,6 +283,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strlen_avx2)
++	      IFUNC_IMPL_ADD (array, i, strlen,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -245,6 +298,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strnlen_avx2)
++	      IFUNC_IMPL_ADD (array, i, strnlen,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strnlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -257,6 +314,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __stpncpy_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
+ 			      __stpncpy_avx2)
++	      IFUNC_IMPL_ADD (array, i, stpncpy,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __stpncpy_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, stpncpy,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -271,6 +332,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __stpcpy_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
+ 			      __stpcpy_avx2)
++	      IFUNC_IMPL_ADD (array, i, stpcpy,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __stpcpy_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, stpcpy,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -309,6 +374,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, strcat,
+ 	      IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2),
+ 			      __strcat_avx2)
++	      IFUNC_IMPL_ADD (array, i, strcat,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strcat_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strcat,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -323,6 +392,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, strchr,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strchr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strchr,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -336,6 +409,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strchrnul_avx2)
++	      IFUNC_IMPL_ADD (array, i, strchrnul,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strchrnul_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -348,6 +425,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strrchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strrchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, strrchr,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strrchr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strrchr,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -359,6 +440,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strcmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strcmp_avx2)
++	      IFUNC_IMPL_ADD (array, i, strcmp,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strcmp_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strcmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -375,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, strcpy,
+ 	      IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2),
+ 			      __strcpy_avx2)
++	      IFUNC_IMPL_ADD (array, i, strcpy,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strcpy_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strcpy,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -422,6 +511,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, strncat,
+ 	      IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2),
+ 			      __strncat_avx2)
++	      IFUNC_IMPL_ADD (array, i, strncat,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strncat_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strncat,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -436,6 +529,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, strncpy,
+ 	      IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2),
+ 			      __strncpy_avx2)
++	      IFUNC_IMPL_ADD (array, i, strncpy,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strncpy_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strncpy,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -469,6 +566,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcschr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcschr_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcschr,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __wcschr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcschr,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -481,6 +582,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcsrchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcsrchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcsrchr,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __wcsrchr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcsrchr,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -493,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcscmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcscmp_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcscmp,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __wcscmp_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcscmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -505,6 +614,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcsncmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcsncmp_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcsncmp,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __wcsncmp_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcsncmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -523,6 +636,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcslen,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcslen_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcslen,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __wcslen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -535,6 +652,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcsnlen_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcsnlen,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __wcsnlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -550,6 +671,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wmemchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wmemchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, wmemchr,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __wmemchr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wmemchr,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -563,6 +688,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __wmemcmp_avx2_movbe)
++	      IFUNC_IMPL_ADD (array, i, wmemcmp,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (MOVBE)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __wmemcmp_avx2_movbe_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -581,6 +711,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wmemset_avx2_unaligned)
++	      IFUNC_IMPL_ADD (array, i, wmemset,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __wmemset_avx2_unaligned_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+ 			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __wmemset_evex_unaligned)
+@@ -606,6 +740,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memcpy_chk_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memcpy_chk_avx_unaligned_rtm)
++	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memcpy_chk_avx_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_chk_evex_unaligned)
+@@ -634,6 +776,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memcpy,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memcpy_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, memcpy,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memcpy_avx_unaligned_rtm)
++	      IFUNC_IMPL_ADD (array, i, memcpy,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memcpy_avx_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy,
+ 			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_evex_unaligned)
+@@ -676,6 +826,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __mempcpy_chk_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __mempcpy_chk_avx_unaligned_rtm)
++	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __mempcpy_chk_avx_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_chk_evex_unaligned)
+@@ -713,6 +871,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __mempcpy_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, mempcpy,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __mempcpy_avx_unaligned_rtm)
++	      IFUNC_IMPL_ADD (array, i, mempcpy,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __mempcpy_avx_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+ 			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_evex_unaligned)
+@@ -734,6 +900,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strncmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strncmp_avx2)
++	      IFUNC_IMPL_ADD (array, i, strncmp,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strncmp_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strncmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+index 3ca1f0a6..8043c635 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+@@ -23,6 +23,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
+ 
+ static inline void *
+@@ -38,6 +39,9 @@ IFUNC_SELECTOR (void)
+ 	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ 	return OPTIMIZE (evex_movbe);
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_movbe_rtm);
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2_movbe);
+     }
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+index 6f8bce5f..fa09b9fb 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
+   attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm)
++  attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+   attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+@@ -71,6 +75,14 @@ IFUNC_SELECTOR (void)
+ 	  return OPTIMIZE (evex_unaligned);
+ 	}
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE (avx_unaligned_erms_rtm);
++
++	  return OPTIMIZE (avx_unaligned_rtm);
++	}
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	{
+ 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
+index 6f31f4dc..6f3375cc 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
+@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
+   attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm)
++  attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+   attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+@@ -69,6 +73,14 @@ IFUNC_SELECTOR (void)
+ 	  return OPTIMIZE (evex_unaligned);
+ 	}
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE (avx2_unaligned_erms_rtm);
++
++	  return OPTIMIZE (avx2_unaligned_rtm);
++	}
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	{
+ 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+index deae6348..a924762e 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
++++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
+   attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+@@ -39,6 +40,9 @@ IFUNC_SELECTOR (void)
+ 	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ 	return OPTIMIZE (evex);
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_rtm);
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2);
+     }
+diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+index 9290c4bf..bdc94c6c 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
++++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+@@ -20,6 +20,8 @@
+ 
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
++  attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
+ 
+@@ -39,6 +41,9 @@ IFUNC_SELECTOR (void)
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+ 	return OPTIMIZE (evex_unaligned);
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_unaligned_rtm);
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2_unaligned);
+     }
+diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
+new file mode 100644
+index 00000000..87b076c7
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
+@@ -0,0 +1,12 @@
++#ifndef MEMCHR
++# define MEMCHR __memchr_avx2_rtm
++#endif
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
++
++#define SECTION(p) p##.avx.rtm
++
++#include "memchr-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
+index c81da19b..cf893e77 100644
+--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
+@@ -34,9 +34,13 @@
+ #  define VZEROUPPER	vzeroupper
+ # endif
+ 
++# ifndef SECTION
++#  define SECTION(p)	p##.avx
++# endif
++
+ # define VEC_SIZE 32
+ 
+-	.section .text.avx,"ax",@progbits
++	.section SECTION(.text),"ax",@progbits
+ ENTRY (MEMCHR)
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check for zero length.  */
+@@ -107,8 +111,8 @@ L(cros_page_boundary):
+ # endif
+ 	addq	%rdi, %rax
+ 	addq	%rcx, %rax
+-	VZEROUPPER
+-	ret
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+ L(aligned_more):
+@@ -224,8 +228,7 @@ L(last_4x_vec_or_less):
+ 
+ 	jnz	L(first_vec_x3_check)
+ 	xorl	%eax, %eax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_2x_vec):
+@@ -243,8 +246,7 @@ L(last_2x_vec):
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1_check)
+ 	xorl	%eax, %eax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x0_check):
+@@ -253,8 +255,7 @@ L(first_vec_x0_check):
+ 	cmpq	%rax, %rdx
+ 	jbe	L(zero)
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x1_check):
+@@ -264,8 +265,7 @@ L(first_vec_x1_check):
+ 	jbe	L(zero)
+ 	addq	$VEC_SIZE, %rax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x2_check):
+@@ -275,8 +275,7 @@ L(first_vec_x2_check):
+ 	jbe	L(zero)
+ 	addq	$(VEC_SIZE * 2), %rax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x3_check):
+@@ -286,12 +285,14 @@ L(first_vec_x3_check):
+ 	jbe	L(zero)
+ 	addq	$(VEC_SIZE * 3), %rax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(zero):
+-	VZEROUPPER
++	xorl	%eax, %eax
++	jmp     L(return_vzeroupper)
++
++	.p2align 4
+ L(null):
+ 	xorl	%eax, %eax
+ 	ret
+@@ -301,24 +302,21 @@ L(null):
+ L(first_vec_x0):
+ 	tzcntl	%eax, %eax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x1):
+ 	tzcntl	%eax, %eax
+ 	addq	$VEC_SIZE, %rax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x2):
+ 	tzcntl	%eax, %eax
+ 	addq	$(VEC_SIZE * 2), %rax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(4x_vec_end):
+@@ -337,8 +335,7 @@ L(first_vec_x3):
+ 	tzcntl	%eax, %eax
+ 	addq	$(VEC_SIZE * 3), %rax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ END (MEMCHR)
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S
+new file mode 100644
+index 00000000..cf4eff5d
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S
+@@ -0,0 +1,12 @@
++#ifndef MEMCMP
++# define MEMCMP __memcmp_avx2_movbe_rtm
++#endif
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
++
++#define SECTION(p) p##.avx.rtm
++
++#include "memcmp-avx2-movbe.S"
+diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+index e3a35b89..9d5c9c72 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+@@ -47,6 +47,10 @@
+ #  define VZEROUPPER	vzeroupper
+ # endif
+ 
++# ifndef SECTION
++#  define SECTION(p)	p##.avx
++# endif
++
+ # define VEC_SIZE 32
+ # define VEC_MASK ((1 << VEC_SIZE) - 1)
+ 
+@@ -55,7 +59,7 @@
+            memcmp has to use UNSIGNED comparison for elements.
+ */
+ 
+-	.section .text.avx,"ax",@progbits
++	.section SECTION(.text),"ax",@progbits
+ ENTRY (MEMCMP)
+ # ifdef USE_AS_WMEMCMP
+ 	shl	$2, %RDX_LP
+@@ -123,8 +127,8 @@ ENTRY (MEMCMP)
+ 	vptest	%ymm0, %ymm5
+ 	jnc	L(4x_vec_end)
+ 	xorl	%eax, %eax
+-	VZEROUPPER
+-	ret
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+ L(last_2x_vec):
+@@ -144,8 +148,7 @@ L(last_vec):
+ 	vpmovmskb %ymm2, %eax
+ 	subl    $VEC_MASK, %eax
+ 	jnz	L(first_vec)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec):
+@@ -164,8 +167,7 @@ L(wmemcmp_return):
+ 	movzbl	(%rsi, %rcx), %edx
+ 	sub	%edx, %eax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ # ifdef USE_AS_WMEMCMP
+ 	.p2align 4
+@@ -367,8 +369,7 @@ L(last_4x_vec):
+ 	vpmovmskb %ymm2, %eax
+ 	subl    $VEC_MASK, %eax
+ 	jnz	L(first_vec)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(4x_vec_end):
+@@ -394,8 +395,7 @@ L(4x_vec_end):
+ 	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+ 	sub	%edx, %eax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x1):
+@@ -410,8 +410,7 @@ L(first_vec_x1):
+ 	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+ 	sub	%edx, %eax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x2):
+@@ -426,7 +425,6 @@ L(first_vec_x2):
+ 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+ 	sub	%edx, %eax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ END (MEMCMP)
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+new file mode 100644
+index 00000000..1ec1962e
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+@@ -0,0 +1,17 @@
++#if IS_IN (libc)
++# define VEC_SIZE	32
++# define VEC(i)		ymm##i
++# define VMOVNT		vmovntdq
++# define VMOVU		vmovdqu
++# define VMOVA		vmovdqa
++
++# define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++# define VZEROUPPER_RETURN jmp	 L(return)
++
++# define SECTION(p)		p##.avx.rtm
++# define MEMMOVE_SYMBOL(p,s)	p##_avx_##s##_rtm
++
++# include "memmove-vec-unaligned-erms.S"
++#endif
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 08e21692..71f5954d 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -140,11 +140,12 @@ L(last_2x_vec):
+ 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+ 	VMOVU	%VEC(0), (%rdi)
+ 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+-	VZEROUPPER
+ #if !defined USE_MULTIARCH || !IS_IN (libc)
+ L(nop):
+-#endif
+ 	ret
++#else
++	VZEROUPPER_RETURN
++#endif
+ #if defined USE_MULTIARCH && IS_IN (libc)
+ END (MEMMOVE_SYMBOL (__memmove, unaligned))
+ 
+@@ -237,8 +238,11 @@ L(last_2x_vec):
+ 	VMOVU	%VEC(0), (%rdi)
+ 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+ L(return):
+-	VZEROUPPER
++#if VEC_SIZE > 16
++	ZERO_UPPER_VEC_REGISTERS_RETURN
++#else
+ 	ret
++#endif
+ 
+ L(movsb):
+ 	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+@@ -289,8 +293,7 @@ L(between_32_63):
+ 	VMOVU	-32(%rsi,%rdx), %YMM1
+ 	VMOVU	%YMM0, (%rdi)
+ 	VMOVU	%YMM1, -32(%rdi,%rdx)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ #endif
+ #if VEC_SIZE > 16
+ 	/* From 16 to 31.  No branch when size == 16.  */
+@@ -299,7 +302,7 @@ L(between_16_31):
+ 	VMOVU	-16(%rsi,%rdx), %XMM1
+ 	VMOVU	%XMM0, (%rdi)
+ 	VMOVU	%XMM1, -16(%rdi,%rdx)
+-	ret
++	VZEROUPPER_RETURN
+ #endif
+ L(between_8_15):
+ 	/* From 8 to 15.  No branch when size == 8.  */
+@@ -352,8 +355,7 @@ L(more_2x_vec):
+ 	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
+ 	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
+ 	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ L(last_4x_vec):
+ 	/* Copy from 2 * VEC to 4 * VEC. */
+ 	VMOVU	(%rsi), %VEC(0)
+@@ -364,8 +366,7 @@ L(last_4x_vec):
+ 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+ 	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
+ 	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ L(more_8x_vec):
+ 	cmpq	%rsi, %rdi
+@@ -421,8 +422,7 @@ L(loop_4x_vec_forward):
+ 	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+ 	/* Store the first VEC.  */
+ 	VMOVU	%VEC(4), (%r11)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ L(more_8x_vec_backward):
+ 	/* Load the first 4 * VEC and last VEC to support overlapping
+@@ -473,8 +473,7 @@ L(loop_4x_vec_backward):
+ 	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+ 	/* Store the last VEC.  */
+ 	VMOVU	%VEC(8), (%r11)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+ L(large_forward):
+@@ -509,8 +508,7 @@ L(loop_large_forward):
+ 	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+ 	/* Store the first VEC.  */
+ 	VMOVU	%VEC(4), (%r11)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ L(large_backward):
+ 	/* Don't use non-temporal store if there is overlap between
+@@ -544,8 +542,7 @@ L(loop_large_backward):
+ 	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+ 	/* Store the last VEC.  */
+ 	VMOVU	%VEC(8), (%r11)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ #endif
+ END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ 
+diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
+new file mode 100644
+index 00000000..cea2d2a7
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
+@@ -0,0 +1,12 @@
++#ifndef MEMRCHR
++# define MEMRCHR __memrchr_avx2_rtm
++#endif
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
++
++#define SECTION(p) p##.avx.rtm
++
++#include "memrchr-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
+index ce488dd9..20efe7ac 100644
+--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
+@@ -20,14 +20,22 @@
+ 
+ # include <sysdep.h>
+ 
++# ifndef MEMRCHR
++#  define MEMRCHR	__memrchr_avx2
++# endif
++
+ # ifndef VZEROUPPER
+ #  define VZEROUPPER	vzeroupper
+ # endif
+ 
++# ifndef SECTION
++#  define SECTION(p)	p##.avx
++# endif
++
+ # define VEC_SIZE 32
+ 
+-	.section .text.avx,"ax",@progbits
+-ENTRY (__memrchr_avx2)
++	.section SECTION(.text),"ax",@progbits
++ENTRY (MEMRCHR)
+ 	/* Broadcast CHAR to YMM0.  */
+ 	vmovd	%esi, %xmm0
+ 	vpbroadcastb %xmm0, %ymm0
+@@ -134,8 +142,8 @@ L(loop_4x_vec):
+ 	vpmovmskb %ymm1, %eax
+ 	bsrl	%eax, %eax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+ L(last_4x_vec_or_less):
+@@ -169,8 +177,7 @@ L(last_4x_vec_or_less):
+ 	addq	%rax, %rdx
+ 	jl	L(zero)
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_2x_vec):
+@@ -191,31 +198,27 @@ L(last_2x_vec):
+ 	jl	L(zero)
+ 	addl	$(VEC_SIZE * 2), %eax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_vec_x0):
+ 	bsrl	%eax, %eax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_vec_x1):
+ 	bsrl	%eax, %eax
+ 	addl	$VEC_SIZE, %eax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_vec_x2):
+ 	bsrl	%eax, %eax
+ 	addl	$(VEC_SIZE * 2), %eax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_vec_x3):
+@@ -232,8 +235,7 @@ L(last_vec_x1_check):
+ 	jl	L(zero)
+ 	addl	$VEC_SIZE, %eax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_vec_x3_check):
+@@ -243,12 +245,14 @@ L(last_vec_x3_check):
+ 	jl	L(zero)
+ 	addl	$(VEC_SIZE * 3), %eax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(zero):
+-	VZEROUPPER
++	xorl	%eax, %eax
++	VZEROUPPER_RETURN
++
++	.p2align 4
+ L(null):
+ 	xorl	%eax, %eax
+ 	ret
+@@ -273,8 +277,7 @@ L(last_vec_or_less_aligned):
+ 
+ 	bsrl	%eax, %eax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_vec_or_less):
+@@ -315,8 +318,7 @@ L(last_vec_or_less):
+ 	bsrl	%eax, %eax
+ 	addq	%rdi, %rax
+ 	addq	%r8, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_vec_2x_aligned):
+@@ -353,7 +355,6 @@ L(last_vec_2x_aligned):
+ 	bsrl	%eax, %eax
+ 	addq	%rdi, %rax
+ 	addq	%r8, %rax
+-	VZEROUPPER
+-	ret
+-END (__memrchr_avx2)
++	VZEROUPPER_RETURN
++END (MEMRCHR)
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+new file mode 100644
+index 00000000..8ac3e479
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+@@ -0,0 +1,10 @@
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return)
++
++#define SECTION(p) p##.avx.rtm
++#define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
++#define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
++
++#include "memset-avx2-unaligned-erms.S"
+diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+index 7ab3d898..ae0860f3 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+@@ -14,9 +14,15 @@
+   movq r, %rax; \
+   vpbroadcastd %xmm0, %ymm0
+ 
+-# define SECTION(p)		p##.avx
+-# define MEMSET_SYMBOL(p,s)	p##_avx2_##s
+-# define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
++# ifndef SECTION
++#  define SECTION(p)		p##.avx
++# endif
++# ifndef MEMSET_SYMBOL
++#  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
++# endif
++# ifndef WMEMSET_SYMBOL
++#  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
++# endif
+ 
+ # include "memset-vec-unaligned-erms.S"
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 71e91a8f..bae5cba4 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -45,17 +45,14 @@
+ #ifndef VZEROUPPER
+ # if VEC_SIZE > 16
+ #  define VZEROUPPER			vzeroupper
++#  define VZEROUPPER_SHORT_RETURN	vzeroupper; ret
+ # else
+ #  define VZEROUPPER
+ # endif
+ #endif
+ 
+ #ifndef VZEROUPPER_SHORT_RETURN
+-# if VEC_SIZE > 16
+-#  define VZEROUPPER_SHORT_RETURN	vzeroupper
+-# else
+-#  define VZEROUPPER_SHORT_RETURN	rep
+-# endif
++# define VZEROUPPER_SHORT_RETURN	rep; ret
+ #endif
+ 
+ #ifndef MOVQ
+@@ -117,8 +114,7 @@ L(entry_from_bzero):
+ 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+ 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+ 	VMOVU	%VEC(0), (%rdi)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ #if defined USE_MULTIARCH && IS_IN (libc)
+ END (MEMSET_SYMBOL (__memset, unaligned))
+ 
+@@ -141,14 +137,12 @@ ENTRY (__memset_erms)
+ ENTRY (MEMSET_SYMBOL (__memset, erms))
+ # endif
+ L(stosb):
+-	/* Issue vzeroupper before rep stosb.  */
+-	VZEROUPPER
+ 	mov	%RDX_LP, %RCX_LP
+ 	movzbl	%sil, %eax
+ 	mov	%RDI_LP, %RDX_LP
+ 	rep stosb
+ 	mov	%RDX_LP, %RAX_LP
+-	ret
++	VZEROUPPER_RETURN
+ # if VEC_SIZE == 16
+ END (__memset_erms)
+ # else
+@@ -175,8 +169,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+ 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+ 	VMOVU	%VEC(0), (%rdi)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ L(stosb_more_2x_vec):
+ 	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
+@@ -190,8 +183,11 @@ L(more_2x_vec):
+ 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+ 	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+ L(return):
+-	VZEROUPPER
++#if VEC_SIZE > 16
++	ZERO_UPPER_VEC_REGISTERS_RETURN
++#else
+ 	ret
++#endif
+ 
+ L(loop_start):
+ 	leaq	(VEC_SIZE * 4)(%rdi), %rcx
+@@ -217,7 +213,6 @@ L(loop):
+ 	cmpq	%rcx, %rdx
+ 	jne	L(loop)
+ 	VZEROUPPER_SHORT_RETURN
+-	ret
+ L(less_vec):
+ 	/* Less than 1 VEC.  */
+ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+@@ -241,40 +236,34 @@ L(less_vec):
+ 	jb	1f
+ 	movb	%cl, (%rdi)
+ 1:
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ # if VEC_SIZE > 32
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ L(between_32_63):
+ 	VMOVU	%YMM0, -32(%rdi,%rdx)
+ 	VMOVU	%YMM0, (%rdi)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ # endif
+ # if VEC_SIZE > 16
+ 	/* From 16 to 31.  No branch when size == 16.  */
+ L(between_16_31):
+ 	VMOVU	%XMM0, -16(%rdi,%rdx)
+ 	VMOVU	%XMM0, (%rdi)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ # endif
+ 	/* From 8 to 15.  No branch when size == 8.  */
+ L(between_8_15):
+ 	movq	%rcx, -8(%rdi,%rdx)
+ 	movq	%rcx, (%rdi)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ L(between_4_7):
+ 	/* From 4 to 7.  No branch when size == 4.  */
+ 	movl	%ecx, -4(%rdi,%rdx)
+ 	movl	%ecx, (%rdi)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ L(between_2_3):
+ 	/* From 2 to 3.  No branch when size == 2.  */
+ 	movw	%cx, -2(%rdi,%rdx)
+ 	movw	%cx, (%rdi)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ END (MEMSET_SYMBOL (__memset, unaligned_erms))
+diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S
+new file mode 100644
+index 00000000..acc5f6e2
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S
+@@ -0,0 +1,4 @@
++#define MEMCHR __rawmemchr_avx2_rtm
++#define USE_AS_RAWMEMCHR 1
++
++#include "memchr-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
+new file mode 100644
+index 00000000..2b9c07a5
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
+@@ -0,0 +1,3 @@
++#define USE_AS_STPCPY
++#define STRCPY __stpcpy_avx2_rtm
++#include "strcpy-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
+new file mode 100644
+index 00000000..60a2ccfe
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
+@@ -0,0 +1,4 @@
++#define USE_AS_STPCPY
++#define USE_AS_STRNCPY
++#define STRCPY __stpncpy_avx2_rtm
++#include "strcpy-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
+new file mode 100644
+index 00000000..637fb557
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
+@@ -0,0 +1,12 @@
++#ifndef STRCAT
++# define STRCAT __strcat_avx2_rtm
++#endif
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
++
++#define SECTION(p) p##.avx.rtm
++
++#include "strcat-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
+index b0623564..aa48c058 100644
+--- a/sysdeps/x86_64/multiarch/strcat-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
+@@ -30,7 +30,11 @@
+ /* Number of bytes in a vector register */
+ # define VEC_SIZE	32
+ 
+-	.section .text.avx,"ax",@progbits
++# ifndef SECTION
++#  define SECTION(p)	p##.avx
++# endif
++
++	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRCAT)
+ 	mov	%rdi, %r9
+ # ifdef USE_AS_STRNCAT
+diff --git a/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S
+new file mode 100644
+index 00000000..81f20d1d
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S
+@@ -0,0 +1,12 @@
++#ifndef STRCHR
++# define STRCHR __strchr_avx2_rtm
++#endif
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
++
++#define SECTION(p) p##.avx.rtm
++
++#include "strchr-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
+index 47bc3c99..da7d2620 100644
+--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
+@@ -38,9 +38,13 @@
+ #  define VZEROUPPER	vzeroupper
+ # endif
+ 
++# ifndef SECTION
++#  define SECTION(p)	p##.avx
++# endif
++
+ # define VEC_SIZE 32
+ 
+-	.section .text.avx,"ax",@progbits
++	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRCHR)
+ 	movl	%edi, %ecx
+ 	/* Broadcast CHAR to YMM0.  */
+@@ -93,8 +97,8 @@ L(cros_page_boundary):
+ 	cmp	(%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+ L(aligned_more):
+@@ -190,8 +194,7 @@ L(first_vec_x0):
+ 	cmp	(%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x1):
+@@ -205,8 +208,7 @@ L(first_vec_x1):
+ 	cmp	(%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x2):
+@@ -220,8 +222,7 @@ L(first_vec_x2):
+ 	cmp	(%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(4x_vec_end):
+@@ -247,8 +248,7 @@ L(first_vec_x3):
+ 	cmp	(%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ END (STRCHR)
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
+index be05e197..7e582f02 100644
+--- a/sysdeps/x86_64/multiarch/strchr.c
++++ b/sysdeps/x86_64/multiarch/strchr.c
+@@ -29,6 +29,7 @@
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+@@ -44,6 +45,9 @@ IFUNC_SELECTOR (void)
+ 	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ 	return OPTIMIZE (evex);
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_rtm);
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2);
+     }
+diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S
+new file mode 100644
+index 00000000..cdcf818b
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S
+@@ -0,0 +1,3 @@
++#define STRCHR __strchrnul_avx2_rtm
++#define USE_AS_STRCHRNUL 1
++#include "strchr-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S
+new file mode 100644
+index 00000000..aecd30d9
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S
+@@ -0,0 +1,12 @@
++#ifndef STRCMP
++# define STRCMP __strcmp_avx2_rtm
++#endif
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
++
++#define SECTION(p) p##.avx.rtm
++
++#include "strcmp-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 8fb8eedc..5d1c9d90 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -55,6 +55,10 @@
+ #  define VZEROUPPER	vzeroupper
+ # endif
+ 
++# ifndef SECTION
++#  define SECTION(p)	p##.avx
++# endif
++
+ /* Warning!
+            wcscmp/wcsncmp have to use SIGNED comparison for elements.
+            strcmp/strncmp have to use UNSIGNED comparison for elements.
+@@ -75,7 +79,7 @@
+    the maximum offset is reached before a difference is found, zero is
+    returned.  */
+ 
+-	.section .text.avx,"ax",@progbits
++	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRCMP)
+ # ifdef USE_AS_STRNCMP
+ 	/* Check for simple cases (0 or 1) in offset.  */
+@@ -137,8 +141,8 @@ L(return):
+ 	movzbl	(%rsi, %rdx), %edx
+ 	subl	%edx, %eax
+ # endif
+-	VZEROUPPER
+-	ret
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+ L(return_vec_size):
+@@ -171,8 +175,7 @@ L(return_vec_size):
+ 	subl	%edx, %eax
+ #  endif
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(return_2_vec_size):
+@@ -205,8 +208,7 @@ L(return_2_vec_size):
+ 	subl	%edx, %eax
+ #  endif
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(return_3_vec_size):
+@@ -239,8 +241,7 @@ L(return_3_vec_size):
+ 	subl	%edx, %eax
+ #  endif
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(next_3_vectors):
+@@ -366,8 +367,7 @@ L(back_to_loop):
+ 	subl	%edx, %eax
+ #  endif
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(test_vec):
+@@ -410,8 +410,7 @@ L(test_vec):
+ 	subl	%edx, %eax
+ #  endif
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(test_2_vec):
+@@ -454,8 +453,7 @@ L(test_2_vec):
+ 	subl	%edx, %eax
+ #  endif
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(test_3_vec):
+@@ -496,8 +494,7 @@ L(test_3_vec):
+ 	subl	%edx, %eax
+ #  endif
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(loop_cross_page):
+@@ -566,8 +563,7 @@ L(loop_cross_page):
+ 	subl	%edx, %eax
+ #  endif
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(loop_cross_page_2_vec):
+@@ -641,8 +637,7 @@ L(loop_cross_page_2_vec):
+ 	subl	%edx, %eax
+ #  endif
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ # ifdef USE_AS_STRNCMP
+ L(string_nbyte_offset_check):
+@@ -684,8 +679,7 @@ L(cross_page_loop):
+ # ifndef USE_AS_WCSCMP
+ L(different):
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ # ifdef USE_AS_WCSCMP
+ 	.p2align 4
+@@ -695,16 +689,14 @@ L(different):
+ 	setl	%al
+ 	negl	%eax
+ 	orl	$1, %eax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ # endif
+ 
+ # ifdef USE_AS_STRNCMP
+ 	.p2align 4
+ L(zero):
+ 	xorl	%eax, %eax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(char0):
+@@ -718,8 +710,7 @@ L(char0):
+ 	movzbl	(%rdi), %eax
+ 	subl	%ecx, %eax
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ # endif
+ 
+ 	.p2align 4
+@@ -744,8 +735,7 @@ L(last_vector):
+ 	movzbl	(%rsi, %rdx), %edx
+ 	subl	%edx, %eax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	/* Comparing on page boundary region requires special treatment:
+ 	   It must done one vector at the time, starting with the wider
+@@ -866,7 +856,6 @@ L(cross_page_4bytes):
+ 	testl	%eax, %eax
+ 	jne	L(cross_page_loop)
+ 	subl	%ecx, %eax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ END (STRCMP)
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
+index c5f38510..11bbea2b 100644
+--- a/sysdeps/x86_64/multiarch/strcmp.c
++++ b/sysdeps/x86_64/multiarch/strcmp.c
+@@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+@@ -46,6 +47,9 @@ IFUNC_SELECTOR (void)
+ 	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
+ 	return OPTIMIZE (evex);
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_rtm);
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2);
+     }
+diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
+new file mode 100644
+index 00000000..c2c581ec
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
+@@ -0,0 +1,12 @@
++#ifndef STRCPY
++# define STRCPY __strcpy_avx2_rtm
++#endif
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
++
++#define SECTION(p) p##.avx.rtm
++
++#include "strcpy-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
+index 81677f90..613c59aa 100644
+--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
+@@ -37,6 +37,10 @@
+ #  define VZEROUPPER	vzeroupper
+ # endif
+ 
++# ifndef SECTION
++#  define SECTION(p)	p##.avx
++# endif
++
+ /* zero register */
+ #define xmmZ	xmm0
+ #define ymmZ	ymm0
+@@ -46,7 +50,7 @@
+ 
+ # ifndef USE_AS_STRCAT
+ 
+-	.section .text.avx,"ax",@progbits
++	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRCPY)
+ #  ifdef USE_AS_STRNCPY
+ 	mov	%rdx, %r8
+@@ -369,8 +373,8 @@ L(CopyVecSizeExit):
+ 	lea	1(%rdi), %rdi
+ 	jnz	L(StrncpyFillTailWithZero)
+ # endif
+-	VZEROUPPER
+-	ret
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+ L(CopyTwoVecSize1):
+@@ -553,8 +557,7 @@ L(Exit1):
+ 	lea	2(%rdi), %rdi
+ 	jnz	L(StrncpyFillTailWithZero)
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Exit2):
+@@ -569,8 +572,7 @@ L(Exit2):
+ 	lea	3(%rdi), %rdi
+ 	jnz	L(StrncpyFillTailWithZero)
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Exit3):
+@@ -584,8 +586,7 @@ L(Exit3):
+ 	lea	4(%rdi), %rdi
+ 	jnz	L(StrncpyFillTailWithZero)
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Exit4_7):
+@@ -602,8 +603,7 @@ L(Exit4_7):
+ 	lea	1(%rdi, %rdx), %rdi
+ 	jnz	L(StrncpyFillTailWithZero)
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Exit8_15):
+@@ -620,8 +620,7 @@ L(Exit8_15):
+ 	lea	1(%rdi, %rdx), %rdi
+ 	jnz	L(StrncpyFillTailWithZero)
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Exit16_31):
+@@ -638,8 +637,7 @@ L(Exit16_31):
+ 	lea 1(%rdi, %rdx), %rdi
+ 	jnz L(StrncpyFillTailWithZero)
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Exit32_63):
+@@ -656,8 +654,7 @@ L(Exit32_63):
+ 	lea	1(%rdi, %rdx), %rdi
+ 	jnz	L(StrncpyFillTailWithZero)
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ # ifdef USE_AS_STRNCPY
+ 
+@@ -671,8 +668,7 @@ L(StrncpyExit1):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, 1(%rdi)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(StrncpyExit2):
+@@ -684,8 +680,7 @@ L(StrncpyExit2):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, 2(%rdi)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(StrncpyExit3_4):
+@@ -699,8 +694,7 @@ L(StrncpyExit3_4):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, (%rdi, %r8)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(StrncpyExit5_8):
+@@ -714,8 +708,7 @@ L(StrncpyExit5_8):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, (%rdi, %r8)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(StrncpyExit9_16):
+@@ -729,8 +722,7 @@ L(StrncpyExit9_16):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, (%rdi, %r8)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(StrncpyExit17_32):
+@@ -744,8 +736,7 @@ L(StrncpyExit17_32):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, (%rdi, %r8)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(StrncpyExit33_64):
+@@ -760,8 +751,7 @@ L(StrncpyExit33_64):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, (%rdi, %r8)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(StrncpyExit65):
+@@ -778,50 +768,43 @@ L(StrncpyExit65):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, 65(%rdi)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ #  ifndef USE_AS_STRCAT
+ 
+ 	.p2align 4
+ L(Fill1):
+ 	mov	%dl, (%rdi)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Fill2):
+ 	mov	%dx, (%rdi)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Fill3_4):
+ 	mov	%dx, (%rdi)
+ 	mov     %dx, -2(%rdi, %r8)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Fill5_8):
+ 	mov	%edx, (%rdi)
+ 	mov     %edx, -4(%rdi, %r8)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Fill9_16):
+ 	mov	%rdx, (%rdi)
+ 	mov	%rdx, -8(%rdi, %r8)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Fill17_32):
+ 	vmovdqu %xmmZ, (%rdi)
+ 	vmovdqu %xmmZ, -16(%rdi, %r8)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(CopyVecSizeUnalignedVec2):
+@@ -898,8 +881,7 @@ L(Fill):
+ 	cmp	$1, %r8d
+ 	ja	L(Fill2)
+ 	je	L(Fill1)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ /* end of ifndef USE_AS_STRCAT */
+ #  endif
+@@ -929,8 +911,7 @@ L(UnalignedFourVecSizeLeaveCase3):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, (VEC_SIZE * 4)(%rdi)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(UnalignedFourVecSizeLeaveCase2):
+@@ -1001,16 +982,14 @@ L(StrncpyExit):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, (%rdi)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(ExitZero):
+ #  ifndef USE_AS_STRCAT
+ 	mov	%rdi, %rax
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ # endif
+ 
+diff --git a/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S
+new file mode 100644
+index 00000000..75b4b761
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S
+@@ -0,0 +1,12 @@
++#ifndef STRLEN
++# define STRLEN __strlen_avx2_rtm
++#endif
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
++
++#define SECTION(p) p##.avx.rtm
++
++#include "strlen-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
+index 645e0446..82826e10 100644
+--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
+@@ -36,9 +36,13 @@
+ #  define VZEROUPPER	vzeroupper
+ # endif
+ 
++# ifndef SECTION
++#  define SECTION(p)	p##.avx
++# endif
++
+ # define VEC_SIZE 32
+ 
+-	.section .text.avx,"ax",@progbits
++	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRLEN)
+ # ifdef USE_AS_STRNLEN
+ 	/* Check for zero length.  */
+@@ -111,8 +115,8 @@ L(cros_page_boundary):
+ # ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+ L(aligned_more):
+@@ -231,8 +235,7 @@ L(last_4x_vec_or_less):
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_2x_vec):
+@@ -253,8 +256,7 @@ L(last_2x_vec):
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x0_check):
+@@ -267,8 +269,7 @@ L(first_vec_x0_check):
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x1_check):
+@@ -282,8 +283,7 @@ L(first_vec_x1_check):
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x2_check):
+@@ -297,8 +297,7 @@ L(first_vec_x2_check):
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x3_check):
+@@ -312,8 +311,7 @@ L(first_vec_x3_check):
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(max):
+@@ -321,8 +319,7 @@ L(max):
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(zero):
+@@ -338,8 +335,7 @@ L(first_vec_x0):
+ # ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x1):
+@@ -350,8 +346,7 @@ L(first_vec_x1):
+ # ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x2):
+@@ -362,8 +357,7 @@ L(first_vec_x2):
+ # ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(4x_vec_end):
+@@ -389,8 +383,7 @@ L(first_vec_x3):
+ # ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ END (STRLEN)
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
+new file mode 100644
+index 00000000..0dcea18d
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
+@@ -0,0 +1,3 @@
++#define USE_AS_STRNCAT
++#define STRCAT __strncat_avx2_rtm
++#include "strcat-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
+new file mode 100644
+index 00000000..37d1224b
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
+@@ -0,0 +1,3 @@
++#define STRCMP	__strncmp_avx2_rtm
++#define USE_AS_STRNCMP 1
++#include "strcmp-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
+index 4c15542f..44c85116 100644
+--- a/sysdeps/x86_64/multiarch/strncmp.c
++++ b/sysdeps/x86_64/multiarch/strncmp.c
+@@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+@@ -46,6 +47,9 @@ IFUNC_SELECTOR (void)
+ 	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
+ 	return OPTIMIZE (evex);
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_rtm);
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2);
+     }
+diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
+new file mode 100644
+index 00000000..79e70832
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
+@@ -0,0 +1,3 @@
++#define USE_AS_STRNCPY
++#define STRCPY __strncpy_avx2_rtm
++#include "strcpy-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S
+new file mode 100644
+index 00000000..04f1626a
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S
+@@ -0,0 +1,4 @@
++#define STRLEN __strnlen_avx2_rtm
++#define USE_AS_STRNLEN 1
++
++#include "strlen-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S
+new file mode 100644
+index 00000000..5def14ec
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S
+@@ -0,0 +1,12 @@
++#ifndef STRRCHR
++# define STRRCHR __strrchr_avx2_rtm
++#endif
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
++
++#define SECTION(p) p##.avx.rtm
++
++#include "strrchr-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
+index 4381e6ab..9f22a15e 100644
+--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
+@@ -36,9 +36,13 @@
+ #  define VZEROUPPER	vzeroupper
+ # endif
+ 
++# ifndef SECTION
++#  define SECTION(p)	p##.avx
++# endif
++
+ # define VEC_SIZE	32
+ 
+-	.section .text.avx,"ax",@progbits
++	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRRCHR)
+ 	movd	%esi, %xmm4
+ 	movl	%edi, %ecx
+@@ -166,8 +170,8 @@ L(return_value):
+ # endif
+ 	bsrl	%eax, %eax
+ 	leaq	-VEC_SIZE(%rdi, %rax), %rax
+-	VZEROUPPER
+-	ret
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+ L(match):
+@@ -198,8 +202,7 @@ L(find_nul):
+ 	jz	L(return_value)
+ 	bsrl	%eax, %eax
+ 	leaq	-VEC_SIZE(%rdi, %rax), %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(char_and_nul):
+@@ -222,14 +225,12 @@ L(char_and_nul_in_first_vec):
+ 	jz	L(return_null)
+ 	bsrl	%eax, %eax
+ 	leaq	-VEC_SIZE(%rdi, %rax), %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(return_null):
+ 	xorl	%eax, %eax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ END (STRRCHR)
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S
+new file mode 100644
+index 00000000..d49dbbf0
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S
+@@ -0,0 +1,3 @@
++#define STRCHR __wcschr_avx2_rtm
++#define USE_AS_WCSCHR 1
++#include "strchr-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S
+new file mode 100644
+index 00000000..d6ca2b80
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S
+@@ -0,0 +1,4 @@
++#define STRCMP __wcscmp_avx2_rtm
++#define USE_AS_WCSCMP 1
++
++#include "strcmp-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S
+new file mode 100644
+index 00000000..35658d73
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S
+@@ -0,0 +1,4 @@
++#define STRLEN __wcslen_avx2_rtm
++#define USE_AS_WCSLEN 1
++
++#include "strlen-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
+new file mode 100644
+index 00000000..4e88c70c
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
+@@ -0,0 +1,5 @@
++#define STRCMP __wcsncmp_avx2_rtm
++#define USE_AS_STRNCMP 1
++#define USE_AS_WCSCMP 1
++
++#include "strcmp-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S
+new file mode 100644
+index 00000000..7437ebee
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S
+@@ -0,0 +1,5 @@
++#define STRLEN __wcsnlen_avx2_rtm
++#define USE_AS_WCSLEN 1
++#define USE_AS_STRNLEN 1
++
++#include "strlen-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
+index 84254b83..20b731ae 100644
+--- a/sysdeps/x86_64/multiarch/wcsnlen.c
++++ b/sysdeps/x86_64/multiarch/wcsnlen.c
+@@ -29,6 +29,7 @@
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+@@ -44,6 +45,9 @@ IFUNC_SELECTOR (void)
+ 	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ 	return OPTIMIZE (evex);
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_rtm);
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2);
+     }
+diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S
+new file mode 100644
+index 00000000..9bf76083
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S
+@@ -0,0 +1,3 @@
++#define STRRCHR __wcsrchr_avx2_rtm
++#define USE_AS_WCSRCHR 1
++#include "strrchr-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S
+new file mode 100644
+index 00000000..58ed21db
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S
+@@ -0,0 +1,4 @@
++#define MEMCHR __wmemchr_avx2_rtm
++#define USE_AS_WMEMCHR 1
++
++#include "memchr-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S
+new file mode 100644
+index 00000000..31104d12
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S
+@@ -0,0 +1,4 @@
++#define MEMCMP __wmemcmp_avx2_movbe_rtm
++#define USE_AS_WMEMCMP 1
++
++#include "memcmp-avx2-movbe-rtm.S"
+diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
+index 1738d7f9..223f1a59 100644
+--- a/sysdeps/x86_64/sysdep.h
++++ b/sysdeps/x86_64/sysdep.h
+@@ -95,6 +95,28 @@ lose:									      \
+ #define R14_LP	r14
+ #define R15_LP	r15
+ 
++/* Zero upper vector registers and return with xtest.  NB: Use VZEROALL
++   to avoid an RTM abort triggered by VZEROUPPER inside a transaction.  */
++#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \
++	xtest;							\
++	jz	1f;						\
++	vzeroall;						\
++	ret;							\
++1:								\
++	vzeroupper;						\
++	ret
++
++/* Zero upper vector registers and return.  */
++#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN
++# define ZERO_UPPER_VEC_REGISTERS_RETURN \
++	VZEROUPPER;						\
++	ret
++#endif
++
++#ifndef VZEROUPPER_RETURN
++# define VZEROUPPER_RETURN	VZEROUPPER; ret
++#endif
++
+ #else	/* __ASSEMBLER__ */
+ 
+ /* Long and pointer size in bytes.  */
+-- 
+GitLab
+
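The sysdeps/x86_64/sysdep.h hunk above introduces ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST, which tests XTEST and clears the upper vector state with VZEROALL when running inside an RTM transaction (where VZEROUPPER would trigger an abort) and with the cheaper VZEROUPPER otherwise; the per-function *-avx2-rtm.S wrappers simply route every return through that macro and place the code in a .text.avx.rtm section. A minimal C sketch of the same decision, using compiler intrinsics purely for illustration (the function name is invented, this is not part of the patch, and it assumes GCC or Clang with -mavx -mrtm):

    #include <immintrin.h>

    /* Illustrative only: mirrors the branch performed by the
       ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST assembly macro.  */
    static inline void
    clear_upper_vector_state (void)
    {
      if (_xtest ())
        /* Inside an RTM transaction: VZEROUPPER could abort it, so clear
           the vector registers with VZEROALL instead.  */
        _mm256_zeroall ();
      else
        /* Outside a transaction: the usual VZEROUPPER is enough.  */
        _mm256_zeroupper ();
    }

The accompanying ifunc changes then prefer the __*_avx2_rtm variants whenever the RTM feature is usable and the EVEX implementation is not selected.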
diff --git a/SOURCES/ia-bmi2-req-strchr-avx2.patch b/SOURCES/ia-bmi2-req-strchr-avx2.patch
new file mode 100644
index 0000000..a4316f6
--- /dev/null
+++ b/SOURCES/ia-bmi2-req-strchr-avx2.patch
@@ -0,0 +1,93 @@
+From 6b2150da7cac8cccdcf1de189b3ab7cbd6519535 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 19 Apr 2021 10:45:07 -0700
+Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
+
+Since strchr-avx2.S, as updated by
+
+commit 1f745ecc2109890886b161d4791e1406fdfc29b8
+Author: noah <goldstein.w.n@gmail.com>
+Date:   Wed Feb 3 00:38:59 2021 -0500
+
+    x86-64: Refactor and improve performance of strchr-avx2.S
+
+uses sarx:
+
+c4 e2 72 f7 c0       	sarx   %ecx,%eax,%eax
+
+for the strchr-avx2 family of functions, require BMI2 in ifunc-impl-list.c and
+ifunc-avx2.h.
+
+(cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359)
+---
+ sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
+ 2 files changed, 11 insertions(+), 5 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
+index e0f30e61..ef72b73f 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
++++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
+@@ -30,11 +30,11 @@ IFUNC_SELECTOR (void)
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ 	return OPTIMIZE (evex);
+ 
+       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 695cdba6..85b8863a 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -400,10 +400,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/strchr.c.  */
+   IFUNC_IMPL (i, name, strchr,
+ 	      IFUNC_IMPL_ADD (array, i, strchr,
+-			      CPU_FEATURE_USABLE (AVX2),
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strchr_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, strchr,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strchr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strchr,
+@@ -417,10 +419,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/strchrnul.c.  */
+   IFUNC_IMPL (i, name, strchrnul,
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+-			      CPU_FEATURE_USABLE (AVX2),
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strchrnul_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strchrnul_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+@@ -574,10 +578,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/wcschr.c.  */
+   IFUNC_IMPL (i, name, wcschr,
+ 	      IFUNC_IMPL_ADD (array, i, wcschr,
+-			      CPU_FEATURE_USABLE (AVX2),
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcschr_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, wcschr,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wcschr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcschr,
+-- 
+GitLab
+
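The patch above makes the AVX2 strchr/strchrnul/wcschr implementations require BMI2 in the IFUNC selectors because the rewritten strchr-avx2.S executes the BMI2 sarx instruction. As a hedged illustration of the same selection rule outside glibc's internal CPU_FEATURE_USABLE machinery, the following user-level IFUNC resolver only hands out the AVX2 variant when BMI2 is also reported; every my_strchr_* name is a placeholder and the "AVX2" body is a stand-in rather than real vector code (assumes GCC on an ELF target):

    #include <string.h>

    /* Placeholder bodies; a real build would supply an AVX2+BMI2
       implementation for the first one.  */
    static char *
    my_strchr_avx2 (const char *s, int c)
    {
      return strchr (s, c);
    }

    static char *
    my_strchr_generic (const char *s, int c)
    {
      return strchr (s, c);
    }

    /* Resolver: pick the AVX2 variant only if BMI2 is also usable,
       matching the requirement the patch adds to ifunc-avx2.h.  */
    char *(*resolve_my_strchr (void)) (const char *, int)
    {
      __builtin_cpu_init ();
      if (__builtin_cpu_supports ("avx2") && __builtin_cpu_supports ("bmi2"))
        return my_strchr_avx2;
      return my_strchr_generic;
    }

    char *my_strchr (const char *s, int c)
      __attribute__ ((ifunc ("resolve_my_strchr")));

Callers simply use my_strchr (); the dynamic linker runs the resolver once at relocation time, which is also when glibc's own IFUNC_SELECTOR above is consulted.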
diff --git a/SOURCES/ia-bmi2-req-strlen-strnlen.patch b/SOURCES/ia-bmi2-req-strlen-strnlen.patch
new file mode 100644
index 0000000..7533cdd
--- /dev/null
+++ b/SOURCES/ia-bmi2-req-strlen-strnlen.patch
@@ -0,0 +1,52 @@
+From 6518570fa9ff5c4d0a7e0c4b6788fb0f8e9ebb22 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 19 Apr 2021 07:07:21 -0700
+Subject: [PATCH] x86-64: Require BMI2 for __strlen_evex and __strnlen_evex
+
+Since __strlen_evex and __strnlen_evex, added by
+
+commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Fri Mar 5 06:24:52 2021 -0800
+
+    x86-64: Add ifunc-avx2.h functions with 256-bit EVEX
+
+use sarx:
+
+c4 e2 6a f7 c0       	sarx   %edx,%eax,%eax
+
+require BMI2 for __strlen_evex and __strnlen_evex in ifunc-impl-list.c.
+ifunc-avx2.h already requires BMI2 for the EVEX implementation.
+
+(cherry picked from commit 55bf411b451c13f0fb7ff3d3bf9a820020b45df1)
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index fec384f6..cbfc1a5d 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -293,7 +293,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __strlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strlen_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
+ 
+@@ -308,7 +309,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __strnlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strnlen_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
+ 
+-- 
+GitLab
+
diff --git a/SOURCES/ia-double-rep_movsb_threshold-erms.patch b/SOURCES/ia-double-rep_movsb_threshold-erms.patch
new file mode 100644
index 0000000..2c8534c
--- /dev/null
+++ b/SOURCES/ia-double-rep_movsb_threshold-erms.patch
@@ -0,0 +1,136 @@
+From aa1f037077a41b36dd477e6ca754e207b37d661a Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 2 Mar 2022 16:27:24 -0800
+Subject: [PATCH] x86: Double size of ERMS rep_movsb_threshold in
+ dl-cacheinfo.h
+
+No bug.
+
+This patch doubles the rep_movsb_threshold when using ERMS. Based on
+benchmarks, the vector copy loop, especially now that it handles 4k
+aliasing, is better for these medium-range copy sizes.
+
+On Skylake with ERMS:
+
+Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
+4096,   0,      0,      0,      0.975
+4096,   0,      0,      1,      0.953
+4096,   12,     0,      0,      0.969
+4096,   12,     0,      1,      0.872
+4096,   44,     0,      0,      0.979
+4096,   44,     0,      1,      0.83
+4096,   0,      12,     0,      1.006
+4096,   0,      12,     1,      0.989
+4096,   0,      44,     0,      0.739
+4096,   0,      44,     1,      0.942
+4096,   12,     12,     0,      1.009
+4096,   12,     12,     1,      0.973
+4096,   44,     44,     0,      0.791
+4096,   44,     44,     1,      0.961
+4096,   2048,   0,      0,      0.978
+4096,   2048,   0,      1,      0.951
+4096,   2060,   0,      0,      0.986
+4096,   2060,   0,      1,      0.963
+4096,   2048,   12,     0,      0.971
+4096,   2048,   12,     1,      0.941
+4096,   2060,   12,     0,      0.977
+4096,   2060,   12,     1,      0.949
+8192,   0,      0,      0,      0.85
+8192,   0,      0,      1,      0.845
+8192,   13,     0,      0,      0.937
+8192,   13,     0,      1,      0.939
+8192,   45,     0,      0,      0.932
+8192,   45,     0,      1,      0.927
+8192,   0,      13,     0,      0.621
+8192,   0,      13,     1,      0.62
+8192,   0,      45,     0,      0.53
+8192,   0,      45,     1,      0.516
+8192,   13,     13,     0,      0.664
+8192,   13,     13,     1,      0.659
+8192,   45,     45,     0,      0.593
+8192,   45,     45,     1,      0.575
+8192,   2048,   0,      0,      0.854
+8192,   2048,   0,      1,      0.834
+8192,   2061,   0,      0,      0.863
+8192,   2061,   0,      1,      0.857
+8192,   2048,   13,     0,      0.63
+8192,   2048,   13,     1,      0.629
+8192,   2061,   13,     0,      0.627
+8192,   2061,   13,     1,      0.62
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit 475b63702ef38b69558fc3d31a0b66776a70f1d3)
+---
+ sysdeps/x86/cacheinfo.h      |  8 +++++---
+ sysdeps/x86/dl-tunables.list | 26 +++++++++++++++-----------
+ 2 files changed, 20 insertions(+), 14 deletions(-)
+
+diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
+index cc3941d3..ac025e08 100644
+--- a/sysdeps/x86/cacheinfo.h
++++ b/sysdeps/x86/cacheinfo.h
+@@ -411,18 +411,20 @@ init_cacheinfo (void)
+ 
+   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
+   unsigned int minimum_rep_movsb_threshold;
+-  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
++  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
++     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
++     threshold is 2048 * (VEC_SIZE / 16).  */
+   unsigned int rep_movsb_threshold;
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
+     {
+-      rep_movsb_threshold = 2048 * (64 / 16);
++      rep_movsb_threshold = 4096 * (64 / 16);
+       minimum_rep_movsb_threshold = 64 * 8;
+     }
+   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
+ 				    AVX_Fast_Unaligned_Load))
+     {
+-      rep_movsb_threshold = 2048 * (32 / 16);
++      rep_movsb_threshold = 4096 * (32 / 16);
+       minimum_rep_movsb_threshold = 32 * 8;
+     }
+   else
+diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
+index 89bf2966..56c6834a 100644
+--- a/sysdeps/x86/dl-tunables.list
++++ b/sysdeps/x86/dl-tunables.list
+@@ -32,17 +32,21 @@ glibc {
+     }
+     x86_rep_movsb_threshold {
+       type: SIZE_T
+-      # Since there is overhead to set up REP MOVSB operation, REP MOVSB
+-      # isn't faster on short data.  The memcpy micro benchmark in glibc
+-      # shows that 2KB is the approximate value above which REP MOVSB
+-      # becomes faster than SSE2 optimization on processors with Enhanced
+-      # REP MOVSB.  Since larger register size can move more data with a
+-      # single load and store, the threshold is higher with larger register
+-      # size.  Note: Since the REP MOVSB threshold must be greater than 8
+-      # times of vector size and the default value is 2048 * (vector size
+-      # / 16), the default value and the minimum value must be updated at
+-      # run-time.  NB: Don't set the default value since we can't tell if
+-      # the tunable value is set by user or not [BZ #27069].
++      # Since there is overhead to set up REP MOVSB operation, REP
++      # MOVSB isn't faster on short data.  The memcpy micro benchmark
++      # in glibc shows that 2KB is the approximate value above which
++      # REP MOVSB becomes faster than SSE2 optimization on processors
++      # with Enhanced REP MOVSB.  Since larger register size can move
++      # more data with a single load and store, the threshold is
++      # higher with larger register size.  Micro benchmarks show AVX
++      # REP MOVSB becomes faster at approximately 8KB.  The AVX512
++      # threshold is extrapolated to 16KB.  For machines with FSRM the
++      # threshold is universally set at 2112 bytes.  Note: Since the
++      # REP MOVSB threshold must be greater than 8 times of vector
++      # size and the default value is 4096 * (vector size / 16), the
++      # default value and the minimum value must be updated at
++      # run-time.  NB: Don't set the default value since we can't tell
++      # if the tunable value is set by user or not [BZ #27069].
+       minval: 1
+     }
+     x86_rep_stosb_threshold {
+-- 
+GitLab
+
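For concreteness, the new defaults described above work out to 8 KiB for 32-byte vectors and 16 KiB for 64-byte vectors, while the 16-byte (SSE2) case keeps its old 2 KiB value. A throwaway C snippet that only reproduces that arithmetic from the patched init_cacheinfo (illustration, not glibc code):

    #include <stdio.h>

    int
    main (void)
    {
      for (unsigned int vec_size = 16; vec_size <= 64; vec_size *= 2)
        {
          /* 2048 * (VEC_SIZE / 16) for 16-byte vectors, 4096 * (VEC_SIZE / 16)
             for 32- and 64-byte vectors, as in the patched init_cacheinfo.  */
          unsigned int base = vec_size == 16 ? 2048 : 4096;
          printf ("VEC_SIZE = %2u -> rep_movsb_threshold = %u bytes\n",
                  vec_size, base * (vec_size / 16));
        }
      return 0;
    }

The value can still be overridden at run time through the GLIBC_TUNABLES environment variable using the x86_rep_movsb_threshold tunable shown in the dl-tunables.list hunk; the exact tunable namespace prefix depends on the glibc release.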
diff --git a/SOURCES/ia-ifdef-indt-strlen-evex.patch b/SOURCES/ia-ifdef-indt-strlen-evex.patch
new file mode 100644
index 0000000..d0f63fd
--- /dev/null
+++ b/SOURCES/ia-ifdef-indt-strlen-evex.patch
@@ -0,0 +1,72 @@
+From b1ecc9d86746b8c0285935eae99c31ede19a7d9e Mon Sep 17 00:00:00 2001
+From: Sunil K Pandey <skpgkp2@gmail.com>
+Date: Thu, 1 Apr 2021 15:47:04 -0700
+Subject: [PATCH] x86-64: Fix ifdef indentation in strlen-evex.S
+
+Fix the indentation of some ifdefs in strlen-evex.S that are off by one
+and confusing to read.
+
+(cherry picked from commit 595c22ecd8e87a27fd19270ed30fdbae9ad25426)
+---
+ sysdeps/x86_64/multiarch/strlen-evex.S | 16 ++++++++--------
+ 1 file changed, 8 insertions(+), 8 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
+index cd022509..05838190 100644
+--- a/sysdeps/x86_64/multiarch/strlen-evex.S
++++ b/sysdeps/x86_64/multiarch/strlen-evex.S
+@@ -276,10 +276,10 @@ L(last_2x_vec):
+ 	.p2align 4
+ L(first_vec_x0_check):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
++#  ifdef USE_AS_WCSLEN
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %eax
+-# endif
++#  endif
+ 	/* Check the end of data.  */
+ 	cmpq	%rax, %rsi
+ 	jbe	L(max)
+@@ -293,10 +293,10 @@ L(first_vec_x0_check):
+ 	.p2align 4
+ L(first_vec_x1_check):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
++#  ifdef USE_AS_WCSLEN
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %eax
+-# endif
++#  endif
+ 	/* Check the end of data.  */
+ 	cmpq	%rax, %rsi
+ 	jbe	L(max)
+@@ -311,10 +311,10 @@ L(first_vec_x1_check):
+ 	.p2align 4
+ L(first_vec_x2_check):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
++#  ifdef USE_AS_WCSLEN
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %eax
+-# endif
++#  endif
+ 	/* Check the end of data.  */
+ 	cmpq	%rax, %rsi
+ 	jbe	L(max)
+@@ -329,10 +329,10 @@ L(first_vec_x2_check):
+ 	.p2align 4
+ L(first_vec_x3_check):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
++#  ifdef USE_AS_WCSLEN
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %eax
+-# endif
++#  endif
+ 	/* Check the end of data.  */
+ 	cmpq	%rax, %rsi
+ 	jbe	L(max)
+-- 
+GitLab
+
diff --git a/SOURCES/ia-imp-strcmp-evex.patch b/SOURCES/ia-imp-strcmp-evex.patch
new file mode 100644
index 0000000..52229e9
--- /dev/null
+++ b/SOURCES/ia-imp-strcmp-evex.patch
@@ -0,0 +1,695 @@
+From 4c0828b81a6d4fd6ca49a4fcef45bb6b479d0d67 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 29 Oct 2021 12:40:20 -0700
+Subject: [PATCH] x86-64: Improve EVEX strcmp with masked load
+
+In strcmp-evex.S, to compare 2 32-byte strings, replace
+
+        VMOVU   (%rdi, %rdx), %YMM0
+        VMOVU   (%rsi, %rdx), %YMM1
+        /* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
+        VPCMP   $4, %YMM0, %YMM1, %k0
+        VPCMP   $0, %YMMZERO, %YMM0, %k1
+        VPCMP   $0, %YMMZERO, %YMM1, %k2
+        /* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
+        kord    %k1, %k2, %k1
+        /* Each bit in K1 represents a NULL or a mismatch.  */
+        kord    %k0, %k1, %k1
+        kmovd   %k1, %ecx
+        testl   %ecx, %ecx
+        jne     L(last_vector)
+
+with
+
+        VMOVU   (%rdi, %rdx), %YMM0
+        VPTESTM %YMM0, %YMM0, %k2
+        /* Each bit cleared in K1 represents a mismatch or a null CHAR
+           in YMM0 and 32 bytes at (%rsi, %rdx).  */
+        VPCMP   $0, (%rsi, %rdx), %YMM0, %k1{%k2}
+        kmovd   %k1, %ecx
+        incl    %ecx
+        jne     L(last_vector)
+
+This makes EVEX strcmp up to 40% faster than AVX2 strcmp on Tiger Lake
+and Ice Lake.
+
+Co-Authored-By: Noah Goldstein <goldstein.w.n@gmail.com>
+(cherry picked from commit c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55)
+---
+ sysdeps/x86_64/multiarch/strcmp-evex.S | 461 +++++++++++++------------
+ 1 file changed, 243 insertions(+), 218 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index d5aa6daa..82f12ac8 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -41,6 +41,8 @@
+ # ifdef USE_AS_WCSCMP
+ /* Compare packed dwords.  */
+ #  define VPCMP		vpcmpd
++#  define VPMINU	vpminud
++#  define VPTESTM	vptestmd
+ #  define SHIFT_REG32	r8d
+ #  define SHIFT_REG64	r8
+ /* 1 dword char == 4 bytes.  */
+@@ -48,6 +50,8 @@
+ # else
+ /* Compare packed bytes.  */
+ #  define VPCMP		vpcmpb
++#  define VPMINU	vpminub
++#  define VPTESTM	vptestmb
+ #  define SHIFT_REG32	ecx
+ #  define SHIFT_REG64	rcx
+ /* 1 byte char == 1 byte.  */
+@@ -67,6 +71,9 @@
+ # define YMM5		ymm22
+ # define YMM6		ymm23
+ # define YMM7		ymm24
++# define YMM8		ymm25
++# define YMM9		ymm26
++# define YMM10		ymm27
+ 
+ /* Warning!
+            wcscmp/wcsncmp have to use SIGNED comparison for elements.
+@@ -76,7 +83,7 @@
+ /* The main idea of the string comparison (byte or dword) using 256-bit
+    EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
+    latter can be on either packed bytes or dwords depending on
+-   USE_AS_WCSCMP. In order to check the null char, algorithm keeps the
++   USE_AS_WCSCMP. In order to check the null CHAR, algorithm keeps the
+    matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
+    KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes)
+    are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
+@@ -123,27 +130,21 @@ ENTRY (STRCMP)
+ 	jg	L(cross_page)
+ 	/* Start comparing 4 vectors.  */
+ 	VMOVU	(%rdi), %YMM0
+-	VMOVU	(%rsi), %YMM1
+ 
+-	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
+-	VPCMP	$4, %YMM0, %YMM1, %k0
++	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
++	VPTESTM	%YMM0, %YMM0, %k2
+ 
+-	/* Check for NULL in YMM0.  */
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	/* Check for NULL in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k2
+-	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
+-	kord	%k1, %k2, %k1
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in YMM0 and 32 bytes at (%rsi).  */
++	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
+ 
+-	/* Each bit in K1 represents:
+-	   1. A mismatch in YMM0 and YMM1.  Or
+-	   2. A NULL in YMM0 or YMM1.
+-	 */
+-	kord	%k0, %k1, %k1
+-
+-	ktestd	%k1, %k1
+-	je	L(next_3_vectors)
+ 	kmovd	%k1, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
++	je	L(next_3_vectors)
+ 	tzcntl	%ecx, %edx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -172,9 +173,7 @@ L(return):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+ L(return_vec_size):
+-	kmovd	%k1, %ecx
+ 	tzcntl	%ecx, %edx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -210,9 +209,7 @@ L(return_vec_size):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+ L(return_2_vec_size):
+-	kmovd	%k1, %ecx
+ 	tzcntl	%ecx, %edx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -248,9 +245,7 @@ L(return_2_vec_size):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+ L(return_3_vec_size):
+-	kmovd	%k1, %ecx
+ 	tzcntl	%ecx, %edx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -289,43 +284,45 @@ L(return_3_vec_size):
+ 	.p2align 4
+ L(next_3_vectors):
+ 	VMOVU	VEC_SIZE(%rdi), %YMM0
+-	VMOVU	VEC_SIZE(%rsi), %YMM1
+-	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
+-	VPCMP	$4, %YMM0, %YMM1, %k0
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	VPCMP	$0, %YMMZERO, %YMM1, %k2
+-	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+-	ktestd	%k1, %k1
++	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
++	VPTESTM	%YMM0, %YMM0, %k2
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in YMM0 and 32 bytes at VEC_SIZE(%rsi).  */
++	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
++	kmovd	%k1, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
+ 	jne	L(return_vec_size)
+ 
+-	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM2
+-	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM3
+-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM4
+-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM5
+-
+-	/* Each bit in K0 represents a mismatch in YMM2 and YMM4.  */
+-	VPCMP	$4, %YMM2, %YMM4, %k0
+-	VPCMP	$0, %YMMZERO, %YMM2, %k1
+-	VPCMP	$0, %YMMZERO, %YMM4, %k2
+-	/* Each bit in K1 represents a NULL in YMM2 or YMM4.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+-	ktestd	%k1, %k1
++	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
++	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
++	VPTESTM	%YMM0, %YMM0, %k2
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
++	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
++	kmovd	%k1, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
+ 	jne	L(return_2_vec_size)
+ 
+-	/* Each bit in K0 represents a mismatch in YMM3 and YMM5.  */
+-	VPCMP	$4, %YMM3, %YMM5, %k0
+-	VPCMP	$0, %YMMZERO, %YMM3, %k1
+-	VPCMP	$0, %YMMZERO, %YMM5, %k2
+-	/* Each bit in K1 represents a NULL in YMM3 or YMM5.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+-	ktestd	%k1, %k1
++	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
++	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
++	VPTESTM	%YMM0, %YMM0, %k2
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
++	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
++	kmovd	%k1, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
+ 	jne	L(return_3_vec_size)
+ L(main_loop_header):
+ 	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+@@ -375,56 +372,51 @@ L(back_to_loop):
+ 	VMOVA	VEC_SIZE(%rax), %YMM2
+ 	VMOVA	(VEC_SIZE * 2)(%rax), %YMM4
+ 	VMOVA	(VEC_SIZE * 3)(%rax), %YMM6
+-	VMOVU	(%rdx), %YMM1
+-	VMOVU	VEC_SIZE(%rdx), %YMM3
+-	VMOVU	(VEC_SIZE * 2)(%rdx), %YMM5
+-	VMOVU	(VEC_SIZE * 3)(%rdx), %YMM7
+-
+-	VPCMP	$4, %YMM0, %YMM1, %k0
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	VPCMP	$0, %YMMZERO, %YMM1, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K4 represents a NULL or a mismatch in YMM0 and
+-	   YMM1.  */
+-	kord	%k0, %k1, %k4
+-
+-	VPCMP	$4, %YMM2, %YMM3, %k0
+-	VPCMP	$0, %YMMZERO, %YMM2, %k1
+-	VPCMP	$0, %YMMZERO, %YMM3, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K5 represents a NULL or a mismatch in YMM2 and
+-	   YMM3.  */
+-	kord	%k0, %k1, %k5
+-
+-	VPCMP	$4, %YMM4, %YMM5, %k0
+-	VPCMP	$0, %YMMZERO, %YMM4, %k1
+-	VPCMP	$0, %YMMZERO, %YMM5, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K6 represents a NULL or a mismatch in YMM4 and
+-	   YMM5.  */
+-	kord	%k0, %k1, %k6
+-
+-	VPCMP	$4, %YMM6, %YMM7, %k0
+-	VPCMP	$0, %YMMZERO, %YMM6, %k1
+-	VPCMP	$0, %YMMZERO, %YMM7, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K7 represents a NULL or a mismatch in YMM6 and
+-	   YMM7.  */
+-	kord	%k0, %k1, %k7
+-
+-	kord	%k4, %k5, %k0
+-	kord	%k6, %k7, %k1
+-
+-	/* Test each mask (32 bits) individually because for VEC_SIZE
+-	   == 32 is not possible to OR the four masks and keep all bits
+-	   in a 64-bit integer register, differing from SSE2 strcmp
+-	   where ORing is possible.  */
+-	kortestd %k0, %k1
+-	je	L(loop)
+-	ktestd	%k4, %k4
++
++	VPMINU	%YMM0, %YMM2, %YMM8
++	VPMINU	%YMM4, %YMM6, %YMM9
++
++	/* A zero CHAR in YMM8 means that there is a null CHAR.  */
++	VPMINU	%YMM8, %YMM9, %YMM8
++
++	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
++	VPTESTM	%YMM8, %YMM8, %k1
++
++	/* (YMM ^ YMM): A non-zero CHAR represents a mismatch.  */
++	vpxorq	(%rdx), %YMM0, %YMM1
++	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM3
++	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
++	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM6, %YMM7
++
++	vporq	%YMM1, %YMM3, %YMM9
++	vporq	%YMM5, %YMM7, %YMM10
++
++	/* A non-zero CHAR in YMM9 represents a mismatch.  */
++	vporq	%YMM9, %YMM10, %YMM9
++
++	/* Each bit cleared in K0 represents a mismatch or a null CHAR.  */
++	VPCMP	$0, %YMMZERO, %YMM9, %k0{%k1}
++	kmovd   %k0, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
++	je	 L(loop)
++
++	/* Each bit set in K1 represents a non-null CHAR in YMM0.  */
++	VPTESTM	%YMM0, %YMM0, %k1
++	/* Each bit cleared in K0 represents a mismatch or a null CHAR
++	   in YMM0 and (%rdx).  */
++	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
++	kmovd	%k0, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
+ 	je	L(test_vec)
+-	kmovd	%k4, %edi
+-	tzcntl	%edi, %ecx
++	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %ecx
+@@ -466,9 +458,18 @@ L(test_vec):
+ 	cmpq	$VEC_SIZE, %r11
+ 	jbe	L(zero)
+ # endif
+-	ktestd	%k5, %k5
++	/* Each bit set in K1 represents a non-null CHAR in YMM2.  */
++	VPTESTM	%YMM2, %YMM2, %k1
++	/* Each bit cleared in K0 represents a mismatch or a null CHAR
++	   in YMM2 and VEC_SIZE(%rdx).  */
++	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
++	kmovd	%k0, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
+ 	je	L(test_2_vec)
+-	kmovd	%k5, %ecx
+ 	tzcntl	%ecx, %edi
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -512,9 +513,18 @@ L(test_2_vec):
+ 	cmpq	$(VEC_SIZE * 2), %r11
+ 	jbe	L(zero)
+ # endif
+-	ktestd	%k6, %k6
++	/* Each bit set in K1 represents a non-null CHAR in YMM4.  */
++	VPTESTM	%YMM4, %YMM4, %k1
++	/* Each bit cleared in K0 represents a mismatch or a null CHAR
++	   in YMM4 and (VEC_SIZE * 2)(%rdx).  */
++	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
++	kmovd	%k0, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
+ 	je	L(test_3_vec)
+-	kmovd	%k6, %ecx
+ 	tzcntl	%ecx, %edi
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -558,8 +568,18 @@ L(test_3_vec):
+ 	cmpq	$(VEC_SIZE * 3), %r11
+ 	jbe	L(zero)
+ # endif
+-	kmovd	%k7, %esi
+-	tzcntl	%esi, %ecx
++	/* Each bit set in K1 represents a non-null CHAR in YMM6.  */
++	VPTESTM	%YMM6, %YMM6, %k1
++	/* Each bit cleared in K0 represents a mismatch or a null CHAR
++	   in YMM6 and (VEC_SIZE * 3)(%rdx).  */
++	VPCMP	$0, %YMMZERO, %YMM7, %k0{%k1}
++	kmovd	%k0, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
++	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %ecx
+@@ -615,39 +635,51 @@ L(loop_cross_page):
+ 
+ 	VMOVU	(%rax, %r10), %YMM2
+ 	VMOVU	VEC_SIZE(%rax, %r10), %YMM3
+-	VMOVU	(%rdx, %r10), %YMM4
+-	VMOVU	VEC_SIZE(%rdx, %r10), %YMM5
+-
+-	VPCMP	$4, %YMM4, %YMM2, %k0
+-	VPCMP	$0, %YMMZERO, %YMM2, %k1
+-	VPCMP	$0, %YMMZERO, %YMM4, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch in YMM2 and
+-	   YMM4.  */
+-	kord	%k0, %k1, %k1
+-
+-	VPCMP	$4, %YMM5, %YMM3, %k3
+-	VPCMP	$0, %YMMZERO, %YMM3, %k4
+-	VPCMP	$0, %YMMZERO, %YMM5, %k5
+-	kord	%k4, %k5, %k4
+-	/* Each bit in K3 represents a NULL or a mismatch in YMM3 and
+-	   YMM5.  */
+-	kord	%k3, %k4, %k3
++
++	/* Each bit set in K2 represents a non-null CHAR in YMM2.  */
++	VPTESTM	%YMM2, %YMM2, %k2
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in YMM2 and 32 bytes at (%rdx, %r10).  */
++	VPCMP	$0, (%rdx, %r10), %YMM2, %k1{%k2}
++	kmovd	%k1, %r9d
++	/* Don't use subl since it is the lower 16/32 bits of RDI
++	   below.  */
++	notl	%r9d
++# ifdef USE_AS_WCSCMP
++	/* Only last 8 bits are valid.  */
++	andl	$0xff, %r9d
++# endif
++
++	/* Each bit set in K4 represents a non-null CHAR in YMM3.  */
++	VPTESTM	%YMM3, %YMM3, %k4
++	/* Each bit cleared in K3 represents a mismatch or a null CHAR
++	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
++	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
++	kmovd	%k3, %edi
++# ifdef USE_AS_WCSCMP
++	/* Don't use subl since it is the upper 8 bits of EDI below.  */
++	notl	%edi
++	andl	$0xff, %edi
++# else
++	incl	%edi
++# endif
+ 
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Each bit in K1/K3 represents 4-byte element.  */
+-	kshiftlw $8, %k3, %k2
++	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
++	sall	$8, %edi
+ 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+ 	   bytes.  */
+ 	movl	%ecx, %SHIFT_REG32
+ 	sarl	$2, %SHIFT_REG32
++
++	/* Each bit in EDI represents a null CHAR or a mismatch.  */
++	orl	%r9d, %edi
+ # else
+-	kshiftlq $32, %k3, %k2
+-# endif
++	salq	$32, %rdi
+ 
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	korq	%k1, %k2, %k1
+-	kmovq	%k1, %rdi
++	/* Each bit in RDI represents a null CHAR or a mismatch.  */
++	orq	%r9, %rdi
++# endif
+ 
+ 	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
+ 	shrxq	%SHIFT_REG64, %rdi, %rdi
+@@ -692,35 +724,45 @@ L(loop_cross_page_2_vec):
+ 	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
+ 	VMOVU	(VEC_SIZE * 2)(%rax, %r10), %YMM0
+ 	VMOVU	(VEC_SIZE * 3)(%rax, %r10), %YMM1
+-	VMOVU	(VEC_SIZE * 2)(%rdx, %r10), %YMM2
+-	VMOVU	(VEC_SIZE * 3)(%rdx, %r10), %YMM3
+-
+-	VPCMP	$4, %YMM0, %YMM2, %k0
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	VPCMP	$0, %YMMZERO, %YMM2, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch in YMM0 and
+-	   YMM2.  */
+-	kord	%k0, %k1, %k1
+-
+-	VPCMP	$4, %YMM1, %YMM3, %k3
+-	VPCMP	$0, %YMMZERO, %YMM1, %k4
+-	VPCMP	$0, %YMMZERO, %YMM3, %k5
+-	kord	%k4, %k5, %k4
+-	/* Each bit in K3 represents a NULL or a mismatch in YMM1 and
+-	   YMM3.  */
+-	kord	%k3, %k4, %k3
+ 
++	VPTESTM	%YMM0, %YMM0, %k2
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10).  */
++	VPCMP	$0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
++	kmovd	%k1, %r9d
++	/* Don't use subl since it is the lower 16/32 bits of RDI
++	   below.  */
++	notl	%r9d
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Each bit in K1/K3 represents 4-byte element.  */
+-	kshiftlw $8, %k3, %k2
++	/* Only last 8 bits are valid.  */
++	andl	$0xff, %r9d
++# endif
++
++	VPTESTM	%YMM1, %YMM1, %k4
++	/* Each bit cleared in K3 represents a mismatch or a null CHAR
++	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
++	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
++	kmovd	%k3, %edi
++# ifdef USE_AS_WCSCMP
++	/* Don't use subl since it is the upper 8 bits of EDI below.  */
++	notl	%edi
++	andl	$0xff, %edi
+ # else
+-	kshiftlq $32, %k3, %k2
++	incl	%edi
+ # endif
+ 
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	korq	%k1, %k2, %k1
+-	kmovq	%k1, %rdi
++# ifdef USE_AS_WCSCMP
++	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
++	sall	$8, %edi
++
++	/* Each bit in EDI represents a null CHAR or a mismatch.  */
++	orl	%r9d, %edi
++# else
++	salq	$32, %rdi
++
++	/* Each bit in RDI represents a null CHAR or a mismatch.  */
++	orq	%r9, %rdi
++# endif
+ 
+ 	xorl	%r8d, %r8d
+ 	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
+@@ -729,12 +771,15 @@ L(loop_cross_page_2_vec):
+ 	/* R8 has number of bytes skipped.  */
+ 	movl	%ecx, %r8d
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
++	/* NB: Divide shift count by 4 since each bit in RDI represent 4
+ 	   bytes.  */
+ 	sarl	$2, %ecx
+-# endif
++	/* Skip ECX bytes.  */
++	shrl	%cl, %edi
++# else
+ 	/* Skip ECX bytes.  */
+ 	shrq	%cl, %rdi
++# endif
+ 1:
+ 	/* Before jumping back to the loop, set ESI to the number of
+ 	   VEC_SIZE * 4 blocks before page crossing.  */
+@@ -818,7 +863,7 @@ L(cross_page_loop):
+ 	movzbl	(%rdi, %rdx), %eax
+ 	movzbl	(%rsi, %rdx), %ecx
+ # endif
+-	/* Check null char.  */
++	/* Check null CHAR.  */
+ 	testl	%eax, %eax
+ 	jne	L(cross_page_loop)
+ 	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
+@@ -901,18 +946,17 @@ L(cross_page):
+ 	jg	L(cross_page_1_vector)
+ L(loop_1_vector):
+ 	VMOVU	(%rdi, %rdx), %YMM0
+-	VMOVU	(%rsi, %rdx), %YMM1
+-
+-	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
+-	VPCMP	$4, %YMM0, %YMM1, %k0
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	VPCMP	$0, %YMMZERO, %YMM1, %k2
+-	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
++
++	VPTESTM	%YMM0, %YMM0, %k2
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in YMM0 and 32 bytes at (%rsi, %rdx).  */
++	VPCMP	$0, (%rsi, %rdx), %YMM0, %k1{%k2}
+ 	kmovd	%k1, %ecx
+-	testl	%ecx, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
+ 	jne	L(last_vector)
+ 
+ 	addl	$VEC_SIZE, %edx
+@@ -931,18 +975,17 @@ L(cross_page_1_vector):
+ 	cmpl	$(PAGE_SIZE - 16), %eax
+ 	jg	L(cross_page_1_xmm)
+ 	VMOVU	(%rdi, %rdx), %XMM0
+-	VMOVU	(%rsi, %rdx), %XMM1
+-
+-	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
+-	VPCMP	$4, %XMM0, %XMM1, %k0
+-	VPCMP	$0, %XMMZERO, %XMM0, %k1
+-	VPCMP	$0, %XMMZERO, %XMM1, %k2
+-	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
+-	korw	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	korw	%k0, %k1, %k1
+-	kmovw	%k1, %ecx
+-	testl	%ecx, %ecx
++
++	VPTESTM	%YMM0, %YMM0, %k2
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in XMM0 and 16 bytes at (%rsi, %rdx).  */
++	VPCMP	$0, (%rsi, %rdx), %XMM0, %k1{%k2}
++	kmovd	%k1, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xf, %ecx
++# else
++	subl	$0xffff, %ecx
++# endif
+ 	jne	L(last_vector)
+ 
+ 	addl	$16, %edx
+@@ -965,25 +1008,16 @@ L(cross_page_1_xmm):
+ 	vmovq	(%rdi, %rdx), %XMM0
+ 	vmovq	(%rsi, %rdx), %XMM1
+ 
+-	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
+-	VPCMP	$4, %XMM0, %XMM1, %k0
+-	VPCMP	$0, %XMMZERO, %XMM0, %k1
+-	VPCMP	$0, %XMMZERO, %XMM1, %k2
+-	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+-	kmovd	%k1, %ecx
+-
++	VPTESTM	%YMM0, %YMM0, %k2
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in XMM0 and XMM1.  */
++	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
++	kmovb	%k1, %ecx
+ # ifdef USE_AS_WCSCMP
+-	/* Only last 2 bits are valid.  */
+-	andl	$0x3, %ecx
++	subl	$0x3, %ecx
+ # else
+-	/* Only last 8 bits are valid.  */
+-	andl	$0xff, %ecx
++	subl	$0xff, %ecx
+ # endif
+-
+-	testl	%ecx, %ecx
+ 	jne	L(last_vector)
+ 
+ 	addl	$8, %edx
+@@ -1002,25 +1036,16 @@ L(cross_page_8bytes):
+ 	vmovd	(%rdi, %rdx), %XMM0
+ 	vmovd	(%rsi, %rdx), %XMM1
+ 
+-	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
+-	VPCMP	$4, %XMM0, %XMM1, %k0
+-	VPCMP	$0, %XMMZERO, %XMM0, %k1
+-	VPCMP	$0, %XMMZERO, %XMM1, %k2
+-	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
++	VPTESTM	%YMM0, %YMM0, %k2
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in XMM0 and XMM1.  */
++	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
+ 	kmovd	%k1, %ecx
+-
+ # ifdef USE_AS_WCSCMP
+-	/* Only the last bit is valid.  */
+-	andl	$0x1, %ecx
++	subl	$0x1, %ecx
+ # else
+-	/* Only last 4 bits are valid.  */
+-	andl	$0xf, %ecx
++	subl	$0xf, %ecx
+ # endif
+-
+-	testl	%ecx, %ecx
+ 	jne	L(last_vector)
+ 
+ 	addl	$4, %edx
+-- 
+GitLab
+
diff --git a/SOURCES/ia-impr-memmove-vec-unaligned-erms.patch b/SOURCES/ia-impr-memmove-vec-unaligned-erms.patch
new file mode 100644
index 0000000..15fd4cd
--- /dev/null
+++ b/SOURCES/ia-impr-memmove-vec-unaligned-erms.patch
@@ -0,0 +1,84 @@
+From 29dfcb178f889d1bd91e9eb9d98d7e74fddcde51 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sun, 23 May 2021 19:43:24 -0400
+Subject: [PATCH] x86: Improve memmove-vec-unaligned-erms.S
+
+This patch changes the condition for copying 4x VEC so that if length is
+exactly equal to 4 * VEC_SIZE it will use the 4x VEC case instead of the
+8x VEC case.
+
+Results For Skylake memcpy-avx2-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 0   , 9.137   , 6.873   , New , 75.22
+128 , 7   , 0   , 12.933  , 7.732   , New , 59.79
+128 , 0   , 7   , 11.852  , 6.76    , New , 57.04
+128 , 7   , 7   , 12.587  , 6.808   , New , 54.09
+
+Results For Icelake memcpy-evex-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 0   , 9.963   , 5.416   , New , 54.36
+128 , 7   , 0   , 16.467  , 8.061   , New , 48.95
+128 , 0   , 7   , 14.388  , 7.644   , New , 53.13
+128 , 7   , 7   , 14.546  , 7.642   , New , 52.54
+
+Results For Tigerlake memcpy-evex-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 0   , 8.979   , 4.95    , New , 55.13
+128 , 7   , 0   , 14.245  , 7.122   , New , 50.0
+128 , 0   , 7   , 12.668  , 6.675   , New , 52.69
+128 , 7   , 7   , 13.042  , 6.802   , New , 52.15
+
+Results For Skylake memmove-avx2-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 32  , 6.181   , 5.691   , New , 92.07
+128 , 32  , 0   , 6.165   , 5.752   , New , 93.3
+128 , 0   , 7   , 13.923  , 9.37    , New , 67.3
+128 , 7   , 0   , 12.049  , 10.182  , New , 84.5
+
+Results For Icelake memmove-evex-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 32  , 5.479   , 4.889   , New , 89.23
+128 , 32  , 0   , 5.127   , 4.911   , New , 95.79
+128 , 0   , 7   , 18.885  , 13.547  , New , 71.73
+128 , 7   , 0   , 15.565  , 14.436  , New , 92.75
+
+Results For Tigerlake memmove-evex-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 32  , 5.275   , 4.815   , New , 91.28
+128 , 32  , 0   , 5.376   , 4.565   , New , 84.91
+128 , 0   , 7   , 19.426  , 14.273  , New , 73.47
+128 , 7   , 0   , 15.924  , 14.951  , New , 93.89
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+(cherry picked from commit 1b992204f68af851e905c16016756fd4421e1934)
+---
+ sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 3e2dd6bc..572cef04 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -417,8 +417,8 @@ L(more_2x_vec):
+ 	cmpq	$(VEC_SIZE * 8), %rdx
+ 	ja	L(more_8x_vec)
+ 	cmpq	$(VEC_SIZE * 4), %rdx
+-	jb	L(last_4x_vec)
+-	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
++	jbe	L(last_4x_vec)
++	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
+ 	VMOVU	(%rsi), %VEC(0)
+ 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+@@ -437,7 +437,7 @@ L(more_2x_vec):
+ 	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+ 	VZEROUPPER_RETURN
+ L(last_4x_vec):
+-	/* Copy from 2 * VEC to 4 * VEC. */
++	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
+ 	VMOVU	(%rsi), %VEC(0)
+ 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+ 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
+-- 
+GitLab
+
diff --git a/SOURCES/ia-impr-memset-vec-unaligned-erms.patch b/SOURCES/ia-impr-memset-vec-unaligned-erms.patch
new file mode 100644
index 0000000..2256ac6
--- /dev/null
+++ b/SOURCES/ia-impr-memset-vec-unaligned-erms.patch
@@ -0,0 +1,104 @@
+From 91272636c23028e55554be4e677bf40ac22b1adc Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 20 May 2021 13:13:51 -0400
+Subject: [PATCH] x86: Improve memset-vec-unaligned-erms.S
+
+No bug. This commit makes a few small improvements to
+memset-vec-unaligned-erms.S. The changes are 1) only aligning to 64
+instead of 128. Either alignment will perform equally well in a loop
+and 128 just increases the odds of having to do an extra iteration
+which can be significant overhead for small values. 2) Align some
+targets and the loop. 3) Remove an ALU from the alignment process. 4)
+Reorder the last 4x VEC so that they are stored after the loop. 5)
+Move the condition for leq 8x VEC to before the alignment
+process. test-memset and test-wmemset are both passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit 6abf27980a947f9b6e514d6b33b83059d39566ae)
+---
+ .../multiarch/memset-vec-unaligned-erms.S     | 50 +++++++++++--------
+ 1 file changed, 28 insertions(+), 22 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index f877ac9d..909c33f6 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -173,17 +173,22 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ 	VMOVU	%VEC(0), (%rdi)
+ 	VZEROUPPER_RETURN
+ 
++	.p2align 4
+ L(stosb_more_2x_vec):
+ 	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
+ 	ja	L(stosb)
++#else
++	.p2align 4
+ #endif
+ L(more_2x_vec):
+-	cmpq  $(VEC_SIZE * 4), %rdx
+-	ja	L(loop_start)
++	/* Stores to first 2x VEC before cmp as any path forward will
++	   require it.  */
+ 	VMOVU	%VEC(0), (%rdi)
+ 	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
++	cmpq	$(VEC_SIZE * 4), %rdx
++	ja	L(loop_start)
+ 	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
++	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+ L(return):
+ #if VEC_SIZE > 16
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+@@ -192,28 +197,29 @@ L(return):
+ #endif
+ 
+ L(loop_start):
+-	leaq	(VEC_SIZE * 4)(%rdi), %rcx
+-	VMOVU	%VEC(0), (%rdi)
+-	andq	$-(VEC_SIZE * 4), %rcx
+-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+ 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
+ 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
+-	addq	%rdi, %rdx
+-	andq	$-(VEC_SIZE * 4), %rdx
+-	cmpq	%rdx, %rcx
+-	je	L(return)
++	cmpq	$(VEC_SIZE * 8), %rdx
++	jbe	L(loop_end)
++	andq	$-(VEC_SIZE * 2), %rdi
++	subq	$-(VEC_SIZE * 4), %rdi
++	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
++	.p2align 4
+ L(loop):
+-	VMOVA	%VEC(0), (%rcx)
+-	VMOVA	%VEC(0), VEC_SIZE(%rcx)
+-	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
+-	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
+-	addq	$(VEC_SIZE * 4), %rcx
+-	cmpq	%rcx, %rdx
+-	jne	L(loop)
++	VMOVA	%VEC(0), (%rdi)
++	VMOVA	%VEC(0), VEC_SIZE(%rdi)
++	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
++	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
++	subq	$-(VEC_SIZE * 4), %rdi
++	cmpq	%rcx, %rdi
++	jb	L(loop)
++L(loop_end):
++	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
++	       rdx as length is also unchanged.  */
++	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
++	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
++	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
++	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+ 	VZEROUPPER_SHORT_RETURN
+ 
+ 	.p2align 4
+-- 
+GitLab
+
diff --git a/SOURCES/ia-memchr-opt-avx2.patch b/SOURCES/ia-memchr-opt-avx2.patch
new file mode 100644
index 0000000..9dc6487
--- /dev/null
+++ b/SOURCES/ia-memchr-opt-avx2.patch
@@ -0,0 +1,584 @@
+From acd575144cc6340edfbf0a0e0580e38344ab623a Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 3 May 2021 03:01:58 -0400
+Subject: [PATCH] x86: Optimize memchr-avx2.S
+
+No bug. This commit optimizes memchr-avx2.S. The optimizations include
+replacing some branches with cmovcc, avoiding some branches entirely
+in the less_4x_vec case, making the page cross logic less strict,
+and saving a few instructions in the loop return path. test-memchr,
+test-rawmemchr, and test-wmemchr are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit acfd088a1963ba51cd83c78f95c0ab25ead79e04)
+---
+ sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++-----------
+ 1 file changed, 247 insertions(+), 178 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
+index cf893e77..b377f22e 100644
+--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
+@@ -26,8 +26,22 @@
+ 
+ # ifdef USE_AS_WMEMCHR
+ #  define VPCMPEQ	vpcmpeqd
++#  define VPBROADCAST	vpbroadcastd
++#  define CHAR_SIZE	4
+ # else
+ #  define VPCMPEQ	vpcmpeqb
++#  define VPBROADCAST	vpbroadcastb
++#  define CHAR_SIZE	1
++# endif
++
++# ifdef USE_AS_RAWMEMCHR
++#  define ERAW_PTR_REG	ecx
++#  define RRAW_PTR_REG	rcx
++#  define ALGN_PTR_REG	rdi
++# else
++#  define ERAW_PTR_REG	edi
++#  define RRAW_PTR_REG	rdi
++#  define ALGN_PTR_REG	rcx
+ # endif
+ 
+ # ifndef VZEROUPPER
+@@ -39,6 +53,7 @@
+ # endif
+ 
+ # define VEC_SIZE 32
++# define PAGE_SIZE 4096
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (MEMCHR)
+@@ -47,295 +62,349 @@ ENTRY (MEMCHR)
+ 	test	%RDX_LP, %RDX_LP
+ 	jz	L(null)
+ # endif
+-	movl	%edi, %ecx
+-	/* Broadcast CHAR to YMM0.  */
+-	vmovd	%esi, %xmm0
+ # ifdef USE_AS_WMEMCHR
+ 	shl	$2, %RDX_LP
+-	vpbroadcastd %xmm0, %ymm0
+ # else
+ #  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%edx, %edx
+ #  endif
+-	vpbroadcastb %xmm0, %ymm0
+ # endif
++	/* Broadcast CHAR to YMMMATCH.  */
++	vmovd	%esi, %xmm0
++	VPBROADCAST %xmm0, %ymm0
+ 	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
++	movl	%edi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
++	VPCMPEQ	(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-
+ # ifndef USE_AS_RAWMEMCHR
+-	jnz	L(first_vec_x0_check)
+-	/* Adjust length and check the end of data.  */
+-	subq	$VEC_SIZE, %rdx
+-	jbe	L(zero)
+-# else
+-	jnz	L(first_vec_x0)
++	/* If length < CHAR_PER_VEC handle special.  */
++	cmpq	$VEC_SIZE, %rdx
++	jbe	L(first_vec_x0)
+ # endif
+-
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
++	testl	%eax, %eax
++	jz	L(aligned_more)
++	tzcntl	%eax, %eax
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
+ 
+ # ifndef USE_AS_RAWMEMCHR
+-	/* Adjust length.  */
+-	addq	%rcx, %rdx
++	.p2align 5
++L(first_vec_x0):
++	/* Check if first match was before length.  */
++	tzcntl	%eax, %eax
++	xorl	%ecx, %ecx
++	cmpl	%eax, %edx
++	leaq	(%rdi, %rax), %rax
++	cmovle	%rcx, %rax
++	VZEROUPPER_RETURN
+ 
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
++L(null):
++	xorl	%eax, %eax
++	ret
+ # endif
+-	jmp	L(more_4x_vec)
+-
+ 	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
++L(cross_page_boundary):
++	/* Save pointer before aligning as its original value is necessary
++	   for computing the return address if byte is found or adjusting length
++	   if it is not and this is memchr.  */
++	movq	%rdi, %rcx
++	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
++	   rdi for rawmemchr.  */
++	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
++	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
++# ifndef USE_AS_RAWMEMCHR
++	/* Calculate length until end of page (length checked for a
++	   match).  */
++	leaq	1(%ALGN_PTR_REG), %rsi
++	subq	%RRAW_PTR_REG, %rsi
++# endif
+ 	/* Remove the leading bytes.  */
+-	sarl	%cl, %eax
+-	testl	%eax, %eax
+-	jz	L(aligned_more)
+-	tzcntl	%eax, %eax
++	sarxl	%ERAW_PTR_REG, %eax, %eax
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
++	cmpq	%rsi, %rdx
++	jbe	L(first_vec_x0)
+ # endif
+-	addq	%rdi, %rax
+-	addq	%rcx, %rax
++	testl	%eax, %eax
++	jz	L(cross_page_continue)
++	tzcntl	%eax, %eax
++	addq	%RRAW_PTR_REG, %rax
+ L(return_vzeroupper):
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+-L(aligned_more):
+-# ifndef USE_AS_RAWMEMCHR
+-        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
+-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
+-	   overflow.  */
+-	negq	%rcx
+-	addq	$VEC_SIZE, %rcx
++L(first_vec_x1):
++	tzcntl	%eax, %eax
++	incq	%rdi
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
+ 
+-	/* Check the end of data.  */
+-	subq	%rcx, %rdx
+-	jbe	L(zero)
+-# endif
++	.p2align 4
++L(first_vec_x2):
++	tzcntl	%eax, %eax
++	addq	$(VEC_SIZE + 1), %rdi
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
++
++	.p2align 4
++L(first_vec_x3):
++	tzcntl	%eax, %eax
++	addq	$(VEC_SIZE * 2 + 1), %rdi
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
+ 
+-	addq	$VEC_SIZE, %rdi
+ 
+-# ifndef USE_AS_RAWMEMCHR
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
++	.p2align 4
++L(first_vec_x4):
++	tzcntl	%eax, %eax
++	addq	$(VEC_SIZE * 3 + 1), %rdi
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
+ 
+-L(more_4x_vec):
++	.p2align 4
++L(aligned_more):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+ 
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
++# ifndef USE_AS_RAWMEMCHR
++L(cross_page_continue):
++	/* Align data to VEC_SIZE - 1.  */
++	xorl	%ecx, %ecx
++	subl	%edi, %ecx
++	orq	$(VEC_SIZE - 1), %rdi
++	/* esi is for adjusting length to see if near the end.  */
++	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
++# else
++	orq	$(VEC_SIZE - 1), %rdi
++L(cross_page_continue):
++# endif
++	/* Load first VEC regardless.  */
++	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
++# ifndef USE_AS_RAWMEMCHR
++	/* Adjust length. If near end handle specially.  */
++	subq	%rsi, %rdx
++	jbe	L(last_4x_vec_or_less)
++# endif
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
++	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
++	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
++	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb %ymm1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x4)
+ 
+ # ifndef USE_AS_RAWMEMCHR
++	/* Check if at last VEC_SIZE * 4 length.  */
+ 	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+-
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
+-	andq	$-(4 * VEC_SIZE), %rdi
+-
+-# ifndef USE_AS_RAWMEMCHR
+-	/* Adjust length.  */
++	jbe	L(last_4x_vec_or_less_cmpeq)
++	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
++	   length.  */
++	incq	%rdi
++	movl	%edi, %ecx
++	orq	$(VEC_SIZE * 4 - 1), %rdi
++	andl	$(VEC_SIZE * 4 - 1), %ecx
+ 	addq	%rcx, %rdx
++# else
++	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
++	incq	%rdi
++	orq	$(VEC_SIZE * 4 - 1), %rdi
+ # endif
+ 
++	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+-
++	VPCMPEQ	1(%rdi), %ymm0, %ymm1
++	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
++	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
++	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
+ 	vpor	%ymm1, %ymm2, %ymm5
+ 	vpor	%ymm3, %ymm4, %ymm6
+ 	vpor	%ymm5, %ymm6, %ymm5
+ 
+-	vpmovmskb %ymm5, %eax
+-	testl	%eax, %eax
+-	jnz	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
++	vpmovmskb %ymm5, %ecx
+ # ifdef USE_AS_RAWMEMCHR
+-	jmp	L(loop_4x_vec)
++	subq	$-(VEC_SIZE * 4), %rdi
++	testl	%ecx, %ecx
++	jz	L(loop_4x_vec)
+ # else
+-	subq	$(VEC_SIZE * 4), %rdx
+-	ja	L(loop_4x_vec)
++	testl	%ecx, %ecx
++	jnz	L(loop_4x_vec_end)
+ 
+-L(last_4x_vec_or_less):
+-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+-	addl	$(VEC_SIZE * 2), %edx
+-	jle	L(last_2x_vec)
++	subq	$-(VEC_SIZE * 4), %rdi
+ 
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
++	subq	$(VEC_SIZE * 4), %rdx
++	ja	L(loop_4x_vec)
+ 
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
++	/* Fall through into less than 4 remaining vectors of length case.
++	 */
++	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
++	.p2align 4
++L(last_4x_vec_or_less):
++	/* Check if first VEC contained match.  */
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
++	jnz	L(first_vec_x1_check)
+ 
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
++	/* If remaining length > VEC_SIZE * 2.  */
++	addl	$(VEC_SIZE * 2), %edx
++	jg	L(last_4x_vec)
+ 
+-	jnz	L(first_vec_x2_check)
+-	subl	$VEC_SIZE, %edx
+-	jle	L(zero)
++L(last_2x_vec):
++	/* If remaining length < VEC_SIZE.  */
++	addl	$VEC_SIZE, %edx
++	jle	L(zero_end)
+ 
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
++	/* Check VEC2 and compare any match with remaining length.  */
++	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-
+-	jnz	L(first_vec_x3_check)
+-	xorl	%eax, %eax
++	tzcntl	%eax, %eax
++	cmpl	%eax, %edx
++	jbe	L(set_zero_end)
++	addq	$(VEC_SIZE + 1), %rdi
++	addq	%rdi, %rax
++L(zero_end):
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	addl	$(VEC_SIZE * 2), %edx
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
++L(loop_4x_vec_end):
++# endif
++	/* rawmemchr will fall through into this if match was found in
++	   loop.  */
++
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
++	jnz	L(last_vec_x1_return)
+ 
+-	jnz	L(first_vec_x0_check)
+-	subl	$VEC_SIZE, %edx
+-	jle	L(zero)
+-
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
++	vpmovmskb %ymm2, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1_check)
+-	xorl	%eax, %eax
+-	VZEROUPPER_RETURN
++	jnz	L(last_vec_x2_return)
+ 
+-	.p2align 4
+-L(first_vec_x0_check):
+-	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
++	vpmovmskb %ymm3, %eax
++	/* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
++	salq	$32, %rcx
++	orq	%rcx, %rax
++	tzcntq	%rax, %rax
++# ifdef USE_AS_RAWMEMCHR
++	subq	$(VEC_SIZE * 2 - 1), %rdi
++# else
++	subq	$-(VEC_SIZE * 2 + 1), %rdi
++# endif
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
++# ifndef USE_AS_RAWMEMCHR
+ 
+ 	.p2align 4
+ L(first_vec_x1_check):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$VEC_SIZE, %rax
++	/* Adjust length.  */
++	subl	$-(VEC_SIZE * 4), %edx
++	/* Check if match within remaining length.  */
++	cmpl	%eax, %edx
++	jbe	L(set_zero_end)
++	incq	%rdi
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
++	.p2align 4
++L(set_zero_end):
++	xorl	%eax, %eax
++	VZEROUPPER_RETURN
++# endif
+ 
+ 	.p2align 4
+-L(first_vec_x2_check):
++L(last_vec_x1_return):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$(VEC_SIZE * 2), %rax
++# ifdef USE_AS_RAWMEMCHR
++	subq	$(VEC_SIZE * 4 - 1), %rdi
++# else
++	incq	%rdi
++# endif
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x3_check):
++L(last_vec_x2_return):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$(VEC_SIZE * 3), %rax
++# ifdef USE_AS_RAWMEMCHR
++	subq	$(VEC_SIZE * 3 - 1), %rdi
++# else
++	subq	$-(VEC_SIZE + 1), %rdi
++# endif
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
++# ifndef USE_AS_RAWMEMCHR
+ 	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	jmp     L(return_vzeroupper)
++L(last_4x_vec_or_less_cmpeq):
++	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb %ymm1, %eax
++	subq	$-(VEC_SIZE * 4), %rdi
++	/* Check first VEC regardless.  */
++	testl	%eax, %eax
++	jnz	L(first_vec_x1_check)
+ 
++	/* If remaining length <= CHAR_PER_VEC * 2.  */
++	addl	$(VEC_SIZE * 2), %edx
++	jle	L(last_2x_vec)
+ 	.p2align 4
+-L(null):
+-	xorl	%eax, %eax
+-	ret
+-# endif
++L(last_4x_vec):
++	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb %ymm1, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x2_return)
+ 
+-	.p2align 4
+-L(first_vec_x0):
+-	tzcntl	%eax, %eax
+-	addq	%rdi, %rax
+-	VZEROUPPER_RETURN
++	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb %ymm1, %eax
+ 
+-	.p2align 4
+-L(first_vec_x1):
+-	tzcntl	%eax, %eax
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-	VZEROUPPER_RETURN
++	/* Create mask for possible matches within remaining length.  */
++	movq	$-1, %rcx
++	bzhiq	%rdx, %rcx, %rcx
+ 
+-	.p2align 4
+-L(first_vec_x2):
++	/* Test matches in data against length match.  */
++	andl	%ecx, %eax
++	jnz	L(last_vec_x3)
++
++	/* if remaining length <= VEC_SIZE * 3 (Note this is after
++	   remaining length was found to be > VEC_SIZE * 2.  */
++	subl	$VEC_SIZE, %edx
++	jbe	L(zero_end2)
++
++	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb %ymm1, %eax
++	/* Shift remaining length mask for last VEC.  */
++	shrq	$32, %rcx
++	andl	%ecx, %eax
++	jz	L(zero_end2)
+ 	tzcntl	%eax, %eax
+-	addq	$(VEC_SIZE * 2), %rax
++	addq	$(VEC_SIZE * 3 + 1), %rdi
+ 	addq	%rdi, %rax
++L(zero_end2):
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(4x_vec_end):
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-	vpmovmskb %ymm2, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-	vpmovmskb %ymm3, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-	vpmovmskb %ymm4, %eax
+-	testl	%eax, %eax
+-L(first_vec_x3):
++L(last_vec_x3):
+ 	tzcntl	%eax, %eax
+-	addq	$(VEC_SIZE * 3), %rax
++	subq	$-(VEC_SIZE * 2 + 1), %rdi
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
++# endif
+ 
+ END (MEMCHR)
+ #endif
+-- 
+GitLab
+
diff --git a/SOURCES/ia-move-strlen-multiarch.patch b/SOURCES/ia-move-strlen-multiarch.patch
new file mode 100644
index 0000000..2b7084c
--- /dev/null
+++ b/SOURCES/ia-move-strlen-multiarch.patch
@@ -0,0 +1,563 @@
+From 3848cd2cab96c673c98ea339aeefd5a27837f587 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Tue, 22 Jun 2021 20:42:10 -0700
+Subject: [PATCH] x86-64: Move strlen.S to multiarch/strlen-vec.S
+
+Since strlen.S contains SSE2 version of strlen/strnlen and SSE4.1
+version of wcslen/wcsnlen, move strlen.S to multiarch/strlen-vec.S
+and include multiarch/strlen-vec.S from SSE2 and SSE4.1 variants.
+This also removes the unused symbols, __GI___strlen_sse2 and
+__GI___wcsnlen_sse4_1.
+
+(cherry picked from commit a0db678071c60b6c47c468d231dd0b3694ba7a98)
+---
+ sysdeps/x86_64/multiarch/strlen-sse2.S    |   2 +-
+ sysdeps/x86_64/multiarch/strlen-vec.S     | 257 ++++++++++++++++++++++
+ sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S |   2 +-
+ sysdeps/x86_64/strlen.S                   | 243 +-------------------
+ 4 files changed, 262 insertions(+), 242 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
+index 7bc57b8d..449c8a7f 100644
+--- a/sysdeps/x86_64/multiarch/strlen-sse2.S
++++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
+@@ -20,4 +20,4 @@
+ # define strlen __strlen_sse2
+ #endif
+ 
+-#include "../strlen.S"
++#include "strlen-vec.S"
+diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
+new file mode 100644
+index 00000000..8f660bb9
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strlen-vec.S
+@@ -0,0 +1,257 @@
++/* SSE2 version of strlen and SSE4.1 version of wcslen.
++   Copyright (C) 2012-2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <sysdep.h>
++
++#ifdef AS_WCSLEN
++# define PMINU		pminud
++# define PCMPEQ		pcmpeqd
++# define SHIFT_RETURN	shrq $2, %rax
++#else
++# define PMINU		pminub
++# define PCMPEQ		pcmpeqb
++# define SHIFT_RETURN
++#endif
++
++/* Long lived register in strlen(s), strnlen(s, n) are:
++
++	%xmm3 - zero
++	%rdi   - s
++	%r10  (s+n) & (~(64-1))
++	%r11   s+n
++*/
++
++
++.text
++ENTRY(strlen)
++
++/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
++#define FIND_ZERO	\
++	PCMPEQ	(%rax), %xmm0;	\
++	PCMPEQ	16(%rax), %xmm1;	\
++	PCMPEQ	32(%rax), %xmm2;	\
++	PCMPEQ	48(%rax), %xmm3;	\
++	pmovmskb	%xmm0, %esi;	\
++	pmovmskb	%xmm1, %edx;	\
++	pmovmskb	%xmm2, %r8d;	\
++	pmovmskb	%xmm3, %ecx;	\
++	salq	$16, %rdx;	\
++	salq	$16, %rcx;	\
++	orq	%rsi, %rdx;	\
++	orq	%r8, %rcx;	\
++	salq	$32, %rcx;	\
++	orq	%rcx, %rdx;
++
++#ifdef AS_STRNLEN
++/* Do not read anything when n==0.  */
++	test	%RSI_LP, %RSI_LP
++	jne	L(n_nonzero)
++	xor	%rax, %rax
++	ret
++L(n_nonzero):
++# ifdef AS_WCSLEN
++	shl	$2, %RSI_LP
++# endif
++
++/* Initialize long lived registers.  */
++
++	add	%RDI_LP, %RSI_LP
++	mov	%RSI_LP, %R10_LP
++	and	$-64, %R10_LP
++	mov	%RSI_LP, %R11_LP
++#endif
++
++	pxor	%xmm0, %xmm0
++	pxor	%xmm1, %xmm1
++	pxor	%xmm2, %xmm2
++	pxor	%xmm3, %xmm3
++	movq	%rdi, %rax
++	movq	%rdi, %rcx
++	andq	$4095, %rcx
++/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
++	cmpq	$4047, %rcx
++/* We cannot unify this branching as it would be ~6 cycles slower.  */
++	ja	L(cross_page)
++
++#ifdef AS_STRNLEN
++/* Test if end is among first 64 bytes.  */
++# define STRNLEN_PROLOG	\
++	mov	%r11, %rsi;	\
++	subq	%rax, %rsi;	\
++	andq	$-64, %rax;	\
++	testq	$-64, %rsi;	\
++	je	L(strnlen_ret)
++#else
++# define STRNLEN_PROLOG  andq $-64, %rax;
++#endif
++
++/* Ignore bits in mask that come before start of string.  */
++#define PROLOG(lab)	\
++	movq	%rdi, %rcx;	\
++	xorq	%rax, %rcx;	\
++	STRNLEN_PROLOG;	\
++	sarq	%cl, %rdx;	\
++	test	%rdx, %rdx;	\
++	je	L(lab);	\
++	bsfq	%rdx, %rax;	\
++	SHIFT_RETURN;		\
++	ret
++
++#ifdef AS_STRNLEN
++	andq	$-16, %rax
++	FIND_ZERO
++#else
++	/* Test first 16 bytes unaligned.  */
++	movdqu	(%rax), %xmm4
++	PCMPEQ	%xmm0, %xmm4
++	pmovmskb	%xmm4, %edx
++	test	%edx, %edx
++	je 	L(next48_bytes)
++	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
++	SHIFT_RETURN
++	ret
++
++L(next48_bytes):
++/* Same as FIND_ZERO except we do not check first 16 bytes.  */
++	andq	$-16, %rax
++	PCMPEQ 16(%rax), %xmm1
++	PCMPEQ 32(%rax), %xmm2
++	PCMPEQ 48(%rax), %xmm3
++	pmovmskb	%xmm1, %edx
++	pmovmskb	%xmm2, %r8d
++	pmovmskb	%xmm3, %ecx
++	salq	$16, %rdx
++	salq	$16, %rcx
++	orq	%r8, %rcx
++	salq	$32, %rcx
++	orq	%rcx, %rdx
++#endif
++
++	/* When no zero byte is found xmm1-3 are zero so we do not have to
++	   zero them.  */
++	PROLOG(loop)
++
++	.p2align 4
++L(cross_page):
++	andq	$-64, %rax
++	FIND_ZERO
++	PROLOG(loop_init)
++
++#ifdef AS_STRNLEN
++/* We must do this check to correctly handle strnlen (s, -1).  */
++L(strnlen_ret):
++	bts	%rsi, %rdx
++	sarq	%cl, %rdx
++	test	%rdx, %rdx
++	je	L(loop_init)
++	bsfq	%rdx, %rax
++	SHIFT_RETURN
++	ret
++#endif
++	.p2align 4
++L(loop_init):
++	pxor	%xmm1, %xmm1
++	pxor	%xmm2, %xmm2
++	pxor	%xmm3, %xmm3
++#ifdef AS_STRNLEN
++	.p2align 4
++L(loop):
++
++	addq	$64, %rax
++	cmpq	%rax, %r10
++	je	L(exit_end)
++
++	movdqa	(%rax), %xmm0
++	PMINU	16(%rax), %xmm0
++	PMINU	32(%rax), %xmm0
++	PMINU	48(%rax), %xmm0
++	PCMPEQ	%xmm3, %xmm0
++	pmovmskb	%xmm0, %edx
++	testl	%edx, %edx
++	jne	L(exit)
++	jmp	L(loop)
++
++	.p2align 4
++L(exit_end):
++	cmp	%rax, %r11
++	je	L(first) /* Do not read when end is at page boundary.  */
++	pxor	%xmm0, %xmm0
++	FIND_ZERO
++
++L(first):
++	bts	%r11, %rdx
++	bsfq	%rdx, %rdx
++	addq	%rdx, %rax
++	subq	%rdi, %rax
++	SHIFT_RETURN
++	ret
++
++	.p2align 4
++L(exit):
++	pxor	%xmm0, %xmm0
++	FIND_ZERO
++
++	bsfq	%rdx, %rdx
++	addq	%rdx, %rax
++	subq	%rdi, %rax
++	SHIFT_RETURN
++	ret
++
++#else
++
++	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
++	.p2align 4
++L(loop):
++
++	movdqa	64(%rax), %xmm0
++	PMINU	80(%rax), %xmm0
++	PMINU	96(%rax), %xmm0
++	PMINU	112(%rax), %xmm0
++	PCMPEQ	%xmm3, %xmm0
++	pmovmskb	%xmm0, %edx
++	testl	%edx, %edx
++	jne	L(exit64)
++
++	subq	$-128, %rax
++
++	movdqa	(%rax), %xmm0
++	PMINU	16(%rax), %xmm0
++	PMINU	32(%rax), %xmm0
++	PMINU	48(%rax), %xmm0
++	PCMPEQ	%xmm3, %xmm0
++	pmovmskb	%xmm0, %edx
++	testl	%edx, %edx
++	jne	L(exit0)
++	jmp	L(loop)
++
++	.p2align 4
++L(exit64):
++	addq	$64, %rax
++L(exit0):
++	pxor	%xmm0, %xmm0
++	FIND_ZERO
++
++	bsfq	%rdx, %rdx
++	addq	%rdx, %rax
++	subq	%rdi, %rax
++	SHIFT_RETURN
++	ret
++
++#endif
++
++END(strlen)
+diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+index a8cab0cb..5fa51fe0 100644
+--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
++++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+@@ -2,4 +2,4 @@
+ #define AS_STRNLEN
+ #define strlen	__wcsnlen_sse4_1
+ 
+-#include "../strlen.S"
++#include "strlen-vec.S"
+diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
+index f845f3d4..ad047d84 100644
+--- a/sysdeps/x86_64/strlen.S
++++ b/sysdeps/x86_64/strlen.S
+@@ -1,5 +1,5 @@
+-/* SSE2 version of strlen/wcslen.
+-   Copyright (C) 2012-2018 Free Software Foundation, Inc.
++/* SSE2 version of strlen.
++   Copyright (C) 2021 Free Software Foundation, Inc.
+    This file is part of the GNU C Library.
+ 
+    The GNU C Library is free software; you can redistribute it and/or
+@@ -16,243 +16,6 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
+-#include <sysdep.h>
++#include "multiarch/strlen-vec.S"
+ 
+-#ifdef AS_WCSLEN
+-# define PMINU		pminud
+-# define PCMPEQ		pcmpeqd
+-# define SHIFT_RETURN	shrq $2, %rax
+-#else
+-# define PMINU		pminub
+-# define PCMPEQ		pcmpeqb
+-# define SHIFT_RETURN
+-#endif
+-
+-/* Long lived register in strlen(s), strnlen(s, n) are:
+-
+-	%xmm3 - zero
+-	%rdi   - s
+-	%r10  (s+n) & (~(64-1))
+-	%r11   s+n
+-*/
+-
+-
+-.text
+-ENTRY(strlen)
+-
+-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
+-#define FIND_ZERO	\
+-	PCMPEQ	(%rax), %xmm0;	\
+-	PCMPEQ	16(%rax), %xmm1;	\
+-	PCMPEQ	32(%rax), %xmm2;	\
+-	PCMPEQ	48(%rax), %xmm3;	\
+-	pmovmskb	%xmm0, %esi;	\
+-	pmovmskb	%xmm1, %edx;	\
+-	pmovmskb	%xmm2, %r8d;	\
+-	pmovmskb	%xmm3, %ecx;	\
+-	salq	$16, %rdx;	\
+-	salq	$16, %rcx;	\
+-	orq	%rsi, %rdx;	\
+-	orq	%r8, %rcx;	\
+-	salq	$32, %rcx;	\
+-	orq	%rcx, %rdx;
+-
+-#ifdef AS_STRNLEN
+-/* Do not read anything when n==0.  */
+-	test	%RSI_LP, %RSI_LP
+-	jne	L(n_nonzero)
+-	xor	%rax, %rax
+-	ret
+-L(n_nonzero):
+-# ifdef AS_WCSLEN
+-	shl	$2, %RSI_LP
+-# endif
+-
+-/* Initialize long lived registers.  */
+-
+-	add	%RDI_LP, %RSI_LP
+-	mov	%RSI_LP, %R10_LP
+-	and	$-64, %R10_LP
+-	mov	%RSI_LP, %R11_LP
+-#endif
+-
+-	pxor	%xmm0, %xmm0
+-	pxor	%xmm1, %xmm1
+-	pxor	%xmm2, %xmm2
+-	pxor	%xmm3, %xmm3
+-	movq	%rdi, %rax
+-	movq	%rdi, %rcx
+-	andq	$4095, %rcx
+-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
+-	cmpq	$4047, %rcx
+-/* We cannot unify this branching as it would be ~6 cycles slower.  */
+-	ja	L(cross_page)
+-
+-#ifdef AS_STRNLEN
+-/* Test if end is among first 64 bytes.  */
+-# define STRNLEN_PROLOG	\
+-	mov	%r11, %rsi;	\
+-	subq	%rax, %rsi;	\
+-	andq	$-64, %rax;	\
+-	testq	$-64, %rsi;	\
+-	je	L(strnlen_ret)
+-#else
+-# define STRNLEN_PROLOG  andq $-64, %rax;
+-#endif
+-
+-/* Ignore bits in mask that come before start of string.  */
+-#define PROLOG(lab)	\
+-	movq	%rdi, %rcx;	\
+-	xorq	%rax, %rcx;	\
+-	STRNLEN_PROLOG;	\
+-	sarq	%cl, %rdx;	\
+-	test	%rdx, %rdx;	\
+-	je	L(lab);	\
+-	bsfq	%rdx, %rax;	\
+-	SHIFT_RETURN;		\
+-	ret
+-
+-#ifdef AS_STRNLEN
+-	andq	$-16, %rax
+-	FIND_ZERO
+-#else
+-	/* Test first 16 bytes unaligned.  */
+-	movdqu	(%rax), %xmm4
+-	PCMPEQ	%xmm0, %xmm4
+-	pmovmskb	%xmm4, %edx
+-	test	%edx, %edx
+-	je 	L(next48_bytes)
+-	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
+-	SHIFT_RETURN
+-	ret
+-
+-L(next48_bytes):
+-/* Same as FIND_ZERO except we do not check first 16 bytes.  */
+-	andq	$-16, %rax
+-	PCMPEQ 16(%rax), %xmm1
+-	PCMPEQ 32(%rax), %xmm2
+-	PCMPEQ 48(%rax), %xmm3
+-	pmovmskb	%xmm1, %edx
+-	pmovmskb	%xmm2, %r8d
+-	pmovmskb	%xmm3, %ecx
+-	salq	$16, %rdx
+-	salq	$16, %rcx
+-	orq	%r8, %rcx
+-	salq	$32, %rcx
+-	orq	%rcx, %rdx
+-#endif
+-
+-	/* When no zero byte is found xmm1-3 are zero so we do not have to
+-	   zero them.  */
+-	PROLOG(loop)
+-
+-	.p2align 4
+-L(cross_page):
+-	andq	$-64, %rax
+-	FIND_ZERO
+-	PROLOG(loop_init)
+-
+-#ifdef AS_STRNLEN
+-/* We must do this check to correctly handle strnlen (s, -1).  */
+-L(strnlen_ret):
+-	bts	%rsi, %rdx
+-	sarq	%cl, %rdx
+-	test	%rdx, %rdx
+-	je	L(loop_init)
+-	bsfq	%rdx, %rax
+-	SHIFT_RETURN
+-	ret
+-#endif
+-	.p2align 4
+-L(loop_init):
+-	pxor	%xmm1, %xmm1
+-	pxor	%xmm2, %xmm2
+-	pxor	%xmm3, %xmm3
+-#ifdef AS_STRNLEN
+-	.p2align 4
+-L(loop):
+-
+-	addq	$64, %rax
+-	cmpq	%rax, %r10
+-	je	L(exit_end)
+-
+-	movdqa	(%rax), %xmm0
+-	PMINU	16(%rax), %xmm0
+-	PMINU	32(%rax), %xmm0
+-	PMINU	48(%rax), %xmm0
+-	PCMPEQ	%xmm3, %xmm0
+-	pmovmskb	%xmm0, %edx
+-	testl	%edx, %edx
+-	jne	L(exit)
+-	jmp	L(loop)
+-
+-	.p2align 4
+-L(exit_end):
+-	cmp	%rax, %r11
+-	je	L(first) /* Do not read when end is at page boundary.  */
+-	pxor	%xmm0, %xmm0
+-	FIND_ZERO
+-
+-L(first):
+-	bts	%r11, %rdx
+-	bsfq	%rdx, %rdx
+-	addq	%rdx, %rax
+-	subq	%rdi, %rax
+-	SHIFT_RETURN
+-	ret
+-
+-	.p2align 4
+-L(exit):
+-	pxor	%xmm0, %xmm0
+-	FIND_ZERO
+-
+-	bsfq	%rdx, %rdx
+-	addq	%rdx, %rax
+-	subq	%rdi, %rax
+-	SHIFT_RETURN
+-	ret
+-
+-#else
+-
+-	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
+-	.p2align 4
+-L(loop):
+-
+-	movdqa	64(%rax), %xmm0
+-	PMINU	80(%rax), %xmm0
+-	PMINU	96(%rax), %xmm0
+-	PMINU	112(%rax), %xmm0
+-	PCMPEQ	%xmm3, %xmm0
+-	pmovmskb	%xmm0, %edx
+-	testl	%edx, %edx
+-	jne	L(exit64)
+-
+-	subq	$-128, %rax
+-
+-	movdqa	(%rax), %xmm0
+-	PMINU	16(%rax), %xmm0
+-	PMINU	32(%rax), %xmm0
+-	PMINU	48(%rax), %xmm0
+-	PCMPEQ	%xmm3, %xmm0
+-	pmovmskb	%xmm0, %edx
+-	testl	%edx, %edx
+-	jne	L(exit0)
+-	jmp	L(loop)
+-
+-	.p2align 4
+-L(exit64):
+-	addq	$64, %rax
+-L(exit0):
+-	pxor	%xmm0, %xmm0
+-	FIND_ZERO
+-
+-	bsfq	%rdx, %rdx
+-	addq	%rdx, %rax
+-	subq	%rdi, %rax
+-	SHIFT_RETURN
+-	ret
+-
+-#endif
+-
+-END(strlen)
+ libc_hidden_builtin_def (strlen)
+-- 
+GitLab
+
diff --git a/SOURCES/ia-new-macro-entry_p2align.patch b/SOURCES/ia-new-macro-entry_p2align.patch
new file mode 100644
index 0000000..3cdf3ec
--- /dev/null
+++ b/SOURCES/ia-new-macro-entry_p2align.patch
@@ -0,0 +1,48 @@
+From ed5f8e29601a35c955183ed218c78438596ed824 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 21 Sep 2021 18:31:49 -0500
+Subject: [PATCH] x86: Modify ENTRY in sysdep.h so that p2align can be
+ specified
+
+No bug.
+
+This change adds a new macro ENTRY_P2ALIGN which takes a second
+argument, log2 of the desired function alignment.
+
+The old ENTRY(name) macro is just ENTRY_P2ALIGN(name, 4) so this
+doesn't affect any existing functionality.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+(cherry picked from commit fc5bd179ef3a953dff8d1655bd530d0e230ffe71)
+---
+ sysdeps/x86/sysdep.h | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
+index 01bac0f6..a70bb3a2 100644
+--- a/sysdeps/x86/sysdep.h
++++ b/sysdeps/x86/sysdep.h
+@@ -78,15 +78,18 @@ enum cf_protection_level
+ #define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
+ 
+ /* Define an entry point visible from C.  */
+-#define	ENTRY(name)							      \
++#define	ENTRY_P2ALIGN(name, alignment)					      \
+   .globl C_SYMBOL_NAME(name);						      \
+   .type C_SYMBOL_NAME(name),@function;					      \
+-  .align ALIGNARG(4);							      \
++  .align ALIGNARG(alignment);						      \
+   C_LABEL(name)								      \
+   cfi_startproc;							      \
+   _CET_ENDBR;								      \
+   CALL_MCOUNT
+ 
++/* Common entry is 16-byte aligned.  */
++#define ENTRY(name) ENTRY_P2ALIGN (name, 4)
++
+ #undef	END
+ #define END(name)							      \
+   cfi_endproc;								      \
+-- 
+GitLab
+
diff --git a/SOURCES/ia-no-index_arch_prefer_no_avx512-avx-vnni.patch b/SOURCES/ia-no-index_arch_prefer_no_avx512-avx-vnni.patch
new file mode 100644
index 0000000..b094272
--- /dev/null
+++ b/SOURCES/ia-no-index_arch_prefer_no_avx512-avx-vnni.patch
@@ -0,0 +1,36 @@
+From f64058914af3c3a429e5947a8a90220240f7d51b Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 6 Dec 2021 07:14:12 -0800
+Subject: [PATCH] x86: Don't set Prefer_No_AVX512 for processors with AVX512
+ and AVX-VNNI
+
+Don't set Prefer_No_AVX512 on processors with AVX512 and AVX-VNNI since
+they won't lower CPU frequency when ZMM load and store instructions are
+used.
+
+(cherry picked from commit ceeffe968c01b1202e482f4855cb6baf5c6cb713)
+---
+ sysdeps/x86/cpu-features.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index 956bfb4f..5ff2baa0 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -525,8 +525,11 @@ init_cpu_features (struct cpu_features *cpu_features)
+ 	  |= bit_arch_Prefer_No_VZEROUPPER;
+       else
+ 	{
+-	  cpu_features->preferred[index_arch_Prefer_No_AVX512]
+-	    |= bit_arch_Prefer_No_AVX512;
++	  /* Processors with AVX512 and AVX-VNNI won't lower CPU frequency
++	     when ZMM load and store instructions are used.  */
++	  if (!CPU_FEATURES_CPU_P (cpu_features, AVX_VNNI))
++	    cpu_features->preferred[index_arch_Prefer_No_AVX512]
++	      |= bit_arch_Prefer_No_AVX512;
+ 
+ 	  /* Avoid RTM abort triggered by VZEROUPPER inside a
+ 	     transactionally executing RTM region.  */
+-- 
+GitLab
+
diff --git a/SOURCES/ia-opt-less_vec-memcmp-evex-movb.patch b/SOURCES/ia-opt-less_vec-memcmp-evex-movb.patch
new file mode 100644
index 0000000..0f0e9c1
--- /dev/null
+++ b/SOURCES/ia-opt-less_vec-memcmp-evex-movb.patch
@@ -0,0 +1,389 @@
+From 6c6a4eae9d8131531c9231f9f177d2db4130df01 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 24 Dec 2021 18:54:41 -0600
+Subject: [PATCH] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
+
+No bug.
+Optimizations are twofold.
+
+1) Replace page cross and 0/1 checks with masked load instructions in
+   L(less_vec). In applications this reduces branch-misses in the
+   hot [0, 32] case.
+2) Change control flow so that the L(less_vec) case gets the fall through.
+
+Change 2) helps copies in the [0, 32] size range but comes at the cost
+of copies in the [33, 64] size range.  From profiles of GCC and
+Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this
+appears to be the right tradeoff.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit abddd61de090ae84e380aff68a98bd94ef704667)
+---
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 249 +++++--------------
+ 1 file changed, 56 insertions(+), 193 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 640f6757..d2899e7c 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -62,15 +62,18 @@ Latency:
+ # define VMOVU		vmovdqu64
+ 
+ # ifdef USE_AS_WMEMCMP
++#  define VMOVU_MASK	vmovdqu32
+ #  define CHAR_SIZE	4
+ #  define VPCMP	vpcmpd
+ #  define VPTEST	vptestmd
+ # else
++#  define VMOVU_MASK	vmovdqu8
+ #  define CHAR_SIZE	1
+ #  define VPCMP	vpcmpub
+ #  define VPTEST	vptestmb
+ # endif
+ 
++
+ # define VEC_SIZE	32
+ # define PAGE_SIZE	4096
+ # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+@@ -102,12 +105,48 @@ ENTRY_P2ALIGN (MEMCMP, 6)
+ 	movl	%edx, %edx
+ # endif
+ 	cmp	$CHAR_PER_VEC, %RDX_LP
+-	jb	L(less_vec)
++	/* Fall through for [0, VEC_SIZE] as its the hottest.  */
++	ja	L(more_1x_vec)
++
++	/* Create mask for CHAR's we want to compare. This allows us to
++	   avoid having to include page cross logic.  */
++	movl	$-1, %ecx
++	bzhil	%edx, %ecx, %ecx
++	kmovd	%ecx, %k2
++
++	/* Safe to load full ymm with mask.  */
++	VMOVU_MASK (%rsi), %YMM2{%k2}
++	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_0)
++	ret
+ 
++	.p2align 4
++L(return_vec_0):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	(%rdi, %rax, CHAR_SIZE), %ecx
++	xorl	%edx, %edx
++	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
++	/* NB: no partial register stall here because xorl zero idiom
++	   above.  */
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	(%rsi, %rax), %ecx
++	movzbl	(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
++	ret
++
++
++	.p2align 4
++L(more_1x_vec):
+ 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+ 	VMOVU	(%rsi), %YMM1
+ 	/* Use compare not equals to directly check for mismatch.  */
+-	VPCMP	$4, (%rdi), %YMM1, %k1
++	VPCMP	$4,(%rdi), %YMM1, %k1
+ 	kmovd	%k1, %eax
+ 	/* NB: eax must be destination register if going to
+ 	   L(return_vec_[0,2]). For L(return_vec_3) destination register
+@@ -131,13 +170,13 @@ ENTRY_P2ALIGN (MEMCMP, 6)
+ 
+ 	/* Check third and fourth VEC no matter what.  */
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+-	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
++	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(return_vec_2)
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+-	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
++	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
+ 	kmovd	%k1, %ecx
+ 	testl	%ecx, %ecx
+ 	jnz	L(return_vec_3)
+@@ -169,7 +208,7 @@ ENTRY_P2ALIGN (MEMCMP, 6)
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+ 	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
+ 	   oring with YMM1. Result is stored in YMM4.  */
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
++	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+ 
+ 	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
+ 	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+@@ -184,7 +223,8 @@ ENTRY_P2ALIGN (MEMCMP, 6)
+ 	/* NB: eax must be zero to reach here.  */
+ 	ret
+ 
+-	.p2align 4
++
++	.p2align 4,, 8
+ L(8x_end_return_vec_0_1_2_3):
+ 	movq	%rdx, %rdi
+ L(8x_return_vec_0_1_2_3):
+@@ -222,23 +262,6 @@ L(return_vec_3):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+-L(return_vec_0):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCMP
+-	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+-	xorl	%edx, %edx
+-	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+-	/* NB: no partial register stall here because xorl zero idiom
+-	   above.  */
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	(%rsi, %rax), %ecx
+-	movzbl	(%rdi, %rax), %eax
+-	subl	%ecx, %eax
+-# endif
+-	ret
+ 
+ 	.p2align 4
+ L(return_vec_1):
+@@ -297,7 +320,7 @@ L(loop_4x_vec):
+ 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
+ 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
++	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+ 	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+ 	VPTEST	%YMM4, %YMM4, %k1
+ 	kmovd	%k1, %ecx
+@@ -324,7 +347,7 @@ L(loop_4x_vec):
+ 	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
+ 	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
++	vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+ 	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+ 	VPTEST	%YMM4, %YMM4, %k1
+ 	kmovd	%k1, %ecx
+@@ -336,14 +359,14 @@ L(loop_4x_vec):
+ 	/* Only entry is from L(more_8x_vec).  */
+ 	.p2align 4,, 10
+ L(8x_last_2x_vec):
+-	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
++	VPCMP	$4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(8x_return_vec_2)
+ 	/* Naturally aligned to 16 bytes.  */
+ L(8x_last_1x_vec):
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
+-	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
++	VPCMP	$4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(8x_return_vec_3)
+@@ -392,7 +415,9 @@ L(last_1x_vec):
+ 	jnz	L(return_vec_0_end)
+ 	ret
+ 
+-	.p2align 4,, 10
++
++	/* Don't align. Takes 2-fetch blocks either way and aligning
++	   will cause code to spill into another cacheline.  */
+ L(return_vec_1_end):
+ 	/* Use bsf to save code size. This is necessary to have
+ 	   L(one_or_less) fit in aligning bytes between.  */
+@@ -411,31 +436,8 @@ L(return_vec_1_end):
+ # endif
+ 	ret
+ 
+-	/* NB: L(one_or_less) fits in alignment padding between
+-	   L(return_vec_1_end) and L(return_vec_0_end).  */
+-# ifdef USE_AS_WMEMCMP
+-L(one_or_less):
+-	jb	L(zero)
+-	movl	(%rdi), %ecx
+-	xorl	%edx, %edx
+-	cmpl	(%rsi), %ecx
+-	je	L(zero)
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-	ret
+-# else
+-L(one_or_less):
+-	jb	L(zero)
+-	movzbl	(%rsi), %ecx
+-	movzbl	(%rdi), %eax
+-	subl	%ecx, %eax
+-	ret
+-# endif
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+-
+-	.p2align 4
++	/* Don't align. Takes 2-fetch blocks either way and aligning
++	   will cause code to spill into another cacheline.  */
+ L(return_vec_0_end):
+ 	tzcntl	%eax, %eax
+ 	addl	%edx, %eax
+@@ -451,146 +453,7 @@ L(return_vec_0_end):
+ 	subl	%ecx, %eax
+ # endif
+ 	ret
++	/* 1-byte until next cache line.  */
+ 
+-	.p2align 4
+-L(less_vec):
+-	/* Check if one or less CHAR. This is necessary for size == 0
+-	   but is also faster for size == CHAR_SIZE.  */
+-	cmpl	$1, %edx
+-	jbe	L(one_or_less)
+-
+-	/* Check if loading one VEC from either s1 or s2 could cause a
+-	   page cross. This can have false positives but is by far the
+-	   fastest method.  */
+-	movl	%edi, %eax
+-	orl	%esi, %eax
+-	andl	$(PAGE_SIZE - 1), %eax
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+-	jg	L(page_cross_less_vec)
+-
+-	/* No page cross possible.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMP	$4, (%rdi), %YMM2, %k1
+-	kmovd	%k1, %eax
+-	/* Check if any matches where in bounds. Intentionally not
+-	   storing result in eax to limit dependency chain if it goes to
+-	   L(return_vec_0_lv).  */
+-	bzhil	%edx, %eax, %edx
+-	jnz	L(return_vec_0_lv)
+-	xorl	%eax, %eax
+-	ret
+-
+-	/* Essentially duplicate of L(return_vec_0). Ends up not costing
+-	   any code as shrinks L(less_vec) by allowing 2-byte encoding of
+-	   the jump and ends up fitting in aligning bytes. As well fits on
+-	   same cache line as L(less_vec) so also saves a line from having
+-	   to be fetched on cold calls to memcmp.  */
+-	.p2align 4,, 4
+-L(return_vec_0_lv):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCMP
+-	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+-	xorl	%edx, %edx
+-	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+-	/* NB: no partial register stall here because xorl zero idiom
+-	   above.  */
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	(%rsi, %rax), %ecx
+-	movzbl	(%rdi, %rax), %eax
+-	subl	%ecx, %eax
+-# endif
+-	ret
+-
+-	.p2align 4
+-L(page_cross_less_vec):
+-	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+-	   bytes.  */
+-	cmpl	$(16 / CHAR_SIZE), %edx
+-	jae	L(between_16_31)
+-# ifndef USE_AS_WMEMCMP
+-	cmpl	$8, %edx
+-	jae	L(between_8_15)
+-	cmpl	$4, %edx
+-	jb	L(between_2_3)
+-
+-	/* Load as big endian with overlapping movbe to avoid branches.
+-	 */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	/* edx is guranteed to be positive int32 in range [4, 7].  */
+-	cmovne	%edx, %eax
+-	/* ecx is -1 if rcx > rax. Otherwise 0.  */
+-	sbbl	%ecx, %ecx
+-	/* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
+-	   rax then eax and ecx are zero. If rax < rax then ecx is -1 so
+-	   eax doesn't matter.  */
+-	orl	%ecx, %eax
+-	ret
+-
+-	.p2align 4,, 8
+-L(between_8_15):
+-# endif
+-	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+-	vmovq	(%rdi), %xmm1
+-	vmovq	(%rsi), %xmm2
+-	VPCMP	$4, %xmm1, %xmm2, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0_lv)
+-	/* Use overlapping loads to avoid branches.  */
+-	vmovq	-8(%rdi, %rdx, CHAR_SIZE), %xmm1
+-	vmovq	-8(%rsi, %rdx, CHAR_SIZE), %xmm2
+-	VPCMP	$4, %xmm1, %xmm2, %k1
+-	addl	$(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0_end)
+-	ret
+-
+-	.p2align 4,, 8
+-L(between_16_31):
+-	/* From 16 to 31 bytes.  No branch when size == 16.  */
+-
+-	/* Use movups to save code size.  */
+-	vmovdqu	(%rsi), %xmm2
+-	VPCMP	$4, (%rdi), %xmm2, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0_lv)
+-	/* Use overlapping loads to avoid branches.  */
+-	vmovdqu	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+-	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+-	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0_end)
+-	ret
+-
+-# ifndef USE_AS_WMEMCMP
+-L(between_2_3):
+-	/* Load as big endian to avoid branches.  */
+-	movzwl	(%rdi), %eax
+-	movzwl	(%rsi), %ecx
+-	shll	$8, %eax
+-	shll	$8, %ecx
+-	bswap	%eax
+-	bswap	%ecx
+-	movzbl	-1(%rdi, %rdx), %edi
+-	movzbl	-1(%rsi, %rdx), %esi
+-	orl	%edi, %eax
+-	orl	%esi, %ecx
+-	/* Subtraction is okay because the upper 8 bits are zero.  */
+-	subl	%ecx, %eax
+-	ret
+-# endif
+ END (MEMCMP)
+ #endif
+-- 
+GitLab
+
diff --git a/SOURCES/ia-opt-less_vec-memset-avx512.patch b/SOURCES/ia-opt-less_vec-memset-avx512.patch
new file mode 100644
index 0000000..0dac8a4
--- /dev/null
+++ b/SOURCES/ia-opt-less_vec-memset-avx512.patch
@@ -0,0 +1,265 @@
+From 1e65cc3c2278019406125f13d48494ece9f47e95 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 19 Apr 2021 17:48:10 -0400
+Subject: [PATCH] x86: Optimize less_vec evex and avx512
+ memset-vec-unaligned-erms.S
+
+No bug. This commit adds an optimized case for the less_vec memset case that
+uses the avx512vl/avx512bw mask store, avoiding the excessive
+branches. test-memset and test-wmemset are passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+(cherry picked from commit f53790272ce7bdc5ecd14b45f65d0464d2a61a3a)
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 40 ++++++++++-----
+ sysdeps/x86_64/multiarch/ifunc-memset.h       |  6 ++-
+ .../multiarch/memset-avx512-unaligned-erms.S  |  2 +-
+ .../multiarch/memset-evex-unaligned-erms.S    |  2 +-
+ .../multiarch/memset-vec-unaligned-erms.S     | 51 +++++++++++++++----
+ 5 files changed, 74 insertions(+), 27 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 85b8863a..d59d65f8 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -204,19 +204,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __memset_chk_avx2_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_chk_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+@@ -247,19 +251,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __memset_avx2_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+@@ -739,10 +747,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wmemset_avx2_unaligned_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+-			      CPU_FEATURE_USABLE (AVX512VL),
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemset_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+-			      CPU_FEATURE_USABLE (AVX512VL),
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemset_avx512_unaligned))
+ 
+ #ifdef SHARED
+@@ -946,10 +958,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wmemset_chk_avx2_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+-			      CPU_FEATURE_USABLE (AVX512VL),
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemset_chk_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemset_chk_avx512_unaligned))
+ #endif
+ 
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
+index 19795938..100e3707 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
+@@ -54,7 +54,8 @@ IFUNC_SELECTOR (void)
+       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
++          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
++          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ 	{
+ 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ 	    return OPTIMIZE (avx512_unaligned_erms);
+@@ -68,7 +69,8 @@ IFUNC_SELECTOR (void)
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
++          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
++          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ 	{
+ 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ 	    return OPTIMIZE (evex_unaligned_erms);
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+index 22e7b187..8ad842fc 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+@@ -19,6 +19,6 @@
+ # define SECTION(p)		p##.evex512
+ # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
+ # define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
+-
++# define USE_LESS_VEC_MASK_STORE	1
+ # include "memset-vec-unaligned-erms.S"
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+index ae0a4d6e..640f0929 100644
+--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+@@ -19,6 +19,6 @@
+ # define SECTION(p)		p##.evex
+ # define MEMSET_SYMBOL(p,s)	p##_evex_##s
+ # define WMEMSET_SYMBOL(p,s)	p##_evex_##s
+-
++# define USE_LESS_VEC_MASK_STORE	1
+ # include "memset-vec-unaligned-erms.S"
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index bae5cba4..f877ac9d 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -63,6 +63,8 @@
+ # endif
+ #endif
+ 
++#define PAGE_SIZE 4096
++
+ #ifndef SECTION
+ # error SECTION is not defined!
+ #endif
+@@ -213,11 +215,38 @@ L(loop):
+ 	cmpq	%rcx, %rdx
+ 	jne	L(loop)
+ 	VZEROUPPER_SHORT_RETURN
++
++	.p2align 4
+ L(less_vec):
+ 	/* Less than 1 VEC.  */
+ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+ #  error Unsupported VEC_SIZE!
+ # endif
++# ifdef USE_LESS_VEC_MASK_STORE
++	/* Clear high bits from edi. Only keeping bits relevant to page
++	   cross check. Note that we are using rax which is set in
++	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
++	 */
++	andl	$(PAGE_SIZE - 1), %edi
++	/* Check if the VEC_SIZE store crosses a page.  Mask stores suffer
++	   serious performance degradation when they have to suppress faults.  */
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
++	ja	L(cross_page)
++# if VEC_SIZE > 32
++	movq	$-1, %rcx
++	bzhiq	%rdx, %rcx, %rcx
++	kmovq	%rcx, %k1
++# else
++	movl	$-1, %ecx
++	bzhil	%edx, %ecx, %ecx
++	kmovd	%ecx, %k1
++# endif
++	vmovdqu8	%VEC(0), (%rax) {%k1}
++	VZEROUPPER_RETURN
++
++	.p2align 4
++L(cross_page):
++# endif
+ # if VEC_SIZE > 32
+ 	cmpb	$32, %dl
+ 	jae	L(between_32_63)
+@@ -234,36 +263,36 @@ L(less_vec):
+ 	cmpb	$1, %dl
+ 	ja	L(between_2_3)
+ 	jb	1f
+-	movb	%cl, (%rdi)
++	movb	%cl, (%rax)
+ 1:
+ 	VZEROUPPER_RETURN
+ # if VEC_SIZE > 32
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ L(between_32_63):
+-	VMOVU	%YMM0, -32(%rdi,%rdx)
+-	VMOVU	%YMM0, (%rdi)
++	VMOVU	%YMM0, -32(%rax,%rdx)
++	VMOVU	%YMM0, (%rax)
+ 	VZEROUPPER_RETURN
+ # endif
+ # if VEC_SIZE > 16
+ 	/* From 16 to 31.  No branch when size == 16.  */
+ L(between_16_31):
+-	VMOVU	%XMM0, -16(%rdi,%rdx)
+-	VMOVU	%XMM0, (%rdi)
++	VMOVU	%XMM0, -16(%rax,%rdx)
++	VMOVU	%XMM0, (%rax)
+ 	VZEROUPPER_RETURN
+ # endif
+ 	/* From 8 to 15.  No branch when size == 8.  */
+ L(between_8_15):
+-	movq	%rcx, -8(%rdi,%rdx)
+-	movq	%rcx, (%rdi)
++	movq	%rcx, -8(%rax,%rdx)
++	movq	%rcx, (%rax)
+ 	VZEROUPPER_RETURN
+ L(between_4_7):
+ 	/* From 4 to 7.  No branch when size == 4.  */
+-	movl	%ecx, -4(%rdi,%rdx)
+-	movl	%ecx, (%rdi)
++	movl	%ecx, -4(%rax,%rdx)
++	movl	%ecx, (%rax)
+ 	VZEROUPPER_RETURN
+ L(between_2_3):
+ 	/* From 2 to 3.  No branch when size == 2.  */
+-	movw	%cx, -2(%rdi,%rdx)
+-	movw	%cx, (%rdi)
++	movw	%cx, -2(%rax,%rdx)
++	movw	%cx, (%rax)
+ 	VZEROUPPER_RETURN
+ END (MEMSET_SYMBOL (__memset, unaligned_erms))
+-- 
+GitLab
+
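The memset change above is evidently why BMI2 joins the feature checks: the new
USE_LESS_VEC_MASK_STORE path handles the sub-VEC_SIZE tail by building a byte
mask with bzhi (a BMI2 instruction) and issuing a single masked store, guarded
by a page-cross check because masked stores that must suppress faults are very
slow. A minimal C-intrinsics sketch of that idea (illustration only, not glibc
code; memset_small_sketch is a made-up name, and the snippet assumes it is
built with -mavx512vl -mavx512bw -mbmi2):

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

/* Set LEN (< VEC_SIZE) bytes at DST to C with one masked store when the
   32-byte store cannot cross a page; otherwise fall back to a byte loop.  */
static void
memset_small_sketch (void *dst, int c, size_t len)
{
  if (((uintptr_t) dst & (PAGE_SIZE - 1)) <= PAGE_SIZE - VEC_SIZE)
    {
      /* bzhi (-1, len) keeps the low LEN bits set: one mask bit per byte.  */
      __mmask32 k = _bzhi_u32 ((unsigned int) -1, (unsigned int) len);
      _mm256_mask_storeu_epi8 (dst, k, _mm256_set1_epi8 ((char) c));
      return;
    }
  for (size_t i = 0; i < len; i++)	/* page-cross fallback */
    ((unsigned char *) dst)[i] = (unsigned char) c;
}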
diff --git a/SOURCES/ia-opt-memchr-evex-rtm.patch b/SOURCES/ia-opt-memchr-evex-rtm.patch
new file mode 100644
index 0000000..83f3d2b
--- /dev/null
+++ b/SOURCES/ia-opt-memchr-evex-rtm.patch
@@ -0,0 +1,536 @@
+From adf509678af2cf861f3e6d34aba8d062cfc27bf5 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 4 May 2021 19:02:40 -0400
+Subject: [PATCH] x86: Add EVEX optimized memchr family not safe for RTM
+
+No bug.
+
+This commit adds a new implementation for EVEX memchr that is not safe
+for RTM because it uses vzeroupper. The benefit is that by using
+ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
+faster than the RTM safe version which cannot use vpcmpeq because
+there is no EVEX encoding for the instruction. All parts of the
+implementation aside from the 4x loop are the same for the two
+versions and the optimization is only relevant for large sizes.
+
+Tigerlake:
+size  , algn  , Pos   , Cur T , New T , Win     , Dif
+512   , 6     , 192   , 9.2   , 9.04  , no-RTM  , 0.16
+512   , 7     , 224   , 9.19  , 8.98  , no-RTM  , 0.21
+2048  , 0     , 256   , 10.74 , 10.54 , no-RTM  , 0.2
+2048  , 0     , 512   , 14.81 , 14.87 , RTM     , 0.06
+2048  , 0     , 1024  , 22.97 , 22.57 , no-RTM  , 0.4
+2048  , 0     , 2048  , 37.49 , 34.51 , no-RTM  , 2.98   <--
+
+Icelake:
+size  , algn  , Pos   , Cur T , New T , Win     , Dif
+512   , 6     , 192   , 7.6   , 7.3   , no-RTM  , 0.3
+512   , 7     , 224   , 7.63  , 7.27  , no-RTM  , 0.36
+2048  , 0     , 256   , 8.48  , 8.38  , no-RTM  , 0.1
+2048  , 0     , 512   , 11.57 , 11.42 , no-RTM  , 0.15
+2048  , 0     , 1024  , 17.92 , 17.38 , no-RTM  , 0.54
+2048  , 0     , 2048  , 30.37 , 27.34 , no-RTM  , 3.03   <--
+
+test-memchr, test-wmemchr, and test-rawmemchr are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit 104c7b1967c3e78435c6f7eab5e225a7eddf9c6e)
+---
+ sysdeps/x86_64/multiarch/Makefile             |   7 +-
+ sysdeps/x86_64/multiarch/ifunc-evex.h         |  55 ++++++
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  15 ++
+ sysdeps/x86_64/multiarch/memchr-evex-rtm.S    |   8 +
+ sysdeps/x86_64/multiarch/memchr-evex.S        | 161 ++++++++++++++----
+ sysdeps/x86_64/multiarch/memchr.c             |   2 +-
+ sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S |   3 +
+ sysdeps/x86_64/multiarch/rawmemchr.c          |   2 +-
+ sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S   |   3 +
+ sysdeps/x86_64/multiarch/wmemchr.c            |   2 +-
+ 10 files changed, 217 insertions(+), 41 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/ifunc-evex.h
+ create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 65fde4eb..26be4095 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -77,7 +77,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   strncmp-evex \
+ 		   strncpy-evex \
+ 		   strnlen-evex \
+-		   strrchr-evex
++		   strrchr-evex \
++		   memchr-evex-rtm \
++		   rawmemchr-evex-rtm
+ CFLAGS-varshift.c += -msse4
+ CFLAGS-strcspn-c.c += -msse4
+ CFLAGS-strpbrk-c.c += -msse4
+@@ -110,7 +112,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ 		   wcsnlen-evex \
+ 		   wcsrchr-evex \
+ 		   wmemchr-evex \
+-		   wmemcmp-evex-movbe
++		   wmemcmp-evex-movbe \
++		   wmemchr-evex-rtm
+ endif
+ 
+ ifeq ($(subdir),debug)
+diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h
+new file mode 100644
+index 00000000..fc391edb
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/ifunc-evex.h
+@@ -0,0 +1,55 @@
++/* Common definition for ifunc selection optimized with EVEX.
++   All versions must be listed in ifunc-impl-list.c.
++   Copyright (C) 2017-2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <init-arch.h>
++
++extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden;
++
++
++static inline void *
++IFUNC_SELECTOR (void)
++{
++  const struct cpu_features* cpu_features = __get_cpu_features ();
++
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
++      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	    return OPTIMIZE (evex_rtm);
++
++	  return OPTIMIZE (evex);
++	}
++
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_rtm);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	return OPTIMIZE (avx2);
++    }
++
++  return OPTIMIZE (sse2);
++}
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index d59d65f8..ac097e8d 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -52,6 +52,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memchr_evex)
++	      IFUNC_IMPL_ADD (array, i, memchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __memchr_evex_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/memcmp.c.  */
+@@ -288,6 +293,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __rawmemchr_evex)
++	      IFUNC_IMPL_ADD (array, i, rawmemchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __rawmemchr_evex_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
+@@ -711,6 +721,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemchr_evex)
++	      IFUNC_IMPL_ADD (array, i, wmemchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __wmemchr_evex_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wmemcmp.c.  */
+diff --git a/sysdeps/x86_64/multiarch/memchr-evex-rtm.S b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
+new file mode 100644
+index 00000000..19871882
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
+@@ -0,0 +1,8 @@
++#ifndef MEMCHR
++# define MEMCHR __memchr_evex_rtm
++#endif
++
++#define USE_IN_RTM 1
++#define SECTION(p) p##.evex.rtm
++
++#include "memchr-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
+index f3fdad4f..4d0ed6d1 100644
+--- a/sysdeps/x86_64/multiarch/memchr-evex.S
++++ b/sysdeps/x86_64/multiarch/memchr-evex.S
+@@ -38,10 +38,32 @@
+ #  define CHAR_SIZE	1
+ # endif
+ 
++	/* In the 4x loop the RTM and non-RTM versions have data pointers
++	   that differ by VEC_SIZE * 4, with the RTM version's pointer being
++	   VEC_SIZE * 4 greater.  This difference is BASE_OFFSET.  Also,
++	   because the RTM version uses vpcmp, which stores a bit per element
++	   compared, while the non-RTM version uses vpcmpeq, which stores a
++	   bit per byte compared, RET_SCALE of CHAR_SIZE is only relevant
++	   for the RTM version.  */
++# ifdef USE_IN_RTM
++#  define VZEROUPPER
++#  define BASE_OFFSET	(VEC_SIZE * 4)
++#  define RET_SCALE	CHAR_SIZE
++# else
++#  define VZEROUPPER	vzeroupper
++#  define BASE_OFFSET	0
++#  define RET_SCALE	1
++# endif
++
++	/* In the return from the 4x loop the memchr and rawmemchr versions
++	   have data pointers that differ by VEC_SIZE * 4, with the memchr
++	   version's pointer being VEC_SIZE * 4 greater.  */
+ # ifdef USE_AS_RAWMEMCHR
++#  define RET_OFFSET	(BASE_OFFSET - (VEC_SIZE * 4))
+ #  define RAW_PTR_REG	rcx
+ #  define ALGN_PTR_REG	rdi
+ # else
++#  define RET_OFFSET	BASE_OFFSET
+ #  define RAW_PTR_REG	rdi
+ #  define ALGN_PTR_REG	rcx
+ # endif
+@@ -57,11 +79,15 @@
+ # define YMM5		ymm21
+ # define YMM6		ymm22
+ 
++# ifndef SECTION
++#  define SECTION(p)	p##.evex
++# endif
++
+ # define VEC_SIZE 32
+ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+ # define PAGE_SIZE 4096
+ 
+-	.section .text.evex,"ax",@progbits
++	.section SECTION(.text),"ax",@progbits
+ ENTRY (MEMCHR)
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check for zero length.  */
+@@ -237,14 +263,15 @@ L(cross_page_continue):
+ 	/* Check if at last CHAR_PER_VEC * 4 length.  */
+ 	subq	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(last_4x_vec_or_less_cmpeq)
+-	addq	$VEC_SIZE, %rdi
++	/* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
++	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
+ 
+ 	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
+ 	 */
+ #  ifdef USE_AS_WMEMCHR
+ 	movl	%edi, %ecx
+ 	andq	$-(4 * VEC_SIZE), %rdi
+-	andl	$(VEC_SIZE * 4 - 1), %ecx
++	subl	%edi, %ecx
+ 	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+ 	sarl	$2, %ecx
+ 	addq	%rcx, %rdx
+@@ -254,15 +281,28 @@ L(cross_page_continue):
+ 	subq	%rdi, %rdx
+ #  endif
+ # else
+-	addq	$VEC_SIZE, %rdi
++	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
+ 	andq	$-(4 * VEC_SIZE), %rdi
+ # endif
+-
++# ifdef USE_IN_RTM
+ 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
++# else
++	/* Copy YMMMATCH to ymm0 so we can use vpcmpeq, which is not
++	   encodable with the EVEX registers (ymm16-ymm31).  */
++	vmovdqa64 %YMMMATCH, %ymm0
++# endif
+ 
+ 	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
++	/* Two versions of the loop: one that does not require vzeroupper
++	   because it avoids ymm0-ymm15, and another that does require
++	   vzeroupper because it uses ymm0-ymm15.  The reason ymm0-ymm15
++	   is used at all is that there is no EVEX encoding of vpcmpeq, and
++	   with vpcmpeq this loop can be performed more efficiently.  The
++	   non-vzeroupper version is safe for RTM, while the vzeroupper
++	   version should be preferred when RTM is not supported.  */
++# ifdef USE_IN_RTM
+ 	/* It would be possible to save some instructions using 4x VPCMP
+ 	   but bottleneck on port 5 makes it not worth it.  */
+ 	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+@@ -273,12 +313,55 @@ L(loop_4x_vec):
+ 	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
+ 	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
+ 	VPCMP	$0, %YMM3, %YMMZERO, %k2
++# else
++	/* Since vptern can only take three vectors, it is fastest to
++	   handle one vector separately with EVEX vpcmp.  */
++#  ifdef USE_AS_WMEMCHR
++	/* vptern can only accept masks for epi32/epi64, so an instruction
++	   can only be saved by using a not-equals mask with wmemchr.  */
++	VPCMP	$4, (%rdi), %YMMMATCH, %k1
++#  else
++	VPCMP	$0, (%rdi), %YMMMATCH, %k1
++#  endif
++	/* Compare 3x with vpcmpeq and or them all together with vptern.
++	 */
++	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm2
++	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
++	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
++#  ifdef USE_AS_WMEMCHR
++	/* This takes the NOT of the OR of ymm2, ymm3 and ymm4, and also
++	   combines the result from VEC0 via the zero mask.  */
++	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
++	vpmovmskb %ymm4, %ecx
++#  else
++	/* 254 is the truth table for ORing ymm2, ymm3 and ymm4 into ymm4.  */
++	vpternlogd $254, %ymm2, %ymm3, %ymm4
++	vpmovmskb %ymm4, %ecx
++	kmovd	%k1, %eax
++#  endif
++# endif
++
+ # ifdef USE_AS_RAWMEMCHR
+ 	subq	$-(VEC_SIZE * 4), %rdi
++# endif
++# ifdef USE_IN_RTM
+ 	kortestd %k2, %k3
++# else
++#  ifdef USE_AS_WMEMCHR
++	/* ecx contains the NOT of the matches.  All 1s means no matches;
++	   incl will overflow and set the zero flag in that case.  */
++	incl	%ecx
++#  else
++	/* Check whether either VEC1 (eax) or VEC2-VEC4 (ecx) is non-zero.
++	   Adding eax to ecx is not an issue because if eax is non-zero it
++	   will be used for returning the match.  If it is zero the add
++	   does nothing.  */
++	addq	%rax, %rcx
++#  endif
++# endif
++# ifdef USE_AS_RAWMEMCHR
+ 	jz	L(loop_4x_vec)
+ # else
+-	kortestd %k2, %k3
+ 	jnz	L(loop_4x_vec_end)
+ 
+ 	subq	$-(VEC_SIZE * 4), %rdi
+@@ -288,10 +371,11 @@ L(loop_4x_vec):
+ 
+ 	/* Fall through into less than 4 remaining vectors of length case.
+ 	 */
+-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
++	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
++	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
+ 	kmovd	%k0, %eax
+-	addq	$(VEC_SIZE * 3), %rdi
+-	.p2align 4
++	VZEROUPPER
++
+ L(last_4x_vec_or_less):
+ 	/* Check if first VEC contained match.  */
+ 	testl	%eax, %eax
+@@ -338,73 +422,78 @@ L(loop_4x_vec_end):
+ 	/* rawmemchr will fall through into this if match was found in
+ 	   loop.  */
+ 
++# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
+ 	/* k1 has not of matches with VEC1.  */
+ 	kmovd	%k1, %eax
+-# ifdef USE_AS_WMEMCHR
++#  ifdef USE_AS_WMEMCHR
+ 	subl	$((1 << CHAR_PER_VEC) - 1), %eax
+-# else
++#  else
+ 	incl	%eax
++#  endif
++# else
++	/* eax already has matches for VEC1.  */
++	testl	%eax, %eax
+ # endif
+ 	jnz	L(last_vec_x1_return)
+ 
++# ifdef USE_IN_RTM
+ 	VPCMP	$0, %YMM2, %YMMZERO, %k0
+ 	kmovd	%k0, %eax
++# else
++	vpmovmskb %ymm2, %eax
++# endif
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x2_return)
+ 
++# ifdef USE_IN_RTM
+ 	kmovd	%k2, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x3_return)
+ 
+ 	kmovd	%k3, %eax
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_RAWMEMCHR
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
++	leaq	(VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+-	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
++	vpmovmskb %ymm3, %eax
++	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
++	salq	$VEC_SIZE, %rcx
++	orq	%rcx, %rax
++	tzcntq	%rax, %rax
++	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
++	VZEROUPPER
+ # endif
+ 	ret
+ 
+ 	.p2align 4
+ L(last_vec_x1_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_RAWMEMCHR
+-#  ifdef USE_AS_WMEMCHR
++# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
+ 	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+-#  else
+-	addq	%rdi, %rax
+-#  endif
++	leaq	RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
++	addq	%rdi, %rax
+ # endif
++	VZEROUPPER
+ 	ret
+ 
+ 	.p2align 4
+ L(last_vec_x2_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_RAWMEMCHR
+-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+-# else
+-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
+-# endif
++	/* NB: Multiply bytes by RET_SCALE to get the wchar_t count
++	   if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHR and
++	   USE_IN_RTM are both defined; otherwise RET_SCALE = 1).  */
++	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
++	VZEROUPPER
+ 	ret
+ 
++# ifdef USE_IN_RTM
+ 	.p2align 4
+ L(last_vec_x3_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_RAWMEMCHR
+-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+-# else
+ 	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
+-# endif
++	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+-
++# endif
+ 
+ # ifndef USE_AS_RAWMEMCHR
+ L(last_4x_vec_or_less_cmpeq):
+diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c
+index 016f5784..f28aea77 100644
+--- a/sysdeps/x86_64/multiarch/memchr.c
++++ b/sysdeps/x86_64/multiarch/memchr.c
+@@ -24,7 +24,7 @@
+ # undef memchr
+ 
+ # define SYMBOL_NAME memchr
+-# include "ifunc-avx2.h"
++# include "ifunc-evex.h"
+ 
+ libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ());
+ strong_alias (memchr, __memchr)
+diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
+new file mode 100644
+index 00000000..deda1ca3
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
+@@ -0,0 +1,3 @@
++#define MEMCHR __rawmemchr_evex_rtm
++#define USE_AS_RAWMEMCHR 1
++#include "memchr-evex-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c
+index 8a0bc313..1f764f35 100644
+--- a/sysdeps/x86_64/multiarch/rawmemchr.c
++++ b/sysdeps/x86_64/multiarch/rawmemchr.c
+@@ -26,7 +26,7 @@
+ # undef __rawmemchr
+ 
+ # define SYMBOL_NAME rawmemchr
+-# include "ifunc-avx2.h"
++# include "ifunc-evex.h"
+ 
+ libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr,
+ 		       IFUNC_SELECTOR ());
+diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
+new file mode 100644
+index 00000000..a346cd35
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
+@@ -0,0 +1,3 @@
++#define MEMCHR __wmemchr_evex_rtm
++#define USE_AS_WMEMCHR 1
++#include "memchr-evex-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c
+index 6d833702..f9c91915 100644
+--- a/sysdeps/x86_64/multiarch/wmemchr.c
++++ b/sysdeps/x86_64/multiarch/wmemchr.c
+@@ -26,7 +26,7 @@
+ # undef __wmemchr
+ 
+ # define SYMBOL_NAME wmemchr
+-# include "ifunc-avx2.h"
++# include "ifunc-evex.h"
+ 
+ libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ());
+ weak_alias (__wmemchr, wmemchr)
+-- 
+GitLab
+
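The commit above trades RTM safety for a cheaper 4x loop: with ymm0-ymm15
available, the VEX-encoded vpcmpeq can be used and three compare results can be
folded by a single vpternlogd (truth table 254 = a|b|c), at the cost of needing
vzeroupper on exit. A rough C-intrinsics rendering of that reduction (an
illustrative sketch, not the actual assembly; any_match_3x_sketch is a made-up
name, and the snippet assumes -mavx2 -mavx512f -mavx512vl):

#include <immintrin.h>

/* Return nonzero if byte C occurs in the three 32-byte vectors at
   P + 32, P + 64 and P + 96, ORing the compare masks with one vpternlogd.  */
static int
any_match_3x_sketch (const unsigned char *p, unsigned char c)
{
  __m256i match = _mm256_set1_epi8 ((char) c);
  __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (p + 32));
  __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (p + 64));
  __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (p + 96));
  __m256i m1 = _mm256_cmpeq_epi8 (v1, match);
  __m256i m2 = _mm256_cmpeq_epi8 (v2, match);
  __m256i m3 = _mm256_cmpeq_epi8 (v3, match);
  /* Truth table 0xfe: a | b | c, replacing two vpor instructions.  */
  __m256i any = _mm256_ternarylogic_epi32 (m1, m2, m3, 0xfe);
  return _mm256_movemask_epi8 (any) != 0;
}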
diff --git a/SOURCES/ia-opt-memchr-evex.patch b/SOURCES/ia-opt-memchr-evex.patch
new file mode 100644
index 0000000..905e400
--- /dev/null
+++ b/SOURCES/ia-opt-memchr-evex.patch
@@ -0,0 +1,701 @@
+From 8bb82fd62bdb2d8741998a5a1be38388890452da Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 3 May 2021 03:03:19 -0400
+Subject: [PATCH] x86: Optimize memchr-evex.S
+
+No bug. This commit optimizes memchr-evex.S. The optimizations include
+replacing some branches with cmovcc, avoiding some branches entirely
+in the less_4x_vec case, making the page cross logic less strict,
+saving some ALU in the alignment process, and most importantly
+increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
+test-wmemchr are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit 2a76821c3081d2c0231ecd2618f52662cb48fccd)
+---
+ sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
+ 1 file changed, 322 insertions(+), 225 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
+index 6dd5d67b..81d5cd64 100644
+--- a/sysdeps/x86_64/multiarch/memchr-evex.S
++++ b/sysdeps/x86_64/multiarch/memchr-evex.S
+@@ -26,14 +26,28 @@
+ 
+ # ifdef USE_AS_WMEMCHR
+ #  define VPBROADCAST	vpbroadcastd
+-#  define VPCMP		vpcmpd
+-#  define SHIFT_REG	r8d
++#  define VPMINU	vpminud
++#  define VPCMP	vpcmpd
++#  define VPCMPEQ	vpcmpeqd
++#  define CHAR_SIZE	4
+ # else
+ #  define VPBROADCAST	vpbroadcastb
+-#  define VPCMP		vpcmpb
+-#  define SHIFT_REG	ecx
++#  define VPMINU	vpminub
++#  define VPCMP	vpcmpb
++#  define VPCMPEQ	vpcmpeqb
++#  define CHAR_SIZE	1
+ # endif
+ 
++# ifdef USE_AS_RAWMEMCHR
++#  define RAW_PTR_REG	rcx
++#  define ALGN_PTR_REG	rdi
++# else
++#  define RAW_PTR_REG	rdi
++#  define ALGN_PTR_REG	rcx
++# endif
++
++# define XMMZERO	xmm23
++# define YMMZERO	ymm23
+ # define XMMMATCH	xmm16
+ # define YMMMATCH	ymm16
+ # define YMM1		ymm17
+@@ -44,6 +58,8 @@
+ # define YMM6		ymm22
+ 
+ # define VEC_SIZE 32
++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
++# define PAGE_SIZE 4096
+ 
+ 	.section .text.evex,"ax",@progbits
+ ENTRY (MEMCHR)
+@@ -51,11 +67,7 @@ ENTRY (MEMCHR)
+ 	/* Check for zero length.  */
+ 	test	%RDX_LP, %RDX_LP
+ 	jz	L(zero)
+-# endif
+-	movl	%edi, %ecx
+-# ifdef USE_AS_WMEMCHR
+-	shl	$2, %RDX_LP
+-# else
++
+ #  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%edx, %edx
+@@ -64,318 +76,403 @@ ENTRY (MEMCHR)
+ 	/* Broadcast CHAR to YMMMATCH.  */
+ 	VPBROADCAST %esi, %YMMMATCH
+ 	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
++	movl	%edi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.  */
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-
++	VPCMP	$0, (%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
+ # ifndef USE_AS_RAWMEMCHR
+-	jnz	L(first_vec_x0_check)
+-	/* Adjust length and check the end of data.  */
+-	subq	$VEC_SIZE, %rdx
+-	jbe	L(zero)
++	/* If length < CHAR_PER_VEC handle special.  */
++	cmpq	$CHAR_PER_VEC, %rdx
++	jbe	L(first_vec_x0)
++# endif
++	testl	%eax, %eax
++	jz	L(aligned_more)
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
++	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+-	jnz	L(first_vec_x0)
++	addq	%rdi, %rax
+ # endif
+-
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
++	ret
+ 
+ # ifndef USE_AS_RAWMEMCHR
+-	/* Adjust length.  */
+-	addq	%rcx, %rdx
+-
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+-	jmp	L(more_4x_vec)
++L(zero):
++	xorl	%eax, %eax
++	ret
+ 
++	.p2align 5
++L(first_vec_x0):
++	/* Check if first match was before length.  */
++	tzcntl	%eax, %eax
++	xorl	%ecx, %ecx
++	cmpl	%eax, %edx
++	leaq	(%rdi, %rax, CHAR_SIZE), %rax
++	cmovle	%rcx, %rax
++	ret
++# else
++	/* NB: first_vec_x0 is 17 bytes which will leave
++	   cross_page_boundary (which is relatively cold) close enough
++	   to ideal alignment. So only realign L(cross_page_boundary) if
++	   rawmemchr.  */
+ 	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
++# endif
++L(cross_page_boundary):
++	/* Save the pointer before aligning, as its original value is
++	   needed to compute the return address if the byte is found, or
++	   to adjust the length if it is not and this is memchr.  */
++	movq	%rdi, %rcx
++	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
++	   for rawmemchr.  */
++	andq	$-VEC_SIZE, %ALGN_PTR_REG
++	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
++	kmovd	%k0, %r8d
+ # ifdef USE_AS_WMEMCHR
+-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
++	/* NB: Divide shift count by 4 since each bit in K0 represent 4
+ 	   bytes.  */
+-	movl	%ecx, %SHIFT_REG
+-	sarl	$2, %SHIFT_REG
++	sarl	$2, %eax
++# endif
++# ifndef USE_AS_RAWMEMCHR
++	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
++	subl	%eax, %esi
+ # endif
+-	andq	$-VEC_SIZE, %rdi
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	/* Remove the leading bytes.  */
+-	sarxl	%SHIFT_REG, %eax, %eax
+-	testl	%eax, %eax
+-	jz	L(aligned_more)
+-	tzcntl	%eax, %eax
+ # ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
++	andl	$(CHAR_PER_VEC - 1), %eax
+ # endif
++	/* Remove the leading bytes.  */
++	sarxl	%eax, %r8d, %eax
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
++	cmpq	%rsi, %rdx
++	jbe	L(first_vec_x0)
++# endif
++	testl	%eax, %eax
++	jz	L(cross_page_continue)
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
++	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
++# else
++	addq	%RAW_PTR_REG, %rax
+ # endif
+-	addq	%rdi, %rax
+-	addq	%rcx, %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(aligned_more):
+-# ifndef USE_AS_RAWMEMCHR
+-        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
+-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
+-	   overflow.  */
+-	negq	%rcx
+-	addq	$VEC_SIZE, %rcx
++L(first_vec_x1):
++	tzcntl	%eax, %eax
++	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
++	ret
+ 
+-	/* Check the end of data.  */
+-	subq	%rcx, %rdx
+-	jbe	L(zero)
+-# endif
++	.p2align 4
++L(first_vec_x2):
++	tzcntl	%eax, %eax
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
++	ret
+ 
+-	addq	$VEC_SIZE, %rdi
++	.p2align 4
++L(first_vec_x3):
++	tzcntl	%eax, %eax
++	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
++	ret
+ 
+-# ifndef USE_AS_RAWMEMCHR
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
++	.p2align 4
++L(first_vec_x4):
++	tzcntl	%eax, %eax
++	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
++	ret
+ 
+-L(more_4x_vec):
++	.p2align 5
++L(aligned_more):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
++# ifndef USE_AS_RAWMEMCHR
++	/* Align data to VEC_SIZE.  */
++L(cross_page_continue):
++	xorl	%ecx, %ecx
++	subl	%edi, %ecx
++	andq	$-VEC_SIZE, %rdi
++	/* esi is for adjusting length to see if near the end.  */
++	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
++#  ifdef USE_AS_WMEMCHR
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %esi
++#  endif
++# else
++	andq	$-VEC_SIZE, %rdi
++L(cross_page_continue):
++# endif
++	/* Load first VEC regardless.  */
++	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
++# ifndef USE_AS_RAWMEMCHR
++	/* Adjust length. If near end handle specially.  */
++	subq	%rsi, %rdx
++	jbe	L(last_4x_vec_or_less)
++# endif
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
++	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
++	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
++	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x4)
++
+ 
+ # ifndef USE_AS_RAWMEMCHR
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
++	/* Check if at last CHAR_PER_VEC * 4 length.  */
++	subq	$(CHAR_PER_VEC * 4), %rdx
++	jbe	L(last_4x_vec_or_less_cmpeq)
++	addq	$VEC_SIZE, %rdi
+ 
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
++	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
++	 */
++#  ifdef USE_AS_WMEMCHR
++	movl	%edi, %ecx
+ 	andq	$-(4 * VEC_SIZE), %rdi
+-
+-# ifndef USE_AS_RAWMEMCHR
+-	/* Adjust length.  */
++	andl	$(VEC_SIZE * 4 - 1), %ecx
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %ecx
+ 	addq	%rcx, %rdx
++#  else
++	addq	%rdi, %rdx
++	andq	$-(4 * VEC_SIZE), %rdi
++	subq	%rdi, %rdx
++#  endif
++# else
++	addq	$VEC_SIZE, %rdi
++	andq	$-(4 * VEC_SIZE), %rdi
+ # endif
+ 
++	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
++
++	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
+-	kord	%k1, %k2, %k5
+-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
+-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
+-
+-	kord	%k3, %k4, %k6
+-	kortestd %k5, %k6
+-	jnz	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
++	/* It would be possible to save some instructions using 4x VPCMP
++	   but bottleneck on port 5 makes it not worth it.  */
++	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
++	/* xor will set bytes match esi to zero.  */
++	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
++	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
++	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
++	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
++	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
++	VPCMP	$0, %YMM3, %YMMZERO, %k2
+ # ifdef USE_AS_RAWMEMCHR
+-	jmp	L(loop_4x_vec)
++	subq	$-(VEC_SIZE * 4), %rdi
++	kortestd %k2, %k3
++	jz	L(loop_4x_vec)
+ # else
+-	subq	$(VEC_SIZE * 4), %rdx
++	kortestd %k2, %k3
++	jnz	L(loop_4x_vec_end)
++
++	subq	$-(VEC_SIZE * 4), %rdi
++
++	subq	$(CHAR_PER_VEC * 4), %rdx
+ 	ja	L(loop_4x_vec)
+ 
++	/* Fall through into less than 4 remaining vectors of length case.
++	 */
++	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
++	addq	$(VEC_SIZE * 3), %rdi
++	.p2align 4
+ L(last_4x_vec_or_less):
+-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+-	addl	$(VEC_SIZE * 2), %edx
+-	jle	L(last_2x_vec)
+-
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
++	/* Check if first VEC contained match.  */
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
++	jnz	L(first_vec_x1_check)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
++	/* If remaining length > CHAR_PER_VEC * 2.  */
++	addl	$(CHAR_PER_VEC * 2), %edx
++	jg	L(last_4x_vec)
+ 
+-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
++L(last_2x_vec):
++	/* If remaining length < CHAR_PER_VEC.  */
++	addl	$CHAR_PER_VEC, %edx
++	jle	L(zero_end)
+ 
+-	jnz	L(first_vec_x2_check)
+-	subl	$VEC_SIZE, %edx
+-	jle	L(zero)
++	/* Check VEC2 and compare any match with remaining length.  */
++	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
++	tzcntl	%eax, %eax
++	cmpl	%eax, %edx
++	jbe	L(set_zero_end)
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
++L(zero_end):
++	ret
+ 
+-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+ 
+-	jnz	L(first_vec_x3_check)
++	.p2align 4
++L(first_vec_x1_check):
++	tzcntl	%eax, %eax
++	/* Adjust length.  */
++	subl	$-(CHAR_PER_VEC * 4), %edx
++	/* Check if match within remaining length.  */
++	cmpl	%eax, %edx
++	jbe	L(set_zero_end)
++	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
++	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
++	ret
++L(set_zero_end):
+ 	xorl	%eax, %eax
+ 	ret
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	addl	$(VEC_SIZE * 2), %edx
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
++L(loop_4x_vec_end):
++# endif
++	/* rawmemchr will fall through into this if match was found in
++	   loop.  */
++
++	/* k1 has not of matches with VEC1.  */
+ 	kmovd	%k1, %eax
+-	testl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	subl	$((1 << CHAR_PER_VEC) - 1), %eax
++# else
++	incl	%eax
++# endif
++	jnz	L(last_vec_x1_return)
+ 
+-	jnz	L(first_vec_x0_check)
+-	subl	$VEC_SIZE, %edx
+-	jle	L(zero)
++	VPCMP	$0, %YMM2, %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x2_return)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
++	kmovd	%k2, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1_check)
+-	xorl	%eax, %eax
+-	ret
++	jnz	L(last_vec_x3_return)
+ 
+-	.p2align 4
+-L(first_vec_x0_check):
++	kmovd	%k3, %eax
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
++# ifdef USE_AS_RAWMEMCHR
++	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
++# else
++	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
+ # endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	%rdi, %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x1_check):
++L(last_vec_x1_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$VEC_SIZE, %rax
++# ifdef USE_AS_RAWMEMCHR
++#  ifdef USE_AS_WMEMCHR
++	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
++	leaq	(%rdi, %rax, CHAR_SIZE), %rax
++#  else
+ 	addq	%rdi, %rax
+-	ret
+-
+-	.p2align 4
+-L(first_vec_x2_check):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
++#  endif
++# else
++	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
++	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ # endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x3_check):
++L(last_vec_x2_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
++# ifdef USE_AS_RAWMEMCHR
++	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
++	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
++# else
++	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
++	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
+ # endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+-# endif
+-
+-	.p2align 4
+-L(first_vec_x0):
++L(last_vec_x3_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(%rdi, %rax, 4), %rax
++# ifdef USE_AS_RAWMEMCHR
++	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+-	addq	%rdi, %rax
++	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
++	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
+ # endif
+ 	ret
+ 
++
++# ifndef USE_AS_RAWMEMCHR
++L(last_4x_vec_or_less_cmpeq):
++	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
++	subq	$-(VEC_SIZE * 4), %rdi
++	/* Check first VEC regardless.  */
++	testl	%eax, %eax
++	jnz	L(first_vec_x1_check)
++
++	/* If remaining length <= CHAR_PER_VEC * 2.  */
++	addl	$(CHAR_PER_VEC * 2), %edx
++	jle	L(last_2x_vec)
++
+ 	.p2align 4
+-L(first_vec_x1):
++L(last_4x_vec):
++	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x2)
++
++
++	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
++	/* Create mask for possible matches within remaining length.  */
++#  ifdef USE_AS_WMEMCHR
++	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
++	bzhil	%edx, %ecx, %ecx
++#  else
++	movq	$-1, %rcx
++	bzhiq	%rdx, %rcx, %rcx
++#  endif
++	/* Test matches in data against length match.  */
++	andl	%ecx, %eax
++	jnz	L(last_vec_x3)
++
++	/* If remaining length <= CHAR_PER_VEC * 3 (note this is after
++	   remaining length was found to be > CHAR_PER_VEC * 2).  */
++	subl	$CHAR_PER_VEC, %edx
++	jbe	L(zero_end2)
++
++
++	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
++	/* Shift remaining length mask for last VEC.  */
++#  ifdef USE_AS_WMEMCHR
++	shrl	$CHAR_PER_VEC, %ecx
++#  else
++	shrq	$CHAR_PER_VEC, %rcx
++#  endif
++	andl	%ecx, %eax
++	jz	L(zero_end2)
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
+-# else
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-# endif
++	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
++L(zero_end2):
+ 	ret
+ 
+-	.p2align 4
+-L(first_vec_x2):
++L(last_vec_x2):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+-# else
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+-# endif
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(4x_vec_end):
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-	kmovd	%k2, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-	kmovd	%k3, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-	kmovd	%k4, %eax
+-	testl	%eax, %eax
+-L(first_vec_x3):
++L(last_vec_x3):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
+-# else
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+-# endif
++	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
++# endif
+ 
+ END (MEMCHR)
+ #endif
+-- 
+GitLab
+
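Among the memchr-evex.S changes above, the short-length entry
(L(first_vec_x0)) replaces a length-check branch with cmovle: the match offset
from tzcnt is compared against the remaining length and zero is selected when
the match lies past the end. A loose C equivalent (a sketch only, not the
glibc code; first_vec_x0_sketch is a made-up name, and whether the ternary
compiles to cmov depends on the compiler):

#include <stddef.h>

/* MASK has one bit per byte of the first vector, LEN is the remaining
   length; return the match address or NULL without a length branch.
   (The assembly avoids even the MASK == 0 branch because tzcnt of 0
   yields 32, which never compares below LEN on this path.)  */
static const void *
first_vec_x0_sketch (const unsigned char *s, unsigned int mask, size_t len)
{
  if (mask == 0)
    return NULL;
  size_t pos = (size_t) __builtin_ctz (mask);	/* tzcnt */
  return pos < len ? (const void *) (s + pos) : NULL;
}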
diff --git a/SOURCES/ia-opt-memcmp-avx2-movbe.patch b/SOURCES/ia-opt-memcmp-avx2-movbe.patch
new file mode 100644
index 0000000..d55adb7
--- /dev/null
+++ b/SOURCES/ia-opt-memcmp-avx2-movbe.patch
@@ -0,0 +1,873 @@
+From 65438851072f6131049a0ae471dcab90870e51f3 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 17 May 2021 13:56:52 -0400
+Subject: [PATCH] x86: Optimize memcmp-avx2-movbe.S
+
+No bug. This commit optimizes memcmp-avx2.S. The optimizations include
+adding a new vec compare path for small sizes, reorganizing the entry
+control flow, and removing some unnecessary ALU instructions from the
+main loop. test-memcmp and test-wmemcmp are both passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit 16d12015c57701b08d7bbed6ec536641bcafb428)
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c   |   6 +
+ sysdeps/x86_64/multiarch/ifunc-memcmp.h      |   1 +
+ sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 676 +++++++++++--------
+ 3 files changed, 402 insertions(+), 281 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index ac097e8d..8be0d78a 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -63,16 +63,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, memcmp,
+ 	      IFUNC_IMPL_ADD (array, i, memcmp,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __memcmp_avx2_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __memcmp_avx2_movbe_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __memcmp_evex_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
+@@ -732,16 +735,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, wmemcmp,
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __wmemcmp_avx2_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wmemcmp_avx2_movbe_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __wmemcmp_evex_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+index 8043c635..690dffe8 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+@@ -33,6 +33,7 @@ IFUNC_SELECTOR (void)
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
++      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+index 9d5c9c72..16fc673e 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+@@ -19,17 +19,23 @@
+ #if IS_IN (libc)
+ 
+ /* memcmp/wmemcmp is implemented as:
+-   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+-      to avoid branches.
+-   2. Use overlapping compare to avoid branch.
+-   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+-      bytes for wmemcmp.
+-   4. If size is 8 * VEC_SIZE or less, unroll the loop.
+-   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
++   1. Use ymm vector compares when possible. The only case where
++      vector compares are not possible is when size < VEC_SIZE
++      and loading from either s1 or s2 would cause a page cross.
++   2. For size from 2 to 7 bytes on page cross, load as big endian
++      with movbe and bswap to avoid branches.
++   3. Use xmm vector compare when size >= 4 bytes for memcmp or
++      size >= 8 bytes for wmemcmp.
++   4. Optimistically compare up to the first 4 * VEC_SIZE one vector
++      at a time to check for early mismatches. Only do this if it is
++      guaranteed the work is not wasted.
++   5. If size is 8 * VEC_SIZE or less, unroll the loop.
++   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
+       area.
+-   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+-   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+-   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
++   7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
++   8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
++   9. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
++
+ 
+ # include <sysdep.h>
+ 
+@@ -38,8 +44,10 @@
+ # endif
+ 
+ # ifdef USE_AS_WMEMCMP
++#  define CHAR_SIZE	4
+ #  define VPCMPEQ	vpcmpeqd
+ # else
++#  define CHAR_SIZE	1
+ #  define VPCMPEQ	vpcmpeqb
+ # endif
+ 
+@@ -52,7 +60,7 @@
+ # endif
+ 
+ # define VEC_SIZE 32
+-# define VEC_MASK ((1 << VEC_SIZE) - 1)
++# define PAGE_SIZE	4096
+ 
+ /* Warning!
+            wmemcmp has to use SIGNED comparison for elements.
+@@ -71,136 +79,359 @@ ENTRY (MEMCMP)
+ 	jb	L(less_vec)
+ 
+ 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
++	vmovdqu	(%rsi), %ymm1
++	VPCMPEQ	(%rdi), %ymm1, %ymm1
++	vpmovmskb %ymm1, %eax
++	/* NB: eax must be the destination register if going to
++	   L(return_vec_[0,2]).  For L(return_vec_3) the destination
++	   register must be ecx.  */
++	incl	%eax
++	jnz	L(return_vec_0)
+ 
+ 	cmpq	$(VEC_SIZE * 2), %rdx
+-	jbe	L(last_vec)
+-
+-	VPCMPEQ	%ymm0, %ymm0, %ymm0
+-	/* More than 2 * VEC.  */
+-	cmpq	$(VEC_SIZE * 8), %rdx
+-	ja	L(more_8x_vec)
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	jb	L(last_4x_vec)
+-
+-	/* From 4 * VEC to 8 * VEC, inclusively. */
+-	vmovdqu	(%rsi), %ymm1
+-	VPCMPEQ (%rdi), %ymm1, %ymm1
++	jbe	L(last_1x_vec)
+ 
++	/* Check second VEC no matter what.  */
+ 	vmovdqu	VEC_SIZE(%rsi), %ymm2
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
++	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
++	vpmovmskb %ymm2, %eax
++	/* If all 4 VEC were equal eax will be all 1s, so incl will
++	   overflow and set the zero flag.  */
++	incl	%eax
++	jnz	L(return_vec_1)
+ 
+-	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
++	/* Less than 4 * VEC.  */
++	cmpq	$(VEC_SIZE * 4), %rdx
++	jbe	L(last_2x_vec)
+ 
++	/* Check third and fourth VEC no matter what.  */
++	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
++	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
++	vpmovmskb %ymm3, %eax
++	incl	%eax
++	jnz	L(return_vec_2)
+ 	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
++	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
++	vpmovmskb %ymm4, %ecx
++	incl	%ecx
++	jnz	L(return_vec_3)
+ 
+-	vpand	%ymm1, %ymm2, %ymm5
+-	vpand	%ymm3, %ymm4, %ymm6
+-	vpand	%ymm5, %ymm6, %ymm5
++	/* Go to 4x VEC loop.  */
++	cmpq	$(VEC_SIZE * 8), %rdx
++	ja	L(more_8x_vec)
+ 
+-	vptest	%ymm0, %ymm5
+-	jnc	L(4x_vec_end)
++	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
++	   branches.  */
+ 
++	/* Load first two VEC from s2 before adjusting addresses.  */
++	vmovdqu	-(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
++	vmovdqu	-(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
+ 	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+ 	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+-	vmovdqu	(%rsi), %ymm1
+-	VPCMPEQ (%rdi), %ymm1, %ymm1
+ 
+-	vmovdqu	VEC_SIZE(%rsi), %ymm2
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+-	vpand	%ymm2, %ymm1, %ymm5
++	/* Wait to load from s1 until the addresses are adjusted, due to
++	   un-lamination of micro-fusion with the complex address mode.  */
++	VPCMPEQ	(%rdi), %ymm1, %ymm1
++	VPCMPEQ	(VEC_SIZE)(%rdi), %ymm2, %ymm2
+ 
+ 	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+-	vpand	%ymm3, %ymm5, %ymm5
+-
++	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+ 	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+-	vpand	%ymm4, %ymm5, %ymm5
++	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+ 
+-	vptest	%ymm0, %ymm5
+-	jnc	L(4x_vec_end)
+-	xorl	%eax, %eax
++	/* Reduce VEC0 - VEC4.  */
++	vpand	%ymm1, %ymm2, %ymm5
++	vpand	%ymm3, %ymm4, %ymm6
++	vpand	%ymm5, %ymm6, %ymm7
++	vpmovmskb %ymm7, %ecx
++	incl	%ecx
++	jnz	L(return_vec_0_1_2_3)
++	/* NB: eax must be zero to reach here.  */
++	VZEROUPPER_RETURN
++
++	.p2align 4
++L(return_vec_0):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	(%rdi, %rax), %ecx
++	xorl	%edx, %edx
++	cmpl	(%rsi, %rax), %ecx
++	/* NB: no partial register stall here because xorl zero idiom
++	   above.  */
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	(%rsi, %rax), %ecx
++	movzbl	(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
+ L(return_vzeroupper):
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
++L(return_vec_1):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	VEC_SIZE(%rdi, %rax), %ecx
++	xorl	%edx, %edx
++	cmpl	VEC_SIZE(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	VEC_SIZE(%rsi, %rax), %ecx
++	movzbl	VEC_SIZE(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
++	VZEROUPPER_RETURN
++
++	.p2align 4
++L(return_vec_2):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
++	xorl	%edx, %edx
++	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
++	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
++	VZEROUPPER_RETURN
++
++	/* NB: p2align 5 here to ensure 4x loop is 32 byte aligned.  */
++	.p2align 5
++L(8x_return_vec_0_1_2_3):
++	/* Returning from L(more_8x_vec) requires restoring rsi.  */
++	addq	%rdi, %rsi
++L(return_vec_0_1_2_3):
++	vpmovmskb %ymm1, %eax
++	incl	%eax
++	jnz	L(return_vec_0)
+ 
+-L(last_vec):
+-	/* Use overlapping loads to avoid branches.  */
+-	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
+-	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+ 	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
++	incl	%eax
++	jnz	L(return_vec_1)
++
++	vpmovmskb %ymm3, %eax
++	incl	%eax
++	jnz	L(return_vec_2)
++L(return_vec_3):
++	tzcntl	%ecx, %ecx
++# ifdef USE_AS_WMEMCMP
++	movl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
++	xorl	%edx, %edx
++	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %eax
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++# endif
++	VZEROUPPER_RETURN
++
++	.p2align 4
++L(more_8x_vec):
++	/* Set end of s1 in rdx.  */
++	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
++	/* rsi stores s2 - s1.  This allows the loop to update only one
++	   pointer.  */
++	subq	%rdi, %rsi
++	/* Align s1 pointer.  */
++	andq	$-VEC_SIZE, %rdi
++	/* Adjust because the first 4x VEC were already checked.  */
++	subq	$-(VEC_SIZE * 4), %rdi
++	.p2align 4
++L(loop_4x_vec):
++	/* rsi has s2 - s1 so get correct address by adding s1 (in rdi).
++	 */
++	vmovdqu	(%rsi, %rdi), %ymm1
++	VPCMPEQ	(%rdi), %ymm1, %ymm1
++
++	vmovdqu	VEC_SIZE(%rsi, %rdi), %ymm2
++	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
++
++	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdi), %ymm3
++	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
++
++	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdi), %ymm4
++	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
++
++	vpand	%ymm1, %ymm2, %ymm5
++	vpand	%ymm3, %ymm4, %ymm6
++	vpand	%ymm5, %ymm6, %ymm7
++	vpmovmskb %ymm7, %ecx
++	incl	%ecx
++	jnz	L(8x_return_vec_0_1_2_3)
++	subq	$-(VEC_SIZE * 4), %rdi
++	/* Check if s1 pointer at end.  */
++	cmpq	%rdx, %rdi
++	jb	L(loop_4x_vec)
++
++	subq	%rdx, %rdi
++	/* rdi has 4 * VEC_SIZE - remaining length.  */
++	cmpl	$(VEC_SIZE * 3), %edi
++	jae	L(8x_last_1x_vec)
++	/* Load regardless of branch.  */
++	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdx), %ymm3
++	cmpl	$(VEC_SIZE * 2), %edi
++	jae	L(8x_last_2x_vec)
++
++	/* Check last 4 VEC.  */
++	vmovdqu	(%rsi, %rdx), %ymm1
++	VPCMPEQ	(%rdx), %ymm1, %ymm1
++
++	vmovdqu	VEC_SIZE(%rsi, %rdx), %ymm2
++	VPCMPEQ	VEC_SIZE(%rdx), %ymm2, %ymm2
++
++	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
++
++	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
++	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
++
++	vpand	%ymm1, %ymm2, %ymm5
++	vpand	%ymm3, %ymm4, %ymm6
++	vpand	%ymm5, %ymm6, %ymm7
++	vpmovmskb %ymm7, %ecx
++	/* Restore s1 pointer to rdi.  */
++	movq	%rdx, %rdi
++	incl	%ecx
++	jnz	L(8x_return_vec_0_1_2_3)
++	/* NB: eax must be zero to reach here.  */
++	VZEROUPPER_RETURN
++
++	/* Only entry is from L(more_8x_vec).  */
++	.p2align 4
++L(8x_last_2x_vec):
++	/* Check the second to last VEC.  rdx stores the end pointer of s1
++	   and ymm3 has already been loaded with the second to last VEC
++	   from s2.  */
++	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
++	vpmovmskb %ymm3, %eax
++	incl	%eax
++	jnz	L(8x_return_vec_2)
++	/* Check last VEC.  */
++	.p2align 4
++L(8x_last_1x_vec):
++	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
++	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
++	vpmovmskb %ymm4, %eax
++	incl	%eax
++	jnz	L(8x_return_vec_3)
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec):
+-	/* A byte or int32 is different within 16 or 32 bytes.  */
+-	tzcntl	%eax, %ecx
++L(last_2x_vec):
++	/* Check second to last VEC.  */
++	vmovdqu	-(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
++	VPCMPEQ	-(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
++	vpmovmskb %ymm1, %eax
++	incl	%eax
++	jnz	L(return_vec_1_end)
++	/* Check last VEC.  */
++L(last_1x_vec):
++	vmovdqu	-(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
++	VPCMPEQ	-(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
++	vpmovmskb %ymm1, %eax
++	incl	%eax
++	jnz	L(return_vec_0_end)
++	VZEROUPPER_RETURN
++
++	.p2align 4
++L(8x_return_vec_2):
++	subq	$VEC_SIZE, %rdx
++L(8x_return_vec_3):
++	tzcntl	%eax, %eax
++	addq	%rdx, %rax
+ # ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(%rdi, %rcx), %edx
+-	cmpl	(%rsi, %rcx), %edx
+-L(wmemcmp_return):
+-	setl	%al
+-	negl	%eax
+-	orl	$1, %eax
++	movl	(VEC_SIZE * 3)(%rax), %ecx
++	xorl	%edx, %edx
++	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
+ # else
+-	movzbl	(%rdi, %rcx), %eax
+-	movzbl	(%rsi, %rcx), %edx
+-	sub	%edx, %eax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
++	movzbl	(VEC_SIZE * 3)(%rax), %eax
++	subl	%ecx, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+ 
+-# ifdef USE_AS_WMEMCMP
+ 	.p2align 4
+-L(4):
+-	xorl	%eax, %eax
+-	movl	(%rdi), %edx
+-	cmpl	(%rsi), %edx
+-	jne	L(wmemcmp_return)
+-	ret
++L(return_vec_1_end):
++	tzcntl	%eax, %eax
++	addl	%edx, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	-(VEC_SIZE * 2)(%rdi, %rax), %ecx
++	xorl	%edx, %edx
++	cmpl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
+ # else
++	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
++	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
++	VZEROUPPER_RETURN
++
+ 	.p2align 4
+-L(between_4_7):
+-	/* Load as big endian with overlapping movbe to avoid branches.  */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	je	L(exit)
+-	sbbl	%eax, %eax
+-	orl	$1, %eax
+-	ret
++L(return_vec_0_end):
++	tzcntl	%eax, %eax
++	addl	%edx, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	-VEC_SIZE(%rdi, %rax), %ecx
++	xorl	%edx, %edx
++	cmpl	-VEC_SIZE(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
++	movzbl	-VEC_SIZE(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(exit):
+-	ret
++L(less_vec):
++	/* Check if one or less CHAR. This is necessary for size = 0 but
++	   is also faster for size = CHAR_SIZE.  */
++	cmpl	$CHAR_SIZE, %edx
++	jbe	L(one_or_less)
++
++	/* Check if loading one VEC from either s1 or s2 could cause a
++	   page cross. This can have false positives but is by far the
++	   fastest method.  */
++	movl	%edi, %eax
++	orl	%esi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	jg	L(page_cross_less_vec)
++
++	/* No page cross possible.  */
++	vmovdqu	(%rsi), %ymm2
++	VPCMPEQ	(%rdi), %ymm2, %ymm2
++	vpmovmskb %ymm2, %eax
++	incl	%eax
++	/* Result will be zero if s1 and s2 match. Otherwise first set
++	   bit will be first mismatch.  */
++	bzhil	%edx, %eax, %edx
++	jnz	L(return_vec_0)
++	xorl	%eax, %eax
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(between_2_3):
++L(page_cross_less_vec):
++	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
++	   bytes.  */
++	cmpl	$16, %edx
++	jae	L(between_16_31)
++# ifndef USE_AS_WMEMCMP
++	cmpl	$8, %edx
++	jae	L(between_8_15)
++	cmpl	$4, %edx
++	jae	L(between_4_7)
++
+ 	/* Load as big endian to avoid branches.  */
+ 	movzwl	(%rdi), %eax
+ 	movzwl	(%rsi), %ecx
+@@ -208,223 +439,106 @@ L(between_2_3):
+ 	shll	$8, %ecx
+ 	bswap	%eax
+ 	bswap	%ecx
+-	movb	-1(%rdi, %rdx), %al
+-	movb	-1(%rsi, %rdx), %cl
++	movzbl	-1(%rdi, %rdx), %edi
++	movzbl	-1(%rsi, %rdx), %esi
++	orl	%edi, %eax
++	orl	%esi, %ecx
+ 	/* Subtraction is okay because the upper 8 bits are zero.  */
+ 	subl	%ecx, %eax
++	/* No ymm register was touched.  */
+ 	ret
+ 
+ 	.p2align 4
+-L(1):
+-	movzbl	(%rdi), %eax
++L(one_or_less):
++	jb	L(zero)
+ 	movzbl	(%rsi), %ecx
++	movzbl	(%rdi), %eax
+ 	subl	%ecx, %eax
+-	ret
+-# endif
+-
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
++	/* No ymm register was touched.  */
+ 	ret
+ 
+ 	.p2align 4
+-L(less_vec):
+-# ifdef USE_AS_WMEMCMP
+-	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
+-	cmpb	$4, %dl
+-	je	L(4)
+-	jb	L(zero)
+-# else
+-	cmpb	$1, %dl
+-	je	L(1)
+-	jb	L(zero)
+-	cmpb	$4, %dl
+-	jb	L(between_2_3)
+-	cmpb	$8, %dl
+-	jb	L(between_4_7)
++L(between_8_15):
+ # endif
+-	cmpb	$16, %dl
+-	jae	L(between_16_31)
+-	/* It is between 8 and 15 bytes.  */
++	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+ 	vmovq	(%rdi), %xmm1
+ 	vmovq	(%rsi), %xmm2
+-	VPCMPEQ %xmm1, %xmm2, %xmm2
++	VPCMPEQ	%xmm1, %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+-	subl    $0xffff, %eax
+-	jnz	L(first_vec)
++	subl	$0xffff, %eax
++	jnz	L(return_vec_0)
+ 	/* Use overlapping loads to avoid branches.  */
+ 	leaq	-8(%rdi, %rdx), %rdi
+ 	leaq	-8(%rsi, %rdx), %rsi
+ 	vmovq	(%rdi), %xmm1
+ 	vmovq	(%rsi), %xmm2
+-	VPCMPEQ %xmm1, %xmm2, %xmm2
++	VPCMPEQ	%xmm1, %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+-	subl    $0xffff, %eax
+-	jnz	L(first_vec)
++	subl	$0xffff, %eax
++	jnz	L(return_vec_0)
++	/* No ymm register was touched.  */
++	ret
++
++	.p2align 4
++L(zero):
++	xorl	%eax, %eax
+ 	ret
+ 
+ 	.p2align 4
+ L(between_16_31):
+ 	/* From 16 to 31 bytes.  No branch when size == 16.  */
+ 	vmovdqu	(%rsi), %xmm2
+-	VPCMPEQ (%rdi), %xmm2, %xmm2
++	VPCMPEQ	(%rdi), %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+-	subl    $0xffff, %eax
+-	jnz	L(first_vec)
++	subl	$0xffff, %eax
++	jnz	L(return_vec_0)
+ 
+ 	/* Use overlapping loads to avoid branches.  */
++
++	vmovdqu	-16(%rsi, %rdx), %xmm2
+ 	leaq	-16(%rdi, %rdx), %rdi
+ 	leaq	-16(%rsi, %rdx), %rsi
+-	vmovdqu	(%rsi), %xmm2
+-	VPCMPEQ (%rdi), %xmm2, %xmm2
++	VPCMPEQ	(%rdi), %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+-	subl    $0xffff, %eax
+-	jnz	L(first_vec)
++	subl	$0xffff, %eax
++	jnz	L(return_vec_0)
++	/* No ymm register was touched.  */
+ 	ret
+ 
+-	.p2align 4
+-L(more_8x_vec):
+-	/* More than 8 * VEC.  Check the first VEC.  */
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	/* Align the first memory area for aligned loads in the loop.
+-	   Compute how much the first memory area is misaligned.  */
+-	movq	%rdi, %rcx
+-	andl	$(VEC_SIZE - 1), %ecx
+-	/* Get the negative of offset for alignment.  */
+-	subq	$VEC_SIZE, %rcx
+-	/* Adjust the second memory area.  */
+-	subq	%rcx, %rsi
+-	/* Adjust the first memory area which should be aligned now.  */
+-	subq	%rcx, %rdi
+-	/* Adjust length.  */
+-	addq	%rcx, %rdx
+-
+-L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	vmovdqu	(%rsi), %ymm1
+-	VPCMPEQ (%rdi), %ymm1, %ymm1
+-
+-	vmovdqu	VEC_SIZE(%rsi), %ymm2
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+-	vpand	%ymm2, %ymm1, %ymm5
+-
+-	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+-	vpand	%ymm3, %ymm5, %ymm5
+-
+-	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+-	vpand	%ymm4, %ymm5, %ymm5
+-
+-	vptest	%ymm0, %ymm5
+-	jnc	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-	addq	$(VEC_SIZE * 4), %rsi
+-
+-	subq	$(VEC_SIZE * 4), %rdx
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	jae	L(loop_4x_vec)
+-
+-	/* Less than 4 * VEC.  */
+-	cmpq	$VEC_SIZE, %rdx
+-	jbe	L(last_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
+-	jbe	L(last_2x_vec)
+-
+-L(last_4x_vec):
+-	/* From 2 * VEC to 4 * VEC. */
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	addq	$VEC_SIZE, %rdi
+-	addq	$VEC_SIZE, %rsi
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	/* Use overlapping loads to avoid branches.  */
+-	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+-	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	addq	$VEC_SIZE, %rdi
+-	addq	$VEC_SIZE, %rsi
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-	VZEROUPPER_RETURN
+-
+-	.p2align 4
+-L(4x_vec_end):
+-	vpmovmskb %ymm1, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec)
+-	vpmovmskb %ymm2, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec_x1)
+-	vpmovmskb %ymm3, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec_x2)
+-	vpmovmskb %ymm4, %eax
+-	subl	$VEC_MASK, %eax
+-	tzcntl	%eax, %ecx
+ # ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
+-	VZEROUPPER_RETURN
+-
+ 	.p2align 4
+-L(first_vec_x1):
+-	tzcntl	%eax, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	VEC_SIZE(%rdi, %rcx), %edx
+-	cmpl	VEC_SIZE(%rsi, %rcx), %edx
+-	jmp	L(wmemcmp_return)
++L(one_or_less):
++	jb	L(zero)
++	movl	(%rdi), %ecx
++	xorl	%edx, %edx
++	cmpl	(%rsi), %ecx
++	je	L(zero)
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++	/* No ymm register was touched.  */
++	ret
+ # else
+-	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+-	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
+-	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x2):
+-	tzcntl	%eax, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
+-	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+-	sub	%edx, %eax
++L(between_4_7):
++	/* Load as big endian with overlapping movbe to avoid branches.
++	 */
++	movbe	(%rdi), %eax
++	movbe	(%rsi), %ecx
++	shlq	$32, %rax
++	shlq	$32, %rcx
++	movbe	-4(%rdi, %rdx), %edi
++	movbe	-4(%rsi, %rdx), %esi
++	orq	%rdi, %rax
++	orq	%rsi, %rcx
++	subq	%rcx, %rax
++	jz	L(zero_4_7)
++	sbbl	%eax, %eax
++	orl	$1, %eax
++L(zero_4_7):
++	/* No ymm register was touched.  */
++	ret
+ # endif
+-	VZEROUPPER_RETURN
++
+ END (MEMCMP)
+ #endif
+-- 
+GitLab
+
diff --git a/SOURCES/ia-opt-memcmp-evex-movbe-1.patch b/SOURCES/ia-opt-memcmp-evex-movbe-1.patch
new file mode 100644
index 0000000..0216ed0
--- /dev/null
+++ b/SOURCES/ia-opt-memcmp-evex-movbe-1.patch
@@ -0,0 +1,851 @@
+From 2d8eaea7ad74328d806d3f1a377f1168eaa2f348 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 17 May 2021 13:57:24 -0400
+Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S
+
+No bug. This commit optimizes memcmp-evex.S. The optimizations include
+adding a new vec compare path for small sizes, reorganizing the entry
+control flow, removing some unnecessary ALU instructions from the main
+loop, and most importantly replacing the heavy use of vpcmp + kand
+logic with vpxor + vptern. test-memcmp and test-wmemcmp are both
+passing.
+
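+For intuition, a rough C sketch of the vpxor + vptern reduction idea
+(a scalar stand-in only, not the glibc implementation; the real code
+works on EVEX-encoded ymm registers with vpxorq/vpternlogd, and the
+helper name below is purely illustrative):
+
+    #include <stdint.h>
+
+    /* The XOR of two equal chunks is zero, so OR-ing the four XOR
+       results and testing once detects a mismatch anywhere in the
+       four chunks, replacing four equality compares plus mask ANDs.  */
+    static int any_mismatch_4 (const uint64_t *a, const uint64_t *b)
+    {
+      uint64_t d0 = a[0] ^ b[0];
+      uint64_t d1 = a[1] ^ b[1];
+      uint64_t d2 = a[2] ^ b[2];
+      uint64_t d3 = a[3] ^ b[3];
+      return ((d0 | d1) | (d2 | d3)) != 0;
+    }
+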
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit 4ad473e97acdc5f6d811755b67c09f2128a644ce)
+---
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 710 +++++++++++--------
+ 1 file changed, 408 insertions(+), 302 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 9c093972..654dc7ac 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -19,17 +19,22 @@
+ #if IS_IN (libc)
+ 
+ /* memcmp/wmemcmp is implemented as:
+-   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+-      to avoid branches.
+-   2. Use overlapping compare to avoid branch.
+-   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+-      bytes for wmemcmp.
+-   4. If size is 8 * VEC_SIZE or less, unroll the loop.
+-   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
++   1. Use ymm vector compares when possible. The only case where
++      vector compares are not possible is when size < CHAR_PER_VEC
++      and loading from either s1 or s2 would cause a page cross.
++   2. For size from 2 to 7 bytes on page cross, load as big endian
++      with movbe and bswap to avoid branches.
++   3. Use xmm vector compare when size >= 4 bytes for memcmp or
++      size >= 8 bytes for wmemcmp.
++   4. Optimistically compare up to the first 4 * CHAR_PER_VEC one at a
++      time to check for early mismatches. Only do this if it is
++      guaranteed the work is not wasted.
++   5. If size is 8 * VEC_SIZE or less, unroll the loop.
++   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
+       area.
+-   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+-   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+-   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
++   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
++   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
++   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
+ 
+ # include <sysdep.h>
+ 
+@@ -40,11 +45,21 @@
+ # define VMOVU		vmovdqu64
+ 
+ # ifdef USE_AS_WMEMCMP
+-#  define VPCMPEQ	vpcmpeqd
++#  define CHAR_SIZE	4
++#  define VPCMP	vpcmpd
+ # else
+-#  define VPCMPEQ	vpcmpeqb
++#  define CHAR_SIZE	1
++#  define VPCMP	vpcmpub
+ # endif
+ 
++# define VEC_SIZE	32
++# define PAGE_SIZE	4096
++# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
++
++# define XMM0		xmm16
++# define XMM1		xmm17
++# define XMM2		xmm18
++# define YMM0		ymm16
+ # define XMM1		xmm17
+ # define XMM2		xmm18
+ # define YMM1		ymm17
+@@ -54,15 +69,6 @@
+ # define YMM5		ymm21
+ # define YMM6		ymm22
+ 
+-# define VEC_SIZE 32
+-# ifdef USE_AS_WMEMCMP
+-#  define VEC_MASK 0xff
+-#  define XMM_MASK 0xf
+-# else
+-#  define VEC_MASK 0xffffffff
+-#  define XMM_MASK 0xffff
+-# endif
+-
+ /* Warning!
+            wmemcmp has to use SIGNED comparison for elements.
+            memcmp has to use UNSIGNED comparison for elemnts.
+@@ -70,145 +76,370 @@
+ 
+ 	.section .text.evex,"ax",@progbits
+ ENTRY (MEMCMP)
+-# ifdef USE_AS_WMEMCMP
+-	shl	$2, %RDX_LP
+-# elif defined __ILP32__
++# ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%edx, %edx
+ # endif
+-	cmp	$VEC_SIZE, %RDX_LP
++	cmp	$CHAR_PER_VEC, %RDX_LP
+ 	jb	L(less_vec)
+ 
+ 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k1
++	VMOVU	(%rsi), %YMM1
++	/* Use compare not equals to directly check for mismatch.  */
++	VPCMP	$4, (%rdi), %YMM1, %k1
+ 	kmovd	%k1, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	cmpq	$(VEC_SIZE * 2), %rdx
+-	jbe	L(last_vec)
+-
+-	/* More than 2 * VEC.  */
+-	cmpq	$(VEC_SIZE * 8), %rdx
+-	ja	L(more_8x_vec)
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	jb	L(last_4x_vec)
++	/* NB: eax must be destination register if going to
++	   L(return_vec_[0,2]). For L(return_vec_3 destination register
++	   must be ecx.  */
++	testl	%eax, %eax
++	jnz	L(return_vec_0)
+ 
+-	/* From 4 * VEC to 8 * VEC, inclusively. */
+-	VMOVU	(%rsi), %YMM1
+-	VPCMPEQ (%rdi), %YMM1, %k1
++	cmpq	$(CHAR_PER_VEC * 2), %rdx
++	jbe	L(last_1x_vec)
+ 
++	/* Check second VEC no matter what.  */
+ 	VMOVU	VEC_SIZE(%rsi), %YMM2
+-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
++	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_1)
++
++	/* Less than 4 * VEC.  */
++	cmpq	$(CHAR_PER_VEC * 4), %rdx
++	jbe	L(last_2x_vec)
+ 
++	/* Check third and fourth VEC no matter what.  */
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
++	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_2)
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
++	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
++	kmovd	%k1, %ecx
++	testl	%ecx, %ecx
++	jnz	L(return_vec_3)
+ 
+-	kandd	%k1, %k2, %k5
+-	kandd	%k3, %k4, %k6
+-	kandd	%k5, %k6, %k6
++	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
++	   compare with zero to get a mask is needed.  */
++	vpxorq	%XMM0, %XMM0, %XMM0
+ 
+-	kmovd	%k6, %eax
+-	cmpl	$VEC_MASK, %eax
+-	jne	L(4x_vec_end)
++	/* Go to 4x VEC loop.  */
++	cmpq	$(CHAR_PER_VEC * 8), %rdx
++	ja	L(more_8x_vec)
+ 
+-	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+-	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+-	VMOVU	(%rsi), %YMM1
+-	VPCMPEQ (%rdi), %YMM1, %k1
++	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
++	   branches.  */
+ 
+-	VMOVU	VEC_SIZE(%rsi), %YMM2
+-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+-	kandd	%k1, %k2, %k5
++	/* Load first two VEC from s2 before adjusting addresses.  */
++	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
++	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
++	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
++	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
++
++	/* Wait to load from s1 until the addresses have been adjusted,
++	   due to unlamination of microfusion with complex address mode.  */
++
++	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
++	   will have some 1s.  */
++	vpxorq	(%rdi), %YMM1, %YMM1
++	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2
+ 
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+-	kandd	%k3, %k5, %k5
++	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
++	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
++	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+-	kandd	%k4, %k5, %k5
++	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
++	   oring with YMM3. Result is stored in YMM4.  */
++	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
++	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
++	VPCMP	$4, %YMM4, %YMM0, %k1
++	kmovd	%k1, %ecx
++	testl	%ecx, %ecx
++	jnz	L(return_vec_0_1_2_3)
++	/* NB: eax must be zero to reach here.  */
++	ret
+ 
+-	kmovd	%k5, %eax
+-	cmpl	$VEC_MASK, %eax
+-	jne	L(4x_vec_end)
+-	xorl	%eax, %eax
++	/* NB: aligning 32 here allows for the rest of the jump targets
++	   to be tuned for 32 byte alignment. Most important this ensures
++	   the L(more_8x_vec) loop is 32 byte aligned.  */
++	.p2align 5
++L(less_vec):
++	/* Check if one or less CHAR. This is necessary for size = 0 but
++	   is also faster for size = CHAR_SIZE.  */
++	cmpl	$1, %edx
++	jbe	L(one_or_less)
++
++	/* Check if loading one VEC from either s1 or s2 could cause a
++	   page cross. This can have false positives but is by far the
++	   fastest method.  */
++	movl	%edi, %eax
++	orl	%esi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	jg	L(page_cross_less_vec)
++
++	/* No page cross possible.  */
++	VMOVU	(%rsi), %YMM2
++	VPCMP	$4, (%rdi), %YMM2, %k1
++	kmovd	%k1, %eax
++	/* Create mask in ecx for potentially in bound matches.  */
++	bzhil	%edx, %eax, %eax
++	jnz	L(return_vec_0)
+ 	ret
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
++L(return_vec_0):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	(%rdi, %rax, CHAR_SIZE), %ecx
++	xorl	%edx, %edx
++	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
++	/* NB: no partial register stall here because xorl zero idiom
++	   above.  */
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	(%rsi, %rax), %ecx
++	movzbl	(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
++	ret
+ 
+-L(last_vec):
+-	/* Use overlapping loads to avoid branches.  */
+-	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
+-	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
++	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
++	   which is good enough for a target not in a loop.  */
++L(return_vec_1):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
++	xorl	%edx, %edx
++	cmpl	VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	VEC_SIZE(%rsi, %rax), %ecx
++	movzbl	VEC_SIZE(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
+ 	ret
+ 
+-	.p2align 4
+-L(first_vec):
+-	/* A byte or int32 is different within 16 or 32 bytes.  */
+-	tzcntl	%eax, %ecx
++	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
++	   which is good enough for a target not in a loop.  */
++L(return_vec_2):
++	tzcntl	%eax, %eax
+ # ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(%rdi, %rcx, 4), %edx
+-	cmpl	(%rsi, %rcx, 4), %edx
+-L(wmemcmp_return):
+-	setl	%al
+-	negl	%eax
+-	orl	$1, %eax
++	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
++	xorl	%edx, %edx
++	cmpl	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
+ # else
+-	movzbl	(%rdi, %rcx), %eax
+-	movzbl	(%rsi, %rcx), %edx
+-	sub	%edx, %eax
++	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
++	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
++	subl	%ecx, %eax
+ # endif
+ 	ret
+ 
++	.p2align 4
++L(8x_return_vec_0_1_2_3):
++	/* Returning from L(more_8x_vec) requires restoring rsi.  */
++	addq	%rdi, %rsi
++L(return_vec_0_1_2_3):
++	VPCMP	$4, %YMM1, %YMM0, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_0)
++
++	VPCMP	$4, %YMM2, %YMM0, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_1)
++
++	VPCMP	$4, %YMM3, %YMM0, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_2)
++L(return_vec_3):
++	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_WMEMCMP
++	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
++	xorl	%edx, %edx
++	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++# endif
++	ret
++
+ 	.p2align 4
+-L(4):
+-	xorl	%eax, %eax
+-	movl	(%rdi), %edx
+-	cmpl	(%rsi), %edx
+-	jne	L(wmemcmp_return)
++L(more_8x_vec):
++	/* Set end of s1 in rdx.  */
++	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
++	/* rsi stores s2 - s1. This allows loop to only update one
++	   pointer.  */
++	subq	%rdi, %rsi
++	/* Align s1 pointer.  */
++	andq	$-VEC_SIZE, %rdi
++	/* Adjust because the first 4x vec were checked already.  */
++	subq	$-(VEC_SIZE * 4), %rdi
++	.p2align 4
++L(loop_4x_vec):
++	VMOVU	(%rsi, %rdi), %YMM1
++	vpxorq	(%rdi), %YMM1, %YMM1
++
++	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
++	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
++
++	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
++	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
++	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
++
++	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
++	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
++	VPCMP	$4, %YMM4, %YMM0, %k1
++	kmovd	%k1, %ecx
++	testl	%ecx, %ecx
++	jnz	L(8x_return_vec_0_1_2_3)
++	subq	$-(VEC_SIZE * 4), %rdi
++	cmpq	%rdx, %rdi
++	jb	L(loop_4x_vec)
++
++	subq	%rdx, %rdi
++	/* rdi has 4 * VEC_SIZE - remaining length.  */
++	cmpl	$(VEC_SIZE * 3), %edi
++	jae	L(8x_last_1x_vec)
++	/* Load regardless of branch.  */
++	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
++	cmpl	$(VEC_SIZE * 2), %edi
++	jae	L(8x_last_2x_vec)
++
++	VMOVU	(%rsi, %rdx), %YMM1
++	vpxorq	(%rdx), %YMM1, %YMM1
++
++	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
++	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
++
++	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
++	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
++
++	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
++	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
++	VPCMP	$4, %YMM4, %YMM0, %k1
++	kmovd	%k1, %ecx
++	/* Restore s1 pointer to rdi.  */
++	movq	%rdx, %rdi
++	testl	%ecx, %ecx
++	jnz	L(8x_return_vec_0_1_2_3)
++	/* NB: eax must be zero to reach here.  */
++	ret
++
++	/* Only entry is from L(more_8x_vec).  */
++	.p2align 4
++L(8x_last_2x_vec):
++	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(8x_return_vec_2)
++	/* Naturally aligned to 16 bytes.  */
++L(8x_last_1x_vec):
++	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
++	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(8x_return_vec_3)
++	ret
++
++	.p2align 4
++L(last_2x_vec):
++	/* Check second to last VEC.  */
++	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
++	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_1_end)
++
++	/* Check last VEC.  */
++	.p2align 4
++L(last_1x_vec):
++	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
++	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_0_end)
+ 	ret
++
++	.p2align 4
++L(8x_return_vec_2):
++	subq	$VEC_SIZE, %rdx
++L(8x_return_vec_3):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	leaq	(%rdx, %rax, CHAR_SIZE), %rax
++	movl	(VEC_SIZE * 3)(%rax), %ecx
++	xorl	%edx, %edx
++	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
+ # else
++	addq	%rdx, %rax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
++	movzbl	(VEC_SIZE * 3)(%rax), %eax
++	subl	%ecx, %eax
++# endif
++	ret
++
+ 	.p2align 4
+-L(between_4_7):
+-	/* Load as big endian with overlapping movbe to avoid branches.  */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	je	L(exit)
+-	sbbl	%eax, %eax
+-	orl	$1, %eax
++L(return_vec_0_end):
++	tzcntl	%eax, %eax
++	addl	%edx, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
++	xorl	%edx, %edx
++	cmpl	-VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
++	movzbl	-VEC_SIZE(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
+ 	ret
+ 
+ 	.p2align 4
+-L(exit):
++L(return_vec_1_end):
++	tzcntl	%eax, %eax
++	addl	%edx, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
++	xorl	%edx, %edx
++	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
++	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
+ 	ret
+ 
++
+ 	.p2align 4
++L(page_cross_less_vec):
++	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
++	   bytes.  */
++	cmpl	$(16 / CHAR_SIZE), %edx
++	jae	L(between_16_31)
++# ifndef USE_AS_WMEMCMP
++	cmpl	$8, %edx
++	jae	L(between_8_15)
++	cmpl	$4, %edx
++	jae	L(between_4_7)
+ L(between_2_3):
+ 	/* Load as big endian to avoid branches.  */
+ 	movzwl	(%rdi), %eax
+@@ -217,224 +448,99 @@ L(between_2_3):
+ 	shll	$8, %ecx
+ 	bswap	%eax
+ 	bswap	%ecx
+-	movb	-1(%rdi, %rdx), %al
+-	movb	-1(%rsi, %rdx), %cl
++	movzbl	-1(%rdi, %rdx), %edi
++	movzbl	-1(%rsi, %rdx), %esi
++	orl	%edi, %eax
++	orl	%esi, %ecx
+ 	/* Subtraction is okay because the upper 8 bits are zero.  */
+ 	subl	%ecx, %eax
+ 	ret
+-
+ 	.p2align 4
+-L(1):
+-	movzbl	(%rdi), %eax
++L(one_or_less):
++	jb	L(zero)
+ 	movzbl	(%rsi), %ecx
++	movzbl	(%rdi), %eax
+ 	subl	%ecx, %eax
+ 	ret
+-# endif
+-
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+ 
+ 	.p2align 4
+-L(less_vec):
+-# ifdef USE_AS_WMEMCMP
+-	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
+-	cmpb	$4, %dl
+-	je	L(4)
+-	jb	L(zero)
+-# else
+-	cmpb	$1, %dl
+-	je	L(1)
+-	jb	L(zero)
+-	cmpb	$4, %dl
+-	jb	L(between_2_3)
+-	cmpb	$8, %dl
+-	jb	L(between_4_7)
++L(between_8_15):
+ # endif
+-	cmpb	$16, %dl
+-	jae	L(between_16_31)
+-	/* It is between 8 and 15 bytes.  */
++	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+ 	vmovq	(%rdi), %XMM1
+ 	vmovq	(%rsi), %XMM2
+-	VPCMPEQ %XMM1, %XMM2, %k2
+-	kmovw	%k2, %eax
+-	subl    $XMM_MASK, %eax
+-	jnz	L(first_vec)
++	VPCMP	$4, %XMM1, %XMM2, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_0)
+ 	/* Use overlapping loads to avoid branches.  */
+-	leaq	-8(%rdi, %rdx), %rdi
+-	leaq	-8(%rsi, %rdx), %rsi
++	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
++	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
+ 	vmovq	(%rdi), %XMM1
+ 	vmovq	(%rsi), %XMM2
+-	VPCMPEQ %XMM1, %XMM2, %k2
+-	kmovw	%k2, %eax
+-	subl    $XMM_MASK, %eax
+-	jnz	L(first_vec)
++	VPCMP	$4, %XMM1, %XMM2, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_0)
+ 	ret
+ 
+ 	.p2align 4
+-L(between_16_31):
+-	/* From 16 to 31 bytes.  No branch when size == 16.  */
+-	VMOVU	(%rsi), %XMM2
+-	VPCMPEQ (%rdi), %XMM2, %k2
+-	kmovw	%k2, %eax
+-	subl    $XMM_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	/* Use overlapping loads to avoid branches.  */
+-	leaq	-16(%rdi, %rdx), %rdi
+-	leaq	-16(%rsi, %rdx), %rsi
+-	VMOVU	(%rsi), %XMM2
+-	VPCMPEQ (%rdi), %XMM2, %k2
+-	kmovw	%k2, %eax
+-	subl    $XMM_MASK, %eax
+-	jnz	L(first_vec)
++L(zero):
++	xorl	%eax, %eax
+ 	ret
+ 
+ 	.p2align 4
+-L(more_8x_vec):
+-	/* More than 8 * VEC.  Check the first VEC.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	/* Align the first memory area for aligned loads in the loop.
+-	   Compute how much the first memory area is misaligned.  */
+-	movq	%rdi, %rcx
+-	andl	$(VEC_SIZE - 1), %ecx
+-	/* Get the negative of offset for alignment.  */
+-	subq	$VEC_SIZE, %rcx
+-	/* Adjust the second memory area.  */
+-	subq	%rcx, %rsi
+-	/* Adjust the first memory area which should be aligned now.  */
+-	subq	%rcx, %rdi
+-	/* Adjust length.  */
+-	addq	%rcx, %rdx
+-
+-L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	VMOVU	(%rsi), %YMM1
+-	VPCMPEQ (%rdi), %YMM1, %k1
+-
+-	VMOVU	VEC_SIZE(%rsi), %YMM2
+-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+-	kandd	%k2, %k1, %k5
+-
+-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+-	kandd	%k3, %k5, %k5
+-
+-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+-	kandd	%k4, %k5, %k5
+-
+-	kmovd	%k5, %eax
+-	cmpl	$VEC_MASK, %eax
+-	jne	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-	addq	$(VEC_SIZE * 4), %rsi
+-
+-	subq	$(VEC_SIZE * 4), %rdx
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	jae	L(loop_4x_vec)
+-
+-	/* Less than 4 * VEC.  */
+-	cmpq	$VEC_SIZE, %rdx
+-	jbe	L(last_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
+-	jbe	L(last_2x_vec)
+-
+-L(last_4x_vec):
+-	/* From 2 * VEC to 4 * VEC. */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	addq	$VEC_SIZE, %rdi
+-	addq	$VEC_SIZE, %rsi
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
++L(between_16_31):
++	/* From 16 to 31 bytes.  No branch when size == 16.  */
++	VMOVU	(%rsi), %XMM2
++	VPCMP	$4, (%rdi), %XMM2, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_0)
+ 
+ 	/* Use overlapping loads to avoid branches.  */
+-	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+-	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+ 
+-	addq	$VEC_SIZE, %rdi
+-	addq	$VEC_SIZE, %rsi
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-	ret
+-
+-	.p2align 4
+-L(4x_vec_end):
++	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
++	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
++	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
++	VPCMP	$4, (%rdi), %XMM2, %k1
+ 	kmovd	%k1, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec)
+-	kmovd	%k2, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec_x1)
+-	kmovd	%k3, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec_x2)
+-	kmovd	%k4, %eax
+-	subl	$VEC_MASK, %eax
+-	tzcntl	%eax, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
++	testl	%eax, %eax
++	jnz	L(return_vec_0)
+ 	ret
+ 
+-	.p2align 4
+-L(first_vec_x1):
+-	tzcntl	%eax, %ecx
+ # ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	VEC_SIZE(%rdi, %rcx, 4), %edx
+-	cmpl	VEC_SIZE(%rsi, %rcx, 4), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+-	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
++	.p2align 4
++L(one_or_less):
++	jb	L(zero)
++	movl	(%rdi), %ecx
++	xorl	%edx, %edx
++	cmpl	(%rsi), %ecx
++	je	L(zero)
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
+ 	ret
++# else
+ 
+ 	.p2align 4
+-L(first_vec_x2):
+-	tzcntl	%eax, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
+-	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
++L(between_4_7):
++	/* Load as big endian with overlapping movbe to avoid branches.
++	 */
++	movbe	(%rdi), %eax
++	movbe	(%rsi), %ecx
++	shlq	$32, %rax
++	shlq	$32, %rcx
++	movbe	-4(%rdi, %rdx), %edi
++	movbe	-4(%rsi, %rdx), %esi
++	orq	%rdi, %rax
++	orq	%rsi, %rcx
++	subq	%rcx, %rax
++	jz	L(zero_4_7)
++	sbbl	%eax, %eax
++	orl	$1, %eax
++L(zero_4_7):
+ 	ret
++# endif
++
+ END (MEMCMP)
+ #endif
+-- 
+GitLab
+
diff --git a/SOURCES/ia-opt-memcmp-evex-movbe-2.patch b/SOURCES/ia-opt-memcmp-evex-movbe-2.patch
new file mode 100644
index 0000000..360a4f9
--- /dev/null
+++ b/SOURCES/ia-opt-memcmp-evex-movbe-2.patch
@@ -0,0 +1,659 @@
+From 851ab0499680a3369da724d3d6d2ba71652d530d Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 21 Sep 2021 18:45:03 -0500
+Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S for frontend behavior and
+ size
+
+No bug.
+
+The frontend optimizations are to:
+1. Reorganize logically connected basic blocks so they are either in
+   the same cache line or adjacent cache lines.
+2. Avoid cases when basic blocks unnecessarily cross cache lines.
+3. Try to 32-byte align any basic blocks possible without sacrificing
+   code size. Smaller / less hot basic blocks are used for this.
+
+Overall code size shrunk by 168 bytes. This should make up for any
+extra costs due to aligning to 64 bytes.
+
+In general, performance before deviated a great deal depending on
+whether entry alignment % 64 was 0, 16, 32, or 48. These changes
+essentially make it so that the current implementation is at least
+equal to the best alignment of the original for any arguments.
+
+The only additional optimization is in the page cross case. The branch
+on the equals case was removed from the size == [4, 7] case. As well,
+the [4, 7] and [2, 3] cases were swapped, as [4, 7] is likely a hotter
+argument size.
+
+test-memcmp and test-wmemcmp are both passing.
+
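+For reference, a hedged C sketch of the branchless size == [4, 7]
+compare touched by this change (overlapping big-endian loads; the real
+code uses movbe/cmovne/sbb, and cmp_4_to_7 below is an illustrative
+name, not a glibc symbol):
+
+    #include <stdint.h>
+    #include <string.h>
+
+    /* Assumes 4 <= n <= 7, so the two 4-byte loads overlap and cover
+       every byte without branching on n.  */
+    static int cmp_4_to_7 (const unsigned char *s1,
+                           const unsigned char *s2, size_t n)
+    {
+      uint32_t a, b, c, d;
+      memcpy (&a, s1, 4);
+      memcpy (&b, s2, 4);
+      memcpy (&c, s1 + n - 4, 4);
+      memcpy (&d, s2 + n - 4, 4);
+      uint64_t x = ((uint64_t) __builtin_bswap32 (a) << 32)
+                   | __builtin_bswap32 (c);
+      uint64_t y = ((uint64_t) __builtin_bswap32 (b) << 32)
+                   | __builtin_bswap32 (d);
+      return (x > y) - (x < y);
+    }
+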
+(cherry picked from commit 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa)
+---
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 434 +++++++++++--------
+ 1 file changed, 242 insertions(+), 192 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 654dc7ac..2761b54f 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -34,7 +34,24 @@
+       area.
+    7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
+    8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
+-   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
++   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
++
++When possible the implementation tries to optimize for frontend in the
++following ways:
++Throughput:
++    1. All code sections that fit are able to run optimally out of the
++       LSD.
++    2. All code sections that fit are able to run optimally out of the
++       DSB.
++    3. Basic blocks are contained in the minimum number of fetch
++       blocks necessary.
++
++Latency:
++    1. Logically connected basic blocks are put in the same
++       cache-line.
++    2. Logically connected basic blocks that do not fit in the same
++       cache-line are put in adjacent lines. This can benefit from
++       L2 spatial prefetching and L1 next-line prefetching.  */
+ 
+ # include <sysdep.h>
+ 
+@@ -47,9 +64,11 @@
+ # ifdef USE_AS_WMEMCMP
+ #  define CHAR_SIZE	4
+ #  define VPCMP	vpcmpd
++#  define VPTEST	vptestmd
+ # else
+ #  define CHAR_SIZE	1
+ #  define VPCMP	vpcmpub
++#  define VPTEST	vptestmb
+ # endif
+ 
+ # define VEC_SIZE	32
+@@ -75,7 +94,9 @@
+ */
+ 
+ 	.section .text.evex,"ax",@progbits
+-ENTRY (MEMCMP)
++/* Cache align memcmp entry. This allows for much more thorough
++   frontend optimization.  */
++ENTRY_P2ALIGN (MEMCMP, 6)
+ # ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%edx, %edx
+@@ -89,7 +110,7 @@ ENTRY (MEMCMP)
+ 	VPCMP	$4, (%rdi), %YMM1, %k1
+ 	kmovd	%k1, %eax
+ 	/* NB: eax must be destination register if going to
+-	   L(return_vec_[0,2]). For L(return_vec_3 destination register
++	   L(return_vec_[0,2]). For L(return_vec_3) destination register
+ 	   must be ecx.  */
+ 	testl	%eax, %eax
+ 	jnz	L(return_vec_0)
+@@ -121,10 +142,6 @@ ENTRY (MEMCMP)
+ 	testl	%ecx, %ecx
+ 	jnz	L(return_vec_3)
+ 
+-	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
+-	   compare with zero to get a mask is needed.  */
+-	vpxorq	%XMM0, %XMM0, %XMM0
+-
+ 	/* Go to 4x VEC loop.  */
+ 	cmpq	$(CHAR_PER_VEC * 8), %rdx
+ 	ja	L(more_8x_vec)
+@@ -148,47 +165,61 @@ ENTRY (MEMCMP)
+ 
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+ 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+-	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
+-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+ 	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
+-	   oring with YMM3. Result is stored in YMM4.  */
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+-	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
+-	VPCMP	$4, %YMM4, %YMM0, %k1
++	   oring with YMM1. Result is stored in YMM4.  */
++	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
++
++	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
++	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
++
++	/* Test YMM4 against itself. Store any CHAR mismatches in k1.
++	 */
++	VPTEST	%YMM4, %YMM4, %k1
++	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
+ 	kmovd	%k1, %ecx
+ 	testl	%ecx, %ecx
+ 	jnz	L(return_vec_0_1_2_3)
+ 	/* NB: eax must be zero to reach here.  */
+ 	ret
+ 
+-	/* NB: aligning 32 here allows for the rest of the jump targets
+-	   to be tuned for 32 byte alignment. Most important this ensures
+-	   the L(more_8x_vec) loop is 32 byte aligned.  */
+-	.p2align 5
+-L(less_vec):
+-	/* Check if one or less CHAR. This is necessary for size = 0 but
+-	   is also faster for size = CHAR_SIZE.  */
+-	cmpl	$1, %edx
+-	jbe	L(one_or_less)
++	.p2align 4
++L(8x_end_return_vec_0_1_2_3):
++	movq	%rdx, %rdi
++L(8x_return_vec_0_1_2_3):
++	addq	%rdi, %rsi
++L(return_vec_0_1_2_3):
++	VPTEST	%YMM1, %YMM1, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_0)
+ 
+-	/* Check if loading one VEC from either s1 or s2 could cause a
+-	   page cross. This can have false positives but is by far the
+-	   fastest method.  */
+-	movl	%edi, %eax
+-	orl	%esi, %eax
+-	andl	$(PAGE_SIZE - 1), %eax
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+-	jg	L(page_cross_less_vec)
++	VPTEST	%YMM2, %YMM2, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_1)
+ 
+-	/* No page cross possible.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMP	$4, (%rdi), %YMM2, %k1
+-	kmovd	%k1, %eax
+-	/* Create mask in ecx for potentially in bound matches.  */
+-	bzhil	%edx, %eax, %eax
+-	jnz	L(return_vec_0)
++	VPTEST	%YMM3, %YMM3, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_2)
++L(return_vec_3):
++	/* bsf saves 1 byte from tzcnt. This keeps L(return_vec_3) in one
++	   fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
++	   line.  */
++	bsfl	%ecx, %ecx
++# ifdef USE_AS_WMEMCMP
++	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
++	xorl	%edx, %edx
++	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++# endif
+ 	ret
+ 
+ 	.p2align 4
+@@ -209,10 +240,11 @@ L(return_vec_0):
+ # endif
+ 	ret
+ 
+-	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
+-	   which is good enough for a target not in a loop.  */
++	.p2align 4
+ L(return_vec_1):
+-	tzcntl	%eax, %eax
++	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
++	   fetch block.  */
++	bsfl	%eax, %eax
+ # ifdef USE_AS_WMEMCMP
+ 	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
+ 	xorl	%edx, %edx
+@@ -226,10 +258,11 @@ L(return_vec_1):
+ # endif
+ 	ret
+ 
+-	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
+-	   which is good enough for a target not in a loop.  */
++	.p2align 4,, 10
+ L(return_vec_2):
+-	tzcntl	%eax, %eax
++	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
++	   fetch block.  */
++	bsfl	%eax, %eax
+ # ifdef USE_AS_WMEMCMP
+ 	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+ 	xorl	%edx, %edx
+@@ -243,40 +276,6 @@ L(return_vec_2):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+-L(8x_return_vec_0_1_2_3):
+-	/* Returning from L(more_8x_vec) requires restoring rsi.  */
+-	addq	%rdi, %rsi
+-L(return_vec_0_1_2_3):
+-	VPCMP	$4, %YMM1, %YMM0, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0)
+-
+-	VPCMP	$4, %YMM2, %YMM0, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_1)
+-
+-	VPCMP	$4, %YMM3, %YMM0, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_2)
+-L(return_vec_3):
+-	tzcntl	%ecx, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+-	xorl	%edx, %edx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+-	subl	%ecx, %eax
+-# endif
+-	ret
+-
+ 	.p2align 4
+ L(more_8x_vec):
+ 	/* Set end of s1 in rdx.  */
+@@ -288,21 +287,19 @@ L(more_8x_vec):
+ 	andq	$-VEC_SIZE, %rdi
+ 	/* Adjust because the first 4x vec were checked already.  */
+ 	subq	$-(VEC_SIZE * 4), %rdi
++
+ 	.p2align 4
+ L(loop_4x_vec):
+ 	VMOVU	(%rsi, %rdi), %YMM1
+ 	vpxorq	(%rdi), %YMM1, %YMM1
+-
+ 	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
+ 	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
+-
+ 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
+ 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+-
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+-	VPCMP	$4, %YMM4, %YMM0, %k1
++	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
++	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
++	VPTEST	%YMM4, %YMM4, %k1
+ 	kmovd	%k1, %ecx
+ 	testl	%ecx, %ecx
+ 	jnz	L(8x_return_vec_0_1_2_3)
+@@ -319,28 +316,25 @@ L(loop_4x_vec):
+ 	cmpl	$(VEC_SIZE * 2), %edi
+ 	jae	L(8x_last_2x_vec)
+ 
++	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
++
+ 	VMOVU	(%rsi, %rdx), %YMM1
+ 	vpxorq	(%rdx), %YMM1, %YMM1
+ 
+ 	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
+ 	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
+-
+-	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+-
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
+-	VPCMP	$4, %YMM4, %YMM0, %k1
++	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
++	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
++	VPTEST	%YMM4, %YMM4, %k1
+ 	kmovd	%k1, %ecx
+-	/* Restore s1 pointer to rdi.  */
+-	movq	%rdx, %rdi
+ 	testl	%ecx, %ecx
+-	jnz	L(8x_return_vec_0_1_2_3)
++	jnz	L(8x_end_return_vec_0_1_2_3)
+ 	/* NB: eax must be zero to reach here.  */
+ 	ret
+ 
+ 	/* Only entry is from L(more_8x_vec).  */
+-	.p2align 4
++	.p2align 4,, 10
+ L(8x_last_2x_vec):
+ 	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
+ 	kmovd	%k1, %eax
+@@ -355,7 +349,31 @@ L(8x_last_1x_vec):
+ 	jnz	L(8x_return_vec_3)
+ 	ret
+ 
+-	.p2align 4
++	/* Not ideally aligned (at offset +9 bytes in fetch block) but
++	   not aligning keeps it in the same cache line as
++	   L(8x_last_1x/2x_vec) so likely worth it. It also saves code
++	   size.  */
++	.p2align 4,, 4
++L(8x_return_vec_2):
++	subq	$VEC_SIZE, %rdx
++L(8x_return_vec_3):
++	bsfl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	leaq	(%rdx, %rax, CHAR_SIZE), %rax
++	movl	(VEC_SIZE * 3)(%rax), %ecx
++	xorl	%edx, %edx
++	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	addq	%rdx, %rax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
++	movzbl	(VEC_SIZE * 3)(%rax), %eax
++	subl	%ecx, %eax
++# endif
++	ret
++
++	.p2align 4,, 10
+ L(last_2x_vec):
+ 	/* Check second to last VEC.  */
+ 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
+@@ -374,26 +392,49 @@ L(last_1x_vec):
+ 	jnz	L(return_vec_0_end)
+ 	ret
+ 
+-	.p2align 4
+-L(8x_return_vec_2):
+-	subq	$VEC_SIZE, %rdx
+-L(8x_return_vec_3):
+-	tzcntl	%eax, %eax
++	.p2align 4,, 10
++L(return_vec_1_end):
++	/* Use bsf to save code size. This is necessary to have
++	   L(one_or_less) fit in the aligning bytes in between.  */
++	bsfl	%eax, %eax
++	addl	%edx, %eax
+ # ifdef USE_AS_WMEMCMP
+-	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+-	movl	(VEC_SIZE * 3)(%rax), %ecx
++	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+ 	xorl	%edx, %edx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
++	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+ 	setg	%dl
+ 	leal	-1(%rdx, %rdx), %eax
+ # else
+-	addq	%rdx, %rax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+-	movzbl	(VEC_SIZE * 3)(%rax), %eax
++	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
++	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+ 	subl	%ecx, %eax
+ # endif
+ 	ret
+ 
++	/* NB: L(one_or_less) fits in alignment padding between
++	   L(return_vec_1_end) and L(return_vec_0_end).  */
++# ifdef USE_AS_WMEMCMP
++L(one_or_less):
++	jb	L(zero)
++	movl	(%rdi), %ecx
++	xorl	%edx, %edx
++	cmpl	(%rsi), %ecx
++	je	L(zero)
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++	ret
++# else
++L(one_or_less):
++	jb	L(zero)
++	movzbl	(%rsi), %ecx
++	movzbl	(%rdi), %eax
++	subl	%ecx, %eax
++	ret
++# endif
++L(zero):
++	xorl	%eax, %eax
++	ret
++
+ 	.p2align 4
+ L(return_vec_0_end):
+ 	tzcntl	%eax, %eax
+@@ -412,23 +453,56 @@ L(return_vec_0_end):
+ 	ret
+ 
+ 	.p2align 4
+-L(return_vec_1_end):
++L(less_vec):
++	/* Check if one or less CHAR. This is necessary for size == 0
++	   but is also faster for size == CHAR_SIZE.  */
++	cmpl	$1, %edx
++	jbe	L(one_or_less)
++
++	/* Check if loading one VEC from either s1 or s2 could cause a
++	   page cross. This can have false positives but is by far the
++	   fastest method.  */
++	movl	%edi, %eax
++	orl	%esi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	jg	L(page_cross_less_vec)
++
++	/* No page cross possible.  */
++	VMOVU	(%rsi), %YMM2
++	VPCMP	$4, (%rdi), %YMM2, %k1
++	kmovd	%k1, %eax
++	/* Check if any matches were in bounds. Intentionally not
++	   storing result in eax to limit dependency chain if it goes to
++	   L(return_vec_0_lv).  */
++	bzhil	%edx, %eax, %edx
++	jnz	L(return_vec_0_lv)
++	xorl	%eax, %eax
++	ret
++
++	/* Essentially a duplicate of L(return_vec_0). Ends up not costing
++	   any code as it shrinks L(less_vec) by allowing a 2-byte encoding
++	   of the jump and ends up fitting in the aligning bytes. It also
++	   fits on the same cache line as L(less_vec), so it saves a line
++	   from having to be fetched on cold calls to memcmp.  */
++	.p2align 4,, 4
++L(return_vec_0_lv):
+ 	tzcntl	%eax, %eax
+-	addl	%edx, %eax
+ # ifdef USE_AS_WMEMCMP
+-	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
++	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+ 	xorl	%edx, %edx
+-	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
++	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
++	/* NB: no partial register stall here because xorl zero idiom
++	   above.  */
+ 	setg	%dl
+ 	leal	-1(%rdx, %rdx), %eax
+ # else
+-	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+-	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
++	movzbl	(%rsi, %rax), %ecx
++	movzbl	(%rdi, %rax), %eax
+ 	subl	%ecx, %eax
+ # endif
+ 	ret
+ 
+-
+ 	.p2align 4
+ L(page_cross_less_vec):
+ 	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+@@ -439,108 +513,84 @@ L(page_cross_less_vec):
+ 	cmpl	$8, %edx
+ 	jae	L(between_8_15)
+ 	cmpl	$4, %edx
+-	jae	L(between_4_7)
+-L(between_2_3):
+-	/* Load as big endian to avoid branches.  */
+-	movzwl	(%rdi), %eax
+-	movzwl	(%rsi), %ecx
+-	shll	$8, %eax
+-	shll	$8, %ecx
+-	bswap	%eax
+-	bswap	%ecx
+-	movzbl	-1(%rdi, %rdx), %edi
+-	movzbl	-1(%rsi, %rdx), %esi
+-	orl	%edi, %eax
+-	orl	%esi, %ecx
+-	/* Subtraction is okay because the upper 8 bits are zero.  */
+-	subl	%ecx, %eax
+-	ret
+-	.p2align 4
+-L(one_or_less):
+-	jb	L(zero)
+-	movzbl	(%rsi), %ecx
+-	movzbl	(%rdi), %eax
+-	subl	%ecx, %eax
++	jb	L(between_2_3)
++
++	/* Load as big endian with overlapping movbe to avoid branches.
++	 */
++	movbe	(%rdi), %eax
++	movbe	(%rsi), %ecx
++	shlq	$32, %rax
++	shlq	$32, %rcx
++	movbe	-4(%rdi, %rdx), %edi
++	movbe	-4(%rsi, %rdx), %esi
++	orq	%rdi, %rax
++	orq	%rsi, %rcx
++	subq	%rcx, %rax
++	/* edx is guaranteed to be a positive int32 in range [4, 7].  */
++	cmovne	%edx, %eax
++	/* ecx is -1 if rcx > rax. Otherwise 0.  */
++	sbbl	%ecx, %ecx
++	/* If rax > rcx, then ecx is 0 and eax is positive. If rcx ==
++	   rax then eax and ecx are zero. If rax < rcx then ecx is -1 so
++	   eax doesn't matter.  */
++	orl	%ecx, %eax
+ 	ret
+ 
+-	.p2align 4
++	.p2align 4,, 8
+ L(between_8_15):
+ # endif
+ 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+-	vmovq	(%rdi), %XMM1
+-	vmovq	(%rsi), %XMM2
+-	VPCMP	$4, %XMM1, %XMM2, %k1
++	vmovq	(%rdi), %xmm1
++	vmovq	(%rsi), %xmm2
++	VPCMP	$4, %xmm1, %xmm2, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(return_vec_0)
++	jnz	L(return_vec_0_lv)
+ 	/* Use overlapping loads to avoid branches.  */
+-	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
+-	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
+-	vmovq	(%rdi), %XMM1
+-	vmovq	(%rsi), %XMM2
+-	VPCMP	$4, %XMM1, %XMM2, %k1
++	vmovq	-8(%rdi, %rdx, CHAR_SIZE), %xmm1
++	vmovq	-8(%rsi, %rdx, CHAR_SIZE), %xmm2
++	VPCMP	$4, %xmm1, %xmm2, %k1
++	addl	$(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(return_vec_0)
+-	ret
+-
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
++	jnz	L(return_vec_0_end)
+ 	ret
+ 
+-	.p2align 4
++	.p2align 4,, 8
+ L(between_16_31):
+ 	/* From 16 to 31 bytes.  No branch when size == 16.  */
+-	VMOVU	(%rsi), %XMM2
+-	VPCMP	$4, (%rdi), %XMM2, %k1
++
++	/* Use movups to save code size.  */
++	movups	(%rsi), %xmm2
++	VPCMP	$4, (%rdi), %xmm2, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(return_vec_0)
+-
++	jnz	L(return_vec_0_lv)
+ 	/* Use overlapping loads to avoid branches.  */
+-
+-	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
+-	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
+-	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
+-	VPCMP	$4, (%rdi), %XMM2, %k1
++	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
++	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
++	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(return_vec_0)
+-	ret
+-
+-# ifdef USE_AS_WMEMCMP
+-	.p2align 4
+-L(one_or_less):
+-	jb	L(zero)
+-	movl	(%rdi), %ecx
+-	xorl	%edx, %edx
+-	cmpl	(%rsi), %ecx
+-	je	L(zero)
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
++	jnz	L(return_vec_0_end)
+ 	ret
+-# else
+ 
+-	.p2align 4
+-L(between_4_7):
+-	/* Load as big endian with overlapping movbe to avoid branches.
+-	 */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	jz	L(zero_4_7)
+-	sbbl	%eax, %eax
+-	orl	$1, %eax
+-L(zero_4_7):
++# ifndef USE_AS_WMEMCMP
++L(between_2_3):
++	/* Load as big endian to avoid branches.  */
++	movzwl	(%rdi), %eax
++	movzwl	(%rsi), %ecx
++	shll	$8, %eax
++	shll	$8, %ecx
++	bswap	%eax
++	bswap	%ecx
++	movzbl	-1(%rdi, %rdx), %edi
++	movzbl	-1(%rsi, %rdx), %esi
++	orl	%edi, %eax
++	orl	%esi, %ecx
++	/* Subtraction is okay because the upper 8 bits are zero.  */
++	subl	%ecx, %eax
+ 	ret
+ # endif
+-
+ END (MEMCMP)
+ #endif
+-- 
+GitLab
+
diff --git a/SOURCES/ia-opt-memmove-vec-unaligned-erms.patch b/SOURCES/ia-opt-memmove-vec-unaligned-erms.patch
new file mode 100644
index 0000000..697c32f
--- /dev/null
+++ b/SOURCES/ia-opt-memmove-vec-unaligned-erms.patch
@@ -0,0 +1,853 @@
+From b27eed69c1aa2e0fcdcda8b34249ee5b50b913d6 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 1 Nov 2021 00:49:51 -0500
+Subject: [PATCH] x86: Optimize memmove-vec-unaligned-erms.S
+
+No bug.
+
+The optimizations are as follows:
+
+1) Always align entry to 64 bytes. This makes behavior more
+   predictable and makes other frontend optimizations easier.
+
+2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
+   significant benefits in the case that:
+        0 < (dst - src) < [256, 512]
+
+3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
+   improvement and for FSRM [-10%, 25%].
+
+In addition to these primary changes there is general cleanup
+throughout to optimize the aligning routines and control flow logic.
+
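+As a hedged illustration of point 3 (align before `rep movsb`), a
+minimal C sketch of the shape of the transformation; the real assembly
+uses VEC-sized copies and different thresholds, and copy_aligned_movsb
+below is an illustrative name, not a glibc symbol:
+
+    #include <stddef.h>
+    #include <stdint.h>
+    #include <string.h>
+
+    /* Assumes n >= 64 and a non-overlapping (memcpy-safe) copy.  */
+    static void copy_aligned_movsb (char *dst, const char *src, size_t n)
+    {
+      /* Copy a 64-byte head unconditionally, then advance to the next
+         64-byte boundary of dst; the bytes between that boundary and
+         the end of the head are simply copied again by the string
+         move, which is harmless for a forward copy.  */
+      size_t head = (-(uintptr_t) dst) & 63;
+      memcpy (dst, src, 64);
+      dst += head;
+      src += head;
+      n -= head;
+      __asm__ volatile ("rep movsb"
+                        : "+D" (dst), "+S" (src), "+c" (n)
+                        :: "memory");
+    }
+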
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit a6b7502ec0c2da89a7437f43171f160d713e39c6)
+---
+ sysdeps/x86_64/memmove.S                      |   2 +-
+ .../memmove-avx-unaligned-erms-rtm.S          |   2 +-
+ .../multiarch/memmove-avx-unaligned-erms.S    |   2 +-
+ .../multiarch/memmove-avx512-unaligned-erms.S |   2 +-
+ .../multiarch/memmove-evex-unaligned-erms.S   |   2 +-
+ .../multiarch/memmove-vec-unaligned-erms.S    | 595 +++++++++++-------
+ 6 files changed, 381 insertions(+), 224 deletions(-)
+
+diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
+index 9cc92ff9..990fa6c5 100644
+--- a/sysdeps/x86_64/memmove.S
++++ b/sysdeps/x86_64/memmove.S
+@@ -25,7 +25,7 @@
+ /* Use movups and movaps for smaller code sizes.  */
+ #define VMOVU		movups
+ #define VMOVA		movaps
+-
++#define MOV_SIZE	3
+ #define SECTION(p)		p
+ 
+ #ifdef USE_MULTIARCH
+diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+index 1ec1962e..67a55f0c 100644
+--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
++++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+@@ -4,7 +4,7 @@
+ # define VMOVNT		vmovntdq
+ # define VMOVU		vmovdqu
+ # define VMOVA		vmovdqa
+-
++# define MOV_SIZE	4
+ # define ZERO_UPPER_VEC_REGISTERS_RETURN \
+   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+ 
+diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+index e195e93f..975ae6c0 100644
+--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+@@ -4,7 +4,7 @@
+ # define VMOVNT		vmovntdq
+ # define VMOVU		vmovdqu
+ # define VMOVA		vmovdqa
+-
++# define MOV_SIZE	4
+ # define SECTION(p)		p##.avx
+ # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
+ 
+diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+index 848848ab..0fa71268 100644
+--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+@@ -25,7 +25,7 @@
+ # define VMOVU		vmovdqu64
+ # define VMOVA		vmovdqa64
+ # define VZEROUPPER
+-
++# define MOV_SIZE	6
+ # define SECTION(p)		p##.evex512
+ # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
+ 
+diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+index 0cbce8f9..88715441 100644
+--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+@@ -25,7 +25,7 @@
+ # define VMOVU		vmovdqu64
+ # define VMOVA		vmovdqa64
+ # define VZEROUPPER
+-
++# define MOV_SIZE	6
+ # define SECTION(p)		p##.evex
+ # define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
+ 
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index c0809b1b..e5495286 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -76,6 +76,25 @@
+ # endif
+ #endif
+ 
++/* Whether to align before movsb. Ultimately we want 64 byte
++   alignment, and it is not worth loading 4x VEC for VEC_SIZE == 16.  */
++#define ALIGN_MOVSB	(VEC_SIZE > 16)
++/* Number of bytes to align movsb to.  */
++#define MOVSB_ALIGN_TO	64
++
++#define SMALL_MOV_SIZE	(MOV_SIZE <= 4)
++#define LARGE_MOV_SIZE	(MOV_SIZE > 4)
++
++#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
++# error MOV_SIZE Unknown
++#endif
++
++#if LARGE_MOV_SIZE
++# define SMALL_SIZE_OFFSET	(4)
++#else
++# define SMALL_SIZE_OFFSET	(0)
++#endif
++
+ #ifndef PAGE_SIZE
+ # define PAGE_SIZE 4096
+ #endif
+@@ -199,25 +218,21 @@ L(start):
+ # endif
+ 	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
++	/* Load regardless.  */
++	VMOVU	(%rsi), %VEC(0)
+ 	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(more_2x_vec)
+-#if !defined USE_MULTIARCH || !IS_IN (libc)
+-L(last_2x_vec):
+-#endif
+ 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	VMOVU	(%rsi), %VEC(0)
+ 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+ 	VMOVU	%VEC(0), (%rdi)
+ 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+-#if !defined USE_MULTIARCH || !IS_IN (libc)
+-L(nop):
+-	ret
++#if !(defined USE_MULTIARCH && IS_IN (libc))
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ #else
+ 	VZEROUPPER_RETURN
+ #endif
+ #if defined USE_MULTIARCH && IS_IN (libc)
+ END (MEMMOVE_SYMBOL (__memmove, unaligned))
+-
+ # if VEC_SIZE == 16
+ ENTRY (__mempcpy_chk_erms)
+ 	cmp	%RDX_LP, %RCX_LP
+@@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
+ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
+ # endif
+ 
+-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
++ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
+ 	movq	%rdi, %rax
+ L(start_erms):
+ # ifdef __ILP32__
+@@ -298,310 +313,448 @@ L(start_erms):
+ # endif
+ 	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
++	/* Load regardless.  */
++	VMOVU	(%rsi), %VEC(0)
+ 	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(movsb_more_2x_vec)
+-L(last_2x_vec):
+-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
+-	VMOVU	(%rsi), %VEC(0)
+-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
++	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
++	 */
++	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
+ 	VMOVU	%VEC(0), (%rdi)
+-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
++	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
+ L(return):
+-#if VEC_SIZE > 16
++# if VEC_SIZE > 16
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+-#else
++# else
+ 	ret
++# endif
+ #endif
+ 
+-L(movsb):
+-	cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+-	jae	L(more_8x_vec)
+-	cmpq	%rsi, %rdi
+-	jb	1f
+-	/* Source == destination is less common.  */
+-	je	L(nop)
+-	leaq	(%rsi,%rdx), %r9
+-	cmpq	%r9, %rdi
+-	/* Avoid slow backward REP MOVSB.  */
+-	jb	L(more_8x_vec_backward)
+-# if AVOID_SHORT_DISTANCE_REP_MOVSB
+-	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+-	jz	3f
+-	movq	%rdi, %rcx
+-	subq	%rsi, %rcx
+-	jmp	2f
+-# endif
+-1:
+-# if AVOID_SHORT_DISTANCE_REP_MOVSB
+-	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+-	jz	3f
+-	movq	%rsi, %rcx
+-	subq	%rdi, %rcx
+-2:
+-/* Avoid "rep movsb" if RCX, the distance between source and destination,
+-   is N*4GB + [1..63] with N >= 0.  */
+-	cmpl	$63, %ecx
+-	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
+-3:
+-# endif
+-	mov	%RDX_LP, %RCX_LP
+-	rep movsb
+-L(nop):
++#if LARGE_MOV_SIZE
++	/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
++	   ENTRY block and L(less_vec).  */
++	.p2align 4,, 8
++L(between_4_7):
++	/* From 4 to 7.  No branch when size == 4.  */
++	movl	(%rsi), %ecx
++	movl	(%rsi, %rdx), %esi
++	movl	%ecx, (%rdi)
++	movl	%esi, (%rdi, %rdx)
+ 	ret
+ #endif
+ 
++	.p2align 4
+ L(less_vec):
+ 	/* Less than 1 VEC.  */
+ #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+ # error Unsupported VEC_SIZE!
+ #endif
+ #if VEC_SIZE > 32
+-	cmpb	$32, %dl
++	cmpl	$32, %edx
+ 	jae	L(between_32_63)
+ #endif
+ #if VEC_SIZE > 16
+-	cmpb	$16, %dl
++	cmpl	$16, %edx
+ 	jae	L(between_16_31)
+ #endif
+-	cmpb	$8, %dl
++	cmpl	$8, %edx
+ 	jae	L(between_8_15)
+-	cmpb	$4, %dl
++#if SMALL_MOV_SIZE
++	cmpl	$4, %edx
++#else
++	subq	$4, %rdx
++#endif
+ 	jae	L(between_4_7)
+-	cmpb	$1, %dl
+-	ja	L(between_2_3)
+-	jb	1f
+-	movzbl	(%rsi), %ecx
++	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
++	jl	L(copy_0)
++	movb	(%rsi), %cl
++	je	L(copy_1)
++	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
++	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
++L(copy_1):
+ 	movb	%cl, (%rdi)
+-1:
++L(copy_0):
+ 	ret
++
++#if SMALL_MOV_SIZE
++	.p2align 4,, 8
++L(between_4_7):
++	/* From 4 to 7.  No branch when size == 4.  */
++	movl	-4(%rsi, %rdx), %ecx
++	movl	(%rsi), %esi
++	movl	%ecx, -4(%rdi, %rdx)
++	movl	%esi, (%rdi)
++	ret
++#endif
++
++#if VEC_SIZE > 16
++	/* From 16 to 31.  No branch when size == 16.  */
++	.p2align 4,, 8
++L(between_16_31):
++	vmovdqu	(%rsi), %xmm0
++	vmovdqu	-16(%rsi, %rdx), %xmm1
++	vmovdqu	%xmm0, (%rdi)
++	vmovdqu	%xmm1, -16(%rdi, %rdx)
++	/* No ymm registers have been touched.  */
++	ret
++#endif
++
+ #if VEC_SIZE > 32
++	.p2align 4,, 10
+ L(between_32_63):
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ 	VMOVU	(%rsi), %YMM0
+-	VMOVU	-32(%rsi,%rdx), %YMM1
++	VMOVU	-32(%rsi, %rdx), %YMM1
+ 	VMOVU	%YMM0, (%rdi)
+-	VMOVU	%YMM1, -32(%rdi,%rdx)
+-	VZEROUPPER_RETURN
+-#endif
+-#if VEC_SIZE > 16
+-	/* From 16 to 31.  No branch when size == 16.  */
+-L(between_16_31):
+-	VMOVU	(%rsi), %XMM0
+-	VMOVU	-16(%rsi,%rdx), %XMM1
+-	VMOVU	%XMM0, (%rdi)
+-	VMOVU	%XMM1, -16(%rdi,%rdx)
++	VMOVU	%YMM1, -32(%rdi, %rdx)
+ 	VZEROUPPER_RETURN
+ #endif
++
++	.p2align 4,, 10
+ L(between_8_15):
+ 	/* From 8 to 15.  No branch when size == 8.  */
+-	movq	-8(%rsi,%rdx), %rcx
++	movq	-8(%rsi, %rdx), %rcx
+ 	movq	(%rsi), %rsi
+-	movq	%rcx, -8(%rdi,%rdx)
+ 	movq	%rsi, (%rdi)
++	movq	%rcx, -8(%rdi, %rdx)
+ 	ret
+-L(between_4_7):
+-	/* From 4 to 7.  No branch when size == 4.  */
+-	movl	-4(%rsi,%rdx), %ecx
+-	movl	(%rsi), %esi
+-	movl	%ecx, -4(%rdi,%rdx)
+-	movl	%esi, (%rdi)
+-	ret
+-L(between_2_3):
+-	/* From 2 to 3.  No branch when size == 2.  */
+-	movzwl	-2(%rsi,%rdx), %ecx
+-	movzwl	(%rsi), %esi
+-	movw	%cx, -2(%rdi,%rdx)
+-	movw	%si, (%rdi)
+-	ret
+ 
++	.p2align 4,, 10
++L(last_4x_vec):
++	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
++
++	/* VEC(0) and VEC(1) have already been loaded.  */
++	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
++	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
++	VMOVU	%VEC(0), (%rdi)
++	VMOVU	%VEC(1), VEC_SIZE(%rdi)
++	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
++	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
++	VZEROUPPER_RETURN
++
++	.p2align 4
+ #if defined USE_MULTIARCH && IS_IN (libc)
+ L(movsb_more_2x_vec):
+ 	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
+ 	ja	L(movsb)
+ #endif
+ L(more_2x_vec):
+-	/* More than 2 * VEC and there may be overlap between destination
+-	   and source.  */
++	/* More than 2 * VEC and there may be overlap between
++	   destination and source.  */
+ 	cmpq	$(VEC_SIZE * 8), %rdx
+ 	ja	L(more_8x_vec)
++	/* Load VEC(1) regardless. VEC(0) has already been loaded.  */
++	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+ 	cmpq	$(VEC_SIZE * 4), %rdx
+ 	jbe	L(last_4x_vec)
+-	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
+-	VMOVU	(%rsi), %VEC(0)
+-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
++	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
+-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
+-	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
+-	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
++	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
++	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
++	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
++	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
+ 	VMOVU	%VEC(0), (%rdi)
+ 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+ 	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
+ 	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+-	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
+-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
+-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
+-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+-	VZEROUPPER_RETURN
+-L(last_4x_vec):
+-	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
+-	VMOVU	(%rsi), %VEC(0)
+-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
+-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+-	VMOVU	%VEC(0), (%rdi)
+-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+-	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
+-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
++	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
++	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
++	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
++	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
+ 	VZEROUPPER_RETURN
+ 
++	.p2align 4,, 4
+ L(more_8x_vec):
++	movq	%rdi, %rcx
++	subq	%rsi, %rcx
++	/* Go to backwards temporal copy if overlap no matter what as
++	   backward REP MOVSB is slow and we don't want to use NT stores if
++	   there is overlap.  */
++	cmpq	%rdx, %rcx
++	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
++	jb	L(more_8x_vec_backward_check_nop)
+ 	/* Check if non-temporal move candidate.  */
+ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+ 	/* Check non-temporal store threshold.  */
+-	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
++	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ 	ja	L(large_memcpy_2x)
+ #endif
+-	/* Entry if rdx is greater than non-temporal threshold but there
+-       is overlap.  */
++	/* To reach this point there cannot be overlap and dst > src. So
++	   check for overlap and src > dst in which case correctness
++	   requires forward copy. Otherwise decide between backward/forward
++	   copy depending on address aliasing.  */
++
++	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
++	   but less than __x86_shared_non_temporal_threshold.  */
+ L(more_8x_vec_check):
+-	cmpq	%rsi, %rdi
+-	ja	L(more_8x_vec_backward)
+-	/* Source == destination is less common.  */
+-	je	L(nop)
+-	/* Load the first VEC and last 4 * VEC to support overlapping
+-	   addresses.  */
+-	VMOVU	(%rsi), %VEC(4)
++	/* rcx contains dst - src. Add back length (rdx).  */
++	leaq	(%rcx, %rdx), %r8
++	/* If r8 has different sign than rcx then there is overlap so we
++	   must do forward copy.  */
++	xorq	%rcx, %r8
++	/* Isolate just sign bit of r8.  */
++	shrq	$63, %r8
++	/* Get 4k difference dst - src.  */
++	andl	$(PAGE_SIZE - 256), %ecx
++	/* If r8 is non-zero must do forward for correctness. Otherwise
++	   if ecx is non-zero there is 4k False Aliasing so do backward
++	   copy.  */
++	addl	%r8d, %ecx
++	jz	L(more_8x_vec_backward)
++
++	/* if rdx is greater than __x86_shared_non_temporal_threshold
++	   but there is overlap, or from short distance movsb.  */
++L(more_8x_vec_forward):
++	/* Load first and last 4 * VEC to support overlapping addresses.
++	 */
++
++	/* First vec was already loaded into VEC(0).  */
+ 	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
+ 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
++	/* Save beginning of dst.  */
++	movq	%rdi, %rcx
++	/* Align dst to VEC_SIZE - 1.  */
++	orq	$(VEC_SIZE - 1), %rdi
+ 	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
+ 	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+-	/* Save start and stop of the destination buffer.  */
+-	movq	%rdi, %r11
+-	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
+-	/* Align destination for aligned stores in the loop.  Compute
+-	   how much destination is misaligned.  */
+-	movq	%rdi, %r8
+-	andq	$(VEC_SIZE - 1), %r8
+-	/* Get the negative of offset for alignment.  */
+-	subq	$VEC_SIZE, %r8
+-	/* Adjust source.  */
+-	subq	%r8, %rsi
+-	/* Adjust destination which should be aligned now.  */
+-	subq	%r8, %rdi
+-	/* Adjust length.  */
+-	addq	%r8, %rdx
+ 
+-	.p2align 4
++	/* Subtract dst from src. Add back after dst aligned.  */
++	subq	%rcx, %rsi
++	/* Finish aligning dst.  */
++	incq	%rdi
++	/* Restore src adjusted with new value for aligned dst.  */
++	addq	%rdi, %rsi
++	/* Store end of buffer minus tail in rdx.  */
++	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx
++
++	/* Don't use multi-byte nop to align.  */
++	.p2align 4,, 11
+ L(loop_4x_vec_forward):
+ 	/* Copy 4 * VEC a time forward.  */
+-	VMOVU	(%rsi), %VEC(0)
+-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
++	VMOVU	(%rsi), %VEC(1)
++	VMOVU	VEC_SIZE(%rsi), %VEC(2)
++	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
++	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
+ 	subq	$-(VEC_SIZE * 4), %rsi
+-	addq	$-(VEC_SIZE * 4), %rdx
+-	VMOVA	%VEC(0), (%rdi)
+-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
++	VMOVA	%VEC(1), (%rdi)
++	VMOVA	%VEC(2), VEC_SIZE(%rdi)
++	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
++	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
+ 	subq	$-(VEC_SIZE * 4), %rdi
+-	cmpq	$(VEC_SIZE * 4), %rdx
++	cmpq	%rdi, %rdx
+ 	ja	L(loop_4x_vec_forward)
+ 	/* Store the last 4 * VEC.  */
+-	VMOVU	%VEC(5), (%rcx)
+-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
+-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
+-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
++	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
++	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
++	VMOVU	%VEC(7), VEC_SIZE(%rdx)
++	VMOVU	%VEC(8), (%rdx)
+ 	/* Store the first VEC.  */
+-	VMOVU	%VEC(4), (%r11)
++	VMOVU	%VEC(0), (%rcx)
++	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
++	 */
++L(nop_backward):
+ 	VZEROUPPER_RETURN
+ 
++	.p2align 4,, 8
++L(more_8x_vec_backward_check_nop):
++	/* rcx contains dst - src. Test for dst == src to skip all of
++	   memmove.  */
++	testq	%rcx, %rcx
++	jz	L(nop_backward)
+ L(more_8x_vec_backward):
+ 	/* Load the first 4 * VEC and last VEC to support overlapping
+ 	   addresses.  */
+-	VMOVU	(%rsi), %VEC(4)
++
++	/* First vec was also loaded into VEC(0).  */
+ 	VMOVU	VEC_SIZE(%rsi), %VEC(5)
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
++	/* Beginning of region for 4x backward copy stored in rcx.  */
++	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
+-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
+-	/* Save stop of the destination buffer.  */
+-	leaq	-VEC_SIZE(%rdi, %rdx), %r11
+-	/* Align destination end for aligned stores in the loop.  Compute
+-	   how much destination end is misaligned.  */
+-	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
+-	movq	%r11, %r9
+-	movq	%r11, %r8
+-	andq	$(VEC_SIZE - 1), %r8
+-	/* Adjust source.  */
+-	subq	%r8, %rcx
+-	/* Adjust the end of destination which should be aligned now.  */
+-	subq	%r8, %r9
+-	/* Adjust length.  */
+-	subq	%r8, %rdx
+-
+-	.p2align 4
++	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
++	/* Subtract dst from src. Add back after dst aligned.  */
++	subq	%rdi, %rsi
++	/* Align dst.  */
++	andq	$-(VEC_SIZE), %rcx
++	/* Restore src.  */
++	addq	%rcx, %rsi
++
++	/* Don't use multi-byte nop to align.  */
++	.p2align 4,, 11
+ L(loop_4x_vec_backward):
+ 	/* Copy 4 * VEC a time backward.  */
+-	VMOVU	(%rcx), %VEC(0)
+-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
+-	addq	$-(VEC_SIZE * 4), %rcx
+-	addq	$-(VEC_SIZE * 4), %rdx
+-	VMOVA	%VEC(0), (%r9)
+-	VMOVA	%VEC(1), -VEC_SIZE(%r9)
+-	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
+-	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
+-	addq	$-(VEC_SIZE * 4), %r9
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	ja	L(loop_4x_vec_backward)
++	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
++	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
++	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
++	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
++	addq	$(VEC_SIZE * -4), %rsi
++	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
++	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
++	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
++	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
++	addq	$(VEC_SIZE * -4), %rcx
++	cmpq	%rcx, %rdi
++	jb	L(loop_4x_vec_backward)
+ 	/* Store the first 4 * VEC.  */
+-	VMOVU	%VEC(4), (%rdi)
++	VMOVU	%VEC(0), (%rdi)
+ 	VMOVU	%VEC(5), VEC_SIZE(%rdi)
+ 	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
+ 	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+ 	/* Store the last VEC.  */
+-	VMOVU	%VEC(8), (%r11)
++	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
++	VZEROUPPER_RETURN
++
++#if defined USE_MULTIARCH && IS_IN (libc)
++	/* L(skip_short_movsb_check) is only used with ERMS. Not for
++	   FSRM.  */
++	.p2align 5,, 16
++# if ALIGN_MOVSB
++L(skip_short_movsb_check):
++#  if MOVSB_ALIGN_TO > VEC_SIZE
++	VMOVU	VEC_SIZE(%rsi), %VEC(1)
++#  endif
++#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
++#   error Unsupported MOVSB_ALIGN_TO
++#  endif
++	/* If CPU does not have FSRM there are two options for aligning.
++	   Align src if dst and src 4k alias. Otherwise align dst.  */
++	testl	$(PAGE_SIZE - 512), %ecx
++	jnz	L(movsb_align_dst)
++	/* Fall through. dst and src 4k alias. It's better to align src
++	   here because the bottleneck will be loads due to the false
++	   dependency on dst.  */
++
++	/* rcx already has dst - src.  */
++	movq	%rcx, %r9
++	/* Add src to len. Subtract back after src aligned. -1 because
++	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
++	leaq	-1(%rsi, %rdx), %rcx
++	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
++	orq	$(MOVSB_ALIGN_TO - 1), %rsi
++	/* Restore dst and len adjusted with new values for aligned dst.
++	 */
++	leaq	1(%rsi, %r9), %rdi
++	subq	%rsi, %rcx
++	/* Finish aligning src.  */
++	incq	%rsi
++
++	rep	movsb
++
++	VMOVU	%VEC(0), (%r8)
++#  if MOVSB_ALIGN_TO > VEC_SIZE
++	VMOVU	%VEC(1), VEC_SIZE(%r8)
++#  endif
+ 	VZEROUPPER_RETURN
++# endif
++
++	.p2align 4,, 12
++L(movsb):
++	movq	%rdi, %rcx
++	subq	%rsi, %rcx
++	/* Go to backwards temporal copy if overlap no matter what as
++	   backward REP MOVSB is slow and we don't want to use NT stores if
++	   there is overlap.  */
++	cmpq	%rdx, %rcx
++	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
++	jb	L(more_8x_vec_backward_check_nop)
++# if ALIGN_MOVSB
++	/* Save dest for storing aligning VECs later.  */
++	movq	%rdi, %r8
++# endif
++	/* If above __x86_rep_movsb_stop_threshold most likely is
++	   candidate for NT moves as well.  */
++	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
++	jae	L(large_memcpy_2x_check)
++# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
++	/* Only avoid short movsb if CPU has FSRM.  */
++	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
++	jz	L(skip_short_movsb_check)
++#  if AVOID_SHORT_DISTANCE_REP_MOVSB
++	/* Avoid "rep movsb" if RCX, the distance between source and
++	   destination, is N*4GB + [1..63] with N >= 0.  */
++
++	/* ecx contains dst - src. Early check for backward copy
++	   conditions means only case of slow movsb with src = dst + [0,
++	   63] is ecx in [-63, 0]. Use unsigned comparison with -64 check
++	   for that case.  */
++	cmpl	$-64, %ecx
++	ja	L(more_8x_vec_forward)
++#  endif
++# endif
++# if ALIGN_MOVSB
++#  if MOVSB_ALIGN_TO > VEC_SIZE
++	VMOVU	VEC_SIZE(%rsi), %VEC(1)
++#  endif
++#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
++#   error Unsupported MOVSB_ALIGN_TO
++#  endif
++	/* Fall through means cpu has FSRM. In that case exclusively
++	   align destination.  */
++L(movsb_align_dst):
++	/* Subtract dst from src. Add back after dst aligned.  */
++	subq	%rdi, %rsi
++	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
++	addq	$(MOVSB_ALIGN_TO - 1), %rdi
++	/* Add dst to len. Subtract back after dst aligned.  */
++	leaq	(%r8, %rdx), %rcx
++	/* Finish aligning dst.  */
++	andq	$-(MOVSB_ALIGN_TO), %rdi
++	/* Restore src and len adjusted with new values for aligned dst.
++	 */
++	addq	%rdi, %rsi
++	subq	%rdi, %rcx
++
++	rep	movsb
++
++	/* Store VECs loaded for aligning.  */
++	VMOVU	%VEC(0), (%r8)
++#  if MOVSB_ALIGN_TO > VEC_SIZE
++	VMOVU	%VEC(1), VEC_SIZE(%r8)
++#  endif
++	VZEROUPPER_RETURN
++# else	/* !ALIGN_MOVSB.  */
++L(skip_short_movsb_check):
++	mov	%RDX_LP, %RCX_LP
++	rep	movsb
++	ret
++# endif
++#endif
+ 
++	.p2align 4,, 10
+ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+-	.p2align 4
++L(large_memcpy_2x_check):
++	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
++	jb	L(more_8x_vec_check)
+ L(large_memcpy_2x):
+-	/* Compute absolute value of difference between source and
+-	   destination.  */
+-	movq	%rdi, %r9
+-	subq	%rsi, %r9
+-	movq	%r9, %r8
+-	leaq	-1(%r9), %rcx
+-	sarq	$63, %r8
+-	xorq	%r8, %r9
+-	subq	%r8, %r9
+-	/* Don't use non-temporal store if there is overlap between
+-	   destination and source since destination may be in cache when
+-	   source is loaded.  */
+-	cmpq	%r9, %rdx
+-	ja	L(more_8x_vec_check)
++	/* To reach this point it is impossible for dst > src and
++	   overlap. Remaining to check is src > dst and overlap. rcx
++	   already contains dst - src. Negate rcx to get src - dst. If
++	   length > rcx then there is overlap and forward copy is best.  */
++	negq	%rcx
++	cmpq	%rcx, %rdx
++	ja	L(more_8x_vec_forward)
+ 
+ 	/* Cache align destination. First store the first 64 bytes then
+ 	   adjust alignments.  */
+-	VMOVU	(%rsi), %VEC(8)
+-#if VEC_SIZE < 64
+-	VMOVU	VEC_SIZE(%rsi), %VEC(9)
+-#if VEC_SIZE < 32
+-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
+-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
+-#endif
+-#endif
+-	VMOVU	%VEC(8), (%rdi)
+-#if VEC_SIZE < 64
+-	VMOVU	%VEC(9), VEC_SIZE(%rdi)
+-#if VEC_SIZE < 32
+-	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
+-	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
+-#endif
+-#endif
++
++	/* First vec was also loaded into VEC(0).  */
++# if VEC_SIZE < 64
++	VMOVU	VEC_SIZE(%rsi), %VEC(1)
++#  if VEC_SIZE < 32
++	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
++	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
++#  endif
++# endif
++	VMOVU	%VEC(0), (%rdi)
++# if VEC_SIZE < 64
++	VMOVU	%VEC(1), VEC_SIZE(%rdi)
++#  if VEC_SIZE < 32
++	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
++	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
++#  endif
++# endif
++
+ 	/* Adjust source, destination, and size.  */
+ 	movq	%rdi, %r8
+ 	andq	$63, %r8
+@@ -614,9 +767,13 @@ L(large_memcpy_2x):
+ 	/* Adjust length.  */
+ 	addq	%r8, %rdx
+ 
+-	/* Test if source and destination addresses will alias. If they do
+-	   the larger pipeline in large_memcpy_4x alleviated the
++	/* Test if source and destination addresses will alias. If they
++	   do the larger pipeline in large_memcpy_4x alleviated the
+ 	   performance drop.  */
++
++	/* ecx contains -(dst - src). not ecx will return dst - src - 1
++	   which works for testing aliasing.  */
++	notl	%ecx
+ 	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
+ 	jz	L(large_memcpy_4x)
+ 
+@@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
+ 	/* ecx stores inner loop counter.  */
+ 	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+ L(loop_large_memcpy_4x_inner):
+-	/* Only one prefetch set per page as doing 4 pages give more time
+-	   for prefetcher to keep up.  */
++	/* Only one prefetch set per page as doing 4 pages gives more
++	   time for the prefetcher to keep up.  */
+ 	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+ 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+ 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
+-- 
+GitLab
+
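The rewritten L(more_8x_vec) / L(movsb) paths above pick between the forward loop, the backward loop, and an early return using nothing but dst - src and the length. Below is a minimal C sketch of that dispatch as I read the new branch structure (the rep-movsb and non-temporal thresholds are not modeled); choose_direction and its test values are illustrative only, not glibc code.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096

enum copy_dir { COPY_NONE, COPY_FORWARD, COPY_BACKWARD };

/* Backward is mandatory when dst lies inside [src, src + len), forward is
   mandatory when src lies inside [dst, dst + len), and otherwise the
   direction is a performance choice keyed off the page-offset difference
   (the 4k-aliasing check in L(more_8x_vec_check)).  */
static enum copy_dir
choose_direction (uintptr_t dst, uintptr_t src, size_t len)
{
  uintptr_t diff = dst - src;	/* rcx in the assembly.  */

  if (diff == 0)
    return COPY_NONE;		/* src == dst: L(nop_backward).  */
  if (diff < len)
    return COPY_BACKWARD;	/* dst overlaps the unread tail of src.  */
  if ((uintptr_t) -diff < len)
    return COPY_FORWARD;	/* src overlaps the unwritten tail of dst.  */
  /* No overlap: prefer the backward loop when dst and src land within
     256 bytes of the same 4 KiB page offset, otherwise copy forward.  */
  return (diff & (PAGE_SIZE - 256)) == 0 ? COPY_BACKWARD : COPY_FORWARD;
}

int
main (void)
{
  assert (choose_direction (0x2000, 0x2000, 64) == COPY_NONE);
  assert (choose_direction (0x2010, 0x2000, 64) == COPY_BACKWARD);
  assert (choose_direction (0x2000, 0x2010, 64) == COPY_FORWARD);
  assert (choose_direction (0x7000, 0x2040, 64) == COPY_FORWARD);
  return 0;
}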
diff --git a/SOURCES/ia-opt-memset-vec-unaligned-erms.patch b/SOURCES/ia-opt-memset-vec-unaligned-erms.patch
new file mode 100644
index 0000000..63f57dd
--- /dev/null
+++ b/SOURCES/ia-opt-memset-vec-unaligned-erms.patch
@@ -0,0 +1,506 @@
+From 5deda2b73383bf16788cc83c8ea6262d89608263 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 20 Sep 2021 16:20:15 -0500
+Subject: [PATCH] x86: Optimize memset-vec-unaligned-erms.S
+
+No bug.
+
+Optimizations are
+
+1. change control flow for L(more_2x_vec) to fall through to loop and
+   jump for L(less_4x_vec) and L(less_8x_vec). This uses less code
+   size and saves jumps for length > 4x VEC_SIZE.
+
+2. For EVEX/AVX512 move L(less_vec) closer to entry.
+
+3. Avoid complex address mode for length > 2x VEC_SIZE
+
+4. Slightly better aligning code for the loop from the perspective of
+   code size and uops.
+
+5. Align targets so they make full use of their fetch block and if
+   possible cache line.
+
+6. Try and reduce total number of icache lines that will need to be
+   pulled in for a given length.
+
+7. Include "local" version of stosb target. For AVX2/EVEX/AVX512
+   jumping to the stosb target in the sse2 code section will almost
+   certainly be to a new page. The new version does increase code size
+   marginally by duplicating the target but should get better iTLB
+   behavior as a result.
+
+test-memset, test-wmemset, and test-bzero are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit e59ced238482fd71f3e493717f14f6507346741e)
+---
+ sysdeps/x86_64/memset.S                       |  10 +-
+ .../multiarch/memset-avx2-unaligned-erms.S    |  10 +-
+ .../multiarch/memset-avx512-unaligned-erms.S  |  11 +-
+ .../multiarch/memset-evex-unaligned-erms.S    |  11 +-
+ .../multiarch/memset-vec-unaligned-erms.S     | 285 ++++++++++++------
+ 5 files changed, 232 insertions(+), 95 deletions(-)
+
+diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
+index b3426795..8672b030 100644
+--- a/sysdeps/x86_64/memset.S
++++ b/sysdeps/x86_64/memset.S
+@@ -18,13 +18,15 @@
+    <http://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
++#define USE_WITH_SSE2	1
+ 
+ #define VEC_SIZE	16
++#define MOV_SIZE	3
++#define RET_SIZE	1
++
+ #define VEC(i)		xmm##i
+-/* Don't use movups and movaps since it will get larger nop paddings for
+-   alignment.  */
+-#define VMOVU		movdqu
+-#define VMOVA		movdqa
++#define VMOVU     movups
++#define VMOVA     movaps
+ 
+ #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+index ae0860f3..1af668af 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+@@ -1,8 +1,14 @@
+ #if IS_IN (libc)
++# define USE_WITH_AVX2	1
++
+ # define VEC_SIZE	32
++# define MOV_SIZE	4
++# define RET_SIZE	4
++
+ # define VEC(i)		ymm##i
+-# define VMOVU		vmovdqu
+-# define VMOVA		vmovdqa
++
++# define VMOVU     vmovdqu
++# define VMOVA     vmovdqa
+ 
+ # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+   vmovd d, %xmm0; \
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+index 8ad842fc..f14d6f84 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+@@ -1,11 +1,18 @@
+ #if IS_IN (libc)
++# define USE_WITH_AVX512	1
++
+ # define VEC_SIZE	64
++# define MOV_SIZE	6
++# define RET_SIZE	1
++
+ # define XMM0		xmm16
+ # define YMM0		ymm16
+ # define VEC0		zmm16
+ # define VEC(i)		VEC##i
+-# define VMOVU		vmovdqu64
+-# define VMOVA		vmovdqa64
++
++# define VMOVU     vmovdqu64
++# define VMOVA     vmovdqa64
++
+ # define VZEROUPPER
+ 
+ # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+index 640f0929..64b09e77 100644
+--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+@@ -1,11 +1,18 @@
+ #if IS_IN (libc)
++# define USE_WITH_EVEX	1
++
+ # define VEC_SIZE	32
++# define MOV_SIZE	6
++# define RET_SIZE	1
++
+ # define XMM0		xmm16
+ # define YMM0		ymm16
+ # define VEC0		ymm16
+ # define VEC(i)		VEC##i
+-# define VMOVU		vmovdqu64
+-# define VMOVA		vmovdqa64
++
++# define VMOVU     vmovdqu64
++# define VMOVA     vmovdqa64
++
+ # define VZEROUPPER
+ 
+ # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 909c33f6..f08b7323 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -63,8 +63,27 @@
+ # endif
+ #endif
+ 
++#if VEC_SIZE == 64
++# define LOOP_4X_OFFSET	(VEC_SIZE * 4)
++#else
++# define LOOP_4X_OFFSET	(0)
++#endif
++
++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
++# define END_REG	rcx
++# define LOOP_REG	rdi
++#else
++# define END_REG	rdi
++# define LOOP_REG	rdx
++#endif
++
+ #define PAGE_SIZE 4096
+ 
++/* Macro to calculate size of small memset block for aligning
++   purposes.  */
++#define SMALL_MEMSET_ALIGN(mov_sz,	ret_sz)	(2 * (mov_sz) + (ret_sz) + 1)
++
++
+ #ifndef SECTION
+ # error SECTION is not defined!
+ #endif
+@@ -74,6 +93,7 @@
+ ENTRY (__bzero)
+ 	mov	%RDI_LP, %RAX_LP /* Set return value.  */
+ 	mov	%RSI_LP, %RDX_LP /* Set n.  */
++	xorl	%esi, %esi
+ 	pxor	%XMM0, %XMM0
+ 	jmp	L(entry_from_bzero)
+ END (__bzero)
+@@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+ # endif
+ 
+-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
++ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
+ 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ # ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+@@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ 	jb	L(less_vec)
+ 	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(stosb_more_2x_vec)
+-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+-	VMOVU	%VEC(0), (%rdi)
++	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
++	 */
++	VMOVU	%VEC(0), (%rax)
++	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+-
+-	.p2align 4
+-L(stosb_more_2x_vec):
+-	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
+-	ja	L(stosb)
+-#else
+-	.p2align 4
+ #endif
+-L(more_2x_vec):
+-	/* Stores to first 2x VEC before cmp as any path forward will
+-	   require it.  */
+-	VMOVU	%VEC(0), (%rdi)
+-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	ja	L(loop_start)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+-L(return):
+-#if VEC_SIZE > 16
+-	ZERO_UPPER_VEC_REGISTERS_RETURN
++
++	.p2align 4,, 10
++L(last_2x_vec):
++#ifdef USE_LESS_VEC_MASK_STORE
++	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
++	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
+ #else
+-	ret
++	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
++	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
+ #endif
++	VZEROUPPER_RETURN
+ 
+-L(loop_start):
+-	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
+-	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
+-	cmpq	$(VEC_SIZE * 8), %rdx
+-	jbe	L(loop_end)
+-	andq	$-(VEC_SIZE * 2), %rdi
+-	subq	$-(VEC_SIZE * 4), %rdi
+-	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
+-	.p2align 4
+-L(loop):
+-	VMOVA	%VEC(0), (%rdi)
+-	VMOVA	%VEC(0), VEC_SIZE(%rdi)
+-	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
+-	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
+-	subq	$-(VEC_SIZE * 4), %rdi
+-	cmpq	%rcx, %rdi
+-	jb	L(loop)
+-L(loop_end):
+-	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
+-	       rdx as length is also unchanged.  */
+-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
+-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+-	VZEROUPPER_SHORT_RETURN
+-
+-	.p2align 4
++	/* If we have AVX512 mask instructions put L(less_vec) close to
++	   entry as it doesn't take much space and is likely a hot target.
++	 */
++#ifdef USE_LESS_VEC_MASK_STORE
++	.p2align 4,, 10
+ L(less_vec):
+ 	/* Less than 1 VEC.  */
+ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+ #  error Unsupported VEC_SIZE!
+ # endif
+-# ifdef USE_LESS_VEC_MASK_STORE
+ 	/* Clear high bits from edi. Only keeping bits relevant to page
+ 	   cross check. Note that we are using rax which is set in
+-	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
+-	 */
++	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
+ 	andl	$(PAGE_SIZE - 1), %edi
+-	/* Check if VEC_SIZE store cross page. Mask stores suffer serious
+-	   performance degradation when it has to fault supress.  */
++	/* Check if VEC_SIZE store cross page. Mask stores suffer
++	/* Check if VEC_SIZE store crosses a page. Mask stores suffer
++	   serious performance degradation when they have to fault-suppress.
+ 	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
++	/* This is generally considered a cold target.  */
+ 	ja	L(cross_page)
+ # if VEC_SIZE > 32
+ 	movq	$-1, %rcx
+@@ -247,58 +235,185 @@ L(less_vec):
+ 	bzhil	%edx, %ecx, %ecx
+ 	kmovd	%ecx, %k1
+ # endif
+-	vmovdqu8	%VEC(0), (%rax) {%k1}
++	vmovdqu8 %VEC(0), (%rax){%k1}
+ 	VZEROUPPER_RETURN
+ 
++# if defined USE_MULTIARCH && IS_IN (libc)
++	/* Include L(stosb_local) here if including L(less_vec) between
++	   L(stosb_more_2x_vec) and ENTRY. This is to cache align the
++	   L(stosb_more_2x_vec) target.  */
++	.p2align 4,, 10
++L(stosb_local):
++	movzbl	%sil, %eax
++	mov	%RDX_LP, %RCX_LP
++	mov	%RDI_LP, %RDX_LP
++	rep	stosb
++	mov	%RDX_LP, %RAX_LP
++	VZEROUPPER_RETURN
++# endif
++#endif
++
++#if defined USE_MULTIARCH && IS_IN (libc)
+ 	.p2align 4
+-L(cross_page):
++L(stosb_more_2x_vec):
++	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
++	ja	L(stosb_local)
++#endif
++	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
++	   and (4x, 8x] jump to target.  */
++L(more_2x_vec):
++
++	/* Two different methods of setting up pointers / compare. The
++	   two methods are based on the fact that EVEX/AVX512 mov
++	   instructions take more bytes than AVX2/SSE2 mov instructions. As
++	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
++	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
++	   this saves code size and keeps a few targets in one fetch block.
++	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
++	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
++	   LOOP_4X_OFFSET) with LEA_BID.  */
++
++	/* END_REG is rcx for EVEX/AVX512.  */
++	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
++#endif
++
++	/* Stores to first 2x VEC before cmp as any path forward will
++	   require it.  */
++	VMOVU	%VEC(0), (%rax)
++	VMOVU	%VEC(0), VEC_SIZE(%rax)
++
++
++#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
++	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
++	addq	%rdx, %END_REG
++#endif
++
++	cmpq	$(VEC_SIZE * 4), %rdx
++	jbe	L(last_2x_vec)
++
++	/* Store next 2x vec regardless.  */
++	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
++	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
++
++
++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
++	/* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
++	   extra offset to addresses in loop. Used for AVX512 to save space
++	   as no way to get (VEC_SIZE * 4) in imm8.  */
++# if LOOP_4X_OFFSET == 0
++	subq	$-(VEC_SIZE * 4), %LOOP_REG
+ # endif
+-# if VEC_SIZE > 32
+-	cmpb	$32, %dl
+-	jae	L(between_32_63)
++	/* Avoid imm32 compare here to save code size.  */
++	cmpq	%rdi, %rcx
++#else
++	addq	$-(VEC_SIZE * 4), %END_REG
++	cmpq	$(VEC_SIZE * 8), %rdx
++#endif
++	jbe	L(last_4x_vec)
++#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
++	/* Set LOOP_REG (rdx).  */
++	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
++#endif
++	/* Align dst for loop.  */
++	andq	$(VEC_SIZE * -2), %LOOP_REG
++	.p2align 4
++L(loop):
++	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
++	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
++	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
++	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
++	subq	$-(VEC_SIZE * 4), %LOOP_REG
++	cmpq	%END_REG, %LOOP_REG
++	jb	L(loop)
++	.p2align 4,, MOV_SIZE
++L(last_4x_vec):
++	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
++	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
++	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
++	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
++L(return):
++#if VEC_SIZE > 16
++	ZERO_UPPER_VEC_REGISTERS_RETURN
++#else
++	ret
++#endif
++
++	.p2align 4,, 10
++#ifndef USE_LESS_VEC_MASK_STORE
++# if defined USE_MULTIARCH && IS_IN (libc)
++	/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
++	   range for 2-byte jump encoding.  */
++L(stosb_local):
++	movzbl	%sil, %eax
++	mov	%RDX_LP, %RCX_LP
++	mov	%RDI_LP, %RDX_LP
++	rep	stosb
++	mov	%RDX_LP, %RAX_LP
++	VZEROUPPER_RETURN
+ # endif
+-# if VEC_SIZE > 16
+-	cmpb	$16, %dl
++	/* Define L(less_vec) only if not otherwise defined.  */
++	.p2align 4
++L(less_vec):
++#endif
++L(cross_page):
++#if VEC_SIZE > 32
++	cmpl	$32, %edx
++	jae	L(between_32_63)
++#endif
++#if VEC_SIZE > 16
++	cmpl	$16, %edx
+ 	jae	L(between_16_31)
+-# endif
+-	MOVQ	%XMM0, %rcx
+-	cmpb	$8, %dl
++#endif
++	MOVQ	%XMM0, %rdi
++	cmpl	$8, %edx
+ 	jae	L(between_8_15)
+-	cmpb	$4, %dl
++	cmpl	$4, %edx
+ 	jae	L(between_4_7)
+-	cmpb	$1, %dl
++	cmpl	$1, %edx
+ 	ja	L(between_2_3)
+-	jb	1f
+-	movb	%cl, (%rax)
+-1:
++	jb	L(return)
++	movb	%sil, (%rax)
+ 	VZEROUPPER_RETURN
+-# if VEC_SIZE > 32
++
++	/* Align small targets only if not doing so would cross a fetch
++	   line.  */
++#if VEC_SIZE > 32
++	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ L(between_32_63):
+-	VMOVU	%YMM0, -32(%rax,%rdx)
+ 	VMOVU	%YMM0, (%rax)
++	VMOVU	%YMM0, -32(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+-# endif
+-# if VEC_SIZE > 16
+-	/* From 16 to 31.  No branch when size == 16.  */
++#endif
++
++#if VEC_SIZE >= 32
++	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+ L(between_16_31):
+-	VMOVU	%XMM0, -16(%rax,%rdx)
++	/* From 16 to 31.  No branch when size == 16.  */
+ 	VMOVU	%XMM0, (%rax)
++	VMOVU	%XMM0, -16(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+-# endif
+-	/* From 8 to 15.  No branch when size == 8.  */
++#endif
++
++	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+ L(between_8_15):
+-	movq	%rcx, -8(%rax,%rdx)
+-	movq	%rcx, (%rax)
++	/* From 8 to 15.  No branch when size == 8.  */
++	movq	%rdi, (%rax)
++	movq	%rdi, -8(%rax, %rdx)
+ 	VZEROUPPER_RETURN
++
++	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
+ L(between_4_7):
+ 	/* From 4 to 7.  No branch when size == 4.  */
+-	movl	%ecx, -4(%rax,%rdx)
+-	movl	%ecx, (%rax)
++	movl	%edi, (%rax)
++	movl	%edi, -4(%rax, %rdx)
+ 	VZEROUPPER_RETURN
++
++	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+ L(between_2_3):
+ 	/* From 2 to 3.  No branch when size == 2.  */
+-	movw	%cx, -2(%rax,%rdx)
+-	movw	%cx, (%rax)
++	movw	%di, (%rax)
++	movb	%dil, -1(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+ END (MEMSET_SYMBOL (__memset, unaligned_erms))
+-- 
+GitLab
+
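The small-size buckets in the hunk above (L(between_8_15) down to L(between_2_3)) all use one pattern: write a chunk at the start and a second, possibly overlapping chunk at the end so every length in the bucket is covered without branching on the exact size. A hedged C sketch of the 8-to-15-byte case follows; small_memset_8_15 is an invented name and the byte splat stands in for MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* len must be in [8, 15]; the two 8-byte stores overlap or coincide, so
   together they cover exactly [dst, dst + len).  */
static void
small_memset_8_15 (unsigned char *dst, int c, size_t len)
{
  uint64_t v = 0x0101010101010101ULL * (unsigned char) c;  /* splat the byte */
  memcpy (dst, &v, 8);             /* movq %rdi, (%rax) */
  memcpy (dst + len - 8, &v, 8);   /* movq %rdi, -8(%rax, %rdx) */
}

int
main (void)
{
  unsigned char buf[16] = { 0 };
  small_memset_8_15 (buf, 0xab, 11);
  for (int i = 0; i < 11; i++)
    assert (buf[i] == 0xab);
  assert (buf[11] == 0);
  return 0;
}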
diff --git a/SOURCES/ia-opt-strchr-avx2.patch b/SOURCES/ia-opt-strchr-avx2.patch
new file mode 100644
index 0000000..20978ce
--- /dev/null
+++ b/SOURCES/ia-opt-strchr-avx2.patch
@@ -0,0 +1,392 @@
+From 93b1c47bd092f8e1444a10b5d6ec20e44d66459a Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 23 Apr 2021 15:56:24 -0400
+Subject: [PATCH] x86: Optimize strchr-avx2.S
+
+No bug. This commit optimizes strchr-avx2.S. The optimizations are all
+small things such as save an ALU in the alignment process, saving a
+few instructions in the loop return, saving some bytes in the main
+loop, and increasing the ILP in the return cases. test-strchr,
+test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+(cherry picked from commit ccabe7971f508709d034b63b8672f6f751a3d356)
+---
+ sysdeps/x86_64/multiarch/strchr-avx2.S | 290 +++++++++++++++----------
+ 1 file changed, 170 insertions(+), 120 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
+index 919d256c..5884726b 100644
+--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
+@@ -49,133 +49,144 @@
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRCHR)
+-	movl	%edi, %ecx
+-# ifndef USE_AS_STRCHRNUL
+-	xorl	%edx, %edx
+-# endif
+-
+ 	/* Broadcast CHAR to YMM0.	*/
+ 	vmovd	%esi, %xmm0
++	movl	%edi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
++	VPBROADCAST	%xmm0, %ymm0
+ 	vpxor	%xmm9, %xmm9, %xmm9
+-	VPBROADCAST %xmm0, %ymm0
+ 
+ 	/* Check if we cross page boundary with one vector load.  */
+-	andl	$(PAGE_SIZE - 1), %ecx
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+-	ja  L(cross_page_boundary)
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
+ 	   null byte.  */
+ 	vmovdqu	(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
++	VPCMPEQ	%ymm8, %ymm0, %ymm1
++	VPCMPEQ	%ymm8, %ymm9, %ymm2
+ 	vpor	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+-	jz	L(more_vecs)
++	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
++# ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
++	cmp	(%rdi, %rax), %CHAR_REG
++	jne	L(zero)
++# endif
+ 	addq	%rdi, %rax
++	VZEROUPPER_RETURN
++
++	/* .p2align 5 helps keep performance more consistent if ENTRY()
++	   alignment % 32 was either 16 or 0. As well this makes the
++	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
++	   easier.  */
++	.p2align 5
++L(first_vec_x4):
++	tzcntl	%eax, %eax
++	addq	$(VEC_SIZE * 3 + 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rdi, %rax), %CHAR_REG
++	jne	L(zero)
+ # endif
+-L(return_vzeroupper):
+-	ZERO_UPPER_VEC_REGISTERS_RETURN
+-
+-	.p2align 4
+-L(more_vecs):
+-	/* Align data for aligned loads in the loop.  */
+-	andq	$-VEC_SIZE, %rdi
+-L(aligned_more):
+-
+-	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
+-	   since data is only aligned to VEC_SIZE.	*/
+-	vmovdqa	VEC_SIZE(%rdi), %ymm8
+-	addq	$VEC_SIZE, %rdi
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-
+-	vmovdqa	VEC_SIZE(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-
+-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-
+-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jz	L(prep_loop_4x)
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
+ 
+-	tzcntl	%eax, %eax
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++L(zero):
++	xorl	%eax, %eax
++	VZEROUPPER_RETURN
+ # endif
+-	VZEROUPPER
+-	ret
++
+ 
+ 	.p2align 4
+-L(first_vec_x0):
++L(first_vec_x1):
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-	addq	%rdi, %rax
++	incq	%rdi
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rdi, %rax), %CHAR_REG
++	jne	L(zero)
+ # endif
++	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x1):
++L(first_vec_x2):
+ 	tzcntl	%eax, %eax
+-	leaq	VEC_SIZE(%rdi, %rax), %rax
++	addq	$(VEC_SIZE + 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rdi, %rax), %CHAR_REG
++	jne	L(zero)
+ # endif
++	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x2):
++L(first_vec_x3):
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
++	addq	$(VEC_SIZE * 2 + 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rdi, %rax), %CHAR_REG
++	jne	L(zero)
+ # endif
++	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+-L(prep_loop_4x):
+-	/* Align data to 4 * VEC_SIZE.	*/
+-	andq	$-(VEC_SIZE * 4), %rdi
++	.p2align 4
++L(aligned_more):
++	/* Align data to VEC_SIZE - 1. This is the same number of
++	   instructions as using andq -VEC_SIZE but saves 4 bytes of code
++	   on x4 check.  */
++	orq	$(VEC_SIZE - 1), %rdi
++L(cross_page_continue):
++	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
++	   since data is only aligned to VEC_SIZE.  */
++	vmovdqa	1(%rdi), %ymm8
++	VPCMPEQ	%ymm8, %ymm0, %ymm1
++	VPCMPEQ	%ymm8, %ymm9, %ymm2
++	vpor	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++
++	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
++	VPCMPEQ	%ymm8, %ymm0, %ymm1
++	VPCMPEQ	%ymm8, %ymm9, %ymm2
++	vpor	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x2)
++
++	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
++	VPCMPEQ	%ymm8, %ymm0, %ymm1
++	VPCMPEQ	%ymm8, %ymm9, %ymm2
++	vpor	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x3)
+ 
++	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
++	VPCMPEQ	%ymm8, %ymm0, %ymm1
++	VPCMPEQ	%ymm8, %ymm9, %ymm2
++	vpor	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x4)
++	/* Align data to VEC_SIZE * 4 - 1.	*/
++	addq	$(VEC_SIZE * 4 + 1), %rdi
++	andq	$-(VEC_SIZE * 4), %rdi
+ 	.p2align 4
+ L(loop_4x_vec):
+ 	/* Compare 4 * VEC at a time forward.  */
+-	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
+-	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
+-	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
+-	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
++	vmovdqa	(%rdi), %ymm5
++	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
++	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
++	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+ 
+ 	/* Leaves only CHARS matching esi as 0.	 */
+ 	vpxor	%ymm5, %ymm0, %ymm1
+@@ -191,63 +202,102 @@ L(loop_4x_vec):
+ 	VPMINU	%ymm1, %ymm2, %ymm5
+ 	VPMINU	%ymm3, %ymm4, %ymm6
+ 
+-	VPMINU	%ymm5, %ymm6, %ymm5
++	VPMINU	%ymm5, %ymm6, %ymm6
+ 
+-	VPCMPEQ %ymm5, %ymm9, %ymm5
+-	vpmovmskb %ymm5, %eax
++	VPCMPEQ	%ymm6, %ymm9, %ymm6
++	vpmovmskb %ymm6, %ecx
++	subq	$-(VEC_SIZE * 4), %rdi
++	testl	%ecx, %ecx
++	jz	L(loop_4x_vec)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+-	testl	%eax, %eax
+-	jz  L(loop_4x_vec)
+ 
+-	VPCMPEQ %ymm1, %ymm9, %ymm1
++	VPCMPEQ	%ymm1, %ymm9, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
++	jnz	L(last_vec_x0)
++
+ 
+-	VPCMPEQ %ymm2, %ymm9, %ymm2
++	VPCMPEQ	%ymm5, %ymm9, %ymm2
+ 	vpmovmskb %ymm2, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
++	jnz	L(last_vec_x1)
++
++	VPCMPEQ	%ymm3, %ymm9, %ymm3
++	vpmovmskb %ymm3, %eax
++	/* rcx has combined result from all 4 VEC. It will only be used
++	   if the first 3 other VEC all did not contain a match.  */
++	salq	$32, %rcx
++	orq	%rcx, %rax
++	tzcntq	%rax, %rax
++	subq	$(VEC_SIZE * 2), %rdi
++# ifndef USE_AS_STRCHRNUL
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rdi, %rax), %CHAR_REG
++	jne	L(zero_end)
++# endif
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
++
++
++	.p2align 4
++L(last_vec_x0):
++	tzcntl	%eax, %eax
++	addq	$-(VEC_SIZE * 4), %rdi
++# ifndef USE_AS_STRCHRNUL
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rdi, %rax), %CHAR_REG
++	jne	L(zero_end)
++# endif
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
+ 
+-	VPCMPEQ %ymm3, %ymm9, %ymm3
+-	VPCMPEQ %ymm4, %ymm9, %ymm4
+-	vpmovmskb %ymm3, %ecx
+-	vpmovmskb %ymm4, %eax
+-	salq	$32, %rax
+-	orq %rcx, %rax
+-	tzcntq  %rax, %rax
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++L(zero_end):
++	xorl	%eax, %eax
++	VZEROUPPER_RETURN
+ # endif
+-	VZEROUPPER
+-	ret
++
++	.p2align 4
++L(last_vec_x1):
++	tzcntl	%eax, %eax
++	subq	$(VEC_SIZE * 3), %rdi
++# ifndef USE_AS_STRCHRNUL
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rdi, %rax), %CHAR_REG
++	jne	L(zero_end)
++# endif
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
++
+ 
+ 	/* Cold case for crossing page with first load.	 */
+ 	.p2align 4
+ L(cross_page_boundary):
+-	andq	$-VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-
+-	vmovdqa	(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
++	movq	%rdi, %rdx
++	/* Align rdi to VEC_SIZE - 1.  */
++	orq	$(VEC_SIZE - 1), %rdi
++	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
++	VPCMPEQ	%ymm8, %ymm0, %ymm1
++	VPCMPEQ	%ymm8, %ymm9, %ymm2
+ 	vpor	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %eax
+-	/* Remove the leading bits.	 */
+-	sarxl	%ecx, %eax, %eax
++	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
++	   so no need to manually mod edx.  */
++	sarxl	%edx, %eax, %eax
+ 	testl	%eax, %eax
+-	jz	L(aligned_more)
++	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+-	addq	%rcx, %rdi
+-	addq	%rdi, %rax
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	xorl	%ecx, %ecx
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rdx, %rax), %CHAR_REG
++	leaq	(%rdx, %rax), %rax
++	cmovne	%rcx, %rax
++# else
++	addq	%rdx, %rax
+ # endif
+-	VZEROUPPER_RETURN
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ END (STRCHR)
+ # endif
+-- 
+GitLab
+
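The entry path above (and the EVEX version in the next patch) guards the first unaligned vector load with a page-offset test, and the new L(cross_page_boundary) code strips the leading lanes from the match mask with a shift counted by the raw pointer. A short C sketch of both steps, assuming VEC_SIZE of 32; the helper names are invented for illustration, and a plain logical shift stands in for sarxl, whose count likewise only uses the low bits of its register.

#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

/* A VEC_SIZE unaligned load at ptr is safe unless it would run into the
   next page, i.e. unless the page offset exceeds PAGE_SIZE - VEC_SIZE.  */
static int
load_would_cross_page (uintptr_t ptr)
{
  return (ptr & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
}

/* On the cross-page path the load comes from the aligned address below ptr,
   so the low bits of ptr say how many leading lanes of the match mask must
   be discarded before tzcnt picks the first real hit.  */
static uint32_t
discard_leading_lanes (uint32_t mask, uintptr_t ptr)
{
  return mask >> (ptr & (VEC_SIZE - 1));
}

int
main (void)
{
  assert (!load_would_cross_page (0x1000));
  assert (load_would_cross_page (0x1fff));
  assert (discard_leading_lanes (0xffffffffu, 0x1004) == 0x0fffffffu);
  return 0;
}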
diff --git a/SOURCES/ia-opt-strchr-evex.patch b/SOURCES/ia-opt-strchr-evex.patch
new file mode 100644
index 0000000..faf7223
--- /dev/null
+++ b/SOURCES/ia-opt-strchr-evex.patch
@@ -0,0 +1,532 @@
+From c17aa053d5b26520fddad8bfb590b521cb027280 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 23 Apr 2021 15:56:25 -0400
+Subject: [PATCH] x86: Optimize strchr-evex.S
+
+No bug. This commit optimizes strchr-evex.S. The optimizations are
+mostly small things such as save an ALU in the alignment process,
+saving a few instructions in the loop return. The one significant
+change is saving 2 instructions in the 4x loop. test-strchr,
+test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+(cherry picked from commit 7f3e7c262cab4e2401e4331a6ef29c428de02044)
+---
+ sysdeps/x86_64/multiarch/strchr-evex.S | 392 ++++++++++++++-----------
+ 1 file changed, 218 insertions(+), 174 deletions(-)
+
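Several comments in the hunk below describe leaving "only CHARS matching esi as 0" and then taking an unsigned minimum with the original data. Shown per byte in a hedged C sketch (the real code does this lane-wise with vpxorq and VPMINU over whole vectors): xor zeroes the bytes equal to CHAR, the minimum additionally zeroes null bytes, so one compare against zero catches both terminators.

#include <assert.h>

/* Returns nonzero iff byte is either CHAR or the null terminator, using the
   xor + unsigned-min trick from the vector code.  */
static int
matches_char_or_null (unsigned char byte, unsigned char c)
{
  unsigned char x = byte ^ c;            /* 0 iff byte == c */
  unsigned char m = x < byte ? x : byte; /* one VPMINU lane */
  return m == 0;
}

int
main (void)
{
  assert (matches_char_or_null ('a', 'a'));
  assert (matches_char_or_null ('\0', 'a'));
  assert (!matches_char_or_null ('b', 'a'));
  return 0;
}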
+diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
+index ddc86a70..7f9d4ee4 100644
+--- a/sysdeps/x86_64/multiarch/strchr-evex.S
++++ b/sysdeps/x86_64/multiarch/strchr-evex.S
+@@ -32,13 +32,15 @@
+ #  define VPCMP		vpcmpd
+ #  define VPMINU	vpminud
+ #  define CHAR_REG	esi
+-#  define SHIFT_REG	r8d
++#  define SHIFT_REG	ecx
++#  define CHAR_SIZE	4
+ # else
+ #  define VPBROADCAST	vpbroadcastb
+ #  define VPCMP		vpcmpb
+ #  define VPMINU	vpminub
+ #  define CHAR_REG	sil
+-#  define SHIFT_REG	ecx
++#  define SHIFT_REG	edx
++#  define CHAR_SIZE	1
+ # endif
+ 
+ # define XMMZERO	xmm16
+@@ -56,23 +58,20 @@
+ 
+ # define VEC_SIZE 32
+ # define PAGE_SIZE 4096
++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+ 
+ 	.section .text.evex,"ax",@progbits
+ ENTRY (STRCHR)
+-	movl	%edi, %ecx
+-# ifndef USE_AS_STRCHRNUL
+-	xorl	%edx, %edx
+-# endif
+-
+ 	/* Broadcast CHAR to YMM0.	*/
+-	VPBROADCAST %esi, %YMM0
+-
++	VPBROADCAST	%esi, %YMM0
++	movl	%edi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
+ 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+ 
+-	/* Check if we cross page boundary with one vector load.  */
+-	andl	$(PAGE_SIZE - 1), %ecx
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+-	ja  L(cross_page_boundary)
++	/* Check if we cross page boundary with one vector load.
++	   Otherwise it is safe to use an unaligned load.  */
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
+ 	   null bytes.  */
+@@ -83,251 +82,296 @@ ENTRY (STRCHR)
+ 	VPMINU	%YMM2, %YMM1, %YMM2
+ 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+ 	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	ktestd	%k0, %k0
+-	jz	L(more_vecs)
+ 	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+ # ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(%rdi, %rax, 4), %rax
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
++	 */
++	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+ 	addq	%rdi, %rax
+ # endif
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rax), %CHAR_REG
++	jne	L(zero)
+ # endif
+ 	ret
+ 
+-	.p2align 4
+-L(more_vecs):
+-	/* Align data for aligned loads in the loop.  */
+-	andq	$-VEC_SIZE, %rdi
+-L(aligned_more):
+-
+-	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
+-	   since data is only aligned to VEC_SIZE.	*/
+-	VMOVA	VEC_SIZE(%rdi), %YMM1
+-	addq	$VEC_SIZE, %rdi
+-
+-	/* Leaves only CHARS matching esi as 0.  */
+-	vpxorq	%YMM1, %YMM0, %YMM2
+-	VPMINU	%YMM2, %YMM1, %YMM2
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-
+-	VMOVA	VEC_SIZE(%rdi), %YMM1
+-	/* Leaves only CHARS matching esi as 0.  */
+-	vpxorq	%YMM1, %YMM0, %YMM2
+-	VPMINU	%YMM2, %YMM1, %YMM2
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-
+-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
+-	/* Leaves only CHARS matching esi as 0.  */
+-	vpxorq	%YMM1, %YMM0, %YMM2
+-	VPMINU	%YMM2, %YMM1, %YMM2
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-
+-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
+-	/* Leaves only CHARS matching esi as 0.  */
+-	vpxorq	%YMM1, %YMM0, %YMM2
+-	VPMINU	%YMM2, %YMM1, %YMM2
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	ktestd	%k0, %k0
+-	jz	L(prep_loop_4x)
+-
+-	kmovd	%k0, %eax
++	/* .p2align 5 helps keep performance more consistent if ENTRY()
++	   alignment % 32 was either 16 or 0. As well this makes the
++	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
++	   easier.  */
++	.p2align 5
++L(first_vec_x3):
+ 	tzcntl	%eax, %eax
++# ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
+-# else
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
++	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
++	jne	L(zero)
+ # endif
++	/* NB: Multiply sizeof char type (1 or 4) to get the number of
++	   bytes.  */
++	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
++	ret
++
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+-# endif
++L(zero):
++	xorl	%eax, %eax
+ 	ret
++# endif
+ 
+ 	.p2align 4
+-L(first_vec_x0):
++L(first_vec_x4):
++# ifndef USE_AS_STRCHRNUL
++	/* Check to see if first match was CHAR (k0) or null (k1).  */
++	kmovd	%k0, %eax
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(%rdi, %rax, 4), %rax
++	kmovd	%k1, %ecx
++	/* bzhil will not be 0 if first match was null.  */
++	bzhil	%eax, %ecx, %ecx
++	jne	L(zero)
+ # else
+-	addq	%rdi, %rax
+-# endif
+-# ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	/* Combine CHAR and null matches.  */
++	kord	%k0, %k1, %k0
++	kmovd	%k0, %eax
++	tzcntl	%eax, %eax
+ # endif
++	/* NB: Multiply sizeof char type (1 or 4) to get the number of
++	   bytes.  */
++	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+ 	.p2align 4
+ L(first_vec_x1):
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
+-# else
+-	leaq	VEC_SIZE(%rdi, %rax), %rax
+-# endif
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	/* Found CHAR or the null byte.	 */
++	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
++	jne	L(zero)
++
+ # endif
++	/* NB: Multiply sizeof char type (1 or 4) to get the number of
++	   bytes.  */
++	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+ 	.p2align 4
+ L(first_vec_x2):
++# ifndef USE_AS_STRCHRNUL
++	/* Check to see if first match was CHAR (k0) or null (k1).  */
++	kmovd	%k0, %eax
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
++	kmovd	%k1, %ecx
++	/* bzhil will not be 0 if first match was null.  */
++	bzhil	%eax, %ecx, %ecx
++	jne	L(zero)
+ # else
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+-# endif
+-# ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	/* Combine CHAR and null matches.  */
++	kord	%k0, %k1, %k0
++	kmovd	%k0, %eax
++	tzcntl	%eax, %eax
+ # endif
++	/* NB: Multiply sizeof char type (1 or 4) to get the number of
++	   bytes.  */
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+-L(prep_loop_4x):
+-	/* Align data to 4 * VEC_SIZE.	*/
++	.p2align 4
++L(aligned_more):
++	/* Align data to VEC_SIZE.  */
++	andq	$-VEC_SIZE, %rdi
++L(cross_page_continue):
++	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
++	   data is only aligned to VEC_SIZE. Use two alternating methods
++	   for checking VEC to balance latency and port contention.  */
++
++	/* This method has higher latency but has better port
++	   distribution.  */
++	VMOVA	(VEC_SIZE)(%rdi), %YMM1
++	/* Leaves only CHARS matching esi as 0.  */
++	vpxorq	%YMM1, %YMM0, %YMM2
++	VPMINU	%YMM2, %YMM1, %YMM2
++	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++
++	/* This method has higher latency but has better port
++	   distribution.  */
++	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
++	/* Each bit in K0 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMM1, %YMM0, %k0
++	/* Each bit in K1 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMM1, %YMMZERO, %k1
++	kortestd	%k0, %k1
++	jnz	L(first_vec_x2)
++
++	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
++	/* Leaves only CHARS matching esi as 0.  */
++	vpxorq	%YMM1, %YMM0, %YMM2
++	VPMINU	%YMM2, %YMM1, %YMM2
++	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x3)
++
++	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
++	/* Each bit in K0 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMM1, %YMM0, %k0
++	/* Each bit in K1 represents a null byte in YMM1.  */
++	VPCMP	$0, %YMM1, %YMMZERO, %k1
++	kortestd	%k0, %k1
++	jnz	L(first_vec_x4)
++
++	/* Align data to VEC_SIZE * 4 for the loop.  */
++	addq	$VEC_SIZE, %rdi
+ 	andq	$-(VEC_SIZE * 4), %rdi
+ 
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
++	/* Check 4x VEC at a time. No penalty to imm32 offset with evex
++	   encoding.  */
+ 	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+ 	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
+ 	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
+ 	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
+ 
+-	/* Leaves only CHARS matching esi as 0.  */
++	/* For YMM1 and YMM3 use xor to set the CHARs matching esi to
++	   zero.  */
+ 	vpxorq	%YMM1, %YMM0, %YMM5
+-	vpxorq	%YMM2, %YMM0, %YMM6
++	/* For YMM2 and YMM4 compare not-equal to CHAR and store the
++	   result in a k register. It is possible to save 1 or 2
++	   instructions by also using the compare-not-equal method for
++	   YMM1, or for both YMM1 and YMM3 respectively, but the
++	   bottleneck on p5 makes it not worth it.  */
++	VPCMP	$4, %YMM0, %YMM2, %k2
+ 	vpxorq	%YMM3, %YMM0, %YMM7
+-	vpxorq	%YMM4, %YMM0, %YMM8
+-
+-	VPMINU	%YMM5, %YMM1, %YMM5
+-	VPMINU	%YMM6, %YMM2, %YMM6
+-	VPMINU	%YMM7, %YMM3, %YMM7
+-	VPMINU	%YMM8, %YMM4, %YMM8
+-
+-	VPMINU	%YMM5, %YMM6, %YMM1
+-	VPMINU	%YMM7, %YMM8, %YMM2
+-
+-	VPMINU	%YMM1, %YMM2, %YMM1
+-
+-	/* Each bit in K0 represents a CHAR or a null byte.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-	ktestd	%k0, %k0
++	VPCMP	$4, %YMM0, %YMM4, %k4
++
++	/* Use min to select all zeros from either the xor result or the
++	   end of the string.  */
++	VPMINU	%YMM1, %YMM5, %YMM1
++	VPMINU	%YMM3, %YMM7, %YMM3
++
++	/* Use min + zeromask to select for zeros. k2 and k4 have a 0 at
++	   positions that matched CHAR, so the zero-masking write zeroes
++	   the corresponding destination elements in YMM2 / YMM4.  */
++	VPMINU	%YMM1, %YMM2, %YMM2{%k2}{z}
++	VPMINU	%YMM3, %YMM4, %YMM4
++	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
++
++	VPCMP	$0, %YMMZERO, %YMM4, %k1
++	kmovd	%k1, %ecx
++	subq	$-(VEC_SIZE * 4), %rdi
++	testl	%ecx, %ecx
+ 	jz	L(loop_4x_vec)
+ 
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM5, %k0
++	VPCMP	$0, %YMMZERO, %YMM1, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
++	jnz	L(last_vec_x1)
+ 
+-	/* Each bit in K1 represents a CHAR or a null byte in YMM2.  */
+-	VPCMP	$0, %YMMZERO, %YMM6, %k1
+-	kmovd	%k1, %eax
++	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-
+-	/* Each bit in K2 represents a CHAR or a null byte in YMM3.  */
+-	VPCMP	$0, %YMMZERO, %YMM7, %k2
+-	/* Each bit in K3 represents a CHAR or a null byte in YMM4.  */
+-	VPCMP	$0, %YMMZERO, %YMM8, %k3
++	jnz	L(last_vec_x2)
+ 
++	VPCMP	$0, %YMMZERO, %YMM3, %k0
++	kmovd	%k0, %eax
++	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
+ # ifdef USE_AS_WCSCHR
+-	/* NB: Each bit in K2/K3 represents 4-byte element.  */
+-	kshiftlw $8, %k3, %k1
++	sall	$8, %ecx
++	orl	%ecx, %eax
++	tzcntl	%eax, %eax
+ # else
+-	kshiftlq $32, %k3, %k1
++	salq	$32, %rcx
++	orq	%rcx, %rax
++	tzcntq	%rax, %rax
+ # endif
++# ifndef USE_AS_STRCHRNUL
++	/* Check if match was CHAR or null.  */
++	cmp	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
++	jne	L(zero_end)
++# endif
++	/* NB: Multiply sizeof char type (1 or 4) to get the number of
++	   bytes.  */
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
++	ret
+ 
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	korq	%k1, %k2, %k1
+-	kmovq	%k1, %rax
++# ifndef USE_AS_STRCHRNUL
++L(zero_end):
++	xorl	%eax, %eax
++	ret
++# endif
+ 
+-	tzcntq  %rax, %rax
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+-# else
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
++	.p2align 4
++L(last_vec_x1):
++	tzcntl	%eax, %eax
++# ifndef USE_AS_STRCHRNUL
++	/* Check if match was null.  */
++	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
++	jne	L(zero_end)
+ # endif
++	/* NB: Multiply sizeof char type (1 or 4) to get the number of
++	   bytes.  */
++	leaq	(%rdi, %rax, CHAR_SIZE), %rax
++	ret
++
++	.p2align 4
++L(last_vec_x2):
++	tzcntl	%eax, %eax
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	/* Check if match was null.  */
++	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
++	jne	L(zero_end)
+ # endif
++	/* NB: Multiply sizeof char type (1 or 4) to get the number of
++	   bytes.  */
++	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+ 	/* Cold case for crossing page with first load.	 */
+ 	.p2align 4
+ L(cross_page_boundary):
++	movq	%rdi, %rdx
++	/* Align rdi.  */
+ 	andq	$-VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-
+ 	VMOVA	(%rdi), %YMM1
+-
+ 	/* Leaves only CHARS matching esi as 0.  */
+ 	vpxorq	%YMM1, %YMM0, %YMM2
+ 	VPMINU	%YMM2, %YMM1, %YMM2
+ 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+ 	VPCMP	$0, %YMMZERO, %YMM2, %k0
+ 	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-
++	/* Remove the leading bits.	 */
+ # ifdef USE_AS_WCSCHR
++	movl	%edx, %SHIFT_REG
+ 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+ 	   bytes.  */
+-	movl	%ecx, %SHIFT_REG
+-	sarl    $2, %SHIFT_REG
++	sarl	$2, %SHIFT_REG
++	andl	$(CHAR_PER_VEC - 1), %SHIFT_REG
+ # endif
+-
+-	/* Remove the leading bits.	 */
+ 	sarxl	%SHIFT_REG, %eax, %eax
++	/* If eax is zero continue.  */
+ 	testl	%eax, %eax
+-
+-	jz	L(aligned_more)
++	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+-	addq	%rcx, %rdi
++# ifndef USE_AS_STRCHRNUL
++	/* Check to see if match was CHAR or null.  */
++	cmp	(%rdx, %rax, CHAR_SIZE), %CHAR_REG
++	jne	L(zero_end)
++# endif
+ # ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(%rdi, %rax, 4), %rax
++	/* NB: Multiply wchar_t count by 4 to get the number of
++	   bytes.  */
++	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+ # else
+-	addq	%rdi, %rax
+-# endif
+-# ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	addq	%rdx, %rax
+ # endif
+ 	ret
+ 
+-- 
+GitLab
+
diff --git a/SOURCES/ia-opt-strlen-evex.patch b/SOURCES/ia-opt-strlen-evex.patch
new file mode 100644
index 0000000..ce1a1af
--- /dev/null
+++ b/SOURCES/ia-opt-strlen-evex.patch
@@ -0,0 +1,745 @@
+From 417f10b43cd3a0bc5c67b0b5151e92b722bdd8d7 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 19 Apr 2021 19:36:06 -0400
+Subject: [PATCH] x86: Optimize strlen-evex.S
+
+No bug. This commit optimizes strlen-evex.S. The
+optimizations are mostly small things but they add up to roughly
+10-30% performance improvement for strlen. The results for strnlen are
+bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and
+test-wcsnlen are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
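+Much of the churn below is about counting in characters rather than
+bytes (CHAR_SIZE / CHAR_PER_VEC), since the same source also builds
+wcslen/wcsnlen, where one character is 4 bytes. A quick sketch of the
+interface contract being preserved (illustrative only; glibc's wchar_t
+is 4 bytes on x86-64):
+
+	#include <wchar.h>
+	#include <string.h>
+	#include <stdio.h>
+
+	int
+	main (void)
+	{
+	  printf ("%zu\n", strlen ("abc"));   /* 3 bytes */
+	  printf ("%zu\n", wcslen (L"abc"));  /* 3 wide characters, not 12 bytes */
+	  return 0;
+	}
+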
+(cherry picked from commit 4ba65586847751372520a36757c17f114588794e)
+---
+ sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++-----------
+ 1 file changed, 317 insertions(+), 264 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
+index 05838190..4bf6874b 100644
+--- a/sysdeps/x86_64/multiarch/strlen-evex.S
++++ b/sysdeps/x86_64/multiarch/strlen-evex.S
+@@ -29,11 +29,13 @@
+ # ifdef USE_AS_WCSLEN
+ #  define VPCMP		vpcmpd
+ #  define VPMINU	vpminud
+-#  define SHIFT_REG	r9d
++#  define SHIFT_REG ecx
++#  define CHAR_SIZE	4
+ # else
+ #  define VPCMP		vpcmpb
+ #  define VPMINU	vpminub
+-#  define SHIFT_REG	ecx
++#  define SHIFT_REG edx
++#  define CHAR_SIZE	1
+ # endif
+ 
+ # define XMMZERO	xmm16
+@@ -46,132 +48,165 @@
+ # define YMM6		ymm22
+ 
+ # define VEC_SIZE 32
++# define PAGE_SIZE 4096
++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+ 
+ 	.section .text.evex,"ax",@progbits
+ ENTRY (STRLEN)
+ # ifdef USE_AS_STRNLEN
+-	/* Check for zero length.  */
++	/* Check zero length.  */
+ 	test	%RSI_LP, %RSI_LP
+ 	jz	L(zero)
+-#  ifdef USE_AS_WCSLEN
+-	shl	$2, %RSI_LP
+-#  elif defined __ILP32__
++#  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%esi, %esi
+ #  endif
+ 	mov	%RSI_LP, %R8_LP
+ # endif
+-	movl	%edi, %ecx
+-	movq	%rdi, %rdx
++	movl	%edi, %eax
+ 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+-
++	/* Clear high bits from edi. Only keeping bits relevant to page
++	   cross check.  */
++	andl	$(PAGE_SIZE - 1), %eax
+ 	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+ 	   null byte.  */
+ 	VPCMP	$0, (%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-
+ # ifdef USE_AS_STRNLEN
+-	jnz	L(first_vec_x0_check)
+-	/* Adjust length and check the end of data.  */
+-	subq	$VEC_SIZE, %rsi
+-	jbe	L(max)
+-# else
+-	jnz	L(first_vec_x0)
++	/* If length < CHAR_PER_VEC handle specially.  */
++	cmpq	$CHAR_PER_VEC, %rsi
++	jbe	L(first_vec_x0)
+ # endif
+-
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-
++	testl	%eax, %eax
++	jz	L(aligned_more)
++	tzcntl	%eax, %eax
++	ret
+ # ifdef USE_AS_STRNLEN
+-	/* Adjust length.  */
+-	addq	%rcx, %rsi
++L(zero):
++	xorl	%eax, %eax
++	ret
+ 
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
++	.p2align 4
++L(first_vec_x0):
++	/* Set bit for max len so that tzcnt will return min of max len
++	   and position of first match.  */
++	btsq	%rsi, %rax
++	tzcntl	%eax, %eax
++	ret
+ # endif
+-	jmp	L(more_4x_vec)
+ 
+ 	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Divide shift count by 4 since each bit in K0 represent 4
+-	   bytes.  */
+-	movl	%ecx, %SHIFT_REG
+-	sarl	$2, %SHIFT_REG
++L(first_vec_x1):
++	tzcntl	%eax, %eax
++	/* Safe to use 32 bit instructions as these are only called for
++	   size = [1, 159].  */
++# ifdef USE_AS_STRNLEN
++	/* Use ecx which was computed earlier to compute correct value.
++	 */
++	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
++# else
++	subl	%edx, %edi
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %edi
++#  endif
++	leal	CHAR_PER_VEC(%rdi, %rax), %eax
+ # endif
+-	VPCMP	$0, (%rdi), %YMMZERO, %k0
+-	kmovd	%k0, %eax
++	ret
+ 
+-	/* Remove the leading bytes.  */
+-	sarxl	%SHIFT_REG, %eax, %eax
+-	testl	%eax, %eax
+-	jz	L(aligned_more)
++	.p2align 4
++L(first_vec_x2):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
++	/* Safe to use 32 bit instructions as these are only called for
++	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-# endif
+-	addq	%rdi, %rax
+-	addq	%rcx, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
++	/* Use ecx which was computed earlier to compute correct value.
++	 */
++	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
++# else
++	subl	%edx, %edi
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %edi
++#  endif
++	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
+ # endif
+ 	ret
+ 
+ 	.p2align 4
+-L(aligned_more):
++L(first_vec_x3):
++	tzcntl	%eax, %eax
++	/* Safe to use 32 bit instructions as these are only called for
++	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
+-	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
+-	    to void possible addition overflow.  */
+-	negq	%rcx
+-	addq	$VEC_SIZE, %rcx
+-
+-	/* Check the end of data.  */
+-	subq	%rcx, %rsi
+-	jbe	L(max)
++	/* Use ecx which was computed earlier to compute correct value.
++	 */
++	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
++# else
++	subl	%edx, %edi
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %edi
++#  endif
++	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
+ # endif
++	ret
+ 
+-	addq	$VEC_SIZE, %rdi
+-
++	.p2align 4
++L(first_vec_x4):
++	tzcntl	%eax, %eax
++	/* Safe to use 32 bit instructions as these are only called for
++	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
++	/* Use ecx which was computed earlier to compute correct value.
++	 */
++	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
++# else
++	subl	%edx, %edi
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %edi
++#  endif
++	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
+ # endif
++	ret
+ 
+-L(more_4x_vec):
++	.p2align 5
++L(aligned_more):
++	movq	%rdi, %rdx
++	/* Align data to VEC_SIZE.  */
++	andq	$-(VEC_SIZE), %rdi
++L(cross_page_continue):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+-	VPCMP	$0, (%rdi), %YMMZERO, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-
++# ifdef USE_AS_STRNLEN
++	/* + CHAR_SIZE because it simplifies the logic in
++	   last_4x_vec_or_less.  */
++	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
++	subq	%rdx, %rcx
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %ecx
++#  endif
++# endif
++	/* Load first VEC regardless.  */
+ 	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
++# ifdef USE_AS_STRNLEN
++	/* Adjust length. If near end handle specially.  */
++	subq	%rcx, %rsi
++	jb	L(last_4x_vec_or_less)
++# endif
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+ 	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+-	testl	%eax, %eax
++	test	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+ 	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+@@ -179,258 +214,276 @@ L(more_4x_vec):
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-# ifdef USE_AS_STRNLEN
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+-
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
+-	andq	$-(4 * VEC_SIZE), %rdi
++	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x4)
+ 
++	addq	$VEC_SIZE, %rdi
+ # ifdef USE_AS_STRNLEN
+-	/* Adjust length.  */
++	/* Check if at last VEC_SIZE * 4 length.  */
++	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
++	jbe	L(last_4x_vec_or_less_load)
++	movl	%edi, %ecx
++	andl	$(VEC_SIZE * 4 - 1), %ecx
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %ecx
++#  endif
++	/* Readjust length.  */
+ 	addq	%rcx, %rsi
+ # endif
++	/* Align data to VEC_SIZE * 4.  */
++	andq	$-(VEC_SIZE * 4), %rdi
+ 
++	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	VMOVA	(%rdi), %YMM1
+-	VMOVA	VEC_SIZE(%rdi), %YMM2
+-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM3
+-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM4
+-
+-	VPMINU	%YMM1, %YMM2, %YMM5
+-	VPMINU	%YMM3, %YMM4, %YMM6
++	/* Load first VEC regardless.  */
++	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
++# ifdef USE_AS_STRNLEN
++	/* Break if at end of length.  */
++	subq	$(CHAR_PER_VEC * 4), %rsi
++	jb	L(last_4x_vec_or_less_cmpeq)
++# endif
++	/* Save some code size by microfusing VPMINU with the load. Since
++	   the matches in ymm2/ymm4 can only be returned if there where no
++	   matches in ymm1/ymm3 respectively there is no issue with overlap.
++	 */
++	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
++	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
++	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
++
++	VPCMP	$0, %YMM2, %YMMZERO, %k0
++	VPCMP	$0, %YMM4, %YMMZERO, %k1
++	subq	$-(VEC_SIZE * 4), %rdi
++	kortestd	%k0, %k1
++	jz	L(loop_4x_vec)
++
++	/* Check if end was in first half.  */
++	kmovd	%k0, %eax
++	subq	%rdx, %rdi
++# ifdef USE_AS_WCSLEN
++	shrq	$2, %rdi
++# endif
++	testl	%eax, %eax
++	jz	L(second_vec_return)
+ 
+-	VPMINU	%YMM5, %YMM6, %YMM5
+-	VPCMP	$0, %YMM5, %YMMZERO, %k0
+-	ktestd	%k0, %k0
+-	jnz	L(4x_vec_end)
++	VPCMP	$0, %YMM1, %YMMZERO, %k2
++	kmovd	%k2, %edx
++	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
++# ifdef USE_AS_WCSLEN
++	sall	$CHAR_PER_VEC, %eax
++	orl	%edx, %eax
++	tzcntl	%eax, %eax
++# else
++	salq	$CHAR_PER_VEC, %rax
++	orq	%rdx, %rax
++	tzcntq	%rax, %rax
++# endif
++	addq	%rdi, %rax
++	ret
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+ 
+-# ifndef USE_AS_STRNLEN
+-	jmp	L(loop_4x_vec)
+-# else
+-	subq	$(VEC_SIZE * 4), %rsi
+-	ja	L(loop_4x_vec)
++# ifdef USE_AS_STRNLEN
+ 
++L(last_4x_vec_or_less_load):
++	/* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
++	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
++L(last_4x_vec_or_less_cmpeq):
++	VPCMP	$0, %YMM1, %YMMZERO, %k0
++	addq	$(VEC_SIZE * 3), %rdi
+ L(last_4x_vec_or_less):
+-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+-	addl	$(VEC_SIZE * 2), %esi
+-	jle	L(last_2x_vec)
+-
+-	VPCMP	$0, (%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
++	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
++	   VEC_SIZE * 4.  */
++	testl	$(CHAR_PER_VEC * 2), %esi
++	jnz	L(last_4x_vec)
++
++	/* length may have been negative or positive by an offset of
++	   CHAR_PER_VEC * 4 depending on where this was called from. This
++	   fixes that.  */
++	andl	$(CHAR_PER_VEC * 4 - 1), %esi
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
++	jnz	L(last_vec_x1_check)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
++	/* Check the end of data.  */
++	subl	$CHAR_PER_VEC, %esi
++	jb	L(max)
+ 
+ 	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2_check)
+-	subl	$VEC_SIZE, %esi
+-	jle	L(max)
++	tzcntl	%eax, %eax
++	/* Check the end of data.  */
++	cmpl	%eax, %esi
++	jb	L(max)
+ 
+-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x3_check)
++	subq	%rdx, %rdi
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarq	$2, %rdi
++#  endif
++	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
++	ret
++L(max):
+ 	movq	%r8, %rax
++	ret
++# endif
++
++	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
++	   in the 4x VEC loop can use 2 byte encoding.  */
++	.p2align 4
++L(second_vec_return):
++	VPCMP	$0, %YMM3, %YMMZERO, %k0
++	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
++# ifdef USE_AS_WCSLEN
++	kunpckbw	%k0, %k1, %k0
++	kmovd	%k0, %eax
++	tzcntl	%eax, %eax
++# else
++	kunpckdq	%k0, %k1, %k0
++	kmovq	%k0, %rax
++	tzcntq	%rax, %rax
++# endif
++	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
++	ret
++
++
++# ifdef USE_AS_STRNLEN
++L(last_vec_x1_check):
++	tzcntl	%eax, %eax
++	/* Check the end of data.  */
++	cmpl	%eax, %esi
++	jb	L(max)
++	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarq	$2, %rdi
+ #  endif
++	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	addl	$(VEC_SIZE * 2), %esi
++L(last_4x_vec):
++	/* Test first 2x VEC normally.  */
++	testl	%eax, %eax
++	jnz	L(last_vec_x1)
+ 
+-	VPCMP	$0, (%rdi), %YMMZERO, %k0
++	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0_check)
+-	subl	$VEC_SIZE, %esi
+-	jle	L(max)
++	jnz	L(last_vec_x2)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
++	/* Normalize length.  */
++	andl	$(CHAR_PER_VEC * 4 - 1), %esi
++	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1_check)
+-	movq	%r8, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-#  endif
+-	ret
++	jnz	L(last_vec_x3)
+ 
+-	.p2align 4
+-L(first_vec_x0_check):
++	/* Check the end of data.  */
++	subl	$(CHAR_PER_VEC * 3), %esi
++	jb	L(max)
++
++	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
+ 	tzcntl	%eax, %eax
+-#  ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-#  endif
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
++	cmpl	%eax, %esi
++	jb	L(max_end)
++
++	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarq	$2, %rdi
+ #  endif
++	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x1_check):
++L(last_vec_x1):
+ 	tzcntl	%eax, %eax
++	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-#  endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarq	$2, %rdi
+ #  endif
++	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x2_check):
++L(last_vec_x2):
+ 	tzcntl	%eax, %eax
++	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-#  endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarq	$2, %rdi
+ #  endif
++	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x3_check):
++L(last_vec_x3):
+ 	tzcntl	%eax, %eax
+-#  ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-#  endif
++	subl	$(CHAR_PER_VEC * 2), %esi
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
++	cmpl	%eax, %esi
++	jb	L(max_end)
++	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarq	$2, %rdi
+ #  endif
++	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
+ 	ret
+-
+-	.p2align 4
+-L(max):
++L(max_end):
+ 	movq	%r8, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-#  endif
+-	ret
+-
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+ 	ret
+ # endif
+ 
++	/* Cold case for crossing page with first load.	 */
+ 	.p2align 4
+-L(first_vec_x0):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
++L(cross_page_boundary):
++	movq	%rdi, %rdx
++	/* Align data to VEC_SIZE.  */
++	andq	$-VEC_SIZE, %rdi
++	VPCMP	$0, (%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	/* Remove the leading bytes.  */
+ # ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
++	/* NB: Divide shift count by 4 since each bit in K0 represent 4
++	   bytes.  */
++	movl	%edx, %ecx
++	shrl	$2, %ecx
++	andl	$(CHAR_PER_VEC - 1), %ecx
+ # endif
+-	ret
+-
+-	.p2align 4
+-L(first_vec_x1):
++	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
++	sarxl	%SHIFT_REG, %eax, %eax
++	testl	%eax, %eax
++# ifndef USE_AS_STRNLEN
++	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-# endif
+ 	ret
+-
+-	.p2align 4
+-L(first_vec_x2):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-# endif
++# else
++	jnz	L(cross_page_less_vec)
++#  ifndef USE_AS_WCSLEN
++	movl	%edx, %ecx
++	andl	$(CHAR_PER_VEC - 1), %ecx
++#  endif
++	movl	$CHAR_PER_VEC, %eax
++	subl	%ecx, %eax
++	/* Check the end of data.  */
++	cmpq	%rax, %rsi
++	ja	L(cross_page_continue)
++	movl	%esi, %eax
+ 	ret
+-
+-	.p2align 4
+-L(4x_vec_end):
+-	VPCMP	$0, %YMM1, %YMMZERO, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-	VPCMP	$0, %YMM2, %YMMZERO, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-	VPCMP	$0, %YMM3, %YMMZERO, %k2
+-	kmovd	%k2, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-	VPCMP	$0, %YMM4, %YMMZERO, %k3
+-	kmovd	%k3, %eax
+-L(first_vec_x3):
++L(cross_page_less_vec):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-# endif
++	/* Select min of length and position of first null.  */
++	cmpq	%rax, %rsi
++	cmovb	%esi, %eax
+ 	ret
++# endif
+ 
+ END (STRLEN)
+ #endif
+-- 
+GitLab
+
diff --git a/SOURCES/ia-prefer_no_vzeroupper.patch b/SOURCES/ia-prefer_no_vzeroupper.patch
new file mode 100644
index 0000000..bcdcc5c
--- /dev/null
+++ b/SOURCES/ia-prefer_no_vzeroupper.patch
@@ -0,0 +1,75 @@
+From 18a0eba6e84524e3a1cbb544ed1f53dce9531b46 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 2 Mar 2022 12:52:23 -0800
+Subject: [PATCH] x86: Set Prefer_No_VZEROUPPER and add Prefer_AVX2_STRCMP
+
+1. Set Prefer_No_VZEROUPPER if RTM is usable to avoid RTM abort triggered
+by VZEROUPPER inside a transactionally executing RTM region.
+2. Since to compare 2 32-byte strings, 256-bit EVEX strcmp requires 2
+loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp requires 1 load, 2 VPCMPEQs,
+1 VPMINU and 1 VPMOVMSKB, AVX2 strcmp is faster than EVEX strcmp.  Add
+Prefer_AVX2_STRCMP to prefer AVX2 strcmp family functions.
+
+(cherry picked from commit 1da50d4bda07f04135dca39f40e79fc9eabed1f8)
+---
+ sysdeps/x86/cpu-features.c                    | 20 +++++++++++++++++--
+ sysdeps/x86/cpu-tunables.c                    |  2 ++
+ ...cpu-features-preferred_feature_index_1.def |  1 +
+ 3 files changed, 21 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index 91042505..3610ee5c 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -524,8 +524,24 @@ init_cpu_features (struct cpu_features *cpu_features)
+ 	cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
+ 	  |= bit_arch_Prefer_No_VZEROUPPER;
+       else
+-	cpu_features->preferred[index_arch_Prefer_No_AVX512]
+-	  |= bit_arch_Prefer_No_AVX512;
++	{
++	  cpu_features->preferred[index_arch_Prefer_No_AVX512]
++	    |= bit_arch_Prefer_No_AVX512;
++
++	  /* Avoid RTM abort triggered by VZEROUPPER inside a
++	     transactionally executing RTM region.  */
++	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	    cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
++	      |= bit_arch_Prefer_No_VZEROUPPER;
++
++	  /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
++	     requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
++	     requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
++	     AVX2 strcmp is faster than EVEX strcmp.  */
++	  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
++	    cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
++	      |= bit_arch_Prefer_AVX2_STRCMP;
++	}
+     }
+   /* This spells out "AuthenticAMD".  */
+   else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
+diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
+index 3173b2b9..73adbaba 100644
+--- a/sysdeps/x86/cpu-tunables.c
++++ b/sysdeps/x86/cpu-tunables.c
+@@ -239,6 +239,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
+ 	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
+ 						Fast_Copy_Backward,
+ 						disable, 18);
++	      CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
++		(n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18);
+ 	    }
+ 	  break;
+ 	case 19:
+diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+index 17a5cc42..4ca70b40 100644
+--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
++++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+@@ -32,3 +32,4 @@ BIT (Prefer_ERMS)
+ BIT (Prefer_FSRM)
+ BIT (Prefer_No_AVX512)
+ BIT (MathVec_Prefer_No_AVX512)
++BIT (Prefer_AVX2_STRCMP)
+-- 
+GitLab
+
diff --git a/SOURCES/ia-redirect-roundeven-funct.patch b/SOURCES/ia-redirect-roundeven-funct.patch
new file mode 100644
index 0000000..819f0bd
--- /dev/null
+++ b/SOURCES/ia-redirect-roundeven-funct.patch
@@ -0,0 +1,114 @@
+From 9682778522977444546c061170912da902635a4e Mon Sep 17 00:00:00 2001
+From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+Date: Mon, 24 May 2021 09:43:10 +0800
+Subject: [PATCH] math: redirect roundeven function
+
+This patch redirects the roundeven functions in preparation for further changes.
+
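+For reference, roundeven rounds to the nearest integer with halfway
+cases going to the even neighbour. A minimal usage sketch (needs
+_GNU_SOURCE on glibc and linking with -lm):
+
+	#define _GNU_SOURCE
+	#include <math.h>
+	#include <stdio.h>
+
+	int
+	main (void)
+	{
+	  /* Halfway cases go to the even integer, unlike round ().  */
+	  printf ("%g %g %g\n", roundeven (0.5), roundeven (1.5), roundeven (2.5));
+	  /* Prints: 0 2 2 */
+	  return 0;
+	}
+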
+Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit 447954a206837b5f153869cfeeeab44631c3fac9)
+---
+ include/math.h                             | 3 ++-
+ sysdeps/ieee754/dbl-64/s_roundeven.c       | 4 +++-
+ sysdeps/ieee754/float128/s_roundevenf128.c | 1 +
+ sysdeps/ieee754/flt-32/s_roundevenf.c      | 3 +++
+ sysdeps/ieee754/ldbl-128/s_roundevenl.c    | 1 +
+ sysdeps/ieee754/ldbl-96/s_roundevenl.c     | 1 +
+ 6 files changed, 11 insertions(+), 2 deletions(-)
+
+diff --git a/include/math.h b/include/math.h
+index e21d34b8..1f9f9a54 100644
+--- a/include/math.h
++++ b/include/math.h
+@@ -38,7 +38,6 @@ libm_hidden_proto (__issignaling)
+ libm_hidden_proto (__issignalingf)
+ libm_hidden_proto (__exp)
+ libm_hidden_proto (__expf)
+-libm_hidden_proto (__roundeven)
+ 
+ # ifndef __NO_LONG_DOUBLE_MATH
+ libm_hidden_proto (__fpclassifyl)
+@@ -56,6 +55,8 @@ libm_hidden_proto (__expm1f128)
+ 
+ # if !(defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ > 0)
+ #  ifndef NO_MATH_REDIRECT
++float (roundevenf) (float) asm ("__roundevenf");
++double (roundeven) (double) asm ("__roundeven");
+ /* Declare sqrt for use within GLIBC.  Compilers typically inline sqrt as a
+    single instruction.  Use an asm to avoid use of PLTs if it doesn't.  */
+ float (sqrtf) (float) asm ("__ieee754_sqrtf");
+diff --git a/sysdeps/ieee754/dbl-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/s_roundeven.c
+index 1438e81d..61962184 100644
+--- a/sysdeps/ieee754/dbl-64/s_roundeven.c
++++ b/sysdeps/ieee754/dbl-64/s_roundeven.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
++#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-double.h>
+@@ -101,5 +102,6 @@ __roundeven (double x)
+   INSERT_WORDS (x, hx, lx);
+   return x;
+ }
+-hidden_def (__roundeven)
++#ifndef __roundeven
+ libm_alias_double (__roundeven, roundeven)
++#endif
+diff --git a/sysdeps/ieee754/float128/s_roundevenf128.c b/sysdeps/ieee754/float128/s_roundevenf128.c
+index 5a9b3f39..e0faf727 100644
+--- a/sysdeps/ieee754/float128/s_roundevenf128.c
++++ b/sysdeps/ieee754/float128/s_roundevenf128.c
+@@ -1,2 +1,3 @@
++#define NO_MATH_REDIRECT
+ #include <float128_private.h>
+ #include "../ldbl-128/s_roundevenl.c"
+diff --git a/sysdeps/ieee754/flt-32/s_roundevenf.c b/sysdeps/ieee754/flt-32/s_roundevenf.c
+index 90f991d5..a661875e 100644
+--- a/sysdeps/ieee754/flt-32/s_roundevenf.c
++++ b/sysdeps/ieee754/flt-32/s_roundevenf.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
++#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-float.h>
+@@ -67,4 +68,6 @@ __roundevenf (float x)
+   SET_FLOAT_WORD (x, ix);
+   return x;
+ }
++#ifndef __roundevenf
+ libm_alias_float (__roundeven, roundeven)
++#endif
+diff --git a/sysdeps/ieee754/ldbl-128/s_roundevenl.c b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
+index 5fc59af4..b9375b6c 100644
+--- a/sysdeps/ieee754/ldbl-128/s_roundevenl.c
++++ b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
++#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-ldouble.h>
+diff --git a/sysdeps/ieee754/ldbl-96/s_roundevenl.c b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
+index be2e4fa4..65031ab7 100644
+--- a/sysdeps/ieee754/ldbl-96/s_roundevenl.c
++++ b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
++#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-ldouble.h>
+-- 
+GitLab
+
diff --git a/SOURCES/ia-refacto-imp-prf-strchr-avx2.patch b/SOURCES/ia-refacto-imp-prf-strchr-avx2.patch
new file mode 100644
index 0000000..69316a9
--- /dev/null
+++ b/SOURCES/ia-refacto-imp-prf-strchr-avx2.patch
@@ -0,0 +1,356 @@
+From 43847e49c2dab633146c9b6c682ed5768ccda7cd Mon Sep 17 00:00:00 2001
+From: noah <goldstein.w.n@gmail.com>
+Date: Wed, 3 Feb 2021 00:38:59 -0500
+Subject: [PATCH] x86-64: Refactor and improve performance of strchr-avx2.S
+
+No bug. It just seemed the performance could be improved a bit. Observed
+and expected behavior are unchanged. Optimized body of main
+loop. Updated page cross logic and optimized accordingly. Made a few
+minor instruction selection modifications. No regressions in test
+suite. Both test-strchrnul and test-strchr passed.
+
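+For context, the one source file builds both variants (see the
+USE_AS_STRCHRNUL blocks below); the two interfaces differ only in what
+is returned when CHAR is not found. A minimal sketch (strchrnul is a
+GNU extension, hence _GNU_SOURCE):
+
+	#define _GNU_SOURCE
+	#include <string.h>
+	#include <stdio.h>
+
+	int
+	main (void)
+	{
+	  const char *s = "hello";
+	  printf ("%p\n", (void *) strchr (s, 'z'));  /* NULL: not found */
+	  printf ("\"%s\"\n", strchrnul (s, 'z'));    /* "": points at the '\0' */
+	  printf ("%s\n", strchr (s, 'l'));           /* "llo" */
+	  return 0;
+	}
+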
+(cherry picked from commit 1f745ecc2109890886b161d4791e1406fdfc29b8)
+---
+ sysdeps/x86_64/multiarch/strchr-avx2.S | 225 ++++++++++++-------------
+ sysdeps/x86_64/multiarch/strchr.c      |   4 +-
+ 2 files changed, 114 insertions(+), 115 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
+index da7d2620..919d256c 100644
+--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
+@@ -27,10 +27,12 @@
+ # ifdef USE_AS_WCSCHR
+ #  define VPBROADCAST	vpbroadcastd
+ #  define VPCMPEQ	vpcmpeqd
++#  define VPMINU	vpminud
+ #  define CHAR_REG	esi
+ # else
+ #  define VPBROADCAST	vpbroadcastb
+ #  define VPCMPEQ	vpcmpeqb
++#  define VPMINU	vpminub
+ #  define CHAR_REG	sil
+ # endif
+ 
+@@ -43,71 +45,54 @@
+ # endif
+ 
+ # define VEC_SIZE 32
++# define PAGE_SIZE 4096
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRCHR)
+ 	movl	%edi, %ecx
+-	/* Broadcast CHAR to YMM0.  */
++# ifndef USE_AS_STRCHRNUL
++	xorl	%edx, %edx
++# endif
++
++	/* Broadcast CHAR to YMM0.	*/
+ 	vmovd	%esi, %xmm0
+ 	vpxor	%xmm9, %xmm9, %xmm9
+ 	VPBROADCAST %xmm0, %ymm0
+-	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
+ 
+-	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
+-	   null byte.  */
+-	vmovdqu	(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
++	/* Check if we cross page boundary with one vector load.  */
++	andl	$(PAGE_SIZE - 1), %ecx
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
++	ja  L(cross_page_boundary)
+ 
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-
+-	jmp	L(more_4x_vec)
+-
+-	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
++	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
++	   null byte.  */
+ 	vmovdqu	(%rdi), %ymm8
+ 	VPCMPEQ %ymm8, %ymm0, %ymm1
+ 	VPCMPEQ %ymm8, %ymm9, %ymm2
+ 	vpor	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %eax
+-	/* Remove the leading bytes.  */
+-	sarl	%cl, %eax
+ 	testl	%eax, %eax
+-	jz	L(aligned_more)
+-	/* Found CHAR or the null byte.  */
++	jz	L(more_vecs)
+ 	tzcntl	%eax, %eax
+-	addq	%rcx, %rax
+-# ifdef USE_AS_STRCHRNUL
++	/* Found CHAR or the null byte.	 */
+ 	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
+-	leaq	(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ L(return_vzeroupper):
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
++L(more_vecs):
++	/* Align data for aligned loads in the loop.  */
++	andq	$-VEC_SIZE, %rdi
+ L(aligned_more):
+-	addq	$VEC_SIZE, %rdi
+ 
+-L(more_4x_vec):
+-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+-	   since data is only aligned to VEC_SIZE.  */
+-	vmovdqa	(%rdi), %ymm8
++	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
++	   since data is only aligned to VEC_SIZE.	*/
++	vmovdqa	VEC_SIZE(%rdi), %ymm8
++	addq	$VEC_SIZE, %rdi
+ 	VPCMPEQ %ymm8, %ymm0, %ymm1
+ 	VPCMPEQ %ymm8, %ymm9, %ymm2
+ 	vpor	%ymm1, %ymm2, %ymm1
+@@ -137,61 +122,24 @@ L(more_4x_vec):
+ 	vpor	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x3)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
+-	andq	$-(4 * VEC_SIZE), %rdi
+-
+-	.p2align 4
+-L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	vmovdqa	(%rdi), %ymm5
+-	vmovdqa	VEC_SIZE(%rdi), %ymm6
+-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
+-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+-
+-	VPCMPEQ %ymm5, %ymm0, %ymm1
+-	VPCMPEQ %ymm6, %ymm0, %ymm2
+-	VPCMPEQ %ymm7, %ymm0, %ymm3
+-	VPCMPEQ %ymm8, %ymm0, %ymm4
+-
+-	VPCMPEQ %ymm5, %ymm9, %ymm5
+-	VPCMPEQ %ymm6, %ymm9, %ymm6
+-	VPCMPEQ %ymm7, %ymm9, %ymm7
+-	VPCMPEQ %ymm8, %ymm9, %ymm8
+-
+-	vpor	%ymm1, %ymm5, %ymm1
+-	vpor	%ymm2, %ymm6, %ymm2
+-	vpor	%ymm3, %ymm7, %ymm3
+-	vpor	%ymm4, %ymm8, %ymm4
+-
+-	vpor	%ymm1, %ymm2, %ymm5
+-	vpor	%ymm3, %ymm4, %ymm6
+-
+-	vpor	%ymm5, %ymm6, %ymm5
+-
+-	vpmovmskb %ymm5, %eax
+-	testl	%eax, %eax
+-	jnz	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
++	jz	L(prep_loop_4x)
+ 
+-	jmp	L(loop_4x_vec)
++	tzcntl	%eax, %eax
++	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
++	cmovne	%rdx, %rax
++# endif
++	VZEROUPPER
++	ret
+ 
+ 	.p2align 4
+ L(first_vec_x0):
+-	/* Found CHAR or the null byte.  */
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_STRCHRNUL
++	/* Found CHAR or the null byte.	 */
+ 	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
+-	leaq	(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -199,13 +147,9 @@ L(first_vec_x0):
+ 	.p2align 4
+ L(first_vec_x1):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_STRCHRNUL
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
+ 	leaq	VEC_SIZE(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -213,42 +157,97 @@ L(first_vec_x1):
+ 	.p2align 4
+ L(first_vec_x2):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_STRCHRNUL
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
++	/* Found CHAR or the null byte.	 */
+ 	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+ 
++L(prep_loop_4x):
++	/* Align data to 4 * VEC_SIZE.	*/
++	andq	$-(VEC_SIZE * 4), %rdi
++
+ 	.p2align 4
+-L(4x_vec_end):
++L(loop_4x_vec):
++	/* Compare 4 * VEC at a time forward.  */
++	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
++	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
++	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
++	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
++
++	/* Leaves only CHARS matching esi as 0.	 */
++	vpxor	%ymm5, %ymm0, %ymm1
++	vpxor	%ymm6, %ymm0, %ymm2
++	vpxor	%ymm7, %ymm0, %ymm3
++	vpxor	%ymm8, %ymm0, %ymm4
++
++	VPMINU	%ymm1, %ymm5, %ymm1
++	VPMINU	%ymm2, %ymm6, %ymm2
++	VPMINU	%ymm3, %ymm7, %ymm3
++	VPMINU	%ymm4, %ymm8, %ymm4
++
++	VPMINU	%ymm1, %ymm2, %ymm5
++	VPMINU	%ymm3, %ymm4, %ymm6
++
++	VPMINU	%ymm5, %ymm6, %ymm5
++
++	VPCMPEQ %ymm5, %ymm9, %ymm5
++	vpmovmskb %ymm5, %eax
++
++	addq	$(VEC_SIZE * 4), %rdi
++	testl	%eax, %eax
++	jz  L(loop_4x_vec)
++
++	VPCMPEQ %ymm1, %ymm9, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x0)
++
++	VPCMPEQ %ymm2, %ymm9, %ymm2
+ 	vpmovmskb %ymm2, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+-	vpmovmskb %ymm3, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
++
++	VPCMPEQ %ymm3, %ymm9, %ymm3
++	VPCMPEQ %ymm4, %ymm9, %ymm4
++	vpmovmskb %ymm3, %ecx
+ 	vpmovmskb %ymm4, %eax
++	salq	$32, %rax
++	orq %rcx, %rax
++	tzcntq  %rax, %rax
++	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
++	cmovne	%rdx, %rax
++# endif
++	VZEROUPPER
++	ret
++
++	/* Cold case for crossing page with first load.	 */
++	.p2align 4
++L(cross_page_boundary):
++	andq	$-VEC_SIZE, %rdi
++	andl	$(VEC_SIZE - 1), %ecx
++
++	vmovdqa	(%rdi), %ymm8
++	VPCMPEQ %ymm8, %ymm0, %ymm1
++	VPCMPEQ %ymm8, %ymm9, %ymm2
++	vpor	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %eax
++	/* Remove the leading bits.	 */
++	sarxl	%ecx, %eax, %eax
+ 	testl	%eax, %eax
+-L(first_vec_x3):
++	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_STRCHRNUL
+-	addq	$(VEC_SIZE * 3), %rax
++	addq	%rcx, %rdi
+ 	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+ 
+ END (STRCHR)
+-#endif
++# endif
+diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
+index 7e582f02..5225bd4f 100644
+--- a/sysdeps/x86_64/multiarch/strchr.c
++++ b/sysdeps/x86_64/multiarch/strchr.c
+@@ -38,11 +38,11 @@ IFUNC_SELECTOR (void)
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ 	return OPTIMIZE (evex);
+ 
+       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+-- 
+GitLab
+
diff --git a/SOURCES/ia-rmv-ofl-chk-wcsnlen-sse4_1.patch b/SOURCES/ia-rmv-ofl-chk-wcsnlen-sse4_1.patch
new file mode 100644
index 0000000..f42dbe1
--- /dev/null
+++ b/SOURCES/ia-rmv-ofl-chk-wcsnlen-sse4_1.patch
@@ -0,0 +1,55 @@
+From 820e5fe6c8c30693cde469829b482665c8129811 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Jun 2021 19:19:34 -0400
+Subject: [PATCH] x86: Remove unnecessary overflow check from wcsnlen-sse4_1.S
+
+No bug. The way wcsnlen checks whether it is near the end of the
+maxlen bound is the following macro:
+
+	mov	%r11, %rsi;	\
+	subq	%rax, %rsi;	\
+	andq	$-64, %rax;	\
+	testq	$-64, %rsi;	\
+	je	L(strnlen_ret)
+
+This works independently of s + maxlen overflowing. So the
+second overflow check is unnecessary for correctness and
+just extra overhead in the common no-overflow case.
+
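+The macro is overflow-safe because of plain unsigned modular
+arithmetic: (s + maxlen) - position yields the true remaining distance
+even when s + maxlen wraps around 2^64. A small sketch with
+hypothetical values (not the glibc code itself):
+
+	#include <stdint.h>
+	#include <stdio.h>
+
+	int
+	main (void)
+	{
+	  uint64_t s = UINT64_MAX - 100;  /* pretend string start */
+	  uint64_t maxlen = 200;          /* bound; s + maxlen wraps */
+	  uint64_t pos = s + 40;          /* current scan position */
+
+	  uint64_t end = s + maxlen;      /* wraps around: end < s */
+	  uint64_t remaining = end - pos; /* still the true distance: 160 */
+
+	  printf ("remaining = %llu\n", (unsigned long long) remaining);
+	  /* The macro's test: last chunk iff fewer than 64 bytes remain.  */
+	  printf ("last chunk? %d\n", (remaining & ~(uint64_t) 63) == 0);
+	  return 0;
+	}
+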
+test-strlen.c, test-wcslen.c, test-strnlen.c and test-wcsnlen.c are
+all passing
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit 08cbcd4dbc686bb38ec3093aff2f919fbff5ec17)
+---
+ sysdeps/x86_64/multiarch/strlen-vec.S | 7 -------
+ 1 file changed, 7 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
+index 439e486a..b7657282 100644
+--- a/sysdeps/x86_64/multiarch/strlen-vec.S
++++ b/sysdeps/x86_64/multiarch/strlen-vec.S
+@@ -71,19 +71,12 @@ L(n_nonzero):
+    suffice.  */
+ 	mov	%RSI_LP, %R10_LP
+ 	sar	$62, %R10_LP
+-	test	%R10_LP, %R10_LP
+ 	jnz	__wcslen_sse4_1
+ 	sal	$2, %RSI_LP
+ # endif
+ 
+-
+ /* Initialize long lived registers.  */
+-
+ 	add	%RDI_LP, %RSI_LP
+-# ifdef AS_WCSLEN
+-/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
+-	jbe	__wcslen_sse4_1
+-# endif
+ 	mov	%RSI_LP, %R10_LP
+ 	and	$-64, %R10_LP
+ 	mov	%RSI_LP, %R11_LP
+-- 
+GitLab
+
diff --git a/SOURCES/ia-rmv-prefer_avx2_strcmp.patch b/SOURCES/ia-rmv-prefer_avx2_strcmp.patch
new file mode 100644
index 0000000..f31187d
--- /dev/null
+++ b/SOURCES/ia-rmv-prefer_avx2_strcmp.patch
@@ -0,0 +1,94 @@
+From 69ce2769234e36e62c2db82e1e638124f076e82e Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 2 Mar 2022 15:42:47 -0800
+Subject: [PATCH] x86-64: Remove Prefer_AVX2_STRCMP
+
+Remove Prefer_AVX2_STRCMP to enable EVEX strcmp.  When comparing 2 32-byte
+strings, EVEX strcmp has been improved to require 1 load, 1 VPTESTM, 1
+VPCMP, 1 KMOVD and 1 INCL instead of 2 loads, 3 VPCMPs, 2 KORDs, 1 KMOVD
+and 1 TESTL while AVX2 strcmp requires 1 load, 2 VPCMPEQs, 1 VPMINU, 1
+VPMOVMSKB and 1 TESTL.  EVEX strcmp is now faster than AVX2 strcmp by up
+to 40% on Tiger Lake and Ice Lake.
+
+(cherry picked from commit 14dbbf46a007ae5df36646b51ad0c9e5f5259f30)
+---
+ sysdeps/x86/cpu-features.c                                | 8 --------
+ sysdeps/x86/cpu-tunables.c                                | 2 --
+ .../include/cpu-features-preferred_feature_index_1.def    | 1 -
+ sysdeps/x86_64/multiarch/strcmp.c                         | 3 +--
+ sysdeps/x86_64/multiarch/strncmp.c                        | 3 +--
+ 5 files changed, 2 insertions(+), 15 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index 8885b48e..956bfb4f 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -533,14 +533,6 @@ init_cpu_features (struct cpu_features *cpu_features)
+ 	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ 	    cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
+ 	      |= bit_arch_Prefer_No_VZEROUPPER;
+-
+-	  /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
+-	     requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
+-	     requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
+-	     AVX2 strcmp is faster than EVEX strcmp.  */
+-	  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+-	    cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
+-	      |= bit_arch_Prefer_AVX2_STRCMP;
+ 	}
+     }
+   /* This spells out "AuthenticAMD".  */
+diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
+index 73adbaba..3173b2b9 100644
+--- a/sysdeps/x86/cpu-tunables.c
++++ b/sysdeps/x86/cpu-tunables.c
+@@ -239,8 +239,6 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
+ 	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
+ 						Fast_Copy_Backward,
+ 						disable, 18);
+-	      CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
+-		(n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18);
+ 	    }
+ 	  break;
+ 	case 19:
+diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+index f2340624..e7277b33 100644
+--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
++++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+@@ -32,5 +32,4 @@ BIT (Prefer_ERMS)
+ BIT (Prefer_FSRM)
+ BIT (Prefer_No_AVX512)
+ BIT (MathVec_Prefer_No_AVX512)
+-BIT (Prefer_AVX2_STRCMP)
+ BIT (Avoid_Short_Distance_REP_MOVSB)
+diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
+index 11bbea2b..f8a7220e 100644
+--- a/sysdeps/x86_64/multiarch/strcmp.c
++++ b/sysdeps/x86_64/multiarch/strcmp.c
+@@ -43,8 +43,7 @@ IFUNC_SELECTOR (void)
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ 	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
++	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ 	return OPTIMIZE (evex);
+ 
+       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
+index 44c85116..7903f807 100644
+--- a/sysdeps/x86_64/multiarch/strncmp.c
++++ b/sysdeps/x86_64/multiarch/strncmp.c
+@@ -43,8 +43,7 @@ IFUNC_SELECTOR (void)
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ 	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
++	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ 	return OPTIMIZE (evex);
+ 
+       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+-- 
+GitLab
+
diff --git a/SOURCES/ia-roundeven_sse4_1.patch b/SOURCES/ia-roundeven_sse4_1.patch
new file mode 100644
index 0000000..01f0153
--- /dev/null
+++ b/SOURCES/ia-roundeven_sse4_1.patch
@@ -0,0 +1,242 @@
+From 125c5c04cccb16dabf9d0e693df154608289d1dd Mon Sep 17 00:00:00 2001
+From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+Date: Mon, 24 May 2021 09:43:11 +0800
+Subject: [PATCH] x86_64: roundeven with sse4.1 support
+
+This patch adds support for the sse4.1 hardware floating point
+roundeven.
+
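+The new variants boil down to a single SSE4.1 round instruction with
+immediate 8 (round to nearest, ties to even, suppress the precision
+exception). Roughly the same operation expressed with intrinsics, as a
+sketch (function name is illustrative; compile with -msse4.1):
+
+	#include <smmintrin.h>
+
+	double
+	roundeven_sse41 (double x)
+	{
+	  __m128d v = _mm_set_sd (x);
+	  /* Immediate 8 == _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
+	     the same encoding as the "roundsd $8" below.  */
+	  v = _mm_round_sd (v, v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+	  return _mm_cvtsd_f64 (v);
+	}
+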
+Here are some benchmark results on my systems:
+
+=AMD Ryzen 9 3900X 12-Core Processor=
+
+* benchmark result before this commit
+|            |    roundeven |   roundevenf |
+|------------|--------------|--------------|
+| duration   |  3.75587e+09 |  3.75114e+09 |
+| iterations |  3.93053e+08 |  4.35402e+08 |
+| max        | 52.592       | 58.71        |
+| min        |  7.98        |  7.22        |
+| mean       |  9.55563     |  8.61535     |
+
+* benchmark result after this commit
+|            |     roundeven |   roundevenf |
+|------------|---------------|--------------|
+| duration   |   3.73815e+09 |  3.73738e+09 |
+| iterations |   5.82692e+08 |  5.91498e+08 |
+| max        |  56.468       | 51.642       |
+| min        |   6.27        |  6.156       |
+| mean       |   6.41532     |  6.3185      |
+
+=Intel(R) Pentium(R) CPU D1508 @ 2.20GHz=
+
+* benchmark result before this commit
+|            |    roundeven |   roundevenf |
+|------------|--------------|--------------|
+| duration   |  2.18208e+09 |  2.18258e+09 |
+| iterations |  2.39932e+08 |  2.46924e+08 |
+| max        | 96.378       | 98.035       |
+| min        |  6.776       |  5.94        |
+| mean       |  9.09456     |  8.83907     |
+
+* benchmark result after this commit
+|            |    roundeven |   roundevenf |
+|------------|--------------|--------------|
+| duration   |  2.17415e+09 |  2.17005e+09 |
+| iterations |  3.56193e+08 |  4.09824e+08 |
+| max        | 51.693       | 97.192       |
+| min        |  5.926       |  5.093       |
+| mean       |  6.10385     |  5.29507     |
+
+Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit 1683249d17e14827b6579529742eb895027dfa84)
+---
+ sysdeps/x86_64/fpu/multiarch/Makefile         |  5 +--
+ sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c  |  2 ++
+ .../x86_64/fpu/multiarch/s_roundeven-sse4_1.S | 24 ++++++++++++++
+ sysdeps/x86_64/fpu/multiarch/s_roundeven.c    | 31 +++++++++++++++++++
+ sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c |  3 ++
+ .../fpu/multiarch/s_roundevenf-sse4_1.S       | 24 ++++++++++++++
+ sysdeps/x86_64/fpu/multiarch/s_roundevenf.c   | 31 +++++++++++++++++++
+ 7 files changed, 118 insertions(+), 2 deletions(-)
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven.c
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
+
+diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
+index 9f387248..6ddd1c01 100644
+--- a/sysdeps/x86_64/fpu/multiarch/Makefile
++++ b/sysdeps/x86_64/fpu/multiarch/Makefile
+@@ -1,11 +1,12 @@
+ ifeq ($(subdir),math)
+ libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \
+ 			s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \
+-			s_trunc-c s_truncf-c
++			s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c
+ 
+ libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
+ 			s_floorf-sse4_1 s_nearbyint-sse4_1 \
+-			s_nearbyintf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
++			s_nearbyintf-sse4_1 s_roundeven-sse4_1 \
++			s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
+ 			s_trunc-sse4_1 s_truncf-sse4_1
+ 
+ libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
+new file mode 100644
+index 00000000..c7be43cb
+--- /dev/null
++++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
+@@ -0,0 +1,2 @@
++#define __roundeven __roundeven_c
++#include <sysdeps/ieee754/dbl-64/s_roundeven.c>
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
+new file mode 100644
+index 00000000..6ae8f6b1
+--- /dev/null
++++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
+@@ -0,0 +1,24 @@
++/* Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <sysdep.h>
++
++	.section .text.sse4.1,"ax",@progbits
++ENTRY(__roundeven_sse41)
++	roundsd	$8, %xmm0, %xmm0
++	ret
++END(__roundeven_sse41)
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
+new file mode 100644
+index 00000000..d92eda65
+--- /dev/null
++++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
+@@ -0,0 +1,31 @@
++/* Multiple versions of __roundeven.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <libm-alias-double.h>
++
++#define roundeven __redirect_roundeven
++#define __roundeven __redirect___roundeven
++#include <math.h>
++#undef roundeven
++#undef __roundeven
++
++#define SYMBOL_NAME roundeven
++#include "ifunc-sse4_1.h"
++
++libc_ifunc_redirected (__redirect_roundeven, __roundeven, IFUNC_SELECTOR ());
++libm_alias_double (__roundeven, roundeven)
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
+new file mode 100644
+index 00000000..72a6e7d1
+--- /dev/null
++++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
+@@ -0,0 +1,3 @@
++#undef __roundevenf
++#define __roundevenf __roundevenf_c
++#include <sysdeps/ieee754/flt-32/s_roundevenf.c>
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
+new file mode 100644
+index 00000000..a76e1080
+--- /dev/null
++++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
+@@ -0,0 +1,24 @@
++/* Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <sysdep.h>
++
++	.section .text.sse4.1,"ax",@progbits
++ENTRY(__roundevenf_sse41)
++	roundss	$8, %xmm0, %xmm0
++	ret
++END(__roundevenf_sse41)
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
+new file mode 100644
+index 00000000..2ee196e6
+--- /dev/null
++++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
+@@ -0,0 +1,31 @@
++/* Multiple versions of __roundevenf.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <libm-alias-float.h>
++
++#define roundevenf __redirect_roundevenf
++#define __roundevenf __redirect___roundevenf
++#include <math.h>
++#undef roundevenf
++#undef __roundevenf
++
++#define SYMBOL_NAME roundevenf
++#include "ifunc-sse4_1.h"
++
++libc_ifunc_redirected (__redirect_roundevenf, __roundevenf, IFUNC_SELECTOR ());
++libm_alias_float (__roundeven, roundeven)
+-- 
+GitLab
+
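Note on the immediate operand: 'roundsd $8' and 'roundss $8' select round-to-nearest with ties-to-even and suppress the inexact exception (bit 3 of the immediate). The following is a hypothetical, stand-alone C sketch of the same operation through the SSE4.1 intrinsics; it is purely illustrative, not part of any patch above, and assumes a compiler invoked with -msse4.1.

/* Hypothetical illustration only: the rounding that roundsd $8 performs,
   expressed with SSE4.1 intrinsics.  Build with -msse4.1.  */
#include <smmintrin.h>
#include <stdio.h>

static double
roundeven_sketch (double x)
{
  __m128d v = _mm_set_sd (x);
  /* _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC == 8, the immediate used
     by the assembly above: ties round to the even neighbour and no inexact
     exception is raised.  */
  v = _mm_round_sd (v, v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  return _mm_cvtsd_f64 (v);
}

int
main (void)
{
  /* 0.5 -> 0, 1.5 -> 2, 2.5 -> 2: halfway cases go to the even value.  */
  printf ("%g %g %g\n", roundeven_sketch (0.5), roundeven_sketch (1.5),
          roundeven_sketch (2.5));
  return 0;
}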
diff --git a/SOURCES/ia-rplc-cas-avoid-extra-load.patch b/SOURCES/ia-rplc-cas-avoid-extra-load.patch
new file mode 100644
index 0000000..ed94b27
--- /dev/null
+++ b/SOURCES/ia-rplc-cas-avoid-extra-load.patch
@@ -0,0 +1,72 @@
+From 77950da440c2b12c153b77ab8b4d1d00c88ca3a9 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 12 Nov 2021 11:47:42 -0800
+Subject: [PATCH] Move assignment out of the CAS condition
+
+Update
+
+commit 49302b8fdf9103b6fc0a398678668a22fa19574c
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Thu Nov 11 06:54:01 2021 -0800
+
+    Avoid extra load with CAS in __pthread_mutex_clocklock_common [BZ #28537]
+
+    Replace boolean CAS with value CAS to avoid the extra load.
+
+and
+
+commit 0b82747dc48d5bf0871bdc6da8cb6eec1256355f
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Thu Nov 11 06:31:51 2021 -0800
+
+    Avoid extra load with CAS in __pthread_mutex_lock_full [BZ #28537]
+
+    Replace boolean CAS with value CAS to avoid the extra load.
+
+by moving the assignment out of the CAS condition.
+
+(cherry picked from commit 120ac6d238825452e8024e2f627da33b2508dfd3)
+---
+ nptl/pthread_mutex_lock.c      | 7 +++----
+ nptl/pthread_mutex_timedlock.c | 7 +++----
+ 2 files changed, 6 insertions(+), 8 deletions(-)
+
+diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
+index eb4d8baa..a633d95e 100644
+--- a/nptl/pthread_mutex_lock.c
++++ b/nptl/pthread_mutex_lock.c
+@@ -299,10 +299,9 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
+ 	     meantime.  */
+ 	  if ((oldval & FUTEX_WAITERS) == 0)
+ 	    {
+-	      int val;
+-	      if ((val = atomic_compare_and_exchange_val_acq
+-		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
+-		    oldval)) != oldval)
++	      int val = atomic_compare_and_exchange_val_acq
++		(&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
++	      if (val != oldval)
+ 		{
+ 		  oldval = val;
+ 		  continue;
+diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
+index c4627ef6..a76c30b7 100644
+--- a/nptl/pthread_mutex_timedlock.c
++++ b/nptl/pthread_mutex_timedlock.c
+@@ -269,10 +269,9 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex,
+ 	     meantime.  */
+ 	  if ((oldval & FUTEX_WAITERS) == 0)
+ 	    {
+-	      int val;
+-	      if ((val = atomic_compare_and_exchange_val_acq
+-		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
+-		    oldval)) != oldval)
++	      int val = atomic_compare_and_exchange_val_acq
++		(&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
++	      if (val != oldval)
+ 		{
+ 		  oldval = val;
+ 		  continue;
+-- 
+GitLab
+
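The shape of the change, seen outside of glibc: a value CAS returns the word it observed, so the failure path can reuse that value instead of reloading the lock, and the assignment no longer has to sit inside the 'if' condition. The snippet below is a hypothetical, self-contained sketch using GCC's __sync_val_compare_and_swap in place of glibc's atomic_compare_and_exchange_val_acq; FUTEX_WAITERS is repeated here only so the sketch compiles on its own.

/* Hypothetical sketch of the pattern above, not glibc code.  */
#define FUTEX_WAITERS 0x80000000	/* bit used by glibc's futex locks */

static void
set_waiters_bit (int *lock)
{
  int oldval = *lock;
  while ((oldval & FUTEX_WAITERS) == 0)
    {
      /* Value CAS: the return value is what *lock contained, so no extra
         load is needed on failure; the assignment stays outside the `if'
         condition, as in the patch.  */
      int val = __sync_val_compare_and_swap (lock, oldval,
                                             oldval | FUTEX_WAITERS);
      if (val == oldval)
        break;		/* We installed FUTEX_WAITERS.  */
      oldval = val;	/* Lost the race; retry with the observed value.  */
    }
}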
diff --git a/SOURCES/ia-rplc-movzx-movzbl.patch b/SOURCES/ia-rplc-movzx-movzbl.patch
new file mode 100644
index 0000000..6e6d89c
--- /dev/null
+++ b/SOURCES/ia-rplc-movzx-movzbl.patch
@@ -0,0 +1,54 @@
+From d867c25060d86f0223347734ef5005ad0792554c Mon Sep 17 00:00:00 2001
+From: Fangrui Song <maskray@google.com>
+Date: Tue, 2 Nov 2021 20:59:52 -0700
+Subject: [PATCH] x86-64: Replace movzx with movzbl
+
+Clang cannot assemble movzx in the AT&T dialect mode.
+
+../sysdeps/x86_64/strcmp.S:2232:16: error: invalid operand for instruction
+ movzx (%rsi), %ecx
+               ^~~~
+
+Change movzx to movzbl, which follows the AT&T dialect and is used
+elsewhere in the file.
+
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit 6720d36b6623c5e48c070d86acf61198b33e144e)
+---
+ sysdeps/x86_64/multiarch/strcmp-sse42.S | 4 ++--
+ sysdeps/x86_64/strcmp.S                 | 4 ++--
+ 2 files changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+index a1ebea46..d8fdeb3a 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+@@ -1771,8 +1771,8 @@ LABEL(strcmp_exitz):
+ 	.p2align 4
+ 	// XXX Same as code above
+ LABEL(Byte0):
+-	movzx	(%rsi), %ecx
+-	movzx	(%rdi), %eax
++	movzbl	(%rsi), %ecx
++	movzbl	(%rdi), %eax
+ 
+ #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
+diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
+index f47c8ad4..aa6df898 100644
+--- a/sysdeps/x86_64/strcmp.S
++++ b/sysdeps/x86_64/strcmp.S
+@@ -2232,8 +2232,8 @@ LABEL(strcmp_exitz):
+ 
+ 	.p2align 4
+ LABEL(Byte0):
+-	movzx	(%rsi), %ecx
+-	movzx	(%rdi), %eax
++	movzbl	(%rsi), %ecx
++	movzbl	(%rdi), %eax
+ 
+ #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
+-- 
+GitLab
+
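The difference is purely assembler syntax: 'movzx' leaves the operand sizes implicit, which Clang's integrated assembler rejects in AT&T mode, while 'movzbl' names the byte source and 32-bit destination explicitly. The fragment below is a hypothetical inline-assembly example (not taken from glibc) using the form both assemblers accept.

/* Hypothetical illustration, not glibc code: load one byte and
   zero-extend it to 32 bits with the explicit AT&T mnemonic.  */
static inline unsigned int
load_byte_zx (const unsigned char *p)
{
  unsigned int value;
  /* 'movzbl' = move with zero-extension, byte source, long destination;
     GNU as and Clang's integrated assembler both accept it.  */
  __asm__ ("movzbl %1, %0" : "=r" (value) : "m" (*p));
  return value;
}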
diff --git a/SOURCES/ia-rplc-sse2-inst-avx-memcmp-evex-movbe.patch b/SOURCES/ia-rplc-sse2-inst-avx-memcmp-evex-movbe.patch
new file mode 100644
index 0000000..a42aa4b
--- /dev/null
+++ b/SOURCES/ia-rplc-sse2-inst-avx-memcmp-evex-movbe.patch
@@ -0,0 +1,46 @@
+From e7e0ac928b21e5f47e5b648723851c5270db24f2 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sat, 23 Oct 2021 01:26:47 -0400
+Subject: [PATCH] x86: Replace sse2 instructions with avx in
+ memcmp-evex-movbe.S
+
+This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.
+
+It could potentially be dangerous to use SSE2 if this function is ever
+called without using 'vzeroupper' beforehand. While compilers appear
+to use 'vzeroupper' before function calls if AVX2 has been used, using
+SSE2 here is more brittle. Since it is not absolutely necessary, it
+should be avoided.
+
+It costs 2 extra bytes but the extra bytes should only eat into
+alignment padding.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+
+(cherry picked from commit bad852b61b79503fcb3c5fc379c70f768df3e1fb)
+---
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 2761b54f..640f6757 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -561,13 +561,13 @@ L(between_16_31):
+ 	/* From 16 to 31 bytes.  No branch when size == 16.  */
+ 
+ 	/* Use movups to save code size.  */
+-	movups	(%rsi), %xmm2
++	vmovdqu	(%rsi), %xmm2
+ 	VPCMP	$4, (%rdi), %xmm2, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(return_vec_0_lv)
+ 	/* Use overlapping loads to avoid branches.  */
+-	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
++	vmovdqu	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+ 	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+ 	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
+ 	kmovd	%k1, %eax
+-- 
+GitLab
+
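Background: the concern is the SSE/AVX transition penalty. Legacy-SSE encoded instructions such as 'movups' can stall when the upper halves of the YMM/ZMM registers are dirty, which 'vzeroupper' normally avoids. As a hypothetical compiler-level illustration (not part of the patch), the same unaligned 16-byte load written with intrinsics is emitted as the legacy 'movdqu' by default, but as the VEX-encoded 'vmovdqu' once the file is built with -mavx or later, which is the encoding the patch switches to by hand.

/* Hypothetical illustration, not glibc code.  Compiled with -mavx (or a
   later ISA level) the load below becomes the VEX-encoded 'vmovdqu';
   without it, the legacy-SSE 'movdqu' form is emitted instead.  */
#include <emmintrin.h>

__m128i
load16 (const void *p)
{
  return _mm_loadu_si128 ((const __m128i *) p);
}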
diff --git a/SOURCES/ia-set-rep_movsb_threshold-2112.patch b/SOURCES/ia-set-rep_movsb_threshold-2112.patch
new file mode 100644
index 0000000..fbba9b9
--- /dev/null
+++ b/SOURCES/ia-set-rep_movsb_threshold-2112.patch
@@ -0,0 +1,57 @@
+From 238aa7dce4d9684b6326665c79eabad72d5679bf Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 2 Mar 2022 16:21:12 -0800
+Subject: [PATCH] x86: Set rep_movsb_threshold to 2112 on processors with FSRM
+
+The glibc memcpy benchmark on Intel Core i7-1065G7 (Ice Lake) showed
+that REP MOVSB became faster after 2112 bytes:
+
+                                      Vector Move       REP MOVSB
+length=2112, align1=0, align2=0:        24.20             24.40
+length=2112, align1=1, align2=0:        26.07             23.13
+length=2112, align1=0, align2=1:        27.18             28.13
+length=2112, align1=1, align2=1:        26.23             25.16
+length=2176, align1=0, align2=0:        23.18             22.52
+length=2176, align1=2, align2=0:        25.45             22.52
+length=2176, align1=0, align2=2:        27.14             27.82
+length=2176, align1=2, align2=2:        22.73             25.56
+length=2240, align1=0, align2=0:        24.62             24.25
+length=2240, align1=3, align2=0:        29.77             27.15
+length=2240, align1=0, align2=3:        35.55             29.93
+length=2240, align1=3, align2=3:        34.49             25.15
+length=2304, align1=0, align2=0:        34.75             26.64
+length=2304, align1=4, align2=0:        32.09             22.63
+length=2304, align1=0, align2=4:        28.43             31.24
+
+Use REP MOVSB for data size > 2112 bytes in memcpy on processors with
+fast short REP MOVSB (FSRM).
+
+	* sysdeps/x86/dl-cacheinfo.h (dl_init_cacheinfo): Set
+	rep_movsb_threshold to 2112 on processors with fast short REP
+	MOVSB (FSRM).
+
+(cherry picked from commit cf2c57526ba4b57e6863ad4db8a868e2678adce8)
+---
+ sysdeps/x86/cacheinfo.h | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
+index f72f634a..cc3941d3 100644
+--- a/sysdeps/x86/cacheinfo.h
++++ b/sysdeps/x86/cacheinfo.h
+@@ -430,6 +430,12 @@ init_cacheinfo (void)
+       rep_movsb_threshold = 2048 * (16 / 16);
+       minimum_rep_movsb_threshold = 16 * 8;
+     }
++
++  /* NB: The default REP MOVSB threshold is 2112 on processors with fast
++     short REP MOVSB (FSRM).  */
++  if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
++    rep_movsb_threshold = 2112;
++
+   if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold)
+     __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
+   else
+-- 
+GitLab
+
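To make the threshold's role concrete, here is a hypothetical C sketch (not glibc's implementation, which performs this dispatch inside the assembly memmove/memcpy routines) of a copy function that consults such a threshold before using 'rep movsb'; the 2112 value mirrors the FSRM default installed by the patch.

/* Hypothetical sketch only: threshold-based dispatch between a vector
   copy path and REP MOVSB.  Real glibc keeps the value in
   __x86_rep_movsb_threshold and branches on it in assembly.  */
#include <stddef.h>
#include <string.h>

#define REP_MOVSB_THRESHOLD 2112	/* default for FSRM, per the patch */

void *
copy_sketch (void *dst, const void *src, size_t n)
{
  if (n <= REP_MOVSB_THRESHOLD)
    return memcpy (dst, src, n);	/* stands in for the vector path */

  void *d = dst;
  __asm__ volatile ("rep movsb"
                    : "+D" (d), "+S" (src), "+c" (n)
                    :
                    : "memory");
  return dst;
}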
diff --git a/SOURCES/ia-shrink-memcmp-sse4-code-size.patch b/SOURCES/ia-shrink-memcmp-sse4-code-size.patch
new file mode 100644
index 0000000..86d4d16
--- /dev/null
+++ b/SOURCES/ia-shrink-memcmp-sse4-code-size.patch
@@ -0,0 +1,2428 @@
+From 17bb3526d088d8854b446a6faabf1fad3812b2c9 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 10 Nov 2021 16:18:56 -0600
+Subject: [PATCH] x86: Shrink memcmp-sse4.S code size
+
+No bug.
+
+This implementation refactors memcmp-sse4.S primarily with minimizing
+code size in mind. It does this by removing the lookup table logic and
+removing the unrolled check from (256, 512] bytes.
+
+memcmp-sse4 code size reduction : -3487 bytes
+wmemcmp-sse4 code size reduction: -1472 bytes
+
+The current memcmp-sse4.S implementation has a large code size
+cost. This has serious adverse effects on the ICache / ITLB. While
+in micro-benchmarks the implementation appears fast, traces of
+real-world code have shown that the speed in micro benchmarks does not
+translate when the ICache/ITLB are not primed, and that the cost
+of the code size has measurable negative effects on overall
+application performance.
+
+See https://research.google/pubs/pub48320/ for more details.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit 2f9062d7171850451e6044ef78d91ff8c017b9c0)
+---
+ sysdeps/x86_64/multiarch/memcmp-sse4.S | 2267 +++++++-----------------
+ 1 file changed, 646 insertions(+), 1621 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
+index 302900f5..50060006 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
++++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
+@@ -25,14 +25,14 @@
+ #  define MEMCMP	__memcmp_sse4_1
+ # endif
+ 
+-# define JMPTBL(I, B)	(I - B)
++#ifdef USE_AS_WMEMCMP
++# define CMPEQ	pcmpeqd
++# define CHAR_SIZE	4
++#else
++# define CMPEQ	pcmpeqb
++# define CHAR_SIZE	1
++#endif
+ 
+-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+-  lea		TABLE(%rip), %r11;				\
+-  movslq	(%r11, INDEX, SCALE), %rcx;			\
+-  add		%r11, %rcx;					\
+-  _CET_NOTRACK jmp *%rcx;					\
+-  ud2
+ 
+ /* Warning!
+            wmemcmp has to use SIGNED comparison for elements.
+@@ -47,33 +47,253 @@ ENTRY (MEMCMP)
+ 	/* Clear the upper 32 bits.  */
+ 	mov	%edx, %edx
+ # endif
+-	pxor	%xmm0, %xmm0
+ 	cmp	$79, %RDX_LP
+ 	ja	L(79bytesormore)
++
++	cmp	$CHAR_SIZE, %RDX_LP
++	jbe	L(firstbyte)
++
++	/* N in (CHAR_SIZE, 79) bytes.  */
++	cmpl	$32, %edx
++	ja	L(more_32_bytes)
++
++	cmpl	$16, %edx
++	jae	L(16_to_32_bytes)
++
+ # ifndef USE_AS_WMEMCMP
+-	cmp	$1, %RDX_LP
+-	je	L(firstbyte)
++	cmpl	$8, %edx
++	jae	L(8_to_16_bytes)
++
++	cmpl	$4, %edx
++	jb	L(2_to_3_bytes)
++
++	movl	(%rdi), %eax
++	movl	(%rsi), %ecx
++
++	bswap	%eax
++	bswap	%ecx
++
++	shlq	$32, %rax
++	shlq	$32, %rcx
++
++	movl	-4(%rdi, %rdx), %edi
++	movl	-4(%rsi, %rdx), %esi
++
++	bswap	%edi
++	bswap	%esi
++
++	orq	%rdi, %rax
++	orq	%rsi, %rcx
++	subq	%rcx, %rax
++	cmovne	%edx, %eax
++	sbbl	%ecx, %ecx
++	orl	%ecx, %eax
++	ret
++
++	.p2align 4,, 8
++L(2_to_3_bytes):
++	movzwl	(%rdi), %eax
++	movzwl	(%rsi), %ecx
++	shll	$8, %eax
++	shll	$8, %ecx
++	bswap	%eax
++	bswap	%ecx
++	movzbl	-1(%rdi, %rdx), %edi
++	movzbl	-1(%rsi, %rdx), %esi
++	orl	%edi, %eax
++	orl	%esi, %ecx
++	subl	%ecx, %eax
++	ret
++
++	.p2align 4,, 8
++L(8_to_16_bytes):
++	movq	(%rdi), %rax
++	movq	(%rsi), %rcx
++
++	bswap	%rax
++	bswap	%rcx
++
++	subq	%rcx, %rax
++	jne	L(8_to_16_bytes_done)
++
++	movq	-8(%rdi, %rdx), %rax
++	movq	-8(%rsi, %rdx), %rcx
++
++	bswap	%rax
++	bswap	%rcx
++
++	subq	%rcx, %rax
++
++L(8_to_16_bytes_done):
++	cmovne	%edx, %eax
++	sbbl	%ecx, %ecx
++	orl	%ecx, %eax
++	ret
++# else
++	xorl	%eax, %eax
++	movl	(%rdi), %ecx
++	cmpl	(%rsi), %ecx
++	jne	L(8_to_16_bytes_done)
++	movl	4(%rdi), %ecx
++	cmpl	4(%rsi), %ecx
++	jne	L(8_to_16_bytes_done)
++	movl	-4(%rdi, %rdx), %ecx
++	cmpl	-4(%rsi, %rdx), %ecx
++	jne	L(8_to_16_bytes_done)
++	ret
+ # endif
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+ 
+-# ifndef USE_AS_WMEMCMP
+-	.p2align 4
++	.p2align 4,, 3
++L(ret_zero):
++	xorl	%eax, %eax
++L(zero):
++	ret
++
++	.p2align 4,, 8
+ L(firstbyte):
++	jb	L(ret_zero)
++# ifdef USE_AS_WMEMCMP
++	xorl	%eax, %eax
++	movl	(%rdi), %ecx
++	cmpl	(%rsi), %ecx
++	je	L(zero)
++L(8_to_16_bytes_done):
++	setg	%al
++	leal	-1(%rax, %rax), %eax
++# else
+ 	movzbl	(%rdi), %eax
+ 	movzbl	(%rsi), %ecx
+ 	sub	%ecx, %eax
++# endif
+ 	ret
++
++	.p2align 4
++L(vec_return_begin_48):
++	addq	$16, %rdi
++	addq	$16, %rsi
++L(vec_return_begin_32):
++	bsfl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	32(%rdi, %rax), %ecx
++	xorl	%edx, %edx
++	cmpl	32(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	32(%rsi, %rax), %ecx
++	movzbl	32(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
++	ret
++
++	.p2align 4
++L(vec_return_begin_16):
++	addq	$16, %rdi
++	addq	$16, %rsi
++L(vec_return_begin):
++	bsfl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	(%rdi, %rax), %ecx
++	xorl	%edx, %edx
++	cmpl	(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	(%rsi, %rax), %ecx
++	movzbl	(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
++	ret
++
++	.p2align 4
++L(vec_return_end_16):
++	subl	$16, %edx
++L(vec_return_end):
++	bsfl	%eax, %eax
++	addl	%edx, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	-16(%rdi, %rax), %ecx
++	xorl	%edx, %edx
++	cmpl	-16(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	-16(%rsi, %rax), %ecx
++	movzbl	-16(%rdi, %rax), %eax
++	subl	%ecx, %eax
+ # endif
++	ret
++
++	.p2align 4,, 8
++L(more_32_bytes):
++	movdqu	(%rdi), %xmm0
++	movdqu	(%rsi), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqu	16(%rdi), %xmm0
++	movdqu	16(%rsi), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_16)
++
++	cmpl	$64, %edx
++	jbe	L(32_to_64_bytes)
++	movdqu	32(%rdi), %xmm0
++	movdqu	32(%rsi), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_32)
++
++	.p2align 4,, 6
++L(32_to_64_bytes):
++	movdqu	-32(%rdi, %rdx), %xmm0
++	movdqu	-32(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end_16)
++
++	movdqu	-16(%rdi, %rdx), %xmm0
++	movdqu	-16(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end)
++	ret
++
++	.p2align 4
++L(16_to_32_bytes):
++	movdqu	(%rdi), %xmm0
++	movdqu	(%rsi), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqu	-16(%rdi, %rdx), %xmm0
++	movdqu	-16(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end)
++	ret
++
+ 
+ 	.p2align 4
+ L(79bytesormore):
++	movdqu	(%rdi), %xmm0
+ 	movdqu	(%rsi), %xmm1
+-	movdqu	(%rdi), %xmm2
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++
+ 	mov	%rsi, %rcx
+ 	and	$-16, %rsi
+ 	add	$16, %rsi
+@@ -86,1694 +306,499 @@ L(79bytesormore):
+ 
+ 	cmp	$128, %rdx
+ 	ja	L(128bytesormore)
+-L(less128bytes):
+-	sub	$64, %rdx
+-
+-	movdqu	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+ 
+-	movdqu	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-
+-	movdqu	32(%rdi), %xmm2
+-	pxor	32(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(48bytesin256)
+-
+-	movdqu	48(%rdi), %xmm2
+-	pxor	48(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(64bytesin256)
+-	cmp	$32, %rdx
+-	jb	L(less32bytesin64)
+-
+-	movdqu	64(%rdi), %xmm2
+-	pxor	64(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(80bytesin256)
+-
+-	movdqu	80(%rdi), %xmm2
+-	pxor	80(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(96bytesin256)
+-	sub	$32, %rdx
+-	add	$32, %rdi
+-	add	$32, %rsi
+-L(less32bytesin64):
+-	add	$64, %rdi
+-	add	$64, %rsi
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
++	.p2align 4,, 6
++L(less128bytes):
++	movdqu	(%rdi), %xmm1
++	CMPEQ	(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqu	16(%rdi), %xmm1
++	CMPEQ	16(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_16)
++
++	movdqu	32(%rdi), %xmm1
++	CMPEQ	32(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_32)
++
++	movdqu	48(%rdi), %xmm1
++	CMPEQ	48(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_48)
++
++	cmp	$96, %rdx
++	jb	L(32_to_64_bytes)
++
++	addq	$64, %rdi
++	addq	$64, %rsi
++	subq	$64, %rdx
++
++	.p2align 4,, 6
++L(last_64_bytes):
++	movdqu	(%rdi), %xmm1
++	CMPEQ	(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqu	16(%rdi), %xmm1
++	CMPEQ	16(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_16)
++
++	movdqu	-32(%rdi, %rdx), %xmm0
++	movdqu	-32(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end_16)
++
++	movdqu	-16(%rdi, %rdx), %xmm0
++	movdqu	-16(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end)
++	ret
+ 
++	.p2align 4
+ L(128bytesormore):
+-	cmp	$512, %rdx
+-	ja	L(512bytesormore)
+ 	cmp	$256, %rdx
+-	ja	L(less512bytes)
++	ja	L(unaligned_loop)
+ L(less256bytes):
+-	sub	$128, %rdx
+-
+-	movdqu	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+-
+-	movdqu	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-
+-	movdqu	32(%rdi), %xmm2
+-	pxor	32(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(48bytesin256)
+-
+-	movdqu	48(%rdi), %xmm2
+-	pxor	48(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(64bytesin256)
+-
+-	movdqu	64(%rdi), %xmm2
+-	pxor	64(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(80bytesin256)
+-
+-	movdqu	80(%rdi), %xmm2
+-	pxor	80(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(96bytesin256)
+-
+-	movdqu	96(%rdi), %xmm2
+-	pxor	96(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(112bytesin256)
+-
+-	movdqu	112(%rdi), %xmm2
+-	pxor	112(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(128bytesin256)
+-
+-	add	$128, %rsi
+-	add	$128, %rdi
+-
+-	cmp	$64, %rdx
+-	jae	L(less128bytes)
+-
+-	cmp	$32, %rdx
+-	jb	L(less32bytesin128)
+-
+-	movdqu	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+-
+-	movdqu	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-	sub	$32, %rdx
+-	add	$32, %rdi
+-	add	$32, %rsi
+-L(less32bytesin128):
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+-
+-L(less512bytes):
+-	sub	$256, %rdx
+-	movdqu	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+-
+-	movdqu	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-
+-	movdqu	32(%rdi), %xmm2
+-	pxor	32(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(48bytesin256)
+-
+-	movdqu	48(%rdi), %xmm2
+-	pxor	48(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(64bytesin256)
+-
+-	movdqu	64(%rdi), %xmm2
+-	pxor	64(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(80bytesin256)
+-
+-	movdqu	80(%rdi), %xmm2
+-	pxor	80(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(96bytesin256)
+-
+-	movdqu	96(%rdi), %xmm2
+-	pxor	96(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(112bytesin256)
+-
+-	movdqu	112(%rdi), %xmm2
+-	pxor	112(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(128bytesin256)
+-
+-	movdqu	128(%rdi), %xmm2
+-	pxor	128(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(144bytesin256)
+-
+-	movdqu	144(%rdi), %xmm2
+-	pxor	144(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(160bytesin256)
+-
+-	movdqu	160(%rdi), %xmm2
+-	pxor	160(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(176bytesin256)
+-
+-	movdqu	176(%rdi), %xmm2
+-	pxor	176(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(192bytesin256)
+-
+-	movdqu	192(%rdi), %xmm2
+-	pxor	192(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(208bytesin256)
+-
+-	movdqu	208(%rdi), %xmm2
+-	pxor	208(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(224bytesin256)
+-
+-	movdqu	224(%rdi), %xmm2
+-	pxor	224(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(240bytesin256)
+-
+-	movdqu	240(%rdi), %xmm2
+-	pxor	240(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(256bytesin256)
+-
+-	add	$256, %rsi
+-	add	$256, %rdi
+-
+-	cmp	$128, %rdx
+-	jae	L(less256bytes)
++	movdqu	(%rdi), %xmm1
++	CMPEQ	(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqu	16(%rdi), %xmm1
++	CMPEQ	16(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_16)
++
++	movdqu	32(%rdi), %xmm1
++	CMPEQ	32(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_32)
++
++	movdqu	48(%rdi), %xmm1
++	CMPEQ	48(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_48)
++
++	addq	$64, %rdi
++	addq	$64, %rsi
++
++	movdqu	(%rdi), %xmm1
++	CMPEQ	(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqu	16(%rdi), %xmm1
++	CMPEQ	16(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_16)
++
++	movdqu	32(%rdi), %xmm1
++	CMPEQ	32(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_32)
++
++	movdqu	48(%rdi), %xmm1
++	CMPEQ	48(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_48)
++
++	addq	$-128, %rdx
++	subq	$-64, %rsi
++	subq	$-64, %rdi
+ 
+ 	cmp	$64, %rdx
+-	jae	L(less128bytes)
++	ja	L(less128bytes)
+ 
+ 	cmp	$32, %rdx
+-	jb	L(less32bytesin256)
+-
+-	movdqu	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+-
+-	movdqu	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-	sub	$32, %rdx
+-	add	$32, %rdi
+-	add	$32, %rsi
+-L(less32bytesin256):
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
++	ja	L(last_64_bytes)
++
++	movdqu	-32(%rdi, %rdx), %xmm0
++	movdqu	-32(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end_16)
++
++	movdqu	-16(%rdi, %rdx), %xmm0
++	movdqu	-16(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end)
++	ret
+ 
+ 	.p2align 4
+-L(512bytesormore):
++L(unaligned_loop):
+ # ifdef DATA_CACHE_SIZE_HALF
+ 	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
+ # else
+ 	mov	__x86_data_cache_size_half(%rip), %R8_LP
+ # endif
+-	mov	%r8, %r9
+-	shr	$1, %r8
+-	add	%r9, %r8
+-	cmp	%r8, %rdx
+-	ja	L(L2_L3_cache_unaglined)
++	movq	%r8, %r9
++	addq	%r8, %r8
++	addq	%r9, %r8
++	cmpq	%r8, %rdx
++	ja	L(L2_L3_cache_unaligned)
+ 	sub	$64, %rdx
+ 	.p2align 4
+ L(64bytesormore_loop):
+-	movdqu	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	movdqa	%xmm2, %xmm1
++	movdqu	(%rdi), %xmm0
++	movdqu	16(%rdi), %xmm1
++	movdqu	32(%rdi), %xmm2
++	movdqu	48(%rdi), %xmm3
+ 
+-	movdqu	16(%rdi), %xmm3
+-	pxor	16(%rsi), %xmm3
+-	por	%xmm3, %xmm1
++	CMPEQ	(%rsi), %xmm0
++	CMPEQ	16(%rsi), %xmm1
++	CMPEQ	32(%rsi), %xmm2
++	CMPEQ	48(%rsi), %xmm3
+ 
+-	movdqu	32(%rdi), %xmm4
+-	pxor	32(%rsi), %xmm4
+-	por	%xmm4, %xmm1
++	pand	%xmm0, %xmm1
++	pand	%xmm2, %xmm3
++	pand	%xmm1, %xmm3
+ 
+-	movdqu	48(%rdi), %xmm5
+-	pxor	48(%rsi), %xmm5
+-	por	%xmm5, %xmm1
++	pmovmskb %xmm3, %eax
++	incw	%ax
++	jnz	L(64bytesormore_loop_end)
+ 
+-	ptest	%xmm1, %xmm0
+-	jnc	L(64bytesormore_loop_end)
+ 	add	$64, %rsi
+ 	add	$64, %rdi
+ 	sub	$64, %rdx
+-	jae	L(64bytesormore_loop)
++	ja	L(64bytesormore_loop)
+ 
+-	add	$64, %rdx
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
++	.p2align 4,, 6
++L(loop_tail):
++	addq	%rdx, %rdi
++	movdqu	(%rdi), %xmm0
++	movdqu	16(%rdi), %xmm1
++	movdqu	32(%rdi), %xmm2
++	movdqu	48(%rdi), %xmm3
++
++	addq	%rdx, %rsi
++	movdqu	(%rsi), %xmm4
++	movdqu	16(%rsi), %xmm5
++	movdqu	32(%rsi), %xmm6
++	movdqu	48(%rsi), %xmm7
++
++	CMPEQ	%xmm4, %xmm0
++	CMPEQ	%xmm5, %xmm1
++	CMPEQ	%xmm6, %xmm2
++	CMPEQ	%xmm7, %xmm3
++
++	pand	%xmm0, %xmm1
++	pand	%xmm2, %xmm3
++	pand	%xmm1, %xmm3
++
++	pmovmskb %xmm3, %eax
++	incw	%ax
++	jnz	L(64bytesormore_loop_end)
++	ret
+ 
+-L(L2_L3_cache_unaglined):
+-	sub	$64, %rdx
++L(L2_L3_cache_unaligned):
++	subq	$64, %rdx
+ 	.p2align 4
+ L(L2_L3_unaligned_128bytes_loop):
+ 	prefetchnta 0x1c0(%rdi)
+ 	prefetchnta 0x1c0(%rsi)
+-	movdqu	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	movdqa	%xmm2, %xmm1
+ 
+-	movdqu	16(%rdi), %xmm3
+-	pxor	16(%rsi), %xmm3
+-	por	%xmm3, %xmm1
++	movdqu	(%rdi), %xmm0
++	movdqu	16(%rdi), %xmm1
++	movdqu	32(%rdi), %xmm2
++	movdqu	48(%rdi), %xmm3
++
++	CMPEQ	(%rsi), %xmm0
++	CMPEQ	16(%rsi), %xmm1
++	CMPEQ	32(%rsi), %xmm2
++	CMPEQ	48(%rsi), %xmm3
+ 
+-	movdqu	32(%rdi), %xmm4
+-	pxor	32(%rsi), %xmm4
+-	por	%xmm4, %xmm1
++	pand	%xmm0, %xmm1
++	pand	%xmm2, %xmm3
++	pand	%xmm1, %xmm3
+ 
+-	movdqu	48(%rdi), %xmm5
+-	pxor	48(%rsi), %xmm5
+-	por	%xmm5, %xmm1
++	pmovmskb %xmm3, %eax
++	incw	%ax
++	jnz	L(64bytesormore_loop_end)
+ 
+-	ptest	%xmm1, %xmm0
+-	jnc	L(64bytesormore_loop_end)
+ 	add	$64, %rsi
+ 	add	$64, %rdi
+ 	sub	$64, %rdx
+-	jae	L(L2_L3_unaligned_128bytes_loop)
++	ja	L(L2_L3_unaligned_128bytes_loop)
++	jmp	L(loop_tail)
+ 
+-	add	$64, %rdx
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+ 
+-/*
+- * This case is for machines which are sensitive for unaligned instructions.
+- */
++	/* This case is for machines which are sensitive for unaligned
++	 * instructions.  */
+ 	.p2align 4
+ L(2aligned):
+ 	cmp	$128, %rdx
+ 	ja	L(128bytesormorein2aligned)
+ L(less128bytesin2aligned):
+-	sub	$64, %rdx
+-
+-	movdqa	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+-
+-	movdqa	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-
+-	movdqa	32(%rdi), %xmm2
+-	pxor	32(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(48bytesin256)
+-
+-	movdqa	48(%rdi), %xmm2
+-	pxor	48(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(64bytesin256)
+-	cmp	$32, %rdx
+-	jb	L(less32bytesin64in2alinged)
+-
+-	movdqa	64(%rdi), %xmm2
+-	pxor	64(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(80bytesin256)
+-
+-	movdqa	80(%rdi), %xmm2
+-	pxor	80(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(96bytesin256)
+-	sub	$32, %rdx
+-	add	$32, %rdi
+-	add	$32, %rsi
+-L(less32bytesin64in2alinged):
+-	add	$64, %rdi
+-	add	$64, %rsi
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
++	movdqa	(%rdi), %xmm1
++	CMPEQ	(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqa	16(%rdi), %xmm1
++	CMPEQ	16(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_16)
++
++	movdqa	32(%rdi), %xmm1
++	CMPEQ	32(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_32)
++
++	movdqa	48(%rdi), %xmm1
++	CMPEQ	48(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_48)
++
++	cmp	$96, %rdx
++	jb	L(32_to_64_bytes)
++
++	addq	$64, %rdi
++	addq	$64, %rsi
++	subq	$64, %rdx
++
++	.p2align 4,, 6
++L(aligned_last_64_bytes):
++	movdqa	(%rdi), %xmm1
++	CMPEQ	(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqa	16(%rdi), %xmm1
++	CMPEQ	16(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_16)
++
++	movdqu	-32(%rdi, %rdx), %xmm0
++	movdqu	-32(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end_16)
++
++	movdqu	-16(%rdi, %rdx), %xmm0
++	movdqu	-16(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end)
++	ret
+ 
+ 	.p2align 4
+ L(128bytesormorein2aligned):
+-	cmp	$512, %rdx
+-	ja	L(512bytesormorein2aligned)
+ 	cmp	$256, %rdx
+-	ja	L(256bytesormorein2aligned)
++	ja	L(aligned_loop)
+ L(less256bytesin2alinged):
+-	sub	$128, %rdx
+-
+-	movdqa	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+-
+-	movdqa	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-
+-	movdqa	32(%rdi), %xmm2
+-	pxor	32(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(48bytesin256)
+-
+-	movdqa	48(%rdi), %xmm2
+-	pxor	48(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(64bytesin256)
+-
+-	movdqa	64(%rdi), %xmm2
+-	pxor	64(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(80bytesin256)
+-
+-	movdqa	80(%rdi), %xmm2
+-	pxor	80(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(96bytesin256)
+-
+-	movdqa	96(%rdi), %xmm2
+-	pxor	96(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(112bytesin256)
+-
+-	movdqa	112(%rdi), %xmm2
+-	pxor	112(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(128bytesin256)
+-
+-	add	$128, %rsi
+-	add	$128, %rdi
++	movdqa	(%rdi), %xmm1
++	CMPEQ	(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqa	16(%rdi), %xmm1
++	CMPEQ	16(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_16)
++
++	movdqa	32(%rdi), %xmm1
++	CMPEQ	32(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_32)
++
++	movdqa	48(%rdi), %xmm1
++	CMPEQ	48(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_48)
++
++	addq	$64, %rdi
++	addq	$64, %rsi
++
++	movdqa	(%rdi), %xmm1
++	CMPEQ	(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqa	16(%rdi), %xmm1
++	CMPEQ	16(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_16)
++
++	movdqa	32(%rdi), %xmm1
++	CMPEQ	32(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_32)
++
++	movdqa	48(%rdi), %xmm1
++	CMPEQ	48(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_48)
++
++	addq	$-128, %rdx
++	subq	$-64, %rsi
++	subq	$-64, %rdi
+ 
+ 	cmp	$64, %rdx
+-	jae	L(less128bytesin2aligned)
++	ja	L(less128bytesin2aligned)
+ 
+ 	cmp	$32, %rdx
+-	jb	L(less32bytesin128in2aligned)
+-
+-	movdqu	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+-
+-	movdqu	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-	sub	$32, %rdx
+-	add	$32, %rdi
+-	add	$32, %rsi
+-L(less32bytesin128in2aligned):
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+-
+-	.p2align 4
+-L(256bytesormorein2aligned):
+-
+-	sub	$256, %rdx
+-	movdqa	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+-
+-	movdqa	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-
+-	movdqa	32(%rdi), %xmm2
+-	pxor	32(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(48bytesin256)
+-
+-	movdqa	48(%rdi), %xmm2
+-	pxor	48(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(64bytesin256)
+-
+-	movdqa	64(%rdi), %xmm2
+-	pxor	64(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(80bytesin256)
+-
+-	movdqa	80(%rdi), %xmm2
+-	pxor	80(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(96bytesin256)
+-
+-	movdqa	96(%rdi), %xmm2
+-	pxor	96(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(112bytesin256)
+-
+-	movdqa	112(%rdi), %xmm2
+-	pxor	112(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(128bytesin256)
+-
+-	movdqa	128(%rdi), %xmm2
+-	pxor	128(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(144bytesin256)
+-
+-	movdqa	144(%rdi), %xmm2
+-	pxor	144(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(160bytesin256)
+-
+-	movdqa	160(%rdi), %xmm2
+-	pxor	160(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(176bytesin256)
+-
+-	movdqa	176(%rdi), %xmm2
+-	pxor	176(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(192bytesin256)
+-
+-	movdqa	192(%rdi), %xmm2
+-	pxor	192(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(208bytesin256)
+-
+-	movdqa	208(%rdi), %xmm2
+-	pxor	208(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(224bytesin256)
+-
+-	movdqa	224(%rdi), %xmm2
+-	pxor	224(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(240bytesin256)
+-
+-	movdqa	240(%rdi), %xmm2
+-	pxor	240(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(256bytesin256)
+-
+-	add	$256, %rsi
+-	add	$256, %rdi
+-
+-	cmp	$128, %rdx
+-	jae	L(less256bytesin2alinged)
+-
+-	cmp	$64, %rdx
+-	jae	L(less128bytesin2aligned)
+-
+-	cmp	$32, %rdx
+-	jb	L(less32bytesin256in2alinged)
+-
+-	movdqa	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+-
+-	movdqa	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-	sub	$32, %rdx
+-	add	$32, %rdi
+-	add	$32, %rsi
+-L(less32bytesin256in2alinged):
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
++	ja	L(aligned_last_64_bytes)
++
++	movdqu	-32(%rdi, %rdx), %xmm0
++	movdqu	-32(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end_16)
++
++	movdqu	-16(%rdi, %rdx), %xmm0
++	movdqu	-16(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end)
++	ret
+ 
+ 	.p2align 4
+-L(512bytesormorein2aligned):
++L(aligned_loop):
+ # ifdef DATA_CACHE_SIZE_HALF
+ 	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
+ # else
+ 	mov	__x86_data_cache_size_half(%rip), %R8_LP
+ # endif
+-	mov	%r8, %r9
+-	shr	$1, %r8
+-	add	%r9, %r8
+-	cmp	%r8, %rdx
+-	ja	L(L2_L3_cache_aglined)
++	movq	%r8, %r9
++	addq	%r8, %r8
++	addq	%r9, %r8
++	cmpq	%r8, %rdx
++	ja	L(L2_L3_cache_aligned)
+ 
+ 	sub	$64, %rdx
+ 	.p2align 4
+ L(64bytesormore_loopin2aligned):
+-	movdqa	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	movdqa	%xmm2, %xmm1
+-
+-	movdqa	16(%rdi), %xmm3
+-	pxor	16(%rsi), %xmm3
+-	por	%xmm3, %xmm1
++	movdqa	(%rdi), %xmm0
++	movdqa	16(%rdi), %xmm1
++	movdqa	32(%rdi), %xmm2
++	movdqa	48(%rdi), %xmm3
+ 
+-	movdqa	32(%rdi), %xmm4
+-	pxor	32(%rsi), %xmm4
+-	por	%xmm4, %xmm1
++	CMPEQ	(%rsi), %xmm0
++	CMPEQ	16(%rsi), %xmm1
++	CMPEQ	32(%rsi), %xmm2
++	CMPEQ	48(%rsi), %xmm3
+ 
+-	movdqa	48(%rdi), %xmm5
+-	pxor	48(%rsi), %xmm5
+-	por	%xmm5, %xmm1
++	pand	%xmm0, %xmm1
++	pand	%xmm2, %xmm3
++	pand	%xmm1, %xmm3
+ 
+-	ptest	%xmm1, %xmm0
+-	jnc	L(64bytesormore_loop_end)
++	pmovmskb %xmm3, %eax
++	incw	%ax
++	jnz	L(64bytesormore_loop_end)
+ 	add	$64, %rsi
+ 	add	$64, %rdi
+ 	sub	$64, %rdx
+-	jae	L(64bytesormore_loopin2aligned)
+-
+-	add	$64, %rdx
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+-L(L2_L3_cache_aglined):
+-	sub	$64, %rdx
++	ja	L(64bytesormore_loopin2aligned)
++	jmp	L(loop_tail)
+ 
++L(L2_L3_cache_aligned):
++	subq	$64, %rdx
+ 	.p2align 4
+ L(L2_L3_aligned_128bytes_loop):
+ 	prefetchnta 0x1c0(%rdi)
+ 	prefetchnta 0x1c0(%rsi)
+-	movdqa	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	movdqa	%xmm2, %xmm1
+-
+-	movdqa	16(%rdi), %xmm3
+-	pxor	16(%rsi), %xmm3
+-	por	%xmm3, %xmm1
++	movdqa	(%rdi), %xmm0
++	movdqa	16(%rdi), %xmm1
++	movdqa	32(%rdi), %xmm2
++	movdqa	48(%rdi), %xmm3
+ 
+-	movdqa	32(%rdi), %xmm4
+-	pxor	32(%rsi), %xmm4
+-	por	%xmm4, %xmm1
++	CMPEQ	(%rsi), %xmm0
++	CMPEQ	16(%rsi), %xmm1
++	CMPEQ	32(%rsi), %xmm2
++	CMPEQ	48(%rsi), %xmm3
+ 
+-	movdqa	48(%rdi), %xmm5
+-	pxor	48(%rsi), %xmm5
+-	por	%xmm5, %xmm1
++	pand	%xmm0, %xmm1
++	pand	%xmm2, %xmm3
++	pand	%xmm1, %xmm3
+ 
+-	ptest	%xmm1, %xmm0
+-	jnc	L(64bytesormore_loop_end)
+-	add	$64, %rsi
+-	add	$64, %rdi
+-	sub	$64, %rdx
+-	jae	L(L2_L3_aligned_128bytes_loop)
+-
+-	add	$64, %rdx
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
++	pmovmskb %xmm3, %eax
++	incw	%ax
++	jnz	L(64bytesormore_loop_end)
+ 
++	addq	$64, %rsi
++	addq	$64, %rdi
++	subq	$64, %rdx
++	ja	L(L2_L3_aligned_128bytes_loop)
++	jmp	L(loop_tail)
+ 
+ 	.p2align 4
+ L(64bytesormore_loop_end):
+-	add	$16, %rdi
+-	add	$16, %rsi
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytes)
+-
+-	add	$16, %rdi
+-	add	$16, %rsi
+-	ptest	%xmm3, %xmm0
+-	jnc	L(16bytes)
+-
+-	add	$16, %rdi
+-	add	$16, %rsi
+-	ptest	%xmm4, %xmm0
+-	jnc	L(16bytes)
+-
+-	add	$16, %rdi
+-	add	$16, %rsi
+-	jmp	L(16bytes)
+-
+-L(256bytesin256):
+-	add	$256, %rdi
+-	add	$256, %rsi
+-	jmp	L(16bytes)
+-L(240bytesin256):
+-	add	$240, %rdi
+-	add	$240, %rsi
+-	jmp	L(16bytes)
+-L(224bytesin256):
+-	add	$224, %rdi
+-	add	$224, %rsi
+-	jmp	L(16bytes)
+-L(208bytesin256):
+-	add	$208, %rdi
+-	add	$208, %rsi
+-	jmp	L(16bytes)
+-L(192bytesin256):
+-	add	$192, %rdi
+-	add	$192, %rsi
+-	jmp	L(16bytes)
+-L(176bytesin256):
+-	add	$176, %rdi
+-	add	$176, %rsi
+-	jmp	L(16bytes)
+-L(160bytesin256):
+-	add	$160, %rdi
+-	add	$160, %rsi
+-	jmp	L(16bytes)
+-L(144bytesin256):
+-	add	$144, %rdi
+-	add	$144, %rsi
+-	jmp	L(16bytes)
+-L(128bytesin256):
+-	add	$128, %rdi
+-	add	$128, %rsi
+-	jmp	L(16bytes)
+-L(112bytesin256):
+-	add	$112, %rdi
+-	add	$112, %rsi
+-	jmp	L(16bytes)
+-L(96bytesin256):
+-	add	$96, %rdi
+-	add	$96, %rsi
+-	jmp	L(16bytes)
+-L(80bytesin256):
+-	add	$80, %rdi
+-	add	$80, %rsi
+-	jmp	L(16bytes)
+-L(64bytesin256):
+-	add	$64, %rdi
+-	add	$64, %rsi
+-	jmp	L(16bytes)
+-L(48bytesin256):
+-	add	$16, %rdi
+-	add	$16, %rsi
+-L(32bytesin256):
+-	add	$16, %rdi
+-	add	$16, %rsi
+-L(16bytesin256):
+-	add	$16, %rdi
+-	add	$16, %rsi
+-L(16bytes):
+-	mov	-16(%rdi), %rax
+-	mov	-16(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-L(8bytes):
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(12bytes):
+-	mov	-12(%rdi), %rax
+-	mov	-12(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-L(4bytes):
+-	mov	-4(%rsi), %ecx
+-# ifndef USE_AS_WMEMCMP
+-	mov	-4(%rdi), %eax
+-	cmp	%eax, %ecx
+-# else
+-	cmp	-4(%rdi), %ecx
+-# endif
+-	jne	L(diffin4bytes)
+-L(0bytes):
+-	xor	%eax, %eax
+-	ret
+-
+-# ifndef USE_AS_WMEMCMP
+-/* unreal case for wmemcmp */
+-	.p2align 4
+-L(65bytes):
+-	movdqu	-65(%rdi), %xmm1
+-	movdqu	-65(%rsi), %xmm2
+-	mov	$-65, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(49bytes):
+-	movdqu	-49(%rdi), %xmm1
+-	movdqu	-49(%rsi), %xmm2
+-	mov	$-49, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(33bytes):
+-	movdqu	-33(%rdi), %xmm1
+-	movdqu	-33(%rsi), %xmm2
+-	mov	$-33, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(17bytes):
+-	mov	-17(%rdi), %rax
+-	mov	-17(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-L(9bytes):
+-	mov	-9(%rdi), %rax
+-	mov	-9(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	movzbl	-1(%rdi), %eax
+-	movzbl	-1(%rsi), %edx
+-	sub	%edx, %eax
+-	ret
+-
+-	.p2align 4
+-L(13bytes):
+-	mov	-13(%rdi), %rax
+-	mov	-13(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(5bytes):
+-	mov	-5(%rdi), %eax
+-	mov	-5(%rsi), %ecx
+-	cmp	%eax, %ecx
+-	jne	L(diffin4bytes)
+-	movzbl	-1(%rdi), %eax
+-	movzbl	-1(%rsi), %edx
+-	sub	%edx, %eax
+-	ret
+-
+-	.p2align 4
+-L(66bytes):
+-	movdqu	-66(%rdi), %xmm1
+-	movdqu	-66(%rsi), %xmm2
+-	mov	$-66, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(50bytes):
+-	movdqu	-50(%rdi), %xmm1
+-	movdqu	-50(%rsi), %xmm2
+-	mov	$-50, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(34bytes):
+-	movdqu	-34(%rdi), %xmm1
+-	movdqu	-34(%rsi), %xmm2
+-	mov	$-34, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(18bytes):
+-	mov	-18(%rdi), %rax
+-	mov	-18(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-L(10bytes):
+-	mov	-10(%rdi), %rax
+-	mov	-10(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	movzwl	-2(%rdi), %eax
+-	movzwl	-2(%rsi), %ecx
+-	cmp	%cl, %al
+-	jne	L(end)
+-	and	$0xffff, %eax
+-	and	$0xffff, %ecx
+-	sub	%ecx, %eax
+-	ret
+-
+-	.p2align 4
+-L(14bytes):
+-	mov	-14(%rdi), %rax
+-	mov	-14(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(6bytes):
+-	mov	-6(%rdi), %eax
+-	mov	-6(%rsi), %ecx
+-	cmp	%eax, %ecx
+-	jne	L(diffin4bytes)
+-L(2bytes):
+-	movzwl	-2(%rsi), %ecx
+-	movzwl	-2(%rdi), %eax
+-	cmp	%cl, %al
+-	jne	L(end)
+-	and	$0xffff, %eax
+-	and	$0xffff, %ecx
+-	sub	%ecx, %eax
+-	ret
+-
+-	.p2align 4
+-L(67bytes):
+-	movdqu	-67(%rdi), %xmm2
+-	movdqu	-67(%rsi), %xmm1
+-	mov	$-67, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(51bytes):
+-	movdqu	-51(%rdi), %xmm2
+-	movdqu	-51(%rsi), %xmm1
+-	mov	$-51, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(35bytes):
+-	movdqu	-35(%rsi), %xmm1
+-	movdqu	-35(%rdi), %xmm2
+-	mov	$-35, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(19bytes):
+-	mov	-19(%rdi), %rax
+-	mov	-19(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-L(11bytes):
+-	mov	-11(%rdi), %rax
+-	mov	-11(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	mov	-4(%rdi), %eax
+-	mov	-4(%rsi), %ecx
+-	cmp	%eax, %ecx
+-	jne	L(diffin4bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(15bytes):
+-	mov	-15(%rdi), %rax
+-	mov	-15(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(7bytes):
+-	mov	-7(%rdi), %eax
+-	mov	-7(%rsi), %ecx
+-	cmp	%eax, %ecx
+-	jne	L(diffin4bytes)
+-	mov	-4(%rdi), %eax
+-	mov	-4(%rsi), %ecx
+-	cmp	%eax, %ecx
+-	jne	L(diffin4bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(3bytes):
+-	movzwl	-3(%rdi), %eax
+-	movzwl	-3(%rsi), %ecx
+-	cmp	%eax, %ecx
+-	jne	L(diffin2bytes)
+-L(1bytes):
+-	movzbl	-1(%rdi), %eax
+-	movzbl	-1(%rsi), %ecx
+-	sub	%ecx, %eax
+-	ret
+-# endif
+-
+-	.p2align 4
+-L(68bytes):
+-	movdqu	-68(%rdi), %xmm2
+-	movdqu	-68(%rsi), %xmm1
+-	mov	$-68, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(52bytes):
+-	movdqu	-52(%rdi), %xmm2
+-	movdqu	-52(%rsi), %xmm1
+-	mov	$-52, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(36bytes):
+-	movdqu	-36(%rdi), %xmm2
+-	movdqu	-36(%rsi), %xmm1
+-	mov	$-36, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(20bytes):
+-	movdqu	-20(%rdi), %xmm2
+-	movdqu	-20(%rsi), %xmm1
+-	mov	$-20, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-4(%rsi), %ecx
+-
+-# ifndef USE_AS_WMEMCMP
+-	mov	-4(%rdi), %eax
+-	cmp	%eax, %ecx
+-# else
+-	cmp	-4(%rdi), %ecx
+-# endif
+-	jne	L(diffin4bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-# ifndef USE_AS_WMEMCMP
+-/* unreal cases for wmemcmp */
+-	.p2align 4
+-L(69bytes):
+-	movdqu	-69(%rsi), %xmm1
+-	movdqu	-69(%rdi), %xmm2
+-	mov	$-69, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(53bytes):
+-	movdqu	-53(%rsi), %xmm1
+-	movdqu	-53(%rdi), %xmm2
+-	mov	$-53, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(37bytes):
+-	movdqu	-37(%rsi), %xmm1
+-	movdqu	-37(%rdi), %xmm2
+-	mov	$-37, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(21bytes):
+-	movdqu	-21(%rsi), %xmm1
+-	movdqu	-21(%rdi), %xmm2
+-	mov	$-21, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(70bytes):
+-	movdqu	-70(%rsi), %xmm1
+-	movdqu	-70(%rdi), %xmm2
+-	mov	$-70, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(54bytes):
+-	movdqu	-54(%rsi), %xmm1
+-	movdqu	-54(%rdi), %xmm2
+-	mov	$-54, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(38bytes):
+-	movdqu	-38(%rsi), %xmm1
+-	movdqu	-38(%rdi), %xmm2
+-	mov	$-38, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(22bytes):
+-	movdqu	-22(%rsi), %xmm1
+-	movdqu	-22(%rdi), %xmm2
+-	mov	$-22, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(71bytes):
+-	movdqu	-71(%rsi), %xmm1
+-	movdqu	-71(%rdi), %xmm2
+-	mov	$-71, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(55bytes):
+-	movdqu	-55(%rdi), %xmm2
+-	movdqu	-55(%rsi), %xmm1
+-	mov	$-55, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(39bytes):
+-	movdqu	-39(%rdi), %xmm2
+-	movdqu	-39(%rsi), %xmm1
+-	mov	$-39, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(23bytes):
+-	movdqu	-23(%rdi), %xmm2
+-	movdqu	-23(%rsi), %xmm1
+-	mov	$-23, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-# endif
+-
+-	.p2align 4
+-L(72bytes):
+-	movdqu	-72(%rsi), %xmm1
+-	movdqu	-72(%rdi), %xmm2
+-	mov	$-72, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(56bytes):
+-	movdqu	-56(%rdi), %xmm2
+-	movdqu	-56(%rsi), %xmm1
+-	mov	$-56, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(40bytes):
+-	movdqu	-40(%rdi), %xmm2
+-	movdqu	-40(%rsi), %xmm1
+-	mov	$-40, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(24bytes):
+-	movdqu	-24(%rdi), %xmm2
+-	movdqu	-24(%rsi), %xmm1
+-	mov	$-24, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-
+-	mov	-8(%rsi), %rcx
+-	mov	-8(%rdi), %rax
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-# ifndef USE_AS_WMEMCMP
+-/* unreal cases for wmemcmp */
+-	.p2align 4
+-L(73bytes):
+-	movdqu	-73(%rsi), %xmm1
+-	movdqu	-73(%rdi), %xmm2
+-	mov	$-73, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(57bytes):
+-	movdqu	-57(%rdi), %xmm2
+-	movdqu	-57(%rsi), %xmm1
+-	mov	$-57, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(41bytes):
+-	movdqu	-41(%rdi), %xmm2
+-	movdqu	-41(%rsi), %xmm1
+-	mov	$-41, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(25bytes):
+-	movdqu	-25(%rdi), %xmm2
+-	movdqu	-25(%rsi), %xmm1
+-	mov	$-25, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-9(%rdi), %rax
+-	mov	-9(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	movzbl	-1(%rdi), %eax
+-	movzbl	-1(%rsi), %ecx
+-	sub	%ecx, %eax
+-	ret
+-
+-	.p2align 4
+-L(74bytes):
+-	movdqu	-74(%rsi), %xmm1
+-	movdqu	-74(%rdi), %xmm2
+-	mov	$-74, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(58bytes):
+-	movdqu	-58(%rdi), %xmm2
+-	movdqu	-58(%rsi), %xmm1
+-	mov	$-58, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(42bytes):
+-	movdqu	-42(%rdi), %xmm2
+-	movdqu	-42(%rsi), %xmm1
+-	mov	$-42, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(26bytes):
+-	movdqu	-26(%rdi), %xmm2
+-	movdqu	-26(%rsi), %xmm1
+-	mov	$-26, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-10(%rdi), %rax
+-	mov	-10(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	movzwl	-2(%rdi), %eax
+-	movzwl	-2(%rsi), %ecx
+-	jmp	L(diffin2bytes)
+-
+-	.p2align 4
+-L(75bytes):
+-	movdqu	-75(%rsi), %xmm1
+-	movdqu	-75(%rdi), %xmm2
+-	mov	$-75, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(59bytes):
+-	movdqu	-59(%rdi), %xmm2
+-	movdqu	-59(%rsi), %xmm1
+-	mov	$-59, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(43bytes):
+-	movdqu	-43(%rdi), %xmm2
+-	movdqu	-43(%rsi), %xmm1
+-	mov	$-43, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(27bytes):
+-	movdqu	-27(%rdi), %xmm2
+-	movdqu	-27(%rsi), %xmm1
+-	mov	$-27, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-11(%rdi), %rax
+-	mov	-11(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	mov	-4(%rdi), %eax
+-	mov	-4(%rsi), %ecx
+-	cmp	%eax, %ecx
+-	jne	L(diffin4bytes)
+-	xor	%eax, %eax
+-	ret
+-# endif
+-	.p2align 4
+-L(76bytes):
+-	movdqu	-76(%rsi), %xmm1
+-	movdqu	-76(%rdi), %xmm2
+-	mov	$-76, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(60bytes):
+-	movdqu	-60(%rdi), %xmm2
+-	movdqu	-60(%rsi), %xmm1
+-	mov	$-60, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(44bytes):
+-	movdqu	-44(%rdi), %xmm2
+-	movdqu	-44(%rsi), %xmm1
+-	mov	$-44, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(28bytes):
+-	movdqu	-28(%rdi), %xmm2
+-	movdqu	-28(%rsi), %xmm1
+-	mov	$-28, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-12(%rdi), %rax
+-	mov	-12(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	mov	-4(%rsi), %ecx
+-# ifndef USE_AS_WMEMCMP
+-	mov	-4(%rdi), %eax
+-	cmp	%eax, %ecx
+-# else
+-	cmp	-4(%rdi), %ecx
+-# endif
+-	jne	L(diffin4bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-# ifndef USE_AS_WMEMCMP
+-/* unreal cases for wmemcmp */
+-	.p2align 4
+-L(77bytes):
+-	movdqu	-77(%rsi), %xmm1
+-	movdqu	-77(%rdi), %xmm2
+-	mov	$-77, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(61bytes):
+-	movdqu	-61(%rdi), %xmm2
+-	movdqu	-61(%rsi), %xmm1
+-	mov	$-61, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(45bytes):
+-	movdqu	-45(%rdi), %xmm2
+-	movdqu	-45(%rsi), %xmm1
+-	mov	$-45, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(29bytes):
+-	movdqu	-29(%rdi), %xmm2
+-	movdqu	-29(%rsi), %xmm1
+-	mov	$-29, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-
+-	mov	-13(%rdi), %rax
+-	mov	-13(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(78bytes):
+-	movdqu	-78(%rsi), %xmm1
+-	movdqu	-78(%rdi), %xmm2
+-	mov	$-78, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(62bytes):
+-	movdqu	-62(%rdi), %xmm2
+-	movdqu	-62(%rsi), %xmm1
+-	mov	$-62, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(46bytes):
+-	movdqu	-46(%rdi), %xmm2
+-	movdqu	-46(%rsi), %xmm1
+-	mov	$-46, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(30bytes):
+-	movdqu	-30(%rdi), %xmm2
+-	movdqu	-30(%rsi), %xmm1
+-	mov	$-30, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-14(%rdi), %rax
+-	mov	-14(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(79bytes):
+-	movdqu	-79(%rsi), %xmm1
+-	movdqu	-79(%rdi), %xmm2
+-	mov	$-79, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(63bytes):
+-	movdqu	-63(%rdi), %xmm2
+-	movdqu	-63(%rsi), %xmm1
+-	mov	$-63, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(47bytes):
+-	movdqu	-47(%rdi), %xmm2
+-	movdqu	-47(%rsi), %xmm1
+-	mov	$-47, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(31bytes):
+-	movdqu	-31(%rdi), %xmm2
+-	movdqu	-31(%rsi), %xmm1
+-	mov	$-31, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-15(%rdi), %rax
+-	mov	-15(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-# endif
+-	.p2align 4
+-L(64bytes):
+-	movdqu	-64(%rdi), %xmm2
+-	movdqu	-64(%rsi), %xmm1
+-	mov	$-64, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(48bytes):
+-	movdqu	-48(%rdi), %xmm2
+-	movdqu	-48(%rsi), %xmm1
+-	mov	$-48, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(32bytes):
+-	movdqu	-32(%rdi), %xmm2
+-	movdqu	-32(%rsi), %xmm1
+-	mov	$-32, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-
+-	mov	-16(%rdi), %rax
+-	mov	-16(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-/*
+- * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block.
+- */
+-	.p2align 3
+-L(less16bytes):
+-	movsbq	%dl, %rdx
+-	mov	(%rsi, %rdx), %rcx
+-	mov	(%rdi, %rdx), %rax
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	mov	8(%rsi, %rdx), %rcx
+-	mov	8(%rdi, %rdx), %rax
+-L(diffin8bytes):
+-	cmp	%eax, %ecx
+-	jne	L(diffin4bytes)
+-	shr	$32, %rcx
+-	shr	$32, %rax
+-
++	pmovmskb %xmm0, %ecx
++	incw	%cx
++	jnz	L(loop_end_ret)
++
++	pmovmskb %xmm1, %ecx
++	notw	%cx
++	sall	$16, %ecx
++	jnz	L(loop_end_ret)
++
++	pmovmskb %xmm2, %ecx
++	notw	%cx
++	shlq	$32, %rcx
++	jnz	L(loop_end_ret)
++
++	addq	$48, %rdi
++	addq	$48, %rsi
++	movq	%rax, %rcx
++
++	.p2align 4,, 6
++L(loop_end_ret):
++	bsfq	%rcx, %rcx
+ # ifdef USE_AS_WMEMCMP
+-/* for wmemcmp */
+-	cmp	%eax, %ecx
+-	jne	L(diffin4bytes)
+-	xor	%eax, %eax
+-	ret
+-# endif
+-
+-L(diffin4bytes):
+-# ifndef USE_AS_WMEMCMP
+-	cmp	%cx, %ax
+-	jne	L(diffin2bytes)
+-	shr	$16, %ecx
+-	shr	$16, %eax
+-L(diffin2bytes):
+-	cmp	%cl, %al
+-	jne	L(end)
+-	and	$0xffff, %eax
+-	and	$0xffff, %ecx
+-	sub	%ecx, %eax
+-	ret
+-
+-	.p2align 4
+-L(end):
+-	and	$0xff, %eax
+-	and	$0xff, %ecx
+-	sub	%ecx, %eax
+-	ret
++	movl	(%rdi, %rcx), %eax
++	xorl	%edx, %edx
++	cmpl	(%rsi, %rcx), %eax
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
+ # else
+-
+-/* for wmemcmp */
+-	mov	$1, %eax
+-	jl	L(nequal_bigger)
+-	neg	%eax
+-	ret
+-
+-	.p2align 4
+-L(nequal_bigger):
+-	ret
+-
+-L(unreal_case):
+-	xor	%eax, %eax
+-	ret
++	movzbl	(%rdi, %rcx), %eax
++	movzbl	(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
+ # endif
+-
++	ret
+ END (MEMCMP)
+-
+-	.section .rodata.sse4.1,"a",@progbits
+-	.p2align 3
+-# ifndef USE_AS_WMEMCMP
+-L(table_64bytes):
+-	.int	JMPTBL (L(0bytes), L(table_64bytes))
+-	.int	JMPTBL (L(1bytes), L(table_64bytes))
+-	.int	JMPTBL (L(2bytes), L(table_64bytes))
+-	.int	JMPTBL (L(3bytes), L(table_64bytes))
+-	.int	JMPTBL (L(4bytes), L(table_64bytes))
+-	.int	JMPTBL (L(5bytes), L(table_64bytes))
+-	.int	JMPTBL (L(6bytes), L(table_64bytes))
+-	.int	JMPTBL (L(7bytes), L(table_64bytes))
+-	.int	JMPTBL (L(8bytes), L(table_64bytes))
+-	.int	JMPTBL (L(9bytes), L(table_64bytes))
+-	.int	JMPTBL (L(10bytes), L(table_64bytes))
+-	.int	JMPTBL (L(11bytes), L(table_64bytes))
+-	.int	JMPTBL (L(12bytes), L(table_64bytes))
+-	.int	JMPTBL (L(13bytes), L(table_64bytes))
+-	.int	JMPTBL (L(14bytes), L(table_64bytes))
+-	.int	JMPTBL (L(15bytes), L(table_64bytes))
+-	.int	JMPTBL (L(16bytes), L(table_64bytes))
+-	.int	JMPTBL (L(17bytes), L(table_64bytes))
+-	.int	JMPTBL (L(18bytes), L(table_64bytes))
+-	.int	JMPTBL (L(19bytes), L(table_64bytes))
+-	.int	JMPTBL (L(20bytes), L(table_64bytes))
+-	.int	JMPTBL (L(21bytes), L(table_64bytes))
+-	.int	JMPTBL (L(22bytes), L(table_64bytes))
+-	.int	JMPTBL (L(23bytes), L(table_64bytes))
+-	.int	JMPTBL (L(24bytes), L(table_64bytes))
+-	.int	JMPTBL (L(25bytes), L(table_64bytes))
+-	.int	JMPTBL (L(26bytes), L(table_64bytes))
+-	.int	JMPTBL (L(27bytes), L(table_64bytes))
+-	.int	JMPTBL (L(28bytes), L(table_64bytes))
+-	.int	JMPTBL (L(29bytes), L(table_64bytes))
+-	.int	JMPTBL (L(30bytes), L(table_64bytes))
+-	.int	JMPTBL (L(31bytes), L(table_64bytes))
+-	.int	JMPTBL (L(32bytes), L(table_64bytes))
+-	.int	JMPTBL (L(33bytes), L(table_64bytes))
+-	.int	JMPTBL (L(34bytes), L(table_64bytes))
+-	.int	JMPTBL (L(35bytes), L(table_64bytes))
+-	.int	JMPTBL (L(36bytes), L(table_64bytes))
+-	.int	JMPTBL (L(37bytes), L(table_64bytes))
+-	.int	JMPTBL (L(38bytes), L(table_64bytes))
+-	.int	JMPTBL (L(39bytes), L(table_64bytes))
+-	.int	JMPTBL (L(40bytes), L(table_64bytes))
+-	.int	JMPTBL (L(41bytes), L(table_64bytes))
+-	.int	JMPTBL (L(42bytes), L(table_64bytes))
+-	.int	JMPTBL (L(43bytes), L(table_64bytes))
+-	.int	JMPTBL (L(44bytes), L(table_64bytes))
+-	.int	JMPTBL (L(45bytes), L(table_64bytes))
+-	.int	JMPTBL (L(46bytes), L(table_64bytes))
+-	.int	JMPTBL (L(47bytes), L(table_64bytes))
+-	.int	JMPTBL (L(48bytes), L(table_64bytes))
+-	.int	JMPTBL (L(49bytes), L(table_64bytes))
+-	.int	JMPTBL (L(50bytes), L(table_64bytes))
+-	.int	JMPTBL (L(51bytes), L(table_64bytes))
+-	.int	JMPTBL (L(52bytes), L(table_64bytes))
+-	.int	JMPTBL (L(53bytes), L(table_64bytes))
+-	.int	JMPTBL (L(54bytes), L(table_64bytes))
+-	.int	JMPTBL (L(55bytes), L(table_64bytes))
+-	.int	JMPTBL (L(56bytes), L(table_64bytes))
+-	.int	JMPTBL (L(57bytes), L(table_64bytes))
+-	.int	JMPTBL (L(58bytes), L(table_64bytes))
+-	.int	JMPTBL (L(59bytes), L(table_64bytes))
+-	.int	JMPTBL (L(60bytes), L(table_64bytes))
+-	.int	JMPTBL (L(61bytes), L(table_64bytes))
+-	.int	JMPTBL (L(62bytes), L(table_64bytes))
+-	.int	JMPTBL (L(63bytes), L(table_64bytes))
+-	.int	JMPTBL (L(64bytes), L(table_64bytes))
+-	.int	JMPTBL (L(65bytes), L(table_64bytes))
+-	.int	JMPTBL (L(66bytes), L(table_64bytes))
+-	.int	JMPTBL (L(67bytes), L(table_64bytes))
+-	.int	JMPTBL (L(68bytes), L(table_64bytes))
+-	.int	JMPTBL (L(69bytes), L(table_64bytes))
+-	.int	JMPTBL (L(70bytes), L(table_64bytes))
+-	.int	JMPTBL (L(71bytes), L(table_64bytes))
+-	.int	JMPTBL (L(72bytes), L(table_64bytes))
+-	.int	JMPTBL (L(73bytes), L(table_64bytes))
+-	.int	JMPTBL (L(74bytes), L(table_64bytes))
+-	.int	JMPTBL (L(75bytes), L(table_64bytes))
+-	.int	JMPTBL (L(76bytes), L(table_64bytes))
+-	.int	JMPTBL (L(77bytes), L(table_64bytes))
+-	.int	JMPTBL (L(78bytes), L(table_64bytes))
+-	.int	JMPTBL (L(79bytes), L(table_64bytes))
+-# else
+-L(table_64bytes):
+-	.int	JMPTBL (L(0bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(4bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(8bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(12bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(16bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(20bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(24bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(28bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(32bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(36bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(40bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(44bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(48bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(52bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(56bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(60bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(64bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(68bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(72bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(76bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-# endif
+ #endif
+-- 
+GitLab
+
diff --git a/SOURCES/ia-string-funct-test-rtm.patch b/SOURCES/ia-string-funct-test-rtm.patch
new file mode 100644
index 0000000..a0675d9
--- /dev/null
+++ b/SOURCES/ia-string-funct-test-rtm.patch
@@ -0,0 +1,736 @@
+From 294c22ed9aa3f576f58b9af69c9a34225fcf02aa Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Tue, 23 Feb 2021 06:33:10 -0800
+Subject: [PATCH] x86: Add string/memory function tests in RTM region
+
+At function exit, AVX optimized string/memory functions have VZEROUPPER
+which triggers RTM abort.   When such functions are called inside a
+transactionally executing RTM region, RTM abort causes severe performance
+degradation.  Add tests to verify that string/memory functions won't
+cause RTM abort in RTM region.
+
+(cherry picked from commit 4bd660be40967cd69072f69ebc2ad32bfcc1f206)
+---
+ sysdeps/x86/Makefile          | 23 +++++++++++
+ sysdeps/x86/tst-memchr-rtm.c  | 54 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-memcmp-rtm.c  | 52 +++++++++++++++++++++++++
+ sysdeps/x86/tst-memmove-rtm.c | 53 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-memrchr-rtm.c | 54 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-memset-rtm.c  | 45 ++++++++++++++++++++++
+ sysdeps/x86/tst-strchr-rtm.c  | 54 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-strcpy-rtm.c  | 53 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-string-rtm.h  | 72 +++++++++++++++++++++++++++++++++++
+ sysdeps/x86/tst-strlen-rtm.c  | 53 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-strncmp-rtm.c | 52 +++++++++++++++++++++++++
+ sysdeps/x86/tst-strrchr-rtm.c | 53 ++++++++++++++++++++++++++
+ 12 files changed, 618 insertions(+)
+ create mode 100644 sysdeps/x86/tst-memchr-rtm.c
+ create mode 100644 sysdeps/x86/tst-memcmp-rtm.c
+ create mode 100644 sysdeps/x86/tst-memmove-rtm.c
+ create mode 100644 sysdeps/x86/tst-memrchr-rtm.c
+ create mode 100644 sysdeps/x86/tst-memset-rtm.c
+ create mode 100644 sysdeps/x86/tst-strchr-rtm.c
+ create mode 100644 sysdeps/x86/tst-strcpy-rtm.c
+ create mode 100644 sysdeps/x86/tst-string-rtm.h
+ create mode 100644 sysdeps/x86/tst-strlen-rtm.c
+ create mode 100644 sysdeps/x86/tst-strncmp-rtm.c
+ create mode 100644 sysdeps/x86/tst-strrchr-rtm.c
+
+diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
+index 59e928e9..5be71ada 100644
+--- a/sysdeps/x86/Makefile
++++ b/sysdeps/x86/Makefile
+@@ -17,6 +17,29 @@ endif
+ 
+ ifeq ($(subdir),string)
+ sysdep_routines += cacheinfo
++
++tests += \
++  tst-memchr-rtm \
++  tst-memcmp-rtm \
++  tst-memmove-rtm \
++  tst-memrchr-rtm \
++  tst-memset-rtm \
++  tst-strchr-rtm \
++  tst-strcpy-rtm \
++  tst-strlen-rtm \
++  tst-strncmp-rtm \
++  tst-strrchr-rtm
++
++CFLAGS-tst-memchr-rtm.c += -mrtm
++CFLAGS-tst-memcmp-rtm.c += -mrtm
++CFLAGS-tst-memmove-rtm.c += -mrtm
++CFLAGS-tst-memrchr-rtm.c += -mrtm
++CFLAGS-tst-memset-rtm.c += -mrtm
++CFLAGS-tst-strchr-rtm.c += -mrtm
++CFLAGS-tst-strcpy-rtm.c += -mrtm
++CFLAGS-tst-strlen-rtm.c += -mrtm
++CFLAGS-tst-strncmp-rtm.c += -mrtm
++CFLAGS-tst-strrchr-rtm.c += -mrtm
+ endif
+ 
+ ifneq ($(enable-cet),no)
+diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c
+new file mode 100644
+index 00000000..e4749401
+--- /dev/null
++++ b/sysdeps/x86/tst-memchr-rtm.c
+@@ -0,0 +1,54 @@
++/* Test case for memchr inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE);
++  string1[100] = 'c';
++  string1[STRING_SIZE - 100] = 'c';
++  char *p = memchr (string1, 'c', STRING_SIZE);
++  if (p == &string1[100])
++    return EXIT_SUCCESS;
++  else
++    return EXIT_FAILURE;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  char *p = memchr (string1, 'c', STRING_SIZE);
++  if (p == &string1[100])
++    return 0;
++  else
++    return 1;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("memchr", LOOP, prepare, function);
++}
+diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c
+new file mode 100644
+index 00000000..e4c8a623
+--- /dev/null
++++ b/sysdeps/x86/tst-memcmp-rtm.c
+@@ -0,0 +1,52 @@
++/* Test case for memcmp inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++char string2[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE);
++  memset (string2, 'a', STRING_SIZE);
++  if (memcmp (string1, string2, STRING_SIZE) == 0)
++    return EXIT_SUCCESS;
++  else
++    return EXIT_FAILURE;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  if (memcmp (string1, string2, STRING_SIZE) == 0)
++    return 0;
++  else
++    return 1;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("memcmp", LOOP, prepare, function);
++}
+diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c
+new file mode 100644
+index 00000000..4bf97ef1
+--- /dev/null
++++ b/sysdeps/x86/tst-memmove-rtm.c
+@@ -0,0 +1,53 @@
++/* Test case for memmove inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++char string2[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE);
++  if (memmove (string2, string1, STRING_SIZE) == string2
++      && memcmp (string2, string1, STRING_SIZE) == 0)
++    return EXIT_SUCCESS;
++  else
++    return EXIT_FAILURE;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  if (memmove (string2, string1, STRING_SIZE) == string2
++      && memcmp (string2, string1, STRING_SIZE) == 0)
++    return 0;
++  else
++    return 1;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("memmove", LOOP, prepare, function);
++}
+diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c
+new file mode 100644
+index 00000000..a57a5a8e
+--- /dev/null
++++ b/sysdeps/x86/tst-memrchr-rtm.c
+@@ -0,0 +1,54 @@
++/* Test case for memrchr inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE);
++  string1[100] = 'c';
++  string1[STRING_SIZE - 100] = 'c';
++  char *p = memrchr (string1, 'c', STRING_SIZE);
++  if (p == &string1[STRING_SIZE - 100])
++    return EXIT_SUCCESS;
++  else
++    return EXIT_FAILURE;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  char *p = memrchr (string1, 'c', STRING_SIZE);
++  if (p == &string1[STRING_SIZE - 100])
++    return 0;
++  else
++    return 1;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("memrchr", LOOP, prepare, function);
++}
+diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c
+new file mode 100644
+index 00000000..bf343a4d
+--- /dev/null
++++ b/sysdeps/x86/tst-memset-rtm.c
+@@ -0,0 +1,45 @@
++/* Test case for memset inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE);
++  return EXIT_SUCCESS;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  memset (string1, 'a', STRING_SIZE);
++  return 0;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("memset", LOOP, prepare, function);
++}
+diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c
+new file mode 100644
+index 00000000..a82e29c0
+--- /dev/null
++++ b/sysdeps/x86/tst-strchr-rtm.c
+@@ -0,0 +1,54 @@
++/* Test case for strchr inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE - 1);
++  string1[100] = 'c';
++  string1[STRING_SIZE - 100] = 'c';
++  char *p = strchr (string1, 'c');
++  if (p == &string1[100])
++    return EXIT_SUCCESS;
++  else
++    return EXIT_FAILURE;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  char *p = strchr (string1, 'c');
++  if (p == &string1[100])
++    return 0;
++  else
++    return 1;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("strchr", LOOP, prepare, function);
++}
+diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c
+new file mode 100644
+index 00000000..2b2a583f
+--- /dev/null
++++ b/sysdeps/x86/tst-strcpy-rtm.c
+@@ -0,0 +1,53 @@
++/* Test case for strcpy inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++char string2[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE - 1);
++  if (strcpy (string2, string1) == string2
++      && strcmp (string2, string1) == 0)
++    return EXIT_SUCCESS;
++  else
++    return EXIT_FAILURE;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  if (strcpy (string2, string1) == string2
++      && strcmp (string2, string1) == 0)
++    return 0;
++  else
++    return 1;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("strcpy", LOOP, prepare, function);
++}
+diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h
+new file mode 100644
+index 00000000..d2470afa
+--- /dev/null
++++ b/sysdeps/x86/tst-string-rtm.h
+@@ -0,0 +1,72 @@
++/* Test string function in a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <string.h>
++#include <x86intrin.h>
++#include <sys/platform/x86.h>
++#include <support/check.h>
++#include <support/test-driver.h>
++
++static int
++do_test_1 (const char *name, unsigned int loop, int (*prepare) (void),
++	   int (*function) (void))
++{
++  if (!CPU_FEATURE_USABLE (RTM))
++    return EXIT_UNSUPPORTED;
++
++  int status = prepare ();
++  if (status != EXIT_SUCCESS)
++    return status;
++
++  unsigned int i;
++  unsigned int naborts = 0;
++  unsigned int failed = 0;
++  for (i = 0; i < loop; i++)
++    {
++      failed |= function ();
++      if (_xbegin() == _XBEGIN_STARTED)
++	{
++	  failed |= function ();
++	  _xend();
++	}
++      else
++	{
++	  failed |= function ();
++	  ++naborts;
++	}
++    }
++
++  if (failed)
++    FAIL_EXIT1 ("%s() failed", name);
++
++  if (naborts)
++    {
++      /* NB: Low single digit (<= 5%) noise-level aborts are normal for
++	 TSX.  */
++      double rate = 100 * ((double) naborts) / ((double) loop);
++      if (rate > 5)
++	FAIL_EXIT1 ("TSX abort rate: %.2f%% (%d out of %d)",
++		    rate, naborts, loop);
++    }
++
++  return EXIT_SUCCESS;
++}
++
++static int do_test (void);
++
++#include <support/test-driver.c>
+diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c
+new file mode 100644
+index 00000000..0dcf14db
+--- /dev/null
++++ b/sysdeps/x86/tst-strlen-rtm.c
+@@ -0,0 +1,53 @@
++/* Test case for strlen inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE - 1);
++  string1[STRING_SIZE - 100] = '\0';
++  size_t len = strlen (string1);
++  if (len == STRING_SIZE - 100)
++    return EXIT_SUCCESS;
++  else
++    return EXIT_FAILURE;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  size_t len = strlen (string1);
++  if (len == STRING_SIZE - 100)
++    return 0;
++  else
++    return 1;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("strlen", LOOP, prepare, function);
++}
+diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
+new file mode 100644
+index 00000000..236ad951
+--- /dev/null
++++ b/sysdeps/x86/tst-strncmp-rtm.c
+@@ -0,0 +1,52 @@
++/* Test case for strncmp inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++char string2[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE - 1);
++  memset (string2, 'a', STRING_SIZE - 1);
++  if (strncmp (string1, string2, STRING_SIZE) == 0)
++    return EXIT_SUCCESS;
++  else
++    return EXIT_FAILURE;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  if (strncmp (string1, string2, STRING_SIZE) == 0)
++    return 0;
++  else
++    return 1;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("strncmp", LOOP, prepare, function);
++}
+diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c
+new file mode 100644
+index 00000000..e32bfaf5
+--- /dev/null
++++ b/sysdeps/x86/tst-strrchr-rtm.c
+@@ -0,0 +1,53 @@
++/* Test case for strrchr inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE - 1);
++  string1[STRING_SIZE - 100] = 'c';
++  char *p = strrchr (string1, 'c');
++  if (p == &string1[STRING_SIZE - 100])
++    return EXIT_SUCCESS;
++  else
++    return EXIT_FAILURE;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  char *p = strrchr (string1, 'c');
++  if (p == &string1[STRING_SIZE - 100])
++    return 0;
++  else
++    return 1;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("strrchr", LOOP, prepare, function);
++}
+-- 
+GitLab
+
diff --git a/SOURCES/ia-strlen-opt-avx2.patch b/SOURCES/ia-strlen-opt-avx2.patch
new file mode 100644
index 0000000..b4f2fa6
--- /dev/null
+++ b/SOURCES/ia-strlen-opt-avx2.patch
@@ -0,0 +1,767 @@
+From f97e72cd3b822c04bfdda110dcf252b19afd2bcd Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 2 Mar 2022 14:29:44 -0800
+Subject: [PATCH] x86: Optimize strlen-avx2.S
+
+No bug. This commit optimizes strlen-avx2.S. The optimizations are
+mostly small things but they add up to roughly 10-30% performance
+improvement for strlen. The results for strnlen are a bit more
+ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
+are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+(cherry picked from commit aaa23c35071537e2dcf5807e956802ed215210aa)
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c |  16 +-
+ sysdeps/x86_64/multiarch/strlen-avx2.S     | 532 +++++++++++++--------
+ 2 files changed, 334 insertions(+), 214 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index cbfc1a5d..f1a6460a 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -285,10 +285,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
+   IFUNC_IMPL (i, name, strlen,
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+-			      CPU_FEATURE_USABLE (AVX2),
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strlen_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+@@ -301,10 +303,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
+   IFUNC_IMPL (i, name, strnlen,
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+-			      CPU_FEATURE_USABLE (AVX2),
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strnlen_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strnlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+@@ -640,10 +644,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
+   IFUNC_IMPL (i, name, wcslen,
+ 	      IFUNC_IMPL_ADD (array, i, wcslen,
+-			      CPU_FEATURE_USABLE (AVX2),
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcslen_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wcslen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen,
+@@ -656,10 +662,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
+   IFUNC_IMPL (i, name, wcsnlen,
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+-			      CPU_FEATURE_USABLE (AVX2),
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcsnlen_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wcsnlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
+index 82826e10..be8a5db5 100644
+--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
+@@ -27,9 +27,11 @@
+ # ifdef USE_AS_WCSLEN
+ #  define VPCMPEQ	vpcmpeqd
+ #  define VPMINU	vpminud
++#  define CHAR_SIZE	4
+ # else
+ #  define VPCMPEQ	vpcmpeqb
+ #  define VPMINU	vpminub
++#  define CHAR_SIZE	1
+ # endif
+ 
+ # ifndef VZEROUPPER
+@@ -41,349 +43,459 @@
+ # endif
+ 
+ # define VEC_SIZE 32
++# define PAGE_SIZE 4096
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRLEN)
+ # ifdef USE_AS_STRNLEN
+-	/* Check for zero length.  */
++	/* Check zero length.  */
+ 	test	%RSI_LP, %RSI_LP
+ 	jz	L(zero)
++	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
++	mov	%RSI_LP, %R8_LP
+ #  ifdef USE_AS_WCSLEN
+ 	shl	$2, %RSI_LP
+ #  elif defined __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%esi, %esi
+ #  endif
+-	mov	%RSI_LP, %R8_LP
+ # endif
+-	movl	%edi, %ecx
++	movl	%edi, %eax
+ 	movq	%rdi, %rdx
+ 	vpxor	%xmm0, %xmm0, %xmm0
+-
++	/* Clear high bits from edi. Only keeping bits relevant to page
++	   cross check.  */
++	andl	$(PAGE_SIZE - 1), %eax
+ 	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-
++	VPCMPEQ	(%rdi), %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
+ # ifdef USE_AS_STRNLEN
+-	jnz	L(first_vec_x0_check)
+-	/* Adjust length and check the end of data.  */
+-	subq	$VEC_SIZE, %rsi
+-	jbe	L(max)
+-# else
+-	jnz	L(first_vec_x0)
++	/* If length < VEC_SIZE handle special.  */
++	cmpq	$VEC_SIZE, %rsi
++	jbe	L(first_vec_x0)
+ # endif
+-
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
++	/* If no match, continue to aligned_more. Otherwise return bit
++	   position of first match.  */
++	testl	%eax, %eax
++	jz	L(aligned_more)
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	shrl	$2, %eax
++# endif
++	VZEROUPPER_RETURN
+ 
+ # ifdef USE_AS_STRNLEN
+-	/* Adjust length.  */
+-	addq	%rcx, %rsi
++L(zero):
++	xorl	%eax, %eax
++	ret
+ 
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
++	.p2align 4
++L(first_vec_x0):
++	/* Set bit for max len so that tzcnt will return min of max len
++	   and position of first match.  */
++	btsq	%rsi, %rax
++	tzcntl	%eax, %eax
++#  ifdef USE_AS_WCSLEN
++	shrl	$2, %eax
++#  endif
++	VZEROUPPER_RETURN
+ # endif
+-	jmp	L(more_4x_vec)
+ 
+ 	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	/* Remove the leading bytes.  */
+-	sarl	%cl, %eax
+-	testl	%eax, %eax
+-	jz	L(aligned_more)
++L(first_vec_x1):
+ 	tzcntl	%eax, %eax
++	/* Safe to use 32 bit instructions as these are only called for
++	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
++	/* Use ecx which was computed earlier to compute correct value.
++	 */
++	subl	$(VEC_SIZE * 4 + 1), %ecx
++	addl	%ecx, %eax
++# else
++	subl	%edx, %edi
++	incl	%edi
++	addl	%edi, %eax
+ # endif
+-	addq	%rdi, %rax
+-	addq	%rcx, %rax
+-	subq	%rdx, %rax
+ # ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
++	shrl	$2, %eax
+ # endif
+-L(return_vzeroupper):
+-	ZERO_UPPER_VEC_REGISTERS_RETURN
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(aligned_more):
++L(first_vec_x2):
++	tzcntl	%eax, %eax
++	/* Safe to use 32 bit instructions as these are only called for
++	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
+-	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
+-	    to void possible addition overflow.  */
+-	negq	%rcx
+-	addq	$VEC_SIZE, %rcx
+-
+-	/* Check the end of data.  */
+-	subq	%rcx, %rsi
+-	jbe	L(max)
++	/* Use ecx which was computed earlier to compute correct value.
++	 */
++	subl	$(VEC_SIZE * 3 + 1), %ecx
++	addl	%ecx, %eax
++# else
++	subl	%edx, %edi
++	addl	$(VEC_SIZE + 1), %edi
++	addl	%edi, %eax
+ # endif
++# ifdef USE_AS_WCSLEN
++	shrl	$2, %eax
++# endif
++	VZEROUPPER_RETURN
+ 
+-	addq	$VEC_SIZE, %rdi
++	.p2align 4
++L(first_vec_x3):
++	tzcntl	%eax, %eax
++	/* Safe to use 32 bit instructions as these are only called for
++	   size = [1, 159].  */
++# ifdef USE_AS_STRNLEN
++	/* Use ecx which was computed earlier to compute correct value.
++	 */
++	subl	$(VEC_SIZE * 2 + 1), %ecx
++	addl	%ecx, %eax
++# else
++	subl	%edx, %edi
++	addl	$(VEC_SIZE * 2 + 1), %edi
++	addl	%edi, %eax
++# endif
++# ifdef USE_AS_WCSLEN
++	shrl	$2, %eax
++# endif
++	VZEROUPPER_RETURN
+ 
++	.p2align 4
++L(first_vec_x4):
++	tzcntl	%eax, %eax
++	/* Safe to use 32 bit instructions as these are only called for
++	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
++	/* Use ecx which was computed earlier to compute correct value.
++	 */
++	subl	$(VEC_SIZE + 1), %ecx
++	addl	%ecx, %eax
++# else
++	subl	%edx, %edi
++	addl	$(VEC_SIZE * 3 + 1), %edi
++	addl	%edi, %eax
+ # endif
++# ifdef USE_AS_WCSLEN
++	shrl	$2, %eax
++# endif
++	VZEROUPPER_RETURN
+ 
+-L(more_4x_vec):
++	.p2align 5
++L(aligned_more):
++	/* Align data to VEC_SIZE - 1. This is the same number of
++	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
++	   code on the x4 check.  */
++	orq	$(VEC_SIZE - 1), %rdi
++L(cross_page_continue):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
++# ifdef USE_AS_STRNLEN
++	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
++	   it simplifies the logic in last_4x_vec_or_less.  */
++	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
++	subq	%rdx, %rcx
++# endif
++	/* Load first VEC regardless.  */
++	VPCMPEQ	1(%rdi), %ymm0, %ymm1
++# ifdef USE_AS_STRNLEN
++	/* Adjust length. If near end handle specially.  */
++	subq	%rcx, %rsi
++	jb	L(last_4x_vec_or_less)
++# endif
++	vpmovmskb	%ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
++	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
++	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-# ifdef USE_AS_STRNLEN
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+-
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
+-	andq	$-(4 * VEC_SIZE), %rdi
++	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x4)
+ 
++	/* Align data to VEC_SIZE * 4 - 1.  */
+ # ifdef USE_AS_STRNLEN
+-	/* Adjust length.  */
++	/* Before adjusting length check if at last VEC_SIZE * 4.  */
++	cmpq	$(VEC_SIZE * 4 - 1), %rsi
++	jbe	L(last_4x_vec_or_less_load)
++	incq	%rdi
++	movl	%edi, %ecx
++	orq	$(VEC_SIZE * 4 - 1), %rdi
++	andl	$(VEC_SIZE * 4 - 1), %ecx
++	/* Readjust length.  */
+ 	addq	%rcx, %rsi
++# else
++	incq	%rdi
++	orq	$(VEC_SIZE * 4 - 1), %rdi
+ # endif
+-
++	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	vmovdqa (%rdi), %ymm1
+-	vmovdqa	VEC_SIZE(%rdi), %ymm2
+-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
+-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
+-	VPMINU	%ymm1, %ymm2, %ymm5
+-	VPMINU	%ymm3, %ymm4, %ymm6
+-	VPMINU	%ymm5, %ymm6, %ymm5
+-
+-	VPCMPEQ	%ymm5, %ymm0, %ymm5
+-	vpmovmskb %ymm5, %eax
+-	testl	%eax, %eax
+-	jnz	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-# ifndef USE_AS_STRNLEN
+-	jmp	L(loop_4x_vec)
+-# else
++# ifdef USE_AS_STRNLEN
++	/* Break if at end of length.  */
+ 	subq	$(VEC_SIZE * 4), %rsi
+-	ja	L(loop_4x_vec)
+-
+-L(last_4x_vec_or_less):
+-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+-	addl	$(VEC_SIZE * 2), %esi
+-	jle	L(last_2x_vec)
++	jb	L(last_4x_vec_or_less_cmpeq)
++# endif
++	/* Save some code size by microfusing VPMINU with the load. Since
++	   the matches in ymm2/ymm4 can only be returned if there were no
++	   matches in ymm1/ymm3 respectively, there is no issue with overlap.
++	 */
++	vmovdqa	1(%rdi), %ymm1
++	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
++	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
++	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
++
++	VPMINU	%ymm2, %ymm4, %ymm5
++	VPCMPEQ	%ymm5, %ymm0, %ymm5
++	vpmovmskb	%ymm5, %ecx
+ 
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
++	subq	$-(VEC_SIZE * 4), %rdi
++	testl	%ecx, %ecx
++	jz	L(loop_4x_vec)
+ 
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+ 
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
++	VPCMPEQ	%ymm1, %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
++	subq	%rdx, %rdi
+ 	testl	%eax, %eax
++	jnz	L(last_vec_return_x0)
+ 
+-	jnz	L(first_vec_x2_check)
+-	subl	$VEC_SIZE, %esi
+-	jle	L(max)
+-
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
++	VPCMPEQ	%ymm2, %ymm0, %ymm2
++	vpmovmskb	%ymm2, %eax
+ 	testl	%eax, %eax
+-
+-	jnz	L(first_vec_x3_check)
+-	movq	%r8, %rax
+-#  ifdef USE_AS_WCSLEN
++	jnz	L(last_vec_return_x1)
++
++	/* Combine last 2 VEC.  */
++	VPCMPEQ	%ymm3, %ymm0, %ymm3
++	vpmovmskb	%ymm3, %eax
++	/* rcx has the combined result from all 4 VEC. It will only be used
++	   if none of the other 3 VEC contain a match.  */
++	salq	$32, %rcx
++	orq	%rcx, %rax
++	tzcntq	%rax, %rax
++	subq	$(VEC_SIZE * 2 - 1), %rdi
++	addq	%rdi, %rax
++# ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-#  endif
++# endif
+ 	VZEROUPPER_RETURN
+ 
++
++# ifdef USE_AS_STRNLEN
+ 	.p2align 4
+-L(last_2x_vec):
+-	addl	$(VEC_SIZE * 2), %esi
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
++L(last_4x_vec_or_less_load):
++	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
++	subq	$-(VEC_SIZE * 4), %rdi
++L(last_4x_vec_or_less_cmpeq):
++	VPCMPEQ	1(%rdi), %ymm0, %ymm1
++L(last_4x_vec_or_less):
+ 
+-	jnz	L(first_vec_x0_check)
+-	subl	$VEC_SIZE, %esi
+-	jle	L(max)
++	vpmovmskb	%ymm1, %eax
++	/* Check if remaining length > VEC_SIZE * 2. This works even if esi
++	   is off by VEC_SIZE * 4.  */
++	testl	$(VEC_SIZE * 2), %esi
++	jnz	L(last_4x_vec)
+ 
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
++	/* The length may be off by VEC_SIZE * 4 in either direction,
++	   depending on where this was called from. This fixes
++	   that.  */
++	andl	$(VEC_SIZE * 4 - 1), %esi
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1_check)
+-	movq	%r8, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-#  endif
+-	VZEROUPPER_RETURN
++	jnz	L(last_vec_x1_check)
+ 
+-	.p2align 4
+-L(first_vec_x0_check):
++	subl	$VEC_SIZE, %esi
++	jb	L(max)
++
++	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
+ 	tzcntl	%eax, %eax
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
++	cmpl	%eax, %esi
++	jb	L(max)
++	subq	%rdx, %rdi
++	addl	$(VEC_SIZE + 1), %eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
++# endif
+ 
+ 	.p2align 4
+-L(first_vec_x1_check):
++L(last_vec_return_x0):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$VEC_SIZE, %rax
++	subq	$(VEC_SIZE * 4 - 1), %rdi
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-#  ifdef USE_AS_WCSLEN
++# ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-#  endif
++# endif
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x2_check):
++L(last_vec_return_x1):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$(VEC_SIZE * 2), %rax
++	subq	$(VEC_SIZE * 3 - 1), %rdi
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-#  ifdef USE_AS_WCSLEN
++# ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-#  endif
++# endif
+ 	VZEROUPPER_RETURN
+ 
++# ifdef USE_AS_STRNLEN
+ 	.p2align 4
+-L(first_vec_x3_check):
++L(last_vec_x1_check):
++
+ 	tzcntl	%eax, %eax
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$(VEC_SIZE * 3), %rax
++	cmpl	%eax, %esi
++	jb	L(max)
++	subq	%rdx, %rdi
++	incl	%eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+ L(max):
+ 	movq	%r8, %rax
++	VZEROUPPER_RETURN
++
++	.p2align 4
++L(last_4x_vec):
++	/* Test first 2x VEC normally.  */
++	testl	%eax, %eax
++	jnz	L(last_vec_x1)
++
++	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x2)
++
++	/* Normalize length.  */
++	andl	$(VEC_SIZE * 4 - 1), %esi
++	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x3)
++
++	subl	$(VEC_SIZE * 3), %esi
++	jb	L(max)
++
++	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
++	tzcntl	%eax, %eax
++	/* Check the end of data.  */
++	cmpl	%eax, %esi
++	jb	L(max)
++	subq	%rdx, %rdi
++	addl	$(VEC_SIZE * 3 + 1), %eax
++	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+-# endif
+ 
+ 	.p2align 4
+-L(first_vec_x0):
++L(last_vec_x1):
++	/* Essentially a duplicate of first_vec_x1 but uses 64 bit
++	   instructions.  */
+ 	tzcntl	%eax, %eax
++	subq	%rdx, %rdi
++	incl	%eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
++#  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-# endif
++#  endif
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x1):
++L(last_vec_x2):
++	/* Essentially a duplicate of first_vec_x1 but uses 64 bit
++	   instructions.  */
+ 	tzcntl	%eax, %eax
+-	addq	$VEC_SIZE, %rax
++	subq	%rdx, %rdi
++	addl	$(VEC_SIZE + 1), %eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
++#  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-# endif
++#  endif
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x2):
++L(last_vec_x3):
+ 	tzcntl	%eax, %eax
+-	addq	$(VEC_SIZE * 2), %rax
++	subl	$(VEC_SIZE * 2), %esi
++	/* Check the end of data.  */
++	cmpl	%eax, %esi
++	jb	L(max_end)
++	subq	%rdx, %rdi
++	addl	$(VEC_SIZE * 2 + 1), %eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
++#  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-# endif
++#  endif
++	VZEROUPPER_RETURN
++L(max_end):
++	movq	%r8, %rax
+ 	VZEROUPPER_RETURN
++# endif
+ 
++	/* Cold case for crossing page with first load.	 */
+ 	.p2align 4
+-L(4x_vec_end):
+-	VPCMPEQ	%ymm1, %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-	VPCMPEQ %ymm2, %ymm0, %ymm2
+-	vpmovmskb %ymm2, %eax
++L(cross_page_boundary):
++	/* Align data to VEC_SIZE - 1.  */
++	orq	$(VEC_SIZE - 1), %rdi
++	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
++	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
++	   so no need to manually mod rdx.  */
++	sarxl	%edx, %eax, %eax
++# ifdef USE_AS_STRNLEN
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-	VPCMPEQ %ymm3, %ymm0, %ymm3
+-	vpmovmskb %ymm3, %eax
++	jnz	L(cross_page_less_vec)
++	leaq	1(%rdi), %rcx
++	subq	%rdx, %rcx
++	/* Check length.  */
++	cmpq	%rsi, %rcx
++	jb	L(cross_page_continue)
++	movq	%r8, %rax
++# else
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-	VPCMPEQ %ymm4, %ymm0, %ymm4
+-	vpmovmskb %ymm4, %eax
+-L(first_vec_x3):
++	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
++#  ifdef USE_AS_WCSLEN
++	shrl	$2, %eax
++#  endif
+ # endif
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
++
++# ifdef USE_AS_STRNLEN
++	.p2align 4
++L(cross_page_less_vec):
++	tzcntl	%eax, %eax
++	cmpq	%rax, %rsi
++	cmovb	%esi, %eax
++#  ifdef USE_AS_WCSLEN
++	shrl	$2, %eax
++#  endif
+ 	VZEROUPPER_RETURN
++# endif
+ 
+ END (STRLEN)
+ #endif
+-- 
+GitLab
+
diff --git a/SOURCES/ia-testl-x86_string_control.patch b/SOURCES/ia-testl-x86_string_control.patch
new file mode 100644
index 0000000..e89ee57
--- /dev/null
+++ b/SOURCES/ia-testl-x86_string_control.patch
@@ -0,0 +1,39 @@
+From 7af32e1d2ba4a5a98a2033bf18de200912ae90a2 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sat, 28 Aug 2021 06:10:38 -0700
+Subject: [PATCH] x86-64: Use testl to check __x86_string_control
+
+Use testl, instead of andl, to check __x86_string_control to avoid
+updating __x86_string_control.
+
+Reviewed-by: Carlos O'Donell <carlos@redhat.com>
+(cherry picked from commit 3c8b9879cab6d41787bc5b14c1748f62fd6d0e5f)
+---
+ sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 0469bf99..c0809b1b 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -325,7 +325,7 @@ L(movsb):
+ 	/* Avoid slow backward REP MOVSB.  */
+ 	jb	L(more_8x_vec_backward)
+ # if AVOID_SHORT_DISTANCE_REP_MOVSB
+-	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
++	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+ 	jz	3f
+ 	movq	%rdi, %rcx
+ 	subq	%rsi, %rcx
+@@ -333,7 +333,7 @@ L(movsb):
+ # endif
+ 1:
+ # if AVOID_SHORT_DISTANCE_REP_MOVSB
+-	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
++	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+ 	jz	3f
+ 	movq	%rsi, %rcx
+ 	subq	%rdi, %rcx
+-- 
+GitLab
+
diff --git a/SOURCES/ia-unk-vector-opr-memchr-evex.patch b/SOURCES/ia-unk-vector-opr-memchr-evex.patch
new file mode 100644
index 0000000..d0070d0
--- /dev/null
+++ b/SOURCES/ia-unk-vector-opr-memchr-evex.patch
@@ -0,0 +1,30 @@
+From 3210a5e4acc812b94b8514d8d9664c7b4b6451af Mon Sep 17 00:00:00 2001
+From: Alice Xu <alice.d.xu@gmail.com>
+Date: Fri, 7 May 2021 19:03:21 -0700
+Subject: [PATCH] x86-64: Fix an unknown vector operation in memchr-evex.S
+
+An unknown vector operation occurred in commit 2a76821c308. Fixed it
+by using "ymm{k1}{z}" but not "ymm {k1} {z}".
+
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit 6ea916adfa0ab9af6e7dc6adcf6f977dfe017835)
+---
+ sysdeps/x86_64/multiarch/memchr-evex.S | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
+index 81d5cd64..f3fdad4f 100644
+--- a/sysdeps/x86_64/multiarch/memchr-evex.S
++++ b/sysdeps/x86_64/multiarch/memchr-evex.S
+@@ -271,7 +271,7 @@ L(loop_4x_vec):
+ 	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+ 	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+ 	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
+-	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
++	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
+ 	VPCMP	$0, %YMM3, %YMMZERO, %k2
+ # ifdef USE_AS_RAWMEMCHR
+ 	subq	$-(VEC_SIZE * 4), %rdi
+-- 
+GitLab
+
diff --git a/SOURCES/ia-upd-256bit-evex-instr-1.patch b/SOURCES/ia-upd-256bit-evex-instr-1.patch
new file mode 100644
index 0000000..4a299bf
--- /dev/null
+++ b/SOURCES/ia-upd-256bit-evex-instr-1.patch
@@ -0,0 +1,3407 @@
+From 22a1b88414d40b700c84689d08a6026e3fdee874 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 5 Mar 2021 06:24:52 -0800
+Subject: [PATCH] x86-64: Add ifunc-avx2.h functions with 256-bit EVEX
+
+Update ifunc-avx2.h, strchr.c, strcmp.c, strncmp.c and wcsnlen.c to
+select the function optimized with 256-bit EVEX instructions using
+YMM16-YMM31 registers to avoid RTM abort with usable AVX512VL, AVX512BW
+and BMI2 since VZEROUPPER isn't needed at function exit.
+
+For strcmp/strncmp, prefer AVX2 strcmp/strncmp if Prefer_AVX2_STRCMP
+is set.
+
+(cherry picked from commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77)
+---
+ sysdeps/x86_64/multiarch/Makefile          |   21 +-
+ sysdeps/x86_64/multiarch/ifunc-avx2.h      |   14 +-
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c |   81 ++
+ sysdeps/x86_64/multiarch/memchr-evex.S     |  381 +++++++
+ sysdeps/x86_64/multiarch/memrchr-evex.S    |  337 +++++++
+ sysdeps/x86_64/multiarch/rawmemchr-evex.S  |    4 +
+ sysdeps/x86_64/multiarch/strchr-evex.S     |  335 +++++++
+ sysdeps/x86_64/multiarch/strchr.c          |   14 +-
+ sysdeps/x86_64/multiarch/strchrnul-evex.S  |    3 +
+ sysdeps/x86_64/multiarch/strcmp-evex.S     | 1043 ++++++++++++++++++++
+ sysdeps/x86_64/multiarch/strcmp.c          |   15 +-
+ sysdeps/x86_64/multiarch/strlen-evex.S     |  436 ++++++++
+ sysdeps/x86_64/multiarch/strncmp-evex.S    |    3 +
+ sysdeps/x86_64/multiarch/strncmp.c         |   15 +-
+ sysdeps/x86_64/multiarch/strnlen-evex.S    |    4 +
+ sysdeps/x86_64/multiarch/strrchr-evex.S    |  265 +++++
+ sysdeps/x86_64/multiarch/wcschr-evex.S     |    3 +
+ sysdeps/x86_64/multiarch/wcscmp-evex.S     |    4 +
+ sysdeps/x86_64/multiarch/wcslen-evex.S     |    4 +
+ sysdeps/x86_64/multiarch/wcsncmp-evex.S    |    5 +
+ sysdeps/x86_64/multiarch/wcsnlen-evex.S    |    5 +
+ sysdeps/x86_64/multiarch/wcsnlen.c         |   14 +-
+ sysdeps/x86_64/multiarch/wcsrchr-evex.S    |    3 +
+ sysdeps/x86_64/multiarch/wmemchr-evex.S    |    4 +
+ 24 files changed, 2996 insertions(+), 17 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/memchr-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/memrchr-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strchr-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strchrnul-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strcmp-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strlen-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strncmp-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcschr-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcscmp-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcsncmp-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 9477538a..5ce85882 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -39,7 +39,17 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   memmove-avx512-unaligned-erms \
+ 		   memset-sse2-unaligned-erms \
+ 		   memset-avx2-unaligned-erms \
+-		   memset-avx512-unaligned-erms
++		   memset-avx512-unaligned-erms \
++		   memchr-evex \
++		   memrchr-evex \
++		   rawmemchr-evex \
++		   strchr-evex \
++		   strchrnul-evex \
++		   strcmp-evex \
++		   strlen-evex \
++		   strncmp-evex \
++		   strnlen-evex \
++		   strrchr-evex
+ CFLAGS-varshift.c += -msse4
+ CFLAGS-strcspn-c.c += -msse4
+ CFLAGS-strpbrk-c.c += -msse4
+@@ -56,7 +66,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ 		   wcschr-sse2 wcschr-avx2 \
+ 		   wcsrchr-sse2 wcsrchr-avx2 \
+ 		   wcsnlen-sse4_1 wcsnlen-c \
+-		   wcslen-sse2 wcslen-avx2 wcsnlen-avx2
++		   wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
++		   wcschr-evex \
++		   wcscmp-evex \
++		   wcslen-evex \
++		   wcsncmp-evex \
++		   wcsnlen-evex \
++		   wcsrchr-evex \
++		   wmemchr-evex
+ endif
+ 
+ ifeq ($(subdir),debug)
+diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
+index 5c88640a..7081b0c9 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
++++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
+@@ -21,16 +21,24 @@
+ 
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    return OPTIMIZE (avx2);
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
++	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
++	return OPTIMIZE (evex);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	return OPTIMIZE (avx2);
++    }
+ 
+   return OPTIMIZE (sse2);
+ }
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index fe13505c..bd7d9f19 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -43,6 +43,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, memchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __memchr_evex)
+ 	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/memcmp.c.  */
+@@ -121,6 +126,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memrchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memrchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, memrchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __memrchr_evex)
++
+ 	      IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2))
+ 
+ #ifdef SHARED
+@@ -179,6 +189,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __rawmemchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, rawmemchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __rawmemchr_evex)
+ 	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
+@@ -186,6 +201,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strlen_avx2)
++	      IFUNC_IMPL_ADD (array, i, strlen,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strlen_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
+@@ -193,6 +212,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strnlen_avx2)
++	      IFUNC_IMPL_ADD (array, i, strnlen,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strnlen_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
+@@ -255,6 +278,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, strchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __strchr_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
+ 	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
+ 
+@@ -263,6 +291,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strchrnul_avx2)
++	      IFUNC_IMPL_ADD (array, i, strchrnul,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __strchrnul_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/strrchr.c.  */
+@@ -270,6 +303,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strrchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strrchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, strrchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strrchr_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/strcmp.c.  */
+@@ -277,6 +314,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strcmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strcmp_avx2)
++	      IFUNC_IMPL_ADD (array, i, strcmp,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __strcmp_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2),
+ 			      __strcmp_sse42)
+ 	      IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3),
+@@ -370,6 +412,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcschr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcschr_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcschr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __wcschr_evex)
+ 	      IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wcsrchr.c.  */
+@@ -377,6 +424,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcsrchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcsrchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcsrchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __wcsrchr_evex)
+ 	      IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wcscmp.c.  */
+@@ -384,6 +436,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcscmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcscmp_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcscmp,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __wcscmp_evex)
+ 	      IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wcsncmp.c.  */
+@@ -391,6 +448,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcsncmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcsncmp_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcsncmp,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __wcsncmp_evex)
+ 	      IFUNC_IMPL_ADD (array, i, wcsncmp, 1, __wcsncmp_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wcscpy.c.  */
+@@ -404,6 +466,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcslen,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcslen_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcslen,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __wcslen_evex)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
+@@ -411,6 +478,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcsnlen_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcsnlen,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __wcsnlen_evex)
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+ 			      CPU_FEATURE_USABLE (SSE4_1),
+ 			      __wcsnlen_sse4_1)
+@@ -421,6 +493,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wmemchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wmemchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, wmemchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __wmemchr_evex)
+ 	      IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wmemcmp.c.  */
+@@ -568,6 +645,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strncmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strncmp_avx2)
++	      IFUNC_IMPL_ADD (array, i, strncmp,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strncmp_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2),
+ 			      __strncmp_sse42)
+ 	      IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3),
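
Each IFUNC_IMPL_ADD entry added above pairs an implementation with the feature predicate that gates it; glibc's string tests and benchmarks walk this list so that every usable variant is exercised, not only the one the resolver would pick at run time. A stripped-down sketch of that kind of feature-gated registry (the types and names here are invented, not glibc's):

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    typedef void *(*memchr_fn) (const void *, int, size_t);

    struct impl
    {
      const char *name;   /* e.g. "__memchr_evex" */
      int usable;         /* outcome of the CPU feature test */
      memchr_fn fn;       /* the candidate implementation */
    };

    /* Placeholder standing in for the real SIMD routines.  */
    static void *memchr_ref (const void *s, int c, size_t n)
    { return memchr (s, c, n); }

    int main (void)
    {
      /* In glibc the usable flags come from CPU_FEATURE_USABLE.  */
      struct impl impls[] =
        {
          { "__memchr_evex",
            __builtin_cpu_supports ("avx512vl")
            && __builtin_cpu_supports ("avx512bw")
            && __builtin_cpu_supports ("bmi2"), memchr_ref },
          { "__memchr_avx2", __builtin_cpu_supports ("avx2"), memchr_ref },
          { "__memchr_sse2", 1, memchr_ref },
        };

      const char buf[] = "ifunc-impl-list";
      for (size_t i = 0; i < sizeof impls / sizeof impls[0]; i++)
        if (impls[i].usable)
          {
            char *hit = impls[i].fn (buf, '-', sizeof buf);
            printf ("%-15s found '-' at offset %td\n", impls[i].name,
                    hit - buf);
          }
      return 0;
    }
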
+diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
+new file mode 100644
+index 00000000..6dd5d67b
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memchr-evex.S
+@@ -0,0 +1,381 @@
++/* memchr/wmemchr optimized with 256-bit EVEX instructions.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if IS_IN (libc)
++
++# include <sysdep.h>
++
++# ifndef MEMCHR
++#  define MEMCHR	__memchr_evex
++# endif
++
++# ifdef USE_AS_WMEMCHR
++#  define VPBROADCAST	vpbroadcastd
++#  define VPCMP		vpcmpd
++#  define SHIFT_REG	r8d
++# else
++#  define VPBROADCAST	vpbroadcastb
++#  define VPCMP		vpcmpb
++#  define SHIFT_REG	ecx
++# endif
++
++# define XMMMATCH	xmm16
++# define YMMMATCH	ymm16
++# define YMM1		ymm17
++# define YMM2		ymm18
++# define YMM3		ymm19
++# define YMM4		ymm20
++# define YMM5		ymm21
++# define YMM6		ymm22
++
++# define VEC_SIZE 32
++
++	.section .text.evex,"ax",@progbits
++ENTRY (MEMCHR)
++# ifndef USE_AS_RAWMEMCHR
++	/* Check for zero length.  */
++	test	%RDX_LP, %RDX_LP
++	jz	L(zero)
++# endif
++	movl	%edi, %ecx
++# ifdef USE_AS_WMEMCHR
++	shl	$2, %RDX_LP
++# else
++#  ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %edx
++#  endif
++# endif
++	/* Broadcast CHAR to YMMMATCH.  */
++	VPBROADCAST %esi, %YMMMATCH
++	/* Check if we may cross page boundary with one vector load.  */
++	andl	$(2 * VEC_SIZE - 1), %ecx
++	cmpl	$VEC_SIZE, %ecx
++	ja	L(cros_page_boundary)
++
++	/* Check the first VEC_SIZE bytes.  */
++	VPCMP	$0, (%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++
++# ifndef USE_AS_RAWMEMCHR
++	jnz	L(first_vec_x0_check)
++	/* Adjust length and check the end of data.  */
++	subq	$VEC_SIZE, %rdx
++	jbe	L(zero)
++# else
++	jnz	L(first_vec_x0)
++# endif
++
++	/* Align data for aligned loads in the loop.  */
++	addq	$VEC_SIZE, %rdi
++	andl	$(VEC_SIZE - 1), %ecx
++	andq	$-VEC_SIZE, %rdi
++
++# ifndef USE_AS_RAWMEMCHR
++	/* Adjust length.  */
++	addq	%rcx, %rdx
++
++	subq	$(VEC_SIZE * 4), %rdx
++	jbe	L(last_4x_vec_or_less)
++# endif
++	jmp	L(more_4x_vec)
++
++	.p2align 4
++L(cros_page_boundary):
++	andl	$(VEC_SIZE - 1), %ecx
++# ifdef USE_AS_WMEMCHR
++	/* NB: Divide shift count by 4 since each bit in K1 represents 4
++	   bytes.  */
++	movl	%ecx, %SHIFT_REG
++	sarl	$2, %SHIFT_REG
++# endif
++	andq	$-VEC_SIZE, %rdi
++	VPCMP	$0, (%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	/* Remove the leading bytes.  */
++	sarxl	%SHIFT_REG, %eax, %eax
++	testl	%eax, %eax
++	jz	L(aligned_more)
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++# ifndef USE_AS_RAWMEMCHR
++	/* Check the end of data.  */
++	cmpq	%rax, %rdx
++	jbe	L(zero)
++# endif
++	addq	%rdi, %rax
++	addq	%rcx, %rax
++	ret
++
++	.p2align 4
++L(aligned_more):
++# ifndef USE_AS_RAWMEMCHR
++        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
++	   instead of "(rdx + rcx) - VEC_SIZE" to avoid possible addition
++	   overflow.  */
++	negq	%rcx
++	addq	$VEC_SIZE, %rcx
++
++	/* Check the end of data.  */
++	subq	%rcx, %rdx
++	jbe	L(zero)
++# endif
++
++	addq	$VEC_SIZE, %rdi
++
++# ifndef USE_AS_RAWMEMCHR
++	subq	$(VEC_SIZE * 4), %rdx
++	jbe	L(last_4x_vec_or_less)
++# endif
++
++L(more_4x_vec):
++	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
++	   since data is only aligned to VEC_SIZE.  */
++	VPCMP	$0, (%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x0)
++
++	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++
++	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x2)
++
++	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x3)
++
++	addq	$(VEC_SIZE * 4), %rdi
++
++# ifndef USE_AS_RAWMEMCHR
++	subq	$(VEC_SIZE * 4), %rdx
++	jbe	L(last_4x_vec_or_less)
++# endif
++
++	/* Align data to 4 * VEC_SIZE.  */
++	movq	%rdi, %rcx
++	andl	$(4 * VEC_SIZE - 1), %ecx
++	andq	$-(4 * VEC_SIZE), %rdi
++
++# ifndef USE_AS_RAWMEMCHR
++	/* Adjust length.  */
++	addq	%rcx, %rdx
++# endif
++
++	.p2align 4
++L(loop_4x_vec):
++	/* Compare 4 * VEC at a time forward.  */
++	VPCMP	$0, (%rdi), %YMMMATCH, %k1
++	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
++	kord	%k1, %k2, %k5
++	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
++	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
++
++	kord	%k3, %k4, %k6
++	kortestd %k5, %k6
++	jnz	L(4x_vec_end)
++
++	addq	$(VEC_SIZE * 4), %rdi
++
++# ifdef USE_AS_RAWMEMCHR
++	jmp	L(loop_4x_vec)
++# else
++	subq	$(VEC_SIZE * 4), %rdx
++	ja	L(loop_4x_vec)
++
++L(last_4x_vec_or_less):
++	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
++	addl	$(VEC_SIZE * 2), %edx
++	jle	L(last_2x_vec)
++
++	VPCMP	$0, (%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x0)
++
++	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++
++	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++
++	jnz	L(first_vec_x2_check)
++	subl	$VEC_SIZE, %edx
++	jle	L(zero)
++
++	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++
++	jnz	L(first_vec_x3_check)
++	xorl	%eax, %eax
++	ret
++
++	.p2align 4
++L(last_2x_vec):
++	addl	$(VEC_SIZE * 2), %edx
++	VPCMP	$0, (%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++
++	jnz	L(first_vec_x0_check)
++	subl	$VEC_SIZE, %edx
++	jle	L(zero)
++
++	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1_check)
++	xorl	%eax, %eax
++	ret
++
++	.p2align 4
++L(first_vec_x0_check):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	/* Check the end of data.  */
++	cmpq	%rax, %rdx
++	jbe	L(zero)
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(first_vec_x1_check):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	/* Check the end of data.  */
++	cmpq	%rax, %rdx
++	jbe	L(zero)
++	addq	$VEC_SIZE, %rax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(first_vec_x2_check):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	/* Check the end of data.  */
++	cmpq	%rax, %rdx
++	jbe	L(zero)
++	addq	$(VEC_SIZE * 2), %rax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(first_vec_x3_check):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	/* Check the end of data.  */
++	cmpq	%rax, %rdx
++	jbe	L(zero)
++	addq	$(VEC_SIZE * 3), %rax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(zero):
++	xorl	%eax, %eax
++	ret
++# endif
++
++	.p2align 4
++L(first_vec_x0):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	(%rdi, %rax, 4), %rax
++# else
++	addq	%rdi, %rax
++# endif
++	ret
++
++	.p2align 4
++L(first_vec_x1):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
++# else
++	addq	$VEC_SIZE, %rax
++	addq	%rdi, %rax
++# endif
++	ret
++
++	.p2align 4
++L(first_vec_x2):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
++# else
++	addq	$(VEC_SIZE * 2), %rax
++	addq	%rdi, %rax
++# endif
++	ret
++
++	.p2align 4
++L(4x_vec_end):
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x0)
++	kmovd	%k2, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++	kmovd	%k3, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x2)
++	kmovd	%k4, %eax
++	testl	%eax, %eax
++L(first_vec_x3):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
++# else
++	addq	$(VEC_SIZE * 3), %rax
++	addq	%rdi, %rax
++# endif
++	ret
++
++END (MEMCHR)
++#endif
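
Stripped of the unrolling, alignment and page-crossing handling, the shape of the routine above is: compare a whole vector against the broadcast character (VPCMP), pull the per-byte result into an integer mask (kmovd), and let a trailing-zero count (tzcntl) turn the lowest set bit into the match offset. A scalar C sketch of that mask-then-tzcnt structure -- illustrative only, not the glibc code:

    #include <stddef.h>

    #define VEC 32   /* bytes per "vector", matching VEC_SIZE above */

    const void *memchr_sketch (const void *s, int c, size_t n)
    {
      const unsigned char *p = s;
      unsigned char ch = (unsigned char) c;

      for (size_t off = 0; off < n; off += VEC)
        {
          size_t block = n - off < VEC ? n - off : VEC;

          /* Equivalent of the k-register mask: bit i is set when byte i
             of this block equals CH.  */
          unsigned int mask = 0;
          for (size_t i = 0; i < block; i++)
            if (p[off + i] == ch)
              mask |= 1u << i;

          if (mask != 0)
            /* tzcntl: the index of the lowest set bit is the offset of
               the first match in the block.  */
            return p + off + __builtin_ctz (mask);
        }
      return NULL;
    }

The real code differs mainly in doing the whole inner comparison with one VPCMP and in checking the remaining length against the mask position instead of truncating the block.
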
+diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
+new file mode 100644
+index 00000000..16bf8e02
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
+@@ -0,0 +1,337 @@
++/* memrchr optimized with 256-bit EVEX instructions.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if IS_IN (libc)
++
++# include <sysdep.h>
++
++# define VMOVA		vmovdqa64
++
++# define YMMMATCH	ymm16
++
++# define VEC_SIZE 32
++
++	.section .text.evex,"ax",@progbits
++ENTRY (__memrchr_evex)
++	/* Broadcast CHAR to YMMMATCH.  */
++	vpbroadcastb %esi, %YMMMATCH
++
++	sub	$VEC_SIZE, %RDX_LP
++	jbe	L(last_vec_or_less)
++
++	add	%RDX_LP, %RDI_LP
++
++	/* Check the last VEC_SIZE bytes.  */
++	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x0)
++
++	subq	$(VEC_SIZE * 4), %rdi
++	movl	%edi, %ecx
++	andl	$(VEC_SIZE - 1), %ecx
++	jz	L(aligned_more)
++
++	/* Align data for aligned loads in the loop.  */
++	addq	$VEC_SIZE, %rdi
++	addq	$VEC_SIZE, %rdx
++	andq	$-VEC_SIZE, %rdi
++	subq	%rcx, %rdx
++
++	.p2align 4
++L(aligned_more):
++	subq	$(VEC_SIZE * 4), %rdx
++	jbe	L(last_4x_vec_or_less)
++
++	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
++	   since data is only aligned to VEC_SIZE.  */
++	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x3)
++
++	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
++	kmovd	%k2, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x2)
++
++	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
++	kmovd	%k3, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x1)
++
++	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
++	kmovd	%k4, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x0)
++
++	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
++	   There are some overlaps with above if data isn't aligned
++	   to 4 * VEC_SIZE.  */
++	movl	%edi, %ecx
++	andl	$(VEC_SIZE * 4 - 1), %ecx
++	jz	L(loop_4x_vec)
++
++	addq	$(VEC_SIZE * 4), %rdi
++	addq	$(VEC_SIZE * 4), %rdx
++	andq	$-(VEC_SIZE * 4), %rdi
++	subq	%rcx, %rdx
++
++	.p2align 4
++L(loop_4x_vec):
++	/* Compare 4 * VEC at a time forward.  */
++	subq	$(VEC_SIZE * 4), %rdi
++	subq	$(VEC_SIZE * 4), %rdx
++	jbe	L(last_4x_vec_or_less)
++
++	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
++	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
++	kord	%k1, %k2, %k5
++	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
++	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
++
++	kord	%k3, %k4, %k6
++	kortestd %k5, %k6
++	jz	L(loop_4x_vec)
++
++	/* There is a match.  */
++	kmovd	%k4, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x3)
++
++	kmovd	%k3, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x2)
++
++	kmovd	%k2, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x1)
++
++	kmovd	%k1, %eax
++	bsrl	%eax, %eax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(last_4x_vec_or_less):
++	addl	$(VEC_SIZE * 4), %edx
++	cmpl	$(VEC_SIZE * 2), %edx
++	jbe	L(last_2x_vec)
++
++	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x3)
++
++	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
++	kmovd	%k2, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x2)
++
++	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
++	kmovd	%k3, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x1_check)
++	cmpl	$(VEC_SIZE * 3), %edx
++	jbe	L(zero)
++
++	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
++	kmovd	%k4, %eax
++	testl	%eax, %eax
++	jz	L(zero)
++	bsrl	%eax, %eax
++	subq	$(VEC_SIZE * 4), %rdx
++	addq	%rax, %rdx
++	jl	L(zero)
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(last_2x_vec):
++	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x3_check)
++	cmpl	$VEC_SIZE, %edx
++	jbe	L(zero)
++
++	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jz	L(zero)
++	bsrl	%eax, %eax
++	subq	$(VEC_SIZE * 2), %rdx
++	addq	%rax, %rdx
++	jl	L(zero)
++	addl	$(VEC_SIZE * 2), %eax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(last_vec_x0):
++	bsrl	%eax, %eax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(last_vec_x1):
++	bsrl	%eax, %eax
++	addl	$VEC_SIZE, %eax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(last_vec_x2):
++	bsrl	%eax, %eax
++	addl	$(VEC_SIZE * 2), %eax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(last_vec_x3):
++	bsrl	%eax, %eax
++	addl	$(VEC_SIZE * 3), %eax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(last_vec_x1_check):
++	bsrl	%eax, %eax
++	subq	$(VEC_SIZE * 3), %rdx
++	addq	%rax, %rdx
++	jl	L(zero)
++	addl	$VEC_SIZE, %eax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(last_vec_x3_check):
++	bsrl	%eax, %eax
++	subq	$VEC_SIZE, %rdx
++	addq	%rax, %rdx
++	jl	L(zero)
++	addl	$(VEC_SIZE * 3), %eax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(zero):
++	xorl	%eax, %eax
++	ret
++
++	.p2align 4
++L(last_vec_or_less_aligned):
++	movl	%edx, %ecx
++
++	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
++
++	movl	$1, %edx
++	/* Support rdx << 32.  */
++	salq	%cl, %rdx
++	subq	$1, %rdx
++
++	kmovd	%k1, %eax
++
++	/* Remove the trailing bytes.  */
++	andl	%edx, %eax
++	testl	%eax, %eax
++	jz	L(zero)
++
++	bsrl	%eax, %eax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(last_vec_or_less):
++	addl	$VEC_SIZE, %edx
++
++	/* Check for zero length.  */
++	testl	%edx, %edx
++	jz	L(zero)
++
++	movl	%edi, %ecx
++	andl	$(VEC_SIZE - 1), %ecx
++	jz	L(last_vec_or_less_aligned)
++
++	movl	%ecx, %esi
++	movl	%ecx, %r8d
++	addl	%edx, %esi
++	andq	$-VEC_SIZE, %rdi
++
++	subl	$VEC_SIZE, %esi
++	ja	L(last_vec_2x_aligned)
++
++	/* Check the last VEC.  */
++	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++
++	/* Remove the leading and trailing bytes.  */
++	sarl	%cl, %eax
++	movl	%edx, %ecx
++
++	movl	$1, %edx
++	sall	%cl, %edx
++	subl	$1, %edx
++
++	andl	%edx, %eax
++	testl	%eax, %eax
++	jz	L(zero)
++
++	bsrl	%eax, %eax
++	addq	%rdi, %rax
++	addq	%r8, %rax
++	ret
++
++	.p2align 4
++L(last_vec_2x_aligned):
++	movl	%esi, %ecx
++
++	/* Check the last VEC.  */
++	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
++
++	movl	$1, %edx
++	sall	%cl, %edx
++	subl	$1, %edx
++
++	kmovd	%k1, %eax
++
++	/* Remove the trailing bytes.  */
++	andl	%edx, %eax
++
++	testl	%eax, %eax
++	jnz	L(last_vec_x1)
++
++	/* Check the second last VEC.  */
++	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
++
++	movl	%r8d, %ecx
++
++	kmovd	%k1, %eax
++
++	/* Remove the leading bytes.  Must use unsigned right shift for
++	   bsrl below.  */
++	shrl	%cl, %eax
++	testl	%eax, %eax
++	jz	L(zero)
++
++	bsrl	%eax, %eax
++	addq	%rdi, %rax
++	addq	%r8, %rax
++	ret
++END (__memrchr_evex)
++#endif
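
memrchr runs the same idea from the end of the buffer: compare a block, and when the mask is non-zero take its highest set bit (bsrl in the code above) instead of the lowest, because the rightmost match wins. A scalar sketch of that backward scan, again without the alignment handling:

    #include <stddef.h>

    #define VEC 32

    const void *memrchr_sketch (const void *s, int c, size_t n)
    {
      const unsigned char *p = s;
      unsigned char ch = (unsigned char) c;

      while (n > 0)
        {
          size_t block = n < VEC ? n : VEC;
          size_t base = n - block;   /* start of the last unscanned block */

          unsigned int mask = 0;
          for (size_t i = 0; i < block; i++)
            if (p[base + i] == ch)
              mask |= 1u << i;

          if (mask != 0)
            /* bsrl: the index of the highest set bit is the rightmost
               match in this block, hence in the whole buffer.  */
            return p + base + (31 - __builtin_clz (mask));

          n = base;
        }
      return NULL;
    }
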
+diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
+new file mode 100644
+index 00000000..ec942b77
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
+@@ -0,0 +1,4 @@
++#define MEMCHR __rawmemchr_evex
++#define USE_AS_RAWMEMCHR 1
++
++#include "memchr-evex.S"
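
The four lines above are the whole rawmemchr implementation: memchr-evex.S is re-included with USE_AS_RAWMEMCHR defined, and the #ifndef USE_AS_RAWMEMCHR blocks in that file drop every length check. A minimal C illustration of the same single-template, multiple-variants pattern (the names and the macro are made up for this example):

    #include <stddef.h>
    #include <stdio.h>

    /* One "template", expanded once per variant -- the C analogue of
       re-including memchr-evex.S under different defines.  */
    #define DEFINE_SCAN(NAME, CHECK_LENGTH)                         \
      static const char *NAME (const char *s, int c, size_t n)      \
      {                                                             \
        for (size_t i = 0; !(CHECK_LENGTH) || i < n; i++)           \
          if ((unsigned char) s[i] == (unsigned char) c)            \
            return s + i;                                           \
        return NULL;                                                \
      }

    DEFINE_SCAN (scan_memchr, 1)     /* bounded, like __memchr_evex */
    DEFINE_SCAN (scan_rawmemchr, 0)  /* unbounded, like __rawmemchr_evex */

    int main (void)
    {
      /* rawmemchr may only be used when the byte is known to be present.  */
      const char buf[] = "rawmemchr";
      printf ("%td %td\n",
              scan_memchr (buf, 'w', sizeof buf) - buf,
              scan_rawmemchr (buf, 'w', 0) - buf);
      return 0;
    }
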
+diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
+new file mode 100644
+index 00000000..ddc86a70
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strchr-evex.S
+@@ -0,0 +1,335 @@
++/* strchr/strchrnul optimized with 256-bit EVEX instructions.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if IS_IN (libc)
++
++# include <sysdep.h>
++
++# ifndef STRCHR
++#  define STRCHR	__strchr_evex
++# endif
++
++# define VMOVU		vmovdqu64
++# define VMOVA		vmovdqa64
++
++# ifdef USE_AS_WCSCHR
++#  define VPBROADCAST	vpbroadcastd
++#  define VPCMP		vpcmpd
++#  define VPMINU	vpminud
++#  define CHAR_REG	esi
++#  define SHIFT_REG	r8d
++# else
++#  define VPBROADCAST	vpbroadcastb
++#  define VPCMP		vpcmpb
++#  define VPMINU	vpminub
++#  define CHAR_REG	sil
++#  define SHIFT_REG	ecx
++# endif
++
++# define XMMZERO	xmm16
++
++# define YMMZERO	ymm16
++# define YMM0		ymm17
++# define YMM1		ymm18
++# define YMM2		ymm19
++# define YMM3		ymm20
++# define YMM4		ymm21
++# define YMM5		ymm22
++# define YMM6		ymm23
++# define YMM7		ymm24
++# define YMM8		ymm25
++
++# define VEC_SIZE 32
++# define PAGE_SIZE 4096
++
++	.section .text.evex,"ax",@progbits
++ENTRY (STRCHR)
++	movl	%edi, %ecx
++# ifndef USE_AS_STRCHRNUL
++	xorl	%edx, %edx
++# endif
++
++	/* Broadcast CHAR to YMM0.	*/
++	VPBROADCAST %esi, %YMM0
++
++	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
++
++	/* Check if we cross page boundary with one vector load.  */
++	andl	$(PAGE_SIZE - 1), %ecx
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
++	ja  L(cross_page_boundary)
++
++	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
++	   null bytes.  */
++	VMOVU	(%rdi), %YMM1
++
++	/* Leaves only CHARS matching esi as 0.  */
++	vpxorq	%YMM1, %YMM0, %YMM2
++	VPMINU	%YMM2, %YMM1, %YMM2
++	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	ktestd	%k0, %k0
++	jz	L(more_vecs)
++	kmovd	%k0, %eax
++	tzcntl	%eax, %eax
++	/* Found CHAR or the null byte.	 */
++# ifdef USE_AS_WCSCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	(%rdi, %rax, 4), %rax
++# else
++	addq	%rdi, %rax
++# endif
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
++	cmovne	%rdx, %rax
++# endif
++	ret
++
++	.p2align 4
++L(more_vecs):
++	/* Align data for aligned loads in the loop.  */
++	andq	$-VEC_SIZE, %rdi
++L(aligned_more):
++
++	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
++	   since data is only aligned to VEC_SIZE.	*/
++	VMOVA	VEC_SIZE(%rdi), %YMM1
++	addq	$VEC_SIZE, %rdi
++
++	/* Leaves only CHARS matching esi as 0.  */
++	vpxorq	%YMM1, %YMM0, %YMM2
++	VPMINU	%YMM2, %YMM1, %YMM2
++	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x0)
++
++	VMOVA	VEC_SIZE(%rdi), %YMM1
++	/* Leaves only CHARS matching esi as 0.  */
++	vpxorq	%YMM1, %YMM0, %YMM2
++	VPMINU	%YMM2, %YMM1, %YMM2
++	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++
++	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
++	/* Leaves only CHARS matching esi as 0.  */
++	vpxorq	%YMM1, %YMM0, %YMM2
++	VPMINU	%YMM2, %YMM1, %YMM2
++	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x2)
++
++	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
++	/* Leaves only CHARS matching esi as 0.  */
++	vpxorq	%YMM1, %YMM0, %YMM2
++	VPMINU	%YMM2, %YMM1, %YMM2
++	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	ktestd	%k0, %k0
++	jz	L(prep_loop_4x)
++
++	kmovd	%k0, %eax
++	tzcntl	%eax, %eax
++	/* Found CHAR or the null byte.	 */
++# ifdef USE_AS_WCSCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
++# else
++	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
++# endif
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
++	cmovne	%rdx, %rax
++# endif
++	ret
++
++	.p2align 4
++L(first_vec_x0):
++	tzcntl	%eax, %eax
++	/* Found CHAR or the null byte.	 */
++# ifdef USE_AS_WCSCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	(%rdi, %rax, 4), %rax
++# else
++	addq	%rdi, %rax
++# endif
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
++	cmovne	%rdx, %rax
++# endif
++	ret
++
++	.p2align 4
++L(first_vec_x1):
++	tzcntl	%eax, %eax
++	/* Found CHAR or the null byte.	 */
++# ifdef USE_AS_WCSCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
++# else
++	leaq	VEC_SIZE(%rdi, %rax), %rax
++# endif
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
++	cmovne	%rdx, %rax
++# endif
++	ret
++
++	.p2align 4
++L(first_vec_x2):
++	tzcntl	%eax, %eax
++	/* Found CHAR or the null byte.	 */
++# ifdef USE_AS_WCSCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
++# else
++	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
++# endif
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
++	cmovne	%rdx, %rax
++# endif
++	ret
++
++L(prep_loop_4x):
++	/* Align data to 4 * VEC_SIZE.	*/
++	andq	$-(VEC_SIZE * 4), %rdi
++
++	.p2align 4
++L(loop_4x_vec):
++	/* Compare 4 * VEC at a time forward.  */
++	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
++	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
++	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
++	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
++
++	/* Leaves only CHARS matching esi as 0.  */
++	vpxorq	%YMM1, %YMM0, %YMM5
++	vpxorq	%YMM2, %YMM0, %YMM6
++	vpxorq	%YMM3, %YMM0, %YMM7
++	vpxorq	%YMM4, %YMM0, %YMM8
++
++	VPMINU	%YMM5, %YMM1, %YMM5
++	VPMINU	%YMM6, %YMM2, %YMM6
++	VPMINU	%YMM7, %YMM3, %YMM7
++	VPMINU	%YMM8, %YMM4, %YMM8
++
++	VPMINU	%YMM5, %YMM6, %YMM1
++	VPMINU	%YMM7, %YMM8, %YMM2
++
++	VPMINU	%YMM1, %YMM2, %YMM1
++
++	/* Each bit in K0 represents a CHAR or a null byte.  */
++	VPCMP	$0, %YMMZERO, %YMM1, %k0
++
++	addq	$(VEC_SIZE * 4), %rdi
++
++	ktestd	%k0, %k0
++	jz	L(loop_4x_vec)
++
++	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM5, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x0)
++
++	/* Each bit in K1 represents a CHAR or a null byte in YMM2.  */
++	VPCMP	$0, %YMMZERO, %YMM6, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++
++	/* Each bit in K2 represents a CHAR or a null byte in YMM3.  */
++	VPCMP	$0, %YMMZERO, %YMM7, %k2
++	/* Each bit in K3 represents a CHAR or a null byte in YMM4.  */
++	VPCMP	$0, %YMMZERO, %YMM8, %k3
++
++# ifdef USE_AS_WCSCHR
++	/* NB: Each bit in K2/K3 represents 4-byte element.  */
++	kshiftlw $8, %k3, %k1
++# else
++	kshiftlq $32, %k3, %k1
++# endif
++
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	korq	%k1, %k2, %k1
++	kmovq	%k1, %rax
++
++	tzcntq  %rax, %rax
++# ifdef USE_AS_WCSCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
++# else
++	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
++# endif
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
++	cmovne	%rdx, %rax
++# endif
++	ret
++
++	/* Cold case for crossing page with first load.	 */
++	.p2align 4
++L(cross_page_boundary):
++	andq	$-VEC_SIZE, %rdi
++	andl	$(VEC_SIZE - 1), %ecx
++
++	VMOVA	(%rdi), %YMM1
++
++	/* Leaves only CHARS matching esi as 0.  */
++	vpxorq	%YMM1, %YMM0, %YMM2
++	VPMINU	%YMM2, %YMM1, %YMM2
++	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++
++# ifdef USE_AS_WCSCHR
++	/* NB: Divide shift count by 4 since each bit in K1 represents 4
++	   bytes.  */
++	movl	%ecx, %SHIFT_REG
++	sarl    $2, %SHIFT_REG
++# endif
++
++	/* Remove the leading bits.	 */
++	sarxl	%SHIFT_REG, %eax, %eax
++	testl	%eax, %eax
++
++	jz	L(aligned_more)
++	tzcntl	%eax, %eax
++	addq	%rcx, %rdi
++# ifdef USE_AS_WCSCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	(%rdi, %rax, 4), %rax
++# else
++	addq	%rdi, %rax
++# endif
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
++	cmovne	%rdx, %rax
++# endif
++	ret
++
++END (STRCHR)
++# endif
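
strchr-evex.S folds the two conditions it has to stop on (byte equals CHAR, byte is NUL) into a single mask: XOR the data with the broadcast character, take the unsigned minimum with the original data (VPMINU), and compare the result against zero. A lane is zero exactly when it held CHAR (the XOR yields 0) or NUL (the minimum picks the 0). A byte-at-a-time C sketch of that trick:

    #include <stddef.h>

    /* Non-zero when B is CH or the terminating NUL -- the condition
       strchr has to stop on.  */
    static int char_or_nul (unsigned char b, unsigned char ch)
    {
      unsigned char x = b ^ ch;         /* 0 iff b == ch                 */
      unsigned char m = x < b ? x : b;  /* unsigned minimum, like VPMINU */
      return m == 0;                    /* m is 0 iff b == ch or b == 0  */
    }

    char *strchr_sketch (const char *s, int c)
    {
      unsigned char ch = (unsigned char) c;
      for (const unsigned char *p = (const unsigned char *) s; ; p++)
        if (char_or_nul (*p, ch))
          /* Stopped on CHAR or on the NUL; the NUL case returns NULL
             unless CHAR itself is NUL, matching strchr semantics.  */
          return *p == ch ? (char *) p : NULL;
    }

The cmp (%rax), %CHAR_REG / cmovne %rdx, %rax sequences above perform the same final disambiguation, with %rdx preloaded with zero.
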
+diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
+index 32954713..be05e197 100644
+--- a/sysdeps/x86_64/multiarch/strchr.c
++++ b/sysdeps/x86_64/multiarch/strchr.c
+@@ -29,16 +29,24 @@
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    return OPTIMIZE (avx2);
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
++	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
++	return OPTIMIZE (evex);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	return OPTIMIZE (avx2);
++    }
+ 
+   if (CPU_FEATURES_ARCH_P (cpu_features, Slow_BSF))
+     return OPTIMIZE (sse2_no_bsf);
+diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex.S b/sysdeps/x86_64/multiarch/strchrnul-evex.S
+new file mode 100644
+index 00000000..064fe7ca
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strchrnul-evex.S
+@@ -0,0 +1,3 @@
++#define STRCHR __strchrnul_evex
++#define USE_AS_STRCHRNUL 1
++#include "strchr-evex.S"
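
strchrnul shares all of strchr-evex.S: with USE_AS_STRCHRNUL defined, the NULL-substitution code guarded by #ifndef USE_AS_STRCHRNUL (the xorl %edx, %edx and the cmp/cmovne tails) is compiled out, so the position that was found is returned even when it is the terminating NUL. In C terms the whole difference is:

    /* Sketch: strchrnul is strchr without the final NULL substitution.  */
    char *strchrnul_sketch (const char *s, int c)
    {
      while (*s != (char) c && *s != '\0')
        s++;
      return (char *) s;   /* points at the match, or at the NUL */
    }
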
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+new file mode 100644
+index 00000000..459eeed0
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -0,0 +1,1043 @@
++/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if IS_IN (libc)
++
++# include <sysdep.h>
++
++# ifndef STRCMP
++#  define STRCMP	__strcmp_evex
++# endif
++
++# define PAGE_SIZE	4096
++
++/* VEC_SIZE = Number of bytes in a ymm register */
++# define VEC_SIZE	32
++
++/* Shift for dividing by (VEC_SIZE * 4).  */
++# define DIVIDE_BY_VEC_4_SHIFT	7
++# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
++#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
++# endif
++
++# define VMOVU		vmovdqu64
++# define VMOVA		vmovdqa64
++
++# ifdef USE_AS_WCSCMP
++/* Compare packed dwords.  */
++#  define VPCMP		vpcmpd
++#  define SHIFT_REG32	r8d
++#  define SHIFT_REG64	r8
++/* 1 dword char == 4 bytes.  */
++#  define SIZE_OF_CHAR	4
++# else
++/* Compare packed bytes.  */
++#  define VPCMP		vpcmpb
++#  define SHIFT_REG32	ecx
++#  define SHIFT_REG64	rcx
++/* 1 byte char == 1 byte.  */
++#  define SIZE_OF_CHAR	1
++# endif
++
++# define XMMZERO	xmm16
++# define XMM0		xmm17
++# define XMM1		xmm18
++
++# define YMMZERO	ymm16
++# define YMM0		ymm17
++# define YMM1		ymm18
++# define YMM2		ymm19
++# define YMM3		ymm20
++# define YMM4		ymm21
++# define YMM5		ymm22
++# define YMM6		ymm23
++# define YMM7		ymm24
++
++/* Warning!
++           wcscmp/wcsncmp have to use SIGNED comparison for elements.
++           strcmp/strncmp have to use UNSIGNED comparison for elements.
++*/
++
++/* The main idea of the string comparison (byte or dword) using 256-bit
++   EVEX instructions consists of comparing (VPCMP) two ymm vectors.  The
++   comparison operates on either packed bytes or dwords, depending on
++   USE_AS_WCSCMP.  In order to check the null char, the algorithm keeps the
++   matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
++   KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes)
++   are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
++   instructions.  The main loop (away from the page boundary) compares 4
++   vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128
++   bytes) per iteration.
++
++   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is the
++   same as strcmp, except that a maximum offset is tracked.  If
++   the maximum offset is reached before a difference is found, zero is
++   returned.  */
++
++	.section .text.evex,"ax",@progbits
++ENTRY (STRCMP)
++# ifdef USE_AS_STRNCMP
++	/* Check for simple cases (0 or 1) in offset.  */
++	cmp	$1, %RDX_LP
++	je	L(char0)
++	jb	L(zero)
++#  ifdef USE_AS_WCSCMP
++	/* Convert units: from wide to byte char.  */
++	shl	$2, %RDX_LP
++#  endif
++	/* Register %r11 tracks the maximum offset.  */
++	mov	%RDX_LP, %R11_LP
++# endif
++	movl	%edi, %eax
++	xorl	%edx, %edx
++	/* Make %XMMZERO (%YMMZERO) all zeros in this function.  */
++	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
++	orl	%esi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
++	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
++	jg	L(cross_page)
++	/* Start comparing 4 vectors.  */
++	VMOVU	(%rdi), %YMM0
++	VMOVU	(%rsi), %YMM1
++
++	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
++	VPCMP	$4, %YMM0, %YMM1, %k0
++
++	/* Check for NULL in YMM0.  */
++	VPCMP	$0, %YMMZERO, %YMM0, %k1
++	/* Check for NULL in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM1, %k2
++	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
++	kord	%k1, %k2, %k1
++
++	/* Each bit in K1 represents:
++	   1. A mismatch in YMM0 and YMM1.  Or
++	   2. A NULL in YMM0 or YMM1.
++	 */
++	kord	%k0, %k1, %k1
++
++	ktestd	%k1, %k1
++	je	L(next_3_vectors)
++	kmovd	%k1, %ecx
++	tzcntl	%ecx, %edx
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %edx
++# endif
++# ifdef USE_AS_STRNCMP
++	/* Return 0 if the mismatched index (%rdx) is after the maximum
++	   offset (%r11).   */
++	cmpq	%r11, %rdx
++	jae	L(zero)
++# endif
++# ifdef USE_AS_WCSCMP
++	xorl	%eax, %eax
++	movl	(%rdi, %rdx), %ecx
++	cmpl	(%rsi, %rdx), %ecx
++	je	L(return)
++L(wcscmp_return):
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
++L(return):
++# else
++	movzbl	(%rdi, %rdx), %eax
++	movzbl	(%rsi, %rdx), %edx
++	subl	%edx, %eax
++# endif
++	ret
++
++	.p2align 4
++L(return_vec_size):
++	kmovd	%k1, %ecx
++	tzcntl	%ecx, %edx
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %edx
++# endif
++# ifdef USE_AS_STRNCMP
++	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
++	   the maximum offset (%r11).  */
++	addq	$VEC_SIZE, %rdx
++	cmpq	%r11, %rdx
++	jae	L(zero)
++#  ifdef USE_AS_WCSCMP
++	xorl	%eax, %eax
++	movl	(%rdi, %rdx), %ecx
++	cmpl	(%rsi, %rdx), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rdi, %rdx), %eax
++	movzbl	(%rsi, %rdx), %edx
++	subl	%edx, %eax
++#  endif
++# else
++#  ifdef USE_AS_WCSCMP
++	xorl	%eax, %eax
++	movl	VEC_SIZE(%rdi, %rdx), %ecx
++	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	VEC_SIZE(%rdi, %rdx), %eax
++	movzbl	VEC_SIZE(%rsi, %rdx), %edx
++	subl	%edx, %eax
++#  endif
++# endif
++	ret
++
++	.p2align 4
++L(return_2_vec_size):
++	kmovd	%k1, %ecx
++	tzcntl	%ecx, %edx
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %edx
++# endif
++# ifdef USE_AS_STRNCMP
++	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
++	   after the maximum offset (%r11).  */
++	addq	$(VEC_SIZE * 2), %rdx
++	cmpq	%r11, %rdx
++	jae	L(zero)
++#  ifdef USE_AS_WCSCMP
++	xorl	%eax, %eax
++	movl	(%rdi, %rdx), %ecx
++	cmpl	(%rsi, %rdx), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rdi, %rdx), %eax
++	movzbl	(%rsi, %rdx), %edx
++	subl	%edx, %eax
++#  endif
++# else
++#  ifdef USE_AS_WCSCMP
++	xorl	%eax, %eax
++	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
++	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
++	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
++	subl	%edx, %eax
++#  endif
++# endif
++	ret
++
++	.p2align 4
++L(return_3_vec_size):
++	kmovd	%k1, %ecx
++	tzcntl	%ecx, %edx
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %edx
++# endif
++# ifdef USE_AS_STRNCMP
++	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
++	   after the maximum offset (%r11).  */
++	addq	$(VEC_SIZE * 3), %rdx
++	cmpq	%r11, %rdx
++	jae	L(zero)
++#  ifdef USE_AS_WCSCMP
++	xorl	%eax, %eax
++	movl	(%rdi, %rdx), %ecx
++	cmpl	(%rsi, %rdx), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rdi, %rdx), %eax
++	movzbl	(%rsi, %rdx), %edx
++	subl	%edx, %eax
++#  endif
++# else
++#  ifdef USE_AS_WCSCMP
++	xorl	%eax, %eax
++	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
++	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
++	subl	%edx, %eax
++#  endif
++# endif
++	ret
++
++	.p2align 4
++L(next_3_vectors):
++	VMOVU	VEC_SIZE(%rdi), %YMM0
++	VMOVU	VEC_SIZE(%rsi), %YMM1
++	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
++	VPCMP	$4, %YMM0, %YMM1, %k0
++	VPCMP	$0, %YMMZERO, %YMM0, %k1
++	VPCMP	$0, %YMMZERO, %YMM1, %k2
++	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
++	kord	%k1, %k2, %k1
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	kord	%k0, %k1, %k1
++	ktestd	%k1, %k1
++	jne	L(return_vec_size)
++
++	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM2
++	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM3
++	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM4
++	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM5
++
++	/* Each bit in K0 represents a mismatch in YMM2 and YMM4.  */
++	VPCMP	$4, %YMM2, %YMM4, %k0
++	VPCMP	$0, %YMMZERO, %YMM2, %k1
++	VPCMP	$0, %YMMZERO, %YMM4, %k2
++	/* Each bit in K1 represents a NULL in YMM2 or YMM4.  */
++	kord	%k1, %k2, %k1
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	kord	%k0, %k1, %k1
++	ktestd	%k1, %k1
++	jne	L(return_2_vec_size)
++
++	/* Each bit in K0 represents a mismatch in YMM3 and YMM5.  */
++	VPCMP	$4, %YMM3, %YMM5, %k0
++	VPCMP	$0, %YMMZERO, %YMM3, %k1
++	VPCMP	$0, %YMMZERO, %YMM5, %k2
++	/* Each bit in K1 represents a NULL in YMM3 or YMM5.  */
++	kord	%k1, %k2, %k1
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	kord	%k0, %k1, %k1
++	ktestd	%k1, %k1
++	jne	L(return_3_vec_size)
++L(main_loop_header):
++	leaq	(VEC_SIZE * 4)(%rdi), %rdx
++	movl	$PAGE_SIZE, %ecx
++	/* Align load via RAX.  */
++	andq	$-(VEC_SIZE * 4), %rdx
++	subq	%rdi, %rdx
++	leaq	(%rdi, %rdx), %rax
++# ifdef USE_AS_STRNCMP
++	/* Starting from this point, the maximum offset, or simply the
++	   'offset', DECREASES by the same amount when base pointers are
++	   moved forward.  Return 0 when:
++	     1) On match: offset <= the matched vector index.
++	     2) On mistmach, offset is before the mistmatched index.
++	     2) On mismatch: offset is before the mismatched index.
++	subq	%rdx, %r11
++	jbe	L(zero)
++# endif
++	addq	%rsi, %rdx
++	movq	%rdx, %rsi
++	andl	$(PAGE_SIZE - 1), %esi
++	/* Number of bytes before page crossing.  */
++	subq	%rsi, %rcx
++	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
++	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
++	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
++	movl	%ecx, %esi
++	jmp	L(loop_start)
++
++	.p2align 4
++L(loop):
++# ifdef USE_AS_STRNCMP
++	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
++	   the maximum offset (%r11) by the same amount.  */
++	subq	$(VEC_SIZE * 4), %r11
++	jbe	L(zero)
++# endif
++	addq	$(VEC_SIZE * 4), %rax
++	addq	$(VEC_SIZE * 4), %rdx
++L(loop_start):
++	testl	%esi, %esi
++	leal	-1(%esi), %esi
++	je	L(loop_cross_page)
++L(back_to_loop):
++	/* Main loop, comparing 4 vectors at a time.  */
++	VMOVA	(%rax), %YMM0
++	VMOVA	VEC_SIZE(%rax), %YMM2
++	VMOVA	(VEC_SIZE * 2)(%rax), %YMM4
++	VMOVA	(VEC_SIZE * 3)(%rax), %YMM6
++	VMOVU	(%rdx), %YMM1
++	VMOVU	VEC_SIZE(%rdx), %YMM3
++	VMOVU	(VEC_SIZE * 2)(%rdx), %YMM5
++	VMOVU	(VEC_SIZE * 3)(%rdx), %YMM7
++
++	VPCMP	$4, %YMM0, %YMM1, %k0
++	VPCMP	$0, %YMMZERO, %YMM0, %k1
++	VPCMP	$0, %YMMZERO, %YMM1, %k2
++	kord	%k1, %k2, %k1
++	/* Each bit in K4 represents a NULL or a mismatch in YMM0 and
++	   YMM1.  */
++	kord	%k0, %k1, %k4
++
++	VPCMP	$4, %YMM2, %YMM3, %k0
++	VPCMP	$0, %YMMZERO, %YMM2, %k1
++	VPCMP	$0, %YMMZERO, %YMM3, %k2
++	kord	%k1, %k2, %k1
++	/* Each bit in K5 represents a NULL or a mismatch in YMM2 and
++	   YMM3.  */
++	kord	%k0, %k1, %k5
++
++	VPCMP	$4, %YMM4, %YMM5, %k0
++	VPCMP	$0, %YMMZERO, %YMM4, %k1
++	VPCMP	$0, %YMMZERO, %YMM5, %k2
++	kord	%k1, %k2, %k1
++	/* Each bit in K6 represents a NULL or a mismatch in YMM4 and
++	   YMM5.  */
++	kord	%k0, %k1, %k6
++
++	VPCMP	$4, %YMM6, %YMM7, %k0
++	VPCMP	$0, %YMMZERO, %YMM6, %k1
++	VPCMP	$0, %YMMZERO, %YMM7, %k2
++	kord	%k1, %k2, %k1
++	/* Each bit in K7 represents a NULL or a mismatch in YMM6 and
++	   YMM7.  */
++	kord	%k0, %k1, %k7
++
++	kord	%k4, %k5, %k0
++	kord	%k6, %k7, %k1
++
++	/* Test each mask (32 bits) individually because for VEC_SIZE
++	   == 32 it is not possible to OR the four masks and keep all bits
++	   in a 64-bit integer register, differing from SSE2 strcmp
++	   where ORing is possible.  */
++	kortestd %k0, %k1
++	je	L(loop)
++	ktestd	%k4, %k4
++	je	L(test_vec)
++	kmovd	%k4, %edi
++	tzcntl	%edi, %ecx
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %ecx
++# endif
++# ifdef USE_AS_STRNCMP
++	cmpq	%rcx, %r11
++	jbe	L(zero)
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(%rsi, %rcx), %edi
++	cmpl	(%rdx, %rcx), %edi
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rax, %rcx), %eax
++	movzbl	(%rdx, %rcx), %edx
++	subl	%edx, %eax
++#  endif
++# else
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(%rsi, %rcx), %edi
++	cmpl	(%rdx, %rcx), %edi
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rax, %rcx), %eax
++	movzbl	(%rdx, %rcx), %edx
++	subl	%edx, %eax
++#  endif
++# endif
++	ret
++
++	.p2align 4
++L(test_vec):
++# ifdef USE_AS_STRNCMP
++	/* The first vector matched.  Return 0 if the maximum offset
++	   (%r11) <= VEC_SIZE.  */
++	cmpq	$VEC_SIZE, %r11
++	jbe	L(zero)
++# endif
++	ktestd	%k5, %k5
++	je	L(test_2_vec)
++	kmovd	%k5, %ecx
++	tzcntl	%ecx, %edi
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %edi
++# endif
++# ifdef USE_AS_STRNCMP
++	addq	$VEC_SIZE, %rdi
++	cmpq	%rdi, %r11
++	jbe	L(zero)
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(%rsi, %rdi), %ecx
++	cmpl	(%rdx, %rdi), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rax, %rdi), %eax
++	movzbl	(%rdx, %rdi), %edx
++	subl	%edx, %eax
++#  endif
++# else
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	VEC_SIZE(%rsi, %rdi), %ecx
++	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	VEC_SIZE(%rax, %rdi), %eax
++	movzbl	VEC_SIZE(%rdx, %rdi), %edx
++	subl	%edx, %eax
++#  endif
++# endif
++	ret
++
++	.p2align 4
++L(test_2_vec):
++# ifdef USE_AS_STRNCMP
++	/* The first 2 vectors matched.  Return 0 if the maximum offset
++	   (%r11) <= 2 * VEC_SIZE.  */
++	cmpq	$(VEC_SIZE * 2), %r11
++	jbe	L(zero)
++# endif
++	ktestd	%k6, %k6
++	je	L(test_3_vec)
++	kmovd	%k6, %ecx
++	tzcntl	%ecx, %edi
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %edi
++# endif
++# ifdef USE_AS_STRNCMP
++	addq	$(VEC_SIZE * 2), %rdi
++	cmpq	%rdi, %r11
++	jbe	L(zero)
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(%rsi, %rdi), %ecx
++	cmpl	(%rdx, %rdi), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rax, %rdi), %eax
++	movzbl	(%rdx, %rdi), %edx
++	subl	%edx, %eax
++#  endif
++# else
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
++	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
++	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
++	subl	%edx, %eax
++#  endif
++# endif
++	ret
++
++	.p2align 4
++L(test_3_vec):
++# ifdef USE_AS_STRNCMP
++	/* The first 3 vectors matched.  Return 0 if the maximum offset
++	   (%r11) <= 3 * VEC_SIZE.  */
++	cmpq	$(VEC_SIZE * 3), %r11
++	jbe	L(zero)
++# endif
++	kmovd	%k7, %esi
++	tzcntl	%esi, %ecx
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %ecx
++# endif
++# ifdef USE_AS_STRNCMP
++	addq	$(VEC_SIZE * 3), %rcx
++	cmpq	%rcx, %r11
++	jbe	L(zero)
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(%rsi, %rcx), %esi
++	cmpl	(%rdx, %rcx), %esi
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rax, %rcx), %eax
++	movzbl	(%rdx, %rcx), %edx
++	subl	%edx, %eax
++#  endif
++# else
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
++	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
++	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
++	subl	%edx, %eax
++#  endif
++# endif
++	ret
++
++	.p2align 4
++L(loop_cross_page):
++	xorl	%r10d, %r10d
++	movq	%rdx, %rcx
++	/* Align load via RDX.  We load the extra ECX bytes which should
++	   be ignored.  */
++	andl	$((VEC_SIZE * 4) - 1), %ecx
++	/* R10 is -RCX.  */
++	subq	%rcx, %r10
++
++	/* This works only if VEC_SIZE * 2 == 64. */
++# if (VEC_SIZE * 2) != 64
++#  error (VEC_SIZE * 2) != 64
++# endif
++
++	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
++	cmpl	$(VEC_SIZE * 2), %ecx
++	jge	L(loop_cross_page_2_vec)
++
++	VMOVU	(%rax, %r10), %YMM2
++	VMOVU	VEC_SIZE(%rax, %r10), %YMM3
++	VMOVU	(%rdx, %r10), %YMM4
++	VMOVU	VEC_SIZE(%rdx, %r10), %YMM5
++
++	VPCMP	$4, %YMM4, %YMM2, %k0
++	VPCMP	$0, %YMMZERO, %YMM2, %k1
++	VPCMP	$0, %YMMZERO, %YMM4, %k2
++	kord	%k1, %k2, %k1
++	/* Each bit in K1 represents a NULL or a mismatch in YMM2 and
++	   YMM4.  */
++	kord	%k0, %k1, %k1
++
++	VPCMP	$4, %YMM5, %YMM3, %k3
++	VPCMP	$0, %YMMZERO, %YMM3, %k4
++	VPCMP	$0, %YMMZERO, %YMM5, %k5
++	kord	%k4, %k5, %k4
++	/* Each bit in K3 represents a NULL or a mismatch in YMM3 and
++	   YMM5.  */
++	kord	%k3, %k4, %k3
++
++# ifdef USE_AS_WCSCMP
++	/* NB: Each bit in K1/K3 represents 4-byte element.  */
++	kshiftlw $8, %k3, %k2
++	/* NB: Divide shift count by 4 since each bit in K1 represents 4
++	   bytes.  */
++	movl	%ecx, %SHIFT_REG32
++	sarl	$2, %SHIFT_REG32
++# else
++	kshiftlq $32, %k3, %k2
++# endif
++
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	korq	%k1, %k2, %k1
++	kmovq	%k1, %rdi
++
++	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
++	shrxq	%SHIFT_REG64, %rdi, %rdi
++	testq	%rdi, %rdi
++	je	L(loop_cross_page_2_vec)
++	tzcntq	%rdi, %rcx
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %ecx
++# endif
++# ifdef USE_AS_STRNCMP
++	cmpq	%rcx, %r11
++	jbe	L(zero)
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(%rsi, %rcx), %edi
++	cmpl	(%rdx, %rcx), %edi
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rax, %rcx), %eax
++	movzbl	(%rdx, %rcx), %edx
++	subl	%edx, %eax
++#  endif
++# else
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(%rsi, %rcx), %edi
++	cmpl	(%rdx, %rcx), %edi
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rax, %rcx), %eax
++	movzbl	(%rdx, %rcx), %edx
++	subl	%edx, %eax
++#  endif
++# endif
++	ret
++
++	.p2align 4
++L(loop_cross_page_2_vec):
++	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
++	VMOVU	(VEC_SIZE * 2)(%rax, %r10), %YMM0
++	VMOVU	(VEC_SIZE * 3)(%rax, %r10), %YMM1
++	VMOVU	(VEC_SIZE * 2)(%rdx, %r10), %YMM2
++	VMOVU	(VEC_SIZE * 3)(%rdx, %r10), %YMM3
++
++	VPCMP	$4, %YMM0, %YMM2, %k0
++	VPCMP	$0, %YMMZERO, %YMM0, %k1
++	VPCMP	$0, %YMMZERO, %YMM2, %k2
++	kord	%k1, %k2, %k1
++	/* Each bit in K1 represents a NULL or a mismatch in YMM0 and
++	   YMM2.  */
++	kord	%k0, %k1, %k1
++
++	VPCMP	$4, %YMM1, %YMM3, %k3
++	VPCMP	$0, %YMMZERO, %YMM1, %k4
++	VPCMP	$0, %YMMZERO, %YMM3, %k5
++	kord	%k4, %k5, %k4
++	/* Each bit in K3 represents a NULL or a mismatch in YMM1 and
++	   YMM3.  */
++	kord	%k3, %k4, %k3
++
++# ifdef USE_AS_WCSCMP
++	/* NB: Each bit in K1/K3 represents a 4-byte element.  */
++	kshiftlw $8, %k3, %k2
++# else
++	kshiftlq $32, %k3, %k2
++# endif
++
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	korq	%k1, %k2, %k1
++	kmovq	%k1, %rdi
++
++	xorl	%r8d, %r8d
++	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
++	subl	$(VEC_SIZE * 2), %ecx
++	jle	1f
++	/* R8 has the number of bytes skipped.  */
++	movl	%ecx, %r8d
++# ifdef USE_AS_WCSCMP
++	/* NB: Divide shift count by 4 since each bit in K1 represents 4
++	   bytes.  */
++	sarl	$2, %ecx
++# endif
++	/* Skip ECX bytes.  */
++	shrq	%cl, %rdi
++1:
++	/* Before jumping back to the loop, set ESI to the number of
++	   VEC_SIZE * 4 blocks before page crossing.  */
++	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
++
++	testq	%rdi, %rdi
++# ifdef USE_AS_STRNCMP
++	/* At this point, if %rdi is 0, VEC_SIZE*4+%r10 bytes starting
++	   from %rax have already been tested.  This label checks whether
++	   the strncmp maximum offset has been reached.  */
++	je	L(string_nbyte_offset_check)
++# else
++	je	L(back_to_loop)
++# endif
++	tzcntq	%rdi, %rcx
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %ecx
++# endif
++	addq	%r10, %rcx
++	/* Adjust for number of bytes skipped.  */
++	addq	%r8, %rcx
++# ifdef USE_AS_STRNCMP
++	addq	$(VEC_SIZE * 2), %rcx
++	subq	%rcx, %r11
++	jbe	L(zero)
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(%rsi, %rcx), %edi
++	cmpl	(%rdx, %rcx), %edi
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rax, %rcx), %eax
++	movzbl	(%rdx, %rcx), %edx
++	subl	%edx, %eax
++#  endif
++# else
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
++	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
++	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
++	subl	%edx, %eax
++#  endif
++# endif
++	ret
++
++# ifdef USE_AS_STRNCMP
++L(string_nbyte_offset_check):
++	leaq	(VEC_SIZE * 4)(%r10), %r10
++	cmpq	%r10, %r11
++	jbe	L(zero)
++	jmp	L(back_to_loop)
++# endif
++
++	.p2align 4
++L(cross_page_loop):
++	/* Check one byte/dword at a time.  */
++# ifdef USE_AS_WCSCMP
++	cmpl	%ecx, %eax
++# else
++	subl	%ecx, %eax
++# endif
++	jne	L(different)
++	addl	$SIZE_OF_CHAR, %edx
++	cmpl	$(VEC_SIZE * 4), %edx
++	je	L(main_loop_header)
++# ifdef USE_AS_STRNCMP
++	cmpq	%r11, %rdx
++	jae	L(zero)
++# endif
++# ifdef USE_AS_WCSCMP
++	movl	(%rdi, %rdx), %eax
++	movl	(%rsi, %rdx), %ecx
++# else
++	movzbl	(%rdi, %rdx), %eax
++	movzbl	(%rsi, %rdx), %ecx
++# endif
++	/* Check null char.  */
++	testl	%eax, %eax
++	jne	L(cross_page_loop)
++	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
++	   comparisons.  */
++	subl	%ecx, %eax
++# ifndef USE_AS_WCSCMP
++L(different):
++# endif
++	ret
++
++# ifdef USE_AS_WCSCMP
++	.p2align 4
++L(different):
++	/* Use movl to avoid modifying EFLAGS.  */
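++	/* EFLAGS still hold the result of the earlier dword compare:
++	   SETL/NEGL/ORL map "less than" to -1 and anything else to 1
++	   without branching.  */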
++	movl	$0, %eax
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
++	ret
++# endif
++
++# ifdef USE_AS_STRNCMP
++	.p2align 4
++L(zero):
++	xorl	%eax, %eax
++	ret
++
++	.p2align 4
++L(char0):
++#  ifdef USE_AS_WCSCMP
++	xorl	%eax, %eax
++	movl	(%rdi), %ecx
++	cmpl	(%rsi), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rsi), %ecx
++	movzbl	(%rdi), %eax
++	subl	%ecx, %eax
++#  endif
++	ret
++# endif
++
++	.p2align 4
++L(last_vector):
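++	/* RDX is the offset of the vector that held the first mismatch
++	   or null; point RDI/RSI at that vector and locate the exact
++	   element below with TZCNT.  */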
++	addq	%rdx, %rdi
++	addq	%rdx, %rsi
++# ifdef USE_AS_STRNCMP
++	subq	%rdx, %r11
++# endif
++	tzcntl	%ecx, %edx
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %edx
++# endif
++# ifdef USE_AS_STRNCMP
++	cmpq	%r11, %rdx
++	jae	L(zero)
++# endif
++# ifdef USE_AS_WCSCMP
++	xorl	%eax, %eax
++	movl	(%rdi, %rdx), %ecx
++	cmpl	(%rsi, %rdx), %ecx
++	jne	L(wcscmp_return)
++# else
++	movzbl	(%rdi, %rdx), %eax
++	movzbl	(%rsi, %rdx), %edx
++	subl	%edx, %eax
++# endif
++	ret
++
++	/* Comparing in the page boundary region requires special
++	   treatment: it must be done one vector at a time, starting with
++	   the wider ymm vector if possible, otherwise with xmm.  If
++	   fetching 16 bytes (xmm) would still cross the boundary, the
++	   comparison must be done byte by byte.  */
++	.p2align 4
++L(cross_page):
++	/* Try one ymm vector at a time.  */
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	jg	L(cross_page_1_vector)
++L(loop_1_vector):
++	VMOVU	(%rdi, %rdx), %YMM0
++	VMOVU	(%rsi, %rdx), %YMM1
++
++	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
++	VPCMP	$4, %YMM0, %YMM1, %k0
++	VPCMP	$0, %YMMZERO, %YMM0, %k1
++	VPCMP	$0, %YMMZERO, %YMM1, %k2
++	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
++	kord	%k1, %k2, %k1
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	kord	%k0, %k1, %k1
++	kmovd	%k1, %ecx
++	testl	%ecx, %ecx
++	jne	L(last_vector)
++
++	addl	$VEC_SIZE, %edx
++
++	addl	$VEC_SIZE, %eax
++# ifdef USE_AS_STRNCMP
++	/* Return 0 if the current offset (%rdx) >= the maximum offset
++	   (%r11).  */
++	cmpq	%r11, %rdx
++	jae	L(zero)
++# endif
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	jle	L(loop_1_vector)
++L(cross_page_1_vector):
++	/* Less than 32 bytes to check, try one xmm vector.  */
++	cmpl	$(PAGE_SIZE - 16), %eax
++	jg	L(cross_page_1_xmm)
++	VMOVU	(%rdi, %rdx), %XMM0
++	VMOVU	(%rsi, %rdx), %XMM1
++
++	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
++	VPCMP	$4, %XMM0, %XMM1, %k0
++	VPCMP	$0, %XMMZERO, %XMM0, %k1
++	VPCMP	$0, %XMMZERO, %XMM1, %k2
++	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
++	korw	%k1, %k2, %k1
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	korw	%k0, %k1, %k1
++	kmovw	%k1, %ecx
++	testl	%ecx, %ecx
++	jne	L(last_vector)
++
++	addl	$16, %edx
++# ifndef USE_AS_WCSCMP
++	addl	$16, %eax
++# endif
++# ifdef USE_AS_STRNCMP
++	/* Return 0 if the current offset (%rdx) >= the maximum offset
++	   (%r11).  */
++	cmpq	%r11, %rdx
++	jae	L(zero)
++# endif
++
++L(cross_page_1_xmm):
++# ifndef USE_AS_WCSCMP
++	/* Less than 16 bytes to check, try an 8-byte vector.  NB: No
++	   need for wcscmp or wcsncmp since a wide char is 4 bytes.  */
++	cmpl	$(PAGE_SIZE - 8), %eax
++	jg	L(cross_page_8bytes)
++	vmovq	(%rdi, %rdx), %XMM0
++	vmovq	(%rsi, %rdx), %XMM1
++
++	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
++	VPCMP	$4, %XMM0, %XMM1, %k0
++	VPCMP	$0, %XMMZERO, %XMM0, %k1
++	VPCMP	$0, %XMMZERO, %XMM1, %k2
++	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
++	kord	%k1, %k2, %k1
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	kord	%k0, %k1, %k1
++	kmovd	%k1, %ecx
++
++# ifdef USE_AS_WCSCMP
++	/* Only the last 2 bits are valid.  */
++	andl	$0x3, %ecx
++# else
++	/* Only the last 8 bits are valid.  */
++	andl	$0xff, %ecx
++# endif
++
++	testl	%ecx, %ecx
++	jne	L(last_vector)
++
++	addl	$8, %edx
++	addl	$8, %eax
++#  ifdef USE_AS_STRNCMP
++	/* Return 0 if the current offset (%rdx) >= the maximum offset
++	   (%r11).  */
++	cmpq	%r11, %rdx
++	jae	L(zero)
++#  endif
++
++L(cross_page_8bytes):
++	/* Less than 8 bytes to check, try a 4-byte vector.  */
++	cmpl	$(PAGE_SIZE - 4), %eax
++	jg	L(cross_page_4bytes)
++	vmovd	(%rdi, %rdx), %XMM0
++	vmovd	(%rsi, %rdx), %XMM1
++
++	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
++	VPCMP	$4, %XMM0, %XMM1, %k0
++	VPCMP	$0, %XMMZERO, %XMM0, %k1
++	VPCMP	$0, %XMMZERO, %XMM1, %k2
++	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
++	kord	%k1, %k2, %k1
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	kord	%k0, %k1, %k1
++	kmovd	%k1, %ecx
++
++# ifdef USE_AS_WCSCMP
++	/* Only the last bit is valid.  */
++	andl	$0x1, %ecx
++# else
++	/* Only the last 4 bits are valid.  */
++	andl	$0xf, %ecx
++# endif
++
++	testl	%ecx, %ecx
++	jne	L(last_vector)
++
++	addl	$4, %edx
++#  ifdef USE_AS_STRNCMP
++	/* Return 0 if the current offset (%rdx) >= the maximum offset
++	   (%r11).  */
++	cmpq	%r11, %rdx
++	jae	L(zero)
++#  endif
++
++L(cross_page_4bytes):
++# endif
++	/* Less than 4 bytes to check, try one byte/dword at a time.  */
++# ifdef USE_AS_STRNCMP
++	cmpq	%r11, %rdx
++	jae	L(zero)
++# endif
++# ifdef USE_AS_WCSCMP
++	movl	(%rdi, %rdx), %eax
++	movl	(%rsi, %rdx), %ecx
++# else
++	movzbl	(%rdi, %rdx), %eax
++	movzbl	(%rsi, %rdx), %ecx
++# endif
++	testl	%eax, %eax
++	jne	L(cross_page_loop)
++	subl	%ecx, %eax
++	ret
++END (STRCMP)
++#endif
+diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
+index 3f433fbc..c5f38510 100644
+--- a/sysdeps/x86_64/multiarch/strcmp.c
++++ b/sysdeps/x86_64/multiarch/strcmp.c
+@@ -30,16 +30,25 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    return OPTIMIZE (avx2);
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
++	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
++	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
++	return OPTIMIZE (evex);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	return OPTIMIZE (avx2);
++    }
+ 
+   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
+     return OPTIMIZE (sse2_unaligned);
+diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
+new file mode 100644
+index 00000000..cd022509
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strlen-evex.S
+@@ -0,0 +1,436 @@
++/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if IS_IN (libc)
++
++# include <sysdep.h>
++
++# ifndef STRLEN
++#  define STRLEN	__strlen_evex
++# endif
++
++# define VMOVA		vmovdqa64
++
++# ifdef USE_AS_WCSLEN
++#  define VPCMP		vpcmpd
++#  define VPMINU	vpminud
++#  define SHIFT_REG	r9d
++# else
++#  define VPCMP		vpcmpb
++#  define VPMINU	vpminub
++#  define SHIFT_REG	ecx
++# endif
++
++# define XMMZERO	xmm16
++# define YMMZERO	ymm16
++# define YMM1		ymm17
++# define YMM2		ymm18
++# define YMM3		ymm19
++# define YMM4		ymm20
++# define YMM5		ymm21
++# define YMM6		ymm22
++
++# define VEC_SIZE 32
++
++	.section .text.evex,"ax",@progbits
++ENTRY (STRLEN)
++# ifdef USE_AS_STRNLEN
++	/* Check for zero length.  */
++	test	%RSI_LP, %RSI_LP
++	jz	L(zero)
++#  ifdef USE_AS_WCSLEN
++	shl	$2, %RSI_LP
++#  elif defined __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%esi, %esi
++#  endif
++	mov	%RSI_LP, %R8_LP
++# endif
++	movl	%edi, %ecx
++	movq	%rdi, %rdx
++	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
++
++	/* Check if we may cross a page boundary with one vector load.  */
++	andl	$(2 * VEC_SIZE - 1), %ecx
++	cmpl	$VEC_SIZE, %ecx
++	ja	L(cros_page_boundary)
++
++	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
++	   null byte.  */
++	VPCMP	$0, (%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++
++# ifdef USE_AS_STRNLEN
++	jnz	L(first_vec_x0_check)
++	/* Adjust length and check the end of data.  */
++	subq	$VEC_SIZE, %rsi
++	jbe	L(max)
++# else
++	jnz	L(first_vec_x0)
++# endif
++
++	/* Align data for aligned loads in the loop.  */
++	addq	$VEC_SIZE, %rdi
++	andl	$(VEC_SIZE - 1), %ecx
++	andq	$-VEC_SIZE, %rdi
++
++# ifdef USE_AS_STRNLEN
++	/* Adjust length.  */
++	addq	%rcx, %rsi
++
++	subq	$(VEC_SIZE * 4), %rsi
++	jbe	L(last_4x_vec_or_less)
++# endif
++	jmp	L(more_4x_vec)
++
++	.p2align 4
++L(cros_page_boundary):
++	andl	$(VEC_SIZE - 1), %ecx
++	andq	$-VEC_SIZE, %rdi
++
++# ifdef USE_AS_WCSLEN
++	/* NB: Divide shift count by 4 since each bit in K0 represents 4
++	   bytes.  */
++	movl	%ecx, %SHIFT_REG
++	sarl	$2, %SHIFT_REG
++# endif
++	VPCMP	$0, (%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++
++	/* Remove the leading bytes.  */
++	sarxl	%SHIFT_REG, %eax, %eax
++	testl	%eax, %eax
++	jz	L(aligned_more)
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++# ifdef USE_AS_STRNLEN
++	/* Check the end of data.  */
++	cmpq	%rax, %rsi
++	jbe	L(max)
++# endif
++	addq	%rdi, %rax
++	addq	%rcx, %rax
++	subq	%rdx, %rax
++# ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++# endif
++	ret
++
++	.p2align 4
++L(aligned_more):
++# ifdef USE_AS_STRNLEN
++	/* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
++	   with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
++	   to avoid possible addition overflow.  */
++	negq	%rcx
++	addq	$VEC_SIZE, %rcx
++
++	/* Check the end of data.  */
++	subq	%rcx, %rsi
++	jbe	L(max)
++# endif
++
++	addq	$VEC_SIZE, %rdi
++
++# ifdef USE_AS_STRNLEN
++	subq	$(VEC_SIZE * 4), %rsi
++	jbe	L(last_4x_vec_or_less)
++# endif
++
++L(more_4x_vec):
++	/* Check the first 4 * VEC_SIZE bytes.  Only one VEC_SIZE at a
++	   time since the data is only aligned to VEC_SIZE.  */
++	VPCMP	$0, (%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x0)
++
++	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++
++	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x2)
++
++	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x3)
++
++	addq	$(VEC_SIZE * 4), %rdi
++
++# ifdef USE_AS_STRNLEN
++	subq	$(VEC_SIZE * 4), %rsi
++	jbe	L(last_4x_vec_or_less)
++# endif
++
++	/* Align data to 4 * VEC_SIZE.  */
++	movq	%rdi, %rcx
++	andl	$(4 * VEC_SIZE - 1), %ecx
++	andq	$-(4 * VEC_SIZE), %rdi
++
++# ifdef USE_AS_STRNLEN
++	/* Adjust length.  */
++	addq	%rcx, %rsi
++# endif
++
++	.p2align 4
++L(loop_4x_vec):
++	/* Compare 4 * VEC at a time forward.  */
++	VMOVA	(%rdi), %YMM1
++	VMOVA	VEC_SIZE(%rdi), %YMM2
++	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM3
++	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM4
++
++	VPMINU	%YMM1, %YMM2, %YMM5
++	VPMINU	%YMM3, %YMM4, %YMM6
++
++	VPMINU	%YMM5, %YMM6, %YMM5
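++	/* An element of YMM5 is zero iff the corresponding element is
++	   zero in at least one of YMM1-YMM4, so a single compare against
++	   YMMZERO detects a null anywhere in these 4 * VEC_SIZE bytes.  */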
++	VPCMP	$0, %YMM5, %YMMZERO, %k0
++	ktestd	%k0, %k0
++	jnz	L(4x_vec_end)
++
++	addq	$(VEC_SIZE * 4), %rdi
++
++# ifndef USE_AS_STRNLEN
++	jmp	L(loop_4x_vec)
++# else
++	subq	$(VEC_SIZE * 4), %rsi
++	ja	L(loop_4x_vec)
++
++L(last_4x_vec_or_less):
++	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
++	addl	$(VEC_SIZE * 2), %esi
++	jle	L(last_2x_vec)
++
++	VPCMP	$0, (%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x0)
++
++	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++
++	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x2_check)
++	subl	$VEC_SIZE, %esi
++	jle	L(max)
++
++	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x3_check)
++	movq	%r8, %rax
++#  ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++#  endif
++	ret
++
++	.p2align 4
++L(last_2x_vec):
++	addl	$(VEC_SIZE * 2), %esi
++
++	VPCMP	$0, (%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x0_check)
++	subl	$VEC_SIZE, %esi
++	jle	L(max)
++
++	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1_check)
++	movq	%r8, %rax
++#  ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++#  endif
++	ret
++
++	.p2align 4
++L(first_vec_x0_check):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	/* Check the end of data.  */
++	cmpq	%rax, %rsi
++	jbe	L(max)
++	addq	%rdi, %rax
++	subq	%rdx, %rax
++#  ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++#  endif
++	ret
++
++	.p2align 4
++L(first_vec_x1_check):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	/* Check the end of data.  */
++	cmpq	%rax, %rsi
++	jbe	L(max)
++	addq	$VEC_SIZE, %rax
++	addq	%rdi, %rax
++	subq	%rdx, %rax
++#  ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++#  endif
++	ret
++
++	.p2align 4
++L(first_vec_x2_check):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	/* Check the end of data.  */
++	cmpq	%rax, %rsi
++	jbe	L(max)
++	addq	$(VEC_SIZE * 2), %rax
++	addq	%rdi, %rax
++	subq	%rdx, %rax
++#  ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++#  endif
++	ret
++
++	.p2align 4
++L(first_vec_x3_check):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	/* Check the end of data.  */
++	cmpq	%rax, %rsi
++	jbe	L(max)
++	addq	$(VEC_SIZE * 3), %rax
++	addq	%rdi, %rax
++	subq	%rdx, %rax
++#  ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++#  endif
++	ret
++
++	.p2align 4
++L(max):
++	movq	%r8, %rax
++#  ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++#  endif
++	ret
++
++	.p2align 4
++L(zero):
++	xorl	%eax, %eax
++	ret
++# endif
++
++	.p2align 4
++L(first_vec_x0):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	addq	%rdi, %rax
++	subq	%rdx, %rax
++# ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++# endif
++	ret
++
++	.p2align 4
++L(first_vec_x1):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	addq	$VEC_SIZE, %rax
++	addq	%rdi, %rax
++	subq	%rdx, %rax
++# ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++# endif
++	ret
++
++	.p2align 4
++L(first_vec_x2):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	addq	$(VEC_SIZE * 2), %rax
++	addq	%rdi, %rax
++	subq	%rdx, %rax
++# ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++# endif
++	ret
++
++	.p2align 4
++L(4x_vec_end):
++	VPCMP	$0, %YMM1, %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x0)
++	VPCMP	$0, %YMM2, %YMMZERO, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++	VPCMP	$0, %YMM3, %YMMZERO, %k2
++	kmovd	%k2, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x2)
++	VPCMP	$0, %YMM4, %YMMZERO, %k3
++	kmovd	%k3, %eax
++L(first_vec_x3):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	addq	$(VEC_SIZE * 3), %rax
++	addq	%rdi, %rax
++	subq	%rdx, %rax
++# ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++# endif
++	ret
++
++END (STRLEN)
++#endif
+diff --git a/sysdeps/x86_64/multiarch/strncmp-evex.S b/sysdeps/x86_64/multiarch/strncmp-evex.S
+new file mode 100644
+index 00000000..a1d53e8c
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strncmp-evex.S
+@@ -0,0 +1,3 @@
++#define STRCMP	__strncmp_evex
++#define USE_AS_STRNCMP 1
++#include "strcmp-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
+index 686d654f..4c15542f 100644
+--- a/sysdeps/x86_64/multiarch/strncmp.c
++++ b/sysdeps/x86_64/multiarch/strncmp.c
+@@ -30,16 +30,25 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    return OPTIMIZE (avx2);
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
++	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
++	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
++	return OPTIMIZE (evex);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	return OPTIMIZE (avx2);
++    }
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
+       && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
+diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
+new file mode 100644
+index 00000000..722022f3
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
+@@ -0,0 +1,4 @@
++#define STRLEN __strnlen_evex
++#define USE_AS_STRNLEN 1
++
++#include "strlen-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
+new file mode 100644
+index 00000000..f920b5a5
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
+@@ -0,0 +1,265 @@
++/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if IS_IN (libc)
++
++# include <sysdep.h>
++
++# ifndef STRRCHR
++#  define STRRCHR	__strrchr_evex
++# endif
++
++# define VMOVU		vmovdqu64
++# define VMOVA		vmovdqa64
++
++# ifdef USE_AS_WCSRCHR
++#  define VPBROADCAST	vpbroadcastd
++#  define VPCMP		vpcmpd
++#  define SHIFT_REG	r8d
++# else
++#  define VPBROADCAST	vpbroadcastb
++#  define VPCMP		vpcmpb
++#  define SHIFT_REG	ecx
++# endif
++
++# define XMMZERO	xmm16
++# define YMMZERO	ymm16
++# define YMMMATCH	ymm17
++# define YMM1		ymm18
++
++# define VEC_SIZE	32
++
++	.section .text.evex,"ax",@progbits
++ENTRY (STRRCHR)
++	movl	%edi, %ecx
++	/* Broadcast CHAR to YMMMATCH.  */
++	VPBROADCAST %esi, %YMMMATCH
++
++	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
++
++	/* Check if we may cross a page boundary with one vector load.  */
++	andl	$(2 * VEC_SIZE - 1), %ecx
++	cmpl	$VEC_SIZE, %ecx
++	ja	L(cros_page_boundary)
++
++	VMOVU	(%rdi), %YMM1
++
++	/* Each bit in K0 represents a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM1, %k0
++	/* Each bit in K1 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMMMATCH, %YMM1, %k1
++	kmovd	%k0, %ecx
++	kmovd	%k1, %eax
++
++	addq	$VEC_SIZE, %rdi
++
++	testl	%eax, %eax
++	jnz	L(first_vec)
++
++	testl	%ecx, %ecx
++	jnz	L(return_null)
++
++	andq	$-VEC_SIZE, %rdi
++	xorl	%edx, %edx
++	jmp	L(aligned_loop)
++
++	.p2align 4
++L(first_vec):
++	/* Check if there is a null byte.  */
++	testl	%ecx, %ecx
++	jnz	L(char_and_nul_in_first_vec)
++
++	/* Remember the match and keep searching.  */
++	movl	%eax, %edx
++	movq	%rdi, %rsi
++	andq	$-VEC_SIZE, %rdi
++	jmp	L(aligned_loop)
++
++	.p2align 4
++L(cros_page_boundary):
++	andl	$(VEC_SIZE - 1), %ecx
++	andq	$-VEC_SIZE, %rdi
++
++# ifdef USE_AS_WCSRCHR
++	/* NB: Divide shift count by 4 since each bit in K1 represents 4
++	   bytes.  */
++	movl	%ecx, %SHIFT_REG
++	sarl	$2, %SHIFT_REG
++# endif
++
++	VMOVA	(%rdi), %YMM1
++
++	/* Each bit in K0 represents a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM1, %k0
++	/* Each bit in K1 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMMMATCH, %YMM1, %k1
++	kmovd	%k0, %edx
++	kmovd	%k1, %eax
++
++	shrxl	%SHIFT_REG, %edx, %edx
++	shrxl	%SHIFT_REG, %eax, %eax
++	addq	$VEC_SIZE, %rdi
++
++	/* Check if there is a CHAR.  */
++	testl	%eax, %eax
++	jnz	L(found_char)
++
++	testl	%edx, %edx
++	jnz	L(return_null)
++
++	jmp	L(aligned_loop)
++
++	.p2align 4
++L(found_char):
++	testl	%edx, %edx
++	jnz	L(char_and_nul)
++
++	/* Remember the match and keep searching.  */
++	movl	%eax, %edx
++	leaq	(%rdi, %rcx), %rsi
++
++	.p2align 4
++L(aligned_loop):
++	VMOVA	(%rdi), %YMM1
++	addq	$VEC_SIZE, %rdi
++
++	/* Each bit in K0 represents a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM1, %k0
++	/* Each bit in K1 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMMMATCH, %YMM1, %k1
++	kmovd	%k0, %ecx
++	kmovd	%k1, %eax
++	orl	%eax, %ecx
++	jnz	L(char_nor_null)
++
++	VMOVA	(%rdi), %YMM1
++	add	$VEC_SIZE, %rdi
++
++	/* Each bit in K0 represents a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM1, %k0
++	/* Each bit in K1 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMMMATCH, %YMM1, %k1
++	kmovd	%k0, %ecx
++	kmovd	%k1, %eax
++	orl	%eax, %ecx
++	jnz	L(char_nor_null)
++
++	VMOVA	(%rdi), %YMM1
++	addq	$VEC_SIZE, %rdi
++
++	/* Each bit in K0 represents a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM1, %k0
++	/* Each bit in K1 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMMMATCH, %YMM1, %k1
++	kmovd	%k0, %ecx
++	kmovd	%k1, %eax
++	orl	%eax, %ecx
++	jnz	L(char_nor_null)
++
++	VMOVA	(%rdi), %YMM1
++	addq	$VEC_SIZE, %rdi
++
++	/* Each bit in K0 represents a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM1, %k0
++	/* Each bit in K1 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMMMATCH, %YMM1, %k1
++	kmovd	%k0, %ecx
++	kmovd	%k1, %eax
++	orl	%eax, %ecx
++	jz	L(aligned_loop)
++
++	.p2align 4
++L(char_nor_null):
++	/* Find a CHAR or a null byte in a loop.  */
++	testl	%eax, %eax
++	jnz	L(match)
++L(return_value):
++	testl	%edx, %edx
++	jz	L(return_null)
++	movl	%edx, %eax
++	movq	%rsi, %rdi
++	bsrl	%eax, %eax
++# ifdef USE_AS_WCSRCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
++# else
++	leaq	-VEC_SIZE(%rdi, %rax), %rax
++# endif
++	ret
++
++	.p2align 4
++L(match):
++	/* Find a CHAR.  Check if there is a null byte.  */
++	kmovd	%k0, %ecx
++	testl	%ecx, %ecx
++	jnz	L(find_nul)
++
++	/* Remember the match and keep searching.  */
++	movl	%eax, %edx
++	movq	%rdi, %rsi
++	jmp	L(aligned_loop)
++
++	.p2align 4
++L(find_nul):
++	/* Mask out any matching bits after the null byte.  */
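++	/* (ECX - 1) ^ ECX keeps the lowest null bit and all bits below
++	   it, so only CHAR matches at or before the first null survive
++	   the AND.  */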
++	movl	%ecx, %r8d
++	subl	$1, %r8d
++	xorl	%ecx, %r8d
++	andl	%r8d, %eax
++	testl	%eax, %eax
++	/* If there is no CHAR here, return the remembered one.  */
++	jz	L(return_value)
++	bsrl	%eax, %eax
++# ifdef USE_AS_WCSRCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
++# else
++	leaq	-VEC_SIZE(%rdi, %rax), %rax
++# endif
++	ret
++
++	.p2align 4
++L(char_and_nul):
++	/* Find both a CHAR and a null byte.  */
++	addq	%rcx, %rdi
++	movl	%edx, %ecx
++L(char_and_nul_in_first_vec):
++	/* Mask out any matching bits after the null byte.  */
++	movl	%ecx, %r8d
++	subl	$1, %r8d
++	xorl	%ecx, %r8d
++	andl	%r8d, %eax
++	testl	%eax, %eax
++	/* Return null pointer if the null byte comes first.  */
++	jz	L(return_null)
++	bsrl	%eax, %eax
++# ifdef USE_AS_WCSRCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
++# else
++	leaq	-VEC_SIZE(%rdi, %rax), %rax
++# endif
++	ret
++
++	.p2align 4
++L(return_null):
++	xorl	%eax, %eax
++	ret
++
++END (STRRCHR)
++#endif
+diff --git a/sysdeps/x86_64/multiarch/wcschr-evex.S b/sysdeps/x86_64/multiarch/wcschr-evex.S
+new file mode 100644
+index 00000000..7cb8f1e4
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcschr-evex.S
+@@ -0,0 +1,3 @@
++#define STRCHR __wcschr_evex
++#define USE_AS_WCSCHR 1
++#include "strchr-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/wcscmp-evex.S b/sysdeps/x86_64/multiarch/wcscmp-evex.S
+new file mode 100644
+index 00000000..42e73e51
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcscmp-evex.S
+@@ -0,0 +1,4 @@
++#define STRCMP __wcscmp_evex
++#define USE_AS_WCSCMP 1
++
++#include "strcmp-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/wcslen-evex.S b/sysdeps/x86_64/multiarch/wcslen-evex.S
+new file mode 100644
+index 00000000..bdafa83b
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcslen-evex.S
+@@ -0,0 +1,4 @@
++#define STRLEN __wcslen_evex
++#define USE_AS_WCSLEN 1
++
++#include "strlen-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsncmp-evex.S b/sysdeps/x86_64/multiarch/wcsncmp-evex.S
+new file mode 100644
+index 00000000..8a8e3107
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcsncmp-evex.S
+@@ -0,0 +1,5 @@
++#define STRCMP __wcsncmp_evex
++#define USE_AS_STRNCMP 1
++#define USE_AS_WCSCMP 1
++
++#include "strcmp-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
+new file mode 100644
+index 00000000..24773bb4
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
+@@ -0,0 +1,5 @@
++#define STRLEN __wcsnlen_evex
++#define USE_AS_WCSLEN 1
++#define USE_AS_STRNLEN 1
++
++#include "strlen-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
+index b3144c93..84254b83 100644
+--- a/sysdeps/x86_64/multiarch/wcsnlen.c
++++ b/sysdeps/x86_64/multiarch/wcsnlen.c
+@@ -29,16 +29,24 @@
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    return OPTIMIZE (avx2);
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
++	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
++	return OPTIMIZE (evex);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	return OPTIMIZE (avx2);
++    }
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+     return OPTIMIZE (sse4_1);
+diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
+new file mode 100644
+index 00000000..c64602f7
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
+@@ -0,0 +1,3 @@
++#define STRRCHR __wcsrchr_evex
++#define USE_AS_WCSRCHR 1
++#include "strrchr-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S
+new file mode 100644
+index 00000000..06cd0f9f
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S
+@@ -0,0 +1,4 @@
++#define MEMCHR __wmemchr_evex
++#define USE_AS_WMEMCHR 1
++
++#include "memchr-evex.S"
+-- 
+GitLab
+
diff --git a/SOURCES/ia-upd-256bit-evex-instr-2.patch b/SOURCES/ia-upd-256bit-evex-instr-2.patch
new file mode 100644
index 0000000..9db964f
--- /dev/null
+++ b/SOURCES/ia-upd-256bit-evex-instr-2.patch
@@ -0,0 +1,1489 @@
+From 98192464b47c056515b6ac5ff218c197bd75618d Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 5 Mar 2021 06:36:50 -0800
+Subject: [PATCH] x86-64: Add strcpy family functions with 256-bit EVEX
+
+Update ifunc-strcpy.h to select the function optimized with 256-bit EVEX
+instructions using YMM16-YMM31 registers to avoid RTM abort with usable
+AVX512VL and AVX512BW since VZEROUPPER isn't needed at function exit.
+
+(cherry picked from commit 525bc2a32c9710df40371f951217c6ae7a923aee)
+---
+ sysdeps/x86_64/multiarch/Makefile          |    6 +
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c |   24 +
+ sysdeps/x86_64/multiarch/ifunc-strcpy.h    |   13 +-
+ sysdeps/x86_64/multiarch/stpcpy-evex.S     |    3 +
+ sysdeps/x86_64/multiarch/stpncpy-evex.S    |    4 +
+ sysdeps/x86_64/multiarch/strcat-evex.S     |  283 ++++++
+ sysdeps/x86_64/multiarch/strcpy-evex.S     | 1003 ++++++++++++++++++++
+ sysdeps/x86_64/multiarch/strncat-evex.S    |    3 +
+ sysdeps/x86_64/multiarch/strncpy-evex.S    |    3 +
+ 9 files changed, 1339 insertions(+), 3 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/stpcpy-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/stpncpy-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strcat-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strcpy-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strncat-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strncpy-evex.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 5ce85882..46783cd1 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -43,11 +43,17 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   memchr-evex \
+ 		   memrchr-evex \
+ 		   rawmemchr-evex \
++		   stpcpy-evex \
++		   stpncpy-evex \
++		   strcat-evex \
+ 		   strchr-evex \
+ 		   strchrnul-evex \
+ 		   strcmp-evex \
++		   strcpy-evex \
+ 		   strlen-evex \
++		   strncat-evex \
+ 		   strncmp-evex \
++		   strncpy-evex \
+ 		   strnlen-evex \
+ 		   strrchr-evex
+ CFLAGS-varshift.c += -msse4
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index bd7d9f19..082e4da3 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -224,6 +224,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __stpncpy_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
+ 			      __stpncpy_avx2)
++	      IFUNC_IMPL_ADD (array, i, stpncpy,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __stpncpy_evex)
+ 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
+ 			      __stpncpy_sse2_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
+@@ -234,6 +238,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __stpcpy_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
+ 			      __stpcpy_avx2)
++	      IFUNC_IMPL_ADD (array, i, stpcpy,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __stpcpy_evex)
+ 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
+ 
+@@ -268,6 +276,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, strcat,
+ 	      IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2),
+ 			      __strcat_avx2)
++	      IFUNC_IMPL_ADD (array, i, strcat,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strcat_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3),
+ 			      __strcat_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
+@@ -330,6 +342,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, strcpy,
+ 	      IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2),
+ 			      __strcpy_avx2)
++	      IFUNC_IMPL_ADD (array, i, strcpy,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strcpy_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
+ 			      __strcpy_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
+@@ -373,6 +389,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, strncat,
+ 	      IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2),
+ 			      __strncat_avx2)
++	      IFUNC_IMPL_ADD (array, i, strncat,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strncat_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3),
+ 			      __strncat_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
+@@ -383,6 +403,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, strncpy,
+ 	      IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2),
+ 			      __strncpy_avx2)
++	      IFUNC_IMPL_ADD (array, i, strncpy,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strncpy_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
+ 			      __strncpy_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
+diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+index 100dca5c..deae6348 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
++++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+@@ -25,16 +25,23 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
+   attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    return OPTIMIZE (avx2);
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
++	return OPTIMIZE (evex);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	return OPTIMIZE (avx2);
++    }
+ 
+   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
+     return OPTIMIZE (sse2_unaligned);
+diff --git a/sysdeps/x86_64/multiarch/stpcpy-evex.S b/sysdeps/x86_64/multiarch/stpcpy-evex.S
+new file mode 100644
+index 00000000..7c6f26cd
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/stpcpy-evex.S
+@@ -0,0 +1,3 @@
++#define USE_AS_STPCPY
++#define STRCPY __stpcpy_evex
++#include "strcpy-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
+new file mode 100644
+index 00000000..1570014d
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
+@@ -0,0 +1,4 @@
++#define USE_AS_STPCPY
++#define USE_AS_STRNCPY
++#define STRCPY __stpncpy_evex
++#include "strcpy-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
+new file mode 100644
+index 00000000..97c3d85b
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strcat-evex.S
+@@ -0,0 +1,283 @@
++/* strcat with 256-bit EVEX instructions.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if IS_IN (libc)
++
++# include <sysdep.h>
++
++# ifndef STRCAT
++#  define STRCAT  __strcat_evex
++# endif
++
++# define VMOVU		vmovdqu64
++# define VMOVA		vmovdqa64
++
++/* zero register */
++# define XMMZERO	xmm16
++# define YMMZERO	ymm16
++# define YMM0		ymm17
++# define YMM1		ymm18
++
++# define USE_AS_STRCAT
++
++/* Number of bytes in a vector register */
++# define VEC_SIZE	32
++
++	.section .text.evex,"ax",@progbits
++ENTRY (STRCAT)
++	mov	%rdi, %r9
++# ifdef USE_AS_STRNCAT
++	mov	%rdx, %r8
++# endif
++
++	xor	%eax, %eax
++	mov	%edi, %ecx
++	and	$((VEC_SIZE * 4) - 1), %ecx
++	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
++	cmp	$(VEC_SIZE * 3), %ecx
++	ja	L(fourth_vector_boundary)
++	vpcmpb	$0, (%rdi), %YMMZERO, %k0
++	kmovd	%k0, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_first_vector)
++	mov	%rdi, %rax
++	and	$-VEC_SIZE, %rax
++	jmp	L(align_vec_size_start)
++L(fourth_vector_boundary):
++	mov	%rdi, %rax
++	and	$-VEC_SIZE, %rax
++	vpcmpb	$0, (%rax), %YMMZERO, %k0
++	mov	$-1, %r10d
++	sub	%rax, %rcx
++	shl	%cl, %r10d
++	kmovd	%k0, %edx
++	and	%r10d, %edx
++	jnz	L(exit)
++
++L(align_vec_size_start):
++	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
++	kmovd	%k0, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_second_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
++	kmovd	%k1, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_third_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
++	kmovd	%k2, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_fourth_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
++	kmovd	%k3, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_fifth_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
++	add	$(VEC_SIZE * 4), %rax
++	kmovd	%k4, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_second_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
++	kmovd	%k1, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_third_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
++	kmovd	%k2, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_fourth_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
++	kmovd	%k3, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_fifth_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
++	kmovd	%k4, %edx
++	add	$(VEC_SIZE * 4), %rax
++	test	%edx, %edx
++	jnz	L(exit_null_on_second_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
++	kmovd	%k1, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_third_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
++	kmovd	%k2, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_fourth_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
++	kmovd	%k3, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_fifth_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
++	add	$(VEC_SIZE * 4), %rax
++	kmovd	%k4, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_second_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
++	kmovd	%k1, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_third_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
++	kmovd	%k2, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_fourth_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
++	kmovd	%k3, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_fifth_vector)
++
++	test	$((VEC_SIZE * 4) - 1), %rax
++	jz	L(align_four_vec_loop)
++
++	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
++	add	$(VEC_SIZE * 5), %rax
++	kmovd	%k4, %edx
++	test	%edx, %edx
++	jnz	L(exit)
++
++	test	$((VEC_SIZE * 4) - 1), %rax
++	jz	L(align_four_vec_loop)
++
++	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
++	add	$VEC_SIZE, %rax
++	kmovd	%k0, %edx
++	test	%edx, %edx
++	jnz	L(exit)
++
++	test	$((VEC_SIZE * 4) - 1), %rax
++	jz	L(align_four_vec_loop)
++
++	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
++	add	$VEC_SIZE, %rax
++	kmovd	%k0, %edx
++	test	%edx, %edx
++	jnz	L(exit)
++
++	test	$((VEC_SIZE * 4) - 1), %rax
++	jz	L(align_four_vec_loop)
++
++	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k1
++	add	$VEC_SIZE, %rax
++	kmovd	%k1, %edx
++	test	%edx, %edx
++	jnz	L(exit)
++
++	add	$VEC_SIZE, %rax
++
++	.p2align 4
++L(align_four_vec_loop):
++	VMOVA	(%rax), %YMM0
++	VMOVA	(VEC_SIZE * 2)(%rax), %YMM1
++	vpminub	VEC_SIZE(%rax), %YMM0, %YMM0
++	vpminub	(VEC_SIZE * 3)(%rax), %YMM1, %YMM1
++	vpminub	%YMM0, %YMM1, %YMM0
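++	/* A byte of YMM0 is now zero iff the corresponding byte is zero
++	   in at least one of the four source vectors.  */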
++	/* If K0 != 0, there is a null byte.  */
++	vpcmpb	$0, %YMM0, %YMMZERO, %k0
++	add	$(VEC_SIZE * 4), %rax
++	ktestd	%k0, %k0
++	jz	L(align_four_vec_loop)
++
++	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
++	sub	$(VEC_SIZE * 5), %rax
++	kmovd	%k0, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_second_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
++	kmovd	%k1, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_third_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
++	kmovd	%k2, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_fourth_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
++	kmovd	%k3, %edx
++	sub	%rdi, %rax
++	bsf	%rdx, %rdx
++	add	%rdx, %rax
++	add	$(VEC_SIZE * 4), %rax
++	jmp	L(StartStrcpyPart)
++
++	.p2align 4
++L(exit):
++	sub	%rdi, %rax
++L(exit_null_on_first_vector):
++	bsf	%rdx, %rdx
++	add	%rdx, %rax
++	jmp	L(StartStrcpyPart)
++
++	.p2align 4
++L(exit_null_on_second_vector):
++	sub	%rdi, %rax
++	bsf	%rdx, %rdx
++	add	%rdx, %rax
++	add	$VEC_SIZE, %rax
++	jmp	L(StartStrcpyPart)
++
++	.p2align 4
++L(exit_null_on_third_vector):
++	sub	%rdi, %rax
++	bsf	%rdx, %rdx
++	add	%rdx, %rax
++	add	$(VEC_SIZE * 2), %rax
++	jmp	L(StartStrcpyPart)
++
++	.p2align 4
++L(exit_null_on_fourth_vector):
++	sub	%rdi, %rax
++	bsf	%rdx, %rdx
++	add	%rdx, %rax
++	add	$(VEC_SIZE * 3), %rax
++	jmp	L(StartStrcpyPart)
++
++	.p2align 4
++L(exit_null_on_fifth_vector):
++	sub	%rdi, %rax
++	bsf	%rdx, %rdx
++	add	%rdx, %rax
++	add	$(VEC_SIZE * 4), %rax
++
++	.p2align 4
++L(StartStrcpyPart):
++	lea	(%r9, %rax), %rdi
++	mov	%rsi, %rcx
++	mov	%r9, %rax      /* save result */
++
++# ifdef USE_AS_STRNCAT
++	test	%r8, %r8
++	jz	L(ExitZero)
++#  define USE_AS_STRNCPY
++# endif
++
++# include "strcpy-evex.S"
++#endif
+diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
+new file mode 100644
+index 00000000..a343a1a6
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
+@@ -0,0 +1,1003 @@
++/* strcpy with 256-bit EVEX instructions.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if IS_IN (libc)
++
++# ifndef USE_AS_STRCAT
++#  include <sysdep.h>
++
++#  ifndef STRCPY
++#   define STRCPY  __strcpy_evex
++#  endif
++
++# endif
++
++# define VMOVU		vmovdqu64
++# define VMOVA		vmovdqa64
++
++/* Number of bytes in a vector register */
++# ifndef VEC_SIZE
++#  define VEC_SIZE	32
++# endif
++
++# define XMM2		xmm18
++# define XMM3		xmm19
++
++# define YMM2		ymm18
++# define YMM3		ymm19
++# define YMM4		ymm20
++# define YMM5		ymm21
++# define YMM6		ymm22
++# define YMM7		ymm23
++
++# ifndef USE_AS_STRCAT
++
++/* zero register */
++#  define XMMZERO	xmm16
++#  define YMMZERO	ymm16
++#  define YMM1		ymm17
++
++	.section .text.evex,"ax",@progbits
++ENTRY (STRCPY)
++#  ifdef USE_AS_STRNCPY
++	mov	%RDX_LP, %R8_LP
++	test	%R8_LP, %R8_LP
++	jz	L(ExitZero)
++#  endif
++	mov	%rsi, %rcx
++#  ifndef USE_AS_STPCPY
++	mov	%rdi, %rax      /* save result */
++#  endif
++
++	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
++# endif
++
++	and	$((VEC_SIZE * 4) - 1), %ecx
++	cmp	$(VEC_SIZE * 2), %ecx
++	jbe	L(SourceStringAlignmentLessTwoVecSize)
++
++	and	$-VEC_SIZE, %rsi
++	and	$(VEC_SIZE - 1), %ecx
++
++	vpcmpb	$0, (%rsi), %YMMZERO, %k0
++	kmovd	%k0, %edx
++	shr	%cl, %rdx
++
++# ifdef USE_AS_STRNCPY
++#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
++	mov	$VEC_SIZE, %r10
++	sub	%rcx, %r10
++	cmp	%r10, %r8
++#  else
++	mov	$(VEC_SIZE + 1), %r10
++	sub	%rcx, %r10
++	cmp	%r10, %r8
++#  endif
++	jbe	L(CopyVecSizeTailCase2OrCase3)
++# endif
++	test	%edx, %edx
++	jnz	L(CopyVecSizeTail)
++
++	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
++	kmovd	%k1, %edx
++
++# ifdef USE_AS_STRNCPY
++	add	$VEC_SIZE, %r10
++	cmp	%r10, %r8
++	jbe	L(CopyTwoVecSizeCase2OrCase3)
++# endif
++	test	%edx, %edx
++	jnz	L(CopyTwoVecSize)
++
++	VMOVU	(%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
++	VMOVU	%YMM2, (%rdi)
++
++/* If source address alignment != destination address alignment */
++	.p2align 4
++L(UnalignVecSizeBoth):
++	sub	%rcx, %rdi
++# ifdef USE_AS_STRNCPY
++	add	%rcx, %r8
++	sbb	%rcx, %rcx
++	or	%rcx, %r8
++# endif
++	mov	$VEC_SIZE, %rcx
++	VMOVA	(%rsi, %rcx), %YMM2
++	VMOVU	%YMM2, (%rdi, %rcx)
++	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
++	vpcmpb	$0, %YMM2, %YMMZERO, %k0
++	kmovd	%k0, %edx
++	add	$VEC_SIZE, %rcx
++# ifdef USE_AS_STRNCPY
++	sub	$(VEC_SIZE * 3), %r8
++	jbe	L(CopyVecSizeCase2OrCase3)
++# endif
++	test	%edx, %edx
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	jnz	L(CopyVecSizeUnalignedVec2)
++# else
++	jnz	L(CopyVecSize)
++# endif
++
++	VMOVU	%YMM2, (%rdi, %rcx)
++	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
++	vpcmpb	$0, %YMM3, %YMMZERO, %k0
++	kmovd	%k0, %edx
++	add	$VEC_SIZE, %rcx
++# ifdef USE_AS_STRNCPY
++	sub	$VEC_SIZE, %r8
++	jbe	L(CopyVecSizeCase2OrCase3)
++# endif
++	test	%edx, %edx
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	jnz	L(CopyVecSizeUnalignedVec3)
++# else
++	jnz	L(CopyVecSize)
++# endif
++
++	VMOVU	%YMM3, (%rdi, %rcx)
++	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
++	vpcmpb	$0, %YMM4, %YMMZERO, %k0
++	kmovd	%k0, %edx
++	add	$VEC_SIZE, %rcx
++# ifdef USE_AS_STRNCPY
++	sub	$VEC_SIZE, %r8
++	jbe	L(CopyVecSizeCase2OrCase3)
++# endif
++	test	%edx, %edx
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	jnz	L(CopyVecSizeUnalignedVec4)
++# else
++	jnz	L(CopyVecSize)
++# endif
++
++	VMOVU	%YMM4, (%rdi, %rcx)
++	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
++	vpcmpb	$0, %YMM2, %YMMZERO, %k0
++	kmovd	%k0, %edx
++	add	$VEC_SIZE, %rcx
++# ifdef USE_AS_STRNCPY
++	sub	$VEC_SIZE, %r8
++	jbe	L(CopyVecSizeCase2OrCase3)
++# endif
++	test	%edx, %edx
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	jnz	L(CopyVecSizeUnalignedVec2)
++# else
++	jnz	L(CopyVecSize)
++# endif
++
++	VMOVU	%YMM2, (%rdi, %rcx)
++	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
++	vpcmpb	$0, %YMM2, %YMMZERO, %k0
++	kmovd	%k0, %edx
++	add	$VEC_SIZE, %rcx
++# ifdef USE_AS_STRNCPY
++	sub	$VEC_SIZE, %r8
++	jbe	L(CopyVecSizeCase2OrCase3)
++# endif
++	test	%edx, %edx
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	jnz	L(CopyVecSizeUnalignedVec2)
++# else
++	jnz	L(CopyVecSize)
++# endif
++
++	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
++	VMOVU	%YMM2, (%rdi, %rcx)
++	vpcmpb	$0, %YMM3, %YMMZERO, %k0
++	kmovd	%k0, %edx
++	add	$VEC_SIZE, %rcx
++# ifdef USE_AS_STRNCPY
++	sub	$VEC_SIZE, %r8
++	jbe	L(CopyVecSizeCase2OrCase3)
++# endif
++	test	%edx, %edx
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	jnz	L(CopyVecSizeUnalignedVec3)
++# else
++	jnz	L(CopyVecSize)
++# endif
++
++	VMOVU	%YMM3, (%rdi, %rcx)
++	mov	%rsi, %rdx
++	lea	VEC_SIZE(%rsi, %rcx), %rsi
++	and	$-(VEC_SIZE * 4), %rsi
++	sub	%rsi, %rdx
++	sub	%rdx, %rdi
++# ifdef USE_AS_STRNCPY
++	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
++# endif
++L(UnalignedFourVecSizeLoop):
++	VMOVA	(%rsi), %YMM4
++	VMOVA	VEC_SIZE(%rsi), %YMM5
++	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
++	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
++	vpminub	%YMM5, %YMM4, %YMM2
++	vpminub	%YMM7, %YMM6, %YMM3
++	vpminub	%YMM2, %YMM3, %YMM2
++	/* If K7 != 0, there is a null byte.  */
++	vpcmpb	$0, %YMM2, %YMMZERO, %k7
++	kmovd	%k7, %edx
++# ifdef USE_AS_STRNCPY
++	sub	$(VEC_SIZE * 4), %r8
++	jbe	L(UnalignedLeaveCase2OrCase3)
++# endif
++	test	%edx, %edx
++	jnz	L(UnalignedFourVecSizeLeave)
++
++L(UnalignedFourVecSizeLoop_start):
++	add	$(VEC_SIZE * 4), %rdi
++	add	$(VEC_SIZE * 4), %rsi
++	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
++	VMOVA	(%rsi), %YMM4
++	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
++	VMOVA	VEC_SIZE(%rsi), %YMM5
++	vpminub	%YMM5, %YMM4, %YMM2
++	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
++	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
++	VMOVU	%YMM7, -VEC_SIZE(%rdi)
++	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
++	vpminub	%YMM7, %YMM6, %YMM3
++	vpminub	%YMM2, %YMM3, %YMM2
++	/* If K7 != 0, there is a null byte.  */
++	vpcmpb	$0, %YMM2, %YMMZERO, %k7
++	kmovd	%k7, %edx
++# ifdef USE_AS_STRNCPY
++	sub	$(VEC_SIZE * 4), %r8
++	jbe	L(UnalignedLeaveCase2OrCase3)
++# endif
++	test	%edx, %edx
++	jz	L(UnalignedFourVecSizeLoop_start)
++
++L(UnalignedFourVecSizeLeave):
++	vpcmpb	$0, %YMM4, %YMMZERO, %k1
++	kmovd	%k1, %edx
++	test	%edx, %edx
++	jnz	L(CopyVecSizeUnaligned_0)
++
++	vpcmpb	$0, %YMM5, %YMMZERO, %k2
++	kmovd	%k2, %ecx
++	test	%ecx, %ecx
++	jnz	L(CopyVecSizeUnaligned_16)
++
++	vpcmpb	$0, %YMM6, %YMMZERO, %k3
++	kmovd	%k3, %edx
++	test	%edx, %edx
++	jnz	L(CopyVecSizeUnaligned_32)
++
++	vpcmpb	$0, %YMM7, %YMMZERO, %k4
++	kmovd	%k4, %ecx
++	bsf	%ecx, %edx
++	VMOVU	%YMM4, (%rdi)
++	VMOVU	%YMM5, VEC_SIZE(%rdi)
++	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++# ifdef USE_AS_STPCPY
++	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
++# endif
++	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
++	add	$(VEC_SIZE - 1), %r8
++	sub	%rdx, %r8
++	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
++	jmp	L(StrncpyFillTailWithZero)
++# else
++	add	$(VEC_SIZE * 3), %rsi
++	add	$(VEC_SIZE * 3), %rdi
++	jmp	L(CopyVecSizeExit)
++# endif
++
++/* If source address alignment == destination address alignment */
++
++L(SourceStringAlignmentLessTwoVecSize):
++	VMOVU	(%rsi), %YMM3
++	VMOVU	VEC_SIZE(%rsi), %YMM2
++	vpcmpb	$0, %YMM3, %YMMZERO, %k0
++	kmovd	%k0, %edx
++
++# ifdef USE_AS_STRNCPY
++#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
++	cmp	$VEC_SIZE, %r8
++#  else
++	cmp	$(VEC_SIZE + 1), %r8
++#  endif
++	jbe	L(CopyVecSizeTail1Case2OrCase3)
++# endif
++	test	%edx, %edx
++	jnz	L(CopyVecSizeTail1)
++
++	VMOVU	%YMM3, (%rdi)
++	vpcmpb	$0, %YMM2, %YMMZERO, %k0
++	kmovd	%k0, %edx
++
++# ifdef USE_AS_STRNCPY
++#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
++	cmp	$(VEC_SIZE * 2), %r8
++#  else
++	cmp	$((VEC_SIZE * 2) + 1), %r8
++#  endif
++	jbe	L(CopyTwoVecSize1Case2OrCase3)
++# endif
++	test	%edx, %edx
++	jnz	L(CopyTwoVecSize1)
++
++	and	$-VEC_SIZE, %rsi
++	and	$(VEC_SIZE - 1), %ecx
++	jmp	L(UnalignVecSizeBoth)
++
++/*------End of main part with loops---------------------*/
++
++/* Case1 */
++
++# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
++	.p2align 4
++L(CopyVecSize):
++	add	%rcx, %rdi
++# endif
++L(CopyVecSizeTail):
++	add	%rcx, %rsi
++L(CopyVecSizeTail1):
++	bsf	%edx, %edx
++L(CopyVecSizeExit):
++	cmp	$32, %edx
++	jae	L(Exit32_63)
++	cmp	$16, %edx
++	jae	L(Exit16_31)
++	cmp	$8, %edx
++	jae	L(Exit8_15)
++	cmp	$4, %edx
++	jae	L(Exit4_7)
++	cmp	$3, %edx
++	je	L(Exit3)
++	cmp	$1, %edx
++	ja	L(Exit2)
++	je	L(Exit1)
++	movb	$0, (%rdi)
++# ifdef USE_AS_STPCPY
++	lea	(%rdi), %rax
++# endif
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	sub	$1, %r8
++	lea	1(%rdi), %rdi
++	jnz	L(StrncpyFillTailWithZero)
++# endif
++	ret
++
++	.p2align 4
++L(CopyTwoVecSize1):
++	add	$VEC_SIZE, %rsi
++	add	$VEC_SIZE, %rdi
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	sub	$VEC_SIZE, %r8
++# endif
++	jmp	L(CopyVecSizeTail1)
++
++	.p2align 4
++L(CopyTwoVecSize):
++	bsf	%edx, %edx
++	add	%rcx, %rsi
++	add	$VEC_SIZE, %edx
++	sub	%ecx, %edx
++	jmp	L(CopyVecSizeExit)
++
++	.p2align 4
++L(CopyVecSizeUnaligned_0):
++	bsf	%edx, %edx
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++# ifdef USE_AS_STPCPY
++	lea	(%rdi, %rdx), %rax
++# endif
++	VMOVU	%YMM4, (%rdi)
++	add	$((VEC_SIZE * 4) - 1), %r8
++	sub	%rdx, %r8
++	lea	1(%rdi, %rdx), %rdi
++	jmp	L(StrncpyFillTailWithZero)
++# else
++	jmp	L(CopyVecSizeExit)
++# endif
++
++	.p2align 4
++L(CopyVecSizeUnaligned_16):
++	bsf	%ecx, %edx
++	VMOVU	%YMM4, (%rdi)
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++# ifdef USE_AS_STPCPY
++	lea	VEC_SIZE(%rdi, %rdx), %rax
++# endif
++	VMOVU	%YMM5, VEC_SIZE(%rdi)
++	add	$((VEC_SIZE * 3) - 1), %r8
++	sub	%rdx, %r8
++	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
++	jmp	L(StrncpyFillTailWithZero)
++# else
++	add	$VEC_SIZE, %rsi
++	add	$VEC_SIZE, %rdi
++	jmp	L(CopyVecSizeExit)
++# endif
++
++	.p2align 4
++L(CopyVecSizeUnaligned_32):
++	bsf	%edx, %edx
++	VMOVU	%YMM4, (%rdi)
++	VMOVU	%YMM5, VEC_SIZE(%rdi)
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++# ifdef USE_AS_STPCPY
++	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
++# endif
++	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
++	add	$((VEC_SIZE * 2) - 1), %r8
++	sub	%rdx, %r8
++	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
++	jmp	L(StrncpyFillTailWithZero)
++# else
++	add	$(VEC_SIZE * 2), %rsi
++	add	$(VEC_SIZE * 2), %rdi
++	jmp	L(CopyVecSizeExit)
++# endif
++
++# ifdef USE_AS_STRNCPY
++#  ifndef USE_AS_STRCAT
++	.p2align 4
++L(CopyVecSizeUnalignedVec6):
++	VMOVU	%YMM6, (%rdi, %rcx)
++	jmp	L(CopyVecSizeVecExit)
++
++	.p2align 4
++L(CopyVecSizeUnalignedVec5):
++	VMOVU	%YMM5, (%rdi, %rcx)
++	jmp	L(CopyVecSizeVecExit)
++
++	.p2align 4
++L(CopyVecSizeUnalignedVec4):
++	VMOVU	%YMM4, (%rdi, %rcx)
++	jmp	L(CopyVecSizeVecExit)
++
++	.p2align 4
++L(CopyVecSizeUnalignedVec3):
++	VMOVU	%YMM3, (%rdi, %rcx)
++	jmp	L(CopyVecSizeVecExit)
++#  endif
++
++/* Case2 */
++
++	.p2align 4
++L(CopyVecSizeCase2):
++	add	$VEC_SIZE, %r8
++	add	%rcx, %rdi
++	add	%rcx, %rsi
++	bsf	%edx, %edx
++	cmp	%r8d, %edx
++	jb	L(CopyVecSizeExit)
++	jmp	L(StrncpyExit)
++
++	.p2align 4
++L(CopyTwoVecSizeCase2):
++	add	%rcx, %rsi
++	bsf	%edx, %edx
++	add	$VEC_SIZE, %edx
++	sub	%ecx, %edx
++	cmp	%r8d, %edx
++	jb	L(CopyVecSizeExit)
++	jmp	L(StrncpyExit)
++
++L(CopyVecSizeTailCase2):
++	add	%rcx, %rsi
++	bsf	%edx, %edx
++	cmp	%r8d, %edx
++	jb	L(CopyVecSizeExit)
++	jmp	L(StrncpyExit)
++
++L(CopyVecSizeTail1Case2):
++	bsf	%edx, %edx
++	cmp	%r8d, %edx
++	jb	L(CopyVecSizeExit)
++	jmp	L(StrncpyExit)
++
++/* Case2 or Case3,  Case3 */
++
++	.p2align 4
++L(CopyVecSizeCase2OrCase3):
++	test	%rdx, %rdx
++	jnz	L(CopyVecSizeCase2)
++L(CopyVecSizeCase3):
++	add	$VEC_SIZE, %r8
++	add	%rcx, %rdi
++	add	%rcx, %rsi
++	jmp	L(StrncpyExit)
++
++	.p2align 4
++L(CopyTwoVecSizeCase2OrCase3):
++	test	%rdx, %rdx
++	jnz	L(CopyTwoVecSizeCase2)
++	add	%rcx, %rsi
++	jmp	L(StrncpyExit)
++
++	.p2align 4
++L(CopyVecSizeTailCase2OrCase3):
++	test	%rdx, %rdx
++	jnz	L(CopyVecSizeTailCase2)
++	add	%rcx, %rsi
++	jmp	L(StrncpyExit)
++
++	.p2align 4
++L(CopyTwoVecSize1Case2OrCase3):
++	add	$VEC_SIZE, %rdi
++	add	$VEC_SIZE, %rsi
++	sub	$VEC_SIZE, %r8
++L(CopyVecSizeTail1Case2OrCase3):
++	test	%rdx, %rdx
++	jnz	L(CopyVecSizeTail1Case2)
++	jmp	L(StrncpyExit)
++# endif
++
++/*------------End of labels for copying 1..VEC_SIZE bytes and 1..(VEC_SIZE * 2) bytes------------*/
++
++	.p2align 4
++L(Exit1):
++	movzwl	(%rsi), %edx
++	mov	%dx, (%rdi)
++# ifdef USE_AS_STPCPY
++	lea	1(%rdi), %rax
++# endif
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	sub	$2, %r8
++	lea	2(%rdi), %rdi
++	jnz	L(StrncpyFillTailWithZero)
++# endif
++	ret
++
++	.p2align 4
++L(Exit2):
++	movzwl	(%rsi), %ecx
++	mov	%cx, (%rdi)
++	movb	$0, 2(%rdi)
++# ifdef USE_AS_STPCPY
++	lea	2(%rdi), %rax
++# endif
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	sub	$3, %r8
++	lea	3(%rdi), %rdi
++	jnz	L(StrncpyFillTailWithZero)
++# endif
++	ret
++
++	.p2align 4
++L(Exit3):
++	mov	(%rsi), %edx
++	mov	%edx, (%rdi)
++# ifdef USE_AS_STPCPY
++	lea	3(%rdi), %rax
++# endif
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	sub	$4, %r8
++	lea	4(%rdi), %rdi
++	jnz	L(StrncpyFillTailWithZero)
++# endif
++	ret
++
++	.p2align 4
++L(Exit4_7):
++	mov	(%rsi), %ecx
++	mov	%ecx, (%rdi)
++	mov	-3(%rsi, %rdx), %ecx
++	mov	%ecx, -3(%rdi, %rdx)
++# ifdef USE_AS_STPCPY
++	lea	(%rdi, %rdx), %rax
++# endif
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	sub	%rdx, %r8
++	sub	$1, %r8
++	lea	1(%rdi, %rdx), %rdi
++	jnz	L(StrncpyFillTailWithZero)
++# endif
++	ret
++
++	.p2align 4
++L(Exit8_15):
++	mov	(%rsi), %rcx
++	mov	-7(%rsi, %rdx), %r9
++	mov	%rcx, (%rdi)
++	mov	%r9, -7(%rdi, %rdx)
++# ifdef USE_AS_STPCPY
++	lea	(%rdi, %rdx), %rax
++# endif
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	sub	%rdx, %r8
++	sub	$1, %r8
++	lea	1(%rdi, %rdx), %rdi
++	jnz	L(StrncpyFillTailWithZero)
++# endif
++	ret
++
++	.p2align 4
++L(Exit16_31):
++	VMOVU	(%rsi), %XMM2
++	VMOVU	-15(%rsi, %rdx), %XMM3
++	VMOVU	%XMM2, (%rdi)
++	VMOVU	%XMM3, -15(%rdi, %rdx)
++# ifdef USE_AS_STPCPY
++	lea	(%rdi, %rdx), %rax
++# endif
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	sub %rdx, %r8
++	sub $1, %r8
++	lea 1(%rdi, %rdx), %rdi
++	jnz L(StrncpyFillTailWithZero)
++# endif
++	ret
++
++	.p2align 4
++L(Exit32_63):
++	VMOVU	(%rsi), %YMM2
++	VMOVU	-31(%rsi, %rdx), %YMM3
++	VMOVU	%YMM2, (%rdi)
++	VMOVU	%YMM3, -31(%rdi, %rdx)
++# ifdef USE_AS_STPCPY
++	lea	(%rdi, %rdx), %rax
++# endif
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	sub	%rdx, %r8
++	sub	$1, %r8
++	lea	1(%rdi, %rdx), %rdi
++	jnz	L(StrncpyFillTailWithZero)
++# endif
++	ret
++
++# ifdef USE_AS_STRNCPY
++
++	.p2align 4
++L(StrncpyExit1):
++	movzbl	(%rsi), %edx
++	mov	%dl, (%rdi)
++#  ifdef USE_AS_STPCPY
++	lea	1(%rdi), %rax
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, 1(%rdi)
++#  endif
++	ret
++
++	.p2align 4
++L(StrncpyExit2):
++	movzwl	(%rsi), %edx
++	mov	%dx, (%rdi)
++#  ifdef USE_AS_STPCPY
++	lea	2(%rdi), %rax
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, 2(%rdi)
++#  endif
++	ret
++
++	.p2align 4
++L(StrncpyExit3_4):
++	movzwl	(%rsi), %ecx
++	movzwl	-2(%rsi, %r8), %edx
++	mov	%cx, (%rdi)
++	mov	%dx, -2(%rdi, %r8)
++#  ifdef USE_AS_STPCPY
++	lea	(%rdi, %r8), %rax
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, (%rdi, %r8)
++#  endif
++	ret
++
++	.p2align 4
++L(StrncpyExit5_8):
++	mov	(%rsi), %ecx
++	mov	-4(%rsi, %r8), %edx
++	mov	%ecx, (%rdi)
++	mov	%edx, -4(%rdi, %r8)
++#  ifdef USE_AS_STPCPY
++	lea	(%rdi, %r8), %rax
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, (%rdi, %r8)
++#  endif
++	ret
++
++	.p2align 4
++L(StrncpyExit9_16):
++	mov	(%rsi), %rcx
++	mov	-8(%rsi, %r8), %rdx
++	mov	%rcx, (%rdi)
++	mov	%rdx, -8(%rdi, %r8)
++#  ifdef USE_AS_STPCPY
++	lea	(%rdi, %r8), %rax
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, (%rdi, %r8)
++#  endif
++	ret
++
++	.p2align 4
++L(StrncpyExit17_32):
++	VMOVU	(%rsi), %XMM2
++	VMOVU	-16(%rsi, %r8), %XMM3
++	VMOVU	%XMM2, (%rdi)
++	VMOVU	%XMM3, -16(%rdi, %r8)
++#  ifdef USE_AS_STPCPY
++	lea	(%rdi, %r8), %rax
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, (%rdi, %r8)
++#  endif
++	ret
++
++	.p2align 4
++L(StrncpyExit33_64):
++	/*  0/32, 31/16 */
++	VMOVU	(%rsi), %YMM2
++	VMOVU	-VEC_SIZE(%rsi, %r8), %YMM3
++	VMOVU	%YMM2, (%rdi)
++	VMOVU	%YMM3, -VEC_SIZE(%rdi, %r8)
++#  ifdef USE_AS_STPCPY
++	lea	(%rdi, %r8), %rax
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, (%rdi, %r8)
++#  endif
++	ret
++
++	.p2align 4
++L(StrncpyExit65):
++	/* 0/32, 32/32, 64/1 */
++	VMOVU	(%rsi), %YMM2
++	VMOVU	32(%rsi), %YMM3
++	mov	64(%rsi), %cl
++	VMOVU	%YMM2, (%rdi)
++	VMOVU	%YMM3, 32(%rdi)
++	mov	%cl, 64(%rdi)
++#  ifdef USE_AS_STPCPY
++	lea	65(%rdi), %rax
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, 65(%rdi)
++#  endif
++	ret
++
++#  ifndef USE_AS_STRCAT
++
++	.p2align 4
++L(Fill1):
++	mov	%dl, (%rdi)
++	ret
++
++	.p2align 4
++L(Fill2):
++	mov	%dx, (%rdi)
++	ret
++
++	.p2align 4
++L(Fill3_4):
++	mov	%dx, (%rdi)
++	mov     %dx, -2(%rdi, %r8)
++	ret
++
++	.p2align 4
++L(Fill5_8):
++	mov	%edx, (%rdi)
++	mov     %edx, -4(%rdi, %r8)
++	ret
++
++	.p2align 4
++L(Fill9_16):
++	mov	%rdx, (%rdi)
++	mov	%rdx, -8(%rdi, %r8)
++	ret
++
++	.p2align 4
++L(Fill17_32):
++	VMOVU	%XMMZERO, (%rdi)
++	VMOVU	%XMMZERO, -16(%rdi, %r8)
++	ret
++
++	.p2align 4
++L(CopyVecSizeUnalignedVec2):
++	VMOVU	%YMM2, (%rdi, %rcx)
++
++	.p2align 4
++L(CopyVecSizeVecExit):
++	bsf	%edx, %edx
++	add	$(VEC_SIZE - 1), %r8
++	add	%rcx, %rdi
++#   ifdef USE_AS_STPCPY
++	lea	(%rdi, %rdx), %rax
++#   endif
++	sub	%rdx, %r8
++	lea	1(%rdi, %rdx), %rdi
++
++	.p2align 4
++L(StrncpyFillTailWithZero):
++	xor	%edx, %edx
++	sub	$VEC_SIZE, %r8
++	jbe	L(StrncpyFillExit)
++
++	VMOVU	%YMMZERO, (%rdi)
++	add	$VEC_SIZE, %rdi
++
++	mov	%rdi, %rsi
++	and	$(VEC_SIZE - 1), %esi
++	sub	%rsi, %rdi
++	add	%rsi, %r8
++	sub	$(VEC_SIZE * 4), %r8
++	jb	L(StrncpyFillLessFourVecSize)
++
++L(StrncpyFillLoopVmovdqa):
++	VMOVA	%YMMZERO, (%rdi)
++	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
++	VMOVA	%YMMZERO, (VEC_SIZE * 2)(%rdi)
++	VMOVA	%YMMZERO, (VEC_SIZE * 3)(%rdi)
++	add	$(VEC_SIZE * 4), %rdi
++	sub	$(VEC_SIZE * 4), %r8
++	jae	L(StrncpyFillLoopVmovdqa)
++
++L(StrncpyFillLessFourVecSize):
++	add	$(VEC_SIZE * 2), %r8
++	jl	L(StrncpyFillLessTwoVecSize)
++	VMOVA	%YMMZERO, (%rdi)
++	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
++	add	$(VEC_SIZE * 2), %rdi
++	sub	$VEC_SIZE, %r8
++	jl	L(StrncpyFillExit)
++	VMOVA	%YMMZERO, (%rdi)
++	add	$VEC_SIZE, %rdi
++	jmp	L(Fill)
++
++	.p2align 4
++L(StrncpyFillLessTwoVecSize):
++	add	$VEC_SIZE, %r8
++	jl	L(StrncpyFillExit)
++	VMOVA	%YMMZERO, (%rdi)
++	add	$VEC_SIZE, %rdi
++	jmp	L(Fill)
++
++	.p2align 4
++L(StrncpyFillExit):
++	add	$VEC_SIZE, %r8
++L(Fill):
++	cmp	$17, %r8d
++	jae	L(Fill17_32)
++	cmp	$9, %r8d
++	jae	L(Fill9_16)
++	cmp	$5, %r8d
++	jae	L(Fill5_8)
++	cmp	$3, %r8d
++	jae	L(Fill3_4)
++	cmp	$1, %r8d
++	ja	L(Fill2)
++	je	L(Fill1)
++	ret
++
++/* end of ifndef USE_AS_STRCAT */
++#  endif
++
++	.p2align 4
++L(UnalignedLeaveCase2OrCase3):
++	test	%rdx, %rdx
++	jnz	L(UnalignedFourVecSizeLeaveCase2)
++L(UnalignedFourVecSizeLeaveCase3):
++	lea	(VEC_SIZE * 4)(%r8), %rcx
++	and	$-VEC_SIZE, %rcx
++	add	$(VEC_SIZE * 3), %r8
++	jl	L(CopyVecSizeCase3)
++	VMOVU	%YMM4, (%rdi)
++	sub	$VEC_SIZE, %r8
++	jb	L(CopyVecSizeCase3)
++	VMOVU	%YMM5, VEC_SIZE(%rdi)
++	sub	$VEC_SIZE, %r8
++	jb	L(CopyVecSizeCase3)
++	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
++	sub	$VEC_SIZE, %r8
++	jb	L(CopyVecSizeCase3)
++	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
++#  ifdef USE_AS_STPCPY
++	lea	(VEC_SIZE * 4)(%rdi), %rax
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, (VEC_SIZE * 4)(%rdi)
++#  endif
++	ret
++
++	.p2align 4
++L(UnalignedFourVecSizeLeaveCase2):
++	xor	%ecx, %ecx
++	vpcmpb	$0, %YMM4, %YMMZERO, %k1
++	kmovd	%k1, %edx
++	add	$(VEC_SIZE * 3), %r8
++	jle	L(CopyVecSizeCase2OrCase3)
++	test	%edx, %edx
++#  ifndef USE_AS_STRCAT
++	jnz	L(CopyVecSizeUnalignedVec4)
++#  else
++	jnz	L(CopyVecSize)
++#  endif
++	vpcmpb	$0, %YMM5, %YMMZERO, %k2
++	kmovd	%k2, %edx
++	VMOVU	%YMM4, (%rdi)
++	add	$VEC_SIZE, %rcx
++	sub	$VEC_SIZE, %r8
++	jbe	L(CopyVecSizeCase2OrCase3)
++	test	%edx, %edx
++#  ifndef USE_AS_STRCAT
++	jnz	L(CopyVecSizeUnalignedVec5)
++#  else
++	jnz	L(CopyVecSize)
++#  endif
++
++	vpcmpb	$0, %YMM6, %YMMZERO, %k3
++	kmovd	%k3, %edx
++	VMOVU	%YMM5, VEC_SIZE(%rdi)
++	add	$VEC_SIZE, %rcx
++	sub	$VEC_SIZE, %r8
++	jbe	L(CopyVecSizeCase2OrCase3)
++	test	%edx, %edx
++#  ifndef USE_AS_STRCAT
++	jnz	L(CopyVecSizeUnalignedVec6)
++#  else
++	jnz	L(CopyVecSize)
++#  endif
++
++	vpcmpb	$0, %YMM7, %YMMZERO, %k4
++	kmovd	%k4, %edx
++	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
++	lea	VEC_SIZE(%rdi, %rcx), %rdi
++	lea	VEC_SIZE(%rsi, %rcx), %rsi
++	bsf	%edx, %edx
++	cmp	%r8d, %edx
++	jb	L(CopyVecSizeExit)
++L(StrncpyExit):
++	cmp	$65, %r8d
++	je	L(StrncpyExit65)
++	cmp	$33, %r8d
++	jae	L(StrncpyExit33_64)
++	cmp	$17, %r8d
++	jae	L(StrncpyExit17_32)
++	cmp	$9, %r8d
++	jae	L(StrncpyExit9_16)
++	cmp	$5, %r8d
++	jae	L(StrncpyExit5_8)
++	cmp	$3, %r8d
++	jae	L(StrncpyExit3_4)
++	cmp	$1, %r8d
++	ja	L(StrncpyExit2)
++	je	L(StrncpyExit1)
++#  ifdef USE_AS_STPCPY
++	mov	%rdi, %rax
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, (%rdi)
++#  endif
++	ret
++
++	.p2align 4
++L(ExitZero):
++#  ifndef USE_AS_STRCAT
++	mov	%rdi, %rax
++#  endif
++	ret
++
++# endif
++
++# ifndef USE_AS_STRCAT
++END (STRCPY)
++# else
++END (STRCAT)
++# endif
++#endif
+diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
+new file mode 100644
+index 00000000..8884f023
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strncat-evex.S
+@@ -0,0 +1,3 @@
++#define USE_AS_STRNCAT
++#define STRCAT __strncat_evex
++#include "strcat-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
+new file mode 100644
+index 00000000..40e391f0
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
+@@ -0,0 +1,3 @@
++#define USE_AS_STRNCPY
++#define STRCPY __strncpy_evex
++#include "strcpy-evex.S"
+-- 
+GitLab
+
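Note (illustration only, not part of the packaged patch): the L(StrncpyFillTailWithZero) and L(Fill*) paths in the strcpy-evex.S hunk above vectorize strncpy's padding rule. A plain-C reference for that rule follows; the name strncpy_ref is invented and this is not the glibc implementation, just the semantics the asm has to preserve.

#include <stddef.h>
#include <string.h>

/* Reference semantics only: copy at most n bytes from src, then pad the
   rest of dst with NUL bytes -- the part the asm above performs with
   vector stores of YMMZERO.  */
static char *
strncpy_ref (char *dst, const char *src, size_t n)
{
  size_t copied = strnlen (src, n);      /* bytes before the terminator, capped at n */
  memcpy (dst, src, copied);
  memset (dst + copied, 0, n - copied);  /* the tail fill */
  return dst;
}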
diff --git a/SOURCES/ia-upd-256bit-evex-instr-3.patch b/SOURCES/ia-upd-256bit-evex-instr-3.patch
new file mode 100644
index 0000000..1fc1f59
--- /dev/null
+++ b/SOURCES/ia-upd-256bit-evex-instr-3.patch
@@ -0,0 +1,243 @@
+From c2440c1e45d53140531105de024f7b9ceb53c51e Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 5 Mar 2021 06:46:08 -0800
+Subject: [PATCH] x86-64: Add memmove family functions with 256-bit EVEX
+
+Update ifunc-memmove.h to select the function optimized with 256-bit EVEX
+instructions using YMM16-YMM31 registers to avoid RTM abort with usable
+AVX512VL since VZEROUPPER isn't needed at function exit.
+
+(cherry picked from commit 63ad43566f7a25d140dc723598aeb441ad657eed)
+---
+ sysdeps/x86_64/multiarch/Makefile             |  1 +
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 36 +++++++++++++++++++
+ sysdeps/x86_64/multiarch/ifunc-memmove.h      | 21 +++++++++--
+ .../multiarch/memmove-evex-unaligned-erms.S   | 33 +++++++++++++++++
+ .../multiarch/memmove-vec-unaligned-erms.S    | 24 ++++++++-----
+ 5 files changed, 104 insertions(+), 11 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+
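Illustration (not part of the patch): the selection order that the ifunc-memmove.h hunk below ends up encoding, written as a plain-C sketch. The struct and the select_memmove helper are invented for readability; the real code uses glibc's cpu_features data and the CPU_FEATURE_USABLE_P / OPTIMIZE macros shown in the diff.

#include <stdbool.h>
#include <stdio.h>

/* Invented stand-ins for the runtime CPU feature bits consulted below.  */
struct cpu_model
{
  bool avx_fast_unaligned_load;
  bool avx512vl;
  bool erms;
  bool prefer_no_vzeroupper;
};

static const char *
select_memmove (const struct cpu_model *c)
{
  if (c->avx_fast_unaligned_load)
    {
      /* EVEX variants use ymm16-ymm31, so no VZEROUPPER and no RTM abort.  */
      if (c->avx512vl)
        return c->erms ? "__memmove_evex_unaligned_erms"
                       : "__memmove_evex_unaligned";
      if (!c->prefer_no_vzeroupper)
        return c->erms ? "__memmove_avx_unaligned_erms"
                       : "__memmove_avx_unaligned";
    }
  return "an SSE2/SSSE3 fallback";
}

int
main (void)
{
  struct cpu_model avx512vl_cpu = { true, true, true, false };
  puts (select_memmove (&avx512vl_cpu));  /* __memmove_evex_unaligned_erms */
  return 0;
}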
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 46783cd1..4563fc56 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   memset-avx2-unaligned-erms \
+ 		   memset-avx512-unaligned-erms \
+ 		   memchr-evex \
++		   memmove-evex-unaligned-erms \
+ 		   memrchr-evex \
+ 		   rawmemchr-evex \
+ 		   stpcpy-evex \
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 082e4da3..6bd3abfc 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memmove_chk_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __memmove_chk_evex_unaligned)
++	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __memmove_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ 			      CPU_FEATURE_USABLE (SSSE3),
+ 			      __memmove_chk_ssse3_back)
+@@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memmove_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, memmove,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __memmove_evex_unaligned)
++	      IFUNC_IMPL_ADD (array, i, memmove,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __memmove_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memmove_avx512_no_vzeroupper)
+@@ -565,6 +577,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memcpy_chk_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __memcpy_chk_evex_unaligned)
++	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __memcpy_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ 			      CPU_FEATURE_USABLE (SSSE3),
+ 			      __memcpy_chk_ssse3_back)
+@@ -587,6 +605,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memcpy,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memcpy_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, memcpy,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __memcpy_evex_unaligned)
++	      IFUNC_IMPL_ADD (array, i, memcpy,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __memcpy_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
+ 			      __memcpy_ssse3_back)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
+@@ -623,6 +647,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __mempcpy_chk_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __mempcpy_chk_evex_unaligned)
++	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __mempcpy_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ 			      CPU_FEATURE_USABLE (SSSE3),
+ 			      __mempcpy_chk_ssse3_back)
+@@ -654,6 +684,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __mempcpy_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, mempcpy,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __mempcpy_evex_unaligned)
++	      IFUNC_IMPL_ADD (array, i, mempcpy,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __mempcpy_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
+ 			      __mempcpy_ssse3_back)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+index 5e5f0299..6f8bce5f 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
+   attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
++  attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
+   attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
+@@ -59,10 +63,21 @@ IFUNC_SELECTOR (void)
+ 
+   if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+-	return OPTIMIZE (avx_unaligned_erms);
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE (evex_unaligned_erms);
++
++	  return OPTIMIZE (evex_unaligned);
++	}
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE (avx_unaligned_erms);
+ 
+-      return OPTIMIZE (avx_unaligned);
++	  return OPTIMIZE (avx_unaligned);
++	}
+     }
+ 
+   if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
+diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+new file mode 100644
+index 00000000..0cbce8f9
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+@@ -0,0 +1,33 @@
++#if IS_IN (libc)
++# define VEC_SIZE	32
++# define XMM0		xmm16
++# define XMM1		xmm17
++# define YMM0		ymm16
++# define YMM1		ymm17
++# define VEC0		ymm16
++# define VEC1		ymm17
++# define VEC2		ymm18
++# define VEC3		ymm19
++# define VEC4		ymm20
++# define VEC5		ymm21
++# define VEC6		ymm22
++# define VEC7		ymm23
++# define VEC8		ymm24
++# define VEC9		ymm25
++# define VEC10		ymm26
++# define VEC11		ymm27
++# define VEC12		ymm28
++# define VEC13		ymm29
++# define VEC14		ymm30
++# define VEC15		ymm31
++# define VEC(i)		VEC##i
++# define VMOVNT		vmovntdq
++# define VMOVU		vmovdqu64
++# define VMOVA		vmovdqa64
++# define VZEROUPPER
++
++# define SECTION(p)		p##.evex
++# define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
++
++# include "memmove-vec-unaligned-erms.S"
++#endif
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 274aa1c7..08e21692 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -48,6 +48,14 @@
+ # define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
+ #endif
+ 
++#ifndef XMM0
++# define XMM0				xmm0
++#endif
++
++#ifndef YMM0
++# define YMM0				ymm0
++#endif
++
+ #ifndef VZEROUPPER
+ # if VEC_SIZE > 16
+ #  define VZEROUPPER vzeroupper
+@@ -277,20 +285,20 @@ L(less_vec):
+ #if VEC_SIZE > 32
+ L(between_32_63):
+ 	/* From 32 to 63.  No branch when size == 32.  */
+-	vmovdqu	(%rsi), %ymm0
+-	vmovdqu	-32(%rsi,%rdx), %ymm1
+-	vmovdqu	%ymm0, (%rdi)
+-	vmovdqu	%ymm1, -32(%rdi,%rdx)
++	VMOVU	(%rsi), %YMM0
++	VMOVU	-32(%rsi,%rdx), %YMM1
++	VMOVU	%YMM0, (%rdi)
++	VMOVU	%YMM1, -32(%rdi,%rdx)
+ 	VZEROUPPER
+ 	ret
+ #endif
+ #if VEC_SIZE > 16
+ 	/* From 16 to 31.  No branch when size == 16.  */
+ L(between_16_31):
+-	vmovdqu	(%rsi), %xmm0
+-	vmovdqu	-16(%rsi,%rdx), %xmm1
+-	vmovdqu	%xmm0, (%rdi)
+-	vmovdqu	%xmm1, -16(%rdi,%rdx)
++	VMOVU	(%rsi), %XMM0
++	VMOVU	-16(%rsi,%rdx), %XMM1
++	VMOVU	%XMM0, (%rdi)
++	VMOVU	%XMM1, -16(%rdi,%rdx)
+ 	ret
+ #endif
+ L(between_8_15):
+-- 
+GitLab
+
diff --git a/SOURCES/ia-upd-256bit-evex-instr-4.patch b/SOURCES/ia-upd-256bit-evex-instr-4.patch
new file mode 100644
index 0000000..4fcabfb
--- /dev/null
+++ b/SOURCES/ia-upd-256bit-evex-instr-4.patch
@@ -0,0 +1,255 @@
+From 3d5101ddb7a4004459ca3f894caa47cfe9208be6 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 5 Mar 2021 07:15:03 -0800
+Subject: [PATCH] x86-64: Add memset family functions with 256-bit EVEX
+
+Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
+with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM
+abort with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
+function exit.
+
+(cherry picked from commit 1b968b6b9b3aac702ac2f133e0dd16cfdbb415ee)
+---
+ sysdeps/x86_64/multiarch/Makefile             |  1 +
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 22 +++++++++++++++++
+ sysdeps/x86_64/multiarch/ifunc-memset.h       | 24 +++++++++++++++----
+ sysdeps/x86_64/multiarch/ifunc-wmemset.h      | 13 ++++++----
+ .../multiarch/memset-evex-unaligned-erms.S    | 24 +++++++++++++++++++
+ .../multiarch/memset-vec-unaligned-erms.S     | 20 +++++++++++-----
+ 6 files changed, 90 insertions(+), 14 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+
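Aside (an editorial assumption, not stated in the patch): memset's EVEX variants are gated on AVX512VL plus AVX512BW while __wmemset_evex_unaligned only needs AVX512VL, presumably because the byte broadcast (vpbroadcastb into ymm16) is an AVX512BW-class operation whereas the dword broadcast is not. A small compilable illustration; the function names are arbitrary.

#include <immintrin.h>

/* Byte splat, as __memset_evex_* does with vpbroadcastb: the EVEX form
   needs AVX512BW (plus AVX512VL for the 256-bit width).  */
__attribute__ ((target ("avx512vl,avx512bw")))
__m256i
splat_byte (char c)
{
  return _mm256_set1_epi8 (c);
}

/* Dword splat, as __wmemset_evex_unaligned does with vpbroadcastd:
   AVX512VL on top of AVX512F is enough.  */
__attribute__ ((target ("avx512vl")))
__m256i
splat_dword (int w)
{
  return _mm256_set1_epi32 (w);
}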
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 4563fc56..1cc0a10e 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -43,6 +43,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   memchr-evex \
+ 		   memmove-evex-unaligned-erms \
+ 		   memrchr-evex \
++		   memset-evex-unaligned-erms \
+ 		   rawmemchr-evex \
+ 		   stpcpy-evex \
+ 		   stpncpy-evex \
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 6bd3abfc..7cf83485 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -160,6 +160,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memset_chk_avx2_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, __memset_chk,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __memset_chk_evex_unaligned)
++	      IFUNC_IMPL_ADD (array, i, __memset_chk,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __memset_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memset_chk_avx512_unaligned_erms)
+@@ -185,6 +193,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memset_avx2_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, memset,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __memset_evex_unaligned)
++	      IFUNC_IMPL_ADD (array, i, memset,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __memset_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memset_avx512_unaligned_erms)
+@@ -555,6 +571,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wmemset_avx2_unaligned)
++	      IFUNC_IMPL_ADD (array, i, wmemset,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __wmemset_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __wmemset_avx512_unaligned))
+@@ -723,6 +742,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wmemset_chk_avx2_unaligned)
++	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __wmemset_chk_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __wmemset_chk_avx512_unaligned))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
+index 708bd72e..6f31f4dc 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
+@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
+   attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
++  attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
+   attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
+@@ -56,10 +60,22 @@ IFUNC_SELECTOR (void)
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+     {
+-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+-	return OPTIMIZE (avx2_unaligned_erms);
+-      else
+-	return OPTIMIZE (avx2_unaligned);
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE (evex_unaligned_erms);
++
++	  return OPTIMIZE (evex_unaligned);
++	}
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE (avx2_unaligned_erms);
++
++	  return OPTIMIZE (avx2_unaligned);
++	}
+     }
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+index eb242210..9290c4bf 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
++++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+@@ -20,6 +20,7 @@
+ 
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
+ 
+ static inline void *
+@@ -27,14 +28,18 @@ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
++	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
++	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx512_unaligned);
+-      else
++
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
++	return OPTIMIZE (evex_unaligned);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2_unaligned);
+     }
+ 
+diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+new file mode 100644
+index 00000000..ae0a4d6e
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+@@ -0,0 +1,24 @@
++#if IS_IN (libc)
++# define VEC_SIZE	32
++# define XMM0		xmm16
++# define YMM0		ymm16
++# define VEC0		ymm16
++# define VEC(i)		VEC##i
++# define VMOVU		vmovdqu64
++# define VMOVA		vmovdqa64
++# define VZEROUPPER
++
++# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
++  movq r, %rax; \
++  vpbroadcastb d, %VEC0
++
++# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
++  movq r, %rax; \
++  vpbroadcastd d, %VEC0
++
++# define SECTION(p)		p##.evex
++# define MEMSET_SYMBOL(p,s)	p##_evex_##s
++# define WMEMSET_SYMBOL(p,s)	p##_evex_##s
++
++# include "memset-vec-unaligned-erms.S"
++#endif
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 9a0fd818..71e91a8f 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -34,6 +34,14 @@
+ # define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
+ #endif
+ 
++#ifndef XMM0
++# define XMM0				xmm0
++#endif
++
++#ifndef YMM0
++# define YMM0				ymm0
++#endif
++
+ #ifndef VZEROUPPER
+ # if VEC_SIZE > 16
+ #  define VZEROUPPER			vzeroupper
+@@ -67,7 +75,7 @@
+ ENTRY (__bzero)
+ 	mov	%RDI_LP, %RAX_LP /* Set return value.  */
+ 	mov	%RSI_LP, %RDX_LP /* Set n.  */
+-	pxor	%xmm0, %xmm0
++	pxor	%XMM0, %XMM0
+ 	jmp	L(entry_from_bzero)
+ END (__bzero)
+ weak_alias (__bzero, bzero)
+@@ -223,7 +231,7 @@ L(less_vec):
+ 	cmpb	$16, %dl
+ 	jae	L(between_16_31)
+ # endif
+-	MOVQ	%xmm0, %rcx
++	MOVQ	%XMM0, %rcx
+ 	cmpb	$8, %dl
+ 	jae	L(between_8_15)
+ 	cmpb	$4, %dl
+@@ -238,16 +246,16 @@ L(less_vec):
+ # if VEC_SIZE > 32
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ L(between_32_63):
+-	vmovdqu	%ymm0, -32(%rdi,%rdx)
+-	vmovdqu	%ymm0, (%rdi)
++	VMOVU	%YMM0, -32(%rdi,%rdx)
++	VMOVU	%YMM0, (%rdi)
+ 	VZEROUPPER
+ 	ret
+ # endif
+ # if VEC_SIZE > 16
+ 	/* From 16 to 31.  No branch when size == 16.  */
+ L(between_16_31):
+-	vmovdqu	%xmm0, -16(%rdi,%rdx)
+-	vmovdqu	%xmm0, (%rdi)
++	VMOVU	%XMM0, -16(%rdi,%rdx)
++	VMOVU	%XMM0, (%rdi)
+ 	VZEROUPPER
+ 	ret
+ # endif
+-- 
+GitLab
+
diff --git a/SOURCES/ia-upd-256bit-evex-instr-5.patch b/SOURCES/ia-upd-256bit-evex-instr-5.patch
new file mode 100644
index 0000000..a2d8191
--- /dev/null
+++ b/SOURCES/ia-upd-256bit-evex-instr-5.patch
@@ -0,0 +1,562 @@
+From cd2eeb1be618b5edfc9c6929c07201ff941b31d9 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 5 Mar 2021 07:20:28 -0800
+Subject: [PATCH] x86-64: Add memcmp family functions with 256-bit EVEX
+
+Update ifunc-memcmp.h to select the function optimized with 256-bit EVEX
+instructions using YMM16-YMM31 registers to avoid RTM abort with usable
+AVX512VL, AVX512BW and MOVBE since VZEROUPPER isn't needed at function
+exit.
+
+(cherry picked from commit 91264fe3577fe887b4860923fa6142b5274c8965)
+---
+ sysdeps/x86_64/multiarch/Makefile             |   4 +-
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  10 +
+ sysdeps/x86_64/multiarch/ifunc-memcmp.h       |  13 +-
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S  | 440 ++++++++++++++++++
+ sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S |   4 +
+ 5 files changed, 467 insertions(+), 4 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+ create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
+
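Illustration (not part of the patch): the header comment of memcmp-evex-movbe.S below describes a branchless small-size strategy built on overlapping big-endian loads. A rough C model of the 4-to-7-byte case, returning only the sign of the comparison as that asm path does; the function name is invented.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Sign of memcmp (a, b, n) for 4 <= n <= 7: two overlapping 4-byte
   loads per input, byte-swapped to big endian (movbe in the asm) and
   packed into one 64-bit word, then a single unsigned compare.  */
static int
memcmp_4_7 (const unsigned char *a, const unsigned char *b, size_t n)
{
  uint32_t a_lo, b_lo, a_hi, b_hi;
  memcpy (&a_lo, a, 4);
  memcpy (&b_lo, b, 4);
  memcpy (&a_hi, a + n - 4, 4);   /* overlaps the first load when n < 8 */
  memcpy (&b_hi, b + n - 4, 4);
  uint64_t av = ((uint64_t) __builtin_bswap32 (a_lo) << 32) | __builtin_bswap32 (a_hi);
  uint64_t bv = ((uint64_t) __builtin_bswap32 (b_lo) << 32) | __builtin_bswap32 (b_hi);
  if (av == bv)
    return 0;
  return av > bv ? 1 : -1;
}

int
main (void)
{
  printf ("%d\n", memcmp_4_7 ((const unsigned char *) "abcdef",
                              (const unsigned char *) "abcdez", 6));  /* prints -1 */
  return 0;
}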
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 1cc0a10e..9d79b138 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   memset-avx2-unaligned-erms \
+ 		   memset-avx512-unaligned-erms \
+ 		   memchr-evex \
++		   memcmp-evex-movbe \
+ 		   memmove-evex-unaligned-erms \
+ 		   memrchr-evex \
+ 		   memset-evex-unaligned-erms \
+@@ -81,7 +82,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ 		   wcsncmp-evex \
+ 		   wcsnlen-evex \
+ 		   wcsrchr-evex \
+-		   wmemchr-evex
++		   wmemchr-evex \
++		   wmemcmp-evex-movbe
+ endif
+ 
+ ifeq ($(subdir),debug)
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 7cf83485..c8da910e 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -56,6 +56,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __memcmp_avx2_movbe)
++	      IFUNC_IMPL_ADD (array, i, memcmp,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (MOVBE)),
++			      __memcmp_evex_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
+ 			      __memcmp_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
+@@ -558,6 +563,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __wmemcmp_avx2_movbe)
++	      IFUNC_IMPL_ADD (array, i, wmemcmp,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (MOVBE)),
++			      __wmemcmp_evex_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
+ 			      __wmemcmp_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+index 6c1f3153..3ca1f0a6 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+@@ -23,17 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
+ 
+ static inline void *
+ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    return OPTIMIZE (avx2_movbe);
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
++	return OPTIMIZE (evex_movbe);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	return OPTIMIZE (avx2_movbe);
++    }
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+     return OPTIMIZE (sse4_1);
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+new file mode 100644
+index 00000000..9c093972
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -0,0 +1,440 @@
++/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if IS_IN (libc)
++
++/* memcmp/wmemcmp is implemented as:
++   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
++      to avoid branches.
++   2. Use overlapping compare to avoid branch.
++   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
++      bytes for wmemcmp.
++   4. If size is 8 * VEC_SIZE or less, unroll the loop.
++   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
++      area.
++   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
++   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
++   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
++
++# include <sysdep.h>
++
++# ifndef MEMCMP
++#  define MEMCMP	__memcmp_evex_movbe
++# endif
++
++# define VMOVU		vmovdqu64
++
++# ifdef USE_AS_WMEMCMP
++#  define VPCMPEQ	vpcmpeqd
++# else
++#  define VPCMPEQ	vpcmpeqb
++# endif
++
++# define XMM1		xmm17
++# define XMM2		xmm18
++# define YMM1		ymm17
++# define YMM2		ymm18
++# define YMM3		ymm19
++# define YMM4		ymm20
++# define YMM5		ymm21
++# define YMM6		ymm22
++
++# define VEC_SIZE 32
++# ifdef USE_AS_WMEMCMP
++#  define VEC_MASK 0xff
++#  define XMM_MASK 0xf
++# else
++#  define VEC_MASK 0xffffffff
++#  define XMM_MASK 0xffff
++# endif
++
++/* Warning!
++           wmemcmp has to use SIGNED comparison for elements.
++           memcmp has to use UNSIGNED comparison for elements.
++*/
++
++	.section .text.evex,"ax",@progbits
++ENTRY (MEMCMP)
++# ifdef USE_AS_WMEMCMP
++	shl	$2, %RDX_LP
++# elif defined __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %edx
++# endif
++	cmp	$VEC_SIZE, %RDX_LP
++	jb	L(less_vec)
++
++	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
++	VMOVU	(%rsi), %YMM2
++	VPCMPEQ (%rdi), %YMM2, %k1
++	kmovd	%k1, %eax
++	subl    $VEC_MASK, %eax
++	jnz	L(first_vec)
++
++	cmpq	$(VEC_SIZE * 2), %rdx
++	jbe	L(last_vec)
++
++	/* More than 2 * VEC.  */
++	cmpq	$(VEC_SIZE * 8), %rdx
++	ja	L(more_8x_vec)
++	cmpq	$(VEC_SIZE * 4), %rdx
++	jb	L(last_4x_vec)
++
++	/* From 4 * VEC to 8 * VEC, inclusively. */
++	VMOVU	(%rsi), %YMM1
++	VPCMPEQ (%rdi), %YMM1, %k1
++
++	VMOVU	VEC_SIZE(%rsi), %YMM2
++	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
++
++	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
++	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
++
++	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
++	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
++
++	kandd	%k1, %k2, %k5
++	kandd	%k3, %k4, %k6
++	kandd	%k5, %k6, %k6
++
++	kmovd	%k6, %eax
++	cmpl	$VEC_MASK, %eax
++	jne	L(4x_vec_end)
++
++	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
++	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
++	VMOVU	(%rsi), %YMM1
++	VPCMPEQ (%rdi), %YMM1, %k1
++
++	VMOVU	VEC_SIZE(%rsi), %YMM2
++	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
++	kandd	%k1, %k2, %k5
++
++	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
++	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
++	kandd	%k3, %k5, %k5
++
++	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
++	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
++	kandd	%k4, %k5, %k5
++
++	kmovd	%k5, %eax
++	cmpl	$VEC_MASK, %eax
++	jne	L(4x_vec_end)
++	xorl	%eax, %eax
++	ret
++
++	.p2align 4
++L(last_2x_vec):
++	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
++	VMOVU	(%rsi), %YMM2
++	VPCMPEQ (%rdi), %YMM2, %k2
++	kmovd	%k2, %eax
++	subl    $VEC_MASK, %eax
++	jnz	L(first_vec)
++
++L(last_vec):
++	/* Use overlapping loads to avoid branches.  */
++	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
++	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
++	VMOVU	(%rsi), %YMM2
++	VPCMPEQ (%rdi), %YMM2, %k2
++	kmovd	%k2, %eax
++	subl    $VEC_MASK, %eax
++	jnz	L(first_vec)
++	ret
++
++	.p2align 4
++L(first_vec):
++	/* A byte or int32 is different within 16 or 32 bytes.  */
++	tzcntl	%eax, %ecx
++# ifdef USE_AS_WMEMCMP
++	xorl	%eax, %eax
++	movl	(%rdi, %rcx, 4), %edx
++	cmpl	(%rsi, %rcx, 4), %edx
++L(wmemcmp_return):
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
++# else
++	movzbl	(%rdi, %rcx), %eax
++	movzbl	(%rsi, %rcx), %edx
++	sub	%edx, %eax
++# endif
++	ret
++
++# ifdef USE_AS_WMEMCMP
++	.p2align 4
++L(4):
++	xorl	%eax, %eax
++	movl	(%rdi), %edx
++	cmpl	(%rsi), %edx
++	jne	L(wmemcmp_return)
++	ret
++# else
++	.p2align 4
++L(between_4_7):
++	/* Load as big endian with overlapping movbe to avoid branches.  */
++	movbe	(%rdi), %eax
++	movbe	(%rsi), %ecx
++	shlq	$32, %rax
++	shlq	$32, %rcx
++	movbe	-4(%rdi, %rdx), %edi
++	movbe	-4(%rsi, %rdx), %esi
++	orq	%rdi, %rax
++	orq	%rsi, %rcx
++	subq	%rcx, %rax
++	je	L(exit)
++	sbbl	%eax, %eax
++	orl	$1, %eax
++	ret
++
++	.p2align 4
++L(exit):
++	ret
++
++	.p2align 4
++L(between_2_3):
++	/* Load as big endian to avoid branches.  */
++	movzwl	(%rdi), %eax
++	movzwl	(%rsi), %ecx
++	shll	$8, %eax
++	shll	$8, %ecx
++	bswap	%eax
++	bswap	%ecx
++	movb	-1(%rdi, %rdx), %al
++	movb	-1(%rsi, %rdx), %cl
++	/* Subtraction is okay because the upper 8 bits are zero.  */
++	subl	%ecx, %eax
++	ret
++
++	.p2align 4
++L(1):
++	movzbl	(%rdi), %eax
++	movzbl	(%rsi), %ecx
++	subl	%ecx, %eax
++	ret
++# endif
++
++	.p2align 4
++L(zero):
++	xorl	%eax, %eax
++	ret
++
++	.p2align 4
++L(less_vec):
++# ifdef USE_AS_WMEMCMP
++	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
++	cmpb	$4, %dl
++	je	L(4)
++	jb	L(zero)
++# else
++	cmpb	$1, %dl
++	je	L(1)
++	jb	L(zero)
++	cmpb	$4, %dl
++	jb	L(between_2_3)
++	cmpb	$8, %dl
++	jb	L(between_4_7)
++# endif
++	cmpb	$16, %dl
++	jae	L(between_16_31)
++	/* It is between 8 and 15 bytes.  */
++	vmovq	(%rdi), %XMM1
++	vmovq	(%rsi), %XMM2
++	VPCMPEQ %XMM1, %XMM2, %k2
++	kmovw	%k2, %eax
++	subl    $XMM_MASK, %eax
++	jnz	L(first_vec)
++	/* Use overlapping loads to avoid branches.  */
++	leaq	-8(%rdi, %rdx), %rdi
++	leaq	-8(%rsi, %rdx), %rsi
++	vmovq	(%rdi), %XMM1
++	vmovq	(%rsi), %XMM2
++	VPCMPEQ %XMM1, %XMM2, %k2
++	kmovw	%k2, %eax
++	subl    $XMM_MASK, %eax
++	jnz	L(first_vec)
++	ret
++
++	.p2align 4
++L(between_16_31):
++	/* From 16 to 31 bytes.  No branch when size == 16.  */
++	VMOVU	(%rsi), %XMM2
++	VPCMPEQ (%rdi), %XMM2, %k2
++	kmovw	%k2, %eax
++	subl    $XMM_MASK, %eax
++	jnz	L(first_vec)
++
++	/* Use overlapping loads to avoid branches.  */
++	leaq	-16(%rdi, %rdx), %rdi
++	leaq	-16(%rsi, %rdx), %rsi
++	VMOVU	(%rsi), %XMM2
++	VPCMPEQ (%rdi), %XMM2, %k2
++	kmovw	%k2, %eax
++	subl    $XMM_MASK, %eax
++	jnz	L(first_vec)
++	ret
++
++	.p2align 4
++L(more_8x_vec):
++	/* More than 8 * VEC.  Check the first VEC.  */
++	VMOVU	(%rsi), %YMM2
++	VPCMPEQ (%rdi), %YMM2, %k2
++	kmovd	%k2, %eax
++	subl    $VEC_MASK, %eax
++	jnz	L(first_vec)
++
++	/* Align the first memory area for aligned loads in the loop.
++	   Compute how much the first memory area is misaligned.  */
++	movq	%rdi, %rcx
++	andl	$(VEC_SIZE - 1), %ecx
++	/* Get the negative of offset for alignment.  */
++	subq	$VEC_SIZE, %rcx
++	/* Adjust the second memory area.  */
++	subq	%rcx, %rsi
++	/* Adjust the first memory area which should be aligned now.  */
++	subq	%rcx, %rdi
++	/* Adjust length.  */
++	addq	%rcx, %rdx
++
++L(loop_4x_vec):
++	/* Compare 4 * VEC at a time forward.  */
++	VMOVU	(%rsi), %YMM1
++	VPCMPEQ (%rdi), %YMM1, %k1
++
++	VMOVU	VEC_SIZE(%rsi), %YMM2
++	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
++	kandd	%k2, %k1, %k5
++
++	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
++	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
++	kandd	%k3, %k5, %k5
++
++	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
++	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
++	kandd	%k4, %k5, %k5
++
++	kmovd	%k5, %eax
++	cmpl	$VEC_MASK, %eax
++	jne	L(4x_vec_end)
++
++	addq	$(VEC_SIZE * 4), %rdi
++	addq	$(VEC_SIZE * 4), %rsi
++
++	subq	$(VEC_SIZE * 4), %rdx
++	cmpq	$(VEC_SIZE * 4), %rdx
++	jae	L(loop_4x_vec)
++
++	/* Less than 4 * VEC.  */
++	cmpq	$VEC_SIZE, %rdx
++	jbe	L(last_vec)
++	cmpq	$(VEC_SIZE * 2), %rdx
++	jbe	L(last_2x_vec)
++
++L(last_4x_vec):
++	/* From 2 * VEC to 4 * VEC. */
++	VMOVU	(%rsi), %YMM2
++	VPCMPEQ (%rdi), %YMM2, %k2
++	kmovd	%k2, %eax
++	subl    $VEC_MASK, %eax
++	jnz	L(first_vec)
++
++	addq	$VEC_SIZE, %rdi
++	addq	$VEC_SIZE, %rsi
++	VMOVU	(%rsi), %YMM2
++	VPCMPEQ (%rdi), %YMM2, %k2
++	kmovd	%k2, %eax
++	subl    $VEC_MASK, %eax
++	jnz	L(first_vec)
++
++	/* Use overlapping loads to avoid branches.  */
++	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
++	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
++	VMOVU	(%rsi), %YMM2
++	VPCMPEQ (%rdi), %YMM2, %k2
++	kmovd	%k2, %eax
++	subl    $VEC_MASK, %eax
++	jnz	L(first_vec)
++
++	addq	$VEC_SIZE, %rdi
++	addq	$VEC_SIZE, %rsi
++	VMOVU	(%rsi), %YMM2
++	VPCMPEQ (%rdi), %YMM2, %k2
++	kmovd	%k2, %eax
++	subl    $VEC_MASK, %eax
++	jnz	L(first_vec)
++	ret
++
++	.p2align 4
++L(4x_vec_end):
++	kmovd	%k1, %eax
++	subl	$VEC_MASK, %eax
++	jnz	L(first_vec)
++	kmovd	%k2, %eax
++	subl	$VEC_MASK, %eax
++	jnz	L(first_vec_x1)
++	kmovd	%k3, %eax
++	subl	$VEC_MASK, %eax
++	jnz	L(first_vec_x2)
++	kmovd	%k4, %eax
++	subl	$VEC_MASK, %eax
++	tzcntl	%eax, %ecx
++# ifdef USE_AS_WMEMCMP
++	xorl	%eax, %eax
++	movl	(VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
++	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
++	jmp	L(wmemcmp_return)
++# else
++	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
++	sub	%edx, %eax
++# endif
++	ret
++
++	.p2align 4
++L(first_vec_x1):
++	tzcntl	%eax, %ecx
++# ifdef USE_AS_WMEMCMP
++	xorl	%eax, %eax
++	movl	VEC_SIZE(%rdi, %rcx, 4), %edx
++	cmpl	VEC_SIZE(%rsi, %rcx, 4), %edx
++	jmp	L(wmemcmp_return)
++# else
++	movzbl	VEC_SIZE(%rdi, %rcx), %eax
++	movzbl	VEC_SIZE(%rsi, %rcx), %edx
++	sub	%edx, %eax
++# endif
++	ret
++
++	.p2align 4
++L(first_vec_x2):
++	tzcntl	%eax, %ecx
++# ifdef USE_AS_WMEMCMP
++	xorl	%eax, %eax
++	movl	(VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
++	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
++	jmp	L(wmemcmp_return)
++# else
++	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
++	sub	%edx, %eax
++# endif
++	ret
++END (MEMCMP)
++#endif
+diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
+new file mode 100644
+index 00000000..4726d74a
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
+@@ -0,0 +1,4 @@
++#define MEMCMP __wmemcmp_evex_movbe
++#define USE_AS_WMEMCMP 1
++
++#include "memcmp-evex-movbe.S"
+-- 
+GitLab
+
diff --git a/SOURCES/ia-upd-large-memcpy.patch b/SOURCES/ia-upd-large-memcpy.patch
new file mode 100644
index 0000000..2ae72f0
--- /dev/null
+++ b/SOURCES/ia-upd-large-memcpy.patch
@@ -0,0 +1,445 @@
+From 8f4f10bfb4793f47accee8ea86438879a889b595 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 2 Mar 2022 15:20:14 -0800
+Subject: [PATCH] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
+
+No Bug. This commit updates the large memcpy case (no overlap). The
+update is to perform memcpy on either 2 or 4 contiguous pages at
+once. This 1) helps to alleviate the effects of false memory aliasing
+when destination and source have a close 4k alignment and 2) in most
+cases and for most DRAM units is a modestly more efficient access
+pattern. These changes are a clear performance improvement for
+VEC_SIZE =16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
+test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
+pass.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+(cherry picked from commit 1a8605b6cd257e8a74e29b5b71c057211f5fb847)
+---
+ .../multiarch/memmove-vec-unaligned-erms.S    | 338 ++++++++++++++----
+ 1 file changed, 265 insertions(+), 73 deletions(-)
+
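Illustration (not part of the patch): a plain-C model of the 2-page vs 4-page non-temporal copy decision the hunks below add. It is reached only after the copy is already known to be larger than __x86_shared_non_temporal_threshold with no overlap; the function name and parameters are invented, and VEC_SIZE 32 corresponds to the AVX2/EVEX builds.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32
#define LOG_4X_MEMCPY_THRESH 4   /* 4x path once size >= 16 * NT threshold */

static bool
use_4x_page_copy (uintptr_t dst, uintptr_t src, size_t len,
                  size_t non_temporal_threshold)
{
  /* Mirrors "lea -1(dst - src); test $(PAGE_SIZE - VEC_SIZE * 8)": the
     buffers "page alias" when their distance modulo the page size is
     within 8 * VEC_SIZE, so the interleaved 2-page stream would keep
     hitting the same sets.  */
  uint32_t diff = (uint32_t) (dst - src);
  bool page_alias = ((diff - 1) & (PAGE_SIZE - VEC_SIZE * 8)) == 0;

  return page_alias
         || (len >> LOG_4X_MEMCPY_THRESH) >= non_temporal_threshold;
}

int
main (void)
{
  /* 1 MiB copy, 1 MiB threshold, buffers roughly 2 MiB apart: 2x path.  */
  puts (use_4x_page_copy (0x7f0000200000u, 0x7f0000001000u,
                          1u << 20, 1u << 20) ? "4x" : "2x");
  return 0;
}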
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index c475fed4..3e2dd6bc 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -32,7 +32,16 @@
+       overlapping addresses.
+    6. If size >= __x86_shared_non_temporal_threshold and there is no
+       overlap between destination and source, use non-temporal store
+-      instead of aligned store.  */
++      instead of aligned store copying from either 2 or 4 pages at
++      once.
++   7. For point 6) if size < 16 * __x86_shared_non_temporal_threshold
++      and source and destination do not page alias, copy from 2 pages
++      at once using non-temporal stores. Page aliasing in this case is
++      considered true if destination's page alignment - source's page
++      alignment is less than 8 * VEC_SIZE.
++   8. If size >= 16 * __x86_shared_non_temporal_threshold or source
++      and destination do page alias, copy from 4 pages at once using
++      non-temporal stores.  */
+ 
+ #include <sysdep.h>
+ 
+@@ -64,6 +73,34 @@
+ # endif
+ #endif
+ 
++#ifndef PAGE_SIZE
++# define PAGE_SIZE 4096
++#endif
++
++#if PAGE_SIZE != 4096
++# error Unsupported PAGE_SIZE
++#endif
++
++#ifndef LOG_PAGE_SIZE
++# define LOG_PAGE_SIZE 12
++#endif
++
++#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
++# error Invalid LOG_PAGE_SIZE
++#endif
++
++/* Byte per page for large_memcpy inner loop.  */
++#if VEC_SIZE == 64
++# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
++#else
++# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
++#endif
++
++/* Amount to shift rdx by to compare for memcpy_large_4x.  */
++#ifndef LOG_4X_MEMCPY_THRESH
++# define LOG_4X_MEMCPY_THRESH 4
++#endif
++
+ /* Avoid short distance rep movsb only with non-SSE vector.  */
+ #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
+ # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
+@@ -103,6 +140,28 @@
+ # error Unsupported PREFETCH_SIZE!
+ #endif
+ 
++#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
++# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
++	VMOVU	(offset)base, vec0; \
++	VMOVU	((offset) + VEC_SIZE)base, vec1;
++# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
++	VMOVNT  vec0, (offset)base; \
++	VMOVNT  vec1, ((offset) + VEC_SIZE)base;
++#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
++# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
++	VMOVU	(offset)base, vec0; \
++	VMOVU	((offset) + VEC_SIZE)base, vec1; \
++	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
++	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
++# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
++	VMOVNT	vec0, (offset)base; \
++	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
++	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
++	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
++#else
++# error Invalid LARGE_LOAD_SIZE
++#endif
++
+ #ifndef SECTION
+ # error SECTION is not defined!
+ #endif
+@@ -390,6 +449,15 @@ L(last_4x_vec):
+ 	VZEROUPPER_RETURN
+ 
+ L(more_8x_vec):
++	/* Check if non-temporal move candidate.  */
++#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
++	/* Check non-temporal store threshold.  */
++	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
++	ja	L(large_memcpy_2x)
++#endif
++	/* Entry if rdx is greater than non-temporal threshold but there
++       is overlap.  */
++L(more_8x_vec_check):
+ 	cmpq	%rsi, %rdi
+ 	ja	L(more_8x_vec_backward)
+ 	/* Source == destination is less common.  */
+@@ -416,24 +484,21 @@ L(more_8x_vec):
+ 	subq	%r8, %rdi
+ 	/* Adjust length.  */
+ 	addq	%r8, %rdx
+-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+-	/* Check non-temporal store threshold.  */
+-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+-	ja	L(large_forward)
+-#endif
++
++	.p2align 4
+ L(loop_4x_vec_forward):
+ 	/* Copy 4 * VEC a time forward.  */
+ 	VMOVU	(%rsi), %VEC(0)
+ 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+-	addq	$(VEC_SIZE * 4), %rsi
+-	subq	$(VEC_SIZE * 4), %rdx
++	subq	$-(VEC_SIZE * 4), %rsi
++	addq	$-(VEC_SIZE * 4), %rdx
+ 	VMOVA	%VEC(0), (%rdi)
+ 	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+ 	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+ 	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+-	addq	$(VEC_SIZE * 4), %rdi
++	subq	$-(VEC_SIZE * 4), %rdi
+ 	cmpq	$(VEC_SIZE * 4), %rdx
+ 	ja	L(loop_4x_vec_forward)
+ 	/* Store the last 4 * VEC.  */
+@@ -467,24 +532,21 @@ L(more_8x_vec_backward):
+ 	subq	%r8, %r9
+ 	/* Adjust length.  */
+ 	subq	%r8, %rdx
+-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+-	/* Check non-temporal store threshold.  */
+-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+-	ja	L(large_backward)
+-#endif
++
++	.p2align 4
+ L(loop_4x_vec_backward):
+ 	/* Copy 4 * VEC a time backward.  */
+ 	VMOVU	(%rcx), %VEC(0)
+ 	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+ 	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+ 	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
+-	subq	$(VEC_SIZE * 4), %rcx
+-	subq	$(VEC_SIZE * 4), %rdx
++	addq	$-(VEC_SIZE * 4), %rcx
++	addq	$-(VEC_SIZE * 4), %rdx
+ 	VMOVA	%VEC(0), (%r9)
+ 	VMOVA	%VEC(1), -VEC_SIZE(%r9)
+ 	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
+ 	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
+-	subq	$(VEC_SIZE * 4), %r9
++	addq	$-(VEC_SIZE * 4), %r9
+ 	cmpq	$(VEC_SIZE * 4), %rdx
+ 	ja	L(loop_4x_vec_backward)
+ 	/* Store the first 4 * VEC.  */
+@@ -497,72 +559,202 @@ L(loop_4x_vec_backward):
+ 	VZEROUPPER_RETURN
+ 
+ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+-L(large_forward):
++	.p2align 4
++L(large_memcpy_2x):
++	/* Compute absolute value of difference between source and
++	   destination.  */
++	movq	%rdi, %r9
++	subq	%rsi, %r9
++	movq	%r9, %r8
++	leaq	-1(%r9), %rcx
++	sarq	$63, %r8
++	xorq	%r8, %r9
++	subq	%r8, %r9
+ 	/* Don't use non-temporal store if there is overlap between
+-	   destination and source since destination may be in cache
+-	   when source is loaded.  */
+-	leaq    (%rdi, %rdx), %r10
+-	cmpq    %r10, %rsi
+-	jb	L(loop_4x_vec_forward)
+-L(loop_large_forward):
++	   destination and source since destination may be in cache when
++	   source is loaded.  */
++	cmpq	%r9, %rdx
++	ja	L(more_8x_vec_check)
++
++	/* Cache align destination. First store the first 64 bytes then
++	   adjust alignments.  */
++	VMOVU	(%rsi), %VEC(8)
++#if VEC_SIZE < 64
++	VMOVU	VEC_SIZE(%rsi), %VEC(9)
++#if VEC_SIZE < 32
++	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
++	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
++#endif
++#endif
++	VMOVU	%VEC(8), (%rdi)
++#if VEC_SIZE < 64
++	VMOVU	%VEC(9), VEC_SIZE(%rdi)
++#if VEC_SIZE < 32
++	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
++	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
++#endif
++#endif
++	/* Adjust source, destination, and size.  */
++	movq	%rdi, %r8
++	andq	$63, %r8
++	/* Get the negative of offset for alignment.  */
++	subq	$64, %r8
++	/* Adjust source.  */
++	subq	%r8, %rsi
++	/* Adjust destination which should be aligned now.  */
++	subq	%r8, %rdi
++	/* Adjust length.  */
++	addq	%r8, %rdx
++
++	/* Test if source and destination addresses will alias. If they do,
++	   the larger pipeline in large_memcpy_4x alleviates the
++	   performance drop.  */
++	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
++	jz	L(large_memcpy_4x)
++
++	movq	%rdx, %r10
++	shrq	$LOG_4X_MEMCPY_THRESH, %r10
++	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
++	jae	L(large_memcpy_4x)
++
++	/* edx will store remainder size for copying tail.  */
++	andl	$(PAGE_SIZE * 2 - 1), %edx
++	/* r10 stores outer loop counter.  */
++	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
++	/* Copy 4x VEC at a time from 2 pages.  */
++	.p2align 4
++L(loop_large_memcpy_2x_outer):
++	/* ecx stores inner loop counter.  */
++	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
++L(loop_large_memcpy_2x_inner):
++	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
++	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
++	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
++	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
++	/* Load vectors from rsi.  */
++	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
++	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
++	subq	$-LARGE_LOAD_SIZE, %rsi
++	/* Non-temporal store vectors to rdi.  */
++	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
++	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
++	subq	$-LARGE_LOAD_SIZE, %rdi
++	decl	%ecx
++	jnz	L(loop_large_memcpy_2x_inner)
++	addq	$PAGE_SIZE, %rdi
++	addq	$PAGE_SIZE, %rsi
++	decq	%r10
++	jne	L(loop_large_memcpy_2x_outer)
++	sfence
++
++	/* Check if only last 4 loads are needed.  */
++	cmpl	$(VEC_SIZE * 4), %edx
++	jbe	L(large_memcpy_2x_end)
++
++	/* Handle the last 2 * PAGE_SIZE bytes.  */
++L(loop_large_memcpy_2x_tail):
+ 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
+-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
+-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
++	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
++	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
+ 	VMOVU	(%rsi), %VEC(0)
+ 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+-	addq	$PREFETCHED_LOAD_SIZE, %rsi
+-	subq	$PREFETCHED_LOAD_SIZE, %rdx
+-	VMOVNT	%VEC(0), (%rdi)
+-	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
+-	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
+-	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
+-	addq	$PREFETCHED_LOAD_SIZE, %rdi
+-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
+-	ja	L(loop_large_forward)
+-	sfence
++	subq	$-(VEC_SIZE * 4), %rsi
++	addl	$-(VEC_SIZE * 4), %edx
++	VMOVA	%VEC(0), (%rdi)
++	VMOVA	%VEC(1), VEC_SIZE(%rdi)
++	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
++	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
++	subq	$-(VEC_SIZE * 4), %rdi
++	cmpl	$(VEC_SIZE * 4), %edx
++	ja	L(loop_large_memcpy_2x_tail)
++
++L(large_memcpy_2x_end):
+ 	/* Store the last 4 * VEC.  */
+-	VMOVU	%VEC(5), (%rcx)
+-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
+-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
+-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+-	/* Store the first VEC.  */
+-	VMOVU	%VEC(4), (%r11)
++	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
++	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
++	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
++	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
++
++	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
++	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
++	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
++	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+ 	VZEROUPPER_RETURN
+ 
+-L(large_backward):
+-	/* Don't use non-temporal store if there is overlap between
+-	   destination and source since destination may be in cache
+-	   when source is loaded.  */
+-	leaq    (%rcx, %rdx), %r10
+-	cmpq    %r10, %r9
+-	jb	L(loop_4x_vec_backward)
+-L(loop_large_backward):
+-	/* Copy 4 * VEC a time backward with non-temporal stores.  */
+-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
+-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
+-	VMOVU	(%rcx), %VEC(0)
+-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
+-	subq	$PREFETCHED_LOAD_SIZE, %rcx
+-	subq	$PREFETCHED_LOAD_SIZE, %rdx
+-	VMOVNT	%VEC(0), (%r9)
+-	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
+-	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
+-	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
+-	subq	$PREFETCHED_LOAD_SIZE, %r9
+-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
+-	ja	L(loop_large_backward)
++	.p2align 4
++L(large_memcpy_4x):
++	movq	%rdx, %r10
++	/* edx will store remainder size for copying tail.  */
++	andl	$(PAGE_SIZE * 4 - 1), %edx
++	/* r10 stores outer loop counter.  */
++	shrq	$(LOG_PAGE_SIZE + 2), %r10
++	/* Copy 4x VEC at a time from 4 pages.  */
++	.p2align 4
++L(loop_large_memcpy_4x_outer):
++	/* ecx stores inner loop counter.  */
++	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
++L(loop_large_memcpy_4x_inner):
++	/* Only one prefetch set per page, as doing 4 pages gives more
++	   time for the prefetcher to keep up.  */
++	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
++	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
++	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
++	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
++	/* Load vectors from rsi.  */
++	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
++	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
++	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
++	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
++	subq	$-LARGE_LOAD_SIZE, %rsi
++	/* Non-temporal store vectors to rdi.  */
++	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
++	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
++	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
++	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
++	subq	$-LARGE_LOAD_SIZE, %rdi
++	decl	%ecx
++	jnz	L(loop_large_memcpy_4x_inner)
++	addq	$(PAGE_SIZE * 3), %rdi
++	addq	$(PAGE_SIZE * 3), %rsi
++	decq	%r10
++	jne	L(loop_large_memcpy_4x_outer)
+ 	sfence
+-	/* Store the first 4 * VEC.  */
+-	VMOVU	%VEC(4), (%rdi)
+-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
+-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
+-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+-	/* Store the last VEC.  */
+-	VMOVU	%VEC(8), (%r11)
++	/* Check if only last 4 loads are needed.  */
++	cmpl	$(VEC_SIZE * 4), %edx
++	jbe	L(large_memcpy_4x_end)
++
++	/* Handle the last 4 * PAGE_SIZE bytes.  */
++L(loop_large_memcpy_4x_tail):
++	/* Copy 4 * VEC a time forward with non-temporal stores.  */
++	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
++	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
++	VMOVU	(%rsi), %VEC(0)
++	VMOVU	VEC_SIZE(%rsi), %VEC(1)
++	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
++	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
++	subq	$-(VEC_SIZE * 4), %rsi
++	addl	$-(VEC_SIZE * 4), %edx
++	VMOVA	%VEC(0), (%rdi)
++	VMOVA	%VEC(1), VEC_SIZE(%rdi)
++	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
++	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
++	subq	$-(VEC_SIZE * 4), %rdi
++	cmpl	$(VEC_SIZE * 4), %edx
++	ja	L(loop_large_memcpy_4x_tail)
++
++L(large_memcpy_4x_end):
++	/* Store the last 4 * VEC.  */
++	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
++	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
++	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
++	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
++
++	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
++	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
++	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
++	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+ 	VZEROUPPER_RETURN
+ #endif
+ END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+-- 
+GitLab
+
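The loop_large_memcpy_2x and loop_large_memcpy_4x paths introduced above interleave loads and non-temporal stores across two or four page-sized strides in every iteration, so several independent streams are in flight and the hardware prefetcher can keep up. A minimal C sketch of the two-stride idea follows, with plain memcpy chunks standing in for the VEC/VMOVNT primitives and prefetch hints; STRIDE, CHUNK and the function name are illustrative assumptions, not values taken from the assembly.

    #include <stddef.h>
    #include <string.h>

    #define STRIDE 4096   /* stands in for PAGE_SIZE in the patch */
    #define CHUNK   256   /* bytes copied per stream per inner step */

    /* Copy LEN bytes (assumed to be a multiple of 2 * STRIDE) by walking
       two page-sized streams in parallel, the way loop_large_memcpy_2x
       reads (%rsi) and PAGE_SIZE(%rsi) in the same inner iteration.  */
    static void
    copy_2x_sketch (char *dst, const char *src, size_t len)
    {
      size_t blocks = len / (2 * STRIDE);          /* outer counter, like %r10 */
      while (blocks--)
        {
          for (size_t off = 0; off < STRIDE; off += CHUNK)   /* inner, like %ecx */
            {
              memcpy (dst + off, src + off, CHUNK);                    /* stream 0 */
              memcpy (dst + STRIDE + off, src + STRIDE + off, CHUNK);  /* stream 1 */
            }
          src += 2 * STRIDE;
          dst += 2 * STRIDE;
        }
    }

The 4x variant does the same with four streams and only one prefetch set per page, and whatever remainder does not fill a full group of pages is finished by the corresponding _tail loop plus the final unaligned 4 * VEC stores.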
diff --git a/SOURCES/ia-upperbound-enh-rep_movsb.patch b/SOURCES/ia-upperbound-enh-rep_movsb.patch
new file mode 100644
index 0000000..55b8646
--- /dev/null
+++ b/SOURCES/ia-upperbound-enh-rep_movsb.patch
@@ -0,0 +1,91 @@
+From 571a3ddd938b742af8fc2b02f26b4b3296ea8a94 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 2 Mar 2022 16:12:40 -0800
+Subject: [PATCH] x86: Adding an upper bound for Enhanced REP MOVSB.
+
+In the process of optimizing memcpy for AMD machines, we have found that
+vector move operations outperform enhanced REP MOVSB for data
+transfers above the L2 cache size on Zen3 architectures.
+To handle this use case, we are adding an upper bound parameter on
+enhanced REP MOVSB: '__x86_rep_movsb_stop_threshold'.
+As per large-bench results, we are configuring this parameter to the
+L2 cache size for AMD machines starting with the Zen3 architecture,
+which supports the ERMS feature.
+For architectures other than AMD, it is set to the computed value of
+the non-temporal threshold parameter.
+
+Reviewed-by: Premachandra Mallappa <premachandra.mallappa@amd.com>
+
+(cherry picked from commit 6e02b3e9327b7dbb063958d2b124b64fcb4bbe3f)
+---
+ sysdeps/x86/cacheinfo.h                            | 14 ++++++++++++++
+ .../x86_64/multiarch/memmove-vec-unaligned-erms.S  |  7 +++++--
+ 2 files changed, 19 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
+index 02556961..b982982f 100644
+--- a/sysdeps/x86/cacheinfo.h
++++ b/sysdeps/x86/cacheinfo.h
+@@ -45,6 +45,9 @@ long int __x86_rep_movsb_threshold attribute_hidden = 2048;
+ /* Threshold to use Enhanced REP STOSB.  */
+ long int __x86_rep_stosb_threshold attribute_hidden = 2048;
+ 
++/* Threshold to stop using Enhanced REP MOVSB.  */
++long int __x86_rep_movsb_stop_threshold attribute_hidden;
++
+ static void
+ get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
+ 		       long int core)
+@@ -352,6 +355,12 @@ init_cacheinfo (void)
+ 	      shared += core;
+             }
+ 	}
++
++      /* The ERMS feature is implemented from AMD Zen3 onwards, but it
++	 performs poorly for data above the L2 cache size.  Hence, add an
++	 upper bound threshold parameter to limit the use of Enhanced
++	 REP MOVSB operations and set its value to the L2 cache size.  */
++      __x86_rep_movsb_stop_threshold = core;
+     }
+ 
+   if (cpu_features->data_cache_size != 0)
+@@ -421,6 +430,11 @@ init_cacheinfo (void)
+   else
+     __x86_rep_movsb_threshold = rep_movsb_threshold;
+ 
++  /* Set the upper bound of ERMS to the computed value of the
++     non-temporal threshold for architectures other than AMD.  */
++  if (cpu_features->basic.kind != arch_kind_amd)
++    __x86_rep_movsb_stop_threshold = __x86_shared_non_temporal_threshold;
++
+ # if HAVE_TUNABLES
+   __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
+ # endif
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 572cef04..620ce3a8 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -30,7 +30,10 @@
+       load and aligned store.  Load the last 4 * VEC and first VEC
+       before the loop and store them after the loop to support
+       overlapping addresses.
+-   6. If size >= __x86_shared_non_temporal_threshold and there is no
++   6. On machines with the ERMS feature, if size is greater than or
++      equal to __x86_rep_movsb_threshold and less than
++      __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
++   7. If size >= __x86_shared_non_temporal_threshold and there is no
+       overlap between destination and source, use non-temporal store
+       instead of aligned store copying from either 2 or 4 pages at
+       once.
+@@ -311,7 +314,7 @@ L(return):
+ #endif
+ 
+ L(movsb):
+-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
++	cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+ 	jae	L(more_8x_vec)
+ 	cmpq	%rsi, %rdi
+ 	jb	1f
+-- 
+GitLab
+
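Taken together, the change above turns the single REP MOVSB cutoff into a window: ERMS is used only for sizes between __x86_rep_movsb_threshold and the new __x86_rep_movsb_stop_threshold, which is the L2 size on AMD Zen3 and later and the non-temporal threshold elsewhere. A rough C sketch of the resulting dispatch policy is shown below; the enum and helper are invented for illustration, and the real decision is made in memmove-vec-unaligned-erms.S, which also accounts for overlap, alignment and small sizes.

    #include <stddef.h>

    extern long int __x86_rep_movsb_threshold;
    extern long int __x86_rep_movsb_stop_threshold;
    extern long int __x86_shared_non_temporal_threshold;

    enum copy_path { VEC_LOOP, REP_MOVSB, NON_TEMPORAL };

    /* Illustrative only: pick a strategy for a non-overlapping copy of
       SIZE bytes that is already too big for the small-size paths.  */
    static enum copy_path
    pick_copy_path (size_t size)
    {
      if (size >= (size_t) __x86_rep_movsb_threshold
          && size < (size_t) __x86_rep_movsb_stop_threshold)
        return REP_MOVSB;        /* the ERMS window this patch introduces */
      if (size >= (size_t) __x86_shared_non_temporal_threshold)
        return NON_TEMPORAL;     /* bypass the caches for very large copies */
      return VEC_LOOP;           /* otherwise the 4 * VEC aligned-store loop */
    }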
diff --git a/SOURCES/ia-use-xmmn-vpxor.patch b/SOURCES/ia-use-xmmn-vpxor.patch
new file mode 100644
index 0000000..a35cf75
--- /dev/null
+++ b/SOURCES/ia-use-xmmn-vpxor.patch
@@ -0,0 +1,45 @@
+From e7e6385876577e04101a9b9ed8d10c34f634c905 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Thu, 11 Jun 2020 12:41:18 -0700
+Subject: [PATCH] x86_64: Use %xmmN with vpxor to clear a vector register
+
+Since "vpxor %xmmN, %xmmN, %xmmN" clears the whole vector register, use
+%xmmN, instead of %ymmN, with vpxor to clear a vector register.
+
+(cherry picked from commit a35a59036ebae3efcdf5e8167610e0656fca9770)
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S  | 4 ++--
+ sysdeps/x86_64/multiarch/strrchr-avx2.S | 2 +-
+ 2 files changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 433ae047..70d8499b 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -105,8 +105,8 @@ ENTRY (STRCMP)
+ # endif
+ 	movl	%edi, %eax
+ 	xorl	%edx, %edx
+-	/* Make %ymm7 all zeros in this function.  */
+-	vpxor	%ymm7, %ymm7, %ymm7
++	/* Make %xmm7 (%ymm7) all zeros in this function.  */
++	vpxor	%xmm7, %xmm7, %xmm7
+ 	orl	%esi, %eax
+ 	andl	$(PAGE_SIZE - 1), %eax
+ 	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
+diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
+index 9f22a15e..c949410b 100644
+--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
+@@ -48,7 +48,7 @@ ENTRY (STRRCHR)
+ 	movl	%edi, %ecx
+ 	/* Broadcast CHAR to YMM4.  */
+ 	VPBROADCAST %xmm4, %ymm4
+-	vpxor	%ymm0, %ymm0, %ymm0
++	vpxor	%xmm0, %xmm0, %xmm0
+ 
+ 	/* Check if we may cross page boundary with one vector load.  */
+ 	andl	$(2 * VEC_SIZE - 1), %ecx
+-- 
+GitLab
+
diff --git a/SOURCES/ia-use-zmm16-zmm31-avx512-1.patch b/SOURCES/ia-use-zmm16-zmm31-avx512-1.patch
new file mode 100644
index 0000000..851e2ac
--- /dev/null
+++ b/SOURCES/ia-use-zmm16-zmm31-avx512-1.patch
@@ -0,0 +1,149 @@
+From df380c9a1a74df30968bacb067ed77e583bd960d Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sun, 7 Mar 2021 09:44:18 -0800
+Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memset family functions
+
+Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
+with AVX512 instructions using ZMM16-ZMM31 registers to avoid RTM abort
+with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
+function exit.
+
+(cherry picked from commit 4e2d8f352774b56078c34648b14a2412c38384f4)
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c       | 14 +++++++++-----
+ sysdeps/x86_64/multiarch/ifunc-memset.h          | 13 ++++++++-----
+ sysdeps/x86_64/multiarch/ifunc-wmemset.h         | 12 ++++++------
+ .../multiarch/memset-avx512-unaligned-erms.S     | 16 ++++++++--------
+ 4 files changed, 31 insertions(+), 24 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index c1efeec0..d969a156 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -211,10 +211,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+@@ -252,10 +254,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+@@ -719,7 +723,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __wmemset_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __wmemset_avx512_unaligned))
+ 
+ #ifdef SHARED
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
+index 6f3375cc..19795938 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
+@@ -53,13 +53,16 @@ IFUNC_SELECTOR (void)
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+     {
+-      if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+-	return OPTIMIZE (avx512_no_vzeroupper);
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE (avx512_unaligned_erms);
+ 
+-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+-	return OPTIMIZE (avx512_unaligned_erms);
++	  return OPTIMIZE (avx512_unaligned);
++	}
+ 
+-      return OPTIMIZE (avx512_unaligned);
++      return OPTIMIZE (avx512_no_vzeroupper);
+     }
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+index bdc94c6c..98c5d406 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
++++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+@@ -33,13 +33,13 @@ IFUNC_SELECTOR (void)
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+-      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+-	return OPTIMIZE (avx512_unaligned);
+-
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+-	return OPTIMIZE (evex_unaligned);
++	{
++	  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
++	    return OPTIMIZE (avx512_unaligned);
++
++	  return OPTIMIZE (evex_unaligned);
++	}
+ 
+       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ 	return OPTIMIZE (avx2_unaligned_rtm);
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+index 0783979c..22e7b187 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+@@ -1,22 +1,22 @@
+ #if IS_IN (libc)
+ # define VEC_SIZE	64
+-# define VEC(i)		zmm##i
++# define XMM0		xmm16
++# define YMM0		ymm16
++# define VEC0		zmm16
++# define VEC(i)		VEC##i
+ # define VMOVU		vmovdqu64
+ # define VMOVA		vmovdqa64
++# define VZEROUPPER
+ 
+ # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  vmovd d, %xmm0; \
+   movq r, %rax; \
+-  vpbroadcastb %xmm0, %xmm0; \
+-  vpbroadcastq %xmm0, %zmm0
++  vpbroadcastb d, %VEC0
+ 
+ # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  vmovd d, %xmm0; \
+   movq r, %rax; \
+-  vpbroadcastd %xmm0, %xmm0; \
+-  vpbroadcastq %xmm0, %zmm0
++  vpbroadcastd d, %VEC0
+ 
+-# define SECTION(p)		p##.avx512
++# define SECTION(p)		p##.evex512
+ # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
+ # define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
+ 
+-- 
+GitLab
+
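One side effect of the rewritten MEMSET_VDUP_TO_VEC0_AND_SET_RETURN above is that the fill byte is now broadcast straight from a general-purpose register into zmm16: the byte broadcast needs AVX512BW, and the xmm16/ymm16 forms used on the smaller-size paths need AVX512VL, which is why the ifunc checks are tightened from plain AVX512F. A small intrinsics sketch of the same broadcast follows; the wrapper name is an illustrative assumption.

    #include <immintrin.h>

    /* With -mavx512bw this can compile to a single vpbroadcastb from a
       general-purpose register into a 512-bit register, matching the new
       macro; with only AVX512F the compiler has to synthesize the
       broadcast through an xmm register first, as the old macro did.  */
    static __m512i
    broadcast_fill_byte (int c)
    {
      return _mm512_set1_epi8 ((char) c);
    }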
diff --git a/SOURCES/ia-use-zmm16-zmm31-avx512-2.patch b/SOURCES/ia-use-zmm16-zmm31-avx512-2.patch
new file mode 100644
index 0000000..756f4d0
--- /dev/null
+++ b/SOURCES/ia-use-zmm16-zmm31-avx512-2.patch
@@ -0,0 +1,165 @@
+From edb998421aef6f9b3bc7716f618d913dcfff64f7 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sun, 7 Mar 2021 09:45:23 -0800
+Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memmove family functions
+
+Update ifunc-memmove.h to select the function optimized with AVX512
+instructions using ZMM16-ZMM31 registers to avoid RTM abort with usable
+AVX512VL since VZEROUPPER isn't needed at function exit.
+
+(cherry picked from commit e4fda4631017e49d4ee5a2755db34289b6860fa4)
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 24 +++++++++---------
+ sysdeps/x86_64/multiarch/ifunc-memmove.h      | 12 +++++----
+ .../multiarch/memmove-avx512-unaligned-erms.S | 25 +++++++++++++++++--
+ 3 files changed, 42 insertions(+), 19 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index d969a156..fec384f6 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -83,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memmove_chk_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+@@ -148,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memmove_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
+ 			      __memmove_ssse3_back)
+@@ -733,10 +733,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memcpy_chk_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+@@ -802,10 +802,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memcpy_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy, 1,
+@@ -819,10 +819,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __mempcpy_chk_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+@@ -864,10 +864,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __mempcpy_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+ 			      CPU_FEATURE_USABLE (AVX),
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+index fa09b9fb..014e95c7 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+@@ -56,13 +56,15 @@ IFUNC_SELECTOR (void)
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+     {
+-      if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+-	return OPTIMIZE (avx512_no_vzeroupper);
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE (avx512_unaligned_erms);
+ 
+-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+-	return OPTIMIZE (avx512_unaligned_erms);
++	  return OPTIMIZE (avx512_unaligned);
++	}
+ 
+-      return OPTIMIZE (avx512_unaligned);
++      return OPTIMIZE (avx512_no_vzeroupper);
+     }
+ 
+   if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+index aac1515c..848848ab 100644
+--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+@@ -1,11 +1,32 @@
+ #if IS_IN (libc)
+ # define VEC_SIZE	64
+-# define VEC(i)		zmm##i
++# define XMM0		xmm16
++# define XMM1		xmm17
++# define YMM0		ymm16
++# define YMM1		ymm17
++# define VEC0		zmm16
++# define VEC1		zmm17
++# define VEC2		zmm18
++# define VEC3		zmm19
++# define VEC4		zmm20
++# define VEC5		zmm21
++# define VEC6		zmm22
++# define VEC7		zmm23
++# define VEC8		zmm24
++# define VEC9		zmm25
++# define VEC10		zmm26
++# define VEC11		zmm27
++# define VEC12		zmm28
++# define VEC13		zmm29
++# define VEC14		zmm30
++# define VEC15		zmm31
++# define VEC(i)		VEC##i
+ # define VMOVNT		vmovntdq
+ # define VMOVU		vmovdqu64
+ # define VMOVA		vmovdqa64
++# define VZEROUPPER
+ 
+-# define SECTION(p)		p##.avx512
++# define SECTION(p)		p##.evex512
+ # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
+ 
+ # include "memmove-vec-unaligned-erms.S"
+-- 
+GitLab
+
diff --git a/SOURCES/ia-wcslen-opt-sse4_1.patch b/SOURCES/ia-wcslen-opt-sse4_1.patch
new file mode 100644
index 0000000..1169d8d
--- /dev/null
+++ b/SOURCES/ia-wcslen-opt-sse4_1.patch
@@ -0,0 +1,181 @@
+From f98c4e939dc562ee4f687cead51b6fc5fb5ad18f Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Jun 2021 01:19:34 -0400
+Subject: [PATCH] x86-64: Add wcslen optimize for sse4.1
+
+No bug. This commit adds the ifunc / build infrastructure
+necessary for wcslen to prefer the sse4.1 implementation
+in strlen-vec.S. test-wcslen.c is passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+(cherry picked from commit 6f573a27b6c8b4236445810a44660612323f5a73)
+---
+ sysdeps/x86_64/multiarch/Makefile          |  4 +-
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c |  3 ++
+ sysdeps/x86_64/multiarch/ifunc-wcslen.h    | 52 ++++++++++++++++++++++
+ sysdeps/x86_64/multiarch/wcslen-sse4_1.S   |  4 ++
+ sysdeps/x86_64/multiarch/wcslen.c          |  2 +-
+ sysdeps/x86_64/multiarch/wcsnlen.c         | 34 +-------------
+ 6 files changed, 63 insertions(+), 36 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcslen.h
+ create mode 100644 sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 491c7698..65fde4eb 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -93,8 +93,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ 		   wcscpy-ssse3 wcscpy-c \
+ 		   wcschr-sse2 wcschr-avx2 \
+ 		   wcsrchr-sse2 wcsrchr-avx2 \
+-		   wcsnlen-sse4_1 wcsnlen-c \
+-		   wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
++		   wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
++		   wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
+ 		   wcschr-avx2-rtm \
+ 		   wcscmp-avx2-rtm \
+ 		   wcslen-avx2-rtm \
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index f1a6460a..580913ca 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -657,6 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcslen_evex)
++	      IFUNC_IMPL_ADD (array, i, wcsnlen,
++			      CPU_FEATURE_USABLE (SSE4_1),
++			      __wcsnlen_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
+diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
+new file mode 100644
+index 00000000..39e33473
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
+@@ -0,0 +1,52 @@
++/* Common definition for ifunc selections for wcslen and wcsnlen
++   All versions must be listed in ifunc-impl-list.c.
++   Copyright (C) 2017-2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <init-arch.h>
++
++extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
++
++static inline void *
++IFUNC_SELECTOR (void)
++{
++  const struct cpu_features* cpu_features = __get_cpu_features ();
++
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
++      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
++	return OPTIMIZE (evex);
++
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_rtm);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	return OPTIMIZE (avx2);
++    }
++
++  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
++    return OPTIMIZE (sse4_1);
++
++  return OPTIMIZE (sse2);
++}
+diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+new file mode 100644
+index 00000000..7e62621a
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+@@ -0,0 +1,4 @@
++#define AS_WCSLEN
++#define strlen	__wcslen_sse4_1
++
++#include "strlen-vec.S"
+diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c
+index 6d06e47c..3b04b75b 100644
+--- a/sysdeps/x86_64/multiarch/wcslen.c
++++ b/sysdeps/x86_64/multiarch/wcslen.c
+@@ -24,7 +24,7 @@
+ # undef __wcslen
+ 
+ # define SYMBOL_NAME wcslen
+-# include "ifunc-avx2.h"
++# include "ifunc-wcslen.h"
+ 
+ libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ());
+ weak_alias (__wcslen, wcslen);
+diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
+index 20b731ae..06736410 100644
+--- a/sysdeps/x86_64/multiarch/wcsnlen.c
++++ b/sysdeps/x86_64/multiarch/wcsnlen.c
+@@ -24,39 +24,7 @@
+ # undef __wcsnlen
+ 
+ # define SYMBOL_NAME wcsnlen
+-# include <init-arch.h>
+-
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+-
+-static inline void *
+-IFUNC_SELECTOR (void)
+-{
+-  const struct cpu_features* cpu_features = __get_cpu_features ();
+-
+-  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+-      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    {
+-      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+-	return OPTIMIZE (evex);
+-
+-      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+-	return OPTIMIZE (avx2_rtm);
+-
+-      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+-	return OPTIMIZE (avx2);
+-    }
+-
+-  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+-    return OPTIMIZE (sse4_1);
+-
+-  return OPTIMIZE (sse2);
+-}
++# include "ifunc-wcslen.h"
+ 
+ libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
+ weak_alias (__wcsnlen, wcsnlen);
+-- 
+GitLab
+
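The new ifunc-wcslen.h centralizes the selector that wcslen.c and wcsnlen.c now both include. For readers unfamiliar with the mechanism, here is a toy GNU ifunc in plain C; the names are invented for illustration, and glibc itself wires its selector up through the libc_ifunc_redirected macro rather than using the attribute directly.

    #include <stddef.h>
    #include <wchar.h>

    /* Baseline implementation, standing in for __wcslen_sse2.  */
    static size_t
    my_wcslen_generic (const wchar_t *s)
    {
      size_t n = 0;
      while (s[n] != L'\0')
        n++;
      return n;
    }

    /* Resolver, run once by the dynamic loader.  A real resolver would
       test CPU_FEATURE_USABLE_P (cpu_features, SSE4_1) and friends, just
       as IFUNC_SELECTOR does in ifunc-wcslen.h.  */
    static size_t (*resolve_my_wcslen (void)) (const wchar_t *)
    {
      return my_wcslen_generic;
    }

    size_t my_wcslen (const wchar_t *)
      __attribute__ ((ifunc ("resolve_my_wcslen")));

Calls to my_wcslen then go through an IRELATIVE relocation whose target the dynamic loader fills in from the resolver's return value.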
diff --git a/SOURCES/ia-wordsize-64-roundeven.patch b/SOURCES/ia-wordsize-64-roundeven.patch
new file mode 100644
index 0000000..c11a83f
--- /dev/null
+++ b/SOURCES/ia-wordsize-64-roundeven.patch
@@ -0,0 +1,39 @@
+From 12932b371cd6eb8c3edbfb037e615ffbb546be86 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Tue, 22 Feb 2022 06:49:39 -0800
+Subject: [PATCH] math: Redirect wordsize-64 roundeven function
+
+Redirect wordsize-64 roundeven function when there is no
+
+commit 9e97f239eae1f2b1d2e694d844c0f6fd7c4dd271
+Author: Wilco Dijkstra <wdijkstr@arm.com>
+Date:   Thu Jan 7 15:26:26 2021 +0000
+
+    Remove dbl-64/wordsize-64 (part 2)
+---
+ sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
+index 7bbbb2dc..8728d0f2 100644
+--- a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
++++ b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
++#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-double.h>
+@@ -67,5 +68,6 @@ __roundeven (double x)
+   INSERT_WORDS64 (x, ix);
+   return x;
+ }
+-hidden_def (__roundeven)
++#ifndef __roundeven
+ libm_alias_double (__roundeven, roundeven)
++#endif
+-- 
+GitLab
+
diff --git a/SPECS/glibc.spec b/SPECS/glibc.spec
index 2c37db1..bd945a7 100644
--- a/SPECS/glibc.spec
+++ b/SPECS/glibc.spec
@@ -1,6 +1,6 @@
 %define glibcsrcdir glibc-2.28
 %define glibcversion 2.28
-%define glibcrelease 200%{?dist}
+%define glibcrelease 204%{?dist}
 # Pre-release tarballs are pulled in from git using a command that is
 # effectively:
 #
@@ -893,11 +893,89 @@ Patch698: glibc-rh2065588-11.patch
 Patch699: glibc-rh2065588-12.patch
 Patch700: glibc-rh2065588-13.patch
 Patch701: glibc-rh2072329.patch 
+Patch702: glibc-rh1982608.patch
+Patch703: glibc-rh1961109.patch
+Patch704: glibc-rh2086853.patch 
+Patch705: glibc-rh2077835.patch
 
 # Intel Optimizations
-Patch1001: 0001-void-short-distance-rep-movsb.patch
-Patch1002: 0002-pack-up-patches.patch
-Patch1003: 0003-only-avoid-short-distance-rep-mobsb-on-icx.patch
+Patch10001: glibc-sw24097-1.patch
+Patch10002: glibc-sw24097-2.patch
+Patch10003: glibc-sw24097-3.patch
+Patch10004: glibc-sw24097-4.patch
+Patch10005: glibc-sw24097-5.patch
+Patch10006: glibc-sw24097-6.patch
+Patch10007: glibc-sw24097-7.patch
+Patch10008: glibc-sw24097-8.patch
+Patch10009: glibc-sw24155.patch
+Patch10010: glibc-sw28755-1.patch
+Patch10011: ia-prefer_no_vzeroupper.patch
+Patch10012: ia-upd-256bit-evex-instr-1.patch
+Patch10013: ia-upd-256bit-evex-instr-2.patch
+Patch10014: ia-upd-256bit-evex-instr-3.patch
+Patch10015: ia-upd-256bit-evex-instr-4.patch
+Patch10016: ia-upd-256bit-evex-instr-5.patch
+Patch10017: ia-avx-opt-funct-rtm.patch
+Patch10018: ia-string-funct-test-rtm.patch
+Patch10019: ia-use-zmm16-zmm31-avx512-1.patch
+Patch10020: ia-use-zmm16-zmm31-avx512-2.patch
+Patch10021: ia-ifdef-indt-strlen-evex.patch
+Patch10022: ia-bmi2-req-strlen-strnlen.patch
+Patch10023: ia-memchr-opt-avx2.patch
+Patch10024: glibc-sw27974-1.patch
+Patch10025: ia-strlen-opt-avx2.patch
+Patch10026: ia-opt-memchr-evex.patch
+Patch10027: ia-unk-vector-opr-memchr-evex.patch
+Patch10028: ia-move-strlen-multiarch.patch
+Patch10029: ia-wcslen-opt-sse4_1.patch
+Patch10030: glibc-sw27974-2.patch
+Patch10031: ia-opt-strlen-evex.patch
+Patch10032: glibc-sw28033.patch
+Patch10033: glibc-sw28064.patch
+Patch10034: glibc-sw28896.patch
+Patch10035: glibc-sw25966.patch
+Patch10036: ia-use-xmmn-vpxor.patch
+Patch10037: ia-refacto-imp-prf-strchr-avx2.patch
+Patch10038: glibc-sw27130.patch
+Patch10039: ia-upd-large-memcpy.patch
+Patch10040: ia-bmi2-req-strchr-avx2.patch
+Patch10041: ia-opt-less_vec-memset-avx512.patch
+Patch10042: ia-opt-strchr-avx2.patch
+Patch10043: ia-opt-strchr-evex.patch
+Patch10044: ia-opt-memchr-evex-rtm.patch
+Patch10045: ia-opt-memcmp-avx2-movbe.patch
+Patch10046: ia-opt-memcmp-evex-movbe-1.patch
+Patch10047: ia-impr-memset-vec-unaligned-erms.patch
+Patch10048: ia-impr-memmove-vec-unaligned-erms.patch
+Patch10049: ia-rmv-ofl-chk-wcsnlen-sse4_1.patch
+Patch10050: ia-upperbound-enh-rep_movsb.patch
+Patch10051: ia-avoid_short_distance_rep_movsb.patch
+Patch10052: ia-testl-x86_string_control.patch
+Patch10053: ia-wordsize-64-roundeven.patch
+Patch10054: ia-redirect-roundeven-funct.patch
+Patch10055: ia-roundeven_sse4_1.patch
+Patch10056: glibc-sw28755-2.patch
+Patch10057: glibc-sw28252.patch
+Patch10058: ia-new-macro-entry_p2align.patch
+Patch10059: ia-opt-memcmp-evex-movbe-2.patch
+Patch10060: ia-opt-memset-vec-unaligned-erms.patch
+Patch10061: ia-rplc-sse2-inst-avx-memcmp-evex-movbe.patch
+Patch10062: ia-imp-strcmp-evex.patch
+Patch10063: ia-rmv-prefer_avx2_strcmp.patch
+Patch10064: ia-rplc-movzx-movzbl.patch
+Patch10065: ia-set-rep_movsb_threshold-2112.patch
+Patch10066: ia-opt-memmove-vec-unaligned-erms.patch
+Patch10067: ia-double-rep_movsb_threshold-erms.patch
+Patch10068: ia-shrink-memcmp-sse4-code-size.patch
+Patch10069: glibc-sw28537-1.patch
+Patch10070: glibc-sw28537-2.patch
+Patch10071: glibc-sw28537-3.patch
+Patch10072: ia-rplc-cas-avoid-extra-load.patch
+Patch10073: glibc-sw28646.patch
+Patch10074: ia-no-index_arch_prefer_no_avx512-avx-vnni.patch
+Patch10075: ia-opt-less_vec-memcmp-evex-movb.patch
+Patch10076: glibc-sw28537-4.patch
+
 
 ##############################################################################
 # Continued list of core "glibc" package information:
@@ -2728,8 +2806,20 @@ fi
 %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
 
 %changelog
-* Tue May 03 2022 Ali Erdinc Koroglu <aekoroglu@centosproject.org> - 2.28-200
-- Intel optimizations added
+* Fri May 20 2022 Ali Erdinc Koroglu <aekoroglu@centosproject.org> - 2.28-204
+- Intel architecture optimizations
+
+* Tue May 17 2022 Patsy Griffin <patsy@redhat.com> - 2.28-203
+- s390x: Add support for IBM z16. (#2077835)
+
+* Mon May 16 2022 Siddhesh Poyarekar <siddhesh@redhat.com> - 2.28-202
+- Ensure that condition in __glibc_fortify is a constant (#2086853)
+
+* Tue May 10 2022 Arjun Shankar <arjun@redhat.com> - 2.28-201
+- Add missing MACRON to EBCDIC character sets (#1961109)
+
+* Wed May  4 2022 DJ Delorie <dj@redhat.com> - 2.28-200
+- Fix glob defects on certain XFS filesystems (#1982608)
 
 * Tue Apr 26 2022 Siddhesh Poyarekar <siddhesh@redhat.com> - 2.28-199
 - Fix fortify false positive with mbsrtowcs and mbsnrtowcs (#2072329).