Full backports of the following patches:

commit b97eb2bdb1ed72982a7821c3078be591051cef59
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Mon Mar 16 14:58:43 2015 -0700

    Preserve bound registers in _dl_runtime_resolve

    We need to add a BND prefix before the indirect branch at the end of
    _dl_runtime_resolve to preserve bound registers.

commit ddd85a65b6e3d6ec1e756c1f78559f99a2c943ca
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Jul 7 05:23:24 2015 -0700

    Add and use sysdeps/i386/link-defines.sym

    Define macros for fields in La_i86_regs and La_i86_retval and use them
    in dl-trampoline.S instead of hardcoded values.
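For reference, each line of the .sym file added below is evaluated at build
time by glibc's gen-as-const machinery and emitted as a constant in the
generated header (<link-defines.h> here).  A hedged sketch of the kind of
output this produces (the numeric values are illustrative only; the real
ones are computed from the La_i86_* layouts in <bits/link.h>):

    /* Hypothetical excerpt of a generated link-defines.h.  */
    #define LONG_DOUBLE_SIZE 12  /* sizeof (long double) on i386 */
    #define LRV_EAX_OFFSET 0     /* offsetof (struct La_i86_retval, lrv_eax) */
    #define LRV_EDX_OFFSET 4     /* offsetof (struct La_i86_retval, lrv_edx) */
    #define LRV_ST0_OFFSET 8     /* offsetof (struct La_i86_retval, lrv_st0) */

dl-trampoline.S can then use these names instead of magic offsets, so the
assembly tracks any change to the structures automatically.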
commit 14c5cbabc2d11004ab223ae5eae761ddf83ef99e
Author: Igor Zamyatin <igor.zamyatin@intel.com>
Date:   Thu Jul 9 06:50:12 2015 -0700

    Preserve bound registers for pointer pass/return

    We need to save/restore bound registers and add a BND prefix before
    branches in _dl_runtime_profile so that bound registers for pointer
    pass and return are preserved when LD_AUDIT is used.

commit f3dcae82d54e5097e18e1d6ef4ff55c2ea4e621e
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Aug 25 04:33:54 2015 -0700

    Save and restore vector registers in x86-64 ld.so

    This patch adds SSE, AVX and AVX512 versions of _dl_runtime_resolve
    and _dl_runtime_profile, which save and restore the first 8 vector
    registers used for parameter passing.  elf_machine_runtime_setup
    selects the proper _dl_runtime_resolve or _dl_runtime_profile based
    on _dl_x86_cpu_features.  It avoids a race condition caused by the
    FOREIGN_CALL macros, which are only used for x86-64.

    The performance impact of saving and restoring 8 vector registers is
    negligible on Nehalem, Sandy Bridge, Ivy Bridge and Haswell when
    ld.so is optimized with SSE2.

commit fb0f7a6755c1bfaec38f490fbfcaa39a66ee3604
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Sep 6 08:50:55 2016 -0700

    X86-64: Add _dl_runtime_resolve_avx[512]_{opt|slow} [BZ #20508]

    There is a transition penalty when SSE instructions are mixed with 256-bit
    AVX or 512-bit AVX512 load instructions.  Since _dl_runtime_resolve_avx
    and _dl_runtime_profile_avx512 save/restore 256-bit YMM/512-bit ZMM
    registers, there is a transition penalty when SSE instructions are used
    with lazy binding on AVX and AVX512 processors.

    To avoid the SSE transition penalty, if only the lower 128 bits of the
    first 8 vector registers are non-zero, we can preserve %xmm0 - %xmm7
    registers with the zero upper bits.

    For AVX and AVX512 processors which support XGETBV with ECX == 1, we can
    use XGETBV with ECX == 1 to check whether the upper 128 bits of YMM
    registers or the upper 256 bits of ZMM registers are zero.  We can then
    restore only the non-zero portion of the vector registers with AVX/AVX512
    load instructions, which zero-extend the upper bits of vector registers.

    This patch adds _dl_runtime_resolve_sse_vex, which saves and restores
    XMM registers with 128-bit AVX store/load instructions.  It is used to
    preserve YMM/ZMM registers when only the lower 128 bits are non-zero.
    _dl_runtime_resolve_avx_opt and _dl_runtime_resolve_avx512_opt are added
    and used on AVX/AVX512 processors supporting XGETBV with ECX == 1 so
    that we store and load only the non-zero portion of vector registers.
    This avoids the SSE transition penalty caused by _dl_runtime_resolve_avx
    and _dl_runtime_profile_avx512 when only the lower 128 bits of vector
    registers are used.

    _dl_runtime_resolve_avx_slow is added and used for AVX processors which
    don't support XGETBV with ECX == 1.  Since there is no SSE transition
    penalty on AVX512 processors which don't support XGETBV with ECX == 1,
    _dl_runtime_resolve_avx512_slow isn't provided.
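The _opt variants, in effect, ask the hardware which vector state is live
before deciding how much to save.  A hedged C sketch of that runtime check
(not glibc code; it assumes XGETBV with ECX == 1 is available and uses the
architectural XINUSE bit layout, where bit 2 covers the YMM state and bit 6
the ZMM0-15 state, matching bit_YMM_state and bit_ZMM0_15_state in
cpu-features.h):

    #include <stdint.h>

    /* XGETBV with %ecx == 1 returns the XINUSE bitmap, which reports
       which XSAVE-managed register states are currently in use.  */
    static inline uint32_t
    xinuse (void)
    {
      uint32_t eax, edx;
      __asm__ ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (1));
      return eax;
    }

    #define YMM_STATE_IN_USE (1u << 2)  /* upper 128 bits of some %ymm */
    #define ZMM_STATE_IN_USE (1u << 6)  /* upper 256 bits of some %zmm0-15 */

When neither bit is set, the upper halves of the first 8 vector registers
are known to be zero, so saving and restoring %xmm0 - %xmm7 with 128-bit
AVX moves is sufficient; that is why _dl_runtime_resolve_avx_opt and
_dl_runtime_resolve_avx512_opt can branch to _dl_runtime_resolve_sse_vex.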
commit 3403a17fea8ccef7dc5f99553a13231acf838744
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu Feb 9 12:19:44 2017 -0800

    x86-64: Verify that _dl_runtime_resolve preserves vector registers

    On x86-64, _dl_runtime_resolve must preserve the first 8 vector
    registers.  Add 3 _dl_runtime_resolve tests to verify that SSE,
    AVX and AVX512 registers are preserved.

commit c15f8eb50cea7ad1a4ccece6e0982bf426d52c00
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Mar 21 10:59:31 2017 -0700

    x86-64: Improve branch predication in _dl_runtime_resolve_avx512_opt [BZ #21258]

    On Skylake server, _dl_runtime_resolve_avx512_opt is used to preserve
    the first 8 vector registers.  The code layout is

      if only %xmm0 - %xmm7 registers are used
         preserve %xmm0 - %xmm7 registers
      if only %ymm0 - %ymm7 registers are used
         preserve %ymm0 - %ymm7 registers
      preserve %zmm0 - %zmm7 registers

    Branch prediction always executes the fallthrough code path, which
    preserves %zmm0 - %zmm7 registers speculatively, even though only
    %xmm0 - %xmm7 registers are used.  This leads to lower CPU frequency
    on Skylake server.  This patch changes the fallthrough code path to
    preserve %xmm0 - %xmm7 registers instead:

      if whole %zmm0 - %zmm7 registers are used
        preserve %zmm0 - %zmm7 registers
      if only %ymm0 - %ymm7 registers are used
         preserve %ymm0 - %ymm7 registers
      preserve %xmm0 - %xmm7 registers

    Tested on Skylake server.

            [BZ #21258]
            * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt):
            Define only if _dl_runtime_resolve is defined to
            _dl_runtime_resolve_sse_vex.
            * sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt):
            Fallthrough to _dl_runtime_resolve_sse_vex.
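The capability check that gates the _opt entry points is CPUID leaf 0xD,
sub-leaf 1: EAX bit 2 advertises XGETBV with ECX == 1, which is what the
init_cpu_features hunk below tests.  A hedged stand-alone sketch of the same
probe (using GCC's <cpuid.h>; the function name is illustrative, not a glibc
interface):

    #include <cpuid.h>

    /* Return nonzero if the CPU supports XGETBV with ECX == 1
       (CPUID.(EAX=0DH,ECX=1):EAX bit 2).  */
    static int
    has_xgetbv_ecx1 (void)
    {
      unsigned int eax, ebx, ecx, edx;
      if (__get_cpuid_max (0, 0) < 0xd)
        return 0;
      __cpuid_count (0xd, 1, eax, ebx, ecx, edx);
      return (eax & (1 << 2)) != 0;
    }

AVX processors that lack this capability use _dl_runtime_resolve_avx_slow;
AVX512 processors without it simply use _dl_runtime_resolve_avx512, since
they incur no SSE transition penalty.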
Index: glibc-2.17-c758a686/nptl/sysdeps/x86_64/tcb-offsets.sym
===================================================================
--- glibc-2.17-c758a686.orig/nptl/sysdeps/x86_64/tcb-offsets.sym
+++ glibc-2.17-c758a686/nptl/sysdeps/x86_64/tcb-offsets.sym
@@ -15,7 +15,6 @@ VGETCPU_CACHE_OFFSET	offsetof (tcbhead_t
 #ifndef __ASSUME_PRIVATE_FUTEX
 PRIVATE_FUTEX		offsetof (tcbhead_t, private_futex)
 #endif
-RTLD_SAVESPACE_SSE	offsetof (tcbhead_t, rtld_savespace_sse)
 
 -- Not strictly offsets, but these values are also used in the TCB.
 TCB_CANCELSTATE_BITMASK	 CANCELSTATE_BITMASK
Index: glibc-2.17-c758a686/nptl/sysdeps/x86_64/tls.h
===================================================================
--- glibc-2.17-c758a686.orig/nptl/sysdeps/x86_64/tls.h
+++ glibc-2.17-c758a686/nptl/sysdeps/x86_64/tls.h
@@ -67,12 +67,13 @@ typedef struct
 # else
   int __unused1;
 # endif
-  int rtld_must_xmm_save;
+  int __glibc_unused1;
   /* Reservation of some values for the TM ABI.  */
   void *__private_tm[5];
   long int __unused2;
-  /* Have space for the post-AVX register size.  */
-  __128bits rtld_savespace_sse[8][4] __attribute__ ((aligned (32)));
+  /* Must be kept even if it is no longer used by glibc since programs,
+     like AddressSanitizer, depend on the size of tcbhead_t.  */
+  __128bits __glibc_unused2[8][4] __attribute__ ((aligned (32)));
 
   void *__padding[8];
 } tcbhead_t;
@@ -380,41 +381,6 @@ typedef struct
 # define THREAD_GSCOPE_WAIT() \
   GL(dl_wait_lookup_done) ()
 
-
-# ifdef SHARED
-/* Defined in dl-trampoline.S.  */
-extern void _dl_x86_64_save_sse (void);
-extern void _dl_x86_64_restore_sse (void);
-
-# define RTLD_CHECK_FOREIGN_CALL \
-  (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) != 0)
-
-/* NB: Don't use the xchg operation because that would imply a lock
-   prefix which is expensive and unnecessary.  The cache line is also
-   not contested at all.  */
-#  define RTLD_ENABLE_FOREIGN_CALL \
-  int old_rtld_must_xmm_save = THREAD_GETMEM (THREAD_SELF,		      \
-					      header.rtld_must_xmm_save);     \
-  THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 1)
-
-#  define RTLD_PREPARE_FOREIGN_CALL \
-  do if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save))	      \
-    {									      \
-      _dl_x86_64_save_sse ();						      \
-      THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0);	      \
-    }									      \
-  while (0)
-
-#  define RTLD_FINALIZE_FOREIGN_CALL \
-  do {									      \
-    if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) == 0)	      \
-      _dl_x86_64_restore_sse ();					      \
-    THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save,		      \
-		   old_rtld_must_xmm_save);				      \
-  } while (0)
-# endif
-
-
 #endif /* __ASSEMBLER__ */
 
 #endif	/* tls.h */
Index: glibc-2.17-c758a686/sysdeps/i386/Makefile
===================================================================
--- glibc-2.17-c758a686.orig/sysdeps/i386/Makefile
+++ glibc-2.17-c758a686/sysdeps/i386/Makefile
@@ -33,6 +33,7 @@ sysdep-CFLAGS += -mpreferred-stack-bound
 else
 ifeq ($(subdir),csu)
 sysdep-CFLAGS += -mpreferred-stack-boundary=4
+gen-as-const-headers += link-defines.sym
 else
 # Likewise, any function which calls user callbacks
 uses-callbacks += -mpreferred-stack-boundary=4
Index: glibc-2.17-c758a686/sysdeps/i386/configure
===================================================================
--- glibc-2.17-c758a686.orig/sysdeps/i386/configure
+++ glibc-2.17-c758a686/sysdeps/i386/configure
@@ -179,5 +179,32 @@ fi
 { $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_novzeroupper" >&5
 $as_echo "$libc_cv_cc_novzeroupper" >&6; }
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Intel MPX support" >&5
+$as_echo_n "checking for Intel MPX support... " >&6; }
+if ${libc_cv_asm_mpx+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat > conftest.s <<\EOF
+        bndmov %bnd0,(%esp)
+EOF
+if { ac_try='${CC-cc} -c $ASFLAGS conftest.s 1>&5'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then
+  libc_cv_asm_mpx=yes
+else
+  libc_cv_asm_mpx=no
+fi
+rm -f conftest*
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_asm_mpx" >&5
+$as_echo "$libc_cv_asm_mpx" >&6; }
+if test $libc_cv_asm_mpx == yes; then
+  $as_echo "#define HAVE_MPX_SUPPORT 1" >>confdefs.h
+
+fi
+
 $as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h
 
Index: glibc-2.17-c758a686/sysdeps/i386/configure.in
===================================================================
--- glibc-2.17-c758a686.orig/sysdeps/i386/configure.in
+++ glibc-2.17-c758a686/sysdeps/i386/configure.in
@@ -53,6 +53,21 @@ LIBC_TRY_CC_OPTION([-mno-vzeroupper],
 		   [libc_cv_cc_novzeroupper=no])
 ])
 
+dnl Check whether asm supports Intel MPX
+AC_CACHE_CHECK(for Intel MPX support, libc_cv_asm_mpx, [dnl
+cat > conftest.s <<\EOF
+        bndmov %bnd0,(%esp)
+EOF
+if AC_TRY_COMMAND(${CC-cc} -c $ASFLAGS conftest.s 1>&AS_MESSAGE_LOG_FD); then
+  libc_cv_asm_mpx=yes
+else
+  libc_cv_asm_mpx=no
+fi
+rm -f conftest*])
+if test $libc_cv_asm_mpx == yes; then
+  AC_DEFINE(HAVE_MPX_SUPPORT)
+fi
+
 dnl It is always possible to access static and hidden symbols in an
 dnl position independent way.
 AC_DEFINE(PI_STATIC_AND_HIDDEN)
Index: glibc-2.17-c758a686/sysdeps/i386/dl-trampoline.S
===================================================================
--- glibc-2.17-c758a686.orig/sysdeps/i386/dl-trampoline.S
+++ glibc-2.17-c758a686/sysdeps/i386/dl-trampoline.S
@@ -17,6 +17,13 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#include <link-defines.h>
+
+#ifdef HAVE_MPX_SUPPORT
+# define PRESERVE_BND_REGS_PREFIX bnd
+#else
+# define PRESERVE_BND_REGS_PREFIX .byte 0xf2
+#endif
 
 	.text
 	.globl _dl_runtime_resolve
@@ -161,24 +168,47 @@ _dl_runtime_profile:
 	    +4      free
 	   %esp     free
 	*/
-	subl $20, %esp
-	cfi_adjust_cfa_offset (20)
-	movl %eax, (%esp)
-	movl %edx, 4(%esp)
-	fstpt 8(%esp)
-	fstpt 20(%esp)
+#if LONG_DOUBLE_SIZE != 12
+# error "long double size must be 12 bytes"
+#endif
+	# Allocate space for La_i86_retval and subtract 12 free bytes.
+	subl $(LRV_SIZE - 12), %esp
+	cfi_adjust_cfa_offset (LRV_SIZE - 12)
+	movl %eax, LRV_EAX_OFFSET(%esp)
+	movl %edx, LRV_EDX_OFFSET(%esp)
+	fstpt LRV_ST0_OFFSET(%esp)
+	fstpt LRV_ST1_OFFSET(%esp)
+#ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, LRV_BND0_OFFSET(%esp)
+	bndmov %bnd1, LRV_BND1_OFFSET(%esp)
+#else
+	.byte 0x66,0x0f,0x1b,0x44,0x24,LRV_BND0_OFFSET
+	.byte 0x66,0x0f,0x1b,0x4c,0x24,LRV_BND1_OFFSET
+#endif
 	pushl %esp
 	cfi_adjust_cfa_offset (4)
-	leal 36(%esp), %ecx
-	movl 56(%esp), %eax
-	movl 60(%esp), %edx
+	# Address of La_i86_regs area.
+	leal (LRV_SIZE + 4)(%esp), %ecx
+	# PLT2
+	movl (LRV_SIZE + 4 + LR_SIZE)(%esp), %eax
+	# PLT1
+	movl (LRV_SIZE + 4 + LR_SIZE + 4)(%esp), %edx
 	call _dl_call_pltexit
-	movl (%esp), %eax
-	movl 4(%esp), %edx
-	fldt 20(%esp)
-	fldt 8(%esp)
-	addl $60, %esp
-	cfi_adjust_cfa_offset (-60)
+	movl LRV_EAX_OFFSET(%esp), %eax
+	movl LRV_EDX_OFFSET(%esp), %edx
+	fldt LRV_ST1_OFFSET(%esp)
+	fldt LRV_ST0_OFFSET(%esp)
+#ifdef HAVE_MPX_SUPPORT
+	bndmov LRV_BND0_OFFSET(%esp), %bnd0
+	bndmov LRV_BND1_OFFSET(%esp), %bnd1
+#else
+	.byte 0x66,0x0f,0x1a,0x44,0x24,LRV_BND0_OFFSET
+	.byte 0x66,0x0f,0x1a,0x4c,0x24,LRV_BND1_OFFSET
+#endif
+	# Restore stack before return.
+	addl $(LRV_SIZE + 4 + LR_SIZE + 4), %esp
+	cfi_adjust_cfa_offset (-(LRV_SIZE + 4 + LR_SIZE + 4))
+	PRESERVE_BND_REGS_PREFIX
 	ret
 	cfi_endproc
 	.size _dl_runtime_profile, .-_dl_runtime_profile
Index: glibc-2.17-c758a686/sysdeps/i386/link-defines.sym
===================================================================
--- /dev/null
+++ glibc-2.17-c758a686/sysdeps/i386/link-defines.sym
@@ -0,0 +1,20 @@
+#include "link.h"
+#include <stddef.h>
+
+--
+LONG_DOUBLE_SIZE	sizeof (long double)
+
+LR_SIZE			sizeof (struct La_i86_regs)
+LR_EDX_OFFSET		offsetof (struct La_i86_regs, lr_edx)
+LR_ECX_OFFSET		offsetof (struct La_i86_regs, lr_ecx)
+LR_EAX_OFFSET		offsetof (struct La_i86_regs, lr_eax)
+LR_EBP_OFFSET		offsetof (struct La_i86_regs, lr_ebp)
+LR_ESP_OFFSET		offsetof (struct La_i86_regs, lr_esp)
+
+LRV_SIZE		sizeof (struct La_i86_retval)
+LRV_EAX_OFFSET		offsetof (struct La_i86_retval, lrv_eax)
+LRV_EDX_OFFSET		offsetof (struct La_i86_retval, lrv_edx)
+LRV_ST0_OFFSET		offsetof (struct La_i86_retval, lrv_st0)
+LRV_ST1_OFFSET		offsetof (struct La_i86_retval, lrv_st1)
+LRV_BND0_OFFSET		offsetof (struct La_i86_retval, lrv_bnd0)
+LRV_BND1_OFFSET		offsetof (struct La_i86_retval, lrv_bnd1)
Index: glibc-2.17-c758a686/sysdeps/x86/bits/link.h
===================================================================
--- glibc-2.17-c758a686.orig/sysdeps/x86/bits/link.h
+++ glibc-2.17-c758a686/sysdeps/x86/bits/link.h
@@ -38,6 +38,8 @@ typedef struct La_i86_retval
   uint32_t lrv_edx;
   long double lrv_st0;
   long double lrv_st1;
+  uint64_t lrv_bnd0;
+  uint64_t lrv_bnd1;
 } La_i86_retval;
 
 
Index: glibc-2.17-c758a686/sysdeps/x86/cpu-features.c
===================================================================
--- glibc-2.17-c758a686.orig/sysdeps/x86/cpu-features.c
+++ glibc-2.17-c758a686/sysdeps/x86/cpu-features.c
@@ -130,6 +130,20 @@ init_cpu_features (struct cpu_features *
 	      break;
 	    }
 	}
+
+      /* To avoid the SSE transition penalty, use _dl_runtime_resolve_slow.
+         If XGETBV supports ECX == 1, use _dl_runtime_resolve_opt.  */
+      cpu_features->feature[index_Use_dl_runtime_resolve_slow]
+	|= bit_Use_dl_runtime_resolve_slow;
+      if (cpu_features->max_cpuid >= 0xd)
+	{
+	  unsigned int eax;
+
+	  __cpuid_count (0xd, 1, eax, ebx, ecx, edx);
+	  if ((eax & (1 << 2)) != 0)
+	    cpu_features->feature[index_Use_dl_runtime_resolve_opt]
+	      |= bit_Use_dl_runtime_resolve_opt;
+	}
     }
   /* This spells out "AuthenticAMD".  */
  else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
Index: glibc-2.17-c758a686/sysdeps/x86/cpu-features.h
===================================================================
--- glibc-2.17-c758a686.orig/sysdeps/x86/cpu-features.h
+++ glibc-2.17-c758a686/sysdeps/x86/cpu-features.h
@@ -34,6 +34,9 @@
 #define bit_AVX512DQ_Usable		(1 << 13)
 #define bit_Prefer_MAP_32BIT_EXEC	(1 << 16)
 #define bit_Prefer_No_VZEROUPPER	(1 << 17)
+#define bit_Use_dl_runtime_resolve_opt	(1 << 20)
+#define bit_Use_dl_runtime_resolve_slow	(1 << 21)
+
 
 /* CPUID Feature flags.  */
 
@@ -95,6 +98,9 @@
 # define index_AVX512DQ_Usable		FEATURE_INDEX_1*FEATURE_SIZE
 # define index_Prefer_MAP_32BIT_EXEC	FEATURE_INDEX_1*FEATURE_SIZE
 # define index_Prefer_No_VZEROUPPER	FEATURE_INDEX_1*FEATURE_SIZE
+# define index_Use_dl_runtime_resolve_opt FEATURE_INDEX_1*FEATURE_SIZE
+# define index_Use_dl_runtime_resolve_slow FEATURE_INDEX_1*FEATURE_SIZE
+
 
 # if defined (_LIBC) && !IS_IN (nonlib)
 #  ifdef __x86_64__
@@ -273,6 +279,8 @@ extern const struct cpu_features *__get_
 # define index_AVX512DQ_Usable		FEATURE_INDEX_1
 # define index_Prefer_MAP_32BIT_EXEC	FEATURE_INDEX_1
 # define index_Prefer_No_VZEROUPPER     FEATURE_INDEX_1
+# define index_Use_dl_runtime_resolve_opt FEATURE_INDEX_1
+# define index_Use_dl_runtime_resolve_slow FEATURE_INDEX_1
 
 #endif	/* !__ASSEMBLER__ */
 
Index: glibc-2.17-c758a686/sysdeps/x86_64/Makefile
===================================================================
--- glibc-2.17-c758a686.orig/sysdeps/x86_64/Makefile
+++ glibc-2.17-c758a686/sysdeps/x86_64/Makefile
@@ -21,6 +21,11 @@ endif
 ifeq ($(subdir),elf)
 sysdep-dl-routines += tlsdesc dl-tlsdesc
 
+tests += ifuncmain8
+modules-names += ifuncmod8
+
+$(objpfx)ifuncmain8: $(objpfx)ifuncmod8.so
+
 tests += tst-quad1 tst-quad2
 modules-names += tst-quadmod1 tst-quadmod2
 
@@ -34,18 +39,32 @@ tests-pie += $(quad-pie-test)
 $(objpfx)tst-quad1pie: $(objpfx)tst-quadmod1pie.o
 $(objpfx)tst-quad2pie: $(objpfx)tst-quadmod2pie.o
 
+tests += tst-sse tst-avx tst-avx512
+test-extras += tst-avx-aux tst-avx512-aux
+extra-test-objs += tst-avx-aux.o tst-avx512-aux.o
+
 tests += tst-audit10
-modules-names += tst-auditmod10a tst-auditmod10b
+modules-names += tst-auditmod10a tst-auditmod10b \
+		 tst-ssemod tst-avxmod tst-avx512mod
 
 $(objpfx)tst-audit10: $(objpfx)tst-auditmod10a.so
 $(objpfx)tst-audit10.out: $(objpfx)tst-auditmod10b.so
 tst-audit10-ENV = LD_AUDIT=$(objpfx)tst-auditmod10b.so
 
+$(objpfx)tst-sse: $(objpfx)tst-ssemod.so
+$(objpfx)tst-avx: $(objpfx)tst-avx-aux.o $(objpfx)tst-avxmod.so
+$(objpfx)tst-avx512: $(objpfx)tst-avx512-aux.o $(objpfx)tst-avx512mod.so
+
+CFLAGS-tst-avx-aux.c += $(AVX-CFLAGS)
+CFLAGS-tst-avxmod.c += $(AVX-CFLAGS)
+
 ifeq (yes,$(config-cflags-avx512))
 AVX512-CFLAGS = -mavx512f
 CFLAGS-tst-audit10.c += $(AVX512-CFLAGS)
 CFLAGS-tst-auditmod10a.c += $(AVX512-CFLAGS)
 CFLAGS-tst-auditmod10b.c += $(AVX512-CFLAGS)
+CFLAGS-tst-avx512-aux.c += $(AVX512-CFLAGS)
+CFLAGS-tst-avx512mod.c += $(AVX512-CFLAGS)
 endif
 endif
 
Index: glibc-2.17-c758a686/sysdeps/x86_64/dl-machine.h
===================================================================
--- glibc-2.17-c758a686.orig/sysdeps/x86_64/dl-machine.h
+++ glibc-2.17-c758a686/sysdeps/x86_64/dl-machine.h
@@ -66,8 +66,15 @@ static inline int __attribute__ ((unused
 elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 {
   Elf64_Addr *got;
-  extern void _dl_runtime_resolve (ElfW(Word)) attribute_hidden;
-  extern void _dl_runtime_profile (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx_slow (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx_opt (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx512_opt (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
 
   if (l->l_info[DT_JMPREL] && lazy)
     {
@@ -95,7 +102,12 @@ elf_machine_runtime_setup (struct link_m
 	 end in this function.  */
       if (__builtin_expect (profile, 0))
 	{
-	  *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile;
+	  if (HAS_ARCH_FEATURE (AVX512F_Usable))
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
+	  else if (HAS_ARCH_FEATURE (AVX_Usable))
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
+	  else
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
 
 	  if (GLRO(dl_profile) != NULL
 	      && _dl_name_match_p (GLRO(dl_profile), l))
@@ -104,9 +116,34 @@ elf_machine_runtime_setup (struct link_m
 	    GL(dl_profile_map) = l;
 	}
       else
-	/* This function will get called to fix up the GOT entry indicated by
-	   the offset on the stack, and then jump to the resolved address.  */
-	*(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve;
+	{
+	  /* This function will get called to fix up the GOT entry
+	     indicated by the offset on the stack, and then jump to
+	     the resolved address.  */
+	  if (HAS_ARCH_FEATURE (AVX512F_Usable))
+	    {
+	      if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx512_opt;
+	      else
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
+	    }
+	  else if (HAS_ARCH_FEATURE (AVX_Usable))
+	    {
+	      if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx_opt;
+	      else if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_slow))
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx_slow;
+	      else
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx;
+	    }
+	  else
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse;
+	}
     }
 
   if (l->l_info[ADDRIDX (DT_TLSDESC_GOT)] && lazy)
Index: glibc-2.17-c758a686/sysdeps/x86_64/dl-trampoline.S
===================================================================
--- glibc-2.17-c758a686.orig/sysdeps/x86_64/dl-trampoline.S
+++ glibc-2.17-c758a686/sysdeps/x86_64/dl-trampoline.S
@@ -18,28 +18,52 @@
 
 #include <config.h>
 #include <sysdep.h>
+#include <cpu-features.h>
 #include <link-defines.h>
 
-#if (RTLD_SAVESPACE_SSE % 32) != 0
-# error RTLD_SAVESPACE_SSE must be aligned to 32 bytes
+#ifndef DL_STACK_ALIGNMENT
+/* Due to GCC bug:
+
+   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
+
+   __tls_get_addr may be called with 8-byte stack alignment.  Although
+   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
+   that the stack will always be aligned at 16 bytes.  We use unaligned
+   16-byte moves to load and store SSE registers, which have no penalty
+   on modern processors if the stack is 16-byte aligned.  */
+# define DL_STACK_ALIGNMENT 8
 #endif
 
+#ifndef DL_RUNTIME_UNALIGNED_VEC_SIZE
+/* The maximum size of unaligned vector load and store.  */
+# define DL_RUNTIME_UNALIGNED_VEC_SIZE 16
+#endif
+
+/* True if _dl_runtime_resolve should align stack to VEC_SIZE bytes.  */
+#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
+  (VEC_SIZE > DL_STACK_ALIGNMENT \
+   && VEC_SIZE > DL_RUNTIME_UNALIGNED_VEC_SIZE)
+
+/* Align vector register save area to 16 bytes.  */
+#define REGISTER_SAVE_VEC_OFF	0
+
 /* Area on stack to save and restore registers used for parameter
    passing when calling _dl_fixup.  */
 #ifdef __ILP32__
-/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX.  */
-# define REGISTER_SAVE_AREA	(8 * 7)
-# define REGISTER_SAVE_RAX	0
+# define REGISTER_SAVE_RAX	(REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
+# define PRESERVE_BND_REGS_PREFIX
 #else
-/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0,
-   BND1, BND2, BND3.  */
-# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 4)
 /* Align bound register save area to 16 bytes.  */
-# define REGISTER_SAVE_BND0	0
+# define REGISTER_SAVE_BND0	(REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
 # define REGISTER_SAVE_BND1	(REGISTER_SAVE_BND0 + 16)
 # define REGISTER_SAVE_BND2	(REGISTER_SAVE_BND1 + 16)
 # define REGISTER_SAVE_BND3	(REGISTER_SAVE_BND2 + 16)
 # define REGISTER_SAVE_RAX	(REGISTER_SAVE_BND3 + 16)
+# ifdef HAVE_MPX_SUPPORT
+#  define PRESERVE_BND_REGS_PREFIX bnd
+# else
+#  define PRESERVE_BND_REGS_PREFIX .byte 0xf2
+# endif
 #endif
 #define REGISTER_SAVE_RCX	(REGISTER_SAVE_RAX + 8)
 #define REGISTER_SAVE_RDX	(REGISTER_SAVE_RCX + 8)
@@ -48,376 +72,71 @@
 #define REGISTER_SAVE_R8	(REGISTER_SAVE_RDI + 8)
 #define REGISTER_SAVE_R9	(REGISTER_SAVE_R8 + 8)
 
-	.text
-	.globl _dl_runtime_resolve
-	.type _dl_runtime_resolve, @function
-	.align 16
-	cfi_startproc
-_dl_runtime_resolve:
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
-	subq $REGISTER_SAVE_AREA,%rsp
-	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
-	# Preserve registers otherwise clobbered.
-	movq %rax, REGISTER_SAVE_RAX(%rsp)
-	movq %rcx, REGISTER_SAVE_RCX(%rsp)
-	movq %rdx, REGISTER_SAVE_RDX(%rsp)
-	movq %rsi, REGISTER_SAVE_RSI(%rsp)
-	movq %rdi, REGISTER_SAVE_RDI(%rsp)
-	movq %r8, REGISTER_SAVE_R8(%rsp)
-	movq %r9, REGISTER_SAVE_R9(%rsp)
-#ifndef __ILP32__
-	# We also have to preserve bound registers.  These are nops if
-	# Intel MPX isn't available or disabled.
-# ifdef HAVE_MPX_SUPPORT
-	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
-	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
-	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
-	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
-# else
-	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
-	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
-	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
-	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
-# endif
-#endif
-	# Copy args pushed by PLT in register.
-	# %rdi: link_map, %rsi: reloc_index
-	movq (REGISTER_SAVE_AREA + 8)(%rsp), %rsi
-	movq REGISTER_SAVE_AREA(%rsp), %rdi
-	call _dl_fixup		# Call resolver.
-	movq %rax, %r11		# Save return value
-#ifndef __ILP32__
-	# Restore bound registers.  These are nops if Intel MPX isn't
-	# avaiable or disabled.
-# ifdef HAVE_MPX_SUPPORT
-	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
-	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
-	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
-	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
-# else
-	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
-	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
-	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
-	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
-# endif
+#define VEC_SIZE		64
+#define VMOVA			vmovdqa64
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
+# define VMOV			vmovdqa64
+#else
+# define VMOV			vmovdqu64
 #endif
-	# Get register content back.
-	movq REGISTER_SAVE_R9(%rsp), %r9
-	movq REGISTER_SAVE_R8(%rsp), %r8
-	movq REGISTER_SAVE_RDI(%rsp), %rdi
-	movq REGISTER_SAVE_RSI(%rsp), %rsi
-	movq REGISTER_SAVE_RDX(%rsp), %rdx
-	movq REGISTER_SAVE_RCX(%rsp), %rcx
-	movq REGISTER_SAVE_RAX(%rsp), %rax
-	# Adjust stack(PLT did 2 pushes)
-	addq $(REGISTER_SAVE_AREA + 16), %rsp
-	cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))
-	jmp *%r11		# Jump to function address.
-	cfi_endproc
-	.size _dl_runtime_resolve, .-_dl_runtime_resolve
-
-
-#ifndef PROF
-	.globl _dl_runtime_profile
-	.type _dl_runtime_profile, @function
-	.align 16
-	cfi_startproc
-
-_dl_runtime_profile:
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
-	/* The La_x86_64_regs data structure pointed to by the
-	   fourth paramater must be 16-byte aligned.  This must
-	   be explicitly enforced.  We have the set up a dynamically
-	   sized stack frame.  %rbx points to the top half which
-	   has a fixed size and preserves the original stack pointer.  */
-
-	subq $32, %rsp		# Allocate the local storage.
-	cfi_adjust_cfa_offset(32)
-	movq %rbx, (%rsp)
-	cfi_rel_offset(%rbx, 0)
-
-	/* On the stack:
-		56(%rbx)	parameter #1
-		48(%rbx)	return address
-
-		40(%rbx)	reloc index
-		32(%rbx)	link_map
-
-		24(%rbx)	La_x86_64_regs pointer
-		16(%rbx)	framesize
-		 8(%rbx)	rax
-		  (%rbx)	rbx
-	*/
-
-	movq %rax, 8(%rsp)
-	movq %rsp, %rbx
-	cfi_def_cfa_register(%rbx)
-
-	/* Actively align the La_x86_64_regs structure.  */
-	andq $0xfffffffffffffff0, %rsp
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	/* sizeof(La_x86_64_regs).  Need extra space for 8 SSE registers
-	   to detect if any xmm0-xmm7 registers are changed by audit
-	   module.  */
-	subq $(LR_SIZE + XMM_SIZE*8), %rsp
-# else
-	subq $LR_SIZE, %rsp		# sizeof(La_x86_64_regs)
-# endif
-	movq %rsp, 24(%rbx)
-
-	/* Fill the La_x86_64_regs structure.  */
-	movq %rdx, LR_RDX_OFFSET(%rsp)
-	movq %r8,  LR_R8_OFFSET(%rsp)
-	movq %r9,  LR_R9_OFFSET(%rsp)
-	movq %rcx, LR_RCX_OFFSET(%rsp)
-	movq %rsi, LR_RSI_OFFSET(%rsp)
-	movq %rdi, LR_RDI_OFFSET(%rsp)
-	movq %rbp, LR_RBP_OFFSET(%rsp)
-
-	leaq 48(%rbx), %rax
-	movq %rax, LR_RSP_OFFSET(%rsp)
-
-	/* We always store the XMM registers even if AVX is available.
-	   This is to provide backward binary compatility for existing
-	   audit modules.  */
-	movaps %xmm0,		   (LR_XMM_OFFSET)(%rsp)
-	movaps %xmm1, (LR_XMM_OFFSET +   XMM_SIZE)(%rsp)
-	movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
-	movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
-	movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
-	movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
-	movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
-	movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
-
-# ifndef __ILP32__
-#  ifdef HAVE_MPX_SUPPORT
-	bndmov %bnd0, 		   (LR_BND_OFFSET)(%rsp)  # Preserve bound
-	bndmov %bnd1, (LR_BND_OFFSET +   BND_SIZE)(%rsp)  # registers. Nops if
-	bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp)  # MPX not available
-	bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp)  # or disabled.
-#  else
-	.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
-	.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
-	.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
-	.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
-#  endif
-# endif
-
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	.data
-L(have_avx):
-	.zero 4
-	.size L(have_avx), 4
-	.previous
-
-	cmpl	$0, L(have_avx)(%rip)
-	jne	L(defined)
-	movq	%rbx, %r11		# Save rbx
-	movl	$1, %eax
-	cpuid
-	movq	%r11,%rbx		# Restore rbx
-	xorl	%eax, %eax
-	// AVX and XSAVE supported?
-	andl	$((1 << 28) | (1 << 27)), %ecx
-	cmpl	$((1 << 28) | (1 << 27)), %ecx
-	jne	10f
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	// AVX512 supported in processor?
-	movq	%rbx, %r11		# Save rbx
-	xorl	%ecx, %ecx
-	mov	$0x7, %eax
-	cpuid
-	andl	$(1 << 16), %ebx
-#  endif
-	xorl	%ecx, %ecx
-	// Get XFEATURE_ENABLED_MASK
-	xgetbv
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	test	%ebx, %ebx
-	movq	%r11, %rbx		# Restore rbx
-	je	20f
-	// Verify that XCR0[7:5] = '111b' and
-	// XCR0[2:1] = '11b' which means
-	// that zmm state is enabled
-	andl	$0xe6, %eax
-	cmpl	$0xe6, %eax
-	jne	20f
-	movl	%eax, L(have_avx)(%rip)
-L(avx512):
-#   define RESTORE_AVX
-#   define VMOV    vmovdqu64
-#   define VEC(i)  zmm##i
-#   define MORE_CODE
-#   include "dl-trampoline.h"
-#   undef VMOV
-#   undef VEC
-#   undef RESTORE_AVX
-#  endif
-20:	andl	$0x6, %eax
-10:	subl	$0x5, %eax
-	movl	%eax, L(have_avx)(%rip)
-	cmpl	$0, %eax
-
-L(defined):
-	js	L(no_avx)
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0xe6, L(have_avx)(%rip)
-	je	L(avx512)
-#  endif
-
-#  define RESTORE_AVX
-#  define VMOV    vmovdqu
-#  define VEC(i)  ymm##i
-#  define MORE_CODE
-#  include "dl-trampoline.h"
-
-	.align 16
-L(no_avx):
-# endif
-
-# undef RESTORE_AVX
-# include "dl-trampoline.h"
-
-	cfi_endproc
-	.size _dl_runtime_profile, .-_dl_runtime_profile
+#define VEC(i)			zmm##i
+#define _dl_runtime_resolve	_dl_runtime_resolve_avx512
+#define _dl_runtime_profile	_dl_runtime_profile_avx512
+#define RESTORE_AVX
+#include "dl-trampoline.h"
+#undef _dl_runtime_resolve
+#undef _dl_runtime_profile
+#undef VEC
+#undef VMOV
+#undef VMOVA
+#undef VEC_SIZE
+
+#define VEC_SIZE		32
+#define VMOVA			vmovdqa
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
+# define VMOV			vmovdqa
+#else
+# define VMOV			vmovdqu
 #endif
-
-
-#ifdef SHARED
-	.globl _dl_x86_64_save_sse
-	.type _dl_x86_64_save_sse, @function
-	.align 16
-	cfi_startproc
-_dl_x86_64_save_sse:
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	jne	L(defined_5)
-	movq	%rbx, %r11		# Save rbx
-	movl	$1, %eax
-	cpuid
-	movq	%r11,%rbx		# Restore rbx
-	xorl	%eax, %eax
-	// AVX and XSAVE supported?
-	andl	$((1 << 28) | (1 << 27)), %ecx
-	cmpl	$((1 << 28) | (1 << 27)), %ecx
-	jne	1f
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	// AVX512 supported in a processor?
-	movq	%rbx, %r11              # Save rbx
-	xorl	%ecx,%ecx
-	mov	$0x7,%eax
-	cpuid
-	andl	$(1 << 16), %ebx
-#  endif
-	xorl	%ecx, %ecx
-	// Get XFEATURE_ENABLED_MASK
-	xgetbv
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	test	%ebx, %ebx
-	movq	%r11, %rbx		# Restore rbx
-	je	2f
-	// Verify that XCR0[7:5] = '111b' and
-	// XCR0[2:1] = '11b' which means
-	// that zmm state is enabled
-	andl	$0xe6, %eax
-	movl	%eax, L(have_avx)(%rip)
-	cmpl	$0xe6, %eax
-	je	L(avx512_5)
-#  endif
-
-2:	andl	$0x6, %eax
-1:	subl	$0x5, %eax
-	movl	%eax, L(have_avx)(%rip)
-	cmpl	$0, %eax
-
-L(defined_5):
-	js	L(no_avx5)
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0xe6, L(have_avx)(%rip)
-	je	L(avx512_5)
-#  endif
-
-	vmovdqa %ymm0, %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE
-	vmovdqa %ymm1, %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE
-	vmovdqa %ymm2, %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE
-	vmovdqa %ymm3, %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE
-	vmovdqa %ymm4, %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE
-	vmovdqa %ymm5, %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE
-	vmovdqa %ymm6, %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE
-	vmovdqa %ymm7, %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE
-	ret
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-L(avx512_5):
-	vmovdqu64 %zmm0, %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE
-	vmovdqu64 %zmm1, %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE
-	vmovdqu64 %zmm2, %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE
-	vmovdqu64 %zmm3, %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE
-	vmovdqu64 %zmm4, %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE
-	vmovdqu64 %zmm5, %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE
-	vmovdqu64 %zmm6, %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE
-	vmovdqu64 %zmm7, %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE
-	ret
-#  endif
-L(no_avx5):
-# endif
-	movdqa	%xmm0, %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE
-	movdqa	%xmm1, %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE
-	movdqa	%xmm2, %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE
-	movdqa	%xmm3, %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE
-	movdqa	%xmm4, %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE
-	movdqa	%xmm5, %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE
-	movdqa	%xmm6, %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE
-	movdqa	%xmm7, %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE
-	ret
-	cfi_endproc
-	.size _dl_x86_64_save_sse, .-_dl_x86_64_save_sse
-
-
-	.globl _dl_x86_64_restore_sse
-	.type _dl_x86_64_restore_sse, @function
-	.align 16
-	cfi_startproc
-_dl_x86_64_restore_sse:
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	js	L(no_avx6)
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0xe6, L(have_avx)(%rip)
-	je	L(avx512_6)
-#  endif
-
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE, %ymm0
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE, %ymm1
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE, %ymm2
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE, %ymm3
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE, %ymm4
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE, %ymm5
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE, %ymm6
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE, %ymm7
-	ret
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-L(avx512_6):
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE, %zmm0
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE, %zmm1
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE, %zmm2
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE, %zmm3
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE, %zmm4
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE, %zmm5
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE, %zmm6
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE, %zmm7
-	ret
-#  endif
-L(no_avx6):
-# endif
-	movdqa	%fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE, %xmm0
-	movdqa	%fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE, %xmm1
-	movdqa	%fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE, %xmm2
-	movdqa	%fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE, %xmm3
-	movdqa	%fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE, %xmm4
-	movdqa	%fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE, %xmm5
-	movdqa	%fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE, %xmm6
-	movdqa	%fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE, %xmm7
-	ret
-	cfi_endproc
-	.size _dl_x86_64_restore_sse, .-_dl_x86_64_restore_sse
+#define VEC(i)			ymm##i
+#define _dl_runtime_resolve	_dl_runtime_resolve_avx
+#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx_opt
+#define _dl_runtime_profile	_dl_runtime_profile_avx
+#include "dl-trampoline.h"
+#undef _dl_runtime_resolve
+#undef _dl_runtime_resolve_opt
+#undef _dl_runtime_profile
+#undef VEC
+#undef VMOV
+#undef VMOVA
+#undef VEC_SIZE
+
+/* movaps/movups is 1-byte shorter.  */
+#define VEC_SIZE		16
+#define VMOVA			movaps
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
+# define VMOV			movaps
+#else
+# define VMOV			movups
+#endif
+#define VEC(i)			xmm##i
+#define _dl_runtime_resolve	_dl_runtime_resolve_sse
+#define _dl_runtime_profile	_dl_runtime_profile_sse
+#undef RESTORE_AVX
+#include "dl-trampoline.h"
+#undef _dl_runtime_resolve
+#undef _dl_runtime_profile
+#undef VMOV
+#undef VMOVA
+
+/* Used by _dl_runtime_resolve_avx_opt/_dl_runtime_resolve_avx512_opt
+   to preserve the full vector registers with zero upper bits.  */
+#define VMOVA			vmovdqa
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
+# define VMOV			vmovdqa
+#else
+# define VMOV			vmovdqu
 #endif
+#define _dl_runtime_resolve	_dl_runtime_resolve_sse_vex
+#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx512_opt
+#include "dl-trampoline.h"
Index: glibc-2.17-c758a686/sysdeps/x86_64/dl-trampoline.h
===================================================================
--- glibc-2.17-c758a686.orig/sysdeps/x86_64/dl-trampoline.h
+++ glibc-2.17-c758a686/sysdeps/x86_64/dl-trampoline.h
@@ -1,6 +1,5 @@
-/* Partial PLT profile trampoline to save and restore x86-64 vector
-   registers.
-   Copyright (C) 2009, 2011 Free Software Foundation, Inc.
+/* PLT trampolines.  x86-64 version.
+   Copyright (C) 2009-2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -17,16 +16,355 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifdef RESTORE_AVX
+#undef REGISTER_SAVE_AREA_RAW
+#ifdef __ILP32__
+/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to
+   VEC7.  */
+# define REGISTER_SAVE_AREA_RAW	(8 * 7 + VEC_SIZE * 8)
+#else
+/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
+   BND0, BND1, BND2, BND3 and VEC0 to VEC7. */
+# define REGISTER_SAVE_AREA_RAW	(8 * 7 + 16 * 4 + VEC_SIZE * 8)
+#endif
+
+#undef REGISTER_SAVE_AREA
+#undef LOCAL_STORAGE_AREA
+#undef BASE
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+# define REGISTER_SAVE_AREA	(REGISTER_SAVE_AREA_RAW + 8)
+/* Local stack area before jumping to function address: RBX.  */
+# define LOCAL_STORAGE_AREA	8
+# define BASE			rbx
+# if (REGISTER_SAVE_AREA % VEC_SIZE) != 0
+#  error REGISTER_SAVE_AREA must be a multiple of VEC_SIZE
+# endif
+#else
+# define REGISTER_SAVE_AREA	REGISTER_SAVE_AREA_RAW
+/* Local stack area before jumping to function address:  All saved
+   registers.  */
+# define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
+# define BASE			rsp
+# if (REGISTER_SAVE_AREA % 16) != 8
+#  error REGISTER_SAVE_AREA must be an odd multiple of 8
+# endif
+#endif
+
8ae002
+	.text
8ae002
+#ifdef _dl_runtime_resolve_opt
8ae002
+/* Use the smallest vector registers to preserve the full YMM/ZMM
8ae002
+   registers to avoid SSE transition penalty.  */
8ae002
+
8ae002
+# if VEC_SIZE == 32
8ae002
+/* Check if the upper 128 bits in %ymm0 - %ymm7 registers are non-zero
8ae002
+   and preserve %xmm0 - %xmm7 registers with the zero upper bits.  Since
8ae002
+   there is no SSE transition penalty on AVX512 processors which don't
8ae002
+   support XGETBV with ECX == 1, _dl_runtime_resolve_avx512_slow isn't
8ae002
+   provided.   */
8ae002
+	.globl _dl_runtime_resolve_avx_slow
8ae002
+	.hidden _dl_runtime_resolve_avx_slow
8ae002
+	.type _dl_runtime_resolve_avx_slow, @function
8ae002
+	.align 16
8ae002
+_dl_runtime_resolve_avx_slow:
8ae002
+	cfi_startproc
8ae002
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
8ae002
+	vorpd %ymm0, %ymm1, %ymm8
8ae002
+	vorpd %ymm2, %ymm3, %ymm9
8ae002
+	vorpd %ymm4, %ymm5, %ymm10
8ae002
+	vorpd %ymm6, %ymm7, %ymm11
8ae002
+	vorpd %ymm8, %ymm9, %ymm9
8ae002
+	vorpd %ymm10, %ymm11, %ymm10
8ae002
+	vpcmpeqd %xmm8, %xmm8, %xmm8
8ae002
+	vorpd %ymm9, %ymm10, %ymm10
8ae002
+	vptest %ymm10, %ymm8
8ae002
+	# Preserve %ymm0 - %ymm7 registers if the upper 128 bits of any
8ae002
+	# %ymm0 - %ymm7 registers aren't zero.
8ae002
+	PRESERVE_BND_REGS_PREFIX
8ae002
+	jnc _dl_runtime_resolve_avx
8ae002
+	# Use vzeroupper to avoid SSE transition penalty.
8ae002
+	vzeroupper
8ae002
+	# Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits
8ae002
+	# when the upper 128 bits of %ymm0 - %ymm7 registers are zero.
8ae002
+	PRESERVE_BND_REGS_PREFIX
8ae002
+	jmp _dl_runtime_resolve_sse_vex
8ae002
+	cfi_adjust_cfa_offset(-16) # Restore PLT adjustment
8ae002
+	cfi_endproc
8ae002
+	.size _dl_runtime_resolve_avx_slow, .-_dl_runtime_resolve_avx_slow
8ae002
+# endif
8ae002
+
8ae002
+/* Use XGETBV with ECX == 1 to check which bits in vector registers are
8ae002
+   non-zero and only preserve the non-zero lower bits with zero upper
8ae002
+   bits.  */
8ae002
+	.globl _dl_runtime_resolve_opt
8ae002
+	.hidden _dl_runtime_resolve_opt
8ae002
+	.type _dl_runtime_resolve_opt, @function
8ae002
+	.align 16
8ae002
+_dl_runtime_resolve_opt:
8ae002
+	cfi_startproc
8ae002
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
8ae002
+	pushq %rax
8ae002
+	cfi_adjust_cfa_offset(8)
8ae002
+	cfi_rel_offset(%rax, 0)
8ae002
+	pushq %rcx
8ae002
+	cfi_adjust_cfa_offset(8)
8ae002
+	cfi_rel_offset(%rcx, 0)
8ae002
+	pushq %rdx
8ae002
+	cfi_adjust_cfa_offset(8)
8ae002
+	cfi_rel_offset(%rdx, 0)
8ae002
+	movl $1, %ecx
8ae002
+	xgetbv
8ae002
+	movl %eax, %r11d
8ae002
+	popq %rdx
8ae002
+	cfi_adjust_cfa_offset(-8)
8ae002
+	cfi_restore (%rdx)
8ae002
+	popq %rcx
8ae002
+	cfi_adjust_cfa_offset(-8)
8ae002
+	cfi_restore (%rcx)
8ae002
+	popq %rax
8ae002
+	cfi_adjust_cfa_offset(-8)
8ae002
+	cfi_restore (%rax)
8ae002
+# if VEC_SIZE == 32
8ae002
+	# For YMM registers, check if YMM state is in use.
8ae002
+	andl $bit_YMM_state, %r11d
8ae002
+	# Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits if
8ae002
+	# YMM state isn't in use.
8ae002
+	PRESERVE_BND_REGS_PREFIX
8ae002
+	jz _dl_runtime_resolve_sse_vex
8ae002
+# elif VEC_SIZE == 16
8ae002
+	# For ZMM registers, check if YMM state and ZMM state are in
8ae002
+	# use.
8ae002
+	andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d
8ae002
+	cmpl $bit_YMM_state, %r11d
8ae002
+	# Preserve %zmm0 - %zmm7 registers if ZMM state is in use.
8ae002
+	PRESERVE_BND_REGS_PREFIX
8ae002
+	jg _dl_runtime_resolve_avx512
8ae002
+	# Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if
8ae002
+	# ZMM state isn't in use.
8ae002
+	PRESERVE_BND_REGS_PREFIX
8ae002
+	je _dl_runtime_resolve_avx
8ae002
+	# Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
8ae002
+	# neither YMM state nor ZMM state are in use.
8ae002
+# else
8ae002
+#  error Unsupported VEC_SIZE!
8ae002
+# endif
8ae002
+	cfi_adjust_cfa_offset(-16) # Restore PLT adjustment
8ae002
+	cfi_endproc
8ae002
+	.size _dl_runtime_resolve_opt, .-_dl_runtime_resolve_opt
8ae002
+#endif
8ae002
+	.globl _dl_runtime_resolve
8ae002
+	.hidden _dl_runtime_resolve
8ae002
+	.type _dl_runtime_resolve, @function
8ae002
+	.align 16
8ae002
+	cfi_startproc
8ae002
+_dl_runtime_resolve:
8ae002
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
8ae002
+#if DL_RUNIME_RESOLVE_REALIGN_STACK
8ae002
+# if LOCAL_STORAGE_AREA != 8
8ae002
+#  error LOCAL_STORAGE_AREA must be 8
8ae002
+# endif
8ae002
+	pushq %rbx			# push subtracts stack by 8.
8ae002
+	cfi_adjust_cfa_offset(8)
8ae002
+	cfi_rel_offset(%rbx, 0)
8ae002
+	mov %RSP_LP, %RBX_LP
8ae002
+	cfi_def_cfa_register(%rbx)
8ae002
+	and $-VEC_SIZE, %RSP_LP
8ae002
+#endif
8ae002
+	sub $REGISTER_SAVE_AREA, %RSP_LP
8ae002
+	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
8ae002
+	# Preserve registers otherwise clobbered.
8ae002
+	movq %rax, REGISTER_SAVE_RAX(%rsp)
8ae002
+	movq %rcx, REGISTER_SAVE_RCX(%rsp)
8ae002
+	movq %rdx, REGISTER_SAVE_RDX(%rsp)
8ae002
+	movq %rsi, REGISTER_SAVE_RSI(%rsp)
8ae002
+	movq %rdi, REGISTER_SAVE_RDI(%rsp)
8ae002
+	movq %r8, REGISTER_SAVE_R8(%rsp)
8ae002
+	movq %r9, REGISTER_SAVE_R9(%rsp)
8ae002
+	VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp)
8ae002
+	VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp)
8ae002
+	VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp)
8ae002
+	VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp)
8ae002
+	VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp)
8ae002
+	VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp)
8ae002
+	VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp)
8ae002
+	VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp)
8ae002
+#ifndef __ILP32__
8ae002
+	# We also have to preserve bound registers.  These are nops if
8ae002
+	# Intel MPX isn't available or disabled.
8ae002
+# ifdef HAVE_MPX_SUPPORT
8ae002
+	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
8ae002
+	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
8ae002
+	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
8ae002
+	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
8ae002
+# else
8ae002
+#  if REGISTER_SAVE_BND0 == 0
8ae002
+	.byte 0x66,0x0f,0x1b,0x04,0x24
8ae002
+#  else
8ae002
+	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
8ae002
+#  endif
8ae002
+	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
8ae002
+	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
8ae002
+	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
8ae002
+# endif
8ae002
+#endif
8ae002
+	# Copy args pushed by PLT in register.
8ae002
+	# %rdi: link_map, %rsi: reloc_index
8ae002
+	mov (LOCAL_STORAGE_AREA + 8)(%BASE), %RSI_LP
8ae002
+	mov LOCAL_STORAGE_AREA(%BASE), %RDI_LP
8ae002
+	call _dl_fixup		# Call resolver.
8ae002
+	mov %RAX_LP, %R11_LP	# Save return value
8ae002
+#ifndef __ILP32__
8ae002
+	# Restore bound registers.  These are nops if Intel MPX isn't
8ae002
+	# avaiable or disabled.
8ae002
+# ifdef HAVE_MPX_SUPPORT
8ae002
+	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
8ae002
+	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
8ae002
+	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
8ae002
+	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
8ae002
+# else
8ae002
+	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
8ae002
+	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
8ae002
+	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
8ae002
+#  if REGISTER_SAVE_BND0 == 0
8ae002
+	.byte 0x66,0x0f,0x1a,0x04,0x24
8ae002
+#  else
8ae002
+	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
8ae002
+#  endif
8ae002
+# endif
8ae002
+#endif
8ae002
+	# Get register content back.
8ae002
+	movq REGISTER_SAVE_R9(%rsp), %r9
8ae002
+	movq REGISTER_SAVE_R8(%rsp), %r8
8ae002
+	movq REGISTER_SAVE_RDI(%rsp), %rdi
8ae002
+	movq REGISTER_SAVE_RSI(%rsp), %rsi
8ae002
+	movq REGISTER_SAVE_RDX(%rsp), %rdx
8ae002
+	movq REGISTER_SAVE_RCX(%rsp), %rcx
8ae002
+	movq REGISTER_SAVE_RAX(%rsp), %rax
8ae002
+	VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0)
8ae002
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1)
8ae002
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2)
8ae002
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3)
8ae002
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4)
8ae002
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5)
8ae002
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6)
8ae002
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7)
8ae002
+#if DL_RUNIME_RESOLVE_REALIGN_STACK
8ae002
+	mov %RBX_LP, %RSP_LP
8ae002
+	cfi_def_cfa_register(%rsp)
8ae002
+	movq (%rsp), %rbx
8ae002
+	cfi_restore(%rbx)
8ae002
+#endif
8ae002
+	# Adjust stack(PLT did 2 pushes)
8ae002
+	add $(LOCAL_STORAGE_AREA + 16), %RSP_LP
8ae002
+	cfi_adjust_cfa_offset(-(LOCAL_STORAGE_AREA + 16))
8ae002
+	# Preserve bound registers.
8ae002
+	PRESERVE_BND_REGS_PREFIX
8ae002
+	jmp *%r11		# Jump to function address.
8ae002
+	cfi_endproc
8ae002
+	.size _dl_runtime_resolve, .-_dl_runtime_resolve
8ae002
+
8ae002
+
8ae002
+/* To preserve %xmm0 - %xmm7 registers, dl-trampoline.h is included
8ae002
+   twice, for _dl_runtime_resolve_sse and _dl_runtime_resolve_sse_vex.
8ae002
+   But we don't need another _dl_runtime_profile for XMM registers.  */
8ae002
+#if !defined PROF && defined _dl_runtime_profile
8ae002
+# if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
8ae002
+#  error LR_VECTOR_OFFSET must be multples of VEC_SIZE
8ae002
+# endif
+
+	.globl _dl_runtime_profile
+	.hidden _dl_runtime_profile
+	.type _dl_runtime_profile, @function
+	.align 16
+_dl_runtime_profile:
+	cfi_startproc
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
+	/* The La_x86_64_regs data structure pointed to by the
+	   fourth parameter must be VEC_SIZE-byte aligned.  This must
+	   be explicitly enforced.  We have to set up a dynamically
+	   sized stack frame.  %rbx points to the top half which
+	   has a fixed size and preserves the original stack pointer.  */
+
+	sub $32, %RSP_LP	# Allocate the local storage.
+	cfi_adjust_cfa_offset(32)
+	movq %rbx, (%rsp)
+	cfi_rel_offset(%rbx, 0)
+
+	/* On the stack:
+		56(%rbx)	parameter #1
+		48(%rbx)	return address
+
+		40(%rbx)	reloc index
+		32(%rbx)	link_map
+
+		24(%rbx)	La_x86_64_regs pointer
+		16(%rbx)	framesize
+		 8(%rbx)	rax
+		  (%rbx)	rbx
+	*/
+
+	movq %rax, 8(%rsp)
+	mov %RSP_LP, %RBX_LP
+	cfi_def_cfa_register(%rbx)
+
+	/* Actively align the La_x86_64_regs structure.  */
+	and $-VEC_SIZE, %RSP_LP
+# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
+	/* sizeof(La_x86_64_regs).  Need extra space for 8 SSE registers
+	   to detect if any xmm0-xmm7 registers are changed by audit
+	   module.  */
+	sub $(LR_SIZE + XMM_SIZE*8), %RSP_LP
+# else
+	sub $LR_SIZE, %RSP_LP		# sizeof(La_x86_64_regs)
+# endif
+	movq %rsp, 24(%rbx)
+
+	/* Fill the La_x86_64_regs structure.  */
+	movq %rdx, LR_RDX_OFFSET(%rsp)
+	movq %r8,  LR_R8_OFFSET(%rsp)
+	movq %r9,  LR_R9_OFFSET(%rsp)
+	movq %rcx, LR_RCX_OFFSET(%rsp)
+	movq %rsi, LR_RSI_OFFSET(%rsp)
+	movq %rdi, LR_RDI_OFFSET(%rsp)
+	movq %rbp, LR_RBP_OFFSET(%rsp)
+
+	lea 48(%rbx), %RAX_LP
+	movq %rax, LR_RSP_OFFSET(%rsp)
+
+	/* We always store the XMM registers even if AVX is available.
+	   This is to provide backward binary compatibility for existing
+	   audit modules.  */
+	movaps %xmm0,		   (LR_XMM_OFFSET)(%rsp)
+	movaps %xmm1, (LR_XMM_OFFSET +   XMM_SIZE)(%rsp)
+	movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
+	movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
+	movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
+	movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
+	movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
+	movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
+
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, 		   (LR_BND_OFFSET)(%rsp)  # Preserve bound
+	bndmov %bnd1, (LR_BND_OFFSET +   BND_SIZE)(%rsp)  # registers. Nops if
+	bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp)  # MPX not available
+	bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp)  # or disabled.
+#  else
+	.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
+	.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
+	.byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
+	.byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+#  endif
+# endif
+
+# ifdef RESTORE_AVX
 	/* This is to support AVX audit modules.  */
-	VMOV %VEC(0),		      (LR_VECTOR_OFFSET)(%rsp)
-	VMOV %VEC(1), (LR_VECTOR_OFFSET +   VECTOR_SIZE)(%rsp)
-	VMOV %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
-	VMOV %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
-	VMOV %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
-	VMOV %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
-	VMOV %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
-	VMOV %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+	VMOVA %VEC(0),		      (LR_VECTOR_OFFSET)(%rsp)
+	VMOVA %VEC(1), (LR_VECTOR_OFFSET +   VECTOR_SIZE)(%rsp)
+	VMOVA %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
+	VMOVA %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
+	VMOVA %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
+	VMOVA %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
+	VMOVA %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
+	VMOVA %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)

 	/* Save xmm0-xmm7 registers to detect if any of them are
 	   changed by audit module.  */
@@ -38,7 +376,7 @@
 	vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp)
 	vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp)
 	vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp)
-#endif
+# endif

 	mov %RSP_LP, %RCX_LP	# La_x86_64_regs pointer to %rcx.
 	mov 48(%rbx), %RDX_LP	# Load return address if needed.
@@ -63,21 +401,7 @@
 	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
 	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7

-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
-	bndmov 		    (LR_BND_OFFSET)(%rsp), %bnd0  # Restore bound
-	bndmov (LR_BND_OFFSET +   BND_SIZE)(%rsp), %bnd1  # registers.
-	bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
-	bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
-# else
-	.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
-	.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
-	.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
-	.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
-# endif
-#endif
-
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* Check if any xmm0-xmm7 registers are changed by audit
 	   module.  */
 	vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
@@ -86,7 +410,7 @@
 	je 2f
 	vmovdqa	%xmm0, (LR_VECTOR_OFFSET)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
+2:	VMOVA (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
 	vmovdqa	%xmm0, (LR_XMM_OFFSET)(%rsp)

 1:	vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
@@ -95,7 +419,7 @@
 	je 2f
 	vmovdqa	%xmm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
 	vmovdqa	%xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)

 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
@@ -104,7 +428,7 @@
 	je 2f
 	vmovdqa	%xmm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
 	vmovdqa	%xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)

 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
@@ -113,7 +437,7 @@
 	je 2f
 	vmovdqa	%xmm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
 	vmovdqa	%xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)

 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
@@ -122,7 +446,7 @@
 	je 2f
 	vmovdqa	%xmm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
 	vmovdqa	%xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)

 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
@@ -131,7 +455,7 @@
 	je 2f
 	vmovdqa	%xmm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
 	vmovdqa	%xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)

 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
@@ -140,7 +464,7 @@
 	je 2f
 	vmovdqa	%xmm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
 	vmovdqa	%xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)

 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
@@ -149,13 +473,29 @@
 	je 2f
 	vmovdqa	%xmm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7)
 	vmovdqa	%xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)

 1:
-#endif
+# endif
+
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
+	bndmov              (LR_BND_OFFSET)(%rsp), %bnd0  # Restore bound
+	bndmov (LR_BND_OFFSET +   BND_SIZE)(%rsp), %bnd1  # registers.
+	bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
+	bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
+#  else
+	.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
+	.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
+	.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
+	.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+#  endif
+# endif
+
 	mov  16(%rbx), %R10_LP	# Anything in framesize?
 	test %R10_LP, %R10_LP
+	PRESERVE_BND_REGS_PREFIX
 	jns 3f

 	/* There's nothing in the frame size, so there
@@ -166,14 +506,15 @@
 	movq LR_RSI_OFFSET(%rsp), %rsi
 	movq LR_RDI_OFFSET(%rsp), %rdi

-	movq %rbx, %rsp
+	mov %RBX_LP, %RSP_LP
 	movq (%rsp), %rbx
-	cfi_restore(rbx)
+	cfi_restore(%rbx)
 	cfi_def_cfa_register(%rsp)

-	addq $48, %rsp		# Adjust the stack to the return value
+	add $48, %RSP_LP	# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
+	PRESERVE_BND_REGS_PREFIX
 	jmp *%r11		# Jump to function address.

 3:
@@ -186,13 +527,13 @@
 	   temporary buffer of the size specified by the 'framesize'
 	   returned from _dl_profile_fixup */

-	leaq LR_RSP_OFFSET(%rbx), %rsi	# stack
-	addq $8, %r10
-	andq $0xfffffffffffffff0, %r10
-	movq %r10, %rcx
-	subq %r10, %rsp
-	movq %rsp, %rdi
-	shrq $3, %rcx
+	lea LR_RSP_OFFSET(%rbx), %RSI_LP # stack
+	add $8, %R10_LP
+	and $-16, %R10_LP
+	mov %R10_LP, %RCX_LP
+	sub %R10_LP, %RSP_LP
+	mov %RSP_LP, %RDI_LP
+	shr $3, %RCX_LP
 	rep
 	movsq

@@ -200,23 +541,24 @@
 	movq 32(%rdi), %rsi
 	movq 40(%rdi), %rdi

+	PRESERVE_BND_REGS_PREFIX
 	call *%r11

-	mov 24(%rbx), %rsp	# Drop the copied stack content
+	mov 24(%rbx), %RSP_LP	# Drop the copied stack content

 	/* Now we have to prepare the La_x86_64_retval structure for the
 	   _dl_call_pltexit.  The La_x86_64_regs is being pointed by rsp now,
 	   so we just need to allocate the sizeof(La_x86_64_retval) space on
 	   the stack, since the alignment has already been taken care of. */
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* sizeof(La_x86_64_retval).  Need extra space for 2 SSE
 	   registers to detect if xmm0/xmm1 registers are changed
 	   by audit module.  */
-	subq $(LRV_SIZE + XMM_SIZE*2), %rsp
-#else
-	subq $LRV_SIZE, %rsp	# sizeof(La_x86_64_retval)
-#endif
-	movq %rsp, %rcx		# La_x86_64_retval argument to %rcx.
+	sub $(LRV_SIZE + XMM_SIZE*2), %RSP_LP
+# else
+	sub $LRV_SIZE, %RSP_LP	# sizeof(La_x86_64_retval)
+# endif
+	mov %RSP_LP, %RCX_LP	# La_x86_64_retval argument to %rcx.

 	/* Fill in the La_x86_64_retval structure.  */
 	movq %rax, LRV_RAX_OFFSET(%rcx)
@@ -225,26 +567,26 @@
 	movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
 	movaps %xmm1, LRV_XMM1_OFFSET(%rcx)

-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* This is to support AVX audit modules.  */
-	VMOV %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
-	VMOV %VEC(1), LRV_VECTOR1_OFFSET(%rcx)
+	VMOVA %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
+	VMOVA %VEC(1), LRV_VECTOR1_OFFSET(%rcx)

 	/* Save xmm0/xmm1 registers to detect if they are changed
 	   by audit module.  */
 	vmovdqa %xmm0,		  (LRV_SIZE)(%rcx)
 	vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
-#endif
+# endif

-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov %bnd0, LRV_BND0_OFFSET(%rcx)  # Preserve returned bounds.
 	bndmov %bnd1, LRV_BND1_OFFSET(%rcx)
-# else
+#  else
 	.byte  0x66,0x0f,0x1b,0x81;.long (LRV_BND0_OFFSET)
 	.byte  0x66,0x0f,0x1b,0x89;.long (LRV_BND1_OFFSET)
+#  endif
 # endif
-#endif

 	fstpt LRV_ST0_OFFSET(%rcx)
 	fstpt LRV_ST1_OFFSET(%rcx)
@@ -261,49 +603,47 @@
 	movaps LRV_XMM0_OFFSET(%rsp), %xmm0
 	movaps LRV_XMM1_OFFSET(%rsp), %xmm1

-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* Check if xmm0/xmm1 registers are changed by audit module.  */
 	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
 	vpmovmskb %xmm2, %esi
 	cmpl $0xffff, %esi
 	jne 1f
-	VMOV LRV_VECTOR0_OFFSET(%rsp), %VEC(0)
+	VMOVA LRV_VECTOR0_OFFSET(%rsp), %VEC(0)

 1:	vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
 	vpmovmskb %xmm2, %esi
 	cmpl $0xffff, %esi
 	jne 1f
-	VMOV LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
+	VMOVA LRV_VECTOR1_OFFSET(%rsp), %VEC(1)

 1:
-#endif
+# endif

-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
-	bndmov LRV_BND0_OFFSET(%rcx), %bnd0  # Restore bound registers.
-	bndmov LRV_BND1_OFFSET(%rcx), %bnd1
-# else
-	.byte  0x66,0x0f,0x1a,0x81;.long (LRV_BND0_OFFSET)
-	.byte  0x66,0x0f,0x1a,0x89;.long (LRV_BND1_OFFSET)
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
+	bndmov LRV_BND0_OFFSET(%rsp), %bnd0  # Restore bound registers.
+	bndmov LRV_BND1_OFFSET(%rsp), %bnd1
+#  else
+	.byte  0x66,0x0f,0x1a,0x84,0x24;.long (LRV_BND0_OFFSET)
+	.byte  0x66,0x0f,0x1a,0x8c,0x24;.long (LRV_BND1_OFFSET)
+#  endif
 # endif
-#endif

 	fldt LRV_ST1_OFFSET(%rsp)
 	fldt LRV_ST0_OFFSET(%rsp)

-	movq %rbx, %rsp
+	mov %RBX_LP, %RSP_LP
 	movq (%rsp), %rbx
-	cfi_restore(rbx)
+	cfi_restore(%rbx)
 	cfi_def_cfa_register(%rsp)

-	addq $48, %rsp		# Adjust the stack to the return value
+	add $48, %RSP_LP	# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
+	PRESERVE_BND_REGS_PREFIX
 	retq

-#ifdef MORE_CODE
-	cfi_adjust_cfa_offset(48)
-	cfi_rel_offset(%rbx, 0)
-	cfi_def_cfa_register(%rbx)
-# undef MORE_CODE
+	cfi_endproc
+	.size _dl_runtime_profile, .-_dl_runtime_profile
 #endif
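The vpcmpeqq/vpmovmskb pairs above implement the "did the audit module
change this register?" test.  The same idiom in C intrinsics, as a sketch
(a 32-bit lane compare is used here; for an all-or-nothing equality test
it behaves the same as the 64-bit compare in the asm):

    #include <emmintrin.h>

    /* Return nonzero if SAVED and LIVE hold identical bit patterns:
       after a lane-wise compare, a 0xffff byte mask from movemask
       means every byte compared equal.  */
    static int
    xmm_unchanged (__m128i saved, __m128i live)
    {
      __m128i eq = _mm_cmpeq_epi32 (saved, live);
      return _mm_movemask_epi8 (eq) == 0xffff;
    }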
Index: glibc-2.17-c758a686/sysdeps/x86_64/ifuncmain8.c
===================================================================
--- /dev/null
+++ glibc-2.17-c758a686/sysdeps/x86_64/ifuncmain8.c
@@ -0,0 +1,32 @@
+/* Test IFUNC selector with floating-point parameters.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+
+extern float foo (float);
+
+static int
+do_test (void)
+{
+  if (foo (2) != 3)
+    abort ();
+  return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
Index: glibc-2.17-c758a686/sysdeps/x86_64/ifuncmod8.c
===================================================================
--- /dev/null
+++ glibc-2.17-c758a686/sysdeps/x86_64/ifuncmod8.c
@@ -0,0 +1,36 @@
+/* Test IFUNC selector with floating-point parameters.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <emmintrin.h>
+
+void * foo_ifunc (void) __asm__ ("foo");
+__asm__(".type foo, %gnu_indirect_function");
+
+static float
+foo_impl (float x)
+{
+  return x + 1;
+}
+
+void *
+foo_ifunc (void)
+{
+  __m128i xmm = _mm_set1_epi32 (-1);
+  asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" );
+  return foo_impl;
+}
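ifuncmain8 passes a float argument, so IFUNC resolution runs while an
argument is live in %xmm0, and the selector above deliberately clobbers
%xmm0 before returning foo_impl; the test therefore fails unless ld.so
preserves vector registers around the resolver.  The explicit .type asm
directive builds the STT_GNU_IFUNC symbol by hand; with a compiler that
supports it, the attribute form below is equivalent (a sketch only;
foo2/foo2_resolver are hypothetical names, not part of the patch):

    static float
    foo2_impl (float x)
    {
      return x + 1;
    }

    static void *
    foo2_resolver (void)
    {
      /* Hypothetical selector; produces the same kind of
	 STT_GNU_IFUNC symbol the asm directives create by hand.  */
      return foo2_impl;
    }

    float foo2 (float) __attribute__ ((ifunc ("foo2_resolver")));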
Index: glibc-2.17-c758a686/sysdeps/x86_64/tst-avx-aux.c
===================================================================
--- /dev/null
+++ glibc-2.17-c758a686/sysdeps/x86_64/tst-avx-aux.c
@@ -0,0 +1,47 @@
+/* Test case for preserved AVX registers in dynamic linker, -mavx part.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <immintrin.h>
+#include <stdlib.h>
+#include <string.h>
+
+int
+tst_avx_aux (void)
+{
+#ifdef __AVX__
+  extern __m256i avx_test (__m256i, __m256i, __m256i, __m256i,
+			   __m256i, __m256i, __m256i, __m256i);
+
+  __m256i ymm0 = _mm256_set1_epi32 (0);
+  __m256i ymm1 = _mm256_set1_epi32 (1);
+  __m256i ymm2 = _mm256_set1_epi32 (2);
+  __m256i ymm3 = _mm256_set1_epi32 (3);
+  __m256i ymm4 = _mm256_set1_epi32 (4);
+  __m256i ymm5 = _mm256_set1_epi32 (5);
+  __m256i ymm6 = _mm256_set1_epi32 (6);
+  __m256i ymm7 = _mm256_set1_epi32 (7);
+  __m256i ret = avx_test (ymm0, ymm1, ymm2, ymm3,
+			  ymm4, ymm5, ymm6, ymm7);
+  ymm0 =  _mm256_set1_epi32 (0x12349876);
+  if (memcmp (&ymm0, &ret, sizeof (ret)))
+    abort ();
+  return 0;
+#else  /* __AVX__ */
+  return 77;
+#endif  /* __AVX__ */
+}
Index: glibc-2.17-c758a686/sysdeps/x86_64/tst-avx.c
===================================================================
--- /dev/null
+++ glibc-2.17-c758a686/sysdeps/x86_64/tst-avx.c
@@ -0,0 +1,49 @@
+/* Test case for preserved AVX registers in dynamic linker.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <cpuid.h>
+
+int tst_avx_aux (void);
+
+static int
+avx_enabled (void)
+{
+  unsigned int eax, ebx, ecx, edx;
+
+  if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0
+      || (ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE))
+    return 0;
+
+  /* Check the OS has AVX and SSE saving enabled.  */
+  asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0));
+
+  return (eax & 6) == 6;
+}
+
+static int
+do_test (void)
+{
+  /* Run AVX test only if AVX is supported.  */
+  if (avx_enabled ())
+    return tst_avx_aux ();
+  else
+    return 77;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../../test-skeleton.c"
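Each AVX test is split across two translation units; a plausible reading
of the split (the Makefile hunk wiring the compiler flags is not part of
this excerpt):

    /* tst-avx.c is built without -mavx, so the cpuid/xgetbv probe
       itself contains no AVX instructions and can run on any x86-64;
       tst-avx-aux.c holds the AVX-using code (built with -mavx) and
       is only entered once avx_enabled () has returned nonzero.
       A nonzero return of 77 marks the test as skipped, the usual
       automake-style convention the glibc test harness follows.
       The tst-avx512* pair below follows the same pattern.  */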
Index: glibc-2.17-c758a686/sysdeps/x86_64/tst-avx512-aux.c
===================================================================
--- /dev/null
+++ glibc-2.17-c758a686/sysdeps/x86_64/tst-avx512-aux.c
@@ -0,0 +1,48 @@
+/* Test case for preserved AVX512 registers in dynamic linker,
+   -mavx512 part.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <immintrin.h>
+#include <stdlib.h>
+#include <string.h>
+
+int
+tst_avx512_aux (void)
+{
+#ifdef __AVX512F__
+  extern __m512i avx512_test (__m512i, __m512i, __m512i, __m512i,
+			      __m512i, __m512i, __m512i, __m512i);
+
+  __m512i zmm0 = _mm512_set1_epi32 (0);
+  __m512i zmm1 = _mm512_set1_epi32 (1);
+  __m512i zmm2 = _mm512_set1_epi32 (2);
+  __m512i zmm3 = _mm512_set1_epi32 (3);
+  __m512i zmm4 = _mm512_set1_epi32 (4);
+  __m512i zmm5 = _mm512_set1_epi32 (5);
+  __m512i zmm6 = _mm512_set1_epi32 (6);
+  __m512i zmm7 = _mm512_set1_epi32 (7);
+  __m512i ret = avx512_test (zmm0, zmm1, zmm2, zmm3,
+			     zmm4, zmm5, zmm6, zmm7);
+  zmm0 =  _mm512_set1_epi32 (0x12349876);
+  if (memcmp (&zmm0, &ret, sizeof (ret)))
+    abort ();
+  return 0;
+#else  /* __AVX512F__ */
+  return 77;
+#endif  /* __AVX512F__ */
+}
Index: glibc-2.17-c758a686/sysdeps/x86_64/tst-avx512.c
===================================================================
--- /dev/null
+++ glibc-2.17-c758a686/sysdeps/x86_64/tst-avx512.c
@@ -0,0 +1,57 @@
+/* Test case for preserved AVX512 registers in dynamic linker.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <cpuid.h>
+
+int tst_avx512_aux (void);
+
+static int
+avx512_enabled (void)
+{
+#ifdef bit_AVX512F
+  unsigned int eax, ebx, ecx, edx;
+
+  if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0
+      || (ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE))
+    return 0;
+
+  __cpuid_count (7, 0, eax, ebx, ecx, edx);
+  if (!(ebx & bit_AVX512F))
+    return 0;
+
+  asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0));
+
+  /* Verify that ZMM, YMM and XMM states are enabled.  */
+  return (eax & 0xe6) == 0xe6;
+#else
+  return 0;
+#endif
+}
+
+static int
+do_test (void)
+{
+  /* Run AVX512 test only if AVX512 is supported.  */
+  if (avx512_enabled ())
+    return tst_avx512_aux ();
+  else
+    return 77;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../../test-skeleton.c"
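For reference, the XCR0 state-component bits behind the two xgetbv
checks (a summary of the architectural layout, not patch content):

    /* XCR0 bits relevant to these probes:
	 bit 0  x87            bit 1  SSE (XMM)     bit 2  AVX (YMM high)
	 bit 5  opmask (k0-k7) bit 6  ZMM0-15 high  bit 7  ZMM16-31
       (eax & 6) == 6       -> OS saves XMM and YMM state (tst-avx.c)
       (eax & 0xe6) == 0xe6 -> additionally opmask and ZMM state
			       (tst-avx512.c)  */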
Index: glibc-2.17-c758a686/sysdeps/x86_64/tst-avx512mod.c
===================================================================
--- /dev/null
+++ glibc-2.17-c758a686/sysdeps/x86_64/tst-avx512mod.c
@@ -0,0 +1,48 @@
+/* Test case for x86-64 preserved AVX512 registers in dynamic linker.  */
+
+#ifdef __AVX512F__
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+
+__m512i
+avx512_test (__m512i x0, __m512i x1, __m512i x2, __m512i x3,
+	     __m512i x4, __m512i x5, __m512i x6, __m512i x7)
+{
+  __m512i zmm;
+
+  zmm = _mm512_set1_epi32 (0);
+  if (memcmp (&zmm, &x0, sizeof (zmm)))
+    abort ();
+
+  zmm = _mm512_set1_epi32 (1);
+  if (memcmp (&zmm, &x1, sizeof (zmm)))
+    abort ();
+
+  zmm = _mm512_set1_epi32 (2);
+  if (memcmp (&zmm, &x2, sizeof (zmm)))
+    abort ();
+
+  zmm = _mm512_set1_epi32 (3);
+  if (memcmp (&zmm, &x3, sizeof (zmm)))
+    abort ();
+
+  zmm = _mm512_set1_epi32 (4);
+  if (memcmp (&zmm, &x4, sizeof (zmm)))
+    abort ();
+
+  zmm = _mm512_set1_epi32 (5);
+  if (memcmp (&zmm, &x5, sizeof (zmm)))
+    abort ();
+
+  zmm = _mm512_set1_epi32 (6);
+  if (memcmp (&zmm, &x6, sizeof (zmm)))
+    abort ();
+
+  zmm = _mm512_set1_epi32 (7);
+  if (memcmp (&zmm, &x7, sizeof (zmm)))
+    abort ();
+
+  return _mm512_set1_epi32 (0x12349876);
+}
+#endif
Index: glibc-2.17-c758a686/sysdeps/x86_64/tst-ssemod.c
===================================================================
--- /dev/null
+++ glibc-2.17-c758a686/sysdeps/x86_64/tst-ssemod.c
@@ -0,0 +1,46 @@
+/* Test case for x86-64 preserved SSE registers in dynamic linker.  */
+
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+
+__m128i
+sse_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3,
+	  __m128i x4, __m128i x5, __m128i x6, __m128i x7)
+{
+  __m128i xmm;
+
+  xmm = _mm_set1_epi32 (0);
+  if (memcmp (&xmm, &x0, sizeof (xmm)))
+    abort ();
+
+  xmm = _mm_set1_epi32 (1);
+  if (memcmp (&xmm, &x1, sizeof (xmm)))
+    abort ();
+
+  xmm = _mm_set1_epi32 (2);
+  if (memcmp (&xmm, &x2, sizeof (xmm)))
+    abort ();
+
+  xmm = _mm_set1_epi32 (3);
+  if (memcmp (&xmm, &x3, sizeof (xmm)))
+    abort ();
+
+  xmm = _mm_set1_epi32 (4);
+  if (memcmp (&xmm, &x4, sizeof (xmm)))
+    abort ();
+
+  xmm = _mm_set1_epi32 (5);
+  if (memcmp (&xmm, &x5, sizeof (xmm)))
+    abort ();
+
+  xmm = _mm_set1_epi32 (6);
+  if (memcmp (&xmm, &x6, sizeof (xmm)))
+    abort ();
+
+  xmm = _mm_set1_epi32 (7);
+  if (memcmp (&xmm, &x7, sizeof (xmm)))
+    abort ();
+
+  return _mm_set1_epi32 (0x12349876);
+}
Index: glibc-2.17-c758a686/sysdeps/x86_64/tst-sse.c
===================================================================
--- /dev/null
+++ glibc-2.17-c758a686/sysdeps/x86_64/tst-sse.c
@@ -0,0 +1,46 @@
+/* Test case for preserved SSE registers in dynamic linker.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <immintrin.h>
+#include <stdlib.h>
+#include <string.h>
+
+extern __m128i sse_test (__m128i, __m128i, __m128i, __m128i,
+                        __m128i, __m128i, __m128i, __m128i);
+
+static int
+do_test (void)
+{
+  __m128i xmm0 = _mm_set1_epi32 (0);
+  __m128i xmm1 = _mm_set1_epi32 (1);
+  __m128i xmm2 = _mm_set1_epi32 (2);
+  __m128i xmm3 = _mm_set1_epi32 (3);
+  __m128i xmm4 = _mm_set1_epi32 (4);
+  __m128i xmm5 = _mm_set1_epi32 (5);
+  __m128i xmm6 = _mm_set1_epi32 (6);
+  __m128i xmm7 = _mm_set1_epi32 (7);
+  __m128i ret = sse_test (xmm0, xmm1, xmm2, xmm3,
+                         xmm4, xmm5, xmm6, xmm7);
+  xmm0 =  _mm_set1_epi32 (0x12349876);
+  if (memcmp (&xmm0, &ret, sizeof (ret)))
+    abort ();
+  return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../../test-skeleton.c"
Index: glibc-2.17-c758a686/sysdeps/x86_64/tst-avxmod.c
===================================================================
--- /dev/null
+++ glibc-2.17-c758a686/sysdeps/x86_64/tst-avxmod.c
@@ -0,0 +1,49 @@
+
+/* Test case for x86-64 preserved AVX registers in dynamic linker.  */
+
+#ifdef __AVX__
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+
+__m256i
+avx_test (__m256i x0, __m256i x1, __m256i x2, __m256i x3,
+	  __m256i x4, __m256i x5, __m256i x6, __m256i x7)
+{
+  __m256i ymm;
+
+  ymm = _mm256_set1_epi32 (0);
+  if (memcmp (&ymm, &x0, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (1);
+  if (memcmp (&ymm, &x1, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (2);
+  if (memcmp (&ymm, &x2, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (3);
+  if (memcmp (&ymm, &x3, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (4);
+  if (memcmp (&ymm, &x4, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (5);
+  if (memcmp (&ymm, &x5, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (6);
+  if (memcmp (&ymm, &x6, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (7);
+  if (memcmp (&ymm, &x7, sizeof (ymm)))
+    abort ();
+
+  return _mm256_set1_epi32 (0x12349876);
+}
+#endif
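Taken together, each tst-* driver above pairs with a tst-*mod module;
how the pair exercises the new trampolines, as a sketch (the Makefile
hunks that build the modules as shared objects are not shown in this
section):

    /* 1. The driver loads argument registers 0-7 with distinct splat
	  values and calls the module's *_test function.
       2. That first call crosses the PLT into _dl_runtime_resolve,
	  which must spill and reload exactly the registers this patch
	  set teaches it to preserve.
       3. The module aborts if any incoming argument was clobbered and
	  returns the sentinel 0x12349876, which the driver verifies
	  to check the return path as well.  */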