ce426f
Backport from Hongjiu Lu <hongjiu.lu@intel.com> of these upstream
ce426f
commits:
ce426f
ce426f
commit b52b0d793dcb226ecb0ecca1e672ca265973233c
ce426f
Author: H.J. Lu <hjl.tools@gmail.com>
ce426f
Date:   Fri Oct 20 11:00:08 2017 -0700
ce426f
ce426f
    x86-64: Use fxsave/xsave/xsavec in _dl_runtime_resolve [BZ #21265]
ce426f
    
ce426f
    In _dl_runtime_resolve, use fxsave/xsave/xsavec to preserve all vector,
ce426f
    mask and bound registers.  It simplifies _dl_runtime_resolve and supports
ce426f
    different calling conventions.  ld.so code size is reduced by more than
ce426f
    1 KB.  However, using fxsave/xsave/xsavec takes a few more cycles
ce426f
    than saving and restoring vector and bound registers individually.
ce426f
    
ce426f
    Latency for _dl_runtime_resolve to lookup the function, foo, from one
ce426f
    shared library plus libc.so:
ce426f
    
ce426f
                                 Before    After     Change
ce426f
    
ce426f
    Westmere (SSE)/fxsave         345      866       151%
ce426f
    IvyBridge (AVX)/xsave         420      643       53%
ce426f
    Haswell (AVX)/xsave           713      1252      75%
ce426f
    Skylake (AVX+MPX)/xsavec      559      719       28%
ce426f
    Skylake (AVX512+MPX)/xsavec   145      272       87%
ce426f
    Ryzen (AVX)/xsavec            280      553       97%
ce426f
    
ce426f
    This is the worst case, where the portion of time spent saving and
ce426f
    restoring registers is bigger than in the majority of cases.  With smaller
ce426f
    _dl_runtime_resolve code size, overall performance impact is negligible.
ce426f
    
ce426f
    On IvyBridge, differences in build and test time of binutils with lazy
ce426f
    binding GCC and binutils are noise.  On Westmere, differences in
ce426f
    bootstrap and "make check" time of GCC 7 with lazy binding GCC and
ce426f
    binutils are also noise.
ce426f
ce426f
commit 0ac8ee53e8efbfd6e1c37094b4653f5c2dad65b5
ce426f
Author: H.J. Lu <hjl.tools@gmail.com>
ce426f
Date:   Fri Aug 26 08:57:42 2016 -0700
ce426f
ce426f
    X86-64: Correct CFA in _dl_runtime_resolve
ce426f
    
ce426f
    When the stack is re-aligned in _dl_runtime_resolve, there is no need to
ce426f
    adjust CFA when allocating register save area on stack.
ce426f
    
ce426f
            * sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve): Don't
ce426f
            adjust CFA when allocating register save area on re-aligned
ce426f
            stack.
ce426f
ce426f
Storing the full xsave state size in xsave_state_full_size was not needed
ce426f
because RHEL7 does not have the full tunables support that would use this;
ce426f
therefore support for xsave_state_full_size has been removed from the
ce426f
changes in b52b0d793dcb226ecb0ecca1e672ca265973233c
ce426f
51f0aa
Index: glibc-2.17-c758a686/sysdeps/x86/cpu-features-offsets.sym
51f0aa
===================================================================
51f0aa
--- glibc-2.17-c758a686.orig/sysdeps/x86/cpu-features-offsets.sym
51f0aa
+++ glibc-2.17-c758a686/sysdeps/x86/cpu-features-offsets.sym
ce426f
@@ -5,3 +5,5 @@
ce426f
 #define rtld_global_ro_offsetof(mem) offsetof (struct rtld_global_ro, mem)
ce426f
 
ce426f
 RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET rtld_global_ro_offsetof (_dl_x86_cpu_features)
ce426f
+
ce426f
+XSAVE_STATE_SIZE_OFFSET	offsetof (struct cpu_features, xsave_state_size)
51f0aa
Index: glibc-2.17-c758a686/sysdeps/x86/cpu-features.c
51f0aa
===================================================================
51f0aa
--- glibc-2.17-c758a686.orig/sysdeps/x86/cpu-features.c
51f0aa
+++ glibc-2.17-c758a686/sysdeps/x86/cpu-features.c
ce426f
@@ -18,6 +18,7 @@
ce426f
 
ce426f
 #include <cpuid.h>
ce426f
 #include <cpu-features.h>
ce426f
+#include <libc-internal.h>
ce426f
 
ce426f
 static inline void
ce426f
 get_common_indeces (struct cpu_features *cpu_features,
51f0aa
@@ -148,20 +149,6 @@ init_cpu_features (struct cpu_features *
ce426f
 	      break;
ce426f
 	    }
ce426f
 	}
ce426f
-
ce426f
-      /* To avoid SSE transition penalty, use _dl_runtime_resolve_slow.
ce426f
-         If XGETBV suports ECX == 1, use _dl_runtime_resolve_opt.  */
ce426f
-      cpu_features->feature[index_Use_dl_runtime_resolve_slow]
ce426f
-	|= bit_Use_dl_runtime_resolve_slow;
ce426f
-      if (cpu_features->max_cpuid >= 0xd)
ce426f
-	{
ce426f
-	  unsigned int eax;
ce426f
-
ce426f
-	  __cpuid_count (0xd, 1, eax, ebx, ecx, edx);
ce426f
-	  if ((eax & (1 << 2)) != 0)
ce426f
-	    cpu_features->feature[index_Use_dl_runtime_resolve_opt]
ce426f
-	      |= bit_Use_dl_runtime_resolve_opt;
ce426f
-	}
ce426f
     }
ce426f
   /* This spells out "AuthenticAMD".  */
ce426f
   else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
51f0aa
@@ -244,6 +231,71 @@ init_cpu_features (struct cpu_features *
ce426f
 	  if (HAS_CPU_FEATURE (FMA4))
ce426f
 	    cpu_features->feature[index_FMA4_Usable] |= bit_FMA4_Usable;
51f0aa
 	}
51f0aa
+
51f0aa
+      /* For _dl_runtime_resolve, set xsave_state_size to xsave area
51f0aa
+	 size + integer register save size and align it to 64 bytes.  */
51f0aa
+      if (cpu_features->max_cpuid >= 0xd)
51f0aa
+	{
51f0aa
+	  unsigned int eax, ebx, ecx, edx;
ce426f
+
51f0aa
+	  __cpuid_count (0xd, 0, eax, ebx, ecx, edx);
51f0aa
+	  if (ebx != 0)
ce426f
+	    {
51f0aa
+	      cpu_features->xsave_state_size
51f0aa
+	       = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64);
51f0aa
+
51f0aa
+	      __cpuid_count (0xd, 1, eax, ebx, ecx, edx);
ce426f
+
51f0aa
+	      /* Check if XSAVEC is available.  */
51f0aa
+	      if ((eax & (1 << 1)) != 0)
ce426f
+		{
51f0aa
+		  unsigned int xstate_comp_offsets[32];
51f0aa
+		  unsigned int xstate_comp_sizes[32];
51f0aa
+		  unsigned int i;
ce426f
+
51f0aa
+		  xstate_comp_offsets[0] = 0;
51f0aa
+		  xstate_comp_offsets[1] = 160;
51f0aa
+		  xstate_comp_offsets[2] = 576;
51f0aa
+		  xstate_comp_sizes[0] = 160;
51f0aa
+		  xstate_comp_sizes[1] = 256;
ce426f
+
51f0aa
+		  for (i = 2; i < 32; i++)
ce426f
+		    {
51f0aa
+		      if ((STATE_SAVE_MASK & (1 << i)) != 0)
ce426f
+			{
51f0aa
+			  __cpuid_count (0xd, i, eax, ebx, ecx, edx);
51f0aa
+			  xstate_comp_sizes[i] = eax;
51f0aa
+			}
51f0aa
+		      else
51f0aa
+			{
51f0aa
+			  ecx = 0;
51f0aa
+			  xstate_comp_sizes[i] = 0;
ce426f
+			}
ce426f
+
51f0aa
+		      if (i > 2)
ce426f
+			{
51f0aa
+			  xstate_comp_offsets[i]
51f0aa
+			   = (xstate_comp_offsets[i - 1]
51f0aa
+			      + xstate_comp_sizes[i -1]);
51f0aa
+			  if ((ecx & (1 << 1)) != 0)
51f0aa
+			    xstate_comp_offsets[i]
51f0aa
+			     = ALIGN_UP (xstate_comp_offsets[i], 64);
ce426f
+			}
ce426f
+		    }
51f0aa
+
51f0aa
+		  /* Use XSAVEC.  */
51f0aa
+		  unsigned int size
51f0aa
+		   = xstate_comp_offsets[31] + xstate_comp_sizes[31];
51f0aa
+		  if (size)
51f0aa
+		    {
51f0aa
+		      cpu_features->xsave_state_size
51f0aa
+		       = ALIGN_UP (size + STATE_SAVE_OFFSET, 64);
51f0aa
+		      cpu_features->feature[index_XSAVEC_Usable]
51f0aa
+		       |= bit_XSAVEC_Usable;
51f0aa
+		    }
ce426f
+		}
ce426f
+	    }
51f0aa
+	}
ce426f
     }
ce426f
 
51f0aa
   cpu_features->family = family;
51f0aa
Index: glibc-2.17-c758a686/sysdeps/x86/cpu-features.h
51f0aa
===================================================================
51f0aa
--- glibc-2.17-c758a686.orig/sysdeps/x86/cpu-features.h
51f0aa
+++ glibc-2.17-c758a686/sysdeps/x86/cpu-features.h
ce426f
@@ -34,8 +34,7 @@
ce426f
 #define bit_AVX512DQ_Usable		(1 << 13)
ce426f
 #define bit_Prefer_MAP_32BIT_EXEC	(1 << 16)
ce426f
 #define bit_Prefer_No_VZEROUPPER	(1 << 17)
ce426f
-#define bit_Use_dl_runtime_resolve_opt	(1 << 20)
ce426f
-#define bit_Use_dl_runtime_resolve_slow	(1 << 21)
ce426f
+#define bit_XSAVEC_Usable		(1 << 18)
ce426f
 
ce426f
 
ce426f
 /* CPUID Feature flags.  */
ce426f
@@ -70,10 +69,20 @@
ce426f
 /* The current maximum size of the feature integer bit array.  */
ce426f
 #define FEATURE_INDEX_MAX 1
ce426f
 
ce426f
+/* Offset for fxsave/xsave area used by _dl_runtime_resolve.  Also need
ce426f
+   space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX.  It must be
ce426f
+   aligned to 16 bytes for fxsave and 64 bytes for xsave.  */
ce426f
+#define STATE_SAVE_OFFSET (8 * 7 + 8)
ce426f
+
ce426f
+/* Save SSE, AVX, AVX512, mask and bound registers.  */
ce426f
+#define STATE_SAVE_MASK \
ce426f
+  ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
ce426f
+
ce426f
 #ifdef	__ASSEMBLER__
ce426f
 
ce426f
 # include <ifunc-defines.h>
ce426f
 # include <rtld-global-offsets.h>
ce426f
+# include <cpu-features-offsets.h>
ce426f
 
ce426f
 # define index_SSE2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
ce426f
 # define index_SSSE3	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
ce426f
@@ -98,8 +107,6 @@
ce426f
 # define index_AVX512DQ_Usable		FEATURE_INDEX_1*FEATURE_SIZE
ce426f
 # define index_Prefer_MAP_32BIT_EXEC	FEATURE_INDEX_1*FEATURE_SIZE
ce426f
 # define index_Prefer_No_VZEROUPPER	FEATURE_INDEX_1*FEATURE_SIZE
ce426f
-# define index_Use_dl_runtime_resolve_opt FEATURE_INDEX_1*FEATURE_SIZE
ce426f
-# define index_Use_dl_runtime_resolve_slow FEATURE_INDEX_1*FEATURE_SIZE
ce426f
 
ce426f
 
ce426f
 # if defined (_LIBC) && !IS_IN (nonlib)
ce426f
@@ -214,6 +221,12 @@ struct cpu_features
ce426f
   } cpuid[COMMON_CPUID_INDEX_MAX];
ce426f
   unsigned int family;
ce426f
   unsigned int model;
ce426f
+  /* The type must be unsigned long int so that we use
ce426f
+
ce426f
+	sub xsave_state_size_offset(%rip) %RSP_LP
ce426f
+
ce426f
+     in _dl_runtime_resolve.  */
ce426f
+  unsigned long int xsave_state_size;
ce426f
   unsigned int feature[FEATURE_INDEX_MAX];
ce426f
 };
ce426f
 
51f0aa
@@ -279,8 +292,7 @@ extern const struct cpu_features *__get_
ce426f
 # define index_AVX512DQ_Usable		FEATURE_INDEX_1
ce426f
 # define index_Prefer_MAP_32BIT_EXEC	FEATURE_INDEX_1
ce426f
 # define index_Prefer_No_VZEROUPPER     FEATURE_INDEX_1
ce426f
-# define index_Use_dl_runtime_resolve_opt FEATURE_INDEX_1
ce426f
-# define index_Use_dl_runtime_resolve_slow FEATURE_INDEX_1
ce426f
+# define index_XSAVEC_Usable		FEATURE_INDEX_1
ce426f
 
ce426f
 #endif	/* !__ASSEMBLER__ */
ce426f
 
51f0aa
Index: glibc-2.17-c758a686/sysdeps/x86_64/dl-machine.h
51f0aa
===================================================================
51f0aa
--- glibc-2.17-c758a686.orig/sysdeps/x86_64/dl-machine.h
51f0aa
+++ glibc-2.17-c758a686/sysdeps/x86_64/dl-machine.h
51f0aa
@@ -66,12 +66,9 @@ static inline int __attribute__ ((unused
ce426f
 elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
ce426f
 {
ce426f
   Elf64_Addr *got;
ce426f
-  extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden;
ce426f
-  extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden;
ce426f
-  extern void _dl_runtime_resolve_avx_slow (ElfW(Word)) attribute_hidden;
ce426f
-  extern void _dl_runtime_resolve_avx_opt (ElfW(Word)) attribute_hidden;
ce426f
-  extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden;
ce426f
-  extern void _dl_runtime_resolve_avx512_opt (ElfW(Word)) attribute_hidden;
ce426f
+  extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
ce426f
+  extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
ce426f
+  extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
ce426f
   extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
ce426f
   extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
ce426f
   extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
51f0aa
@@ -120,29 +117,14 @@ elf_machine_runtime_setup (struct link_m
ce426f
 	  /* This function will get called to fix up the GOT entry
ce426f
 	     indicated by the offset on the stack, and then jump to
ce426f
 	     the resolved address.  */
ce426f
-	  if (HAS_ARCH_FEATURE (AVX512F_Usable))
ce426f
-	    {
ce426f
-	      if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
ce426f
-		*(ElfW(Addr) *) (got + 2)
ce426f
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx512_opt;
ce426f
-	      else
ce426f
-		*(ElfW(Addr) *) (got + 2)
ce426f
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
ce426f
-	    }
ce426f
-	  else if (HAS_ARCH_FEATURE (AVX_Usable))
ce426f
-	    {
ce426f
-	      if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
ce426f
-		*(ElfW(Addr) *) (got + 2)
ce426f
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx_opt;
ce426f
-	      else if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_slow))
ce426f
-		*(ElfW(Addr) *) (got + 2)
ce426f
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx_slow;
ce426f
-	      else
ce426f
-		*(ElfW(Addr) *) (got + 2)
ce426f
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx;
ce426f
-	    }
ce426f
+	  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
ce426f
+	    *(ElfW(Addr) *) (got + 2)
ce426f
+	      = (HAS_ARCH_FEATURE (XSAVEC_Usable)
ce426f
+		 ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
ce426f
+		 : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
ce426f
 	  else
ce426f
-	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse;
ce426f
+	    *(ElfW(Addr) *) (got + 2)
ce426f
+	      = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
ce426f
 	}
ce426f
     }
ce426f
 
51f0aa
Index: glibc-2.17-c758a686/sysdeps/x86_64/dl-trampoline.S
51f0aa
===================================================================
51f0aa
--- glibc-2.17-c758a686.orig/sysdeps/x86_64/dl-trampoline.S
51f0aa
+++ glibc-2.17-c758a686/sysdeps/x86_64/dl-trampoline.S
ce426f
@@ -34,37 +34,24 @@
ce426f
 # define DL_STACK_ALIGNMENT 8
ce426f
 #endif
ce426f
 
ce426f
-#ifndef DL_RUNIME_UNALIGNED_VEC_SIZE
ce426f
-/* The maximum size of unaligned vector load and store.  */
ce426f
-# define DL_RUNIME_UNALIGNED_VEC_SIZE 16
ce426f
-#endif
ce426f
-
ce426f
-/* True if _dl_runtime_resolve should align stack to VEC_SIZE bytes.  */
ce426f
-#define DL_RUNIME_RESOLVE_REALIGN_STACK \
ce426f
-  (VEC_SIZE > DL_STACK_ALIGNMENT \
ce426f
-   && VEC_SIZE > DL_RUNIME_UNALIGNED_VEC_SIZE)
ce426f
-
ce426f
-/* Align vector register save area to 16 bytes.  */
ce426f
-#define REGISTER_SAVE_VEC_OFF	0
ce426f
+/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
ce426f
+   stack to 16 bytes before calling _dl_fixup.  */
ce426f
+#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
ce426f
+  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
ce426f
+   || 16 > DL_STACK_ALIGNMENT)
ce426f
 
ce426f
 /* Area on stack to save and restore registers used for parameter
ce426f
    passing when calling _dl_fixup.  */
ce426f
 #ifdef __ILP32__
ce426f
-# define REGISTER_SAVE_RAX	(REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
ce426f
 # define PRESERVE_BND_REGS_PREFIX
ce426f
 #else
ce426f
-/* Align bound register save area to 16 bytes.  */
ce426f
-# define REGISTER_SAVE_BND0	(REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
ce426f
-# define REGISTER_SAVE_BND1	(REGISTER_SAVE_BND0 + 16)
ce426f
-# define REGISTER_SAVE_BND2	(REGISTER_SAVE_BND1 + 16)
ce426f
-# define REGISTER_SAVE_BND3	(REGISTER_SAVE_BND2 + 16)
ce426f
-# define REGISTER_SAVE_RAX	(REGISTER_SAVE_BND3 + 16)
ce426f
 # ifdef HAVE_MPX_SUPPORT
ce426f
 #  define PRESERVE_BND_REGS_PREFIX bnd
ce426f
 # else
ce426f
 #  define PRESERVE_BND_REGS_PREFIX .byte 0xf2
ce426f
 # endif
ce426f
 #endif
ce426f
+#define REGISTER_SAVE_RAX	0
ce426f
 #define REGISTER_SAVE_RCX	(REGISTER_SAVE_RAX + 8)
ce426f
 #define REGISTER_SAVE_RDX	(REGISTER_SAVE_RCX + 8)
ce426f
 #define REGISTER_SAVE_RSI	(REGISTER_SAVE_RDX + 8)
ce426f
@@ -72,71 +59,60 @@
ce426f
 #define REGISTER_SAVE_R8	(REGISTER_SAVE_RDI + 8)
ce426f
 #define REGISTER_SAVE_R9	(REGISTER_SAVE_R8 + 8)
ce426f
 
ce426f
+#define RESTORE_AVX
ce426f
+
ce426f
 #define VEC_SIZE		64
ce426f
 #define VMOVA			vmovdqa64
ce426f
-#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
ce426f
-# define VMOV			vmovdqa64
ce426f
-#else
ce426f
-# define VMOV			vmovdqu64
ce426f
-#endif
ce426f
 #define VEC(i)			zmm##i
ce426f
-#define _dl_runtime_resolve	_dl_runtime_resolve_avx512
ce426f
 #define _dl_runtime_profile	_dl_runtime_profile_avx512
ce426f
-#define RESTORE_AVX
ce426f
 #include "dl-trampoline.h"
ce426f
-#undef _dl_runtime_resolve
ce426f
 #undef _dl_runtime_profile
ce426f
 #undef VEC
ce426f
-#undef VMOV
ce426f
 #undef VMOVA
ce426f
 #undef VEC_SIZE
ce426f
 
ce426f
 #define VEC_SIZE		32
ce426f
 #define VMOVA			vmovdqa
ce426f
-#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
ce426f
-# define VMOV			vmovdqa
ce426f
-#else
ce426f
-# define VMOV			vmovdqu
ce426f
-#endif
ce426f
 #define VEC(i)			ymm##i
ce426f
-#define _dl_runtime_resolve	_dl_runtime_resolve_avx
ce426f
-#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx_opt
ce426f
 #define _dl_runtime_profile	_dl_runtime_profile_avx
ce426f
 #include "dl-trampoline.h"
ce426f
-#undef _dl_runtime_resolve
ce426f
-#undef _dl_runtime_resolve_opt
ce426f
 #undef _dl_runtime_profile
ce426f
 #undef VEC
ce426f
-#undef VMOV
ce426f
 #undef VMOVA
ce426f
 #undef VEC_SIZE
ce426f
 
ce426f
 /* movaps/movups is 1-byte shorter.  */
ce426f
 #define VEC_SIZE		16
ce426f
 #define VMOVA			movaps
ce426f
-#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
ce426f
-# define VMOV			movaps
ce426f
-#else
ce426f
-# define VMOV			movups
ce426f
- #endif
ce426f
 #define VEC(i)			xmm##i
ce426f
-#define _dl_runtime_resolve	_dl_runtime_resolve_sse
ce426f
 #define _dl_runtime_profile	_dl_runtime_profile_sse
ce426f
 #undef RESTORE_AVX
ce426f
 #include "dl-trampoline.h"
ce426f
-#undef _dl_runtime_resolve
ce426f
 #undef _dl_runtime_profile
ce426f
-#undef VMOV
ce426f
+#undef VEC
ce426f
 #undef VMOVA
ce426f
+#undef VEC_SIZE
ce426f
 
ce426f
-/* Used by _dl_runtime_resolve_avx_opt/_dl_runtime_resolve_avx512_opt
ce426f
-   to preserve the full vector registers with zero upper bits.  */
ce426f
-#define VMOVA			vmovdqa
ce426f
-#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
ce426f
-# define VMOV			vmovdqa
ce426f
-#else
ce426f
-# define VMOV			vmovdqu
ce426f
-#endif
ce426f
-#define _dl_runtime_resolve	_dl_runtime_resolve_sse_vex
ce426f
-#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx512_opt
ce426f
+#define USE_FXSAVE
ce426f
+#define STATE_SAVE_ALIGNMENT	16
ce426f
+#define _dl_runtime_resolve	_dl_runtime_resolve_fxsave
ce426f
 #include "dl-trampoline.h"
ce426f
+#undef _dl_runtime_resolve
ce426f
+#undef USE_FXSAVE
ce426f
+#undef STATE_SAVE_ALIGNMENT
ce426f
+
ce426f
+#define USE_XSAVE
ce426f
+#define STATE_SAVE_ALIGNMENT	64
ce426f
+#define _dl_runtime_resolve	_dl_runtime_resolve_xsave
ce426f
+#include "dl-trampoline.h"
ce426f
+#undef _dl_runtime_resolve
ce426f
+#undef USE_XSAVE
ce426f
+#undef STATE_SAVE_ALIGNMENT
ce426f
+
ce426f
+#define USE_XSAVEC
ce426f
+#define STATE_SAVE_ALIGNMENT	64
ce426f
+#define _dl_runtime_resolve	_dl_runtime_resolve_xsavec
ce426f
+#include "dl-trampoline.h"
ce426f
+#undef _dl_runtime_resolve
ce426f
+#undef USE_XSAVEC
ce426f
+#undef STATE_SAVE_ALIGNMENT
51f0aa
Index: glibc-2.17-c758a686/sysdeps/x86_64/dl-trampoline.h
51f0aa
===================================================================
51f0aa
--- glibc-2.17-c758a686.orig/sysdeps/x86_64/dl-trampoline.h
51f0aa
+++ glibc-2.17-c758a686/sysdeps/x86_64/dl-trampoline.h
ce426f
@@ -16,140 +16,47 @@
ce426f
    License along with the GNU C Library; if not, see
ce426f
    <http://www.gnu.org/licenses/>.  */
ce426f
 
ce426f
-#undef REGISTER_SAVE_AREA_RAW
ce426f
-#ifdef __ILP32__
ce426f
-/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to
ce426f
-   VEC7.  */
ce426f
-# define REGISTER_SAVE_AREA_RAW	(8 * 7 + VEC_SIZE * 8)
ce426f
-#else
ce426f
-/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
ce426f
-   BND0, BND1, BND2, BND3 and VEC0 to VEC7. */
ce426f
-# define REGISTER_SAVE_AREA_RAW	(8 * 7 + 16 * 4 + VEC_SIZE * 8)
ce426f
-#endif
ce426f
+	.text
ce426f
+#ifdef _dl_runtime_resolve
ce426f
 
ce426f
-#undef REGISTER_SAVE_AREA
ce426f
-#undef LOCAL_STORAGE_AREA
ce426f
-#undef BASE
ce426f
-#if DL_RUNIME_RESOLVE_REALIGN_STACK
ce426f
-# define REGISTER_SAVE_AREA	(REGISTER_SAVE_AREA_RAW + 8)
ce426f
-/* Local stack area before jumping to function address: RBX.  */
ce426f
-# define LOCAL_STORAGE_AREA	8
ce426f
-# define BASE			rbx
ce426f
-# if (REGISTER_SAVE_AREA % VEC_SIZE) != 0
ce426f
-#  error REGISTER_SAVE_AREA must be multples of VEC_SIZE
ce426f
-# endif
ce426f
-#else
ce426f
-# define REGISTER_SAVE_AREA	REGISTER_SAVE_AREA_RAW
ce426f
-/* Local stack area before jumping to function address:  All saved
ce426f
-   registers.  */
ce426f
-# define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
ce426f
-# define BASE			rsp
ce426f
-# if (REGISTER_SAVE_AREA % 16) != 8
ce426f
-#  error REGISTER_SAVE_AREA must be odd multples of 8
ce426f
+# undef REGISTER_SAVE_AREA
ce426f
+# undef LOCAL_STORAGE_AREA
ce426f
+# undef BASE
ce426f
+
ce426f
+# if (STATE_SAVE_ALIGNMENT % 16) != 0
ce426f
+#  error STATE_SAVE_ALIGNMENT must be multples of 16
ce426f
 # endif
ce426f
-#endif
ce426f
 
ce426f
-	.text
ce426f
-#ifdef _dl_runtime_resolve_opt
ce426f
-/* Use the smallest vector registers to preserve the full YMM/ZMM
ce426f
-   registers to avoid SSE transition penalty.  */
ce426f
-
ce426f
-# if VEC_SIZE == 32
ce426f
-/* Check if the upper 128 bits in %ymm0 - %ymm7 registers are non-zero
ce426f
-   and preserve %xmm0 - %xmm7 registers with the zero upper bits.  Since
ce426f
-   there is no SSE transition penalty on AVX512 processors which don't
ce426f
-   support XGETBV with ECX == 1, _dl_runtime_resolve_avx512_slow isn't
ce426f
-   provided.   */
ce426f
-	.globl _dl_runtime_resolve_avx_slow
ce426f
-	.hidden _dl_runtime_resolve_avx_slow
ce426f
-	.type _dl_runtime_resolve_avx_slow, @function
ce426f
-	.align 16
ce426f
-_dl_runtime_resolve_avx_slow:
ce426f
-	cfi_startproc
ce426f
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
ce426f
-	vorpd %ymm0, %ymm1, %ymm8
ce426f
-	vorpd %ymm2, %ymm3, %ymm9
ce426f
-	vorpd %ymm4, %ymm5, %ymm10
ce426f
-	vorpd %ymm6, %ymm7, %ymm11
ce426f
-	vorpd %ymm8, %ymm9, %ymm9
ce426f
-	vorpd %ymm10, %ymm11, %ymm10
ce426f
-	vpcmpeqd %xmm8, %xmm8, %xmm8
ce426f
-	vorpd %ymm9, %ymm10, %ymm10
ce426f
-	vptest %ymm10, %ymm8
ce426f
-	# Preserve %ymm0 - %ymm7 registers if the upper 128 bits of any
ce426f
-	# %ymm0 - %ymm7 registers aren't zero.
ce426f
-	PRESERVE_BND_REGS_PREFIX
ce426f
-	jnc _dl_runtime_resolve_avx
ce426f
-	# Use vzeroupper to avoid SSE transition penalty.
ce426f
-	vzeroupper
ce426f
-	# Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits
ce426f
-	# when the upper 128 bits of %ymm0 - %ymm7 registers are zero.
ce426f
-	PRESERVE_BND_REGS_PREFIX
ce426f
-	jmp _dl_runtime_resolve_sse_vex
ce426f
-	cfi_adjust_cfa_offset(-16) # Restore PLT adjustment
ce426f
-	cfi_endproc
ce426f
-	.size _dl_runtime_resolve_avx_slow, .-_dl_runtime_resolve_avx_slow
ce426f
+# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
ce426f
+#  error STATE_SAVE_OFFSET must be multples of STATE_SAVE_ALIGNMENT
ce426f
 # endif
ce426f
 
ce426f
-/* Use XGETBV with ECX == 1 to check which bits in vector registers are
ce426f
-   non-zero and only preserve the non-zero lower bits with zero upper
ce426f
-   bits.  */
ce426f
-	.globl _dl_runtime_resolve_opt
ce426f
-	.hidden _dl_runtime_resolve_opt
ce426f
-	.type _dl_runtime_resolve_opt, @function
ce426f
-	.align 16
ce426f
-_dl_runtime_resolve_opt:
ce426f
-	cfi_startproc
ce426f
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
ce426f
-	pushq %rax
ce426f
-	cfi_adjust_cfa_offset(8)
ce426f
-	cfi_rel_offset(%rax, 0)
ce426f
-	pushq %rcx
ce426f
-	cfi_adjust_cfa_offset(8)
ce426f
-	cfi_rel_offset(%rcx, 0)
ce426f
-	pushq %rdx
ce426f
-	cfi_adjust_cfa_offset(8)
ce426f
-	cfi_rel_offset(%rdx, 0)
ce426f
-	movl $1, %ecx
ce426f
-	xgetbv
ce426f
-	movl %eax, %r11d
ce426f
-	popq %rdx
ce426f
-	cfi_adjust_cfa_offset(-8)
ce426f
-	cfi_restore (%rdx)
ce426f
-	popq %rcx
ce426f
-	cfi_adjust_cfa_offset(-8)
ce426f
-	cfi_restore (%rcx)
ce426f
-	popq %rax
ce426f
-	cfi_adjust_cfa_offset(-8)
ce426f
-	cfi_restore (%rax)
ce426f
-# if VEC_SIZE == 32
ce426f
-	# For YMM registers, check if YMM state is in use.
ce426f
-	andl $bit_YMM_state, %r11d
ce426f
-	# Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits if
ce426f
-	# YMM state isn't in use.
ce426f
-	PRESERVE_BND_REGS_PREFIX
ce426f
-	jz _dl_runtime_resolve_sse_vex
ce426f
-# elif VEC_SIZE == 16
ce426f
-	# For ZMM registers, check if YMM state and ZMM state are in
ce426f
-	# use.
ce426f
-	andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d
ce426f
-	cmpl $bit_YMM_state, %r11d
ce426f
-	# Preserve %zmm0 - %zmm7 registers if ZMM state is in use.
ce426f
-	PRESERVE_BND_REGS_PREFIX
ce426f
-	jg _dl_runtime_resolve_avx512
ce426f
-	# Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if
ce426f
-	# ZMM state isn't in use.
ce426f
-	PRESERVE_BND_REGS_PREFIX
ce426f
-	je _dl_runtime_resolve_avx
ce426f
-	# Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
ce426f
-	# neither YMM state nor ZMM state are in use.
ce426f
+# if DL_RUNTIME_RESOLVE_REALIGN_STACK
ce426f
+/* Local stack area before jumping to function address: RBX.  */
ce426f
+#  define LOCAL_STORAGE_AREA	8
ce426f
+#  define BASE			rbx
ce426f
+#  ifdef USE_FXSAVE
ce426f
+/* Use fxsave to save XMM registers.  */
ce426f
+#   define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET)
ce426f
+#   if (REGISTER_SAVE_AREA % 16) != 0
ce426f
+#    error REGISTER_SAVE_AREA must be multples of 16
ce426f
+#   endif
ce426f
+#  endif
ce426f
 # else
ce426f
-#  error Unsupported VEC_SIZE!
ce426f
+#  ifndef USE_FXSAVE
ce426f
+#   error USE_FXSAVE must be defined
ce426f
+#  endif
ce426f
+/* Use fxsave to save XMM registers.  */
ce426f
+#  define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET + 8)
ce426f
+/* Local stack area before jumping to function address:  All saved
ce426f
+   registers.  */
ce426f
+#  define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
ce426f
+#  define BASE			rsp
ce426f
+#  if (REGISTER_SAVE_AREA % 16) != 8
ce426f
+#   error REGISTER_SAVE_AREA must be odd multples of 8
ce426f
+#  endif
ce426f
 # endif
ce426f
-	cfi_adjust_cfa_offset(-16) # Restore PLT adjustment
ce426f
-	cfi_endproc
ce426f
-	.size _dl_runtime_resolve_opt, .-_dl_runtime_resolve_opt
ce426f
-#endif
ce426f
+
ce426f
 	.globl _dl_runtime_resolve
ce426f
 	.hidden _dl_runtime_resolve
ce426f
 	.type _dl_runtime_resolve, @function
ce426f
@@ -157,19 +64,30 @@ _dl_runtime_resolve_opt:
ce426f
 	cfi_startproc
ce426f
 _dl_runtime_resolve:
ce426f
 	cfi_adjust_cfa_offset(16) # Incorporate PLT
ce426f
-#if DL_RUNIME_RESOLVE_REALIGN_STACK
ce426f
-# if LOCAL_STORAGE_AREA != 8
ce426f
-#  error LOCAL_STORAGE_AREA must be 8
ce426f
-# endif
ce426f
+# if DL_RUNTIME_RESOLVE_REALIGN_STACK
ce426f
+#  if LOCAL_STORAGE_AREA != 8
ce426f
+#   error LOCAL_STORAGE_AREA must be 8
ce426f
+#  endif
ce426f
 	pushq %rbx			# push subtracts stack by 8.
ce426f
 	cfi_adjust_cfa_offset(8)
ce426f
 	cfi_rel_offset(%rbx, 0)
ce426f
 	mov %RSP_LP, %RBX_LP
ce426f
 	cfi_def_cfa_register(%rbx)
ce426f
-	and $-VEC_SIZE, %RSP_LP
ce426f
-#endif
ce426f
+	and $-STATE_SAVE_ALIGNMENT, %RSP_LP
ce426f
+# endif
ce426f
+# ifdef REGISTER_SAVE_AREA
ce426f
 	sub $REGISTER_SAVE_AREA, %RSP_LP
ce426f
+#  if !DL_RUNTIME_RESOLVE_REALIGN_STACK
ce426f
 	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
ce426f
+#  endif
ce426f
+# else
ce426f
+	# Allocate stack space of the required size to save the state.
ce426f
+#  if IS_IN (rtld)
ce426f
+	sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
ce426f
+#  else
ce426f
+	sub _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
ce426f
+#  endif
ce426f
+# endif
ce426f
 	# Preserve registers otherwise clobbered.
ce426f
 	movq %rax, REGISTER_SAVE_RAX(%rsp)
ce426f
 	movq %rcx, REGISTER_SAVE_RCX(%rsp)
ce426f
@@ -178,59 +96,48 @@ _dl_runtime_resolve:
ce426f
 	movq %rdi, REGISTER_SAVE_RDI(%rsp)
ce426f
 	movq %r8, REGISTER_SAVE_R8(%rsp)
ce426f
 	movq %r9, REGISTER_SAVE_R9(%rsp)
ce426f
-	VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp)
ce426f
-	VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp)
ce426f
-	VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp)
ce426f
-	VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp)
ce426f
-	VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp)
ce426f
-	VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp)
ce426f
-	VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp)
ce426f
-	VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp)
ce426f
-#ifndef __ILP32__
ce426f
-	# We also have to preserve bound registers.  These are nops if
ce426f
-	# Intel MPX isn't available or disabled.
ce426f
-# ifdef HAVE_MPX_SUPPORT
ce426f
-	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
ce426f
-	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
ce426f
-	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
ce426f
-	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
ce426f
+# ifdef USE_FXSAVE
ce426f
+	fxsave STATE_SAVE_OFFSET(%rsp)
ce426f
 # else
ce426f
-#  if REGISTER_SAVE_BND0 == 0
ce426f
-	.byte 0x66,0x0f,0x1b,0x04,0x24
ce426f
+	movl $STATE_SAVE_MASK, %eax
ce426f
+	xorl %edx, %edx
ce426f
+	# Clear the XSAVE Header.
ce426f
+#  ifdef USE_XSAVE
ce426f
+	movq %rdx, (STATE_SAVE_OFFSET + 512)(%rsp)
ce426f
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
ce426f
+#  endif
ce426f
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
ce426f
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
ce426f
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
ce426f
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
ce426f
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
ce426f
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
ce426f
+#  ifdef USE_XSAVE
ce426f
+	xsave STATE_SAVE_OFFSET(%rsp)
ce426f
 #  else
ce426f
-	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
ce426f
+	# Since glibc 2.23 requires only binutils 2.22 or later, xsavec
ce426f
+	# may not be supported.  Use .byte directive instead.
ce426f
+#   if STATE_SAVE_OFFSET != 0x40
ce426f
+#    error STATE_SAVE_OFFSET != 0x40
ce426f
+#   endif
ce426f
+	# xsavec STATE_SAVE_OFFSET(%rsp)
ce426f
+	.byte 0x0f, 0xc7, 0x64, 0x24, 0x40
ce426f
 #  endif
ce426f
-	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
ce426f
-	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
ce426f
-	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
ce426f
 # endif
ce426f
-#endif
ce426f
 	# Copy args pushed by PLT in register.
ce426f
 	# %rdi: link_map, %rsi: reloc_index
ce426f
 	mov (LOCAL_STORAGE_AREA + 8)(%BASE), %RSI_LP
ce426f
 	mov LOCAL_STORAGE_AREA(%BASE), %RDI_LP
ce426f
 	call _dl_fixup		# Call resolver.
ce426f
 	mov %RAX_LP, %R11_LP	# Save return value
ce426f
-#ifndef __ILP32__
ce426f
-	# Restore bound registers.  These are nops if Intel MPX isn't
ce426f
-	# avaiable or disabled.
ce426f
-# ifdef HAVE_MPX_SUPPORT
ce426f
-	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
ce426f
-	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
ce426f
-	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
ce426f
-	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
ce426f
+	# Get register content back.
ce426f
+# ifdef USE_FXSAVE
ce426f
+	fxrstor STATE_SAVE_OFFSET(%rsp)
ce426f
 # else
ce426f
-	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
ce426f
-	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
ce426f
-	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
ce426f
-#  if REGISTER_SAVE_BND0 == 0
ce426f
-	.byte 0x66,0x0f,0x1a,0x04,0x24
ce426f
-#  else
ce426f
-	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
ce426f
-#  endif
ce426f
+	movl $STATE_SAVE_MASK, %eax
ce426f
+	xorl %edx, %edx
ce426f
+	xrstor STATE_SAVE_OFFSET(%rsp)
ce426f
 # endif
ce426f
-#endif
ce426f
-	# Get register content back.
ce426f
 	movq REGISTER_SAVE_R9(%rsp), %r9
ce426f
 	movq REGISTER_SAVE_R8(%rsp), %r8
ce426f
 	movq REGISTER_SAVE_RDI(%rsp), %rdi
ce426f
@@ -238,20 +145,12 @@ _dl_runtime_resolve:
ce426f
 	movq REGISTER_SAVE_RDX(%rsp), %rdx
ce426f
 	movq REGISTER_SAVE_RCX(%rsp), %rcx
ce426f
 	movq REGISTER_SAVE_RAX(%rsp), %rax
ce426f
-	VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0)
ce426f
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1)
ce426f
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2)
ce426f
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3)
ce426f
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4)
ce426f
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5)
ce426f
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6)
ce426f
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7)
ce426f
-#if DL_RUNIME_RESOLVE_REALIGN_STACK
ce426f
+# if DL_RUNTIME_RESOLVE_REALIGN_STACK
ce426f
 	mov %RBX_LP, %RSP_LP
ce426f
 	cfi_def_cfa_register(%rsp)
ce426f
 	movq (%rsp), %rbx
ce426f
 	cfi_restore(%rbx)
ce426f
-#endif
ce426f
+# endif
ce426f
 	# Adjust stack(PLT did 2 pushes)
ce426f
 	add $(LOCAL_STORAGE_AREA + 16), %RSP_LP
ce426f
 	cfi_adjust_cfa_offset(-(LOCAL_STORAGE_AREA + 16))
ce426f
@@ -260,11 +159,9 @@ _dl_runtime_resolve:
ce426f
 	jmp *%r11		# Jump to function address.
ce426f
 	cfi_endproc
ce426f
 	.size _dl_runtime_resolve, .-_dl_runtime_resolve
ce426f
+#endif
ce426f
 
ce426f
 
ce426f
-/* To preserve %xmm0 - %xmm7 registers, dl-trampoline.h is included
ce426f
-   twice, for _dl_runtime_resolve_sse and _dl_runtime_resolve_sse_vex.
ce426f
-   But we don't need another _dl_runtime_profile for XMM registers.  */
ce426f
 #if !defined PROF && defined _dl_runtime_profile
ce426f
 # if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
ce426f
 #  error LR_VECTOR_OFFSET must be multples of VEC_SIZE