olga / rpms / glibc

Forked from rpms/glibc 5 years ago
Clone

Blame SOURCES/glibc-rh1504969.patch

c65238
Backport from Hongjiu Lu <hongjiu.lu@intel.com> of these upstream
c65238
commits:
c65238
c65238
commit b52b0d793dcb226ecb0ecca1e672ca265973233c
c65238
Author: H.J. Lu <hjl.tools@gmail.com>
c65238
Date:   Fri Oct 20 11:00:08 2017 -0700
c65238
c65238
    x86-64: Use fxsave/xsave/xsavec in _dl_runtime_resolve [BZ #21265]
c65238
    
c65238
    In _dl_runtime_resolve, use fxsave/xsave/xsavec to preserve all vector,
c65238
    mask and bound registers.  It simplifies _dl_runtime_resolve and supports
c65238
    different calling conventions.  ld.so code size is reduced by more than
c65238
    1 KB.  However, using fxsave/xsave/xsavec takes a few more cycles
c65238
    than saving and restoring vector and bound registers individually.
c65238
    
c65238
    Latency for _dl_runtime_resolve to look up the function, foo, from one
c65238
    shared library plus libc.so:
c65238
    
c65238
                                 Before    After     Change
c65238
    
c65238
    Westmere (SSE)/fxsave         345      866       151%
c65238
    IvyBridge (AVX)/xsave         420      643       53%
c65238
    Haswell (AVX)/xsave           713      1252      75%
c65238
    Skylake (AVX+MPX)/xsavec      559      719       28%
c65238
    Skylake (AVX512+MPX)/xsavec   145      272       87%
c65238
    Ryzen (AVX)/xsavec            280      553       97%
c65238
    
c65238
    This is the worst case, where the portion of time spent saving and
c65238
    restoring registers is bigger than in the majority of cases.  With smaller
c65238
    _dl_runtime_resolve code size, overall performance impact is negligible.
c65238
    
c65238
    On IvyBridge, differences in build and test time of binutils with lazy
c65238
    binding GCC and binutils are noise.  On Westmere, differences in
c65238
    bootstrap and "make check" time of GCC 7 with lazy binding GCC and
c65238
    binutils are also noise.
c65238
c65238
commit 0ac8ee53e8efbfd6e1c37094b4653f5c2dad65b5
c65238
Author: H.J. Lu <hjl.tools@gmail.com>
c65238
Date:   Fri Aug 26 08:57:42 2016 -0700
c65238
c65238
    X86-64: Correct CFA in _dl_runtime_resolve
c65238
    
c65238
    When the stack is re-aligned in _dl_runtime_resolve, there is no need to
c65238
    adjust CFA when allocating the register save area on the stack.
c65238
    
c65238
            * sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve): Don't
c65238
            adjust CFA when allocating register save area on re-aligned
c65238
            stack.
c65238
c65238
Storing the full xsave state size in xsave_state_full_size was not needed
c65238
because RHEL7 does not have the full tunables support that would use this,
c65238
therefore support for xsave_state_full_size has been removed from the
c65238
changes in b52b0d793dcb226ecb0ecca1e672ca265973233c
c65238
c65238
diff --git a/sysdeps/x86/cpu-features-offsets.sym b/sysdeps/x86/cpu-features-offsets.sym
c65238
index a9d53d195f9eb609..1415005fc22be806 100644
c65238
--- a/sysdeps/x86/cpu-features-offsets.sym
c65238
+++ b/sysdeps/x86/cpu-features-offsets.sym
c65238
@@ -5,3 +5,5 @@
c65238
 #define rtld_global_ro_offsetof(mem) offsetof (struct rtld_global_ro, mem)
c65238
 
c65238
 RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET rtld_global_ro_offsetof (_dl_x86_cpu_features)
c65238
+
c65238
+XSAVE_STATE_SIZE_OFFSET	offsetof (struct cpu_features, xsave_state_size)
c65238
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
c65238
index 17e9835f5716ca12..c9bb4fa6f524ba4e 100644
c65238
--- a/sysdeps/x86/cpu-features.c
c65238
+++ b/sysdeps/x86/cpu-features.c
c65238
@@ -18,6 +18,7 @@
c65238
 
c65238
 #include <cpuid.h>
c65238
 #include <cpu-features.h>
c65238
+#include <libc-internal.h>
c65238
 
c65238
 static inline void
c65238
 get_common_indeces (struct cpu_features *cpu_features,
c65238
@@ -148,20 +149,6 @@ init_cpu_features (struct cpu_features *cpu_features)
c65238
 	      break;
c65238
 	    }
c65238
 	}
c65238
-
c65238
-      /* To avoid SSE transition penalty, use _dl_runtime_resolve_slow.
c65238
-         If XGETBV suports ECX == 1, use _dl_runtime_resolve_opt.  */
c65238
-      cpu_features->feature[index_Use_dl_runtime_resolve_slow]
c65238
-	|= bit_Use_dl_runtime_resolve_slow;
c65238
-      if (cpu_features->max_cpuid >= 0xd)
c65238
-	{
c65238
-	  unsigned int eax;
c65238
-
c65238
-	  __cpuid_count (0xd, 1, eax, ebx, ecx, edx);
c65238
-	  if ((eax & (1 << 2)) != 0)
c65238
-	    cpu_features->feature[index_Use_dl_runtime_resolve_opt]
c65238
-	      |= bit_Use_dl_runtime_resolve_opt;
c65238
-	}
c65238
     }
c65238
   /* This spells out "AuthenticAMD".  */
c65238
   else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
c65238
@@ -243,6 +230,71 @@ init_cpu_features (struct cpu_features *cpu_features)
c65238
 	  /* Determine if FMA4 is usable.  */
c65238
 	  if (HAS_CPU_FEATURE (FMA4))
c65238
 	    cpu_features->feature[index_FMA4_Usable] |= bit_FMA4_Usable;
c65238
+
c65238
+	  /* For _dl_runtime_resolve, set xsave_state_size to xsave area
c65238
+	     size + integer register save size and align it to 64 bytes.  */
c65238
+	  if (cpu_features->max_cpuid >= 0xd)
c65238
+	    {
c65238
+	      unsigned int eax, ebx, ecx, edx;
c65238
+
c65238
+	      __cpuid_count (0xd, 0, eax, ebx, ecx, edx);
c65238
+	      if (ebx != 0)
c65238
+		{
c65238
+		  cpu_features->xsave_state_size
c65238
+		= ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64);
c65238
+
c65238
+		  __cpuid_count (0xd, 1, eax, ebx, ecx, edx);
c65238
+
c65238
+		  /* Check if XSAVEC is available.  */
c65238
+		  if ((eax & (1 << 1)) != 0)
c65238
+		    {
c65238
+		      unsigned int xstate_comp_offsets[32];
c65238
+		      unsigned int xstate_comp_sizes[32];
c65238
+		      unsigned int i;
c65238
+
c65238
+		      xstate_comp_offsets[0] = 0;
c65238
+		      xstate_comp_offsets[1] = 160;
c65238
+		      xstate_comp_offsets[2] = 576;
c65238
+		      xstate_comp_sizes[0] = 160;
c65238
+		      xstate_comp_sizes[1] = 256;
c65238
+
c65238
+		      for (i = 2; i < 32; i++)
c65238
+			{
c65238
+			  if ((STATE_SAVE_MASK & (1 << i)) != 0)
c65238
+			    {
c65238
+			      __cpuid_count (0xd, i, eax, ebx, ecx, edx);
c65238
+			      xstate_comp_sizes[i] = eax;
c65238
+			    }
c65238
+			  else
c65238
+			    {
c65238
+			      ecx = 0;
c65238
+			      xstate_comp_sizes[i] = 0;
c65238
+			    }
c65238
+
c65238
+			  if (i > 2)
c65238
+			    {
c65238
+			      xstate_comp_offsets[i]
c65238
+				= (xstate_comp_offsets[i - 1]
c65238
+				   + xstate_comp_sizes[i -1]);
c65238
+			      if ((ecx & (1 << 1)) != 0)
c65238
+				xstate_comp_offsets[i]
c65238
+			      = ALIGN_UP (xstate_comp_offsets[i], 64);
c65238
+			    }
c65238
+			}
c65238
+
c65238
+		      /* Use XSAVEC.  */
c65238
+		      unsigned int size
c65238
+			= xstate_comp_offsets[31] + xstate_comp_sizes[31];
c65238
+		      if (size)
c65238
+			{
c65238
+			  cpu_features->xsave_state_size
c65238
+			    = ALIGN_UP (size + STATE_SAVE_OFFSET, 64);
c65238
+			  cpu_features->feature[index_XSAVEC_Usable]
c65238
+			    |= bit_XSAVEC_Usable;
c65238
+			}
c65238
+		    }
c65238
+		}
c65238
+	    }
c65238
 	}
c65238
     }
c65238
 
c65238
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
c65238
index c69abb349af8f09c..4e2e6fabb39ab600 100644
c65238
--- a/sysdeps/x86/cpu-features.h
c65238
+++ b/sysdeps/x86/cpu-features.h
c65238
@@ -34,8 +34,7 @@
c65238
 #define bit_AVX512DQ_Usable		(1 << 13)
c65238
 #define bit_Prefer_MAP_32BIT_EXEC	(1 << 16)
c65238
 #define bit_Prefer_No_VZEROUPPER	(1 << 17)
c65238
-#define bit_Use_dl_runtime_resolve_opt	(1 << 20)
c65238
-#define bit_Use_dl_runtime_resolve_slow	(1 << 21)
c65238
+#define bit_XSAVEC_Usable		(1 << 18)
c65238
 
c65238
 
c65238
 /* CPUID Feature flags.  */
c65238
@@ -70,10 +69,20 @@
c65238
 /* The current maximum size of the feature integer bit array.  */
c65238
 #define FEATURE_INDEX_MAX 1
c65238
 
c65238
+/* Offset for fxsave/xsave area used by _dl_runtime_resolve.  Also need
c65238
+   space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX.  It must be
c65238
+   aligned to 16 bytes for fxsave and 64 bytes for xsave.  */
c65238
+#define STATE_SAVE_OFFSET (8 * 7 + 8)
c65238
+
c65238
+/* Save SSE, AVX, AVX512, mask and bound registers.  */
c65238
+#define STATE_SAVE_MASK \
c65238
+  ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
c65238
+
c65238
 #ifdef	__ASSEMBLER__
c65238
 
c65238
 # include <ifunc-defines.h>
c65238
 # include <rtld-global-offsets.h>
c65238
+# include <cpu-features-offsets.h>
c65238
 
c65238
 # define index_SSE2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
c65238
 # define index_SSSE3	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
c65238
@@ -98,8 +107,6 @@
c65238
 # define index_AVX512DQ_Usable		FEATURE_INDEX_1*FEATURE_SIZE
c65238
 # define index_Prefer_MAP_32BIT_EXEC	FEATURE_INDEX_1*FEATURE_SIZE
c65238
 # define index_Prefer_No_VZEROUPPER	FEATURE_INDEX_1*FEATURE_SIZE
c65238
-# define index_Use_dl_runtime_resolve_opt FEATURE_INDEX_1*FEATURE_SIZE
c65238
-# define index_Use_dl_runtime_resolve_slow FEATURE_INDEX_1*FEATURE_SIZE
c65238
 
c65238
 
c65238
 # if defined (_LIBC) && !IS_IN (nonlib)
c65238
@@ -214,6 +221,12 @@ struct cpu_features
c65238
   } cpuid[COMMON_CPUID_INDEX_MAX];
c65238
   unsigned int family;
c65238
   unsigned int model;
c65238
+  /* The type must be unsigned long int so that we use
c65238
+
c65238
+	sub xsave_state_size_offset(%rip) %RSP_LP
c65238
+
c65238
+     in _dl_runtime_resolve.  */
c65238
+  unsigned long int xsave_state_size;
c65238
   unsigned int feature[FEATURE_INDEX_MAX];
c65238
 };
c65238
 
c65238
@@ -279,8 +292,7 @@ extern const struct cpu_features *__get_cpu_features (void)
c65238
 # define index_AVX512DQ_Usable		FEATURE_INDEX_1
c65238
 # define index_Prefer_MAP_32BIT_EXEC	FEATURE_INDEX_1
c65238
 # define index_Prefer_No_VZEROUPPER     FEATURE_INDEX_1
c65238
-# define index_Use_dl_runtime_resolve_opt FEATURE_INDEX_1
c65238
-# define index_Use_dl_runtime_resolve_slow FEATURE_INDEX_1
c65238
+# define index_XSAVEC_Usable		FEATURE_INDEX_1
c65238
 
c65238
 #endif	/* !__ASSEMBLER__ */
c65238
 
c65238
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
c65238
index 2a4cda1aff57db98..da89f2a6174a0d94 100644
c65238
--- a/sysdeps/x86_64/dl-machine.h
c65238
+++ b/sysdeps/x86_64/dl-machine.h
c65238
@@ -66,12 +66,9 @@ static inline int __attribute__ ((unused, always_inline))
c65238
 elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
c65238
 {
c65238
   Elf64_Addr *got;
c65238
-  extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden;
c65238
-  extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden;
c65238
-  extern void _dl_runtime_resolve_avx_slow (ElfW(Word)) attribute_hidden;
c65238
-  extern void _dl_runtime_resolve_avx_opt (ElfW(Word)) attribute_hidden;
c65238
-  extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden;
c65238
-  extern void _dl_runtime_resolve_avx512_opt (ElfW(Word)) attribute_hidden;
c65238
+  extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
c65238
+  extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
c65238
+  extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
c65238
   extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
c65238
   extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
c65238
   extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
c65238
@@ -120,29 +117,14 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
c65238
 	  /* This function will get called to fix up the GOT entry
c65238
 	     indicated by the offset on the stack, and then jump to
c65238
 	     the resolved address.  */
c65238
-	  if (HAS_ARCH_FEATURE (AVX512F_Usable))
c65238
-	    {
c65238
-	      if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
c65238
-		*(ElfW(Addr) *) (got + 2)
c65238
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx512_opt;
c65238
-	      else
c65238
-		*(ElfW(Addr) *) (got + 2)
c65238
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
c65238
-	    }
c65238
-	  else if (HAS_ARCH_FEATURE (AVX_Usable))
c65238
-	    {
c65238
-	      if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
c65238
-		*(ElfW(Addr) *) (got + 2)
c65238
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx_opt;
c65238
-	      else if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_slow))
c65238
-		*(ElfW(Addr) *) (got + 2)
c65238
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx_slow;
c65238
-	      else
c65238
-		*(ElfW(Addr) *) (got + 2)
c65238
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx;
c65238
-	    }
c65238
+	  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
c65238
+	    *(ElfW(Addr) *) (got + 2)
c65238
+	      = (HAS_ARCH_FEATURE (XSAVEC_Usable)
c65238
+		 ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
c65238
+		 : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
c65238
 	  else
c65238
-	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse;
c65238
+	    *(ElfW(Addr) *) (got + 2)
c65238
+	      = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
c65238
 	}
c65238
     }
c65238
 
c65238
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
c65238
index bd2d72edfea406e5..215a314f06ca874c 100644
c65238
--- a/sysdeps/x86_64/dl-trampoline.S
c65238
+++ b/sysdeps/x86_64/dl-trampoline.S
c65238
@@ -34,37 +34,24 @@
c65238
 # define DL_STACK_ALIGNMENT 8
c65238
 #endif
c65238
 
c65238
-#ifndef DL_RUNIME_UNALIGNED_VEC_SIZE
c65238
-/* The maximum size of unaligned vector load and store.  */
c65238
-# define DL_RUNIME_UNALIGNED_VEC_SIZE 16
c65238
-#endif
c65238
-
c65238
-/* True if _dl_runtime_resolve should align stack to VEC_SIZE bytes.  */
c65238
-#define DL_RUNIME_RESOLVE_REALIGN_STACK \
c65238
-  (VEC_SIZE > DL_STACK_ALIGNMENT \
c65238
-   && VEC_SIZE > DL_RUNIME_UNALIGNED_VEC_SIZE)
c65238
-
c65238
-/* Align vector register save area to 16 bytes.  */
c65238
-#define REGISTER_SAVE_VEC_OFF	0
c65238
+/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
c65238
+   stack to 16 bytes before calling _dl_fixup.  */
c65238
+#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
c65238
+  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
c65238
+   || 16 > DL_STACK_ALIGNMENT)
c65238
 
c65238
 /* Area on stack to save and restore registers used for parameter
c65238
    passing when calling _dl_fixup.  */
c65238
 #ifdef __ILP32__
c65238
-# define REGISTER_SAVE_RAX	(REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
c65238
 # define PRESERVE_BND_REGS_PREFIX
c65238
 #else
c65238
-/* Align bound register save area to 16 bytes.  */
c65238
-# define REGISTER_SAVE_BND0	(REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
c65238
-# define REGISTER_SAVE_BND1	(REGISTER_SAVE_BND0 + 16)
c65238
-# define REGISTER_SAVE_BND2	(REGISTER_SAVE_BND1 + 16)
c65238
-# define REGISTER_SAVE_BND3	(REGISTER_SAVE_BND2 + 16)
c65238
-# define REGISTER_SAVE_RAX	(REGISTER_SAVE_BND3 + 16)
c65238
 # ifdef HAVE_MPX_SUPPORT
c65238
 #  define PRESERVE_BND_REGS_PREFIX bnd
c65238
 # else
c65238
 #  define PRESERVE_BND_REGS_PREFIX .byte 0xf2
c65238
 # endif
c65238
 #endif
c65238
+#define REGISTER_SAVE_RAX	0
c65238
 #define REGISTER_SAVE_RCX	(REGISTER_SAVE_RAX + 8)
c65238
 #define REGISTER_SAVE_RDX	(REGISTER_SAVE_RCX + 8)
c65238
 #define REGISTER_SAVE_RSI	(REGISTER_SAVE_RDX + 8)
c65238
@@ -72,71 +59,60 @@
c65238
 #define REGISTER_SAVE_R8	(REGISTER_SAVE_RDI + 8)
c65238
 #define REGISTER_SAVE_R9	(REGISTER_SAVE_R8 + 8)
c65238
 
c65238
+#define RESTORE_AVX
c65238
+
c65238
 #define VEC_SIZE		64
c65238
 #define VMOVA			vmovdqa64
c65238
-#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
c65238
-# define VMOV			vmovdqa64
c65238
-#else
c65238
-# define VMOV			vmovdqu64
c65238
-#endif
c65238
 #define VEC(i)			zmm##i
c65238
-#define _dl_runtime_resolve	_dl_runtime_resolve_avx512
c65238
 #define _dl_runtime_profile	_dl_runtime_profile_avx512
c65238
-#define RESTORE_AVX
c65238
 #include "dl-trampoline.h"
c65238
-#undef _dl_runtime_resolve
c65238
 #undef _dl_runtime_profile
c65238
 #undef VEC
c65238
-#undef VMOV
c65238
 #undef VMOVA
c65238
 #undef VEC_SIZE
c65238
 
c65238
 #define VEC_SIZE		32
c65238
 #define VMOVA			vmovdqa
c65238
-#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
c65238
-# define VMOV			vmovdqa
c65238
-#else
c65238
-# define VMOV			vmovdqu
c65238
-#endif
c65238
 #define VEC(i)			ymm##i
c65238
-#define _dl_runtime_resolve	_dl_runtime_resolve_avx
c65238
-#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx_opt
c65238
 #define _dl_runtime_profile	_dl_runtime_profile_avx
c65238
 #include "dl-trampoline.h"
c65238
-#undef _dl_runtime_resolve
c65238
-#undef _dl_runtime_resolve_opt
c65238
 #undef _dl_runtime_profile
c65238
 #undef VEC
c65238
-#undef VMOV
c65238
 #undef VMOVA
c65238
 #undef VEC_SIZE
c65238
 
c65238
 /* movaps/movups is 1-byte shorter.  */
c65238
 #define VEC_SIZE		16
c65238
 #define VMOVA			movaps
c65238
-#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
c65238
-# define VMOV			movaps
c65238
-#else
c65238
-# define VMOV			movups
c65238
- #endif
c65238
 #define VEC(i)			xmm##i
c65238
-#define _dl_runtime_resolve	_dl_runtime_resolve_sse
c65238
 #define _dl_runtime_profile	_dl_runtime_profile_sse
c65238
 #undef RESTORE_AVX
c65238
 #include "dl-trampoline.h"
c65238
-#undef _dl_runtime_resolve
c65238
 #undef _dl_runtime_profile
c65238
-#undef VMOV
c65238
+#undef VEC
c65238
 #undef VMOVA
c65238
+#undef VEC_SIZE
c65238
 
c65238
-/* Used by _dl_runtime_resolve_avx_opt/_dl_runtime_resolve_avx512_opt
c65238
-   to preserve the full vector registers with zero upper bits.  */
c65238
-#define VMOVA			vmovdqa
c65238
-#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
c65238
-# define VMOV			vmovdqa
c65238
-#else
c65238
-# define VMOV			vmovdqu
c65238
-#endif
c65238
-#define _dl_runtime_resolve	_dl_runtime_resolve_sse_vex
c65238
-#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx512_opt
c65238
+#define USE_FXSAVE
c65238
+#define STATE_SAVE_ALIGNMENT	16
c65238
+#define _dl_runtime_resolve	_dl_runtime_resolve_fxsave
c65238
 #include "dl-trampoline.h"
c65238
+#undef _dl_runtime_resolve
c65238
+#undef USE_FXSAVE
c65238
+#undef STATE_SAVE_ALIGNMENT
c65238
+
c65238
+#define USE_XSAVE
c65238
+#define STATE_SAVE_ALIGNMENT	64
c65238
+#define _dl_runtime_resolve	_dl_runtime_resolve_xsave
c65238
+#include "dl-trampoline.h"
c65238
+#undef _dl_runtime_resolve
c65238
+#undef USE_XSAVE
c65238
+#undef STATE_SAVE_ALIGNMENT
c65238
+
c65238
+#define USE_XSAVEC
c65238
+#define STATE_SAVE_ALIGNMENT	64
c65238
+#define _dl_runtime_resolve	_dl_runtime_resolve_xsavec
c65238
+#include "dl-trampoline.h"
c65238
+#undef _dl_runtime_resolve
c65238
+#undef USE_XSAVEC
c65238
+#undef STATE_SAVE_ALIGNMENT
c65238
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
c65238
index 849cab4cd30e122a..525de575e3c4e52c 100644
c65238
--- a/sysdeps/x86_64/dl-trampoline.h
c65238
+++ b/sysdeps/x86_64/dl-trampoline.h
c65238
@@ -16,140 +16,47 @@
c65238
    License along with the GNU C Library; if not, see
c65238
    <http://www.gnu.org/licenses/>.  */
c65238
 
c65238
-#undef REGISTER_SAVE_AREA_RAW
c65238
-#ifdef __ILP32__
c65238
-/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to
c65238
-   VEC7.  */
c65238
-# define REGISTER_SAVE_AREA_RAW	(8 * 7 + VEC_SIZE * 8)
c65238
-#else
c65238
-/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
c65238
-   BND0, BND1, BND2, BND3 and VEC0 to VEC7. */
c65238
-# define REGISTER_SAVE_AREA_RAW	(8 * 7 + 16 * 4 + VEC_SIZE * 8)
c65238
-#endif
c65238
+	.text
c65238
+#ifdef _dl_runtime_resolve
c65238
 
c65238
-#undef REGISTER_SAVE_AREA
c65238
-#undef LOCAL_STORAGE_AREA
c65238
-#undef BASE
c65238
-#if DL_RUNIME_RESOLVE_REALIGN_STACK
c65238
-# define REGISTER_SAVE_AREA	(REGISTER_SAVE_AREA_RAW + 8)
c65238
-/* Local stack area before jumping to function address: RBX.  */
c65238
-# define LOCAL_STORAGE_AREA	8
c65238
-# define BASE			rbx
c65238
-# if (REGISTER_SAVE_AREA % VEC_SIZE) != 0
c65238
-#  error REGISTER_SAVE_AREA must be multples of VEC_SIZE
c65238
-# endif
c65238
-#else
c65238
-# define REGISTER_SAVE_AREA	REGISTER_SAVE_AREA_RAW
c65238
-/* Local stack area before jumping to function address:  All saved
c65238
-   registers.  */
c65238
-# define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
c65238
-# define BASE			rsp
c65238
-# if (REGISTER_SAVE_AREA % 16) != 8
c65238
-#  error REGISTER_SAVE_AREA must be odd multples of 8
c65238
+# undef REGISTER_SAVE_AREA
c65238
+# undef LOCAL_STORAGE_AREA
c65238
+# undef BASE
c65238
+
c65238
+# if (STATE_SAVE_ALIGNMENT % 16) != 0
c65238
+#  error STATE_SAVE_ALIGNMENT must be multples of 16
c65238
 # endif
c65238
-#endif
c65238
 
c65238
-	.text
c65238
-#ifdef _dl_runtime_resolve_opt
c65238
-/* Use the smallest vector registers to preserve the full YMM/ZMM
c65238
-   registers to avoid SSE transition penalty.  */
c65238
-
c65238
-# if VEC_SIZE == 32
c65238
-/* Check if the upper 128 bits in %ymm0 - %ymm7 registers are non-zero
c65238
-   and preserve %xmm0 - %xmm7 registers with the zero upper bits.  Since
c65238
-   there is no SSE transition penalty on AVX512 processors which don't
c65238
-   support XGETBV with ECX == 1, _dl_runtime_resolve_avx512_slow isn't
c65238
-   provided.   */
c65238
-	.globl _dl_runtime_resolve_avx_slow
c65238
-	.hidden _dl_runtime_resolve_avx_slow
c65238
-	.type _dl_runtime_resolve_avx_slow, @function
c65238
-	.align 16
c65238
-_dl_runtime_resolve_avx_slow:
c65238
-	cfi_startproc
c65238
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
c65238
-	vorpd %ymm0, %ymm1, %ymm8
c65238
-	vorpd %ymm2, %ymm3, %ymm9
c65238
-	vorpd %ymm4, %ymm5, %ymm10
c65238
-	vorpd %ymm6, %ymm7, %ymm11
c65238
-	vorpd %ymm8, %ymm9, %ymm9
c65238
-	vorpd %ymm10, %ymm11, %ymm10
c65238
-	vpcmpeqd %xmm8, %xmm8, %xmm8
c65238
-	vorpd %ymm9, %ymm10, %ymm10
c65238
-	vptest %ymm10, %ymm8
c65238
-	# Preserve %ymm0 - %ymm7 registers if the upper 128 bits of any
c65238
-	# %ymm0 - %ymm7 registers aren't zero.
c65238
-	PRESERVE_BND_REGS_PREFIX
c65238
-	jnc _dl_runtime_resolve_avx
c65238
-	# Use vzeroupper to avoid SSE transition penalty.
c65238
-	vzeroupper
c65238
-	# Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits
c65238
-	# when the upper 128 bits of %ymm0 - %ymm7 registers are zero.
c65238
-	PRESERVE_BND_REGS_PREFIX
c65238
-	jmp _dl_runtime_resolve_sse_vex
c65238
-	cfi_adjust_cfa_offset(-16) # Restore PLT adjustment
c65238
-	cfi_endproc
c65238
-	.size _dl_runtime_resolve_avx_slow, .-_dl_runtime_resolve_avx_slow
c65238
+# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
c65238
+#  error STATE_SAVE_OFFSET must be multples of STATE_SAVE_ALIGNMENT
c65238
 # endif
c65238
 
c65238
-/* Use XGETBV with ECX == 1 to check which bits in vector registers are
c65238
-   non-zero and only preserve the non-zero lower bits with zero upper
c65238
-   bits.  */
c65238
-	.globl _dl_runtime_resolve_opt
c65238
-	.hidden _dl_runtime_resolve_opt
c65238
-	.type _dl_runtime_resolve_opt, @function
c65238
-	.align 16
c65238
-_dl_runtime_resolve_opt:
c65238
-	cfi_startproc
c65238
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
c65238
-	pushq %rax
c65238
-	cfi_adjust_cfa_offset(8)
c65238
-	cfi_rel_offset(%rax, 0)
c65238
-	pushq %rcx
c65238
-	cfi_adjust_cfa_offset(8)
c65238
-	cfi_rel_offset(%rcx, 0)
c65238
-	pushq %rdx
c65238
-	cfi_adjust_cfa_offset(8)
c65238
-	cfi_rel_offset(%rdx, 0)
c65238
-	movl $1, %ecx
c65238
-	xgetbv
c65238
-	movl %eax, %r11d
c65238
-	popq %rdx
c65238
-	cfi_adjust_cfa_offset(-8)
c65238
-	cfi_restore (%rdx)
c65238
-	popq %rcx
c65238
-	cfi_adjust_cfa_offset(-8)
c65238
-	cfi_restore (%rcx)
c65238
-	popq %rax
c65238
-	cfi_adjust_cfa_offset(-8)
c65238
-	cfi_restore (%rax)
c65238
-# if VEC_SIZE == 32
c65238
-	# For YMM registers, check if YMM state is in use.
c65238
-	andl $bit_YMM_state, %r11d
c65238
-	# Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits if
c65238
-	# YMM state isn't in use.
c65238
-	PRESERVE_BND_REGS_PREFIX
c65238
-	jz _dl_runtime_resolve_sse_vex
c65238
-# elif VEC_SIZE == 16
c65238
-	# For ZMM registers, check if YMM state and ZMM state are in
c65238
-	# use.
c65238
-	andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d
c65238
-	cmpl $bit_YMM_state, %r11d
c65238
-	# Preserve %zmm0 - %zmm7 registers if ZMM state is in use.
c65238
-	PRESERVE_BND_REGS_PREFIX
c65238
-	jg _dl_runtime_resolve_avx512
c65238
-	# Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if
c65238
-	# ZMM state isn't in use.
c65238
-	PRESERVE_BND_REGS_PREFIX
c65238
-	je _dl_runtime_resolve_avx
c65238
-	# Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
c65238
-	# neither YMM state nor ZMM state are in use.
c65238
+# if DL_RUNTIME_RESOLVE_REALIGN_STACK
c65238
+/* Local stack area before jumping to function address: RBX.  */
c65238
+#  define LOCAL_STORAGE_AREA	8
c65238
+#  define BASE			rbx
c65238
+#  ifdef USE_FXSAVE
c65238
+/* Use fxsave to save XMM registers.  */
c65238
+#   define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET)
c65238
+#   if (REGISTER_SAVE_AREA % 16) != 0
c65238
+#    error REGISTER_SAVE_AREA must be multples of 16
c65238
+#   endif
c65238
+#  endif
c65238
 # else
c65238
-#  error Unsupported VEC_SIZE!
c65238
+#  ifndef USE_FXSAVE
c65238
+#   error USE_FXSAVE must be defined
c65238
+#  endif
c65238
+/* Use fxsave to save XMM registers.  */
c65238
+#  define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET + 8)
c65238
+/* Local stack area before jumping to function address:  All saved
c65238
+   registers.  */
c65238
+#  define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
c65238
+#  define BASE			rsp
c65238
+#  if (REGISTER_SAVE_AREA % 16) != 8
c65238
+#   error REGISTER_SAVE_AREA must be odd multples of 8
c65238
+#  endif
c65238
 # endif
c65238
-	cfi_adjust_cfa_offset(-16) # Restore PLT adjustment
c65238
-	cfi_endproc
c65238
-	.size _dl_runtime_resolve_opt, .-_dl_runtime_resolve_opt
c65238
-#endif
c65238
+
c65238
 	.globl _dl_runtime_resolve
c65238
 	.hidden _dl_runtime_resolve
c65238
 	.type _dl_runtime_resolve, @function
c65238
@@ -157,19 +64,30 @@ _dl_runtime_resolve_opt:
c65238
 	cfi_startproc
c65238
 _dl_runtime_resolve:
c65238
 	cfi_adjust_cfa_offset(16) # Incorporate PLT
c65238
-#if DL_RUNIME_RESOLVE_REALIGN_STACK
c65238
-# if LOCAL_STORAGE_AREA != 8
c65238
-#  error LOCAL_STORAGE_AREA must be 8
c65238
-# endif
c65238
+# if DL_RUNTIME_RESOLVE_REALIGN_STACK
c65238
+#  if LOCAL_STORAGE_AREA != 8
c65238
+#   error LOCAL_STORAGE_AREA must be 8
c65238
+#  endif
c65238
 	pushq %rbx			# push subtracts stack by 8.
c65238
 	cfi_adjust_cfa_offset(8)
c65238
 	cfi_rel_offset(%rbx, 0)
c65238
 	mov %RSP_LP, %RBX_LP
c65238
 	cfi_def_cfa_register(%rbx)
c65238
-	and $-VEC_SIZE, %RSP_LP
c65238
-#endif
c65238
+	and $-STATE_SAVE_ALIGNMENT, %RSP_LP
c65238
+# endif
c65238
+# ifdef REGISTER_SAVE_AREA
c65238
 	sub $REGISTER_SAVE_AREA, %RSP_LP
c65238
+#  if !DL_RUNTIME_RESOLVE_REALIGN_STACK
c65238
 	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
c65238
+#  endif
c65238
+# else
c65238
+	# Allocate stack space of the required size to save the state.
c65238
+#  if IS_IN (rtld)
c65238
+	sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
c65238
+#  else
c65238
+	sub _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
c65238
+#  endif
c65238
+# endif
c65238
 	# Preserve registers otherwise clobbered.
c65238
 	movq %rax, REGISTER_SAVE_RAX(%rsp)
c65238
 	movq %rcx, REGISTER_SAVE_RCX(%rsp)
c65238
@@ -178,59 +96,48 @@ _dl_runtime_resolve:
c65238
 	movq %rdi, REGISTER_SAVE_RDI(%rsp)
c65238
 	movq %r8, REGISTER_SAVE_R8(%rsp)
c65238
 	movq %r9, REGISTER_SAVE_R9(%rsp)
c65238
-	VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp)
c65238
-	VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp)
c65238
-	VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp)
c65238
-	VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp)
c65238
-	VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp)
c65238
-	VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp)
c65238
-	VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp)
c65238
-	VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp)
c65238
-#ifndef __ILP32__
c65238
-	# We also have to preserve bound registers.  These are nops if
c65238
-	# Intel MPX isn't available or disabled.
c65238
-# ifdef HAVE_MPX_SUPPORT
c65238
-	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
c65238
-	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
c65238
-	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
c65238
-	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
c65238
+# ifdef USE_FXSAVE
c65238
+	fxsave STATE_SAVE_OFFSET(%rsp)
c65238
 # else
c65238
-#  if REGISTER_SAVE_BND0 == 0
c65238
-	.byte 0x66,0x0f,0x1b,0x04,0x24
c65238
+	movl $STATE_SAVE_MASK, %eax
c65238
+	xorl %edx, %edx
c65238
+	# Clear the XSAVE Header.
c65238
+#  ifdef USE_XSAVE
c65238
+	movq %rdx, (STATE_SAVE_OFFSET + 512)(%rsp)
c65238
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
c65238
+#  endif
c65238
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
c65238
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
c65238
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
c65238
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
c65238
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
c65238
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
c65238
+#  ifdef USE_XSAVE
c65238
+	xsave STATE_SAVE_OFFSET(%rsp)
c65238
 #  else
c65238
-	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
c65238
+	# Since glibc 2.23 requires only binutils 2.22 or later, xsavec
c65238
+	# may not be supported.  Use .byte directive instead.
c65238
+#   if STATE_SAVE_OFFSET != 0x40
c65238
+#    error STATE_SAVE_OFFSET != 0x40
c65238
+#   endif
c65238
+	# xsavec STATE_SAVE_OFFSET(%rsp)
c65238
+	.byte 0x0f, 0xc7, 0x64, 0x24, 0x40
c65238
 #  endif
c65238
-	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
c65238
-	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
c65238
-	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
c65238
 # endif
c65238
-#endif
c65238
 	# Copy args pushed by PLT in register.
c65238
 	# %rdi: link_map, %rsi: reloc_index
c65238
 	mov (LOCAL_STORAGE_AREA + 8)(%BASE), %RSI_LP
c65238
 	mov LOCAL_STORAGE_AREA(%BASE), %RDI_LP
c65238
 	call _dl_fixup		# Call resolver.
c65238
 	mov %RAX_LP, %R11_LP	# Save return value
c65238
-#ifndef __ILP32__
c65238
-	# Restore bound registers.  These are nops if Intel MPX isn't
c65238
-	# avaiable or disabled.
c65238
-# ifdef HAVE_MPX_SUPPORT
c65238
-	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
c65238
-	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
c65238
-	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
c65238
-	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
c65238
+	# Get register content back.
c65238
+# ifdef USE_FXSAVE
c65238
+	fxrstor STATE_SAVE_OFFSET(%rsp)
c65238
 # else
c65238
-	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
c65238
-	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
c65238
-	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
c65238
-#  if REGISTER_SAVE_BND0 == 0
c65238
-	.byte 0x66,0x0f,0x1a,0x04,0x24
c65238
-#  else
c65238
-	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
c65238
-#  endif
c65238
+	movl $STATE_SAVE_MASK, %eax
c65238
+	xorl %edx, %edx
c65238
+	xrstor STATE_SAVE_OFFSET(%rsp)
c65238
 # endif
c65238
-#endif
c65238
-	# Get register content back.
c65238
 	movq REGISTER_SAVE_R9(%rsp), %r9
c65238
 	movq REGISTER_SAVE_R8(%rsp), %r8
c65238
 	movq REGISTER_SAVE_RDI(%rsp), %rdi
c65238
@@ -238,20 +145,12 @@ _dl_runtime_resolve:
c65238
 	movq REGISTER_SAVE_RDX(%rsp), %rdx
c65238
 	movq REGISTER_SAVE_RCX(%rsp), %rcx
c65238
 	movq REGISTER_SAVE_RAX(%rsp), %rax
c65238
-	VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0)
c65238
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1)
c65238
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2)
c65238
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3)
c65238
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4)
c65238
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5)
c65238
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6)
c65238
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7)
c65238
-#if DL_RUNIME_RESOLVE_REALIGN_STACK
c65238
+# if DL_RUNTIME_RESOLVE_REALIGN_STACK
c65238
 	mov %RBX_LP, %RSP_LP
c65238
 	cfi_def_cfa_register(%rsp)
c65238
 	movq (%rsp), %rbx
c65238
 	cfi_restore(%rbx)
c65238
-#endif
c65238
+# endif
c65238
 	# Adjust stack(PLT did 2 pushes)
c65238
 	add $(LOCAL_STORAGE_AREA + 16), %RSP_LP
c65238
 	cfi_adjust_cfa_offset(-(LOCAL_STORAGE_AREA + 16))
c65238
@@ -260,11 +159,9 @@ _dl_runtime_resolve:
c65238
 	jmp *%r11		# Jump to function address.
c65238
 	cfi_endproc
c65238
 	.size _dl_runtime_resolve, .-_dl_runtime_resolve
c65238
+#endif
c65238
 
c65238
 
c65238
-/* To preserve %xmm0 - %xmm7 registers, dl-trampoline.h is included
c65238
-   twice, for _dl_runtime_resolve_sse and _dl_runtime_resolve_sse_vex.
c65238
-   But we don't need another _dl_runtime_profile for XMM registers.  */
c65238
 #if !defined PROF && defined _dl_runtime_profile
c65238
 # if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
c65238
 #  error LR_VECTOR_OFFSET must be multples of VEC_SIZE