olga / rpms / glibc

Forked from rpms/glibc 5 years ago
Clone
00db10
Backport from Hongjiu Lu <hongjiu.lu@intel.com> of these upstream
00db10
commits:
00db10
00db10
commit b52b0d793dcb226ecb0ecca1e672ca265973233c
00db10
Author: H.J. Lu <hjl.tools@gmail.com>
00db10
Date:   Fri Oct 20 11:00:08 2017 -0700
00db10
00db10
    x86-64: Use fxsave/xsave/xsavec in _dl_runtime_resolve [BZ #21265]
00db10
    
00db10
    In _dl_runtime_resolve, use fxsave/xsave/xsavec to preserve all vector,
00db10
    mask and bound registers.  It simplifies _dl_runtime_resolve and supports
00db10
    different calling conventions.  ld.so code size is reduced by more than
00db10
    1 KB.  However, using fxsave/xsave/xsavec takes slightly more cycles
00db10
    than saving and restoring vector and bound registers individually.
00db10
    
00db10
    Latency for _dl_runtime_resolve to look up the function, foo, from one
00db10
    shared library plus libc.so:
00db10
    
00db10
                                 Before    After     Change
00db10
    
00db10
    Westmere (SSE)/fxsave         345      866       151%
00db10
    IvyBridge (AVX)/xsave         420      643       53%
00db10
    Haswell (AVX)/xsave           713      1252      75%
00db10
    Skylake (AVX+MPX)/xsavec      559      719       28%
00db10
    Skylake (AVX512+MPX)/xsavec   145      272       87%
00db10
    Ryzen (AVX)/xsavec            280      553       97%
00db10
    
00db10
    This is the worst case, where the portion of time spent saving and
00db10
    restoring registers is bigger than in the majority of cases.  With smaller
00db10
    _dl_runtime_resolve code size, overall performance impact is negligible.
00db10
    
00db10
    On IvyBridge, differences in build and test time of binutils with lazy
00db10
    binding GCC and binutils are noise.  On Westmere, differences in
00db10
    bootstrap and "make check" time of GCC 7 with lazy binding GCC and
00db10
    binutils are also noise.
00db10
00db10
commit 0ac8ee53e8efbfd6e1c37094b4653f5c2dad65b5
00db10
Author: H.J. Lu <hjl.tools@gmail.com>
00db10
Date:   Fri Aug 26 08:57:42 2016 -0700
00db10
00db10
    X86-64: Correct CFA in _dl_runtime_resolve
00db10
    
00db10
    When the stack is re-aligned in _dl_runtime_resolve, there is no need to
00db10
    adjust the CFA when allocating the register save area on the stack.
00db10
    
00db10
            * sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve): Don't
00db10
            adjust CFA when allocating register save area on re-aligned
00db10
            stack.
00db10
00db10
Storing the full xsave state size in xsave_state_full_size was not needed
00db10
because RHEL7 does not have the full tunables support that would use it;
00db10
therefore, support for xsave_state_full_size has been removed from the
00db10
changes in b52b0d793dcb226ecb0ecca1e672ca265973233c.
00db10
00db10
Index: glibc-2.17-c758a686/sysdeps/x86/cpu-features-offsets.sym
00db10
===================================================================
00db10
--- glibc-2.17-c758a686.orig/sysdeps/x86/cpu-features-offsets.sym
00db10
+++ glibc-2.17-c758a686/sysdeps/x86/cpu-features-offsets.sym
00db10
@@ -5,3 +5,5 @@
00db10
 #define rtld_global_ro_offsetof(mem) offsetof (struct rtld_global_ro, mem)
00db10
 
00db10
 RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET rtld_global_ro_offsetof (_dl_x86_cpu_features)
00db10
+
00db10
+XSAVE_STATE_SIZE_OFFSET	offsetof (struct cpu_features, xsave_state_size)
00db10
Index: glibc-2.17-c758a686/sysdeps/x86/cpu-features.c
00db10
===================================================================
00db10
--- glibc-2.17-c758a686.orig/sysdeps/x86/cpu-features.c
00db10
+++ glibc-2.17-c758a686/sysdeps/x86/cpu-features.c
00db10
@@ -18,6 +18,7 @@
00db10
 
00db10
 #include <cpuid.h>
00db10
 #include <cpu-features.h>
00db10
+#include <libc-internal.h>
00db10
 
00db10
 static inline void
00db10
 get_common_indeces (struct cpu_features *cpu_features,
00db10
@@ -148,20 +149,6 @@ init_cpu_features (struct cpu_features *
00db10
 	      break;
00db10
 	    }
00db10
 	}
00db10
-
00db10
-      /* To avoid SSE transition penalty, use _dl_runtime_resolve_slow.
00db10
-         If XGETBV suports ECX == 1, use _dl_runtime_resolve_opt.  */
00db10
-      cpu_features->feature[index_Use_dl_runtime_resolve_slow]
00db10
-	|= bit_Use_dl_runtime_resolve_slow;
00db10
-      if (cpu_features->max_cpuid >= 0xd)
00db10
-	{
00db10
-	  unsigned int eax;
00db10
-
00db10
-	  __cpuid_count (0xd, 1, eax, ebx, ecx, edx);
00db10
-	  if ((eax & (1 << 2)) != 0)
00db10
-	    cpu_features->feature[index_Use_dl_runtime_resolve_opt]
00db10
-	      |= bit_Use_dl_runtime_resolve_opt;
00db10
-	}
00db10
     }
00db10
   /* This spells out "AuthenticAMD".  */
00db10
   else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
00db10
@@ -244,6 +231,71 @@ init_cpu_features (struct cpu_features *
00db10
 	  if (HAS_CPU_FEATURE (FMA4))
00db10
 	    cpu_features->feature[index_FMA4_Usable] |= bit_FMA4_Usable;
00db10
 	}
00db10
+
00db10
+      /* For _dl_runtime_resolve, set xsave_state_size to xsave area
00db10
+	 size + integer register save size and align it to 64 bytes.  */
00db10
+      if (cpu_features->max_cpuid >= 0xd)
00db10
+	{
00db10
+	  unsigned int eax, ebx, ecx, edx;
00db10
+
00db10
+	  __cpuid_count (0xd, 0, eax, ebx, ecx, edx);
00db10
+	  if (ebx != 0)
00db10
+	    {
00db10
+	      cpu_features->xsave_state_size
00db10
+	       = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64);
00db10
+
00db10
+	      __cpuid_count (0xd, 1, eax, ebx, ecx, edx);
00db10
+
00db10
+	      /* Check if XSAVEC is available.  */
00db10
+	      if ((eax & (1 << 1)) != 0)
00db10
+		{
00db10
+		  unsigned int xstate_comp_offsets[32];
00db10
+		  unsigned int xstate_comp_sizes[32];
00db10
+		  unsigned int i;
00db10
+
00db10
+		  xstate_comp_offsets[0] = 0;
00db10
+		  xstate_comp_offsets[1] = 160;
00db10
+		  xstate_comp_offsets[2] = 576;
00db10
+		  xstate_comp_sizes[0] = 160;
00db10
+		  xstate_comp_sizes[1] = 256;
00db10
+
00db10
+		  for (i = 2; i < 32; i++)
00db10
+		    {
00db10
+		      if ((STATE_SAVE_MASK & (1 << i)) != 0)
00db10
+			{
00db10
+			  __cpuid_count (0xd, i, eax, ebx, ecx, edx);
00db10
+			  xstate_comp_sizes[i] = eax;
00db10
+			}
00db10
+		      else
00db10
+			{
00db10
+			  ecx = 0;
00db10
+			  xstate_comp_sizes[i] = 0;
00db10
+			}
00db10
+
00db10
+		      if (i > 2)
00db10
+			{
00db10
+			  xstate_comp_offsets[i]
00db10
+			   = (xstate_comp_offsets[i - 1]
00db10
+			      + xstate_comp_sizes[i -1]);
00db10
+			  if ((ecx & (1 << 1)) != 0)
00db10
+			    xstate_comp_offsets[i]
00db10
+			     = ALIGN_UP (xstate_comp_offsets[i], 64);
00db10
+			}
00db10
+		    }
00db10
+
00db10
+		  /* Use XSAVEC.  */
00db10
+		  unsigned int size
00db10
+		   = xstate_comp_offsets[31] + xstate_comp_sizes[31];
00db10
+		  if (size)
00db10
+		    {
00db10
+		      cpu_features->xsave_state_size
00db10
+		       = ALIGN_UP (size + STATE_SAVE_OFFSET, 64);
00db10
+		      cpu_features->feature[index_XSAVEC_Usable]
00db10
+		       |= bit_XSAVEC_Usable;
00db10
+		    }
00db10
+		}
00db10
+	    }
00db10
+	}
00db10
     }
00db10
 
00db10
   cpu_features->family = family;
00db10
Index: glibc-2.17-c758a686/sysdeps/x86/cpu-features.h
00db10
===================================================================
00db10
--- glibc-2.17-c758a686.orig/sysdeps/x86/cpu-features.h
00db10
+++ glibc-2.17-c758a686/sysdeps/x86/cpu-features.h
00db10
@@ -34,8 +34,7 @@
00db10
 #define bit_AVX512DQ_Usable		(1 << 13)
00db10
 #define bit_Prefer_MAP_32BIT_EXEC	(1 << 16)
00db10
 #define bit_Prefer_No_VZEROUPPER	(1 << 17)
00db10
-#define bit_Use_dl_runtime_resolve_opt	(1 << 20)
00db10
-#define bit_Use_dl_runtime_resolve_slow	(1 << 21)
00db10
+#define bit_XSAVEC_Usable		(1 << 18)
00db10
 
00db10
 
00db10
 /* CPUID Feature flags.  */
00db10
@@ -70,10 +69,20 @@
00db10
 /* The current maximum size of the feature integer bit array.  */
00db10
 #define FEATURE_INDEX_MAX 1
00db10
 
00db10
+/* Offset for fxsave/xsave area used by _dl_runtime_resolve.  Also need
00db10
+   space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX.  It must be
00db10
+   aligned to 16 bytes for fxsave and 64 bytes for xsave.  */
00db10
+#define STATE_SAVE_OFFSET (8 * 7 + 8)
00db10
+
00db10
+/* Save SSE, AVX, AVX512, mask and bound registers.  */
00db10
+#define STATE_SAVE_MASK \
00db10
+  ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
00db10
+
00db10
 #ifdef	__ASSEMBLER__
00db10
 
00db10
 # include <ifunc-defines.h>
00db10
 # include <rtld-global-offsets.h>
00db10
+# include <cpu-features-offsets.h>
00db10
 
00db10
 # define index_SSE2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
00db10
 # define index_SSSE3	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
00db10
@@ -98,8 +107,6 @@
00db10
 # define index_AVX512DQ_Usable		FEATURE_INDEX_1*FEATURE_SIZE
00db10
 # define index_Prefer_MAP_32BIT_EXEC	FEATURE_INDEX_1*FEATURE_SIZE
00db10
 # define index_Prefer_No_VZEROUPPER	FEATURE_INDEX_1*FEATURE_SIZE
00db10
-# define index_Use_dl_runtime_resolve_opt FEATURE_INDEX_1*FEATURE_SIZE
00db10
-# define index_Use_dl_runtime_resolve_slow FEATURE_INDEX_1*FEATURE_SIZE
00db10
 
00db10
 
00db10
 # if defined (_LIBC) && !IS_IN (nonlib)
00db10
@@ -214,6 +221,12 @@ struct cpu_features
00db10
   } cpuid[COMMON_CPUID_INDEX_MAX];
00db10
   unsigned int family;
00db10
   unsigned int model;
00db10
+  /* The type must be unsigned long int so that we use
00db10
+
00db10
+	sub xsave_state_size_offset(%rip) %RSP_LP
00db10
+
00db10
+     in _dl_runtime_resolve.  */
00db10
+  unsigned long int xsave_state_size;
00db10
   unsigned int feature[FEATURE_INDEX_MAX];
00db10
 };
00db10
 
00db10
@@ -279,8 +292,7 @@ extern const struct cpu_features *__get_
00db10
 # define index_AVX512DQ_Usable		FEATURE_INDEX_1
00db10
 # define index_Prefer_MAP_32BIT_EXEC	FEATURE_INDEX_1
00db10
 # define index_Prefer_No_VZEROUPPER     FEATURE_INDEX_1
00db10
-# define index_Use_dl_runtime_resolve_opt FEATURE_INDEX_1
00db10
-# define index_Use_dl_runtime_resolve_slow FEATURE_INDEX_1
00db10
+# define index_XSAVEC_Usable		FEATURE_INDEX_1
00db10
 
00db10
 #endif	/* !__ASSEMBLER__ */
00db10
 
00db10
Index: glibc-2.17-c758a686/sysdeps/x86_64/dl-machine.h
00db10
===================================================================
00db10
--- glibc-2.17-c758a686.orig/sysdeps/x86_64/dl-machine.h
00db10
+++ glibc-2.17-c758a686/sysdeps/x86_64/dl-machine.h
00db10
@@ -66,12 +66,9 @@ static inline int __attribute__ ((unused
00db10
 elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
00db10
 {
00db10
   Elf64_Addr *got;
00db10
-  extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden;
00db10
-  extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden;
00db10
-  extern void _dl_runtime_resolve_avx_slow (ElfW(Word)) attribute_hidden;
00db10
-  extern void _dl_runtime_resolve_avx_opt (ElfW(Word)) attribute_hidden;
00db10
-  extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden;
00db10
-  extern void _dl_runtime_resolve_avx512_opt (ElfW(Word)) attribute_hidden;
00db10
+  extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
00db10
+  extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
00db10
+  extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
00db10
   extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
00db10
   extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
00db10
   extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
00db10
@@ -120,29 +117,14 @@ elf_machine_runtime_setup (struct link_m
00db10
 	  /* This function will get called to fix up the GOT entry
00db10
 	     indicated by the offset on the stack, and then jump to
00db10
 	     the resolved address.  */
00db10
-	  if (HAS_ARCH_FEATURE (AVX512F_Usable))
00db10
-	    {
00db10
-	      if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
00db10
-		*(ElfW(Addr) *) (got + 2)
00db10
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx512_opt;
00db10
-	      else
00db10
-		*(ElfW(Addr) *) (got + 2)
00db10
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
00db10
-	    }
00db10
-	  else if (HAS_ARCH_FEATURE (AVX_Usable))
00db10
-	    {
00db10
-	      if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
00db10
-		*(ElfW(Addr) *) (got + 2)
00db10
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx_opt;
00db10
-	      else if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_slow))
00db10
-		*(ElfW(Addr) *) (got + 2)
00db10
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx_slow;
00db10
-	      else
00db10
-		*(ElfW(Addr) *) (got + 2)
00db10
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx;
00db10
-	    }
00db10
+	  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
00db10
+	    *(ElfW(Addr) *) (got + 2)
00db10
+	      = (HAS_ARCH_FEATURE (XSAVEC_Usable)
00db10
+		 ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
00db10
+		 : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
00db10
 	  else
00db10
-	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse;
00db10
+	    *(ElfW(Addr) *) (got + 2)
00db10
+	      = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
00db10
 	}
00db10
     }
00db10
 
00db10
Index: glibc-2.17-c758a686/sysdeps/x86_64/dl-trampoline.S
00db10
===================================================================
00db10
--- glibc-2.17-c758a686.orig/sysdeps/x86_64/dl-trampoline.S
00db10
+++ glibc-2.17-c758a686/sysdeps/x86_64/dl-trampoline.S
00db10
@@ -34,37 +34,24 @@
00db10
 # define DL_STACK_ALIGNMENT 8
00db10
 #endif
00db10
 
00db10
-#ifndef DL_RUNIME_UNALIGNED_VEC_SIZE
00db10
-/* The maximum size of unaligned vector load and store.  */
00db10
-# define DL_RUNIME_UNALIGNED_VEC_SIZE 16
00db10
-#endif
00db10
-
00db10
-/* True if _dl_runtime_resolve should align stack to VEC_SIZE bytes.  */
00db10
-#define DL_RUNIME_RESOLVE_REALIGN_STACK \
00db10
-  (VEC_SIZE > DL_STACK_ALIGNMENT \
00db10
-   && VEC_SIZE > DL_RUNIME_UNALIGNED_VEC_SIZE)
00db10
-
00db10
-/* Align vector register save area to 16 bytes.  */
00db10
-#define REGISTER_SAVE_VEC_OFF	0
00db10
+/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
00db10
+   stack to 16 bytes before calling _dl_fixup.  */
00db10
+#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
00db10
+  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
00db10
+   || 16 > DL_STACK_ALIGNMENT)
00db10
 
00db10
 /* Area on stack to save and restore registers used for parameter
00db10
    passing when calling _dl_fixup.  */
00db10
 #ifdef __ILP32__
00db10
-# define REGISTER_SAVE_RAX	(REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
00db10
 # define PRESERVE_BND_REGS_PREFIX
00db10
 #else
00db10
-/* Align bound register save area to 16 bytes.  */
00db10
-# define REGISTER_SAVE_BND0	(REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
00db10
-# define REGISTER_SAVE_BND1	(REGISTER_SAVE_BND0 + 16)
00db10
-# define REGISTER_SAVE_BND2	(REGISTER_SAVE_BND1 + 16)
00db10
-# define REGISTER_SAVE_BND3	(REGISTER_SAVE_BND2 + 16)
00db10
-# define REGISTER_SAVE_RAX	(REGISTER_SAVE_BND3 + 16)
00db10
 # ifdef HAVE_MPX_SUPPORT
00db10
 #  define PRESERVE_BND_REGS_PREFIX bnd
00db10
 # else
00db10
 #  define PRESERVE_BND_REGS_PREFIX .byte 0xf2
00db10
 # endif
00db10
 #endif
00db10
+#define REGISTER_SAVE_RAX	0
00db10
 #define REGISTER_SAVE_RCX	(REGISTER_SAVE_RAX + 8)
00db10
 #define REGISTER_SAVE_RDX	(REGISTER_SAVE_RCX + 8)
00db10
 #define REGISTER_SAVE_RSI	(REGISTER_SAVE_RDX + 8)
00db10
@@ -72,71 +59,60 @@
00db10
 #define REGISTER_SAVE_R8	(REGISTER_SAVE_RDI + 8)
00db10
 #define REGISTER_SAVE_R9	(REGISTER_SAVE_R8 + 8)
00db10
 
00db10
+#define RESTORE_AVX
00db10
+
00db10
 #define VEC_SIZE		64
00db10
 #define VMOVA			vmovdqa64
00db10
-#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
00db10
-# define VMOV			vmovdqa64
00db10
-#else
00db10
-# define VMOV			vmovdqu64
00db10
-#endif
00db10
 #define VEC(i)			zmm##i
00db10
-#define _dl_runtime_resolve	_dl_runtime_resolve_avx512
00db10
 #define _dl_runtime_profile	_dl_runtime_profile_avx512
00db10
-#define RESTORE_AVX
00db10
 #include "dl-trampoline.h"
00db10
-#undef _dl_runtime_resolve
00db10
 #undef _dl_runtime_profile
00db10
 #undef VEC
00db10
-#undef VMOV
00db10
 #undef VMOVA
00db10
 #undef VEC_SIZE
00db10
 
00db10
 #define VEC_SIZE		32
00db10
 #define VMOVA			vmovdqa
00db10
-#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
00db10
-# define VMOV			vmovdqa
00db10
-#else
00db10
-# define VMOV			vmovdqu
00db10
-#endif
00db10
 #define VEC(i)			ymm##i
00db10
-#define _dl_runtime_resolve	_dl_runtime_resolve_avx
00db10
-#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx_opt
00db10
 #define _dl_runtime_profile	_dl_runtime_profile_avx
00db10
 #include "dl-trampoline.h"
00db10
-#undef _dl_runtime_resolve
00db10
-#undef _dl_runtime_resolve_opt
00db10
 #undef _dl_runtime_profile
00db10
 #undef VEC
00db10
-#undef VMOV
00db10
 #undef VMOVA
00db10
 #undef VEC_SIZE
00db10
 
00db10
 /* movaps/movups is 1-byte shorter.  */
00db10
 #define VEC_SIZE		16
00db10
 #define VMOVA			movaps
00db10
-#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
00db10
-# define VMOV			movaps
00db10
-#else
00db10
-# define VMOV			movups
00db10
- #endif
00db10
 #define VEC(i)			xmm##i
00db10
-#define _dl_runtime_resolve	_dl_runtime_resolve_sse
00db10
 #define _dl_runtime_profile	_dl_runtime_profile_sse
00db10
 #undef RESTORE_AVX
00db10
 #include "dl-trampoline.h"
00db10
-#undef _dl_runtime_resolve
00db10
 #undef _dl_runtime_profile
00db10
-#undef VMOV
00db10
+#undef VEC
00db10
 #undef VMOVA
00db10
+#undef VEC_SIZE
00db10
 
00db10
-/* Used by _dl_runtime_resolve_avx_opt/_dl_runtime_resolve_avx512_opt
00db10
-   to preserve the full vector registers with zero upper bits.  */
00db10
-#define VMOVA			vmovdqa
00db10
-#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
00db10
-# define VMOV			vmovdqa
00db10
-#else
00db10
-# define VMOV			vmovdqu
00db10
-#endif
00db10
-#define _dl_runtime_resolve	_dl_runtime_resolve_sse_vex
00db10
-#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx512_opt
00db10
+#define USE_FXSAVE
00db10
+#define STATE_SAVE_ALIGNMENT	16
00db10
+#define _dl_runtime_resolve	_dl_runtime_resolve_fxsave
00db10
 #include "dl-trampoline.h"
00db10
+#undef _dl_runtime_resolve
00db10
+#undef USE_FXSAVE
00db10
+#undef STATE_SAVE_ALIGNMENT
00db10
+
00db10
+#define USE_XSAVE
00db10
+#define STATE_SAVE_ALIGNMENT	64
00db10
+#define _dl_runtime_resolve	_dl_runtime_resolve_xsave
00db10
+#include "dl-trampoline.h"
00db10
+#undef _dl_runtime_resolve
00db10
+#undef USE_XSAVE
00db10
+#undef STATE_SAVE_ALIGNMENT
00db10
+
00db10
+#define USE_XSAVEC
00db10
+#define STATE_SAVE_ALIGNMENT	64
00db10
+#define _dl_runtime_resolve	_dl_runtime_resolve_xsavec
00db10
+#include "dl-trampoline.h"
00db10
+#undef _dl_runtime_resolve
00db10
+#undef USE_XSAVEC
00db10
+#undef STATE_SAVE_ALIGNMENT
00db10
Index: glibc-2.17-c758a686/sysdeps/x86_64/dl-trampoline.h
00db10
===================================================================
00db10
--- glibc-2.17-c758a686.orig/sysdeps/x86_64/dl-trampoline.h
00db10
+++ glibc-2.17-c758a686/sysdeps/x86_64/dl-trampoline.h
00db10
@@ -16,140 +16,47 @@
00db10
    License along with the GNU C Library; if not, see
00db10
    <http://www.gnu.org/licenses/>.  */
00db10
 
00db10
-#undef REGISTER_SAVE_AREA_RAW
00db10
-#ifdef __ILP32__
00db10
-/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to
00db10
-   VEC7.  */
00db10
-# define REGISTER_SAVE_AREA_RAW	(8 * 7 + VEC_SIZE * 8)
00db10
-#else
00db10
-/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
00db10
-   BND0, BND1, BND2, BND3 and VEC0 to VEC7. */
00db10
-# define REGISTER_SAVE_AREA_RAW	(8 * 7 + 16 * 4 + VEC_SIZE * 8)
00db10
-#endif
00db10
+	.text
00db10
+#ifdef _dl_runtime_resolve
00db10
 
00db10
-#undef REGISTER_SAVE_AREA
00db10
-#undef LOCAL_STORAGE_AREA
00db10
-#undef BASE
00db10
-#if DL_RUNIME_RESOLVE_REALIGN_STACK
00db10
-# define REGISTER_SAVE_AREA	(REGISTER_SAVE_AREA_RAW + 8)
00db10
-/* Local stack area before jumping to function address: RBX.  */
00db10
-# define LOCAL_STORAGE_AREA	8
00db10
-# define BASE			rbx
00db10
-# if (REGISTER_SAVE_AREA % VEC_SIZE) != 0
00db10
-#  error REGISTER_SAVE_AREA must be multples of VEC_SIZE
00db10
-# endif
00db10
-#else
00db10
-# define REGISTER_SAVE_AREA	REGISTER_SAVE_AREA_RAW
00db10
-/* Local stack area before jumping to function address:  All saved
00db10
-   registers.  */
00db10
-# define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
00db10
-# define BASE			rsp
00db10
-# if (REGISTER_SAVE_AREA % 16) != 8
00db10
-#  error REGISTER_SAVE_AREA must be odd multples of 8
00db10
+# undef REGISTER_SAVE_AREA
00db10
+# undef LOCAL_STORAGE_AREA
00db10
+# undef BASE
00db10
+
00db10
+# if (STATE_SAVE_ALIGNMENT % 16) != 0
00db10
+#  error STATE_SAVE_ALIGNMENT must be multples of 16
00db10
 # endif
00db10
-#endif
00db10
 
00db10
-	.text
00db10
-#ifdef _dl_runtime_resolve_opt
00db10
-/* Use the smallest vector registers to preserve the full YMM/ZMM
00db10
-   registers to avoid SSE transition penalty.  */
00db10
-
00db10
-# if VEC_SIZE == 32
00db10
-/* Check if the upper 128 bits in %ymm0 - %ymm7 registers are non-zero
00db10
-   and preserve %xmm0 - %xmm7 registers with the zero upper bits.  Since
00db10
-   there is no SSE transition penalty on AVX512 processors which don't
00db10
-   support XGETBV with ECX == 1, _dl_runtime_resolve_avx512_slow isn't
00db10
-   provided.   */
00db10
-	.globl _dl_runtime_resolve_avx_slow
00db10
-	.hidden _dl_runtime_resolve_avx_slow
00db10
-	.type _dl_runtime_resolve_avx_slow, @function
00db10
-	.align 16
00db10
-_dl_runtime_resolve_avx_slow:
00db10
-	cfi_startproc
00db10
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
00db10
-	vorpd %ymm0, %ymm1, %ymm8
00db10
-	vorpd %ymm2, %ymm3, %ymm9
00db10
-	vorpd %ymm4, %ymm5, %ymm10
00db10
-	vorpd %ymm6, %ymm7, %ymm11
00db10
-	vorpd %ymm8, %ymm9, %ymm9
00db10
-	vorpd %ymm10, %ymm11, %ymm10
00db10
-	vpcmpeqd %xmm8, %xmm8, %xmm8
00db10
-	vorpd %ymm9, %ymm10, %ymm10
00db10
-	vptest %ymm10, %ymm8
00db10
-	# Preserve %ymm0 - %ymm7 registers if the upper 128 bits of any
00db10
-	# %ymm0 - %ymm7 registers aren't zero.
00db10
-	PRESERVE_BND_REGS_PREFIX
00db10
-	jnc _dl_runtime_resolve_avx
00db10
-	# Use vzeroupper to avoid SSE transition penalty.
00db10
-	vzeroupper
00db10
-	# Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits
00db10
-	# when the upper 128 bits of %ymm0 - %ymm7 registers are zero.
00db10
-	PRESERVE_BND_REGS_PREFIX
00db10
-	jmp _dl_runtime_resolve_sse_vex
00db10
-	cfi_adjust_cfa_offset(-16) # Restore PLT adjustment
00db10
-	cfi_endproc
00db10
-	.size _dl_runtime_resolve_avx_slow, .-_dl_runtime_resolve_avx_slow
00db10
+# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
00db10
+#  error STATE_SAVE_OFFSET must be multples of STATE_SAVE_ALIGNMENT
00db10
 # endif
00db10
 
00db10
-/* Use XGETBV with ECX == 1 to check which bits in vector registers are
00db10
-   non-zero and only preserve the non-zero lower bits with zero upper
00db10
-   bits.  */
00db10
-	.globl _dl_runtime_resolve_opt
00db10
-	.hidden _dl_runtime_resolve_opt
00db10
-	.type _dl_runtime_resolve_opt, @function
00db10
-	.align 16
00db10
-_dl_runtime_resolve_opt:
00db10
-	cfi_startproc
00db10
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
00db10
-	pushq %rax
00db10
-	cfi_adjust_cfa_offset(8)
00db10
-	cfi_rel_offset(%rax, 0)
00db10
-	pushq %rcx
00db10
-	cfi_adjust_cfa_offset(8)
00db10
-	cfi_rel_offset(%rcx, 0)
00db10
-	pushq %rdx
00db10
-	cfi_adjust_cfa_offset(8)
00db10
-	cfi_rel_offset(%rdx, 0)
00db10
-	movl $1, %ecx
00db10
-	xgetbv
00db10
-	movl %eax, %r11d
00db10
-	popq %rdx
00db10
-	cfi_adjust_cfa_offset(-8)
00db10
-	cfi_restore (%rdx)
00db10
-	popq %rcx
00db10
-	cfi_adjust_cfa_offset(-8)
00db10
-	cfi_restore (%rcx)
00db10
-	popq %rax
00db10
-	cfi_adjust_cfa_offset(-8)
00db10
-	cfi_restore (%rax)
00db10
-# if VEC_SIZE == 32
00db10
-	# For YMM registers, check if YMM state is in use.
00db10
-	andl $bit_YMM_state, %r11d
00db10
-	# Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits if
00db10
-	# YMM state isn't in use.
00db10
-	PRESERVE_BND_REGS_PREFIX
00db10
-	jz _dl_runtime_resolve_sse_vex
00db10
-# elif VEC_SIZE == 16
00db10
-	# For ZMM registers, check if YMM state and ZMM state are in
00db10
-	# use.
00db10
-	andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d
00db10
-	cmpl $bit_YMM_state, %r11d
00db10
-	# Preserve %zmm0 - %zmm7 registers if ZMM state is in use.
00db10
-	PRESERVE_BND_REGS_PREFIX
00db10
-	jg _dl_runtime_resolve_avx512
00db10
-	# Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if
00db10
-	# ZMM state isn't in use.
00db10
-	PRESERVE_BND_REGS_PREFIX
00db10
-	je _dl_runtime_resolve_avx
00db10
-	# Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
00db10
-	# neither YMM state nor ZMM state are in use.
00db10
+# if DL_RUNTIME_RESOLVE_REALIGN_STACK
00db10
+/* Local stack area before jumping to function address: RBX.  */
00db10
+#  define LOCAL_STORAGE_AREA	8
00db10
+#  define BASE			rbx
00db10
+#  ifdef USE_FXSAVE
00db10
+/* Use fxsave to save XMM registers.  */
00db10
+#   define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET)
00db10
+#   if (REGISTER_SAVE_AREA % 16) != 0
00db10
+#    error REGISTER_SAVE_AREA must be multples of 16
00db10
+#   endif
00db10
+#  endif
00db10
 # else
00db10
-#  error Unsupported VEC_SIZE!
00db10
+#  ifndef USE_FXSAVE
00db10
+#   error USE_FXSAVE must be defined
00db10
+#  endif
00db10
+/* Use fxsave to save XMM registers.  */
00db10
+#  define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET + 8)
00db10
+/* Local stack area before jumping to function address:  All saved
00db10
+   registers.  */
00db10
+#  define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
00db10
+#  define BASE			rsp
00db10
+#  if (REGISTER_SAVE_AREA % 16) != 8
00db10
+#   error REGISTER_SAVE_AREA must be odd multples of 8
00db10
+#  endif
00db10
 # endif
00db10
-	cfi_adjust_cfa_offset(-16) # Restore PLT adjustment
00db10
-	cfi_endproc
00db10
-	.size _dl_runtime_resolve_opt, .-_dl_runtime_resolve_opt
00db10
-#endif
00db10
+
00db10
 	.globl _dl_runtime_resolve
00db10
 	.hidden _dl_runtime_resolve
00db10
 	.type _dl_runtime_resolve, @function
00db10
@@ -157,19 +64,30 @@ _dl_runtime_resolve_opt:
00db10
 	cfi_startproc
00db10
 _dl_runtime_resolve:
00db10
 	cfi_adjust_cfa_offset(16) # Incorporate PLT
00db10
-#if DL_RUNIME_RESOLVE_REALIGN_STACK
00db10
-# if LOCAL_STORAGE_AREA != 8
00db10
-#  error LOCAL_STORAGE_AREA must be 8
00db10
-# endif
00db10
+# if DL_RUNTIME_RESOLVE_REALIGN_STACK
00db10
+#  if LOCAL_STORAGE_AREA != 8
00db10
+#   error LOCAL_STORAGE_AREA must be 8
00db10
+#  endif
00db10
 	pushq %rbx			# push subtracts stack by 8.
00db10
 	cfi_adjust_cfa_offset(8)
00db10
 	cfi_rel_offset(%rbx, 0)
00db10
 	mov %RSP_LP, %RBX_LP
00db10
 	cfi_def_cfa_register(%rbx)
00db10
-	and $-VEC_SIZE, %RSP_LP
00db10
-#endif
00db10
+	and $-STATE_SAVE_ALIGNMENT, %RSP_LP
00db10
+# endif
00db10
+# ifdef REGISTER_SAVE_AREA
00db10
 	sub $REGISTER_SAVE_AREA, %RSP_LP
00db10
+#  if !DL_RUNTIME_RESOLVE_REALIGN_STACK
00db10
 	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
00db10
+#  endif
00db10
+# else
00db10
+	# Allocate stack space of the required size to save the state.
00db10
+#  if IS_IN (rtld)
00db10
+	sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
00db10
+#  else
00db10
+	sub _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
00db10
+#  endif
00db10
+# endif
00db10
 	# Preserve registers otherwise clobbered.
00db10
 	movq %rax, REGISTER_SAVE_RAX(%rsp)
00db10
 	movq %rcx, REGISTER_SAVE_RCX(%rsp)
00db10
@@ -178,59 +96,48 @@ _dl_runtime_resolve:
00db10
 	movq %rdi, REGISTER_SAVE_RDI(%rsp)
00db10
 	movq %r8, REGISTER_SAVE_R8(%rsp)
00db10
 	movq %r9, REGISTER_SAVE_R9(%rsp)
00db10
-	VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp)
00db10
-	VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp)
00db10
-	VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp)
00db10
-	VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp)
00db10
-	VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp)
00db10
-	VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp)
00db10
-	VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp)
00db10
-	VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp)
00db10
-#ifndef __ILP32__
00db10
-	# We also have to preserve bound registers.  These are nops if
00db10
-	# Intel MPX isn't available or disabled.
00db10
-# ifdef HAVE_MPX_SUPPORT
00db10
-	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
00db10
-	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
00db10
-	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
00db10
-	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
00db10
+# ifdef USE_FXSAVE
00db10
+	fxsave STATE_SAVE_OFFSET(%rsp)
00db10
 # else
00db10
-#  if REGISTER_SAVE_BND0 == 0
00db10
-	.byte 0x66,0x0f,0x1b,0x04,0x24
00db10
+	movl $STATE_SAVE_MASK, %eax
00db10
+	xorl %edx, %edx
00db10
+	# Clear the XSAVE Header.
00db10
+#  ifdef USE_XSAVE
00db10
+	movq %rdx, (STATE_SAVE_OFFSET + 512)(%rsp)
00db10
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
00db10
+#  endif
00db10
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
00db10
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
00db10
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
00db10
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
00db10
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
00db10
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
00db10
+#  ifdef USE_XSAVE
00db10
+	xsave STATE_SAVE_OFFSET(%rsp)
00db10
 #  else
00db10
-	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
00db10
+	# Since glibc 2.23 requires only binutils 2.22 or later, xsavec
00db10
+	# may not be supported.  Use .byte directive instead.
00db10
+#   if STATE_SAVE_OFFSET != 0x40
00db10
+#    error STATE_SAVE_OFFSET != 0x40
00db10
+#   endif
00db10
+	# xsavec STATE_SAVE_OFFSET(%rsp)
00db10
+	.byte 0x0f, 0xc7, 0x64, 0x24, 0x40
00db10
 #  endif
00db10
-	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
00db10
-	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
00db10
-	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
00db10
 # endif
00db10
-#endif
00db10
 	# Copy args pushed by PLT in register.
00db10
 	# %rdi: link_map, %rsi: reloc_index
00db10
 	mov (LOCAL_STORAGE_AREA + 8)(%BASE), %RSI_LP
00db10
 	mov LOCAL_STORAGE_AREA(%BASE), %RDI_LP
00db10
 	call _dl_fixup		# Call resolver.
00db10
 	mov %RAX_LP, %R11_LP	# Save return value
00db10
-#ifndef __ILP32__
00db10
-	# Restore bound registers.  These are nops if Intel MPX isn't
00db10
-	# avaiable or disabled.
00db10
-# ifdef HAVE_MPX_SUPPORT
00db10
-	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
00db10
-	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
00db10
-	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
00db10
-	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
00db10
+	# Get register content back.
00db10
+# ifdef USE_FXSAVE
00db10
+	fxrstor STATE_SAVE_OFFSET(%rsp)
00db10
 # else
00db10
-	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
00db10
-	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
00db10
-	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
00db10
-#  if REGISTER_SAVE_BND0 == 0
00db10
-	.byte 0x66,0x0f,0x1a,0x04,0x24
00db10
-#  else
00db10
-	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
00db10
-#  endif
00db10
+	movl $STATE_SAVE_MASK, %eax
00db10
+	xorl %edx, %edx
00db10
+	xrstor STATE_SAVE_OFFSET(%rsp)
00db10
 # endif
00db10
-#endif
00db10
-	# Get register content back.
00db10
 	movq REGISTER_SAVE_R9(%rsp), %r9
00db10
 	movq REGISTER_SAVE_R8(%rsp), %r8
00db10
 	movq REGISTER_SAVE_RDI(%rsp), %rdi
00db10
@@ -238,20 +145,12 @@ _dl_runtime_resolve:
00db10
 	movq REGISTER_SAVE_RDX(%rsp), %rdx
00db10
 	movq REGISTER_SAVE_RCX(%rsp), %rcx
00db10
 	movq REGISTER_SAVE_RAX(%rsp), %rax
00db10
-	VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0)
00db10
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1)
00db10
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2)
00db10
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3)
00db10
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4)
00db10
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5)
00db10
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6)
00db10
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7)
00db10
-#if DL_RUNIME_RESOLVE_REALIGN_STACK
00db10
+# if DL_RUNTIME_RESOLVE_REALIGN_STACK
00db10
 	mov %RBX_LP, %RSP_LP
00db10
 	cfi_def_cfa_register(%rsp)
00db10
 	movq (%rsp), %rbx
00db10
 	cfi_restore(%rbx)
00db10
-#endif
00db10
+# endif
00db10
 	# Adjust stack(PLT did 2 pushes)
00db10
 	add $(LOCAL_STORAGE_AREA + 16), %RSP_LP
00db10
 	cfi_adjust_cfa_offset(-(LOCAL_STORAGE_AREA + 16))
00db10
@@ -260,11 +159,9 @@ _dl_runtime_resolve:
00db10
 	jmp *%r11		# Jump to function address.
00db10
 	cfi_endproc
00db10
 	.size _dl_runtime_resolve, .-_dl_runtime_resolve
00db10
+#endif
00db10
 
00db10
 
00db10
-/* To preserve %xmm0 - %xmm7 registers, dl-trampoline.h is included
00db10
-   twice, for _dl_runtime_resolve_sse and _dl_runtime_resolve_sse_vex.
00db10
-   But we don't need another _dl_runtime_profile for XMM registers.  */
00db10
 #if !defined PROF && defined _dl_runtime_profile
00db10
 # if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
00db10
 #  error LR_VECTOR_OFFSET must be multples of VEC_SIZE