From c65238be29272db3bf0aee0fe5dd8b86b1afb398 Mon Sep 17 00:00:00 2001 From: CentOS Sources Date: Nov 30 2017 13:09:59 +0000 Subject: import glibc-2.17-196.el7_4.2 --- diff --git a/SOURCES/glibc-rh1504969.patch b/SOURCES/glibc-rh1504969.patch new file mode 100644 index 0000000..d6de9fe --- /dev/null +++ b/SOURCES/glibc-rh1504969.patch @@ -0,0 +1,781 @@ +Backport from Hongjiu Lu of these upstream +commits: + +commit b52b0d793dcb226ecb0ecca1e672ca265973233c +Author: H.J. Lu +Date: Fri Oct 20 11:00:08 2017 -0700 + + x86-64: Use fxsave/xsave/xsavec in _dl_runtime_resolve [BZ #21265] + + In _dl_runtime_resolve, use fxsave/xsave/xsavec to preserve all vector, + mask and bound registers. It simplifies _dl_runtime_resolve and supports + different calling conventions. ld.so code size is reduced by more than + 1 KB. However, using fxsave/xsave/xsavec takes a little bit more cycles + than saving and restoring vector and bound registers individually. + + Latency for _dl_runtime_resolve to lookup the function, foo, from one + shared library plus libc.so: + + Before After Change + + Westmere (SSE)/fxsave 345 866 151% + IvyBridge (AVX)/xsave 420 643 53% + Haswell (AVX)/xsave 713 1252 75% + Skylake (AVX+MPX)/xsavec 559 719 28% + Skylake (AVX512+MPX)/xsavec 145 272 87% + Ryzen (AVX)/xsavec 280 553 97% + + This is the worst case where portion of time spent for saving and + restoring registers is bigger than majority of cases. With smaller + _dl_runtime_resolve code size, overall performance impact is negligible. + + On IvyBridge, differences in build and test time of binutils with lazy + binding GCC and binutils are noises. On Westmere, differences in + bootstrap and "make check" time of GCC 7 with lazy binding GCC and + binutils are also noises. + +commit 0ac8ee53e8efbfd6e1c37094b4653f5c2dad65b5 +Author: H.J.
Lu +Date: Fri Aug 26 08:57:42 2016 -0700 + + X86-64: Correct CFA in _dl_runtime_resolve + + When stack is re-aligned in _dl_runtime_resolve, there is no need to + adjust CFA when allocating register save area on stack. + + * sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve): Don't + adjust CFA when allocating register save area on re-aligned + stack. + +Storing the full xsave state size in xsave_state_full_size was not needed +because RHEL7 does not have the full tunables support that would use this, +therefore support for xsave_state_full_size has been removed from the +changes in b52b0d793dcb226ecb0ecca1e672ca265973233c + +diff --git a/sysdeps/x86/cpu-features-offsets.sym b/sysdeps/x86/cpu-features-offsets.sym +index a9d53d195f9eb609..1415005fc22be806 100644 +--- a/sysdeps/x86/cpu-features-offsets.sym ++++ b/sysdeps/x86/cpu-features-offsets.sym +@@ -5,3 +5,5 @@ + #define rtld_global_ro_offsetof(mem) offsetof (struct rtld_global_ro, mem) + + RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET rtld_global_ro_offsetof (_dl_x86_cpu_features) ++ ++XSAVE_STATE_SIZE_OFFSET offsetof (struct cpu_features, xsave_state_size) +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 17e9835f5716ca12..c9bb4fa6f524ba4e 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -18,6 +18,7 @@ + + #include + #include ++#include + + static inline void + get_common_indeces (struct cpu_features *cpu_features, +@@ -148,20 +149,6 @@ init_cpu_features (struct cpu_features *cpu_features) + break; + } + } +- +- /* To avoid SSE transition penalty, use _dl_runtime_resolve_slow. +- If XGETBV suports ECX == 1, use _dl_runtime_resolve_opt. 
*/ +- cpu_features->feature[index_Use_dl_runtime_resolve_slow] +- |= bit_Use_dl_runtime_resolve_slow; +- if (cpu_features->max_cpuid >= 0xd) +- { +- unsigned int eax; +- +- __cpuid_count (0xd, 1, eax, ebx, ecx, edx); +- if ((eax & (1 << 2)) != 0) +- cpu_features->feature[index_Use_dl_runtime_resolve_opt] +- |= bit_Use_dl_runtime_resolve_opt; +- } + } + /* This spells out "AuthenticAMD". */ + else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) +@@ -243,6 +230,71 @@ init_cpu_features (struct cpu_features *cpu_features) + /* Determine if FMA4 is usable. */ + if (HAS_CPU_FEATURE (FMA4)) + cpu_features->feature[index_FMA4_Usable] |= bit_FMA4_Usable; ++ ++ /* For _dl_runtime_resolve, set xsave_state_size to xsave area ++ size + integer register save size and align it to 64 bytes. */ ++ if (cpu_features->max_cpuid >= 0xd) ++ { ++ unsigned int eax, ebx, ecx, edx; ++ ++ __cpuid_count (0xd, 0, eax, ebx, ecx, edx); ++ if (ebx != 0) ++ { ++ cpu_features->xsave_state_size ++ = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64); ++ ++ __cpuid_count (0xd, 1, eax, ebx, ecx, edx); ++ ++ /* Check if XSAVEC is available. */ ++ if ((eax & (1 << 1)) != 0) ++ { ++ unsigned int xstate_comp_offsets[32]; ++ unsigned int xstate_comp_sizes[32]; ++ unsigned int i; ++ ++ xstate_comp_offsets[0] = 0; ++ xstate_comp_offsets[1] = 160; ++ xstate_comp_offsets[2] = 576; ++ xstate_comp_sizes[0] = 160; ++ xstate_comp_sizes[1] = 256; ++ ++ for (i = 2; i < 32; i++) ++ { ++ if ((STATE_SAVE_MASK & (1 << i)) != 0) ++ { ++ __cpuid_count (0xd, i, eax, ebx, ecx, edx); ++ xstate_comp_sizes[i] = eax; ++ } ++ else ++ { ++ ecx = 0; ++ xstate_comp_sizes[i] = 0; ++ } ++ ++ if (i > 2) ++ { ++ xstate_comp_offsets[i] ++ = (xstate_comp_offsets[i - 1] ++ + xstate_comp_sizes[i -1]); ++ if ((ecx & (1 << 1)) != 0) ++ xstate_comp_offsets[i] ++ = ALIGN_UP (xstate_comp_offsets[i], 64); ++ } ++ } ++ ++ /* Use XSAVEC. 
*/ ++ unsigned int size ++ = xstate_comp_offsets[31] + xstate_comp_sizes[31]; ++ if (size) ++ { ++ cpu_features->xsave_state_size ++ = ALIGN_UP (size + STATE_SAVE_OFFSET, 64); ++ cpu_features->feature[index_XSAVEC_Usable] ++ |= bit_XSAVEC_Usable; ++ } ++ } ++ } ++ } + } + } + +diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h +index c69abb349af8f09c..4e2e6fabb39ab600 100644 +--- a/sysdeps/x86/cpu-features.h ++++ b/sysdeps/x86/cpu-features.h +@@ -34,8 +34,7 @@ + #define bit_AVX512DQ_Usable (1 << 13) + #define bit_Prefer_MAP_32BIT_EXEC (1 << 16) + #define bit_Prefer_No_VZEROUPPER (1 << 17) +-#define bit_Use_dl_runtime_resolve_opt (1 << 20) +-#define bit_Use_dl_runtime_resolve_slow (1 << 21) ++#define bit_XSAVEC_Usable (1 << 18) + + + /* CPUID Feature flags. */ +@@ -70,10 +69,20 @@ + /* The current maximum size of the feature integer bit array. */ + #define FEATURE_INDEX_MAX 1 + ++/* Offset for fxsave/xsave area used by _dl_runtime_resolve. Also need ++ space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. It must be ++ aligned to 16 bytes for fxsave and 64 bytes for xsave. */ ++#define STATE_SAVE_OFFSET (8 * 7 + 8) ++ ++/* Save SSE, AVX, AVX512, mask and bound registers. 
*/ ++#define STATE_SAVE_MASK \ ++ ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7)) ++ + #ifdef __ASSEMBLER__ + + # include + # include ++# include + + # define index_SSE2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET + # define index_SSSE3 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET +@@ -98,8 +107,6 @@ + # define index_AVX512DQ_Usable FEATURE_INDEX_1*FEATURE_SIZE + # define index_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1*FEATURE_SIZE + # define index_Prefer_No_VZEROUPPER FEATURE_INDEX_1*FEATURE_SIZE +-# define index_Use_dl_runtime_resolve_opt FEATURE_INDEX_1*FEATURE_SIZE +-# define index_Use_dl_runtime_resolve_slow FEATURE_INDEX_1*FEATURE_SIZE + + + # if defined (_LIBC) && !IS_IN (nonlib) +@@ -214,6 +221,12 @@ struct cpu_features + } cpuid[COMMON_CPUID_INDEX_MAX]; + unsigned int family; + unsigned int model; ++ /* The type must be unsigned long int so that we use ++ ++ sub xsave_state_size_offset(%rip) %RSP_LP ++ ++ in _dl_runtime_resolve. */ ++ unsigned long int xsave_state_size; + unsigned int feature[FEATURE_INDEX_MAX]; + }; + +@@ -279,8 +292,7 @@ extern const struct cpu_features *__get_cpu_features (void) + # define index_AVX512DQ_Usable FEATURE_INDEX_1 + # define index_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1 + # define index_Prefer_No_VZEROUPPER FEATURE_INDEX_1 +-# define index_Use_dl_runtime_resolve_opt FEATURE_INDEX_1 +-# define index_Use_dl_runtime_resolve_slow FEATURE_INDEX_1 ++# define index_XSAVEC_Usable FEATURE_INDEX_1 + + #endif /* !__ASSEMBLER__ */ + +diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h +index 2a4cda1aff57db98..da89f2a6174a0d94 100644 +--- a/sysdeps/x86_64/dl-machine.h ++++ b/sysdeps/x86_64/dl-machine.h +@@ -66,12 +66,9 @@ static inline int __attribute__ ((unused, always_inline)) + elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) + { + Elf64_Addr *got; +- extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden; +- extern void _dl_runtime_resolve_avx (ElfW(Word)) 
attribute_hidden; +- extern void _dl_runtime_resolve_avx_slow (ElfW(Word)) attribute_hidden; +- extern void _dl_runtime_resolve_avx_opt (ElfW(Word)) attribute_hidden; +- extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden; +- extern void _dl_runtime_resolve_avx512_opt (ElfW(Word)) attribute_hidden; ++ extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden; ++ extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden; ++ extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden; +@@ -120,29 +117,14 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) + /* This function will get called to fix up the GOT entry + indicated by the offset on the stack, and then jump to + the resolved address. */ +- if (HAS_ARCH_FEATURE (AVX512F_Usable)) +- { +- if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt)) +- *(ElfW(Addr) *) (got + 2) +- = (ElfW(Addr)) &_dl_runtime_resolve_avx512_opt; +- else +- *(ElfW(Addr) *) (got + 2) +- = (ElfW(Addr)) &_dl_runtime_resolve_avx512; +- } +- else if (HAS_ARCH_FEATURE (AVX_Usable)) +- { +- if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt)) +- *(ElfW(Addr) *) (got + 2) +- = (ElfW(Addr)) &_dl_runtime_resolve_avx_opt; +- else if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_slow)) +- *(ElfW(Addr) *) (got + 2) +- = (ElfW(Addr)) &_dl_runtime_resolve_avx_slow; +- else +- *(ElfW(Addr) *) (got + 2) +- = (ElfW(Addr)) &_dl_runtime_resolve_avx; +- } ++ if (GLRO(dl_x86_cpu_features).xsave_state_size != 0) ++ *(ElfW(Addr) *) (got + 2) ++ = (HAS_ARCH_FEATURE (XSAVEC_Usable) ++ ? 
(ElfW(Addr)) &_dl_runtime_resolve_xsavec ++ : (ElfW(Addr)) &_dl_runtime_resolve_xsave); + else +- *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse; ++ *(ElfW(Addr) *) (got + 2) ++ = (ElfW(Addr)) &_dl_runtime_resolve_fxsave; + } + } + +diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S +index bd2d72edfea406e5..215a314f06ca874c 100644 +--- a/sysdeps/x86_64/dl-trampoline.S ++++ b/sysdeps/x86_64/dl-trampoline.S +@@ -34,37 +34,24 @@ + # define DL_STACK_ALIGNMENT 8 + #endif + +-#ifndef DL_RUNIME_UNALIGNED_VEC_SIZE +-/* The maximum size of unaligned vector load and store. */ +-# define DL_RUNIME_UNALIGNED_VEC_SIZE 16 +-#endif +- +-/* True if _dl_runtime_resolve should align stack to VEC_SIZE bytes. */ +-#define DL_RUNIME_RESOLVE_REALIGN_STACK \ +- (VEC_SIZE > DL_STACK_ALIGNMENT \ +- && VEC_SIZE > DL_RUNIME_UNALIGNED_VEC_SIZE) +- +-/* Align vector register save area to 16 bytes. */ +-#define REGISTER_SAVE_VEC_OFF 0 ++/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align ++ stack to 16 bytes before calling _dl_fixup. */ ++#define DL_RUNTIME_RESOLVE_REALIGN_STACK \ ++ (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \ ++ || 16 > DL_STACK_ALIGNMENT) + + /* Area on stack to save and restore registers used for parameter + passing when calling _dl_fixup. */ + #ifdef __ILP32__ +-# define REGISTER_SAVE_RAX (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8) + # define PRESERVE_BND_REGS_PREFIX + #else +-/* Align bound register save area to 16 bytes. 
*/ +-# define REGISTER_SAVE_BND0 (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8) +-# define REGISTER_SAVE_BND1 (REGISTER_SAVE_BND0 + 16) +-# define REGISTER_SAVE_BND2 (REGISTER_SAVE_BND1 + 16) +-# define REGISTER_SAVE_BND3 (REGISTER_SAVE_BND2 + 16) +-# define REGISTER_SAVE_RAX (REGISTER_SAVE_BND3 + 16) + # ifdef HAVE_MPX_SUPPORT + # define PRESERVE_BND_REGS_PREFIX bnd + # else + # define PRESERVE_BND_REGS_PREFIX .byte 0xf2 + # endif + #endif ++#define REGISTER_SAVE_RAX 0 + #define REGISTER_SAVE_RCX (REGISTER_SAVE_RAX + 8) + #define REGISTER_SAVE_RDX (REGISTER_SAVE_RCX + 8) + #define REGISTER_SAVE_RSI (REGISTER_SAVE_RDX + 8) +@@ -72,71 +59,60 @@ + #define REGISTER_SAVE_R8 (REGISTER_SAVE_RDI + 8) + #define REGISTER_SAVE_R9 (REGISTER_SAVE_R8 + 8) + ++#define RESTORE_AVX ++ + #define VEC_SIZE 64 + #define VMOVA vmovdqa64 +-#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT +-# define VMOV vmovdqa64 +-#else +-# define VMOV vmovdqu64 +-#endif + #define VEC(i) zmm##i +-#define _dl_runtime_resolve _dl_runtime_resolve_avx512 + #define _dl_runtime_profile _dl_runtime_profile_avx512 +-#define RESTORE_AVX + #include "dl-trampoline.h" +-#undef _dl_runtime_resolve + #undef _dl_runtime_profile + #undef VEC +-#undef VMOV + #undef VMOVA + #undef VEC_SIZE + + #define VEC_SIZE 32 + #define VMOVA vmovdqa +-#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT +-# define VMOV vmovdqa +-#else +-# define VMOV vmovdqu +-#endif + #define VEC(i) ymm##i +-#define _dl_runtime_resolve _dl_runtime_resolve_avx +-#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx_opt + #define _dl_runtime_profile _dl_runtime_profile_avx + #include "dl-trampoline.h" +-#undef _dl_runtime_resolve +-#undef _dl_runtime_resolve_opt + #undef _dl_runtime_profile + #undef VEC +-#undef VMOV + #undef VMOVA + #undef VEC_SIZE + + /* movaps/movups is 1-byte shorter. 
*/ + #define VEC_SIZE 16 + #define VMOVA movaps +-#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT +-# define VMOV movaps +-#else +-# define VMOV movups +- #endif + #define VEC(i) xmm##i +-#define _dl_runtime_resolve _dl_runtime_resolve_sse + #define _dl_runtime_profile _dl_runtime_profile_sse + #undef RESTORE_AVX + #include "dl-trampoline.h" +-#undef _dl_runtime_resolve + #undef _dl_runtime_profile +-#undef VMOV ++#undef VEC + #undef VMOVA ++#undef VEC_SIZE + +-/* Used by _dl_runtime_resolve_avx_opt/_dl_runtime_resolve_avx512_opt +- to preserve the full vector registers with zero upper bits. */ +-#define VMOVA vmovdqa +-#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT +-# define VMOV vmovdqa +-#else +-# define VMOV vmovdqu +-#endif +-#define _dl_runtime_resolve _dl_runtime_resolve_sse_vex +-#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt ++#define USE_FXSAVE ++#define STATE_SAVE_ALIGNMENT 16 ++#define _dl_runtime_resolve _dl_runtime_resolve_fxsave + #include "dl-trampoline.h" ++#undef _dl_runtime_resolve ++#undef USE_FXSAVE ++#undef STATE_SAVE_ALIGNMENT ++ ++#define USE_XSAVE ++#define STATE_SAVE_ALIGNMENT 64 ++#define _dl_runtime_resolve _dl_runtime_resolve_xsave ++#include "dl-trampoline.h" ++#undef _dl_runtime_resolve ++#undef USE_XSAVE ++#undef STATE_SAVE_ALIGNMENT ++ ++#define USE_XSAVEC ++#define STATE_SAVE_ALIGNMENT 64 ++#define _dl_runtime_resolve _dl_runtime_resolve_xsavec ++#include "dl-trampoline.h" ++#undef _dl_runtime_resolve ++#undef USE_XSAVEC ++#undef STATE_SAVE_ALIGNMENT +diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h +index 849cab4cd30e122a..525de575e3c4e52c 100644 +--- a/sysdeps/x86_64/dl-trampoline.h ++++ b/sysdeps/x86_64/dl-trampoline.h +@@ -16,140 +16,47 @@ + License along with the GNU C Library; if not, see + . */ + +-#undef REGISTER_SAVE_AREA_RAW +-#ifdef __ILP32__ +-/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to +- VEC7. 
*/ +-# define REGISTER_SAVE_AREA_RAW (8 * 7 + VEC_SIZE * 8) +-#else +-/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as +- BND0, BND1, BND2, BND3 and VEC0 to VEC7. */ +-# define REGISTER_SAVE_AREA_RAW (8 * 7 + 16 * 4 + VEC_SIZE * 8) +-#endif ++ .text ++#ifdef _dl_runtime_resolve + +-#undef REGISTER_SAVE_AREA +-#undef LOCAL_STORAGE_AREA +-#undef BASE +-#if DL_RUNIME_RESOLVE_REALIGN_STACK +-# define REGISTER_SAVE_AREA (REGISTER_SAVE_AREA_RAW + 8) +-/* Local stack area before jumping to function address: RBX. */ +-# define LOCAL_STORAGE_AREA 8 +-# define BASE rbx +-# if (REGISTER_SAVE_AREA % VEC_SIZE) != 0 +-# error REGISTER_SAVE_AREA must be multples of VEC_SIZE +-# endif +-#else +-# define REGISTER_SAVE_AREA REGISTER_SAVE_AREA_RAW +-/* Local stack area before jumping to function address: All saved +- registers. */ +-# define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA +-# define BASE rsp +-# if (REGISTER_SAVE_AREA % 16) != 8 +-# error REGISTER_SAVE_AREA must be odd multples of 8 ++# undef REGISTER_SAVE_AREA ++# undef LOCAL_STORAGE_AREA ++# undef BASE ++ ++# if (STATE_SAVE_ALIGNMENT % 16) != 0 ++# error STATE_SAVE_ALIGNMENT must be multples of 16 + # endif +-#endif + +- .text +-#ifdef _dl_runtime_resolve_opt +-/* Use the smallest vector registers to preserve the full YMM/ZMM +- registers to avoid SSE transition penalty. */ +- +-# if VEC_SIZE == 32 +-/* Check if the upper 128 bits in %ymm0 - %ymm7 registers are non-zero +- and preserve %xmm0 - %xmm7 registers with the zero upper bits. Since +- there is no SSE transition penalty on AVX512 processors which don't +- support XGETBV with ECX == 1, _dl_runtime_resolve_avx512_slow isn't +- provided. 
*/ +- .globl _dl_runtime_resolve_avx_slow +- .hidden _dl_runtime_resolve_avx_slow +- .type _dl_runtime_resolve_avx_slow, @function +- .align 16 +-_dl_runtime_resolve_avx_slow: +- cfi_startproc +- cfi_adjust_cfa_offset(16) # Incorporate PLT +- vorpd %ymm0, %ymm1, %ymm8 +- vorpd %ymm2, %ymm3, %ymm9 +- vorpd %ymm4, %ymm5, %ymm10 +- vorpd %ymm6, %ymm7, %ymm11 +- vorpd %ymm8, %ymm9, %ymm9 +- vorpd %ymm10, %ymm11, %ymm10 +- vpcmpeqd %xmm8, %xmm8, %xmm8 +- vorpd %ymm9, %ymm10, %ymm10 +- vptest %ymm10, %ymm8 +- # Preserve %ymm0 - %ymm7 registers if the upper 128 bits of any +- # %ymm0 - %ymm7 registers aren't zero. +- PRESERVE_BND_REGS_PREFIX +- jnc _dl_runtime_resolve_avx +- # Use vzeroupper to avoid SSE transition penalty. +- vzeroupper +- # Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits +- # when the upper 128 bits of %ymm0 - %ymm7 registers are zero. +- PRESERVE_BND_REGS_PREFIX +- jmp _dl_runtime_resolve_sse_vex +- cfi_adjust_cfa_offset(-16) # Restore PLT adjustment +- cfi_endproc +- .size _dl_runtime_resolve_avx_slow, .-_dl_runtime_resolve_avx_slow ++# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0 ++# error STATE_SAVE_OFFSET must be multples of STATE_SAVE_ALIGNMENT + # endif + +-/* Use XGETBV with ECX == 1 to check which bits in vector registers are +- non-zero and only preserve the non-zero lower bits with zero upper +- bits. 
*/ +- .globl _dl_runtime_resolve_opt +- .hidden _dl_runtime_resolve_opt +- .type _dl_runtime_resolve_opt, @function +- .align 16 +-_dl_runtime_resolve_opt: +- cfi_startproc +- cfi_adjust_cfa_offset(16) # Incorporate PLT +- pushq %rax +- cfi_adjust_cfa_offset(8) +- cfi_rel_offset(%rax, 0) +- pushq %rcx +- cfi_adjust_cfa_offset(8) +- cfi_rel_offset(%rcx, 0) +- pushq %rdx +- cfi_adjust_cfa_offset(8) +- cfi_rel_offset(%rdx, 0) +- movl $1, %ecx +- xgetbv +- movl %eax, %r11d +- popq %rdx +- cfi_adjust_cfa_offset(-8) +- cfi_restore (%rdx) +- popq %rcx +- cfi_adjust_cfa_offset(-8) +- cfi_restore (%rcx) +- popq %rax +- cfi_adjust_cfa_offset(-8) +- cfi_restore (%rax) +-# if VEC_SIZE == 32 +- # For YMM registers, check if YMM state is in use. +- andl $bit_YMM_state, %r11d +- # Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits if +- # YMM state isn't in use. +- PRESERVE_BND_REGS_PREFIX +- jz _dl_runtime_resolve_sse_vex +-# elif VEC_SIZE == 16 +- # For ZMM registers, check if YMM state and ZMM state are in +- # use. +- andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d +- cmpl $bit_YMM_state, %r11d +- # Preserve %zmm0 - %zmm7 registers if ZMM state is in use. +- PRESERVE_BND_REGS_PREFIX +- jg _dl_runtime_resolve_avx512 +- # Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if +- # ZMM state isn't in use. +- PRESERVE_BND_REGS_PREFIX +- je _dl_runtime_resolve_avx +- # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if +- # neither YMM state nor ZMM state are in use. ++# if DL_RUNTIME_RESOLVE_REALIGN_STACK ++/* Local stack area before jumping to function address: RBX. */ ++# define LOCAL_STORAGE_AREA 8 ++# define BASE rbx ++# ifdef USE_FXSAVE ++/* Use fxsave to save XMM registers. */ ++# define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET) ++# if (REGISTER_SAVE_AREA % 16) != 0 ++# error REGISTER_SAVE_AREA must be multples of 16 ++# endif ++# endif + # else +-# error Unsupported VEC_SIZE! 
++# ifndef USE_FXSAVE ++# error USE_FXSAVE must be defined ++# endif ++/* Use fxsave to save XMM registers. */ ++# define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET + 8) ++/* Local stack area before jumping to function address: All saved ++ registers. */ ++# define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA ++# define BASE rsp ++# if (REGISTER_SAVE_AREA % 16) != 8 ++# error REGISTER_SAVE_AREA must be odd multples of 8 ++# endif + # endif +- cfi_adjust_cfa_offset(-16) # Restore PLT adjustment +- cfi_endproc +- .size _dl_runtime_resolve_opt, .-_dl_runtime_resolve_opt +-#endif ++ + .globl _dl_runtime_resolve + .hidden _dl_runtime_resolve + .type _dl_runtime_resolve, @function +@@ -157,19 +64,30 @@ _dl_runtime_resolve_opt: + cfi_startproc + _dl_runtime_resolve: + cfi_adjust_cfa_offset(16) # Incorporate PLT +-#if DL_RUNIME_RESOLVE_REALIGN_STACK +-# if LOCAL_STORAGE_AREA != 8 +-# error LOCAL_STORAGE_AREA must be 8 +-# endif ++# if DL_RUNTIME_RESOLVE_REALIGN_STACK ++# if LOCAL_STORAGE_AREA != 8 ++# error LOCAL_STORAGE_AREA must be 8 ++# endif + pushq %rbx # push subtracts stack by 8. + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%rbx, 0) + mov %RSP_LP, %RBX_LP + cfi_def_cfa_register(%rbx) +- and $-VEC_SIZE, %RSP_LP +-#endif ++ and $-STATE_SAVE_ALIGNMENT, %RSP_LP ++# endif ++# ifdef REGISTER_SAVE_AREA + sub $REGISTER_SAVE_AREA, %RSP_LP ++# if !DL_RUNTIME_RESOLVE_REALIGN_STACK + cfi_adjust_cfa_offset(REGISTER_SAVE_AREA) ++# endif ++# else ++ # Allocate stack space of the required size to save the state. ++# if IS_IN (rtld) ++ sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP ++# else ++ sub _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP ++# endif ++# endif + # Preserve registers otherwise clobbered. 
+ movq %rax, REGISTER_SAVE_RAX(%rsp) + movq %rcx, REGISTER_SAVE_RCX(%rsp) +@@ -178,59 +96,48 @@ _dl_runtime_resolve: + movq %rdi, REGISTER_SAVE_RDI(%rsp) + movq %r8, REGISTER_SAVE_R8(%rsp) + movq %r9, REGISTER_SAVE_R9(%rsp) +- VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp) +- VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp) +- VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp) +- VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp) +- VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp) +- VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp) +- VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp) +- VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp) +-#ifndef __ILP32__ +- # We also have to preserve bound registers. These are nops if +- # Intel MPX isn't available or disabled. +-# ifdef HAVE_MPX_SUPPORT +- bndmov %bnd0, REGISTER_SAVE_BND0(%rsp) +- bndmov %bnd1, REGISTER_SAVE_BND1(%rsp) +- bndmov %bnd2, REGISTER_SAVE_BND2(%rsp) +- bndmov %bnd3, REGISTER_SAVE_BND3(%rsp) ++# ifdef USE_FXSAVE ++ fxsave STATE_SAVE_OFFSET(%rsp) + # else +-# if REGISTER_SAVE_BND0 == 0 +- .byte 0x66,0x0f,0x1b,0x04,0x24 ++ movl $STATE_SAVE_MASK, %eax ++ xorl %edx, %edx ++ # Clear the XSAVE Header. ++# ifdef USE_XSAVE ++ movq %rdx, (STATE_SAVE_OFFSET + 512)(%rsp) ++ movq %rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp) ++# endif ++ movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp) ++ movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp) ++ movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp) ++ movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp) ++ movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp) ++ movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp) ++# ifdef USE_XSAVE ++ xsave STATE_SAVE_OFFSET(%rsp) + # else +- .byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0 ++ # Since glibc 2.23 requires only binutils 2.22 or later, xsavec ++ # may not be supported. Use .byte directive instead. 
++# if STATE_SAVE_OFFSET != 0x40 ++# error STATE_SAVE_OFFSET != 0x40 ++# endif ++ # xsavec STATE_SAVE_OFFSET(%rsp) ++ .byte 0x0f, 0xc7, 0x64, 0x24, 0x40 + # endif +- .byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1 +- .byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2 +- .byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3 + # endif +-#endif + # Copy args pushed by PLT in register. + # %rdi: link_map, %rsi: reloc_index + mov (LOCAL_STORAGE_AREA + 8)(%BASE), %RSI_LP + mov LOCAL_STORAGE_AREA(%BASE), %RDI_LP + call _dl_fixup # Call resolver. + mov %RAX_LP, %R11_LP # Save return value +-#ifndef __ILP32__ +- # Restore bound registers. These are nops if Intel MPX isn't +- # avaiable or disabled. +-# ifdef HAVE_MPX_SUPPORT +- bndmov REGISTER_SAVE_BND3(%rsp), %bnd3 +- bndmov REGISTER_SAVE_BND2(%rsp), %bnd2 +- bndmov REGISTER_SAVE_BND1(%rsp), %bnd1 +- bndmov REGISTER_SAVE_BND0(%rsp), %bnd0 ++ # Get register content back. ++# ifdef USE_FXSAVE ++ fxrstor STATE_SAVE_OFFSET(%rsp) + # else +- .byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3 +- .byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2 +- .byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1 +-# if REGISTER_SAVE_BND0 == 0 +- .byte 0x66,0x0f,0x1a,0x04,0x24 +-# else +- .byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0 +-# endif ++ movl $STATE_SAVE_MASK, %eax ++ xorl %edx, %edx ++ xrstor STATE_SAVE_OFFSET(%rsp) + # endif +-#endif +- # Get register content back. 
+ movq REGISTER_SAVE_R9(%rsp), %r9 + movq REGISTER_SAVE_R8(%rsp), %r8 + movq REGISTER_SAVE_RDI(%rsp), %rdi +@@ -238,20 +145,12 @@ _dl_runtime_resolve: + movq REGISTER_SAVE_RDX(%rsp), %rdx + movq REGISTER_SAVE_RCX(%rsp), %rcx + movq REGISTER_SAVE_RAX(%rsp), %rax +- VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0) +- VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1) +- VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2) +- VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3) +- VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4) +- VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5) +- VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6) +- VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7) +-#if DL_RUNIME_RESOLVE_REALIGN_STACK ++# if DL_RUNTIME_RESOLVE_REALIGN_STACK + mov %RBX_LP, %RSP_LP + cfi_def_cfa_register(%rsp) + movq (%rsp), %rbx + cfi_restore(%rbx) +-#endif ++# endif + # Adjust stack(PLT did 2 pushes) + add $(LOCAL_STORAGE_AREA + 16), %RSP_LP + cfi_adjust_cfa_offset(-(LOCAL_STORAGE_AREA + 16)) +@@ -260,11 +159,9 @@ _dl_runtime_resolve: + jmp *%r11 # Jump to function address. + cfi_endproc + .size _dl_runtime_resolve, .-_dl_runtime_resolve ++#endif + + +-/* To preserve %xmm0 - %xmm7 registers, dl-trampoline.h is included +- twice, for _dl_runtime_resolve_sse and _dl_runtime_resolve_sse_vex. +- But we don't need another _dl_runtime_profile for XMM registers. 
*/ + #if !defined PROF && defined _dl_runtime_profile + # if (LR_VECTOR_OFFSET % VEC_SIZE) != 0 + # error LR_VECTOR_OFFSET must be multples of VEC_SIZE diff --git a/SOURCES/glibc-rh1515114-1.patch b/SOURCES/glibc-rh1515114-1.patch new file mode 100644 index 0000000..5ea0b91 --- /dev/null +++ b/SOURCES/glibc-rh1515114-1.patch @@ -0,0 +1,70 @@ +commit 911569d02dec023d949d96aa7b0e828c91c06f55 +Author: Carlos Eduardo Seo +Date: Mon Dec 28 16:36:46 2015 -0200 + + powerpc: Fix dl-procinfo HWCAP + + HWCAP-related code should have been updated when the 32 bits of HWCAP were + used. This patch updates the code in dl-procinfo.h to loop through all + the 32 bits in HWCAP and updates _dl_powerpc_cap_flags accordingly. + +diff --git a/sysdeps/powerpc/dl-procinfo.c b/sysdeps/powerpc/dl-procinfo.c +index 6e7850e..0b55906 100644 +--- a/sysdeps/powerpc/dl-procinfo.c ++++ b/sysdeps/powerpc/dl-procinfo.c +@@ -45,11 +45,12 @@ + #if !defined PROCINFO_DECL && defined SHARED + ._dl_powerpc_cap_flags + #else +-PROCINFO_CLASS const char _dl_powerpc_cap_flags[60][10] ++PROCINFO_CLASS const char _dl_powerpc_cap_flags[64][10] + #endif + #ifndef PROCINFO_DECL + = { +- "ppcle", "true_le", "archpmu", "vsx", ++ "ppcle", "true_le", "", "", ++ "", "", "archpmu", "vsx", + "arch_2_06", "power6x", "dfp", "pa6t", + "arch_2_05", "ic_snoop", "smt", "booke", + "cellbe", "power5+", "power5", "power4", +diff --git a/sysdeps/powerpc/dl-procinfo.h b/sysdeps/powerpc/dl-procinfo.h +index bce3a49..2187c5e 100644 +--- a/sysdeps/powerpc/dl-procinfo.h ++++ b/sysdeps/powerpc/dl-procinfo.h +@@ -22,9 +22,6 @@ + #include + #include /* This defines the PPC_FEATURE[2]_* macros. */ + +-/* There are 28 bits used, but they are bits 4..31. */ +-#define _DL_HWCAP_FIRST 4 +- + /* The total number of available bits (including those prior to + _DL_HWCAP_FIRST). Some of these bits might not be used.
*/ + #define _DL_HWCAP_COUNT 64 +@@ -68,7 +65,7 @@ static inline const char * + __attribute__ ((unused)) + _dl_hwcap_string (int idx) + { +- return GLRO(dl_powerpc_cap_flags)[idx - _DL_HWCAP_FIRST]; ++ return GLRO(dl_powerpc_cap_flags)[idx]; + } + + static inline const char * +@@ -82,7 +79,7 @@ static inline int + __attribute__ ((unused)) + _dl_string_hwcap (const char *str) + { +- for (int i = _DL_HWCAP_FIRST; i < _DL_HWCAP_COUNT; ++i) ++ for (int i = 0; i < _DL_HWCAP_COUNT; ++i) + if (strcmp (str, _dl_hwcap_string (i)) == 0) + return i; + return -1; +@@ -180,7 +177,7 @@ _dl_procinfo (unsigned int type, unsigned long int word) + case AT_HWCAP: + _dl_printf ("AT_HWCAP: "); + +- for (int i = _DL_HWCAP_FIRST; i <= _DL_HWCAP_LAST; ++i) ++ for (int i = 0; i <= _DL_HWCAP_LAST; ++i) + if (word & (1 << i)) + _dl_printf (" %s", _dl_hwcap_string (i)); + break; diff --git a/SOURCES/glibc-rh1515114-2.patch b/SOURCES/glibc-rh1515114-2.patch new file mode 100644 index 0000000..255d05a --- /dev/null +++ b/SOURCES/glibc-rh1515114-2.patch @@ -0,0 +1,49 @@ +commit 7dcdfbcf6749cdc4c63e2613cbb3e2392d2fc2fb +Author: Tulio Magno Quites Machado Filho +Date: Fri Jun 23 09:10:32 2017 -0300 + + powerpc: Update AT_HWCAP[2] bits + + Linux commit ID a4700a26107241cc7b9ac8528b2c6714ff99983d reserved 2 more + bits for the instructions darn (Deliver a Random Number) and scv (System + Call Vectored). + + Linux commit ID 6997e57d693b07289694239e52a10d2f02c3a46f reserved + another bit for internal usage. + + * sysdeps/powerpc/bits/hwcap.h: Add PPC_FEATURE2_DARN and + PPC_FEATURE2_SCV. + * sysdeps/powerpc/dl-procinfo.c (_dl_powerpc_cap_flags): Add scv + and darn. + +diff --git a/sysdeps/powerpc/bits/hwcap.h b/sysdeps/powerpc/bits/hwcap.h +index c9daeed..dfc71c2 100644 +--- a/sysdeps/powerpc/bits/hwcap.h ++++ b/sysdeps/powerpc/bits/hwcap.h +@@ -50,6 +50,7 @@ + #define PPC_FEATURE_ARCH_2_06 0x00000100 /* ISA 2.06 */ + #define PPC_FEATURE_HAS_VSX 0x00000080 /* P7 Vector Extension. 
*/ + #define PPC_FEATURE_PSERIES_PERFMON_COMPAT 0x00000040 ++/* Reserved by the kernel. 0x00000004 Do not use. */ + #define PPC_FEATURE_TRUE_LE 0x00000002 + #define PPC_FEATURE_PPC_LE 0x00000001 + +@@ -69,3 +70,5 @@ + #define PPC_FEATURE2_ARCH_3_00 0x00800000 /* ISA 3.0 */ + #define PPC_FEATURE2_HAS_IEEE128 0x00400000 /* VSX IEEE Binary Float + 128-bit */ ++#define PPC_FEATURE2_DARN 0x00200000 /* darn instruction. */ ++#define PPC_FEATURE2_SCV 0x00100000 /* scv syscall. */ +diff --git a/sysdeps/powerpc/dl-procinfo.c b/sysdeps/powerpc/dl-procinfo.c +index cd7329b..4dac16d 100644 +--- a/sysdeps/powerpc/dl-procinfo.c ++++ b/sysdeps/powerpc/dl-procinfo.c +@@ -62,7 +62,7 @@ PROCINFO_CLASS const char _dl_powerpc_cap_flags[64][10] + "", "", "", "", + "", "", "", "", + "", "", "", "", +- "", "", "ieee128", "arch_3_00", ++ "scv", "darn", "ieee128", "arch_3_00", + "htm-nosc", "vcrypto", "tar", "isel", + "ebb", "dscr", "htm", "arch_2_07", + } diff --git a/SOURCES/glibc-rh1515114-3.patch b/SOURCES/glibc-rh1515114-3.patch new file mode 100644 index 0000000..d1408ea --- /dev/null +++ b/SOURCES/glibc-rh1515114-3.patch @@ -0,0 +1,49 @@ +commit df0c40ee3a893238ac11f4c0d876a0c3b49d198d +Author: Tulio Magno Quites Machado Filho +Date: Fri Nov 17 21:15:15 2017 -0200 + + powerpc: Update AT_HWCAP2 bits + + Linux commit ID cba6ac4869e45cc93ac5497024d1d49576e82666 reserved a new + bit for a scenario where transactional memory is available, but the + suspended state is disabled. + + * sysdeps/powerpc/bits/hwcap.h (PPC_FEATURE2_HTM_NO_SUSPEND): New + macro. + * sysdeps/powerpc/dl-procinfo.c (_dl_powerpc_cap_flags): Add + htm-no-suspend. + + Signed-off-by: Tulio Magno Quites Machado Filho + +diff --git a/sysdeps/powerpc/bits/hwcap.h b/sysdeps/powerpc/bits/hwcap.h +index dfc71c2..0668ca0 100644 +--- a/sysdeps/powerpc/bits/hwcap.h ++++ b/sysdeps/powerpc/bits/hwcap.h +@@ -72,3 +72,5 @@ + 128-bit */ + #define PPC_FEATURE2_DARN 0x00200000 /* darn instruction. 
*/ + #define PPC_FEATURE2_SCV 0x00100000 /* scv syscall. */ ++#define PPC_FEATURE2_HTM_NO_SUSPEND 0x00080000 /* TM without suspended ++ state. */ +diff --git a/sysdeps/powerpc/dl-procinfo.c b/sysdeps/powerpc/dl-procinfo.c +index 4dac16d..55a6e78 100644 +--- a/sysdeps/powerpc/dl-procinfo.c ++++ b/sysdeps/powerpc/dl-procinfo.c +@@ -45,7 +45,7 @@ + #if !defined PROCINFO_DECL && defined SHARED + ._dl_powerpc_cap_flags + #else +-PROCINFO_CLASS const char _dl_powerpc_cap_flags[64][10] ++PROCINFO_CLASS const char _dl_powerpc_cap_flags[64][15] + #endif + #ifndef PROCINFO_DECL + = { +@@ -61,7 +61,7 @@ PROCINFO_CLASS const char _dl_powerpc_cap_flags[64][10] + "", "", "", "", + "", "", "", "", + "", "", "", "", +- "", "", "", "", ++ "", "", "", "htm-no-suspend", + "scv", "darn", "ieee128", "arch_3_00", + "htm-nosc", "vcrypto", "tar", "isel", + "ebb", "dscr", "htm", "arch_2_07", diff --git a/SOURCES/glibc-rh1516402-1.patch b/SOURCES/glibc-rh1516402-1.patch new file mode 100644 index 0000000..053ed03 --- /dev/null +++ b/SOURCES/glibc-rh1516402-1.patch @@ -0,0 +1,89 @@ +commit 87868c2418fb74357757e3b739ce5b76b17a8929 +Author: Adhemerval Zanella +Date: Wed Jun 25 11:54:31 2014 -0500 + + PowerPC: Align power7 memcpy using VSX to quadword + + This patch changes power7 memcpy to use VSX instructions only when + memory is aligned to quadword. It is to avoid unaligned kernel traps + on non-cacheable memory (for instance, memory-mapped I/O). + +diff --git a/sysdeps/powerpc/powerpc32/power7/memcpy.S b/sysdeps/powerpc/powerpc32/power7/memcpy.S +index 52c2a6b..e540fea 100644 +--- a/sysdeps/powerpc/powerpc32/power7/memcpy.S ++++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S +@@ -38,8 +38,8 @@ EALIGN (memcpy, 5, 0) + ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move + code. */ + +- andi. 11,3,7 /* Check alignment of DST. */ +- clrlwi 10,4,29 /* Check alignment of SRC. 
*/ + cmplw cr6,10,11 /* SRC and DST alignments match? */ + mr 12,4 + mr 31,5 +diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S +index bbfd381..58d9b12 100644 +--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S ++++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S +@@ -36,16 +36,11 @@ EALIGN (memcpy, 5, 0) + ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move + code. */ + +-#ifdef __LITTLE_ENDIAN__ +-/* In little-endian mode, power7 takes an alignment trap on any lxvd2x +- or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy +- loop is only used for quadword aligned copies. */ ++/* Align copies using VSX instructions to quadword. It is to avoid alignment ++ traps when memcpy is used on non-cacheable memory (for instance, memory ++ mapped I/O). */ + andi. 10,3,15 + clrldi 11,4,60 +-#else +- andi. 10,3,7 /* Check alignment of DST. */ +- clrldi 11,4,61 /* Check alignment of SRC. */ +-#endif + cmpld cr6,10,11 /* SRC and DST alignments match? */ + + mr dst,3 +@@ -53,13 +48,9 @@ EALIGN (memcpy, 5, 0) + beq L(aligned_copy) + + mtocrf 0x01,0 +-#ifdef __LITTLE_ENDIAN__ + clrldi 0,0,60 +-#else +- clrldi 0,0,61 +-#endif + +-/* Get the DST and SRC aligned to 8 bytes (16 for little-endian). */ ++/* Get the DST and SRC aligned to 16 bytes. */ + 1: + bf 31,2f + lbz 6,0(src) +@@ -79,14 +70,12 @@ EALIGN (memcpy, 5, 0) + stw 6,0(dst) + addi dst,dst,4 + 8: +-#ifdef __LITTLE_ENDIAN__ + bf 28,16f + ld 6,0(src) + addi src,src,8 + std 6,0(dst) + addi dst,dst,8 + 16: +-#endif + subf cnt,0,cnt + + /* Main aligned copy loop. Copies 128 bytes at a time. */ +@@ -298,9 +287,6 @@ L(copy_LE_8): + .align 4 + L(copy_GE_32_unaligned): + clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */ +-#ifndef __LITTLE_ENDIAN__ +- andi. 10,3,15 /* Check alignment of DST (against quadwords). */ +-#endif + srdi 9,cnt,4 /* Number of full quadwords remaining. 
*/ + + beq L(copy_GE_32_unaligned_cont) diff --git a/SOURCES/glibc-rh1516402-2.patch b/SOURCES/glibc-rh1516402-2.patch new file mode 100644 index 0000000..2455f56 --- /dev/null +++ b/SOURCES/glibc-rh1516402-2.patch @@ -0,0 +1,123 @@ +The memmove related fix is dropped in this patch because rhel-7.5 +does not have optimized memmove for POWER7. + +commit 63da5cd4a097d089033d980c42254c3356fa723f +Author: Rajalakshmi Srinivasaraghavan +Date: Wed Oct 25 13:13:53 2017 -0200 + + powerpc: Replace lxvd2x/stxvd2x with lvx/stvx in P7's memcpy/memmove + + POWER9 DD2.1 and earlier has an issue where some cache inhibited + vector load traps to the kernel, causing a performance degradation. To + handle this in memcpy and memmove, lvx/stvx is used for aligned + addresses instead of lxvd2x/stxvd2x. + + Reference: https://patchwork.ozlabs.org/patch/814059/ + + * sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace + lxvd2x/stxvd2x with lvx/stvx. + * sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise. + + Reviewed-by: Tulio Magno Quites Machado Filho + Reviewed-by: Adhemerval Zanella + +diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S +index 1ccbc2e..a7cdf8b 100644 +--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S ++++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S +@@ -91,63 +91,63 @@ L(aligned_copy): + srdi 12,cnt,7 + cmpdi 12,0 + beq L(aligned_tail) +- lxvd2x 6,0,src +- lxvd2x 7,src,6 ++ lvx 6,0,src ++ lvx 7,src,6 + mtctr 12 + b L(aligned_128loop) + + .align 4 + L(aligned_128head): + /* for the 2nd + iteration of this loop. 
*/ +- lxvd2x 6,0,src +- lxvd2x 7,src,6 ++ lvx 6,0,src ++ lvx 7,src,6 + L(aligned_128loop): +- lxvd2x 8,src,7 +- lxvd2x 9,src,8 +- stxvd2x 6,0,dst ++ lvx 8,src,7 ++ lvx 9,src,8 ++ stvx 6,0,dst + addi src,src,64 +- stxvd2x 7,dst,6 +- stxvd2x 8,dst,7 +- stxvd2x 9,dst,8 +- lxvd2x 6,0,src +- lxvd2x 7,src,6 ++ stvx 7,dst,6 ++ stvx 8,dst,7 ++ stvx 9,dst,8 ++ lvx 6,0,src ++ lvx 7,src,6 + addi dst,dst,64 +- lxvd2x 8,src,7 +- lxvd2x 9,src,8 ++ lvx 8,src,7 ++ lvx 9,src,8 + addi src,src,64 +- stxvd2x 6,0,dst +- stxvd2x 7,dst,6 +- stxvd2x 8,dst,7 +- stxvd2x 9,dst,8 ++ stvx 6,0,dst ++ stvx 7,dst,6 ++ stvx 8,dst,7 ++ stvx 9,dst,8 + addi dst,dst,64 + bdnz L(aligned_128head) + + L(aligned_tail): + mtocrf 0x01,cnt + bf 25,32f +- lxvd2x 6,0,src +- lxvd2x 7,src,6 +- lxvd2x 8,src,7 +- lxvd2x 9,src,8 ++ lvx 6,0,src ++ lvx 7,src,6 ++ lvx 8,src,7 ++ lvx 9,src,8 + addi src,src,64 +- stxvd2x 6,0,dst +- stxvd2x 7,dst,6 +- stxvd2x 8,dst,7 +- stxvd2x 9,dst,8 ++ stvx 6,0,dst ++ stvx 7,dst,6 ++ stvx 8,dst,7 ++ stvx 9,dst,8 + addi dst,dst,64 + 32: + bf 26,16f +- lxvd2x 6,0,src +- lxvd2x 7,src,6 ++ lvx 6,0,src ++ lvx 7,src,6 + addi src,src,32 +- stxvd2x 6,0,dst +- stxvd2x 7,dst,6 ++ stvx 6,0,dst ++ stvx 7,dst,6 + addi dst,dst,32 + 16: + bf 27,8f +- lxvd2x 6,0,src ++ lvx 6,0,src + addi src,src,16 +- stxvd2x 6,0,dst ++ stvx 6,0,dst + addi dst,dst,16 + 8: + bf 28,4f diff --git a/SPECS/glibc.spec b/SPECS/glibc.spec index 1d01f86..0ff2079 100644 --- a/SPECS/glibc.spec +++ b/SPECS/glibc.spec @@ -1,6 +1,6 @@ %define glibcsrcdir glibc-2.17-c758a686 %define glibcversion 2.17 -%define glibcrelease 196%{?dist} +%define glibcrelease 196%{?dist}.2 ############################################################################## # We support the following options: # --with/--without, @@ -1099,6 +1099,17 @@ Patch1858: glibc-rh1457177-2.patch Patch1859: glibc-rh1457177-3.patch Patch1860: glibc-rh1457177-4.patch +Patch1861: glibc-rh1504969.patch + +# RHBZ #1515114: Pegas1.0 - Update HWCAP bits for POWER9 DD2.1 
+Patch1862: glibc-rh1515114-1.patch +Patch1863: glibc-rh1515114-2.patch +Patch1864: glibc-rh1515114-3.patch + +# RHBZ #1516402: Pegas1.0 - Workaround performance regressions on VSX loads on POWER9 DD2.1 +Patch1865: glibc-rh1516402-1.patch +Patch1866: glibc-rh1516402-2.patch + ############################################################################## # # Patches submitted, but not yet approved upstream. @@ -2132,6 +2143,12 @@ cp %{_sourcedir}/syscall-names.list sysdeps/unix/sysv/linux/ %patch1858 -p1 %patch1859 -p1 %patch1860 -p1 +%patch1861 -p1 +%patch1862 -p1 +%patch1863 -p1 +%patch1864 -p1 +%patch1865 -p1 +%patch1866 -p1 ############################################################################## # %%prep - Additional prep required... @@ -3282,6 +3299,13 @@ rm -f *.filelist* %endif %changelog +* Wed Nov 22 2017 Carlos O'Donell - 2.17-196.2 +- Update HWCAP bits for IBM POWER9 DD2.1 (#1515114) +- Improve memcpy performance for POWER9 DD2.1 (#1516402) + +* Tue Nov 14 2017 Carlos O'Donell - 2.17-196.1 +- x86-64: Use XSAVE/XSAVEC in the ld.so trampoline (#1513070) + * Fri Jun 16 2017 Florian Weimer - 2.17-196 - Avoid large allocas in the dynamic linker (#1452721)