We add back Prefer_SSE_for_memop since we still need it for all of the existing era implementations for RHEL 7.3. To remove it would require a more wholesale backport of optimized routines. commit e2e4f56056adddc3c1efe676b40a4b4f2453103b Author: H.J. Lu Date: Thu Aug 13 03:37:47 2015 -0700 Add _dl_x86_cpu_features to rtld_global This patch adds _dl_x86_cpu_features to rtld_global in x86 ld.so and initializes it early before __libc_start_main is called so that cpu_features is always available when it is used and we can avoid calling __init_cpu_features in IFUNC selectors. Index: glibc-2.17-c758a686/sysdeps/i386/dl-machine.h =================================================================== --- glibc-2.17-c758a686.orig/sysdeps/i386/dl-machine.h +++ glibc-2.17-c758a686/sysdeps/i386/dl-machine.h @@ -25,6 +25,7 @@ #include #include #include +#include /* Return nonzero iff ELF header is compatible with the running host. */ static inline int __attribute__ ((unused)) @@ -266,6 +267,8 @@ dl_platform_init (void) if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0') /* Avoid an empty string which would disturb us. 
*/ GLRO(dl_platform) = NULL; + + init_cpu_features (&GLRO(dl_x86_cpu_features)); } static inline Elf32_Addr Index: glibc-2.17-c758a686/sysdeps/i386/dl-procinfo.c =================================================================== --- glibc-2.17-c758a686.orig/sysdeps/i386/dl-procinfo.c +++ glibc-2.17-c758a686/sysdeps/i386/dl-procinfo.c @@ -43,6 +43,22 @@ # define PROCINFO_CLASS #endif +#if !IS_IN (ldconfig) +# if !defined PROCINFO_DECL && defined SHARED + ._dl_x86_cpu_features +# else +PROCINFO_CLASS struct cpu_features _dl_x86_cpu_features +# endif +# ifndef PROCINFO_DECL += { } +# endif +# if !defined SHARED || defined PROCINFO_DECL +; +# else +, +# endif +#endif + #if !defined PROCINFO_DECL && defined SHARED ._dl_x86_cap_flags #else Index: glibc-2.17-c758a686/sysdeps/i386/i686/cacheinfo.c =================================================================== --- glibc-2.17-c758a686.orig/sysdeps/i386/i686/cacheinfo.c +++ glibc-2.17-c758a686/sysdeps/i386/i686/cacheinfo.c @@ -8,6 +8,5 @@ #define __x86_64_raw_shared_cache_size_half __x86_raw_shared_cache_size_half #define DISABLE_PREFETCHW -#define DISABLE_PREFERRED_MEMORY_INSTRUCTION #include Index: glibc-2.17-c758a686/sysdeps/i386/i686/multiarch/Makefile =================================================================== --- glibc-2.17-c758a686.orig/sysdeps/i386/i686/multiarch/Makefile +++ glibc-2.17-c758a686/sysdeps/i386/i686/multiarch/Makefile @@ -1,5 +1,4 @@ ifeq ($(subdir),csu) -aux += init-arch tests += test-multiarch gen-as-const-headers += ifunc-defines.sym endif Index: glibc-2.17-c758a686/sysdeps/i386/i686/multiarch/Versions =================================================================== --- glibc-2.17-c758a686.orig/sysdeps/i386/i686/multiarch/Versions +++ /dev/null @@ -1,5 +0,0 @@ -libc { - GLIBC_PRIVATE { - __get_cpu_features; - } -} Index: glibc-2.17-c758a686/sysdeps/i386/i686/multiarch/ifunc-defines.sym =================================================================== --- 
glibc-2.17-c758a686.orig/sysdeps/i386/i686/multiarch/ifunc-defines.sym +++ glibc-2.17-c758a686/sysdeps/i386/i686/multiarch/ifunc-defines.sym @@ -4,7 +4,6 @@ -- CPU_FEATURES_SIZE sizeof (struct cpu_features) -KIND_OFFSET offsetof (struct cpu_features, kind) CPUID_OFFSET offsetof (struct cpu_features, cpuid) CPUID_SIZE sizeof (struct cpuid_registers) CPUID_EAX_OFFSET offsetof (struct cpuid_registers, eax) Index: glibc-2.17-c758a686/sysdeps/i386/ldsodefs.h =================================================================== --- glibc-2.17-c758a686.orig/sysdeps/i386/ldsodefs.h +++ glibc-2.17-c758a686/sysdeps/i386/ldsodefs.h @@ -20,6 +20,7 @@ #define _I386_LDSODEFS_H 1 #include +#include struct La_i86_regs; struct La_i86_retval; Index: glibc-2.17-c758a686/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c =================================================================== --- glibc-2.17-c758a686.orig/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c +++ glibc-2.17-c758a686/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c @@ -1,5 +1,5 @@ #if IS_IN (ldconfig) # include #else -# include +# include #endif Index: glibc-2.17-c758a686/sysdeps/x86/Makefile =================================================================== --- glibc-2.17-c758a686.orig/sysdeps/x86/Makefile +++ glibc-2.17-c758a686/sysdeps/x86/Makefile @@ -7,3 +7,14 @@ $(objpfx)tst-xmmymmzmm.out: ../sysdeps/x @echo "Checking ld.so for SSE register use. This will take a few seconds..." 
$(SHELL) $< $(objpfx) '$(NM)' '$(OBJDUMP)' '$(READELF)' > $@ endif + +ifeq ($(subdir),csu) +gen-as-const-headers += cpu-features-offsets.sym rtld-global-offsets.sym +endif + +ifeq ($(subdir),elf) +sysdep-dl-routines += dl-get-cpu-features + +tests += tst-get-cpu-features +tests-static += tst-get-cpu-features-static +endif Index: glibc-2.17-c758a686/sysdeps/x86/Versions =================================================================== --- /dev/null +++ glibc-2.17-c758a686/sysdeps/x86/Versions @@ -0,0 +1,5 @@ +ld { + GLIBC_PRIVATE { + __get_cpu_features; + } +} Index: glibc-2.17-c758a686/sysdeps/x86/cpu-features-offsets.sym =================================================================== --- /dev/null +++ glibc-2.17-c758a686/sysdeps/x86/cpu-features-offsets.sym @@ -0,0 +1,7 @@ +#define SHARED 1 + +#include + +#define rtld_global_ro_offsetof(mem) offsetof (struct rtld_global_ro, mem) + +RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET rtld_global_ro_offsetof (_dl_x86_cpu_features) Index: glibc-2.17-c758a686/sysdeps/x86/cpu-features.c =================================================================== --- /dev/null +++ glibc-2.17-c758a686/sysdeps/x86/cpu-features.c @@ -0,0 +1,213 @@ +/* Initialize CPU feature data. + This file is part of the GNU C Library. + Copyright (C) 2008-2015 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#include +#include + +static inline void +get_common_indeces (struct cpu_features *cpu_features, + unsigned int *family, unsigned int *model) +{ + unsigned int eax; + __cpuid (1, eax, cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx, + cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx, + cpu_features->cpuid[COMMON_CPUID_INDEX_1].edx); + cpu_features->cpuid[COMMON_CPUID_INDEX_1].eax = eax; + *family = (eax >> 8) & 0x0f; + *model = (eax >> 4) & 0x0f; +} + +static inline void +init_cpu_features (struct cpu_features *cpu_features) +{ + unsigned int ebx, ecx, edx; + unsigned int family = 0; + unsigned int model = 0; + enum cpu_features_kind kind; + + __cpuid (0, cpu_features->max_cpuid, ebx, ecx, edx); + + /* This spells out "GenuineIntel". */ + if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69) + { + kind = arch_kind_intel; + + get_common_indeces (cpu_features, &family, &model); + + /* Intel processors prefer SSE instruction for memory/string + routines if they are available. */ + cpu_features->feature[index_Prefer_SSE_for_memop] + |= bit_Prefer_SSE_for_memop; + + unsigned int eax = cpu_features->cpuid[COMMON_CPUID_INDEX_1].eax; + unsigned int extended_family = (eax >> 20) & 0xff; + unsigned int extended_model = (eax >> 12) & 0xf0; + if (family == 0x0f) + { + family += extended_family; + model += extended_model; + } + else if (family == 0x06) + { + ecx = cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx; + model += extended_model; + switch (model) + { + case 0x1c: + case 0x26: + /* BSF is slow on Atom. */ + cpu_features->feature[index_Slow_BSF] |= bit_Slow_BSF; + break; + + case 0x37: + case 0x4a: + case 0x4d: + case 0x5a: + case 0x5d: + /* Unaligned load versions are faster than SSSE3 + on Silvermont. 
*/ +#if index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop +# error index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop +#endif +#if index_Fast_Unaligned_Load != index_Slow_SSE4_2 +# error index_Fast_Unaligned_Load != index_Slow_SSE4_2 +#endif + cpu_features->feature[index_Fast_Unaligned_Load] + |= (bit_Fast_Unaligned_Load + | bit_Prefer_PMINUB_for_stringop + | bit_Slow_SSE4_2); + break; + + default: + /* Unknown family 0x06 processors. Assuming this is one + of Core i3/i5/i7 processors if AVX is available. */ + if ((ecx & bit_AVX) == 0) + break; + + case 0x1a: + case 0x1e: + case 0x1f: + case 0x25: + case 0x2c: + case 0x2e: + case 0x2f: + /* Rep string instructions, copy backward, unaligned loads + and pminub are fast on Intel Core i3, i5 and i7. */ +#if index_Fast_Rep_String != index_Fast_Copy_Backward +# error index_Fast_Rep_String != index_Fast_Copy_Backward +#endif +#if index_Fast_Rep_String != index_Fast_Unaligned_Load +# error index_Fast_Rep_String != index_Fast_Unaligned_Load +#endif +#if index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop +# error index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop +#endif + cpu_features->feature[index_Fast_Rep_String] + |= (bit_Fast_Rep_String + | bit_Fast_Copy_Backward + | bit_Fast_Unaligned_Load + | bit_Prefer_PMINUB_for_stringop); + break; + } + } + } + /* This spells out "AuthenticAMD". */ + else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) + { + kind = arch_kind_amd; + + get_common_indeces (cpu_features, &family, &model); + + ecx = cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx; + + /* AMD processors prefer SSE instructions for memory/string routines + if they are available, otherwise they prefer integer instructions. 
*/ + if ((ecx & 0x200)) + cpu_features->feature[index_Prefer_SSE_for_memop] + |= bit_Prefer_SSE_for_memop; + + unsigned int eax; + __cpuid (0x80000000, eax, ebx, ecx, edx); + if (eax >= 0x80000001) + __cpuid (0x80000001, + cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].eax, + cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].ebx, + cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].ecx, + cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].edx); + } + else + kind = arch_kind_other; + + if (cpu_features->max_cpuid >= 7) + __cpuid_count (7, 0, + cpu_features->cpuid[COMMON_CPUID_INDEX_7].eax, + cpu_features->cpuid[COMMON_CPUID_INDEX_7].ebx, + cpu_features->cpuid[COMMON_CPUID_INDEX_7].ecx, + cpu_features->cpuid[COMMON_CPUID_INDEX_7].edx); + + /* Can we call xgetbv? */ + if (HAS_CPU_FEATURE (OSXSAVE)) + { + unsigned int xcrlow; + unsigned int xcrhigh; + asm ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0)); + /* Is YMM and XMM state usable? */ + if ((xcrlow & (bit_YMM_state | bit_XMM_state)) == + (bit_YMM_state | bit_XMM_state)) + { + /* Determine if AVX is usable. */ + if (HAS_CPU_FEATURE (AVX)) + cpu_features->feature[index_AVX_Usable] |= bit_AVX_Usable; +#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load +# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load +#endif + /* Determine if AVX2 is usable. Unaligned load with 256-bit + AVX registers are faster on processors with AVX2. */ + if (HAS_CPU_FEATURE (AVX2)) + cpu_features->feature[index_AVX2_Usable] + |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load; + /* Check if OPMASK state, upper 256-bit of ZMM0-ZMM15 and + ZMM16-ZMM31 state are enabled. */ + if ((xcrlow & (bit_Opmask_state | bit_ZMM0_15_state + | bit_ZMM16_31_state)) == + (bit_Opmask_state | bit_ZMM0_15_state | bit_ZMM16_31_state)) + { + /* Determine if AVX512F is usable. */ + if (HAS_CPU_FEATURE (AVX512F)) + { + cpu_features->feature[index_AVX512F_Usable] + |= bit_AVX512F_Usable; + /* Determine if AVX512DQ is usable. 
*/ + if (HAS_CPU_FEATURE (AVX512DQ)) + cpu_features->feature[index_AVX512DQ_Usable] + |= bit_AVX512DQ_Usable; + } + } + /* Determine if FMA is usable. */ + if (HAS_CPU_FEATURE (FMA)) + cpu_features->feature[index_FMA_Usable] |= bit_FMA_Usable; + /* Determine if FMA4 is usable. */ + if (HAS_CPU_FEATURE (FMA4)) + cpu_features->feature[index_FMA4_Usable] |= bit_FMA4_Usable; + } + } + + cpu_features->family = family; + cpu_features->model = model; + cpu_features->kind = kind; +} Index: glibc-2.17-c758a686/sysdeps/x86/cpu-features.h =================================================================== --- /dev/null +++ glibc-2.17-c758a686/sysdeps/x86/cpu-features.h @@ -0,0 +1,273 @@ +/* This file is part of the GNU C Library. + Copyright (C) 2008-2015 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#ifndef cpu_features_h +#define cpu_features_h + +#define bit_Fast_Rep_String (1 << 0) +#define bit_Fast_Copy_Backward (1 << 1) +#define bit_Slow_BSF (1 << 2) +#define bit_Prefer_SSE_for_memop (1 << 3) +#define bit_Fast_Unaligned_Load (1 << 4) +#define bit_Prefer_PMINUB_for_stringop (1 << 5) +#define bit_AVX_Usable (1 << 6) +#define bit_FMA_Usable (1 << 7) +#define bit_FMA4_Usable (1 << 8) +#define bit_Slow_SSE4_2 (1 << 9) +#define bit_AVX2_Usable (1 << 10) +#define bit_AVX_Fast_Unaligned_Load (1 << 11) +#define bit_AVX512F_Usable (1 << 12) +#define bit_AVX512DQ_Usable (1 << 13) + +/* CPUID Feature flags. */ + +/* COMMON_CPUID_INDEX_1. */ +#define bit_SSE2 (1 << 26) +#define bit_SSSE3 (1 << 9) +#define bit_SSE4_1 (1 << 19) +#define bit_SSE4_2 (1 << 20) +#define bit_OSXSAVE (1 << 27) +#define bit_AVX (1 << 28) +#define bit_POPCOUNT (1 << 23) +#define bit_FMA (1 << 12) +#define bit_FMA4 (1 << 16) + +/* COMMON_CPUID_INDEX_7. */ +#define bit_RTM (1 << 11) +#define bit_AVX2 (1 << 5) +#define bit_AVX512F (1 << 16) +#define bit_AVX512DQ (1 << 17) + +/* XCR0 Feature flags. */ +#define bit_XMM_state (1 << 1) +#define bit_YMM_state (2 << 1) +#define bit_Opmask_state (1 << 5) +#define bit_ZMM0_15_state (1 << 6) +#define bit_ZMM16_31_state (1 << 7) + +/* The integer bit array index for the first set of internal feature bits. */ +#define FEATURE_INDEX_1 0 + +/* The current maximum size of the feature integer bit array. 
*/ +#define FEATURE_INDEX_MAX 1 + +#ifdef __ASSEMBLER__ + +# include +# include + +# define index_SSE2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET +# define index_SSSE3 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET +# define index_SSE4_1 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET +# define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET +# define index_AVX COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET +# define index_AVX2 COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET + +# define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE +# define index_Fast_Copy_Backward FEATURE_INDEX_1*FEATURE_SIZE +# define index_Slow_BSF FEATURE_INDEX_1*FEATURE_SIZE +# define index_Prefer_SSE_for_memop FEATURE_INDEX_1*FEATURE_SIZE +# define index_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE +# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1*FEATURE_SIZE +# define index_AVX_Usable FEATURE_INDEX_1*FEATURE_SIZE +# define index_FMA_Usable FEATURE_INDEX_1*FEATURE_SIZE +# define index_FMA4_Usable FEATURE_INDEX_1*FEATURE_SIZE +# define index_Slow_SSE4_2 FEATURE_INDEX_1*FEATURE_SIZE +# define index_AVX2_Usable FEATURE_INDEX_1*FEATURE_SIZE +# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE +# define index_AVX512F_Usable FEATURE_INDEX_1*FEATURE_SIZE +# define index_AVX512DQ_Usable FEATURE_INDEX_1*FEATURE_SIZE + +# if defined (_LIBC) && !IS_IN (nonlib) +# ifdef __x86_64__ +# ifdef SHARED +# if IS_IN (rtld) +# define LOAD_RTLD_GLOBAL_RO_RDX +# define HAS_FEATURE(offset, name) \ + testl $(bit_##name), _rtld_local_ro+offset+(index_##name)(%rip) +# else +# define LOAD_RTLD_GLOBAL_RO_RDX \ + mov _rtld_global_ro@GOTPCREL(%rip), %RDX_LP +# define HAS_FEATURE(offset, name) \ + testl $(bit_##name), \ + RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+offset+(index_##name)(%rdx) +# endif +# else /* SHARED */ +# define LOAD_RTLD_GLOBAL_RO_RDX +# define HAS_FEATURE(offset, name) \ + testl $(bit_##name), _dl_x86_cpu_features+offset+(index_##name)(%rip) 
+# endif /* !SHARED */ +# else /* __x86_64__ */ +# ifdef SHARED +# define LOAD_FUNC_GOT_EAX(func) \ + leal func@GOTOFF(%edx), %eax +# if IS_IN (rtld) +# define LOAD_GOT_AND_RTLD_GLOBAL_RO \ + LOAD_PIC_REG(dx) +# define HAS_FEATURE(offset, name) \ + testl $(bit_##name), offset+(index_##name)+_rtld_local_ro@GOTOFF(%edx) +# else +# define LOAD_GOT_AND_RTLD_GLOBAL_RO \ + LOAD_PIC_REG(dx); \ + mov _rtld_global_ro@GOT(%edx), %ecx +# define HAS_FEATURE(offset, name) \ + testl $(bit_##name), \ + RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+offset+(index_##name)(%ecx) +# endif +# else /* SHARED */ +# define LOAD_FUNC_GOT_EAX(func) \ + leal func, %eax +# define LOAD_GOT_AND_RTLD_GLOBAL_RO +# define HAS_FEATURE(offset, name) \ + testl $(bit_##name), _dl_x86_cpu_features+offset+(index_##name) +# endif /* !SHARED */ +# endif /* !__x86_64__ */ +# else /* _LIBC && !nonlib */ +# error "Sorry, is unimplemented for assembler" +# endif /* !_LIBC || nonlib */ + +/* HAS_* evaluates to true if we may use the feature at runtime. */ +# define HAS_CPU_FEATURE(name) HAS_FEATURE (CPUID_OFFSET, name) +# define HAS_ARCH_FEATURE(name) HAS_FEATURE (FEATURE_OFFSET, name) + +#else /* __ASSEMBLER__ */ + +# include +# include +# include +# include + +/* Ugly hack to make it possible to select a strstr and strcasestr + implementation that avoids using the stack for 16-byte aligned + SSE temporaries. Doing so makes it possible to call the functions + with a stack that's not 16-byte aligned as can happen, for example, + as a result of compiling the functions' callers with the GCC + -mpreferred-stack-boundary=2 or =3 option, or with the ICC + -falign-stack=assume-4-byte option. See rhbz 1150282 for details. + + The ifunc selector uses the unaligned version by default if this + file exists and is accessible. 
*/ +# define ENABLE_STRSTR_UNALIGNED_PATHNAME \ + "/etc/sysconfig/64bit_strstr_via_64bit_strstr_sse2_unaligned" + +static bool __attribute__ ((unused)) +use_unaligned_strstr (void) +{ + struct stat unaligned_strstr_etc_sysconfig_file; + + /* TLS may not have been set up yet, so avoid using stat since it tries to + set errno. */ + return INTERNAL_SYSCALL (stat, , 2, + ENABLE_STRSTR_UNALIGNED_PATHNAME, + &unaligned_strstr_etc_sysconfig_file) == 0; +} + +enum + { + COMMON_CPUID_INDEX_1 = 0, + COMMON_CPUID_INDEX_7, + COMMON_CPUID_INDEX_80000001, /* for AMD */ + /* Keep the following line at the end. */ + COMMON_CPUID_INDEX_MAX + }; + +struct cpu_features +{ + enum cpu_features_kind + { + arch_kind_unknown = 0, + arch_kind_intel, + arch_kind_amd, + arch_kind_other + } kind; + int max_cpuid; + struct cpuid_registers + { + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + } cpuid[COMMON_CPUID_INDEX_MAX]; + unsigned int family; + unsigned int model; + unsigned int feature[FEATURE_INDEX_MAX]; +}; + +/* Used from outside of glibc to get access to the CPU features + structure. */ +extern const struct cpu_features *__get_cpu_features (void) + __attribute__ ((const)); + +# if defined (_LIBC) && !IS_IN (nonlib) +/* Unused for x86. */ +# define INIT_ARCH() +# define __get_cpu_features() (&GLRO(dl_x86_cpu_features)) +# endif + + +/* HAS_* evaluates to true if we may use the feature at runtime. 
*/ +# define HAS_CPU_FEATURE(name) \ + ((__get_cpu_features ()->cpuid[index_##name].reg_##name & (bit_##name)) != 0) +# define HAS_ARCH_FEATURE(name) \ + ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0) + +# define index_SSE2 COMMON_CPUID_INDEX_1 +# define index_SSSE3 COMMON_CPUID_INDEX_1 +# define index_SSE4_1 COMMON_CPUID_INDEX_1 +# define index_SSE4_2 COMMON_CPUID_INDEX_1 +# define index_AVX COMMON_CPUID_INDEX_1 +# define index_AVX2 COMMON_CPUID_INDEX_7 +# define index_AVX512F COMMON_CPUID_INDEX_7 +# define index_AVX512DQ COMMON_CPUID_INDEX_7 +# define index_RTM COMMON_CPUID_INDEX_7 +# define index_FMA COMMON_CPUID_INDEX_1 +# define index_FMA4 COMMON_CPUID_INDEX_80000001 +# define index_POPCOUNT COMMON_CPUID_INDEX_1 +# define index_OSXSAVE COMMON_CPUID_INDEX_1 + +# define reg_SSE2 edx +# define reg_SSSE3 ecx +# define reg_SSE4_1 ecx +# define reg_SSE4_2 ecx +# define reg_AVX ecx +# define reg_AVX2 ebx +# define reg_AVX512F ebx +# define reg_AVX512DQ ebx +# define reg_RTM ebx +# define reg_FMA ecx +# define reg_FMA4 ecx +# define reg_POPCOUNT ecx +# define reg_OSXSAVE ecx + +# define index_Fast_Rep_String FEATURE_INDEX_1 +# define index_Fast_Copy_Backward FEATURE_INDEX_1 +# define index_Slow_BSF FEATURE_INDEX_1 +# define index_Prefer_SSE_for_memop FEATURE_INDEX_1 +# define index_Fast_Unaligned_Load FEATURE_INDEX_1 +# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1 +# define index_AVX_Usable FEATURE_INDEX_1 +# define index_FMA_Usable FEATURE_INDEX_1 +# define index_FMA4_Usable FEATURE_INDEX_1 +# define index_Slow_SSE4_2 FEATURE_INDEX_1 +# define index_AVX2_Usable FEATURE_INDEX_1 +# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1 +# define index_AVX512F_Usable FEATURE_INDEX_1 +# define index_AVX512DQ_Usable FEATURE_INDEX_1 + +#endif /* !__ASSEMBLER__ */ + +#endif /* cpu_features_h */ Index: glibc-2.17-c758a686/sysdeps/x86/dl-get-cpu-features.c =================================================================== --- /dev/null +++ 
glibc-2.17-c758a686/sysdeps/x86/dl-get-cpu-features.c @@ -0,0 +1,27 @@ +/* This file is part of the GNU C Library. + Copyright (C) 2015 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + + +#include + +#undef __get_cpu_features + +const struct cpu_features * +__get_cpu_features (void) +{ + return &GLRO(dl_x86_cpu_features); +} Index: glibc-2.17-c758a686/sysdeps/x86/libc-start.c =================================================================== --- /dev/null +++ glibc-2.17-c758a686/sysdeps/x86/libc-start.c @@ -0,0 +1,41 @@ +/* Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifdef SHARED +# include +# else +/* The main work is done in the generic function. 
*/ +# define LIBC_START_DISABLE_INLINE +# define LIBC_START_MAIN generic_start_main +# include +# include +# include + +extern struct cpu_features _dl_x86_cpu_features; + +int +__libc_start_main (int (*main) (int, char **, char ** MAIN_AUXVEC_DECL), + int argc, char **argv, + __typeof (main) init, + void (*fini) (void), + void (*rtld_fini) (void), void *stack_end) +{ + init_cpu_features (&_dl_x86_cpu_features); + return generic_start_main (main, argc, argv, init, fini, rtld_fini, + stack_end); +} +#endif Index: glibc-2.17-c758a686/sysdeps/x86/rtld-global-offsets.sym =================================================================== --- /dev/null +++ glibc-2.17-c758a686/sysdeps/x86/rtld-global-offsets.sym @@ -0,0 +1,7 @@ +#define SHARED 1 + +#include + +#define rtld_global_ro_offsetof(mem) offsetof (struct rtld_global_ro, mem) + +RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET rtld_global_ro_offsetof (_dl_x86_cpu_features) Index: glibc-2.17-c758a686/sysdeps/x86/tst-get-cpu-features-static.c =================================================================== --- /dev/null +++ glibc-2.17-c758a686/sysdeps/x86/tst-get-cpu-features-static.c @@ -0,0 +1 @@ +#include "tst-get-cpu-features.c" Index: glibc-2.17-c758a686/sysdeps/x86/tst-get-cpu-features.c =================================================================== --- /dev/null +++ glibc-2.17-c758a686/sysdeps/x86/tst-get-cpu-features.c @@ -0,0 +1,31 @@ +/* Test case for x86 __get_cpu_features interface + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include + +static int +do_test (void) +{ + if (__get_cpu_features ()->kind == arch_kind_unknown) + abort (); + return 0; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" Index: glibc-2.17-c758a686/sysdeps/x86_64/cacheinfo.c =================================================================== --- glibc-2.17-c758a686.orig/sysdeps/x86_64/cacheinfo.c +++ glibc-2.17-c758a686/sysdeps/x86_64/cacheinfo.c @@ -21,40 +21,11 @@ #include #include #include +#include "multiarch/init-arch.h" -#ifndef __cpuid_count -/* FIXME: Provide __cpuid_count if it isn't defined. Copied from gcc - 4.4.0. Remove this if gcc 4.4 is the minimum requirement. */ -# if defined(__i386__) && defined(__PIC__) -/* %ebx may be the PIC register. */ -# define __cpuid_count(level, count, a, b, c, d) \ - __asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \ - "cpuid\n\t" \ - "xchg{l}\t{%%}ebx, %1\n\t" \ - : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ - : "0" (level), "2" (count)) -# else -# define __cpuid_count(level, count, a, b, c, d) \ - __asm__ ("cpuid\n\t" \ - : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ - : "0" (level), "2" (count)) -# endif -#endif - -#ifdef USE_MULTIARCH -# include "multiarch/init-arch.h" - -# define is_intel __cpu_features.kind == arch_kind_intel -# define is_amd __cpu_features.kind == arch_kind_amd -# define max_cpuid __cpu_features.max_cpuid -#else - /* This spells out "GenuineIntel". */ -# define is_intel \ - ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69 - /* This spells out "AuthenticAMD". 
*/ -# define is_amd \ - ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65 -#endif +#define is_intel GLRO(dl_x86_cpu_features).kind == arch_kind_intel +#define is_amd GLRO(dl_x86_cpu_features).kind == arch_kind_amd +#define max_cpuid GLRO(dl_x86_cpu_features).max_cpuid static const struct intel_02_cache_info { @@ -237,21 +208,8 @@ intel_check_word (int name, unsigned int /* Intel reused this value. For family 15, model 6 it specifies the 3rd level cache. Otherwise the 2nd level cache. */ - unsigned int family; - unsigned int model; -#ifdef USE_MULTIARCH - family = __cpu_features.family; - model = __cpu_features.model; -#else - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - __cpuid (1, eax, ebx, ecx, edx); - - family = ((eax >> 20) & 0xff) + ((eax >> 8) & 0xf); - model = (((eax >>16) & 0xf) << 4) + ((eax >> 4) & 0xf); -#endif + unsigned int family = GLRO(dl_x86_cpu_features).family; + unsigned int model = GLRO(dl_x86_cpu_features).model; if (family == 15 && model == 6) { @@ -478,18 +436,6 @@ long int attribute_hidden __cache_sysconf (int name) { -#ifdef USE_MULTIARCH - if (__cpu_features.kind == arch_kind_unknown) - __init_cpu_features (); -#else - /* Find out what brand of processor. */ - unsigned int max_cpuid; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - __cpuid (0, max_cpuid, ebx, ecx, edx); -#endif - if (is_intel) return handle_intel (name, max_cpuid); @@ -525,18 +471,6 @@ long int __x86_64_raw_shared_cache_size int __x86_64_prefetchw attribute_hidden; #endif -#ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION -/* Instructions preferred for memory and string routines. 
- - 0: Regular instructions - 1: MMX instructions - 2: SSE2 instructions - 3: SSSE3 instructions - - */ -int __x86_64_preferred_memory_instruction attribute_hidden; -#endif - static void __attribute__((constructor)) @@ -553,14 +487,6 @@ init_cacheinfo (void) unsigned int level; unsigned int threads = 0; -#ifdef USE_MULTIARCH - if (__cpu_features.kind == arch_kind_unknown) - __init_cpu_features (); -#else - int max_cpuid; - __cpuid (0, max_cpuid, ebx, ecx, edx); -#endif - if (is_intel) { data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid); @@ -576,34 +502,13 @@ init_cacheinfo (void) shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid); } - unsigned int ebx_1; - -#ifdef USE_MULTIARCH - eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax; - ebx_1 = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx; - ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx; - edx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx; -#else - __cpuid (1, eax, ebx_1, ecx, edx); -#endif - - unsigned int family = (eax >> 8) & 0x0f; - unsigned int model = (eax >> 4) & 0x0f; - unsigned int extended_model = (eax >> 12) & 0xf0; - -#ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION - /* Intel prefers SSSE3 instructions for memory/string routines - if they are available. */ - if ((ecx & 0x200)) - __x86_64_preferred_memory_instruction = 3; - else - __x86_64_preferred_memory_instruction = 2; -#endif - /* Figure out the number of logical threads that share the highest cache level. */ if (max_cpuid >= 4) { + unsigned int family = GLRO(dl_x86_cpu_features).family; + unsigned int model = GLRO(dl_x86_cpu_features).model; + int i = 0; /* Query until desired cache level is enumerated. */ @@ -655,7 +560,6 @@ init_cacheinfo (void) threads += 1; if (threads > 2 && level == 2 && family == 6) { - model += extended_model; switch (model) { case 0x57: @@ -678,7 +582,9 @@ init_cacheinfo (void) intel_bug_no_cache_info: /* Assume that all logical threads share the highest cache level. 
*/ - threads = (ebx_1 >> 16) & 0xff; + threads + = ((GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].ebx + >> 16) & 0xff); } /* Cap usage of highest cache level to the number of supported @@ -693,25 +599,6 @@ init_cacheinfo (void) long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE); shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); -#ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION -# ifdef USE_MULTIARCH - eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax; - ebx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx; - ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx; - edx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx; -# else - __cpuid (1, eax, ebx, ecx, edx); -# endif - - /* AMD prefers SSSE3 instructions for memory/string routines - if they are avaiable, otherwise it prefers integer - instructions. */ - if ((ecx & 0x200)) - __x86_64_preferred_memory_instruction = 3; - else - __x86_64_preferred_memory_instruction = 0; -#endif - /* Get maximum extended function. */ __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx); Index: glibc-2.17-c758a686/sysdeps/x86_64/dl-machine.h =================================================================== --- glibc-2.17-c758a686.orig/sysdeps/x86_64/dl-machine.h +++ glibc-2.17-c758a686/sysdeps/x86_64/dl-machine.h @@ -26,6 +26,7 @@ #include #include #include +#include /* Return nonzero iff ELF header is compatible with the running host. */ static inline int __attribute__ ((unused)) @@ -200,6 +201,8 @@ dl_platform_init (void) if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0') /* Avoid an empty string which would disturb us. */ GLRO(dl_platform) = NULL; + + init_cpu_features (&GLRO(dl_x86_cpu_features)); } static inline ElfW(Addr) Index: glibc-2.17-c758a686/sysdeps/x86_64/dl-procinfo.c =================================================================== --- /dev/null +++ glibc-2.17-c758a686/sysdeps/x86_64/dl-procinfo.c @@ -0,0 +1,57 @@ +/* Data for x86-64 version of processor capability information. 
+ Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* If anything should be added here check whether the size of each string + is still ok with the given array size. + + All the #ifdefs in the definitions are quite irritating but + necessary if we want to avoid duplicating the information. There + are three different modes: + + - PROCINFO_DECL is defined. This means we are only interested in + declarations. + + - PROCINFO_DECL is not defined: + + + if SHARED is defined the file is included in an array + initializer. The .element = { ... } syntax is needed. + + + if SHARED is not defined a normal array initialization is + needed.
+ */ + +#ifndef PROCINFO_CLASS +# define PROCINFO_CLASS +#endif + +#if !defined PROCINFO_DECL && defined SHARED + ._dl_x86_cpu_features +#else +PROCINFO_CLASS struct cpu_features _dl_x86_cpu_features +#endif +#ifndef PROCINFO_DECL += { } +#endif +#if !defined SHARED || defined PROCINFO_DECL +; +#else +, +#endif + +#undef PROCINFO_DECL +#undef PROCINFO_CLASS Index: glibc-2.17-c758a686/sysdeps/x86_64/ldsodefs.h =================================================================== --- glibc-2.17-c758a686.orig/sysdeps/x86_64/ldsodefs.h +++ glibc-2.17-c758a686/sysdeps/x86_64/ldsodefs.h @@ -20,6 +20,7 @@ #define _X86_64_LDSODEFS_H 1 #include +#include struct La_x86_64_regs; struct La_x86_64_retval; Index: glibc-2.17-c758a686/sysdeps/x86_64/multiarch/Makefile =================================================================== --- glibc-2.17-c758a686.orig/sysdeps/x86_64/multiarch/Makefile +++ glibc-2.17-c758a686/sysdeps/x86_64/multiarch/Makefile @@ -1,5 +1,4 @@ ifeq ($(subdir),csu) -aux += init-arch tests += test-multiarch gen-as-const-headers += ifunc-defines.sym endif Index: glibc-2.17-c758a686/sysdeps/x86_64/multiarch/Versions =================================================================== --- glibc-2.17-c758a686.orig/sysdeps/x86_64/multiarch/Versions +++ /dev/null @@ -1,5 +0,0 @@ -libc { - GLIBC_PRIVATE { - __get_cpu_features; - } -} Index: glibc-2.17-c758a686/sysdeps/x86_64/multiarch/cacheinfo.c =================================================================== --- glibc-2.17-c758a686.orig/sysdeps/x86_64/multiarch/cacheinfo.c +++ /dev/null @@ -1,2 +0,0 @@ -#define DISABLE_PREFERRED_MEMORY_INSTRUCTION -#include "../cacheinfo.c" Index: glibc-2.17-c758a686/sysdeps/x86_64/multiarch/ifunc-defines.sym =================================================================== --- glibc-2.17-c758a686.orig/sysdeps/x86_64/multiarch/ifunc-defines.sym +++ glibc-2.17-c758a686/sysdeps/x86_64/multiarch/ifunc-defines.sym @@ -4,7 +4,6 @@ -- CPU_FEATURES_SIZE sizeof (struct cpu_features) 
-KIND_OFFSET offsetof (struct cpu_features, kind) CPUID_OFFSET offsetof (struct cpu_features, cpuid) CPUID_SIZE sizeof (struct cpuid_registers) CPUID_EAX_OFFSET offsetof (struct cpuid_registers, eax) Index: glibc-2.17-c758a686/sysdeps/x86_64/multiarch/init-arch.c =================================================================== --- glibc-2.17-c758a686.orig/sysdeps/x86_64/multiarch/init-arch.c +++ /dev/null @@ -1,183 +0,0 @@ -/* Initialize CPU feature data. - This file is part of the GNU C Library. - Copyright (C) 2008-2012 Free Software Foundation, Inc. - Contributed by Ulrich Drepper . - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . 
*/ - -#include -#include -#include "init-arch.h" - - -struct cpu_features __cpu_features attribute_hidden; - - -static void -get_common_indeces (unsigned int *family, unsigned int *model) -{ - __cpuid (1, __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax, - __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx, - __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx, - __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx); - - unsigned int eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax; - *family = (eax >> 8) & 0x0f; - *model = (eax >> 4) & 0x0f; -} - - -void -__init_cpu_features (void) -{ - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - unsigned int family = 0; - unsigned int model = 0; - enum cpu_features_kind kind; - - __cpuid (0, __cpu_features.max_cpuid, ebx, ecx, edx); - - /* This spells out "GenuineIntel". */ - if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69) - { - kind = arch_kind_intel; - - get_common_indeces (&family, &model); - - /* Intel processors prefer SSE instruction for memory/string - routines if they are available. */ - __cpu_features.feature[index_Prefer_SSE_for_memop] - |= bit_Prefer_SSE_for_memop; - - unsigned int eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax; - unsigned int extended_family = (eax >> 20) & 0xff; - unsigned int extended_model = (eax >> 12) & 0xf0; - if (family == 0x0f) - { - family += extended_family; - model += extended_model; - } - else if (family == 0x06) - { - ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx; - model += extended_model; - switch (model) - { - case 0x1c: - case 0x26: - /* BSF is slow on Atom. */ - __cpu_features.feature[index_Slow_BSF] |= bit_Slow_BSF; - break; - - default: - /* Unknown family 0x06 processors. Assuming this is one - of Core i3/i5/i7 processors if AVX is available. 
*/ - if ((ecx & bit_AVX) == 0) - break; - - case 0x1a: - case 0x1e: - case 0x1f: - case 0x25: - case 0x2c: - case 0x2e: - case 0x2f: - /* Rep string instructions, copy backward, unaligned loads - and pminub are fast on Intel Core i3, i5 and i7. */ -#if index_Fast_Rep_String != index_Fast_Copy_Backward -# error index_Fast_Rep_String != index_Fast_Copy_Backward -#endif -#if index_Fast_Rep_String != index_Fast_Unaligned_Load -# error index_Fast_Rep_String != index_Fast_Unaligned_Load -#endif -#if index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop -# error index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop -#endif - __cpu_features.feature[index_Fast_Rep_String] - |= (bit_Fast_Rep_String - | bit_Fast_Copy_Backward - | bit_Fast_Unaligned_Load - | bit_Prefer_PMINUB_for_stringop); - break; - } - } - } - /* This spells out "AuthenticAMD". */ - else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) - { - kind = arch_kind_amd; - - get_common_indeces (&family, &model); - - ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx; - - /* AMD processors prefer SSE instructions for memory/string routines - if they are available, otherwise they prefer integer instructions. */ - if ((ecx & 0x200)) - __cpu_features.feature[index_Prefer_SSE_for_memop] - |= bit_Prefer_SSE_for_memop; - - unsigned int eax; - __cpuid (0x80000000, eax, ebx, ecx, edx); - if (eax >= 0x80000001) - __cpuid (0x80000001, - __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].eax, - __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].ebx, - __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].ecx, - __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].edx); - } - else - kind = arch_kind_other; - - /* Can we call xgetbv? */ - if (CPUID_OSXSAVE) - { - unsigned int xcrlow; - unsigned int xcrhigh; - asm ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0)); - /* Is YMM and XMM state usable? 
*/ - if ((xcrlow & (bit_YMM_state | bit_XMM_state)) == - (bit_YMM_state | bit_XMM_state)) - { - /* Determine if AVX is usable. */ - if (CPUID_AVX) - __cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable; - /* Determine if FMA is usable. */ - if (CPUID_FMA) - __cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable; - /* Determine if FMA4 is usable. */ - if (CPUID_FMA4) - __cpu_features.feature[index_FMA4_Usable] |= bit_FMA4_Usable; - } - } - - __cpu_features.family = family; - __cpu_features.model = model; - atomic_write_barrier (); - __cpu_features.kind = kind; -} - -#undef __get_cpu_features - -const struct cpu_features * -__get_cpu_features (void) -{ - if (__cpu_features.kind == arch_kind_unknown) - __init_cpu_features (); - - return &__cpu_features; -} Index: glibc-2.17-c758a686/sysdeps/x86_64/multiarch/init-arch.h =================================================================== --- glibc-2.17-c758a686.orig/sysdeps/x86_64/multiarch/init-arch.h +++ glibc-2.17-c758a686/sysdeps/x86_64/multiarch/init-arch.h @@ -15,183 +15,8 @@ License along with the GNU C Library; if not, see . */ -#define bit_Fast_Rep_String (1 << 0) -#define bit_Fast_Copy_Backward (1 << 1) -#define bit_Slow_BSF (1 << 2) -#define bit_Prefer_SSE_for_memop (1 << 3) -#define bit_Fast_Unaligned_Load (1 << 4) -#define bit_Prefer_PMINUB_for_stringop (1 << 5) -#define bit_AVX_Usable (1 << 6) -#define bit_FMA_Usable (1 << 7) -#define bit_FMA4_Usable (1 << 8) - -/* CPUID Feature flags. */ -#define bit_SSE2 (1 << 26) -#define bit_SSSE3 (1 << 9) -#define bit_SSE4_1 (1 << 19) -#define bit_SSE4_2 (1 << 20) -#define bit_OSXSAVE (1 << 27) -#define bit_AVX (1 << 28) -#define bit_POPCOUNT (1 << 23) -#define bit_FMA (1 << 12) -#define bit_FMA4 (1 << 16) - -/* XCR0 Feature flags. 
*/ -#define bit_XMM_state (1 << 1) -#define bit_YMM_state (2 << 1) - -#ifdef __ASSEMBLER__ - -# include - -# define index_SSE2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET -# define index_SSSE3 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET -# define index_SSE4_1 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET -# define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET -# define index_AVX COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET - -# define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE -# define index_Fast_Copy_Backward FEATURE_INDEX_1*FEATURE_SIZE -# define index_Slow_BSF FEATURE_INDEX_1*FEATURE_SIZE -# define index_Prefer_SSE_for_memop FEATURE_INDEX_1*FEATURE_SIZE -# define index_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE -# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1*FEATURE_SIZE -# define index_AVX_Usable FEATURE_INDEX_1*FEATURE_SIZE -# define index_FMA_Usable FEATURE_INDEX_1*FEATURE_SIZE -# define index_FMA4_Usable FEATURE_INDEX_1*FEATURE_SIZE - -#else /* __ASSEMBLER__ */ - -# include -# include -# include -# include - -/* Ugly hack to make it possible to select a strstr and strcasestr - implementation that avoids using the stack for 16-byte aligned - SSE temporaries. Doing so makes it possible to call the functions - with a stack that's not 16-byte aligned as can happen, for example, - as a result of compiling the functions' callers with the GCC - -mpreferred-stack-boubdary=2 or =3 option, or with the ICC - -falign-stack=assume-4-byte option. See rhbz 1150282 for details. - - The ifunc selector uses the unaligned version by default if this - file exists and is accessible. */ -# define ENABLE_STRSTR_UNALIGNED_PATHNAME \ - "/etc/sysconfig/64bit_strstr_via_64bit_strstr_sse2_unaligned" - -static bool __attribute__ ((unused)) -use_unaligned_strstr (void) -{ - struct stat unaligned_strstr_etc_sysconfig_file; - - /* TLS may not have been set up yet, so avoid using stat since it tries to - set errno. 
*/ - return INTERNAL_SYSCALL (stat, , 2, - ENABLE_STRSTR_UNALIGNED_PATHNAME, - &unaligned_strstr_etc_sysconfig_file) == 0; -} - -enum - { - COMMON_CPUID_INDEX_1 = 0, - COMMON_CPUID_INDEX_80000001, /* for AMD */ - /* Keep the following line at the end. */ - COMMON_CPUID_INDEX_MAX - }; - -enum - { - FEATURE_INDEX_1 = 0, - /* Keep the following line at the end. */ - FEATURE_INDEX_MAX - }; - -extern struct cpu_features -{ - enum cpu_features_kind - { - arch_kind_unknown = 0, - arch_kind_intel, - arch_kind_amd, - arch_kind_other - } kind; - int max_cpuid; - struct cpuid_registers - { - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - } cpuid[COMMON_CPUID_INDEX_MAX]; - unsigned int family; - unsigned int model; - unsigned int feature[FEATURE_INDEX_MAX]; -} __cpu_features attribute_hidden; - - -extern void __init_cpu_features (void) attribute_hidden; -# define INIT_ARCH() \ - do \ - if (__cpu_features.kind == arch_kind_unknown) \ - __init_cpu_features (); \ - while (0) - -/* Used from outside libc.so to get access to the CPU features structure. */ -extern const struct cpu_features *__get_cpu_features (void) - __attribute__ ((const)); - -# if IS_IN (libc) -# define __get_cpu_features() (&__cpu_features) -# endif - -# define HAS_CPU_FEATURE(idx, reg, bit) \ - ((__get_cpu_features ()->cpuid[idx].reg & (bit)) != 0) - -/* Following are the feature tests used throughout libc. */ - -/* CPUID_* evaluates to true if the feature flag is enabled. - We always use &__cpu_features because the HAS_CPUID_* macros - are called only within __init_cpu_features, where we can't - call __get_cpu_features without infinite recursion. 
*/ -# define HAS_CPUID_FLAG(idx, reg, bit) \ - (((&__cpu_features)->cpuid[idx].reg & (bit)) != 0) - -# define CPUID_OSXSAVE \ - HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_OSXSAVE) -# define CPUID_AVX \ - HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_AVX) -# define CPUID_FMA \ - HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_FMA) -# define CPUID_FMA4 \ - HAS_CPUID_FLAG (COMMON_CPUID_INDEX_80000001, ecx, bit_FMA4) - -/* HAS_* evaluates to true if we may use the feature at runtime. */ -# define HAS_SSE2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, edx, bit_SSE2) -# define HAS_POPCOUNT HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_POPCOUNT) -# define HAS_SSSE3 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSSE3) -# define HAS_SSE4_1 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSE4_1) -# define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSE4_2) - -# define index_Fast_Rep_String FEATURE_INDEX_1 -# define index_Fast_Copy_Backward FEATURE_INDEX_1 -# define index_Slow_BSF FEATURE_INDEX_1 -# define index_Prefer_SSE_for_memop FEATURE_INDEX_1 -# define index_Fast_Unaligned_Load FEATURE_INDEX_1 -# define index_AVX_Usable FEATURE_INDEX_1 -# define index_FMA_Usable FEATURE_INDEX_1 -# define index_FMA4_Usable FEATURE_INDEX_1 - -# define HAS_ARCH_FEATURE(name) \ - ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0) - -# define HAS_FAST_REP_STRING HAS_ARCH_FEATURE (Fast_Rep_String) -# define HAS_FAST_COPY_BACKWARD HAS_ARCH_FEATURE (Fast_Copy_Backward) -# define HAS_SLOW_BSF HAS_ARCH_FEATURE (Slow_BSF) -# define HAS_PREFER_SSE_FOR_MEMOP HAS_ARCH_FEATURE (Prefer_SSE_for_memop) -# define HAS_FAST_UNALIGNED_LOAD HAS_ARCH_FEATURE (Fast_Unaligned_Load) -# define HAS_AVX HAS_ARCH_FEATURE (AVX_Usable) -# define HAS_FMA HAS_ARCH_FEATURE (FMA_Usable) -# define HAS_FMA4 HAS_ARCH_FEATURE (FMA4_Usable) - -#endif /* __ASSEMBLER__ */ +#ifdef __ASSEMBLER__ +# include +#else +# include +#endif