Diffstat (limited to 'REORG.TODO/sysdeps/x86/cpu-features.c')
-rw-r--r-- | REORG.TODO/sysdeps/x86/cpu-features.c | 363
1 file changed, 363 insertions(+), 0 deletions(-)
diff --git a/REORG.TODO/sysdeps/x86/cpu-features.c b/REORG.TODO/sysdeps/x86/cpu-features.c
new file mode 100644
index 0000000000..4288001cdd
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86/cpu-features.c
@@ -0,0 +1,363 @@
+/* Initialize CPU feature data.
+   This file is part of the GNU C Library.
+   Copyright (C) 2008-2017 Free Software Foundation, Inc.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <cpuid.h>
+#include <cpu-features.h>
+#include <dl-hwcap.h>
+
+static void
+get_common_indeces (struct cpu_features *cpu_features,
+                    unsigned int *family, unsigned int *model,
+                    unsigned int *extended_model, unsigned int *stepping)
+{
+  if (family)
+    {
+      unsigned int eax;
+      __cpuid (1, eax, cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx,
+               cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx,
+               cpu_features->cpuid[COMMON_CPUID_INDEX_1].edx);
+      cpu_features->cpuid[COMMON_CPUID_INDEX_1].eax = eax;
+      *family = (eax >> 8) & 0x0f;
+      *model = (eax >> 4) & 0x0f;
+      *extended_model = (eax >> 12) & 0xf0;
+      *stepping = eax & 0x0f;
+      if (*family == 0x0f)
+        {
+          *family += (eax >> 20) & 0xff;
+          *model += *extended_model;
+        }
+    }
+
+  if (cpu_features->max_cpuid >= 7)
+    __cpuid_count (7, 0,
+                   cpu_features->cpuid[COMMON_CPUID_INDEX_7].eax,
+                   cpu_features->cpuid[COMMON_CPUID_INDEX_7].ebx,
+                   cpu_features->cpuid[COMMON_CPUID_INDEX_7].ecx,
+                   cpu_features->cpuid[COMMON_CPUID_INDEX_7].edx);
+
+  /* Can we call xgetbv?  */
+  if (CPU_FEATURES_CPU_P (cpu_features, OSXSAVE))
+    {
+      unsigned int xcrlow;
+      unsigned int xcrhigh;
+      asm ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0));
+      /* Is YMM and XMM state usable?  */
+      if ((xcrlow & (bit_YMM_state | bit_XMM_state)) ==
+          (bit_YMM_state | bit_XMM_state))
+        {
+          /* Determine if AVX is usable.  */
+          if (CPU_FEATURES_CPU_P (cpu_features, AVX))
+            {
+              cpu_features->feature[index_arch_AVX_Usable]
+                |= bit_arch_AVX_Usable;
+              /* The following features depend on AVX being usable.  */
+              /* Determine if AVX2 is usable.  */
+              if (CPU_FEATURES_CPU_P (cpu_features, AVX2))
+                cpu_features->feature[index_arch_AVX2_Usable]
+                  |= bit_arch_AVX2_Usable;
+              /* Determine if FMA is usable.  */
+              if (CPU_FEATURES_CPU_P (cpu_features, FMA))
+                cpu_features->feature[index_arch_FMA_Usable]
+                  |= bit_arch_FMA_Usable;
+            }
+
+          /* Check if OPMASK state, upper 256-bit of ZMM0-ZMM15 and
+             ZMM16-ZMM31 state are enabled.  */
+          if ((xcrlow & (bit_Opmask_state | bit_ZMM0_15_state
+                         | bit_ZMM16_31_state)) ==
+              (bit_Opmask_state | bit_ZMM0_15_state | bit_ZMM16_31_state))
+            {
+              /* Determine if AVX512F is usable.  */
+              if (CPU_FEATURES_CPU_P (cpu_features, AVX512F))
+                {
+                  cpu_features->feature[index_arch_AVX512F_Usable]
+                    |= bit_arch_AVX512F_Usable;
+                  /* Determine if AVX512DQ is usable.  */
+                  if (CPU_FEATURES_CPU_P (cpu_features, AVX512DQ))
+                    cpu_features->feature[index_arch_AVX512DQ_Usable]
+                      |= bit_arch_AVX512DQ_Usable;
+                }
+            }
+        }
+    }
+}
+
+static inline void
+init_cpu_features (struct cpu_features *cpu_features)
+{
+  unsigned int ebx, ecx, edx;
+  unsigned int family = 0;
+  unsigned int model = 0;
+  enum cpu_features_kind kind;
+
+#if !HAS_CPUID
+  if (__get_cpuid_max (0, 0) == 0)
+    {
+      kind = arch_kind_other;
+      goto no_cpuid;
+    }
+#endif
+
+  __cpuid (0, cpu_features->max_cpuid, ebx, ecx, edx);
+
+  /* This spells out "GenuineIntel".  */
+  if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
+    {
+      unsigned int extended_model, stepping;
+
+      kind = arch_kind_intel;
+
+      get_common_indeces (cpu_features, &family, &model, &extended_model,
+                          &stepping);
+
+      if (family == 0x06)
+        {
+          model += extended_model;
+          switch (model)
+            {
+            case 0x1c:
+            case 0x26:
+              /* BSF is slow on Atom.  */
+              cpu_features->feature[index_arch_Slow_BSF]
+                |= bit_arch_Slow_BSF;
+              break;
+
+            case 0x57:
+              /* Knights Landing.  Enable Silvermont optimizations.  */
+
+            case 0x5c:
+            case 0x5f:
+              /* Unaligned load versions are faster than SSSE3
+                 on Goldmont.  */
+
+            case 0x4c:
+              /* Airmont is a die shrink of Silvermont.  */
+
+            case 0x37:
+            case 0x4a:
+            case 0x4d:
+            case 0x5a:
+            case 0x5d:
+              /* Unaligned load versions are faster than SSSE3
+                 on Silvermont.  */
+#if index_arch_Fast_Unaligned_Load != index_arch_Prefer_PMINUB_for_stringop
+# error index_arch_Fast_Unaligned_Load != index_arch_Prefer_PMINUB_for_stringop
+#endif
+#if index_arch_Fast_Unaligned_Load != index_arch_Slow_SSE4_2
+# error index_arch_Fast_Unaligned_Load != index_arch_Slow_SSE4_2
+#endif
+#if index_arch_Fast_Unaligned_Load != index_arch_Fast_Unaligned_Copy
+# error index_arch_Fast_Unaligned_Load != index_arch_Fast_Unaligned_Copy
+#endif
+              cpu_features->feature[index_arch_Fast_Unaligned_Load]
+                |= (bit_arch_Fast_Unaligned_Load
+                    | bit_arch_Fast_Unaligned_Copy
+                    | bit_arch_Prefer_PMINUB_for_stringop
+                    | bit_arch_Slow_SSE4_2);
+              break;
+
+            default:
+              /* Unknown family 0x06 processors.  Assuming this is one
+                 of Core i3/i5/i7 processors if AVX is available.  */
+              if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
+                break;
+
+            case 0x1a:
+            case 0x1e:
+            case 0x1f:
+            case 0x25:
+            case 0x2c:
+            case 0x2e:
+            case 0x2f:
+              /* Rep string instructions, unaligned load, unaligned copy,
+                 and pminub are fast on Intel Core i3, i5 and i7.  */
+#if index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Load
+# error index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Load
+#endif
+#if index_arch_Fast_Rep_String != index_arch_Prefer_PMINUB_for_stringop
+# error index_arch_Fast_Rep_String != index_arch_Prefer_PMINUB_for_stringop
+#endif
+#if index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Copy
+# error index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Copy
+#endif
+              cpu_features->feature[index_arch_Fast_Rep_String]
+                |= (bit_arch_Fast_Rep_String
+                    | bit_arch_Fast_Unaligned_Load
+                    | bit_arch_Fast_Unaligned_Copy
+                    | bit_arch_Prefer_PMINUB_for_stringop);
+              break;
+
+            case 0x3f:
+              /* Xeon E7 v3 with stepping >= 4 has working TSX.  */
+              if (stepping >= 4)
+                break;
+            case 0x3c:
+            case 0x45:
+            case 0x46:
+              /* Disable Intel TSX on Haswell processors (except Xeon E7 v3
+                 with stepping >= 4) to avoid TSX on kernels that weren't
+                 updated with the latest microcode package (which disables
+                 broken feature by default).  */
+              cpu_features->cpuid[index_cpu_RTM].reg_RTM &= ~bit_cpu_RTM;
+              break;
+            }
+        }
+
+      /* Unaligned load with 256-bit AVX registers are faster on
+         Intel processors with AVX2.  */
+      if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable))
+        cpu_features->feature[index_arch_AVX_Fast_Unaligned_Load]
+          |= bit_arch_AVX_Fast_Unaligned_Load;
+
+      /* Since AVX512ER is unique to Xeon Phi, set Prefer_No_VZEROUPPER
+         if AVX512ER is available.  Don't use AVX512 to avoid lower CPU
+         frequency if AVX512ER isn't available.  */
+      if (CPU_FEATURES_CPU_P (cpu_features, AVX512ER))
+        cpu_features->feature[index_arch_Prefer_No_VZEROUPPER]
+          |= bit_arch_Prefer_No_VZEROUPPER;
+      else
+        cpu_features->feature[index_arch_Prefer_No_AVX512]
+          |= bit_arch_Prefer_No_AVX512;
+
+      /* To avoid SSE transition penalty, use _dl_runtime_resolve_slow.
+         If XGETBV supports ECX == 1, use _dl_runtime_resolve_opt.  */
+      cpu_features->feature[index_arch_Use_dl_runtime_resolve_slow]
+        |= bit_arch_Use_dl_runtime_resolve_slow;
+      if (cpu_features->max_cpuid >= 0xd)
+        {
+          unsigned int eax;
+
+          __cpuid_count (0xd, 1, eax, ebx, ecx, edx);
+          if ((eax & (1 << 2)) != 0)
+            cpu_features->feature[index_arch_Use_dl_runtime_resolve_opt]
+              |= bit_arch_Use_dl_runtime_resolve_opt;
+        }
+    }
+  /* This spells out "AuthenticAMD".  */
+  else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
+    {
+      unsigned int extended_model, stepping;
+
+      kind = arch_kind_amd;
+
+      get_common_indeces (cpu_features, &family, &model, &extended_model,
+                          &stepping);
+
+      ecx = cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx;
+
+      unsigned int eax;
+      __cpuid (0x80000000, eax, ebx, ecx, edx);
+      if (eax >= 0x80000001)
+        __cpuid (0x80000001,
+                 cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].eax,
+                 cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].ebx,
+                 cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].ecx,
+                 cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].edx);
+
+      if (HAS_ARCH_FEATURE (AVX_Usable))
+        {
+          /* Since the FMA4 bit is in COMMON_CPUID_INDEX_80000001 and
+             FMA4 requires AVX, determine if FMA4 is usable here.  */
+          if (CPU_FEATURES_CPU_P (cpu_features, FMA4))
+            cpu_features->feature[index_arch_FMA4_Usable]
+              |= bit_arch_FMA4_Usable;
+        }
+
+      if (family == 0x15)
+        {
+#if index_arch_Fast_Unaligned_Load != index_arch_Fast_Copy_Backward
+# error index_arch_Fast_Unaligned_Load != index_arch_Fast_Copy_Backward
+#endif
+          /* "Excavator"  */
+          if (model >= 0x60 && model <= 0x7f)
+            cpu_features->feature[index_arch_Fast_Unaligned_Load]
+              |= (bit_arch_Fast_Unaligned_Load
+                  | bit_arch_Fast_Copy_Backward);
+        }
+    }
+  else
+    {
+      kind = arch_kind_other;
+      get_common_indeces (cpu_features, NULL, NULL, NULL, NULL);
+    }
+
+  /* Support i586 if CX8 is available.  */
+  if (CPU_FEATURES_CPU_P (cpu_features, CX8))
+    cpu_features->feature[index_arch_I586] |= bit_arch_I586;
+
+  /* Support i686 if CMOV is available.  */
+  if (CPU_FEATURES_CPU_P (cpu_features, CMOV))
+    cpu_features->feature[index_arch_I686] |= bit_arch_I686;
+
+#if !HAS_CPUID
+no_cpuid:
+#endif
+
+  cpu_features->family = family;
+  cpu_features->model = model;
+  cpu_features->kind = kind;
+
+  /* Reuse dl_platform, dl_hwcap and dl_hwcap_mask for x86.  */
+  GLRO(dl_platform) = NULL;
+  GLRO(dl_hwcap) = 0;
+#if !HAVE_TUNABLES && defined SHARED
+  /* The glibc.tune.hwcap_mask tunable is initialized already, so no need to do
+     this.  */
+  GLRO(dl_hwcap_mask) = HWCAP_IMPORTANT;
+#endif
+
+#ifdef __x86_64__
+  if (cpu_features->kind == arch_kind_intel)
+    {
+      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
+          && CPU_FEATURES_CPU_P (cpu_features, AVX512CD))
+        {
+          if (CPU_FEATURES_CPU_P (cpu_features, AVX512ER))
+            {
+              if (CPU_FEATURES_CPU_P (cpu_features, AVX512PF))
+                GLRO(dl_platform) = "xeon_phi";
+            }
+          else
+            {
+              if (CPU_FEATURES_CPU_P (cpu_features, AVX512BW)
+                  && CPU_FEATURES_CPU_P (cpu_features, AVX512DQ)
+                  && CPU_FEATURES_CPU_P (cpu_features, AVX512VL))
+                GLRO(dl_hwcap) |= HWCAP_X86_AVX512_1;
+            }
+        }
+
+      if (GLRO(dl_platform) == NULL
+          && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+          && CPU_FEATURES_ARCH_P (cpu_features, FMA_Usable)
+          && CPU_FEATURES_CPU_P (cpu_features, BMI1)
+          && CPU_FEATURES_CPU_P (cpu_features, BMI2)
+          && CPU_FEATURES_CPU_P (cpu_features, LZCNT)
+          && CPU_FEATURES_CPU_P (cpu_features, MOVBE)
+          && CPU_FEATURES_CPU_P (cpu_features, POPCNT))
+        GLRO(dl_platform) = "haswell";
+    }
+#else
+  if (CPU_FEATURES_CPU_P (cpu_features, SSE2))
+    GLRO(dl_hwcap) |= HWCAP_X86_SSE2;
+
+  if (CPU_FEATURES_ARCH_P (cpu_features, I686))
+    GLRO(dl_platform) = "i686";
+  else if (CPU_FEATURES_ARCH_P (cpu_features, I586))
+    GLRO(dl_platform) = "i586";
+#endif
+}
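
For readers skimming the patch, a minimal standalone sketch of the family/model decoding that get_common_indeces performs on CPUID leaf 1 follows. It is not part of the patch: it uses only GCC/Clang's <cpuid.h>, and the folding of the extended model into the model for family 0x06 (which the patch does in init_cpu_features) is inlined here for completeness.

/* Sketch: decode family/model/stepping from CPUID leaf 1 the same way
   get_common_indeces does.  Requires GCC or Clang on x86.  */
#include <cpuid.h>
#include <stdio.h>

int
main (void)
{
  unsigned int eax, ebx, ecx, edx;

  if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0)
    return 1;                   /* CPUID leaf 1 unavailable.  */

  unsigned int family = (eax >> 8) & 0x0f;
  unsigned int model = (eax >> 4) & 0x0f;
  /* Extended model is bits 19:16; keeping it shifted left by 4 lets it
     simply be added to the 4-bit base model, as in the patch.  */
  unsigned int extended_model = (eax >> 12) & 0xf0;
  unsigned int stepping = eax & 0x0f;

  if (family == 0x0f)
    {
      /* Family 0x0f encodes the real family in the extended field.  */
      family += (eax >> 20) & 0xff;
      model += extended_model;
    }
  else if (family == 0x06)
    /* Done by init_cpu_features in the patch.  */
    model += extended_model;

  printf ("family 0x%02x model 0x%02x stepping 0x%x\n",
          family, model, stepping);
  return 0;
}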
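
The OSXSAVE/xgetbv dance in get_common_indeces guards against an OS that leaves YMM state disabled even though the CPU advertises AVX. A self-contained version of the same test might look as follows; bit_OSXSAVE and bit_AVX come from <cpuid.h>, while the XCR0 state bits (taken from glibc's internal cpu-features.h in the patch) are defined locally here with their Intel SDM positions.

/* Sketch: is AVX actually usable, not merely advertised?  */
#include <cpuid.h>
#include <stdio.h>

#define bit_XMM_state (1 << 1)  /* XCR0 bit 1: SSE (XMM) state.  */
#define bit_YMM_state (1 << 2)  /* XCR0 bit 2: AVX (YMM) state.  */

int
main (void)
{
  unsigned int eax, ebx, ecx, edx;
  int avx_usable = 0;

  if (__get_cpuid (1, &eax, &ebx, &ecx, &edx)
      && (ecx & bit_OSXSAVE) && (ecx & bit_AVX))
    {
      unsigned int xcrlow, xcrhigh;
      /* XGETBV with ECX == 0 reads XCR0, the OS-enabled state mask.  */
      __asm__ ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0));
      avx_usable = ((xcrlow & (bit_YMM_state | bit_XMM_state))
                    == (bit_YMM_state | bit_XMM_state));
    }

  puts (avx_usable ? "AVX usable" : "AVX not usable");
  return 0;
}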
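
Similarly, the leaf-0xd probe near the end of the Intel branch keys _dl_runtime_resolve_opt off EAX bit 2 of CPUID.(EAX=0DH, ECX=1), the bit reporting XGETBV-with-ECX==1 support. A small probe along the same lines, again only a sketch:

/* Sketch: does XGETBV accept ECX == 1?  Same bit the patch tests.  */
#include <cpuid.h>
#include <stdio.h>

int
main (void)
{
  unsigned int eax, ebx, ecx, edx;

  /* Bail out if leaf 0xd is not implemented at all.  */
  if (__get_cpuid_max (0, NULL) < 0xd)
    return 1;

  __cpuid_count (0xd, 1, eax, ebx, ecx, edx);
  printf ("XGETBV with ECX == 1: %s\n",
          (eax & (1 << 2)) ? "supported" : "not supported");
  return 0;
}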