diff options
Diffstat (limited to 'sysdeps/x86_64')
-rw-r--r-- | sysdeps/x86_64/dl-machine.h | 24 |
-rw-r--r-- | sysdeps/x86_64/dl-trampoline.S | 20 |
-rw-r--r-- | sysdeps/x86_64/dl-trampoline.h | 104 |
3 files changed, 145 insertions, 3 deletions
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h index ed0c1a8efd..c0f0fa16a2 100644 --- a/sysdeps/x86_64/dl-machine.h +++ b/sysdeps/x86_64/dl-machine.h @@ -68,7 +68,10 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) Elf64_Addr *got; extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden; extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_avx_slow (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_avx_opt (ElfW(Word)) attribute_hidden; extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_avx512_opt (ElfW(Word)) attribute_hidden; extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden; extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden; extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden; @@ -118,9 +121,26 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) indicated by the offset on the stack, and then jump to the resolved address. 
*/ if (HAS_ARCH_FEATURE (AVX512F_Usable)) - *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx512; + { + if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt)) + *(ElfW(Addr) *) (got + 2) + = (ElfW(Addr)) &_dl_runtime_resolve_avx512_opt; + else + *(ElfW(Addr) *) (got + 2) + = (ElfW(Addr)) &_dl_runtime_resolve_avx512; + } else if (HAS_ARCH_FEATURE (AVX_Usable)) - *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx; + { + if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt)) + *(ElfW(Addr) *) (got + 2) + = (ElfW(Addr)) &_dl_runtime_resolve_avx_opt; + else if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_slow)) + *(ElfW(Addr) *) (got + 2) + = (ElfW(Addr)) &_dl_runtime_resolve_avx_slow; + else + *(ElfW(Addr) *) (got + 2) + = (ElfW(Addr)) &_dl_runtime_resolve_avx; + } else *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse; } diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index 12f1a5cf84..39f595e1e1 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -18,6 +18,7 @@ #include <config.h> #include <sysdep.h> +#include <cpu-features.h> #include <link-defines.h> #ifndef DL_STACK_ALIGNMENT @@ -86,9 +87,11 @@ #endif #define VEC(i) zmm##i #define _dl_runtime_resolve _dl_runtime_resolve_avx512 +#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt #define _dl_runtime_profile _dl_runtime_profile_avx512 #include "dl-trampoline.h" #undef _dl_runtime_resolve +#undef _dl_runtime_resolve_opt #undef _dl_runtime_profile #undef VEC #undef VMOV @@ -104,9 +107,11 @@ #endif #define VEC(i) ymm##i #define _dl_runtime_resolve _dl_runtime_resolve_avx +#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx_opt #define _dl_runtime_profile _dl_runtime_profile_avx #include "dl-trampoline.h" #undef _dl_runtime_resolve +#undef _dl_runtime_resolve_opt #undef _dl_runtime_profile #undef VEC #undef VMOV @@ -126,3 +131,18 @@ #define _dl_runtime_profile _dl_runtime_profile_sse #undef RESTORE_AVX 
#include "dl-trampoline.h" +#undef _dl_runtime_resolve +#undef _dl_runtime_profile +#undef VMOV +#undef VMOVA + +/* Used by _dl_runtime_resolve_avx_opt/_dl_runtime_resolve_avx512_opt + to preserve the full vector registers with zero upper bits. */ +#define VMOVA vmovdqa +#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT +# define VMOV vmovdqa +#else +# define VMOV vmovdqu +#endif +#define _dl_runtime_resolve _dl_runtime_resolve_sse_vex +#include "dl-trampoline.h" diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h index 8161f96b94..d6c7f989b5 100644 --- a/sysdeps/x86_64/dl-trampoline.h +++ b/sysdeps/x86_64/dl-trampoline.h @@ -50,6 +50,105 @@ #endif .text +#ifdef _dl_runtime_resolve_opt +/* Use the smallest vector registers to preserve the full YMM/ZMM + registers to avoid SSE transition penalty. */ + +# if VEC_SIZE == 32 +/* Check if the upper 128 bits in %ymm0 - %ymm7 registers are non-zero + and preserve %xmm0 - %xmm7 registers with the zero upper bits. Since + there is no SSE transition penalty on AVX512 processors which don't + support XGETBV with ECX == 1, _dl_runtime_resolve_avx512_slow isn't + provided. */ + .globl _dl_runtime_resolve_avx_slow + .hidden _dl_runtime_resolve_avx_slow + .type _dl_runtime_resolve_avx_slow, @function + .align 16 +_dl_runtime_resolve_avx_slow: + cfi_startproc + cfi_adjust_cfa_offset(16) # Incorporate PLT + vorpd %ymm0, %ymm1, %ymm8 + vorpd %ymm2, %ymm3, %ymm9 + vorpd %ymm4, %ymm5, %ymm10 + vorpd %ymm6, %ymm7, %ymm11 + vorpd %ymm8, %ymm9, %ymm9 + vorpd %ymm10, %ymm11, %ymm10 + vpcmpeqd %xmm8, %xmm8, %xmm8 + vorpd %ymm9, %ymm10, %ymm10 + vptest %ymm10, %ymm8 + # Preserve %ymm0 - %ymm7 registers if the upper 128 bits of any + # %ymm0 - %ymm7 registers aren't zero. + PRESERVE_BND_REGS_PREFIX + jnc _dl_runtime_resolve_avx + # Use vzeroupper to avoid SSE transition penalty. 
+ vzeroupper + # Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits + # when the upper 128 bits of %ymm0 - %ymm7 registers are zero. + PRESERVE_BND_REGS_PREFIX + jmp _dl_runtime_resolve_sse_vex + cfi_adjust_cfa_offset(-16) # Restore PLT adjustment + cfi_endproc + .size _dl_runtime_resolve_avx_slow, .-_dl_runtime_resolve_avx_slow +# endif + +/* Use XGETBV with ECX == 1 to check which bits in vector registers are + non-zero and only preserve the non-zero lower bits with zero upper + bits. */ + .globl _dl_runtime_resolve_opt + .hidden _dl_runtime_resolve_opt + .type _dl_runtime_resolve_opt, @function + .align 16 +_dl_runtime_resolve_opt: + cfi_startproc + cfi_adjust_cfa_offset(16) # Incorporate PLT + pushq %rax + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%rax, 0) + pushq %rcx + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%rcx, 0) + pushq %rdx + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%rdx, 0) + movl $1, %ecx + xgetbv + movl %eax, %r11d + popq %rdx + cfi_adjust_cfa_offset(-8) + cfi_restore (%rdx) + popq %rcx + cfi_adjust_cfa_offset(-8) + cfi_restore (%rcx) + popq %rax + cfi_adjust_cfa_offset(-8) + cfi_restore (%rax) +# if VEC_SIZE == 32 + # For YMM registers, check if YMM state is in use. + andl $bit_YMM_state, %r11d + # Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits if + # YMM state isn't in use. + PRESERVE_BND_REGS_PREFIX + jz _dl_runtime_resolve_sse_vex +# elif VEC_SIZE == 64 + # For ZMM registers, check if YMM state and ZMM state are in + # use. + andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d + cmpl $bit_YMM_state, %r11d + # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if + # neither YMM state nor ZMM state are in use. + PRESERVE_BND_REGS_PREFIX + jl _dl_runtime_resolve_sse_vex + # Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if + # ZMM state isn't in use. + PRESERVE_BND_REGS_PREFIX + je _dl_runtime_resolve_avx +# else +# error Unsupported VEC_SIZE! 
+# endif + cfi_adjust_cfa_offset(-16) # Restore PLT adjustment + cfi_endproc + .size _dl_runtime_resolve_opt, .-_dl_runtime_resolve_opt +#endif .globl _dl_runtime_resolve .hidden _dl_runtime_resolve .type _dl_runtime_resolve, @function @@ -164,7 +263,10 @@ _dl_runtime_resolve: .size _dl_runtime_resolve, .-_dl_runtime_resolve -#ifndef PROF +/* To preserve %xmm0 - %xmm7 registers, dl-trampoline.h is included + twice, for _dl_runtime_resolve_sse and _dl_runtime_resolve_sse_vex. + But we don't need another _dl_runtime_profile for XMM registers. */ +#if !defined PROF && defined _dl_runtime_profile # if (LR_VECTOR_OFFSET % VEC_SIZE) != 0 # error LR_VECTOR_OFFSET must be multples of VEC_SIZE # endif |