diff options
author | H.J. Lu <hjl.tools@gmail.com> | 2015-08-25 04:33:54 -0700 |
---|---|---|
committer | H.J. Lu <hjl.tools@gmail.com> | 2015-08-25 04:34:13 -0700 |
commit | f3dcae82d54e5097e18e1d6ef4ff55c2ea4e621e (patch) | |
tree | e43395ded84bf0aa25ecbe2d395082a182aae8b7 /sysdeps/x86_64/dl-machine.h | |
parent | 2d02fd07371bcd492c320cec649c6265787d794a (diff) | |
download | glibc-f3dcae82d54e5097e18e1d6ef4ff55c2ea4e621e.tar.gz glibc-f3dcae82d54e5097e18e1d6ef4ff55c2ea4e621e.tar.xz glibc-f3dcae82d54e5097e18e1d6ef4ff55c2ea4e621e.zip |
Save and restore vector registers in x86-64 ld.so
This patch adds SSE, AVX and AVX512 versions of _dl_runtime_resolve and _dl_runtime_profile, which save and restore the first 8 vector registers used for parameter passing. elf_machine_runtime_setup selects the proper _dl_runtime_resolve or _dl_runtime_profile based on _dl_x86_cpu_features. It avoids race condition caused by FOREIGN_CALL macros, which are only used for x86-64. Performance impact of saving and restoring 8 vector registers are negligible on Nehalem, Sandy Bridge, Ivy Bridge and Haswell when ld.so is optimized with SSE2. [BZ #15128] * sysdeps/x86_64/Makefile [$(subdir) == elf] (tests): Add ifuncmain8. (modules-names): Add ifuncmod8. ($(objpfx)ifuncmain8): New rule. * sysdeps/x86_64/dl-machine.h: Include <dl-procinfo.h> and <cpuid.h>. (elf_machine_runtime_setup): Use _dl_runtime_resolve_sse, _dl_runtime_resolve_avx, or _dl_runtime_resolve_avx512, _dl_runtime_profile_sse, _dl_runtime_profile_avx, or _dl_runtime_profile_avx512, based on HAS_ARCH_FEATURE. * sysdeps/x86_64/dl-trampoline.S: Rewrite. * sysdeps/x86_64/dl-trampoline.h: Likewise. * sysdeps/x86_64/ifuncmain8.c: New file. * sysdeps/x86_64/ifuncmod8.c: Likewise. * sysdeps/x86_64/nptl/tcb-offsets.sym (RTLD_SAVESPACE_SSE): Removed. * sysdeps/x86_64/nptl/tls.h (__128bits): Removed. (tcbhead_t): Change rtld_must_xmm_save to __glibc_unused1. Change rtld_savespace_sse to __glibc_unused2. (RTLD_CHECK_FOREIGN_CALL): Removed. (RTLD_ENABLE_FOREIGN_CALL): Likewise. (RTLD_PREPARE_FOREIGN_CALL): Likewise. (RTLD_FINALIZE_FOREIGN_CALL): Likewise.
Diffstat (limited to 'sysdeps/x86_64/dl-machine.h')
-rw-r--r-- | sysdeps/x86_64/dl-machine.h | 29 |
1 files changed, 23 insertions, 6 deletions
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h index d22359da0e..32d25da01e 100644 --- a/sysdeps/x86_64/dl-machine.h +++ b/sysdeps/x86_64/dl-machine.h @@ -66,8 +66,12 @@ static inline int __attribute__ ((unused, always_inline)) elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) { Elf64_Addr *got; - extern void _dl_runtime_resolve (ElfW(Word)) attribute_hidden; - extern void _dl_runtime_profile (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden; if (l->l_info[DT_JMPREL] && lazy) { @@ -95,7 +99,12 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) end in this function. */ if (__glibc_unlikely (profile)) { - *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile; + if (HAS_ARCH_FEATURE (AVX512F_Usable)) + *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512; + else if (HAS_ARCH_FEATURE (AVX_Usable)) + *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx; + else + *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse; if (GLRO(dl_profile) != NULL && _dl_name_match_p (GLRO(dl_profile), l)) @@ -104,9 +113,17 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) GL(dl_profile_map) = l; } else - /* This function will get called to fix up the GOT entry indicated by - the offset on the stack, and then jump to the resolved address. */ - *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve; + { + /* This function will get called to fix up the GOT entry + indicated by the offset on the stack, and then jump to + the resolved address. */ + if (HAS_ARCH_FEATURE (AVX512F_Usable)) + *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx512; + else if (HAS_ARCH_FEATURE (AVX_Usable)) + *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx; + else + *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse; + } } if (l->l_info[ADDRIDX (DT_TLSDESC_GOT)] && lazy) |