diff options
author | H.J. Lu <hjl.tools@gmail.com> | 2017-03-21 10:59:31 -0700 |
---|---|---|
committer | H.J. Lu <hjl.tools@gmail.com> | 2017-04-20 07:55:44 -0700 |
commit | 883cadc5543ffd3a4537498b44c782ded8a4a4e8 (patch) | |
tree | 7f8420bd13c9e7660a4e097c9f62b01a2695a2d2 /sysdeps/x86_64/dl-trampoline.S | |
parent | 83037ea1d9e84b1b44ed307f01cbb5eeac24e22d (diff) | |
download | glibc-hjl/pr21258/2.23.tar.gz glibc-hjl/pr21258/2.23.tar.xz glibc-hjl/pr21258/2.23.zip |
x86-64: Improve branch prediction in _dl_runtime_resolve_avx512_opt [BZ #21258] hjl/pr21258/2.23
On Skylake server, _dl_runtime_resolve_avx512_opt is used to preserve the first 8 vector registers. The code layout is

    if only %xmm0 - %xmm7 registers are used
      preserve %xmm0 - %xmm7 registers
    if only %ymm0 - %ymm7 registers are used
      preserve %ymm0 - %ymm7 registers
    preserve %zmm0 - %zmm7 registers

Branch prediction always executes the fallthrough code path to preserve %zmm0 - %zmm7 registers speculatively, even though only %xmm0 - %xmm7 registers are used. This leads to lower CPU frequency on Skylake server. This patch changes the fallthrough code path to preserve %xmm0 - %xmm7 registers instead:

    if whole %zmm0 - %zmm7 registers are used
      preserve %zmm0 - %zmm7 registers
    if only %ymm0 - %ymm7 registers are used
      preserve %ymm0 - %ymm7 registers
    preserve %xmm0 - %xmm7 registers

Tested on Skylake server.

[BZ #21258]
* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt): Define only if _dl_runtime_resolve is defined to _dl_runtime_resolve_sse_vex.
* sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt): Fallthrough to _dl_runtime_resolve_sse_vex.

(cherry picked from commit c15f8eb50cea7ad1a4ccece6e0982bf426d52c00)
Diffstat (limited to 'sysdeps/x86_64/dl-trampoline.S')
-rw-r--r-- | sysdeps/x86_64/dl-trampoline.S | 5 |
1 file changed, 1 insertion, 4 deletions
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index 3f812b89c0..177f666164 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -88,11 +88,9 @@ # endif # define VEC(i) zmm##i # define _dl_runtime_resolve _dl_runtime_resolve_avx512 -# define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt # define _dl_runtime_profile _dl_runtime_profile_avx512 # include "dl-trampoline.h" # undef _dl_runtime_resolve -# undef _dl_runtime_resolve_opt # undef _dl_runtime_profile # undef VEC # undef VMOV @@ -101,8 +99,6 @@ #else strong_alias (_dl_runtime_resolve_avx, _dl_runtime_resolve_avx512) .hidden _dl_runtime_resolve_avx512 -strong_alias (_dl_runtime_resolve_avx_opt, _dl_runtime_resolve_avx512_opt) - .hidden _dl_runtime_resolve_avx512_opt strong_alias (_dl_runtime_profile_avx, _dl_runtime_profile_avx512) .hidden _dl_runtime_profile_avx512 #endif @@ -154,4 +150,5 @@ strong_alias (_dl_runtime_profile_avx, _dl_runtime_profile_avx512) # define VMOV vmovdqu #endif #define _dl_runtime_resolve _dl_runtime_resolve_sse_vex +#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt #include "dl-trampoline.h" |