From 903b77defb6f2ee2552c06472339f33091e3c7b4 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Tue, 21 Mar 2017 10:59:31 -0700 Subject: x86-64: Improve branch predication in _dl_runtime_resolve_avx512_opt [BZ #21258] On Skylake server, _dl_runtime_resolve_avx512_opt is used to preserve the first 8 vector registers. The code layout is if only %xmm0 - %xmm7 registers are used preserve %xmm0 - %xmm7 registers if only %ymm0 - %ymm7 registers are used preserve %ymm0 - %ymm7 registers preserve %zmm0 - %zmm7 registers Branch predication always executes the fallthrough code path to preserve %zmm0 - %zmm7 registers speculatively, even though only %xmm0 - %xmm7 registers are used. This leads to lower CPU frequency on Skylake server. This patch changes the fallthrough code path to preserve %xmm0 - %xmm7 registers instead: if whole %zmm0 - %zmm7 registers are used preserve %zmm0 - %zmm7 registers if only %ymm0 - %ymm7 registers are used preserve %ymm0 - %ymm7 registers preserve %xmm0 - %xmm7 registers Tested on Skylake server. [BZ #21258] * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt): Define only if _dl_runtime_resolve is defined to _dl_runtime_resolve_sse_vex. * sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt): Fallthrough to _dl_runtime_resolve_sse_vex. (cherry picked from commit c15f8eb50cea7ad1a4ccece6e0982bf426d52c00) --- ChangeLog | 9 +++++++++ sysdeps/x86_64/dl-trampoline.S | 3 +-- sysdeps/x86_64/dl-trampoline.h | 9 +++++---- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/ChangeLog b/ChangeLog index b37a054bae..8479fba8c4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2017-04-07 H.J. Lu + + [BZ #21258] + * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt): + Define only if _dl_runtime_resolve is defined to + _dl_runtime_resolve_sse_vex. + * sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt): + Fallthrough to _dl_runtime_resolve_sse_vex. + 2017-04-03 Mike Frysinger [BZ #21253] diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index 33d7fcf7d0..c14c61aa58 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -87,11 +87,9 @@ #endif #define VEC(i) zmm##i #define _dl_runtime_resolve _dl_runtime_resolve_avx512 -#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt #define _dl_runtime_profile _dl_runtime_profile_avx512 #include "dl-trampoline.h" #undef _dl_runtime_resolve -#undef _dl_runtime_resolve_opt #undef _dl_runtime_profile #undef VEC #undef VMOV @@ -145,4 +143,5 @@ # define VMOV vmovdqu #endif #define _dl_runtime_resolve _dl_runtime_resolve_sse_vex +#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt #include "dl-trampoline.h" diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h index b27fa06974..8db24c16ac 100644 --- a/sysdeps/x86_64/dl-trampoline.h +++ b/sysdeps/x86_64/dl-trampoline.h @@ -129,19 +129,20 @@ _dl_runtime_resolve_opt: # YMM state isn't in use. PRESERVE_BND_REGS_PREFIX jz _dl_runtime_resolve_sse_vex -# elif VEC_SIZE == 64 +# elif VEC_SIZE == 16 # For ZMM registers, check if YMM state and ZMM state are in # use. andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d cmpl $bit_YMM_state, %r11d - # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if - # neither YMM state nor ZMM state are in use. + # Preserve %zmm0 - %zmm7 registers if ZMM state is in use. PRESERVE_BND_REGS_PREFIX - jl _dl_runtime_resolve_sse_vex + jg _dl_runtime_resolve_avx512 # Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if # ZMM state isn't in use. PRESERVE_BND_REGS_PREFIX je _dl_runtime_resolve_avx + # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if + # neither YMM state nor ZMM state are in use. # else # error Unsupported VEC_SIZE! # endif -- cgit 1.4.1