about summary refs log tree commit diff
path: root/sysdeps/x86_64/dl-trampoline.S
diff options
context:
space:
mode:
author H.J. Lu <hjl.tools@gmail.com> 2017-03-21 10:59:31 -0700
committer H.J. Lu <hjl.tools@gmail.com> 2017-04-20 07:55:44 -0700
commit 883cadc5543ffd3a4537498b44c782ded8a4a4e8 (patch)
tree 7f8420bd13c9e7660a4e097c9f62b01a2695a2d2 /sysdeps/x86_64/dl-trampoline.S
parent 83037ea1d9e84b1b44ed307f01cbb5eeac24e22d (diff)
download glibc-hjl/pr21258/2.23.tar.gz
glibc-hjl/pr21258/2.23.tar.xz
glibc-hjl/pr21258/2.23.zip
x86-64: Improve branch prediction in _dl_runtime_resolve_avx512_opt [BZ #21258] hjl/pr21258/2.23
On Skylake server, _dl_runtime_resolve_avx512_opt is used to preserve
the first 8 vector registers.  The code layout is

  if only %xmm0 - %xmm7 registers are used
     preserve %xmm0 - %xmm7 registers
  if only %ymm0 - %ymm7 registers are used
     preserve %ymm0 - %ymm7 registers
  preserve %zmm0 - %zmm7 registers

Branch prediction always speculatively executes the fall-through code
path, which preserves the %zmm0 - %zmm7 registers, even when only the
%xmm0 - %xmm7 registers are used.  This leads to a lower CPU frequency
on Skylake server.  This patch changes the fall-through code path to
preserve the %xmm0 - %xmm7 registers instead:

  if whole %zmm0 - %zmm7 registers are used
     preserve %zmm0 - %zmm7 registers
  if only %ymm0 - %ymm7 registers are used
     preserve %ymm0 - %ymm7 registers
  preserve %xmm0 - %xmm7 registers

Tested on Skylake server.

	[BZ #21258]
	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt):
	Define only if _dl_runtime_resolve is defined to
	_dl_runtime_resolve_sse_vex.
	* sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt):
	Fallthrough to _dl_runtime_resolve_sse_vex.

(cherry picked from commit c15f8eb50cea7ad1a4ccece6e0982bf426d52c00)
Diffstat (limited to 'sysdeps/x86_64/dl-trampoline.S')
-rw-r--r-- sysdeps/x86_64/dl-trampoline.S 5
1 file changed, 1 insertion, 4 deletions
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 3f812b89c0..177f666164 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -88,11 +88,9 @@
 # endif
 # define VEC(i)			zmm##i
 # define _dl_runtime_resolve	_dl_runtime_resolve_avx512
-# define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt
 # define _dl_runtime_profile	_dl_runtime_profile_avx512
 # include "dl-trampoline.h"
 # undef _dl_runtime_resolve
-# undef _dl_runtime_resolve_opt
 # undef _dl_runtime_profile
 # undef VEC
 # undef VMOV
@@ -101,8 +99,6 @@
 #else
 strong_alias (_dl_runtime_resolve_avx, _dl_runtime_resolve_avx512)
 	.hidden _dl_runtime_resolve_avx512
-strong_alias (_dl_runtime_resolve_avx_opt, _dl_runtime_resolve_avx512_opt)
-	.hidden _dl_runtime_resolve_avx512_opt
 strong_alias (_dl_runtime_profile_avx, _dl_runtime_profile_avx512)
 	.hidden _dl_runtime_profile_avx512
 #endif
@@ -154,4 +150,5 @@ strong_alias (_dl_runtime_profile_avx, _dl_runtime_profile_avx512)
 # define VMOV			vmovdqu
 #endif
 #define _dl_runtime_resolve	_dl_runtime_resolve_sse_vex
+#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx512_opt
 #include "dl-trampoline.h"