about summary refs log tree commit diff
diff options
context:
space:
mode:
authorH.J. Lu <hjl.tools@gmail.com>2017-03-21 10:59:31 -0700
committerH.J. Lu <hjl.tools@gmail.com>2017-03-21 11:00:12 -0700
commitc15f8eb50cea7ad1a4ccece6e0982bf426d52c00 (patch)
treeda3251690c3d1f035acebce5350b0a5a0b33cc00
parenta640393a18329ef4044bf9213f6466cd2d1e69f3 (diff)
downloadglibc-c15f8eb50cea7ad1a4ccece6e0982bf426d52c00.tar.gz
glibc-c15f8eb50cea7ad1a4ccece6e0982bf426d52c00.tar.xz
glibc-c15f8eb50cea7ad1a4ccece6e0982bf426d52c00.zip
x86-64: Improve branch predication in _dl_runtime_resolve_avx512_opt [BZ #21258]
On Skylake server, _dl_runtime_resolve_avx512_opt is used to preserve
the first 8 vector registers.  The code layout is

  if only %xmm0 - %xmm7 registers are used
     preserve %xmm0 - %xmm7 registers
  if only %ymm0 - %ymm7 registers are used
     preserve %ymm0 - %ymm7 registers
  preserve %zmm0 - %zmm7 registers

Branch predication always executes the fallthrough code path to preserve
%zmm0 - %zmm7 registers speculatively, even though only %xmm0 - %xmm7
registers are used.  This leads to lower CPU frequency on Skylake
server.  This patch changes the fallthrough code path to preserve
%xmm0 - %xmm7 registers instead:

  if whole %zmm0 - %zmm7 registers are used
    preserve %zmm0 - %zmm7 registers
  if only %ymm0 - %ymm7 registers are used
     preserve %ymm0 - %ymm7 registers
  preserve %xmm0 - %xmm7 registers

Tested on Skylake server.

	[BZ #21258]
	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt):
	Define only if _dl_runtime_resolve is defined to
	_dl_runtime_resolve_sse_vex.
	* sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt):
	Fallthrough to _dl_runtime_resolve_sse_vex.
-rw-r--r--ChangeLog9
-rw-r--r--sysdeps/x86_64/dl-trampoline.S3
-rw-r--r--sysdeps/x86_64/dl-trampoline.h9
3 files changed, 15 insertions, 6 deletions
diff --git a/ChangeLog b/ChangeLog
index 3ed2df8719..1367e8eaaa 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2017-03-21  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #21258]
+	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt):
+	Define only if _dl_runtime_resolve is defined to
+	_dl_runtime_resolve_sse_vex.
+	* sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt):
+	Fallthrough to _dl_runtime_resolve_sse_vex.
+
 2017-03-21  Joseph Myers  <joseph@codesourcery.com>
 
 	* INSTALL: Regenerated.
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 33d7fcf7d0..c14c61aa58 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -87,11 +87,9 @@
 #endif
 #define VEC(i)			zmm##i
 #define _dl_runtime_resolve	_dl_runtime_resolve_avx512
-#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx512_opt
 #define _dl_runtime_profile	_dl_runtime_profile_avx512
 #include "dl-trampoline.h"
 #undef _dl_runtime_resolve
-#undef _dl_runtime_resolve_opt
 #undef _dl_runtime_profile
 #undef VEC
 #undef VMOV
@@ -145,4 +143,5 @@
 # define VMOV			vmovdqu
 #endif
 #define _dl_runtime_resolve	_dl_runtime_resolve_sse_vex
+#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx512_opt
 #include "dl-trampoline.h"
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index b27fa06974..8db24c16ac 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -129,19 +129,20 @@ _dl_runtime_resolve_opt:
 	# YMM state isn't in use.
 	PRESERVE_BND_REGS_PREFIX
 	jz _dl_runtime_resolve_sse_vex
-# elif VEC_SIZE == 64
+# elif VEC_SIZE == 16
 	# For ZMM registers, check if YMM state and ZMM state are in
 	# use.
 	andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d
 	cmpl $bit_YMM_state, %r11d
-	# Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
-	# neither YMM state nor ZMM state are in use.
+	# Preserve %zmm0 - %zmm7 registers if ZMM state is in use.
 	PRESERVE_BND_REGS_PREFIX
-	jl _dl_runtime_resolve_sse_vex
+	jg _dl_runtime_resolve_avx512
 	# Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if
 	# ZMM state isn't in use.
 	PRESERVE_BND_REGS_PREFIX
 	je _dl_runtime_resolve_avx
+	# Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
+	# neither YMM state nor ZMM state are in use.
 # else
 #  error Unsupported VEC_SIZE!
 # endif