x86-64: Use _dl_runtime_resolve_opt only with AVX512F [BZ #21871]

On AVX machines with XGETBV (ECX == 1) like Skylake processors,

(gdb) disass _dl_runtime_resolve_avx_opt
Dump of assembler code for function _dl_runtime_resolve_avx_opt:
   0x0000000000015890 <+0>:	push   %rax
   0x0000000000015891 <+1>:	push   %rcx
   0x0000000000015892 <+2>:	push   %rdx
   0x0000000000015893 <+3>:	mov    $0x1,%ecx
   0x0000000000015898 <+8>:	xgetbv
   0x000000000001589b <+11>:	mov    %eax,%r11d
   0x000000000001589e <+14>:	pop    %rdx
   0x000000000001589f <+15>:	pop    %rcx
   0x00000000000158a0 <+16>:	pop    %rax
   0x00000000000158a1 <+17>:	and    $0x4,%r11d
   0x00000000000158a5 <+21>:	bnd je 0x16200 <_dl_runtime_resolve_sse_vex>
End of assembler dump.

is slower than:

(gdb) disass _dl_runtime_resolve_avx_slow
Dump of assembler code for function _dl_runtime_resolve_avx_slow:
   0x0000000000015850 <+0>:	vorpd  %ymm0,%ymm1,%ymm8
   0x0000000000015854 <+4>:	vorpd  %ymm2,%ymm3,%ymm9
   0x0000000000015858 <+8>:	vorpd  %ymm4,%ymm5,%ymm10
   0x000000000001585c <+12>:	vorpd  %ymm6,%ymm7,%ymm11
   0x0000000000015860 <+16>:	vorpd  %ymm8,%ymm9,%ymm9
   0x0000000000015865 <+21>:	vorpd  %ymm10,%ymm11,%ymm10
   0x000000000001586a <+26>:	vpcmpeqd %xmm8,%xmm8,%xmm8
   0x000000000001586f <+31>:	vorpd  %ymm9,%ymm10,%ymm10
   0x0000000000015874 <+36>:	vptest %ymm10,%ymm8
   0x0000000000015879 <+41>:	bnd jae 0x158b0 <_dl_runtime_resolve_avx>
   0x000000000001587c <+44>:	vzeroupper
   0x000000000001587f <+47>:	bnd jmpq 0x16200 <_dl_runtime_resolve_sse_vex>
End of assembler dump.
(gdb)

since xgetbv takes many more cycles than single-cycle operations like
vorpd/vpcmpeqd/vptest.

_dl_runtime_resolve_opt should be used only with AVX512, where AVX512
instructions lead to lower CPU frequency on Skylake server.

	[BZ #21871]
	* sysdeps/x86/cpu-features.c (init_cpu_features): Set
	bit_arch_Use_dl_runtime_resolve_opt only with AVX512F.

(cherry picked from commit d2cf37c0a2a375cf2fde69f1afbcc49e45368fc4)
author: H.J. Lu <hjl.tools@gmail.com> 2017-08-06 10:44:30 -0700
committer: H.J. Lu <hjl.tools@gmail.com> 2017-08-06 11:55:44 -0700
commit: d5a4092c367955ac0203ee603fdec625f6c924f9 (patch)
tree: 852e8b39a1dc4ea19cddf315f2042950fb62b29c
parent: 36f173ab3709b4a920a833b9af67f30bcba1ea01 (diff)
download: glibc-d5a4092c367955ac0203ee603fdec625f6c924f9.tar.gz
glibc-d5a4092c367955ac0203ee603fdec625f6c924f9.tar.xz
glibc-d5a4092c367955ac0203ee603fdec625f6c924f9.zip
2 files changed, 11 insertions, 2 deletions
diff --git a/ChangeLog b/ChangeLog
index 4f4252c4cf..ecc0da0b02 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2017-08-06  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #21871]
+	* sysdeps/x86/cpu-features.c (init_cpu_features): Set
+	bit_arch_Use_dl_runtime_resolve_opt only with AVX512F.
+
 2017-02-27  Florian Weimer  <fweimer@redhat.com>
 
 	[BZ #21115]
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index d1ee922290..508ad2ae7b 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -215,10 +215,13 @@ init_cpu_features (struct cpu_features *cpu_features)
 	  |= bit_arch_Prefer_No_AVX512;
 
       /* To avoid SSE transition penalty, use _dl_runtime_resolve_slow.
-         If XGETBV suports ECX == 1, use _dl_runtime_resolve_opt.  */
+         If XGETBV suports ECX == 1, use _dl_runtime_resolve_opt.
+	 Use _dl_runtime_resolve_opt only with AVX512F since it is
+	 slower than _dl_runtime_resolve_slow with AVX.  */
       cpu_features->feature[index_arch_Use_dl_runtime_resolve_slow]
 	|= bit_arch_Use_dl_runtime_resolve_slow;
-      if (cpu_features->max_cpuid >= 0xd)
+      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
+	  && cpu_features->max_cpuid >= 0xd)
 	{
 	  unsigned int eax;
author	H.J. Lu <hjl.tools@gmail.com>	2017-08-06 10:44:30 -0700
committer	H.J. Lu <hjl.tools@gmail.com>	2017-08-06 11:55:44 -0700
commit	d5a4092c367955ac0203ee603fdec625f6c924f9 (patch)
tree	852e8b39a1dc4ea19cddf315f2042950fb62b29c
parent	36f173ab3709b4a920a833b9af67f30bcba1ea01 (diff)
download	glibc-d5a4092c367955ac0203ee603fdec625f6c924f9.tar.gz glibc-d5a4092c367955ac0203ee603fdec625f6c924f9.tar.xz glibc-d5a4092c367955ac0203ee603fdec625f6c924f9.zip