Diffstat (limited to 'sysdeps/x86_64')
 sysdeps/x86_64/dl-machine.h    |  24 ++-
 sysdeps/x86_64/dl-trampoline.S |  20 ++
 sysdeps/x86_64/dl-trampoline.h | 104 +++++-
 3 files changed, 145 insertions(+), 3 deletions(-)
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index ed0c1a8efd..c0f0fa16a2 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -68,7 +68,10 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
   Elf64_Addr *got;
   extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx_slow (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx_opt (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx512_opt (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
@@ -118,9 +121,26 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	     indicated by the offset on the stack, and then jump to
 	     the resolved address.  */
 	  if (HAS_ARCH_FEATURE (AVX512F_Usable))
-	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
+	    {
+	      if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx512_opt;
+	      else
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
+	    }
 	  else if (HAS_ARCH_FEATURE (AVX_Usable))
-	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx;
+	    {
+	      if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx_opt;
+	      else if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_slow))
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx_slow;
+	      else
+		*(ElfW(Addr) *) (got + 2)
+		  = (ElfW(Addr)) &_dl_runtime_resolve_avx;
+	    }
 	  else
 	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse;
 	}
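
The hunk above picks one of six resolvers for GOT[2]: the new _opt variants (tied by the comments in dl-trampoline.h below to XGETBV with ECX == 1 support), the new _dl_runtime_resolve_avx_slow on AVX machines without that facility, and the plain AVX512/AVX/SSE resolvers otherwise.  A standalone C sketch of that selection order follows; it is not glibc code, and has_feature() plus the string names merely stand in for HAS_ARCH_FEATURE and the real trampoline addresses.

#include <stdio.h>

enum feature { AVX512F_Usable, AVX_Usable,
               Use_dl_runtime_resolve_opt, Use_dl_runtime_resolve_slow };

/* Stub: the real macro reads the saved CPU feature bits.  */
static int has_feature (enum feature f) { (void) f; return 0; }

static const char *
pick_resolver (void)
{
  if (has_feature (AVX512F_Usable))
    return has_feature (Use_dl_runtime_resolve_opt)
           ? "_dl_runtime_resolve_avx512_opt"
           : "_dl_runtime_resolve_avx512";
  if (has_feature (AVX_Usable))
    {
      if (has_feature (Use_dl_runtime_resolve_opt))
        return "_dl_runtime_resolve_avx_opt";
      if (has_feature (Use_dl_runtime_resolve_slow))
        return "_dl_runtime_resolve_avx_slow";
      return "_dl_runtime_resolve_avx";
    }
  return "_dl_runtime_resolve_sse";
}

int
main (void)
{
  puts (pick_resolver ());  /* prints the SSE fallback with all stubs off */
  return 0;
}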
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 12f1a5cf84..39f595e1e1 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -18,6 +18,7 @@
 
 #include <config.h>
 #include <sysdep.h>
+#include <cpu-features.h>
 #include <link-defines.h>
 
 #ifndef DL_STACK_ALIGNMENT
@@ -86,9 +87,11 @@
 #endif
 #define VEC(i)			zmm##i
 #define _dl_runtime_resolve	_dl_runtime_resolve_avx512
+#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx512_opt
 #define _dl_runtime_profile	_dl_runtime_profile_avx512
 #include "dl-trampoline.h"
 #undef _dl_runtime_resolve
+#undef _dl_runtime_resolve_opt
 #undef _dl_runtime_profile
 #undef VEC
 #undef VMOV
@@ -104,9 +107,11 @@
 #endif
 #define VEC(i)			ymm##i
 #define _dl_runtime_resolve	_dl_runtime_resolve_avx
+#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx_opt
 #define _dl_runtime_profile	_dl_runtime_profile_avx
 #include "dl-trampoline.h"
 #undef _dl_runtime_resolve
+#undef _dl_runtime_resolve_opt
 #undef _dl_runtime_profile
 #undef VEC
 #undef VMOV
@@ -126,3 +131,18 @@
 #define _dl_runtime_profile	_dl_runtime_profile_sse
 #undef RESTORE_AVX
 #include "dl-trampoline.h"
+#undef _dl_runtime_resolve
+#undef _dl_runtime_profile
+#undef VMOV
+#undef VMOVA
+
+/* Used by _dl_runtime_resolve_avx_opt/_dl_runtime_resolve_avx512_opt
+   to preserve the full vector registers with zero upper bits.  */
+#define VMOVA			vmovdqa
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
+# define VMOV			vmovdqa
+#else
+# define VMOV			vmovdqu
+#endif
+#define _dl_runtime_resolve	_dl_runtime_resolve_sse_vex
+#include "dl-trampoline.h"
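
The fourth inclusion added above stamps out _dl_runtime_resolve_sse_vex, which saves only %xmm0 - %xmm7 but with VEX-encoded moves, so restoring the registers leaves the upper bits zero instead of provoking an SSE transition.  The VMOV choice repeats the pattern used for the other variants: vmovdqa is safe only when the save area is known to be VEC_SIZE-aligned (a realigned stack, or vectors no wider than DL_STACK_ALIGNMENT); otherwise vmovdqu must be used.  A small standalone C sketch of that aligned-versus-unaligned distinction, using SSE2 intrinsics rather than glibc's macros:

#include <emmintrin.h>
#include <stdio.h>

int
main (void)
{
  _Alignas (16) char buf[32];
  __m128i v = _mm_set1_epi8 (0x5a);

  /* Like vmovdqa: the address must be 16-byte aligned or it faults.  */
  _mm_store_si128 ((__m128i *) buf, v);
  /* Like vmovdqu: any address is accepted.  */
  _mm_storeu_si128 ((__m128i *) (buf + 1), v);

  printf ("two 16-byte stores done\n");
  return 0;
}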
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index 8161f96b94..d6c7f989b5 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -50,6 +50,105 @@
 #endif
 
 	.text
+#ifdef _dl_runtime_resolve_opt
+/* Use the smallest vector registers to preserve the full YMM/ZMM
+   registers to avoid SSE transition penalty.  */
+
+# if VEC_SIZE == 32
+/* Check if the upper 128 bits in %ymm0 - %ymm7 registers are non-zero
+   and preserve %xmm0 - %xmm7 registers with the zero upper bits.  Since
+   there is no SSE transition penalty on AVX512 processors which don't
+   support XGETBV with ECX == 1, _dl_runtime_resolve_avx512_slow isn't
+   provided.   */
+	.globl _dl_runtime_resolve_avx_slow
+	.hidden _dl_runtime_resolve_avx_slow
+	.type _dl_runtime_resolve_avx_slow, @function
+	.align 16
+_dl_runtime_resolve_avx_slow:
+	cfi_startproc
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
+	vorpd %ymm0, %ymm1, %ymm8
+	vorpd %ymm2, %ymm3, %ymm9
+	vorpd %ymm4, %ymm5, %ymm10
+	vorpd %ymm6, %ymm7, %ymm11
+	vorpd %ymm8, %ymm9, %ymm9
+	vorpd %ymm10, %ymm11, %ymm10
+	vpcmpeqd %xmm8, %xmm8, %xmm8
+	vorpd %ymm9, %ymm10, %ymm10
+	vptest %ymm10, %ymm8
+	# Preserve %ymm0 - %ymm7 registers if the upper 128 bits of any
+	# %ymm0 - %ymm7 registers aren't zero.
+	PRESERVE_BND_REGS_PREFIX
+	jnc _dl_runtime_resolve_avx
+	# Use vzeroupper to avoid SSE transition penalty.
+	vzeroupper
+	# Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits
+	# when the upper 128 bits of %ymm0 - %ymm7 registers are zero.
+	PRESERVE_BND_REGS_PREFIX
+	jmp _dl_runtime_resolve_sse_vex
+	cfi_adjust_cfa_offset(-16) # Restore PLT adjustment
+	cfi_endproc
+	.size _dl_runtime_resolve_avx_slow, .-_dl_runtime_resolve_avx_slow
+# endif
+
+/* Use XGETBV with ECX == 1 to check which bits in vector registers are
+   non-zero and only preserve the non-zero lower bits with zero upper
+   bits.  */
+	.globl _dl_runtime_resolve_opt
+	.hidden _dl_runtime_resolve_opt
+	.type _dl_runtime_resolve_opt, @function
+	.align 16
+_dl_runtime_resolve_opt:
+	cfi_startproc
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
+	pushq %rax
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%rax, 0)
+	pushq %rcx
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%rcx, 0)
+	pushq %rdx
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%rdx, 0)
+	movl $1, %ecx
+	xgetbv
+	movl %eax, %r11d
+	popq %rdx
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore (%rdx)
+	popq %rcx
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore (%rcx)
+	popq %rax
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore (%rax)
+# if VEC_SIZE == 32
+	# For YMM registers, check if YMM state is in use.
+	andl $bit_YMM_state, %r11d
+	# Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits if
+	# YMM state isn't in use.
+	PRESERVE_BND_REGS_PREFIX
+	jz _dl_runtime_resolve_sse_vex
+# elif VEC_SIZE == 64
+	# For ZMM registers, check if YMM state and ZMM state are in
+	# use.
+	andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d
+	cmpl $bit_YMM_state, %r11d
+	# Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
+	# neither YMM state nor ZMM state is in use.
+	PRESERVE_BND_REGS_PREFIX
+	jl _dl_runtime_resolve_sse_vex
+	# Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if
+	# ZMM state isn't in use.
+	PRESERVE_BND_REGS_PREFIX
+	je _dl_runtime_resolve_avx
+# else
+#  error Unsupported VEC_SIZE!
+# endif
+	cfi_adjust_cfa_offset(-16) # Restore PLT adjustment
+	cfi_endproc
+	.size _dl_runtime_resolve_opt, .-_dl_runtime_resolve_opt
+#endif
 	.globl _dl_runtime_resolve
 	.hidden _dl_runtime_resolve
 	.type _dl_runtime_resolve, @function
@@ -164,7 +263,10 @@ _dl_runtime_resolve:
 	.size _dl_runtime_resolve, .-_dl_runtime_resolve
 
 
-#ifndef PROF
+/* To preserve %xmm0 - %xmm7 registers, dl-trampoline.h is included
+   twice, for _dl_runtime_resolve_sse and _dl_runtime_resolve_sse_vex.
+   But we don't need another _dl_runtime_profile for XMM registers.  */
+#if !defined PROF && defined _dl_runtime_profile
 # if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
 #  error LR_VECTOR_OFFSET must be a multiple of VEC_SIZE
 # endif
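
Two standalone C sketches (not glibc code) of the checks performed by the entry points added in the dl-trampoline.h hunk above.  First, _dl_runtime_resolve_avx_slow ORs the eight %ymm argument registers together and uses vptest so that the carry flag is set exactly when every upper 128-bit lane is zero; the same test with AVX intrinsics (compile with -mavx):

#include <immintrin.h>
#include <stdio.h>

/* Return 1 iff the upper 128 bits of a and b are all zero, mirroring
   the vorpd/vptest sequence in _dl_runtime_resolve_avx_slow.  */
static int
upper_bits_zero (__m256d a, __m256d b)
{
  __m256d acc = _mm256_or_pd (a, b);  /* like the vorpd reduction tree */
  __m128i hi = _mm_castpd_si128 (_mm256_extractf128_pd (acc, 1));
  return _mm_testz_si128 (hi, hi);    /* like the vptest check */
}

int
main (void)
{
  __m256d low_only = _mm256_insertf128_pd (_mm256_setzero_pd (),
                                           _mm_set1_pd (1.0), 0);
  __m256d full = _mm256_set1_pd (1.0);

  printf ("upper zero: %d\n", upper_bits_zero (low_only, low_only)); /* 1 */
  printf ("upper live: %d\n", upper_bits_zero (full, full));         /* 0 */
  return 0;
}

Second, the _opt entry points ask the processor directly: XGETBV with ECX == 1 returns XCR0 masked by the XINUSE state-component bitmap, in which bit_YMM_state and bit_ZMM0_15_state correspond to bits 2 and 6.  A hypothetical sketch of the same dispatch, using the raw instruction encoding so no special assembler support is needed; run it only on CPUs that report XGETBV-with-ECX == 1 support in CPUID.(EAX=0DH,ECX=1):EAX[2]:

#include <stdint.h>
#include <stdio.h>

#define bit_YMM_state     (1 << 2)
#define bit_ZMM0_15_state (1 << 6)

/* XGETBV with ECX == 1: read the in-use state bitmap.  */
static uint32_t
xinuse (void)
{
  uint32_t eax, edx;
  __asm__ volatile (".byte 0x0f, 0x01, 0xd0"  /* xgetbv */
                    : "=a" (eax), "=d" (edx) : "c" (1));
  return eax;
}

int
main (void)
{
  uint32_t mask = xinuse ();
  if (mask & bit_ZMM0_15_state)
    puts ("ZMM state in use: preserve the full zmm registers");
  else if (mask & bit_YMM_state)
    puts ("YMM state in use: preserve the ymm registers only");
  else
    puts ("upper state unused: preserve the xmm registers only");
  return 0;
}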