about summary refs log tree commit diff
path: root/sysdeps/x86_64/dl-trampoline.h
diff options
context:
space:
mode:
authorH.J. Lu <hjl.tools@gmail.com>2015-08-25 04:33:54 -0700
committerH.J. Lu <hjl.tools@gmail.com>2015-08-25 04:34:13 -0700
commitf3dcae82d54e5097e18e1d6ef4ff55c2ea4e621e (patch)
treee43395ded84bf0aa25ecbe2d395082a182aae8b7 /sysdeps/x86_64/dl-trampoline.h
parent2d02fd07371bcd492c320cec649c6265787d794a (diff)
downloadglibc-f3dcae82d54e5097e18e1d6ef4ff55c2ea4e621e.tar.gz
glibc-f3dcae82d54e5097e18e1d6ef4ff55c2ea4e621e.tar.xz
glibc-f3dcae82d54e5097e18e1d6ef4ff55c2ea4e621e.zip
Save and restore vector registers in x86-64 ld.so
This patch adds SSE, AVX and AVX512 versions of _dl_runtime_resolve
and _dl_runtime_profile, which save and restore the first 8 vector
registers used for parameter passing.  elf_machine_runtime_setup
selects the proper _dl_runtime_resolve or _dl_runtime_profile based
on _dl_x86_cpu_features.  It avoids race condition caused by
FOREIGN_CALL macros, which are only used for x86-64.

Performance impact of saving and restoring 8 vector registers are
negligible on Nehalem, Sandy Bridge, Ivy Bridge and Haswell when
ld.so is optimized with SSE2.

	[BZ #15128]
	* sysdeps/x86_64/Makefile [$(subdir) == elf] (tests): Add
	ifuncmain8.
	(modules-names): Add ifuncmod8.
	($(objpfx)ifuncmain8): New rule.
	* sysdeps/x86_64/dl-machine.h: Include <dl-procinfo.h> and
	<cpuid.h>.
	(elf_machine_runtime_setup): Use _dl_runtime_resolve_sse,
	_dl_runtime_resolve_avx, or _dl_runtime_resolve_avx512,
	_dl_runtime_profile_sse, _dl_runtime_profile_avx, or
	_dl_runtime_profile_avx512, based on HAS_ARCH_FEATURE.
	* sysdeps/x86_64/dl-trampoline.S: Rewrite.
	* sysdeps/x86_64/dl-trampoline.h: Likewise.
	* sysdeps/x86_64/ifuncmain8.c: New file.
	* sysdeps/x86_64/ifuncmod8.c: Likewise.
	* sysdeps/x86_64/nptl/tcb-offsets.sym (RTLD_SAVESPACE_SSE):
	Removed.
	* sysdeps/x86_64/nptl/tls.h (__128bits): Removed.
	(tcbhead_t): Change rtld_must_xmm_save to __glibc_unused1.
	Change rtld_savespace_sse to __glibc_unused2.
	(RTLD_CHECK_FOREIGN_CALL): Removed.
	(RTLD_ENABLE_FOREIGN_CALL): Likewise.
	(RTLD_PREPARE_FOREIGN_CALL): Likewise.
	(RTLD_FINALIZE_FOREIGN_CALL): Likewise.
Diffstat (limited to 'sysdeps/x86_64/dl-trampoline.h')
-rw-r--r--sysdeps/x86_64/dl-trampoline.h366
1 files changed, 299 insertions, 67 deletions
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index d542428ac2..dd6d7c7835 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -1,5 +1,4 @@
-/* Partial PLT profile trampoline to save and restore x86-64 vector
-   registers.
+/* PLT trampolines.  x86-64 version.
    Copyright (C) 2009-2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -17,16 +16,252 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifdef RESTORE_AVX
+#undef REGISTER_SAVE_AREA_RAW
+#ifdef __ILP32__
+/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to
+   VEC7.  */
+# define REGISTER_SAVE_AREA_RAW	(8 * 7 + VEC_SIZE * 8)
+#else
+/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
+   BND0, BND1, BND2, BND3 and VEC0 to VEC7. */
+# define REGISTER_SAVE_AREA_RAW	(8 * 7 + 16 * 4 + VEC_SIZE * 8)
+#endif
+
+#undef REGISTER_SAVE_AREA
+#undef LOCAL_STORAGE_AREA
+#undef BASE
+#if DL_RUNIME_RESOLVE_REALIGN_STACK
+# define REGISTER_SAVE_AREA	(REGISTER_SAVE_AREA_RAW + 8)
+/* Local stack area before jumping to function address: RBX.  */
+# define LOCAL_STORAGE_AREA	8
+# define BASE			rbx
+# if (REGISTER_SAVE_AREA % VEC_SIZE) != 0
+#  error REGISTER_SAVE_AREA must be multples of VEC_SIZE
+# endif
+#else
+# define REGISTER_SAVE_AREA	REGISTER_SAVE_AREA_RAW
+/* Local stack area before jumping to function address:  All saved
+   registers.  */
+# define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
+# define BASE			rsp
+# if (REGISTER_SAVE_AREA % 16) != 8
+#  error REGISTER_SAVE_AREA must be odd multples of 8
+# endif
+#endif
+
+	.text
+	.globl _dl_runtime_resolve
+	.hidden _dl_runtime_resolve
+	.type _dl_runtime_resolve, @function
+	.align 16
+	cfi_startproc
+_dl_runtime_resolve:
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
+#if DL_RUNIME_RESOLVE_REALIGN_STACK
+# if LOCAL_STORAGE_AREA != 8
+#  error LOCAL_STORAGE_AREA must be 8
+# endif
+	pushq %rbx			# push subtracts stack by 8.
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%rbx, 0)
+	mov %RSP_LP, %RBX_LP
+	cfi_def_cfa_register(%rbx)
+	and $-VEC_SIZE, %RSP_LP
+#endif
+	sub $REGISTER_SAVE_AREA, %RSP_LP
+	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
+	# Preserve registers otherwise clobbered.
+	movq %rax, REGISTER_SAVE_RAX(%rsp)
+	movq %rcx, REGISTER_SAVE_RCX(%rsp)
+	movq %rdx, REGISTER_SAVE_RDX(%rsp)
+	movq %rsi, REGISTER_SAVE_RSI(%rsp)
+	movq %rdi, REGISTER_SAVE_RDI(%rsp)
+	movq %r8, REGISTER_SAVE_R8(%rsp)
+	movq %r9, REGISTER_SAVE_R9(%rsp)
+	VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp)
+	VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp)
+	VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp)
+	VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp)
+	VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp)
+	VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp)
+	VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp)
+	VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp)
+#ifndef __ILP32__
+	# We also have to preserve bound registers.  These are nops if
+	# Intel MPX isn't available or disabled.
+# ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
+	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
+	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
+	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
+# else
+#  if REGISTER_SAVE_BND0 == 0
+	.byte 0x66,0x0f,0x1b,0x04,0x24
+#  else
+	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
+#  endif
+	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
+	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
+	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
+# endif
+#endif
+	# Copy args pushed by PLT in register.
+	# %rdi: link_map, %rsi: reloc_index
+	mov (LOCAL_STORAGE_AREA + 8)(%BASE), %RSI_LP
+	mov LOCAL_STORAGE_AREA(%BASE), %RDI_LP
+	call _dl_fixup		# Call resolver.
+	mov %RAX_LP, %R11_LP	# Save return value
+#ifndef __ILP32__
+	# Restore bound registers.  These are nops if Intel MPX isn't
+	# avaiable or disabled.
+# ifdef HAVE_MPX_SUPPORT
+	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
+	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
+	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
+	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
+# else
+	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
+	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
+	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
+#  if REGISTER_SAVE_BND0 == 0
+	.byte 0x66,0x0f,0x1a,0x04,0x24
+#  else
+	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
+#  endif
+# endif
+#endif
+	# Get register content back.
+	movq REGISTER_SAVE_R9(%rsp), %r9
+	movq REGISTER_SAVE_R8(%rsp), %r8
+	movq REGISTER_SAVE_RDI(%rsp), %rdi
+	movq REGISTER_SAVE_RSI(%rsp), %rsi
+	movq REGISTER_SAVE_RDX(%rsp), %rdx
+	movq REGISTER_SAVE_RCX(%rsp), %rcx
+	movq REGISTER_SAVE_RAX(%rsp), %rax
+	VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7)
+#if DL_RUNIME_RESOLVE_REALIGN_STACK
+	mov %RBX_LP, %RSP_LP
+	cfi_def_cfa_register(%rsp)
+	movq (%rsp), %rbx
+	cfi_restore(%rbx)
+#endif
+	# Adjust stack(PLT did 2 pushes)
+	add $(LOCAL_STORAGE_AREA + 16), %RSP_LP
+	cfi_adjust_cfa_offset(-(LOCAL_STORAGE_AREA + 16))
+	# Preserve bound registers.
+	PRESERVE_BND_REGS_PREFIX
+	jmp *%r11		# Jump to function address.
+	cfi_endproc
+	.size _dl_runtime_resolve, .-_dl_runtime_resolve
+
+
+#ifndef PROF
+# if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
+#  error LR_VECTOR_OFFSET must be multples of VEC_SIZE
+# endif
+
+	.globl _dl_runtime_profile
+	.hidden _dl_runtime_profile
+	.type _dl_runtime_profile, @function
+	.align 16
+_dl_runtime_profile:
+	cfi_startproc
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
+	/* The La_x86_64_regs data structure pointed to by the
+	   fourth paramater must be VEC_SIZE-byte aligned.  This must
+	   be explicitly enforced.  We have the set up a dynamically
+	   sized stack frame.  %rbx points to the top half which
+	   has a fixed size and preserves the original stack pointer.  */
+
+	sub $32, %RSP_LP	# Allocate the local storage.
+	cfi_adjust_cfa_offset(32)
+	movq %rbx, (%rsp)
+	cfi_rel_offset(%rbx, 0)
+
+	/* On the stack:
+		56(%rbx)	parameter #1
+		48(%rbx)	return address
+
+		40(%rbx)	reloc index
+		32(%rbx)	link_map
+
+		24(%rbx)	La_x86_64_regs pointer
+		16(%rbx)	framesize
+		 8(%rbx)	rax
+		  (%rbx)	rbx
+	*/
+
+	movq %rax, 8(%rsp)
+	mov %RSP_LP, %RBX_LP
+	cfi_def_cfa_register(%rbx)
+
+	/* Actively align the La_x86_64_regs structure.  */
+	and $-VEC_SIZE, %RSP_LP
+# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
+	/* sizeof(La_x86_64_regs).  Need extra space for 8 SSE registers
+	   to detect if any xmm0-xmm7 registers are changed by audit
+	   module.  */
+	sub $(LR_SIZE + XMM_SIZE*8), %RSP_LP
+# else
+	sub $LR_SIZE, %RSP_LP		# sizeof(La_x86_64_regs)
+# endif
+	movq %rsp, 24(%rbx)
+
+	/* Fill the La_x86_64_regs structure.  */
+	movq %rdx, LR_RDX_OFFSET(%rsp)
+	movq %r8,  LR_R8_OFFSET(%rsp)
+	movq %r9,  LR_R9_OFFSET(%rsp)
+	movq %rcx, LR_RCX_OFFSET(%rsp)
+	movq %rsi, LR_RSI_OFFSET(%rsp)
+	movq %rdi, LR_RDI_OFFSET(%rsp)
+	movq %rbp, LR_RBP_OFFSET(%rsp)
+
+	lea 48(%rbx), %RAX_LP
+	movq %rax, LR_RSP_OFFSET(%rsp)
+
+	/* We always store the XMM registers even if AVX is available.
+	   This is to provide backward binary compatibility for existing
+	   audit modules.  */
+	movaps %xmm0,		   (LR_XMM_OFFSET)(%rsp)
+	movaps %xmm1, (LR_XMM_OFFSET +   XMM_SIZE)(%rsp)
+	movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
+	movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
+	movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
+	movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
+	movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
+	movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
+
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, 		   (LR_BND_OFFSET)(%rsp)  # Preserve bound
+	bndmov %bnd1, (LR_BND_OFFSET +   BND_SIZE)(%rsp)  # registers. Nops if
+	bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp)  # MPX not available
+	bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp)  # or disabled.
+#  else
+	.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
+	.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
+	.byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
+	.byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+#  endif
+# endif
+
+# ifdef RESTORE_AVX
 	/* This is to support AVX audit modules.  */
-	VMOV %VEC(0),		      (LR_VECTOR_OFFSET)(%rsp)
-	VMOV %VEC(1), (LR_VECTOR_OFFSET +   VECTOR_SIZE)(%rsp)
-	VMOV %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
-	VMOV %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
-	VMOV %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
-	VMOV %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
-	VMOV %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
-	VMOV %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+	VMOVA %VEC(0),		      (LR_VECTOR_OFFSET)(%rsp)
+	VMOVA %VEC(1), (LR_VECTOR_OFFSET +   VECTOR_SIZE)(%rsp)
+	VMOVA %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
+	VMOVA %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
+	VMOVA %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
+	VMOVA %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
+	VMOVA %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
+	VMOVA %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
 
 	/* Save xmm0-xmm7 registers to detect if any of them are
 	   changed by audit module.  */
@@ -38,7 +273,7 @@
 	vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp)
 	vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp)
 	vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp)
-#endif
+# endif
 
 	mov %RSP_LP, %RCX_LP	# La_x86_64_regs pointer to %rcx.
 	mov 48(%rbx), %RDX_LP	# Load return address if needed.
@@ -63,7 +298,7 @@
 	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
 	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
 
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* Check if any xmm0-xmm7 registers are changed by audit
 	   module.  */
 	vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
@@ -72,7 +307,7 @@
 	je 2f
 	vmovdqa	%xmm0, (LR_VECTOR_OFFSET)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
+2:	VMOVA (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
 	vmovdqa	%xmm0, (LR_XMM_OFFSET)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
@@ -81,7 +316,7 @@
 	je 2f
 	vmovdqa	%xmm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
 	vmovdqa	%xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
@@ -90,7 +325,7 @@
 	je 2f
 	vmovdqa	%xmm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
 	vmovdqa	%xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
@@ -99,7 +334,7 @@
 	je 2f
 	vmovdqa	%xmm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
 	vmovdqa	%xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
@@ -108,7 +343,7 @@
 	je 2f
 	vmovdqa	%xmm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
 	vmovdqa	%xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
@@ -117,7 +352,7 @@
 	je 2f
 	vmovdqa	%xmm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
 	vmovdqa	%xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
@@ -126,7 +361,7 @@
 	je 2f
 	vmovdqa	%xmm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
 	vmovdqa	%xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
@@ -135,25 +370,25 @@
 	je 2f
 	vmovdqa	%xmm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7)
 	vmovdqa	%xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
 
 1:
-#endif
+# endif
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov              (LR_BND_OFFSET)(%rsp), %bnd0  # Restore bound
 	bndmov (LR_BND_OFFSET +   BND_SIZE)(%rsp), %bnd1  # registers.
 	bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
 	bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
-# else
+#  else
 	.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
 	.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
 	.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
 	.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+#  endif
 # endif
-#endif
 
 	mov  16(%rbx), %R10_LP	# Anything in framesize?
 	test %R10_LP, %R10_LP
@@ -168,12 +403,12 @@
 	movq LR_RSI_OFFSET(%rsp), %rsi
 	movq LR_RDI_OFFSET(%rsp), %rdi
 
-	movq %rbx, %rsp
+	mov %RBX_LP, %RSP_LP
 	movq (%rsp), %rbx
-	cfi_restore(rbx)
+	cfi_restore(%rbx)
 	cfi_def_cfa_register(%rsp)
 
-	addq $48, %rsp		# Adjust the stack to the return value
+	add $48, %RSP_LP	# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
 	PRESERVE_BND_REGS_PREFIX
@@ -189,13 +424,13 @@
 	   temporary buffer of the size specified by the 'framesize'
 	   returned from _dl_profile_fixup */
 
-	leaq LR_RSP_OFFSET(%rbx), %rsi	# stack
-	addq $8, %r10
-	andq $0xfffffffffffffff0, %r10
-	movq %r10, %rcx
-	subq %r10, %rsp
-	movq %rsp, %rdi
-	shrq $3, %rcx
+	lea LR_RSP_OFFSET(%rbx), %RSI_LP # stack
+	add $8, %R10_LP
+	and $-16, %R10_LP
+	mov %R10_LP, %RCX_LP
+	sub %R10_LP, %RSP_LP
+	mov %RSP_LP, %RDI_LP
+	shr $3, %RCX_LP
 	rep
 	movsq
 
@@ -206,21 +441,21 @@
 	PRESERVE_BND_REGS_PREFIX
 	call *%r11
 
-	mov 24(%rbx), %rsp	# Drop the copied stack content
+	mov 24(%rbx), %RSP_LP	# Drop the copied stack content
 
 	/* Now we have to prepare the La_x86_64_retval structure for the
 	   _dl_call_pltexit.  The La_x86_64_regs is being pointed by rsp now,
 	   so we just need to allocate the sizeof(La_x86_64_retval) space on
 	   the stack, since the alignment has already been taken care of. */
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* sizeof(La_x86_64_retval).  Need extra space for 2 SSE
 	   registers to detect if xmm0/xmm1 registers are changed
 	   by audit module.  */
-	subq $(LRV_SIZE + XMM_SIZE*2), %rsp
-#else
-	subq $LRV_SIZE, %rsp	# sizeof(La_x86_64_retval)
-#endif
-	movq %rsp, %rcx		# La_x86_64_retval argument to %rcx.
+	sub $(LRV_SIZE + XMM_SIZE*2), %RSP_LP
+# else
+	sub $LRV_SIZE, %RSP_LP	# sizeof(La_x86_64_retval)
+# endif
+	mov %RSP_LP, %RCX_LP	# La_x86_64_retval argument to %rcx.
 
 	/* Fill in the La_x86_64_retval structure.  */
 	movq %rax, LRV_RAX_OFFSET(%rcx)
@@ -229,26 +464,26 @@
 	movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
 	movaps %xmm1, LRV_XMM1_OFFSET(%rcx)
 
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* This is to support AVX audit modules.  */
-	VMOV %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
-	VMOV %VEC(1), LRV_VECTOR1_OFFSET(%rcx)
+	VMOVA %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
+	VMOVA %VEC(1), LRV_VECTOR1_OFFSET(%rcx)
 
 	/* Save xmm0/xmm1 registers to detect if they are changed
 	   by audit module.  */
 	vmovdqa %xmm0,		  (LRV_SIZE)(%rcx)
 	vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
-#endif
+# endif
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov %bnd0, LRV_BND0_OFFSET(%rcx)  # Preserve returned bounds.
 	bndmov %bnd1, LRV_BND1_OFFSET(%rcx)
-# else
+#  else
 	.byte  0x66,0x0f,0x1b,0x81;.long (LRV_BND0_OFFSET)
 	.byte  0x66,0x0f,0x1b,0x89;.long (LRV_BND1_OFFSET)
+#  endif
 # endif
-#endif
 
 	fstpt LRV_ST0_OFFSET(%rcx)
 	fstpt LRV_ST1_OFFSET(%rcx)
@@ -265,50 +500,47 @@
 	movaps LRV_XMM0_OFFSET(%rsp), %xmm0
 	movaps LRV_XMM1_OFFSET(%rsp), %xmm1
 
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* Check if xmm0/xmm1 registers are changed by audit module.  */
 	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
 	vpmovmskb %xmm2, %esi
 	cmpl $0xffff, %esi
 	jne 1f
-	VMOV LRV_VECTOR0_OFFSET(%rsp), %VEC(0)
+	VMOVA LRV_VECTOR0_OFFSET(%rsp), %VEC(0)
 
 1:	vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
 	vpmovmskb %xmm2, %esi
 	cmpl $0xffff, %esi
 	jne 1f
-	VMOV LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
+	VMOVA LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
 
 1:
-#endif
+# endif
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov LRV_BND0_OFFSET(%rsp), %bnd0  # Restore bound registers.
 	bndmov LRV_BND1_OFFSET(%rsp), %bnd1
-# else
+#  else
 	.byte  0x66,0x0f,0x1a,0x84,0x24;.long (LRV_BND0_OFFSET)
 	.byte  0x66,0x0f,0x1a,0x8c,0x24;.long (LRV_BND1_OFFSET)
+#  endif
 # endif
-#endif
 
 	fldt LRV_ST1_OFFSET(%rsp)
 	fldt LRV_ST0_OFFSET(%rsp)
 
-	movq %rbx, %rsp
+	mov %RBX_LP, %RSP_LP
 	movq (%rsp), %rbx
-	cfi_restore(rbx)
+	cfi_restore(%rbx)
 	cfi_def_cfa_register(%rsp)
 
-	addq $48, %rsp		# Adjust the stack to the return value
+	add $48, %RSP_LP	# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
 	PRESERVE_BND_REGS_PREFIX
 	retq
 
-#ifdef MORE_CODE
-	cfi_adjust_cfa_offset(48)
-	cfi_rel_offset(%rbx, 0)
-	cfi_def_cfa_register(%rbx)
-# undef MORE_CODE
+	cfi_endproc
+	.size _dl_runtime_profile, .-_dl_runtime_profile
 #endif