From d7bd7a8ae8cdb3f1414b1e032759d9ef324eb040 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@redhat.com>
Date: Wed, 15 Jul 2009 17:41:36 -0700
Subject: Secure AVX changes for auditing code.

The original AVX patch used a function pointer to handle the difference
between machines with and without AVX support.  This is insecure.  A
well-placed memory exploit could lead to redirection of the execution.
Using a variable and several tests is a bit slower but cannot be
exploited in this way.
---
 sysdeps/x86_64/dl-trampoline.S | 327 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 295 insertions(+), 32 deletions(-)

(limited to 'sysdeps/x86_64/dl-trampoline.S')

diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index f605351f30..2f55639662 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -96,9 +96,9 @@ _dl_runtime_profile:
 	   to detect if any xmm0-xmm7 registers are changed by audit
 	   module.  */
 	subq $(LR_SIZE + XMM_SIZE*8), %rsp
-#else
+# else
 	subq $LR_SIZE, %rsp		# sizeof(La_x86_64_regs)
-#endif
+# endif
 	movq %rsp, 24(%rbx)
 
 	/* Fill the La_x86_64_regs structure.  */
@@ -110,45 +110,308 @@ _dl_runtime_profile:
 	movq %rdi, LR_RDI_OFFSET(%rsp)
 	movq %rbp, LR_RBP_OFFSET(%rsp)
 
+	leaq 48(%rbx), %rax
+	movq %rax, LR_RSP_OFFSET(%rsp)
+
+	/* We always store the XMM registers even if AVX is available.
+	   This is to provide backward binary compatibility for existing
+	   audit modules.  */
+	movaps %xmm0,		   (LR_XMM_OFFSET)(%rsp)
+	movaps %xmm1, (LR_XMM_OFFSET +   XMM_SIZE)(%rsp)
+	movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
+	movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
+	movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
+	movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
+	movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
+	movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
+
 # ifdef HAVE_AVX_SUPPORT
-	jmp *L(save_and_restore_vector)(%rip)
+	.data
+L(have_avx):
+	.zero 4
+	.size L(have_avx), 4
+	.previous
 
-	.align 16
-L(save_and_restore_vector_sse):
+	cmpl	$0, L(have_avx)(%rip)	# 0 = not probed yet, >0 = AVX, <0 = no AVX
+	jne	1f			# Already probed: flags from cmpl feed the js at 1:
+	movq	%rbx, %r11		# Save rbx
+	movl	$1, %eax		# CPUID leaf 1
+	cpuid
+	movq	%r11,%rbx		# Restore rbx
+	movl	$1, %eax		# Assume AVX: value to cache is 1
+	testl	$(1 << 28), %ecx	# ECX bit 28: AVX is available
+	jne	2f
+	negl	%eax			# No AVX: cache -1 instead
+2:	movl	%eax, L(have_avx)(%rip)	# Store into the flag tested above (was the undefined L(have_eax))
+	cmpl	$0, %eax		# Set SF from the cached value for the js below
+
+1:	js	L(no_avx1)
+
+	/* This is to support AVX audit modules.  */
+	vmovdqu %ymm0,		      (LR_VECTOR_OFFSET)(%rsp)
+	vmovdqu %ymm1, (LR_VECTOR_OFFSET +   VECTOR_SIZE)(%rsp)
+	vmovdqu %ymm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
+	vmovdqu %ymm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
+	vmovdqu %ymm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
+	vmovdqu %ymm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
+	vmovdqu %ymm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
+	vmovdqu %ymm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+
+	/* Save xmm0-xmm7 registers to detect if any of them are
+	   changed by audit module.  */
+	vmovdqa %xmm0,		    (LR_SIZE)(%rsp)
+	vmovdqa %xmm1, (LR_SIZE +   XMM_SIZE)(%rsp)
+	vmovdqa %xmm2, (LR_SIZE + XMM_SIZE*2)(%rsp)
+	vmovdqa %xmm3, (LR_SIZE + XMM_SIZE*3)(%rsp)
+	vmovdqa %xmm4, (LR_SIZE + XMM_SIZE*4)(%rsp)
+	vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp)
+	vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp)
+	vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp)
+
+L(no_avx1):
 # endif
 
-# define MOVXMM movaps
-# include "dl-trampoline.h"
+	movq %rsp, %rcx		# La_x86_64_regs pointer to %rcx.
+	movq 48(%rbx), %rdx	# Load return address if needed.
+	movq 40(%rbx), %rsi	# Copy args pushed by PLT in register.
+	movq 32(%rbx), %rdi	# %rdi: link_map, %rsi: reloc_index
+	leaq 16(%rbx), %r8
+	call _dl_profile_fixup	# Call resolver.
+
+	movq %rax, %r11		# Save return value.
+
+	movq 8(%rbx), %rax	# Get back register content.
+	movq LR_RDX_OFFSET(%rsp), %rdx
+	movq  LR_R8_OFFSET(%rsp), %r8
+	movq  LR_R9_OFFSET(%rsp), %r9
 
 # ifdef HAVE_AVX_SUPPORT
-#  undef  MOVXMM
-#  define MOVXMM vmovdqa
-#  define RESTORE_AVX
-	.align 16
-L(save_and_restore_vector_avx):
-#  include "dl-trampoline.h"
+	cmpl	$0, L(have_avx)(%rip)
+	js	L(no_avx2)
+
+	/* An xmm register changed by the audit module keeps its new value;
+	   an unchanged one is reloaded as the full saved ymm register.  */
+	vmovdqa (LR_XMM_OFFSET)(%rsp), %xmm0	# VEX load also clears ymm0 bits 255:128
+	vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm1	# Compare with the copy saved before the call
+	vpmovmskb %xmm1, %esi
+	cmpl $0xffff, %esi			# 0xffff: all 16 bytes equal
+	jne 1f					# Changed by audit module: keep new xmm0
+	vmovdqu			(LR_VECTOR_OFFSET)(%rsp), %ymm0
+
+1:	vmovdqa (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
+	vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
+	vpmovmskb %xmm2, %esi
+	cmpl $0xffff, %esi
+	jne 1f					# Changed: keep new xmm1
+	vmovdqu	  (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1
+
+1:	vmovdqa (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
+	vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm3
+	vpmovmskb %xmm3, %esi
+	cmpl $0xffff, %esi
+	jne 1f					# Changed: keep new xmm2
+	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2
+
+1:	vmovdqa (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
+	vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm4
+	vpmovmskb %xmm4, %esi
+	cmpl $0xffff, %esi
+	jne 1f					# Changed: keep new xmm3
+	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3
+
+1:	vmovdqa (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
+	vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm5
+	vpmovmskb %xmm5, %esi
+	cmpl $0xffff, %esi
+	jne 1f					# Changed: keep new xmm4
+	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4
+
+1:	vmovdqa (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
+	vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm6
+	vpmovmskb %xmm6, %esi
+	cmpl $0xffff, %esi
+	jne 1f					# Changed: keep new xmm5
+	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5
+
+1:	vmovdqa (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
+	vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm7
+	vpmovmskb %xmm7, %esi
+	cmpl $0xffff, %esi
+	jne 1f					# Changed: keep new xmm6
+	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6
+
+1:	vmovdqa (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
+	vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	jne 1f					# Changed: keep new xmm7
+	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7
+	jmp	1f				# Skip the non-AVX restore path
+
+L(no_avx2):				# Reached only when AVX is absent: VEX-encoded moves would fault here
+	movaps		    (LR_XMM_OFFSET)(%rsp), %xmm0	# Reload argument registers with SSE moves
+	movaps	 (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
+	movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
+	movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
+	movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
+	movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
+	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
+	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
+
+1:
+# else
+	movaps		    (LR_XMM_OFFSET)(%rsp), %xmm0
+	movaps	 (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
+	movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
+	movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
+	movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
+	movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
+	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
+	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
 # endif
 
-	cfi_endproc
-	.size _dl_runtime_profile, .-_dl_runtime_profile
+	movq 16(%rbx), %r10	# Anything in framesize?
+	testq %r10, %r10
+	jns 3f
+
+	/* There's nothing in the frame size, so there
+	   will be no call to the _dl_call_pltexit. */
+
+	/* Get back registers content.  */
+	movq LR_RCX_OFFSET(%rsp), %rcx
+	movq LR_RSI_OFFSET(%rsp), %rsi
+	movq LR_RDI_OFFSET(%rsp), %rdi
 
+	movq %rbx, %rsp
+	movq (%rsp), %rbx
+	cfi_restore(rbx)
+	cfi_def_cfa_register(%rsp)
+
+	addq $48, %rsp		# Adjust the stack to the return value
+				# (eats the reloc index and link_map)
+	cfi_adjust_cfa_offset(-48)
+	jmp *%r11		# Jump to function address.
+
+3:
+	cfi_adjust_cfa_offset(48)
+	cfi_rel_offset(%rbx, 0)
+	cfi_def_cfa_register(%rbx)
+
+	/* At this point we need to prepare new stack for the function
+	   which has to be called.  We copy the original stack to a
+	   temporary buffer of the size specified by the 'framesize'
+	   returned from _dl_profile_fixup */
+
+	leaq LR_RSP_OFFSET(%rbx), %rsi	# stack
+	addq $8, %r10
+	andq $0xfffffffffffffff0, %r10
+	movq %r10, %rcx
+	subq %r10, %rsp
+	movq %rsp, %rdi
+	shrq $3, %rcx
+	rep
+	movsq
+
+	movq 24(%rdi), %rcx	# Get back register content.
+	movq 32(%rdi), %rsi
+	movq 40(%rdi), %rdi
+
+	call *%r11
+
+	mov 24(%rbx), %rsp	# Drop the copied stack content
+
+	/* Now we have to prepare the La_x86_64_retval structure for the
+	   _dl_call_pltexit.  The La_x86_64_regs is pointed to by rsp now,
+	   so we just need to allocate the sizeof(La_x86_64_retval) space on
+	   the stack, since the alignment has already been taken care of. */
 # ifdef HAVE_AVX_SUPPORT
-L(check_avx):
-	mov	%rbx,%r11		# Save rbx
-	movl	$1, %eax
-	cpuid
-	mov	%r11,%rbx		# Restore rbx
-	leaq    L(save_and_restore_vector_sse)(%rip), %rax
-	andl	$(1 << 28), %ecx	# Check if AVX is available.
-	jz	L(ret)
-	leaq    L(save_and_restore_vector_avx)(%rip), %rax
-L(ret):
-	movq	%rax,L(save_and_restore_vector)(%rip)
-	jmp	*%rax
-
-	.section .data.rel.local,"aw",@progbits
-	.align	8
-L(save_and_restore_vector):
-	.quad L(check_avx)
+	/* sizeof(La_x86_64_retval).  Need extra space for 2 SSE
+	   registers to detect if xmm0/xmm1 registers are changed
+	   by audit module.  */
+	subq $(LRV_SIZE + XMM_SIZE*2), %rsp
+# else
+	subq $LRV_SIZE, %rsp	# sizeof(La_x86_64_retval)
+# endif
+	movq %rsp, %rcx		# La_x86_64_retval argument to %rcx.
+
+	/* Fill in the La_x86_64_retval structure.  */
+	movq %rax, LRV_RAX_OFFSET(%rcx)
+	movq %rdx, LRV_RDX_OFFSET(%rcx)
+
+	movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
+	movaps %xmm1, LRV_XMM1_OFFSET(%rcx)
+
+# ifdef HAVE_AVX_SUPPORT
+	cmpl	$0, L(have_avx)(%rip)
+	js	L(no_avx3)
+
+	/* This is to support AVX audit modules.  */
+	vmovdqu %ymm0, LRV_VECTOR0_OFFSET(%rcx)
+	vmovdqu %ymm1, LRV_VECTOR1_OFFSET(%rcx)
+
+	/* Save xmm0/xmm1 registers to detect if they are changed
+	   by audit module.  */
+	vmovdqa %xmm0,		  (LRV_SIZE)(%rcx)
+	vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
+
+L(no_avx3):
 # endif
+
+	fstpt LRV_ST0_OFFSET(%rcx)
+	fstpt LRV_ST1_OFFSET(%rcx)
+
+	movq 24(%rbx), %rdx	# La_x86_64_regs argument to %rdx.
+	movq 40(%rbx), %rsi	# Copy args pushed by PLT in register.
+        movq 32(%rbx), %rdi	# %rdi: link_map, %rsi: reloc_index
+	call _dl_call_pltexit
+
+	/* Restore return registers.  */
+	movq LRV_RAX_OFFSET(%rsp), %rax
+	movq LRV_RDX_OFFSET(%rsp), %rdx
+
+# ifdef HAVE_AVX_SUPPORT
+	cmpl	$0, L(have_avx)(%rip)
+	js	L(no_avx4)
+
+	/* Keep xmm0/xmm1 if the audit module changed them, else reload ymm.  */
+	vmovdqa LRV_XMM0_OFFSET(%rsp), %xmm0	# VEX load also clears ymm0 bits 255:128
+	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm1	# Compare with the copy saved before _dl_call_pltexit
+	vpmovmskb %xmm1, %esi
+	cmpl $0xffff, %esi			# 0xffff: all 16 bytes equal
+	jne 1f					# Changed by audit module: keep new xmm0
+	vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0
+
+1:	vmovdqa LRV_XMM1_OFFSET(%rsp), %xmm1
+	vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
+	vpmovmskb %xmm2, %esi
+	cmpl $0xffff, %esi
+	jne 1f					# Changed by audit module: keep new xmm1
+	vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1
+	jmp 1f					# Skip the non-AVX restore path
+
+L(no_avx4):				# Reached only when AVX is absent: VEX-encoded moves would fault here
+	movaps LRV_XMM0_OFFSET(%rsp), %xmm0	# Reload return registers with SSE moves
+	movaps LRV_XMM1_OFFSET(%rsp), %xmm1
+
+1:
+# else
+	movaps LRV_XMM0_OFFSET(%rsp), %xmm0
+	movaps LRV_XMM1_OFFSET(%rsp), %xmm1
+# endif
+
+	fldt LRV_ST1_OFFSET(%rsp)
+	fldt LRV_ST0_OFFSET(%rsp)
+
+	movq %rbx, %rsp
+	movq (%rsp), %rbx
+	cfi_restore(rbx)
+	cfi_def_cfa_register(%rsp)
+
+	addq $48, %rsp		# Adjust the stack to the return value
+				# (eats the reloc index and link_map)
+	cfi_adjust_cfa_offset(-48)
+	retq
+
+	cfi_endproc
+	.size _dl_runtime_profile, .-_dl_runtime_profile
 #endif
-- 
cgit 1.4.1