path: root/sysdeps/x86_64/dl-trampoline.S
author    Igor Zamyatin <igor.zamyatin@intel.com>    2014-03-13 11:10:22 -0700
committer H.J. Lu <hjl.tools@gmail.com>              2014-03-13 11:19:08 -0700
commit    2d63a517e4084ec80403cd9f278690fa8b676cc4 (patch)
tree      d5d5dd025d9a59bd41fedf9c16799271b5dbb722 /sysdeps/x86_64/dl-trampoline.S
parent    44c4e5d598bfcbb309f05ceb7a57ab02662e7f34 (diff)
Save and restore AVX-512 zmm registers to x86-64 ld.so
The AVX-512 ISA adds 512-bit zmm registers.  This patch updates
_dl_runtime_profile to pass zmm registers to run-time audit modules.
It also changes _dl_x86_64_save_sse and _dl_x86_64_restore_sse to
support zmm registers; these functions are called only when
RTLD_PREPARE_FOREIGN_CALL is used.  The performance impact is minimal.
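
For reference, the CPUID/XGETBV sequence the patch adds in assembly can
be summarized in C.  The sketch below is an illustration only (the names
zmm_state_enabled and xgetbv0 are made up, not part of the patch); it
mirrors the checks added to _dl_runtime_profile: OSXSAVE and AVX in
CPUID.1:ECX, AVX512F in CPUID.(EAX=7,ECX=0):EBX bit 16, and finally the
0xe6 mask applied to XCR0.

	#include <cpuid.h>	/* __cpuid, __cpuid_count, __get_cpuid_max */
	#include <stdio.h>

	/* Read XCR0 (the XFEATURE_ENABLED_MASK register) via XGETBV, ECX = 0.  */
	static unsigned long long
	xgetbv0 (void)
	{
	  unsigned int lo, hi;
	  __asm__ ("xgetbv" : "=a" (lo), "=d" (hi) : "c" (0));
	  return ((unsigned long long) hi << 32) | lo;
	}

	static int
	zmm_state_enabled (void)
	{
	  unsigned int eax, ebx, ecx, edx;

	  if (__get_cpuid_max (0, 0) < 7)
	    return 0;

	  /* CPUID.1:ECX bit 27 = OSXSAVE, bit 28 = AVX.  */
	  __cpuid (1, eax, ebx, ecx, edx);
	  if ((ecx & ((1u << 28) | (1u << 27))) != ((1u << 28) | (1u << 27)))
	    return 0;

	  /* CPUID.(EAX=7,ECX=0):EBX bit 16 = AVX512F.  */
	  __cpuid_count (7, 0, eax, ebx, ecx, edx);
	  if ((ebx & (1u << 16)) == 0)
	    return 0;

	  /* XCR0[7:5] = '111b' and XCR0[2:1] = '11b', i.e. mask 0xe6:
	     SSE, AVX, opmask and zmm state are all enabled by the OS.  */
	  return (xgetbv0 () & 0xe6) == 0xe6;
	}

	int
	main (void)
	{
	  printf ("zmm state enabled: %d\n", zmm_state_enabled ());
	  return 0;
	}

In the trampoline itself the result is cached in L(have_avx), so the
CPUID/XGETBV work is done only on the first call.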

	* config.h.in (HAVE_AVX512_SUPPORT): New #undef.
	(HAVE_AVX512_ASM_SUPPORT): Likewise.
	* sysdeps/x86_64/bits/link.h (La_x86_64_zmm): New.
	(La_x86_64_vector): Add zmm.
	* sysdeps/x86_64/Makefile (tests): Add tst-audit10.
	(modules-names): Add tst-auditmod10a and tst-auditmod10b.
	($(objpfx)tst-audit10): New target.
	($(objpfx)tst-audit10.out): Likewise.
	(tst-audit10-ENV): New.
	(AVX512-CFLAGS): Likewise.
	(CFLAGS-tst-audit10.c): Likewise.
	(CFLAGS-tst-auditmod10a.c): Likewise.
	(CFLAGS-tst-auditmod10b.c): Likewise.
	* sysdeps/x86_64/configure.ac: Set config-cflags-avx512,
	HAVE_AVX512_SUPPORT and HAVE_AVX512_ASM_SUPPORT.
	* sysdeps/x86_64/configure: Regenerated.
	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Add
	AVX-512 zmm register support.
	(_dl_x86_64_save_sse): Likewise.
	(_dl_x86_64_restore_sse): Likewise.
	* sysdeps/x86_64/dl-trampoline.h: Updated to support different
	size vector registers.
	* sysdeps/x86_64/link-defines.sym (YMM_SIZE): New.
	(ZMM_SIZE): Likewise.
	* sysdeps/x86_64/tst-audit10.c: New file.
	* sysdeps/x86_64/tst-auditmod10a.c: Likewise.
	* sysdeps/x86_64/tst-auditmod10b.c: Likewise.
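
The point of threading zmm values through La_x86_64_regs is that an
LD_AUDIT module can now observe full 512-bit vector arguments in its
la_x86_64_gnu_pltenter hook.  A minimal sketch of such a module follows;
it is hand-written for illustration (it is not the tst-auditmod10a/b
code added by the patch) and assumes the new zmm member of
La_x86_64_vector:

	/* Hypothetical audit module; build as a shared object and load
	   with LD_AUDIT.  The hook names and signatures are the standard
	   rtld-audit interface declared via <link.h>.  */
	#include <link.h>
	#include <stdint.h>
	#include <stdio.h>

	unsigned int
	la_version (unsigned int version)
	{
	  return version;
	}

	Elf64_Addr
	la_x86_64_gnu_pltenter (Elf64_Sym *sym, unsigned int ndx,
				uintptr_t *refcook, uintptr_t *defcook,
				La_x86_64_regs *regs, unsigned int *flags,
				const char *symname, long int *framesizep)
	{
	  /* After this patch, on AVX-512 hardware regs->lr_vector[0]
	     carries all 64 bytes of zmm0, the first vector argument
	     register; on AVX-only hardware just the ymm0 half of it is
	     meaningful.  */
	  printf ("pltenter: %s\n", symname);
	  return sym->st_value;
	}

A module like this is built with something along the lines of
gcc -shared -fPIC -o auditmod.so auditmod.c and loaded with
LD_AUDIT=./auditmod.so; the tst-audit10 test added by the patch
exercises this same interface with AVX-512 values.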
Diffstat (limited to 'sysdeps/x86_64/dl-trampoline.S')
-rw-r--r--  sysdeps/x86_64/dl-trampoline.S | 122
1 file changed, 104 insertions(+), 18 deletions(-)
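
Note how the diff below avoids duplicating the save/restore template:
dl-trampoline.h is written in terms of VMOV and VEC(i), and the includer
instantiates it once per vector width (vmovdqu/ymm for AVX, vmovdqu64/zmm
for AVX-512).  The same token-pasting mechanism, reduced to a
self-contained C demo (SAVE_VEC and S_ are invented for illustration;
the VMOV and VEC names match the patch):

	#include <stdio.h>

	/* Stand-in for dl-trampoline.h: emits one save instruction in
	   terms of whatever VMOV and VEC(i) currently expand to.  */
	#define SAVE_VEC(i) \
	  printf ("\t%s %%%s, %d(%%rsp)\n", VMOV, VEC (i), 64 * (i))

	#define S_(x)   #x		/* stringize helper, demo only */
	#define VMOV    "vmovdqu64"	/* AVX-512 instantiation */
	#define VEC(i)  S_ (zmm##i)

	int
	main (void)
	{
	  SAVE_VEC (0);	/* prints: vmovdqu64 %zmm0, 0(%rsp) */
	  SAVE_VEC (1);	/* prints: vmovdqu64 %zmm1, 64(%rsp) */
	  return 0;
	}

In the real file the expansion happens when the C preprocessor runs over
dl-trampoline.S, so a template line such as VMOV %VEC(0), ... becomes
vmovdqu64 %zmm0, ... in the AVX-512 instantiation and vmovdqu %ymm0, ...
in the AVX one.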
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index ae38677e13..77c4d0f147 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -96,7 +96,7 @@ _dl_runtime_profile:
 
 	/* Actively align the La_x86_64_regs structure.  */
 	andq $0xfffffffffffffff0, %rsp
-# ifdef HAVE_AVX_SUPPORT
+# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
 	/* sizeof(La_x86_64_regs).  Need extra space for 8 SSE registers
 	   to detect if any xmm0-xmm7 registers are changed by audit
 	   module.  */
@@ -130,7 +130,7 @@ _dl_runtime_profile:
 	movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
 	movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
 
-# ifdef HAVE_AVX_SUPPORT
+# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
 	.data
 L(have_avx):
 	.zero 4
@@ -138,7 +138,7 @@ L(have_avx):
 	.previous
 
 	cmpl	$0, L(have_avx)(%rip)
-	jne	1f
+	jne	L(defined)
 	movq	%rbx, %r11		# Save rbx
 	movl	$1, %eax
 	cpuid
@@ -147,18 +147,54 @@ L(have_avx):
 	// AVX and XSAVE supported?
 	andl	$((1 << 28) | (1 << 27)), %ecx
 	cmpl	$((1 << 28) | (1 << 27)), %ecx
-	jne	2f
+	jne	10f
+#  ifdef HAVE_AVX512_ASM_SUPPORT
+	// AVX512 supported in processor?
+	movq	%rbx, %r11		# Save rbx
+	xorl	%ecx, %ecx
+	mov	$0x7, %eax
+	cpuid
+	andl	$(1 << 16), %ebx
+#  endif
 	xorl	%ecx, %ecx
 	// Get XFEATURE_ENABLED_MASK
 	xgetbv
-	andl	$0x6, %eax
-2:	subl	$0x5, %eax
+#  ifdef HAVE_AVX512_ASM_SUPPORT
+	test	%ebx, %ebx
+	movq	%r11, %rbx		# Restore rbx
+	je	20f
+	// Verify that XCR0[7:5] = '111b' and
+	// XCR0[2:1] = '11b' which means
+	// that zmm state is enabled
+	andl	$0xe6, %eax
+	cmpl	$0xe6, %eax
+	jne	20f
+	movl	%eax, L(have_avx)(%rip)
+L(avx512):
+#   define RESTORE_AVX
+#   define VMOV    vmovdqu64
+#   define VEC(i)  zmm##i
+#   define MORE_CODE
+#   include "dl-trampoline.h"
+#   undef VMOV
+#   undef VEC
+#   undef RESTORE_AVX
+#  endif
+20:	andl	$0x6, %eax
+10:	subl	$0x5, %eax
 	movl	%eax, L(have_avx)(%rip)
 	cmpl	$0, %eax
 
-1:	js	L(no_avx)
+L(defined):
+	js	L(no_avx)
+#  ifdef HAVE_AVX512_ASM_SUPPORT
+	cmpl	$0xe6, L(have_avx)(%rip)
+	je	L(avx512)
+#  endif
 
 #  define RESTORE_AVX
+#  define VMOV    vmovdqu
+#  define VEC(i)  ymm##i
 #  define MORE_CODE
 #  include "dl-trampoline.h"
 
@@ -180,9 +216,9 @@ L(no_avx):
 	.align 16
 	cfi_startproc
 _dl_x86_64_save_sse:
-# ifdef HAVE_AVX_SUPPORT
+# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
 	cmpl	$0, L(have_avx)(%rip)
-	jne	1f
+	jne	L(defined_5)
 	movq	%rbx, %r11		# Save rbx
 	movl	$1, %eax
 	cpuid
@@ -191,21 +227,43 @@ _dl_x86_64_save_sse:
 	// AVX and XSAVE supported?
 	andl	$((1 << 28) | (1 << 27)), %ecx
 	cmpl	$((1 << 28) | (1 << 27)), %ecx
-	jne	2f
+	jne	1f
+#  ifdef HAVE_AVX512_ASM_SUPPORT
+	// AVX512 supported in a processor?
+	movq	%rbx, %r11              # Save rbx
+	xorl	%ecx,%ecx
+	mov	$0x7,%eax
+	cpuid
+	andl	$(1 << 16), %ebx
+#  endif
 	xorl	%ecx, %ecx
 	// Get XFEATURE_ENABLED_MASK
 	xgetbv
-	andl	$0x6, %eax
-	cmpl	$0x6, %eax
-	// Nonzero if SSE and AVX state saving is enabled.
-	sete	%al
-2:	leal	-1(%eax,%eax), %eax
+#  ifdef HAVE_AVX512_ASM_SUPPORT
+	test	%ebx, %ebx
+	movq	%r11, %rbx		# Restore rbx
+	je	2f
+	// Verify that XCR0[7:5] = '111b' and
+	// XCR0[2:1] = '11b' which means
+	// that zmm state is enabled
+	andl	$0xe6, %eax
+	movl	%eax, L(have_avx)(%rip)
+	cmpl	$0xe6, %eax
+	je	L(avx512_5)
+#  endif
+
+2:	andl	$0x6, %eax
+1:	subl	$0x5, %eax
 	movl	%eax, L(have_avx)(%rip)
 	cmpl	$0, %eax
 
-1:	js	L(no_avx5)
+L(defined_5):
+	js	L(no_avx5)
+#  ifdef HAVE_AVX512_ASM_SUPPORT
+	cmpl	$0xe6, L(have_avx)(%rip)
+	je	L(avx512_5)
+#  endif
 
-#  define YMM_SIZE 32
 	vmovdqa %ymm0, %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE
 	vmovdqa %ymm1, %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE
 	vmovdqa %ymm2, %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE
@@ -215,6 +273,18 @@ _dl_x86_64_save_sse:
 	vmovdqa %ymm6, %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE
 	vmovdqa %ymm7, %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE
 	ret
+#  ifdef HAVE_AVX512_ASM_SUPPORT
+L(avx512_5):
+	vmovdqu64 %zmm0, %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE
+	vmovdqu64 %zmm1, %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE
+	vmovdqu64 %zmm2, %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE
+	vmovdqu64 %zmm3, %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE
+	vmovdqu64 %zmm4, %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE
+	vmovdqu64 %zmm5, %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE
+	vmovdqu64 %zmm6, %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE
+	vmovdqu64 %zmm7, %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE
+	ret
+#  endif
 L(no_avx5):
 # endif
 	movdqa	%xmm0, %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE
@@ -235,9 +305,13 @@ L(no_avx5):
 	.align 16
 	cfi_startproc
 _dl_x86_64_restore_sse:
-# ifdef HAVE_AVX_SUPPORT
+# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
 	cmpl	$0, L(have_avx)(%rip)
 	js	L(no_avx6)
+#  ifdef HAVE_AVX512_ASM_SUPPORT
+	cmpl	$0xe6, L(have_avx)(%rip)
+	je	L(avx512_6)
+#  endif
 
 	vmovdqa %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE, %ymm0
 	vmovdqa %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE, %ymm1
@@ -248,6 +322,18 @@ _dl_x86_64_restore_sse:
 	vmovdqa %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE, %ymm6
 	vmovdqa %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE, %ymm7
 	ret
+#  ifdef HAVE_AVX512_ASM_SUPPORT
+L(avx512_6):
+	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE, %zmm0
+	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE, %zmm1
+	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE, %zmm2
+	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE, %zmm3
+	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE, %zmm4
+	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE, %zmm5
+	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE, %zmm6
+	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE, %zmm7
+	ret
+#  endif
 L(no_avx6):
 # endif
 	movdqa	%fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE, %xmm0