about summary refs log tree commit diff
path: root/sysdeps/x86_64/nptl
diff options
context:
space:
mode:
authorH.J. Lu <hjl.tools@gmail.com>2015-08-25 04:33:54 -0700
committerH.J. Lu <hjl.tools@gmail.com>2015-08-25 04:34:13 -0700
commitf3dcae82d54e5097e18e1d6ef4ff55c2ea4e621e (patch)
treee43395ded84bf0aa25ecbe2d395082a182aae8b7 /sysdeps/x86_64/nptl
parent2d02fd07371bcd492c320cec649c6265787d794a (diff)
downloadglibc-f3dcae82d54e5097e18e1d6ef4ff55c2ea4e621e.tar.gz
glibc-f3dcae82d54e5097e18e1d6ef4ff55c2ea4e621e.tar.xz
glibc-f3dcae82d54e5097e18e1d6ef4ff55c2ea4e621e.zip
Save and restore vector registers in x86-64 ld.so
This patch adds SSE, AVX and AVX512 versions of _dl_runtime_resolve
and _dl_runtime_profile, which save and restore the first 8 vector
registers used for parameter passing.  elf_machine_runtime_setup
selects the proper _dl_runtime_resolve or _dl_runtime_profile based
on _dl_x86_cpu_features.  It avoids race condition caused by
FOREIGN_CALL macros, which are only used for x86-64.

Performance impact of saving and restoring 8 vector registers are
negligible on Nehalem, Sandy Bridge, Ivy Bridge and Haswell when
ld.so is optimized with SSE2.

	[BZ #15128]
	* sysdeps/x86_64/Makefile [$(subdir) == elf] (tests): Add
	ifuncmain8.
	(modules-names): Add ifuncmod8.
	($(objpfx)ifuncmain8): New rule.
	* sysdeps/x86_64/dl-machine.h: Include <dl-procinfo.h> and
	<cpuid.h>.
	(elf_machine_runtime_setup): Use _dl_runtime_resolve_sse,
	_dl_runtime_resolve_avx, or _dl_runtime_resolve_avx512,
	_dl_runtime_profile_sse, _dl_runtime_profile_avx, or
	_dl_runtime_profile_avx512, based on HAS_ARCH_FEATURE.
	* sysdeps/x86_64/dl-trampoline.S: Rewrite.
	* sysdeps/x86_64/dl-trampoline.h: Likewise.
	* sysdeps/x86_64/ifuncmain8.c: New file.
	* sysdeps/x86_64/ifuncmod8.c: Likewise.
	* sysdeps/x86_64/nptl/tcb-offsets.sym (RTLD_SAVESPACE_SSE):
	Removed.
	* sysdeps/x86_64/nptl/tls.h (__128bits): Removed.
	(tcbhead_t): Change rtld_must_xmm_save to __glibc_unused1.
	Change rtld_savespace_sse to __glibc_unused2.
	(RTLD_CHECK_FOREIGN_CALL): Removed.
	(RTLD_ENABLE_FOREIGN_CALL): Likewise.
	(RTLD_PREPARE_FOREIGN_CALL): Likewise.
	(RTLD_FINALIZE_FOREIGN_CALL): Likewise.
Diffstat (limited to 'sysdeps/x86_64/nptl')
-rw-r--r--sysdeps/x86_64/nptl/tcb-offsets.sym1
-rw-r--r--sysdeps/x86_64/nptl/tls.h42
2 files changed, 4 insertions, 39 deletions
diff --git a/sysdeps/x86_64/nptl/tcb-offsets.sym b/sysdeps/x86_64/nptl/tcb-offsets.sym
index 729d1da38f..aeb752673a 100644
--- a/sysdeps/x86_64/nptl/tcb-offsets.sym
+++ b/sysdeps/x86_64/nptl/tcb-offsets.sym
@@ -16,7 +16,6 @@ VGETCPU_CACHE_OFFSET	offsetof (tcbhead_t, vgetcpu_cache)
 #ifndef __ASSUME_PRIVATE_FUTEX
 PRIVATE_FUTEX		offsetof (tcbhead_t, private_futex)
 #endif
-RTLD_SAVESPACE_SSE	offsetof (tcbhead_t, rtld_savespace_sse)
 
 -- Not strictly offsets, but these values are also used in the TCB.
 TCB_CANCELSTATE_BITMASK	 CANCELSTATE_BITMASK
diff --git a/sysdeps/x86_64/nptl/tls.h b/sysdeps/x86_64/nptl/tls.h
index d7543c651f..b73e7edf6c 100644
--- a/sysdeps/x86_64/nptl/tls.h
+++ b/sysdeps/x86_64/nptl/tls.h
@@ -67,14 +67,15 @@ typedef struct
 # else
   int __glibc_reserved1;
 # endif
-  int rtld_must_xmm_save;
+  int __glibc_unused1;
   /* Reservation of some values for the TM ABI.  */
   void *__private_tm[4];
   /* GCC split stack support.  */
   void *__private_ss;
   long int __glibc_reserved2;
-  /* Have space for the post-AVX register size.  */
-  __128bits rtld_savespace_sse[8][4] __attribute__ ((aligned (32)));
+  /* Must be kept even if it is no longer used by glibc since programs,
+     like AddressSanitizer, depend on the size of tcbhead_t.  */
+  __128bits __glibc_unused2[8][4] __attribute__ ((aligned (32)));
 
   void *__padding[8];
 } tcbhead_t;
@@ -384,41 +385,6 @@ typedef struct
 # define THREAD_GSCOPE_WAIT() \
   GL(dl_wait_lookup_done) ()
 
-
-# ifdef SHARED
-/* Defined in dl-trampoline.S.  */
-extern void _dl_x86_64_save_sse (void);
-extern void _dl_x86_64_restore_sse (void);
-
-# define RTLD_CHECK_FOREIGN_CALL \
-  (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) != 0)
-
-/* NB: Don't use the xchg operation because that would imply a lock
-   prefix which is expensive and unnecessary.  The cache line is also
-   not contested at all.  */
-#  define RTLD_ENABLE_FOREIGN_CALL \
-  int old_rtld_must_xmm_save = THREAD_GETMEM (THREAD_SELF,		      \
-					      header.rtld_must_xmm_save);     \
-  THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 1)
-
-#  define RTLD_PREPARE_FOREIGN_CALL \
-  do if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save))	      \
-    {									      \
-      _dl_x86_64_save_sse ();						      \
-      THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0);	      \
-    }									      \
-  while (0)
-
-#  define RTLD_FINALIZE_FOREIGN_CALL \
-  do {									      \
-    if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) == 0)	      \
-      _dl_x86_64_restore_sse ();					      \
-    THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save,		      \
-		   old_rtld_must_xmm_save);				      \
-  } while (0)
-# endif
-
-
 #endif /* __ASSEMBLER__ */
 
 #endif	/* tls.h */