28 files changed, 471 insertions, 67 deletions
diff --git a/ChangeLog b/ChangeLog index dd2060112a..51562e088e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,49 @@ +2009-07-29 Ulrich Drepper <drepper@redhat.com> + + * math/s_fma.c: Don't define alias if __fma is a macro. + * math/s_fmaf.c: Likewise. + * sysdeps/x86_64/multiarch/s_fma.c: New file. + * sysdeps/x86_64/multiarch/s_fmaf.c: New file. + Partially based on a patch by H.J. Lu <hongjiu.lu@intel.com>. + + * sysdeps/x86_64/multiarch/init-arch.h (__get_cpu_features): Declare. + (HAS_POPCOUNT, HAS_SSE4_2): Add variants which work outside libc. + New macro HAS_FMA. + * sysdeps/x86_64/multiarch/init-arch.c (__get_cpu_features): New + function. + * include/libc-symbols.h (libm_ifunc): Define. + * sysdeps/x86_64/multiarch/Versions: New file. + + * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Improve CFI. + +2009-07-28 H.J. Lu <hongjiu.lu@intel.com> + + * sysdeps/x86_64/dl-trampoline.S: Properly restore AVX registers. + +2009-07-29 Ulrich Drepper <drepper@redhat.com> + + * elf/dl-runtime.c (_dl_fixup): Indicate before _dl_lookup_symbol_x + call that registers used in calling conventions need to be preserved. + * elf/dl-lookup.c (do_lookup_x): Use RTLD_*_FOREIGN_CALL macros + to preserve register content if necessary. + * sysdeps/x86_64/dl-trampoline.S (_dl_x86_64_save_sse): New function. + (_dl_x86_64_restore_sse): New function. + * sysdeps/x86_64/tst-xmmymm.sh: There is now one more function that + is allowed to modify xmm/ymm registers. + + * stdio-common/scanf15.c: Undefine _LIBC. We want to test from an + application's perspective. + * stdio-common/scanf17.c: Likewise. + +2009-07-28 Ulrich Drepper <drepper@redhat.com> + + * csu/libc-tls.c (__libc_setup_tls) [TLS_TCB_AT_TP]: Don't add TCB + size to memsz. + (init_static_tls) [TLS_TCB_AT_TP]: Add it to GL(dl_tls_static_size) + here. + * elf/dl-reloc.c (_dl_try_allocate_static_tls): Compute freebytes in + two steps to catch bugs. + 2009-07-27 Ulrich Drepper <drepper@redhat.com> * sysdeps/x86_64/tst-xmmymm.sh: Refine testing. The script now diff --git a/csu/libc-tls.c b/csu/libc-tls.c index 0d240ccef9..5a49942861 100644 --- a/csu/libc-tls.c +++ b/csu/libc-tls.c @@ -1,5 +1,5 @@ /* Initialization code for TLS in statically linked application. - Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc. + Copyright (C) 2002-2006, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -99,6 +99,9 @@ init_static_tls (size_t memsz, size_t align) surplus that permits dynamic loading of modules with IE-model TLS. */ GL(dl_tls_static_size) = roundup (memsz + GL(dl_tls_static_size), TLS_TCB_ALIGN); +#if TLS_TCB_AT_TP + GL(dl_tls_static_size) += TLS_TCB_SIZE; +#endif GL(dl_tls_static_used) = memsz; /* The alignment requirement for the static TLS block. */ GL(dl_tls_static_align) = align; @@ -211,9 +214,7 @@ __libc_setup_tls (size_t tcbsize, size_t tcbalign) memsz = roundup (memsz, align ?: 1); -#if TLS_TCB_AT_TP - memsz += tcbsize; -#elif TLS_DTV_AT_TP +#if TLS_DTV_AT_TP memsz += tcb_offset; #endif diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c index 1d68d67a35..56724c9b4d 100644 --- a/elf/dl-lookup.c +++ b/elf/dl-lookup.c @@ -380,6 +380,10 @@ do_lookup_x (const char *undef_name, uint_fast32_t new_hash, if (size * 3 <= tab->n_elements * 4) { /* Expand the table. */ +#ifdef RTLD_CHECK_FOREIGN_CALL + /* This must not happen during runtime relocations. 
*/ + assert (!RTLD_CHECK_FOREIGN_CALL); +#endif size_t newsize = _dl_higher_prime_number (size + 1); struct unique_sym *newentries = calloc (sizeof (struct unique_sym), newsize); @@ -405,6 +409,11 @@ do_lookup_x (const char *undef_name, uint_fast32_t new_hash, } else { +#ifdef RTLD_CHECK_FOREIGN_CALL + /* This must not happen during runtime relocations. */ + assert (!RTLD_CHECK_FOREIGN_CALL); +#endif + #define INITIAL_NUNIQUE_SYM_TABLE 31 size = INITIAL_NUNIQUE_SYM_TABLE; entries = calloc (sizeof (struct unique_sym), size); @@ -600,6 +609,10 @@ add_dependency (struct link_map *undef_map, struct link_map *map, int flags) unsigned int max = undef_map->l_reldepsmax ? undef_map->l_reldepsmax * 2 : 10; +#ifdef RTLD_PREPARE_FOREIGN_CALL + RTLD_PREPARE_FOREIGN_CALL; +#endif + newp = malloc (sizeof (*newp) + max * sizeof (struct link_map *)); if (newp == NULL) { diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c index 28f08de3e7..680caadd65 100644 --- a/elf/dl-reloc.c +++ b/elf/dl-reloc.c @@ -61,7 +61,10 @@ _dl_try_allocate_static_tls (struct link_map *map) size_t n; size_t blsize; - freebytes = GL(dl_tls_static_size) - GL(dl_tls_static_used) - TLS_TCB_SIZE; + freebytes = GL(dl_tls_static_size) - GL(dl_tls_static_used); + if (freebytes < TLS_TCB_SIZE) + goto fail; + freebytes -= TLS_TCB_SIZE; blsize = map->l_tls_blocksize + map->l_tls_firstbyte_offset; if (freebytes < blsize) diff --git a/elf/dl-runtime.c b/elf/dl-runtime.c index 0eb7d4e3b9..a52120d121 100644 --- a/elf/dl-runtime.c +++ b/elf/dl-runtime.c @@ -111,6 +111,10 @@ _dl_fixup ( flags |= DL_LOOKUP_GSCOPE_LOCK; } +#ifdef RTLD_ENABLE_FOREIGN_CALL + RTLD_ENABLE_FOREIGN_CALL; +#endif + result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym, l->l_scope, version, ELF_RTYPE_CLASS_PLT, flags, NULL); @@ -118,6 +122,10 @@ _dl_fixup ( if (!RTLD_SINGLE_THREAD_P) THREAD_GSCOPE_RESET_FLAG (); +#ifdef RTLD_FINALIZE_FOREIGN_CALL + RTLD_FINALIZE_FOREIGN_CALL; +#endif + /* Currently result contains the base load address (or link map) of the object that defines sym. Now add in the symbol offset. */ diff --git a/include/libc-symbols.h b/include/libc-symbols.h index 68da77c58e..252141eb01 100644 --- a/include/libc-symbols.h +++ b/include/libc-symbols.h @@ -1,6 +1,6 @@ /* Support macros for making weak and strong aliases for symbols, and for using symbol sets and linker warnings with GNU ld. - Copyright (C) 1995-1998, 2000-2006, 2008 Free Software Foundation, Inc. + Copyright (C) 1995-1998,2000-2006,2008,2009 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -845,6 +845,17 @@ for linking") } \ __asm__ (".type " #name ", %gnu_indirect_function"); +/* The body of the function is supposed to use __get_cpu_features + which will, if necessary, initialize the data first. */ +#define libm_ifunc(name, expr) \ + extern void *name##_ifunc (void) __asm__ (#name); \ + void *name##_ifunc (void) \ + { \ + __typeof (name) *res = expr; \ + return res; \ + } \ + __asm__ (".type " #name ", %gnu_indirect_function"); + #ifdef HAVE_ASM_SET_DIRECTIVE # define libc_ifunc_hidden_def1(local, name) \ __asm__ (declare_symbol_alias_1_stringify (ASM_GLOBAL_DIRECTIVE) \ diff --git a/math/s_fma.c b/math/s_fma.c index e5ff5a7228..476d1fe44c 100644 --- a/math/s_fma.c +++ b/math/s_fma.c @@ -1,5 +1,5 @@ /* Compute x * y + z as ternary operation. - Copyright (C) 1997, 2001 Free Software Foundation, Inc. + Copyright (C) 1997, 2001, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. 
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. @@ -25,7 +25,9 @@ __fma (double x, double y, double z) { return (x * y) + z; } +#ifndef __fma weak_alias (__fma, fma) +#endif #ifdef NO_LONG_DOUBLE strong_alias (__fma, __fmal) diff --git a/math/s_fmaf.c b/math/s_fmaf.c index caa7f3afe8..357296d70d 100644 --- a/math/s_fmaf.c +++ b/math/s_fmaf.c @@ -1,5 +1,5 @@ /* Compute x * y + z as ternary operation. - Copyright (C) 1997 Free Software Foundation, Inc. + Copyright (C) 1997, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. @@ -25,4 +25,6 @@ __fmaf (float x, float y, float z) { return (x * y) + z; } +#ifndef __fmaf weak_alias (__fmaf, fmaf) +#endif diff --git a/nptl/ChangeLog b/nptl/ChangeLog index e5fc474916..0046b20608 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -1,3 +1,32 @@ +2009-07-29 Ulrich Drepper <drepper@redhat.com> + + * sysdeps/x86_64/tls.h (TLS_TCB_ALIGN): Define explicitly to 32. + + * sysdeps/x86_64/tls.h (tcbhead_t): Add room for SSE registers the + dynamic linker might have to save. + Define RTLD_CHECK_FOREIGN_CALL, RTLD_ENABLE_FOREIGN_CALL, + RTLD_PREPARE_FOREIGN_CALL, and RTLD_FINALIZE_FOREIGN_CALL. Pretty + printing. + + * sysdeps/x86_64/tcb-offsets.sym: Add RTLD_SAVESPACE_SSE. + +2009-07-28 Ulrich Drepper <drepper@redhat.com> + + * pthread_mutex_lock.c [NO_INCR] (__pthread_mutex_cond_lock_adjust): + New function. + * pthreadP.h: Declare __pthread_mutex_cond_lock_adjust. + * sysdeps/unix/sysv/linux/pthread-pi-defines.sym: Add ROBUST_BIT. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S: Don't use + requeue_pi for robust mutexes. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S: Likewise. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S: Likewise. + Don't only skip __pthread_mutex_cond_lock. Call instead + __pthread_mutex_cond_lock_adjust. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S: Likewise. + + * pthread_mutex_unlock.c (__pthread_mutex_unlock_full): Minor + optimization of PI mutex handling. + 2009-07-27 Ulrich Drepper <drepper@redhat.com> [BZ #10418] diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h index ed9fc625ba..43ca44c829 100644 --- a/nptl/pthreadP.h +++ b/nptl/pthreadP.h @@ -418,6 +418,8 @@ extern int __pthread_mutex_lock_internal (pthread_mutex_t *__mutex) attribute_hidden; extern int __pthread_mutex_cond_lock (pthread_mutex_t *__mutex) attribute_hidden internal_function; +extern void __pthread_mutex_cond_lock_adjust (pthread_mutex_t *__mutex) + attribute_hidden internal_function; extern int __pthread_mutex_unlock (pthread_mutex_t *__mutex); extern int __pthread_mutex_unlock_internal (pthread_mutex_t *__mutex) attribute_hidden; diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c index 406e588fdb..50dc18803d 100644 --- a/nptl/pthread_mutex_lock.c +++ b/nptl/pthread_mutex_lock.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2007, 2008 Free Software Foundation, Inc. +/* Copyright (C) 2002-2007, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. 
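The hunk below adds __pthread_mutex_cond_lock_adjust for the requeue-PI wait paths: when a waiter returns from FUTEX_WAIT_REQUEUE_PI, the kernel has already locked the PI mutex on the woken thread's behalf, so the thread must only record its ownership instead of acquiring the lock a second time. A minimal C rendering of the control flow the rewritten pthread_cond_wait/pthread_cond_timedwait assembly implements (the two helper names are the real ones from this patch; the wrapper function is illustrative):

    #include <pthread.h>

    /* Real helpers from this patch; the wrapper below is only a sketch.  */
    extern int __pthread_mutex_cond_lock (pthread_mutex_t *mutex);
    extern void __pthread_mutex_cond_lock_adjust (pthread_mutex_t *mutex);

    static int
    cond_wait_finish (pthread_mutex_t *mutex, int kernel_locked_it)
    {
      if (kernel_locked_it)
        {
          /* Woken via FUTEX_WAIT_REQUEUE_PI: just fix up ownership.  */
          __pthread_mutex_cond_lock_adjust (mutex);
          return 0;
        }
      /* Ordinary wakeup: take the mutex ourselves.  */
      return __pthread_mutex_cond_lock (mutex);
    }
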
@@ -473,3 +473,22 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex) strong_alias (__pthread_mutex_lock, pthread_mutex_lock) strong_alias (__pthread_mutex_lock, __pthread_mutex_lock_internal) #endif + + +#ifdef NO_INCR +void +__pthread_mutex_cond_lock_adjust (mutex) + pthread_mutex_t *mutex; +{ + assert ((mutex->__data.__kind & PTHREAD_MUTEX_PRIO_INHERIT_NP) != 0); + assert ((mutex->__data.__kind & PTHREAD_MUTEX_ROBUST_NORMAL_NP) == 0); + assert ((mutex->__data.__kind & PTHREAD_MUTEX_PSHARED_BIT) == 0); + + /* Record the ownership. */ + pid_t id = THREAD_GETMEM (THREAD_SELF, tid); + mutex->__data.__owner = id; + + if (mutex->__data.__kind == PTHREAD_MUTEX_PI_RECURSIVE_NP) + ++mutex->__data.__count; +} +#endif diff --git a/nptl/pthread_mutex_unlock.c b/nptl/pthread_mutex_unlock.c index fbe8274a55..f9fe10b0f2 100644 --- a/nptl/pthread_mutex_unlock.c +++ b/nptl/pthread_mutex_unlock.c @@ -150,7 +150,7 @@ __pthread_mutex_unlock_full (pthread_mutex_t *mutex, int decr) if (--mutex->__data.__count != 0) /* We still hold the mutex. */ return 0; - goto continue_pi; + goto continue_pi_non_robust; case PTHREAD_MUTEX_PI_ROBUST_RECURSIVE_NP: /* Recursive mutex. */ @@ -173,7 +173,7 @@ __pthread_mutex_unlock_full (pthread_mutex_t *mutex, int decr) /* We still hold the mutex. */ return 0; - goto continue_pi; + goto continue_pi_robust; case PTHREAD_MUTEX_PI_ERRORCHECK_NP: case PTHREAD_MUTEX_PI_NORMAL_NP: @@ -195,9 +195,9 @@ __pthread_mutex_unlock_full (pthread_mutex_t *mutex, int decr) pi_notrecoverable: newowner = PTHREAD_MUTEX_NOTRECOVERABLE; - continue_pi: if ((mutex->__data.__kind & PTHREAD_MUTEX_ROBUST_NORMAL_NP) != 0) { + continue_pi_robust: /* Remove mutex from the list. Note: robust PI futexes are signaled by setting bit 0. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, @@ -206,6 +206,7 @@ __pthread_mutex_unlock_full (pthread_mutex_t *mutex, int decr) DEQUEUE_MUTEX (mutex); } + continue_pi_non_robust: mutex->__data.__owner = newowner; if (decr) /* One less user. */ diff --git a/nptl/sysdeps/unix/sysv/linux/pthread-pi-defines.sym b/nptl/sysdeps/unix/sysv/linux/pthread-pi-defines.sym index d985c6a79b..46fbd0de74 100644 --- a/nptl/sysdeps/unix/sysv/linux/pthread-pi-defines.sym +++ b/nptl/sysdeps/unix/sysv/linux/pthread-pi-defines.sym @@ -3,5 +3,6 @@ -- These PI macros are used by assembly code. MUTEX_KIND offsetof (pthread_mutex_t, __data.__kind) +ROBUST_BIT PTHREAD_MUTEX_ROBUST_NORMAL_NP PI_BIT PTHREAD_MUTEX_PRIO_INHERIT_NP PS_BIT PTHREAD_MUTEX_PSHARED_BIT diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S index 0f10ec910c..224a56088e 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S @@ -75,8 +75,10 @@ __pthread_cond_broadcast: jne 9f /* Requeue to a PI mutex if the PI bit is set. */ - testl $PI_BIT, MUTEX_KIND(%r8) - jne 81f + movl MUTEX_KIND(%r8), %eax + andl $(ROBUST_BIT|PI_BIT), %eax + cmpl $PI_BIT, %eax + je 81f /* Wake up all threads. */ #ifdef __ASSUME_PRIVATE_FUTEX diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S index f1050fea7c..4d001eec7f 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S @@ -64,8 +64,10 @@ __pthread_cond_signal: /* Get the address of the mutex used. 
*/ movq dep_mutex(%r8), %rcx - testl $PI_BIT, MUTEX_KIND(%rcx) - jne 9f + movl MUTEX_KIND(%rcx), %eax + andl $(ROBUST_BIT|PI_BIT), %eax + cmpl $PI_BIT, %eax + je 9f #ifdef __ASSUME_PRIVATE_FUTEX movl $(FUTEX_WAKE_OP|FUTEX_PRIVATE_FLAG), %esi diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S index 7486825d5f..4913beb8af 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S @@ -165,9 +165,12 @@ __pthread_cond_timedwait: je 60f movq dep_mutex(%rdi), %r8 - /* Requeue to a PI mutex if the PI bit is set. */ - testl $PI_BIT, MUTEX_KIND(%r8) - je 61f + /* Requeue to a non-robust PI mutex if the PI bit is set and + the robust bit is not set. */ + movl MUTEX_KIND(%r8), %eax + andl $(ROBUST_BIT|PI_BIT), %eax + cmpl $PI_BIT, %eax + jne 61f movl $(FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi xorl %eax, %eax @@ -289,11 +292,10 @@ __pthread_cond_timedwait: /* If requeue_pi is used the kernel performs the locking of the mutex. */ -41: xorl %eax, %eax +41: movq 16(%rsp), %rdi testl %r15d, %r15d - jnz 63f + jnz 64f - movq 16(%rsp), %rdi callq __pthread_mutex_cond_lock 63: testq %rax, %rax @@ -316,12 +318,18 @@ __pthread_cond_timedwait: retq - /* Initial locking failed. */ -31: cfi_adjust_cfa_offset(4 * 8 + FRAME_SIZE) + cfi_adjust_cfa_offset(4 * 8 + FRAME_SIZE) cfi_rel_offset(%r12, FRAME_SIZE + 24) cfi_rel_offset(%r13, FRAME_SIZE + 16) cfi_rel_offset(%r14, FRAME_SIZE + 8) cfi_rel_offset(%r15, FRAME_SIZE) + +64: callq __pthread_mutex_cond_lock_adjust + movq %r14, %rax + jmp 48b + + /* Initial locking failed. */ +31: #if cond_lock != 0 addq $cond_lock, %rdi #endif diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S index 2fab38e277..a66523eab6 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S @@ -134,9 +134,12 @@ __pthread_cond_wait: je 60f movq dep_mutex-cond_futex(%rdi), %r8 - /* Requeue to a PI mutex if the PI bit is set. */ - testl $PI_BIT, MUTEX_KIND(%r8) - je 61f + /* Requeue to a non-robust PI mutex if the PI bit is set and + the robust bit is not set. */ + movl MUTEX_KIND(%r8), %eax + andl $(ROBUST_BIT|PI_BIT), %eax + cmpl $PI_BIT, %eax + jne 61f movl $(FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi movl $SYS_futex, %eax @@ -234,11 +237,10 @@ __pthread_cond_wait: /* If requeue_pi is used the kernel performs the locking of the mutex. */ -11: xorl %eax, %eax +11: movq 16(%rsp), %rdi testl %r13d, %r13d - jnz 14f + jnz 18f - movq 16(%rsp), %rdi callq __pthread_mutex_cond_lock 14: addq $FRAME_SIZE, %rsp @@ -254,11 +256,16 @@ __pthread_cond_wait: /* We return the result of the mutex_lock operation. */ retq - /* Initial locking failed. */ -1: cfi_adjust_cfa_offset(16 + FRAME_SIZE) cfi_rel_offset(%r12, FRAME_SIZE + 8) cfi_rel_offset(%r13, FRAME_SIZE) + +18: callq __pthread_mutex_cond_lock_adjust + xorl %eax, %eax + jmp 14b + + /* Initial locking failed. 
*/ +1: #if cond_lock != 0 addq $cond_lock, %rdi #endif diff --git a/nptl/sysdeps/x86_64/tcb-offsets.sym b/nptl/sysdeps/x86_64/tcb-offsets.sym index 1c70c6bde7..51f35c61cf 100644 --- a/nptl/sysdeps/x86_64/tcb-offsets.sym +++ b/nptl/sysdeps/x86_64/tcb-offsets.sym @@ -15,3 +15,4 @@ VGETCPU_CACHE_OFFSET offsetof (tcbhead_t, vgetcpu_cache) #ifndef __ASSUME_PRIVATE_FUTEX PRIVATE_FUTEX offsetof (tcbhead_t, private_futex) #endif +RTLD_SAVESPACE_SSE offsetof (tcbhead_t, rtld_savespace_sse) diff --git a/nptl/sysdeps/x86_64/tls.h b/nptl/sysdeps/x86_64/tls.h index ea89f3b1a2..4212038ab5 100644 --- a/nptl/sysdeps/x86_64/tls.h +++ b/nptl/sysdeps/x86_64/tls.h @@ -29,6 +29,7 @@ # include <sysdep.h> # include <kernel-features.h> # include <bits/wordsize.h> +# include <xmmintrin.h> /* Type for the dtv. */ @@ -55,16 +56,23 @@ typedef struct uintptr_t stack_guard; uintptr_t pointer_guard; unsigned long int vgetcpu_cache[2]; -#ifndef __ASSUME_PRIVATE_FUTEX +# ifndef __ASSUME_PRIVATE_FUTEX int private_futex; -#else +# else int __unused1; -#endif -#if __WORDSIZE == 64 - int __pad1; -#endif +# endif +# if __WORDSIZE == 64 + int rtld_must_xmm_save; +# endif /* Reservation of some values for the TM ABI. */ void *__private_tm[5]; +# if __WORDSIZE == 64 + long int __unused2; + /* Have space for the post-AVX register size. */ + __m128 rtld_savespace_sse[8][4]; + + void *__padding[8]; +# endif } tcbhead_t; #else /* __ASSEMBLER__ */ @@ -109,7 +117,12 @@ typedef struct # define TLS_TCB_SIZE sizeof (struct pthread) /* Alignment requirements for the TCB. */ -# define TLS_TCB_ALIGN __alignof__ (struct pthread) +//# define TLS_TCB_ALIGN __alignof__ (struct pthread) +// Normally the above would be correct But we have to store post-AVX +// vector registers in the TCB and we want the storage to be aligned. +// unfortunately there isn't yet a type for these values and hence no +// 32-byte alignment requirement. Make this explicit, for now. +# define TLS_TCB_ALIGN 32 /* The TCB can have any size and the memory following the address the thread pointer points to is unspecified. Allocate the TCB there. */ @@ -298,7 +311,7 @@ typedef struct /* Atomic compare and exchange on TLS, returning old value. */ -#define THREAD_ATOMIC_CMPXCHG_VAL(descr, member, newval, oldval) \ +# define THREAD_ATOMIC_CMPXCHG_VAL(descr, member, newval, oldval) \ ({ __typeof (descr->member) __ret; \ __typeof (oldval) __old = (oldval); \ if (sizeof (descr->member) == 4) \ @@ -313,7 +326,7 @@ typedef struct /* Atomic logical and. */ -#define THREAD_ATOMIC_AND(descr, member, val) \ +# define THREAD_ATOMIC_AND(descr, member, val) \ (void) ({ if (sizeof ((descr)->member) == 4) \ asm volatile (LOCK_PREFIX "andl %1, %%fs:%P0" \ :: "i" (offsetof (struct pthread, member)), \ @@ -324,7 +337,7 @@ typedef struct /* Atomic set bit. */ -#define THREAD_ATOMIC_BIT_SET(descr, member, bit) \ +# define THREAD_ATOMIC_BIT_SET(descr, member, bit) \ (void) ({ if (sizeof ((descr)->member) == 4) \ asm volatile (LOCK_PREFIX "orl %1, %%fs:%P0" \ :: "i" (offsetof (struct pthread, member)), \ @@ -334,7 +347,7 @@ typedef struct abort (); }) -#define CALL_THREAD_FCT(descr) \ +# define CALL_THREAD_FCT(descr) \ ({ void *__res; \ asm volatile ("movq %%fs:%P2, %%rdi\n\t" \ "callq *%%fs:%P1" \ @@ -355,18 +368,18 @@ typedef struct /* Set the pointer guard field in the TCB head. 
*/ -#define THREAD_SET_POINTER_GUARD(value) \ +# define THREAD_SET_POINTER_GUARD(value) \ THREAD_SETMEM (THREAD_SELF, header.pointer_guard, value) -#define THREAD_COPY_POINTER_GUARD(descr) \ +# define THREAD_COPY_POINTER_GUARD(descr) \ ((descr)->header.pointer_guard \ = THREAD_GETMEM (THREAD_SELF, header.pointer_guard)) /* Get and set the global scope generation counter in the TCB head. */ -#define THREAD_GSCOPE_FLAG_UNUSED 0 -#define THREAD_GSCOPE_FLAG_USED 1 -#define THREAD_GSCOPE_FLAG_WAIT 2 -#define THREAD_GSCOPE_RESET_FLAG() \ +# define THREAD_GSCOPE_FLAG_UNUSED 0 +# define THREAD_GSCOPE_FLAG_USED 1 +# define THREAD_GSCOPE_FLAG_WAIT 2 +# define THREAD_GSCOPE_RESET_FLAG() \ do \ { int __res; \ asm volatile ("xchgl %0, %%fs:%P1" \ @@ -377,11 +390,40 @@ typedef struct lll_futex_wake (&THREAD_SELF->header.gscope_flag, 1, LLL_PRIVATE); \ } \ while (0) -#define THREAD_GSCOPE_SET_FLAG() \ +# define THREAD_GSCOPE_SET_FLAG() \ THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED) -#define THREAD_GSCOPE_WAIT() \ +# define THREAD_GSCOPE_WAIT() \ GL(dl_wait_lookup_done) () + +# ifdef SHARED +/* Defined in dl-trampoline.S. */ +extern void _dl_x86_64_save_sse (void); +extern void _dl_x86_64_restore_sse (void); + +# define RTLD_CHECK_FOREIGN_CALL \ + (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) != 0) + +# define RTLD_ENABLE_FOREIGN_CALL \ + THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 1) + +# define RTLD_PREPARE_FOREIGN_CALL \ + do if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save)) \ + { \ + _dl_x86_64_save_sse (); \ + THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0); \ + } \ + while (0) + +# define RTLD_FINALIZE_FOREIGN_CALL \ + do { \ + if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) == 0) \ + _dl_x86_64_restore_sse (); \ + THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0); \ + } while (0) +# endif + + #endif /* __ASSEMBLER__ */ #endif /* tls.h */ diff --git a/stdio-common/scanf15.c b/stdio-common/scanf15.c index c56715c486..851466b3a9 100644 --- a/stdio-common/scanf15.c +++ b/stdio-common/scanf15.c @@ -1,5 +1,6 @@ #undef _GNU_SOURCE #define _XOPEN_SOURCE 600 +#undef _LIBC /* The following macro definitions are a hack. They word around disabling the GNU extension while still using a few internal headers. */ #define u_char unsigned char diff --git a/stdio-common/scanf17.c b/stdio-common/scanf17.c index ee9024f9b7..4478a7022f 100644 --- a/stdio-common/scanf17.c +++ b/stdio-common/scanf17.c @@ -1,5 +1,6 @@ #undef _GNU_SOURCE #define _XOPEN_SOURCE 600 +#undef _LIBC /* The following macro definitions are a hack. They word around disabling the GNU extension while still using a few internal headers. */ #define u_char unsigned char diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index 49d239f075..20da6956f1 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -61,6 +61,7 @@ _dl_runtime_resolve: cfi_startproc _dl_runtime_profile: + cfi_adjust_cfa_offset(16) # Incorporate PLT /* The La_x86_64_regs data structure pointed to by the fourth paramater must be 16-byte aligned. This must be explicitly enforced. We have the set up a dynamically @@ -68,7 +69,7 @@ _dl_runtime_profile: has a fixed size and preserves the original stack pointer. */ subq $32, %rsp # Allocate the local storage. 
- cfi_adjust_cfa_offset(48) # Incorporate PLT + cfi_adjust_cfa_offset(32) movq %rbx, (%rsp) cfi_rel_offset(%rbx, 0) @@ -203,49 +204,49 @@ L(no_avx1): vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET)(%rsp), %ymm0 1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1 1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2 1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3 1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4 1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5 1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6 1: vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8 vpmovmskb %xmm8, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7 L(no_avx2): @@ -361,13 +362,13 @@ L(no_avx3): vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2 vpmovmskb %xmm2, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0 1: vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2 vpmovmskb %xmm2, %esi cmpl $0xffff, %esi - je 1f + jne 1f vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1 L(no_avx4): @@ -390,3 +391,85 @@ L(no_avx4): cfi_endproc .size _dl_runtime_profile, .-_dl_runtime_profile #endif + + +#ifdef SHARED + .globl _dl_x86_64_save_sse + .type _dl_x86_64_save_sse, @function + .align 16 + cfi_startproc +_dl_x86_64_save_sse: +# ifdef HAVE_AVX_SUPPORT + cmpl $0, L(have_avx)(%rip) + jne 1f + movq %rbx, %r11 # Save rbx + movl $1, %eax + cpuid + movq %r11,%rbx # Restore rbx + movl $1, %eax + testl $(1 << 28), %ecx + jne 2f + negl %eax +2: movl %eax, L(have_avx)(%rip) + cmpl $0, %eax + +1: js L(no_avx5) + +# define YMM_SIZE 32 + vmovdqa %ymm0, %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE + vmovdqa %ymm1, %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE + vmovdqa %ymm2, %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE + vmovdqa %ymm3, %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE + vmovdqa %ymm4, %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE + vmovdqa %ymm5, %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE + vmovdqa %ymm6, %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE + vmovdqa %ymm7, %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE + ret +L(no_avx5): +# endif +# define YMM_SIZE 16 + movdqa %xmm0, %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE + movdqa %xmm1, %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE + movdqa %xmm2, %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE + movdqa %xmm3, %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE + movdqa %xmm4, %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE + movdqa %xmm5, %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE + movdqa %xmm6, %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE + movdqa %xmm7, %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE + ret + cfi_endproc + .size _dl_x86_64_save_sse, .-_dl_x86_64_save_sse + + + .globl _dl_x86_64_restore_sse + .type _dl_x86_64_restore_sse, @function + .align 16 + cfi_startproc +_dl_x86_64_restore_sse: +# ifdef HAVE_AVX_SUPPORT + cmpl $0, L(have_avx)(%rip) + js L(no_avx6) + + vmovdqa %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE, %ymm0 + vmovdqa 
%fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE, %ymm1 + vmovdqa %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE, %ymm2 + vmovdqa %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE, %ymm3 + vmovdqa %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE, %ymm4 + vmovdqa %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE, %ymm5 + vmovdqa %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE, %ymm6 + vmovdqa %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE, %ymm7 + ret +L(no_avx6): +# endif + movdqa %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE, %xmm0 + movdqa %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE, %xmm1 + movdqa %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE, %xmm2 + movdqa %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE, %xmm3 + movdqa %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE, %xmm4 + movdqa %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE, %xmm5 + movdqa %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE, %xmm6 + movdqa %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE, %xmm7 + ret + cfi_endproc + .size _dl_x86_64_restore_sse, .-_dl_x86_64_restore_sse +#endif diff --git a/sysdeps/x86_64/multiarch/Versions b/sysdeps/x86_64/multiarch/Versions new file mode 100644 index 0000000000..59b185ac8d --- /dev/null +++ b/sysdeps/x86_64/multiarch/Versions @@ -0,0 +1,5 @@ +libc { + GLIBC_PRIVATE { + __get_cpu_features; + } +} diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c index 35fd19af0e..49b421eac8 100644 --- a/sysdeps/x86_64/multiarch/init-arch.c +++ b/sysdeps/x86_64/multiarch/init-arch.c @@ -86,3 +86,13 @@ __init_cpu_features (void) else __cpu_features.kind = arch_kind_other; } + + +const struct cpu_features * +__get_cpu_features (void) +{ + if (__cpu_features.kind == arch_kind_unknown) + __init_cpu_features (); + + return &__cpu_features; +} diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h index 48a2127418..0151e8b95b 100644 --- a/sysdeps/x86_64/multiarch/init-arch.h +++ b/sysdeps/x86_64/multiarch/init-arch.h @@ -54,10 +54,28 @@ extern void __init_cpu_features (void) attribute_hidden; __init_cpu_features (); \ while (0) +/* Used from outside libc.so to get access to the CPU features structure. */ +extern const struct cpu_features *__get_cpu_features (void) + __attribute__ ((const)); + /* Following are the feature tests used throughout libc. */ -#define HAS_POPCOUNT \ +#ifndef NOT_IN_libc +# define HAS_POPCOUNT \ ((__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 23)) != 0) -#define HAS_SSE4_2 \ +# define HAS_SSE4_2 \ ((__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 20)) != 0) + +# define HAS_FMA \ + ((__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 12)) != 0) +#else +# define HAS_POPCOUNT \ + ((__get_cpu_features ()->cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 23)) != 0) + +# define HAS_SSE4_2 \ + ((__get_cpu_features ()->cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 20)) != 0) + +# define HAS_FMA \ + ((__get_cpu_features ()->cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 12)) != 0) +#endif diff --git a/sysdeps/x86_64/multiarch/s_fma.c b/sysdeps/x86_64/multiarch/s_fma.c new file mode 100644 index 0000000000..40601e9a68 --- /dev/null +++ b/sysdeps/x86_64/multiarch/s_fma.c @@ -0,0 +1,43 @@ +/* FMA version of fma. + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <config.h> +#include <math.h> +#include <init-arch.h> + +#ifdef HAVE_AVX_SUPPORT + +extern double __fma_sse2 (double x, double y, double z); + + +double +__fma_fma (double x, double y, double z) +{ + asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} + +libm_ifunc (__fma, HAS_FMA ? __fma_fma : __fma_sse2); +weak_alias (__fma, fma) + +# define __fma __fma_sse2 +#endif + +#include <math/s_fma.c> diff --git a/sysdeps/x86_64/multiarch/s_fmaf.c b/sysdeps/x86_64/multiarch/s_fmaf.c new file mode 100644 index 0000000000..f3d37f8f4a --- /dev/null +++ b/sysdeps/x86_64/multiarch/s_fmaf.c @@ -0,0 +1,42 @@ +/* FMA version of fmaf. + Copyright (C) 2009 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <config.h> +#include <math.h> +#include <init-arch.h> + +#ifdef HAVE_AVX_SUPPORT + +extern float __fmaf_sse2 (float x, float y, float z); + + +float +__fmaf_fma (float x, float y, float z) +{ + asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} + +libm_ifunc (__fmaf, HAS_FMA ? __fmaf_fma : __fmaf_sse2); +weak_alias (__fmaf, fmaf) + +# define __fmaf __fmaf_sse2 +#endif + +#include <math/s_fmaf.c> diff --git a/sysdeps/x86_64/tst-xmmymm.sh b/sysdeps/x86_64/tst-xmmymm.sh index a576e7da0d..da8af7e686 100755 --- a/sysdeps/x86_64/tst-xmmymm.sh +++ b/sysdeps/x86_64/tst-xmmymm.sh @@ -59,10 +59,11 @@ for f in $tocheck; do objdump -d "$objpfx"../*/"$f" | awk 'BEGIN { last="" } /^[[:xdigit:]]* <[_[:alnum:]]*>:$/ { fct=substr($2, 2, length($2)-3) } /,%[xy]mm[[:digit:]]*$/ { if (last != fct) { print fct; last=fct} }' | while read fct; do - if test "$fct" != "_dl_runtime_profile"; then - echo "function $fct in $f modifies xmm/ymm" >> "$tmp" - result=1 + if test "$fct" = "_dl_runtime_profile" -o "$fct" = "_dl_x86_64_restore_sse"; then + continue; fi + echo "function $fct in $f modifies xmm/ymm" >> "$tmp" + result=1 done done |
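The reason s_fma.c and s_fmaf.c are worth dispatching at all: the generic fallback computes (x * y) + z with two roundings, while the vfmadd213sd/vfmadd213ss instructions round once, which is what fma is specified to do. A standard self-contained demonstration of the difference (assumes a correctly rounded fma):

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      double x = 1.0 + 0x1p-27;
      double y = 1.0 + 0x1p-27;
      double z = -(1.0 + 0x1p-26);

      /* x * y rounds to 1 + 2^-26, so the 2^-54 term is lost and the
         sum cancels to zero...  */
      printf ("%a\n", (x * y) + z);   /* 0x0p+0 */
      /* ...while the fused multiply-add keeps it.  */
      printf ("%a\n", fma (x, y, z)); /* 0x1p-54 */
      return 0;
    }
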
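The libm_ifunc macro added to include/libc-symbols.h is what lets s_fma.c select __fma_fma or __fma_sse2 once, at symbol-resolution time, instead of testing HAS_FMA on every call. Hand-expanded for a hypothetical my_fma with a plain C implementation (a sketch of the mechanism, not the glibc build setup):

    double my_fma (double x, double y, double z);

    static double
    my_fma_generic (double x, double y, double z)
    {
      return (x * y) + z;
    }

    /* Expansion of libm_ifunc (my_fma, my_fma_generic): the resolver is
       emitted under the public symbol name and marked as a GNU indirect
       function, so the dynamic linker calls it and binds my_fma to the
       pointer it returns.  */
    extern void *my_fma_ifunc (void) __asm__ ("my_fma");
    void *
    my_fma_ifunc (void)
    {
      __typeof (my_fma) *res = my_fma_generic; /* e.g. HAS_FMA ? a : b */
      return res;
    }
    __asm__ (".type my_fma, %gnu_indirect_function");
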
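The recurring assembly change in the pthread_cond_* files, replacing a bare test of PI_BIT with a masked compare, encodes the rule that requeue-PI may only be used for PI mutexes that are not robust. In C, with illustrative stand-in values for the two internal bits exported through pthread-pi-defines.sym:

    /* Stand-ins; the real values are PTHREAD_MUTEX_ROBUST_NORMAL_NP and
       PTHREAD_MUTEX_PRIO_INHERIT_NP from glibc's internal pthreadP.h.  */
    enum { ROBUST_BIT = 16, PI_BIT = 32 };

    static int
    may_use_requeue_pi (int mutex_kind)
    {
      /* Exactly what the new "andl $(ROBUST_BIT|PI_BIT), %eax;
         cmpl $PI_BIT, %eax" sequence computes.  */
      return (mutex_kind & (ROBUST_BIT | PI_BIT)) == PI_BIT;
    }
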
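The RTLD_*_FOREIGN_CALL hooks defined in nptl/sysdeps/x86_64/tls.h cooperate as a small state machine: _dl_fixup flags that the caller's xmm/ymm argument registers are live, dl-lookup.c saves them lazily just before it may call malloc, and _dl_fixup restores them on the way out if anything saved them. A self-contained C sketch of the protocol, in which the __thread flag and the two stub functions stand in for the new tcbhead_t rtld_must_xmm_save field and for _dl_x86_64_save_sse/_dl_x86_64_restore_sse:

    static __thread int must_xmm_save;

    static void save_sse (void)    { /* vmovdqa %ymmN to the TCB area */ }
    static void restore_sse (void) { /* vmovdqa the TCB area to %ymmN */ }

    /* RTLD_ENABLE_FOREIGN_CALL: entry to _dl_fixup's symbol lookup.  */
    static void
    enable_foreign_call (void)
    {
      must_xmm_save = 1;
    }

    /* RTLD_PREPARE_FOREIGN_CALL: runs before malloc and friends in
       dl-lookup.c; saves the registers at most once per fixup.  */
    static void
    prepare_foreign_call (void)
    {
      if (must_xmm_save)
        {
          save_sse ();
          must_xmm_save = 0;
        }
    }

    /* RTLD_FINALIZE_FOREIGN_CALL: exit from _dl_fixup; a cleared flag
       means some allocation path saved the registers, so restore them.  */
    static void
    finalize_foreign_call (void)
    {
      if (must_xmm_save == 0)
        restore_sse ();
      must_xmm_save = 0;
    }
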