summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog46
-rw-r--r--csu/libc-tls.c9
-rw-r--r--elf/dl-lookup.c13
-rw-r--r--elf/dl-reloc.c5
-rw-r--r--elf/dl-runtime.c8
-rw-r--r--include/libc-symbols.h13
-rw-r--r--math/s_fma.c4
-rw-r--r--math/s_fmaf.c4
-rw-r--r--nptl/ChangeLog29
-rw-r--r--nptl/pthreadP.h2
-rw-r--r--nptl/pthread_mutex_lock.c21
-rw-r--r--nptl/pthread_mutex_unlock.c7
-rw-r--r--nptl/sysdeps/unix/sysv/linux/pthread-pi-defines.sym1
-rw-r--r--nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S6
-rw-r--r--nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S6
-rw-r--r--nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S24
-rw-r--r--nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S23
-rw-r--r--nptl/sysdeps/x86_64/tcb-offsets.sym1
-rw-r--r--nptl/sysdeps/x86_64/tls.h80
-rw-r--r--stdio-common/scanf15.c1
-rw-r--r--stdio-common/scanf17.c1
-rw-r--r--sysdeps/x86_64/dl-trampoline.S105
-rw-r--r--sysdeps/x86_64/multiarch/Versions5
-rw-r--r--sysdeps/x86_64/multiarch/init-arch.c10
-rw-r--r--sysdeps/x86_64/multiarch/init-arch.h22
-rw-r--r--sysdeps/x86_64/multiarch/s_fma.c43
-rw-r--r--sysdeps/x86_64/multiarch/s_fmaf.c42
-rwxr-xr-xsysdeps/x86_64/tst-xmmymm.sh7
28 files changed, 471 insertions, 67 deletions
diff --git a/ChangeLog b/ChangeLog
index dd2060112a..51562e088e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,49 @@
+2009-07-29  Ulrich Drepper  <drepper@redhat.com>
+
+	* math/s_fma.c: Don't define alias if __fma is a macro.
+	* math/s_fmaf.c: Likewise.
+	* sysdeps/x86_64/multiarch/s_fma.c: New file.
+	* sysdeps/x86_64/multiarch/s_fmaf.c: New file.
+	Partially based on a patch by H.J. Lu <hongjiu.lu@intel.com>.
+
+	* sysdeps/x86_64/multiarch/init-arch.h (__get_cpu_features): Declare.
+	(HAS_POPCOUNT, HAS_SSE4_2): Add variants which work outside libc.
+	New macro HAS_FMA.
+	* sysdeps/x86_64/multiarch/init-arch.c (__get_cpu_features): New
+	function.
+	* include/libc-symbols.h (libm_ifunc): Define.
+	* sysdeps/x86_64/multiarch/Versions: New file.
+
+	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Improve CFI.
+
+2009-07-28  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/x86_64/dl-trampoline.S: Properly restore AVX registers.
+
+2009-07-29  Ulrich Drepper  <drepper@redhat.com>
+
+	* elf/dl-runtime.c (_dl_fixup): Indicate before _dl_lookup_symbol_x
+	call that registers used in calling conventions need to be preserved.
+	* elf/dl-lookup.c (do_lookup_x): Use RTLD_*_FOREIGN_CALL macros
+	to preserve register content if necessary.
+	* sysdeps/x86_64/dl-trampoline.S (_dl_x86_64_save_sse): New function.
+	(_dl_x86_64_restore_sse): New function.
+	* sysdeps/x86_64/tst-xmmymm.sh: There is now one more function that
+	is allowed to modify xmm/ymm registers.
+
+	* stdio-common/scanf15.c: Undefine _LIBC.  We want to test from an
+	application's perspective.
+	* stdio-common/scanf17.c: Likewise.
+
+2009-07-28  Ulrich Drepper  <drepper@redhat.com>
+
+	* csu/libc-tls.c (__libc_setup_tls) [TLS_TCB_AT_TP]: Don't add TCB
+	size to memsz.
+	(init_static_tls) [TLS_TCB_AT_TP]: Add it to GL(dl_tls_static_size)
+	here.
+	* elf/dl-reloc.c (_dl_try_allocate_static_tls): Compute freebytes in
+	two steps to catch bugs.
+
 2009-07-27  Ulrich Drepper  <drepper@redhat.com>
 
 	* sysdeps/x86_64/tst-xmmymm.sh: Refine testing.  The script now
diff --git a/csu/libc-tls.c b/csu/libc-tls.c
index 0d240ccef9..5a49942861 100644
--- a/csu/libc-tls.c
+++ b/csu/libc-tls.c
@@ -1,5 +1,5 @@
 /* Initialization code for TLS in statically linked application.
-   Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
+   Copyright (C) 2002-2006, 2009 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -99,6 +99,9 @@ init_static_tls (size_t memsz, size_t align)
      surplus that permits dynamic loading of modules with IE-model TLS.  */
   GL(dl_tls_static_size) = roundup (memsz + GL(dl_tls_static_size),
 				    TLS_TCB_ALIGN);
+#if TLS_TCB_AT_TP
+  GL(dl_tls_static_size) += TLS_TCB_SIZE;
+#endif
   GL(dl_tls_static_used) = memsz;
   /* The alignment requirement for the static TLS block.  */
   GL(dl_tls_static_align) = align;
@@ -211,9 +214,7 @@ __libc_setup_tls (size_t tcbsize, size_t tcbalign)
 
   memsz = roundup (memsz, align ?: 1);
 
-#if TLS_TCB_AT_TP
-  memsz += tcbsize;
-#elif TLS_DTV_AT_TP
+#if TLS_DTV_AT_TP
   memsz += tcb_offset;
 #endif
 
diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
index 1d68d67a35..56724c9b4d 100644
--- a/elf/dl-lookup.c
+++ b/elf/dl-lookup.c
@@ -380,6 +380,10 @@ do_lookup_x (const char *undef_name, uint_fast32_t new_hash,
 		  if (size * 3 <= tab->n_elements * 4)
 		    {
 		      /* Expand the table.  */
+#ifdef RTLD_CHECK_FOREIGN_CALL
+		      /* This must not happen during runtime relocations.  */
+		      assert (!RTLD_CHECK_FOREIGN_CALL);
+#endif
 		      size_t newsize = _dl_higher_prime_number (size + 1);
 		      struct unique_sym *newentries
 			= calloc (sizeof (struct unique_sym), newsize);
@@ -405,6 +409,11 @@ do_lookup_x (const char *undef_name, uint_fast32_t new_hash,
 		}
 	      else
 		{
+#ifdef RTLD_CHECK_FOREIGN_CALL
+		  /* This must not happen during runtime relocations.  */
+		  assert (!RTLD_CHECK_FOREIGN_CALL);
+#endif
+
 #define INITIAL_NUNIQUE_SYM_TABLE 31
 		  size = INITIAL_NUNIQUE_SYM_TABLE;
 		  entries = calloc (sizeof (struct unique_sym), size);
@@ -600,6 +609,10 @@ add_dependency (struct link_map *undef_map, struct link_map *map, int flags)
 	  unsigned int max
 	    = undef_map->l_reldepsmax ? undef_map->l_reldepsmax * 2 : 10;
 
+#ifdef RTLD_PREPARE_FOREIGN_CALL
+	  RTLD_PREPARE_FOREIGN_CALL;
+#endif
+
 	  newp = malloc (sizeof (*newp) + max * sizeof (struct link_map *));
 	  if (newp == NULL)
 	    {
diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c
index 28f08de3e7..680caadd65 100644
--- a/elf/dl-reloc.c
+++ b/elf/dl-reloc.c
@@ -61,7 +61,10 @@ _dl_try_allocate_static_tls (struct link_map *map)
   size_t n;
   size_t blsize;
 
-  freebytes = GL(dl_tls_static_size) - GL(dl_tls_static_used) - TLS_TCB_SIZE;
+  freebytes = GL(dl_tls_static_size) - GL(dl_tls_static_used);
+  if (freebytes < TLS_TCB_SIZE)
+    goto fail;
+  freebytes -= TLS_TCB_SIZE;
 
   blsize = map->l_tls_blocksize + map->l_tls_firstbyte_offset;
   if (freebytes < blsize)
diff --git a/elf/dl-runtime.c b/elf/dl-runtime.c
index 0eb7d4e3b9..a52120d121 100644
--- a/elf/dl-runtime.c
+++ b/elf/dl-runtime.c
@@ -111,6 +111,10 @@ _dl_fixup (
 	  flags |= DL_LOOKUP_GSCOPE_LOCK;
 	}
 
+#ifdef RTLD_ENABLE_FOREIGN_CALL
+      RTLD_ENABLE_FOREIGN_CALL;
+#endif
+
       result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym, l->l_scope,
 				    version, ELF_RTYPE_CLASS_PLT, flags, NULL);
 
@@ -118,6 +122,10 @@ _dl_fixup (
       if (!RTLD_SINGLE_THREAD_P)
 	THREAD_GSCOPE_RESET_FLAG ();
 
+#ifdef RTLD_FINALIZE_FOREIGN_CALL
+      RTLD_FINALIZE_FOREIGN_CALL;
+#endif
+
       /* Currently result contains the base load address (or link map)
 	 of the object that defines sym.  Now add in the symbol
 	 offset.  */
diff --git a/include/libc-symbols.h b/include/libc-symbols.h
index 68da77c58e..252141eb01 100644
--- a/include/libc-symbols.h
+++ b/include/libc-symbols.h
@@ -1,6 +1,6 @@
 /* Support macros for making weak and strong aliases for symbols,
    and for using symbol sets and linker warnings with GNU ld.
-   Copyright (C) 1995-1998, 2000-2006, 2008 Free Software Foundation, Inc.
+   Copyright (C) 1995-1998,2000-2006,2008,2009 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -845,6 +845,17 @@ for linking")
   }									\
   __asm__ (".type " #name ", %gnu_indirect_function");
 
+/* The body of the function is supposed to use __get_cpu_features
+   which will, if necessary, initialize the data first.  */
+#define libm_ifunc(name, expr)						\
+  extern void *name##_ifunc (void) __asm__ (#name);			\
+  void *name##_ifunc (void)						\
+  {									\
+    __typeof (name) *res = expr;					\
+    return res;								\
+  }									\
+  __asm__ (".type " #name ", %gnu_indirect_function");
+
 #ifdef HAVE_ASM_SET_DIRECTIVE
 # define libc_ifunc_hidden_def1(local, name)				\
     __asm__ (declare_symbol_alias_1_stringify (ASM_GLOBAL_DIRECTIVE)	\
diff --git a/math/s_fma.c b/math/s_fma.c
index e5ff5a7228..476d1fe44c 100644
--- a/math/s_fma.c
+++ b/math/s_fma.c
@@ -1,5 +1,5 @@
 /* Compute x * y + z as ternary operation.
-   Copyright (C) 1997, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1997, 2001, 2009 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
 
@@ -25,7 +25,9 @@ __fma (double x, double y, double z)
 {
   return (x * y) + z;
 }
+#ifndef __fma
 weak_alias (__fma, fma)
+#endif
 
 #ifdef NO_LONG_DOUBLE
 strong_alias (__fma, __fmal)
diff --git a/math/s_fmaf.c b/math/s_fmaf.c
index caa7f3afe8..357296d70d 100644
--- a/math/s_fmaf.c
+++ b/math/s_fmaf.c
@@ -1,5 +1,5 @@
 /* Compute x * y + z as ternary operation.
-   Copyright (C) 1997 Free Software Foundation, Inc.
+   Copyright (C) 1997, 2009 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
 
@@ -25,4 +25,6 @@ __fmaf (float x, float y, float z)
 {
   return (x * y) + z;
 }
+#ifndef __fmaf
 weak_alias (__fmaf, fmaf)
+#endif
diff --git a/nptl/ChangeLog b/nptl/ChangeLog
index e5fc474916..0046b20608 100644
--- a/nptl/ChangeLog
+++ b/nptl/ChangeLog
@@ -1,3 +1,32 @@
+2009-07-29  Ulrich Drepper  <drepper@redhat.com>
+
+	* sysdeps/x86_64/tls.h (TLS_TCB_ALIGN): Define explicitly to 32.
+
+	* sysdeps/x86_64/tls.h (tcbhead_t): Add room for SSE registers the
+	dynamic linker might have to save.
+	Define RTLD_CHECK_FOREIGN_CALL, RTLD_ENABLE_FOREIGN_CALL,
+	RTLD_PREPARE_FOREIGN_CALL, and RTLD_FINALIZE_FOREIGN_CALL.  Pretty
+	printing.
+
+	* sysdeps/x86_64/tcb-offsets.sym: Add RTLD_SAVESPACE_SSE.
+
+2009-07-28  Ulrich Drepper  <drepper@redhat.com>
+
+	* pthread_mutex_lock.c [NO_INCR] (__pthread_mutex_cond_lock_adjust):
+	New function.
+	* pthreadP.h: Declare __pthread_mutex_cond_lock_adjust.
+	* sysdeps/unix/sysv/linux/pthread-pi-defines.sym: Add ROBUST_BIT.
+	* sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S: Don't use
+	requeue_pi for robust mutexes.
+	* sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S: Likewise.
+	* sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S: Likewise.
+	Don't only skip __pthread_mutex_cond_lock.  Call instead
+	__pthread_mutex_cond_lock_adjust.
+	* sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S: Likewise.
+
+	* pthread_mutex_unlock.c (__pthread_mutex_unlock_full): Minor
+	optimization of PI mutex handling.
+
 2009-07-27  Ulrich Drepper  <drepper@redhat.com>
 
 	[BZ #10418]
diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
index ed9fc625ba..43ca44c829 100644
--- a/nptl/pthreadP.h
+++ b/nptl/pthreadP.h
@@ -418,6 +418,8 @@ extern int __pthread_mutex_lock_internal (pthread_mutex_t *__mutex)
      attribute_hidden;
 extern int __pthread_mutex_cond_lock (pthread_mutex_t *__mutex)
      attribute_hidden internal_function;
+extern void __pthread_mutex_cond_lock_adjust (pthread_mutex_t *__mutex)
+     attribute_hidden internal_function;
 extern int __pthread_mutex_unlock (pthread_mutex_t *__mutex);
 extern int __pthread_mutex_unlock_internal (pthread_mutex_t *__mutex)
      attribute_hidden;
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
index 406e588fdb..50dc18803d 100644
--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2002-2007, 2008 Free Software Foundation, Inc.
+/* Copyright (C) 2002-2007, 2008, 2009 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
 
@@ -473,3 +473,22 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
 strong_alias (__pthread_mutex_lock, pthread_mutex_lock)
 strong_alias (__pthread_mutex_lock, __pthread_mutex_lock_internal)
 #endif
+
+
+#ifdef NO_INCR
+void
+__pthread_mutex_cond_lock_adjust (mutex)
+     pthread_mutex_t *mutex;
+{
+  assert ((mutex->__data.__kind & PTHREAD_MUTEX_PRIO_INHERIT_NP) != 0);
+  assert ((mutex->__data.__kind & PTHREAD_MUTEX_ROBUST_NORMAL_NP) == 0);
+  assert ((mutex->__data.__kind & PTHREAD_MUTEX_PSHARED_BIT) == 0);
+
+  /* Record the ownership.  */
+  pid_t id = THREAD_GETMEM (THREAD_SELF, tid);
+  mutex->__data.__owner = id;
+
+  if (mutex->__data.__kind == PTHREAD_MUTEX_PI_RECURSIVE_NP)
+    ++mutex->__data.__count;
+}
+#endif
diff --git a/nptl/pthread_mutex_unlock.c b/nptl/pthread_mutex_unlock.c
index fbe8274a55..f9fe10b0f2 100644
--- a/nptl/pthread_mutex_unlock.c
+++ b/nptl/pthread_mutex_unlock.c
@@ -150,7 +150,7 @@ __pthread_mutex_unlock_full (pthread_mutex_t *mutex, int decr)
       if (--mutex->__data.__count != 0)
 	/* We still hold the mutex.  */
 	return 0;
-      goto continue_pi;
+      goto continue_pi_non_robust;
 
     case PTHREAD_MUTEX_PI_ROBUST_RECURSIVE_NP:
       /* Recursive mutex.  */
@@ -173,7 +173,7 @@ __pthread_mutex_unlock_full (pthread_mutex_t *mutex, int decr)
 	/* We still hold the mutex.  */
 	return 0;
 
-      goto continue_pi;
+      goto continue_pi_robust;
 
     case PTHREAD_MUTEX_PI_ERRORCHECK_NP:
     case PTHREAD_MUTEX_PI_NORMAL_NP:
@@ -195,9 +195,9 @@ __pthread_mutex_unlock_full (pthread_mutex_t *mutex, int decr)
       pi_notrecoverable:
        newowner = PTHREAD_MUTEX_NOTRECOVERABLE;
 
-    continue_pi:
       if ((mutex->__data.__kind & PTHREAD_MUTEX_ROBUST_NORMAL_NP) != 0)
 	{
+	continue_pi_robust:
 	  /* Remove mutex from the list.
 	     Note: robust PI futexes are signaled by setting bit 0.  */
 	  THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
@@ -206,6 +206,7 @@ __pthread_mutex_unlock_full (pthread_mutex_t *mutex, int decr)
 	  DEQUEUE_MUTEX (mutex);
 	}
 
+    continue_pi_non_robust:
       mutex->__data.__owner = newowner;
       if (decr)
 	/* One less user.  */
diff --git a/nptl/sysdeps/unix/sysv/linux/pthread-pi-defines.sym b/nptl/sysdeps/unix/sysv/linux/pthread-pi-defines.sym
index d985c6a79b..46fbd0de74 100644
--- a/nptl/sysdeps/unix/sysv/linux/pthread-pi-defines.sym
+++ b/nptl/sysdeps/unix/sysv/linux/pthread-pi-defines.sym
@@ -3,5 +3,6 @@
 -- These PI macros are used by assembly code.
 
 MUTEX_KIND	offsetof (pthread_mutex_t, __data.__kind)
+ROBUST_BIT	PTHREAD_MUTEX_ROBUST_NORMAL_NP
 PI_BIT		PTHREAD_MUTEX_PRIO_INHERIT_NP
 PS_BIT		PTHREAD_MUTEX_PSHARED_BIT
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S
index 0f10ec910c..224a56088e 100644
--- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S
@@ -75,8 +75,10 @@ __pthread_cond_broadcast:
 	jne	9f
 
 	/* Requeue to a PI mutex if the PI bit is set.  */
-	testl	$PI_BIT, MUTEX_KIND(%r8)
-	jne	81f
+	movl	MUTEX_KIND(%r8), %eax
+	andl	$(ROBUST_BIT|PI_BIT), %eax
+	cmpl	$PI_BIT, %eax
+	je	81f
 
 	/* Wake up all threads.  */
 #ifdef __ASSUME_PRIVATE_FUTEX
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S
index f1050fea7c..4d001eec7f 100644
--- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S
@@ -64,8 +64,10 @@ __pthread_cond_signal:
 
 	/* Get the address of the mutex used.  */
 	movq    dep_mutex(%r8), %rcx
-	testl	$PI_BIT, MUTEX_KIND(%rcx)
-	jne	9f
+	movl	MUTEX_KIND(%rcx), %eax
+	andl	$(ROBUST_BIT|PI_BIT), %eax
+	cmpl	$PI_BIT, %eax
+	je	9f
 
 #ifdef __ASSUME_PRIVATE_FUTEX
 	movl	$(FUTEX_WAKE_OP|FUTEX_PRIVATE_FLAG), %esi
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S
index 7486825d5f..4913beb8af 100644
--- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S
@@ -165,9 +165,12 @@ __pthread_cond_timedwait:
 	je	60f
 
 	movq	dep_mutex(%rdi), %r8
-	/* Requeue to a PI mutex if the PI bit is set.  */
-	testl	$PI_BIT, MUTEX_KIND(%r8)
-	je	61f
+	/* Requeue to a non-robust PI mutex if the PI bit is set and
+	the robust bit is not set.  */
+	movl	MUTEX_KIND(%r8), %eax
+	andl	$(ROBUST_BIT|PI_BIT), %eax
+	cmpl	$PI_BIT, %eax
+	jne	61f
 
 	movl	$(FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi
 	xorl	%eax, %eax
@@ -289,11 +292,10 @@ __pthread_cond_timedwait:
 
 	/* If requeue_pi is used the kernel performs the locking of the
 	   mutex. */
-41:	xorl	%eax, %eax
+41:	movq	16(%rsp), %rdi
 	testl	%r15d, %r15d
-	jnz	63f
+	jnz	64f
 
-	movq	16(%rsp), %rdi
 	callq	__pthread_mutex_cond_lock
 
 63:	testq	%rax, %rax
@@ -316,12 +318,18 @@ __pthread_cond_timedwait:
 
 	retq
 
-	/* Initial locking failed.  */
-31:	cfi_adjust_cfa_offset(4 * 8 + FRAME_SIZE)
+	cfi_adjust_cfa_offset(4 * 8 + FRAME_SIZE)
 	cfi_rel_offset(%r12, FRAME_SIZE + 24)
 	cfi_rel_offset(%r13, FRAME_SIZE + 16)
 	cfi_rel_offset(%r14, FRAME_SIZE + 8)
 	cfi_rel_offset(%r15, FRAME_SIZE)
+
+64:	callq	__pthread_mutex_cond_lock_adjust
+	movq	%r14, %rax
+	jmp	48b
+
+	/* Initial locking failed.  */
+31:
 #if cond_lock != 0
 	addq	$cond_lock, %rdi
 #endif
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
index 2fab38e277..a66523eab6 100644
--- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
@@ -134,9 +134,12 @@ __pthread_cond_wait:
 	je	60f
 
 	movq	dep_mutex-cond_futex(%rdi), %r8
-	/* Requeue to a PI mutex if the PI bit is set.  */
-	testl	$PI_BIT, MUTEX_KIND(%r8)
-	je	61f
+	/* Requeue to a non-robust PI mutex if the PI bit is set and
+	the robust bit is not set.  */
+	movl	MUTEX_KIND(%r8), %eax
+	andl	$(ROBUST_BIT|PI_BIT), %eax
+	cmpl	$PI_BIT, %eax
+	jne	61f
 
 	movl	$(FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi
 	movl	$SYS_futex, %eax
@@ -234,11 +237,10 @@ __pthread_cond_wait:
 
 	/* If requeue_pi is used the kernel performs the locking of the
 	   mutex. */
-11:	xorl	%eax, %eax
+11:	movq	16(%rsp), %rdi
 	testl	%r13d, %r13d
-	jnz	14f
+	jnz	18f
 
-	movq	16(%rsp), %rdi
 	callq	__pthread_mutex_cond_lock
 
 14:	addq	$FRAME_SIZE, %rsp
@@ -254,11 +256,16 @@ __pthread_cond_wait:
 	/* We return the result of the mutex_lock operation.  */
 	retq
 
-	/* Initial locking failed.  */
-1:
 	cfi_adjust_cfa_offset(16 + FRAME_SIZE)
 	cfi_rel_offset(%r12, FRAME_SIZE + 8)
 	cfi_rel_offset(%r13, FRAME_SIZE)
+
+18:	callq	__pthread_mutex_cond_lock_adjust
+	xorl	%eax, %eax
+	jmp	14b
+
+	/* Initial locking failed.  */
+1:
 #if cond_lock != 0
 	addq	$cond_lock, %rdi
 #endif
diff --git a/nptl/sysdeps/x86_64/tcb-offsets.sym b/nptl/sysdeps/x86_64/tcb-offsets.sym
index 1c70c6bde7..51f35c61cf 100644
--- a/nptl/sysdeps/x86_64/tcb-offsets.sym
+++ b/nptl/sysdeps/x86_64/tcb-offsets.sym
@@ -15,3 +15,4 @@ VGETCPU_CACHE_OFFSET	offsetof (tcbhead_t, vgetcpu_cache)
 #ifndef __ASSUME_PRIVATE_FUTEX
 PRIVATE_FUTEX		offsetof (tcbhead_t, private_futex)
 #endif
+RTLD_SAVESPACE_SSE	offsetof (tcbhead_t, rtld_savespace_sse)
diff --git a/nptl/sysdeps/x86_64/tls.h b/nptl/sysdeps/x86_64/tls.h
index ea89f3b1a2..4212038ab5 100644
--- a/nptl/sysdeps/x86_64/tls.h
+++ b/nptl/sysdeps/x86_64/tls.h
@@ -29,6 +29,7 @@
 # include <sysdep.h>
 # include <kernel-features.h>
 # include <bits/wordsize.h>
+# include <xmmintrin.h>
 
 
 /* Type for the dtv.  */
@@ -55,16 +56,23 @@ typedef struct
   uintptr_t stack_guard;
   uintptr_t pointer_guard;
   unsigned long int vgetcpu_cache[2];
-#ifndef __ASSUME_PRIVATE_FUTEX
+# ifndef __ASSUME_PRIVATE_FUTEX
   int private_futex;
-#else
+# else
   int __unused1;
-#endif
-#if __WORDSIZE == 64
-  int __pad1;
-#endif
+# endif
+# if __WORDSIZE == 64
+  int rtld_must_xmm_save;
+# endif
   /* Reservation of some values for the TM ABI.  */
   void *__private_tm[5];
+# if __WORDSIZE == 64
+  long int __unused2;
+  /* Have space for the post-AVX register size.  */
+  __m128 rtld_savespace_sse[8][4];
+
+  void *__padding[8];
+# endif
 } tcbhead_t;
 
 #else /* __ASSEMBLER__ */
@@ -109,7 +117,12 @@ typedef struct
 # define TLS_TCB_SIZE sizeof (struct pthread)
 
 /* Alignment requirements for the TCB.  */
-# define TLS_TCB_ALIGN __alignof__ (struct pthread)
+//# define TLS_TCB_ALIGN __alignof__ (struct pthread)
+// Normally the above would be correct  But we have to store post-AVX
+// vector registers in the TCB and we want the storage to be aligned.
+// unfortunately there isn't yet a type for these values and hence no
+// 32-byte alignment requirement.  Make this explicit, for now.
+# define TLS_TCB_ALIGN 32
 
 /* The TCB can have any size and the memory following the address the
    thread pointer points to is unspecified.  Allocate the TCB there.  */
@@ -298,7 +311,7 @@ typedef struct
 
 
 /* Atomic compare and exchange on TLS, returning old value.  */
-#define THREAD_ATOMIC_CMPXCHG_VAL(descr, member, newval, oldval) \
+# define THREAD_ATOMIC_CMPXCHG_VAL(descr, member, newval, oldval) \
   ({ __typeof (descr->member) __ret;					      \
      __typeof (oldval) __old = (oldval);				      \
      if (sizeof (descr->member) == 4)					      \
@@ -313,7 +326,7 @@ typedef struct
 
 
 /* Atomic logical and.  */
-#define THREAD_ATOMIC_AND(descr, member, val) \
+# define THREAD_ATOMIC_AND(descr, member, val) \
   (void) ({ if (sizeof ((descr)->member) == 4)				      \
 	      asm volatile (LOCK_PREFIX "andl %1, %%fs:%P0"		      \
 			    :: "i" (offsetof (struct pthread, member)),	      \
@@ -324,7 +337,7 @@ typedef struct
 
 
 /* Atomic set bit.  */
-#define THREAD_ATOMIC_BIT_SET(descr, member, bit) \
+# define THREAD_ATOMIC_BIT_SET(descr, member, bit) \
   (void) ({ if (sizeof ((descr)->member) == 4)				      \
 	      asm volatile (LOCK_PREFIX "orl %1, %%fs:%P0"		      \
 			    :: "i" (offsetof (struct pthread, member)),	      \
@@ -334,7 +347,7 @@ typedef struct
 	      abort (); })
 
 
-#define CALL_THREAD_FCT(descr) \
+# define CALL_THREAD_FCT(descr) \
   ({ void *__res;							      \
      asm volatile ("movq %%fs:%P2, %%rdi\n\t"				      \
 		   "callq *%%fs:%P1"					      \
@@ -355,18 +368,18 @@ typedef struct
 
 
 /* Set the pointer guard field in the TCB head.  */
-#define THREAD_SET_POINTER_GUARD(value) \
+# define THREAD_SET_POINTER_GUARD(value) \
   THREAD_SETMEM (THREAD_SELF, header.pointer_guard, value)
-#define THREAD_COPY_POINTER_GUARD(descr) \
+# define THREAD_COPY_POINTER_GUARD(descr) \
   ((descr)->header.pointer_guard					      \
    = THREAD_GETMEM (THREAD_SELF, header.pointer_guard))
 
 
 /* Get and set the global scope generation counter in the TCB head.  */
-#define THREAD_GSCOPE_FLAG_UNUSED 0
-#define THREAD_GSCOPE_FLAG_USED   1
-#define THREAD_GSCOPE_FLAG_WAIT   2
-#define THREAD_GSCOPE_RESET_FLAG() \
+# define THREAD_GSCOPE_FLAG_UNUSED 0
+# define THREAD_GSCOPE_FLAG_USED   1
+# define THREAD_GSCOPE_FLAG_WAIT   2
+# define THREAD_GSCOPE_RESET_FLAG() \
   do									      \
     { int __res;							      \
       asm volatile ("xchgl %0, %%fs:%P1"				      \
@@ -377,11 +390,40 @@ typedef struct
 	lll_futex_wake (&THREAD_SELF->header.gscope_flag, 1, LLL_PRIVATE);    \
     }									      \
   while (0)
-#define THREAD_GSCOPE_SET_FLAG() \
+# define THREAD_GSCOPE_SET_FLAG() \
   THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED)
-#define THREAD_GSCOPE_WAIT() \
+# define THREAD_GSCOPE_WAIT() \
   GL(dl_wait_lookup_done) ()
 
+
+# ifdef SHARED
+/* Defined in dl-trampoline.S.  */
+extern void _dl_x86_64_save_sse (void);
+extern void _dl_x86_64_restore_sse (void);
+
+# define RTLD_CHECK_FOREIGN_CALL \
+  (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) != 0)
+
+#  define RTLD_ENABLE_FOREIGN_CALL \
+  THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 1)
+
+#  define RTLD_PREPARE_FOREIGN_CALL \
+  do if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save))	      \
+    {									      \
+      _dl_x86_64_save_sse ();						      \
+      THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0);	      \
+    }									      \
+  while (0)
+
+#  define RTLD_FINALIZE_FOREIGN_CALL \
+  do {									      \
+    if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) == 0)	      \
+      _dl_x86_64_restore_sse ();					      \
+    THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0);		      \
+  } while (0)
+# endif
+
+
 #endif /* __ASSEMBLER__ */
 
 #endif	/* tls.h */
diff --git a/stdio-common/scanf15.c b/stdio-common/scanf15.c
index c56715c486..851466b3a9 100644
--- a/stdio-common/scanf15.c
+++ b/stdio-common/scanf15.c
@@ -1,5 +1,6 @@
 #undef _GNU_SOURCE
 #define _XOPEN_SOURCE 600
+#undef _LIBC
 /* The following macro definitions are a hack.  They word around disabling
    the GNU extension while still using a few internal headers.  */
 #define u_char unsigned char
diff --git a/stdio-common/scanf17.c b/stdio-common/scanf17.c
index ee9024f9b7..4478a7022f 100644
--- a/stdio-common/scanf17.c
+++ b/stdio-common/scanf17.c
@@ -1,5 +1,6 @@
 #undef _GNU_SOURCE
 #define _XOPEN_SOURCE 600
+#undef _LIBC
 /* The following macro definitions are a hack.  They word around disabling
    the GNU extension while still using a few internal headers.  */
 #define u_char unsigned char
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 49d239f075..20da6956f1 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -61,6 +61,7 @@ _dl_runtime_resolve:
 	cfi_startproc
 
 _dl_runtime_profile:
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
 	/* The La_x86_64_regs data structure pointed to by the
 	   fourth paramater must be 16-byte aligned.  This must
 	   be explicitly enforced.  We have the set up a dynamically
@@ -68,7 +69,7 @@ _dl_runtime_profile:
 	   has a fixed size and preserves the original stack pointer.  */
 
 	subq $32, %rsp		# Allocate the local storage.
-	cfi_adjust_cfa_offset(48) # Incorporate PLT
+	cfi_adjust_cfa_offset(32)
 	movq %rbx, (%rsp)
 	cfi_rel_offset(%rbx, 0)
 
@@ -203,49 +204,49 @@ L(no_avx1):
 	vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
 	vpmovmskb %xmm8, %esi
 	cmpl $0xffff, %esi
-	je 1f
+	jne 1f
 	vmovdqu			(LR_VECTOR_OFFSET)(%rsp), %ymm0
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
 	vpmovmskb %xmm8, %esi
 	cmpl $0xffff, %esi
-	je 1f
+	jne 1f
 	vmovdqu	  (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
 	vpmovmskb %xmm8, %esi
 	cmpl $0xffff, %esi
-	je 1f
+	jne 1f
 	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
 	vpmovmskb %xmm8, %esi
 	cmpl $0xffff, %esi
-	je 1f
+	jne 1f
 	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
 	vpmovmskb %xmm8, %esi
 	cmpl $0xffff, %esi
-	je 1f
+	jne 1f
 	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
 	vpmovmskb %xmm8, %esi
 	cmpl $0xffff, %esi
-	je 1f
+	jne 1f
 	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
 	vpmovmskb %xmm8, %esi
 	cmpl $0xffff, %esi
-	je 1f
+	jne 1f
 	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
 	vpmovmskb %xmm8, %esi
 	cmpl $0xffff, %esi
-	je 1f
+	jne 1f
 	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7
 
 L(no_avx2):
@@ -361,13 +362,13 @@ L(no_avx3):
 	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
 	vpmovmskb %xmm2, %esi
 	cmpl $0xffff, %esi
-	je 1f
+	jne 1f
 	vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0
 
 1:	vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
 	vpmovmskb %xmm2, %esi
 	cmpl $0xffff, %esi
-	je 1f
+	jne 1f
 	vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1
 
 L(no_avx4):
@@ -390,3 +391,85 @@ L(no_avx4):
 	cfi_endproc
 	.size _dl_runtime_profile, .-_dl_runtime_profile
 #endif
+
+
+#ifdef SHARED
+	.globl _dl_x86_64_save_sse
+	.type _dl_x86_64_save_sse, @function
+	.align 16
+	cfi_startproc
+_dl_x86_64_save_sse:
+# ifdef HAVE_AVX_SUPPORT
+	cmpl	$0, L(have_avx)(%rip)
+	jne	1f
+	movq	%rbx, %r11		# Save rbx
+	movl	$1, %eax
+	cpuid
+	movq	%r11,%rbx		# Restore rbx
+	movl	$1, %eax
+	testl	$(1 << 28), %ecx
+	jne	2f
+	negl	%eax
+2:	movl	%eax, L(have_avx)(%rip)
+	cmpl	$0, %eax
+
+1:	js	L(no_avx5)
+
+#  define YMM_SIZE 32
+	vmovdqa %ymm0, %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE
+	vmovdqa %ymm1, %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE
+	vmovdqa %ymm2, %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE
+	vmovdqa %ymm3, %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE
+	vmovdqa %ymm4, %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE
+	vmovdqa %ymm5, %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE
+	vmovdqa %ymm6, %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE
+	vmovdqa %ymm7, %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE
+	ret
+L(no_avx5):
+# endif
+# define YMM_SIZE 16
+	movdqa	%xmm0, %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE
+	movdqa	%xmm1, %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE
+	movdqa	%xmm2, %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE
+	movdqa	%xmm3, %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE
+	movdqa	%xmm4, %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE
+	movdqa	%xmm5, %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE
+	movdqa	%xmm6, %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE
+	movdqa	%xmm7, %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE
+	ret
+	cfi_endproc
+	.size _dl_x86_64_save_sse, .-_dl_x86_64_save_sse
+
+
+	.globl _dl_x86_64_restore_sse
+	.type _dl_x86_64_restore_sse, @function
+	.align 16
+	cfi_startproc
+_dl_x86_64_restore_sse:
+# ifdef HAVE_AVX_SUPPORT
+	cmpl	$0, L(have_avx)(%rip)
+	js	L(no_avx6)
+
+	vmovdqa %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE, %ymm0
+	vmovdqa %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE, %ymm1
+	vmovdqa %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE, %ymm2
+	vmovdqa %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE, %ymm3
+	vmovdqa %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE, %ymm4
+	vmovdqa %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE, %ymm5
+	vmovdqa %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE, %ymm6
+	vmovdqa %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE, %ymm7
+	ret
+L(no_avx6):
+# endif
+	movdqa	%fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE, %xmm0
+	movdqa	%fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE, %xmm1
+	movdqa	%fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE, %xmm2
+	movdqa	%fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE, %xmm3
+	movdqa	%fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE, %xmm4
+	movdqa	%fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE, %xmm5
+	movdqa	%fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE, %xmm6
+	movdqa	%fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE, %xmm7
+	ret
+	cfi_endproc
+	.size _dl_x86_64_restore_sse, .-_dl_x86_64_restore_sse
+#endif
diff --git a/sysdeps/x86_64/multiarch/Versions b/sysdeps/x86_64/multiarch/Versions
new file mode 100644
index 0000000000..59b185ac8d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/Versions
@@ -0,0 +1,5 @@
+libc {
+  GLIBC_PRIVATE {
+    __get_cpu_features;
+  }
+}
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index 35fd19af0e..49b421eac8 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -86,3 +86,13 @@ __init_cpu_features (void)
   else
     __cpu_features.kind = arch_kind_other;
 }
+
+
+const struct cpu_features *
+__get_cpu_features (void)
+{
+  if (__cpu_features.kind == arch_kind_unknown)
+    __init_cpu_features ();
+
+  return &__cpu_features;
+}
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index 48a2127418..0151e8b95b 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -54,10 +54,28 @@ extern void __init_cpu_features (void) attribute_hidden;
       __init_cpu_features ();				\
   while (0)
 
+/* Used from outside libc.so to get access to the CPU features structure.  */
+extern const struct cpu_features *__get_cpu_features (void)
+     __attribute__ ((const));
+
 /* Following are the feature tests used throughout libc.  */
 
-#define HAS_POPCOUNT \
+#ifndef NOT_IN_libc
+# define HAS_POPCOUNT \
   ((__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 23)) != 0)
 
-#define HAS_SSE4_2 \
+# define HAS_SSE4_2 \
   ((__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 20)) != 0)
+
+# define HAS_FMA \
+  ((__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 12)) != 0)
+#else
+# define HAS_POPCOUNT \
+  ((__get_cpu_features ()->cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 23)) != 0)
+
+# define HAS_SSE4_2 \
+  ((__get_cpu_features ()->cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 20)) != 0)
+
+# define HAS_FMA \
+  ((__get_cpu_features ()->cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 12)) != 0)
+#endif
diff --git a/sysdeps/x86_64/multiarch/s_fma.c b/sysdeps/x86_64/multiarch/s_fma.c
new file mode 100644
index 0000000000..40601e9a68
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/s_fma.c
@@ -0,0 +1,43 @@
+/* FMA version of fma.
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA. */
+
+#include <config.h>
+#include <math.h>
+#include <init-arch.h>
+
+#ifdef HAVE_AVX_SUPPORT
+
+extern double __fma_sse2 (double x, double y, double z);
+
+
+double
+__fma_fma (double x, double y, double z)
+{
+  asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+  return x;
+}
+
+libm_ifunc (__fma, HAS_FMA ? __fma_fma : __fma_sse2);
+weak_alias (__fma, fma)
+
+# define __fma __fma_sse2
+#endif
+
+#include <math/s_fma.c>
diff --git a/sysdeps/x86_64/multiarch/s_fmaf.c b/sysdeps/x86_64/multiarch/s_fmaf.c
new file mode 100644
index 0000000000..f3d37f8f4a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/s_fmaf.c
@@ -0,0 +1,42 @@
+/* FMA version of fmaf.
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA. */
+
+#include <config.h>
+#include <math.h>
+#include <init-arch.h>
+
+#ifdef HAVE_AVX_SUPPORT
+
+extern float __fmaf_sse2 (float x, float y, float z);
+
+
+float
+__fmaf_fma (float x, float y, float z)
+{
+  asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+  return x;
+}
+
+libm_ifunc (__fmaf, HAS_FMA ? __fmaf_fma : __fmaf_sse2);
+weak_alias (__fmaf, fmaf)
+
+# define __fmaf __fmaf_sse2
+#endif
+
+#include <math/s_fmaf.c>
diff --git a/sysdeps/x86_64/tst-xmmymm.sh b/sysdeps/x86_64/tst-xmmymm.sh
index a576e7da0d..da8af7e686 100755
--- a/sysdeps/x86_64/tst-xmmymm.sh
+++ b/sysdeps/x86_64/tst-xmmymm.sh
@@ -59,10 +59,11 @@ for f in $tocheck; do
   objdump -d "$objpfx"../*/"$f" |
   awk 'BEGIN { last="" } /^[[:xdigit:]]* <[_[:alnum:]]*>:$/ { fct=substr($2, 2, length($2)-3) } /,%[xy]mm[[:digit:]]*$/ { if (last != fct) { print fct; last=fct} }' |
   while read fct; do
-    if test "$fct" != "_dl_runtime_profile"; then
-      echo "function $fct in $f modifies xmm/ymm" >> "$tmp"
-      result=1
+    if test "$fct" = "_dl_runtime_profile" -o "$fct" = "_dl_x86_64_restore_sse"; then
+      continue;
     fi
+    echo "function $fct in $f modifies xmm/ymm" >> "$tmp"
+    result=1
   done
 done