about summary refs log tree commit diff
diff options
context:
space:
mode:
authorAndreas Schwab <schwab@redhat.com>2009-08-10 10:11:14 +0200
committerAndreas Schwab <schwab@redhat.com>2009-08-10 10:11:14 +0200
commit859300345b2632aa6c27f8c270356d8cc2e3c55f (patch)
tree5f4ffccaf0e99768e9156870078da79409629a14
parent9b26179acf35751146b18ef1ed24a79bbea73276 (diff)
parentefa0569d2bfdbb7367fce42b1c99821b85d2d3ba (diff)
downloadglibc-859300345b2632aa6c27f8c270356d8cc2e3c55f.tar.gz
glibc-859300345b2632aa6c27f8c270356d8cc2e3c55f.tar.xz
glibc-859300345b2632aa6c27f8c270356d8cc2e3c55f.zip
Merge commit 'origin/master' into fedora/master
-rw-r--r--.gitignore1
-rw-r--r--ChangeLog107
-rw-r--r--Makefile4
-rw-r--r--NEWS24
-rw-r--r--elf/Makefile21
-rw-r--r--elf/dl-lookup.c2
-rw-r--r--elf/tst-audit6.c28
-rw-r--r--elf/tst-audit7.c1
-rw-r--r--elf/tst-auditmod6a.c46
-rw-r--r--elf/tst-auditmod6b.c220
-rw-r--r--elf/tst-auditmod6c.c225
-rw-r--r--elf/tst-auditmod7a.c1
-rw-r--r--elf/tst-auditmod7b.c218
-rw-r--r--nptl/ChangeLog30
-rw-r--r--nptl/sysdeps/unix/sysv/linux/x86_64/cancellation.S115
-rw-r--r--nptl/sysdeps/unix/sysv/linux/x86_64/libc-cancellation.S22
-rw-r--r--nptl/sysdeps/unix/sysv/linux/x86_64/librt-cancellation.S22
-rw-r--r--nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S2
-rw-r--r--nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S50
-rw-r--r--nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S210
-rw-r--r--nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S58
-rw-r--r--nptl/sysdeps/unix/sysv/linux/x86_64/sysdep-cancel.h55
-rw-r--r--nptl/sysdeps/x86_64/tcb-offsets.sym10
-rwxr-xr-xsysdeps/i386/configure550
-rw-r--r--sysdeps/i386/configure.in8
-rw-r--r--sysdeps/i386/i686/multiarch/Makefile11
-rw-r--r--sysdeps/i386/i686/multiarch/strcasestr-c.c2
-rw-r--r--sysdeps/i386/i686/multiarch/strcasestr.c1
-rw-r--r--sysdeps/i386/i686/multiarch/strcspn-c.c2
-rw-r--r--sysdeps/i386/i686/multiarch/strcspn.S114
-rw-r--r--sysdeps/i386/i686/multiarch/strlen.S154
-rw-r--r--sysdeps/i386/i686/multiarch/strpbrk-c.c2
-rw-r--r--sysdeps/i386/i686/multiarch/strpbrk.S3
-rw-r--r--sysdeps/i386/i686/multiarch/strspn-c.c2
-rw-r--r--sysdeps/i386/i686/multiarch/strspn.S95
-rw-r--r--sysdeps/i386/i686/multiarch/strstr-c.c12
-rw-r--r--sysdeps/i386/i686/multiarch/strstr.c1
-rw-r--r--sysdeps/x86_64/cacheinfo.c42
-rw-r--r--sysdeps/x86_64/dl-trampoline.S244
-rw-r--r--sysdeps/x86_64/dl-trampoline.h269
-rw-r--r--sysdeps/x86_64/multiarch/Makefile2
-rw-r--r--sysdeps/x86_64/multiarch/rawmemchr.S1
-rw-r--r--sysdeps/x86_64/multiarch/strcmp-ssse3.S3
-rw-r--r--sysdeps/x86_64/multiarch/strcmp.S12
-rw-r--r--sysdeps/x86_64/multiarch/strcspn-c.c6
-rw-r--r--sysdeps/x86_64/multiarch/strlen.S1
-rw-r--r--sysdeps/x86_64/multiarch/strncmp-ssse3.S4
-rw-r--r--sysdeps/x86_64/multiarch/strspn-c.c4
-rw-r--r--sysdeps/x86_64/strcmp.S211
49 files changed, 2735 insertions, 493 deletions
diff --git a/.gitignore b/.gitignore
index d9294bec05..a64fda5108 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ stamp.*
 *.tgz
 *.bz2
 =*
+TAGS
 TODO
 AUTHORS
 copyr-*
diff --git a/ChangeLog b/ChangeLog
index 037532075d..81e903cf68 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,110 @@
+2009-08-01  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* elf/Makefile (distribute): Add tst-audit6.c tst-auditmod6a.c
+	tst-auditmod6b.c tst-auditmod6c.c tst-audit7.c tst-auditmod7a.c
+	tst-auditmod7b.c.
+	(tests): Add tst-audit6 tst-audit7.
+	(modules-names): Add st-auditmod6a tst-auditmod6b tst-auditmod6c
+	tst-auditmod7a tst-auditmod7b.
+	($(objpfx)tst-audit6): New.
+	($(objpfx)tst-audit6.out): Likewise.
+	($(objpfx)tst-audit7): Likewise.
+	($(objpfx)tst-audit7.out): Likewise.
+	(tst-audit6-ENV): Likewise.
+	(tst-audit7-ENV): Likewise.
+	(CFLAGS-tst-auditmod6b.c): Likewise.
+	(CFLAGS-tst-auditmod6c.c): Likewise.
+	(CFLAGS-tst-auditmod7b.c): Likewise.
+	* elf/tst-audit6.c: New file.
+	* elf/tst-audit7.c: New file.
+	* elf/tst-auditmod6a.c: New file.
+	* elf/tst-auditmod6b.c: New file.
+	* elf/tst-auditmod6c.c: New file.
+	* elf/tst-auditmod7a.c: New file.
+	* elf/tst-auditmod7b.c: New file.
+	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Move
+	saving and restoring SSE/AVX registers to ...
+	* sysdeps/x86_64/dl-trampoline.h: This.  New file.
+
+2009-08-07  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/i386/i686/multiarch/strcspn.S (STRCSPN): Use PIC
+	only if SHARED is defined.
+	* sysdeps/i386/i686/multiarch/strspn.S (strspn): Likewise.
+
+2009-08-03  Jim Meyering  <meyering@redhat.com>
+
+	* sysdeps/i386/configure.in: Use AC_HEADER_CHECK.
+
+2009-08-08  Ulrich Drepper  <drepper@redhat.com>
+
+	* sysdeps/x86_64/multiarch/strlen.S: Move SSE4.2 version into the same
+	section as the other functions for this architecture.
+	* sysdeps/x86_64/multiarch/rawmemchr.S: Likewise.
+
+2009-08-07  Ulrich Drepper  <drepper@redhat.com>
+
+	* sysdeps/x86_64/strcmp.S: Add support to compile with
+	USE_SSSE3.  In this case palignr is used.
+	* sysdeps/x86_64/multiarch/strcmp.S (strcmp): If SSE4.3 is not
+	available but SSSE3 is, pick __str{,n}cmp_ssse3.
+	* sysdeps/x86_64/multiarch/Makefile [subdir=string] (sysdep_routines):
+	Add strcmp-ssse3 and strncmp-ssse3.
+	* sysdeps/x86_64/multiarch/strcmp-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/strncmp-ssse3.S: New file.
+
+	* sysdeps/x86_64/multiarch/strcspn-c.c (STRCSPN_SSE42): Avoid
+	warning through fake initialization.
+
+2009-08-07  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/i386/i686/multiarch/strlen.S (ENTRY): Add the missing "; \".
+
+2009-08-07  Andreas Schwab  <schwab@redhat.com>
+
+	* elf/dl-lookup.c (do_lookup_x): Enter correct name into table of
+	unique symbols.
+
+2009-08-05  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/x86_64/cacheinfo.c (init_cacheinfo): Properly use
+	EBX from EAX = 1.  Handle EAX = 11.
+
+2009-08-07  Andreas Schwab  <schwab@redhat.com>
+
+	* Makefile (TAGS): Use separate sed -e expressions to avoid \
+	inside ''.
+
+2009-08-03  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/i386/i686/multiarch/strcspn.S: Add comments for no
+	hidden IFUNC functions.
+	* sysdeps/i386/i686/multiarch/strspn.S: Likewise.
+
+	* sysdeps/i386/i686/multiarch/strlen.S: New file.
+
+	* sysdeps/i386/i686/multiarch/Makefile [subdir=string]
+	(sysdep_routines): Add strcspn-c, strpbrk-c, strspn-c, strstr-c, and
+	strcasestr-c.
+	(CFLAGS-strcspn-c.c): Define.
+	(CFLAGS-strpbrk-c.c): Define.
+	(CFLAGS-strspn-c.c): Define.
+	(CFLAGS-strstr.c): Define.
+	(CFLAGS-strcasestr.c): Define.
+	* sysdeps/i386/i686/multiarch/strcspn-c.c: New file.
+	* sysdeps/i386/i686/multiarch/strcspn.S: New file.
+	* sysdeps/i386/i686/multiarch/strpbrk-c.c: New file.
+	* sysdeps/i386/i686/multiarch/strpbrk.S: New file.
+	* sysdeps/i386/i686/multiarch/strspn-c.c: New file.
+	* sysdeps/i386/i686/multiarch/strspn.S: New file.
+	* sysdeps/i386/i686/multiarch/strstr-c.c: New file.
+	* sysdeps/i386/i686/multiarch/strstr.c: New file.
+	* sysdeps/i386/i686/multiarch/strcasestr-c.c: New file.
+	* sysdeps/i386/i686/multiarch/strcasestr.c: New file.
+	* sysdeps/x86_64/multiarch/strcspn-c.c (STRCSPN_SSE42): Use
+	-16L instead of 0xfffffffffffffff0L.
+	* sysdeps/x86_64/multiarch/strspn-c.c (__strspn_sse42): Likewise.
+
 2009-08-02  Ulrich Drepper  <drepper@redhat.com>
 
 	* sysdeps/i386/configure.in: Add test for <cpuid.h>.
diff --git a/Makefile b/Makefile
index cab5ff3a17..e346979c03 100644
--- a/Makefile
+++ b/Makefile
@@ -341,9 +341,9 @@ endif
 
 .PHONY: TAGS
 TAGS:
-	scripts/list-sources.sh | sed -n '/Makefile/p;\
+	scripts/list-sources.sh | sed -n -e '/Makefile/p' \
 	  $(foreach S,[chsSyl] cxx sh bash pl,\
-		    $(subst .,\.,/.$S\(.in\)*$$/p;))' \
+		    $(subst .,\.,-e '/.$S\(.in\)*$$/p')) \
 	| $(ETAGS) -o $@ -
 
 # Make the distribution tarfile.
diff --git a/NEWS b/NEWS
index 6061d42eb2..ca52e93b73 100644
--- a/NEWS
+++ b/NEWS
@@ -1,4 +1,4 @@
-GNU C Library NEWS -- history of user-visible changes.  2009-7-21
+GNU C Library NEWS -- history of user-visible changes.  2009-8-8
 Copyright (C) 1992-2008, 2009 Free Software Foundation, Inc.
 See the end for copying conditions.
 
@@ -17,12 +17,20 @@ Version 2.11
   Implemented by H.J. Lu.
 
 * New optimized string functions for x86-64: strstr, strcasestr, memcmp,
-  strcspn, strpbrk, strspn, strcpy, stpcpy, strncpy, strcmp, strncmp.
+  strcspn, strpbrk, strspn, strcpy, stpcpy, strncpy, strcmp (SSE2, SSE4.2),
+  strncmp (SSE2, SSE4.2).
   Contributed by H.J. Lu.
 
-  strlen, rawmemchr.
+  strlen, rawmemchr, strcmp (SSSE3), strncmp (SSSE3).
   Implemented by Ulrich Drepper.
 
+* New optimized string functions for x86: strlen, strcspn, strspn, strpbrk,
+  strstr, strcasestr.
+  Contributed by H.J. Lu.
+
+* Support for fma instruction in AVX on x86-64.
+  Implemented by H.J. Lu and Ulrich Drepper.
+
 * AVX support in x86-64 auditing support in ld.so.
   Implemented by H.J. Lu.
 
@@ -37,6 +45,16 @@ Version 2.11
   necessity is every process again.
   Implemented by Ulrich Drepper.
 
+* New resolver flag RES_USE_DNSSEC to enable use of verified lookup.
+  Implemented by Adam Tkac.
+
+* Optimized iconv conversions for S390x.
+  Implemented by Andreas Krebbel.
+
+* Using condvars with PI mutexes is now more efficient due to kernel
+  support for requeueing to PI futexes.  NPTL support added for x86-64.
+  Implemented by Ulrich Drepper.
+
 
 Version 2.10
 
diff --git a/elf/Makefile b/elf/Makefile
index 3baad9621d..d57c7fe7ed 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -93,6 +93,9 @@ distribute	:= rtld-Rules \
 		   tst-auditmod1.c tst-auditmod3a.c tst-auditmod3b.c \
 		   tst-auditmod4a.c tst-auditmod4b.c \
 		   tst-audit5.c tst-auditmod5a.c tst-auditmod5b.c \
+		   tst-audit6.c tst-auditmod6a.c tst-auditmod6b.c \
+		   tst-auditmod6c.c \
+		   tst-audit7.c tst-auditmod7a.c tst-auditmod7b.c \
 		   order2mod1.c order2mod2.c order2mod3.c order2mod4.c \
 		   tst-stackguard1.c tst-stackguard1-static.c \
 		   tst-array5.c tst-array5-static.c tst-array5dep.c \
@@ -200,7 +203,7 @@ tests += loadtest restest1 preloadtest loadfail multiload origtest resolvfail \
 test-srcs = tst-pathopt
 tests-execstack-yes = tst-execstack tst-execstack-needed tst-execstack-prog
 ifeq (x86_64,$(config-machine))
-tests += tst-audit3 tst-audit4 tst-audit5
+tests += tst-audit3 tst-audit4 tst-audit5 tst-audit6 tst-audit7
 endif
 endif
 ifeq (yesyes,$(have-fpie)$(build-shared))
@@ -255,7 +258,9 @@ endif
 ifeq (x86_64,$(config-machine))
 modules-names += tst-auditmod3a tst-auditmod3b \
 		tst-auditmod4a tst-auditmod4b \
-		tst-auditmod5a tst-auditmod5b
+		tst-auditmod5a tst-auditmod5b \
+		tst-auditmod6a tst-auditmod6b tst-auditmod6c \
+		tst-auditmod7a tst-auditmod7b
 endif
 modules-execstack-yes = tst-execstack-mod
 extra-test-objs += $(addsuffix .os,$(strip $(modules-names)))
@@ -987,6 +992,15 @@ $(objpfx)tst-audit5: $(objpfx)tst-auditmod5a.so
 $(objpfx)tst-audit5.out: $(objpfx)tst-auditmod5b.so
 tst-audit5-ENV = LD_AUDIT=$(objpfx)tst-auditmod5b.so
 
+$(objpfx)tst-audit6: $(objpfx)tst-auditmod6a.so
+$(objpfx)tst-audit6.out: $(objpfx)tst-auditmod6b.so \
+			 $(objpfx)tst-auditmod6c.so
+tst-audit6-ENV = LD_AUDIT=$(objpfx)tst-auditmod6b.so:$(objpfx)tst-auditmod6c.so
+
+$(objpfx)tst-audit7: $(objpfx)tst-auditmod7a.so
+$(objpfx)tst-audit7.out: $(objpfx)tst-auditmod7b.so
+tst-audit7-ENV = LD_AUDIT=$(objpfx)tst-auditmod7b.so
+
 $(objpfx)tst-global1: $(libdl)
 $(objpfx)tst-global1.out: $(objpfx)testobj6.so $(objpfx)testobj2.so
 
@@ -1134,4 +1148,7 @@ ifeq (yes,$(config-cflags-avx))
 CFLAGS-tst-audit4.c += -mavx
 CFLAGS-tst-auditmod4a.c += -mavx
 CFLAGS-tst-auditmod4b.c += -mavx
+CFLAGS-tst-auditmod6b.c += -mavx
+CFLAGS-tst-auditmod6c.c += -mavx
+CFLAGS-tst-auditmod7b.c += -mavx
 endif
diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
index 56724c9b4d..c1a1366d6f 100644
--- a/elf/dl-lookup.c
+++ b/elf/dl-lookup.c
@@ -321,7 +321,7 @@ do_lookup_x (const char *undef_name, uint_fast32_t new_hash,
 		    if (table[idx].name == NULL)
 		      {
 			table[idx].hashval = hash;
-			table[idx].name = strtab + sym->st_name;
+			table[idx].name = name;
 			if ((type_class & ELF_RTYPE_CLASS_COPY) != 0)
 			  {
 			    table[idx].sym = ref;
diff --git a/elf/tst-audit6.c b/elf/tst-audit6.c
new file mode 100644
index 0000000000..1f6dcb16e9
--- /dev/null
+++ b/elf/tst-audit6.c
@@ -0,0 +1,28 @@
+/* Test case for x86-64 preserved registers in dynamic linker.  */
+
+#include <stdlib.h>
+#include <string.h>
+#include <cpuid.h>
+#include <emmintrin.h>
+
+extern __m128i audit_test (__m128i, __m128i, __m128i, __m128i,
+			   __m128i, __m128i, __m128i, __m128i);
+
+int
+main (void)
+{
+  unsigned int eax, ebx, ecx, edx;
+
+  /* Run AVX test only if AVX is supported.  */
+  if (__get_cpuid (1, &eax, &ebx, &ecx, &edx)
+      && (ecx & bit_AVX))
+    {
+      __m128i xmm = _mm_setzero_si128 ();
+      __m128i ret = audit_test (xmm, xmm, xmm, xmm, xmm, xmm, xmm, xmm);
+
+      xmm = _mm_set1_epi32 (0x98abcdef);
+      if (memcmp (&xmm, &ret, sizeof (ret)))
+	abort ();
+    }
+  return 0;
+}
diff --git a/elf/tst-audit7.c b/elf/tst-audit7.c
new file mode 100644
index 0000000000..1d2a7de439
--- /dev/null
+++ b/elf/tst-audit7.c
@@ -0,0 +1 @@
+#include "tst-audit6.c"
diff --git a/elf/tst-auditmod6a.c b/elf/tst-auditmod6a.c
new file mode 100644
index 0000000000..c3a850ce98
--- /dev/null
+++ b/elf/tst-auditmod6a.c
@@ -0,0 +1,46 @@
+/* Test case for x86-64 preserved registers in dynamic linker.  */
+
+#include <stdlib.h>
+#include <string.h>
+#include <emmintrin.h>
+
+__m128i
+audit_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3,
+	    __m128i x4, __m128i x5, __m128i x6, __m128i x7)
+{
+  __m128i xmm;
+
+  xmm =  _mm_set1_epi32 (0x100);
+  if (memcmp (&xmm, &x0, sizeof (xmm)))
+    abort ();
+
+  xmm =  _mm_set1_epi32 (0x101);
+  if (memcmp (&xmm, &x1, sizeof (xmm)))
+    abort ();
+
+  xmm =  _mm_set1_epi32 (0x102);
+  if (memcmp (&xmm, &x2, sizeof (xmm)))
+    abort ();
+
+  xmm =  _mm_set1_epi32 (0x103);
+  if (memcmp (&xmm, &x3, sizeof (xmm)))
+    abort ();
+
+  xmm =  _mm_set1_epi32 (0x104);
+  if (memcmp (&xmm, &x4, sizeof (xmm)))
+    abort ();
+
+  xmm =  _mm_set1_epi32 (0x105);
+  if (memcmp (&xmm, &x5, sizeof (xmm)))
+    abort ();
+
+  xmm =  _mm_set1_epi32 (0x106);
+  if (memcmp (&xmm, &x6, sizeof (xmm)))
+    abort ();
+
+  xmm =  _mm_set1_epi32 (0x107);
+  if (memcmp (&xmm, &x7, sizeof (xmm)))
+    abort ();
+
+  return _mm_setzero_si128 ();
+}
diff --git a/elf/tst-auditmod6b.c b/elf/tst-auditmod6b.c
new file mode 100644
index 0000000000..f756b50227
--- /dev/null
+++ b/elf/tst-auditmod6b.c
@@ -0,0 +1,220 @@
+/* Verify that changing AVX registers in audit library won't affect
+   function parameter passing/return.  */
+
+#include <dlfcn.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <bits/wordsize.h>
+#include <gnu/lib-names.h>
+
+unsigned int
+la_version (unsigned int v)
+{
+  setlinebuf (stdout);
+
+  printf ("version: %u\n", v);
+
+  char buf[20];
+  sprintf (buf, "%u", v);
+
+  return v;
+}
+
+void
+la_activity (uintptr_t *cookie, unsigned int flag)
+{
+  if (flag == LA_ACT_CONSISTENT)
+    printf ("activity: consistent\n");
+  else if (flag == LA_ACT_ADD)
+    printf ("activity: add\n");
+  else if (flag == LA_ACT_DELETE)
+    printf ("activity: delete\n");
+  else
+    printf ("activity: unknown activity %u\n", flag);
+}
+
+char *
+la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag)
+{
+  char buf[100];
+  const char *flagstr;
+  if (flag == LA_SER_ORIG)
+    flagstr = "LA_SET_ORIG";
+  else if (flag == LA_SER_LIBPATH)
+    flagstr = "LA_SER_LIBPATH";
+  else if (flag == LA_SER_RUNPATH)
+    flagstr = "LA_SER_RUNPATH";
+  else if (flag == LA_SER_CONFIG)
+    flagstr = "LA_SER_CONFIG";
+  else if (flag == LA_SER_DEFAULT)
+    flagstr = "LA_SER_DEFAULT";
+  else if (flag == LA_SER_SECURE)
+    flagstr = "LA_SER_SECURE";
+  else
+    {
+       sprintf (buf, "unknown flag %d", flag);
+       flagstr = buf;
+    }
+  printf ("objsearch: %s, %s\n", name, flagstr);
+
+  return (char *) name;
+}
+
+unsigned int
+la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie)
+{
+  printf ("objopen: %ld, %s\n", lmid, l->l_name);
+
+  return 3;
+}
+
+void
+la_preinit (uintptr_t *cookie)
+{
+  printf ("preinit\n");
+}
+
+unsigned int
+la_objclose  (uintptr_t *cookie)
+{
+  printf ("objclose\n");
+  return 0;
+}
+
+uintptr_t
+la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook,
+	      uintptr_t *defcook, unsigned int *flags, const char *symname)
+{
+  printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+  return sym->st_value;
+}
+
+#define pltenter la_x86_64_gnu_pltenter
+#define pltexit la_x86_64_gnu_pltexit
+#define La_regs La_x86_64_regs
+#define La_retval La_x86_64_retval
+#define int_retval lrv_rax
+
+#include <tst-audit.h>
+
+#ifdef __AVX__
+#include <immintrin.h>
+#include <cpuid.h>
+
+static int avx = -1;
+
+static int
+__attribute ((always_inline))
+check_avx (void)
+{
+  if (avx == -1)
+    {
+      unsigned int eax, ebx, ecx, edx;
+
+      if (__get_cpuid (1, &eax, &ebx, &ecx, &edx)
+	  && (ecx & bit_AVX))
+	avx = 1;
+      else
+	avx = 0;
+    }
+  return avx;
+}
+#else
+#include <emmintrin.h>
+#endif
+
+ElfW(Addr)
+pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	  uintptr_t *defcook, La_regs *regs, unsigned int *flags,
+	  const char *symname, long int *framesizep)
+{
+  printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+#ifdef __AVX__
+  if (check_avx () && strcmp (symname, "audit_test") == 0)
+    {
+      int i;
+
+      __m128i xmm = _mm_setzero_si128 ();
+      for (i = 0; i < 8; i++)
+	if (memcmp (&regs->lr_xmm[i], &xmm, sizeof (xmm))
+	    || memcmp (&regs->lr_vector[i], &xmm, sizeof (xmm)))
+	  abort ();
+
+      for (i = 0; i < 8; i += 2)
+	{
+	  regs->lr_xmm[i] = (La_x86_64_xmm) _mm_set1_epi32 (i + 1);
+	  regs->lr_vector[i].xmm[0] = regs->lr_xmm[i];
+	  regs->lr_vector[i + 1].ymm[0]
+	    = (La_x86_64_ymm) _mm256_set1_epi32 (i + 2);
+	  regs->lr_xmm[i + 1] = regs->lr_vector[i + 1].xmm[0];
+	}
+
+      __m256i ymm = _mm256_set1_epi32 (-1);
+      asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" );
+      asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" );
+      asm volatile ("vmovdqa %0, %%ymm2" : : "x" (ymm) : "xmm2" );
+      asm volatile ("vmovdqa %0, %%ymm3" : : "x" (ymm) : "xmm3" );
+      asm volatile ("vmovdqa %0, %%ymm4" : : "x" (ymm) : "xmm4" );
+      asm volatile ("vmovdqa %0, %%ymm5" : : "x" (ymm) : "xmm5" );
+      asm volatile ("vmovdqa %0, %%ymm6" : : "x" (ymm) : "xmm6" );
+      asm volatile ("vmovdqa %0, %%ymm7" : : "x" (ymm) : "xmm7" );
+
+      *framesizep = 1024;
+    }
+#endif
+
+  return sym->st_value;
+}
+
+unsigned int
+pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	 uintptr_t *defcook, const La_regs *inregs, La_retval *outregs,
+	 const char *symname)
+{
+  printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n",
+	  symname, (long int) sym->st_value, ndx, outregs->int_retval);
+
+#ifdef __AVX__
+  if (check_avx () && strcmp (symname, "audit_test") == 0)
+    {
+      int i;
+
+      __m128i xmm = _mm_setzero_si128 ();
+      if (memcmp (&outregs->lrv_xmm0, &xmm, sizeof (xmm))
+	  || memcmp (&outregs->lrv_vector0, &xmm, sizeof (xmm)))
+	abort ();
+
+      __m256i ymm;
+
+      for (i = 0; i < 8; i += 2)
+	{
+	  xmm = _mm_set1_epi32 (i + 0x100);
+	  if (memcmp (&inregs->lr_xmm[i], &xmm, sizeof (xmm))
+	      || memcmp (&inregs->lr_vector[i], &xmm, sizeof (xmm)))
+	    abort ();
+
+	  ymm = _mm256_set1_epi32 (i + 0x101);
+	  if (memcmp (&inregs->lr_xmm[i + 1],
+		      &inregs->lr_vector[i + 1].xmm[0], sizeof (xmm))
+	      || memcmp (&inregs->lr_vector[i + 1], &ymm, sizeof (ymm)))
+	    abort ();
+	}
+
+      outregs->lrv_vector0.ymm[0]
+	= (La_x86_64_ymm) _mm256_set1_epi32 (0x12349876);
+
+      ymm = _mm256_set1_epi32 (-1);
+      asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" );
+      asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" );
+    }
+#endif
+
+  return 0;
+}
diff --git a/elf/tst-auditmod6c.c b/elf/tst-auditmod6c.c
new file mode 100644
index 0000000000..49cbf05492
--- /dev/null
+++ b/elf/tst-auditmod6c.c
@@ -0,0 +1,225 @@
+/* Verify that changing AVX registers in audit library won't affect
+   function parameter passing/return.  */
+
+#include <dlfcn.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <bits/wordsize.h>
+#include <gnu/lib-names.h>
+
+unsigned int
+la_version (unsigned int v)
+{
+  setlinebuf (stdout);
+
+  printf ("version: %u\n", v);
+
+  char buf[20];
+  sprintf (buf, "%u", v);
+
+  return v;
+}
+
+void
+la_activity (uintptr_t *cookie, unsigned int flag)
+{
+  if (flag == LA_ACT_CONSISTENT)
+    printf ("activity: consistent\n");
+  else if (flag == LA_ACT_ADD)
+    printf ("activity: add\n");
+  else if (flag == LA_ACT_DELETE)
+    printf ("activity: delete\n");
+  else
+    printf ("activity: unknown activity %u\n", flag);
+}
+
+char *
+la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag)
+{
+  char buf[100];
+  const char *flagstr;
+  if (flag == LA_SER_ORIG)
+    flagstr = "LA_SET_ORIG";
+  else if (flag == LA_SER_LIBPATH)
+    flagstr = "LA_SER_LIBPATH";
+  else if (flag == LA_SER_RUNPATH)
+    flagstr = "LA_SER_RUNPATH";
+  else if (flag == LA_SER_CONFIG)
+    flagstr = "LA_SER_CONFIG";
+  else if (flag == LA_SER_DEFAULT)
+    flagstr = "LA_SER_DEFAULT";
+  else if (flag == LA_SER_SECURE)
+    flagstr = "LA_SER_SECURE";
+  else
+    {
+       sprintf (buf, "unknown flag %d", flag);
+       flagstr = buf;
+    }
+  printf ("objsearch: %s, %s\n", name, flagstr);
+
+  return (char *) name;
+}
+
+unsigned int
+la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie)
+{
+  printf ("objopen: %ld, %s\n", lmid, l->l_name);
+
+  return 3;
+}
+
+void
+la_preinit (uintptr_t *cookie)
+{
+  printf ("preinit\n");
+}
+
+unsigned int
+la_objclose  (uintptr_t *cookie)
+{
+  printf ("objclose\n");
+  return 0;
+}
+
+uintptr_t
+la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook,
+	      uintptr_t *defcook, unsigned int *flags, const char *symname)
+{
+  printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+  return sym->st_value;
+}
+
+#define pltenter la_x86_64_gnu_pltenter
+#define pltexit la_x86_64_gnu_pltexit
+#define La_regs La_x86_64_regs
+#define La_retval La_x86_64_retval
+#define int_retval lrv_rax
+
+#include <tst-audit.h>
+
+#ifdef __AVX__
+#include <immintrin.h>
+#include <cpuid.h>
+
+static int avx = -1;
+
+static int
+__attribute ((always_inline))
+check_avx (void)
+{
+  if (avx == -1)
+    {
+      unsigned int eax, ebx, ecx, edx;
+
+      if (__get_cpuid (1, &eax, &ebx, &ecx, &edx)
+	  && (ecx & bit_AVX))
+	avx = 1;
+      else
+	avx = 0;
+    }
+  return avx;
+}
+#else
+#include <emmintrin.h>
+#endif
+
+ElfW(Addr)
+pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	  uintptr_t *defcook, La_regs *regs, unsigned int *flags,
+	  const char *symname, long int *framesizep)
+{
+  printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+#ifdef __AVX__
+  if (check_avx () && strcmp (symname, "audit_test") == 0)
+    {
+      int i;
+      __m128i xmm;
+      __m256i ymm;
+
+      for (i = 0; i < 8; i += 2)
+	{
+	  xmm = _mm_set1_epi32 (i + 1);
+	  if (memcmp (&regs->lr_xmm[i], &xmm, sizeof (xmm))
+	      || memcmp (&regs->lr_vector[i], &xmm, sizeof (xmm)))
+	    abort ();
+	  regs->lr_xmm[i] = (La_x86_64_xmm) _mm_set1_epi32 (i + 0x100);
+	  regs->lr_vector[i].xmm[0] = regs->lr_xmm[i];
+
+	  ymm = _mm256_set1_epi32 (i + 2);
+	  if (memcmp (&regs->lr_xmm[i + 1],
+		      &regs->lr_vector[i + 1].xmm[0], sizeof (xmm))
+	      || memcmp (&regs->lr_vector[i + 1], &ymm, sizeof (ymm)))
+	    abort ();
+	  regs->lr_vector[i + 1].ymm[0]
+	    = (La_x86_64_ymm) _mm256_set1_epi32 (i + 0x101);
+	  regs->lr_xmm[i + 1] = regs->lr_vector[i + 1].xmm[0];
+	}
+
+      ymm = _mm256_set1_epi32 (-1);
+      asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" );
+      asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" );
+      asm volatile ("vmovdqa %0, %%ymm2" : : "x" (ymm) : "xmm2" );
+      asm volatile ("vmovdqa %0, %%ymm3" : : "x" (ymm) : "xmm3" );
+      asm volatile ("vmovdqa %0, %%ymm4" : : "x" (ymm) : "xmm4" );
+      asm volatile ("vmovdqa %0, %%ymm5" : : "x" (ymm) : "xmm5" );
+      asm volatile ("vmovdqa %0, %%ymm6" : : "x" (ymm) : "xmm6" );
+      asm volatile ("vmovdqa %0, %%ymm7" : : "x" (ymm) : "xmm7" );
+
+      *framesizep = 1024;
+    }
+#endif
+
+  return sym->st_value;
+}
+
+unsigned int
+pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	 uintptr_t *defcook, const La_regs *inregs, La_retval *outregs,
+	 const char *symname)
+{
+  printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n",
+	  symname, (long int) sym->st_value, ndx, outregs->int_retval);
+
+#ifdef __AVX__
+  if (check_avx () && strcmp (symname, "audit_test") == 0)
+    {
+      int i;
+
+      __m256i ymm = _mm256_set1_epi32 (0x12349876);;
+      if (memcmp (&outregs->lrv_vector0, &ymm, sizeof (ymm)))
+	abort ();
+
+      __m128i xmm;
+
+      for (i = 0; i < 8; i += 2)
+	{
+	  xmm = _mm_set1_epi32 (i + 0x100);
+	  if (memcmp (&inregs->lr_xmm[i], &xmm, sizeof (xmm))
+	      || memcmp (&inregs->lr_vector[i], &xmm, sizeof (xmm)))
+	    abort ();
+
+	  ymm = _mm256_set1_epi32 (i + 0x101);
+	  if (memcmp (&inregs->lr_xmm[i + 1],
+		      &inregs->lr_vector[i + 1].xmm[0], sizeof (xmm))
+	      || memcmp (&inregs->lr_vector[i + 1], &ymm, sizeof (ymm)))
+	    abort ();
+	}
+
+      outregs->lrv_vector0.ymm[0]
+	= (La_x86_64_ymm) _mm256_set1_epi32 (0x98abcdef);
+
+      ymm = _mm256_set1_epi32 (-1);
+      asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" );
+      asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" );
+    }
+#endif
+
+  return 0;
+}
diff --git a/elf/tst-auditmod7a.c b/elf/tst-auditmod7a.c
new file mode 100644
index 0000000000..b379df75d6
--- /dev/null
+++ b/elf/tst-auditmod7a.c
@@ -0,0 +1 @@
+#include "tst-auditmod6a.c"
diff --git a/elf/tst-auditmod7b.c b/elf/tst-auditmod7b.c
new file mode 100644
index 0000000000..eb237586fe
--- /dev/null
+++ b/elf/tst-auditmod7b.c
@@ -0,0 +1,218 @@
+/* Verify that changing AVX registers in audit library won't affect
+   function parameter passing/return.  */
+
+#include <dlfcn.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <bits/wordsize.h>
+#include <gnu/lib-names.h>
+
+unsigned int
+la_version (unsigned int v)
+{
+  setlinebuf (stdout);
+
+  printf ("version: %u\n", v);
+
+  char buf[20];
+  sprintf (buf, "%u", v);
+
+  return v;
+}
+
+void
+la_activity (uintptr_t *cookie, unsigned int flag)
+{
+  if (flag == LA_ACT_CONSISTENT)
+    printf ("activity: consistent\n");
+  else if (flag == LA_ACT_ADD)
+    printf ("activity: add\n");
+  else if (flag == LA_ACT_DELETE)
+    printf ("activity: delete\n");
+  else
+    printf ("activity: unknown activity %u\n", flag);
+}
+
+char *
+la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag)
+{
+  char buf[100];
+  const char *flagstr;
+  if (flag == LA_SER_ORIG)
+    flagstr = "LA_SET_ORIG";
+  else if (flag == LA_SER_LIBPATH)
+    flagstr = "LA_SER_LIBPATH";
+  else if (flag == LA_SER_RUNPATH)
+    flagstr = "LA_SER_RUNPATH";
+  else if (flag == LA_SER_CONFIG)
+    flagstr = "LA_SER_CONFIG";
+  else if (flag == LA_SER_DEFAULT)
+    flagstr = "LA_SER_DEFAULT";
+  else if (flag == LA_SER_SECURE)
+    flagstr = "LA_SER_SECURE";
+  else
+    {
+       sprintf (buf, "unknown flag %d", flag);
+       flagstr = buf;
+    }
+  printf ("objsearch: %s, %s\n", name, flagstr);
+
+  return (char *) name;
+}
+
+unsigned int
+la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie)
+{
+  printf ("objopen: %ld, %s\n", lmid, l->l_name);
+
+  return 3;
+}
+
+void
+la_preinit (uintptr_t *cookie)
+{
+  printf ("preinit\n");
+}
+
+unsigned int
+la_objclose  (uintptr_t *cookie)
+{
+  printf ("objclose\n");
+  return 0;
+}
+
+uintptr_t
+la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook,
+	      uintptr_t *defcook, unsigned int *flags, const char *symname)
+{
+  printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+  return sym->st_value;
+}
+
+#define pltenter la_x86_64_gnu_pltenter
+#define pltexit la_x86_64_gnu_pltexit
+#define La_regs La_x86_64_regs
+#define La_retval La_x86_64_retval
+#define int_retval lrv_rax
+
+#include <tst-audit.h>
+
+#ifdef __AVX__
+#include <immintrin.h>
+#include <cpuid.h>
+
+static int avx = -1;
+
+static int
+__attribute ((always_inline))
+check_avx (void)
+{
+  if (avx == -1)
+    {
+      unsigned int eax, ebx, ecx, edx;
+
+      if (__get_cpuid (1, &eax, &ebx, &ecx, &edx)
+	  && (ecx & bit_AVX))
+	avx = 1;
+      else
+	avx = 0;
+    }
+  return avx;
+}
+#else
+#include <emmintrin.h>
+#endif
+
+ElfW(Addr)
+pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	  uintptr_t *defcook, La_regs *regs, unsigned int *flags,
+	  const char *symname, long int *framesizep)
+{
+  printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+#ifdef __AVX__
+  if (check_avx () && strcmp (symname, "audit_test") == 0)
+    {
+      int i;
+
+      __m128i xmm = _mm_setzero_si128 ();
+      for (i = 0; i < 8; i++)
+	if (memcmp (&regs->lr_xmm[i], &xmm, sizeof (xmm))
+	    || memcmp (&regs->lr_vector[i], &xmm, sizeof (xmm)))
+	  abort ();
+
+      for (i = 0; i < 8; i += 2)
+	{
+	  regs->lr_xmm[i] = (La_x86_64_xmm) _mm_set1_epi32 (i + 0x100);
+	  regs->lr_vector[i + 1].ymm[0]
+	    = (La_x86_64_ymm) _mm256_set1_epi32 (i + 0x101);
+	}
+
+      __m256i ymm = _mm256_set1_epi32 (-1);
+      asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" );
+      asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" );
+      asm volatile ("vmovdqa %0, %%ymm2" : : "x" (ymm) : "xmm2" );
+      asm volatile ("vmovdqa %0, %%ymm3" : : "x" (ymm) : "xmm3" );
+      asm volatile ("vmovdqa %0, %%ymm4" : : "x" (ymm) : "xmm4" );
+      asm volatile ("vmovdqa %0, %%ymm5" : : "x" (ymm) : "xmm5" );
+      asm volatile ("vmovdqa %0, %%ymm6" : : "x" (ymm) : "xmm6" );
+      asm volatile ("vmovdqa %0, %%ymm7" : : "x" (ymm) : "xmm7" );
+
+      *framesizep = 1024;
+    }
+#endif
+
+  return sym->st_value;
+}
+
+unsigned int
+pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	 uintptr_t *defcook, const La_regs *inregs, La_retval *outregs,
+	 const char *symname)
+{
+  printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n",
+	  symname, (long int) sym->st_value, ndx, outregs->int_retval);
+
+#ifdef __AVX__
+  if (check_avx () && strcmp (symname, "audit_test") == 0)
+    {
+      int i;
+
+      __m128i xmm = _mm_setzero_si128 ();
+      if (memcmp (&outregs->lrv_xmm0, &xmm, sizeof (xmm))
+	  || memcmp (&outregs->lrv_vector0, &xmm, sizeof (xmm)))
+	abort ();
+
+      __m256i ymm;
+
+      for (i = 0; i < 8; i += 2)
+	{
+	  xmm = _mm_set1_epi32 (i + 0x100);
+	  if (memcmp (&inregs->lr_xmm[i], &xmm, sizeof (xmm))
+	      || memcmp (&inregs->lr_vector[i], &xmm, sizeof (xmm)))
+	    abort ();
+
+	  ymm = _mm256_set1_epi32 (i + 0x101);
+	  if (memcmp (&inregs->lr_xmm[i + 1],
+		      &inregs->lr_vector[i + 1].xmm[0], sizeof (xmm))
+	      || memcmp (&inregs->lr_vector[i + 1], &ymm, sizeof (ymm)))
+	    abort ();
+	}
+
+      outregs->lrv_vector0.ymm[0]
+	= (La_x86_64_ymm) _mm256_set1_epi32 (0x98abcdef);
+
+      ymm = _mm256_set1_epi32 (-1);
+      asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" );
+      asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" );
+    }
+#endif
+
+  return 0;
+}
diff --git a/nptl/ChangeLog b/nptl/ChangeLog
index 83d0dda46c..fe3e90f310 100644
--- a/nptl/ChangeLog
+++ b/nptl/ChangeLog
@@ -1,3 +1,33 @@
+2009-08-08  Ulrich Drepper  <drepper@redhat.com>
+
+	* sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S (sem_timedwait):
+	Optimize code path used when FUTEX_CLOCK_REALTIME is supported.
+
+	* sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
+	(__pthread_cond_wait): Optimize by avoiding use of callee-safe
+	register.
+
+2009-08-07  Ulrich Drepper  <drepper@redhat.com>
+
+	* sysdeps/unix/sysv/linux/x86_64/sem_wait.S: Little optimizations
+	enabled by the special *_asynccancel functions.
+	* sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S: Likewise.
+	* sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S: Likewise.
+
+	* sysdeps/unix/sysv/linux/x86_64/cancellation.S: Include lowlevellock.h.
+
+2009-08-04  Ulrich Drepper  <drepper@redhat.com>
+
+	* sysdeps/unix/sysv/linux/x86_64/cancellation.S: New file.
+	* sysdeps/unix/sysv/linux/x86_64/libc-cancellation.S: New file.
+	* sysdeps/unix/sysv/linux/x86_64/librt-cancellation.S: New file.
+	* sysdeps/unix/sysv/linux/x86_64/sysdep-cancel.h (PSEUDO): Optimize
+	since we can assume the special __*_{en,dis}able_asynccancel
+	functions.
+	(PUSHARGS_*, POPARGS_*, SAVESTK_*, RESTSTK_*): Removed.
+	* sysdeps/x86_64/tcb-offsets.sym: Add cancellation-related bits
+	and PTHREAD_CANCELED.
+
 2009-07-31  Ulrich Drepper  <drepper@redhat.com>
 
 	* descr.h: Better definition of *_BITMASK macros for cancellation.
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/cancellation.S b/nptl/sysdeps/unix/sysv/linux/x86_64/cancellation.S
new file mode 100644
index 0000000000..0d48ec6fcd
--- /dev/null
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/cancellation.S
@@ -0,0 +1,115 @@
+/* Copyright (C) 2009 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2009.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <tcb-offsets.h>
+#include <kernel-features.h>
+#include "lowlevellock.h"
+
+#ifdef IS_IN_libpthread
+# ifdef SHARED
+#  define __pthread_unwind __GI___pthread_unwind
+# endif
+#else
+# ifndef SHARED
+	.weak __pthread_unwind
+# endif
+#endif
+
+
+#ifdef __ASSUME_PRIVATE_FUTEX
+# define LOAD_PRIVATE_FUTEX_WAIT(reg) \
+	movl	$(FUTEX_WAIT | FUTEX_PRIVATE_FLAG), reg
+#else
+# if FUTEX_WAIT == 0
+#  define LOAD_PRIVATE_FUTEX_WAIT(reg) \
+	movl	%fs:PRIVATE_FUTEX, reg
+# else
+#  define LOAD_PRIVATE_FUTEX_WAIT(reg) \
+	movl	%fs:PRIVATE_FUTEX, reg ; \
+	orl	$FUTEX_WAIT, reg
+# endif
+#endif
+
+/* It is crucial that the functions in this file don't modify registers
+   other than %rax and %r11.  The syscall wrapper code depends on this
+   because it doesn't explicitly save the other registers which hold
+   relevant values.  */
+	.text
+
+	.hidden __pthread_enable_asynccancel
+ENTRY(__pthread_enable_asynccancel)
+	movl	%fs:CANCELHANDLING, %eax
+2:	movl	%eax, %r11d
+	orl	$TCB_CANCELTYPE_BITMASK, %r11d
+	cmpl	%eax, %r11d
+	je	1f
+
+	lock
+	cmpxchgl %r11d, %fs:CANCELHANDLING
+	jnz	2b
+
+	andl	$(TCB_CANCELSTATE_BITMASK|TCB_CANCELTYPE_BITMASK|TCB_CANCELED_BITMASK|TCB_EXITING_BITMASK|TCB_CANCEL_RESTMASK|TCB_TERMINATED_BITMASK), %r11d
+	cmpl	$(TCB_CANCELTYPE_BITMASK|TCB_CANCELED_BITMASK), %r11d
+	je	3f
+
+1:	ret
+
+3:	movq	$TCB_PTHREAD_CANCELED, %fs:RESULT
+	lock
+	orl	$TCB_EXITING_BITMASK, %fs:CANCELHANDLING
+	movq	%fs:CLEANUP_JMP_BUF, %rdi
+#ifdef SHARED
+	call	__pthread_unwind@PLT
+#else
+	call	__pthread_unwind
+#endif
+	hlt
+END(__pthread_enable_asynccancel)
+
+
+	.hidden __pthread_disable_asynccancel
+ENTRY(__pthread_disable_asynccancel)
+	testl	$TCB_CANCELTYPE_BITMASK, %edi
+	jnz	1f
+
+	movl	%fs:CANCELHANDLING, %eax
+2:	movl	%eax, %r11d
+	andl	$~TCB_CANCELTYPE_BITMASK, %r11d
+	lock
+	cmpxchgl %r11d, %fs:CANCELHANDLING
+	jnz	2b
+
+3:	movl	%r11d, %eax
+	andl	$(TCB_CANCELING_BITMASK|TCB_CANCELED_BITMASK), %eax
+	cmpl	$TCB_CANCELING_BITMASK, %eax
+	je	4f
+1:	ret
+
+	/* Performance doesn't matter in this loop.  We will
+	   delay until the thread is canceled.  And we will unlikely
+	   enter the loop twice.  */
+4:	movq	%fs:0, %rdi
+	movl	$__NR_futex, %eax
+	xorq	%r10, %r10
+	addq	$CANCELHANDLING, %rdi
+	LOAD_PRIVATE_FUTEX_WAIT (%esi)
+	syscall
+	jmp	3b
+END(__pthread_disable_asynccancel)
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/libc-cancellation.S b/nptl/sysdeps/unix/sysv/linux/x86_64/libc-cancellation.S
new file mode 100644
index 0000000000..1100588502
--- /dev/null
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/libc-cancellation.S
@@ -0,0 +1,22 @@
+/* Copyright (C) 2009 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2009.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#define __pthread_enable_asynccancel __libc_enable_asynccancel
+#define __pthread_disable_asynccancel __libc_disable_asynccancel
+#include "cancellation.S"
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/librt-cancellation.S b/nptl/sysdeps/unix/sysv/linux/x86_64/librt-cancellation.S
new file mode 100644
index 0000000000..ce4192b5d3
--- /dev/null
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/librt-cancellation.S
@@ -0,0 +1,22 @@
+/* Copyright (C) 2009 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@redhat.com>, 2009.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#define __pthread_enable_asynccancel __librt_enable_asynccancel
+#define __pthread_disable_asynccancel __librt_disable_asynccancel
+#include "cancellation.S"
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S
index 4913beb8af..86bdac1b1b 100644
--- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S
@@ -157,7 +157,6 @@ __pthread_cond_timedwait:
 .LcleanupSTART1:
 34:	callq	__pthread_enable_asynccancel
 	movl	%eax, (%rsp)
-	movq	8(%rsp), %rdi
 
 	movq	%r13, %r10
 	movl	$FUTEX_WAIT_BITSET, %esi
@@ -511,7 +510,6 @@ __pthread_cond_timedwait:
 .LcleanupSTART2:
 4:	callq	__pthread_enable_asynccancel
 	movl	%eax, (%rsp)
-	movq	8(%rsp), %rdi
 
 	leaq	32(%rsp), %r10
 	cmpq	$-1, dep_mutex(%rdi)
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
index a66523eab6..f5b929ea71 100644
--- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
@@ -45,14 +45,8 @@ __pthread_cond_wait:
 	cfi_lsda(DW_EH_PE_udata4, .LexceptSTART)
 #endif
 
-	pushq	%r12
-	cfi_adjust_cfa_offset(8)
-	cfi_rel_offset(%r12, 0)
-	pushq	%r13
-	cfi_adjust_cfa_offset(8)
-	cfi_rel_offset(%r13, 0)
 #define FRAME_SIZE 32
-	subq	$FRAME_SIZE, %rsp
+	leaq	-FRAME_SIZE(%rsp), %rsp
 	cfi_adjust_cfa_offset(FRAME_SIZE)
 
 	/* Stack frame:
@@ -112,7 +106,7 @@ __pthread_cond_wait:
 	movl	%edx, 4(%rsp)
 
 	/* Unlock.  */
-8:	movl	cond_futex(%rdi), %r12d
+8:	movl	cond_futex(%rdi), %edx
 	LOCK
 #if cond_lock == 0
 	decl	(%rdi)
@@ -125,9 +119,7 @@ __pthread_cond_wait:
 4:	callq	__pthread_enable_asynccancel
 	movl	%eax, (%rsp)
 
-	movq	8(%rsp), %rdi
 	xorq	%r10, %r10
-	movq	%r12, %rdx
 	cmpq	$-1, dep_mutex(%rdi)
 	leaq	cond_futex(%rdi), %rdi
 	movl	$FUTEX_WAIT, %esi
@@ -145,7 +137,7 @@ __pthread_cond_wait:
 	movl	$SYS_futex, %eax
 	syscall
 
-	movl	$1, %r13d
+	movl	$1, %r8d
 #ifdef __ASSUME_REQUEUE_PI
 	jmp	62f
 #else
@@ -163,7 +155,7 @@ __pthread_cond_wait:
 #else
 	orl	%fs:PRIVATE_FUTEX, %esi
 #endif
-60:	xorl	%r13d, %r13d
+60:	xorl	%r8d, %r8d
 	movl	$SYS_futex, %eax
 	syscall
 
@@ -238,27 +230,18 @@ __pthread_cond_wait:
 	/* If requeue_pi is used the kernel performs the locking of the
 	   mutex. */
 11:	movq	16(%rsp), %rdi
-	testl	%r13d, %r13d
+	testl	%r8d, %r8d
 	jnz	18f
 
 	callq	__pthread_mutex_cond_lock
 
-14:	addq	$FRAME_SIZE, %rsp
+14:	leaq	FRAME_SIZE(%rsp), %rsp
 	cfi_adjust_cfa_offset(-FRAME_SIZE)
 
-	popq	%r13
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(%r13)
-	popq	%r12
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(%r12)
-
 	/* We return the result of the mutex_lock operation.  */
 	retq
 
-	cfi_adjust_cfa_offset(16 + FRAME_SIZE)
-	cfi_rel_offset(%r12, FRAME_SIZE + 8)
-	cfi_rel_offset(%r13, FRAME_SIZE)
+	cfi_adjust_cfa_offset(FRAME_SIZE)
 
 18:	callq	__pthread_mutex_cond_lock_adjust
 	xorl	%eax, %eax
@@ -285,7 +268,11 @@ __pthread_cond_wait:
 	movl	$LLL_PRIVATE, %eax
 	movl	$LLL_SHARED, %esi
 	cmovne	%eax, %esi
+	/* The call preserves %rdx.  */
 	callq	__lll_unlock_wake
+#if cond_lock != 0
+	subq	$cond_lock, %rdi
+#endif
 	jmp	4b
 
 	/* Locking in loop failed.  */
@@ -349,11 +336,7 @@ versioned_symbol (libpthread, __pthread_cond_wait, pthread_cond_wait,
 __condvar_cleanup1:
 	/* Stack frame:
 
-	   rsp + 48
-		    +--------------------------+
-	   rsp + 40 | %r12                     |
-		    +--------------------------+
-	   rsp + 32 | %r13                     |
+	   rsp + 32
 		    +--------------------------+
 	   rsp + 24 | unused                   |
 	            +--------------------------+
@@ -410,7 +393,7 @@ __condvar_cleanup1:
 3:	subl	$(1 << nwaiters_shift), cond_nwaiters(%rdi)
 
 	/* Wake up a thread which wants to destroy the condvar object.  */
-	xorq	%r12, %r12
+	xorl	%ecx, %ecx
 	cmpq	$0xffffffffffffffff, total_seq(%rdi)
 	jne	4f
 	movl	cond_nwaiters(%rdi), %eax
@@ -433,7 +416,7 @@ __condvar_cleanup1:
 	movl	$SYS_futex, %eax
 	syscall
 	subq	$cond_nwaiters, %rdi
-	movl	$1, %r12d
+	movl	$1, %ecx
 
 4:	LOCK
 #if cond_lock == 0
@@ -449,10 +432,11 @@ __condvar_cleanup1:
 	movl	$LLL_PRIVATE, %eax
 	movl	$LLL_SHARED, %esi
 	cmovne	%eax, %esi
+	/* The call preserves %rcx.  */
 	callq	__lll_unlock_wake
 
 	/* Wake up all waiters to make sure no signal gets lost.  */
-2:	testq	%r12, %r12
+2:	testl	%ecx, %ecx
 	jnz	5f
 	addq	$cond_futex, %rdi
 	cmpq	$-1, dep_mutex-cond_futex(%rdi)
@@ -474,8 +458,6 @@ __condvar_cleanup1:
 	callq	__pthread_mutex_cond_lock
 
 	movq	24(%rsp), %rdi
-	movq	40(%rsp), %r12
-	movq	32(%rsp), %r13
 .LcallUR:
 	call	_Unwind_Resume@PLT
 	hlt
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S
index 95762834d3..0291beb169 100644
--- a/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S
@@ -65,34 +65,9 @@ sem_timedwait:
 	retq
 
 	/* Check whether the timeout value is valid.  */
-1:	pushq	%r12
-	cfi_adjust_cfa_offset(8)
-	cfi_rel_offset(%r12, 0)
-	pushq	%r13
-	cfi_adjust_cfa_offset(8)
-	cfi_rel_offset(%r13, 0)
-	pushq	%r14
-	cfi_adjust_cfa_offset(8)
-	cfi_rel_offset(%r14, 0)
-#ifdef __ASSUME_FUTEX_CLOCK_REALTIME
-# define STACKFRAME 8
-#else
-# define STACKFRAME 24
-#endif
-	subq	$STACKFRAME, %rsp
-	cfi_adjust_cfa_offset(STACKFRAME)
-
-	movq	%rdi, %r12
-	movq	%rsi, %r13
-
-	/* Check for invalid nanosecond field.  */
-	cmpq	$1000000000, 8(%r13)
-	movl	$EINVAL, %r14d
+1:	cmpq	$1000000000, 8(%rsi)
 	jae	6f
 
-	LOCK
-	addq	$1, NWAITERS(%r12)
-
 #ifndef __ASSUME_FUTEX_CLOCK_REALTIME
 #  ifdef PIC
 	cmpl	$0, __have_futex_clock_realtime(%rip)
@@ -102,15 +77,22 @@ sem_timedwait:
 	je	.Lreltmo
 #endif
 
+	/* This push is only needed to store the sem_t pointer for the
+	   exception handler.  */
+	pushq	%rdi
+	cfi_adjust_cfa_offset(8)
+
+	movq	%rsi, %r10
+
+	LOCK
+	addq	$1, NWAITERS(%rdi)
+
 .LcleanupSTART:
 13:	call	__pthread_enable_asynccancel
-	movl	%eax, (%rsp)
+	movl	%eax, %r8d
 
-	movq	%r13, %r10
-#if VALUE == 0
-	movq	%r12, %rdi
-#else
-	leaq	VALUE(%r12), %rdi
+#if VALUE != 0
+	leaq	VALUE(%rdi), %rdi
 #endif
 	movl	$0xffffffff, %r9d
 	movl	$FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, %esi
@@ -118,22 +100,26 @@ sem_timedwait:
 	movl	$SYS_futex, %eax
 	xorl	%edx, %edx
 	syscall
-	movq	%rax, %r14
+	movq	%rax, %r9
+#if VALUE != 0
+	leaq	-VALUE(%rdi), %rdi
+#endif
 
-	movl	(%rsp), %edi
+	xchgq	%r8, %rdi
 	call	__pthread_disable_asynccancel
 .LcleanupEND:
+	movq	%r8, %rdi
 
-	testq	%r14, %r14
+	testq	%r9, %r9
 	je	11f
-	cmpq	$-EWOULDBLOCK, %r14
+	cmpq	$-EWOULDBLOCK, %r9
 	jne	3f
 
 11:
 #if VALUE == 0
-	movl	(%r12), %eax
+	movl	(%rdi), %eax
 #else
-	movl	VALUE(%r12), %eax
+	movl	VALUE(%rdi), %eax
 #endif
 14:	testl	%eax, %eax
 	je	13b
@@ -141,49 +127,74 @@ sem_timedwait:
 	leaq	-1(%rax), %rcx
 	LOCK
 #if VALUE == 0
-	cmpxchgl %ecx, (%r12)
+	cmpxchgl %ecx, (%rdi)
 #else
-	cmpxchgl %ecx, VALUE(%r12)
+	cmpxchgl %ecx, VALUE(%rdi)
 #endif
 	jne	14b
 
-10:	xorl	%eax, %eax
+	xorl	%eax, %eax
 
 15:	LOCK
-	subq	$1, NWAITERS(%r12)
+	subq	$1, NWAITERS(%rdi)
 
-	addq	$STACKFRAME, %rsp
-	cfi_adjust_cfa_offset(-STACKFRAME)
-	popq	%r14
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(%r14)
-	popq	%r13
+	leaq	8(%rsp), %rsp
 	cfi_adjust_cfa_offset(-8)
-	cfi_restore(%r13)
-	popq	%r12
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(%r12)
 	retq
 
-	cfi_adjust_cfa_offset(STACKFRAME + 3 * 8)
-	cfi_rel_offset(%r12, STACKFRAME + 2 * 8)
-	cfi_rel_offset(%r13, STACKFRAME + 1 * 8)
-	cfi_rel_offset(%r14, STACKFRAME)
-3:	negq	%r14
-6:
+	cfi_adjust_cfa_offset(8)
+3:	negq	%r9
 #if USE___THREAD
 	movq	errno@gottpoff(%rip), %rdx
-	movl	%r14d, %fs:(%rdx)
+	movl	%r9d, %fs:(%rdx)
 #else
 	callq	__errno_location@plt
-	movl	%r14d, (%rax)
+	movl	%r9d, (%rax)
 #endif
 
 	orl	$-1, %eax
 	jmp	15b
 
+	cfi_adjust_cfa_offset(-8)
+6:
+#if USE___THREAD
+	movq	errno@gottpoff(%rip), %rdx
+	movl	$EINVAL, %fs:(%rdx)
+#else
+	callq	__errno_location@plt
+	movl	$EINVAL, (%rax)
+#endif
+
+	orl	$-1, %eax
+
+	retq
+
 #ifndef __ASSUME_FUTEX_CLOCK_REALTIME
 .Lreltmo:
+	pushq	%r12
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%r12, 0)
+	pushq	%r13
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%r13, 0)
+	pushq	%r14
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%r14, 0)
+
+#ifdef __ASSUME_FUTEX_CLOCK_REALTIME
+# define STACKFRAME 8
+#else
+# define STACKFRAME 24
+#endif
+	subq	$STACKFRAME, %rsp
+	cfi_adjust_cfa_offset(STACKFRAME)
+
+	movq	%rdi, %r12
+	movq	%rsi, %r13
+
+	LOCK
+	addq	$1, NWAITERS(%r12)
+
 7:	xorl	%esi, %esi
 	movq	%rsp, %rdi
 	movq	$VSYSCALL_ADDR_vgettimeofday, %rax
@@ -202,7 +213,7 @@ sem_timedwait:
 	decq	%rdi
 5:	testq	%rdi, %rdi
 	movl	$ETIMEDOUT, %r14d
-	js	6b		/* Time is already up.  */
+	js	36f		/* Time is already up.  */
 
 	movq	%rdi, (%rsp)	/* Store relative timeout.  */
 	movq	%rsi, 8(%rsp)
@@ -235,7 +246,7 @@ sem_timedwait:
 	testq	%r14, %r14
 	je	9f
 	cmpq	$-EWOULDBLOCK, %r14
-	jne	3b
+	jne	33f
 
 9:
 # if VALUE == 0
@@ -254,15 +265,54 @@ sem_timedwait:
 	cmpxchgl %ecx, VALUE(%r12)
 # endif
 	jne	8b
-	jmp	10b
+
+	xorl	%eax, %eax
+
+45:	LOCK
+	subq	$1, NWAITERS(%r12)
+
+	addq	$STACKFRAME, %rsp
+	cfi_adjust_cfa_offset(-STACKFRAME)
+	popq	%r14
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(%r14)
+	popq	%r13
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(%r13)
+	popq	%r12
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(%r12)
+	retq
+
+	cfi_adjust_cfa_offset(STACKFRAME + 3 * 8)
+	cfi_rel_offset(%r12, STACKFRAME + 2 * 8)
+	cfi_rel_offset(%r13, STACKFRAME + 1 * 8)
+	cfi_rel_offset(%r14, STACKFRAME)
+33:	negq	%r14
+36:
+#if USE___THREAD
+	movq	errno@gottpoff(%rip), %rdx
+	movl	%r14d, %fs:(%rdx)
+#else
+	callq	__errno_location@plt
+	movl	%r14d, (%rax)
 #endif
+
+	orl	$-1, %eax
+	jmp	45b
+#endif
+	cfi_endproc
 	.size	sem_timedwait,.-sem_timedwait
 
 
 	.type	sem_timedwait_cleanup,@function
 sem_timedwait_cleanup:
+	cfi_startproc
+	cfi_adjust_cfa_offset(8)
+
+	movq	(%rsp), %rdi
 	LOCK
-	subq	$1, NWAITERS(%r12)
+	subq	$1, NWAITERS(%rdi)
 	movq	%rax, %rdi
 .LcallUR:
 	call	_Unwind_Resume@PLT
@@ -272,6 +322,30 @@ sem_timedwait_cleanup:
 	.size	sem_timedwait_cleanup,.-sem_timedwait_cleanup
 
 
+#ifndef __ASSUME_FUTEX_CLOCK_REALTIME
+	.type	sem_timedwait_cleanup2,@function
+sem_timedwait_cleanup2:
+	cfi_startproc
+	cfi_adjust_cfa_offset(STACKFRAME + 3 * 8)
+	cfi_rel_offset(%r12, STACKFRAME + 2 * 8)
+	cfi_rel_offset(%r13, STACKFRAME + 1 * 8)
+	cfi_rel_offset(%r14, STACKFRAME)
+
+	LOCK
+	subq	$1, NWAITERS(%r12)
+	movq	%rax, %rdi
+	movq	STACKFRAME(%rsp), %r14
+	movq	STACKFRAME+8(%rsp), %r13
+	movq	STACKFRAME+16(%rsp), %r12
+.LcallUR2:
+	call	_Unwind_Resume@PLT
+	hlt
+.LENDCODE2:
+	cfi_endproc
+	.size	sem_timedwait_cleanup2,.-sem_timedwait_cleanup2
+#endif
+
+
 	.section .gcc_except_table,"a",@progbits
 .LexceptSTART:
 	.byte	DW_EH_PE_omit			# @LPStart format
@@ -286,13 +360,19 @@ sem_timedwait_cleanup:
 #ifndef __ASSUME_FUTEX_CLOCK_REALTIME
 	.uleb128 .LcleanupSTART2-.LSTARTCODE
 	.uleb128 .LcleanupEND2-.LcleanupSTART2
-	.uleb128 sem_timedwait_cleanup-.LSTARTCODE
+	.uleb128 sem_timedwait_cleanup2-.LSTARTCODE
 	.uleb128  0
 #endif
 	.uleb128 .LcallUR-.LSTARTCODE
 	.uleb128 .LENDCODE-.LcallUR
 	.uleb128 0
 	.uleb128  0
+#ifndef __ASSUME_FUTEX_CLOCK_REALTIME
+	.uleb128 .LcallUR2-.LSTARTCODE
+	.uleb128 .LENDCODE2-.LcallUR2
+	.uleb128 0
+	.uleb128  0
+#endif
 .Lcstend:
 
 
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S
index a01d745a17..2cf6ec10a4 100644
--- a/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S
@@ -61,16 +61,13 @@ sem_wait:
 	xorl	%eax, %eax
 	retq
 
-1:	pushq	%r12
+	/* This push is only needed to store the sem_t pointer for the
+	   exception handler.  */
+1:	pushq	%rdi
 	cfi_adjust_cfa_offset(8)
-	cfi_rel_offset(%r12, 0)
-	pushq	%r13
-	cfi_adjust_cfa_offset(8)
-	cfi_rel_offset(%r13, 0)
-	movq	%rdi, %r13
 
 	LOCK
-	addq	$1, NWAITERS(%r13)
+	addq	$1, NWAITERS(%rdi)
 
 .LcleanupSTART:
 6:	call	__pthread_enable_asynccancel
@@ -78,7 +75,6 @@ sem_wait:
 
 	xorq	%r10, %r10
 	movl	$SYS_futex, %eax
-	movq	%r13, %rdi
 #if FUTEX_WAIT == 0
 	movl	PRIVATE(%rdi), %esi
 #else
@@ -87,22 +83,23 @@ sem_wait:
 #endif
 	xorl	%edx, %edx
 	syscall
-	movq	%rax, %r12
+	movq	%rax, %rcx
 
-	movl	%r8d, %edi
+	xchgq	%r8, %rdi
 	call	__pthread_disable_asynccancel
 .LcleanupEND:
+	movq	%r8, %rdi
 
-	testq	%r12, %r12
+	testq	%rcx, %rcx
 	je	3f
-	cmpq	$-EWOULDBLOCK, %r12
+	cmpq	$-EWOULDBLOCK, %rcx
 	jne	4f
 
 3:
 #if VALUE == 0
-	movl	(%r13), %eax
+	movl	(%rdi), %eax
 #else
-	movl	VALUE(%r13), %eax
+	movl	VALUE(%rdi), %eax
 #endif
 5:	testl	%eax, %eax
 	je	6b
@@ -110,50 +107,43 @@ sem_wait:
 	leal	-1(%rax), %edx
 	LOCK
 #if VALUE == 0
-	cmpxchgl %edx, (%r13)
+	cmpxchgl %edx, (%rdi)
 #else
-	cmpxchgl %edx, VALUE(%r13)
+	cmpxchgl %edx, VALUE(%rdi)
 #endif
 	jne	5b
 
-	LOCK
-	subq	$1, NWAITERS(%r13)
-
 	xorl	%eax, %eax
 
-9:	popq	%r13
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(%r13)
-	popq	%r12
+9:	LOCK
+	subq	$1, NWAITERS(%rdi)
+
+	leaq	8(%rsp), %rsp
 	cfi_adjust_cfa_offset(-8)
-	cfi_restore(%r12)
 
 	retq
 
-	cfi_adjust_cfa_offset(2 * 8)
-	cfi_rel_offset(%r12, 8)
-	cfi_rel_offset(%r13, 0)
-4:	negq	%r12
+	cfi_adjust_cfa_offset(8)
+4:	negq	%rcx
 #if USE___THREAD
 	movq	errno@gottpoff(%rip), %rdx
-	movl	%r12d, %fs:(%rdx)
+	movl	%ecx, %fs:(%rdx)
 #else
+# error "not supported.  %rcx and %rdi must be preserved"
 	callq	__errno_location@plt
-	movl	%r12d, (%rax)
+	movl	%ecx, (%rax)
 #endif
 	orl	$-1, %eax
 
-	LOCK
-	subq	$1, NWAITERS(%r13)
-
 	jmp 9b
 	.size	sem_wait,.-sem_wait
 
 
 	.type	sem_wait_cleanup,@function
 sem_wait_cleanup:
+	movq	(%rsp), %rdi
 	LOCK
-	subq	$1, NWAITERS(%r13)
+	subq	$1, NWAITERS(%rdi)
 	movq	%rax, %rdi
 .LcallUR:
 	call	_Unwind_Resume@PLT
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/sysdep-cancel.h b/nptl/sysdeps/unix/sysv/linux/x86_64/sysdep-cancel.h
index 3e741da794..1e92de1dcc 100644
--- a/nptl/sysdeps/unix/sysv/linux/x86_64/sysdep-cancel.h
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/sysdep-cancel.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
+/* Copyright (C) 2002-2006, 2009 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Jakub Jelinek <jakub@redhat.com>, 2002.
 
@@ -25,6 +25,10 @@
 
 #if !defined NOT_IN_libc || defined IS_IN_libpthread || defined IS_IN_librt
 
+/* The code to disable cancellation depends on the fact that the called
+   functions are special.  They don't modify registers other than %rax
+   and %r11 if they return.  Therefore we don't have to preserve other
+   registers around these calls.  */
 # undef PSEUDO
 # define PSEUDO(name, syscall_name, args)				      \
   .text;								      \
@@ -40,60 +44,23 @@
     ret;								      \
   .size __##syscall_name##_nocancel,.-__##syscall_name##_nocancel;	      \
   L(pseudo_cancel):							      \
-    /* Save registers that might get destroyed.  */			      \
-    SAVESTK_##args							      \
-    PUSHARGS_##args							      \
+    /* We always have to align the stack before calling a function.  */	      \
+    subq $8, %rsp; cfi_adjust_cfa_offset (8);				      \
     CENABLE								      \
-    /* Restore registers.  */						      \
-    POPARGS_##args							      \
     /* The return value from CENABLE is argument for CDISABLE.  */	      \
     movq %rax, (%rsp);							      \
-    movl $SYS_ify (syscall_name), %eax;					      \
-    syscall;								      \
+    DO_CALL (syscall_name, args);					      \
     movq (%rsp), %rdi;							      \
     /* Save %rax since it's the error code from the syscall.  */	      \
-    movq %rax, 8(%rsp);							      \
+    movq %rax, %rdx;							      \
     CDISABLE								      \
-    movq 8(%rsp), %rax;							      \
-    RESTSTK_##args							      \
+    movq %rdx, %rax;							      \
+    addq $8,%rsp; cfi_adjust_cfa_offset (-8);				      \
     cmpq $-4095, %rax;							      \
     jae SYSCALL_ERROR_LABEL;						      \
   L(pseudo_end):
 
 
-# define PUSHARGS_0	/* Nothing.  */
-# define PUSHARGS_1	PUSHARGS_0 movq %rdi, 8(%rsp);
-# define PUSHARGS_2	PUSHARGS_1 movq %rsi, 16(%rsp);
-# define PUSHARGS_3	PUSHARGS_2 movq %rdx, 24(%rsp);
-# define PUSHARGS_4	PUSHARGS_3 movq %rcx, 32(%rsp);
-# define PUSHARGS_5	PUSHARGS_4 movq %r8, 40(%rsp);
-# define PUSHARGS_6	PUSHARGS_5 movq %r9, 48(%rsp);
-
-# define POPARGS_0	/* Nothing.  */
-# define POPARGS_1	POPARGS_0 movq 8(%rsp), %rdi;
-# define POPARGS_2	POPARGS_1 movq 16(%rsp), %rsi;
-# define POPARGS_3	POPARGS_2 movq 24(%rsp), %rdx;
-# define POPARGS_4	POPARGS_3 movq 32(%rsp), %r10;
-# define POPARGS_5	POPARGS_4 movq 40(%rsp), %r8;
-# define POPARGS_6	POPARGS_5 movq 48(%rsp), %r9;
-
-/* We always have to align the stack before calling a function.  */
-# define SAVESTK_0	subq $24, %rsp; cfi_adjust_cfa_offset (24);
-# define SAVESTK_1	SAVESTK_0
-# define SAVESTK_2	SAVESTK_1
-# define SAVESTK_3	subq $40, %rsp; cfi_adjust_cfa_offset (40);
-# define SAVESTK_4	SAVESTK_3
-# define SAVESTK_5	subq $56, %rsp; cfi_adjust_cfa_offset (56);
-# define SAVESTK_6	SAVESTK_5
-
-# define RESTSTK_0	addq $24,%rsp; cfi_adjust_cfa_offset (-24);
-# define RESTSTK_1	RESTSTK_0
-# define RESTSTK_2	RESTSTK_1
-# define RESTSTK_3	addq $40, %rsp; cfi_adjust_cfa_offset (-40);
-# define RESTSTK_4	RESTSTK_3
-# define RESTSTK_5	addq $56, %rsp; cfi_adjust_cfa_offset (-56);
-# define RESTSTK_6	RESTSTK_5
-
 # ifdef IS_IN_libpthread
 #  define CENABLE	call __pthread_enable_asynccancel;
 #  define CDISABLE	call __pthread_disable_asynccancel;
diff --git a/nptl/sysdeps/x86_64/tcb-offsets.sym b/nptl/sysdeps/x86_64/tcb-offsets.sym
index 51f35c61cf..cf863752ee 100644
--- a/nptl/sysdeps/x86_64/tcb-offsets.sym
+++ b/nptl/sysdeps/x86_64/tcb-offsets.sym
@@ -16,3 +16,13 @@ VGETCPU_CACHE_OFFSET	offsetof (tcbhead_t, vgetcpu_cache)
 PRIVATE_FUTEX		offsetof (tcbhead_t, private_futex)
 #endif
 RTLD_SAVESPACE_SSE	offsetof (tcbhead_t, rtld_savespace_sse)
+
+-- Not strictly offsets, but these values are also used in the TCB.
+TCB_CANCELSTATE_BITMASK	 CANCELSTATE_BITMASK
+TCB_CANCELTYPE_BITMASK	 CANCELTYPE_BITMASK
+TCB_CANCELING_BITMASK	 CANCELING_BITMASK
+TCB_CANCELED_BITMASK	 CANCELED_BITMASK
+TCB_EXITING_BITMASK	 EXITING_BITMASK
+TCB_CANCEL_RESTMASK	 CANCEL_RESTMASK
+TCB_TERMINATED_BITMASK	 TERMINATED_BITMASK
+TCB_PTHREAD_CANCELED	 PTHREAD_CANCELED
diff --git a/sysdeps/i386/configure b/sysdeps/i386/configure
index ced0b31d0f..d1c4f7f501 100755
--- a/sysdeps/i386/configure
+++ b/sysdeps/i386/configure
@@ -1,13 +1,468 @@
+# Factoring default headers for most tests.
+ac_includes_default="\
+#include <stdio.h>
+#ifdef HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+#ifdef HAVE_SYS_STAT_H
+# include <sys/stat.h>
+#endif
+#ifdef STDC_HEADERS
+# include <stdlib.h>
+# include <stddef.h>
+#else
+# ifdef HAVE_STDLIB_H
+#  include <stdlib.h>
+# endif
+#endif
+#ifdef HAVE_STRING_H
+# if !defined STDC_HEADERS && defined HAVE_MEMORY_H
+#  include <memory.h>
+# endif
+# include <string.h>
+#endif
+#ifdef HAVE_STRINGS_H
+# include <strings.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>
+#endif
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif
+#ifdef HAVE_UNISTD_H
+# include <unistd.h>
+#endif"
+
 # This file is generated from configure.in by Autoconf.  DO NOT EDIT!
  # Local configure fragment for sysdeps/i386.
 
 
-{ echo "$as_me:$LINENO: checking if gcc provides <cpuid.h>" >&5
-echo $ECHO_N "checking if gcc provides <cpuid.h>... $ECHO_C" >&6; }
-if test "${libc_cv_gcc_cpuid+set}" = set; then
+
+{ echo "$as_me:$LINENO: checking for grep that handles long lines and -e" >&5
+echo $ECHO_N "checking for grep that handles long lines and -e... $ECHO_C" >&6; }
+if test "${ac_cv_path_GREP+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  # Extract the first word of "grep ggrep" to use in msg output
+if test -z "$GREP"; then
+set dummy grep ggrep; ac_prog_name=$2
+if test "${ac_cv_path_GREP+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_path_GREP_found=false
+# Loop through the user's path and test for each of PROGNAME-LIST
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_prog in grep ggrep; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
+    ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext"
+    { test -f "$ac_path_GREP" && $as_test_x "$ac_path_GREP"; } || continue
+    # Check for GNU ac_path_GREP and select it if it is found.
+  # Check for GNU $ac_path_GREP
+case `"$ac_path_GREP" --version 2>&1` in
+*GNU*)
+  ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;;
+*)
+  ac_count=0
+  echo $ECHO_N "0123456789$ECHO_C" >"conftest.in"
+  while :
+  do
+    cat "conftest.in" "conftest.in" >"conftest.tmp"
+    mv "conftest.tmp" "conftest.in"
+    cp "conftest.in" "conftest.nl"
+    echo 'GREP' >> "conftest.nl"
+    "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+    ac_count=`expr $ac_count + 1`
+    if test $ac_count -gt ${ac_path_GREP_max-0}; then
+      # Best one so far, save it but keep looking for a better one
+      ac_cv_path_GREP="$ac_path_GREP"
+      ac_path_GREP_max=$ac_count
+    fi
+    # 10*(2^10) chars as input seems more than enough
+    test $ac_count -gt 10 && break
+  done
+  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+
+    $ac_path_GREP_found && break 3
+  done
+done
+
+done
+IFS=$as_save_IFS
+
+
+fi
+
+GREP="$ac_cv_path_GREP"
+if test -z "$GREP"; then
+  { { echo "$as_me:$LINENO: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&5
+echo "$as_me: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&2;}
+   { (exit 1); exit 1; }; }
+fi
+
+else
+  ac_cv_path_GREP=$GREP
+fi
+
+
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_path_GREP" >&5
+echo "${ECHO_T}$ac_cv_path_GREP" >&6; }
+ GREP="$ac_cv_path_GREP"
+
+
+{ echo "$as_me:$LINENO: checking for egrep" >&5
+echo $ECHO_N "checking for egrep... $ECHO_C" >&6; }
+if test "${ac_cv_path_EGREP+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if echo a | $GREP -E '(a|b)' >/dev/null 2>&1
+   then ac_cv_path_EGREP="$GREP -E"
+   else
+     # Extract the first word of "egrep" to use in msg output
+if test -z "$EGREP"; then
+set dummy egrep; ac_prog_name=$2
+if test "${ac_cv_path_EGREP+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_path_EGREP_found=false
+# Loop through the user's path and test for each of PROGNAME-LIST
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_prog in egrep; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
+    ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext"
+    { test -f "$ac_path_EGREP" && $as_test_x "$ac_path_EGREP"; } || continue
+    # Check for GNU ac_path_EGREP and select it if it is found.
+  # Check for GNU $ac_path_EGREP
+case `"$ac_path_EGREP" --version 2>&1` in
+*GNU*)
+  ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;;
+*)
+  ac_count=0
+  echo $ECHO_N "0123456789$ECHO_C" >"conftest.in"
+  while :
+  do
+    cat "conftest.in" "conftest.in" >"conftest.tmp"
+    mv "conftest.tmp" "conftest.in"
+    cp "conftest.in" "conftest.nl"
+    echo 'EGREP' >> "conftest.nl"
+    "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+    ac_count=`expr $ac_count + 1`
+    if test $ac_count -gt ${ac_path_EGREP_max-0}; then
+      # Best one so far, save it but keep looking for a better one
+      ac_cv_path_EGREP="$ac_path_EGREP"
+      ac_path_EGREP_max=$ac_count
+    fi
+    # 10*(2^10) chars as input seems more than enough
+    test $ac_count -gt 10 && break
+  done
+  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+
+    $ac_path_EGREP_found && break 3
+  done
+done
+
+done
+IFS=$as_save_IFS
+
+
+fi
+
+EGREP="$ac_cv_path_EGREP"
+if test -z "$EGREP"; then
+  { { echo "$as_me:$LINENO: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&5
+echo "$as_me: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&2;}
+   { (exit 1); exit 1; }; }
+fi
+
+else
+  ac_cv_path_EGREP=$EGREP
+fi
+
+
+   fi
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_path_EGREP" >&5
+echo "${ECHO_T}$ac_cv_path_EGREP" >&6; }
+ EGREP="$ac_cv_path_EGREP"
+
+
+{ echo "$as_me:$LINENO: checking for ANSI C header files" >&5
+echo $ECHO_N "checking for ANSI C header files... $ECHO_C" >&6; }
+if test "${ac_cv_header_stdc+set}" = set; then
   echo $ECHO_N "(cached) $ECHO_C" >&6
 else
   cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <float.h>
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_cv_header_stdc=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_cv_header_stdc=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
+if test $ac_cv_header_stdc = yes; then
+  # SunOS 4.x string.h does not declare mem*, contrary to ANSI.
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <string.h>
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+  $EGREP "memchr" >/dev/null 2>&1; then
+  :
+else
+  ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+  # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI.
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <stdlib.h>
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+  $EGREP "free" >/dev/null 2>&1; then
+  :
+else
+  ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+  # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi.
+  if test "$cross_compiling" = yes; then
+  :
+else
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <ctype.h>
+#include <stdlib.h>
+#if ((' ' & 0x0FF) == 0x020)
+# define ISLOWER(c) ('a' <= (c) && (c) <= 'z')
+# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c))
+#else
+# define ISLOWER(c) \
+		   (('a' <= (c) && (c) <= 'i') \
+		     || ('j' <= (c) && (c) <= 'r') \
+		     || ('s' <= (c) && (c) <= 'z'))
+# define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c))
+#endif
+
+#define XOR(e, f) (((e) && !(f)) || (!(e) && (f)))
+int
+main ()
+{
+  int i;
+  for (i = 0; i < 256; i++)
+    if (XOR (islower (i), ISLOWER (i))
+	|| toupper (i) != TOUPPER (i))
+      return 2;
+  return 0;
+}
+_ACEOF
+rm -f conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && { ac_try='./conftest$ac_exeext'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  :
+else
+  echo "$as_me: program exited with status $ac_status" >&5
+echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+( exit $ac_status )
+ac_cv_header_stdc=no
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext
+fi
+
+
+fi
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_header_stdc" >&5
+echo "${ECHO_T}$ac_cv_header_stdc" >&6; }
+if test $ac_cv_header_stdc = yes; then
+
+cat >>confdefs.h <<\_ACEOF
+#define STDC_HEADERS 1
+_ACEOF
+
+fi
+
+# On IRIX 5.3, sys/types and inttypes.h are conflicting.
+
+
+
+
+
+
+
+
+
+for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \
+		  inttypes.h stdint.h unistd.h
+do
+as_ac_Header=`echo "ac_cv_header_$ac_header" | $as_tr_sh`
+{ echo "$as_me:$LINENO: checking for $ac_header" >&5
+echo $ECHO_N "checking for $ac_header... $ECHO_C" >&6; }
+if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+
+#include <$ac_header>
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  eval "$as_ac_Header=yes"
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	eval "$as_ac_Header=no"
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+ac_res=`eval echo '${'$as_ac_Header'}'`
+	       { echo "$as_me:$LINENO: result: $ac_res" >&5
+echo "${ECHO_T}$ac_res" >&6; }
+if test `eval echo '${'$as_ac_Header'}'` = yes; then
+  cat >>confdefs.h <<_ACEOF
+#define `echo "HAVE_$ac_header" | $as_tr_cpp` 1
+_ACEOF
+
+fi
+
+done
+
+
+if test "${ac_cv_header_cpuid_h+set}" = set; then
+  { echo "$as_me:$LINENO: checking for cpuid.h" >&5
+echo $ECHO_N "checking for cpuid.h... $ECHO_C" >&6; }
+if test "${ac_cv_header_cpuid_h+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_header_cpuid_h" >&5
+echo "${ECHO_T}$ac_cv_header_cpuid_h" >&6; }
+else
+  # Is the header compilable?
+{ echo "$as_me:$LINENO: checking cpuid.h usability" >&5
+echo $ECHO_N "checking cpuid.h usability... $ECHO_C" >&6; }
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
 #include <cpuid.h>
 _ACEOF
 rm -f conftest.$ac_objext
@@ -27,24 +482,103 @@ eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
 	 test -z "$ac_c_werror_flag" ||
 	 test ! -s conftest.err
        } && test -s conftest.$ac_objext; then
-  libc_cv_gcc_cpuid=yes
+  ac_header_compiler=yes
 else
   echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-	libc_cv_gcc_cpuid=no
+	ac_header_compiler=no
 fi
 
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+{ echo "$as_me:$LINENO: result: $ac_header_compiler" >&5
+echo "${ECHO_T}$ac_header_compiler" >&6; }
+
+# Is the header present?
+{ echo "$as_me:$LINENO: checking cpuid.h presence" >&5
+echo $ECHO_N "checking cpuid.h presence... $ECHO_C" >&6; }
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <cpuid.h>
+_ACEOF
+if { (ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } >/dev/null && {
+	 test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       }; then
+  ac_header_preproc=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+  ac_header_preproc=no
 fi
-{ echo "$as_me:$LINENO: result: $libc_cv_gcc_cpuid" >&5
-echo "${ECHO_T}$libc_cv_gcc_cpuid" >&6; }
-if test $libc_cv_gcc_cpuid != yes; then
+
+rm -f conftest.err conftest.$ac_ext
+{ echo "$as_me:$LINENO: result: $ac_header_preproc" >&5
+echo "${ECHO_T}$ac_header_preproc" >&6; }
+
+# So?  What about this header?
+case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in
+  yes:no: )
+    { echo "$as_me:$LINENO: WARNING: cpuid.h: accepted by the compiler, rejected by the preprocessor!" >&5
+echo "$as_me: WARNING: cpuid.h: accepted by the compiler, rejected by the preprocessor!" >&2;}
+    { echo "$as_me:$LINENO: WARNING: cpuid.h: proceeding with the compiler's result" >&5
+echo "$as_me: WARNING: cpuid.h: proceeding with the compiler's result" >&2;}
+    ac_header_preproc=yes
+    ;;
+  no:yes:* )
+    { echo "$as_me:$LINENO: WARNING: cpuid.h: present but cannot be compiled" >&5
+echo "$as_me: WARNING: cpuid.h: present but cannot be compiled" >&2;}
+    { echo "$as_me:$LINENO: WARNING: cpuid.h:     check for missing prerequisite headers?" >&5
+echo "$as_me: WARNING: cpuid.h:     check for missing prerequisite headers?" >&2;}
+    { echo "$as_me:$LINENO: WARNING: cpuid.h: see the Autoconf documentation" >&5
+echo "$as_me: WARNING: cpuid.h: see the Autoconf documentation" >&2;}
+    { echo "$as_me:$LINENO: WARNING: cpuid.h:     section \"Present But Cannot Be Compiled\"" >&5
+echo "$as_me: WARNING: cpuid.h:     section \"Present But Cannot Be Compiled\"" >&2;}
+    { echo "$as_me:$LINENO: WARNING: cpuid.h: proceeding with the preprocessor's result" >&5
+echo "$as_me: WARNING: cpuid.h: proceeding with the preprocessor's result" >&2;}
+    { echo "$as_me:$LINENO: WARNING: cpuid.h: in the future, the compiler will take precedence" >&5
+echo "$as_me: WARNING: cpuid.h: in the future, the compiler will take precedence" >&2;}
+
+    ;;
+esac
+{ echo "$as_me:$LINENO: checking for cpuid.h" >&5
+echo $ECHO_N "checking for cpuid.h... $ECHO_C" >&6; }
+if test "${ac_cv_header_cpuid_h+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_cv_header_cpuid_h=$ac_header_preproc
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_header_cpuid_h" >&5
+echo "${ECHO_T}$ac_cv_header_cpuid_h" >&6; }
+
+fi
+if test $ac_cv_header_cpuid_h = yes; then
+  :
+else
   { { echo "$as_me:$LINENO: error: gcc must provide the <cpuid.h> header" >&5
 echo "$as_me: error: gcc must provide the <cpuid.h> header" >&2;}
    { (exit 1); exit 1; }; }
 fi
 
+
+
 { echo "$as_me:$LINENO: checking if -g produces usable source locations for assembler-with-cpp" >&5
 echo $ECHO_N "checking if -g produces usable source locations for assembler-with-cpp... $ECHO_C" >&6; }
 if test "${libc_cv_cpp_asm_debuginfo+set}" = set; then
diff --git a/sysdeps/i386/configure.in b/sysdeps/i386/configure.in
index 800f928fbd..12dceaf844 100644
--- a/sysdeps/i386/configure.in
+++ b/sysdeps/i386/configure.in
@@ -1,12 +1,8 @@
 GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
 # Local configure fragment for sysdeps/i386.
 
-AC_CACHE_CHECK([if gcc provides <cpuid.h>], libc_cv_gcc_cpuid, [dnl
-AC_COMPILE_IFELSE([#include <cpuid.h>], libc_cv_gcc_cpuid=yes,
-		  libc_cv_gcc_cpuid=no)])
-if test $libc_cv_gcc_cpuid != yes; then
-  AC_MSG_ERROR([gcc must provide the <cpuid.h> header])
-fi
+AC_HEADER_CHECK([cpuid.h], ,
+  [AC_MSG_ERROR([gcc must provide the <cpuid.h> header])])
 
 AC_CACHE_CHECK(if -g produces usable source locations for assembler-with-cpp,
 	       libc_cv_cpp_asm_debuginfo, [dnl
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 33d98c36e6..e1553b284e 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -2,3 +2,14 @@ ifeq ($(subdir),csu)
 aux += init-arch
 gen-as-const-headers += ifunc-defines.sym
 endif
+
+ifeq ($(subdir),string)
+ifeq (yes,$(config-cflags-sse4))
+sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
+CFLAGS-strcspn-c.c += -msse4
+CFLAGS-strpbrk-c.c += -msse4
+CFLAGS-strspn-c.c += -msse4
+CFLAGS-strstr.c += -msse4
+CFLAGS-strcasestr.c += -msse4
+endif
+endif
diff --git a/sysdeps/i386/i686/multiarch/strcasestr-c.c b/sysdeps/i386/i686/multiarch/strcasestr-c.c
new file mode 100644
index 0000000000..0d52b0e47a
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcasestr-c.c
@@ -0,0 +1,2 @@
+#define __strcasestr_sse2 __strcasestr_ia32
+#include <sysdeps/x86_64/multiarch/strcasestr-c.c>
diff --git a/sysdeps/i386/i686/multiarch/strcasestr.c b/sysdeps/i386/i686/multiarch/strcasestr.c
new file mode 100644
index 0000000000..511bb29ede
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcasestr.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/strcasestr.c>
diff --git a/sysdeps/i386/i686/multiarch/strcspn-c.c b/sysdeps/i386/i686/multiarch/strcspn-c.c
new file mode 100644
index 0000000000..6d61e190a8
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcspn-c.c
@@ -0,0 +1,2 @@
+#define __strcspn_sse2 __strcspn_ia32
+#include <sysdeps/x86_64/multiarch/strcspn-c.c>
diff --git a/sysdeps/i386/i686/multiarch/strcspn.S b/sysdeps/i386/i686/multiarch/strcspn.S
new file mode 100644
index 0000000000..73e7eb45a8
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcspn.S
@@ -0,0 +1,114 @@
+/* Multiple versions of strcspn
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <config.h>
+
+#ifdef HAVE_SSE4_SUPPORT
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+#ifdef USE_AS_STRPBRK
+#define STRCSPN_SSE42	__strpbrk_sse42
+#define STRCSPN_IA32	__strpbrk_ia32
+#define __GI_STRCSPN	__GI_strpbrk
+#else
+#ifndef STRCSPN
+#define STRCSPN		strcspn
+#define STRCSPN_SSE42	__strcspn_sse42
+#define STRCSPN_IA32	__strcspn_ia32
+#define __GI_STRCSPN	__GI_strcspn
+#endif
+#endif
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strpbrk in static library since we
+   need strpbrk before the initialization happened.  */
+#if (defined SHARED || !defined USE_AS_STRPBRK) && !defined NOT_IN_libc
+# ifdef SHARED
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(STRCSPN)
+	.type	STRCSPN, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	STRCSPN_IA32@GOTOFF(%ebx), %eax
+	testl	$(1<<20), CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	STRCSPN_SSE42@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4);
+	cfi_restore (ebx)
+	ret
+END(STRCSPN)
+# else
+	.text
+ENTRY(STRCSPN)
+	.type	STRCSPN, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features
+	jne	1f
+	call	__init_cpu_features
+1:	leal	STRCSPN_IA32, %eax
+	testl	$(1<<20), CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET+__cpu_features
+	jz	2f
+	leal	STRCSPN_SSE42, %eax
+2:	ret
+END(STRCSPN)
+# endif
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type STRCSPN_IA32, @function; \
+	.globl STRCSPN_IA32; \
+	.p2align 4; \
+	STRCSPN_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCSPN_IA32, .-STRCSPN_IA32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_IA32
+#endif
+
+#endif /* HAVE_SSE4_SUPPORT */
+
+#ifdef USE_AS_STRPBRK
+#include "../../strpbrk.S"
+#else
+#include "../../strcspn.S"
+#endif
diff --git a/sysdeps/i386/i686/multiarch/strlen.S b/sysdeps/i386/i686/multiarch/strlen.S
new file mode 100644
index 0000000000..0c1e8646ff
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strlen.S
@@ -0,0 +1,154 @@
+/* Multiple versions of strlen
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+/* Define multiple versions only for the definition in libc and for the
+   DSO.  In static binaries, we need strlen before the initialization
+   happened.  */
+#if defined SHARED && !defined NOT_IN_libc
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(strlen)
+	.type	strlen, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__strlen_ia32@GOTOFF(%ebx), %eax
+	testl	$(1<<26), CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__strlen_sse2@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4);
+	cfi_restore (ebx)
+	ret
+END(strlen)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define RETURN		popl %esi; CFI_POP (esi); ret
+
+	.text
+ENTRY (__strlen_sse2)
+/*
+ * This implementation uses SSE instructions to compare up to 16 bytes
+ * at a time looking for the end of string (null char).
+ */
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (%esi, 0)
+	mov	8(%esp), %eax
+	mov	%eax, %ecx
+	pxor	%xmm0, %xmm0		/* 16 null chars */
+	mov	%eax, %esi
+	and	$15, %ecx
+	jz	1f			/* string is 16 byte aligned */
+
+	/*
+	* Unaligned case. Round down to 16-byte boundary before comparing
+	* 16 bytes for a null char. The code then compensates for any extra chars
+	* preceding the start of the string.
+	*/
+	and	$-16, %esi
+
+	pcmpeqb	(%esi), %xmm0
+	lea	16(%eax), %esi
+	pmovmskb %xmm0, %edx
+
+	shr	%cl, %edx		/* Compensate for bytes preceding the string */
+	test	%edx, %edx
+	jnz	2f
+	sub	%ecx, %esi		/* no null, adjust to next 16-byte boundary */
+	pxor	%xmm0, %xmm0		/* clear xmm0, may have been changed... */
+
+	.p2align 4
+1:					/* 16 byte aligned */
+	pcmpeqb	(%esi), %xmm0		/* look for null bytes */
+	pmovmskb %xmm0, %edx		/* move each byte mask of %xmm0 to edx */
+
+	add	$16, %esi		/* prepare to search next 16 bytes */
+	test	%edx, %edx		/* if no null byte, %edx must be 0 */
+	jnz	2f			/* found a null */
+
+	pcmpeqb	(%esi), %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %esi
+	test	%edx, %edx
+	jnz	2f
+
+	pcmpeqb	(%esi), %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %esi
+	test	%edx, %edx
+	jnz	2f
+
+	pcmpeqb	(%esi), %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %esi
+	test	%edx, %edx
+	jz	1b
+
+2:
+	neg	%eax
+	lea	-16(%eax, %esi), %eax	/* calculate exact offset */
+	bsf	%edx, %ecx		/* Least significant 1 bit is index of null */
+	add	%ecx, %eax
+	popl	%esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (%esi)
+	ret
+
+END (__strlen_sse2)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strlen_ia32, @function; \
+	.globl __strlen_ia32; \
+	.p2align 4; \
+	__strlen_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strlen_ia32, .-__strlen_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strlen; __GI_strlen = __strlen_ia32
+#endif
+
+#include "../../i586/strlen.S"
diff --git a/sysdeps/i386/i686/multiarch/strpbrk-c.c b/sysdeps/i386/i686/multiarch/strpbrk-c.c
new file mode 100644
index 0000000000..5db62053b3
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strpbrk-c.c
@@ -0,0 +1,2 @@
+#define __strpbrk_sse2 __strpbrk_ia32
+#include <sysdeps/x86_64/multiarch/strpbrk-c.c>
diff --git a/sysdeps/i386/i686/multiarch/strpbrk.S b/sysdeps/i386/i686/multiarch/strpbrk.S
new file mode 100644
index 0000000000..ed5bca6a94
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strpbrk.S
@@ -0,0 +1,3 @@
+#define STRCSPN strpbrk
+#define USE_AS_STRPBRK
+#include "strcspn.S"
diff --git a/sysdeps/i386/i686/multiarch/strspn-c.c b/sysdeps/i386/i686/multiarch/strspn-c.c
new file mode 100644
index 0000000000..bea09dea71
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strspn-c.c
@@ -0,0 +1,2 @@
+#define __strspn_sse2 __strspn_ia32
+#include <sysdeps/x86_64/multiarch/strspn-c.c>
diff --git a/sysdeps/i386/i686/multiarch/strspn.S b/sysdeps/i386/i686/multiarch/strspn.S
new file mode 100644
index 0000000000..f306d2d1fb
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strspn.S
@@ -0,0 +1,95 @@
+/* Multiple versions of strspn
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <config.h>
+
+#ifdef HAVE_SSE4_SUPPORT
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(strspn)
+	.type	strspn, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__strspn_ia32@GOTOFF(%ebx), %eax
+	testl	$(1<<20), CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__strspn_sse42@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4);
+	cfi_restore (ebx)
+	ret
+END(strspn)
+# else
+	.text
+ENTRY(strspn)
+	.type	strspn, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__strspn_ia32, %eax
+	testl	$(1<<20), CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET+__cpu_features
+	jz	2f
+	leal	__strspn_sse42, %eax
+2:	ret
+END(strspn)
+# endif
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strspn_ia32, @function; \
+	.globl __strspn_ia32; \
+	.p2align 4
+	__strspn_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strspn_ia32, .-__strspn_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strspn; __GI_strspn = __strspn_ia32
+#endif
+
+#endif /* HAVE_SSE4_SUPPORT */
+
+#include "../../strspn.S"
diff --git a/sysdeps/i386/i686/multiarch/strstr-c.c b/sysdeps/i386/i686/multiarch/strstr-c.c
new file mode 100644
index 0000000000..7ef1157ce4
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strstr-c.c
@@ -0,0 +1,12 @@
+#include "init-arch.h"
+
+#define STRSTR __strstr_ia32
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name) \
+  __hidden_ver1 (__strstr_ia32, __GI_strstr, __strstr_ia32);
+
+#include "string/strstr.c"
+
+extern char *__strstr_sse42 (const char *, const char *);
+
+libc_ifunc (strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_ia32);
diff --git a/sysdeps/i386/i686/multiarch/strstr.c b/sysdeps/i386/i686/multiarch/strstr.c
new file mode 100644
index 0000000000..a97428c125
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strstr.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/strstr.c>
diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c
index f252fc2c6c..5b66c62eb3 100644
--- a/sysdeps/x86_64/cacheinfo.c
+++ b/sysdeps/x86_64/cacheinfo.c
@@ -516,13 +516,15 @@ init_cacheinfo (void)
           shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
 	}
 
+      unsigned int ebx_1;
+
 #ifdef USE_MULTIARCH
       eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax;
-      ebx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx;
+      ebx_1 = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx;
       ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
       edx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx;
 #else
-      __cpuid (1, eax, ebx, ecx, edx);
+      __cpuid (1, eax, ebx_1, ecx, edx);
 #endif
 
 #ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION
@@ -554,14 +556,46 @@ init_cacheinfo (void)
 	    }
           while (((eax >> 5) & 0x7) != level);
 
-	  threads = ((eax >> 14) & 0x3ff) + 1;
+	  threads = (eax >> 14) & 0x3ff;
+
+	  /* If max_cpuid >= 11, THREADS is the maximum number of
+	      addressable IDs for logical processors sharing the
+	      cache, instead of the maximum number of threads
+	      sharing the cache.  */
+	  if (threads && max_cpuid >= 11)
+	    {
+	      /* Find the number of logical processors shipped in
+		 one core and apply count mask.  */
+	      i = 0;
+	      while (1)
+		{
+		  __cpuid_count (11, i++, eax, ebx, ecx, edx);
+
+		  int shipped = ebx & 0xff;
+		  int type = ecx & 0xff0;
+		  if (shipped == 0 || type == 0)
+		    break;
+		  else if (type == 0x200)
+		    {
+		      int count_mask;
+
+		      /* Compute count mask.  */
+		      asm ("bsr %1, %0"
+			   : "=r" (count_mask) : "g" (threads));
+		      count_mask = ~(-1 << (count_mask + 1));
+		      threads = (shipped - 1) & count_mask;
+		      break;
+		    }
+		}
+	    }
+	  threads += 1;
 	}
       else
         {
 	intel_bug_no_cache_info:
 	  /* Assume that all logical threads share the highest cache level.  */
 
-	  threads = (ebx >> 16) & 0xff;
+	  threads = (ebx_1 >> 16) & 0xff;
 	}
 
       /* Cap usage of highest cache level to the number of supported
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 20da6956f1..f9c60ad5cf 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -146,247 +146,17 @@ L(have_avx):
 2:	movl	%eax, L(have_avx)(%rip)
 	cmpl	$0, %eax
 
-1:	js	L(no_avx1)
+1:	js	L(no_avx)
 
-	/* This is to support AVX audit modules.  */
-	vmovdqu %ymm0,		      (LR_VECTOR_OFFSET)(%rsp)
-	vmovdqu %ymm1, (LR_VECTOR_OFFSET +   VECTOR_SIZE)(%rsp)
-	vmovdqu %ymm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
-	vmovdqu %ymm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
-	vmovdqu %ymm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
-	vmovdqu %ymm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
-	vmovdqu %ymm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
-	vmovdqu %ymm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+#  define RESTORE_AVX
+#  include "dl-trampoline.h"
 
-	/* Save xmm0-xmm7 registers to detect if any of them are
-	   changed by audit module.  */
-	vmovdqa %xmm0,		    (LR_SIZE)(%rsp)
-	vmovdqa %xmm1, (LR_SIZE +   XMM_SIZE)(%rsp)
-	vmovdqa %xmm2, (LR_SIZE + XMM_SIZE*2)(%rsp)
-	vmovdqa %xmm3, (LR_SIZE + XMM_SIZE*3)(%rsp)
-	vmovdqa %xmm4, (LR_SIZE + XMM_SIZE*4)(%rsp)
-	vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp)
-	vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp)
-	vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp)
-
-L(no_avx1):
-# endif
-
-	movq %rsp, %rcx		# La_x86_64_regs pointer to %rcx.
-	movq 48(%rbx), %rdx	# Load return address if needed.
-	movq 40(%rbx), %rsi	# Copy args pushed by PLT in register.
-	movq 32(%rbx), %rdi	# %rdi: link_map, %rsi: reloc_index
-	leaq 16(%rbx), %r8
-	call _dl_profile_fixup	# Call resolver.
-
-	movq %rax, %r11		# Save return value.
-
-	movq 8(%rbx), %rax	# Get back register content.
-	movq LR_RDX_OFFSET(%rsp), %rdx
-	movq  LR_R8_OFFSET(%rsp), %r8
-	movq  LR_R9_OFFSET(%rsp), %r9
-
-	movaps		    (LR_XMM_OFFSET)(%rsp), %xmm0
-	movaps	 (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
-	movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
-	movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
-	movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
-	movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
-	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
-	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
-
-# ifdef HAVE_AVX_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	js	L(no_avx2)
-
-	/* Check if any xmm0-xmm7 registers are changed by audit
-	   module.  */
-	vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
-	vpmovmskb %xmm8, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu			(LR_VECTOR_OFFSET)(%rsp), %ymm0
-
-1:	vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
-	vpmovmskb %xmm8, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu	  (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1
-
-1:	vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
-	vpmovmskb %xmm8, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2
-
-1:	vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
-	vpmovmskb %xmm8, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3
-
-1:	vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
-	vpmovmskb %xmm8, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4
-
-1:	vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
-	vpmovmskb %xmm8, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5
-
-1:	vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
-	vpmovmskb %xmm8, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6
-
-1:	vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
-	vpmovmskb %xmm8, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7
-
-L(no_avx2):
-1:
-# endif
-	movq 16(%rbx), %r10	# Anything in framesize?
-	testq %r10, %r10
-	jns 3f
-
-	/* There's nothing in the frame size, so there
-	   will be no call to the _dl_call_pltexit. */
-
-	/* Get back registers content.  */
-	movq LR_RCX_OFFSET(%rsp), %rcx
-	movq LR_RSI_OFFSET(%rsp), %rsi
-	movq LR_RDI_OFFSET(%rsp), %rdi
-
-	movq %rbx, %rsp
-	movq (%rsp), %rbx
-	cfi_restore(rbx)
-	cfi_def_cfa_register(%rsp)
-
-	addq $48, %rsp		# Adjust the stack to the return value
-				# (eats the reloc index and link_map)
-	cfi_adjust_cfa_offset(-48)
-	jmp *%r11		# Jump to function address.
-
-3:
-	cfi_adjust_cfa_offset(48)
-	cfi_rel_offset(%rbx, 0)
-	cfi_def_cfa_register(%rbx)
-
-	/* At this point we need to prepare new stack for the function
-	   which has to be called.  We copy the original stack to a
-	   temporary buffer of the size specified by the 'framesize'
-	   returned from _dl_profile_fixup */
-
-	leaq LR_RSP_OFFSET(%rbx), %rsi	# stack
-	addq $8, %r10
-	andq $0xfffffffffffffff0, %r10
-	movq %r10, %rcx
-	subq %r10, %rsp
-	movq %rsp, %rdi
-	shrq $3, %rcx
-	rep
-	movsq
-
-	movq 24(%rdi), %rcx	# Get back register content.
-	movq 32(%rdi), %rsi
-	movq 40(%rdi), %rdi
-
-	call *%r11
-
-	mov 24(%rbx), %rsp	# Drop the copied stack content
-
-	/* Now we have to prepare the La_x86_64_retval structure for the
-	   _dl_call_pltexit.  The La_x86_64_regs is being pointed by rsp now,
-	   so we just need to allocate the sizeof(La_x86_64_retval) space on
-	   the stack, since the alignment has already been taken care of. */
-# ifdef HAVE_AVX_SUPPORT
-	/* sizeof(La_x86_64_retval).  Need extra space for 2 SSE
-	   registers to detect if xmm0/xmm1 registers are changed
-	   by audit module.  */
-	subq $(LRV_SIZE + XMM_SIZE*2), %rsp
-# else
-	subq $LRV_SIZE, %rsp	# sizeof(La_x86_64_retval)
-# endif
-	movq %rsp, %rcx		# La_x86_64_retval argument to %rcx.
-
-	/* Fill in the La_x86_64_retval structure.  */
-	movq %rax, LRV_RAX_OFFSET(%rcx)
-	movq %rdx, LRV_RDX_OFFSET(%rcx)
-
-	movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
-	movaps %xmm1, LRV_XMM1_OFFSET(%rcx)
-
-# ifdef HAVE_AVX_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	js	L(no_avx3)
-
-	/* This is to support AVX audit modules.  */
-	vmovdqu %ymm0, LRV_VECTOR0_OFFSET(%rcx)
-	vmovdqu %ymm1, LRV_VECTOR1_OFFSET(%rcx)
-
-	/* Save xmm0/xmm1 registers to detect if they are changed
-	   by audit module.  */
-	vmovdqa %xmm0,		  (LRV_SIZE)(%rcx)
-	vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
-
-L(no_avx3):
-# endif
-
-	fstpt LRV_ST0_OFFSET(%rcx)
-	fstpt LRV_ST1_OFFSET(%rcx)
-
-	movq 24(%rbx), %rdx	# La_x86_64_regs argument to %rdx.
-	movq 40(%rbx), %rsi	# Copy args pushed by PLT in register.
-        movq 32(%rbx), %rdi	# %rdi: link_map, %rsi: reloc_index
-	call _dl_call_pltexit
-
-	/* Restore return registers.  */
-	movq LRV_RAX_OFFSET(%rsp), %rax
-	movq LRV_RDX_OFFSET(%rsp), %rdx
-
-	movaps LRV_XMM0_OFFSET(%rsp), %xmm0
-	movaps LRV_XMM1_OFFSET(%rsp), %xmm1
-
-# ifdef HAVE_AVX_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	js	L(no_avx4)
-
-	/* Check if xmm0/xmm1 registers are changed by audit module.  */
-	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
-	vpmovmskb %xmm2, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0
-
-1:	vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
-	vpmovmskb %xmm2, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1
-
-L(no_avx4):
-1:
+	.align 16
+L(no_avx):
 # endif
 
-	fldt LRV_ST1_OFFSET(%rsp)
-	fldt LRV_ST0_OFFSET(%rsp)
-
-	movq %rbx, %rsp
-	movq (%rsp), %rbx
-	cfi_restore(rbx)
-	cfi_def_cfa_register(%rsp)
-
-	addq $48, %rsp		# Adjust the stack to the return value
-				# (eats the reloc index and link_map)
-	cfi_adjust_cfa_offset(-48)
-	retq
+#  undef RESTORE_AVX
+#  include "dl-trampoline.h"
 
 	cfi_endproc
 	.size _dl_runtime_profile, .-_dl_runtime_profile
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
new file mode 100644
index 0000000000..5d49ed4408
--- /dev/null
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -0,0 +1,269 @@
+/* Partial PLT profile trampoline to save and restore x86-64 vector
+   registers.
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifdef RESTORE_AVX
+	/* This is to support AVX audit modules.  */
+	vmovdqu %ymm0,		      (LR_VECTOR_OFFSET)(%rsp)
+	vmovdqu %ymm1, (LR_VECTOR_OFFSET +   VECTOR_SIZE)(%rsp)
+	vmovdqu %ymm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
+	vmovdqu %ymm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
+	vmovdqu %ymm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
+	vmovdqu %ymm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
+	vmovdqu %ymm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
+	vmovdqu %ymm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+
+	/* Save xmm0-xmm7 registers to detect if any of them are
+	   changed by audit module.  */
+	vmovdqa %xmm0,		    (LR_SIZE)(%rsp)
+	vmovdqa %xmm1, (LR_SIZE +   XMM_SIZE)(%rsp)
+	vmovdqa %xmm2, (LR_SIZE + XMM_SIZE*2)(%rsp)
+	vmovdqa %xmm3, (LR_SIZE + XMM_SIZE*3)(%rsp)
+	vmovdqa %xmm4, (LR_SIZE + XMM_SIZE*4)(%rsp)
+	vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp)
+	vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp)
+	vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp)
+#endif
+
+	movq %rsp, %rcx		# La_x86_64_regs pointer to %rcx.
+	movq 48(%rbx), %rdx	# Load return address if needed.
+	movq 40(%rbx), %rsi	# Copy args pushed by PLT in register.
+	movq 32(%rbx), %rdi	# %rdi: link_map, %rsi: reloc_index
+	leaq 16(%rbx), %r8
+	call _dl_profile_fixup	# Call resolver.
+
+	movq %rax, %r11		# Save return value.
+
+	movq 8(%rbx), %rax	# Get back register content.
+	movq LR_RDX_OFFSET(%rsp), %rdx
+	movq  LR_R8_OFFSET(%rsp), %r8
+	movq  LR_R9_OFFSET(%rsp), %r9
+
+	movaps		    (LR_XMM_OFFSET)(%rsp), %xmm0
+	movaps	 (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
+	movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
+	movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
+	movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
+	movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
+	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
+	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
+
+#ifdef RESTORE_AVX
+	/* Check if any xmm0-xmm7 registers are changed by audit
+	   module.  */
+	vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 2f
+	vmovdqa	%xmm0, (LR_VECTOR_OFFSET)(%rsp)
+	jmp 1f
+2:	vmovdqu	(LR_VECTOR_OFFSET)(%rsp), %ymm0
+	vmovdqa	%xmm0, (LR_XMM_OFFSET)(%rsp)
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 2f
+	vmovdqa	%xmm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
+	jmp 1f
+2:	vmovdqu	(LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1
+	vmovdqa	%xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 2f
+	vmovdqa	%xmm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
+	jmp 1f
+2:	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2
+	vmovdqa	%xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 2f
+	vmovdqa	%xmm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
+	jmp 1f
+2:	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3
+	vmovdqa	%xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 2f
+	vmovdqa	%xmm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
+	jmp 1f
+2:	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4
+	vmovdqa	%xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 2f
+	vmovdqa	%xmm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
+	jmp 1f
+2:	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5
+	vmovdqa	%xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 2f
+	vmovdqa	%xmm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
+	jmp 1f
+2:	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6
+	vmovdqa	%xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 2f
+	vmovdqa	%xmm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+	jmp 1f
+2:	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7
+	vmovdqa	%xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
+
+1:
+#endif
+	movq 16(%rbx), %r10	# Anything in framesize?
+	testq %r10, %r10
+	jns 3f
+
+	/* There's nothing in the frame size, so there
+	   will be no call to the _dl_call_pltexit. */
+
+	/* Get back registers content.  */
+	movq LR_RCX_OFFSET(%rsp), %rcx
+	movq LR_RSI_OFFSET(%rsp), %rsi
+	movq LR_RDI_OFFSET(%rsp), %rdi
+
+	movq %rbx, %rsp
+	movq (%rsp), %rbx
+	cfi_restore(rbx)
+	cfi_def_cfa_register(%rsp)
+
+	addq $48, %rsp		# Adjust the stack to the return value
+				# (eats the reloc index and link_map)
+	cfi_adjust_cfa_offset(-48)
+	jmp *%r11		# Jump to function address.
+
+3:
+	cfi_adjust_cfa_offset(48)
+	cfi_rel_offset(%rbx, 0)
+	cfi_def_cfa_register(%rbx)
+
+	/* At this point we need to prepare new stack for the function
+	   which has to be called.  We copy the original stack to a
+	   temporary buffer of the size specified by the 'framesize'
+	   returned from _dl_profile_fixup */
+
+	leaq LR_RSP_OFFSET(%rbx), %rsi	# stack
+	addq $8, %r10
+	andq $0xfffffffffffffff0, %r10
+	movq %r10, %rcx
+	subq %r10, %rsp
+	movq %rsp, %rdi
+	shrq $3, %rcx
+	rep
+	movsq
+
+	movq 24(%rdi), %rcx	# Get back register content.
+	movq 32(%rdi), %rsi
+	movq 40(%rdi), %rdi
+
+	call *%r11
+
+	mov 24(%rbx), %rsp	# Drop the copied stack content
+
+	/* Now we have to prepare the La_x86_64_retval structure for the
+	   _dl_call_pltexit.  The La_x86_64_regs is being pointed by rsp now,
+	   so we just need to allocate the sizeof(La_x86_64_retval) space on
+	   the stack, since the alignment has already been taken care of. */
+# ifdef RESTORE_AVX
+	/* sizeof(La_x86_64_retval).  Need extra space for 2 SSE
+	   registers to detect if xmm0/xmm1 registers are changed
+	   by audit module.  */
+	subq $(LRV_SIZE + XMM_SIZE*2), %rsp
+# else
+	subq $LRV_SIZE, %rsp	# sizeof(La_x86_64_retval)
+# endif
+	movq %rsp, %rcx		# La_x86_64_retval argument to %rcx.
+
+	/* Fill in the La_x86_64_retval structure.  */
+	movq %rax, LRV_RAX_OFFSET(%rcx)
+	movq %rdx, LRV_RDX_OFFSET(%rcx)
+
+	movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
+	movaps %xmm1, LRV_XMM1_OFFSET(%rcx)
+
+# ifdef RESTORE_AVX
+	/* This is to support AVX audit modules.  */
+	vmovdqu %ymm0, LRV_VECTOR0_OFFSET(%rcx)
+	vmovdqu %ymm1, LRV_VECTOR1_OFFSET(%rcx)
+
+	/* Save xmm0/xmm1 registers to detect if they are changed
+	   by audit module.  */
+	vmovdqa %xmm0,		  (LRV_SIZE)(%rcx)
+	vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
+# endif
+
+	fstpt LRV_ST0_OFFSET(%rcx)
+	fstpt LRV_ST1_OFFSET(%rcx)
+
+	movq 24(%rbx), %rdx	# La_x86_64_regs argument to %rdx.
+	movq 40(%rbx), %rsi	# Copy args pushed by PLT in register.
+        movq 32(%rbx), %rdi	# %rdi: link_map, %rsi: reloc_index
+	call _dl_call_pltexit
+
+	/* Restore return registers.  */
+	movq LRV_RAX_OFFSET(%rsp), %rax
+	movq LRV_RDX_OFFSET(%rsp), %rdx
+
+	movaps LRV_XMM0_OFFSET(%rsp), %xmm0
+	movaps LRV_XMM1_OFFSET(%rsp), %xmm1
+
+# ifdef RESTORE_AVX
+	/* Check if xmm0/xmm1 registers are changed by audit module.  */
+	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
+	vpmovmskb %xmm2, %esi
+	cmpl $0xffff, %esi
+	jne 1f
+	vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0
+
+1:	vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
+	vpmovmskb %xmm2, %esi
+	cmpl $0xffff, %esi
+	jne 1f
+	vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1
+
+1:
+# endif
+
+	fldt LRV_ST1_OFFSET(%rsp)
+	fldt LRV_ST0_OFFSET(%rsp)
+
+	movq %rbx, %rsp
+	movq (%rsp), %rbx
+	cfi_restore(rbx)
+	cfi_def_cfa_register(%rsp)
+
+	addq $48, %rsp		# Adjust the stack to the return value
+				# (eats the reloc index and link_map)
+	cfi_adjust_cfa_offset(-48)
+	retq
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index b066402204..0ded3b3261 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -4,7 +4,7 @@ gen-as-const-headers += ifunc-defines.sym
 endif
 
 ifeq ($(subdir),string)
-sysdep_routines += stpncpy-c strncpy-c
+sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-strcspn-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S
index d4f265f430..08fd8769fc 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr.S
+++ b/sysdeps/x86_64/multiarch/rawmemchr.S
@@ -38,6 +38,7 @@ END(rawmemchr)
 strong_alias (rawmemchr, __rawmemchr)
 
 
+	.section .text.sse4.2,"ax",@progbits
 	.align 	16
 	.type	__rawmemchr_sse42, @function
 __rawmemchr_sse42:
diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S
new file mode 100644
index 0000000000..98cecb8942
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_SSSE3 1
+#define STRCMP __strcmp_ssse3
+#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index 1a315737af..05adf1e2e6 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -34,6 +34,7 @@
 	mov	%r9, %r11
 
 #define STRCMP_SSE42	__strncmp_sse42
+#define STRCMP_SSSE3	__strncmp_ssse3
 #define STRCMP_SSE2	__strncmp_sse2
 #define __GI_STRCMP	__GI_strncmp
 #else
@@ -41,6 +42,7 @@
 #ifndef STRCMP
 #define STRCMP		strcmp
 #define STRCMP_SSE42	__strcmp_sse42
+#define STRCMP_SSSE3	__strcmp_ssse3
 #define STRCMP_SSE2	__strcmp_sse2
 #define __GI_STRCMP	__GI_strcmp
 #endif
@@ -60,10 +62,14 @@ ENTRY(STRCMP)
 	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
 	jne	1f
 	call	__init_cpu_features
-1:	leaq	STRCMP_SSE2(%rip), %rax
-	testl	$(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
-	jz	2f
+1:
 	leaq	STRCMP_SSE42(%rip), %rax
+	testl	$(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+	jnz	2f
+	leaq	STRCMP_SSSE3(%rip), %rax
+	testl	$(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+	jnz	2f
+	leaq	STRCMP_SSE2(%rip), %rax
 2:	ret
 END(STRCMP)
 
diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
index 4512267d3f..daeebe1bf5 100644
--- a/sysdeps/x86_64/multiarch/strcspn-c.c
+++ b/sysdeps/x86_64/multiarch/strcspn-c.c
@@ -86,11 +86,13 @@ STRCSPN_SSE42 (const char *s, const char *a)
 
   const char *aligned;
   __m128i mask;
+  /* Fake initialization.  gcc otherwise will warn.  */
+  asm ("" : "=xm" (mask));
   int offset = (int) ((size_t) a & 15);
   if (offset != 0)
     {
       /* Load masks.  */
-      aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L);
+      aligned = (const char *) ((size_t) a & -16L);
       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
 
       switch (offset)
@@ -229,7 +231,7 @@ STRCSPN_SSE42 (const char *s, const char *a)
   if (offset != 0)
     {
       /* Check partial string.  */
-      aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L);
+      aligned = (const char *) ((size_t) s & -16L);
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
 
       switch (offset)
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
index 82b03ccc28..4342c6cdab 100644
--- a/sysdeps/x86_64/multiarch/strlen.S
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -40,6 +40,7 @@ ENTRY(strlen)
 END(strlen)
 
 
+	.section .text.sse4.2,"ax",@progbits
 	.align 	16
 	.type	__strlen_sse42, @function
 __strlen_sse42:
diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S
new file mode 100644
index 0000000000..a320a3e949
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncmp-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_SSSE3 1
+#define STRCMP __strncmp_ssse3
+#define USE_AS_STRNCMP
+#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
index 5b99f0d383..be9e8ac0a8 100644
--- a/sysdeps/x86_64/multiarch/strspn-c.c
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
@@ -68,7 +68,7 @@ __strspn_sse42 (const char *s, const char *a)
   if (offset != 0)
     {
       /* Load masks.  */
-      aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L);
+      aligned = (const char *) ((size_t) a & -16L);
       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
 
       switch (offset)
@@ -207,7 +207,7 @@ __strspn_sse42 (const char *s, const char *a)
   if (offset != 0)
     {
       /* Check partial string.  */
-      aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L);
+      aligned = (const char *) ((size_t) s & -16L);
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
 
       switch (offset)
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 340a64ba35..650ec173b6 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -51,7 +51,12 @@
 # endif
 #endif
 
+#ifndef USE_SSSE3
 	.text
+#else
+        .section .text.ssse3,"ax",@progbits
+#endif
+
 ENTRY (BP_SYM (STRCMP))
 #ifdef NOT_IN_libc
 /* Simple version since we can't use SSE registers in ld.so.  */
@@ -244,9 +249,13 @@ LABEL(gobble_ashr_1):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4		 /* store for next cycle */
 
+#ifndef USE_SSSE3
 	psrldq	$1, %xmm3
 	pslldq	$15, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -269,9 +278,13 @@ LABEL(gobble_ashr_1):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4		/* store for next cycle */
 
+#ifndef USE_SSSE3
 	psrldq	$1, %xmm3
-	pslldq 	$15, %xmm2
+	pslldq	$15, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -363,9 +376,13 @@ LABEL(gobble_ashr_2):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$2, %xmm3
 	pslldq	$14, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -389,9 +406,13 @@ LABEL(gobble_ashr_2):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$2, %xmm3
-	pslldq 	$14, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$14, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -477,9 +498,13 @@ LABEL(gobble_ashr_3):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$3, %xmm3
 	pslldq	$13, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -503,9 +528,13 @@ LABEL(gobble_ashr_3):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$3, %xmm3
-	pslldq 	$13, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$13, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -591,9 +620,13 @@ LABEL(gobble_ashr_4):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$4, %xmm3
 	pslldq	$12, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -617,9 +650,13 @@ LABEL(gobble_ashr_4):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$4, %xmm3
-	pslldq 	$12, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$12, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -705,9 +742,13 @@ LABEL(gobble_ashr_5):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$5, %xmm3
 	pslldq	$11, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -731,9 +772,13 @@ LABEL(gobble_ashr_5):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$5, %xmm3
-	pslldq 	$11, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$11, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -819,9 +864,13 @@ LABEL(gobble_ashr_6):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$6, %xmm3
 	pslldq	$10, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -845,9 +894,13 @@ LABEL(gobble_ashr_6):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$6, %xmm3
-	pslldq 	$10, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$10, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -933,9 +986,13 @@ LABEL(gobble_ashr_7):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$7, %xmm3
 	pslldq	$9, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -959,9 +1016,13 @@ LABEL(gobble_ashr_7):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$7, %xmm3
-	pslldq 	$9, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$9, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1047,9 +1108,13 @@ LABEL(gobble_ashr_8):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$8, %xmm3
 	pslldq	$8, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1073,9 +1138,13 @@ LABEL(gobble_ashr_8):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$8, %xmm3
-	pslldq 	$8, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$8, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1161,9 +1230,13 @@ LABEL(gobble_ashr_9):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$9, %xmm3
 	pslldq	$7, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1187,9 +1260,13 @@ LABEL(gobble_ashr_9):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$9, %xmm3
-	pslldq 	$7, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$7, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1275,9 +1352,13 @@ LABEL(gobble_ashr_10):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$10, %xmm3
 	pslldq	$6, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1301,9 +1382,13 @@ LABEL(gobble_ashr_10):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$10, %xmm3
-	pslldq 	$6, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$6, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1389,9 +1474,13 @@ LABEL(gobble_ashr_11):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$11, %xmm3
 	pslldq	$5, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1415,9 +1504,13 @@ LABEL(gobble_ashr_11):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$11, %xmm3
-	pslldq 	$5, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$5, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1503,9 +1596,13 @@ LABEL(gobble_ashr_12):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$12, %xmm3
 	pslldq	$4, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1529,9 +1626,13 @@ LABEL(gobble_ashr_12):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$12, %xmm3
-	pslldq 	$4, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$4, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1617,9 +1718,13 @@ LABEL(gobble_ashr_13):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$13, %xmm3
 	pslldq	$3, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1643,9 +1748,13 @@ LABEL(gobble_ashr_13):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$13, %xmm3
-	pslldq 	$3, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1731,9 +1840,13 @@ LABEL(gobble_ashr_14):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$14, %xmm3
 	pslldq	$2, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1757,9 +1870,13 @@ LABEL(gobble_ashr_14):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$14, %xmm3
-	pslldq 	$2, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$2, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1847,9 +1964,13 @@ LABEL(gobble_ashr_15):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$15, %xmm3
 	pslldq	$1, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1873,9 +1994,13 @@ LABEL(gobble_ashr_15):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$15, %xmm3
-	pslldq 	$1, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$1, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1