about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog159
-rw-r--r--Makeconfig8
-rw-r--r--Versions.def1
-rw-r--r--conform/conformtest.pl5
-rw-r--r--conform/data/complex.h-data2
-rw-r--r--conform/data/ctype.h-data18
-rw-r--r--conform/data/dirent.h-data2
-rw-r--r--conform/data/inttypes.h-data4
-rw-r--r--conform/data/sys/select.h-data12
-rw-r--r--conform/data/sys/uio.h-data2
-rw-r--r--conform/data/time.h-data1
-rw-r--r--grp/grp.h6
-rw-r--r--malloc/malloc.c41
-rw-r--r--manual/install.texi11
-rw-r--r--nptl/ChangeLog14
-rw-r--r--nptl/Versions5
-rw-r--r--nptl/pthread_mutex_consistent.c5
-rw-r--r--nptl/pthread_mutexattr_getrobust.c5
-rw-r--r--nptl/pthread_mutexattr_setrobust.c5
-rw-r--r--nptl/sysdeps/pthread/pthread.h10
-rw-r--r--posix/regcomp.c112
-rw-r--r--posix/sched.h10
-rw-r--r--posix/unistd.h24
-rw-r--r--resolv/netdb.h16
-rw-r--r--resolv/res_send.c44
-rw-r--r--stdlib/stdlib.h14
-rw-r--r--string/strings.h4
-rw-r--r--sysdeps/generic/netinet/ip.h43
-rw-r--r--sysdeps/i386/i686/bcopy.S3
-rw-r--r--sysdeps/i386/i686/cacheinfo.c1
-rw-r--r--sysdeps/i386/i686/memcpy.S4
-rw-r--r--sysdeps/i386/i686/memmove.S27
-rw-r--r--sysdeps/i386/i686/mempcpy.S4
-rw-r--r--sysdeps/i386/i686/memset.S4
-rw-r--r--sysdeps/i386/i686/multiarch/Makefile4
-rw-r--r--sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S4
-rw-r--r--sysdeps/i386/i686/multiarch/bcopy-ssse3.S4
-rw-r--r--sysdeps/i386/i686/multiarch/bcopy.S89
-rw-r--r--sysdeps/i386/i686/multiarch/bzero-sse2-rep.S3
-rw-r--r--sysdeps/i386/i686/multiarch/bzero-sse2.S3
-rw-r--r--sysdeps/i386/i686/multiarch/bzero.S97
-rw-r--r--sysdeps/i386/i686/multiarch/ifunc-defines.sym3
-rw-r--r--sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S1785
-rw-r--r--sysdeps/i386/i686/multiarch/memcpy-ssse3.S1737
-rw-r--r--sysdeps/i386/i686/multiarch/memcpy.S90
-rw-r--r--sysdeps/i386/i686/multiarch/memcpy_chk.S64
-rw-r--r--sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S4
-rw-r--r--sysdeps/i386/i686/multiarch/memmove-ssse3.S4
-rw-r--r--sysdeps/i386/i686/multiarch/memmove.S114
-rw-r--r--sysdeps/i386/i686/multiarch/memmove_chk.S112
-rw-r--r--sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S4
-rw-r--r--sysdeps/i386/i686/multiarch/mempcpy-ssse3.S4
-rw-r--r--sysdeps/i386/i686/multiarch/mempcpy.S93
-rw-r--r--sysdeps/i386/i686/multiarch/mempcpy_chk.S64
-rw-r--r--sysdeps/i386/i686/multiarch/memset-sse2-rep.S821
-rw-r--r--sysdeps/i386/i686/multiarch/memset-sse2.S867
-rw-r--r--sysdeps/i386/i686/multiarch/memset.S112
-rw-r--r--sysdeps/i386/i686/multiarch/memset_chk.S116
-rw-r--r--sysdeps/i386/sysdep.h3
-rw-r--r--sysdeps/x86_64/cacheinfo.c10
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-defines.sym3
-rw-r--r--sysdeps/x86_64/multiarch/init-arch.c18
-rw-r--r--sysdeps/x86_64/multiarch/init-arch.h14
-rw-r--r--sysdeps/x86_64/multiarch/strlen.S60
-rw-r--r--sysdeps/x86_64/strcmp.S415
65 files changed, 6917 insertions, 430 deletions
diff --git a/ChangeLog b/ChangeLog
index a7e4973d4c..be9d68d499 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,158 @@
+2010-01-14  Ulrich Drepper  <drepper@redhat.com>
+
+	[BZ #11027]
+	* sysdeps/generic/netinet/ip.h: Define IPTOS_CLASS_* macros according
+	to RFC 2474.
+	Patch by Philip Prindeville <philipp@redfish-solutions.com>.
+
+	[BZ #11141]
+	* resolv/res_send.c (reopen): Don't use IPv6 sockets for IPv4
+	addresses.
+
+	[BZ #11127]
+	* posix/regcomp.c (alc_eclosure_iter): Do not ignore
+	re_node_set_insert failure; return REG_ESPACE.
+	Patch by Paul Eggert.
+
+	* bits/confname.h: Make pre-C99-safe.
+
+2010-01-14  Ryan S. Arnold  <rsa@us.ibm.com>
+
+	* Makeconfig (+prector, +postctor, +prectorS, +postctorS): Add
+	$(sysdep-LDFLAGS) to pre and post ctors so that the proper .o
+	files are found by GCC.
+
+2010-01-14  Ulrich Drepper  <drepper@redhat.com>
+
+	* posix/unistd.h: Relax conditions for ualarm, usleep, getwd, vfork,
+	brk, and sbrk.
+
+	* string/strings.h: Relax condition for bcmp, bcopy, bzero, ffs,
+	index, and rindex.
+
+	* stdlib/stdlib.h: Relax conditions for *cvt functions and valloc.
+
+	* Versions.def: Add GLIBC_2.12 for libpthread.
+
+	[BZ #11126]
+	* malloc/malloc.c (malloc_info): Initialize malloc if not already
+	done.  Handle empty bin lists.
+
+	* posix/unistd.h: Change getpagesize and getdtablesize declaration
+	visibility some more.
+
+	* resolv/netdb.h: Relax condition to elide obsolete resolver constants.
+
+	* posix/unistd.h: Declare getpagesize in _GNU_SOURCE mode.
+
+	[BZ #11120]
+	* sysdeps/x86_64/strcmp.S: Prevent silent errors should strncmp be
+	needed outside libc.
+
+2010-01-13  Ulrich Drepper  <drepper@redhat.com>
+
+	* stdlib/stdlib.h: Parameter of unsetenv must not be NULL.
+
+2010-01-12  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/x86_64/multiarch/strlen.S: Unroll the loop.
+
+2010-01-13  Ulrich Drepper  <drepper@redhat.com>
+
+	* stdlib/stdlib.h: Be a bit more relaxed about obsoleted mktemp symbol.
+
+2010-01-12  Ulrich Drepper  <drepper@redhat.com>
+
+	* conform/data/complex.h-data: Test for POSIX2008.
+
+	* conform/data/ctype.h-data: Fix POSIX testing.
+
+	* conform/data/dirent.h-data: ino_t only needed for XSI.
+
+	* grp/grp.h: Declare getgrent and endgrent for XPG7.
+
+	* conform/data/inttypes.h-data: Use same types as the headers.
+
+	* resolv/netdb.h: Don't define old resolver macros for XPG7.
+
+	* conform/data/sys/uio.h-data: Don't exclude POSIX2008.
+
+	* conform/conformtest.pl: Implement tag for allow-header.
+	Fix newline handling in allow-header implementation.
+
+	* conform/data/time.h-data: Use allow-header instead of duplicating
+	information.
+
+	* conform/data/time.h-data: Allow sigevent tag.
+
+	* posix/sched.h: Define time_t and pid_t for XPG7.
+
+2010-01-12  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/i386/i686/bcopy.S: New file.
+
+	* sysdeps/i386/i686/cacheinfo.c (__x86_64_data_cache_size): Define.
+
+	* sysdeps/i386/i686/memcpy.S (__memcpy_chk): Use ENTRY_CHK
+	and END_CHK.
+	* sysdeps/i386/i686/memmove.S (__memmove_chk): Likewise.
+	* sysdeps/i386/i686/mempcpy.S (__mempcpy_chk): Likewise.
+	* sysdeps/i386/i686/memset.S (__memset_chk): Likewise.
+
+	* sysdeps/i386/i686/memmove.S: Support USE_AS_BCOPY.
+
+	* sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
+	bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 memmove-ssse3
+	memcpy-ssse3-rep mempcpy-ssse3-rep memmove-ssse3-rep
+	bcopy-ssse3 bcopy-ssse3-rep memset-sse2-rep bzero-sse2-rep
+	* sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S: New file.
+	* sysdeps/i386/i686/multiarch/bcopy-ssse3.S: New file.
+	* sysdeps/i386/i686/multiarch/bcopy.S: New file.
+	* sysdeps/i386/i686/multiarch/bzero-sse2-rep.S: New file.
+	* sysdeps/i386/i686/multiarch/bzero-sse2.S: New file.
+	* sysdeps/i386/i686/multiarch/bzero.S: New file.
+	* sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S: New file.
+	* sysdeps/i386/i686/multiarch/memcpy-ssse3.S: New file.
+	* sysdeps/i386/i686/multiarch/memcpy.S: New file.
+	* sysdeps/i386/i686/multiarch/memcpy_chk.S: New file.
+	* sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S: New file.
+	* sysdeps/i386/i686/multiarch/memmove-ssse3.S: New file.
+	* sysdeps/i386/i686/multiarch/memmove.S: New file.
+	* sysdeps/i386/i686/multiarch/memmove_chk.S: New file.
+	* sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S: New file.
+	* sysdeps/i386/i686/multiarch/mempcpy-ssse3.S: New file.
+	* sysdeps/i386/i686/multiarch/mempcpy.S: New file.
+	* sysdeps/i386/i686/multiarch/mempcpy_chk.S: New file.
+	* sysdeps/i386/i686/multiarch/memset-sse2-rep.S: New file.
+	* sysdeps/i386/i686/multiarch/memset-sse2.S: New file.
+	* sysdeps/i386/i686/multiarch/memset.S: New file.
+	* sysdeps/i386/i686/multiarch/memset_chk.S: New file.
+
+	* sysdeps/i386/sysdep.h (ENTRY_CHK): New.
+	(END_CHK): Likewise.
+
+	* sysdeps/i386/i686/multiarch/ifunc-defines.sym: Add
+	FEATURE_OFFSET, FEATURE_SIZE and FEATURE_INDEX_1.
+	* sysdeps/x86_64/multiarch/ifunc-defines.sym: Likewise.
+
+	* sysdeps/x86_64/cacheinfo.c (intel_02_cache_info): Add entries
+	for 0x0e and 0x80.
+	(__x86_64_data_cache_size): New.
+	(init_cacheinfo): Set __x86_64_data_cache_size.
+
+	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): Turn
+	on bit_Fast_Rep_String for Intel Core i7.
+
+	* sysdeps/x86_64/multiarch/init-arch.h (bit_Fast_Rep_String): New.
+	(index_Fast_Rep_String): Likewise.
+	(FEATURE_INDEX_1): Likewise.
+	(FEATURE_INDEX_MAX): Likewise.
+	(cpu_features): Add feature.
+
+2010-01-12  Ulrich Drepper  <drepper@redhat.com>
+
+	* conform/data/sys/select.h-data: Fix up for XPG7.
+
 2010-01-12  Jakub Jelinek  <jakub@redhat.com>
 
 	* posix/sys/wait.h: Don't define W* macros etc. if
@@ -47,10 +202,6 @@
 
 	* conform/data/unistd.h-data: lockf-constants Not needed for POSIX.
 
-2010-01-11  Andreas Schwab  <schwab@redhat.com>
-
-	* bits/confname.h: Remove trailing comma.
-
 2010-01-10  Ulrich Drepper  <drepper@redhat.com>
 
 	* sysdeps/unix/sysv/linux/bits/sigaction.h: SA_RESTART, SA_NODEFER,
diff --git a/Makeconfig b/Makeconfig
index 2ebe184224..b6386283bd 100644
--- a/Makeconfig
+++ b/Makeconfig
@@ -549,11 +549,11 @@ endif
 ifeq ($(elf),yes)
 +preinit = $(addprefix $(csu-objpfx),crti.o)
 +postinit = $(addprefix $(csu-objpfx),crtn.o)
-+prector = `$(CC) --print-file-name=crtbegin.o`
-+postctor = `$(CC) --print-file-name=crtend.o`
++prector = `$(CC) $(sysdep-LDFLAGS) --print-file-name=crtbegin.o`
++postctor = `$(CC) $(sysdep-LDFLAGS) --print-file-name=crtend.o`
 # Variants of the two previous definitions for linking PIE programs.
-+prectorS = `$(CC) --print-file-name=crtbeginS.o`
-+postctorS = `$(CC) --print-file-name=crtendS.o`
++prectorS = `$(CC) $(sysdep-LDFLAGS) --print-file-name=crtbeginS.o`
++postctorS = `$(CC) $(sysdep-LDFLAGS) --print-file-name=crtendS.o`
 +interp = $(addprefix $(elf-objpfx),interp.os)
 endif
 csu-objpfx = $(common-objpfx)csu/
diff --git a/Versions.def b/Versions.def
index 70de1c14fc..eab006b83f 100644
--- a/Versions.def
+++ b/Versions.def
@@ -91,6 +91,7 @@ libpthread {
   GLIBC_2.4
   GLIBC_2.6
   GLIBC_2.11
+  GLIBC_2.12
   GLIBC_PRIVATE
 }
 libresolv {
diff --git a/conform/conformtest.pl b/conform/conformtest.pl
index c528127cee..d324d81c53 100644
--- a/conform/conformtest.pl
+++ b/conform/conformtest.pl
@@ -1048,6 +1048,7 @@ while ($#headers >= 0) {
 
     open (ALLOW, "$CC -E -D$dialect - < data/$ah-data |");
     acontrol: while (<ALLOW>) {
+      chop;
       next acontrol if (/^#/);
       next acontrol if (/^[	]*$/);
 
@@ -1057,8 +1058,8 @@ while ($#headers >= 0) {
 	push @allow, $1;
       } elsif (/^typed-constant *([a-zA-Z0-9_]*) *({([^}]*)}|([^ ]*)) *([A-Za-z0-9_]*)?/) {
 	push @allow, 1;
-      } elsif (/^type *({([^}]*)|([a-zA-Z0-9_]*))/) {
-	my($type) = "$2$3";
+      } elsif (/^(type|tag) *({([^}]*)|([a-zA-Z0-9_]*))/) {
+	my($type) = "$3$4";
 
 	# Remember that this name is allowed.
 	if ($type =~ /^struct *(.*)/) {
diff --git a/conform/data/complex.h-data b/conform/data/complex.h-data
index 5c5eedd181..006d6e4de7 100644
--- a/conform/data/complex.h-data
+++ b/conform/data/complex.h-data
@@ -1,4 +1,4 @@
-#ifdef XOPEN2K
+#if defined XOPEN2K || defined POSIX2008
 macro complex
 macro _Complex_I
 optional-macro imaginary
diff --git a/conform/data/ctype.h-data b/conform/data/ctype.h-data
index d082230f94..091d377bec 100644
--- a/conform/data/ctype.h-data
+++ b/conform/data/ctype.h-data
@@ -19,15 +19,9 @@ function int toascii (int)
 // XPG says the following two are macros.  But we test a bit more strictly.
 function int _toupper (int)
 function int _tolower (int)
+#endif
 
-// The following is not entirely correct.  It should be is[a-z]* but the
-// replacement fnmatch implementation does not grok it.
-allow is*
-allow to*
-
-allow *_t
-
-# if defined XOPEN2K8 || defined XOPEN2K8 || defined POSIX2008
+# if defined XOPEN2K8 || defined POSIX2008
 type locale_t
 
 function int isalnum_l (int, locale_t)
@@ -44,4 +38,10 @@ function int isxdigit_l (int, locale_t)
 function int tolower_l (int, locale_t)
 function int toupper_l (int, locale_t)
 # endif
-#endif
+
+// The following is not entirely correct.  It should be is[a-z]* but the
+// replacement fnmatch implementation does not grok it.
+allow is*
+allow to*
+
+allow *_t
diff --git a/conform/data/dirent.h-data b/conform/data/dirent.h-data
index 761d5786b5..f1cd59cdaf 100644
--- a/conform/data/dirent.h-data
+++ b/conform/data/dirent.h-data
@@ -8,7 +8,9 @@ element {struct dirent} ino_t d_ino
 # endif
 element {struct dirent} char d_name []
 
+# if !defined POSIX && !defined POSIX2008
 type ino_t
+#endif
 
 function int closedir (DIR*)
 function {DIR*} opendir (const char*)
diff --git a/conform/data/inttypes.h-data b/conform/data/inttypes.h-data
index 93e4713c95..17d2b9717a 100644
--- a/conform/data/inttypes.h-data
+++ b/conform/data/inttypes.h-data
@@ -17,8 +17,8 @@ function intmax_t imaxabs (intmax_t)
 function imaxdiv_t imaxdiv (intmax_t, intmax_t)
 function intmax_t strtoimax (const char*, char**, int)
 function uintmax_t strtoumax (const char*, char**, int)
-function intmax_t wcstoimax (const wchar_t*, wchar_t**, int)
-function uintmax_t wcstoumax (const wchar_t*, wchar_t**, int)
+function intmax_t wcstoimax (const __gwchar_t*, __gwchar_t**, int)
+function uintmax_t wcstoumax (const __gwchar_t*, __gwchar_t**, int)
 
 allow PRI*
 allow SCN*
diff --git a/conform/data/sys/select.h-data b/conform/data/sys/select.h-data
index 51f7844cb6..ea890d6ae9 100644
--- a/conform/data/sys/select.h-data
+++ b/conform/data/sys/select.h-data
@@ -13,16 +13,18 @@ element {struct timespec} time_t tv_sec
 element {struct timespec} long tv_nsec
 
 type fd_set
+#if defined XPG3 || defined XPG4 || defined UNIX98
 element fd_set long fds_bits []
+#endif
 
-function-macro void FD_CLR (int, fd_set*)
-function-macro int FD_ISSET (int, fd_set*)
-function-macro void FD_SET (int, fd_set*)
-function-macro void FD_ZERO (fd_set*)
+macro FD_CLR
+macro FD_ISSET
+macro FD_SET
+macro FD_ZERO
 
 macro FD_SETSIZE
 
-#ifdef XOPEN2K
+#if defined XOPEN2K || defined POSIX2008
 function int pselect (int, fd_set*, fd_set*, fd_set*, const struct timespec*, const sigset_t*)
 #endif
 function int select (int, fd_set*, fd_set*, fd_set*, struct timeval*)
diff --git a/conform/data/sys/uio.h-data b/conform/data/sys/uio.h-data
index 7833e72268..03d0a3bf3a 100644
--- a/conform/data/sys/uio.h-data
+++ b/conform/data/sys/uio.h-data
@@ -1,4 +1,4 @@
-#if !defined ISO && !defined POSIX && !defined POSIX2008
+#if !defined ISO && !defined POSIX
 type {struct iovec}
 
 element {struct iovec} {void*} iov_base
diff --git a/conform/data/time.h-data b/conform/data/time.h-data
index c07fe1dc3b..e2ff298ea7 100644
--- a/conform/data/time.h-data
+++ b/conform/data/time.h-data
@@ -97,3 +97,4 @@ allow tv_*
 allow CLOCK_*
 allow TIMER_*
 allow *_t
+allow sigevent
diff --git a/grp/grp.h b/grp/grp.h
index d7cc69d0eb..c06ef03ae6 100644
--- a/grp/grp.h
+++ b/grp/grp.h
@@ -1,4 +1,5 @@
-/* Copyright (C) 1991,1992,1995-2001,2003,2004 Free Software Foundation, Inc.
+/* Copyright (C) 1991,1992,1995-2001,2003,2004,2010
+   Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -61,7 +62,10 @@ struct group
    This function is a possible cancellation point and therefore not
    marked with __THROW.  */
 extern void setgrent (void);
+#endif
 
+#if defined __USE_SVID || defined __USE_BSD || defined __USE_XOPEN_EXTENDED \
+    || defined __USE_XOPEN2K8
 /* Close the group-file stream.
 
    This function is a possible cancellation point and therefore not
diff --git a/malloc/malloc.c b/malloc/malloc.c
index ea10d17f85..b43e454f6e 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -6369,16 +6369,19 @@ malloc_info (int options, FILE *fp)
 
     mbinptr bin = bin_at (ar_ptr, 1);
     struct malloc_chunk *r = bin->fd;
-    while (r != bin)
+    if (r != NULL)
       {
-	++sizes[NFASTBINS].count;
-	sizes[NFASTBINS].total += r->size;
-	sizes[NFASTBINS].from = MIN (sizes[NFASTBINS].from, r->size);
-	sizes[NFASTBINS].to = MAX (sizes[NFASTBINS].to, r->size);
-	r = r->fd;
+	while (r != bin)
+	  {
+	    ++sizes[NFASTBINS].count;
+	    sizes[NFASTBINS].total += r->size;
+	    sizes[NFASTBINS].from = MIN (sizes[NFASTBINS].from, r->size);
+	    sizes[NFASTBINS].to = MAX (sizes[NFASTBINS].to, r->size);
+	    r = r->fd;
+	  }
+	nblocks += sizes[NFASTBINS].count;
+	avail += sizes[NFASTBINS].total;
       }
-    nblocks += sizes[NFASTBINS].count;
-    avail += sizes[NFASTBINS].total;
 
     for (size_t i = 2; i < NBINS; ++i)
       {
@@ -6388,17 +6391,18 @@ malloc_info (int options, FILE *fp)
 	sizes[NFASTBINS - 1 + i].to = sizes[NFASTBINS - 1 + i].total
 	  = sizes[NFASTBINS - 1 + i].count = 0;
 
-	while (r != bin)
-	  {
-	    ++sizes[NFASTBINS - 1 + i].count;
-	    sizes[NFASTBINS - 1 + i].total += r->size;
-	    sizes[NFASTBINS - 1 + i].from = MIN (sizes[NFASTBINS - 1 + i].from,
+	if (r != NULL)
+	  while (r != bin)
+	    {
+	      ++sizes[NFASTBINS - 1 + i].count;
+	      sizes[NFASTBINS - 1 + i].total += r->size;
+	      sizes[NFASTBINS - 1 + i].from
+		= MIN (sizes[NFASTBINS - 1 + i].from, r->size);
+	      sizes[NFASTBINS - 1 + i].to = MAX (sizes[NFASTBINS - 1 + i].to,
 						 r->size);
-	    sizes[NFASTBINS - 1 + i].to = MAX (sizes[NFASTBINS - 1 + i].to,
-					       r->size);
 
-	    r = r->fd;
-	  }
+	      r = r->fd;
+	    }
 
 	if (sizes[NFASTBINS - 1 + i].count == 0)
 	  sizes[NFASTBINS - 1 + i].from = 0;
@@ -6460,6 +6464,9 @@ malloc_info (int options, FILE *fp)
     fputs ("</heap>\n", fp);
   }
 
+  if(__malloc_initialized < 0)
+    ptmalloc_init ();
+
   fputs ("<malloc version=\"1\">\n", fp);
 
   /* Iterate over all arenas currently in use.  */
diff --git a/manual/install.texi b/manual/install.texi
index 0cd4d622de..24b1301ef6 100644
--- a/manual/install.texi
+++ b/manual/install.texi
@@ -330,16 +330,15 @@ bugs or lack features.
 @item
 GCC 3.4 or newer, GCC 4.1 recommended
 
-The GNU C library can only be compiled with the GNU C compiler family.
-For the 2.3 releases, GCC 3.2 or higher is required; GCC 3.4 is the
-compiler we advise to use for 2.3 versions.
-For the 2.4 release, GCC 3.4 or higher is required; as of this
-writing, GCC 4.1 is the compiler we advise to use for current versions.
+For the 2.4 release or later, GCC 3.4 or higher is required; as of this
+writing, GCC 4.4 is the compiler we advise to use for current versions.
 On certain machines including @code{powerpc64}, compilers prior to GCC
 4.0 have bugs that prevent them compiling the C library code in the
 2.4 release.  On other machines, GCC 4.1 is required to build the C
 library with support for the correct @code{long double} type format;
-these include @code{powerpc} (32 bit), @code{s390} and @code{s390x}.
+these include @code{powerpc} (32 bit), @code{s390} and @code{s390x}.  For
+other architectures special compiler-provided headers are needed
+(like @file{cpuid.h} on x86) which only come with later compiler versions.
 
 You can use whatever compiler you like to compile programs that use GNU
 libc, but be aware that both GCC 2.7 and 2.8 have bugs in their
diff --git a/nptl/ChangeLog b/nptl/ChangeLog
index c8839e81f3..1da9bc3b4e 100644
--- a/nptl/ChangeLog
+++ b/nptl/ChangeLog
@@ -1,3 +1,17 @@
+2010-01-14  Ulrich Drepper  <drepper@redhat.com>
+
+	* Versions: Add pthread_mutex_consistent, pthread_mutexattr_getrobust,
+	and pthread_mutexattr_setrobust for GLIBC_2.12.
+	* pthread_mutex_consistent.c: Define alias pthread_mutex_consistent.
+	* pthread_mutexattr_getrobust.c: Define alias
+	pthread_mutexattr_getrobust.
+	* pthread_mutexattr_setrobust.c: Define alias
+	pthread_mutexattr_setrobust.
+
+2010-01-12  Ulrich Drepper  <drepper@redhat.com>
+
+	* sysdeps/pthread/pthread.h: Cleanup.  Fix up for XPG7.
+
 2010-01-08  Ulrich Drepper  <drepper@redhat.com>
 
 	* sysdeps/pthread/pthread.h: Fix pthread_mutex_consistent declaration.
diff --git a/nptl/Versions b/nptl/Versions
index 09098bf987..1f2de79fd4 100644
--- a/nptl/Versions
+++ b/nptl/Versions
@@ -244,6 +244,11 @@ libpthread {
     pthread_sigqueue;
   };
 
+  GLIBC_2.12 {
+    pthread_mutex_consistent; pthread_mutexattr_getrobust;
+    pthread_mutexattr_setrobust;
+  };
+
   GLIBC_PRIVATE {
     __pthread_initialize_minimal;
     __pthread_clock_gettime; __pthread_clock_settime;
diff --git a/nptl/pthread_mutex_consistent.c b/nptl/pthread_mutex_consistent.c
index d4f287b755..9ca3e1397e 100644
--- a/nptl/pthread_mutex_consistent.c
+++ b/nptl/pthread_mutex_consistent.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2005, 2006 Free Software Foundation, Inc.
+/* Copyright (C) 2005, 2006, 2010 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@redhat.com>, 2005.
 
@@ -22,7 +22,7 @@
 
 
 int
-pthread_mutex_consistent_np (mutex)
+pthread_mutex_consistent (mutex)
      pthread_mutex_t *mutex;
 {
   /* Test whether this is a robust mutex with a dead owner.  */
@@ -34,3 +34,4 @@ pthread_mutex_consistent_np (mutex)
 
   return 0;
 }
+weak_alias (pthread_mutex_consistent, pthread_mutex_consistent_np)
diff --git a/nptl/pthread_mutexattr_getrobust.c b/nptl/pthread_mutexattr_getrobust.c
index 5ec43d1f7c..9e61bb0f06 100644
--- a/nptl/pthread_mutexattr_getrobust.c
+++ b/nptl/pthread_mutexattr_getrobust.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2005 Free Software Foundation, Inc.
+/* Copyright (C) 2005, 2010 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@redhat.com>, 2005.
 
@@ -21,7 +21,7 @@
 
 
 int
-pthread_mutexattr_getrobust_np (attr, robustness)
+pthread_mutexattr_getrobust (attr, robustness)
      const pthread_mutexattr_t *attr;
      int *robustness;
 {
@@ -34,3 +34,4 @@ pthread_mutexattr_getrobust_np (attr, robustness)
 
   return 0;
 }
+weak_alias (pthread_mutexattr_getrobust, pthread_mutexattr_getrobust_np)
diff --git a/nptl/pthread_mutexattr_setrobust.c b/nptl/pthread_mutexattr_setrobust.c
index cf95e35b6f..df51ef53b0 100644
--- a/nptl/pthread_mutexattr_setrobust.c
+++ b/nptl/pthread_mutexattr_setrobust.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2005 Free Software Foundation, Inc.
+/* Copyright (C) 2005, 2010 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@redhat.com>, 2005.
 
@@ -22,7 +22,7 @@
 
 
 int
-pthread_mutexattr_setrobust_np (attr, robustness)
+pthread_mutexattr_setrobust (attr, robustness)
      pthread_mutexattr_t *attr;
      int robustness;
 {
@@ -41,3 +41,4 @@ pthread_mutexattr_setrobust_np (attr, robustness)
 
   return 0;
 }
+weak_alias (pthread_mutexattr_setrobust, pthread_mutexattr_setrobust_np)
diff --git a/nptl/sysdeps/pthread/pthread.h b/nptl/sysdeps/pthread/pthread.h
index 8a946ffc3a..b84fd5c7a4 100644
--- a/nptl/sysdeps/pthread/pthread.h
+++ b/nptl/sysdeps/pthread/pthread.h
@@ -25,8 +25,6 @@
 #include <sched.h>
 #include <time.h>
 
-#define __need_sigset_t
-#include <signal.h>
 #include <bits/pthreadtypes.h>
 #include <bits/setjmp.h>
 #include <bits/wordsize.h>
@@ -49,7 +47,7 @@ enum
   PTHREAD_MUTEX_RECURSIVE_NP,
   PTHREAD_MUTEX_ERRORCHECK_NP,
   PTHREAD_MUTEX_ADAPTIVE_NP
-#ifdef __USE_UNIX98
+#if defined __USE_UNIX98 || defined __USE_XOPEN2K8
   ,
   PTHREAD_MUTEX_NORMAL = PTHREAD_MUTEX_TIMED_NP,
   PTHREAD_MUTEX_RECURSIVE = PTHREAD_MUTEX_RECURSIVE_NP,
@@ -748,7 +746,6 @@ extern int pthread_mutex_unlock (pthread_mutex_t *__mutex)
      __THROW __nonnull ((1));
 
 
-#ifdef __USE_UNIX98
 /* Get the priority ceiling of MUTEX.  */
 extern int pthread_mutex_getprioceiling (__const pthread_mutex_t *
 					 __restrict __mutex,
@@ -761,7 +758,6 @@ extern int pthread_mutex_setprioceiling (pthread_mutex_t *__restrict __mutex,
 					 int __prioceiling,
 					 int *__restrict __old_ceiling)
      __THROW __nonnull ((1, 3));
-#endif
 
 
 #ifdef __USE_XOPEN2K8
@@ -797,7 +793,7 @@ extern int pthread_mutexattr_setpshared (pthread_mutexattr_t *__attr,
 					 int __pshared)
      __THROW __nonnull ((1));
 
-#ifdef __USE_UNIX98
+#if defined __USE_UNIX98 || defined __USE_XOPEN2K8
 /* Return in *KIND the mutex kind attribute in *ATTR.  */
 extern int pthread_mutexattr_gettype (__const pthread_mutexattr_t *__restrict
 				      __attr, int *__restrict __kind)
@@ -808,6 +804,7 @@ extern int pthread_mutexattr_gettype (__const pthread_mutexattr_t *__restrict
    PTHREAD_MUTEX_DEFAULT).  */
 extern int pthread_mutexattr_settype (pthread_mutexattr_t *__attr, int __kind)
      __THROW __nonnull ((1));
+#endif
 
 /* Return in *PROTOCOL the mutex protocol attribute in *ATTR.  */
 extern int pthread_mutexattr_getprotocol (__const pthread_mutexattr_t *
@@ -831,7 +828,6 @@ extern int pthread_mutexattr_getprioceiling (__const pthread_mutexattr_t *
 extern int pthread_mutexattr_setprioceiling (pthread_mutexattr_t *__attr,
 					     int __prioceiling)
      __THROW __nonnull ((1));
-#endif
 
 #ifdef __USE_XOPEN2K
 /* Get the robustness flag of the mutex attribute ATTR.  */
diff --git a/posix/regcomp.c b/posix/regcomp.c
index 6966b5da3c..542b848f14 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -1,6 +1,5 @@
 /* Extended regular expression matching and search library.
-   Copyright (C) 2002,2003,2004,2005,2006,2007,2009
-   Free Software Foundation, Inc.
+   Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
 
@@ -616,7 +615,7 @@ free_dfa_content (re_dfa_t *dfa)
 	    re_dfastate_t *state = entry->array[j];
 	    free_state (state);
 	  }
-        re_free (entry->array);
+	re_free (entry->array);
       }
   re_free (dfa->state_table);
 #ifdef RE_ENABLE_I18N
@@ -1066,8 +1065,8 @@ optimize_utf8 (re_dfa_t *dfa)
 	  }
 	break;
       case OP_PERIOD:
-        has_period = 1;
-        break;
+	has_period = 1;
+	break;
       case OP_BACK_REF:
       case OP_ALT:
       case END_OF_RE:
@@ -1080,7 +1079,7 @@ optimize_utf8 (re_dfa_t *dfa)
       case SIMPLE_BRACKET:
 	/* Just double check.  The non-ASCII range starts at 0x80.  */
 	assert (0x80 % BITSET_WORD_BITS == 0);
-        for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
+	for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
 	  if (dfa->nodes[node].opr.sbcset[i])
 	    return;
 	break;
@@ -1161,7 +1160,7 @@ analyze (regex_t *preg)
     {
       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
       if (BE (dfa->inveclosures == NULL, 0))
-        return REG_ESPACE;
+	return REG_ESPACE;
       ret = calc_inveclosure (dfa);
     }
 
@@ -1183,16 +1182,16 @@ postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
 	 if that's the only child).  */
       while (node->left || node->right)
 	if (node->left)
-          node = node->left;
-        else
-          node = node->right;
+	  node = node->left;
+	else
+	  node = node->right;
 
       do
 	{
 	  reg_errcode_t err = fn (extra, node);
 	  if (BE (err != REG_NOERROR, 0))
 	    return err;
-          if (node->parent == NULL)
+	  if (node->parent == NULL)
 	    return REG_NOERROR;
 	  prev = node;
 	  node = node->parent;
@@ -1226,7 +1225,7 @@ preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
 	      prev = node;
 	      node = node->parent;
 	      if (!node)
-	        return REG_NOERROR;
+		return REG_NOERROR;
 	    }
 	  node = node->right;
 	}
@@ -1249,13 +1248,13 @@ optimize_subexps (void *extra, bin_tree_t *node)
     }
 
   else if (node->token.type == SUBEXP
-           && node->left && node->left->token.type == SUBEXP)
+	   && node->left && node->left->token.type == SUBEXP)
     {
       int other_idx = node->left->token.opr.idx;
 
       node->left = node->left->left;
       if (node->left)
-        node->left->parent = node;
+	node->left->parent = node;
 
       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
       if (other_idx < BITSET_WORD_BITS)
@@ -1340,9 +1339,9 @@ calc_first (void *extra, bin_tree_t *node)
       node->first = node;
       node->node_idx = re_dfa_add_node (dfa, node->token);
       if (BE (node->node_idx == -1, 0))
-        return REG_ESPACE;
+	return REG_ESPACE;
       if (node->token.type == ANCHOR)
-        dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
+	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
     }
   return REG_NOERROR;
 }
@@ -1364,7 +1363,7 @@ calc_next (void *extra, bin_tree_t *node)
       if (node->left)
 	node->left->next = node->next;
       if (node->right)
-        node->right->next = node->next;
+	node->right->next = node->next;
       break;
     }
   return REG_NOERROR;
@@ -1643,9 +1642,10 @@ static reg_errcode_t
 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
 {
   reg_errcode_t err;
-  int i, incomplete;
+  int i;
   re_node_set eclosure;
-  incomplete = 0;
+  int ret;
+  int incomplete = 0;
   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
   if (BE (err != REG_NOERROR, 0))
     return err;
@@ -1700,8 +1700,10 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
 	  }
       }
 
-  /* Epsilon closures include itself.  */
-  re_node_set_insert (&eclosure, node);
+  /* An epsilon closure includes itself.  */
+  ret = re_node_set_insert (&eclosure, node);
+  if (BE (ret < 0, 0))
+    return REG_ESPACE;
   if (incomplete && !root)
     dfa->eclosures[node].nelem = 0;
   else
@@ -2285,7 +2287,7 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
 	  && dfa->word_ops_used == 0)
 	init_word_char (dfa);
       if (token->opr.ctx_type == WORD_DELIM
-          || token->opr.ctx_type == NOT_WORD_DELIM)
+	  || token->opr.ctx_type == NOT_WORD_DELIM)
 	{
 	  bin_tree_t *tree_first, *tree_last;
 	  if (token->opr.ctx_type == WORD_DELIM)
@@ -2293,13 +2295,13 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
 	      token->opr.ctx_type = WORD_FIRST;
 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
 	      token->opr.ctx_type = WORD_LAST;
-            }
-          else
-            {
+	    }
+	  else
+	    {
 	      token->opr.ctx_type = INSIDE_WORD;
 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
 	      token->opr.ctx_type = INSIDE_NOTWORD;
-            }
+	    }
 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
@@ -2410,7 +2412,7 @@ parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
     {
       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
-        *err = REG_EPAREN;
+	*err = REG_EPAREN;
       if (BE (*err != REG_NOERROR, 0))
 	return NULL;
     }
@@ -2541,11 +2543,11 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
       elem = duplicate_tree (elem, dfa);
       tree = create_tree (dfa, tree, elem, CONCAT);
       if (BE (elem == NULL || tree == NULL, 0))
-        goto parse_dup_op_espace;
+	goto parse_dup_op_espace;
 
       tree = create_tree (dfa, tree, NULL, OP_ALT);
       if (BE (tree == NULL, 0))
-        goto parse_dup_op_espace;
+	goto parse_dup_op_espace;
     }
 
   if (old_tree)
@@ -2626,9 +2628,9 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
        no MBCSET if dfa->mb_cur_max == 1.  */
     if (mbcset)
       {
-        /* Check the space of the arrays.  */
-        if (BE (*range_alloc == mbcset->nranges, 0))
-          {
+	/* Check the space of the arrays.  */
+	if (BE (*range_alloc == mbcset->nranges, 0))
+	  {
 	    /* There is not enough space, need realloc.  */
 	    wchar_t *new_array_start, *new_array_end;
 	    int new_nranges;
@@ -2638,9 +2640,9 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
 	       are NULL if *range_alloc == 0.  */
 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
-				          new_nranges);
+					  new_nranges);
 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
-				        new_nranges);
+					new_nranges);
 
 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
 	      return REG_ESPACE;
@@ -2648,10 +2650,10 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
 	    mbcset->range_starts = new_array_start;
 	    mbcset->range_ends = new_array_end;
 	    *range_alloc = new_nranges;
-          }
+	  }
 
-        mbcset->range_starts[mbcset->nranges] = start_wc;
-        mbcset->range_ends[mbcset->nranges++] = end_wc;
+	mbcset->range_starts[mbcset->nranges] = start_wc;
+	mbcset->range_ends[mbcset->nranges++] = end_wc;
       }
 
     /* Build the table for single byte characters.  */
@@ -2870,8 +2872,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 	 build below suffices. */
       if (nrules > 0 || dfa->mb_cur_max > 1)
 	{
-          /* Check the space of the arrays.  */
-          if (BE (*range_alloc == mbcset->nranges, 0))
+	  /* Check the space of the arrays.  */
+	  if (BE (*range_alloc == mbcset->nranges, 0))
 	    {
 	      /* There is not enough space, need realloc.  */
 	      uint32_t *new_array_start;
@@ -2883,18 +2885,18 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
 					    new_nranges);
 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
-				          new_nranges);
+					  new_nranges);
 
 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
-	        return REG_ESPACE;
+		return REG_ESPACE;
 
 	      mbcset->range_starts = new_array_start;
 	      mbcset->range_ends = new_array_end;
 	      *range_alloc = new_nranges;
 	    }
 
-          mbcset->range_starts[mbcset->nranges] = start_collseq;
-          mbcset->range_ends[mbcset->nranges++] = end_collseq;
+	  mbcset->range_starts[mbcset->nranges] = start_collseq;
+	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
 	}
 
       /* Build the table for single byte characters.  */
@@ -3227,17 +3229,17 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
       if (sbc_idx < BITSET_WORDS)
 	{
-          /* Build a tree for simple bracket.  */
-          br_token.type = SIMPLE_BRACKET;
-          br_token.opr.sbcset = sbcset;
-          work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
-          if (BE (work_tree == NULL, 0))
-            goto parse_bracket_exp_espace;
+	  /* Build a tree for simple bracket.  */
+	  br_token.type = SIMPLE_BRACKET;
+	  br_token.opr.sbcset = sbcset;
+	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
+	  if (BE (work_tree == NULL, 0))
+	    goto parse_bracket_exp_espace;
 
-          /* Then join them by ALT node.  */
-          work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
-          if (BE (work_tree == NULL, 0))
-            goto parse_bracket_exp_espace;
+	  /* Then join them by ALT node.  */
+	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
+	  if (BE (work_tree == NULL, 0))
+	    goto parse_bracket_exp_espace;
 	}
       else
 	{
@@ -3256,7 +3258,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
       br_token.opr.sbcset = sbcset;
       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
       if (BE (work_tree == NULL, 0))
-        goto parse_bracket_exp_espace;
+	goto parse_bracket_exp_espace;
     }
   return work_tree;
 
@@ -3809,7 +3811,7 @@ duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
 	      node = node->parent;
 	      dup_node = dup_node->parent;
 	      if (!node)
-	        return dup_root;
+		return dup_root;
 	    }
 	  node = node->right;
 	  p_new = &dup_node->right;
diff --git a/posix/sched.h b/posix/sched.h
index e95c80f562..55bc487b37 100644
--- a/posix/sched.h
+++ b/posix/sched.h
@@ -1,5 +1,6 @@
 /* Definitions for POSIX 1003.1b-1993 (aka POSIX.4) scheduling interface.
-   Copyright (C) 1996,1997,1999,2001-2004,2007 Free Software Foundation, Inc.
+   Copyright (C) 1996,1997,1999,2001-2004,2007,2010
+   Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -28,9 +29,16 @@
 #define __need_size_t
 #include <stddef.h>
 
+#define __need_time_t
 #define __need_timespec
 #include <time.h>
 
+#ifndef __pid_t_defined
+typedef __pid_t pid_t;
+# define __pid_t_defined
+#endif
+
+
 /* Get system specific constant and data structure definitions.  */
 #include <bits/sched.h>
 /* Define the real names for the elements of `struct sched_param'.  */
diff --git a/posix/unistd.h b/posix/unistd.h
index d8697553a7..f8b84e3cb3 100644
--- a/posix/unistd.h
+++ b/posix/unistd.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1991-2006, 2007, 2008, 2009 Free Software Foundation, Inc.
+/* Copyright (C) 1991-2009, 2010 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -440,8 +440,8 @@ extern unsigned int alarm (unsigned int __seconds) __THROW;
    __THROW.  */
 extern unsigned int sleep (unsigned int __seconds);
 
-#if ((defined __USE_BSD || defined __USE_XOPEN_EXTENDED)	\
-     && !defined __USE_XOPEN2K8) || defined __USE_GNU
+#if (defined __USE_XOPEN_EXTENDED && !defined __USE_XOPEN2K8) \
+    || defined __USE_BSD
 /* Set an alarm to go off (generating a SIGALRM signal) in VALUE
    microseconds.  If INTERVAL is nonzero, when the alarm goes off, the
    timer is reset to go off every INTERVAL microseconds thereafter.
@@ -514,8 +514,8 @@ extern char *getcwd (char *__buf, size_t __size) __THROW __wur;
 extern char *get_current_dir_name (void) __THROW;
 #endif
 
-#if ((defined __USE_BSD || defined __USE_XOPEN_EXTENDED)	\
-     && !defined __USE_XOPEN2K8) || defined __USE_GNU
+#if (defined __USE_XOPEN_EXTENDED && !defined __USE_XOPEN2K8) \
+    || defined __USE_BSD
 /* Put the absolute pathname of the current working directory in BUF.
    If successful, return BUF.  If not, put an error message in
    BUF and return NULL.  BUF should be at least PATH_MAX bytes long.  */
@@ -774,14 +774,14 @@ extern int setresgid (__gid_t __rgid, __gid_t __egid, __gid_t __sgid)
    and the process ID of the new process to the old process.  */
 extern __pid_t fork (void) __THROW;
 
-#if ((defined __USE_BSD || defined __USE_XOPEN_EXTENDED)	\
-     && !defined __USE_XOPEN2K8) || defined __USE_GNU
+#if (defined __USE_XOPEN_EXTENDED && !defined __USE_XOPEN2K8) \
+    || defined __USE_BSD
 /* Clone the calling process, but without copying the whole address space.
    The calling process is suspended until the new process exits or is
    replaced by a call to `execve'.  Return -1 for errors, 0 to the new process,
    and the process ID of the new process to the old process.  */
 extern __pid_t vfork (void) __THROW;
-#endif /* Use BSD. */
+#endif /* Use BSD or XPG < 7. */
 
 
 /* Return the pathname of the terminal FD is open on, or NULL on errors.
@@ -986,16 +986,16 @@ extern long int gethostid (void);
 extern void sync (void) __THROW;
 
 
-# ifndef __USE_XOPEN2K
+# if defined __USE_BSD || !defined __USE_XOPEN2K
 /* Return the number of bytes in a page.  This is the system's page size,
    which is not necessarily the same as the hardware page size.  */
 extern int getpagesize (void)  __THROW __attribute__ ((__const__));
-# endif
 
 
 /* Return the maximum number of file descriptors
    the current process could possibly have.  */
 extern int getdtablesize (void) __THROW;
+# endif
 
 #endif /* Use BSD || X/Open Unix.  */
 
@@ -1039,8 +1039,8 @@ extern int ftruncate64 (int __fd, __off64_t __length) __THROW __wur;
 #endif /* Use BSD || X/Open Unix || POSIX 2003.  */
 
 
-#if (defined __USE_MISC || defined __USE_XOPEN_EXTENDED) \
-    && !defined __USE_XOPEN2K
+#if (defined __USE_XOPEN_EXTENDED && !defined __USE_XOPEN2K) \
+    || defined __USE_MISC
 
 /* Set the end of accessible data space (aka "the break") to ADDR.
    Returns zero on success and -1 for errors (with errno set).  */
diff --git a/resolv/netdb.h b/resolv/netdb.h
index dc1f7cec61..7671c6aca5 100644
--- a/resolv/netdb.h
+++ b/resolv/netdb.h
@@ -1,4 +1,4 @@
-  /* Copyright (C) 1996-2002, 2003, 2004, 2009 Free Software Foundation, Inc.
+  /* Copyright (C) 1996-2004, 2009, 2010 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -53,22 +53,24 @@
 
 __BEGIN_DECLS
 
+#if defined __USE_MISC || !defined __USE_XOPEN2K8
 /* Error status for non-reentrant lookup functions.
    We use a macro to access always the thread-specific `h_errno' variable.  */
-#define h_errno (*__h_errno_location ())
+# define h_errno (*__h_errno_location ())
 
 /* Function to get address of global `h_errno' variable.  */
 extern int *__h_errno_location (void) __THROW __attribute__ ((__const__));
 
 
 /* Possible values left in `h_errno'.  */
-#define	HOST_NOT_FOUND	1	/* Authoritative Answer Host not found.  */
-#define	TRY_AGAIN	2	/* Non-Authoritative Host not found,
+# define HOST_NOT_FOUND	1	/* Authoritative Answer Host not found.  */
+# define TRY_AGAIN	2	/* Non-Authoritative Host not found,
 				   or SERVERFAIL.  */
-#define	NO_RECOVERY	3	/* Non recoverable errors, FORMERR, REFUSED,
+# define NO_RECOVERY	3	/* Non recoverable errors, FORMERR, REFUSED,
 				   NOTIMP.  */
-#define	NO_DATA		4	/* Valid name, no data record of requested
+# define NO_DATA	4	/* Valid name, no data record of requested
 				   type.  */
+#endif
 #if defined __USE_MISC || defined __USE_GNU
 # define NETDB_INTERNAL	-1	/* See errno.  */
 # define NETDB_SUCCESS	0	/* No problem.  */
@@ -85,7 +87,7 @@ extern int *__h_errno_location (void) __THROW __attribute__ ((__const__));
 # define SCOPE_DELIMITER	'%'
 #endif
 
-#if defined __USE_MISC || defined __USE_GNU
+#ifdef __USE_MISC
 /* Print error indicated by `h_errno' variable on standard error.  STR
    if non-null is printed before the error string.  */
 extern void herror (__const char *__str) __THROW;
diff --git a/resolv/res_send.c b/resolv/res_send.c
index e2bbfcc83f..28a47e42b8 100644
--- a/resolv/res_send.c
+++ b/resolv/res_send.c
@@ -199,10 +199,6 @@ static void		Perror(const res_state, FILE *, const char *, int);
 #endif
 static int		sock_eq(struct sockaddr_in6 *, struct sockaddr_in6 *);
 
-/* Reachover. */
-
-static void convaddr4to6(struct sockaddr_in6 *sa);
-
 /* Public. */
 
 /* int
@@ -911,10 +907,12 @@ static int
 reopen (res_state statp, int *terrno, int ns)
 {
 	if (EXT(statp).nssocks[ns] == -1) {
-		struct sockaddr_in6 *nsap = EXT(statp).nsaddrs[ns];
+		struct sockaddr *nsap
+		  = (struct sockaddr *) EXT(statp).nsaddrs[ns];
+		socklen_t slen;
 
 		/* only try IPv6 if IPv6 NS and if not failed before */
-		if ((EXT(statp).nscount6 > 0) && !statp->ipv6_unavail) {
+		if (nsap->sa_family == AF_INET6 && !statp->ipv6_unavail) {
 			if (__builtin_expect (__have_o_nonblock >= 0, 1)) {
 				EXT(statp).nssocks[ns] =
 				  socket(PF_INET6, SOCK_DGRAM|SOCK_NONBLOCK,
@@ -931,12 +929,8 @@ reopen (res_state statp, int *terrno, int ns)
 				  socket(PF_INET6, SOCK_DGRAM, 0);
 			if (EXT(statp).nssocks[ns] < 0)
 			    statp->ipv6_unavail = errno == EAFNOSUPPORT;
-			/* If IPv6 socket and nsap is IPv4, make it
-			   IPv4-mapped */
-			else if (nsap->sin6_family == AF_INET)
-			    convaddr4to6(nsap);
-		}
-		if (EXT(statp).nssocks[ns] < 0) {
+			slen = sizeof (struct sockaddr_in6);
+		} else if (nsap->sa_family == AF_INET) {
 			if (__builtin_expect (__have_o_nonblock >= 0, 1)) {
 				EXT(statp).nssocks[ns]
 				  = socket(PF_INET, SOCK_DGRAM|SOCK_NONBLOCK,
@@ -951,6 +945,7 @@ reopen (res_state statp, int *terrno, int ns)
 			if (__builtin_expect (__have_o_nonblock < 0, 0))
 				EXT(statp).nssocks[ns]
 				  = socket(PF_INET, SOCK_DGRAM, 0);
+			slen = sizeof (struct sockaddr_in);
 		}
 		if (EXT(statp).nssocks[ns] < 0) {
 			*terrno = errno;
@@ -969,10 +964,8 @@ reopen (res_state statp, int *terrno, int ns)
 		 * error message is received.  We can thus detect
 		 * the absence of a nameserver without timing out.
 		 */
-		if (connect(EXT(statp).nssocks[ns], (struct sockaddr *)nsap,
-			    sizeof *nsap) < 0) {
-			Aerror(statp, stderr, "connect(dg)", errno,
-			       (struct sockaddr *) nsap);
+		if (connect(EXT(statp).nssocks[ns], nsap, slen) < 0) {
+			Aerror(statp, stderr, "connect(dg)", errno, nsap);
 			__res_iclose(statp, false);
 			return (0);
 		}
@@ -1415,22 +1408,3 @@ sock_eq(struct sockaddr_in6 *a1, struct sockaddr_in6 *a2) {
 		(a1->sin6_addr.s6_addr32[3] ==
 		 ((struct sockaddr_in *)a2)->sin_addr.s_addr));
 }
-
-/*
- * Converts IPv4 family, address and port to
- * IPv6 family, IPv4-mapped IPv6 address and port.
- */
-static void
-convaddr4to6(struct sockaddr_in6 *sa)
-{
-    struct sockaddr_in *sa4p = (struct sockaddr_in *) sa;
-    in_port_t port = sa4p->sin_port;
-    in_addr_t addr = sa4p->sin_addr.s_addr;
-
-    sa->sin6_family = AF_INET6;
-    sa->sin6_port = port;
-    sa->sin6_addr.s6_addr32[0] = 0;
-    sa->sin6_addr.s6_addr32[1] = 0;
-    sa->sin6_addr.s6_addr32[2] = htonl(0xFFFF);
-    sa->sin6_addr.s6_addr32[3] = addr;
-}
diff --git a/stdlib/stdlib.h b/stdlib/stdlib.h
index 75750205c8..d1f3841f1b 100644
--- a/stdlib/stdlib.h
+++ b/stdlib/stdlib.h
@@ -497,8 +497,8 @@ extern void cfree (void *__ptr) __THROW;
 # include <alloca.h>
 #endif /* Use GNU, BSD, or misc.  */
 
-#if ((defined __USE_BSD || defined __USE_XOPEN_EXTENDED)	\
-     && !defined __USE_XOPEN2K) || defined __USE_GNU
+#if (defined __USE_XOPEN_EXTENDED && !defined __USE_XOPEN2K) \
+    || defined __USE_BSD
 /* Allocate SIZE bytes on a page boundary.  The storage cannot be freed.  */
 extern void *valloc (size_t __size) __THROW __attribute_malloc__ __wur;
 #endif
@@ -586,7 +586,7 @@ extern int setenv (__const char *__name, __const char *__value, int __replace)
      __THROW __nonnull ((2));
 
 /* Remove the variable NAME from the environment.  */
-extern int unsetenv (__const char *__name) __THROW;
+extern int unsetenv (__const char *__name) __THROW __nonnull ((1));
 #endif
 
 #ifdef	__USE_MISC
@@ -597,8 +597,8 @@ extern int clearenv (void) __THROW;
 #endif
 
 
-#if ((defined __USE_MISC || defined __USE_XOPEN_EXTENDED)	\
-     && !defined __USE_XOPEN2K) || defined __USE_GNU
+#if defined __USE_MISC \
+    || (defined __USE_XOPEN_EXTENDED && !defined __USE_XOPEN2K)
 /* Generate a unique temporary file name from TEMPLATE.
    The last six characters of TEMPLATE must be "XXXXXX";
    they are replaced with a string that makes the file name unique.
@@ -797,8 +797,8 @@ __END_NAMESPACE_C99
 #endif
 
 
-#if ((defined __USE_SVID || defined __USE_XOPEN_EXTENDED)	\
-     && !defined __USE_XOPEN2K) || defined __USE_GNU
+#if (defined __USE_XOPEN_EXTENDED && !defined __USE_XOPEN2K) \
+    || defined __USE_SVID
 /* Convert floating point numbers to strings.  The returned values are
    valid only until another call to the same function.  */
 
diff --git a/string/strings.h b/string/strings.h
index a3dde714f3..2ab83caa06 100644
--- a/string/strings.h
+++ b/string/strings.h
@@ -36,7 +36,7 @@
 
 __BEGIN_DECLS
 
-# if !defined __USE_XOPEN2K8 || defined __USE_GNU
+# if defined __USE_MISC || !defined __USE_XOPEN2K8
 /* Compare N bytes of S1 and S2 (same as memcmp).  */
 extern int bcmp (__const void *__s1, __const void *__s2, size_t __n)
      __THROW __attribute_pure__;
@@ -104,7 +104,7 @@ extern char *rindex (__const char *__s, int __c)
 #  endif
 # endif
 
-#if !defined __USE_XOPEN2K8 || defined __USE_XOPEN2K8XSI
+#if defined __USE_MISC || !defined __USE_XOPEN2K8 || defined __USE_XOPEN2K8XSI
 /* Return the position of the first bit set in I, or 0 if none are set.
    The least-significant bit is position 1, the most-significant 32.  */
 extern int ffs (int __i) __THROW __attribute__ ((const));
diff --git a/sysdeps/generic/netinet/ip.h b/sysdeps/generic/netinet/ip.h
index 38bd7556da..a837b98143 100644
--- a/sysdeps/generic/netinet/ip.h
+++ b/sysdeps/generic/netinet/ip.h
@@ -189,7 +189,26 @@ struct ip_timestamp
 #define	IPTOS_DSCP_EF		0xb8
 
 /*
- * Definitions for IP type of service (ip_tos)
+ * In RFC 2474, Section 4.2.2.1, the Class Selector Codepoints subsume
+ * the old ToS Precedence values.
+ */
+
+#define	IPTOS_CLASS_MASK		0xe0
+#define	IPTOS_CLASS(class)		((tos) & IPTOS_CLASS_MASK)
+#define	IPTOS_CLASS_CS0			0x00
+#define	IPTOS_CLASS_CS1			0x20
+#define	IPTOS_CLASS_CS2			0x40
+#define	IPTOS_CLASS_CS3			0x60
+#define	IPTOS_CLASS_CS4			0x80
+#define	IPTOS_CLASS_CS5			0xa0
+#define	IPTOS_CLASS_CS6			0xc0
+#define	IPTOS_CLASS_CS7			0xe0
+
+#define	IPTOS_CLASS_DEFAULT		IPTOS_CLASS_CS0
+
+/*
+ * Definitions for IP type of service (ip_tos) [deprecated; use DSCP
+ * and CS definitions above instead.]
  */
 #define	IPTOS_TOS_MASK		0x1E
 #define	IPTOS_TOS(tos)		((tos) & IPTOS_TOS_MASK)
@@ -200,18 +219,18 @@ struct ip_timestamp
 #define	IPTOS_MINCOST		IPTOS_LOWCOST
 
 /*
- * Definitions for IP precedence (also in ip_tos) (hopefully unused)
+ * Definitions for IP precedence (also in ip_tos) [also deprecated.]
  */
-#define	IPTOS_PREC_MASK			0xe0
-#define	IPTOS_PREC(tos)                ((tos) & IPTOS_PREC_MASK)
-#define	IPTOS_PREC_NETCONTROL		0xe0
-#define	IPTOS_PREC_INTERNETCONTROL	0xc0
-#define	IPTOS_PREC_CRITIC_ECP		0xa0
-#define	IPTOS_PREC_FLASHOVERRIDE	0x80
-#define	IPTOS_PREC_FLASH		0x60
-#define	IPTOS_PREC_IMMEDIATE		0x40
-#define	IPTOS_PREC_PRIORITY		0x20
-#define	IPTOS_PREC_ROUTINE		0x00
+#define	IPTOS_PREC_MASK			IPTOS_CLASS_MASK
+#define	IPTOS_PREC(tos)			IPTOS_CLASS(tos)
+#define	IPTOS_PREC_NETCONTROL		IPTOS_CLASS_CS7
+#define	IPTOS_PREC_INTERNETCONTROL	IPTOS_CLASS_CS6
+#define	IPTOS_PREC_CRITIC_ECP		IPTOS_CLASS_CS5
+#define	IPTOS_PREC_FLASHOVERRIDE	IPTOS_CLASS_CS4
+#define	IPTOS_PREC_FLASH		IPTOS_CLASS_CS3
+#define	IPTOS_PREC_IMMEDIATE		IPTOS_CLASS_CS2
+#define	IPTOS_PREC_PRIORITY		IPTOS_CLASS_CS1
+#define	IPTOS_PREC_ROUTINE		IPTOS_CLASS_CS0
 
 /*
  * Definitions for options.
diff --git a/sysdeps/i386/i686/bcopy.S b/sysdeps/i386/i686/bcopy.S
new file mode 100644
index 0000000000..15ef9419a4
--- /dev/null
+++ b/sysdeps/i386/i686/bcopy.S
@@ -0,0 +1,3 @@
+#define USE_AS_BCOPY
+#define memmove bcopy
+#include <sysdeps/i386/i686/memmove.S>
diff --git a/sysdeps/i386/i686/cacheinfo.c b/sysdeps/i386/i686/cacheinfo.c
index 82e4cd223e..f8b7f521ca 100644
--- a/sysdeps/i386/i686/cacheinfo.c
+++ b/sysdeps/i386/i686/cacheinfo.c
@@ -1,3 +1,4 @@
+#define __x86_64_data_cache_size __x86_data_cache_size
 #define __x86_64_data_cache_size_half __x86_data_cache_size_half
 #define __x86_64_shared_cache_size __x86_shared_cache_size
 #define __x86_64_shared_cache_size_half __x86_shared_cache_size_half
diff --git a/sysdeps/i386/i686/memcpy.S b/sysdeps/i386/i686/memcpy.S
index 0b2da1ea27..86ee082beb 100644
--- a/sysdeps/i386/i686/memcpy.S
+++ b/sysdeps/i386/i686/memcpy.S
@@ -32,11 +32,11 @@
 
 	.text
 #if defined PIC && !defined NOT_IN_libc
-ENTRY (__memcpy_chk)
+ENTRY_CHK (__memcpy_chk)
 	movl	12(%esp), %eax
 	cmpl	%eax, 16(%esp)
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (__memcpy_chk)
+END_CHK (__memcpy_chk)
 #endif
 ENTRY (BP_SYM (memcpy))
 	ENTER
diff --git a/sysdeps/i386/i686/memmove.S b/sysdeps/i386/i686/memmove.S
index b93b5c729f..981f14f4e0 100644
--- a/sysdeps/i386/i686/memmove.S
+++ b/sysdeps/i386/i686/memmove.S
@@ -26,18 +26,27 @@
 
 #define PARMS	LINKAGE+4	/* one spilled register */
 #define RTN	PARMS
-#define DEST	RTN+RTN_SIZE
-#define SRC	DEST+PTR_SIZE
-#define LEN	SRC+PTR_SIZE
 
 	.text
-#if defined PIC && !defined NOT_IN_libc
-ENTRY (__memmove_chk)
+
+#ifdef USE_AS_BCOPY
+# define SRC	RTN+RTN_SIZE
+# define DEST	SRC+PTR_SIZE
+# define LEN	DEST+PTR_SIZE
+#else
+# define DEST	RTN+RTN_SIZE
+# define SRC	DEST+PTR_SIZE
+# define LEN	SRC+PTR_SIZE
+
+# if defined PIC && !defined NOT_IN_libc
+ENTRY_CHK (__memmove_chk)
 	movl	12(%esp), %eax
 	cmpl	%eax, 16(%esp)
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (__memmove_chk)
+END_CHK (__memmove_chk)
+# endif
 #endif
+
 ENTRY (BP_SYM (memmove))
 	ENTER
 
@@ -69,8 +78,10 @@ ENTRY (BP_SYM (memmove))
 	movsl
 	movl	%edx, %esi
 	cfi_restore (esi)
+#ifndef USE_AS_BCOPY
 	movl	DEST(%esp), %eax
 	RETURN_BOUNDED_POINTER (DEST(%esp))
+#endif
 
 	popl	%edi
 	cfi_adjust_cfa_offset (-4)
@@ -101,8 +112,10 @@ ENTRY (BP_SYM (memmove))
 	movsl
 	movl	%edx, %esi
 	cfi_restore (esi)
+#ifndef USE_AS_BCOPY
 	movl	DEST(%esp), %eax
 	RETURN_BOUNDED_POINTER (DEST(%esp))
+#endif
 
 	cld
 	popl	%edi
@@ -112,4 +125,6 @@ ENTRY (BP_SYM (memmove))
 	LEAVE
 	RET_PTR
 END (BP_SYM (memmove))
+#ifndef USE_AS_BCOPY
 libc_hidden_builtin_def (memmove)
+#endif
diff --git a/sysdeps/i386/i686/mempcpy.S b/sysdeps/i386/i686/mempcpy.S
index 6437e4a5d4..c10686fb3d 100644
--- a/sysdeps/i386/i686/mempcpy.S
+++ b/sysdeps/i386/i686/mempcpy.S
@@ -32,11 +32,11 @@
 
 	.text
 #if defined PIC && !defined NOT_IN_libc
-ENTRY (__mempcpy_chk)
+ENTRY_CHK (__mempcpy_chk)
 	movl	12(%esp), %eax
 	cmpl	%eax, 16(%esp)
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (__mempcpy_chk)
+END_CHK (__mempcpy_chk)
 #endif
 ENTRY (BP_SYM (__mempcpy))
 	ENTER
diff --git a/sysdeps/i386/i686/memset.S b/sysdeps/i386/i686/memset.S
index dfa1aa7019..b343af7b64 100644
--- a/sysdeps/i386/i686/memset.S
+++ b/sysdeps/i386/i686/memset.S
@@ -40,11 +40,11 @@
 
         .text
 #if defined PIC && !defined NOT_IN_libc && !BZERO_P
-ENTRY (__memset_chk)
+ENTRY_CHK (__memset_chk)
 	movl	12(%esp), %eax
 	cmpl	%eax, 16(%esp)
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (__memset_chk)
+END_CHK (__memset_chk)
 #endif
 ENTRY (BP_SYM (memset))
 	ENTER
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index e1553b284e..fbad9ae734 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -4,6 +4,10 @@ gen-as-const-headers += ifunc-defines.sym
 endif
 
 ifeq ($(subdir),string)
+sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
+		   memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \
+		   memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
+		   memset-sse2-rep bzero-sse2-rep
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-strcspn-c.c += -msse4
diff --git a/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
new file mode 100644
index 0000000000..cbc8b420e8
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY		__bcopy_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/sysdeps/i386/i686/multiarch/bcopy-ssse3.S b/sysdeps/i386/i686/multiarch/bcopy-ssse3.S
new file mode 100644
index 0000000000..36aac44b9c
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/bcopy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY		__bcopy_ssse3
+#include "memcpy-ssse3.S"
diff --git a/sysdeps/i386/i686/multiarch/bcopy.S b/sysdeps/i386/i686/multiarch/bcopy.S
new file mode 100644
index 0000000000..8671bf684e
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/bcopy.S
@@ -0,0 +1,89 @@
+/* Multiple versions of bcopy
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(bcopy)
+	.type	bcopy, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__bcopy_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__bcopy_ssse3@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__bcopy_ssse3_rep@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(bcopy)
+# else
+	.text
+ENTRY(bcopy)
+	.type	bcopy, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__bcopy_ia32, %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
+	jz	2f
+	leal	__bcopy_ssse3, %eax
+	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
+	jz	2f
+	leal	__bcopy_ssse3_rep, %eax
+2:	ret
+END(bcopy)
+# endif
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __bcopy_ia32, @function; \
+	.p2align 4; \
+	__bcopy_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __bcopy_ia32, .-__bcopy_ia32
+
+#endif
+
+#include "../bcopy.S"
diff --git a/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S b/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
new file mode 100644
index 0000000000..507b288bb3
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO
+#define __memset_sse2_rep __bzero_sse2_rep
+#include "memset-sse2-rep.S"
diff --git a/sysdeps/i386/i686/multiarch/bzero-sse2.S b/sysdeps/i386/i686/multiarch/bzero-sse2.S
new file mode 100644
index 0000000000..8d04512e4e
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/bzero-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO
+#define __memset_sse2 __bzero_sse2
+#include "memset-sse2.S"
diff --git a/sysdeps/i386/i686/multiarch/bzero.S b/sysdeps/i386/i686/multiarch/bzero.S
new file mode 100644
index 0000000000..8c740a42dc
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/bzero.S
@@ -0,0 +1,97 @@
+/* Multiple versions of bzero
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(__bzero)
+	.type	__bzero, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__bzero_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__bzero_sse2@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__bzero_sse2_rep@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(__bzero)
+# else
+	.text
+ENTRY(__bzero)
+	.type	__bzero, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__bzero_ia32, %eax
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
+	jz	2f
+	leal	__bzero_sse2, %eax
+	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
+	jz	2f
+	leal	__bzero_sse2_rep, %eax
+2:	ret
+END(__bzero)
+# endif
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __bzero_ia32, @function; \
+	.p2align 4; \
+	__bzero_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __bzero_ia32, .-__bzero_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI___bzero; __GI___bzero = __bzero_ia32
+# endif
+#endif
+
+#include "../bzero.S"
diff --git a/sysdeps/i386/i686/multiarch/ifunc-defines.sym b/sysdeps/i386/i686/multiarch/ifunc-defines.sym
index e2021cdf87..eb1538abcc 100644
--- a/sysdeps/i386/i686/multiarch/ifunc-defines.sym
+++ b/sysdeps/i386/i686/multiarch/ifunc-defines.sym
@@ -13,5 +13,8 @@ CPUID_ECX_OFFSET	offsetof (struct cpuid_registers, ecx)
 CPUID_EDX_OFFSET	offsetof (struct cpuid_registers, edx)
 FAMILY_OFFSET		offsetof (struct cpu_features, family)
 MODEL_OFFSET		offsetof (struct cpu_features, model)
+FEATURE_OFFSET		offsetof (struct cpu_features, feature)
+FEATURE_SIZE		sizeof (unsigned int)
 
 COMMON_CPUID_INDEX_1
+FEATURE_INDEX_1
diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
new file mode 100644
index 0000000000..b26037d279
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
@@ -0,0 +1,1785 @@
+/* memcpy with SSSE3 and REP string.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY		__memcpy_ssse3_rep
+# define MEMCPY_CHK	__memcpy_chk_ssse3_rep
+#endif
+
+#ifdef USE_AS_BCOPY
+# define SRC		PARMS
+# define DEST		SRC+4
+# define LEN		DEST+4
+#else
+# define DEST		PARMS
+# define SRC		DEST+4
+# define LEN		SRC+4
+#endif
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef SHARED
+# define PARMS		8		/* Preserve EBX.  */
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register contains the
+   index into the jump table.   SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    /* We first load PC into EBX.  */				\
+    call	__i686.get_pc_thunk.bx;				\
+    /* Get the address of the jump table.  */			\
+    addl	$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
+    addl	$(TABLE - .), %ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	ALIGN (4)
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+#else
+# define PARMS		4
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  INDEX is a register contains the index into the
+   jump table.  SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    jmp		*TABLE(,INDEX,SCALE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
+    jmp		*TABLE(,INDEX,SCALE)
+#endif
+
+	.section .text.ssse3,"ax",@progbits
+#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+
+#ifdef USE_AS_MEMMOVE
+	cmp	%eax, %edx
+	jb	L(copy_forward)
+	je	L(fwd_write_0bytes)
+	cmp	$32, %ecx
+	jge	L(memmove_bwd)
+	jmp	L(bk_write_less32bytes_2)
+L(memmove_bwd):
+	add	%ecx, %eax
+	cmp	%eax, %edx
+	movl	SRC(%esp), %eax
+	jb	L(copy_backward)
+
+L(copy_forward):
+#endif
+	cmp	$48, %ecx
+	jge	L(48bytesormore)
+
+L(fwd_write_less32bytes):
+#ifndef USE_AS_MEMMOVE
+	cmp	%dl, %al
+	jl	L(bk_write)
+#endif
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+#ifndef USE_AS_MEMMOVE
+L(bk_write):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+#endif
+
+	ALIGN (4)
+/* ECX > 32 and EDX is 4 byte aligned.  */
+L(48bytesormore):
+	movdqu	(%eax), %xmm0
+	PUSH (%edi)
+	movl	%edx, %edi
+	and	$-16, %edx
+	PUSH (%esi)
+	add	$16, %edx
+	movl	%edi, %esi
+	sub	%edx, %edi
+	add	%edi, %ecx
+	sub	%edi, %eax
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+	call	__i686.get_pc_thunk.bx
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_shared_cache_size_half, %ecx
+# endif
+#endif
+
+	mov	%eax, %edi
+	jge	L(large_page)
+	and	$0xf, %edi
+	jz	L(shl_0)
+
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
+
+	ALIGN (4)
+L(shl_0):
+	movdqu	%xmm0, (%esi)
+	xor	%edi, %edi
+	cmp	$127, %ecx
+	ja	L(shl_0_gobble)
+	lea	-32(%ecx), %ecx
+L(shl_0_loop):
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jl	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jl	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jl	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+L(shl_0_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	add	%edi, %eax
+	POP (%esi)
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+L(shl_0_gobble):
+
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+	call	__i686.get_pc_thunk.bx
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_data_cache_size_half@GOTOFF(%ebx), %edi
+# else
+	mov	__x86_data_cache_size_half, %edi
+# endif
+#endif
+	mov	%edi, %esi
+	shr	$3, %esi
+	sub	%esi, %edi
+	cmp	%edi, %ecx
+	jge	L(shl_0_gobble_mem_start)
+	lea	-128(%ecx), %ecx
+	ALIGN (4)
+L(shl_0_gobble_cache_loop):
+	movdqa	(%eax), %xmm0
+	movaps	0x10(%eax), %xmm1
+	movaps	0x20(%eax), %xmm2
+	movaps	0x30(%eax), %xmm3
+	movaps	0x40(%eax), %xmm4
+	movaps	0x50(%eax), %xmm5
+	movaps	0x60(%eax), %xmm6
+	movaps	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movaps	%xmm1, 0x10(%edx)
+	movaps	%xmm2, 0x20(%edx)
+	movaps	%xmm3, 0x30(%edx)
+	movaps	%xmm4, 0x40(%edx)
+	movaps	%xmm5, 0x50(%edx)
+	movaps	%xmm6, 0x60(%edx)
+	movaps	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jge	L(shl_0_gobble_cache_loop)
+L(shl_0_gobble_cache_loop_tail):
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx
+	jl	L(shl_0_cache_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+L(shl_0_cache_less_64bytes):
+	cmp	$0x20, %ecx
+	jl	L(shl_0_cache_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+L(shl_0_cache_less_32bytes):
+	cmp	$0x10, %ecx
+	jl	L(shl_0_cache_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+L(shl_0_cache_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	POP (%esi)
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+
+	ALIGN (4)
+L(shl_0_gobble_mem_start):
+	cmp	%al, %dl
+	je	L(copy_page_by_rep)
+	lea	-128(%ecx), %ecx
+L(shl_0_gobble_mem_loop):
+	prefetchnta 0x1c0(%eax)
+	prefetchnta 0x280(%eax)
+	prefetchnta 0x1c0(%edx)
+	prefetchnta 0x280(%edx)
+
+	movdqa	(%eax), %xmm0
+	movaps	0x10(%eax), %xmm1
+	movaps	0x20(%eax), %xmm2
+	movaps	0x30(%eax), %xmm3
+	movaps	0x40(%eax), %xmm4
+	movaps	0x50(%eax), %xmm5
+	movaps	0x60(%eax), %xmm6
+	movaps	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$0x80, %ecx
+	movdqa	%xmm0, (%edx)
+	movaps	%xmm1, 0x10(%edx)
+	movaps	%xmm2, 0x20(%edx)
+	movaps	%xmm3, 0x30(%edx)
+	movaps	%xmm4, 0x40(%edx)
+	movaps	%xmm5, 0x50(%edx)
+	movaps	%xmm6, 0x60(%edx)
+	movaps	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jge	L(shl_0_gobble_mem_loop)
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx
+	jl	L(shl_0_mem_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+L(shl_0_mem_less_64bytes):
+	cmp	$0x20, %ecx
+	jl	L(shl_0_mem_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+L(shl_0_mem_less_32bytes):
+	cmp	$0x10, %ecx
+	jl	L(shl_0_mem_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+L(shl_0_mem_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	POP (%esi)
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+
+	ALIGN (4)
+L(shl_1):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-1(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_1_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_1_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_1_loop)
+
+L(shl_1_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	1(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_2):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-2(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_2_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_2_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_2_loop)
+
+L(shl_2_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	2(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_3):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-3(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_3_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_3_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_3_loop)
+
+L(shl_3_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	3(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_4):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-4(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_4_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_4_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_4_loop)
+
+L(shl_4_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	4(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_5):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-5(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_5_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_5_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_5_loop)
+
+L(shl_5_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	5(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+
+	ALIGN (4)
+L(shl_6):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-6(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_6_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_6_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_6_loop)
+
+L(shl_6_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	6(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_7):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-7(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_7_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_7_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_7_loop)
+
+L(shl_7_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	7(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_8):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-8(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_8_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_8_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_8_loop)
+
+L(shl_8_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	8(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_9):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-9(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_9_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_9_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_9_loop)
+
+L(shl_9_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	9(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_10):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-10(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_10_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_10_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_10_loop)
+
+L(shl_10_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	10(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_11):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-11(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_11_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_11_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_11_loop)
+
+L(shl_11_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	11(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_12):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-12(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_12_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_12_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_12_loop)
+
+L(shl_12_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	12(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_13):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-13(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_13_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_13_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_13_loop)
+
+L(shl_13_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	13(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_14):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-14(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_14_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_14_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_14_loop)
+
+L(shl_14_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	14(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+
+	ALIGN (4)
+L(shl_15):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-15(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_15_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_15_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_15_loop)
+
+L(shl_15_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	15(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+
+	ALIGN (4)
+L(fwd_write_44bytes):
+	movl	-44(%eax), %ecx
+	movl	%ecx, -44(%edx)
+L(fwd_write_40bytes):
+	movl	-40(%eax), %ecx
+	movl	%ecx, -40(%edx)
+L(fwd_write_36bytes):
+	movl	-36(%eax), %ecx
+	movl	%ecx, -36(%edx)
+L(fwd_write_32bytes):
+	movl	-32(%eax), %ecx
+	movl	%ecx, -32(%edx)
+L(fwd_write_28bytes):
+	movl	-28(%eax), %ecx
+	movl	%ecx, -28(%edx)
+L(fwd_write_24bytes):
+	movl	-24(%eax), %ecx
+	movl	%ecx, -24(%edx)
+L(fwd_write_20bytes):
+	movl	-20(%eax), %ecx
+	movl	%ecx, -20(%edx)
+L(fwd_write_16bytes):
+	movl	-16(%eax), %ecx
+	movl	%ecx, -16(%edx)
+L(fwd_write_12bytes):
+	movl	-12(%eax), %ecx
+	movl	%ecx, -12(%edx)
+L(fwd_write_8bytes):
+	movl	-8(%eax), %ecx
+	movl	%ecx, -8(%edx)
+L(fwd_write_4bytes):
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+L(fwd_write_0bytes):
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_5bytes):
+	movl	-5(%eax), %ecx
+	movl	-4(%eax), %eax
+	movl	%ecx, -5(%edx)
+	movl	%eax, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_45bytes):
+	movl	-45(%eax), %ecx
+	movl	%ecx, -45(%edx)
+L(fwd_write_41bytes):
+	movl	-41(%eax), %ecx
+	movl	%ecx, -41(%edx)
+L(fwd_write_37bytes):
+	movl	-37(%eax), %ecx
+	movl	%ecx, -37(%edx)
+L(fwd_write_33bytes):
+	movl	-33(%eax), %ecx
+	movl	%ecx, -33(%edx)
+L(fwd_write_29bytes):
+	movl	-29(%eax), %ecx
+	movl	%ecx, -29(%edx)
+L(fwd_write_25bytes):
+	movl	-25(%eax), %ecx
+	movl	%ecx, -25(%edx)
+L(fwd_write_21bytes):
+	movl	-21(%eax), %ecx
+	movl	%ecx, -21(%edx)
+L(fwd_write_17bytes):
+	movl	-17(%eax), %ecx
+	movl	%ecx, -17(%edx)
+L(fwd_write_13bytes):
+	movl	-13(%eax), %ecx
+	movl	%ecx, -13(%edx)
+L(fwd_write_9bytes):
+	movl	-9(%eax), %ecx
+	movl	%ecx, -9(%edx)
+	movl	-5(%eax), %ecx
+	movl	%ecx, -5(%edx)
+L(fwd_write_1bytes):
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_46bytes):
+	movl	-46(%eax), %ecx
+	movl	%ecx, -46(%edx)
+L(fwd_write_42bytes):
+	movl	-42(%eax), %ecx
+	movl	%ecx, -42(%edx)
+L(fwd_write_38bytes):
+	movl	-38(%eax), %ecx
+	movl	%ecx, -38(%edx)
+L(fwd_write_34bytes):
+	movl	-34(%eax), %ecx
+	movl	%ecx, -34(%edx)
+L(fwd_write_30bytes):
+	movl	-30(%eax), %ecx
+	movl	%ecx, -30(%edx)
+L(fwd_write_26bytes):
+	movl	-26(%eax), %ecx
+	movl	%ecx, -26(%edx)
+L(fwd_write_22bytes):
+	movl	-22(%eax), %ecx
+	movl	%ecx, -22(%edx)
+L(fwd_write_18bytes):
+	movl	-18(%eax), %ecx
+	movl	%ecx, -18(%edx)
+L(fwd_write_14bytes):
+	movl	-14(%eax), %ecx
+	movl	%ecx, -14(%edx)
+L(fwd_write_10bytes):
+	movl	-10(%eax), %ecx
+	movl	%ecx, -10(%edx)
+L(fwd_write_6bytes):
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+L(fwd_write_2bytes):
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_47bytes):
+	movl	-47(%eax), %ecx
+	movl	%ecx, -47(%edx)
+L(fwd_write_43bytes):
+	movl	-43(%eax), %ecx
+	movl	%ecx, -43(%edx)
+L(fwd_write_39bytes):
+	movl	-39(%eax), %ecx
+	movl	%ecx, -39(%edx)
+L(fwd_write_35bytes):
+	movl	-35(%eax), %ecx
+	movl	%ecx, -35(%edx)
+L(fwd_write_31bytes):
+	movl	-31(%eax), %ecx
+	movl	%ecx, -31(%edx)
+L(fwd_write_27bytes):
+	movl	-27(%eax), %ecx
+	movl	%ecx, -27(%edx)
+L(fwd_write_23bytes):
+	movl	-23(%eax), %ecx
+	movl	%ecx, -23(%edx)
+L(fwd_write_19bytes):
+	movl	-19(%eax), %ecx
+	movl	%ecx, -19(%edx)
+L(fwd_write_15bytes):
+	movl	-15(%eax), %ecx
+	movl	%ecx, -15(%edx)
+L(fwd_write_11bytes):
+	movl	-11(%eax), %ecx
+	movl	%ecx, -11(%edx)
+L(fwd_write_7bytes):
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+L(fwd_write_3bytes):
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(large_page):
+	movdqu	(%eax), %xmm1
+	lea	16(%eax), %eax
+	movdqu	%xmm0, (%esi)
+	movntdq	%xmm1, (%edx)
+	lea	16(%edx), %edx
+	cmp	%al, %dl
+	je	L(copy_page_by_rep)
+L(large_page_loop_init):
+	POP (%esi)
+	lea	-0x90(%ecx), %ecx
+	POP (%edi)
+L(large_page_loop):
+	prefetchnta	0x1c0(%eax)
+	prefetchnta	0x280(%eax)
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	movdqu	0x40(%eax), %xmm4
+	movdqu	0x50(%eax), %xmm5
+	movdqu	0x60(%eax), %xmm6
+	movdqu	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	lfence
+	sub	$0x80, %ecx
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	movntdq	%xmm4, 0x40(%edx)
+	movntdq	%xmm5, 0x50(%edx)
+	movntdq	%xmm6, 0x60(%edx)
+	movntdq	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+	jae	L(large_page_loop)
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx
+	jl	L(large_page_less_64bytes)
+
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	lea	0x40(%eax), %eax
+
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	lea	0x40(%edx), %edx
+	sub	$0x40, %ecx
+L(large_page_less_64bytes):
+	cmp	$32, %ecx
+	jl	L(large_page_less_32bytes)
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	lea	0x20(%eax), %eax
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	lea	0x20(%edx), %edx
+	sub	$0x20, %ecx
+L(large_page_less_32bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	sfence
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(copy_page_by_rep):
+	mov	%eax, %esi
+	mov	%edx, %edi
+	mov	%ecx, %edx
+	shr	$2, %ecx
+	and	$3, %edx
+	rep	movsl
+	jz	L(copy_page_by_rep_exit)
+	cmp	$2, %edx
+	jb	L(copy_page_by_rep_left_1)
+	movzwl	(%esi), %eax
+	movw	%ax, (%edi)
+	add	$2, %esi
+	add	$2, %edi
+	sub	$2, %edx
+	jz	L(copy_page_by_rep_exit)
+L(copy_page_by_rep_left_1):
+	movzbl	(%esi), %eax
+	movb	%al, (%edi)
+L(copy_page_by_rep_exit):
+	POP (%esi)
+	POP (%edi)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_44bytes):
+	movl	40(%eax), %ecx
+	movl	%ecx, 40(%edx)
+L(bk_write_40bytes):
+	movl	36(%eax), %ecx
+	movl	%ecx, 36(%edx)
+L(bk_write_36bytes):
+	movl	32(%eax), %ecx
+	movl	%ecx, 32(%edx)
+L(bk_write_32bytes):
+	movl	28(%eax), %ecx
+	movl	%ecx, 28(%edx)
+L(bk_write_28bytes):
+	movl	24(%eax), %ecx
+	movl	%ecx, 24(%edx)
+L(bk_write_24bytes):
+	movl	20(%eax), %ecx
+	movl	%ecx, 20(%edx)
+L(bk_write_20bytes):
+	movl	16(%eax), %ecx
+	movl	%ecx, 16(%edx)
+L(bk_write_16bytes):
+	movl	12(%eax), %ecx
+	movl	%ecx, 12(%edx)
+L(bk_write_12bytes):
+	movl	8(%eax), %ecx
+	movl	%ecx, 8(%edx)
+L(bk_write_8bytes):
+	movl	4(%eax), %ecx
+	movl	%ecx, 4(%edx)
+L(bk_write_4bytes):
+	movl	(%eax), %ecx
+	movl	%ecx, (%edx)
+L(bk_write_0bytes):
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_45bytes):
+	movl	41(%eax), %ecx
+	movl	%ecx, 41(%edx)
+L(bk_write_41bytes):
+	movl	37(%eax), %ecx
+	movl	%ecx, 37(%edx)
+L(bk_write_37bytes):
+	movl	33(%eax), %ecx
+	movl	%ecx, 33(%edx)
+L(bk_write_33bytes):
+	movl	29(%eax), %ecx
+	movl	%ecx, 29(%edx)
+L(bk_write_29bytes):
+	movl	25(%eax), %ecx
+	movl	%ecx, 25(%edx)
+L(bk_write_25bytes):
+	movl	21(%eax), %ecx
+	movl	%ecx, 21(%edx)
+L(bk_write_21bytes):
+	movl	17(%eax), %ecx
+	movl	%ecx, 17(%edx)
+L(bk_write_17bytes):
+	movl	13(%eax), %ecx
+	movl	%ecx, 13(%edx)
+L(bk_write_13bytes):
+	movl	9(%eax), %ecx
+	movl	%ecx, 9(%edx)
+L(bk_write_9bytes):
+	movl	5(%eax), %ecx
+	movl	%ecx, 5(%edx)
+L(bk_write_5bytes):
+	movl	1(%eax), %ecx
+	movl	%ecx, 1(%edx)
+L(bk_write_1bytes):
+	movzbl	(%eax), %ecx
+	movb	%cl, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_46bytes):
+	movl	42(%eax), %ecx
+	movl	%ecx, 42(%edx)
+L(bk_write_42bytes):
+	movl	38(%eax), %ecx
+	movl	%ecx, 38(%edx)
+L(bk_write_38bytes):
+	movl	34(%eax), %ecx
+	movl	%ecx, 34(%edx)
+L(bk_write_34bytes):
+	movl	30(%eax), %ecx
+	movl	%ecx, 30(%edx)
+L(bk_write_30bytes):
+	movl	26(%eax), %ecx
+	movl	%ecx, 26(%edx)
+L(bk_write_26bytes):
+	movl	22(%eax), %ecx
+	movl	%ecx, 22(%edx)
+L(bk_write_22bytes):
+	movl	18(%eax), %ecx
+	movl	%ecx, 18(%edx)
+L(bk_write_18bytes):
+	movl	14(%eax), %ecx
+	movl	%ecx, 14(%edx)
+L(bk_write_14bytes):
+	movl	10(%eax), %ecx
+	movl	%ecx, 10(%edx)
+L(bk_write_10bytes):
+	movl	6(%eax), %ecx
+	movl	%ecx, 6(%edx)
+L(bk_write_6bytes):
+	movl	2(%eax), %ecx
+	movl	%ecx, 2(%edx)
+L(bk_write_2bytes):
+	movzwl	(%eax), %ecx
+	movw	%cx, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_47bytes):
+	movl	43(%eax), %ecx
+	movl	%ecx, 43(%edx)
+L(bk_write_43bytes):
+	movl	39(%eax), %ecx
+	movl	%ecx, 39(%edx)
+L(bk_write_39bytes):
+	movl	35(%eax), %ecx
+	movl	%ecx, 35(%edx)
+L(bk_write_35bytes):
+	movl	31(%eax), %ecx
+	movl	%ecx, 31(%edx)
+L(bk_write_31bytes):
+	movl	27(%eax), %ecx
+	movl	%ecx, 27(%edx)
+L(bk_write_27bytes):
+	movl	23(%eax), %ecx
+	movl	%ecx, 23(%edx)
+L(bk_write_23bytes):
+	movl	19(%eax), %ecx
+	movl	%ecx, 19(%edx)
+L(bk_write_19bytes):
+	movl	15(%eax), %ecx
+	movl	%ecx, 15(%edx)
+L(bk_write_15bytes):
+	movl	11(%eax), %ecx
+	movl	%ecx, 11(%edx)
+L(bk_write_11bytes):
+	movl	7(%eax), %ecx
+	movl	%ecx, 7(%edx)
+L(bk_write_7bytes):
+	movl	3(%eax), %ecx
+	movl	%ecx, 3(%edx)
+L(bk_write_3bytes):
+	movzwl	1(%eax), %ecx
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax
+	movb	%al, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN_END
+
+
+	.pushsection .rodata.ssse3,"a",@progbits
+	ALIGN (2)
+L(table_48bytes_fwd):
+	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
+	ALIGN (2)
+L(shl_table):
+	.int	JMPTBL (L(shl_0), L(shl_table))
+	.int	JMPTBL (L(shl_1), L(shl_table))
+	.int	JMPTBL (L(shl_2), L(shl_table))
+	.int	JMPTBL (L(shl_3), L(shl_table))
+	.int	JMPTBL (L(shl_4), L(shl_table))
+	.int	JMPTBL (L(shl_5), L(shl_table))
+	.int	JMPTBL (L(shl_6), L(shl_table))
+	.int	JMPTBL (L(shl_7), L(shl_table))
+	.int	JMPTBL (L(shl_8), L(shl_table))
+	.int	JMPTBL (L(shl_9), L(shl_table))
+	.int	JMPTBL (L(shl_10), L(shl_table))
+	.int	JMPTBL (L(shl_11), L(shl_table))
+	.int	JMPTBL (L(shl_12), L(shl_table))
+	.int	JMPTBL (L(shl_13), L(shl_table))
+	.int	JMPTBL (L(shl_14), L(shl_table))
+	.int	JMPTBL (L(shl_15), L(shl_table))
+
+	ALIGN (2)
+L(table_48_bytes_bwd):
+	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+	.popsection
+
+#ifdef USE_AS_MEMMOVE
+	ALIGN (4)
+L(copy_backward):
+	PUSH (%esi)
+	movl	%eax, %esi
+	lea	(%ecx,%edx,1),%edx
+	lea	(%ecx,%esi,1),%esi
+	testl	$0x3, %edx
+	jnz	L(bk_align)
+
+L(bk_aligned_4):
+	cmp	$64, %ecx
+	jge	L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+	cmp	$32, %ecx
+	jl	L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+	/* Copy 32 bytes at a time.  */
+	sub	$32, %ecx
+	movl	-4(%esi), %eax
+	movl	%eax, -4(%edx)
+	movl	-8(%esi), %eax
+	movl	%eax, -8(%edx)
+	movl	-12(%esi), %eax
+	movl	%eax, -12(%edx)
+	movl	-16(%esi), %eax
+	movl	%eax, -16(%edx)
+	movl	-20(%esi), %eax
+	movl	%eax, -20(%edx)
+	movl	-24(%esi), %eax
+	movl	%eax, -24(%edx)
+	movl	-28(%esi), %eax
+	movl	%eax, -28(%edx)
+	movl	-32(%esi), %eax
+	movl	%eax, -32(%edx)
+	sub	$32, %edx
+	sub	$32, %esi
+
+L(bk_write_less32bytes):
+	movl	%esi, %eax
+	sub	%ecx, %edx
+	sub	%ecx, %eax
+	POP (%esi)
+L(bk_write_less32bytes_2):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+
+	ALIGN (4)
+L(bk_align):
+	cmp	$8, %ecx
+	jle	L(bk_write_less32bytes)
+	testl	$1, %edx
+	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
+	   then (EDX & 2) must be != 0.  */
+	jz	L(bk_got2)
+	sub	$1, %esi
+	sub	$1, %ecx
+	sub	$1, %edx
+	movzbl	(%esi), %eax
+	movb	%al, (%edx)
+
+	testl	$2, %edx
+	jz	L(bk_aligned_4)
+
+L(bk_got2):
+	sub	$2, %esi
+	sub	$2, %ecx
+	sub	$2, %edx
+	movzwl	(%esi), %eax
+	movw	%ax, (%edx)
+	jmp	L(bk_aligned_4)
+
+	ALIGN (4)
+L(bk_write_more64bytes):
+	/* Check alignment of last byte.  */
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+/* EDX is aligned 4 bytes, but not 16 bytes.  */
+L(bk_ssse3_align):
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+	cmp	$64, %ecx
+	jl	L(bk_write_more32bytes)
+
+L(bk_ssse3_cpy):
+	sub	$64, %esi
+	sub	$64, %ecx
+	sub	$64, %edx
+	movdqu	0x30(%esi), %xmm3
+	movdqa	%xmm3, 0x30(%edx)
+	movdqu	0x20(%esi), %xmm2
+	movdqa	%xmm2, 0x20(%edx)
+	movdqu	0x10(%esi), %xmm1
+	movdqa	%xmm1, 0x10(%edx)
+	movdqu	(%esi), %xmm0
+	movdqa	%xmm0, (%edx)
+	cmp	$64, %ecx
+	jge	L(bk_ssse3_cpy)
+	jmp	L(bk_write_64bytesless)
+
+#endif
+
+END (MEMCPY)
+
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
new file mode 100644
index 0000000000..749c82d379
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
@@ -0,0 +1,1737 @@
+/* memcpy with SSSE3
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY		__memcpy_ssse3
+# define MEMCPY_CHK	__memcpy_chk_ssse3
+#endif
+
+#ifdef USE_AS_BCOPY
+# define SRC		PARMS
+# define DEST		SRC+4
+# define LEN		DEST+4
+#else
+# define DEST		PARMS
+# define SRC		DEST+4
+# define LEN		SRC+4
+#endif
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef SHARED
+# define PARMS		8		/* Preserve EBX.  */
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register contains the
+   index into the jump table.   SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    /* We first load PC into EBX.  */				\
+    call	__i686.get_pc_thunk.bx;				\
+    /* Get the address of the jump table.  */			\
+    addl	$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
+    addl	$(TABLE - .), %ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	ALIGN (4)
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+#else
+# define PARMS		4
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  INDEX is a register contains the index into the
+   jump table.  SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    jmp		*TABLE(,INDEX,SCALE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
+    jmp		*TABLE(,INDEX,SCALE)
+#endif
+
+	.section .text.ssse3,"ax",@progbits
+#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+
+#ifdef USE_AS_MEMMOVE
+	cmp	%eax, %edx
+	jb	L(copy_forward)
+	je	L(fwd_write_0bytes)
+	cmp	$32, %ecx
+	jge	L(memmove_bwd)
+	jmp	L(bk_write_less32bytes_2)
+L(memmove_bwd):
+	add	%ecx, %eax
+	cmp	%eax, %edx
+	movl	SRC(%esp), %eax
+	jb	L(copy_backward)
+
+L(copy_forward):
+#endif
+	cmp	$48, %ecx
+	jge	L(48bytesormore)
+
+L(fwd_write_less32bytes):
+#ifndef USE_AS_MEMMOVE
+	cmp	%dl, %al
+	jl	L(bk_write)
+#endif
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+#ifndef USE_AS_MEMMOVE
+L(bk_write):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+#endif
+
+	ALIGN (4)
+/* ECX > 32 and EDX is 4 byte aligned.  */
+L(48bytesormore):
+	movdqu	(%eax), %xmm0
+	PUSH (%edi)
+	movl	%edx, %edi
+	and	$-16, %edx
+	PUSH (%esi)
+	add	$16, %edx
+	movl	%edi, %esi
+	sub	%edx, %edi
+	add	%edi, %ecx
+	sub	%edi, %eax
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+	call	__i686.get_pc_thunk.bx
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_shared_cache_size_half, %ecx
+# endif
+#endif
+
+	mov	%eax, %edi
+	jge	L(large_page)
+	and	$0xf, %edi
+	jz	L(shl_0)
+
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
+
+	ALIGN (4)
+L(shl_0):
+	movdqu	%xmm0, (%esi)
+	xor	%edi, %edi
+	POP (%esi)
+	cmp	$127, %ecx
+	ja	L(shl_0_gobble)
+	lea	-32(%ecx), %ecx
+L(shl_0_loop):
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jl	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jl	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jl	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+L(shl_0_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	add	%edi, %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+L(shl_0_gobble):
+
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+	call	__i686.get_pc_thunk.bx
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+#endif
+
+	POP (%edi)
+	lea	-128(%ecx), %ecx
+	jge	L(shl_0_gobble_mem_loop)
+L(shl_0_gobble_cache_loop):
+	movdqa	(%eax), %xmm0
+	movdqa	0x10(%eax), %xmm1
+	movdqa	0x20(%eax), %xmm2
+	movdqa	0x30(%eax), %xmm3
+	movdqa	0x40(%eax), %xmm4
+	movdqa	0x50(%eax), %xmm5
+	movdqa	0x60(%eax), %xmm6
+	movdqa	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	%xmm2, 0x20(%edx)
+	movdqa	%xmm3, 0x30(%edx)
+	movdqa	%xmm4, 0x40(%edx)
+	movdqa	%xmm5, 0x50(%edx)
+	movdqa	%xmm6, 0x60(%edx)
+	movdqa	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jge	L(shl_0_gobble_cache_loop)
+L(shl_0_gobble_cache_loop_tail):
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx
+	jl	L(shl_0_cache_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+L(shl_0_cache_less_64bytes):
+	cmp	$0x20, %ecx
+	jl	L(shl_0_cache_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+L(shl_0_cache_less_32bytes):
+	cmp	$0x10, %ecx
+	jl	L(shl_0_cache_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+L(shl_0_cache_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+
+	ALIGN (4)
+L(shl_0_gobble_mem_loop):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x280(%eax)
+	prefetcht0 0x1c0(%edx)
+
+	movdqa	(%eax), %xmm0
+	movdqa	0x10(%eax), %xmm1
+	movdqa	0x20(%eax), %xmm2
+	movdqa	0x30(%eax), %xmm3
+	movdqa	0x40(%eax), %xmm4
+	movdqa	0x50(%eax), %xmm5
+	movdqa	0x60(%eax), %xmm6
+	movdqa	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$0x80, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	%xmm2, 0x20(%edx)
+	movdqa	%xmm3, 0x30(%edx)
+	movdqa	%xmm4, 0x40(%edx)
+	movdqa	%xmm5, 0x50(%edx)
+	movdqa	%xmm6, 0x60(%edx)
+	movdqa	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jge	L(shl_0_gobble_mem_loop)
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx
+	jl	L(shl_0_mem_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+L(shl_0_mem_less_64bytes):
+	cmp	$0x20, %ecx
+	jl	L(shl_0_mem_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+L(shl_0_mem_less_32bytes):
+	cmp	$0x10, %ecx
+	jl	L(shl_0_mem_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+L(shl_0_mem_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+
+	ALIGN (4)
+L(shl_1):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-1(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_1_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_1_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_1_loop)
+
+L(shl_1_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	1(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_2):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-2(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_2_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_2_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_2_loop)
+
+L(shl_2_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	2(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_3):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-3(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_3_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_3_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_3_loop)
+
+L(shl_3_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	3(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_4):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-4(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_4_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_4_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_4_loop)
+
+L(shl_4_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	4(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_5):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-5(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_5_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_5_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_5_loop)
+
+L(shl_5_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	5(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+
+	ALIGN (4)
+L(shl_6):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-6(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_6_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_6_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_6_loop)
+
+L(shl_6_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	6(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_7):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-7(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_7_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_7_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_7_loop)
+
+L(shl_7_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	7(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_8):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-8(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_8_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_8_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_8_loop)
+
+L(shl_8_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	8(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_9):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-9(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_9_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_9_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_9_loop)
+
+L(shl_9_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	9(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_10):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-10(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_10_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_10_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_10_loop)
+
+L(shl_10_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	10(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_11):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-11(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_11_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_11_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_11_loop)
+
+L(shl_11_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	11(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_12):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-12(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_12_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_12_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_12_loop)
+
+L(shl_12_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	12(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_13):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-13(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_13_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_13_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_13_loop)
+
+L(shl_13_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	13(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_14):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-14(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_14_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_14_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_14_loop)
+
+L(shl_14_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	14(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+
+	ALIGN (4)
+L(shl_15):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-15(%eax), %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	lea	-32(%ecx), %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_15_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jl	L(shl_15_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_15_loop)
+
+L(shl_15_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	15(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+
+	ALIGN (4)
+L(fwd_write_44bytes):
+	movl	-44(%eax), %ecx
+	movl	%ecx, -44(%edx)
+L(fwd_write_40bytes):
+	movl	-40(%eax), %ecx
+	movl	%ecx, -40(%edx)
+L(fwd_write_36bytes):
+	movl	-36(%eax), %ecx
+	movl	%ecx, -36(%edx)
+L(fwd_write_32bytes):
+	movl	-32(%eax), %ecx
+	movl	%ecx, -32(%edx)
+L(fwd_write_28bytes):
+	movl	-28(%eax), %ecx
+	movl	%ecx, -28(%edx)
+L(fwd_write_24bytes):
+	movl	-24(%eax), %ecx
+	movl	%ecx, -24(%edx)
+L(fwd_write_20bytes):
+	movl	-20(%eax), %ecx
+	movl	%ecx, -20(%edx)
+L(fwd_write_16bytes):
+	movl	-16(%eax), %ecx
+	movl	%ecx, -16(%edx)
+L(fwd_write_12bytes):
+	movl	-12(%eax), %ecx
+	movl	%ecx, -12(%edx)
+L(fwd_write_8bytes):
+	movl	-8(%eax), %ecx
+	movl	%ecx, -8(%edx)
+L(fwd_write_4bytes):
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+L(fwd_write_0bytes):
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_5bytes):
+	movl	-5(%eax), %ecx
+	movl	-4(%eax), %eax
+	movl	%ecx, -5(%edx)
+	movl	%eax, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_45bytes):
+	movl	-45(%eax), %ecx
+	movl	%ecx, -45(%edx)
+L(fwd_write_41bytes):
+	movl	-41(%eax), %ecx
+	movl	%ecx, -41(%edx)
+L(fwd_write_37bytes):
+	movl	-37(%eax), %ecx
+	movl	%ecx, -37(%edx)
+L(fwd_write_33bytes):
+	movl	-33(%eax), %ecx
+	movl	%ecx, -33(%edx)
+L(fwd_write_29bytes):
+	movl	-29(%eax), %ecx
+	movl	%ecx, -29(%edx)
+L(fwd_write_25bytes):
+	movl	-25(%eax), %ecx
+	movl	%ecx, -25(%edx)
+L(fwd_write_21bytes):
+	movl	-21(%eax), %ecx
+	movl	%ecx, -21(%edx)
+L(fwd_write_17bytes):
+	movl	-17(%eax), %ecx
+	movl	%ecx, -17(%edx)
+L(fwd_write_13bytes):
+	movl	-13(%eax), %ecx
+	movl	%ecx, -13(%edx)
+L(fwd_write_9bytes):
+	movl	-9(%eax), %ecx
+	movl	%ecx, -9(%edx)
+	movl	-5(%eax), %ecx
+	movl	%ecx, -5(%edx)
+L(fwd_write_1bytes):
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_46bytes):
+	movl	-46(%eax), %ecx
+	movl	%ecx, -46(%edx)
+L(fwd_write_42bytes):
+	movl	-42(%eax), %ecx
+	movl	%ecx, -42(%edx)
+L(fwd_write_38bytes):
+	movl	-38(%eax), %ecx
+	movl	%ecx, -38(%edx)
+L(fwd_write_34bytes):
+	movl	-34(%eax), %ecx
+	movl	%ecx, -34(%edx)
+L(fwd_write_30bytes):
+	movl	-30(%eax), %ecx
+	movl	%ecx, -30(%edx)
+L(fwd_write_26bytes):
+	movl	-26(%eax), %ecx
+	movl	%ecx, -26(%edx)
+L(fwd_write_22bytes):
+	movl	-22(%eax), %ecx
+	movl	%ecx, -22(%edx)
+L(fwd_write_18bytes):
+	movl	-18(%eax), %ecx
+	movl	%ecx, -18(%edx)
+L(fwd_write_14bytes):
+	movl	-14(%eax), %ecx
+	movl	%ecx, -14(%edx)
+L(fwd_write_10bytes):
+	movl	-10(%eax), %ecx
+	movl	%ecx, -10(%edx)
+L(fwd_write_6bytes):
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+L(fwd_write_2bytes):
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_47bytes):
+	movl	-47(%eax), %ecx
+	movl	%ecx, -47(%edx)
+L(fwd_write_43bytes):
+	movl	-43(%eax), %ecx
+	movl	%ecx, -43(%edx)
+L(fwd_write_39bytes):
+	movl	-39(%eax), %ecx
+	movl	%ecx, -39(%edx)
+L(fwd_write_35bytes):
+	movl	-35(%eax), %ecx
+	movl	%ecx, -35(%edx)
+L(fwd_write_31bytes):
+	movl	-31(%eax), %ecx
+	movl	%ecx, -31(%edx)
+L(fwd_write_27bytes):
+	movl	-27(%eax), %ecx
+	movl	%ecx, -27(%edx)
+L(fwd_write_23bytes):
+	movl	-23(%eax), %ecx
+	movl	%ecx, -23(%edx)
+L(fwd_write_19bytes):
+	movl	-19(%eax), %ecx
+	movl	%ecx, -19(%edx)
+L(fwd_write_15bytes):
+	movl	-15(%eax), %ecx
+	movl	%ecx, -15(%edx)
+L(fwd_write_11bytes):
+	movl	-11(%eax), %ecx
+	movl	%ecx, -11(%edx)
+L(fwd_write_7bytes):
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+L(fwd_write_3bytes):
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(large_page):
+	movdqu	(%eax), %xmm1
+	lea	16(%eax), %eax
+	movdqu	%xmm0, (%esi)
+	movntdq	%xmm1, (%edx)
+	lea	16(%edx), %edx
+	POP (%esi)
+	lea	-0x90(%ecx), %ecx
+	POP (%edi)
+L(large_page_loop):
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	movdqu	0x40(%eax), %xmm4
+	movdqu	0x50(%eax), %xmm5
+	movdqu	0x60(%eax), %xmm6
+	movdqu	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+
+	sub	$0x80, %ecx
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	movntdq	%xmm4, 0x40(%edx)
+	movntdq	%xmm5, 0x50(%edx)
+	movntdq	%xmm6, 0x60(%edx)
+	movntdq	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+	jae	L(large_page_loop)
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx
+	jl	L(large_page_less_64bytes)
+
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	lea	0x40(%eax), %eax
+
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	lea	0x40(%edx), %edx
+	sub	$0x40, %ecx
+L(large_page_less_64bytes):
+	cmp	$32, %ecx
+	jl	L(large_page_less_32bytes)
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	lea	0x20(%eax), %eax
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	lea	0x20(%edx), %edx
+	sub	$0x20, %ecx
+L(large_page_less_32bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	sfence
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+
+	ALIGN (4)
+L(bk_write_44bytes):
+	movl	40(%eax), %ecx
+	movl	%ecx, 40(%edx)
+L(bk_write_40bytes):
+	movl	36(%eax), %ecx
+	movl	%ecx, 36(%edx)
+L(bk_write_36bytes):
+	movl	32(%eax), %ecx
+	movl	%ecx, 32(%edx)
+L(bk_write_32bytes):
+	movl	28(%eax), %ecx
+	movl	%ecx, 28(%edx)
+L(bk_write_28bytes):
+	movl	24(%eax), %ecx
+	movl	%ecx, 24(%edx)
+L(bk_write_24bytes):
+	movl	20(%eax), %ecx
+	movl	%ecx, 20(%edx)
+L(bk_write_20bytes):
+	movl	16(%eax), %ecx
+	movl	%ecx, 16(%edx)
+L(bk_write_16bytes):
+	movl	12(%eax), %ecx
+	movl	%ecx, 12(%edx)
+L(bk_write_12bytes):
+	movl	8(%eax), %ecx
+	movl	%ecx, 8(%edx)
+L(bk_write_8bytes):
+	movl	4(%eax), %ecx
+	movl	%ecx, 4(%edx)
+L(bk_write_4bytes):
+	movl	(%eax), %ecx
+	movl	%ecx, (%edx)
+L(bk_write_0bytes):
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_45bytes):
+	movl	41(%eax), %ecx
+	movl	%ecx, 41(%edx)
+L(bk_write_41bytes):
+	movl	37(%eax), %ecx
+	movl	%ecx, 37(%edx)
+L(bk_write_37bytes):
+	movl	33(%eax), %ecx
+	movl	%ecx, 33(%edx)
+L(bk_write_33bytes):
+	movl	29(%eax), %ecx
+	movl	%ecx, 29(%edx)
+L(bk_write_29bytes):
+	movl	25(%eax), %ecx
+	movl	%ecx, 25(%edx)
+L(bk_write_25bytes):
+	movl	21(%eax), %ecx
+	movl	%ecx, 21(%edx)
+L(bk_write_21bytes):
+	movl	17(%eax), %ecx
+	movl	%ecx, 17(%edx)
+L(bk_write_17bytes):
+	movl	13(%eax), %ecx
+	movl	%ecx, 13(%edx)
+L(bk_write_13bytes):
+	movl	9(%eax), %ecx
+	movl	%ecx, 9(%edx)
+L(bk_write_9bytes):
+	movl	5(%eax), %ecx
+	movl	%ecx, 5(%edx)
+L(bk_write_5bytes):
+	movl	1(%eax), %ecx
+	movl	%ecx, 1(%edx)
+L(bk_write_1bytes):
+	movzbl	(%eax), %ecx
+	movb	%cl, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_46bytes):
+	movl	42(%eax), %ecx
+	movl	%ecx, 42(%edx)
+L(bk_write_42bytes):
+	movl	38(%eax), %ecx
+	movl	%ecx, 38(%edx)
+L(bk_write_38bytes):
+	movl	34(%eax), %ecx
+	movl	%ecx, 34(%edx)
+L(bk_write_34bytes):
+	movl	30(%eax), %ecx
+	movl	%ecx, 30(%edx)
+L(bk_write_30bytes):
+	movl	26(%eax), %ecx
+	movl	%ecx, 26(%edx)
+L(bk_write_26bytes):
+	movl	22(%eax), %ecx
+	movl	%ecx, 22(%edx)
+L(bk_write_22bytes):
+	movl	18(%eax), %ecx
+	movl	%ecx, 18(%edx)
+L(bk_write_18bytes):
+	movl	14(%eax), %ecx
+	movl	%ecx, 14(%edx)
+L(bk_write_14bytes):
+	movl	10(%eax), %ecx
+	movl	%ecx, 10(%edx)
+L(bk_write_10bytes):
+	movl	6(%eax), %ecx
+	movl	%ecx, 6(%edx)
+L(bk_write_6bytes):
+	movl	2(%eax), %ecx
+	movl	%ecx, 2(%edx)
+L(bk_write_2bytes):
+	movzwl	(%eax), %ecx
+	movw	%cx, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_47bytes):
+	movl	43(%eax), %ecx
+	movl	%ecx, 43(%edx)
+L(bk_write_43bytes):
+	movl	39(%eax), %ecx
+	movl	%ecx, 39(%edx)
+L(bk_write_39bytes):
+	movl	35(%eax), %ecx
+	movl	%ecx, 35(%edx)
+L(bk_write_35bytes):
+	movl	31(%eax), %ecx
+	movl	%ecx, 31(%edx)
+L(bk_write_31bytes):
+	movl	27(%eax), %ecx
+	movl	%ecx, 27(%edx)
+L(bk_write_27bytes):
+	movl	23(%eax), %ecx
+	movl	%ecx, 23(%edx)
+L(bk_write_23bytes):
+	movl	19(%eax), %ecx
+	movl	%ecx, 19(%edx)
+L(bk_write_19bytes):
+	movl	15(%eax), %ecx
+	movl	%ecx, 15(%edx)
+L(bk_write_15bytes):
+	movl	11(%eax), %ecx
+	movl	%ecx, 11(%edx)
+L(bk_write_11bytes):
+	movl	7(%eax), %ecx
+	movl	%ecx, 7(%edx)
+L(bk_write_7bytes):
+	movl	3(%eax), %ecx
+	movl	%ecx, 3(%edx)
+L(bk_write_3bytes):
+	movzwl	1(%eax), %ecx
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax
+	movb	%al, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN_END
+
+
+	.pushsection .rodata.ssse3,"a",@progbits
+	ALIGN (2)
+L(table_48bytes_fwd):
+	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
+	ALIGN (2)
+L(shl_table):
+	.int	JMPTBL (L(shl_0), L(shl_table))
+	.int	JMPTBL (L(shl_1), L(shl_table))
+	.int	JMPTBL (L(shl_2), L(shl_table))
+	.int	JMPTBL (L(shl_3), L(shl_table))
+	.int	JMPTBL (L(shl_4), L(shl_table))
+	.int	JMPTBL (L(shl_5), L(shl_table))
+	.int	JMPTBL (L(shl_6), L(shl_table))
+	.int	JMPTBL (L(shl_7), L(shl_table))
+	.int	JMPTBL (L(shl_8), L(shl_table))
+	.int	JMPTBL (L(shl_9), L(shl_table))
+	.int	JMPTBL (L(shl_10), L(shl_table))
+	.int	JMPTBL (L(shl_11), L(shl_table))
+	.int	JMPTBL (L(shl_12), L(shl_table))
+	.int	JMPTBL (L(shl_13), L(shl_table))
+	.int	JMPTBL (L(shl_14), L(shl_table))
+	.int	JMPTBL (L(shl_15), L(shl_table))
+
+	ALIGN (2)
+L(table_48_bytes_bwd):
+	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+	.popsection
+
+#ifdef USE_AS_MEMMOVE
+	ALIGN (4)
+L(copy_backward):
+	PUSH (%esi)
+	movl	%eax, %esi
+	lea	(%ecx,%edx,1),%edx
+	lea	(%ecx,%esi,1),%esi
+	testl	$0x3, %edx
+	jnz	L(bk_align)
+
+L(bk_aligned_4):
+	cmp	$64, %ecx
+	jge	L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+	cmp	$32, %ecx
+	jl	L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+	/* Copy 32 bytes at a time.  */
+	sub	$32, %ecx
+	movl	-4(%esi), %eax
+	movl	%eax, -4(%edx)
+	movl	-8(%esi), %eax
+	movl	%eax, -8(%edx)
+	movl	-12(%esi), %eax
+	movl	%eax, -12(%edx)
+	movl	-16(%esi), %eax
+	movl	%eax, -16(%edx)
+	movl	-20(%esi), %eax
+	movl	%eax, -20(%edx)
+	movl	-24(%esi), %eax
+	movl	%eax, -24(%edx)
+	movl	-28(%esi), %eax
+	movl	%eax, -28(%edx)
+	movl	-32(%esi), %eax
+	movl	%eax, -32(%edx)
+	sub	$32, %edx
+	sub	$32, %esi
+
+L(bk_write_less32bytes):
+	movl	%esi, %eax
+	sub	%ecx, %edx
+	sub	%ecx, %eax
+	POP (%esi)
+L(bk_write_less32bytes_2):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+
+	ALIGN (4)
+L(bk_align):
+	cmp	$8, %ecx
+	jle	L(bk_write_less32bytes)
+	testl	$1, %edx
+	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
+	   then (EDX & 2) must be != 0.  */
+	jz	L(bk_got2)
+	sub	$1, %esi
+	sub	$1, %ecx
+	sub	$1, %edx
+	movzbl	(%esi), %eax
+	movb	%al, (%edx)
+
+	testl	$2, %edx
+	jz	L(bk_aligned_4)
+
+L(bk_got2):
+	sub	$2, %esi
+	sub	$2, %ecx
+	sub	$2, %edx
+	movzwl	(%esi), %eax
+	movw	%ax, (%edx)
+	jmp	L(bk_aligned_4)
+
+	ALIGN (4)
+L(bk_write_more64bytes):
+	/* Check alignment of last byte.  */
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+/* EDX is aligned 4 bytes, but not 16 bytes.  */
+L(bk_ssse3_align):
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+	cmp	$64, %ecx
+	jl	L(bk_write_more32bytes)
+
+L(bk_ssse3_cpy):
+	sub	$64, %esi
+	sub	$64, %ecx
+	sub	$64, %edx
+	movdqu	0x30(%esi), %xmm3
+	movdqa	%xmm3, 0x30(%edx)
+	movdqu	0x20(%esi), %xmm2
+	movdqa	%xmm2, 0x20(%edx)
+	movdqu	0x10(%esi), %xmm1
+	movdqa	%xmm1, 0x10(%edx)
+	movdqu	(%esi), %xmm0
+	movdqa	%xmm0, (%edx)
+	cmp	$64, %ecx
+	jge	L(bk_ssse3_cpy)
+	jmp	L(bk_write_64bytesless)
+
+#endif
+
+END (MEMCPY)
+
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memcpy.S b/sysdeps/i386/i686/multiarch/memcpy.S
new file mode 100644
index 0000000000..bf1c7cc2d2
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcpy.S
@@ -0,0 +1,90 @@
+/* Multiple versions of memcpy
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  In static binaries we need memcpy before the initialization
+   happened.  */
+#if defined SHARED && !defined NOT_IN_libc
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(memcpy)
+	.type	memcpy, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__memcpy_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memcpy_ssse3@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memcpy_ssse3_rep@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(memcpy)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memcpy_ia32, @function; \
+	.p2align 4; \
+	__memcpy_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memcpy_ia32, .-__memcpy_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+	.type __memcpy_chk_ia32, @function; \
+	.globl __memcpy_chk_ia32; \
+	.p2align 4; \
+	__memcpy_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __memcpy_chk_ia32, .-__memcpy_chk_ia32
+
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_memcpy; __GI_memcpy = __memcpy_ia32
+#endif
+
+#include "../memcpy.S"
diff --git a/sysdeps/i386/i686/multiarch/memcpy_chk.S b/sysdeps/i386/i686/multiarch/memcpy_chk.S
new file mode 100644
index 0000000000..171ac8adef
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcpy_chk.S
@@ -0,0 +1,64 @@
+/* Multiple versions of __memcpy_chk
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  There are no multiarch memcpy functions for static binaries.
+ */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(__memcpy_chk)
+	.type	__memcpy_chk, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__memcpy_chk_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memcpy_chk_ssse3@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memcpy_chk_ssse3_rep@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(__memcpy_chk)
+# else
+#  include "../memcpy_chk.S"
+# endif
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S b/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
new file mode 100644
index 0000000000..d202fc4a13
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY		__memmove_ssse3_rep
+#define MEMCPY_CHK	__memmove_chk_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/sysdeps/i386/i686/multiarch/memmove-ssse3.S b/sysdeps/i386/i686/multiarch/memmove-ssse3.S
new file mode 100644
index 0000000000..295430b1ef
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memmove-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY		__memmove_ssse3
+#define MEMCPY_CHK	__memmove_chk_ssse3
+#include "memcpy-ssse3.S"
diff --git a/sysdeps/i386/i686/multiarch/memmove.S b/sysdeps/i386/i686/multiarch/memmove.S
new file mode 100644
index 0000000000..d3d8d3675c
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memmove.S
@@ -0,0 +1,114 @@
+/* Multiple versions of memmove
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(memmove)
+	.type	memmove, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__memmove_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memmove_ssse3@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memmove_ssse3_rep@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(memmove)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memmove_ia32, @function; \
+	.p2align 4; \
+	__memmove_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# else
+	.text
+ENTRY(memmove)
+	.type	memmove, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__memmove_ia32, %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
+	jz	2f
+	leal	__memmove_ssse3, %eax
+2:	ret
+END(memmove)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memmove_ia32, @function; \
+	.globl __memmove_ia32; \
+	.p2align 4; \
+	__memmove_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# endif
+
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memmove_ia32, .-__memmove_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+	.type __memmove_chk_ia32, @function; \
+	.globl __memmove_chk_ia32; \
+	.p2align 4; \
+	__memmove_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __memmove_chk_ia32, .-__memmove_chk_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memmove; __GI_memmove = __memmove_ia32
+# endif
+#endif
+
+#include "../memmove.S"
diff --git a/sysdeps/i386/i686/multiarch/memmove_chk.S b/sysdeps/i386/i686/multiarch/memmove_chk.S
new file mode 100644
index 0000000000..e33f2a31b3
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memmove_chk.S
@@ -0,0 +1,112 @@
+/* Multiple versions of __memmove_chk
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(__memmove_chk)
+	.type	__memmove_chk, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__memmove_chk_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memmove_chk_ssse3@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memmove_chk_ssse3_rep@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(__memmove_chk)
+# else
+	.text
+ENTRY(__memmove_chk)
+	.type	__memmove_chk, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__memmove_chk_ia32, %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
+	jz	2f
+	leal	__memmove_chk_ssse3, %eax
+	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
+	jz	2f
+	leal	__memmove_chk_ssse3_rep, %eax
+2:	ret
+END(__memmove_chk)
+
+	.type __memmove_chk_ssse3, @function
+	.p2align 4;
+__memmove_chk_ssse3:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memmove_ssse3
+	cfi_endproc
+	.size __memmove_chk_ssse3, .-__memmove_chk_ssse3
+
+	.type __memmove_chk_ssse3_rep, @function
+	.p2align 4;
+__memmove_chk_ssse3_rep:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memmove_ssse3_rep
+	cfi_endproc
+	.size __memmove_chk_ssse3_rep, .-__memmove_chk_ssse3_rep
+
+	.type __memmove_chk_ia32, @function
+	.p2align 4;
+__memmove_chk_ia32:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memmove_ia32
+	cfi_endproc
+	.size __memmove_chk_ia32, .-__memmove_chk_ia32
+# endif
+#endif
diff --git a/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
new file mode 100644
index 0000000000..5357b33e18
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY		__mempcpy_ssse3_rep
+#define MEMCPY_CHK	__mempcpy_chk_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S b/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
new file mode 100644
index 0000000000..822d98e954
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY		__mempcpy_ssse3
+#define MEMCPY_CHK	__mempcpy_chk_ssse3
+#include "memcpy-ssse3.S"
diff --git a/sysdeps/i386/i686/multiarch/mempcpy.S b/sysdeps/i386/i686/multiarch/mempcpy.S
new file mode 100644
index 0000000000..df830d2e63
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/mempcpy.S
@@ -0,0 +1,93 @@
+/* Multiple versions of mempcpy
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  In static binaries we need mempcpy before the initialization
+   happened.  */
+#if defined SHARED && !defined NOT_IN_libc
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(__mempcpy)
+	.type	__mempcpy, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__mempcpy_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__mempcpy_ssse3@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__mempcpy_ssse3_rep@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(__mempcpy)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __mempcpy_ia32, @function; \
+	.p2align 4; \
+	__mempcpy_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __mempcpy_ia32, .-__mempcpy_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+	.type __mempcpy_chk_ia32, @function; \
+	.globl __mempcpy_chk_ia32; \
+	.p2align 4; \
+	__mempcpy_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __mempcpy_chk_ia32, .-__mempcpy_chk_ia32
+
+# undef libc_hidden_def
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_def(name) \
+	.globl __GI_mempcpy; __GI_mempcpy = __mempcpy_ia32
+# define libc_hidden_builtin_def(name) \
+	.globl __GI___mempcpy; __GI___mempcpy = __mempcpy_ia32
+#endif
+
+#include "../mempcpy.S"
diff --git a/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
new file mode 100644
index 0000000000..828fb5e608
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
@@ -0,0 +1,64 @@
+/* Multiple versions of __mempcpy_chk
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  There are no multiarch mempcpy functions for static binaries.
+ */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(__mempcpy_chk)
+	.type	__mempcpy_chk, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__mempcpy_chk_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__mempcpy_chk_ssse3@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__mempcpy_chk_ssse3_rep@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(__mempcpy_chk)
+# else
+#  include "../mempcpy_chk.S"
+# endif
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
new file mode 100644
index 0000000000..84afffeb66
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
@@ -0,0 +1,821 @@
+/* memset with SSE2 and REP string.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_BZERO
+# define DEST		PARMS
+# define LEN		DEST+4
+# define SETRTNVAL
+#else
+# define DEST		PARMS
+# define CHR		DEST+4
+# define LEN		CHR+4
+# define SETRTNVAL	movl DEST(%esp), %eax
+#endif
+
+#ifdef SHARED
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)
+# define PARMS		8		/* Preserve EBX.  */
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.   */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    /* We first load PC into EBX.  */				\
+    call	__i686.get_pc_thunk.bx;				\
+    /* Get the address of the jump table.  */			\
+    add		$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    add		(%ebx,%ecx,4), %ebx;				\
+    add		%ecx, %edx;					\
+    /* We loaded the jump table and adjuested EDX. Go.  */	\
+    jmp		*%ebx
+
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	ALIGN (4)
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+#else
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define PARMS		4
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    add		%ecx, %edx;					\
+    jmp		*TABLE(,%ecx,4)
+#endif
+
+	.section .text.sse2,"ax",@progbits
+#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BZERO
+ENTRY (__memset_chk_sse2_rep)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_sse2_rep)
+#endif
+ENTRY (__memset_sse2_rep)
+	ENTRANCE
+
+	movl	LEN(%esp), %ecx
+#ifdef USE_AS_BZERO
+	xor	%eax, %eax
+#else
+	movzbl	CHR(%esp), %eax
+	movb	%al, %ah
+	/* Fill the whole EAX with pattern.  */
+	movl	%eax, %edx
+	shl	$16, %eax
+	or	%edx, %eax
+#endif
+	movl	DEST(%esp), %edx
+	cmp	$32, %ecx
+	jae	L(32bytesormore)
+
+L(write_less32bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
+
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_less_32bytes):
+	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
+	.popsection
+
+	ALIGN (4)
+L(write_28bytes):
+	movl	%eax, -28(%edx)
+L(write_24bytes):
+	movl	%eax, -24(%edx)
+L(write_20bytes):
+	movl	%eax, -20(%edx)
+L(write_16bytes):
+	movl	%eax, -16(%edx)
+L(write_12bytes):
+	movl	%eax, -12(%edx)
+L(write_8bytes):
+	movl	%eax, -8(%edx)
+L(write_4bytes):
+	movl	%eax, -4(%edx)
+L(write_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_29bytes):
+	movl	%eax, -29(%edx)
+L(write_25bytes):
+	movl	%eax, -25(%edx)
+L(write_21bytes):
+	movl	%eax, -21(%edx)
+L(write_17bytes):
+	movl	%eax, -17(%edx)
+L(write_13bytes):
+	movl	%eax, -13(%edx)
+L(write_9bytes):
+	movl	%eax, -9(%edx)
+L(write_5bytes):
+	movl	%eax, -5(%edx)
+L(write_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_30bytes):
+	movl	%eax, -30(%edx)
+L(write_26bytes):
+	movl	%eax, -26(%edx)
+L(write_22bytes):
+	movl	%eax, -22(%edx)
+L(write_18bytes):
+	movl	%eax, -18(%edx)
+L(write_14bytes):
+	movl	%eax, -14(%edx)
+L(write_10bytes):
+	movl	%eax, -10(%edx)
+L(write_6bytes):
+	movl	%eax, -6(%edx)
+L(write_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_31bytes):
+	movl	%eax, -31(%edx)
+L(write_27bytes):
+	movl	%eax, -27(%edx)
+L(write_23bytes):
+	movl	%eax, -23(%edx)
+L(write_19bytes):
+	movl	%eax, -19(%edx)
+L(write_15bytes):
+	movl	%eax, -15(%edx)
+L(write_11bytes):
+	movl	%eax, -11(%edx)
+L(write_7bytes):
+	movl	%eax, -7(%edx)
+L(write_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+/* ECX > 32 and EDX is 4 byte aligned.  */
+L(32bytesormore):
+	/* Fill xmm0 with the pattern.  */
+#ifdef USE_AS_BZERO
+	pxor	%xmm0, %xmm0
+#else
+	movd	%eax, %xmm0
+	punpcklbw %xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+#endif
+	testl	$0xf, %edx
+	jz	L(aligned_16)
+/* ECX > 32 and EDX is not 16 byte aligned.  */
+L(not_aligned_16):
+	movdqu	%xmm0, (%edx)
+	movl	%edx, %eax
+	and	$-16, %edx
+	add	$16, %edx
+	sub	%edx, %eax
+	add	%eax, %ecx
+	movd	%xmm0, %eax
+
+	ALIGN (4)
+L(aligned_16):
+	cmp	$128, %ecx
+	jge	L(128bytesormore)
+
+L(aligned_16_less128bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	ALIGN (4)
+L(128bytesormore):
+	PUSH (%edi)
+#ifdef DATA_CACHE_SIZE
+	PUSH (%ebx)
+	mov	$DATA_CACHE_SIZE, %ebx
+#else
+# ifdef SHARED
+	call	__i686.get_pc_thunk.bx
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_data_cache_size@GOTOFF(%ebx), %ebx
+# else
+	PUSH (%ebx)
+	mov	__x86_data_cache_size, %ebx
+# endif
+#endif
+	mov	%ebx, %edi
+	shr	$4, %ebx
+	sub	%ebx, %edi
+#if defined DATA_CACHE_SIZE || !defined SHARED
+	POP (%ebx)
+#endif
+/*
+ * When data size approximate the end of L1 cache,
+ * fast string will prefetch and combine data efficiently.
+ */
+	cmp	%edi, %ecx
+	jae	L(128bytesormore_nt)
+	subl	$128, %ecx
+L(128bytesormore_normal):
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx
+	jl	L(128bytesless_normal)
+
+
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx
+	jge	L(128bytesormore_normal)
+
+L(128bytesless_normal):
+	POP (%edi)
+	lea	128(%ecx), %ecx
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	ALIGN (4)
+L(128bytesormore_nt):
+	mov	%edx, %edi
+	mov	%ecx, %edx
+	shr	$2, %ecx
+	and	$3, %edx
+	rep stosl
+	jz	L(copy_page_by_rep_exit)
+	cmp	$2, %edx
+	jb	L(copy_page_by_rep_left_1)
+	movw	%ax, (%edi)
+	add	$2, %edi
+	sub	$2, %edx
+	jz	L(copy_page_by_rep_exit)
+L(copy_page_by_rep_left_1):
+	movb	%al, (%edi)
+L(copy_page_by_rep_exit):
+	POP (%edi)
+	SETRTNVAL
+	RETURN
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_16_128bytes):
+	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
+	.popsection
+
+	ALIGN (4)
+L(aligned_16_112bytes):
+	movdqa	%xmm0, -112(%edx)
+L(aligned_16_96bytes):
+	movdqa	%xmm0, -96(%edx)
+L(aligned_16_80bytes):
+	movdqa	%xmm0, -80(%edx)
+L(aligned_16_64bytes):
+	movdqa	%xmm0, -64(%edx)
+L(aligned_16_48bytes):
+	movdqa	%xmm0, -48(%edx)
+L(aligned_16_32bytes):
+	movdqa	%xmm0, -32(%edx)
+L(aligned_16_16bytes):
+	movdqa	%xmm0, -16(%edx)
+L(aligned_16_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_113bytes):
+	movdqa	%xmm0, -113(%edx)
+L(aligned_16_97bytes):
+	movdqa	%xmm0, -97(%edx)
+L(aligned_16_81bytes):
+	movdqa	%xmm0, -81(%edx)
+L(aligned_16_65bytes):
+	movdqa	%xmm0, -65(%edx)
+L(aligned_16_49bytes):
+	movdqa	%xmm0, -49(%edx)
+L(aligned_16_33bytes):
+	movdqa	%xmm0, -33(%edx)
+L(aligned_16_17bytes):
+	movdqa	%xmm0, -17(%edx)
+L(aligned_16_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_114bytes):
+	movdqa	%xmm0, -114(%edx)
+L(aligned_16_98bytes):
+	movdqa	%xmm0, -98(%edx)
+L(aligned_16_82bytes):
+	movdqa	%xmm0, -82(%edx)
+L(aligned_16_66bytes):
+	movdqa	%xmm0, -66(%edx)
+L(aligned_16_50bytes):
+	movdqa	%xmm0, -50(%edx)
+L(aligned_16_34bytes):
+	movdqa	%xmm0, -34(%edx)
+L(aligned_16_18bytes):
+	movdqa	%xmm0, -18(%edx)
+L(aligned_16_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_115bytes):
+	movdqa	%xmm0, -115(%edx)
+L(aligned_16_99bytes):
+	movdqa	%xmm0, -99(%edx)
+L(aligned_16_83bytes):
+	movdqa	%xmm0, -83(%edx)
+L(aligned_16_67bytes):
+	movdqa	%xmm0, -67(%edx)
+L(aligned_16_51bytes):
+	movdqa	%xmm0, -51(%edx)
+L(aligned_16_35bytes):
+	movdqa	%xmm0, -35(%edx)
+L(aligned_16_19bytes):
+	movdqa	%xmm0, -19(%edx)
+L(aligned_16_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_116bytes):
+	movdqa	%xmm0, -116(%edx)
+L(aligned_16_100bytes):
+	movdqa	%xmm0, -100(%edx)
+L(aligned_16_84bytes):
+	movdqa	%xmm0, -84(%edx)
+L(aligned_16_68bytes):
+	movdqa	%xmm0, -68(%edx)
+L(aligned_16_52bytes):
+	movdqa	%xmm0, -52(%edx)
+L(aligned_16_36bytes):
+	movdqa	%xmm0, -36(%edx)
+L(aligned_16_20bytes):
+	movdqa	%xmm0, -20(%edx)
+L(aligned_16_4bytes):
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_117bytes):
+	movdqa	%xmm0, -117(%edx)
+L(aligned_16_101bytes):
+	movdqa	%xmm0, -101(%edx)
+L(aligned_16_85bytes):
+	movdqa	%xmm0, -85(%edx)
+L(aligned_16_69bytes):
+	movdqa	%xmm0, -69(%edx)
+L(aligned_16_53bytes):
+	movdqa	%xmm0, -53(%edx)
+L(aligned_16_37bytes):
+	movdqa	%xmm0, -37(%edx)
+L(aligned_16_21bytes):
+	movdqa	%xmm0, -21(%edx)
+L(aligned_16_5bytes):
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_118bytes):
+	movdqa	%xmm0, -118(%edx)
+L(aligned_16_102bytes):
+	movdqa	%xmm0, -102(%edx)
+L(aligned_16_86bytes):
+	movdqa	%xmm0, -86(%edx)
+L(aligned_16_70bytes):
+	movdqa	%xmm0, -70(%edx)
+L(aligned_16_54bytes):
+	movdqa	%xmm0, -54(%edx)
+L(aligned_16_38bytes):
+	movdqa	%xmm0, -38(%edx)
+L(aligned_16_22bytes):
+	movdqa	%xmm0, -22(%edx)
+L(aligned_16_6bytes):
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_119bytes):
+	movdqa	%xmm0, -119(%edx)
+L(aligned_16_103bytes):
+	movdqa	%xmm0, -103(%edx)
+L(aligned_16_87bytes):
+	movdqa	%xmm0, -87(%edx)
+L(aligned_16_71bytes):
+	movdqa	%xmm0, -71(%edx)
+L(aligned_16_55bytes):
+	movdqa	%xmm0, -55(%edx)
+L(aligned_16_39bytes):
+	movdqa	%xmm0, -39(%edx)
+L(aligned_16_23bytes):
+	movdqa	%xmm0, -23(%edx)
+L(aligned_16_7bytes):
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_120bytes):
+	movdqa	%xmm0, -120(%edx)
+L(aligned_16_104bytes):
+	movdqa	%xmm0, -104(%edx)
+L(aligned_16_88bytes):
+	movdqa	%xmm0, -88(%edx)
+L(aligned_16_72bytes):
+	movdqa	%xmm0, -72(%edx)
+L(aligned_16_56bytes):
+	movdqa	%xmm0, -56(%edx)
+L(aligned_16_40bytes):
+	movdqa	%xmm0, -40(%edx)
+L(aligned_16_24bytes):
+	movdqa	%xmm0, -24(%edx)
+L(aligned_16_8bytes):
+	movq	%xmm0, -8(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_121bytes):
+	movdqa	%xmm0, -121(%edx)
+L(aligned_16_105bytes):
+	movdqa	%xmm0, -105(%edx)
+L(aligned_16_89bytes):
+	movdqa	%xmm0, -89(%edx)
+L(aligned_16_73bytes):
+	movdqa	%xmm0, -73(%edx)
+L(aligned_16_57bytes):
+	movdqa	%xmm0, -57(%edx)
+L(aligned_16_41bytes):
+	movdqa	%xmm0, -41(%edx)
+L(aligned_16_25bytes):
+	movdqa	%xmm0, -25(%edx)
+L(aligned_16_9bytes):
+	movq	%xmm0, -9(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_122bytes):
+	movdqa	%xmm0, -122(%edx)
+L(aligned_16_106bytes):
+	movdqa	%xmm0, -106(%edx)
+L(aligned_16_90bytes):
+	movdqa	%xmm0, -90(%edx)
+L(aligned_16_74bytes):
+	movdqa	%xmm0, -74(%edx)
+L(aligned_16_58bytes):
+	movdqa	%xmm0, -58(%edx)
+L(aligned_16_42bytes):
+	movdqa	%xmm0, -42(%edx)
+L(aligned_16_26bytes):
+	movdqa	%xmm0, -26(%edx)
+L(aligned_16_10bytes):
+	movq	%xmm0, -10(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_123bytes):
+	movdqa	%xmm0, -123(%edx)
+L(aligned_16_107bytes):
+	movdqa	%xmm0, -107(%edx)
+L(aligned_16_91bytes):
+	movdqa	%xmm0, -91(%edx)
+L(aligned_16_75bytes):
+	movdqa	%xmm0, -75(%edx)
+L(aligned_16_59bytes):
+	movdqa	%xmm0, -59(%edx)
+L(aligned_16_43bytes):
+	movdqa	%xmm0, -43(%edx)
+L(aligned_16_27bytes):
+	movdqa	%xmm0, -27(%edx)
+L(aligned_16_11bytes):
+	movq	%xmm0, -11(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_124bytes):
+	movdqa	%xmm0, -124(%edx)
+L(aligned_16_108bytes):
+	movdqa	%xmm0, -108(%edx)
+L(aligned_16_92bytes):
+	movdqa	%xmm0, -92(%edx)
+L(aligned_16_76bytes):
+	movdqa	%xmm0, -76(%edx)
+L(aligned_16_60bytes):
+	movdqa	%xmm0, -60(%edx)
+L(aligned_16_44bytes):
+	movdqa	%xmm0, -44(%edx)
+L(aligned_16_28bytes):
+	movdqa	%xmm0, -28(%edx)
+L(aligned_16_12bytes):
+	movq	%xmm0, -12(%edx)
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_125bytes):
+	movdqa	%xmm0, -125(%edx)
+L(aligned_16_109bytes):
+	movdqa	%xmm0, -109(%edx)
+L(aligned_16_93bytes):
+	movdqa	%xmm0, -93(%edx)
+L(aligned_16_77bytes):
+	movdqa	%xmm0, -77(%edx)
+L(aligned_16_61bytes):
+	movdqa	%xmm0, -61(%edx)
+L(aligned_16_45bytes):
+	movdqa	%xmm0, -45(%edx)
+L(aligned_16_29bytes):
+	movdqa	%xmm0, -29(%edx)
+L(aligned_16_13bytes):
+	movq	%xmm0, -13(%edx)
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_126bytes):
+	movdqa	%xmm0, -126(%edx)
+L(aligned_16_110bytes):
+	movdqa	%xmm0, -110(%edx)
+L(aligned_16_94bytes):
+	movdqa	%xmm0, -94(%edx)
+L(aligned_16_78bytes):
+	movdqa	%xmm0, -78(%edx)
+L(aligned_16_62bytes):
+	movdqa	%xmm0, -62(%edx)
+L(aligned_16_46bytes):
+	movdqa	%xmm0, -46(%edx)
+L(aligned_16_30bytes):
+	movdqa	%xmm0, -30(%edx)
+L(aligned_16_14bytes):
+	movq	%xmm0, -14(%edx)
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_127bytes):
+	movdqa	%xmm0, -127(%edx)
+L(aligned_16_111bytes):
+	movdqa	%xmm0, -111(%edx)
+L(aligned_16_95bytes):
+	movdqa	%xmm0, -95(%edx)
+L(aligned_16_79bytes):
+	movdqa	%xmm0, -79(%edx)
+L(aligned_16_63bytes):
+	movdqa	%xmm0, -63(%edx)
+L(aligned_16_47bytes):
+	movdqa	%xmm0, -47(%edx)
+L(aligned_16_31bytes):
+	movdqa	%xmm0, -31(%edx)
+L(aligned_16_15bytes):
+	movq	%xmm0, -15(%edx)
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN_END
+
+END (__memset_sse2_rep)
+
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memset-sse2.S b/sysdeps/i386/i686/multiarch/memset-sse2.S
new file mode 100644
index 0000000000..b2b979193e
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memset-sse2.S
@@ -0,0 +1,867 @@
+/* memset with SSE2
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_BZERO
+# define DEST		PARMS
+# define LEN		DEST+4
+# define SETRTNVAL
+#else
+# define DEST		PARMS
+# define CHR		DEST+4
+# define LEN		CHR+4
+# define SETRTNVAL	movl DEST(%esp), %eax
+#endif
+
+#ifdef SHARED
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)
+# define PARMS		8		/* Preserve EBX.  */
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.   */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    /* We first load PC into EBX.  */				\
+    call	__i686.get_pc_thunk.bx;				\
+    /* Get the address of the jump table.  */			\
+    add		$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    add		(%ebx,%ecx,4), %ebx;				\
+    add		%ecx, %edx;					\
+    /* We loaded the jump table and adjuested EDX. Go.  */	\
+    jmp		*%ebx
+
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	ALIGN (4)
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+#else
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define PARMS		4
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    add		%ecx, %edx;					\
+    jmp		*TABLE(,%ecx,4)
+#endif
+
+	.section .text.sse2,"ax",@progbits
+#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BZERO
+ENTRY (__memset_chk_sse2)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_sse2)
+#endif
+ENTRY (__memset_sse2)
+	ENTRANCE
+
+	movl	LEN(%esp), %ecx
+#ifdef USE_AS_BZERO
+	xor	%eax, %eax
+#else
+	movzbl	CHR(%esp), %eax
+	movb	%al, %ah
+	/* Fill the whole EAX with pattern.  */
+	movl	%eax, %edx
+	shl	$16, %eax
+	or	%edx, %eax
+#endif
+	movl	DEST(%esp), %edx
+	cmp	$32, %ecx
+	jae	L(32bytesormore)
+
+L(write_less32bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
+
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_less_32bytes):
+	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
+	.popsection
+
+	ALIGN (4)
+L(write_28bytes):
+	movl	%eax, -28(%edx)
+L(write_24bytes):
+	movl	%eax, -24(%edx)
+L(write_20bytes):
+	movl	%eax, -20(%edx)
+L(write_16bytes):
+	movl	%eax, -16(%edx)
+L(write_12bytes):
+	movl	%eax, -12(%edx)
+L(write_8bytes):
+	movl	%eax, -8(%edx)
+L(write_4bytes):
+	movl	%eax, -4(%edx)
+L(write_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_29bytes):
+	movl	%eax, -29(%edx)
+L(write_25bytes):
+	movl	%eax, -25(%edx)
+L(write_21bytes):
+	movl	%eax, -21(%edx)
+L(write_17bytes):
+	movl	%eax, -17(%edx)
+L(write_13bytes):
+	movl	%eax, -13(%edx)
+L(write_9bytes):
+	movl	%eax, -9(%edx)
+L(write_5bytes):
+	movl	%eax, -5(%edx)
+L(write_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_30bytes):
+	movl	%eax, -30(%edx)
+L(write_26bytes):
+	movl	%eax, -26(%edx)
+L(write_22bytes):
+	movl	%eax, -22(%edx)
+L(write_18bytes):
+	movl	%eax, -18(%edx)
+L(write_14bytes):
+	movl	%eax, -14(%edx)
+L(write_10bytes):
+	movl	%eax, -10(%edx)
+L(write_6bytes):
+	movl	%eax, -6(%edx)
+L(write_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_31bytes):
+	movl	%eax, -31(%edx)
+L(write_27bytes):
+	movl	%eax, -27(%edx)
+L(write_23bytes):
+	movl	%eax, -23(%edx)
+L(write_19bytes):
+	movl	%eax, -19(%edx)
+L(write_15bytes):
+	movl	%eax, -15(%edx)
+L(write_11bytes):
+	movl	%eax, -11(%edx)
+L(write_7bytes):
+	movl	%eax, -7(%edx)
+L(write_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+/* ECX > 32 and EDX is 4 byte aligned.  */
+L(32bytesormore):
+	/* Fill xmm0 with the pattern.  */
+#ifdef USE_AS_BZERO
+	pxor	%xmm0, %xmm0
+#else
+	movd	%eax, %xmm0
+	punpcklbw %xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+#endif
+	testl	$0xf, %edx
+	jz	L(aligned_16)
+/* ECX > 32 and EDX is not 16 byte aligned.  */
+L(not_aligned_16):
+	movdqu	%xmm0, (%edx)
+	movl	%edx, %eax
+	and	$-16, %edx
+	add	$16, %edx
+	sub	%edx, %eax
+	add	%eax, %ecx
+	movd	%xmm0, %eax
+
+	ALIGN (4)
+L(aligned_16):
+	cmp	$128, %ecx
+	jge	L(128bytesormore)
+
+L(aligned_16_less128bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	ALIGN (4)
+L(128bytesormore):
+#ifdef SHARED_CACHE_SIZE
+	PUSH (%ebx)
+	mov	$SHARED_CACHE_SIZE, %ebx
+#else
+# ifdef SHARED
+	call	__i686.get_pc_thunk.bx
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
+# else
+	PUSH (%ebx)
+	mov	__x86_shared_cache_size, %ebx
+# endif
+#endif
+	cmp	%ebx, %ecx
+	jae	L(128bytesormore_nt_start)
+
+
+#ifdef DATA_CACHE_SIZE
+	POP (%ebx)
+	cmp	$DATA_CACHE_SIZE, %ecx
+#else
+# ifdef SHARED
+	call	__i686.get_pc_thunk.bx
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
+# else
+	POP (%ebx)
+	cmp	__x86_data_cache_size, %ecx
+# endif
+#endif
+
+	jae	L(128bytes_L2_normal)
+	subl	$128, %ecx
+L(128bytesormore_normal):
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx
+	jl	L(128bytesless_normal)
+
+
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx
+	jge	L(128bytesormore_normal)
+
+L(128bytesless_normal):
+	lea	128(%ecx), %ecx
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	ALIGN (4)
+L(128bytes_L2_normal):
+	prefetcht0	0x380(%edx)
+	prefetcht0	0x3c0(%edx)
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movaps	%xmm0, 0x10(%edx)
+	movaps	%xmm0, 0x20(%edx)
+	movaps	%xmm0, 0x30(%edx)
+	movaps	%xmm0, 0x40(%edx)
+	movaps	%xmm0, 0x50(%edx)
+	movaps	%xmm0, 0x60(%edx)
+	movaps	%xmm0, 0x70(%edx)
+	add	$128, %edx
+	cmp	$128, %ecx
+	jge	L(128bytes_L2_normal)
+
+L(128bytesless_L2_normal):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+L(128bytesormore_nt_start):
+	sub	%ebx, %ecx
+	ALIGN (4)
+L(128bytesormore_shared_cache_loop):
+	prefetcht0	0x3c0(%edx)
+	prefetcht0	0x380(%edx)
+	sub	$0x80, %ebx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	add	$0x80, %edx
+	cmp	$0x80, %ebx
+	jge	L(128bytesormore_shared_cache_loop)
+	cmp	$0x80, %ecx
+	jb	L(shared_cache_loop_end)
+	ALIGN (4)
+L(128bytesormore_nt):
+	sub	$0x80, %ecx
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm0, 0x10(%edx)
+	movntdq	%xmm0, 0x20(%edx)
+	movntdq	%xmm0, 0x30(%edx)
+	movntdq	%xmm0, 0x40(%edx)
+	movntdq	%xmm0, 0x50(%edx)
+	movntdq	%xmm0, 0x60(%edx)
+	movntdq	%xmm0, 0x70(%edx)
+	add	$0x80, %edx
+	cmp	$0x80, %ecx
+	jge	L(128bytesormore_nt)
+	sfence
+L(shared_cache_loop_end):
+#if defined DATA_CACHE_SIZE || !defined SHARED
+	POP (%ebx)
+#endif
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_16_128bytes):
+	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
+	.popsection
+
+	ALIGN (4)
+L(aligned_16_112bytes):
+	movdqa	%xmm0, -112(%edx)
+L(aligned_16_96bytes):
+	movdqa	%xmm0, -96(%edx)
+L(aligned_16_80bytes):
+	movdqa	%xmm0, -80(%edx)
+L(aligned_16_64bytes):
+	movdqa	%xmm0, -64(%edx)
+L(aligned_16_48bytes):
+	movdqa	%xmm0, -48(%edx)
+L(aligned_16_32bytes):
+	movdqa	%xmm0, -32(%edx)
+L(aligned_16_16bytes):
+	movdqa	%xmm0, -16(%edx)
+L(aligned_16_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_113bytes):
+	movdqa	%xmm0, -113(%edx)
+L(aligned_16_97bytes):
+	movdqa	%xmm0, -97(%edx)
+L(aligned_16_81bytes):
+	movdqa	%xmm0, -81(%edx)
+L(aligned_16_65bytes):
+	movdqa	%xmm0, -65(%edx)
+L(aligned_16_49bytes):
+	movdqa	%xmm0, -49(%edx)
+L(aligned_16_33bytes):
+	movdqa	%xmm0, -33(%edx)
+L(aligned_16_17bytes):
+	movdqa	%xmm0, -17(%edx)
+L(aligned_16_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_114bytes):
+	movdqa	%xmm0, -114(%edx)
+L(aligned_16_98bytes):
+	movdqa	%xmm0, -98(%edx)
+L(aligned_16_82bytes):
+	movdqa	%xmm0, -82(%edx)
+L(aligned_16_66bytes):
+	movdqa	%xmm0, -66(%edx)
+L(aligned_16_50bytes):
+	movdqa	%xmm0, -50(%edx)
+L(aligned_16_34bytes):
+	movdqa	%xmm0, -34(%edx)
+L(aligned_16_18bytes):
+	movdqa	%xmm0, -18(%edx)
+L(aligned_16_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_115bytes):
+	movdqa	%xmm0, -115(%edx)
+L(aligned_16_99bytes):
+	movdqa	%xmm0, -99(%edx)
+L(aligned_16_83bytes):
+	movdqa	%xmm0, -83(%edx)
+L(aligned_16_67bytes):
+	movdqa	%xmm0, -67(%edx)
+L(aligned_16_51bytes):
+	movdqa	%xmm0, -51(%edx)
+L(aligned_16_35bytes):
+	movdqa	%xmm0, -35(%edx)
+L(aligned_16_19bytes):
+	movdqa	%xmm0, -19(%edx)
+L(aligned_16_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_116bytes):
+	movdqa	%xmm0, -116(%edx)
+L(aligned_16_100bytes):
+	movdqa	%xmm0, -100(%edx)
+L(aligned_16_84bytes):
+	movdqa	%xmm0, -84(%edx)
+L(aligned_16_68bytes):
+	movdqa	%xmm0, -68(%edx)
+L(aligned_16_52bytes):
+	movdqa	%xmm0, -52(%edx)
+L(aligned_16_36bytes):
+	movdqa	%xmm0, -36(%edx)
+L(aligned_16_20bytes):
+	movdqa	%xmm0, -20(%edx)
+L(aligned_16_4bytes):
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_117bytes):
+	movdqa	%xmm0, -117(%edx)
+L(aligned_16_101bytes):
+	movdqa	%xmm0, -101(%edx)
+L(aligned_16_85bytes):
+	movdqa	%xmm0, -85(%edx)
+L(aligned_16_69bytes):
+	movdqa	%xmm0, -69(%edx)
+L(aligned_16_53bytes):
+	movdqa	%xmm0, -53(%edx)
+L(aligned_16_37bytes):
+	movdqa	%xmm0, -37(%edx)
+L(aligned_16_21bytes):
+	movdqa	%xmm0, -21(%edx)
+L(aligned_16_5bytes):
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_118bytes):
+	movdqa	%xmm0, -118(%edx)
+L(aligned_16_102bytes):
+	movdqa	%xmm0, -102(%edx)
+L(aligned_16_86bytes):
+	movdqa	%xmm0, -86(%edx)
+L(aligned_16_70bytes):
+	movdqa	%xmm0, -70(%edx)
+L(aligned_16_54bytes):
+	movdqa	%xmm0, -54(%edx)
+L(aligned_16_38bytes):
+	movdqa	%xmm0, -38(%edx)
+L(aligned_16_22bytes):
+	movdqa	%xmm0, -22(%edx)
+L(aligned_16_6bytes):
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_119bytes):
+	movdqa	%xmm0, -119(%edx)
+L(aligned_16_103bytes):
+	movdqa	%xmm0, -103(%edx)
+L(aligned_16_87bytes):
+	movdqa	%xmm0, -87(%edx)
+L(aligned_16_71bytes):
+	movdqa	%xmm0, -71(%edx)
+L(aligned_16_55bytes):
+	movdqa	%xmm0, -55(%edx)
+L(aligned_16_39bytes):
+	movdqa	%xmm0, -39(%edx)
+L(aligned_16_23bytes):
+	movdqa	%xmm0, -23(%edx)
+L(aligned_16_7bytes):
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_120bytes):
+	movdqa	%xmm0, -120(%edx)
+L(aligned_16_104bytes):
+	movdqa	%xmm0, -104(%edx)
+L(aligned_16_88bytes):
+	movdqa	%xmm0, -88(%edx)
+L(aligned_16_72bytes):
+	movdqa	%xmm0, -72(%edx)
+L(aligned_16_56bytes):
+	movdqa	%xmm0, -56(%edx)
+L(aligned_16_40bytes):
+	movdqa	%xmm0, -40(%edx)
+L(aligned_16_24bytes):
+	movdqa	%xmm0, -24(%edx)
+L(aligned_16_8bytes):
+	movq	%xmm0, -8(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_121bytes):
+	movdqa	%xmm0, -121(%edx)
+L(aligned_16_105bytes):
+	movdqa	%xmm0, -105(%edx)
+L(aligned_16_89bytes):
+	movdqa	%xmm0, -89(%edx)
+L(aligned_16_73bytes):
+	movdqa	%xmm0, -73(%edx)
+L(aligned_16_57bytes):
+	movdqa	%xmm0, -57(%edx)
+L(aligned_16_41bytes):
+	movdqa	%xmm0, -41(%edx)
+L(aligned_16_25bytes):
+	movdqa	%xmm0, -25(%edx)
+L(aligned_16_9bytes):
+	movq	%xmm0, -9(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_122bytes):
+	movdqa	%xmm0, -122(%edx)
+L(aligned_16_106bytes):
+	movdqa	%xmm0, -106(%edx)
+L(aligned_16_90bytes):
+	movdqa	%xmm0, -90(%edx)
+L(aligned_16_74bytes):
+	movdqa	%xmm0, -74(%edx)
+L(aligned_16_58bytes):
+	movdqa	%xmm0, -58(%edx)
+L(aligned_16_42bytes):
+	movdqa	%xmm0, -42(%edx)
+L(aligned_16_26bytes):
+	movdqa	%xmm0, -26(%edx)
+L(aligned_16_10bytes):
+	movq	%xmm0, -10(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_123bytes):
+	movdqa	%xmm0, -123(%edx)
+L(aligned_16_107bytes):
+	movdqa	%xmm0, -107(%edx)
+L(aligned_16_91bytes):
+	movdqa	%xmm0, -91(%edx)
+L(aligned_16_75bytes):
+	movdqa	%xmm0, -75(%edx)
+L(aligned_16_59bytes):
+	movdqa	%xmm0, -59(%edx)
+L(aligned_16_43bytes):
+	movdqa	%xmm0, -43(%edx)
+L(aligned_16_27bytes):
+	movdqa	%xmm0, -27(%edx)
+L(aligned_16_11bytes):
+	movq	%xmm0, -11(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_124bytes):
+	movdqa	%xmm0, -124(%edx)
+L(aligned_16_108bytes):
+	movdqa	%xmm0, -108(%edx)
+L(aligned_16_92bytes):
+	movdqa	%xmm0, -92(%edx)
+L(aligned_16_76bytes):
+	movdqa	%xmm0, -76(%edx)
+L(aligned_16_60bytes):
+	movdqa	%xmm0, -60(%edx)
+L(aligned_16_44bytes):
+	movdqa	%xmm0, -44(%edx)
+L(aligned_16_28bytes):
+	movdqa	%xmm0, -28(%edx)
+L(aligned_16_12bytes):
+	movq	%xmm0, -12(%edx)
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_125bytes):
+	movdqa	%xmm0, -125(%edx)
+L(aligned_16_109bytes):
+	movdqa	%xmm0, -109(%edx)
+L(aligned_16_93bytes):
+	movdqa	%xmm0, -93(%edx)
+L(aligned_16_77bytes):
+	movdqa	%xmm0, -77(%edx)
+L(aligned_16_61bytes):
+	movdqa	%xmm0, -61(%edx)
+L(aligned_16_45bytes):
+	movdqa	%xmm0, -45(%edx)
+L(aligned_16_29bytes):
+	movdqa	%xmm0, -29(%edx)
+L(aligned_16_13bytes):
+	movq	%xmm0, -13(%edx)
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_126bytes):
+	movdqa	%xmm0, -126(%edx)
+L(aligned_16_110bytes):
+	movdqa	%xmm0, -110(%edx)
+L(aligned_16_94bytes):
+	movdqa	%xmm0, -94(%edx)
+L(aligned_16_78bytes):
+	movdqa	%xmm0, -78(%edx)
+L(aligned_16_62bytes):
+	movdqa	%xmm0, -62(%edx)
+L(aligned_16_46bytes):
+	movdqa	%xmm0, -46(%edx)
+L(aligned_16_30bytes):
+	movdqa	%xmm0, -30(%edx)
+L(aligned_16_14bytes):
+	movq	%xmm0, -14(%edx)
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_127bytes):
+	movdqa	%xmm0, -127(%edx)
+L(aligned_16_111bytes):
+	movdqa	%xmm0, -111(%edx)
+L(aligned_16_95bytes):
+	movdqa	%xmm0, -95(%edx)
+L(aligned_16_79bytes):
+	movdqa	%xmm0, -79(%edx)
+L(aligned_16_63bytes):
+	movdqa	%xmm0, -63(%edx)
+L(aligned_16_47bytes):
+	movdqa	%xmm0, -47(%edx)
+L(aligned_16_31bytes):
+	movdqa	%xmm0, -31(%edx)
+L(aligned_16_15bytes):
+	movq	%xmm0, -15(%edx)
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN_END
+
+END (__memset_sse2)
+
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memset.S b/sysdeps/i386/i686/multiarch/memset.S
new file mode 100644
index 0000000000..34dddcef7b
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memset.S
@@ -0,0 +1,112 @@
+/* Multiple versions of memset
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(memset)
+	.type	memset, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__memset_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memset_sse2@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memset_sse2_rep@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(memset)
+# else
+	.text
+ENTRY(memset)
+	.type	memset, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__memset_ia32, %eax
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
+	jz	2f
+	leal	__memset_sse2, %eax
+	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
+	jz	2f
+	leal	__memset_sse2_rep, %eax
+2:	ret
+END(memset)
+# endif
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memset_ia32, @function; \
+	.globl __memset_ia32; \
+	.p2align 4; \
+	__memset_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memset_ia32, .-__memset_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+	.type __memset_chk_ia32, @function; \
+	.globl __memset_chk_ia32; \
+	.p2align 4; \
+	__memset_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __memset_chk_ia32, .-__memset_chk_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memset; __GI_memset = __memset_ia32
+# endif
+
+# undef strong_alias
+# define strong_alias(original, alias)
+#endif
+
+#include "../memset.S"
diff --git a/sysdeps/i386/i686/multiarch/memset_chk.S b/sysdeps/i386/i686/multiarch/memset_chk.S
new file mode 100644
index 0000000000..d659c7e56d
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memset_chk.S
@@ -0,0 +1,116 @@
+/* Multiple versions of __memset_chk
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(__memset_chk)
+	.type	__memset_chk, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__memset_chk_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memset_chk_sse2@GOTOFF(%ebx), %eax
+	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memset_chk_sse2_rep@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(__memset_chk)
+
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+	.section .gnu.warning.__memset_zero_constant_len_parameter
+	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
+# else
+	.text
+ENTRY(__memset_chk)
+	.type	__memset_chk, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__memset_chk_ia32, %eax
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
+	jz	2f
+	leal	__memset_chk_sse2, %eax
+	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
+	jz	2f
+	leal	__memset_chk_sse2_rep, %eax
+2:	ret
+END(__memset_chk)
+
+	.type __memset_chk_sse2, @function
+	.p2align 4;
+__memset_chk_sse2:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memset_sse2
+	cfi_endproc
+	.size __memset_chk_sse2, .-__memset_chk_sse2
+
+	.type __memset_chk_sse2_rep, @function
+	.p2align 4;
+__memset_chk_sse2_rep:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memset_sse2_rep
+	cfi_endproc
+	.size __memset_chk_sse2_rep, .-__memset_chk_sse2_rep
+
+	.type __memset_chk_ia32, @function
+	.p2align 4;
+__memset_chk_ia32:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memset_ia32
+	cfi_endproc
+	.size __memset_chk_ia32, .-__memset_chk_ia32
+# endif
+#endif
diff --git a/sysdeps/i386/sysdep.h b/sysdeps/i386/sysdep.h
index e03a8e926d..efdc82dde7 100644
--- a/sysdeps/i386/sysdep.h
+++ b/sysdeps/i386/sysdep.h
@@ -67,6 +67,9 @@
   ASM_SIZE_DIRECTIVE(name)						      \
   STABS_FUN_END(name)
 
+#define ENTRY_CHK(name) ENTRY (name)
+#define END_CHK(name) END (name)
+
 #ifdef HAVE_CPP_ASM_DEBUGINFO
 /* Disable that goop, because we just pass -g through to the assembler
    and it generates proper line number information directly.  */
diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c
index 5b66c62eb3..54220379ec 100644
--- a/sysdeps/x86_64/cacheinfo.c
+++ b/sysdeps/x86_64/cacheinfo.c
@@ -74,6 +74,7 @@ static const struct intel_02_cache_info
     { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
     { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
     { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
+    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),   24576 },
     { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
     { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
     { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
@@ -113,6 +114,7 @@ static const struct intel_02_cache_info
     { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
     { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
     { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
+    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
     { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
     { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
     { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
@@ -452,9 +454,10 @@ __cache_sysconf (int name)
 }
 
 
-/* Half the data cache size for use in memory and string routines, typically
+/* Data cache size for use in memory and string routines, typically
    L1 size.  */
 long int __x86_64_data_cache_size_half attribute_hidden = 32 * 1024 / 2;
+long int __x86_64_data_cache_size attribute_hidden = 32 * 1024;
 /* Shared cache size for use in memory and string routines, typically
    L2 or L3 size.  */
 long int __x86_64_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
@@ -657,7 +660,10 @@ init_cacheinfo (void)
     }
 
   if (data > 0)
-    __x86_64_data_cache_size_half = data / 2;
+    {
+      __x86_64_data_cache_size_half = data / 2;
+      __x86_64_data_cache_size = data;
+    }
 
   if (shared > 0)
     {
diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym b/sysdeps/x86_64/multiarch/ifunc-defines.sym
index e2021cdf87..eb1538abcc 100644
--- a/sysdeps/x86_64/multiarch/ifunc-defines.sym
+++ b/sysdeps/x86_64/multiarch/ifunc-defines.sym
@@ -13,5 +13,8 @@ CPUID_ECX_OFFSET	offsetof (struct cpuid_registers, ecx)
 CPUID_EDX_OFFSET	offsetof (struct cpuid_registers, edx)
 FAMILY_OFFSET		offsetof (struct cpu_features, family)
 MODEL_OFFSET		offsetof (struct cpu_features, model)
+FEATURE_OFFSET		offsetof (struct cpu_features, feature)
+FEATURE_SIZE		sizeof (unsigned int)
 
 COMMON_CPUID_INDEX_1
+FEATURE_INDEX_1
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index 7823aceb9b..50b2a38fbd 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -64,7 +64,23 @@ __init_cpu_features (void)
 	  __cpu_features.model += extended_model;
 	}
       else if (__cpu_features.family == 0x06)
-	__cpu_features.model += extended_model;
+	{
+	  __cpu_features.model += extended_model;
+	  switch (__cpu_features.model)
+	    {
+	    case 0x1a:
+	    case 0x1e:
+	    case 0x1f:
+	    case 0x25:
+	    case 0x2e:
+	    case 0x2f:
+	      /* Rep string instructions are fast on Intel Core i3, i5
+		 and i7.  */
+	      __cpu_features.feature[index_Fast_Rep_String]
+		|= bit_Fast_Rep_String;
+	      break;
+	    }
+	}
     }
   /* This spells out "AuthenticAMD".  */
   else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index 0f8f77a8a1..69492cb3bf 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -16,6 +16,8 @@
    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307 USA.  */
 
+#define bit_Fast_Rep_String	(1 << 0)
+
 #ifdef	__ASSEMBLER__
 
 #include <ifunc-defines.h>
@@ -28,6 +30,8 @@
 #define index_SSSE3	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
 #define index_SSE4_2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
 
+#define index_Fast_Rep_String	FEATURE_INDEX_1*FEATURE_SIZE
+
 #else	/* __ASSEMBLER__ */
 
 #include <sys/param.h>
@@ -39,6 +43,13 @@ enum
     COMMON_CPUID_INDEX_MAX
   };
 
+enum
+  {
+    FEATURE_INDEX_1 = 0,
+    /* Keep the following line at the end.  */
+    FEATURE_INDEX_MAX
+  };
+
 extern struct cpu_features
 {
   enum
@@ -58,6 +69,7 @@ extern struct cpu_features
   } cpuid[COMMON_CPUID_INDEX_MAX];
   unsigned int family;
   unsigned int model;
+  unsigned int feature[FEATURE_INDEX_MAX];
 } __cpu_features attribute_hidden;
 
 
@@ -86,4 +98,6 @@ extern const struct cpu_features *__get_cpu_features (void)
 #define HAS_SSE4_2	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20)
 #define HAS_FMA		HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12)
 
+#define index_Fast_Rep_String	FEATURE_INDEX_1
+
 #endif	/* __ASSEMBLER__ */
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
index 509f9c9605..f9641131fa 100644
--- a/sysdeps/x86_64/multiarch/strlen.S
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -46,28 +46,58 @@ END(strlen)
 __strlen_sse42:
 	cfi_startproc
 	CALL_MCOUNT
-	pxor	%xmm2, %xmm2
-	movq	%rdi, %rcx
+	pxor	%xmm1, %xmm1
+	movl	%edi, %ecx
 	movq	%rdi, %r8
 	andq	$~15, %rdi
-	movdqa	%xmm2, %xmm1
-	pcmpeqb	(%rdi), %xmm2
-	orl	$0xffffffff, %esi
-	subq	%rdi, %rcx
-	shll	%cl, %esi
-	pmovmskb %xmm2, %edx
-	andl	%esi, %edx
-	jnz	1f
-
-2:	pcmpistri $0x08, 16(%rdi), %xmm1
-	leaq	16(%rdi), %rdi
-	jnz	2b
+	xor	%edi, %ecx
+	pcmpeqb	(%rdi), %xmm1
+	pmovmskb %xmm1, %edx
+	shrl	%cl, %edx
+	shll	%cl, %edx
+	andl	%edx, %edx
+	jnz	L(less16bytes)
+	pxor	%xmm1, %xmm1
 
+	.p2align 4
+L(more64bytes_loop):
+	pcmpistri $0x08, 16(%rdi), %xmm1
+	jz	L(more32bytes)
+
+	pcmpistri $0x08, 32(%rdi), %xmm1
+	jz	L(more48bytes)
+
+	pcmpistri $0x08, 48(%rdi), %xmm1
+	jz	L(more64bytes)
+
+	add	$64, %rdi
+	pcmpistri $0x08, (%rdi), %xmm1
+	jnz	L(more64bytes_loop)
 	leaq	(%rdi,%rcx), %rax
 	subq	%r8, %rax
 	ret
 
-1:	subq	%r8, %rdi
+	.p2align 4
+L(more32bytes):
+	leaq	16(%rdi,%rcx, 1), %rax
+	subq	%r8, %rax
+	ret
+
+	.p2align 4
+L(more48bytes):
+	leaq	32(%rdi,%rcx, 1), %rax
+	subq	%r8, %rax
+	ret
+
+	.p2align 4
+L(more64bytes):
+	leaq	48(%rdi,%rcx, 1), %rax
+	subq	%r8, %rax
+	ret
+
+	.p2align 4
+L(less16bytes):
+	subq	%r8, %rdi
 	bsfl	%edx, %eax
 	addq	%rdi, %rax
 	ret
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 650ec173b6..ac3fe14679 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -1,5 +1,5 @@
 /* Highly optimized version for x86-64.
-   Copyright (C) 1999, 2000, 2002, 2003, 2005, 2009
+   Copyright (C) 1999, 2000, 2002, 2003, 2005, 2009, 2010
    Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Based on i686 version contributed by Ulrich Drepper
@@ -33,6 +33,13 @@
 #endif
 
 #ifdef USE_AS_STRNCMP
+/* The simplified code below is not set up to handle strncmp() so far.
+   Should this become necessary it has to be implemented.  For now
+   just report the problem.  */
+# ifdef NOT_IN_lib
+#  error "strncmp not implemented so far"
+# endif
+
 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
    if the new counter > the old one or is 0.  */
 # define UPDATE_STRNCMP_COUNTER				\
@@ -54,7 +61,7 @@
 #ifndef USE_SSSE3
 	.text
 #else
-        .section .text.ssse3,"ax",@progbits
+	.section .text.ssse3,"ax",@progbits
 #endif
 
 ENTRY (BP_SYM (STRCMP))
@@ -80,13 +87,13 @@ END (BP_SYM (STRCMP))
 /*
  * This implementation uses SSE to compare up to 16 bytes at a time.
  */
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	test	%rdx, %rdx
 	je	LABEL(strcmp_exitz)
 	cmp	$1, %rdx
 	je	LABEL(Byte0)
 	mov	%rdx, %r11
-#endif
+# endif
 	mov	%esi, %ecx
 	mov	%edi, %eax
 /* Use 64bit AND here to avoid long NOP padding.  */
@@ -107,10 +114,10 @@ END (BP_SYM (STRCMP))
 	pmovmskb %xmm1, %edx
 	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
 	jnz	LABEL(less16bytes)	/* If not, find different value or null char */
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)	/* finish comparision */
-#endif
+# endif
 	add	$16, %rsi		/* prepare to search next 16 bytes */
 	add	$16, %rdi		/* prepare to search next 16 bytes */
 
@@ -184,10 +191,10 @@ LABEL(loop_ashr_0):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)		/* mismatch or null char seen */
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 	add	$16, %rcx
 	movdqa	(%rsi, %rcx), %xmm1
 	movdqa	(%rdi, %rcx), %xmm2
@@ -198,10 +205,10 @@ LABEL(loop_ashr_0):
 	pmovmskb %xmm1, %edx
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 	add	$16, %rcx
 	jmp	LABEL(loop_ashr_0)
 
@@ -249,13 +256,13 @@ LABEL(gobble_ashr_1):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4		 /* store for next cycle */
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$1, %xmm3
 	pslldq	$15, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -264,10 +271,10 @@ LABEL(gobble_ashr_1):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
 
@@ -278,13 +285,13 @@ LABEL(gobble_ashr_1):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4		/* store for next cycle */
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$1, %xmm3
 	pslldq	$15, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -293,10 +300,10 @@ LABEL(gobble_ashr_1):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
 	jmp	LABEL(loop_ashr_1)
@@ -312,10 +319,10 @@ LABEL(nibble_ashr_1):
 	test	$0xfffe, %edx
 	jnz	LABEL(ashr_1_exittail)	/* find null char*/
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	cmp	$14, %r11
 	jbe	LABEL(ashr_1_exittail)
-#endif
+# endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10		/* substract 4K from %r10 */
@@ -334,7 +341,7 @@ LABEL(ashr_1_exittail):
 
 /*
  * The following cases will be handled by ashr_2
- * rcx(offset of rsi)  rax(offset of rdi)   relative offset   	corresponding case
+ * rcx(offset of rsi)  rax(offset of rdi)   relative offset   corresponding case
  *        n(14~15)            n -14         1(15 +(n-14) - n)         ashr_2
  */
 	.p2align 4
@@ -376,13 +383,13 @@ LABEL(gobble_ashr_2):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$2, %xmm3
 	pslldq	$14, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -391,10 +398,10 @@ LABEL(gobble_ashr_2):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -406,13 +413,13 @@ LABEL(gobble_ashr_2):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$2, %xmm3
 	pslldq	$14, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -421,10 +428,10 @@ LABEL(gobble_ashr_2):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -437,10 +444,10 @@ LABEL(nibble_ashr_2):
 	test	$0xfffc, %edx
 	jnz	LABEL(ashr_2_exittail)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	cmp	$13, %r11
 	jbe	LABEL(ashr_2_exittail)
-#endif
+# endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -498,13 +505,13 @@ LABEL(gobble_ashr_3):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$3, %xmm3
 	pslldq	$13, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -513,10 +520,10 @@ LABEL(gobble_ashr_3):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -528,13 +535,13 @@ LABEL(gobble_ashr_3):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$3, %xmm3
 	pslldq	$13, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -543,10 +550,10 @@ LABEL(gobble_ashr_3):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -559,10 +566,10 @@ LABEL(nibble_ashr_3):
 	test	$0xfff8, %edx
 	jnz	LABEL(ashr_3_exittail)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	cmp	$12, %r11
 	jbe	LABEL(ashr_3_exittail)
-#endif
+# endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -620,13 +627,13 @@ LABEL(gobble_ashr_4):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$4, %xmm3
 	pslldq	$12, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -635,10 +642,10 @@ LABEL(gobble_ashr_4):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -650,13 +657,13 @@ LABEL(gobble_ashr_4):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$4, %xmm3
 	pslldq	$12, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -665,10 +672,10 @@ LABEL(gobble_ashr_4):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -681,10 +688,10 @@ LABEL(nibble_ashr_4):
 	test	$0xfff0, %edx
 	jnz	LABEL(ashr_4_exittail)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	cmp	$11, %r11
 	jbe	LABEL(ashr_4_exittail)
-#endif
+# endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -700,7 +707,7 @@ LABEL(ashr_4_exittail):
 /*
  * The following cases will be handled by ashr_5
  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
- *        n(11~15)          n - 11      	  4(15 +(n-11) - n)         ashr_5
+ *        n(11~15)          n - 11		  4(15 +(n-11) - n)         ashr_5
  */
 	.p2align 4
 LABEL(ashr_5):
@@ -742,13 +749,13 @@ LABEL(gobble_ashr_5):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$5, %xmm3
 	pslldq	$11, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -757,10 +764,10 @@ LABEL(gobble_ashr_5):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -772,13 +779,13 @@ LABEL(gobble_ashr_5):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$5, %xmm3
 	pslldq	$11, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -787,10 +794,10 @@ LABEL(gobble_ashr_5):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -803,10 +810,10 @@ LABEL(nibble_ashr_5):
 	test	$0xffe0, %edx
 	jnz	LABEL(ashr_5_exittail)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	cmp	$10, %r11
 	jbe	LABEL(ashr_5_exittail)
-#endif
+# endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -822,7 +829,7 @@ LABEL(ashr_5_exittail):
 /*
  * The following cases will be handled by ashr_6
  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
- *        n(10~15)          n - 10      	  5(15 +(n-10) - n)         ashr_6
+ *        n(10~15)          n - 10		  5(15 +(n-10) - n)         ashr_6
  */
 	.p2align 4
 LABEL(ashr_6):
@@ -864,13 +871,13 @@ LABEL(gobble_ashr_6):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$6, %xmm3
 	pslldq	$10, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -879,10 +886,10 @@ LABEL(gobble_ashr_6):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -894,13 +901,13 @@ LABEL(gobble_ashr_6):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$6, %xmm3
 	pslldq	$10, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -909,10 +916,10 @@ LABEL(gobble_ashr_6):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -925,10 +932,10 @@ LABEL(nibble_ashr_6):
 	test	$0xffc0, %edx
 	jnz	LABEL(ashr_6_exittail)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	cmp	$9, %r11
 	jbe	LABEL(ashr_6_exittail)
-#endif
+# endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -944,7 +951,7 @@ LABEL(ashr_6_exittail):
 /*
  * The following cases will be handled by ashr_7
  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
- *        n(9~15)          n - 9      	        6(15 +(n - 9) - n)         ashr_7
+ *        n(9~15)          n - 9                6(15 +(n - 9) - n)         ashr_7
  */
 	.p2align 4
 LABEL(ashr_7):
@@ -986,13 +993,13 @@ LABEL(gobble_ashr_7):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$7, %xmm3
 	pslldq	$9, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1001,10 +1008,10 @@ LABEL(gobble_ashr_7):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1016,13 +1023,13 @@ LABEL(gobble_ashr_7):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$7, %xmm3
 	pslldq	$9, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1031,10 +1038,10 @@ LABEL(gobble_ashr_7):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1047,10 +1054,10 @@ LABEL(nibble_ashr_7):
 	test	$0xff80, %edx
 	jnz	LABEL(ashr_7_exittail)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	cmp	$8, %r11
 	jbe	LABEL(ashr_7_exittail)
-#endif
+# endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1066,7 +1073,7 @@ LABEL(ashr_7_exittail):
 /*
  *  The following cases will be handled by ashr_8
  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
- *        n(8~15)          n - 8      	        7(15 +(n - 8) - n)         ashr_8
+ *        n(8~15)          n - 8                7(15 +(n - 8) - n)         ashr_8
  */
 	.p2align 4
 LABEL(ashr_8):
@@ -1108,13 +1115,13 @@ LABEL(gobble_ashr_8):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$8, %xmm3
 	pslldq	$8, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1123,10 +1130,10 @@ LABEL(gobble_ashr_8):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1138,13 +1145,13 @@ LABEL(gobble_ashr_8):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$8, %xmm3
 	pslldq	$8, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1153,10 +1160,10 @@ LABEL(gobble_ashr_8):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1169,10 +1176,10 @@ LABEL(nibble_ashr_8):
 	test	$0xff00, %edx
 	jnz	LABEL(ashr_8_exittail)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	cmp	$7, %r11
 	jbe	LABEL(ashr_8_exittail)
-#endif
+# endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1188,7 +1195,7 @@ LABEL(ashr_8_exittail):
 /*
  *  The following cases will be handled by ashr_9
  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
- *        n(7~15)          n - 7      	        8(15 +(n - 7) - n)         ashr_9
+ *        n(7~15)          n - 7                8(15 +(n - 7) - n)         ashr_9
  */
 	.p2align 4
 LABEL(ashr_9):
@@ -1230,13 +1237,13 @@ LABEL(gobble_ashr_9):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$9, %xmm3
 	pslldq	$7, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1245,10 +1252,10 @@ LABEL(gobble_ashr_9):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1260,13 +1267,13 @@ LABEL(gobble_ashr_9):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$9, %xmm3
 	pslldq	$7, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1275,10 +1282,10 @@ LABEL(gobble_ashr_9):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3		/* store for next cycle */
@@ -1291,10 +1298,10 @@ LABEL(nibble_ashr_9):
 	test	$0xfe00, %edx
 	jnz	LABEL(ashr_9_exittail)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	cmp	$6, %r11
 	jbe	LABEL(ashr_9_exittail)
-#endif
+# endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1310,7 +1317,7 @@ LABEL(ashr_9_exittail):
 /*
  *  The following cases will be handled by ashr_10
  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
- *        n(6~15)          n - 6      	        9(15 +(n - 6) - n)         ashr_10
+ *        n(6~15)          n - 6                9(15 +(n - 6) - n)         ashr_10
  */
 	.p2align 4
 LABEL(ashr_10):
@@ -1352,13 +1359,13 @@ LABEL(gobble_ashr_10):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$10, %xmm3
 	pslldq	$6, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1367,10 +1374,10 @@ LABEL(gobble_ashr_10):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1382,13 +1389,13 @@ LABEL(gobble_ashr_10):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$10, %xmm3
 	pslldq	$6, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1397,10 +1404,10 @@ LABEL(gobble_ashr_10):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1413,10 +1420,10 @@ LABEL(nibble_ashr_10):
 	test	$0xfc00, %edx
 	jnz	LABEL(ashr_10_exittail)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	cmp	$5, %r11
 	jbe	LABEL(ashr_10_exittail)
-#endif
+# endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1432,7 +1439,7 @@ LABEL(ashr_10_exittail):
 /*
  *  The following cases will be handled by ashr_11
  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
- *        n(5~15)          n - 5      	        10(15 +(n - 5) - n)         ashr_11
+ *        n(5~15)          n - 5               10(15 +(n - 5) - n)         ashr_11
  */
 	.p2align 4
 LABEL(ashr_11):
@@ -1474,13 +1481,13 @@ LABEL(gobble_ashr_11):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$11, %xmm3
 	pslldq	$5, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1489,10 +1496,10 @@ LABEL(gobble_ashr_11):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1504,13 +1511,13 @@ LABEL(gobble_ashr_11):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$11, %xmm3
 	pslldq	$5, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1519,10 +1526,10 @@ LABEL(gobble_ashr_11):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1535,10 +1542,10 @@ LABEL(nibble_ashr_11):
 	test	$0xf800, %edx
 	jnz	LABEL(ashr_11_exittail)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	cmp	$4, %r11
 	jbe	LABEL(ashr_11_exittail)
-#endif
+# endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1554,7 +1561,7 @@ LABEL(ashr_11_exittail):
 /*
  *  The following cases will be handled by ashr_12
  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
- *        n(4~15)          n - 4      	        11(15 +(n - 4) - n)         ashr_12
+ *        n(4~15)          n - 4                11(15 +(n - 4) - n)         ashr_12
  */
 	.p2align 4
 LABEL(ashr_12):
@@ -1596,13 +1603,13 @@ LABEL(gobble_ashr_12):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$12, %xmm3
 	pslldq	$4, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1611,10 +1618,10 @@ LABEL(gobble_ashr_12):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1626,13 +1633,13 @@ LABEL(gobble_ashr_12):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$12, %xmm3
 	pslldq	$4, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1641,10 +1648,10 @@ LABEL(gobble_ashr_12):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1657,10 +1664,10 @@ LABEL(nibble_ashr_12):
 	test	$0xf000, %edx
 	jnz	LABEL(ashr_12_exittail)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	cmp	$3, %r11
 	jbe	LABEL(ashr_12_exittail)
-#endif
+# endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1676,7 +1683,7 @@ LABEL(ashr_12_exittail):
 /*
  *  The following cases will be handled by ashr_13
  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
- *        n(3~15)          n - 3      	        12(15 +(n - 3) - n)         ashr_13
+ *        n(3~15)          n - 3                12(15 +(n - 3) - n)         ashr_13
  */
 	.p2align 4
 LABEL(ashr_13):
@@ -1718,13 +1725,13 @@ LABEL(gobble_ashr_13):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$13, %xmm3
 	pslldq	$3, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1733,10 +1740,10 @@ LABEL(gobble_ashr_13):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1748,13 +1755,13 @@ LABEL(gobble_ashr_13):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$13, %xmm3
 	pslldq	$3, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1763,10 +1770,10 @@ LABEL(gobble_ashr_13):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1779,10 +1786,10 @@ LABEL(nibble_ashr_13):
 	test	$0xe000, %edx
 	jnz	LABEL(ashr_13_exittail)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	cmp	$2, %r11
 	jbe	LABEL(ashr_13_exittail)
-#endif
+# endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1798,7 +1805,7 @@ LABEL(ashr_13_exittail):
 /*
  *  The following cases will be handled by ashr_14
  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
- *        n(2~15)          n - 2      	        13(15 +(n - 2) - n)         ashr_14
+ *        n(2~15)          n - 2                13(15 +(n - 2) - n)         ashr_14
  */
 	.p2align 4
 LABEL(ashr_14):
@@ -1840,13 +1847,13 @@ LABEL(gobble_ashr_14):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$14, %xmm3
 	pslldq	$2, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1855,10 +1862,10 @@ LABEL(gobble_ashr_14):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1870,13 +1877,13 @@ LABEL(gobble_ashr_14):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$14, %xmm3
 	pslldq	$2, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1885,10 +1892,10 @@ LABEL(gobble_ashr_14):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1901,10 +1908,10 @@ LABEL(nibble_ashr_14):
 	test	$0xc000, %edx
 	jnz	LABEL(ashr_14_exittail)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	cmp	$1, %r11
 	jbe	LABEL(ashr_14_exittail)
-#endif
+# endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1920,7 +1927,7 @@ LABEL(ashr_14_exittail):
 /*
  *  The following cases will be handled by ashr_15
  *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
- *        n(1~15)          n - 1      	        14(15 +(n - 1) - n)         ashr_15
+ *        n(1~15)          n - 1                14(15 +(n - 1) - n)         ashr_15
  */
 	.p2align 4
 LABEL(ashr_15):
@@ -1964,13 +1971,13 @@ LABEL(gobble_ashr_15):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$15, %xmm3
 	pslldq	$1, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1979,10 +1986,10 @@ LABEL(gobble_ashr_15):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1994,13 +2001,13 @@ LABEL(gobble_ashr_15):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
 	psrldq	$15, %xmm3
 	pslldq	$1, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
+# else
 	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+# endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -2009,10 +2016,10 @@ LABEL(gobble_ashr_15):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -2025,10 +2032,10 @@ LABEL(nibble_ashr_15):
 	test	$0x8000, %edx
 	jnz	LABEL(ashr_15_exittail)
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	test	%r11, %r11
 	je	LABEL(ashr_15_exittail)
-#endif
+# endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -2062,10 +2069,10 @@ LABEL(ret):
 LABEL(less16bytes):
 	bsf	%rdx, %rdx		/* find and store bit index in %rdx */
 
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
 	sub	%rdx, %r11
 	jbe	LABEL(strcmp_exitz)
-#endif
+# endif
 	movzbl	(%rsi, %rdx), %ecx
 	movzbl	(%rdi, %rdx), %eax