summary refs log tree commit diff
diff options
context:
space:
mode:
authorAndreas Schwab <schwab@redhat.com>2010-01-19 17:01:59 +0100
committerAndreas Schwab <schwab@redhat.com>2010-01-20 17:35:37 +0100
commit7d16e875fe9092378b7d61d6c749fbb87e7d1e7e (patch)
tree422963be75b1fa594d0ea4c8f9698039be20c0af
parentf0dfc72f9d9bae2e8b210113ca2c4a4428d4cb5b (diff)
downloadglibc-7d16e875fe9092378b7d61d6c749fbb87e7d1e7e.tar.gz
glibc-7d16e875fe9092378b7d61d6c749fbb87e7d1e7e.tar.xz
glibc-7d16e875fe9092378b7d61d6c749fbb87e7d1e7e.zip
Revert "Optimize 32bit memset/memcpy with SSE2/SSSE3."
This reverts commit 3af48cbdfaeb8bc389de1caeb33bc29811da80e8.
-rw-r--r--ChangeLog62
-rw-r--r--sysdeps/i386/i686/bcopy.S3
-rw-r--r--sysdeps/i386/i686/cacheinfo.c1
-rw-r--r--sysdeps/i386/i686/memcpy.S4
-rw-r--r--sysdeps/i386/i686/memmove.S27
-rw-r--r--sysdeps/i386/i686/mempcpy.S4
-rw-r--r--sysdeps/i386/i686/memset.S4
-rw-r--r--sysdeps/i386/i686/multiarch/Makefile4
-rw-r--r--sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S4
-rw-r--r--sysdeps/i386/i686/multiarch/bcopy-ssse3.S4
-rw-r--r--sysdeps/i386/i686/multiarch/bcopy.S89
-rw-r--r--sysdeps/i386/i686/multiarch/bzero-sse2-rep.S3
-rw-r--r--sysdeps/i386/i686/multiarch/bzero-sse2.S3
-rw-r--r--sysdeps/i386/i686/multiarch/bzero.S97
-rw-r--r--sysdeps/i386/i686/multiarch/ifunc-defines.sym3
-rw-r--r--sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S1785
-rw-r--r--sysdeps/i386/i686/multiarch/memcpy-ssse3.S1737
-rw-r--r--sysdeps/i386/i686/multiarch/memcpy.S90
-rw-r--r--sysdeps/i386/i686/multiarch/memcpy_chk.S64
-rw-r--r--sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S4
-rw-r--r--sysdeps/i386/i686/multiarch/memmove-ssse3.S4
-rw-r--r--sysdeps/i386/i686/multiarch/memmove.S117
-rw-r--r--sysdeps/i386/i686/multiarch/memmove_chk.S112
-rw-r--r--sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S4
-rw-r--r--sysdeps/i386/i686/multiarch/mempcpy-ssse3.S4
-rw-r--r--sysdeps/i386/i686/multiarch/mempcpy.S93
-rw-r--r--sysdeps/i386/i686/multiarch/mempcpy_chk.S64
-rw-r--r--sysdeps/i386/i686/multiarch/memset-sse2-rep.S821
-rw-r--r--sysdeps/i386/i686/multiarch/memset-sse2.S867
-rw-r--r--sysdeps/i386/i686/multiarch/memset.S112
-rw-r--r--sysdeps/i386/i686/multiarch/memset_chk.S116
-rw-r--r--sysdeps/i386/sysdep.h3
-rw-r--r--sysdeps/x86_64/cacheinfo.c10
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-defines.sym3
-rw-r--r--sysdeps/x86_64/multiarch/init-arch.c18
-rw-r--r--sysdeps/x86_64/multiarch/init-arch.h14
36 files changed, 15 insertions, 6339 deletions
diff --git a/ChangeLog b/ChangeLog
index cd80610d90..3b14790574 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -169,68 +169,6 @@
 
 	* posix/sched.h: Define time_t and pid_t for XPG7.
 
-2010-01-12  H.J. Lu  <hongjiu.lu@intel.com>
-
-	* sysdeps/i386/i686/bcopy.S: New file.
-
-	* sysdeps/i386/i686/cacheinfo.c (__x86_64_data_cache_size): Define.
-
-	* sysdeps/i386/i686/memcpy.S (__memcpy_chk): Use ENTRY_CHK
-	and END_CHK.
-	* sysdeps/i386/i686/memmove.S (__memmove_chk): Likewise.
-	* sysdeps/i386/i686/mempcpy.S (__mempcpy_chk): Likewise.
-	* sysdeps/i386/i686/memset.S (__memset_chk): Likewise.
-
-	* sysdeps/i386/i686/memmove.S: Support USE_AS_BCOPY.
-
-	* sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
-	bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 memmove-ssse3
-	memcpy-ssse3-rep mempcpy-ssse3-rep memmove-ssse3-rep
-	bcopy-ssse3 bcopy-ssse3-rep memset-sse2-rep bzero-sse2-rep
-	* sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S: New file.
-	* sysdeps/i386/i686/multiarch/bcopy-ssse3.S: New file.
-	* sysdeps/i386/i686/multiarch/bcopy.S: New file.
-	* sysdeps/i386/i686/multiarch/bzero-sse2-rep.S: New file.
-	* sysdeps/i386/i686/multiarch/bzero-sse2.S: New file.
-	* sysdeps/i386/i686/multiarch/bzero.S: New file.
-	* sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S: New file.
-	* sysdeps/i386/i686/multiarch/memcpy-ssse3.S: New file.
-	* sysdeps/i386/i686/multiarch/memcpy.S: New file.
-	* sysdeps/i386/i686/multiarch/memcpy_chk.S: New file.
-	* sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S: New file.
-	* sysdeps/i386/i686/multiarch/memmove-ssse3.S: New file.
-	* sysdeps/i386/i686/multiarch/memmove.S: New file.
-	* sysdeps/i386/i686/multiarch/memmove_chk.S: New file.
-	* sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S: New file.
-	* sysdeps/i386/i686/multiarch/mempcpy-ssse3.S: New file.
-	* sysdeps/i386/i686/multiarch/mempcpy.S: New file.
-	* sysdeps/i386/i686/multiarch/mempcpy_chk.S: New file.
-	* sysdeps/i386/i686/multiarch/memset-sse2-rep.S: New file.
-	* sysdeps/i386/i686/multiarch/memset-sse2.S: New file.
-	* sysdeps/i386/i686/multiarch/memset.S: New file.
-	* sysdeps/i386/i686/multiarch/memset_chk.S: New file.
-
-	* sysdeps/i386/sysdep.h (ENTRY_CHK): New.
-	(END_CHK): Likewise.
-
-	* sysdeps/i386/i686/multiarch/ifunc-defines.sym: Add
-	FEATURE_OFFSET, FEATURE_SIZE and FEATURE_INDEX_1.
-	* sysdeps/x86_64/multiarch/ifunc-defines.sym: Likewise.
-
-	* sysdeps/x86_64/cacheinfo.c (intel_02_cache_info): Add entries
-	for 0x0e and 0x80.
-	(__x86_64_data_cache_size): New.
-	(init_cacheinfo): Set __x86_64_data_cache_size.
-
-	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): Turn
-	on bit_Fast_Rep_String for Intel Core i7.
-
-	* sysdeps/x86_64/multiarch/init-arch.h (bit_Fast_Rep_String): New.
-	(index_Fast_Rep_String): Likewise.
-	(FEATURE_INDEX_1): Likewise.
-	(FEATURE_INDEX_MAX): Likewise.
-	(cpu_features): Add feature.
-
 2010-01-12  Ulrich Drepper  <drepper@redhat.com>
 
 	* conform/data/sys/select.h-data: Fix up for XPG7.
diff --git a/sysdeps/i386/i686/bcopy.S b/sysdeps/i386/i686/bcopy.S
deleted file mode 100644
index 15ef9419a4..0000000000
--- a/sysdeps/i386/i686/bcopy.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_BCOPY
-#define memmove bcopy
-#include <sysdeps/i386/i686/memmove.S>
diff --git a/sysdeps/i386/i686/cacheinfo.c b/sysdeps/i386/i686/cacheinfo.c
index f8b7f521ca..82e4cd223e 100644
--- a/sysdeps/i386/i686/cacheinfo.c
+++ b/sysdeps/i386/i686/cacheinfo.c
@@ -1,4 +1,3 @@
-#define __x86_64_data_cache_size __x86_data_cache_size
 #define __x86_64_data_cache_size_half __x86_data_cache_size_half
 #define __x86_64_shared_cache_size __x86_shared_cache_size
 #define __x86_64_shared_cache_size_half __x86_shared_cache_size_half
diff --git a/sysdeps/i386/i686/memcpy.S b/sysdeps/i386/i686/memcpy.S
index 86ee082beb..0b2da1ea27 100644
--- a/sysdeps/i386/i686/memcpy.S
+++ b/sysdeps/i386/i686/memcpy.S
@@ -32,11 +32,11 @@
 
 	.text
 #if defined PIC && !defined NOT_IN_libc
-ENTRY_CHK (__memcpy_chk)
+ENTRY (__memcpy_chk)
 	movl	12(%esp), %eax
 	cmpl	%eax, 16(%esp)
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END_CHK (__memcpy_chk)
+END (__memcpy_chk)
 #endif
 ENTRY (BP_SYM (memcpy))
 	ENTER
diff --git a/sysdeps/i386/i686/memmove.S b/sysdeps/i386/i686/memmove.S
index 981f14f4e0..b93b5c729f 100644
--- a/sysdeps/i386/i686/memmove.S
+++ b/sysdeps/i386/i686/memmove.S
@@ -26,27 +26,18 @@
 
 #define PARMS	LINKAGE+4	/* one spilled register */
 #define RTN	PARMS
+#define DEST	RTN+RTN_SIZE
+#define SRC	DEST+PTR_SIZE
+#define LEN	SRC+PTR_SIZE
 
 	.text
-
-#ifdef USE_AS_BCOPY
-# define SRC	RTN+RTN_SIZE
-# define DEST	SRC+PTR_SIZE
-# define LEN	DEST+PTR_SIZE
-#else
-# define DEST	RTN+RTN_SIZE
-# define SRC	DEST+PTR_SIZE
-# define LEN	SRC+PTR_SIZE
-
-# if defined PIC && !defined NOT_IN_libc
-ENTRY_CHK (__memmove_chk)
+#if defined PIC && !defined NOT_IN_libc
+ENTRY (__memmove_chk)
 	movl	12(%esp), %eax
 	cmpl	%eax, 16(%esp)
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END_CHK (__memmove_chk)
-# endif
+END (__memmove_chk)
 #endif
-
 ENTRY (BP_SYM (memmove))
 	ENTER
 
@@ -78,10 +69,8 @@ ENTRY (BP_SYM (memmove))
 	movsl
 	movl	%edx, %esi
 	cfi_restore (esi)
-#ifndef USE_AS_BCOPY
 	movl	DEST(%esp), %eax
 	RETURN_BOUNDED_POINTER (DEST(%esp))
-#endif
 
 	popl	%edi
 	cfi_adjust_cfa_offset (-4)
@@ -112,10 +101,8 @@ ENTRY (BP_SYM (memmove))
 	movsl
 	movl	%edx, %esi
 	cfi_restore (esi)
-#ifndef USE_AS_BCOPY
 	movl	DEST(%esp), %eax
 	RETURN_BOUNDED_POINTER (DEST(%esp))
-#endif
 
 	cld
 	popl	%edi
@@ -125,6 +112,4 @@ ENTRY (BP_SYM (memmove))
 	LEAVE
 	RET_PTR
 END (BP_SYM (memmove))
-#ifndef USE_AS_BCOPY
 libc_hidden_builtin_def (memmove)
-#endif
diff --git a/sysdeps/i386/i686/mempcpy.S b/sysdeps/i386/i686/mempcpy.S
index c10686fb3d..6437e4a5d4 100644
--- a/sysdeps/i386/i686/mempcpy.S
+++ b/sysdeps/i386/i686/mempcpy.S
@@ -32,11 +32,11 @@
 
 	.text
 #if defined PIC && !defined NOT_IN_libc
-ENTRY_CHK (__mempcpy_chk)
+ENTRY (__mempcpy_chk)
 	movl	12(%esp), %eax
 	cmpl	%eax, 16(%esp)
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END_CHK (__mempcpy_chk)
+END (__mempcpy_chk)
 #endif
 ENTRY (BP_SYM (__mempcpy))
 	ENTER
diff --git a/sysdeps/i386/i686/memset.S b/sysdeps/i386/i686/memset.S
index b343af7b64..dfa1aa7019 100644
--- a/sysdeps/i386/i686/memset.S
+++ b/sysdeps/i386/i686/memset.S
@@ -40,11 +40,11 @@
 
         .text
 #if defined PIC && !defined NOT_IN_libc && !BZERO_P
-ENTRY_CHK (__memset_chk)
+ENTRY (__memset_chk)
 	movl	12(%esp), %eax
 	cmpl	%eax, 16(%esp)
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END_CHK (__memset_chk)
+END (__memset_chk)
 #endif
 ENTRY (BP_SYM (memset))
 	ENTER
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index fbad9ae734..e1553b284e 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -4,10 +4,6 @@ gen-as-const-headers += ifunc-defines.sym
 endif
 
 ifeq ($(subdir),string)
-sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
-		   memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \
-		   memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
-		   memset-sse2-rep bzero-sse2-rep
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-strcspn-c.c += -msse4
diff --git a/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
deleted file mode 100644
index cbc8b420e8..0000000000
--- a/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMMOVE
-#define USE_AS_BCOPY
-#define MEMCPY		__bcopy_ssse3_rep
-#include "memcpy-ssse3-rep.S"
diff --git a/sysdeps/i386/i686/multiarch/bcopy-ssse3.S b/sysdeps/i386/i686/multiarch/bcopy-ssse3.S
deleted file mode 100644
index 36aac44b9c..0000000000
--- a/sysdeps/i386/i686/multiarch/bcopy-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMMOVE
-#define USE_AS_BCOPY
-#define MEMCPY		__bcopy_ssse3
-#include "memcpy-ssse3.S"
diff --git a/sysdeps/i386/i686/multiarch/bcopy.S b/sysdeps/i386/i686/multiarch/bcopy.S
deleted file mode 100644
index 8671bf684e..0000000000
--- a/sysdeps/i386/i686/multiarch/bcopy.S
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Multiple versions of bcopy
-   Copyright (C) 2010 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib.  */
-#ifndef NOT_IN_libc
-# ifdef SHARED
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
-	.p2align 4
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
-	movl	(%esp), %ebx
-	ret
-
-	.text
-ENTRY(bcopy)
-	.type	bcopy, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	call	__i686.get_pc_thunk.bx
-	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__bcopy_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__bcopy_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__bcopy_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(bcopy)
-# else
-	.text
-ENTRY(bcopy)
-	.type	bcopy, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__bcopy_ia32, %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
-	jz	2f
-	leal	__bcopy_ssse3, %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
-	jz	2f
-	leal	__bcopy_ssse3_rep, %eax
-2:	ret
-END(bcopy)
-# endif
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __bcopy_ia32, @function; \
-	.p2align 4; \
-	__bcopy_ia32: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __bcopy_ia32, .-__bcopy_ia32
-
-#endif
-
-#include "../bcopy.S"
diff --git a/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S b/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
deleted file mode 100644
index 507b288bb3..0000000000
--- a/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_BZERO
-#define __memset_sse2_rep __bzero_sse2_rep
-#include "memset-sse2-rep.S"
diff --git a/sysdeps/i386/i686/multiarch/bzero-sse2.S b/sysdeps/i386/i686/multiarch/bzero-sse2.S
deleted file mode 100644
index 8d04512e4e..0000000000
--- a/sysdeps/i386/i686/multiarch/bzero-sse2.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_BZERO
-#define __memset_sse2 __bzero_sse2
-#include "memset-sse2.S"
diff --git a/sysdeps/i386/i686/multiarch/bzero.S b/sysdeps/i386/i686/multiarch/bzero.S
deleted file mode 100644
index 8c740a42dc..0000000000
--- a/sysdeps/i386/i686/multiarch/bzero.S
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Multiple versions of bzero
-   Copyright (C) 2010 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib.  */
-#ifndef NOT_IN_libc
-# ifdef SHARED
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
-	.p2align 4
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
-	movl	(%esp), %ebx
-	ret
-
-	.text
-ENTRY(__bzero)
-	.type	__bzero, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	call	__i686.get_pc_thunk.bx
-	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__bzero_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__bzero_sse2@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__bzero_sse2_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(__bzero)
-# else
-	.text
-ENTRY(__bzero)
-	.type	__bzero, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__bzero_ia32, %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
-	jz	2f
-	leal	__bzero_sse2, %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
-	jz	2f
-	leal	__bzero_sse2_rep, %eax
-2:	ret
-END(__bzero)
-# endif
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __bzero_ia32, @function; \
-	.p2align 4; \
-	__bzero_ia32: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __bzero_ia32, .-__bzero_ia32
-
-# ifdef SHARED
-#  undef libc_hidden_builtin_def
-/* IFUNC doesn't work with the hidden functions in shared library since
-   they will be called without setting up EBX needed for PLT which is
-   used by IFUNC.  */
-#  define libc_hidden_builtin_def(name) \
-	.globl __GI___bzero; __GI___bzero = __bzero_ia32
-# endif
-#endif
-
-#include "../bzero.S"
diff --git a/sysdeps/i386/i686/multiarch/ifunc-defines.sym b/sysdeps/i386/i686/multiarch/ifunc-defines.sym
index eb1538abcc..e2021cdf87 100644
--- a/sysdeps/i386/i686/multiarch/ifunc-defines.sym
+++ b/sysdeps/i386/i686/multiarch/ifunc-defines.sym
@@ -13,8 +13,5 @@ CPUID_ECX_OFFSET	offsetof (struct cpuid_registers, ecx)
 CPUID_EDX_OFFSET	offsetof (struct cpuid_registers, edx)
 FAMILY_OFFSET		offsetof (struct cpu_features, family)
 MODEL_OFFSET		offsetof (struct cpu_features, model)
-FEATURE_OFFSET		offsetof (struct cpu_features, feature)
-FEATURE_SIZE		sizeof (unsigned int)
 
 COMMON_CPUID_INDEX_1
-FEATURE_INDEX_1
diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
deleted file mode 100644
index b26037d279..0000000000
--- a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
+++ /dev/null
@@ -1,1785 +0,0 @@
-/* memcpy with SSSE3 and REP string.
-   Copyright (C) 2010 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-#include <sysdep.h>
-
-#if !defined NOT_IN_libc \
-    && (defined SHARED \
-	|| defined USE_AS_MEMMOVE \
-	|| !defined USE_MULTIARCH)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY		__memcpy_ssse3_rep
-# define MEMCPY_CHK	__memcpy_chk_ssse3_rep
-#endif
-
-#ifdef USE_AS_BCOPY
-# define SRC		PARMS
-# define DEST		SRC+4
-# define LEN		DEST+4
-#else
-# define DEST		PARMS
-# define SRC		DEST+4
-# define LEN		SRC+4
-#endif
-
-#define CFI_PUSH(REG)						\
-  cfi_adjust_cfa_offset (4);					\
-  cfi_rel_offset (REG, 0)
-
-#define CFI_POP(REG)						\
-  cfi_adjust_cfa_offset (-4);					\
-  cfi_restore (REG)
-
-#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
-#define POP(REG)	popl REG; CFI_POP (REG)
-
-#ifdef SHARED
-# define PARMS		8		/* Preserve EBX.  */
-# define ENTRANCE	PUSH (%ebx);
-# define RETURN_END	POP (%ebx); ret
-# define RETURN		RETURN_END; CFI_PUSH (%ebx)
-# define JMPTBL(I, B)	I - B
-
-/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
-   jump table with relative offsets.  INDEX is a register contains the
-   index into the jump table.   SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-    /* We first load PC into EBX.  */				\
-    call	__i686.get_pc_thunk.bx;				\
-    /* Get the address of the jump table.  */			\
-    addl	$(TABLE - .), %ebx;				\
-    /* Get the entry and convert the relative offset to the	\
-       absolute address.  */					\
-    addl	(%ebx,INDEX,SCALE), %ebx;			\
-    /* We loaded the jump table.  Go.  */			\
-    jmp		*%ebx
-
-# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
-    addl	$(TABLE - .), %ebx
-
-# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
-    addl	(%ebx,INDEX,SCALE), %ebx;			\
-    /* We loaded the jump table.  Go.  */			\
-    jmp		*%ebx
-
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
-	ALIGN (4)
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
-	movl	(%esp), %ebx
-	ret
-#else
-# define PARMS		4
-# define ENTRANCE
-# define RETURN_END	ret
-# define RETURN		RETURN_END
-# define JMPTBL(I, B)	I
-
-/* Branch to an entry in a jump table.  TABLE is a jump table with
-   absolute offsets.  INDEX is a register contains the index into the
-   jump table.  SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-    jmp		*TABLE(,INDEX,SCALE)
-
-# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
-
-# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
-    jmp		*TABLE(,INDEX,SCALE)
-#endif
-
-	.section .text.ssse3,"ax",@progbits
-#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
-	movl	12(%esp), %eax
-	cmpl	%eax, 16(%esp)
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-ENTRY (MEMCPY)
-	ENTRANCE
-	movl	LEN(%esp), %ecx
-	movl	SRC(%esp), %eax
-	movl	DEST(%esp), %edx
-
-#ifdef USE_AS_MEMMOVE
-	cmp	%eax, %edx
-	jb	L(copy_forward)
-	je	L(fwd_write_0bytes)
-	cmp	$32, %ecx
-	jge	L(memmove_bwd)
-	jmp	L(bk_write_less32bytes_2)
-L(memmove_bwd):
-	add	%ecx, %eax
-	cmp	%eax, %edx
-	movl	SRC(%esp), %eax
-	jb	L(copy_backward)
-
-L(copy_forward):
-#endif
-	cmp	$48, %ecx
-	jge	L(48bytesormore)
-
-L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
-	cmp	%dl, %al
-	jl	L(bk_write)
-#endif
-	add	%ecx, %edx
-	add	%ecx, %eax
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-#ifndef USE_AS_MEMMOVE
-L(bk_write):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
-#endif
-
-	ALIGN (4)
-/* ECX > 32 and EDX is 4 byte aligned.  */
-L(48bytesormore):
-	movdqu	(%eax), %xmm0
-	PUSH (%edi)
-	movl	%edx, %edi
-	and	$-16, %edx
-	PUSH (%esi)
-	add	$16, %edx
-	movl	%edi, %esi
-	sub	%edx, %edi
-	add	%edi, %ecx
-	sub	%edi, %eax
-
-#ifdef SHARED_CACHE_SIZE_HALF
-	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
-#else
-# ifdef SHARED
-	call	__i686.get_pc_thunk.bx
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_shared_cache_size_half, %ecx
-# endif
-#endif
-
-	mov	%eax, %edi
-	jge	L(large_page)
-	and	$0xf, %edi
-	jz	L(shl_0)
-
-	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
-
-	ALIGN (4)
-L(shl_0):
-	movdqu	%xmm0, (%esi)
-	xor	%edi, %edi
-	cmp	$127, %ecx
-	ja	L(shl_0_gobble)
-	lea	-32(%ecx), %ecx
-L(shl_0_loop):
-	movdqa	(%eax, %edi), %xmm0
-	movdqa	16(%eax, %edi), %xmm1
-	sub	$32, %ecx
-	movdqa	%xmm0, (%edx, %edi)
-	movdqa	%xmm1, 16(%edx, %edi)
-	lea	32(%edi), %edi
-	jl	L(shl_0_end)
-
-	movdqa	(%eax, %edi), %xmm0
-	movdqa	16(%eax, %edi), %xmm1
-	sub	$32, %ecx
-	movdqa	%xmm0, (%edx, %edi)
-	movdqa	%xmm1, 16(%edx, %edi)
-	lea	32(%edi), %edi
-	jl	L(shl_0_end)
-
-	movdqa	(%eax, %edi), %xmm0
-	movdqa	16(%eax, %edi), %xmm1
-	sub	$32, %ecx
-	movdqa	%xmm0, (%edx, %edi)
-	movdqa	%xmm1, 16(%edx, %edi)
-	lea	32(%edi), %edi
-	jl	L(shl_0_end)
-
-	movdqa	(%eax, %edi), %xmm0
-	movdqa	16(%eax, %edi), %xmm1
-	sub	$32, %ecx
-	movdqa	%xmm0, (%edx, %edi)
-	movdqa	%xmm1, 16(%edx, %edi)
-	lea	32(%edi), %edi
-L(shl_0_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	add	%edi, %eax
-	POP (%esi)
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
-L(shl_0_gobble):
-
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# ifdef SHARED
-	call	__i686.get_pc_thunk.bx
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	mov	__x86_data_cache_size_half@GOTOFF(%ebx), %edi
-# else
-	mov	__x86_data_cache_size_half, %edi
-# endif
-#endif
-	mov	%edi, %esi
-	shr	$3, %esi
-	sub	%esi, %edi
-	cmp	%edi, %ecx
-	jge	L(shl_0_gobble_mem_start)
-	lea	-128(%ecx), %ecx
-	ALIGN (4)
-L(shl_0_gobble_cache_loop):
-	movdqa	(%eax), %xmm0
-	movaps	0x10(%eax), %xmm1
-	movaps	0x20(%eax), %xmm2
-	movaps	0x30(%eax), %xmm3
-	movaps	0x40(%eax), %xmm4
-	movaps	0x50(%eax), %xmm5
-	movaps	0x60(%eax), %xmm6
-	movaps	0x70(%eax), %xmm7
-	lea	0x80(%eax), %eax
-	sub	$128, %ecx
-	movdqa	%xmm0, (%edx)
-	movaps	%xmm1, 0x10(%edx)
-	movaps	%xmm2, 0x20(%edx)
-	movaps	%xmm3, 0x30(%edx)
-	movaps	%xmm4, 0x40(%edx)
-	movaps	%xmm5, 0x50(%edx)
-	movaps	%xmm6, 0x60(%edx)
-	movaps	%xmm7, 0x70(%edx)
-	lea	0x80(%edx), %edx
-
-	jge	L(shl_0_gobble_cache_loop)
-L(shl_0_gobble_cache_loop_tail):
-	cmp	$-0x40, %ecx
-	lea	0x80(%ecx), %ecx
-	jl	L(shl_0_cache_less_64bytes)
-
-	movdqa	(%eax), %xmm0
-	sub	$0x40, %ecx
-	movdqa	0x10(%eax), %xmm1
-
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-
-	movdqa	0x20(%eax), %xmm0
-	movdqa	0x30(%eax), %xmm1
-	add	$0x40, %eax
-
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm1, 0x30(%edx)
-	add	$0x40, %edx
-L(shl_0_cache_less_64bytes):
-	cmp	$0x20, %ecx
-	jl	L(shl_0_cache_less_32bytes)
-	movdqa	(%eax), %xmm0
-	sub	$0x20, %ecx
-	movdqa	0x10(%eax), %xmm1
-	add	$0x20, %eax
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-	add	$0x20, %edx
-L(shl_0_cache_less_32bytes):
-	cmp	$0x10, %ecx
-	jl	L(shl_0_cache_less_16bytes)
-	sub	$0x10, %ecx
-	movdqa	(%eax), %xmm0
-	add	$0x10, %eax
-	movdqa	%xmm0, (%edx)
-	add	$0x10, %edx
-L(shl_0_cache_less_16bytes):
-	add	%ecx, %edx
-	add	%ecx, %eax
-	POP (%esi)
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
-
-	ALIGN (4)
-L(shl_0_gobble_mem_start):
-	cmp	%al, %dl
-	je	L(copy_page_by_rep)
-	lea	-128(%ecx), %ecx
-L(shl_0_gobble_mem_loop):
-	prefetchnta 0x1c0(%eax)
-	prefetchnta 0x280(%eax)
-	prefetchnta 0x1c0(%edx)
-	prefetchnta 0x280(%edx)
-
-	movdqa	(%eax), %xmm0
-	movaps	0x10(%eax), %xmm1
-	movaps	0x20(%eax), %xmm2
-	movaps	0x30(%eax), %xmm3
-	movaps	0x40(%eax), %xmm4
-	movaps	0x50(%eax), %xmm5
-	movaps	0x60(%eax), %xmm6
-	movaps	0x70(%eax), %xmm7
-	lea	0x80(%eax), %eax
-	sub	$0x80, %ecx
-	movdqa	%xmm0, (%edx)
-	movaps	%xmm1, 0x10(%edx)
-	movaps	%xmm2, 0x20(%edx)
-	movaps	%xmm3, 0x30(%edx)
-	movaps	%xmm4, 0x40(%edx)
-	movaps	%xmm5, 0x50(%edx)
-	movaps	%xmm6, 0x60(%edx)
-	movaps	%xmm7, 0x70(%edx)
-	lea	0x80(%edx), %edx
-
-	jge	L(shl_0_gobble_mem_loop)
-	cmp	$-0x40, %ecx
-	lea	0x80(%ecx), %ecx
-	jl	L(shl_0_mem_less_64bytes)
-
-	movdqa	(%eax), %xmm0
-	sub	$0x40, %ecx
-	movdqa	0x10(%eax), %xmm1
-
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-
-	movdqa	0x20(%eax), %xmm0
-	movdqa	0x30(%eax), %xmm1
-	add	$0x40, %eax
-
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm1, 0x30(%edx)
-	add	$0x40, %edx
-L(shl_0_mem_less_64bytes):
-	cmp	$0x20, %ecx
-	jl	L(shl_0_mem_less_32bytes)
-	movdqa	(%eax), %xmm0
-	sub	$0x20, %ecx
-	movdqa	0x10(%eax), %xmm1
-	add	$0x20, %eax
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-	add	$0x20, %edx
-L(shl_0_mem_less_32bytes):
-	cmp	$0x10, %ecx
-	jl	L(shl_0_mem_less_16bytes)
-	sub	$0x10, %ecx
-	movdqa	(%eax), %xmm0
-	add	$0x10, %eax
-	movdqa	%xmm0, (%edx)
-	add	$0x10, %edx
-L(shl_0_mem_less_16bytes):
-	add	%ecx, %edx
-	add	%ecx, %eax
-	POP (%esi)
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
-
-	ALIGN (4)
-L(shl_1):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-1(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_1_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$1, %xmm2, %xmm3
-	palignr	$1, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_1_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$1, %xmm2, %xmm3
-	palignr	$1, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_1_loop)
-
-L(shl_1_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	1(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_2):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-2(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_2_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$2, %xmm2, %xmm3
-	palignr	$2, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_2_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$2, %xmm2, %xmm3
-	palignr	$2, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_2_loop)
-
-L(shl_2_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	2(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_3):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-3(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_3_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$3, %xmm2, %xmm3
-	palignr	$3, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_3_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$3, %xmm2, %xmm3
-	palignr	$3, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_3_loop)
-
-L(shl_3_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	3(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_4):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-4(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_4_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$4, %xmm2, %xmm3
-	palignr	$4, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_4_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$4, %xmm2, %xmm3
-	palignr	$4, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_4_loop)
-
-L(shl_4_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	4(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_5):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-5(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_5_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$5, %xmm2, %xmm3
-	palignr	$5, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_5_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$5, %xmm2, %xmm3
-	palignr	$5, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_5_loop)
-
-L(shl_5_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	5(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-
-	ALIGN (4)
-L(shl_6):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-6(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_6_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$6, %xmm2, %xmm3
-	palignr	$6, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_6_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$6, %xmm2, %xmm3
-	palignr	$6, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_6_loop)
-
-L(shl_6_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	6(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_7):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-7(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_7_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$7, %xmm2, %xmm3
-	palignr	$7, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_7_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$7, %xmm2, %xmm3
-	palignr	$7, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_7_loop)
-
-L(shl_7_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	7(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_8):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-8(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_8_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$8, %xmm2, %xmm3
-	palignr	$8, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_8_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$8, %xmm2, %xmm3
-	palignr	$8, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_8_loop)
-
-L(shl_8_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	8(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_9):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-9(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_9_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$9, %xmm2, %xmm3
-	palignr	$9, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_9_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$9, %xmm2, %xmm3
-	palignr	$9, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_9_loop)
-
-L(shl_9_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	9(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_10):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-10(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_10_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$10, %xmm2, %xmm3
-	palignr	$10, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_10_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$10, %xmm2, %xmm3
-	palignr	$10, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_10_loop)
-
-L(shl_10_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	10(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_11):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-11(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_11_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$11, %xmm2, %xmm3
-	palignr	$11, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_11_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$11, %xmm2, %xmm3
-	palignr	$11, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_11_loop)
-
-L(shl_11_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	11(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_12):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-12(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_12_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$12, %xmm2, %xmm3
-	palignr	$12, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_12_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$12, %xmm2, %xmm3
-	palignr	$12, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_12_loop)
-
-L(shl_12_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	12(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_13):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-13(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_13_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$13, %xmm2, %xmm3
-	palignr	$13, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_13_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$13, %xmm2, %xmm3
-	palignr	$13, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_13_loop)
-
-L(shl_13_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	13(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_14):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-14(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_14_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$14, %xmm2, %xmm3
-	palignr	$14, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_14_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$14, %xmm2, %xmm3
-	palignr	$14, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_14_loop)
-
-L(shl_14_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	14(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-
-	ALIGN (4)
-L(shl_15):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-15(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_15_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$15, %xmm2, %xmm3
-	palignr	$15, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_15_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$15, %xmm2, %xmm3
-	palignr	$15, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_15_loop)
-
-L(shl_15_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	15(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-
-	ALIGN (4)
-L(fwd_write_44bytes):
-	movl	-44(%eax), %ecx
-	movl	%ecx, -44(%edx)
-L(fwd_write_40bytes):
-	movl	-40(%eax), %ecx
-	movl	%ecx, -40(%edx)
-L(fwd_write_36bytes):
-	movl	-36(%eax), %ecx
-	movl	%ecx, -36(%edx)
-L(fwd_write_32bytes):
-	movl	-32(%eax), %ecx
-	movl	%ecx, -32(%edx)
-L(fwd_write_28bytes):
-	movl	-28(%eax), %ecx
-	movl	%ecx, -28(%edx)
-L(fwd_write_24bytes):
-	movl	-24(%eax), %ecx
-	movl	%ecx, -24(%edx)
-L(fwd_write_20bytes):
-	movl	-20(%eax), %ecx
-	movl	%ecx, -20(%edx)
-L(fwd_write_16bytes):
-	movl	-16(%eax), %ecx
-	movl	%ecx, -16(%edx)
-L(fwd_write_12bytes):
-	movl	-12(%eax), %ecx
-	movl	%ecx, -12(%edx)
-L(fwd_write_8bytes):
-	movl	-8(%eax), %ecx
-	movl	%ecx, -8(%edx)
-L(fwd_write_4bytes):
-	movl	-4(%eax), %ecx
-	movl	%ecx, -4(%edx)
-L(fwd_write_0bytes):
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
-	RETURN
-
-	ALIGN (4)
-L(fwd_write_5bytes):
-	movl	-5(%eax), %ecx
-	movl	-4(%eax), %eax
-	movl	%ecx, -5(%edx)
-	movl	%eax, -4(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
-	RETURN
-
-	ALIGN (4)
-L(fwd_write_45bytes):
-	movl	-45(%eax), %ecx
-	movl	%ecx, -45(%edx)
-L(fwd_write_41bytes):
-	movl	-41(%eax), %ecx
-	movl	%ecx, -41(%edx)
-L(fwd_write_37bytes):
-	movl	-37(%eax), %ecx
-	movl	%ecx, -37(%edx)
-L(fwd_write_33bytes):
-	movl	-33(%eax), %ecx
-	movl	%ecx, -33(%edx)
-L(fwd_write_29bytes):
-	movl	-29(%eax), %ecx
-	movl	%ecx, -29(%edx)
-L(fwd_write_25bytes):
-	movl	-25(%eax), %ecx
-	movl	%ecx, -25(%edx)
-L(fwd_write_21bytes):
-	movl	-21(%eax), %ecx
-	movl	%ecx, -21(%edx)
-L(fwd_write_17bytes):
-	movl	-17(%eax), %ecx
-	movl	%ecx, -17(%edx)
-L(fwd_write_13bytes):
-	movl	-13(%eax), %ecx
-	movl	%ecx, -13(%edx)
-L(fwd_write_9bytes):
-	movl	-9(%eax), %ecx
-	movl	%ecx, -9(%edx)
-	movl	-5(%eax), %ecx
-	movl	%ecx, -5(%edx)
-L(fwd_write_1bytes):
-	movzbl	-1(%eax), %ecx
-	movb	%cl, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
-	RETURN
-
-	ALIGN (4)
-L(fwd_write_46bytes):
-	movl	-46(%eax), %ecx
-	movl	%ecx, -46(%edx)
-L(fwd_write_42bytes):
-	movl	-42(%eax), %ecx
-	movl	%ecx, -42(%edx)
-L(fwd_write_38bytes):
-	movl	-38(%eax), %ecx
-	movl	%ecx, -38(%edx)
-L(fwd_write_34bytes):
-	movl	-34(%eax), %ecx
-	movl	%ecx, -34(%edx)
-L(fwd_write_30bytes):
-	movl	-30(%eax), %ecx
-	movl	%ecx, -30(%edx)
-L(fwd_write_26bytes):
-	movl	-26(%eax), %ecx
-	movl	%ecx, -26(%edx)
-L(fwd_write_22bytes):
-	movl	-22(%eax), %ecx
-	movl	%ecx, -22(%edx)
-L(fwd_write_18bytes):
-	movl	-18(%eax), %ecx
-	movl	%ecx, -18(%edx)
-L(fwd_write_14bytes):
-	movl	-14(%eax), %ecx
-	movl	%ecx, -14(%edx)
-L(fwd_write_10bytes):
-	movl	-10(%eax), %ecx
-	movl	%ecx, -10(%edx)
-L(fwd_write_6bytes):
-	movl	-6(%eax), %ecx
-	movl	%ecx, -6(%edx)
-L(fwd_write_2bytes):
-	movzwl	-2(%eax), %ecx
-	movw	%cx, -2(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
-	RETURN
-
-	ALIGN (4)
-L(fwd_write_47bytes):
-	movl	-47(%eax), %ecx
-	movl	%ecx, -47(%edx)
-L(fwd_write_43bytes):
-	movl	-43(%eax), %ecx
-	movl	%ecx, -43(%edx)
-L(fwd_write_39bytes):
-	movl	-39(%eax), %ecx
-	movl	%ecx, -39(%edx)
-L(fwd_write_35bytes):
-	movl	-35(%eax), %ecx
-	movl	%ecx, -35(%edx)
-L(fwd_write_31bytes):
-	movl	-31(%eax), %ecx
-	movl	%ecx, -31(%edx)
-L(fwd_write_27bytes):
-	movl	-27(%eax), %ecx
-	movl	%ecx, -27(%edx)
-L(fwd_write_23bytes):
-	movl	-23(%eax), %ecx
-	movl	%ecx, -23(%edx)
-L(fwd_write_19bytes):
-	movl	-19(%eax), %ecx
-	movl	%ecx, -19(%edx)
-L(fwd_write_15bytes):
-	movl	-15(%eax), %ecx
-	movl	%ecx, -15(%edx)
-L(fwd_write_11bytes):
-	movl	-11(%eax), %ecx
-	movl	%ecx, -11(%edx)
-L(fwd_write_7bytes):
-	movl	-7(%eax), %ecx
-	movl	%ecx, -7(%edx)
-L(fwd_write_3bytes):
-	movzwl	-3(%eax), %ecx
-	movzbl	-1(%eax), %eax
-	movw	%cx, -3(%edx)
-	movb	%al, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
-	RETURN
-
-	ALIGN (4)
-L(large_page):
-	movdqu	(%eax), %xmm1
-	lea	16(%eax), %eax
-	movdqu	%xmm0, (%esi)
-	movntdq	%xmm1, (%edx)
-	lea	16(%edx), %edx
-	cmp	%al, %dl
-	je	L(copy_page_by_rep)
-L(large_page_loop_init):
-	POP (%esi)
-	lea	-0x90(%ecx), %ecx
-	POP (%edi)
-L(large_page_loop):
-	prefetchnta	0x1c0(%eax)
-	prefetchnta	0x280(%eax)
-	movdqu	(%eax), %xmm0
-	movdqu	0x10(%eax), %xmm1
-	movdqu	0x20(%eax), %xmm2
-	movdqu	0x30(%eax), %xmm3
-	movdqu	0x40(%eax), %xmm4
-	movdqu	0x50(%eax), %xmm5
-	movdqu	0x60(%eax), %xmm6
-	movdqu	0x70(%eax), %xmm7
-	lea	0x80(%eax), %eax
-	lfence
-	sub	$0x80, %ecx
-	movntdq	%xmm0, (%edx)
-	movntdq	%xmm1, 0x10(%edx)
-	movntdq	%xmm2, 0x20(%edx)
-	movntdq	%xmm3, 0x30(%edx)
-	movntdq	%xmm4, 0x40(%edx)
-	movntdq	%xmm5, 0x50(%edx)
-	movntdq	%xmm6, 0x60(%edx)
-	movntdq	%xmm7, 0x70(%edx)
-	lea	0x80(%edx), %edx
-	jae	L(large_page_loop)
-	cmp	$-0x40, %ecx
-	lea	0x80(%ecx), %ecx
-	jl	L(large_page_less_64bytes)
-
-	movdqu	(%eax), %xmm0
-	movdqu	0x10(%eax), %xmm1
-	movdqu	0x20(%eax), %xmm2
-	movdqu	0x30(%eax), %xmm3
-	lea	0x40(%eax), %eax
-
-	movntdq	%xmm0, (%edx)
-	movntdq	%xmm1, 0x10(%edx)
-	movntdq	%xmm2, 0x20(%edx)
-	movntdq	%xmm3, 0x30(%edx)
-	lea	0x40(%edx), %edx
-	sub	$0x40, %ecx
-L(large_page_less_64bytes):
-	cmp	$32, %ecx
-	jl	L(large_page_less_32bytes)
-	movdqu	(%eax), %xmm0
-	movdqu	0x10(%eax), %xmm1
-	lea	0x20(%eax), %eax
-	movntdq	%xmm0, (%edx)
-	movntdq	%xmm1, 0x10(%edx)
-	lea	0x20(%edx), %edx
-	sub	$0x20, %ecx
-L(large_page_less_32bytes):
-	add	%ecx, %edx
-	add	%ecx, %eax
-	sfence
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(copy_page_by_rep):
-	mov	%eax, %esi
-	mov	%edx, %edi
-	mov	%ecx, %edx
-	shr	$2, %ecx
-	and	$3, %edx
-	rep	movsl
-	jz	L(copy_page_by_rep_exit)
-	cmp	$2, %edx
-	jb	L(copy_page_by_rep_left_1)
-	movzwl	(%esi), %eax
-	movw	%ax, (%edi)
-	add	$2, %esi
-	add	$2, %edi
-	sub	$2, %edx
-	jz	L(copy_page_by_rep_exit)
-L(copy_page_by_rep_left_1):
-	movzbl	(%esi), %eax
-	movb	%al, (%edi)
-L(copy_page_by_rep_exit):
-	POP (%esi)
-	POP (%edi)
-#ifndef USE_AS_BCOPY
-	movl	DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-# endif
-#endif
-	RETURN
-
-	ALIGN (4)
-L(bk_write_44bytes):
-	movl	40(%eax), %ecx
-	movl	%ecx, 40(%edx)
-L(bk_write_40bytes):
-	movl	36(%eax), %ecx
-	movl	%ecx, 36(%edx)
-L(bk_write_36bytes):
-	movl	32(%eax), %ecx
-	movl	%ecx, 32(%edx)
-L(bk_write_32bytes):
-	movl	28(%eax), %ecx
-	movl	%ecx, 28(%edx)
-L(bk_write_28bytes):
-	movl	24(%eax), %ecx
-	movl	%ecx, 24(%edx)
-L(bk_write_24bytes):
-	movl	20(%eax), %ecx
-	movl	%ecx, 20(%edx)
-L(bk_write_20bytes):
-	movl	16(%eax), %ecx
-	movl	%ecx, 16(%edx)
-L(bk_write_16bytes):
-	movl	12(%eax), %ecx
-	movl	%ecx, 12(%edx)
-L(bk_write_12bytes):
-	movl	8(%eax), %ecx
-	movl	%ecx, 8(%edx)
-L(bk_write_8bytes):
-	movl	4(%eax), %ecx
-	movl	%ecx, 4(%edx)
-L(bk_write_4bytes):
-	movl	(%eax), %ecx
-	movl	%ecx, (%edx)
-L(bk_write_0bytes):
-#ifndef USE_AS_BCOPY
-	movl	DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-# endif
-#endif
-	RETURN
-
-	ALIGN (4)
-L(bk_write_45bytes):
-	movl	41(%eax), %ecx
-	movl	%ecx, 41(%edx)
-L(bk_write_41bytes):
-	movl	37(%eax), %ecx
-	movl	%ecx, 37(%edx)
-L(bk_write_37bytes):
-	movl	33(%eax), %ecx
-	movl	%ecx, 33(%edx)
-L(bk_write_33bytes):
-	movl	29(%eax), %ecx
-	movl	%ecx, 29(%edx)
-L(bk_write_29bytes):
-	movl	25(%eax), %ecx
-	movl	%ecx, 25(%edx)
-L(bk_write_25bytes):
-	movl	21(%eax), %ecx
-	movl	%ecx, 21(%edx)
-L(bk_write_21bytes):
-	movl	17(%eax), %ecx
-	movl	%ecx, 17(%edx)
-L(bk_write_17bytes):
-	movl	13(%eax), %ecx
-	movl	%ecx, 13(%edx)
-L(bk_write_13bytes):
-	movl	9(%eax), %ecx
-	movl	%ecx, 9(%edx)
-L(bk_write_9bytes):
-	movl	5(%eax), %ecx
-	movl	%ecx, 5(%edx)
-L(bk_write_5bytes):
-	movl	1(%eax), %ecx
-	movl	%ecx, 1(%edx)
-L(bk_write_1bytes):
-	movzbl	(%eax), %ecx
-	movb	%cl, (%edx)
-#ifndef USE_AS_BCOPY
-	movl	DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-# endif
-#endif
-	RETURN
-
-	ALIGN (4)
-L(bk_write_46bytes):
-	movl	42(%eax), %ecx
-	movl	%ecx, 42(%edx)
-L(bk_write_42bytes):
-	movl	38(%eax), %ecx
-	movl	%ecx, 38(%edx)
-L(bk_write_38bytes):
-	movl	34(%eax), %ecx
-	movl	%ecx, 34(%edx)
-L(bk_write_34bytes):
-	movl	30(%eax), %ecx
-	movl	%ecx, 30(%edx)
-L(bk_write_30bytes):
-	movl	26(%eax), %ecx
-	movl	%ecx, 26(%edx)
-L(bk_write_26bytes):
-	movl	22(%eax), %ecx
-	movl	%ecx, 22(%edx)
-L(bk_write_22bytes):
-	movl	18(%eax), %ecx
-	movl	%ecx, 18(%edx)
-L(bk_write_18bytes):
-	movl	14(%eax), %ecx
-	movl	%ecx, 14(%edx)
-L(bk_write_14bytes):
-	movl	10(%eax), %ecx
-	movl	%ecx, 10(%edx)
-L(bk_write_10bytes):
-	movl	6(%eax), %ecx
-	movl	%ecx, 6(%edx)
-L(bk_write_6bytes):
-	movl	2(%eax), %ecx
-	movl	%ecx, 2(%edx)
-L(bk_write_2bytes):
-	movzwl	(%eax), %ecx
-	movw	%cx, (%edx)
-#ifndef USE_AS_BCOPY
-	movl	DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-# endif
-#endif
-	RETURN
-
-	ALIGN (4)
-L(bk_write_47bytes):
-	movl	43(%eax), %ecx
-	movl	%ecx, 43(%edx)
-L(bk_write_43bytes):
-	movl	39(%eax), %ecx
-	movl	%ecx, 39(%edx)
-L(bk_write_39bytes):
-	movl	35(%eax), %ecx
-	movl	%ecx, 35(%edx)
-L(bk_write_35bytes):
-	movl	31(%eax), %ecx
-	movl	%ecx, 31(%edx)
-L(bk_write_31bytes):
-	movl	27(%eax), %ecx
-	movl	%ecx, 27(%edx)
-L(bk_write_27bytes):
-	movl	23(%eax), %ecx
-	movl	%ecx, 23(%edx)
-L(bk_write_23bytes):
-	movl	19(%eax), %ecx
-	movl	%ecx, 19(%edx)
-L(bk_write_19bytes):
-	movl	15(%eax), %ecx
-	movl	%ecx, 15(%edx)
-L(bk_write_15bytes):
-	movl	11(%eax), %ecx
-	movl	%ecx, 11(%edx)
-L(bk_write_11bytes):
-	movl	7(%eax), %ecx
-	movl	%ecx, 7(%edx)
-L(bk_write_7bytes):
-	movl	3(%eax), %ecx
-	movl	%ecx, 3(%edx)
-L(bk_write_3bytes):
-	movzwl	1(%eax), %ecx
-	movw	%cx, 1(%edx)
-	movzbl	(%eax), %eax
-	movb	%al, (%edx)
-#ifndef USE_AS_BCOPY
-	movl	DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-# endif
-#endif
-	RETURN_END
-
-
-	.pushsection .rodata.ssse3,"a",@progbits
-	ALIGN (2)
-L(table_48bytes_fwd):
-	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
-
-	ALIGN (2)
-L(shl_table):
-	.int	JMPTBL (L(shl_0), L(shl_table))
-	.int	JMPTBL (L(shl_1), L(shl_table))
-	.int	JMPTBL (L(shl_2), L(shl_table))
-	.int	JMPTBL (L(shl_3), L(shl_table))
-	.int	JMPTBL (L(shl_4), L(shl_table))
-	.int	JMPTBL (L(shl_5), L(shl_table))
-	.int	JMPTBL (L(shl_6), L(shl_table))
-	.int	JMPTBL (L(shl_7), L(shl_table))
-	.int	JMPTBL (L(shl_8), L(shl_table))
-	.int	JMPTBL (L(shl_9), L(shl_table))
-	.int	JMPTBL (L(shl_10), L(shl_table))
-	.int	JMPTBL (L(shl_11), L(shl_table))
-	.int	JMPTBL (L(shl_12), L(shl_table))
-	.int	JMPTBL (L(shl_13), L(shl_table))
-	.int	JMPTBL (L(shl_14), L(shl_table))
-	.int	JMPTBL (L(shl_15), L(shl_table))
-
-	ALIGN (2)
-L(table_48_bytes_bwd):
-	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
-
-	.popsection
-
-#ifdef USE_AS_MEMMOVE
-	ALIGN (4)
-L(copy_backward):
-	PUSH (%esi)
-	movl	%eax, %esi
-	lea	(%ecx,%edx,1),%edx
-	lea	(%ecx,%esi,1),%esi
-	testl	$0x3, %edx
-	jnz	L(bk_align)
-
-L(bk_aligned_4):
-	cmp	$64, %ecx
-	jge	L(bk_write_more64bytes)
-
-L(bk_write_64bytesless):
-	cmp	$32, %ecx
-	jl	L(bk_write_less32bytes)
-
-L(bk_write_more32bytes):
-	/* Copy 32 bytes at a time.  */
-	sub	$32, %ecx
-	movl	-4(%esi), %eax
-	movl	%eax, -4(%edx)
-	movl	-8(%esi), %eax
-	movl	%eax, -8(%edx)
-	movl	-12(%esi), %eax
-	movl	%eax, -12(%edx)
-	movl	-16(%esi), %eax
-	movl	%eax, -16(%edx)
-	movl	-20(%esi), %eax
-	movl	%eax, -20(%edx)
-	movl	-24(%esi), %eax
-	movl	%eax, -24(%edx)
-	movl	-28(%esi), %eax
-	movl	%eax, -28(%edx)
-	movl	-32(%esi), %eax
-	movl	%eax, -32(%edx)
-	sub	$32, %edx
-	sub	$32, %esi
-
-L(bk_write_less32bytes):
-	movl	%esi, %eax
-	sub	%ecx, %edx
-	sub	%ecx, %eax
-	POP (%esi)
-L(bk_write_less32bytes_2):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
-
-	ALIGN (4)
-L(bk_align):
-	cmp	$8, %ecx
-	jle	L(bk_write_less32bytes)
-	testl	$1, %edx
-	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
-	   then (EDX & 2) must be != 0.  */
-	jz	L(bk_got2)
-	sub	$1, %esi
-	sub	$1, %ecx
-	sub	$1, %edx
-	movzbl	(%esi), %eax
-	movb	%al, (%edx)
-
-	testl	$2, %edx
-	jz	L(bk_aligned_4)
-
-L(bk_got2):
-	sub	$2, %esi
-	sub	$2, %ecx
-	sub	$2, %edx
-	movzwl	(%esi), %eax
-	movw	%ax, (%edx)
-	jmp	L(bk_aligned_4)
-
-	ALIGN (4)
-L(bk_write_more64bytes):
-	/* Check alignment of last byte.  */
-	testl	$15, %edx
-	jz	L(bk_ssse3_cpy_pre)
-
-/* EDX is aligned 4 bytes, but not 16 bytes.  */
-L(bk_ssse3_align):
-	sub	$4, %esi
-	sub	$4, %ecx
-	sub	$4, %edx
-	movl	(%esi), %eax
-	movl	%eax, (%edx)
-
-	testl	$15, %edx
-	jz	L(bk_ssse3_cpy_pre)
-
-	sub	$4, %esi
-	sub	$4, %ecx
-	sub	$4, %edx
-	movl	(%esi), %eax
-	movl	%eax, (%edx)
-
-	testl	$15, %edx
-	jz	L(bk_ssse3_cpy_pre)
-
-	sub	$4, %esi
-	sub	$4, %ecx
-	sub	$4, %edx
-	movl	(%esi), %eax
-	movl	%eax, (%edx)
-
-L(bk_ssse3_cpy_pre):
-	cmp	$64, %ecx
-	jl	L(bk_write_more32bytes)
-
-L(bk_ssse3_cpy):
-	sub	$64, %esi
-	sub	$64, %ecx
-	sub	$64, %edx
-	movdqu	0x30(%esi), %xmm3
-	movdqa	%xmm3, 0x30(%edx)
-	movdqu	0x20(%esi), %xmm2
-	movdqa	%xmm2, 0x20(%edx)
-	movdqu	0x10(%esi), %xmm1
-	movdqa	%xmm1, 0x10(%edx)
-	movdqu	(%esi), %xmm0
-	movdqa	%xmm0, (%edx)
-	cmp	$64, %ecx
-	jge	L(bk_ssse3_cpy)
-	jmp	L(bk_write_64bytesless)
-
-#endif
-
-END (MEMCPY)
-
-#endif
diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
deleted file mode 100644
index 749c82d379..0000000000
--- a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
+++ /dev/null
@@ -1,1737 +0,0 @@
-/* memcpy with SSSE3
-   Copyright (C) 2010 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-#include <sysdep.h>
-
-#if !defined NOT_IN_libc \
-    && (defined SHARED \
-	|| defined USE_AS_MEMMOVE \
-	|| !defined USE_MULTIARCH)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY		__memcpy_ssse3
-# define MEMCPY_CHK	__memcpy_chk_ssse3
-#endif
-
-#ifdef USE_AS_BCOPY
-# define SRC		PARMS
-# define DEST		SRC+4
-# define LEN		DEST+4
-#else
-# define DEST		PARMS
-# define SRC		DEST+4
-# define LEN		SRC+4
-#endif
-
-#define CFI_PUSH(REG)						\
-  cfi_adjust_cfa_offset (4);					\
-  cfi_rel_offset (REG, 0)
-
-#define CFI_POP(REG)						\
-  cfi_adjust_cfa_offset (-4);					\
-  cfi_restore (REG)
-
-#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
-#define POP(REG)	popl REG; CFI_POP (REG)
-
-#ifdef SHARED
-# define PARMS		8		/* Preserve EBX.  */
-# define ENTRANCE	PUSH (%ebx);
-# define RETURN_END	POP (%ebx); ret
-# define RETURN		RETURN_END; CFI_PUSH (%ebx)
-# define JMPTBL(I, B)	I - B
-
-/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
-   jump table with relative offsets.  INDEX is a register contains the
-   index into the jump table.   SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-    /* We first load PC into EBX.  */				\
-    call	__i686.get_pc_thunk.bx;				\
-    /* Get the address of the jump table.  */			\
-    addl	$(TABLE - .), %ebx;				\
-    /* Get the entry and convert the relative offset to the	\
-       absolute address.  */					\
-    addl	(%ebx,INDEX,SCALE), %ebx;			\
-    /* We loaded the jump table.  Go.  */			\
-    jmp		*%ebx
-
-# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
-    addl	$(TABLE - .), %ebx
-
-# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
-    addl	(%ebx,INDEX,SCALE), %ebx;			\
-    /* We loaded the jump table.  Go.  */			\
-    jmp		*%ebx
-
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
-	ALIGN (4)
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
-	movl	(%esp), %ebx
-	ret
-#else
-# define PARMS		4
-# define ENTRANCE
-# define RETURN_END	ret
-# define RETURN		RETURN_END
-# define JMPTBL(I, B)	I
-
-/* Branch to an entry in a jump table.  TABLE is a jump table with
-   absolute offsets.  INDEX is a register contains the index into the
-   jump table.  SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-    jmp		*TABLE(,INDEX,SCALE)
-
-# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
-
-# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
-    jmp		*TABLE(,INDEX,SCALE)
-#endif
-
-	.section .text.ssse3,"ax",@progbits
-#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
-	movl	12(%esp), %eax
-	cmpl	%eax, 16(%esp)
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-ENTRY (MEMCPY)
-	ENTRANCE
-	movl	LEN(%esp), %ecx
-	movl	SRC(%esp), %eax
-	movl	DEST(%esp), %edx
-
-#ifdef USE_AS_MEMMOVE
-	cmp	%eax, %edx
-	jb	L(copy_forward)
-	je	L(fwd_write_0bytes)
-	cmp	$32, %ecx
-	jge	L(memmove_bwd)
-	jmp	L(bk_write_less32bytes_2)
-L(memmove_bwd):
-	add	%ecx, %eax
-	cmp	%eax, %edx
-	movl	SRC(%esp), %eax
-	jb	L(copy_backward)
-
-L(copy_forward):
-#endif
-	cmp	$48, %ecx
-	jge	L(48bytesormore)
-
-L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
-	cmp	%dl, %al
-	jl	L(bk_write)
-#endif
-	add	%ecx, %edx
-	add	%ecx, %eax
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-#ifndef USE_AS_MEMMOVE
-L(bk_write):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
-#endif
-
-	ALIGN (4)
-/* ECX > 32 and EDX is 4 byte aligned.  */
-L(48bytesormore):
-	movdqu	(%eax), %xmm0
-	PUSH (%edi)
-	movl	%edx, %edi
-	and	$-16, %edx
-	PUSH (%esi)
-	add	$16, %edx
-	movl	%edi, %esi
-	sub	%edx, %edi
-	add	%edi, %ecx
-	sub	%edi, %eax
-
-#ifdef SHARED_CACHE_SIZE_HALF
-	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
-#else
-# ifdef SHARED
-	call	__i686.get_pc_thunk.bx
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_shared_cache_size_half, %ecx
-# endif
-#endif
-
-	mov	%eax, %edi
-	jge	L(large_page)
-	and	$0xf, %edi
-	jz	L(shl_0)
-
-	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
-
-	ALIGN (4)
-L(shl_0):
-	movdqu	%xmm0, (%esi)
-	xor	%edi, %edi
-	POP (%esi)
-	cmp	$127, %ecx
-	ja	L(shl_0_gobble)
-	lea	-32(%ecx), %ecx
-L(shl_0_loop):
-	movdqa	(%eax, %edi), %xmm0
-	movdqa	16(%eax, %edi), %xmm1
-	sub	$32, %ecx
-	movdqa	%xmm0, (%edx, %edi)
-	movdqa	%xmm1, 16(%edx, %edi)
-	lea	32(%edi), %edi
-	jl	L(shl_0_end)
-
-	movdqa	(%eax, %edi), %xmm0
-	movdqa	16(%eax, %edi), %xmm1
-	sub	$32, %ecx
-	movdqa	%xmm0, (%edx, %edi)
-	movdqa	%xmm1, 16(%edx, %edi)
-	lea	32(%edi), %edi
-	jl	L(shl_0_end)
-
-	movdqa	(%eax, %edi), %xmm0
-	movdqa	16(%eax, %edi), %xmm1
-	sub	$32, %ecx
-	movdqa	%xmm0, (%edx, %edi)
-	movdqa	%xmm1, 16(%edx, %edi)
-	lea	32(%edi), %edi
-	jl	L(shl_0_end)
-
-	movdqa	(%eax, %edi), %xmm0
-	movdqa	16(%eax, %edi), %xmm1
-	sub	$32, %ecx
-	movdqa	%xmm0, (%edx, %edi)
-	movdqa	%xmm1, 16(%edx, %edi)
-	lea	32(%edi), %edi
-L(shl_0_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	add	%edi, %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
-L(shl_0_gobble):
-
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# ifdef SHARED
-	call	__i686.get_pc_thunk.bx
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
-	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-
-	POP (%edi)
-	lea	-128(%ecx), %ecx
-	jge	L(shl_0_gobble_mem_loop)
-L(shl_0_gobble_cache_loop):
-	movdqa	(%eax), %xmm0
-	movdqa	0x10(%eax), %xmm1
-	movdqa	0x20(%eax), %xmm2
-	movdqa	0x30(%eax), %xmm3
-	movdqa	0x40(%eax), %xmm4
-	movdqa	0x50(%eax), %xmm5
-	movdqa	0x60(%eax), %xmm6
-	movdqa	0x70(%eax), %xmm7
-	lea	0x80(%eax), %eax
-	sub	$128, %ecx
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-	movdqa	%xmm2, 0x20(%edx)
-	movdqa	%xmm3, 0x30(%edx)
-	movdqa	%xmm4, 0x40(%edx)
-	movdqa	%xmm5, 0x50(%edx)
-	movdqa	%xmm6, 0x60(%edx)
-	movdqa	%xmm7, 0x70(%edx)
-	lea	0x80(%edx), %edx
-
-	jge	L(shl_0_gobble_cache_loop)
-L(shl_0_gobble_cache_loop_tail):
-	cmp	$-0x40, %ecx
-	lea	0x80(%ecx), %ecx
-	jl	L(shl_0_cache_less_64bytes)
-
-	movdqa	(%eax), %xmm0
-	sub	$0x40, %ecx
-	movdqa	0x10(%eax), %xmm1
-
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-
-	movdqa	0x20(%eax), %xmm0
-	movdqa	0x30(%eax), %xmm1
-	add	$0x40, %eax
-
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm1, 0x30(%edx)
-	add	$0x40, %edx
-L(shl_0_cache_less_64bytes):
-	cmp	$0x20, %ecx
-	jl	L(shl_0_cache_less_32bytes)
-	movdqa	(%eax), %xmm0
-	sub	$0x20, %ecx
-	movdqa	0x10(%eax), %xmm1
-	add	$0x20, %eax
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-	add	$0x20, %edx
-L(shl_0_cache_less_32bytes):
-	cmp	$0x10, %ecx
-	jl	L(shl_0_cache_less_16bytes)
-	sub	$0x10, %ecx
-	movdqa	(%eax), %xmm0
-	add	$0x10, %eax
-	movdqa	%xmm0, (%edx)
-	add	$0x10, %edx
-L(shl_0_cache_less_16bytes):
-	add	%ecx, %edx
-	add	%ecx, %eax
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
-
-	ALIGN (4)
-L(shl_0_gobble_mem_loop):
-	prefetcht0 0x1c0(%eax)
-	prefetcht0 0x280(%eax)
-	prefetcht0 0x1c0(%edx)
-
-	movdqa	(%eax), %xmm0
-	movdqa	0x10(%eax), %xmm1
-	movdqa	0x20(%eax), %xmm2
-	movdqa	0x30(%eax), %xmm3
-	movdqa	0x40(%eax), %xmm4
-	movdqa	0x50(%eax), %xmm5
-	movdqa	0x60(%eax), %xmm6
-	movdqa	0x70(%eax), %xmm7
-	lea	0x80(%eax), %eax
-	sub	$0x80, %ecx
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-	movdqa	%xmm2, 0x20(%edx)
-	movdqa	%xmm3, 0x30(%edx)
-	movdqa	%xmm4, 0x40(%edx)
-	movdqa	%xmm5, 0x50(%edx)
-	movdqa	%xmm6, 0x60(%edx)
-	movdqa	%xmm7, 0x70(%edx)
-	lea	0x80(%edx), %edx
-
-	jge	L(shl_0_gobble_mem_loop)
-	cmp	$-0x40, %ecx
-	lea	0x80(%ecx), %ecx
-	jl	L(shl_0_mem_less_64bytes)
-
-	movdqa	(%eax), %xmm0
-	sub	$0x40, %ecx
-	movdqa	0x10(%eax), %xmm1
-
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-
-	movdqa	0x20(%eax), %xmm0
-	movdqa	0x30(%eax), %xmm1
-	add	$0x40, %eax
-
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm1, 0x30(%edx)
-	add	$0x40, %edx
-L(shl_0_mem_less_64bytes):
-	cmp	$0x20, %ecx
-	jl	L(shl_0_mem_less_32bytes)
-	movdqa	(%eax), %xmm0
-	sub	$0x20, %ecx
-	movdqa	0x10(%eax), %xmm1
-	add	$0x20, %eax
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm1, 0x10(%edx)
-	add	$0x20, %edx
-L(shl_0_mem_less_32bytes):
-	cmp	$0x10, %ecx
-	jl	L(shl_0_mem_less_16bytes)
-	sub	$0x10, %ecx
-	movdqa	(%eax), %xmm0
-	add	$0x10, %eax
-	movdqa	%xmm0, (%edx)
-	add	$0x10, %edx
-L(shl_0_mem_less_16bytes):
-	add	%ecx, %edx
-	add	%ecx, %eax
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
-
-	ALIGN (4)
-L(shl_1):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-1(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_1_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$1, %xmm2, %xmm3
-	palignr	$1, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_1_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$1, %xmm2, %xmm3
-	palignr	$1, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_1_loop)
-
-L(shl_1_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	1(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_2):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-2(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_2_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$2, %xmm2, %xmm3
-	palignr	$2, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_2_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$2, %xmm2, %xmm3
-	palignr	$2, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_2_loop)
-
-L(shl_2_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	2(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_3):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-3(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_3_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$3, %xmm2, %xmm3
-	palignr	$3, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_3_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$3, %xmm2, %xmm3
-	palignr	$3, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_3_loop)
-
-L(shl_3_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	3(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_4):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-4(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_4_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$4, %xmm2, %xmm3
-	palignr	$4, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_4_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$4, %xmm2, %xmm3
-	palignr	$4, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_4_loop)
-
-L(shl_4_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	4(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_5):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-5(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_5_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$5, %xmm2, %xmm3
-	palignr	$5, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_5_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$5, %xmm2, %xmm3
-	palignr	$5, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_5_loop)
-
-L(shl_5_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	5(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-
-	ALIGN (4)
-L(shl_6):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-6(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_6_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$6, %xmm2, %xmm3
-	palignr	$6, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_6_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$6, %xmm2, %xmm3
-	palignr	$6, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_6_loop)
-
-L(shl_6_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	6(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_7):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-7(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_7_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$7, %xmm2, %xmm3
-	palignr	$7, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_7_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$7, %xmm2, %xmm3
-	palignr	$7, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_7_loop)
-
-L(shl_7_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	7(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_8):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-8(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_8_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$8, %xmm2, %xmm3
-	palignr	$8, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_8_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$8, %xmm2, %xmm3
-	palignr	$8, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_8_loop)
-
-L(shl_8_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	8(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_9):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-9(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_9_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$9, %xmm2, %xmm3
-	palignr	$9, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_9_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$9, %xmm2, %xmm3
-	palignr	$9, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_9_loop)
-
-L(shl_9_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	9(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_10):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-10(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_10_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$10, %xmm2, %xmm3
-	palignr	$10, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_10_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$10, %xmm2, %xmm3
-	palignr	$10, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_10_loop)
-
-L(shl_10_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	10(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_11):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-11(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_11_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$11, %xmm2, %xmm3
-	palignr	$11, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_11_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$11, %xmm2, %xmm3
-	palignr	$11, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_11_loop)
-
-L(shl_11_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	11(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_12):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-12(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_12_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$12, %xmm2, %xmm3
-	palignr	$12, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_12_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$12, %xmm2, %xmm3
-	palignr	$12, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_12_loop)
-
-L(shl_12_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	12(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_13):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-13(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_13_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$13, %xmm2, %xmm3
-	palignr	$13, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_13_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$13, %xmm2, %xmm3
-	palignr	$13, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_13_loop)
-
-L(shl_13_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	13(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	ALIGN (4)
-L(shl_14):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-14(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_14_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$14, %xmm2, %xmm3
-	palignr	$14, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_14_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$14, %xmm2, %xmm3
-	palignr	$14, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_14_loop)
-
-L(shl_14_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	14(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-
-	ALIGN (4)
-L(shl_15):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-15(%eax), %eax
-	movaps	(%eax), %xmm1
-	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_15_loop):
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm4
-	palignr	$15, %xmm2, %xmm3
-	palignr	$15, %xmm1, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jl	L(shl_15_end)
-
-	movdqa	16(%eax, %edi), %xmm2
-	sub	$32, %ecx
-	movdqa	32(%eax, %edi), %xmm3
-	movdqa	%xmm3, %xmm1
-	palignr	$15, %xmm2, %xmm3
-	palignr	$15, %xmm4, %xmm2
-	lea	32(%edi), %edi
-	movdqa	%xmm2, -32(%edx, %edi)
-	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_15_loop)
-
-L(shl_15_end):
-	lea	32(%ecx), %ecx
-	add	%ecx, %edi
-	add	%edi, %edx
-	lea	15(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-
-	ALIGN (4)
-L(fwd_write_44bytes):
-	movl	-44(%eax), %ecx
-	movl	%ecx, -44(%edx)
-L(fwd_write_40bytes):
-	movl	-40(%eax), %ecx
-	movl	%ecx, -40(%edx)
-L(fwd_write_36bytes):
-	movl	-36(%eax), %ecx
-	movl	%ecx, -36(%edx)
-L(fwd_write_32bytes):
-	movl	-32(%eax), %ecx
-	movl	%ecx, -32(%edx)
-L(fwd_write_28bytes):
-	movl	-28(%eax), %ecx
-	movl	%ecx, -28(%edx)
-L(fwd_write_24bytes):
-	movl	-24(%eax), %ecx
-	movl	%ecx, -24(%edx)
-L(fwd_write_20bytes):
-	movl	-20(%eax), %ecx
-	movl	%ecx, -20(%edx)
-L(fwd_write_16bytes):
-	movl	-16(%eax), %ecx
-	movl	%ecx, -16(%edx)
-L(fwd_write_12bytes):
-	movl	-12(%eax), %ecx
-	movl	%ecx, -12(%edx)
-L(fwd_write_8bytes):
-	movl	-8(%eax), %ecx
-	movl	%ecx, -8(%edx)
-L(fwd_write_4bytes):
-	movl	-4(%eax), %ecx
-	movl	%ecx, -4(%edx)
-L(fwd_write_0bytes):
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
-	RETURN
-
-	ALIGN (4)
-L(fwd_write_5bytes):
-	movl	-5(%eax), %ecx
-	movl	-4(%eax), %eax
-	movl	%ecx, -5(%edx)
-	movl	%eax, -4(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
-	RETURN
-
-	ALIGN (4)
-L(fwd_write_45bytes):
-	movl	-45(%eax), %ecx
-	movl	%ecx, -45(%edx)
-L(fwd_write_41bytes):
-	movl	-41(%eax), %ecx
-	movl	%ecx, -41(%edx)
-L(fwd_write_37bytes):
-	movl	-37(%eax), %ecx
-	movl	%ecx, -37(%edx)
-L(fwd_write_33bytes):
-	movl	-33(%eax), %ecx
-	movl	%ecx, -33(%edx)
-L(fwd_write_29bytes):
-	movl	-29(%eax), %ecx
-	movl	%ecx, -29(%edx)
-L(fwd_write_25bytes):
-	movl	-25(%eax), %ecx
-	movl	%ecx, -25(%edx)
-L(fwd_write_21bytes):
-	movl	-21(%eax), %ecx
-	movl	%ecx, -21(%edx)
-L(fwd_write_17bytes):
-	movl	-17(%eax), %ecx
-	movl	%ecx, -17(%edx)
-L(fwd_write_13bytes):
-	movl	-13(%eax), %ecx
-	movl	%ecx, -13(%edx)
-L(fwd_write_9bytes):
-	movl	-9(%eax), %ecx
-	movl	%ecx, -9(%edx)
-	movl	-5(%eax), %ecx
-	movl	%ecx, -5(%edx)
-L(fwd_write_1bytes):
-	movzbl	-1(%eax), %ecx
-	movb	%cl, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
-	RETURN
-
-	ALIGN (4)
-L(fwd_write_46bytes):
-	movl	-46(%eax), %ecx
-	movl	%ecx, -46(%edx)
-L(fwd_write_42bytes):
-	movl	-42(%eax), %ecx
-	movl	%ecx, -42(%edx)
-L(fwd_write_38bytes):
-	movl	-38(%eax), %ecx
-	movl	%ecx, -38(%edx)
-L(fwd_write_34bytes):
-	movl	-34(%eax), %ecx
-	movl	%ecx, -34(%edx)
-L(fwd_write_30bytes):
-	movl	-30(%eax), %ecx
-	movl	%ecx, -30(%edx)
-L(fwd_write_26bytes):
-	movl	-26(%eax), %ecx
-	movl	%ecx, -26(%edx)
-L(fwd_write_22bytes):
-	movl	-22(%eax), %ecx
-	movl	%ecx, -22(%edx)
-L(fwd_write_18bytes):
-	movl	-18(%eax), %ecx
-	movl	%ecx, -18(%edx)
-L(fwd_write_14bytes):
-	movl	-14(%eax), %ecx
-	movl	%ecx, -14(%edx)
-L(fwd_write_10bytes):
-	movl	-10(%eax), %ecx
-	movl	%ecx, -10(%edx)
-L(fwd_write_6bytes):
-	movl	-6(%eax), %ecx
-	movl	%ecx, -6(%edx)
-L(fwd_write_2bytes):
-	movzwl	-2(%eax), %ecx
-	movw	%cx, -2(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
-	RETURN
-
-	ALIGN (4)
-L(fwd_write_47bytes):
-	movl	-47(%eax), %ecx
-	movl	%ecx, -47(%edx)
-L(fwd_write_43bytes):
-	movl	-43(%eax), %ecx
-	movl	%ecx, -43(%edx)
-L(fwd_write_39bytes):
-	movl	-39(%eax), %ecx
-	movl	%ecx, -39(%edx)
-L(fwd_write_35bytes):
-	movl	-35(%eax), %ecx
-	movl	%ecx, -35(%edx)
-L(fwd_write_31bytes):
-	movl	-31(%eax), %ecx
-	movl	%ecx, -31(%edx)
-L(fwd_write_27bytes):
-	movl	-27(%eax), %ecx
-	movl	%ecx, -27(%edx)
-L(fwd_write_23bytes):
-	movl	-23(%eax), %ecx
-	movl	%ecx, -23(%edx)
-L(fwd_write_19bytes):
-	movl	-19(%eax), %ecx
-	movl	%ecx, -19(%edx)
-L(fwd_write_15bytes):
-	movl	-15(%eax), %ecx
-	movl	%ecx, -15(%edx)
-L(fwd_write_11bytes):
-	movl	-11(%eax), %ecx
-	movl	%ecx, -11(%edx)
-L(fwd_write_7bytes):
-	movl	-7(%eax), %ecx
-	movl	%ecx, -7(%edx)
-L(fwd_write_3bytes):
-	movzwl	-3(%eax), %ecx
-	movzbl	-1(%eax), %eax
-	movw	%cx, -3(%edx)
-	movb	%al, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
-	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
-	RETURN
-
-	ALIGN (4)
-L(large_page):
-	movdqu	(%eax), %xmm1
-	lea	16(%eax), %eax
-	movdqu	%xmm0, (%esi)
-	movntdq	%xmm1, (%edx)
-	lea	16(%edx), %edx
-	POP (%esi)
-	lea	-0x90(%ecx), %ecx
-	POP (%edi)
-L(large_page_loop):
-	movdqu	(%eax), %xmm0
-	movdqu	0x10(%eax), %xmm1
-	movdqu	0x20(%eax), %xmm2
-	movdqu	0x30(%eax), %xmm3
-	movdqu	0x40(%eax), %xmm4
-	movdqu	0x50(%eax), %xmm5
-	movdqu	0x60(%eax), %xmm6
-	movdqu	0x70(%eax), %xmm7
-	lea	0x80(%eax), %eax
-
-	sub	$0x80, %ecx
-	movntdq	%xmm0, (%edx)
-	movntdq	%xmm1, 0x10(%edx)
-	movntdq	%xmm2, 0x20(%edx)
-	movntdq	%xmm3, 0x30(%edx)
-	movntdq	%xmm4, 0x40(%edx)
-	movntdq	%xmm5, 0x50(%edx)
-	movntdq	%xmm6, 0x60(%edx)
-	movntdq	%xmm7, 0x70(%edx)
-	lea	0x80(%edx), %edx
-	jae	L(large_page_loop)
-	cmp	$-0x40, %ecx
-	lea	0x80(%ecx), %ecx
-	jl	L(large_page_less_64bytes)
-
-	movdqu	(%eax), %xmm0
-	movdqu	0x10(%eax), %xmm1
-	movdqu	0x20(%eax), %xmm2
-	movdqu	0x30(%eax), %xmm3
-	lea	0x40(%eax), %eax
-
-	movntdq	%xmm0, (%edx)
-	movntdq	%xmm1, 0x10(%edx)
-	movntdq	%xmm2, 0x20(%edx)
-	movntdq	%xmm3, 0x30(%edx)
-	lea	0x40(%edx), %edx
-	sub	$0x40, %ecx
-L(large_page_less_64bytes):
-	cmp	$32, %ecx
-	jl	L(large_page_less_32bytes)
-	movdqu	(%eax), %xmm0
-	movdqu	0x10(%eax), %xmm1
-	lea	0x20(%eax), %eax
-	movntdq	%xmm0, (%edx)
-	movntdq	%xmm1, 0x10(%edx)
-	lea	0x20(%edx), %edx
-	sub	$0x20, %ecx
-L(large_page_less_32bytes):
-	add	%ecx, %edx
-	add	%ecx, %eax
-	sfence
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
-
-	ALIGN (4)
-L(bk_write_44bytes):
-	movl	40(%eax), %ecx
-	movl	%ecx, 40(%edx)
-L(bk_write_40bytes):
-	movl	36(%eax), %ecx
-	movl	%ecx, 36(%edx)
-L(bk_write_36bytes):
-	movl	32(%eax), %ecx
-	movl	%ecx, 32(%edx)
-L(bk_write_32bytes):
-	movl	28(%eax), %ecx
-	movl	%ecx, 28(%edx)
-L(bk_write_28bytes):
-	movl	24(%eax), %ecx
-	movl	%ecx, 24(%edx)
-L(bk_write_24bytes):
-	movl	20(%eax), %ecx
-	movl	%ecx, 20(%edx)
-L(bk_write_20bytes):
-	movl	16(%eax), %ecx
-	movl	%ecx, 16(%edx)
-L(bk_write_16bytes):
-	movl	12(%eax), %ecx
-	movl	%ecx, 12(%edx)
-L(bk_write_12bytes):
-	movl	8(%eax), %ecx
-	movl	%ecx, 8(%edx)
-L(bk_write_8bytes):
-	movl	4(%eax), %ecx
-	movl	%ecx, 4(%edx)
-L(bk_write_4bytes):
-	movl	(%eax), %ecx
-	movl	%ecx, (%edx)
-L(bk_write_0bytes):
-#ifndef USE_AS_BCOPY
-	movl	DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-# endif
-#endif
-	RETURN
-
-	ALIGN (4)
-L(bk_write_45bytes):
-	movl	41(%eax), %ecx
-	movl	%ecx, 41(%edx)
-L(bk_write_41bytes):
-	movl	37(%eax), %ecx
-	movl	%ecx, 37(%edx)
-L(bk_write_37bytes):
-	movl	33(%eax), %ecx
-	movl	%ecx, 33(%edx)
-L(bk_write_33bytes):
-	movl	29(%eax), %ecx
-	movl	%ecx, 29(%edx)
-L(bk_write_29bytes):
-	movl	25(%eax), %ecx
-	movl	%ecx, 25(%edx)
-L(bk_write_25bytes):
-	movl	21(%eax), %ecx
-	movl	%ecx, 21(%edx)
-L(bk_write_21bytes):
-	movl	17(%eax), %ecx
-	movl	%ecx, 17(%edx)
-L(bk_write_17bytes):
-	movl	13(%eax), %ecx
-	movl	%ecx, 13(%edx)
-L(bk_write_13bytes):
-	movl	9(%eax), %ecx
-	movl	%ecx, 9(%edx)
-L(bk_write_9bytes):
-	movl	5(%eax), %ecx
-	movl	%ecx, 5(%edx)
-L(bk_write_5bytes):
-	movl	1(%eax), %ecx
-	movl	%ecx, 1(%edx)
-L(bk_write_1bytes):
-	movzbl	(%eax), %ecx
-	movb	%cl, (%edx)
-#ifndef USE_AS_BCOPY
-	movl	DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-# endif
-#endif
-	RETURN
-
-	ALIGN (4)
-L(bk_write_46bytes):
-	movl	42(%eax), %ecx
-	movl	%ecx, 42(%edx)
-L(bk_write_42bytes):
-	movl	38(%eax), %ecx
-	movl	%ecx, 38(%edx)
-L(bk_write_38bytes):
-	movl	34(%eax), %ecx
-	movl	%ecx, 34(%edx)
-L(bk_write_34bytes):
-	movl	30(%eax), %ecx
-	movl	%ecx, 30(%edx)
-L(bk_write_30bytes):
-	movl	26(%eax), %ecx
-	movl	%ecx, 26(%edx)
-L(bk_write_26bytes):
-	movl	22(%eax), %ecx
-	movl	%ecx, 22(%edx)
-L(bk_write_22bytes):
-	movl	18(%eax), %ecx
-	movl	%ecx, 18(%edx)
-L(bk_write_18bytes):
-	movl	14(%eax), %ecx
-	movl	%ecx, 14(%edx)
-L(bk_write_14bytes):
-	movl	10(%eax), %ecx
-	movl	%ecx, 10(%edx)
-L(bk_write_10bytes):
-	movl	6(%eax), %ecx
-	movl	%ecx, 6(%edx)
-L(bk_write_6bytes):
-	movl	2(%eax), %ecx
-	movl	%ecx, 2(%edx)
-L(bk_write_2bytes):
-	movzwl	(%eax), %ecx
-	movw	%cx, (%edx)
-#ifndef USE_AS_BCOPY
-	movl	DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-# endif
-#endif
-	RETURN
-
-	ALIGN (4)
-L(bk_write_47bytes):
-	movl	43(%eax), %ecx
-	movl	%ecx, 43(%edx)
-L(bk_write_43bytes):
-	movl	39(%eax), %ecx
-	movl	%ecx, 39(%edx)
-L(bk_write_39bytes):
-	movl	35(%eax), %ecx
-	movl	%ecx, 35(%edx)
-L(bk_write_35bytes):
-	movl	31(%eax), %ecx
-	movl	%ecx, 31(%edx)
-L(bk_write_31bytes):
-	movl	27(%eax), %ecx
-	movl	%ecx, 27(%edx)
-L(bk_write_27bytes):
-	movl	23(%eax), %ecx
-	movl	%ecx, 23(%edx)
-L(bk_write_23bytes):
-	movl	19(%eax), %ecx
-	movl	%ecx, 19(%edx)
-L(bk_write_19bytes):
-	movl	15(%eax), %ecx
-	movl	%ecx, 15(%edx)
-L(bk_write_15bytes):
-	movl	11(%eax), %ecx
-	movl	%ecx, 11(%edx)
-L(bk_write_11bytes):
-	movl	7(%eax), %ecx
-	movl	%ecx, 7(%edx)
-L(bk_write_7bytes):
-	movl	3(%eax), %ecx
-	movl	%ecx, 3(%edx)
-L(bk_write_3bytes):
-	movzwl	1(%eax), %ecx
-	movw	%cx, 1(%edx)
-	movzbl	(%eax), %eax
-	movb	%al, (%edx)
-#ifndef USE_AS_BCOPY
-	movl	DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-# endif
-#endif
-	RETURN_END
-
-
-	.pushsection .rodata.ssse3,"a",@progbits
-	ALIGN (2)
-L(table_48bytes_fwd):
-	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
-	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
-
-	ALIGN (2)
-L(shl_table):
-	.int	JMPTBL (L(shl_0), L(shl_table))
-	.int	JMPTBL (L(shl_1), L(shl_table))
-	.int	JMPTBL (L(shl_2), L(shl_table))
-	.int	JMPTBL (L(shl_3), L(shl_table))
-	.int	JMPTBL (L(shl_4), L(shl_table))
-	.int	JMPTBL (L(shl_5), L(shl_table))
-	.int	JMPTBL (L(shl_6), L(shl_table))
-	.int	JMPTBL (L(shl_7), L(shl_table))
-	.int	JMPTBL (L(shl_8), L(shl_table))
-	.int	JMPTBL (L(shl_9), L(shl_table))
-	.int	JMPTBL (L(shl_10), L(shl_table))
-	.int	JMPTBL (L(shl_11), L(shl_table))
-	.int	JMPTBL (L(shl_12), L(shl_table))
-	.int	JMPTBL (L(shl_13), L(shl_table))
-	.int	JMPTBL (L(shl_14), L(shl_table))
-	.int	JMPTBL (L(shl_15), L(shl_table))
-
-	ALIGN (2)
-L(table_48_bytes_bwd):
-	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
-	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
-
-	.popsection
-
-#ifdef USE_AS_MEMMOVE
-	ALIGN (4)
-L(copy_backward):
-	PUSH (%esi)
-	movl	%eax, %esi
-	lea	(%ecx,%edx,1),%edx
-	lea	(%ecx,%esi,1),%esi
-	testl	$0x3, %edx
-	jnz	L(bk_align)
-
-L(bk_aligned_4):
-	cmp	$64, %ecx
-	jge	L(bk_write_more64bytes)
-
-L(bk_write_64bytesless):
-	cmp	$32, %ecx
-	jl	L(bk_write_less32bytes)
-
-L(bk_write_more32bytes):
-	/* Copy 32 bytes at a time.  */
-	sub	$32, %ecx
-	movl	-4(%esi), %eax
-	movl	%eax, -4(%edx)
-	movl	-8(%esi), %eax
-	movl	%eax, -8(%edx)
-	movl	-12(%esi), %eax
-	movl	%eax, -12(%edx)
-	movl	-16(%esi), %eax
-	movl	%eax, -16(%edx)
-	movl	-20(%esi), %eax
-	movl	%eax, -20(%edx)
-	movl	-24(%esi), %eax
-	movl	%eax, -24(%edx)
-	movl	-28(%esi), %eax
-	movl	%eax, -28(%edx)
-	movl	-32(%esi), %eax
-	movl	%eax, -32(%edx)
-	sub	$32, %edx
-	sub	$32, %esi
-
-L(bk_write_less32bytes):
-	movl	%esi, %eax
-	sub	%ecx, %edx
-	sub	%ecx, %eax
-	POP (%esi)
-L(bk_write_less32bytes_2):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
-
-	ALIGN (4)
-L(bk_align):
-	cmp	$8, %ecx
-	jle	L(bk_write_less32bytes)
-	testl	$1, %edx
-	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
-	   then (EDX & 2) must be != 0.  */
-	jz	L(bk_got2)
-	sub	$1, %esi
-	sub	$1, %ecx
-	sub	$1, %edx
-	movzbl	(%esi), %eax
-	movb	%al, (%edx)
-
-	testl	$2, %edx
-	jz	L(bk_aligned_4)
-
-L(bk_got2):
-	sub	$2, %esi
-	sub	$2, %ecx
-	sub	$2, %edx
-	movzwl	(%esi), %eax
-	movw	%ax, (%edx)
-	jmp	L(bk_aligned_4)
-
-	ALIGN (4)
-L(bk_write_more64bytes):
-	/* Check alignment of last byte.  */
-	testl	$15, %edx
-	jz	L(bk_ssse3_cpy_pre)
-
-/* EDX is aligned 4 bytes, but not 16 bytes.  */
-L(bk_ssse3_align):
-	sub	$4, %esi
-	sub	$4, %ecx
-	sub	$4, %edx
-	movl	(%esi), %eax
-	movl	%eax, (%edx)
-
-	testl	$15, %edx
-	jz	L(bk_ssse3_cpy_pre)
-
-	sub	$4, %esi
-	sub	$4, %ecx
-	sub	$4, %edx
-	movl	(%esi), %eax
-	movl	%eax, (%edx)
-
-	testl	$15, %edx
-	jz	L(bk_ssse3_cpy_pre)
-
-	sub	$4, %esi
-	sub	$4, %ecx
-	sub	$4, %edx
-	movl	(%esi), %eax
-	movl	%eax, (%edx)
-
-L(bk_ssse3_cpy_pre):
-	cmp	$64, %ecx
-	jl	L(bk_write_more32bytes)
-
-L(bk_ssse3_cpy):
-	sub	$64, %esi
-	sub	$64, %ecx
-	sub	$64, %edx
-	movdqu	0x30(%esi), %xmm3
-	movdqa	%xmm3, 0x30(%edx)
-	movdqu	0x20(%esi), %xmm2
-	movdqa	%xmm2, 0x20(%edx)
-	movdqu	0x10(%esi), %xmm1
-	movdqa	%xmm1, 0x10(%edx)
-	movdqu	(%esi), %xmm0
-	movdqa	%xmm0, (%edx)
-	cmp	$64, %ecx
-	jge	L(bk_ssse3_cpy)
-	jmp	L(bk_write_64bytesless)
-
-#endif
-
-END (MEMCPY)
-
-#endif
diff --git a/sysdeps/i386/i686/multiarch/memcpy.S b/sysdeps/i386/i686/multiarch/memcpy.S
deleted file mode 100644
index bf1c7cc2d2..0000000000
--- a/sysdeps/i386/i686/multiarch/memcpy.S
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Multiple versions of memcpy
-   Copyright (C) 2010 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib and for
-   DSO.  In static binaries we need memcpy before the initialization
-   happened.  */
-#if defined SHARED && !defined NOT_IN_libc
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
-	.p2align 4
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
-	movl	(%esp), %ebx
-	ret
-
-	.text
-ENTRY(memcpy)
-	.type	memcpy, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	call	__i686.get_pc_thunk.bx
-	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memcpy_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memcpy_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memcpy_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(memcpy)
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __memcpy_ia32, @function; \
-	.p2align 4; \
-	__memcpy_ia32: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __memcpy_ia32, .-__memcpy_ia32
-
-# undef ENTRY_CHK
-# define ENTRY_CHK(name) \
-	.type __memcpy_chk_ia32, @function; \
-	.globl __memcpy_chk_ia32; \
-	.p2align 4; \
-	__memcpy_chk_ia32: cfi_startproc; \
-	CALL_MCOUNT
-# undef END_CHK
-# define END_CHK(name) \
-	cfi_endproc; .size __memcpy_chk_ia32, .-__memcpy_chk_ia32
-
-# undef libc_hidden_builtin_def
-/* IFUNC doesn't work with the hidden functions in shared library since
-   they will be called without setting up EBX needed for PLT which is
-   used by IFUNC.  */
-# define libc_hidden_builtin_def(name) \
-	.globl __GI_memcpy; __GI_memcpy = __memcpy_ia32
-#endif
-
-#include "../memcpy.S"
diff --git a/sysdeps/i386/i686/multiarch/memcpy_chk.S b/sysdeps/i386/i686/multiarch/memcpy_chk.S
deleted file mode 100644
index 171ac8adef..0000000000
--- a/sysdeps/i386/i686/multiarch/memcpy_chk.S
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Multiple versions of __memcpy_chk
-   Copyright (C) 2010 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib and for
-   DSO.  There are no multiarch memcpy functions for static binaries.
- */
-#ifndef NOT_IN_libc
-# ifdef SHARED
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
-	.p2align 4
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
-	movl	(%esp), %ebx
-	ret
-
-	.text
-ENTRY(__memcpy_chk)
-	.type	__memcpy_chk, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	call	__i686.get_pc_thunk.bx
-	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memcpy_chk_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memcpy_chk_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memcpy_chk_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(__memcpy_chk)
-# else
-#  include "../memcpy_chk.S"
-# endif
-#endif
diff --git a/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S b/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
deleted file mode 100644
index d202fc4a13..0000000000
--- a/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY		__memmove_ssse3_rep
-#define MEMCPY_CHK	__memmove_chk_ssse3_rep
-#include "memcpy-ssse3-rep.S"
diff --git a/sysdeps/i386/i686/multiarch/memmove-ssse3.S b/sysdeps/i386/i686/multiarch/memmove-ssse3.S
deleted file mode 100644
index 295430b1ef..0000000000
--- a/sysdeps/i386/i686/multiarch/memmove-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY		__memmove_ssse3
-#define MEMCPY_CHK	__memmove_chk_ssse3
-#include "memcpy-ssse3.S"
diff --git a/sysdeps/i386/i686/multiarch/memmove.S b/sysdeps/i386/i686/multiarch/memmove.S
deleted file mode 100644
index e0529c0126..0000000000
--- a/sysdeps/i386/i686/multiarch/memmove.S
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Multiple versions of memmove
-   Copyright (C) 2010 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib.  */
-#ifndef NOT_IN_libc
-# ifdef SHARED
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
-	.p2align 4
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
-	movl	(%esp), %ebx
-	ret
-
-	.text
-ENTRY(memmove)
-	.type	memmove, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	call	__i686.get_pc_thunk.bx
-	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memmove_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memmove_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memmove_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(memmove)
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __memmove_ia32, @function; \
-	.p2align 4; \
-	__memmove_ia32: cfi_startproc; \
-	CALL_MCOUNT
-# else
-	.text
-ENTRY(memmove)
-	.type	memmove, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memmove_ia32, %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
-	jz	2f
-	leal	__memmove_ssse3, %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
-	jz	2f
-	leal	__memmove_ssse3_rep, %eax
-2:	ret
-END(memmove)
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __memmove_ia32, @function; \
-	.globl __memmove_ia32; \
-	.p2align 4; \
-	__memmove_ia32: cfi_startproc; \
-	CALL_MCOUNT
-# endif
-
-# undef END
-# define END(name) \
-	cfi_endproc; .size __memmove_ia32, .-__memmove_ia32
-
-# undef ENTRY_CHK
-# define ENTRY_CHK(name) \
-	.type __memmove_chk_ia32, @function; \
-	.globl __memmove_chk_ia32; \
-	.p2align 4; \
-	__memmove_chk_ia32: cfi_startproc; \
-	CALL_MCOUNT
-# undef END_CHK
-# define END_CHK(name) \
-	cfi_endproc; .size __memmove_chk_ia32, .-__memmove_chk_ia32
-
-# ifdef SHARED
-#  undef libc_hidden_builtin_def
-/* IFUNC doesn't work with the hidden functions in shared library since
-   they will be called without setting up EBX needed for PLT which is
-   used by IFUNC.  */
-#  define libc_hidden_builtin_def(name) \
-	.globl __GI_memmove; __GI_memmove = __memmove_ia32
-# endif
-#endif
-
-#include "../memmove.S"
diff --git a/sysdeps/i386/i686/multiarch/memmove_chk.S b/sysdeps/i386/i686/multiarch/memmove_chk.S
deleted file mode 100644
index e33f2a31b3..0000000000
--- a/sysdeps/i386/i686/multiarch/memmove_chk.S
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Multiple versions of __memmove_chk
-   Copyright (C) 2010 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib.  */
-#ifndef NOT_IN_libc
-# ifdef SHARED
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
-	.p2align 4
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
-	movl	(%esp), %ebx
-	ret
-
-	.text
-ENTRY(__memmove_chk)
-	.type	__memmove_chk, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	call	__i686.get_pc_thunk.bx
-	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memmove_chk_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memmove_chk_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memmove_chk_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(__memmove_chk)
-# else
-	.text
-ENTRY(__memmove_chk)
-	.type	__memmove_chk, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memmove_chk_ia32, %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
-	jz	2f
-	leal	__memmove_chk_ssse3, %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
-	jz	2f
-	leal	__memmove_chk_ssse3_rep, %eax
-2:	ret
-END(__memmove_chk)
-
-	.type __memmove_chk_ssse3, @function
-	.p2align 4;
-__memmove_chk_ssse3:
-	cfi_startproc
-	CALL_MCOUNT
-	movl	12(%esp), %eax
-	cmpl	%eax, 16(%esp)
-	jb	__chk_fail
-	jmp	__memmove_ssse3
-	cfi_endproc
-	.size __memmove_chk_ssse3, .-__memmove_chk_ssse3
-
-	.type __memmove_chk_ssse3_rep, @function
-	.p2align 4;
-__memmove_chk_ssse3_rep:
-	cfi_startproc
-	CALL_MCOUNT
-	movl	12(%esp), %eax
-	cmpl	%eax, 16(%esp)
-	jb	__chk_fail
-	jmp	__memmove_ssse3_rep
-	cfi_endproc
-	.size __memmove_chk_ssse3_rep, .-__memmove_chk_ssse3_rep
-
-	.type __memmove_chk_ia32, @function
-	.p2align 4;
-__memmove_chk_ia32:
-	cfi_startproc
-	CALL_MCOUNT
-	movl	12(%esp), %eax
-	cmpl	%eax, 16(%esp)
-	jb	__chk_fail
-	jmp	__memmove_ia32
-	cfi_endproc
-	.size __memmove_chk_ia32, .-__memmove_chk_ia32
-# endif
-#endif
diff --git a/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
deleted file mode 100644
index 5357b33e18..0000000000
--- a/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMPCPY
-#define MEMCPY		__mempcpy_ssse3_rep
-#define MEMCPY_CHK	__mempcpy_chk_ssse3_rep
-#include "memcpy-ssse3-rep.S"
diff --git a/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S b/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
deleted file mode 100644
index 822d98e954..0000000000
--- a/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMPCPY
-#define MEMCPY		__mempcpy_ssse3
-#define MEMCPY_CHK	__mempcpy_chk_ssse3
-#include "memcpy-ssse3.S"
diff --git a/sysdeps/i386/i686/multiarch/mempcpy.S b/sysdeps/i386/i686/multiarch/mempcpy.S
deleted file mode 100644
index df830d2e63..0000000000
--- a/sysdeps/i386/i686/multiarch/mempcpy.S
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Multiple versions of mempcpy
-   Copyright (C) 2010 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib and for
-   DSO.  In static binaries we need mempcpy before the initialization
-   happened.  */
-#if defined SHARED && !defined NOT_IN_libc
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
-	.p2align 4
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
-	movl	(%esp), %ebx
-	ret
-
-	.text
-ENTRY(__mempcpy)
-	.type	__mempcpy, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	call	__i686.get_pc_thunk.bx
-	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__mempcpy_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__mempcpy_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__mempcpy_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(__mempcpy)
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __mempcpy_ia32, @function; \
-	.p2align 4; \
-	__mempcpy_ia32: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __mempcpy_ia32, .-__mempcpy_ia32
-
-# undef ENTRY_CHK
-# define ENTRY_CHK(name) \
-	.type __mempcpy_chk_ia32, @function; \
-	.globl __mempcpy_chk_ia32; \
-	.p2align 4; \
-	__mempcpy_chk_ia32: cfi_startproc; \
-	CALL_MCOUNT
-# undef END_CHK
-# define END_CHK(name) \
-	cfi_endproc; .size __mempcpy_chk_ia32, .-__mempcpy_chk_ia32
-
-# undef libc_hidden_def
-# undef libc_hidden_builtin_def
-/* IFUNC doesn't work with the hidden functions in shared library since
-   they will be called without setting up EBX needed for PLT which is
-   used by IFUNC.  */
-# define libc_hidden_def(name) \
-	.globl __GI_mempcpy; __GI_mempcpy = __mempcpy_ia32
-# define libc_hidden_builtin_def(name) \
-	.globl __GI___mempcpy; __GI___mempcpy = __mempcpy_ia32
-#endif
-
-#include "../mempcpy.S"
diff --git a/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
deleted file mode 100644
index 828fb5e608..0000000000
--- a/sysdeps/i386/i686/multiarch/mempcpy_chk.S
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Multiple versions of __mempcpy_chk
-   Copyright (C) 2010 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib and for
-   DSO.  There are no multiarch mempcpy functions for static binaries.
- */
-#ifndef NOT_IN_libc
-# ifdef SHARED
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
-	.p2align 4
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
-	movl	(%esp), %ebx
-	ret
-
-	.text
-ENTRY(__mempcpy_chk)
-	.type	__mempcpy_chk, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	call	__i686.get_pc_thunk.bx
-	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__mempcpy_chk_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__mempcpy_chk_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__mempcpy_chk_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(__mempcpy_chk)
-# else
-#  include "../mempcpy_chk.S"
-# endif
-#endif
diff --git a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
deleted file mode 100644
index 84afffeb66..0000000000
--- a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
+++ /dev/null
@@ -1,821 +0,0 @@
-/* memset with SSE2 and REP string.
-   Copyright (C) 2010 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-#ifndef NOT_IN_libc
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-#define CFI_PUSH(REG)						\
-  cfi_adjust_cfa_offset (4);					\
-  cfi_rel_offset (REG, 0)
-
-#define CFI_POP(REG)						\
-  cfi_adjust_cfa_offset (-4);					\
-  cfi_restore (REG)
-
-#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
-#define POP(REG)	popl REG; CFI_POP (REG)
-
-#ifdef USE_AS_BZERO
-# define DEST		PARMS
-# define LEN		DEST+4
-# define SETRTNVAL
-#else
-# define DEST		PARMS
-# define CHR		DEST+4
-# define LEN		CHR+4
-# define SETRTNVAL	movl DEST(%esp), %eax
-#endif
-
-#ifdef SHARED
-# define ENTRANCE	PUSH (%ebx);
-# define RETURN_END	POP (%ebx); ret
-# define RETURN		RETURN_END; CFI_PUSH (%ebx)
-# define PARMS		8		/* Preserve EBX.  */
-# define JMPTBL(I, B)	I - B
-
-/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
-   jump table with relative offsets.   */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
-    /* We first load PC into EBX.  */				\
-    call	__i686.get_pc_thunk.bx;				\
-    /* Get the address of the jump table.  */			\
-    add		$(TABLE - .), %ebx;				\
-    /* Get the entry and convert the relative offset to the	\
-       absolute address.  */					\
-    add		(%ebx,%ecx,4), %ebx;				\
-    add		%ecx, %edx;					\
-    /* We loaded the jump table and adjuested EDX. Go.  */	\
-    jmp		*%ebx
-
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
-	ALIGN (4)
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
-	movl	(%esp), %ebx
-	ret
-#else
-# define ENTRANCE
-# define RETURN_END	ret
-# define RETURN		RETURN_END
-# define PARMS		4
-# define JMPTBL(I, B)	I
-
-/* Branch to an entry in a jump table.  TABLE is a jump table with
-   absolute offsets.  */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
-    add		%ecx, %edx;					\
-    jmp		*TABLE(,%ecx,4)
-#endif
-
-	.section .text.sse2,"ax",@progbits
-#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BZERO
-ENTRY (__memset_chk_sse2_rep)
-	movl	12(%esp), %eax
-	cmpl	%eax, 16(%esp)
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (__memset_chk_sse2_rep)
-#endif
-ENTRY (__memset_sse2_rep)
-	ENTRANCE
-
-	movl	LEN(%esp), %ecx
-#ifdef USE_AS_BZERO
-	xor	%eax, %eax
-#else
-	movzbl	CHR(%esp), %eax
-	movb	%al, %ah
-	/* Fill the whole EAX with pattern.  */
-	movl	%eax, %edx
-	shl	$16, %eax
-	or	%edx, %eax
-#endif
-	movl	DEST(%esp), %edx
-	cmp	$32, %ecx
-	jae	L(32bytesormore)
-
-L(write_less32bytes):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
-
-
-	.pushsection .rodata.sse2,"a",@progbits
-	ALIGN (2)
-L(table_less_32bytes):
-	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
-	.popsection
-
-	ALIGN (4)
-L(write_28bytes):
-	movl	%eax, -28(%edx)
-L(write_24bytes):
-	movl	%eax, -24(%edx)
-L(write_20bytes):
-	movl	%eax, -20(%edx)
-L(write_16bytes):
-	movl	%eax, -16(%edx)
-L(write_12bytes):
-	movl	%eax, -12(%edx)
-L(write_8bytes):
-	movl	%eax, -8(%edx)
-L(write_4bytes):
-	movl	%eax, -4(%edx)
-L(write_0bytes):
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(write_29bytes):
-	movl	%eax, -29(%edx)
-L(write_25bytes):
-	movl	%eax, -25(%edx)
-L(write_21bytes):
-	movl	%eax, -21(%edx)
-L(write_17bytes):
-	movl	%eax, -17(%edx)
-L(write_13bytes):
-	movl	%eax, -13(%edx)
-L(write_9bytes):
-	movl	%eax, -9(%edx)
-L(write_5bytes):
-	movl	%eax, -5(%edx)
-L(write_1bytes):
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(write_30bytes):
-	movl	%eax, -30(%edx)
-L(write_26bytes):
-	movl	%eax, -26(%edx)
-L(write_22bytes):
-	movl	%eax, -22(%edx)
-L(write_18bytes):
-	movl	%eax, -18(%edx)
-L(write_14bytes):
-	movl	%eax, -14(%edx)
-L(write_10bytes):
-	movl	%eax, -10(%edx)
-L(write_6bytes):
-	movl	%eax, -6(%edx)
-L(write_2bytes):
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(write_31bytes):
-	movl	%eax, -31(%edx)
-L(write_27bytes):
-	movl	%eax, -27(%edx)
-L(write_23bytes):
-	movl	%eax, -23(%edx)
-L(write_19bytes):
-	movl	%eax, -19(%edx)
-L(write_15bytes):
-	movl	%eax, -15(%edx)
-L(write_11bytes):
-	movl	%eax, -11(%edx)
-L(write_7bytes):
-	movl	%eax, -7(%edx)
-L(write_3bytes):
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-/* ECX > 32 and EDX is 4 byte aligned.  */
-L(32bytesormore):
-	/* Fill xmm0 with the pattern.  */
-#ifdef USE_AS_BZERO
-	pxor	%xmm0, %xmm0
-#else
-	movd	%eax, %xmm0
-	punpcklbw %xmm0, %xmm0
-	pshufd	$0, %xmm0, %xmm0
-#endif
-	testl	$0xf, %edx
-	jz	L(aligned_16)
-/* ECX > 32 and EDX is not 16 byte aligned.  */
-L(not_aligned_16):
-	movdqu	%xmm0, (%edx)
-	movl	%edx, %eax
-	and	$-16, %edx
-	add	$16, %edx
-	sub	%edx, %eax
-	add	%eax, %ecx
-	movd	%xmm0, %eax
-
-	ALIGN (4)
-L(aligned_16):
-	cmp	$128, %ecx
-	jge	L(128bytesormore)
-
-L(aligned_16_less128bytes):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
-
-	ALIGN (4)
-L(128bytesormore):
-	PUSH (%edi)
-#ifdef DATA_CACHE_SIZE
-	PUSH (%ebx)
-	mov	$DATA_CACHE_SIZE, %ebx
-#else
-# ifdef SHARED
-	call	__i686.get_pc_thunk.bx
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	mov	__x86_data_cache_size@GOTOFF(%ebx), %ebx
-# else
-	PUSH (%ebx)
-	mov	__x86_data_cache_size, %ebx
-# endif
-#endif
-	mov	%ebx, %edi
-	shr	$4, %ebx
-	sub	%ebx, %edi
-#if defined DATA_CACHE_SIZE || !defined SHARED
-	POP (%ebx)
-#endif
-/*
- * When data size approximate the end of L1 cache,
- * fast string will prefetch and combine data efficiently.
- */
-	cmp	%edi, %ecx
-	jae	L(128bytesormore_nt)
-	subl	$128, %ecx
-L(128bytesormore_normal):
-	sub	$128, %ecx
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm0, 0x10(%edx)
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm0, 0x30(%edx)
-	movdqa	%xmm0, 0x40(%edx)
-	movdqa	%xmm0, 0x50(%edx)
-	movdqa	%xmm0, 0x60(%edx)
-	movdqa	%xmm0, 0x70(%edx)
-	lea	128(%edx), %edx
-	jl	L(128bytesless_normal)
-
-
-	sub	$128, %ecx
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm0, 0x10(%edx)
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm0, 0x30(%edx)
-	movdqa	%xmm0, 0x40(%edx)
-	movdqa	%xmm0, 0x50(%edx)
-	movdqa	%xmm0, 0x60(%edx)
-	movdqa	%xmm0, 0x70(%edx)
-	lea	128(%edx), %edx
-	jge	L(128bytesormore_normal)
-
-L(128bytesless_normal):
-	POP (%edi)
-	lea	128(%ecx), %ecx
-	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
-
-	ALIGN (4)
-L(128bytesormore_nt):
-	mov	%edx, %edi
-	mov	%ecx, %edx
-	shr	$2, %ecx
-	and	$3, %edx
-	rep stosl
-	jz	L(copy_page_by_rep_exit)
-	cmp	$2, %edx
-	jb	L(copy_page_by_rep_left_1)
-	movw	%ax, (%edi)
-	add	$2, %edi
-	sub	$2, %edx
-	jz	L(copy_page_by_rep_exit)
-L(copy_page_by_rep_left_1):
-	movb	%al, (%edi)
-L(copy_page_by_rep_exit):
-	POP (%edi)
-	SETRTNVAL
-	RETURN
-
-	.pushsection .rodata.sse2,"a",@progbits
-	ALIGN (2)
-L(table_16_128bytes):
-	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
-	.popsection
-
-	ALIGN (4)
-L(aligned_16_112bytes):
-	movdqa	%xmm0, -112(%edx)
-L(aligned_16_96bytes):
-	movdqa	%xmm0, -96(%edx)
-L(aligned_16_80bytes):
-	movdqa	%xmm0, -80(%edx)
-L(aligned_16_64bytes):
-	movdqa	%xmm0, -64(%edx)
-L(aligned_16_48bytes):
-	movdqa	%xmm0, -48(%edx)
-L(aligned_16_32bytes):
-	movdqa	%xmm0, -32(%edx)
-L(aligned_16_16bytes):
-	movdqa	%xmm0, -16(%edx)
-L(aligned_16_0bytes):
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_113bytes):
-	movdqa	%xmm0, -113(%edx)
-L(aligned_16_97bytes):
-	movdqa	%xmm0, -97(%edx)
-L(aligned_16_81bytes):
-	movdqa	%xmm0, -81(%edx)
-L(aligned_16_65bytes):
-	movdqa	%xmm0, -65(%edx)
-L(aligned_16_49bytes):
-	movdqa	%xmm0, -49(%edx)
-L(aligned_16_33bytes):
-	movdqa	%xmm0, -33(%edx)
-L(aligned_16_17bytes):
-	movdqa	%xmm0, -17(%edx)
-L(aligned_16_1bytes):
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_114bytes):
-	movdqa	%xmm0, -114(%edx)
-L(aligned_16_98bytes):
-	movdqa	%xmm0, -98(%edx)
-L(aligned_16_82bytes):
-	movdqa	%xmm0, -82(%edx)
-L(aligned_16_66bytes):
-	movdqa	%xmm0, -66(%edx)
-L(aligned_16_50bytes):
-	movdqa	%xmm0, -50(%edx)
-L(aligned_16_34bytes):
-	movdqa	%xmm0, -34(%edx)
-L(aligned_16_18bytes):
-	movdqa	%xmm0, -18(%edx)
-L(aligned_16_2bytes):
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_115bytes):
-	movdqa	%xmm0, -115(%edx)
-L(aligned_16_99bytes):
-	movdqa	%xmm0, -99(%edx)
-L(aligned_16_83bytes):
-	movdqa	%xmm0, -83(%edx)
-L(aligned_16_67bytes):
-	movdqa	%xmm0, -67(%edx)
-L(aligned_16_51bytes):
-	movdqa	%xmm0, -51(%edx)
-L(aligned_16_35bytes):
-	movdqa	%xmm0, -35(%edx)
-L(aligned_16_19bytes):
-	movdqa	%xmm0, -19(%edx)
-L(aligned_16_3bytes):
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_116bytes):
-	movdqa	%xmm0, -116(%edx)
-L(aligned_16_100bytes):
-	movdqa	%xmm0, -100(%edx)
-L(aligned_16_84bytes):
-	movdqa	%xmm0, -84(%edx)
-L(aligned_16_68bytes):
-	movdqa	%xmm0, -68(%edx)
-L(aligned_16_52bytes):
-	movdqa	%xmm0, -52(%edx)
-L(aligned_16_36bytes):
-	movdqa	%xmm0, -36(%edx)
-L(aligned_16_20bytes):
-	movdqa	%xmm0, -20(%edx)
-L(aligned_16_4bytes):
-	movl	%eax, -4(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_117bytes):
-	movdqa	%xmm0, -117(%edx)
-L(aligned_16_101bytes):
-	movdqa	%xmm0, -101(%edx)
-L(aligned_16_85bytes):
-	movdqa	%xmm0, -85(%edx)
-L(aligned_16_69bytes):
-	movdqa	%xmm0, -69(%edx)
-L(aligned_16_53bytes):
-	movdqa	%xmm0, -53(%edx)
-L(aligned_16_37bytes):
-	movdqa	%xmm0, -37(%edx)
-L(aligned_16_21bytes):
-	movdqa	%xmm0, -21(%edx)
-L(aligned_16_5bytes):
-	movl	%eax, -5(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_118bytes):
-	movdqa	%xmm0, -118(%edx)
-L(aligned_16_102bytes):
-	movdqa	%xmm0, -102(%edx)
-L(aligned_16_86bytes):
-	movdqa	%xmm0, -86(%edx)
-L(aligned_16_70bytes):
-	movdqa	%xmm0, -70(%edx)
-L(aligned_16_54bytes):
-	movdqa	%xmm0, -54(%edx)
-L(aligned_16_38bytes):
-	movdqa	%xmm0, -38(%edx)
-L(aligned_16_22bytes):
-	movdqa	%xmm0, -22(%edx)
-L(aligned_16_6bytes):
-	movl	%eax, -6(%edx)
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_119bytes):
-	movdqa	%xmm0, -119(%edx)
-L(aligned_16_103bytes):
-	movdqa	%xmm0, -103(%edx)
-L(aligned_16_87bytes):
-	movdqa	%xmm0, -87(%edx)
-L(aligned_16_71bytes):
-	movdqa	%xmm0, -71(%edx)
-L(aligned_16_55bytes):
-	movdqa	%xmm0, -55(%edx)
-L(aligned_16_39bytes):
-	movdqa	%xmm0, -39(%edx)
-L(aligned_16_23bytes):
-	movdqa	%xmm0, -23(%edx)
-L(aligned_16_7bytes):
-	movl	%eax, -7(%edx)
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_120bytes):
-	movdqa	%xmm0, -120(%edx)
-L(aligned_16_104bytes):
-	movdqa	%xmm0, -104(%edx)
-L(aligned_16_88bytes):
-	movdqa	%xmm0, -88(%edx)
-L(aligned_16_72bytes):
-	movdqa	%xmm0, -72(%edx)
-L(aligned_16_56bytes):
-	movdqa	%xmm0, -56(%edx)
-L(aligned_16_40bytes):
-	movdqa	%xmm0, -40(%edx)
-L(aligned_16_24bytes):
-	movdqa	%xmm0, -24(%edx)
-L(aligned_16_8bytes):
-	movq	%xmm0, -8(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_121bytes):
-	movdqa	%xmm0, -121(%edx)
-L(aligned_16_105bytes):
-	movdqa	%xmm0, -105(%edx)
-L(aligned_16_89bytes):
-	movdqa	%xmm0, -89(%edx)
-L(aligned_16_73bytes):
-	movdqa	%xmm0, -73(%edx)
-L(aligned_16_57bytes):
-	movdqa	%xmm0, -57(%edx)
-L(aligned_16_41bytes):
-	movdqa	%xmm0, -41(%edx)
-L(aligned_16_25bytes):
-	movdqa	%xmm0, -25(%edx)
-L(aligned_16_9bytes):
-	movq	%xmm0, -9(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_122bytes):
-	movdqa	%xmm0, -122(%edx)
-L(aligned_16_106bytes):
-	movdqa	%xmm0, -106(%edx)
-L(aligned_16_90bytes):
-	movdqa	%xmm0, -90(%edx)
-L(aligned_16_74bytes):
-	movdqa	%xmm0, -74(%edx)
-L(aligned_16_58bytes):
-	movdqa	%xmm0, -58(%edx)
-L(aligned_16_42bytes):
-	movdqa	%xmm0, -42(%edx)
-L(aligned_16_26bytes):
-	movdqa	%xmm0, -26(%edx)
-L(aligned_16_10bytes):
-	movq	%xmm0, -10(%edx)
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_123bytes):
-	movdqa	%xmm0, -123(%edx)
-L(aligned_16_107bytes):
-	movdqa	%xmm0, -107(%edx)
-L(aligned_16_91bytes):
-	movdqa	%xmm0, -91(%edx)
-L(aligned_16_75bytes):
-	movdqa	%xmm0, -75(%edx)
-L(aligned_16_59bytes):
-	movdqa	%xmm0, -59(%edx)
-L(aligned_16_43bytes):
-	movdqa	%xmm0, -43(%edx)
-L(aligned_16_27bytes):
-	movdqa	%xmm0, -27(%edx)
-L(aligned_16_11bytes):
-	movq	%xmm0, -11(%edx)
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_124bytes):
-	movdqa	%xmm0, -124(%edx)
-L(aligned_16_108bytes):
-	movdqa	%xmm0, -108(%edx)
-L(aligned_16_92bytes):
-	movdqa	%xmm0, -92(%edx)
-L(aligned_16_76bytes):
-	movdqa	%xmm0, -76(%edx)
-L(aligned_16_60bytes):
-	movdqa	%xmm0, -60(%edx)
-L(aligned_16_44bytes):
-	movdqa	%xmm0, -44(%edx)
-L(aligned_16_28bytes):
-	movdqa	%xmm0, -28(%edx)
-L(aligned_16_12bytes):
-	movq	%xmm0, -12(%edx)
-	movl	%eax, -4(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_125bytes):
-	movdqa	%xmm0, -125(%edx)
-L(aligned_16_109bytes):
-	movdqa	%xmm0, -109(%edx)
-L(aligned_16_93bytes):
-	movdqa	%xmm0, -93(%edx)
-L(aligned_16_77bytes):
-	movdqa	%xmm0, -77(%edx)
-L(aligned_16_61bytes):
-	movdqa	%xmm0, -61(%edx)
-L(aligned_16_45bytes):
-	movdqa	%xmm0, -45(%edx)
-L(aligned_16_29bytes):
-	movdqa	%xmm0, -29(%edx)
-L(aligned_16_13bytes):
-	movq	%xmm0, -13(%edx)
-	movl	%eax, -5(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_126bytes):
-	movdqa	%xmm0, -126(%edx)
-L(aligned_16_110bytes):
-	movdqa	%xmm0, -110(%edx)
-L(aligned_16_94bytes):
-	movdqa	%xmm0, -94(%edx)
-L(aligned_16_78bytes):
-	movdqa	%xmm0, -78(%edx)
-L(aligned_16_62bytes):
-	movdqa	%xmm0, -62(%edx)
-L(aligned_16_46bytes):
-	movdqa	%xmm0, -46(%edx)
-L(aligned_16_30bytes):
-	movdqa	%xmm0, -30(%edx)
-L(aligned_16_14bytes):
-	movq	%xmm0, -14(%edx)
-	movl	%eax, -6(%edx)
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_127bytes):
-	movdqa	%xmm0, -127(%edx)
-L(aligned_16_111bytes):
-	movdqa	%xmm0, -111(%edx)
-L(aligned_16_95bytes):
-	movdqa	%xmm0, -95(%edx)
-L(aligned_16_79bytes):
-	movdqa	%xmm0, -79(%edx)
-L(aligned_16_63bytes):
-	movdqa	%xmm0, -63(%edx)
-L(aligned_16_47bytes):
-	movdqa	%xmm0, -47(%edx)
-L(aligned_16_31bytes):
-	movdqa	%xmm0, -31(%edx)
-L(aligned_16_15bytes):
-	movq	%xmm0, -15(%edx)
-	movl	%eax, -7(%edx)
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN_END
-
-END (__memset_sse2_rep)
-
-#endif
diff --git a/sysdeps/i386/i686/multiarch/memset-sse2.S b/sysdeps/i386/i686/multiarch/memset-sse2.S
deleted file mode 100644
index b2b979193e..0000000000
--- a/sysdeps/i386/i686/multiarch/memset-sse2.S
+++ /dev/null
@@ -1,867 +0,0 @@
-/* memset with SSE2
-   Copyright (C) 2010 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-#ifndef NOT_IN_libc
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-#define CFI_PUSH(REG)						\
-  cfi_adjust_cfa_offset (4);					\
-  cfi_rel_offset (REG, 0)
-
-#define CFI_POP(REG)						\
-  cfi_adjust_cfa_offset (-4);					\
-  cfi_restore (REG)
-
-#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
-#define POP(REG)	popl REG; CFI_POP (REG)
-
-#ifdef USE_AS_BZERO
-# define DEST		PARMS
-# define LEN		DEST+4
-# define SETRTNVAL
-#else
-# define DEST		PARMS
-# define CHR		DEST+4
-# define LEN		CHR+4
-# define SETRTNVAL	movl DEST(%esp), %eax
-#endif
-
-#ifdef SHARED
-# define ENTRANCE	PUSH (%ebx);
-# define RETURN_END	POP (%ebx); ret
-# define RETURN		RETURN_END; CFI_PUSH (%ebx)
-# define PARMS		8		/* Preserve EBX.  */
-# define JMPTBL(I, B)	I - B
-
-/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
-   jump table with relative offsets.   */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
-    /* We first load PC into EBX.  */				\
-    call	__i686.get_pc_thunk.bx;				\
-    /* Get the address of the jump table.  */			\
-    add		$(TABLE - .), %ebx;				\
-    /* Get the entry and convert the relative offset to the	\
-       absolute address.  */					\
-    add		(%ebx,%ecx,4), %ebx;				\
-    add		%ecx, %edx;					\
-    /* We loaded the jump table and adjuested EDX. Go.  */	\
-    jmp		*%ebx
-
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
-	ALIGN (4)
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
-	movl	(%esp), %ebx
-	ret
-#else
-# define ENTRANCE
-# define RETURN_END	ret
-# define RETURN		RETURN_END
-# define PARMS		4
-# define JMPTBL(I, B)	I
-
-/* Branch to an entry in a jump table.  TABLE is a jump table with
-   absolute offsets.  */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
-    add		%ecx, %edx;					\
-    jmp		*TABLE(,%ecx,4)
-#endif
-
-	.section .text.sse2,"ax",@progbits
-#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BZERO
-ENTRY (__memset_chk_sse2)
-	movl	12(%esp), %eax
-	cmpl	%eax, 16(%esp)
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (__memset_chk_sse2)
-#endif
-ENTRY (__memset_sse2)
-	ENTRANCE
-
-	movl	LEN(%esp), %ecx
-#ifdef USE_AS_BZERO
-	xor	%eax, %eax
-#else
-	movzbl	CHR(%esp), %eax
-	movb	%al, %ah
-	/* Fill the whole EAX with pattern.  */
-	movl	%eax, %edx
-	shl	$16, %eax
-	or	%edx, %eax
-#endif
-	movl	DEST(%esp), %edx
-	cmp	$32, %ecx
-	jae	L(32bytesormore)
-
-L(write_less32bytes):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
-
-
-	.pushsection .rodata.sse2,"a",@progbits
-	ALIGN (2)
-L(table_less_32bytes):
-	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
-	.popsection
-
-	ALIGN (4)
-L(write_28bytes):
-	movl	%eax, -28(%edx)
-L(write_24bytes):
-	movl	%eax, -24(%edx)
-L(write_20bytes):
-	movl	%eax, -20(%edx)
-L(write_16bytes):
-	movl	%eax, -16(%edx)
-L(write_12bytes):
-	movl	%eax, -12(%edx)
-L(write_8bytes):
-	movl	%eax, -8(%edx)
-L(write_4bytes):
-	movl	%eax, -4(%edx)
-L(write_0bytes):
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(write_29bytes):
-	movl	%eax, -29(%edx)
-L(write_25bytes):
-	movl	%eax, -25(%edx)
-L(write_21bytes):
-	movl	%eax, -21(%edx)
-L(write_17bytes):
-	movl	%eax, -17(%edx)
-L(write_13bytes):
-	movl	%eax, -13(%edx)
-L(write_9bytes):
-	movl	%eax, -9(%edx)
-L(write_5bytes):
-	movl	%eax, -5(%edx)
-L(write_1bytes):
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(write_30bytes):
-	movl	%eax, -30(%edx)
-L(write_26bytes):
-	movl	%eax, -26(%edx)
-L(write_22bytes):
-	movl	%eax, -22(%edx)
-L(write_18bytes):
-	movl	%eax, -18(%edx)
-L(write_14bytes):
-	movl	%eax, -14(%edx)
-L(write_10bytes):
-	movl	%eax, -10(%edx)
-L(write_6bytes):
-	movl	%eax, -6(%edx)
-L(write_2bytes):
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(write_31bytes):
-	movl	%eax, -31(%edx)
-L(write_27bytes):
-	movl	%eax, -27(%edx)
-L(write_23bytes):
-	movl	%eax, -23(%edx)
-L(write_19bytes):
-	movl	%eax, -19(%edx)
-L(write_15bytes):
-	movl	%eax, -15(%edx)
-L(write_11bytes):
-	movl	%eax, -11(%edx)
-L(write_7bytes):
-	movl	%eax, -7(%edx)
-L(write_3bytes):
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-/* ECX > 32 and EDX is 4 byte aligned.  */
-L(32bytesormore):
-	/* Fill xmm0 with the pattern.  */
-#ifdef USE_AS_BZERO
-	pxor	%xmm0, %xmm0
-#else
-	movd	%eax, %xmm0
-	punpcklbw %xmm0, %xmm0
-	pshufd	$0, %xmm0, %xmm0
-#endif
-	testl	$0xf, %edx
-	jz	L(aligned_16)
-/* ECX > 32 and EDX is not 16 byte aligned.  */
-L(not_aligned_16):
-	movdqu	%xmm0, (%edx)
-	movl	%edx, %eax
-	and	$-16, %edx
-	add	$16, %edx
-	sub	%edx, %eax
-	add	%eax, %ecx
-	movd	%xmm0, %eax
-
-	ALIGN (4)
-L(aligned_16):
-	cmp	$128, %ecx
-	jge	L(128bytesormore)
-
-L(aligned_16_less128bytes):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
-
-	ALIGN (4)
-L(128bytesormore):
-#ifdef SHARED_CACHE_SIZE
-	PUSH (%ebx)
-	mov	$SHARED_CACHE_SIZE, %ebx
-#else
-# ifdef SHARED
-	call	__i686.get_pc_thunk.bx
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
-# else
-	PUSH (%ebx)
-	mov	__x86_shared_cache_size, %ebx
-# endif
-#endif
-	cmp	%ebx, %ecx
-	jae	L(128bytesormore_nt_start)
-
-
-#ifdef DATA_CACHE_SIZE
-	POP (%ebx)
-	cmp	$DATA_CACHE_SIZE, %ecx
-#else
-# ifdef SHARED
-	call	__i686.get_pc_thunk.bx
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
-# else
-	POP (%ebx)
-	cmp	__x86_data_cache_size, %ecx
-# endif
-#endif
-
-	jae	L(128bytes_L2_normal)
-	subl	$128, %ecx
-L(128bytesormore_normal):
-	sub	$128, %ecx
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm0, 0x10(%edx)
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm0, 0x30(%edx)
-	movdqa	%xmm0, 0x40(%edx)
-	movdqa	%xmm0, 0x50(%edx)
-	movdqa	%xmm0, 0x60(%edx)
-	movdqa	%xmm0, 0x70(%edx)
-	lea	128(%edx), %edx
-	jl	L(128bytesless_normal)
-
-
-	sub	$128, %ecx
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm0, 0x10(%edx)
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm0, 0x30(%edx)
-	movdqa	%xmm0, 0x40(%edx)
-	movdqa	%xmm0, 0x50(%edx)
-	movdqa	%xmm0, 0x60(%edx)
-	movdqa	%xmm0, 0x70(%edx)
-	lea	128(%edx), %edx
-	jge	L(128bytesormore_normal)
-
-L(128bytesless_normal):
-	lea	128(%ecx), %ecx
-	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
-
-	ALIGN (4)
-L(128bytes_L2_normal):
-	prefetcht0	0x380(%edx)
-	prefetcht0	0x3c0(%edx)
-	sub	$128, %ecx
-	movdqa	%xmm0, (%edx)
-	movaps	%xmm0, 0x10(%edx)
-	movaps	%xmm0, 0x20(%edx)
-	movaps	%xmm0, 0x30(%edx)
-	movaps	%xmm0, 0x40(%edx)
-	movaps	%xmm0, 0x50(%edx)
-	movaps	%xmm0, 0x60(%edx)
-	movaps	%xmm0, 0x70(%edx)
-	add	$128, %edx
-	cmp	$128, %ecx
-	jge	L(128bytes_L2_normal)
-
-L(128bytesless_L2_normal):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
-
-L(128bytesormore_nt_start):
-	sub	%ebx, %ecx
-	ALIGN (4)
-L(128bytesormore_shared_cache_loop):
-	prefetcht0	0x3c0(%edx)
-	prefetcht0	0x380(%edx)
-	sub	$0x80, %ebx
-	movdqa	%xmm0, (%edx)
-	movdqa	%xmm0, 0x10(%edx)
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm0, 0x30(%edx)
-	movdqa	%xmm0, 0x40(%edx)
-	movdqa	%xmm0, 0x50(%edx)
-	movdqa	%xmm0, 0x60(%edx)
-	movdqa	%xmm0, 0x70(%edx)
-	add	$0x80, %edx
-	cmp	$0x80, %ebx
-	jge	L(128bytesormore_shared_cache_loop)
-	cmp	$0x80, %ecx
-	jb	L(shared_cache_loop_end)
-	ALIGN (4)
-L(128bytesormore_nt):
-	sub	$0x80, %ecx
-	movntdq	%xmm0, (%edx)
-	movntdq	%xmm0, 0x10(%edx)
-	movntdq	%xmm0, 0x20(%edx)
-	movntdq	%xmm0, 0x30(%edx)
-	movntdq	%xmm0, 0x40(%edx)
-	movntdq	%xmm0, 0x50(%edx)
-	movntdq	%xmm0, 0x60(%edx)
-	movntdq	%xmm0, 0x70(%edx)
-	add	$0x80, %edx
-	cmp	$0x80, %ecx
-	jge	L(128bytesormore_nt)
-	sfence
-L(shared_cache_loop_end):
-#if defined DATA_CACHE_SIZE || !defined SHARED
-	POP (%ebx)
-#endif
-	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
-
-
-	.pushsection .rodata.sse2,"a",@progbits
-	ALIGN (2)
-L(table_16_128bytes):
-	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
-	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
-	.popsection
-
-	ALIGN (4)
-L(aligned_16_112bytes):
-	movdqa	%xmm0, -112(%edx)
-L(aligned_16_96bytes):
-	movdqa	%xmm0, -96(%edx)
-L(aligned_16_80bytes):
-	movdqa	%xmm0, -80(%edx)
-L(aligned_16_64bytes):
-	movdqa	%xmm0, -64(%edx)
-L(aligned_16_48bytes):
-	movdqa	%xmm0, -48(%edx)
-L(aligned_16_32bytes):
-	movdqa	%xmm0, -32(%edx)
-L(aligned_16_16bytes):
-	movdqa	%xmm0, -16(%edx)
-L(aligned_16_0bytes):
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_113bytes):
-	movdqa	%xmm0, -113(%edx)
-L(aligned_16_97bytes):
-	movdqa	%xmm0, -97(%edx)
-L(aligned_16_81bytes):
-	movdqa	%xmm0, -81(%edx)
-L(aligned_16_65bytes):
-	movdqa	%xmm0, -65(%edx)
-L(aligned_16_49bytes):
-	movdqa	%xmm0, -49(%edx)
-L(aligned_16_33bytes):
-	movdqa	%xmm0, -33(%edx)
-L(aligned_16_17bytes):
-	movdqa	%xmm0, -17(%edx)
-L(aligned_16_1bytes):
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_114bytes):
-	movdqa	%xmm0, -114(%edx)
-L(aligned_16_98bytes):
-	movdqa	%xmm0, -98(%edx)
-L(aligned_16_82bytes):
-	movdqa	%xmm0, -82(%edx)
-L(aligned_16_66bytes):
-	movdqa	%xmm0, -66(%edx)
-L(aligned_16_50bytes):
-	movdqa	%xmm0, -50(%edx)
-L(aligned_16_34bytes):
-	movdqa	%xmm0, -34(%edx)
-L(aligned_16_18bytes):
-	movdqa	%xmm0, -18(%edx)
-L(aligned_16_2bytes):
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_115bytes):
-	movdqa	%xmm0, -115(%edx)
-L(aligned_16_99bytes):
-	movdqa	%xmm0, -99(%edx)
-L(aligned_16_83bytes):
-	movdqa	%xmm0, -83(%edx)
-L(aligned_16_67bytes):
-	movdqa	%xmm0, -67(%edx)
-L(aligned_16_51bytes):
-	movdqa	%xmm0, -51(%edx)
-L(aligned_16_35bytes):
-	movdqa	%xmm0, -35(%edx)
-L(aligned_16_19bytes):
-	movdqa	%xmm0, -19(%edx)
-L(aligned_16_3bytes):
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_116bytes):
-	movdqa	%xmm0, -116(%edx)
-L(aligned_16_100bytes):
-	movdqa	%xmm0, -100(%edx)
-L(aligned_16_84bytes):
-	movdqa	%xmm0, -84(%edx)
-L(aligned_16_68bytes):
-	movdqa	%xmm0, -68(%edx)
-L(aligned_16_52bytes):
-	movdqa	%xmm0, -52(%edx)
-L(aligned_16_36bytes):
-	movdqa	%xmm0, -36(%edx)
-L(aligned_16_20bytes):
-	movdqa	%xmm0, -20(%edx)
-L(aligned_16_4bytes):
-	movl	%eax, -4(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_117bytes):
-	movdqa	%xmm0, -117(%edx)
-L(aligned_16_101bytes):
-	movdqa	%xmm0, -101(%edx)
-L(aligned_16_85bytes):
-	movdqa	%xmm0, -85(%edx)
-L(aligned_16_69bytes):
-	movdqa	%xmm0, -69(%edx)
-L(aligned_16_53bytes):
-	movdqa	%xmm0, -53(%edx)
-L(aligned_16_37bytes):
-	movdqa	%xmm0, -37(%edx)
-L(aligned_16_21bytes):
-	movdqa	%xmm0, -21(%edx)
-L(aligned_16_5bytes):
-	movl	%eax, -5(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_118bytes):
-	movdqa	%xmm0, -118(%edx)
-L(aligned_16_102bytes):
-	movdqa	%xmm0, -102(%edx)
-L(aligned_16_86bytes):
-	movdqa	%xmm0, -86(%edx)
-L(aligned_16_70bytes):
-	movdqa	%xmm0, -70(%edx)
-L(aligned_16_54bytes):
-	movdqa	%xmm0, -54(%edx)
-L(aligned_16_38bytes):
-	movdqa	%xmm0, -38(%edx)
-L(aligned_16_22bytes):
-	movdqa	%xmm0, -22(%edx)
-L(aligned_16_6bytes):
-	movl	%eax, -6(%edx)
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_119bytes):
-	movdqa	%xmm0, -119(%edx)
-L(aligned_16_103bytes):
-	movdqa	%xmm0, -103(%edx)
-L(aligned_16_87bytes):
-	movdqa	%xmm0, -87(%edx)
-L(aligned_16_71bytes):
-	movdqa	%xmm0, -71(%edx)
-L(aligned_16_55bytes):
-	movdqa	%xmm0, -55(%edx)
-L(aligned_16_39bytes):
-	movdqa	%xmm0, -39(%edx)
-L(aligned_16_23bytes):
-	movdqa	%xmm0, -23(%edx)
-L(aligned_16_7bytes):
-	movl	%eax, -7(%edx)
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_120bytes):
-	movdqa	%xmm0, -120(%edx)
-L(aligned_16_104bytes):
-	movdqa	%xmm0, -104(%edx)
-L(aligned_16_88bytes):
-	movdqa	%xmm0, -88(%edx)
-L(aligned_16_72bytes):
-	movdqa	%xmm0, -72(%edx)
-L(aligned_16_56bytes):
-	movdqa	%xmm0, -56(%edx)
-L(aligned_16_40bytes):
-	movdqa	%xmm0, -40(%edx)
-L(aligned_16_24bytes):
-	movdqa	%xmm0, -24(%edx)
-L(aligned_16_8bytes):
-	movq	%xmm0, -8(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_121bytes):
-	movdqa	%xmm0, -121(%edx)
-L(aligned_16_105bytes):
-	movdqa	%xmm0, -105(%edx)
-L(aligned_16_89bytes):
-	movdqa	%xmm0, -89(%edx)
-L(aligned_16_73bytes):
-	movdqa	%xmm0, -73(%edx)
-L(aligned_16_57bytes):
-	movdqa	%xmm0, -57(%edx)
-L(aligned_16_41bytes):
-	movdqa	%xmm0, -41(%edx)
-L(aligned_16_25bytes):
-	movdqa	%xmm0, -25(%edx)
-L(aligned_16_9bytes):
-	movq	%xmm0, -9(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_122bytes):
-	movdqa	%xmm0, -122(%edx)
-L(aligned_16_106bytes):
-	movdqa	%xmm0, -106(%edx)
-L(aligned_16_90bytes):
-	movdqa	%xmm0, -90(%edx)
-L(aligned_16_74bytes):
-	movdqa	%xmm0, -74(%edx)
-L(aligned_16_58bytes):
-	movdqa	%xmm0, -58(%edx)
-L(aligned_16_42bytes):
-	movdqa	%xmm0, -42(%edx)
-L(aligned_16_26bytes):
-	movdqa	%xmm0, -26(%edx)
-L(aligned_16_10bytes):
-	movq	%xmm0, -10(%edx)
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_123bytes):
-	movdqa	%xmm0, -123(%edx)
-L(aligned_16_107bytes):
-	movdqa	%xmm0, -107(%edx)
-L(aligned_16_91bytes):
-	movdqa	%xmm0, -91(%edx)
-L(aligned_16_75bytes):
-	movdqa	%xmm0, -75(%edx)
-L(aligned_16_59bytes):
-	movdqa	%xmm0, -59(%edx)
-L(aligned_16_43bytes):
-	movdqa	%xmm0, -43(%edx)
-L(aligned_16_27bytes):
-	movdqa	%xmm0, -27(%edx)
-L(aligned_16_11bytes):
-	movq	%xmm0, -11(%edx)
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_124bytes):
-	movdqa	%xmm0, -124(%edx)
-L(aligned_16_108bytes):
-	movdqa	%xmm0, -108(%edx)
-L(aligned_16_92bytes):
-	movdqa	%xmm0, -92(%edx)
-L(aligned_16_76bytes):
-	movdqa	%xmm0, -76(%edx)
-L(aligned_16_60bytes):
-	movdqa	%xmm0, -60(%edx)
-L(aligned_16_44bytes):
-	movdqa	%xmm0, -44(%edx)
-L(aligned_16_28bytes):
-	movdqa	%xmm0, -28(%edx)
-L(aligned_16_12bytes):
-	movq	%xmm0, -12(%edx)
-	movl	%eax, -4(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_125bytes):
-	movdqa	%xmm0, -125(%edx)
-L(aligned_16_109bytes):
-	movdqa	%xmm0, -109(%edx)
-L(aligned_16_93bytes):
-	movdqa	%xmm0, -93(%edx)
-L(aligned_16_77bytes):
-	movdqa	%xmm0, -77(%edx)
-L(aligned_16_61bytes):
-	movdqa	%xmm0, -61(%edx)
-L(aligned_16_45bytes):
-	movdqa	%xmm0, -45(%edx)
-L(aligned_16_29bytes):
-	movdqa	%xmm0, -29(%edx)
-L(aligned_16_13bytes):
-	movq	%xmm0, -13(%edx)
-	movl	%eax, -5(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_126bytes):
-	movdqa	%xmm0, -126(%edx)
-L(aligned_16_110bytes):
-	movdqa	%xmm0, -110(%edx)
-L(aligned_16_94bytes):
-	movdqa	%xmm0, -94(%edx)
-L(aligned_16_78bytes):
-	movdqa	%xmm0, -78(%edx)
-L(aligned_16_62bytes):
-	movdqa	%xmm0, -62(%edx)
-L(aligned_16_46bytes):
-	movdqa	%xmm0, -46(%edx)
-L(aligned_16_30bytes):
-	movdqa	%xmm0, -30(%edx)
-L(aligned_16_14bytes):
-	movq	%xmm0, -14(%edx)
-	movl	%eax, -6(%edx)
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(aligned_16_127bytes):
-	movdqa	%xmm0, -127(%edx)
-L(aligned_16_111bytes):
-	movdqa	%xmm0, -111(%edx)
-L(aligned_16_95bytes):
-	movdqa	%xmm0, -95(%edx)
-L(aligned_16_79bytes):
-	movdqa	%xmm0, -79(%edx)
-L(aligned_16_63bytes):
-	movdqa	%xmm0, -63(%edx)
-L(aligned_16_47bytes):
-	movdqa	%xmm0, -47(%edx)
-L(aligned_16_31bytes):
-	movdqa	%xmm0, -31(%edx)
-L(aligned_16_15bytes):
-	movq	%xmm0, -15(%edx)
-	movl	%eax, -7(%edx)
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN_END
-
-END (__memset_sse2)
-
-#endif
diff --git a/sysdeps/i386/i686/multiarch/memset.S b/sysdeps/i386/i686/multiarch/memset.S
deleted file mode 100644
index 34dddcef7b..0000000000
--- a/sysdeps/i386/i686/multiarch/memset.S
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Multiple versions of memset
-   Copyright (C) 2010 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib.  */
-#ifndef NOT_IN_libc
-# ifdef SHARED
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
-	.p2align 4
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
-	movl	(%esp), %ebx
-	ret
-
-	.text
-ENTRY(memset)
-	.type	memset, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	call	__i686.get_pc_thunk.bx
-	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memset_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memset_sse2@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memset_sse2_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(memset)
-# else
-	.text
-ENTRY(memset)
-	.type	memset, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memset_ia32, %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
-	jz	2f
-	leal	__memset_sse2, %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
-	jz	2f
-	leal	__memset_sse2_rep, %eax
-2:	ret
-END(memset)
-# endif
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __memset_ia32, @function; \
-	.globl __memset_ia32; \
-	.p2align 4; \
-	__memset_ia32: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __memset_ia32, .-__memset_ia32
-
-# undef ENTRY_CHK
-# define ENTRY_CHK(name) \
-	.type __memset_chk_ia32, @function; \
-	.globl __memset_chk_ia32; \
-	.p2align 4; \
-	__memset_chk_ia32: cfi_startproc; \
-	CALL_MCOUNT
-# undef END_CHK
-# define END_CHK(name) \
-	cfi_endproc; .size __memset_chk_ia32, .-__memset_chk_ia32
-
-# ifdef SHARED
-#  undef libc_hidden_builtin_def
-/* IFUNC doesn't work with the hidden functions in shared library since
-   they will be called without setting up EBX needed for PLT which is
-   used by IFUNC.  */
-#  define libc_hidden_builtin_def(name) \
-	.globl __GI_memset; __GI_memset = __memset_ia32
-# endif
-
-# undef strong_alias
-# define strong_alias(original, alias)
-#endif
-
-#include "../memset.S"
diff --git a/sysdeps/i386/i686/multiarch/memset_chk.S b/sysdeps/i386/i686/multiarch/memset_chk.S
deleted file mode 100644
index d659c7e56d..0000000000
--- a/sysdeps/i386/i686/multiarch/memset_chk.S
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Multiple versions of __memset_chk
-   Copyright (C) 2010 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib.  */
-#ifndef NOT_IN_libc
-# ifdef SHARED
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
-	.p2align 4
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
-	movl	(%esp), %ebx
-	ret
-
-	.text
-ENTRY(__memset_chk)
-	.type	__memset_chk, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	call	__i686.get_pc_thunk.bx
-	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memset_chk_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memset_chk_sse2@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memset_chk_sse2_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(__memset_chk)
-
-strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
-	.section .gnu.warning.__memset_zero_constant_len_parameter
-	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
-# else
-	.text
-ENTRY(__memset_chk)
-	.type	__memset_chk, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memset_chk_ia32, %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
-	jz	2f
-	leal	__memset_chk_sse2, %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
-	jz	2f
-	leal	__memset_chk_sse2_rep, %eax
-2:	ret
-END(__memset_chk)
-
-	.type __memset_chk_sse2, @function
-	.p2align 4;
-__memset_chk_sse2:
-	cfi_startproc
-	CALL_MCOUNT
-	movl	12(%esp), %eax
-	cmpl	%eax, 16(%esp)
-	jb	__chk_fail
-	jmp	__memset_sse2
-	cfi_endproc
-	.size __memset_chk_sse2, .-__memset_chk_sse2
-
-	.type __memset_chk_sse2_rep, @function
-	.p2align 4;
-__memset_chk_sse2_rep:
-	cfi_startproc
-	CALL_MCOUNT
-	movl	12(%esp), %eax
-	cmpl	%eax, 16(%esp)
-	jb	__chk_fail
-	jmp	__memset_sse2_rep
-	cfi_endproc
-	.size __memset_chk_sse2_rep, .-__memset_chk_sse2_rep
-
-	.type __memset_chk_ia32, @function
-	.p2align 4;
-__memset_chk_ia32:
-	cfi_startproc
-	CALL_MCOUNT
-	movl	12(%esp), %eax
-	cmpl	%eax, 16(%esp)
-	jb	__chk_fail
-	jmp	__memset_ia32
-	cfi_endproc
-	.size __memset_chk_ia32, .-__memset_chk_ia32
-# endif
-#endif
diff --git a/sysdeps/i386/sysdep.h b/sysdeps/i386/sysdep.h
index efdc82dde7..e03a8e926d 100644
--- a/sysdeps/i386/sysdep.h
+++ b/sysdeps/i386/sysdep.h
@@ -67,9 +67,6 @@
   ASM_SIZE_DIRECTIVE(name)						      \
   STABS_FUN_END(name)
 
-#define ENTRY_CHK(name) ENTRY (name)
-#define END_CHK(name) END (name)
-
 #ifdef HAVE_CPP_ASM_DEBUGINFO
 /* Disable that goop, because we just pass -g through to the assembler
    and it generates proper line number information directly.  */
diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c
index 54220379ec..5b66c62eb3 100644
--- a/sysdeps/x86_64/cacheinfo.c
+++ b/sysdeps/x86_64/cacheinfo.c
@@ -74,7 +74,6 @@ static const struct intel_02_cache_info
     { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
     { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
     { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
-    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),   24576 },
     { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
     { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
     { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
@@ -114,7 +113,6 @@ static const struct intel_02_cache_info
     { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
     { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
     { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
-    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
     { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
     { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
     { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
@@ -454,10 +452,9 @@ __cache_sysconf (int name)
 }
 
 
-/* Data cache size for use in memory and string routines, typically
+/* Half the data cache size for use in memory and string routines, typically
    L1 size.  */
 long int __x86_64_data_cache_size_half attribute_hidden = 32 * 1024 / 2;
-long int __x86_64_data_cache_size attribute_hidden = 32 * 1024;
 /* Shared cache size for use in memory and string routines, typically
    L2 or L3 size.  */
 long int __x86_64_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
@@ -660,10 +657,7 @@ init_cacheinfo (void)
     }
 
   if (data > 0)
-    {
-      __x86_64_data_cache_size_half = data / 2;
-      __x86_64_data_cache_size = data;
-    }
+    __x86_64_data_cache_size_half = data / 2;
 
   if (shared > 0)
     {
diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym b/sysdeps/x86_64/multiarch/ifunc-defines.sym
index eb1538abcc..e2021cdf87 100644
--- a/sysdeps/x86_64/multiarch/ifunc-defines.sym
+++ b/sysdeps/x86_64/multiarch/ifunc-defines.sym
@@ -13,8 +13,5 @@ CPUID_ECX_OFFSET	offsetof (struct cpuid_registers, ecx)
 CPUID_EDX_OFFSET	offsetof (struct cpuid_registers, edx)
 FAMILY_OFFSET		offsetof (struct cpu_features, family)
 MODEL_OFFSET		offsetof (struct cpu_features, model)
-FEATURE_OFFSET		offsetof (struct cpu_features, feature)
-FEATURE_SIZE		sizeof (unsigned int)
 
 COMMON_CPUID_INDEX_1
-FEATURE_INDEX_1
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index 50b2a38fbd..7823aceb9b 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -64,23 +64,7 @@ __init_cpu_features (void)
 	  __cpu_features.model += extended_model;
 	}
       else if (__cpu_features.family == 0x06)
-	{
-	  __cpu_features.model += extended_model;
-	  switch (__cpu_features.model)
-	    {
-	    case 0x1a:
-	    case 0x1e:
-	    case 0x1f:
-	    case 0x25:
-	    case 0x2e:
-	    case 0x2f:
-	      /* Rep string instructions are fast on Intel Core i3, i5
-		 and i7.  */
-	      __cpu_features.feature[index_Fast_Rep_String]
-		|= bit_Fast_Rep_String;
-	      break;
-	    }
-	}
+	__cpu_features.model += extended_model;
     }
   /* This spells out "AuthenticAMD".  */
   else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index 69492cb3bf..0f8f77a8a1 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -16,8 +16,6 @@
    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307 USA.  */
 
-#define bit_Fast_Rep_String	(1 << 0)
-
 #ifdef	__ASSEMBLER__
 
 #include <ifunc-defines.h>
@@ -30,8 +28,6 @@
 #define index_SSSE3	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
 #define index_SSE4_2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
 
-#define index_Fast_Rep_String	FEATURE_INDEX_1*FEATURE_SIZE
-
 #else	/* __ASSEMBLER__ */
 
 #include <sys/param.h>
@@ -43,13 +39,6 @@ enum
     COMMON_CPUID_INDEX_MAX
   };
 
-enum
-  {
-    FEATURE_INDEX_1 = 0,
-    /* Keep the following line at the end.  */
-    FEATURE_INDEX_MAX
-  };
-
 extern struct cpu_features
 {
   enum
@@ -69,7 +58,6 @@ extern struct cpu_features
   } cpuid[COMMON_CPUID_INDEX_MAX];
   unsigned int family;
   unsigned int model;
-  unsigned int feature[FEATURE_INDEX_MAX];
 } __cpu_features attribute_hidden;
 
 
@@ -98,6 +86,4 @@ extern const struct cpu_features *__get_cpu_features (void)
 #define HAS_SSE4_2	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20)
 #define HAS_FMA		HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12)
 
-#define index_Fast_Rep_String	FEATURE_INDEX_1
-
 #endif	/* __ASSEMBLER__ */