about summary refs log tree commit diff
path: root/REORG.TODO/sysdeps/i386/i686/multiarch
diff options
context:
space:
mode:
authorZack Weinberg <zackw@panix.com>2017-06-08 15:39:03 -0400
committerZack Weinberg <zackw@panix.com>2017-06-08 15:39:03 -0400
commit5046dbb4a7eba5eccfd258f92f4735c9ffc8d069 (patch)
tree4470480d904b65cf14ca524f96f79eca818c3eaf /REORG.TODO/sysdeps/i386/i686/multiarch
parent199fc19d3aaaf57944ef036e15904febe877fc93 (diff)
downloadglibc-5046dbb4a7eba5eccfd258f92f4735c9ffc8d069.tar.gz
glibc-5046dbb4a7eba5eccfd258f92f4735c9ffc8d069.tar.xz
glibc-5046dbb4a7eba5eccfd258f92f4735c9ffc8d069.zip
Prepare for radical source tree reorganization. zack/build-layout-experiment
All top-level files and directories are moved into a temporary storage
directory, REORG.TODO, except for files that will certainly still
exist in their current form at top level when we're done (COPYING,
COPYING.LIB, LICENSES, NEWS, README), all old ChangeLog files (which
are moved to the new directory OldChangeLogs, instead), and the
generated file INSTALL (which is just deleted; in the new order, there
will be no generated files checked into version control).
Diffstat (limited to 'REORG.TODO/sysdeps/i386/i686/multiarch')
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/Makefile44
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S59
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S62
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c376
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym11
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S502
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S709
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S65
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S1225
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S2157
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S62
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S681
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S1809
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S3162
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S78
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S50
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S89
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S94
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S81
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S50
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c7
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S417
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S724
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S45
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S811
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S860
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memset.S75
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S82
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S65
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c1
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c27
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c34
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c27
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c34
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c1
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S9
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S8
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c12
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S39
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c13
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S2
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S2
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S7
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S1245
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S572
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S92
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S158
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S348
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S57
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S804
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S2810
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S95
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S2250
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S3901
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S116
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c2
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S75
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S125
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S695
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S60
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c8
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S39
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c13
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S2
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S2
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S7
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c8
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S5
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c8
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S5
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S5
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S5
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c8
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S5
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c10
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S3
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S37
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c2
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S5
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S282
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S708
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S57
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c2
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S56
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c1
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c1
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h1
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c22
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S219
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S36
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c14
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S1018
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S39
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c5
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S600
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S36
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c9
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S193
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S37
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c5
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S354
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S35
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c9
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S4
-rw-r--r--REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S40
127 files changed, 32113 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile b/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile
new file mode 100644
index 0000000000..4a0c20c051
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile
@@ -0,0 +1,44 @@
+ifeq ($(subdir),csu)
+tests += test-multiarch
+endif
+
+ifeq ($(subdir),string)
+gen-as-const-headers += locale-defines.sym
+sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
+		   memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \
+		   memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
+		   memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \
+		   strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \
+		   memcmp-ssse3 memcmp-sse4 varshift \
+		   strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \
+		   strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \
+		   strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \
+		   strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \
+		   strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \
+		   memchr-sse2 memchr-sse2-bsf \
+		   memrchr-sse2 memrchr-sse2-bsf memrchr-c \
+		   rawmemchr-sse2 rawmemchr-sse2-bsf \
+		   strnlen-sse2 strnlen-c \
+		   strcasecmp_l-c strcasecmp-c strcasecmp_l-ssse3 \
+		   strncase_l-c strncase-c strncase_l-ssse3 \
+		   strcasecmp_l-sse4 strncase_l-sse4 \
+		   bcopy-sse2-unaligned memcpy-sse2-unaligned \
+		   mempcpy-sse2-unaligned memmove-sse2-unaligned \
+		   strcspn-c strpbrk-c strspn-c
+CFLAGS-varshift.c += -msse4
+CFLAGS-strcspn-c.c += -msse4
+CFLAGS-strpbrk-c.c += -msse4
+CFLAGS-strspn-c.c += -msse4
+endif
+
+ifeq ($(subdir),wcsmbs)
+sysdep_routines += wcscmp-sse2 wcscmp-c wcslen-sse2 wcslen-c \
+		   wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcschr-sse2 \
+		   wcschr-c wcsrchr-sse2 wcsrchr-c wcscpy-ssse3 wcscpy-c
+endif
+
+ifeq ($(subdir),math)
+libm-sysdep_routines += s_fma-fma s_fmaf-fma
+CFLAGS-s_fma-fma.c += -mavx -mfpmath=sse
+CFLAGS-s_fmaf-fma.c += -mavx -mfpmath=sse
+endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S
new file mode 100644
index 0000000000..efef2a10dd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY		__bcopy_sse2_unaligned
+#include "memcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
new file mode 100644
index 0000000000..cbc8b420e8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY		__bcopy_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S
new file mode 100644
index 0000000000..36aac44b9c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY		__bcopy_ssse3
+#include "memcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S
new file mode 100644
index 0000000000..877f82c28f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S
@@ -0,0 +1,59 @@
+/* Multiple versions of bcopy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#if IS_IN (libc)
+	.text
+ENTRY(bcopy)
+	.type	bcopy, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__bcopy_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__bcopy_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__bcopy_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__bcopy_ssse3_rep)
+2:	ret
+END(bcopy)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __bcopy_ia32, @function; \
+	.p2align 4; \
+	.globl __bcopy_ia32; \
+	.hidden __bcopy_ia32; \
+	__bcopy_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __bcopy_ia32, .-__bcopy_ia32
+
+#endif
+
+#include "../bcopy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
new file mode 100644
index 0000000000..507b288bb3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO
+#define __memset_sse2_rep __bzero_sse2_rep
+#include "memset-sse2-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S
new file mode 100644
index 0000000000..8d04512e4e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO
+#define __memset_sse2 __bzero_sse2
+#include "memset-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S
new file mode 100644
index 0000000000..9dac490aa2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S
@@ -0,0 +1,62 @@
+/* Multiple versions of bzero
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__bzero)
+	.type	__bzero, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__bzero_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX ( __bzero_sse2)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__bzero_sse2_rep)
+2:	ret
+END(__bzero)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __bzero_ia32, @function; \
+	.p2align 4; \
+	.globl __bzero_ia32; \
+	.hidden __bzero_ia32; \
+	__bzero_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __bzero_ia32, .-__bzero_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI___bzero; __GI___bzero = __bzero_ia32
+# endif
+#endif
+
+#include "../bzero.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c b/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
new file mode 100644
index 0000000000..e8026a2a78
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
@@ -0,0 +1,376 @@
+/* Enumerate available IFUNC implementations of a function.  i686 version.
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <assert.h>
+#include <string.h>
+#include <wchar.h>
+#include <ifunc-impl-list.h>
+#include "init-arch.h"
+
+/* Maximum number of IFUNC implementations.  */
+#define MAX_IFUNC	4
+
+/* Fill ARRAY of MAX elements with IFUNC implementations for function
+   NAME and return the number of valid entries.  */
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+			size_t max)
+{
+  assert (max >= MAX_IFUNC);
+
+  size_t i = 0;
+
+  /* Support sysdeps/i386/i686/multiarch/bcopy.S.  */
+  IFUNC_IMPL (i, name, bcopy,
+	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3),
+			      __bcopy_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3),
+			      __bcopy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSE2),
+			      __bcopy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/bzero.S.  */
+  IFUNC_IMPL (i, name, bzero,
+	      IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2),
+			      __bzero_sse2_rep)
+	      IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2),
+			      __bzero_sse2)
+	      IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memchr.S.  */
+  IFUNC_IMPL (i, name, memchr,
+	      IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2),
+			      __memchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2),
+			      __memchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memcmp.S.  */
+  IFUNC_IMPL (i, name, memcmp,
+	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_2),
+			      __memcmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3),
+			      __memcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memmove_chk.S.  */
+  IFUNC_IMPL (i, name, __memmove_chk,
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memmove_chk_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memmove_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __memmove_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+			      __memmove_chk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memmove.S.  */
+  IFUNC_IMPL (i, name, memmove,
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
+			      __memmove_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
+			      __memmove_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSE2),
+			      __memmove_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memrchr.S.  */
+  IFUNC_IMPL (i, name, memrchr,
+	      IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2),
+			      __memrchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2),
+			      __memrchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memset_chk.S.  */
+  IFUNC_IMPL (i, name, __memset_chk,
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __memset_chk_sse2_rep)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __memset_chk_sse2)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+			      __memset_chk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memset.S.  */
+  IFUNC_IMPL (i, name, memset,
+	      IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2),
+			      __memset_sse2_rep)
+	      IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2),
+			      __memset_sse2)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/rawmemchr.S.  */
+  IFUNC_IMPL (i, name, rawmemchr,
+	      IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2),
+			      __rawmemchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2),
+			      __rawmemchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/stpncpy.S.  */
+  IFUNC_IMPL (i, name, stpncpy,
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
+			      __stpncpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSE2),
+			      __stpncpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/stpcpy.S.  */
+  IFUNC_IMPL (i, name, stpcpy,
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
+			      __stpcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSE2),
+			      __stpcpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcasecmp.S.  */
+  IFUNC_IMPL (i, name, strcasecmp,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strcasecmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strcasecmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcasecmp_l.S.  */
+  IFUNC_IMPL (i, name, strcasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strcasecmp_l_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strcasecmp_l_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
+			      __strcasecmp_l_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcat.S.  */
+  IFUNC_IMPL (i, name, strcat,
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
+			      __strcat_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSE2),
+			      __strcat_sse2)
+	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strchr.S.  */
+  IFUNC_IMPL (i, name, strchr,
+	      IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2),
+			      __strchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2),
+			      __strchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcmp.S.  */
+  IFUNC_IMPL (i, name, strcmp,
+	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
+			      __strcmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3),
+			      __strcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcpy.S.  */
+  IFUNC_IMPL (i, name, strcpy,
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
+			      __strcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSE2),
+			      __strcpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcspn.S.  */
+  IFUNC_IMPL (i, name, strcspn,
+	      IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2),
+			      __strcspn_sse42)
+	      IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncase.S.  */
+  IFUNC_IMPL (i, name, strncasecmp,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strncasecmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strncasecmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
+			      __strncasecmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncase_l.S.  */
+  IFUNC_IMPL (i, name, strncasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strncasecmp_l_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strncasecmp_l_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
+			      __strncasecmp_l_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncat.S.  */
+  IFUNC_IMPL (i, name, strncat,
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
+			      __strncat_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSE2),
+			      __strncat_sse2)
+	      IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncpy.S.  */
+  IFUNC_IMPL (i, name, strncpy,
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
+			      __strncpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSE2),
+			      __strncpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strnlen.S.  */
+  IFUNC_IMPL (i, name, strnlen,
+	      IFUNC_IMPL_ADD (array, i, strnlen, HAS_CPU_FEATURE (SSE2),
+			      __strnlen_sse2)
+	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strpbrk.S.  */
+  IFUNC_IMPL (i, name, strpbrk,
+	      IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2),
+			      __strpbrk_sse42)
+	      IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strrchr.S.  */
+  IFUNC_IMPL (i, name, strrchr,
+	      IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2),
+			      __strrchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2),
+			      __strrchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strspn.S.  */
+  IFUNC_IMPL (i, name, strspn,
+	      IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2),
+			      __strspn_sse42)
+	      IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcschr.S.  */
+  IFUNC_IMPL (i, name, wcschr,
+	      IFUNC_IMPL_ADD (array, i, wcschr, HAS_CPU_FEATURE (SSE2),
+			      __wcschr_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcscmp.S.  */
+  IFUNC_IMPL (i, name, wcscmp,
+	      IFUNC_IMPL_ADD (array, i, wcscmp, HAS_CPU_FEATURE (SSE2),
+			      __wcscmp_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcscpy.S.  */
+  IFUNC_IMPL (i, name, wcscpy,
+	      IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
+			      __wcscpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcslen.S.  */
+  IFUNC_IMPL (i, name, wcslen,
+	      IFUNC_IMPL_ADD (array, i, wcslen, HAS_CPU_FEATURE (SSE2),
+			      __wcslen_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcsrchr.S.  */
+  IFUNC_IMPL (i, name, wcsrchr,
+	      IFUNC_IMPL_ADD (array, i, wcsrchr, HAS_CPU_FEATURE (SSE2),
+			      __wcsrchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wmemcmp.S.  */
+  IFUNC_IMPL (i, name, wmemcmp,
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_2),
+			      __wmemcmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3),
+			      __wmemcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_ia32))
+
+#ifdef SHARED
+  /* Support sysdeps/i386/i686/multiarch/memcpy_chk.S.  */
+  IFUNC_IMPL (i, name, __memcpy_chk,
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_chk_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __memcpy_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+			      __memcpy_chk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memcpy.S.  */
+  IFUNC_IMPL (i, name, memcpy,
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSE2),
+			      __memcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/mempcpy_chk.S.  */
+  IFUNC_IMPL (i, name, __mempcpy_chk,
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_chk_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __mempcpy_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+			      __mempcpy_chk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/mempcpy.S.  */
+  IFUNC_IMPL (i, name, mempcpy,
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSE2),
+			      __mempcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strlen.S.  */
+  IFUNC_IMPL (i, name, strlen,
+	      IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2),
+			      __strlen_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2),
+			      __strlen_sse2)
+	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncmp.S.  */
+  IFUNC_IMPL (i, name, strncmp,
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2),
+			      __strncmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
+			      __strncmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_ia32))
+#endif
+
+  return i;
+}
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym b/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym
new file mode 100644
index 0000000000..aebff9a4f9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym
@@ -0,0 +1,11 @@
+#include <locale/localeinfo.h>
+#include <langinfo.h>
+#include <stddef.h>
+
+--
+
+LOCALE_T___LOCALES		offsetof (struct __locale_struct, __locales)
+LC_CTYPE
+_NL_CTYPE_NONASCII_CASE
+LOCALE_DATA_VALUES		offsetof (struct __locale_data, values)
+SIZEOF_VALUES			sizeof (((struct __locale_data *) 0)->values[0])
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
new file mode 100644
index 0000000000..dd316486e6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
@@ -0,0 +1,502 @@
+/* Optimized memchr with sse2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+
+# ifndef USE_AS_RAWMEMCHR
+#  define LEN   STR2+4
+#  define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
+# endif
+
+# ifndef MEMCHR
+#  define MEMCHR __memchr_sse2_bsf
+# endif
+
+	.text
+ENTRY (MEMCHR)
+
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+# ifndef USE_AS_RAWMEMCHR
+	mov	LEN(%esp), %edx
+	test	%edx, %edx
+	jz	L(return_null_1)
+# endif
+	mov	%ecx, %eax
+
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+
+	and	$63, %ecx
+	pshufd	$0, %xmm1, %xmm1
+
+	cmp	$48, %ecx
+	ja	L(crosscache)
+
+	movdqu	(%eax), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	je	L(unaligned_no_match_1)
+/* Check which byte is a match.  */
+	bsf	%ecx, %ecx
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	%ecx, %edx
+	jbe	L(return_null_1)
+# endif
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(unaligned_no_match_1):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$16, %edx
+	jbe	L(return_null_1)
+	PUSH	(%edi)
+	lea	16(%eax), %edi
+	and	$15, %eax
+	and	$-16, %edi
+	add	%eax, %edx
+# else
+	lea	16(%eax), %edx
+	and	$-16, %edx
+# endif
+	jmp	L(loop_prolog)
+
+	.p2align 4
+L(return_null_1):
+	xor	%eax, %eax
+	ret
+
+# ifndef USE_AS_RAWMEMCHR
+	CFI_POP	(%edi)
+# endif
+
+	.p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+
+# ifndef USE_AS_RAWMEMCHR
+	PUSH	(%edi)
+	mov	%eax, %edi
+	and	$15, %ecx
+	and	$-16, %edi
+	movdqa	(%edi), %xmm0
+# else
+	mov	%eax, %edx
+	and	$15, %ecx
+	and	$-16, %edx
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+/* Remove the leading bytes.  */
+	sar	%cl, %eax
+	test	%eax, %eax
+	je	L(unaligned_no_match)
+/* Check which byte is a match.  */
+	bsf	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	%eax, %edx
+	jbe	L(return_null)
+	add	%edi, %eax
+	add	%ecx, %eax
+	RETURN
+# else
+	add	%edx, %eax
+	add	%ecx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(unaligned_no_match):
+# ifndef USE_AS_RAWMEMCHR
+        /* Calculate the last acceptable address and check for possible
+           addition overflow by using satured math:
+           edx = ecx + edx
+           edx |= -(edx < ecx)  */
+	add	%ecx, %edx
+	sbb	%eax, %eax
+	or	%eax, %edx
+	sub	$16, %edx
+	jbe	L(return_null)
+	add	$16, %edi
+# else
+	add	$16, %edx
+# endif
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop_prolog):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+# else
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm4
+
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+# ifndef USE_AS_RAWMEMCHR
+	test	$0x3f, %edi
+# else
+	test	$0x3f, %edx
+# endif
+	jz	L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+# else
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm3
+# else
+	movdqa	48(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+	test	%eax, %eax
+	jnz	L(matches0)
+
+# ifndef USE_AS_RAWMEMCHR
+	mov	%edi, %ecx
+	and	$-64, %edi
+	and	$63, %ecx
+	add	%ecx, %edx
+# else
+	and	$-64, %edx
+# endif
+
+	.p2align 4
+L(align64_loop):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+	movdqa	16(%edi), %xmm2
+	movdqa	32(%edi), %xmm3
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	(%edx), %xmm0
+	movdqa	16(%edx), %xmm2
+	movdqa	32(%edx), %xmm3
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm0, %xmm3
+	pmaxub	%xmm2, %xmm4
+	pmaxub	%xmm3, %xmm4
+	pmovmskb %xmm4, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edi
+# else
+	sub	$64, %edx
+# endif
+
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+
+	pcmpeqb	%xmm1, %xmm3
+
+# ifndef USE_AS_RAWMEMCHR
+	pcmpeqb	48(%edi), %xmm1
+# else
+	pcmpeqb	48(%edx), %xmm1
+# endif
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	pmovmskb %xmm1, %eax
+	bsf	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	lea	48(%edi, %eax), %eax
+	RETURN
+# else
+	lea	48(%edx, %eax), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+	movdqa	16(%edi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	32(%edi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	48(%edi), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(exit_loop_32):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches_1)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	16(%edi), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)
+	xor	%eax, %eax
+	RETURN
+# endif
+	.p2align 4
+L(matches0):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	lea	-16(%eax, %edi), %eax
+	RETURN
+# else
+	lea	-16(%eax, %edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(matches):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	add	%edi, %eax
+	RETURN
+# else
+	add	%edx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(matches16):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	lea	16(%eax, %edi), %eax
+	RETURN
+# else
+	lea	16(%eax, %edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(matches32):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	lea	32(%eax, %edi), %eax
+	RETURN
+# else
+	lea	32(%eax, %edx), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(matches_1):
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	add	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(matches16_1):
+	sub	$16, %edx
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	lea	16(%edi, %eax), %eax
+	RETURN
+
+	.p2align 4
+L(matches32_1):
+	sub	$32, %edx
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	lea	32(%edi, %eax), %eax
+	RETURN
+
+	.p2align 4
+L(matches48_1):
+	sub	$48, %edx
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	lea	48(%edi, %eax), %eax
+	RETURN
+# endif
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	RETURN
+# else
+	ret
+# endif
+
+END (MEMCHR)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S
new file mode 100644
index 0000000000..172d70de13
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S
@@ -0,0 +1,709 @@
+/* Optimized memchr with sse2 without bsf
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef USE_AS_RAWMEMCHR
+#  define ENTRANCE PUSH(%edi);
+#  define PARMS  8
+#  define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
+# else
+#  define ENTRANCE
+#  define PARMS  4
+# endif
+
+# define STR1  PARMS
+# define STR2  STR1+4
+
+# ifndef USE_AS_RAWMEMCHR
+#  define LEN   STR2+4
+# endif
+
+# ifndef MEMCHR
+#  define MEMCHR __memchr_sse2
+# endif
+
+	atom_text_section
+ENTRY (MEMCHR)
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+# ifndef USE_AS_RAWMEMCHR
+	mov	LEN(%esp), %edx
+	test	%edx, %edx
+	jz	L(return_null)
+# endif
+
+	punpcklbw %xmm1, %xmm1
+# ifndef USE_AS_RAWMEMCHR
+	mov	%ecx, %edi
+# else
+	mov	%ecx, %edx
+# endif
+	punpcklbw %xmm1, %xmm1
+
+	and	$63, %ecx
+	pshufd	$0, %xmm1, %xmm1
+	cmp	$48, %ecx
+	ja	L(crosscache)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqu	(%edi), %xmm0
+# else
+	movdqu	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	jnz	L(match_case2_prolog)
+
+	sub	$16, %edx
+	jbe	L(return_null)
+	lea	16(%edi), %edi
+	and	$15, %ecx
+	and	$-16, %edi
+	add	%ecx, %edx
+# else
+	jnz	L(match_case1_prolog)
+	lea	16(%edx), %edx
+	and	$-16, %edx
+# endif
+	jmp	L(loop_prolog)
+
+	.p2align 4
+L(crosscache):
+	and	$15, %ecx
+# ifndef USE_AS_RAWMEMCHR
+	and	$-16, %edi
+	movdqa	(%edi), %xmm0
+# else
+	and	$-16, %edx
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	sar	%cl, %eax
+	test	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	jnz	L(match_case2_prolog1)
+        /* "ecx" is less than 16.  Calculate "edx + ecx - 16" by using
+	   "edx - (16 - ecx)" instead of "(edx + ecx) - 16" to void
+	   possible addition overflow.  */
+	neg	%ecx
+	add	$16, %ecx
+	sub	%ecx, %edx
+	jbe	L(return_null)
+	lea	16(%edi), %edi
+# else
+	jnz	L(match_case1_prolog1)
+	lea	16(%edx), %edx
+# endif
+
+	.p2align 4
+L(loop_prolog):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+# else
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm4
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	lea	64(%edi), %edi
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	(%edi), %xmm0
+# else
+	lea	64(%edx), %edx
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm4
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	lea	64(%edi), %edi
+	mov	%edi, %ecx
+	and	$-64, %edi
+	and	$63, %ecx
+	add	%ecx, %edx
+# else
+	lea	64(%edx), %edx
+	and	$-64, %edx
+# endif
+
+	.p2align 4
+L(align64_loop):
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+	movdqa	16(%edi), %xmm2
+	movdqa	32(%edi), %xmm3
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	(%edx), %xmm0
+	movdqa	16(%edx), %xmm2
+	movdqa	32(%edx), %xmm3
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm0, %xmm3
+	pmaxub	%xmm2, %xmm4
+	pmaxub	%xmm3, %xmm4
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+	pmovmskb %xmm4, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edi
+# else
+	sub	$64, %edx
+# endif
+
+	pmovmskb %xmm0, %eax
+	xor	%ecx, %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+	pmovmskb %xmm2, %eax
+	lea	16(%ecx), %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	lea	16(%ecx), %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	pcmpeqb	48(%edi), %xmm1
+# else
+	pcmpeqb	48(%edx), %xmm1
+# endif
+	pmovmskb %xmm1, %eax
+	lea	16(%ecx), %ecx
+
+	.p2align 4
+L(match_case1):
+# ifndef USE_AS_RAWMEMCHR
+	add	%ecx, %edi
+# else
+L(match_case1_prolog1):
+	add	%ecx, %edx
+L(match_case1_prolog):
+# endif
+	test	%al, %al
+	jz	L(match_case1_high)
+	mov	%al, %cl
+	and	$15, %cl
+	jz	L(match_case1_8)
+	test	$0x01, %al
+	jnz	L(ExitCase1_1)
+	test	$0x02, %al
+	jnz	L(ExitCase1_2)
+	test	$0x04, %al
+	jnz	L(ExitCase1_3)
+# ifndef USE_AS_RAWMEMCHR
+	lea	3(%edi), %eax
+	RETURN
+# else
+	lea	3(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(match_case1_8):
+	test	$0x10, %al
+	jnz	L(ExitCase1_5)
+	test	$0x20, %al
+	jnz	L(ExitCase1_6)
+	test	$0x40, %al
+	jnz	L(ExitCase1_7)
+# ifndef USE_AS_RAWMEMCHR
+	lea	7(%edi), %eax
+	RETURN
+# else
+	lea	7(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(match_case1_high):
+	mov	%ah, %ch
+	and	$15, %ch
+	jz	L(match_case1_high_8)
+	test	$0x01, %ah
+	jnz	L(ExitCase1_9)
+	test	$0x02, %ah
+	jnz	L(ExitCase1_10)
+	test	$0x04, %ah
+	jnz	L(ExitCase1_11)
+# ifndef USE_AS_RAWMEMCHR
+	lea	11(%edi), %eax
+	RETURN
+# else
+	lea	11(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(match_case1_high_8):
+	test	$0x10, %ah
+	jnz	L(ExitCase1_13)
+	test	$0x20, %ah
+	jnz	L(ExitCase1_14)
+	test	$0x40, %ah
+	jnz	L(ExitCase1_15)
+# ifndef USE_AS_RAWMEMCHR
+	lea	15(%edi), %eax
+	RETURN
+# else
+	lea	15(%edx), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	movdqa	16(%edi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$32, %edx
+	jbe	L(return_null)
+
+	movdqa	32(%edi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	48(%edi), %xmm1
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+
+	xor	%eax, %eax
+	RETURN
+# endif
+
+	.p2align 4
+L(ExitCase1_1):
+# ifndef USE_AS_RAWMEMCHR
+	mov	%edi, %eax
+	RETURN
+# else
+	mov	%edx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_2):
+# ifndef USE_AS_RAWMEMCHR
+	lea	1(%edi), %eax
+	RETURN
+# else
+	lea	1(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_3):
+# ifndef USE_AS_RAWMEMCHR
+	lea	2(%edi), %eax
+	RETURN
+# else
+	lea	2(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_5):
+# ifndef USE_AS_RAWMEMCHR
+	lea	4(%edi), %eax
+	RETURN
+# else
+	lea	4(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_6):
+# ifndef USE_AS_RAWMEMCHR
+	lea	5(%edi), %eax
+	RETURN
+# else
+	lea	5(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_7):
+# ifndef USE_AS_RAWMEMCHR
+	lea	6(%edi), %eax
+	RETURN
+# else
+	lea	6(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_9):
+# ifndef USE_AS_RAWMEMCHR
+	lea	8(%edi), %eax
+	RETURN
+# else
+	lea	8(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_10):
+# ifndef USE_AS_RAWMEMCHR
+	lea	9(%edi), %eax
+	RETURN
+# else
+	lea	9(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_11):
+# ifndef USE_AS_RAWMEMCHR
+	lea	10(%edi), %eax
+	RETURN
+# else
+	lea	10(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_13):
+# ifndef USE_AS_RAWMEMCHR
+	lea	12(%edi), %eax
+	RETURN
+# else
+	lea	12(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_14):
+# ifndef USE_AS_RAWMEMCHR
+	lea	13(%edi), %eax
+	RETURN
+# else
+	lea	13(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_15):
+# ifndef USE_AS_RAWMEMCHR
+	lea	14(%edi), %eax
+	RETURN
+# else
+	lea	14(%edx), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(match_case2):
+	sub	%ecx, %edx
+L(match_case2_prolog1):
+	add	%ecx, %edi
+L(match_case2_prolog):
+	test	%al, %al
+	jz	L(match_case2_high)
+	mov	%al, %cl
+	and	$15, %cl
+	jz	L(match_case2_8)
+	test	$0x01, %al
+	jnz	L(ExitCase2_1)
+	test	$0x02, %al
+	jnz	L(ExitCase2_2)
+	test	$0x04, %al
+	jnz	L(ExitCase2_3)
+	sub	$4, %edx
+	jb	L(return_null)
+	lea	3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_8):
+	test	$0x10, %al
+	jnz	L(ExitCase2_5)
+	test	$0x20, %al
+	jnz	L(ExitCase2_6)
+	test	$0x40, %al
+	jnz	L(ExitCase2_7)
+	sub	$8, %edx
+	jb	L(return_null)
+	lea	7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_high):
+	mov	%ah, %ch
+	and	$15, %ch
+	jz	L(match_case2_high_8)
+	test	$0x01, %ah
+	jnz	L(ExitCase2_9)
+	test	$0x02, %ah
+	jnz	L(ExitCase2_10)
+	test	$0x04, %ah
+	jnz	L(ExitCase2_11)
+	sub	$12, %edx
+	jb	L(return_null)
+	lea	11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_high_8):
+	test	$0x10, %ah
+	jnz	L(ExitCase2_13)
+	test	$0x20, %ah
+	jnz	L(ExitCase2_14)
+	test	$0x40, %ah
+	jnz	L(ExitCase2_15)
+	sub	$16, %edx
+	jb	L(return_null)
+	lea	15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_1):
+	mov	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_2):
+	sub	$2, %edx
+	jb	L(return_null)
+	lea	1(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_3):
+	sub	$3, %edx
+	jb	L(return_null)
+	lea	2(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_5):
+	sub	$5, %edx
+	jb	L(return_null)
+	lea	4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_6):
+	sub	$6, %edx
+	jb	L(return_null)
+	lea	5(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_7):
+	sub	$7, %edx
+	jb	L(return_null)
+	lea	6(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_9):
+	sub	$9, %edx
+	jb	L(return_null)
+	lea	8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_10):
+	sub	$10, %edx
+	jb	L(return_null)
+	lea	9(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_11):
+	sub	$11, %edx
+	jb	L(return_null)
+	lea	10(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_13):
+	sub	$13, %edx
+	jb	L(return_null)
+	lea	12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_14):
+	sub	$14, %edx
+	jb	L(return_null)
+	lea	13(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_15):
+	sub	$15, %edx
+	jb	L(return_null)
+	lea	14(%edi), %eax
+	RETURN
+# endif
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	RETURN
+# else
+	ret
+# endif
+
+END (MEMCHR)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S
new file mode 100644
index 0000000000..bd0dace290
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S
@@ -0,0 +1,65 @@
+/* Multiple versions of memchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__memchr)
+	.type	__memchr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	3f
+
+	LOAD_FUNC_GOT_EAX ( __memchr_sse2)
+	ret
+
+2:	LOAD_FUNC_GOT_EAX (__memchr_ia32)
+	ret
+
+3:	LOAD_FUNC_GOT_EAX (__memchr_sse2_bsf)
+	ret
+END(__memchr)
+
+weak_alias(__memchr, memchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memchr_ia32, @function; \
+	.globl __memchr_ia32; \
+	.p2align 4; \
+	__memchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memchr_ia32, .-__memchr_ia32
+
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_memchr; __GI_memchr = __memchr_ia32
+
+#endif
+#include "../../memchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S
new file mode 100644
index 0000000000..2aa13048b2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S
@@ -0,0 +1,1225 @@
+/* memcmp with SSE4.2, wmemcmp with SSE4.2
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_sse4_2
+# endif
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	4
+# define BLK1	PARMS
+# define BLK2	BLK1 + 4
+# define LEN	BLK2 + 4
+# define RETURN	POP (%ebx); ret; CFI_PUSH (%ebx)
+
+
+# ifdef SHARED
+#  define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+	jump	table with relative offsets.  INDEX is a register contains the
+	index	into the jump table.   SCALE is the scale of INDEX. */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+/* We first load PC into EBX.  */	\
+	SETUP_PIC_REG(bx);	\
+/* Get the address of the jump table.  */	\
+	addl	$(TABLE - .), %ebx;	\
+/* Get the entry and convert the relative offset to the	\
+	absolute	address.  */	\
+	addl	(%ebx,INDEX,SCALE), %ebx;	\
+/* We loaded the jump table and adjusted EDX/ESI. Go.  */	\
+	jmp	*%ebx
+# else
+#  define JMPTBL(I, B)	I
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+	jump	table with relative offsets.  INDEX is a register contains the
+	index	into the jump table.   SCALE is the scale of INDEX. */
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	jmp	*TABLE(,INDEX,SCALE)
+# endif
+
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elemnts.
+*/
+
+	.section .text.sse4.2,"ax",@progbits
+ENTRY (MEMCMP)
+	movl	BLK1(%esp), %eax
+	movl	BLK2(%esp), %edx
+	movl	LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %ecx
+	test	%ecx, %ecx
+	jz	L(return0)
+# else
+	cmp	$1, %ecx
+	jbe	L(less1bytes)
+# endif
+
+	pxor	%xmm0, %xmm0
+	cmp	$64, %ecx
+	ja	L(64bytesormore)
+	cmp	$8, %ecx
+
+# ifndef USE_AS_WMEMCMP
+	PUSH	(%ebx)
+	jb	L(less8bytes)
+# else
+	jb	L(less8bytes)
+	PUSH	(%ebx)
+# endif
+
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less8bytes):
+	mov	(%eax), %bl
+	cmpb	(%edx), %bl
+	jne	L(nonzero)
+
+	mov	1(%eax), %bl
+	cmpb	1(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$2, %ecx
+	jz	L(0bytes)
+
+	mov	2(%eax), %bl
+	cmpb	2(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$3, %ecx
+	jz	L(0bytes)
+
+	mov	3(%eax), %bl
+	cmpb	3(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$4, %ecx
+	jz	L(0bytes)
+
+	mov	4(%eax), %bl
+	cmpb	4(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$5, %ecx
+	jz	L(0bytes)
+
+	mov	5(%eax), %bl
+	cmpb	5(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$6, %ecx
+	jz	L(0bytes)
+
+	mov	6(%eax), %bl
+	cmpb	6(%edx), %bl
+	je	L(0bytes)
+
+L(nonzero):
+	POP	(%ebx)
+	mov	$1, %eax
+	ja	L(above)
+	neg	%eax
+L(above):
+	ret
+	CFI_PUSH (%ebx)
+# endif
+
+	.p2align 4
+L(0bytes):
+	POP	(%ebx)
+	xor	%eax, %eax
+	ret
+
+# ifdef USE_AS_WMEMCMP
+
+/* for wmemcmp, case N == 1 */
+
+	.p2align 4
+L(less8bytes):
+	mov	(%eax), %ecx
+	cmp	(%edx), %ecx
+	je	L(return0)
+	mov	$1, %eax
+	jg	L(find_diff_bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(find_diff_bigger):
+	ret
+
+	.p2align 4
+L(return0):
+	xor	%eax, %eax
+	ret
+# endif
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less1bytes):
+	jb	L(0bytesend)
+	movzbl	(%eax), %eax
+	movzbl	(%edx), %edx
+	sub	%edx, %eax
+	ret
+
+	.p2align 4
+L(0bytesend):
+	xor	%eax, %eax
+	ret
+# endif
+	.p2align 4
+L(64bytesormore):
+	PUSH	(%ebx)
+	mov	%ecx, %ebx
+	mov	$64, %ecx
+	sub	$64, %ebx
+L(64bytesormore_loop):
+	movdqu	(%eax), %xmm1
+	movdqu	(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_16diff)
+
+	movdqu	16(%eax), %xmm1
+	movdqu	16(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_32diff)
+
+	movdqu	32(%eax), %xmm1
+	movdqu	32(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_48diff)
+
+	movdqu	48(%eax), %xmm1
+	movdqu	48(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_64diff)
+	add	%ecx, %eax
+	add	%ecx, %edx
+	sub	%ecx, %ebx
+	jae	L(64bytesormore_loop)
+	add	%ebx, %ecx
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+# ifdef USE_AS_WMEMCMP
+
+/* Label needs only for table_64bytes filling */
+L(unreal_case):
+/* no code here */
+
+# endif
+	.p2align 4
+L(find_16diff):
+	sub	$16, %ecx
+L(find_32diff):
+	sub	$16, %ecx
+L(find_48diff):
+	sub	$16, %ecx
+L(find_64diff):
+	add	%ecx, %edx
+	add	%ecx, %eax
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(16bytes):
+	mov	-16(%eax), %ecx
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+# else
+	.p2align 4
+L(16bytes):
+	mov	-16(%eax), %ecx
+	cmp	-16(%edx), %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	cmp	-12(%edx), %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	cmp	-8(%edx), %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	cmp	-4(%edx), %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+# endif
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(49bytes):
+	movdqu	-49(%eax), %xmm1
+	movdqu	-49(%edx), %xmm2
+	mov	$-49, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(33bytes):
+	movdqu	-33(%eax), %xmm1
+	movdqu	-33(%edx), %xmm2
+	mov	$-33, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(17bytes):
+	mov	-17(%eax), %ecx
+	mov	-17(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(13bytes):
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(9bytes):
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(5bytes):
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(50bytes):
+	mov	$-50, %ebx
+	movdqu	-50(%eax), %xmm1
+	movdqu	-50(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(34bytes):
+	mov	$-34, %ebx
+	movdqu	-34(%eax), %xmm1
+	movdqu	-34(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(18bytes):
+	mov	-18(%eax), %ecx
+	mov	-18(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(14bytes):
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(10bytes):
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(6bytes):
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(2bytes):
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(51bytes):
+	mov	$-51, %ebx
+	movdqu	-51(%eax), %xmm1
+	movdqu	-51(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(35bytes):
+	mov	$-35, %ebx
+	movdqu	-35(%eax), %xmm1
+	movdqu	-35(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(19bytes):
+	movl	-19(%eax), %ecx
+	movl	-19(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+L(1bytes):
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+# endif
+	.p2align 4
+L(52bytes):
+	movdqu	-52(%eax), %xmm1
+	movdqu	-52(%edx), %xmm2
+	mov	$-52, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(36bytes):
+	movdqu	-36(%eax), %xmm1
+	movdqu	-36(%edx), %xmm2
+	mov	$-36, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(20bytes):
+	movdqu	-20(%eax), %xmm1
+	movdqu	-20(%edx), %xmm2
+	mov	$-20, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(53bytes):
+	movdqu	-53(%eax), %xmm1
+	movdqu	-53(%edx), %xmm2
+	mov	$-53, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(37bytes):
+	mov	$-37, %ebx
+	movdqu	-37(%eax), %xmm1
+	movdqu	-37(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(21bytes):
+	mov	$-21, %ebx
+	movdqu	-21(%eax), %xmm1
+	movdqu	-21(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(54bytes):
+	movdqu	-54(%eax), %xmm1
+	movdqu	-54(%edx), %xmm2
+	mov	$-54, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(38bytes):
+	mov	$-38, %ebx
+	movdqu	-38(%eax), %xmm1
+	movdqu	-38(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(22bytes):
+	mov	$-22, %ebx
+	movdqu	-22(%eax), %xmm1
+	movdqu	-22(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(55bytes):
+	movdqu	-55(%eax), %xmm1
+	movdqu	-55(%edx), %xmm2
+	mov	$-55, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(39bytes):
+	mov	$-39, %ebx
+	movdqu	-39(%eax), %xmm1
+	movdqu	-39(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(23bytes):
+	mov	$-23, %ebx
+	movdqu	-23(%eax), %xmm1
+	movdqu	-23(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+# endif
+	.p2align 4
+L(56bytes):
+	movdqu	-56(%eax), %xmm1
+	movdqu	-56(%edx), %xmm2
+	mov	$-56, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(40bytes):
+	mov	$-40, %ebx
+	movdqu	-40(%eax), %xmm1
+	movdqu	-40(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(24bytes):
+	mov	$-24, %ebx
+	movdqu	-24(%eax), %xmm1
+	movdqu	-24(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-8(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(57bytes):
+	movdqu	-57(%eax), %xmm1
+	movdqu	-57(%edx), %xmm2
+	mov	$-57, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(41bytes):
+	mov	$-41, %ebx
+	movdqu	-41(%eax), %xmm1
+	movdqu	-41(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(25bytes):
+	mov	$-25, %ebx
+	movdqu	-25(%eax), %xmm1
+	movdqu	-25(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(58bytes):
+	movdqu	-58(%eax), %xmm1
+	movdqu	-58(%edx), %xmm2
+	mov	$-58, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(42bytes):
+	mov	$-42, %ebx
+	movdqu	-42(%eax), %xmm1
+	movdqu	-42(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(26bytes):
+	mov	$-26, %ebx
+	movdqu	-26(%eax), %xmm1
+	movdqu	-26(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(59bytes):
+	movdqu	-59(%eax), %xmm1
+	movdqu	-59(%edx), %xmm2
+	mov	$-59, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(43bytes):
+	mov	$-43, %ebx
+	movdqu	-43(%eax), %xmm1
+	movdqu	-43(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(27bytes):
+	mov	$-27, %ebx
+	movdqu	-27(%eax), %xmm1
+	movdqu	-27(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+# endif
+	.p2align 4
+L(60bytes):
+	movdqu	-60(%eax), %xmm1
+	movdqu	-60(%edx), %xmm2
+	mov	$-60, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(44bytes):
+	mov	$-44, %ebx
+	movdqu	-44(%eax), %xmm1
+	movdqu	-44(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(28bytes):
+	mov	$-28, %ebx
+	movdqu	-28(%eax), %xmm1
+	movdqu	-28(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-12(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-8(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(61bytes):
+	movdqu	-61(%eax), %xmm1
+	movdqu	-61(%edx), %xmm2
+	mov	$-61, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(45bytes):
+	mov	$-45, %ebx
+	movdqu	-45(%eax), %xmm1
+	movdqu	-45(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(29bytes):
+	mov	$-29, %ebx
+	movdqu	-29(%eax), %xmm1
+	movdqu	-29(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(62bytes):
+	movdqu	-62(%eax), %xmm1
+	movdqu	-62(%edx), %xmm2
+	mov	$-62, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(46bytes):
+	mov	$-46, %ebx
+	movdqu	-46(%eax), %xmm1
+	movdqu	-46(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(30bytes):
+	mov	$-30, %ebx
+	movdqu	-30(%eax), %xmm1
+	movdqu	-30(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(63bytes):
+	movdqu	-63(%eax), %xmm1
+	movdqu	-63(%edx), %xmm2
+	mov	$-63, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(47bytes):
+	mov	$-47, %ebx
+	movdqu	-47(%eax), %xmm1
+	movdqu	-47(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(31bytes):
+	mov	$-31, %ebx
+	movdqu	-31(%eax), %xmm1
+	movdqu	-31(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+# endif
+
+	.p2align 4
+L(64bytes):
+	movdqu	-64(%eax), %xmm1
+	movdqu	-64(%edx), %xmm2
+	mov	$-64, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(48bytes):
+	movdqu	-48(%eax), %xmm1
+	movdqu	-48(%edx), %xmm2
+	mov	$-48, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(32bytes):
+	movdqu	-32(%eax), %xmm1
+	movdqu	-32(%edx), %xmm2
+	mov	$-32, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-16(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-16(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-12(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-8(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less16bytes):
+	add	%ebx, %eax
+	add	%ebx, %edx
+
+	mov	(%eax), %ecx
+	mov	(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	4(%eax), %ecx
+	mov	4(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	8(%eax), %ecx
+	mov	8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	12(%eax), %ecx
+	mov	12(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+# else
+	.p2align 4
+L(less16bytes):
+	add	%ebx, %eax
+	add	%ebx, %edx
+
+	mov	(%eax), %ecx
+	cmp	(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	4(%eax), %ecx
+	cmp	4(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	8(%eax), %ecx
+	cmp	8(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	12(%eax), %ecx
+	cmp	12(%edx), %ecx
+
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+# endif
+
+	.p2align 4
+L(find_diff):
+# ifndef USE_AS_WMEMCMP
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	shr	$16,%ecx
+	shr	$16,%ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+L(end):
+	POP	(%ebx)
+	mov	$1, %eax
+	ja	L(bigger)
+	neg	%eax
+L(bigger):
+	ret
+# else
+	POP	(%ebx)
+	mov	$1, %eax
+	jg	L(bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(bigger):
+	ret
+# endif
+END (MEMCMP)
+
+	.section .rodata.sse4.2,"a",@progbits
+	.p2align 2
+	.type	L(table_64bytes), @object
+# ifndef USE_AS_WMEMCMP
+L(table_64bytes):
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(1bytes), L(table_64bytes))
+	.int	JMPTBL (L(2bytes), L(table_64bytes))
+	.int	JMPTBL (L(3bytes), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(5bytes), L(table_64bytes))
+	.int	JMPTBL (L(6bytes), L(table_64bytes))
+	.int	JMPTBL (L(7bytes), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(9bytes), L(table_64bytes))
+	.int	JMPTBL (L(10bytes), L(table_64bytes))
+	.int	JMPTBL (L(11bytes), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(13bytes), L(table_64bytes))
+	.int	JMPTBL (L(14bytes), L(table_64bytes))
+	.int	JMPTBL (L(15bytes), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(17bytes), L(table_64bytes))
+	.int	JMPTBL (L(18bytes), L(table_64bytes))
+	.int	JMPTBL (L(19bytes), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(21bytes), L(table_64bytes))
+	.int	JMPTBL (L(22bytes), L(table_64bytes))
+	.int	JMPTBL (L(23bytes), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(25bytes), L(table_64bytes))
+	.int	JMPTBL (L(26bytes), L(table_64bytes))
+	.int	JMPTBL (L(27bytes), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(29bytes), L(table_64bytes))
+	.int	JMPTBL (L(30bytes), L(table_64bytes))
+	.int	JMPTBL (L(31bytes), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(33bytes), L(table_64bytes))
+	.int	JMPTBL (L(34bytes), L(table_64bytes))
+	.int	JMPTBL (L(35bytes), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(37bytes), L(table_64bytes))
+	.int	JMPTBL (L(38bytes), L(table_64bytes))
+	.int	JMPTBL (L(39bytes), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(41bytes), L(table_64bytes))
+	.int	JMPTBL (L(42bytes), L(table_64bytes))
+	.int	JMPTBL (L(43bytes), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(45bytes), L(table_64bytes))
+	.int	JMPTBL (L(46bytes), L(table_64bytes))
+	.int	JMPTBL (L(47bytes), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(49bytes), L(table_64bytes))
+	.int	JMPTBL (L(50bytes), L(table_64bytes))
+	.int	JMPTBL (L(51bytes), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(53bytes), L(table_64bytes))
+	.int	JMPTBL (L(54bytes), L(table_64bytes))
+	.int	JMPTBL (L(55bytes), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(57bytes), L(table_64bytes))
+	.int	JMPTBL (L(58bytes), L(table_64bytes))
+	.int	JMPTBL (L(59bytes), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(61bytes), L(table_64bytes))
+	.int	JMPTBL (L(62bytes), L(table_64bytes))
+	.int	JMPTBL (L(63bytes), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+# else
+L(table_64bytes):
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
new file mode 100644
index 0000000000..5ebf5a4d73
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
@@ -0,0 +1,2157 @@
+/* memcmp with SSSE3, wmemcmp with SSSE3
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP		__memcmp_ssse3
+# endif
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS		4
+# define BLK1		PARMS
+# define BLK2		BLK1+4
+# define LEN		BLK2+4
+# define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret
+# define RETURN		RETURN_END; cfi_restore_state; cfi_remember_state
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elemnts.
+*/
+
+	atom_text_section
+ENTRY (MEMCMP)
+	movl	LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %ecx
+	test	%ecx, %ecx
+	jz	L(zero)
+# endif
+
+	movl	BLK1(%esp), %eax
+	cmp	$48, %ecx
+	movl	BLK2(%esp), %edx
+	jae	L(48bytesormore)
+
+# ifndef USE_AS_WMEMCMP
+	cmp	$1, %ecx
+	jbe	L(less1bytes)
+# endif
+
+	PUSH	(%ebx)
+	add	%ecx, %edx
+	add	%ecx, %eax
+	jmp	L(less48bytes)
+
+	CFI_POP	(%ebx)
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less1bytes):
+	jb	L(zero)
+	movb	(%eax), %cl
+	cmp	(%edx), %cl
+	je	L(zero)
+	mov	$1, %eax
+	ja	L(1bytesend)
+	neg	%eax
+L(1bytesend):
+	ret
+# endif
+
+	.p2align 4
+L(zero):
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(48bytesormore):
+	PUSH	(%ebx)
+	PUSH	(%esi)
+	PUSH	(%edi)
+	cfi_remember_state
+	movdqu	(%eax), %xmm3
+	movdqu	(%edx), %xmm0
+	movl	%eax, %edi
+	movl	%edx, %esi
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	16(%edi), %edi
+
+	sub	$0xffff, %edx
+	lea	16(%esi), %esi
+	jnz	L(less16bytes)
+	mov	%edi, %edx
+	and	$0xf, %edx
+	xor	%edx, %edi
+	sub	%edx, %esi
+	add	%edx, %ecx
+	mov	%esi, %edx
+	and	$0xf, %edx
+	jz	L(shr_0)
+	xor	%edx, %esi
+
+# ifndef USE_AS_WMEMCMP
+	cmp	$8, %edx
+	jae	L(next_unaligned_table)
+	cmp	$0, %edx
+	je	L(shr_0)
+	cmp	$1, %edx
+	je	L(shr_1)
+	cmp	$2, %edx
+	je	L(shr_2)
+	cmp	$3, %edx
+	je	L(shr_3)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$5, %edx
+	je	L(shr_5)
+	cmp	$6, %edx
+	je	L(shr_6)
+	jmp	L(shr_7)
+
+	.p2align 2
+L(next_unaligned_table):
+	cmp	$8, %edx
+	je	L(shr_8)
+	cmp	$9, %edx
+	je	L(shr_9)
+	cmp	$10, %edx
+	je	L(shr_10)
+	cmp	$11, %edx
+	je	L(shr_11)
+	cmp	$12, %edx
+	je	L(shr_12)
+	cmp	$13, %edx
+	je	L(shr_13)
+	cmp	$14, %edx
+	je	L(shr_14)
+	jmp	L(shr_15)
+# else
+	cmp	$0, %edx
+	je	L(shr_0)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$8, %edx
+	je	L(shr_8)
+	jmp	L(shr_12)
+# endif
+
+	.p2align 4
+L(shr_0):
+	cmp	$80, %ecx
+	jae	L(shr_0_gobble)
+	lea	-48(%ecx), %ecx
+	xor	%eax, %eax
+	movaps	(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+	movaps	16(%esi), %xmm2
+	pcmpeqb	16(%edi), %xmm2
+	pand	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
+	add	$32, %edi
+	add	$32, %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_0_gobble):
+	lea	-48(%ecx), %ecx
+	movdqa	(%esi), %xmm0
+	xor	%eax, %eax
+	pcmpeqb	(%edi), %xmm0
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm2
+	pcmpeqb	16(%edi), %xmm2
+L(shr_0_gobble_loop):
+	pand	%xmm0, %xmm2
+	sub	$32, %ecx
+	pmovmskb %xmm2, %edx
+	movdqa	%xmm0, %xmm1
+	movdqa	32(%esi), %xmm0
+	movdqa	48(%esi), %xmm2
+	sbb	$0xffff, %edx
+	pcmpeqb	32(%edi), %xmm0
+	pcmpeqb	48(%edi), %xmm2
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	jz	L(shr_0_gobble_loop)
+
+	pand	%xmm0, %xmm2
+	cmp	$0, %ecx
+	jge	L(shr_0_gobble_loop_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_0_gobble_loop_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm2, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_1):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_1_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$1,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$1,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	1(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_1_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$1,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$1,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_1_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$1,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$1,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_1_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_1_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_1_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	1(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_2):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_2_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$2,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$2,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	2(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_2_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$2,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$2,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_2_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$2,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$2,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_2_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_2_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_2_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	2(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_3):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_3_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$3,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$3,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	3(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_3_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$3,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$3,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_3_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$3,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$3,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_3_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_3_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_3_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	3(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+# endif
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_4):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_4_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$4,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$4,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	4(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_4_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$4,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$4,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_4_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$4,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$4,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_4_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_4_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_4_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	4(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_5):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_5_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$5,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$5,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	5(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_5_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$5,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$5,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_5_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$5,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$5,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_5_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_5_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_5_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	5(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_6):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_6_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$6,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$6,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	6(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_6_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$6,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$6,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_6_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$6,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$6,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_6_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_6_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_6_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	6(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_7):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_7_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$7,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$7,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	7(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_7_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$7,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$7,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_7_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$7,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$7,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_7_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_7_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_7_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	7(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+# endif
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_8):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_8_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$8,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$8,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	8(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_8_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$8,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$8,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_8_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$8,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$8,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_8_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_8_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_8_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	8(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_9):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_9_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$9,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$9,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	9(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_9_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$9,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$9,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_9_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$9,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$9,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_9_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_9_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_9_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	9(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_10):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_10_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$10, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$10,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	10(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_10_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$10, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$10, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_10_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$10,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$10,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_10_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_10_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_10_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	10(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_11):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_11_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$11, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$11, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	11(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_11_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$11, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$11, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_11_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$11,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$11,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_11_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_11_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_11_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	11(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+# endif
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_12):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_12_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$12, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$12, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	12(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_12_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$12, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$12, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_12_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$12,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$12,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_12_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_12_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_12_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	12(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_13):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_13_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$13, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$13, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	13(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_13_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$13, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$13, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_13_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$13,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$13,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_13_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_13_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_13_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	13(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_14):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_14_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$14, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$14, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	14(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_14_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$14, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$14, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_14_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$14,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$14,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_14_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_14_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_14_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	14(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_15):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_15_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$15, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$15, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	15(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_15_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$15, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$15, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_15_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$15,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$15,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_15_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_15_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_15_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	15(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+# endif
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(exit):
+	pmovmskb %xmm1, %ebx
+	sub	$0xffff, %ebx
+	jz	L(first16bytes)
+	lea	-16(%esi), %esi
+	lea	-16(%edi), %edi
+	mov	%ebx, %edx
+
+L(first16bytes):
+	add	%eax, %esi
+L(less16bytes):
+
+# ifndef USE_AS_WMEMCMP
+	test	%dl, %dl
+	jz	L(next_24_bytes)
+
+	test	$0x01, %dl
+	jnz	L(Byte16)
+
+	test	$0x02, %dl
+	jnz	L(Byte17)
+
+	test	$0x04, %dl
+	jnz	L(Byte18)
+
+	test	$0x08, %dl
+	jnz	L(Byte19)
+
+	test	$0x10, %dl
+	jnz	L(Byte20)
+
+	test	$0x20, %dl
+	jnz	L(Byte21)
+
+	test	$0x40, %dl
+	jnz	L(Byte22)
+L(Byte23):
+	movzbl	-9(%edi), %eax
+	movzbl	-9(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte16):
+	movzbl	-16(%edi), %eax
+	movzbl	-16(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte17):
+	movzbl	-15(%edi), %eax
+	movzbl	-15(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte18):
+	movzbl	-14(%edi), %eax
+	movzbl	-14(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte19):
+	movzbl	-13(%edi), %eax
+	movzbl	-13(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte20):
+	movzbl	-12(%edi), %eax
+	movzbl	-12(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte21):
+	movzbl	-11(%edi), %eax
+	movzbl	-11(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte22):
+	movzbl	-10(%edi), %eax
+	movzbl	-10(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(next_24_bytes):
+	lea	8(%edi), %edi
+	lea	8(%esi), %esi
+	test	$0x01, %dh
+	jnz	L(Byte16)
+
+	test	$0x02, %dh
+	jnz	L(Byte17)
+
+	test	$0x04, %dh
+	jnz	L(Byte18)
+
+	test	$0x08, %dh
+	jnz	L(Byte19)
+
+	test	$0x10, %dh
+	jnz	L(Byte20)
+
+	test	$0x20, %dh
+	jnz	L(Byte21)
+
+	test	$0x40, %dh
+	jnz	L(Byte22)
+
+	.p2align 4
+L(Byte31):
+	movzbl	-9(%edi), %eax
+	movzbl	-9(%esi), %edx
+	sub	%edx, %eax
+	RETURN_END
+# else
+
+/* special for wmemcmp */
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words)
+	and	$15, %dl
+	jz	L(second_double_word)
+	mov	-16(%edi), %eax
+	cmp	-16(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word):
+	mov	-12(%edi), %eax
+	cmp	-12(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words):
+	and	$15, %dh
+	jz	L(fourth_double_word)
+	mov	-8(%edi), %eax
+	cmp	-8(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word):
+	mov	-4(%edi), %eax
+	cmp	-4(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(nequal):
+	mov	$1, %eax
+	jg	L(nequal_bigger)
+	neg	%eax
+	RETURN
+
+	.p2align 4
+L(nequal_bigger):
+	RETURN_END
+# endif
+
+	CFI_PUSH (%ebx)
+
+	.p2align 4
+L(more8bytes):
+	cmp	$16, %ecx
+	jae	L(more16bytes)
+	cmp	$8, %ecx
+	je	L(8bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$9, %ecx
+	je	L(9bytes)
+	cmp	$10, %ecx
+	je	L(10bytes)
+	cmp	$11, %ecx
+	je	L(11bytes)
+	cmp	$12, %ecx
+	je	L(12bytes)
+	cmp	$13, %ecx
+	je	L(13bytes)
+	cmp	$14, %ecx
+	je	L(14bytes)
+	jmp	L(15bytes)
+# else
+	jmp	L(12bytes)
+# endif
+
+	.p2align 4
+L(more16bytes):
+	cmp	$24, %ecx
+	jae	L(more24bytes)
+	cmp	$16, %ecx
+	je	L(16bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$17, %ecx
+	je	L(17bytes)
+	cmp	$18, %ecx
+	je	L(18bytes)
+	cmp	$19, %ecx
+	je	L(19bytes)
+	cmp	$20, %ecx
+	je	L(20bytes)
+	cmp	$21, %ecx
+	je	L(21bytes)
+	cmp	$22, %ecx
+	je	L(22bytes)
+	jmp	L(23bytes)
+# else
+	jmp	L(20bytes)
+# endif
+
+	.p2align 4
+L(more24bytes):
+	cmp	$32, %ecx
+	jae	L(more32bytes)
+	cmp	$24, %ecx
+	je	L(24bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$25, %ecx
+	je	L(25bytes)
+	cmp	$26, %ecx
+	je	L(26bytes)
+	cmp	$27, %ecx
+	je	L(27bytes)
+	cmp	$28, %ecx
+	je	L(28bytes)
+	cmp	$29, %ecx
+	je	L(29bytes)
+	cmp	$30, %ecx
+	je	L(30bytes)
+	jmp	L(31bytes)
+# else
+	jmp	L(28bytes)
+# endif
+
+	.p2align 4
+L(more32bytes):
+	cmp	$40, %ecx
+	jae	L(more40bytes)
+	cmp	$32, %ecx
+	je	L(32bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$33, %ecx
+	je	L(33bytes)
+	cmp	$34, %ecx
+	je	L(34bytes)
+	cmp	$35, %ecx
+	je	L(35bytes)
+	cmp	$36, %ecx
+	je	L(36bytes)
+	cmp	$37, %ecx
+	je	L(37bytes)
+	cmp	$38, %ecx
+	je	L(38bytes)
+	jmp	L(39bytes)
+# else
+	jmp	L(36bytes)
+# endif
+
+	.p2align 4
+L(less48bytes):
+	cmp	$8, %ecx
+	jae	L(more8bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$2, %ecx
+	je	L(2bytes)
+	cmp	$3, %ecx
+	je	L(3bytes)
+	cmp	$4, %ecx
+	je	L(4bytes)
+	cmp	$5, %ecx
+	je	L(5bytes)
+	cmp	$6, %ecx
+	je	L(6bytes)
+	jmp	L(7bytes)
+# else
+	jmp	L(4bytes)
+# endif
+
+	.p2align 4
+L(more40bytes):
+	cmp	$40, %ecx
+	je	L(40bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$41, %ecx
+	je	L(41bytes)
+	cmp	$42, %ecx
+	je	L(42bytes)
+	cmp	$43, %ecx
+	je	L(43bytes)
+	cmp	$44, %ecx
+	je	L(44bytes)
+	cmp	$45, %ecx
+	je	L(45bytes)
+	cmp	$46, %ecx
+	je	L(46bytes)
+	jmp	L(47bytes)
+
+	.p2align 4
+L(44bytes):
+	mov	-44(%eax), %ecx
+	mov	-44(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(40bytes):
+	mov	-40(%eax), %ecx
+	mov	-40(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(36bytes):
+	mov	-36(%eax), %ecx
+	mov	-36(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(32bytes):
+	mov	-32(%eax), %ecx
+	mov	-32(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(28bytes):
+	mov	-28(%eax), %ecx
+	mov	-28(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(24bytes):
+	mov	-24(%eax), %ecx
+	mov	-24(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(20bytes):
+	mov	-20(%eax), %ecx
+	mov	-20(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(16bytes):
+	mov	-16(%eax), %ecx
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+# else
+	.p2align 4
+L(44bytes):
+	mov	-44(%eax), %ecx
+	cmp	-44(%edx), %ecx
+	jne	L(find_diff)
+L(40bytes):
+	mov	-40(%eax), %ecx
+	cmp	-40(%edx), %ecx
+	jne	L(find_diff)
+L(36bytes):
+	mov	-36(%eax), %ecx
+	cmp	-36(%edx), %ecx
+	jne	L(find_diff)
+L(32bytes):
+	mov	-32(%eax), %ecx
+	cmp	-32(%edx), %ecx
+	jne	L(find_diff)
+L(28bytes):
+	mov	-28(%eax), %ecx
+	cmp	-28(%edx), %ecx
+	jne	L(find_diff)
+L(24bytes):
+	mov	-24(%eax), %ecx
+	cmp	-24(%edx), %ecx
+	jne	L(find_diff)
+L(20bytes):
+	mov	-20(%eax), %ecx
+	cmp	-20(%edx), %ecx
+	jne	L(find_diff)
+L(16bytes):
+	mov	-16(%eax), %ecx
+	cmp	-16(%edx), %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	cmp	-12(%edx), %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	cmp	-8(%edx), %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	xor	%eax, %eax
+	cmp	-4(%edx), %ecx
+	jne	L(find_diff)
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+# endif
+
+# ifndef USE_AS_WMEMCMP
+
+	.p2align 4
+L(45bytes):
+	mov	-45(%eax), %ecx
+	mov	-45(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(41bytes):
+	mov	-41(%eax), %ecx
+	mov	-41(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(37bytes):
+	mov	-37(%eax), %ecx
+	mov	-37(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(33bytes):
+	mov	-33(%eax), %ecx
+	mov	-33(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(29bytes):
+	mov	-29(%eax), %ecx
+	mov	-29(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(25bytes):
+	mov	-25(%eax), %ecx
+	mov	-25(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(21bytes):
+	mov	-21(%eax), %ecx
+	mov	-21(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(17bytes):
+	mov	-17(%eax), %ecx
+	mov	-17(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(13bytes):
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(9bytes):
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(5bytes):
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	.p2align 4
+L(46bytes):
+	mov	-46(%eax), %ecx
+	mov	-46(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(42bytes):
+	mov	-42(%eax), %ecx
+	mov	-42(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(38bytes):
+	mov	-38(%eax), %ecx
+	mov	-38(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(34bytes):
+	mov	-34(%eax), %ecx
+	mov	-34(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(30bytes):
+	mov	-30(%eax), %ecx
+	mov	-30(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(26bytes):
+	mov	-26(%eax), %ecx
+	mov	-26(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(22bytes):
+	mov	-22(%eax), %ecx
+	mov	-22(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(18bytes):
+	mov	-18(%eax), %ecx
+	mov	-18(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(14bytes):
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(10bytes):
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(6bytes):
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(2bytes):
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	.p2align 4
+L(47bytes):
+	movl	-47(%eax), %ecx
+	movl	-47(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(43bytes):
+	movl	-43(%eax), %ecx
+	movl	-43(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(39bytes):
+	movl	-39(%eax), %ecx
+	movl	-39(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(35bytes):
+	movl	-35(%eax), %ecx
+	movl	-35(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(31bytes):
+	movl	-31(%eax), %ecx
+	movl	-31(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(27bytes):
+	movl	-27(%eax), %ecx
+	movl	-27(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(23bytes):
+	movl	-23(%eax), %ecx
+	movl	-23(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(19bytes):
+	movl	-19(%eax), %ecx
+	movl	-19(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	.p2align 4
+L(find_diff):
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	shr	$16,%ecx
+	shr	$16,%ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+
+	.p2align 4
+L(end):
+	POP	(%ebx)
+	mov	$1, %eax
+	ja	L(bigger)
+	neg	%eax
+L(bigger):
+	ret
+# else
+
+/* for wmemcmp */
+	.p2align 4
+L(find_diff):
+	POP	(%ebx)
+	mov	$1, %eax
+	jg	L(find_diff_bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(find_diff_bigger):
+	ret
+
+# endif
+END (MEMCMP)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S
new file mode 100644
index 0000000000..1fc5994a17
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S
@@ -0,0 +1,62 @@
+/* Multiple versions of memcmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(memcmp)
+	.type	memcmp, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memcmp_ia32)
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcmp_sse4_2)
+2:	ret
+END(memcmp)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memcmp_ia32, @function; \
+	.p2align 4; \
+	.globl __memcmp_ia32; \
+	.hidden __memcmp_ia32; \
+	__memcmp_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memcmp_ia32, .-__memcmp_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memcmp; __GI_memcmp = __memcmp_ia32
+# endif
+#endif
+
+#include "../memcmp.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..2fe2072cb1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
@@ -0,0 +1,681 @@
+/* memcpy optimized with SSE2 unaligned memory access instructions.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+#  define MEMCPY	__memcpy_sse2_unaligned
+#  define MEMCPY_CHK	__memcpy_chk_sse2_unaligned
+# endif
+
+# ifdef USE_AS_BCOPY
+#  define SRC		PARMS
+#  define DEST		SRC+4
+#  define LEN		DEST+4
+# else
+#  define DEST		PARMS
+#  define SRC		DEST+4
+#  define LEN		SRC+4
+# endif
+
+# define CFI_PUSH(REG)		\
+  cfi_adjust_cfa_offset (4);		\
+  cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)		\
+  cfi_adjust_cfa_offset (-4);		\
+  cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS		8		/* Preserve EBX.  */
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN	RETURN_END; CFI_PUSH (%ebx)
+
+	.section .text.sse2,"ax",@progbits
+# if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+# endif
+
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+	cmp	%edx, %eax
+
+# ifdef USE_AS_MEMMOVE
+	jg	L(check_forward)
+
+L(mm_len_0_or_more_backward):
+/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
+	separately.  */
+	cmp	$16, %ecx
+	jbe	L(mm_len_0_16_bytes_backward)
+
+	cmpl	$32, %ecx
+	jg	L(mm_len_32_or_more_backward)
+
+/* Copy [0..32] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	-16(%eax, %ecx), %xmm1
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_32_or_more_backward):
+	cmpl	$64, %ecx
+	jg	L(mm_len_64_or_more_backward)
+
+/* Copy [0..64] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	-16(%eax, %ecx), %xmm2
+	movdqu	-32(%eax, %ecx), %xmm3
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, -16(%edx, %ecx)
+	movdqu	%xmm3, -32(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_64_or_more_backward):
+	cmpl	$128, %ecx
+	jg	L(mm_len_128_or_more_backward)
+
+/* Copy [0..128] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_128_or_more_backward):
+	add	%ecx, %eax
+	cmp	%edx, %eax
+	movl	SRC(%esp), %eax
+	jle	L(forward)
+	PUSH (%esi)
+	PUSH (%edi)
+	PUSH (%ebx)
+
+/* Aligning the address of destination. */
+	movdqu	(%eax), %xmm4
+	movdqu	16(%eax), %xmm5
+	movdqu	32(%eax), %xmm6
+	movdqu	48(%eax), %xmm7
+	leal	(%edx, %ecx), %esi
+	movdqu	-16(%eax, %ecx), %xmm0
+	subl	$16, %esp
+	movdqu	%xmm0, (%esp)
+	mov	%ecx, %edi
+	movl	%esi, %ecx
+	andl	$-16, %ecx
+	leal	(%ecx), %ebx
+	subl	%edx, %ebx
+	leal	(%eax, %ebx), %eax
+	shrl	$6, %ebx
+
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %edi
+# else
+#  ifdef SHARED
+	PUSH (%ebx)
+	SETUP_PIC_REG (bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
+	POP (%ebx)
+#  else
+	cmp	__x86_shared_cache_size_half, %edi
+#  endif
+# endif
+	jae	L(mm_large_page_loop_backward)
+
+	.p2align 4
+L(mm_main_loop_backward):
+
+	prefetcht0 -128(%eax)
+
+	movdqu	-64(%eax), %xmm0
+	movdqu	-48(%eax), %xmm1
+	movdqu	-32(%eax), %xmm2
+	movdqu	-16(%eax), %xmm3
+	movaps	%xmm0, -64(%ecx)
+	subl	$64, %eax
+	movaps	%xmm1, -48(%ecx)
+	movaps	%xmm2, -32(%ecx)
+	movaps	%xmm3, -16(%ecx)
+	subl	$64, %ecx
+	sub	$1, %ebx
+	jnz	L(mm_main_loop_backward)
+	movdqu	(%esp), %xmm0
+	addl	$16, %esp
+	movdqu	%xmm0, -16(%esi)
+	movdqu	%xmm4, (%edx)
+	movdqu	%xmm5, 16(%edx)
+	movdqu	%xmm6, 32(%edx)
+	movdqu	%xmm7, 48(%edx)
+	POP (%ebx)
+	jmp	L(mm_return_pop_all)
+
+/* Copy [0..16] and return.  */
+L(mm_len_0_16_bytes_backward):
+	testb	$24, %cl
+	jnz	L(mm_len_9_16_bytes_backward)
+	testb	$4, %cl
+	.p2align 4,,5
+	jnz	L(mm_len_5_8_bytes_backward)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(return)
+	testb	$2, %cl
+	.p2align 4,,1
+	jne	L(mm_len_3_4_bytes_backward)
+	movzbl	-1(%eax,%ecx), %ebx
+	movzbl	(%eax), %eax
+	movb	%bl, -1(%edx,%ecx)
+	movb	%al, (%edx)
+	jmp	L(return)
+
+L(mm_len_3_4_bytes_backward):
+	movzwl	-2(%eax,%ecx), %ebx
+	movzwl	(%eax), %eax
+	movw	%bx, -2(%edx,%ecx)
+	movw	%ax, (%edx)
+	jmp	L(return)
+
+L(mm_len_9_16_bytes_backward):
+	PUSH (%esi)
+	movl	-4(%eax,%ecx), %ebx
+	movl	-8(%eax,%ecx), %esi
+	movl	%ebx, -4(%edx,%ecx)
+	movl	%esi, -8(%edx,%ecx)
+	subl	$8, %ecx
+	POP (%esi)
+	jmp	L(mm_len_0_16_bytes_backward)
+
+L(mm_len_5_8_bytes_backward):
+	movl	(%eax), %ebx
+	movl	-4(%eax,%ecx), %eax
+	movl	%ebx, (%edx)
+	movl	%eax, -4(%edx,%ecx)
+	jmp	L(return)
+
+/* Big length copy backward part.  */
+	.p2align 4
+L(mm_large_page_loop_backward):
+	movdqu	-64(%eax), %xmm0
+	movdqu	-48(%eax), %xmm1
+	movdqu	-32(%eax), %xmm2
+	movdqu	-16(%eax), %xmm3
+	movntdq	%xmm0, -64(%ecx)
+	subl	$64, %eax
+	movntdq	%xmm1, -48(%ecx)
+	movntdq	%xmm2, -32(%ecx)
+	movntdq	%xmm3, -16(%ecx)
+	subl	$64, %ecx
+	sub	$1, %ebx
+	jnz	L(mm_large_page_loop_backward)
+	sfence
+	movdqu	(%esp), %xmm0
+	addl	$16, %esp
+	movdqu	%xmm0, -16(%esi)
+	movdqu	%xmm4, (%edx)
+	movdqu	%xmm5, 16(%edx)
+	movdqu	%xmm6, 32(%edx)
+	movdqu	%xmm7, 48(%edx)
+	POP (%ebx)
+	jmp	L(mm_return_pop_all)
+
+L(check_forward):
+	add	%edx, %ecx
+	cmp	%eax, %ecx
+	movl	LEN(%esp), %ecx
+	jle	L(forward)
+
+/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
+	separately.  */
+	cmp	$16, %ecx
+	jbe	L(mm_len_0_16_bytes_forward)
+
+	cmpl	$32, %ecx
+	ja	L(mm_len_32_or_more_forward)
+
+/* Copy [0..32] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	-16(%eax, %ecx), %xmm1
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_32_or_more_forward):
+	cmpl	$64, %ecx
+	ja	L(mm_len_64_or_more_forward)
+
+/* Copy [0..64] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	-16(%eax, %ecx), %xmm2
+	movdqu	-32(%eax, %ecx), %xmm3
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, -16(%edx, %ecx)
+	movdqu	%xmm3, -32(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_64_or_more_forward):
+	cmpl	$128, %ecx
+	ja	L(mm_len_128_or_more_forward)
+
+/* Copy [0..128] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_128_or_more_forward):
+	PUSH (%esi)
+	PUSH (%edi)
+	PUSH (%ebx)
+
+/* Aligning the address of destination. */
+	movdqu	-16(%eax, %ecx), %xmm4
+	movdqu	-32(%eax, %ecx), %xmm5
+	movdqu	-48(%eax, %ecx), %xmm6
+	movdqu	-64(%eax, %ecx), %xmm7
+	leal	(%edx, %ecx), %esi
+	movdqu	(%eax), %xmm0
+	subl	$16, %esp
+	movdqu	%xmm0, (%esp)
+	mov	%ecx, %edi
+	leal	16(%edx), %ecx
+	andl	$-16, %ecx
+	movl	%ecx, %ebx
+	subl	%edx, %ebx
+	addl	%ebx, %eax
+	movl	%esi, %ebx
+	subl	%ecx, %ebx
+	shrl	$6, %ebx
+
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %edi
+# else
+#  ifdef SHARED
+	PUSH (%ebx)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
+	POP (%ebx)
+#  else
+	cmp	__x86_shared_cache_size_half, %edi
+#  endif
+# endif
+	jae	L(mm_large_page_loop_forward)
+
+	.p2align 4
+L(mm_main_loop_forward):
+
+	prefetcht0 128(%eax)
+
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqa	%xmm0, (%ecx)
+	addl	$64, %eax
+	movaps	%xmm1, 16(%ecx)
+	movaps	%xmm2, 32(%ecx)
+	movaps	%xmm3, 48(%ecx)
+	addl	$64, %ecx
+	sub	$1, %ebx
+	jnz	L(mm_main_loop_forward)
+	movdqu	(%esp), %xmm0
+	addl	$16, %esp
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm4, -16(%esi)
+	movdqu	%xmm5, -32(%esi)
+	movdqu	%xmm6, -48(%esi)
+	movdqu	%xmm7, -64(%esi)
+	POP (%ebx)
+	jmp	L(mm_return_pop_all)
+
+L(mm_len_0_16_bytes_forward):
+	testb	$24, %cl
+	jne	L(mm_len_9_16_bytes_forward)
+	testb	$4, %cl
+	.p2align 4,,5
+	jne	L(mm_len_5_8_bytes_forward)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(return)
+	testb	$2, %cl
+	.p2align 4,,1
+	jne	L(mm_len_2_4_bytes_forward)
+	movzbl	-1(%eax,%ecx), %ebx
+	movzbl	(%eax), %eax
+	movb	%bl, -1(%edx,%ecx)
+	movb	%al, (%edx)
+	jmp	L(return)
+
+L(mm_len_2_4_bytes_forward):
+	movzwl	-2(%eax,%ecx), %ebx
+	movzwl	(%eax), %eax
+	movw	%bx, -2(%edx,%ecx)
+	movw	%ax, (%edx)
+	jmp	L(return)
+
+L(mm_len_5_8_bytes_forward):
+	movl	(%eax), %ebx
+	movl	-4(%eax,%ecx), %eax
+	movl	%ebx, (%edx)
+	movl	%eax, -4(%edx,%ecx)
+	jmp	L(return)
+
+L(mm_len_9_16_bytes_forward):
+	movq	(%eax), %xmm0
+	movq	-8(%eax, %ecx), %xmm1
+	movq	%xmm0, (%edx)
+	movq	%xmm1, -8(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_return_pop_all):
+	movl	%edx, %eax
+	POP (%edi)
+	POP (%esi)
+	RETURN
+
+/* Big length copy forward part.  */
+	.p2align 4
+L(mm_large_page_loop_forward):
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movntdq	%xmm0, (%ecx)
+	addl	$64, %eax
+	movntdq	%xmm1, 16(%ecx)
+	movntdq	%xmm2, 32(%ecx)
+	movntdq	%xmm3, 48(%ecx)
+	addl	$64, %ecx
+	sub	$1, %ebx
+	jnz	L(mm_large_page_loop_forward)
+	sfence
+	movdqu	(%esp), %xmm0
+	addl	$16, %esp
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm4, -16(%esi)
+	movdqu	%xmm5, -32(%esi)
+	movdqu	%xmm6, -48(%esi)
+	movdqu	%xmm7, -64(%esi)
+	POP (%ebx)
+	jmp	L(mm_return_pop_all)
+# endif
+
+L(forward):
+	cmp	$16, %ecx
+	jbe	L(len_0_16_bytes)
+
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_shared_cache_size_half, %ecx
+#  endif
+# endif
+	jae     L(large_page)
+
+	movdqu	(%eax), %xmm0
+	movdqu	-16(%eax, %ecx), %xmm1
+	cmpl    $32, %ecx
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jbe	L(return)
+
+	movdqu	16(%eax), %xmm0
+	movdqu	-32(%eax, %ecx), %xmm1
+	cmpl    $64, %ecx
+	movdqu	%xmm0, 16(%edx)
+	movdqu	%xmm1, -32(%edx, %ecx)
+	jbe	L(return)
+
+	movdqu	32(%eax), %xmm0
+	movdqu	48(%eax), %xmm1
+	movdqu	-48(%eax, %ecx), %xmm2
+	movdqu	-64(%eax, %ecx), %xmm3
+	cmpl    $128, %ecx
+	movdqu	%xmm0, 32(%edx)
+	movdqu	%xmm1, 48(%edx)
+	movdqu	%xmm2, -48(%edx, %ecx)
+	movdqu	%xmm3, -64(%edx, %ecx)
+	jbe	L(return)
+
+/* Now the main loop: we align the address of the destination.  */
+	leal	64(%edx), %ebx
+	andl	$-64, %ebx
+
+	addl	%edx, %ecx
+	andl	$-64, %ecx
+
+	subl	%edx, %eax
+
+/* We should stop two iterations before the termination
+	(in order not to misprefetch).  */
+	subl	$64, %ecx
+	cmpl	%ebx, %ecx
+	je	L(main_loop_just_one_iteration)
+
+	subl	$64, %ecx
+	cmpl	%ebx, %ecx
+	je	L(main_loop_last_two_iterations)
+
+	.p2align 4
+L(main_loop_cache):
+
+	prefetcht0 128(%ebx, %eax)
+
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqa	%xmm0, (%ebx)
+	movaps	%xmm1, 16(%ebx)
+	movaps	%xmm2, 32(%ebx)
+	movaps	%xmm3, 48(%ebx)
+	lea	64(%ebx), %ebx
+	cmpl	%ebx, %ecx
+	jne	L(main_loop_cache)
+
+L(main_loop_last_two_iterations):
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqu	64(%ebx, %eax), %xmm4
+	movdqu	80(%ebx, %eax), %xmm5
+	movdqu	96(%ebx, %eax), %xmm6
+	movdqu	112(%ebx, %eax), %xmm7
+	movdqa	%xmm0, (%ebx)
+	movaps	%xmm1, 16(%ebx)
+	movaps	%xmm2, 32(%ebx)
+	movaps	%xmm3, 48(%ebx)
+	movaps	%xmm4, 64(%ebx)
+	movaps	%xmm5, 80(%ebx)
+	movaps	%xmm6, 96(%ebx)
+	movaps	%xmm7, 112(%ebx)
+	jmp	L(return)
+
+L(main_loop_just_one_iteration):
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqa	%xmm0, (%ebx)
+	movaps	%xmm1, 16(%ebx)
+	movaps	%xmm2, 32(%ebx)
+	movaps	%xmm3, 48(%ebx)
+	jmp	L(return)
+
+L(large_page):
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+
+	movdqu	64(%eax), %xmm0
+	movdqu	80(%eax), %xmm1
+	movdqu	96(%eax), %xmm2
+	movdqu	112(%eax), %xmm3
+	movdqu	-128(%eax, %ecx), %xmm4
+	movdqu	-112(%eax, %ecx), %xmm5
+	movdqu	-96(%eax, %ecx), %xmm6
+	movdqu	-80(%eax, %ecx), %xmm7
+	movdqu	%xmm0, 64(%edx)
+	movdqu	%xmm1, 80(%edx)
+	movdqu	%xmm2, 96(%edx)
+	movdqu	%xmm3, 112(%edx)
+	movdqu	%xmm4, -128(%edx, %ecx)
+	movdqu	%xmm5, -112(%edx, %ecx)
+	movdqu	%xmm6, -96(%edx, %ecx)
+	movdqu	%xmm7, -80(%edx, %ecx)
+
+/* Now the main loop with non temporal stores. We align
+	the address of the destination.  */
+	leal	128(%edx), %ebx
+	andl	$-128, %ebx
+
+	addl	%edx, %ecx
+	andl	$-128, %ecx
+
+	subl	%edx, %eax
+
+	.p2align 4
+L(main_loop_large_page):
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqu	64(%ebx, %eax), %xmm4
+	movdqu	80(%ebx, %eax), %xmm5
+	movdqu	96(%ebx, %eax), %xmm6
+	movdqu	112(%ebx, %eax), %xmm7
+	movntdq	%xmm0, (%ebx)
+	movntdq	%xmm1, 16(%ebx)
+	movntdq	%xmm2, 32(%ebx)
+	movntdq	%xmm3, 48(%ebx)
+	movntdq	%xmm4, 64(%ebx)
+	movntdq	%xmm5, 80(%ebx)
+	movntdq	%xmm6, 96(%ebx)
+	movntdq	%xmm7, 112(%ebx)
+	lea	128(%ebx), %ebx
+	cmpl	%ebx, %ecx
+	jne	L(main_loop_large_page)
+	sfence
+	jmp	L(return)
+
+L(len_0_16_bytes):
+	testb	$24, %cl
+	jne	L(len_9_16_bytes)
+	testb	$4, %cl
+	.p2align 4,,5
+	jne	L(len_5_8_bytes)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(return)
+	movzbl	(%eax), %ebx
+	testb	$2, %cl
+	movb	%bl, (%edx)
+	je	L(return)
+	movzwl	-2(%eax,%ecx), %ebx
+	movw	%bx, -2(%edx,%ecx)
+	jmp	L(return)
+
+L(len_9_16_bytes):
+	movq	(%eax), %xmm0
+	movq	-8(%eax, %ecx), %xmm1
+	movq	%xmm0, (%edx)
+	movq	%xmm1, -8(%edx, %ecx)
+	jmp	L(return)
+
+L(len_5_8_bytes):
+	movl	(%eax), %ebx
+	movl	%ebx, (%edx)
+	movl	-4(%eax,%ecx), %ebx
+	movl	%ebx, -4(%edx,%ecx)
+
+L(return):
+	movl	%edx, %eax
+# if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+	RETURN
+
+END (MEMCPY)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
new file mode 100644
index 0000000000..687e083147
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
@@ -0,0 +1,1809 @@
+/* memcpy with SSSE3 and REP string.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY		__memcpy_ssse3_rep
+# define MEMCPY_CHK	__memcpy_chk_ssse3_rep
+#endif
+
+#ifdef USE_AS_BCOPY
+# define SRC		PARMS
+# define DEST		SRC+4
+# define LEN		DEST+4
+#else
+# define DEST		PARMS
+# define SRC		DEST+4
+# define LEN		SRC+4
+#endif
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef SHARED
+# define PARMS		8		/* Preserve EBX.  */
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register contains the
+   index into the jump table.   SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    /* We first load PC into EBX.  */				\
+    SETUP_PIC_REG(bx);						\
+    /* Get the address of the jump table.  */			\
+    addl	$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
+    addl	$(TABLE - .), %ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+#else
+# define PARMS		4
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  INDEX is a register contains the index into the
+   jump table.  SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    jmp		*TABLE(,INDEX,SCALE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
+    jmp		*TABLE(,INDEX,SCALE)
+#endif
+
+	.section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+
+#ifdef USE_AS_MEMMOVE
+	cmp	%eax, %edx
+	jb	L(copy_forward)
+	je	L(fwd_write_0bytes)
+	cmp	$48, %ecx
+	jb	L(bk_write_less48bytes)
+	add	%ecx, %eax
+	cmp	%eax, %edx
+	movl	SRC(%esp), %eax
+	jb	L(copy_backward)
+
+L(copy_forward):
+#endif
+	cmp	$48, %ecx
+	jae	L(48bytesormore)
+
+L(fwd_write_less32bytes):
+#ifndef USE_AS_MEMMOVE
+	cmp	%dl, %al
+	jb	L(bk_write)
+#endif
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+#ifndef USE_AS_MEMMOVE
+L(bk_write):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+#endif
+
+	ALIGN (4)
+/* ECX > 32 and EDX is 4 byte aligned.  */
+L(48bytesormore):
+	movdqu	(%eax), %xmm0
+	PUSH (%edi)
+	movl	%edx, %edi
+	and	$-16, %edx
+	PUSH (%esi)
+	cfi_remember_state
+	add	$16, %edx
+	movl	%edi, %esi
+	sub	%edx, %edi
+	add	%edi, %ecx
+	sub	%edi, %eax
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_shared_cache_size_half, %ecx
+# endif
+#endif
+
+	mov	%eax, %edi
+	jae	L(large_page)
+	and	$0xf, %edi
+	jz	L(shl_0)
+
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
+
+	ALIGN (4)
+L(shl_0):
+	movdqu	%xmm0, (%esi)
+	xor	%edi, %edi
+	cmp	$127, %ecx
+	ja	L(shl_0_gobble)
+	lea	-32(%ecx), %ecx
+L(shl_0_loop):
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+L(shl_0_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	add	%edi, %eax
+	POP (%esi)
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+L(shl_0_gobble):
+
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_data_cache_size_half@GOTOFF(%ebx), %edi
+# else
+	mov	__x86_data_cache_size_half, %edi
+# endif
+#endif
+	mov	%edi, %esi
+	shr	$3, %esi
+	sub	%esi, %edi
+	cmp	%edi, %ecx
+	jae	L(shl_0_gobble_mem_start)
+	sub	$128, %ecx
+	ALIGN (4)
+L(shl_0_gobble_cache_loop):
+	movdqa	(%eax), %xmm0
+	movaps	0x10(%eax), %xmm1
+	movaps	0x20(%eax), %xmm2
+	movaps	0x30(%eax), %xmm3
+	movaps	0x40(%eax), %xmm4
+	movaps	0x50(%eax), %xmm5
+	movaps	0x60(%eax), %xmm6
+	movaps	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movaps	%xmm1, 0x10(%edx)
+	movaps	%xmm2, 0x20(%edx)
+	movaps	%xmm3, 0x30(%edx)
+	movaps	%xmm4, 0x40(%edx)
+	movaps	%xmm5, 0x50(%edx)
+	movaps	%xmm6, 0x60(%edx)
+	movaps	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_cache_loop)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(shl_0_cache_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+L(shl_0_cache_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_cache_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+L(shl_0_cache_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_cache_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+L(shl_0_cache_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	POP (%esi)
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_0_gobble_mem_start):
+	cmp	%al, %dl
+	je	L(copy_page_by_rep)
+	sub	$128, %ecx
+L(shl_0_gobble_mem_loop):
+	prefetchnta 0x1c0(%eax)
+	prefetchnta 0x280(%eax)
+	prefetchnta 0x1c0(%edx)
+	prefetchnta 0x280(%edx)
+
+	movdqa	(%eax), %xmm0
+	movaps	0x10(%eax), %xmm1
+	movaps	0x20(%eax), %xmm2
+	movaps	0x30(%eax), %xmm3
+	movaps	0x40(%eax), %xmm4
+	movaps	0x50(%eax), %xmm5
+	movaps	0x60(%eax), %xmm6
+	movaps	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$0x80, %ecx
+	movdqa	%xmm0, (%edx)
+	movaps	%xmm1, 0x10(%edx)
+	movaps	%xmm2, 0x20(%edx)
+	movaps	%xmm3, 0x30(%edx)
+	movaps	%xmm4, 0x40(%edx)
+	movaps	%xmm5, 0x50(%edx)
+	movaps	%xmm6, 0x60(%edx)
+	movaps	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_mem_loop)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(shl_0_mem_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+L(shl_0_mem_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_mem_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+L(shl_0_mem_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_mem_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+L(shl_0_mem_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	POP (%esi)
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_1):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$1, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_1_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_1_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_1_loop)
+
+L(shl_1_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	1(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_2):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$2, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_2_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_2_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_2_loop)
+
+L(shl_2_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	2(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_3):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$3, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_3_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_3_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_3_loop)
+
+L(shl_3_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	3(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_4):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$4, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_4_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_4_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_4_loop)
+
+L(shl_4_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	4(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_5):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$5, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_5_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_5_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_5_loop)
+
+L(shl_5_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	5(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_6):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$6, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_6_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_6_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_6_loop)
+
+L(shl_6_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	6(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_7):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$7, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_7_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_7_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_7_loop)
+
+L(shl_7_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	7(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_8):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$8, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_8_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_8_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_8_loop)
+
+L(shl_8_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	8(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_9):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$9, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_9_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_9_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_9_loop)
+
+L(shl_9_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	9(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_10):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$10, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_10_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_10_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_10_loop)
+
+L(shl_10_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	10(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_11):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$11, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_11_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_11_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_11_loop)
+
+L(shl_11_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	11(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_12):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$12, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_12_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_12_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_12_loop)
+
+L(shl_12_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	12(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_13):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$13, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_13_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_13_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_13_loop)
+
+L(shl_13_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	13(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_14):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$14, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_14_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_14_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_14_loop)
+
+L(shl_14_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	14(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_15):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$15, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_15_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_15_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_15_loop)
+
+L(shl_15_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	15(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+
+	ALIGN (4)
+L(fwd_write_44bytes):
+	movl	-44(%eax), %ecx
+	movl	%ecx, -44(%edx)
+L(fwd_write_40bytes):
+	movl	-40(%eax), %ecx
+	movl	%ecx, -40(%edx)
+L(fwd_write_36bytes):
+	movl	-36(%eax), %ecx
+	movl	%ecx, -36(%edx)
+L(fwd_write_32bytes):
+	movl	-32(%eax), %ecx
+	movl	%ecx, -32(%edx)
+L(fwd_write_28bytes):
+	movl	-28(%eax), %ecx
+	movl	%ecx, -28(%edx)
+L(fwd_write_24bytes):
+	movl	-24(%eax), %ecx
+	movl	%ecx, -24(%edx)
+L(fwd_write_20bytes):
+	movl	-20(%eax), %ecx
+	movl	%ecx, -20(%edx)
+L(fwd_write_16bytes):
+	movl	-16(%eax), %ecx
+	movl	%ecx, -16(%edx)
+L(fwd_write_12bytes):
+	movl	-12(%eax), %ecx
+	movl	%ecx, -12(%edx)
+L(fwd_write_8bytes):
+	movl	-8(%eax), %ecx
+	movl	%ecx, -8(%edx)
+L(fwd_write_4bytes):
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+L(fwd_write_0bytes):
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_5bytes):
+	movl	-5(%eax), %ecx
+	movl	-4(%eax), %eax
+	movl	%ecx, -5(%edx)
+	movl	%eax, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_45bytes):
+	movl	-45(%eax), %ecx
+	movl	%ecx, -45(%edx)
+L(fwd_write_41bytes):
+	movl	-41(%eax), %ecx
+	movl	%ecx, -41(%edx)
+L(fwd_write_37bytes):
+	movl	-37(%eax), %ecx
+	movl	%ecx, -37(%edx)
+L(fwd_write_33bytes):
+	movl	-33(%eax), %ecx
+	movl	%ecx, -33(%edx)
+L(fwd_write_29bytes):
+	movl	-29(%eax), %ecx
+	movl	%ecx, -29(%edx)
+L(fwd_write_25bytes):
+	movl	-25(%eax), %ecx
+	movl	%ecx, -25(%edx)
+L(fwd_write_21bytes):
+	movl	-21(%eax), %ecx
+	movl	%ecx, -21(%edx)
+L(fwd_write_17bytes):
+	movl	-17(%eax), %ecx
+	movl	%ecx, -17(%edx)
+L(fwd_write_13bytes):
+	movl	-13(%eax), %ecx
+	movl	%ecx, -13(%edx)
+L(fwd_write_9bytes):
+	movl	-9(%eax), %ecx
+	movl	%ecx, -9(%edx)
+	movl	-5(%eax), %ecx
+	movl	%ecx, -5(%edx)
+L(fwd_write_1bytes):
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_46bytes):
+	movl	-46(%eax), %ecx
+	movl	%ecx, -46(%edx)
+L(fwd_write_42bytes):
+	movl	-42(%eax), %ecx
+	movl	%ecx, -42(%edx)
+L(fwd_write_38bytes):
+	movl	-38(%eax), %ecx
+	movl	%ecx, -38(%edx)
+L(fwd_write_34bytes):
+	movl	-34(%eax), %ecx
+	movl	%ecx, -34(%edx)
+L(fwd_write_30bytes):
+	movl	-30(%eax), %ecx
+	movl	%ecx, -30(%edx)
+L(fwd_write_26bytes):
+	movl	-26(%eax), %ecx
+	movl	%ecx, -26(%edx)
+L(fwd_write_22bytes):
+	movl	-22(%eax), %ecx
+	movl	%ecx, -22(%edx)
+L(fwd_write_18bytes):
+	movl	-18(%eax), %ecx
+	movl	%ecx, -18(%edx)
+L(fwd_write_14bytes):
+	movl	-14(%eax), %ecx
+	movl	%ecx, -14(%edx)
+L(fwd_write_10bytes):
+	movl	-10(%eax), %ecx
+	movl	%ecx, -10(%edx)
+L(fwd_write_6bytes):
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+L(fwd_write_2bytes):
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_47bytes):
+	movl	-47(%eax), %ecx
+	movl	%ecx, -47(%edx)
+L(fwd_write_43bytes):
+	movl	-43(%eax), %ecx
+	movl	%ecx, -43(%edx)
+L(fwd_write_39bytes):
+	movl	-39(%eax), %ecx
+	movl	%ecx, -39(%edx)
+L(fwd_write_35bytes):
+	movl	-35(%eax), %ecx
+	movl	%ecx, -35(%edx)
+L(fwd_write_31bytes):
+	movl	-31(%eax), %ecx
+	movl	%ecx, -31(%edx)
+L(fwd_write_27bytes):
+	movl	-27(%eax), %ecx
+	movl	%ecx, -27(%edx)
+L(fwd_write_23bytes):
+	movl	-23(%eax), %ecx
+	movl	%ecx, -23(%edx)
+L(fwd_write_19bytes):
+	movl	-19(%eax), %ecx
+	movl	%ecx, -19(%edx)
+L(fwd_write_15bytes):
+	movl	-15(%eax), %ecx
+	movl	%ecx, -15(%edx)
+L(fwd_write_11bytes):
+	movl	-11(%eax), %ecx
+	movl	%ecx, -11(%edx)
+L(fwd_write_7bytes):
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+L(fwd_write_3bytes):
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN_END
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(large_page):
+	movdqu	(%eax), %xmm1
+	movdqu	%xmm0, (%esi)
+	movntdq	%xmm1, (%edx)
+	add	$0x10, %eax
+	add	$0x10, %edx
+	sub	$0x10, %ecx
+	cmp	%al, %dl
+	je	L(copy_page_by_rep)
+L(large_page_loop_init):
+	POP (%esi)
+	sub	$0x80, %ecx
+	POP (%edi)
+L(large_page_loop):
+	prefetchnta	0x1c0(%eax)
+	prefetchnta	0x280(%eax)
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	movdqu	0x40(%eax), %xmm4
+	movdqu	0x50(%eax), %xmm5
+	movdqu	0x60(%eax), %xmm6
+	movdqu	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	lfence
+	sub	$0x80, %ecx
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	movntdq	%xmm4, 0x40(%edx)
+	movntdq	%xmm5, 0x50(%edx)
+	movntdq	%xmm6, 0x60(%edx)
+	movntdq	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+	jae	L(large_page_loop)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(large_page_less_64bytes)
+
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	lea	0x40(%eax), %eax
+
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	lea	0x40(%edx), %edx
+	sub	$0x40, %ecx
+L(large_page_less_64bytes):
+	cmp	$32, %ecx
+	jb	L(large_page_less_32bytes)
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	lea	0x20(%eax), %eax
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	lea	0x20(%edx), %edx
+	sub	$0x20, %ecx
+L(large_page_less_32bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	sfence
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(copy_page_by_rep):
+	mov	%eax, %esi
+	mov	%edx, %edi
+	mov	%ecx, %edx
+	shr	$2, %ecx
+	and	$3, %edx
+	rep	movsl
+	jz	L(copy_page_by_rep_exit)
+	cmp	$2, %edx
+	jb	L(copy_page_by_rep_left_1)
+	movzwl	(%esi), %eax
+	movw	%ax, (%edi)
+	add	$2, %esi
+	add	$2, %edi
+	sub	$2, %edx
+	jz	L(copy_page_by_rep_exit)
+L(copy_page_by_rep_left_1):
+	movzbl	(%esi), %eax
+	movb	%al, (%edi)
+L(copy_page_by_rep_exit):
+	POP (%esi)
+	POP (%edi)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_44bytes):
+	movl	40(%eax), %ecx
+	movl	%ecx, 40(%edx)
+L(bk_write_40bytes):
+	movl	36(%eax), %ecx
+	movl	%ecx, 36(%edx)
+L(bk_write_36bytes):
+	movl	32(%eax), %ecx
+	movl	%ecx, 32(%edx)
+L(bk_write_32bytes):
+	movl	28(%eax), %ecx
+	movl	%ecx, 28(%edx)
+L(bk_write_28bytes):
+	movl	24(%eax), %ecx
+	movl	%ecx, 24(%edx)
+L(bk_write_24bytes):
+	movl	20(%eax), %ecx
+	movl	%ecx, 20(%edx)
+L(bk_write_20bytes):
+	movl	16(%eax), %ecx
+	movl	%ecx, 16(%edx)
+L(bk_write_16bytes):
+	movl	12(%eax), %ecx
+	movl	%ecx, 12(%edx)
+L(bk_write_12bytes):
+	movl	8(%eax), %ecx
+	movl	%ecx, 8(%edx)
+L(bk_write_8bytes):
+	movl	4(%eax), %ecx
+	movl	%ecx, 4(%edx)
+L(bk_write_4bytes):
+	movl	(%eax), %ecx
+	movl	%ecx, (%edx)
+L(bk_write_0bytes):
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_45bytes):
+	movl	41(%eax), %ecx
+	movl	%ecx, 41(%edx)
+L(bk_write_41bytes):
+	movl	37(%eax), %ecx
+	movl	%ecx, 37(%edx)
+L(bk_write_37bytes):
+	movl	33(%eax), %ecx
+	movl	%ecx, 33(%edx)
+L(bk_write_33bytes):
+	movl	29(%eax), %ecx
+	movl	%ecx, 29(%edx)
+L(bk_write_29bytes):
+	movl	25(%eax), %ecx
+	movl	%ecx, 25(%edx)
+L(bk_write_25bytes):
+	movl	21(%eax), %ecx
+	movl	%ecx, 21(%edx)
+L(bk_write_21bytes):
+	movl	17(%eax), %ecx
+	movl	%ecx, 17(%edx)
+L(bk_write_17bytes):
+	movl	13(%eax), %ecx
+	movl	%ecx, 13(%edx)
+L(bk_write_13bytes):
+	movl	9(%eax), %ecx
+	movl	%ecx, 9(%edx)
+L(bk_write_9bytes):
+	movl	5(%eax), %ecx
+	movl	%ecx, 5(%edx)
+L(bk_write_5bytes):
+	movl	1(%eax), %ecx
+	movl	%ecx, 1(%edx)
+L(bk_write_1bytes):
+	movzbl	(%eax), %ecx
+	movb	%cl, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_46bytes):
+	movl	42(%eax), %ecx
+	movl	%ecx, 42(%edx)
+L(bk_write_42bytes):
+	movl	38(%eax), %ecx
+	movl	%ecx, 38(%edx)
+L(bk_write_38bytes):
+	movl	34(%eax), %ecx
+	movl	%ecx, 34(%edx)
+L(bk_write_34bytes):
+	movl	30(%eax), %ecx
+	movl	%ecx, 30(%edx)
+L(bk_write_30bytes):
+	movl	26(%eax), %ecx
+	movl	%ecx, 26(%edx)
+L(bk_write_26bytes):
+	movl	22(%eax), %ecx
+	movl	%ecx, 22(%edx)
+L(bk_write_22bytes):
+	movl	18(%eax), %ecx
+	movl	%ecx, 18(%edx)
+L(bk_write_18bytes):
+	movl	14(%eax), %ecx
+	movl	%ecx, 14(%edx)
+L(bk_write_14bytes):
+	movl	10(%eax), %ecx
+	movl	%ecx, 10(%edx)
+L(bk_write_10bytes):
+	movl	6(%eax), %ecx
+	movl	%ecx, 6(%edx)
+L(bk_write_6bytes):
+	movl	2(%eax), %ecx
+	movl	%ecx, 2(%edx)
+L(bk_write_2bytes):
+	movzwl	(%eax), %ecx
+	movw	%cx, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_47bytes):
+	movl	43(%eax), %ecx
+	movl	%ecx, 43(%edx)
+L(bk_write_43bytes):
+	movl	39(%eax), %ecx
+	movl	%ecx, 39(%edx)
+L(bk_write_39bytes):
+	movl	35(%eax), %ecx
+	movl	%ecx, 35(%edx)
+L(bk_write_35bytes):
+	movl	31(%eax), %ecx
+	movl	%ecx, 31(%edx)
+L(bk_write_31bytes):
+	movl	27(%eax), %ecx
+	movl	%ecx, 27(%edx)
+L(bk_write_27bytes):
+	movl	23(%eax), %ecx
+	movl	%ecx, 23(%edx)
+L(bk_write_23bytes):
+	movl	19(%eax), %ecx
+	movl	%ecx, 19(%edx)
+L(bk_write_19bytes):
+	movl	15(%eax), %ecx
+	movl	%ecx, 15(%edx)
+L(bk_write_15bytes):
+	movl	11(%eax), %ecx
+	movl	%ecx, 11(%edx)
+L(bk_write_11bytes):
+	movl	7(%eax), %ecx
+	movl	%ecx, 7(%edx)
+L(bk_write_7bytes):
+	movl	3(%eax), %ecx
+	movl	%ecx, 3(%edx)
+L(bk_write_3bytes):
+	movzwl	1(%eax), %ecx
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax
+	movb	%al, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN_END
+
+
+	.pushsection .rodata.ssse3,"a",@progbits
+	ALIGN (2)
+L(table_48bytes_fwd):
+	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
+	ALIGN (2)
+L(shl_table):
+	.int	JMPTBL (L(shl_0), L(shl_table))
+	.int	JMPTBL (L(shl_1), L(shl_table))
+	.int	JMPTBL (L(shl_2), L(shl_table))
+	.int	JMPTBL (L(shl_3), L(shl_table))
+	.int	JMPTBL (L(shl_4), L(shl_table))
+	.int	JMPTBL (L(shl_5), L(shl_table))
+	.int	JMPTBL (L(shl_6), L(shl_table))
+	.int	JMPTBL (L(shl_7), L(shl_table))
+	.int	JMPTBL (L(shl_8), L(shl_table))
+	.int	JMPTBL (L(shl_9), L(shl_table))
+	.int	JMPTBL (L(shl_10), L(shl_table))
+	.int	JMPTBL (L(shl_11), L(shl_table))
+	.int	JMPTBL (L(shl_12), L(shl_table))
+	.int	JMPTBL (L(shl_13), L(shl_table))
+	.int	JMPTBL (L(shl_14), L(shl_table))
+	.int	JMPTBL (L(shl_15), L(shl_table))
+
+	ALIGN (2)
+L(table_48_bytes_bwd):
+	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+	.popsection
+
+#ifdef USE_AS_MEMMOVE
+	ALIGN (4)
+L(copy_backward):
+	PUSH (%esi)
+	movl	%eax, %esi
+	add	%ecx, %edx
+	add	%ecx, %esi
+	testl	$0x3, %edx
+	jnz	L(bk_align)
+
+L(bk_aligned_4):
+	cmp	$64, %ecx
+	jae	L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+	cmp	$32, %ecx
+	jb	L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+	/* Copy 32 bytes at a time.  */
+	sub	$32, %ecx
+	movl	-4(%esi), %eax
+	movl	%eax, -4(%edx)
+	movl	-8(%esi), %eax
+	movl	%eax, -8(%edx)
+	movl	-12(%esi), %eax
+	movl	%eax, -12(%edx)
+	movl	-16(%esi), %eax
+	movl	%eax, -16(%edx)
+	movl	-20(%esi), %eax
+	movl	%eax, -20(%edx)
+	movl	-24(%esi), %eax
+	movl	%eax, -24(%edx)
+	movl	-28(%esi), %eax
+	movl	%eax, -28(%edx)
+	movl	-32(%esi), %eax
+	movl	%eax, -32(%edx)
+	sub	$32, %edx
+	sub	$32, %esi
+
+L(bk_write_less32bytes):
+	movl	%esi, %eax
+	sub	%ecx, %edx
+	sub	%ecx, %eax
+	POP (%esi)
+L(bk_write_less48bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+
+	CFI_PUSH (%esi)
+	ALIGN (4)
+L(bk_align):
+	cmp	$8, %ecx
+	jbe	L(bk_write_less32bytes)
+	testl	$1, %edx
+	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
+	   then (EDX & 2) must be != 0.  */
+	jz	L(bk_got2)
+	sub	$1, %esi
+	sub	$1, %ecx
+	sub	$1, %edx
+	movzbl	(%esi), %eax
+	movb	%al, (%edx)
+
+	testl	$2, %edx
+	jz	L(bk_aligned_4)
+
+L(bk_got2):
+	sub	$2, %esi
+	sub	$2, %ecx
+	sub	$2, %edx
+	movzwl	(%esi), %eax
+	movw	%ax, (%edx)
+	jmp	L(bk_aligned_4)
+
+	ALIGN (4)
+L(bk_write_more64bytes):
+	/* Check alignment of last byte.  */
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+/* EDX is aligned 4 bytes, but not 16 bytes.  */
+L(bk_ssse3_align):
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+	cmp	$64, %ecx
+	jb	L(bk_write_more32bytes)
+
+L(bk_ssse3_cpy):
+	sub	$64, %esi
+	sub	$64, %ecx
+	sub	$64, %edx
+	movdqu	0x30(%esi), %xmm3
+	movdqa	%xmm3, 0x30(%edx)
+	movdqu	0x20(%esi), %xmm2
+	movdqa	%xmm2, 0x20(%edx)
+	movdqu	0x10(%esi), %xmm1
+	movdqa	%xmm1, 0x10(%edx)
+	movdqu	(%esi), %xmm0
+	movdqa	%xmm0, (%edx)
+	cmp	$64, %ecx
+	jae	L(bk_ssse3_cpy)
+	jmp	L(bk_write_64bytesless)
+
+#endif
+
+END (MEMCPY)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
new file mode 100644
index 0000000000..53e8a6ca1d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
@@ -0,0 +1,3162 @@
+/* memcpy with SSSE3
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+#  define MEMCPY		__memcpy_ssse3
+#  define MEMCPY_CHK	__memcpy_chk_ssse3
+# endif
+
+# ifdef USE_AS_BCOPY
+#  define SRC		PARMS
+#  define DEST		SRC+4
+#  define LEN		DEST+4
+# else
+#  define DEST		PARMS
+#  define SRC		DEST+4
+#  define LEN		SRC+4
+# endif
+
+# define CFI_PUSH(REG)		\
+  cfi_adjust_cfa_offset (4);		\
+  cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)		\
+  cfi_adjust_cfa_offset (-4);		\
+  cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# ifdef SHARED
+#  define PARMS		8		/* Preserve EBX.  */
+#  define ENTRANCE	PUSH (%ebx);
+#  define RETURN_END	POP (%ebx); ret
+#  define RETURN		RETURN_END; CFI_PUSH (%ebx)
+#  define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+	jump table with relative offsets.  INDEX is a register contains the
+	index into the jump table.   SCALE is the scale of INDEX. */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    /* We first load PC into EBX.  */		\
+	SETUP_PIC_REG(bx);		\
+    /* Get the address of the jump table.  */		\
+	addl	$(TABLE - .), %ebx;		\
+    /* Get the entry and convert the relative offset to the		\
+	absolute	address.  */		\
+	addl	(%ebx, INDEX, SCALE), %ebx;		\
+    /* We loaded the jump table.  Go.  */		\
+	jmp	*%ebx
+# else
+
+#  define PARMS		4
+#  define ENTRANCE
+#  define RETURN_END	ret
+#  define RETURN		RETURN_END
+#  define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+	absolute offsets.  INDEX is a register contains the index into the
+	jump table.  SCALE is the scale of INDEX. */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+	jmp	*TABLE(, INDEX, SCALE)
+# endif
+
+	.section .text.ssse3,"ax",@progbits
+# if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+# endif
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+
+# ifdef USE_AS_MEMMOVE
+	cmp	%eax, %edx
+	jb	L(copy_forward)
+	je	L(fwd_write_0bytes)
+	cmp	$32, %ecx
+	jae	L(memmove_bwd)
+	jmp	L(bk_write_less32bytes_2)
+
+	.p2align 4
+L(memmove_bwd):
+	add	%ecx, %eax
+	cmp	%eax, %edx
+	movl	SRC(%esp), %eax
+	jb	L(copy_backward)
+
+L(copy_forward):
+# endif
+	cmp	$48, %ecx
+	jae	L(48bytesormore)
+
+L(fwd_write_less32bytes):
+# ifndef USE_AS_MEMMOVE
+	cmp	%dl, %al
+	jb	L(bk_write)
+# endif
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+# ifndef USE_AS_MEMMOVE
+	.p2align 4
+L(bk_write):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+# endif
+
+	.p2align 4
+L(48bytesormore):
+# ifndef USE_AS_MEMMOVE
+	movlpd	(%eax), %xmm0
+	movlpd	8(%eax), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+# else
+	movdqu	(%eax), %xmm0
+# endif
+	PUSH (%edi)
+	movl	%edx, %edi
+	and	$-16, %edx
+	add	$16, %edx
+	sub	%edx, %edi
+	add	%edi, %ecx
+	sub	%edi, %eax
+
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_shared_cache_size_half, %ecx
+#  endif
+# endif
+
+	mov	%eax, %edi
+	jae	L(large_page)
+	and	$0xf, %edi
+	jz	L(shl_0)
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
+
+	.p2align 4
+L(shl_0):
+# ifdef USE_AS_MEMMOVE
+	movl	DEST+4(%esp), %edi
+	movdqu	%xmm0, (%edi)
+# endif
+	xor	%edi, %edi
+	cmp	$127, %ecx
+	ja	L(shl_0_gobble)
+	lea	-32(%ecx), %ecx
+
+	.p2align 4
+L(shl_0_loop):
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+
+L(shl_0_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	add	%edi, %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_0_gobble):
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	POP	(%edi)
+	lea	-128(%ecx), %ecx
+	jae	L(shl_0_gobble_mem_loop)
+
+	.p2align 4
+L(shl_0_gobble_cache_loop):
+	movdqa	(%eax), %xmm0
+	movdqa	0x10(%eax), %xmm1
+	movdqa	0x20(%eax), %xmm2
+	movdqa	0x30(%eax), %xmm3
+	movdqa	0x40(%eax), %xmm4
+	movdqa	0x50(%eax), %xmm5
+	movdqa	0x60(%eax), %xmm6
+	movdqa	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	%xmm2, 0x20(%edx)
+	movdqa	%xmm3, 0x30(%edx)
+	movdqa	%xmm4, 0x40(%edx)
+	movdqa	%xmm5, 0x50(%edx)
+	movdqa	%xmm6, 0x60(%edx)
+	movdqa	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_cache_loop)
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx
+	jl	L(shl_0_cache_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+
+L(shl_0_cache_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_cache_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+
+L(shl_0_cache_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_cache_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+
+L(shl_0_cache_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	.p2align 4
+L(shl_0_gobble_mem_loop):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x280(%eax)
+	prefetcht0 0x1c0(%edx)
+
+	movdqa	(%eax), %xmm0
+	movdqa	0x10(%eax), %xmm1
+	movdqa	0x20(%eax), %xmm2
+	movdqa	0x30(%eax), %xmm3
+	movdqa	0x40(%eax), %xmm4
+	movdqa	0x50(%eax), %xmm5
+	movdqa	0x60(%eax), %xmm6
+	movdqa	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$0x80, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	%xmm2, 0x20(%edx)
+	movdqa	%xmm3, 0x30(%edx)
+	movdqa	%xmm4, 0x40(%edx)
+	movdqa	%xmm5, 0x50(%edx)
+	movdqa	%xmm6, 0x60(%edx)
+	movdqa	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_mem_loop)
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx
+	jl	L(shl_0_mem_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+
+L(shl_0_mem_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_mem_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+
+L(shl_0_mem_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_mem_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+
+L(shl_0_mem_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
+
+	.p2align 4
+L(shl_1):
+# ifndef USE_AS_MEMMOVE
+	movaps	-1(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-1(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_1_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl1LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	15(%eax), %xmm2
+	movaps	31(%eax), %xmm3
+	movaps	47(%eax), %xmm4
+	movaps	63(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$1, %xmm4, %xmm5
+	palignr	$1, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl1LoopStart)
+
+L(Shl1LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	15(%eax), %xmm2
+	movaps	31(%eax), %xmm3
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_1_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-1(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_1_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_1_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_1_no_prefetch_loop)
+
+L(sh_1_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	1(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_2):
+# ifndef USE_AS_MEMMOVE
+	movaps	-2(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-2(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_2_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl2LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	14(%eax), %xmm2
+	movaps	30(%eax), %xmm3
+	movaps	46(%eax), %xmm4
+	movaps	62(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$2, %xmm4, %xmm5
+	palignr	$2, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl2LoopStart)
+
+L(Shl2LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	14(%eax), %xmm2
+	movaps	30(%eax), %xmm3
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_2_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-2(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_2_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_2_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_2_no_prefetch_loop)
+
+L(sh_2_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	2(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_3):
+# ifndef USE_AS_MEMMOVE
+	movaps	-3(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-3(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_3_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl3LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	13(%eax), %xmm2
+	movaps	29(%eax), %xmm3
+	movaps	45(%eax), %xmm4
+	movaps	61(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$3, %xmm4, %xmm5
+	palignr	$3, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$3, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl3LoopStart)
+
+L(Shl3LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	13(%eax), %xmm2
+	movaps	29(%eax), %xmm3
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_3_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-3(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_3_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(sh_3_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(sh_3_no_prefetch_loop)
+
+L(sh_3_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	3(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_4):
+# ifndef USE_AS_MEMMOVE
+	movaps	-4(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-4(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_4_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl4LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	12(%eax), %xmm2
+	movaps	28(%eax), %xmm3
+	movaps	44(%eax), %xmm4
+	movaps	60(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	palignr	$4, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl4LoopStart)
+
+L(Shl4LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	12(%eax), %xmm2
+	movaps	28(%eax), %xmm3
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_4_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-4(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_4_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(sh_4_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(sh_4_no_prefetch_loop)
+
+L(sh_4_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	4(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_5):
+# ifndef USE_AS_MEMMOVE
+	movaps	-5(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-5(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_5_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl5LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	11(%eax), %xmm2
+	movaps	27(%eax), %xmm3
+	movaps	43(%eax), %xmm4
+	movaps	59(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$5, %xmm4, %xmm5
+	palignr	$5, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$5, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl5LoopStart)
+
+L(Shl5LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	11(%eax), %xmm2
+	movaps	27(%eax), %xmm3
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_5_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-5(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_5_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(sh_5_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(sh_5_no_prefetch_loop)
+
+L(sh_5_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	5(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_6):
+# ifndef USE_AS_MEMMOVE
+	movaps	-6(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-6(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_6_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl6LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	10(%eax), %xmm2
+	movaps	26(%eax), %xmm3
+	movaps	42(%eax), %xmm4
+	movaps	58(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$6, %xmm4, %xmm5
+	palignr	$6, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$6, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl6LoopStart)
+
+L(Shl6LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	10(%eax), %xmm2
+	movaps	26(%eax), %xmm3
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_6_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-6(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_6_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(sh_6_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(sh_6_no_prefetch_loop)
+
+L(sh_6_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	6(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_7):
+# ifndef USE_AS_MEMMOVE
+	movaps	-7(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-7(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_7_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl7LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	9(%eax), %xmm2
+	movaps	25(%eax), %xmm3
+	movaps	41(%eax), %xmm4
+	movaps	57(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$7, %xmm4, %xmm5
+	palignr	$7, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$7, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl7LoopStart)
+
+L(Shl7LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	9(%eax), %xmm2
+	movaps	25(%eax), %xmm3
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_7_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-7(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_7_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_7_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_7_no_prefetch_loop)
+
+L(sh_7_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	7(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_8):
+# ifndef USE_AS_MEMMOVE
+	movaps	-8(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-8(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_8_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl8LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	8(%eax), %xmm2
+	movaps	24(%eax), %xmm3
+	movaps	40(%eax), %xmm4
+	movaps	56(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	palignr	$8, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl8LoopStart)
+
+L(LoopLeave8):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	8(%eax), %xmm2
+	movaps	24(%eax), %xmm3
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_8_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-8(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_8_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_8_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_8_no_prefetch_loop)
+
+L(sh_8_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	8(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_9):
+# ifndef USE_AS_MEMMOVE
+	movaps	-9(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-9(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_9_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl9LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	7(%eax), %xmm2
+	movaps	23(%eax), %xmm3
+	movaps	39(%eax), %xmm4
+	movaps	55(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$9, %xmm4, %xmm5
+	palignr	$9, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$9, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl9LoopStart)
+
+L(Shl9LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	7(%eax), %xmm2
+	movaps	23(%eax), %xmm3
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_9_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-9(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_9_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_9_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_9_no_prefetch_loop)
+
+L(sh_9_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	9(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_10):
+# ifndef USE_AS_MEMMOVE
+	movaps	-10(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-10(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_10_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl10LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	6(%eax), %xmm2
+	movaps	22(%eax), %xmm3
+	movaps	38(%eax), %xmm4
+	movaps	54(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$10, %xmm4, %xmm5
+	palignr	$10, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$10, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl10LoopStart)
+
+L(Shl10LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	6(%eax), %xmm2
+	movaps	22(%eax), %xmm3
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_10_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-10(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_10_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_10_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_10_no_prefetch_loop)
+
+L(sh_10_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	10(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_11):
+# ifndef USE_AS_MEMMOVE
+	movaps	-11(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-11(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_11_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl11LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	5(%eax), %xmm2
+	movaps	21(%eax), %xmm3
+	movaps	37(%eax), %xmm4
+	movaps	53(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$11, %xmm4, %xmm5
+	palignr	$11, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$11, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl11LoopStart)
+
+L(Shl11LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	5(%eax), %xmm2
+	movaps	21(%eax), %xmm3
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_11_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-11(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_11_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_11_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_11_no_prefetch_loop)
+
+L(sh_11_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	11(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_12):
+# ifndef USE_AS_MEMMOVE
+	movaps	-12(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-12(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_12_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl12LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	4(%eax), %xmm2
+	movaps	20(%eax), %xmm3
+	movaps	36(%eax), %xmm4
+	movaps	52(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	palignr	$12, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl12LoopStart)
+
+L(Shl12LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	4(%eax), %xmm2
+	movaps	20(%eax), %xmm3
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_12_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-12(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_12_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_12_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_12_no_prefetch_loop)
+
+L(sh_12_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	12(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_13):
+# ifndef USE_AS_MEMMOVE
+	movaps	-13(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-13(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_13_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl13LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	3(%eax), %xmm2
+	movaps	19(%eax), %xmm3
+	movaps	35(%eax), %xmm4
+	movaps	51(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$13, %xmm4, %xmm5
+	palignr	$13, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$13, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl13LoopStart)
+
+L(Shl13LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	3(%eax), %xmm2
+	movaps	19(%eax), %xmm3
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_13_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-13(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_13_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_13_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_13_no_prefetch_loop)
+
+L(sh_13_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	13(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_14):
+# ifndef USE_AS_MEMMOVE
+	movaps	-14(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-14(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_14_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl14LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	2(%eax), %xmm2
+	movaps	18(%eax), %xmm3
+	movaps	34(%eax), %xmm4
+	movaps	50(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$14, %xmm4, %xmm5
+	palignr	$14, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$14, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl14LoopStart)
+
+L(Shl14LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	2(%eax), %xmm2
+	movaps	18(%eax), %xmm3
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_14_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-14(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_14_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_14_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_14_no_prefetch_loop)
+
+L(sh_14_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	14(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_15):
+# ifndef USE_AS_MEMMOVE
+	movaps	-15(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-15(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_15_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl15LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	1(%eax), %xmm2
+	movaps	17(%eax), %xmm3
+	movaps	33(%eax), %xmm4
+	movaps	49(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$15, %xmm4, %xmm5
+	palignr	$15, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$15, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl15LoopStart)
+
+L(Shl15LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	1(%eax), %xmm2
+	movaps	17(%eax), %xmm3
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_15_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-15(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_15_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_15_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_15_no_prefetch_loop)
+
+L(sh_15_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	15(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_end_0):
+	lea	32(%ecx), %ecx
+	lea	(%edx, %ecx), %edx
+	lea	(%eax, %ecx), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	.p2align 4
+L(fwd_write_44bytes):
+	movq	-44(%eax), %xmm0
+	movq	%xmm0, -44(%edx)
+L(fwd_write_36bytes):
+	movq	-36(%eax), %xmm0
+	movq	%xmm0, -36(%edx)
+L(fwd_write_28bytes):
+	movq	-28(%eax), %xmm0
+	movq	%xmm0, -28(%edx)
+L(fwd_write_20bytes):
+	movq	-20(%eax), %xmm0
+	movq	%xmm0, -20(%edx)
+L(fwd_write_12bytes):
+	movq	-12(%eax), %xmm0
+	movq	%xmm0, -12(%edx)
+L(fwd_write_4bytes):
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_40bytes):
+	movq	-40(%eax), %xmm0
+	movq	%xmm0, -40(%edx)
+L(fwd_write_32bytes):
+	movq	-32(%eax), %xmm0
+	movq	%xmm0, -32(%edx)
+L(fwd_write_24bytes):
+	movq	-24(%eax), %xmm0
+	movq	%xmm0, -24(%edx)
+L(fwd_write_16bytes):
+	movq	-16(%eax), %xmm0
+	movq	%xmm0, -16(%edx)
+L(fwd_write_8bytes):
+	movq	-8(%eax), %xmm0
+	movq	%xmm0, -8(%edx)
+L(fwd_write_0bytes):
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_5bytes):
+	movl	-5(%eax), %ecx
+	movl	-4(%eax), %eax
+	movl	%ecx, -5(%edx)
+	movl	%eax, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_45bytes):
+	movq	-45(%eax), %xmm0
+	movq	%xmm0, -45(%edx)
+L(fwd_write_37bytes):
+	movq	-37(%eax), %xmm0
+	movq	%xmm0, -37(%edx)
+L(fwd_write_29bytes):
+	movq	-29(%eax), %xmm0
+	movq	%xmm0, -29(%edx)
+L(fwd_write_21bytes):
+	movq	-21(%eax), %xmm0
+	movq	%xmm0, -21(%edx)
+L(fwd_write_13bytes):
+	movq	-13(%eax), %xmm0
+	movq	%xmm0, -13(%edx)
+	movl	-5(%eax), %ecx
+	movl	%ecx, -5(%edx)
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_41bytes):
+	movq	-41(%eax), %xmm0
+	movq	%xmm0, -41(%edx)
+L(fwd_write_33bytes):
+	movq	-33(%eax), %xmm0
+	movq	%xmm0, -33(%edx)
+L(fwd_write_25bytes):
+	movq	-25(%eax), %xmm0
+	movq	%xmm0, -25(%edx)
+L(fwd_write_17bytes):
+	movq	-17(%eax), %xmm0
+	movq	%xmm0, -17(%edx)
+L(fwd_write_9bytes):
+	movq	-9(%eax), %xmm0
+	movq	%xmm0, -9(%edx)
+L(fwd_write_1bytes):
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_46bytes):
+	movq	-46(%eax), %xmm0
+	movq	%xmm0, -46(%edx)
+L(fwd_write_38bytes):
+	movq	-38(%eax), %xmm0
+	movq	%xmm0, -38(%edx)
+L(fwd_write_30bytes):
+	movq	-30(%eax), %xmm0
+	movq	%xmm0, -30(%edx)
+L(fwd_write_22bytes):
+	movq	-22(%eax), %xmm0
+	movq	%xmm0, -22(%edx)
+L(fwd_write_14bytes):
+	movq	-14(%eax), %xmm0
+	movq	%xmm0, -14(%edx)
+L(fwd_write_6bytes):
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_42bytes):
+	movq	-42(%eax), %xmm0
+	movq	%xmm0, -42(%edx)
+L(fwd_write_34bytes):
+	movq	-34(%eax), %xmm0
+	movq	%xmm0, -34(%edx)
+L(fwd_write_26bytes):
+	movq	-26(%eax), %xmm0
+	movq	%xmm0, -26(%edx)
+L(fwd_write_18bytes):
+	movq	-18(%eax), %xmm0
+	movq	%xmm0, -18(%edx)
+L(fwd_write_10bytes):
+	movq	-10(%eax), %xmm0
+	movq	%xmm0, -10(%edx)
+L(fwd_write_2bytes):
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_47bytes):
+	movq	-47(%eax), %xmm0
+	movq	%xmm0, -47(%edx)
+L(fwd_write_39bytes):
+	movq	-39(%eax), %xmm0
+	movq	%xmm0, -39(%edx)
+L(fwd_write_31bytes):
+	movq	-31(%eax), %xmm0
+	movq	%xmm0, -31(%edx)
+L(fwd_write_23bytes):
+	movq	-23(%eax), %xmm0
+	movq	%xmm0, -23(%edx)
+L(fwd_write_15bytes):
+	movq	-15(%eax), %xmm0
+	movq	%xmm0, -15(%edx)
+L(fwd_write_7bytes):
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_43bytes):
+	movq	-43(%eax), %xmm0
+	movq	%xmm0, -43(%edx)
+L(fwd_write_35bytes):
+	movq	-35(%eax), %xmm0
+	movq	%xmm0, -35(%edx)
+L(fwd_write_27bytes):
+	movq	-27(%eax), %xmm0
+	movq	%xmm0, -27(%edx)
+L(fwd_write_19bytes):
+	movq	-19(%eax), %xmm0
+	movq	%xmm0, -19(%edx)
+L(fwd_write_11bytes):
+	movq	-11(%eax), %xmm0
+	movq	%xmm0, -11(%edx)
+L(fwd_write_3bytes):
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_40bytes_align):
+	movdqa	-40(%eax), %xmm0
+	movdqa	%xmm0, -40(%edx)
+L(fwd_write_24bytes_align):
+	movdqa	-24(%eax), %xmm0
+	movdqa	%xmm0, -24(%edx)
+L(fwd_write_8bytes_align):
+	movq	-8(%eax), %xmm0
+	movq	%xmm0, -8(%edx)
+L(fwd_write_0bytes_align):
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_32bytes_align):
+	movdqa	-32(%eax), %xmm0
+	movdqa	%xmm0, -32(%edx)
+L(fwd_write_16bytes_align):
+	movdqa	-16(%eax), %xmm0
+	movdqa	%xmm0, -16(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_5bytes_align):
+	movl	-5(%eax), %ecx
+	movl	-4(%eax), %eax
+	movl	%ecx, -5(%edx)
+	movl	%eax, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_45bytes_align):
+	movdqa	-45(%eax), %xmm0
+	movdqa	%xmm0, -45(%edx)
+L(fwd_write_29bytes_align):
+	movdqa	-29(%eax), %xmm0
+	movdqa	%xmm0, -29(%edx)
+L(fwd_write_13bytes_align):
+	movq	-13(%eax), %xmm0
+	movq	%xmm0, -13(%edx)
+	movl	-5(%eax), %ecx
+	movl	%ecx, -5(%edx)
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_37bytes_align):
+	movdqa	-37(%eax), %xmm0
+	movdqa	%xmm0, -37(%edx)
+L(fwd_write_21bytes_align):
+	movdqa	-21(%eax), %xmm0
+	movdqa	%xmm0, -21(%edx)
+	movl	-5(%eax), %ecx
+	movl	%ecx, -5(%edx)
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_41bytes_align):
+	movdqa	-41(%eax), %xmm0
+	movdqa	%xmm0, -41(%edx)
+L(fwd_write_25bytes_align):
+	movdqa	-25(%eax), %xmm0
+	movdqa	%xmm0, -25(%edx)
+L(fwd_write_9bytes_align):
+	movq	-9(%eax), %xmm0
+	movq	%xmm0, -9(%edx)
+L(fwd_write_1bytes_align):
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_33bytes_align):
+	movdqa	-33(%eax), %xmm0
+	movdqa	%xmm0, -33(%edx)
+L(fwd_write_17bytes_align):
+	movdqa	-17(%eax), %xmm0
+	movdqa	%xmm0, -17(%edx)
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_46bytes_align):
+	movdqa	-46(%eax), %xmm0
+	movdqa	%xmm0, -46(%edx)
+L(fwd_write_30bytes_align):
+	movdqa	-30(%eax), %xmm0
+	movdqa	%xmm0, -30(%edx)
+L(fwd_write_14bytes_align):
+	movq	-14(%eax), %xmm0
+	movq	%xmm0, -14(%edx)
+L(fwd_write_6bytes_align):
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_38bytes_align):
+	movdqa	-38(%eax), %xmm0
+	movdqa	%xmm0, -38(%edx)
+L(fwd_write_22bytes_align):
+	movdqa	-22(%eax), %xmm0
+	movdqa	%xmm0, -22(%edx)
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_42bytes_align):
+	movdqa	-42(%eax), %xmm0
+	movdqa	%xmm0, -42(%edx)
+L(fwd_write_26bytes_align):
+	movdqa	-26(%eax), %xmm0
+	movdqa	%xmm0, -26(%edx)
+L(fwd_write_10bytes_align):
+	movq	-10(%eax), %xmm0
+	movq	%xmm0, -10(%edx)
+L(fwd_write_2bytes_align):
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_34bytes_align):
+	movdqa	-34(%eax), %xmm0
+	movdqa	%xmm0, -34(%edx)
+L(fwd_write_18bytes_align):
+	movdqa	-18(%eax), %xmm0
+	movdqa	%xmm0, -18(%edx)
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_47bytes_align):
+	movdqa	-47(%eax), %xmm0
+	movdqa	%xmm0, -47(%edx)
+L(fwd_write_31bytes_align):
+	movdqa	-31(%eax), %xmm0
+	movdqa	%xmm0, -31(%edx)
+L(fwd_write_15bytes_align):
+	movq	-15(%eax), %xmm0
+	movq	%xmm0, -15(%edx)
+L(fwd_write_7bytes_align):
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_39bytes_align):
+	movdqa	-39(%eax), %xmm0
+	movdqa	%xmm0, -39(%edx)
+L(fwd_write_23bytes_align):
+	movdqa	-23(%eax), %xmm0
+	movdqa	%xmm0, -23(%edx)
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_43bytes_align):
+	movdqa	-43(%eax), %xmm0
+	movdqa	%xmm0, -43(%edx)
+L(fwd_write_27bytes_align):
+	movdqa	-27(%eax), %xmm0
+	movdqa	%xmm0, -27(%edx)
+L(fwd_write_11bytes_align):
+	movq	-11(%eax), %xmm0
+	movq	%xmm0, -11(%edx)
+L(fwd_write_3bytes_align):
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_35bytes_align):
+	movdqa	-35(%eax), %xmm0
+	movdqa	%xmm0, -35(%edx)
+L(fwd_write_19bytes_align):
+	movdqa	-19(%eax), %xmm0
+	movdqa	%xmm0, -19(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_44bytes_align):
+	movdqa	-44(%eax), %xmm0
+	movdqa	%xmm0, -44(%edx)
+L(fwd_write_28bytes_align):
+	movdqa	-28(%eax), %xmm0
+	movdqa	%xmm0, -28(%edx)
+L(fwd_write_12bytes_align):
+	movq	-12(%eax), %xmm0
+	movq	%xmm0, -12(%edx)
+L(fwd_write_4bytes_align):
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(fwd_write_36bytes_align):
+	movdqa	-36(%eax), %xmm0
+	movdqa	%xmm0, -36(%edx)
+L(fwd_write_20bytes_align):
+	movdqa	-20(%eax), %xmm0
+	movdqa	%xmm0, -20(%edx)
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
+	RETURN_END
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(large_page):
+	movdqu	(%eax), %xmm1
+# ifdef USE_AS_MEMMOVE
+	movl	DEST+4(%esp), %edi
+	movdqu	%xmm0, (%edi)
+# endif
+	lea	16(%eax), %eax
+	movntdq	%xmm1, (%edx)
+	lea	16(%edx), %edx
+	lea	-0x90(%ecx), %ecx
+	POP (%edi)
+
+	.p2align 4
+L(large_page_loop):
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	movdqu	0x40(%eax), %xmm4
+	movdqu	0x50(%eax), %xmm5
+	movdqu	0x60(%eax), %xmm6
+	movdqu	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+
+	sub	$0x80, %ecx
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	movntdq	%xmm4, 0x40(%edx)
+	movntdq	%xmm5, 0x50(%edx)
+	movntdq	%xmm6, 0x60(%edx)
+	movntdq	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+	jae	L(large_page_loop)
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx
+	jl	L(large_page_less_64bytes)
+
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	lea	0x40(%eax), %eax
+
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	lea	0x40(%edx), %edx
+	sub	$0x40, %ecx
+L(large_page_less_64bytes):
+	cmp	$32, %ecx
+	jb	L(large_page_less_32bytes)
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	lea	0x20(%eax), %eax
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	lea	0x20(%edx), %edx
+	sub	$0x20, %ecx
+L(large_page_less_32bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	sfence
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	.p2align 4
+L(bk_write_44bytes):
+	movq	36(%eax), %xmm0
+	movq	%xmm0, 36(%edx)
+L(bk_write_36bytes):
+	movq	28(%eax), %xmm0
+	movq	%xmm0, 28(%edx)
+L(bk_write_28bytes):
+	movq	20(%eax), %xmm0
+	movq	%xmm0, 20(%edx)
+L(bk_write_20bytes):
+	movq	12(%eax), %xmm0
+	movq	%xmm0, 12(%edx)
+L(bk_write_12bytes):
+	movq	4(%eax), %xmm0
+	movq	%xmm0, 4(%edx)
+L(bk_write_4bytes):
+	movl	(%eax), %ecx
+	movl	%ecx, (%edx)
+L(bk_write_0bytes):
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_40bytes):
+	movq	32(%eax), %xmm0
+	movq	%xmm0, 32(%edx)
+L(bk_write_32bytes):
+	movq	24(%eax), %xmm0
+	movq	%xmm0, 24(%edx)
+L(bk_write_24bytes):
+	movq	16(%eax), %xmm0
+	movq	%xmm0, 16(%edx)
+L(bk_write_16bytes):
+	movq	8(%eax), %xmm0
+	movq	%xmm0, 8(%edx)
+L(bk_write_8bytes):
+	movq	(%eax), %xmm0
+	movq	%xmm0, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_45bytes):
+	movq	37(%eax), %xmm0
+	movq	%xmm0, 37(%edx)
+L(bk_write_37bytes):
+	movq	29(%eax), %xmm0
+	movq	%xmm0, 29(%edx)
+L(bk_write_29bytes):
+	movq	21(%eax), %xmm0
+	movq	%xmm0, 21(%edx)
+L(bk_write_21bytes):
+	movq	13(%eax), %xmm0
+	movq	%xmm0, 13(%edx)
+L(bk_write_13bytes):
+	movq	5(%eax), %xmm0
+	movq	%xmm0, 5(%edx)
+L(bk_write_5bytes):
+	movl	1(%eax), %ecx
+	movl	%ecx, 1(%edx)
+L(bk_write_1bytes):
+	movzbl	(%eax), %ecx
+	movb	%cl, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_41bytes):
+	movq	33(%eax), %xmm0
+	movq	%xmm0, 33(%edx)
+L(bk_write_33bytes):
+	movq	25(%eax), %xmm0
+	movq	%xmm0, 25(%edx)
+L(bk_write_25bytes):
+	movq	17(%eax), %xmm0
+	movq	%xmm0, 17(%edx)
+L(bk_write_17bytes):
+	movq	9(%eax), %xmm0
+	movq	%xmm0, 9(%edx)
+L(bk_write_9bytes):
+	movq	1(%eax), %xmm0
+	movq	%xmm0, 1(%edx)
+	movzbl	(%eax), %ecx
+	movb	%cl, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_46bytes):
+	movq	38(%eax), %xmm0
+	movq	%xmm0, 38(%edx)
+L(bk_write_38bytes):
+	movq	30(%eax), %xmm0
+	movq	%xmm0, 30(%edx)
+L(bk_write_30bytes):
+	movq	22(%eax), %xmm0
+	movq	%xmm0, 22(%edx)
+L(bk_write_22bytes):
+	movq	14(%eax), %xmm0
+	movq	%xmm0, 14(%edx)
+L(bk_write_14bytes):
+	movq	6(%eax), %xmm0
+	movq	%xmm0, 6(%edx)
+L(bk_write_6bytes):
+	movl	2(%eax), %ecx
+	movl	%ecx, 2(%edx)
+	movzwl	(%eax), %ecx
+	movw	%cx, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_42bytes):
+	movq	34(%eax), %xmm0
+	movq	%xmm0, 34(%edx)
+L(bk_write_34bytes):
+	movq	26(%eax), %xmm0
+	movq	%xmm0, 26(%edx)
+L(bk_write_26bytes):
+	movq	18(%eax), %xmm0
+	movq	%xmm0, 18(%edx)
+L(bk_write_18bytes):
+	movq	10(%eax), %xmm0
+	movq	%xmm0, 10(%edx)
+L(bk_write_10bytes):
+	movq	2(%eax), %xmm0
+	movq	%xmm0, 2(%edx)
+L(bk_write_2bytes):
+	movzwl	(%eax), %ecx
+	movw	%cx, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_47bytes):
+	movq	39(%eax), %xmm0
+	movq	%xmm0, 39(%edx)
+L(bk_write_39bytes):
+	movq	31(%eax), %xmm0
+	movq	%xmm0, 31(%edx)
+L(bk_write_31bytes):
+	movq	23(%eax), %xmm0
+	movq	%xmm0, 23(%edx)
+L(bk_write_23bytes):
+	movq	15(%eax), %xmm0
+	movq	%xmm0, 15(%edx)
+L(bk_write_15bytes):
+	movq	7(%eax), %xmm0
+	movq	%xmm0, 7(%edx)
+L(bk_write_7bytes):
+	movl	3(%eax), %ecx
+	movl	%ecx, 3(%edx)
+	movzwl	1(%eax), %ecx
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax
+	movb	%al, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_43bytes):
+	movq	35(%eax), %xmm0
+	movq	%xmm0, 35(%edx)
+L(bk_write_35bytes):
+	movq	27(%eax), %xmm0
+	movq	%xmm0, 27(%edx)
+L(bk_write_27bytes):
+	movq	19(%eax), %xmm0
+	movq	%xmm0, 19(%edx)
+L(bk_write_19bytes):
+	movq	11(%eax), %xmm0
+	movq	%xmm0, 11(%edx)
+L(bk_write_11bytes):
+	movq	3(%eax), %xmm0
+	movq	%xmm0, 3(%edx)
+L(bk_write_3bytes):
+	movzwl	1(%eax), %ecx
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax
+	movb	%al, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+#  endif
+# endif
+	RETURN_END
+
+
+	.pushsection .rodata.ssse3,"a",@progbits
+	.p2align 2
+L(table_48bytes_fwd):
+	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
+	.p2align 2
+L(table_48bytes_fwd_align):
+	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
+
+	.p2align 2
+L(shl_table):
+	.int	JMPTBL (L(shl_0), L(shl_table))
+	.int	JMPTBL (L(shl_1), L(shl_table))
+	.int	JMPTBL (L(shl_2), L(shl_table))
+	.int	JMPTBL (L(shl_3), L(shl_table))
+	.int	JMPTBL (L(shl_4), L(shl_table))
+	.int	JMPTBL (L(shl_5), L(shl_table))
+	.int	JMPTBL (L(shl_6), L(shl_table))
+	.int	JMPTBL (L(shl_7), L(shl_table))
+	.int	JMPTBL (L(shl_8), L(shl_table))
+	.int	JMPTBL (L(shl_9), L(shl_table))
+	.int	JMPTBL (L(shl_10), L(shl_table))
+	.int	JMPTBL (L(shl_11), L(shl_table))
+	.int	JMPTBL (L(shl_12), L(shl_table))
+	.int	JMPTBL (L(shl_13), L(shl_table))
+	.int	JMPTBL (L(shl_14), L(shl_table))
+	.int	JMPTBL (L(shl_15), L(shl_table))
+
+	.p2align 2
+L(table_48_bytes_bwd):
+	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+	.popsection
+
+# ifdef USE_AS_MEMMOVE
+	.p2align 4
+L(copy_backward):
+	PUSH (%edi)
+	movl	%eax, %edi
+	lea	(%ecx,%edx,1),%edx
+	lea	(%ecx,%edi,1),%edi
+	testl	$0x3, %edx
+	jnz	L(bk_align)
+
+L(bk_aligned_4):
+	cmp	$64, %ecx
+	jae	L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+	cmp	$32, %ecx
+	jb	L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+	/* Copy 32 bytes at a time.  */
+	sub	$32, %ecx
+	movq	-8(%edi), %xmm0
+	movq	%xmm0, -8(%edx)
+	movq	-16(%edi), %xmm0
+	movq	%xmm0, -16(%edx)
+	movq	-24(%edi), %xmm0
+	movq	%xmm0, -24(%edx)
+	movq	-32(%edi), %xmm0
+	movq	%xmm0, -32(%edx)
+	sub	$32, %edx
+	sub	$32, %edi
+
+L(bk_write_less32bytes):
+	movl	%edi, %eax
+	sub	%ecx, %edx
+	sub	%ecx, %eax
+	POP (%edi)
+L(bk_write_less32bytes_2):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(bk_align):
+	cmp	$8, %ecx
+	jbe	L(bk_write_less32bytes)
+	testl	$1, %edx
+	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
+	then	(EDX & 2) must be != 0.  */
+	jz	L(bk_got2)
+	sub	$1, %edi
+	sub	$1, %ecx
+	sub	$1, %edx
+	movzbl	(%edi), %eax
+	movb	%al, (%edx)
+
+	testl	$2, %edx
+	jz	L(bk_aligned_4)
+
+L(bk_got2):
+	sub	$2, %edi
+	sub	$2, %ecx
+	sub	$2, %edx
+	movzwl	(%edi), %eax
+	movw	%ax, (%edx)
+	jmp	L(bk_aligned_4)
+
+	.p2align 4
+L(bk_write_more64bytes):
+	/* Check alignment of last byte.  */
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+/* EDX is aligned 4 bytes, but not 16 bytes.  */
+L(bk_ssse3_align):
+	sub	$4, %edi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%edi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %edi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%edi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %edi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%edi), %eax
+	movl	%eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+	cmp	$64, %ecx
+	jb	L(bk_write_more32bytes)
+
+	.p2align 4
+L(bk_ssse3_cpy):
+	sub	$64, %edi
+	sub	$64, %ecx
+	sub	$64, %edx
+	movdqu	0x30(%edi), %xmm3
+	movdqa	%xmm3, 0x30(%edx)
+	movdqu	0x20(%edi), %xmm2
+	movdqa	%xmm2, 0x20(%edx)
+	movdqu	0x10(%edi), %xmm1
+	movdqa	%xmm1, 0x10(%edx)
+	movdqu	(%edi), %xmm0
+	movdqa	%xmm0, (%edx)
+	cmp	$64, %ecx
+	jae	L(bk_ssse3_cpy)
+	jmp	L(bk_write_64bytesless)
+
+# endif
+
+END (MEMCPY)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S
new file mode 100644
index 0000000000..f725944620
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S
@@ -0,0 +1,78 @@
+/* Multiple versions of memcpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  In static binaries we need memcpy before the initialization
+   happened.  */
+#if defined SHARED && IS_IN (libc)
+	.text
+ENTRY(memcpy)
+	.type	memcpy, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memcpy_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcpy_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcpy_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcpy_ssse3_rep)
+2:	ret
+END(memcpy)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memcpy_ia32, @function; \
+	.p2align 4; \
+	.globl __memcpy_ia32; \
+	.hidden __memcpy_ia32; \
+	__memcpy_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memcpy_ia32, .-__memcpy_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+	.type __memcpy_chk_ia32, @function; \
+	.globl __memcpy_chk_ia32; \
+	.p2align 4; \
+	__memcpy_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __memcpy_chk_ia32, .-__memcpy_chk_ia32
+
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_memcpy; __GI_memcpy = __memcpy_ia32
+#endif
+
+#include "../memcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S
new file mode 100644
index 0000000000..1b4fbe2e6f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S
@@ -0,0 +1,50 @@
+/* Multiple versions of __memcpy_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  There are no multiarch memcpy functions for static binaries.
+ */
+#if IS_IN (libc)
+# ifdef SHARED
+	.text
+ENTRY(__memcpy_chk)
+	.type	__memcpy_chk, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3_rep)
+2:	ret
+END(__memcpy_chk)
+# else
+#  include "../memcpy_chk.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S
new file mode 100644
index 0000000000..3873594cb2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY		__memmove_sse2_unaligned
+#define MEMCPY_CHK	__memmove_chk_sse2_unaligned
+#include "memcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
new file mode 100644
index 0000000000..d202fc4a13
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY		__memmove_ssse3_rep
+#define MEMCPY_CHK	__memmove_chk_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S
new file mode 100644
index 0000000000..295430b1ef
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY		__memmove_ssse3
+#define MEMCPY_CHK	__memmove_chk_ssse3
+#include "memcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S
new file mode 100644
index 0000000000..6eb418ca7f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S
@@ -0,0 +1,89 @@
+/* Multiple versions of memmove
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#if IS_IN (libc)
+	.text
+ENTRY(memmove)
+	.type	memmove, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memmove_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memmove_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memmove_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memmove_ssse3_rep)
+2:	ret
+END(memmove)
+
+# ifdef SHARED
+#  undef ENTRY
+#  define ENTRY(name) \
+	.type __memmove_ia32, @function; \
+	.p2align 4; \
+	.globl __memmove_ia32; \
+	.hidden __memmove_ia32; \
+	__memmove_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# else
+#  undef ENTRY
+#  define ENTRY(name) \
+	.type __memmove_ia32, @function; \
+	.globl __memmove_ia32; \
+	.p2align 4; \
+	__memmove_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# endif
+
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memmove_ia32, .-__memmove_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+	.type __memmove_chk_ia32, @function; \
+	.globl __memmove_chk_ia32; \
+	.p2align 4; \
+	__memmove_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __memmove_chk_ia32, .-__memmove_chk_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memmove; __GI_memmove = __memmove_ia32
+# endif
+#endif
+
+#include "../memmove.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S
new file mode 100644
index 0000000000..314834c4c6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S
@@ -0,0 +1,94 @@
+/* Multiple versions of __memmove_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__memmove_chk)
+	.type	__memmove_chk, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memmove_chk_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memmove_chk_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3_rep)
+2:	ret
+END(__memmove_chk)
+
+# ifndef SHARED
+	.type __memmove_chk_sse2_unaligned, @function
+	.p2align 4;
+__memmove_chk_sse2_unaligned:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memmove_sse2_unaligned
+	cfi_endproc
+	.size __memmove_chk_sse2_unaligned, .-__memmove_chk_sse2_unaligned
+
+	.type __memmove_chk_ssse3, @function
+	.p2align 4;
+__memmove_chk_ssse3:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memmove_ssse3
+	cfi_endproc
+	.size __memmove_chk_ssse3, .-__memmove_chk_ssse3
+
+	.type __memmove_chk_ssse3_rep, @function
+	.p2align 4;
+__memmove_chk_ssse3_rep:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memmove_ssse3_rep
+	cfi_endproc
+	.size __memmove_chk_ssse3_rep, .-__memmove_chk_ssse3_rep
+
+	.type __memmove_chk_ia32, @function
+	.p2align 4;
+__memmove_chk_ia32:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memmove_ia32
+	cfi_endproc
+	.size __memmove_chk_ia32, .-__memmove_chk_ia32
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..a1cea50771
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY		__mempcpy_sse2_unaligned
+#define MEMCPY_CHK	__mempcpy_chk_sse2_unaligned
+#include "memcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
new file mode 100644
index 0000000000..5357b33e18
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY		__mempcpy_ssse3_rep
+#define MEMCPY_CHK	__mempcpy_chk_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
new file mode 100644
index 0000000000..822d98e954
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY		__mempcpy_ssse3
+#define MEMCPY_CHK	__mempcpy_chk_ssse3
+#include "memcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S
new file mode 100644
index 0000000000..06e377fbc9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S
@@ -0,0 +1,81 @@
+/* Multiple versions of mempcpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  In static binaries we need mempcpy before the initialization
+   happened.  */
+#if defined SHARED && IS_IN (libc)
+	.text
+ENTRY(__mempcpy)
+	.type	__mempcpy, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__mempcpy_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__mempcpy_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__mempcpy_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__mempcpy_ssse3_rep)
+2:	ret
+END(__mempcpy)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __mempcpy_ia32, @function; \
+	.p2align 4; \
+	.globl __mempcpy_ia32; \
+	.hidden __mempcpy_ia32; \
+	__mempcpy_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __mempcpy_ia32, .-__mempcpy_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+	.type __mempcpy_chk_ia32, @function; \
+	.globl __mempcpy_chk_ia32; \
+	.p2align 4; \
+	__mempcpy_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __mempcpy_chk_ia32, .-__mempcpy_chk_ia32
+
+# undef libc_hidden_def
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_def(name) \
+	.globl __GI_mempcpy; __GI_mempcpy = __mempcpy_ia32
+# define libc_hidden_builtin_def(name) \
+	.globl __GI___mempcpy; __GI___mempcpy = __mempcpy_ia32
+#endif
+
+#include "../mempcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S
new file mode 100644
index 0000000000..e13e5248a5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S
@@ -0,0 +1,50 @@
+/* Multiple versions of __mempcpy_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  There are no multiarch mempcpy functions for static binaries.
+ */
+#if IS_IN (libc)
+# ifdef SHARED
+	.text
+ENTRY(__mempcpy_chk)
+	.type	__mempcpy_chk, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3_rep)
+2:	ret
+END(__mempcpy_chk)
+# else
+#  include "../mempcpy_chk.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c
new file mode 100644
index 0000000000..ef7bbbe792
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c
@@ -0,0 +1,7 @@
+#if IS_IN (libc)
+# define MEMRCHR  __memrchr_ia32
+# include <string.h>
+extern void *__memrchr_ia32 (const void *, int, size_t);
+#endif
+
+#include "string/memrchr.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
new file mode 100644
index 0000000000..dbbe94fd08
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
@@ -0,0 +1,417 @@
+/* Optimized memrchr with sse2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+# define LEN   STR2+4
+
+# define MEMCHR __memrchr_sse2_bsf
+
+	.text
+ENTRY (MEMCHR)
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+	mov	LEN(%esp), %edx
+
+	sub	$16, %edx
+	jbe	L(length_less16)
+
+	punpcklbw %xmm1, %xmm1
+	add	%edx, %ecx
+	punpcklbw %xmm1, %xmm1
+
+	movdqu	(%ecx), %xmm0
+	pshufd	$0, %xmm1, %xmm1
+	pcmpeqb	%xmm1, %xmm0
+
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+	sub	$64, %ecx
+	mov	%ecx, %eax
+	and	$15, %eax
+	jz	L(loop_prolog)
+
+	add	$16, %ecx
+	add	$16, %edx
+	sub	%eax, %ecx
+	sub	%eax, %edx
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop_prolog):
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm4
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+	mov	%ecx, %eax
+	and	$63, %eax
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	add	$64, %ecx
+	add	$64, %edx
+	sub	%eax, %ecx
+	sub	%eax, %edx
+
+	.p2align 4
+L(align64_loop):
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	(%ecx), %xmm0
+	movdqa	16(%ecx), %xmm2
+	movdqa	32(%ecx), %xmm3
+	movdqa	48(%ecx), %xmm4
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm3, %xmm0
+	pmaxub	%xmm4, %xmm2
+	pmaxub	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm2
+
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	(%ecx), %xmm1
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	pmovmskb %xmm1, %eax
+	bsr	%eax, %eax
+
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches0_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(exit_loop_32):
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	32(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(matches0):
+	bsr	%eax, %eax
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16):
+	bsr	%eax, %eax
+	lea	16(%eax, %ecx), %eax
+	ret
+
+	.p2align 4
+L(matches32):
+	bsr	%eax, %eax
+	lea	32(%eax, %ecx), %eax
+	ret
+
+	.p2align 4
+L(matches48):
+	bsr	%eax, %eax
+	lea	48(%eax, %ecx), %eax
+	ret
+
+	.p2align 4
+L(matches0_1):
+	bsr	%eax, %eax
+	sub	$64, %edx
+	add	%eax, %edx
+	jl	L(return_null)
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16_1):
+	bsr	%eax, %eax
+	sub	$48, %edx
+	add	%eax, %edx
+	jl	L(return_null)
+	lea	16(%ecx, %eax), %eax
+	ret
+
+	.p2align 4
+L(matches32_1):
+	bsr	%eax, %eax
+	sub	$32, %edx
+	add	%eax, %edx
+	jl	L(return_null)
+	lea	32(%ecx, %eax), %eax
+	ret
+
+	.p2align 4
+L(matches48_1):
+	bsr	%eax, %eax
+	sub	$16, %edx
+	add	%eax, %edx
+	jl	L(return_null)
+	lea	48(%ecx, %eax), %eax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(length_less16_offset0):
+	mov	%dl, %cl
+	pcmpeqb	(%eax), %xmm1
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+	mov	%edx, %ecx
+
+	pmovmskb %xmm1, %edx
+
+	and	%ecx, %edx
+	test	%edx, %edx
+	jz	L(return_null)
+
+	bsr	%edx, %ecx
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(length_less16):
+	punpcklbw %xmm1, %xmm1
+	mov	%ecx, %eax
+	punpcklbw %xmm1, %xmm1
+	add	$16, %edx
+	jz	L(return_null)
+
+	pshufd	$0, %xmm1, %xmm1
+	and	$15, %ecx
+	jz	L(length_less16_offset0)
+
+	PUSH	(%edi)
+	mov	%cl, %dh
+	add	%dl, %dh
+	and	$-16, %eax
+
+	sub	$16, %dh
+	ja	L(length_less16_part2)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	sar	%cl, %edi
+	add	%ecx, %eax
+	mov	%dl, %cl
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(length_less16_part2):
+	movdqa	16(%eax), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %edi
+
+	mov	%cl, %ch
+
+	mov	%dh, %cl
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %edi
+
+	test	%edi, %edi
+	jnz	L(length_less16_part2_return)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	mov	%ch, %cl
+	sar	%cl, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	xor	%ch, %ch
+	add	%ecx, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(length_less16_part2_return):
+	bsr	%edi, %edi
+	lea	16(%eax, %edi), %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(ret_null):
+	xor	%eax, %eax
+	POP	(%edi)
+	ret
+
+END (MEMCHR)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S
new file mode 100644
index 0000000000..5f7853f683
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S
@@ -0,0 +1,724 @@
+/* Optimized memrchr with sse2 without bsf
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+# define LEN   STR2+4
+
+	atom_text_section
+ENTRY (__memrchr_sse2)
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+	mov	LEN(%esp), %edx
+
+	sub	$16, %edx
+	jbe	L(length_less16)
+
+	punpcklbw %xmm1, %xmm1
+	add	%edx, %ecx
+	punpcklbw %xmm1, %xmm1
+
+	movdqu	(%ecx), %xmm0
+	pshufd	$0, %xmm1, %xmm1
+	pcmpeqb	%xmm1, %xmm0
+
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	sub	$64, %ecx
+	mov	%ecx, %eax
+	and	$15, %eax
+	jz	L(loop_prolog)
+
+	lea	16(%ecx), %ecx
+	lea	16(%edx), %edx
+	sub	%eax, %edx
+	and	$-16, %ecx
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop_prolog):
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm4
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	mov	%ecx, %eax
+	and	$63, %eax
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	lea	64(%ecx), %ecx
+	lea	64(%edx), %edx
+	and	$-64, %ecx
+	sub	%eax, %edx
+
+	.p2align 4
+L(align64_loop):
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	(%ecx), %xmm0
+	movdqa	16(%ecx), %xmm2
+	movdqa	32(%ecx), %xmm3
+	movdqa	48(%ecx), %xmm4
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm3, %xmm0
+	pmaxub	%xmm4, %xmm2
+	pmaxub	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm2
+
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	(%ecx), %xmm1
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	pmovmskb %xmm1, %eax
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches0_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(exit_loop_32):
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	32(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(matches16):
+	lea	16(%ecx), %ecx
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches32):
+	lea	32(%ecx), %ecx
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches48):
+	lea	48(%ecx), %ecx
+
+	.p2align 4
+L(exit_dispatch):
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_8):
+	test	$0x80, %al
+	jnz	L(exit_8)
+	test	$0x40, %al
+	jnz	L(exit_7)
+	test	$0x20, %al
+	jnz	L(exit_6)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_high):
+	mov	%ah, %dh
+	and	$15 << 4, %dh
+	jnz	L(exit_dispatch_high_8)
+	test	$0x08, %ah
+	jnz	L(exit_12)
+	test	$0x04, %ah
+	jnz	L(exit_11)
+	test	$0x02, %ah
+	jnz	L(exit_10)
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_high_8):
+	test	$0x80, %ah
+	jnz	L(exit_16)
+	test	$0x40, %ah
+	jnz	L(exit_15)
+	test	$0x20, %ah
+	jnz	L(exit_14)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_2):
+	lea	1(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_3):
+	lea	2(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_4):
+	lea	3(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_6):
+	lea	5(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_7):
+	lea	6(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_8):
+	lea	7(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_10):
+	lea	9(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_11):
+	lea	10(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_12):
+	lea	11(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_14):
+	lea	13(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_15):
+	lea	14(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_16):
+	lea	15(%ecx), %eax
+	ret
+
+	.p2align 4
+L(matches0_1):
+	lea	-64(%edx), %edx
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16_1):
+	lea	-48(%edx), %edx
+	lea	16(%ecx), %ecx
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches32_1):
+	lea	-32(%edx), %edx
+	lea	32(%ecx), %ecx
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches48_1):
+	lea	-16(%edx), %edx
+	lea	48(%ecx), %ecx
+
+	.p2align 4
+L(exit_dispatch_1):
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_8):
+	test	$0x80, %al
+	jnz	L(exit_1_8)
+	test	$0x40, %al
+	jnz	L(exit_1_7)
+	test	$0x20, %al
+	jnz	L(exit_1_6)
+	add	$4, %edx
+	jl	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_high):
+	mov	%ah, %al
+	and	$15 << 4, %al
+	jnz	L(exit_dispatch_1_high_8)
+	test	$0x08, %ah
+	jnz	L(exit_1_12)
+	test	$0x04, %ah
+	jnz	L(exit_1_11)
+	test	$0x02, %ah
+	jnz	L(exit_1_10)
+	add	$8, %edx
+	jl	L(return_null)
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_high_8):
+	test	$0x80, %ah
+	jnz	L(exit_1_16)
+	test	$0x40, %ah
+	jnz	L(exit_1_15)
+	test	$0x20, %ah
+	jnz	L(exit_1_14)
+	add	$12, %edx
+	jl	L(return_null)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_2):
+	add	$1, %edx
+	jl	L(return_null)
+	lea	1(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_3):
+	add	$2, %edx
+	jl	L(return_null)
+	lea	2(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_4):
+	add	$3, %edx
+	jl	L(return_null)
+	lea	3(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_6):
+	add	$5, %edx
+	jl	L(return_null)
+	lea	5(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_7):
+	add	$6, %edx
+	jl	L(return_null)
+	lea	6(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_8):
+	add	$7, %edx
+	jl	L(return_null)
+	lea	7(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_10):
+	add	$9, %edx
+	jl	L(return_null)
+	lea	9(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_11):
+	add	$10, %edx
+	jl	L(return_null)
+	lea	10(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_12):
+	add	$11, %edx
+	jl	L(return_null)
+	lea	11(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_14):
+	add	$13, %edx
+	jl	L(return_null)
+	lea	13(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_15):
+	add	$14, %edx
+	jl	L(return_null)
+	lea	14(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_16):
+	add	$15, %edx
+	jl	L(return_null)
+	lea	15(%ecx), %eax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(length_less16_offset0):
+	mov	%dl, %cl
+	pcmpeqb	(%eax), %xmm1
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	mov	%eax, %ecx
+	pmovmskb %xmm1, %eax
+
+	and	%edx, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(length_less16):
+	punpcklbw %xmm1, %xmm1
+	add	$16, %edx
+	je	L(return_null)
+	punpcklbw %xmm1, %xmm1
+
+	mov	%ecx, %eax
+	pshufd	$0, %xmm1, %xmm1
+
+	and	$15, %ecx
+	jz	L(length_less16_offset0)
+
+	PUSH	(%edi)
+
+	mov	%cl, %dh
+	add	%dl, %dh
+	and	$-16, %eax
+
+	sub	$16, %dh
+	ja	L(length_less16_part2)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	sar	%cl, %edi
+	add	%ecx, %eax
+	mov	%dl, %cl
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(length_less16_part2):
+	movdqa	16(%eax), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %edi
+
+	mov	%cl, %ch
+
+	mov	%dh, %cl
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %edi
+
+	test	%edi, %edi
+	jnz	L(length_less16_part2_return)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	mov	%ch, %cl
+	sar	%cl, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	xor	%ch, %ch
+	add	%ecx, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(length_less16_part2_return):
+	bsr	%edi, %edi
+	lea	16(%eax, %edi), %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(ret_null):
+	xor	%eax, %eax
+	POP	(%edi)
+	ret
+
+END (__memrchr_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S
new file mode 100644
index 0000000000..d4253a553b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S
@@ -0,0 +1,45 @@
+/* Multiple versions of memrchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__memrchr)
+	.type	__memrchr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	3f
+
+	LOAD_FUNC_GOT_EAX (__memrchr_sse2)
+	ret
+
+2:	LOAD_FUNC_GOT_EAX (__memrchr_ia32)
+	ret
+
+3:	LOAD_FUNC_GOT_EAX (__memrchr_sse2_bsf)
+	ret
+END(__memrchr)
+
+weak_alias(__memrchr, memrchr)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
new file mode 100644
index 0000000000..3221077e49
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
@@ -0,0 +1,811 @@
+/* memset with SSE2 and REP string.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_BZERO
+# define DEST		PARMS
+# define LEN		DEST+4
+# define SETRTNVAL
+#else
+# define DEST		PARMS
+# define CHR		DEST+4
+# define LEN		CHR+4
+# define SETRTNVAL	movl DEST(%esp), %eax
+#endif
+
+#ifdef SHARED
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)
+# define PARMS		8		/* Preserve EBX.  */
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.   */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    /* We first load PC into EBX.  */				\
+    SETUP_PIC_REG(bx);						\
+    /* Get the address of the jump table.  */			\
+    add		$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    add		(%ebx,%ecx,4), %ebx;				\
+    add		%ecx, %edx;					\
+    /* We loaded the jump table and adjusted EDX. Go.  */	\
+    jmp		*%ebx
+#else
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define PARMS		4
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    add		%ecx, %edx;					\
+    jmp		*TABLE(,%ecx,4)
+#endif
+
+	.section .text.sse2,"ax",@progbits
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk_sse2_rep)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_sse2_rep)
+#endif
+ENTRY (__memset_sse2_rep)
+	ENTRANCE
+
+	movl	LEN(%esp), %ecx
+#ifdef USE_AS_BZERO
+	xor	%eax, %eax
+#else
+	movzbl	CHR(%esp), %eax
+	movb	%al, %ah
+	/* Fill the whole EAX with pattern.  */
+	movl	%eax, %edx
+	shl	$16, %eax
+	or	%edx, %eax
+#endif
+	movl	DEST(%esp), %edx
+	cmp	$32, %ecx
+	jae	L(32bytesormore)
+
+L(write_less32bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
+
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_less_32bytes):
+	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
+	.popsection
+
+	ALIGN (4)
+L(write_28bytes):
+	movl	%eax, -28(%edx)
+L(write_24bytes):
+	movl	%eax, -24(%edx)
+L(write_20bytes):
+	movl	%eax, -20(%edx)
+L(write_16bytes):
+	movl	%eax, -16(%edx)
+L(write_12bytes):
+	movl	%eax, -12(%edx)
+L(write_8bytes):
+	movl	%eax, -8(%edx)
+L(write_4bytes):
+	movl	%eax, -4(%edx)
+L(write_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_29bytes):
+	movl	%eax, -29(%edx)
+L(write_25bytes):
+	movl	%eax, -25(%edx)
+L(write_21bytes):
+	movl	%eax, -21(%edx)
+L(write_17bytes):
+	movl	%eax, -17(%edx)
+L(write_13bytes):
+	movl	%eax, -13(%edx)
+L(write_9bytes):
+	movl	%eax, -9(%edx)
+L(write_5bytes):
+	movl	%eax, -5(%edx)
+L(write_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_30bytes):
+	movl	%eax, -30(%edx)
+L(write_26bytes):
+	movl	%eax, -26(%edx)
+L(write_22bytes):
+	movl	%eax, -22(%edx)
+L(write_18bytes):
+	movl	%eax, -18(%edx)
+L(write_14bytes):
+	movl	%eax, -14(%edx)
+L(write_10bytes):
+	movl	%eax, -10(%edx)
+L(write_6bytes):
+	movl	%eax, -6(%edx)
+L(write_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_31bytes):
+	movl	%eax, -31(%edx)
+L(write_27bytes):
+	movl	%eax, -27(%edx)
+L(write_23bytes):
+	movl	%eax, -23(%edx)
+L(write_19bytes):
+	movl	%eax, -19(%edx)
+L(write_15bytes):
+	movl	%eax, -15(%edx)
+L(write_11bytes):
+	movl	%eax, -11(%edx)
+L(write_7bytes):
+	movl	%eax, -7(%edx)
+L(write_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+/* ECX > 32 and EDX is 4 byte aligned.  */
+L(32bytesormore):
+	/* Fill xmm0 with the pattern.  */
+#ifdef USE_AS_BZERO
+	pxor	%xmm0, %xmm0
+#else
+	movd	%eax, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+#endif
+	testl	$0xf, %edx
+	jz	L(aligned_16)
+/* ECX > 32 and EDX is not 16 byte aligned.  */
+L(not_aligned_16):
+	movdqu	%xmm0, (%edx)
+	movl	%edx, %eax
+	and	$-16, %edx
+	add	$16, %edx
+	sub	%edx, %eax
+	add	%eax, %ecx
+	movd	%xmm0, %eax
+
+	ALIGN (4)
+L(aligned_16):
+	cmp	$128, %ecx
+	jae	L(128bytesormore)
+
+L(aligned_16_less128bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	ALIGN (4)
+L(128bytesormore):
+	PUSH (%edi)
+#ifdef DATA_CACHE_SIZE
+	PUSH (%ebx)
+	mov	$DATA_CACHE_SIZE, %ebx
+#else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_data_cache_size@GOTOFF(%ebx), %ebx
+# else
+	PUSH (%ebx)
+	mov	__x86_data_cache_size, %ebx
+# endif
+#endif
+	mov	%ebx, %edi
+	shr	$4, %ebx
+	sub	%ebx, %edi
+#if defined DATA_CACHE_SIZE || !defined SHARED
+	POP (%ebx)
+#endif
+/*
+ * When data size approximate the end of L1 cache,
+ * fast string will prefetch and combine data efficiently.
+ */
+	cmp	%edi, %ecx
+	jae	L(128bytesormore_endof_L1)
+	subl	$128, %ecx
+L(128bytesormore_normal):
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx
+	jb	L(128bytesless_normal)
+
+
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx
+	jae	L(128bytesormore_normal)
+
+L(128bytesless_normal):
+	POP (%edi)
+	add	$128, %ecx
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	CFI_PUSH (%edi)
+	ALIGN (4)
+L(128bytesormore_endof_L1):
+	mov	%edx, %edi
+	mov	%ecx, %edx
+	shr	$2, %ecx
+	and	$3, %edx
+	rep stosl
+	jz	L(copy_page_by_rep_exit)
+	cmp	$2, %edx
+	jb	L(copy_page_by_rep_left_1)
+	movw	%ax, (%edi)
+	add	$2, %edi
+	sub	$2, %edx
+	jz	L(copy_page_by_rep_exit)
+L(copy_page_by_rep_left_1):
+	movb	%al, (%edi)
+L(copy_page_by_rep_exit):
+	POP (%edi)
+	SETRTNVAL
+	RETURN
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_16_128bytes):
+	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
+	.popsection
+
+	ALIGN (4)
+L(aligned_16_112bytes):
+	movdqa	%xmm0, -112(%edx)
+L(aligned_16_96bytes):
+	movdqa	%xmm0, -96(%edx)
+L(aligned_16_80bytes):
+	movdqa	%xmm0, -80(%edx)
+L(aligned_16_64bytes):
+	movdqa	%xmm0, -64(%edx)
+L(aligned_16_48bytes):
+	movdqa	%xmm0, -48(%edx)
+L(aligned_16_32bytes):
+	movdqa	%xmm0, -32(%edx)
+L(aligned_16_16bytes):
+	movdqa	%xmm0, -16(%edx)
+L(aligned_16_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_113bytes):
+	movdqa	%xmm0, -113(%edx)
+L(aligned_16_97bytes):
+	movdqa	%xmm0, -97(%edx)
+L(aligned_16_81bytes):
+	movdqa	%xmm0, -81(%edx)
+L(aligned_16_65bytes):
+	movdqa	%xmm0, -65(%edx)
+L(aligned_16_49bytes):
+	movdqa	%xmm0, -49(%edx)
+L(aligned_16_33bytes):
+	movdqa	%xmm0, -33(%edx)
+L(aligned_16_17bytes):
+	movdqa	%xmm0, -17(%edx)
+L(aligned_16_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_114bytes):
+	movdqa	%xmm0, -114(%edx)
+L(aligned_16_98bytes):
+	movdqa	%xmm0, -98(%edx)
+L(aligned_16_82bytes):
+	movdqa	%xmm0, -82(%edx)
+L(aligned_16_66bytes):
+	movdqa	%xmm0, -66(%edx)
+L(aligned_16_50bytes):
+	movdqa	%xmm0, -50(%edx)
+L(aligned_16_34bytes):
+	movdqa	%xmm0, -34(%edx)
+L(aligned_16_18bytes):
+	movdqa	%xmm0, -18(%edx)
+L(aligned_16_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_115bytes):
+	movdqa	%xmm0, -115(%edx)
+L(aligned_16_99bytes):
+	movdqa	%xmm0, -99(%edx)
+L(aligned_16_83bytes):
+	movdqa	%xmm0, -83(%edx)
+L(aligned_16_67bytes):
+	movdqa	%xmm0, -67(%edx)
+L(aligned_16_51bytes):
+	movdqa	%xmm0, -51(%edx)
+L(aligned_16_35bytes):
+	movdqa	%xmm0, -35(%edx)
+L(aligned_16_19bytes):
+	movdqa	%xmm0, -19(%edx)
+L(aligned_16_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_116bytes):
+	movdqa	%xmm0, -116(%edx)
+L(aligned_16_100bytes):
+	movdqa	%xmm0, -100(%edx)
+L(aligned_16_84bytes):
+	movdqa	%xmm0, -84(%edx)
+L(aligned_16_68bytes):
+	movdqa	%xmm0, -68(%edx)
+L(aligned_16_52bytes):
+	movdqa	%xmm0, -52(%edx)
+L(aligned_16_36bytes):
+	movdqa	%xmm0, -36(%edx)
+L(aligned_16_20bytes):
+	movdqa	%xmm0, -20(%edx)
+L(aligned_16_4bytes):
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_117bytes):
+	movdqa	%xmm0, -117(%edx)
+L(aligned_16_101bytes):
+	movdqa	%xmm0, -101(%edx)
+L(aligned_16_85bytes):
+	movdqa	%xmm0, -85(%edx)
+L(aligned_16_69bytes):
+	movdqa	%xmm0, -69(%edx)
+L(aligned_16_53bytes):
+	movdqa	%xmm0, -53(%edx)
+L(aligned_16_37bytes):
+	movdqa	%xmm0, -37(%edx)
+L(aligned_16_21bytes):
+	movdqa	%xmm0, -21(%edx)
+L(aligned_16_5bytes):
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_118bytes):
+	movdqa	%xmm0, -118(%edx)
+L(aligned_16_102bytes):
+	movdqa	%xmm0, -102(%edx)
+L(aligned_16_86bytes):
+	movdqa	%xmm0, -86(%edx)
+L(aligned_16_70bytes):
+	movdqa	%xmm0, -70(%edx)
+L(aligned_16_54bytes):
+	movdqa	%xmm0, -54(%edx)
+L(aligned_16_38bytes):
+	movdqa	%xmm0, -38(%edx)
+L(aligned_16_22bytes):
+	movdqa	%xmm0, -22(%edx)
+L(aligned_16_6bytes):
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_119bytes):
+	movdqa	%xmm0, -119(%edx)
+L(aligned_16_103bytes):
+	movdqa	%xmm0, -103(%edx)
+L(aligned_16_87bytes):
+	movdqa	%xmm0, -87(%edx)
+L(aligned_16_71bytes):
+	movdqa	%xmm0, -71(%edx)
+L(aligned_16_55bytes):
+	movdqa	%xmm0, -55(%edx)
+L(aligned_16_39bytes):
+	movdqa	%xmm0, -39(%edx)
+L(aligned_16_23bytes):
+	movdqa	%xmm0, -23(%edx)
+L(aligned_16_7bytes):
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_120bytes):
+	movdqa	%xmm0, -120(%edx)
+L(aligned_16_104bytes):
+	movdqa	%xmm0, -104(%edx)
+L(aligned_16_88bytes):
+	movdqa	%xmm0, -88(%edx)
+L(aligned_16_72bytes):
+	movdqa	%xmm0, -72(%edx)
+L(aligned_16_56bytes):
+	movdqa	%xmm0, -56(%edx)
+L(aligned_16_40bytes):
+	movdqa	%xmm0, -40(%edx)
+L(aligned_16_24bytes):
+	movdqa	%xmm0, -24(%edx)
+L(aligned_16_8bytes):
+	movq	%xmm0, -8(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_121bytes):
+	movdqa	%xmm0, -121(%edx)
+L(aligned_16_105bytes):
+	movdqa	%xmm0, -105(%edx)
+L(aligned_16_89bytes):
+	movdqa	%xmm0, -89(%edx)
+L(aligned_16_73bytes):
+	movdqa	%xmm0, -73(%edx)
+L(aligned_16_57bytes):
+	movdqa	%xmm0, -57(%edx)
+L(aligned_16_41bytes):
+	movdqa	%xmm0, -41(%edx)
+L(aligned_16_25bytes):
+	movdqa	%xmm0, -25(%edx)
+L(aligned_16_9bytes):
+	movq	%xmm0, -9(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_122bytes):
+	movdqa	%xmm0, -122(%edx)
+L(aligned_16_106bytes):
+	movdqa	%xmm0, -106(%edx)
+L(aligned_16_90bytes):
+	movdqa	%xmm0, -90(%edx)
+L(aligned_16_74bytes):
+	movdqa	%xmm0, -74(%edx)
+L(aligned_16_58bytes):
+	movdqa	%xmm0, -58(%edx)
+L(aligned_16_42bytes):
+	movdqa	%xmm0, -42(%edx)
+L(aligned_16_26bytes):
+	movdqa	%xmm0, -26(%edx)
+L(aligned_16_10bytes):
+	movq	%xmm0, -10(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_123bytes):
+	movdqa	%xmm0, -123(%edx)
+L(aligned_16_107bytes):
+	movdqa	%xmm0, -107(%edx)
+L(aligned_16_91bytes):
+	movdqa	%xmm0, -91(%edx)
+L(aligned_16_75bytes):
+	movdqa	%xmm0, -75(%edx)
+L(aligned_16_59bytes):
+	movdqa	%xmm0, -59(%edx)
+L(aligned_16_43bytes):
+	movdqa	%xmm0, -43(%edx)
+L(aligned_16_27bytes):
+	movdqa	%xmm0, -27(%edx)
+L(aligned_16_11bytes):
+	movq	%xmm0, -11(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_124bytes):
+	movdqa	%xmm0, -124(%edx)
+L(aligned_16_108bytes):
+	movdqa	%xmm0, -108(%edx)
+L(aligned_16_92bytes):
+	movdqa	%xmm0, -92(%edx)
+L(aligned_16_76bytes):
+	movdqa	%xmm0, -76(%edx)
+L(aligned_16_60bytes):
+	movdqa	%xmm0, -60(%edx)
+L(aligned_16_44bytes):
+	movdqa	%xmm0, -44(%edx)
+L(aligned_16_28bytes):
+	movdqa	%xmm0, -28(%edx)
+L(aligned_16_12bytes):
+	movq	%xmm0, -12(%edx)
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_125bytes):
+	movdqa	%xmm0, -125(%edx)
+L(aligned_16_109bytes):
+	movdqa	%xmm0, -109(%edx)
+L(aligned_16_93bytes):
+	movdqa	%xmm0, -93(%edx)
+L(aligned_16_77bytes):
+	movdqa	%xmm0, -77(%edx)
+L(aligned_16_61bytes):
+	movdqa	%xmm0, -61(%edx)
+L(aligned_16_45bytes):
+	movdqa	%xmm0, -45(%edx)
+L(aligned_16_29bytes):
+	movdqa	%xmm0, -29(%edx)
+L(aligned_16_13bytes):
+	movq	%xmm0, -13(%edx)
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_126bytes):
+	movdqa	%xmm0, -126(%edx)
+L(aligned_16_110bytes):
+	movdqa	%xmm0, -110(%edx)
+L(aligned_16_94bytes):
+	movdqa	%xmm0, -94(%edx)
+L(aligned_16_78bytes):
+	movdqa	%xmm0, -78(%edx)
+L(aligned_16_62bytes):
+	movdqa	%xmm0, -62(%edx)
+L(aligned_16_46bytes):
+	movdqa	%xmm0, -46(%edx)
+L(aligned_16_30bytes):
+	movdqa	%xmm0, -30(%edx)
+L(aligned_16_14bytes):
+	movq	%xmm0, -14(%edx)
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_127bytes):
+	movdqa	%xmm0, -127(%edx)
+L(aligned_16_111bytes):
+	movdqa	%xmm0, -111(%edx)
+L(aligned_16_95bytes):
+	movdqa	%xmm0, -95(%edx)
+L(aligned_16_79bytes):
+	movdqa	%xmm0, -79(%edx)
+L(aligned_16_63bytes):
+	movdqa	%xmm0, -63(%edx)
+L(aligned_16_47bytes):
+	movdqa	%xmm0, -47(%edx)
+L(aligned_16_31bytes):
+	movdqa	%xmm0, -31(%edx)
+L(aligned_16_15bytes):
+	movq	%xmm0, -15(%edx)
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN_END
+
+END (__memset_sse2_rep)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S
new file mode 100644
index 0000000000..d7b8be9114
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S
@@ -0,0 +1,860 @@
+/* memset with SSE2
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_BZERO
+# define DEST		PARMS
+# define LEN		DEST+4
+# define SETRTNVAL
+#else
+# define DEST		PARMS
+# define CHR		DEST+4
+# define LEN		CHR+4
+# define SETRTNVAL	movl DEST(%esp), %eax
+#endif
+
+#ifdef SHARED
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)
+# define PARMS		8		/* Preserve EBX.  */
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.   */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    /* We first load PC into EBX.  */				\
+    SETUP_PIC_REG(bx);						\
+    /* Get the address of the jump table.  */			\
+    add		$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    add		(%ebx,%ecx,4), %ebx;				\
+    add		%ecx, %edx;					\
+    /* We loaded the jump table and adjusted EDX. Go.  */	\
+    jmp		*%ebx
+#else
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define PARMS		4
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    add		%ecx, %edx;					\
+    jmp		*TABLE(,%ecx,4)
+#endif
+
+	.section .text.sse2,"ax",@progbits
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk_sse2)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_sse2)
+#endif
+ENTRY (__memset_sse2)
+	ENTRANCE
+
+	movl	LEN(%esp), %ecx
+#ifdef USE_AS_BZERO
+	xor	%eax, %eax
+#else
+	movzbl	CHR(%esp), %eax
+	movb	%al, %ah
+	/* Fill the whole EAX with pattern.  */
+	movl	%eax, %edx
+	shl	$16, %eax
+	or	%edx, %eax
+#endif
+	movl	DEST(%esp), %edx
+	cmp	$32, %ecx
+	jae	L(32bytesormore)
+
+L(write_less32bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
+
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_less_32bytes):
+	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
+	.popsection
+
+	ALIGN (4)
+L(write_28bytes):
+	movl	%eax, -28(%edx)
+L(write_24bytes):
+	movl	%eax, -24(%edx)
+L(write_20bytes):
+	movl	%eax, -20(%edx)
+L(write_16bytes):
+	movl	%eax, -16(%edx)
+L(write_12bytes):
+	movl	%eax, -12(%edx)
+L(write_8bytes):
+	movl	%eax, -8(%edx)
+L(write_4bytes):
+	movl	%eax, -4(%edx)
+L(write_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_29bytes):
+	movl	%eax, -29(%edx)
+L(write_25bytes):
+	movl	%eax, -25(%edx)
+L(write_21bytes):
+	movl	%eax, -21(%edx)
+L(write_17bytes):
+	movl	%eax, -17(%edx)
+L(write_13bytes):
+	movl	%eax, -13(%edx)
+L(write_9bytes):
+	movl	%eax, -9(%edx)
+L(write_5bytes):
+	movl	%eax, -5(%edx)
+L(write_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_30bytes):
+	movl	%eax, -30(%edx)
+L(write_26bytes):
+	movl	%eax, -26(%edx)
+L(write_22bytes):
+	movl	%eax, -22(%edx)
+L(write_18bytes):
+	movl	%eax, -18(%edx)
+L(write_14bytes):
+	movl	%eax, -14(%edx)
+L(write_10bytes):
+	movl	%eax, -10(%edx)
+L(write_6bytes):
+	movl	%eax, -6(%edx)
+L(write_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_31bytes):
+	movl	%eax, -31(%edx)
+L(write_27bytes):
+	movl	%eax, -27(%edx)
+L(write_23bytes):
+	movl	%eax, -23(%edx)
+L(write_19bytes):
+	movl	%eax, -19(%edx)
+L(write_15bytes):
+	movl	%eax, -15(%edx)
+L(write_11bytes):
+	movl	%eax, -11(%edx)
+L(write_7bytes):
+	movl	%eax, -7(%edx)
+L(write_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+/* ECX > 32 and EDX is 4 byte aligned.  */
+L(32bytesormore):
+	/* Fill xmm0 with the pattern.  */
+#ifdef USE_AS_BZERO
+	pxor	%xmm0, %xmm0
+#else
+	movd	%eax, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+#endif
+	testl	$0xf, %edx
+	jz	L(aligned_16)
+/* ECX > 32 and EDX is not 16 byte aligned.  */
+L(not_aligned_16):
+	movdqu	%xmm0, (%edx)
+	movl	%edx, %eax
+	and	$-16, %edx
+	add	$16, %edx
+	sub	%edx, %eax
+	add	%eax, %ecx
+	movd	%xmm0, %eax
+
+	ALIGN (4)
+L(aligned_16):
+	cmp	$128, %ecx
+	jae	L(128bytesormore)
+
+L(aligned_16_less128bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	ALIGN (4)
+L(128bytesormore):
+#ifdef SHARED_CACHE_SIZE
+	PUSH (%ebx)
+	mov	$SHARED_CACHE_SIZE, %ebx
+#else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
+# else
+	PUSH (%ebx)
+	mov	__x86_shared_cache_size, %ebx
+# endif
+#endif
+	cmp	%ebx, %ecx
+	jae	L(128bytesormore_nt_start)
+
+
+#ifdef DATA_CACHE_SIZE
+	POP (%ebx)
+# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
+	cmp	$DATA_CACHE_SIZE, %ecx
+#else
+# ifdef SHARED
+#  define RESTORE_EBX_STATE
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
+# else
+	POP (%ebx)
+#  define RESTORE_EBX_STATE CFI_PUSH (%ebx)
+	cmp	__x86_data_cache_size, %ecx
+# endif
+#endif
+
+	jae	L(128bytes_L2_normal)
+	subl	$128, %ecx
+L(128bytesormore_normal):
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx
+	jb	L(128bytesless_normal)
+
+
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx
+	jae	L(128bytesormore_normal)
+
+L(128bytesless_normal):
+	add	$128, %ecx
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	ALIGN (4)
+L(128bytes_L2_normal):
+	prefetcht0	0x380(%edx)
+	prefetcht0	0x3c0(%edx)
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movaps	%xmm0, 0x10(%edx)
+	movaps	%xmm0, 0x20(%edx)
+	movaps	%xmm0, 0x30(%edx)
+	movaps	%xmm0, 0x40(%edx)
+	movaps	%xmm0, 0x50(%edx)
+	movaps	%xmm0, 0x60(%edx)
+	movaps	%xmm0, 0x70(%edx)
+	add	$128, %edx
+	cmp	$128, %ecx
+	jae	L(128bytes_L2_normal)
+
+L(128bytesless_L2_normal):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	RESTORE_EBX_STATE
+L(128bytesormore_nt_start):
+	sub	%ebx, %ecx
+	ALIGN (4)
+L(128bytesormore_shared_cache_loop):
+	prefetcht0	0x3c0(%edx)
+	prefetcht0	0x380(%edx)
+	sub	$0x80, %ebx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	add	$0x80, %edx
+	cmp	$0x80, %ebx
+	jae	L(128bytesormore_shared_cache_loop)
+	cmp	$0x80, %ecx
+	jb	L(shared_cache_loop_end)
+	ALIGN (4)
+L(128bytesormore_nt):
+	sub	$0x80, %ecx
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm0, 0x10(%edx)
+	movntdq	%xmm0, 0x20(%edx)
+	movntdq	%xmm0, 0x30(%edx)
+	movntdq	%xmm0, 0x40(%edx)
+	movntdq	%xmm0, 0x50(%edx)
+	movntdq	%xmm0, 0x60(%edx)
+	movntdq	%xmm0, 0x70(%edx)
+	add	$0x80, %edx
+	cmp	$0x80, %ecx
+	jae	L(128bytesormore_nt)
+	sfence
+L(shared_cache_loop_end):
+#if defined DATA_CACHE_SIZE || !defined SHARED
+	POP (%ebx)
+#endif
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_16_128bytes):
+	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
+	.popsection
+
+	ALIGN (4)
+L(aligned_16_112bytes):
+	movdqa	%xmm0, -112(%edx)
+L(aligned_16_96bytes):
+	movdqa	%xmm0, -96(%edx)
+L(aligned_16_80bytes):
+	movdqa	%xmm0, -80(%edx)
+L(aligned_16_64bytes):
+	movdqa	%xmm0, -64(%edx)
+L(aligned_16_48bytes):
+	movdqa	%xmm0, -48(%edx)
+L(aligned_16_32bytes):
+	movdqa	%xmm0, -32(%edx)
+L(aligned_16_16bytes):
+	movdqa	%xmm0, -16(%edx)
+L(aligned_16_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_113bytes):
+	movdqa	%xmm0, -113(%edx)
+L(aligned_16_97bytes):
+	movdqa	%xmm0, -97(%edx)
+L(aligned_16_81bytes):
+	movdqa	%xmm0, -81(%edx)
+L(aligned_16_65bytes):
+	movdqa	%xmm0, -65(%edx)
+L(aligned_16_49bytes):
+	movdqa	%xmm0, -49(%edx)
+L(aligned_16_33bytes):
+	movdqa	%xmm0, -33(%edx)
+L(aligned_16_17bytes):
+	movdqa	%xmm0, -17(%edx)
+L(aligned_16_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_114bytes):
+	movdqa	%xmm0, -114(%edx)
+L(aligned_16_98bytes):
+	movdqa	%xmm0, -98(%edx)
+L(aligned_16_82bytes):
+	movdqa	%xmm0, -82(%edx)
+L(aligned_16_66bytes):
+	movdqa	%xmm0, -66(%edx)
+L(aligned_16_50bytes):
+	movdqa	%xmm0, -50(%edx)
+L(aligned_16_34bytes):
+	movdqa	%xmm0, -34(%edx)
+L(aligned_16_18bytes):
+	movdqa	%xmm0, -18(%edx)
+L(aligned_16_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_115bytes):
+	movdqa	%xmm0, -115(%edx)
+L(aligned_16_99bytes):
+	movdqa	%xmm0, -99(%edx)
+L(aligned_16_83bytes):
+	movdqa	%xmm0, -83(%edx)
+L(aligned_16_67bytes):
+	movdqa	%xmm0, -67(%edx)
+L(aligned_16_51bytes):
+	movdqa	%xmm0, -51(%edx)
+L(aligned_16_35bytes):
+	movdqa	%xmm0, -35(%edx)
+L(aligned_16_19bytes):
+	movdqa	%xmm0, -19(%edx)
+L(aligned_16_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_116bytes):
+	movdqa	%xmm0, -116(%edx)
+L(aligned_16_100bytes):
+	movdqa	%xmm0, -100(%edx)
+L(aligned_16_84bytes):
+	movdqa	%xmm0, -84(%edx)
+L(aligned_16_68bytes):
+	movdqa	%xmm0, -68(%edx)
+L(aligned_16_52bytes):
+	movdqa	%xmm0, -52(%edx)
+L(aligned_16_36bytes):
+	movdqa	%xmm0, -36(%edx)
+L(aligned_16_20bytes):
+	movdqa	%xmm0, -20(%edx)
+L(aligned_16_4bytes):
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_117bytes):
+	movdqa	%xmm0, -117(%edx)
+L(aligned_16_101bytes):
+	movdqa	%xmm0, -101(%edx)
+L(aligned_16_85bytes):
+	movdqa	%xmm0, -85(%edx)
+L(aligned_16_69bytes):
+	movdqa	%xmm0, -69(%edx)
+L(aligned_16_53bytes):
+	movdqa	%xmm0, -53(%edx)
+L(aligned_16_37bytes):
+	movdqa	%xmm0, -37(%edx)
+L(aligned_16_21bytes):
+	movdqa	%xmm0, -21(%edx)
+L(aligned_16_5bytes):
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_118bytes):
+	movdqa	%xmm0, -118(%edx)
+L(aligned_16_102bytes):
+	movdqa	%xmm0, -102(%edx)
+L(aligned_16_86bytes):
+	movdqa	%xmm0, -86(%edx)
+L(aligned_16_70bytes):
+	movdqa	%xmm0, -70(%edx)
+L(aligned_16_54bytes):
+	movdqa	%xmm0, -54(%edx)
+L(aligned_16_38bytes):
+	movdqa	%xmm0, -38(%edx)
+L(aligned_16_22bytes):
+	movdqa	%xmm0, -22(%edx)
+L(aligned_16_6bytes):
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_119bytes):
+	movdqa	%xmm0, -119(%edx)
+L(aligned_16_103bytes):
+	movdqa	%xmm0, -103(%edx)
+L(aligned_16_87bytes):
+	movdqa	%xmm0, -87(%edx)
+L(aligned_16_71bytes):
+	movdqa	%xmm0, -71(%edx)
+L(aligned_16_55bytes):
+	movdqa	%xmm0, -55(%edx)
+L(aligned_16_39bytes):
+	movdqa	%xmm0, -39(%edx)
+L(aligned_16_23bytes):
+	movdqa	%xmm0, -23(%edx)
+L(aligned_16_7bytes):
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_120bytes):
+	movdqa	%xmm0, -120(%edx)
+L(aligned_16_104bytes):
+	movdqa	%xmm0, -104(%edx)
+L(aligned_16_88bytes):
+	movdqa	%xmm0, -88(%edx)
+L(aligned_16_72bytes):
+	movdqa	%xmm0, -72(%edx)
+L(aligned_16_56bytes):
+	movdqa	%xmm0, -56(%edx)
+L(aligned_16_40bytes):
+	movdqa	%xmm0, -40(%edx)
+L(aligned_16_24bytes):
+	movdqa	%xmm0, -24(%edx)
+L(aligned_16_8bytes):
+	movq	%xmm0, -8(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_121bytes):
+	movdqa	%xmm0, -121(%edx)
+L(aligned_16_105bytes):
+	movdqa	%xmm0, -105(%edx)
+L(aligned_16_89bytes):
+	movdqa	%xmm0, -89(%edx)
+L(aligned_16_73bytes):
+	movdqa	%xmm0, -73(%edx)
+L(aligned_16_57bytes):
+	movdqa	%xmm0, -57(%edx)
+L(aligned_16_41bytes):
+	movdqa	%xmm0, -41(%edx)
+L(aligned_16_25bytes):
+	movdqa	%xmm0, -25(%edx)
+L(aligned_16_9bytes):
+	movq	%xmm0, -9(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_122bytes):
+	movdqa	%xmm0, -122(%edx)
+L(aligned_16_106bytes):
+	movdqa	%xmm0, -106(%edx)
+L(aligned_16_90bytes):
+	movdqa	%xmm0, -90(%edx)
+L(aligned_16_74bytes):
+	movdqa	%xmm0, -74(%edx)
+L(aligned_16_58bytes):
+	movdqa	%xmm0, -58(%edx)
+L(aligned_16_42bytes):
+	movdqa	%xmm0, -42(%edx)
+L(aligned_16_26bytes):
+	movdqa	%xmm0, -26(%edx)
+L(aligned_16_10bytes):
+	movq	%xmm0, -10(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_123bytes):
+	movdqa	%xmm0, -123(%edx)
+L(aligned_16_107bytes):
+	movdqa	%xmm0, -107(%edx)
+L(aligned_16_91bytes):
+	movdqa	%xmm0, -91(%edx)
+L(aligned_16_75bytes):
+	movdqa	%xmm0, -75(%edx)
+L(aligned_16_59bytes):
+	movdqa	%xmm0, -59(%edx)
+L(aligned_16_43bytes):
+	movdqa	%xmm0, -43(%edx)
+L(aligned_16_27bytes):
+	movdqa	%xmm0, -27(%edx)
+L(aligned_16_11bytes):
+	movq	%xmm0, -11(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_124bytes):
+	movdqa	%xmm0, -124(%edx)
+L(aligned_16_108bytes):
+	movdqa	%xmm0, -108(%edx)
+L(aligned_16_92bytes):
+	movdqa	%xmm0, -92(%edx)
+L(aligned_16_76bytes):
+	movdqa	%xmm0, -76(%edx)
+L(aligned_16_60bytes):
+	movdqa	%xmm0, -60(%edx)
+L(aligned_16_44bytes):
+	movdqa	%xmm0, -44(%edx)
+L(aligned_16_28bytes):
+	movdqa	%xmm0, -28(%edx)
+L(aligned_16_12bytes):
+	movq	%xmm0, -12(%edx)
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_125bytes):
+	movdqa	%xmm0, -125(%edx)
+L(aligned_16_109bytes):
+	movdqa	%xmm0, -109(%edx)
+L(aligned_16_93bytes):
+	movdqa	%xmm0, -93(%edx)
+L(aligned_16_77bytes):
+	movdqa	%xmm0, -77(%edx)
+L(aligned_16_61bytes):
+	movdqa	%xmm0, -61(%edx)
+L(aligned_16_45bytes):
+	movdqa	%xmm0, -45(%edx)
+L(aligned_16_29bytes):
+	movdqa	%xmm0, -29(%edx)
+L(aligned_16_13bytes):
+	movq	%xmm0, -13(%edx)
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_126bytes):
+	movdqa	%xmm0, -126(%edx)
+L(aligned_16_110bytes):
+	movdqa	%xmm0, -110(%edx)
+L(aligned_16_94bytes):
+	movdqa	%xmm0, -94(%edx)
+L(aligned_16_78bytes):
+	movdqa	%xmm0, -78(%edx)
+L(aligned_16_62bytes):
+	movdqa	%xmm0, -62(%edx)
+L(aligned_16_46bytes):
+	movdqa	%xmm0, -46(%edx)
+L(aligned_16_30bytes):
+	movdqa	%xmm0, -30(%edx)
+L(aligned_16_14bytes):
+	movq	%xmm0, -14(%edx)
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_127bytes):
+	movdqa	%xmm0, -127(%edx)
+L(aligned_16_111bytes):
+	movdqa	%xmm0, -111(%edx)
+L(aligned_16_95bytes):
+	movdqa	%xmm0, -95(%edx)
+L(aligned_16_79bytes):
+	movdqa	%xmm0, -79(%edx)
+L(aligned_16_63bytes):
+	movdqa	%xmm0, -63(%edx)
+L(aligned_16_47bytes):
+	movdqa	%xmm0, -47(%edx)
+L(aligned_16_31bytes):
+	movdqa	%xmm0, -31(%edx)
+L(aligned_16_15bytes):
+	movq	%xmm0, -15(%edx)
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN_END
+
+END (__memset_sse2)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S
new file mode 100644
index 0000000000..f601663a9f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S
@@ -0,0 +1,75 @@
+/* Multiple versions of memset
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#if IS_IN (libc)
+	.text
+ENTRY(memset)
+	.type	memset, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memset_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memset_sse2)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memset_sse2_rep)
+2:	ret
+END(memset)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memset_ia32, @function; \
+	.globl __memset_ia32; \
+	.p2align 4; \
+	__memset_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memset_ia32, .-__memset_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+	.type __memset_chk_ia32, @function; \
+	.globl __memset_chk_ia32; \
+	.p2align 4; \
+	__memset_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __memset_chk_ia32, .-__memset_chk_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memset; __GI_memset = __memset_ia32
+# endif
+
+# undef strong_alias
+# define strong_alias(original, alias)
+#endif
+
+#include "../memset.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S
new file mode 100644
index 0000000000..573cf4208a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S
@@ -0,0 +1,82 @@
+/* Multiple versions of __memset_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__memset_chk)
+	.type	__memset_chk, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memset_chk_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memset_chk_sse2)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memset_chk_sse2_rep)
+2:	ret
+END(__memset_chk)
+
+# ifdef SHARED
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+	.section .gnu.warning.__memset_zero_constant_len_parameter
+	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
+# else
+	.text
+	.type __memset_chk_sse2, @function
+	.p2align 4;
+__memset_chk_sse2:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memset_sse2
+	cfi_endproc
+	.size __memset_chk_sse2, .-__memset_chk_sse2
+
+	.type __memset_chk_sse2_rep, @function
+	.p2align 4;
+__memset_chk_sse2_rep:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memset_sse2_rep
+	cfi_endproc
+	.size __memset_chk_sse2_rep, .-__memset_chk_sse2_rep
+
+	.type __memset_chk_ia32, @function
+	.p2align 4;
+__memset_chk_ia32:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	__chk_fail
+	jmp	__memset_ia32
+	cfi_endproc
+	.size __memset_chk_ia32, .-__memset_chk_ia32
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
new file mode 100644
index 0000000000..88c0e5776c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
@@ -0,0 +1,3 @@
+#define USE_AS_RAWMEMCHR
+#define MEMCHR __rawmemchr_sse2_bsf
+#include "memchr-sse2-bsf.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
new file mode 100644
index 0000000000..038c74896b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_RAWMEMCHR
+#define MEMCHR __rawmemchr_sse2
+#include "memchr-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S
new file mode 100644
index 0000000000..0a41d63ee8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S
@@ -0,0 +1,65 @@
+/* Multiple versions of rawmemchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__rawmemchr)
+	.type	__rawmemchr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	3f
+
+	LOAD_FUNC_GOT_EAX (__rawmemchr_sse2)
+	ret
+
+2:	LOAD_FUNC_GOT_EAX (__rawmemchr_ia32)
+	ret
+
+3:	LOAD_FUNC_GOT_EAX (__rawmemchr_sse2_bsf)
+	ret
+END(__rawmemchr)
+
+weak_alias(__rawmemchr, rawmemchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __rawmemchr_ia32, @function; \
+	.globl __rawmemchr_ia32; \
+	.p2align 4; \
+	__rawmemchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __rawmemchr_ia32, .-__rawmemchr_ia32
+
+# undef libc_hidden_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_def(name) \
+	.globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_ia32
+
+#endif
+#include "../../rawmemchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c b/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c
new file mode 100644
index 0000000000..1aa5440644
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c
@@ -0,0 +1 @@
+#include <string/strnlen.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c
new file mode 100644
index 0000000000..2e9619f97c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c
@@ -0,0 +1,27 @@
+/* FMA version of fma.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+double
+__fma_fma (double x, double y, double z)
+{
+  asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+  return x;
+}
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c
new file mode 100644
index 0000000000..411ebb2ba9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c
@@ -0,0 +1,34 @@
+/* Multiple versions of fma.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include <math.h>
+#include <init-arch.h>
+
+extern double __fma_ia32 (double x, double y, double z) attribute_hidden;
+extern double __fma_fma (double x, double y, double z) attribute_hidden;
+
+libm_ifunc (__fma,
+	    HAS_ARCH_FEATURE (FMA_Usable) ? __fma_fma : __fma_ia32);
+weak_alias (__fma, fma)
+
+#define __fma __fma_ia32
+
+#include <sysdeps/ieee754/ldbl-96/s_fma.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
new file mode 100644
index 0000000000..ee57abfda2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
@@ -0,0 +1,27 @@
+/* FMA version of fmaf.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+float
+__fmaf_fma (float x, float y, float z)
+{
+  asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+  return x;
+}
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c
new file mode 100644
index 0000000000..00b0fbcfc5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c
@@ -0,0 +1,34 @@
+/* Multiple versions of fmaf.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include <math.h>
+#include <init-arch.h>
+
+extern float __fmaf_ia32 (float x, float y, float z) attribute_hidden;
+extern float __fmaf_fma (float x, float y, float z) attribute_hidden;
+
+libm_ifunc (__fmaf,
+	    HAS_ARCH_FEATURE (FMA_Usable) ? __fmaf_fma : __fmaf_ia32);
+weak_alias (__fmaf, fmaf)
+
+#define __fmaf __fmaf_ia32
+
+#include <sysdeps/ieee754/dbl-64/s_fmaf.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c b/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c
new file mode 100644
index 0000000000..7db31b02f8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/sched_cpucount.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S
new file mode 100644
index 0000000000..46ca1b3074
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_sse2
+#include "strcpy-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
new file mode 100644
index 0000000000..d971c2da38
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S
new file mode 100644
index 0000000000..ee81ab6ae3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S
@@ -0,0 +1,9 @@
+/* Multiple versions of stpcpy
+   All versions must be listed in ifunc-impl-list.c.  */
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy
+#include "strcpy.S"
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S
new file mode 100644
index 0000000000..37a703cb76
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_sse2
+#include "strcpy-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
new file mode 100644
index 0000000000..14ed16f6b5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S
new file mode 100644
index 0000000000..2698ca6a8c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S
@@ -0,0 +1,8 @@
+/* Multiple versions of stpncpy
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCPY __stpncpy
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#include "strcpy.S"
+
+weak_alias (__stpncpy, stpncpy)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c
new file mode 100644
index 0000000000..753c6ec84a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c
@@ -0,0 +1,12 @@
+#include <string.h>
+
+extern __typeof (strcasecmp) __strcasecmp_nonascii;
+
+#define __strcasecmp __strcasecmp_nonascii
+#include <string/strcasecmp.c>
+
+strong_alias (__strcasecmp_nonascii, __strcasecmp_ia32)
+
+/* The needs of strcasecmp in libc are minimal, no need to go through
+   the IFUNC.  */
+strong_alias (__strcasecmp_nonascii, __GI___strcasecmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S
new file mode 100644
index 0000000000..ec59276408
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S
@@ -0,0 +1,39 @@
+/* Entry point for multi-version x86 strcasecmp.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+	.text
+ENTRY(__strcasecmp)
+	.type	__strcasecmp, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strcasecmp_ia32)
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strcasecmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
+	jnz	2f
+	LOAD_FUNC_GOT_EAX (__strcasecmp_sse4_2)
+2:	ret
+END(__strcasecmp)
+
+weak_alias (__strcasecmp, strcasecmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c
new file mode 100644
index 0000000000..d4fcd2b4a1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c
@@ -0,0 +1,13 @@
+#include <string.h>
+
+extern __typeof (strcasecmp_l) __strcasecmp_l_nonascii;
+
+#define __strcasecmp_l __strcasecmp_l_nonascii
+#define USE_IN_EXTENDED_LOCALE_MODEL    1
+#include <string/strcasecmp.c>
+
+strong_alias (__strcasecmp_l_nonascii, __strcasecmp_l_ia32)
+
+/* The needs of strcasecmp in libc are minimal, no need to go through
+   the IFUNC.  */
+strong_alias (__strcasecmp_l_nonascii, __GI___strcasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S
new file mode 100644
index 0000000000..411d4153f2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRCASECMP_L 1
+#include "strcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S
new file mode 100644
index 0000000000..a22b93c518
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRCASECMP_L 1
+#include "strcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S
new file mode 100644
index 0000000000..711c09b0dc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S
@@ -0,0 +1,7 @@
+/* Multiple versions of strcasecmp_l
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCMP __strcasecmp_l
+#define USE_AS_STRCASECMP_L
+#include "strcmp.S"
+
+weak_alias (__strcasecmp_l, strcasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S
new file mode 100644
index 0000000000..6359c7330c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S
@@ -0,0 +1,1245 @@
+/* strcat with SSE2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifdef SHARED
+#  define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into ECX and branch to it.  TABLE is a
+	jump table with relative offsets.  INDEX is a register contains the
+	index into the jump table.   SCALE is the scale of INDEX. */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	/* We first load PC into ECX.  */	\
+	SETUP_PIC_REG(cx);	\
+	/* Get the address of the jump table.  */	\
+	addl	$(TABLE - .), %ecx;	\
+	/* Get the entry and convert the relative offset to the	\
+	absolute address.  */	\
+	addl	(%ecx,INDEX,SCALE), %ecx;	\
+	/* We loaded the jump table and adjusted ECX. Go.  */	\
+	jmp	*%ecx
+# else
+#  define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+	absolute offsets.  INDEX is a register contains the index into the
+	jump table.  SCALE is the scale of INDEX. */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	jmp	*TABLE(,INDEX,SCALE)
+# endif
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_sse2
+# endif
+
+# define PARMS  4
+# define STR1  PARMS+4
+# define STR2  STR1+4
+
+# ifdef USE_AS_STRNCAT
+#  define LEN    STR2+8
+#  define STR3   STR1+4
+# else
+#  define STR3   STR1
+# endif
+
+# define USE_AS_STRCAT
+# ifdef USE_AS_STRNCAT
+#  define RETURN  POP(%ebx); POP(%esi); ret; CFI_PUSH(%ebx); CFI_PUSH(%esi);
+# else
+#  define RETURN  POP(%esi); ret; CFI_PUSH(%esi);
+# endif
+
+.text
+ENTRY (STRCAT)
+	PUSH	(%esi)
+	mov	STR1(%esp), %eax
+	mov	STR2(%esp), %esi
+# ifdef USE_AS_STRNCAT
+	PUSH	(%ebx)
+	movl	LEN(%esp), %ebx
+	test	%ebx, %ebx
+	jz	L(ExitZero)
+# endif
+	cmpb	$0, (%esi)
+	mov	%esi, %ecx
+	mov	%eax, %edx
+	jz	L(ExitZero)
+
+	and	$63, %ecx
+	and	$63, %edx
+	cmp	$32, %ecx
+	ja	L(StrlenCore7_1)
+	cmp	$48, %edx
+	ja	L(alignment_prolog)
+
+	pxor	%xmm0, %xmm0
+	pxor	%xmm4, %xmm4
+	pxor	%xmm7, %xmm7
+	movdqu	(%eax), %xmm1
+	movdqu	(%esi), %xmm5
+	pcmpeqb	%xmm1, %xmm0
+	movdqu	16(%esi), %xmm6
+	pmovmskb %xmm0, %ecx
+	pcmpeqb	%xmm5, %xmm4
+	pcmpeqb	%xmm6, %xmm7
+	test	%ecx, %ecx
+	jnz	L(exit_less16_)
+	mov	%eax, %ecx
+	and	$-16, %eax
+	jmp	L(loop_prolog)
+
+L(alignment_prolog):
+	pxor	%xmm0, %xmm0
+	pxor	%xmm4, %xmm4
+	mov	%edx, %ecx
+	pxor	%xmm7, %xmm7
+	and	$15, %ecx
+	and	$-16, %eax
+	pcmpeqb	(%eax), %xmm0
+	movdqu	(%esi), %xmm5
+	movdqu	16(%esi), %xmm6
+	pmovmskb %xmm0, %edx
+	pcmpeqb	%xmm5, %xmm4
+	shr	%cl, %edx
+	pcmpeqb	%xmm6, %xmm7
+	test	%edx, %edx
+	jnz	L(exit_less16)
+	add	%eax, %ecx
+
+	pxor	%xmm0, %xmm0
+L(loop_prolog):
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	.p2align 4
+L(align16_loop):
+	pcmpeqb	16(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	64(%eax), %eax
+	test	%edx, %edx
+	jz	L(align16_loop)
+	bsf	%edx, %edx
+	add	%edx, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit16):
+	bsf	%edx, %edx
+	lea	16(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit32):
+	bsf	%edx, %edx
+	lea	32(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit48):
+	bsf	%edx, %edx
+	lea	48(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_less16):
+	bsf	%edx, %edx
+	add	%ecx, %eax
+	add	%edx, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_less16_):
+	bsf	%ecx, %ecx
+	add	%ecx, %eax
+
+	.p2align 4
+L(StartStrcpyPart):
+	pmovmskb %xmm4, %edx
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1)
+
+	movdqu	%xmm5, (%eax)
+	pmovmskb %xmm7, %edx
+# ifdef USE_AS_STRNCAT
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes1)
+
+	mov	%esi, %ecx
+	and	$-16, %esi
+	and	$15, %ecx
+	pxor	%xmm0, %xmm0
+# ifdef USE_AS_STRNCAT
+	add	%ecx, %ebx
+	sbb	%edx, %edx
+	or	%edx, %ebx
+# endif
+	sub	%ecx, %eax
+	jmp	L(Unalign16Both)
+
+L(StrlenCore7_1):
+	mov	%eax, %ecx
+	pxor	%xmm0, %xmm0
+	and	$15, %ecx
+	and	$-16, %eax
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	shr	%cl, %edx
+	test	%edx, %edx
+	jnz	L(exit_less16_1)
+	add	%eax, %ecx
+
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+
+	.p2align 4
+L(align16_loop_1):
+	pcmpeqb	16(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16_1)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32_1)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48_1)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	64(%eax), %eax
+	test	%edx, %edx
+	jz	L(align16_loop_1)
+	bsf	%edx, %edx
+	add	%edx, %eax
+	jmp	L(StartStrcpyPart_1)
+
+	.p2align 4
+L(exit16_1):
+	bsf	%edx, %edx
+	lea	16(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart_1)
+
+	.p2align 4
+L(exit32_1):
+	bsf	%edx, %edx
+	lea	32(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart_1)
+
+	.p2align 4
+L(exit48_1):
+	bsf	%edx, %edx
+	lea	48(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart_1)
+
+	.p2align 4
+L(exit_less16_1):
+	bsf	%edx, %edx
+	add	%ecx, %eax
+	add	%edx, %eax
+
+	.p2align 4
+L(StartStrcpyPart_1):
+	mov	%esi, %ecx
+	and	$15, %ecx
+	and	$-16, %esi
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+
+# ifdef USE_AS_STRNCAT
+	cmp	$48, %ebx
+	ja      L(BigN)
+# endif
+	pcmpeqb	(%esi), %xmm1
+# ifdef USE_AS_STRNCAT
+	add	%ecx, %ebx
+# endif
+	pmovmskb %xmm1, %edx
+	shr	%cl, %edx
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail)
+
+	pcmpeqb	16(%esi), %xmm0
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STRNCAT
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes)
+
+	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
+	movdqu	%xmm1, (%eax)
+	sub	%ecx, %eax
+
+	.p2align 4
+L(Unalign16Both):
+	mov	$16, %ecx
+	movdqa	(%esi, %ecx), %xmm1
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%eax, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$48, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+L(Unalign16BothBigN):
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%eax, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%esi, %ecx), %xmm4
+	movdqu	%xmm3, (%eax, %ecx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%esi, %ecx), %xmm1
+	movdqu	%xmm4, (%eax, %ecx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%eax, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%eax, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movdqu	%xmm3, (%eax, %ecx)
+	mov	%esi, %edx
+	lea	16(%esi, %ecx), %esi
+	and	$-0x40, %esi
+	sub	%esi, %edx
+	sub	%edx, %eax
+# ifdef USE_AS_STRNCAT
+	lea	128(%ebx, %edx), %ebx
+# endif
+	movaps	(%esi), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%esi), %xmm5
+	movaps	32(%esi), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%esi), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+# ifdef USE_AS_STRNCAT
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(Unaligned64Leave)
+
+	.p2align 4
+L(Unaligned64Loop_start):
+	add	$64, %eax
+	add	$64, %esi
+	movdqu	%xmm4, -64(%eax)
+	movaps	(%esi), %xmm2
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%eax)
+	movaps	16(%esi), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%esi), %xmm3
+	movdqu	%xmm6, -32(%eax)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%eax)
+	movaps	48(%esi), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+# ifdef USE_AS_STRNCAT
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jz	L(Unaligned64Loop_start)
+
+L(Unaligned64Leave):
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	%xmm4, %xmm0
+	pcmpeqb	%xmm5, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_0)
+	test	%ecx, %ecx
+	jnz	L(CopyFrom1To16BytesUnaligned_16)
+
+	pcmpeqb	%xmm6, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_32)
+
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%eax)
+	movdqu	%xmm5, 16(%eax)
+	movdqu	%xmm6, 32(%eax)
+	add	$48, %esi
+	add	$48, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+# ifdef USE_AS_STRNCAT
+	.p2align 4
+L(BigN):
+	pcmpeqb	(%esi), %xmm1
+	pmovmskb %xmm1, %edx
+	shr	%cl, %edx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail)
+
+	pcmpeqb	16(%esi), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes)
+
+	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
+	movdqu	%xmm1, (%eax)
+	sub	%ecx, %eax
+	sub     $48, %ebx
+	add     %ecx, %ebx
+
+	mov	$16, %ecx
+	movdqa	(%esi, %ecx), %xmm1
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%eax, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+	jmp	L(Unalign16BothBigN)
+# endif
+
+/*------------end of main part-------------------------------*/
+
+/* Case1 */
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%ecx, %eax
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesTail):
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1):
+	add	$16, %esi
+	add	$16, %eax
+L(CopyFrom1To16BytesTail1):
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes):
+	bsf	%edx, %edx
+	add	%ecx, %esi
+	add	$16, %edx
+	sub	%ecx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%eax)
+	add	$16, %esi
+	add	$16, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+	bsf	%edx, %edx
+	movdqu	%xmm4, (%eax)
+	movdqu	%xmm5, 16(%eax)
+	add	$32, %esi
+	add	$32, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+# ifdef USE_AS_STRNCAT
+
+	.p2align 4
+L(CopyFrom1To16BytesExit):
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+/* Case2 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%ecx, %eax
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	add	$16, %edx
+	sub	%ecx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+/* Case2 or Case3,  Case3 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%ecx, %eax
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32BytesCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTailCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+	add	$16, %eax
+	add	$16, %esi
+	sub	$16, %ebx
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1Case2)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+# endif
+
+# ifdef USE_AS_STRNCAT
+	.p2align 4
+L(StrncatExit0):
+	movb	%bh, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+# endif
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit1):
+	movb	%bh, 1(%eax)
+# endif
+L(Exit1):
+# ifdef USE_AS_STRNCAT
+	movb	(%esi), %dh
+# endif
+	movb	%dh, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit2):
+	movb	%bh, 2(%eax)
+# endif
+L(Exit2):
+	movw	(%esi), %dx
+	movw	%dx, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit3):
+	movb	%bh, 3(%eax)
+# endif
+L(Exit3):
+	movw	(%esi), %cx
+	movw	%cx, (%eax)
+# ifdef USE_AS_STRNCAT
+	movb	2(%esi), %dh
+# endif
+	movb	%dh, 2(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit4):
+	movb	%bh, 4(%eax)
+# endif
+L(Exit4):
+	movl	(%esi), %edx
+	movl	%edx, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit5):
+	movb	%bh, 5(%eax)
+# endif
+L(Exit5):
+	movl	(%esi), %ecx
+# ifdef USE_AS_STRNCAT
+	movb	4(%esi), %dh
+# endif
+	movb	%dh, 4(%eax)
+	movl	%ecx, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit6):
+	movb	%bh, 6(%eax)
+# endif
+L(Exit6):
+	movl	(%esi), %ecx
+	movw	4(%esi), %dx
+	movl	%ecx, (%eax)
+	movw	%dx, 4(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit7):
+	movb	%bh, 7(%eax)
+# endif
+L(Exit7):
+	movl	(%esi), %ecx
+	movl	3(%esi), %edx
+	movl	%ecx, (%eax)
+	movl	%edx, 3(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit8):
+	movb	%bh, 8(%eax)
+# endif
+L(Exit8):
+	movlpd	(%esi), %xmm0
+	movlpd	%xmm0, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit9):
+	movb	%bh, 9(%eax)
+# endif
+L(Exit9):
+	movlpd	(%esi), %xmm0
+# ifdef USE_AS_STRNCAT
+	movb	8(%esi), %dh
+# endif
+	movb	%dh, 8(%eax)
+	movlpd	%xmm0, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit10):
+	movb	%bh, 10(%eax)
+# endif
+L(Exit10):
+	movlpd	(%esi), %xmm0
+	movw	8(%esi), %dx
+	movlpd	%xmm0, (%eax)
+	movw	%dx, 8(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit11):
+	movb	%bh, 11(%eax)
+# endif
+L(Exit11):
+	movlpd	(%esi), %xmm0
+	movl	7(%esi), %edx
+	movlpd	%xmm0, (%eax)
+	movl	%edx, 7(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit12):
+	movb	%bh, 12(%eax)
+# endif
+L(Exit12):
+	movlpd	(%esi), %xmm0
+	movl	8(%esi), %edx
+	movlpd	%xmm0, (%eax)
+	movl	%edx, 8(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit13):
+	movb	%bh, 13(%eax)
+# endif
+L(Exit13):
+	movlpd	(%esi), %xmm0
+	movlpd	5(%esi), %xmm1
+	movlpd	%xmm0, (%eax)
+	movlpd	%xmm1, 5(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit14):
+	movb	%bh, 14(%eax)
+# endif
+L(Exit14):
+	movlpd	(%esi), %xmm0
+	movlpd	6(%esi), %xmm1
+	movlpd	%xmm0, (%eax)
+	movlpd	%xmm1, 6(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit15):
+	movb	%bh, 15(%eax)
+# endif
+L(Exit15):
+	movlpd	(%esi), %xmm0
+	movlpd	7(%esi), %xmm1
+	movlpd	%xmm0, (%eax)
+	movlpd	%xmm1, 7(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit16):
+	movb	%bh, 16(%eax)
+# endif
+L(Exit16):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit17):
+	movb	%bh, 17(%eax)
+# endif
+L(Exit17):
+	movdqu	(%esi), %xmm0
+# ifdef USE_AS_STRNCAT
+	movb	16(%esi), %dh
+# endif
+	movdqu	%xmm0, (%eax)
+	movb	%dh, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit18):
+	movb	%bh, 18(%eax)
+# endif
+L(Exit18):
+	movdqu	(%esi), %xmm0
+	movw	16(%esi), %cx
+	movdqu	%xmm0, (%eax)
+	movw	%cx, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit19):
+	movb	%bh, 19(%eax)
+# endif
+L(Exit19):
+	movdqu	(%esi), %xmm0
+	movl	15(%esi), %ecx
+	movdqu	%xmm0, (%eax)
+	movl	%ecx, 15(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit20):
+	movb	%bh, 20(%eax)
+# endif
+L(Exit20):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%eax)
+	movl	%ecx, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit21):
+	movb	%bh, 21(%eax)
+# endif
+L(Exit21):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+# ifdef USE_AS_STRNCAT
+	movb	20(%esi), %dh
+# endif
+	movdqu	%xmm0, (%eax)
+	movl	%ecx, 16(%eax)
+	movb	%dh, 20(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit22):
+	movb	%bh, 22(%eax)
+# endif
+L(Exit22):
+	movdqu	(%esi), %xmm0
+	movlpd	14(%esi), %xmm3
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm3, 14(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit23):
+	movb	%bh, 23(%eax)
+# endif
+L(Exit23):
+	movdqu	(%esi), %xmm0
+	movlpd	15(%esi), %xmm3
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm3, 15(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit24):
+	movb	%bh, 24(%eax)
+# endif
+L(Exit24):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit25):
+	movb	%bh, 25(%eax)
+# endif
+L(Exit25):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+# ifdef USE_AS_STRNCAT
+	movb	24(%esi), %dh
+# endif
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	movb	%dh, 24(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit26):
+	movb	%bh, 26(%eax)
+# endif
+L(Exit26):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movw	24(%esi), %cx
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	movw	%cx, 24(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit27):
+	movb	%bh, 27(%eax)
+# endif
+L(Exit27):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	23(%esi), %ecx
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	movl	%ecx, 23(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit28):
+	movb	%bh, 28(%eax)
+# endif
+L(Exit28):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	24(%esi), %ecx
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	movl	%ecx, 24(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit29):
+	movb	%bh, 29(%eax)
+# endif
+L(Exit29):
+	movdqu	(%esi), %xmm0
+	movdqu	13(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movdqu	%xmm2, 13(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit30):
+	movb	%bh, 30(%eax)
+# endif
+L(Exit30):
+	movdqu	(%esi), %xmm0
+	movdqu	14(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movdqu	%xmm2, 14(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit31):
+	movb	%bh, 31(%eax)
+# endif
+L(Exit31):
+	movdqu	(%esi), %xmm0
+	movdqu	15(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movdqu	%xmm2, 15(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit32):
+	movb	%bh, 32(%eax)
+# endif
+L(Exit32):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movdqu	%xmm2, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+# ifdef USE_AS_STRNCAT
+
+	.p2align 4
+L(UnalignedLeaveCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+	lea	64(%ebx), %ecx
+	and	$-16, %ecx
+	add	$48, %ebx
+	jl	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm4, (%eax)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm5, 16(%eax)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm6, 32(%eax)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm7, 48(%eax)
+	xor	%bh, %bh
+	movb	%bh, 64(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+L(Unaligned64LeaveCase2):
+	xor	%ecx, %ecx
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm4, (%eax)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm5, 16(%eax)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm6, 32(%eax)
+	lea	16(%eax, %ecx), %eax
+	lea	16(%esi, %ecx), %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+# endif
+	.p2align 4
+L(ExitZero):
+	RETURN
+
+END (STRCAT)
+
+	.p2align 4
+	.section .rodata
+L(ExitTable):
+	.int	JMPTBL(L(Exit1), L(ExitTable))
+	.int	JMPTBL(L(Exit2), L(ExitTable))
+	.int	JMPTBL(L(Exit3), L(ExitTable))
+	.int	JMPTBL(L(Exit4), L(ExitTable))
+	.int	JMPTBL(L(Exit5), L(ExitTable))
+	.int	JMPTBL(L(Exit6), L(ExitTable))
+	.int	JMPTBL(L(Exit7), L(ExitTable))
+	.int	JMPTBL(L(Exit8), L(ExitTable))
+	.int	JMPTBL(L(Exit9), L(ExitTable))
+	.int	JMPTBL(L(Exit10), L(ExitTable))
+	.int	JMPTBL(L(Exit11), L(ExitTable))
+	.int	JMPTBL(L(Exit12), L(ExitTable))
+	.int	JMPTBL(L(Exit13), L(ExitTable))
+	.int	JMPTBL(L(Exit14), L(ExitTable))
+	.int	JMPTBL(L(Exit15), L(ExitTable))
+	.int	JMPTBL(L(Exit16), L(ExitTable))
+	.int	JMPTBL(L(Exit17), L(ExitTable))
+	.int	JMPTBL(L(Exit18), L(ExitTable))
+	.int	JMPTBL(L(Exit19), L(ExitTable))
+	.int	JMPTBL(L(Exit20), L(ExitTable))
+	.int	JMPTBL(L(Exit21), L(ExitTable))
+	.int	JMPTBL(L(Exit22), L(ExitTable))
+	.int	JMPTBL(L(Exit23), L(ExitTable))
+	.int	JMPTBL(L(Exit24), L(ExitTable))
+	.int	JMPTBL(L(Exit25), L(ExitTable))
+	.int	JMPTBL(L(Exit26), L(ExitTable))
+	.int	JMPTBL(L(Exit27), L(ExitTable))
+	.int	JMPTBL(L(Exit28), L(ExitTable))
+	.int	JMPTBL(L(Exit29), L(ExitTable))
+	.int	JMPTBL(L(Exit30), L(ExitTable))
+	.int	JMPTBL(L(Exit31), L(ExitTable))
+	.int	JMPTBL(L(Exit32), L(ExitTable))
+# ifdef USE_AS_STRNCAT
+L(ExitStrncatTable):
+	.int	JMPTBL(L(StrncatExit0), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit1), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit2), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit3), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit4), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit5), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit6), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit7), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit8), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit9), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit10), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit11), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit12), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit13), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit14), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit15), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit16), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit17), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit18), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit19), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit20), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit21), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit22), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit23), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit24), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit25), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit26), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit27), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit28), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit29), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit30), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit31), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit32), L(ExitStrncatTable))
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S
new file mode 100644
index 0000000000..59ffbc60a5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S
@@ -0,0 +1,572 @@
+/* strcat with SSSE3
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY	or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_ssse3
+# endif
+
+# define PARMS  4
+# define STR1  PARMS+4
+# define STR2  STR1+4
+
+# ifdef USE_AS_STRNCAT
+#  define LEN STR2+8
+# endif
+
+# define USE_AS_STRCAT
+
+.text
+ENTRY (STRCAT)
+	PUSH	(%edi)
+	mov	STR1(%esp), %edi
+	mov	%edi, %edx
+
+# define RETURN  jmp L(StartStrcpyPart)
+# include "strlen-sse2.S"
+
+L(StartStrcpyPart):
+	mov	STR2(%esp), %ecx
+	lea	(%edi, %eax), %edx
+# ifdef USE_AS_STRNCAT
+	PUSH	(%ebx)
+	mov	LEN(%esp), %ebx
+	test	%ebx, %ebx
+	jz	L(StrncatExit0)
+	cmp	$8, %ebx
+	jbe	L(StrncatExit8Bytes)
+# endif
+	cmpb	$0, (%ecx)
+	jz	L(Exit1)
+	cmpb	$0, 1(%ecx)
+	jz	L(Exit2)
+	cmpb	$0, 2(%ecx)
+	jz	L(Exit3)
+	cmpb	$0, 3(%ecx)
+	jz	L(Exit4)
+	cmpb	$0, 4(%ecx)
+	jz	L(Exit5)
+	cmpb	$0, 5(%ecx)
+	jz	L(Exit6)
+	cmpb	$0, 6(%ecx)
+	jz	L(Exit7)
+	cmpb	$0, 7(%ecx)
+	jz	L(Exit8)
+	cmpb	$0, 8(%ecx)
+	jz	L(Exit9)
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	jb	L(StrncatExit15Bytes)
+# endif
+	cmpb	$0, 9(%ecx)
+	jz	L(Exit10)
+	cmpb	$0, 10(%ecx)
+	jz	L(Exit11)
+	cmpb	$0, 11(%ecx)
+	jz	L(Exit12)
+	cmpb	$0, 12(%ecx)
+	jz	L(Exit13)
+	cmpb	$0, 13(%ecx)
+	jz	L(Exit14)
+	cmpb	$0, 14(%ecx)
+	jz	L(Exit15)
+	cmpb	$0, 15(%ecx)
+	jz	L(Exit16)
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	je	L(StrncatExit16)
+
+#  define RETURN1	\
+	POP	(%ebx);	\
+	POP	(%edi);	\
+	ret;	\
+	CFI_PUSH	(%ebx);	\
+	CFI_PUSH	(%edi)
+#  define USE_AS_STRNCPY
+# else
+#  define RETURN1  POP (%edi); ret; CFI_PUSH (%edi)
+# endif
+# include "strcpy-ssse3.S"
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	movlpd	(%ecx), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit1):
+	movb	%bh, 1(%edx)
+L(Exit1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit2):
+	movb	%bh, 2(%edx)
+L(Exit2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit3):
+	movb	%bh, 3(%edx)
+L(Exit3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit4):
+	movb	%bh, 4(%edx)
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit5):
+	movb	%bh, 5(%edx)
+L(Exit5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit6):
+	movb	%bh, 6(%edx)
+L(Exit6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit7):
+	movb	%bh, 7(%edx)
+L(Exit7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit8):
+	movb	%bh, 8(%edx)
+L(Exit8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit9):
+	movb	%bh, 9(%edx)
+L(Exit9):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit10):
+	movb	%bh, 10(%edx)
+L(Exit10):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit11):
+	movb	%bh, 11(%edx)
+L(Exit11):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit12):
+	movb	%bh, 12(%edx)
+L(Exit12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit13):
+	movb	%bh, 13(%edx)
+L(Exit13):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit14):
+	movb	%bh, 14(%edx)
+L(Exit14):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit15):
+	movb	%bh, 15(%edx)
+L(Exit15):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit16):
+	movb	%bh, 16(%edx)
+L(Exit16):
+	movlpd	(%ecx), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+# ifdef USE_AS_STRNCPY
+
+	CFI_PUSH(%esi)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%esi, %ecx
+	lea	(%esi, %edx), %esi
+	lea	-9(%ebx), %edx
+	and	$1<<7, %dh
+	or	%al, %dh
+	test	%dh, %dh
+	lea	(%esi), %edx
+	POP	(%esi)
+	jz	L(ExitHighCase2)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(StrncatExit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(StrncatExit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(StrncatExit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(StrncatExit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(StrncatExit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(StrncatExit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(StrncatExit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	lea	7(%edx), %eax
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+	xor	%cl, %cl
+	movb	%cl, (%eax)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHighCase2):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	cmp	$9, %ebx
+	je	L(StrncatExit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(StrncatExit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(StrncatExit11)
+	test	$0x8, %ah
+	jnz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(StrncatExit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(StrncatExit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(StrncatExit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	cmp	$15, %ebx
+	je	L(StrncatExit15)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm1, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	CFI_PUSH(%esi)
+
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+
+	cmp	$8, %ebx
+	ja	L(ExitHighCase3)
+	cmp	$1, %ebx
+	je	L(StrncatExit1)
+	cmp	$2, %ebx
+	je	L(StrncatExit2)
+	cmp	$3, %ebx
+	je	L(StrncatExit3)
+	cmp	$4, %ebx
+	je	L(StrncatExit4)
+	cmp	$5, %ebx
+	je	L(StrncatExit5)
+	cmp	$6, %ebx
+	je	L(StrncatExit6)
+	cmp	$7, %ebx
+	je	L(StrncatExit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movb	%bh, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHighCase3):
+	cmp	$9, %ebx
+	je	L(StrncatExit9)
+	cmp	$10, %ebx
+	je	L(StrncatExit10)
+	cmp	$11, %ebx
+	je	L(StrncatExit11)
+	cmp	$12, %ebx
+	je	L(StrncatExit12)
+	cmp	$13, %ebx
+	je	L(StrncatExit13)
+	cmp	$14, %ebx
+	je	L(StrncatExit14)
+	cmp	$15, %ebx
+	je	L(StrncatExit15)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm1, 8(%edx)
+	movb	%bh, 16(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit0):
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit15Bytes):
+	cmp	$9, %ebx
+	je	L(StrncatExit9)
+	cmpb	$0, 9(%ecx)
+	jz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(StrncatExit10)
+	cmpb	$0, 10(%ecx)
+	jz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(StrncatExit11)
+	cmpb	$0, 11(%ecx)
+	jz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(StrncatExit12)
+	cmpb	$0, 12(%ecx)
+	jz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(StrncatExit13)
+	cmpb	$0, 13(%ecx)
+	jz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(StrncatExit14)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+	lea	14(%edx), %eax
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+	movb	%bh, (%eax)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit8Bytes):
+	cmpb	$0, (%ecx)
+	jz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(StrncatExit1)
+	cmpb	$0, 1(%ecx)
+	jz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(StrncatExit2)
+	cmpb	$0, 2(%ecx)
+	jz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(StrncatExit3)
+	cmpb	$0, 3(%ecx)
+	jz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(StrncatExit4)
+	cmpb	$0, 4(%ecx)
+	jz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(StrncatExit5)
+	cmpb	$0, 5(%ecx)
+	jz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(StrncatExit6)
+	cmpb	$0, 6(%ecx)
+	jz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(StrncatExit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	lea	7(%edx), %eax
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+	movb	%bh, (%eax)
+	movl	%edi, %eax
+	RETURN1
+
+# endif
+END (STRCAT)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S
new file mode 100644
index 0000000000..8412cb6f23
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S
@@ -0,0 +1,92 @@
+/* Multiple versions of strcat
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef USE_AS_STRNCAT
+# ifndef STRCAT
+#  define STRCAT strcat
+# endif
+#endif
+
+#ifdef USE_AS_STRNCAT
+# define STRCAT_SSSE3	__strncat_ssse3
+# define STRCAT_SSE2		__strncat_sse2
+# define STRCAT_IA32		__strncat_ia32
+# define __GI_STRCAT		__GI_strncat
+#else
+# define STRCAT_SSSE3	__strcat_ssse3
+# define STRCAT_SSE2		__strcat_sse2
+# define STRCAT_IA32		__strcat_ia32
+# define __GI_STRCAT		__GI_strcat
+#endif
+
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncat in static library since we
+   need strncat before the initialization happened.  */
+#if IS_IN (libc)
+
+	.text
+ENTRY(STRCAT)
+	.type	STRCAT, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (STRCAT_IA32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (STRCAT_SSE2)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (STRCAT_SSSE3)
+2:	ret
+END(STRCAT)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type STRCAT_IA32, @function; \
+	.align 16; \
+	.globl STRCAT_IA32; \
+	.hidden STRCAT_IA32; \
+	STRCAT_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCAT_IA32, .-STRCAT_IA32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcat calls through a PLT.
+   The speedup we get from using SSSE3 instruction is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCAT; __GI_STRCAT = STRCAT_IA32
+#  undef libc_hidden_def
+#  define libc_hidden_def(name) \
+	.globl __GI___STRCAT; __GI___STRCAT = STRCAT_IA32
+
+# endif
+#endif
+
+#ifndef USE_AS_STRNCAT
+# include "../../strcat.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
new file mode 100644
index 0000000000..95fd7c084e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
@@ -0,0 +1,158 @@
+/* strchr with SSE2 with bsf
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  8
+# define ENTRANCE PUSH(%edi)
+# define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
+
+# define STR1  PARMS
+# define STR2  STR1+4
+
+	.text
+ENTRY (__strchr_sse2_bsf)
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	pxor	%xmm2, %xmm2
+	mov	%ecx, %edi
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+	/* ECX has OFFSET. */
+	and	$15, %ecx
+	pshufd	$0, %xmm1, %xmm1
+	je	L(loop)
+
+/* Handle unaligned string.  */
+	and	$-16, %edi
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes.  */
+	sarl	%cl, %edx
+	sarl	%cl, %eax
+	test	%eax, %eax
+	je	L(unaligned_no_match)
+	/* Check which byte is a match.  */
+	bsf	%eax, %eax
+	/* Is there a NULL? */
+	test	%edx, %edx
+	je	L(unaligned_match)
+	bsf	%edx, %edx
+	cmpl	%edx, %eax
+	/* Return NULL if NULL comes first.  */
+	ja	L(return_null)
+L(unaligned_match):
+	add	%edi, %eax
+	add	%ecx, %eax
+	RETURN
+
+	.p2align 4
+L(unaligned_no_match):
+	test	%edx, %edx
+	jne	L(return_null)
+	pxor	%xmm2, %xmm2
+
+	add	$16, %edi
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	jmp	L(loop)
+
+L(matches):
+	pmovmskb %xmm2, %edx
+	test	%eax, %eax
+	jz	L(return_null)
+	bsf	%eax, %eax
+	/* There is a match.  First find where NULL is.  */
+	test	%edx, %edx
+	je	L(match)
+	bsf	%edx, %ecx
+	/* Check if NULL comes first.  */
+	cmpl	%ecx, %eax
+	ja	L(return_null)
+L(match):
+	sub	$16, %edi
+	add	%edi, %eax
+	RETURN
+
+/* Return NULL.  */
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+END (__strchr_sse2_bsf)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S
new file mode 100644
index 0000000000..1f9e875b04
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S
@@ -0,0 +1,348 @@
+/* strchr SSE2 without bsf
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  8
+# define ENTRANCE PUSH(%edi)
+# define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
+
+# define STR1  PARMS
+# define STR2  STR1+4
+
+	atom_text_section
+ENTRY (__strchr_sse2)
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	pxor	%xmm2, %xmm2
+	mov	%ecx, %edi
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+	/* ECX has OFFSET. */
+	and	$15, %ecx
+	pshufd	$0, %xmm1, %xmm1
+	je	L(loop)
+
+/* Handle unaligned string.  */
+	and	$-16, %edi
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes.  */
+	sarl	%cl, %edx
+	sarl	%cl, %eax
+	test	%eax, %eax
+	jz	L(unaligned_no_match)
+	/* Check which byte is a match.  */
+	/* Is there a NULL? */
+	add	%ecx, %edi
+	test	%edx, %edx
+	jz	L(match_case1)
+	jmp	L(match_case2)
+
+	.p2align 4
+L(unaligned_no_match):
+	test	%edx, %edx
+	jne	L(return_null)
+
+	pxor	%xmm2, %xmm2
+	add	$16, %edi
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi
+	jmp	L(loop)
+
+L(matches):
+	/* There is a match.  First find where NULL is.  */
+	test	%edx, %edx
+	jz	L(match_case1)
+
+	.p2align 4
+L(match_case2):
+	test	%al, %al
+	jz	L(match_higth_case2)
+
+	mov	%al, %cl
+	and	$15, %cl
+	jnz	L(match_case2_4)
+
+	mov	%dl, %ch
+	and	$15, %ch
+	jnz	L(return_null)
+
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x10, %dl
+	jnz	L(return_null)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x20, %dl
+	jnz	L(return_null)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	test	$0x40, %dl
+	jnz	L(return_null)
+	lea	7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_4):
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x01, %dl
+	jnz	L(return_null)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x02, %dl
+	jnz	L(return_null)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x04, %dl
+	jnz	L(return_null)
+	lea	3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_higth_case2):
+	test	%dl, %dl
+	jnz	L(return_null)
+
+	mov	%ah, %cl
+	and	$15, %cl
+	jnz	L(match_case2_12)
+
+	mov	%dh, %ch
+	and	$15, %ch
+	jnz	L(return_null)
+
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x10, %dh
+	jnz	L(return_null)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x20, %dh
+	jnz	L(return_null)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	test	$0x40, %dh
+	jnz	L(return_null)
+	lea	15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_12):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x01, %dh
+	jnz	L(return_null)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x02, %dh
+	jnz	L(return_null)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x04, %dh
+	jnz	L(return_null)
+	lea	11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case1):
+	test	%al, %al
+	jz	L(match_higth_case1)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	lea	7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_higth_case1):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	lea	15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit1):
+	lea	(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	lea	1(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	lea	2(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	lea	3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit5):
+	lea	4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	lea	5(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	lea	6(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit9):
+	lea	8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	lea	9(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	lea	10(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	lea	11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit13):
+	lea	12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	lea	13(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	lea	14(%edi), %eax
+	RETURN
+
+/* Return NULL.  */
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+END (__strchr_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S
new file mode 100644
index 0000000000..5b97b1c767
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S
@@ -0,0 +1,57 @@
+/* Multiple versions of strchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(strchr)
+	.type	strchr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strchr_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strchr_sse2_bsf)
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strchr_sse2)
+2:	ret
+END(strchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strchr_ia32, @function; \
+	.globl __strchr_ia32; \
+	.p2align 4; \
+	__strchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strchr_ia32, .-__strchr_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strchr; __GI_strchr = __strchr_ia32
+#endif
+
+#include "../../i586/strchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S
new file mode 100644
index 0000000000..cd26058671
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S
@@ -0,0 +1,804 @@
+/* strcmp with SSE4.2
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_STRNCMP
+# ifndef STRCMP
+#  define STRCMP	__strncmp_sse4_2
+# endif
+# define STR1		8
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define RETURN		POP (REM); ret; .p2align 4; CFI_PUSH (REM)
+# define REM		%ebp
+#elif defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+#  define STRCMP	__strcasecmp_l_sse4_2
+# endif
+# ifdef PIC
+#  define STR1		12
+# else
+#  define STR1		8
+# endif
+# define STR2		STR1+4
+# define LOCALE		12	/* Loaded before the adjustment.  */
+# ifdef PIC
+#  define RETURN	POP (%edi); POP (%ebx); ret; \
+			.p2align 4; CFI_PUSH (%ebx); CFI_PUSH (%edi)
+# else
+#  define RETURN	POP (%edi); ret; .p2align 4; CFI_PUSH (%edi)
+# endif
+# define NONASCII	__strcasecmp_nonascii
+#elif defined USE_AS_STRNCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+#  define STRCMP	__strncasecmp_l_sse4_2
+# endif
+# ifdef PIC
+#  define STR1		16
+# else
+#  define STR1		12
+# endif
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define LOCALE		16	/* Loaded before the adjustment.  */
+# ifdef PIC
+#  define RETURN	POP (%edi); POP (REM); POP (%ebx); ret; \
+			.p2align 4; \
+			CFI_PUSH (%ebx); CFI_PUSH (REM); CFI_PUSH (%edi)
+# else
+#  define RETURN	POP (%edi); POP (REM); ret; \
+			.p2align 4; CFI_PUSH (REM); CFI_PUSH (%edi)
+# endif
+# define REM		%ebp
+# define NONASCII	__strncasecmp_nonascii
+#else
+# ifndef STRCMP
+#  define STRCMP	__strcmp_sse4_2
+# endif
+# define STR1		4
+# define STR2		STR1+4
+# define RETURN		ret; .p2align 4
+#endif
+
+	.section .text.sse4.2,"ax",@progbits
+
+#ifdef USE_AS_STRCASECMP_L
+ENTRY (__strcasecmp_sse4_2)
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+	movl	__libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	addl	%gs:0, %eax
+	movl	(%eax), %eax
+#  else
+	movl	%gs:(%eax), %eax
+#  endif
+# else
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	movl	%gs:0, %eax
+	movl	__libc_tsd_LOCALE@NTPOFF(%eax), %eax
+#  else
+	movl	%gs:__libc_tsd_LOCALE@NTPOFF, %eax
+#  endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+# ifdef PIC
+	je	L(ascii)
+	POP	(%ebx)
+	jmp	__strcasecmp_nonascii
+# else
+	jne	__strcasecmp_nonascii
+	jmp	L(ascii)
+# endif
+END (__strcasecmp_sse4_2)
+#endif
+
+#ifdef USE_AS_STRNCASECMP_L
+ENTRY (__strncasecmp_sse4_2)
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+	movl	__libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	addl	%gs:0, %eax
+	movl	(%eax), %eax
+#  else
+	movl	%gs:(%eax), %eax
+#  endif
+# else
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	movl	%gs:0, %eax
+	movl	__libc_tsd_LOCALE@NTPOFF(%eax), %eax
+#  else
+	movl	%gs:__libc_tsd_LOCALE@NTPOFF, %eax
+#  endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+# ifdef PIC
+	je	L(ascii)
+	POP	(%ebx)
+	jmp	__strncasecmp_nonascii
+# else
+	jne	__strncasecmp_nonascii
+	jmp	L(ascii)
+# endif
+END (__strncasecmp_sse4_2)
+#endif
+
+	ENTRY (STRCMP)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movl	LOCALE(%esp), %eax
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+	jne	NONASCII
+
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+# endif
+L(ascii):
+	.section .rodata.cst16,"aM",@progbits,16
+	.align 16
+.Lbelowupper:
+	.quad	0x4040404040404040
+	.quad	0x4040404040404040
+.Ltopupper:
+	.quad	0x5b5b5b5b5b5b5b5b
+	.quad	0x5b5b5b5b5b5b5b5b
+.Ltouppermask:
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+# ifdef PIC
+#  define UCLOW_reg .Lbelowupper@GOTOFF(%ebx)
+#  define UCHIGH_reg .Ltopupper@GOTOFF(%ebx)
+#  define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx)
+# else
+#  define UCLOW_reg .Lbelowupper
+#  define UCHIGH_reg .Ltopupper
+#  define LCQWORD_reg .Ltouppermask
+# endif
+#endif
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	PUSH	(REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	PUSH	(%edi)
+#endif
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %eax
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	movl	CNT(%esp), REM
+	test	REM, REM
+	je	L(eq)
+#endif
+	mov	%dx, %cx
+	and	$0xfff, %cx
+	cmp	$0xff0, %cx
+	ja	L(first4bytes)
+	movdqu	(%edx), %xmm2
+	mov	%eax, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(first4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
+	movdqa	reg1, %xmm3;						      \
+	movdqa	UCHIGH_reg, %xmm4;					      \
+	movdqa	reg2, %xmm5;						      \
+	movdqa	UCHIGH_reg, %xmm6;					      \
+	pcmpgtb	UCLOW_reg, %xmm3;					      \
+	pcmpgtb	reg1, %xmm4;						      \
+	pcmpgtb	UCLOW_reg, %xmm5;					      \
+	pcmpgtb	reg2, %xmm6;						      \
+	pand	%xmm4, %xmm3;						      \
+	pand	%xmm6, %xmm5;						      \
+	pand	LCQWORD_reg, %xmm3;					      \
+	pand	LCQWORD_reg, %xmm5;					      \
+	por	%xmm3, reg1;						      \
+	por	%xmm5, reg2
+
+	movdqu	(%eax), %xmm1
+	TOLOWER (%xmm2, %xmm1)
+	movd	%xmm2, %ecx
+	movd	%xmm1, %edi
+	movdqa	%xmm2, %xmm3
+	movdqa	%xmm1, %xmm4
+	cmpl	%edi, %ecx
+#else
+# define TOLOWER(reg1, reg)
+
+	movd	%xmm2, %ecx
+	cmp	(%eax), %ecx
+#endif
+	jne	L(less4bytes)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	movdqu	(%eax), %xmm1
+#endif
+	pxor	%xmm2, %xmm1
+	pxor	%xmm0, %xmm0
+	ptest	%xmm1, %xmm0
+	jnc	L(less16bytes)
+	pcmpeqb	%xmm0, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, REM
+	jbe	L(eq)
+#endif
+	add	$16, %edx
+	add	$16, %eax
+L(first4bytes):
+	movzbl	(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, (%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$1, REM
+	je	L(eq)
+#endif
+
+	movzbl	1(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	1(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 1(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$2, REM
+	je	L(eq)
+#endif
+	movzbl	2(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	2(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 2(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$3, REM
+	je	L(eq)
+#endif
+	movzbl	3(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	3(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 3(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	je	L(eq)
+#endif
+	movzbl	4(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	4(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 4(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$5, REM
+	je	L(eq)
+#endif
+	movzbl	5(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	5(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 5(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$6, REM
+	je	L(eq)
+#endif
+	movzbl	6(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	6(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 6(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	je	L(eq)
+#endif
+	movzbl	7(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	7(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 7(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$8, REM
+	je	L(eq)
+#endif
+	add	$8, %eax
+	add	$8, %edx
+
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	PUSH	(%edi)
+#endif
+	PUSH	(%esi)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cfi_remember_state
+#endif
+	mov	%edx, %edi
+	mov	%eax, %esi
+	xorl	%eax, %eax
+L(check_offset):
+	movl	%edi, %edx
+	movl	%esi, %ecx
+	andl	$0xfff, %edx
+	andl	$0xfff, %ecx
+	cmpl	%edx, %ecx
+	cmovl	%edx, %ecx
+	lea	-0xff0(%ecx), %edx
+	sub	%edx, %edi
+	sub	%edx, %esi
+	testl	%edx, %edx
+	jg	L(crosspage)
+L(loop):
+	movdqu	(%esi,%edx), %xmm2
+	movdqu	(%edi,%edx), %xmm1
+	TOLOWER (%xmm2, %xmm1)
+	pcmpistri	$0x1a, %xmm2, %xmm1
+	jbe	L(end)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, REM
+	jbe	L(more16byteseq)
+#endif
+
+	add	$16, %edx
+	jle	L(loop)
+L(crosspage):
+	movzbl	(%edi,%edx), %eax
+	movzbl	(%esi,%edx), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+# endif
+#endif
+	subl	%ecx, %eax
+	jne	L(ret)
+	testl	%ecx, %ecx
+	je	L(ret)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$1, REM
+	jbe	L(more16byteseq)
+#endif
+	inc	%edx
+	cmp	$15, %edx
+	jle	L(crosspage)
+	add	%edx, %edi
+	add	%edx, %esi
+	jmp	L(check_offset)
+
+	.p2align 4
+L(end):
+	jnc	L(ret)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	%ecx, REM
+	jbe	L(more16byteseq)
+#endif
+	lea	(%ecx,%edx), %ecx
+	movzbl	(%edi,%ecx), %eax
+	movzbl	(%esi,%ecx), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+# endif
+#endif
+	subl	%ecx, %eax
+L(ret):
+	POP	(%esi)
+	POP	(%edi)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	POP	(REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	POP	(%ebx)
+# endif
+#endif
+	ret
+
+	.p2align 4
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cfi_restore_state
+L(more16byteseq):
+	POP	(%esi)
+# ifdef USE_AS_STRNCMP
+	POP	(%edi)
+# endif
+#endif
+L(eq):
+	xorl	%eax, %eax
+	RETURN
+
+L(neq):
+	mov	$1, %eax
+	ja	L(neq_bigger)
+	neg	%eax
+L(neq_bigger):
+	RETURN
+
+L(less16bytes):
+	add	$0xfefefeff, %ecx
+	jnc	L(less4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movd	%xmm3, %edi
+	xor	%edi, %ecx
+#else
+	xor	(%edx), %ecx
+#endif
+	or	$0xfefefeff, %ecx
+	add	$1, %ecx
+	jnz	L(less4bytes)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	jbe	L(eq)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	psrldq	$4, %xmm3
+	psrldq	$4, %xmm4
+	movd	%xmm3, %ecx
+	movd	%xmm4, %edi
+	cmp	%edi, %ecx
+	mov	%ecx, %edi
+#else
+	mov	4(%edx), %ecx
+	cmp	4(%eax), %ecx
+#endif
+	jne	L(more4bytes)
+	add	$0xfefefeff, %ecx
+	jnc	L(more4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	xor	%edi, %ecx
+#else
+	xor	4(%edx), %ecx
+#endif
+	or	$0xfefefeff, %ecx
+	add	$1, %ecx
+	jnz	L(more4bytes)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$8, REM
+	jbe	L(eq)
+#endif
+
+	add	$8, %edx
+	add	$8, %eax
+L(less4bytes):
+
+	movzbl	(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, (%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$1, REM
+	je	L(eq)
+#endif
+	movzbl	1(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	1(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 1(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$2, REM
+	je	L(eq)
+#endif
+
+	movzbl	2(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	2(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 2(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$3, REM
+	je	L(eq)
+#endif
+	movzbl	3(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	3(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 3(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+L(more4bytes):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	je	L(eq)
+#endif
+	movzbl	4(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	4(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 4(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$5, REM
+	je	L(eq)
+#endif
+	movzbl	5(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	5(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 5(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$6, REM
+	je	L(eq)
+#endif
+	movzbl	6(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	6(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 6(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	je	L(eq)
+#endif
+	movzbl	7(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	7(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 7(%edx)
+#endif
+	jne	L(neq)
+	jmp	L(eq)
+
+END (STRCMP)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
new file mode 100644
index 0000000000..b25cc3e068
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
@@ -0,0 +1,2810 @@
+/* strcmp with SSSE3
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_STRNCMP
+# ifndef STRCMP
+#  define STRCMP	__strncmp_ssse3
+# endif
+# define STR1		8
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define RETURN		POP (REM); ret; .p2align 4; CFI_PUSH (REM)
+# define UPDATE_STRNCMP_COUNTER				\
+	/* calculate left number to compare */		\
+	mov	$16, %esi;				\
+	sub	%ecx, %esi;				\
+	cmp	%esi, REM;				\
+	jbe	L(more8byteseq);			\
+	sub	%esi, REM
+# define FLAGS		%ebx
+# define REM		%ebp
+#elif defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+#  define STRCMP	__strcasecmp_l_ssse3
+# endif
+# ifdef PIC
+#  define STR1		8
+# else
+#  define STR1		4
+# endif
+# define STR2		STR1+4
+# define LOCALE		12	/* Loaded before the adjustment.  */
+# ifdef PIC
+#  define RETURN	POP (%ebx); ret; .p2align 4; CFI_PUSH (%ebx)
+# else
+#  define RETURN	ret; .p2align 4
+# endif
+# define UPDATE_STRNCMP_COUNTER
+# define FLAGS		(%esp)
+# define NONASCII	__strcasecmp_nonascii
+#elif defined USE_AS_STRNCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+#  define STRCMP	__strncasecmp_l_ssse3
+# endif
+# ifdef PIC
+#  define STR1		12
+# else
+#  define STR1		8
+# endif
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define LOCALE		16	/* Loaded before the adjustment.  */
+# ifdef PIC
+#  define RETURN	POP (REM); POP (%ebx); ret; \
+			.p2align 4; CFI_PUSH (%ebx); CFI_PUSH (REM)
+# else
+#  define RETURN	POP (REM); ret; .p2align 4; CFI_PUSH (REM)
+# endif
+# define UPDATE_STRNCMP_COUNTER				\
+	/* calculate left number to compare */		\
+	mov	$16, %esi;				\
+	sub	%ecx, %esi;				\
+	cmp	%esi, REM;				\
+	jbe	L(more8byteseq);			\
+	sub	%esi, REM
+# define FLAGS		(%esp)
+# define REM		%ebp
+# define NONASCII	__strncasecmp_nonascii
+#else
+# ifndef STRCMP
+#  define STRCMP	__strcmp_ssse3
+# endif
+# define STR1		4
+# define STR2		STR1+4
+# define RETURN		ret; .p2align 4
+# define UPDATE_STRNCMP_COUNTER
+# define FLAGS		%ebx
+#endif
+
+	.section .text.ssse3,"ax",@progbits
+
+#ifdef USE_AS_STRCASECMP_L
+ENTRY (__strcasecmp_ssse3)
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+	movl	__libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	addl	%gs:0, %eax
+	movl	(%eax), %eax
+#  else
+	movl	%gs:(%eax), %eax
+#  endif
+# else
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	movl	%gs:0, %eax
+	movl	__libc_tsd_LOCALE@NTPOFF(%eax), %eax
+#  else
+	movl	%gs:__libc_tsd_LOCALE@NTPOFF, %eax
+#  endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+# ifdef PIC
+	je	L(ascii)
+	POP	(%ebx)
+	jmp	__strcasecmp_nonascii
+# else
+	jne	__strcasecmp_nonascii
+	jmp	L(ascii)
+# endif
+END (__strcasecmp_ssse3)
+#endif
+
+#ifdef USE_AS_STRNCASECMP_L
+ENTRY (__strncasecmp_ssse3)
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+	movl	__libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	addl	%gs:0, %eax
+	movl	(%eax), %eax
+#  else
+	movl	%gs:(%eax), %eax
+#  endif
+# else
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	movl	%gs:0, %eax
+	movl	__libc_tsd_LOCALE@NTPOFF(%eax), %eax
+#  else
+	movl	%gs:__libc_tsd_LOCALE@NTPOFF, %eax
+#  endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+# ifdef PIC
+	je	L(ascii)
+	POP	(%ebx)
+	jmp	__strncasecmp_nonascii
+# else
+	jne	__strncasecmp_nonascii
+	jmp	L(ascii)
+# endif
+END (__strncasecmp_ssse3)
+#endif
+
+ENTRY (STRCMP)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movl	LOCALE(%esp), %eax
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+	jne	NONASCII
+
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+# endif
+L(ascii):
+	.section .rodata.cst16,"aM",@progbits,16
+	.align 16
+.Lbelowupper:
+	.quad	0x4040404040404040
+	.quad	0x4040404040404040
+.Ltopupper:
+	.quad	0x5b5b5b5b5b5b5b5b
+	.quad	0x5b5b5b5b5b5b5b5b
+.Ltouppermask:
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+# ifdef PIC
+#  define UCLOW_reg .Lbelowupper@GOTOFF(%ebx)
+#  define UCHIGH_reg .Ltopupper@GOTOFF(%ebx)
+#  define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx)
+# else
+#  define UCLOW_reg .Lbelowupper
+#  define UCHIGH_reg .Ltopupper
+#  define LCQWORD_reg .Ltouppermask
+# endif
+#endif
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	PUSH	(REM)
+#endif
+
+	movl	STR1(%esp), %edx
+	movl	STR2(%esp), %eax
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	movl	CNT(%esp), REM
+	cmp	$16, REM
+	jb	L(less16bytes_sncmp)
+#elif !defined USE_AS_STRCASECMP_L
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	add	$8, %edx
+	add	$8, %eax
+#endif
+	movl	%edx, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(crosspage)
+	mov	%eax, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(crosspage)
+	pxor	%xmm0, %xmm0
+	movlpd	(%eax), %xmm1
+	movlpd	(%edx), %xmm2
+	movhpd	8(%eax), %xmm1
+	movhpd	8(%edx), %xmm2
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
+	movdqa	reg1, %xmm5;					\
+	movdqa	reg2, %xmm7;					\
+	movdqa	UCHIGH_reg, %xmm6;				\
+	pcmpgtb	UCLOW_reg, %xmm5;				\
+	pcmpgtb	UCLOW_reg, %xmm7;				\
+	pcmpgtb	reg1, %xmm6;					\
+	pand	%xmm6, %xmm5;					\
+	movdqa	UCHIGH_reg, %xmm6;				\
+	pcmpgtb	reg2, %xmm6;					\
+	pand	%xmm6, %xmm7;					\
+	pand	LCQWORD_reg, %xmm5;				\
+	por	%xmm5, reg1;					\
+	pand	LCQWORD_reg, %xmm7;				\
+	por	%xmm7, reg2
+	TOLOWER (%xmm1, %xmm2)
+#else
+# define TOLOWER(reg1, reg2)
+#endif
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %ecx
+	sub	$0xffff, %ecx
+	jnz	L(less16bytes)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(eq)
+#endif
+	add	$16, %eax
+	add	$16, %edx
+
+L(crosspage):
+
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	PUSH	(FLAGS)
+#endif
+	PUSH	(%edi)
+	PUSH	(%esi)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	pushl	$0
+	cfi_adjust_cfa_offset (4)
+#endif
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cfi_remember_state
+#endif
+
+	movl	%edx, %edi
+	movl	%eax, %ecx
+	and	$0xf, %ecx
+	and	$0xf, %edi
+	xor	%ecx, %eax
+	xor	%edi, %edx
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	xor	FLAGS, FLAGS
+#endif
+	cmp	%edi, %ecx
+	je	L(ashr_0)
+	ja	L(bigger)
+	orl	$0x20, FLAGS
+	xchg	%edx, %eax
+	xchg	%ecx, %edi
+L(bigger):
+	lea	15(%edi), %edi
+	sub	%ecx, %edi
+	cmp	$8, %edi
+	jle	L(ashr_less_8)
+	cmp	$14, %edi
+	je	L(ashr_15)
+	cmp	$13, %edi
+	je	L(ashr_14)
+	cmp	$12, %edi
+	je	L(ashr_13)
+	cmp	$11, %edi
+	je	L(ashr_12)
+	cmp	$10, %edi
+	je	L(ashr_11)
+	cmp	$9, %edi
+	je	L(ashr_10)
+L(ashr_less_8):
+	je	L(ashr_9)
+	cmp	$7, %edi
+	je	L(ashr_8)
+	cmp	$6, %edi
+	je	L(ashr_7)
+	cmp	$5, %edi
+	je	L(ashr_6)
+	cmp	$4, %edi
+	je	L(ashr_5)
+	cmp	$3, %edi
+	je	L(ashr_4)
+	cmp	$2, %edi
+	je	L(ashr_3)
+	cmp	$1, %edi
+	je	L(ashr_2)
+	cmp	$0, %edi
+	je	L(ashr_1)
+
+/*
+ * The following cases will be handled by ashr_0
+ *  ecx(offset of esi)  eax(offset of edi)  relative offset  corresponding case
+ *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
+ */
+	.p2align 4
+L(ashr_0):
+	mov	$0xffff, %esi
+	movdqa	(%eax), %xmm1
+	pxor	%xmm0, %xmm0
+	pcmpeqb	%xmm1, %xmm0
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movdqa	(%edx), %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm2, %xmm1
+#else
+	pcmpeqb	(%edx), %xmm1
+#endif
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	mov	%ecx, %edi
+	jne	L(less32bytes)
+	UPDATE_STRNCMP_COUNTER
+	movl	$0x10, FLAGS
+	mov	$0x10, %ecx
+	pxor	%xmm0, %xmm0
+	.p2align 4
+L(loop_ashr_0):
+	movdqa	(%eax, %ecx), %xmm1
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movdqa	(%edx, %ecx), %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+#else
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	(%edx, %ecx), %xmm1
+#endif
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	jmp	L(loop_ashr_0)
+
+/*
+ * The following cases will be handled by ashr_1
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
+ */
+	.p2align 4
+L(ashr_1):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$15, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-15(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$1, FLAGS
+	lea	1(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_1):
+	add	$16, %edi
+	jg	L(nibble_ashr_1)
+
+L(gobble_ashr_1):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$1, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_1)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$1, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_1)
+
+	.p2align 4
+L(nibble_ashr_1):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfffe, %esi
+	jnz	L(ashr_1_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$15, REM
+	jbe	L(ashr_1_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_1)
+
+	.p2align 4
+L(ashr_1_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$1, %xmm0
+	psrldq	$1, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_2
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(14~15)            n -14            1(15 +(n-14) - n)         ashr_2
+ */
+	.p2align 4
+L(ashr_2):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$14, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-14(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$2, FLAGS
+	lea	2(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_2):
+	add	$16, %edi
+	jg	L(nibble_ashr_2)
+
+L(gobble_ashr_2):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$2, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_2)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$2, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_2)
+
+	.p2align 4
+L(nibble_ashr_2):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfffc, %esi
+	jnz	L(ashr_2_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$14, REM
+	jbe	L(ashr_2_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_2)
+
+	.p2align 4
+L(ashr_2_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$2, %xmm0
+	psrldq	$2, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_3
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(13~15)            n -13            2(15 +(n-13) - n)         ashr_3
+ */
+	.p2align 4
+L(ashr_3):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$13, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-13(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$3, FLAGS
+	lea	3(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_3):
+	add	$16, %edi
+	jg	L(nibble_ashr_3)
+
+L(gobble_ashr_3):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$3, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_3)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$3, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_3)
+
+	.p2align 4
+L(nibble_ashr_3):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfff8, %esi
+	jnz	L(ashr_3_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$13, REM
+	jbe	L(ashr_3_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_3)
+
+	.p2align 4
+L(ashr_3_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$3, %xmm0
+	psrldq	$3, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_4
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(12~15)            n -12            3(15 +(n-12) - n)         ashr_4
+ */
+	.p2align 4
+L(ashr_4):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$12, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-12(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$4, FLAGS
+	lea	4(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_4):
+	add	$16, %edi
+	jg	L(nibble_ashr_4)
+
+L(gobble_ashr_4):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$4, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_4)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$4, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_4)
+
+	.p2align 4
+L(nibble_ashr_4):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfff0, %esi
+	jnz	L(ashr_4_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$12, REM
+	jbe	L(ashr_4_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_4)
+
+	.p2align 4
+L(ashr_4_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$4, %xmm0
+	psrldq	$4, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_5
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(11~15)            n -11            4(15 +(n-11) - n)         ashr_5
+ */
+	.p2align 4
+L(ashr_5):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$11, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-11(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$5, FLAGS
+	lea	5(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_5):
+	add	$16, %edi
+	jg	L(nibble_ashr_5)
+
+L(gobble_ashr_5):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$5, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_5)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$5, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_5)
+
+	.p2align 4
+L(nibble_ashr_5):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xffe0, %esi
+	jnz	L(ashr_5_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$11, REM
+	jbe	L(ashr_5_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_5)
+
+	.p2align 4
+L(ashr_5_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$5, %xmm0
+	psrldq	$5, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_6
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(10~15)            n -10            5(15 +(n-10) - n)         ashr_6
+ */
+
+	.p2align 4
+L(ashr_6):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$10, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-10(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$6, FLAGS
+	lea	6(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_6):
+	add	$16, %edi
+	jg	L(nibble_ashr_6)
+
+L(gobble_ashr_6):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$6, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_6)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$6, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_6)
+
+	.p2align 4
+L(nibble_ashr_6):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xffc0, %esi
+	jnz	L(ashr_6_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$10, REM
+	jbe	L(ashr_6_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_6)
+
+	.p2align 4
+L(ashr_6_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$6, %xmm0
+	psrldq	$6, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_7
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(9~15)            n - 9            6(15 +(n-9) - n)         ashr_7
+ */
+
+	.p2align 4
+L(ashr_7):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$9, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-9(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$7, FLAGS
+	lea	8(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_7):
+	add	$16, %edi
+	jg	L(nibble_ashr_7)
+
+L(gobble_ashr_7):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$7, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_7)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$7, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_7)
+
+	.p2align 4
+L(nibble_ashr_7):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xff80, %esi
+	jnz	L(ashr_7_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$9, REM
+	jbe	L(ashr_7_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_7)
+
+	.p2align 4
+L(ashr_7_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$7, %xmm0
+	psrldq	$7, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_8
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(8~15)            n - 8            7(15 +(n-8) - n)         ashr_8
+ */
+	.p2align 4
+L(ashr_8):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$8, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-8(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$8, FLAGS
+	lea	8(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_8):
+	add	$16, %edi
+	jg	L(nibble_ashr_8)
+
+L(gobble_ashr_8):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$8, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_8)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$8, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_8)
+
+	.p2align 4
+L(nibble_ashr_8):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xff00, %esi
+	jnz	L(ashr_8_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$8, REM
+	jbe	L(ashr_8_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_8)
+
+	.p2align 4
+L(ashr_8_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$8, %xmm0
+	psrldq	$8, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_9
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(7~15)            n - 7            8(15 +(n-7) - n)         ashr_9
+ */
+	.p2align 4
+L(ashr_9):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$7, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-7(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$9, FLAGS
+	lea	9(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_9):
+	add	$16, %edi
+	jg	L(nibble_ashr_9)
+
+L(gobble_ashr_9):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$9, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_9)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$9, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_9)
+
+	.p2align 4
+L(nibble_ashr_9):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfe00, %esi
+	jnz	L(ashr_9_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	jbe	L(ashr_9_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_9)
+
+	.p2align 4
+L(ashr_9_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$9, %xmm0
+	psrldq	$9, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_10
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(6~15)            n - 6            9(15 +(n-6) - n)         ashr_10
+ */
+	.p2align 4
+L(ashr_10):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$6, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-6(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$10, FLAGS
+	lea	10(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_10):
+	add	$16, %edi
+	jg	L(nibble_ashr_10)
+
+L(gobble_ashr_10):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$10, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_10)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$10, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_10)
+
+	.p2align 4
+L(nibble_ashr_10):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfc00, %esi
+	jnz	L(ashr_10_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$6, REM
+	jbe	L(ashr_10_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_10)
+
+	.p2align 4
+L(ashr_10_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$10, %xmm0
+	psrldq	$10, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_11
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(5~15)            n - 5            10(15 +(n-5) - n)         ashr_11
+ */
+	.p2align 4
+L(ashr_11):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$5, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-5(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$11, FLAGS
+	lea	11(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_11):
+	add	$16, %edi
+	jg	L(nibble_ashr_11)
+
+L(gobble_ashr_11):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$11, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_11)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$11, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_11)
+
+	.p2align 4
+L(nibble_ashr_11):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xf800, %esi
+	jnz	L(ashr_11_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$5, REM
+	jbe	L(ashr_11_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_11)
+
+	.p2align 4
+L(ashr_11_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$11, %xmm0
+	psrldq	$11, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_12
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(4~15)            n - 4            11(15 +(n-4) - n)         ashr_12
+ */
+	.p2align 4
+L(ashr_12):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$4, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-4(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$12, FLAGS
+	lea	12(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_12):
+	add	$16, %edi
+	jg	L(nibble_ashr_12)
+
+L(gobble_ashr_12):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$12, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_12)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$12, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_12)
+
+	.p2align 4
+L(nibble_ashr_12):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xf000, %esi
+	jnz	L(ashr_12_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	jbe	L(ashr_12_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_12)
+
+	.p2align 4
+L(ashr_12_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$12, %xmm0
+	psrldq	$12, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_13
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(3~15)            n - 3            12(15 +(n-3) - n)         ashr_13
+ */
+	.p2align 4
+L(ashr_13):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-3(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$13, FLAGS
+	lea	13(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_13):
+	add	$16, %edi
+	jg	L(nibble_ashr_13)
+
+L(gobble_ashr_13):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$13, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_13)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$13, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_13)
+
+	.p2align 4
+L(nibble_ashr_13):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xe000, %esi
+	jnz	L(ashr_13_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$3, REM
+	jbe	L(ashr_13_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_13)
+
+	.p2align 4
+L(ashr_13_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$13, %xmm0
+	psrldq	$13, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_14
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(2~15)            n - 2            13(15 +(n-2) - n)         ashr_14
+ */
+	.p2align 4
+L(ashr_14):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$2, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-2(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$14, FLAGS
+	lea	14(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_14):
+	add	$16, %edi
+	jg	L(nibble_ashr_14)
+
+L(gobble_ashr_14):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$14, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_14)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$14, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_14)
+
+	.p2align 4
+L(nibble_ashr_14):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xc000, %esi
+	jnz	L(ashr_14_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$2, REM
+	jbe	L(ashr_14_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_14)
+
+	.p2align 4
+L(ashr_14_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$14, %xmm0
+	psrldq	$14, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_14
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(1~15)            n - 1            14(15 +(n-1) - n)         ashr_15
+ */
+
+	.p2align 4
+L(ashr_15):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$1, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-1(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$15, FLAGS
+	lea	15(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_15):
+	add	$16, %edi
+	jg	L(nibble_ashr_15)
+
+L(gobble_ashr_15):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$15, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_15)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$15, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_15)
+
+	.p2align 4
+L(nibble_ashr_15):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0x8000, %esi
+	jnz	L(ashr_15_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$1, REM
+	jbe	L(ashr_15_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_15)
+
+	.p2align 4
+L(ashr_15_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$15, %xmm0
+	psrldq	$15, %xmm3
+	jmp	L(aftertail)
+
+	.p2align 4
+L(aftertail):
+	TOLOWER (%xmm1, %xmm3)
+	pcmpeqb	%xmm3, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	not	%esi
+L(exit):
+	mov	FLAGS, %edi
+	and	$0x1f, %edi
+	lea	-16(%edi, %ecx), %edi
+L(less32bytes):
+	add	%edi, %edx
+	add	%ecx, %eax
+	testl	$0x20, FLAGS
+	jz	L(ret2)
+	xchg	%eax, %edx
+
+	.p2align 4
+L(ret2):
+	mov	%esi, %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+#endif
+	POP	(%esi)
+	POP	(%edi)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	POP	(FLAGS)
+#endif
+L(less16bytes):
+	test	%cl, %cl
+	jz	L(2next_8_bytes)
+
+	test	$0x01, %cl
+	jnz	L(Byte0)
+
+	test	$0x02, %cl
+	jnz	L(Byte1)
+
+	test	$0x04, %cl
+	jnz	L(Byte2)
+
+	test	$0x08, %cl
+	jnz	L(Byte3)
+
+	test	$0x10, %cl
+	jnz	L(Byte4)
+
+	test	$0x20, %cl
+	jnz	L(Byte5)
+
+	test	$0x40, %cl
+	jnz	L(Byte6)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	jbe	L(eq)
+#endif
+
+	movzx	7(%eax), %ecx
+	movzx	7(%edx), %eax
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte0):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$0, REM
+	jbe	L(eq)
+#endif
+	movzx	(%eax), %ecx
+	movzx	(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte1):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$1, REM
+	jbe	L(eq)
+#endif
+	movzx	1(%eax), %ecx
+	movzx	1(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte2):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$2, REM
+	jbe	L(eq)
+#endif
+	movzx	2(%eax), %ecx
+	movzx	2(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte3):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$3, REM
+	jbe	L(eq)
+#endif
+	movzx	3(%eax), %ecx
+	movzx	3(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte4):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	jbe	L(eq)
+#endif
+	movzx	4(%eax), %ecx
+	movzx	4(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte5):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$5, REM
+	jbe	L(eq)
+#endif
+	movzx	5(%eax), %ecx
+	movzx	5(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte6):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$6, REM
+	jbe	L(eq)
+#endif
+	movzx	6(%eax), %ecx
+	movzx	6(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(2next_8_bytes):
+	add	$8, %eax
+	add	$8, %edx
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$8, REM
+	lea	-8(REM), REM
+	jbe	L(eq)
+#endif
+
+	test	$0x01, %ch
+	jnz	L(Byte0)
+
+	test	$0x02, %ch
+	jnz	L(Byte1)
+
+	test	$0x04, %ch
+	jnz	L(Byte2)
+
+	test	$0x08, %ch
+	jnz	L(Byte3)
+
+	test	$0x10, %ch
+	jnz	L(Byte4)
+
+	test	$0x20, %ch
+	jnz	L(Byte5)
+
+	test	$0x40, %ch
+	jnz	L(Byte6)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	jbe	L(eq)
+#endif
+	movzx	7(%eax), %ecx
+	movzx	7(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+#ifdef USE_AS_STRNCMP
+L(neq_sncmp):
+#endif
+L(neq):
+	mov	$1, %eax
+	ja	L(neq_bigger)
+	neg	%eax
+L(neq_bigger):
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+#endif
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	POP	(REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	POP	(%ebx)
+# endif
+#endif
+	ret
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	.p2align 4
+	cfi_restore_state
+L(more8byteseq):
+
+# ifdef USE_AS_STRNCASECMP_L
+	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+# endif
+	POP	(%esi)
+	POP	(%edi)
+# ifdef USE_AS_STRNCMP
+	POP	(FLAGS)
+# endif
+#endif
+
+#ifdef USE_AS_STRNCMP
+L(eq_sncmp):
+#endif
+L(eq):
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	POP	(REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	POP	(%ebx)
+# endif
+#endif
+	xorl	%eax, %eax
+	ret
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	.p2align 4
+# if defined USE_AS_STRNCASECMP_L && defined PIC
+	CFI_PUSH (%ebx)
+# endif
+	CFI_PUSH (REM)
+L(less16bytes_sncmp):
+# ifdef USE_AS_STRNCASECMP_L
+	PUSH	(%esi)
+# endif
+	test	REM, REM
+	jz	L(eq_sncmp)
+
+	movzbl	(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, (%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$1, REM
+	je	L(eq_sncmp)
+
+	movzbl	1(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	1(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 1(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$2, REM
+	je	L(eq_sncmp)
+
+	movzbl	2(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	2(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 2(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$3, REM
+	je	L(eq_sncmp)
+
+	movzbl	3(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	3(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 3(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$4, REM
+	je	L(eq_sncmp)
+
+	movzbl	4(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	4(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 4(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$5, REM
+	je	L(eq_sncmp)
+
+	movzbl	5(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	5(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 5(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$6, REM
+	je	L(eq_sncmp)
+
+	movzbl	6(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	6(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 6(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$7, REM
+	je	L(eq_sncmp)
+
+	movzbl	7(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	7(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 7(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+
+	cmp	$8, REM
+	je	L(eq_sncmp)
+
+	movzbl	8(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	8(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 8(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$9, REM
+	je	L(eq_sncmp)
+
+	movzbl	9(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	9(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 9(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$10, REM
+	je	L(eq_sncmp)
+
+	movzbl	10(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	10(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 10(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$11, REM
+	je	L(eq_sncmp)
+
+	movzbl	11(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	11(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 11(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+
+	cmp	$12, REM
+	je	L(eq_sncmp)
+
+	movzbl	12(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	12(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 12(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$13, REM
+	je	L(eq_sncmp)
+
+	movzbl	13(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	13(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 13(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$14, REM
+	je	L(eq_sncmp)
+
+	movzbl	14(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	14(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 14(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$15, REM
+	je	L(eq_sncmp)
+
+	movzbl	15(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	15(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 15(%edx)
+# endif
+	jne	L(neq_sncmp)
+
+# ifdef USE_AS_STRNCASECMP_L
+L(eq_sncmp):
+	POP	(%esi)
+# endif
+	POP	(REM)
+# if defined USE_AS_STRNCASECMP_L && defined PIC
+	POP	(%ebx)
+# endif
+	xor	%eax, %eax
+	ret
+
+# ifdef USE_AS_STRNCASECMP_L
+	.p2align 4
+#  ifdef PIC
+	CFI_PUSH (%ebx)
+#  endif
+	CFI_PUSH (REM)
+	CFI_PUSH (%esi)
+L(neq_sncmp):
+	mov	$1, %eax
+	mov	$-1, %edx
+	cmovna	%edx, %eax
+	POP	(%esi)
+	POP	(REM)
+#  ifdef PIC
+	POP	(%ebx)
+#  endif
+	ret
+# endif
+#endif
+
+END (STRCMP)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S
new file mode 100644
index 0000000000..56de25a4b7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S
@@ -0,0 +1,95 @@
+/* Multiple versions of strcmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifdef USE_AS_STRNCMP
+# define STRCMP			strncmp
+# define __GI_STRCMP		__GI_strncmp
+# define __STRCMP_IA32		__strncmp_ia32
+# define __STRCMP_SSSE3		__strncmp_ssse3
+# define __STRCMP_SSE4_2	__strncmp_sse4_2
+#elif defined USE_AS_STRCASECMP_L
+# define STRCMP			__strcasecmp_l
+# define __GI_STRCMP		__GI_strcasecmp_l
+# define __STRCMP_IA32		__strcasecmp_l_ia32
+# define __STRCMP_SSSE3		__strcasecmp_l_ssse3
+# define __STRCMP_SSE4_2	__strcasecmp_l_sse4_2
+#elif defined USE_AS_STRNCASECMP_L
+# define STRCMP			__strncasecmp_l
+# define __GI_STRCMP		__GI_strncasecmp_l
+# define __STRCMP_IA32		__strncasecmp_l_ia32
+# define __STRCMP_SSSE3		__strncasecmp_l_ssse3
+# define __STRCMP_SSE4_2	__strncasecmp_l_sse4_2
+#else
+# define STRCMP			strcmp
+# define __GI_STRCMP		__GI_strcmp
+# define __STRCMP_IA32		__strcmp_ia32
+# define __STRCMP_SSSE3		__strcmp_ssse3
+# define __STRCMP_SSE4_2	__strcmp_sse4_2
+#endif
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncmp in static library since we
+   need strncmp before the initialization happened.  */
+#if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc)
+	.text
+ENTRY(STRCMP)
+	.type	STRCMP, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__STRCMP_IA32)
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__STRCMP_SSSE3)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
+	jnz	2f
+	LOAD_FUNC_GOT_EAX (__STRCMP_SSE4_2)
+2:	ret
+END(STRCMP)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __STRCMP_IA32, @function; \
+	.p2align 4; \
+	.globl __STRCMP_IA32; \
+	.hidden __STRCMP_IA32; \
+	__STRCMP_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __STRCMP_IA32, .-__STRCMP_IA32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCMP; __GI_STRCMP = __STRCMP_IA32
+# endif
+#endif
+
+#if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L \
+    && !defined USE_AS_STRNCASECMP_L
+# include "../strcmp.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S
new file mode 100644
index 0000000000..ed627a5f62
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S
@@ -0,0 +1,2250 @@
+/* strcpy with SSE2 and unaligned load
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+
+# define CFI_PUSH(REG)                  \
+	cfi_adjust_cfa_offset (4);     \
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)                   \
+	cfi_adjust_cfa_offset (-4);    \
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef STRCPY
+#  define STRCPY  __strcpy_sse2
+# endif
+
+# define STR1  PARMS
+# define STR2  STR1+4
+# define LEN  STR2+4
+
+# ifdef USE_AS_STRNCPY
+#  define PARMS  16
+#  define ENTRANCE PUSH(%ebx); PUSH(%esi); PUSH(%edi)
+#  define RETURN  POP(%edi); POP(%esi); POP(%ebx); ret;          \
+	CFI_PUSH(%ebx); CFI_PUSH(%esi); CFI_PUSH(%edi);
+
+# ifdef SHARED
+#  define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into ECX and branch to it. TABLE is a
+	jump table with relative offsets.
+	INDEX is a register contains the index into the jump table.
+	SCALE is the scale of INDEX. */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)            \
+	/* We first load PC into ECX.  */                       \
+	SETUP_PIC_REG(cx);                                      \
+	/* Get the address of the jump table.  */               \
+	addl	$(TABLE - .), %ecx;                             \
+	/* Get the entry and convert the relative offset to the \
+	absolute	address.  */                            \
+	addl	(%ecx,INDEX,SCALE), %ecx;                       \
+	/* We loaded the jump table and adjusted ECX. Go.  */  \
+	jmp	*%ecx
+# else
+#  define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+	absolute	offsets.  INDEX is a register contains the index into the
+	jump	table.  SCALE is the scale of INDEX. */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+	jmp	*TABLE(,INDEX,SCALE)
+# endif
+
+.text
+ENTRY (STRCPY)
+	ENTRANCE
+	mov	STR1(%esp), %edi
+	mov	STR2(%esp), %esi
+	movl	LEN(%esp), %ebx
+	test	%ebx, %ebx
+	jz	L(ExitZero)
+
+	mov	%esi, %ecx
+# ifndef USE_AS_STPCPY
+	mov	%edi, %eax      /* save result */
+# endif
+	and	$15, %ecx
+	jz	L(SourceStringAlignmentZero)
+
+	and	$-16, %esi
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	(%esi), %xmm1
+	add	%ecx, %ebx
+	pmovmskb %xmm1, %edx
+	shr	%cl, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+# else
+	cmp	$17, %ebx
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail)
+
+	pcmpeqb	16(%esi), %xmm0
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+# else
+	cmp	$33, %ebx
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes)
+
+	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
+	movdqu	%xmm1, (%edi)
+
+	sub	%ecx, %edi
+
+/* If source address alignment != destination address alignment */
+	.p2align 4
+L(Unalign16Both):
+	mov	$16, %ecx
+	movdqa	(%esi, %ecx), %xmm1
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%edi, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$48, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%edi, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+
+	movaps	16(%esi, %ecx), %xmm4
+	movdqu	%xmm3, (%edi, %ecx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+
+	movaps	16(%esi, %ecx), %xmm1
+	movdqu	%xmm4, (%edi, %ecx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm1)
+
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%edi, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%edi, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+
+	movdqu	%xmm3, (%edi, %ecx)
+	mov	%esi, %edx
+	lea	16(%esi, %ecx), %esi
+	and	$-0x40, %esi
+	sub	%esi, %edx
+	sub	%edx, %edi
+	lea	128(%ebx, %edx), %ebx
+
+L(Unaligned64Loop):
+	movaps	(%esi), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%esi), %xmm5
+	movaps	32(%esi), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%esi), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(Unaligned64Leave)
+L(Unaligned64Loop_start):
+	add	$64, %edi
+	add	$64, %esi
+	movdqu	%xmm4, -64(%edi)
+	movaps	(%esi), %xmm2
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%edi)
+	movaps	16(%esi), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%esi), %xmm3
+	movdqu	%xmm6, -32(%edi)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%edi)
+	movaps	48(%esi), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+	test	%edx, %edx
+	jz	L(Unaligned64Loop_start)
+L(Unaligned64Leave):
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	%xmm4, %xmm0
+	pcmpeqb	%xmm5, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_0)
+	test	%ecx, %ecx
+	jnz	L(CopyFrom1To16BytesUnaligned_16)
+
+	pcmpeqb	%xmm6, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_32)
+
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%edi)
+	movdqu	%xmm5, 16(%edi)
+	movdqu	%xmm6, 32(%edi)
+# ifdef USE_AS_STPCPY
+	lea	48(%edi, %edx), %eax
+# endif
+	movdqu	%xmm7, 48(%edi)
+	add	$15, %ebx
+	sub	%edx, %ebx
+	lea	49(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentZero):
+	pxor	%xmm0, %xmm0
+	movdqa	(%esi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+# else
+	cmp	$17, %ebx
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1)
+
+	pcmpeqb	16(%esi), %xmm0
+	movdqu	%xmm1, (%edi)
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+# else
+	cmp	$33, %ebx
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes1)
+
+	jmp	L(Unalign16Both)
+
+/*-----------------End of main part---------------------------*/
+
+/* Case1 */
+	.p2align 4
+L(CopyFrom1To16BytesTail):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1):
+	add	$16, %esi
+	add	$16, %edi
+	sub	$16, %ebx
+L(CopyFrom1To16BytesTail1):
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes):
+	sub	%ecx, %ebx
+	bsf	%edx, %edx
+	add	%ecx, %esi
+	add	$16, %edx
+	sub	%ecx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+	bsf	%edx, %edx
+# ifdef USE_AS_STPCPY
+	lea	(%edi, %edx), %eax
+# endif
+	movdqu	%xmm4, (%edi)
+	add	$63, %ebx
+	sub	%edx, %ebx
+	lea	1(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	16(%edi, %edx), %eax
+# endif
+	movdqu	%xmm5, 16(%edi)
+	add	$47, %ebx
+	sub	%edx, %ebx
+	lea	17(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+	bsf	%edx, %edx
+	movdqu	%xmm4, (%edi)
+	movdqu	%xmm5, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	32(%edi, %edx), %eax
+# endif
+	movdqu	%xmm6, 32(%edi)
+	add	$31, %ebx
+	sub	%edx, %ebx
+	lea	33(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):
+	movdqu	%xmm6, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+	movdqu	%xmm5, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+	movdqu	%xmm4, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+	movdqu	%xmm3, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+	movdqu	%xmm1, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesExit):
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+/* Case2 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%ecx, %edi
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	add	$16, %edx
+	sub	%ecx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+/* Case2 or Case3,  Case3 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%ecx, %edi
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32BytesCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTailCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+	add	$16, %edi
+	add	$16, %esi
+	sub	$16, %ebx
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1Case2)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(Exit0):
+# ifdef USE_AS_STPCPY
+	mov	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit1):
+	movb	%dh, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	(%edi), %eax
+# endif
+	sub	$1, %ebx
+	lea	1(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	movw	(%esi), %dx
+	movw	%dx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	1(%edi), %eax
+# endif
+	sub	$2, %ebx
+	lea	2(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	movw	(%esi), %cx
+	movw	%cx, (%edi)
+	movb	%dh, 2(%edi)
+# ifdef USE_AS_STPCPY
+	lea	2(%edi), %eax
+# endif
+	sub	$3, %ebx
+	lea	3(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	movl	(%esi), %edx
+	movl	%edx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	3(%edi), %eax
+# endif
+	sub	$4, %ebx
+	lea	4(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit5):
+	movl	(%esi), %ecx
+	movb	%dh, 4(%edi)
+	movl	%ecx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	4(%edi), %eax
+# endif
+	sub	$5, %ebx
+	lea	5(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	movl	(%esi), %ecx
+	movw	4(%esi), %dx
+	movl	%ecx, (%edi)
+	movw	%dx, 4(%edi)
+# ifdef USE_AS_STPCPY
+	lea	5(%edi), %eax
+# endif
+	sub	$6, %ebx
+	lea	6(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	movl	(%esi), %ecx
+	movl	3(%esi), %edx
+	movl	%ecx, (%edi)
+	movl	%edx, 3(%edi)
+# ifdef USE_AS_STPCPY
+	lea	6(%edi), %eax
+# endif
+	sub	$7, %ebx
+	lea	7(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit8):
+	movlpd	(%esi), %xmm0
+	movlpd	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	7(%edi), %eax
+# endif
+	sub	$8, %ebx
+	lea	8(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit9):
+	movlpd	(%esi), %xmm0
+	movb	%dh, 8(%edi)
+	movlpd	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	8(%edi), %eax
+# endif
+	sub	$9, %ebx
+	lea	9(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	movlpd	(%esi), %xmm0
+	movw	8(%esi), %dx
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	9(%edi), %eax
+# endif
+	sub	$10, %ebx
+	lea	10(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	movlpd	(%esi), %xmm0
+	movl	7(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	10(%edi), %eax
+# endif
+	sub	$11, %ebx
+	lea	11(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	movlpd	(%esi), %xmm0
+	movl	8(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	11(%edi), %eax
+# endif
+	sub	$12, %ebx
+	lea	12(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit13):
+	movlpd	(%esi), %xmm0
+	movlpd	5(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 5(%edi)
+# ifdef USE_AS_STPCPY
+	lea	12(%edi), %eax
+# endif
+	sub	$13, %ebx
+	lea	13(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	movlpd	(%esi), %xmm0
+	movlpd	6(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 6(%edi)
+# ifdef USE_AS_STPCPY
+	lea	13(%edi), %eax
+# endif
+	sub	$14, %ebx
+	lea	14(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	movlpd	(%esi), %xmm0
+	movlpd	7(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	14(%edi), %eax
+# endif
+	sub	$15, %ebx
+	lea	15(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit16):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	15(%edi), %eax
+# endif
+	sub	$16, %ebx
+	lea	16(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit17):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+	movb	%dh, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	16(%edi), %eax
+# endif
+	sub	$17, %ebx
+	lea	17(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit18):
+	movdqu	(%esi), %xmm0
+	movw	16(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movw	%cx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	17(%edi), %eax
+# endif
+	sub	$18, %ebx
+	lea	18(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit19):
+	movdqu	(%esi), %xmm0
+	movl	15(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	18(%edi), %eax
+# endif
+	sub	$19, %ebx
+	lea	19(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit20):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	19(%edi), %eax
+# endif
+	sub	$20, %ebx
+	lea	20(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit21):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+	movb	%dh, 20(%edi)
+# ifdef USE_AS_STPCPY
+	lea	20(%edi), %eax
+# endif
+	sub	$21, %ebx
+	lea	21(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit22):
+	movdqu	(%esi), %xmm0
+	movlpd	14(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	21(%edi), %eax
+# endif
+	sub	$22, %ebx
+	lea	22(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit23):
+	movdqu	(%esi), %xmm0
+	movlpd	15(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	22(%edi), %eax
+# endif
+	sub	$23, %ebx
+	lea	23(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit24):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	23(%edi), %eax
+# endif
+	sub	$24, %ebx
+	lea	24(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit25):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movb	%dh, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	24(%edi), %eax
+# endif
+	sub	$25, %ebx
+	lea	25(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit26):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movw	24(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movw	%cx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	25(%edi), %eax
+# endif
+	sub	$26, %ebx
+	lea	26(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit27):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	23(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 23(%edi)
+# ifdef USE_AS_STPCPY
+	lea	26(%edi), %eax
+# endif
+	sub	$27, %ebx
+	lea	27(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit28):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	24(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	27(%edi), %eax
+# endif
+	sub	$28, %ebx
+	lea	28(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit29):
+	movdqu	(%esi), %xmm0
+	movdqu	13(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 13(%edi)
+# ifdef USE_AS_STPCPY
+	lea	28(%edi), %eax
+# endif
+	sub	$29, %ebx
+	lea	29(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit30):
+	movdqu	(%esi), %xmm0
+	movdqu	14(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	29(%edi), %eax
+# endif
+	sub	$30, %ebx
+	lea	30(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+
+	.p2align 4
+L(Exit31):
+	movdqu	(%esi), %xmm0
+	movdqu	15(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	30(%edi), %eax
+# endif
+	sub	$31, %ebx
+	lea	31(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit32):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	31(%edi), %eax
+# endif
+	sub	$32, %ebx
+	lea	32(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(StrncpyExit1):
+	movb	(%esi), %dl
+	movb	%dl, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	1(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit2):
+	movw	(%esi), %dx
+	movw	%dx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	2(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit3):
+	movw	(%esi), %cx
+	movb	2(%esi), %dl
+	movw	%cx, (%edi)
+	movb	%dl, 2(%edi)
+# ifdef USE_AS_STPCPY
+	lea	3(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit4):
+	movl	(%esi), %edx
+	movl	%edx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	4(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit5):
+	movl	(%esi), %ecx
+	movb	4(%esi), %dl
+	movl	%ecx, (%edi)
+	movb	%dl, 4(%edi)
+# ifdef USE_AS_STPCPY
+	lea	5(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit6):
+	movl	(%esi), %ecx
+	movw	4(%esi), %dx
+	movl	%ecx, (%edi)
+	movw	%dx, 4(%edi)
+# ifdef USE_AS_STPCPY
+	lea	6(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit7):
+	movl	(%esi), %ecx
+	movl	3(%esi), %edx
+	movl	%ecx, (%edi)
+	movl	%edx, 3(%edi)
+# ifdef USE_AS_STPCPY
+	lea	7(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit8):
+	movlpd	(%esi), %xmm0
+	movlpd	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	8(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit9):
+	movlpd	(%esi), %xmm0
+	movb	8(%esi), %dl
+	movlpd	%xmm0, (%edi)
+	movb	%dl, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	9(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit10):
+	movlpd	(%esi), %xmm0
+	movw	8(%esi), %dx
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	10(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit11):
+	movlpd	(%esi), %xmm0
+	movl	7(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	11(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit12):
+	movlpd	(%esi), %xmm0
+	movl	8(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	12(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit13):
+	movlpd	(%esi), %xmm0
+	movlpd	5(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 5(%edi)
+# ifdef USE_AS_STPCPY
+	lea	13(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit14):
+	movlpd	(%esi), %xmm0
+	movlpd	6(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 6(%edi)
+# ifdef USE_AS_STPCPY
+	lea	14(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit15):
+	movlpd	(%esi), %xmm0
+	movlpd	7(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	15(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit16):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	16(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit17):
+	movdqu	(%esi), %xmm0
+	movb	16(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movb	%cl, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	17(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit18):
+	movdqu	(%esi), %xmm0
+	movw	16(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movw	%cx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	18(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit19):
+	movdqu	(%esi), %xmm0
+	movl	15(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	19(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit20):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	20(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit21):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movb	20(%esi), %dl
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+	movb	%dl, 20(%edi)
+# ifdef USE_AS_STPCPY
+	lea	21(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit22):
+	movdqu	(%esi), %xmm0
+	movlpd	14(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	22(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit23):
+	movdqu	(%esi), %xmm0
+	movlpd	15(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	23(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit24):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	24(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit25):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movb	24(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movb	%cl, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	25(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit26):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movw	24(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movw	%cx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	26(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit27):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	23(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 23(%edi)
+# ifdef USE_AS_STPCPY
+	lea	27(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit28):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	24(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	28(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit29):
+	movdqu	(%esi), %xmm0
+	movdqu	13(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 13(%edi)
+# ifdef USE_AS_STPCPY
+	lea	29(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit30):
+	movdqu	(%esi), %xmm0
+	movdqu	14(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	30(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit31):
+	movdqu	(%esi), %xmm0
+	movdqu	15(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	31(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit32):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	32(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit33):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movb	32(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+	movb	%cl, 32(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill0):
+	RETURN
+
+	.p2align 4
+L(Fill1):
+	movb	%dl, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill2):
+	movw	%dx, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill3):
+	movl	%edx, -1(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill4):
+	movl	%edx, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill5):
+	movl	%edx, (%edi)
+	movb	%dl, 4(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill6):
+	movl	%edx, (%edi)
+	movw	%dx, 4(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill7):
+	movlpd	%xmm0, -1(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill8):
+	movlpd	%xmm0, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill9):
+	movlpd	%xmm0, (%edi)
+	movb	%dl, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill10):
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill11):
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill12):
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill13):
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm0, 5(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill14):
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm0, 6(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill15):
+	movdqu	%xmm0, -1(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill16):
+	movdqu	%xmm0, (%edi)
+	RETURN
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm2):
+	movdqu	%xmm2, (%edi, %ecx)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmmExit):
+	bsf	%edx, %edx
+	add	$15, %ebx
+	add	%ecx, %edi
+# ifdef USE_AS_STPCPY
+	lea	(%edi, %edx), %eax
+# endif
+	sub	%edx, %ebx
+	lea	1(%edi, %edx), %edi
+
+	.p2align 4
+L(StrncpyFillTailWithZero):
+	pxor	%xmm0, %xmm0
+	xor	%edx, %edx
+	sub	$16, %ebx
+	jbe	L(StrncpyFillExit)
+
+	movdqu	%xmm0, (%edi)
+	add	$16, %edi
+
+	mov	%edi, %esi
+	and	$0xf, %esi
+	sub	%esi, %edi
+	add	%esi, %ebx
+	sub	$64, %ebx
+	jb	L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+	movdqa	%xmm0, (%edi)
+	movdqa	%xmm0, 16(%edi)
+	movdqa	%xmm0, 32(%edi)
+	movdqa	%xmm0, 48(%edi)
+	add	$64, %edi
+	sub	$64, %ebx
+	jae	L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+	add	$32, %ebx
+	jl	L(StrncpyFillLess32)
+	movdqa	%xmm0, (%edi)
+	movdqa	%xmm0, 16(%edi)
+	add	$32, %edi
+	sub	$16, %ebx
+	jl	L(StrncpyFillExit)
+	movdqa	%xmm0, (%edi)
+	add	$16, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillLess32):
+	add	$16, %ebx
+	jl	L(StrncpyFillExit)
+	movdqa	%xmm0, (%edi)
+	add	$16, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillExit):
+	add	$16, %ebx
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+	.p2align 4
+L(UnalignedLeaveCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+	lea	64(%ebx), %ecx
+	and	$-16, %ecx
+	add	$48, %ebx
+	jl	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm4, (%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm5, 16(%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm6, 32(%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm7, 48(%edi)
+# ifdef USE_AS_STPCPY
+	lea	64(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Unaligned64LeaveCase2):
+	xor	%ecx, %ecx
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm4, (%edi)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm5)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm5, 16(%edi)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm6)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm6, 32(%edi)
+	lea	16(%edi, %ecx), %edi
+	lea	16(%esi, %ecx), %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(ExitZero):
+	movl	%edi, %eax
+	RETURN
+
+END (STRCPY)
+
+	.p2align 4
+	.section .rodata
+L(ExitTable):
+	.int	JMPTBL(L(Exit1), L(ExitTable))
+	.int	JMPTBL(L(Exit2), L(ExitTable))
+	.int	JMPTBL(L(Exit3), L(ExitTable))
+	.int	JMPTBL(L(Exit4), L(ExitTable))
+	.int	JMPTBL(L(Exit5), L(ExitTable))
+	.int	JMPTBL(L(Exit6), L(ExitTable))
+	.int	JMPTBL(L(Exit7), L(ExitTable))
+	.int	JMPTBL(L(Exit8), L(ExitTable))
+	.int	JMPTBL(L(Exit9), L(ExitTable))
+	.int	JMPTBL(L(Exit10), L(ExitTable))
+	.int	JMPTBL(L(Exit11), L(ExitTable))
+	.int	JMPTBL(L(Exit12), L(ExitTable))
+	.int	JMPTBL(L(Exit13), L(ExitTable))
+	.int	JMPTBL(L(Exit14), L(ExitTable))
+	.int	JMPTBL(L(Exit15), L(ExitTable))
+	.int	JMPTBL(L(Exit16), L(ExitTable))
+	.int	JMPTBL(L(Exit17), L(ExitTable))
+	.int	JMPTBL(L(Exit18), L(ExitTable))
+	.int	JMPTBL(L(Exit19), L(ExitTable))
+	.int	JMPTBL(L(Exit20), L(ExitTable))
+	.int	JMPTBL(L(Exit21), L(ExitTable))
+	.int	JMPTBL(L(Exit22), L(ExitTable))
+	.int    JMPTBL(L(Exit23), L(ExitTable))
+	.int	JMPTBL(L(Exit24), L(ExitTable))
+	.int	JMPTBL(L(Exit25), L(ExitTable))
+	.int	JMPTBL(L(Exit26), L(ExitTable))
+	.int	JMPTBL(L(Exit27), L(ExitTable))
+	.int	JMPTBL(L(Exit28), L(ExitTable))
+	.int	JMPTBL(L(Exit29), L(ExitTable))
+	.int	JMPTBL(L(Exit30), L(ExitTable))
+	.int	JMPTBL(L(Exit31), L(ExitTable))
+	.int	JMPTBL(L(Exit32), L(ExitTable))
+
+L(ExitStrncpyTable):
+	.int	JMPTBL(L(Exit0), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+	.int    JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+
+	.p2align 4
+L(FillTable):
+	.int	JMPTBL(L(Fill0), L(FillTable))
+	.int	JMPTBL(L(Fill1), L(FillTable))
+	.int	JMPTBL(L(Fill2), L(FillTable))
+	.int	JMPTBL(L(Fill3), L(FillTable))
+	.int	JMPTBL(L(Fill4), L(FillTable))
+	.int	JMPTBL(L(Fill5), L(FillTable))
+	.int	JMPTBL(L(Fill6), L(FillTable))
+	.int	JMPTBL(L(Fill7), L(FillTable))
+	.int	JMPTBL(L(Fill8), L(FillTable))
+	.int	JMPTBL(L(Fill9), L(FillTable))
+	.int	JMPTBL(L(Fill10), L(FillTable))
+	.int	JMPTBL(L(Fill11), L(FillTable))
+	.int	JMPTBL(L(Fill12), L(FillTable))
+	.int	JMPTBL(L(Fill13), L(FillTable))
+	.int	JMPTBL(L(Fill14), L(FillTable))
+	.int	JMPTBL(L(Fill15), L(FillTable))
+	.int	JMPTBL(L(Fill16), L(FillTable))
+# else
+#  define PARMS  4
+#  define ENTRANCE
+#  define RETURN  POP (%edi); ret; CFI_PUSH (%edi)
+#  define RETURN1  ret
+
+	.text
+ENTRY (STRCPY)
+	ENTRANCE
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %ecx
+
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	cmpb	$0, 3(%ecx)
+	jz	L(ExitTail4)
+	cmpb	$0, 4(%ecx)
+	jz	L(ExitTail5)
+	cmpb	$0, 5(%ecx)
+	jz	L(ExitTail6)
+	cmpb	$0, 6(%ecx)
+	jz	L(ExitTail7)
+	cmpb	$0, 7(%ecx)
+	jz	L(ExitTail8)
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	cmpb	$0, 11(%ecx)
+	jz	L(ExitTail12)
+	cmpb	$0, 12(%ecx)
+	jz	L(ExitTail13)
+	cmpb	$0, 13(%ecx)
+	jz	L(ExitTail14)
+	cmpb	$0, 14(%ecx)
+	jz	L(ExitTail15)
+	cmpb	$0, 15(%ecx)
+	jz	L(ExitTail16)
+
+	PUSH	(%edi)
+	PUSH	(%ebx)
+
+	mov	%edx, %edi
+	lea	16(%ecx), %ebx
+	and	$-16, %ebx
+	pxor	%xmm0, %xmm0
+	movdqu	(%ecx), %xmm1
+	movdqu	%xmm1, (%edx)
+	pcmpeqb	(%ebx), %xmm0
+	pmovmskb %xmm0, %eax
+	sub	%ecx, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%ecx, %eax
+	lea	16(%ecx), %ecx
+	and	$-16, %ecx
+	sub	%ecx, %eax
+	sub	%eax, %edx
+	xor	%ebx, %ebx
+
+	.p2align 4
+	movdqa	(%ecx), %xmm1
+	movaps	16(%ecx), %xmm2
+	movdqu	%xmm1, (%edx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm3
+	movdqu	%xmm2, (%edx, %ebx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm4
+	movdqu	%xmm3, (%edx, %ebx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm1
+	movdqu	%xmm4, (%edx, %ebx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm2
+	movdqu	%xmm1, (%edx, %ebx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm3
+	movdqu	%xmm2, (%edx, %ebx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movdqu	%xmm3, (%edx, %ebx)
+	mov	%ecx, %eax
+	lea	16(%ecx, %ebx), %ecx
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	sub	%eax, %edx
+
+L(Aligned64Loop):
+	movaps	(%ecx), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%ecx), %xmm5
+	movaps	32(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm5, %xmm2
+	add	$64, %ecx
+	pminub	%xmm7, %xmm3
+	add	$64, %edx
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(Aligned64Leave)
+L(Aligned64Loop_start):
+	movdqu	%xmm4, -64(%edx)
+	movaps	(%ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%edx)
+	movaps	16(%ecx), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%ecx), %xmm3
+	movdqu	%xmm6, -32(%edx)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%edx)
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$64, %edx
+	add	$64, %ecx
+	test	%eax, %eax
+	jz	L(Aligned64Loop_start)
+L(Aligned64Leave):
+	sub	$0xa0, %ebx
+	pxor	%xmm0, %xmm0
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %eax
+	movdqu	%xmm4, -64(%edx)
+	test	%eax, %eax
+	lea	16(%ebx), %ebx
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %eax
+	movdqu	%xmm5, -48(%edx)
+	test	%eax, %eax
+	lea	16(%ebx), %ebx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movdqu	%xmm6, -32(%edx)
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%ebx), %ebx
+
+/*-----------------End of main part---------------------------*/
+
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%ebx, %edx
+	add	%ebx, %ecx
+
+	POP	(%ebx)
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	/* Exit 8 */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	7(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	/* Exit 16 */
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm0
+	movlpd	%xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	15(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	1(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+# ifdef USE_AS_STPCPY
+	lea	2(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	3(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	4(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	5(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+	lea	6(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit9):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	8(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	9(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	10(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	11(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit13):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+	lea	12(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+	lea	13(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	14(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+CFI_POP (%edi)
+
+	.p2align 4
+L(ExitTail1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	movl	%edx, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitTail2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	1(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+# ifdef USE_AS_STPCPY
+	lea	2(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	3(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	4(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	5(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+	lea	6(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail8):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	7(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail9):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	8(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail10):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	9(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail11):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	10(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail12):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	11(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail13):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+	lea	12(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail14):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+	lea	13(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail15):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	14(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail16):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm0
+	movlpd	%xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	15(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+END (STRCPY)
+# endif
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
new file mode 100644
index 0000000000..effd85da94
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
@@ -0,0 +1,3901 @@
+/* strcpy with SSSE3
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+#  include <sysdep.h>
+
+#  define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#  define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#  define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#  define POP(REG)	popl REG; CFI_POP (REG)
+
+#  ifndef STRCPY
+#   define STRCPY  __strcpy_ssse3
+#  endif
+
+#  ifdef USE_AS_STRNCPY
+#   define PARMS  8
+#   define ENTRANCE PUSH (%ebx)
+#   define RETURN  POP (%ebx); ret; CFI_PUSH (%ebx);
+#   define RETURN1  POP (%edi); POP (%ebx); ret; CFI_PUSH (%ebx); CFI_PUSH (%edi)
+#  else
+#   define PARMS  4
+#   define ENTRANCE
+#   define RETURN  ret
+#   define RETURN1  POP (%edi); ret; CFI_PUSH (%edi)
+#  endif
+
+#  ifdef USE_AS_STPCPY
+#   define SAVE_RESULT(n)  lea	n(%edx), %eax
+#   define SAVE_RESULT_TAIL(n)  lea	n(%edx), %eax
+#  else
+#   define SAVE_RESULT(n)  movl	%edi, %eax
+#   define SAVE_RESULT_TAIL(n)  movl	%edx, %eax
+#  endif
+
+#  define STR1  PARMS
+#  define STR2  STR1+4
+#  define LEN  STR2+4
+
+/* In this code following instructions are used for copying:
+	movb	- 1 byte
+	movw	- 2 byte
+	movl	- 4 byte
+	movlpd	- 8 byte
+	movaps	- 16 byte - requires 16 byte alignment
+	of	sourse and destination adresses.
+*/
+
+.text
+ENTRY (STRCPY)
+	ENTRANCE
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %ecx
+#  ifdef USE_AS_STRNCPY
+	movl	LEN(%esp), %ebx
+	cmp	$8, %ebx
+	jbe	L(StrncpyExit8Bytes)
+#  endif
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	cmpb	$0, 3(%ecx)
+	jz	L(ExitTail4)
+	cmpb	$0, 4(%ecx)
+	jz	L(ExitTail5)
+	cmpb	$0, 5(%ecx)
+	jz	L(ExitTail6)
+	cmpb	$0, 6(%ecx)
+	jz	L(ExitTail7)
+	cmpb	$0, 7(%ecx)
+	jz	L(ExitTail8)
+#  ifdef USE_AS_STRNCPY
+	cmp	$16, %ebx
+	jb	L(StrncpyExit15Bytes)
+#  endif
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	cmpb	$0, 11(%ecx)
+	jz	L(ExitTail12)
+	cmpb	$0, 12(%ecx)
+	jz	L(ExitTail13)
+	cmpb	$0, 13(%ecx)
+	jz	L(ExitTail14)
+	cmpb	$0, 14(%ecx)
+	jz	L(ExitTail15)
+#  ifdef USE_AS_STRNCPY
+	cmp	$16, %ebx
+	je	L(ExitTail16)
+#  endif
+	cmpb	$0, 15(%ecx)
+	jz	L(ExitTail16)
+
+	PUSH	(%edi)
+	mov	%edx, %edi
+# endif
+	PUSH	(%esi)
+# ifdef USE_AS_STRNCPY
+	mov	%ecx, %esi
+	sub	$16, %ebx
+	and	$0xf, %esi
+
+/* add 16 bytes ecx_offset to ebx */
+
+	add	%esi, %ebx
+# endif
+	lea	16(%ecx), %esi
+	and	$-16, %esi
+	pxor	%xmm0, %xmm0
+	movlpd	(%ecx), %xmm1
+	movlpd	%xmm1, (%edx)
+
+	pcmpeqb	(%esi), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm1, 8(%edx)
+
+	pmovmskb %xmm0, %eax
+	sub	%ecx, %esi
+
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%edx, %eax
+	lea	16(%edx), %edx
+	and	$-16, %edx
+	sub	%edx, %eax
+
+# ifdef USE_AS_STRNCPY
+	add	%eax, %esi
+	lea	-1(%esi), %esi
+	and	$1<<31, %esi
+	test	%esi, %esi
+	jnz	L(ContinueCopy)
+	lea	16(%ebx), %ebx
+
+L(ContinueCopy):
+# endif
+	sub	%eax, %ecx
+	mov	%ecx, %eax
+	and	$0xf, %eax
+	mov	$0, %esi
+
+/* case: ecx_offset == edx_offset */
+
+	jz	L(Align16Both)
+
+	cmp	$8, %eax
+	jae	L(ShlHigh8)
+	cmp	$1, %eax
+	je	L(Shl1)
+	cmp	$2, %eax
+	je	L(Shl2)
+	cmp	$3, %eax
+	je	L(Shl3)
+	cmp	$4, %eax
+	je	L(Shl4)
+	cmp	$5, %eax
+	je	L(Shl5)
+	cmp	$6, %eax
+	je	L(Shl6)
+	jmp	L(Shl7)
+
+L(ShlHigh8):
+	je	L(Shl8)
+	cmp	$9, %eax
+	je	L(Shl9)
+	cmp	$10, %eax
+	je	L(Shl10)
+	cmp	$11, %eax
+	je	L(Shl11)
+	cmp	$12, %eax
+	je	L(Shl12)
+	cmp	$13, %eax
+	je	L(Shl13)
+	cmp	$14, %eax
+	je	L(Shl14)
+	jmp	L(Shl15)
+
+L(Align16Both):
+	movaps	(%ecx), %xmm1
+	movaps	16(%ecx), %xmm2
+	movaps	%xmm1, (%edx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm4
+	movaps	%xmm3, (%edx, %esi)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm1
+	movaps	%xmm4, (%edx, %esi)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm2
+	movaps	%xmm1, (%edx, %esi)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%edx, %esi)
+	mov	%ecx, %eax
+	lea	16(%ecx, %esi), %ecx
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	lea	112(%ebx, %eax), %ebx
+# endif
+	mov	$-0x40, %esi
+
+L(Aligned64Loop):
+	movaps	(%ecx), %xmm2
+	movaps	32(%ecx), %xmm3
+	movaps	%xmm2, %xmm4
+	movaps	16(%ecx), %xmm5
+	movaps	%xmm3, %xmm6
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	lea	64(%edx), %edx
+	pcmpeqb	%xmm0, %xmm3
+	lea	64(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeaveCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%edx)
+	movaps	%xmm5, -48(%edx)
+	movaps	%xmm6, -32(%edx)
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):
+# ifdef USE_AS_STRNCPY
+	lea	48(%ebx), %ebx
+# endif
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%ebx), %ebx
+# endif
+	pmovmskb %xmm0, %eax
+	movaps	%xmm4, -64(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%ebx), %ebx
+# endif
+	pmovmskb %xmm0, %eax
+	movaps	%xmm5, -48(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%edx)
+	pcmpeqb	%xmm7, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%ebx), %ebx
+# endif
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl1):
+	movaps	-1(%ecx), %xmm1
+	movaps	15(%ecx), %xmm2
+L(Shl1Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	31(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-15(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-1(%ecx), %xmm1
+
+L(Shl1LoopStart):
+	movaps	15(%ecx), %xmm2
+	movaps	31(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	47(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	63(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$1, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$1, %xmm3, %xmm4
+	jnz	L(Shl1Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave1)
+# endif
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl1LoopStart)
+
+L(Shl1LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+	mov	$15, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl2):
+	movaps	-2(%ecx), %xmm1
+	movaps	14(%ecx), %xmm2
+L(Shl2Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	30(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-14(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-2(%ecx), %xmm1
+
+L(Shl2LoopStart):
+	movaps	14(%ecx), %xmm2
+	movaps	30(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	46(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	62(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$2, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$2, %xmm3, %xmm4
+	jnz	L(Shl2Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave2)
+# endif
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl2LoopStart)
+
+L(Shl2LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	6(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	mov	$14, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl3):
+	movaps	-3(%ecx), %xmm1
+	movaps	13(%ecx), %xmm2
+L(Shl3Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	29(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-13(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-3(%ecx), %xmm1
+
+L(Shl3LoopStart):
+	movaps	13(%ecx), %xmm2
+	movaps	29(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	45(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	61(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$3, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$3, %xmm3, %xmm4
+	jnz	L(Shl3Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave3)
+# endif
+	palignr	$3, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl3LoopStart)
+
+L(Shl3LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	mov	$13, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl4):
+	movaps	-4(%ecx), %xmm1
+	movaps	12(%ecx), %xmm2
+L(Shl4Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	28(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-12(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+	movaps	12(%ecx), %xmm2
+	movaps	28(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$4, %xmm3, %xmm4
+	jnz	L(Shl4Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave4)
+# endif
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 8(%edx)
+	mov	$12, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl5):
+	movaps	-5(%ecx), %xmm1
+	movaps	11(%ecx), %xmm2
+L(Shl5Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	27(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-11(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-5(%ecx), %xmm1
+
+L(Shl5LoopStart):
+	movaps	11(%ecx), %xmm2
+	movaps	27(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	43(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	59(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$5, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$5, %xmm3, %xmm4
+	jnz	L(Shl5Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave5)
+# endif
+	palignr	$5, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl5LoopStart)
+
+L(Shl5LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 7(%edx)
+	mov	$11, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl6):
+	movaps	-6(%ecx), %xmm1
+	movaps	10(%ecx), %xmm2
+L(Shl6Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	26(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-10(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-6(%ecx), %xmm1
+
+L(Shl6LoopStart):
+	movaps	10(%ecx), %xmm2
+	movaps	26(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	42(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	58(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$6, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$6, %xmm3, %xmm4
+	jnz	L(Shl6Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave6)
+# endif
+	palignr	$6, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl6LoopStart)
+
+L(Shl6LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	6(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 6(%edx)
+	mov	$10, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl7):
+	movaps	-7(%ecx), %xmm1
+	movaps	9(%ecx), %xmm2
+L(Shl7Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	25(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-9(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-7(%ecx), %xmm1
+
+L(Shl7LoopStart):
+	movaps	9(%ecx), %xmm2
+	movaps	25(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	41(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	57(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$7, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$7, %xmm3, %xmm4
+	jnz	L(Shl7Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave7)
+# endif
+	palignr	$7, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl7LoopStart)
+
+L(Shl7LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	5(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 5(%edx)
+	mov	$9, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl8):
+	movaps	-8(%ecx), %xmm1
+	movaps	8(%ecx), %xmm2
+L(Shl8Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	24(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-8(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+	movaps	8(%ecx), %xmm2
+	movaps	24(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$8, %xmm3, %xmm4
+	jnz	L(Shl8Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave8)
+# endif
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	mov	$8, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl9):
+	movaps	-9(%ecx), %xmm1
+	movaps	7(%ecx), %xmm2
+L(Shl9Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	23(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-7(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-9(%ecx), %xmm1
+
+L(Shl9LoopStart):
+	movaps	7(%ecx), %xmm2
+	movaps	23(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	39(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	55(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$9, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$9, %xmm3, %xmm4
+	jnz	L(Shl9Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave9)
+# endif
+	palignr	$9, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl9LoopStart)
+
+L(Shl9LoopExit):
+	movlpd	-1(%ecx), %xmm0
+	movlpd	%xmm0, -1(%edx)
+	mov	$7, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl10):
+	movaps	-10(%ecx), %xmm1
+	movaps	6(%ecx), %xmm2
+L(Shl10Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	22(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-6(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-10(%ecx), %xmm1
+
+L(Shl10LoopStart):
+	movaps	6(%ecx), %xmm2
+	movaps	22(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	38(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	54(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$10, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$10, %xmm3, %xmm4
+	jnz	L(Shl10Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave10)
+# endif
+	palignr	$10, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl10LoopStart)
+
+L(Shl10LoopExit):
+	movlpd	-2(%ecx), %xmm0
+	movlpd	%xmm0, -2(%edx)
+	mov	$6, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl11):
+	movaps	-11(%ecx), %xmm1
+	movaps	5(%ecx), %xmm2
+L(Shl11Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	21(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-5(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-11(%ecx), %xmm1
+
+L(Shl11LoopStart):
+	movaps	5(%ecx), %xmm2
+	movaps	21(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	37(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	53(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$11, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$11, %xmm3, %xmm4
+	jnz	L(Shl11Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave11)
+# endif
+	palignr	$11, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl11LoopStart)
+
+L(Shl11LoopExit):
+	movlpd	-3(%ecx), %xmm0
+	movlpd	%xmm0, -3(%edx)
+	mov	$5, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl12):
+	movaps	-12(%ecx), %xmm1
+	movaps	4(%ecx), %xmm2
+L(Shl12Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	20(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-4(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+	movaps	4(%ecx), %xmm2
+	movaps	20(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$12, %xmm3, %xmm4
+	jnz	L(Shl12Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave12)
+# endif
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+	movl	(%ecx), %esi
+	movl	%esi, (%edx)
+	mov	$4, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl13):
+	movaps	-13(%ecx), %xmm1
+	movaps	3(%ecx), %xmm2
+L(Shl13Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	19(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-3(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-13(%ecx), %xmm1
+
+L(Shl13LoopStart):
+	movaps	3(%ecx), %xmm2
+	movaps	19(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	35(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	51(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$13, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$13, %xmm3, %xmm4
+	jnz	L(Shl13Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave13)
+# endif
+	palignr	$13, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl13LoopStart)
+
+L(Shl13LoopExit):
+	movl	-1(%ecx), %esi
+	movl	%esi, -1(%edx)
+	mov	$3, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl14):
+	movaps	-14(%ecx), %xmm1
+	movaps	2(%ecx), %xmm2
+L(Shl14Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	18(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-2(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-14(%ecx), %xmm1
+
+L(Shl14LoopStart):
+	movaps	2(%ecx), %xmm2
+	movaps	18(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	34(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	50(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$14, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$14, %xmm3, %xmm4
+	jnz	L(Shl14Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave14)
+# endif
+	palignr	$14, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl14LoopStart)
+
+L(Shl14LoopExit):
+	movl	-2(%ecx), %esi
+	movl	%esi, -2(%edx)
+	mov	$2, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl15):
+	movaps	-15(%ecx), %xmm1
+	movaps	1(%ecx), %xmm2
+L(Shl15Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	17(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-1(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-15(%ecx), %xmm1
+
+L(Shl15LoopStart):
+	movaps	1(%ecx), %xmm2
+	movaps	17(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	33(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	49(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$15, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$15, %xmm3, %xmm4
+	jnz	L(Shl15Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave15)
+# endif
+	palignr	$15, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl15LoopStart)
+
+L(Shl15LoopExit):
+	movl	-3(%ecx), %esi
+	movl	%esi, -3(%edx)
+	mov	$1, %esi
+# ifdef USE_AS_STRCAT
+	jmp	L(CopyFrom1To16Bytes)
+# endif
+
+
+# ifndef USE_AS_STRCAT
+
+	.p2align 4
+L(CopyFrom1To16Bytes):
+#  ifdef USE_AS_STRNCPY
+	add	$16, %ebx
+#  endif
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh8)
+
+L(CopyFrom1To16BytesLess8):
+	mov	%al, %ah
+	and	$15, %ah
+	jz	L(ExitHigh4)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+
+	.p2align 4
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	SAVE_RESULT	(3)
+#  ifdef USE_AS_STRNCPY
+	sub	$4, %ebx
+	lea	4(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(ExitHigh4):
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+
+	.p2align 4
+L(Exit8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	SAVE_RESULT	(7)
+#  ifdef USE_AS_STRNCPY
+	sub	$8, %ebx
+	lea	8(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(ExitHigh8):
+	mov	%ah, %al
+	and	$15, %al
+	jz	L(ExitHigh12)
+
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+
+	.p2align 4
+L(Exit12):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT	(11)
+#  ifdef USE_AS_STRNCPY
+	sub	$12, %ebx
+	lea	12(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(ExitHigh12):
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+
+	.p2align 4
+L(Exit16):
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	SAVE_RESULT	(15)
+#  ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	lea	16(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+#   ifdef USE_AS_STRNCPY
+
+	CFI_PUSH(%esi)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%esi, %ecx
+	add	%esi, %edx
+
+	POP	(%esi)
+
+	test	%al, %al
+	jz	L(ExitHighCase2)
+
+	cmp	$8, %ebx
+	ja	L(CopyFrom1To16BytesLess8)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(Exit7)
+	jmp	L(Exit8)
+
+	.p2align 4
+L(ExitHighCase2):
+	cmp	$8, %ebx
+	jbe	L(CopyFrom1To16BytesLess8Case3)
+
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	cmp	$9, %ebx
+	je	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(Exit11)
+	test	$0x8, %ah
+	jnz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	cmp	$15, %ebx
+	je	L(Exit15)
+	jmp	L(Exit16)
+
+	CFI_PUSH(%esi)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+
+	cmp	$8, %ebx
+	ja	L(ExitHigh8Case3)
+
+L(CopyFrom1To16BytesLess8Case3):
+	cmp	$4, %ebx
+	ja	L(ExitHigh4Case3)
+
+	cmp	$1, %ebx
+	je	L(Exit1)
+	cmp	$2, %ebx
+	je	L(Exit2)
+	cmp	$3, %ebx
+	je	L(Exit3)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	SAVE_RESULT	(4)
+	RETURN1
+
+	.p2align 4
+L(ExitHigh4Case3):
+	cmp	$5, %ebx
+	je	L(Exit5)
+	cmp	$6, %ebx
+	je	L(Exit6)
+	cmp	$7, %ebx
+	je	L(Exit7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	SAVE_RESULT	(8)
+	RETURN1
+
+	.p2align 4
+L(ExitHigh8Case3):
+	cmp	$12, %ebx
+	ja	L(ExitHigh12Case3)
+
+	cmp	$9, %ebx
+	je	L(Exit9)
+	cmp	$10, %ebx
+	je	L(Exit10)
+	cmp	$11, %ebx
+	je	L(Exit11)
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT	(12)
+	RETURN1
+
+	.p2align 4
+L(ExitHigh12Case3):
+	cmp	$13, %ebx
+	je	L(Exit13)
+	cmp	$14, %ebx
+	je	L(Exit14)
+	cmp	$15, %ebx
+	je	L(Exit15)
+	movlpd	(%ecx), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+	SAVE_RESULT	(16)
+	RETURN1
+
+#  endif
+
+	.p2align 4
+L(Exit1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	SAVE_RESULT	(0)
+#  ifdef USE_AS_STRNCPY
+	sub	$1, %ebx
+	lea	1(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	SAVE_RESULT	(1)
+#  ifdef USE_AS_STRNCPY
+	sub	$2, %ebx
+	lea	2(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+	SAVE_RESULT	(2)
+#  ifdef USE_AS_STRNCPY
+	sub	$3, %ebx
+	lea	3(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+	SAVE_RESULT	(4)
+#  ifdef USE_AS_STRNCPY
+	sub	$5, %ebx
+	lea	5(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+	SAVE_RESULT	(5)
+#  ifdef USE_AS_STRNCPY
+	sub	$6, %ebx
+	lea	6(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+	SAVE_RESULT	(6)
+#  ifdef USE_AS_STRNCPY
+	sub	$7, %ebx
+	lea	7(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit9):
+	movlpd	(%ecx), %xmm0
+	movb	8(%ecx), %al
+	movlpd	%xmm0, (%edx)
+	movb	%al, 8(%edx)
+	SAVE_RESULT	(8)
+#  ifdef USE_AS_STRNCPY
+	sub	$9, %ebx
+	lea	9(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit10):
+	movlpd	(%ecx), %xmm0
+	movw	8(%ecx), %ax
+	movlpd	%xmm0, (%edx)
+	movw	%ax, 8(%edx)
+	SAVE_RESULT	(9)
+#  ifdef USE_AS_STRNCPY
+	sub	$10, %ebx
+	lea	10(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit11):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 7(%edx)
+	SAVE_RESULT	(10)
+#  ifdef USE_AS_STRNCPY
+	sub	$11, %ebx
+	lea	11(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit13):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	SAVE_RESULT	(12)
+#  ifdef USE_AS_STRNCPY
+	sub	$13, %ebx
+	lea	13(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit14):
+	movlpd	(%ecx), %xmm0
+	movlpd	6(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	SAVE_RESULT	(13)
+#  ifdef USE_AS_STRNCPY
+	sub	$14, %ebx
+	lea	14(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit15):
+	movlpd	(%ecx), %xmm0
+	movlpd	7(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+	SAVE_RESULT	(14)
+#  ifdef USE_AS_STRNCPY
+	sub	$15, %ebx
+	lea	15(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN1
+
+CFI_POP	(%edi)
+
+#  ifdef USE_AS_STRNCPY
+	.p2align 4
+L(Fill0):
+	RETURN
+
+	.p2align 4
+L(Fill1):
+	movb	%dl, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill2):
+	movw	%dx, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill3):
+	movw	%dx, (%ecx)
+	movb	%dl, 2(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill4):
+	movl	%edx, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill5):
+	movl	%edx, (%ecx)
+	movb	%dl, 4(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill6):
+	movl	%edx, (%ecx)
+	movw	%dx, 4(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill7):
+	movl	%edx, (%ecx)
+	movl	%edx, 3(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill8):
+	movlpd	%xmm0, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill9):
+	movlpd	%xmm0, (%ecx)
+	movb	%dl, 8(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill10):
+	movlpd	%xmm0, (%ecx)
+	movw	%dx, 8(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill11):
+	movlpd	%xmm0, (%ecx)
+	movl	%edx, 7(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill12):
+	movlpd	%xmm0, (%ecx)
+	movl	%edx, 8(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill13):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 5(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill14):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 6(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill15):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 7(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill16):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 8(%ecx)
+	RETURN
+
+	.p2align 4
+L(StrncpyFillExit1):
+	lea	16(%ebx), %ebx
+L(FillFrom1To16Bytes):
+	test	%ebx, %ebx
+	jz	L(Fill0)
+	cmp	$16, %ebx
+	je	L(Fill16)
+	cmp	$8, %ebx
+	je	L(Fill8)
+	jg	L(FillMore8)
+	cmp	$4, %ebx
+	je	L(Fill4)
+	jg	L(FillMore4)
+	cmp	$2, %ebx
+	jl	L(Fill1)
+	je	L(Fill2)
+	jg	L(Fill3)
+L(FillMore8):	/* but less than 16 */
+	cmp	$12, %ebx
+	je	L(Fill12)
+	jl	L(FillLess12)
+	cmp	$14, %ebx
+	jl	L(Fill13)
+	je	L(Fill14)
+	jg	L(Fill15)
+L(FillMore4):	/* but less than 8 */
+	cmp	$6, %ebx
+	jl	L(Fill5)
+	je	L(Fill6)
+	jg	L(Fill7)
+L(FillLess12):	/* but more than 8 */
+	cmp	$10, %ebx
+	jl	L(Fill9)
+	je	L(Fill10)
+	jmp	L(Fill11)
+
+	CFI_PUSH(%edi)
+
+	.p2align 4
+L(StrncpyFillTailWithZero1):
+	POP	(%edi)
+L(StrncpyFillTailWithZero):
+	pxor	%xmm0, %xmm0
+	xor	%edx, %edx
+	sub	$16, %ebx
+	jbe	L(StrncpyFillExit1)
+
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 8(%ecx)
+
+	lea	16(%ecx), %ecx
+
+	mov	%ecx, %edx
+	and	$0xf, %edx
+	sub	%edx, %ecx
+	add	%edx, %ebx
+	xor	%edx, %edx
+	sub	$64, %ebx
+	jb	L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+	movdqa	%xmm0, (%ecx)
+	movdqa	%xmm0, 16(%ecx)
+	movdqa	%xmm0, 32(%ecx)
+	movdqa	%xmm0, 48(%ecx)
+	lea	64(%ecx), %ecx
+	sub	$64, %ebx
+	jae	L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+	add	$32, %ebx
+	jl	L(StrncpyFillLess32)
+	movdqa	%xmm0, (%ecx)
+	movdqa	%xmm0, 16(%ecx)
+	lea	32(%ecx), %ecx
+	sub	$16, %ebx
+	jl	L(StrncpyFillExit1)
+	movdqa	%xmm0, (%ecx)
+	lea	16(%ecx), %ecx
+	jmp	L(FillFrom1To16Bytes)
+
+L(StrncpyFillLess32):
+	add	$16, %ebx
+	jl	L(StrncpyFillExit1)
+	movdqa	%xmm0, (%ecx)
+	lea	16(%ecx), %ecx
+	jmp	L(FillFrom1To16Bytes)
+#  endif
+
+	.p2align 4
+L(ExitTail1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	SAVE_RESULT_TAIL (0)
+#  ifdef USE_AS_STRNCPY
+	sub	$1, %ebx
+	lea	1(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	SAVE_RESULT_TAIL (1)
+#  ifdef USE_AS_STRNCPY
+	sub	$2, %ebx
+	lea	2(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+	SAVE_RESULT_TAIL (2)
+#  ifdef USE_AS_STRNCPY
+	sub	$3, %ebx
+	lea	3(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	SAVE_RESULT_TAIL (3)
+#  ifdef USE_AS_STRNCPY
+	sub	$4, %ebx
+	lea	4(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+	SAVE_RESULT_TAIL (4)
+#  ifdef USE_AS_STRNCPY
+	sub	$5, %ebx
+	lea	5(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+	SAVE_RESULT_TAIL (5)
+#  ifdef USE_AS_STRNCPY
+	sub	$6, %ebx
+	lea	6(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+	SAVE_RESULT_TAIL (6)
+#  ifdef USE_AS_STRNCPY
+	sub	$7, %ebx
+	lea	7(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	SAVE_RESULT_TAIL (7)
+#  ifdef USE_AS_STRNCPY
+	sub	$8, %ebx
+	lea	8(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail9):
+	movlpd	(%ecx), %xmm0
+	movb	8(%ecx), %al
+	movlpd	%xmm0, (%edx)
+	movb	%al, 8(%edx)
+	SAVE_RESULT_TAIL (8)
+#  ifdef USE_AS_STRNCPY
+	sub	$9, %ebx
+	lea	9(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail10):
+	movlpd	(%ecx), %xmm0
+	movw	8(%ecx), %ax
+	movlpd	%xmm0, (%edx)
+	movw	%ax, 8(%edx)
+	SAVE_RESULT_TAIL (9)
+#  ifdef USE_AS_STRNCPY
+	sub	$10, %ebx
+	lea	10(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail11):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 7(%edx)
+	SAVE_RESULT_TAIL (10)
+#  ifdef USE_AS_STRNCPY
+	sub	$11, %ebx
+	lea	11(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail12):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT_TAIL (11)
+#  ifdef USE_AS_STRNCPY
+	sub	$12, %ebx
+	lea	12(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail13):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	SAVE_RESULT_TAIL (12)
+#  ifdef USE_AS_STRNCPY
+	sub	$13, %ebx
+	lea	13(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail14):
+	movlpd	(%ecx), %xmm0
+	movlpd	6(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	SAVE_RESULT_TAIL (13)
+#  ifdef USE_AS_STRNCPY
+	sub	$14, %ebx
+	lea	14(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail15):
+	movlpd	(%ecx), %xmm0
+	movlpd	7(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+	SAVE_RESULT_TAIL (14)
+#  ifdef USE_AS_STRNCPY
+	sub	$15, %ebx
+	lea	15(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail16):
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	SAVE_RESULT_TAIL (15)
+#  ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	lea	16(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+# endif
+
+# ifdef USE_AS_STRNCPY
+#  ifndef USE_AS_STRCAT
+	CFI_PUSH (%esi)
+	CFI_PUSH (%edi)
+#  endif
+	.p2align 4
+L(StrncpyLeaveCase2OrCase3):
+	test	%eax, %eax
+	jnz	L(Aligned64LeaveCase2)
+
+L(Aligned64LeaveCase3):
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm4, -64(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm5, -48(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm6, -32(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(Aligned64LeaveCase2):
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm4, -64(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm5, -48(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm6, -32(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+	jmp	L(CopyFrom1To16BytesCase2)
+
+/*--------------------------------------------------*/
+	.p2align 4
+L(StrncpyExit1Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	7(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+	mov	$15, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit2Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	6(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	mov	$14, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit3Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	mov	$13, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit4Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 8(%edx)
+	mov	$12, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit5Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 7(%edx)
+	mov	$11, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit6Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	6(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 6(%edx)
+	mov	$10, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit7Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	5(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 5(%edx)
+	mov	$9, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit8Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	mov	$8, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit9Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	mov	$7, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit10Case2OrCase3):
+	movlpd	-1(%ecx), %xmm0
+	movlpd	%xmm0, -1(%edx)
+	mov	$6, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit11Case2OrCase3):
+	movlpd	-2(%ecx), %xmm0
+	movlpd	%xmm0, -2(%edx)
+	mov	$5, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit12Case2OrCase3):
+	movl	(%ecx), %esi
+	movl	%esi, (%edx)
+	mov	$4, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit13Case2OrCase3):
+	movl	-1(%ecx), %esi
+	movl	%esi, -1(%edx)
+	mov	$3, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit14Case2OrCase3):
+	movl	-2(%ecx), %esi
+	movl	%esi, -2(%edx)
+	mov	$2, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit15Case2OrCase3):
+	movl	-3(%ecx), %esi
+	movl	%esi, -3(%edx)
+	mov	$1, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave1):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit1)
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1)
+	palignr	$1, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit1):
+	lea	15(%edx, %esi), %edx
+	lea	15(%ecx, %esi), %ecx
+	movdqu	-16(%ecx), %xmm0
+	xor	%esi, %esi
+	movdqu	%xmm0, -16(%edx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave2):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit2)
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2)
+	palignr	$2, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit2):
+	lea	14(%edx, %esi), %edx
+	lea	14(%ecx, %esi), %ecx
+	movdqu	-16(%ecx), %xmm0
+	xor	%esi, %esi
+	movdqu	%xmm0, -16(%edx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave3):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit3)
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3)
+	palignr	$3, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit3):
+	lea	13(%edx, %esi), %edx
+	lea	13(%ecx, %esi), %ecx
+	movdqu	-16(%ecx), %xmm0
+	xor	%esi, %esi
+	movdqu	%xmm0, -16(%edx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave4):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit4)
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4)
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit4):
+	lea	12(%edx, %esi), %edx
+	lea	12(%ecx, %esi), %ecx
+	movlpd	-12(%ecx), %xmm0
+	movl	-4(%ecx), %eax
+	movlpd	%xmm0, -12(%edx)
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave5):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit5)
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5)
+	palignr	$5, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit5):
+	lea	11(%edx, %esi), %edx
+	lea	11(%ecx, %esi), %ecx
+	movlpd	-11(%ecx), %xmm0
+	movl	-4(%ecx), %eax
+	movlpd	%xmm0, -11(%edx)
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave6):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit6)
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6)
+	palignr	$6, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit6):
+	lea	10(%edx, %esi), %edx
+	lea	10(%ecx, %esi), %ecx
+
+	movlpd	-10(%ecx), %xmm0
+	movw	-2(%ecx), %ax
+	movlpd	%xmm0, -10(%edx)
+	movw	%ax, -2(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave7):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit7)
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7)
+	palignr	$7, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit7):
+	lea	9(%edx, %esi), %edx
+	lea	9(%ecx, %esi), %ecx
+
+	movlpd	-9(%ecx), %xmm0
+	movb	-1(%ecx), %ah
+	movlpd	%xmm0, -9(%edx)
+	movb	%ah, -1(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave8):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit8)
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8)
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit8):
+	lea	8(%edx, %esi), %edx
+	lea	8(%ecx, %esi), %ecx
+	movlpd	-8(%ecx), %xmm0
+	movlpd	%xmm0, -8(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave9):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit9)
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9)
+	palignr	$9, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit9):
+	lea	7(%edx, %esi), %edx
+	lea	7(%ecx, %esi), %ecx
+
+	movlpd	-8(%ecx), %xmm0
+	movlpd	%xmm0, -8(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave10):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit10)
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10)
+	palignr	$10, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit10):
+	lea	6(%edx, %esi), %edx
+	lea	6(%ecx, %esi), %ecx
+
+	movlpd	-8(%ecx), %xmm0
+	movlpd	%xmm0, -8(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave11):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit11)
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11)
+	palignr	$11, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit11):
+	lea	5(%edx, %esi), %edx
+	lea	5(%ecx, %esi), %ecx
+	movl	-5(%ecx), %esi
+	movb	-1(%ecx), %ah
+	movl	%esi, -5(%edx)
+	movb	%ah, -1(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave12):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit12)
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12)
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit12):
+	lea	4(%edx, %esi), %edx
+	lea	4(%ecx, %esi), %ecx
+	movl	-4(%ecx), %eax
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave13):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit13)
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13)
+	palignr	$13, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit13):
+	lea	3(%edx, %esi), %edx
+	lea	3(%ecx, %esi), %ecx
+
+	movl	-4(%ecx), %eax
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave14):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit14)
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14)
+	palignr	$14, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit14):
+	lea	2(%edx, %esi), %edx
+	lea	2(%ecx, %esi), %ecx
+	movw	-2(%ecx), %ax
+	movw	%ax, -2(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave15):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit15)
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15)
+	palignr	$15, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit15):
+	lea	1(%edx, %esi), %edx
+	lea	1(%ecx, %esi), %ecx
+	movb	-1(%ecx), %ah
+	movb	%ah, -1(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+# endif
+
+# ifndef USE_AS_STRCAT
+#  ifdef USE_AS_STRNCPY
+	CFI_POP (%esi)
+	CFI_POP (%edi)
+
+	.p2align 4
+L(ExitTail0):
+	movl	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(StrncpyExit15Bytes):
+	cmp	$12, %ebx
+	jbe	L(StrncpyExit12Bytes)
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	cmpb	$0, 11(%ecx)
+	jz	L(ExitTail12)
+	cmp	$13, %ebx
+	je	L(ExitTail13)
+	cmpb	$0, 12(%ecx)
+	jz	L(ExitTail13)
+	cmp	$14, %ebx
+	je	L(ExitTail14)
+	cmpb	$0, 13(%ecx)
+	jz	L(ExitTail14)
+	movlpd	(%ecx), %xmm0
+	movlpd	7(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+#   ifdef USE_AS_STPCPY
+	lea	14(%edx), %eax
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   else
+	movl	%edx, %eax
+#   endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit12Bytes):
+	cmp	$9, %ebx
+	je	L(ExitTail9)
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmp	$10, %ebx
+	je	L(ExitTail10)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmp	$11, %ebx
+	je	L(ExitTail11)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT_TAIL (11)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit8Bytes):
+	cmp	$4, %ebx
+	jbe	L(StrncpyExit4Bytes)
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	cmpb	$0, 3(%ecx)
+	jz	L(ExitTail4)
+
+	cmp	$5, %ebx
+	je	L(ExitTail5)
+	cmpb	$0, 4(%ecx)
+	jz	L(ExitTail5)
+	cmp	$6, %ebx
+	je	L(ExitTail6)
+	cmpb	$0, 5(%ecx)
+	jz	L(ExitTail6)
+	cmp	$7, %ebx
+	je	L(ExitTail7)
+	cmpb	$0, 6(%ecx)
+	jz	L(ExitTail7)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+#   ifdef USE_AS_STPCPY
+	lea	7(%edx), %eax
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   else
+	movl	%edx, %eax
+#   endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit4Bytes):
+	test	%ebx, %ebx
+	jz	L(ExitTail0)
+	cmp	$1, %ebx
+	je	L(ExitTail1)
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmp	$2, %ebx
+	je	L(ExitTail2)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmp	$3, %ebx
+	je	L(ExitTail3)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	SAVE_RESULT_TAIL (3)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+	RETURN
+#  endif
+
+END (STRCPY)
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S
new file mode 100644
index 0000000000..ffbc03c6d5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S
@@ -0,0 +1,116 @@
+/* Multiple versions of strcpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY)
+# ifndef STRCPY
+#  define STRCPY strcpy
+# endif
+#endif
+
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY
+#  define STRCPY_SSSE3	__stpncpy_ssse3
+#  define STRCPY_SSE2		__stpncpy_sse2
+#  define STRCPY_IA32		__stpncpy_ia32
+#  define __GI_STRCPY		__GI_stpncpy
+#  define __GI___STRCPY		__GI___stpncpy
+# else
+#  define STRCPY_SSSE3	__stpcpy_ssse3
+#  define STRCPY_SSE2		__stpcpy_sse2
+#  define STRCPY_IA32		__stpcpy_ia32
+#  define __GI_STRCPY		__GI_stpcpy
+#  define __GI___STRCPY		__GI___stpcpy
+# endif
+#else
+# ifdef USE_AS_STRNCPY
+#  define STRCPY_SSSE3	__strncpy_ssse3
+#  define STRCPY_SSE2		__strncpy_sse2
+#  define STRCPY_IA32		__strncpy_ia32
+#  define __GI_STRCPY		__GI_strncpy
+# else
+#  define STRCPY_SSSE3	__strcpy_ssse3
+#  define STRCPY_SSE2		__strcpy_sse2
+#  define STRCPY_IA32		__strcpy_ia32
+#  define __GI_STRCPY		__GI_strcpy
+# endif
+#endif
+
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncpy in static library since we
+   need strncpy before the initialization happened.  */
+#if IS_IN (libc)
+
+	.text
+ENTRY(STRCPY)
+	.type	STRCPY, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (STRCPY_IA32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (STRCPY_SSE2)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (STRCPY_SSSE3)
+2:	ret
+END(STRCPY)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type STRCPY_IA32, @function; \
+	.align 16; \
+	.globl STRCPY_IA32; \
+	.hidden STRCPY_IA32; \
+	STRCPY_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCPY_IA32, .-STRCPY_IA32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcpy calls through a PLT.
+   The speedup we get from using SSSE3 instruction is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCPY; __GI_STRCPY = STRCPY_IA32
+#  undef libc_hidden_def
+#  define libc_hidden_def(name) \
+	.globl __GI___STRCPY; __GI___STRCPY = STRCPY_IA32
+
+# endif
+#endif
+
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY
+#  include "../../stpncpy.S"
+# else
+#  include "../../i586/stpcpy.S"
+# endif
+#else
+# ifndef USE_AS_STRNCPY
+#  include "../../i586/strcpy.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c
new file mode 100644
index 0000000000..6d61e190a8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c
@@ -0,0 +1,2 @@
+#define __strcspn_sse2 __strcspn_ia32
+#include <sysdeps/x86_64/multiarch/strcspn-c.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S
new file mode 100644
index 0000000000..21e5093924
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S
@@ -0,0 +1,75 @@
+/* Multiple versions of strcspn
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifdef USE_AS_STRPBRK
+#define STRCSPN_SSE42	__strpbrk_sse42
+#define STRCSPN_IA32	__strpbrk_ia32
+#define __GI_STRCSPN	__GI_strpbrk
+#else
+#ifndef STRCSPN
+#define STRCSPN		strcspn
+#define STRCSPN_SSE42	__strcspn_sse42
+#define STRCSPN_IA32	__strcspn_ia32
+#define __GI_STRCSPN	__GI_strcspn
+#endif
+#endif
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strpbrk in static library since we
+   need strpbrk before the initialization happened.  */
+#if (defined SHARED || !defined USE_AS_STRPBRK) && IS_IN (libc)
+	.text
+ENTRY(STRCSPN)
+	.type	STRCSPN, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (STRCSPN_IA32)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (STRCSPN_SSE42)
+2:	ret
+END(STRCSPN)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type STRCSPN_IA32, @function; \
+	.globl STRCSPN_IA32; \
+	.p2align 4; \
+	STRCSPN_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCSPN_IA32, .-STRCSPN_IA32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_IA32
+#endif
+
+#ifdef USE_AS_STRPBRK
+#include "../../strpbrk.S"
+#else
+#include "../../strcspn.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
new file mode 100644
index 0000000000..d3ea864bab
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
@@ -0,0 +1,125 @@
+/* strlen with SSE2 and BSF
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if defined SHARED && IS_IN (libc)
+
+#include <sysdep.h>
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+#define PARMS		4 + 8	/* Preserve ESI and EDI.  */
+#define	STR		PARMS
+#define ENTRANCE	PUSH (%esi); PUSH (%edi); cfi_remember_state
+#define RETURN		POP (%edi); POP (%esi); ret; \
+			cfi_restore_state; cfi_remember_state
+
+	.text
+ENTRY ( __strlen_sse2_bsf)
+	ENTRANCE
+	mov	STR(%esp), %edi
+	xor	%eax, %eax
+	mov	%edi, %ecx
+	and	$0x3f, %ecx
+	pxor	%xmm0, %xmm0
+	cmp	$0x30, %ecx
+	ja	L(next)
+	movdqu	(%edi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_less16)
+	mov	%edi, %eax
+	and	$-16, %eax
+	jmp	L(align16_start)
+L(next):
+
+	mov	%edi, %eax
+	and	$-16, %eax
+	pcmpeqb	(%eax), %xmm0
+	mov	$-1, %esi
+	sub	%eax, %ecx
+	shl	%cl, %esi
+	pmovmskb %xmm0, %edx
+	and	%esi, %edx
+	jnz	L(exit)
+L(align16_start):
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	.p2align 4
+L(align16_loop):
+	pcmpeqb	16(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	64(%eax), %eax
+	test	%edx, %edx
+	jz	L(align16_loop)
+L(exit):
+	sub	%edi, %eax
+L(exit_less16):
+	bsf	%edx, %edx
+	add	%edx, %eax
+	RETURN
+L(exit16):
+	sub	%edi, %eax
+	bsf	%edx, %edx
+	add	%edx, %eax
+	add	$16, %eax
+	RETURN
+L(exit32):
+	sub	%edi, %eax
+	bsf	%edx, %edx
+	add	%edx, %eax
+	add	$32, %eax
+	RETURN
+L(exit48):
+	sub	%edi, %eax
+	bsf	%edx, %edx
+	add	%edx, %eax
+	add	$48, %eax
+	POP (%edi)
+	POP (%esi)
+	ret
+
+END ( __strlen_sse2_bsf)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S
new file mode 100644
index 0000000000..36fc1469d0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S
@@ -0,0 +1,695 @@
+/* strlen with SSE2
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* for strlen only SHARED version is optimized, for strcat, strncat, strnlen both STATIC and SHARED are optimized */
+
+#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+
+#  include <sysdep.h>
+#  define PARMS	4
+#  define STR	PARMS
+#  define RETURN	ret
+
+#  ifdef USE_AS_STRNLEN
+#   define LEN	PARMS + 8
+#   define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#   define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#   define PUSH(REG)	pushl	REG;	CFI_PUSH (REG)
+#   define POP(REG)	popl	REG;	CFI_POP (REG)
+#   undef RETURN
+#   define RETURN	POP (%edi); CFI_PUSH(%edi); ret
+#  endif
+
+#  ifndef STRLEN
+#   define STRLEN	__strlen_sse2
+#  endif
+
+	atom_text_section
+ENTRY (STRLEN)
+	mov	STR(%esp), %edx
+#  ifdef USE_AS_STRNLEN
+	PUSH	(%edi)
+	movl	LEN(%esp), %edi
+	sub	$4, %edi
+	jbe	L(len_less4_prolog)
+#  endif
+# endif
+	xor	%eax, %eax
+	cmpb	$0, (%edx)
+	jz	L(exit_tail0)
+	cmpb	$0, 1(%edx)
+	jz	L(exit_tail1)
+	cmpb	$0, 2(%edx)
+	jz	L(exit_tail2)
+	cmpb	$0, 3(%edx)
+	jz	L(exit_tail3)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less8_prolog)
+# endif
+
+	cmpb	$0, 4(%edx)
+	jz	L(exit_tail4)
+	cmpb	$0, 5(%edx)
+	jz	L(exit_tail5)
+	cmpb	$0, 6(%edx)
+	jz	L(exit_tail6)
+	cmpb	$0, 7(%edx)
+	jz	L(exit_tail7)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less12_prolog)
+# endif
+
+	cmpb	$0, 8(%edx)
+	jz	L(exit_tail8)
+	cmpb	$0, 9(%edx)
+	jz	L(exit_tail9)
+	cmpb	$0, 10(%edx)
+	jz	L(exit_tail10)
+	cmpb	$0, 11(%edx)
+	jz	L(exit_tail11)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less16_prolog)
+# endif
+
+	cmpb	$0, 12(%edx)
+	jz	L(exit_tail12)
+	cmpb	$0, 13(%edx)
+	jz	L(exit_tail13)
+	cmpb	$0, 14(%edx)
+	jz	L(exit_tail14)
+	cmpb	$0, 15(%edx)
+	jz	L(exit_tail15)
+
+	pxor	%xmm0, %xmm0
+	lea	16(%edx), %eax
+	mov	%eax, %ecx
+	and	$-16, %eax
+
+# ifdef USE_AS_STRNLEN
+	and	$15, %edx
+	add	%edx, %edi
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	pxor	%xmm2, %xmm2
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	pxor	%xmm3, %xmm3
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	mov	%eax, %edx
+	and	$63, %edx
+	add	%edx, %edi
+# endif
+
+	and	$-0x40, %eax
+
+	.p2align 4
+L(aligned_64_loop):
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+	movaps	(%eax), %xmm0
+	movaps	16(%eax), %xmm1
+	movaps	32(%eax), %xmm2
+	movaps	48(%eax), %xmm6
+	pminub	%xmm1, %xmm0
+	pminub	%xmm6, %xmm2
+	pminub	%xmm0, %xmm2
+	pcmpeqb	%xmm3, %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	64(%eax), %eax
+	jz	L(aligned_64_loop)
+
+	pcmpeqb	-64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	48(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqb	-32(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqb	%xmm6, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	-16(%ecx), %ecx
+L(exit):
+	sub	%ecx, %eax
+	test	%dl, %dl
+	jz	L(exit_high)
+
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(exit_8)
+	test	$0x01, %dl
+	jnz	L(exit_tail0)
+	test	$0x02, %dl
+	jnz	L(exit_tail1)
+	test	$0x04, %dl
+	jnz	L(exit_tail2)
+	add	$3, %eax
+	RETURN
+
+	.p2align 4
+L(exit_8):
+	test	$0x10, %dl
+	jnz	L(exit_tail4)
+	test	$0x20, %dl
+	jnz	L(exit_tail5)
+	test	$0x40, %dl
+	jnz	L(exit_tail6)
+	add	$7, %eax
+	RETURN
+
+	.p2align 4
+L(exit_high):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(exit_high_8)
+	test	$0x01, %dh
+	jnz	L(exit_tail8)
+	test	$0x02, %dh
+	jnz	L(exit_tail9)
+	test	$0x04, %dh
+	jnz	L(exit_tail10)
+	add	$11, %eax
+	RETURN
+
+	.p2align 4
+L(exit_high_8):
+	test	$0x10, %dh
+	jnz	L(exit_tail12)
+	test	$0x20, %dh
+	jnz	L(exit_tail13)
+	test	$0x40, %dh
+	jnz	L(exit_tail14)
+	add	$15, %eax
+L(exit_tail0):
+	RETURN
+
+# ifdef USE_AS_STRNLEN
+
+	.p2align 4
+L(len_less64):
+	pxor	%xmm0, %xmm0
+	add	$64, %edi
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	movl	LEN(%esp), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit):
+	sub	%ecx, %eax
+
+	test	%dl, %dl
+	jz	L(strnlen_exit_high)
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(strnlen_exit_8)
+	test	$0x01, %dl
+	jnz	L(exit_tail0)
+	test	$0x02, %dl
+	jnz	L(strnlen_exit_tail1)
+	test	$0x04, %dl
+	jnz	L(strnlen_exit_tail2)
+	sub	$4, %edi
+	jb	L(return_start_len)
+	lea	3(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_8):
+	test	$0x10, %dl
+	jnz	L(strnlen_exit_tail4)
+	test	$0x20, %dl
+	jnz	L(strnlen_exit_tail5)
+	test	$0x40, %dl
+	jnz	L(strnlen_exit_tail6)
+	sub	$8, %edi
+	jb	L(return_start_len)
+	lea	7(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_high):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(strnlen_exit_high_8)
+	test	$0x01, %dh
+	jnz	L(strnlen_exit_tail8)
+	test	$0x02, %dh
+	jnz	L(strnlen_exit_tail9)
+	test	$0x04, %dh
+	jnz	L(strnlen_exit_tail10)
+	sub	$12, %edi
+	jb	L(return_start_len)
+	lea	11(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_high_8):
+	test	$0x10, %dh
+	jnz	L(strnlen_exit_tail12)
+	test	$0x20, %dh
+	jnz	L(strnlen_exit_tail13)
+	test	$0x40, %dh
+	jnz	L(strnlen_exit_tail14)
+	sub	$16, %edi
+	jb	L(return_start_len)
+	lea	15(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail1):
+	sub	$2, %edi
+	jb	L(return_start_len)
+	lea	1(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail2):
+	sub	$3, %edi
+	jb	L(return_start_len)
+	lea	2(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail4):
+	sub	$5, %edi
+	jb	L(return_start_len)
+	lea	4(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail5):
+	sub	$6, %edi
+	jb	L(return_start_len)
+	lea	5(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail6):
+	sub	$7, %edi
+	jb	L(return_start_len)
+	lea	6(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail8):
+	sub	$9, %edi
+	jb	L(return_start_len)
+	lea	8(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail9):
+	sub	$10, %edi
+	jb	L(return_start_len)
+	lea	9(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail10):
+	sub	$11, %edi
+	jb	L(return_start_len)
+	lea	10(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail12):
+	sub	$13, %edi
+	jb	L(return_start_len)
+	lea	12(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail13):
+	sub	$14, %edi
+	jb	L(return_start_len)
+	lea	13(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail14):
+	sub	$15, %edi
+	jb	L(return_start_len)
+	lea	14(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(return_start_len):
+	movl	LEN(%esp), %eax
+	RETURN
+
+/* for prolog only */
+
+	.p2align 4
+L(len_less4_prolog):
+	xor	%eax, %eax
+
+	add	$4, %edi
+	jz	L(exit_tail0)
+
+	cmpb	$0, (%edx)
+	jz	L(exit_tail0)
+	cmp	$1, %edi
+	je	L(exit_tail1)
+
+	cmpb	$0, 1(%edx)
+	jz	L(exit_tail1)
+	cmp	$2, %edi
+	je	L(exit_tail2)
+
+	cmpb	$0, 2(%edx)
+	jz	L(exit_tail2)
+	cmp	$3, %edi
+	je	L(exit_tail3)
+
+	cmpb	$0, 3(%edx)
+	jz	L(exit_tail3)
+	mov	$4, %eax
+	RETURN
+
+	.p2align 4
+L(len_less8_prolog):
+	add	$4, %edi
+
+	cmpb	$0, 4(%edx)
+	jz	L(exit_tail4)
+	cmp	$1, %edi
+	je	L(exit_tail5)
+
+	cmpb	$0, 5(%edx)
+	jz	L(exit_tail5)
+	cmp	$2, %edi
+	je	L(exit_tail6)
+
+	cmpb	$0, 6(%edx)
+	jz	L(exit_tail6)
+	cmp	$3, %edi
+	je	L(exit_tail7)
+
+	cmpb	$0, 7(%edx)
+	jz	L(exit_tail7)
+	mov	$8, %eax
+	RETURN
+
+
+	.p2align 4
+L(len_less12_prolog):
+	add	$4, %edi
+
+	cmpb	$0, 8(%edx)
+	jz	L(exit_tail8)
+	cmp	$1, %edi
+	je	L(exit_tail9)
+
+	cmpb	$0, 9(%edx)
+	jz	L(exit_tail9)
+	cmp	$2, %edi
+	je	L(exit_tail10)
+
+	cmpb	$0, 10(%edx)
+	jz	L(exit_tail10)
+	cmp	$3, %edi
+	je	L(exit_tail11)
+
+	cmpb	$0, 11(%edx)
+	jz	L(exit_tail11)
+	mov	$12, %eax
+	RETURN
+
+	.p2align 4
+L(len_less16_prolog):
+	add	$4, %edi
+
+	cmpb	$0, 12(%edx)
+	jz	L(exit_tail12)
+	cmp	$1, %edi
+	je	L(exit_tail13)
+
+	cmpb	$0, 13(%edx)
+	jz	L(exit_tail13)
+	cmp	$2, %edi
+	je	L(exit_tail14)
+
+	cmpb	$0, 14(%edx)
+	jz	L(exit_tail14)
+	cmp	$3, %edi
+	je	L(exit_tail15)
+
+	cmpb	$0, 15(%edx)
+	jz	L(exit_tail15)
+	mov	$16, %eax
+	RETURN
+# endif
+
+	.p2align 4
+L(exit_tail1):
+	add	$1, %eax
+	RETURN
+
+L(exit_tail2):
+	add	$2, %eax
+	RETURN
+
+L(exit_tail3):
+	add	$3, %eax
+	RETURN
+
+L(exit_tail4):
+	add	$4, %eax
+	RETURN
+
+L(exit_tail5):
+	add	$5, %eax
+	RETURN
+
+L(exit_tail6):
+	add	$6, %eax
+	RETURN
+
+L(exit_tail7):
+	add	$7, %eax
+	RETURN
+
+L(exit_tail8):
+	add	$8, %eax
+	RETURN
+
+L(exit_tail9):
+	add	$9, %eax
+	RETURN
+
+L(exit_tail10):
+	add	$10, %eax
+	RETURN
+
+L(exit_tail11):
+	add	$11, %eax
+	RETURN
+
+L(exit_tail12):
+	add	$12, %eax
+	RETURN
+
+L(exit_tail13):
+	add	$13, %eax
+	RETURN
+
+L(exit_tail14):
+	add	$14, %eax
+	RETURN
+
+L(exit_tail15):
+	add	$15, %eax
+# ifndef USE_AS_STRCAT
+	RETURN
+END (STRLEN)
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S
new file mode 100644
index 0000000000..77cf6bcdb0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S
@@ -0,0 +1,60 @@
+/* Multiple versions of strlen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for the
+   DSO.  In static binaries, we need strlen before the initialization
+   happened.  */
+#if defined SHARED && IS_IN (libc)
+	.text
+ENTRY(strlen)
+	.type	strlen, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strlen_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strlen_sse2_bsf)
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strlen_sse2)
+2:	ret
+END(strlen)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strlen_ia32, @function; \
+	.globl __strlen_ia32; \
+	.p2align 4; \
+	__strlen_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strlen_ia32, .-__strlen_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strlen; __GI_strlen = __strlen_ia32
+#endif
+
+#include "../../i586/strlen.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c
new file mode 100644
index 0000000000..76581eb62b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c
@@ -0,0 +1,8 @@
+#include <string.h>
+
+extern __typeof (strncasecmp) __strncasecmp_nonascii;
+
+#define __strncasecmp __strncasecmp_nonascii
+#include <string/strncase.c>
+
+strong_alias (__strncasecmp_nonascii, __strncasecmp_ia32)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S
new file mode 100644
index 0000000000..a56e63a566
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S
@@ -0,0 +1,39 @@
+/* Entry point for multi-version x86 strncasecmp.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+	.text
+ENTRY(__strncasecmp)
+	.type	__strncasecmp, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strncasecmp_ia32)
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strncasecmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
+	jnz	2f
+	LOAD_FUNC_GOT_EAX (__strncasecmp_sse4_2)
+2:	ret
+END(__strncasecmp)
+
+weak_alias (__strncasecmp, strncasecmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c
new file mode 100644
index 0000000000..7e601af271
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c
@@ -0,0 +1,13 @@
+#include <string.h>
+
+extern __typeof (strncasecmp_l) __strncasecmp_l_nonascii;
+
+#define __strncasecmp_l __strncasecmp_l_nonascii
+#define USE_IN_EXTENDED_LOCALE_MODEL    1
+#include <string/strncase.c>
+
+strong_alias (__strncasecmp_l_nonascii, __strncasecmp_l_ia32)
+
+/* The needs of strcasecmp in libc are minimal, no need to go through
+   the IFUNC.  */
+strong_alias (__strncasecmp_l_nonascii, __GI___strncasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S
new file mode 100644
index 0000000000..557210832e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRNCASECMP_L 1
+#include "strcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S
new file mode 100644
index 0000000000..d438a1ae35
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRNCASECMP_L 1
+#include "strcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S
new file mode 100644
index 0000000000..8a74ee8574
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S
@@ -0,0 +1,7 @@
+/* Multiple versions of strncasecmp_l
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCMP __strncasecmp_l
+#define USE_AS_STRNCASECMP_L
+#include "strcmp.S"
+
+weak_alias (__strncasecmp_l, strncasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c
new file mode 100644
index 0000000000..132a000545
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c
@@ -0,0 +1,8 @@
+#define STRNCAT __strncat_ia32
+#ifdef SHARED
+#undef libc_hidden_def
+#define libc_hidden_def(name) \
+  __hidden_ver1 (__strncat_ia32, __GI___strncat, __strncat_ia32);
+#endif
+
+#include "string/strncat.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S
new file mode 100644
index 0000000000..f1045b72b8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S
@@ -0,0 +1,4 @@
+#define STRCAT  __strncat_sse2
+#define USE_AS_STRNCAT
+
+#include "strcat-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S
new file mode 100644
index 0000000000..625b90a978
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S
@@ -0,0 +1,4 @@
+#define STRCAT  __strncat_ssse3
+#define USE_AS_STRNCAT
+
+#include "strcat-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S
new file mode 100644
index 0000000000..5c1bf41453
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncat
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCAT strncat
+#define USE_AS_STRNCAT
+#include "strcat.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c
new file mode 100644
index 0000000000..cc059da494
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c
@@ -0,0 +1,8 @@
+#ifdef SHARED
+# define STRNCMP __strncmp_ia32
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)  \
+    __hidden_ver1 (__strncmp_ia32, __GI_strncmp, __strncmp_ia32);
+#endif
+
+#include "string/strncmp.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S
new file mode 100644
index 0000000000..cf14dfaf6c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S
@@ -0,0 +1,5 @@
+#ifdef SHARED
+# define USE_AS_STRNCMP
+# define STRCMP	__strncmp_sse4_2
+# include "strcmp-sse4.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S
new file mode 100644
index 0000000000..536c8685f2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S
@@ -0,0 +1,5 @@
+#ifdef SHARED
+# define USE_AS_STRNCMP
+# define STRCMP	__strncmp_ssse3
+# include "strcmp-ssse3.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S
new file mode 100644
index 0000000000..150d4786d2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncmp
+   All versions must be listed in ifunc-impl-list.c.  */
+#define USE_AS_STRNCMP
+#define STRCMP	strncmp
+#include "strcmp.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c
new file mode 100644
index 0000000000..201e3f98b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c
@@ -0,0 +1,8 @@
+#define STRNCPY __strncpy_ia32
+#ifdef SHARED
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)  \
+    __hidden_ver1 (__strncpy_ia32, __GI_strncpy, __strncpy_ia32);
+#endif
+
+#include "string/strncpy.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S
new file mode 100644
index 0000000000..bdd99239a4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_sse2
+#include "strcpy-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S
new file mode 100644
index 0000000000..bf82ee447d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S
new file mode 100644
index 0000000000..9c257efc6e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncpy
+   All versions must be listed in ifunc-impl-list.c.  */
+#define USE_AS_STRNCPY
+#define STRCPY strncpy
+#include "strcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c
new file mode 100644
index 0000000000..351e939a93
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c
@@ -0,0 +1,10 @@
+#define STRNLEN  __strnlen_ia32
+#ifdef SHARED
+# undef libc_hidden_def
+# define libc_hidden_def(name)  \
+    __hidden_ver1 (__strnlen_ia32, __GI_strnlen, __strnlen_ia32); \
+    strong_alias (__strnlen_ia32, __strnlen_ia32_1); \
+    __hidden_ver1 (__strnlen_ia32_1, __GI___strnlen, __strnlen_ia32_1);
+#endif
+
+#include "string/strnlen.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S
new file mode 100644
index 0000000000..56b6ae2a5c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNLEN
+#define STRLEN __strnlen_sse2
+#include "strlen-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S
new file mode 100644
index 0000000000..d241522c70
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S
@@ -0,0 +1,37 @@
+/* Multiple versions of strnlen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__strnlen)
+	.type	__strnlen, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strnlen_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strnlen_sse2)
+2:	ret
+END(__strnlen)
+
+weak_alias(__strnlen, strnlen)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c
new file mode 100644
index 0000000000..5db62053b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c
@@ -0,0 +1,2 @@
+#define __strpbrk_sse2 __strpbrk_ia32
+#include <sysdeps/x86_64/multiarch/strpbrk-c.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S
new file mode 100644
index 0000000000..7201d6376f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strpbrk
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCSPN strpbrk
+#define USE_AS_STRPBRK
+#include "strcspn.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S
new file mode 100644
index 0000000000..39a7c8825b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S
@@ -0,0 +1,282 @@
+/* strrchr with SSE2 with bsf and bsr
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+
+	.text
+ENTRY (__strrchr_sse2_bsf)
+
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	PUSH	(%edi)
+	pxor	%xmm2, %xmm2
+	mov	%ecx, %edi
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+	/* ECX has OFFSET. */
+	and	$63, %ecx
+	cmp	$48, %ecx
+	pshufd	$0, %xmm1, %xmm1
+	ja	L(crosscashe)
+
+/* unaligned string. */
+	movdqu	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+
+	test	%eax, %eax
+	jnz	L(unaligned_match1)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	and	$-16, %edi
+	add	$16, %edi
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	xor	%ebx, %ebx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(unaligned_return_value1):
+	bsf	%edx, %ecx
+	mov	$2, %edx
+	shl	%cl, %edx
+	sub	$1, %edx
+	and	%edx, %eax
+	jz	L(return_null)
+	bsr	%eax, %eax
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+	CFI_PUSH	(%edi)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%edx, %edx
+	jnz	L(unaligned_return_value1)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	mov	%eax, %ebx
+	lea	16(%edi), %esi
+	and	$-16, %edi
+	add	$16, %edi
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+	L(crosscashe):
+/* Hancle unaligned string.  */
+	and	$15, %ecx
+	and	$-16, %edi
+	pxor	%xmm3, %xmm3
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm3, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes.  */
+	shr	%cl, %edx
+	shr	%cl, %eax
+
+	test	%eax, %eax
+	jnz	L(unaligned_match)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	add	$16, %edi
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	xor	%ebx, %ebx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(unaligned_return_value):
+	add	%ecx, %edi
+	bsf	%edx, %ecx
+	mov	$2, %edx
+	shl	%cl, %edx
+	sub	$1, %edx
+	and	%edx, %eax
+	jz	L(return_null)
+	bsr	%eax, %eax
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+	CFI_PUSH	(%edi)
+
+	.p2align 4
+L(unaligned_match):
+	test	%edx, %edx
+	jnz	L(unaligned_return_value)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	mov	%eax, %ebx
+	add	$16, %edi
+	lea	(%edi, %ecx), %esi
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jz	L(loop)
+
+L(matches):
+	test	%eax, %eax
+	jnz	L(match)
+L(return_value):
+	test	%ebx, %ebx
+	jz	L(return_null_1)
+	bsr	%ebx, %eax
+	add	%esi, %eax
+
+	POP	(%ebx)
+	POP	(%esi)
+
+	sub	$16, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH	(%edi)
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(return_value_1)
+	mov	%eax, %ebx
+	mov	%edi, %esi
+	jmp	L(loop)
+
+	.p2align 4
+L(return_value_1):
+	bsf	%ecx, %ecx
+	mov	$2, %edx
+	shl	%cl, %edx
+	sub	$1, %edx
+	and	%edx, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+
+	bsr	%eax, %eax
+	add	%edi, %eax
+	sub	$16, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH	(%edi)
+/* Return NULL.  */
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH	(%edi)
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+/* Return NULL.  */
+	.p2align 4
+L(return_null_1):
+	POP	(%ebx)
+	POP	(%esi)
+	POP	(%edi)
+	xor	%eax, %eax
+	ret
+
+END (__strrchr_sse2_bsf)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S
new file mode 100644
index 0000000000..20934288be
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S
@@ -0,0 +1,708 @@
+/* strrchr SSE2 without bsf and bsr
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  8
+# define ENTRANCE PUSH(%edi);
+# define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
+
+# define STR1  PARMS
+# define STR2  STR1+4
+
+	atom_text_section
+ENTRY (__strrchr_sse2)
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	pxor	%xmm2, %xmm2
+	mov	%ecx, %edi
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+	/* ECX has OFFSET. */
+	and	$63, %ecx
+	cmp	$48, %ecx
+	pshufd	$0, %xmm1, %xmm1
+	ja	L(crosscache)
+
+/* unaligned string. */
+	movdqu	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %ecx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match1)
+
+	test	%ecx, %ecx
+	jnz	L(return_null)
+
+	and	$-16, %edi
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	xor	%ebx, %ebx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%ecx, %ecx
+	jnz	L(prolog_find_zero_1)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	mov	%eax, %ebx
+	mov	%edi, %esi
+	and	$-16, %edi
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(crosscache):
+/* Hancle unaligned string.  */
+	and	$15, %ecx
+	and	$-16, %edi
+	pxor	%xmm3, %xmm3
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm3, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes.  */
+	shr	%cl, %edx
+	shr	%cl, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	xor	%ebx, %ebx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(unaligned_match):
+	test	%edx, %edx
+	jnz	L(prolog_find_zero)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	mov	%eax, %ebx
+	lea	(%edi, %ecx), %esi
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jz	L(loop)
+
+L(matches):
+	test	%eax, %eax
+	jnz	L(match)
+L(return_value):
+	test	%ebx, %ebx
+	jz	L(return_null_1)
+	mov	%ebx, %eax
+	mov	%esi, %edi
+
+	POP	(%ebx)
+	POP	(%esi)
+
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(return_null_1):
+	POP	(%ebx)
+	POP	(%esi)
+
+	xor	%eax, %eax
+	RETURN
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(find_zero)
+	mov	%eax, %ebx
+	mov	%edi, %esi
+	jmp	L(loop)
+
+	.p2align 4
+L(find_zero):
+	test	%cl, %cl
+	jz	L(find_zero_high)
+	mov	%cl, %dl
+	and	$15, %dl
+	jz	L(find_zero_8)
+	test	$0x01, %cl
+	jnz	L(FindZeroExit1)
+	test	$0x02, %cl
+	jnz	L(FindZeroExit2)
+	test	$0x04, %cl
+	jnz	L(FindZeroExit3)
+	and	$1 << 4 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_8):
+	test	$0x10, %cl
+	jnz	L(FindZeroExit5)
+	test	$0x20, %cl
+	jnz	L(FindZeroExit6)
+	test	$0x40, %cl
+	jnz	L(FindZeroExit7)
+	and	$1 << 8 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_high):
+	mov	%ch, %dh
+	and	$15, %dh
+	jz	L(find_zero_high_8)
+	test	$0x01, %ch
+	jnz	L(FindZeroExit9)
+	test	$0x02, %ch
+	jnz	L(FindZeroExit10)
+	test	$0x04, %ch
+	jnz	L(FindZeroExit11)
+	and	$1 << 12 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_high_8):
+	test	$0x10, %ch
+	jnz	L(FindZeroExit13)
+	test	$0x20, %ch
+	jnz	L(FindZeroExit14)
+	test	$0x40, %ch
+	jnz	L(FindZeroExit15)
+	and	$1 << 16 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit1):
+	and	$1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit2):
+	and	$1 << 2 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit3):
+	and	$1 << 3 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit5):
+	and	$1 << 5 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit6):
+	and	$1 << 6 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit7):
+	and	$1 << 7 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit9):
+	and	$1 << 9 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit10):
+	and	$1 << 10 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit11):
+	and	$1 << 11 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit13):
+	and	$1 << 13 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit14):
+	and	$1 << 14 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%ebx)
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(FindZeroExit15):
+	and	$1 << 15 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+
+	.p2align 4
+L(match_exit):
+	test	%ah, %ah
+	jnz	L(match_exit_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(match_exit_8)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_exit_8):
+	test	$0x80, %al
+	jnz	L(Exit8)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	lea	-12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_exit_high):
+	mov	%ah, %dh
+	and	$15 << 4, %dh
+	jnz	L(match_exit_high_8)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_exit_high_8):
+	test	$0x80, %ah
+	jnz	L(Exit16)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	lea	-4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	lea	-15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	lea	-14(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	lea	-13(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	lea	-11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	lea	-10(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit8):
+	lea	-9(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	lea	-7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	lea	-6(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	lea	-5(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	lea	-3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	lea	-2(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit16):
+	lea	-1(%edi), %eax
+	RETURN
+
+/* Return NULL.  */
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero):
+	add	%ecx, %edi
+	mov     %edx, %ecx
+L(prolog_find_zero_1):
+	test	%cl, %cl
+	jz	L(prolog_find_zero_high)
+	mov	%cl, %dl
+	and	$15, %dl
+	jz	L(prolog_find_zero_8)
+	test	$0x01, %cl
+	jnz	L(PrologFindZeroExit1)
+	test	$0x02, %cl
+	jnz	L(PrologFindZeroExit2)
+	test	$0x04, %cl
+	jnz	L(PrologFindZeroExit3)
+	and	$1 << 4 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_8):
+	test	$0x10, %cl
+	jnz	L(PrologFindZeroExit5)
+	test	$0x20, %cl
+	jnz	L(PrologFindZeroExit6)
+	test	$0x40, %cl
+	jnz	L(PrologFindZeroExit7)
+	and	$1 << 8 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_high):
+	mov	%ch, %dh
+	and	$15, %dh
+	jz	L(prolog_find_zero_high_8)
+	test	$0x01, %ch
+	jnz	L(PrologFindZeroExit9)
+	test	$0x02, %ch
+	jnz	L(PrologFindZeroExit10)
+	test	$0x04, %ch
+	jnz	L(PrologFindZeroExit11)
+	and	$1 << 12 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_high_8):
+	test	$0x10, %ch
+	jnz	L(PrologFindZeroExit13)
+	test	$0x20, %ch
+	jnz	L(PrologFindZeroExit14)
+	test	$0x40, %ch
+	jnz	L(PrologFindZeroExit15)
+	and	$1 << 16 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit1):
+	and	$1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit2):
+	and	$1 << 2 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit3):
+	and	$1 << 3 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit5):
+	and	$1 << 5 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit6):
+	and	$1 << 6 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit7):
+	and	$1 << 7 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit9):
+	and	$1 << 9 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit10):
+	and	$1 << 10 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit11):
+	and	$1 << 11 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit13):
+	and	$1 << 13 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit14):
+	and	$1 << 14 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit15):
+	and	$1 << 15 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+END (__strrchr_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S
new file mode 100644
index 0000000000..d9281eaeae
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S
@@ -0,0 +1,57 @@
+/* Multiple versions of strrchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(strrchr)
+	.type	strrchr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strrchr_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strrchr_sse2_bsf)
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strrchr_sse2)
+2:	ret
+END(strrchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strrchr_ia32, @function; \
+	.globl __strrchr_ia32; \
+	.p2align 4; \
+	__strrchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strrchr_ia32, .-__strrchr_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strrchr; __GI_strrchr = __strrchr_ia32
+#endif
+
+#include "../../strrchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c
new file mode 100644
index 0000000000..bea09dea71
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c
@@ -0,0 +1,2 @@
+#define __strspn_sse2 __strspn_ia32
+#include <sysdeps/x86_64/multiarch/strspn-c.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S
new file mode 100644
index 0000000000..1269062381
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S
@@ -0,0 +1,56 @@
+/* Multiple versions of strspn
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(strspn)
+	.type	strspn, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strspn_ia32)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strspn_sse42)
+2:	ret
+END(strspn)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strspn_ia32, @function; \
+	.globl __strspn_ia32; \
+	.p2align 4; \
+__strspn_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strspn_ia32, .-__strspn_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strspn; __GI_strspn = __strspn_ia32
+#endif
+
+#include "../../strspn.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c b/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c
new file mode 100644
index 0000000000..593cfec273
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/test-multiarch.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c
new file mode 100644
index 0000000000..7760b966e2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/varshift.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h
new file mode 100644
index 0000000000..7c72c70d67
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/varshift.h>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c
new file mode 100644
index 0000000000..38d41d04de
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c
@@ -0,0 +1,22 @@
+#include <wchar.h>
+
+#if IS_IN (libc)
+# undef libc_hidden_weak
+# define libc_hidden_weak(name)
+
+# undef weak_alias
+# define weak_alias(name,alias)
+
+# ifdef SHARED
+#  undef libc_hidden_def
+#  define libc_hidden_def(name) \
+   __hidden_ver1 (__wcschr_ia32, __GI_wcschr, __wcschr_ia32); \
+   strong_alias (__wcschr_ia32, __wcschr_ia32_1); \
+   __hidden_ver1 (__wcschr_ia32_1, __GI___wcschr, __wcschr_ia32_1);
+# endif
+#endif
+
+extern __typeof (wcschr) __wcschr_ia32;
+
+#define WCSCHR  __wcschr_ia32
+#include <wcsmbs/wcschr.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S
new file mode 100644
index 0000000000..9ff6c3b8d6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S
@@ -0,0 +1,219 @@
+/* wcschr with SSE2, without using bsf instructions
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	4
+# define STR1	PARMS
+# define STR2	STR1+4
+
+	atom_text_section
+ENTRY (__wcschr_sse2)
+
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	mov	%ecx, %eax
+	punpckldq %xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	punpckldq %xmm1, %xmm1
+
+	and	$63, %eax
+	cmp	$48, %eax
+	ja	L(cross_cache)
+
+	movdqu	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	and	$-16, %ecx
+	jmp	L(loop)
+
+	.p2align 4
+L(cross_cache):
+	PUSH	(%edi)
+	mov	%ecx, %edi
+	mov	%eax, %ecx
+	and	$-16, %edi
+	and	$15, %ecx
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+
+	sarl	%cl, %edx
+	sarl	%cl, %eax
+	test	%eax, %eax
+	jz	L(unaligned_no_match)
+
+	add	%edi, %ecx
+	POP	(%edi)
+
+	test	%edx, %edx
+	jz	L(match_case1)
+	test	%al, %al
+	jz	L(match_higth_case2)
+	test	$15, %al
+	jnz	L(match_case2_4)
+	test	$15, %dl
+	jnz	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(unaligned_no_match):
+	mov	%edi, %ecx
+	POP	(%edi)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	pxor	%xmm2, %xmm2
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	add	$16, %ecx
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jz	L(loop)
+
+	.p2align 4
+L(matches):
+	pmovmskb %xmm2, %edx
+	test	%eax, %eax
+	jz	L(return_null)
+	test	%edx, %edx
+	jz	L(match_case1)
+
+	.p2align 4
+L(match_case2):
+	test	%al, %al
+	jz	L(match_higth_case2)
+	test	$15, %al
+	jnz	L(match_case2_4)
+	test	$15, %dl
+	jnz	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_case2_4):
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(match_higth_case2):
+	test	%dl, %dl
+	jnz	L(return_null)
+	test	$15, %ah
+	jnz	L(match_case2_12)
+	test	$15, %dh
+	jnz	L(return_null)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_case2_12):
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_case1):
+	test	%al, %al
+	jz	L(match_higth_case1)
+
+	test	$0x01, %al
+	jnz	L(exit0)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_higth_case1):
+	test	$0x01, %ah
+	jnz	L(exit3)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit0):
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit3):
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	ret
+
+END (__wcschr_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S
new file mode 100644
index 0000000000..d3c65a6436
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S
@@ -0,0 +1,36 @@
+/* Multiple versions of wcschr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__wcschr)
+	.type	wcschr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcschr_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__wcschr_sse2)
+2:	ret
+END(__wcschr)
+weak_alias (__wcschr, wcschr)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c
new file mode 100644
index 0000000000..e3337d77e2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c
@@ -0,0 +1,14 @@
+#include <wchar.h>
+
+#define WCSCMP __wcscmp_ia32
+#ifdef SHARED
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+  __hidden_ver1 (__wcscmp_ia32, __GI___wcscmp, __wcscmp_ia32);
+#endif
+#undef weak_alias
+#define weak_alias(name, alias)
+
+extern __typeof (wcscmp) __wcscmp_ia32;
+
+#include "wcsmbs/wcscmp.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
new file mode 100644
index 0000000000..a464b58204
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
@@ -0,0 +1,1018 @@
+/* wcscmp with SSE2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define ENTRANCE PUSH(%esi); PUSH(%edi)
+# define RETURN  POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi);
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+
+/* Note: wcscmp uses signed comparison, not unsugned as in strcmp function. */
+
+	.text
+ENTRY (__wcscmp_sse2)
+/*
+	* This implementation uses SSE to compare up to 16 bytes at a time.
+*/
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %eax
+
+	mov	(%eax), %ecx
+	cmp	%ecx, (%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	mov	4(%eax), %ecx
+	cmp	%ecx, 4(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	mov	8(%eax), %ecx
+	cmp	%ecx, 8(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	mov	12(%eax), %ecx
+	cmp	%ecx, 12(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	ENTRANCE
+	add	$16, %eax
+	add	$16, %edx
+
+	mov	%eax, %esi
+	mov	%edx, %edi
+	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
+	mov	%al, %ch
+	mov	%dl, %cl
+	and	$63, %eax		/* esi alignment in cache line */
+	and	$63, %edx		/* edi alignment in cache line */
+	and	$15, %cl
+	jz	L(continue_00)
+	cmp	$16, %edx
+	jb	L(continue_0)
+	cmp	$32, %edx
+	jb	L(continue_16)
+	cmp	$48, %edx
+	jb	L(continue_32)
+
+L(continue_48):
+	and	$15, %ch
+	jz	L(continue_48_00)
+	cmp	$16, %eax
+	jb	L(continue_0_48)
+	cmp	$32, %eax
+	jb	L(continue_16_48)
+	cmp	$48, %eax
+	jb	L(continue_32_48)
+
+	.p2align 4
+L(continue_48_48):
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%edi), %xmm1
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_48_48)
+
+L(continue_0):
+	and	$15, %ch
+	jz	L(continue_0_00)
+	cmp	$16, %eax
+	jb	L(continue_0_0)
+	cmp	$32, %eax
+	jb	L(continue_0_16)
+	cmp	$48, %eax
+	jb	L(continue_0_32)
+
+	.p2align 4
+L(continue_0_48):
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	mov	48(%esi), %ecx
+	cmp	%ecx, 48(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	52(%esi), %ecx
+	cmp	%ecx, 52(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	56(%esi), %ecx
+	cmp	%ecx, 56(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	60(%esi), %ecx
+	cmp	%ecx, 60(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_0_48)
+
+	.p2align 4
+L(continue_00):
+	and	$15, %ch
+	jz	L(continue_00_00)
+	cmp	$16, %eax
+	jb	L(continue_00_0)
+	cmp	$32, %eax
+	jb	L(continue_00_16)
+	cmp	$48, %eax
+	jb	L(continue_00_32)
+
+	.p2align 4
+L(continue_00_48):
+	pcmpeqd	(%edi), %xmm0
+	mov	(%edi), %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(less4_double_words1)
+
+	cmp	(%esi), %eax
+	jne	L(nequal)
+
+	mov	4(%edi), %eax
+	cmp	4(%esi), %eax
+	jne	L(nequal)
+
+	mov	8(%edi), %eax
+	cmp	8(%esi), %eax
+	jne	L(nequal)
+
+	mov	12(%edi), %eax
+	cmp	12(%esi), %eax
+	jne	L(nequal)
+
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	48(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_32):
+	and	$15, %ch
+	jz	L(continue_32_00)
+	cmp	$16, %eax
+	jb	L(continue_0_32)
+	cmp	$32, %eax
+	jb	L(continue_16_32)
+	cmp	$48, %eax
+	jb	L(continue_32_32)
+
+	.p2align 4
+L(continue_32_48):
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	16(%esi), %ecx
+	cmp	%ecx, 16(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	20(%esi), %ecx
+	cmp	%ecx, 20(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	24(%esi), %ecx
+	cmp	%ecx, 24(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	28(%esi), %ecx
+	cmp	%ecx, 28(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%edi), %xmm1
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_32_48)
+
+	.p2align 4
+L(continue_16):
+	and	$15, %ch
+	jz	L(continue_16_00)
+	cmp	$16, %eax
+	jb	L(continue_0_16)
+	cmp	$32, %eax
+	jb	L(continue_16_16)
+	cmp	$48, %eax
+	jb	L(continue_16_32)
+
+	.p2align 4
+L(continue_16_48):
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	mov	32(%esi), %ecx
+	cmp	%ecx, 32(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	36(%esi), %ecx
+	cmp	%ecx, 36(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	40(%esi), %ecx
+	cmp	%ecx, 40(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	44(%esi), %ecx
+	cmp	%ecx, 44(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	48(%edi), %xmm1
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_16_48)
+
+	.p2align 4
+L(continue_00_00):
+	movdqa	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqa	16(%edi), %xmm3
+	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm3		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
+	pmovmskb %xmm3, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqa	32(%edi), %xmm5
+	pcmpeqd	%xmm5, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%esi), %xmm5		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm5		/* packed sub of comparison results*/
+	pmovmskb %xmm5, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqa	48(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	48(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_00_00)
+
+	.p2align 4
+L(continue_00_32):
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_00_16):
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_00_0):
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%edi), %xmm2		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	add	$48, %esi
+	add	$48, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_48_00):
+	pcmpeqd	(%esi), %xmm0
+	mov	(%edi), %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(less4_double_words1)
+
+	cmp	(%esi), %eax
+	jne	L(nequal)
+
+	mov	4(%edi), %eax
+	cmp	4(%esi), %eax
+	jne	L(nequal)
+
+	mov	8(%edi), %eax
+	cmp	8(%esi), %eax
+	jne	L(nequal)
+
+	mov	12(%edi), %eax
+	cmp	12(%esi), %eax
+	jne	L(nequal)
+
+	movdqu	16(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	48(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_32_00):
+	movdqu	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_16_00):
+	movdqu	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_0_00):
+	movdqu	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%esi), %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	add	$48, %esi
+	add	$48, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_32_32):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_48_48)
+
+	.p2align 4
+L(continue_16_16):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm3
+	movdqu	16(%esi), %xmm4
+	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm4, %xmm3		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
+	pmovmskb %xmm3, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_48_48)
+
+	.p2align 4
+L(continue_0_0):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm3
+	movdqu	16(%esi), %xmm4
+	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm4, %xmm3		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
+	pmovmskb %xmm3, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_32)
+
+	add	$48, %esi
+	add	$48, %edi
+	jmp	L(continue_48_48)
+
+	.p2align 4
+L(continue_0_16):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_32_48)
+
+	.p2align 4
+L(continue_0_32):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_16_48)
+
+	.p2align 4
+L(continue_16_32):
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_32_48)
+
+	.p2align 4
+L(less4_double_words1):
+	cmp	(%esi), %eax
+	jne	L(nequal)
+	test	%eax, %eax
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(less4_double_words):
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words)
+	and	$15, %dl
+	jz	L(second_double_word)
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word):
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words):
+	and	$15, %dh
+	jz	L(fourth_double_word)
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word):
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(less4_double_words_16):
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words_16)
+	and	$15, %dl
+	jz	L(second_double_word_16)
+	mov	16(%esi), %ecx
+	cmp	%ecx, 16(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word_16):
+	mov	20(%esi), %ecx
+	cmp	%ecx, 20(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words_16):
+	and	$15, %dh
+	jz	L(fourth_double_word_16)
+	mov	24(%esi), %ecx
+	cmp	%ecx, 24(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word_16):
+	mov	28(%esi), %ecx
+	cmp	%ecx, 28(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(less4_double_words_32):
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words_32)
+	and	$15, %dl
+	jz	L(second_double_word_32)
+	mov	32(%esi), %ecx
+	cmp	%ecx, 32(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word_32):
+	mov	36(%esi), %ecx
+	cmp	%ecx, 36(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words_32):
+	and	$15, %dh
+	jz	L(fourth_double_word_32)
+	mov	40(%esi), %ecx
+	cmp	%ecx, 40(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word_32):
+	mov	44(%esi), %ecx
+	cmp	%ecx, 44(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(less4_double_words_48):
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words_48)
+	and	$15, %dl
+	jz	L(second_double_word_48)
+	mov	48(%esi), %ecx
+	cmp	%ecx, 48(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word_48):
+	mov	52(%esi), %ecx
+	cmp	%ecx, 52(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words_48):
+	and	$15, %dh
+	jz	L(fourth_double_word_48)
+	mov	56(%esi), %ecx
+	cmp	%ecx, 56(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word_48):
+	mov	60(%esi), %ecx
+	cmp	%ecx, 60(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(nequal):
+	mov	$1, %eax
+	jg	L(return)
+	neg	%eax
+	RETURN
+
+	.p2align 4
+L(return):
+	RETURN
+
+	.p2align 4
+L(equal):
+	xorl	%eax, %eax
+	RETURN
+
+	CFI_POP (%edi)
+	CFI_POP (%esi)
+
+	.p2align 4
+L(neq):
+	mov	$1, %eax
+	jg	L(neq_bigger)
+	neg	%eax
+
+L(neq_bigger):
+	ret
+
+	.p2align 4
+L(eq):
+	xorl	%eax, %eax
+	ret
+
+END (__wcscmp_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S
new file mode 100644
index 0000000000..7118bdd4db
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S
@@ -0,0 +1,39 @@
+/* Multiple versions of wcscmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for the
+   DSO.  In static binaries, we need wcscmp before the initialization
+   happened.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__wcscmp)
+	.type	__wcscmp, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcscmp_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__wcscmp_sse2)
+2:	ret
+END(__wcscmp)
+weak_alias (__wcscmp, wcscmp)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c
new file mode 100644
index 0000000000..fb3000392b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c
@@ -0,0 +1,5 @@
+#if IS_IN (libc)
+# define wcscpy  __wcscpy_ia32
+#endif
+
+#include "wcsmbs/wcscpy.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
new file mode 100644
index 0000000000..6280ba92ab
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
@@ -0,0 +1,600 @@
+/* wcscpy with SSSE3
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	4
+# define RETURN	POP (%edi); ret; CFI_PUSH (%edi)
+# define STR1	PARMS
+# define STR2	STR1+4
+# define LEN	STR2+4
+
+	atom_text_section
+ENTRY (__wcscpy_ssse3)
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %ecx
+
+	cmp	$0, (%ecx)
+	jz	L(ExitTail4)
+	cmp	$0, 4(%ecx)
+	jz	L(ExitTail8)
+	cmp	$0, 8(%ecx)
+	jz	L(ExitTail12)
+	cmp	$0, 12(%ecx)
+	jz	L(ExitTail16)
+
+	PUSH	(%edi)
+	mov	%edx, %edi
+	PUSH	(%esi)
+	lea	16(%ecx), %esi
+
+	and	$-16, %esi
+
+	pxor	%xmm0, %xmm0
+	pcmpeqd	(%esi), %xmm0
+	movdqu	(%ecx), %xmm1
+	movdqu	%xmm1, (%edx)
+
+	pmovmskb %xmm0, %eax
+	sub	%ecx, %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%edx, %eax
+	lea	16(%edx), %edx
+	and	$-16, %edx
+	sub	%edx, %eax
+
+	sub	%eax, %ecx
+	mov	%ecx, %eax
+	and	$0xf, %eax
+	mov	$0, %esi
+
+	jz	L(Align16Both)
+	cmp	$4, %eax
+	je	L(Shl4)
+	cmp	$8, %eax
+	je	L(Shl8)
+	jmp	L(Shl12)
+
+L(Align16Both):
+	movaps	(%ecx), %xmm1
+	movaps	16(%ecx), %xmm2
+	movaps	%xmm1, (%edx)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm4
+	movaps	%xmm3, (%edx, %esi)
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm1
+	movaps	%xmm4, (%edx, %esi)
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm2
+	movaps	%xmm1, (%edx, %esi)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%edx, %esi)
+	mov	%ecx, %eax
+	lea	16(%ecx, %esi), %ecx
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	sub	%eax, %edx
+
+	mov	$-0x40, %esi
+
+L(Aligned64Loop):
+	movaps	(%ecx), %xmm2
+	movaps	32(%ecx), %xmm3
+	movaps	%xmm2, %xmm4
+	movaps	16(%ecx), %xmm5
+	movaps	%xmm3, %xmm6
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	lea	64(%edx), %edx
+	pcmpeqd	%xmm0, %xmm3
+	lea	64(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+
+	test	%eax, %eax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%edx)
+	movaps	%xmm5, -48(%edx)
+	movaps	%xmm6, -32(%edx)
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm5, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm4, -64(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm6, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm5, -48(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%edx)
+	pcmpeqd	%xmm7, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	$-0x40, %esi
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+	.p2align 4
+L(Shl4):
+	movaps	-4(%ecx), %xmm1
+	movaps	12(%ecx), %xmm2
+L(Shl4Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm1
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	28(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-12(%ecx), %ecx
+	sub	%eax, %edx
+
+	movaps	-4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+	movaps	12(%ecx), %xmm2
+	movaps	28(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$4, %xmm3, %xmm4
+	jnz	L(Shl4Start)
+
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 8(%edx)
+	POP	(%esi)
+	add	$12, %edx
+	add	$12, %ecx
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(Shl8):
+	movaps	-8(%ecx), %xmm1
+	movaps	8(%ecx), %xmm2
+L(Shl8Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm1
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	24(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-8(%ecx), %ecx
+	sub	%eax, %edx
+
+	movaps	-8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+	movaps	8(%ecx), %xmm2
+	movaps	24(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$8, %xmm3, %xmm4
+	jnz	L(Shl8Start)
+
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	POP	(%esi)
+	add	$8, %edx
+	add	$8, %ecx
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(Shl12):
+	movaps	-12(%ecx), %xmm1
+	movaps	4(%ecx), %xmm2
+L(Shl12Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm1
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	20(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-4(%ecx), %ecx
+	sub	%eax, %edx
+
+	movaps	-12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+	movaps	4(%ecx), %xmm2
+	movaps	20(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$12, %xmm3, %xmm4
+	jnz	L(Shl12Start)
+
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+	movl	(%ecx), %esi
+	movl	%esi, (%edx)
+	mov	$4, %esi
+
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+L(Exit8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit12)
+L(Exit16):
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edi, %eax
+	RETURN
+
+CFI_POP	(%edi)
+
+	.p2align 4
+L(ExitTail4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail16):
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	movl	%edx, %eax
+	ret
+
+END (__wcscpy_ssse3)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S
new file mode 100644
index 0000000000..cfc97dd87c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S
@@ -0,0 +1,36 @@
+/* Multiple versions of wcscpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(wcscpy)
+	.type	wcscpy, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcscpy_ia32)
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__wcscpy_ssse3)
+2:	ret
+END(wcscpy)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c
new file mode 100644
index 0000000000..a335dc0f7e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c
@@ -0,0 +1,9 @@
+#include <wchar.h>
+
+#if IS_IN (libc)
+# define WCSLEN  __wcslen_ia32
+#endif
+
+extern __typeof (wcslen) __wcslen_ia32;
+
+#include "wcsmbs/wcslen.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S
new file mode 100644
index 0000000000..bd3fc4c79b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S
@@ -0,0 +1,193 @@
+/* wcslen with SSE2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+# define STR	4
+
+	.text
+ENTRY (__wcslen_sse2)
+	mov	STR(%esp), %edx
+
+	cmp	$0, (%edx)
+	jz	L(exit_tail0)
+	cmp	$0, 4(%edx)
+	jz	L(exit_tail1)
+	cmp	$0, 8(%edx)
+	jz	L(exit_tail2)
+	cmp	$0, 12(%edx)
+	jz	L(exit_tail3)
+	cmp	$0, 16(%edx)
+	jz	L(exit_tail4)
+	cmp	$0, 20(%edx)
+	jz	L(exit_tail5)
+	cmp	$0, 24(%edx)
+	jz	L(exit_tail6)
+	cmp	$0, 28(%edx)
+	jz	L(exit_tail7)
+
+	pxor	%xmm0, %xmm0
+
+	lea	32(%edx), %eax
+	lea	16(%edx), %ecx
+	and	$-16, %eax
+
+	pcmpeqd	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	pxor	%xmm2, %xmm2
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	pxor	%xmm3, %xmm3
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	and	$-0x40, %eax
+
+	.p2align 4
+L(aligned_64_loop):
+	movaps	(%eax), %xmm0
+	movaps	16(%eax), %xmm1
+	movaps	32(%eax), %xmm2
+	movaps	48(%eax), %xmm6
+
+	pminub	%xmm1, %xmm0
+	pminub	%xmm6, %xmm2
+	pminub	%xmm0, %xmm2
+	pcmpeqd	%xmm3, %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	64(%eax), %eax
+	jz	L(aligned_64_loop)
+
+	pcmpeqd	-64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	48(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqd	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqd	-32(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqd	%xmm6, %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	jmp	L(aligned_64_loop)
+
+	.p2align 4
+L(exit):
+	sub	%ecx, %eax
+	shr	$2, %eax
+	test	%dl, %dl
+	jz	L(exit_high)
+
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(exit_1)
+	ret
+
+	.p2align 4
+L(exit_high):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(exit_3)
+	add	$2, %eax
+	ret
+
+	.p2align 4
+L(exit_1):
+	add	$1, %eax
+	ret
+
+	.p2align 4
+L(exit_3):
+	add	$3, %eax
+	ret
+
+	.p2align 4
+L(exit_tail0):
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(exit_tail1):
+	mov	$1, %eax
+	ret
+
+	.p2align 4
+L(exit_tail2):
+	mov	$2, %eax
+	ret
+
+	.p2align 4
+L(exit_tail3):
+	mov	$3, %eax
+	ret
+
+	.p2align 4
+L(exit_tail4):
+	mov	$4, %eax
+	ret
+
+	.p2align 4
+L(exit_tail5):
+	mov	$5, %eax
+	ret
+
+	.p2align 4
+L(exit_tail6):
+	mov	$6, %eax
+	ret
+
+	.p2align 4
+L(exit_tail7):
+	mov	$7, %eax
+	ret
+
+END (__wcslen_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S
new file mode 100644
index 0000000000..6ef9b6e7b5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S
@@ -0,0 +1,37 @@
+/* Multiple versions of wcslen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__wcslen)
+	.type	__wcslen, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcslen_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__wcslen_sse2)
+2:	ret
+END(__wcslen)
+
+weak_alias(__wcslen, wcslen)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c
new file mode 100644
index 0000000000..8d8a335b5b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c
@@ -0,0 +1,5 @@
+#if IS_IN (libc)
+# define wcsrchr  __wcsrchr_ia32
+#endif
+
+#include "wcsmbs/wcsrchr.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
new file mode 100644
index 0000000000..1a9b60e55e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
@@ -0,0 +1,354 @@
+/* wcsrchr with SSE2, without using bsf instructions.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	8
+# define ENTRANCE	PUSH (%edi);
+# define RETURN	POP (%edi); ret; CFI_PUSH (%edi);
+# define STR1	PARMS
+# define STR2	STR1+4
+
+	atom_text_section
+ENTRY (__wcsrchr_sse2)
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	mov	%ecx, %edi
+	punpckldq %xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	punpckldq %xmm1, %xmm1
+
+/* ECX has OFFSET. */
+	and	$63, %ecx
+	cmp	$48, %ecx
+	ja	L(crosscache)
+
+/* unaligned string. */
+	movdqu	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+/* Find where NULL is.  */
+	pmovmskb %xmm2, %ecx
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match1)
+
+	test	%ecx, %ecx
+	jnz	L(return_null)
+
+	and	$-16, %edi
+
+	PUSH	(%esi)
+
+	xor	%edx, %edx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%ecx, %ecx
+	jnz	L(prolog_find_zero_1)
+
+	PUSH	(%esi)
+
+/* Save current match */
+	mov	%eax, %edx
+	mov	%edi, %esi
+	and	$-16, %edi
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(crosscache):
+/* Hancle unaligned string.  */
+	and	$15, %ecx
+	and	$-16, %edi
+	pxor	%xmm3, %xmm3
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm3
+	pcmpeqd	%xmm1, %xmm0
+/* Find where NULL is.  */
+	pmovmskb %xmm3, %edx
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+/* Remove the leading bytes.  */
+	shr	%cl, %edx
+	shr	%cl, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	PUSH	(%esi)
+
+	xor	%edx, %edx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(unaligned_match):
+	test	%edx, %edx
+	jnz	L(prolog_find_zero)
+
+	PUSH	(%esi)
+
+	mov	%eax, %edx
+	lea	(%edi, %ecx), %esi
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm3
+	pcmpeqd	%xmm3, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm4
+	pcmpeqd	%xmm4, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm4
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm4, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm5
+	pcmpeqd	%xmm5, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm5
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm5, %eax
+	or	%eax, %ecx
+	jz	L(loop)
+
+	.p2align 4
+L(matches):
+	test	%eax, %eax
+	jnz	L(match)
+L(return_value):
+	test	%edx, %edx
+	jz	L(return_null_1)
+	mov	%edx, %eax
+	mov	%esi, %edi
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(return_null_1):
+	POP	(%esi)
+
+	xor	%eax, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(find_zero)
+/* save match info */
+	mov	%eax, %edx
+	mov	%edi, %esi
+	jmp	L(loop)
+
+	.p2align 4
+L(find_zero):
+	test	%cl, %cl
+	jz	L(find_zero_in_third_or_fourth_wchar)
+	test	$15, %cl
+	jz	L(find_zero_in_second_wchar)
+	and	$1, %eax
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_third_or_fourth_wchar):
+	test	$15, %ch
+	jz	L(find_zero_in_fourth_wchar)
+	and	$1 << 9 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_fourth_wchar):
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match_second_wchar):
+	lea	-12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_third_or_fourth_wchar):
+	test	$15 << 4, %ah
+	jnz	L(match_fourth_wchar)
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_third_wchar):
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_fourth_wchar):
+	lea	-4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero):
+	add	%ecx, %edi
+	mov     %edx, %ecx
+L(prolog_find_zero_1):
+	test	%cl, %cl
+	jz	L(prolog_find_zero_in_third_or_fourth_wchar)
+	test	$15, %cl
+	jz	L(prolog_find_zero_in_second_wchar)
+	and	$1, %eax
+	jz	L(return_null)
+
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %eax
+	jz	L(return_null)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_third_or_fourth_wchar):
+	test	$15, %ch
+	jz	L(prolog_find_zero_in_fourth_wchar)
+	and	$1 << 9 - 1, %eax
+	jz	L(return_null)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_fourth_wchar):
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+END (__wcsrchr_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S
new file mode 100644
index 0000000000..cf67333995
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S
@@ -0,0 +1,35 @@
+/* Multiple versions of wcsrchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(wcsrchr)
+	.type	wcsrchr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcsrchr_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__wcsrchr_sse2)
+2:	ret
+END(wcsrchr)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c
new file mode 100644
index 0000000000..75ab4b94c1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c
@@ -0,0 +1,9 @@
+#include <wchar.h>
+
+#if IS_IN (libc)
+# define WMEMCMP  __wmemcmp_ia32
+#endif
+
+extern __typeof (wmemcmp) __wmemcmp_ia32;
+
+#include "wcsmbs/wmemcmp.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
new file mode 100644
index 0000000000..1a857c7e21
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_sse4_2
+
+#include "memcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
new file mode 100644
index 0000000000..a41ef95fc1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_ssse3
+
+#include "memcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S
new file mode 100644
index 0000000000..1b9a54a413
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S
@@ -0,0 +1,40 @@
+/* Multiple versions of wmemcmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+	.text
+ENTRY(wmemcmp)
+	.type	wmemcmp, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wmemcmp_ia32)
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__wmemcmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__wmemcmp_sse4_2)
+2:	ret
+END(wmemcmp)
+#endif