about summary refs log tree commit diff
path: root/REORG.TODO/sysdeps/i386/i686/multiarch
diff options
context:
space:
mode:
Diffstat (limited to 'REORG.TODO/sysdeps/i386/i686/multiarch')
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/Makefile | 44
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S | 59
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S | 62
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c | 376
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym | 11
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S | 502
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S | 709
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S | 65
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S | 1225
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S | 2157
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S | 62
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S | 681
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S | 1809
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S | 3162
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S | 78
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S | 50
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S | 89
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S | 94
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S | 81
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S | 50
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c | 7
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S | 417
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S | 724
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S | 45
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S | 811
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S | 860
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memset.S | 75
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S | 82
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S | 65
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c | 27
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c | 34
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c | 27
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c | 34
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S | 9
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S | 8
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c | 12
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S | 39
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c | 13
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S | 7
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S | 1245
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S | 572
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S | 92
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S | 158
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S | 348
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S | 57
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S | 804
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S | 2810
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S | 95
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S | 2250
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S | 3901
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S | 116
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S | 75
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S | 125
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S | 695
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S | 60
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c | 8
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S | 39
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c | 13
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S | 7
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c | 8
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c | 8
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c | 8
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c | 10
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S | 3
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S | 37
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S | 282
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S | 708
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S | 57
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c | 2
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S | 56
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h | 1
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c | 22
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S | 219
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S | 36
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c | 14
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S | 1018
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S | 39
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S | 600
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S | 36
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c | 9
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S | 193
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S | 37
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c | 5
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S | 354
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S | 35
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c | 9
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S | 4
-rw-r--r-- REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S | 40
127 files changed, 32113 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile b/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile
new file mode 100644
index 0000000000..4a0c20c051
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile
@@ -0,0 +1,44 @@
+ifeq ($(subdir),csu) # csu: register the test-multiarch test.
+tests += test-multiarch
+endif # subdir == csu
+
+ifeq ($(subdir),string) # string: add the SSE2/SSSE3/SSE4 string and memory variants.
+gen-as-const-headers += locale-defines.sym
+sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
+		   memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \
+		   memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
+		   memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \
+		   strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \
+		   memcmp-ssse3 memcmp-sse4 varshift \
+		   strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \
+		   strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \
+		   strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \
+		   strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \
+		   strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \
+		   memchr-sse2 memchr-sse2-bsf \
+		   memrchr-sse2 memrchr-sse2-bsf memrchr-c \
+		   rawmemchr-sse2 rawmemchr-sse2-bsf \
+		   strnlen-sse2 strnlen-c \
+		   strcasecmp_l-c strcasecmp-c strcasecmp_l-ssse3 \
+		   strncase_l-c strncase-c strncase_l-ssse3 \
+		   strcasecmp_l-sse4 strncase_l-sse4 \
+		   bcopy-sse2-unaligned memcpy-sse2-unaligned \
+		   mempcpy-sse2-unaligned memmove-sse2-unaligned \
+		   strcspn-c strpbrk-c strspn-c
+CFLAGS-varshift.c += -msse4
+CFLAGS-strcspn-c.c += -msse4
+CFLAGS-strpbrk-c.c += -msse4
+CFLAGS-strspn-c.c += -msse4
+endif # subdir == string
+
+ifeq ($(subdir),wcsmbs) # wcsmbs: add the SSE2/SSSE3/SSE4 wide-character variants.
+sysdep_routines += wcscmp-sse2 wcscmp-c wcslen-sse2 wcslen-c \
+		   wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcschr-sse2 \
+		   wcschr-c wcsrchr-sse2 wcsrchr-c wcscpy-ssse3 wcscpy-c
+endif # subdir == wcsmbs
+
+ifeq ($(subdir),math) # math: add the -mavx builds of fma/fmaf to libm.
+libm-sysdep_routines += s_fma-fma s_fmaf-fma
+CFLAGS-s_fma-fma.c += -mavx -mfpmath=sse
+CFLAGS-s_fmaf-fma.c += -mavx -mfpmath=sse
+endif # subdir == math
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S
new file mode 100644
index 0000000000..efef2a10dd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE	/* Switches consumed by the included file;  */
+#define USE_AS_BCOPY	/* their exact effect is not visible here.  */
+#define MEMCPY		__bcopy_sse2_unaligned	/* Presumably the symbol name the included code is emitted under -- confirm there.  */
+#include "memcpy-sse2-unaligned.S"	/* Implementation source this stub configures.  */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
new file mode 100644
index 0000000000..cbc8b420e8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE	/* Switches consumed by the included file;  */
+#define USE_AS_BCOPY	/* their exact effect is not visible here.  */
+#define MEMCPY		__bcopy_ssse3_rep	/* Presumably the symbol name the included code is emitted under -- confirm there.  */
+#include "memcpy-ssse3-rep.S"	/* Implementation source this stub configures.  */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S
new file mode 100644
index 0000000000..36aac44b9c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE	/* Switches consumed by the included file;  */
+#define USE_AS_BCOPY	/* their exact effect is not visible here.  */
+#define MEMCPY		__bcopy_ssse3	/* Presumably the symbol name the included code is emitted under -- confirm there.  */
+#include "memcpy-ssse3.S"	/* Implementation source this stub configures.  */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S
new file mode 100644
index 0000000000..877f82c28f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S
@@ -0,0 +1,59 @@
+/* Multiple versions of bcopy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(bcopy)	/* IFUNC resolver: picks one bcopy implementation.  */
+	.type	bcopy, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__bcopy_ia32)	/* Default candidate.  */
+	HAS_CPU_FEATURE (SSE2)	/* NOTE(review): the branches assume ZF is set when a feature is absent -- confirm in init-arch.h.  */
+	jz	2f	/* No SSE2: keep __bcopy_ia32.  */
+	LOAD_FUNC_GOT_EAX (__bcopy_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* Fast unaligned loads: keep __bcopy_sse2_unaligned.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* No SSSE3: keep __bcopy_sse2_unaligned.  */
+	LOAD_FUNC_GOT_EAX (__bcopy_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f	/* No fast rep string: keep __bcopy_ssse3.  */
+	LOAD_FUNC_GOT_EAX (__bcopy_ssse3_rep)
+2:	ret	/* Result is the last candidate loaded (in %eax, per the macro name).  */
+END(bcopy)
+
+# undef ENTRY	/* From here on ENTRY ignores its argument and opens the hidden symbol __bcopy_ia32.  */
+# define ENTRY(name) \
+	.type __bcopy_ia32, @function; \
+	.p2align 4; \
+	.globl __bcopy_ia32; \
+	.hidden __bcopy_ia32; \
+	__bcopy_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Likewise END always closes __bcopy_ia32.  */
+# define END(name) \
+	cfi_endproc; .size __bcopy_ia32, .-__bcopy_ia32
+
+#endif	/* IS_IN (libc) */
+
+#include "../bcopy.S"	/* Generic code; inside libc it is presumably emitted as __bcopy_ia32 via the ENTRY/END above.  */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
new file mode 100644
index 0000000000..507b288bb3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO	/* Switch consumed by the included file.  */
+#define __memset_sse2_rep __bzero_sse2_rep	/* Rename every __memset_sse2_rep token in the included file.  */
+#include "memset-sse2-rep.S"	/* Implementation source this stub configures.  */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S
new file mode 100644
index 0000000000..8d04512e4e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO	/* Switch consumed by the included file.  */
+#define __memset_sse2 __bzero_sse2	/* Rename every __memset_sse2 token in the included file.  */
+#include "memset-sse2.S"	/* Implementation source this stub configures.  */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S
new file mode 100644
index 0000000000..9dac490aa2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S
@@ -0,0 +1,62 @@
+/* Multiple versions of bzero
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__bzero)	/* IFUNC resolver: picks one bzero implementation.  */
+	.type	__bzero, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__bzero_ia32)	/* Default candidate.  */
+	HAS_CPU_FEATURE (SSE2)	/* NOTE(review): the branches assume ZF is set when a feature is absent -- confirm in init-arch.h.  */
+	jz	2f	/* No SSE2: keep __bzero_ia32.  */
+	LOAD_FUNC_GOT_EAX ( __bzero_sse2)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f	/* No fast rep string: keep __bzero_sse2.  */
+	LOAD_FUNC_GOT_EAX (__bzero_sse2_rep)
+2:	ret	/* Result is the last candidate loaded (in %eax, per the macro name).  */
+END(__bzero)
+
+# undef ENTRY	/* From here on ENTRY ignores its argument and opens the hidden symbol __bzero_ia32.  */
+# define ENTRY(name) \
+	.type __bzero_ia32, @function; \
+	.p2align 4; \
+	.globl __bzero_ia32; \
+	.hidden __bzero_ia32; \
+	__bzero_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Likewise END always closes __bzero_ia32.  */
+# define END(name) \
+	cfi_endproc; .size __bzero_ia32, .-__bzero_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library since
+   they are called without setting up the EBX needed by the PLT, which
+   IFUNC uses; so alias __GI___bzero directly to __bzero_ia32.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI___bzero; __GI___bzero = __bzero_ia32
+# endif	/* SHARED */
+#endif	/* IS_IN (libc) */
+
+#include "../bzero.S"	/* Generic code; inside libc it is presumably emitted as __bzero_ia32 via the ENTRY/END above.  */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c b/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
new file mode 100644
index 0000000000..e8026a2a78
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
@@ -0,0 +1,376 @@
+/* Enumerate available IFUNC implementations of a function.  i686 version.
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <assert.h>
+#include <string.h>
+#include <wchar.h>
+#include <ifunc-impl-list.h>
+#include "init-arch.h"
+
+/* Maximum number of IFUNC implementations listed for any one function.  */
+#define MAX_IFUNC	4
+
+/* Fill ARRAY of MAX elements (MAX must be at least MAX_IFUNC) with IFUNC
+   implementations for function NAME and return the number of valid entries.  */
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+			size_t max)
+{
+  assert (max >= MAX_IFUNC);	/* No list below has more than MAX_IFUNC entries.  */
+
+  size_t i = 0;
+
+  /* Support sysdeps/i386/i686/multiarch/bcopy.S.  */
+  IFUNC_IMPL (i, name, bcopy,
+	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3),
+			      __bcopy_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3),
+			      __bcopy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSE2),
+			      __bcopy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/bzero.S.  */
+  IFUNC_IMPL (i, name, bzero,
+	      IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2),
+			      __bzero_sse2_rep)
+	      IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2),
+			      __bzero_sse2)
+	      IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memchr.S.  */
+  IFUNC_IMPL (i, name, memchr,
+	      IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2),
+			      __memchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2),
+			      __memchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memcmp.S.  */
+  IFUNC_IMPL (i, name, memcmp,
+	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_2),
+			      __memcmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3),
+			      __memcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memmove_chk.S.  */
+  IFUNC_IMPL (i, name, __memmove_chk,
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memmove_chk_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memmove_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __memmove_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+			      __memmove_chk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memmove.S.  */
+  IFUNC_IMPL (i, name, memmove,
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
+			      __memmove_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
+			      __memmove_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSE2),
+			      __memmove_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memrchr.S.  */
+  IFUNC_IMPL (i, name, memrchr,
+	      IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2),
+			      __memrchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2),
+			      __memrchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memset_chk.S.  */
+  IFUNC_IMPL (i, name, __memset_chk,
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __memset_chk_sse2_rep)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __memset_chk_sse2)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+			      __memset_chk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memset.S.  */
+  IFUNC_IMPL (i, name, memset,
+	      IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2),
+			      __memset_sse2_rep)
+	      IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2),
+			      __memset_sse2)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/rawmemchr.S.  */
+  IFUNC_IMPL (i, name, rawmemchr,
+	      IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2),
+			      __rawmemchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2),
+			      __rawmemchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/stpncpy.S.  */
+  IFUNC_IMPL (i, name, stpncpy,
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
+			      __stpncpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSE2),
+			      __stpncpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/stpcpy.S.  */
+  IFUNC_IMPL (i, name, stpcpy,
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
+			      __stpcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSE2),
+			      __stpcpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcasecmp.S.  */
+  IFUNC_IMPL (i, name, strcasecmp,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strcasecmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strcasecmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcasecmp_l.S.  */
+  IFUNC_IMPL (i, name, strcasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strcasecmp_l_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strcasecmp_l_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
+			      __strcasecmp_l_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcat.S.  */
+  IFUNC_IMPL (i, name, strcat,
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
+			      __strcat_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSE2),
+			      __strcat_sse2)
+	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strchr.S.  */
+  IFUNC_IMPL (i, name, strchr,
+	      IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2),
+			      __strchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2),
+			      __strchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcmp.S.  */
+  IFUNC_IMPL (i, name, strcmp,
+	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
+			      __strcmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3),
+			      __strcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcpy.S.  */
+  IFUNC_IMPL (i, name, strcpy,
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
+			      __strcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSE2),
+			      __strcpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strcspn.S.  */
+  IFUNC_IMPL (i, name, strcspn,
+	      IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2),
+			      __strcspn_sse42)
+	      IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncase.S.  */
+  IFUNC_IMPL (i, name, strncasecmp,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strncasecmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strncasecmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
+			      __strncasecmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncase_l.S.  */
+  IFUNC_IMPL (i, name, strncasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strncasecmp_l_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strncasecmp_l_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
+			      __strncasecmp_l_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncat.S.  */
+  IFUNC_IMPL (i, name, strncat,
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
+			      __strncat_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSE2),
+			      __strncat_sse2)
+	      IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncpy.S.  */
+  IFUNC_IMPL (i, name, strncpy,
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
+			      __strncpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSE2),
+			      __strncpy_sse2)
+	      IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strnlen.S.  */
+  IFUNC_IMPL (i, name, strnlen,
+	      IFUNC_IMPL_ADD (array, i, strnlen, HAS_CPU_FEATURE (SSE2),
+			      __strnlen_sse2)
+	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strpbrk.S.  */
+  IFUNC_IMPL (i, name, strpbrk,
+	      IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2),
+			      __strpbrk_sse42)
+	      IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strrchr.S.  */
+  IFUNC_IMPL (i, name, strrchr,
+	      IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2),
+			      __strrchr_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2),
+			      __strrchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strspn.S.  */
+  IFUNC_IMPL (i, name, strspn,
+	      IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2),
+			      __strspn_sse42)
+	      IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcschr.S.  */
+  IFUNC_IMPL (i, name, wcschr,
+	      IFUNC_IMPL_ADD (array, i, wcschr, HAS_CPU_FEATURE (SSE2),
+			      __wcschr_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcscmp.S.  */
+  IFUNC_IMPL (i, name, wcscmp,
+	      IFUNC_IMPL_ADD (array, i, wcscmp, HAS_CPU_FEATURE (SSE2),
+			      __wcscmp_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcscpy.S.  */
+  IFUNC_IMPL (i, name, wcscpy,
+	      IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
+			      __wcscpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcslen.S.  */
+  IFUNC_IMPL (i, name, wcslen,
+	      IFUNC_IMPL_ADD (array, i, wcslen, HAS_CPU_FEATURE (SSE2),
+			      __wcslen_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wcsrchr.S.  */
+  IFUNC_IMPL (i, name, wcsrchr,
+	      IFUNC_IMPL_ADD (array, i, wcsrchr, HAS_CPU_FEATURE (SSE2),
+			      __wcsrchr_sse2)
+	      IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/wmemcmp.S.  */
+  IFUNC_IMPL (i, name, wmemcmp,
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_2),
+			      __wmemcmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3),
+			      __wmemcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_ia32))
+
+#ifdef SHARED	/* The lists below are compiled in only when SHARED is defined.  */
+  /* Support sysdeps/i386/i686/multiarch/memcpy_chk.S.  */
+  IFUNC_IMPL (i, name, __memcpy_chk,
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_chk_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __memcpy_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+			      __memcpy_chk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/memcpy.S.  */
+  IFUNC_IMPL (i, name, memcpy,
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSE2),
+			      __memcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/mempcpy_chk.S.  */
+  IFUNC_IMPL (i, name, __mempcpy_chk,
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_chk_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSE2),
+			      __mempcpy_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+			      __mempcpy_chk_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/mempcpy.S.  */
+  IFUNC_IMPL (i, name, mempcpy,
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_ssse3_rep)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSE2),
+			      __mempcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strlen.S.  */
+  IFUNC_IMPL (i, name, strlen,
+	      IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2),
+			      __strlen_sse2_bsf)
+	      IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2),
+			      __strlen_sse2)
+	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_ia32))
+
+  /* Support sysdeps/i386/i686/multiarch/strncmp.S.  */
+  IFUNC_IMPL (i, name, strncmp,
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2),
+			      __strncmp_sse4_2)
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
+			      __strncmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_ia32))
+#endif	/* SHARED */
+
+  return i;
+}
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym b/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym
new file mode 100644
index 0000000000..aebff9a4f9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym
@@ -0,0 +1,11 @@
+#include <locale/localeinfo.h>
+#include <langinfo.h>
+#include <stddef.h>
+
+--
+
+LOCALE_T___LOCALES		offsetof (struct __locale_struct, __locales)
+LC_CTYPE
+_NL_CTYPE_NONASCII_CASE
+LOCALE_DATA_VALUES		offsetof (struct __locale_data, values)
+SIZEOF_VALUES			sizeof (((struct __locale_data *) 0)->values[0])
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
new file mode 100644
index 0000000000..dd316486e6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
@@ -0,0 +1,502 @@
+/* Optimized memchr with sse2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+
+# ifndef USE_AS_RAWMEMCHR
+#  define LEN   STR2+4
+#  define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
+# endif
+
+# ifndef MEMCHR
+#  define MEMCHR __memchr_sse2_bsf
+# endif
+
+	.text
+ENTRY (MEMCHR)
+
+	mov	STR1(%esp), %ecx	/* ecx = first stack argument (start pointer).  */
+	movd	STR2(%esp), %xmm1	/* xmm1 = second stack argument (value searched for).  */
+
+# ifndef USE_AS_RAWMEMCHR
+	mov	LEN(%esp), %edx	/* edx = third stack argument (length).  */
+	test	%edx, %edx
+	jz	L(return_null_1)	/* Zero length: return 0.  */
+# endif
+	mov	%ecx, %eax	/* Keep the unmodified pointer in eax.  */
+
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+
+	and	$63, %ecx	/* ecx = pointer offset within a 64-byte block.  */
+	pshufd	$0, %xmm1, %xmm1	/* Low byte of xmm1 is now replicated to all 16 bytes.  */
+
+	cmp	$48, %ecx
+	ja	L(crosscache)	/* Offset > 48: a 16-byte load would cross the 64-byte boundary.  */
+
+	movdqu	(%eax), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	je	L(unaligned_no_match_1)
+/* Check which byte is a match.  */
+	bsf	%ecx, %ecx	/* ecx = index of the first matching byte.  */
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	%ecx, %edx
+	jbe	L(return_null_1)	/* Match index is at or beyond the length: return 0.  */
+# endif
+	add	%ecx, %eax	/* eax = address of the matching byte.  */
+	ret
+
+	.p2align 4
+L(unaligned_no_match_1):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$16, %edx
+	jbe	L(return_null_1)
+	PUSH	(%edi)
+	lea	16(%eax), %edi
+	and	$15, %eax
+	and	$-16, %edi
+	add	%eax, %edx
+# else
+	lea	16(%eax), %edx
+	and	$-16, %edx
+# endif
+	jmp	L(loop_prolog)
+
+	.p2align 4
+L(return_null_1):
+	xor	%eax, %eax
+	ret
+
+# ifndef USE_AS_RAWMEMCHR
+	CFI_POP	(%edi)
+# endif
+
+	.p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+
+# ifndef USE_AS_RAWMEMCHR
+	PUSH	(%edi)
+	mov	%eax, %edi
+	and	$15, %ecx	/* ecx = pointer offset within its 16-byte block.  */
+	and	$-16, %edi	/* edi = pointer rounded down to a 16-byte boundary.  */
+	movdqa	(%edi), %xmm0
+# else
+	mov	%eax, %edx
+	and	$15, %ecx	/* ecx = pointer offset within its 16-byte block.  */
+	and	$-16, %edx	/* edx = pointer rounded down to a 16-byte boundary.  */
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+/* Remove the leading bytes.  */
+	sar	%cl, %eax	/* Shift out mask bits for bytes before the start pointer.  */
+	test	%eax, %eax
+	je	L(unaligned_no_match)
+/* Check which byte is a match.  */
+	bsf	%eax, %eax	/* eax = match index relative to the start pointer.  */
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	%eax, %edx
+	jbe	L(return_null)	/* Match index is at or beyond the length: return 0.  */
+	add	%edi, %eax
+	add	%ecx, %eax	/* eax = aligned base + offset + match index.  */
+	RETURN
+# else
+	add	%edx, %eax
+	add	%ecx, %eax	/* eax = aligned base + offset + match index.  */
+	ret
+# endif
+
+	.p2align 4
+L(unaligned_no_match):
+# ifndef USE_AS_RAWMEMCHR
+        /* Add the misalignment (ecx) to the remaining length (edx) and
+           guard against addition overflow by using saturated math:
+           edx = ecx + edx
+           edx |= -(edx < ecx)  */
+	add	%ecx, %edx
+	sbb	%eax, %eax
+	or	%eax, %edx
+	sub	$16, %edx
+	jbe	L(return_null)
+	add	$16, %edi
+# else
+	add	$16, %edx
+# endif
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop_prolog):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+# else
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm4
+
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+# ifndef USE_AS_RAWMEMCHR
+	test	$0x3f, %edi
+# else
+	test	$0x3f, %edx
+# endif
+	jz	L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+# else
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm3
+# else
+	movdqa	48(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+	test	%eax, %eax
+	jnz	L(matches0)
+
+# ifndef USE_AS_RAWMEMCHR
+	mov	%edi, %ecx
+	and	$-64, %edi
+	and	$63, %ecx
+	add	%ecx, %edx
+# else
+	and	$-64, %edx
+# endif
+
+	.p2align 4
+L(align64_loop):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+	movdqa	16(%edi), %xmm2
+	movdqa	32(%edi), %xmm3
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	(%edx), %xmm0
+	movdqa	16(%edx), %xmm2
+	movdqa	32(%edx), %xmm3
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm0, %xmm3
+	pmaxub	%xmm2, %xmm4
+	pmaxub	%xmm3, %xmm4
+	pmovmskb %xmm4, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edi
+# else
+	sub	$64, %edx
+# endif
+
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+
+	pcmpeqb	%xmm1, %xmm3
+
+# ifndef USE_AS_RAWMEMCHR
+	pcmpeqb	48(%edi), %xmm1
+# else
+	pcmpeqb	48(%edx), %xmm1
+# endif
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	pmovmskb %xmm1, %eax
+	bsf	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	lea	48(%edi, %eax), %eax
+	RETURN
+# else
+	lea	48(%edx, %eax), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+	movdqa	16(%edi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	32(%edi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	48(%edi), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(exit_loop_32):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches_1)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	16(%edi), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)
+	xor	%eax, %eax
+	RETURN
+# endif
+	.p2align 4
+L(matches0):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	lea	-16(%eax, %edi), %eax
+	RETURN
+# else
+	lea	-16(%eax, %edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(matches):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	add	%edi, %eax
+	RETURN
+# else
+	add	%edx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(matches16):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	lea	16(%eax, %edi), %eax
+	RETURN
+# else
+	lea	16(%eax, %edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(matches32):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	lea	32(%eax, %edi), %eax
+	RETURN
+# else
+	lea	32(%eax, %edx), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(matches_1):
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	add	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(matches16_1):
+	sub	$16, %edx
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	lea	16(%edi, %eax), %eax
+	RETURN
+
+	.p2align 4
+L(matches32_1):
+	sub	$32, %edx
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	lea	32(%edi, %eax), %eax
+	RETURN
+
+	.p2align 4
+L(matches48_1):
+	sub	$48, %edx
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	lea	48(%edi, %eax), %eax
+	RETURN
+# endif
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	RETURN
+# else
+	ret
+# endif
+
+END (MEMCHR)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S
new file mode 100644
index 0000000000..172d70de13
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S
@@ -0,0 +1,709 @@
+/* Optimized memchr with sse2 without bsf
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef USE_AS_RAWMEMCHR
+#  define ENTRANCE PUSH(%edi);
+#  define PARMS  8
+#  define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
+# else
+#  define ENTRANCE
+#  define PARMS  4
+# endif
+
+# define STR1  PARMS
+# define STR2  STR1+4
+
+# ifndef USE_AS_RAWMEMCHR
+#  define LEN   STR2+4
+# endif
+
+# ifndef MEMCHR
+#  define MEMCHR __memchr_sse2
+# endif
+
+	atom_text_section
+ENTRY (MEMCHR)
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+# ifndef USE_AS_RAWMEMCHR
+	mov	LEN(%esp), %edx
+	test	%edx, %edx
+	jz	L(return_null)
+# endif
+
+	punpcklbw %xmm1, %xmm1
+# ifndef USE_AS_RAWMEMCHR
+	mov	%ecx, %edi
+# else
+	mov	%ecx, %edx
+# endif
+	punpcklbw %xmm1, %xmm1
+
+	and	$63, %ecx
+	pshufd	$0, %xmm1, %xmm1
+	cmp	$48, %ecx
+	ja	L(crosscache)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqu	(%edi), %xmm0
+# else
+	movdqu	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	jnz	L(match_case2_prolog)
+
+	sub	$16, %edx
+	jbe	L(return_null)
+	lea	16(%edi), %edi
+	and	$15, %ecx
+	and	$-16, %edi
+	add	%ecx, %edx
+# else
+	jnz	L(match_case1_prolog)
+	lea	16(%edx), %edx
+	and	$-16, %edx
+# endif
+	jmp	L(loop_prolog)
+
+	.p2align 4
+L(crosscache):
+	and	$15, %ecx
+# ifndef USE_AS_RAWMEMCHR
+	and	$-16, %edi
+	movdqa	(%edi), %xmm0
+# else
+	and	$-16, %edx
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	sar	%cl, %eax
+	test	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	jnz	L(match_case2_prolog1)
+        /* "ecx" is less than 16.  Calculate "edx + ecx - 16" by using
+	   "edx - (16 - ecx)" instead of "(edx + ecx) - 16" to avoid
+	   possible addition overflow.  */
+	neg	%ecx
+	add	$16, %ecx
+	sub	%ecx, %edx
+	jbe	L(return_null)
+	lea	16(%edi), %edi
+# else
+	jnz	L(match_case1_prolog1)
+	lea	16(%edx), %edx
+# endif
+
+	.p2align 4
+L(loop_prolog):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+# else
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm4
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	lea	64(%edi), %edi
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	(%edi), %xmm0
+# else
+	lea	64(%edx), %edx
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm4
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	lea	64(%edi), %edi
+	mov	%edi, %ecx
+	and	$-64, %edi
+	and	$63, %ecx
+	add	%ecx, %edx
+# else
+	lea	64(%edx), %edx
+	and	$-64, %edx
+# endif
+
+	.p2align 4
+L(align64_loop):
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+	movdqa	16(%edi), %xmm2
+	movdqa	32(%edi), %xmm3
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	(%edx), %xmm0
+	movdqa	16(%edx), %xmm2
+	movdqa	32(%edx), %xmm3
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm0, %xmm3
+	pmaxub	%xmm2, %xmm4
+	pmaxub	%xmm3, %xmm4
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+	pmovmskb %xmm4, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edi
+# else
+	sub	$64, %edx
+# endif
+
+	pmovmskb %xmm0, %eax
+	xor	%ecx, %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+	pmovmskb %xmm2, %eax
+	lea	16(%ecx), %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	lea	16(%ecx), %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	pcmpeqb	48(%edi), %xmm1
+# else
+	pcmpeqb	48(%edx), %xmm1
+# endif
+	pmovmskb %xmm1, %eax
+	lea	16(%ecx), %ecx
+
+	.p2align 4
+L(match_case1):
+# ifndef USE_AS_RAWMEMCHR
+	add	%ecx, %edi
+# else
+L(match_case1_prolog1):
+	add	%ecx, %edx
+L(match_case1_prolog):
+# endif
+	test	%al, %al
+	jz	L(match_case1_high)
+	mov	%al, %cl
+	and	$15, %cl
+	jz	L(match_case1_8)
+	test	$0x01, %al
+	jnz	L(ExitCase1_1)
+	test	$0x02, %al
+	jnz	L(ExitCase1_2)
+	test	$0x04, %al
+	jnz	L(ExitCase1_3)
+# ifndef USE_AS_RAWMEMCHR
+	lea	3(%edi), %eax
+	RETURN
+# else
+	lea	3(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(match_case1_8):
+	test	$0x10, %al
+	jnz	L(ExitCase1_5)
+	test	$0x20, %al
+	jnz	L(ExitCase1_6)
+	test	$0x40, %al
+	jnz	L(ExitCase1_7)
+# ifndef USE_AS_RAWMEMCHR
+	lea	7(%edi), %eax
+	RETURN
+# else
+	lea	7(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(match_case1_high):
+	mov	%ah, %ch
+	and	$15, %ch
+	jz	L(match_case1_high_8)
+	test	$0x01, %ah
+	jnz	L(ExitCase1_9)
+	test	$0x02, %ah
+	jnz	L(ExitCase1_10)
+	test	$0x04, %ah
+	jnz	L(ExitCase1_11)
+# ifndef USE_AS_RAWMEMCHR
+	lea	11(%edi), %eax
+	RETURN
+# else
+	lea	11(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(match_case1_high_8):
+	test	$0x10, %ah
+	jnz	L(ExitCase1_13)
+	test	$0x20, %ah
+	jnz	L(ExitCase1_14)
+	test	$0x40, %ah
+	jnz	L(ExitCase1_15)
+# ifndef USE_AS_RAWMEMCHR
+	lea	15(%edi), %eax
+	RETURN
+# else
+	lea	15(%edx), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	movdqa	16(%edi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$32, %edx
+	jbe	L(return_null)
+
+	movdqa	32(%edi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	48(%edi), %xmm1
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+
+	xor	%eax, %eax
+	RETURN
+# endif
+
+	.p2align 4
+L(ExitCase1_1):
+# ifndef USE_AS_RAWMEMCHR
+	mov	%edi, %eax
+	RETURN
+# else
+	mov	%edx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_2):
+# ifndef USE_AS_RAWMEMCHR
+	lea	1(%edi), %eax
+	RETURN
+# else
+	lea	1(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_3):
+# ifndef USE_AS_RAWMEMCHR
+	lea	2(%edi), %eax
+	RETURN
+# else
+	lea	2(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_5):
+# ifndef USE_AS_RAWMEMCHR
+	lea	4(%edi), %eax
+	RETURN
+# else
+	lea	4(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_6):
+# ifndef USE_AS_RAWMEMCHR
+	lea	5(%edi), %eax
+	RETURN
+# else
+	lea	5(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_7):
+# ifndef USE_AS_RAWMEMCHR
+	lea	6(%edi), %eax
+	RETURN
+# else
+	lea	6(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_9):
+# ifndef USE_AS_RAWMEMCHR
+	lea	8(%edi), %eax
+	RETURN
+# else
+	lea	8(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_10):
+# ifndef USE_AS_RAWMEMCHR
+	lea	9(%edi), %eax
+	RETURN
+# else
+	lea	9(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_11):
+# ifndef USE_AS_RAWMEMCHR
+	lea	10(%edi), %eax
+	RETURN
+# else
+	lea	10(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_13):
+# ifndef USE_AS_RAWMEMCHR
+	lea	12(%edi), %eax
+	RETURN
+# else
+	lea	12(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_14):
+# ifndef USE_AS_RAWMEMCHR
+	lea	13(%edi), %eax
+	RETURN
+# else
+	lea	13(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_15):
+# ifndef USE_AS_RAWMEMCHR
+	lea	14(%edi), %eax
+	RETURN
+# else
+	lea	14(%edx), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(match_case2):
+	sub	%ecx, %edx
+L(match_case2_prolog1):
+	add	%ecx, %edi
+L(match_case2_prolog):
+	test	%al, %al
+	jz	L(match_case2_high)
+	mov	%al, %cl
+	and	$15, %cl
+	jz	L(match_case2_8)
+	test	$0x01, %al
+	jnz	L(ExitCase2_1)
+	test	$0x02, %al
+	jnz	L(ExitCase2_2)
+	test	$0x04, %al
+	jnz	L(ExitCase2_3)
+	sub	$4, %edx
+	jb	L(return_null)
+	lea	3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_8):
+	test	$0x10, %al
+	jnz	L(ExitCase2_5)
+	test	$0x20, %al
+	jnz	L(ExitCase2_6)
+	test	$0x40, %al
+	jnz	L(ExitCase2_7)
+	sub	$8, %edx
+	jb	L(return_null)
+	lea	7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_high):
+	mov	%ah, %ch
+	and	$15, %ch
+	jz	L(match_case2_high_8)
+	test	$0x01, %ah
+	jnz	L(ExitCase2_9)
+	test	$0x02, %ah
+	jnz	L(ExitCase2_10)
+	test	$0x04, %ah
+	jnz	L(ExitCase2_11)
+	sub	$12, %edx
+	jb	L(return_null)
+	lea	11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_high_8):
+	test	$0x10, %ah
+	jnz	L(ExitCase2_13)
+	test	$0x20, %ah
+	jnz	L(ExitCase2_14)
+	test	$0x40, %ah
+	jnz	L(ExitCase2_15)
+	sub	$16, %edx
+	jb	L(return_null)
+	lea	15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_1):
+	mov	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_2):
+	sub	$2, %edx
+	jb	L(return_null)
+	lea	1(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_3):
+	sub	$3, %edx
+	jb	L(return_null)
+	lea	2(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_5):
+	sub	$5, %edx
+	jb	L(return_null)
+	lea	4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_6):
+	sub	$6, %edx
+	jb	L(return_null)
+	lea	5(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_7):
+	sub	$7, %edx
+	jb	L(return_null)
+	lea	6(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_9):
+	sub	$9, %edx
+	jb	L(return_null)
+	lea	8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_10):
+	sub	$10, %edx
+	jb	L(return_null)
+	lea	9(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_11):
+	sub	$11, %edx
+	jb	L(return_null)
+	lea	10(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_13):
+	sub	$13, %edx
+	jb	L(return_null)
+	lea	12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_14):
+	sub	$14, %edx
+	jb	L(return_null)
+	lea	13(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_15):
+	sub	$15, %edx
+	jb	L(return_null)
+	lea	14(%edi), %eax
+	RETURN
+# endif
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	RETURN
+# else
+	ret
+# endif
+
+END (MEMCHR)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S
new file mode 100644
index 0000000000..bd0dace290
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S
@@ -0,0 +1,65 @@
+/* Multiple versions of memchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__memchr)
+	.type	__memchr, @gnu_indirect_function	/* Mark __memchr as an IFUNC symbol.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* Presumably ZF set means SSE2 is absent: use __memchr_ia32.  */
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	3f	/* Presumably ZF set means Slow_BSF is clear: use __memchr_sse2_bsf.  */
+
+	LOAD_FUNC_GOT_EAX ( __memchr_sse2)	/* Otherwise select the SSE2 variant written without bsf.  */
+	ret
+
+2:	LOAD_FUNC_GOT_EAX (__memchr_ia32)
+	ret
+
+3:	LOAD_FUNC_GOT_EAX (__memchr_sse2_bsf)
+	ret
+END(__memchr)
+
+weak_alias(__memchr, memchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memchr_ia32, @function; \
+	.globl __memchr_ia32; \
+	.p2align 4; \
+	__memchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memchr_ia32, .-__memchr_ia32
+
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_memchr; __GI_memchr = __memchr_ia32
+
+#endif
+#include "../../memchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S
new file mode 100644
index 0000000000..2aa13048b2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S
@@ -0,0 +1,1225 @@
+/* memcmp with SSE4.2, wmemcmp with SSE4.2
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_sse4_2
+# endif
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	4
+# define BLK1	PARMS
+# define BLK2	BLK1 + 4
+# define LEN	BLK2 + 4
+# define RETURN	POP (%ebx); ret; CFI_PUSH (%ebx)
+
+
+# ifdef SHARED
+#  define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+	jump table with relative offsets.  INDEX is a register that contains
+	the index into the jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+/* We first load PC into EBX.  */	\
+	SETUP_PIC_REG(bx);	\
+/* Get the address of the jump table.  */	\
+	addl	$(TABLE - .), %ebx;	\
+/* Get the entry and convert the relative offset to the	\
+	absolute	address.  */	\
+	addl	(%ebx,INDEX,SCALE), %ebx;	\
+/* EBX now holds the absolute address of the target.  Go.  */	\
+	jmp	*%ebx
+# else
+#  define JMPTBL(I, B)	I
+
+/* Branch through an entry in a jump table.  TABLE is a jump table
+	with absolute addresses.  INDEX is a register that contains the
+	index into the jump table.  SCALE is the scale of INDEX.  */
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	jmp	*TABLE(,INDEX,SCALE)
+# endif
+
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
+
+	.section .text.sse4.2,"ax",@progbits
+ENTRY (MEMCMP)
+	movl	BLK1(%esp), %eax
+	movl	BLK2(%esp), %edx
+	movl	LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %ecx
+	test	%ecx, %ecx
+	jz	L(return0)
+# else
+	cmp	$1, %ecx
+	jbe	L(less1bytes)
+# endif
+
+	pxor	%xmm0, %xmm0
+	cmp	$64, %ecx
+	ja	L(64bytesormore)
+	cmp	$8, %ecx
+
+# ifndef USE_AS_WMEMCMP
+	PUSH	(%ebx)
+	jb	L(less8bytes)
+# else
+	jb	L(less8bytes)
+	PUSH	(%ebx)
+# endif
+
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less8bytes):
+	mov	(%eax), %bl
+	cmpb	(%edx), %bl
+	jne	L(nonzero)
+
+	mov	1(%eax), %bl
+	cmpb	1(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$2, %ecx
+	jz	L(0bytes)
+
+	mov	2(%eax), %bl
+	cmpb	2(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$3, %ecx
+	jz	L(0bytes)
+
+	mov	3(%eax), %bl
+	cmpb	3(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$4, %ecx
+	jz	L(0bytes)
+
+	mov	4(%eax), %bl
+	cmpb	4(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$5, %ecx
+	jz	L(0bytes)
+
+	mov	5(%eax), %bl
+	cmpb	5(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$6, %ecx
+	jz	L(0bytes)
+
+	mov	6(%eax), %bl
+	cmpb	6(%edx), %bl
+	je	L(0bytes)
+
+L(nonzero):
+	POP	(%ebx)
+	mov	$1, %eax
+	ja	L(above)
+	neg	%eax
+L(above):
+	ret
+	CFI_PUSH (%ebx)
+# endif
+
+	.p2align 4
+L(0bytes):
+	POP	(%ebx)
+	xor	%eax, %eax
+	ret
+
+# ifdef USE_AS_WMEMCMP
+
+/* for wmemcmp, case N == 1 */
+
+	.p2align 4
+L(less8bytes):
+	mov	(%eax), %ecx
+	cmp	(%edx), %ecx
+	je	L(return0)
+	mov	$1, %eax
+	jg	L(find_diff_bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(find_diff_bigger):
+	ret
+
+	.p2align 4
+L(return0):
+	xor	%eax, %eax
+	ret
+# endif
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less1bytes):
+	jb	L(0bytesend)
+	movzbl	(%eax), %eax
+	movzbl	(%edx), %edx
+	sub	%edx, %eax
+	ret
+
+	.p2align 4
+L(0bytesend):
+	xor	%eax, %eax
+	ret
+# endif
+	.p2align 4
+L(64bytesormore):
+	PUSH	(%ebx)
+	mov	%ecx, %ebx
+	mov	$64, %ecx
+	sub	$64, %ebx
+L(64bytesormore_loop):
+	movdqu	(%eax), %xmm1
+	movdqu	(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_16diff)
+
+	movdqu	16(%eax), %xmm1
+	movdqu	16(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_32diff)
+
+	movdqu	32(%eax), %xmm1
+	movdqu	32(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_48diff)
+
+	movdqu	48(%eax), %xmm1
+	movdqu	48(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_64diff)
+	add	%ecx, %eax
+	add	%ecx, %edx
+	sub	%ecx, %ebx
+	jae	L(64bytesormore_loop)
+	add	%ebx, %ecx
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+# ifdef USE_AS_WMEMCMP
+
+/* This label is needed only for filling table_64bytes.  */
+L(unreal_case):
+/* no code here */
+
+# endif
+	.p2align 4
+L(find_16diff):
+	sub	$16, %ecx
+L(find_32diff):
+	sub	$16, %ecx
+L(find_48diff):
+	sub	$16, %ecx
+L(find_64diff):
+	add	%ecx, %edx
+	add	%ecx, %eax
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(16bytes):
+	mov	-16(%eax), %ecx
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+# else
+	.p2align 4
+L(16bytes):
+	mov	-16(%eax), %ecx
+	cmp	-16(%edx), %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	cmp	-12(%edx), %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	cmp	-8(%edx), %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	cmp	-4(%edx), %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+# endif
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(49bytes):
+	movdqu	-49(%eax), %xmm1
+	movdqu	-49(%edx), %xmm2
+	mov	$-49, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(33bytes):
+	movdqu	-33(%eax), %xmm1
+	movdqu	-33(%edx), %xmm2
+	mov	$-33, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(17bytes):
+	mov	-17(%eax), %ecx
+	mov	-17(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(13bytes):
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(9bytes):
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(5bytes):
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(50bytes):
+	mov	$-50, %ebx
+	movdqu	-50(%eax), %xmm1
+	movdqu	-50(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(34bytes):
+	mov	$-34, %ebx
+	movdqu	-34(%eax), %xmm1
+	movdqu	-34(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(18bytes):
+	mov	-18(%eax), %ecx
+	mov	-18(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(14bytes):
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(10bytes):
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(6bytes):
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(2bytes):
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(51bytes):
+	mov	$-51, %ebx
+	movdqu	-51(%eax), %xmm1
+	movdqu	-51(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(35bytes):
+	mov	$-35, %ebx
+	movdqu	-35(%eax), %xmm1
+	movdqu	-35(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(19bytes):
+	movl	-19(%eax), %ecx
+	movl	-19(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+L(1bytes):
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+# endif
+	.p2align 4
+L(52bytes):
+	movdqu	-52(%eax), %xmm1
+	movdqu	-52(%edx), %xmm2
+	mov	$-52, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(36bytes):
+	movdqu	-36(%eax), %xmm1
+	movdqu	-36(%edx), %xmm2
+	mov	$-36, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(20bytes):
+	movdqu	-20(%eax), %xmm1
+	movdqu	-20(%edx), %xmm2
+	mov	$-20, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(53bytes):
+	movdqu	-53(%eax), %xmm1
+	movdqu	-53(%edx), %xmm2
+	mov	$-53, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(37bytes):
+	mov	$-37, %ebx
+	movdqu	-37(%eax), %xmm1
+	movdqu	-37(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(21bytes):
+	mov	$-21, %ebx
+	movdqu	-21(%eax), %xmm1
+	movdqu	-21(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(54bytes):
+	movdqu	-54(%eax), %xmm1
+	movdqu	-54(%edx), %xmm2
+	mov	$-54, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(38bytes):
+	mov	$-38, %ebx
+	movdqu	-38(%eax), %xmm1
+	movdqu	-38(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(22bytes):
+	mov	$-22, %ebx
+	movdqu	-22(%eax), %xmm1
+	movdqu	-22(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(55bytes):
+	movdqu	-55(%eax), %xmm1
+	movdqu	-55(%edx), %xmm2
+	mov	$-55, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(39bytes):
+	mov	$-39, %ebx
+	movdqu	-39(%eax), %xmm1
+	movdqu	-39(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(23bytes):
+	mov	$-23, %ebx
+	movdqu	-23(%eax), %xmm1
+	movdqu	-23(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+# endif
+	.p2align 4
+L(56bytes):
+	movdqu	-56(%eax), %xmm1
+	movdqu	-56(%edx), %xmm2
+	mov	$-56, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(40bytes):
+	mov	$-40, %ebx
+	movdqu	-40(%eax), %xmm1
+	movdqu	-40(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(24bytes):
+	mov	$-24, %ebx
+	movdqu	-24(%eax), %xmm1
+	movdqu	-24(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-8(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(57bytes):
+	movdqu	-57(%eax), %xmm1
+	movdqu	-57(%edx), %xmm2
+	mov	$-57, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(41bytes):
+	mov	$-41, %ebx
+	movdqu	-41(%eax), %xmm1
+	movdqu	-41(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(25bytes):
+	mov	$-25, %ebx
+	movdqu	-25(%eax), %xmm1
+	movdqu	-25(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(58bytes):
+	movdqu	-58(%eax), %xmm1
+	movdqu	-58(%edx), %xmm2
+	mov	$-58, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(42bytes):
+	mov	$-42, %ebx
+	movdqu	-42(%eax), %xmm1
+	movdqu	-42(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(26bytes):
+	mov	$-26, %ebx
+	movdqu	-26(%eax), %xmm1
+	movdqu	-26(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(59bytes):
+	movdqu	-59(%eax), %xmm1
+	movdqu	-59(%edx), %xmm2
+	mov	$-59, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(43bytes):
+	mov	$-43, %ebx
+	movdqu	-43(%eax), %xmm1
+	movdqu	-43(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(27bytes):
+	mov	$-27, %ebx
+	movdqu	-27(%eax), %xmm1
+	movdqu	-27(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+# endif
+	.p2align 4
+L(60bytes):
+	movdqu	-60(%eax), %xmm1
+	movdqu	-60(%edx), %xmm2
+	mov	$-60, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(44bytes):
+	mov	$-44, %ebx
+	movdqu	-44(%eax), %xmm1
+	movdqu	-44(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(28bytes):
+	mov	$-28, %ebx
+	movdqu	-28(%eax), %xmm1
+	movdqu	-28(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-12(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-8(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(61bytes):
+	movdqu	-61(%eax), %xmm1
+	movdqu	-61(%edx), %xmm2
+	mov	$-61, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(45bytes):
+	mov	$-45, %ebx
+	movdqu	-45(%eax), %xmm1
+	movdqu	-45(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(29bytes):
+	mov	$-29, %ebx
+	movdqu	-29(%eax), %xmm1
+	movdqu	-29(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(62bytes):
+	movdqu	-62(%eax), %xmm1
+	movdqu	-62(%edx), %xmm2
+	mov	$-62, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(46bytes):
+	mov	$-46, %ebx
+	movdqu	-46(%eax), %xmm1
+	movdqu	-46(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(30bytes):
+	mov	$-30, %ebx
+	movdqu	-30(%eax), %xmm1
+	movdqu	-30(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(63bytes):
+	movdqu	-63(%eax), %xmm1
+	movdqu	-63(%edx), %xmm2
+	mov	$-63, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(47bytes):
+	mov	$-47, %ebx
+	movdqu	-47(%eax), %xmm1
+	movdqu	-47(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(31bytes):
+	mov	$-31, %ebx
+	movdqu	-31(%eax), %xmm1
+	movdqu	-31(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+# endif
+
+	.p2align 4
+L(64bytes):
+	movdqu	-64(%eax), %xmm1
+	movdqu	-64(%edx), %xmm2
+	mov	$-64, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(48bytes):
+	movdqu	-48(%eax), %xmm1
+	movdqu	-48(%edx), %xmm2
+	mov	$-48, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(32bytes):
+	movdqu	-32(%eax), %xmm1
+	movdqu	-32(%edx), %xmm2
+	mov	$-32, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-16(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-16(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-12(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-8(%edx), %ecx
+# endif
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less16bytes):
+	add	%ebx, %eax
+	add	%ebx, %edx
+
+	mov	(%eax), %ecx
+	mov	(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	4(%eax), %ecx
+	mov	4(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	8(%eax), %ecx
+	mov	8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	12(%eax), %ecx
+	mov	12(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+# else
+	.p2align 4
+L(less16bytes):
+	add	%ebx, %eax
+	add	%ebx, %edx
+
+	mov	(%eax), %ecx
+	cmp	(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	4(%eax), %ecx
+	cmp	4(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	8(%eax), %ecx
+	cmp	8(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	12(%eax), %ecx
+	cmp	12(%edx), %ecx
+
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+# endif
+
+	.p2align 4
+L(find_diff):
+# ifndef USE_AS_WMEMCMP
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	shr	$16,%ecx
+	shr	$16,%ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+L(end):
+	POP	(%ebx)
+	mov	$1, %eax
+	ja	L(bigger)
+	neg	%eax
+L(bigger):
+	ret
+# else
+	POP	(%ebx)
+	mov	$1, %eax
+	jg	L(bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(bigger):
+	ret
+# endif
+END (MEMCMP)
+
+	.section .rodata.sse4.2,"a",@progbits	/* Read-only, allocatable data section holding the jump table.  */
+	.p2align 2	/* 4-byte alignment for the .int entries below.  */
+	.type	L(table_64bytes), @object	/* Mark the table symbol as a data object.  */
+# ifndef USE_AS_WMEMCMP	/* memcmp build: every byte count 0..64 has its own target.  */
+L(table_64bytes):	/* Entry N refers to L(Nbytes); presumably indexed by remaining byte count -- confirm at the dispatch site.  */
+	.int	JMPTBL (L(0bytes), L(table_64bytes))	/* NOTE(review): JMPTBL is defined outside this view; it takes (target, table base).  */
+	.int	JMPTBL (L(1bytes), L(table_64bytes))
+	.int	JMPTBL (L(2bytes), L(table_64bytes))
+	.int	JMPTBL (L(3bytes), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(5bytes), L(table_64bytes))
+	.int	JMPTBL (L(6bytes), L(table_64bytes))
+	.int	JMPTBL (L(7bytes), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(9bytes), L(table_64bytes))
+	.int	JMPTBL (L(10bytes), L(table_64bytes))
+	.int	JMPTBL (L(11bytes), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(13bytes), L(table_64bytes))
+	.int	JMPTBL (L(14bytes), L(table_64bytes))
+	.int	JMPTBL (L(15bytes), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(17bytes), L(table_64bytes))
+	.int	JMPTBL (L(18bytes), L(table_64bytes))
+	.int	JMPTBL (L(19bytes), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(21bytes), L(table_64bytes))
+	.int	JMPTBL (L(22bytes), L(table_64bytes))
+	.int	JMPTBL (L(23bytes), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(25bytes), L(table_64bytes))
+	.int	JMPTBL (L(26bytes), L(table_64bytes))
+	.int	JMPTBL (L(27bytes), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(29bytes), L(table_64bytes))
+	.int	JMPTBL (L(30bytes), L(table_64bytes))
+	.int	JMPTBL (L(31bytes), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(33bytes), L(table_64bytes))
+	.int	JMPTBL (L(34bytes), L(table_64bytes))
+	.int	JMPTBL (L(35bytes), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(37bytes), L(table_64bytes))
+	.int	JMPTBL (L(38bytes), L(table_64bytes))
+	.int	JMPTBL (L(39bytes), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(41bytes), L(table_64bytes))
+	.int	JMPTBL (L(42bytes), L(table_64bytes))
+	.int	JMPTBL (L(43bytes), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(45bytes), L(table_64bytes))
+	.int	JMPTBL (L(46bytes), L(table_64bytes))
+	.int	JMPTBL (L(47bytes), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(49bytes), L(table_64bytes))
+	.int	JMPTBL (L(50bytes), L(table_64bytes))
+	.int	JMPTBL (L(51bytes), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(53bytes), L(table_64bytes))
+	.int	JMPTBL (L(54bytes), L(table_64bytes))
+	.int	JMPTBL (L(55bytes), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(57bytes), L(table_64bytes))
+	.int	JMPTBL (L(58bytes), L(table_64bytes))
+	.int	JMPTBL (L(59bytes), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(61bytes), L(table_64bytes))
+	.int	JMPTBL (L(62bytes), L(table_64bytes))
+	.int	JMPTBL (L(63bytes), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+# else	/* USE_AS_WMEMCMP: same 65-slot layout, but only multiples of 4 have real targets.  */
+L(table_64bytes):	/* Slots 0, 4, 8, ..., 64 refer to L(Nbytes); every other slot refers to L(unreal_case).  */
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))	/* Slot 1: presumably unreachable because wmemcmp lengths are whole 4-byte elements -- TODO confirm.  */
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+# endif	/* USE_AS_WMEMCMP */
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
new file mode 100644
index 0000000000..5ebf5a4d73
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
@@ -0,0 +1,2157 @@
+/* memcmp with SSSE3, wmemcmp with SSSE3
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP		__memcmp_ssse3
+# endif
+
+# define CFI_PUSH(REG)	/* Unwind bookkeeping that accompanies a 4-byte push of REG.  */	\
+	cfi_adjust_cfa_offset (4);	/* The stack grew by 4 bytes.  */	\
+	cfi_rel_offset (REG, 0)	/* REG's saved copy now sits at offset 0 from the stack pointer.  */
+
+# define CFI_POP(REG)	/* Unwind bookkeeping that accompanies a 4-byte pop of REG.  */	\
+	cfi_adjust_cfa_offset (-4);	/* The stack shrank by 4 bytes.  */	\
+	cfi_restore (REG)	/* REG holds its original value again.  */
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)	/* Push REG and emit the matching CFI.  */
+# define POP(REG)	popl REG; CFI_POP (REG)	/* Pop REG and emit the matching CFI.  */
+
+# define PARMS		4	/* Offset from %esp of the first argument at entry, before any PUSH.  */
+# define BLK1		PARMS	/* First argument; ENTRY loads it into %eax.  */
+# define BLK2		BLK1+4	/* Second argument; ENTRY loads it into %edx.  Unparenthesized: use only as a plain displacement.  */
+# define LEN		BLK2+4	/* Third argument (length); ENTRY loads it into %ecx.  */
+# define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret	/* Undo the %ebx, %esi, %edi pushes of L(48bytesormore) and return.  */
+# define RETURN		RETURN_END; cfi_restore_state; cfi_remember_state	/* Return, then reinstate the remembered CFI state for the code placed after the ret.  */
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
+
+	atom_text_section
+ENTRY (MEMCMP)
+	movl	LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %ecx
+	test	%ecx, %ecx
+	jz	L(zero)
+# endif
+
+	movl	BLK1(%esp), %eax
+	cmp	$48, %ecx
+	movl	BLK2(%esp), %edx
+	jae	L(48bytesormore)
+
+# ifndef USE_AS_WMEMCMP
+	cmp	$1, %ecx
+	jbe	L(less1bytes)
+# endif
+
+	PUSH	(%ebx)
+	add	%ecx, %edx
+	add	%ecx, %eax
+	jmp	L(less48bytes)
+
+	CFI_POP	(%ebx)
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less1bytes):
+	jb	L(zero)
+	movb	(%eax), %cl
+	cmp	(%edx), %cl
+	je	L(zero)
+	mov	$1, %eax
+	ja	L(1bytesend)
+	neg	%eax
+L(1bytesend):
+	ret
+# endif
+
+	.p2align 4
+L(zero):
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(48bytesormore):
+	PUSH	(%ebx)
+	PUSH	(%esi)
+	PUSH	(%edi)
+	cfi_remember_state
+	movdqu	(%eax), %xmm3
+	movdqu	(%edx), %xmm0
+	movl	%eax, %edi
+	movl	%edx, %esi
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	16(%edi), %edi
+
+	sub	$0xffff, %edx
+	lea	16(%esi), %esi
+	jnz	L(less16bytes)
+	mov	%edi, %edx
+	and	$0xf, %edx
+	xor	%edx, %edi
+	sub	%edx, %esi
+	add	%edx, %ecx
+	mov	%esi, %edx
+	and	$0xf, %edx
+	jz	L(shr_0)
+	xor	%edx, %esi
+
+# ifndef USE_AS_WMEMCMP
+	cmp	$8, %edx
+	jae	L(next_unaligned_table)
+	cmp	$0, %edx
+	je	L(shr_0)
+	cmp	$1, %edx
+	je	L(shr_1)
+	cmp	$2, %edx
+	je	L(shr_2)
+	cmp	$3, %edx
+	je	L(shr_3)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$5, %edx
+	je	L(shr_5)
+	cmp	$6, %edx
+	je	L(shr_6)
+	jmp	L(shr_7)
+
+	.p2align 2
+L(next_unaligned_table):
+	cmp	$8, %edx
+	je	L(shr_8)
+	cmp	$9, %edx
+	je	L(shr_9)
+	cmp	$10, %edx
+	je	L(shr_10)
+	cmp	$11, %edx
+	je	L(shr_11)
+	cmp	$12, %edx
+	je	L(shr_12)
+	cmp	$13, %edx
+	je	L(shr_13)
+	cmp	$14, %edx
+	je	L(shr_14)
+	jmp	L(shr_15)
+# else
+	cmp	$0, %edx
+	je	L(shr_0)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$8, %edx
+	je	L(shr_8)
+	jmp	L(shr_12)
+# endif
+
+	.p2align 4
+L(shr_0):
+	cmp	$80, %ecx
+	jae	L(shr_0_gobble)
+	lea	-48(%ecx), %ecx
+	xor	%eax, %eax
+	movaps	(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+	movaps	16(%esi), %xmm2
+	pcmpeqb	16(%edi), %xmm2
+	pand	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
+	add	$32, %edi
+	add	$32, %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_0_gobble):
+	lea	-48(%ecx), %ecx
+	movdqa	(%esi), %xmm0
+	xor	%eax, %eax
+	pcmpeqb	(%edi), %xmm0
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm2
+	pcmpeqb	16(%edi), %xmm2
+L(shr_0_gobble_loop):
+	pand	%xmm0, %xmm2
+	sub	$32, %ecx
+	pmovmskb %xmm2, %edx
+	movdqa	%xmm0, %xmm1
+	movdqa	32(%esi), %xmm0
+	movdqa	48(%esi), %xmm2
+	sbb	$0xffff, %edx
+	pcmpeqb	32(%edi), %xmm0
+	pcmpeqb	48(%edi), %xmm2
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	jz	L(shr_0_gobble_loop)
+
+	pand	%xmm0, %xmm2
+	cmp	$0, %ecx
+	jge	L(shr_0_gobble_loop_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_0_gobble_loop_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm2, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_1):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_1_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$1,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$1,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	1(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_1_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$1,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$1,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_1_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$1,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$1,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_1_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_1_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_1_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	1(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_2):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_2_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$2,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$2,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	2(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_2_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$2,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$2,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_2_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$2,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$2,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_2_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_2_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_2_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	2(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_3):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_3_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$3,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$3,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	3(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_3_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$3,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$3,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_3_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$3,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$3,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_3_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_3_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_3_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	3(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+# endif
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_4):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_4_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$4,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$4,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	4(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_4_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$4,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$4,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_4_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$4,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$4,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_4_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_4_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_4_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	4(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_5):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_5_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$5,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$5,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	5(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_5_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$5,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$5,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_5_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$5,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$5,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_5_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_5_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_5_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	5(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_6):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_6_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$6,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$6,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	6(%ecx, %esi,1), %edx
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_6_gobble):	/* Looping compare, 32 bytes per pass; %esi side is read 6 bytes past its aligned loads.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$6,(%esi), %xmm0	/* xmm0 = bytes 6..21 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm0	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$6,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_6_gobble_loop):
+	pand	%xmm0, %xmm3	/* a lane stays set only if it matched in both halves */
+	sub	$32, %ecx	/* CF = borrow, consumed by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* L(exit) reads the first-half result from xmm1 */
+
+	movdqa	64(%esi), %xmm3	/* start comparing the next 32 bytes ahead of the branch */
+	palignr	$6,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF: zero only if all lanes matched and no borrow */
+	movdqa	48(%esi), %xmm0
+	palignr	$6,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_6_gobble_loop)	/* ZF still comes from the sbb; nothing in between writes flags */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_6_gobble_next)
+	inc	%edx	/* ecx negative means the last sub borrowed: give back the 1 the sbb took */
+	add	$32, %ecx	/* and undo that last sub $32 */
+L(shr_6_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* a lane differed in the 32 bytes checked by the last sbb */
+
+	pmovmskb %xmm3, %edx	/* result for the 32 bytes that were loaded ahead inside the loop */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	6(%ecx, %esi,1), %edx	/* edx = esi + ecx + 6 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_7):	/* Compare 32 bytes; the %esi side is read 7 bytes past its aligned loads.  */
+	cmp	$80, %ecx	/* sets the flags for the jae below (lea/mov do not change them) */
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax	/* L(exit) adds this saved value to %esi */
+	jae	L(shr_7_gobble)	/* ecx was >= 80 (unsigned): take the looping variant */
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2	/* copy of 16(%esi), reused by the second palignr */
+	palignr	$7,(%esi), %xmm1	/* xmm1 = bytes 7..22 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm1	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$7,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3	/* a lane stays set only if it matched in both halves */
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	7(%ecx, %esi,1), %edx	/* edx = esi + ecx + 7 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_7_gobble):	/* Looping compare, 32 bytes per pass; %esi side is read 7 bytes past its aligned loads.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$7,(%esi), %xmm0	/* xmm0 = bytes 7..22 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm0	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$7,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_7_gobble_loop):
+	pand	%xmm0, %xmm3	/* a lane stays set only if it matched in both halves */
+	sub	$32, %ecx	/* CF = borrow, consumed by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* L(exit) reads the first-half result from xmm1 */
+
+	movdqa	64(%esi), %xmm3	/* start comparing the next 32 bytes ahead of the branch */
+	palignr	$7,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF: zero only if all lanes matched and no borrow */
+	movdqa	48(%esi), %xmm0
+	palignr	$7,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_7_gobble_loop)	/* ZF still comes from the sbb; nothing in between writes flags */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_7_gobble_next)
+	inc	%edx	/* ecx negative means the last sub borrowed: give back the 1 the sbb took */
+	add	$32, %ecx	/* and undo that last sub $32 */
+L(shr_7_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* a lane differed in the 32 bytes checked by the last sbb */
+
+	pmovmskb %xmm3, %edx	/* result for the 32 bytes that were loaded ahead inside the loop */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	7(%ecx, %esi,1), %edx	/* edx = esi + ecx + 7 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+# endif
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_8):	/* Compare 32 bytes; the %esi side is read 8 bytes past its aligned loads.  */
+	cmp	$80, %ecx	/* sets the flags for the jae below (lea/mov do not change them) */
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax	/* L(exit) adds this saved value to %esi */
+	jae	L(shr_8_gobble)	/* ecx was >= 80 (unsigned): take the looping variant */
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2	/* copy of 16(%esi), reused by the second palignr */
+	palignr	$8,(%esi), %xmm1	/* xmm1 = bytes 8..23 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm1	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$8,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3	/* a lane stays set only if it matched in both halves */
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	8(%ecx, %esi,1), %edx	/* edx = esi + ecx + 8 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_8_gobble):	/* Looping compare, 32 bytes per pass; %esi side is read 8 bytes past its aligned loads.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$8,(%esi), %xmm0	/* xmm0 = bytes 8..23 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm0	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$8,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_8_gobble_loop):
+	pand	%xmm0, %xmm3	/* a lane stays set only if it matched in both halves */
+	sub	$32, %ecx	/* CF = borrow, consumed by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* L(exit) reads the first-half result from xmm1 */
+
+	movdqa	64(%esi), %xmm3	/* start comparing the next 32 bytes ahead of the branch */
+	palignr	$8,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF: zero only if all lanes matched and no borrow */
+	movdqa	48(%esi), %xmm0
+	palignr	$8,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_8_gobble_loop)	/* ZF still comes from the sbb; nothing in between writes flags */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_8_gobble_next)
+	inc	%edx	/* ecx negative means the last sub borrowed: give back the 1 the sbb took */
+	add	$32, %ecx	/* and undo that last sub $32 */
+L(shr_8_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* a lane differed in the 32 bytes checked by the last sbb */
+
+	pmovmskb %xmm3, %edx	/* result for the 32 bytes that were loaded ahead inside the loop */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	8(%ecx, %esi,1), %edx	/* edx = esi + ecx + 8 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+
+# ifndef USE_AS_WMEMCMP
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_9):	/* Compare 32 bytes; the %esi side is read 9 bytes past its aligned loads.  */
+	cmp	$80, %ecx	/* sets the flags for the jae below (lea/mov do not change them) */
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax	/* L(exit) adds this saved value to %esi */
+	jae	L(shr_9_gobble)	/* ecx was >= 80 (unsigned): take the looping variant */
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2	/* copy of 16(%esi), reused by the second palignr */
+	palignr	$9,(%esi), %xmm1	/* xmm1 = bytes 9..24 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm1	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$9,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3	/* a lane stays set only if it matched in both halves */
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	9(%ecx, %esi,1), %edx	/* edx = esi + ecx + 9 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_9_gobble):	/* Looping compare, 32 bytes per pass; %esi side is read 9 bytes past its aligned loads.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$9,(%esi), %xmm0	/* xmm0 = bytes 9..24 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm0	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$9,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_9_gobble_loop):
+	pand	%xmm0, %xmm3	/* a lane stays set only if it matched in both halves */
+	sub	$32, %ecx	/* CF = borrow, consumed by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* L(exit) reads the first-half result from xmm1 */
+
+	movdqa	64(%esi), %xmm3	/* start comparing the next 32 bytes ahead of the branch */
+	palignr	$9,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF: zero only if all lanes matched and no borrow */
+	movdqa	48(%esi), %xmm0
+	palignr	$9,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_9_gobble_loop)	/* ZF still comes from the sbb; nothing in between writes flags */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_9_gobble_next)
+	inc	%edx	/* ecx negative means the last sub borrowed: give back the 1 the sbb took */
+	add	$32, %ecx	/* and undo that last sub $32 */
+L(shr_9_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* a lane differed in the 32 bytes checked by the last sbb */
+
+	pmovmskb %xmm3, %edx	/* result for the 32 bytes that were loaded ahead inside the loop */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	9(%ecx, %esi,1), %edx	/* edx = esi + ecx + 9 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_10):	/* Compare 32 bytes; the %esi side is read 10 bytes past its aligned loads.  */
+	cmp	$80, %ecx	/* sets the flags for the jae below (lea/mov do not change them) */
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax	/* L(exit) adds this saved value to %esi */
+	jae	L(shr_10_gobble)	/* ecx was >= 80 (unsigned): take the looping variant */
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2	/* copy of 16(%esi), reused by the second palignr */
+	palignr	$10, (%esi), %xmm1	/* xmm1 = bytes 10..25 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm1	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$10,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3	/* a lane stays set only if it matched in both halves */
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	10(%ecx, %esi,1), %edx	/* edx = esi + ecx + 10 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_10_gobble):	/* Looping compare, 32 bytes per pass; %esi side is read 10 bytes past its aligned loads.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$10, (%esi), %xmm0	/* xmm0 = bytes 10..25 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm0	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$10, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_10_gobble_loop):
+	pand	%xmm0, %xmm3	/* a lane stays set only if it matched in both halves */
+	sub	$32, %ecx	/* CF = borrow, consumed by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* L(exit) reads the first-half result from xmm1 */
+
+	movdqa	64(%esi), %xmm3	/* start comparing the next 32 bytes ahead of the branch */
+	palignr	$10,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF: zero only if all lanes matched and no borrow */
+	movdqa	48(%esi), %xmm0
+	palignr	$10,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_10_gobble_loop)	/* ZF still comes from the sbb; nothing in between writes flags */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_10_gobble_next)
+	inc	%edx	/* ecx negative means the last sub borrowed: give back the 1 the sbb took */
+	add	$32, %ecx	/* and undo that last sub $32 */
+L(shr_10_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* a lane differed in the 32 bytes checked by the last sbb */
+
+	pmovmskb %xmm3, %edx	/* result for the 32 bytes that were loaded ahead inside the loop */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	10(%ecx, %esi,1), %edx	/* edx = esi + ecx + 10 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_11):	/* Compare 32 bytes; the %esi side is read 11 bytes past its aligned loads.  */
+	cmp	$80, %ecx	/* sets the flags for the jae below (lea/mov do not change them) */
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax	/* L(exit) adds this saved value to %esi */
+	jae	L(shr_11_gobble)	/* ecx was >= 80 (unsigned): take the looping variant */
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2	/* copy of 16(%esi), reused by the second palignr */
+	palignr	$11, (%esi), %xmm1	/* xmm1 = bytes 11..26 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm1	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$11, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3	/* a lane stays set only if it matched in both halves */
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	11(%ecx, %esi,1), %edx	/* edx = esi + ecx + 11 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_11_gobble):	/* Looping compare, 32 bytes per pass; %esi side is read 11 bytes past its aligned loads.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$11, (%esi), %xmm0	/* xmm0 = bytes 11..26 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm0	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$11, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_11_gobble_loop):
+	pand	%xmm0, %xmm3	/* a lane stays set only if it matched in both halves */
+	sub	$32, %ecx	/* CF = borrow, consumed by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* L(exit) reads the first-half result from xmm1 */
+
+	movdqa	64(%esi), %xmm3	/* start comparing the next 32 bytes ahead of the branch */
+	palignr	$11,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF: zero only if all lanes matched and no borrow */
+	movdqa	48(%esi), %xmm0
+	palignr	$11,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_11_gobble_loop)	/* ZF still comes from the sbb; nothing in between writes flags */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_11_gobble_next)
+	inc	%edx	/* ecx negative means the last sub borrowed: give back the 1 the sbb took */
+	add	$32, %ecx	/* and undo that last sub $32 */
+L(shr_11_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* a lane differed in the 32 bytes checked by the last sbb */
+
+	pmovmskb %xmm3, %edx	/* result for the 32 bytes that were loaded ahead inside the loop */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	11(%ecx, %esi,1), %edx	/* edx = esi + ecx + 11 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+# endif
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_12):	/* Compare 32 bytes; the %esi side is read 12 bytes past its aligned loads.  */
+	cmp	$80, %ecx	/* sets the flags for the jae below (lea/mov do not change them) */
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax	/* L(exit) adds this saved value to %esi */
+	jae	L(shr_12_gobble)	/* ecx was >= 80 (unsigned): take the looping variant */
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2	/* copy of 16(%esi), reused by the second palignr */
+	palignr	$12, (%esi), %xmm1	/* xmm1 = bytes 12..27 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm1	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$12, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3	/* a lane stays set only if it matched in both halves */
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	12(%ecx, %esi,1), %edx	/* edx = esi + ecx + 12 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_12_gobble):	/* Looping compare, 32 bytes per pass; %esi side is read 12 bytes past its aligned loads.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$12, (%esi), %xmm0	/* xmm0 = bytes 12..27 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm0	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$12, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_12_gobble_loop):
+	pand	%xmm0, %xmm3	/* a lane stays set only if it matched in both halves */
+	sub	$32, %ecx	/* CF = borrow, consumed by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* L(exit) reads the first-half result from xmm1 */
+
+	movdqa	64(%esi), %xmm3	/* start comparing the next 32 bytes ahead of the branch */
+	palignr	$12,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF: zero only if all lanes matched and no borrow */
+	movdqa	48(%esi), %xmm0
+	palignr	$12,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_12_gobble_loop)	/* ZF still comes from the sbb; nothing in between writes flags */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_12_gobble_next)
+	inc	%edx	/* ecx negative means the last sub borrowed: give back the 1 the sbb took */
+	add	$32, %ecx	/* and undo that last sub $32 */
+L(shr_12_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* a lane differed in the 32 bytes checked by the last sbb */
+
+	pmovmskb %xmm3, %edx	/* result for the 32 bytes that were loaded ahead inside the loop */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	12(%ecx, %esi,1), %edx	/* edx = esi + ecx + 12 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+
+# ifndef USE_AS_WMEMCMP
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_13):	/* Compare 32 bytes; the %esi side is read 13 bytes past its aligned loads.  */
+	cmp	$80, %ecx	/* sets the flags for the jae below (lea/mov do not change them) */
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax	/* L(exit) adds this saved value to %esi */
+	jae	L(shr_13_gobble)	/* ecx was >= 80 (unsigned): take the looping variant */
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2	/* copy of 16(%esi), reused by the second palignr */
+	palignr	$13, (%esi), %xmm1	/* xmm1 = bytes 13..28 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm1	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$13, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3	/* a lane stays set only if it matched in both halves */
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	13(%ecx, %esi,1), %edx	/* edx = esi + ecx + 13 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_13_gobble):	/* Looping compare, 32 bytes per pass; %esi side is read 13 bytes past its aligned loads.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$13, (%esi), %xmm0	/* xmm0 = bytes 13..28 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm0	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$13, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_13_gobble_loop):
+	pand	%xmm0, %xmm3	/* a lane stays set only if it matched in both halves */
+	sub	$32, %ecx	/* CF = borrow, consumed by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* L(exit) reads the first-half result from xmm1 */
+
+	movdqa	64(%esi), %xmm3	/* start comparing the next 32 bytes ahead of the branch */
+	palignr	$13,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF: zero only if all lanes matched and no borrow */
+	movdqa	48(%esi), %xmm0
+	palignr	$13,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_13_gobble_loop)	/* ZF still comes from the sbb; nothing in between writes flags */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_13_gobble_next)
+	inc	%edx	/* ecx negative means the last sub borrowed: give back the 1 the sbb took */
+	add	$32, %ecx	/* and undo that last sub $32 */
+L(shr_13_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* a lane differed in the 32 bytes checked by the last sbb */
+
+	pmovmskb %xmm3, %edx	/* result for the 32 bytes that were loaded ahead inside the loop */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	13(%ecx, %esi,1), %edx	/* edx = esi + ecx + 13 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_14):	/* Compare 32 bytes; the %esi side is read 14 bytes past its aligned loads.  */
+	cmp	$80, %ecx	/* sets the flags for the jae below (lea/mov do not change them) */
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax	/* L(exit) adds this saved value to %esi */
+	jae	L(shr_14_gobble)	/* ecx was >= 80 (unsigned): take the looping variant */
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2	/* copy of 16(%esi), reused by the second palignr */
+	palignr	$14, (%esi), %xmm1	/* xmm1 = bytes 14..29 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm1	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$14, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3	/* a lane stays set only if it matched in both halves */
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	14(%ecx, %esi,1), %edx	/* edx = esi + ecx + 14 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_14_gobble):	/* Looping compare, 32 bytes per pass; %esi side is read 14 bytes past its aligned loads.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$14, (%esi), %xmm0	/* xmm0 = bytes 14..29 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm0	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$14, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_14_gobble_loop):
+	pand	%xmm0, %xmm3	/* a lane stays set only if it matched in both halves */
+	sub	$32, %ecx	/* CF = borrow, consumed by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* L(exit) reads the first-half result from xmm1 */
+
+	movdqa	64(%esi), %xmm3	/* start comparing the next 32 bytes ahead of the branch */
+	palignr	$14,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF: zero only if all lanes matched and no borrow */
+	movdqa	48(%esi), %xmm0
+	palignr	$14,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_14_gobble_loop)	/* ZF still comes from the sbb; nothing in between writes flags */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_14_gobble_next)
+	inc	%edx	/* ecx negative means the last sub borrowed: give back the 1 the sbb took */
+	add	$32, %ecx	/* and undo that last sub $32 */
+L(shr_14_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* a lane differed in the 32 bytes checked by the last sbb */
+
+	pmovmskb %xmm3, %edx	/* result for the 32 bytes that were loaded ahead inside the loop */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	14(%ecx, %esi,1), %edx	/* edx = esi + ecx + 14 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_15):	/* Compare 32 bytes; the %esi side is read 15 bytes past its aligned loads.  */
+	cmp	$80, %ecx	/* sets the flags for the jae below (lea/mov do not change them) */
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax	/* L(exit) adds this saved value to %esi */
+	jae	L(shr_15_gobble)	/* ecx was >= 80 (unsigned): take the looping variant */
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2	/* copy of 16(%esi), reused by the second palignr */
+	palignr	$15, (%esi), %xmm1	/* xmm1 = bytes 15..30 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm1	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$15, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3	/* a lane stays set only if it matched in both halves */
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	15(%ecx, %esi,1), %edx	/* edx = esi + ecx + 15 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(shr_15_gobble):	/* Looping compare, 32 bytes per pass; %esi side is read 15 bytes past its aligned loads.  */
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$15, (%esi), %xmm0	/* xmm0 = bytes 15..30 of the 32 bytes at (%esi) */
+	pcmpeqb	(%edi), %xmm0	/* 0xff in every byte lane equal to (%edi) */
+
+	movdqa	32(%esi), %xmm3
+	palignr	$15, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_15_gobble_loop):
+	pand	%xmm0, %xmm3	/* a lane stays set only if it matched in both halves */
+	sub	$32, %ecx	/* CF = borrow, consumed by the sbb below */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* L(exit) reads the first-half result from xmm1 */
+
+	movdqa	64(%esi), %xmm3	/* start comparing the next 32 bytes ahead of the branch */
+	palignr	$15,48(%esi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF: zero only if all lanes matched and no borrow */
+	movdqa	48(%esi), %xmm0
+	palignr	$15,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_15_gobble_loop)	/* ZF still comes from the sbb; nothing in between writes flags */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %ecx
+	jge	L(shr_15_gobble_next)
+	inc	%edx	/* ecx negative means the last sub borrowed: give back the 1 the sbb took */
+	add	$32, %ecx	/* and undo that last sub $32 */
+L(shr_15_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)	/* a lane differed in the 32 bytes checked by the last sbb */
+
+	pmovmskb %xmm3, %edx	/* result for the 32 bytes that were loaded ahead inside the loop */
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx	/* zero only if all 16 mask bits were set */
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax	/* eax = edi + ecx */
+	lea	15(%ecx, %esi,1), %edx	/* edx = esi + ecx + 15 */
+	POP	(%edi)
+	POP	(%esi)
+	jmp	L(less48bytes)	/* remaining ecx bytes are compared by the scalar tail code */
+# endif
+
+	cfi_restore_state
+	cfi_remember_state
+	.p2align 4
+L(exit):	/* Entered after a 32-byte compare reported a difference; locate the byte and return.  */
+	pmovmskb %xmm1, %ebx	/* the L(shr_N) callers in view leave the first-16-byte result in xmm1 */
+	sub	$0xffff, %ebx
+	jz	L(first16bytes)	/* first 16 bytes all equal: keep the value already in %edx */
+	lea	-16(%esi), %esi	/* difference is in the first 16: step both pointers back by 16 */
+	lea	-16(%edi), %edi
+	mov	%ebx, %edx	/* and decode the first-half value instead */
+
+L(first16bytes):
+	add	%eax, %esi	/* on the L(shr_N) paths in view, %eax is the %edx value saved at their entry */
+L(less16bytes):
+
+# ifndef USE_AS_WMEMCMP
+	test	%dl, %dl	/* on the paths in view edx = mask - 0xffff, so its lowest set bit is the first lane that differed */
+	jz	L(next_24_bytes)	/* nothing flagged in lanes 0..7 */
+
+	test	$0x01, %dl
+	jnz	L(Byte16)
+
+	test	$0x02, %dl
+	jnz	L(Byte17)
+
+	test	$0x04, %dl
+	jnz	L(Byte18)
+
+	test	$0x08, %dl
+	jnz	L(Byte19)
+
+	test	$0x10, %dl
+	jnz	L(Byte20)
+
+	test	$0x20, %dl
+	jnz	L(Byte21)
+
+	test	$0x40, %dl
+	jnz	L(Byte22)
+L(Byte23):	/* only bit 7 is left */
+	movzbl	-9(%edi), %eax
+	movzbl	-9(%esi), %edx
+	sub	%edx, %eax	/* result = (%edi-side byte) - (%esi-side byte), both zero-extended */
+	RETURN
+
+	.p2align 4
+L(Byte16):	/* L(ByteNN): return the difference of the bytes at offset NN-32 from %edi and %esi */
+	movzbl	-16(%edi), %eax
+	movzbl	-16(%esi), %edx
+	sub	%edx, %eax	/* (%edi-side byte) - (%esi-side byte), both zero-extended */
+	RETURN
+
+	.p2align 4
+L(Byte17):
+	movzbl	-15(%edi), %eax
+	movzbl	-15(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte18):
+	movzbl	-14(%edi), %eax
+	movzbl	-14(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte19):
+	movzbl	-13(%edi), %eax
+	movzbl	-13(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte20):
+	movzbl	-12(%edi), %eax
+	movzbl	-12(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte21):
+	movzbl	-11(%edi), %eax
+	movzbl	-11(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(Byte22):
+	movzbl	-10(%edi), %eax
+	movzbl	-10(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	.p2align 4
+L(next_24_bytes):	/* low byte of %edx was zero: decode bits 8..15 (in %dh) instead */
+	lea	8(%edi), %edi	/* advance both pointers by 8 so the L(Byte16..22) offsets can be reused */
+	lea	8(%esi), %esi
+	test	$0x01, %dh
+	jnz	L(Byte16)
+
+	test	$0x02, %dh
+	jnz	L(Byte17)
+
+	test	$0x04, %dh
+	jnz	L(Byte18)
+
+	test	$0x08, %dh
+	jnz	L(Byte19)
+
+	test	$0x10, %dh
+	jnz	L(Byte20)
+
+	test	$0x20, %dh
+	jnz	L(Byte21)
+
+	test	$0x40, %dh
+	jnz	L(Byte22)
+
+	.p2align 4
+L(Byte31):	/* fall-through case: none of bits 0..6 of %dh was set */
+	movzbl	-9(%edi), %eax
+	movzbl	-9(%esi), %edx
+	sub	%edx, %eax	/* (%edi-side byte) - (%esi-side byte), both zero-extended */
+	RETURN_END
+# else
+
+/* special for wmemcmp: decode %edx four lanes (one 32-bit element) at a time */
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words)	/* nothing flagged in lanes 0..7 */
+	and	$15, %dl
+	jz	L(second_double_word)	/* nothing flagged in lanes 0..3 */
+	mov	-16(%edi), %eax
+	cmp	-16(%esi), %eax
+	jne	L(nequal)
+	RETURN	/* NOTE(review): if taken, %eax is the loaded dword, not 0; looks unreachable when %edx flagged these lanes -- confirm */
+
+	.p2align 4
+L(second_double_word):
+	mov	-12(%edi), %eax
+	cmp	-12(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words):
+	and	$15, %dh
+	jz	L(fourth_double_word)	/* nothing flagged in lanes 8..11 */
+	mov	-8(%edi), %eax
+	cmp	-8(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word):
+	mov	-4(%edi), %eax
+	cmp	-4(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(nequal):	/* flags still hold the signed dword compare made by the jumping code */
+	mov	$1, %eax	/* mov leaves the flags untouched */
+	jg	L(nequal_bigger)	/* %edi-side element greater (signed): return 1 */
+	neg	%eax	/* otherwise return -1 */
+	RETURN
+
+	.p2align 4
+L(nequal_bigger):
+	RETURN_END
+# endif
+
+	CFI_PUSH (%ebx)
+
+	.p2align 4
+L(more8bytes):	/* ecx >= 8: jump to the tail routine for the exact remaining length */
+	cmp	$16, %ecx
+	jae	L(more16bytes)
+	cmp	$8, %ecx
+	je	L(8bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$9, %ecx
+	je	L(9bytes)
+	cmp	$10, %ecx
+	je	L(10bytes)
+	cmp	$11, %ecx
+	je	L(11bytes)
+	cmp	$12, %ecx
+	je	L(12bytes)
+	cmp	$13, %ecx
+	je	L(13bytes)
+	cmp	$14, %ecx
+	je	L(14bytes)
+	jmp	L(15bytes)	/* 15 is the only value left in 8..15 */
+# else
+	jmp	L(12bytes)	/* presumably ecx is a multiple of 4 in this build, leaving only 12 -- confirm */
+# endif
+
+	.p2align 4
+L(more16bytes):	/* ecx >= 16: jump to the tail routine for the exact remaining length */
+	cmp	$24, %ecx
+	jae	L(more24bytes)
+	cmp	$16, %ecx
+	je	L(16bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$17, %ecx
+	je	L(17bytes)
+	cmp	$18, %ecx
+	je	L(18bytes)
+	cmp	$19, %ecx
+	je	L(19bytes)
+	cmp	$20, %ecx
+	je	L(20bytes)
+	cmp	$21, %ecx
+	je	L(21bytes)
+	cmp	$22, %ecx
+	je	L(22bytes)
+	jmp	L(23bytes)	/* 23 is the only value left in 16..23 */
+# else
+	jmp	L(20bytes)	/* presumably ecx is a multiple of 4 in this build, leaving only 20 -- confirm */
+# endif
+
+	.p2align 4
+L(more24bytes):	/* ecx >= 24: jump to the tail routine for the exact remaining length */
+	cmp	$32, %ecx
+	jae	L(more32bytes)
+	cmp	$24, %ecx
+	je	L(24bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$25, %ecx
+	je	L(25bytes)
+	cmp	$26, %ecx
+	je	L(26bytes)
+	cmp	$27, %ecx
+	je	L(27bytes)
+	cmp	$28, %ecx
+	je	L(28bytes)
+	cmp	$29, %ecx
+	je	L(29bytes)
+	cmp	$30, %ecx
+	je	L(30bytes)
+	jmp	L(31bytes)	/* 31 is the only value left in 24..31 */
+# else
+	jmp	L(28bytes)	/* presumably ecx is a multiple of 4 in this build, leaving only 28 -- confirm */
+# endif
+
+	.p2align 4
+L(more32bytes):	/* ecx >= 32: jump to the tail routine for the exact remaining length */
+	cmp	$40, %ecx
+	jae	L(more40bytes)
+	cmp	$32, %ecx
+	je	L(32bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$33, %ecx
+	je	L(33bytes)
+	cmp	$34, %ecx
+	je	L(34bytes)
+	cmp	$35, %ecx
+	je	L(35bytes)
+	cmp	$36, %ecx
+	je	L(36bytes)
+	cmp	$37, %ecx
+	je	L(37bytes)
+	cmp	$38, %ecx
+	je	L(38bytes)
+	jmp	L(39bytes)	/* 39 is the only value left in 32..39 */
+# else
+	jmp	L(36bytes)	/* presumably ecx is a multiple of 4 in this build, leaving only 36 -- confirm */
+# endif
+
+	.p2align 4
+L(less48bytes):	/* Dispatch on ecx to an L(Nbytes) tail; those read at negative offsets from %eax and %edx.  */
+	cmp	$8, %ecx
+	jae	L(more8bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$2, %ecx
+	je	L(2bytes)
+	cmp	$3, %ecx
+	je	L(3bytes)
+	cmp	$4, %ecx
+	je	L(4bytes)
+	cmp	$5, %ecx
+	je	L(5bytes)
+	cmp	$6, %ecx
+	je	L(6bytes)
+	jmp	L(7bytes)	/* NOTE(review): ecx of 0 or 1 also lands here; presumably safe because the extra bytes were already compared equal -- confirm for all callers */
+# else
+	jmp	L(4bytes)	/* every ecx below 8 is handled as 4 in this build */
+# endif
+
+	.p2align 4
+L(more40bytes):	/* reached from L(more32bytes) when ecx >= 40 */
+	cmp	$40, %ecx
+	je	L(40bytes)
+# ifndef USE_AS_WMEMCMP	/* in the wmemcmp build execution instead falls through into L(44bytes) */
+	cmp	$41, %ecx
+	je	L(41bytes)
+	cmp	$42, %ecx
+	je	L(42bytes)
+	cmp	$43, %ecx
+	je	L(43bytes)
+	cmp	$44, %ecx
+	je	L(44bytes)
+	cmp	$45, %ecx
+	je	L(45bytes)
+	cmp	$46, %ecx
+	je	L(46bytes)
+	jmp	L(47bytes)	/* every remaining value is handled as 47 */
+
+	.p2align 4
+L(44bytes):	/* Tail chain for lengths that are multiples of 4: each step compares the dword N bytes before %eax/%edx, then falls through.  */
+	mov	-44(%eax), %ecx
+	mov	-44(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)	/* L(find_diff) locates the differing byte inside %ecx/%ebx */
+L(40bytes):
+	mov	-40(%eax), %ecx
+	mov	-40(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(36bytes):
+	mov	-36(%eax), %ecx
+	mov	-36(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(32bytes):
+	mov	-32(%eax), %ecx
+	mov	-32(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(28bytes):
+	mov	-28(%eax), %ecx
+	mov	-28(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(24bytes):
+	mov	-24(%eax), %ecx
+	mov	-24(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(20bytes):
+	mov	-20(%eax), %ecx
+	mov	-20(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(16bytes):
+	mov	-16(%eax), %ecx
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax	/* mov rather than xor: the flags from cmp must survive to the jne */
+	jne	L(find_diff)
+	POP	(%ebx)
+	ret	/* everything compared equal: return 0 */
+	CFI_PUSH (%ebx)
+# else
+	.p2align 4
+L(44bytes):	/* wmemcmp tail chain: each step compares the dword N bytes before %eax/%edx, then falls through.  */
+	mov	-44(%eax), %ecx
+	cmp	-44(%edx), %ecx
+	jne	L(find_diff)	/* L(find_diff) turns the flags of this cmp into 1 or -1 */
+L(40bytes):
+	mov	-40(%eax), %ecx
+	cmp	-40(%edx), %ecx
+	jne	L(find_diff)
+L(36bytes):
+	mov	-36(%eax), %ecx
+	cmp	-36(%edx), %ecx
+	jne	L(find_diff)
+L(32bytes):
+	mov	-32(%eax), %ecx
+	cmp	-32(%edx), %ecx
+	jne	L(find_diff)
+L(28bytes):
+	mov	-28(%eax), %ecx
+	cmp	-28(%edx), %ecx
+	jne	L(find_diff)
+L(24bytes):
+	mov	-24(%eax), %ecx
+	cmp	-24(%edx), %ecx
+	jne	L(find_diff)
+L(20bytes):
+	mov	-20(%eax), %ecx
+	cmp	-20(%edx), %ecx
+	jne	L(find_diff)
+L(16bytes):
+	mov	-16(%eax), %ecx
+	cmp	-16(%edx), %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	cmp	-12(%edx), %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	cmp	-8(%edx), %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	xor	%eax, %eax	/* zero the result before the cmp, since xor would overwrite its flags */
+	cmp	-4(%edx), %ecx
+	jne	L(find_diff)
+	POP	(%ebx)
+	ret	/* everything compared equal: return 0 */
+	CFI_PUSH (%ebx)
+# endif
+
+# ifndef USE_AS_WMEMCMP
+
+	.p2align 4
+L(45bytes):	/* Tail chain for lengths of the form 4k+1: dword steps that fall through, then one final byte.  */
+	mov	-45(%eax), %ecx
+	mov	-45(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)	/* L(find_diff) locates the differing byte inside %ecx/%ebx */
+L(41bytes):
+	mov	-41(%eax), %ecx
+	mov	-41(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(37bytes):
+	mov	-37(%eax), %ecx
+	mov	-37(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(33bytes):
+	mov	-33(%eax), %ecx
+	mov	-33(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(29bytes):
+	mov	-29(%eax), %ecx
+	mov	-29(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(25bytes):
+	mov	-25(%eax), %ecx
+	mov	-25(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(21bytes):
+	mov	-21(%eax), %ecx
+	mov	-21(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(17bytes):
+	mov	-17(%eax), %ecx
+	mov	-17(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(13bytes):
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(9bytes):
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(5bytes):
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx	/* last byte */
+	cmp	-1(%edx), %cl
+	mov	$0, %eax	/* mov rather than xor: the flags from cmp must survive to the jne */
+	jne	L(end)	/* L(end) turns those flags into 1 or -1 */
+	POP	(%ebx)
+	ret	/* everything compared equal: return 0 */
+	CFI_PUSH (%ebx)
+
+	.p2align 4
+L(46bytes):	/* Tail chain for lengths of the form 4k+2: dword steps that fall through, then two final bytes.  */
+	mov	-46(%eax), %ecx
+	mov	-46(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)	/* L(find_diff) locates the differing byte inside %ecx/%ebx */
+L(42bytes):
+	mov	-42(%eax), %ecx
+	mov	-42(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(38bytes):
+	mov	-38(%eax), %ecx
+	mov	-38(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(34bytes):
+	mov	-34(%eax), %ecx
+	mov	-34(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(30bytes):
+	mov	-30(%eax), %ecx
+	mov	-30(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(26bytes):
+	mov	-26(%eax), %ecx
+	mov	-26(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(22bytes):
+	mov	-22(%eax), %ecx
+	mov	-22(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(18bytes):
+	mov	-18(%eax), %ecx
+	mov	-18(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(14bytes):
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(10bytes):
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(6bytes):
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(2bytes):
+	movzwl	-2(%eax), %ecx	/* last two bytes, compared one at a time in memory order */
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax	/* mov rather than xor: the flags from cmp must survive to the jne */
+	jne	L(end)	/* L(end) turns those flags into 1 or -1 */
+	POP	(%ebx)
+	ret	/* everything compared equal: return 0 */
+	CFI_PUSH (%ebx)
+
+	.p2align 4
+L(47bytes):	/* Tail chain for lengths of the form 4k+3: dword steps that fall through, then three final bytes.  */
+	movl	-47(%eax), %ecx
+	movl	-47(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)	/* L(find_diff) locates the differing byte inside %ecx/%ebx */
+L(43bytes):
+	movl	-43(%eax), %ecx
+	movl	-43(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(39bytes):
+	movl	-39(%eax), %ecx
+	movl	-39(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(35bytes):
+	movl	-35(%eax), %ecx
+	movl	-35(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(31bytes):
+	movl	-31(%eax), %ecx
+	movl	-31(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(27bytes):
+	movl	-27(%eax), %ecx
+	movl	-27(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(23bytes):
+	movl	-23(%eax), %ecx
+	movl	-23(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(19bytes):
+	movl	-19(%eax), %ecx
+	movl	-19(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%eax), %ecx	/* bytes -3 and -2 */
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx	/* low bytes are equal here, so the high byte decides */
+	jne	L(end)
+	movzbl	-1(%eax), %eax	/* last byte; the %eax pointer is not needed after this */
+	cmpb	-1(%edx), %al
+	mov	$0, %eax	/* mov rather than xor: the flags from cmpb must survive to the jne */
+	jne	L(end)	/* L(end) turns those flags into 1 or -1 */
+	POP	(%ebx)
+	ret	/* everything compared equal: return 0 */
+	CFI_PUSH (%ebx)
+
+	.p2align 4
+L(find_diff):	/* %ecx (from the %eax side) and %ebx (from the %edx side) differ: find the first differing byte in memory order */
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx	/* low bytes are equal here, so the second byte decides */
+	jne	L(end)
+	shr	$16,%ecx	/* move on to the upper two bytes */
+	shr	$16,%ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx	/* falls into L(end) with the flags of this compare */
+
+	.p2align 4
+L(end):	/* expects the flags of an unsigned byte compare, %eax side minus %edx side */
+	POP	(%ebx)
+	mov	$1, %eax	/* mov leaves the flags untouched */
+	ja	L(bigger)	/* %eax-side byte is larger (unsigned): return 1 */
+	neg	%eax	/* otherwise return -1 */
+L(bigger):
+	ret
+# else
+
+/* for wmemcmp */
+	.p2align 4
+L(find_diff):	/* wmemcmp: flags hold the dword compare made by the jumping code; turn them into 1 or -1 */
+	POP	(%ebx)
+	mov	$1, %eax	/* mov leaves the flags untouched */
+	jg	L(find_diff_bigger)	/* signed greater: return 1 */
+	neg	%eax	/* otherwise return -1 */
+	ret
+
+	.p2align 4
+L(find_diff_bigger):
+	ret
+
+# endif
+END (MEMCMP)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S
new file mode 100644
index 0000000000..1fc5994a17
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S
@@ -0,0 +1,62 @@
+/* Multiple versions of memcmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(memcmp)	/* IFUNC resolver: picks one memcmp implementation; presumably returns its address in %eax */
+	.type	memcmp, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memcmp_ia32)	/* default choice */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* presumably ZF means the feature is absent (macro is defined outside this file) */
+	LOAD_FUNC_GOT_EAX (__memcmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcmp_sse4_2)	/* reached only when neither jz above was taken */
+2:	ret
+END(memcmp)
+
+# undef ENTRY	/* from here on ENTRY ignores its argument and always opens __memcmp_ia32, so the "../memcmp.S" included at the end of this file is assembled under that name */
+# define ENTRY(name) \
+	.type __memcmp_ia32, @function; \
+	.p2align 4; \
+	.globl __memcmp_ia32; \
+	.hidden __memcmp_ia32; \
+	__memcmp_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* likewise END always closes __memcmp_ia32, whatever name it is given */
+# define END(name) \
+	cfi_endproc; .size __memcmp_ia32, .-__memcmp_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memcmp; __GI_memcmp = __memcmp_ia32
+# endif
+#endif
+
+#include "../memcmp.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..2fe2072cb1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
@@ -0,0 +1,681 @@
+/* memcpy optimized with SSE2 unaligned memory access instructions.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+#  define MEMCPY	__memcpy_sse2_unaligned
+#  define MEMCPY_CHK	__memcpy_chk_sse2_unaligned
+# endif
+/* Stack offsets of the arguments.  With USE_AS_BCOPY the source comes
+   first, otherwise the destination comes first; the length is last in
+   both layouts.  */
+# ifdef USE_AS_BCOPY
+#  define SRC		PARMS
+#  define DEST		SRC+4
+#  define LEN		DEST+4
+# else
+#  define DEST		PARMS
+#  define SRC		DEST+4
+#  define LEN		SRC+4
+# endif
+/* CFI bookkeeping for a 4-byte push that saved REG at the new top of
+   the stack.  */
+# define CFI_PUSH(REG)		\
+  cfi_adjust_cfa_offset (4);		\
+  cfi_rel_offset (REG, 0)
+/* CFI bookkeeping for the matching 4-byte pop that restored REG.  */
+# define CFI_POP(REG)		\
+  cfi_adjust_cfa_offset (-4);		\
+  cfi_restore (REG)
+/* Push/pop a register together with its CFI annotation.  */
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+/* PARMS is the offset of the first argument once ENTRANCE has pushed
+   EBX: 4 bytes of return address plus 4 bytes of saved EBX.  RETURN
+   re-applies CFI_PUSH after the ret so that code assembled after a
+   mid-function return is still described with EBX on the stack.  */
+# define PARMS		8		/* Preserve EBX.  */
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN	RETURN_END; CFI_PUSH (%ebx)
+
+	.section .text.sse2,"ax",@progbits
+# if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	movl	12(%esp), %eax		/* Third stack argument: the length.  */
+	cmpl	%eax, 16(%esp)		/* Fourth argument, presumably the destination size.  */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* Fourth < length (unsigned).  */
+END (MEMCPY_CHK)	/* No ret: execution falls through into MEMCPY.  */
+# endif
+/* Copy LEN bytes from SRC to DEST.  After the loads below ECX is the
+   length, EAX the source and EDX the destination.  EBX was saved by
+   ENTRANCE and is restored by RETURN, so it is free as scratch.  */
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+	cmp	%edx, %eax
+
+# ifdef USE_AS_MEMMOVE
+/* Pointers and lengths are unsigned, so every branch on them must use
+   an unsigned condition (ja/jbe).  The signed forms mis-order buffers
+   on opposite sides of 0x80000000 and treat lengths with the top bit
+   set as negative.  */
+	ja	L(check_forward)
+
+L(mm_len_0_or_more_backward):
+/* Here src <= dst.  Now do checks for lengths.  We do [0..16],
+   (16..32], (32..64] and (64..128] separately.  */
+	cmp	$16, %ecx
+	jbe	L(mm_len_0_16_bytes_backward)
+
+	cmpl	$32, %ecx
+	ja	L(mm_len_32_or_more_backward)
+
+/* Copy (16..32] bytes and return.  All loads are done before any
+   store, so overlapping buffers are handled.  */
+	movdqu	(%eax), %xmm0
+	movdqu	-16(%eax, %ecx), %xmm1
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_32_or_more_backward):
+	cmpl	$64, %ecx
+	ja	L(mm_len_64_or_more_backward)
+
+/* Copy (32..64] bytes and return: 32 bytes from each end, loads first.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	-16(%eax, %ecx), %xmm2
+	movdqu	-32(%eax, %ecx), %xmm3
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, -16(%edx, %ecx)
+	movdqu	%xmm3, -32(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_64_or_more_backward):
+	cmpl	$128, %ecx
+	ja	L(mm_len_128_or_more_backward)
+
+/* Copy (64..128] bytes and return: 64 bytes from each end, loads first.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+	jmp	L(return)
+/* More than 128 bytes on the backward side of the dispatch.  If
+   src + len <= dst the buffers do not overlap and the plain forward
+   copy is used.  The test compares addresses, so it must be unsigned
+   (jbe): a signed jle gives the wrong answer when the two addresses lie
+   on opposite sides of 0x80000000.  */
+L(mm_len_128_or_more_backward):
+	add	%ecx, %eax
+	cmp	%edx, %eax
+	movl	SRC(%esp), %eax		/* Reload src; mov leaves the flags alone.  */
+	jbe	L(forward)
+	PUSH (%esi)
+	PUSH (%edi)
+	PUSH (%ebx)
+
+/* Aligning the address of destination.  The first 64 source bytes are
+   kept in XMM4-XMM7 and the last 16 on the stack, because the loop may
+   overwrite them before they are stored.  Afterwards:
+     ESI = dst + len, EDI = len,
+     ECX = dst + len rounded down to 16 bytes,
+     EAX = source address matching ECX,
+     EBX = number of 64-byte iterations.  */
+	movdqu	(%eax), %xmm4
+	movdqu	16(%eax), %xmm5
+	movdqu	32(%eax), %xmm6
+	movdqu	48(%eax), %xmm7
+	leal	(%edx, %ecx), %esi
+	movdqu	-16(%eax, %ecx), %xmm0
+	subl	$16, %esp
+	movdqu	%xmm0, (%esp)
+	mov	%ecx, %edi
+	movl	%esi, %ecx
+	andl	$-16, %ecx
+	leal	(%ecx), %ebx
+	subl	%edx, %ebx
+	leal	(%eax, %ebx), %eax
+	shrl	$6, %ebx
+
+/* Lengths of at least half the shared cache size use the non-temporal
+   loop.  In the SHARED case EBX is borrowed as the PIC register, so the
+   iteration count is saved around the comparison; popl does not modify
+   the flags.  */
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %edi
+# else
+#  ifdef SHARED
+	PUSH (%ebx)
+	SETUP_PIC_REG (bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
+	POP (%ebx)
+#  else
+	cmp	__x86_shared_cache_size_half, %edi
+#  endif
+# endif
+	jae	L(mm_large_page_loop_backward)
+
+	.p2align 4
+L(mm_main_loop_backward):
+
+	prefetcht0 -128(%eax)
+
+	movdqu	-64(%eax), %xmm0
+	movdqu	-48(%eax), %xmm1
+	movdqu	-32(%eax), %xmm2
+	movdqu	-16(%eax), %xmm3
+	movaps	%xmm0, -64(%ecx)
+	subl	$64, %eax
+	movaps	%xmm1, -48(%ecx)
+	movaps	%xmm2, -32(%ecx)
+	movaps	%xmm3, -16(%ecx)
+	subl	$64, %ecx
+	sub	$1, %ebx
+	jnz	L(mm_main_loop_backward)
+/* Store the saved last 16 and first 64 bytes; they cover whatever the
+   aligned loop did not reach at either end.  */
+	movdqu	(%esp), %xmm0
+	addl	$16, %esp
+	movdqu	%xmm0, -16(%esi)
+	movdqu	%xmm4, (%edx)
+	movdqu	%xmm5, 16(%edx)
+	movdqu	%xmm6, 32(%edx)
+	movdqu	%xmm7, 48(%edx)
+	POP (%ebx)
+	jmp	L(mm_return_pop_all)
+
+/* Copy [0..16] and return.  The length bits in CL select the size
+   class.  The .p2align directives placed between a test and its jump
+   only emit padding, so the flags from the test are still valid.  */
+L(mm_len_0_16_bytes_backward):
+	testb	$24, %cl		/* Bit 3 or 4 set: 8..16 bytes.  */
+	jnz	L(mm_len_9_16_bytes_backward)
+	testb	$4, %cl			/* Bit 2 set: 4..7 bytes.  */
+	.p2align 4,,5
+	jnz	L(mm_len_5_8_bytes_backward)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(return)		/* Zero bytes: nothing to copy.  */
+	testb	$2, %cl			/* Bit 1 set: 2..3 bytes.  */
+	.p2align 4,,1
+	jne	L(mm_len_3_4_bytes_backward)
+	movzbl	-1(%eax,%ecx), %ebx	/* Exactly one byte is left.  */
+	movzbl	(%eax), %eax
+	movb	%bl, -1(%edx,%ecx)
+	movb	%al, (%edx)
+	jmp	L(return)
+
+/* 2..3 bytes: first and last 16-bit words, both loaded before storing.  */
+L(mm_len_3_4_bytes_backward):
+	movzwl	-2(%eax,%ecx), %ebx
+	movzwl	(%eax), %eax
+	movw	%bx, -2(%edx,%ecx)
+	movw	%ax, (%edx)
+	jmp	L(return)
+
+/* 8..16 bytes: copy the last 8 bytes, then re-dispatch on the
+   remaining len - 8 (0..8) leading bytes.  */
+L(mm_len_9_16_bytes_backward):
+	PUSH (%esi)
+	movl	-4(%eax,%ecx), %ebx
+	movl	-8(%eax,%ecx), %esi
+	movl	%ebx, -4(%edx,%ecx)
+	movl	%esi, -8(%edx,%ecx)
+	subl	$8, %ecx
+	POP (%esi)
+	jmp	L(mm_len_0_16_bytes_backward)
+
+/* 4..7 bytes: first and last 32-bit words, both loaded before storing.  */
+L(mm_len_5_8_bytes_backward):
+	movl	(%eax), %ebx
+	movl	-4(%eax,%ecx), %eax
+	movl	%ebx, (%edx)
+	movl	%eax, -4(%edx,%ecx)
+	jmp	L(return)
+
+/* Big length copy backward part.  Same downward walk as
+   L(mm_main_loop_backward): EAX/ECX are the source/destination
+   cursors and EBX counts 64-byte iterations.  The stores are
+   non-temporal (movntdq) and are followed by an sfence.  */
+	.p2align 4
+L(mm_large_page_loop_backward):
+	movdqu	-64(%eax), %xmm0
+	movdqu	-48(%eax), %xmm1
+	movdqu	-32(%eax), %xmm2
+	movdqu	-16(%eax), %xmm3
+	movntdq	%xmm0, -64(%ecx)
+	subl	$64, %eax
+	movntdq	%xmm1, -48(%ecx)
+	movntdq	%xmm2, -32(%ecx)
+	movntdq	%xmm3, -16(%ecx)
+	subl	$64, %ecx
+	sub	$1, %ebx
+	jnz	L(mm_large_page_loop_backward)
+	sfence
+	movdqu	(%esp), %xmm0		/* Saved last 16 source bytes.  */
+	addl	$16, %esp
+	movdqu	%xmm0, -16(%esi)
+	movdqu	%xmm4, (%edx)		/* Saved first 64 source bytes.  */
+	movdqu	%xmm5, 16(%edx)
+	movdqu	%xmm6, 32(%edx)
+	movdqu	%xmm7, 48(%edx)
+	POP (%ebx)
+	jmp	L(mm_return_pop_all)
+/* Forward side of the dispatch (src above dst).  If dst + len <= src
+   the buffers do not overlap and the plain forward copy is used.  The
+   test compares addresses, so it must be unsigned (jbe): a signed jle
+   gives the wrong answer when the two addresses lie on opposite sides
+   of 0x80000000.  */
+L(check_forward):
+	add	%edx, %ecx
+	cmp	%eax, %ecx
+	movl	LEN(%esp), %ecx		/* Reload len; mov leaves the flags alone.  */
+	jbe	L(forward)
+
+/* Now do checks for lengths.  We do [0..16], (16..32], (32..64] and
+   (64..128] separately.  */
+	cmp	$16, %ecx
+	jbe	L(mm_len_0_16_bytes_forward)
+
+	cmpl	$32, %ecx
+	ja	L(mm_len_32_or_more_forward)
+
+/* Copy (16..32] bytes and return.  Both loads are done before either
+   store, so overlapping buffers are handled.  */
+	movdqu	(%eax), %xmm0
+	movdqu	-16(%eax, %ecx), %xmm1
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_32_or_more_forward):
+	cmpl	$64, %ecx
+	ja	L(mm_len_64_or_more_forward)
+
+/* Copy (32..64] bytes and return: 32 bytes from each end, with all
+   loads done before the first store.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	-16(%eax, %ecx), %xmm2
+	movdqu	-32(%eax, %ecx), %xmm3
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, -16(%edx, %ecx)
+	movdqu	%xmm3, -32(%edx, %ecx)
+	jmp	L(return)
+
+L(mm_len_64_or_more_forward):
+	cmpl	$128, %ecx
+	ja	L(mm_len_128_or_more_forward)
+
+/* Copy (64..128] bytes and return: 64 bytes from each end, with all
+   loads done before the first store.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+	jmp	L(return)
+/* More than 128 bytes, copied from low to high addresses.  */
+L(mm_len_128_or_more_forward):
+	PUSH (%esi)
+	PUSH (%edi)
+	PUSH (%ebx)
+
+/* Aligning the address of destination.  The last 64 source bytes are
+   kept in XMM4-XMM7 and the first 16 on the stack, because the loop may
+   overwrite them before they are stored.  Afterwards:
+     ESI = dst + len, EDI = len,
+     ECX = dst + 16 rounded down to 16 bytes,
+     EAX = source address matching ECX,
+     EBX = number of 64-byte iterations.  */
+	movdqu	-16(%eax, %ecx), %xmm4
+	movdqu	-32(%eax, %ecx), %xmm5
+	movdqu	-48(%eax, %ecx), %xmm6
+	movdqu	-64(%eax, %ecx), %xmm7
+	leal	(%edx, %ecx), %esi
+	movdqu	(%eax), %xmm0
+	subl	$16, %esp
+	movdqu	%xmm0, (%esp)
+	mov	%ecx, %edi
+	leal	16(%edx), %ecx
+	andl	$-16, %ecx
+	movl	%ecx, %ebx
+	subl	%edx, %ebx
+	addl	%ebx, %eax
+	movl	%esi, %ebx
+	subl	%ecx, %ebx
+	shrl	$6, %ebx
+
+/* Lengths of at least half the shared cache size use the non-temporal
+   loop.  In the SHARED case EBX is borrowed as the PIC register, so the
+   iteration count is saved around the comparison.  */
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %edi
+# else
+#  ifdef SHARED
+	PUSH (%ebx)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
+	POP (%ebx)
+#  else
+	cmp	__x86_shared_cache_size_half, %edi
+#  endif
+# endif
+	jae	L(mm_large_page_loop_forward)
+
+	.p2align 4
+L(mm_main_loop_forward):
+
+	prefetcht0 128(%eax)
+
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqa	%xmm0, (%ecx)
+	addl	$64, %eax
+	movaps	%xmm1, 16(%ecx)
+	movaps	%xmm2, 32(%ecx)
+	movaps	%xmm3, 48(%ecx)
+	addl	$64, %ecx
+	sub	$1, %ebx
+	jnz	L(mm_main_loop_forward)
+	movdqu	(%esp), %xmm0		/* Saved first 16 source bytes.  */
+	addl	$16, %esp
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm4, -16(%esi)	/* Saved last 64 source bytes.  */
+	movdqu	%xmm5, -32(%esi)
+	movdqu	%xmm6, -48(%esi)
+	movdqu	%xmm7, -64(%esi)
+	POP (%ebx)
+	jmp	L(mm_return_pop_all)
+/* Copy [0..16] bytes and return.  The length bits in CL select the
+   size class; every class loads all of its data before storing.  */
+L(mm_len_0_16_bytes_forward):
+	testb	$24, %cl		/* Bit 3 or 4 set: 8..16 bytes.  */
+	jne	L(mm_len_9_16_bytes_forward)
+	testb	$4, %cl			/* Bit 2 set: 4..7 bytes.  */
+	.p2align 4,,5
+	jne	L(mm_len_5_8_bytes_forward)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(return)		/* Zero bytes: nothing to copy.  */
+	testb	$2, %cl			/* Bit 1 set: 2..3 bytes.  */
+	.p2align 4,,1
+	jne	L(mm_len_2_4_bytes_forward)
+	movzbl	-1(%eax,%ecx), %ebx	/* Exactly one byte is left.  */
+	movzbl	(%eax), %eax
+	movb	%bl, -1(%edx,%ecx)
+	movb	%al, (%edx)
+	jmp	L(return)
+
+/* 2..3 bytes: first and last 16-bit words.  */
+L(mm_len_2_4_bytes_forward):
+	movzwl	-2(%eax,%ecx), %ebx
+	movzwl	(%eax), %eax
+	movw	%bx, -2(%edx,%ecx)
+	movw	%ax, (%edx)
+	jmp	L(return)
+
+/* 4..7 bytes: first and last 32-bit words.  */
+L(mm_len_5_8_bytes_forward):
+	movl	(%eax), %ebx
+	movl	-4(%eax,%ecx), %eax
+	movl	%ebx, (%edx)
+	movl	%eax, -4(%edx,%ecx)
+	jmp	L(return)
+
+/* 8..16 bytes: first and last 64-bit halves.  */
+L(mm_len_9_16_bytes_forward):
+	movq	(%eax), %xmm0
+	movq	-8(%eax, %ecx), %xmm1
+	movq	%xmm0, (%edx)
+	movq	%xmm1, -8(%edx, %ecx)
+	jmp	L(return)
+/* Common exit for the paths that pushed ESI and EDI: return the
+   destination pointer (EDX) in EAX.  */
+L(mm_return_pop_all):
+	movl	%edx, %eax
+	POP (%edi)
+	POP (%esi)
+	RETURN
+
+/* Big length copy forward part.  Same upward walk as
+   L(mm_main_loop_forward): EAX/ECX are the source/destination cursors
+   and EBX counts 64-byte iterations.  The stores are non-temporal
+   (movntdq) and are followed by an sfence.  */
+	.p2align 4
+L(mm_large_page_loop_forward):
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movntdq	%xmm0, (%ecx)
+	addl	$64, %eax
+	movntdq	%xmm1, 16(%ecx)
+	movntdq	%xmm2, 32(%ecx)
+	movntdq	%xmm3, 48(%ecx)
+	addl	$64, %ecx
+	sub	$1, %ebx
+	jnz	L(mm_large_page_loop_forward)
+	sfence
+	movdqu	(%esp), %xmm0		/* Saved first 16 source bytes.  */
+	addl	$16, %esp
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm4, -16(%esi)	/* Saved last 64 source bytes.  */
+	movdqu	%xmm5, -32(%esi)
+	movdqu	%xmm6, -48(%esi)
+	movdqu	%xmm7, -64(%esi)
+	POP (%ebx)
+	jmp	L(mm_return_pop_all)
+# endif
+/* Forward copy: ECX = length, EAX = source, EDX = destination.  Each
+   step below stores before the next step loads.  */
+L(forward):
+	cmp	$16, %ecx
+	jbe	L(len_0_16_bytes)
+
+/* Lengths of at least half the shared cache size take the
+   non-temporal path.  In the SHARED case EBX is overwritten with the
+   GOT address.  */
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_shared_cache_size_half, %ecx
+#  endif
+# endif
+	jae     L(large_page)
+
+/* First and last 16 bytes; that is everything when len <= 32.  movdqu
+   stores do not change the flags set by the cmpl.  */
+	movdqu	(%eax), %xmm0
+	movdqu	-16(%eax, %ecx), %xmm1
+	cmpl    $32, %ecx
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jbe	L(return)
+
+/* Second 16 bytes from each end; that is everything when len <= 64.  */
+	movdqu	16(%eax), %xmm0
+	movdqu	-32(%eax, %ecx), %xmm1
+	cmpl    $64, %ecx
+	movdqu	%xmm0, 16(%edx)
+	movdqu	%xmm1, -32(%edx, %ecx)
+	jbe	L(return)
+
+/* Bytes 32..63 from each end; that is everything when len <= 128.  */
+	movdqu	32(%eax), %xmm0
+	movdqu	48(%eax), %xmm1
+	movdqu	-48(%eax, %ecx), %xmm2
+	movdqu	-64(%eax, %ecx), %xmm3
+	cmpl    $128, %ecx
+	movdqu	%xmm0, 32(%edx)
+	movdqu	%xmm1, 48(%edx)
+	movdqu	%xmm2, -48(%edx, %ecx)
+	movdqu	%xmm3, -64(%edx, %ecx)
+	jbe	L(return)
+
+/* Now the main loop: we align the address of the destination.
+   The first and last 64 bytes have already been stored by the code
+   above, so only the 64-byte aligned middle remains:
+     EBX = first 64-byte aligned destination address above dst,
+     ECX = dst + len rounded down to 64 bytes,
+     EAX = src - dst, so (%ebx, %eax) is the source for (%ebx).  */
+	leal	64(%edx), %ebx
+	andl	$-64, %ebx
+
+	addl	%edx, %ecx
+	andl	$-64, %ecx
+
+	subl	%edx, %eax
+
+/* We should stop two iterations before the termination
+	(in order not to misprefetch).  */
+	subl	$64, %ecx
+	cmpl	%ebx, %ecx
+	je	L(main_loop_just_one_iteration)
+
+	subl	$64, %ecx
+	cmpl	%ebx, %ecx
+	je	L(main_loop_last_two_iterations)
+
+	.p2align 4
+L(main_loop_cache):
+
+	prefetcht0 128(%ebx, %eax)
+
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqa	%xmm0, (%ebx)
+	movaps	%xmm1, 16(%ebx)
+	movaps	%xmm2, 32(%ebx)
+	movaps	%xmm3, 48(%ebx)
+	lea	64(%ebx), %ebx
+	cmpl	%ebx, %ecx
+	jne	L(main_loop_cache)
+
+/* Final 128 bytes of the aligned middle, copied without prefetching.  */
+L(main_loop_last_two_iterations):
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqu	64(%ebx, %eax), %xmm4
+	movdqu	80(%ebx, %eax), %xmm5
+	movdqu	96(%ebx, %eax), %xmm6
+	movdqu	112(%ebx, %eax), %xmm7
+	movdqa	%xmm0, (%ebx)
+	movaps	%xmm1, 16(%ebx)
+	movaps	%xmm2, 32(%ebx)
+	movaps	%xmm3, 48(%ebx)
+	movaps	%xmm4, 64(%ebx)
+	movaps	%xmm5, 80(%ebx)
+	movaps	%xmm6, 96(%ebx)
+	movaps	%xmm7, 112(%ebx)
+	jmp	L(return)
+
+/* The aligned middle is exactly one 64-byte block.  */
+L(main_loop_just_one_iteration):
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqa	%xmm0, (%ebx)
+	movaps	%xmm1, 16(%ebx)
+	movaps	%xmm2, 32(%ebx)
+	movaps	%xmm3, 48(%ebx)
+	jmp	L(return)
+/* Length at least half the shared cache size.  Copy the first and last
+   128 bytes with ordinary unaligned stores, then the 128-byte aligned
+   middle with non-temporal stores.  */
+L(large_page):
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+
+	movdqu	64(%eax), %xmm0
+	movdqu	80(%eax), %xmm1
+	movdqu	96(%eax), %xmm2
+	movdqu	112(%eax), %xmm3
+	movdqu	-128(%eax, %ecx), %xmm4
+	movdqu	-112(%eax, %ecx), %xmm5
+	movdqu	-96(%eax, %ecx), %xmm6
+	movdqu	-80(%eax, %ecx), %xmm7
+	movdqu	%xmm0, 64(%edx)
+	movdqu	%xmm1, 80(%edx)
+	movdqu	%xmm2, 96(%edx)
+	movdqu	%xmm3, 112(%edx)
+	movdqu	%xmm4, -128(%edx, %ecx)
+	movdqu	%xmm5, -112(%edx, %ecx)
+	movdqu	%xmm6, -96(%edx, %ecx)
+	movdqu	%xmm7, -80(%edx, %ecx)
+
+/* Now the main loop with non temporal stores. We align
+	the address of the destination.
+     EBX = first 128-byte aligned destination address above dst,
+     ECX = dst + len rounded down to 128 bytes,
+     EAX = src - dst, so (%ebx, %eax) is the source for (%ebx).
+   NOTE(review): the loop tests only after the first iteration, so it
+   assumes ECX > EBX on entry, i.e. that the cache-size threshold is
+   comfortably above 256 bytes -- confirm.  */
+	leal	128(%edx), %ebx
+	andl	$-128, %ebx
+
+	addl	%edx, %ecx
+	andl	$-128, %ecx
+
+	subl	%edx, %eax
+
+	.p2align 4
+L(main_loop_large_page):
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqu	64(%ebx, %eax), %xmm4
+	movdqu	80(%ebx, %eax), %xmm5
+	movdqu	96(%ebx, %eax), %xmm6
+	movdqu	112(%ebx, %eax), %xmm7
+	movntdq	%xmm0, (%ebx)
+	movntdq	%xmm1, 16(%ebx)
+	movntdq	%xmm2, 32(%ebx)
+	movntdq	%xmm3, 48(%ebx)
+	movntdq	%xmm4, 64(%ebx)
+	movntdq	%xmm5, 80(%ebx)
+	movntdq	%xmm6, 96(%ebx)
+	movntdq	%xmm7, 112(%ebx)
+	lea	128(%ebx), %ebx
+	cmpl	%ebx, %ecx
+	jne	L(main_loop_large_page)
+	sfence
+	jmp	L(return)
+/* Forward copy of [0..16] bytes; the length bits in CL select the
+   size class.  */
+L(len_0_16_bytes):
+	testb	$24, %cl		/* Bit 3 or 4 set: 8..16 bytes.  */
+	jne	L(len_9_16_bytes)
+	testb	$4, %cl			/* Bit 2 set: 4..7 bytes.  */
+	.p2align 4,,5
+	jne	L(len_5_8_bytes)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(return)		/* Zero bytes: nothing to copy.  */
+	movzbl	(%eax), %ebx		/* 1..3 bytes: always copy the first byte.  */
+	testb	$2, %cl
+	movb	%bl, (%edx)		/* mov keeps the flags from testb.  */
+	je	L(return)		/* Length was 1.  */
+	movzwl	-2(%eax,%ecx), %ebx	/* Length 2..3: add the last two bytes.  */
+	movw	%bx, -2(%edx,%ecx)
+	jmp	L(return)
+
+/* 8..16 bytes: first and last 64-bit halves.  */
+L(len_9_16_bytes):
+	movq	(%eax), %xmm0
+	movq	-8(%eax, %ecx), %xmm1
+	movq	%xmm0, (%edx)
+	movq	%xmm1, -8(%edx, %ecx)
+	jmp	L(return)
+
+/* 4..7 bytes: first and last 32-bit words; falls through to the
+   return sequence.  */
+L(len_5_8_bytes):
+	movl	(%eax), %ebx
+	movl	%ebx, (%edx)
+	movl	-4(%eax,%ecx), %ebx
+	movl	%ebx, -4(%edx,%ecx)
+
+/* Return the destination pointer, or destination + length when built
+   as mempcpy.  */
+L(return):
+	movl	%edx, %eax
+# if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+	RETURN
+
+END (MEMCPY)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
new file mode 100644
index 0000000000..687e083147
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
@@ -0,0 +1,1809 @@
+/* memcpy with SSSE3 and REP string.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY		__memcpy_ssse3_rep
+# define MEMCPY_CHK	__memcpy_chk_ssse3_rep
+#endif
+
+#ifdef USE_AS_BCOPY
+# define SRC		PARMS
+# define DEST		SRC+4
+# define LEN		DEST+4
+#else
+# define DEST		PARMS
+# define SRC		DEST+4
+# define LEN		SRC+4
+#endif
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef SHARED
+# define PARMS		8		/* Preserve EBX.  */
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    /* We first load PC into EBX.  */				\
+    SETUP_PIC_REG(bx);						\
+    /* Get the address of the jump table.  */			\
+    addl	$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
+    addl	$(TABLE - .), %ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+#else
+# define PARMS		4
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  INDEX is a register containing the index into the
+   jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    jmp		*TABLE(,INDEX,SCALE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
+    jmp		*TABLE(,INDEX,SCALE)
+#endif
+
+	.section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	movl	12(%esp), %eax		/* Third stack argument: the length.  */
+	cmpl	%eax, 16(%esp)		/* Fourth argument, presumably the destination size.  */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* Fourth < length (unsigned).  */
+END (MEMCPY_CHK)	/* No ret: execution falls through into MEMCPY.  */
+#endif
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+
+#ifdef USE_AS_MEMMOVE
+	cmp	%eax, %edx
+	jb	L(copy_forward)
+	je	L(fwd_write_0bytes)
+	cmp	$48, %ecx
+	jb	L(bk_write_less48bytes)
+	add	%ecx, %eax
+	cmp	%eax, %edx
+	movl	SRC(%esp), %eax
+	jb	L(copy_backward)
+
+L(copy_forward):
+#endif
+	cmp	$48, %ecx
+	jae	L(48bytesormore)
+
+L(fwd_write_less32bytes):
+#ifndef USE_AS_MEMMOVE
+	cmp	%dl, %al
+	jb	L(bk_write)
+#endif
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+#ifndef USE_AS_MEMMOVE
+L(bk_write):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+#endif
+
+	ALIGN (4)
+/* ECX > 32 and EDX is 4 byte aligned.  */
+L(48bytesormore):
+	movdqu	(%eax), %xmm0
+	PUSH (%edi)
+	movl	%edx, %edi
+	and	$-16, %edx
+	PUSH (%esi)
+	cfi_remember_state
+	add	$16, %edx
+	movl	%edi, %esi
+	sub	%edx, %edi
+	add	%edi, %ecx
+	sub	%edi, %eax
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_shared_cache_size_half, %ecx
+# endif
+#endif
+
+	mov	%eax, %edi
+	jae	L(large_page)
+	and	$0xf, %edi
+	jz	L(shl_0)
+
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
+
+	ALIGN (4)
+L(shl_0):
+	movdqu	%xmm0, (%esi)
+	xor	%edi, %edi
+	cmp	$127, %ecx
+	ja	L(shl_0_gobble)
+	lea	-32(%ecx), %ecx
+L(shl_0_loop):
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+L(shl_0_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	add	%edi, %eax
+	POP (%esi)
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+L(shl_0_gobble):
+
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_data_cache_size_half@GOTOFF(%ebx), %edi
+# else
+	mov	__x86_data_cache_size_half, %edi
+# endif
+#endif
+	mov	%edi, %esi
+	shr	$3, %esi
+	sub	%esi, %edi
+	cmp	%edi, %ecx
+	jae	L(shl_0_gobble_mem_start)
+	sub	$128, %ecx
+	ALIGN (4)
+L(shl_0_gobble_cache_loop):
+	movdqa	(%eax), %xmm0
+	movaps	0x10(%eax), %xmm1
+	movaps	0x20(%eax), %xmm2
+	movaps	0x30(%eax), %xmm3
+	movaps	0x40(%eax), %xmm4
+	movaps	0x50(%eax), %xmm5
+	movaps	0x60(%eax), %xmm6
+	movaps	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movaps	%xmm1, 0x10(%edx)
+	movaps	%xmm2, 0x20(%edx)
+	movaps	%xmm3, 0x30(%edx)
+	movaps	%xmm4, 0x40(%edx)
+	movaps	%xmm5, 0x50(%edx)
+	movaps	%xmm6, 0x60(%edx)
+	movaps	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_cache_loop)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(shl_0_cache_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+L(shl_0_cache_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_cache_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+L(shl_0_cache_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_cache_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+L(shl_0_cache_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	POP (%esi)
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_0_gobble_mem_start):
+	cmp	%al, %dl
+	je	L(copy_page_by_rep)
+	sub	$128, %ecx
+L(shl_0_gobble_mem_loop):
+	prefetchnta 0x1c0(%eax)
+	prefetchnta 0x280(%eax)
+	prefetchnta 0x1c0(%edx)
+	prefetchnta 0x280(%edx)
+
+	movdqa	(%eax), %xmm0
+	movaps	0x10(%eax), %xmm1
+	movaps	0x20(%eax), %xmm2
+	movaps	0x30(%eax), %xmm3
+	movaps	0x40(%eax), %xmm4
+	movaps	0x50(%eax), %xmm5
+	movaps	0x60(%eax), %xmm6
+	movaps	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$0x80, %ecx
+	movdqa	%xmm0, (%edx)
+	movaps	%xmm1, 0x10(%edx)
+	movaps	%xmm2, 0x20(%edx)
+	movaps	%xmm3, 0x30(%edx)
+	movaps	%xmm4, 0x40(%edx)
+	movaps	%xmm5, 0x50(%edx)
+	movaps	%xmm6, 0x60(%edx)
+	movaps	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_mem_loop)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(shl_0_mem_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+L(shl_0_mem_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_mem_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+L(shl_0_mem_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_mem_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+L(shl_0_mem_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	POP (%esi)
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_1):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$1, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_1_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_1_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_1_loop)
+
+L(shl_1_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	1(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_2):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$2, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_2_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_2_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_2_loop)
+
+L(shl_2_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	2(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_3):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$3, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_3_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_3_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_3_loop)
+
+L(shl_3_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	3(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_4):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$4, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_4_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_4_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_4_loop)
+
+L(shl_4_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	4(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_5):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$5, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_5_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_5_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_5_loop)
+
+L(shl_5_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	5(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_6):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$6, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_6_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_6_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_6_loop)
+
+L(shl_6_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	6(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_7):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$7, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_7_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_7_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_7_loop)
+
+L(shl_7_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	7(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state	/* Unwind info: reinstate the state saved by the last cfi_remember_state for this entry point.  */
+	cfi_remember_state	/* Save it again; this block pops ESI/EDI before it ends.  */
+	ALIGN (4)
+L(shl_8):	/* Entry 8 of L(shl_table): forward copy that realigns the source with PALIGNR $8.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the tail dispatch below.  */
+	sub	$8, %eax	/* Step the source pointer back 8 bytes; the aligned loads below need EAX 16-byte aligned.  */
+	movaps	(%eax), %xmm1	/* xmm1 = first aligned source block, carry-in for the first PALIGNR.  */
+	xor	%edi, %edi	/* EDI = running byte offset, starts at 0.  */
+	sub	$32, %ecx	/* Bias the count by one 32-byte step; undone at L(shl_8_end).  */
+	movdqu	%xmm0, (%esi)	/* xmm0 and ESI are set up before this chunk (presumably the unaligned head of the destination -- confirm).  */
+	POP (%esi)
+L(shl_8_loop):	/* Two 32-byte steps per iteration; the carry-in block alternates between xmm1 and xmm4.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the JB below; MOVDQA, PALIGNR and LEA leave the flags untouched.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Keep the unshifted block as carry-in for the second step.  */
+	palignr	$8, %xmm2, %xmm3	/* xmm3 = bytes 8..15 of xmm2, then bytes 0..7 of xmm3.  */
+	palignr	$8, %xmm1, %xmm2	/* xmm2 = bytes 8..15 of xmm1, then bytes 0..7 of xmm2.  */
+	lea	32(%edi), %edi	/* LEA rather than ADD so the flags from SUB survive.  */
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores: EDX + EDI must be 16-byte aligned.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_8_end)	/* The SUB above borrowed: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* Carry-in for the next pass through the first step.  */
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm4, %xmm2	/* Carry-in here is xmm4, saved by the first step.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_8_loop)	/* No borrow: go round again.  */
+
+L(shl_8_end):
+	add	$32, %ecx	/* Undo the bias: ECX = bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination + bytes copied here + remainder (the tails address backwards from it).  */
+	lea	8(%edi, %eax), %eax	/* Same for the source, adding back the 8 subtracted at entry.  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Macro defined outside this chunk; presumably jumps to table entry ECX to copy the remainder.  */
+
+	cfi_restore_state	/* Unwind info: reinstate the state saved by the last cfi_remember_state for this entry point.  */
+	cfi_remember_state	/* Save it again; this block pops ESI/EDI before it ends.  */
+	ALIGN (4)
+L(shl_9):	/* Entry 9 of L(shl_table): forward copy that realigns the source with PALIGNR $9.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the tail dispatch below.  */
+	sub	$9, %eax	/* Step the source pointer back 9 bytes; the aligned loads below need EAX 16-byte aligned.  */
+	movaps	(%eax), %xmm1	/* xmm1 = first aligned source block, carry-in for the first PALIGNR.  */
+	xor	%edi, %edi	/* EDI = running byte offset, starts at 0.  */
+	sub	$32, %ecx	/* Bias the count by one 32-byte step; undone at L(shl_9_end).  */
+	movdqu	%xmm0, (%esi)	/* xmm0 and ESI are set up before this chunk (presumably the unaligned head of the destination -- confirm).  */
+	POP (%esi)
+L(shl_9_loop):	/* Two 32-byte steps per iteration; the carry-in block alternates between xmm1 and xmm4.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the JB below; MOVDQA, PALIGNR and LEA leave the flags untouched.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Keep the unshifted block as carry-in for the second step.  */
+	palignr	$9, %xmm2, %xmm3	/* xmm3 = bytes 9..15 of xmm2, then bytes 0..8 of xmm3.  */
+	palignr	$9, %xmm1, %xmm2	/* xmm2 = bytes 9..15 of xmm1, then bytes 0..8 of xmm2.  */
+	lea	32(%edi), %edi	/* LEA rather than ADD so the flags from SUB survive.  */
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores: EDX + EDI must be 16-byte aligned.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_9_end)	/* The SUB above borrowed: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* Carry-in for the next pass through the first step.  */
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm4, %xmm2	/* Carry-in here is xmm4, saved by the first step.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_9_loop)	/* No borrow: go round again.  */
+
+L(shl_9_end):
+	add	$32, %ecx	/* Undo the bias: ECX = bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination + bytes copied here + remainder (the tails address backwards from it).  */
+	lea	9(%edi, %eax), %eax	/* Same for the source, adding back the 9 subtracted at entry.  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Macro defined outside this chunk; presumably jumps to table entry ECX to copy the remainder.  */
+
+	cfi_restore_state	/* Unwind info: reinstate the state saved by the last cfi_remember_state for this entry point.  */
+	cfi_remember_state	/* Save it again; this block pops ESI/EDI before it ends.  */
+	ALIGN (4)
+L(shl_10):	/* Entry 10 of L(shl_table): forward copy that realigns the source with PALIGNR $10.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the tail dispatch below.  */
+	sub	$10, %eax	/* Step the source pointer back 10 bytes; the aligned loads below need EAX 16-byte aligned.  */
+	movaps	(%eax), %xmm1	/* xmm1 = first aligned source block, carry-in for the first PALIGNR.  */
+	xor	%edi, %edi	/* EDI = running byte offset, starts at 0.  */
+	sub	$32, %ecx	/* Bias the count by one 32-byte step; undone at L(shl_10_end).  */
+	movdqu	%xmm0, (%esi)	/* xmm0 and ESI are set up before this chunk (presumably the unaligned head of the destination -- confirm).  */
+	POP (%esi)
+L(shl_10_loop):	/* Two 32-byte steps per iteration; the carry-in block alternates between xmm1 and xmm4.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the JB below; MOVDQA, PALIGNR and LEA leave the flags untouched.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Keep the unshifted block as carry-in for the second step.  */
+	palignr	$10, %xmm2, %xmm3	/* xmm3 = bytes 10..15 of xmm2, then bytes 0..9 of xmm3.  */
+	palignr	$10, %xmm1, %xmm2	/* xmm2 = bytes 10..15 of xmm1, then bytes 0..9 of xmm2.  */
+	lea	32(%edi), %edi	/* LEA rather than ADD so the flags from SUB survive.  */
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores: EDX + EDI must be 16-byte aligned.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_10_end)	/* The SUB above borrowed: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* Carry-in for the next pass through the first step.  */
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm4, %xmm2	/* Carry-in here is xmm4, saved by the first step.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_10_loop)	/* No borrow: go round again.  */
+
+L(shl_10_end):
+	add	$32, %ecx	/* Undo the bias: ECX = bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination + bytes copied here + remainder (the tails address backwards from it).  */
+	lea	10(%edi, %eax), %eax	/* Same for the source, adding back the 10 subtracted at entry.  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Macro defined outside this chunk; presumably jumps to table entry ECX to copy the remainder.  */
+
+	cfi_restore_state	/* Unwind info: reinstate the state saved by the last cfi_remember_state for this entry point.  */
+	cfi_remember_state	/* Save it again; this block pops ESI/EDI before it ends.  */
+	ALIGN (4)
+L(shl_11):	/* Entry 11 of L(shl_table): forward copy that realigns the source with PALIGNR $11.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the tail dispatch below.  */
+	sub	$11, %eax	/* Step the source pointer back 11 bytes; the aligned loads below need EAX 16-byte aligned.  */
+	movaps	(%eax), %xmm1	/* xmm1 = first aligned source block, carry-in for the first PALIGNR.  */
+	xor	%edi, %edi	/* EDI = running byte offset, starts at 0.  */
+	sub	$32, %ecx	/* Bias the count by one 32-byte step; undone at L(shl_11_end).  */
+	movdqu	%xmm0, (%esi)	/* xmm0 and ESI are set up before this chunk (presumably the unaligned head of the destination -- confirm).  */
+	POP (%esi)
+L(shl_11_loop):	/* Two 32-byte steps per iteration; the carry-in block alternates between xmm1 and xmm4.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the JB below; MOVDQA, PALIGNR and LEA leave the flags untouched.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Keep the unshifted block as carry-in for the second step.  */
+	palignr	$11, %xmm2, %xmm3	/* xmm3 = bytes 11..15 of xmm2, then bytes 0..10 of xmm3.  */
+	palignr	$11, %xmm1, %xmm2	/* xmm2 = bytes 11..15 of xmm1, then bytes 0..10 of xmm2.  */
+	lea	32(%edi), %edi	/* LEA rather than ADD so the flags from SUB survive.  */
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores: EDX + EDI must be 16-byte aligned.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_11_end)	/* The SUB above borrowed: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* Carry-in for the next pass through the first step.  */
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm4, %xmm2	/* Carry-in here is xmm4, saved by the first step.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_11_loop)	/* No borrow: go round again.  */
+
+L(shl_11_end):
+	add	$32, %ecx	/* Undo the bias: ECX = bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination + bytes copied here + remainder (the tails address backwards from it).  */
+	lea	11(%edi, %eax), %eax	/* Same for the source, adding back the 11 subtracted at entry.  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Macro defined outside this chunk; presumably jumps to table entry ECX to copy the remainder.  */
+
+	cfi_restore_state	/* Unwind info: reinstate the state saved by the last cfi_remember_state for this entry point.  */
+	cfi_remember_state	/* Save it again; this block pops ESI/EDI before it ends.  */
+	ALIGN (4)
+L(shl_12):	/* Entry 12 of L(shl_table): forward copy that realigns the source with PALIGNR $12.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the tail dispatch below.  */
+	sub	$12, %eax	/* Step the source pointer back 12 bytes; the aligned loads below need EAX 16-byte aligned.  */
+	movaps	(%eax), %xmm1	/* xmm1 = first aligned source block, carry-in for the first PALIGNR.  */
+	xor	%edi, %edi	/* EDI = running byte offset, starts at 0.  */
+	sub	$32, %ecx	/* Bias the count by one 32-byte step; undone at L(shl_12_end).  */
+	movdqu	%xmm0, (%esi)	/* xmm0 and ESI are set up before this chunk (presumably the unaligned head of the destination -- confirm).  */
+	POP (%esi)
+L(shl_12_loop):	/* Two 32-byte steps per iteration; the carry-in block alternates between xmm1 and xmm4.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the JB below; MOVDQA, PALIGNR and LEA leave the flags untouched.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Keep the unshifted block as carry-in for the second step.  */
+	palignr	$12, %xmm2, %xmm3	/* xmm3 = bytes 12..15 of xmm2, then bytes 0..11 of xmm3.  */
+	palignr	$12, %xmm1, %xmm2	/* xmm2 = bytes 12..15 of xmm1, then bytes 0..11 of xmm2.  */
+	lea	32(%edi), %edi	/* LEA rather than ADD so the flags from SUB survive.  */
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores: EDX + EDI must be 16-byte aligned.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_12_end)	/* The SUB above borrowed: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* Carry-in for the next pass through the first step.  */
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm4, %xmm2	/* Carry-in here is xmm4, saved by the first step.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_12_loop)	/* No borrow: go round again.  */
+
+L(shl_12_end):
+	add	$32, %ecx	/* Undo the bias: ECX = bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination + bytes copied here + remainder (the tails address backwards from it).  */
+	lea	12(%edi, %eax), %eax	/* Same for the source, adding back the 12 subtracted at entry.  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Macro defined outside this chunk; presumably jumps to table entry ECX to copy the remainder.  */
+
+	cfi_restore_state	/* Unwind info: reinstate the state saved by the last cfi_remember_state for this entry point.  */
+	cfi_remember_state	/* Save it again; this block pops ESI/EDI before it ends.  */
+	ALIGN (4)
+L(shl_13):	/* Entry 13 of L(shl_table): forward copy that realigns the source with PALIGNR $13.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the tail dispatch below.  */
+	sub	$13, %eax	/* Step the source pointer back 13 bytes; the aligned loads below need EAX 16-byte aligned.  */
+	movaps	(%eax), %xmm1	/* xmm1 = first aligned source block, carry-in for the first PALIGNR.  */
+	xor	%edi, %edi	/* EDI = running byte offset, starts at 0.  */
+	sub	$32, %ecx	/* Bias the count by one 32-byte step; undone at L(shl_13_end).  */
+	movdqu	%xmm0, (%esi)	/* xmm0 and ESI are set up before this chunk (presumably the unaligned head of the destination -- confirm).  */
+	POP (%esi)
+L(shl_13_loop):	/* Two 32-byte steps per iteration; the carry-in block alternates between xmm1 and xmm4.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the JB below; MOVDQA, PALIGNR and LEA leave the flags untouched.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Keep the unshifted block as carry-in for the second step.  */
+	palignr	$13, %xmm2, %xmm3	/* xmm3 = bytes 13..15 of xmm2, then bytes 0..12 of xmm3.  */
+	palignr	$13, %xmm1, %xmm2	/* xmm2 = bytes 13..15 of xmm1, then bytes 0..12 of xmm2.  */
+	lea	32(%edi), %edi	/* LEA rather than ADD so the flags from SUB survive.  */
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores: EDX + EDI must be 16-byte aligned.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_13_end)	/* The SUB above borrowed: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* Carry-in for the next pass through the first step.  */
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm4, %xmm2	/* Carry-in here is xmm4, saved by the first step.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_13_loop)	/* No borrow: go round again.  */
+
+L(shl_13_end):
+	add	$32, %ecx	/* Undo the bias: ECX = bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination + bytes copied here + remainder (the tails address backwards from it).  */
+	lea	13(%edi, %eax), %eax	/* Same for the source, adding back the 13 subtracted at entry.  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Macro defined outside this chunk; presumably jumps to table entry ECX to copy the remainder.  */
+
+	cfi_restore_state	/* Unwind info: reinstate the state saved by the last cfi_remember_state for this entry point.  */
+	cfi_remember_state	/* Save it again; this block pops ESI/EDI before it ends.  */
+	ALIGN (4)
+L(shl_14):	/* Entry 14 of L(shl_table): forward copy that realigns the source with PALIGNR $14.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the tail dispatch below.  */
+	sub	$14, %eax	/* Step the source pointer back 14 bytes; the aligned loads below need EAX 16-byte aligned.  */
+	movaps	(%eax), %xmm1	/* xmm1 = first aligned source block, carry-in for the first PALIGNR.  */
+	xor	%edi, %edi	/* EDI = running byte offset, starts at 0.  */
+	sub	$32, %ecx	/* Bias the count by one 32-byte step; undone at L(shl_14_end).  */
+	movdqu	%xmm0, (%esi)	/* xmm0 and ESI are set up before this chunk (presumably the unaligned head of the destination -- confirm).  */
+	POP (%esi)
+L(shl_14_loop):	/* Two 32-byte steps per iteration; the carry-in block alternates between xmm1 and xmm4.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the JB below; MOVDQA, PALIGNR and LEA leave the flags untouched.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Keep the unshifted block as carry-in for the second step.  */
+	palignr	$14, %xmm2, %xmm3	/* xmm3 = bytes 14..15 of xmm2, then bytes 0..13 of xmm3.  */
+	palignr	$14, %xmm1, %xmm2	/* xmm2 = bytes 14..15 of xmm1, then bytes 0..13 of xmm2.  */
+	lea	32(%edi), %edi	/* LEA rather than ADD so the flags from SUB survive.  */
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores: EDX + EDI must be 16-byte aligned.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_14_end)	/* The SUB above borrowed: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* Carry-in for the next pass through the first step.  */
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm4, %xmm2	/* Carry-in here is xmm4, saved by the first step.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_14_loop)	/* No borrow: go round again.  */
+
+L(shl_14_end):
+	add	$32, %ecx	/* Undo the bias: ECX = bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination + bytes copied here + remainder (the tails address backwards from it).  */
+	lea	14(%edi, %eax), %eax	/* Same for the source, adding back the 14 subtracted at entry.  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Macro defined outside this chunk; presumably jumps to table entry ECX to copy the remainder.  */
+
+	cfi_restore_state	/* Unwind info: reinstate the state saved by the last cfi_remember_state for this entry point.  */
+	cfi_remember_state	/* Save it again; this block pops ESI/EDI before it ends.  */
+	ALIGN (4)
+L(shl_15):	/* Entry 15 of L(shl_table): forward copy that realigns the source with PALIGNR $15.  */
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))	/* Macro defined outside this chunk; presumably prepares the table base for the tail dispatch below.  */
+	sub	$15, %eax	/* Step the source pointer back 15 bytes; the aligned loads below need EAX 16-byte aligned.  */
+	movaps	(%eax), %xmm1	/* xmm1 = first aligned source block, carry-in for the first PALIGNR.  */
+	xor	%edi, %edi	/* EDI = running byte offset, starts at 0.  */
+	sub	$32, %ecx	/* Bias the count by one 32-byte step; undone at L(shl_15_end).  */
+	movdqu	%xmm0, (%esi)	/* xmm0 and ESI are set up before this chunk (presumably the unaligned head of the destination -- confirm).  */
+	POP (%esi)
+L(shl_15_loop):	/* Two 32-byte steps per iteration; the carry-in block alternates between xmm1 and xmm4.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* Sets CF for the JB below; MOVDQA, PALIGNR and LEA leave the flags untouched.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* Keep the unshifted block as carry-in for the second step.  */
+	palignr	$15, %xmm2, %xmm3	/* xmm3 = byte 15 of xmm2, then bytes 0..14 of xmm3.  */
+	palignr	$15, %xmm1, %xmm2	/* xmm2 = byte 15 of xmm1, then bytes 0..14 of xmm2.  */
+	lea	32(%edi), %edi	/* LEA rather than ADD so the flags from SUB survive.  */
+	movdqa	%xmm2, -32(%edx, %edi)	/* Aligned stores: EDX + EDI must be 16-byte aligned.  */
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_15_end)	/* The SUB above borrowed: leave the loop.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* Carry-in for the next pass through the first step.  */
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm4, %xmm2	/* Carry-in here is xmm4, saved by the first step.  */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_15_loop)	/* No borrow: go round again.  */
+
+L(shl_15_end):
+	add	$32, %ecx	/* Undo the bias: ECX = bytes still to copy.  */
+	add	%ecx, %edi
+	add	%edi, %edx	/* EDX = destination + bytes copied here + remainder (the tails address backwards from it).  */
+	lea	15(%edi, %eax), %eax	/* Same for the source, adding back the 15 subtracted at entry.  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)	/* Macro defined outside this chunk; presumably jumps to table entry ECX to copy the remainder.  */
+
+
+	ALIGN (4)
+L(fwd_write_44bytes):	/* Forward tail for remainders that are multiples of 4: each label copies one dword and falls through to the next.  */
+	movl	-44(%eax), %ecx	/* EAX/EDX point just past the end of source/destination; ECX is only scratch here.  */
+	movl	%ecx, -44(%edx)
+L(fwd_write_40bytes):
+	movl	-40(%eax), %ecx
+	movl	%ecx, -40(%edx)
+L(fwd_write_36bytes):
+	movl	-36(%eax), %ecx
+	movl	%ecx, -36(%edx)
+L(fwd_write_32bytes):
+	movl	-32(%eax), %ecx
+	movl	%ecx, -32(%edx)
+L(fwd_write_28bytes):
+	movl	-28(%eax), %ecx
+	movl	%ecx, -28(%edx)
+L(fwd_write_24bytes):
+	movl	-24(%eax), %ecx
+	movl	%ecx, -24(%edx)
+L(fwd_write_20bytes):
+	movl	-20(%eax), %ecx
+	movl	%ecx, -20(%edx)
+L(fwd_write_16bytes):
+	movl	-16(%eax), %ecx
+	movl	%ecx, -16(%edx)
+L(fwd_write_12bytes):
+	movl	-12(%eax), %ecx
+	movl	%ecx, -12(%edx)
+L(fwd_write_8bytes):
+	movl	-8(%eax), %ecx
+	movl	%ecx, -8(%edx)
+L(fwd_write_4bytes):
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+L(fwd_write_0bytes):	/* Nothing left to copy: only the return value is set (not at all for the bcopy build).  */
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy build: return EDX, the address just past the last byte written.  */
+# else
+	movl	DEST(%esp), %eax	/* Otherwise return the destination argument, reloaded from the stack.  */
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_5bytes):	/* 5-byte tail done as two overlapping dword moves (bytes -5..-2 and -4..-1).  */
+	movl	-5(%eax), %ecx
+	movl	-4(%eax), %eax	/* Overwrites the source pointer with data; EAX is not used as a pointer again below.  */
+	movl	%ecx, -5(%edx)
+	movl	%eax, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy build: return EDX, the address just past the last byte written.  */
+# else
+	movl	DEST(%esp), %eax	/* Otherwise return the destination argument, reloaded from the stack.  */
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_45bytes):	/* Forward tail for remainders of the form 4k+1 (except 5, which has its own label): dwords fall through, then one byte.  */
+	movl	-45(%eax), %ecx	/* EAX/EDX point just past the end of source/destination; ECX is only scratch here.  */
+	movl	%ecx, -45(%edx)
+L(fwd_write_41bytes):
+	movl	-41(%eax), %ecx
+	movl	%ecx, -41(%edx)
+L(fwd_write_37bytes):
+	movl	-37(%eax), %ecx
+	movl	%ecx, -37(%edx)
+L(fwd_write_33bytes):
+	movl	-33(%eax), %ecx
+	movl	%ecx, -33(%edx)
+L(fwd_write_29bytes):
+	movl	-29(%eax), %ecx
+	movl	%ecx, -29(%edx)
+L(fwd_write_25bytes):
+	movl	-25(%eax), %ecx
+	movl	%ecx, -25(%edx)
+L(fwd_write_21bytes):
+	movl	-21(%eax), %ecx
+	movl	%ecx, -21(%edx)
+L(fwd_write_17bytes):
+	movl	-17(%eax), %ecx
+	movl	%ecx, -17(%edx)
+L(fwd_write_13bytes):
+	movl	-13(%eax), %ecx
+	movl	%ecx, -13(%edx)
+L(fwd_write_9bytes):
+	movl	-9(%eax), %ecx
+	movl	%ecx, -9(%edx)
+	movl	-5(%eax), %ecx	/* No label here: L(fwd_write_5bytes) is a separate routine, so this dword belongs to the 9-byte step.  */
+	movl	%ecx, -5(%edx)
+L(fwd_write_1bytes):
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy build: return EDX, the address just past the last byte written.  */
+# else
+	movl	DEST(%esp), %eax	/* Otherwise return the destination argument, reloaded from the stack.  */
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_46bytes):	/* Forward tail for remainders of the form 4k+2: dwords fall through, then one 16-bit word.  */
+	movl	-46(%eax), %ecx	/* EAX/EDX point just past the end of source/destination; ECX is only scratch here.  */
+	movl	%ecx, -46(%edx)
+L(fwd_write_42bytes):
+	movl	-42(%eax), %ecx
+	movl	%ecx, -42(%edx)
+L(fwd_write_38bytes):
+	movl	-38(%eax), %ecx
+	movl	%ecx, -38(%edx)
+L(fwd_write_34bytes):
+	movl	-34(%eax), %ecx
+	movl	%ecx, -34(%edx)
+L(fwd_write_30bytes):
+	movl	-30(%eax), %ecx
+	movl	%ecx, -30(%edx)
+L(fwd_write_26bytes):
+	movl	-26(%eax), %ecx
+	movl	%ecx, -26(%edx)
+L(fwd_write_22bytes):
+	movl	-22(%eax), %ecx
+	movl	%ecx, -22(%edx)
+L(fwd_write_18bytes):
+	movl	-18(%eax), %ecx
+	movl	%ecx, -18(%edx)
+L(fwd_write_14bytes):
+	movl	-14(%eax), %ecx
+	movl	%ecx, -14(%edx)
+L(fwd_write_10bytes):
+	movl	-10(%eax), %ecx
+	movl	%ecx, -10(%edx)
+L(fwd_write_6bytes):
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+L(fwd_write_2bytes):
+	movzwl	-2(%eax), %ecx	/* Zero-extending load of the final 16-bit word.  */
+	movw	%cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy build: return EDX, the address just past the last byte written.  */
+# else
+	movl	DEST(%esp), %eax	/* Otherwise return the destination argument, reloaded from the stack.  */
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_47bytes):	/* Forward tail for remainders of the form 4k+3: dwords fall through, then a word and a byte.  */
+	movl	-47(%eax), %ecx	/* EAX/EDX point just past the end of source/destination; ECX is only scratch here.  */
+	movl	%ecx, -47(%edx)
+L(fwd_write_43bytes):
+	movl	-43(%eax), %ecx
+	movl	%ecx, -43(%edx)
+L(fwd_write_39bytes):
+	movl	-39(%eax), %ecx
+	movl	%ecx, -39(%edx)
+L(fwd_write_35bytes):
+	movl	-35(%eax), %ecx
+	movl	%ecx, -35(%edx)
+L(fwd_write_31bytes):
+	movl	-31(%eax), %ecx
+	movl	%ecx, -31(%edx)
+L(fwd_write_27bytes):
+	movl	-27(%eax), %ecx
+	movl	%ecx, -27(%edx)
+L(fwd_write_23bytes):
+	movl	-23(%eax), %ecx
+	movl	%ecx, -23(%edx)
+L(fwd_write_19bytes):
+	movl	-19(%eax), %ecx
+	movl	%ecx, -19(%edx)
+L(fwd_write_15bytes):
+	movl	-15(%eax), %ecx
+	movl	%ecx, -15(%edx)
+L(fwd_write_11bytes):
+	movl	-11(%eax), %ecx
+	movl	%ecx, -11(%edx)
+L(fwd_write_7bytes):
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+L(fwd_write_3bytes):
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax	/* Overwrites the source pointer with data; EAX is not used as a pointer again below.  */
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* mempcpy build: return EDX, the address just past the last byte written.  */
+# else
+	movl	DEST(%esp), %eax	/* Otherwise return the destination argument, reloaded from the stack.  */
+# endif
+#endif
+	RETURN_END	/* RETURN_END rather than RETURN: macro defined outside this chunk (presumably omits the trailing CFI re-push -- confirm).  */
+
+	cfi_restore_state	/* Unwind info: reinstate the state saved by the last cfi_remember_state for this entry point.  */
+	cfi_remember_state	/* Save it again; this block pops ESI/EDI below.  */
+	ALIGN (4)
+L(large_page):	/* Bulk forward copy using non-temporal (MOVNTDQ) stores; the condition for getting here is decided outside this chunk.  */
+	movdqu	(%eax), %xmm1
+	movdqu	%xmm0, (%esi)	/* xmm0 and ESI are set up before this chunk (presumably the unaligned head of the destination -- confirm).  */
+	movntdq	%xmm1, (%edx)	/* MOVNTDQ needs a 16-byte aligned address, so EDX must already be aligned here.  */
+	add	$0x10, %eax
+	add	$0x10, %edx
+	sub	$0x10, %ecx
+	cmp	%al, %dl	/* Compare only the low 8 bits of the source and destination pointers...  */
+	je	L(copy_page_by_rep)	/* ...and switch to the REP MOVSL path when they match.  */
+L(large_page_loop_init):
+	POP (%esi)
+	sub	$0x80, %ecx	/* Bias the count by one 128-byte iteration; undone after the loop.  */
+	POP (%edi)
+L(large_page_loop):	/* 128 bytes per iteration: eight unaligned loads, then eight non-temporal stores.  */
+	prefetchnta	0x1c0(%eax)	/* Non-temporal prefetch hints 0x1c0 and 0x280 bytes ahead of the source pointer.  */
+	prefetchnta	0x280(%eax)
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	movdqu	0x40(%eax), %xmm4
+	movdqu	0x50(%eax), %xmm5
+	movdqu	0x60(%eax), %xmm6
+	movdqu	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	lfence	/* Load fence between the block of loads and the streaming stores.  */
+	sub	$0x80, %ecx	/* Sets CF for the JAE below; MOVNTDQ and LEA leave the flags untouched.  */
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	movntdq	%xmm4, 0x40(%edx)
+	movntdq	%xmm5, 0x50(%edx)
+	movntdq	%xmm6, 0x60(%edx)
+	movntdq	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+	jae	L(large_page_loop)	/* No borrow from the SUB: at least another 128 bytes to go.  */
+	add	$0x80, %ecx	/* Undo the bias: ECX = bytes left, now below 128.  */
+	cmp	$0x40, %ecx
+	jb	L(large_page_less_64bytes)
+
+	movdqu	(%eax), %xmm0	/* One 64-byte step.  */
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	lea	0x40(%eax), %eax
+
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	lea	0x40(%edx), %edx
+	sub	$0x40, %ecx
+L(large_page_less_64bytes):
+	cmp	$32, %ecx
+	jb	L(large_page_less_32bytes)
+	movdqu	(%eax), %xmm0	/* One 32-byte step.  */
+	movdqu	0x10(%eax), %xmm1
+	lea	0x20(%eax), %eax
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	lea	0x20(%edx), %edx
+	sub	$0x20, %ecx
+L(large_page_less_32bytes):
+	add	%ecx, %edx	/* Move both pointers past the end: the L(fwd_write_*) tails address backwards from there.  */
+	add	%ecx, %eax
+	sfence	/* Store fence after the non-temporal stores, before the ordinary tail stores and the return.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)	/* Macro defined outside this chunk; presumably jumps to table entry ECX (below 32 here).  */
+
+	cfi_restore_state	/* Unwind info: reinstate the state saved by the last cfi_remember_state for this entry point.  */
+	cfi_remember_state	/* Save it again; this block pops ESI/EDI before returning.  */
+	ALIGN (4)
+L(copy_page_by_rep):	/* Copy ECX bytes from EAX to EDX with REP MOVSL, then a 0-3 byte tail.  */
+	mov	%eax, %esi	/* ESI/EDI are reused as the string-instruction pointers; both are popped again at the exit label.  */
+	mov	%edx, %edi
+	mov	%ecx, %edx
+	shr	$2, %ecx	/* ECX = number of whole dwords.  */
+	and	$3, %edx	/* EDX = leftover bytes (0-3); ZF is set when there are none.  */
+	rep	movsl	/* REP MOVS does not modify the flags, so ZF below still comes from the AND.  */
+	jz	L(copy_page_by_rep_exit)
+	cmp	$2, %edx
+	jb	L(copy_page_by_rep_left_1)	/* Exactly one byte left.  */
+	movzwl	(%esi), %eax	/* Two or three bytes left: copy a 16-bit word first.  */
+	movw	%ax, (%edi)
+	add	$2, %esi
+	add	$2, %edi
+	sub	$2, %edx
+	jz	L(copy_page_by_rep_exit)	/* It was two bytes: done.  */
+L(copy_page_by_rep_left_1):
+	movzbl	(%esi), %eax
+	movb	%al, (%edi)
+L(copy_page_by_rep_exit):
+	POP (%esi)
+	POP (%edi)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax	/* Return the destination argument...  */
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax	/* ...plus the length argument in the mempcpy build.  */
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_44bytes):	/* Backward tail for remainders that are multiples of 4: copies dwords from the highest offset down, falling through.  */
+	movl	40(%eax), %ecx	/* EAX/EDX point at the start of the remaining source/destination bytes; ECX is only scratch here.  */
+	movl	%ecx, 40(%edx)
+L(bk_write_40bytes):
+	movl	36(%eax), %ecx
+	movl	%ecx, 36(%edx)
+L(bk_write_36bytes):
+	movl	32(%eax), %ecx
+	movl	%ecx, 32(%edx)
+L(bk_write_32bytes):
+	movl	28(%eax), %ecx
+	movl	%ecx, 28(%edx)
+L(bk_write_28bytes):
+	movl	24(%eax), %ecx
+	movl	%ecx, 24(%edx)
+L(bk_write_24bytes):
+	movl	20(%eax), %ecx
+	movl	%ecx, 20(%edx)
+L(bk_write_20bytes):
+	movl	16(%eax), %ecx
+	movl	%ecx, 16(%edx)
+L(bk_write_16bytes):
+	movl	12(%eax), %ecx
+	movl	%ecx, 12(%edx)
+L(bk_write_12bytes):
+	movl	8(%eax), %ecx
+	movl	%ecx, 8(%edx)
+L(bk_write_8bytes):
+	movl	4(%eax), %ecx
+	movl	%ecx, 4(%edx)
+L(bk_write_4bytes):
+	movl	(%eax), %ecx
+	movl	%ecx, (%edx)
+L(bk_write_0bytes):	/* Nothing left to copy: only the return value is set (not at all for the bcopy build).  */
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax	/* Return the destination argument...  */
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax	/* ...plus the length argument in the mempcpy build.  */
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_45bytes):	/* Backward tail for remainders of the form 4k+1: dwords from the highest offset down, then the first byte.  */
+	movl	41(%eax), %ecx	/* EAX/EDX point at the start of the remaining source/destination bytes; ECX is only scratch here.  */
+	movl	%ecx, 41(%edx)
+L(bk_write_41bytes):
+	movl	37(%eax), %ecx
+	movl	%ecx, 37(%edx)
+L(bk_write_37bytes):
+	movl	33(%eax), %ecx
+	movl	%ecx, 33(%edx)
+L(bk_write_33bytes):
+	movl	29(%eax), %ecx
+	movl	%ecx, 29(%edx)
+L(bk_write_29bytes):
+	movl	25(%eax), %ecx
+	movl	%ecx, 25(%edx)
+L(bk_write_25bytes):
+	movl	21(%eax), %ecx
+	movl	%ecx, 21(%edx)
+L(bk_write_21bytes):
+	movl	17(%eax), %ecx
+	movl	%ecx, 17(%edx)
+L(bk_write_17bytes):
+	movl	13(%eax), %ecx
+	movl	%ecx, 13(%edx)
+L(bk_write_13bytes):
+	movl	9(%eax), %ecx
+	movl	%ecx, 9(%edx)
+L(bk_write_9bytes):
+	movl	5(%eax), %ecx
+	movl	%ecx, 5(%edx)
+L(bk_write_5bytes):
+	movl	1(%eax), %ecx
+	movl	%ecx, 1(%edx)
+L(bk_write_1bytes):
+	movzbl	(%eax), %ecx
+	movb	%cl, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax	/* Return the destination argument...  */
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax	/* ...plus the length argument in the mempcpy build.  */
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_46bytes):	/* Backward tail for remainders of the form 4k+2: dwords from the highest offset down, then the first 16-bit word.  */
+	movl	42(%eax), %ecx	/* EAX/EDX point at the start of the remaining source/destination bytes; ECX is only scratch here.  */
+	movl	%ecx, 42(%edx)
+L(bk_write_42bytes):
+	movl	38(%eax), %ecx
+	movl	%ecx, 38(%edx)
+L(bk_write_38bytes):
+	movl	34(%eax), %ecx
+	movl	%ecx, 34(%edx)
+L(bk_write_34bytes):
+	movl	30(%eax), %ecx
+	movl	%ecx, 30(%edx)
+L(bk_write_30bytes):
+	movl	26(%eax), %ecx
+	movl	%ecx, 26(%edx)
+L(bk_write_26bytes):
+	movl	22(%eax), %ecx
+	movl	%ecx, 22(%edx)
+L(bk_write_22bytes):
+	movl	18(%eax), %ecx
+	movl	%ecx, 18(%edx)
+L(bk_write_18bytes):
+	movl	14(%eax), %ecx
+	movl	%ecx, 14(%edx)
+L(bk_write_14bytes):
+	movl	10(%eax), %ecx
+	movl	%ecx, 10(%edx)
+L(bk_write_10bytes):
+	movl	6(%eax), %ecx
+	movl	%ecx, 6(%edx)
+L(bk_write_6bytes):
+	movl	2(%eax), %ecx
+	movl	%ecx, 2(%edx)
+L(bk_write_2bytes):
+	movzwl	(%eax), %ecx	/* Zero-extending load of the first 16-bit word.  */
+	movw	%cx, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax	/* Return the destination argument...  */
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax	/* ...plus the length argument in the mempcpy build.  */
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_47bytes):	/* Backward tail for remainders of the form 4k+3: dwords from the highest offset down, then a word and the first byte.  */
+	movl	43(%eax), %ecx	/* EAX/EDX point at the start of the remaining source/destination bytes; ECX is only scratch here.  */
+	movl	%ecx, 43(%edx)
+L(bk_write_43bytes):
+	movl	39(%eax), %ecx
+	movl	%ecx, 39(%edx)
+L(bk_write_39bytes):
+	movl	35(%eax), %ecx
+	movl	%ecx, 35(%edx)
+L(bk_write_35bytes):
+	movl	31(%eax), %ecx
+	movl	%ecx, 31(%edx)
+L(bk_write_31bytes):
+	movl	27(%eax), %ecx
+	movl	%ecx, 27(%edx)
+L(bk_write_27bytes):
+	movl	23(%eax), %ecx
+	movl	%ecx, 23(%edx)
+L(bk_write_23bytes):
+	movl	19(%eax), %ecx
+	movl	%ecx, 19(%edx)
+L(bk_write_19bytes):
+	movl	15(%eax), %ecx
+	movl	%ecx, 15(%edx)
+L(bk_write_15bytes):
+	movl	11(%eax), %ecx
+	movl	%ecx, 11(%edx)
+L(bk_write_11bytes):
+	movl	7(%eax), %ecx
+	movl	%ecx, 7(%edx)
+L(bk_write_7bytes):
+	movl	3(%eax), %ecx
+	movl	%ecx, 3(%edx)
+L(bk_write_3bytes):
+	movzwl	1(%eax), %ecx	/* Bytes 1..2 as one 16-bit word.  */
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax	/* Overwrites the source pointer with data; EAX is not used as a pointer again below.  */
+	movb	%al, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax	/* Return the destination argument...  */
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax	/* ...plus the length argument in the mempcpy build.  */
+# endif
+#endif
+	RETURN_END	/* RETURN_END rather than RETURN: macro defined outside this chunk (presumably omits the trailing CFI re-push -- confirm).  */
+
+
+	.pushsection .rodata.ssse3,"a",@progbits	/* The jump tables go into a separate allocatable data section; the matching .popsection follows the last table.  */
+	ALIGN (2)
+L(table_48bytes_fwd):	/* Forward-tail dispatch: entry i (32-bit, i = 0..47) refers to L(fwd_write_<i>bytes).  JMPTBL is defined outside this chunk.  */
+	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
+	ALIGN (2)
+L(shl_table):	/* Dispatch for the PALIGNR copy loops: entry i (32-bit, i = 0..15) refers to L(shl_<i>).  The code indexing it is outside this chunk.  */
+	.int	JMPTBL (L(shl_0), L(shl_table))
+	.int	JMPTBL (L(shl_1), L(shl_table))
+	.int	JMPTBL (L(shl_2), L(shl_table))
+	.int	JMPTBL (L(shl_3), L(shl_table))
+	.int	JMPTBL (L(shl_4), L(shl_table))
+	.int	JMPTBL (L(shl_5), L(shl_table))
+	.int	JMPTBL (L(shl_6), L(shl_table))
+	.int	JMPTBL (L(shl_7), L(shl_table))
+	.int	JMPTBL (L(shl_8), L(shl_table))
+	.int	JMPTBL (L(shl_9), L(shl_table))
+	.int	JMPTBL (L(shl_10), L(shl_table))
+	.int	JMPTBL (L(shl_11), L(shl_table))
+	.int	JMPTBL (L(shl_12), L(shl_table))
+	.int	JMPTBL (L(shl_13), L(shl_table))
+	.int	JMPTBL (L(shl_14), L(shl_table))
+	.int	JMPTBL (L(shl_15), L(shl_table))
+
+	ALIGN (2)
+L(table_48_bytes_bwd):	/* Backward-tail dispatch: entry i (32-bit, i = 0..47) refers to L(bk_write_<i>bytes).  JMPTBL is defined outside this chunk.  */
+	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+	.popsection	/* Back to the section that was current before the .pushsection above the first table.  */
+
+#ifdef USE_AS_MEMMOVE
+	ALIGN (4)
+L(copy_backward):	/* memmove-only path copying from the end towards the start.  EAX/EDX/ECX appear to be source, destination and length on entry (set outside this chunk -- confirm).  */
+	PUSH (%esi)
+	movl	%eax, %esi
+	add	%ecx, %edx	/* EDX = one past the end of the destination.  */
+	add	%ecx, %esi	/* ESI = one past the end of the source.  */
+	testl	$0x3, %edx
+	jnz	L(bk_align)	/* Destination end is not 4-byte aligned.  */
+
+L(bk_aligned_4):
+	cmp	$64, %ecx
+	jae	L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+	cmp	$32, %ecx
+	jb	L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+	/* Copy one 32-byte chunk, highest dword first (no loop; falls through to the tail).  */
+	sub	$32, %ecx
+	movl	-4(%esi), %eax
+	movl	%eax, -4(%edx)
+	movl	-8(%esi), %eax
+	movl	%eax, -8(%edx)
+	movl	-12(%esi), %eax
+	movl	%eax, -12(%edx)
+	movl	-16(%esi), %eax
+	movl	%eax, -16(%edx)
+	movl	-20(%esi), %eax
+	movl	%eax, -20(%edx)
+	movl	-24(%esi), %eax
+	movl	%eax, -24(%edx)
+	movl	-28(%esi), %eax
+	movl	%eax, -28(%edx)
+	movl	-32(%esi), %eax
+	movl	%eax, -32(%edx)
+	sub	$32, %edx
+	sub	$32, %esi
+
+L(bk_write_less32bytes):
+	movl	%esi, %eax
+	sub	%ecx, %edx	/* Rewind both pointers to the start of the remaining ECX bytes...  */
+	sub	%ecx, %eax	/* ...which is how the L(bk_write_*) tails address them (non-negative offsets).  */
+	POP (%esi)
+L(bk_write_less48bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)	/* Macro defined outside this chunk; presumably jumps to table entry ECX.  */
+
+	CFI_PUSH (%esi)	/* Unwind info only: the code below is entered from L(copy_backward) with ESI still pushed, unlike the code just above.  */
+	ALIGN (4)
+L(bk_align):	/* Bring the destination end (EDX) down to a 4-byte boundary with a 1-byte and/or 2-byte copy.  */
+	cmp	$8, %ecx
+	jbe	L(bk_write_less32bytes)	/* 8 bytes or fewer: skip the alignment and go straight to the tail.  */
+	testl	$1, %edx
+	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
+	   then (EDX & 2) must be != 0.  */
+	jz	L(bk_got2)
+	sub	$1, %esi
+	sub	$1, %ecx
+	sub	$1, %edx
+	movzbl	(%esi), %eax
+	movb	%al, (%edx)
+
+	testl	$2, %edx
+	jz	L(bk_aligned_4)	/* Already 4-byte aligned after the single byte.  */
+
+L(bk_got2):
+	sub	$2, %esi
+	sub	$2, %ecx
+	sub	$2, %edx
+	movzwl	(%esi), %eax
+	movw	%ax, (%edx)
+	jmp	L(bk_aligned_4)
+
+	ALIGN (4)
+L(bk_write_more64bytes):	/* Backward copy of 64 bytes or more: align the destination end to 16 bytes, then move 64 bytes per iteration.  */
+	/* Check alignment of last byte.  */
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+/* EDX is aligned 4 bytes, but not 16 bytes.  */
+L(bk_ssse3_align):	/* At most three dword copies are needed to reach a 16-byte boundary; the steps are unrolled.  */
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %esi	/* Third step needs no test afterwards: EDX is 16-byte aligned by now.  */
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+	cmp	$64, %ecx
+	jb	L(bk_write_more32bytes)	/* The alignment copies may have taken the count below 64.  */
+
+L(bk_ssse3_cpy):	/* 64 bytes per iteration, highest block first: unaligned loads from ESI, aligned stores to EDX.  */
+	sub	$64, %esi
+	sub	$64, %ecx
+	sub	$64, %edx
+	movdqu	0x30(%esi), %xmm3
+	movdqa	%xmm3, 0x30(%edx)
+	movdqu	0x20(%esi), %xmm2
+	movdqa	%xmm2, 0x20(%edx)
+	movdqu	0x10(%esi), %xmm1
+	movdqa	%xmm1, 0x10(%edx)
+	movdqu	(%esi), %xmm0
+	movdqa	%xmm0, (%edx)
+	cmp	$64, %ecx
+	jae	L(bk_ssse3_cpy)
+	jmp	L(bk_write_64bytesless)	/* Fewer than 64 bytes left: finish with the dword paths.  */
+
+#endif
+
+END (MEMCPY)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
new file mode 100644
index 0000000000..53e8a6ca1d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
@@ -0,0 +1,3162 @@
+/* memcpy with SSSE3
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+#  define MEMCPY		__memcpy_ssse3
+#  define MEMCPY_CHK	__memcpy_chk_ssse3
+# endif
+
+# ifdef USE_AS_BCOPY
+#  define SRC		PARMS
+#  define DEST		SRC+4
+#  define LEN		DEST+4
+# else
+#  define DEST		PARMS
+#  define SRC		DEST+4
+#  define LEN		SRC+4
+# endif
+
+# define CFI_PUSH(REG)	/* CFI annotations matching a 4-byte push of REG.  */	\
+  cfi_adjust_cfa_offset (4);		\
+  cfi_rel_offset (REG, 0)	/* REG is recorded at relative offset 0.  */
+
+# define CFI_POP(REG)	/* CFI annotations matching a 4-byte pop of REG.  */	\
+  cfi_adjust_cfa_offset (-4);		\
+  cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)	/* Push REG and annotate it.  */
+# define POP(REG)	popl REG; CFI_POP (REG)	/* Pop REG and annotate it.  */
+
+# ifdef SHARED
+#  define PARMS		8		/* Preserve EBX: return address + saved EBX precede the arguments.  */
+#  define ENTRANCE	PUSH (%ebx);	/* EBX is used as PIC / jump-table scratch below.  */
+#  define RETURN_END	POP (%ebx); ret
+#  define RETURN		RETURN_END; CFI_PUSH (%ebx)	/* Return, then re-assert the "EBX pushed" CFI state for the code that follows.  */
+#  define JMPTBL(I, B)	I - B		/* Table entries are offsets relative to B.  */
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+	jump table with relative offsets.  INDEX is a register that contains
+	the index into the jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    /* We first load PC into EBX.  */		\
+	SETUP_PIC_REG(bx);		\
+    /* Get the address of the jump table.  */		\
+	addl	$(TABLE - .), %ebx;		\
+    /* Get the entry and convert the relative offset to the		\
+	absolute address.  */		\
+	addl	(%ebx, INDEX, SCALE), %ebx;		\
+    /* We loaded the jump table.  Go.  */		\
+	jmp	*%ebx
+# else
+
+#  define PARMS		4		/* Only the return address precedes the arguments.  */
+#  define ENTRANCE			/* Nothing to save.  */
+#  define RETURN_END	ret
+#  define RETURN		RETURN_END
+#  define JMPTBL(I, B)	I		/* Table entries are absolute addresses.  */
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+	absolute addresses.  INDEX is a register that contains the index into
+	the jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+	jmp	*TABLE(, INDEX, SCALE)
+# endif
+
+	.section .text.ssse3,"ax",@progbits
+# if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)			/* Checked entry point: validates the length, then runs on into MEMCPY.  */
+	movl	12(%esp), %eax		/* Third stack argument (the offset LEN has when PARMS is 4): the copy length.  */
+	cmpl	%eax, 16(%esp)		/* Fourth stack argument, presumably the destination object size -- confirm against callers.  */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* Fourth argument < length (unsigned): jump to __chk_fail.  */
+END (MEMCPY_CHK)			/* No RET above, so control continues into the code that follows.  */
+# endif
+ENTRY (MEMCPY)				/* Entry: load the arguments and pick a copy strategy.  */
+	ENTRANCE			/* SHARED builds push EBX here; otherwise empty.  */
+	movl	LEN(%esp), %ecx		/* ECX = byte count.  */
+	movl	SRC(%esp), %eax		/* EAX = source pointer.  */
+	movl	DEST(%esp), %edx	/* EDX = destination pointer.  */
+
+# ifdef USE_AS_MEMMOVE
+	cmp	%eax, %edx
+	jb	L(copy_forward)		/* dst < src (unsigned): copy forward.  */
+	je	L(fwd_write_0bytes)	/* dst == src; target is outside this excerpt, presumably a plain return.  */
+	cmp	$32, %ecx
+	jae	L(memmove_bwd)		/* dst > src and len >= 32: run the overlap test.  */
+	jmp	L(bk_write_less32bytes_2)	/* dst > src and len < 32.  */
+
+	.p2align 4
+L(memmove_bwd):
+	add	%ecx, %eax		/* EAX = src + len.  */
+	cmp	%eax, %edx
+	movl	SRC(%esp), %eax		/* Reload src; MOV leaves the CMP flags intact.  */
+	jb	L(copy_backward)	/* dst < src + len: the regions overlap, copy backward.  */
+
+L(copy_forward):
+# endif
+	cmp	$48, %ecx
+	jae	L(48bytesormore)
+
+L(fwd_write_less32bytes):		/* Despite the name, the fall-through from above arrives with any len < 48.  */
+# ifndef USE_AS_MEMMOVE
+	cmp	%dl, %al		/* NOTE(review): only the low bytes of src and dst are compared; the rationale is not visible here.  */
+	jb	L(bk_write)
+# endif
+	add	%ecx, %edx		/* Point EDX and EAX at the ends of the two buffers ...  */
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)	/* ... and dispatch on the length.  */
+# ifndef USE_AS_MEMMOVE
+	.p2align 4
+L(bk_write):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)	/* Dispatch on the length with EAX/EDX still at the buffer starts.  */
+# endif
+
+	.p2align 4
+L(48bytesormore):			/* len >= 48: copy the head, align dst to 16 bytes, then pick a bulk loop.  */
+# ifndef USE_AS_MEMMOVE
+	movlpd	(%eax), %xmm0		/* Copy the first 16 bytes right away as two 8-byte halves.  */
+	movlpd	8(%eax), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+# else
+	movdqu	(%eax), %xmm0		/* memmove: only load the first 16 bytes; they are stored later (e.g. at L(shl_0)).  */
+# endif
+	PUSH (%edi)
+	movl	%edx, %edi		/* EDI = original dst.  */
+	and	$-16, %edx
+	add	$16, %edx		/* EDX = next 16-byte boundary strictly above dst.  */
+	sub	%edx, %edi		/* EDI = dst - EDX = -(bytes skipped), in [-16, -1].  */
+	add	%edi, %ecx		/* Shrink the count by the skipped bytes.  */
+	sub	%edi, %eax		/* Advance src by the same amount.  */
+
+# ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)		/* PIC: EBX is clobbered to address the variable via the GOT base.  */
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_shared_cache_size_half, %ecx
+#  endif
+# endif
+
+	mov	%eax, %edi		/* MOV keeps the CMP flags for the JAE below.  */
+	jae	L(large_page)		/* Count >= the shared-cache threshold.  */
+	and	$0xf, %edi		/* EDI = src misalignment now that dst is 16-byte aligned.  */
+	jz	L(shl_0)		/* src is aligned too.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)	/* Otherwise dispatch on the misalignment (1..15).  */
+
+	.p2align 4
+L(shl_0):				/* src and dst are both 16-byte aligned.  */
+# ifdef USE_AS_MEMMOVE
+	movl	DEST+4(%esp), %edi	/* Original dst; +4 skips the EDI pushed in L(48bytesormore).  */
+	movdqu	%xmm0, (%edi)		/* Store the 16 head bytes held in XMM0.  */
+# endif
+	xor	%edi, %edi		/* EDI = running byte offset.  */
+	cmp	$127, %ecx
+	ja	L(shl_0_gobble)		/* More than 127 bytes: use the 128-byte loops.  */
+	lea	-32(%ecx), %ecx		/* Bias the count by -32 so each SUB below borrows when fewer than 32 bytes remain after the chunk.  */
+
+	.p2align 4
+L(shl_0_loop):				/* Unrolled: up to four 32-byte chunks, no back edge.  */
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx		/* Sets the flags for the JB; MOVDQA and LEA leave them alone.  */
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)		/* NOTE(review): with the `cmp $127' guard above this JB looks always taken, which would make the fourth copy below unreachable -- confirm.  */
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+
+L(shl_0_end):
+	lea	32(%ecx), %ecx		/* Undo the bias: ECX = tail length.  */
+	add	%ecx, %edi
+	add	%edi, %edx		/* dst -> end of the data.  */
+	add	%edi, %eax		/* src -> end of the data.  */
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)	/* Finish the tail via the jump table.  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_0_gobble):			/* Aligned src and dst with more than 127 bytes left (entered from L(shl_0)).  */
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	POP	(%edi)			/* EDI is not used on this path; POPL and LEA keep the CMP flags.  */
+	lea	-128(%ecx), %ecx	/* Bias the count by -128 for the loop test.  */
+	jae	L(shl_0_gobble_mem_loop)	/* Count >= the data-cache threshold: use the prefetching loop.  */
+
+	.p2align 4
+L(shl_0_gobble_cache_loop):		/* 128 bytes per iteration through XMM0-XMM7.  */
+	movdqa	(%eax), %xmm0
+	movdqa	0x10(%eax), %xmm1
+	movdqa	0x20(%eax), %xmm2
+	movdqa	0x30(%eax), %xmm3
+	movdqa	0x40(%eax), %xmm4
+	movdqa	0x50(%eax), %xmm5
+	movdqa	0x60(%eax), %xmm6
+	movdqa	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$128, %ecx		/* Flags for the JAE below; the stores and LEA do not modify them.  */
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	%xmm2, 0x20(%edx)
+	movdqa	%xmm3, 0x30(%edx)
+	movdqa	%xmm4, 0x40(%edx)
+	movdqa	%xmm5, 0x50(%edx)
+	movdqa	%xmm6, 0x60(%edx)
+	movdqa	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_cache_loop)	/* No borrow: at least 128 more bytes remain.  */
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx	/* Undo the bias: ECX = bytes left (LEA keeps the CMP flags).  */
+	jl	L(shl_0_cache_less_64bytes)	/* Biased count < -64, i.e. fewer than 64 bytes left.  */
+
+	movdqa	(%eax), %xmm0		/* Copy one 64-byte chunk.  */
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+
+L(shl_0_cache_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_cache_less_32bytes)
+	movdqa	(%eax), %xmm0		/* Copy one 32-byte chunk.  */
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+
+L(shl_0_cache_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_cache_less_16bytes)
+	sub	$0x10, %ecx		/* Copy one 16-byte chunk.  */
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+
+L(shl_0_cache_less_16bytes):
+	add	%ecx, %edx		/* Point dst and src at the end of the data ...  */
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)	/* ... and finish the tail (< 16 bytes) via the jump table.  */
+
+	.p2align 4
+L(shl_0_gobble_mem_loop):		/* Aligned 128-byte loop with software prefetch; ECX arrives biased by -128.  */
+	prefetcht0 0x1c0(%eax)		/* Prefetch 448 and 640 bytes ahead in src ...  */
+	prefetcht0 0x280(%eax)
+	prefetcht0 0x1c0(%edx)		/* ... and 448 bytes ahead in dst.  */
+
+	movdqa	(%eax), %xmm0
+	movdqa	0x10(%eax), %xmm1
+	movdqa	0x20(%eax), %xmm2
+	movdqa	0x30(%eax), %xmm3
+	movdqa	0x40(%eax), %xmm4
+	movdqa	0x50(%eax), %xmm5
+	movdqa	0x60(%eax), %xmm6
+	movdqa	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$0x80, %ecx		/* Flags for the JAE below; the stores and LEA do not modify them.  */
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	%xmm2, 0x20(%edx)
+	movdqa	%xmm3, 0x30(%edx)
+	movdqa	%xmm4, 0x40(%edx)
+	movdqa	%xmm5, 0x50(%edx)
+	movdqa	%xmm6, 0x60(%edx)
+	movdqa	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_mem_loop)	/* No borrow: at least 128 more bytes remain.  */
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx	/* Undo the bias: ECX = bytes left (LEA keeps the CMP flags).  */
+	jl	L(shl_0_mem_less_64bytes)	/* Biased count < -64, i.e. fewer than 64 bytes left.  */
+
+	movdqa	(%eax), %xmm0		/* Copy one 64-byte chunk.  */
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+
+L(shl_0_mem_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_mem_less_32bytes)
+	movdqa	(%eax), %xmm0		/* Copy one 32-byte chunk.  */
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+
+L(shl_0_mem_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_mem_less_16bytes)
+	sub	$0x10, %ecx		/* Copy one 16-byte chunk.  */
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+
+L(shl_0_mem_less_16bytes):
+	add	%ecx, %edx		/* Point dst and src at the end of the data ...  */
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)	/* ... and finish the tail (< 16 bytes) via the jump table.  */
+
+	.p2align 4
+L(shl_1):				/* dst is 16-byte aligned; presumably the L(shl_table) target for (src & 15) == 1.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-1(%eax), %xmm1		/* XMM1 = aligned 16-byte block that contains the current source byte.  */
+# else
+	movl	DEST+4(%esp), %edi	/* Original dst; +4 skips the EDI pushed in L(48bytesormore).  */
+	movaps	-1(%eax), %xmm1		/* Loaded before the head store below -- presumably because the regions may overlap.  */
+	movdqu	%xmm0, (%edi)		/* Store the 16 head bytes held in XMM0.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_1_no_prefetch)		/* Below the data-cache threshold: loop without prefetching.  */
+
+	lea	-64(%ecx), %ecx		/* Bias the count by -64 for the loop test.  */
+
+	.p2align 4
+L(Shl1LoopStart):			/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)		/* Prefetch 448 bytes ahead in src and dst.  */
+	prefetcht0 0x1c0(%edx)
+	movaps	15(%eax), %xmm2		/* Next four aligned source blocks.  */
+	movaps	31(%eax), %xmm3
+	movaps	47(%eax), %xmm4
+	movaps	63(%eax), %xmm5
+	movaps	%xmm5, %xmm7		/* Keep the last block as carry for the next iteration.  */
+	palignr	$1, %xmm4, %xmm5	/* Each PALIGNR takes 16 bytes starting 1 byte into the pair (source operand low, destination high).  */
+	palignr	$1, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1		/* The carry becomes the "previous block".  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl1LoopStart)	/* Loop while the biased count stays above zero.  */
+
+L(Shl1LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)		/* At most 32 bytes remain; L(shl_end_0) is outside this excerpt.  */
+
+	movaps	15(%eax), %xmm2		/* Otherwise copy one more 32-byte chunk.  */
+	movaps	31(%eax), %xmm3
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Move dst and src to the end of the data; ECX = tail length.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)			/* Unwind info only: EDI is still pushed in the code below.  */
+
+	.p2align 4
+L(sh_1_no_prefetch):
+	lea	-32(%ecx), %ecx		/* Bias the count by -32 for the loop test.  */
+	lea	-1(%eax), %eax		/* Step src back by the 1-byte misalignment, to the block loaded into XMM1.  */
+	xor	%edi, %edi		/* EDI = running byte offset.  */
+
+	.p2align 4
+L(sh_1_no_prefetch_loop):		/* Two 32-byte halves; the carry block alternates between XMM1 and XMM4.  */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx		/* Flags for the JB below; the SSE instructions and LEA leave them alone.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4		/* Carry for the second half.  */
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_1_end_no_prefetch_loop)	/* Borrow: fewer than 32 bytes remain.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1		/* Carry for the next first half.  */
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_1_no_prefetch_loop)
+
+L(sh_1_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx		/* Undo the bias: ECX = tail length.  */
+	add	%ecx, %edi
+	add	%edi, %edx		/* dst -> end of the data.  */
+	lea	1(%edi, %eax), %eax	/* src -> end of the data, restoring the 1-byte misalignment.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Finish the tail via the jump table.  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_2):				/* dst is 16-byte aligned; presumably the L(shl_table) target for (src & 15) == 2.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-2(%eax), %xmm1		/* XMM1 = aligned 16-byte block that contains the current source byte.  */
+# else
+	movl	DEST+4(%esp), %edi	/* Original dst; +4 skips the EDI pushed in L(48bytesormore).  */
+	movaps	-2(%eax), %xmm1
+	movdqu	%xmm0, (%edi)		/* Store the 16 head bytes held in XMM0.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_2_no_prefetch)		/* Below the data-cache threshold: loop without prefetching.  */
+
+	lea	-64(%ecx), %ecx		/* Bias the count by -64 for the loop test.  */
+
+	.p2align 4
+L(Shl2LoopStart):			/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)		/* Prefetch 448 bytes ahead in src and dst.  */
+	prefetcht0 0x1c0(%edx)
+	movaps	14(%eax), %xmm2		/* Next four aligned source blocks.  */
+	movaps	30(%eax), %xmm3
+	movaps	46(%eax), %xmm4
+	movaps	62(%eax), %xmm5
+	movaps	%xmm5, %xmm7		/* Keep the last block as carry for the next iteration.  */
+	palignr	$2, %xmm4, %xmm5	/* Each PALIGNR takes 16 bytes starting 2 bytes into the pair (source operand low, destination high).  */
+	palignr	$2, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1		/* The carry becomes the "previous block".  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl2LoopStart)	/* Loop while the biased count stays above zero.  */
+
+L(Shl2LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)		/* At most 32 bytes remain; L(shl_end_0) is outside this excerpt.  */
+
+	movaps	14(%eax), %xmm2		/* Otherwise copy one more 32-byte chunk.  */
+	movaps	30(%eax), %xmm3
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Move dst and src to the end of the data; ECX = tail length.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)			/* Unwind info only: EDI is still pushed in the code below.  */
+
+	.p2align 4
+L(sh_2_no_prefetch):
+	lea	-32(%ecx), %ecx		/* Bias the count by -32 for the loop test.  */
+	lea	-2(%eax), %eax		/* Step src back by the 2-byte misalignment, to the block loaded into XMM1.  */
+	xor	%edi, %edi		/* EDI = running byte offset.  */
+
+	.p2align 4
+L(sh_2_no_prefetch_loop):		/* Two 32-byte halves; the carry block alternates between XMM1 and XMM4.  */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx		/* Flags for the JB below; the SSE instructions and LEA leave them alone.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4		/* Carry for the second half.  */
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_2_end_no_prefetch_loop)	/* Borrow: fewer than 32 bytes remain.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1		/* Carry for the next first half.  */
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_2_no_prefetch_loop)
+
+L(sh_2_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx		/* Undo the bias: ECX = tail length.  */
+	add	%ecx, %edi
+	add	%edi, %edx		/* dst -> end of the data.  */
+	lea	2(%edi, %eax), %eax	/* src -> end of the data, restoring the 2-byte misalignment.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Finish the tail via the jump table.  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_3):				/* dst is 16-byte aligned; presumably the L(shl_table) target for (src & 15) == 3.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-3(%eax), %xmm1		/* XMM1 = aligned 16-byte block that contains the current source byte.  */
+# else
+	movl	DEST+4(%esp), %edi	/* Original dst; +4 skips the EDI pushed in L(48bytesormore).  */
+	movaps	-3(%eax), %xmm1
+	movdqu	%xmm0, (%edi)		/* Store the 16 head bytes held in XMM0.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_3_no_prefetch)		/* Below the data-cache threshold: loop without prefetching.  */
+
+	lea	-64(%ecx), %ecx		/* Bias the count by -64 for the loop test.  */
+
+	.p2align 4
+L(Shl3LoopStart):			/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)		/* Prefetch 448 bytes ahead in src and dst.  */
+	prefetcht0 0x1c0(%edx)
+	movaps	13(%eax), %xmm2		/* Next four aligned source blocks.  */
+	movaps	29(%eax), %xmm3
+	movaps	45(%eax), %xmm4
+	movaps	61(%eax), %xmm5
+	movaps	%xmm5, %xmm7		/* Keep the last block as carry for the next iteration.  */
+	palignr	$3, %xmm4, %xmm5	/* Each PALIGNR takes 16 bytes starting 3 bytes into the pair (source operand low, destination high).  */
+	palignr	$3, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$3, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1		/* The carry becomes the "previous block".  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl3LoopStart)	/* Loop while the biased count stays above zero.  */
+
+L(Shl3LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)		/* At most 32 bytes remain; L(shl_end_0) is outside this excerpt.  */
+
+	movaps	13(%eax), %xmm2		/* Otherwise copy one more 32-byte chunk.  */
+	movaps	29(%eax), %xmm3
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Move dst and src to the end of the data; ECX = tail length.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)			/* Unwind info only: EDI is still pushed in the code below.  */
+
+	.p2align 4
+L(sh_3_no_prefetch):
+	lea	-32(%ecx), %ecx		/* Bias the count by -32 for the loop test.  */
+	lea	-3(%eax), %eax		/* Step src back by the 3-byte misalignment, to the block loaded into XMM1.  */
+	xor	%edi, %edi		/* EDI = running byte offset.  */
+
+	.p2align 4
+L(sh_3_no_prefetch_loop):		/* Two 32-byte halves; the carry block alternates between XMM1 and XMM4.  */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx		/* Flags for the JB below; the SSE instructions and LEA leave them alone.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4		/* Carry for the second half.  */
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(sh_3_end_no_prefetch_loop)	/* Borrow: fewer than 32 bytes remain.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1		/* Carry for the next first half.  */
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(sh_3_no_prefetch_loop)
+
+L(sh_3_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx		/* Undo the bias: ECX = tail length.  */
+	add	%ecx, %edi
+	add	%edi, %edx		/* dst -> end of the data.  */
+	lea	3(%edi, %eax), %eax	/* src -> end of the data, restoring the 3-byte misalignment.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Finish the tail via the jump table.  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_4):				/* dst is 16-byte aligned; presumably the L(shl_table) target for (src & 15) == 4.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-4(%eax), %xmm1		/* XMM1 = aligned 16-byte block that contains the current source byte.  */
+# else
+	movl	DEST+4(%esp), %edi	/* Original dst; +4 skips the EDI pushed in L(48bytesormore).  */
+	movaps	-4(%eax), %xmm1
+	movdqu	%xmm0, (%edi)		/* Store the 16 head bytes held in XMM0.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_4_no_prefetch)		/* Below the data-cache threshold: loop without prefetching.  */
+
+	lea	-64(%ecx), %ecx		/* Bias the count by -64 for the loop test.  */
+
+	.p2align 4
+L(Shl4LoopStart):			/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)		/* Prefetch 448 bytes ahead in src and dst.  */
+	prefetcht0 0x1c0(%edx)
+	movaps	12(%eax), %xmm2		/* Next four aligned source blocks.  */
+	movaps	28(%eax), %xmm3
+	movaps	44(%eax), %xmm4
+	movaps	60(%eax), %xmm5
+	movaps	%xmm5, %xmm7		/* Keep the last block as carry for the next iteration.  */
+	palignr	$4, %xmm4, %xmm5	/* Each PALIGNR takes 16 bytes starting 4 bytes into the pair (source operand low, destination high).  */
+	palignr	$4, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1		/* The carry becomes the "previous block".  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl4LoopStart)	/* Loop while the biased count stays above zero.  */
+
+L(Shl4LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)		/* At most 32 bytes remain; L(shl_end_0) is outside this excerpt.  */
+
+	movaps	12(%eax), %xmm2		/* Otherwise copy one more 32-byte chunk.  */
+	movaps	28(%eax), %xmm3
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Move dst and src to the end of the data; ECX = tail length.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)			/* Unwind info only: EDI is still pushed in the code below.  */
+
+	.p2align 4
+L(sh_4_no_prefetch):
+	lea	-32(%ecx), %ecx		/* Bias the count by -32 for the loop test.  */
+	lea	-4(%eax), %eax		/* Step src back by the 4-byte misalignment, to the block loaded into XMM1.  */
+	xor	%edi, %edi		/* EDI = running byte offset.  */
+
+	.p2align 4
+L(sh_4_no_prefetch_loop):		/* Two 32-byte halves; the carry block alternates between XMM1 and XMM4.  */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx		/* Flags for the JB below; the SSE instructions and LEA leave them alone.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4		/* Carry for the second half.  */
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(sh_4_end_no_prefetch_loop)	/* Borrow: fewer than 32 bytes remain.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1		/* Carry for the next first half.  */
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(sh_4_no_prefetch_loop)
+
+L(sh_4_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx		/* Undo the bias: ECX = tail length.  */
+	add	%ecx, %edi
+	add	%edi, %edx		/* dst -> end of the data.  */
+	lea	4(%edi, %eax), %eax	/* src -> end of the data, restoring the 4-byte misalignment.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Finish the tail via the jump table.  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_5):				/* dst is 16-byte aligned; presumably the L(shl_table) target for (src & 15) == 5.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-5(%eax), %xmm1		/* XMM1 = aligned 16-byte block that contains the current source byte.  */
+# else
+	movl	DEST+4(%esp), %edi	/* Original dst; +4 skips the EDI pushed in L(48bytesormore).  */
+	movaps	-5(%eax), %xmm1
+	movdqu	%xmm0, (%edi)		/* Store the 16 head bytes held in XMM0.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_5_no_prefetch)		/* Below the data-cache threshold: loop without prefetching.  */
+
+	lea	-64(%ecx), %ecx		/* Bias the count by -64 for the loop test.  */
+
+	.p2align 4
+L(Shl5LoopStart):			/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)		/* Prefetch 448 bytes ahead in src and dst.  */
+	prefetcht0 0x1c0(%edx)
+	movaps	11(%eax), %xmm2		/* Next four aligned source blocks.  */
+	movaps	27(%eax), %xmm3
+	movaps	43(%eax), %xmm4
+	movaps	59(%eax), %xmm5
+	movaps	%xmm5, %xmm7		/* Keep the last block as carry for the next iteration.  */
+	palignr	$5, %xmm4, %xmm5	/* Each PALIGNR takes 16 bytes starting 5 bytes into the pair (source operand low, destination high).  */
+	palignr	$5, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$5, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1		/* The carry becomes the "previous block".  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl5LoopStart)	/* Loop while the biased count stays above zero.  */
+
+L(Shl5LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)		/* At most 32 bytes remain; L(shl_end_0) is outside this excerpt.  */
+
+	movaps	11(%eax), %xmm2		/* Otherwise copy one more 32-byte chunk.  */
+	movaps	27(%eax), %xmm3
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Move dst and src to the end of the data; ECX = tail length.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)			/* Unwind info only: EDI is still pushed in the code below.  */
+
+	.p2align 4
+L(sh_5_no_prefetch):
+	lea	-32(%ecx), %ecx		/* Bias the count by -32 for the loop test.  */
+	lea	-5(%eax), %eax		/* Step src back by the 5-byte misalignment, to the block loaded into XMM1.  */
+	xor	%edi, %edi		/* EDI = running byte offset.  */
+
+	.p2align 4
+L(sh_5_no_prefetch_loop):		/* Two 32-byte halves; the carry block alternates between XMM1 and XMM4.  */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx		/* Flags for the JB below; the SSE instructions and LEA leave them alone.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4		/* Carry for the second half.  */
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(sh_5_end_no_prefetch_loop)	/* Borrow: fewer than 32 bytes remain.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1		/* Carry for the next first half.  */
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(sh_5_no_prefetch_loop)
+
+L(sh_5_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx		/* Undo the bias: ECX = tail length.  */
+	add	%ecx, %edi
+	add	%edi, %edx		/* dst -> end of the data.  */
+	lea	5(%edi, %eax), %eax	/* src -> end of the data, restoring the 5-byte misalignment.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Finish the tail via the jump table.  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_6):				/* dst is 16-byte aligned; presumably the L(shl_table) target for (src & 15) == 6.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-6(%eax), %xmm1		/* XMM1 = aligned 16-byte block that contains the current source byte.  */
+# else
+	movl	DEST+4(%esp), %edi	/* Original dst; +4 skips the EDI pushed in L(48bytesormore).  */
+	movaps	-6(%eax), %xmm1
+	movdqu	%xmm0, (%edi)		/* Store the 16 head bytes held in XMM0.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_6_no_prefetch)		/* Below the data-cache threshold: loop without prefetching.  */
+
+	lea	-64(%ecx), %ecx		/* Bias the count by -64 for the loop test.  */
+
+	.p2align 4
+L(Shl6LoopStart):			/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)		/* Prefetch 448 bytes ahead in src and dst.  */
+	prefetcht0 0x1c0(%edx)
+	movaps	10(%eax), %xmm2		/* Next four aligned source blocks.  */
+	movaps	26(%eax), %xmm3
+	movaps	42(%eax), %xmm4
+	movaps	58(%eax), %xmm5
+	movaps	%xmm5, %xmm7		/* Keep the last block as carry for the next iteration.  */
+	palignr	$6, %xmm4, %xmm5	/* Each PALIGNR takes 16 bytes starting 6 bytes into the pair (source operand low, destination high).  */
+	palignr	$6, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$6, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1		/* The carry becomes the "previous block".  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl6LoopStart)	/* Loop while the biased count stays above zero.  */
+
+L(Shl6LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)		/* At most 32 bytes remain; L(shl_end_0) is outside this excerpt.  */
+
+	movaps	10(%eax), %xmm2		/* Otherwise copy one more 32-byte chunk.  */
+	movaps	26(%eax), %xmm3
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Move dst and src to the end of the data; ECX = tail length.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)			/* Unwind info only: EDI is still pushed in the code below.  */
+
+	.p2align 4
+L(sh_6_no_prefetch):
+	lea	-32(%ecx), %ecx		/* Bias the count by -32 for the loop test.  */
+	lea	-6(%eax), %eax		/* Step src back by the 6-byte misalignment, to the block loaded into XMM1.  */
+	xor	%edi, %edi		/* EDI = running byte offset.  */
+
+	.p2align 4
+L(sh_6_no_prefetch_loop):		/* Two 32-byte halves; the carry block alternates between XMM1 and XMM4.  */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx		/* Flags for the JB below; the SSE instructions and LEA leave them alone.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4		/* Carry for the second half.  */
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(sh_6_end_no_prefetch_loop)	/* Borrow: fewer than 32 bytes remain.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1		/* Carry for the next first half.  */
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(sh_6_no_prefetch_loop)
+
+L(sh_6_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx		/* Undo the bias: ECX = tail length.  */
+	add	%ecx, %edi
+	add	%edi, %edx		/* dst -> end of the data.  */
+	lea	6(%edi, %eax), %eax	/* src -> end of the data, restoring the 6-byte misalignment.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Finish the tail via the jump table.  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_7):				/* dst is 16-byte aligned; presumably the L(shl_table) target for (src & 15) == 7.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-7(%eax), %xmm1		/* XMM1 = aligned 16-byte block that contains the current source byte.  */
+# else
+	movl	DEST+4(%esp), %edi	/* Original dst; +4 skips the EDI pushed in L(48bytesormore).  */
+	movaps	-7(%eax), %xmm1
+	movdqu	%xmm0, (%edi)		/* Store the 16 head bytes held in XMM0.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_7_no_prefetch)		/* Below the data-cache threshold: loop without prefetching.  */
+
+	lea	-64(%ecx), %ecx		/* Bias the count by -64 for the loop test.  */
+
+	.p2align 4
+L(Shl7LoopStart):			/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)		/* Prefetch 448 bytes ahead in src and dst.  */
+	prefetcht0 0x1c0(%edx)
+	movaps	9(%eax), %xmm2		/* Next four aligned source blocks.  */
+	movaps	25(%eax), %xmm3
+	movaps	41(%eax), %xmm4
+	movaps	57(%eax), %xmm5
+	movaps	%xmm5, %xmm7		/* Keep the last block as carry for the next iteration.  */
+	palignr	$7, %xmm4, %xmm5	/* Each PALIGNR takes 16 bytes starting 7 bytes into the pair (source operand low, destination high).  */
+	palignr	$7, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$7, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1		/* The carry becomes the "previous block".  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl7LoopStart)	/* Loop while the biased count stays above zero.  */
+
+L(Shl7LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)		/* At most 32 bytes remain; L(shl_end_0) is outside this excerpt.  */
+
+	movaps	9(%eax), %xmm2		/* Otherwise copy one more 32-byte chunk.  */
+	movaps	25(%eax), %xmm3
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Move dst and src to the end of the data; ECX = tail length.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)			/* Unwind info only: EDI is still pushed in the code below.  */
+
+	.p2align 4
+L(sh_7_no_prefetch):
+	lea	-32(%ecx), %ecx		/* Bias the count by -32 for the loop test.  */
+	lea	-7(%eax), %eax		/* Step src back by the 7-byte misalignment, to the block loaded into XMM1.  */
+	xor	%edi, %edi		/* EDI = running byte offset.  */
+
+	.p2align 4
+L(sh_7_no_prefetch_loop):		/* Two 32-byte halves; the carry block alternates between XMM1 and XMM4.  */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx		/* Flags for the JB below; the SSE instructions and LEA leave them alone.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4		/* Carry for the second half.  */
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_7_end_no_prefetch_loop)	/* Borrow: fewer than 32 bytes remain.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1		/* Carry for the next first half.  */
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_7_no_prefetch_loop)
+
+L(sh_7_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx		/* Undo the bias: ECX = tail length.  */
+	add	%ecx, %edi
+	add	%edi, %edx		/* dst -> end of the data.  */
+	lea	7(%edi, %eax), %eax	/* src -> end of the data, restoring the 7-byte misalignment.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Finish the tail via the jump table.  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_8):				/* dst is 16-byte aligned; presumably the L(shl_table) target for (src & 15) == 8.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-8(%eax), %xmm1		/* XMM1 = aligned 16-byte block that contains the current source byte.  */
+# else
+	movl	DEST+4(%esp), %edi	/* Original dst; +4 skips the EDI pushed in L(48bytesormore).  */
+	movaps	-8(%eax), %xmm1
+	movdqu	%xmm0, (%edi)		/* Store the 16 head bytes held in XMM0.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_8_no_prefetch)		/* Below the data-cache threshold: loop without prefetching.  */
+
+	lea	-64(%ecx), %ecx		/* Bias the count by -64 for the loop test.  */
+
+	.p2align 4
+L(Shl8LoopStart):			/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)		/* Prefetch 448 bytes ahead in src and dst.  */
+	prefetcht0 0x1c0(%edx)
+	movaps	8(%eax), %xmm2		/* Next four aligned source blocks.  */
+	movaps	24(%eax), %xmm3
+	movaps	40(%eax), %xmm4
+	movaps	56(%eax), %xmm5
+	movaps	%xmm5, %xmm7		/* Keep the last block as carry for the next iteration.  */
+	palignr	$8, %xmm4, %xmm5	/* Each PALIGNR takes 16 bytes starting 8 bytes into the pair (source operand low, destination high).  */
+	palignr	$8, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1		/* The carry becomes the "previous block".  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl8LoopStart)	/* Loop while the biased count stays above zero.  */
+
+L(LoopLeave8):				/* Named differently from the ShlNLoopLeave labels of the sibling sections.  */
+	add	$32, %ecx
+	jle	L(shl_end_0)		/* At most 32 bytes remain; L(shl_end_0) is outside this excerpt.  */
+
+	movaps	8(%eax), %xmm2		/* Otherwise copy one more 32-byte chunk.  */
+	movaps	24(%eax), %xmm3
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Move dst and src to the end of the data; ECX = tail length.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)			/* Unwind info only: EDI is still pushed in the code below.  */
+
+	.p2align 4
+L(sh_8_no_prefetch):
+	lea	-32(%ecx), %ecx		/* Bias the count by -32 for the loop test.  */
+	lea	-8(%eax), %eax		/* Step src back by the 8-byte misalignment, to the block loaded into XMM1.  */
+	xor	%edi, %edi		/* EDI = running byte offset.  */
+
+	.p2align 4
+L(sh_8_no_prefetch_loop):		/* Two 32-byte halves; the carry block alternates between XMM1 and XMM4.  */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx		/* Flags for the JB below; the SSE instructions and LEA leave them alone.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4		/* Carry for the second half.  */
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_8_end_no_prefetch_loop)	/* Borrow: fewer than 32 bytes remain.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1		/* Carry for the next first half.  */
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_8_no_prefetch_loop)
+
+L(sh_8_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx		/* Undo the bias: ECX = tail length.  */
+	add	%ecx, %edi
+	add	%edi, %edx		/* dst -> end of the data.  */
+	lea	8(%edi, %eax), %eax	/* src -> end of the data, restoring the 8-byte misalignment.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Finish the tail via the jump table.  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_9):				/* dst is 16-byte aligned; presumably the L(shl_table) target for (src & 15) == 9.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-9(%eax), %xmm1		/* XMM1 = aligned 16-byte block that contains the current source byte.  */
+# else
+	movl	DEST+4(%esp), %edi	/* Original dst; +4 skips the EDI pushed in L(48bytesormore).  */
+	movaps	-9(%eax), %xmm1
+	movdqu	%xmm0, (%edi)		/* Store the 16 head bytes held in XMM0.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_9_no_prefetch)		/* Below the data-cache threshold: loop without prefetching.  */
+
+	lea	-64(%ecx), %ecx		/* Bias the count by -64 for the loop test.  */
+
+	.p2align 4
+L(Shl9LoopStart):			/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)		/* Prefetch 448 bytes ahead in src and dst.  */
+	prefetcht0 0x1c0(%edx)
+	movaps	7(%eax), %xmm2		/* Next four aligned source blocks.  */
+	movaps	23(%eax), %xmm3
+	movaps	39(%eax), %xmm4
+	movaps	55(%eax), %xmm5
+	movaps	%xmm5, %xmm7		/* Keep the last block as carry for the next iteration.  */
+	palignr	$9, %xmm4, %xmm5	/* Each PALIGNR takes 16 bytes starting 9 bytes into the pair (source operand low, destination high).  */
+	palignr	$9, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$9, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1		/* The carry becomes the "previous block".  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl9LoopStart)	/* Loop while the biased count stays above zero.  */
+
+L(Shl9LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)		/* At most 32 bytes remain; L(shl_end_0) is outside this excerpt.  */
+
+	movaps	7(%eax), %xmm2		/* Otherwise copy one more 32-byte chunk.  */
+	movaps	23(%eax), %xmm3
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Move dst and src to the end of the data; ECX = tail length.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)			/* Unwind info only: EDI is still pushed in the code below.  */
+
+	.p2align 4
+L(sh_9_no_prefetch):
+	lea	-32(%ecx), %ecx		/* Bias the count by -32 for the loop test.  */
+	lea	-9(%eax), %eax		/* Step src back by the 9-byte misalignment, to the block loaded into XMM1.  */
+	xor	%edi, %edi		/* EDI = running byte offset.  */
+
+	.p2align 4
+L(sh_9_no_prefetch_loop):		/* Two 32-byte halves; the carry block alternates between XMM1 and XMM4.  */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx		/* Flags for the JB below; the SSE instructions and LEA leave them alone.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4		/* Carry for the second half.  */
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_9_end_no_prefetch_loop)	/* Borrow: fewer than 32 bytes remain.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1		/* Carry for the next first half.  */
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_9_no_prefetch_loop)
+
+L(sh_9_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx		/* Undo the bias: ECX = tail length.  */
+	add	%ecx, %edi
+	add	%edi, %edx		/* dst -> end of the data.  */
+	lea	9(%edi, %eax), %eax	/* src -> end of the data, restoring the 9-byte misalignment.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Finish the tail via the jump table.  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_10):				/* dst is 16-byte aligned; presumably the L(shl_table) target for (src & 15) == 10.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-10(%eax), %xmm1	/* XMM1 = aligned 16-byte block that contains the current source byte.  */
+# else
+	movl	DEST+4(%esp), %edi	/* Original dst; +4 skips the EDI pushed in L(48bytesormore).  */
+	movaps	-10(%eax), %xmm1
+	movdqu	%xmm0, (%edi)		/* Store the 16 head bytes held in XMM0.  */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_10_no_prefetch)		/* Below the data-cache threshold: loop without prefetching.  */
+
+	lea	-64(%ecx), %ecx		/* Bias the count by -64 for the loop test.  */
+
+	.p2align 4
+L(Shl10LoopStart):			/* 64 bytes per iteration.  */
+	prefetcht0 0x1c0(%eax)		/* Prefetch 448 bytes ahead in src and dst.  */
+	prefetcht0 0x1c0(%edx)
+	movaps	6(%eax), %xmm2		/* Next four aligned source blocks.  */
+	movaps	22(%eax), %xmm3
+	movaps	38(%eax), %xmm4
+	movaps	54(%eax), %xmm5
+	movaps	%xmm5, %xmm7		/* Keep the last block as carry for the next iteration.  */
+	palignr	$10, %xmm4, %xmm5	/* Each PALIGNR takes 16 bytes starting 10 bytes into the pair (source operand low, destination high).  */
+	palignr	$10, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$10, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1		/* The carry becomes the "previous block".  */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl10LoopStart)	/* Loop while the biased count stays above zero.  */
+
+L(Shl10LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)		/* At most 32 bytes remain; L(shl_end_0) is outside this excerpt.  */
+
+	movaps	6(%eax), %xmm2		/* Otherwise copy one more 32-byte chunk.  */
+	movaps	22(%eax), %xmm3
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* Move dst and src to the end of the data; ECX = tail length.  */
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)			/* Unwind info only: EDI is still pushed in the code below.  */
+
+	.p2align 4
+L(sh_10_no_prefetch):
+	lea	-32(%ecx), %ecx		/* Bias the count by -32 for the loop test.  */
+	lea	-10(%eax), %eax		/* Step src back by the 10-byte misalignment, to the block loaded into XMM1.  */
+	xor	%edi, %edi		/* EDI = running byte offset.  */
+
+	.p2align 4
+L(sh_10_no_prefetch_loop):		/* Two 32-byte halves; the carry block alternates between XMM1 and XMM4.  */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx		/* Flags for the JB below; the SSE instructions and LEA leave them alone.  */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4		/* Carry for the second half.  */
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_10_end_no_prefetch_loop)	/* Borrow: fewer than 32 bytes remain.  */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1		/* Carry for the next first half.  */
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_10_no_prefetch_loop)
+
+L(sh_10_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx		/* Undo the bias: ECX = tail length.  */
+	add	%ecx, %edi
+	add	%edi, %edx		/* dst -> end of the data.  */
+	lea	10(%edi, %eax), %eax	/* src -> end of the data, restoring the 10-byte misalignment.  */
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* Finish the tail via the jump table.  */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_11):	/* Forward copy with an 11-byte palignr shift: aligned 16-byte loads around eax are recombined into aligned 16-byte stores at edx.  ecx = remaining byte count.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-11(%eax), %xmm1	/* carry chunk: the 16 bytes at eax-11 (movaps requires that address to be 16-byte aligned) */
+# else
+	movl	DEST+4(%esp), %edi	/* edi = DEST argument slot; the +4 presumably accounts for a register pushed before this chunk */
+	movaps	-11(%eax), %xmm1	/* carry chunk: the 16 bytes at eax-11 */
+	movdqu	%xmm0, (%edi)	/* store xmm0 at DEST; xmm0 is loaded before this chunk - presumably the first 16 source bytes, confirm */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx	/* with SETUP_PIC_REG (defined outside this chunk) presumably leaves the GOT base in ebx for the @GOTOFF operand */
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_11_no_prefetch)	/* count below the cache-size threshold: take the loop without prefetch */
+
+	lea	-64(%ecx), %ecx	/* bias the count by -64 for the sub/ja test at the loop bottom */
+
+	.p2align 4
+L(Shl11LoopStart):	/* 64 bytes per iteration */
+	prefetcht0 0x1c0(%eax)	/* prefetch 0x1c0 (448) bytes ahead of the source cursor */
+	prefetcht0 0x1c0(%edx)	/* and ahead of the destination cursor */
+	movaps	5(%eax), %xmm2	/* 5 = 16 - 11: the aligned chunk following the carry */
+	movaps	21(%eax), %xmm3
+	movaps	37(%eax), %xmm4
+	movaps	53(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* keep the unshifted last chunk; it becomes the carry (xmm1) below */
+	palignr	$11, %xmm4, %xmm5	/* each palignr yields bytes 11..26 of the pair dst:src */
+	palignr	$11, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$11, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* carry for the next iteration */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl11LoopStart)	/* loop while the sub neither borrows nor reaches zero */
+
+L(Shl11LoopLeave):
+	add	$32, %ecx	/* ecx = bytes left - 32 */
+	jle	L(shl_end_0)	/* at most 32 bytes left: dispatch the tail directly */
+
+	movaps	5(%eax), %xmm2	/* otherwise copy one more 32-byte block */
+	movaps	21(%eax), %xmm3
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* edx -> one past the last destination byte */
+	lea	32(%eax, %ecx), %eax	/* eax -> one past the last source byte */
+	POP (%edi)	/* POP macro is defined outside this chunk; presumably restores the saved edi */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* macro defined outside this chunk; presumably jumps via entry ecx of the table (4-byte entries) */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_11_no_prefetch):	/* Forward copy for an 11-byte palignr shift, 32 bytes per step, no prefetch hints.  xmm1 is the carried-in low chunk loaded by shl_11.  */
+	lea	-32(%ecx), %ecx	/* bias the count by -32 so the borrow from "sub $32" ends the loop */
+	lea	-11(%eax), %eax	/* rebase eax by -11; the movdqa loads below need 16-byte aligned addresses */
+	xor	%edi, %edi	/* edi = byte offset applied to both eax and edx */
+
+	.p2align 4
+L(sh_11_no_prefetch_loop):	/* unrolled twice: the carry chunk alternates xmm1 -> xmm4 -> xmm1 */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* CF is consumed by the jb below; SSE moves, palignr and lea do not alter flags */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* keep the unshifted high chunk as the carry for the second half */
+	palignr	$11, %xmm2, %xmm3	/* xmm3 = bytes 11..26 of the pair xmm3:xmm2 */
+	palignr	$11, %xmm1, %xmm2	/* xmm2 = bytes 11..26 of the pair xmm2:xmm1 */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_11_end_no_prefetch_loop)	/* sub borrowed: fewer than 32 bytes remain */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* carry for the next pass through the first half */
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_11_no_prefetch_loop)	/* no borrow: at least 32 more bytes */
+
+L(sh_11_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* remove the -32 bias: ecx = bytes still to copy */
+	add	%ecx, %edi	/* edi = bytes copied by the loop + bytes still to copy */
+	add	%edi, %edx	/* edx -> one past the last destination byte */
+	lea	11(%edi, %eax), %eax	/* undo the -11 rebase; eax -> one past the last source byte */
+	POP	(%edi)	/* POP macro is defined outside this chunk; presumably restores the saved edi */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* macro defined outside this chunk; presumably jumps via entry ecx of the table (4-byte entries) */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_12):	/* Forward copy with a 12-byte palignr shift: aligned 16-byte loads around eax are recombined into aligned 16-byte stores at edx.  ecx = remaining byte count.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-12(%eax), %xmm1	/* carry chunk: the 16 bytes at eax-12 (movaps requires that address to be 16-byte aligned) */
+# else
+	movl	DEST+4(%esp), %edi	/* edi = DEST argument slot; the +4 presumably accounts for a register pushed before this chunk */
+	movaps	-12(%eax), %xmm1	/* carry chunk: the 16 bytes at eax-12 */
+	movdqu	%xmm0, (%edi)	/* store xmm0 at DEST; xmm0 is loaded before this chunk - presumably the first 16 source bytes, confirm */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx	/* with SETUP_PIC_REG (defined outside this chunk) presumably leaves the GOT base in ebx for the @GOTOFF operand */
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_12_no_prefetch)	/* count below the cache-size threshold: take the loop without prefetch */
+
+	lea	-64(%ecx), %ecx	/* bias the count by -64 for the sub/ja test at the loop bottom */
+
+	.p2align 4
+L(Shl12LoopStart):	/* 64 bytes per iteration */
+	prefetcht0 0x1c0(%eax)	/* prefetch 0x1c0 (448) bytes ahead of the source cursor */
+	prefetcht0 0x1c0(%edx)	/* and ahead of the destination cursor */
+	movaps	4(%eax), %xmm2	/* 4 = 16 - 12: the aligned chunk following the carry */
+	movaps	20(%eax), %xmm3
+	movaps	36(%eax), %xmm4
+	movaps	52(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* keep the unshifted last chunk; it becomes the carry (xmm1) below */
+	palignr	$12, %xmm4, %xmm5	/* each palignr yields bytes 12..27 of the pair dst:src */
+	palignr	$12, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* carry for the next iteration */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl12LoopStart)	/* loop while the sub neither borrows nor reaches zero */
+
+L(Shl12LoopLeave):
+	add	$32, %ecx	/* ecx = bytes left - 32 */
+	jle	L(shl_end_0)	/* at most 32 bytes left: dispatch the tail directly */
+
+	movaps	4(%eax), %xmm2	/* otherwise copy one more 32-byte block */
+	movaps	20(%eax), %xmm3
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* edx -> one past the last destination byte */
+	lea	32(%eax, %ecx), %eax	/* eax -> one past the last source byte */
+	POP (%edi)	/* POP macro is defined outside this chunk; presumably restores the saved edi */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* macro defined outside this chunk; presumably jumps via entry ecx of the table (4-byte entries) */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_12_no_prefetch):	/* Forward copy for a 12-byte palignr shift, 32 bytes per step, no prefetch hints.  xmm1 is the carried-in low chunk loaded by shl_12.  */
+	lea	-32(%ecx), %ecx	/* bias the count by -32 so the borrow from "sub $32" ends the loop */
+	lea	-12(%eax), %eax	/* rebase eax by -12; the movdqa loads below need 16-byte aligned addresses */
+	xor	%edi, %edi	/* edi = byte offset applied to both eax and edx */
+
+	.p2align 4
+L(sh_12_no_prefetch_loop):	/* unrolled twice: the carry chunk alternates xmm1 -> xmm4 -> xmm1 */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* CF is consumed by the jb below; SSE moves, palignr and lea do not alter flags */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* keep the unshifted high chunk as the carry for the second half */
+	palignr	$12, %xmm2, %xmm3	/* xmm3 = bytes 12..27 of the pair xmm3:xmm2 */
+	palignr	$12, %xmm1, %xmm2	/* xmm2 = bytes 12..27 of the pair xmm2:xmm1 */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_12_end_no_prefetch_loop)	/* sub borrowed: fewer than 32 bytes remain */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* carry for the next pass through the first half */
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_12_no_prefetch_loop)	/* no borrow: at least 32 more bytes */
+
+L(sh_12_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* remove the -32 bias: ecx = bytes still to copy */
+	add	%ecx, %edi	/* edi = bytes copied by the loop + bytes still to copy */
+	add	%edi, %edx	/* edx -> one past the last destination byte */
+	lea	12(%edi, %eax), %eax	/* undo the -12 rebase; eax -> one past the last source byte */
+	POP	(%edi)	/* POP macro is defined outside this chunk; presumably restores the saved edi */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* macro defined outside this chunk; presumably jumps via entry ecx of the table (4-byte entries) */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_13):	/* Forward copy with a 13-byte palignr shift: aligned 16-byte loads around eax are recombined into aligned 16-byte stores at edx.  ecx = remaining byte count.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-13(%eax), %xmm1	/* carry chunk: the 16 bytes at eax-13 (movaps requires that address to be 16-byte aligned) */
+# else
+	movl	DEST+4(%esp), %edi	/* edi = DEST argument slot; the +4 presumably accounts for a register pushed before this chunk */
+	movaps	-13(%eax), %xmm1	/* carry chunk: the 16 bytes at eax-13 */
+	movdqu	%xmm0, (%edi)	/* store xmm0 at DEST; xmm0 is loaded before this chunk - presumably the first 16 source bytes, confirm */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx	/* with SETUP_PIC_REG (defined outside this chunk) presumably leaves the GOT base in ebx for the @GOTOFF operand */
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_13_no_prefetch)	/* count below the cache-size threshold: take the loop without prefetch */
+
+	lea	-64(%ecx), %ecx	/* bias the count by -64 for the sub/ja test at the loop bottom */
+
+	.p2align 4
+L(Shl13LoopStart):	/* 64 bytes per iteration */
+	prefetcht0 0x1c0(%eax)	/* prefetch 0x1c0 (448) bytes ahead of the source cursor */
+	prefetcht0 0x1c0(%edx)	/* and ahead of the destination cursor */
+	movaps	3(%eax), %xmm2	/* 3 = 16 - 13: the aligned chunk following the carry */
+	movaps	19(%eax), %xmm3
+	movaps	35(%eax), %xmm4
+	movaps	51(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* keep the unshifted last chunk; it becomes the carry (xmm1) below */
+	palignr	$13, %xmm4, %xmm5	/* each palignr yields bytes 13..28 of the pair dst:src */
+	palignr	$13, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$13, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* carry for the next iteration */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl13LoopStart)	/* loop while the sub neither borrows nor reaches zero */
+
+L(Shl13LoopLeave):
+	add	$32, %ecx	/* ecx = bytes left - 32 */
+	jle	L(shl_end_0)	/* at most 32 bytes left: dispatch the tail directly */
+
+	movaps	3(%eax), %xmm2	/* otherwise copy one more 32-byte block */
+	movaps	19(%eax), %xmm3
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* edx -> one past the last destination byte */
+	lea	32(%eax, %ecx), %eax	/* eax -> one past the last source byte */
+	POP (%edi)	/* POP macro is defined outside this chunk; presumably restores the saved edi */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* macro defined outside this chunk; presumably jumps via entry ecx of the table (4-byte entries) */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_13_no_prefetch):	/* Forward copy for a 13-byte palignr shift, 32 bytes per step, no prefetch hints.  xmm1 is the carried-in low chunk loaded by shl_13.  */
+	lea	-32(%ecx), %ecx	/* bias the count by -32 so the borrow from "sub $32" ends the loop */
+	lea	-13(%eax), %eax	/* rebase eax by -13; the movdqa loads below need 16-byte aligned addresses */
+	xor	%edi, %edi	/* edi = byte offset applied to both eax and edx */
+
+	.p2align 4
+L(sh_13_no_prefetch_loop):	/* unrolled twice: the carry chunk alternates xmm1 -> xmm4 -> xmm1 */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* CF is consumed by the jb below; SSE moves, palignr and lea do not alter flags */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* keep the unshifted high chunk as the carry for the second half */
+	palignr	$13, %xmm2, %xmm3	/* xmm3 = bytes 13..28 of the pair xmm3:xmm2 */
+	palignr	$13, %xmm1, %xmm2	/* xmm2 = bytes 13..28 of the pair xmm2:xmm1 */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_13_end_no_prefetch_loop)	/* sub borrowed: fewer than 32 bytes remain */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* carry for the next pass through the first half */
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_13_no_prefetch_loop)	/* no borrow: at least 32 more bytes */
+
+L(sh_13_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* remove the -32 bias: ecx = bytes still to copy */
+	add	%ecx, %edi	/* edi = bytes copied by the loop + bytes still to copy */
+	add	%edi, %edx	/* edx -> one past the last destination byte */
+	lea	13(%edi, %eax), %eax	/* undo the -13 rebase; eax -> one past the last source byte */
+	POP	(%edi)	/* POP macro is defined outside this chunk; presumably restores the saved edi */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* macro defined outside this chunk; presumably jumps via entry ecx of the table (4-byte entries) */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_14):	/* Forward copy with a 14-byte palignr shift: aligned 16-byte loads around eax are recombined into aligned 16-byte stores at edx.  ecx = remaining byte count.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-14(%eax), %xmm1	/* carry chunk: the 16 bytes at eax-14 (movaps requires that address to be 16-byte aligned) */
+# else
+	movl	DEST+4(%esp), %edi	/* edi = DEST argument slot; the +4 presumably accounts for a register pushed before this chunk */
+	movaps	-14(%eax), %xmm1	/* carry chunk: the 16 bytes at eax-14 */
+	movdqu	%xmm0, (%edi)	/* store xmm0 at DEST; xmm0 is loaded before this chunk - presumably the first 16 source bytes, confirm */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx	/* with SETUP_PIC_REG (defined outside this chunk) presumably leaves the GOT base in ebx for the @GOTOFF operand */
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_14_no_prefetch)	/* count below the cache-size threshold: take the loop without prefetch */
+
+	lea	-64(%ecx), %ecx	/* bias the count by -64 for the sub/ja test at the loop bottom */
+
+	.p2align 4
+L(Shl14LoopStart):	/* 64 bytes per iteration */
+	prefetcht0 0x1c0(%eax)	/* prefetch 0x1c0 (448) bytes ahead of the source cursor */
+	prefetcht0 0x1c0(%edx)	/* and ahead of the destination cursor */
+	movaps	2(%eax), %xmm2	/* 2 = 16 - 14: the aligned chunk following the carry */
+	movaps	18(%eax), %xmm3
+	movaps	34(%eax), %xmm4
+	movaps	50(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* keep the unshifted last chunk; it becomes the carry (xmm1) below */
+	palignr	$14, %xmm4, %xmm5	/* each palignr yields bytes 14..29 of the pair dst:src */
+	palignr	$14, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$14, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* carry for the next iteration */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl14LoopStart)	/* loop while the sub neither borrows nor reaches zero */
+
+L(Shl14LoopLeave):
+	add	$32, %ecx	/* ecx = bytes left - 32 */
+	jle	L(shl_end_0)	/* at most 32 bytes left: dispatch the tail directly */
+
+	movaps	2(%eax), %xmm2	/* otherwise copy one more 32-byte block */
+	movaps	18(%eax), %xmm3
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* edx -> one past the last destination byte */
+	lea	32(%eax, %ecx), %eax	/* eax -> one past the last source byte */
+	POP (%edi)	/* POP macro is defined outside this chunk; presumably restores the saved edi */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* macro defined outside this chunk; presumably jumps via entry ecx of the table (4-byte entries) */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_14_no_prefetch):	/* Forward copy for a 14-byte palignr shift, 32 bytes per step, no prefetch hints.  xmm1 is the carried-in low chunk loaded by shl_14.  */
+	lea	-32(%ecx), %ecx	/* bias the count by -32 so the borrow from "sub $32" ends the loop */
+	lea	-14(%eax), %eax	/* rebase eax by -14; the movdqa loads below need 16-byte aligned addresses */
+	xor	%edi, %edi	/* edi = byte offset applied to both eax and edx */
+
+	.p2align 4
+L(sh_14_no_prefetch_loop):	/* unrolled twice: the carry chunk alternates xmm1 -> xmm4 -> xmm1 */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* CF is consumed by the jb below; SSE moves, palignr and lea do not alter flags */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* keep the unshifted high chunk as the carry for the second half */
+	palignr	$14, %xmm2, %xmm3	/* xmm3 = bytes 14..29 of the pair xmm3:xmm2 */
+	palignr	$14, %xmm1, %xmm2	/* xmm2 = bytes 14..29 of the pair xmm2:xmm1 */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_14_end_no_prefetch_loop)	/* sub borrowed: fewer than 32 bytes remain */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* carry for the next pass through the first half */
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_14_no_prefetch_loop)	/* no borrow: at least 32 more bytes */
+
+L(sh_14_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* remove the -32 bias: ecx = bytes still to copy */
+	add	%ecx, %edi	/* edi = bytes copied by the loop + bytes still to copy */
+	add	%edi, %edx	/* edx -> one past the last destination byte */
+	lea	14(%edi, %eax), %eax	/* undo the -14 rebase; eax -> one past the last source byte */
+	POP	(%edi)	/* POP macro is defined outside this chunk; presumably restores the saved edi */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* macro defined outside this chunk; presumably jumps via entry ecx of the table (4-byte entries) */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_15):	/* Forward copy with a 15-byte palignr shift: aligned 16-byte loads around eax are recombined into aligned 16-byte stores at edx.  ecx = remaining byte count.  */
+# ifndef USE_AS_MEMMOVE
+	movaps	-15(%eax), %xmm1	/* carry chunk: the 16 bytes at eax-15 (movaps requires that address to be 16-byte aligned) */
+# else
+	movl	DEST+4(%esp), %edi	/* edi = DEST argument slot; the +4 presumably accounts for a register pushed before this chunk */
+	movaps	-15(%eax), %xmm1	/* carry chunk: the 16 bytes at eax-15 */
+	movdqu	%xmm0, (%edi)	/* store xmm0 at DEST; xmm0 is loaded before this chunk - presumably the first 16 source bytes, confirm */
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx	/* with SETUP_PIC_REG (defined outside this chunk) presumably leaves the GOT base in ebx for the @GOTOFF operand */
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_15_no_prefetch)	/* count below the cache-size threshold: take the loop without prefetch */
+
+	lea	-64(%ecx), %ecx	/* bias the count by -64 for the sub/ja test at the loop bottom */
+
+	.p2align 4
+L(Shl15LoopStart):	/* 64 bytes per iteration */
+	prefetcht0 0x1c0(%eax)	/* prefetch 0x1c0 (448) bytes ahead of the source cursor */
+	prefetcht0 0x1c0(%edx)	/* and ahead of the destination cursor */
+	movaps	1(%eax), %xmm2	/* 1 = 16 - 15: the aligned chunk following the carry */
+	movaps	17(%eax), %xmm3
+	movaps	33(%eax), %xmm4
+	movaps	49(%eax), %xmm5
+	movaps	%xmm5, %xmm7	/* keep the unshifted last chunk; it becomes the carry (xmm1) below */
+	palignr	$15, %xmm4, %xmm5	/* each palignr yields bytes 15..30 of the pair dst:src */
+	palignr	$15, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$15, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1	/* carry for the next iteration */
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl15LoopStart)	/* loop while the sub neither borrows nor reaches zero */
+
+L(Shl15LoopLeave):
+	add	$32, %ecx	/* ecx = bytes left - 32 */
+	jle	L(shl_end_0)	/* at most 32 bytes left: dispatch the tail directly */
+
+	movaps	1(%eax), %xmm2	/* otherwise copy one more 32-byte block */
+	movaps	17(%eax), %xmm3
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx	/* edx -> one past the last destination byte */
+	lea	32(%eax, %ecx), %eax	/* eax -> one past the last source byte */
+	POP (%edi)	/* POP macro is defined outside this chunk; presumably restores the saved edi */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* macro defined outside this chunk; presumably jumps via entry ecx of the table (4-byte entries) */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_15_no_prefetch):	/* Forward copy for a 15-byte palignr shift, 32 bytes per step, no prefetch hints.  xmm1 is the carried-in low chunk loaded by shl_15.  */
+	lea	-32(%ecx), %ecx	/* bias the count by -32 so the borrow from "sub $32" ends the loop */
+	lea	-15(%eax), %eax	/* rebase eax by -15; the movdqa loads below need 16-byte aligned addresses */
+	xor	%edi, %edi	/* edi = byte offset applied to both eax and edx */
+
+	.p2align 4
+L(sh_15_no_prefetch_loop):	/* unrolled twice: the carry chunk alternates xmm1 -> xmm4 -> xmm1 */
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx	/* CF is consumed by the jb below; SSE moves, palignr and lea do not alter flags */
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4	/* keep the unshifted high chunk as the carry for the second half */
+	palignr	$15, %xmm2, %xmm3	/* xmm3 = bytes 15..30 of the pair xmm3:xmm2 */
+	palignr	$15, %xmm1, %xmm2	/* xmm2 = bytes 15..30 of the pair xmm2:xmm1 */
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_15_end_no_prefetch_loop)	/* sub borrowed: fewer than 32 bytes remain */
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1	/* carry for the next pass through the first half */
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_15_no_prefetch_loop)	/* no borrow: at least 32 more bytes */
+
+L(sh_15_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx	/* remove the -32 bias: ecx = bytes still to copy */
+	add	%ecx, %edi	/* edi = bytes copied by the loop + bytes still to copy */
+	add	%edi, %edx	/* edx -> one past the last destination byte */
+	lea	15(%edi, %eax), %eax	/* undo the -15 rebase; eax -> one past the last source byte */
+	POP	(%edi)	/* POP macro is defined outside this chunk; presumably restores the saved edi */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* macro defined outside this chunk; presumably jumps via entry ecx of the table (4-byte entries) */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_end_0):	/* Shared exit for the ShlNLoopLeave paths when no further 32-byte block is copied; the visible jle sources arrive with ecx = bytes left - 32.  */
+	lea	32(%ecx), %ecx	/* ecx = bytes left */
+	lea	(%edx, %ecx), %edx	/* edx -> one past the last destination byte */
+	lea	(%eax, %ecx), %eax	/* eax -> one past the last source byte */
+	POP	(%edi)	/* POP macro is defined outside this chunk; presumably restores the saved edi */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)	/* macro defined outside this chunk; presumably jumps via entry ecx of the table (4-byte entries) */
+
+	.p2align 4
+L(fwd_write_44bytes):	/* Forward tail ladder for 44/36/28/20/12/4 bytes: eax and edx point one past the end of source and destination; each rung copies 8 bytes and falls through.  */
+	movq	-44(%eax), %xmm0
+	movq	%xmm0, -44(%edx)
+L(fwd_write_36bytes):
+	movq	-36(%eax), %xmm0
+	movq	%xmm0, -36(%edx)
+L(fwd_write_28bytes):
+	movq	-28(%eax), %xmm0
+	movq	%xmm0, -28(%edx)
+L(fwd_write_20bytes):
+	movq	-20(%eax), %xmm0
+	movq	%xmm0, -20(%edx)
+L(fwd_write_12bytes):
+	movq	-12(%eax), %xmm0
+	movq	%xmm0, -12(%edx)
+L(fwd_write_4bytes):
+	movl	-4(%eax), %ecx	/* final 4 bytes go through ecx */
+	movl	%ecx, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_40bytes):	/* Forward tail ladder for 40/32/24/16/8/0 bytes: eax and edx point one past the end of source and destination; each rung copies 8 bytes and falls through.  */
+	movq	-40(%eax), %xmm0
+	movq	%xmm0, -40(%edx)
+L(fwd_write_32bytes):
+	movq	-32(%eax), %xmm0
+	movq	%xmm0, -32(%edx)
+L(fwd_write_24bytes):
+	movq	-24(%eax), %xmm0
+	movq	%xmm0, -24(%edx)
+L(fwd_write_16bytes):
+	movq	-16(%eax), %xmm0
+	movq	%xmm0, -16(%edx)
+L(fwd_write_8bytes):
+	movq	-8(%eax), %xmm0
+	movq	%xmm0, -8(%edx)
+L(fwd_write_0bytes):	/* nothing left to copy: only the result is set up */
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_5bytes):	/* Forward tail for exactly 5 bytes: two overlapping 4-byte moves covering offsets -5..-1.  */
+	movl	-5(%eax), %ecx
+	movl	-4(%eax), %eax	/* overwrites the source pointer; it is not read again below */
+	movl	%ecx, -5(%edx)
+	movl	%eax, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_45bytes):	/* Forward tail ladder for 45/37/29/21/13 bytes: 8-byte rungs, then a 4-byte and a 1-byte move for the last 5 bytes.  eax/edx point one past the end.  */
+	movq	-45(%eax), %xmm0
+	movq	%xmm0, -45(%edx)
+L(fwd_write_37bytes):
+	movq	-37(%eax), %xmm0
+	movq	%xmm0, -37(%edx)
+L(fwd_write_29bytes):
+	movq	-29(%eax), %xmm0
+	movq	%xmm0, -29(%edx)
+L(fwd_write_21bytes):
+	movq	-21(%eax), %xmm0
+	movq	%xmm0, -21(%edx)
+L(fwd_write_13bytes):
+	movq	-13(%eax), %xmm0
+	movq	%xmm0, -13(%edx)
+	movl	-5(%eax), %ecx	/* bytes -5..-2 */
+	movl	%ecx, -5(%edx)
+	movzbl	-1(%eax), %ecx	/* last byte */
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_41bytes):	/* Forward tail ladder for 41/33/25/17/9/1 bytes: 8-byte rungs followed by a single byte.  eax/edx point one past the end.  */
+	movq	-41(%eax), %xmm0
+	movq	%xmm0, -41(%edx)
+L(fwd_write_33bytes):
+	movq	-33(%eax), %xmm0
+	movq	%xmm0, -33(%edx)
+L(fwd_write_25bytes):
+	movq	-25(%eax), %xmm0
+	movq	%xmm0, -25(%edx)
+L(fwd_write_17bytes):
+	movq	-17(%eax), %xmm0
+	movq	%xmm0, -17(%edx)
+L(fwd_write_9bytes):
+	movq	-9(%eax), %xmm0
+	movq	%xmm0, -9(%edx)
+L(fwd_write_1bytes):
+	movzbl	-1(%eax), %ecx	/* last byte */
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_46bytes):	/* Forward tail ladder for 46/38/30/22/14/6 bytes: 8-byte rungs, then 4 + 2 bytes.  eax/edx point one past the end.  */
+	movq	-46(%eax), %xmm0
+	movq	%xmm0, -46(%edx)
+L(fwd_write_38bytes):
+	movq	-38(%eax), %xmm0
+	movq	%xmm0, -38(%edx)
+L(fwd_write_30bytes):
+	movq	-30(%eax), %xmm0
+	movq	%xmm0, -30(%edx)
+L(fwd_write_22bytes):
+	movq	-22(%eax), %xmm0
+	movq	%xmm0, -22(%edx)
+L(fwd_write_14bytes):
+	movq	-14(%eax), %xmm0
+	movq	%xmm0, -14(%edx)
+L(fwd_write_6bytes):
+	movl	-6(%eax), %ecx	/* bytes -6..-3 */
+	movl	%ecx, -6(%edx)
+	movzwl	-2(%eax), %ecx	/* last 2 bytes */
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_42bytes):	/* Forward tail ladder for 42/34/26/18/10/2 bytes: 8-byte rungs followed by a 2-byte move.  eax/edx point one past the end.  */
+	movq	-42(%eax), %xmm0
+	movq	%xmm0, -42(%edx)
+L(fwd_write_34bytes):
+	movq	-34(%eax), %xmm0
+	movq	%xmm0, -34(%edx)
+L(fwd_write_26bytes):
+	movq	-26(%eax), %xmm0
+	movq	%xmm0, -26(%edx)
+L(fwd_write_18bytes):
+	movq	-18(%eax), %xmm0
+	movq	%xmm0, -18(%edx)
+L(fwd_write_10bytes):
+	movq	-10(%eax), %xmm0
+	movq	%xmm0, -10(%edx)
+L(fwd_write_2bytes):
+	movzwl	-2(%eax), %ecx	/* last 2 bytes */
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_47bytes):	/* Forward tail ladder for 47/39/31/23/15/7 bytes: 8-byte rungs, then 4 + 2 + 1 bytes.  eax/edx point one past the end.  */
+	movq	-47(%eax), %xmm0
+	movq	%xmm0, -47(%edx)
+L(fwd_write_39bytes):
+	movq	-39(%eax), %xmm0
+	movq	%xmm0, -39(%edx)
+L(fwd_write_31bytes):
+	movq	-31(%eax), %xmm0
+	movq	%xmm0, -31(%edx)
+L(fwd_write_23bytes):
+	movq	-23(%eax), %xmm0
+	movq	%xmm0, -23(%edx)
+L(fwd_write_15bytes):
+	movq	-15(%eax), %xmm0
+	movq	%xmm0, -15(%edx)
+L(fwd_write_7bytes):
+	movl	-7(%eax), %ecx	/* bytes -7..-4 */
+	movl	%ecx, -7(%edx)
+	movzwl	-3(%eax), %ecx	/* bytes -3..-2 */
+	movzbl	-1(%eax), %eax	/* last byte; overwrites the source pointer, which is not read again */
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_43bytes):	/* Forward tail ladder for 43/35/27/19/11/3 bytes: 8-byte rungs, then 2 + 1 bytes.  eax/edx point one past the end.  */
+	movq	-43(%eax), %xmm0
+	movq	%xmm0, -43(%edx)
+L(fwd_write_35bytes):
+	movq	-35(%eax), %xmm0
+	movq	%xmm0, -35(%edx)
+L(fwd_write_27bytes):
+	movq	-27(%eax), %xmm0
+	movq	%xmm0, -27(%edx)
+L(fwd_write_19bytes):
+	movq	-19(%eax), %xmm0
+	movq	%xmm0, -19(%edx)
+L(fwd_write_11bytes):
+	movq	-11(%eax), %xmm0
+	movq	%xmm0, -11(%edx)
+L(fwd_write_3bytes):
+	movzwl	-3(%eax), %ecx	/* bytes -3..-2 */
+	movzbl	-1(%eax), %eax	/* last byte; overwrites the source pointer, which is not read again */
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_40bytes_align):	/* Aligned forward tail ladder for 40/24/8/0 bytes.  The movdqa rungs require eax-N and edx-N to be 16-byte aligned at the entry for N; the dispatch site is outside this chunk.  */
+	movdqa	-40(%eax), %xmm0
+	movdqa	%xmm0, -40(%edx)
+L(fwd_write_24bytes_align):
+	movdqa	-24(%eax), %xmm0
+	movdqa	%xmm0, -24(%edx)
+L(fwd_write_8bytes_align):
+	movq	-8(%eax), %xmm0	/* last 8 bytes */
+	movq	%xmm0, -8(%edx)
+L(fwd_write_0bytes_align):	/* nothing left to copy: only the result is set up */
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_32bytes_align):	/* Aligned forward tail for 32/16 bytes: 16-byte movdqa rungs, so eax-N and edx-N must be 16-byte aligned at the entry for N.  */
+	movdqa	-32(%eax), %xmm0
+	movdqa	%xmm0, -32(%edx)
+L(fwd_write_16bytes_align):
+	movdqa	-16(%eax), %xmm0
+	movdqa	%xmm0, -16(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_5bytes_align):	/* Forward tail for exactly 5 bytes (entry used by the _align ladder set): two overlapping 4-byte moves covering offsets -5..-1.  */
+	movl	-5(%eax), %ecx
+	movl	-4(%eax), %eax	/* overwrites the source pointer; it is not read again below */
+	movl	%ecx, -5(%edx)
+	movl	%eax, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_45bytes_align):	/* Aligned forward tail ladder for 45/29/13 bytes: 16-byte movdqa rungs (eax-N and edx-N must be 16-byte aligned at the entry for N), then 8 + 4 + 1 bytes.  */
+	movdqa	-45(%eax), %xmm0
+	movdqa	%xmm0, -45(%edx)
+L(fwd_write_29bytes_align):
+	movdqa	-29(%eax), %xmm0
+	movdqa	%xmm0, -29(%edx)
+L(fwd_write_13bytes_align):
+	movq	-13(%eax), %xmm0	/* bytes -13..-6 */
+	movq	%xmm0, -13(%edx)
+	movl	-5(%eax), %ecx	/* bytes -5..-2 */
+	movl	%ecx, -5(%edx)
+	movzbl	-1(%eax), %ecx	/* last byte */
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_37bytes_align):	/* Aligned forward tail for 37/21 bytes: 16-byte movdqa rungs (eax-N and edx-N must be 16-byte aligned at the entry for N), then 4 + 1 bytes.  */
+	movdqa	-37(%eax), %xmm0
+	movdqa	%xmm0, -37(%edx)
+L(fwd_write_21bytes_align):
+	movdqa	-21(%eax), %xmm0
+	movdqa	%xmm0, -21(%edx)
+	movl	-5(%eax), %ecx	/* bytes -5..-2 */
+	movl	%ecx, -5(%edx)
+	movzbl	-1(%eax), %ecx	/* last byte */
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_41bytes_align):	/* Aligned forward tail ladder for 41/25/9/1 bytes: 16-byte movdqa rungs (eax-N and edx-N must be 16-byte aligned at the entry for N), then 8 + 1 bytes.  */
+	movdqa	-41(%eax), %xmm0
+	movdqa	%xmm0, -41(%edx)
+L(fwd_write_25bytes_align):
+	movdqa	-25(%eax), %xmm0
+	movdqa	%xmm0, -25(%edx)
+L(fwd_write_9bytes_align):
+	movq	-9(%eax), %xmm0	/* bytes -9..-2 */
+	movq	%xmm0, -9(%edx)
+L(fwd_write_1bytes_align):
+	movzbl	-1(%eax), %ecx	/* last byte */
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_33bytes_align):	/* Aligned forward tail for 33/17 bytes: 16-byte movdqa rungs (eax-N and edx-N must be 16-byte aligned at the entry for N), then the last byte.  */
+	movdqa	-33(%eax), %xmm0
+	movdqa	%xmm0, -33(%edx)
+L(fwd_write_17bytes_align):
+	movdqa	-17(%eax), %xmm0
+	movdqa	%xmm0, -17(%edx)
+	movzbl	-1(%eax), %ecx	/* last byte */
+	movb	%cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_46bytes_align):	/* Aligned forward tail ladder for 46/30/14/6 bytes: 16-byte movdqa rungs (eax-N and edx-N must be 16-byte aligned at the entry for N), then 8 + 4 + 2 bytes.  */
+	movdqa	-46(%eax), %xmm0
+	movdqa	%xmm0, -46(%edx)
+L(fwd_write_30bytes_align):
+	movdqa	-30(%eax), %xmm0
+	movdqa	%xmm0, -30(%edx)
+L(fwd_write_14bytes_align):
+	movq	-14(%eax), %xmm0	/* bytes -14..-7 */
+	movq	%xmm0, -14(%edx)
+L(fwd_write_6bytes_align):
+	movl	-6(%eax), %ecx	/* bytes -6..-3 */
+	movl	%ecx, -6(%edx)
+	movzwl	-2(%eax), %ecx	/* last 2 bytes */
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_38bytes_align):	/* Aligned forward tail for 38/22 bytes: 16-byte movdqa rungs (eax-N and edx-N must be 16-byte aligned at the entry for N), then 4 + 2 bytes.  */
+	movdqa	-38(%eax), %xmm0
+	movdqa	%xmm0, -38(%edx)
+L(fwd_write_22bytes_align):
+	movdqa	-22(%eax), %xmm0
+	movdqa	%xmm0, -22(%edx)
+	movl	-6(%eax), %ecx	/* bytes -6..-3 */
+	movl	%ecx, -6(%edx)
+	movzwl	-2(%eax), %ecx	/* last 2 bytes */
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_42bytes_align):	/* Aligned forward tail ladder for 42/26/10/2 bytes: 16-byte movdqa rungs (eax-N and edx-N must be 16-byte aligned at the entry for N), then 8 + 2 bytes.  */
+	movdqa	-42(%eax), %xmm0
+	movdqa	%xmm0, -42(%edx)
+L(fwd_write_26bytes_align):
+	movdqa	-26(%eax), %xmm0
+	movdqa	%xmm0, -26(%edx)
+L(fwd_write_10bytes_align):
+	movq	-10(%eax), %xmm0	/* bytes -10..-3 */
+	movq	%xmm0, -10(%edx)
+L(fwd_write_2bytes_align):
+	movzwl	-2(%eax), %ecx	/* last 2 bytes */
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_34bytes_align):	/* Aligned forward tail for 34/18 bytes: 16-byte movdqa rungs (eax-N and edx-N must be 16-byte aligned at the entry for N), then the last 2 bytes.  */
+	movdqa	-34(%eax), %xmm0
+	movdqa	%xmm0, -34(%edx)
+L(fwd_write_18bytes_align):
+	movdqa	-18(%eax), %xmm0
+	movdqa	%xmm0, -18(%edx)
+	movzwl	-2(%eax), %ecx	/* last 2 bytes */
+	movw	%cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_47bytes_align):	/* Aligned forward tail ladder for 47/31/15/7 bytes: 16-byte movdqa rungs (eax-N and edx-N must be 16-byte aligned at the entry for N), then 8 + 4 + 2 + 1 bytes.  */
+	movdqa	-47(%eax), %xmm0
+	movdqa	%xmm0, -47(%edx)
+L(fwd_write_31bytes_align):
+	movdqa	-31(%eax), %xmm0
+	movdqa	%xmm0, -31(%edx)
+L(fwd_write_15bytes_align):
+	movq	-15(%eax), %xmm0	/* bytes -15..-8 */
+	movq	%xmm0, -15(%edx)
+L(fwd_write_7bytes_align):
+	movl	-7(%eax), %ecx	/* bytes -7..-4 */
+	movl	%ecx, -7(%edx)
+	movzwl	-3(%eax), %ecx	/* bytes -3..-2 */
+	movzbl	-1(%eax), %eax	/* last byte; overwrites the source pointer, which is not read again */
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_39bytes_align):	/* Aligned forward tail for 39/23 bytes: 16-byte movdqa rungs (eax-N and edx-N must be 16-byte aligned at the entry for N), then 4 + 2 + 1 bytes.  */
+	movdqa	-39(%eax), %xmm0
+	movdqa	%xmm0, -39(%edx)
+L(fwd_write_23bytes_align):
+	movdqa	-23(%eax), %xmm0
+	movdqa	%xmm0, -23(%edx)
+	movl	-7(%eax), %ecx	/* bytes -7..-4 */
+	movl	%ecx, -7(%edx)
+	movzwl	-3(%eax), %ecx	/* bytes -3..-2 */
+	movzbl	-1(%eax), %eax	/* last byte; overwrites the source pointer, which is not read again */
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_43bytes_align):	/* Aligned forward tail ladder for 43/27/11/3 bytes: 16-byte movdqa rungs (eax-N and edx-N must be 16-byte aligned at the entry for N), then 8 + 2 + 1 bytes.  */
+	movdqa	-43(%eax), %xmm0
+	movdqa	%xmm0, -43(%edx)
+L(fwd_write_27bytes_align):
+	movdqa	-27(%eax), %xmm0
+	movdqa	%xmm0, -27(%edx)
+L(fwd_write_11bytes_align):
+	movq	-11(%eax), %xmm0	/* bytes -11..-4 */
+	movq	%xmm0, -11(%edx)
+L(fwd_write_3bytes_align):
+	movzwl	-3(%eax), %ecx	/* bytes -3..-2 */
+	movzbl	-1(%eax), %eax	/* last byte; overwrites the source pointer, which is not read again */
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_35bytes_align):	/* Aligned forward tail for 35/19 bytes: 16-byte movdqa rungs (eax-N and edx-N must be 16-byte aligned at the entry for N), then 2 + 1 bytes.  */
+	movdqa	-35(%eax), %xmm0
+	movdqa	%xmm0, -35(%edx)
+L(fwd_write_19bytes_align):
+	movdqa	-19(%eax), %xmm0
+	movdqa	%xmm0, -19(%edx)
+	movzwl	-3(%eax), %ecx	/* bytes -3..-2 */
+	movzbl	-1(%eax), %eax	/* last byte; overwrites the source pointer, which is not read again */
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_44bytes_align):	/* Aligned forward tail ladder for 44/28/12/4 bytes: 16-byte movdqa rungs (eax-N and edx-N must be 16-byte aligned at the entry for N), then 8 + 4 bytes.  */
+	movdqa	-44(%eax), %xmm0
+	movdqa	%xmm0, -44(%edx)
+L(fwd_write_28bytes_align):
+	movdqa	-28(%eax), %xmm0
+	movdqa	%xmm0, -28(%edx)
+L(fwd_write_12bytes_align):
+	movq	-12(%eax), %xmm0	/* bytes -12..-5 */
+	movq	%xmm0, -12(%edx)
+L(fwd_write_4bytes_align):
+	movl	-4(%eax), %ecx	/* last 4 bytes */
+	movl	%ecx, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(fwd_write_36bytes_align):	/* Aligned forward tail for 36/20 bytes: 16-byte movdqa rungs (eax-N and edx-N must be 16-byte aligned at the entry for N), then the last 4 bytes.  */
+	movdqa	-36(%eax), %xmm0
+	movdqa	%xmm0, -36(%edx)
+L(fwd_write_20bytes_align):
+	movdqa	-20(%eax), %xmm0
+	movdqa	%xmm0, -20(%edx)
+	movl	-4(%eax), %ecx	/* last 4 bytes */
+	movl	%ecx, -4(%edx)
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax	/* result in eax: end of the destination */
+#  else
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  endif
+# endif
+	RETURN_END	/* RETURN_END macro is defined outside this chunk */
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(large_page):	/* Forward copy using non-temporal stores (movntdq).  Reached from outside this chunk, presumably for very large sizes - confirm at the branch site.  eax = source, edx = destination, ecx = byte count.  */
+	movdqu	(%eax), %xmm1	/* first 16 source bytes, unaligned load */
+# ifdef USE_AS_MEMMOVE
+	movl	DEST+4(%esp), %edi	/* edi = DEST argument slot; the +4 presumably accounts for a register pushed before this chunk */
+	movdqu	%xmm0, (%edi)	/* store xmm0 at DEST; xmm0 is loaded before this chunk - presumably the first 16 source bytes, confirm */
+# endif
+	lea	16(%eax), %eax
+	movntdq	%xmm1, (%edx)	/* movntdq requires a 16-byte aligned address, so edx is presumably aligned by the code before this chunk */
+	lea	16(%edx), %edx
+	lea	-0x90(%ecx), %ecx	/* 0x90 = 0x10 for the 16 bytes just copied + 0x80 bias for the loop test */
+	POP (%edi)	/* POP macro is defined outside this chunk; presumably restores the saved edi */
+
+	.p2align 4
+L(large_page_loop):	/* 128 bytes per iteration: load all eight chunks, then store them */
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	movdqu	0x40(%eax), %xmm4
+	movdqu	0x50(%eax), %xmm5
+	movdqu	0x60(%eax), %xmm6
+	movdqu	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+
+	sub	$0x80, %ecx	/* CF is consumed by the jae below; the movntdq stores and lea do not alter flags */
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	movntdq	%xmm4, 0x40(%edx)
+	movntdq	%xmm5, 0x50(%edx)
+	movntdq	%xmm6, 0x60(%edx)
+	movntdq	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+	jae	L(large_page_loop)	/* no borrow: at least 0x80 more bytes */
+	cmp	$-0x40, %ecx	/* ecx = bytes left - 0x80 here; compare against 0x40 - 0x80 */
+	lea	0x80(%ecx), %ecx	/* remove the bias: ecx = bytes left (lea keeps the cmp flags) */
+	jl	L(large_page_less_64bytes)	/* fewer than 64 bytes left */
+
+	movdqu	(%eax), %xmm0	/* one 64-byte block */
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	lea	0x40(%eax), %eax
+
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	lea	0x40(%edx), %edx
+	sub	$0x40, %ecx
+L(large_page_less_64bytes):
+	cmp	$32, %ecx
+	jb	L(large_page_less_32bytes)
+	movdqu	(%eax), %xmm0	/* one 32-byte block */
+	movdqu	0x10(%eax), %xmm1
+	lea	0x20(%eax), %eax
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	lea	0x20(%edx), %edx
+	sub	$0x20, %ecx
+L(large_page_less_32bytes):
+	add	%ecx, %edx	/* edx -> one past the last destination byte */
+	add	%ecx, %eax	/* eax -> one past the last source byte */
+	sfence	/* order the preceding non-temporal stores before any later stores */
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)	/* macro defined outside this chunk; presumably jumps via entry ecx of the table (4-byte entries) */
+
+	.p2align 4
+L(bk_write_44bytes):	/* Backward tail ladder for 44/36/28/20/12/4/0 bytes: eax and edx point at the start of the remaining source and destination; rungs copy 8 bytes each, highest offsets first.  */
+	movq	36(%eax), %xmm0
+	movq	%xmm0, 36(%edx)
+L(bk_write_36bytes):
+	movq	28(%eax), %xmm0
+	movq	%xmm0, 28(%edx)
+L(bk_write_28bytes):
+	movq	20(%eax), %xmm0
+	movq	%xmm0, 20(%edx)
+L(bk_write_20bytes):
+	movq	12(%eax), %xmm0
+	movq	%xmm0, 12(%edx)
+L(bk_write_12bytes):
+	movq	4(%eax), %xmm0
+	movq	%xmm0, 4(%edx)
+L(bk_write_4bytes):
+	movl	(%eax), %ecx	/* first 4 bytes */
+	movl	%ecx, (%edx)
+L(bk_write_0bytes):
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax	/* mempcpy variant: result = DEST + LEN */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(bk_write_40bytes):	/* Backward tail ladder for 40/32/24/16/8 bytes: eax and edx point at the start of the remaining source and destination; rungs copy 8 bytes each, highest offsets first.  */
+	movq	32(%eax), %xmm0
+	movq	%xmm0, 32(%edx)
+L(bk_write_32bytes):
+	movq	24(%eax), %xmm0
+	movq	%xmm0, 24(%edx)
+L(bk_write_24bytes):
+	movq	16(%eax), %xmm0
+	movq	%xmm0, 16(%edx)
+L(bk_write_16bytes):
+	movq	8(%eax), %xmm0
+	movq	%xmm0, 8(%edx)
+L(bk_write_8bytes):
+	movq	(%eax), %xmm0
+	movq	%xmm0, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax	/* mempcpy variant: result = DEST + LEN */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(bk_write_45bytes):	/* Backward tail ladder for 45/37/29/21/13/5/1 bytes: 8-byte rungs from the highest offset down, then 4 bytes at offset 1 and the byte at offset 0.  */
+	movq	37(%eax), %xmm0
+	movq	%xmm0, 37(%edx)
+L(bk_write_37bytes):
+	movq	29(%eax), %xmm0
+	movq	%xmm0, 29(%edx)
+L(bk_write_29bytes):
+	movq	21(%eax), %xmm0
+	movq	%xmm0, 21(%edx)
+L(bk_write_21bytes):
+	movq	13(%eax), %xmm0
+	movq	%xmm0, 13(%edx)
+L(bk_write_13bytes):
+	movq	5(%eax), %xmm0
+	movq	%xmm0, 5(%edx)
+L(bk_write_5bytes):
+	movl	1(%eax), %ecx	/* bytes 1..4 */
+	movl	%ecx, 1(%edx)
+L(bk_write_1bytes):
+	movzbl	(%eax), %ecx	/* first byte */
+	movb	%cl, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax	/* mempcpy variant: result = DEST + LEN */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(bk_write_41bytes):	/* Backward tail ladder for 41/33/25/17/9 bytes: 8-byte rungs from the highest offset down, then the byte at offset 0.  */
+	movq	33(%eax), %xmm0
+	movq	%xmm0, 33(%edx)
+L(bk_write_33bytes):
+	movq	25(%eax), %xmm0
+	movq	%xmm0, 25(%edx)
+L(bk_write_25bytes):
+	movq	17(%eax), %xmm0
+	movq	%xmm0, 17(%edx)
+L(bk_write_17bytes):
+	movq	9(%eax), %xmm0
+	movq	%xmm0, 9(%edx)
+L(bk_write_9bytes):
+	movq	1(%eax), %xmm0
+	movq	%xmm0, 1(%edx)
+	movzbl	(%eax), %ecx	/* first byte */
+	movb	%cl, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax	/* mempcpy variant: result = DEST + LEN */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(bk_write_46bytes):	/* Backward tail ladder for 46/38/30/22/14/6 bytes: 8-byte rungs from the highest offset down, then 4 bytes at offset 2 and 2 bytes at offset 0.  */
+	movq	38(%eax), %xmm0
+	movq	%xmm0, 38(%edx)
+L(bk_write_38bytes):
+	movq	30(%eax), %xmm0
+	movq	%xmm0, 30(%edx)
+L(bk_write_30bytes):
+	movq	22(%eax), %xmm0
+	movq	%xmm0, 22(%edx)
+L(bk_write_22bytes):
+	movq	14(%eax), %xmm0
+	movq	%xmm0, 14(%edx)
+L(bk_write_14bytes):
+	movq	6(%eax), %xmm0
+	movq	%xmm0, 6(%edx)
+L(bk_write_6bytes):
+	movl	2(%eax), %ecx	/* bytes 2..5 */
+	movl	%ecx, 2(%edx)
+	movzwl	(%eax), %ecx	/* bytes 0..1 */
+	movw	%cx, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax	/* mempcpy variant: result = DEST + LEN */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(bk_write_42bytes):	/* Backward tail ladder for 42/34/26/18/10/2 bytes: 8-byte rungs from the highest offset down, then 2 bytes at offset 0.  */
+	movq	34(%eax), %xmm0
+	movq	%xmm0, 34(%edx)
+L(bk_write_34bytes):
+	movq	26(%eax), %xmm0
+	movq	%xmm0, 26(%edx)
+L(bk_write_26bytes):
+	movq	18(%eax), %xmm0
+	movq	%xmm0, 18(%edx)
+L(bk_write_18bytes):
+	movq	10(%eax), %xmm0
+	movq	%xmm0, 10(%edx)
+L(bk_write_10bytes):
+	movq	2(%eax), %xmm0
+	movq	%xmm0, 2(%edx)
+L(bk_write_2bytes):
+	movzwl	(%eax), %ecx	/* bytes 0..1 */
+	movw	%cx, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax	/* mempcpy variant: result = DEST + LEN */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(bk_write_47bytes):	/* Backward tail ladder for 47/39/31/23/15/7 bytes: 8-byte rungs from the highest offset down, then 4 bytes at offset 3, 2 bytes at offset 1 and the byte at offset 0.  */
+	movq	39(%eax), %xmm0
+	movq	%xmm0, 39(%edx)
+L(bk_write_39bytes):
+	movq	31(%eax), %xmm0
+	movq	%xmm0, 31(%edx)
+L(bk_write_31bytes):
+	movq	23(%eax), %xmm0
+	movq	%xmm0, 23(%edx)
+L(bk_write_23bytes):
+	movq	15(%eax), %xmm0
+	movq	%xmm0, 15(%edx)
+L(bk_write_15bytes):
+	movq	7(%eax), %xmm0
+	movq	%xmm0, 7(%edx)
+L(bk_write_7bytes):
+	movl	3(%eax), %ecx	/* bytes 3..6 */
+	movl	%ecx, 3(%edx)
+	movzwl	1(%eax), %ecx	/* bytes 1..2 */
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax	/* first byte; overwrites the source pointer, which is not read again */
+	movb	%al, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax	/* mempcpy variant: result = DEST + LEN */
+#  endif
+# endif
+	RETURN	/* RETURN macro is defined outside this chunk */
+
+	.p2align 4
+L(bk_write_43bytes):	/* Backward tail ladder for 43/35/27/19/11/3 bytes: 8-byte rungs from the highest offset down, then 2 bytes at offset 1 and the byte at offset 0.  */
+	movq	35(%eax), %xmm0
+	movq	%xmm0, 35(%edx)
+L(bk_write_35bytes):
+	movq	27(%eax), %xmm0
+	movq	%xmm0, 27(%edx)
+L(bk_write_27bytes):
+	movq	19(%eax), %xmm0
+	movq	%xmm0, 19(%edx)
+L(bk_write_19bytes):
+	movq	11(%eax), %xmm0
+	movq	%xmm0, 11(%edx)
+L(bk_write_11bytes):
+	movq	3(%eax), %xmm0
+	movq	%xmm0, 3(%edx)
+L(bk_write_3bytes):
+	movzwl	1(%eax), %ecx	/* bytes 1..2 */
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax	/* first byte; overwrites the source pointer, which is not read again */
+	movb	%al, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax	/* result in eax: the DEST argument slot */
+#  ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax	/* mempcpy variant: result = DEST + LEN */
+#  endif
+# endif
+	RETURN_END	/* RETURN_END macro is defined outside this chunk */
+
+
+	.pushsection .rodata.ssse3,"a",@progbits
+	.p2align 2
+L(table_48bytes_fwd):	/* Dispatch table for the unaligned forward tails: entry N (0..47) names fwd_write_Nbytes.  JMPTBL is defined outside this chunk; presumably it encodes the target relative to the table base.  */
+	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
+	.p2align 2
+L(table_48bytes_fwd_align):	/* 48 entries, one per L(fwd_write_<N>bytes_align) label, N = 0..47; presumably indexed by the number of tail bytes -- confirm at the dispatch site.  JMPTBL is defined outside this view.  */
+	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
+
+	.p2align 2
+L(shl_table):	/* 16 entries, one per L(shl_<N>) label, N = 0..15; presumably selected by a 0..15 byte misalignment -- confirm at the dispatch site, which is outside this view.  */
+	.int	JMPTBL (L(shl_0), L(shl_table))
+	.int	JMPTBL (L(shl_1), L(shl_table))
+	.int	JMPTBL (L(shl_2), L(shl_table))
+	.int	JMPTBL (L(shl_3), L(shl_table))
+	.int	JMPTBL (L(shl_4), L(shl_table))
+	.int	JMPTBL (L(shl_5), L(shl_table))
+	.int	JMPTBL (L(shl_6), L(shl_table))
+	.int	JMPTBL (L(shl_7), L(shl_table))
+	.int	JMPTBL (L(shl_8), L(shl_table))
+	.int	JMPTBL (L(shl_9), L(shl_table))
+	.int	JMPTBL (L(shl_10), L(shl_table))
+	.int	JMPTBL (L(shl_11), L(shl_table))
+	.int	JMPTBL (L(shl_12), L(shl_table))
+	.int	JMPTBL (L(shl_13), L(shl_table))
+	.int	JMPTBL (L(shl_14), L(shl_table))
+	.int	JMPTBL (L(shl_15), L(shl_table))
+
+	.p2align 2
+L(table_48_bytes_bwd):	/* 48 entries, one per L(bk_write_<N>bytes) label, N = 0..47.  L(bk_write_less32bytes_2) dispatches through this table using ECX with a scale of 4.  */
+	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+	.popsection
+
+# ifdef USE_AS_MEMMOVE
+	.p2align 4
+L(copy_backward):	/* Backward copy (high addresses first).  Below, EDI is the load cursor, EDX the store cursor and ECX the bytes left.  Presumably EAX/EDX/ECX hold source/destination/length on entry -- set up outside this view, confirm.  */
+	PUSH (%edi)
+	movl	%eax, %edi
+	lea	(%ecx,%edx,1),%edx	/* EDX += ECX: one past the last byte to store.  */
+	lea	(%ecx,%edi,1),%edi	/* EDI += ECX: one past the last byte to load.  */
+	testl	$0x3, %edx
+	jnz	L(bk_align)	/* Store end is not 4-byte aligned.  */
+
+L(bk_aligned_4):
+	cmp	$64, %ecx
+	jae	L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+	cmp	$32, %ecx
+	jb	L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+	/* Copy 32 bytes at a time.  */
+	sub	$32, %ecx
+	movq	-8(%edi), %xmm0	/* Four 8-byte moves, highest address first.  */
+	movq	%xmm0, -8(%edx)
+	movq	-16(%edi), %xmm0
+	movq	%xmm0, -16(%edx)
+	movq	-24(%edi), %xmm0
+	movq	%xmm0, -24(%edx)
+	movq	-32(%edi), %xmm0
+	movq	%xmm0, -32(%edx)
+	sub	$32, %edx
+	sub	$32, %edi
+
+L(bk_write_less32bytes):
+	movl	%edi, %eax
+	sub	%ecx, %edx	/* EDX = lowest address still to be stored.  */
+	sub	%ecx, %eax	/* EAX = lowest address still to be loaded.  */
+	POP (%edi)
+L(bk_write_less32bytes_2):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)	/* Dispatch on ECX; the macro is defined outside this view.  */
+
+	CFI_PUSH (%edi)	/* Unwind annotation only: the code below is entered from L(copy_backward) while EDI is still pushed.  */
+
+	.p2align 4
+L(bk_align):
+	cmp	$8, %ecx
+	jbe	L(bk_write_less32bytes)	/* 8 bytes or fewer: skip the alignment steps.  */
+	testl	$1, %edx
+	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
+	then (EDX & 2) must be != 0.  */
+	jz	L(bk_got2)
+	sub	$1, %edi
+	sub	$1, %ecx
+	sub	$1, %edx
+	movzbl	(%edi), %eax	/* Copy one byte so that EDX becomes even.  */
+	movb	%al, (%edx)
+
+	testl	$2, %edx
+	jz	L(bk_aligned_4)
+
+L(bk_got2):
+	sub	$2, %edi
+	sub	$2, %ecx
+	sub	$2, %edx
+	movzwl	(%edi), %eax	/* Copy two bytes; EDX is 4-byte aligned afterwards.  */
+	movw	%ax, (%edx)
+	jmp	L(bk_aligned_4)
+
+	.p2align 4
+L(bk_write_more64bytes):
+	/* Check alignment of last byte.  */
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+/* EDX is aligned 4 bytes, but not 16 bytes.  */
+L(bk_ssse3_align):	/* Copy 4 bytes at a time, at most three times, until EDX is 16-byte aligned.  */
+	sub	$4, %edi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%edi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %edi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%edi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %edi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%edi), %eax
+	movl	%eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+	cmp	$64, %ecx
+	jb	L(bk_write_more32bytes)	/* Fewer than 64 left after aligning.  On the paths visible here ECX was >= 64 and at most 12 bytes were consumed, so >= 32 remain.  */
+
+	.p2align 4
+L(bk_ssse3_cpy):	/* Main loop: 64 bytes per iteration, unaligned loads from EDI, aligned stores to EDX.  */
+	sub	$64, %edi
+	sub	$64, %ecx
+	sub	$64, %edx
+	movdqu	0x30(%edi), %xmm3
+	movdqa	%xmm3, 0x30(%edx)
+	movdqu	0x20(%edi), %xmm2
+	movdqa	%xmm2, 0x20(%edx)
+	movdqu	0x10(%edi), %xmm1
+	movdqa	%xmm1, 0x10(%edx)
+	movdqu	(%edi), %xmm0
+	movdqa	%xmm0, (%edx)
+	cmp	$64, %ecx
+	jae	L(bk_ssse3_cpy)
+	jmp	L(bk_write_64bytesless)
+
+# endif
+
+END (MEMCPY)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S
new file mode 100644
index 0000000000..f725944620
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S
@@ -0,0 +1,78 @@
+/* Multiple versions of memcpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  In static binaries we need memcpy before the initialization
+   has happened.  */
+#if defined SHARED && IS_IN (libc)
+	.text
+ENTRY(memcpy)	/* IFUNC resolver (see the .type below) choosing a memcpy implementation; the chosen address is presumably left in EAX by LOAD_FUNC_GOT_EAX.  */
+	.type	memcpy, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* Macro from an included header; its definition is not visible here.  */
+	LOAD_FUNC_GOT_EAX (__memcpy_ia32)	/* Initial choice.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* ZF set (presumably SSE2 absent): keep __memcpy_ia32.  */
+	LOAD_FUNC_GOT_EAX (__memcpy_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* ZF clear (presumably Fast_Unaligned_Load set): keep the sse2_unaligned version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* Presumably no SSSE3: stay with the sse2_unaligned version.  */
+	LOAD_FUNC_GOT_EAX (__memcpy_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f	/* Presumably no Fast_Rep_String: keep the ssse3 version.  */
+	LOAD_FUNC_GOT_EAX (__memcpy_ssse3_rep)
+2:	ret
+END(memcpy)
+
+# undef ENTRY	/* Redefine ENTRY so that it ignores its argument and always emits the hidden global symbol __memcpy_ia32.  */
+# define ENTRY(name) \
+	.type __memcpy_ia32, @function; \
+	.p2align 4; \
+	.globl __memcpy_ia32; \
+	.hidden __memcpy_ia32; \
+	__memcpy_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Matching END: closes the CFI region and sets the size of __memcpy_ia32.  */
+# define END(name) \
+	cfi_endproc; .size __memcpy_ia32, .-__memcpy_ia32
+
+# undef ENTRY_CHK	/* Same renaming for the _chk entry point: __memcpy_chk_ia32 (global, not marked hidden here).  */
+# define ENTRY_CHK(name) \
+	.type __memcpy_chk_ia32, @function; \
+	.globl __memcpy_chk_ia32; \
+	.p2align 4; \
+	__memcpy_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __memcpy_chk_ia32, .-__memcpy_chk_ia32
+
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they would be called without setting up EBX, which is needed for the
+   PLT used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_memcpy; __GI_memcpy = __memcpy_ia32
+#endif
+
+#include "../memcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S
new file mode 100644
index 0000000000..1b4fbe2e6f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S
@@ -0,0 +1,50 @@
+/* Multiple versions of __memcpy_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  There are no multiarch memcpy functions for static binaries.
+ */
+#if IS_IN (libc)
+# ifdef SHARED
+	.text
+ENTRY(__memcpy_chk)	/* IFUNC resolver (see the .type below) choosing a __memcpy_chk implementation; the chosen address is presumably left in EAX by LOAD_FUNC_GOT_EAX.  */
+	.type	__memcpy_chk, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* Macro from an included header; its definition is not visible here.  */
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_ia32)	/* Initial choice.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* ZF set (presumably SSE2 absent): keep __memcpy_chk_ia32.  */
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* ZF clear (presumably Fast_Unaligned_Load set): keep the sse2_unaligned version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* Presumably no SSSE3: stay with the sse2_unaligned version.  */
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f	/* Presumably no Fast_Rep_String: keep the ssse3 version.  */
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3_rep)
+2:	ret
+END(__memcpy_chk)
+# else
+#  include "../memcpy_chk.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S
new file mode 100644
index 0000000000..3873594cb2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE	/* Presumably selects the memmove variant of the included source -- confirm in memcpy-sse2-unaligned.S.  */
+#define MEMCPY		__memmove_sse2_unaligned	/* Symbol name handed to the included source.  */
+#define MEMCPY_CHK	__memmove_chk_sse2_unaligned	/* Name for the _chk entry point.  */
+#include "memcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
new file mode 100644
index 0000000000..d202fc4a13
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE	/* Presumably selects the memmove variant of the included source -- confirm in memcpy-ssse3-rep.S.  */
+#define MEMCPY		__memmove_ssse3_rep	/* Symbol name handed to the included source.  */
+#define MEMCPY_CHK	__memmove_chk_ssse3_rep	/* Name for the _chk entry point.  */
+#include "memcpy-ssse3-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S
new file mode 100644
index 0000000000..295430b1ef
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE	/* Tested with "# ifdef USE_AS_MEMMOVE" in memcpy-ssse3.S, where it enables the L(copy_backward) code.  */
+#define MEMCPY		__memmove_ssse3	/* memcpy-ssse3.S closes its function with END (MEMCPY), so this becomes the symbol name.  */
+#define MEMCPY_CHK	__memmove_chk_ssse3	/* Name for the _chk entry point.  */
+#include "memcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S
new file mode 100644
index 0000000000..6eb418ca7f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S
@@ -0,0 +1,89 @@
+/* Multiple versions of memmove
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#if IS_IN (libc)
+	.text
+ENTRY(memmove)	/* IFUNC resolver (see the .type below) choosing a memmove implementation; the chosen address is presumably left in EAX by LOAD_FUNC_GOT_EAX.  */
+	.type	memmove, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* Macro from an included header; its definition is not visible here.  */
+	LOAD_FUNC_GOT_EAX (__memmove_ia32)	/* Initial choice.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* ZF set (presumably SSE2 absent): keep __memmove_ia32.  */
+	LOAD_FUNC_GOT_EAX (__memmove_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* ZF clear (presumably Fast_Unaligned_Load set): keep the sse2_unaligned version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* Presumably no SSSE3: stay with the sse2_unaligned version.  */
+	LOAD_FUNC_GOT_EAX (__memmove_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f	/* Presumably no Fast_Rep_String: keep the ssse3 version.  */
+	LOAD_FUNC_GOT_EAX (__memmove_ssse3_rep)
+2:	ret
+END(memmove)
+
+# ifdef SHARED	/* Shared build: __memmove_ia32 is global and additionally marked .hidden.  */
+#  undef ENTRY
+#  define ENTRY(name) \
+	.type __memmove_ia32, @function; \
+	.p2align 4; \
+	.globl __memmove_ia32; \
+	.hidden __memmove_ia32; \
+	__memmove_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# else	/* Non-shared build: same symbol, without the .hidden marking.  */
+#  undef ENTRY
+#  define ENTRY(name) \
+	.type __memmove_ia32, @function; \
+	.globl __memmove_ia32; \
+	.p2align 4; \
+	__memmove_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# endif
+
+# undef END	/* Matching END: closes the CFI region and sets the size of __memmove_ia32.  */
+# define END(name) \
+	cfi_endproc; .size __memmove_ia32, .-__memmove_ia32
+
+# undef ENTRY_CHK	/* Same renaming for the _chk entry point: __memmove_chk_ia32.  */
+# define ENTRY_CHK(name) \
+	.type __memmove_chk_ia32, @function; \
+	.globl __memmove_chk_ia32; \
+	.p2align 4; \
+	__memmove_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __memmove_chk_ia32, .-__memmove_chk_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they would be called without setting up EBX, which is needed for the
+   PLT used by IFUNC.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memmove; __GI_memmove = __memmove_ia32
+# endif
+#endif
+
+#include "../memmove.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S
new file mode 100644
index 0000000000..314834c4c6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S
@@ -0,0 +1,94 @@
+/* Multiple versions of __memmove_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__memmove_chk)	/* IFUNC resolver (see the .type below) choosing a __memmove_chk implementation; the chosen address is presumably left in EAX by LOAD_FUNC_GOT_EAX.  */
+	.type	__memmove_chk, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* Macro from an included header; its definition is not visible here.  */
+	LOAD_FUNC_GOT_EAX (__memmove_chk_ia32)	/* Initial choice.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* ZF set (presumably SSE2 absent): keep __memmove_chk_ia32.  */
+	LOAD_FUNC_GOT_EAX (__memmove_chk_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* ZF clear (presumably Fast_Unaligned_Load set): keep the sse2_unaligned version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* Presumably no SSSE3: stay with the sse2_unaligned version.  */
+	LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f	/* Presumably no Fast_Rep_String: keep the ssse3 version.  */
+	LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3_rep)
+2:	ret
+END(__memmove_chk)
+
+# ifndef SHARED
+	.type __memmove_chk_sse2_unaligned, @function	/* Non-SHARED checking wrapper: bounds check, then tail-jump to __memmove_sse2_unaligned.  */
+	.p2align 4;
+__memmove_chk_sse2_unaligned:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax	/* Third stack argument (presumably the byte count).  */
+	cmpl	%eax, 16(%esp)	/* Fourth stack argument (presumably the destination size) against it.  */
+	jb	__chk_fail	/* Unsigned 16(%esp) < 12(%esp): report the failure.  */
+	jmp	__memmove_sse2_unaligned	/* Tail jump; the stack arguments are left untouched.  */
+	cfi_endproc
+	.size __memmove_chk_sse2_unaligned, .-__memmove_chk_sse2_unaligned
+
+	.type __memmove_chk_ssse3, @function	/* Non-SHARED checking wrapper: bounds check, then tail-jump to __memmove_ssse3.  */
+	.p2align 4;
+__memmove_chk_ssse3:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax	/* Third stack argument (presumably the byte count).  */
+	cmpl	%eax, 16(%esp)	/* Fourth stack argument (presumably the destination size) against it.  */
+	jb	__chk_fail	/* Unsigned 16(%esp) < 12(%esp): report the failure.  */
+	jmp	__memmove_ssse3	/* Tail jump; the stack arguments are left untouched.  */
+	cfi_endproc
+	.size __memmove_chk_ssse3, .-__memmove_chk_ssse3
+
+	.type __memmove_chk_ssse3_rep, @function	/* Non-SHARED checking wrapper: bounds check, then tail-jump to __memmove_ssse3_rep.  */
+	.p2align 4;
+__memmove_chk_ssse3_rep:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax	/* Third stack argument (presumably the byte count).  */
+	cmpl	%eax, 16(%esp)	/* Fourth stack argument (presumably the destination size) against it.  */
+	jb	__chk_fail	/* Unsigned 16(%esp) < 12(%esp): report the failure.  */
+	jmp	__memmove_ssse3_rep	/* Tail jump; the stack arguments are left untouched.  */
+	cfi_endproc
+	.size __memmove_chk_ssse3_rep, .-__memmove_chk_ssse3_rep
+
+	.type __memmove_chk_ia32, @function	/* Non-SHARED checking wrapper: bounds check, then tail-jump to __memmove_ia32.  */
+	.p2align 4;
+__memmove_chk_ia32:
+	cfi_startproc
+	CALL_MCOUNT
+	movl	12(%esp), %eax	/* Third stack argument (presumably the byte count).  */
+	cmpl	%eax, 16(%esp)	/* Fourth stack argument (presumably the destination size) against it.  */
+	jb	__chk_fail	/* Unsigned 16(%esp) < 12(%esp): report the failure.  */
+	jmp	__memmove_ia32	/* Tail jump; the stack arguments are left untouched.  */
+	cfi_endproc
+	.size __memmove_chk_ia32, .-__memmove_chk_ia32
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..a1cea50771
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY	/* Presumably selects the mempcpy variant of the included source -- confirm in memcpy-sse2-unaligned.S.  */
+#define MEMCPY		__mempcpy_sse2_unaligned	/* Symbol name handed to the included source.  */
+#define MEMCPY_CHK	__mempcpy_chk_sse2_unaligned	/* Name for the _chk entry point.  */
+#include "memcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
new file mode 100644
index 0000000000..5357b33e18
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY	/* Presumably selects the mempcpy variant of the included source -- confirm in memcpy-ssse3-rep.S.  */
+#define MEMCPY		__mempcpy_ssse3_rep	/* Symbol name handed to the included source.  */
+#define MEMCPY_CHK	__mempcpy_chk_ssse3_rep	/* Name for the _chk entry point.  */
+#include "memcpy-ssse3-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
new file mode 100644
index 0000000000..822d98e954
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY	/* Presumably selects the mempcpy variant of the included source -- confirm in memcpy-ssse3.S.  */
+#define MEMCPY		__mempcpy_ssse3	/* memcpy-ssse3.S closes its function with END (MEMCPY), so this becomes the symbol name.  */
+#define MEMCPY_CHK	__mempcpy_chk_ssse3	/* Name for the _chk entry point.  */
+#include "memcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S
new file mode 100644
index 0000000000..06e377fbc9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S
@@ -0,0 +1,81 @@
+/* Multiple versions of mempcpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  In static binaries we need mempcpy before the initialization
+   has happened.  */
+#if defined SHARED && IS_IN (libc)
+	.text
+ENTRY(__mempcpy)	/* IFUNC resolver (see the .type below) choosing a mempcpy implementation; the chosen address is presumably left in EAX by LOAD_FUNC_GOT_EAX.  */
+	.type	__mempcpy, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* Macro from an included header; its definition is not visible here.  */
+	LOAD_FUNC_GOT_EAX (__mempcpy_ia32)	/* Initial choice.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* ZF set (presumably SSE2 absent): keep __mempcpy_ia32.  */
+	LOAD_FUNC_GOT_EAX (__mempcpy_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* ZF clear (presumably Fast_Unaligned_Load set): keep the sse2_unaligned version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* Presumably no SSSE3: stay with the sse2_unaligned version.  */
+	LOAD_FUNC_GOT_EAX (__mempcpy_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f	/* Presumably no Fast_Rep_String: keep the ssse3 version.  */
+	LOAD_FUNC_GOT_EAX (__mempcpy_ssse3_rep)
+2:	ret
+END(__mempcpy)
+
+# undef ENTRY	/* Redefine ENTRY so that it ignores its argument and always emits the hidden global symbol __mempcpy_ia32.  */
+# define ENTRY(name) \
+	.type __mempcpy_ia32, @function; \
+	.p2align 4; \
+	.globl __mempcpy_ia32; \
+	.hidden __mempcpy_ia32; \
+	__mempcpy_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Matching END: closes the CFI region and sets the size of __mempcpy_ia32.  */
+# define END(name) \
+	cfi_endproc; .size __mempcpy_ia32, .-__mempcpy_ia32
+
+# undef ENTRY_CHK	/* Same renaming for the _chk entry point: __mempcpy_chk_ia32 (global, not marked hidden here).  */
+# define ENTRY_CHK(name) \
+	.type __mempcpy_chk_ia32, @function; \
+	.globl __mempcpy_chk_ia32; \
+	.p2align 4; \
+	__mempcpy_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __mempcpy_chk_ia32, .-__mempcpy_chk_ia32
+
+# undef libc_hidden_def
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they would be called without setting up EBX, which is needed for the
+   PLT used by IFUNC.  */
+# define libc_hidden_def(name) \
+	.globl __GI_mempcpy; __GI_mempcpy = __mempcpy_ia32
+# define libc_hidden_builtin_def(name) \
+	.globl __GI___mempcpy; __GI___mempcpy = __mempcpy_ia32
+#endif
+
+#include "../mempcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S
new file mode 100644
index 0000000000..e13e5248a5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S
@@ -0,0 +1,50 @@
+/* Multiple versions of __mempcpy_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  There are no multiarch mempcpy functions for static binaries.
+ */
+#if IS_IN (libc)
+# ifdef SHARED
+	.text
+ENTRY(__mempcpy_chk)	/* IFUNC resolver (see the .type below) choosing a __mempcpy_chk implementation; the chosen address is presumably left in EAX by LOAD_FUNC_GOT_EAX.  */
+	.type	__mempcpy_chk, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* Macro from an included header; its definition is not visible here.  */
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ia32)	/* Initial choice.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* ZF set (presumably SSE2 absent): keep __mempcpy_chk_ia32.  */
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_sse2_unaligned)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* ZF clear (presumably Fast_Unaligned_Load set): keep the sse2_unaligned version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* Presumably no SSSE3: stay with the sse2_unaligned version.  */
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3)
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f	/* Presumably no Fast_Rep_String: keep the ssse3 version.  */
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3_rep)
+2:	ret
+END(__mempcpy_chk)
+# else
+#  include "../mempcpy_chk.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c
new file mode 100644
index 0000000000..ef7bbbe792
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c
@@ -0,0 +1,7 @@
+#if IS_IN (libc)	/* Inside libc the generic C memrchr is given the name __memrchr_ia32.  */
+# define MEMRCHR  __memrchr_ia32	/* Presumably the name string/memrchr.c uses for its function -- confirm there.  */
+# include <string.h>
+extern void *__memrchr_ia32 (const void *, int, size_t);	/* Prototype for the renamed function.  */
+#endif
+
+#include "string/memrchr.c"	/* Pull in the generic implementation.  */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
new file mode 100644
index 0000000000..dbbe94fd08
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
@@ -0,0 +1,417 @@
+/* Optimized memrchr with sse2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)	/* Unwind info for a 4-byte push of REG.  */
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)	/* Unwind info for the matching pop.  */
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)	/* Push plus its unwind annotation.  */
+# define POP(REG) popl REG; CFI_POP (REG)	/* Pop plus its unwind annotation.  */
+
+# define PARMS  4	/* ESP offset of the first argument at entry.  */
+# define STR1  PARMS	/* First argument: loaded into ECX as the buffer pointer.  */
+# define STR2  STR1+4	/* Second argument: loaded into XMM1 as the byte to find.  */
+# define LEN   STR2+4	/* Third argument: loaded into EDX as the length.  */
+
+# define MEMCHR __memrchr_sse2_bsf	/* Symbol defined by ENTRY/END below.  */
+
+	.text
+ENTRY (MEMCHR)	/* Reverse byte search: scans the buffer from its end towards its start in 16-byte chunks and returns in EAX the address of the last matching byte, or 0 if there is none.  */
+	mov	STR1(%esp), %ecx	/* ECX = buffer pointer.  */
+	movd	STR2(%esp), %xmm1	/* XMM1 = byte to search for (low dword).  */
+	mov	LEN(%esp), %edx	/* EDX = length.  */
+
+	sub	$16, %edx
+	jbe	L(length_less16)	/* Length <= 16: handled separately.  */
+
+	punpcklbw %xmm1, %xmm1
+	add	%edx, %ecx	/* ECX = address of the last 16 bytes of the buffer.  */
+	punpcklbw %xmm1, %xmm1
+
+	movdqu	(%ecx), %xmm0
+	pshufd	$0, %xmm1, %xmm1	/* With the two punpcklbw above: search byte replicated in all 16 lanes.  */
+	pcmpeqb	%xmm1, %xmm0
+
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+	sub	$64, %ecx
+	mov	%ecx, %eax
+	and	$15, %eax
+	jz	L(loop_prolog)
+
+	add	$16, %ecx	/* Round ECX up to the next 16-byte boundary ...  */
+	add	$16, %edx
+	sub	%eax, %ecx
+	sub	%eax, %edx	/* ... and grow the remaining count by the same amount.  */
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop_prolog):
+	sub	$64, %edx
+	jbe	L(exit_loop)	/* At most 64 bytes remain: finish with bounds checks.  */
+
+	movdqa	48(%ecx), %xmm0	/* Check the 64-byte block at ECX, highest chunk first.  */
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm4
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+	sub	$64, %ecx	/* Second block, same pattern.  */
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+	mov	%ecx, %eax
+	and	$63, %eax
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	add	$64, %ecx	/* Round ECX up to the next 64-byte boundary ...  */
+	add	$64, %edx
+	sub	%eax, %ecx
+	sub	%eax, %edx	/* ... and grow the remaining count by the same amount.  */
+
+	.p2align 4
+L(align64_loop):	/* Main loop: one 64-byte-aligned block per iteration.  */
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	(%ecx), %xmm0
+	movdqa	16(%ecx), %xmm2
+	movdqa	32(%ecx), %xmm3
+	movdqa	48(%ecx), %xmm4
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm3, %xmm0	/* Fold the four compare results into XMM2 ...  */
+	pmaxub	%xmm4, %xmm2
+	pmaxub	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax	/* ... so that one mask tells whether any chunk matched.  */
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	pmovmskb %xmm4, %eax	/* XMM3 and XMM4 still hold their own compare results.  */
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm2	/* XMM2 and XMM0 were overwritten by pmaxub: redo those two compares.  */
+
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	(%ecx), %xmm1
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	pmovmskb %xmm1, %eax	/* No test needed: the match must be in the first chunk.  */
+	bsr	%eax, %eax
+
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_loop):	/* Final block: only the top EDX bytes of the 64 bytes at ECX belong to the buffer.  */
+	add	$64, %edx	/* Undo the loop's subtraction: EDX = bytes left (1..64).  */
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)	/* This chunk may extend below the buffer start: bounds-checked variant.  */
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches0_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(exit_loop_32):	/* At most 32 bytes left: only the chunks at +48 and +32 can hold buffer bytes.  */
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	32(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(matches0):	/* L(matchesN): bsr picks the highest set mask bit, i.e. the last matching byte of the chunk at ECX+N.  */
+	bsr	%eax, %eax
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16):
+	bsr	%eax, %eax
+	lea	16(%eax, %ecx), %eax
+	ret
+
+	.p2align 4
+L(matches32):
+	bsr	%eax, %eax
+	lea	32(%eax, %ecx), %eax
+	ret
+
+	.p2align 4
+L(matches48):
+	bsr	%eax, %eax
+	lea	48(%eax, %ecx), %eax
+	ret
+
+	.p2align 4
+L(matches0_1):	/* L(matchesN_1): as L(matchesN), but return 0 when the match lies below the buffer start.  */
+	bsr	%eax, %eax
+	sub	$64, %edx
+	add	%eax, %edx	/* Negative when the matching byte precedes the buffer start.  */
+	jl	L(return_null)
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16_1):
+	bsr	%eax, %eax
+	sub	$48, %edx
+	add	%eax, %edx
+	jl	L(return_null)
+	lea	16(%ecx, %eax), %eax
+	ret
+
+	.p2align 4
+L(matches32_1):
+	bsr	%eax, %eax
+	sub	$32, %edx
+	add	%eax, %edx
+	jl	L(return_null)
+	lea	32(%ecx, %eax), %eax
+	ret
+
+	.p2align 4
+L(matches48_1):
+	bsr	%eax, %eax
+	sub	$16, %edx
+	add	%eax, %edx
+	jl	L(return_null)
+	lea	48(%ecx, %eax), %eax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(length_less16_offset0):	/* Length 1..16 and the buffer starts on a 16-byte boundary (EAX = start, EDX = length).  */
+	mov	%dl, %cl
+	pcmpeqb	(%eax), %xmm1
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx	/* EDX = (1 << length) - 1: mask of the in-buffer bytes.  */
+	mov	%edx, %ecx
+
+	pmovmskb %xmm1, %edx
+
+	and	%ecx, %edx
+	test	%edx, %edx
+	jz	L(return_null)
+
+	bsr	%edx, %ecx
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(length_less16):	/* Length <= 16.  Uses only aligned 16-byte loads and masks off bytes outside the buffer.  */
+	punpcklbw %xmm1, %xmm1
+	mov	%ecx, %eax	/* EAX = buffer start.  */
+	punpcklbw %xmm1, %xmm1
+	add	$16, %edx	/* Undo the entry subtraction: EDX = length.  */
+	jz	L(return_null)	/* Zero length.  */
+
+	pshufd	$0, %xmm1, %xmm1
+	and	$15, %ecx	/* ECX = offset of the start within its 16-byte chunk.  */
+	jz	L(length_less16_offset0)
+
+	PUSH	(%edi)
+	mov	%cl, %dh
+	add	%dl, %dh
+	and	$-16, %eax	/* EAX = start of the aligned chunk holding the buffer start.  */
+
+	sub	$16, %dh	/* DH = offset + length - 16: bytes that spill into the next chunk.  */
+	ja	L(length_less16_part2)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	sar	%cl, %edi	/* Drop the mask bits for bytes before the buffer start.  */
+	add	%ecx, %eax	/* EAX = buffer start again.  */
+	mov	%dl, %cl
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx	/* Mask of the in-buffer bytes.  */
+
+	and	%edx, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)	/* Unwind annotation only: the code below runs with EDI still pushed.  */
+
+	.p2align 4
+L(length_less16_part2):	/* The buffer straddles two aligned chunks: check the upper chunk first.  */
+	movdqa	16(%eax), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %edi
+
+	mov	%cl, %ch	/* Save the start offset in CH while CL serves as shift count.  */
+
+	mov	%dh, %cl
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx	/* Mask of the buffer bytes lying in the upper chunk.  */
+
+	and	%edx, %edi
+
+	test	%edi, %edi
+	jnz	L(length_less16_part2_return)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	mov	%ch, %cl	/* Restore the start offset as shift count.  */
+	sar	%cl, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	xor	%ch, %ch	/* Clear CH so that ECX is the plain start offset.  */
+	add	%ecx, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(length_less16_part2_return):
+	bsr	%edi, %edi
+	lea	16(%eax, %edi), %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(ret_null):	/* Like L(return_null), but also restores the pushed EDI.  */
+	xor	%eax, %eax
+	POP	(%edi)
+	ret
+
+END (MEMCHR)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S
new file mode 100644
index 0000000000..5f7853f683
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S
@@ -0,0 +1,724 @@
+/* Optimized memrchr with sse2 without bsf
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)	/* Unwind annotation for a 4-byte push of REG.  */
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)	/* Unwind annotation for the matching pop.  */
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)	/* pushl together with its unwind annotation.  */
+# define POP(REG) popl REG; CFI_POP (REG)	/* popl together with its unwind annotation.  */
+
+# define PARMS  4	/* ESP-relative offset of the first stack argument at entry.  */
+# define STR1  PARMS	/* First argument; loaded into ECX at entry.  */
+# define STR2  STR1+4	/* Second argument; loaded into XMM1 at entry.  */
+# define LEN   STR2+4	/* Third argument; loaded into EDX at entry.  */
+
+	atom_text_section
+ENTRY (__memrchr_sse2)	/* Backward byte search; bsr is used only on the length <= 16 paths, elsewhere the match bit is located with test ladders.  */
+	mov	STR1(%esp), %ecx	/* ECX = buffer pointer.  */
+	movd	STR2(%esp), %xmm1	/* Low byte of XMM1 = byte to look for.  */
+	mov	LEN(%esp), %edx	/* EDX = length.  */
+
+	sub	$16, %edx
+	jbe	L(length_less16)	/* Length <= 16 is handled separately.  */
+
+	punpcklbw %xmm1, %xmm1	/* With the second punpcklbw and the pshufd: replicate the target byte into all 16 bytes of XMM1.  */
+	add	%edx, %ecx	/* ECX = buffer + length - 16: the last 16 bytes.  */
+	punpcklbw %xmm1, %xmm1
+
+	movdqu	(%ecx), %xmm0	/* Unaligned load of the tail block.  */
+	pshufd	$0, %xmm1, %xmm1
+	pcmpeqb	%xmm1, %xmm0
+
+	pmovmskb %xmm0, %eax	/* EAX bit i set <=> byte i of the block matches.  */
+	test	%eax, %eax
+	jnz	L(exit_dispatch)	/* Match in the tail block; ECX is its base.  */
+
+	sub	$64, %ecx	/* Step down to the next 64-byte window.  */
+	mov	%ecx, %eax
+	and	$15, %eax	/* EAX = misalignment of the window base.  */
+	jz	L(loop_prolog)
+
+	lea	16(%ecx), %ecx	/* Misaligned: round ECX up to the next 16-byte boundary ...  */
+	lea	16(%edx), %edx	/* ... and add 16 - misalignment to the remaining count.  */
+	sub	%eax, %edx
+	and	$-16, %ecx
+
+	.p2align 4
+/* The loop starts on a 16-byte aligned address.  */
+L(loop_prolog):	/* ECX = 16-byte aligned base of a 64-byte window, EDX = bytes still to examine.  */
+	sub	$64, %edx
+	jbe	L(exit_loop)	/* 64 bytes or fewer remain: finish with bounds checks.  */
+
+	movdqa	48(%ecx), %xmm0	/* Highest block first, because the last occurrence is wanted.  */
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm4
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	sub	$64, %ecx	/* Second unrolled window, 64 bytes lower.  */
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	mov	%ecx, %eax
+	and	$63, %eax	/* EAX = offset of ECX within a 64-byte line.  */
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	lea	64(%ecx), %ecx	/* Not 64-byte aligned: round ECX up to the next 64-byte boundary ...  */
+	lea	64(%edx), %edx	/* ... and add 64 - offset to the remaining count.  */
+	and	$-64, %ecx
+	sub	%eax, %edx
+
+	.p2align 4
+L(align64_loop):	/* Main loop: one 64-byte aligned window per iteration, moving downwards.  */
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	(%ecx), %xmm0
+	movdqa	16(%ecx), %xmm2
+	movdqa	32(%ecx), %xmm3
+	movdqa	48(%ecx), %xmm4
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm3, %xmm0	/* Merge the four compare results so that one mask tests the whole window.  */
+	pmaxub	%xmm4, %xmm2
+	pmaxub	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	pmovmskb %xmm4, %eax	/* Some block matched: probe from the highest block down.  */
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm2	/* XMM2 and XMM0 were overwritten by the merge, so blocks 16 and 0 are compared again.  */
+
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	(%ecx), %xmm1	/* Overwrites the pattern register; every path from here returns.  */
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	pmovmskb %xmm1, %eax	/* The match is in block 0: find the highest set bit of AX.  */
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)	/* Bytes 8-15.  */
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)	/* Bytes 4-7.  */
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax	/* Bits 1-15 are all clear: return the block base (byte 0).  */
+	ret
+
+	.p2align 4
+L(exit_loop):	/* Last window: after the add below EDX = bytes left (at most 64); the bounds checks in the _1 exits treat them as the top EDX bytes of the window at ECX.  */
+	add	$64, %edx	/* Undo the subtraction that led here.  */
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)
+
+	movdqa	48(%ecx), %xmm0	/* EDX > 32: blocks 48 and 32 need no bounds check.  */
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)	/* Bounds-checked exit for block 16.  */
+	cmp	$48, %edx
+	jbe	L(return_null)	/* EDX <= 48: block 0 is not examined.  */
+
+	pcmpeqb	(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches0_1)	/* Bounds-checked exit for block 0.  */
+	xor	%eax, %eax	/* No match: return NULL.  */
+	ret
+
+	.p2align 4
+L(exit_loop_32):	/* At most 32 bytes left: only blocks 48 and 32 of the window are examined.  */
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)	/* Bounds-checked exit for block 48.  */
+	cmp	$16, %edx
+	jbe	L(return_null)	/* EDX <= 16: block 32 is not examined.  */
+
+	pcmpeqb	32(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)	/* Bounds-checked exit for block 32.  */
+	xor	%eax, %eax	/* No match: return NULL.  */
+	ret
+
+	.p2align 4
+L(matches16):	/* Match in block 16 of the window (no bounds check); EAX = its mask.  */
+	lea	16(%ecx), %ecx	/* ECX = base of the matching block.  */
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)	/* Bytes 8-15.  */
+	mov	%al, %dl	/* DL is scratch for the nibble test.  */
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)	/* Bytes 4-7.  */
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax	/* Only byte 0 is left: return the block base.  */
+	ret
+
+	.p2align 4
+L(matches32):	/* Match in block 32 of the window (no bounds check); EAX = its mask.  */
+	lea	32(%ecx), %ecx	/* ECX = base of the matching block.  */
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)	/* Bytes 8-15.  */
+	mov	%al, %dl	/* DL is scratch for the nibble test.  */
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)	/* Bytes 4-7.  */
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax	/* Only byte 0 is left: return the block base.  */
+	ret
+
+	.p2align 4
+L(matches48):	/* Match in block 48 of the window (no bounds check); EAX = its mask.  */
+	lea	48(%ecx), %ecx	/* ECX = base of the matching block, then fall into the dispatcher.  */
+
+	.p2align 4
+L(exit_dispatch):	/* ECX = block base, EAX = 16-bit match mask: return the address of the highest matching byte.  */
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)	/* Bytes 8-15.  */
+	mov	%al, %dl	/* DL is scratch for the nibble test.  */
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)	/* Bytes 4-7.  */
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax	/* Only byte 0 is left: return the block base.  */
+	ret
+
+	.p2align 4
+L(exit_dispatch_8):	/* Highest match is among bytes 4-7 (upper nibble of AL is nonzero).  */
+	test	$0x80, %al
+	jnz	L(exit_8)
+	test	$0x40, %al
+	jnz	L(exit_7)
+	test	$0x20, %al
+	jnz	L(exit_6)
+	lea	4(%ecx), %eax	/* Bits 5-7 clear, so it is byte 4.  */
+	ret
+
+	.p2align 4
+L(exit_dispatch_high):	/* Highest match is among bytes 8-15 (AH is nonzero).  */
+	mov	%ah, %dh	/* DH is scratch for the nibble test.  */
+	and	$15 << 4, %dh
+	jnz	L(exit_dispatch_high_8)	/* Bytes 12-15.  */
+	test	$0x08, %ah
+	jnz	L(exit_12)
+	test	$0x04, %ah
+	jnz	L(exit_11)
+	test	$0x02, %ah
+	jnz	L(exit_10)
+	lea	8(%ecx), %eax	/* Bits 9-15 clear, so it is byte 8.  */
+	ret
+
+	.p2align 4
+L(exit_dispatch_high_8):	/* Highest match is among bytes 12-15 (upper nibble of AH is nonzero).  */
+	test	$0x80, %ah
+	jnz	L(exit_16)
+	test	$0x40, %ah
+	jnz	L(exit_15)
+	test	$0x20, %ah
+	jnz	L(exit_14)
+	lea	12(%ecx), %eax	/* Bits 13-15 clear, so it is byte 12.  */
+	ret
+
+	.p2align 4
+L(exit_2):	/* L(exit_N) stubs: return ECX + (N - 1), with no bounds check.  */
+	lea	1(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_3):
+	lea	2(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_4):
+	lea	3(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_6):
+	lea	5(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_7):
+	lea	6(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_8):
+	lea	7(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_10):
+	lea	9(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_11):
+	lea	10(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_12):
+	lea	11(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_14):
+	lea	13(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_15):
+	lea	14(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_16):
+	lea	15(%ecx), %eax
+	ret
+
+	.p2align 4
+L(matches0_1):	/* Bounds-checked exit for block 0 of the last window; EAX = its mask, EDX = bytes left.  */
+	lea	-64(%edx), %edx	/* Bias EDX so that EDX + byte index < 0 makes the exits below return NULL.  */
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)	/* Bytes 8-15.  */
+	mov	%al, %ah	/* AH is zero here and serves as scratch; EDX must stay intact.  */
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)	/* Bytes 4-7.  */
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx	/* Byte 0: set the flags from EDX for the range check.  */
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16_1):	/* Bounds-checked exit for block 16 of the last window; EAX = its mask, EDX = bytes left.  */
+	lea	-48(%edx), %edx	/* Bias EDX so that EDX + byte index < 0 makes the exits below return NULL.  */
+	lea	16(%ecx), %ecx	/* ECX = base of the matching block.  */
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)	/* Bytes 8-15.  */
+	mov	%al, %ah	/* AH is zero here and serves as scratch; EDX must stay intact.  */
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)	/* Bytes 4-7.  */
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx	/* Byte 0: set the flags from EDX for the range check.  */
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches32_1):	/* Bounds-checked exit for block 32 of the last window; EAX = its mask, EDX = bytes left.  */
+	lea	-32(%edx), %edx	/* Bias EDX so that EDX + byte index < 0 makes the exits below return NULL.  */
+	lea	32(%ecx), %ecx	/* ECX = base of the matching block.  */
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)	/* Bytes 8-15.  */
+	mov	%al, %ah	/* AH is zero here and serves as scratch; EDX must stay intact.  */
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)	/* Bytes 4-7.  */
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx	/* Byte 0: set the flags from EDX for the range check.  */
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches48_1):	/* Bounds-checked exit for block 48 of the last window; EAX = its mask, EDX = bytes left.  */
+	lea	-16(%edx), %edx	/* Bias EDX so that EDX + byte index < 0 makes the exits below return NULL.  */
+	lea	48(%ecx), %ecx	/* ECX = base of the matching block, then fall into the dispatcher.  */
+
+	.p2align 4
+L(exit_dispatch_1):	/* Like L(exit_dispatch), but each exit adds the byte index to EDX and returns NULL when the sum is negative.  */
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)	/* Bytes 8-15.  */
+	mov	%al, %ah	/* AH is zero here and serves as scratch; EDX must stay intact.  */
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)	/* Bytes 4-7.  */
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx	/* Byte 0: set the flags from EDX for the range check.  */
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_8):	/* Bounds-checked: highest match is among bytes 4-7.  */
+	test	$0x80, %al
+	jnz	L(exit_1_8)
+	test	$0x40, %al
+	jnz	L(exit_1_7)
+	test	$0x20, %al
+	jnz	L(exit_1_6)
+	add	$4, %edx	/* Byte 4: biased count + 4 < 0 gives NULL.  */
+	jl	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_high):	/* Bounds-checked: highest match is among bytes 8-15 (AH is nonzero).  */
+	mov	%ah, %al	/* AL is scratch now; only AH is tested from here on.  */
+	and	$15 << 4, %al
+	jnz	L(exit_dispatch_1_high_8)	/* Bytes 12-15.  */
+	test	$0x08, %ah
+	jnz	L(exit_1_12)
+	test	$0x04, %ah
+	jnz	L(exit_1_11)
+	test	$0x02, %ah
+	jnz	L(exit_1_10)
+	add	$8, %edx	/* Byte 8: biased count + 8 < 0 gives NULL.  */
+	jl	L(return_null)
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_high_8):	/* Bounds-checked: highest match is among bytes 12-15.  */
+	test	$0x80, %ah
+	jnz	L(exit_1_16)
+	test	$0x40, %ah
+	jnz	L(exit_1_15)
+	test	$0x20, %ah
+	jnz	L(exit_1_14)
+	add	$12, %edx	/* Byte 12: biased count + 12 < 0 gives NULL.  */
+	jl	L(return_null)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_2):	/* L(exit_1_N) stubs: add N - 1 to the biased count in EDX; negative gives NULL, otherwise return ECX + (N - 1).  */
+	add	$1, %edx
+	jl	L(return_null)
+	lea	1(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_3):
+	add	$2, %edx
+	jl	L(return_null)
+	lea	2(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_4):
+	add	$3, %edx
+	jl	L(return_null)
+	lea	3(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_6):
+	add	$5, %edx
+	jl	L(return_null)
+	lea	5(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_7):
+	add	$6, %edx
+	jl	L(return_null)
+	lea	6(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_8):
+	add	$7, %edx
+	jl	L(return_null)
+	lea	7(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_10):
+	add	$9, %edx
+	jl	L(return_null)
+	lea	9(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_11):
+	add	$10, %edx
+	jl	L(return_null)
+	lea	10(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_12):
+	add	$11, %edx
+	jl	L(return_null)
+	lea	11(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_14):
+	add	$13, %edx
+	jl	L(return_null)
+	lea	13(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_15):
+	add	$14, %edx
+	jl	L(return_null)
+	lea	14(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_16):
+	add	$15, %edx
+	jl	L(return_null)
+	lea	15(%ecx), %eax
+	ret
+
+	.p2align 4
+L(return_null):	/* Return 0 (NULL); used by paths that have nothing on the stack to pop.  */
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(length_less16_offset0):	/* Length 1..16 and a 16-byte aligned buffer: EAX = pointer, EDX = length.  */
+	mov	%dl, %cl	/* CL = length, used as the shift amount below.  */
+	pcmpeqb	(%eax), %xmm1	/* Compare the aligned block with the replicated target byte.  */
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx	/* EDX = (1 << length) - 1: bits of the bytes inside the buffer.  */
+
+	mov	%eax, %ecx	/* ECX = block base, as L(exit_dispatch) expects.  */
+	pmovmskb %xmm1, %eax
+
+	and	%edx, %eax	/* Drop matches beyond the length.  */
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	xor	%eax, %eax	/* No match: return NULL.  */
+	ret
+
+	.p2align 4
+L(length_less16):	/* Length <= 16: ECX = buffer pointer, EDX = length - 16.  */
+	punpcklbw %xmm1, %xmm1	/* With the second punpcklbw and the pshufd below: replicate the target byte into all 16 bytes of XMM1.  */
+	add	$16, %edx	/* EDX = length again.  */
+	je	L(return_null)	/* Length 0: return NULL.  */
+	punpcklbw %xmm1, %xmm1
+
+	mov	%ecx, %eax	/* EAX = pointer; ECX becomes its offset within a 16-byte block.  */
+	pshufd	$0, %xmm1, %xmm1
+
+	and	$15, %ecx
+	jz	L(length_less16_offset0)	/* Buffer is 16-byte aligned.  */
+
+	PUSH	(%edi)	/* EDI is used as scratch below.  */
+
+	mov	%cl, %dh
+	add	%dl, %dh	/* DH = offset + length.  */
+	and	$-16, %eax	/* EAX = enclosing aligned 16-byte block.  */
+
+	sub	$16, %dh
+	ja	L(length_less16_part2)	/* offset + length > 16: the buffer continues into the next block.  */
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	sar	%cl, %edi	/* Shift out the bytes before the buffer; bit 0 = first buffer byte.  */
+	add	%ecx, %eax	/* EAX = original (unaligned) pointer.  */
+	mov	%dl, %cl
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx	/* EDX = (1 << length) - 1.  */
+
+	and	%edx, %edi	/* Drop matches beyond the length.  */
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi	/* Highest set bit = last matching byte.  */
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)	/* Unwind info only: the code after the ret above still has EDI pushed.  */
+
+	.p2align 4
+L(length_less16_part2):	/* Buffer spans two aligned blocks: EAX = first block, CL = start offset, DH = bytes used in the second block; EDI is on the stack.  */
+	movdqa	16(%eax), %xmm2	/* Check the second (higher) block first.  */
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %edi
+
+	mov	%cl, %ch	/* Park the start offset in CH while CL serves as shift count.  */
+
+	mov	%dh, %cl
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx	/* EDX = (1 << DH) - 1: bytes of the second block inside the buffer.  */
+
+	and	%edx, %edi
+
+	test	%edi, %edi
+	jnz	L(length_less16_part2_return)
+
+	pcmpeqb	(%eax), %xmm1	/* No hit above: check the first block.  */
+	pmovmskb %xmm1, %edi
+
+	mov	%ch, %cl
+	sar	%cl, %edi	/* Discard the bytes before the buffer start.  */
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi	/* Index of the last match relative to the buffer start.  */
+	add	%edi, %eax
+	xor	%ch, %ch	/* ECX = start offset again (CH cleared).  */
+	add	%ecx, %eax	/* EAX = block + offset + index.  */
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)	/* Unwind info only: the code after the ret above still has EDI pushed.  */
+
+	.p2align 4
+L(length_less16_part2_return):	/* Match in the second block: EDI = masked match bits, EAX = first aligned block.  */
+	bsr	%edi, %edi	/* Index of the last match.  */
+	lea	16(%eax, %edi), %eax	/* Return EAX + 16 + index.  */
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)	/* Unwind info only: the code after the ret above still has EDI pushed.  */
+
+	.p2align 4
+L(ret_null):	/* Return 0 (NULL) from the paths that pushed EDI.  */
+	xor	%eax, %eax
+	POP	(%edi)
+	ret
+
+END (__memrchr_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S
new file mode 100644
index 0000000000..d4253a553b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S
@@ -0,0 +1,45 @@
+/* Multiple versions of memrchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__memrchr)	/* Resolver: chooses one of three memrchr implementations.  */
+	.type	__memrchr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* NOTE(review): this and the feature/GOT macros below come from init-arch.h, not visible here - confirm their exact semantics.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* ZF set (presumably SSE2 not available): use __memrchr_ia32.  */
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	3f	/* ZF set (presumably Slow_BSF not flagged): use __memrchr_sse2_bsf.  */
+
+	LOAD_FUNC_GOT_EAX (__memrchr_sse2)	/* Otherwise select __memrchr_sse2.  */
+	ret
+
+2:	LOAD_FUNC_GOT_EAX (__memrchr_ia32)
+	ret
+
+3:	LOAD_FUNC_GOT_EAX (__memrchr_sse2_bsf)
+	ret
+END(__memrchr)
+
+weak_alias(__memrchr, memrchr)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
new file mode 100644
index 0000000000..3221077e49
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
@@ -0,0 +1,811 @@
+/* memset with SSE2 and REP string.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)	/* Unwind annotation for a 4-byte push of REG.  */
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)	/* Unwind annotation for the matching pop.  */
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)	/* pushl together with its unwind annotation.  */
+#define POP(REG)	popl REG; CFI_POP (REG)	/* popl together with its unwind annotation.  */
+
+#ifdef USE_AS_BZERO	/* bzero build: arguments are (DEST, LEN) and no return value is loaded.  */
+# define DEST		PARMS
+# define LEN		DEST+4
+# define SETRTNVAL
+#else	/* memset build: arguments are (DEST, CHR, LEN).  */
+# define DEST		PARMS
+# define CHR		DEST+4
+# define LEN		CHR+4
+# define SETRTNVAL	movl DEST(%esp), %eax	/* Return value = DEST argument.  */
+#endif
+
+#ifdef SHARED
+# define ENTRANCE	PUSH (%ebx);	/* EBX is saved on entry; the PIC macros below overwrite it.  */
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)	/* The trailing CFI_PUSH restores the unwind state for code placed after the ret.  */
+# define PARMS		8		/* Preserve EBX.  */
+# define JMPTBL(I, B)	I - B	/* Table entries are offsets relative to the table start.  */
+
+/* Load the entry indexed by ECX from a jump table into EBX and branch to it.
+   TABLE is a jump table with relative offsets.  ECX is also added to EDX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    /* We first load PC into EBX.  */				\
+    SETUP_PIC_REG(bx);						\
+    /* Get the address of the jump table.  */			\
+    add		$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    add		(%ebx,%ecx,4), %ebx;				\
+    add		%ecx, %edx;					\
+    /* We loaded the jump table and adjusted EDX. Go.  */	\
+    jmp		*%ebx
+#else
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define PARMS		4	/* Nothing is pushed on entry in this configuration.  */
+# define JMPTBL(I, B)	I	/* Table entries are absolute addresses.  */
+
+/* Branch to the entry indexed by ECX in a jump table.  TABLE is a jump
+   table with absolute offsets.  ECX is also added to EDX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    add		%ecx, %edx;					\
+    jmp		*TABLE(,%ecx,4)
+#endif
+
+	.section .text.sse2,"ax",@progbits
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk_sse2_rep)	/* Checking entry point; no ret here, so it falls through into __memset_sse2_rep below.  */
+	movl	12(%esp), %eax	/* EAX = third stack argument (presumably the length - confirm).  */
+	cmpl	%eax, 16(%esp)	/* Compare the fourth stack argument (presumably the destination size) with it.  */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* Fourth < third (unsigned): go to __chk_fail.  */
+END (__memset_chk_sse2_rep)
+#endif
+ENTRY (__memset_sse2_rep)
+	ENTRANCE
+
+	movl	LEN(%esp), %ecx	/* ECX = byte count.  */
+#ifdef USE_AS_BZERO
+	xor	%eax, %eax	/* Fill pattern is zero.  */
+#else
+	movzbl	CHR(%esp), %eax
+	movb	%al, %ah	/* AX = fill byte twice.  */
+	/* Fill the whole EAX with pattern.  */
+	movl	%eax, %edx
+	shl	$16, %eax
+	or	%edx, %eax	/* EAX = fill byte four times.  */
+#endif
+	movl	DEST(%esp), %edx	/* EDX = destination.  */
+	cmp	$32, %ecx
+	jae	L(32bytesormore)
+
+L(write_less32bytes):	/* ECX < 32: EDX += ECX, then jump to the L(write_Nbytes) entry for N = ECX.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
+
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_less_32bytes):	/* Jump table indexed by byte count 0..31; entry N targets L(write_Nbytes).  */
+	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
+	.popsection
+
+	ALIGN (4)
+L(write_28bytes):	/* Counts that are multiples of 4: EDX = end of the region; each label stores 4 bytes and falls through.  */
+	movl	%eax, -28(%edx)
+L(write_24bytes):
+	movl	%eax, -24(%edx)
+L(write_20bytes):
+	movl	%eax, -20(%edx)
+L(write_16bytes):
+	movl	%eax, -16(%edx)
+L(write_12bytes):
+	movl	%eax, -12(%edx)
+L(write_8bytes):
+	movl	%eax, -8(%edx)
+L(write_4bytes):
+	movl	%eax, -4(%edx)
+L(write_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_29bytes):	/* Counts of the form 4k + 1: dword stores relative to the end pointer EDX, then one final byte.  */
+	movl	%eax, -29(%edx)
+L(write_25bytes):
+	movl	%eax, -25(%edx)
+L(write_21bytes):
+	movl	%eax, -21(%edx)
+L(write_17bytes):
+	movl	%eax, -17(%edx)
+L(write_13bytes):
+	movl	%eax, -13(%edx)
+L(write_9bytes):
+	movl	%eax, -9(%edx)
+L(write_5bytes):
+	movl	%eax, -5(%edx)
+L(write_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_30bytes):	/* Counts of the form 4k + 2: dword stores relative to the end pointer EDX, then one final word.  */
+	movl	%eax, -30(%edx)
+L(write_26bytes):
+	movl	%eax, -26(%edx)
+L(write_22bytes):
+	movl	%eax, -22(%edx)
+L(write_18bytes):
+	movl	%eax, -18(%edx)
+L(write_14bytes):
+	movl	%eax, -14(%edx)
+L(write_10bytes):
+	movl	%eax, -10(%edx)
+L(write_6bytes):
+	movl	%eax, -6(%edx)
+L(write_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_31bytes):	/* Counts of the form 4k + 3: dword stores relative to the end pointer EDX, then a word and a byte.  */
+	movl	%eax, -31(%edx)
+L(write_27bytes):
+	movl	%eax, -27(%edx)
+L(write_23bytes):
+	movl	%eax, -23(%edx)
+L(write_19bytes):
+	movl	%eax, -19(%edx)
+L(write_15bytes):
+	movl	%eax, -15(%edx)
+L(write_11bytes):
+	movl	%eax, -11(%edx)
+L(write_7bytes):
+	movl	%eax, -7(%edx)
+L(write_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+/* ECX >= 32; EDX = DEST, whose alignment has not been checked yet.  */
+L(32bytesormore):
+	/* Fill xmm0 with the pattern.  */
+#ifdef USE_AS_BZERO
+	pxor	%xmm0, %xmm0
+#else
+	movd	%eax, %xmm0
+	pshufd	$0, %xmm0, %xmm0	/* Broadcast the 4-byte pattern to all 16 bytes.  */
+#endif
+	testl	$0xf, %edx
+	jz	L(aligned_16)
+/* ECX >= 32 and EDX is not 16 byte aligned.  */
+L(not_aligned_16):
+	movdqu	%xmm0, (%edx)	/* Unaligned store covers the bytes up to the next 16-byte boundary.  */
+	movl	%edx, %eax
+	and	$-16, %edx
+	add	$16, %edx	/* EDX = next 16-byte boundary.  */
+	sub	%edx, %eax	/* EAX = -(bytes skipped to reach the boundary).  */
+	add	%eax, %ecx	/* ECX = bytes remaining from the boundary on.  */
+	movd	%xmm0, %eax	/* Restore the 4-byte pattern in EAX.  */
+
+	ALIGN (4)
+L(aligned_16):	/* EDX is 16-byte aligned; ECX = bytes remaining.  */
+	cmp	$128, %ecx
+	jae	L(128bytesormore)
+
+L(aligned_16_less128bytes):	/* ECX < 128: EDX += ECX, then jump to the L(aligned_16_Nbytes) entry for N = ECX.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	ALIGN (4)
+L(128bytesormore):	/* ECX >= 128, EDX 16-byte aligned: pick between the SSE2 store loop and rep stosl.  */
+	PUSH (%edi)
+#ifdef DATA_CACHE_SIZE
+	PUSH (%ebx)
+	mov	$DATA_CACHE_SIZE, %ebx
+#else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)	/* No push here: under SHARED, ENTRANCE has already saved EBX.  */
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_data_cache_size@GOTOFF(%ebx), %ebx
+# else
+	PUSH (%ebx)
+	mov	__x86_data_cache_size, %ebx
+# endif
+#endif
+	mov	%ebx, %edi
+	shr	$4, %ebx
+	sub	%ebx, %edi	/* EDI = cache size - cache size / 16.  */
+#if defined DATA_CACHE_SIZE || !defined SHARED
+	POP (%ebx)
+#endif
+/*
+ * When the data size approximates the end of the L1 cache,
+ * fast string operations will prefetch and combine data efficiently.
+ */
+	cmp	%edi, %ecx
+	jae	L(128bytesormore_endof_L1)	/* ECX >= threshold: use rep stosl.  */
+	subl	$128, %ecx	/* Bias the count by 128; undone at L(128bytesless_normal).  */
+L(128bytesormore_normal):	/* Store loop, unrolled twice: 128 bytes of aligned stores per step.  */
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx	/* lea keeps the flags of the sub above for the branch below.  */
+	jb	L(128bytesless_normal)
+
+
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx	/* lea keeps the flags of the sub above for the branch below.  */
+	jae	L(128bytesormore_normal)
+
+L(128bytesless_normal):
+	POP (%edi)
+	add	$128, %ecx	/* Remove the bias: ECX = remaining bytes (fewer than 128).  */
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	CFI_PUSH (%edi)	/* Unwind info only: the code that follows is entered with EDI still pushed.  */
+	ALIGN (4)
+L(128bytesormore_endof_L1):	/* Large fill with rep stosl: EDX = destination, ECX = byte count, EAX = pattern; EDI is on the stack.  */
+	mov	%edx, %edi	/* EDI = store pointer for stosl.  */
+	mov	%ecx, %edx
+	shr	$2, %ecx	/* ECX = number of dwords.  */
+	and	$3, %edx	/* EDX = trailing bytes (0..3); sets ZF for the jz below.  */
+	rep stosl	/* Leaves the flags untouched.  */
+	jz	L(copy_page_by_rep_exit)	/* No trailing bytes.  */
+	cmp	$2, %edx
+	jb	L(copy_page_by_rep_left_1)	/* Exactly one byte left.  */
+	movw	%ax, (%edi)
+	add	$2, %edi
+	sub	$2, %edx
+	jz	L(copy_page_by_rep_exit)
+L(copy_page_by_rep_left_1):
+	movb	%al, (%edi)
+L(copy_page_by_rep_exit):
+	POP (%edi)
+	SETRTNVAL
+	RETURN
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_16_128bytes):	/* Jump table indexed by byte count 0..127; entry N targets L(aligned_16_Nbytes).  */
+	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
+	.popsection
+
+	ALIGN (4)
+L(aligned_16_112bytes):	/* Counts that are multiples of 16: EDX = end of the region; each label stores 16 bytes and falls through.  */
+	movdqa	%xmm0, -112(%edx)
+L(aligned_16_96bytes):
+	movdqa	%xmm0, -96(%edx)
+L(aligned_16_80bytes):
+	movdqa	%xmm0, -80(%edx)
+L(aligned_16_64bytes):
+	movdqa	%xmm0, -64(%edx)
+L(aligned_16_48bytes):
+	movdqa	%xmm0, -48(%edx)
+L(aligned_16_32bytes):
+	movdqa	%xmm0, -32(%edx)
+L(aligned_16_16bytes):
+	movdqa	%xmm0, -16(%edx)
+L(aligned_16_0bytes):
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_113bytes):	/* Counts of the form 16k + 1: EDX = aligned start + count, so each movdqa target is 16-byte aligned; one byte finishes.  */
+	movdqa	%xmm0, -113(%edx)
+L(aligned_16_97bytes):
+	movdqa	%xmm0, -97(%edx)
+L(aligned_16_81bytes):
+	movdqa	%xmm0, -81(%edx)
+L(aligned_16_65bytes):
+	movdqa	%xmm0, -65(%edx)
+L(aligned_16_49bytes):
+	movdqa	%xmm0, -49(%edx)
+L(aligned_16_33bytes):
+	movdqa	%xmm0, -33(%edx)
+L(aligned_16_17bytes):
+	movdqa	%xmm0, -17(%edx)
+L(aligned_16_1bytes):
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_114bytes):	/* Counts of the form 16k + 2: aligned 16-byte stores relative to the end pointer EDX, then one word.  */
+	movdqa	%xmm0, -114(%edx)
+L(aligned_16_98bytes):
+	movdqa	%xmm0, -98(%edx)
+L(aligned_16_82bytes):
+	movdqa	%xmm0, -82(%edx)
+L(aligned_16_66bytes):
+	movdqa	%xmm0, -66(%edx)
+L(aligned_16_50bytes):
+	movdqa	%xmm0, -50(%edx)
+L(aligned_16_34bytes):
+	movdqa	%xmm0, -34(%edx)
+L(aligned_16_18bytes):
+	movdqa	%xmm0, -18(%edx)
+L(aligned_16_2bytes):
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_115bytes):	/* Counts of the form 16k + 3: aligned 16-byte stores relative to the end pointer EDX, then a word and a byte.  */
+	movdqa	%xmm0, -115(%edx)
+L(aligned_16_99bytes):
+	movdqa	%xmm0, -99(%edx)
+L(aligned_16_83bytes):
+	movdqa	%xmm0, -83(%edx)
+L(aligned_16_67bytes):
+	movdqa	%xmm0, -67(%edx)
+L(aligned_16_51bytes):
+	movdqa	%xmm0, -51(%edx)
+L(aligned_16_35bytes):
+	movdqa	%xmm0, -35(%edx)
+L(aligned_16_19bytes):
+	movdqa	%xmm0, -19(%edx)
+L(aligned_16_3bytes):
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_116bytes):	/* Counts of the form 16k + 4: aligned 16-byte stores relative to the end pointer EDX, then one dword.  */
+	movdqa	%xmm0, -116(%edx)
+L(aligned_16_100bytes):
+	movdqa	%xmm0, -100(%edx)
+L(aligned_16_84bytes):
+	movdqa	%xmm0, -84(%edx)
+L(aligned_16_68bytes):
+	movdqa	%xmm0, -68(%edx)
+L(aligned_16_52bytes):
+	movdqa	%xmm0, -52(%edx)
+L(aligned_16_36bytes):
+	movdqa	%xmm0, -36(%edx)
+L(aligned_16_20bytes):
+	movdqa	%xmm0, -20(%edx)
+L(aligned_16_4bytes):
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_117bytes):
+	movdqa	%xmm0, -117(%edx)
+L(aligned_16_101bytes):
+	movdqa	%xmm0, -101(%edx)
+L(aligned_16_85bytes):
+	movdqa	%xmm0, -85(%edx)
+L(aligned_16_69bytes):
+	movdqa	%xmm0, -69(%edx)
+L(aligned_16_53bytes):
+	movdqa	%xmm0, -53(%edx)
+L(aligned_16_37bytes):
+	movdqa	%xmm0, -37(%edx)
+L(aligned_16_21bytes):
+	movdqa	%xmm0, -21(%edx)
+L(aligned_16_5bytes):
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_118bytes):
+	movdqa	%xmm0, -118(%edx)
+L(aligned_16_102bytes):
+	movdqa	%xmm0, -102(%edx)
+L(aligned_16_86bytes):
+	movdqa	%xmm0, -86(%edx)
+L(aligned_16_70bytes):
+	movdqa	%xmm0, -70(%edx)
+L(aligned_16_54bytes):
+	movdqa	%xmm0, -54(%edx)
+L(aligned_16_38bytes):
+	movdqa	%xmm0, -38(%edx)
+L(aligned_16_22bytes):
+	movdqa	%xmm0, -22(%edx)
+L(aligned_16_6bytes):
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_119bytes):
+	movdqa	%xmm0, -119(%edx)
+L(aligned_16_103bytes):
+	movdqa	%xmm0, -103(%edx)
+L(aligned_16_87bytes):
+	movdqa	%xmm0, -87(%edx)
+L(aligned_16_71bytes):
+	movdqa	%xmm0, -71(%edx)
+L(aligned_16_55bytes):
+	movdqa	%xmm0, -55(%edx)
+L(aligned_16_39bytes):
+	movdqa	%xmm0, -39(%edx)
+L(aligned_16_23bytes):
+	movdqa	%xmm0, -23(%edx)
+L(aligned_16_7bytes):
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_120bytes):
+	movdqa	%xmm0, -120(%edx)
+L(aligned_16_104bytes):
+	movdqa	%xmm0, -104(%edx)
+L(aligned_16_88bytes):
+	movdqa	%xmm0, -88(%edx)
+L(aligned_16_72bytes):
+	movdqa	%xmm0, -72(%edx)
+L(aligned_16_56bytes):
+	movdqa	%xmm0, -56(%edx)
+L(aligned_16_40bytes):
+	movdqa	%xmm0, -40(%edx)
+L(aligned_16_24bytes):
+	movdqa	%xmm0, -24(%edx)
+L(aligned_16_8bytes):
+	movq	%xmm0, -8(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_121bytes):
+	movdqa	%xmm0, -121(%edx)
+L(aligned_16_105bytes):
+	movdqa	%xmm0, -105(%edx)
+L(aligned_16_89bytes):
+	movdqa	%xmm0, -89(%edx)
+L(aligned_16_73bytes):
+	movdqa	%xmm0, -73(%edx)
+L(aligned_16_57bytes):
+	movdqa	%xmm0, -57(%edx)
+L(aligned_16_41bytes):
+	movdqa	%xmm0, -41(%edx)
+L(aligned_16_25bytes):
+	movdqa	%xmm0, -25(%edx)
+L(aligned_16_9bytes):
+	movq	%xmm0, -9(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_122bytes):
+	movdqa	%xmm0, -122(%edx)
+L(aligned_16_106bytes):
+	movdqa	%xmm0, -106(%edx)
+L(aligned_16_90bytes):
+	movdqa	%xmm0, -90(%edx)
+L(aligned_16_74bytes):
+	movdqa	%xmm0, -74(%edx)
+L(aligned_16_58bytes):
+	movdqa	%xmm0, -58(%edx)
+L(aligned_16_42bytes):
+	movdqa	%xmm0, -42(%edx)
+L(aligned_16_26bytes):
+	movdqa	%xmm0, -26(%edx)
+L(aligned_16_10bytes):
+	movq	%xmm0, -10(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_123bytes):
+	movdqa	%xmm0, -123(%edx)
+L(aligned_16_107bytes):
+	movdqa	%xmm0, -107(%edx)
+L(aligned_16_91bytes):
+	movdqa	%xmm0, -91(%edx)
+L(aligned_16_75bytes):
+	movdqa	%xmm0, -75(%edx)
+L(aligned_16_59bytes):
+	movdqa	%xmm0, -59(%edx)
+L(aligned_16_43bytes):
+	movdqa	%xmm0, -43(%edx)
+L(aligned_16_27bytes):
+	movdqa	%xmm0, -27(%edx)
+L(aligned_16_11bytes):
+	movq	%xmm0, -11(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_124bytes):
+	movdqa	%xmm0, -124(%edx)
+L(aligned_16_108bytes):
+	movdqa	%xmm0, -108(%edx)
+L(aligned_16_92bytes):
+	movdqa	%xmm0, -92(%edx)
+L(aligned_16_76bytes):
+	movdqa	%xmm0, -76(%edx)
+L(aligned_16_60bytes):
+	movdqa	%xmm0, -60(%edx)
+L(aligned_16_44bytes):
+	movdqa	%xmm0, -44(%edx)
+L(aligned_16_28bytes):
+	movdqa	%xmm0, -28(%edx)
+L(aligned_16_12bytes):
+	movq	%xmm0, -12(%edx)
+	movl	%eax, -4(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_125bytes):
+	movdqa	%xmm0, -125(%edx)
+L(aligned_16_109bytes):
+	movdqa	%xmm0, -109(%edx)
+L(aligned_16_93bytes):
+	movdqa	%xmm0, -93(%edx)
+L(aligned_16_77bytes):
+	movdqa	%xmm0, -77(%edx)
+L(aligned_16_61bytes):
+	movdqa	%xmm0, -61(%edx)
+L(aligned_16_45bytes):
+	movdqa	%xmm0, -45(%edx)
+L(aligned_16_29bytes):
+	movdqa	%xmm0, -29(%edx)
+L(aligned_16_13bytes):
+	movq	%xmm0, -13(%edx)
+	movl	%eax, -5(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_126bytes):
+	movdqa	%xmm0, -126(%edx)
+L(aligned_16_110bytes):
+	movdqa	%xmm0, -110(%edx)
+L(aligned_16_94bytes):
+	movdqa	%xmm0, -94(%edx)
+L(aligned_16_78bytes):
+	movdqa	%xmm0, -78(%edx)
+L(aligned_16_62bytes):
+	movdqa	%xmm0, -62(%edx)
+L(aligned_16_46bytes):
+	movdqa	%xmm0, -46(%edx)
+L(aligned_16_30bytes):
+	movdqa	%xmm0, -30(%edx)
+L(aligned_16_14bytes):
+	movq	%xmm0, -14(%edx)
+	movl	%eax, -6(%edx)
+	movw	%ax, -2(%edx)
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_127bytes):
+	movdqa	%xmm0, -127(%edx)
+L(aligned_16_111bytes):
+	movdqa	%xmm0, -111(%edx)
+L(aligned_16_95bytes):
+	movdqa	%xmm0, -95(%edx)
+L(aligned_16_79bytes):
+	movdqa	%xmm0, -79(%edx)
+L(aligned_16_63bytes):
+	movdqa	%xmm0, -63(%edx)
+L(aligned_16_47bytes):
+	movdqa	%xmm0, -47(%edx)
+L(aligned_16_31bytes):
+	movdqa	%xmm0, -31(%edx)
+L(aligned_16_15bytes):
+	movq	%xmm0, -15(%edx)
+	movl	%eax, -7(%edx)
+	movw	%ax, -3(%edx)
+	movb	%al, -1(%edx)
+	SETRTNVAL
+	RETURN_END
+
+END (__memset_sse2_rep)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S
new file mode 100644
index 0000000000..d7b8be9114
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S
@@ -0,0 +1,860 @@
+/* memset with SSE2
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)	/* Push REG and record the new 4-byte slot in the unwind info.  */
+#define POP(REG)	popl REG; CFI_POP (REG)	/* Pop REG and drop the matching unwind annotation.  */
+
+#ifdef USE_AS_BZERO
+# define DEST		PARMS	/* Stack offset of the destination pointer.  */
+# define LEN		DEST+4	/* Stack offset of the byte count; this build has no fill-byte argument.  */
+# define SETRTNVAL	/* Expands to nothing in the USE_AS_BZERO build.  */
+#else
+# define DEST		PARMS	/* Stack offset of the destination pointer.  */
+# define CHR		DEST+4	/* Stack offset of the fill byte.  */
+# define LEN		CHR+4	/* Stack offset of the byte count.  */
+# define SETRTNVAL	movl DEST(%esp), %eax	/* Reload the DEST argument into EAX just before returning.  */
+#endif
+
+#ifdef SHARED
+# define ENTRANCE	PUSH (%ebx);	/* Save EBX: BRANCH_TO_JMPTBL_ENTRY below clobbers it for PIC addressing.  */
+# define RETURN_END	POP (%ebx); ret	/* Final return: restore EBX and leave.  */
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)	/* Return, then re-declare EBX as pushed for the code that follows.  */
+# define PARMS		8		/* Preserve EBX.  */
+# define JMPTBL(I, B)	I - B	/* Table entries are offsets relative to the table base B.  */
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.   */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    /* We first load PC into EBX.  */				\
+    SETUP_PIC_REG(bx);						\
+    /* Get the address of the jump table.  */			\
+    add		$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    add		(%ebx,%ecx,4), %ebx;				\
+    add		%ecx, %edx;					\
+    /* We loaded the jump table and adjusted EDX. Go.  */	\
+    jmp		*%ebx
+#else
+# define ENTRANCE	/* Nothing to save in the non-SHARED build.  */
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define PARMS		4	/* Arguments start right above the return address.  */
+# define JMPTBL(I, B)	I	/* Table entries are absolute addresses.  */
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
+    add		%ecx, %edx;					\
+    jmp		*TABLE(,%ecx,4)
+#endif
+
+	.section .text.sse2,"ax",@progbits
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk_sse2)	/* Checked entry; EBX is not pushed yet, so arguments sit 4 bytes below their PARMS-based offsets.  */
+	movl	12(%esp), %eax	/* EAX = byte count (LEN - 4 at this point).  */
+	cmpl	%eax, 16(%esp)	/* Fourth argument vs. count; presumably the destination object size -- confirm against callers.  */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* Fourth argument < count (unsigned): go to __chk_fail.  */
+END (__memset_chk_sse2)	/* No ret here: execution continues into the __memset_sse2 entry that follows.  */
+#endif
+ENTRY (__memset_sse2)
+	ENTRANCE	/* Pushes EBX in SHARED builds; empty otherwise.  */
+
+	movl	LEN(%esp), %ecx	/* ECX = number of bytes to set.  */
+#ifdef USE_AS_BZERO
+	xor	%eax, %eax	/* This build always fills with zero.  */
+#else
+	movzbl	CHR(%esp), %eax	/* EAX = fill byte, zero-extended.  */
+	movb	%al, %ah	/* AX = fill byte repeated twice.  */
+	/* Fill the whole EAX with pattern.  */
+	movl	%eax, %edx
+	shl	$16, %eax
+	or	%edx, %eax	/* EAX = fill byte repeated four times.  */
+#endif
+	movl	DEST(%esp), %edx	/* EDX = start of the destination.  */
+	cmp	$32, %ecx
+	jae	L(32bytesormore)	/* 32 bytes or more (unsigned compare): use the SSE2 path.  */
+
+L(write_less32bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))	/* ECX (0..31) selects the entry; the macro also adds ECX to EDX.  */
+
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_less_32bytes):	/* Indexed by the byte count 0..31; entry N targets L(write_Nbytes).  */
+	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
+	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
+	.popsection
+
+	ALIGN (4)
+L(write_28bytes):	/* Counts 0..28 divisible by 4.  EDX is one past the end; each label stores one dword and falls through.  */
+	movl	%eax, -28(%edx)
+L(write_24bytes):
+	movl	%eax, -24(%edx)
+L(write_20bytes):
+	movl	%eax, -20(%edx)
+L(write_16bytes):
+	movl	%eax, -16(%edx)
+L(write_12bytes):
+	movl	%eax, -12(%edx)
+L(write_8bytes):
+	movl	%eax, -8(%edx)
+L(write_4bytes):
+	movl	%eax, -4(%edx)
+L(write_0bytes):	/* Nothing left to store.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_29bytes):	/* Counts 1..29 with N % 4 == 1: dword stores from EDX - N upwards, then one byte.  */
+	movl	%eax, -29(%edx)
+L(write_25bytes):
+	movl	%eax, -25(%edx)
+L(write_21bytes):
+	movl	%eax, -21(%edx)
+L(write_17bytes):
+	movl	%eax, -17(%edx)
+L(write_13bytes):
+	movl	%eax, -13(%edx)
+L(write_9bytes):
+	movl	%eax, -9(%edx)
+L(write_5bytes):
+	movl	%eax, -5(%edx)
+L(write_1bytes):
+	movb	%al, -1(%edx)	/* Final byte.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_30bytes):	/* Counts 2..30 with N % 4 == 2: dword stores from EDX - N upwards, then one word.  */
+	movl	%eax, -30(%edx)
+L(write_26bytes):
+	movl	%eax, -26(%edx)
+L(write_22bytes):
+	movl	%eax, -22(%edx)
+L(write_18bytes):
+	movl	%eax, -18(%edx)
+L(write_14bytes):
+	movl	%eax, -14(%edx)
+L(write_10bytes):
+	movl	%eax, -10(%edx)
+L(write_6bytes):
+	movl	%eax, -6(%edx)
+L(write_2bytes):
+	movw	%ax, -2(%edx)	/* Final two bytes.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(write_31bytes):	/* Counts 3..31 with N % 4 == 3: dword stores from EDX - N upwards, then word + byte.  */
+	movl	%eax, -31(%edx)
+L(write_27bytes):
+	movl	%eax, -27(%edx)
+L(write_23bytes):
+	movl	%eax, -23(%edx)
+L(write_19bytes):
+	movl	%eax, -19(%edx)
+L(write_15bytes):
+	movl	%eax, -15(%edx)
+L(write_11bytes):
+	movl	%eax, -11(%edx)
+L(write_7bytes):
+	movl	%eax, -7(%edx)
+L(write_3bytes):
+	movw	%ax, -3(%edx)	/* Final three bytes: word ...  */
+	movb	%al, -1(%edx)	/* ... plus byte.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+/* ECX >= 32; the alignment of EDX has not been checked yet.  */
+L(32bytesormore):
+	/* Fill xmm0 with the pattern.  */
+#ifdef USE_AS_BZERO
+	pxor	%xmm0, %xmm0
+#else
+	movd	%eax, %xmm0
+	pshufd	$0, %xmm0, %xmm0	/* Copy the low dword into all four lanes.  */
+#endif
+	testl	$0xf, %edx
+	jz	L(aligned_16)
+/* ECX >= 32 and EDX is not 16 byte aligned.  */
+L(not_aligned_16):
+	movdqu	%xmm0, (%edx)	/* One unaligned store covers everything up to the next 16-byte boundary.  */
+	movl	%edx, %eax
+	and	$-16, %edx
+	add	$16, %edx	/* EDX = first 16-byte boundary above the original start.  */
+	sub	%edx, %eax	/* EAX = old start - new start (negative).  */
+	add	%eax, %ecx	/* Remove the bytes just written from the count.  */
+	movd	%xmm0, %eax	/* Put the fill pattern back in EAX for the scalar tail stores.  */
+
+	ALIGN (4)
+L(aligned_16):	/* EDX is 16-byte aligned on both ways in (jz above, or fall-through after rounding up).  */
+	cmp	$128, %ecx
+	jae	L(128bytesormore)
+
+L(aligned_16_less128bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))	/* ECX < 128: finish through the tail table.  */
+
+	ALIGN (4)
+L(128bytesormore):	/* ECX >= 128, EDX 16-byte aligned: pick a loop by comparing the length with the cache sizes.  */
+#ifdef SHARED_CACHE_SIZE
+	PUSH (%ebx)	/* NOTE(review): this PUSH is keyed on SHARED_CACHE_SIZE but the matching POPs are keyed on DATA_CACHE_SIZE; presumably both are always defined together -- confirm.  */
+	mov	$SHARED_CACHE_SIZE, %ebx
+#else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)	/* No PUSH here: ENTRANCE already saved EBX in SHARED builds.  */
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
+# else
+	PUSH (%ebx)	/* Non-SHARED ENTRANCE is empty, so EBX must be saved before use.  */
+	mov	__x86_shared_cache_size, %ebx
+# endif
+#endif
+	cmp	%ebx, %ecx
+	jae	L(128bytesormore_nt_start)	/* Length >= shared cache size: take the path that ends with non-temporal stores.  */
+
+
+#ifdef DATA_CACHE_SIZE
+	POP (%ebx)
+# define RESTORE_EBX_STATE CFI_PUSH (%ebx)	/* CFI only: L(128bytesormore_nt_start) is reached with EBX still pushed.  */
+	cmp	$DATA_CACHE_SIZE, %ecx
+#else
+# ifdef SHARED
+#  define RESTORE_EBX_STATE	/* Nothing was popped here, so no CFI fix-up is needed.  */
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
+# else
+	POP (%ebx)
+#  define RESTORE_EBX_STATE CFI_PUSH (%ebx)	/* CFI only: L(128bytesormore_nt_start) is reached with EBX still pushed.  */
+	cmp	__x86_data_cache_size, %ecx
+# endif
+#endif
+
+	jae	L(128bytes_L2_normal)	/* Length >= data cache size: use the prefetching loop.  */
+	subl	$128, %ecx	/* Bias ECX by -128 for the borrow-driven loop that follows.  */
+L(128bytesormore_normal):	/* Here ECX = bytes remaining - 128; the loop body is unrolled twice (2 x 128 bytes).  */
+	sub	$128, %ecx	/* CF set when fewer than 128 bytes will remain after this 128-byte store.  */
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx	/* lea keeps the flags from the sub above intact for the jb.  */
+	jb	L(128bytesless_normal)
+
+
+	sub	$128, %ecx	/* Same test for the second 128-byte store.  */
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	lea	128(%edx), %edx
+	jae	L(128bytesormore_normal)	/* No borrow: at least 128 more bytes remain.  */
+
+L(128bytesless_normal):
+	add	$128, %ecx	/* Undo the bias: ECX = true remainder (0..127).  */
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+	ALIGN (4)
+L(128bytes_L2_normal):	/* Loop for lengths >= data cache size: prefetch ahead, then store 128 bytes per iteration.  */
+	prefetcht0	0x380(%edx)	/* Prefetch 0x380 and 0x3c0 bytes ahead of the current store position.  */
+	prefetcht0	0x3c0(%edx)
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movaps	%xmm0, 0x10(%edx)
+	movaps	%xmm0, 0x20(%edx)
+	movaps	%xmm0, 0x30(%edx)
+	movaps	%xmm0, 0x40(%edx)
+	movaps	%xmm0, 0x50(%edx)
+	movaps	%xmm0, 0x60(%edx)
+	movaps	%xmm0, 0x70(%edx)
+	add	$128, %edx
+	cmp	$128, %ecx
+	jae	L(128bytes_L2_normal)	/* Repeat while at least 128 bytes remain.  */
+
+L(128bytesless_L2_normal):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))	/* ECX < 128: finish through the tail table.  */
+
+	RESTORE_EBX_STATE
+L(128bytesormore_nt_start):	/* Reached with EBX = shared cache size and ECX >= EBX.  */
+	sub	%ebx, %ecx	/* ECX = bytes left after the first EBX bytes.  */
+	ALIGN (4)
+L(128bytesormore_shared_cache_loop):	/* Write EBX bytes with ordinary stores.  NOTE(review): EBX is decremented before it is tested and any remainder below 0x80 is dropped, so this assumes the cache size is a non-zero multiple of 128 -- confirm where it is initialised.  */
+	prefetcht0	0x3c0(%edx)
+	prefetcht0	0x380(%edx)
+	sub	$0x80, %ebx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm0, 0x10(%edx)
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm0, 0x30(%edx)
+	movdqa	%xmm0, 0x40(%edx)
+	movdqa	%xmm0, 0x50(%edx)
+	movdqa	%xmm0, 0x60(%edx)
+	movdqa	%xmm0, 0x70(%edx)
+	add	$0x80, %edx
+	cmp	$0x80, %ebx
+	jae	L(128bytesormore_shared_cache_loop)
+	cmp	$0x80, %ecx
+	jb	L(shared_cache_loop_end)	/* Fewer than 128 bytes left: skip the non-temporal loop.  */
+	ALIGN (4)
+L(128bytesormore_nt):	/* Remaining full 128-byte blocks use non-temporal stores.  */
+	sub	$0x80, %ecx
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm0, 0x10(%edx)
+	movntdq	%xmm0, 0x20(%edx)
+	movntdq	%xmm0, 0x30(%edx)
+	movntdq	%xmm0, 0x40(%edx)
+	movntdq	%xmm0, 0x50(%edx)
+	movntdq	%xmm0, 0x60(%edx)
+	movntdq	%xmm0, 0x70(%edx)
+	add	$0x80, %edx
+	cmp	$0x80, %ecx
+	jae	L(128bytesormore_nt)
+	sfence	/* Order the non-temporal stores before the stores that follow.  */
+L(shared_cache_loop_end):
+#if defined DATA_CACHE_SIZE || !defined SHARED
+	POP (%ebx)	/* Undo the PUSH made in L(128bytesormore) for these configurations.  */
+#endif
+	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))	/* ECX < 128: finish through the tail table.  */
+
+
+	.pushsection .rodata.sse2,"a",@progbits
+	ALIGN (2)
+L(table_16_128bytes):	/* Indexed by the remaining byte count 0..127; entry N targets L(aligned_16_Nbytes).  */
+	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
+	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
+	.popsection
+
+	ALIGN (4)
+L(aligned_16_112bytes):	/* Tail counts divisible by 16.  EDX is one past the end and EDX - N is 16-byte aligned; each label does one aligned store and falls through.  */
+	movdqa	%xmm0, -112(%edx)
+L(aligned_16_96bytes):
+	movdqa	%xmm0, -96(%edx)
+L(aligned_16_80bytes):
+	movdqa	%xmm0, -80(%edx)
+L(aligned_16_64bytes):
+	movdqa	%xmm0, -64(%edx)
+L(aligned_16_48bytes):
+	movdqa	%xmm0, -48(%edx)
+L(aligned_16_32bytes):
+	movdqa	%xmm0, -32(%edx)
+L(aligned_16_16bytes):
+	movdqa	%xmm0, -16(%edx)
+L(aligned_16_0bytes):	/* Nothing left to store.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_113bytes):	/* Tail counts with N % 16 == 1: aligned 16-byte stores starting at EDX - N, then one byte.  */
+	movdqa	%xmm0, -113(%edx)
+L(aligned_16_97bytes):
+	movdqa	%xmm0, -97(%edx)
+L(aligned_16_81bytes):
+	movdqa	%xmm0, -81(%edx)
+L(aligned_16_65bytes):
+	movdqa	%xmm0, -65(%edx)
+L(aligned_16_49bytes):
+	movdqa	%xmm0, -49(%edx)
+L(aligned_16_33bytes):
+	movdqa	%xmm0, -33(%edx)
+L(aligned_16_17bytes):
+	movdqa	%xmm0, -17(%edx)
+L(aligned_16_1bytes):
+	movb	%al, -1(%edx)	/* Last byte.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_114bytes):	/* Tail counts with N % 16 == 2: aligned 16-byte stores starting at EDX - N, then one word.  */
+	movdqa	%xmm0, -114(%edx)
+L(aligned_16_98bytes):
+	movdqa	%xmm0, -98(%edx)
+L(aligned_16_82bytes):
+	movdqa	%xmm0, -82(%edx)
+L(aligned_16_66bytes):
+	movdqa	%xmm0, -66(%edx)
+L(aligned_16_50bytes):
+	movdqa	%xmm0, -50(%edx)
+L(aligned_16_34bytes):
+	movdqa	%xmm0, -34(%edx)
+L(aligned_16_18bytes):
+	movdqa	%xmm0, -18(%edx)
+L(aligned_16_2bytes):
+	movw	%ax, -2(%edx)	/* Last 2 bytes.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_115bytes):	/* Tail counts with N % 16 == 3: aligned 16-byte stores starting at EDX - N, then word + byte.  */
+	movdqa	%xmm0, -115(%edx)
+L(aligned_16_99bytes):
+	movdqa	%xmm0, -99(%edx)
+L(aligned_16_83bytes):
+	movdqa	%xmm0, -83(%edx)
+L(aligned_16_67bytes):
+	movdqa	%xmm0, -67(%edx)
+L(aligned_16_51bytes):
+	movdqa	%xmm0, -51(%edx)
+L(aligned_16_35bytes):
+	movdqa	%xmm0, -35(%edx)
+L(aligned_16_19bytes):
+	movdqa	%xmm0, -19(%edx)
+L(aligned_16_3bytes):
+	movw	%ax, -3(%edx)	/* Last 3 bytes: word ...  */
+	movb	%al, -1(%edx)	/* ... plus byte.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_116bytes):	/* Tail counts with N % 16 == 4: aligned 16-byte stores starting at EDX - N, then one dword.  */
+	movdqa	%xmm0, -116(%edx)
+L(aligned_16_100bytes):
+	movdqa	%xmm0, -100(%edx)
+L(aligned_16_84bytes):
+	movdqa	%xmm0, -84(%edx)
+L(aligned_16_68bytes):
+	movdqa	%xmm0, -68(%edx)
+L(aligned_16_52bytes):
+	movdqa	%xmm0, -52(%edx)
+L(aligned_16_36bytes):
+	movdqa	%xmm0, -36(%edx)
+L(aligned_16_20bytes):
+	movdqa	%xmm0, -20(%edx)
+L(aligned_16_4bytes):
+	movl	%eax, -4(%edx)	/* Last 4 bytes.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_117bytes):	/* Tail counts with N % 16 == 5: aligned 16-byte stores starting at EDX - N, then dword + byte.  */
+	movdqa	%xmm0, -117(%edx)
+L(aligned_16_101bytes):
+	movdqa	%xmm0, -101(%edx)
+L(aligned_16_85bytes):
+	movdqa	%xmm0, -85(%edx)
+L(aligned_16_69bytes):
+	movdqa	%xmm0, -69(%edx)
+L(aligned_16_53bytes):
+	movdqa	%xmm0, -53(%edx)
+L(aligned_16_37bytes):
+	movdqa	%xmm0, -37(%edx)
+L(aligned_16_21bytes):
+	movdqa	%xmm0, -21(%edx)
+L(aligned_16_5bytes):
+	movl	%eax, -5(%edx)	/* Last 5 bytes: dword ...  */
+	movb	%al, -1(%edx)	/* ... plus byte.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_118bytes):	/* Tail counts with N % 16 == 6: aligned 16-byte stores starting at EDX - N, then dword + word.  */
+	movdqa	%xmm0, -118(%edx)
+L(aligned_16_102bytes):
+	movdqa	%xmm0, -102(%edx)
+L(aligned_16_86bytes):
+	movdqa	%xmm0, -86(%edx)
+L(aligned_16_70bytes):
+	movdqa	%xmm0, -70(%edx)
+L(aligned_16_54bytes):
+	movdqa	%xmm0, -54(%edx)
+L(aligned_16_38bytes):
+	movdqa	%xmm0, -38(%edx)
+L(aligned_16_22bytes):
+	movdqa	%xmm0, -22(%edx)
+L(aligned_16_6bytes):
+	movl	%eax, -6(%edx)	/* Last 6 bytes: dword ...  */
+	movw	%ax, -2(%edx)	/* ... plus word.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_119bytes):	/* Tail counts with N % 16 == 7: aligned 16-byte stores starting at EDX - N, then dword + word + byte.  */
+	movdqa	%xmm0, -119(%edx)
+L(aligned_16_103bytes):
+	movdqa	%xmm0, -103(%edx)
+L(aligned_16_87bytes):
+	movdqa	%xmm0, -87(%edx)
+L(aligned_16_71bytes):
+	movdqa	%xmm0, -71(%edx)
+L(aligned_16_55bytes):
+	movdqa	%xmm0, -55(%edx)
+L(aligned_16_39bytes):
+	movdqa	%xmm0, -39(%edx)
+L(aligned_16_23bytes):
+	movdqa	%xmm0, -23(%edx)
+L(aligned_16_7bytes):
+	movl	%eax, -7(%edx)	/* Last 7 bytes: dword ...  */
+	movw	%ax, -3(%edx)	/* ... word ...  */
+	movb	%al, -1(%edx)	/* ... and byte.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_120bytes):	/* Tail counts with N % 16 == 8: aligned 16-byte stores starting at EDX - N, then one 8-byte store.  */
+	movdqa	%xmm0, -120(%edx)
+L(aligned_16_104bytes):
+	movdqa	%xmm0, -104(%edx)
+L(aligned_16_88bytes):
+	movdqa	%xmm0, -88(%edx)
+L(aligned_16_72bytes):
+	movdqa	%xmm0, -72(%edx)
+L(aligned_16_56bytes):
+	movdqa	%xmm0, -56(%edx)
+L(aligned_16_40bytes):
+	movdqa	%xmm0, -40(%edx)
+L(aligned_16_24bytes):
+	movdqa	%xmm0, -24(%edx)
+L(aligned_16_8bytes):
+	movq	%xmm0, -8(%edx)	/* Last 8 bytes from the low half of XMM0.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_121bytes):	/* Tail counts with N % 16 == 9: aligned 16-byte stores starting at EDX - N, then qword + byte.  */
+	movdqa	%xmm0, -121(%edx)
+L(aligned_16_105bytes):
+	movdqa	%xmm0, -105(%edx)
+L(aligned_16_89bytes):
+	movdqa	%xmm0, -89(%edx)
+L(aligned_16_73bytes):
+	movdqa	%xmm0, -73(%edx)
+L(aligned_16_57bytes):
+	movdqa	%xmm0, -57(%edx)
+L(aligned_16_41bytes):
+	movdqa	%xmm0, -41(%edx)
+L(aligned_16_25bytes):
+	movdqa	%xmm0, -25(%edx)
+L(aligned_16_9bytes):
+	movq	%xmm0, -9(%edx)	/* Last 9 bytes: qword ...  */
+	movb	%al, -1(%edx)	/* ... plus byte.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_122bytes):	/* Tail counts with N % 16 == 10: aligned 16-byte stores starting at EDX - N, then qword + word.  */
+	movdqa	%xmm0, -122(%edx)
+L(aligned_16_106bytes):
+	movdqa	%xmm0, -106(%edx)
+L(aligned_16_90bytes):
+	movdqa	%xmm0, -90(%edx)
+L(aligned_16_74bytes):
+	movdqa	%xmm0, -74(%edx)
+L(aligned_16_58bytes):
+	movdqa	%xmm0, -58(%edx)
+L(aligned_16_42bytes):
+	movdqa	%xmm0, -42(%edx)
+L(aligned_16_26bytes):
+	movdqa	%xmm0, -26(%edx)
+L(aligned_16_10bytes):
+	movq	%xmm0, -10(%edx)	/* Last 10 bytes: qword ...  */
+	movw	%ax, -2(%edx)	/* ... plus word.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_123bytes):	/* Tail counts with N % 16 == 11: aligned 16-byte stores starting at EDX - N, then qword + word + byte.  */
+	movdqa	%xmm0, -123(%edx)
+L(aligned_16_107bytes):
+	movdqa	%xmm0, -107(%edx)
+L(aligned_16_91bytes):
+	movdqa	%xmm0, -91(%edx)
+L(aligned_16_75bytes):
+	movdqa	%xmm0, -75(%edx)
+L(aligned_16_59bytes):
+	movdqa	%xmm0, -59(%edx)
+L(aligned_16_43bytes):
+	movdqa	%xmm0, -43(%edx)
+L(aligned_16_27bytes):
+	movdqa	%xmm0, -27(%edx)
+L(aligned_16_11bytes):
+	movq	%xmm0, -11(%edx)	/* Last 11 bytes: qword ...  */
+	movw	%ax, -3(%edx)	/* ... word ...  */
+	movb	%al, -1(%edx)	/* ... and byte.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_124bytes):	/* Tail counts with N % 16 == 12: aligned 16-byte stores starting at EDX - N, then qword + dword.  */
+	movdqa	%xmm0, -124(%edx)
+L(aligned_16_108bytes):
+	movdqa	%xmm0, -108(%edx)
+L(aligned_16_92bytes):
+	movdqa	%xmm0, -92(%edx)
+L(aligned_16_76bytes):
+	movdqa	%xmm0, -76(%edx)
+L(aligned_16_60bytes):
+	movdqa	%xmm0, -60(%edx)
+L(aligned_16_44bytes):
+	movdqa	%xmm0, -44(%edx)
+L(aligned_16_28bytes):
+	movdqa	%xmm0, -28(%edx)
+L(aligned_16_12bytes):
+	movq	%xmm0, -12(%edx)	/* Last 12 bytes: qword ...  */
+	movl	%eax, -4(%edx)	/* ... plus dword.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_125bytes):	/* Tail counts with N % 16 == 13: aligned 16-byte stores starting at EDX - N, then qword + dword + byte.  */
+	movdqa	%xmm0, -125(%edx)
+L(aligned_16_109bytes):
+	movdqa	%xmm0, -109(%edx)
+L(aligned_16_93bytes):
+	movdqa	%xmm0, -93(%edx)
+L(aligned_16_77bytes):
+	movdqa	%xmm0, -77(%edx)
+L(aligned_16_61bytes):
+	movdqa	%xmm0, -61(%edx)
+L(aligned_16_45bytes):
+	movdqa	%xmm0, -45(%edx)
+L(aligned_16_29bytes):
+	movdqa	%xmm0, -29(%edx)
+L(aligned_16_13bytes):
+	movq	%xmm0, -13(%edx)	/* Last 13 bytes: qword ...  */
+	movl	%eax, -5(%edx)	/* ... dword ...  */
+	movb	%al, -1(%edx)	/* ... and byte.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_126bytes):	/* Tail counts with N % 16 == 14: aligned 16-byte stores starting at EDX - N, then qword + dword + word.  */
+	movdqa	%xmm0, -126(%edx)
+L(aligned_16_110bytes):
+	movdqa	%xmm0, -110(%edx)
+L(aligned_16_94bytes):
+	movdqa	%xmm0, -94(%edx)
+L(aligned_16_78bytes):
+	movdqa	%xmm0, -78(%edx)
+L(aligned_16_62bytes):
+	movdqa	%xmm0, -62(%edx)
+L(aligned_16_46bytes):
+	movdqa	%xmm0, -46(%edx)
+L(aligned_16_30bytes):
+	movdqa	%xmm0, -30(%edx)
+L(aligned_16_14bytes):
+	movq	%xmm0, -14(%edx)	/* Last 14 bytes: qword ...  */
+	movl	%eax, -6(%edx)	/* ... dword ...  */
+	movw	%ax, -2(%edx)	/* ... and word.  */
+	SETRTNVAL
+	RETURN
+
+	ALIGN (4)
+L(aligned_16_127bytes):	/* Tail counts with N % 16 == 15: aligned 16-byte stores starting at EDX - N, then qword + dword + word + byte.  */
+	movdqa	%xmm0, -127(%edx)
+L(aligned_16_111bytes):
+	movdqa	%xmm0, -111(%edx)
+L(aligned_16_95bytes):
+	movdqa	%xmm0, -95(%edx)
+L(aligned_16_79bytes):
+	movdqa	%xmm0, -79(%edx)
+L(aligned_16_63bytes):
+	movdqa	%xmm0, -63(%edx)
+L(aligned_16_47bytes):
+	movdqa	%xmm0, -47(%edx)
+L(aligned_16_31bytes):
+	movdqa	%xmm0, -31(%edx)
+L(aligned_16_15bytes):
+	movq	%xmm0, -15(%edx)	/* Last 15 bytes: qword ...  */
+	movl	%eax, -7(%edx)	/* ... dword ...  */
+	movw	%ax, -3(%edx)	/* ... word ...  */
+	movb	%al, -1(%edx)	/* ... and byte.  */
+	SETRTNVAL
+	RETURN_END	/* Last return in the function, so no CFI re-push follows it.  */
+
+END (__memset_sse2)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S
new file mode 100644
index 0000000000..f601663a9f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S
@@ -0,0 +1,75 @@
+/* Multiple versions of memset
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#if IS_IN (libc)
+	.text
+ENTRY(memset)	/* IFUNC resolver: leaves the chosen memset in EAX.  */
+	.type	memset, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memset_ia32)	/* Default choice.  */
+	HAS_CPU_FEATURE (SSE2)	/* Presumably ZF=1 when the bit is clear.  */
+	jz	2f		/* No SSE2: keep __memset_ia32.  */
+	LOAD_FUNC_GOT_EAX (__memset_sse2)	/* Upgrade to SSE2.  */
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f		/* No fast rep string: keep __memset_sse2.  */
+	LOAD_FUNC_GOT_EAX (__memset_sse2_rep)	/* Best available.  */
+2:	ret			/* Common exit for every selection.  */
+END(memset)
+
+# undef ENTRY	/* Below, ENTRY(name) ignores NAME and emits __memset_ia32.  */
+# define ENTRY(name) \
+	.type __memset_ia32, @function; \
+	.globl __memset_ia32; \
+	.p2align 4; \
+	__memset_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Likewise END(name) always closes __memset_ia32.  */
+# define END(name) \
+	cfi_endproc; .size __memset_ia32, .-__memset_ia32
+
+# undef ENTRY_CHK	/* ENTRY_CHK(name) now always emits __memset_chk_ia32.  */
+# define ENTRY_CHK(name) \
+	.type __memset_chk_ia32, @function; \
+	.globl __memset_chk_ia32; \
+	.p2align 4; \
+	__memset_chk_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK	/* END_CHK(name) always closes __memset_chk_ia32.  */
+# define END_CHK(name) \
+	cfi_endproc; .size __memset_chk_ia32, .-__memset_chk_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memset; __GI_memset = __memset_ia32
+# endif
+
+# undef strong_alias
+# define strong_alias(original, alias)
+#endif
+
+#include "../memset.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S
new file mode 100644
index 0000000000..573cf4208a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S
@@ -0,0 +1,82 @@
+/* Multiple versions of __memset_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__memset_chk)	/* IFUNC resolver: leaves the chosen variant in EAX.  */
+	.type	__memset_chk, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memset_chk_ia32)	/* Default choice.  */
+	HAS_CPU_FEATURE (SSE2)	/* Presumably ZF=1 when the bit is clear.  */
+	jz	2f		/* No SSE2: keep __memset_chk_ia32.  */
+	LOAD_FUNC_GOT_EAX (__memset_chk_sse2)	/* Upgrade to SSE2.  */
+	HAS_ARCH_FEATURE (Fast_Rep_String)
+	jz	2f		/* No fast rep string: keep the sse2 variant.  */
+	LOAD_FUNC_GOT_EAX (__memset_chk_sse2_rep)	/* Best available.  */
+2:	ret			/* Common exit for every selection.  */
+END(__memset_chk)
+
+# ifdef SHARED
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+	.section .gnu.warning.__memset_zero_constant_len_parameter
+	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
+# else
+	.text
+	.type __memset_chk_sse2, @function
+	.p2align 4;
+__memset_chk_sse2:	/* Non-SHARED only: check, then run __memset_sse2.  */
+	cfi_startproc
+	CALL_MCOUNT		/* Assumed to leave ESP unchanged -- confirm.  */
+	movl	12(%esp), %eax	/* EAX = 3rd stack arg (presumably length).  */
+	cmpl	%eax, 16(%esp)	/* 4th stack arg (presumably dest size) - EAX.  */
+	jb	__chk_fail	/* Unsigned: 4th arg < 3rd arg, so fail.  */
+	jmp	__memset_sse2	/* Tail jump; stack arguments are untouched.  */
+	cfi_endproc
+	.size __memset_chk_sse2, .-__memset_chk_sse2
+
+	.type __memset_chk_sse2_rep, @function
+	.p2align 4;
+__memset_chk_sse2_rep:	/* Non-SHARED only: check, then __memset_sse2_rep.  */
+	cfi_startproc
+	CALL_MCOUNT		/* Assumed to leave ESP unchanged -- confirm.  */
+	movl	12(%esp), %eax	/* EAX = 3rd stack arg (presumably length).  */
+	cmpl	%eax, 16(%esp)	/* 4th stack arg (presumably dest size) - EAX.  */
+	jb	__chk_fail	/* Unsigned: 4th arg < 3rd arg, so fail.  */
+	jmp	__memset_sse2_rep	/* Tail jump; stack arguments untouched.  */
+	cfi_endproc
+	.size __memset_chk_sse2_rep, .-__memset_chk_sse2_rep
+
+	.type __memset_chk_ia32, @function
+	.p2align 4;
+__memset_chk_ia32:	/* Non-SHARED only: check, then run __memset_ia32.  */
+	cfi_startproc
+	CALL_MCOUNT		/* Assumed to leave ESP unchanged -- confirm.  */
+	movl	12(%esp), %eax	/* EAX = 3rd stack arg (presumably length).  */
+	cmpl	%eax, 16(%esp)	/* 4th stack arg (presumably dest size) - EAX.  */
+	jb	__chk_fail	/* Unsigned: 4th arg < 3rd arg, so fail.  */
+	jmp	__memset_ia32	/* Tail jump; stack arguments are untouched.  */
+	cfi_endproc
+	.size __memset_chk_ia32, .-__memset_chk_ia32
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
new file mode 100644
index 0000000000..88c0e5776c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
@@ -0,0 +1,3 @@
+#define USE_AS_RAWMEMCHR
+#define MEMCHR __rawmemchr_sse2_bsf
+#include "memchr-sse2-bsf.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
new file mode 100644
index 0000000000..038c74896b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_RAWMEMCHR
+#define MEMCHR __rawmemchr_sse2
+#include "memchr-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S
new file mode 100644
index 0000000000..0a41d63ee8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S
@@ -0,0 +1,65 @@
+/* Multiple versions of rawmemchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__rawmemchr)	/* IFUNC resolver: leaves the chosen variant in EAX.  */
+	.type	__rawmemchr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	HAS_CPU_FEATURE (SSE2)	/* Presumably ZF=1 when the bit is clear.  */
+	jz	2f		/* No SSE2: use the ia32 version.  */
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	3f		/* Slow_BSF not set: use the BSF-based version.  */
+
+	LOAD_FUNC_GOT_EAX (__rawmemchr_sse2)	/* SSE2 and Slow_BSF set.  */
+	ret
+
+2:	LOAD_FUNC_GOT_EAX (__rawmemchr_ia32)	/* Fallback without SSE2.  */
+	ret
+
+3:	LOAD_FUNC_GOT_EAX (__rawmemchr_sse2_bsf)	/* SSE2, BSF not slow.  */
+	ret
+END(__rawmemchr)
+
+weak_alias(__rawmemchr, rawmemchr)
+
+# undef ENTRY	/* Below, ENTRY(name) ignores NAME, emits __rawmemchr_ia32.  */
+# define ENTRY(name) \
+	.type __rawmemchr_ia32, @function; \
+	.globl __rawmemchr_ia32; \
+	.p2align 4; \
+	__rawmemchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Likewise END(name) always closes __rawmemchr_ia32.  */
+# define END(name) \
+	cfi_endproc; .size __rawmemchr_ia32, .-__rawmemchr_ia32
+
+# undef libc_hidden_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_def(name) \
+	.globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_ia32
+
+#endif
+#include "../../rawmemchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c b/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c
new file mode 100644
index 0000000000..1aa5440644
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c
@@ -0,0 +1 @@
+#include <string/strnlen.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c
new file mode 100644
index 0000000000..2e9619f97c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c
@@ -0,0 +1,27 @@
+/* FMA version of fma.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+double
+__fma_fma (double x, double y, double z)
+{  /* Operand 0 is x as both input ("0") and output ("=x").  */
+  asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+  return x;  /* Per the 213 operand order this is y * x + z, fused.  */
+}
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c
new file mode 100644
index 0000000000..411ebb2ba9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c
@@ -0,0 +1,34 @@
+/* Multiple versions of fma.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include <math.h>
+#include <init-arch.h>
+
+extern double __fma_ia32 (double x, double y, double z) attribute_hidden;
+extern double __fma_fma (double x, double y, double z) attribute_hidden;
+
+libm_ifunc (__fma,  /* Selects __fma_fma if FMA_Usable, else __fma_ia32.  */
+	    HAS_ARCH_FEATURE (FMA_Usable) ? __fma_fma : __fma_ia32);
+weak_alias (__fma, fma)
+
+#define __fma __fma_ia32  /* Presumably renames the fma included below.  */
+
+#include <sysdeps/ieee754/ldbl-96/s_fma.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
new file mode 100644
index 0000000000..ee57abfda2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
@@ -0,0 +1,27 @@
+/* FMA version of fmaf.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+float
+__fmaf_fma (float x, float y, float z)
+{  /* Operand 0 is x as both input ("0") and output ("=x").  */
+  asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+  return x;  /* Per the 213 operand order this is y * x + z, fused.  */
+}
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c
new file mode 100644
index 0000000000..00b0fbcfc5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c
@@ -0,0 +1,34 @@
+/* Multiple versions of fmaf.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include <math.h>
+#include <init-arch.h>
+
+extern float __fmaf_ia32 (float x, float y, float z) attribute_hidden;
+extern float __fmaf_fma (float x, float y, float z) attribute_hidden;
+
+libm_ifunc (__fmaf,  /* Selects __fmaf_fma if FMA_Usable, else __fmaf_ia32.  */
+	    HAS_ARCH_FEATURE (FMA_Usable) ? __fmaf_fma : __fmaf_ia32);
+weak_alias (__fmaf, fmaf)
+
+#define __fmaf __fmaf_ia32  /* Presumably renames the fmaf included below.  */
+
+#include <sysdeps/ieee754/dbl-64/s_fmaf.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c b/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c
new file mode 100644
index 0000000000..7db31b02f8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/sched_cpucount.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S
new file mode 100644
index 0000000000..46ca1b3074
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_sse2
+#include "strcpy-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
new file mode 100644
index 0000000000..d971c2da38
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S
new file mode 100644
index 0000000000..ee81ab6ae3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S
@@ -0,0 +1,9 @@
+/* Multiple versions of stpcpy
+   All versions must be listed in ifunc-impl-list.c.  */
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy
+#include "strcpy.S"
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S
new file mode 100644
index 0000000000..37a703cb76
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_sse2
+#include "strcpy-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
new file mode 100644
index 0000000000..14ed16f6b5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S
new file mode 100644
index 0000000000..2698ca6a8c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S
@@ -0,0 +1,8 @@
+/* Multiple versions of stpncpy
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCPY __stpncpy
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#include "strcpy.S"
+
+weak_alias (__stpncpy, stpncpy)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c
new file mode 100644
index 0000000000..753c6ec84a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c
@@ -0,0 +1,12 @@
+#include <string.h>
+
+extern __typeof (strcasecmp) __strcasecmp_nonascii;
+
+#define __strcasecmp __strcasecmp_nonascii
+#include <string/strcasecmp.c>
+
+strong_alias (__strcasecmp_nonascii, __strcasecmp_ia32)
+
+/* The needs of strcasecmp in libc are minimal, no need to go through
+   the IFUNC.  */
+strong_alias (__strcasecmp_nonascii, __GI___strcasecmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S
new file mode 100644
index 0000000000..ec59276408
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S
@@ -0,0 +1,39 @@
+/* Entry point for multi-version x86 strcasecmp.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+	.text
+ENTRY(__strcasecmp)	/* IFUNC resolver: leaves the chosen variant in EAX.  */
+	.type	__strcasecmp, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strcasecmp_ia32)	/* Default choice.  */
+	HAS_CPU_FEATURE (SSSE3)	/* Presumably ZF=1 when the bit is clear.  */
+	jz	2f		/* No SSSE3: keep the ia32 version.  */
+	LOAD_FUNC_GOT_EAX (__strcasecmp_ssse3)	/* Upgrade to SSSE3.  */
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f		/* No SSE4.2: keep the ssse3 version.  */
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
+	jnz	2f		/* SSE4.2 flagged slow: keep the ssse3 version.  */
+	LOAD_FUNC_GOT_EAX (__strcasecmp_sse4_2)	/* Best available.  */
+2:	ret			/* Common exit for every selection.  */
+END(__strcasecmp)
+
+weak_alias (__strcasecmp, strcasecmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c
new file mode 100644
index 0000000000..d4fcd2b4a1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c
@@ -0,0 +1,13 @@
+#include <string.h>
+
+extern __typeof (strcasecmp_l) __strcasecmp_l_nonascii;
+
+#define __strcasecmp_l __strcasecmp_l_nonascii
+#define USE_IN_EXTENDED_LOCALE_MODEL    1
+#include <string/strcasecmp.c>
+
+strong_alias (__strcasecmp_l_nonascii, __strcasecmp_l_ia32)
+
+/* The needs of strcasecmp in libc are minimal, no need to go through
+   the IFUNC.  */
+strong_alias (__strcasecmp_l_nonascii, __GI___strcasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S
new file mode 100644
index 0000000000..411d4153f2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRCASECMP_L 1
+#include "strcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S
new file mode 100644
index 0000000000..a22b93c518
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRCASECMP_L 1
+#include "strcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S
new file mode 100644
index 0000000000..711c09b0dc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S
@@ -0,0 +1,7 @@
+/* Multiple versions of strcasecmp_l
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCMP __strcasecmp_l
+#define USE_AS_STRCASECMP_L
+#include "strcmp.S"
+
+weak_alias (__strcasecmp_l, strcasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S
new file mode 100644
index 0000000000..6359c7330c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S
@@ -0,0 +1,1245 @@
+/* strcat with SSE2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)	/* Unwind info for a 4-byte push of REG.  */
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)	/* Unwind info for the matching pop.  */
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)	/* Push plus its CFI note.  */
+# define POP(REG) popl REG; CFI_POP (REG)	/* Pop plus its CFI note.  */
+
+# ifdef SHARED
+#  define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into ECX and branch to it.  TABLE is a
+	jump table with relative offsets.  INDEX is a register that contains
+	the index into the jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	/* We first load PC into ECX.  */	\
+	SETUP_PIC_REG(cx);	\
+	/* Get the address of the jump table.  */	\
+	addl	$(TABLE - .), %ecx;	\
+	/* Get the entry and convert the relative offset to the	\
+	absolute address.  */	\
+	addl	(%ecx,INDEX,SCALE), %ecx;	\
+	/* We loaded the jump table and adjusted ECX. Go.  */	\
+	jmp	*%ecx
+# else
+#  define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+	absolute addresses.  INDEX is a register that contains the index
+	into the jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	jmp	*TABLE(,INDEX,SCALE)
+# endif
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_sse2
+# endif
+
+# define PARMS  4
+# define STR1  PARMS+4
+# define STR2  STR1+4
+
+# ifdef USE_AS_STRNCAT
+#  define LEN    STR2+8
+#  define STR3   STR1+4
+# else
+#  define STR3   STR1
+# endif
+
+# define USE_AS_STRCAT
+# ifdef USE_AS_STRNCAT	/* This variant also saves EBX, so pop it first.  */
+#  define RETURN  POP(%ebx); POP(%esi); ret; CFI_PUSH(%ebx); CFI_PUSH(%esi);
+# else	/* CFI_PUSH after ret re-applies the CFI state the POP undid.  */
+#  define RETURN  POP(%esi); ret; CFI_PUSH(%esi);
+# endif
+
+.text
+ENTRY (STRCAT)
+	PUSH	(%esi)
+	mov	STR1(%esp), %eax
+	mov	STR2(%esp), %esi
+# ifdef USE_AS_STRNCAT
+	PUSH	(%ebx)
+	movl	LEN(%esp), %ebx
+	test	%ebx, %ebx
+	jz	L(ExitZero)
+# endif
+	cmpb	$0, (%esi)
+	mov	%esi, %ecx
+	mov	%eax, %edx
+	jz	L(ExitZero)
+
+	and	$63, %ecx
+	and	$63, %edx
+	cmp	$32, %ecx
+	ja	L(StrlenCore7_1)
+	cmp	$48, %edx
+	ja	L(alignment_prolog)
+
+	pxor	%xmm0, %xmm0
+	pxor	%xmm4, %xmm4
+	pxor	%xmm7, %xmm7
+	movdqu	(%eax), %xmm1
+	movdqu	(%esi), %xmm5
+	pcmpeqb	%xmm1, %xmm0
+	movdqu	16(%esi), %xmm6
+	pmovmskb %xmm0, %ecx
+	pcmpeqb	%xmm5, %xmm4
+	pcmpeqb	%xmm6, %xmm7
+	test	%ecx, %ecx
+	jnz	L(exit_less16_)
+	mov	%eax, %ecx
+	and	$-16, %eax
+	jmp	L(loop_prolog)
+
+L(alignment_prolog):
+	pxor	%xmm0, %xmm0
+	pxor	%xmm4, %xmm4
+	mov	%edx, %ecx
+	pxor	%xmm7, %xmm7
+	and	$15, %ecx
+	and	$-16, %eax
+	pcmpeqb	(%eax), %xmm0
+	movdqu	(%esi), %xmm5
+	movdqu	16(%esi), %xmm6
+	pmovmskb %xmm0, %edx
+	pcmpeqb	%xmm5, %xmm4
+	shr	%cl, %edx
+	pcmpeqb	%xmm6, %xmm7
+	test	%edx, %edx
+	jnz	L(exit_less16)
+	add	%eax, %ecx
+
+	pxor	%xmm0, %xmm0
+L(loop_prolog):
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	.p2align 4
+L(align16_loop):
+	pcmpeqb	16(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	64(%eax), %eax
+	test	%edx, %edx
+	jz	L(align16_loop)
+	bsf	%edx, %edx
+	add	%edx, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit16):
+	bsf	%edx, %edx
+	lea	16(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit32):
+	bsf	%edx, %edx
+	lea	32(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit48):
+	bsf	%edx, %edx
+	lea	48(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_less16):
+	bsf	%edx, %edx
+	add	%ecx, %eax
+	add	%edx, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_less16_):
+	bsf	%ecx, %ecx
+	add	%ecx, %eax
+
+	.p2align 4
+L(StartStrcpyPart):
+	pmovmskb %xmm4, %edx
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1)
+
+	movdqu	%xmm5, (%eax)
+	pmovmskb %xmm7, %edx
+# ifdef USE_AS_STRNCAT
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes1)
+
+	mov	%esi, %ecx
+	and	$-16, %esi
+	and	$15, %ecx
+	pxor	%xmm0, %xmm0
+# ifdef USE_AS_STRNCAT
+	add	%ecx, %ebx
+	sbb	%edx, %edx
+	or	%edx, %ebx
+# endif
+	sub	%ecx, %eax
+	jmp	L(Unalign16Both)
+
+L(StrlenCore7_1):
+	mov	%eax, %ecx
+	pxor	%xmm0, %xmm0
+	and	$15, %ecx
+	and	$-16, %eax
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	shr	%cl, %edx
+	test	%edx, %edx
+	jnz	L(exit_less16_1)
+	add	%eax, %ecx
+
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+
+	.p2align 4
+L(align16_loop_1):
+	pcmpeqb	16(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16_1)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32_1)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48_1)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	64(%eax), %eax
+	test	%edx, %edx
+	jz	L(align16_loop_1)
+	bsf	%edx, %edx
+	add	%edx, %eax
+	jmp	L(StartStrcpyPart_1)
+
+	.p2align 4
+L(exit16_1):
+	bsf	%edx, %edx
+	lea	16(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart_1)
+
+	.p2align 4
+L(exit32_1):
+	bsf	%edx, %edx
+	lea	32(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart_1)
+
+	.p2align 4
+L(exit48_1):
+	bsf	%edx, %edx
+	lea	48(%eax, %edx), %eax
+	jmp	L(StartStrcpyPart_1)
+
+	.p2align 4
+L(exit_less16_1):
+	bsf	%edx, %edx
+	add	%ecx, %eax
+	add	%edx, %eax
+
+	.p2align 4
+L(StartStrcpyPart_1):
+	mov	%esi, %ecx
+	and	$15, %ecx
+	and	$-16, %esi
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+
+# ifdef USE_AS_STRNCAT
+	cmp	$48, %ebx
+	ja      L(BigN)
+# endif
+	pcmpeqb	(%esi), %xmm1
+# ifdef USE_AS_STRNCAT
+	add	%ecx, %ebx
+# endif
+	pmovmskb %xmm1, %edx
+	shr	%cl, %edx
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail)
+
+	pcmpeqb	16(%esi), %xmm0
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STRNCAT
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes)
+
+	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
+	movdqu	%xmm1, (%eax)
+	sub	%ecx, %eax
+
+	.p2align 4
+L(Unalign16Both):
+	mov	$16, %ecx
+	movdqa	(%esi, %ecx), %xmm1
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%eax, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$48, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+L(Unalign16BothBigN):
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%eax, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%esi, %ecx), %xmm4
+	movdqu	%xmm3, (%eax, %ecx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%esi, %ecx), %xmm1
+	movdqu	%xmm4, (%eax, %ecx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%eax, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%eax, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+# ifdef USE_AS_STRNCAT
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movdqu	%xmm3, (%eax, %ecx)
+	mov	%esi, %edx
+	lea	16(%esi, %ecx), %esi
+	and	$-0x40, %esi
+	sub	%esi, %edx
+	sub	%edx, %eax
+# ifdef USE_AS_STRNCAT
+	lea	128(%ebx, %edx), %ebx
+# endif
+	movaps	(%esi), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%esi), %xmm5
+	movaps	32(%esi), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%esi), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+# ifdef USE_AS_STRNCAT
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(Unaligned64Leave)
+
+	.p2align 4
+L(Unaligned64Loop_start):
+	add	$64, %eax
+	add	$64, %esi
+	movdqu	%xmm4, -64(%eax)
+	movaps	(%esi), %xmm2
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%eax)
+	movaps	16(%esi), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%esi), %xmm3
+	movdqu	%xmm6, -32(%eax)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%eax)
+	movaps	48(%esi), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+# ifdef USE_AS_STRNCAT
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jz	L(Unaligned64Loop_start)
+
+L(Unaligned64Leave):
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	%xmm4, %xmm0
+	pcmpeqb	%xmm5, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_0)
+	test	%ecx, %ecx
+	jnz	L(CopyFrom1To16BytesUnaligned_16)
+
+	pcmpeqb	%xmm6, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_32)
+
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%eax)
+	movdqu	%xmm5, 16(%eax)
+	movdqu	%xmm6, 32(%eax)
+	add	$48, %esi
+	add	$48, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+# ifdef USE_AS_STRNCAT
+	.p2align 4
+L(BigN):
+	pcmpeqb	(%esi), %xmm1
+	pmovmskb %xmm1, %edx
+	shr	%cl, %edx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail)
+
+	pcmpeqb	16(%esi), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes)
+
+	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
+	movdqu	%xmm1, (%eax)
+	sub	%ecx, %eax
+	sub     $48, %ebx
+	add     %ecx, %ebx
+
+	mov	$16, %ecx
+	movdqa	(%esi, %ecx), %xmm1
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%eax, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+	jmp	L(Unalign16BothBigN)
+# endif
+
+/*------------end of main part-------------------------------*/
+
+/* Case1 */
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%ecx, %eax
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesTail):
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1):
+	add	$16, %esi
+	add	$16, %eax
+L(CopyFrom1To16BytesTail1):
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes):
+	bsf	%edx, %edx
+	add	%ecx, %esi
+	add	$16, %edx
+	sub	%ecx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%eax)
+	add	$16, %esi
+	add	$16, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+	bsf	%edx, %edx
+	movdqu	%xmm4, (%eax)
+	movdqu	%xmm5, 16(%eax)
+	add	$32, %esi
+	add	$32, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+# ifdef USE_AS_STRNCAT
+
+	.p2align 4
+L(CopyFrom1To16BytesExit):
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+/* Case2 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%ecx, %eax
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	add	$16, %edx
+	sub	%ecx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+/* Case2 or Case3,  Case3 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%ecx, %eax
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32BytesCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTailCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+	add	$16, %eax
+	add	$16, %esi
+	sub	$16, %ebx
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1Case2)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+# endif
+
+# ifdef USE_AS_STRNCAT
+	.p2align 4
+L(StrncatExit0):
+	movb	%bh, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+# endif
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit1):
+	movb	%bh, 1(%eax)
+# endif
+L(Exit1):
+# ifdef USE_AS_STRNCAT
+	movb	(%esi), %dh
+# endif
+	movb	%dh, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit2):
+	movb	%bh, 2(%eax)
+# endif
+L(Exit2):
+	movw	(%esi), %dx
+	movw	%dx, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit3):
+	movb	%bh, 3(%eax)
+# endif
+L(Exit3):
+	movw	(%esi), %cx
+	movw	%cx, (%eax)
+# ifdef USE_AS_STRNCAT
+	movb	2(%esi), %dh
+# endif
+	movb	%dh, 2(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit4):
+	movb	%bh, 4(%eax)
+# endif
+L(Exit4):
+	movl	(%esi), %edx
+	movl	%edx, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit5):
+	movb	%bh, 5(%eax)
+# endif
+L(Exit5):
+	movl	(%esi), %ecx
+# ifdef USE_AS_STRNCAT
+	movb	4(%esi), %dh
+# endif
+	movb	%dh, 4(%eax)
+	movl	%ecx, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit6):
+	movb	%bh, 6(%eax)
+# endif
+L(Exit6):
+	movl	(%esi), %ecx
+	movw	4(%esi), %dx
+	movl	%ecx, (%eax)
+	movw	%dx, 4(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit7):
+	movb	%bh, 7(%eax)
+# endif
+L(Exit7):
+	movl	(%esi), %ecx
+	movl	3(%esi), %edx
+	movl	%ecx, (%eax)
+	movl	%edx, 3(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit8):
+	movb	%bh, 8(%eax)
+# endif
+L(Exit8):
+	movlpd	(%esi), %xmm0
+	movlpd	%xmm0, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit9):
+	movb	%bh, 9(%eax)
+# endif
+L(Exit9):
+	movlpd	(%esi), %xmm0
+# ifdef USE_AS_STRNCAT
+	movb	8(%esi), %dh
+# endif
+	movb	%dh, 8(%eax)
+	movlpd	%xmm0, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit10):
+	movb	%bh, 10(%eax)
+# endif
+L(Exit10):
+	movlpd	(%esi), %xmm0
+	movw	8(%esi), %dx
+	movlpd	%xmm0, (%eax)
+	movw	%dx, 8(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit11):
+	movb	%bh, 11(%eax)
+# endif
+L(Exit11):
+	movlpd	(%esi), %xmm0
+	movl	7(%esi), %edx
+	movlpd	%xmm0, (%eax)
+	movl	%edx, 7(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit12):
+	movb	%bh, 12(%eax)
+# endif
+L(Exit12):
+	movlpd	(%esi), %xmm0
+	movl	8(%esi), %edx
+	movlpd	%xmm0, (%eax)
+	movl	%edx, 8(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit13):
+	movb	%bh, 13(%eax)
+# endif
+L(Exit13):
+	movlpd	(%esi), %xmm0
+	movlpd	5(%esi), %xmm1
+	movlpd	%xmm0, (%eax)
+	movlpd	%xmm1, 5(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit14):
+	movb	%bh, 14(%eax)
+# endif
+L(Exit14):
+	movlpd	(%esi), %xmm0
+	movlpd	6(%esi), %xmm1
+	movlpd	%xmm0, (%eax)
+	movlpd	%xmm1, 6(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit15):
+	movb	%bh, 15(%eax)
+# endif
+L(Exit15):
+	movlpd	(%esi), %xmm0
+	movlpd	7(%esi), %xmm1
+	movlpd	%xmm0, (%eax)
+	movlpd	%xmm1, 7(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit16):
+	movb	%bh, 16(%eax)
+# endif
+L(Exit16):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit17):
+	movb	%bh, 17(%eax)
+# endif
+L(Exit17):
+	movdqu	(%esi), %xmm0
+# ifdef USE_AS_STRNCAT
+	movb	16(%esi), %dh
+# endif
+	movdqu	%xmm0, (%eax)
+	movb	%dh, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit18):
+	movb	%bh, 18(%eax)
+# endif
+L(Exit18):
+	movdqu	(%esi), %xmm0
+	movw	16(%esi), %cx
+	movdqu	%xmm0, (%eax)
+	movw	%cx, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit19):
+	movb	%bh, 19(%eax)
+# endif
+L(Exit19):
+	movdqu	(%esi), %xmm0
+	movl	15(%esi), %ecx
+	movdqu	%xmm0, (%eax)
+	movl	%ecx, 15(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit20):
+	movb	%bh, 20(%eax)
+# endif
+L(Exit20):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%eax)
+	movl	%ecx, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit21):
+	movb	%bh, 21(%eax)
+# endif
+L(Exit21):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+# ifdef USE_AS_STRNCAT
+	movb	20(%esi), %dh
+# endif
+	movdqu	%xmm0, (%eax)
+	movl	%ecx, 16(%eax)
+	movb	%dh, 20(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit22):
+	movb	%bh, 22(%eax)
+# endif
+L(Exit22):
+	movdqu	(%esi), %xmm0
+	movlpd	14(%esi), %xmm3
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm3, 14(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit23):
+	movb	%bh, 23(%eax)
+# endif
+L(Exit23):
+	movdqu	(%esi), %xmm0
+	movlpd	15(%esi), %xmm3
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm3, 15(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit24):
+	movb	%bh, 24(%eax)
+# endif
+L(Exit24):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit25):
+	movb	%bh, 25(%eax)
+# endif
+L(Exit25):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+# ifdef USE_AS_STRNCAT
+	movb	24(%esi), %dh
+# endif
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	movb	%dh, 24(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit26):
+	movb	%bh, 26(%eax)
+# endif
+L(Exit26):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movw	24(%esi), %cx
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	movw	%cx, 24(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit27):
+	movb	%bh, 27(%eax)
+# endif
+L(Exit27):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	23(%esi), %ecx
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	movl	%ecx, 23(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit28):
+	movb	%bh, 28(%eax)
+# endif
+L(Exit28):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	24(%esi), %ecx
+	movdqu	%xmm0, (%eax)
+	movlpd	%xmm2, 16(%eax)
+	movl	%ecx, 24(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit29):
+	movb	%bh, 29(%eax)
+# endif
+L(Exit29):
+	movdqu	(%esi), %xmm0
+	movdqu	13(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movdqu	%xmm2, 13(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit30):
+	movb	%bh, 30(%eax)
+# endif
+L(Exit30):
+	movdqu	(%esi), %xmm0
+	movdqu	14(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movdqu	%xmm2, 14(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit31):
+	movb	%bh, 31(%eax)
+# endif
+L(Exit31):
+	movdqu	(%esi), %xmm0
+	movdqu	15(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movdqu	%xmm2, 15(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit32):
+	movb	%bh, 32(%eax)
+# endif
+L(Exit32):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movdqu	%xmm0, (%eax)
+	movdqu	%xmm2, 16(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+# ifdef USE_AS_STRNCAT
+
+	.p2align 4
+L(UnalignedLeaveCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+	lea	64(%ebx), %ecx
+	and	$-16, %ecx
+	add	$48, %ebx
+	jl	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm4, (%eax)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm5, 16(%eax)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm6, 32(%eax)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm7, 48(%eax)
+	xor	%bh, %bh
+	movb	%bh, 64(%eax)
+	mov	STR3(%esp), %eax
+	RETURN
+
+	.p2align 4
+L(Unaligned64LeaveCase2):
+	xor	%ecx, %ecx
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm4, (%eax)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm5, 16(%eax)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm6, 32(%eax)
+	lea	16(%eax, %ecx), %eax
+	lea	16(%esi, %ecx), %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+# endif
+	.p2align 4
+L(ExitZero):
+	RETURN
+
+END (STRCAT)
+
+	.section .rodata
+	.p2align 4	/* Must follow .section: .p2align pads the current section, so issued before the switch it padded .text and left this table unaligned.  */
+L(ExitTable):	/* 32 four-byte entries; entry i (0-based) selects L(Exit<i+1>).  JMPTBL is defined outside this view.  */
+	.int	JMPTBL(L(Exit1), L(ExitTable))
+	.int	JMPTBL(L(Exit2), L(ExitTable))
+	.int	JMPTBL(L(Exit3), L(ExitTable))
+	.int	JMPTBL(L(Exit4), L(ExitTable))
+	.int	JMPTBL(L(Exit5), L(ExitTable))
+	.int	JMPTBL(L(Exit6), L(ExitTable))
+	.int	JMPTBL(L(Exit7), L(ExitTable))
+	.int	JMPTBL(L(Exit8), L(ExitTable))
+	.int	JMPTBL(L(Exit9), L(ExitTable))
+	.int	JMPTBL(L(Exit10), L(ExitTable))
+	.int	JMPTBL(L(Exit11), L(ExitTable))
+	.int	JMPTBL(L(Exit12), L(ExitTable))
+	.int	JMPTBL(L(Exit13), L(ExitTable))
+	.int	JMPTBL(L(Exit14), L(ExitTable))
+	.int	JMPTBL(L(Exit15), L(ExitTable))
+	.int	JMPTBL(L(Exit16), L(ExitTable))
+	.int	JMPTBL(L(Exit17), L(ExitTable))
+	.int	JMPTBL(L(Exit18), L(ExitTable))
+	.int	JMPTBL(L(Exit19), L(ExitTable))
+	.int	JMPTBL(L(Exit20), L(ExitTable))
+	.int	JMPTBL(L(Exit21), L(ExitTable))
+	.int	JMPTBL(L(Exit22), L(ExitTable))
+	.int	JMPTBL(L(Exit23), L(ExitTable))
+	.int	JMPTBL(L(Exit24), L(ExitTable))
+	.int	JMPTBL(L(Exit25), L(ExitTable))
+	.int	JMPTBL(L(Exit26), L(ExitTable))
+	.int	JMPTBL(L(Exit27), L(ExitTable))
+	.int	JMPTBL(L(Exit28), L(ExitTable))
+	.int	JMPTBL(L(Exit29), L(ExitTable))
+	.int	JMPTBL(L(Exit30), L(ExitTable))
+	.int	JMPTBL(L(Exit31), L(ExitTable))
+	.int	JMPTBL(L(Exit32), L(ExitTable))
+# ifdef USE_AS_STRNCAT
+L(ExitStrncatTable):	/* 33 four-byte entries; entry i selects L(StrncatExit<i>), i = 0..32.  The dispatch sites index it with %ebx.  */
+	.int	JMPTBL(L(StrncatExit0), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit1), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit2), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit3), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit4), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit5), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit6), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit7), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit8), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit9), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit10), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit11), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit12), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit13), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit14), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit15), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit16), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit17), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit18), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit19), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit20), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit21), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit22), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit23), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit24), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit25), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit26), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit27), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit28), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit29), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit30), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit31), L(ExitStrncatTable))
+	.int	JMPTBL(L(StrncatExit32), L(ExitStrncatTable))
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S
new file mode 100644
index 0000000000..59ffbc60a5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S
@@ -0,0 +1,572 @@
+/* strcat with SSSE3
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	/* Unwind bookkeeping for a 4-byte push: grow the CFA offset by 4 and record REG at offset 0.  */ \
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	/* Unwind bookkeeping for a 4-byte pop: shrink the CFA offset by 4 and mark REG restored.  */ \
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)	/* Push REG and emit the matching CFI.  */
+# define POP(REG) popl REG; CFI_POP (REG)	/* Pop REG and emit the matching CFI.  */
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_ssse3
+# endif
+
+# define PARMS  4
+# define STR1  PARMS+4
+# define STR2  STR1+4
+
+# ifdef USE_AS_STRNCAT
+#  define LEN STR2+8
+# endif
+
+# define USE_AS_STRCAT
+
+.text
+ENTRY (STRCAT)
+	PUSH	(%edi)	/* %edi keeps dest for the whole routine; every exit returns it via `movl %edi, %eax'.  */
+	mov	STR1(%esp), %edi	/* %edi = dest: first argument, at 8(%esp) after the push above.  */
+	mov	%edi, %edx	/* Copy of dest for the included strlen body -- presumably its input register; confirm in strlen-sse2.S.  */
+
+# define RETURN  jmp L(StartStrcpyPart)	/* RETURN in the included strlen body becomes a jump to the copy phase.  */
+# include "strlen-sse2.S"
+
+L(StartStrcpyPart):
+	mov	STR2(%esp), %ecx	/* %ecx = src: second argument (only %edi has been pushed so far).  */
+	lea	(%edi, %eax), %edx	/* %edx = dest + %eax; assumes %eax is the length computed by the included code -- TODO confirm.  */
+# ifdef USE_AS_STRNCAT
+	PUSH	(%ebx)
+	mov	LEN(%esp), %ebx	/* %ebx = n: third argument; LEN allows for the two pushes.  */
+	test	%ebx, %ebx
+	jz	L(StrncatExit0)	/* n == 0: nothing to append.  */
+	cmp	$8, %ebx
+	jbe	L(StrncatExit8Bytes)	/* n <= 8: take the path that bounds every byte test by n.  */
+# endif
+	cmpb	$0, (%ecx)	/* Probe src bytes 0..15 for the terminator; a hit at byte K-1 goes to L(ExitK), which copies K bytes.  */
+	jz	L(Exit1)
+	cmpb	$0, 1(%ecx)
+	jz	L(Exit2)
+	cmpb	$0, 2(%ecx)
+	jz	L(Exit3)
+	cmpb	$0, 3(%ecx)
+	jz	L(Exit4)
+	cmpb	$0, 4(%ecx)
+	jz	L(Exit5)
+	cmpb	$0, 5(%ecx)
+	jz	L(Exit6)
+	cmpb	$0, 6(%ecx)
+	jz	L(Exit7)
+	cmpb	$0, 7(%ecx)
+	jz	L(Exit8)
+	cmpb	$0, 8(%ecx)
+	jz	L(Exit9)
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	jb	L(StrncatExit15Bytes)	/* Bytes 0..8 are non-NUL and 8 < n < 16: continue on the n-bounded path.  */
+# endif
+	cmpb	$0, 9(%ecx)
+	jz	L(Exit10)
+	cmpb	$0, 10(%ecx)
+	jz	L(Exit11)
+	cmpb	$0, 11(%ecx)
+	jz	L(Exit12)
+	cmpb	$0, 12(%ecx)
+	jz	L(Exit13)
+	cmpb	$0, 13(%ecx)
+	jz	L(Exit14)
+	cmpb	$0, 14(%ecx)
+	jz	L(Exit15)
+	cmpb	$0, 15(%ecx)
+	jz	L(Exit16)
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %ebx
+	je	L(StrncatExit16)	/* 16 non-NUL bytes and n == 16: copy 16 bytes and store the terminator after them.  */
+
+#  define RETURN1	/* strncat epilogue: %edi was pushed first, then %ebx, so pop in reverse.  */ \
+	POP	(%ebx);	\
+	POP	(%edi);	\
+	ret;	\
+	CFI_PUSH	(%edi);	/* Code placed after the ret still runs with both registers on the stack...  */ \
+	CFI_PUSH	(%ebx)	/* ...so re-declare them in push order; the reverse order swaps their recorded slots.  */
+#  define USE_AS_STRNCPY
+# else
+#  define RETURN1  POP (%edi); ret; CFI_PUSH (%edi)
+# endif
+# include "strcpy-ssse3.S"
+	.p2align 4
+L(CopyFrom1To16Bytes):	/* Entered from the included copy code; %eax is tested below as a per-byte NUL mask (bit k <-> byte k) -- presumably a pmovmskb result, confirm in strcpy-ssse3.S.  */
+	add	%esi, %edx	/* Advance dest and src by the offset held in %esi.  */
+	add	%esi, %ecx
+
+	POP	(%esi)	/* Matches a push of %esi made in the included code (not visible here).  */
+	test	%al, %al
+	jz	L(ExitHigh)	/* No mask bit set for bytes 0..7: look at bytes 8..15.  */
+	test	$0x01, %al	/* Lowest set mask bit k selects L(Exit<k+1>).  */
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	movlpd	(%ecx), %xmm0	/* %al != 0 with bits 0..6 clear leaves bit 7: copy 8 bytes.  */
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax	/* Return dest.  */
+	RETURN1
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah	/* %ah holds mask bits 8..15; bit 8+k selects L(Exit<9+k>).  */
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	movlpd	(%ecx), %xmm0	/* None of bits 8..14 set: copy all 16 bytes as two 8-byte moves.  */
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit1):	/* L(StrncatExitK): store the terminator at dest[K], then fall into L(ExitK).  Every visible jump here has %ebx == K <= 16, so %bh is 0.  */
+	movb	%bh, 1(%edx)
+L(Exit1):	/* L(ExitK): copy K bytes from (%ecx) to (%edx) and return dest (%edi) in %eax.  */
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit2):
+	movb	%bh, 2(%edx)
+L(Exit2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit3):
+	movb	%bh, 3(%edx)
+L(Exit3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit4):
+	movb	%bh, 4(%edx)
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit5):
+	movb	%bh, 5(%edx)
+L(Exit5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit6):
+	movb	%bh, 6(%edx)
+L(Exit6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit7):
+	movb	%bh, 7(%edx)
+L(Exit7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax	/* Second 4-byte move overlaps the first by one byte to cover bytes 3..6.  */
+	movl	%eax, 3(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit8):
+	movb	%bh, 8(%edx)
+L(Exit8):
+	movlpd	(%ecx), %xmm0	/* movlpd moves 8 bytes through the low half of %xmm0.  */
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit9):
+	movb	%bh, 9(%edx)
+L(Exit9):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit10):
+	movb	%bh, 10(%edx)
+L(Exit10):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit11):
+	movb	%bh, 11(%edx)
+L(Exit11):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	7(%ecx), %eax	/* Overlaps byte 7 of the 8-byte move; covers bytes 7..10.  */
+	movl	%eax, 7(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit12):
+	movb	%bh, 12(%edx)
+L(Exit12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit13):
+	movb	%bh, 13(%edx)
+L(Exit13):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0	/* Two overlapping 8-byte moves (offsets 0 and 5) cover 13 bytes.  */
+	movlpd	%xmm0, 5(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit14):
+	movb	%bh, 14(%edx)
+L(Exit14):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit15):
+	movb	%bh, 15(%edx)
+L(Exit15):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit16):
+	movb	%bh, 16(%edx)
+L(Exit16):
+	movlpd	(%ecx), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+# ifdef USE_AS_STRNCPY
+
+	CFI_PUSH(%esi)	/* CFI only: the code below runs with %esi still on the stack.  */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):	/* Reached with %eax != 0 (see L(CopyFrom1To16BytesCase2OrCase3)): a NUL is flagged in this block and n may also end here.  */
+	add	$16, %ebx	/* After the add, %ebx is compared below as the count of bytes still allowed.  */
+	add	%esi, %ecx
+	lea	(%esi, %edx), %esi	/* Park dest + offset in %esi so %edx can be used as scratch.  */
+	lea	-9(%ebx), %edx
+	and	$1<<7, %dh	/* Keep bit 15 of (%ebx - 9): set when %ebx < 9 for the small counts handled here.  */
+	or	%al, %dh
+	test	%dh, %dh	/* ZF only if no NUL is flagged in bytes 0..7 and %ebx >= 9.  */
+	lea	(%esi), %edx	/* Restore %edx = dest + offset; lea and the pop below leave the flags from the test intact.  */
+	POP	(%esi)
+	jz	L(ExitHighCase2)
+
+	test	$0x01, %al	/* For K = 1..7 in order: NUL at byte K-1 -> L(ExitK); otherwise limit K reached -> L(StrncatExitK).  */
+	jnz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(StrncatExit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(StrncatExit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(StrncatExit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(StrncatExit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(StrncatExit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(StrncatExit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(StrncatExit7)
+	movlpd	(%ecx), %xmm0	/* Copy 8 bytes, then place the terminator according to the byte just written at dest[7].  */
+	movlpd	%xmm0, (%edx)
+	lea	7(%edx), %eax
+	cmpb	$1, (%eax)	/* CF = 1 iff dest[7] == 0.  */
+	sbb	$-1, %eax	/* %eax += 1 - CF: stays at dest+7 if that byte is already NUL, else moves to dest+8.  */
+	xor	%cl, %cl
+	movb	%cl, (%eax)
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHighCase2):
+	test	$0x01, %ah	/* Same interleaving for K = 9..15, using mask bits 8..14 in %ah.  */
+	jnz	L(Exit9)
+	cmp	$9, %ebx
+	je	L(StrncatExit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(StrncatExit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(StrncatExit11)
+	test	$0x8, %ah
+	jnz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(StrncatExit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(StrncatExit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(StrncatExit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	cmp	$15, %ebx
+	je	L(StrncatExit15)
+	movlpd	(%ecx), %xmm0	/* Fallthrough copies 16 bytes and stores no separate terminator -- presumably byte 15 is the NUL here; confirm against the callers.  */
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm1, 8(%edx)
+	movl	%edi, %eax
+	RETURN1
+
+	CFI_PUSH(%esi)	/* CFI only: the code below runs with %esi still on the stack.  */
+
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)	/* Non-zero mask: a NUL is flagged in this block.  */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase3):	/* Zero mask: the copy ends on the length limit alone.  */
+	add	$16, %ebx	/* After the add, %ebx is compared below as the number of bytes to copy.  */
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+
+	cmp	$8, %ebx
+	ja	L(ExitHighCase3)
+	cmp	$1, %ebx
+	je	L(StrncatExit1)
+	cmp	$2, %ebx
+	je	L(StrncatExit2)
+	cmp	$3, %ebx
+	je	L(StrncatExit3)
+	cmp	$4, %ebx
+	je	L(StrncatExit4)
+	cmp	$5, %ebx
+	je	L(StrncatExit5)
+	cmp	$6, %ebx
+	je	L(StrncatExit6)
+	cmp	$7, %ebx
+	je	L(StrncatExit7)
+	movlpd	(%ecx), %xmm0	/* Fallthrough: copy 8 bytes and terminate at dest[8].  */
+	movlpd	%xmm0, (%edx)
+	movb	%bh, 8(%edx)	/* %ebx <= 8 on this path, so %bh is 0.  */
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitHighCase3):
+	cmp	$9, %ebx
+	je	L(StrncatExit9)
+	cmp	$10, %ebx
+	je	L(StrncatExit10)
+	cmp	$11, %ebx
+	je	L(StrncatExit11)
+	cmp	$12, %ebx
+	je	L(StrncatExit12)
+	cmp	$13, %ebx
+	je	L(StrncatExit13)
+	cmp	$14, %ebx
+	je	L(StrncatExit14)
+	cmp	$15, %ebx
+	je	L(StrncatExit15)
+	movlpd	(%ecx), %xmm0	/* Fallthrough: copy 16 bytes and terminate at dest[16].  */
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm1, 8(%edx)
+	movb	%bh, 16(%edx)	/* Relies on %bh == 0, i.e. %ebx < 256 here -- presumably exactly 16; confirm.  */
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit0):	/* n == 0: write nothing and return dest.  */
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit15Bytes):	/* Entered with src bytes 0..8 non-NUL and 8 < n < 16; alternates limit checks with NUL probes.  */
+	cmp	$9, %ebx
+	je	L(StrncatExit9)
+	cmpb	$0, 9(%ecx)
+	jz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(StrncatExit10)
+	cmpb	$0, 10(%ecx)
+	jz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(StrncatExit11)
+	cmpb	$0, 11(%ecx)
+	jz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(StrncatExit12)
+	cmpb	$0, 12(%ecx)
+	jz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(StrncatExit13)
+	cmpb	$0, 13(%ecx)
+	jz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(StrncatExit14)
+	movlpd	(%ecx), %xmm0	/* n == 15: copy 15 bytes with two overlapping 8-byte moves.  */
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+	lea	14(%edx), %eax
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte just written at dest[14] is NUL.  */
+	sbb	$-1, %eax	/* %eax += 1 - CF: terminator goes to dest[14] if it is already NUL, else dest[15].  */
+	movb	%bh, (%eax)	/* %ebx == 15, so %bh is 0.  */
+	movl	%edi, %eax
+	RETURN1
+
+	.p2align 4
+L(StrncatExit8Bytes):	/* Entered with 1 <= n <= 8; for K = 1..7: NUL at byte K-1 -> L(ExitK), else n == K -> L(StrncatExitK).  */
+	cmpb	$0, (%ecx)
+	jz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(StrncatExit1)
+	cmpb	$0, 1(%ecx)
+	jz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(StrncatExit2)
+	cmpb	$0, 2(%ecx)
+	jz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(StrncatExit3)
+	cmpb	$0, 3(%ecx)
+	jz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(StrncatExit4)
+	cmpb	$0, 4(%ecx)
+	jz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(StrncatExit5)
+	cmpb	$0, 5(%ecx)
+	jz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(StrncatExit6)
+	cmpb	$0, 6(%ecx)
+	jz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(StrncatExit7)
+	movlpd	(%ecx), %xmm0	/* n == 8 with bytes 0..6 non-NUL: copy 8 bytes.  */
+	movlpd	%xmm0, (%edx)
+	lea	7(%edx), %eax
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte just written at dest[7] is NUL.  */
+	sbb	$-1, %eax	/* Terminator goes to dest[7] if it is already NUL, else dest[8].  */
+	movb	%bh, (%eax)	/* %ebx == 8, so %bh is 0.  */
+	movl	%edi, %eax
+	RETURN1
+
+# endif
+END (STRCAT)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S
new file mode 100644
index 0000000000..8412cb6f23
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S
@@ -0,0 +1,92 @@
+/* Multiple versions of strcat
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef USE_AS_STRNCAT
+# ifndef STRCAT
+#  define STRCAT strcat
+# endif
+#endif
+
+#ifdef USE_AS_STRNCAT
+# define STRCAT_SSSE3	__strncat_ssse3
+# define STRCAT_SSE2		__strncat_sse2
+# define STRCAT_IA32		__strncat_ia32
+# define __GI_STRCAT		__GI_strncat
+#else
+# define STRCAT_SSSE3	__strcat_ssse3
+# define STRCAT_SSE2		__strcat_sse2
+# define STRCAT_IA32		__strcat_ia32
+# define __GI_STRCAT		__GI_strcat
+#endif
+
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncat in static library since we
+   need strncat before the initialization happened.  */
+#if IS_IN (libc)
+
+	.text
+ENTRY(STRCAT)
+	.type	STRCAT, @gnu_indirect_function	/* STRCAT is an IFUNC resolver; the chosen implementation is presumably returned in %eax (macros below are defined elsewhere).  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (STRCAT_IA32)	/* Default choice: the plain IA-32 version.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* Presumably ZF means "feature absent": without SSE2 keep IA-32.  */
+	LOAD_FUNC_GOT_EAX (STRCAT_SSE2)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* Fast_Unaligned_Load set: keep the SSE2 version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* No SSSE3: keep the SSE2 version.  */
+	LOAD_FUNC_GOT_EAX (STRCAT_SSSE3)
+2:	ret
+END(STRCAT)
+
+# undef ENTRY	/* From here on ENTRY/END ignore their argument and always emit the hidden symbol STRCAT_IA32.  */
+# define ENTRY(name) \
+	.type STRCAT_IA32, @function; \
+	.align 16; \
+	.globl STRCAT_IA32; \
+	.hidden STRCAT_IA32; \
+	STRCAT_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCAT_IA32, .-STRCAT_IA32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcat calls through a PLT.
+   The speedup we get from using SSSE3 instruction is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCAT; __GI_STRCAT = STRCAT_IA32
+#  undef libc_hidden_def	/* NOTE(review): __GI___STRCAT below is one token with no macro of that name in this file, so it is emitted literally -- confirm this alias is intended.  */
+#  define libc_hidden_def(name) \
+	.globl __GI___STRCAT; __GI___STRCAT = STRCAT_IA32
+
+# endif
+#endif
+
+#ifndef USE_AS_STRNCAT
+# include "../../strcat.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
new file mode 100644
index 0000000000..95fd7c084e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
@@ -0,0 +1,158 @@
+/* strchr with SSE2 with bsf
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	/* Unwind bookkeeping for a 4-byte push: grow the CFA offset by 4 and record REG at offset 0.  */ \
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	/* Unwind bookkeeping for a 4-byte pop: shrink the CFA offset by 4 and mark REG restored.  */ \
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)	/* Push REG and emit the matching CFI.  */
+# define POP(REG) popl REG; CFI_POP (REG)	/* Pop REG and emit the matching CFI.  */
+
+# define PARMS  8	/* First argument's offset from %esp after ENTRANCE: return address (4) + saved %edi (4).  */
+# define ENTRANCE PUSH(%edi)
+# define RETURN  POP(%edi); ret; CFI_PUSH(%edi);	/* Restore %edi and return; the trailing CFI_PUSH re-declares the frame for code placed after the ret.  */
+
+# define STR1  PARMS	/* Offset of the string argument.  */
+# define STR2  STR1+4	/* Offset of the character argument.  */
+
+	.text
+ENTRY (__strchr_sse2_bsf)	/* Returns in %eax the address of the first byte equal to the second argument, or 0 if a NUL comes first.  */
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx	/* %ecx = string pointer.  */
+	movd	STR2(%esp), %xmm1	/* Low dword of %xmm1 = character argument.  */
+
+	pxor	%xmm2, %xmm2	/* %xmm2 = 0, the NUL pattern.  */
+	mov	%ecx, %edi
+	punpcklbw %xmm1, %xmm1	/* Two unpacks replicate the low byte across the low dword...  */
+	punpcklbw %xmm1, %xmm1
+	/* ECX has OFFSET. */
+	and	$15, %ecx
+	pshufd	$0, %xmm1, %xmm1	/* ...and pshufd broadcasts that dword to all 16 bytes.  */
+	je	L(loop)	/* Tests ZF from the `and' above: the pointer is already 16-byte aligned.  */
+
+/* Handle unaligned string.  */
+	and	$-16, %edi	/* Round down to a 16-byte boundary so movdqa can be used.  */
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes.  */
+	sarl	%cl, %edx
+	sarl	%cl, %eax
+	test	%eax, %eax
+	je	L(unaligned_no_match)
+	/* Check which byte is a match.  */
+	bsf	%eax, %eax
+	/* Is there a NULL? */
+	test	%edx, %edx
+	je	L(unaligned_match)
+	bsf	%edx, %edx
+	cmpl	%edx, %eax
+	/* Return NULL if NULL comes first.  */
+	ja	L(return_null)
+L(unaligned_match):
+	add	%edi, %eax	/* Result = aligned base + misalignment + bit index.  */
+	add	%ecx, %eax
+	RETURN
+
+	.p2align 4
+L(unaligned_no_match):
+	test	%edx, %edx
+	jne	L(return_null)	/* No match, but a NUL at or after the start: not found.  */
+	pxor	%xmm2, %xmm2	/* %xmm2 may still flag NULs located before the start; clear it for the loop.  */
+
+	add	$16, %edi
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop):	/* Unrolled four times, 16 bytes per step.  */
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2	/* %xmm2 is zero on entry and becomes the NUL mask; it is zero again whenever the loop continues.  */
+	add	$16, %edi	/* Pre-advance; L(match) subtracts the 16 back.  */
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	jmp	L(loop)
+
+L(matches):
+	pmovmskb %xmm2, %edx	/* Recompute the NUL mask: %edx was merged with the match mask above.  */
+	test	%eax, %eax
+	jz	L(return_null)	/* Only a NUL was found in this block.  */
+	bsf	%eax, %eax
+	/* There is a match.  First find where NULL is.  */
+	test	%edx, %edx
+	je	L(match)
+	bsf	%edx, %ecx
+	/* Check if NULL comes first.  */
+	cmpl	%ecx, %eax
+	ja	L(return_null)
+L(match):
+	sub	$16, %edi
+	add	%edi, %eax
+	RETURN
+
+/* Return NULL.  */
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+END (__strchr_sse2_bsf)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S
new file mode 100644
index 0000000000..1f9e875b04
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S
@@ -0,0 +1,348 @@
+/* strchr SSE2 without bsf
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)	/* REG is saved at the new top of stack.  */
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)	/* REG holds its caller value again.  */
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)	/* Push REG and record the save in the unwind info.  */
+# define POP(REG) popl REG; CFI_POP (REG)	/* Pop REG and update the unwind info.  */
+
+# define PARMS  8	/* Offset of the first argument after ENTRANCE: return address + saved EDI.  */
+# define ENTRANCE PUSH(%edi)	/* EDI is the only register this function saves.  */
+# define RETURN  POP(%edi); ret; CFI_PUSH(%edi);	/* The trailing CFI_PUSH re-describes the frame for the code placed after the ret.  */
+
+# define STR1  PARMS	/* Stack offset of the string argument.  */
+# define STR2  STR1+4	/* Stack offset of the character argument.  */
+
+	atom_text_section
+ENTRY (__strchr_sse2)	/* strchr (s, c): EAX = address of the first byte equal to (char) c, or 0 if a NUL comes first.  */
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx	/* ECX = s.  */
+	movd	STR2(%esp), %xmm1	/* Low byte of XMM1 = c.  */
+
+	pxor	%xmm2, %xmm2	/* XMM2 = 0, compared against to find NUL bytes.  */
+	mov	%ecx, %edi
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+	/* ECX = offset of the string within its 16-byte block.  */
+	and	$15, %ecx
+	pshufd	$0, %xmm1, %xmm1	/* XMM1 = c replicated into all 16 bytes.  */
+	je	L(loop)	/* ZF is still from the AND: the string is 16-byte aligned.  */
+
+/* Handle unaligned string: load the enclosing aligned block and ignore the bytes before s.  */
+	and	$-16, %edi
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* EDX = bit mask of NUL bytes in the block.  */
+	pmovmskb %xmm2, %edx
+	/* EAX = bit mask of bytes equal to c.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes, so that bit 0 corresponds to s[0].  */
+	sarl	%cl, %edx
+	sarl	%cl, %eax
+	test	%eax, %eax
+	jz	L(unaligned_no_match)
+	/* There is a match.  Point EDI back at s so that mask bit N means N(%edi),  */
+	/* then pick the decoder depending on whether the block also holds a NUL.  */
+	add	%ecx, %edi
+	test	%edx, %edx
+	jz	L(match_case1)
+	jmp	L(match_case2)
+
+	.p2align 4
+L(unaligned_no_match):
+	test	%edx, %edx
+	jne	L(return_null)	/* No match but a NUL: c is not in the string.  */
+
+	pxor	%xmm2, %xmm2	/* Re-zero XMM2: bytes before s may have been NUL.  */
+	add	$16, %edi	/* EDI is still the aligned block address; step to the next block.  */
+
+	.p2align 4
+/* Loop start on aligned string.  Unrolled 4 times; XMM2 is all-zero at the top of every step.  */
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2	/* XMM2 = 0xff in every NUL byte position.  */
+	pcmpeqb	%xmm1, %xmm0	/* XMM0 = 0xff in every position equal to c.  */
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi	/* No NUL was found, so XMM2 is all-zero again for the next step.  */
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)
+	add	$16, %edi
+	jmp	L(loop)
+
+L(matches):
+	/* There is a match (EAX != 0).  If the block holds no NUL (EDX == 0) only EAX needs decoding.  */
+	test	%edx, %edx
+	jz	L(match_case1)
+
+	.p2align 4
+L(match_case2):	/* Both a match and a NUL are present: walk the bits from low to high without bsf.  */
+	test	%al, %al
+	jz	L(match_higth_case2)	/* No match in bytes 0-7.  */
+
+	mov	%al, %cl
+	and	$15, %cl
+	jnz	L(match_case2_4)	/* A match lies in bytes 0-3.  */
+
+	mov	%dl, %ch
+	and	$15, %ch
+	jnz	L(return_null)	/* A NUL in bytes 0-3 precedes the match in bytes 4-7.  */
+
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x10, %dl
+	jnz	L(return_null)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x20, %dl
+	jnz	L(return_null)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	test	$0x40, %dl
+	jnz	L(return_null)
+	lea	7(%edi), %eax	/* Only bit 7 of AL can be the match.  */
+	RETURN
+
+	.p2align 4
+L(match_case2_4):	/* Match in bytes 0-3: at each position the match bit is tested before the NUL bit.  */
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x01, %dl
+	jnz	L(return_null)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x02, %dl
+	jnz	L(return_null)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x04, %dl
+	jnz	L(return_null)
+	lea	3(%edi), %eax	/* Only bit 3 of AL can be the match.  */
+	RETURN
+
+	.p2align 4
+L(match_higth_case2):	/* Same walk for bytes 8-15, using the AH/DH halves of the masks.  */
+	test	%dl, %dl
+	jnz	L(return_null)	/* A NUL in bytes 0-7 precedes the match.  */
+
+	mov	%ah, %cl
+	and	$15, %cl
+	jnz	L(match_case2_12)	/* A match lies in bytes 8-11.  */
+
+	mov	%dh, %ch
+	and	$15, %ch
+	jnz	L(return_null)	/* A NUL in bytes 8-11 precedes the match in bytes 12-15.  */
+
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x10, %dh
+	jnz	L(return_null)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x20, %dh
+	jnz	L(return_null)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	test	$0x40, %dh
+	jnz	L(return_null)
+	lea	15(%edi), %eax	/* Only bit 7 of AH can be the match.  */
+	RETURN
+
+	.p2align 4
+L(match_case2_12):	/* Match in bytes 8-11.  */
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x01, %dh
+	jnz	L(return_null)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x02, %dh
+	jnz	L(return_null)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x04, %dh
+	jnz	L(return_null)
+	lea	11(%edi), %eax	/* Only bit 3 of AH can be the match.  */
+	RETURN
+
+	.p2align 4
+L(match_case1):	/* A match and no NUL in this block: return the position of the lowest set bit of EAX.  */
+	test	%al, %al
+	jz	L(match_higth_case1)	/* No match in bytes 0-7, so it is in bytes 8-15.  */
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	lea	7(%edi), %eax	/* AL is nonzero and bits 0-6 are clear: the match is byte 7.  */
+	RETURN
+
+	.p2align 4
+L(match_higth_case1):	/* Decode AH, which covers bytes 8-15.  */
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	lea	15(%edi), %eax	/* Bits 0-6 of AH are clear: the match is byte 15.  */
+	RETURN
+
+	.p2align 4
+L(Exit1):	/* L(ExitN) returns the address of byte N-1 of the block at EDI.  */
+	lea	(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	lea	1(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	lea	2(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	lea	3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit5):
+	lea	4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	lea	5(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	lea	6(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit9):	/* There is no Exit8 or Exit16: bytes 7 and 15 are returned inline by the decoders.  */
+	lea	8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	lea	9(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	lea	10(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	lea	11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit13):
+	lea	12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	lea	13(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	lea	14(%edi), %eax
+	RETURN
+
+/* Return NULL: the terminating NUL was reached before any byte equal to c.  */
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+END (__strchr_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S
new file mode 100644
index 0000000000..5b97b1c767
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S
@@ -0,0 +1,57 @@
+/* Multiple versions of strchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(strchr)	/* IFUNC resolver: returns in EAX the address of the strchr implementation to use.  */
+	.type	strchr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strchr_ia32)	/* Default choice.  */
+	HAS_CPU_FEATURE (SSE2)	/* NOTE(review): presumably leaves ZF set when the feature is absent - confirm in init-arch.h.  */
+	jz	2f	/* Keep the default.  */
+	LOAD_FUNC_GOT_EAX (__strchr_sse2_bsf)	/* Choice once the SSE2 test passes.  */
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	2f	/* Keep the bsf-based version.  */
+	LOAD_FUNC_GOT_EAX (__strchr_sse2)	/* Slow_BSF test passed: use the version without bsf.  */
+2:	ret
+END(strchr)
+
+# undef ENTRY	/* Redefine ENTRY/END so that the strchr.S included below is assembled under the name __strchr_ia32.  */
+# define ENTRY(name) \
+	.type __strchr_ia32, @function; \
+	.globl __strchr_ia32; \
+	.p2align 4; \
+	__strchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strchr_ia32, .-__strchr_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library, since
+   they are called without setting up EBX, which the PLT used by IFUNC
+   needs.  So bind the internal alias __GI_strchr directly to __strchr_ia32.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strchr; __GI_strchr = __strchr_ia32
+#endif
+
+#include "../../i586/strchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S
new file mode 100644
index 0000000000..cd26058671
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S
@@ -0,0 +1,804 @@
+/* strcmp with SSE4.2
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)	/* Push REG and record the save in the unwind info.  */
+#define POP(REG)	popl REG; CFI_POP (REG)	/* Pop REG and update the unwind info.  */
+
+#ifdef USE_AS_STRNCMP
+# ifndef STRCMP
+#  define STRCMP	__strncmp_sse4_2
+# endif
+# define STR1		8	/* Return address + saved REM.  */
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define RETURN		POP (REM); ret; .p2align 4; CFI_PUSH (REM)
+# define REM		%ebp	/* Number of bytes still to compare.  */
+#elif defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+#  define STRCMP	__strcasecmp_l_sse4_2
+# endif
+# ifdef PIC
+#  define STR1		12	/* Return address + saved EBX + saved EDI.  */
+# else
+#  define STR1		8	/* Return address + saved EDI.  */
+# endif
+# define STR2		STR1+4
+# define LOCALE		12	/* Offset of the locale argument at entry; it is read before any register is pushed.  */
+# ifdef PIC
+#  define RETURN	POP (%edi); POP (%ebx); ret; \
+			.p2align 4; CFI_PUSH (%ebx); CFI_PUSH (%edi)
+# else
+#  define RETURN	POP (%edi); ret; .p2align 4; CFI_PUSH (%edi)
+# endif
+# define NONASCII	__strcasecmp_nonascii	/* Jump target when the locale's _NL_CTYPE_NONASCII_CASE flag is set.  */
+#elif defined USE_AS_STRNCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+#  define STRCMP	__strncasecmp_l_sse4_2
+# endif
+# ifdef PIC
+#  define STR1		16	/* Return address + saved EBX + saved REM + saved EDI.  */
+# else
+#  define STR1		12	/* Return address + saved REM + saved EDI.  */
+# endif
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define LOCALE		16	/* Offset of the locale argument at entry; it is read before any register is pushed.  */
+# ifdef PIC
+#  define RETURN	POP (%edi); POP (REM); POP (%ebx); ret; \
+			.p2align 4; \
+			CFI_PUSH (%ebx); CFI_PUSH (REM); CFI_PUSH (%edi)
+# else
+#  define RETURN	POP (%edi); POP (REM); ret; \
+			.p2align 4; CFI_PUSH (REM); CFI_PUSH (%edi)
+# endif
+# define REM		%ebp	/* Number of bytes still to compare.  */
+# define NONASCII	__strncasecmp_nonascii	/* Jump target when the locale's _NL_CTYPE_NONASCII_CASE flag is set.  */
+#else
+# ifndef STRCMP
+#  define STRCMP	__strcmp_sse4_2
+# endif
+# define STR1		4	/* Return address only: plain strcmp pushes nothing on its fast paths.  */
+# define STR2		STR1+4
+# define RETURN		ret; .p2align 4
+#endif
+
+	.section .text.sse4.2,"ax",@progbits
+
+#ifdef USE_AS_STRCASECMP_L
+ENTRY (__strcasecmp_sse4_2)	/* strcasecmp without a locale argument: take the locale from __libc_tsd_LOCALE, then join the _l code at L(ascii).  */
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+	movl	__libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax	/* EAX = TLS offset of __libc_tsd_LOCALE.  */
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	addl	%gs:0, %eax
+	movl	(%eax), %eax
+#  else
+	movl	%gs:(%eax), %eax
+#  endif
+# else
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	movl	%gs:0, %eax
+	movl	__libc_tsd_LOCALE@NTPOFF(%eax), %eax
+#  else
+	movl	%gs:__libc_tsd_LOCALE@NTPOFF, %eax
+#  endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax	/* EAX = the locale's LC_CTYPE entry.  */
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)	/* Test bit 0 of the _NL_CTYPE_NONASCII_CASE value.  */
+# ifdef PIC
+	je	L(ascii)	/* Bit clear: continue at L(ascii) with EBX already pushed and loaded.  */
+	POP	(%ebx)	/* Bit set: restore the entry stack layout before the tail jump.  */
+	jmp	__strcasecmp_nonascii
+# else
+	jne	__strcasecmp_nonascii
+	jmp	L(ascii)
+# endif
+END (__strcasecmp_sse4_2)
+#endif
+
+#ifdef USE_AS_STRNCASECMP_L
+ENTRY (__strncasecmp_sse4_2)	/* strncasecmp without a locale argument: take the locale from __libc_tsd_LOCALE, then join the _l code at L(ascii).  */
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)
+	movl	__libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax	/* EAX = TLS offset of __libc_tsd_LOCALE.  */
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	addl	%gs:0, %eax
+	movl	(%eax), %eax
+#  else
+	movl	%gs:(%eax), %eax
+#  endif
+# else
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	movl	%gs:0, %eax
+	movl	__libc_tsd_LOCALE@NTPOFF(%eax), %eax
+#  else
+	movl	%gs:__libc_tsd_LOCALE@NTPOFF, %eax
+#  endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax	/* EAX = the locale's LC_CTYPE entry.  */
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)	/* Test bit 0 of the _NL_CTYPE_NONASCII_CASE value.  */
+# ifdef PIC
+	je	L(ascii)	/* Bit clear: continue at L(ascii) with EBX already pushed and loaded.  */
+	POP	(%ebx)	/* Bit set: restore the entry stack layout before the tail jump.  */
+	jmp	__strncasecmp_nonascii
+# else
+	jne	__strncasecmp_nonascii
+	jmp	L(ascii)
+# endif
+END (__strncasecmp_sse4_2)
+#endif
+
+	ENTRY (STRCMP)	/* Shared body of strcmp, strncmp, strcasecmp_l and strncasecmp_l, selected by the USE_AS_* macros.  */
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movl	LOCALE(%esp), %eax	/* EAX = locale argument; nothing has been pushed yet.  */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+	jne	NONASCII	/* Bit 0 set: leave to the NONASCII routine with the stack untouched.  */
+
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)	/* EBX is the base for the @GOTOFF references below.  */
+# endif
+L(ascii):
+	.section .rodata.cst16,"aM",@progbits,16
+	.align 16
+.Lbelowupper:
+	.quad	0x4040404040404040	/* 'A' - 1 in every byte.  */
+	.quad	0x4040404040404040
+.Ltopupper:
+	.quad	0x5b5b5b5b5b5b5b5b	/* 'Z' + 1 in every byte.  */
+	.quad	0x5b5b5b5b5b5b5b5b
+.Ltouppermask:
+	.quad	0x2020202020202020	/* The bit that differs between ASCII upper and lower case.  */
+	.quad	0x2020202020202020
+	.previous
+
+# ifdef PIC
+#  define UCLOW_reg .Lbelowupper@GOTOFF(%ebx)
+#  define UCHIGH_reg .Ltopupper@GOTOFF(%ebx)
+#  define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx)
+# else
+#  define UCLOW_reg .Lbelowupper
+#  define UCHIGH_reg .Ltopupper
+#  define LCQWORD_reg .Ltouppermask
+# endif
+#endif
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	PUSH	(REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	PUSH	(%edi)
+#endif
+	mov	STR1(%esp), %edx	/* EDX = first string.  */
+	mov	STR2(%esp), %eax	/* EAX = second string.  */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	movl	CNT(%esp), REM
+	test	REM, REM
+	je	L(eq)	/* A count of zero compares equal.  */
+#endif
+	mov	%dx, %cx
+	and	$0xfff, %cx
+	cmp	$0xff0, %cx
+	ja	L(first4bytes)	/* A 16-byte load from EDX would cross a 0x1000 boundary: go bytewise.  */
+	movdqu	(%edx), %xmm2
+	mov	%eax, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(first4bytes)	/* Same check for the second string.  */
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
+	movdqa	reg1, %xmm3;						      \
+	movdqa	UCHIGH_reg, %xmm4;					      \
+	movdqa	reg2, %xmm5;						      \
+	movdqa	UCHIGH_reg, %xmm6;					      \
+	pcmpgtb	UCLOW_reg, %xmm3;					      \
+	pcmpgtb	reg1, %xmm4;						      \
+	pcmpgtb	UCLOW_reg, %xmm5;					      \
+	pcmpgtb	reg2, %xmm6;						      \
+	pand	%xmm4, %xmm3;						      \
+	pand	%xmm6, %xmm5;						      \
+	pand	LCQWORD_reg, %xmm3;					      \
+	pand	LCQWORD_reg, %xmm5;					      \
+	por	%xmm3, reg1;						      \
+	por	%xmm5, reg2	/* Sets bit 0x20 in every byte of reg1/reg2 that lies in 'A'..'Z'; clobbers XMM3-XMM6.  */
+
+	movdqu	(%eax), %xmm1
+	TOLOWER (%xmm2, %xmm1)
+	movd	%xmm2, %ecx
+	movd	%xmm1, %edi
+	movdqa	%xmm2, %xmm3	/* Keep the folded blocks: L(less16bytes) reads XMM3/XMM4.  */
+	movdqa	%xmm1, %xmm4
+	cmpl	%edi, %ecx
+#else
+# define TOLOWER(reg1, reg)	/* No case folding for strcmp/strncmp.  */
+
+	movd	%xmm2, %ecx
+	cmp	(%eax), %ecx
+#endif
+	jne	L(less4bytes)	/* The first four bytes already differ.  */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	movdqu	(%eax), %xmm1
+#endif
+	pxor	%xmm2, %xmm1	/* XMM1 = bytewise difference of the two blocks.  */
+	pxor	%xmm0, %xmm0
+	ptest	%xmm1, %xmm0	/* With XMM0 = 0, CF is set exactly when XMM1 is all zero.  */
+	jnc	L(less16bytes)	/* The blocks differ somewhere.  */
+	pcmpeqb	%xmm0, %xmm2
+	ptest	%xmm2, %xmm0	/* CF is set exactly when the block holds no NUL byte.  */
+	jnc	L(less16bytes)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, REM
+	jbe	L(eq)	/* The count ends within these 16 equal bytes.  */
+#endif
+	add	$16, %edx
+	add	$16, %eax
+L(first4bytes):	/* Despite the name, this compares bytes 0-7 one at a time.  */
+	movzbl	(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx	/* Map both bytes through the _nl_C_LC_CTYPE_tolower table (4-byte entries).  */
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, (%edx)
+#endif
+	jne	L(neq)	/* L(neq) derives the sign from the flags of this compare.  */
+	cmpl	$0, %ecx
+	je	L(eq)	/* Equal and NUL: both strings end here.  */
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$1, REM
+	je	L(eq)	/* The count is exhausted after this byte.  */
+#endif
+
+	movzbl	1(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	1(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 1(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$2, REM
+	je	L(eq)
+#endif
+	movzbl	2(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	2(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 2(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$3, REM
+	je	L(eq)
+#endif
+	movzbl	3(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	3(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 3(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	je	L(eq)
+#endif
+	movzbl	4(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	4(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 4(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$5, REM
+	je	L(eq)
+#endif
+	movzbl	5(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	5(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 5(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$6, REM
+	je	L(eq)
+#endif
+	movzbl	6(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	6(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 6(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	je	L(eq)
+#endif
+	movzbl	7(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	7(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 7(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$8, REM	/* Account for the eight bytes just compared.  */
+	je	L(eq)
+#endif
+	add	$8, %eax
+	add	$8, %edx
+
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	PUSH	(%edi)	/* The case-folding variants already pushed EDI at function entry.  */
+#endif
+	PUSH	(%esi)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cfi_remember_state
+#endif
+	mov	%edx, %edi	/* EDI = first string.  */
+	mov	%eax, %esi	/* ESI = second string.  */
+	xorl	%eax, %eax	/* Result so far: equal.  */
+L(check_offset):
+	movl	%edi, %edx
+	movl	%esi, %ecx
+	andl	$0xfff, %edx
+	andl	$0xfff, %ecx
+	cmpl	%edx, %ecx
+	cmovl	%edx, %ecx	/* ECX = the larger of the two offsets within a 0x1000 region.  */
+	lea	-0xff0(%ecx), %edx	/* EDX <= 0: distance to the last offset where a 16-byte load is still safe.  */
+	sub	%edx, %edi	/* Bias both pointers so that (reg,%edx) addresses the current position.  */
+	sub	%edx, %esi
+	testl	%edx, %edx
+	jg	L(crosspage)
+L(loop):
+	movdqu	(%esi,%edx), %xmm2
+	movdqu	(%edi,%edx), %xmm1
+	TOLOWER (%xmm2, %xmm1)
+	pcmpistri	$0x1a, %xmm2, %xmm1	/* 0x1a: signed bytes, equal-each, negated result; ECX = index of the first mismatch.  */
+	jbe	L(end)	/* CF: a mismatch was found; ZF: the end of a string was reached.  */
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, REM
+	jbe	L(more16byteseq)
+#endif
+
+	add	$16, %edx
+	jle	L(loop)	/* Continue while the next 16-byte load stays below the boundary.  */
+L(crosspage):	/* Compare one byte at a time until EDX exceeds 15, then recompute the offsets.  */
+	movzbl	(%edi,%edx), %eax
+	movzbl	(%esi,%edx), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+# endif
+#endif
+	subl	%ecx, %eax	/* EAX = difference of the two bytes, which is the return value.  */
+	jne	L(ret)
+	testl	%ecx, %ecx
+	je	L(ret)	/* Both bytes are NUL: return the 0 in EAX.  */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$1, REM
+	jbe	L(more16byteseq)
+#endif
+	inc	%edx
+	cmp	$15, %edx
+	jle	L(crosspage)
+	add	%edx, %edi	/* Fold the index back into the pointers.  */
+	add	%edx, %esi
+	jmp	L(check_offset)
+
+	.p2align 4
+L(end):	/* Reached from L(loop) with the pcmpistri flags live and ECX = mismatch index.  */
+	jnc	L(ret)	/* No mismatch, so a string ended: return the 0 still held in EAX.  */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	%ecx, REM
+	jbe	L(more16byteseq)	/* The mismatch lies at or beyond the remaining count.  */
+#endif
+	lea	(%ecx,%edx), %ecx
+	movzbl	(%edi,%ecx), %eax
+	movzbl	(%esi,%ecx), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+# endif
+#endif
+	subl	%ecx, %eax	/* Return the difference of the mismatching bytes.  */
+L(ret):	/* Exit for the paths that have ESI and EDI pushed.  */
+	POP	(%esi)
+	POP	(%edi)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	POP	(REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	POP	(%ebx)
+# endif
+#endif
+	ret
+
+	.p2align 4
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cfi_restore_state
+L(more16byteseq):	/* Count exhausted in the main loop: pop what RETURN does not, then share L(eq).  */
+	POP	(%esi)
+# ifdef USE_AS_STRNCMP
+	POP	(%edi)
+# endif
+#endif
+L(eq):
+	xorl	%eax, %eax
+	RETURN
+
+L(neq):	/* Entered with the flags of the byte compare (first string's byte minus the second's).  */
+	mov	$1, %eax	/* mov leaves those flags intact.  */
+	ja	L(neq_bigger)	/* Unsigned above: return 1.  */
+	neg	%eax	/* Otherwise return -1.  */
+L(neq_bigger):
+	RETURN
+
+L(less16bytes):	/* The first 16 bytes differ or hold a NUL; ECX = bytes 0-3 of the first string, which equal the second's.  */
+	add	$0xfefefeff, %ecx	/* Word-at-a-time NUL test: add, xor with the original, or, add 1.  */
+	jnc	L(less4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movd	%xmm3, %edi
+	xor	%edi, %ecx
+#else
+	xor	(%edx), %ecx
+#endif
+	or	$0xfefefeff, %ecx
+	add	$1, %ecx
+	jnz	L(less4bytes)	/* The test flagged a byte in 0-3: examine them one at a time.  */
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	jbe	L(eq)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	psrldq	$4, %xmm3	/* Bring bytes 4-7 of the folded blocks into the low dword.  */
+	psrldq	$4, %xmm4
+	movd	%xmm3, %ecx
+	movd	%xmm4, %edi
+	cmp	%edi, %ecx
+	mov	%ecx, %edi	/* Keep the original dword for the xor below; mov leaves the flags intact.  */
+#else
+	mov	4(%edx), %ecx
+	cmp	4(%eax), %ecx
+#endif
+	jne	L(more4bytes)
+	add	$0xfefefeff, %ecx	/* Same NUL test on bytes 4-7.  */
+	jnc	L(more4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	xor	%edi, %ecx
+#else
+	xor	4(%edx), %ecx
+#endif
+	or	$0xfefefeff, %ecx
+	add	$1, %ecx
+	jnz	L(more4bytes)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$8, REM
+	jbe	L(eq)
+#endif
+
+	add	$8, %edx	/* Bytes 0-7 are equal and non-NUL: continue bytewise with bytes 8-15.  */
+	add	$8, %eax
+L(less4bytes):	/* Compare bytes 0-3 at EDX/EAX one at a time, then fall through to L(more4bytes).  */
+
+	movzbl	(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, (%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)	/* Equal and NUL: both strings end here.  */
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$1, REM
+	je	L(eq)
+#endif
+	movzbl	1(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	1(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 1(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$2, REM
+	je	L(eq)
+#endif
+
+	movzbl	2(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	2(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 2(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$3, REM
+	je	L(eq)
+#endif
+	movzbl	3(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	3(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 3(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+L(more4bytes):	/* Compare bytes 4-7 at EDX/EAX one at a time.  */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	je	L(eq)
+#endif
+	movzbl	4(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	4(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 4(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$5, REM
+	je	L(eq)
+#endif
+	movzbl	5(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	5(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 5(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$6, REM
+	je	L(eq)
+#endif
+	movzbl	6(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	6(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 6(%edx)
+#endif
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	je	L(eq)
+#endif
+	movzbl	7(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	7(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
+	cmpb	%cl, 7(%edx)
+#endif
+	jne	L(neq)
+	jmp	L(eq)	/* Last byte of the group: no NUL test is made here, an equal byte 7 returns 0.  */
+
+END (STRCMP)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
new file mode 100644
index 0000000000..b25cc3e068
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
@@ -0,0 +1,2810 @@
+/* strcmp with SSSE3
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)	/* unwind annotations for a 4-byte push of REG */	\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)	/* unwind annotations for the matching 4-byte pop */	\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)	/* push REG and annotate it in one step */
+#define POP(REG)	popl REG; CFI_POP (REG)	/* pop REG and annotate it in one step */
+
+#ifdef USE_AS_STRNCMP	/* length-limited, case-sensitive variant */
+# ifndef STRCMP
+#  define STRCMP	__strncmp_ssse3
+# endif
+# define STR1		8	/* %esp offset when loaded: return address + pushed REM */
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define RETURN		POP (REM); ret; .p2align 4; CFI_PUSH (REM)	/* CFI_PUSH after ret re-describes the frame for the code that follows */
+# define UPDATE_STRNCMP_COUNTER				\
+	/* calculate left number to compare */		\
+	mov	$16, %esi;	/* esi = 16 - ecx */	\
+	sub	%ecx, %esi;				\
+	cmp	%esi, REM;	/* REM <= esi: branch to L(more8byteseq) */	\
+	jbe	L(more8byteseq);			\
+	sub	%esi, REM	/* otherwise REM -= esi */
+# define FLAGS		%ebx	/* FLAGS lives in a register in this variant */
+# define REM		%ebp	/* remaining byte count */
+#elif defined USE_AS_STRCASECMP_L	/* unlimited, case-insensitive variant */
+# include "locale-defines.h"
+# ifndef STRCMP
+#  define STRCMP	__strcasecmp_l_ssse3
+# endif
+# ifdef PIC
+#  define STR1		8	/* return address + pushed %ebx */
+# else
+#  define STR1		4	/* return address only */
+# endif
+# define STR2		STR1+4
+# define LOCALE		12	/* Loaded before the adjustment.  */
+# ifdef PIC
+#  define RETURN	POP (%ebx); ret; .p2align 4; CFI_PUSH (%ebx)
+# else
+#  define RETURN	ret; .p2align 4
+# endif
+# define UPDATE_STRNCMP_COUNTER	/* expands to nothing: no length limit */
+# define FLAGS		(%esp)	/* FLAGS is the stack slot created by pushl $0 in L(crosspage) */
+# define NONASCII	__strcasecmp_nonascii
+#elif defined USE_AS_STRNCASECMP_L	/* length-limited, case-insensitive variant */
+# include "locale-defines.h"
+# ifndef STRCMP
+#  define STRCMP	__strncasecmp_l_ssse3
+# endif
+# ifdef PIC
+#  define STR1		12	/* return address + pushed %ebx + pushed REM */
+# else
+#  define STR1		8	/* return address + pushed REM */
+# endif
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define LOCALE		16	/* Loaded before the adjustment.  */
+# ifdef PIC
+#  define RETURN	POP (REM); POP (%ebx); ret; \
+			.p2align 4; CFI_PUSH (%ebx); CFI_PUSH (REM)
+# else
+#  define RETURN	POP (REM); ret; .p2align 4; CFI_PUSH (REM)
+# endif
+# define UPDATE_STRNCMP_COUNTER				\
+	/* calculate left number to compare */		\
+	mov	$16, %esi;	/* esi = 16 - ecx */	\
+	sub	%ecx, %esi;				\
+	cmp	%esi, REM;	/* REM <= esi: branch to L(more8byteseq) */	\
+	jbe	L(more8byteseq);			\
+	sub	%esi, REM	/* otherwise REM -= esi */
+# define FLAGS		(%esp)	/* FLAGS is the stack slot created by pushl $0 in L(crosspage) */
+# define REM		%ebp	/* remaining byte count */
+# define NONASCII	__strncasecmp_nonascii
+#else	/* plain strcmp */
+# ifndef STRCMP
+#  define STRCMP	__strcmp_ssse3
+# endif
+# define STR1		4	/* return address only */
+# define STR2		STR1+4
+# define RETURN		ret; .p2align 4
+# define UPDATE_STRNCMP_COUNTER	/* expands to nothing: no length limit */
+# define FLAGS		%ebx	/* FLAGS lives in a register in this variant */
+#endif
+
+	.section .text.ssse3,"ax",@progbits
+
+#ifdef USE_AS_STRCASECMP_L
+ENTRY (__strcasecmp_ssse3)	/* entry that takes its locale from thread-local __libc_tsd_LOCALE */
+# ifdef PIC
+	PUSH	(%ebx)	/* stays pushed on the L(ascii) path; STR1 accounts for it */
+	LOAD_PIC_REG(bx)
+	movl	__libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax	/* TLS offset read through the GOT */
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	addl	%gs:0, %eax	/* add the value stored at %gs:0 to form the address */
+	movl	(%eax), %eax
+#  else
+	movl	%gs:(%eax), %eax	/* eax = value of __libc_tsd_LOCALE */
+#  endif
+# else
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	movl	%gs:0, %eax
+	movl	__libc_tsd_LOCALE@NTPOFF(%eax), %eax
+#  else
+	movl	%gs:__libc_tsd_LOCALE@NTPOFF, %eax	/* eax = value of __libc_tsd_LOCALE */
+#  endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax	/* eax = LC_CTYPE entry of the locale */
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)	/* bit 0 of the NONASCII_CASE value */
+# ifdef PIC
+	je	L(ascii)	/* bit clear: continue in STRCMP at L(ascii) */
+	POP	(%ebx)	/* bit set: undo the push before leaving */
+	jmp	__strcasecmp_nonascii
+# else
+	jne	__strcasecmp_nonascii	/* bit set: hand over to the non-ASCII routine */
+	jmp	L(ascii)
+# endif
+END (__strcasecmp_ssse3)
+#endif
+
+#ifdef USE_AS_STRNCASECMP_L
+ENTRY (__strncasecmp_ssse3)	/* entry that takes its locale from thread-local __libc_tsd_LOCALE */
+# ifdef PIC
+	PUSH	(%ebx)	/* stays pushed on the L(ascii) path; STR1 accounts for it */
+	LOAD_PIC_REG(bx)
+	movl	__libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax	/* TLS offset read through the GOT */
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	addl	%gs:0, %eax	/* add the value stored at %gs:0 to form the address */
+	movl	(%eax), %eax
+#  else
+	movl	%gs:(%eax), %eax	/* eax = value of __libc_tsd_LOCALE */
+#  endif
+# else
+#  ifdef NO_TLS_DIRECT_SEG_REFS
+	movl	%gs:0, %eax
+	movl	__libc_tsd_LOCALE@NTPOFF(%eax), %eax
+#  else
+	movl	%gs:__libc_tsd_LOCALE@NTPOFF, %eax	/* eax = value of __libc_tsd_LOCALE */
+#  endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax	/* eax = LC_CTYPE entry of the locale */
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)	/* bit 0 of the NONASCII_CASE value */
+# ifdef PIC
+	je	L(ascii)	/* bit clear: continue in STRCMP at L(ascii) */
+	POP	(%ebx)	/* bit set: undo the push before leaving */
+	jmp	__strncasecmp_nonascii
+# else
+	jne	__strncasecmp_nonascii	/* bit set: hand over to the non-ASCII routine */
+	jmp	L(ascii)
+# endif
+END (__strncasecmp_ssse3)
+#endif
+
+ENTRY (STRCMP)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movl	LOCALE(%esp), %eax	/* locale argument; nothing has been pushed yet */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax	/* eax = LC_CTYPE entry of the locale */
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+	jne	NONASCII	/* bit 0 set: leave for the NONASCII routine */
+
+# ifdef PIC
+	PUSH	(%ebx)
+	LOAD_PIC_REG(bx)	/* %ebx is the base for the @GOTOFF operands below */
+# endif
+L(ascii):	/* the locale-less entry stubs above also jump here */
+	.section .rodata.cst16,"aM",@progbits,16
+	.align 16
+.Lbelowupper:	/* 16 bytes of 0x40, i.e. 'A' - 1 */
+	.quad	0x4040404040404040
+	.quad	0x4040404040404040
+.Ltopupper:	/* 16 bytes of 0x5b, i.e. 'Z' + 1 */
+	.quad	0x5b5b5b5b5b5b5b5b
+	.quad	0x5b5b5b5b5b5b5b5b
+.Ltouppermask:	/* 16 bytes of 0x20, the bit TOLOWER ORs into upper-case bytes */
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+# ifdef PIC	/* operand spellings for the three constants above */
+#  define UCLOW_reg .Lbelowupper@GOTOFF(%ebx)
+#  define UCHIGH_reg .Ltopupper@GOTOFF(%ebx)
+#  define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx)
+# else
+#  define UCLOW_reg .Lbelowupper
+#  define UCHIGH_reg .Ltopupper
+#  define LCQWORD_reg .Ltouppermask
+# endif
+#endif
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	PUSH	(REM)	/* save the register that will hold the remaining count */
+#endif
+
+	movl	STR1(%esp), %edx	/* edx = first string argument */
+	movl	STR2(%esp), %eax	/* eax = second string argument */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	movl	CNT(%esp), REM	/* REM = length limit */
+	cmp	$16, REM
+	jb	L(less16bytes_sncmp)	/* limit below 16 is handled at a label outside this chunk */
+#elif !defined USE_AS_STRCASECMP_L
+	movzbl	(%eax), %ecx	/* plain strcmp only: check bytes 0..7 one at a time */
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx	/* bytes matched; was it the terminating NUL? */
+	je	L(eq)
+
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	add	$8, %edx	/* first 8 bytes equal and non-NUL: step past them */
+	add	$8, %eax
+#endif
+	movl	%edx, %ecx	/* would a 16-byte read at %edx cross a 4096-byte boundary? */
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(crosspage)
+	mov	%eax, %ecx	/* same test for %eax */
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(crosspage)
+	pxor	%xmm0, %xmm0	/* xmm0 = 0, used to find NUL bytes */
+	movlpd	(%eax), %xmm1	/* xmm1 = 16 bytes at %eax, loaded as two 8-byte halves */
+	movlpd	(%edx), %xmm2	/* xmm2 = 16 bytes at %edx */
+	movhpd	8(%eax), %xmm1
+	movhpd	8(%edx), %xmm2
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) /* OR 0x20 into every byte of reg1/reg2 in 0x41..0x5a; clobbers xmm5-xmm7 */ \
+	movdqa	reg1, %xmm5;					\
+	movdqa	reg2, %xmm7;					\
+	movdqa	UCHIGH_reg, %xmm6;				\
+	pcmpgtb	UCLOW_reg, %xmm5;	/* xmm5 = (reg1 byte > 0x40), signed compare */	\
+	pcmpgtb	UCLOW_reg, %xmm7;	/* xmm7 = (reg2 byte > 0x40) */	\
+	pcmpgtb	reg1, %xmm6;	/* xmm6 = (0x5b > reg1 byte) */	\
+	pand	%xmm6, %xmm5;					\
+	movdqa	UCHIGH_reg, %xmm6;				\
+	pcmpgtb	reg2, %xmm6;	/* xmm6 = (0x5b > reg2 byte) */	\
+	pand	%xmm6, %xmm7;					\
+	pand	LCQWORD_reg, %xmm5;	/* 0x20 where reg1 holds an upper-case byte */	\
+	por	%xmm5, reg1;					\
+	pand	LCQWORD_reg, %xmm7;				\
+	por	%xmm7, reg2
+	TOLOWER (%xmm1, %xmm2)
+#else
+# define TOLOWER(reg1, reg2)
+#endif
+	pcmpeqb	%xmm1, %xmm0	/* xmm0 = 0xff where xmm1 has a NUL */
+	pcmpeqb	%xmm2, %xmm1	/* xmm1 = 0xff where the two blocks agree */
+	psubb	%xmm0, %xmm1	/* clear the "equal" marker on NUL positions */
+	pmovmskb %xmm1, %ecx
+	sub	$0xffff, %ecx	/* zero only if all 16 bytes are equal and non-NUL */
+	jnz	L(less16bytes)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM	/* lea leaves the flags of the cmp intact for the jbe */
+	jbe	L(eq)
+#endif
+	add	$16, %eax	/* advance both pointers and fall into L(crosspage) */
+	add	$16, %edx
+
+L(crosspage):	/* aligned-load path: pick a shift case from the two pointers' low 4 bits */
+
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	PUSH	(FLAGS)	/* FLAGS is %ebx in these variants */
+#endif
+	PUSH	(%edi)
+	PUSH	(%esi)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	pushl	$0	/* stack slot addressed as FLAGS = (%esp), initially 0 */
+	cfi_adjust_cfa_offset (4)
+#endif
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cfi_remember_state
+#endif
+
+	movl	%edx, %edi
+	movl	%eax, %ecx
+	and	$0xf, %ecx	/* ecx = offset of %eax inside its 16-byte block */
+	and	$0xf, %edi	/* edi = offset of %edx inside its 16-byte block */
+	xor	%ecx, %eax	/* clear those low bits: both pointers now 16-byte aligned */
+	xor	%edi, %edx
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	xor	FLAGS, FLAGS
+#endif
+	cmp	%edi, %ecx
+	je	L(ashr_0)	/* same offset: no byte shifting needed */
+	ja	L(bigger)
+	orl	$0x20, FLAGS	/* record that the operands are exchanged below */
+	xchg	%edx, %eax	/* swap so that ecx > edi always holds at L(bigger) */
+	xchg	%ecx, %edi
+L(bigger):
+	lea	15(%edi), %edi
+	sub	%ecx, %edi	/* edi = 15 + edi - ecx, in 0..14; value k selects ashr_(k+1) */
+	cmp	$8, %edi
+	jle	L(ashr_less_8)
+	cmp	$14, %edi
+	je	L(ashr_15)
+	cmp	$13, %edi
+	je	L(ashr_14)
+	cmp	$12, %edi
+	je	L(ashr_13)
+	cmp	$11, %edi
+	je	L(ashr_12)
+	cmp	$10, %edi
+	je	L(ashr_11)
+	cmp	$9, %edi
+	je	L(ashr_10)
+L(ashr_less_8):
+	je	L(ashr_9)	/* ZF still comes from cmp $8 when reached through the jle */
+	cmp	$7, %edi
+	je	L(ashr_8)
+	cmp	$6, %edi
+	je	L(ashr_7)
+	cmp	$5, %edi
+	je	L(ashr_6)
+	cmp	$4, %edi
+	je	L(ashr_5)
+	cmp	$3, %edi
+	je	L(ashr_4)
+	cmp	$2, %edi
+	je	L(ashr_3)
+	cmp	$1, %edi
+	je	L(ashr_2)
+	cmp	$0, %edi
+	je	L(ashr_1)	/* every value 0..14 has been matched by this point */
+
+/*
+ * The following cases will be handled by ashr_0
+ *  ecx(offset of eax)  edi(offset of edx)  relative offset  corresponding case
+ *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
+ */
+	.p2align 4
+L(ashr_0):	/* both strings share the same offset inside their 16-byte blocks */
+	mov	$0xffff, %esi
+	movdqa	(%eax), %xmm1
+	pxor	%xmm0, %xmm0
+	pcmpeqb	%xmm1, %xmm0	/* xmm0 = 0xff where the %eax block has a NUL */
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movdqa	(%edx), %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm2, %xmm1
+#else
+	pcmpeqb	(%edx), %xmm1	/* xmm1 = 0xff where the two blocks agree */
+#endif
+	psubb	%xmm0, %xmm1	/* clear the "equal" marker on NUL positions */
+	pmovmskb %xmm1, %edi
+	shr	%cl, %esi	/* discard the ecx bytes that precede the string start */
+	shr	%cl, %edi
+	sub	%edi, %esi	/* nonzero: mismatch or NUL inside the first block */
+	mov	%ecx, %edi	/* mov leaves the flags of the sub intact for the jne */
+	jne	L(less32bytes)
+	UPDATE_STRNCMP_COUNTER
+	movl	$0x10, FLAGS	/* read back by exit code outside this chunk */
+	mov	$0x10, %ecx	/* ecx = byte index of the next block */
+	pxor	%xmm0, %xmm0
+	.p2align 4
+L(loop_ashr_0):	/* compare one aligned 16-byte block per iteration */
+	movdqa	(%eax, %ecx), %xmm1
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movdqa	(%edx, %ecx), %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+#else
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	(%edx, %ecx), %xmm1
+#endif
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi	/* zero only if all 16 bytes are equal and non-NUL */
+	jnz	L(exit)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM	/* lea leaves the flags of the cmp intact for the jbe */
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	jmp	L(loop_ashr_0)
+
+/*
+ * The following cases will be handled by ashr_1
+ * ecx(offset of eax)  edi(offset of edx)   relative offset	corresponding case
+ *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
+ */
+	.p2align 4
+L(ashr_1):	/* the %eax offset exceeds the %edx offset by 15 */
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0	/* xmm0 = 0xff where the %eax block has a NUL */
+	pslldq	$15, %xmm2	/* move the %edx bytes up 15 places to line up with %eax */
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2	/* clear the "equal" marker on NUL positions */
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi	/* discard the ecx bytes that precede the string start */
+	shr	%cl, %edi
+	sub	%edi, %esi	/* nonzero: mismatch or NUL inside the first block */
+	lea	-15(%ecx), %edi	/* lea leaves the flags of the sub intact for the jnz */
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3	/* xmm3 = previous %edx block, low input of palignr */
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx	/* ecx = byte index of the next block */
+	orl	$1, FLAGS	/* store the shift count; keeps the 0x20 swap bit */
+	lea	1(%edx), %edi	/* edi = ((edx + 1) & 0xfff) - 0x1000, negative */
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_1):
+	add	$16, %edi	/* positive once (%edx,%ecx) is on the next 4096-byte page */
+	jg	L(nibble_ashr_1)
+
+L(gobble_ashr_1):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4	/* keep the unshifted block for the next round */
+
+	palignr	$1, %xmm3, %xmm2	/* xmm2 = bytes 1..16 of the pair xmm2:xmm3 */
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi	/* zero only if all 16 bytes are equal and non-NUL */
+	jnz	L(exit)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM	/* lea leaves the flags of the cmp intact for the jbe */
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi	/* second, unrolled copy of the loop body */
+	jg	L(nibble_ashr_1)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$1, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_1)
+
+	.p2align 4
+L(nibble_ashr_1):	/* the next %edx block is on a new page: check xmm3 before reading it */
+	pcmpeqb	%xmm3, %xmm0	/* xmm0 is zero here, so this marks NUL bytes in xmm3 */
+	pmovmskb %xmm0, %esi
+	test	$0xfffe, %esi	/* NUL in bytes 1..15 of xmm3? */
+	jnz	L(ashr_1_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$15, REM
+	jbe	L(ashr_1_exittail)	/* the limit ends within those 15 bytes */
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi	/* re-arm the page counter for the new page */
+	jmp	L(gobble_ashr_1)
+
+	.p2align 4
+L(ashr_1_exittail):	/* finish using only data that is already loaded */
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$1, %xmm0	/* drop byte 0 so bytes 1..15 line up with xmm1 */
+	psrldq	$1, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_2
+ * ecx(offset of eax)  edi(offset of edx)   relative offset	corresponding case
+ *        n(14~15)            n -14            1(15 +(n-14) - n)         ashr_2
+ */
+	.p2align 4
+L(ashr_2):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$14, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-14(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$2, FLAGS
+	lea	2(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_2):
+	add	$16, %edi
+	jg	L(nibble_ashr_2)
+
+L(gobble_ashr_2):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$2, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_2)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$2, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_2)
+
+	.p2align 4
+L(nibble_ashr_2):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfffc, %esi
+	jnz	L(ashr_2_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$14, REM
+	jbe	L(ashr_2_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_2)
+
+	.p2align 4
+L(ashr_2_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$2, %xmm0
+	psrldq	$2, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_3
+ * ecx(offset of eax)  edi(offset of edx)   relative offset	corresponding case
+ *        n(13~15)            n -13            2(15 +(n-13) - n)         ashr_3
+ */
+	.p2align 4
+L(ashr_3):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$13, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-13(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$3, FLAGS
+	lea	3(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_3):
+	add	$16, %edi
+	jg	L(nibble_ashr_3)
+
+L(gobble_ashr_3):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$3, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_3)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$3, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_3)
+
+	.p2align 4
+L(nibble_ashr_3):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfff8, %esi
+	jnz	L(ashr_3_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$13, REM
+	jbe	L(ashr_3_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_3)
+
+	.p2align 4
+L(ashr_3_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$3, %xmm0
+	psrldq	$3, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_4
+ * ecx(offset of eax)  edi(offset of edx)   relative offset	corresponding case
+ *        n(12~15)            n -12            3(15 +(n-12) - n)         ashr_4
+ */
+	.p2align 4
+L(ashr_4):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$12, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-12(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$4, FLAGS
+	lea	4(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_4):
+	add	$16, %edi
+	jg	L(nibble_ashr_4)
+
+L(gobble_ashr_4):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$4, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_4)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$4, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_4)
+
+	.p2align 4
+L(nibble_ashr_4):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfff0, %esi
+	jnz	L(ashr_4_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$12, REM
+	jbe	L(ashr_4_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_4)
+
+	.p2align 4
+L(ashr_4_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$4, %xmm0
+	psrldq	$4, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_5
+ * ecx(offset of eax)  edi(offset of edx)   relative offset	corresponding case
+ *        n(11~15)            n -11            4(15 +(n-11) - n)         ashr_5
+ */
+	.p2align 4
+L(ashr_5):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$11, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-11(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$5, FLAGS
+	lea	5(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_5):
+	add	$16, %edi
+	jg	L(nibble_ashr_5)
+
+L(gobble_ashr_5):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$5, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_5)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$5, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_5)
+
+	.p2align 4
+L(nibble_ashr_5):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xffe0, %esi
+	jnz	L(ashr_5_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$11, REM
+	jbe	L(ashr_5_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_5)
+
+	.p2align 4
+L(ashr_5_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$5, %xmm0
+	psrldq	$5, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_6
+ * ecx(offset of eax)  edi(offset of edx)   relative offset	corresponding case
+ *        n(10~15)            n -10            5(15 +(n-10) - n)         ashr_6
+ */
+
+	.p2align 4
+L(ashr_6):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$10, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-10(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$6, FLAGS
+	lea	6(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_6):
+	add	$16, %edi
+	jg	L(nibble_ashr_6)
+
+L(gobble_ashr_6):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$6, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_6)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$6, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_6)
+
+	.p2align 4
+L(nibble_ashr_6):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xffc0, %esi
+	jnz	L(ashr_6_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$10, REM
+	jbe	L(ashr_6_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_6)
+
+	.p2align 4
+L(ashr_6_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$6, %xmm0
+	psrldq	$6, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_7
+ * ecx(offset of eax)  edi(offset of edx)   relative offset	corresponding case
+ *        n(9~15)            n - 9            6(15 +(n-9) - n)         ashr_7
+ */
+
+	.p2align 4
+L(ashr_7):	/* the %eax offset exceeds the %edx offset by 9 */
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0	/* xmm0 = 0xff where the %eax block has a NUL */
+	pslldq	$9, %xmm2	/* move the %edx bytes up 9 places to line up with %eax */
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2	/* clear the "equal" marker on NUL positions */
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi	/* discard the ecx bytes that precede the string start */
+	shr	%cl, %edi
+	sub	%edi, %esi	/* nonzero: mismatch or NUL inside the first block */
+	lea	-9(%ecx), %edi	/* lea leaves the flags of the sub intact for the jnz */
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3	/* xmm3 = previous %edx block, low input of palignr */
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx	/* ecx = byte index of the next block */
+	orl	$7, FLAGS	/* store the shift count; keeps the 0x20 swap bit */
+	lea	7(%edx), %edi	/* bias = shift count like the other ashr_N; %edx is 16-aligned, so any bias 1..15 trips on the same block */
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_7):
+	add	$16, %edi
+	jg	L(nibble_ashr_7)
+
+L(gobble_ashr_7):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$7, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_7)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$7, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_7)
+
+	.p2align 4
+L(nibble_ashr_7):	/* the next %edx block is on a new page: check xmm3 before reading it */
+	pcmpeqb	%xmm3, %xmm0	/* xmm0 is zero here, so this marks NUL bytes in xmm3 */
+	pmovmskb %xmm0, %esi
+	test	$0xff80, %esi	/* NUL in bytes 7..15 of xmm3? */
+	jnz	L(ashr_7_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$9, REM
+	jbe	L(ashr_7_exittail)	/* the limit ends within those 9 bytes */
+#endif
+	/* Clear the NUL mask again before resuming the main loop.  */
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi	/* re-arm the page counter for the new page */
+	jmp	L(gobble_ashr_7)
+
+	.p2align 4
+L(ashr_7_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$7, %xmm0
+	psrldq	$7, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_8
+ * ecx(offset of eax)  edi(offset of edx)   relative offset	corresponding case
+ *        n(8~15)            n - 8            7(15 +(n-8) - n)         ashr_8
+ */
+	.p2align 4
+L(ashr_8):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$8, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-8(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$8, FLAGS
+	lea	8(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_8):
+	add	$16, %edi
+	jg	L(nibble_ashr_8)
+
+L(gobble_ashr_8):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$8, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_8)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$8, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_8)
+
+	.p2align 4
+L(nibble_ashr_8):	/* the next %edx block is on a new page: check xmm3 before reading it */
+	pcmpeqb	%xmm3, %xmm0	/* xmm0 is zero here, so this marks NUL bytes in xmm3 */
+	pmovmskb %xmm0, %esi
+	test	$0xff00, %esi	/* NUL in bytes 8..15 of xmm3? */
+	jnz	L(ashr_8_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$8, REM
+	jbe	L(ashr_8_exittail)	/* the limit ends within those 8 bytes */
+#endif
+	/* Clear the NUL mask again before resuming the main loop.  */
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi	/* re-arm the page counter for the new page */
+	jmp	L(gobble_ashr_8)
+
+	.p2align 4
+L(ashr_8_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$8, %xmm0
+	psrldq	$8, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_9
+ * ecx(offset of eax)  edi(offset of edx)   relative offset	corresponding case
+ *        n(7~15)            n - 7            8(15 +(n-7) - n)         ashr_9
+ */
+	.p2align 4
+L(ashr_9):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$7, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-7(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$9, FLAGS
+	lea	9(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_9):
+	add	$16, %edi
+	jg	L(nibble_ashr_9)
+
+L(gobble_ashr_9):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$9, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_9)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$9, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_9)
+
+	.p2align 4
+L(nibble_ashr_9):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfe00, %esi
+	jnz	L(ashr_9_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	jbe	L(ashr_9_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_9)
+
+	.p2align 4
+L(ashr_9_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$9, %xmm0
+	psrldq	$9, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_10
+ * ecx(offset of eax)  edi(offset of edx)   relative offset	corresponding case
+ *        n(6~15)            n - 6            9(15 +(n-6) - n)         ashr_10
+ */
+	.p2align 4
+L(ashr_10):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$6, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-6(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$10, FLAGS
+	lea	10(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_10):
+	add	$16, %edi
+	jg	L(nibble_ashr_10)
+
+L(gobble_ashr_10):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$10, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_10)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$10, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_10)
+
+	.p2align 4
+L(nibble_ashr_10):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfc00, %esi
+	jnz	L(ashr_10_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$6, REM
+	jbe	L(ashr_10_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_10)
+
+	.p2align 4
+L(ashr_10_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$10, %xmm0
+	psrldq	$10, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_11
+ * ecx(offset of eax)  edi(offset of edx)   relative offset	corresponding case
+ *        n(5~15)            n - 5            10(15 +(n-5) - n)         ashr_11
+ */
+	.p2align 4
+L(ashr_11):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$5, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-5(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$11, FLAGS
+	lea	11(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_11):
+	add	$16, %edi
+	jg	L(nibble_ashr_11)
+
+L(gobble_ashr_11):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$11, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_11)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$11, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_11)
+
+	.p2align 4
+L(nibble_ashr_11):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xf800, %esi
+	jnz	L(ashr_11_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$5, REM
+	jbe	L(ashr_11_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_11)
+
+	.p2align 4
+L(ashr_11_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$11, %xmm0
+	psrldq	$11, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_12
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(4~15)            n - 4            11(15 +(n-4) - n)         ashr_12
+ */
+	.p2align 4
+L(ashr_12):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$4, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-4(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$12, FLAGS
+	lea	12(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_12):
+	add	$16, %edi
+	jg	L(nibble_ashr_12)
+
+L(gobble_ashr_12):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$12, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_12)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$12, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_12)
+
+	.p2align 4
+L(nibble_ashr_12):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xf000, %esi
+	jnz	L(ashr_12_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	jbe	L(ashr_12_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_12)
+
+	.p2align 4
+L(ashr_12_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$12, %xmm0
+	psrldq	$12, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_13
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(3~15)            n - 3            12(15 +(n-3) - n)         ashr_13
+ */
+	.p2align 4
+L(ashr_13):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-3(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$13, FLAGS
+	lea	13(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_13):
+	add	$16, %edi
+	jg	L(nibble_ashr_13)
+
+L(gobble_ashr_13):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$13, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_13)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$13, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_13)
+
+	.p2align 4
+L(nibble_ashr_13):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xe000, %esi
+	jnz	L(ashr_13_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$3, REM
+	jbe	L(ashr_13_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_13)
+
+	.p2align 4
+L(ashr_13_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$13, %xmm0
+	psrldq	$13, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_14
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(2~15)            n - 2            13(15 +(n-2) - n)         ashr_14
+ */
+	.p2align 4
+L(ashr_14):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$2, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-2(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$14, FLAGS
+	lea	14(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_14):
+	add	$16, %edi
+	jg	L(nibble_ashr_14)
+
+L(gobble_ashr_14):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$14, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_14)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$14, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_14)
+
+	.p2align 4
+L(nibble_ashr_14):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xc000, %esi
+	jnz	L(ashr_14_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$2, REM
+	jbe	L(ashr_14_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_14)
+
+	.p2align 4
+L(ashr_14_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$14, %xmm0
+	psrldq	$14, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_15
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(1~15)            n - 1            14(15 +(n-1) - n)         ashr_15
+ */
+
+	.p2align 4
+L(ashr_15):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$1, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-1(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	orl	$15, FLAGS
+	lea	15(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_15):
+	add	$16, %edi
+	jg	L(nibble_ashr_15)
+
+L(gobble_ashr_15):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$15, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_15)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$15, %xmm3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$16, REM
+	lea	-16(REM), REM
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_15)
+
+	.p2align 4
+L(nibble_ashr_15):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0x8000, %esi
+	jnz	L(ashr_15_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$1, REM
+	jbe	L(ashr_15_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_15)
+
+	.p2align 4
+L(ashr_15_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$15, %xmm0
+	psrldq	$15, %xmm3
+	jmp	L(aftertail)
+
+	.p2align 4
+L(aftertail):
+	TOLOWER (%xmm1, %xmm3)
+	pcmpeqb	%xmm3, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	not	%esi
+L(exit):
+	mov	FLAGS, %edi
+	and	$0x1f, %edi
+	lea	-16(%edi, %ecx), %edi
+L(less32bytes):
+	add	%edi, %edx
+	add	%ecx, %eax
+	testl	$0x20, FLAGS
+	jz	L(ret2)
+	xchg	%eax, %edx
+
+	.p2align 4
+L(ret2):
+	mov	%esi, %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+#endif
+	POP	(%esi)
+	POP	(%edi)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	POP	(FLAGS)
+#endif
+L(less16bytes):
+	test	%cl, %cl
+	jz	L(2next_8_bytes)
+
+	test	$0x01, %cl
+	jnz	L(Byte0)
+
+	test	$0x02, %cl
+	jnz	L(Byte1)
+
+	test	$0x04, %cl
+	jnz	L(Byte2)
+
+	test	$0x08, %cl
+	jnz	L(Byte3)
+
+	test	$0x10, %cl
+	jnz	L(Byte4)
+
+	test	$0x20, %cl
+	jnz	L(Byte5)
+
+	test	$0x40, %cl
+	jnz	L(Byte6)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	jbe	L(eq)
+#endif
+
+	movzx	7(%eax), %ecx
+	movzx	7(%edx), %eax
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte0):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$0, REM
+	jbe	L(eq)
+#endif
+	movzx	(%eax), %ecx
+	movzx	(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte1):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$1, REM
+	jbe	L(eq)
+#endif
+	movzx	1(%eax), %ecx
+	movzx	1(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte2):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$2, REM
+	jbe	L(eq)
+#endif
+	movzx	2(%eax), %ecx
+	movzx	2(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte3):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$3, REM
+	jbe	L(eq)
+#endif
+	movzx	3(%eax), %ecx
+	movzx	3(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte4):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
+	jbe	L(eq)
+#endif
+	movzx	4(%eax), %ecx
+	movzx	4(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte5):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$5, REM
+	jbe	L(eq)
+#endif
+	movzx	5(%eax), %ecx
+	movzx	5(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte6):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$6, REM
+	jbe	L(eq)
+#endif
+	movzx	6(%eax), %ecx
+	movzx	6(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+L(2next_8_bytes):
+	add	$8, %eax
+	add	$8, %edx
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$8, REM
+	lea	-8(REM), REM
+	jbe	L(eq)
+#endif
+
+	test	$0x01, %ch
+	jnz	L(Byte0)
+
+	test	$0x02, %ch
+	jnz	L(Byte1)
+
+	test	$0x04, %ch
+	jnz	L(Byte2)
+
+	test	$0x08, %ch
+	jnz	L(Byte3)
+
+	test	$0x10, %ch
+	jnz	L(Byte4)
+
+	test	$0x20, %ch
+	jnz	L(Byte5)
+
+	test	$0x40, %ch
+	jnz	L(Byte6)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
+	jbe	L(eq)
+#endif
+	movzx	7(%eax), %ecx
+	movzx	7(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+	sub	%ecx, %eax
+	RETURN
+
+#ifdef USE_AS_STRNCMP
+L(neq_sncmp):
+#endif
+L(neq):
+	mov	$1, %eax
+	ja	L(neq_bigger)
+	neg	%eax
+L(neq_bigger):
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+#endif
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	POP	(REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	POP	(%ebx)
+# endif
+#endif
+	ret
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	.p2align 4
+	cfi_restore_state
+L(more8byteseq):
+
+# ifdef USE_AS_STRNCASECMP_L
+	addl	$4, %esp
+	cfi_adjust_cfa_offset (-4)
+# endif
+	POP	(%esi)
+	POP	(%edi)
+# ifdef USE_AS_STRNCMP
+	POP	(FLAGS)
+# endif
+#endif
+
+#ifdef USE_AS_STRNCMP
+L(eq_sncmp):
+#endif
+L(eq):
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	POP	(REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	POP	(%ebx)
+# endif
+#endif
+	xorl	%eax, %eax
+	ret
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	.p2align 4
+# if defined USE_AS_STRNCASECMP_L && defined PIC
+	CFI_PUSH (%ebx)
+# endif
+	CFI_PUSH (REM)
+L(less16bytes_sncmp):
+# ifdef USE_AS_STRNCASECMP_L
+	PUSH	(%esi)
+# endif
+	test	REM, REM
+	jz	L(eq_sncmp)
+
+	movzbl	(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, (%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$1, REM
+	je	L(eq_sncmp)
+
+	movzbl	1(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	1(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 1(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$2, REM
+	je	L(eq_sncmp)
+
+	movzbl	2(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	2(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 2(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$3, REM
+	je	L(eq_sncmp)
+
+	movzbl	3(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	3(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 3(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$4, REM
+	je	L(eq_sncmp)
+
+	movzbl	4(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	4(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 4(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$5, REM
+	je	L(eq_sncmp)
+
+	movzbl	5(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	5(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 5(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$6, REM
+	je	L(eq_sncmp)
+
+	movzbl	6(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	6(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 6(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$7, REM
+	je	L(eq_sncmp)
+
+	movzbl	7(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	7(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 7(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+
+	cmp	$8, REM
+	je	L(eq_sncmp)
+
+	movzbl	8(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	8(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 8(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$9, REM
+	je	L(eq_sncmp)
+
+	movzbl	9(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	9(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 9(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$10, REM
+	je	L(eq_sncmp)
+
+	movzbl	10(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	10(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 10(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$11, REM
+	je	L(eq_sncmp)
+
+	movzbl	11(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	11(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 11(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+
+	cmp	$12, REM
+	je	L(eq_sncmp)
+
+	movzbl	12(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	12(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 12(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$13, REM
+	je	L(eq_sncmp)
+
+	movzbl	13(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	13(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 13(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$14, REM
+	je	L(eq_sncmp)
+
+	movzbl	14(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	14(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 14(%edx)
+# endif
+	jne	L(neq_sncmp)
+	test	%cl, %cl
+	je	L(eq_sncmp)
+
+	cmp	$15, REM
+	je	L(eq_sncmp)
+
+	movzbl	15(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+	movzbl	15(%edx), %esi
+#  ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+#  else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+#  endif
+	cmpl	%ecx, %esi
+# else
+	cmpb	%cl, 15(%edx)
+# endif
+	jne	L(neq_sncmp)
+
+# ifdef USE_AS_STRNCASECMP_L
+L(eq_sncmp):
+	POP	(%esi)
+# endif
+	POP	(REM)
+# if defined USE_AS_STRNCASECMP_L && defined PIC
+	POP	(%ebx)
+# endif
+	xor	%eax, %eax
+	ret
+
+# ifdef USE_AS_STRNCASECMP_L
+	.p2align 4
+#  ifdef PIC
+	CFI_PUSH (%ebx)
+#  endif
+	CFI_PUSH (REM)
+	CFI_PUSH (%esi)
+L(neq_sncmp):
+	mov	$1, %eax
+	mov	$-1, %edx
+	cmovna	%edx, %eax
+	POP	(%esi)
+	POP	(REM)
+#  ifdef PIC
+	POP	(%ebx)
+#  endif
+	ret
+# endif
+#endif
+
+END (STRCMP)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S
new file mode 100644
index 0000000000..56de25a4b7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S
@@ -0,0 +1,95 @@
+/* Multiple versions of strcmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifdef USE_AS_STRNCMP
+# define STRCMP			strncmp
+# define __GI_STRCMP		__GI_strncmp
+# define __STRCMP_IA32		__strncmp_ia32
+# define __STRCMP_SSSE3		__strncmp_ssse3
+# define __STRCMP_SSE4_2	__strncmp_sse4_2
+#elif defined USE_AS_STRCASECMP_L
+# define STRCMP			__strcasecmp_l
+# define __GI_STRCMP		__GI_strcasecmp_l
+# define __STRCMP_IA32		__strcasecmp_l_ia32
+# define __STRCMP_SSSE3		__strcasecmp_l_ssse3
+# define __STRCMP_SSE4_2	__strcasecmp_l_sse4_2
+#elif defined USE_AS_STRNCASECMP_L
+# define STRCMP			__strncasecmp_l
+# define __GI_STRCMP		__GI_strncasecmp_l
+# define __STRCMP_IA32		__strncasecmp_l_ia32
+# define __STRCMP_SSSE3		__strncasecmp_l_ssse3
+# define __STRCMP_SSE4_2	__strncasecmp_l_sse4_2
+#else
+# define STRCMP			strcmp
+# define __GI_STRCMP		__GI_strcmp
+# define __STRCMP_IA32		__strcmp_ia32
+# define __STRCMP_SSSE3		__strcmp_ssse3
+# define __STRCMP_SSE4_2	__strcmp_sse4_2
+#endif
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncmp in the static library since we
+   need strncmp before initialization has happened.  */
+#if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc)
+	.text
+ENTRY(STRCMP)	/* IFUNC selector; macro semantics assumed from names - confirm in init-arch.h.  */
+	.type	STRCMP, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__STRCMP_IA32)	/* Fallback choice.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* Presumably no SSSE3: keep the IA32 version.  */
+	LOAD_FUNC_GOT_EAX (__STRCMP_SSSE3)	/* Tentatively pick SSSE3.  */
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f	/* Presumably no SSE4.2: keep the SSSE3 version.  */
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
+	jnz	2f	/* Presumably Slow_SSE4_2 is set: keep the SSSE3 version.  */
+	LOAD_FUNC_GOT_EAX (__STRCMP_SSE4_2)	/* Otherwise pick SSE4.2.  */
+2:	ret
+END(STRCMP)
+
+# undef ENTRY	/* From here on ENTRY ignores NAME and always defines __STRCMP_IA32.  */
+# define ENTRY(name) \
+	.type __STRCMP_IA32, @function; \
+	.p2align 4; \
+	.globl __STRCMP_IA32; \
+	.hidden __STRCMP_IA32; \
+	__STRCMP_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Likewise END always closes __STRCMP_IA32, whatever NAME is.  */
+# define END(name) \
+	cfi_endproc; .size __STRCMP_IA32, .-__STRCMP_IA32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCMP; __GI_STRCMP = __STRCMP_IA32
+# endif
+#endif
+
+#if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L \
+    && !defined USE_AS_STRNCASECMP_L
+# include "../strcmp.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S
new file mode 100644
index 0000000000..ed627a5f62
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S
@@ -0,0 +1,2250 @@
+/* strcpy with SSE2 and unaligned load
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+
+# define CFI_PUSH(REG)                  \
+	cfi_adjust_cfa_offset (4);     \
+	cfi_rel_offset (REG, 0)	/* Unwind info for a 4-byte push of REG.  */
+
+# define CFI_POP(REG)                   \
+	cfi_adjust_cfa_offset (-4);    \
+	cfi_restore (REG)	/* Unwind info for the matching 4-byte pop.  */
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)	/* Push plus its CFI annotation.  */
+# define POP(REG) popl REG; CFI_POP (REG)	/* Pop plus its CFI annotation.  */
+
+# ifndef STRCPY
+#  define STRCPY  __strcpy_sse2
+# endif
+
+# define STR1  PARMS	/* Stack offset of the first argument (loaded into the store pointer).  */
+# define STR2  STR1+4	/* Stack offset of the second argument (loaded into the load pointer).  */
+# define LEN  STR2+4	/* Stack offset of the third argument (byte limit).  */
+
+# ifdef USE_AS_STRNCPY
+#  define PARMS  16	/* Return address plus the three registers pushed by ENTRANCE.  */
+#  define ENTRANCE PUSH(%ebx); PUSH(%esi); PUSH(%edi)
+#  define RETURN  POP(%edi); POP(%esi); POP(%ebx); ret;          \
+	CFI_PUSH(%ebx); CFI_PUSH(%esi); CFI_PUSH(%edi);	/* Re-establish CFI state for the code that follows the ret.  */
+
+# ifdef SHARED
+#  define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into ECX and branch to it.  TABLE is a
+	jump table with relative offsets.
+	INDEX is a register that contains the index into the jump table.
+	SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)            \
+	/* We first load PC into ECX.  */                       \
+	SETUP_PIC_REG(cx);                                      \
+	/* Get the address of the jump table.  */               \
+	addl	$(TABLE - .), %ecx;                             \
+	/* Get the entry and convert the relative offset to the \
+	absolute	address.  */                            \
+	addl	(%ecx,INDEX,SCALE), %ecx;                       \
+	/* We loaded the jump table and adjusted ECX. Go.  */  \
+	jmp	*%ecx
+# else
+#  define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+	absolute offsets.  INDEX is a register that contains the index into
+	the jump table.  SCALE is the scale of INDEX.  */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+	jmp	*TABLE(,INDEX,SCALE)
+# endif
+
+.text
+ENTRY (STRCPY)
+	ENTRANCE
+	mov	STR1(%esp), %edi	/* EDI = destination pointer (all stores go through it).  */
+	mov	STR2(%esp), %esi	/* ESI = source pointer (all loads go through it).  */
+	movl	LEN(%esp), %ebx	/* EBX = byte limit.  */
+	test	%ebx, %ebx
+	jz	L(ExitZero)	/* Limit of zero: nothing to copy.  */
+
+	mov	%esi, %ecx
+# ifndef USE_AS_STPCPY
+	mov	%edi, %eax      /* save result */
+# endif
+	and	$15, %ecx	/* ECX = offset of the source inside its 16-byte block.  */
+	jz	L(SourceStringAlignmentZero)
+
+	and	$-16, %esi	/* Round ESI down so the memory operands below are aligned.  */
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	(%esi), %xmm1	/* 0xff in every byte lane that holds NUL.  */
+	add	%ecx, %ebx	/* Limit is now counted from the rounded-down ESI.  */
+	pmovmskb %xmm1, %edx
+	shr	%cl, %edx	/* Drop the lanes that precede the real start of the source.  */
+# ifdef USE_AS_STPCPY
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+# else
+	cmp	$17, %ebx
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)	/* Limit is small enough to finish from the table-driven tails.  */
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail)	/* NUL found in the first block.  */
+
+	pcmpeqb	16(%esi), %xmm0	/* Same NUL test on the next aligned block.  */
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+# else
+	cmp	$33, %ebx
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes)	/* NUL found in the second block.  */
+
+	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
+	movdqu	%xmm1, (%edi)
+
+	sub	%ecx, %edi	/* Bias EDI so one offset addresses matching bytes via ESI and EDI.  */
+
+/* Main copy path: aligned 16-byte loads from the source, unaligned stores to the destination */
+	.p2align 4
+L(Unalign16Both):
+	mov	$16, %ecx	/* ECX = running offset applied to both ESI and EDI.  */
+	movdqa	(%esi, %ecx), %xmm1	/* Block already tested for NUL on both paths that reach this label.  */
+	movaps	16(%esi, %ecx), %xmm2	/* Next block: tested below before it is stored.  */
+	movdqu	%xmm1, (%edi, %ecx)
+	pcmpeqb	%xmm2, %xmm0	/* XMM0 is all-zero here, so this flags NUL bytes in XMM2.  */
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$48, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)	/* Limit reached.  */
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)	/* NUL is in XMM2.  */
+
+	movaps	16(%esi, %ecx), %xmm3	/* Same step again: load next, store previous, test next.  */
+	movdqu	%xmm2, (%edi, %ecx)
+	pcmpeqb	%xmm3, %xmm0	/* XMM0 is still zero because the previous mask was zero.  */
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+
+	movaps	16(%esi, %ecx), %xmm4
+	movdqu	%xmm3, (%edi, %ecx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+
+	movaps	16(%esi, %ecx), %xmm1
+	movdqu	%xmm4, (%edi, %ecx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm1)
+
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%edi, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%edi, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+
+	movdqu	%xmm3, (%edi, %ecx)
+	mov	%esi, %edx
+	lea	16(%esi, %ecx), %esi
+	and	$-0x40, %esi	/* Round the source position down to a 64-byte boundary.  */
+	sub	%esi, %edx	/* EDX = old ESI - new ESI.  */
+	sub	%edx, %edi	/* Move EDI by the same distance ESI moved.  */
+	lea	128(%ebx, %edx), %ebx	/* Rebase the limit counter for the 64-byte loop.  */
+
+L(Unaligned64Loop):	/* Examine 64 source bytes per pass using aligned loads.  */
+	movaps	(%esi), %xmm2
+	movaps	%xmm2, %xmm4	/* Keep an intact copy; XMM2 is overwritten by pminub.  */
+	movaps	16(%esi), %xmm5
+	movaps	32(%esi), %xmm3
+	movaps	%xmm3, %xmm6	/* Keep an intact copy; XMM3 is overwritten by pminub.  */
+	movaps	48(%esi), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3	/* XMM3 = byte-wise minimum of all four blocks.  */
+	pcmpeqb	%xmm0, %xmm3	/* A zero minimum lane means a NUL somewhere in the 64 bytes.  */
+	pmovmskb %xmm3, %edx
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)	/* Limit reached within these 64 bytes.  */
+	test	%edx, %edx
+	jnz	L(Unaligned64Leave)
+L(Unaligned64Loop_start):
+	add	$64, %edi
+	add	$64, %esi
+	movdqu	%xmm4, -64(%edi)	/* Store the four blocks verified on the previous pass...  */
+	movaps	(%esi), %xmm2	/* ...interleaved with loading the next four.  */
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%edi)
+	movaps	16(%esi), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%esi), %xmm3
+	movdqu	%xmm6, -32(%edi)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%edi)
+	movaps	48(%esi), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+	test	%edx, %edx
+	jz	L(Unaligned64Loop_start)
+L(Unaligned64Leave):	/* A NUL is in XMM4..XMM7: find which block holds the first one.  */
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	%xmm4, %xmm0	/* XMM0 is still zero: the loop only ever used it as a source.  */
+	pcmpeqb	%xmm5, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_0)
+	test	%ecx, %ecx
+	jnz	L(CopyFrom1To16BytesUnaligned_16)
+
+	pcmpeqb	%xmm6, %xmm0	/* Both masks were zero, so XMM0 and XMM1 are all-zero again.  */
+	pcmpeqb	%xmm7, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_32)
+
+	bsf	%ecx, %edx	/* Only XMM7 is left: EDX = offset of the NUL inside it.  */
+	movdqu	%xmm4, (%edi)
+	movdqu	%xmm5, 16(%edi)
+	movdqu	%xmm6, 32(%edi)
+# ifdef USE_AS_STPCPY
+	lea	48(%edi, %edx), %eax
+# endif
+	movdqu	%xmm7, 48(%edi)
+	add	$15, %ebx
+	sub	%edx, %ebx	/* Adjust EBX to the count consumed by the fill code.  */
+	lea	49(%edi, %edx), %edi	/* EDI = one past the stored NUL.  */
+	jmp	L(StrncpyFillTailWithZero)
+
+/* Source address is already 16-byte aligned (source & 15 == 0) */
+
+L(SourceStringAlignmentZero):	/* Entered with ECX == 0 and ESI unmodified.  */
+	pxor	%xmm0, %xmm0
+	movdqa	(%esi), %xmm1
+	pcmpeqb	%xmm1, %xmm0	/* 0xff in every byte lane of the first block that holds NUL.  */
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+# else
+	cmp	$17, %ebx
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1)
+
+	pcmpeqb	16(%esi), %xmm0	/* XMM0 is zero here (mask was zero): test the second block.  */
+	movdqu	%xmm1, (%edi)	/* First block is NUL-free: store it.  */
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+# else
+	cmp	$33, %ebx
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes1)
+
+	jmp	L(Unalign16Both)	/* Join the common path; XMM0 is all-zero as it expects.  */
+
+/*-----------------End of main part---------------------------*/
+
+/* Case1 */
+	.p2align 4
+L(CopyFrom1To16BytesTail):	/* NUL in the first block of a misaligned source.  */
+	sub	%ecx, %ebx	/* Undo the limit bias added at entry.  */
+	add	%ecx, %esi	/* Restore the original source pointer.  */
+	bsf	%edx, %edx	/* EDX = offset of the first NUL.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1):	/* Aligned source, NUL in the second block: step past the first.  */
+	add	$16, %esi
+	add	$16, %edi
+	sub	$16, %ebx
+L(CopyFrom1To16BytesTail1):
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes):	/* Misaligned source, NUL in the second block.  */
+	sub	%ecx, %ebx
+	bsf	%edx, %edx
+	add	%ecx, %esi
+	add	$16, %edx
+	sub	%ecx, %edx	/* EDX = NUL offset measured from the original source.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_0):	/* First NUL of the 64-byte group is in XMM4.  */
+	bsf	%edx, %edx
+# ifdef USE_AS_STPCPY
+	lea	(%edi, %edx), %eax
+# endif
+	movdqu	%xmm4, (%edi)
+	add	$63, %ebx
+	sub	%edx, %ebx	/* Adjust EBX to the count consumed by the fill code.  */
+	lea	1(%edi, %edx), %edi	/* EDI = one past the stored NUL.  */
+	jmp	L(StrncpyFillTailWithZero)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_16):	/* First NUL is in XMM5; its mask is in ECX.  */
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	16(%edi, %edx), %eax
+# endif
+	movdqu	%xmm5, 16(%edi)
+	add	$47, %ebx
+	sub	%edx, %ebx
+	lea	17(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_32):	/* First NUL is in XMM6.  */
+	bsf	%edx, %edx
+	movdqu	%xmm4, (%edi)
+	movdqu	%xmm5, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	32(%edi, %edx), %eax
+# endif
+	movdqu	%xmm6, 32(%edi)
+	add	$31, %ebx
+	sub	%edx, %ebx
+	lea	33(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):	/* Store the block holding the NUL, then share the common exit.  */
+	movdqu	%xmm6, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+	movdqu	%xmm5, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+	movdqu	%xmm4, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+	movdqu	%xmm3, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+	movdqu	%xmm1, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesExit):	/* EDX = NUL offset; entry 0 of ExitTable is Exit1.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+/* Case2 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):	/* Both a NUL and the limit fall in the current block.  */
+	add	$16, %ebx
+	add	%ecx, %edi	/* Fold the running offset into both pointers.  */
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)	/* NUL offset below the remaining count: NUL comes first.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2):
+	sub	%ecx, %ebx	/* Undo the limit bias added at entry.  */
+	add	%ecx, %esi	/* Restore the original source pointer.  */
+	bsf	%edx, %edx
+	add	$16, %edx
+	sub	%ecx, %edx	/* EDX = NUL offset measured from the original source.  */
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+/* Case2 or Case3,  Case3 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):	/* Limit reached; EDX tells whether a NUL was also seen.  */
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):	/* No NUL: copy exactly the remaining EBX bytes.  */
+	add	$16, %ebx
+	add	%ecx, %edi
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32BytesCase2)
+	sub	%ecx, %ebx	/* Undo the limit bias added at entry.  */
+	add	%ecx, %esi	/* Restore the original source pointer.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTailCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):	/* Aligned source: the first 16 bytes are already stored.  */
+	add	$16, %edi
+	add	$16, %esi
+	sub	$16, %ebx
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1Case2)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(Exit0):	/* Entry 0 of ExitStrncpyTable: no bytes left to copy.  */
+# ifdef USE_AS_STPCPY
+	mov	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit1):	/* ExitN: copy N bytes ending in the NUL, then zero-fill what is left of the limit.  */
+	movb	%dh, (%edi)	/* DH is zero: EDX holds the table index, which is below 256.  */
+# ifdef USE_AS_STPCPY
+	lea	(%edi), %eax
+# endif
+	sub	$1, %ebx
+	lea	1(%edi), %edi	/* lea leaves the flags from sub intact for the jnz.  */
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	movw	(%esi), %dx
+	movw	%dx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	1(%edi), %eax
+# endif
+	sub	$2, %ebx
+	lea	2(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	movw	(%esi), %cx
+	movw	%cx, (%edi)
+	movb	%dh, 2(%edi)	/* Third byte is the NUL; DH is zero.  */
+# ifdef USE_AS_STPCPY
+	lea	2(%edi), %eax
+# endif
+	sub	$3, %ebx
+	lea	3(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	movl	(%esi), %edx
+	movl	%edx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	3(%edi), %eax
+# endif
+	sub	$4, %ebx
+	lea	4(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit5):
+	movl	(%esi), %ecx
+	movb	%dh, 4(%edi)
+	movl	%ecx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	4(%edi), %eax
+# endif
+	sub	$5, %ebx
+	lea	5(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	movl	(%esi), %ecx
+	movw	4(%esi), %dx
+	movl	%ecx, (%edi)
+	movw	%dx, 4(%edi)
+# ifdef USE_AS_STPCPY
+	lea	5(%edi), %eax
+# endif
+	sub	$6, %ebx
+	lea	6(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	movl	(%esi), %ecx
+	movl	3(%esi), %edx	/* Overlaps the first dword by one byte to cover bytes 3..6.  */
+	movl	%ecx, (%edi)
+	movl	%edx, 3(%edi)
+# ifdef USE_AS_STPCPY
+	lea	6(%edi), %eax
+# endif
+	sub	$7, %ebx
+	lea	7(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit8):
+	movlpd	(%esi), %xmm0
+	movlpd	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	7(%edi), %eax
+# endif
+	sub	$8, %ebx
+	lea	8(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit9):
+	movlpd	(%esi), %xmm0
+	movb	%dh, 8(%edi)
+	movlpd	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	8(%edi), %eax
+# endif
+	sub	$9, %ebx
+	lea	9(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	movlpd	(%esi), %xmm0
+	movw	8(%esi), %dx
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	9(%edi), %eax
+# endif
+	sub	$10, %ebx
+	lea	10(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	movlpd	(%esi), %xmm0
+	movl	7(%esi), %edx	/* Overlapping dword covers bytes 7..10.  */
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	10(%edi), %eax
+# endif
+	sub	$11, %ebx
+	lea	11(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	movlpd	(%esi), %xmm0
+	movl	8(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	11(%edi), %eax
+# endif
+	sub	$12, %ebx
+	lea	12(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit13):
+	movlpd	(%esi), %xmm0
+	movlpd	5(%esi), %xmm1	/* Overlapping qword covers bytes 5..12.  */
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 5(%edi)
+# ifdef USE_AS_STPCPY
+	lea	12(%edi), %eax
+# endif
+	sub	$13, %ebx
+	lea	13(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	movlpd	(%esi), %xmm0
+	movlpd	6(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 6(%edi)
+# ifdef USE_AS_STPCPY
+	lea	13(%edi), %eax
+# endif
+	sub	$14, %ebx
+	lea	14(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	movlpd	(%esi), %xmm0
+	movlpd	7(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	14(%edi), %eax
+# endif
+	sub	$15, %ebx
+	lea	15(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit16):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	15(%edi), %eax
+# endif
+	sub	$16, %ebx
+	lea	16(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit17):	/* Exit17..Exit32: same scheme, with a 16-byte head followed by a tail piece.  */
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+	movb	%dh, 16(%edi)	/* Byte 16 is the NUL; DH is zero because EDX is the table index.  */
+# ifdef USE_AS_STPCPY
+	lea	16(%edi), %eax
+# endif
+	sub	$17, %ebx
+	lea	17(%edi), %edi	/* lea leaves the flags from sub intact for the jnz.  */
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit18):
+	movdqu	(%esi), %xmm0
+	movw	16(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movw	%cx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	17(%edi), %eax
+# endif
+	sub	$18, %ebx
+	lea	18(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit19):
+	movdqu	(%esi), %xmm0
+	movl	15(%esi), %ecx	/* Overlapping dword covers bytes 15..18.  */
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	18(%edi), %eax
+# endif
+	sub	$19, %ebx
+	lea	19(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit20):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	19(%edi), %eax
+# endif
+	sub	$20, %ebx
+	lea	20(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit21):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+	movb	%dh, 20(%edi)
+# ifdef USE_AS_STPCPY
+	lea	20(%edi), %eax
+# endif
+	sub	$21, %ebx
+	lea	21(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit22):
+	movdqu	(%esi), %xmm0
+	movlpd	14(%esi), %xmm3	/* Overlapping qword covers bytes 14..21.  */
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	21(%edi), %eax
+# endif
+	sub	$22, %ebx
+	lea	22(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit23):
+	movdqu	(%esi), %xmm0
+	movlpd	15(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	22(%edi), %eax
+# endif
+	sub	$23, %ebx
+	lea	23(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit24):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	23(%edi), %eax
+# endif
+	sub	$24, %ebx
+	lea	24(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit25):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movb	%dh, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	24(%edi), %eax
+# endif
+	sub	$25, %ebx
+	lea	25(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit26):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movw	24(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movw	%cx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	25(%edi), %eax
+# endif
+	sub	$26, %ebx
+	lea	26(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit27):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	23(%esi), %ecx	/* Overlapping dword covers bytes 23..26.  */
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 23(%edi)
+# ifdef USE_AS_STPCPY
+	lea	26(%edi), %eax
+# endif
+	sub	$27, %ebx
+	lea	27(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit28):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	24(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	27(%edi), %eax
+# endif
+	sub	$28, %ebx
+	lea	28(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit29):
+	movdqu	(%esi), %xmm0
+	movdqu	13(%esi), %xmm2	/* Overlapping 16-byte load covers bytes 13..28.  */
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 13(%edi)
+# ifdef USE_AS_STPCPY
+	lea	28(%edi), %eax
+# endif
+	sub	$29, %ebx
+	lea	29(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit30):
+	movdqu	(%esi), %xmm0
+	movdqu	14(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	29(%edi), %eax
+# endif
+	sub	$30, %ebx
+	lea	30(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+
+	.p2align 4
+L(Exit31):
+	movdqu	(%esi), %xmm0
+	movdqu	15(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	30(%edi), %eax
+# endif
+	sub	$31, %ebx
+	lea	31(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit32):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	31(%edi), %eax
+# endif
+	sub	$32, %ebx
+	lea	32(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(StrncpyExit1):	/* StrncpyExitN: copy exactly N bytes and return; no NUL is added and no fill is done.  */
+	movb	(%esi), %dl
+	movb	%dl, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	1(%edi), %eax	/* EAX = one past the last byte copied.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit2):
+	movw	(%esi), %dx
+	movw	%dx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	2(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit3):
+	movw	(%esi), %cx
+	movb	2(%esi), %dl
+	movw	%cx, (%edi)
+	movb	%dl, 2(%edi)
+# ifdef USE_AS_STPCPY
+	lea	3(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit4):
+	movl	(%esi), %edx
+	movl	%edx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	4(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit5):
+	movl	(%esi), %ecx
+	movb	4(%esi), %dl
+	movl	%ecx, (%edi)
+	movb	%dl, 4(%edi)
+# ifdef USE_AS_STPCPY
+	lea	5(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit6):
+	movl	(%esi), %ecx
+	movw	4(%esi), %dx
+	movl	%ecx, (%edi)
+	movw	%dx, 4(%edi)
+# ifdef USE_AS_STPCPY
+	lea	6(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit7):
+	movl	(%esi), %ecx
+	movl	3(%esi), %edx	/* Overlaps the first dword by one byte to cover bytes 3..6.  */
+	movl	%ecx, (%edi)
+	movl	%edx, 3(%edi)
+# ifdef USE_AS_STPCPY
+	lea	7(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit8):
+	movlpd	(%esi), %xmm0
+	movlpd	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	8(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit9):
+	movlpd	(%esi), %xmm0
+	movb	8(%esi), %dl
+	movlpd	%xmm0, (%edi)
+	movb	%dl, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	9(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit10):
+	movlpd	(%esi), %xmm0
+	movw	8(%esi), %dx
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	10(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit11):
+	movlpd	(%esi), %xmm0
+	movl	7(%esi), %edx	/* Overlapping dword covers bytes 7..10.  */
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	11(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit12):
+	movlpd	(%esi), %xmm0
+	movl	8(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	12(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit13):
+	movlpd	(%esi), %xmm0
+	movlpd	5(%esi), %xmm1	/* Overlapping qword covers bytes 5..12.  */
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 5(%edi)
+# ifdef USE_AS_STPCPY
+	lea	13(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit14):
+	movlpd	(%esi), %xmm0
+	movlpd	6(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 6(%edi)
+# ifdef USE_AS_STPCPY
+	lea	14(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit15):
+	movlpd	(%esi), %xmm0
+	movlpd	7(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	15(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit16):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	16(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit17):	/* StrncpyExit17..33: copy exactly N bytes using a 16-byte head plus a tail piece.  */
+	movdqu	(%esi), %xmm0
+	movb	16(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movb	%cl, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	17(%edi), %eax	/* EAX = one past the last byte copied.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit18):
+	movdqu	(%esi), %xmm0
+	movw	16(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movw	%cx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	18(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit19):
+	movdqu	(%esi), %xmm0
+	movl	15(%esi), %ecx	/* Overlapping dword covers bytes 15..18.  */
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	19(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit20):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	20(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit21):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movb	20(%esi), %dl
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+	movb	%dl, 20(%edi)
+# ifdef USE_AS_STPCPY
+	lea	21(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit22):
+	movdqu	(%esi), %xmm0
+	movlpd	14(%esi), %xmm3	/* Overlapping qword covers bytes 14..21.  */
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	22(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit23):
+	movdqu	(%esi), %xmm0
+	movlpd	15(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	23(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit24):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	24(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit25):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movb	24(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movb	%cl, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	25(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit26):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movw	24(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movw	%cx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	26(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit27):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	23(%esi), %ecx	/* Overlapping dword covers bytes 23..26.  */
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 23(%edi)
+# ifdef USE_AS_STPCPY
+	lea	27(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit28):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	24(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	28(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit29):
+	movdqu	(%esi), %xmm0
+	movdqu	13(%esi), %xmm2	/* Overlapping 16-byte load covers bytes 13..28.  */
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 13(%edi)
+# ifdef USE_AS_STPCPY
+	lea	29(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit30):
+	movdqu	(%esi), %xmm0
+	movdqu	14(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	30(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit31):
+	movdqu	(%esi), %xmm0
+	movdqu	15(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	31(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit32):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	32(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit33):	/* No USE_AS_STPCPY result here; that build's entry thresholds are 16/32 rather than 17/33.  */
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movb	32(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+	movb	%cl, 32(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill0):	/* FillN: store the last N zero bytes at EDI; EDX and XMM0 were zeroed by the fill code.  */
+	RETURN
+
+	.p2align 4
+L(Fill1):
+	movb	%dl, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill2):
+	movw	%dx, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill3):
+	movl	%edx, -1(%edi)	/* Also rewrites the byte before EDI -- presumably already zero on every path here; verify.  */
+	RETURN
+
+	.p2align 4
+L(Fill4):
+	movl	%edx, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill5):
+	movl	%edx, (%edi)
+	movb	%dl, 4(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill6):
+	movl	%edx, (%edi)
+	movw	%dx, 4(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill7):
+	movlpd	%xmm0, -1(%edi)	/* Eight-byte store starting one byte before EDI.  */
+	RETURN
+
+	.p2align 4
+L(Fill8):
+	movlpd	%xmm0, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill9):
+	movlpd	%xmm0, (%edi)
+	movb	%dl, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill10):
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill11):
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)	/* Overlapping dword covers bytes 7..10.  */
+	RETURN
+
+	.p2align 4
+L(Fill12):
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill13):
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm0, 5(%edi)	/* Overlapping qword covers bytes 5..12.  */
+	RETURN
+
+	.p2align 4
+L(Fill14):
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm0, 6(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill15):
+	movdqu	%xmm0, -1(%edi)	/* Sixteen-byte store starting one byte before EDI.  */
+	RETURN
+
+	.p2align 4
+L(Fill16):
+	movdqu	%xmm0, (%edi)
+	RETURN
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm2):
+	movdqu	%xmm2, (%edi, %ecx)	/* Store the block holding the NUL, then fall through.  */
+
+	.p2align 4
+L(CopyFrom1To16BytesXmmExit):	/* Common exit once the NUL-holding block has been stored at (EDI,ECX).  */
+	bsf	%edx, %edx	/* EDX = offset of the first NUL inside that block.  */
+	add	$15, %ebx
+	add	%ecx, %edi	/* Fold the running offset into EDI.  */
+# ifdef USE_AS_STPCPY
+	lea	(%edi, %edx), %eax
+# endif
+	sub	%edx, %ebx	/* Adjust EBX to the count consumed by the fill code.  */
+	lea	1(%edi, %edx), %edi	/* EDI = one past the stored NUL; falls through into the fill code.  */
+
+	.p2align 4
+L(StrncpyFillTailWithZero):	/* Write EBX zero bytes starting at EDI, then return.  */
+	pxor	%xmm0, %xmm0	/* Zero sources for the stores here and in the FillN handlers.  */
+	xor	%edx, %edx
+	sub	$16, %ebx
+	jbe	L(StrncpyFillExit)	/* 16 bytes or fewer: one table-driven store finishes the job.  */
+
+	movdqu	%xmm0, (%edi)	/* Unaligned head store...  */
+	add	$16, %edi
+
+	mov	%edi, %esi	/* ESI is free to use as scratch from here on.  */
+	and	$0xf, %esi
+	sub	%esi, %edi	/* ...then round EDI down so movdqa can be used.  */
+	add	%esi, %ebx	/* Count grows by the bytes EDI moved back over.  */
+	sub	$64, %ebx
+	jb	L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):	/* 64 zero bytes per pass with aligned stores.  */
+	movdqa	%xmm0, (%edi)
+	movdqa	%xmm0, 16(%edi)
+	movdqa	%xmm0, 32(%edi)
+	movdqa	%xmm0, 48(%edi)
+	add	$64, %edi
+	sub	$64, %ebx
+	jae	L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+	add	$32, %ebx
+	jl	L(StrncpyFillLess32)
+	movdqa	%xmm0, (%edi)
+	movdqa	%xmm0, 16(%edi)
+	add	$32, %edi
+	sub	$16, %ebx
+	jl	L(StrncpyFillExit)
+	movdqa	%xmm0, (%edi)
+	add	$16, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillLess32):
+	add	$16, %ebx
+	jl	L(StrncpyFillExit)
+	movdqa	%xmm0, (%edi)
+	add	$16, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillExit):
+	add	$16, %ebx	/* Undo the last subtraction so EBX indexes FillTable.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+	.p2align 4
+L(UnalignedLeaveCase2OrCase3):	/* Limit reached inside the 64 bytes held in XMM4..XMM7.  */
+	test	%edx, %edx
+	jnz	L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):	/* No NUL in the group: store whole blocks while the limit allows.  */
+	lea	64(%ebx), %ecx
+	and	$-16, %ecx	/* ECX = offset consumed by CopyFrom1To16BytesCase3 for the partial block.  */
+	add	$48, %ebx
+	jl	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm4, (%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm5, 16(%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm6, 32(%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm7, 48(%edi)
+# ifdef USE_AS_STPCPY
+	lea	64(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Unaligned64LeaveCase2):	/* A NUL is present too: walk the four blocks in order.  */
+	xor	%ecx, %ecx	/* ECX = running offset of the block under test.  */
+	pcmpeqb	%xmm4, %xmm0	/* XMM0 is zero here: the loop only ever used it as a source.  */
+	pmovmskb %xmm0, %edx
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm4, (%edi)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm5)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm5, 16(%edi)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm6)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm6, 32(%edi)
+	lea	16(%edi, %ecx), %edi	/* Point both registers at the fourth block.  */
+	lea	16(%esi, %ecx), %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)	/* NUL offset below the remaining count: NUL comes first.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(ExitZero):	/* Reached from the entry test when the byte limit is zero.  */
+	movl	%edi, %eax	/* Return the unmodified destination pointer.  */
+	RETURN
+
+END (STRCPY)
+
+	.p2align 4
+	.section .rodata
+L(ExitTable):
+	.int	JMPTBL(L(Exit1), L(ExitTable))
+	.int	JMPTBL(L(Exit2), L(ExitTable))
+	.int	JMPTBL(L(Exit3), L(ExitTable))
+	.int	JMPTBL(L(Exit4), L(ExitTable))
+	.int	JMPTBL(L(Exit5), L(ExitTable))
+	.int	JMPTBL(L(Exit6), L(ExitTable))
+	.int	JMPTBL(L(Exit7), L(ExitTable))
+	.int	JMPTBL(L(Exit8), L(ExitTable))
+	.int	JMPTBL(L(Exit9), L(ExitTable))
+	.int	JMPTBL(L(Exit10), L(ExitTable))
+	.int	JMPTBL(L(Exit11), L(ExitTable))
+	.int	JMPTBL(L(Exit12), L(ExitTable))
+	.int	JMPTBL(L(Exit13), L(ExitTable))
+	.int	JMPTBL(L(Exit14), L(ExitTable))
+	.int	JMPTBL(L(Exit15), L(ExitTable))
+	.int	JMPTBL(L(Exit16), L(ExitTable))
+	.int	JMPTBL(L(Exit17), L(ExitTable))
+	.int	JMPTBL(L(Exit18), L(ExitTable))
+	.int	JMPTBL(L(Exit19), L(ExitTable))
+	.int	JMPTBL(L(Exit20), L(ExitTable))
+	.int	JMPTBL(L(Exit21), L(ExitTable))
+	.int	JMPTBL(L(Exit22), L(ExitTable))
+	.int    JMPTBL(L(Exit23), L(ExitTable))
+	.int	JMPTBL(L(Exit24), L(ExitTable))
+	.int	JMPTBL(L(Exit25), L(ExitTable))
+	.int	JMPTBL(L(Exit26), L(ExitTable))
+	.int	JMPTBL(L(Exit27), L(ExitTable))
+	.int	JMPTBL(L(Exit28), L(ExitTable))
+	.int	JMPTBL(L(Exit29), L(ExitTable))
+	.int	JMPTBL(L(Exit30), L(ExitTable))
+	.int	JMPTBL(L(Exit31), L(ExitTable))
+	.int	JMPTBL(L(Exit32), L(ExitTable))
+
+L(ExitStrncpyTable):	/* Jump table: slot 0 -> L(Exit0), slot n (1..33) -> L(StrncpyExit<n>).  JMPTBL is defined outside this view; presumably it emits each target relative to this label -- confirm.  */
+	.int	JMPTBL(L(Exit0), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+	.int    JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))	/* Last slot (index 33).  */
+
+	.p2align 4
+L(FillTable):	/* Jump table with 17 slots: slot n -> L(Fill<n>) for n = 0..16.  The L(Fill<n>) targets and the JMPTBL macro are defined outside this view.  */
+	.int	JMPTBL(L(Fill0), L(FillTable))
+	.int	JMPTBL(L(Fill1), L(FillTable))
+	.int	JMPTBL(L(Fill2), L(FillTable))
+	.int	JMPTBL(L(Fill3), L(FillTable))
+	.int	JMPTBL(L(Fill4), L(FillTable))
+	.int	JMPTBL(L(Fill5), L(FillTable))
+	.int	JMPTBL(L(Fill6), L(FillTable))
+	.int	JMPTBL(L(Fill7), L(FillTable))
+	.int	JMPTBL(L(Fill8), L(FillTable))
+	.int	JMPTBL(L(Fill9), L(FillTable))
+	.int	JMPTBL(L(Fill10), L(FillTable))
+	.int	JMPTBL(L(Fill11), L(FillTable))
+	.int	JMPTBL(L(Fill12), L(FillTable))
+	.int	JMPTBL(L(Fill13), L(FillTable))
+	.int	JMPTBL(L(Fill14), L(FillTable))
+	.int	JMPTBL(L(Fill15), L(FillTable))
+	.int	JMPTBL(L(Fill16), L(FillTable))	/* Last slot (index 16).  */
+# else
+#  define PARMS  4
+#  define ENTRANCE
+#  define RETURN  POP (%edi); ret; CFI_PUSH (%edi)
+#  define RETURN1  ret
+
+	.text
+ENTRY (STRCPY)	/* Plain strcpy/stpcpy variant: copy bytes from the second stack argument to the first, up to and including the NUL.  */
+	ENTRANCE
+	mov	STR1(%esp), %edx	/* edx = pointer that is stored through below (destination).  */
+	mov	STR2(%esp), %ecx	/* ecx = pointer that is loaded from below (source).  */
+
+	cmpb	$0, (%ecx)	/* Check source bytes 0..15 one at a time for the NUL ...  */
+	jz	L(ExitTail1)	/* ... and leave via L(ExitTail<k+1>) when byte k is the NUL; nothing has been pushed yet.  */
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	cmpb	$0, 3(%ecx)
+	jz	L(ExitTail4)
+	cmpb	$0, 4(%ecx)
+	jz	L(ExitTail5)
+	cmpb	$0, 5(%ecx)
+	jz	L(ExitTail6)
+	cmpb	$0, 6(%ecx)
+	jz	L(ExitTail7)
+	cmpb	$0, 7(%ecx)
+	jz	L(ExitTail8)
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	cmpb	$0, 11(%ecx)
+	jz	L(ExitTail12)
+	cmpb	$0, 12(%ecx)
+	jz	L(ExitTail13)
+	cmpb	$0, 13(%ecx)
+	jz	L(ExitTail14)
+	cmpb	$0, 14(%ecx)
+	jz	L(ExitTail15)
+	cmpb	$0, 15(%ecx)
+	jz	L(ExitTail16)
+
+	PUSH	(%edi)	/* From here on edi and ebx are in use; the L(Exit<n>) paths undo these pushes.  */
+	PUSH	(%ebx)
+
+	mov	%edx, %edi	/* edi = original destination, used as the non-stpcpy return value.  */
+	lea	16(%ecx), %ebx
+	and	$-16, %ebx	/* ebx = first 16-byte-aligned address above ecx.  */
+	pxor	%xmm0, %xmm0	/* xmm0 = 0, the comparand for NUL detection.  */
+	movdqu	(%ecx), %xmm1	/* The first 16 source bytes hold no NUL (checked above): copy them unaligned.  */
+	movdqu	%xmm1, (%edx)
+	pcmpeqb	(%ebx), %xmm0	/* Compare the first aligned 16-byte block with zero.  */
+	pmovmskb %xmm0, %eax	/* eax bit k set <=> byte k of that block is NUL.  */
+	sub	%ecx, %ebx	/* ebx = offset of that block from ecx, as L(CopyFrom1To16Bytes) expects.  */
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%ecx, %eax	/* Advance ecx to the next 16-byte boundary ...  */
+	lea	16(%ecx), %ecx
+	and	$-16, %ecx
+	sub	%ecx, %eax	/* eax = old ecx - new ecx (negative advance).  */
+	sub	%eax, %edx	/* ... and move edx forward by the same number of bytes.  */
+	xor	%ebx, %ebx	/* ebx = running offset from ecx/edx, starts at 0.  */
+
+	.p2align 4
+	movdqa	(%ecx), %xmm1	/* Unrolled step 1: aligned load of the current block ...  */
+	movaps	16(%ecx), %xmm2	/* ... and of the look-ahead block.  */
+	movdqu	%xmm1, (%edx)	/* Store the current block; the store is unaligned-safe.  */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 is still all-zero here, so equal bytes mark NULs in the look-ahead block.  */
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx	/* ebx = offset of the look-ahead block.  */
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)	/* NUL in the look-ahead block: finish with a partial copy of it.  */
+
+	movaps	16(%ecx, %ebx), %xmm3	/* Steps 2..6 repeat the pattern: load next, store previous, test next.  */
+	movdqu	%xmm2, (%edx, %ebx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm4
+	movdqu	%xmm3, (%edx, %ebx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm1
+	movdqu	%xmm4, (%edx, %ebx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm2
+	movdqu	%xmm1, (%edx, %ebx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm3
+	movdqu	%xmm2, (%edx, %ebx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx	/* ebx = 0x60 after the sixth step.  */
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movdqu	%xmm3, (%edx, %ebx)	/* Store the last tested block before switching to 64-byte steps.  */
+	mov	%ecx, %eax
+	lea	16(%ecx, %ebx), %ecx
+	and	$-0x40, %ecx	/* Round the source position down to a 64-byte boundary; this may revisit bytes already copied.  */
+	sub	%ecx, %eax
+	sub	%eax, %edx	/* Shift edx by the same delta so ecx/edx stay in step.  */
+
+L(Aligned64Loop):	/* First pass: load and test 64 aligned source bytes; nothing is stored yet.  */
+	movaps	(%ecx), %xmm2
+	movaps	%xmm2, %xmm4	/* xmm4..xmm7 keep the four raw blocks for the later stores.  */
+	movaps	16(%ecx), %xmm5
+	movaps	32(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm5, %xmm2	/* Byte-wise unsigned minimum: a zero byte in any of the four blocks ...  */
+	add	$64, %ecx
+	pminub	%xmm7, %xmm3
+	add	$64, %edx
+	pminub	%xmm2, %xmm3	/* ... leaves a zero byte in xmm3.  */
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(Aligned64Leave)
+L(Aligned64Loop_start):	/* Steady state: store the previous 64 bytes while loading and testing the next 64.  */
+	movdqu	%xmm4, -64(%edx)
+	movaps	(%ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%edx)
+	movaps	16(%ecx), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%ecx), %xmm3
+	movdqu	%xmm6, -32(%edx)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%edx)
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm3, %xmm0	/* Result lands in xmm0; it remains all-zero whenever the loop continues.  */
+	pmovmskb %xmm0, %eax
+	add	$64, %edx
+	add	$64, %ecx
+	test	%eax, %eax
+	jz	L(Aligned64Loop_start)
+L(Aligned64Leave):	/* xmm4..xmm7 hold 64 not-yet-stored bytes, one of which is NUL; ecx/edx already point 64 bytes past them.  */
+	sub	$0xa0, %ebx	/* ebx was 0x60 on entry to the loop, so this gives -0x40: offset back to the first block.  */
+	pxor	%xmm0, %xmm0	/* xmm0 may hold a compare result from the loop; reset it to zero.  */
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %eax
+	movdqu	%xmm4, -64(%edx)	/* First block is NUL-free: store it whole.  */
+	test	%eax, %eax
+	lea	16(%ebx), %ebx	/* lea leaves the flags from the test above intact for the jnz.  */
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %eax
+	movdqu	%xmm5, -48(%edx)
+	test	%eax, %eax
+	lea	16(%ebx), %ebx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movdqu	%xmm6, -32(%edx)
+	pcmpeqb	%xmm7, %xmm0	/* The NUL was in none of the first three blocks, so it is in the fourth: no test needed.  */
+	pmovmskb %xmm0, %eax
+	lea	16(%ebx), %ebx	/* Execution falls through into L(CopyFrom1To16Bytes).  */
+
+/*-----------------End of main part---------------------------*/
+
+	.p2align 4
+L(CopyFrom1To16Bytes):	/* In: eax = NUL bitmask of one 16-byte source block, ebx = that block's offset from ecx/edx.  Copies the bytes up to and including the NUL.  */
+	add	%ebx, %edx
+	add	%ebx, %ecx	/* ecx/edx now address the block that contains the NUL.  */
+
+	POP	(%ebx)	/* ebx is no longer needed; edi is popped by RETURN.  */
+	test	%al, %al
+	jz	L(ExitHigh)	/* No NUL among bytes 0..7: look at mask bits 8..15.  */
+	test	$0x01, %al	/* Find the lowest set bit: bit k set -> L(Exit<k+1>) copies k+1 bytes.  */
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	/* Exit 8 */
+	movl	(%ecx), %eax	/* Only bit 7 can be set here: copy 8 bytes as two 4-byte moves.  */
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	7(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 7).  */
+# else
+	movl	%edi, %eax	/* Result: the destination pointer saved in edi by the entry code.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitHigh):	/* Reached when mask bits 0..7 are clear: the NUL is among bytes 8..15 of the block (mask bits in ah).  */
+	test	$0x01, %ah	/* Lowest set bit k of ah -> L(Exit<k+9>) copies k+9 bytes.  */
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	/* Exit 16 */
+	movlpd	(%ecx), %xmm0	/* No lower bit matched: copy all 16 bytes as two 8-byte moves.  */
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm0
+	movlpd	%xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	15(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 15).  */
+# else
+	movl	%edi, %eax	/* Result: the destination pointer saved in edi by the entry code.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit1):	/* Copy 1 byte from (%ecx) to (%edx), then return via RETURN (which pops edi).  */
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	(%edx), %eax	/* stpcpy result: edx itself, the address of the byte just written.  */
+# else
+	movl	%edi, %eax	/* Result: destination pointer kept in edi.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit2):	/* Copy 2 bytes from (%ecx) to (%edx) with one 16-bit move, then return via RETURN.  */
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	1(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 1).  */
+# else
+	movl	%edi, %eax	/* Result: destination pointer kept in edi.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit3):	/* Copy 3 bytes from (%ecx) to (%edx): a 16-bit move plus one byte.  */
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+# ifdef USE_AS_STPCPY
+	lea	2(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 2).  */
+# else
+	movl	%edi, %eax	/* Result: destination pointer kept in edi.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit4):	/* Copy 4 bytes from (%ecx) to (%edx) with one 32-bit move.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	3(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 3).  */
+# else
+	movl	%edi, %eax	/* Result: destination pointer kept in edi.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit5):	/* Copy 5 bytes from (%ecx) to (%edx): a 32-bit move plus one byte.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	4(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 4).  */
+# else
+	movl	%edi, %eax	/* Result: destination pointer kept in edi.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit6):	/* Copy 6 bytes from (%ecx) to (%edx): a 32-bit move plus a 16-bit move.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	5(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 5).  */
+# else
+	movl	%edi, %eax	/* Result: destination pointer kept in edi.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit7):	/* Copy 7 bytes from (%ecx) to (%edx) using two overlapping 32-bit moves.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax	/* Bytes 3..6; byte 3 is written twice with the same value.  */
+	movl	%eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+	lea	6(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 6).  */
+# else
+	movl	%edi, %eax	/* Result: destination pointer kept in edi.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit9):	/* Copy 9 bytes from (%ecx) to (%edx): two 32-bit moves plus one byte.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	8(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 8).  */
+# else
+	movl	%edi, %eax	/* Result: destination pointer kept in edi.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit10):	/* Copy 10 bytes from (%ecx) to (%edx): two 32-bit moves plus a 16-bit move.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	9(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 9).  */
+# else
+	movl	%edi, %eax	/* Result: destination pointer kept in edi.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit11):	/* Copy 11 bytes from (%ecx) to (%edx) with three 32-bit moves, the last one overlapping.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	7(%ecx), %eax	/* Bytes 7..10; byte 7 is written twice with the same value.  */
+	movl	%eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	10(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 10).  */
+# else
+	movl	%edi, %eax	/* Result: destination pointer kept in edi.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit12):	/* Copy 12 bytes from (%ecx) to (%edx) with three 32-bit moves.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	11(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 11).  */
+# else
+	movl	%edi, %eax	/* Result: destination pointer kept in edi.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit13):	/* Copy 13 bytes from (%ecx) to (%edx) using two overlapping 8-byte moves.  */
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0	/* Bytes 5..12; bytes 5..7 are rewritten with the same values.  */
+	movlpd	%xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+	lea	12(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 12).  */
+# else
+	movl	%edi, %eax	/* Result: destination pointer kept in edi.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit14):	/* Copy 14 bytes from (%ecx) to (%edx) using two overlapping 8-byte moves.  */
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0	/* Bytes 6..13; bytes 6..7 are rewritten with the same values.  */
+	movlpd	%xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+	lea	13(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 13).  */
+# else
+	movl	%edi, %eax	/* Result: destination pointer kept in edi.  */
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit15):	/* Copy 15 bytes from (%ecx) to (%edx) using two overlapping 8-byte moves.  */
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0	/* Bytes 7..14; byte 7 is rewritten with the same value.  */
+	movlpd	%xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	14(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 14).  */
+# else
+	movl	%edi, %eax	/* Result: destination pointer kept in edi.  */
+# endif
+	RETURN
+
+CFI_POP (%edi)
+
+	.p2align 4
+L(ExitTail1):	/* Source byte 0 is the NUL; reached from the entry checks before any register is pushed.  */
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	movl	%edx, %eax	/* Result is edx in every build: no USE_AS_STPCPY split is needed for a 1-byte copy.  */
+	RETURN1
+
+	.p2align 4
+L(ExitTail2):	/* Copy 2 bytes from (%ecx) to (%edx); entry-path exit, nothing was pushed, RETURN1 is a plain ret.  */
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	1(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 1).  */
+# else
+	movl	%edx, %eax	/* Result: edx, still the unmodified destination pointer.  */
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail3):	/* Copy 3 bytes from (%ecx) to (%edx): a 16-bit move plus one byte; entry-path exit.  */
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+# ifdef USE_AS_STPCPY
+	lea	2(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 2).  */
+# else
+	movl	%edx, %eax	/* Result: edx, still the unmodified destination pointer.  */
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail4):	/* Copy 4 bytes from (%ecx) to (%edx) with one 32-bit move; entry-path exit.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	3(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 3).  */
+# else
+	movl	%edx, %eax	/* Result: edx, still the unmodified destination pointer.  */
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail5):	/* Copy 5 bytes from (%ecx) to (%edx): a 32-bit move plus one byte; entry-path exit.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	4(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 4).  */
+# else
+	movl	%edx, %eax	/* Result: edx, still the unmodified destination pointer.  */
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail6):	/* Copy 6 bytes from (%ecx) to (%edx): a 32-bit move plus a 16-bit move; entry-path exit.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	5(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 5).  */
+# else
+	movl	%edx, %eax	/* Result: edx, still the unmodified destination pointer.  */
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail7):	/* Copy 7 bytes from (%ecx) to (%edx) using two overlapping 32-bit moves; entry-path exit.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax	/* Bytes 3..6; byte 3 is written twice with the same value.  */
+	movl	%eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+	lea	6(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 6).  */
+# else
+	movl	%edx, %eax	/* Result: edx, still the unmodified destination pointer.  */
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail8):	/* Copy 8 bytes from (%ecx) to (%edx) with two 32-bit moves; entry-path exit.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	7(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 7).  */
+# else
+	movl	%edx, %eax	/* Result: edx, still the unmodified destination pointer.  */
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail9):	/* Copy 9 bytes from (%ecx) to (%edx): two 32-bit moves plus one byte; entry-path exit.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	8(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 8).  */
+# else
+	movl	%edx, %eax	/* Result: edx, still the unmodified destination pointer.  */
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail10):	/* Copy 10 bytes from (%ecx) to (%edx): two 32-bit moves plus a 16-bit move; entry-path exit.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	9(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 9).  */
+# else
+	movl	%edx, %eax	/* Result: edx, still the unmodified destination pointer.  */
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail11):	/* Copy 11 bytes from (%ecx) to (%edx) with three 32-bit moves, the last one overlapping; entry-path exit.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	7(%ecx), %eax	/* Bytes 7..10; byte 7 is written twice with the same value.  */
+	movl	%eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	10(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 10).  */
+# else
+	movl	%edx, %eax	/* Result: edx, still the unmodified destination pointer.  */
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail12):	/* Copy 12 bytes from (%ecx) to (%edx) with three 32-bit moves; entry-path exit.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	11(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 11).  */
+# else
+	movl	%edx, %eax	/* Result: edx, still the unmodified destination pointer.  */
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail13):	/* Copy 13 bytes from (%ecx) to (%edx) using two overlapping 8-byte moves; entry-path exit.  */
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0	/* Bytes 5..12; bytes 5..7 are rewritten with the same values.  */
+	movlpd	%xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+	lea	12(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 12).  */
+# else
+	movl	%edx, %eax	/* Result: edx, still the unmodified destination pointer.  */
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail14):	/* Copy 14 bytes from (%ecx) to (%edx) using two overlapping 8-byte moves; entry-path exit.  */
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0	/* Bytes 6..13; bytes 6..7 are rewritten with the same values.  */
+	movlpd	%xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+	lea	13(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 13).  */
+# else
+	movl	%edx, %eax	/* Result: edx, still the unmodified destination pointer.  */
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail15):	/* Copy 15 bytes from (%ecx) to (%edx) using two overlapping 8-byte moves; entry-path exit.  */
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0	/* Bytes 7..14; byte 7 is rewritten with the same value.  */
+	movlpd	%xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	14(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 14).  */
+# else
+	movl	%edx, %eax	/* Result: edx, still the unmodified destination pointer.  */
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail16):	/* Copy 16 bytes from (%ecx) to (%edx) with two 8-byte moves; entry-path exit.  */
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm0
+	movlpd	%xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	15(%edx), %eax	/* stpcpy result: address of the last byte copied (offset 15).  */
+# else
+	movl	%edx, %eax	/* Result: edx, still the unmodified destination pointer.  */
+# endif
+	RETURN1
+
+END (STRCPY)
+# endif
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
new file mode 100644
index 0000000000..effd85da94
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
@@ -0,0 +1,3901 @@
+/* strcpy with SSSE3
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+#  include <sysdep.h>
+
+#  define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)	/* Unwind bookkeeping for a 4-byte push of REG: CFA offset +4, REG recorded at offset 0.  */
+
+#  define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)	/* Inverse bookkeeping for a 4-byte pop of REG.  */
+
+#  define PUSH(REG)	pushl REG; CFI_PUSH (REG)	/* Push REG and record it in the unwind info.  */
+#  define POP(REG)	popl REG; CFI_POP (REG)	/* Pop REG and record it in the unwind info.  */
+
+#  ifndef STRCPY
+#   define STRCPY  __strcpy_ssse3	/* Default symbol name unless the includer already chose one.  */
+#  endif
+
+#  ifdef USE_AS_STRNCPY
+#   define PARMS  8	/* First argument is at 8(%esp): return address plus the ebx pushed by ENTRANCE.  */
+#   define ENTRANCE PUSH (%ebx)	/* ebx is loaded from LEN(%esp) right after entry in this build.  */
+#   define RETURN  POP (%ebx); ret; CFI_PUSH (%ebx);	/* Return with only ebx saved; the trailing CFI_PUSH re-establishes unwind state for code placed after the ret.  */
+#   define RETURN1  POP (%edi); POP (%ebx); ret; CFI_PUSH (%ebx); CFI_PUSH (%edi)	/* Return with edi and ebx saved.  */
+#  else
+#   define PARMS  4	/* First argument is at 4(%esp): only the return address precedes it.  */
+#   define ENTRANCE
+#   define RETURN  ret	/* Nothing saved on this path.  */
+#   define RETURN1  POP (%edi); ret; CFI_PUSH (%edi)	/* Return with edi saved.  */
+#  endif
+
+#  ifdef USE_AS_STPCPY
+#   define SAVE_RESULT(n)  lea	n(%edx), %eax	/* stpcpy: result is edx + n.  */
+#   define SAVE_RESULT_TAIL(n)  lea	n(%edx), %eax	/* Same as SAVE_RESULT in the stpcpy build.  */
+#  else
+#   define SAVE_RESULT(n)  movl	%edi, %eax	/* Result is edi (n unused); edi receives a copy of the destination after PUSH (%edi) in the entry code.  */
+#   define SAVE_RESULT_TAIL(n)  movl	%edx, %eax	/* Result is edx (n unused); presumably for exits taken before edi is set up -- the users are outside this view.  */
+#  endif
+
+#  define STR1  PARMS	/* Stack offset of the first argument (loaded into edx at entry).  */
+#  define STR2  STR1+4	/* Stack offset of the second argument (loaded into ecx at entry).  */
+#  define LEN  STR2+4	/* Stack offset of the third argument; read only under USE_AS_STRNCPY in the code visible here.  */
+
+/* In this code following instructions are used for copying:
+	movb	- 1 byte
+	movw	- 2 byte
+	movl	- 4 byte
+	movlpd	- 8 byte
+	movaps	- 16 byte - requires 16 byte alignment
+	of	source and destination addresses.
+*/
+
+.text
+ENTRY (STRCPY)
+	ENTRANCE
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %ecx
+#  ifdef USE_AS_STRNCPY
+	movl	LEN(%esp), %ebx
+	cmp	$8, %ebx
+	jbe	L(StrncpyExit8Bytes)
+#  endif
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	cmpb	$0, 3(%ecx)
+	jz	L(ExitTail4)
+	cmpb	$0, 4(%ecx)
+	jz	L(ExitTail5)
+	cmpb	$0, 5(%ecx)
+	jz	L(ExitTail6)
+	cmpb	$0, 6(%ecx)
+	jz	L(ExitTail7)
+	cmpb	$0, 7(%ecx)
+	jz	L(ExitTail8)
+#  ifdef USE_AS_STRNCPY
+	cmp	$16, %ebx
+	jb	L(StrncpyExit15Bytes)
+#  endif
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	cmpb	$0, 11(%ecx)
+	jz	L(ExitTail12)
+	cmpb	$0, 12(%ecx)
+	jz	L(ExitTail13)
+	cmpb	$0, 13(%ecx)
+	jz	L(ExitTail14)
+	cmpb	$0, 14(%ecx)
+	jz	L(ExitTail15)
+#  ifdef USE_AS_STRNCPY
+	cmp	$16, %ebx
+	je	L(ExitTail16)
+#  endif
+	cmpb	$0, 15(%ecx)
+	jz	L(ExitTail16)
+
+	PUSH	(%edi)
+	mov	%edx, %edi
+# endif
+	PUSH	(%esi)
+# ifdef USE_AS_STRNCPY
+	mov	%ecx, %esi
+	sub	$16, %ebx
+	and	$0xf, %esi
+
+/* add 16 bytes ecx_offset to ebx */
+
+	add	%esi, %ebx
+# endif
+	lea	16(%ecx), %esi
+	and	$-16, %esi
+	pxor	%xmm0, %xmm0
+	movlpd	(%ecx), %xmm1
+	movlpd	%xmm1, (%edx)
+
+	pcmpeqb	(%esi), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm1, 8(%edx)
+
+	pmovmskb %xmm0, %eax
+	sub	%ecx, %esi
+
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%edx, %eax
+	lea	16(%edx), %edx
+	and	$-16, %edx
+	sub	%edx, %eax
+
+# ifdef USE_AS_STRNCPY
+	add	%eax, %esi
+	lea	-1(%esi), %esi
+	and	$1<<31, %esi
+	test	%esi, %esi
+	jnz	L(ContinueCopy)
+	lea	16(%ebx), %ebx
+
+L(ContinueCopy):
+# endif
+	sub	%eax, %ecx
+	mov	%ecx, %eax
+	and	$0xf, %eax
+	mov	$0, %esi
+
+/* case: ecx_offset == edx_offset */
+
+	jz	L(Align16Both)
+
+	cmp	$8, %eax
+	jae	L(ShlHigh8)
+	cmp	$1, %eax
+	je	L(Shl1)
+	cmp	$2, %eax
+	je	L(Shl2)
+	cmp	$3, %eax
+	je	L(Shl3)
+	cmp	$4, %eax
+	je	L(Shl4)
+	cmp	$5, %eax
+	je	L(Shl5)
+	cmp	$6, %eax
+	je	L(Shl6)
+	jmp	L(Shl7)
+
+L(ShlHigh8):
+	je	L(Shl8)
+	cmp	$9, %eax
+	je	L(Shl9)
+	cmp	$10, %eax
+	je	L(Shl10)
+	cmp	$11, %eax
+	je	L(Shl11)
+	cmp	$12, %eax
+	je	L(Shl12)
+	cmp	$13, %eax
+	je	L(Shl13)
+	cmp	$14, %eax
+	je	L(Shl14)
+	jmp	L(Shl15)
+
+L(Align16Both):
+	movaps	(%ecx), %xmm1
+	movaps	16(%ecx), %xmm2
+	movaps	%xmm1, (%edx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm4
+	movaps	%xmm3, (%edx, %esi)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm1
+	movaps	%xmm4, (%edx, %esi)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm2
+	movaps	%xmm1, (%edx, %esi)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%edx, %esi)
+	mov	%ecx, %eax
+	lea	16(%ecx, %esi), %ecx
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	lea	112(%ebx, %eax), %ebx
+# endif
+	mov	$-0x40, %esi
+
+L(Aligned64Loop):
+	movaps	(%ecx), %xmm2
+	movaps	32(%ecx), %xmm3
+	movaps	%xmm2, %xmm4
+	movaps	16(%ecx), %xmm5
+	movaps	%xmm3, %xmm6
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	lea	64(%edx), %edx
+	pcmpeqb	%xmm0, %xmm3
+	lea	64(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeaveCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%edx)
+	movaps	%xmm5, -48(%edx)
+	movaps	%xmm6, -32(%edx)
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):
+# ifdef USE_AS_STRNCPY
+	lea	48(%ebx), %ebx
+# endif
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%ebx), %ebx
+# endif
+	pmovmskb %xmm0, %eax
+	movaps	%xmm4, -64(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%ebx), %ebx
+# endif
+	pmovmskb %xmm0, %eax
+	movaps	%xmm5, -48(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%edx)
+	pcmpeqb	%xmm7, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%ebx), %ebx
+# endif
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl1):
+	movaps	-1(%ecx), %xmm1
+	movaps	15(%ecx), %xmm2
+L(Shl1Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	31(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-15(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-1(%ecx), %xmm1
+
+L(Shl1LoopStart):
+	movaps	15(%ecx), %xmm2
+	movaps	31(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	47(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	63(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$1, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$1, %xmm3, %xmm4
+	jnz	L(Shl1Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave1)
+# endif
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl1LoopStart)
+
+L(Shl1LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+	mov	$15, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl2):
+	movaps	-2(%ecx), %xmm1
+	movaps	14(%ecx), %xmm2
+L(Shl2Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	30(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-14(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-2(%ecx), %xmm1
+
+L(Shl2LoopStart):
+	movaps	14(%ecx), %xmm2
+	movaps	30(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	46(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	62(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$2, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$2, %xmm3, %xmm4
+	jnz	L(Shl2Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave2)
+# endif
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl2LoopStart)
+
+L(Shl2LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	6(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	mov	$14, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl3):
+	movaps	-3(%ecx), %xmm1
+	movaps	13(%ecx), %xmm2
+L(Shl3Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	29(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-13(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-3(%ecx), %xmm1
+
+L(Shl3LoopStart):
+	movaps	13(%ecx), %xmm2
+	movaps	29(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	45(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	61(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$3, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$3, %xmm3, %xmm4
+	jnz	L(Shl3Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave3)
+# endif
+	palignr	$3, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl3LoopStart)
+
+L(Shl3LoopExit):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	mov	$13, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl4):
+	movaps	-4(%ecx), %xmm1
+	movaps	12(%ecx), %xmm2
+L(Shl4Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	28(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-12(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+	movaps	12(%ecx), %xmm2
+	movaps	28(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$4, %xmm3, %xmm4
+	jnz	L(Shl4Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave4)
+# endif
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 8(%edx)
+	mov	$12, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl5):
+	movaps	-5(%ecx), %xmm1
+	movaps	11(%ecx), %xmm2
+L(Shl5Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	27(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-11(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-5(%ecx), %xmm1
+
+L(Shl5LoopStart):
+	movaps	11(%ecx), %xmm2
+	movaps	27(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	43(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	59(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$5, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$5, %xmm3, %xmm4
+	jnz	L(Shl5Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave5)
+# endif
+	palignr	$5, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl5LoopStart)
+
+L(Shl5LoopExit):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 7(%edx)
+	mov	$11, %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl6):	/* Source is 6 bytes past 16-byte alignment: movaps below needs -6(%ecx) aligned.  */
+	movaps	-6(%ecx), %xmm1	/* aligned chunk that contains the current source byte */
+	movaps	10(%ecx), %xmm2	/* next aligned chunk */
+L(Shl6Start):	/* Also entered from the 64-byte loop below, with xmm1/xmm2 already loaded.  */
+	pcmpeqb	%xmm2, %xmm0	/* NOTE(review): relies on xmm0 being all-zero here; it is set up outside this block */
+	pmovmskb %xmm0, %eax	/* eax = mask of bytes in the chunk at 10(%ecx) that matched */
+	movaps	%xmm2, %xmm3	/* save the raw chunk; palignr overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx	/* strncpy: 16 more bytes charged to the count */
+	jbe	L(StrncpyExit6Case2OrCase3)	/* taken when 16 or fewer bytes remained */
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)	/* match inside this chunk */
+
+	palignr	$6, %xmm1, %xmm2	/* xmm2 = last 10 bytes of xmm1, then first 6 bytes of xmm2 */
+	movaps	%xmm3, %xmm1	/* raw chunk becomes the previous chunk */
+	movaps	%xmm2, (%edx)	/* aligned 16-byte store */
+	movaps	26(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm3, %xmm2	/* xmm3 still holds the raw previous chunk */
+	movaps	%xmm2, (%edx)
+	lea	26(%ecx), %ecx	/* ecx = aligned address just past the last chunk loaded */
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx	/* round the source down to a 64-byte boundary */
+	sub	%ecx, %eax	/* eax = distance moved back by the rounding */
+	lea	-10(%ecx), %ecx	/* bias so that 10(%ecx) is the 64-byte-aligned chunk */
+	sub	%eax, %edx	/* move the destination back by the same distance */
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx	/* and return those bytes to the strncpy count */
+# endif
+	movaps	-6(%ecx), %xmm1	/* chunk preceding the first loop chunk */
+
+L(Shl6LoopStart):	/* Main loop: four aligned chunks (64 bytes) per pass.  */
+	movaps	10(%ecx), %xmm2
+	movaps	26(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	42(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	58(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = byte-wise minimum of all four chunks */
+	pcmpeqb	%xmm0, %xmm7	/* compare against xmm0 (presumed zero, see above) */
+	pmovmskb %xmm7, %eax	/* non-zero iff some minimum byte matched */
+	movaps	%xmm5, %xmm7	/* keep the raw last chunk; palignr rewrites xmm5 */
+	palignr	$6, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$6, %xmm3, %xmm4	/* palignr leaves the flags from test untouched */
+	jnz	L(Shl6Start)	/* redo these chunks 16 bytes at a time */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave6)	/* taken when 64 or fewer bytes remained */
+# endif
+	palignr	$6, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1	/* raw last chunk is "previous" for the next pass */
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl6LoopStart)
+
+L(Shl6LoopExit):	/* Match is in the chunk at 10(%ecx): copy the 10 bytes before it, then the tail.  */
+	movlpd	(%ecx), %xmm0	/* bytes 0..7 */
+	movl	6(%ecx), %esi	/* bytes 6..9 (overlapping); esi is used as scratch */
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 6(%edx)
+	mov	$10, %esi	/* advance that L(CopyFrom1To16Bytes) adds to %ecx and %edx */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl7):	/* Source is 7 bytes past 16-byte alignment: movaps below needs -7(%ecx) aligned.  */
+	movaps	-7(%ecx), %xmm1	/* aligned chunk that contains the current source byte */
+	movaps	9(%ecx), %xmm2	/* next aligned chunk */
+L(Shl7Start):	/* Also entered from the 64-byte loop below, with xmm1/xmm2 already loaded.  */
+	pcmpeqb	%xmm2, %xmm0	/* NOTE(review): relies on xmm0 being all-zero here; it is set up outside this block */
+	pmovmskb %xmm0, %eax	/* eax = mask of bytes in the chunk at 9(%ecx) that matched */
+	movaps	%xmm2, %xmm3	/* save the raw chunk; palignr overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx	/* strncpy: 16 more bytes charged to the count */
+	jbe	L(StrncpyExit7Case2OrCase3)	/* taken when 16 or fewer bytes remained */
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)	/* match inside this chunk */
+
+	palignr	$7, %xmm1, %xmm2	/* xmm2 = last 9 bytes of xmm1, then first 7 bytes of xmm2 */
+	movaps	%xmm3, %xmm1	/* raw chunk becomes the previous chunk */
+	movaps	%xmm2, (%edx)	/* aligned 16-byte store */
+	movaps	25(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm3, %xmm2	/* xmm3 still holds the raw previous chunk */
+	movaps	%xmm2, (%edx)
+	lea	25(%ecx), %ecx	/* ecx = aligned address just past the last chunk loaded */
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx	/* round the source down to a 64-byte boundary */
+	sub	%ecx, %eax	/* eax = distance moved back by the rounding */
+	lea	-9(%ecx), %ecx	/* bias so that 9(%ecx) is the 64-byte-aligned chunk */
+	sub	%eax, %edx	/* move the destination back by the same distance */
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx	/* and return those bytes to the strncpy count */
+# endif
+	movaps	-7(%ecx), %xmm1	/* chunk preceding the first loop chunk */
+
+L(Shl7LoopStart):	/* Main loop: four aligned chunks (64 bytes) per pass.  */
+	movaps	9(%ecx), %xmm2
+	movaps	25(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	41(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	57(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = byte-wise minimum of all four chunks */
+	pcmpeqb	%xmm0, %xmm7	/* compare against xmm0 (presumed zero, see above) */
+	pmovmskb %xmm7, %eax	/* non-zero iff some minimum byte matched */
+	movaps	%xmm5, %xmm7	/* keep the raw last chunk; palignr rewrites xmm5 */
+	palignr	$7, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$7, %xmm3, %xmm4	/* palignr leaves the flags from test untouched */
+	jnz	L(Shl7Start)	/* redo these chunks 16 bytes at a time */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave7)	/* taken when 64 or fewer bytes remained */
+# endif
+	palignr	$7, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1	/* raw last chunk is "previous" for the next pass */
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl7LoopStart)
+
+L(Shl7LoopExit):	/* Match is in the chunk at 9(%ecx): copy the 9 bytes before it, then the tail.  */
+	movlpd	(%ecx), %xmm0	/* bytes 0..7 */
+	movl	5(%ecx), %esi	/* bytes 5..8 (overlapping); esi is used as scratch */
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 5(%edx)
+	mov	$9, %esi	/* advance that L(CopyFrom1To16Bytes) adds to %ecx and %edx */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl8):	/* Source is 8 bytes past 16-byte alignment: movaps below needs -8(%ecx) aligned.  */
+	movaps	-8(%ecx), %xmm1	/* aligned chunk that contains the current source byte */
+	movaps	8(%ecx), %xmm2	/* next aligned chunk */
+L(Shl8Start):	/* Also entered from the 64-byte loop below, with xmm1/xmm2 already loaded.  */
+	pcmpeqb	%xmm2, %xmm0	/* NOTE(review): relies on xmm0 being all-zero here; it is set up outside this block */
+	pmovmskb %xmm0, %eax	/* eax = mask of bytes in the chunk at 8(%ecx) that matched */
+	movaps	%xmm2, %xmm3	/* save the raw chunk; palignr overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx	/* strncpy: 16 more bytes charged to the count */
+	jbe	L(StrncpyExit8Case2OrCase3)	/* taken when 16 or fewer bytes remained */
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)	/* match inside this chunk */
+
+	palignr	$8, %xmm1, %xmm2	/* xmm2 = last 8 bytes of xmm1, then first 8 bytes of xmm2 */
+	movaps	%xmm3, %xmm1	/* raw chunk becomes the previous chunk */
+	movaps	%xmm2, (%edx)	/* aligned 16-byte store */
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2	/* xmm3 still holds the raw previous chunk */
+	movaps	%xmm2, (%edx)
+	lea	24(%ecx), %ecx	/* ecx = aligned address just past the last chunk loaded */
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx	/* round the source down to a 64-byte boundary */
+	sub	%ecx, %eax	/* eax = distance moved back by the rounding */
+	lea	-8(%ecx), %ecx	/* bias so that 8(%ecx) is the 64-byte-aligned chunk */
+	sub	%eax, %edx	/* move the destination back by the same distance */
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx	/* and return those bytes to the strncpy count */
+# endif
+	movaps	-8(%ecx), %xmm1	/* chunk preceding the first loop chunk */
+
+L(Shl8LoopStart):	/* Main loop: four aligned chunks (64 bytes) per pass.  */
+	movaps	8(%ecx), %xmm2
+	movaps	24(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = byte-wise minimum of all four chunks */
+	pcmpeqb	%xmm0, %xmm7	/* compare against xmm0 (presumed zero, see above) */
+	pmovmskb %xmm7, %eax	/* non-zero iff some minimum byte matched */
+	movaps	%xmm5, %xmm7	/* keep the raw last chunk; palignr rewrites xmm5 */
+	palignr	$8, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$8, %xmm3, %xmm4	/* palignr leaves the flags from test untouched */
+	jnz	L(Shl8Start)	/* redo these chunks 16 bytes at a time */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave8)	/* taken when 64 or fewer bytes remained */
+# endif
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1	/* raw last chunk is "previous" for the next pass */
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):	/* Match is in the chunk at 8(%ecx): copy the 8 bytes before it, then the tail.  */
+	movlpd	(%ecx), %xmm0	/* one 8-byte move covers all 8 pending bytes */
+	movlpd	%xmm0, (%edx)
+	mov	$8, %esi	/* advance that L(CopyFrom1To16Bytes) adds to %ecx and %edx */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl9):	/* Source is 9 bytes past 16-byte alignment: movaps below needs -9(%ecx) aligned.  */
+	movaps	-9(%ecx), %xmm1	/* aligned chunk that contains the current source byte */
+	movaps	7(%ecx), %xmm2	/* next aligned chunk */
+L(Shl9Start):	/* Also entered from the 64-byte loop below, with xmm1/xmm2 already loaded.  */
+	pcmpeqb	%xmm2, %xmm0	/* NOTE(review): relies on xmm0 being all-zero here; it is set up outside this block */
+	pmovmskb %xmm0, %eax	/* eax = mask of bytes in the chunk at 7(%ecx) that matched */
+	movaps	%xmm2, %xmm3	/* save the raw chunk; palignr overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx	/* strncpy: 16 more bytes charged to the count */
+	jbe	L(StrncpyExit9Case2OrCase3)	/* taken when 16 or fewer bytes remained */
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)	/* match inside this chunk */
+
+	palignr	$9, %xmm1, %xmm2	/* xmm2 = last 7 bytes of xmm1, then first 9 bytes of xmm2 */
+	movaps	%xmm3, %xmm1	/* raw chunk becomes the previous chunk */
+	movaps	%xmm2, (%edx)	/* aligned 16-byte store */
+	movaps	23(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm3, %xmm2	/* xmm3 still holds the raw previous chunk */
+	movaps	%xmm2, (%edx)
+	lea	23(%ecx), %ecx	/* ecx = aligned address just past the last chunk loaded */
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx	/* round the source down to a 64-byte boundary */
+	sub	%ecx, %eax	/* eax = distance moved back by the rounding */
+	lea	-7(%ecx), %ecx	/* bias so that 7(%ecx) is the 64-byte-aligned chunk */
+	sub	%eax, %edx	/* move the destination back by the same distance */
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx	/* and return those bytes to the strncpy count */
+# endif
+	movaps	-9(%ecx), %xmm1	/* chunk preceding the first loop chunk */
+
+L(Shl9LoopStart):	/* Main loop: four aligned chunks (64 bytes) per pass.  */
+	movaps	7(%ecx), %xmm2
+	movaps	23(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	39(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	55(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = byte-wise minimum of all four chunks */
+	pcmpeqb	%xmm0, %xmm7	/* compare against xmm0 (presumed zero, see above) */
+	pmovmskb %xmm7, %eax	/* non-zero iff some minimum byte matched */
+	movaps	%xmm5, %xmm7	/* keep the raw last chunk; palignr rewrites xmm5 */
+	palignr	$9, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$9, %xmm3, %xmm4	/* palignr leaves the flags from test untouched */
+	jnz	L(Shl9Start)	/* redo these chunks 16 bytes at a time */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave9)	/* taken when 64 or fewer bytes remained */
+# endif
+	palignr	$9, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1	/* raw last chunk is "previous" for the next pass */
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl9LoopStart)
+
+L(Shl9LoopExit):	/* Match is in the chunk at 7(%ecx): copy the 7 bytes before it, then the tail.  */
+	movlpd	-1(%ecx), %xmm0	/* 8-byte move starting 1 byte early covers the 7 pending bytes */
+	movlpd	%xmm0, -1(%edx)
+	mov	$7, %esi	/* advance that L(CopyFrom1To16Bytes) adds to %ecx and %edx */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl10):	/* Source is 10 bytes past 16-byte alignment: movaps below needs -10(%ecx) aligned.  */
+	movaps	-10(%ecx), %xmm1	/* aligned chunk that contains the current source byte */
+	movaps	6(%ecx), %xmm2	/* next aligned chunk */
+L(Shl10Start):	/* Also entered from the 64-byte loop below, with xmm1/xmm2 already loaded.  */
+	pcmpeqb	%xmm2, %xmm0	/* NOTE(review): relies on xmm0 being all-zero here; it is set up outside this block */
+	pmovmskb %xmm0, %eax	/* eax = mask of bytes in the chunk at 6(%ecx) that matched */
+	movaps	%xmm2, %xmm3	/* save the raw chunk; palignr overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx	/* strncpy: 16 more bytes charged to the count */
+	jbe	L(StrncpyExit10Case2OrCase3)	/* taken when 16 or fewer bytes remained */
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)	/* match inside this chunk */
+
+	palignr	$10, %xmm1, %xmm2	/* xmm2 = last 6 bytes of xmm1, then first 10 bytes of xmm2 */
+	movaps	%xmm3, %xmm1	/* raw chunk becomes the previous chunk */
+	movaps	%xmm2, (%edx)	/* aligned 16-byte store */
+	movaps	22(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm3, %xmm2	/* xmm3 still holds the raw previous chunk */
+	movaps	%xmm2, (%edx)
+	lea	22(%ecx), %ecx	/* ecx = aligned address just past the last chunk loaded */
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx	/* round the source down to a 64-byte boundary */
+	sub	%ecx, %eax	/* eax = distance moved back by the rounding */
+	lea	-6(%ecx), %ecx	/* bias so that 6(%ecx) is the 64-byte-aligned chunk */
+	sub	%eax, %edx	/* move the destination back by the same distance */
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx	/* and return those bytes to the strncpy count */
+# endif
+	movaps	-10(%ecx), %xmm1	/* chunk preceding the first loop chunk */
+
+L(Shl10LoopStart):	/* Main loop: four aligned chunks (64 bytes) per pass.  */
+	movaps	6(%ecx), %xmm2
+	movaps	22(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	38(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	54(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = byte-wise minimum of all four chunks */
+	pcmpeqb	%xmm0, %xmm7	/* compare against xmm0 (presumed zero, see above) */
+	pmovmskb %xmm7, %eax	/* non-zero iff some minimum byte matched */
+	movaps	%xmm5, %xmm7	/* keep the raw last chunk; palignr rewrites xmm5 */
+	palignr	$10, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$10, %xmm3, %xmm4	/* palignr leaves the flags from test untouched */
+	jnz	L(Shl10Start)	/* redo these chunks 16 bytes at a time */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave10)	/* taken when 64 or fewer bytes remained */
+# endif
+	palignr	$10, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1	/* raw last chunk is "previous" for the next pass */
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl10LoopStart)
+
+L(Shl10LoopExit):	/* Match is in the chunk at 6(%ecx): copy the 6 bytes before it, then the tail.  */
+	movlpd	-2(%ecx), %xmm0	/* 8-byte move starting 2 bytes early covers the 6 pending bytes */
+	movlpd	%xmm0, -2(%edx)
+	mov	$6, %esi	/* advance that L(CopyFrom1To16Bytes) adds to %ecx and %edx */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl11):	/* Source is 11 bytes past 16-byte alignment: movaps below needs -11(%ecx) aligned.  */
+	movaps	-11(%ecx), %xmm1	/* aligned chunk that contains the current source byte */
+	movaps	5(%ecx), %xmm2	/* next aligned chunk */
+L(Shl11Start):	/* Also entered from the 64-byte loop below, with xmm1/xmm2 already loaded.  */
+	pcmpeqb	%xmm2, %xmm0	/* NOTE(review): relies on xmm0 being all-zero here; it is set up outside this block */
+	pmovmskb %xmm0, %eax	/* eax = mask of bytes in the chunk at 5(%ecx) that matched */
+	movaps	%xmm2, %xmm3	/* save the raw chunk; palignr overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx	/* strncpy: 16 more bytes charged to the count */
+	jbe	L(StrncpyExit11Case2OrCase3)	/* taken when 16 or fewer bytes remained */
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)	/* match inside this chunk */
+
+	palignr	$11, %xmm1, %xmm2	/* xmm2 = last 5 bytes of xmm1, then first 11 bytes of xmm2 */
+	movaps	%xmm3, %xmm1	/* raw chunk becomes the previous chunk */
+	movaps	%xmm2, (%edx)	/* aligned 16-byte store */
+	movaps	21(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm3, %xmm2	/* xmm3 still holds the raw previous chunk */
+	movaps	%xmm2, (%edx)
+	lea	21(%ecx), %ecx	/* ecx = aligned address just past the last chunk loaded */
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx	/* round the source down to a 64-byte boundary */
+	sub	%ecx, %eax	/* eax = distance moved back by the rounding */
+	lea	-5(%ecx), %ecx	/* bias so that 5(%ecx) is the 64-byte-aligned chunk */
+	sub	%eax, %edx	/* move the destination back by the same distance */
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx	/* and return those bytes to the strncpy count */
+# endif
+	movaps	-11(%ecx), %xmm1	/* chunk preceding the first loop chunk */
+
+L(Shl11LoopStart):	/* Main loop: four aligned chunks (64 bytes) per pass.  */
+	movaps	5(%ecx), %xmm2
+	movaps	21(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	37(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	53(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = byte-wise minimum of all four chunks */
+	pcmpeqb	%xmm0, %xmm7	/* compare against xmm0 (presumed zero, see above) */
+	pmovmskb %xmm7, %eax	/* non-zero iff some minimum byte matched */
+	movaps	%xmm5, %xmm7	/* keep the raw last chunk; palignr rewrites xmm5 */
+	palignr	$11, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$11, %xmm3, %xmm4	/* palignr leaves the flags from test untouched */
+	jnz	L(Shl11Start)	/* redo these chunks 16 bytes at a time */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave11)	/* taken when 64 or fewer bytes remained */
+# endif
+	palignr	$11, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1	/* raw last chunk is "previous" for the next pass */
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl11LoopStart)
+
+L(Shl11LoopExit):	/* Match is in the chunk at 5(%ecx): copy the 5 bytes before it, then the tail.  */
+	movlpd	-3(%ecx), %xmm0	/* 8-byte move starting 3 bytes early covers the 5 pending bytes */
+	movlpd	%xmm0, -3(%edx)
+	mov	$5, %esi	/* advance that L(CopyFrom1To16Bytes) adds to %ecx and %edx */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl12):	/* Source is 12 bytes past 16-byte alignment: movaps below needs -12(%ecx) aligned.  */
+	movaps	-12(%ecx), %xmm1	/* aligned chunk that contains the current source byte */
+	movaps	4(%ecx), %xmm2	/* next aligned chunk */
+L(Shl12Start):	/* Also entered from the 64-byte loop below, with xmm1/xmm2 already loaded.  */
+	pcmpeqb	%xmm2, %xmm0	/* NOTE(review): relies on xmm0 being all-zero here; it is set up outside this block */
+	pmovmskb %xmm0, %eax	/* eax = mask of bytes in the chunk at 4(%ecx) that matched */
+	movaps	%xmm2, %xmm3	/* save the raw chunk; palignr overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx	/* strncpy: 16 more bytes charged to the count */
+	jbe	L(StrncpyExit12Case2OrCase3)	/* taken when 16 or fewer bytes remained */
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)	/* match inside this chunk */
+
+	palignr	$12, %xmm1, %xmm2	/* xmm2 = last 4 bytes of xmm1, then first 12 bytes of xmm2 */
+	movaps	%xmm3, %xmm1	/* raw chunk becomes the previous chunk */
+	movaps	%xmm2, (%edx)	/* aligned 16-byte store */
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2	/* xmm3 still holds the raw previous chunk */
+	movaps	%xmm2, (%edx)
+	lea	20(%ecx), %ecx	/* ecx = aligned address just past the last chunk loaded */
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx	/* round the source down to a 64-byte boundary */
+	sub	%ecx, %eax	/* eax = distance moved back by the rounding */
+	lea	-4(%ecx), %ecx	/* bias so that 4(%ecx) is the 64-byte-aligned chunk */
+	sub	%eax, %edx	/* move the destination back by the same distance */
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx	/* and return those bytes to the strncpy count */
+# endif
+	movaps	-12(%ecx), %xmm1	/* chunk preceding the first loop chunk */
+
+L(Shl12LoopStart):	/* Main loop: four aligned chunks (64 bytes) per pass.  */
+	movaps	4(%ecx), %xmm2
+	movaps	20(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = byte-wise minimum of all four chunks */
+	pcmpeqb	%xmm0, %xmm7	/* compare against xmm0 (presumed zero, see above) */
+	pmovmskb %xmm7, %eax	/* non-zero iff some minimum byte matched */
+	movaps	%xmm5, %xmm7	/* keep the raw last chunk; palignr rewrites xmm5 */
+	palignr	$12, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$12, %xmm3, %xmm4	/* palignr leaves the flags from test untouched */
+	jnz	L(Shl12Start)	/* redo these chunks 16 bytes at a time */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave12)	/* taken when 64 or fewer bytes remained */
+# endif
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1	/* raw last chunk is "previous" for the next pass */
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):	/* Match is in the chunk at 4(%ecx): copy the 4 bytes before it, then the tail.  */
+	movl	(%ecx), %esi	/* one 4-byte move covers all 4 pending bytes; esi is used as scratch */
+	movl	%esi, (%edx)
+	mov	$4, %esi	/* advance that L(CopyFrom1To16Bytes) adds to %ecx and %edx */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl13):	/* Source is 13 bytes past 16-byte alignment: movaps below needs -13(%ecx) aligned.  */
+	movaps	-13(%ecx), %xmm1	/* aligned chunk that contains the current source byte */
+	movaps	3(%ecx), %xmm2	/* next aligned chunk */
+L(Shl13Start):	/* Also entered from the 64-byte loop below, with xmm1/xmm2 already loaded.  */
+	pcmpeqb	%xmm2, %xmm0	/* NOTE(review): relies on xmm0 being all-zero here; it is set up outside this block */
+	pmovmskb %xmm0, %eax	/* eax = mask of bytes in the chunk at 3(%ecx) that matched */
+	movaps	%xmm2, %xmm3	/* save the raw chunk; palignr overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx	/* strncpy: 16 more bytes charged to the count */
+	jbe	L(StrncpyExit13Case2OrCase3)	/* taken when 16 or fewer bytes remained */
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)	/* match inside this chunk */
+
+	palignr	$13, %xmm1, %xmm2	/* xmm2 = last 3 bytes of xmm1, then first 13 bytes of xmm2 */
+	movaps	%xmm3, %xmm1	/* raw chunk becomes the previous chunk */
+	movaps	%xmm2, (%edx)	/* aligned 16-byte store */
+	movaps	19(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm3, %xmm2	/* xmm3 still holds the raw previous chunk */
+	movaps	%xmm2, (%edx)
+	lea	19(%ecx), %ecx	/* ecx = aligned address just past the last chunk loaded */
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx	/* round the source down to a 64-byte boundary */
+	sub	%ecx, %eax	/* eax = distance moved back by the rounding */
+	lea	-3(%ecx), %ecx	/* bias so that 3(%ecx) is the 64-byte-aligned chunk */
+	sub	%eax, %edx	/* move the destination back by the same distance */
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx	/* and return those bytes to the strncpy count */
+# endif
+	movaps	-13(%ecx), %xmm1	/* chunk preceding the first loop chunk */
+
+L(Shl13LoopStart):	/* Main loop: four aligned chunks (64 bytes) per pass.  */
+	movaps	3(%ecx), %xmm2
+	movaps	19(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	35(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	51(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = byte-wise minimum of all four chunks */
+	pcmpeqb	%xmm0, %xmm7	/* compare against xmm0 (presumed zero, see above) */
+	pmovmskb %xmm7, %eax	/* non-zero iff some minimum byte matched */
+	movaps	%xmm5, %xmm7	/* keep the raw last chunk; palignr rewrites xmm5 */
+	palignr	$13, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$13, %xmm3, %xmm4	/* palignr leaves the flags from test untouched */
+	jnz	L(Shl13Start)	/* redo these chunks 16 bytes at a time */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave13)	/* taken when 64 or fewer bytes remained */
+# endif
+	palignr	$13, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1	/* raw last chunk is "previous" for the next pass */
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl13LoopStart)
+
+L(Shl13LoopExit):	/* Match is in the chunk at 3(%ecx): copy the 3 bytes before it, then the tail.  */
+	movl	-1(%ecx), %esi	/* 4-byte move starting 1 byte early covers the 3 pending bytes */
+	movl	%esi, -1(%edx)
+	mov	$3, %esi	/* advance that L(CopyFrom1To16Bytes) adds to %ecx and %edx */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl14):	/* Source is 14 bytes past 16-byte alignment: movaps below needs -14(%ecx) aligned.  */
+	movaps	-14(%ecx), %xmm1	/* aligned chunk that contains the current source byte */
+	movaps	2(%ecx), %xmm2	/* next aligned chunk */
+L(Shl14Start):	/* Also entered from the 64-byte loop below, with xmm1/xmm2 already loaded.  */
+	pcmpeqb	%xmm2, %xmm0	/* NOTE(review): relies on xmm0 being all-zero here; it is set up outside this block */
+	pmovmskb %xmm0, %eax	/* eax = mask of bytes in the chunk at 2(%ecx) that matched */
+	movaps	%xmm2, %xmm3	/* save the raw chunk; palignr overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx	/* strncpy: 16 more bytes charged to the count */
+	jbe	L(StrncpyExit14Case2OrCase3)	/* taken when 16 or fewer bytes remained */
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)	/* match inside this chunk */
+
+	palignr	$14, %xmm1, %xmm2	/* xmm2 = last 2 bytes of xmm1, then first 14 bytes of xmm2 */
+	movaps	%xmm3, %xmm1	/* raw chunk becomes the previous chunk */
+	movaps	%xmm2, (%edx)	/* aligned 16-byte store */
+	movaps	18(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm3, %xmm2	/* xmm3 still holds the raw previous chunk */
+	movaps	%xmm2, (%edx)
+	lea	18(%ecx), %ecx	/* ecx = aligned address just past the last chunk loaded */
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx	/* round the source down to a 64-byte boundary */
+	sub	%ecx, %eax	/* eax = distance moved back by the rounding */
+	lea	-2(%ecx), %ecx	/* bias so that 2(%ecx) is the 64-byte-aligned chunk */
+	sub	%eax, %edx	/* move the destination back by the same distance */
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx	/* and return those bytes to the strncpy count */
+# endif
+	movaps	-14(%ecx), %xmm1	/* chunk preceding the first loop chunk */
+
+L(Shl14LoopStart):	/* Main loop: four aligned chunks (64 bytes) per pass.  */
+	movaps	2(%ecx), %xmm2
+	movaps	18(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	34(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	50(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = byte-wise minimum of all four chunks */
+	pcmpeqb	%xmm0, %xmm7	/* compare against xmm0 (presumed zero, see above) */
+	pmovmskb %xmm7, %eax	/* non-zero iff some minimum byte matched */
+	movaps	%xmm5, %xmm7	/* keep the raw last chunk; palignr rewrites xmm5 */
+	palignr	$14, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$14, %xmm3, %xmm4	/* palignr leaves the flags from test untouched */
+	jnz	L(Shl14Start)	/* redo these chunks 16 bytes at a time */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave14)	/* taken when 64 or fewer bytes remained */
+# endif
+	palignr	$14, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1	/* raw last chunk is "previous" for the next pass */
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl14LoopStart)
+
+L(Shl14LoopExit):	/* Match is in the chunk at 2(%ecx): copy the 2 bytes before it, then the tail.  */
+	movl	-2(%ecx), %esi	/* 4-byte move starting 2 bytes early covers the 2 pending bytes */
+	movl	%esi, -2(%edx)
+	mov	$2, %esi	/* advance that L(CopyFrom1To16Bytes) adds to %ecx and %edx */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl15):	/* Source is 15 bytes past 16-byte alignment: movaps below needs -15(%ecx) aligned.  */
+	movaps	-15(%ecx), %xmm1	/* aligned chunk that contains the current source byte */
+	movaps	1(%ecx), %xmm2	/* next aligned chunk */
+L(Shl15Start):	/* Also entered from the 64-byte loop below, with xmm1/xmm2 already loaded.  */
+	pcmpeqb	%xmm2, %xmm0	/* NOTE(review): relies on xmm0 being all-zero here; it is set up outside this block */
+	pmovmskb %xmm0, %eax	/* eax = mask of bytes in the chunk at 1(%ecx) that matched */
+	movaps	%xmm2, %xmm3	/* save the raw chunk; palignr overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx	/* strncpy: 16 more bytes charged to the count */
+	jbe	L(StrncpyExit15Case2OrCase3)	/* taken when 16 or fewer bytes remained */
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)	/* match inside this chunk */
+
+	palignr	$15, %xmm1, %xmm2	/* xmm2 = last byte of xmm1, then first 15 bytes of xmm2 */
+	movaps	%xmm3, %xmm1	/* raw chunk becomes the previous chunk */
+	movaps	%xmm2, (%edx)	/* aligned 16-byte store */
+	movaps	17(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm3, %xmm2	/* xmm3 still holds the raw previous chunk */
+	movaps	%xmm2, (%edx)
+	lea	17(%ecx), %ecx	/* ecx = aligned address just past the last chunk loaded */
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx	/* round the source down to a 64-byte boundary */
+	sub	%ecx, %eax	/* eax = distance moved back by the rounding */
+	lea	-1(%ecx), %ecx	/* bias so that 1(%ecx) is the 64-byte-aligned chunk */
+	sub	%eax, %edx	/* move the destination back by the same distance */
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx	/* and return those bytes to the strncpy count */
+# endif
+	movaps	-15(%ecx), %xmm1	/* chunk preceding the first loop chunk */
+
+L(Shl15LoopStart):	/* Main loop: four aligned chunks (64 bytes) per pass.  */
+	movaps	1(%ecx), %xmm2
+	movaps	17(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	33(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	49(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = byte-wise minimum of all four chunks */
+	pcmpeqb	%xmm0, %xmm7	/* compare against xmm0 (presumed zero, see above) */
+	pmovmskb %xmm7, %eax	/* non-zero iff some minimum byte matched */
+	movaps	%xmm5, %xmm7	/* keep the raw last chunk; palignr rewrites xmm5 */
+	palignr	$15, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$15, %xmm3, %xmm4	/* palignr leaves the flags from test untouched */
+	jnz	L(Shl15Start)	/* redo these chunks 16 bytes at a time */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave15)	/* taken when 64 or fewer bytes remained */
+# endif
+	palignr	$15, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1	/* raw last chunk is "previous" for the next pass */
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl15LoopStart)
+
+L(Shl15LoopExit):	/* Match is in the chunk at 1(%ecx): copy the 1 byte before it, then the tail.  */
+	movl	-3(%ecx), %esi	/* 4-byte move starting 3 bytes early covers the 1 pending byte */
+	movl	%esi, -3(%edx)
+	mov	$1, %esi	/* advance that L(CopyFrom1To16Bytes) adds to %ecx and %edx */
+# ifdef USE_AS_STRCAT
+	jmp	L(CopyFrom1To16Bytes)	/* without USE_AS_STRCAT, execution falls through to the code that follows */
+# endif
+
+
+# ifndef USE_AS_STRCAT
+
+	.p2align 4
+L(CopyFrom1To16Bytes):	/* Tail copy: eax = pmovmskb mask for the chunk, esi = bytes to skip first.  */
+#  ifdef USE_AS_STRNCPY
+	add	$16, %ebx	/* add back the 16 that the L(ShlN) callers subtracted before jumping here */
+#  endif
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)	/* macro defined outside this chunk; presumably restores the caller's esi */
+	test	%al, %al
+	jz	L(ExitHigh8)	/* no mask bit set for bytes 0..7 */
+
+L(CopyFrom1To16BytesLess8):	/* First set mask bit is in al (bytes 0..7).  */
+	mov	%al, %ah
+	and	$15, %ah	/* ah is scratch here: isolates mask bits 0..3 */
+	jz	L(ExitHigh4)	/* first set bit is among bits 4..7 */
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)	/* otherwise bit 3 is the first set bit: fall into L(Exit4) */
+
+	.p2align 4
+L(Exit4):	/* Copy exactly 4 bytes from (%ecx) to (%edx) and return.  */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	SAVE_RESULT	(3)	/* macro defined outside this chunk; presumably loads the return value into %eax */
+#  ifdef USE_AS_STRNCPY
+	sub	$4, %ebx	/* count left after these 4 bytes */
+	lea	4(%edx), %ecx	/* first unwritten destination byte; lea keeps the flags from sub */
+	jnz	L(StrncpyFillTailWithZero1)	/* count not used up: zero-fill the rest */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is zero */
+	sbb	$-1, %eax	/* eax += 1 unless that byte is zero */
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(ExitHigh4):	/* Mask bits 0..3 are clear: pick the exit from bits 4..7 of al.  */
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)	/* none of bits 4..6: fall into L(Exit8) */
+
+	.p2align 4
+L(Exit8):	/* Copy exactly 8 bytes from (%ecx) to (%edx) and return.  */
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	SAVE_RESULT	(7)	/* macro defined outside this chunk; presumably loads the return value into %eax */
+#  ifdef USE_AS_STRNCPY
+	sub	$8, %ebx	/* count left after these 8 bytes */
+	lea	8(%edx), %ecx	/* first unwritten destination byte; lea keeps the flags from sub */
+	jnz	L(StrncpyFillTailWithZero1)	/* count not used up: zero-fill the rest */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is zero */
+	sbb	$-1, %eax	/* eax += 1 unless that byte is zero */
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(ExitHigh8):	/* al was zero on entry: pick the exit from ah (mask bits for bytes 8..15).  */
+	mov	%ah, %al
+	and	$15, %al	/* al is scratch here: isolates mask bits 8..11 */
+	jz	L(ExitHigh12)	/* first set bit is among bits 12..15 */
+
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)	/* otherwise bit 11 is the first set bit: fall into L(Exit12) */
+
+	.p2align 4
+L(Exit12):	/* Copy exactly 12 bytes from (%ecx) to (%edx) and return.  */
+	movlpd	(%ecx), %xmm0	/* bytes 0..7 */
+	movl	8(%ecx), %eax	/* bytes 8..11 */
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT	(11)	/* macro defined outside this chunk; presumably loads the return value into %eax */
+#  ifdef USE_AS_STRNCPY
+	sub	$12, %ebx	/* count left after these 12 bytes */
+	lea	12(%edx), %ecx	/* first unwritten destination byte; lea keeps the flags from sub */
+	jnz	L(StrncpyFillTailWithZero1)	/* count not used up: zero-fill the rest */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is zero */
+	sbb	$-1, %eax	/* eax += 1 unless that byte is zero */
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(ExitHigh12):	/* Mask bits 8..11 are clear: pick the exit from bits 4..6 of ah (bytes 12..14).  */
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)	/* none of those: fall into L(Exit16) */
+
+	.p2align 4
+L(Exit16):	/* Copy exactly 16 bytes from (%ecx) to (%edx) and return.  */
+	movdqu	(%ecx), %xmm0	/* unaligned 16-byte load and store */
+	movdqu	%xmm0, (%edx)
+	SAVE_RESULT	(15)	/* macro defined outside this chunk; presumably loads the return value into %eax */
+#  ifdef USE_AS_STRNCPY
+	sub	$16, %ebx	/* count left after these 16 bytes */
+	lea	16(%edx), %ecx	/* first unwritten destination byte; lea keeps the flags from sub */
+	jnz	L(StrncpyFillTailWithZero1)	/* count not used up: zero-fill the rest */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is zero */
+	sbb	$-1, %eax	/* eax += 1 unless that byte is zero */
+#   endif
+#  endif
+	RETURN1
+
+#   ifdef USE_AS_STRNCPY
+
+	CFI_PUSH(%esi)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):	/* strncpy tail with a non-zero mask in eax: stop at the first mask bit or at the count in ebx, whichever is first.  */
+	add	$16, %ebx	/* NOTE(review): presumably undoes a "sub $16, %ebx" done by the caller — confirm */
+	add	%esi, %ecx
+	add	%esi, %edx
+
+	POP	(%esi)	/* macro defined outside this chunk; presumably restores the caller's esi */
+
+	test	%al, %al
+	jz	L(ExitHighCase2)	/* no mask bit set for bytes 0..7 */
+
+	cmp	$8, %ebx
+	ja	L(CopyFrom1To16BytesLess8)	/* more than 8 bytes allowed and a bit is set in al: decide on the mask alone */
+
+	test	$0x01, %al	/* below: for n = 1..7, exit with n bytes if mask bit n-1 is set or ebx == n */
+	jnz	L(Exit1)
+	cmp	$1, %ebx
+	je	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$2, %ebx
+	je	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$3, %ebx
+	je	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$4, %ebx
+	je	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$5, %ebx
+	je	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$6, %ebx
+	je	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	cmp	$7, %ebx
+	je	L(Exit7)
+	jmp	L(Exit8)	/* nothing matched for 1..7: copy 8 bytes */
+
+	.p2align 4
+L(ExitHighCase2):	/* Case2 with al == 0: decide between 9..16 bytes from ah and the count in ebx.  */
+	cmp	$8, %ebx
+	jbe	L(CopyFrom1To16BytesLess8Case3)	/* at most 8 bytes allowed and no mask bit in bytes 0..7: count-only copy */
+
+	test	$0x01, %ah	/* below: for n = 9..15, exit with n bytes if mask bit n-1 is set or ebx == n */
+	jnz	L(Exit9)
+	cmp	$9, %ebx
+	je	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$10, %ebx
+	je	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$11, %ebx
+	je	L(Exit11)
+	test	$0x8, %ah
+	jnz	L(Exit12)
+	cmp	$12, %ebx
+	je	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$13, %ebx
+	je	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$14, %ebx
+	je	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	cmp	$15, %ebx
+	je	L(Exit15)
+	jmp	L(Exit16)	/* nothing matched for 9..15: copy 16 bytes */
+
+	CFI_PUSH(%esi)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):	/* strncpy tail dispatcher: Case2 if the mask in eax is non-zero, else Case3.  */
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase3):	/* By fall-through eax == 0: the copy length comes from the count in ebx alone.  */
+	add	$16, %ebx	/* NOTE(review): presumably undoes a "sub $16, %ebx" done by the caller — confirm */
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)	/* macro defined outside this chunk; presumably restores the caller's esi */
+
+	cmp	$8, %ebx
+	ja	L(ExitHigh8Case3)
+
+L(CopyFrom1To16BytesLess8Case3):	/* ebx <= 8 here (also jumped to from L(ExitHighCase2)).  */
+	cmp	$4, %ebx
+	ja	L(ExitHigh4Case3)
+
+	cmp	$1, %ebx
+	je	L(Exit1)
+	cmp	$2, %ebx
+	je	L(Exit2)
+	cmp	$3, %ebx
+	je	L(Exit3)
+	movl	(%ecx), %eax	/* remaining case: copy 4 bytes inline */
+	movl	%eax, (%edx)
+	SAVE_RESULT	(4)	/* argument is 4 here, whereas L(Exit4) passes 3; macro defined outside this chunk */
+	RETURN1
+
+	.p2align 4
+L(ExitHigh4Case3):	/* Count-only copy with ebx in 5..8 (entered when ebx > 4 from the <= 8 path).  */
+	cmp	$5, %ebx
+	je	L(Exit5)
+	cmp	$6, %ebx
+	je	L(Exit6)
+	cmp	$7, %ebx
+	je	L(Exit7)
+	movlpd	(%ecx), %xmm0	/* remaining case: copy 8 bytes inline */
+	movlpd	%xmm0, (%edx)
+	SAVE_RESULT	(8)	/* argument is 8 here, whereas L(Exit8) passes 7; macro defined outside this chunk */
+	RETURN1
+
+	.p2align 4
+L(ExitHigh8Case3):	/* Count-only copy with ebx > 8.  */
+	cmp	$12, %ebx
+	ja	L(ExitHigh12Case3)
+
+	cmp	$9, %ebx
+	je	L(Exit9)
+	cmp	$10, %ebx
+	je	L(Exit10)
+	cmp	$11, %ebx
+	je	L(Exit11)
+	movlpd	(%ecx), %xmm0	/* remaining case: copy 12 bytes inline (8 + 4) */
+	movl	8(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT	(12)	/* argument is 12 here, whereas L(Exit12) passes 11; macro defined outside this chunk */
+	RETURN1
+
+	.p2align 4
+L(ExitHigh12Case3):	/* Count-only copy with ebx > 12.  */
+	cmp	$13, %ebx
+	je	L(Exit13)
+	cmp	$14, %ebx
+	je	L(Exit14)
+	cmp	$15, %ebx
+	je	L(Exit15)
+	movlpd	(%ecx), %xmm0	/* remaining case: copy 16 bytes inline as two 8-byte halves */
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+	SAVE_RESULT	(16)	/* argument is 16 here, whereas L(Exit16) passes 15; macro defined outside this chunk */
+	RETURN1
+
+#  endif
+
+	.p2align 4
+L(Exit1):	/* Copy exactly 1 byte from (%ecx) to (%edx) and return.  */
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	SAVE_RESULT	(0)	/* macro defined outside this chunk; presumably loads the return value into %eax */
+#  ifdef USE_AS_STRNCPY
+	sub	$1, %ebx	/* count left after this byte */
+	lea	1(%edx), %ecx	/* first unwritten destination byte; lea keeps the flags from sub */
+	jnz	L(StrncpyFillTailWithZero1)	/* count not used up: zero-fill the rest */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is zero */
+	sbb	$-1, %eax	/* eax += 1 unless that byte is zero */
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit2):	/* Copy exactly 2 bytes from (%ecx) to (%edx) and return.  */
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	SAVE_RESULT	(1)	/* macro defined outside this chunk; presumably loads the return value into %eax */
+#  ifdef USE_AS_STRNCPY
+	sub	$2, %ebx	/* count left after these 2 bytes */
+	lea	2(%edx), %ecx	/* first unwritten destination byte; lea keeps the flags from sub */
+	jnz	L(StrncpyFillTailWithZero1)	/* count not used up: zero-fill the rest */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is zero */
+	sbb	$-1, %eax	/* eax += 1 unless that byte is zero */
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit3):	/* Copy exactly 3 bytes from (%ecx) to (%edx) and return.  */
+	movw	(%ecx), %ax	/* bytes 0..1 */
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al	/* byte 2 */
+	movb	%al, 2(%edx)
+	SAVE_RESULT	(2)	/* macro defined outside this chunk; presumably loads the return value into %eax */
+#  ifdef USE_AS_STRNCPY
+	sub	$3, %ebx	/* count left after these 3 bytes */
+	lea	3(%edx), %ecx	/* first unwritten destination byte; lea keeps the flags from sub */
+	jnz	L(StrncpyFillTailWithZero1)	/* count not used up: zero-fill the rest */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is zero */
+	sbb	$-1, %eax	/* eax += 1 unless that byte is zero */
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit5):	/* Copy exactly 5 bytes from (%ecx) to (%edx) and return.  */
+	movl	(%ecx), %eax	/* bytes 0..3 */
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al	/* byte 4 */
+	movb	%al, 4(%edx)
+	SAVE_RESULT	(4)	/* macro defined outside this chunk; presumably loads the return value into %eax */
+#  ifdef USE_AS_STRNCPY
+	sub	$5, %ebx	/* count left after these 5 bytes */
+	lea	5(%edx), %ecx	/* first unwritten destination byte; lea keeps the flags from sub */
+	jnz	L(StrncpyFillTailWithZero1)	/* count not used up: zero-fill the rest */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is zero */
+	sbb	$-1, %eax	/* eax += 1 unless that byte is zero */
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit6):	/* Copy exactly 6 bytes from (%ecx) to (%edx) and return.  */
+	movl	(%ecx), %eax	/* bytes 0..3 */
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax	/* bytes 4..5 */
+	movw	%ax, 4(%edx)
+	SAVE_RESULT	(5)	/* macro defined outside this chunk; presumably loads the return value into %eax */
+#  ifdef USE_AS_STRNCPY
+	sub	$6, %ebx	/* count left after these 6 bytes */
+	lea	6(%edx), %ecx	/* first unwritten destination byte; lea keeps the flags from sub */
+	jnz	L(StrncpyFillTailWithZero1)	/* count not used up: zero-fill the rest */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is zero */
+	sbb	$-1, %eax	/* eax += 1 unless that byte is zero */
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit7):	/* Copy exactly 7 bytes from (%ecx) to (%edx) and return.  */
+	movl	(%ecx), %eax	/* bytes 0..3 */
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax	/* bytes 3..6 (overlaps byte 3) */
+	movl	%eax, 3(%edx)
+	SAVE_RESULT	(6)	/* macro defined outside this chunk; presumably loads the return value into %eax */
+#  ifdef USE_AS_STRNCPY
+	sub	$7, %ebx	/* count left after these 7 bytes */
+	lea	7(%edx), %ecx	/* first unwritten destination byte; lea keeps the flags from sub */
+	jnz	L(StrncpyFillTailWithZero1)	/* count not used up: zero-fill the rest */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is zero */
+	sbb	$-1, %eax	/* eax += 1 unless that byte is zero */
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit9):	/* Copy exactly 9 bytes from (%ecx) to (%edx) and return.  */
+	movlpd	(%ecx), %xmm0	/* bytes 0..7 */
+	movb	8(%ecx), %al	/* byte 8 */
+	movlpd	%xmm0, (%edx)
+	movb	%al, 8(%edx)
+	SAVE_RESULT	(8)	/* macro defined outside this chunk; presumably loads the return value into %eax */
+#  ifdef USE_AS_STRNCPY
+	sub	$9, %ebx	/* count left after these 9 bytes */
+	lea	9(%edx), %ecx	/* first unwritten destination byte; lea keeps the flags from sub */
+	jnz	L(StrncpyFillTailWithZero1)	/* count not used up: zero-fill the rest */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is zero */
+	sbb	$-1, %eax	/* eax += 1 unless that byte is zero */
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit10):	/* Copy exactly 10 bytes from (%ecx) to (%edx) and return.  */
+	movlpd	(%ecx), %xmm0	/* bytes 0..7 */
+	movw	8(%ecx), %ax	/* bytes 8..9 */
+	movlpd	%xmm0, (%edx)
+	movw	%ax, 8(%edx)
+	SAVE_RESULT	(9)	/* macro defined outside this chunk; presumably loads the return value into %eax */
+#  ifdef USE_AS_STRNCPY
+	sub	$10, %ebx	/* count left after these 10 bytes */
+	lea	10(%edx), %ecx	/* first unwritten destination byte; lea keeps the flags from sub */
+	jnz	L(StrncpyFillTailWithZero1)	/* count not used up: zero-fill the rest */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is zero */
+	sbb	$-1, %eax	/* eax += 1 unless that byte is zero */
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit11):	/* Copy exactly 11 bytes from (%ecx) to (%edx) and return.  */
+	movlpd	(%ecx), %xmm0	/* bytes 0..7 */
+	movl	7(%ecx), %eax	/* bytes 7..10 (overlaps byte 7) */
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 7(%edx)
+	SAVE_RESULT	(10)	/* macro defined outside this chunk; presumably loads the return value into %eax */
+#  ifdef USE_AS_STRNCPY
+	sub	$11, %ebx	/* count left after these 11 bytes */
+	lea	11(%edx), %ecx	/* first unwritten destination byte; lea keeps the flags from sub */
+	jnz	L(StrncpyFillTailWithZero1)	/* count not used up: zero-fill the rest */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is zero */
+	sbb	$-1, %eax	/* eax += 1 unless that byte is zero */
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit13):	/* Copy exactly 13 bytes from (%ecx) to (%edx) and return.  */
+	movlpd	(%ecx), %xmm0	/* bytes 0..7 */
+	movlpd	5(%ecx), %xmm1	/* bytes 5..12 (overlapping) */
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	SAVE_RESULT	(12)	/* macro defined outside this chunk; presumably loads the return value into %eax */
+#  ifdef USE_AS_STRNCPY
+	sub	$13, %ebx	/* count left after these 13 bytes */
+	lea	13(%edx), %ecx	/* first unwritten destination byte; lea keeps the flags from sub */
+	jnz	L(StrncpyFillTailWithZero1)	/* count not used up: zero-fill the rest */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is zero */
+	sbb	$-1, %eax	/* eax += 1 unless that byte is zero */
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit14):	/* Copy exactly 14 bytes from (%ecx) to (%edx) and return.  */
+	movlpd	(%ecx), %xmm0	/* bytes 0..7 */
+	movlpd	6(%ecx), %xmm1	/* bytes 6..13 (overlapping) */
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	SAVE_RESULT	(13)	/* macro defined outside this chunk; presumably loads the return value into %eax */
+#  ifdef USE_AS_STRNCPY
+	sub	$14, %ebx	/* count left after these 14 bytes */
+	lea	14(%edx), %ecx	/* first unwritten destination byte; lea keeps the flags from sub */
+	jnz	L(StrncpyFillTailWithZero1)	/* count not used up: zero-fill the rest */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is zero */
+	sbb	$-1, %eax	/* eax += 1 unless that byte is zero */
+#   endif
+#  endif
+	RETURN1
+
+	.p2align 4
+L(Exit15):	/* Copy exactly 15 bytes from (%ecx) to (%edx) and return.  */
+	movlpd	(%ecx), %xmm0	/* bytes 0..7 */
+	movlpd	7(%ecx), %xmm1	/* bytes 7..14 (overlaps byte 7) */
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+	SAVE_RESULT	(14)	/* macro defined outside this chunk; presumably loads the return value into %eax */
+#  ifdef USE_AS_STRNCPY
+	sub	$15, %ebx	/* count left after these 15 bytes */
+	lea	15(%edx), %ecx	/* first unwritten destination byte; lea keeps the flags from sub */
+	jnz	L(StrncpyFillTailWithZero1)	/* count not used up: zero-fill the rest */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is zero */
+	sbb	$-1, %eax	/* eax += 1 unless that byte is zero */
+#   endif
+#  endif
+	RETURN1
+
+CFI_POP	(%edi)
+
+#  ifdef USE_AS_STRNCPY
+	.p2align 4
+L(Fill0):	/* L(FillN): store N bytes at (%ecx) from edx/xmm0, which L(StrncpyFillTailWithZero) clears before dispatching here.  */
+	RETURN
+
+	.p2align 4
+L(Fill1):
+	movb	%dl, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill2):
+	movw	%dx, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill3):
+	movw	%dx, (%ecx)
+	movb	%dl, 2(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill4):
+	movl	%edx, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill5):
+	movl	%edx, (%ecx)
+	movb	%dl, 4(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill6):
+	movl	%edx, (%ecx)
+	movw	%dx, 4(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill7):
+	movl	%edx, (%ecx)
+	movl	%edx, 3(%ecx)	/* bytes 3..6; overlaps byte 3 of the first store */
+	RETURN
+
+	.p2align 4
+L(Fill8):
+	movlpd	%xmm0, (%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill9):
+	movlpd	%xmm0, (%ecx)
+	movb	%dl, 8(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill10):
+	movlpd	%xmm0, (%ecx)
+	movw	%dx, 8(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill11):
+	movlpd	%xmm0, (%ecx)
+	movl	%edx, 7(%ecx)	/* bytes 7..10; overlaps byte 7 of the first store */
+	RETURN
+
+	.p2align 4
+L(Fill12):
+	movlpd	%xmm0, (%ecx)
+	movl	%edx, 8(%ecx)
+	RETURN
+
+	.p2align 4
+L(Fill13):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 5(%ecx)	/* bytes 5..12; overlapping 8-byte store */
+	RETURN
+
+	.p2align 4
+L(Fill14):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 6(%ecx)	/* bytes 6..13; overlapping 8-byte store */
+	RETURN
+
+	.p2align 4
+L(Fill15):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 7(%ecx)	/* bytes 7..14; overlapping 8-byte store */
+	RETURN
+
+	.p2align 4
+L(Fill16):
+	movlpd	%xmm0, (%ecx)
+	movlpd	%xmm0, 8(%ecx)
+	RETURN
+
+	.p2align 4
+L(StrncpyFillExit1):	/* Entered after a "sub $16, %ebx" went to or below zero.  */
+	lea	16(%ebx), %ebx	/* Undo that subtraction so %ebx is the true count left.  */
+L(FillFrom1To16Bytes):	/* Branch to L(Fill<%ebx>); the targets cover counts 0 through 16.  */
+	test	%ebx, %ebx
+	jz	L(Fill0)
+	cmp	$16, %ebx
+	je	L(Fill16)
+	cmp	$8, %ebx
+	je	L(Fill8)
+	jg	L(FillMore8)
+	cmp	$4, %ebx	/* Here %ebx < 8 and non-zero.  */
+	je	L(Fill4)
+	jg	L(FillMore4)
+	cmp	$2, %ebx	/* Here %ebx < 4 and non-zero.  */
+	jl	L(Fill1)
+	je	L(Fill2)
+	jg	L(Fill3)
+L(FillMore8):	/* More than 8 but less than 16.  */
+	cmp	$12, %ebx
+	je	L(Fill12)
+	jl	L(FillLess12)
+	cmp	$14, %ebx	/* Here %ebx is 13, 14 or 15.  */
+	jl	L(Fill13)
+	je	L(Fill14)
+	jg	L(Fill15)
+L(FillMore4):	/* More than 4 but less than 8.  */
+	cmp	$6, %ebx
+	jl	L(Fill5)
+	je	L(Fill6)
+	jg	L(Fill7)
+L(FillLess12):	/* Less than 12 but more than 8.  */
+	cmp	$10, %ebx
+	jl	L(Fill9)
+	je	L(Fill10)
+	jmp	L(Fill11)
+
+	CFI_PUSH(%edi)
+
+	.p2align 4
+L(StrncpyFillTailWithZero1):	/* Entry taken from the L(ExitN) paths: pop %edi, then fall through.  */
+	POP	(%edi)
+L(StrncpyFillTailWithZero):	/* Store %ebx zero bytes starting at (%ecx).  */
+	pxor	%xmm0, %xmm0	/* %xmm0 = 0, source for the 8/16-byte stores.  */
+	xor	%edx, %edx	/* %edx = 0, source for the 1/2/4-byte stores.  */
+	sub	$16, %ebx
+	jbe	L(StrncpyFillExit1)	/* 16 bytes or fewer: finish in the L(FillN) leaves.  */
+
+	movlpd	%xmm0, (%ecx)	/* Zero the first 16 bytes with unaligned 8-byte stores.  */
+	movlpd	%xmm0, 8(%ecx)
+
+	lea	16(%ecx), %ecx
+
+	mov	%ecx, %edx
+	and	$0xf, %edx	/* %edx = misalignment of %ecx within 16 bytes.  */
+	sub	%edx, %ecx	/* Round %ecx down to a 16-byte boundary (those bytes are already zero).  */
+	add	%edx, %ebx	/* Grow the count by the same amount so it stays relative to %ecx.  */
+	xor	%edx, %edx	/* Restore %edx = 0 for the L(FillN) leaves.  */
+	sub	$64, %ebx
+	jb	L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):	/* Zero 64 bytes per iteration with aligned stores.  */
+	movdqa	%xmm0, (%ecx)
+	movdqa	%xmm0, 16(%ecx)
+	movdqa	%xmm0, 32(%ecx)
+	movdqa	%xmm0, 48(%ecx)
+	lea	64(%ecx), %ecx	/* lea does not disturb the flags.  */
+	sub	$64, %ebx
+	jae	L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):	/* Here %ebx = (bytes left) - 64, a negative value.  */
+	add	$32, %ebx	/* %ebx = (bytes left) - 32.  */
+	jl	L(StrncpyFillLess32)
+	movdqa	%xmm0, (%ecx)	/* At least 32 left: zero 32 bytes.  */
+	movdqa	%xmm0, 16(%ecx)
+	lea	32(%ecx), %ecx
+	sub	$16, %ebx
+	jl	L(StrncpyFillExit1)	/* Fewer than 16 left; the exit adds the 16 back.  */
+	movdqa	%xmm0, (%ecx)
+	lea	16(%ecx), %ecx
+	jmp	L(FillFrom1To16Bytes)
+
+L(StrncpyFillLess32):	/* Here %ebx = (bytes left) - 32, a negative value.  */
+	add	$16, %ebx	/* %ebx = (bytes left) - 16.  */
+	jl	L(StrncpyFillExit1)	/* Fewer than 16 left; the exit adds the 16 back.  */
+	movdqa	%xmm0, (%ecx)
+	lea	16(%ecx), %ecx
+	jmp	L(FillFrom1To16Bytes)
+#  endif
+
+	.p2align 4
+L(ExitTail1):	/* Copy 1 byte from (%ecx) to (%edx).  */
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	SAVE_RESULT_TAIL (0)	/* Macro defined outside this view; presumably sets the return value in %eax -- confirm.  */
+#  ifdef USE_AS_STRNCPY
+	sub	$1, %ebx	/* %ebx = count still to be written after this byte.  */
+	lea	1(%edx), %ecx	/* %ecx = first byte past the copy; lea leaves the flags from sub intact.  */
+	jnz	L(StrncpyFillTailWithZero)	/* Count left is non-zero: zero-fill the rest.  */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is 0.  */
+	sbb	$-1, %eax	/* %eax += 1 - CF: step forward unless (%eax) is a NUL.  */
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	SAVE_RESULT_TAIL (1)
+#  ifdef USE_AS_STRNCPY
+	sub	$2, %ebx
+	lea	2(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+	SAVE_RESULT_TAIL (2)
+#  ifdef USE_AS_STRNCPY
+	sub	$3, %ebx
+	lea	3(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	SAVE_RESULT_TAIL (3)
+#  ifdef USE_AS_STRNCPY
+	sub	$4, %ebx
+	lea	4(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+	SAVE_RESULT_TAIL (4)
+#  ifdef USE_AS_STRNCPY
+	sub	$5, %ebx
+	lea	5(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+	SAVE_RESULT_TAIL (5)
+#  ifdef USE_AS_STRNCPY
+	sub	$6, %ebx
+	lea	6(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+	SAVE_RESULT_TAIL (6)
+#  ifdef USE_AS_STRNCPY
+	sub	$7, %ebx
+	lea	7(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail8):	/* Copy 8 bytes from (%ecx) to (%edx).  */
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	SAVE_RESULT_TAIL (7)	/* Macro defined outside this view; presumably sets the return value in %eax -- confirm.  */
+#  ifdef USE_AS_STRNCPY
+	sub	$8, %ebx	/* %ebx = count still to be written after these 8 bytes.  */
+	lea	8(%edx), %ecx	/* %ecx = first byte past the copy; lea leaves the flags from sub intact.  */
+	jnz	L(StrncpyFillTailWithZero)	/* NOTE(review): no USE_AS_STPCPY fix-up follows, unlike most siblings; presumably the count == 8 case never reaches here (L(StrncpyExit8Bytes) copies it itself) -- confirm against callers.  */
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail9):
+	movlpd	(%ecx), %xmm0
+	movb	8(%ecx), %al
+	movlpd	%xmm0, (%edx)
+	movb	%al, 8(%edx)
+	SAVE_RESULT_TAIL (8)
+#  ifdef USE_AS_STRNCPY
+	sub	$9, %ebx
+	lea	9(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail10):
+	movlpd	(%ecx), %xmm0
+	movw	8(%ecx), %ax
+	movlpd	%xmm0, (%edx)
+	movw	%ax, 8(%edx)
+	SAVE_RESULT_TAIL (9)
+#  ifdef USE_AS_STRNCPY
+	sub	$10, %ebx
+	lea	10(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail11):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 7(%edx)
+	SAVE_RESULT_TAIL (10)
+#  ifdef USE_AS_STRNCPY
+	sub	$11, %ebx
+	lea	11(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail12):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %eax
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT_TAIL (11)
+#  ifdef USE_AS_STRNCPY
+	sub	$12, %ebx
+	lea	12(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail13):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	SAVE_RESULT_TAIL (12)
+#  ifdef USE_AS_STRNCPY
+	sub	$13, %ebx
+	lea	13(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail14):	/* Copy 14 bytes from (%ecx) to (%edx) with two overlapping 8-byte moves.  */
+	movlpd	(%ecx), %xmm0	/* Source bytes 0-7.  */
+	movlpd	6(%ecx), %xmm1	/* Source bytes 6-13 (overlaps the first load by 2).  */
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	SAVE_RESULT_TAIL (13)	/* Macro defined outside this view; presumably sets the return value in %eax -- confirm.  */
+#  ifdef USE_AS_STRNCPY
+	sub	$14, %ebx	/* %ebx = count still to be written after these 14 bytes.  */
+	lea	14(%edx), %ecx	/* %ecx = first byte past the copy; lea leaves the flags from sub intact.  */
+	jnz	L(StrncpyFillTailWithZero)	/* Count left is non-zero: zero-fill the rest.  */
+#  ifdef USE_AS_STPCPY	/* NOTE(review): indented one column less than its matching "#   endif"; cosmetic only.  */
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is 0.  */
+	sbb	$-1, %eax	/* %eax += 1 - CF: step forward unless (%eax) is a NUL.  */
+#   endif
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail15):	/* Copy 15 bytes from (%ecx) to (%edx) with two overlapping 8-byte moves.  */
+	movlpd	(%ecx), %xmm0	/* Source bytes 0-7.  */
+	movlpd	7(%ecx), %xmm1	/* Source bytes 7-14 (overlaps the first load by 1).  */
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+	SAVE_RESULT_TAIL (14)	/* Macro defined outside this view; presumably sets the return value in %eax -- confirm.  */
+#  ifdef USE_AS_STRNCPY
+	sub	$15, %ebx	/* %ebx = count still to be written after these 15 bytes.  */
+	lea	15(%edx), %ecx	/* %ecx = first byte past the copy; lea leaves the flags from sub intact.  */
+	jnz	L(StrncpyFillTailWithZero)	/* NOTE(review): no USE_AS_STPCPY fix-up follows, unlike most siblings; presumably the count == 15 case never reaches here (L(StrncpyExit15Bytes) copies it itself) -- confirm against callers.  */
+#  endif
+	RETURN
+
+	.p2align 4
+L(ExitTail16):	/* Copy 16 bytes from (%ecx) to (%edx) with one unaligned SSE move.  */
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	SAVE_RESULT_TAIL (15)	/* Macro defined outside this view; presumably sets the return value in %eax -- confirm.  */
+#  ifdef USE_AS_STRNCPY
+	sub	$16, %ebx	/* %ebx = count still to be written after these 16 bytes.  */
+	lea	16(%edx), %ecx	/* %ecx = first byte past the copy; lea leaves the flags from sub intact.  */
+	jnz	L(StrncpyFillTailWithZero)	/* Count left is non-zero: zero-fill the rest.  */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is 0.  */
+	sbb	$-1, %eax	/* %eax += 1 - CF: step forward unless (%eax) is a NUL.  */
+#   endif
+#  endif
+	RETURN
+# endif
+
+# ifdef USE_AS_STRNCPY
+#  ifndef USE_AS_STRCAT
+	CFI_PUSH (%esi)
+	CFI_PUSH (%edi)
+#  endif
+	.p2align 4
+L(StrncpyLeaveCase2OrCase3):	/* Pick Case2 when %eax is non-zero, else Case3.  %eax is set by the caller outside this view; presumably a NUL-byte mask -- confirm.  */
+	test	%eax, %eax
+	jnz	L(Aligned64LeaveCase2)
+
+L(Aligned64LeaveCase3):	/* Store %xmm4, %xmm5, %xmm6 in turn while the count in %ebx lasts; no NUL checks.  */
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase3)	/* Signed test: %ebx <= 0 after the add.  */
+	movaps	%xmm4, -64(%edx)	/* Aligned store; the -64 bias suggests %edx was already advanced by the caller -- confirm.  */
+	lea	16(%esi), %esi	/* %esi += 16 for each chunk stored; lea keeps the flags.  */
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm5, -48(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm6, -32(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx	/* Last step uses lea: no flag test is needed before the jmp.  */
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(Aligned64LeaveCase2):	/* As Case3, but each chunk is compared against %xmm0 before the previous one is stored.  */
+	pcmpeqb	%xmm4, %xmm0	/* Byte-wise compare; %xmm0 is presumably all-zero on entry -- confirm.  */
+	pmovmskb %xmm0, %eax	/* %eax = one bit per matching byte.  */
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)	/* Match found inside this chunk.  */
+
+	pcmpeqb	%xmm5, %xmm0	/* %xmm0 is all-zero again here because the previous mask was zero.  */
+	pmovmskb %xmm0, %eax
+	movaps	%xmm4, -64(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm5, -48(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm6, -32(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+	jmp	L(CopyFrom1To16BytesCase2)
+
+/*--------------------------------------------------*/
+	.p2align 4
+L(StrncpyExit1Case2OrCase3):	/* Copy 15 bytes, then continue at offset 15.  */
+	movlpd	(%ecx), %xmm0	/* Source bytes 0-7.  */
+	movlpd	7(%ecx), %xmm1	/* Source bytes 7-14 (overlaps by 1).  */
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+	mov	$15, %esi	/* %esi = number of bytes just copied; mov leaves flags alone.  */
+	test	%eax, %eax	/* %eax comes from the caller outside this view; presumably a NUL-byte mask -- confirm.  */
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit2Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	6(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 6(%edx)
+	mov	$14, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit3Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	5(%ecx), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 5(%edx)
+	mov	$13, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit4Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 8(%edx)
+	mov	$12, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit5Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	7(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 7(%edx)
+	mov	$11, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit6Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	6(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 6(%edx)
+	mov	$10, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit7Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movl	5(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 5(%edx)
+	mov	$9, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit8Case2OrCase3):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	mov	$8, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit9Case2OrCase3):	/* Advance by 7 bytes; the single 8-byte move copies one byte more than that.  */
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)	/* NOTE(review): writes byte 7 too; presumably rewritten by the continuation -- confirm.  */
+	mov	$7, %esi	/* %esi = offset at which the continuation resumes.  */
+	test	%eax, %eax	/* %eax comes from the caller outside this view; presumably a NUL-byte mask -- confirm.  */
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit10Case2OrCase3):
+	movlpd	-1(%ecx), %xmm0
+	movlpd	%xmm0, -1(%edx)
+	mov	$6, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit11Case2OrCase3):
+	movlpd	-2(%ecx), %xmm0
+	movlpd	%xmm0, -2(%edx)
+	mov	$5, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit12Case2OrCase3):
+	movl	(%ecx), %esi
+	movl	%esi, (%edx)
+	mov	$4, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit13Case2OrCase3):
+	movl	-1(%ecx), %esi
+	movl	%esi, -1(%edx)
+	mov	$3, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit14Case2OrCase3):
+	movl	-2(%ecx), %esi
+	movl	%esi, -2(%edx)
+	mov	$2, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit15Case2OrCase3):
+	movl	-3(%ecx), %esi
+	movl	%esi, -3(%edx)
+	mov	$1, %esi
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave1):	/* Leave the shift-by-1 copy loop: store up to four more 16-byte chunks while the count in %ebx lasts.  */
+	movaps	%xmm2, %xmm3	/* Keep the unshifted %xmm2 for the second palignr.  */
+	add	$48, %ebx
+	jle	L(StrncpyExit1)	/* Signed test: %ebx <= 0 after the add.  */
+	palignr	$1, %xmm1, %xmm2	/* %xmm2 = bytes 1-16 of the 32-byte pair %xmm2:%xmm1.  */
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2	/* Aligned load, so 31(%ecx) must be 16-byte aligned.  */
+	lea	16(%esi), %esi	/* %esi += 16 for each chunk stored; lea keeps the flags.  */
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1)
+	palignr	$1, %xmm3, %xmm2	/* Same merge, using the saved copy as the low half.  */
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1)
+	movaps	%xmm4, 32(%edx)	/* %xmm4/%xmm5 are prepared outside this view; presumably already shifted -- confirm.  */
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx	/* Last step uses lea: no flag test is needed on the fall-through.  */
+L(StrncpyExit1):	/* Advance both pointers by %esi + 15 and copy the 16 bytes that end there.  */
+	lea	15(%edx, %esi), %edx
+	lea	15(%ecx, %esi), %ecx
+	movdqu	-16(%ecx), %xmm0	/* Unaligned 16-byte load ending at the new %ecx.  */
+	xor	%esi, %esi	/* Offset is folded into the pointers now.  */
+	movdqu	%xmm0, -16(%edx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave2):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit2)
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2)
+	palignr	$2, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit2):
+	lea	14(%edx, %esi), %edx
+	lea	14(%ecx, %esi), %ecx
+	movdqu	-16(%ecx), %xmm0
+	xor	%esi, %esi
+	movdqu	%xmm0, -16(%edx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave3):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit3)
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3)
+	palignr	$3, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit3):
+	lea	13(%edx, %esi), %edx
+	lea	13(%ecx, %esi), %ecx
+	movdqu	-16(%ecx), %xmm0
+	xor	%esi, %esi
+	movdqu	%xmm0, -16(%edx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave4):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit4)
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4)
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit4):
+	lea	12(%edx, %esi), %edx
+	lea	12(%ecx, %esi), %ecx
+	movlpd	-12(%ecx), %xmm0
+	movl	-4(%ecx), %eax
+	movlpd	%xmm0, -12(%edx)
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave5):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit5)
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5)
+	palignr	$5, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit5):
+	lea	11(%edx, %esi), %edx
+	lea	11(%ecx, %esi), %ecx
+	movlpd	-11(%ecx), %xmm0
+	movl	-4(%ecx), %eax
+	movlpd	%xmm0, -11(%edx)
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave6):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit6)
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6)
+	palignr	$6, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit6):
+	lea	10(%edx, %esi), %edx
+	lea	10(%ecx, %esi), %ecx
+
+	movlpd	-10(%ecx), %xmm0
+	movw	-2(%ecx), %ax
+	movlpd	%xmm0, -10(%edx)
+	movw	%ax, -2(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave7):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit7)
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7)
+	palignr	$7, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit7):
+	lea	9(%edx, %esi), %edx
+	lea	9(%ecx, %esi), %ecx
+
+	movlpd	-9(%ecx), %xmm0
+	movb	-1(%ecx), %ah
+	movlpd	%xmm0, -9(%edx)
+	movb	%ah, -1(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave8):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit8)
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8)
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit8):
+	lea	8(%edx, %esi), %edx
+	lea	8(%ecx, %esi), %ecx
+	movlpd	-8(%ecx), %xmm0
+	movlpd	%xmm0, -8(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave9):	/* Leave the shift-by-9 copy loop: store up to four more 16-byte chunks while the count in %ebx lasts.  */
+	movaps	%xmm2, %xmm3	/* Keep the unshifted %xmm2 for the second palignr.  */
+	add	$48, %ebx
+	jle	L(StrncpyExit9)	/* Signed test: %ebx <= 0 after the add.  */
+	palignr	$9, %xmm1, %xmm2	/* %xmm2 = bytes 9-24 of the 32-byte pair %xmm2:%xmm1.  */
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2	/* Aligned load, so 23(%ecx) must be 16-byte aligned.  */
+	lea	16(%esi), %esi	/* %esi += 16 for each chunk stored; lea keeps the flags.  */
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9)
+	palignr	$9, %xmm3, %xmm2	/* Same merge, using the saved copy as the low half.  */
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9)
+	movaps	%xmm4, 32(%edx)	/* %xmm4/%xmm5 are prepared outside this view; presumably already shifted -- confirm.  */
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx	/* Last step uses lea: no flag test is needed on the fall-through.  */
+L(StrncpyExit9):	/* Advance both pointers by %esi + 7 and copy the 8 bytes that end there.  */
+	lea	7(%edx, %esi), %edx
+	lea	7(%ecx, %esi), %ecx
+
+	movlpd	-8(%ecx), %xmm0	/* 8-byte move for a 7-byte advance: starts one byte before the old position.  */
+	movlpd	%xmm0, -8(%edx)
+	xor	%esi, %esi	/* Offset is folded into the pointers now.  */
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave10):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit10)
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10)
+	palignr	$10, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit10):
+	lea	6(%edx, %esi), %edx
+	lea	6(%ecx, %esi), %ecx
+
+	movlpd	-8(%ecx), %xmm0
+	movlpd	%xmm0, -8(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave11):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit11)
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11)
+	palignr	$11, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit11):
+	lea	5(%edx, %esi), %edx
+	lea	5(%ecx, %esi), %ecx
+	movl	-5(%ecx), %esi
+	movb	-1(%ecx), %ah
+	movl	%esi, -5(%edx)
+	movb	%ah, -1(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave12):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit12)
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12)
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit12):
+	lea	4(%edx, %esi), %edx
+	lea	4(%ecx, %esi), %ecx
+	movl	-4(%ecx), %eax
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave13):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit13)
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13)
+	palignr	$13, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit13):
+	lea	3(%edx, %esi), %edx
+	lea	3(%ecx, %esi), %ecx
+
+	movl	-4(%ecx), %eax
+	movl	%eax, -4(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave14):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit14)
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14)
+	palignr	$14, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit14):
+	lea	2(%edx, %esi), %edx
+	lea	2(%ecx, %esi), %ecx
+	movw	-2(%ecx), %ax
+	movw	%ax, -2(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave15):
+	movaps	%xmm2, %xmm3
+	add	$48, %ebx
+	jle	L(StrncpyExit15)
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15)
+	palignr	$15, %xmm3, %xmm2
+	movaps	%xmm2, 16(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15)
+	movaps	%xmm4, 32(%edx)
+	lea	16(%esi), %esi
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15)
+	movaps	%xmm5, 48(%edx)
+	lea	16(%esi), %esi
+	lea	-16(%ebx), %ebx
+L(StrncpyExit15):
+	lea	1(%edx, %esi), %edx
+	lea	1(%ecx, %esi), %ecx
+	movb	-1(%ecx), %ah
+	movb	%ah, -1(%edx)
+	xor	%esi, %esi
+	jmp	L(CopyFrom1To16BytesCase3)
+# endif
+
+# ifndef USE_AS_STRCAT
+#  ifdef USE_AS_STRNCPY
+	CFI_POP (%esi)
+	CFI_POP (%edi)
+
+	.p2align 4
+L(ExitTail0):	/* Nothing to copy (reached from L(StrncpyExit4Bytes) when %ebx is 0).  */
+	movl	%edx, %eax	/* Return %edx unchanged.  */
+	RETURN
+
+	.p2align 4
+L(StrncpyExit15Bytes):	/* Small-count dispatch for source bytes 8-14; presumably bytes 0-7 were checked by the caller -- confirm.  */
+	cmp	$12, %ebx
+	jbe	L(StrncpyExit12Bytes)	/* Unsigned: count <= 12 is handled separately.  */
+	cmpb	$0, 8(%ecx)	/* Count >= 13 here, so bytes 8-11 need no count checks.  */
+	jz	L(ExitTail9)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	cmpb	$0, 11(%ecx)
+	jz	L(ExitTail12)
+	cmp	$13, %ebx	/* From here on, test the count before reading each further byte.  */
+	je	L(ExitTail13)
+	cmpb	$0, 12(%ecx)
+	jz	L(ExitTail13)
+	cmp	$14, %ebx
+	je	L(ExitTail14)
+	cmpb	$0, 13(%ecx)
+	jz	L(ExitTail14)
+	movlpd	(%ecx), %xmm0	/* No NUL in bytes 8-13 and count > 14: copy 15 bytes.  */
+	movlpd	7(%ecx), %xmm1	/* Source bytes 7-14 (overlaps by 1).  */
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 7(%edx)
+#   ifdef USE_AS_STPCPY
+	lea	14(%edx), %eax	/* %eax = address of the last byte copied.  */
+	cmpb	$1, (%eax)	/* CF = 1 iff that byte is 0.  */
+	sbb	$-1, %eax	/* %eax += 1 - CF: step forward unless it is a NUL.  */
+#   else
+	movl	%edx, %eax	/* Return the destination pointer.  */
+#   endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit12Bytes):	/* As L(StrncpyExit15Bytes) for smaller counts: test the count before reading each of bytes 8-10.  */
+	cmp	$9, %ebx
+	je	L(ExitTail9)
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmp	$10, %ebx
+	je	L(ExitTail10)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmp	$11, %ebx
+	je	L(ExitTail11)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	movlpd	(%ecx), %xmm0	/* None of the exits taken: copy 12 bytes.  */
+	movl	8(%ecx), %eax	/* Source bytes 8-11.  */
+	movlpd	%xmm0, (%edx)
+	movl	%eax, 8(%edx)
+	SAVE_RESULT_TAIL (11)	/* Macro defined outside this view; presumably sets the return value in %eax -- confirm.  */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is 0.  */
+	sbb	$-1, %eax	/* %eax += 1 - CF: step forward unless (%eax) is a NUL.  */
+#   endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit8Bytes):	/* Small-count dispatch for source bytes 0-6.  */
+	cmp	$4, %ebx
+	jbe	L(StrncpyExit4Bytes)	/* Unsigned: count <= 4 is handled separately.  */
+	cmpb	$0, (%ecx)	/* Count >= 5 here, so bytes 0-3 need no count checks.  */
+	jz	L(ExitTail1)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	cmpb	$0, 3(%ecx)
+	jz	L(ExitTail4)
+
+	cmp	$5, %ebx	/* From here on, test the count before reading each further byte.  */
+	je	L(ExitTail5)
+	cmpb	$0, 4(%ecx)
+	jz	L(ExitTail5)
+	cmp	$6, %ebx
+	je	L(ExitTail6)
+	cmpb	$0, 5(%ecx)
+	jz	L(ExitTail6)
+	cmp	$7, %ebx
+	je	L(ExitTail7)
+	cmpb	$0, 6(%ecx)
+	jz	L(ExitTail7)
+	movlpd	(%ecx), %xmm0	/* No NUL in bytes 0-6 and count > 7: copy 8 bytes.  */
+	movlpd	%xmm0, (%edx)
+#   ifdef USE_AS_STPCPY
+	lea	7(%edx), %eax	/* %eax = address of the last byte copied.  */
+	cmpb	$1, (%eax)	/* CF = 1 iff that byte is 0.  */
+	sbb	$-1, %eax	/* %eax += 1 - CF: step forward unless it is a NUL.  */
+#   else
+	movl	%edx, %eax	/* Return the destination pointer.  */
+#   endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit4Bytes):	/* Small-count dispatch for counts 0-4: test the count before reading each byte.  */
+	test	%ebx, %ebx
+	jz	L(ExitTail0)	/* Count is 0: nothing to copy.  */
+	cmp	$1, %ebx
+	je	L(ExitTail1)
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmp	$2, %ebx
+	je	L(ExitTail2)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmp	$3, %ebx
+	je	L(ExitTail3)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	movl	(%ecx), %eax	/* None of the exits taken: copy 4 bytes.  */
+	movl	%eax, (%edx)
+	SAVE_RESULT_TAIL (3)	/* Macro defined outside this view; presumably sets the return value in %eax -- confirm.  */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)	/* CF = 1 iff the byte at (%eax) is 0.  */
+	sbb	$-1, %eax	/* %eax += 1 - CF: step forward unless (%eax) is a NUL.  */
+#   endif
+	RETURN
+#  endif
+
+END (STRCPY)
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S
new file mode 100644
index 0000000000..ffbc03c6d5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S
@@ -0,0 +1,116 @@
+/* Multiple versions of strcpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY)
+# ifndef STRCPY
+#  define STRCPY strcpy
+# endif
+#endif
+
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY
+#  define STRCPY_SSSE3	__stpncpy_ssse3
+#  define STRCPY_SSE2		__stpncpy_sse2
+#  define STRCPY_IA32		__stpncpy_ia32
+#  define __GI_STRCPY		__GI_stpncpy
+#  define __GI___STRCPY		__GI___stpncpy
+# else
+#  define STRCPY_SSSE3	__stpcpy_ssse3
+#  define STRCPY_SSE2		__stpcpy_sse2
+#  define STRCPY_IA32		__stpcpy_ia32
+#  define __GI_STRCPY		__GI_stpcpy
+#  define __GI___STRCPY		__GI___stpcpy
+# endif
+#else
+# ifdef USE_AS_STRNCPY
+#  define STRCPY_SSSE3	__strncpy_ssse3
+#  define STRCPY_SSE2		__strncpy_sse2
+#  define STRCPY_IA32		__strncpy_ia32
+#  define __GI_STRCPY		__GI_strncpy
+# else
+#  define STRCPY_SSSE3	__strcpy_ssse3
+#  define STRCPY_SSE2		__strcpy_sse2
+#  define STRCPY_IA32		__strcpy_ia32
+#  define __GI_STRCPY		__GI_strcpy
+# endif
+#endif
+
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncpy in the static library since
+   strncpy is needed before initialization has happened.  */
+#if IS_IN (libc)
+
+	.text
+ENTRY(STRCPY)	/* IFUNC resolver: returns in %eax the implementation to bind STRCPY to.  */
+	.type	STRCPY, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* Macro from outside this file; presumably sets up GOT/feature-data access -- confirm.  */
+	LOAD_FUNC_GOT_EAX (STRCPY_IA32)	/* Default choice: the IA32 version.  */
+	HAS_CPU_FEATURE (SSE2)	/* Feature-test macro; the jz/jnz below consume the flags it leaves.  */
+	jz	2f	/* Test gave zero: keep the IA32 version.  */
+	LOAD_FUNC_GOT_EAX (STRCPY_SSE2)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* Test gave non-zero: keep the SSE2 version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* Test gave zero: keep the SSE2 version.  */
+	LOAD_FUNC_GOT_EAX (STRCPY_SSSE3)
+2:	ret
+END(STRCPY)
+
+# undef ENTRY	/* Redefined so the file included at the end is emitted as the hidden symbol STRCPY_IA32; the "name" argument is ignored.  */
+# define ENTRY(name) \
+	.type STRCPY_IA32, @function; \
+	.align 16; \
+	.globl STRCPY_IA32; \
+	.hidden STRCPY_IA32; \
+	STRCPY_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Matching END: closes the CFI region and sets the size of STRCPY_IA32.  */
+# define END(name) \
+	cfi_endproc; .size STRCPY_IA32, .-STRCPY_IA32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcpy calls through a PLT.
+   The speedup we get from using SSSE3 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCPY; __GI_STRCPY = STRCPY_IA32
+#  undef libc_hidden_def
+#  define libc_hidden_def(name) \
+	.globl __GI___STRCPY; __GI___STRCPY = STRCPY_IA32
+
+# endif
+#endif
+
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY
+#  include "../../stpncpy.S"
+# else
+#  include "../../i586/stpcpy.S"
+# endif
+#else
+# ifndef USE_AS_STRNCPY
+#  include "../../i586/strcpy.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c
new file mode 100644
index 0000000000..6d61e190a8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c
@@ -0,0 +1,2 @@
+#define __strcspn_sse2 __strcspn_ia32
+#include <sysdeps/x86_64/multiarch/strcspn-c.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S
new file mode 100644
index 0000000000..21e5093924
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S
@@ -0,0 +1,75 @@
+/* Multiple versions of strcspn
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifdef USE_AS_STRPBRK
+#define STRCSPN_SSE42	__strpbrk_sse42
+#define STRCSPN_IA32	__strpbrk_ia32
+#define __GI_STRCSPN	__GI_strpbrk
+#else
+#ifndef STRCSPN
+#define STRCSPN		strcspn
+#define STRCSPN_SSE42	__strcspn_sse42
+#define STRCSPN_IA32	__strcspn_ia32
+#define __GI_STRCSPN	__GI_strcspn
+#endif
+#endif
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strpbrk in the static library since
+   strpbrk is needed before initialization has happened.  */
+#if (defined SHARED || !defined USE_AS_STRPBRK) && IS_IN (libc)
+	.text
+ENTRY(STRCSPN)	/* IFUNC resolver: returns in %eax the implementation to bind STRCSPN to.  */
+	.type	STRCSPN, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* Macro from outside this file; presumably sets up GOT/feature-data access -- confirm.  */
+	LOAD_FUNC_GOT_EAX (STRCSPN_IA32)	/* Default choice: the IA32 version.  */
+	HAS_CPU_FEATURE (SSE4_2)	/* Feature-test macro; the jz below consumes the flags it leaves.  */
+	jz	2f	/* Test gave zero: keep the IA32 version.  */
+	LOAD_FUNC_GOT_EAX (STRCSPN_SSE42)
+2:	ret
+END(STRCSPN)
+
+# undef ENTRY	/* Redefined so the file included at the end is emitted as STRCSPN_IA32; the "name" argument is ignored.  */
+# define ENTRY(name) \
+	.type STRCSPN_IA32, @function; \
+	.globl STRCSPN_IA32; \
+	.p2align 4; \
+	STRCSPN_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Matching END: closes the CFI region and sets the size of STRCSPN_IA32.  */
+# define END(name) \
+	cfi_endproc; .size STRCSPN_IA32, .-STRCSPN_IA32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library since
+   they will be called without setting up EBX, which is needed for the
+   PLT that IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_IA32
+#endif
+
+#ifdef USE_AS_STRPBRK
+#include "../../strpbrk.S"
+#else
+#include "../../strcspn.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
new file mode 100644
index 0000000000..d3ea864bab
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
@@ -0,0 +1,125 @@
+/* strlen with SSE2 and BSF
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if defined SHARED && IS_IN (libc)
+
+#include <sysdep.h>
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+#define PARMS		4 + 8	/* Preserve ESI and EDI.  */
+#define	STR		PARMS
+#define ENTRANCE	PUSH (%esi); PUSH (%edi); cfi_remember_state
+#define RETURN		POP (%edi); POP (%esi); ret; \
+			cfi_restore_state; cfi_remember_state
+
+	.text
+ENTRY ( __strlen_sse2_bsf)	/* strlen: string pointer on the stack, length returned in EAX.  */
+	ENTRANCE
+	mov	STR(%esp), %edi	/* edi = string start (kept for the final subtraction).  */
+	xor	%eax, %eax
+	mov	%edi, %ecx
+	and	$0x3f, %ecx	/* ecx = offset of the string within its 64-byte block.  */
+	pxor	%xmm0, %xmm0
+	cmp	$0x30, %ecx
+	ja	L(next)	/* Offset > 48: an unaligned 16-byte load would cross the 64-byte boundary.  */
+	movdqu	(%edi), %xmm1
+	pcmpeqb	%xmm1, %xmm0	/* 0xff in every byte lane that holds a NUL.  */
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_less16)	/* NUL within the first 16 bytes; eax is still 0.  */
+	mov	%edi, %eax
+	and	$-16, %eax	/* eax = 16-byte aligned block containing the string start.  */
+	jmp	L(align16_start)
+L(next):
+
+	mov	%edi, %eax
+	and	$-16, %eax	/* eax = 16-byte aligned block containing the string start.  */
+	pcmpeqb	(%eax), %xmm0	/* Aligned compare; may see bytes that precede the string.  */
+	mov	$-1, %esi
+	sub	%eax, %ecx	/* Low 5 bits of ecx now equal edi & 15.  */
+	shl	%cl, %esi	/* esi = mask with the bits for bytes before the string cleared.  */
+	pmovmskb %xmm0, %edx
+	and	%esi, %edx	/* Ignore NULs located before the string start.  */
+	jnz	L(exit)
+L(align16_start):
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	.p2align 4
+L(align16_loop):	/* Scan four aligned 16-byte blocks (eax+16 .. eax+64) per iteration.  */
+	pcmpeqb	16(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	64(%eax), %eax	/* Advance by 64; lea leaves the flags untouched.  */
+	test	%edx, %edx
+	jz	L(align16_loop)
+L(exit):	/* eax = address of the block holding the NUL, edx = its byte mask.  */
+	sub	%edi, %eax	/* eax = block offset relative to the string start.  */
+L(exit_less16):
+	bsf	%edx, %edx	/* edx = index of the first NUL within the block.  */
+	add	%edx, %eax
+	RETURN
+L(exit16):	/* NUL found in the block at eax+16.  */
+	sub	%edi, %eax
+	bsf	%edx, %edx
+	add	%edx, %eax
+	add	$16, %eax
+	RETURN
+L(exit32):	/* NUL found in the block at eax+32.  */
+	sub	%edi, %eax
+	bsf	%edx, %edx
+	add	%edx, %eax
+	add	$32, %eax
+	RETURN
+L(exit48):	/* NUL found in the block at eax+48.  */
+	sub	%edi, %eax
+	bsf	%edx, %edx
+	add	%edx, %eax
+	add	$48, %eax
+	POP (%edi)	/* Same epilogue as RETURN, minus the CFI state restore: no code follows.  */
+	POP (%esi)
+	ret
+
+END ( __strlen_sse2_bsf)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S
new file mode 100644
index 0000000000..36fc1469d0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S
@@ -0,0 +1,695 @@
+/* strlen with SSE2
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* For strlen only the SHARED version is optimized; for strcat, strncat and strnlen both the STATIC and SHARED versions are optimized.  */
+
+#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+
+#  include <sysdep.h>
+#  define PARMS	4
+#  define STR	PARMS
+#  define RETURN	ret
+
+#  ifdef USE_AS_STRNLEN
+#   define LEN	PARMS + 8	/* maxlen argument; offset is valid once %edi has been pushed.  */
+#   define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#   define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#   define PUSH(REG)	pushl	REG;	CFI_PUSH (REG)
+#   define POP(REG)	popl	REG;	CFI_POP (REG)
+#   undef RETURN
+#   define RETURN	POP (%edi); ret; CFI_PUSH (%edi)	/* CFI_PUSH must follow ret: it re-describes the frame for the code after this exit, not for the ret itself.  */
+#  endif
+
+#  ifndef STRLEN
+#   define STRLEN	__strlen_sse2
+#  endif
+
+	atom_text_section
+ENTRY (STRLEN)
+	mov	STR(%esp), %edx
+#  ifdef USE_AS_STRNLEN
+	PUSH	(%edi)
+	movl	LEN(%esp), %edi
+	sub	$4, %edi
+	jbe	L(len_less4_prolog)
+#  endif
+# endif
+	xor	%eax, %eax
+	cmpb	$0, (%edx)
+	jz	L(exit_tail0)
+	cmpb	$0, 1(%edx)
+	jz	L(exit_tail1)
+	cmpb	$0, 2(%edx)
+	jz	L(exit_tail2)
+	cmpb	$0, 3(%edx)
+	jz	L(exit_tail3)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less8_prolog)
+# endif
+
+	cmpb	$0, 4(%edx)
+	jz	L(exit_tail4)
+	cmpb	$0, 5(%edx)
+	jz	L(exit_tail5)
+	cmpb	$0, 6(%edx)
+	jz	L(exit_tail6)
+	cmpb	$0, 7(%edx)
+	jz	L(exit_tail7)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less12_prolog)
+# endif
+
+	cmpb	$0, 8(%edx)
+	jz	L(exit_tail8)
+	cmpb	$0, 9(%edx)
+	jz	L(exit_tail9)
+	cmpb	$0, 10(%edx)
+	jz	L(exit_tail10)
+	cmpb	$0, 11(%edx)
+	jz	L(exit_tail11)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less16_prolog)
+# endif
+
+	cmpb	$0, 12(%edx)
+	jz	L(exit_tail12)
+	cmpb	$0, 13(%edx)
+	jz	L(exit_tail13)
+	cmpb	$0, 14(%edx)
+	jz	L(exit_tail14)
+	cmpb	$0, 15(%edx)
+	jz	L(exit_tail15)
+
+	pxor	%xmm0, %xmm0
+	lea	16(%edx), %eax
+	mov	%eax, %ecx
+	and	$-16, %eax
+
+# ifdef USE_AS_STRNLEN
+	and	$15, %edx
+	add	%edx, %edi
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	pxor	%xmm2, %xmm2
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	pxor	%xmm3, %xmm3
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqb	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	mov	%eax, %edx
+	and	$63, %edx
+	add	%edx, %edi
+# endif
+
+	and	$-0x40, %eax
+
+	.p2align 4
+L(aligned_64_loop):	/* Main loop: eax is 64-byte aligned; scan 64 bytes per iteration.  */
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi	/* edi = remaining length budget.  */
+	jbe	L(len_less64)	/* At most 64 bytes left: finish with the length-checked tail.  */
+# endif
+	movaps	(%eax), %xmm0
+	movaps	16(%eax), %xmm1
+	movaps	32(%eax), %xmm2
+	movaps	48(%eax), %xmm6
+	pminub	%xmm1, %xmm0	/* Byte-wise minimum: a zero byte in any input leaves a zero in the result.  */
+	pminub	%xmm6, %xmm2
+	pminub	%xmm0, %xmm2	/* xmm2 = minimum over all four 16-byte blocks.  */
+	pcmpeqb	%xmm3, %xmm2	/* xmm3 is all-zero here: every earlier compare into it found no NUL.  */
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	64(%eax), %eax	/* Advance; lea leaves the flags from test untouched.  */
+	jz	L(aligned_64_loop)
+
+	pcmpeqb	-64(%eax), %xmm3	/* A NUL is somewhere in the last 64 bytes: check block 0.  */
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	48(%ecx), %ecx	/* Bias ecx so that eax - ecx at L(exit) is this block's offset in the string.  */
+	jnz	L(exit)
+
+	pcmpeqb	%xmm1, %xmm3	/* Block 1 (still held in xmm1).  */
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqb	-32(%eax), %xmm3	/* Block 2.  */
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqb	%xmm6, %xmm3	/* Block 3 (held in xmm6); the NUL must be here, so fall through.  */
+	pmovmskb %xmm3, %edx
+	lea	-16(%ecx), %ecx
+L(exit):	/* edx = 16-bit NUL mask of one block; turn it into the final length.  */
+	sub	%ecx, %eax	/* eax = offset of that block from the string start.  */
+	test	%dl, %dl
+	jz	L(exit_high)	/* No NUL in bytes 0-7: look at bytes 8-15.  */
+
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(exit_8)	/* No NUL in bytes 0-3: look at bytes 4-7.  */
+	test	$0x01, %dl
+	jnz	L(exit_tail0)
+	test	$0x02, %dl
+	jnz	L(exit_tail1)
+	test	$0x04, %dl
+	jnz	L(exit_tail2)
+	add	$3, %eax	/* Only bit 3 can be the lowest set bit.  */
+	RETURN
+
+	.p2align 4
+L(exit_8):	/* First NUL is in bytes 4-7.  */
+	test	$0x10, %dl
+	jnz	L(exit_tail4)
+	test	$0x20, %dl
+	jnz	L(exit_tail5)
+	test	$0x40, %dl
+	jnz	L(exit_tail6)
+	add	$7, %eax
+	RETURN
+
+	.p2align 4
+L(exit_high):	/* First NUL is in bytes 8-15; its mask bits are in dh.  */
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(exit_high_8)	/* No NUL in bytes 8-11: look at bytes 12-15.  */
+	test	$0x01, %dh
+	jnz	L(exit_tail8)
+	test	$0x02, %dh
+	jnz	L(exit_tail9)
+	test	$0x04, %dh
+	jnz	L(exit_tail10)
+	add	$11, %eax
+	RETURN
+
+	.p2align 4
+L(exit_high_8):	/* First NUL is in bytes 12-15.  */
+	test	$0x10, %dh
+	jnz	L(exit_tail12)
+	test	$0x20, %dh
+	jnz	L(exit_tail13)
+	test	$0x40, %dh
+	jnz	L(exit_tail14)
+	add	$15, %eax
+L(exit_tail0):	/* eax already holds the result.  */
+	RETURN
+
+# ifdef USE_AS_STRNLEN
+
+	.p2align 4
+L(len_less64):
+	pxor	%xmm0, %xmm0
+	add	$64, %edi
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	movl	LEN(%esp), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit):
+	sub	%ecx, %eax
+
+	test	%dl, %dl
+	jz	L(strnlen_exit_high)
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(strnlen_exit_8)
+	test	$0x01, %dl
+	jnz	L(exit_tail0)
+	test	$0x02, %dl
+	jnz	L(strnlen_exit_tail1)
+	test	$0x04, %dl
+	jnz	L(strnlen_exit_tail2)
+	sub	$4, %edi
+	jb	L(return_start_len)
+	lea	3(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_8):
+	test	$0x10, %dl
+	jnz	L(strnlen_exit_tail4)
+	test	$0x20, %dl
+	jnz	L(strnlen_exit_tail5)
+	test	$0x40, %dl
+	jnz	L(strnlen_exit_tail6)
+	sub	$8, %edi
+	jb	L(return_start_len)
+	lea	7(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_high):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(strnlen_exit_high_8)
+	test	$0x01, %dh
+	jnz	L(strnlen_exit_tail8)
+	test	$0x02, %dh
+	jnz	L(strnlen_exit_tail9)
+	test	$0x04, %dh
+	jnz	L(strnlen_exit_tail10)
+	sub	$12, %edi
+	jb	L(return_start_len)
+	lea	11(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_high_8):
+	test	$0x10, %dh
+	jnz	L(strnlen_exit_tail12)
+	test	$0x20, %dh
+	jnz	L(strnlen_exit_tail13)
+	test	$0x40, %dh
+	jnz	L(strnlen_exit_tail14)
+	sub	$16, %edi
+	jb	L(return_start_len)
+	lea	15(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail1):
+	sub	$2, %edi
+	jb	L(return_start_len)
+	lea	1(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail2):
+	sub	$3, %edi
+	jb	L(return_start_len)
+	lea	2(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail4):
+	sub	$5, %edi
+	jb	L(return_start_len)
+	lea	4(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail5):
+	sub	$6, %edi
+	jb	L(return_start_len)
+	lea	5(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail6):
+	sub	$7, %edi
+	jb	L(return_start_len)
+	lea	6(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail8):
+	sub	$9, %edi
+	jb	L(return_start_len)
+	lea	8(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail9):
+	sub	$10, %edi
+	jb	L(return_start_len)
+	lea	9(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail10):
+	sub	$11, %edi
+	jb	L(return_start_len)
+	lea	10(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail12):
+	sub	$13, %edi
+	jb	L(return_start_len)
+	lea	12(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail13):
+	sub	$14, %edi
+	jb	L(return_start_len)
+	lea	13(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail14):
+	sub	$15, %edi
+	jb	L(return_start_len)
+	lea	14(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(return_start_len):
+	movl	LEN(%esp), %eax
+	RETURN
+
+/* for prolog only */
+
+	.p2align 4
+L(len_less4_prolog):
+	xor	%eax, %eax
+
+	add	$4, %edi
+	jz	L(exit_tail0)
+
+	cmpb	$0, (%edx)
+	jz	L(exit_tail0)
+	cmp	$1, %edi
+	je	L(exit_tail1)
+
+	cmpb	$0, 1(%edx)
+	jz	L(exit_tail1)
+	cmp	$2, %edi
+	je	L(exit_tail2)
+
+	cmpb	$0, 2(%edx)
+	jz	L(exit_tail2)
+	cmp	$3, %edi
+	je	L(exit_tail3)
+
+	cmpb	$0, 3(%edx)
+	jz	L(exit_tail3)
+	mov	$4, %eax
+	RETURN
+
+	.p2align 4
+L(len_less8_prolog):
+	add	$4, %edi
+
+	cmpb	$0, 4(%edx)
+	jz	L(exit_tail4)
+	cmp	$1, %edi
+	je	L(exit_tail5)
+
+	cmpb	$0, 5(%edx)
+	jz	L(exit_tail5)
+	cmp	$2, %edi
+	je	L(exit_tail6)
+
+	cmpb	$0, 6(%edx)
+	jz	L(exit_tail6)
+	cmp	$3, %edi
+	je	L(exit_tail7)
+
+	cmpb	$0, 7(%edx)
+	jz	L(exit_tail7)
+	mov	$8, %eax
+	RETURN
+
+
+	.p2align 4
+L(len_less12_prolog):
+	add	$4, %edi
+
+	cmpb	$0, 8(%edx)
+	jz	L(exit_tail8)
+	cmp	$1, %edi
+	je	L(exit_tail9)
+
+	cmpb	$0, 9(%edx)
+	jz	L(exit_tail9)
+	cmp	$2, %edi
+	je	L(exit_tail10)
+
+	cmpb	$0, 10(%edx)
+	jz	L(exit_tail10)
+	cmp	$3, %edi
+	je	L(exit_tail11)
+
+	cmpb	$0, 11(%edx)
+	jz	L(exit_tail11)
+	mov	$12, %eax
+	RETURN
+
+	.p2align 4
+L(len_less16_prolog):
+	add	$4, %edi
+
+	cmpb	$0, 12(%edx)
+	jz	L(exit_tail12)
+	cmp	$1, %edi
+	je	L(exit_tail13)
+
+	cmpb	$0, 13(%edx)
+	jz	L(exit_tail13)
+	cmp	$2, %edi
+	je	L(exit_tail14)
+
+	cmpb	$0, 14(%edx)
+	jz	L(exit_tail14)
+	cmp	$3, %edi
+	je	L(exit_tail15)
+
+	cmpb	$0, 15(%edx)
+	jz	L(exit_tail15)
+	mov	$16, %eax
+	RETURN
+# endif
+
+	.p2align 4
+L(exit_tail1):
+	add	$1, %eax
+	RETURN
+
+L(exit_tail2):
+	add	$2, %eax
+	RETURN
+
+L(exit_tail3):
+	add	$3, %eax
+	RETURN
+
+L(exit_tail4):
+	add	$4, %eax
+	RETURN
+
+L(exit_tail5):
+	add	$5, %eax
+	RETURN
+
+L(exit_tail6):
+	add	$6, %eax
+	RETURN
+
+L(exit_tail7):
+	add	$7, %eax
+	RETURN
+
+L(exit_tail8):
+	add	$8, %eax
+	RETURN
+
+L(exit_tail9):
+	add	$9, %eax
+	RETURN
+
+L(exit_tail10):
+	add	$10, %eax
+	RETURN
+
+L(exit_tail11):
+	add	$11, %eax
+	RETURN
+
+L(exit_tail12):
+	add	$12, %eax
+	RETURN
+
+L(exit_tail13):
+	add	$13, %eax
+	RETURN
+
+L(exit_tail14):
+	add	$14, %eax
+	RETURN
+
+L(exit_tail15):
+	add	$15, %eax
+# ifndef USE_AS_STRCAT
+	RETURN
+END (STRLEN)
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S
new file mode 100644
index 0000000000..77cf6bcdb0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S
@@ -0,0 +1,60 @@
+/* Multiple versions of strlen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for the
+   DSO.  In static binaries, we need strlen before initialization has
+   happened.  */
+#if defined SHARED && IS_IN (libc)
+	.text
+ENTRY(strlen)	/* IFUNC resolver: returns in EAX the implementation to bind strlen to.  */
+	.type	strlen, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strlen_ia32)	/* Default choice: the generic IA32 version.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* Keep the default when the SSE2 test leaves ZF set (presumably: feature absent).  */
+	LOAD_FUNC_GOT_EAX (__strlen_sse2_bsf)	/* SSE2 present: prefer the BSF-based version ...  */
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	2f	/* ... and keep it when the Slow_BSF test leaves ZF set.  */
+	LOAD_FUNC_GOT_EAX (__strlen_sse2)	/* Otherwise use the SSE2 version that avoids BSF.  */
+2:	ret
+END(strlen)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strlen_ia32, @function; \
+	.globl __strlen_ia32; \
+	.p2align 4; \
+	__strlen_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strlen_ia32, .-__strlen_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strlen; __GI_strlen = __strlen_ia32
+#endif
+
+#include "../../i586/strlen.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c
new file mode 100644
index 0000000000..76581eb62b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c
@@ -0,0 +1,8 @@
+#include <string.h>
+
+extern __typeof (strncasecmp) __strncasecmp_nonascii;
+
+#define __strncasecmp __strncasecmp_nonascii
+#include <string/strncase.c>
+
+strong_alias (__strncasecmp_nonascii, __strncasecmp_ia32)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S
new file mode 100644
index 0000000000..a56e63a566
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S
@@ -0,0 +1,39 @@
+/* Entry point for multi-version x86 strncasecmp.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+	.text
+ENTRY(__strncasecmp)	/* IFUNC resolver: returns in EAX the implementation to bind __strncasecmp to.  */
+	.type	__strncasecmp, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strncasecmp_ia32)	/* Default choice: the generic IA32 version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* Keep the default when the SSSE3 test leaves ZF set (presumably: feature absent).  */
+	LOAD_FUNC_GOT_EAX (__strncasecmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f	/* Keep the SSSE3 version when the SSE4.2 test leaves ZF set.  */
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
+	jnz	2f	/* Also keep SSSE3 when the Slow_SSE4_2 test clears ZF.  */
+	LOAD_FUNC_GOT_EAX (__strncasecmp_sse4_2)
+2:	ret
+END(__strncasecmp)
+
+weak_alias (__strncasecmp, strncasecmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c
new file mode 100644
index 0000000000..7e601af271
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c
@@ -0,0 +1,13 @@
+#include <string.h>
+
+extern __typeof (strncasecmp_l) __strncasecmp_l_nonascii;
+
+#define __strncasecmp_l __strncasecmp_l_nonascii
+#define USE_IN_EXTENDED_LOCALE_MODEL    1
+#include <string/strncase.c>
+
+strong_alias (__strncasecmp_l_nonascii, __strncasecmp_l_ia32)
+
+/* The needs of strncasecmp_l in libc are minimal, no need to go through
+   the IFUNC.  */
+strong_alias (__strncasecmp_l_nonascii, __GI___strncasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S
new file mode 100644
index 0000000000..557210832e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRNCASECMP_L 1
+#include "strcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S
new file mode 100644
index 0000000000..d438a1ae35
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRNCASECMP_L 1
+#include "strcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S
new file mode 100644
index 0000000000..8a74ee8574
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S
@@ -0,0 +1,7 @@
+/* Multiple versions of strncasecmp_l
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCMP __strncasecmp_l
+#define USE_AS_STRNCASECMP_L
+#include "strcmp.S"
+
+weak_alias (__strncasecmp_l, strncasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c
new file mode 100644
index 0000000000..132a000545
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c
@@ -0,0 +1,8 @@
+#define STRNCAT __strncat_ia32
+#ifdef SHARED
+#undef libc_hidden_def
+#define libc_hidden_def(name) \
+  __hidden_ver1 (__strncat_ia32, __GI___strncat, __strncat_ia32);
+#endif
+
+#include "string/strncat.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S
new file mode 100644
index 0000000000..f1045b72b8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S
@@ -0,0 +1,4 @@
+#define STRCAT  __strncat_sse2
+#define USE_AS_STRNCAT
+
+#include "strcat-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S
new file mode 100644
index 0000000000..625b90a978
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S
@@ -0,0 +1,4 @@
+#define STRCAT  __strncat_ssse3
+#define USE_AS_STRNCAT
+
+#include "strcat-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S
new file mode 100644
index 0000000000..5c1bf41453
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncat
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCAT strncat
+#define USE_AS_STRNCAT
+#include "strcat.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c
new file mode 100644
index 0000000000..cc059da494
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c
@@ -0,0 +1,8 @@
+#ifdef SHARED
+# define STRNCMP __strncmp_ia32
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)  \
+    __hidden_ver1 (__strncmp_ia32, __GI_strncmp, __strncmp_ia32);
+#endif
+
+#include "string/strncmp.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S
new file mode 100644
index 0000000000..cf14dfaf6c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S
@@ -0,0 +1,5 @@
+#ifdef SHARED
+# define USE_AS_STRNCMP
+# define STRCMP	__strncmp_sse4_2
+# include "strcmp-sse4.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S
new file mode 100644
index 0000000000..536c8685f2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S
@@ -0,0 +1,5 @@
+#ifdef SHARED
+# define USE_AS_STRNCMP
+# define STRCMP	__strncmp_ssse3
+# include "strcmp-ssse3.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S
new file mode 100644
index 0000000000..150d4786d2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncmp
+   All versions must be listed in ifunc-impl-list.c.  */
+#define USE_AS_STRNCMP
+#define STRCMP	strncmp
+#include "strcmp.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c
new file mode 100644
index 0000000000..201e3f98b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c
@@ -0,0 +1,8 @@
+#define STRNCPY __strncpy_ia32
+#ifdef SHARED
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)  \
+    __hidden_ver1 (__strncpy_ia32, __GI_strncpy, __strncpy_ia32);
+#endif
+
+#include "string/strncpy.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S
new file mode 100644
index 0000000000..bdd99239a4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_sse2
+#include "strcpy-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S
new file mode 100644
index 0000000000..bf82ee447d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S
new file mode 100644
index 0000000000..9c257efc6e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncpy
+   All versions must be listed in ifunc-impl-list.c.  */
+#define USE_AS_STRNCPY
+#define STRCPY strncpy
+#include "strcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c
new file mode 100644
index 0000000000..351e939a93
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c
@@ -0,0 +1,10 @@
+#define STRNLEN  __strnlen_ia32
+#ifdef SHARED
+# undef libc_hidden_def
+# define libc_hidden_def(name)  \
+    __hidden_ver1 (__strnlen_ia32, __GI_strnlen, __strnlen_ia32); \
+    strong_alias (__strnlen_ia32, __strnlen_ia32_1); \
+    __hidden_ver1 (__strnlen_ia32_1, __GI___strnlen, __strnlen_ia32_1);
+#endif
+
+#include "string/strnlen.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S
new file mode 100644
index 0000000000..56b6ae2a5c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNLEN
+#define STRLEN __strnlen_sse2
+#include "strlen-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S
new file mode 100644
index 0000000000..d241522c70
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S
@@ -0,0 +1,37 @@
+/* Multiple versions of strnlen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__strnlen)	/* IFUNC resolver: returns in EAX the implementation to bind __strnlen to.  */
+	.type	__strnlen, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strnlen_ia32)	/* Default choice: the generic IA32 version.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* Keep the default when the SSE2 test leaves ZF set (presumably: feature absent).  */
+	LOAD_FUNC_GOT_EAX (__strnlen_sse2)	/* Otherwise select the SSE2 version.  */
+2:	ret
+END(__strnlen)
+
+weak_alias(__strnlen, strnlen)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c
new file mode 100644
index 0000000000..5db62053b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c
@@ -0,0 +1,2 @@
+#define __strpbrk_sse2 __strpbrk_ia32
+#include <sysdeps/x86_64/multiarch/strpbrk-c.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S
new file mode 100644
index 0000000000..7201d6376f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strpbrk
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCSPN strpbrk
+#define USE_AS_STRPBRK
+#include "strcspn.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S
new file mode 100644
index 0000000000..39a7c8825b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S
@@ -0,0 +1,282 @@
+/* strrchr with SSE2 with bsf and bsr
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+/* CFI_PUSH (REG): unwind note for a 4-byte push; the CFA offset grows by 4 and REG is recorded as saved at 0(%esp).  */
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+/* CFI_POP (REG): unwind note for a 4-byte pop; the CFA offset shrinks by 4 and REG is marked as restored.  */
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+/* PUSH / POP: the real pushl / popl followed by the matching unwind note.  */
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+/* Stack offsets of the two arguments; __strrchr_sse2_bsf reads them before its first push, so only the return address lies below them.  */
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+
+	.text
+ENTRY (__strrchr_sse2_bsf)
+/* strrchr (STR1, STR2): scan the NUL-terminated string STR1 16 bytes at a time and return in EAX the address of the last byte equal to the low byte of STR2, or 0.  In the loop EBX holds the match mask of the latest chunk that had a match (0 = none yet) and ESI the address 16 bytes past that chunk's base.  */
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	PUSH	(%edi)
+	pxor	%xmm2, %xmm2	/* XMM2 = 0, compared against data to find NUL.  */
+	mov	%ecx, %edi	/* EDI = string pointer.  */
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+	/* ECX has OFFSET. */
+	and	$63, %ecx
+	cmp	$48, %ecx
+	pshufd	$0, %xmm1, %xmm1	/* XMM1 = search byte in all 16 lanes.  */
+	ja	L(crosscache)	/* Offset inside the 64-byte block is above 48: an unaligned 16-byte load would leave the block.  */
+
+/* unaligned string. */
+	movdqu	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+
+	test	%eax, %eax
+	jnz	L(unaligned_match1)
+
+	test	%edx, %edx
+	jnz	L(return_null)	/* String ends in this chunk and nothing matched.  */
+
+	and	$-16, %edi
+	add	$16, %edi	/* EDI = next 16-byte aligned address.  */
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	xor	%ebx, %ebx	/* No match remembered yet.  */
+	jmp	L(loop)
+
+	CFI_POP	(%esi)	/* Unwind state for the code below: only EDI is on the stack.  */
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(unaligned_return_value1):
+	bsf	%edx, %ecx	/* ECX = index of the first NUL.  */
+	mov	$2, %edx
+	shl	%cl, %edx
+	sub	$1, %edx	/* EDX = mask of bits 0..ECX, the NUL position included.  */
+	and	%edx, %eax	/* Drop matches that lie after the NUL.  */
+	jz	L(return_null)
+	bsr	%eax, %eax	/* Index of the last remaining match.  */
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+	CFI_PUSH	(%edi)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%edx, %edx
+	jnz	L(unaligned_return_value1)	/* Match and NUL in the same chunk.  */
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	mov	%eax, %ebx	/* Remember this chunk's match mask ...  */
+	lea	16(%edi), %esi	/* ... and the address 16 past its (unaligned) base.  */
+	and	$-16, %edi
+	add	$16, %edi
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+	and	$15, %ecx	/* ECX = offset inside the 16-byte chunk.  */
+	and	$-16, %edi
+	pxor	%xmm3, %xmm3	/* Separate zero register, so XMM2 stays zero for the loop.  */
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm3, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes.  */
+	shr	%cl, %edx
+	shr	%cl, %eax
+
+	test	%eax, %eax
+	jnz	L(unaligned_match)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	add	$16, %edi
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	xor	%ebx, %ebx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(unaligned_return_value):
+	add	%ecx, %edi	/* Aligned base + offset: the address mask bit 0 refers to.  */
+	bsf	%edx, %ecx
+	mov	$2, %edx
+	shl	%cl, %edx
+	sub	$1, %edx
+	and	%edx, %eax	/* Keep matches up to and including the first NUL.  */
+	jz	L(return_null)
+	bsr	%eax, %eax
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+	CFI_PUSH	(%edi)
+
+	.p2align 4
+L(unaligned_match):
+	test	%edx, %edx
+	jnz	L(unaligned_return_value)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	mov	%eax, %ebx
+	add	$16, %edi
+	lea	(%edi, %ecx), %esi	/* 16 past the address the shifted mask is relative to.  */
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2	/* XMM2 is zero on entry; it becomes the NUL mask of this chunk.  */
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx	/* Nonzero when the chunk holds a match or a NUL.  */
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jz	L(loop)
+
+L(matches):
+	test	%eax, %eax
+	jnz	L(match)
+L(return_value):	/* String ended with no usable match in the current chunk: fall back to the remembered one.  */
+	test	%ebx, %ebx
+	jz	L(return_null_1)
+	bsr	%ebx, %eax
+	add	%esi, %eax
+
+	POP	(%ebx)
+	POP	(%esi)
+
+	sub	$16, %eax	/* ESI was 16 past the chunk base.  */
+	POP	(%edi)
+	ret
+
+	CFI_PUSH	(%edi)	/* Re-describe the three saved registers in the order they were pushed (EDI, ESI, EBX) so each gets its real slot.  */
+	CFI_PUSH	(%esi)
+	CFI_PUSH	(%ebx)
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %ecx	/* Reload the NUL mask; ECX was OR-ed with the match mask.  */
+	test	%ecx, %ecx
+	jnz	L(return_value_1)
+	mov	%eax, %ebx	/* No NUL yet: this chunk becomes the remembered match.  */
+	mov	%edi, %esi
+	jmp	L(loop)
+
+	.p2align 4
+L(return_value_1):
+	bsf	%ecx, %ecx
+	mov	$2, %edx
+	shl	%cl, %edx
+	sub	$1, %edx
+	and	%edx, %eax	/* Keep matches up to and including the first NUL.  */
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+
+	bsr	%eax, %eax
+	add	%edi, %eax
+	sub	$16, %eax	/* EDI was already advanced past this chunk.  */
+	POP	(%edi)
+	ret
+
+	CFI_PUSH	(%edi)
+/* Return NULL.  */
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH	(%edi)	/* Same push order as the real stack: EDI, ESI, EBX.  */
+	CFI_PUSH	(%esi)
+	CFI_PUSH	(%ebx)
+/* Return NULL.  */
+	.p2align 4
+L(return_null_1):
+	POP	(%ebx)
+	POP	(%esi)
+	POP	(%edi)
+	xor	%eax, %eax
+	ret
+
+END (__strrchr_sse2_bsf)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S
new file mode 100644
index 0000000000..20934288be
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S
@@ -0,0 +1,708 @@
+/* strrchr SSE2 without bsf and bsr
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+/* CFI_PUSH (REG): unwind note for a 4-byte push; the CFA offset grows by 4 and REG is recorded as saved at 0(%esp).  */
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+/* CFI_POP (REG): unwind note for a 4-byte pop; the CFA offset shrinks by 4 and REG is marked as restored.  */
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+/* PUSH / POP: the real pushl / popl followed by the matching unwind note.  */
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+/* ENTRANCE pushes %edi before the arguments are read, hence PARMS is 8 (return address + saved %edi).  */
+# define PARMS  8
+# define ENTRANCE PUSH(%edi);
+# define RETURN  POP(%edi); ret; CFI_PUSH(%edi);	/* The trailing CFI_PUSH re-describes %edi for the code that follows the ret.  */
+/* Stack offsets of the two arguments.  */
+# define STR1  PARMS
+# define STR2  STR1+4
+
+	atom_text_section
+ENTRY (__strrchr_sse2)
+/* strrchr (STR1, STR2) without bsf/bsr: same 16-byte scan as the bsf variant, but bit positions in the 16-bit masks are decoded with test/branch ladders.  In the loop EBX holds the match mask of the latest chunk that had a match (0 = none) and ESI the address 16 bytes past that chunk.  Result (or 0) in EAX.  */
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	pxor	%xmm2, %xmm2	/* XMM2 = 0, compared against data to find NUL.  */
+	mov	%ecx, %edi	/* EDI = string pointer.  */
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+	/* ECX has OFFSET. */
+	and	$63, %ecx
+	cmp	$48, %ecx
+	pshufd	$0, %xmm1, %xmm1	/* XMM1 = search byte in all 16 lanes.  */
+	ja	L(crosscache)	/* Offset inside the 64-byte block is above 48: an unaligned 16-byte load would leave the block.  */
+
+/* unaligned string. */
+	movdqu	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %ecx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	add	$16, %edi	/* EDI = 16 past the chunk just examined.  */
+
+	test	%eax, %eax
+	jnz	L(unaligned_match1)
+
+	test	%ecx, %ecx
+	jnz	L(return_null)
+
+	and	$-16, %edi	/* Continue at the next aligned chunk.  */
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	xor	%ebx, %ebx	/* No match remembered yet.  */
+	jmp	L(loop)
+
+	CFI_POP	(%esi)	/* Unwind state for the code below: only EDI is on the stack.  */
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%ecx, %ecx
+	jnz	L(prolog_find_zero_1)	/* Match and NUL in the first chunk; ECX already holds the NUL mask.  */
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	mov	%eax, %ebx
+	mov	%edi, %esi
+	and	$-16, %edi
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+	and	$15, %ecx	/* ECX = offset inside the 16-byte chunk.  */
+	and	$-16, %edi
+	pxor	%xmm3, %xmm3	/* Separate zero register, so XMM2 stays zero for the loop.  */
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm3, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes.  */
+	shr	%cl, %edx
+	shr	%cl, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	xor	%ebx, %ebx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+	CFI_POP	(%ebx)
+
+	.p2align 4
+L(unaligned_match):
+	test	%edx, %edx
+	jnz	L(prolog_find_zero)
+
+	PUSH	(%esi)
+	PUSH	(%ebx)
+
+	mov	%eax, %ebx
+	lea	(%edi, %ecx), %esi	/* 16 past the address the shifted mask is relative to.  */
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2	/* XMM2 is zero on entry; it becomes the NUL mask of this chunk.  */
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx	/* Nonzero when the chunk holds a match or a NUL.  */
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jz	L(loop)
+
+L(matches):
+	test	%eax, %eax
+	jnz	L(match)
+L(return_value):	/* String ended with no usable match in the current chunk: fall back to the remembered one.  */
+	test	%ebx, %ebx
+	jz	L(return_null_1)
+	mov	%ebx, %eax
+	mov	%esi, %edi
+
+	POP	(%ebx)
+	POP	(%esi)
+
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%esi)	/* Re-describe ESI then EBX, the order they were really pushed, so each gets its real slot.  */
+	CFI_PUSH	(%ebx)
+
+	.p2align 4
+L(return_null_1):
+	POP	(%ebx)
+	POP	(%esi)
+
+	xor	%eax, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+	CFI_PUSH	(%ebx)
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %ecx	/* Reload the NUL mask; ECX was OR-ed with the match mask.  */
+	test	%ecx, %ecx
+	jnz	L(find_zero)
+	mov	%eax, %ebx	/* No NUL yet: this chunk becomes the remembered match.  */
+	mov	%edi, %esi
+	jmp	L(loop)
+
+	.p2align 4
+L(find_zero):	/* Locate the lowest set bit of the NUL mask in CX and keep only matches at or below it.  */
+	test	%cl, %cl
+	jz	L(find_zero_high)
+	mov	%cl, %dl
+	and	$15, %dl
+	jz	L(find_zero_8)
+	test	$0x01, %cl
+	jnz	L(FindZeroExit1)
+	test	$0x02, %cl
+	jnz	L(FindZeroExit2)
+	test	$0x04, %cl
+	jnz	L(FindZeroExit3)
+	and	$1 << 4 - 1, %eax	/* gas ranks << above -, so this is (1 << 4) - 1.  */
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%esi)
+	CFI_PUSH	(%ebx)
+
+	.p2align 4
+L(find_zero_8):
+	test	$0x10, %cl
+	jnz	L(FindZeroExit5)
+	test	$0x20, %cl
+	jnz	L(FindZeroExit6)
+	test	$0x40, %cl
+	jnz	L(FindZeroExit7)
+	and	$1 << 8 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%esi)
+	CFI_PUSH	(%ebx)
+
+	.p2align 4
+L(find_zero_high):
+	mov	%ch, %dh
+	and	$15, %dh
+	jz	L(find_zero_high_8)
+	test	$0x01, %ch
+	jnz	L(FindZeroExit9)
+	test	$0x02, %ch
+	jnz	L(FindZeroExit10)
+	test	$0x04, %ch
+	jnz	L(FindZeroExit11)
+	and	$1 << 12 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%esi)
+	CFI_PUSH	(%ebx)
+
+	.p2align 4
+L(find_zero_high_8):
+	test	$0x10, %ch
+	jnz	L(FindZeroExit13)
+	test	$0x20, %ch
+	jnz	L(FindZeroExit14)
+	test	$0x40, %ch
+	jnz	L(FindZeroExit15)
+	and	$1 << 16 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%esi)
+	CFI_PUSH	(%ebx)
+
+	.p2align 4
+L(FindZeroExit1):	/* FindZeroExitN: the first NUL is byte N of the chunk (1-based); keep match bits 0..N-1.  */
+	and	$1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%esi)
+	CFI_PUSH	(%ebx)
+
+	.p2align 4
+L(FindZeroExit2):
+	and	$1 << 2 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%esi)
+	CFI_PUSH	(%ebx)
+
+	.p2align 4
+L(FindZeroExit3):
+	and	$1 << 3 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%esi)
+	CFI_PUSH	(%ebx)
+
+	.p2align 4
+L(FindZeroExit5):
+	and	$1 << 5 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%esi)
+	CFI_PUSH	(%ebx)
+
+	.p2align 4
+L(FindZeroExit6):
+	and	$1 << 6 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%esi)
+	CFI_PUSH	(%ebx)
+
+	.p2align 4
+L(FindZeroExit7):
+	and	$1 << 7 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%esi)
+	CFI_PUSH	(%ebx)
+
+	.p2align 4
+L(FindZeroExit9):
+	and	$1 << 9 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%esi)
+	CFI_PUSH	(%ebx)
+
+	.p2align 4
+L(FindZeroExit10):
+	and	$1 << 10 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%esi)
+	CFI_PUSH	(%ebx)
+
+	.p2align 4
+L(FindZeroExit11):
+	and	$1 << 11 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%esi)
+	CFI_PUSH	(%ebx)
+
+	.p2align 4
+L(FindZeroExit13):
+	and	$1 << 13 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%esi)
+	CFI_PUSH	(%ebx)
+
+	.p2align 4
+L(FindZeroExit14):
+	and	$1 << 14 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+	jmp	L(match_exit)
+
+	CFI_PUSH	(%esi)
+	CFI_PUSH	(%ebx)
+
+	.p2align 4
+L(FindZeroExit15):
+	and	$1 << 15 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%ebx)
+	POP	(%esi)
+
+	.p2align 4
+L(match_exit):	/* AX = nonzero match mask, EDI = 16 past the address of mask bit 0, only EDI left on the stack: return the address of the highest set bit.  */
+	test	%ah, %ah
+	jnz	L(match_exit_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(match_exit_8)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	lea	-16(%edi), %eax	/* Only bit 0 is set.  */
+	RETURN
+
+	.p2align 4
+L(match_exit_8):
+	test	$0x80, %al
+	jnz	L(Exit8)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	lea	-12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_exit_high):
+	mov	%ah, %dh
+	and	$15 << 4, %dh
+	jnz	L(match_exit_high_8)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_exit_high_8):
+	test	$0x80, %ah
+	jnz	L(Exit16)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	lea	-4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit2):	/* ExitN: the last match is byte N of the chunk (1-based), i.e. EDI - 17 + N.  */
+	lea	-15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	lea	-14(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	lea	-13(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	lea	-11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	lea	-10(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit8):
+	lea	-9(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	lea	-7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	lea	-6(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	lea	-5(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	lea	-3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	lea	-2(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(Exit16):
+	lea	-1(%edi), %eax
+	RETURN
+
+/* Return NULL.  */
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero):	/* First-chunk variant of find_zero, reached before ESI/EBX are pushed: no remembered match, so an empty result is NULL.  */
+	add	%ecx, %edi	/* Add the chunk offset so EDI is 16 past the address of mask bit 0.  */
+	mov     %edx, %ecx	/* NUL mask into ECX for the ladder below.  */
+L(prolog_find_zero_1):
+	test	%cl, %cl
+	jz	L(prolog_find_zero_high)
+	mov	%cl, %dl
+	and	$15, %dl
+	jz	L(prolog_find_zero_8)
+	test	$0x01, %cl
+	jnz	L(PrologFindZeroExit1)
+	test	$0x02, %cl
+	jnz	L(PrologFindZeroExit2)
+	test	$0x04, %cl
+	jnz	L(PrologFindZeroExit3)
+	and	$1 << 4 - 1, %eax	/* gas ranks << above -, so this is (1 << 4) - 1.  */
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_8):
+	test	$0x10, %cl
+	jnz	L(PrologFindZeroExit5)
+	test	$0x20, %cl
+	jnz	L(PrologFindZeroExit6)
+	test	$0x40, %cl
+	jnz	L(PrologFindZeroExit7)
+	and	$1 << 8 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_high):
+	mov	%ch, %dh
+	and	$15, %dh
+	jz	L(prolog_find_zero_high_8)
+	test	$0x01, %ch
+	jnz	L(PrologFindZeroExit9)
+	test	$0x02, %ch
+	jnz	L(PrologFindZeroExit10)
+	test	$0x04, %ch
+	jnz	L(PrologFindZeroExit11)
+	and	$1 << 12 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_high_8):
+	test	$0x10, %ch
+	jnz	L(PrologFindZeroExit13)
+	test	$0x20, %ch
+	jnz	L(PrologFindZeroExit14)
+	test	$0x40, %ch
+	jnz	L(PrologFindZeroExit15)
+	and	$1 << 16 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit1):	/* PrologFindZeroExitN: the first NUL is byte N (1-based); keep match bits 0..N-1, NULL if none remain.  */
+	and	$1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit2):
+	and	$1 << 2 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit3):
+	and	$1 << 3 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit5):
+	and	$1 << 5 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit6):
+	and	$1 << 6 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit7):
+	and	$1 << 7 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit9):
+	and	$1 << 9 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit10):
+	and	$1 << 10 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit11):
+	and	$1 << 11 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit13):
+	and	$1 << 13 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit14):
+	and	$1 << 14 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(PrologFindZeroExit15):
+	and	$1 << 15 - 1, %eax
+	jnz	L(match_exit)
+	xor	%eax, %eax
+	RETURN
+
+END (__strrchr_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S
new file mode 100644
index 0000000000..d9281eaeae
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S
@@ -0,0 +1,57 @@
+/* Multiple versions of strrchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(strrchr)	/* IFUNC selector: returns in EAX the strrchr implementation to use.  */
+	.type	strrchr, @gnu_indirect_function	/* Retype the symbol as an indirect function.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* Macro from init-arch.h; presumably prepares GOT / cpu-feature access -- confirm there.  */
+	LOAD_FUNC_GOT_EAX (__strrchr_ia32)	/* Default answer: the generic ia32 version.  */
+	HAS_CPU_FEATURE (SSE2)	/* Presumably leaves ZF set when SSE2 is missing -- confirm in init-arch.h.  */
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strrchr_sse2_bsf)	/* SSE2 present: take the bsf/bsr variant ...  */
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__strrchr_sse2)	/* ... unless Slow_BSF is flagged, then the variant without bsf/bsr.  */
+2:	ret
+END(strrchr)
+/* From here on ENTRY/END ignore their argument and always define __strrchr_ia32, for the source included at the bottom.  */
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strrchr_ia32, @function; \
+	.globl __strrchr_ia32; \
+	.p2align 4; \
+	__strrchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strrchr_ia32, .-__strrchr_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strrchr; __GI_strrchr = __strrchr_ia32
+#endif
+/* The generic implementation; inside libc the macros above presumably make it come out as __strrchr_ia32 -- confirm it uses ENTRY/END.  */
+#include "../../strrchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c
new file mode 100644
index 0000000000..bea09dea71
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c
@@ -0,0 +1,2 @@
+#define __strspn_sse2 __strspn_ia32	/* Any __strspn_sse2 in the included file is emitted as __strspn_ia32; presumably that is the function it defines -- confirm there.  */
+#include <sysdeps/x86_64/multiarch/strspn-c.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S
new file mode 100644
index 0000000000..1269062381
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S
@@ -0,0 +1,56 @@
+/* Multiple versions of strspn
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(strspn)	/* IFUNC selector: returns in EAX the strspn implementation to use.  */
+	.type	strspn, @gnu_indirect_function	/* Retype the symbol as an indirect function.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* Macro from init-arch.h; presumably prepares GOT / cpu-feature access -- confirm there.  */
+	LOAD_FUNC_GOT_EAX (__strspn_ia32)	/* Default answer: the generic ia32 version.  */
+	HAS_CPU_FEATURE (SSE4_2)	/* Presumably leaves ZF set when SSE4.2 is missing -- confirm in init-arch.h.  */
+	jz	2f	/* Zero result: keep the default.  */
+	LOAD_FUNC_GOT_EAX (__strspn_sse42)	/* Otherwise answer with the SSE4.2 version.  */
+2:	ret
+END(strspn)
+/* From here on ENTRY/END ignore their argument and always define __strspn_ia32, for the source included at the bottom.  */
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strspn_ia32, @function; \
+	.globl __strspn_ia32; \
+	.p2align 4; \
+__strspn_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strspn_ia32, .-__strspn_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strspn; __GI_strspn = __strspn_ia32
+#endif
+/* The generic implementation; inside libc the macros above presumably make it come out as __strspn_ia32 -- confirm it uses ENTRY/END.  */
+#include "../../strspn.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c b/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c
new file mode 100644
index 0000000000..593cfec273
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/test-multiarch.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c
new file mode 100644
index 0000000000..7760b966e2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/varshift.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h
new file mode 100644
index 0000000000..7c72c70d67
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/varshift.h>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c
new file mode 100644
index 0000000000..38d41d04de
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c
@@ -0,0 +1,22 @@
+#include <wchar.h>
+/* Builds the generic C wcschr under the name __wcschr_ia32 (see WCSCHR below).  */
+#if IS_IN (libc)
+# undef libc_hidden_weak
+# define libc_hidden_weak(name)	/* Expands to nothing in this file.  */
+/* weak_alias is emptied as well, so the included source creates no alias of its own here.  */
+# undef weak_alias
+# define weak_alias(name,alias)
+/* In shared builds libc_hidden_def binds both __GI_wcschr and __GI___wcschr to __wcschr_ia32, whatever its argument.  */
+# ifdef SHARED
+#  undef libc_hidden_def
+#  define libc_hidden_def(name) \
+   __hidden_ver1 (__wcschr_ia32, __GI_wcschr, __wcschr_ia32); \
+   strong_alias (__wcschr_ia32, __wcschr_ia32_1); \
+   __hidden_ver1 (__wcschr_ia32_1, __GI___wcschr, __wcschr_ia32_1);
+# endif
+#endif
+/* Declare __wcschr_ia32 with exactly the type of wcschr.  */
+extern __typeof (wcschr) __wcschr_ia32;
+/* WCSCHR is presumably the function name used by wcsmbs/wcschr.c -- confirm there.  */
+#define WCSCHR  __wcschr_ia32
+#include <wcsmbs/wcschr.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S
new file mode 100644
index 0000000000..9ff6c3b8d6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S
@@ -0,0 +1,219 @@
+/* wcschr with SSE2, without using bsf instructions
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+/* CFI_PUSH (REG): unwind note for a 4-byte push; the CFA offset grows by 4 and REG is recorded as saved at 0(%esp).  */
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+/* CFI_POP (REG): unwind note for a 4-byte pop; the CFA offset shrinks by 4 and REG is marked as restored.  */
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+/* PUSH / POP: the real pushl / popl followed by the matching unwind note.  */
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+/* Stack offsets of the two arguments; __wcschr_sse2 reads them before any push, so only the return address lies below them.  */
+# define PARMS	4
+# define STR1	PARMS
+# define STR2	STR1+4
+
+	atom_text_section
+ENTRY (__wcschr_sse2)
+/* wcschr (STR1, STR2): scan the zero-terminated array of 4-byte elements at STR1, four elements per 16-byte chunk, and return in EAX the address of the first element equal to STR2, or 0 if the terminator comes first.  */
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	mov	%ecx, %eax
+	punpckldq %xmm1, %xmm1
+	pxor	%xmm2, %xmm2	/* XMM2 = 0, compared against data to find the terminator.  */
+	punpckldq %xmm1, %xmm1	/* XMM1 = search value in all four dwords.  */
+
+	and	$63, %eax
+	cmp	$48, %eax
+	ja	L(cross_cache)	/* Offset inside the 64-byte block is above 48: an unaligned 16-byte load would leave the block.  */
+
+	movdqu	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2	/* XMM2 = terminator mask.  */
+	pcmpeqd	%xmm1, %xmm0	/* XMM0 = match mask.  */
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	and	$-16, %ecx	/* Align down; L(loop) adds 16 before it loads.  */
+	jmp	L(loop)
+
+	.p2align 4
+L(cross_cache):
+	PUSH	(%edi)
+	mov	%ecx, %edi
+	mov	%eax, %ecx
+	and	$-16, %edi	/* EDI = aligned chunk base.  */
+	and	$15, %ecx	/* ECX = offset of the string inside the chunk.  */
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+
+	sarl	%cl, %edx	/* Discard mask bits for bytes before the string start.  */
+	sarl	%cl, %eax
+	test	%eax, %eax
+	jz	L(unaligned_no_match)
+
+	add	%edi, %ecx	/* ECX = original pointer again.  */
+	POP	(%edi)
+
+	test	%edx, %edx
+	jz	L(match_case1)
+	test	%al, %al
+	jz	L(match_higth_case2)
+	test	$15, %al
+	jnz	L(match_case2_4)
+	test	$15, %dl
+	jnz	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	CFI_PUSH (%edi)	/* The code below is entered with EDI still pushed.  */
+
+	.p2align 4
+L(unaligned_no_match):
+	mov	%edi, %ecx
+	POP	(%edi)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	pxor	%xmm2, %xmm2	/* Clear terminator hits from bytes before the string so the loop starts with zero.  */
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	add	$16, %ecx
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2	/* XMM2 is zero on entry; it becomes this chunk's terminator mask.  */
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx	/* Nonzero when the chunk holds a match or the terminator.  */
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jz	L(loop)
+
+	.p2align 4
+L(matches):
+	pmovmskb %xmm2, %edx	/* Reload the terminator mask; EDX was OR-ed with the match mask.  */
+	test	%eax, %eax
+	jz	L(return_null)	/* Terminator only.  */
+	test	%edx, %edx
+	jz	L(match_case1)	/* Match only.  */
+
+	.p2align 4
+L(match_case2):	/* Both present: walk the elements in order and see which comes first.  */
+	test	%al, %al
+	jz	L(match_higth_case2)
+	test	$15, %al
+	jnz	L(match_case2_4)	/* Element 0 matches.  */
+	test	$15, %dl
+	jnz	L(return_null)	/* Element 0 is the terminator.  */
+	lea	4(%ecx), %eax	/* Element 1 matches.  */
+	ret
+
+	.p2align 4
+L(match_case2_4):
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(match_higth_case2):	/* No match in elements 0-1.  */
+	test	%dl, %dl
+	jnz	L(return_null)	/* Terminator in elements 0-1.  */
+	test	$15, %ah
+	jnz	L(match_case2_12)	/* Element 2 matches.  */
+	test	$15, %dh
+	jnz	L(return_null)	/* Element 2 is the terminator.  */
+	lea	12(%ecx), %eax	/* Element 3 matches.  */
+	ret
+
+	.p2align 4
+L(match_case2_12):
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_case1):	/* Match with no terminator in the chunk: return the first matching element.  */
+	test	%al, %al
+	jz	L(match_higth_case1)
+
+	test	$0x01, %al
+	jnz	L(exit0)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_higth_case1):
+	test	$0x01, %ah
+	jnz	L(exit3)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit0):
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit3):
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	ret
+
+END (__wcschr_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S
new file mode 100644
index 0000000000..d3c65a6436
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S
@@ -0,0 +1,36 @@
+/* Multiple versions of wcschr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__wcschr)	/* IFUNC selector: returns in EAX the wcschr implementation to use.  */
+	.type	__wcschr, @gnu_indirect_function	/* Type the symbol ENTRY defines, as strnlen.S does; typing only the alias left __wcschr an ordinary function pointing at this selector.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO	/* Macro from init-arch.h; presumably prepares GOT / cpu-feature access -- confirm there.  */
+	LOAD_FUNC_GOT_EAX (__wcschr_ia32)	/* Default answer: the generic ia32 version.  */
+	HAS_CPU_FEATURE (SSE2)	/* Presumably leaves ZF set when SSE2 is missing -- confirm in init-arch.h.  */
+	jz	2f	/* Zero result: keep the default.  */
+	LOAD_FUNC_GOT_EAX (__wcschr_sse2)	/* Otherwise answer with the SSE2 version.  */
+2:	ret
+END(__wcschr)
+weak_alias (__wcschr, wcschr)	/* The public name wcschr is a weak alias of the selector symbol.  */
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c
new file mode 100644
index 0000000000..e3337d77e2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c
@@ -0,0 +1,14 @@
+#include <wchar.h>
+/* WCSCMP is presumably the function name used by wcsmbs/wcscmp.c, so the generic code is built as __wcscmp_ia32 -- confirm there.  */
+#define WCSCMP __wcscmp_ia32
+#ifdef SHARED
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+  __hidden_ver1 (__wcscmp_ia32, __GI___wcscmp, __wcscmp_ia32);
+#endif
+#undef weak_alias
+#define weak_alias(name, alias)	/* Expands to nothing: the included source creates no alias here.  */
+/* Declare __wcscmp_ia32 with exactly the type of wcscmp.  */
+extern __typeof (wcscmp) __wcscmp_ia32;
+
+#include "wcsmbs/wcscmp.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
new file mode 100644
index 0000000000..a464b58204
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
@@ -0,0 +1,1018 @@
+/* wcscmp with SSE2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)		/* Unwind info for one 4-byte push: CFA offset +4, REG recorded at offset 0.  */
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)		/* Unwind info for one 4-byte pop: CFA offset -4, REG marked restored.  */
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)	/* Push REG and emit the matching unwind info.  */
+# define POP(REG) popl REG; CFI_POP (REG)	/* Pop REG and emit the matching unwind info.  */
+
+# define ENTRANCE PUSH(%esi); PUSH(%edi)	/* Save the two registers the main loops use as string pointers.  */
+# define RETURN  POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi);	/* The CFI_PUSH pair after ret adds only unwind directives: code placed after a RETURN is again described as having both registers pushed.  */
+# define PARMS  4	/* Offset from %esp of the first argument at entry (nothing pushed yet).  */
+# define STR1  PARMS	/* First string argument.  */
+# define STR2  STR1+4	/* Second string argument.  */
+
+/* Note: wcscmp uses signed comparison, not unsigned as in strcmp function. */
+
+	.text
+ENTRY (__wcscmp_sse2)
+/* Compare two wide strings; %eax = 1, -1 or 0 from a signed compare of the first differing element.
+	* This implementation uses SSE to compare up to 16 bytes at a time.
+   The first four elements are checked with scalar loads before %esi/%edi are saved.  */
+	mov	STR1(%esp), %edx		/* %edx = first string (s1) */
+	mov	STR2(%esp), %eax		/* %eax = second string (s2) */
+
+	mov	(%eax), %ecx
+	cmp	%ecx, (%edx)			/* flags from s1[0] - s2[0]; L(neq) still reads them */
+	jne	L(neq)
+	test	%ecx, %ecx			/* equal elements: was it the terminator? */
+	jz	L(eq)
+
+	mov	4(%eax), %ecx
+	cmp	%ecx, 4(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	mov	8(%eax), %ecx
+	cmp	%ecx, 8(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	mov	12(%eax), %ecx
+	cmp	%ecx, 12(%edx)
+	jne	L(neq)
+	test	%ecx, %ecx
+	jz	L(eq)
+
+	ENTRANCE				/* from here on exits must go through RETURN */
+	add	$16, %eax			/* skip the 16 bytes already compared */
+	add	$16, %edx
+
+	mov	%eax, %esi			/* %esi = current position in s2 */
+	mov	%edx, %edi			/* %edi = current position in s1 */
+	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
+	mov	%al, %ch			/* %ch = low address byte of %esi */
+	mov	%dl, %cl			/* %cl = low address byte of %edi */
+	and	$63, %eax		/* %esi offset within its 64-byte cache line */
+	and	$63, %edx		/* %edi offset within its 64-byte cache line */
+	and	$15, %cl			/* is %edi 16-byte aligned? */
+	jz	L(continue_00)
+	cmp	$16, %edx			/* otherwise dispatch on %edi's offset: 0-15, 16-31, 32-47 or (fall through) 48-63 */
+	jb	L(continue_0)
+	cmp	$32, %edx
+	jb	L(continue_16)
+	cmp	$48, %edx
+	jb	L(continue_32)
+
+L(continue_48):			/* %edi offset in its cache line is 48-63; now classify %esi the same way.  */
+	and	$15, %ch			/* is %esi 16-byte aligned? */
+	jz	L(continue_48_00)
+	cmp	$16, %eax
+	jb	L(continue_0_48)
+	cmp	$32, %eax
+	jb	L(continue_16_48)
+	cmp	$48, %eax
+	jb	L(continue_32_48)
+
+	.p2align 4
+L(continue_48_48):		/* Loop, 64 bytes per pass: scalar compares for bytes 0-15, unaligned SSE loads for bytes 16-63.  */
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%edi), %xmm1
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_48_48)
+
+L(continue_0):			/* %edi offset in its cache line is 0-15 (not 16-byte aligned); now classify %esi.  */
+	and	$15, %ch			/* is %esi 16-byte aligned? */
+	jz	L(continue_0_00)
+	cmp	$16, %eax
+	jb	L(continue_0_0)
+	cmp	$32, %eax
+	jb	L(continue_0_16)
+	cmp	$48, %eax
+	jb	L(continue_0_32)
+
+	.p2align 4
+L(continue_0_48):		/* Loop, 64 bytes per pass: scalar compares for bytes 0-15 and 48-63, unaligned SSE loads for bytes 16-47.  */
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_32)
+
+	mov	48(%esi), %ecx
+	cmp	%ecx, 48(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	52(%esi), %ecx
+	cmp	%ecx, 52(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	56(%esi), %ecx
+	cmp	%ecx, 56(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	60(%esi), %ecx
+	cmp	%ecx, 60(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_0_48)
+
+	.p2align 4
+L(continue_00):			/* %edi is 16-byte aligned; now classify %esi.  */
+	and	$15, %ch			/* is %esi 16-byte aligned as well? */
+	jz	L(continue_00_00)
+	cmp	$16, %eax
+	jb	L(continue_00_0)
+	cmp	$32, %eax
+	jb	L(continue_00_16)
+	cmp	$48, %eax
+	jb	L(continue_00_32)
+
+	.p2align 4
+L(continue_00_48):		/* Loop, 64 bytes per pass: bytes 0-15 use an aligned null check on (%edi) plus scalar compares, bytes 16-63 use SSE.  */
+	pcmpeqd	(%edi), %xmm0		/* null mask for the 4 double_words at (%edi) */
+	mov	(%edi), %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(less4_double_words1)	/* a terminator is among them: finish element by element */
+
+	cmp	(%esi), %eax			/* no null here, so only equality needs checking */
+	jne	L(nequal)
+
+	mov	4(%edi), %eax
+	cmp	4(%esi), %eax
+	jne	L(nequal)
+
+	mov	8(%edi), %eax
+	cmp	8(%esi), %eax
+	jne	L(nequal)
+
+	mov	12(%edi), %eax
+	cmp	12(%esi), %eax
+	jne	L(nequal)
+
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%edi), %xmm2		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%edi), %xmm2		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	48(%edi), %xmm2		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_32):			/* %edi offset in its cache line is 32-47; now classify %esi.  */
+	and	$15, %ch			/* is %esi 16-byte aligned? */
+	jz	L(continue_32_00)
+	cmp	$16, %eax
+	jb	L(continue_0_32)
+	cmp	$32, %eax
+	jb	L(continue_16_32)
+	cmp	$48, %eax
+	jb	L(continue_32_32)
+
+	.p2align 4
+L(continue_32_48):		/* Loop, 64 bytes per pass: scalar compares for bytes 0-31, unaligned SSE loads for bytes 32-63.  */
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	16(%esi), %ecx
+	cmp	%ecx, 16(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	20(%esi), %ecx
+	cmp	%ecx, 20(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	24(%esi), %ecx
+	cmp	%ecx, 24(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	28(%esi), %ecx
+	cmp	%ecx, 28(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%edi), %xmm1
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_32_48)
+
+	.p2align 4
+L(continue_16):			/* %edi offset in its cache line is 16-31; now classify %esi.  */
+	and	$15, %ch			/* is %esi 16-byte aligned? */
+	jz	L(continue_16_00)
+	cmp	$16, %eax
+	jb	L(continue_0_16)
+	cmp	$32, %eax
+	jb	L(continue_16_16)
+	cmp	$48, %eax
+	jb	L(continue_16_32)
+
+	.p2align 4
+L(continue_16_48):		/* Loop, 64 bytes per pass: scalar compares for bytes 0-15 and 32-47, unaligned SSE loads for bytes 16-31 and 48-63.  */
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_16)
+
+	mov	32(%esi), %ecx
+	cmp	%ecx, 32(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	36(%esi), %ecx
+	cmp	%ecx, 36(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	40(%esi), %ecx
+	cmp	%ecx, 40(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	44(%esi), %ecx
+	cmp	%ecx, 44(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	movdqu	48(%edi), %xmm1
+	movdqu	48(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_16_48)
+
+	.p2align 4
+L(continue_00_00):		/* Loop for both pointers 16-byte aligned: 64 bytes per pass, aligned SSE accesses only.  */
+	movdqa	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words)
+
+	movdqa	16(%edi), %xmm3
+	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm3		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm3		/* packed sub of comparison results */
+	pmovmskb %xmm3, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_16)
+
+	movdqa	32(%edi), %xmm5
+	pcmpeqd	%xmm5, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%esi), %xmm5		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm5		/* packed sub of comparison results */
+	pmovmskb %xmm5, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_32)
+
+	movdqa	48(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	48(%esi), %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_00_00)
+
+	.p2align 4
+L(continue_00_32):		/* %edi aligned, %esi offset 32-47: compare 16 bytes with SSE, then join L(continue_00_48).  */
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%edi), %xmm2		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_00_16):		/* %edi aligned, %esi offset 16-31: compare 32 bytes with SSE, then join L(continue_00_48).  */
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%edi), %xmm2		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%edi), %xmm2		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_00_0):		/* %edi aligned, %esi offset 0-15: compare 48 bytes with SSE, then join L(continue_00_48).  */
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%edi), %xmm2		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%edi), %xmm2		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%edi), %xmm2		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
+	pmovmskb %xmm2, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_32)
+
+	add	$48, %esi
+	add	$48, %edi
+	jmp	L(continue_00_48)
+
+	.p2align 4
+L(continue_48_00):		/* Loop, 64 bytes per pass, %esi 16-byte aligned: bytes 0-15 use an aligned null check on (%esi) plus scalar compares, bytes 16-63 use SSE.  */
+	pcmpeqd	(%esi), %xmm0		/* null mask for the 4 double_words at (%esi) */
+	mov	(%edi), %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(less4_double_words1)	/* a terminator is among them: finish element by element */
+
+	cmp	(%esi), %eax			/* no null here, so only equality needs checking */
+	jne	L(nequal)
+
+	mov	4(%edi), %eax
+	cmp	4(%esi), %eax
+	jne	L(nequal)
+
+	mov	8(%edi), %eax
+	cmp	8(%esi), %eax
+	jne	L(nequal)
+
+	mov	12(%edi), %eax
+	cmp	12(%esi), %eax
+	jne	L(nequal)
+
+	movdqu	16(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%esi), %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_32)
+
+	movdqu	48(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	48(%esi), %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_48)
+
+	add	$64, %esi
+	add	$64, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_32_00):		/* %esi aligned, %edi offset 32-47: compare 16 bytes with SSE, then join L(continue_48_00).  */
+	movdqu	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_16_00):		/* %esi aligned, %edi offset 16-31: compare 32 bytes with SSE, then join L(continue_48_00).  */
+	movdqu	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_0_00):		/* %esi aligned, %edi offset 0-15: compare 48 bytes with SSE, then join L(continue_48_00).  */
+	movdqu	(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	(%esi), %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	16(%esi), %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	32(%esi), %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_32)
+
+	add	$48, %esi
+	add	$48, %edi
+	jmp	L(continue_48_00)
+
+	.p2align 4
+L(continue_32_32):		/* Both offsets 32-47: compare 16 bytes with SSE, then join L(continue_48_48).  */
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_48_48)
+
+	.p2align 4
+L(continue_16_16):		/* Both offsets 16-31: compare 32 bytes with SSE, then join L(continue_48_48).  */
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm3
+	movdqu	16(%esi), %xmm4
+	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm4, %xmm3		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm3		/* packed sub of comparison results */
+	pmovmskb %xmm3, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_48_48)
+
+	.p2align 4
+L(continue_0_0):		/* Both offsets 0-15: compare 48 bytes with SSE, then join L(continue_48_48).  */
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm3
+	movdqu	16(%esi), %xmm4
+	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm4, %xmm3		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm3		/* packed sub of comparison results */
+	pmovmskb %xmm3, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_16)
+
+	movdqu	32(%edi), %xmm1
+	movdqu	32(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_32)
+
+	add	$48, %esi
+	add	$48, %edi
+	jmp	L(continue_48_48)
+
+	.p2align 4
+L(continue_0_16):		/* One offset 0-15, the other 16-31: compare 32 bytes with SSE, then join L(continue_32_48).  */
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words)
+
+	movdqu	16(%edi), %xmm1
+	movdqu	16(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words_16)
+
+	add	$32, %esi
+	add	$32, %edi
+	jmp	L(continue_32_48)
+
+	.p2align 4
+L(continue_0_32):		/* One offset 0-15, the other 32-47: compare 16 bytes with SSE, then join L(continue_16_48).  */
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_16_48)
+
+	.p2align 4
+L(continue_16_32):		/* One offset 16-31, the other 32-47: compare 16 bytes with SSE, then join L(continue_32_48).  */
+	movdqu	(%edi), %xmm1
+	movdqu	(%esi), %xmm2
+	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
+	pcmpeqd	%xmm2, %xmm1		/* compare these 4 double_words for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* mask was 0xffff (all 4 equal, none null) iff %edx is now 0 */
+	jnz	L(less4_double_words)
+
+	add	$16, %esi
+	add	$16, %edi
+	jmp	L(continue_32_48)
+
+	.p2align 4
+L(less4_double_words1):		/* Entered with %eax = (%edi) once a null was seen in the aligned operand's 16 bytes; compares up to 4 elements one by one.  */
+	cmp	(%esi), %eax
+	jne	L(nequal)
+	test	%eax, %eax			/* equal and zero: both strings end here */
+	jz	L(equal)
+
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	test	%ecx, %ecx
+	jz	L(equal)
+
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	xor	%eax, %eax			/* fourth pair equal: return 0 without a separate null test */
+	RETURN
+
+	.p2align 4
+L(less4_double_words):		/* The chunk at offset 0 held a mismatch or a null. %edx = byte mask - 0xffff (non-zero); locate the first failing double_word.  */
+	xor	%eax, %eax			/* preset return value 0 for the "terminator reached" case */
+	test	%dl, %dl			/* low byte zero: double_words 0 and 1 both passed */
+	jz	L(next_two_double_words)
+	and	$15, %dl			/* low nibble zero: double_word 0 passed */
+	jz	L(second_double_word)
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
+	RETURN					/* elements equal: return the preset 0 */
+
+	.p2align 4
+L(second_double_word):
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words):
+	and	$15, %dh			/* low nibble of the high byte zero: double_word 2 passed */
+	jz	L(fourth_double_word)
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word):
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(less4_double_words_16):	/* Same decoding as L(less4_double_words), for the chunk at byte offset 16.  */
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words_16)
+	and	$15, %dl
+	jz	L(second_double_word_16)
+	mov	16(%esi), %ecx
+	cmp	%ecx, 16(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word_16):
+	mov	20(%esi), %ecx
+	cmp	%ecx, 20(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words_16):
+	and	$15, %dh
+	jz	L(fourth_double_word_16)
+	mov	24(%esi), %ecx
+	cmp	%ecx, 24(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word_16):
+	mov	28(%esi), %ecx
+	cmp	%ecx, 28(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(less4_double_words_32):	/* Same decoding as L(less4_double_words), for the chunk at byte offset 32.  */
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words_32)
+	and	$15, %dl
+	jz	L(second_double_word_32)
+	mov	32(%esi), %ecx
+	cmp	%ecx, 32(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word_32):
+	mov	36(%esi), %ecx
+	cmp	%ecx, 36(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words_32):
+	and	$15, %dh
+	jz	L(fourth_double_word_32)
+	mov	40(%esi), %ecx
+	cmp	%ecx, 40(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word_32):
+	mov	44(%esi), %ecx
+	cmp	%ecx, 44(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(less4_double_words_48):	/* Same decoding as L(less4_double_words), for the chunk at byte offset 48.  */
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words_48)
+	and	$15, %dl
+	jz	L(second_double_word_48)
+	mov	48(%esi), %ecx
+	cmp	%ecx, 48(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word_48):
+	mov	52(%esi), %ecx
+	cmp	%ecx, 52(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words_48):
+	and	$15, %dh
+	jz	L(fourth_double_word_48)
+	mov	56(%esi), %ecx
+	cmp	%ecx, 56(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word_48):
+	mov	60(%esi), %ecx
+	cmp	%ecx, 60(%edi)
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(nequal):			/* Mismatch after ENTRANCE; flags still hold the cmp of the s1 element against the s2 element.  */
+	mov	$1, %eax			/* mov leaves those flags untouched */
+	jg	L(return)			/* signed: s1 element greater, return 1 */
+	neg	%eax				/* otherwise return -1 */
+	RETURN
+
+	.p2align 4
+L(return):
+	RETURN
+
+	.p2align 4
+L(equal):			/* Terminator reached with all elements equal: return 0.  */
+	xorl	%eax, %eax
+	RETURN
+
+	CFI_POP (%edi)			/* Unwind info only: the exits below run before ENTRANCE, with nothing pushed.  */
+	CFI_POP (%esi)
+
+	.p2align 4
+L(neq):				/* Mismatch in the first four elements, before %esi/%edi were saved.  */
+	mov	$1, %eax			/* flags from the preceding cmp survive the mov */
+	jg	L(neq_bigger)			/* signed: s1 element greater, return 1 */
+	neg	%eax				/* otherwise return -1 */
+
+L(neq_bigger):
+	ret
+
+	.p2align 4
+L(eq):				/* Terminator within the first four elements: return 0.  */
+	xorl	%eax, %eax
+	ret
+
+END (__wcscmp_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S
new file mode 100644
index 0000000000..7118bdd4db
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S
@@ -0,0 +1,39 @@
+/* Multiple versions of wcscmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for the
+   DSO.  In static binaries, we need wcscmp before the initialization
+   happened.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__wcscmp)		/* IFUNC resolver: picks a wcscmp implementation; result is left in %eax (per the LOAD_FUNC_GOT_EAX macro name).  */
+	.type	__wcscmp, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcscmp_ia32)	/* Default choice: the generic ia32 version.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f			/* Feature test gave zero: keep the ia32 version.  */
+	LOAD_FUNC_GOT_EAX (__wcscmp_sse2)	/* Otherwise replace it with the SSE2 version.  */
+2:	ret
+END(__wcscmp)
+weak_alias (__wcscmp, wcscmp)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c
new file mode 100644
index 0000000000..fb3000392b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c
@@ -0,0 +1,5 @@
+#if IS_IN (libc)
+# define wcscpy  __wcscpy_ia32	/* Inside libc, every `wcscpy' token in the included file becomes __wcscpy_ia32.  */
+#endif
+
+#include "wcsmbs/wcscpy.c"	/* Build the generic C implementation under that name.  */
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
new file mode 100644
index 0000000000..6280ba92ab
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
@@ -0,0 +1,600 @@
+/* wcscpy with SSSE3
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	4	/* Offset of the first argument from %esp at entry.  */
+# define RETURN	POP (%edi); ret; CFI_PUSH (%edi)	/* CFI_PUSH after ret restores the unwind state for the code that follows.  */
+# define STR1	PARMS	/* Destination pointer.  */
+# define STR2	STR1+4	/* Source pointer.  */
+# define LEN	STR2+4	/* Not referenced in this file.  */
+
+	atom_text_section
+ENTRY (__wcscpy_ssse3)
+	mov	STR1(%esp), %edx	/* %edx = dst.  */
+	mov	STR2(%esp), %ecx	/* %ecx = src.  */
+
+	cmpl	$0, (%ecx)	/* Explicit 32-bit compare: test the 1st wide character.  */
+	jz	L(ExitTail4)
+	cmpl	$0, 4(%ecx)	/* 2nd wide character.  */
+	jz	L(ExitTail8)
+	cmpl	$0, 8(%ecx)	/* 3rd wide character.  */
+	jz	L(ExitTail12)
+	cmpl	$0, 12(%ecx)	/* 4th wide character.  */
+	jz	L(ExitTail16)
+
+	PUSH	(%edi)
+	mov	%edx, %edi	/* %edi keeps the original dst for the return value.  */
+	PUSH	(%esi)
+	lea	16(%ecx), %esi
+
+	and	$-16, %esi	/* %esi = (src + 16) & -16, a 16-byte-aligned source address.  */
+
+	pxor	%xmm0, %xmm0
+	pcmpeqd	(%esi), %xmm0	/* Per-dword zero test of that aligned chunk.  */
+	movdqu	(%ecx), %xmm1
+	movdqu	%xmm1, (%edx)	/* Copy the first 16 bytes (known to hold no terminator).  */
+
+	pmovmskb %xmm0, %eax
+	sub	%ecx, %esi	/* %esi = offset of the aligned chunk from src.  */
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%edx, %eax
+	lea	16(%edx), %edx
+	and	$-16, %edx	/* %edx = next 16-byte-aligned dst address.  */
+	sub	%edx, %eax	/* %eax = old dst - aligned dst (negative).  */
+
+	sub	%eax, %ecx	/* Advance src by the same distance as dst.  */
+	mov	%ecx, %eax
+	and	$0xf, %eax	/* src offset within 16 bytes now that dst is aligned.  */
+	mov	$0, %esi	/* mov, not xor: the flags from the and above feed the jz below.  */
+
+	jz	L(Align16Both)
+	cmp	$4, %eax
+	je	L(Shl4)
+	cmp	$8, %eax
+	je	L(Shl8)
+	jmp	L(Shl12)
+
+L(Align16Both):
+	movaps	(%ecx), %xmm1
+	movaps	16(%ecx), %xmm2
+	movaps	%xmm1, (%edx)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi	/* %esi = offset of the chunk just tested.  */
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)	/* Store the chunk that tested clean.  */
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm4
+	movaps	%xmm3, (%edx, %esi)
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm1
+	movaps	%xmm4, (%edx, %esi)
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm2
+	movaps	%xmm1, (%edx, %esi)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%edx, %esi)
+	mov	%ecx, %eax
+	lea	16(%ecx, %esi), %ecx
+	and	$-0x40, %ecx	/* Round src down to a 64-byte boundary for the main loop.  */
+	sub	%ecx, %eax	/* %eax = old src - new src.  */
+	sub	%eax, %edx	/* Move dst by the same distance.  */
+
+	mov	$-0x40, %esi	/* Offset used if the loop exits on its first chunk (pointers are already advanced by 64 there).  */
+
+L(Aligned64Loop):
+	movaps	(%ecx), %xmm2
+	movaps	32(%ecx), %xmm3
+	movaps	%xmm2, %xmm4
+	movaps	16(%ecx), %xmm5
+	movaps	%xmm3, %xmm6
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3	/* Byte-wise minimum of all four chunks.  */
+	lea	64(%edx), %edx
+	pcmpeqd	%xmm0, %xmm3	/* A zero dword here means a terminator MAY be present.  */
+	lea	64(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+
+	test	%eax, %eax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%edx)
+	movaps	%xmm5, -48(%edx)
+	movaps	%xmm6, -32(%edx)
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):
+	pcmpeqd	%xmm4, %xmm0	/* Re-test each chunk on its own, in order.  */
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm5, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm4, -64(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi	/* lea leaves the flags from test intact.  */
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm6, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm5, -48(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%edx)
+	pcmpeqd	%xmm7, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	$-0x40, %esi	/* No chunk had a zero dword (the byte-wise minimum gave a false hit): resume.  */
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+	.p2align 4
+L(Shl4):
+	movaps	-4(%ecx), %xmm1	/* src is 4 bytes past a 16-byte boundary: load the enclosing aligned chunks.  */
+	movaps	12(%ecx), %xmm2
+L(Shl4Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes starting at %ecx.  */
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm1
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	28(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx	/* Round the aligned load address down to a 64-byte boundary.  */
+	sub	%ecx, %eax
+	lea	-12(%ecx), %ecx	/* Restore the 4-byte skew so that 12(%ecx) is the aligned address.  */
+	sub	%eax, %edx	/* Move dst back by the same distance.  */
+
+	movaps	-4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+	movaps	12(%ecx), %xmm2
+	movaps	28(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* Byte-wise minimum of the four chunks.  */
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$4, %xmm3, %xmm4
+	jnz	L(Shl4Start)	/* Possible terminator: redo chunk by chunk (%xmm1 and %xmm2 are still intact).  */
+
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+	movlpd	(%ecx), %xmm0	/* Copy the 12 bytes that precede the chunk holding the terminator.  */
+	movl	8(%ecx), %esi	/* %esi is scratch here; the POP below restores it.  */
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 8(%edx)
+	POP	(%esi)
+	add	$12, %edx
+	add	$12, %ecx
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(Shl8):
+	movaps	-8(%ecx), %xmm1	/* src is 8 bytes past a 16-byte boundary.  */
+	movaps	8(%ecx), %xmm2
+L(Shl8Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes starting at %ecx.  */
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm1
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	24(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx	/* Round the aligned load address down to a 64-byte boundary.  */
+	sub	%ecx, %eax
+	lea	-8(%ecx), %ecx	/* Restore the 8-byte skew so that 8(%ecx) is the aligned address.  */
+	sub	%eax, %edx
+
+	movaps	-8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+	movaps	8(%ecx), %xmm2
+	movaps	24(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$8, %xmm3, %xmm4
+	jnz	L(Shl8Start)	/* Possible terminator: redo chunk by chunk.  */
+
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+	movlpd	(%ecx), %xmm0	/* Copy the 8 bytes that precede the chunk holding the terminator.  */
+	movlpd	%xmm0, (%edx)
+	POP	(%esi)
+	add	$8, %edx
+	add	$8, %ecx
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(Shl12):
+	movaps	-12(%ecx), %xmm1	/* src is 12 bytes past a 16-byte boundary.  */
+	movaps	4(%ecx), %xmm2
+L(Shl12Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes starting at %ecx.  */
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm1
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%edx)
+	lea	20(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx	/* Round the aligned load address down to a 64-byte boundary.  */
+	sub	%ecx, %eax
+	lea	-4(%ecx), %ecx	/* Restore the 12-byte skew so that 4(%ecx) is the aligned address.  */
+	sub	%eax, %edx
+
+	movaps	-12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+	movaps	4(%ecx), %xmm2
+	movaps	20(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$12, %xmm3, %xmm4
+	jnz	L(Shl12Start)	/* Possible terminator: redo chunk by chunk.  */
+
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+	movl	(%ecx), %esi	/* Copy the 4 bytes that precede the chunk holding the terminator.  */
+	movl	%esi, (%edx)
+	mov	$4, %esi	/* Falls through: the code below advances both pointers by %esi.  */
+
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%esi, %edx	/* Point dst and src at the chunk whose mask is in %eax.  */
+	add	%esi, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh)	/* Low mask byte clear: terminator is the 3rd or 4th wide char.  */
+	test	$0x01, %al
+	jnz	L(Exit4)	/* Terminator is the 1st wide char.  */
+L(Exit8):
+	movlpd	(%ecx), %xmm0	/* Terminator is the 2nd wide char: copy 8 bytes.  */
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax	/* Return the original dst.  */
+	RETURN
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah	/* Mask bit 8 set: terminator is the 3rd wide char.  */
+	jnz	L(Exit12)
+L(Exit16):
+	movdqu	(%ecx), %xmm0	/* Terminator is the 4th wide char: copy 16 bytes.  */
+	movdqu	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	movl	(%ecx), %eax	/* Copy the terminator itself (4 bytes).  */
+	movl	%eax, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	movlpd	(%ecx), %xmm0	/* Copy 12 bytes, terminator included.  */
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edi, %eax
+	RETURN
+
+CFI_POP	(%edi)	/* Unwind info only: the ExitTail paths below run before %edi is pushed.  */
+
+	.p2align 4
+L(ExitTail4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edx, %eax	/* Return dst, which is still in %edx on these early exits.  */
+	ret
+
+	.p2align 4
+L(ExitTail8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail16):
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	movl	%edx, %eax
+	ret
+
+END (__wcscpy_ssse3)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S
new file mode 100644
index 0000000000..cfc97dd87c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S
@@ -0,0 +1,36 @@
+/* Multiple versions of wcscpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(wcscpy)
+	.type	wcscpy, @gnu_indirect_function	/* IFUNC symbol: this code picks the implementation.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcscpy_ia32)	/* Start with the baseline version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* ZF set: keep the baseline (presumably SSSE3 is absent; the macro lives in init-arch.h).  */
+	LOAD_FUNC_GOT_EAX (__wcscpy_ssse3)	/* Otherwise switch to the SSSE3 version.  */
+2:	ret
+END(wcscpy)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c
new file mode 100644
index 0000000000..a335dc0f7e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c
@@ -0,0 +1,9 @@
+#include <wchar.h>
+
+#if IS_IN (libc)
+# define WCSLEN  __wcslen_ia32	/* Presumably the name wcsmbs/wcslen.c gives its function (that file is not shown); wcslen.S uses __wcslen_ia32 as the default choice.  */
+#endif
+
+extern __typeof (wcslen) __wcslen_ia32;	/* Declare it with the same type as wcslen.  */
+
+#include "wcsmbs/wcslen.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S
new file mode 100644
index 0000000000..bd3fc4c79b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S
@@ -0,0 +1,193 @@
+/* wcslen with SSE2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+# define STR	4	/* Offset of the string argument from %esp at entry.  */
+
+	.text
+ENTRY (__wcslen_sse2)
+	mov	STR(%esp), %edx	/* %edx = string.  */
+
+	cmpl	$0, (%edx)	/* Explicit 32-bit compares: test the first eight wide characters one by one.  */
+	jz	L(exit_tail0)
+	cmpl	$0, 4(%edx)
+	jz	L(exit_tail1)
+	cmpl	$0, 8(%edx)
+	jz	L(exit_tail2)
+	cmpl	$0, 12(%edx)
+	jz	L(exit_tail3)
+	cmpl	$0, 16(%edx)
+	jz	L(exit_tail4)
+	cmpl	$0, 20(%edx)
+	jz	L(exit_tail5)
+	cmpl	$0, 24(%edx)
+	jz	L(exit_tail6)
+	cmpl	$0, 28(%edx)
+	jz	L(exit_tail7)
+
+	pxor	%xmm0, %xmm0
+
+	lea	32(%edx), %eax
+	lea	16(%edx), %ecx	/* %ecx = string + 16; L(exit) subtracts it from %eax.  */
+	and	$-16, %eax	/* %eax = (string + 32) & -16, inside the 32 bytes already checked.  */
+
+	pcmpeqd	(%eax), %xmm0	/* Per-dword zero test of one aligned 16-byte chunk.  */
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	test	%edx, %edx
+	lea	16(%eax), %eax	/* lea leaves the flags from test intact.  */
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	pxor	%xmm2, %xmm2
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	pxor	%xmm3, %xmm3
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm3	/* No hit leaves %xmm3 all zero, as the loop below requires.  */
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	and	$-0x40, %eax	/* Round down to a 64-byte boundary; some bytes may be scanned twice.  */
+
+	.p2align 4
+L(aligned_64_loop):
+	movaps	(%eax), %xmm0
+	movaps	16(%eax), %xmm1
+	movaps	32(%eax), %xmm2
+	movaps	48(%eax), %xmm6
+
+	pminub	%xmm1, %xmm0
+	pminub	%xmm6, %xmm2
+	pminub	%xmm0, %xmm2	/* Byte-wise minimum of all four chunks.  */
+	pcmpeqd	%xmm3, %xmm2	/* A zero dword here means a terminator MAY be present.  */
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	64(%eax), %eax
+	jz	L(aligned_64_loop)
+
+	pcmpeqd	-64(%eax), %xmm3	/* Re-test each chunk on its own, in order.  */
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	48(%ecx), %ecx	/* Bias %ecx so that %eax - %ecx is the offset of the chunk just tested.  */
+	jnz	L(exit)
+
+	pcmpeqd	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqd	-32(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqd	%xmm6, %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx	/* Net change to %ecx over the four steps is zero.  */
+	jnz	L(exit)
+
+	jmp	L(aligned_64_loop)	/* No chunk had a zero dword (the byte-wise minimum gave a false hit): resume.  */
+
+	.p2align 4
+L(exit):
+	sub	%ecx, %eax	/* Byte offset of the chunk from the start of the string.  */
+	shr	$2, %eax	/* Convert it to a count of wide characters.  */
+	test	%dl, %dl
+	jz	L(exit_high)	/* Low mask byte clear: terminator is the 3rd or 4th wide char.  */
+
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(exit_1)	/* Low nibble clear: terminator is the 2nd wide char.  */
+	ret
+
+	.p2align 4
+L(exit_high):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(exit_3)	/* Bits 8-11 clear: terminator is the 4th wide char.  */
+	add	$2, %eax
+	ret
+
+	.p2align 4
+L(exit_1):
+	add	$1, %eax
+	ret
+
+	.p2align 4
+L(exit_3):
+	add	$3, %eax
+	ret
+
+	.p2align 4
+L(exit_tail0):
+	xor	%eax, %eax	/* Empty string: length 0.  */
+	ret
+
+	.p2align 4
+L(exit_tail1):
+	mov	$1, %eax
+	ret
+
+	.p2align 4
+L(exit_tail2):
+	mov	$2, %eax
+	ret
+
+	.p2align 4
+L(exit_tail3):
+	mov	$3, %eax
+	ret
+
+	.p2align 4
+L(exit_tail4):
+	mov	$4, %eax
+	ret
+
+	.p2align 4
+L(exit_tail5):
+	mov	$5, %eax
+	ret
+
+	.p2align 4
+L(exit_tail6):
+	mov	$6, %eax
+	ret
+
+	.p2align 4
+L(exit_tail7):
+	mov	$7, %eax
+	ret
+
+END (__wcslen_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S
new file mode 100644
index 0000000000..6ef9b6e7b5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S
@@ -0,0 +1,37 @@
+/* Multiple versions of wcslen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__wcslen)
+	.type	__wcslen, @gnu_indirect_function	/* IFUNC symbol: this code picks the implementation.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcslen_ia32)	/* Start with the baseline version.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* ZF set: keep the baseline (presumably SSE2 is absent; the macro lives in init-arch.h).  */
+	LOAD_FUNC_GOT_EAX (__wcslen_sse2)	/* Otherwise switch to the SSE2 version.  */
+2:	ret
+END(__wcslen)
+
+weak_alias(__wcslen, wcslen)	/* The public name wcslen is a weak alias of __wcslen.  */
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c
new file mode 100644
index 0000000000..8d8a335b5b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c
@@ -0,0 +1,5 @@
+#if IS_IN (libc)
+# define wcsrchr  __wcsrchr_ia32	/* Rename the definition included below; wcsrchr.S uses __wcsrchr_ia32 as the default choice.  */
+#endif
+
+#include "wcsmbs/wcsrchr.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
new file mode 100644
index 0000000000..1a9b60e55e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
@@ -0,0 +1,354 @@
+/* wcsrchr with SSE2, without using bsf instructions.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	8	/* Offset of the first argument from %esp once ENTRANCE has pushed %edi.  */
+# define ENTRANCE	PUSH (%edi);
+# define RETURN	POP (%edi); ret; CFI_PUSH (%edi);	/* CFI_PUSH after ret restores the unwind state for the code that follows.  */
+# define STR1	PARMS	/* String to search.  */
+# define STR2	STR1+4	/* Wide character to find.  */
+
+	atom_text_section
+ENTRY (__wcsrchr_sse2)
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	mov	%ecx, %edi	/* %edi = scan pointer.  */
+	punpckldq %xmm1, %xmm1
+	pxor	%xmm2, %xmm2	/* %xmm2 = 0, used to detect the terminator.  */
+	punpckldq %xmm1, %xmm1	/* %xmm1 now holds the character in all four dwords.  */
+
+/* ECX has OFFSET. */
+	and	$63, %ecx
+	cmp	$48, %ecx	/* Offset above 48: a 16-byte load would run past this 64-byte block.  */
+	ja	L(crosscache)
+
+/* unaligned string. */
+	movdqu	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+/* Find where NULL is.  */
+	pmovmskb %xmm2, %ecx
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match1)
+
+	test	%ecx, %ecx
+	jnz	L(return_null)	/* String ends in this chunk and nothing matched.  */
+
+	and	$-16, %edi	/* Continue with aligned loads; some bytes may be scanned twice.  */
+
+	PUSH	(%esi)
+
+	xor	%edx, %edx	/* %edx = 0: no match saved yet.  */
+	jmp	L(loop)
+
+	CFI_POP	(%esi)	/* Unwind info only: the code below is entered with %esi not pushed.  */
+
+	.p2align 4
+L(unaligned_match1):
+	test	%ecx, %ecx
+	jnz	L(prolog_find_zero_1)	/* Match and terminator are in the same first chunk.  */
+
+	PUSH	(%esi)
+
+/* Save current match */
+	mov	%eax, %edx	/* %edx = saved match mask.  */
+	mov	%edi, %esi	/* %esi = address just past the chunk that mask describes.  */
+	and	$-16, %edi
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+	and	$15, %ecx	/* %ecx = offset of the string within its 16-byte chunk.  */
+	and	$-16, %edi
+	pxor	%xmm3, %xmm3
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm3
+	pcmpeqd	%xmm1, %xmm0
+/* Find where NULL is.  */
+	pmovmskb %xmm3, %edx
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+/* Remove the leading bytes.  */
+	shr	%cl, %edx
+	shr	%cl, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match)
+
+	test	%edx, %edx
+	jnz	L(return_null)	/* String ends in this chunk and nothing matched.  */
+
+	PUSH	(%esi)
+
+	xor	%edx, %edx	/* %edx = 0: no match saved yet.  */
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(unaligned_match):
+	test	%edx, %edx
+	jnz	L(prolog_find_zero)
+
+	PUSH	(%esi)
+
+	mov	%eax, %edx	/* Save the (shifted) match mask.  */
+	lea	(%edi, %ecx), %esi	/* Bias by the offset so that bit 0 of the shifted mask maps to %esi - 16.  */
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx	/* Nonzero if this chunk holds a match or the terminator.  */
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm3
+	pcmpeqd	%xmm3, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm4
+	pcmpeqd	%xmm4, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm4
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm4, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm5
+	pcmpeqd	%xmm5, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm5
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm5, %eax
+	or	%eax, %ecx
+	jz	L(loop)
+
+	.p2align 4
+L(matches):
+	test	%eax, %eax
+	jnz	L(match)
+L(return_value):
+	test	%edx, %edx	/* Terminator reached: was a match saved earlier?  */
+	jz	L(return_null_1)
+	mov	%edx, %eax	/* Decode the saved mask relative to the saved address.  */
+	mov	%esi, %edi
+
+	POP	(%esi)
+
+	test	%ah, %ah	/* Mask bits 8-15: third or fourth wide character.  */
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al	/* 0xf0: bits of the second wide character.  */
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(return_null_1):
+	POP	(%esi)
+
+	xor	%eax, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %ecx	/* Reload the terminator mask; %ecx was merged with %eax in the loop.  */
+	test	%ecx, %ecx
+	jnz	L(find_zero)
+/* save match info */
+	mov	%eax, %edx
+	mov	%edi, %esi
+	jmp	L(loop)
+
+	.p2align 4
+L(find_zero):
+	test	%cl, %cl
+	jz	L(find_zero_in_third_or_fourth_wchar)
+	test	$15, %cl
+	jz	L(find_zero_in_second_wchar)
+	and	$1, %eax	/* Terminator is the first wide char: only a match at that position counts.  */
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %eax	/* as binds << tighter than -, so the mask is (1 << 5) - 1 = 0x1f.  */
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_third_or_fourth_wchar):
+	test	$15, %ch
+	jz	L(find_zero_in_fourth_wchar)
+	and	$1 << 9 - 1, %eax	/* (1 << 9) - 1 = 0x1ff: drop match bits past the terminator.  */
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_fourth_wchar):
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match_second_wchar):
+	lea	-12(%edi), %eax	/* %edi is 16 past the chunk start, so this is chunk + 4.  */
+	RETURN
+
+	.p2align 4
+L(match_third_or_fourth_wchar):
+	test	$15 << 4, %ah	/* Mask bits 12-15: fourth wide character.  */
+	jnz	L(match_fourth_wchar)
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_third_wchar):
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_fourth_wchar):
+	lea	-4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax	/* Reached only from paths that have not pushed %esi.  */
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero):
+	add	%ecx, %edi	/* Bias by the offset so that bit 0 of the shifted mask maps to %edi - 16.  */
+	mov     %edx, %ecx
+L(prolog_find_zero_1):
+	test	%cl, %cl
+	jz	L(prolog_find_zero_in_third_or_fourth_wchar)
+	test	$15, %cl
+	jz	L(prolog_find_zero_in_second_wchar)
+	and	$1, %eax
+	jz	L(return_null)
+
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %eax	/* (1 << 5) - 1 = 0x1f, see the operator precedence of as.  */
+	jz	L(return_null)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_third_or_fourth_wchar):
+	test	$15, %ch
+	jz	L(prolog_find_zero_in_fourth_wchar)
+	and	$1 << 9 - 1, %eax	/* (1 << 9) - 1 = 0x1ff.  */
+	jz	L(return_null)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_fourth_wchar):
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+END (__wcsrchr_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S
new file mode 100644
index 0000000000..cf67333995
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S
@@ -0,0 +1,35 @@
+/* Multiple versions of wcsrchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(wcsrchr)
+	.type	wcsrchr, @gnu_indirect_function	/* IFUNC symbol: this code picks the implementation.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcsrchr_ia32)	/* Start with the baseline version.  */
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* ZF set: keep the baseline (presumably SSE2 is absent; the macro lives in init-arch.h).  */
+	LOAD_FUNC_GOT_EAX (__wcsrchr_sse2)	/* Otherwise switch to the SSE2 version.  */
+2:	ret
+END(wcsrchr)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c
new file mode 100644
index 0000000000..75ab4b94c1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c
@@ -0,0 +1,9 @@
+#include <wchar.h>
+
+#if IS_IN (libc)
+# define WMEMCMP  __wmemcmp_ia32	/* Presumably the name wcsmbs/wmemcmp.c gives its function (that file is not shown); wmemcmp.S uses __wmemcmp_ia32 as the default choice.  */
+#endif
+
+extern __typeof (wmemcmp) __wmemcmp_ia32;	/* Declare it with the same type as wmemcmp.  */
+
+#include "wcsmbs/wmemcmp.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
new file mode 100644
index 0000000000..1a857c7e21
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1	/* Read by memcmp-sse4.S (not shown here); presumably selects wide-character comparison.  */
+#define MEMCMP __wmemcmp_sse4_2	/* The name the wmemcmp selector loads when SSE4_2 is reported.  */
+
+#include "memcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
new file mode 100644
index 0000000000..a41ef95fc1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1	/* Read by memcmp-ssse3.S (not shown here); presumably selects wide-character comparison.  */
+#define MEMCMP __wmemcmp_ssse3	/* The name the wmemcmp selector loads when SSSE3 is reported.  */
+
+#include "memcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S
new file mode 100644
index 0000000000..1b9a54a413
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S
@@ -0,0 +1,40 @@
+/* Multiple versions of wmemcmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+	.text
+ENTRY(wmemcmp)
+	.type	wmemcmp, @gnu_indirect_function	/* IFUNC symbol: this code picks the implementation.  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wmemcmp_ia32)	/* Start with the baseline version.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* ZF set: keep the baseline (presumably SSSE3 is absent; the macro lives in init-arch.h).  */
+	LOAD_FUNC_GOT_EAX (__wmemcmp_ssse3)	/* SSSE3 version, unless the next test upgrades it.  */
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f	/* ZF set: stay with the SSSE3 version.  */
+	LOAD_FUNC_GOT_EAX (__wmemcmp_sse4_2)	/* SSE4_2 is only tested after the SSSE3 test has passed.  */
+2:	ret
+END(wmemcmp)
+#endif