87 files changed, 24585 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/Makefile b/REORG.TODO/sysdeps/x86_64/multiarch/Makefile
new file mode 100644
index 0000000000..310a3a4b72
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/Makefile
@@ -0,0 +1,42 @@
+ifeq ($(subdir),csu)
+tests += test-multiarch
+endif
+
+ifeq ($(subdir),string)
+
+sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
+		   strcmp-sse2-unaligned strncmp-ssse3 \
+		   memcmp-avx2-movbe \
+		   memcmp-sse4 memcpy-ssse3 \
+		   memmove-ssse3 \
+		   memcpy-ssse3-back \
+		   memmove-ssse3-back \
+		   memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
+		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
+		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
+		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
+		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
+		   strcat-sse2-unaligned strncat-sse2-unaligned \
+		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
+		   strcspn-c strpbrk-c strspn-c varshift \
+		   memset-avx512-no-vzeroupper \
+		   memmove-avx-unaligned-erms \
+		   memmove-avx512-unaligned-erms \
+		   memset-avx2-unaligned-erms \
+		   memset-avx512-unaligned-erms
+CFLAGS-varshift.c += -msse4
+CFLAGS-strcspn-c.c += -msse4
+CFLAGS-strpbrk-c.c += -msse4
+CFLAGS-strspn-c.c += -msse4
+endif
+
+ifeq ($(subdir),wcsmbs)
+sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+		   wmemcmp-avx2-movbe \
+		   wcscpy-ssse3 wcscpy-c \
+		   wcsnlen-sse4_1 wcsnlen-c
+endif
+
+ifeq ($(subdir),debug)
+sysdep_routines += wmemset_chk-nonshared
+endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/bcopy.S b/REORG.TODO/sysdeps/x86_64/multiarch/bcopy.S
new file mode 100644
index 0000000000..639f02bde3
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/bcopy.S
@@ -0,0 +1,7 @@
+#include <sysdep.h>
+
+	.text
+ENTRY(bcopy)
+	xchg	%rdi, %rsi	/* bcopy is (src, dest, n); memmove is (dest, src, n).  */
+	jmp	__libc_memmove	/* Branch to IFUNC memmove.  */
+END(bcopy)
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-impl-list.c
new file mode 100644
index 0000000000..5627183aca
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -0,0 +1,460 @@
+/* Enumerate available IFUNC implementations of a function.  x86-64 version.
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <assert.h>
+#include <string.h>
+#include <wchar.h>
+#include <ifunc-impl-list.h>
+#include <sysdep.h>
+#include "init-arch.h"
+
+/* Maximum number of IFUNC implementations per function (memmove: 10).  */
+#define MAX_IFUNC	10
+
+/* Fill ARRAY of MAX elements with IFUNC implementations for function
+   NAME supported on target machine and return the number of valid
+   entries.  */
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+			size_t max)
+{
+  assert (max >= MAX_IFUNC);
+
+  size_t i = 0;
+
+  /* Support sysdeps/x86_64/multiarch/memcmp.S.  */
+  IFUNC_IMPL (i, name, memcmp,
+	      IFUNC_IMPL_ADD (array, i, memcmp,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (MOVBE)),
+			      __memcmp_avx2_movbe)
+	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1),
+			      __memcmp_sse4_1)
+	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3),
+			      __memcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/memmove_chk.c.  */
+  IFUNC_IMPL (i, name, __memmove_chk,
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memmove_chk_avx512_no_vzeroupper)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memmove_chk_avx512_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memmove_chk_avx512_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memmove_chk_avx_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memmove_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memmove_chk_ssse3_back)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memmove_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+			      __memmove_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+			      __memmove_chk_sse2_unaligned_erms))
+
+  /* Support sysdeps/x86_64/multiarch/memmove.S.  */
+  IFUNC_IMPL (i, name, memmove,
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memmove_avx_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memmove_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memmove_avx512_no_vzeroupper)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memmove_avx512_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memmove_avx512_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
+			      __memmove_ssse3_back)
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
+			      __memmove_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1,
+			      __memmove_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1,
+			      __memmove_sse2_unaligned_erms))
+
+  /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
+  IFUNC_IMPL (i, name, __memset_chk,
+	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+			      __memset_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+			      __memset_chk_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memset_chk_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memset_chk_avx2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memset_chk_avx512_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memset_chk_avx512_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memset_chk_avx512_no_vzeroupper)
+	      )
+
+  /* Support sysdeps/x86_64/multiarch/memset.S.  */
+  IFUNC_IMPL (i, name, memset,
+	      IFUNC_IMPL_ADD (array, i, memset, 1,
+			      __memset_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memset, 1,
+			      __memset_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memset_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memset_avx2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memset_avx512_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memset_avx512_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memset_avx512_no_vzeroupper)
+	     )
+
+  /* Support sysdeps/x86_64/multiarch/stpncpy.S.  */
+  IFUNC_IMPL (i, name, stpncpy,
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
+			      __stpncpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
+			      __stpncpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/stpcpy.S.  */
+  IFUNC_IMPL (i, name, stpcpy,
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
+			      __stpcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S.  */
+  IFUNC_IMPL (i, name, strcasecmp,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __strcasecmp_avx)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strcasecmp_sse42)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strcasecmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S.  */
+  IFUNC_IMPL (i, name, strcasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __strcasecmp_l_avx)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strcasecmp_l_sse42)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strcasecmp_l_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
+			      __strcasecmp_l_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/strcat.S.  */
+  IFUNC_IMPL (i, name, strcat,
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
+			      __strcat_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/strchr.S.  */
+  IFUNC_IMPL (i, name, strchr,
+	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
+	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/strcmp.S.  */
+  IFUNC_IMPL (i, name, strcmp,
+	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
+			      __strcmp_sse42)
+	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3),
+			      __strcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/strcpy.S.  */
+  IFUNC_IMPL (i, name, strcpy,
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
+			      __strcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/strcspn.S.  */
+  IFUNC_IMPL (i, name, strcspn,
+	      IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2),
+			      __strcspn_sse42)
+	      IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/strncase_l.S.  */
+  IFUNC_IMPL (i, name, strncasecmp,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __strncasecmp_avx)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strncasecmp_sse42)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strncasecmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
+			      __strncasecmp_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/strncase_l.S.  */
+  IFUNC_IMPL (i, name, strncasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __strncasecmp_l_avx)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      HAS_CPU_FEATURE (SSE4_2),
+			      __strncasecmp_l_sse42)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __strncasecmp_l_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
+			      __strncasecmp_l_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/strncat.S.  */
+  IFUNC_IMPL (i, name, strncat,
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
+			      __strncat_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncat, 1,
+			      __strncat_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/strncpy.S.  */
+  IFUNC_IMPL (i, name, strncpy,
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
+			      __strncpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
+			      __strncpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/strpbrk.S.  */
+  IFUNC_IMPL (i, name, strpbrk,
+	      IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2),
+			      __strpbrk_sse42)
+	      IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2))
+
+
+  /* Support sysdeps/x86_64/multiarch/strspn.S.  */
+  IFUNC_IMPL (i, name, strspn,
+	      IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2),
+			      __strspn_sse42)
+	      IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/strstr.c.  */
+  IFUNC_IMPL (i, name, strstr,
+	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/wcscpy.S.  */
+  IFUNC_IMPL (i, name, wcscpy,
+	      IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
+			      __wcscpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
+  IFUNC_IMPL (i, name, wcsnlen,
+	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+			      HAS_CPU_FEATURE (SSE4_1),
+			      __wcsnlen_sse4_1)
+	      IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/wmemcmp.S.  */
+  IFUNC_IMPL (i, name, wmemcmp,
+	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (MOVBE)),
+			      __wmemcmp_avx2_movbe)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1),
+			      __wmemcmp_sse4_1)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3),
+			      __wmemcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/wmemset.c.  */
+  IFUNC_IMPL (i, name, wmemset,
+	      IFUNC_IMPL_ADD (array, i, wmemset, 1,
+			      __wmemset_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, wmemset,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __wmemset_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, wmemset,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __wmemset_avx512_unaligned))
+
+#ifdef SHARED
+  /* Support sysdeps/x86_64/multiarch/memcpy_chk.S.  */
+  IFUNC_IMPL (i, name, __memcpy_chk,
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memcpy_chk_avx512_no_vzeroupper)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memcpy_chk_avx512_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memcpy_chk_avx512_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memcpy_chk_avx_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memcpy_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_chk_ssse3_back)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+			      __memcpy_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+			      __memcpy_chk_sse2_unaligned_erms))
+
+  /* Support sysdeps/x86_64/multiarch/memcpy.S.  */
+  IFUNC_IMPL (i, name, memcpy,
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memcpy_avx_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memcpy_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_ssse3_back)
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memcpy_avx512_no_vzeroupper)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memcpy_avx512_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memcpy_avx512_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1,
+			      __memcpy_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms))
+
+  /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
+  IFUNC_IMPL (i, name, __mempcpy_chk,
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __mempcpy_chk_avx512_no_vzeroupper)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __mempcpy_chk_avx512_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __mempcpy_chk_avx512_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __mempcpy_chk_avx_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __mempcpy_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_chk_ssse3_back)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+			      __mempcpy_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+			      __mempcpy_chk_sse2_unaligned_erms))
+
+  /* Support sysdeps/x86_64/multiarch/mempcpy.S.  */
+  IFUNC_IMPL (i, name, mempcpy,
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __mempcpy_avx512_no_vzeroupper)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __mempcpy_avx512_unaligned)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __mempcpy_avx512_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __mempcpy_avx_unaligned)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __mempcpy_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_ssse3_back)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
+			      __mempcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+			      __mempcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+			      __mempcpy_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms))
+
+  /* Support sysdeps/x86_64/multiarch/strncmp.S.  */
+  IFUNC_IMPL (i, name, strncmp,
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2),
+			      __strncmp_sse42)
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
+			      __strncmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/wmemset_chk.c.  */
+  IFUNC_IMPL (i, name, __wmemset_chk,
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk, 1,
+			      __wmemset_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __wmemset_chk_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __wmemset_chk_avx512_unaligned))
+#endif
+
+  return i;
+}
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-wmemset.h
new file mode 100644
index 0000000000..d761985a47
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-wmemset.h
@@ -0,0 +1,42 @@
+/* Common definition for wmemset/wmemset_chk ifunc selections.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features *cpu_features = __get_cpu_features ();
+
+  /* Fall back to SSE2 unless the AVX2 path is usable and preferred.  */
+  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+      || !CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+      || !CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    return OPTIMIZE (sse2_unaligned);
+
+  /* Pick AVX512 over AVX2 when it is usable and not discouraged.  */
+  if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
+      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+    return OPTIMIZE (avx512_unaligned);
+
+  return OPTIMIZE (avx2_unaligned);
+}
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
new file mode 100644
index 0000000000..47630dd97b
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -0,0 +1,425 @@
+/* memcmp/wmemcmp optimized with AVX2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+/* memcmp/wmemcmp is implemented as:
+   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+      to avoid branches.
+   2. Use overlapping compare to avoid branch.
+   3. Use vector compare when size >= 8 bytes, for both memcmp and
+      wmemcmp.
+   4. If size is 8 * VEC_SIZE or less, unroll the loop.
+   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+      area.
+   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_avx2_movbe
+# endif
+
+# ifdef USE_AS_WMEMCMP
+#  define VPCMPEQ	vpcmpeqd
+# else
+#  define VPCMPEQ	vpcmpeqb
+# endif
+
+# ifndef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+# define VEC_SIZE 32
+# define VEC_MASK ((1 << VEC_SIZE) - 1)
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
+
+	.section .text.avx,"ax",@progbits
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %rdx
+# endif
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+
+L(last_2x_vec):
+	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+L(last_vec):
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
+	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec):
+	/* A byte or int32 is different within 16 or 32 bytes.  */
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(%rdi, %rcx), %edx
+	cmpl	(%rsi, %rcx), %edx
+L(wmemcmp_return):
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
+# else
+	movzbl	(%rdi, %rcx), %eax
+	movzbl	(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	VZEROUPPER
+	ret
+
+# ifdef USE_AS_WMEMCMP
+	.p2align 4
+L(4):
+	xorl	%eax, %eax
+	movl	(%rdi), %edx
+	cmpl	(%rsi), %edx
+	jne	L(wmemcmp_return)
+	ret
+# else
+	.p2align 4
+L(between_4_7):
+	/* Load as big endian with overlapping movbe to avoid branches.  */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	je	L(exit)
+	sbbl	%eax, %eax
+	orl	$1, %eax
+	ret
+
+	.p2align 4
+L(exit):
+	ret
+
+	.p2align 4
+L(between_2_3):
+	/* Load as big endian to avoid branches.  The first two bytes go
+	   into bits 8..23 and the last byte into bits 0..7.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	shll	$8, %eax
+	shll	$8, %ecx
+	bswap	%eax
+	bswap	%ecx
+	movb	-1(%rdi, %rdx), %al
+	movb	-1(%rsi, %rdx), %cl
+	/* Each value fits in 24 bits, so the 32-bit subtraction cannot
+	   wrap and its sign is the sign of the comparison.  */
+	subl	%ecx, %eax
+	ret
+
+	.p2align 4
+L(1):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	subl	%ecx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(zero):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(less_vec):
+# ifdef USE_AS_WMEMCMP
+	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
+	cmpb	$4, %dl
+	je	L(4)
+	jb	L(zero)
+# else
+	cmpb	$1, %dl
+	je	L(1)
+	jb	L(zero)
+	cmpb	$4, %dl
+	jb	L(between_2_3)
+	cmpb	$8, %dl
+	jb	L(between_4_7)
+# endif
+	cmpb	$16, %dl
+	jae	L(between_16_31)
+	/* It is between 8 and 15 bytes.  */
+	vmovq	(%rdi), %xmm1
+	vmovq	(%rsi), %xmm2
+	VPCMPEQ %xmm1, %xmm2, %xmm2
+	vpmovmskb %xmm2, %eax
+	subl    $0xffff, %eax
+	jnz	L(first_vec)
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-8(%rdi, %rdx), %rdi
+	leaq	-8(%rsi, %rdx), %rsi
+	vmovq	(%rdi), %xmm1
+	vmovq	(%rsi), %xmm2
+	VPCMPEQ %xmm1, %xmm2, %xmm2
+	vpmovmskb %xmm2, %eax
+	subl    $0xffff, %eax
+	jnz	L(first_vec)
+	ret
+
+	.p2align 4
+L(between_16_31):
+	/* From 16 to 31 bytes.  No branch when size == 16.  */
+	vmovdqu	(%rsi), %xmm2
+	VPCMPEQ (%rdi), %xmm2, %xmm2
+	vpmovmskb %xmm2, %eax
+	subl    $0xffff, %eax
+	jnz	L(first_vec)
+
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-16(%rdi, %rdx), %rdi
+	leaq	-16(%rsi, %rdx), %rsi
+	vmovdqu	(%rsi), %xmm2
+	VPCMPEQ (%rdi), %xmm2, %xmm2
+	vpmovmskb %xmm2, %eax
+	subl    $0xffff, %eax
+	jnz	L(first_vec)
+	ret
+
+	.p2align 4
+L(more_2x_vec):
+	/* More than 2 * VEC.  */
+	cmpq	$(VEC_SIZE * 8), %rdx
+	ja	L(more_8x_vec)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jb	L(last_4x_vec)
+
+	/* From 4 * VEC to 8 * VEC, inclusively. */
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ (%rdi), %ymm1, %ymm1
+
+	vmovdqu	VEC_SIZE(%rsi), %ymm2
+	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+
+	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+
+	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+
+	vpand	%ymm1, %ymm2, %ymm5
+	vpand	%ymm3, %ymm4, %ymm6
+	vpand	%ymm5, %ymm6, %ymm5
+
+	vpmovmskb %ymm5, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(4x_vec_end)
+
+	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ (%rdi), %ymm1, %ymm1
+
+	vmovdqu	VEC_SIZE(%rsi), %ymm2
+	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+	vpand	%ymm2, %ymm1, %ymm5
+
+	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+	vpand	%ymm3, %ymm5, %ymm5
+
+	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+	vpand	%ymm4, %ymm5, %ymm5
+
+	vpmovmskb %ymm5, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(4x_vec_end)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(more_8x_vec):
+	/* More than 8 * VEC.  Check the first VEC.  */
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	/* Align the first memory area for aligned loads in the loop.
+	   Compute how much the first memory area is misaligned.  */
+	movq	%rdi, %rcx
+	andl	$(VEC_SIZE - 1), %ecx
+	/* Get the negative of offset for alignment.  */
+	subq	$VEC_SIZE, %rcx
+	/* Adjust the second memory area.  */
+	subq	%rcx, %rsi
+	/* Adjust the first memory area which should be aligned now.  */
+	subq	%rcx, %rdi
+	/* Adjust length.  */
+	addq	%rcx, %rdx
+
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ (%rdi), %ymm1, %ymm1
+
+	vmovdqu	VEC_SIZE(%rsi), %ymm2
+	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+	vpand	%ymm2, %ymm1, %ymm5
+
+	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+	vpand	%ymm3, %ymm5, %ymm5
+
+	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+	vpand	%ymm4, %ymm5, %ymm5
+
+	vpmovmskb %ymm5, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(4x_vec_end)
+
+	addq	$(VEC_SIZE * 4), %rdi
+	addq	$(VEC_SIZE * 4), %rsi
+
+	subq	$(VEC_SIZE * 4), %rdx
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jae	L(loop_4x_vec)
+
+	/* Less than 4 * VEC.  */
+	cmpq	$VEC_SIZE, %rdx
+	jbe	L(last_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jbe	L(last_2x_vec)
+
+L(last_4x_vec):
+	/* From 2 * VEC to 4 * VEC. */
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	addq	$VEC_SIZE, %rdi
+	addq	$VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	addq	$VEC_SIZE, %rdi
+	addq	$VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(4x_vec_end):
+	vpmovmskb %ymm1, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+	vpmovmskb %ymm2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec_x1)
+	vpmovmskb %ymm3, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec_x2)
+	vpmovmskb %ymm4, %eax
+	subl	$VEC_MASK, %eax
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	VEC_SIZE(%rdi, %rcx), %edx
+	cmpl	VEC_SIZE(%rsi, %rcx), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	VZEROUPPER
+	ret
+END (MEMCMP)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-sse4.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-sse4.S
new file mode 100644
index 0000000000..771639f662
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -0,0 +1,1776 @@
+/* memcmp with SSE4.1, wmemcmp with SSE4.1
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_sse4_1
+# endif
+
+# define JMPTBL(I, B)	(I - B)	/* Table entry: offset of label I relative to table base B.  */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+  lea		TABLE(%rip), %r11;				\
+  movslq	(%r11, INDEX, SCALE), %rcx;			\
+  add		%r11, %rcx;					\
+  jmp		*%rcx;						\
+  ud2	/* Follows the indirect jump to TABLE[INDEX] + TABLE; the sequence above clobbers %r11 and %rcx.  */
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
+
+	.section .text.sse4.1,"ax",@progbits
+ENTRY (MEMCMP)
+	/* On entry %rdi and %rsi address the two buffers and %rdx holds the
+	   length: bytes for memcmp, 4-byte elements for wmemcmp.  */
+# ifdef USE_AS_WMEMCMP
+	/* Scale the element count to bytes.  RDX_LP is %edx on x32, so the
+	   shift also clears any stale upper 32 bits of %rdx.  */
+	shl	$2, %RDX_LP
+# elif defined __ILP32__
+	/* Clear the upper 32 bits: size_t is only 32 bits wide on x32, but
+	   the code below uses the full %rdx.  */
+	mov	%edx, %edx
+# endif
+	/* %xmm0 = 0 is the PTEST reference used by every vector compare.  */
+	pxor	%xmm0, %xmm0
+	cmp	$79, %rdx
+	ja	L(79bytesormore)
+# ifndef USE_AS_WMEMCMP
+	cmp	$1, %rdx
+	je	L(firstbyte)
+# endif
+	/* Short case (0..79 bytes): point both registers one past the end
+	   and dispatch on the length; the handlers use negative offsets.  */
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(firstbyte):	/* Length is exactly 1 (memcmp only).  */
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	sub	%ecx, %eax	/* Difference of the two zero-extended bytes.  */
+	ret
+# endif
+
+	.p2align 4
+L(79bytesormore):	/* Reached when the byte length is above 79.  */
+	movdqu	(%rsi), %xmm1
+	movdqu	(%rdi), %xmm2
+	pxor	%xmm1, %xmm2	/* XOR is nonzero wherever the buffers differ.  */
+	ptest	%xmm2, %xmm0	/* With %xmm0 = 0, CF is set only if %xmm2 is all zero.  */
+	jnc	L(16bytesin256)	/* Difference within the first 16 bytes.  */
+	mov	%rsi, %rcx
+	and	$-16, %rsi
+	add	$16, %rsi	/* Round %rsi up to the next 16-byte boundary.  */
+	sub	%rsi, %rcx	/* %rcx = -(bytes skipped), in [-16, -1].  */
+
+	sub	%rcx, %rdi	/* Advance %rdi by the same number of bytes.  */
+	add	%rcx, %rdx	/* Shrink the remaining length accordingly.  */
+	test	$0xf, %rdi
+	jz	L(2aligned)	/* %rdi is 16-byte aligned as well.  */
+
+	cmp	$128, %rdx
+	ja	L(128bytesormore)
+L(less128bytes):	/* 64..128 bytes remain; %rsi is 16-byte aligned, %rdi may not be.  */
+	sub	$64, %rdx	/* %rdx = bytes left after the first 64.  */
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqu	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqu	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+	cmp	$32, %rdx
+	jb	L(less32bytesin64)	/* Fewer than 32 bytes left: go straight to the table.  */
+
+	movdqu	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqu	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin64):
+	add	$64, %rdi	/* Skip the 64 bytes already compared above.  */
+	add	$64, %rsi
+	add	%rdx, %rsi	/* Point one past the end for the tail handlers.  */
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(128bytesormore):	/* More than 128 bytes remain; pick a path by size.  */
+	cmp	$512, %rdx
+	ja	L(512bytesormore)
+	cmp	$256, %rdx
+	ja	L(less512bytes)
+L(less256bytes):
+	sub	$128, %rdx	/* %rdx = bytes left after the 128 compared below.  */
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqu	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqu	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+
+	movdqu	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqu	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+
+	movdqu	96(%rdi), %xmm2
+	pxor	96(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(112bytesin256)
+
+	movdqu	112(%rdi), %xmm2
+	pxor	112(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(128bytesin256)
+
+	add	$128, %rsi
+	add	$128, %rdi
+
+	cmp	$64, %rdx
+	jae	L(less128bytes)	/* 64 or more bytes left: reuse the 64..128 path.  */
+
+	cmp	$32, %rdx
+	jb	L(less32bytesin128)
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin128):
+	add	%rdx, %rsi	/* Point one past the end for the tail handlers.  */
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(less512bytes):	/* 257..512 bytes remain: compare 256 bytes in 16-byte steps.  */
+	sub	$256, %rdx	/* %rdx = bytes left after the 256 compared below.  */
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqu	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqu	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+
+	movdqu	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqu	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+
+	movdqu	96(%rdi), %xmm2
+	pxor	96(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(112bytesin256)
+
+	movdqu	112(%rdi), %xmm2
+	pxor	112(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(128bytesin256)
+
+	movdqu	128(%rdi), %xmm2
+	pxor	128(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(144bytesin256)
+
+	movdqu	144(%rdi), %xmm2
+	pxor	144(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(160bytesin256)
+
+	movdqu	160(%rdi), %xmm2
+	pxor	160(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(176bytesin256)
+
+	movdqu	176(%rdi), %xmm2
+	pxor	176(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(192bytesin256)
+
+	movdqu	192(%rdi), %xmm2
+	pxor	192(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(208bytesin256)
+
+	movdqu	208(%rdi), %xmm2
+	pxor	208(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(224bytesin256)
+
+	movdqu	224(%rdi), %xmm2
+	pxor	224(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(240bytesin256)
+
+	movdqu	240(%rdi), %xmm2
+	pxor	240(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(256bytesin256)
+
+	add	$256, %rsi
+	add	$256, %rdi
+
+	cmp	$128, %rdx
+	jae	L(less256bytes)	/* 128 or more bytes left: reuse the 128..256 path.  */
+
+	cmp	$64, %rdx
+	jae	L(less128bytes)	/* 64 or more bytes left: reuse the 64..128 path.  */
+
+	cmp	$32, %rdx
+	jb	L(less32bytesin256)
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin256):
+	add	%rdx, %rsi	/* Point one past the end for the tail handlers.  */
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+	.p2align 4
+L(512bytesormore):	/* More than 512 bytes remain; %rdi is not 16-byte aligned on this path.  */
+# ifdef DATA_CACHE_SIZE_HALF
+	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
+# else
+	mov	__x86_data_cache_size_half(%rip), %R8_LP
+# endif
+	mov	%r8, %r9
+	shr	$1, %r8
+	add	%r9, %r8	/* %r8 = 1.5 * the value loaded above.  */
+	cmp	%r8, %rdx
+	ja	L(L2_L3_cache_unaglined)	/* Longer inputs use the prefetching loop.  */
+	sub	$64, %rdx	/* Pre-bias so the SUB in the loop borrows once fewer than 64 bytes remain.  */
+	.p2align 4
+L(64bytesormore_loop):
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm1	/* %xmm1 accumulates the OR of all four XOR results.  */
+
+	movdqu	16(%rdi), %xmm3
+	pxor	16(%rsi), %xmm3
+	por	%xmm3, %xmm1
+
+	movdqu	32(%rdi), %xmm4
+	pxor	32(%rsi), %xmm4
+	por	%xmm4, %xmm1
+
+	movdqu	48(%rdi), %xmm5
+	pxor	48(%rsi), %xmm5
+	por	%xmm5, %xmm1
+
+	ptest	%xmm1, %xmm0
+	jnc	L(64bytesormore_loop_end)	/* Some byte in this 64-byte chunk differs.  */
+	add	$64, %rsi
+	add	$64, %rdi
+	sub	$64, %rdx
+	jae	L(64bytesormore_loop)	/* Loop while the subtraction did not borrow.  */
+
+	add	$64, %rdx	/* Undo the bias: %rdx = remaining tail, 0..63 bytes.  */
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(L2_L3_cache_unaglined):	/* Same 64-byte loop as above, plus non-temporal prefetches.  */
+	sub	$64, %rdx	/* Pre-bias so the SUB in the loop borrows once fewer than 64 bytes remain.  */
+	.p2align 4
+L(L2_L3_unaligned_128bytes_loop):
+	prefetchnta 0x1c0(%rdi)	/* Prefetch 0x1c0 (448) bytes ahead in each buffer.  */
+	prefetchnta 0x1c0(%rsi)
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm1	/* %xmm1 accumulates the OR of all four XOR results.  */
+
+	movdqu	16(%rdi), %xmm3
+	pxor	16(%rsi), %xmm3
+	por	%xmm3, %xmm1
+
+	movdqu	32(%rdi), %xmm4
+	pxor	32(%rsi), %xmm4
+	por	%xmm4, %xmm1
+
+	movdqu	48(%rdi), %xmm5
+	pxor	48(%rsi), %xmm5
+	por	%xmm5, %xmm1
+
+	ptest	%xmm1, %xmm0
+	jnc	L(64bytesormore_loop_end)	/* Some byte in this 64-byte chunk differs.  */
+	add	$64, %rsi
+	add	$64, %rdi
+	sub	$64, %rdx
+	jae	L(L2_L3_unaligned_128bytes_loop)	/* Loop while the subtraction did not borrow.  */
+
+	add	$64, %rdx	/* Undo the bias: %rdx = remaining tail, 0..63 bytes.  */
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+/*
+ * This case is for machines that are sensitive to unaligned memory accesses.
+ */
+	.p2align 4
+L(2aligned):	/* Both %rdi and %rsi are 16-byte aligned, so movdqa is used.  */
+	cmp	$128, %rdx
+	ja	L(128bytesormorein2aligned)
+L(less128bytesin2aligned):
+	sub	$64, %rdx	/* %rdx = bytes left after the first 64.  */
+
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqa	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqa	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqa	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+	cmp	$32, %rdx
+	jb	L(less32bytesin64in2alinged)	/* Fewer than 32 bytes left: go straight to the table.  */
+
+	movdqa	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqa	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin64in2alinged):
+	add	$64, %rdi	/* Skip the 64 bytes already compared above.  */
+	add	$64, %rsi
+	add	%rdx, %rsi	/* Point one past the end for the tail handlers.  */
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+	.p2align 4
+L(128bytesormorein2aligned):	/* Aligned path, more than 128 bytes remain; pick a path by size.  */
+	cmp	$512, %rdx
+	ja	L(512bytesormorein2aligned)
+	cmp	$256, %rdx
+	ja	L(256bytesormorein2aligned)
+L(less256bytesin2alinged):
+	sub	$128, %rdx	/* %rdx = bytes left after the 128 compared below.  */
+
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqa	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqa	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqa	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+
+	movdqa	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqa	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+
+	movdqa	96(%rdi), %xmm2
+	pxor	96(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(112bytesin256)
+
+	movdqa	112(%rdi), %xmm2
+	pxor	112(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(128bytesin256)
+
+	add	$128, %rsi
+	add	$128, %rdi
+
+	cmp	$64, %rdx
+	jae	L(less128bytesin2aligned)	/* 64 or more bytes left: reuse the aligned 64..128 path.  */
+
+	cmp	$32, %rdx
+	jb	L(less32bytesin128in2aligned)
+
+	movdqu	(%rdi), %xmm2	/* NOTE(review): movdqu although %rdi is still 16-byte aligned here; harmless.  */
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin128in2aligned):
+	add	%rdx, %rsi	/* Point one past the end for the tail handlers.  */
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+	.p2align 4
+L(256bytesormorein2aligned):	/* Aligned path, 257..512 bytes remain: compare 256 bytes in 16-byte steps.  */
+
+	sub	$256, %rdx	/* %rdx = bytes left after the 256 compared below.  */
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqa	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqa	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqa	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+
+	movdqa	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqa	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+
+	movdqa	96(%rdi), %xmm2
+	pxor	96(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(112bytesin256)
+
+	movdqa	112(%rdi), %xmm2
+	pxor	112(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(128bytesin256)
+
+	movdqa	128(%rdi), %xmm2
+	pxor	128(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(144bytesin256)
+
+	movdqa	144(%rdi), %xmm2
+	pxor	144(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(160bytesin256)
+
+	movdqa	160(%rdi), %xmm2
+	pxor	160(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(176bytesin256)
+
+	movdqa	176(%rdi), %xmm2
+	pxor	176(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(192bytesin256)
+
+	movdqa	192(%rdi), %xmm2
+	pxor	192(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(208bytesin256)
+
+	movdqa	208(%rdi), %xmm2
+	pxor	208(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(224bytesin256)
+
+	movdqa	224(%rdi), %xmm2
+	pxor	224(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(240bytesin256)
+
+	movdqa	240(%rdi), %xmm2
+	pxor	240(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(256bytesin256)
+
+	add	$256, %rsi
+	add	$256, %rdi
+
+	cmp	$128, %rdx
+	jae	L(less256bytesin2alinged)	/* 128 or more bytes left: reuse the aligned 128..256 path.  */
+
+	cmp	$64, %rdx
+	jae	L(less128bytesin2aligned)	/* 64 or more bytes left: reuse the aligned 64..128 path.  */
+
+	cmp	$32, %rdx
+	jb	L(less32bytesin256in2alinged)
+
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqa	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin256in2alinged):
+	add	%rdx, %rsi	/* Point one past the end for the tail handlers.  */
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+	.p2align 4
+L(512bytesormorein2aligned):	/* Aligned path, more than 512 bytes remain.  */
+# ifdef DATA_CACHE_SIZE_HALF
+	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
+# else
+	mov	__x86_data_cache_size_half(%rip), %R8_LP
+# endif
+	mov	%r8, %r9
+	shr	$1, %r8
+	add	%r9, %r8	/* %r8 = 1.5 * the value loaded above.  */
+	cmp	%r8, %rdx
+	ja	L(L2_L3_cache_aglined)	/* Longer inputs use the prefetching loop.  */
+
+	sub	$64, %rdx	/* Pre-bias so the SUB in the loop borrows once fewer than 64 bytes remain.  */
+	.p2align 4
+L(64bytesormore_loopin2aligned):
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm1	/* %xmm1 accumulates the OR of all four XOR results.  */
+
+	movdqa	16(%rdi), %xmm3
+	pxor	16(%rsi), %xmm3
+	por	%xmm3, %xmm1
+
+	movdqa	32(%rdi), %xmm4
+	pxor	32(%rsi), %xmm4
+	por	%xmm4, %xmm1
+
+	movdqa	48(%rdi), %xmm5
+	pxor	48(%rsi), %xmm5
+	por	%xmm5, %xmm1
+
+	ptest	%xmm1, %xmm0
+	jnc	L(64bytesormore_loop_end)	/* Some byte in this 64-byte chunk differs.  */
+	add	$64, %rsi
+	add	$64, %rdi
+	sub	$64, %rdx
+	jae	L(64bytesormore_loopin2aligned)	/* Loop while the subtraction did not borrow.  */
+
+	add	$64, %rdx	/* Undo the bias: %rdx = remaining tail, 0..63 bytes.  */
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+L(L2_L3_cache_aglined):	/* Aligned 64-byte loop with non-temporal prefetches.  */
+	sub	$64, %rdx	/* Pre-bias so the SUB in the loop borrows once fewer than 64 bytes remain.  */
+
+	.p2align 4
+L(L2_L3_aligned_128bytes_loop):
+	prefetchnta 0x1c0(%rdi)	/* Prefetch 0x1c0 (448) bytes ahead in each buffer.  */
+	prefetchnta 0x1c0(%rsi)
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm1	/* %xmm1 accumulates the OR of all four XOR results.  */
+
+	movdqa	16(%rdi), %xmm3
+	pxor	16(%rsi), %xmm3
+	por	%xmm3, %xmm1
+
+	movdqa	32(%rdi), %xmm4
+	pxor	32(%rsi), %xmm4
+	por	%xmm4, %xmm1
+
+	movdqa	48(%rdi), %xmm5
+	pxor	48(%rsi), %xmm5
+	por	%xmm5, %xmm1
+
+	ptest	%xmm1, %xmm0
+	jnc	L(64bytesormore_loop_end)	/* Some byte in this 64-byte chunk differs.  */
+	add	$64, %rsi
+	add	$64, %rdi
+	sub	$64, %rdx
+	jae	L(L2_L3_aligned_128bytes_loop)	/* Loop while the subtraction did not borrow.  */
+
+	add	$64, %rdx	/* Undo the bias: %rdx = remaining tail, 0..63 bytes.  */
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+
+	.p2align 4
+L(64bytesormore_loop_end):	/* A 64-byte chunk differs; %xmm2..%xmm5 hold the XOR of its four 16-byte quarters.  */
+	add	$16, %rdi	/* Step past each quarter so L(16bytes) can address it at -16.  */
+	add	$16, %rsi
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytes)
+
+	add	$16, %rdi
+	add	$16, %rsi
+	ptest	%xmm3, %xmm0
+	jnc	L(16bytes)
+
+	add	$16, %rdi
+	add	$16, %rsi
+	ptest	%xmm4, %xmm0
+	jnc	L(16bytes)
+
+	add	$16, %rdi	/* The first three quarters matched, so the fourth must differ.  */
+	add	$16, %rsi
+	jmp	L(16bytes)
+
+L(256bytesin256):	/* L(NNbytesin256): advance both pointers by NN so the differing 16 bytes sit at -16.  */
+	add	$256, %rdi
+	add	$256, %rsi
+	jmp	L(16bytes)
+L(240bytesin256):
+	add	$240, %rdi
+	add	$240, %rsi
+	jmp	L(16bytes)
+L(224bytesin256):
+	add	$224, %rdi
+	add	$224, %rsi
+	jmp	L(16bytes)
+L(208bytesin256):
+	add	$208, %rdi
+	add	$208, %rsi
+	jmp	L(16bytes)
+L(192bytesin256):
+	add	$192, %rdi
+	add	$192, %rsi
+	jmp	L(16bytes)
+L(176bytesin256):
+	add	$176, %rdi
+	add	$176, %rsi
+	jmp	L(16bytes)
+L(160bytesin256):
+	add	$160, %rdi
+	add	$160, %rsi
+	jmp	L(16bytes)
+L(144bytesin256):
+	add	$144, %rdi
+	add	$144, %rsi
+	jmp	L(16bytes)
+L(128bytesin256):
+	add	$128, %rdi
+	add	$128, %rsi
+	jmp	L(16bytes)
+L(112bytesin256):
+	add	$112, %rdi
+	add	$112, %rsi
+	jmp	L(16bytes)
+L(96bytesin256):
+	add	$96, %rdi
+	add	$96, %rsi
+	jmp	L(16bytes)
+L(80bytesin256):
+	add	$80, %rdi
+	add	$80, %rsi
+	jmp	L(16bytes)
+L(64bytesin256):
+	add	$64, %rdi
+	add	$64, %rsi
+	jmp	L(16bytes)
+L(48bytesin256):	/* Falls through: 16 + 16 + 16.  */
+	add	$16, %rdi
+	add	$16, %rsi
+L(32bytesin256):
+	add	$16, %rdi
+	add	$16, %rsi
+L(16bytesin256):
+	add	$16, %rdi
+	add	$16, %rsi
+L(16bytes):	/* Compare the 16 bytes ending at %rdi/%rsi as two quadwords.  */
+	mov	-16(%rdi), %rax
+	mov	-16(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(8bytes):
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax	/* All bytes equal: return 0.  */
+	ret
+
+	.p2align 4
+L(12bytes):	/* Tail handlers: %rdi/%rsi point one past the end, so offsets are negative.  */
+	mov	-12(%rdi), %rax
+	mov	-12(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(4bytes):
+	mov	-4(%rsi), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%rdi), %eax
+	cmp	%eax, %ecx
+# else
+	cmp	-4(%rdi), %ecx	/* wmemcmp: L(diffin4bytes) only needs the flags, not %eax.  */
+# endif
+	jne	L(diffin4bytes)
+L(0bytes):
+	xor	%eax, %eax	/* All bytes equal: return 0.  */
+	ret
+
+# ifndef USE_AS_WMEMCMP
+/* unreal case for wmemcmp */
+	.p2align 4
+L(65bytes):	/* Lengths 65/49/33/17/9: fall through in 16-byte steps, then 8 + 8 + 1 bytes.  */
+	movdqu	-65(%rdi), %xmm1
+	movdqu	-65(%rsi), %xmm2
+	mov	$-65, %dl	/* Offset of this 16-byte chunk, for L(less16bytes).  */
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(49bytes):
+	movdqu	-49(%rdi), %xmm1
+	movdqu	-49(%rsi), %xmm2
+	mov	$-49, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(33bytes):
+	movdqu	-33(%rdi), %xmm1
+	movdqu	-33(%rsi), %xmm2
+	mov	$-33, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(17bytes):
+	mov	-17(%rdi), %rax
+	mov	-17(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(9bytes):
+	mov	-9(%rdi), %rax
+	mov	-9(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	movzbl	-1(%rdi), %eax	/* Only the last byte is left.  */
+	movzbl	-1(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	.p2align 4
+L(13bytes):	/* Two overlapping quadword compares cover 13 bytes.  */
+	mov	-13(%rdi), %rax
+	mov	-13(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax	/* All bytes equal: return 0.  */
+	ret
+
+	.p2align 4
+L(5bytes):	/* One dword compare, then the last byte.  */
+	mov	-5(%rdi), %eax
+	mov	-5(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	movzbl	-1(%rdi), %eax
+	movzbl	-1(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	.p2align 4
+L(66bytes):	/* Lengths 66/50/34/18/10: fall through in 16-byte steps, then 8 + 8 + 2 bytes.  */
+	movdqu	-66(%rdi), %xmm1
+	movdqu	-66(%rsi), %xmm2
+	mov	$-66, %dl	/* Offset of this 16-byte chunk, for L(less16bytes).  */
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(50bytes):
+	movdqu	-50(%rdi), %xmm1
+	movdqu	-50(%rsi), %xmm2
+	mov	$-50, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(34bytes):
+	movdqu	-34(%rdi), %xmm1
+	movdqu	-34(%rsi), %xmm2
+	mov	$-34, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(18bytes):
+	mov	-18(%rdi), %rax
+	mov	-18(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(10bytes):
+	mov	-10(%rdi), %rax
+	mov	-10(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	movzwl	-2(%rdi), %eax
+	movzwl	-2(%rsi), %ecx
+	cmp	%cl, %al	/* The low byte is the lower address, so it decides first.  */
+	jne	L(end)
+	and	$0xffff, %eax
+	and	$0xffff, %ecx
+	sub	%ecx, %eax	/* Low bytes equal: the sign comes from the high bytes.  */
+	ret
+
+	.p2align 4
+L(14bytes):	/* Two overlapping quadword compares cover 14 bytes.  */
+	mov	-14(%rdi), %rax
+	mov	-14(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax	/* All bytes equal: return 0.  */
+	ret
+
+	.p2align 4
+L(6bytes):	/* One dword compare, then the last two bytes.  */
+	mov	-6(%rdi), %eax
+	mov	-6(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+L(2bytes):
+	movzwl	-2(%rsi), %ecx
+	movzwl	-2(%rdi), %eax
+	cmp	%cl, %al	/* The low byte is the lower address, so it decides first.  */
+	jne	L(end)
+	and	$0xffff, %eax
+	and	$0xffff, %ecx
+	sub	%ecx, %eax	/* Low bytes equal: the sign comes from the high bytes.  */
+	ret
+
+	.p2align 4
+L(67bytes):	/* Lengths 67/51/35/19/11: fall through in 16-byte steps, then 8 + 8 + 4 (overlapping) bytes.  */
+	movdqu	-67(%rdi), %xmm2
+	movdqu	-67(%rsi), %xmm1
+	mov	$-67, %dl	/* Offset of this 16-byte chunk, for L(less16bytes).  */
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(51bytes):
+	movdqu	-51(%rdi), %xmm2
+	movdqu	-51(%rsi), %xmm1
+	mov	$-51, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(35bytes):
+	movdqu	-35(%rsi), %xmm1
+	movdqu	-35(%rdi), %xmm2
+	mov	$-35, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(19bytes):
+	mov	-19(%rdi), %rax
+	mov	-19(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(11bytes):
+	mov	-11(%rdi), %rax
+	mov	-11(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-4(%rdi), %eax
+	mov	-4(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax	/* All bytes equal: return 0.  */
+	ret
+
+	.p2align 4
+L(15bytes):	/* Two overlapping quadword compares cover 15 bytes.  */
+	mov	-15(%rdi), %rax
+	mov	-15(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax	/* All bytes equal: return 0.  */
+	ret
+
+	.p2align 4
+L(7bytes):	/* Two overlapping dword compares cover 7 bytes.  */
+	mov	-7(%rdi), %eax
+	mov	-7(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	mov	-4(%rdi), %eax
+	mov	-4(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax	/* All bytes equal: return 0.  */
+	ret
+
+	.p2align 4
+L(3bytes):	/* One word compare, then the last byte.  */
+	movzwl	-3(%rdi), %eax
+	movzwl	-3(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin2bytes)
+L(1bytes):
+	movzbl	-1(%rdi), %eax
+	movzbl	-1(%rsi), %ecx
+	sub	%ecx, %eax	/* Difference of the two zero-extended bytes.  */
+	ret
+# endif
+
+	.p2align 4
+L(68bytes):	/* Lengths 68/52/36/20: fall through in 16-byte steps, then the last 4 bytes.  */
+	movdqu	-68(%rdi), %xmm2
+	movdqu	-68(%rsi), %xmm1
+	mov	$-68, %dl	/* Offset of this 16-byte chunk, for L(less16bytes).  */
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(52bytes):
+	movdqu	-52(%rdi), %xmm2
+	movdqu	-52(%rsi), %xmm1
+	mov	$-52, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(36bytes):
+	movdqu	-36(%rdi), %xmm2
+	movdqu	-36(%rsi), %xmm1
+	mov	$-36, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(20bytes):
+	movdqu	-20(%rdi), %xmm2
+	movdqu	-20(%rsi), %xmm1
+	mov	$-20, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-4(%rsi), %ecx
+
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%rdi), %eax
+	cmp	%eax, %ecx
+# else
+	cmp	-4(%rdi), %ecx	/* wmemcmp: L(diffin4bytes) only needs the flags, not %eax.  */
+# endif
+	jne	L(diffin4bytes)
+	xor	%eax, %eax	/* All bytes equal: return 0.  */
+	ret
+
+# ifndef USE_AS_WMEMCMP
+/* unreal cases for wmemcmp */
+	.p2align 4
+L(69bytes):	/* Lengths 69/53/37/21: fall through in 16-byte steps, then an overlapping last 8 bytes.  */
+	movdqu	-69(%rsi), %xmm1
+	movdqu	-69(%rdi), %xmm2
+	mov	$-69, %dl	/* Offset of this 16-byte chunk, for L(less16bytes).  */
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(53bytes):
+	movdqu	-53(%rsi), %xmm1
+	movdqu	-53(%rdi), %xmm2
+	mov	$-53, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(37bytes):
+	movdqu	-37(%rsi), %xmm1
+	movdqu	-37(%rdi), %xmm2
+	mov	$-37, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(21bytes):
+	movdqu	-21(%rsi), %xmm1
+	movdqu	-21(%rdi), %xmm2
+	mov	$-21, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax	/* All bytes equal: return 0.  */
+	ret
+
+	.p2align 4
+L(70bytes):	/* Lengths 70/54/38/22: fall through in 16-byte steps, then an overlapping last 8 bytes.  */
+	movdqu	-70(%rsi), %xmm1
+	movdqu	-70(%rdi), %xmm2
+	mov	$-70, %dl	/* Offset of this 16-byte chunk, for L(less16bytes).  */
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(54bytes):
+	movdqu	-54(%rsi), %xmm1
+	movdqu	-54(%rdi), %xmm2
+	mov	$-54, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(38bytes):
+	movdqu	-38(%rsi), %xmm1
+	movdqu	-38(%rdi), %xmm2
+	mov	$-38, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(22bytes):
+	movdqu	-22(%rsi), %xmm1
+	movdqu	-22(%rdi), %xmm2
+	mov	$-22, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax	/* All bytes equal: return 0.  */
+	ret
+
+	.p2align 4
+L(71bytes):	/* Lengths 71/55/39/23: fall through in 16-byte steps, then an overlapping last 8 bytes.  */
+	movdqu	-71(%rsi), %xmm1
+	movdqu	-71(%rdi), %xmm2
+	mov	$-71, %dl	/* Offset of this 16-byte chunk, for L(less16bytes).  */
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(55bytes):
+	movdqu	-55(%rdi), %xmm2
+	movdqu	-55(%rsi), %xmm1
+	mov	$-55, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(39bytes):
+	movdqu	-39(%rdi), %xmm2
+	movdqu	-39(%rsi), %xmm1
+	mov	$-39, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(23bytes):
+	movdqu	-23(%rdi), %xmm2
+	movdqu	-23(%rsi), %xmm1
+	mov	$-23, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax	/* All bytes equal: return 0.  */
+	ret
+# endif
+
+	.p2align 4
+L(72bytes):	/* Lengths 72/56/40/24: fall through in 16-byte steps, then the last 8 bytes.  */
+	movdqu	-72(%rsi), %xmm1
+	movdqu	-72(%rdi), %xmm2
+	mov	$-72, %dl	/* Offset of this 16-byte chunk, for L(less16bytes).  */
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(56bytes):
+	movdqu	-56(%rdi), %xmm2
+	movdqu	-56(%rsi), %xmm1
+	mov	$-56, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(40bytes):
+	movdqu	-40(%rdi), %xmm2
+	movdqu	-40(%rsi), %xmm1
+	mov	$-40, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(24bytes):
+	movdqu	-24(%rdi), %xmm2
+	movdqu	-24(%rsi), %xmm1
+	mov	$-24, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-8(%rsi), %rcx
+	mov	-8(%rdi), %rax
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax	/* All bytes equal: return 0.  */
+	ret
+
+# ifndef USE_AS_WMEMCMP
+/* unreal cases for wmemcmp */
+	.p2align 4
+L(73bytes):	/* Lengths 73/57/41/25: fall through in 16-byte steps, then 8 + 1 bytes.  */
+	movdqu	-73(%rsi), %xmm1
+	movdqu	-73(%rdi), %xmm2
+	mov	$-73, %dl	/* Offset of this 16-byte chunk, for L(less16bytes).  */
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(57bytes):
+	movdqu	-57(%rdi), %xmm2
+	movdqu	-57(%rsi), %xmm1
+	mov	$-57, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(41bytes):
+	movdqu	-41(%rdi), %xmm2
+	movdqu	-41(%rsi), %xmm1
+	mov	$-41, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(25bytes):
+	movdqu	-25(%rdi), %xmm2
+	movdqu	-25(%rsi), %xmm1
+	mov	$-25, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-9(%rdi), %rax
+	mov	-9(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	movzbl	-1(%rdi), %eax	/* Only the last byte is left.  */
+	movzbl	-1(%rsi), %ecx
+	sub	%ecx, %eax
+	ret
+
+	.p2align 4
+L(74bytes):	/* Lengths 74/58/42/26: fall through in 16-byte steps, then 8 + 2 bytes.  */
+	movdqu	-74(%rsi), %xmm1
+	movdqu	-74(%rdi), %xmm2
+	mov	$-74, %dl	/* Offset of this 16-byte chunk, for L(less16bytes).  */
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(58bytes):
+	movdqu	-58(%rdi), %xmm2
+	movdqu	-58(%rsi), %xmm1
+	mov	$-58, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(42bytes):
+	movdqu	-42(%rdi), %xmm2
+	movdqu	-42(%rsi), %xmm1
+	mov	$-42, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(26bytes):
+	movdqu	-26(%rdi), %xmm2
+	movdqu	-26(%rsi), %xmm1
+	mov	$-26, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-10(%rdi), %rax
+	mov	-10(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	movzwl	-2(%rdi), %eax
+	movzwl	-2(%rsi), %ecx
+	jmp	L(diffin2bytes)	/* Yields 0 when the two words are equal.  */
+
+	.p2align 4
+L(75bytes):	/* Lengths 75/59/43/27: fall through in 16-byte steps, then 8 + 4 (overlapping) bytes.  */
+	movdqu	-75(%rsi), %xmm1
+	movdqu	-75(%rdi), %xmm2
+	mov	$-75, %dl	/* Offset of this 16-byte chunk, for L(less16bytes).  */
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(59bytes):
+	movdqu	-59(%rdi), %xmm2
+	movdqu	-59(%rsi), %xmm1
+	mov	$-59, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(43bytes):
+	movdqu	-43(%rdi), %xmm2
+	movdqu	-43(%rsi), %xmm1
+	mov	$-43, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(27bytes):
+	movdqu	-27(%rdi), %xmm2
+	movdqu	-27(%rsi), %xmm1
+	mov	$-27, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-11(%rdi), %rax
+	mov	-11(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-4(%rdi), %eax
+	mov	-4(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax	/* All bytes equal: return 0.  */
+	ret
+# endif
+	.p2align 4
+L(76bytes):	/* Lengths 76/60/44/28: fall through in 16-byte steps, then 8 + 4 bytes.  */
+	movdqu	-76(%rsi), %xmm1
+	movdqu	-76(%rdi), %xmm2
+	mov	$-76, %dl	/* Offset of this 16-byte chunk, for L(less16bytes).  */
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(60bytes):
+	movdqu	-60(%rdi), %xmm2
+	movdqu	-60(%rsi), %xmm1
+	mov	$-60, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(44bytes):
+	movdqu	-44(%rdi), %xmm2
+	movdqu	-44(%rsi), %xmm1
+	mov	$-44, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(28bytes):
+	movdqu	-28(%rdi), %xmm2
+	movdqu	-28(%rsi), %xmm1
+	mov	$-28, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-12(%rdi), %rax
+	mov	-12(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-4(%rsi), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%rdi), %eax
+	cmp	%eax, %ecx
+# else
+	cmp	-4(%rdi), %ecx	/* wmemcmp: L(diffin4bytes) only needs the flags, not %eax.  */
+# endif
+	jne	L(diffin4bytes)
+	xor	%eax, %eax	/* All bytes equal: return 0.  */
+	ret
+
+# ifndef USE_AS_WMEMCMP
+/* unreal cases for wmemcmp */
+	.p2align 4
+L(77bytes):	/* Lengths 77/61/45/29: fall through in 16-byte steps, then two overlapping quadwords.  */
+	movdqu	-77(%rsi), %xmm1
+	movdqu	-77(%rdi), %xmm2
+	mov	$-77, %dl	/* Offset of this 16-byte chunk, for L(less16bytes).  */
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(61bytes):
+	movdqu	-61(%rdi), %xmm2
+	movdqu	-61(%rsi), %xmm1
+	mov	$-61, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(45bytes):
+	movdqu	-45(%rdi), %xmm2
+	movdqu	-45(%rsi), %xmm1
+	mov	$-45, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(29bytes):
+	movdqu	-29(%rdi), %xmm2
+	movdqu	-29(%rsi), %xmm1
+	mov	$-29, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-13(%rdi), %rax
+	mov	-13(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax	/* All bytes equal: return 0.  */
+	ret
+
+	.p2align 4
+L(78bytes):	/* Lengths 78/62/46/30: fall through in 16-byte steps, then two overlapping quadwords.  */
+	movdqu	-78(%rsi), %xmm1
+	movdqu	-78(%rdi), %xmm2
+	mov	$-78, %dl	/* Offset of this 16-byte chunk, for L(less16bytes).  */
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(62bytes):
+	movdqu	-62(%rdi), %xmm2
+	movdqu	-62(%rsi), %xmm1
+	mov	$-62, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(46bytes):
+	movdqu	-46(%rdi), %xmm2
+	movdqu	-46(%rsi), %xmm1
+	mov	$-46, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(30bytes):
+	movdqu	-30(%rdi), %xmm2
+	movdqu	-30(%rsi), %xmm1
+	mov	$-30, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-14(%rdi), %rax
+	mov	-14(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax	/* All bytes equal: return 0.  */
+	ret
+
+	.p2align 4
+L(79bytes):	/* Lengths 79/63/47/31: fall through in 16-byte steps, then two overlapping quadwords.  */
+	movdqu	-79(%rsi), %xmm1
+	movdqu	-79(%rdi), %xmm2
+	mov	$-79, %dl	/* Offset of this 16-byte chunk, for L(less16bytes).  */
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(63bytes):
+	movdqu	-63(%rdi), %xmm2
+	movdqu	-63(%rsi), %xmm1
+	mov	$-63, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(47bytes):
+	movdqu	-47(%rdi), %xmm2
+	movdqu	-47(%rsi), %xmm1
+	mov	$-47, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(31bytes):
+	movdqu	-31(%rdi), %xmm2
+	movdqu	-31(%rsi), %xmm1
+	mov	$-31, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-15(%rdi), %rax
+	mov	-15(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax	/* All bytes equal: return 0.  */
+	ret
+# endif
+	.p2align 4
+L(64bytes):	/* Lengths 64/48/32: fall through in 16-byte steps, then the last two quadwords.  */
+	movdqu	-64(%rdi), %xmm2
+	movdqu	-64(%rsi), %xmm1
+	mov	$-64, %dl	/* Offset of this 16-byte chunk, for L(less16bytes).  */
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(48bytes):
+	movdqu	-48(%rdi), %xmm2
+	movdqu	-48(%rsi), %xmm1
+	mov	$-48, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(32bytes):
+	movdqu	-32(%rdi), %xmm2
+	movdqu	-32(%rsi), %xmm1
+	mov	$-32, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-16(%rdi), %rax
+	mov	-16(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax	/* All bytes equal: return 0.  */
+	ret
+
+/*
+ * Aligned to 8 bytes to avoid two taken branches in one 16-byte aligned code block.
+ */
+	.p2align 3
+L(less16bytes):	/* The 16-byte chunk at offset %dl (negative) differs; find the quadword.  */
+	movsbq	%dl, %rdx	/* Sign-extend the 8-bit offset set by the caller.  */
+	mov	(%rsi, %rdx), %rcx
+	mov	(%rdi, %rdx), %rax
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	8(%rsi, %rdx), %rcx	/* First quadword matched, so the second one holds the difference.  */
+	mov	8(%rdi, %rdx), %rax
+L(diffin8bytes):	/* %rax holds 8 bytes from %rdi's buffer, %rcx the 8 bytes from %rsi's.  */
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	shr	$32, %rcx	/* Low dwords equal: look at the high dwords.  */
+	shr	$32, %rax
+
+# ifdef USE_AS_WMEMCMP
+/* wmemcmp: compare the high dwords explicitly so the flags are set for L(diffin4bytes).  */
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
+# endif
+
+L(diffin4bytes):
+# ifndef USE_AS_WMEMCMP
+	cmp	%cx, %ax
+	jne	L(diffin2bytes)
+	shr	$16, %ecx	/* Low words equal: look at the high words.  */
+	shr	$16, %eax
+L(diffin2bytes):
+	cmp	%cl, %al	/* The low byte is the lower address, so it decides first.  */
+	jne	L(end)
+	and	$0xffff, %eax
+	and	$0xffff, %ecx
+	sub	%ecx, %eax	/* Low bytes equal: the sign comes from the high bytes.  */
+	ret
+
+	.p2align 4
+L(end):
+	and	$0xff, %eax
+	and	$0xff, %ecx
+	sub	%ecx, %eax	/* Difference of the first mismatching bytes, as unsigned values.  */
+	ret
+# else
+
+/* wmemcmp: return 1 or -1 from the signed flags of the CMP that branched here.  */
+	mov	$1, %eax	/* MOV leaves the flags untouched.  */
+	jl	L(nequal_bigger)	/* Taken when %rsi's element < %rdi's element (signed): return 1.  */
+	neg	%eax	/* Otherwise return -1.  */
+	ret
+
+	.p2align 4
+L(nequal_bigger):
+	ret
+
+L(unreal_case):	/* Not referenced in the visible part of this file; returns 0.  */
+	xor	%eax, %eax
+	ret
+# endif
+
+END (MEMCMP)
+
+	.section .rodata.sse4.1,"a",@progbits
+	.p2align 3
+# ifndef USE_AS_WMEMCMP
+L(table_64bytes):	/* memcmp variant: 80 entries, entry N refers to L(Nbytes) for N = 0..79.  */
+	.int	JMPTBL (L(0bytes), L(table_64bytes))	/* JMPTBL is defined earlier in the file; presumably a table-relative offset -- confirm */
+	.int	JMPTBL (L(1bytes), L(table_64bytes))
+	.int	JMPTBL (L(2bytes), L(table_64bytes))
+	.int	JMPTBL (L(3bytes), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(5bytes), L(table_64bytes))
+	.int	JMPTBL (L(6bytes), L(table_64bytes))
+	.int	JMPTBL (L(7bytes), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(9bytes), L(table_64bytes))
+	.int	JMPTBL (L(10bytes), L(table_64bytes))
+	.int	JMPTBL (L(11bytes), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(13bytes), L(table_64bytes))
+	.int	JMPTBL (L(14bytes), L(table_64bytes))
+	.int	JMPTBL (L(15bytes), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(17bytes), L(table_64bytes))
+	.int	JMPTBL (L(18bytes), L(table_64bytes))
+	.int	JMPTBL (L(19bytes), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(21bytes), L(table_64bytes))
+	.int	JMPTBL (L(22bytes), L(table_64bytes))
+	.int	JMPTBL (L(23bytes), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(25bytes), L(table_64bytes))
+	.int	JMPTBL (L(26bytes), L(table_64bytes))
+	.int	JMPTBL (L(27bytes), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(29bytes), L(table_64bytes))
+	.int	JMPTBL (L(30bytes), L(table_64bytes))
+	.int	JMPTBL (L(31bytes), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(33bytes), L(table_64bytes))
+	.int	JMPTBL (L(34bytes), L(table_64bytes))
+	.int	JMPTBL (L(35bytes), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(37bytes), L(table_64bytes))
+	.int	JMPTBL (L(38bytes), L(table_64bytes))
+	.int	JMPTBL (L(39bytes), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(41bytes), L(table_64bytes))
+	.int	JMPTBL (L(42bytes), L(table_64bytes))
+	.int	JMPTBL (L(43bytes), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(45bytes), L(table_64bytes))
+	.int	JMPTBL (L(46bytes), L(table_64bytes))
+	.int	JMPTBL (L(47bytes), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(49bytes), L(table_64bytes))
+	.int	JMPTBL (L(50bytes), L(table_64bytes))
+	.int	JMPTBL (L(51bytes), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(53bytes), L(table_64bytes))
+	.int	JMPTBL (L(54bytes), L(table_64bytes))
+	.int	JMPTBL (L(55bytes), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(57bytes), L(table_64bytes))
+	.int	JMPTBL (L(58bytes), L(table_64bytes))
+	.int	JMPTBL (L(59bytes), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(61bytes), L(table_64bytes))
+	.int	JMPTBL (L(62bytes), L(table_64bytes))
+	.int	JMPTBL (L(63bytes), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+	.int	JMPTBL (L(65bytes), L(table_64bytes))
+	.int	JMPTBL (L(66bytes), L(table_64bytes))
+	.int	JMPTBL (L(67bytes), L(table_64bytes))
+	.int	JMPTBL (L(68bytes), L(table_64bytes))
+	.int	JMPTBL (L(69bytes), L(table_64bytes))
+	.int	JMPTBL (L(70bytes), L(table_64bytes))
+	.int	JMPTBL (L(71bytes), L(table_64bytes))
+	.int	JMPTBL (L(72bytes), L(table_64bytes))
+	.int	JMPTBL (L(73bytes), L(table_64bytes))
+	.int	JMPTBL (L(74bytes), L(table_64bytes))
+	.int	JMPTBL (L(75bytes), L(table_64bytes))
+	.int	JMPTBL (L(76bytes), L(table_64bytes))
+	.int	JMPTBL (L(77bytes), L(table_64bytes))
+	.int	JMPTBL (L(78bytes), L(table_64bytes))
+	.int	JMPTBL (L(79bytes), L(table_64bytes))	/* last entry: the table covers 0..79 despite the "64bytes" name */
+# else
+L(table_64bytes):	/* wmemcmp variant: 80 entries; only multiples of 4 have a real handler.  */
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))	/* L(unreal_case) just returns 0 */
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(68bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(72bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(76bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-ssse3.S
new file mode 100644
index 0000000000..8d7d2fe67b
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-ssse3.S
@@ -0,0 +1,1990 @@
+/* memcmp with SSSE3, wmemcmp with SSSE3
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_ssse3
+# endif
+
+/* Warning!
+	   wmemcmp has to use SIGNED comparison for elements.
+	   memcmp has to use UNSIGNED comparison for elements.
+*/
+
+	atom_text_section
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %rdx	/* scale the element count by 4 to get a byte length */
+	test	%rdx, %rdx
+	jz	L(equal)	/* zero length; L(equal) presumably returns 0 -- confirm */
+# endif
+	mov	%rdx, %rcx	/* rcx = length in bytes from here on */
+	mov	%rdi, %rdx
+	cmp	$48, %rcx;
+	jae	L(48bytesormore)	/* LEN >= 48  */
+
+	add	%rcx, %rsi	/* short input: advance both pointers by the full length */
+	add	%rcx, %rdi
+	jmp	L(less48bytes)	/* dispatch on the length left in ecx */
+
+	.p2align 4
+/* RCX >= 48.  */
+L(48bytesormore):	/* Check the first 16 bytes unaligned, then align rdi and dispatch on rsi's offset.  */
+	movdqu	(%rdi), %xmm3
+	movdqu	(%rsi), %xmm0
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx	/* bit i set when byte i is equal */
+	lea	16(%rdi), %rdi
+	lea	16(%rsi), %rsi
+	sub	$0xffff, %edx	/* zero iff all 16 bytes matched */
+	jnz	L(less16bytes)
+	mov	%edi, %edx
+	and	$0xf, %edx	/* edx = rdi & 15 */
+	xor	%rdx, %rdi	/* clear those bits: rdi rounded down to 16 */
+	sub	%rdx, %rsi	/* move rsi back by the same amount */
+	add	%rdx, %rcx	/* and add the rewound bytes to the count */
+	mov	%esi, %edx
+	and	$0xf, %edx	/* edx = rsi & 15, the remaining source misalignment */
+	jz	L(shr_0)
+	xor	%rdx, %rsi	/* round rsi down too; edx keeps the offset */
+
+# ifndef USE_AS_WMEMCMP
+	cmp	$8, %edx
+	jae	L(next_unaligned_table)
+	cmp	$0, %edx	/* NOTE(review): edx is nonzero here (jz above), so this branch looks dead */
+	je	L(shr_0)
+	cmp	$1, %edx
+	je	L(shr_1)
+	cmp	$2, %edx
+	je	L(shr_2)
+	cmp	$3, %edx
+	je	L(shr_3)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$5, %edx
+	je	L(shr_5)
+	cmp	$6, %edx
+	je	L(shr_6)
+	jmp	L(shr_7)	/* only 7 is left below 8 */
+
+	.p2align 2
+L(next_unaligned_table):
+	cmp	$8, %edx
+	je	L(shr_8)
+	cmp	$9, %edx
+	je	L(shr_9)
+	cmp	$10, %edx
+	je	L(shr_10)
+	cmp	$11, %edx
+	je	L(shr_11)
+	cmp	$12, %edx
+	je	L(shr_12)
+	cmp	$13, %edx
+	je	L(shr_13)
+	cmp	$14, %edx
+	je	L(shr_14)
+	jmp	L(shr_15)	/* only 15 is left */
+# else
+	cmp	$0, %edx	/* NOTE(review): same dead test as in the memcmp branch */
+	je	L(shr_0)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$8, %edx
+	je	L(shr_8)
+	jmp	L(shr_12)	/* assumes the offset is a multiple of 4 (4-byte aligned inputs) -- TODO confirm */
+# endif
+
+	.p2align 4
+L(shr_0):	/* Both pointers 16-byte aligned: compare 32 bytes directly.  */
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx	/* lea leaves the cmp flags intact for the jae below */
+	jae	L(shr_0_gobble)
+	xor	%eax, %eax	/* rax = 0: L(first16bytes) adds rax to rsi */
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm1	/* xmm1 = equality of bytes 0..15 */
+	movdqa	16(%rsi), %xmm2
+	pcmpeqb	16(%rdi), %xmm2	/* xmm2 = equality of bytes 16..31 */
+	pand	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes compared equal */
+	jnz	L(exit)
+	add	%rcx, %rsi	/* advance both pointers by the count left in rcx */
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_0_gobble):	/* Loop form of L(shr_0): 32 bytes per iteration.  */
+	movdqa	(%rsi), %xmm0
+	xor	%eax, %eax	/* rax = 0: L(first16bytes) adds rax to rsi */
+	pcmpeqb	(%rdi), %xmm0
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm2
+	pcmpeqb	16(%rdi), %xmm2
+L(shr_0_gobble_loop):
+	pand	%xmm0, %xmm2
+	sub	$32, %rcx	/* CF set on borrow, i.e. the count ran out */
+	pmovmskb %xmm2, %edx
+	movdqa	%xmm0, %xmm1	/* xmm1 = first-half result, read by L(exit) */
+	movdqa	32(%rsi), %xmm0
+	movdqa	48(%rsi), %xmm2
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF */
+	pcmpeqb	32(%rdi), %xmm0
+	pcmpeqb	48(%rdi), %xmm2
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	jz	L(shr_0_gobble_loop)	/* ZF is still the one produced by sbb */
+
+	pand	%xmm0, %xmm2
+	cmp	$0, %rcx
+	jge	L(next)
+	inc	%edx	/* count ran out: undo the borrow sbb subtracted */
+	add	$32, %rcx	/* and the last 32-byte decrement */
+L(next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm2, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+	.p2align 4
+L(shr_1):	/* 32-byte compare with the source realigned by palignr $1.  */
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx	/* lea leaves the cmp flags intact for the jae below */
+	mov	%edx, %eax	/* rax = edx; L(first16bytes) adds it back to rsi */
+	jae	L(shr_1_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$1, (%rsi), %xmm1	/* xmm1 = bytes 1..16 from rsi */
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$1, %xmm2, %xmm3	/* xmm3 = bytes 17..32 from rsi */
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes compared equal */
+	jnz	L(exit)
+	add	$1, %rsi	/* re-apply the 1-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_1_gobble):	/* Loop form of L(shr_1): 32 bytes per iteration.  */
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$1, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$1, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_1_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx	/* CF set on borrow, i.e. the count ran out */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* xmm1 = first-half result, read by L(exit) */
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$1, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF */
+	movdqa	48(%rsi), %xmm0
+	palignr	$1, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_1_gobble_loop)	/* ZF is still the one produced by sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_1_gobble_next)
+	inc	%edx	/* count ran out: undo the borrow sbb subtracted */
+	add	$32, %rcx	/* and the last 32-byte decrement */
+L(shr_1_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	1(%rsi), %rsi	/* re-apply the 1-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+
+	.p2align 4
+L(shr_2):	/* 32-byte compare with the source realigned by palignr $2.  */
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx	/* lea leaves the cmp flags intact for the jae below */
+	mov	%edx, %eax	/* rax = edx; L(first16bytes) adds it back to rsi */
+	jae	L(shr_2_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$2, (%rsi), %xmm1	/* xmm1 = bytes 2..17 from rsi */
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$2, %xmm2, %xmm3	/* xmm3 = bytes 18..33 from rsi */
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes compared equal */
+	jnz	L(exit)
+	add	$2, %rsi	/* re-apply the 2-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_2_gobble):	/* Loop form of L(shr_2): 32 bytes per iteration.  */
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$2, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$2, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_2_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx	/* CF set on borrow, i.e. the count ran out */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* xmm1 = first-half result, read by L(exit) */
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$2, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF */
+	movdqa	48(%rsi), %xmm0
+	palignr	$2, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_2_gobble_loop)	/* ZF is still the one produced by sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_2_gobble_next)
+	inc	%edx	/* count ran out: undo the borrow sbb subtracted */
+	add	$32, %rcx	/* and the last 32-byte decrement */
+L(shr_2_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	2(%rsi), %rsi	/* re-apply the 2-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_3):	/* 32-byte compare with the source realigned by palignr $3.  */
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx	/* lea leaves the cmp flags intact for the jae below */
+	mov	%edx, %eax	/* rax = edx; L(first16bytes) adds it back to rsi */
+	jae	L(shr_3_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$3, (%rsi), %xmm1	/* xmm1 = bytes 3..18 from rsi */
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$3, %xmm2, %xmm3	/* xmm3 = bytes 19..34 from rsi */
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes compared equal */
+	jnz	L(exit)
+	add	$3, %rsi	/* re-apply the 3-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_3_gobble):	/* Loop form of L(shr_3): 32 bytes per iteration.  */
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$3, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$3, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_3_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx	/* CF set on borrow, i.e. the count ran out */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* xmm1 = first-half result, read by L(exit) */
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$3, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF */
+	movdqa	48(%rsi), %xmm0
+	palignr	$3, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_3_gobble_loop)	/* ZF is still the one produced by sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_3_gobble_next)
+	inc	%edx	/* count ran out: undo the borrow sbb subtracted */
+	add	$32, %rcx	/* and the last 32-byte decrement */
+L(shr_3_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	3(%rsi), %rsi	/* re-apply the 3-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+# endif
+
+	.p2align 4
+L(shr_4):	/* 32-byte compare with the source realigned by palignr $4.  */
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx	/* lea leaves the cmp flags intact for the jae below */
+	mov	%edx, %eax	/* rax = edx; L(first16bytes) adds it back to rsi */
+	jae	L(shr_4_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$4, (%rsi), %xmm1	/* xmm1 = bytes 4..19 from rsi */
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$4, %xmm2, %xmm3	/* xmm3 = bytes 20..35 from rsi */
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes compared equal */
+	jnz	L(exit)
+	add	$4, %rsi	/* re-apply the 4-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_4_gobble):	/* Loop form of L(shr_4): 32 bytes per iteration.  */
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$4, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$4, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_4_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx	/* CF set on borrow, i.e. the count ran out */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* xmm1 = first-half result, read by L(exit) */
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$4, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF */
+	movdqa	48(%rsi), %xmm0
+	palignr	$4, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_4_gobble_loop)	/* ZF is still the one produced by sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_4_gobble_next)
+	inc	%edx	/* count ran out: undo the borrow sbb subtracted */
+	add	$32, %rcx	/* and the last 32-byte decrement */
+L(shr_4_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	4(%rsi), %rsi	/* re-apply the 4-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+	.p2align 4
+L(shr_5):	/* 32-byte compare with the source realigned by palignr $5.  */
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx	/* lea leaves the cmp flags intact for the jae below */
+	mov	%edx, %eax	/* rax = edx; L(first16bytes) adds it back to rsi */
+	jae	L(shr_5_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$5, (%rsi), %xmm1	/* xmm1 = bytes 5..20 from rsi */
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$5, %xmm2, %xmm3	/* xmm3 = bytes 21..36 from rsi */
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes compared equal */
+	jnz	L(exit)
+	add	$5, %rsi	/* re-apply the 5-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_5_gobble):	/* Loop form of L(shr_5): 32 bytes per iteration.  */
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$5, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$5, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_5_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx	/* CF set on borrow, i.e. the count ran out */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* xmm1 = first-half result, read by L(exit) */
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$5, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF */
+	movdqa	48(%rsi), %xmm0
+	palignr	$5, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_5_gobble_loop)	/* ZF is still the one produced by sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_5_gobble_next)
+	inc	%edx	/* count ran out: undo the borrow sbb subtracted */
+	add	$32, %rcx	/* and the last 32-byte decrement */
+L(shr_5_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	5(%rsi), %rsi	/* re-apply the 5-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_6):	/* 32-byte compare with the source realigned by palignr $6.  */
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx	/* lea leaves the cmp flags intact for the jae below */
+	mov	%edx, %eax	/* rax = edx; L(first16bytes) adds it back to rsi */
+	jae	L(shr_6_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$6, (%rsi), %xmm1	/* xmm1 = bytes 6..21 from rsi */
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$6, %xmm2, %xmm3	/* xmm3 = bytes 22..37 from rsi */
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes compared equal */
+	jnz	L(exit)
+	add	$6, %rsi	/* re-apply the 6-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_6_gobble):	/* Loop form of L(shr_6): 32 bytes per iteration.  */
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$6, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$6, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_6_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx	/* CF set on borrow, i.e. the count ran out */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* xmm1 = first-half result, read by L(exit) */
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$6, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF */
+	movdqa	48(%rsi), %xmm0
+	palignr	$6, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_6_gobble_loop)	/* ZF is still the one produced by sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_6_gobble_next)
+	inc	%edx	/* count ran out: undo the borrow sbb subtracted */
+	add	$32, %rcx	/* and the last 32-byte decrement */
+L(shr_6_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	6(%rsi), %rsi	/* re-apply the 6-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_7):	/* 32-byte compare with the source realigned by palignr $7.  */
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx	/* lea leaves the cmp flags intact for the jae below */
+	mov	%edx, %eax	/* rax = edx; L(first16bytes) adds it back to rsi */
+	jae	L(shr_7_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$7, (%rsi), %xmm1	/* xmm1 = bytes 7..22 from rsi */
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$7, %xmm2, %xmm3	/* xmm3 = bytes 23..38 from rsi */
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes compared equal */
+	jnz	L(exit)
+	add	$7, %rsi	/* re-apply the 7-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_7_gobble):	/* Loop form of L(shr_7): 32 bytes per iteration.  */
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$7, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$7, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_7_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx	/* CF set on borrow, i.e. the count ran out */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* xmm1 = first-half result, read by L(exit) */
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$7, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF */
+	movdqa	48(%rsi), %xmm0
+	palignr	$7, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_7_gobble_loop)	/* ZF is still the one produced by sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_7_gobble_next)
+	inc	%edx	/* count ran out: undo the borrow sbb subtracted */
+	add	$32, %rcx	/* and the last 32-byte decrement */
+L(shr_7_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	7(%rsi), %rsi	/* re-apply the 7-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+# endif
+
+	.p2align 4
+L(shr_8):	/* 32-byte compare with the source realigned by palignr $8.  */
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx	/* lea leaves the cmp flags intact for the jae below */
+	mov	%edx, %eax	/* rax = edx; L(first16bytes) adds it back to rsi */
+	jae	L(shr_8_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$8, (%rsi), %xmm1	/* xmm1 = bytes 8..23 from rsi */
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$8, %xmm2, %xmm3	/* xmm3 = bytes 24..39 from rsi */
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes compared equal */
+	jnz	L(exit)
+	add	$8, %rsi	/* re-apply the 8-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_8_gobble):	/* Loop form of L(shr_8): 32 bytes per iteration.  */
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$8, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$8, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_8_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx	/* CF set on borrow, i.e. the count ran out */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* xmm1 = first-half result, read by L(exit) */
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$8, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF */
+	movdqa	48(%rsi), %xmm0
+	palignr	$8, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_8_gobble_loop)	/* ZF is still the one produced by sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_8_gobble_next)
+	inc	%edx	/* count ran out: undo the borrow sbb subtracted */
+	add	$32, %rcx	/* and the last 32-byte decrement */
+L(shr_8_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	8(%rsi), %rsi	/* re-apply the 8-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+	.p2align 4
+L(shr_9):	/* 32-byte compare with the source realigned by palignr $9.  */
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx	/* lea leaves the cmp flags intact for the jae below */
+	mov	%edx, %eax	/* rax = edx; L(first16bytes) adds it back to rsi */
+	jae	L(shr_9_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$9, (%rsi), %xmm1	/* xmm1 = bytes 9..24 from rsi */
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$9, %xmm2, %xmm3	/* xmm3 = bytes 25..40 from rsi */
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes compared equal */
+	jnz	L(exit)
+	add	$9, %rsi	/* re-apply the 9-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_9_gobble):	/* Loop form of L(shr_9): 32 bytes per iteration.  */
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$9, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$9, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_9_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx	/* CF set on borrow, i.e. the count ran out */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* xmm1 = first-half result, read by L(exit) */
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$9, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF */
+	movdqa	48(%rsi), %xmm0
+	palignr	$9, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_9_gobble_loop)	/* ZF is still the one produced by sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_9_gobble_next)
+	inc	%edx	/* count ran out: undo the borrow sbb subtracted */
+	add	$32, %rcx	/* and the last 32-byte decrement */
+L(shr_9_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	9(%rsi), %rsi	/* re-apply the 9-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_10):	/* 32-byte compare with the source realigned by palignr $10.  */
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx	/* lea leaves the cmp flags intact for the jae below */
+	mov	%edx, %eax	/* rax = edx; L(first16bytes) adds it back to rsi */
+	jae	L(shr_10_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$10, (%rsi), %xmm1	/* xmm1 = bytes 10..25 from rsi */
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$10, %xmm2, %xmm3	/* xmm3 = bytes 26..41 from rsi */
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes compared equal */
+	jnz	L(exit)
+	add	$10, %rsi	/* re-apply the 10-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_10_gobble):	/* Loop form of L(shr_10): 32 bytes per iteration.  */
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$10, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$10, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_10_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx	/* CF set on borrow, i.e. the count ran out */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* xmm1 = first-half result, read by L(exit) */
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$10, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF */
+	movdqa	48(%rsi), %xmm0
+	palignr	$10, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_10_gobble_loop)	/* ZF is still the one produced by sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_10_gobble_next)
+	inc	%edx	/* count ran out: undo the borrow sbb subtracted */
+	add	$32, %rcx	/* and the last 32-byte decrement */
+L(shr_10_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	10(%rsi), %rsi	/* re-apply the 10-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_11):	/* 32-byte compare with the source realigned by palignr $11.  */
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx	/* lea leaves the cmp flags intact for the jae below */
+	mov	%edx, %eax	/* rax = edx; L(first16bytes) adds it back to rsi */
+	jae	L(shr_11_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$11, (%rsi), %xmm1	/* xmm1 = bytes 11..26 from rsi */
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$11, %xmm2, %xmm3	/* xmm3 = bytes 27..42 from rsi */
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes compared equal */
+	jnz	L(exit)
+	add	$11, %rsi	/* re-apply the 11-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_11_gobble):	/* Loop form of L(shr_11): 32 bytes per iteration.  */
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$11, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$11, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_11_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx	/* CF set on borrow, i.e. the count ran out */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* xmm1 = first-half result, read by L(exit) */
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$11, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF */
+	movdqa	48(%rsi), %xmm0
+	palignr	$11, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_11_gobble_loop)	/* ZF is still the one produced by sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_11_gobble_next)
+	inc	%edx	/* count ran out: undo the borrow sbb subtracted */
+	add	$32, %rcx	/* and the last 32-byte decrement */
+L(shr_11_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	11(%rsi), %rsi	/* re-apply the 11-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+# endif
+
+	.p2align 4
+L(shr_12):	/* 32-byte compare with the source realigned by palignr $12.  */
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx	/* lea leaves the cmp flags intact for the jae below */
+	mov	%edx, %eax	/* rax = edx; L(first16bytes) adds it back to rsi */
+	jae	L(shr_12_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$12, (%rsi), %xmm1	/* xmm1 = bytes 12..27 from rsi */
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$12, %xmm2, %xmm3	/* xmm3 = bytes 28..43 from rsi */
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes compared equal */
+	jnz	L(exit)
+	add	$12, %rsi	/* re-apply the 12-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_12_gobble):	/* Loop form of L(shr_12): 32 bytes per iteration.  */
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$12, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$12, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_12_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx	/* CF set on borrow, i.e. the count ran out */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* xmm1 = first-half result, read by L(exit) */
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$12, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF */
+	movdqa	48(%rsi), %xmm0
+	palignr	$12, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_12_gobble_loop)	/* ZF is still the one produced by sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_12_gobble_next)
+	inc	%edx	/* count ran out: undo the borrow sbb subtracted */
+	add	$32, %rcx	/* and the last 32-byte decrement */
+L(shr_12_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	12(%rsi), %rsi	/* re-apply the 12-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+	.p2align 4
+L(shr_13):	/* 32-byte compare with the source realigned by palignr $13.  */
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx	/* lea leaves the cmp flags intact for the jae below */
+	mov	%edx, %eax	/* rax = edx; L(first16bytes) adds it back to rsi */
+	jae	L(shr_13_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$13, (%rsi), %xmm1	/* xmm1 = bytes 13..28 from rsi */
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$13, %xmm2, %xmm3	/* xmm3 = bytes 29..44 from rsi */
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes compared equal */
+	jnz	L(exit)
+	add	$13, %rsi	/* re-apply the 13-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_13_gobble):	/* Loop form of L(shr_13): 32 bytes per iteration.  */
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$13, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$13, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_13_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx	/* CF set on borrow, i.e. the count ran out */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* xmm1 = first-half result, read by L(exit) */
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$13, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF */
+	movdqa	48(%rsi), %xmm0
+	palignr	$13, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_13_gobble_loop)	/* ZF is still the one produced by sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_13_gobble_next)
+	inc	%edx	/* count ran out: undo the borrow sbb subtracted */
+	add	$32, %rcx	/* and the last 32-byte decrement */
+L(shr_13_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	13(%rsi), %rsi	/* re-apply the 13-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_14):	/* 32-byte compare with the source realigned by palignr $14.  */
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx	/* lea leaves the cmp flags intact for the jae below */
+	mov	%edx, %eax	/* rax = edx; L(first16bytes) adds it back to rsi */
+	jae	L(shr_14_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$14, (%rsi), %xmm1	/* xmm1 = bytes 14..29 from rsi */
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$14, %xmm2, %xmm3	/* xmm3 = bytes 30..45 from rsi */
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes compared equal */
+	jnz	L(exit)
+	add	$14, %rsi	/* re-apply the 14-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_14_gobble):	/* Loop form of L(shr_14): 32 bytes per iteration.  */
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$14, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$14, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_14_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx	/* CF set on borrow, i.e. the count ran out */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* xmm1 = first-half result, read by L(exit) */
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$14, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF */
+	movdqa	48(%rsi), %xmm0
+	palignr	$14, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_14_gobble_loop)	/* ZF is still the one produced by sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_14_gobble_next)
+	inc	%edx	/* count ran out: undo the borrow sbb subtracted */
+	add	$32, %rcx	/* and the last 32-byte decrement */
+L(shr_14_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	14(%rsi), %rsi	/* re-apply the 14-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_15):	/* 32-byte compare with the source realigned by palignr $15.  */
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx	/* lea leaves the cmp flags intact for the jae below */
+	mov	%edx, %eax	/* rax = edx; L(first16bytes) adds it back to rsi */
+	jae	L(shr_15_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$15, (%rsi), %xmm1	/* xmm1 = bytes 15..30 from rsi */
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$15, %xmm2, %xmm3	/* xmm3 = bytes 31..46 from rsi */
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx	/* zero iff all 32 bytes compared equal */
+	jnz	L(exit)
+	add	$15, %rsi	/* re-apply the 15-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	.p2align 4
+L(shr_15_gobble):	/* Loop form of L(shr_15): 32 bytes per iteration.  */
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$15, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$15, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_15_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx	/* CF set on borrow, i.e. the count ran out */
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1	/* xmm1 = first-half result, read by L(exit) */
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$15, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx	/* edx = mask - 0xffff - CF */
+	movdqa	48(%rsi), %xmm0
+	palignr	$15, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_15_gobble_loop)	/* ZF is still the one produced by sbb */
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_15_gobble_next)
+	inc	%edx	/* count ran out: undo the borrow sbb subtracted */
+	add	$32, %rcx	/* and the last 32-byte decrement */
+L(shr_15_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	15(%rsi), %rsi	/* re-apply the 15-byte source offset */
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+# endif
+	.p2align 4
+L(exit):	/* A 32-byte compare failed: decide which 16-byte half holds the difference.  */
+	pmovmskb %xmm1, %r8d	/* xmm1 = equality result of the first half */
+	sub	$0xffff, %r8d
+	jz	L(first16bytes)	/* first half all equal: keep edx and the current pointers */
+	lea	-16(%rsi), %rsi	/* difference is in the first half: step both back 16 */
+	lea	-16(%rdi), %rdi
+	mov	%r8d, %edx	/* and use the first-half mask instead */
+L(first16bytes):
+	add	%rax, %rsi	/* rax = source offset saved by the shr_N code (0 for shr_0) */
+L(less16bytes):	/* edx = mask - 0xffff; its lowest set bit marks the first differing byte.  */
+# ifndef USE_AS_WMEMCMP
+	test	%dl, %dl
+	jz	L(next_24_bytes)	/* bits 0..7 clear: the difference is in bytes 8..15 */
+
+	test	$0x01, %dl
+	jnz	L(Byte16)
+
+	test	$0x02, %dl
+	jnz	L(Byte17)
+
+	test	$0x04, %dl
+	jnz	L(Byte18)
+
+	test	$0x08, %dl
+	jnz	L(Byte19)
+
+	test	$0x10, %dl
+	jnz	L(Byte20)
+
+	test	$0x20, %dl
+	jnz	L(Byte21)
+
+	test	$0x40, %dl
+	jnz	L(Byte22)
+
+	movzbl	-9(%rdi), %eax	/* only bit 7 is left: compare the bytes at offset -9 */
+	movzbl	-9(%rsi), %edx
+	sub	%edx, %eax	/* result = zero-extended rdi byte minus rsi byte */
+	ret
+
+	.p2align 4
+L(Byte16):	/* L(Byte16)..L(Byte22): return the byte difference at offsets -16..-10.  */
+	movzbl	-16(%rdi), %eax
+	movzbl	-16(%rsi), %edx
+	sub	%edx, %eax	/* zero-extended rdi byte minus rsi byte */
+	ret
+
+	.p2align 4
+L(Byte17):
+	movzbl	-15(%rdi), %eax
+	movzbl	-15(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	.p2align 4
+L(Byte18):
+	movzbl	-14(%rdi), %eax
+	movzbl	-14(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	.p2align 4
+L(Byte19):
+	movzbl	-13(%rdi), %eax
+	movzbl	-13(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	.p2align 4
+L(Byte20):
+	movzbl	-12(%rdi), %eax
+	movzbl	-12(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	.p2align 4
+L(Byte21):
+	movzbl	-11(%rdi), %eax
+	movzbl	-11(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	.p2align 4
+L(Byte22):
+	movzbl	-10(%rdi), %eax
+	movzbl	-10(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	.p2align 4
+L(next_24_bytes):	/* Reached from L(less16bytes) when dl == 0: examine the second 8 bits of the mask in dh.  */
+	lea	8(%rdi), %rdi	/* Advance both pointers by 8 so the ByteNN stubs (offsets -16 ... -10) address the next 8 bytes.  */
+	lea	8(%rsi), %rsi
+	test	$0x01, %dh
+	jnz	L(Byte16)
+
+	test	$0x02, %dh
+	jnz	L(Byte17)
+
+	test	$0x04, %dh
+	jnz	L(Byte18)
+
+	test	$0x08, %dh
+	jnz	L(Byte19)
+
+	test	$0x10, %dh
+	jnz	L(Byte20)
+
+	test	$0x20, %dh
+	jnz	L(Byte21)
+
+	test	$0x40, %dh
+	jnz	L(Byte22)
+
+	movzbl	-9(%rdi), %eax	/* None of bits 0-6 of dh set: compare the byte at offset -9 (after the +8 adjustment).  */
+	movzbl	-9(%rsi), %edx
+	sub	%edx, %eax
+	ret
+# else
+/* special for wmemcmp: find the differing 4-byte element of the 16-byte chunk.  */
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words)	/* Low 8 mask bits clear: difference is in the upper 8 bytes.  */
+	and	$15, %dl
+	jz	L(second_double_word)	/* Low nibble clear: difference is in the second element.  */
+	mov	-16(%rdi), %eax
+	cmp	-16(%rsi), %eax
+	jne	L(find_diff)	/* L(find_diff) consumes the flags of this cmp.  */
+	ret	/* NOTE(review): on this path eax holds the loaded element, not 0; presumably unreachable because a difference was already detected -- confirm.  */
+
+	.p2align 4
+L(second_double_word):	/* Compare the element at offset -12.  */
+	mov	-12(%rdi), %eax
+	cmp	-12(%rsi), %eax
+	jne	L(find_diff)
+	ret
+
+	.p2align 4
+L(next_two_double_words):	/* Same selection on dh for the elements at offsets -8 and -4.  */
+	and	$15, %dh
+	jz	L(fourth_double_word)
+	mov	-8(%rdi), %eax
+	cmp	-8(%rsi), %eax
+	jne	L(find_diff)
+	ret
+
+	.p2align 4
+L(fourth_double_word):	/* Compare the element at offset -4.  */
+	mov	-4(%rdi), %eax
+	cmp	-4(%rsi), %eax
+	jne	L(find_diff)
+	ret
+# endif
+
+	.p2align 4
+L(less48bytes):	/* Tail dispatch on the remaining length in ecx; the NNbytes targets read at negative offsets, so rdi/rsi must point just past the end of the data.  */
+	cmp	$8, %ecx
+	jae	L(more8bytes)
+	cmp	$0, %ecx
+	je	L(0bytes)	/* Nothing left: result is 0.  */
+# ifndef USE_AS_WMEMCMP
+	cmp	$1, %ecx
+	je	L(1bytes)
+	cmp	$2, %ecx
+	je	L(2bytes)
+	cmp	$3, %ecx
+	je	L(3bytes)
+	cmp	$4, %ecx
+	je	L(4bytes)
+	cmp	$5, %ecx
+	je	L(5bytes)
+	cmp	$6, %ecx
+	je	L(6bytes)
+	jmp	L(7bytes)	/* Only 7 remains in the 1-7 range.  */
+# else
+	jmp	L(4bytes)	/* wmemcmp: any nonzero length below 8 is handled as 4 (presumably lengths are multiples of 4).  */
+# endif
+
+	.p2align 4
+L(more8bytes):	/* Tail dispatch for ecx >= 8: lengths 8-15 here, longer ones passed on.  */
+	cmp	$16, %ecx
+	jae	L(more16bytes)
+	cmp	$8, %ecx
+	je	L(8bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$9, %ecx
+	je	L(9bytes)
+	cmp	$10, %ecx
+	je	L(10bytes)
+	cmp	$11, %ecx
+	je	L(11bytes)
+	cmp	$12, %ecx
+	je	L(12bytes)
+	cmp	$13, %ecx
+	je	L(13bytes)
+	cmp	$14, %ecx
+	je	L(14bytes)
+	jmp	L(15bytes)	/* Only 15 remains in the 8-15 range.  */
+# else
+	jmp	L(12bytes)	/* wmemcmp: anything in 9-15 is handled as 12.  */
+# endif
+
+	.p2align 4
+L(more16bytes):	/* Tail dispatch for ecx >= 16: lengths 16-23 here, longer ones passed on.  */
+	cmp	$24, %ecx
+	jae	L(more24bytes)
+	cmp	$16, %ecx
+	je	L(16bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$17, %ecx
+	je	L(17bytes)
+	cmp	$18, %ecx
+	je	L(18bytes)
+	cmp	$19, %ecx
+	je	L(19bytes)
+	cmp	$20, %ecx
+	je	L(20bytes)
+	cmp	$21, %ecx
+	je	L(21bytes)
+	cmp	$22, %ecx
+	je	L(22bytes)
+	jmp	L(23bytes)	/* Only 23 remains in the 16-23 range.  */
+# else
+	jmp	L(20bytes)	/* wmemcmp: anything in 17-23 is handled as 20.  */
+# endif
+
+	.p2align 4
+L(more24bytes):	/* Tail dispatch for ecx >= 24: lengths 24-31 here, longer ones passed on.  */
+	cmp	$32, %ecx
+	jae	L(more32bytes)
+	cmp	$24, %ecx
+	je	L(24bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$25, %ecx
+	je	L(25bytes)
+	cmp	$26, %ecx
+	je	L(26bytes)
+	cmp	$27, %ecx
+	je	L(27bytes)
+	cmp	$28, %ecx
+	je	L(28bytes)
+	cmp	$29, %ecx
+	je	L(29bytes)
+	cmp	$30, %ecx
+	je	L(30bytes)
+	jmp	L(31bytes)	/* Only 31 remains in the 24-31 range.  */
+# else
+	jmp	L(28bytes)	/* wmemcmp: anything in 25-31 is handled as 28.  */
+# endif
+
+	.p2align 4
+L(more32bytes):	/* Tail dispatch for ecx >= 32: lengths 32-39 here, longer ones passed on.  */
+	cmp	$40, %ecx
+	jae	L(more40bytes)
+	cmp	$32, %ecx
+	je	L(32bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$33, %ecx
+	je	L(33bytes)
+	cmp	$34, %ecx
+	je	L(34bytes)
+	cmp	$35, %ecx
+	je	L(35bytes)
+	cmp	$36, %ecx
+	je	L(36bytes)
+	cmp	$37, %ecx
+	je	L(37bytes)
+	cmp	$38, %ecx
+	je	L(38bytes)
+	jmp	L(39bytes)	/* Only 39 remains in the 32-39 range.  */
+# else
+	jmp	L(36bytes)	/* wmemcmp: anything in 33-39 is handled as 36.  */
+# endif
+
+	.p2align 4
+L(more40bytes):	/* Tail dispatch for ecx >= 40; there is no further range check, so anything above 46 goes to L(47bytes).  */
+	cmp	$40, %ecx
+	je	L(40bytes)
+# ifndef USE_AS_WMEMCMP	/* In the wmemcmp build this part is compiled out and control falls into L(44bytes).  */
+	cmp	$41, %ecx
+	je	L(41bytes)
+	cmp	$42, %ecx
+	je	L(42bytes)
+	cmp	$43, %ecx
+	je	L(43bytes)
+	cmp	$44, %ecx
+	je	L(44bytes)
+	cmp	$45, %ecx
+	je	L(45bytes)
+	cmp	$46, %ecx
+	je	L(46bytes)
+	jmp	L(47bytes)
+
+	.p2align 4
+L(44bytes):	/* Fall-through chain for lengths that are multiples of 4: each step compares the 4 bytes at -N from the end pointers, then drops into the next shorter label.  */
+	movl	-44(%rdi), %eax
+	movl	-44(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)	/* L(find_diff) works out the first differing byte from eax/ecx.  */
+L(40bytes):
+	movl	-40(%rdi), %eax
+	movl	-40(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(36bytes):
+	movl	-36(%rdi), %eax
+	movl	-36(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(32bytes):
+	movl	-32(%rdi), %eax
+	movl	-32(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(28bytes):
+	movl	-28(%rdi), %eax
+	movl	-28(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(24bytes):
+	movl	-24(%rdi), %eax
+	movl	-24(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(20bytes):
+	movl	-20(%rdi), %eax
+	movl	-20(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(16bytes):
+	movl	-16(%rdi), %eax
+	movl	-16(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(12bytes):
+	movl	-12(%rdi), %eax
+	movl	-12(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(8bytes):
+	movl	-8(%rdi), %eax
+	movl	-8(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(4bytes):
+	movl	-4(%rdi), %eax
+	movl	-4(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(0bytes):	/* Everything compared equal: return 0.  */
+	xor	%eax, %eax
+	ret
+# else
+	.p2align 4
+L(44bytes):	/* wmemcmp fall-through chain: compare one 4-byte element per step at -N from the end pointers; the flags of each cmp are consumed by L(find_diff).  */
+	movl	-44(%rdi), %eax
+	cmp	-44(%rsi), %eax
+	jne	L(find_diff)
+L(40bytes):
+	movl	-40(%rdi), %eax
+	cmp	-40(%rsi), %eax
+	jne	L(find_diff)
+L(36bytes):
+	movl	-36(%rdi), %eax
+	cmp	-36(%rsi), %eax
+	jne	L(find_diff)
+L(32bytes):
+	movl	-32(%rdi), %eax
+	cmp	-32(%rsi), %eax
+	jne	L(find_diff)
+L(28bytes):
+	movl	-28(%rdi), %eax
+	cmp	-28(%rsi), %eax
+	jne	L(find_diff)
+L(24bytes):
+	movl	-24(%rdi), %eax
+	cmp	-24(%rsi), %eax
+	jne	L(find_diff)
+L(20bytes):
+	movl	-20(%rdi), %eax
+	cmp	-20(%rsi), %eax
+	jne	L(find_diff)
+L(16bytes):
+	movl	-16(%rdi), %eax
+	cmp	-16(%rsi), %eax
+	jne	L(find_diff)
+L(12bytes):
+	movl	-12(%rdi), %eax
+	cmp	-12(%rsi), %eax
+	jne	L(find_diff)
+L(8bytes):
+	movl	-8(%rdi), %eax
+	cmp	-8(%rsi), %eax
+	jne	L(find_diff)
+L(4bytes):
+	movl	-4(%rdi), %eax
+	cmp	-4(%rsi), %eax
+	jne	L(find_diff)
+L(0bytes):	/* Everything compared equal: return 0.  */
+	xor	%eax, %eax
+	ret
+# endif
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(45bytes):	/* Fall-through chain for lengths of the form 4k+1: 4-byte compares at -N from the end pointers, finishing with the single last byte.  */
+	movl	-45(%rdi), %eax
+	movl	-45(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(41bytes):
+	movl	-41(%rdi), %eax
+	movl	-41(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(37bytes):
+	movl	-37(%rdi), %eax
+	movl	-37(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(33bytes):
+	movl	-33(%rdi), %eax
+	movl	-33(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(29bytes):
+	movl	-29(%rdi), %eax
+	movl	-29(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(25bytes):
+	movl	-25(%rdi), %eax
+	movl	-25(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(21bytes):
+	movl	-21(%rdi), %eax
+	movl	-21(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(17bytes):
+	movl	-17(%rdi), %eax
+	movl	-17(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(13bytes):
+	movl	-13(%rdi), %eax
+	movl	-13(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(9bytes):
+	movl	-9(%rdi), %eax
+	movl	-9(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(5bytes):
+	movl	-5(%rdi), %eax
+	movl	-5(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(1bytes):	/* Last byte: compare it and let L(set) turn the carry into -1/+1.  */
+	movzbl	-1(%rdi), %eax
+	cmpb	-1(%rsi), %al
+	jne	L(set)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(46bytes):	/* Fall-through chain for lengths of the form 4k+2: 4-byte compares at -N from the end pointers, finishing with the last 2 bytes.  */
+	movl	-46(%rdi), %eax
+	movl	-46(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(42bytes):
+	movl	-42(%rdi), %eax
+	movl	-42(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(38bytes):
+	movl	-38(%rdi), %eax
+	movl	-38(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(34bytes):
+	movl	-34(%rdi), %eax
+	movl	-34(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(30bytes):
+	movl	-30(%rdi), %eax
+	movl	-30(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(26bytes):
+	movl	-26(%rdi), %eax
+	movl	-26(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(22bytes):
+	movl	-22(%rdi), %eax
+	movl	-22(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(18bytes):
+	movl	-18(%rdi), %eax
+	movl	-18(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(14bytes):
+	movl	-14(%rdi), %eax
+	movl	-14(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(10bytes):
+	movl	-10(%rdi), %eax
+	movl	-10(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(6bytes):
+	movl	-6(%rdi), %eax
+	movl	-6(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(2bytes):	/* Last 2 bytes, loaded as one zero-extended word.  */
+	movzwl	-2(%rdi), %eax
+	movzwl	-2(%rsi), %ecx
+	cmpb	%cl, %al	/* Lower-addressed byte first ...  */
+	jne	L(set)
+	cmp	%ecx, %eax	/* ... then the whole word, which now differs only in the second byte.  */
+	jne	L(set)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(47bytes):	/* Fall-through chain for lengths of the form 4k+3: 4-byte compares at -N from the end pointers, finishing with the last 3 bytes.  */
+	movl	-47(%rdi), %eax
+	movl	-47(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(43bytes):
+	movl	-43(%rdi), %eax
+	movl	-43(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(39bytes):
+	movl	-39(%rdi), %eax
+	movl	-39(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(35bytes):
+	movl	-35(%rdi), %eax
+	movl	-35(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(31bytes):
+	movl	-31(%rdi), %eax
+	movl	-31(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(27bytes):
+	movl	-27(%rdi), %eax
+	movl	-27(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(23bytes):
+	movl	-23(%rdi), %eax
+	movl	-23(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(19bytes):
+	movl	-19(%rdi), %eax
+	movl	-19(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%rdi), %eax
+	movl	-15(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%rdi), %eax
+	movl	-11(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%rdi), %eax
+	movl	-7(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(3bytes):	/* Last 3 bytes: a 2-byte word at -3, then the byte at -1.  */
+	movzwl	-3(%rdi), %eax
+	movzwl	-3(%rsi), %ecx
+	cmpb	%cl, %al	/* Lower-addressed byte first ...  */
+	jne	L(set)
+	cmp	%ecx, %eax	/* ... then the whole word.  */
+	jne	L(set)
+	movzbl	-1(%rdi), %eax
+	cmpb	-1(%rsi), %al
+	jne	L(set)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(find_diff):	/* eax/ecx hold two differing 4-byte words (from rdi/rsi); compare them byte by byte from the lowest address upward and return -1 or +1.  */
+	cmpb	%cl, %al	/* Byte 0 (lowest address, little-endian load).  */
+	jne	L(set)
+	cmpw	%cx, %ax	/* Byte 0 equal, so this is decided by byte 1.  */
+	jne	L(set)
+	shr	$16, %eax	/* Bring bytes 2-3 down.  */
+	shr	$16, %ecx
+	cmpb	%cl, %al	/* Byte 2.  */
+	jne	L(set)
+
+/* We get here only if we already know there is a
+difference.  */
+
+	cmp	%ecx, %eax	/* Bytes 0-2 equal, so this is decided by byte 3.  */
+L(set):	/* Convert CF of the last compare into the result: CF=1 gives -1, CF=0 gives +1.  */
+	sbb	%eax, %eax	/* eax = -CF; CF is left unchanged.  */
+	sbb	$-1, %eax	/* eax = eax + 1 - CF.  */
+	ret
+# else
+
+/* for wmemcmp */
+	.p2align 4
+L(find_diff):	/* wmemcmp: entered with the flags of a cmp of two differing elements; return +1 if the rdi element is greater (signed), else -1.  */
+	mov	$1, %eax	/* mov does not modify the flags tested below.  */
+	jg	L(find_diff_bigger)
+	neg	%eax	/* Not greater (and known different): return -1.  */
+	ret
+
+	.p2align 4
+L(find_diff_bigger):	/* Return the +1 already in eax.  */
+	ret
+# endif
+
+	.p2align 4
+L(equal):	/* Return 0 (buffers compared equal); the jumps to this label are outside this part of the file.  */
+	xor	%eax, %eax
+	ret
+
+END (MEMCMP)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcmp.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp.S
new file mode 100644
index 0000000000..0c9804b7e9
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp.S
@@ -0,0 +1,78 @@
+/* Multiple versions of memcmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(memcmp)	/* IFUNC resolver: returns in rax the address of the memcmp implementation to use.  */
+	.type	memcmp, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)	/* The HAS_* macros come from init-arch.h; the branches below read as "ZF clear when the feature bit is set" -- confirm there.  */
+	jnz	1f	/* Prefer_No_VZEROUPPER set: skip the AVX2 version.  */
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_CPU_FEATURE (MOVBE)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__memcmp_avx2_movbe(%rip), %rax	/* All four conditions met.  */
+	ret
+
+1:	HAS_CPU_FEATURE (SSSE3)
+	jnz	2f
+	leaq	__memcmp_sse2(%rip), %rax	/* No SSSE3: SSE2 version.  */
+	ret
+
+2:	HAS_CPU_FEATURE (SSE4_1)
+	jz	3f
+	leaq	__memcmp_sse4_1(%rip), %rax	/* SSSE3 and SSE4.1.  */
+	ret
+
+3:	leaq	__memcmp_ssse3(%rip), %rax	/* SSSE3 without SSE4.1.  */
+	ret
+
+END(memcmp)
+
+# undef ENTRY	/* From here on ENTRY(...) ignores its argument and opens the hidden symbol __memcmp_sse2, so the file included at the bottom is assembled under that name.  */
+# define ENTRY(name) \
+	.type __memcmp_sse2, @function; \
+	.p2align 4; \
+	.globl __memcmp_sse2; \
+	.hidden __memcmp_sse2; \
+	__memcmp_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Likewise END(...) always closes __memcmp_sse2.  */
+# define END(name) \
+	cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def	/* Replaced so that the internal alias __GI_memcmp is bound directly to __memcmp_sse2.  */
+/* It doesn't make sense to send libc-internal memcmp calls through a PLT.
+   The speedup we get from using SSE4 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memcmp; __GI_memcmp = __memcmp_sse2
+# endif
+#endif
+
+#include "../memcmp.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
new file mode 100644
index 0000000000..4e060a27fd
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -0,0 +1,3180 @@
+/* memcpy with SSSE3 and REP string
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+        || defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY		__memcpy_ssse3_back
+# define MEMCPY_CHK	__memcpy_chk_ssse3_back
+# define MEMPCPY	__mempcpy_ssse3_back
+# define MEMPCPY_CHK	__mempcpy_chk_ssse3_back
+#endif
+
+#define JMPTBL(I, B)	I - B	/* Jump-table entry: offset of label I relative to the table base B.  */
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   relative offsets.  INDEX is a register that contains the index into
+   the jump table (clobbered, as is %r11).  SCALE is the scale of INDEX.  */
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+  lea		TABLE(%rip), %r11;				\
+  movslq	(%r11, INDEX, SCALE), INDEX;			\
+  lea		(%r11, INDEX), INDEX;				\
+  jmp		*INDEX;						\
+  ud2	/* Placed after the unconditional indirect jump, so never executed.  */
+
+	.section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)	/* Checked mempcpy entry; rcx is presumably the destination object size (4th argument) -- confirm against the __mempcpy_chk prototype.  */
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* rcx < rdx (unsigned): the length does not fit.  */
+END (MEMPCPY_CHK)	/* No ret: execution falls through into MEMPCPY, which follows.  */
+
+ENTRY (MEMPCPY)	/* mempcpy entry: sets the return value to dst + length, then shares the memcpy body.  */
+	movq	%rdi, %rax
+	addq	%rdx, %rax	/* rax = rdi + rdx.  */
+	jmp	L(start)
+END (MEMPCPY)
+#endif
+
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)	/* Checked memcpy entry; rcx is presumably the destination object size (4th argument) -- confirm against the __memcpy_chk prototype.  */
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* rcx < rdx (unsigned): the length does not fit.  */
+END (MEMCPY_CHK)	/* No ret: execution falls through into MEMCPY below.  */
+#endif
+
+ENTRY (MEMCPY)	/* rdi = dst, rsi = src, rdx = length.  Returns dst in rax (dst + length when built with USE_AS_MEMPCPY).  */
+	mov	%rdi, %rax
+#ifdef USE_AS_MEMPCPY
+	add	%rdx, %rax
+#endif
+
+#ifdef USE_AS_MEMMOVE
+	cmp	%rsi, %rdi
+	jb	L(copy_forward)	/* dst below src (unsigned): use the forward path.  */
+	je	L(bwd_write_0bytes)	/* dst == src.  */
+	cmp	$144, %rdx
+	jae	L(copy_backward)	/* Lengths below 144 go straight to the backward table, indexed by length.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+L(copy_forward):
+#endif
+L(start):
+	cmp	$144, %rdx
+	jae	L(144bytesormore)
+
+L(fwd_write_less32bytes):
+#ifndef USE_AS_MEMMOVE
+	cmp	%dil, %sil	/* Compares only the low bytes of the two pointers.  */
+	jbe	L(bk_write)	/* sil <= dil (unsigned): use the backward table.  */
+#endif
+	add	%rdx, %rsi	/* The forward table is entered with both pointers at the end of the buffers.  */
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+#ifndef USE_AS_MEMMOVE
+L(bk_write):	/* The backward table is entered with the pointers unchanged.  */
+
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+#endif
+
+	.p2align 4
+L(144bytesormore):	/* Forward copy of >= 144 bytes: align dst to 16 bytes, then select a loop by the source misalignment.  */
+
+#ifndef USE_AS_MEMMOVE
+	cmp	%dil, %sil
+	jle	L(copy_backward)	/* Signed compare of the pointers' low bytes.  NOTE(review): the short path tests the same bytes with jbe (unsigned) -- confirm the difference is intended.  */
+#endif
+	movdqu	(%rsi), %xmm0	/* Save the first 16 source bytes; the loops store them to (%r8) when they finish.  */
+	mov	%rdi, %r8	/* r8 = original dst.  */
+	and	$-16, %rdi
+	add	$16, %rdi	/* rdi = next 16-byte boundary strictly above the original dst.  */
+	mov	%rdi, %r9
+	sub	%r8, %r9	/* r9 = bytes skipped (1..16), all covered by the saved xmm0.  */
+	sub	%r9, %rdx
+	add	%r9, %rsi
+	mov	%rsi, %r9
+	and	$0xf, %r9	/* r9 = source offset within a 16-byte block.  */
+	jz	L(shl_0)	/* Source is aligned as well.  */
+#ifdef DATA_CACHE_SIZE
+	mov	$DATA_CACHE_SIZE, %RCX_LP
+#else
+	mov	__x86_data_cache_size(%rip), %RCX_LP
+#endif
+	cmp	%rcx, %rdx
+	jae	L(gobble_mem_fwd)	/* Length at least the data cache size: separate large-copy path.  */
+	lea    	L(shl_table_fwd)(%rip), %r11
+	sub	$0x80, %rdx	/* Bias the count by 128 before entering an L(shl_N) loop; the loops add it back on exit.  */
+	movslq	(%r11, %r9, 4), %r9	/* Table of 32-bit offsets relative to the table base, indexed by the misalignment.  */
+	add	%r11, %r9
+	jmp	*%r9
+	ud2
+
+	.p2align 4
+L(copy_backward):	/* Backward copy of a large block: align the end of dst down to 16 bytes, then select a loop by the source misalignment.  */
+#ifdef DATA_CACHE_SIZE
+	mov	$DATA_CACHE_SIZE, %RCX_LP
+#else
+	mov	__x86_data_cache_size(%rip), %RCX_LP
+#endif
+	shl	$1, %rcx
+	cmp	%rcx, %rdx
+	ja	L(gobble_mem_bwd)	/* Length above twice the data cache size: separate large-copy path.  */
+
+	add	%rdx, %rdi	/* Point both at the end of the buffers.  */
+	add	%rdx, %rsi
+	movdqu	-16(%rsi), %xmm0	/* Save the last 16 source bytes ...  */
+	lea	-16(%rdi), %r8	/* ... and where they belong; the loops store them when they finish.  */
+	mov	%rdi, %r9
+	and	$0xf, %r9
+	xor	%r9, %rdi	/* Clear the low 4 bits: round the dst end down to a 16-byte boundary.  */
+	sub	%r9, %rsi
+	sub	%r9, %rdx	/* The 0..15 bytes dropped here are covered by the saved xmm0.  */
+	mov	%rsi, %r9
+	and	$0xf, %r9	/* r9 = source offset within a 16-byte block.  */
+	jz	L(shl_0_bwd)
+	lea    	L(shl_table_bwd)(%rip), %r11
+	sub	$0x80, %rdx	/* Bias the count by 128 before entering an L(shl_N_bwd) loop; the loops add it back on exit.  */
+	movslq	(%r11, %r9, 4), %r9
+	add	%r11, %r9
+	jmp	*%r9
+	ud2
+
+	.p2align 4
+L(shl_0):	/* Forward copy with source and destination both 16-byte aligned, 128 bytes per pass.  */
+
+	mov	%rdx, %r9
+	shr	$8, %r9
+	add	%rdx, %r9	/* r9 = length + length/256, compared with half the data cache size.  */
+#ifdef DATA_CACHE_SIZE
+	cmp	$DATA_CACHE_SIZE_HALF, %R9_LP
+#else
+	cmp	__x86_data_cache_size_half(%rip), %R9_LP
+#endif
+	jae	L(gobble_mem_fwd)
+	sub	$0x80, %rdx	/* Bias the count by 128; undone after the loop.  */
+	.p2align 4
+L(shl_0_loop):
+	movdqa	(%rsi), %xmm1
+	movdqa	%xmm1, (%rdi)
+	movaps	0x10(%rsi), %xmm2
+	movaps	%xmm2, 0x10(%rdi)
+	movaps	0x20(%rsi), %xmm3
+	movaps	%xmm3, 0x20(%rdi)
+	movaps	0x30(%rsi), %xmm4
+	movaps	%xmm4, 0x30(%rdi)
+	movaps	0x40(%rsi), %xmm1
+	movaps	%xmm1, 0x40(%rdi)
+	movaps	0x50(%rsi), %xmm2
+	movaps	%xmm2, 0x50(%rdi)
+	movaps	0x60(%rsi), %xmm3
+	movaps	%xmm3, 0x60(%rdi)
+	movaps	0x70(%rsi), %xmm4
+	movaps	%xmm4, 0x70(%rdi)
+	sub	$0x80, %rdx
+	lea	0x80(%rsi), %rsi	/* lea does not touch the flags of the sub.  */
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_0_loop)	/* Loop while the biased count did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* Store the saved first 16 bytes (xmm0/r8 as set up in L(144bytesormore)).  */
+	add	$0x80, %rdx	/* Undo the bias: rdx = bytes still to copy.  */
+	add	%rdx, %rsi	/* Point both at the end of the buffers for the tail dispatch.  */
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	.p2align 4
+L(shl_0_bwd):	/* Backward copy with source and destination both 16-byte aligned, 128 bytes per pass.  */
+	sub	$0x80, %rdx	/* Bias the count by 128; undone after the loop.  */
+L(copy_backward_loop):
+	movaps	-0x10(%rsi), %xmm1
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	-0x20(%rsi), %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+	movaps	-0x30(%rsi), %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+	movaps	-0x40(%rsi), %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+	movaps	-0x50(%rsi), %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+	movaps	-0x60(%rsi), %xmm5
+	movaps	%xmm5, -0x60(%rdi)
+	movaps	-0x70(%rsi), %xmm5
+	movaps	%xmm5, -0x70(%rdi)
+	movaps	-0x80(%rsi), %xmm5
+	movaps	%xmm5, -0x80(%rdi)
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi	/* lea does not touch the flags of the sub.  */
+	lea	-0x80(%rsi), %rsi
+	jae	L(copy_backward_loop)	/* Loop while the biased count did not borrow.  */
+
+	movdqu	%xmm0, (%r8)	/* Store the saved last 16 bytes (xmm0/r8 as set up in L(copy_backward)).  */
+	add	$0x80, %rdx	/* Undo the bias: rdx = bytes still to copy.  */
+	sub	%rdx, %rdi	/* Step back to the start of the buffers for the tail dispatch.  */
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	.p2align 4
+L(shl_1):	/* Forward loop, 128 bytes per pass, for a source 1 byte past a 16-byte boundary (the aligned loads at -1(%rsi) require that).  */
+	sub	$0x80, %rdx	/* These flags reach the jae below: SSE moves, palignr and lea leave them alone.  */
+	movaps	-0x01(%rsi), %xmm1
+	movaps	0x0f(%rsi), %xmm2
+	movaps	0x1f(%rsi), %xmm3
+	movaps	0x2f(%rsi), %xmm4
+	movaps	0x3f(%rsi), %xmm5
+	movaps	0x4f(%rsi), %xmm6
+	movaps	0x5f(%rsi), %xmm7
+	movaps	0x6f(%rsi), %xmm8
+	movaps	0x7f(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$1, %xmm8, %xmm9	/* Join two neighbouring aligned blocks and shift out 1 byte; highest block first so each lower block is still intact when used.  */
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$1, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$1, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$1, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$1, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$1, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$1, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_1)	/* Loop while the biased count did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* Store the saved first 16 bytes (xmm0/r8 as set up in L(144bytesormore)).  */
+	add	$0x80, %rdx	/* Undo the bias: rdx = bytes still to copy.  */
+	add	%rdx, %rdi	/* Point both at the end of the buffers for the tail dispatch.  */
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	.p2align 4
+L(shl_1_bwd):	/* Backward loop, 128 bytes per pass, for a source end 1 byte past a 16-byte boundary (the aligned loads at -0x01(%rsi) require that).  */
+	movaps	-0x01(%rsi), %xmm1
+
+	movaps	-0x11(%rsi), %xmm2
+	palignr	$1, %xmm2, %xmm1	/* Join two neighbouring aligned blocks and shift out 1 byte to form one aligned store.  */
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x21(%rsi), %xmm3
+	palignr	$1, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x31(%rsi), %xmm4
+	palignr	$1, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x41(%rsi), %xmm5
+	palignr	$1, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x51(%rsi), %xmm6
+	palignr	$1, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x61(%rsi), %xmm7
+	palignr	$1, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x71(%rsi), %xmm8
+	palignr	$1, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x81(%rsi), %xmm9
+	palignr	$1, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi	/* lea does not touch the flags of the sub.  */
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_1_bwd)	/* Loop while the biased count did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* Store the saved last 16 bytes (xmm0/r8 as set up in L(copy_backward)).  */
+	add	$0x80, %rdx	/* Undo the bias: rdx = bytes still to copy.  */
+	sub	%rdx, %rdi	/* Step back to the start of the buffers for the tail dispatch.  */
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	.p2align 4
+L(shl_2):	/* Forward loop, 128 bytes per pass, for a source 2 bytes past a 16-byte boundary (the aligned loads at -2(%rsi) require that).  */
+	sub	$0x80, %rdx	/* These flags reach the jae below: SSE moves, palignr and lea leave them alone.  */
+	movaps	-0x02(%rsi), %xmm1
+	movaps	0x0e(%rsi), %xmm2
+	movaps	0x1e(%rsi), %xmm3
+	movaps	0x2e(%rsi), %xmm4
+	movaps	0x3e(%rsi), %xmm5
+	movaps	0x4e(%rsi), %xmm6
+	movaps	0x5e(%rsi), %xmm7
+	movaps	0x6e(%rsi), %xmm8
+	movaps	0x7e(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$2, %xmm8, %xmm9	/* Join two neighbouring aligned blocks and shift out 2 bytes; highest block first so each lower block is still intact when used.  */
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$2, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$2, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$2, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$2, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$2, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$2, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_2)	/* Loop while the biased count did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* Store the saved first 16 bytes (xmm0/r8 as set up in L(144bytesormore)).  */
+	add	$0x80, %rdx	/* Undo the bias: rdx = bytes still to copy.  */
+	add	%rdx, %rdi	/* Point both at the end of the buffers for the tail dispatch.  */
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	.p2align 4
+L(shl_2_bwd):	/* Backward loop, 128 bytes per pass, for a source end 2 bytes past a 16-byte boundary (the aligned loads at -0x02(%rsi) require that).  */
+	movaps	-0x02(%rsi), %xmm1
+
+	movaps	-0x12(%rsi), %xmm2
+	palignr	$2, %xmm2, %xmm1	/* Join two neighbouring aligned blocks and shift out 2 bytes to form one aligned store.  */
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x22(%rsi), %xmm3
+	palignr	$2, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x32(%rsi), %xmm4
+	palignr	$2, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x42(%rsi), %xmm5
+	palignr	$2, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x52(%rsi), %xmm6
+	palignr	$2, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x62(%rsi), %xmm7
+	palignr	$2, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x72(%rsi), %xmm8
+	palignr	$2, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x82(%rsi), %xmm9
+	palignr	$2, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi	/* lea does not touch the flags of the sub.  */
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_2_bwd)	/* Loop while the biased count did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* Store the saved last 16 bytes (xmm0/r8 as set up in L(copy_backward)).  */
+	add	$0x80, %rdx	/* Undo the bias: rdx = bytes still to copy.  */
+	sub	%rdx, %rdi	/* Step back to the start of the buffers for the tail dispatch.  */
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	.p2align 4
+L(shl_3):	/* Forward loop, 128 bytes per pass, for a source 3 bytes past a 16-byte boundary (the aligned loads at -3(%rsi) require that).  */
+	sub	$0x80, %rdx	/* These flags reach the jae below: SSE moves, palignr and lea leave them alone.  */
+	movaps -0x03(%rsi), %xmm1
+	movaps	0x0d(%rsi), %xmm2
+	movaps	0x1d(%rsi), %xmm3
+	movaps	0x2d(%rsi), %xmm4
+	movaps	0x3d(%rsi), %xmm5
+	movaps	0x4d(%rsi), %xmm6
+	movaps	0x5d(%rsi), %xmm7
+	movaps	0x6d(%rsi), %xmm8
+	movaps	0x7d(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$3, %xmm8, %xmm9	/* Join two neighbouring aligned blocks and shift out 3 bytes; highest block first so each lower block is still intact when used.  */
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$3, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$3, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$3, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$3, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$3, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$3, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_3)	/* Loop while the biased count did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* Store the saved first 16 bytes (xmm0/r8 as set up in L(144bytesormore)).  */
+	add	$0x80, %rdx	/* Undo the bias: rdx = bytes still to copy.  */
+	add	%rdx, %rdi	/* Point both at the end of the buffers for the tail dispatch.  */
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	.p2align 4
+L(shl_3_bwd):	/* Backward loop, 128 bytes per pass, for a source end 3 bytes past a 16-byte boundary (the aligned loads at -0x03(%rsi) require that).  */
+	movaps	-0x03(%rsi), %xmm1
+
+	movaps	-0x13(%rsi), %xmm2
+	palignr	$3, %xmm2, %xmm1	/* Join two neighbouring aligned blocks and shift out 3 bytes to form one aligned store.  */
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x23(%rsi), %xmm3
+	palignr	$3, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x33(%rsi), %xmm4
+	palignr	$3, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x43(%rsi), %xmm5
+	palignr	$3, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x53(%rsi), %xmm6
+	palignr	$3, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x63(%rsi), %xmm7
+	palignr	$3, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x73(%rsi), %xmm8
+	palignr	$3, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x83(%rsi), %xmm9
+	palignr	$3, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi	/* lea does not touch the flags of the sub.  */
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_3_bwd)	/* Loop while the biased count did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* Store the saved last 16 bytes (xmm0/r8 as set up in L(copy_backward)).  */
+	add	$0x80, %rdx	/* Undo the bias: rdx = bytes still to copy.  */
+	sub	%rdx, %rdi	/* Step back to the start of the buffers for the tail dispatch.  */
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	.p2align 4
+L(shl_4):	/* Forward loop, 128 bytes per pass, for a source 4 bytes past a 16-byte boundary (the aligned loads at -4(%rsi) require that).  */
+	sub	$0x80, %rdx	/* These flags reach the jae below: SSE moves, palignr and lea leave them alone.  */
+	movaps	-0x04(%rsi), %xmm1
+	movaps	0x0c(%rsi), %xmm2
+	movaps	0x1c(%rsi), %xmm3
+	movaps	0x2c(%rsi), %xmm4
+	movaps	0x3c(%rsi), %xmm5
+	movaps	0x4c(%rsi), %xmm6
+	movaps	0x5c(%rsi), %xmm7
+	movaps	0x6c(%rsi), %xmm8
+	movaps	0x7c(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$4, %xmm8, %xmm9	/* Join two neighbouring aligned blocks and shift out 4 bytes; highest block first so each lower block is still intact when used.  */
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$4, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$4, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$4, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$4, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$4, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$4, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_4)	/* Loop while the biased count did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* Store the saved first 16 bytes (xmm0/r8 as set up in L(144bytesormore)).  */
+	add	$0x80, %rdx	/* Undo the bias: rdx = bytes still to copy.  */
+	add	%rdx, %rdi	/* Point both at the end of the buffers for the tail dispatch.  */
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	.p2align 4
+L(shl_4_bwd):	/* Backward loop, 128 bytes per pass, for a source end 4 bytes past a 16-byte boundary (the aligned loads at -0x04(%rsi) require that).  */
+	movaps	-0x04(%rsi), %xmm1
+
+	movaps	-0x14(%rsi), %xmm2
+	palignr	$4, %xmm2, %xmm1	/* Join two neighbouring aligned blocks and shift out 4 bytes to form one aligned store.  */
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x24(%rsi), %xmm3
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x34(%rsi), %xmm4
+	palignr	$4, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x44(%rsi), %xmm5
+	palignr	$4, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x54(%rsi), %xmm6
+	palignr	$4, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x64(%rsi), %xmm7
+	palignr	$4, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x74(%rsi), %xmm8
+	palignr	$4, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x84(%rsi), %xmm9
+	palignr	$4, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi	/* lea does not touch the flags of the sub.  */
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_4_bwd)	/* Loop while the biased count did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* Store the saved last 16 bytes (xmm0/r8 as set up in L(copy_backward)).  */
+	add	$0x80, %rdx	/* Undo the bias: rdx = bytes still to copy.  */
+	sub	%rdx, %rdi	/* Step back to the start of the buffers for the tail dispatch.  */
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	.p2align 4
+L(shl_5):	/* Forward loop, 128 bytes per pass, for a source 5 bytes past a 16-byte boundary (the aligned loads at -5(%rsi) require that).  */
+	sub	$0x80, %rdx	/* These flags reach the jae below: SSE moves, palignr and lea leave them alone.  */
+	movaps	-0x05(%rsi), %xmm1
+	movaps	0x0b(%rsi), %xmm2
+	movaps	0x1b(%rsi), %xmm3
+	movaps	0x2b(%rsi), %xmm4
+	movaps	0x3b(%rsi), %xmm5
+	movaps	0x4b(%rsi), %xmm6
+	movaps	0x5b(%rsi), %xmm7
+	movaps	0x6b(%rsi), %xmm8
+	movaps	0x7b(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$5, %xmm8, %xmm9	/* Join two neighbouring aligned blocks and shift out 5 bytes; highest block first so each lower block is still intact when used.  */
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$5, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$5, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$5, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$5, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$5, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$5, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_5)	/* Loop while the biased count did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* Store the saved first 16 bytes (xmm0/r8 as set up in L(144bytesormore)).  */
+	add	$0x80, %rdx	/* Undo the bias: rdx = bytes still to copy.  */
+	add	%rdx, %rdi	/* Point both at the end of the buffers for the tail dispatch.  */
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	.p2align 4
+L(shl_5_bwd):	/* Backward loop, 128 bytes per pass, for a source end 5 bytes past a 16-byte boundary (the aligned loads at -0x05(%rsi) require that).  */
+	movaps	-0x05(%rsi), %xmm1
+
+	movaps	-0x15(%rsi), %xmm2
+	palignr	$5, %xmm2, %xmm1	/* Join two neighbouring aligned blocks and shift out 5 bytes to form one aligned store.  */
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x25(%rsi), %xmm3
+	palignr	$5, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x35(%rsi), %xmm4
+	palignr	$5, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x45(%rsi), %xmm5
+	palignr	$5, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x55(%rsi), %xmm6
+	palignr	$5, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x65(%rsi), %xmm7
+	palignr	$5, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x75(%rsi), %xmm8
+	palignr	$5, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x85(%rsi), %xmm9
+	palignr	$5, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi	/* lea does not touch the flags of the sub.  */
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_5_bwd)	/* Loop while the biased count did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* Store the saved last 16 bytes (xmm0/r8 as set up in L(copy_backward)).  */
+	add	$0x80, %rdx	/* Undo the bias: rdx = bytes still to copy.  */
+	sub	%rdx, %rdi	/* Step back to the start of the buffers for the tail dispatch.  */
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	.p2align 4
+L(shl_6):	/* Forward loop, 128 bytes per pass, for a source 6 bytes past a 16-byte boundary (the aligned loads at -6(%rsi) require that).  */
+	sub	$0x80, %rdx	/* These flags reach the jae below: SSE moves, palignr and lea leave them alone.  */
+	movaps	-0x06(%rsi), %xmm1
+	movaps	0x0a(%rsi), %xmm2
+	movaps	0x1a(%rsi), %xmm3
+	movaps	0x2a(%rsi), %xmm4
+	movaps	0x3a(%rsi), %xmm5
+	movaps	0x4a(%rsi), %xmm6
+	movaps	0x5a(%rsi), %xmm7
+	movaps	0x6a(%rsi), %xmm8
+	movaps	0x7a(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$6, %xmm8, %xmm9	/* Join two neighbouring aligned blocks and shift out 6 bytes; highest block first so each lower block is still intact when used.  */
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$6, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$6, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$6, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$6, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$6, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$6, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_6)	/* Loop while the biased count did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* Store the saved first 16 bytes (xmm0/r8 as set up in L(144bytesormore)).  */
+	add	$0x80, %rdx	/* Undo the bias: rdx = bytes still to copy.  */
+	add	%rdx, %rdi	/* Point both at the end of the buffers for the tail dispatch.  */
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	.p2align 4
+L(shl_6_bwd):	/* Backward loop, 128 bytes per pass, for a source end 6 bytes past a 16-byte boundary (the aligned loads at -0x06(%rsi) require that).  */
+	movaps	-0x06(%rsi), %xmm1
+
+	movaps	-0x16(%rsi), %xmm2
+	palignr	$6, %xmm2, %xmm1	/* Join two neighbouring aligned blocks and shift out 6 bytes to form one aligned store.  */
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x26(%rsi), %xmm3
+	palignr	$6, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x36(%rsi), %xmm4
+	palignr	$6, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x46(%rsi), %xmm5
+	palignr	$6, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x56(%rsi), %xmm6
+	palignr	$6, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x66(%rsi), %xmm7
+	palignr	$6, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x76(%rsi), %xmm8
+	palignr	$6, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x86(%rsi), %xmm9
+	palignr	$6, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi	/* lea does not touch the flags of the sub.  */
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_6_bwd)	/* Loop while the biased count did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* Store the saved last 16 bytes (xmm0/r8 as set up in L(copy_backward)).  */
+	add	$0x80, %rdx	/* Undo the bias: rdx = bytes still to copy.  */
+	sub	%rdx, %rdi	/* Step back to the start of the buffers for the tail dispatch.  */
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	.p2align 4
+L(shl_7):	/* Forward loop, 128 bytes per pass, for a source 7 bytes past a 16-byte boundary (the aligned loads at -7(%rsi) require that).  */
+	sub	$0x80, %rdx	/* These flags reach the jae below: SSE moves, palignr and lea leave them alone.  */
+	movaps	-0x07(%rsi), %xmm1
+	movaps	0x09(%rsi), %xmm2
+	movaps	0x19(%rsi), %xmm3
+	movaps	0x29(%rsi), %xmm4
+	movaps	0x39(%rsi), %xmm5
+	movaps	0x49(%rsi), %xmm6
+	movaps	0x59(%rsi), %xmm7
+	movaps	0x69(%rsi), %xmm8
+	movaps	0x79(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$7, %xmm8, %xmm9	/* Join two neighbouring aligned blocks and shift out 7 bytes; highest block first so each lower block is still intact when used.  */
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$7, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$7, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$7, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$7, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$7, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$7, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_7)	/* Loop while the biased count did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* Store the saved first 16 bytes (xmm0/r8 as set up in L(144bytesormore)).  */
+	add	$0x80, %rdx	/* Undo the bias: rdx = bytes still to copy.  */
+	add	%rdx, %rdi	/* Point both at the end of the buffers for the tail dispatch.  */
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	.p2align 4
+L(shl_7_bwd):	/* Backward loop, 128 bytes per pass, for a source end 7 bytes past a 16-byte boundary (the aligned loads at -0x07(%rsi) require that).  */
+	movaps	-0x07(%rsi), %xmm1
+
+	movaps	-0x17(%rsi), %xmm2
+	palignr	$7, %xmm2, %xmm1	/* Join two neighbouring aligned blocks and shift out 7 bytes to form one aligned store.  */
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x27(%rsi), %xmm3
+	palignr	$7, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x37(%rsi), %xmm4
+	palignr	$7, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x47(%rsi), %xmm5
+	palignr	$7, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x57(%rsi), %xmm6
+	palignr	$7, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x67(%rsi), %xmm7
+	palignr	$7, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x77(%rsi), %xmm8
+	palignr	$7, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x87(%rsi), %xmm9
+	palignr	$7, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi	/* lea does not touch the flags of the sub.  */
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_7_bwd)	/* Loop while the biased count did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* Store the saved last 16 bytes (xmm0/r8 as set up in L(copy_backward)).  */
+	add	$0x80, %rdx	/* Undo the bias: rdx = bytes still to copy.  */
+	sub	%rdx, %rdi	/* Step back to the start of the buffers for the tail dispatch.  */
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	.p2align 4
+L(shl_8):	/* Forward loop, 128 bytes per pass, for a source 8 bytes past a 16-byte boundary (the aligned loads at -8(%rsi) require that).  */
+	sub	$0x80, %rdx	/* These flags reach the jae below: SSE moves, palignr and lea leave them alone.  */
+	movaps	-0x08(%rsi), %xmm1
+	movaps	0x08(%rsi), %xmm2
+	movaps	0x18(%rsi), %xmm3
+	movaps	0x28(%rsi), %xmm4
+	movaps	0x38(%rsi), %xmm5
+	movaps	0x48(%rsi), %xmm6
+	movaps	0x58(%rsi), %xmm7
+	movaps	0x68(%rsi), %xmm8
+	movaps	0x78(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$8, %xmm8, %xmm9	/* Join two neighbouring aligned blocks and shift out 8 bytes; highest block first so each lower block is still intact when used.  */
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$8, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$8, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$8, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$8, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$8, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$8, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_8)	/* Loop while the biased count did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* Store the saved first 16 bytes (xmm0/r8 as set up in L(144bytesormore)).  */
+	add	$0x80, %rdx	/* Undo the bias: rdx = bytes still to copy.  */
+	add	%rdx, %rdi	/* Point both at the end of the buffers for the tail dispatch.  */
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	.p2align 4
+L(shl_8_bwd):	/* Backward loop, 128 bytes per pass, for a source end 8 bytes past a 16-byte boundary (the aligned loads at -0x08(%rsi) require that).  */
+	movaps	-0x08(%rsi), %xmm1
+
+	movaps	-0x18(%rsi), %xmm2
+	palignr	$8, %xmm2, %xmm1	/* Join two neighbouring aligned blocks and shift out 8 bytes to form one aligned store.  */
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x28(%rsi), %xmm3
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x38(%rsi), %xmm4
+	palignr	$8, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x48(%rsi), %xmm5
+	palignr	$8, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x58(%rsi), %xmm6
+	palignr	$8, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x68(%rsi), %xmm7
+	palignr	$8, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x78(%rsi), %xmm8
+	palignr	$8, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x88(%rsi), %xmm9
+	palignr	$8, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi	/* lea does not touch the flags of the sub.  */
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_8_bwd)	/* Loop while the biased count did not borrow.  */
+L(shl_8_end_bwd):	/* Label not referenced within this part of the file.  */
+	movdqu	%xmm0, (%r8)	/* Store the saved last 16 bytes (xmm0/r8 as set up in L(copy_backward)).  */
+	add	$0x80, %rdx	/* Undo the bias: rdx = bytes still to copy.  */
+	sub	%rdx, %rdi	/* Step back to the start of the buffers for the tail dispatch.  */
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	.p2align 4
+L(shl_9):	/* Forward loop, 128 bytes per pass; movaps forces rsi-9 and rdi to be 16-byte aligned.  */
+	sub	$0x80, %rdx	/* Flags from this sub feed the jae below; mov/palignr/lea do not touch them.  */
+	movaps	-0x09(%rsi), %xmm1
+	movaps	0x07(%rsi), %xmm2
+	movaps	0x17(%rsi), %xmm3
+	movaps	0x27(%rsi), %xmm4
+	movaps	0x37(%rsi), %xmm5
+	movaps	0x47(%rsi), %xmm6
+	movaps	0x57(%rsi), %xmm7
+	movaps	0x67(%rsi), %xmm8
+	movaps	0x77(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$9, %xmm8, %xmm9	/* xmm9 = low 16 bytes of (xmm9:xmm8) >> 9 bytes, i.e. source bytes 0x70..0x7f.  */
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$9, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$9, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$9, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$9, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$9, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$9, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_9)	/* Repeat while the sub at the top did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* xmm0/r8 are set up before this excerpt; presumably the saved unaligned head -- TODO confirm.  */
+	add	$0x80, %rdx	/* Undo the final sub so rdx is the leftover byte count.  */
+	add	%rdx, %rdi	/* Point both pointers past the leftover; the fwd tail handlers use negative offsets.  */
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	.p2align 4
+L(shl_9_bwd):	/* Backward loop, 128 bytes per pass; movaps forces rsi-9 and rdi to be 16-byte aligned.  */
+	movaps	-0x09(%rsi), %xmm1
+
+	movaps	-0x19(%rsi), %xmm2
+	palignr	$9, %xmm2, %xmm1	/* xmm1 = low 16 bytes of (xmm1:xmm2) >> 9 bytes, i.e. source bytes -0x10..-0x01.  */
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x29(%rsi), %xmm3
+	palignr	$9, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x39(%rsi), %xmm4
+	palignr	$9, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x49(%rsi), %xmm5
+	palignr	$9, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x59(%rsi), %xmm6
+	palignr	$9, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x69(%rsi), %xmm7
+	palignr	$9, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x79(%rsi), %xmm8
+	palignr	$9, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x89(%rsi), %xmm9
+	palignr	$9, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx	/* Sets the flags for jae; the two lea below leave them intact.  */
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_9_bwd)	/* Repeat while the sub did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* xmm0/r8 are set up before this excerpt; presumably the saved unaligned tail -- TODO confirm.  */
+	add	$0x80, %rdx	/* Undo the final sub so rdx is the leftover byte count.  */
+	sub	%rdx, %rdi	/* Point both pointers at the start of the leftover; the bwd tail handlers use positive offsets.  */
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	.p2align 4
+L(shl_10):	/* Forward loop, 128 bytes per pass; movaps forces rsi-10 and rdi to be 16-byte aligned.  */
+	sub	$0x80, %rdx	/* Flags from this sub feed the jae below; mov/palignr/lea do not touch them.  */
+	movaps	-0x0a(%rsi), %xmm1
+	movaps	0x06(%rsi), %xmm2
+	movaps	0x16(%rsi), %xmm3
+	movaps	0x26(%rsi), %xmm4
+	movaps	0x36(%rsi), %xmm5
+	movaps	0x46(%rsi), %xmm6
+	movaps	0x56(%rsi), %xmm7
+	movaps	0x66(%rsi), %xmm8
+	movaps	0x76(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$10, %xmm8, %xmm9	/* xmm9 = low 16 bytes of (xmm9:xmm8) >> 10 bytes, i.e. source bytes 0x70..0x7f.  */
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$10, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$10, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$10, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$10, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$10, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$10, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_10)	/* Repeat while the sub at the top did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* xmm0/r8 are set up before this excerpt; presumably the saved unaligned head -- TODO confirm.  */
+	add	$0x80, %rdx	/* Undo the final sub so rdx is the leftover byte count.  */
+	add	%rdx, %rdi	/* Point both pointers past the leftover; the fwd tail handlers use negative offsets.  */
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	.p2align 4
+L(shl_10_bwd):	/* Backward loop, 128 bytes per pass; movaps forces rsi-10 and rdi to be 16-byte aligned.  */
+	movaps	-0x0a(%rsi), %xmm1
+
+	movaps	-0x1a(%rsi), %xmm2
+	palignr	$10, %xmm2, %xmm1	/* xmm1 = low 16 bytes of (xmm1:xmm2) >> 10 bytes, i.e. source bytes -0x10..-0x01.  */
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x2a(%rsi), %xmm3
+	palignr	$10, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x3a(%rsi), %xmm4
+	palignr	$10, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x4a(%rsi), %xmm5
+	palignr	$10, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x5a(%rsi), %xmm6
+	palignr	$10, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x6a(%rsi), %xmm7
+	palignr	$10, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x7a(%rsi), %xmm8
+	palignr	$10, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x8a(%rsi), %xmm9
+	palignr	$10, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx	/* Sets the flags for jae; the two lea below leave them intact.  */
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_10_bwd)	/* Repeat while the sub did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* xmm0/r8 are set up before this excerpt; presumably the saved unaligned tail -- TODO confirm.  */
+	add	$0x80, %rdx	/* Undo the final sub so rdx is the leftover byte count.  */
+	sub	%rdx, %rdi	/* Point both pointers at the start of the leftover; the bwd tail handlers use positive offsets.  */
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	.p2align 4
+L(shl_11):	/* Forward loop, 128 bytes per pass; movaps forces rsi-11 and rdi to be 16-byte aligned.  */
+	sub	$0x80, %rdx	/* Flags from this sub feed the jae below; mov/palignr/lea do not touch them.  */
+	movaps	-0x0b(%rsi), %xmm1
+	movaps	0x05(%rsi), %xmm2
+	movaps	0x15(%rsi), %xmm3
+	movaps	0x25(%rsi), %xmm4
+	movaps	0x35(%rsi), %xmm5
+	movaps	0x45(%rsi), %xmm6
+	movaps	0x55(%rsi), %xmm7
+	movaps	0x65(%rsi), %xmm8
+	movaps	0x75(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$11, %xmm8, %xmm9	/* xmm9 = low 16 bytes of (xmm9:xmm8) >> 11 bytes, i.e. source bytes 0x70..0x7f.  */
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$11, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$11, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$11, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$11, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$11, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$11, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_11)	/* Repeat while the sub at the top did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* xmm0/r8 are set up before this excerpt; presumably the saved unaligned head -- TODO confirm.  */
+	add	$0x80, %rdx	/* Undo the final sub so rdx is the leftover byte count.  */
+	add	%rdx, %rdi	/* Point both pointers past the leftover; the fwd tail handlers use negative offsets.  */
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	.p2align 4
+L(shl_11_bwd):	/* Backward loop, 128 bytes per pass; movaps forces rsi-11 and rdi to be 16-byte aligned.  */
+	movaps	-0x0b(%rsi), %xmm1
+
+	movaps	-0x1b(%rsi), %xmm2
+	palignr	$11, %xmm2, %xmm1	/* xmm1 = low 16 bytes of (xmm1:xmm2) >> 11 bytes, i.e. source bytes -0x10..-0x01.  */
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x2b(%rsi), %xmm3
+	palignr	$11, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x3b(%rsi), %xmm4
+	palignr	$11, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x4b(%rsi), %xmm5
+	palignr	$11, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x5b(%rsi), %xmm6
+	palignr	$11, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x6b(%rsi), %xmm7
+	palignr	$11, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x7b(%rsi), %xmm8
+	palignr	$11, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x8b(%rsi), %xmm9
+	palignr	$11, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx	/* Sets the flags for jae; the two lea below leave them intact.  */
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_11_bwd)	/* Repeat while the sub did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* xmm0/r8 are set up before this excerpt; presumably the saved unaligned tail -- TODO confirm.  */
+	add	$0x80, %rdx	/* Undo the final sub so rdx is the leftover byte count.  */
+	sub	%rdx, %rdi	/* Point both pointers at the start of the leftover; the bwd tail handlers use positive offsets.  */
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	.p2align 4
+L(shl_12):	/* Forward loop, 128 bytes per pass; movaps forces rsi-12 and rdi to be 16-byte aligned.  */
+	sub	$0x80, %rdx	/* Flags from this sub feed the jae below; mov/palignr/lea do not touch them.  */
+	movaps	-0x0c(%rsi), %xmm1	/* movaps like every sibling shl_N loop; same aligned 16-byte load as movdqa.  */
+	movaps	0x04(%rsi), %xmm2
+	movaps	0x14(%rsi), %xmm3
+	movaps	0x24(%rsi), %xmm4
+	movaps	0x34(%rsi), %xmm5
+	movaps	0x44(%rsi), %xmm6
+	movaps	0x54(%rsi), %xmm7
+	movaps	0x64(%rsi), %xmm8
+	movaps	0x74(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$12, %xmm8, %xmm9	/* xmm9 = low 16 bytes of (xmm9:xmm8) >> 12 bytes, i.e. source bytes 0x70..0x7f.  */
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$12, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$12, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$12, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$12, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$12, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$12, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_12)	/* Repeat while the sub at the top did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* xmm0/r8 are set up before this excerpt; presumably the saved unaligned head -- TODO confirm.  */
+	add	$0x80, %rdx	/* Undo the final sub so rdx is the leftover byte count.  */
+	add	%rdx, %rdi	/* Point both pointers past the leftover; the fwd tail handlers use negative offsets.  */
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	.p2align 4
+L(shl_12_bwd):	/* Backward loop, 128 bytes per pass; movaps forces rsi-12 and rdi to be 16-byte aligned.  */
+	movaps	-0x0c(%rsi), %xmm1
+
+	movaps	-0x1c(%rsi), %xmm2
+	palignr	$12, %xmm2, %xmm1	/* xmm1 = low 16 bytes of (xmm1:xmm2) >> 12 bytes, i.e. source bytes -0x10..-0x01.  */
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x2c(%rsi), %xmm3
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x3c(%rsi), %xmm4
+	palignr	$12, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x4c(%rsi), %xmm5
+	palignr	$12, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x5c(%rsi), %xmm6
+	palignr	$12, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x6c(%rsi), %xmm7
+	palignr	$12, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x7c(%rsi), %xmm8
+	palignr	$12, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x8c(%rsi), %xmm9
+	palignr	$12, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx	/* Sets the flags for jae; the two lea below leave them intact.  */
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_12_bwd)	/* Repeat while the sub did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* xmm0/r8 are set up before this excerpt; presumably the saved unaligned tail -- TODO confirm.  */
+	add	$0x80, %rdx	/* Undo the final sub so rdx is the leftover byte count.  */
+	sub	%rdx, %rdi	/* Point both pointers at the start of the leftover; the bwd tail handlers use positive offsets.  */
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	.p2align 4
+L(shl_13):	/* Forward loop, 128 bytes per pass; movaps forces rsi-13 and rdi to be 16-byte aligned.  */
+	sub	$0x80, %rdx	/* Flags from this sub feed the jae below; mov/palignr/lea do not touch them.  */
+	movaps	-0x0d(%rsi), %xmm1
+	movaps	0x03(%rsi), %xmm2
+	movaps	0x13(%rsi), %xmm3
+	movaps	0x23(%rsi), %xmm4
+	movaps	0x33(%rsi), %xmm5
+	movaps	0x43(%rsi), %xmm6
+	movaps	0x53(%rsi), %xmm7
+	movaps	0x63(%rsi), %xmm8
+	movaps	0x73(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$13, %xmm8, %xmm9	/* xmm9 = low 16 bytes of (xmm9:xmm8) >> 13 bytes, i.e. source bytes 0x70..0x7f.  */
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$13, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$13, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$13, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$13, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$13, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$13, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_13)	/* Repeat while the sub at the top did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* xmm0/r8 are set up before this excerpt; presumably the saved unaligned head -- TODO confirm.  */
+	add	$0x80, %rdx	/* Undo the final sub so rdx is the leftover byte count.  */
+	add	%rdx, %rdi	/* Point both pointers past the leftover; the fwd tail handlers use negative offsets.  */
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	.p2align 4
+L(shl_13_bwd):	/* Backward loop, 128 bytes per pass; movaps forces rsi-13 and rdi to be 16-byte aligned.  */
+	movaps	-0x0d(%rsi), %xmm1
+
+	movaps	-0x1d(%rsi), %xmm2
+	palignr	$13, %xmm2, %xmm1	/* xmm1 = low 16 bytes of (xmm1:xmm2) >> 13 bytes, i.e. source bytes -0x10..-0x01.  */
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x2d(%rsi), %xmm3
+	palignr	$13, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x3d(%rsi), %xmm4
+	palignr	$13, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x4d(%rsi), %xmm5
+	palignr	$13, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x5d(%rsi), %xmm6
+	palignr	$13, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x6d(%rsi), %xmm7
+	palignr	$13, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x7d(%rsi), %xmm8
+	palignr	$13, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x8d(%rsi), %xmm9
+	palignr	$13, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx	/* Sets the flags for jae; the two lea below leave them intact.  */
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_13_bwd)	/* Repeat while the sub did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* xmm0/r8 are set up before this excerpt; presumably the saved unaligned tail -- TODO confirm.  */
+	add	$0x80, %rdx	/* Undo the final sub so rdx is the leftover byte count.  */
+	sub	%rdx, %rdi	/* Point both pointers at the start of the leftover; the bwd tail handlers use positive offsets.  */
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	.p2align 4
+L(shl_14):	/* Forward loop, 128 bytes per pass; movaps forces rsi-14 and rdi to be 16-byte aligned.  */
+	sub	$0x80, %rdx	/* Flags from this sub feed the jae below; mov/palignr/lea do not touch them.  */
+	movaps	-0x0e(%rsi), %xmm1
+	movaps	0x02(%rsi), %xmm2
+	movaps	0x12(%rsi), %xmm3
+	movaps	0x22(%rsi), %xmm4
+	movaps	0x32(%rsi), %xmm5
+	movaps	0x42(%rsi), %xmm6
+	movaps	0x52(%rsi), %xmm7
+	movaps	0x62(%rsi), %xmm8
+	movaps	0x72(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$14, %xmm8, %xmm9	/* xmm9 = low 16 bytes of (xmm9:xmm8) >> 14 bytes, i.e. source bytes 0x70..0x7f.  */
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$14, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$14, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$14, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$14, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$14, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$14, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_14)	/* Repeat while the sub at the top did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* xmm0/r8 are set up before this excerpt; presumably the saved unaligned head -- TODO confirm.  */
+	add	$0x80, %rdx	/* Undo the final sub so rdx is the leftover byte count.  */
+	add	%rdx, %rdi	/* Point both pointers past the leftover; the fwd tail handlers use negative offsets.  */
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	.p2align 4
+L(shl_14_bwd):	/* Backward loop, 128 bytes per pass; movaps forces rsi-14 and rdi to be 16-byte aligned.  */
+	movaps	-0x0e(%rsi), %xmm1
+
+	movaps	-0x1e(%rsi), %xmm2
+	palignr	$14, %xmm2, %xmm1	/* xmm1 = low 16 bytes of (xmm1:xmm2) >> 14 bytes, i.e. source bytes -0x10..-0x01.  */
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x2e(%rsi), %xmm3
+	palignr	$14, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x3e(%rsi), %xmm4
+	palignr	$14, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x4e(%rsi), %xmm5
+	palignr	$14, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x5e(%rsi), %xmm6
+	palignr	$14, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x6e(%rsi), %xmm7
+	palignr	$14, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x7e(%rsi), %xmm8
+	palignr	$14, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x8e(%rsi), %xmm9
+	palignr	$14, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx	/* Sets the flags for jae; the two lea below leave them intact.  */
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_14_bwd)	/* Repeat while the sub did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* xmm0/r8 are set up before this excerpt; presumably the saved unaligned tail -- TODO confirm.  */
+	add	$0x80, %rdx	/* Undo the final sub so rdx is the leftover byte count.  */
+	sub	%rdx, %rdi	/* Point both pointers at the start of the leftover; the bwd tail handlers use positive offsets.  */
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	.p2align 4
+L(shl_15):	/* Forward loop, 128 bytes per pass; movaps forces rsi-15 and rdi to be 16-byte aligned.  */
+	sub	$0x80, %rdx	/* Flags from this sub feed the jae below; mov/palignr/lea do not touch them.  */
+	movaps	-0x0f(%rsi), %xmm1
+	movaps	0x01(%rsi), %xmm2
+	movaps	0x11(%rsi), %xmm3
+	movaps	0x21(%rsi), %xmm4
+	movaps	0x31(%rsi), %xmm5
+	movaps	0x41(%rsi), %xmm6
+	movaps	0x51(%rsi), %xmm7
+	movaps	0x61(%rsi), %xmm8
+	movaps	0x71(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$15, %xmm8, %xmm9	/* xmm9 = low 16 bytes of (xmm9:xmm8) >> 15 bytes, i.e. source bytes 0x70..0x7f.  */
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$15, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$15, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$15, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$15, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$15, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$15, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_15)	/* Repeat while the sub at the top did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* xmm0/r8 are set up before this excerpt; presumably the saved unaligned head -- TODO confirm.  */
+	add	$0x80, %rdx	/* Undo the final sub so rdx is the leftover byte count.  */
+	add	%rdx, %rdi	/* Point both pointers past the leftover; the fwd tail handlers use negative offsets.  */
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	.p2align 4
+L(shl_15_bwd):	/* Backward loop, 128 bytes per pass; movaps forces rsi-15 and rdi to be 16-byte aligned.  */
+	movaps	-0x0f(%rsi), %xmm1
+
+	movaps	-0x1f(%rsi), %xmm2
+	palignr	$15, %xmm2, %xmm1	/* xmm1 = low 16 bytes of (xmm1:xmm2) >> 15 bytes, i.e. source bytes -0x10..-0x01.  */
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x2f(%rsi), %xmm3
+	palignr	$15, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x3f(%rsi), %xmm4
+	palignr	$15, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x4f(%rsi), %xmm5
+	palignr	$15, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x5f(%rsi), %xmm6
+	palignr	$15, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x6f(%rsi), %xmm7
+	palignr	$15, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x7f(%rsi), %xmm8
+	palignr	$15, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x8f(%rsi), %xmm9
+	palignr	$15, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx	/* Sets the flags for jae; the two lea below leave them intact.  */
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_15_bwd)	/* Repeat while the sub did not borrow.  */
+	movdqu	%xmm0, (%r8)	/* xmm0/r8 are set up before this excerpt; presumably the saved unaligned tail -- TODO confirm.  */
+	add	$0x80, %rdx	/* Undo the final sub so rdx is the leftover byte count.  */
+	sub	%rdx, %rdi	/* Point both pointers at the start of the leftover; the bwd tail handlers use positive offsets.  */
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	.p2align 4
+L(gobble_mem_fwd):	/* Forward bulk copy: picks non-temporal and/or ordinary stores from the length and the shared-cache size.  */
+	movdqu	(%rsi), %xmm1	/* Copy the next 16 bytes up front.  */
+	movdqu	%xmm0, (%r8)	/* xmm0/r8 are set up before this excerpt; presumably the saved unaligned head -- TODO confirm.  */
+	movdqa	%xmm1, (%rdi)	/* Aligned store: rdi must already be 16-byte aligned here.  */
+	sub	$16, %rdx
+	add	$16, %rsi
+	add	$16, %rdi
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %RCX_LP
+#else
+	mov	__x86_shared_cache_size_half(%rip), %RCX_LP
+#endif
+#ifdef USE_AS_MEMMOVE
+	mov	%rsi, %r9
+	sub	%rdi, %r9	/* r9 = src - dst (unsigned distance).  */
+	cmp	%rdx, %r9
+	jae	L(memmove_is_memcpy_fwd)	/* Distance >= length: take the same path as plain memcpy.  */
+	cmp	%rcx, %r9
+	jbe	L(ll_cache_copy_fwd_start)	/* Distance <= rcx: use only the ordinary-store loop.  */
+L(memmove_is_memcpy_fwd):
+#endif
+	cmp	%rcx, %rdx
+	ja	L(bigger_in_fwd)
+	mov	%rdx, %rcx	/* rcx = min (rcx, rdx).  */
+L(bigger_in_fwd):
+	sub	%rcx, %rdx	/* rdx = bytes beyond the rcx share.  */
+	cmp	$0x1000, %rdx
+	jbe	L(ll_cache_copy_fwd)	/* At most 4 KiB beyond: do everything with ordinary stores.  */
+
+	mov	%rcx, %r9
+	shl	$3, %r9	/* r9 = 8 * rcx.  */
+	cmp	%r9, %rdx
+	jbe	L(2steps_copy_fwd)	/* Two steps: rdx bytes non-temporal, then rcx bytes ordinary.  */
+	add	%rcx, %rdx	/* Otherwise fold rcx back in: the whole length goes non-temporal.  */
+	xor	%rcx, %rcx
+L(2steps_copy_fwd):
+	sub	$0x80, %rdx	/* Pre-bias: the loop copies a block before testing the count.  */
+L(gobble_mem_fwd_loop):
+	sub	$0x80, %rdx	/* Flags consumed by the jae at the bottom; nothing in between modifies them.  */
+	prefetcht0 0x200(%rsi)
+	prefetcht0 0x300(%rsi)
+	movdqu	(%rsi), %xmm0
+	movdqu	0x10(%rsi), %xmm1
+	movdqu	0x20(%rsi), %xmm2
+	movdqu	0x30(%rsi), %xmm3
+	movdqu	0x40(%rsi), %xmm4
+	movdqu	0x50(%rsi), %xmm5
+	movdqu	0x60(%rsi), %xmm6
+	movdqu	0x70(%rsi), %xmm7
+	lfence	/* All eight loads complete before the non-temporal stores start.  */
+	movntdq	%xmm0, (%rdi)
+	movntdq	%xmm1, 0x10(%rdi)
+	movntdq	%xmm2, 0x20(%rdi)
+	movntdq	%xmm3, 0x30(%rdi)
+	movntdq	%xmm4, 0x40(%rdi)
+	movntdq	%xmm5, 0x50(%rdi)
+	movntdq	%xmm6, 0x60(%rdi)
+	movntdq	%xmm7, 0x70(%rdi)
+	lea	0x80(%rsi), %rsi
+	lea	0x80(%rdi), %rdi
+	jae	L(gobble_mem_fwd_loop)
+	sfence	/* Order the non-temporal stores before what follows.  */
+	cmp	$0x80, %rcx
+	jb	L(gobble_mem_fwd_end)	/* Ordinary-store share below one block: go straight to the tail.  */
+	add	$0x80, %rdx	/* Remove the bias before adding the rcx share.  */
+L(ll_cache_copy_fwd):
+	add	%rcx, %rdx
+L(ll_cache_copy_fwd_start):
+	sub	$0x80, %rdx	/* Same pre-bias as above for the ordinary-store loop.  */
+L(gobble_ll_loop_fwd):
+	prefetchnta 0x1c0(%rsi)
+	prefetchnta 0x280(%rsi)
+	prefetchnta 0x1c0(%rdi)
+	prefetchnta 0x280(%rdi)
+	sub	$0x80, %rdx	/* Flags consumed by the jae at the bottom.  */
+	movdqu	(%rsi), %xmm0
+	movdqu	0x10(%rsi), %xmm1
+	movdqu	0x20(%rsi), %xmm2
+	movdqu	0x30(%rsi), %xmm3
+	movdqu	0x40(%rsi), %xmm4
+	movdqu	0x50(%rsi), %xmm5
+	movdqu	0x60(%rsi), %xmm6
+	movdqu	0x70(%rsi), %xmm7
+	movdqa	%xmm0, (%rdi)
+	movdqa	%xmm1, 0x10(%rdi)
+	movdqa	%xmm2, 0x20(%rdi)
+	movdqa	%xmm3, 0x30(%rdi)
+	movdqa	%xmm4, 0x40(%rdi)
+	movdqa	%xmm5, 0x50(%rdi)
+	movdqa	%xmm6, 0x60(%rdi)
+	movdqa	%xmm7, 0x70(%rdi)
+	lea	0x80(%rsi), %rsi
+	lea	0x80(%rdi), %rdi
+	jae	L(gobble_ll_loop_fwd)
+L(gobble_mem_fwd_end):
+	add	$0x80, %rdx	/* Undo the bias so rdx is the leftover byte count.  */
+	add	%rdx, %rsi	/* Point both pointers past the leftover; the fwd tail handlers use negative offsets.  */
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	.p2align 4
+L(gobble_mem_bwd):	/* Backward bulk copy: mirror of the forward version, walking down from the end of both buffers.  */
+	add	%rdx, %rsi	/* Move both pointers to one past the end.  */
+	add	%rdx, %rdi
+
+	movdqu	-16(%rsi), %xmm0	/* Save the last 16 source bytes; stored to (%r8) at the very end.  */
+	lea	-16(%rdi), %r8	/* r8 = address of the last 16 destination bytes.  */
+	mov	%rdi, %r9
+	and	$-16, %rdi	/* Round the destination end down to 16-byte alignment.  */
+	sub	%rdi, %r9	/* r9 = number of bytes trimmed by the rounding.  */
+	sub	%r9, %rsi	/* Apply the same trim to the source end and the length.  */
+	sub	%r9, %rdx
+
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %RCX_LP
+#else
+	mov	__x86_shared_cache_size_half(%rip), %RCX_LP
+#endif
+#ifdef USE_AS_MEMMOVE
+	mov	%rdi, %r9
+	sub	%rsi, %r9	/* r9 = dst - src (unsigned distance).  */
+	cmp	%rdx, %r9
+	jae	L(memmove_is_memcpy_bwd)	/* Distance >= length: take the same path as plain memcpy.  */
+	cmp	%rcx, %r9
+	jbe	L(ll_cache_copy_bwd_start)	/* Distance <= rcx: use only the ordinary-store loop.  */
+L(memmove_is_memcpy_bwd):
+#endif
+	cmp	%rcx, %rdx
+	ja	L(bigger)
+	mov	%rdx, %rcx	/* rcx = min (rcx, rdx).  */
+L(bigger):
+	sub	%rcx, %rdx	/* rdx = bytes beyond the rcx share.  */
+	cmp	$0x1000, %rdx
+	jbe	L(ll_cache_copy)	/* At most 4 KiB beyond: do everything with ordinary stores.  */
+
+	mov	%rcx, %r9
+	shl	$3, %r9	/* r9 = 8 * rcx.  */
+	cmp	%r9, %rdx
+	jbe	L(2steps_copy)	/* Two steps: rdx bytes non-temporal, then rcx bytes ordinary.  */
+	add	%rcx, %rdx	/* Otherwise fold rcx back in: the whole length goes non-temporal.  */
+	xor	%rcx, %rcx
+L(2steps_copy):
+	sub	$0x80, %rdx	/* Pre-bias: the loop copies a block before testing the count.  */
+L(gobble_mem_bwd_loop):
+	sub	$0x80, %rdx	/* Flags consumed by the jae at the bottom; nothing in between modifies them.  */
+	prefetcht0 -0x200(%rsi)
+	prefetcht0 -0x300(%rsi)
+	movdqu	-0x10(%rsi), %xmm1
+	movdqu	-0x20(%rsi), %xmm2
+	movdqu	-0x30(%rsi), %xmm3
+	movdqu	-0x40(%rsi), %xmm4
+	movdqu	-0x50(%rsi), %xmm5
+	movdqu	-0x60(%rsi), %xmm6
+	movdqu	-0x70(%rsi), %xmm7
+	movdqu	-0x80(%rsi), %xmm8
+	lfence	/* All eight loads complete before the non-temporal stores start.  */
+	movntdq	%xmm1, -0x10(%rdi)
+	movntdq	%xmm2, -0x20(%rdi)
+	movntdq	%xmm3, -0x30(%rdi)
+	movntdq	%xmm4, -0x40(%rdi)
+	movntdq	%xmm5, -0x50(%rdi)
+	movntdq	%xmm6, -0x60(%rdi)
+	movntdq	%xmm7, -0x70(%rdi)
+	movntdq	%xmm8, -0x80(%rdi)
+	lea	-0x80(%rsi), %rsi
+	lea	-0x80(%rdi), %rdi
+	jae	L(gobble_mem_bwd_loop)
+	sfence	/* Order the non-temporal stores before what follows.  */
+	cmp	$0x80, %rcx
+	jb	L(gobble_mem_bwd_end)	/* Ordinary-store share below one block: go straight to the tail.  */
+	add	$0x80, %rdx	/* Remove the bias before adding the rcx share.  */
+L(ll_cache_copy):
+	add	%rcx, %rdx
+L(ll_cache_copy_bwd_start):
+	sub	$0x80, %rdx	/* Same pre-bias as above for the ordinary-store loop.  */
+L(gobble_ll_loop):
+	prefetchnta -0x1c0(%rsi)
+	prefetchnta -0x280(%rsi)
+	prefetchnta -0x1c0(%rdi)
+	prefetchnta -0x280(%rdi)
+	sub	$0x80, %rdx	/* Flags consumed by the jae at the bottom.  */
+	movdqu	-0x10(%rsi), %xmm1
+	movdqu	-0x20(%rsi), %xmm2
+	movdqu	-0x30(%rsi), %xmm3
+	movdqu	-0x40(%rsi), %xmm4
+	movdqu	-0x50(%rsi), %xmm5
+	movdqu	-0x60(%rsi), %xmm6
+	movdqu	-0x70(%rsi), %xmm7
+	movdqu	-0x80(%rsi), %xmm8
+	movdqa	%xmm1, -0x10(%rdi)
+	movdqa	%xmm2, -0x20(%rdi)
+	movdqa	%xmm3, -0x30(%rdi)
+	movdqa	%xmm4, -0x40(%rdi)
+	movdqa	%xmm5, -0x50(%rdi)
+	movdqa	%xmm6, -0x60(%rdi)
+	movdqa	%xmm7, -0x70(%rdi)
+	movdqa	%xmm8, -0x80(%rdi)
+	lea	-0x80(%rsi), %rsi
+	lea	-0x80(%rdi), %rdi
+	jae	L(gobble_ll_loop)
+L(gobble_mem_bwd_end):
+	movdqu	%xmm0, (%r8)	/* Write the last 16 bytes saved at entry.  */
+	add	$0x80, %rdx	/* Undo the bias so rdx is the leftover byte count.  */
+	sub	%rdx, %rsi	/* Point both pointers at the start of the leftover; the bwd tail handlers use positive offsets.  */
+	sub	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	.p2align 4
+L(fwd_write_128bytes):	/* Forward tail: rsi/rdi point just past the data; each label copies 16 bytes and falls through.  */
+	lddqu	-128(%rsi), %xmm0
+	movdqu	%xmm0, -128(%rdi)
+L(fwd_write_112bytes):
+	lddqu	-112(%rsi), %xmm0
+	movdqu	%xmm0, -112(%rdi)
+L(fwd_write_96bytes):
+	lddqu	-96(%rsi), %xmm0
+	movdqu	%xmm0, -96(%rdi)
+L(fwd_write_80bytes):
+	lddqu	-80(%rsi), %xmm0
+	movdqu	%xmm0, -80(%rdi)
+L(fwd_write_64bytes):
+	lddqu	-64(%rsi), %xmm0
+	movdqu	%xmm0, -64(%rdi)
+L(fwd_write_48bytes):
+	lddqu	-48(%rsi), %xmm0
+	movdqu	%xmm0, -48(%rdi)
+L(fwd_write_32bytes):
+	lddqu	-32(%rsi), %xmm0
+	movdqu	%xmm0, -32(%rdi)
+L(fwd_write_16bytes):
+	lddqu	-16(%rsi), %xmm0
+	movdqu	%xmm0, -16(%rdi)
+L(fwd_write_0bytes):	/* Nothing left to copy.  */
+	ret
+
+
+	.p2align 4
+L(fwd_write_143bytes):	/* Forward tail for N = 15 mod 16: copy [-N, 0) from rsi to rdi, 16 bytes per label, falling through.  */
+	lddqu	-143(%rsi), %xmm0
+	movdqu	%xmm0, -143(%rdi)
+L(fwd_write_127bytes):
+	lddqu	-127(%rsi), %xmm0
+	movdqu	%xmm0, -127(%rdi)
+L(fwd_write_111bytes):
+	lddqu	-111(%rsi), %xmm0
+	movdqu	%xmm0, -111(%rdi)
+L(fwd_write_95bytes):
+	lddqu	-95(%rsi), %xmm0
+	movdqu	%xmm0, -95(%rdi)
+L(fwd_write_79bytes):
+	lddqu	-79(%rsi), %xmm0
+	movdqu	%xmm0, -79(%rdi)
+L(fwd_write_63bytes):
+	lddqu	-63(%rsi), %xmm0
+	movdqu	%xmm0, -63(%rdi)
+L(fwd_write_47bytes):
+	lddqu	-47(%rsi), %xmm0
+	movdqu	%xmm0, -47(%rdi)
+L(fwd_write_31bytes):	/* Last 31 bytes: two overlapping 16-byte moves, both loaded before either store.  */
+	lddqu	-31(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -31(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_15bytes):	/* Two overlapping 8-byte moves cover [-15, 0); rdx and rcx are overwritten.  */
+	mov	-15(%rsi), %rdx
+	mov	-8(%rsi), %rcx
+	mov	%rdx, -15(%rdi)
+	mov	%rcx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_142bytes):	/* Forward tail for N = 14 mod 16: copy [-N, 0) from rsi to rdi, 16 bytes per label, falling through.  */
+	lddqu	-142(%rsi), %xmm0
+	movdqu	%xmm0, -142(%rdi)
+L(fwd_write_126bytes):
+	lddqu	-126(%rsi), %xmm0
+	movdqu	%xmm0, -126(%rdi)
+L(fwd_write_110bytes):
+	lddqu	-110(%rsi), %xmm0
+	movdqu	%xmm0, -110(%rdi)
+L(fwd_write_94bytes):
+	lddqu	-94(%rsi), %xmm0
+	movdqu	%xmm0, -94(%rdi)
+L(fwd_write_78bytes):
+	lddqu	-78(%rsi), %xmm0
+	movdqu	%xmm0, -78(%rdi)
+L(fwd_write_62bytes):
+	lddqu	-62(%rsi), %xmm0
+	movdqu	%xmm0, -62(%rdi)
+L(fwd_write_46bytes):
+	lddqu	-46(%rsi), %xmm0
+	movdqu	%xmm0, -46(%rdi)
+L(fwd_write_30bytes):	/* Last 30 bytes: two overlapping 16-byte moves, both loaded before either store.  */
+	lddqu	-30(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -30(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_14bytes):	/* Two overlapping 8-byte moves cover [-14, 0); rdx and rcx are overwritten.  */
+	mov	-14(%rsi), %rdx
+	mov	-8(%rsi), %rcx
+	mov	%rdx, -14(%rdi)
+	mov	%rcx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_141bytes):	/* Forward tail for N = 13 mod 16: copy [-N, 0) from rsi to rdi, 16 bytes per label, falling through.  */
+	lddqu	-141(%rsi), %xmm0
+	movdqu	%xmm0, -141(%rdi)
+L(fwd_write_125bytes):
+	lddqu	-125(%rsi), %xmm0
+	movdqu	%xmm0, -125(%rdi)
+L(fwd_write_109bytes):
+	lddqu	-109(%rsi), %xmm0
+	movdqu	%xmm0, -109(%rdi)
+L(fwd_write_93bytes):
+	lddqu	-93(%rsi), %xmm0
+	movdqu	%xmm0, -93(%rdi)
+L(fwd_write_77bytes):
+	lddqu	-77(%rsi), %xmm0
+	movdqu	%xmm0, -77(%rdi)
+L(fwd_write_61bytes):
+	lddqu	-61(%rsi), %xmm0
+	movdqu	%xmm0, -61(%rdi)
+L(fwd_write_45bytes):
+	lddqu	-45(%rsi), %xmm0
+	movdqu	%xmm0, -45(%rdi)
+L(fwd_write_29bytes):	/* Last 29 bytes: two overlapping 16-byte moves, both loaded before either store.  */
+	lddqu	-29(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -29(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_13bytes):	/* Two overlapping 8-byte moves cover [-13, 0); rdx and rcx are overwritten.  */
+	mov	-13(%rsi), %rdx
+	mov	-8(%rsi), %rcx
+	mov	%rdx, -13(%rdi)
+	mov	%rcx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_140bytes):	/* Forward tail for N = 12 mod 16: copy [-N, 0) from rsi to rdi, 16 bytes per label, falling through.  */
+	lddqu	-140(%rsi), %xmm0
+	movdqu	%xmm0, -140(%rdi)
+L(fwd_write_124bytes):
+	lddqu	-124(%rsi), %xmm0
+	movdqu	%xmm0, -124(%rdi)
+L(fwd_write_108bytes):
+	lddqu	-108(%rsi), %xmm0
+	movdqu	%xmm0, -108(%rdi)
+L(fwd_write_92bytes):
+	lddqu	-92(%rsi), %xmm0
+	movdqu	%xmm0, -92(%rdi)
+L(fwd_write_76bytes):
+	lddqu	-76(%rsi), %xmm0
+	movdqu	%xmm0, -76(%rdi)
+L(fwd_write_60bytes):
+	lddqu	-60(%rsi), %xmm0
+	movdqu	%xmm0, -60(%rdi)
+L(fwd_write_44bytes):
+	lddqu	-44(%rsi), %xmm0
+	movdqu	%xmm0, -44(%rdi)
+L(fwd_write_28bytes):	/* Last 28 bytes: two overlapping 16-byte moves, both loaded before either store.  */
+	lddqu	-28(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -28(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_12bytes):	/* An 8-byte move at -12 plus a 4-byte move at -4 cover [-12, 0); rdx and ecx are overwritten.  */
+	mov	-12(%rsi), %rdx
+	mov	-4(%rsi), %ecx
+	mov	%rdx, -12(%rdi)
+	mov	%ecx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_139bytes):	/* Forward tail for N = 11 mod 16: copy [-N, 0) from rsi to rdi, 16 bytes per label, falling through.  */
+	lddqu	-139(%rsi), %xmm0
+	movdqu	%xmm0, -139(%rdi)
+L(fwd_write_123bytes):
+	lddqu	-123(%rsi), %xmm0
+	movdqu	%xmm0, -123(%rdi)
+L(fwd_write_107bytes):
+	lddqu	-107(%rsi), %xmm0
+	movdqu	%xmm0, -107(%rdi)
+L(fwd_write_91bytes):
+	lddqu	-91(%rsi), %xmm0
+	movdqu	%xmm0, -91(%rdi)
+L(fwd_write_75bytes):
+	lddqu	-75(%rsi), %xmm0
+	movdqu	%xmm0, -75(%rdi)
+L(fwd_write_59bytes):
+	lddqu	-59(%rsi), %xmm0
+	movdqu	%xmm0, -59(%rdi)
+L(fwd_write_43bytes):
+	lddqu	-43(%rsi), %xmm0
+	movdqu	%xmm0, -43(%rdi)
+L(fwd_write_27bytes):	/* Last 27 bytes: two overlapping 16-byte moves, both loaded before either store.  */
+	lddqu	-27(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -27(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_11bytes):	/* An 8-byte move at -11 overlapping a 4-byte move at -4 covers [-11, 0); rdx and ecx are overwritten.  */
+	mov	-11(%rsi), %rdx
+	mov	-4(%rsi), %ecx
+	mov	%rdx, -11(%rdi)
+	mov	%ecx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_138bytes):	/* Forward tail for N = 10 mod 16: copy [-N, 0) from rsi to rdi, 16 bytes per label, falling through.  */
+	lddqu	-138(%rsi), %xmm0
+	movdqu	%xmm0, -138(%rdi)
+L(fwd_write_122bytes):
+	lddqu	-122(%rsi), %xmm0
+	movdqu	%xmm0, -122(%rdi)
+L(fwd_write_106bytes):
+	lddqu	-106(%rsi), %xmm0
+	movdqu	%xmm0, -106(%rdi)
+L(fwd_write_90bytes):
+	lddqu	-90(%rsi), %xmm0
+	movdqu	%xmm0, -90(%rdi)
+L(fwd_write_74bytes):
+	lddqu	-74(%rsi), %xmm0
+	movdqu	%xmm0, -74(%rdi)
+L(fwd_write_58bytes):
+	lddqu	-58(%rsi), %xmm0
+	movdqu	%xmm0, -58(%rdi)
+L(fwd_write_42bytes):
+	lddqu	-42(%rsi), %xmm0
+	movdqu	%xmm0, -42(%rdi)
+L(fwd_write_26bytes):	/* Last 26 bytes: two overlapping 16-byte moves, both loaded before either store.  */
+	lddqu	-26(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -26(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_10bytes):	/* An 8-byte move at -10 overlapping a 4-byte move at -4 covers [-10, 0); rdx and ecx are overwritten.  */
+	mov	-10(%rsi), %rdx
+	mov	-4(%rsi), %ecx
+	mov	%rdx, -10(%rdi)
+	mov	%ecx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_137bytes):	/* Forward tail for N = 9 mod 16: copy [-N, 0) from rsi to rdi, 16 bytes per label, falling through.  */
+	lddqu	-137(%rsi), %xmm0
+	movdqu	%xmm0, -137(%rdi)
+L(fwd_write_121bytes):
+	lddqu	-121(%rsi), %xmm0
+	movdqu	%xmm0, -121(%rdi)
+L(fwd_write_105bytes):
+	lddqu	-105(%rsi), %xmm0
+	movdqu	%xmm0, -105(%rdi)
+L(fwd_write_89bytes):
+	lddqu	-89(%rsi), %xmm0
+	movdqu	%xmm0, -89(%rdi)
+L(fwd_write_73bytes):
+	lddqu	-73(%rsi), %xmm0
+	movdqu	%xmm0, -73(%rdi)
+L(fwd_write_57bytes):
+	lddqu	-57(%rsi), %xmm0
+	movdqu	%xmm0, -57(%rdi)
+L(fwd_write_41bytes):
+	lddqu	-41(%rsi), %xmm0
+	movdqu	%xmm0, -41(%rdi)
+L(fwd_write_25bytes):	/* Last 25 bytes: two overlapping 16-byte moves, both loaded before either store.  */
+	lddqu	-25(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -25(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_9bytes):	/* An 8-byte move at -9 overlapping a 4-byte move at -4 covers [-9, 0); rdx and ecx are overwritten.  */
+	mov	-9(%rsi), %rdx
+	mov	-4(%rsi), %ecx
+	mov	%rdx, -9(%rdi)
+	mov	%ecx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_136bytes):	/* Forward tail for N = 8 mod 16: copy [-N, 0) from rsi to rdi, 16 bytes per label, falling through.  */
+	lddqu	-136(%rsi), %xmm0
+	movdqu	%xmm0, -136(%rdi)
+L(fwd_write_120bytes):
+	lddqu	-120(%rsi), %xmm0
+	movdqu	%xmm0, -120(%rdi)
+L(fwd_write_104bytes):
+	lddqu	-104(%rsi), %xmm0
+	movdqu	%xmm0, -104(%rdi)
+L(fwd_write_88bytes):
+	lddqu	-88(%rsi), %xmm0
+	movdqu	%xmm0, -88(%rdi)
+L(fwd_write_72bytes):
+	lddqu	-72(%rsi), %xmm0
+	movdqu	%xmm0, -72(%rdi)
+L(fwd_write_56bytes):
+	lddqu	-56(%rsi), %xmm0
+	movdqu	%xmm0, -56(%rdi)
+L(fwd_write_40bytes):
+	lddqu	-40(%rsi), %xmm0
+	movdqu	%xmm0, -40(%rdi)
+L(fwd_write_24bytes):	/* Last 24 bytes: two overlapping 16-byte moves, both loaded before either store.  */
+	lddqu	-24(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -24(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_8bytes):	/* Single 8-byte move covers [-8, 0); rdx is overwritten.  */
+	mov	-8(%rsi), %rdx
+	mov	%rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_135bytes):	/* Forward tail for N = 7 mod 16: copy [-N, 0) from rsi to rdi, 16 bytes per label, falling through.  */
+	lddqu	-135(%rsi), %xmm0
+	movdqu	%xmm0, -135(%rdi)
+L(fwd_write_119bytes):
+	lddqu	-119(%rsi), %xmm0
+	movdqu	%xmm0, -119(%rdi)
+L(fwd_write_103bytes):
+	lddqu	-103(%rsi), %xmm0
+	movdqu	%xmm0, -103(%rdi)
+L(fwd_write_87bytes):
+	lddqu	-87(%rsi), %xmm0
+	movdqu	%xmm0, -87(%rdi)
+L(fwd_write_71bytes):
+	lddqu	-71(%rsi), %xmm0
+	movdqu	%xmm0, -71(%rdi)
+L(fwd_write_55bytes):
+	lddqu	-55(%rsi), %xmm0
+	movdqu	%xmm0, -55(%rdi)
+L(fwd_write_39bytes):
+	lddqu	-39(%rsi), %xmm0
+	movdqu	%xmm0, -39(%rdi)
+L(fwd_write_23bytes):	/* Last 23 bytes: two overlapping 16-byte moves, both loaded before either store.  */
+	lddqu	-23(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -23(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_7bytes):	/* Two overlapping 4-byte moves cover [-7, 0); edx and ecx are overwritten.  */
+	mov	-7(%rsi), %edx
+	mov	-4(%rsi), %ecx
+	mov	%edx, -7(%rdi)
+	mov	%ecx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_134bytes):	/* Forward tail for N = 6 mod 16: copy [-N, 0) from rsi to rdi, 16 bytes per label, falling through.  */
+	lddqu	-134(%rsi), %xmm0
+	movdqu	%xmm0, -134(%rdi)
+L(fwd_write_118bytes):
+	lddqu	-118(%rsi), %xmm0
+	movdqu	%xmm0, -118(%rdi)
+L(fwd_write_102bytes):
+	lddqu	-102(%rsi), %xmm0
+	movdqu	%xmm0, -102(%rdi)
+L(fwd_write_86bytes):
+	lddqu	-86(%rsi), %xmm0
+	movdqu	%xmm0, -86(%rdi)
+L(fwd_write_70bytes):
+	lddqu	-70(%rsi), %xmm0
+	movdqu	%xmm0, -70(%rdi)
+L(fwd_write_54bytes):
+	lddqu	-54(%rsi), %xmm0
+	movdqu	%xmm0, -54(%rdi)
+L(fwd_write_38bytes):
+	lddqu	-38(%rsi), %xmm0
+	movdqu	%xmm0, -38(%rdi)
+L(fwd_write_22bytes):	/* Last 22 bytes: two overlapping 16-byte moves, both loaded before either store.  */
+	lddqu	-22(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -22(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_6bytes):	/* Two overlapping 4-byte moves cover [-6, 0); edx and ecx are overwritten.  */
+	mov	-6(%rsi), %edx
+	mov	-4(%rsi), %ecx
+	mov	%edx, -6(%rdi)
+	mov	%ecx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_133bytes):	/* Forward tail for N = 5 mod 16: copy [-N, 0) from rsi to rdi, 16 bytes per label, falling through.  */
+	lddqu	-133(%rsi), %xmm0
+	movdqu	%xmm0, -133(%rdi)
+L(fwd_write_117bytes):
+	lddqu	-117(%rsi), %xmm0
+	movdqu	%xmm0, -117(%rdi)
+L(fwd_write_101bytes):
+	lddqu	-101(%rsi), %xmm0
+	movdqu	%xmm0, -101(%rdi)
+L(fwd_write_85bytes):
+	lddqu	-85(%rsi), %xmm0
+	movdqu	%xmm0, -85(%rdi)
+L(fwd_write_69bytes):
+	lddqu	-69(%rsi), %xmm0
+	movdqu	%xmm0, -69(%rdi)
+L(fwd_write_53bytes):
+	lddqu	-53(%rsi), %xmm0
+	movdqu	%xmm0, -53(%rdi)
+L(fwd_write_37bytes):
+	lddqu	-37(%rsi), %xmm0
+	movdqu	%xmm0, -37(%rdi)
+L(fwd_write_21bytes):	/* Last 21 bytes: two overlapping 16-byte moves, both loaded before either store.  */
+	lddqu	-21(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -21(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_5bytes):	/* Two overlapping 4-byte moves cover [-5, 0); edx and ecx are overwritten.  */
+	mov	-5(%rsi), %edx
+	mov	-4(%rsi), %ecx
+	mov	%edx, -5(%rdi)
+	mov	%ecx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_132bytes):	/* Forward tail for N = 4 mod 16: copy [-N, 0) from rsi to rdi, 16 bytes per label, falling through.  */
+	lddqu	-132(%rsi), %xmm0
+	movdqu	%xmm0, -132(%rdi)
+L(fwd_write_116bytes):
+	lddqu	-116(%rsi), %xmm0
+	movdqu	%xmm0, -116(%rdi)
+L(fwd_write_100bytes):
+	lddqu	-100(%rsi), %xmm0
+	movdqu	%xmm0, -100(%rdi)
+L(fwd_write_84bytes):
+	lddqu	-84(%rsi), %xmm0
+	movdqu	%xmm0, -84(%rdi)
+L(fwd_write_68bytes):
+	lddqu	-68(%rsi), %xmm0
+	movdqu	%xmm0, -68(%rdi)
+L(fwd_write_52bytes):
+	lddqu	-52(%rsi), %xmm0
+	movdqu	%xmm0, -52(%rdi)
+L(fwd_write_36bytes):
+	lddqu	-36(%rsi), %xmm0
+	movdqu	%xmm0, -36(%rdi)
+L(fwd_write_20bytes):	/* Last 20 bytes: two overlapping 16-byte moves, both loaded before either store.  */
+	lddqu	-20(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -20(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_4bytes):	/* Single 4-byte move covers [-4, 0); edx is overwritten.  */
+	mov	-4(%rsi), %edx
+	mov	%edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_131bytes):	/* Forward tail for N = 3 mod 16: copy [-N, 0) from rsi to rdi, 16 bytes per label, falling through.  */
+	lddqu	-131(%rsi), %xmm0
+	movdqu	%xmm0, -131(%rdi)
+L(fwd_write_115bytes):
+	lddqu	-115(%rsi), %xmm0
+	movdqu	%xmm0, -115(%rdi)
+L(fwd_write_99bytes):
+	lddqu	-99(%rsi), %xmm0
+	movdqu	%xmm0, -99(%rdi)
+L(fwd_write_83bytes):
+	lddqu	-83(%rsi), %xmm0
+	movdqu	%xmm0, -83(%rdi)
+L(fwd_write_67bytes):
+	lddqu	-67(%rsi), %xmm0
+	movdqu	%xmm0, -67(%rdi)
+L(fwd_write_51bytes):
+	lddqu	-51(%rsi), %xmm0
+	movdqu	%xmm0, -51(%rdi)
+L(fwd_write_35bytes):
+	lddqu	-35(%rsi), %xmm0
+	movdqu	%xmm0, -35(%rdi)
+L(fwd_write_19bytes):	/* Last 19 bytes: two overlapping 16-byte moves, both loaded before either store.  */
+	lddqu	-19(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -19(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_3bytes):	/* Two overlapping 2-byte moves cover [-3, 0); dx and cx are overwritten.  */
+	mov	-3(%rsi), %dx
+	mov	-2(%rsi), %cx
+	mov	%dx, -3(%rdi)
+	mov	%cx, -2(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_130bytes):	/* Forward tail for N = 2 mod 16: copy [-N, 0) from rsi to rdi, 16 bytes per label, falling through.  */
+	lddqu	-130(%rsi), %xmm0
+	movdqu	%xmm0, -130(%rdi)
+L(fwd_write_114bytes):
+	lddqu	-114(%rsi), %xmm0
+	movdqu	%xmm0, -114(%rdi)
+L(fwd_write_98bytes):
+	lddqu	-98(%rsi), %xmm0
+	movdqu	%xmm0, -98(%rdi)
+L(fwd_write_82bytes):
+	lddqu	-82(%rsi), %xmm0
+	movdqu	%xmm0, -82(%rdi)
+L(fwd_write_66bytes):
+	lddqu	-66(%rsi), %xmm0
+	movdqu	%xmm0, -66(%rdi)
+L(fwd_write_50bytes):
+	lddqu	-50(%rsi), %xmm0
+	movdqu	%xmm0, -50(%rdi)
+L(fwd_write_34bytes):
+	lddqu	-34(%rsi), %xmm0
+	movdqu	%xmm0, -34(%rdi)
+L(fwd_write_18bytes):	/* Last 18 bytes: two overlapping 16-byte moves, both loaded before either store.  */
+	lddqu	-18(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -18(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_2bytes):	/* Copy the final 2 bytes through edx (zero-extended load, 16-bit store).  */
+	movzwl	-2(%rsi), %edx
+	mov	%dx, -2(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_129bytes):	/* Forward tail for N = 1 mod 16: copy [-N, 0) from rsi to rdi, 16 bytes per label, falling through.  */
+	lddqu	-129(%rsi), %xmm0
+	movdqu	%xmm0, -129(%rdi)
+L(fwd_write_113bytes):
+	lddqu	-113(%rsi), %xmm0
+	movdqu	%xmm0, -113(%rdi)
+L(fwd_write_97bytes):
+	lddqu	-97(%rsi), %xmm0
+	movdqu	%xmm0, -97(%rdi)
+L(fwd_write_81bytes):
+	lddqu	-81(%rsi), %xmm0
+	movdqu	%xmm0, -81(%rdi)
+L(fwd_write_65bytes):
+	lddqu	-65(%rsi), %xmm0
+	movdqu	%xmm0, -65(%rdi)
+L(fwd_write_49bytes):
+	lddqu	-49(%rsi), %xmm0
+	movdqu	%xmm0, -49(%rdi)
+L(fwd_write_33bytes):
+	lddqu	-33(%rsi), %xmm0
+	movdqu	%xmm0, -33(%rdi)
+L(fwd_write_17bytes):	/* Last 17 bytes: two overlapping 16-byte moves, both loaded before either store.  */
+	lddqu	-17(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -17(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_1bytes):	/* Copy the final byte through edx (zero-extended load, 8-bit store).  */
+	movzbl	-1(%rsi), %edx
+	mov	%dl, -1(%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_128bytes):	/* Backward tail: rsi/rdi point at the start of the data; each label copies 16 bytes, highest first, and falls through.  */
+	lddqu	112(%rsi), %xmm0
+	movdqu	%xmm0, 112(%rdi)
+L(bwd_write_112bytes):
+	lddqu	96(%rsi), %xmm0
+	movdqu	%xmm0, 96(%rdi)
+L(bwd_write_96bytes):
+	lddqu	80(%rsi), %xmm0
+	movdqu	%xmm0, 80(%rdi)
+L(bwd_write_80bytes):
+	lddqu	64(%rsi), %xmm0
+	movdqu	%xmm0, 64(%rdi)
+L(bwd_write_64bytes):
+	lddqu	48(%rsi), %xmm0
+	movdqu	%xmm0, 48(%rdi)
+L(bwd_write_48bytes):
+	lddqu	32(%rsi), %xmm0
+	movdqu	%xmm0, 32(%rdi)
+L(bwd_write_32bytes):
+	lddqu	16(%rsi), %xmm0
+	movdqu	%xmm0, 16(%rdi)
+L(bwd_write_16bytes):
+	lddqu	(%rsi), %xmm0
+	movdqu	%xmm0, (%rdi)
+L(bwd_write_0bytes):	/* Nothing left to copy.  */
+	ret
+
+	.p2align 4
+L(bwd_write_143bytes):	/* Backward tail for N = 15 mod 16: copy [0, N) from rsi to rdi, highest 16 bytes first, falling through.  */
+	lddqu	127(%rsi), %xmm0
+	movdqu	%xmm0, 127(%rdi)
+L(bwd_write_127bytes):
+	lddqu	111(%rsi), %xmm0
+	movdqu	%xmm0, 111(%rdi)
+L(bwd_write_111bytes):
+	lddqu	95(%rsi), %xmm0
+	movdqu	%xmm0, 95(%rdi)
+L(bwd_write_95bytes):
+	lddqu	79(%rsi), %xmm0
+	movdqu	%xmm0, 79(%rdi)
+L(bwd_write_79bytes):
+	lddqu	63(%rsi), %xmm0
+	movdqu	%xmm0, 63(%rdi)
+L(bwd_write_63bytes):
+	lddqu	47(%rsi), %xmm0
+	movdqu	%xmm0, 47(%rdi)
+L(bwd_write_47bytes):
+	lddqu	31(%rsi), %xmm0
+	movdqu	%xmm0, 31(%rdi)
+L(bwd_write_31bytes):	/* First 31 bytes: two overlapping 16-byte moves, both loaded before either store.  */
+	lddqu	15(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1
+	movdqu	%xmm0, 15(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+
+	.p2align 4
+L(bwd_write_15bytes):	/* Copy 15 bytes: two overlapping 8-byte moves.  */
+	mov	7(%rsi), %rdx
+	mov	(%rsi), %rcx	/* Both loads precede both stores.  */
+	mov	%rdx, 7(%rdi)
+	mov	%rcx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_142bytes):	/* Copy 142 bytes from (%rsi) to (%rdi), top down.  */
+	lddqu	126(%rsi), %xmm0	/* Each step moves 16 bytes, then falls through.  */
+	movdqu	%xmm0, 126(%rdi)
+L(bwd_write_126bytes):	/* Later labels are entries for smaller sizes.  */
+	lddqu	110(%rsi), %xmm0
+	movdqu	%xmm0, 110(%rdi)
+L(bwd_write_110bytes):
+	lddqu	94(%rsi), %xmm0
+	movdqu	%xmm0, 94(%rdi)
+L(bwd_write_94bytes):
+	lddqu	78(%rsi), %xmm0
+	movdqu	%xmm0, 78(%rdi)
+L(bwd_write_78bytes):
+	lddqu	62(%rsi), %xmm0
+	movdqu	%xmm0, 62(%rdi)
+L(bwd_write_62bytes):
+	lddqu	46(%rsi), %xmm0
+	movdqu	%xmm0, 46(%rdi)
+L(bwd_write_46bytes):
+	lddqu	30(%rsi), %xmm0
+	movdqu	%xmm0, 30(%rdi)
+L(bwd_write_30bytes):	/* Tail: two overlapping 16-byte copies.  */
+	lddqu	14(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1	/* Both loads precede both stores.  */
+	movdqu	%xmm0, 14(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_14bytes):	/* Copy 14 bytes: two overlapping 8-byte moves.  */
+	mov	6(%rsi), %rdx
+	mov	(%rsi), %rcx	/* Both loads precede both stores.  */
+	mov	%rdx, 6(%rdi)
+	mov	%rcx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_141bytes):	/* Copy 141 bytes from (%rsi) to (%rdi), top down.  */
+	lddqu	125(%rsi), %xmm0	/* Each step moves 16 bytes, then falls through.  */
+	movdqu	%xmm0, 125(%rdi)
+L(bwd_write_125bytes):	/* Later labels are entries for smaller sizes.  */
+	lddqu	109(%rsi), %xmm0
+	movdqu	%xmm0, 109(%rdi)
+L(bwd_write_109bytes):
+	lddqu	93(%rsi), %xmm0
+	movdqu	%xmm0, 93(%rdi)
+L(bwd_write_93bytes):
+	lddqu	77(%rsi), %xmm0
+	movdqu	%xmm0, 77(%rdi)
+L(bwd_write_77bytes):
+	lddqu	61(%rsi), %xmm0
+	movdqu	%xmm0, 61(%rdi)
+L(bwd_write_61bytes):
+	lddqu	45(%rsi), %xmm0
+	movdqu	%xmm0, 45(%rdi)
+L(bwd_write_45bytes):
+	lddqu	29(%rsi), %xmm0
+	movdqu	%xmm0, 29(%rdi)
+L(bwd_write_29bytes):	/* Tail: two overlapping 16-byte copies.  */
+	lddqu	13(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1	/* Both loads precede both stores.  */
+	movdqu	%xmm0, 13(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_13bytes):	/* Copy 13 bytes: two overlapping 8-byte moves.  */
+	mov	5(%rsi), %rdx
+	mov	(%rsi), %rcx	/* Both loads precede both stores.  */
+	mov	%rdx, 5(%rdi)
+	mov	%rcx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_140bytes):	/* Copy 140 bytes from (%rsi) to (%rdi), top down.  */
+	lddqu	124(%rsi), %xmm0	/* Each step moves 16 bytes, then falls through.  */
+	movdqu	%xmm0, 124(%rdi)
+L(bwd_write_124bytes):	/* Later labels are entries for smaller sizes.  */
+	lddqu	108(%rsi), %xmm0
+	movdqu	%xmm0, 108(%rdi)
+L(bwd_write_108bytes):
+	lddqu	92(%rsi), %xmm0
+	movdqu	%xmm0, 92(%rdi)
+L(bwd_write_92bytes):
+	lddqu	76(%rsi), %xmm0
+	movdqu	%xmm0, 76(%rdi)
+L(bwd_write_76bytes):
+	lddqu	60(%rsi), %xmm0
+	movdqu	%xmm0, 60(%rdi)
+L(bwd_write_60bytes):
+	lddqu	44(%rsi), %xmm0
+	movdqu	%xmm0, 44(%rdi)
+L(bwd_write_44bytes):
+	lddqu	28(%rsi), %xmm0
+	movdqu	%xmm0, 28(%rdi)
+L(bwd_write_28bytes):	/* Tail: two overlapping 16-byte copies.  */
+	lddqu	12(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1	/* Both loads precede both stores.  */
+	movdqu	%xmm0, 12(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_12bytes):	/* Copy 12 bytes: two overlapping 8-byte moves.  */
+	mov	4(%rsi), %rdx
+	mov	(%rsi), %rcx	/* Both loads precede both stores.  */
+	mov	%rdx, 4(%rdi)
+	mov	%rcx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_139bytes):	/* Copy 139 bytes from (%rsi) to (%rdi), top down.  */
+	lddqu	123(%rsi), %xmm0	/* Each step moves 16 bytes, then falls through.  */
+	movdqu	%xmm0, 123(%rdi)
+L(bwd_write_123bytes):	/* Later labels are entries for smaller sizes.  */
+	lddqu	107(%rsi), %xmm0
+	movdqu	%xmm0, 107(%rdi)
+L(bwd_write_107bytes):
+	lddqu	91(%rsi), %xmm0
+	movdqu	%xmm0, 91(%rdi)
+L(bwd_write_91bytes):
+	lddqu	75(%rsi), %xmm0
+	movdqu	%xmm0, 75(%rdi)
+L(bwd_write_75bytes):
+	lddqu	59(%rsi), %xmm0
+	movdqu	%xmm0, 59(%rdi)
+L(bwd_write_59bytes):
+	lddqu	43(%rsi), %xmm0
+	movdqu	%xmm0, 43(%rdi)
+L(bwd_write_43bytes):
+	lddqu	27(%rsi), %xmm0
+	movdqu	%xmm0, 27(%rdi)
+L(bwd_write_27bytes):	/* Tail: two overlapping 16-byte copies.  */
+	lddqu	11(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1	/* Both loads precede both stores.  */
+	movdqu	%xmm0, 11(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_11bytes):	/* Copy 11 bytes: two overlapping 8-byte moves.  */
+	mov	3(%rsi), %rdx
+	mov	(%rsi), %rcx	/* Both loads precede both stores.  */
+	mov	%rdx, 3(%rdi)
+	mov	%rcx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_138bytes):	/* Copy 138 bytes from (%rsi) to (%rdi), top down.  */
+	lddqu	122(%rsi), %xmm0	/* Each step moves 16 bytes, then falls through.  */
+	movdqu	%xmm0, 122(%rdi)
+L(bwd_write_122bytes):	/* Later labels are entries for smaller sizes.  */
+	lddqu	106(%rsi), %xmm0
+	movdqu	%xmm0, 106(%rdi)
+L(bwd_write_106bytes):
+	lddqu	90(%rsi), %xmm0
+	movdqu	%xmm0, 90(%rdi)
+L(bwd_write_90bytes):
+	lddqu	74(%rsi), %xmm0
+	movdqu	%xmm0, 74(%rdi)
+L(bwd_write_74bytes):
+	lddqu	58(%rsi), %xmm0
+	movdqu	%xmm0, 58(%rdi)
+L(bwd_write_58bytes):
+	lddqu	42(%rsi), %xmm0
+	movdqu	%xmm0, 42(%rdi)
+L(bwd_write_42bytes):
+	lddqu	26(%rsi), %xmm0
+	movdqu	%xmm0, 26(%rdi)
+L(bwd_write_26bytes):	/* Tail: two overlapping 16-byte copies.  */
+	lddqu	10(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1	/* Both loads precede both stores.  */
+	movdqu	%xmm0, 10(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_10bytes):	/* Copy 10 bytes: two overlapping 8-byte moves.  */
+	mov	2(%rsi), %rdx
+	mov	(%rsi), %rcx	/* Both loads precede both stores.  */
+	mov	%rdx, 2(%rdi)
+	mov	%rcx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_137bytes):	/* Copy 137 bytes from (%rsi) to (%rdi), top down.  */
+	lddqu	121(%rsi), %xmm0	/* Each step moves 16 bytes, then falls through.  */
+	movdqu	%xmm0, 121(%rdi)
+L(bwd_write_121bytes):	/* Later labels are entries for smaller sizes.  */
+	lddqu	105(%rsi), %xmm0
+	movdqu	%xmm0, 105(%rdi)
+L(bwd_write_105bytes):
+	lddqu	89(%rsi), %xmm0
+	movdqu	%xmm0, 89(%rdi)
+L(bwd_write_89bytes):
+	lddqu	73(%rsi), %xmm0
+	movdqu	%xmm0, 73(%rdi)
+L(bwd_write_73bytes):
+	lddqu	57(%rsi), %xmm0
+	movdqu	%xmm0, 57(%rdi)
+L(bwd_write_57bytes):
+	lddqu	41(%rsi), %xmm0
+	movdqu	%xmm0, 41(%rdi)
+L(bwd_write_41bytes):
+	lddqu	25(%rsi), %xmm0
+	movdqu	%xmm0, 25(%rdi)
+L(bwd_write_25bytes):	/* Tail: two overlapping 16-byte copies.  */
+	lddqu	9(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1	/* Both loads precede both stores.  */
+	movdqu	%xmm0, 9(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_9bytes):	/* Copy 9 bytes: two overlapping 8-byte moves.  */
+	mov	1(%rsi), %rdx
+	mov	(%rsi), %rcx	/* Both loads precede both stores.  */
+	mov	%rdx, 1(%rdi)
+	mov	%rcx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_136bytes):	/* Copy 136 bytes from (%rsi) to (%rdi), top down.  */
+	lddqu	120(%rsi), %xmm0	/* Each step moves 16 bytes, then falls through.  */
+	movdqu	%xmm0, 120(%rdi)
+L(bwd_write_120bytes):	/* Later labels are entries for smaller sizes.  */
+	lddqu	104(%rsi), %xmm0
+	movdqu	%xmm0, 104(%rdi)
+L(bwd_write_104bytes):
+	lddqu	88(%rsi), %xmm0
+	movdqu	%xmm0, 88(%rdi)
+L(bwd_write_88bytes):
+	lddqu	72(%rsi), %xmm0
+	movdqu	%xmm0, 72(%rdi)
+L(bwd_write_72bytes):
+	lddqu	56(%rsi), %xmm0
+	movdqu	%xmm0, 56(%rdi)
+L(bwd_write_56bytes):
+	lddqu	40(%rsi), %xmm0
+	movdqu	%xmm0, 40(%rdi)
+L(bwd_write_40bytes):
+	lddqu	24(%rsi), %xmm0
+	movdqu	%xmm0, 24(%rdi)
+L(bwd_write_24bytes):	/* Tail: two overlapping 16-byte copies.  */
+	lddqu	8(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1	/* Both loads precede both stores.  */
+	movdqu	%xmm0, 8(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_8bytes):	/* Copy 8 bytes from (%rsi) to (%rdi) in one move.  */
+	mov	(%rsi), %rdx
+	mov	%rdx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_135bytes):	/* Copy 135 bytes from (%rsi) to (%rdi), top down.  */
+	lddqu	119(%rsi), %xmm0	/* Each step moves 16 bytes, then falls through.  */
+	movdqu	%xmm0, 119(%rdi)
+L(bwd_write_119bytes):	/* Later labels are entries for smaller sizes.  */
+	lddqu	103(%rsi), %xmm0
+	movdqu	%xmm0, 103(%rdi)
+L(bwd_write_103bytes):
+	lddqu	87(%rsi), %xmm0
+	movdqu	%xmm0, 87(%rdi)
+L(bwd_write_87bytes):
+	lddqu	71(%rsi), %xmm0
+	movdqu	%xmm0, 71(%rdi)
+L(bwd_write_71bytes):
+	lddqu	55(%rsi), %xmm0
+	movdqu	%xmm0, 55(%rdi)
+L(bwd_write_55bytes):
+	lddqu	39(%rsi), %xmm0
+	movdqu	%xmm0, 39(%rdi)
+L(bwd_write_39bytes):
+	lddqu	23(%rsi), %xmm0
+	movdqu	%xmm0, 23(%rdi)
+L(bwd_write_23bytes):	/* Tail: two overlapping 16-byte copies.  */
+	lddqu	7(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1	/* Both loads precede both stores.  */
+	movdqu	%xmm0, 7(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_7bytes):	/* Copy 7 bytes: two overlapping 4-byte moves.  */
+	mov	3(%rsi), %edx
+	mov	(%rsi), %ecx	/* Both loads precede both stores.  */
+	mov	%edx, 3(%rdi)
+	mov	%ecx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_134bytes):	/* Copy 134 bytes from (%rsi) to (%rdi), top down.  */
+	lddqu	118(%rsi), %xmm0	/* Each step moves 16 bytes, then falls through.  */
+	movdqu	%xmm0, 118(%rdi)
+L(bwd_write_118bytes):	/* Later labels are entries for smaller sizes.  */
+	lddqu	102(%rsi), %xmm0
+	movdqu	%xmm0, 102(%rdi)
+L(bwd_write_102bytes):
+	lddqu	86(%rsi), %xmm0
+	movdqu	%xmm0, 86(%rdi)
+L(bwd_write_86bytes):
+	lddqu	70(%rsi), %xmm0
+	movdqu	%xmm0, 70(%rdi)
+L(bwd_write_70bytes):
+	lddqu	54(%rsi), %xmm0
+	movdqu	%xmm0, 54(%rdi)
+L(bwd_write_54bytes):
+	lddqu	38(%rsi), %xmm0
+	movdqu	%xmm0, 38(%rdi)
+L(bwd_write_38bytes):
+	lddqu	22(%rsi), %xmm0
+	movdqu	%xmm0, 22(%rdi)
+L(bwd_write_22bytes):	/* Tail: two overlapping 16-byte copies.  */
+	lddqu	6(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1	/* Both loads precede both stores.  */
+	movdqu	%xmm0, 6(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_6bytes):	/* Copy 6 bytes: two overlapping 4-byte moves.  */
+	mov	2(%rsi), %edx
+	mov	(%rsi), %ecx	/* Both loads precede both stores.  */
+	mov	%edx, 2(%rdi)
+	mov	%ecx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_133bytes):	/* Copy 133 bytes from (%rsi) to (%rdi), top down.  */
+	lddqu	117(%rsi), %xmm0	/* Each step moves 16 bytes, then falls through.  */
+	movdqu	%xmm0, 117(%rdi)
+L(bwd_write_117bytes):	/* Later labels are entries for smaller sizes.  */
+	lddqu	101(%rsi), %xmm0
+	movdqu	%xmm0, 101(%rdi)
+L(bwd_write_101bytes):
+	lddqu	85(%rsi), %xmm0
+	movdqu	%xmm0, 85(%rdi)
+L(bwd_write_85bytes):
+	lddqu	69(%rsi), %xmm0
+	movdqu	%xmm0, 69(%rdi)
+L(bwd_write_69bytes):
+	lddqu	53(%rsi), %xmm0
+	movdqu	%xmm0, 53(%rdi)
+L(bwd_write_53bytes):
+	lddqu	37(%rsi), %xmm0
+	movdqu	%xmm0, 37(%rdi)
+L(bwd_write_37bytes):
+	lddqu	21(%rsi), %xmm0
+	movdqu	%xmm0, 21(%rdi)
+L(bwd_write_21bytes):	/* Tail: two overlapping 16-byte copies.  */
+	lddqu	5(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1	/* Both loads precede both stores.  */
+	movdqu	%xmm0, 5(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_5bytes):	/* Copy 5 bytes: two overlapping 4-byte moves.  */
+	mov	1(%rsi), %edx
+	mov	(%rsi), %ecx	/* Both loads precede both stores.  */
+	mov	%edx, 1(%rdi)
+	mov	%ecx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_132bytes):	/* Copy 132 bytes from (%rsi) to (%rdi), top down.  */
+	lddqu	116(%rsi), %xmm0	/* Each step moves 16 bytes, then falls through.  */
+	movdqu	%xmm0, 116(%rdi)
+L(bwd_write_116bytes):	/* Later labels are entries for smaller sizes.  */
+	lddqu	100(%rsi), %xmm0
+	movdqu	%xmm0, 100(%rdi)
+L(bwd_write_100bytes):
+	lddqu	84(%rsi), %xmm0
+	movdqu	%xmm0, 84(%rdi)
+L(bwd_write_84bytes):
+	lddqu	68(%rsi), %xmm0
+	movdqu	%xmm0, 68(%rdi)
+L(bwd_write_68bytes):
+	lddqu	52(%rsi), %xmm0
+	movdqu	%xmm0, 52(%rdi)
+L(bwd_write_52bytes):
+	lddqu	36(%rsi), %xmm0
+	movdqu	%xmm0, 36(%rdi)
+L(bwd_write_36bytes):
+	lddqu	20(%rsi), %xmm0
+	movdqu	%xmm0, 20(%rdi)
+L(bwd_write_20bytes):	/* Tail: two overlapping 16-byte copies.  */
+	lddqu	4(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1	/* Both loads precede both stores.  */
+	movdqu	%xmm0, 4(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_4bytes):	/* Copy 4 bytes from (%rsi) to (%rdi) in one move.  */
+	mov	(%rsi), %edx
+	mov	%edx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_131bytes):	/* Copy 131 bytes from (%rsi) to (%rdi), top down.  */
+	lddqu	115(%rsi), %xmm0	/* Each step moves 16 bytes, then falls through.  */
+	movdqu	%xmm0, 115(%rdi)
+L(bwd_write_115bytes):	/* Later labels are entries for smaller sizes.  */
+	lddqu	99(%rsi), %xmm0
+	movdqu	%xmm0, 99(%rdi)
+L(bwd_write_99bytes):
+	lddqu	83(%rsi), %xmm0
+	movdqu	%xmm0, 83(%rdi)
+L(bwd_write_83bytes):
+	lddqu	67(%rsi), %xmm0
+	movdqu	%xmm0, 67(%rdi)
+L(bwd_write_67bytes):
+	lddqu	51(%rsi), %xmm0
+	movdqu	%xmm0, 51(%rdi)
+L(bwd_write_51bytes):
+	lddqu	35(%rsi), %xmm0
+	movdqu	%xmm0, 35(%rdi)
+L(bwd_write_35bytes):
+	lddqu	19(%rsi), %xmm0
+	movdqu	%xmm0, 19(%rdi)
+L(bwd_write_19bytes):	/* Tail: two overlapping 16-byte copies.  */
+	lddqu	3(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1	/* Both loads precede both stores.  */
+	movdqu	%xmm0, 3(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_3bytes):	/* Copy 3 bytes: two overlapping 2-byte moves.  */
+	mov	1(%rsi), %dx
+	mov	(%rsi), %cx	/* Both loads precede both stores.  */
+	mov	%dx, 1(%rdi)
+	mov	%cx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_130bytes):	/* Copy 130 bytes from (%rsi) to (%rdi), top down.  */
+	lddqu	114(%rsi), %xmm0	/* Each step moves 16 bytes, then falls through.  */
+	movdqu	%xmm0, 114(%rdi)
+L(bwd_write_114bytes):	/* Later labels are entries for smaller sizes.  */
+	lddqu	98(%rsi), %xmm0
+	movdqu	%xmm0, 98(%rdi)
+L(bwd_write_98bytes):
+	lddqu	82(%rsi), %xmm0
+	movdqu	%xmm0, 82(%rdi)
+L(bwd_write_82bytes):
+	lddqu	66(%rsi), %xmm0
+	movdqu	%xmm0, 66(%rdi)
+L(bwd_write_66bytes):
+	lddqu	50(%rsi), %xmm0
+	movdqu	%xmm0, 50(%rdi)
+L(bwd_write_50bytes):
+	lddqu	34(%rsi), %xmm0
+	movdqu	%xmm0, 34(%rdi)
+L(bwd_write_34bytes):
+	lddqu	18(%rsi), %xmm0
+	movdqu	%xmm0, 18(%rdi)
+L(bwd_write_18bytes):	/* Tail: two overlapping 16-byte copies.  */
+	lddqu	2(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1	/* Both loads precede both stores.  */
+	movdqu	%xmm0, 2(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_2bytes):	/* Copy 2 bytes from (%rsi) to (%rdi).  */
+	movzwl	(%rsi), %edx	/* Zero-extending 16-bit load.  */
+	mov	%dx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_129bytes):	/* Copy 129 bytes from (%rsi) to (%rdi), top down.  */
+	lddqu	113(%rsi), %xmm0	/* Each step moves 16 bytes, then falls through.  */
+	movdqu	%xmm0, 113(%rdi)
+L(bwd_write_113bytes):	/* Later labels are entries for smaller sizes.  */
+	lddqu	97(%rsi), %xmm0
+	movdqu	%xmm0, 97(%rdi)
+L(bwd_write_97bytes):
+	lddqu	81(%rsi), %xmm0
+	movdqu	%xmm0, 81(%rdi)
+L(bwd_write_81bytes):
+	lddqu	65(%rsi), %xmm0
+	movdqu	%xmm0, 65(%rdi)
+L(bwd_write_65bytes):
+	lddqu	49(%rsi), %xmm0
+	movdqu	%xmm0, 49(%rdi)
+L(bwd_write_49bytes):
+	lddqu	33(%rsi), %xmm0
+	movdqu	%xmm0, 33(%rdi)
+L(bwd_write_33bytes):
+	lddqu	17(%rsi), %xmm0
+	movdqu	%xmm0, 17(%rdi)
+L(bwd_write_17bytes):	/* Tail: two overlapping 16-byte copies.  */
+	lddqu	1(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1	/* Both loads precede both stores.  */
+	movdqu	%xmm0, 1(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_1bytes):	/* Copy the single byte at (%rsi) to (%rdi).  */
+	movzbl	(%rsi), %edx	/* Zero-extending 8-bit load.  */
+	mov	%dl, (%rdi)
+	ret
+
+END (MEMCPY)
+
+	.section .rodata.ssse3,"a",@progbits
+	.p2align 3
+L(table_144_bytes_bwd):	/* Entry N (0..143) refers to L(bwd_write_Nbytes); JMPTBL presumably encodes it relative to this label.  */
+	.int	JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
+
+	.p2align 3
+L(table_144_bytes_fwd):	/* Entry N (0..143) refers to L(fwd_write_Nbytes); JMPTBL presumably encodes it relative to this label.  */
+	.int	JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
+
+	.p2align 3
+L(shl_table_fwd):	/* Entry K (0..15) refers to L(shl_K); JMPTBL presumably encodes it relative to this label.  */
+	.int	JMPTBL (L(shl_0), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_1), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_2), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_3), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_4), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_5), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_6), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_7), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_8), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_9), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_10), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_11), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_12), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_13), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_14), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_15), L(shl_table_fwd))
+
+	.p2align 3
+L(shl_table_bwd):	/* Entry K (0..15) refers to L(shl_K_bwd); JMPTBL presumably encodes it relative to this label.  */
+	.int	JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
+
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3.S
new file mode 100644
index 0000000000..f3ea52a46c
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -0,0 +1,3150 @@
+/* memcpy with SSSE3
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+        || defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+/* Default symbol names for the entry points defined below.  They are
+   only used when MEMCPY has not already been defined at this point.  */
+#ifndef MEMCPY
+# define MEMCPY		__memcpy_ssse3
+# define MEMCPY_CHK	__memcpy_chk_ssse3
+# define MEMPCPY	__mempcpy_ssse3
+# define MEMPCPY_CHK	__mempcpy_chk_ssse3
+#endif
+
+/* Jump table entry: the offset of label I relative to base label B.  */
+#define JMPTBL(I, B)	I - B
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   relative offsets.  INDEX is a register that contains the index into
+   the jump table.  SCALE is the scale of INDEX.
+
+   The macro overwrites %r11 with the address of TABLE and overwrites
+   INDEX with the absolute address of the selected target, so the
+   target is entered with its own address in INDEX.  The table entry is
+   read as a sign-extended 32-bit value.  The trailing ud2 follows the
+   indirect jump and is never reached architecturally.  */
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+  lea		TABLE(%rip), %r11;				\
+  movslq	(%r11, INDEX, SCALE), INDEX;			\
+  lea		(%r11, INDEX), INDEX;				\
+  jmp		*INDEX;						\
+  ud2
+
+	.section .text.ssse3,"ax",@progbits
+/* mempcpy entry points, built only when this file is assembled as
+   neither mempcpy nor memmove.  */
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+/* Checked variant: jumps to __chk_fail when %rcx is below %rdx
+   (unsigned compare).  No other instruction follows inside this
+   entry, so execution presumably continues into the MEMPCPY entry
+   right after it; ENTRY/END are defined outside this view.  */
+ENTRY (MEMPCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+/* Sets %rax to %rdi + %rdx, then joins the shared copy code at
+   L(start).  */
+ENTRY (MEMPCPY)
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(start)
+END (MEMPCPY)
+#endif
+
+/* Checked variant of MEMCPY, omitted when USE_AS_BCOPY is defined.
+   Jumps to __chk_fail when %rcx is below %rdx (unsigned compare).  No
+   other instruction follows inside this entry, so execution presumably
+   continues into the MEMCPY entry right after it; ENTRY/END are defined
+   outside this view.  */
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
+/* Main entry point.  Register use below: %rdi is the destination
+   pointer, %rsi the source pointer, %rdx the byte count and %rax the
+   value being returned.  */
+ENTRY (MEMCPY)
+	/* Return value: the destination, or with USE_AS_MEMPCPY the
+	   destination plus the count.  */
+	mov	%rdi, %rax
+#ifdef USE_AS_MEMPCPY
+	add	%rdx, %rax
+#endif
+
+#ifdef USE_AS_MEMMOVE
+	/* memmove direction choice: destination below source copies
+	   forward, equal pointers go to L(write_0bytes), counts of at
+	   most 79 use the small-size path below, and everything else
+	   copies backward.  */
+	cmp	%rsi, %rdi
+	jb	L(copy_forward)
+	je	L(write_0bytes)
+	cmp	$79, %rdx
+	jbe	L(copy_forward)
+	jmp	L(copy_backward)
+L(copy_forward):
+#endif
+L(start):
+	/* Counts above 79 take the bulk path.  Smaller counts index
+	   L(table_less_80bytes) directly by the count; both pointers are
+	   first advanced by the count, so the target routine is entered
+	   with them pointing just past the data.  */
+	cmp	$79, %rdx
+	lea     L(table_less_80bytes)(%rip), %r11
+	ja	L(80bytesormore)
+	movslq	(%r11, %rdx, 4), %r9
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	add	%r11, %r9
+	jmp	*%r9
+	ud2
+
+/* Bulk path for counts above 79.  Saves the first 16 source bytes in
+   %xmm0 and the original destination in %r8, rounds the destination up
+   to the next 16-byte boundary, adjusts source and count to match, and
+   then picks a forward routine.  */
+	.p2align 4
+L(80bytesormore):
+#ifndef USE_AS_MEMMOVE
+	/* Signed compare of the low bytes of the two pointers: when the
+	   low byte of %rsi is not greater than that of %rdi the copy is
+	   done by L(copy_backward).  NOTE(review): the rationale for this
+	   choice is not stated in the code.  */
+	cmp	%dil, %sil
+	jle	L(copy_backward)
+#endif
+
+	movdqu	(%rsi), %xmm0
+	mov	%rdi, %rcx
+	/* %rdi = next 16-byte boundary strictly above the destination.  */
+	and	$-16, %rdi
+	add	$16, %rdi
+	mov	%rcx, %r8
+	/* %rcx = original destination - aligned destination (negative);
+	   shrink the count and advance the source by the same distance.  */
+	sub	%rdi, %rcx
+	add	%rcx, %rdx
+	sub	%rcx, %rsi
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %RCX_LP
+#else
+	mov	__x86_shared_cache_size_half(%rip), %RCX_LP
+#endif
+	/* Counts above the shared-cache threshold go to
+	   L(large_page_fwd).  The mov between cmp and ja does not change
+	   the flags.  */
+	cmp	%rcx, %rdx
+	mov	%rsi, %r9
+	ja	L(large_page_fwd)
+	/* %r9 = source offset within its 16-byte block; zero means source
+	   and destination are now both aligned.  */
+	and	$0xf, %r9
+	jz	L(shl_0)
+	/* The routines reached through the table compare the count with
+	   this data-cache threshold in %rcx.  */
+#ifdef DATA_CACHE_SIZE_HALF
+	mov	$DATA_CACHE_SIZE_HALF, %RCX_LP
+#else
+	mov	__x86_data_cache_size_half(%rip), %RCX_LP
+#endif
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
+
+/* Backward bulk path.  Saves the last 16 source bytes in %xmm0 and the
+   address of the last 16 destination bytes in %r8, moves both pointers
+   to the end of their buffers, rounds the destination end down to a
+   16-byte boundary, adjusts source and count to match, and then picks
+   a backward routine.  */
+	.p2align 4
+L(copy_backward):
+	movdqu	-16(%rsi, %rdx), %xmm0
+	add	%rdx, %rsi
+	lea	-16(%rdi, %rdx), %r8
+	add	%rdx, %rdi
+
+	/* %rcx = low four bits of the destination end; the xor clears
+	   them in %rdi, and count and source drop by the same amount.  */
+	mov	%rdi, %rcx
+	and	$0xf, %rcx
+	xor	%rcx, %rdi
+	sub	%rcx, %rdx
+	sub	%rcx, %rsi
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %RCX_LP
+#else
+	mov	__x86_shared_cache_size_half(%rip), %RCX_LP
+#endif
+
+	/* Counts above the shared-cache threshold go to
+	   L(large_page_bwd).  The mov between cmp and ja does not change
+	   the flags.  */
+	cmp	%rcx, %rdx
+	mov	%rsi, %r9
+	ja	L(large_page_bwd)
+	/* %r9 = source offset within its 16-byte block; zero means source
+	   and destination are now both aligned.  */
+	and	$0xf, %r9
+	jz	L(shl_0_bwd)
+	/* The routines reached through the table compare the count with
+	   this data-cache threshold in %rcx.  */
+#ifdef DATA_CACHE_SIZE_HALF
+	mov	$DATA_CACHE_SIZE_HALF, %RCX_LP
+#else
+	mov	__x86_data_cache_size_half(%rip), %RCX_LP
+#endif
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
+
+/* Forward copy with source and destination both 16-byte aligned.
+   Copies one aligned 16-byte block, stores the saved head %xmm0 to the
+   original destination at (%r8), hands counts above 128 to
+   L(shl_0_gobble), copies a further 64 bytes when at least 64 remain,
+   and finishes the tail through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_0):
+	sub	$16, %rdx
+	movdqa	(%rsi), %xmm1
+	add	$16, %rsi
+	movdqa	%xmm1, (%rdi)
+	add	$16, %rdi
+	/* The movdqu between cmp and ja does not change the flags.  */
+	cmp	$128, %rdx
+	movdqu	%xmm0, (%r8)
+	ja	L(shl_0_gobble)
+	cmp	$64, %rdx
+	jb	L(shl_0_less_64bytes)
+	movaps	(%rsi), %xmm4
+	movaps	16(%rsi), %xmm1
+	movaps	32(%rsi), %xmm2
+	movaps	48(%rsi), %xmm3
+	movaps	%xmm4, (%rdi)
+	movaps	%xmm1, 16(%rdi)
+	movaps	%xmm2, 32(%rdi)
+	movaps	%xmm3, 48(%rdi)
+	sub	$64, %rdx
+	add	$64, %rsi
+	add	$64, %rdi
+L(shl_0_less_64bytes):
+	/* Point both pointers just past the remaining bytes and dispatch
+	   on the remaining count.  */
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+/* Aligned forward copy of more than 128 bytes.  Counts at or above the
+   data-cache threshold are handled by L(shl_0_gobble_mem_loop); smaller
+   ones by the loop below, which moves 128 bytes per pass.  */
+	.p2align 4
+L(shl_0_gobble):
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %RDX_LP
+#else
+	cmp	__x86_data_cache_size_half(%rip), %RDX_LP
+#endif
+	/* Bias the count by -128; lea keeps the flags of the cmp above for
+	   the jae.  */
+	lea	-128(%rdx), %rdx
+	jae	L(shl_0_gobble_mem_loop)
+L(shl_0_gobble_cache_loop):
+	movdqa	(%rsi), %xmm4
+	movaps	0x10(%rsi), %xmm1
+	movaps	0x20(%rsi), %xmm2
+	movaps	0x30(%rsi), %xmm3
+
+	movdqa	%xmm4, (%rdi)
+	movaps	%xmm1, 0x10(%rdi)
+	movaps	%xmm2, 0x20(%rdi)
+	movaps	%xmm3, 0x30(%rdi)
+
+	/* The jae at the bottom of the loop tests the carry from this
+	   sub; the moves and lea in between do not change the flags.  */
+	sub	$128, %rdx
+	movaps	0x40(%rsi), %xmm4
+	movaps	0x50(%rsi), %xmm5
+	movaps	0x60(%rsi), %xmm6
+	movaps	0x70(%rsi), %xmm7
+	lea	0x80(%rsi), %rsi
+	movaps	%xmm4, 0x40(%rdi)
+	movaps	%xmm5, 0x50(%rdi)
+	movaps	%xmm6, 0x60(%rdi)
+	movaps	%xmm7, 0x70(%rdi)
+	lea	0x80(%rdi), %rdi
+
+	jae	L(shl_0_gobble_cache_loop)
+	/* %rdx is negative here.  Below -0x40 means fewer than 64 bytes
+	   are left once the 128-byte bias is removed by the lea.  */
+	cmp	$-0x40, %rdx
+	lea	0x80(%rdx), %rdx
+	jl	L(shl_0_cache_less_64bytes)
+
+	movdqa	(%rsi), %xmm4
+	sub	$0x40, %rdx
+	movdqa	0x10(%rsi), %xmm1
+
+	movdqa	%xmm4, (%rdi)
+	movdqa	%xmm1, 0x10(%rdi)
+
+	movdqa	0x20(%rsi), %xmm4
+	movdqa	0x30(%rsi), %xmm1
+	add	$0x40, %rsi
+
+	movdqa	%xmm4, 0x20(%rdi)
+	movdqa	%xmm1, 0x30(%rdi)
+	add	$0x40, %rdi
+L(shl_0_cache_less_64bytes):
+	/* Point both pointers just past the remaining bytes and dispatch
+	   on the remaining count.  */
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+/* Aligned forward copy for counts at or above the data-cache
+   threshold.  Entered with the count already biased by -128.  Each
+   pass prefetches 0x1c0 and 0x280 bytes ahead of the source and moves
+   128 bytes; the remainder is reduced in 64- and 32-byte steps before
+   the final dispatch through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_0_gobble_mem_loop):
+	prefetcht0 0x1c0(%rsi)
+	prefetcht0 0x280(%rsi)
+
+	movdqa	(%rsi), %xmm0
+	movdqa	0x10(%rsi), %xmm1
+	movdqa	0x20(%rsi), %xmm2
+	movdqa	0x30(%rsi), %xmm3
+	movdqa	0x40(%rsi), %xmm4
+	movdqa	0x50(%rsi), %xmm5
+	movdqa	0x60(%rsi), %xmm6
+	movdqa	0x70(%rsi), %xmm7
+	lea	0x80(%rsi), %rsi
+	/* The jae at the bottom of the loop tests the carry from this
+	   sub; the moves and lea in between do not change the flags.  */
+	sub	$0x80, %rdx
+	movdqa	%xmm0, (%rdi)
+	movdqa	%xmm1, 0x10(%rdi)
+	movdqa	%xmm2, 0x20(%rdi)
+	movdqa	%xmm3, 0x30(%rdi)
+	movdqa	%xmm4, 0x40(%rdi)
+	movdqa	%xmm5, 0x50(%rdi)
+	movdqa	%xmm6, 0x60(%rdi)
+	movdqa	%xmm7, 0x70(%rdi)
+	lea	0x80(%rdi), %rdi
+
+	jae	L(shl_0_gobble_mem_loop)
+	/* %rdx is negative here.  Below -0x40 means fewer than 64 bytes
+	   are left once the 128-byte bias is removed by the lea.  */
+	cmp	$-0x40, %rdx
+	lea	0x80(%rdx), %rdx
+	jl	L(shl_0_mem_less_64bytes)
+
+	movdqa	(%rsi), %xmm0
+	sub	$0x40, %rdx
+	movdqa	0x10(%rsi), %xmm1
+
+	movdqa	%xmm0, (%rdi)
+	movdqa	%xmm1, 0x10(%rdi)
+
+	movdqa	0x20(%rsi), %xmm0
+	movdqa	0x30(%rsi), %xmm1
+	add	$0x40, %rsi
+
+	movdqa	%xmm0, 0x20(%rdi)
+	movdqa	%xmm1, 0x30(%rdi)
+	add	$0x40, %rdi
+L(shl_0_mem_less_64bytes):
+	/* Copy one more 32-byte piece when at least 32 bytes remain.  */
+	cmp	$0x20, %rdx
+	jb	L(shl_0_mem_less_32bytes)
+	movdqa	(%rsi), %xmm0
+	sub	$0x20, %rdx
+	movdqa	0x10(%rsi), %xmm1
+	add	$0x20, %rsi
+	movdqa	%xmm0, (%rdi)
+	movdqa	%xmm1, 0x10(%rdi)
+	add	$0x20, %rdi
+L(shl_0_mem_less_32bytes):
+	/* Point both pointers just past the remaining bytes and dispatch
+	   on the remaining count.  */
+	add	%rdx, %rdi
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+/* Backward copy with source and destination ends both 16-byte
+   aligned.  Copies the 16-byte block just below the pointers, stores
+   the saved tail %xmm0 to (%r8), hands counts above 0x80 to
+   L(shl_0_gobble_bwd), copies a further 64 bytes when at least 64
+   remain, and finishes through L(table_less_80bytes).  Unlike the
+   forward variant, the pointers are not adjusted before the dispatch.  */
+	.p2align 4
+L(shl_0_bwd):
+	sub	$16, %rdx
+	movdqa	-0x10(%rsi), %xmm1
+	sub	$16, %rsi
+	movdqa	%xmm1, -0x10(%rdi)
+	sub	$16, %rdi
+	/* The movdqu between cmp and ja does not change the flags.  */
+	cmp	$0x80, %rdx
+	movdqu	%xmm0, (%r8)
+	ja	L(shl_0_gobble_bwd)
+	cmp	$64, %rdx
+	jb	L(shl_0_less_64bytes_bwd)
+	movaps	-0x10(%rsi), %xmm0
+	movaps	-0x20(%rsi), %xmm1
+	movaps	-0x30(%rsi), %xmm2
+	movaps	-0x40(%rsi), %xmm3
+	movaps	%xmm0, -0x10(%rdi)
+	movaps	%xmm1, -0x20(%rdi)
+	movaps	%xmm2, -0x30(%rdi)
+	movaps	%xmm3, -0x40(%rdi)
+	sub	$64, %rdx
+	sub	$0x40, %rsi
+	sub	$0x40, %rdi
+L(shl_0_less_64bytes_bwd):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+/* Aligned backward copy of more than 128 bytes.  Counts at or above
+   the data-cache threshold are handled by
+   L(shl_0_gobble_mem_bwd_loop); smaller ones by the loop below, which
+   moves 128 bytes per pass toward lower addresses.  */
+	.p2align 4
+L(shl_0_gobble_bwd):
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %RDX_LP
+#else
+	cmp	__x86_data_cache_size_half(%rip), %RDX_LP
+#endif
+	/* Bias the count by -128; lea keeps the flags of the cmp above for
+	   the jae.  */
+	lea	-128(%rdx), %rdx
+	jae	L(shl_0_gobble_mem_bwd_loop)
+L(shl_0_gobble_bwd_loop):
+	movdqa	-0x10(%rsi), %xmm0
+	movaps	-0x20(%rsi), %xmm1
+	movaps	-0x30(%rsi), %xmm2
+	movaps	-0x40(%rsi), %xmm3
+
+	movdqa	%xmm0, -0x10(%rdi)
+	movaps	%xmm1, -0x20(%rdi)
+	movaps	%xmm2, -0x30(%rdi)
+	movaps	%xmm3, -0x40(%rdi)
+
+	/* The jae at the bottom of the loop tests the carry from this
+	   sub; the moves and lea in between do not change the flags.  */
+	sub	$0x80, %rdx
+	movaps	-0x50(%rsi), %xmm4
+	movaps	-0x60(%rsi), %xmm5
+	movaps	-0x70(%rsi), %xmm6
+	movaps	-0x80(%rsi), %xmm7
+	lea	-0x80(%rsi), %rsi
+	movaps	%xmm4, -0x50(%rdi)
+	movaps	%xmm5, -0x60(%rdi)
+	movaps	%xmm6, -0x70(%rdi)
+	movaps	%xmm7, -0x80(%rdi)
+	lea	-0x80(%rdi), %rdi
+
+	jae	L(shl_0_gobble_bwd_loop)
+	/* %rdx is negative here.  Below -0x40 means fewer than 64 bytes
+	   are left once the 128-byte bias is removed by the lea.  */
+	cmp	$-0x40, %rdx
+	lea	0x80(%rdx), %rdx
+	jl	L(shl_0_gobble_bwd_less_64bytes)
+
+	movdqa	-0x10(%rsi), %xmm0
+	sub	$0x40, %rdx
+	movdqa	-0x20(%rsi), %xmm1
+
+	movdqa	%xmm0, -0x10(%rdi)
+	movdqa	%xmm1, -0x20(%rdi)
+
+	movdqa	-0x30(%rsi), %xmm0
+	movdqa	-0x40(%rsi), %xmm1
+	sub	$0x40, %rsi
+
+	movdqa	%xmm0, -0x30(%rdi)
+	movdqa	%xmm1, -0x40(%rdi)
+	sub	$0x40, %rdi
+L(shl_0_gobble_bwd_less_64bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+/* Aligned backward copy for counts at or above the data-cache
+   threshold.  Entered with the count already biased by -128.  Each
+   pass prefetches 0x1c0 and 0x280 bytes below the source and moves 128
+   bytes; the remainder is reduced in 64- and 32-byte steps before the
+   final dispatch through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_0_gobble_mem_bwd_loop):
+	prefetcht0 -0x1c0(%rsi)
+	prefetcht0 -0x280(%rsi)
+	movdqa	-0x10(%rsi), %xmm0
+	movdqa	-0x20(%rsi), %xmm1
+	movdqa	-0x30(%rsi), %xmm2
+	movdqa	-0x40(%rsi), %xmm3
+	movdqa	-0x50(%rsi), %xmm4
+	movdqa	-0x60(%rsi), %xmm5
+	movdqa	-0x70(%rsi), %xmm6
+	movdqa	-0x80(%rsi), %xmm7
+	lea	-0x80(%rsi), %rsi
+	/* The jae at the bottom of the loop tests the carry from this
+	   sub; the moves and lea in between do not change the flags.  */
+	sub	$0x80, %rdx
+	movdqa	%xmm0, -0x10(%rdi)
+	movdqa	%xmm1, -0x20(%rdi)
+	movdqa	%xmm2, -0x30(%rdi)
+	movdqa	%xmm3, -0x40(%rdi)
+	movdqa	%xmm4, -0x50(%rdi)
+	movdqa	%xmm5, -0x60(%rdi)
+	movdqa	%xmm6, -0x70(%rdi)
+	movdqa	%xmm7, -0x80(%rdi)
+	lea	-0x80(%rdi), %rdi
+
+	jae	L(shl_0_gobble_mem_bwd_loop)
+	/* %rdx is negative here.  Below -0x40 means fewer than 64 bytes
+	   are left once the 128-byte bias is removed by the lea.  */
+	cmp	$-0x40, %rdx
+	lea	0x80(%rdx), %rdx
+	jl	L(shl_0_mem_bwd_less_64bytes)
+
+	movdqa	-0x10(%rsi), %xmm0
+	sub	$0x40, %rdx
+	movdqa	-0x20(%rsi), %xmm1
+
+	movdqa	%xmm0, -0x10(%rdi)
+	movdqa	%xmm1, -0x20(%rdi)
+
+	movdqa	-0x30(%rsi), %xmm0
+	movdqa	-0x40(%rsi), %xmm1
+	sub	$0x40, %rsi
+
+	movdqa	%xmm0, -0x30(%rdi)
+	movdqa	%xmm1, -0x40(%rdi)
+	sub	$0x40, %rdi
+L(shl_0_mem_bwd_less_64bytes):
+	/* Copy one more 32-byte piece when at least 32 bytes remain.  */
+	cmp	$0x20, %rdx
+	jb	L(shl_0_mem_bwd_less_32bytes)
+	movdqa	-0x10(%rsi), %xmm0
+	sub	$0x20, %rdx
+	movdqa	-0x20(%rsi), %xmm1
+	sub	$0x20, %rsi
+	movdqa	%xmm0, -0x10(%rdi)
+	movdqa	%xmm1, -0x20(%rdi)
+	sub	$0x20, %rdi
+L(shl_0_mem_bwd_less_32bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+/* Forward copy for a source 1 byte past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 1).  Expects %r9 to hold
+   the address of L(shl_1), as left by BRANCH_TO_JMPTBL_ENTRY, and %rcx
+   to hold the threshold loaded before the dispatch.  Each pass reads
+   four aligned 16-byte blocks, realigns them with palignr $1 and
+   stores 64 aligned bytes; %xmm1 carries the last block read into the
+   next pass.  On exit %xmm0 is stored to (%r8) and the tail is handled
+   through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_1):
+	/* %r9 = L(shl_1_loop_L1), the loop entry without the prefetch.  */
+	lea	(L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x01(%rsi), %xmm1
+	jb	L(L1_fwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
+L(L1_fwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_1_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_1_loop_L1):
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$64, %rdx
+	movaps	0x0f(%rsi), %xmm2
+	movaps	0x1f(%rsi), %xmm3
+	movaps	0x2f(%rsi), %xmm4
+	movaps	0x3f(%rsi), %xmm5
+	/* Keep an unshifted copy of the last block for the next pass.  */
+	movdqa	%xmm5, %xmm6
+	palignr	$1, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$1, %xmm3, %xmm4
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$1, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_1_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_1_end):
+	movaps	%xmm4, -0x20(%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Backward copy for a source end 1 byte past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 1).  Expects %r9 to hold
+   the address of L(shl_1_bwd), as left by BRANCH_TO_JMPTBL_ENTRY, and
+   %rcx to hold the threshold loaded before the dispatch.  Each pass
+   reads four aligned 16-byte blocks below %rsi, realigns them with
+   palignr $1 and stores 64 aligned bytes below %rdi; %xmm1 carries the
+   lowest block read into the next pass.  On exit %xmm0 is stored to
+   (%r8) and the rest is handled through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_1_bwd):
+	/* %r9 = L(shl_1_bwd_loop_L1), the entry without the prefetch.  */
+	lea	(L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x01(%rsi), %xmm1
+	jb	L(L1_bwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
+L(L1_bwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_1_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_1_bwd_loop_L1):
+	movaps	-0x11(%rsi), %xmm2
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$0x40, %rdx
+	movaps	-0x21(%rsi), %xmm3
+	movaps	-0x31(%rsi), %xmm4
+	movaps	-0x41(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$1, %xmm2, %xmm1
+	palignr	$1, %xmm3, %xmm2
+	palignr	$1, %xmm4, %xmm3
+	palignr	$1, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	/* Carry the lowest block read into the next pass.  */
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	/* %rdi has just dropped by 0x40, so 0x10(%rdi) and (%rdi) are the
+	   two lower 16-byte slots of this 64-byte group.  */
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_1_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_1_bwd_end):
+	movaps	%xmm4, (%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Forward copy for a source 2 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 2).  Expects %r9 to hold
+   the address of L(shl_2), as left by BRANCH_TO_JMPTBL_ENTRY, and %rcx
+   to hold the threshold loaded before the dispatch.  Each pass reads
+   four aligned 16-byte blocks, realigns them with palignr $2 and
+   stores 64 aligned bytes; %xmm1 carries the last block read into the
+   next pass.  On exit %xmm0 is stored to (%r8) and the tail is handled
+   through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_2):
+	/* %r9 = L(shl_2_loop_L1), the loop entry without the prefetch.  */
+	lea	(L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x02(%rsi), %xmm1
+	jb	L(L2_fwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
+L(L2_fwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_2_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_2_loop_L1):
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$64, %rdx
+	movaps	0x0e(%rsi), %xmm2
+	movaps	0x1e(%rsi), %xmm3
+	movaps	0x2e(%rsi), %xmm4
+	movaps	0x3e(%rsi), %xmm5
+	/* Keep an unshifted copy of the last block for the next pass.  */
+	movdqa	%xmm5, %xmm6
+	palignr	$2, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$2, %xmm3, %xmm4
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$2, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_2_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_2_end):
+	movaps	%xmm4, -0x20(%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Backward copy for a source end 2 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 2).  Expects %r9 to hold
+   the address of L(shl_2_bwd), as left by BRANCH_TO_JMPTBL_ENTRY, and
+   %rcx to hold the threshold loaded before the dispatch.  Each pass
+   reads four aligned 16-byte blocks below %rsi, realigns them with
+   palignr $2 and stores 64 aligned bytes below %rdi; %xmm1 carries the
+   lowest block read into the next pass.  On exit %xmm0 is stored to
+   (%r8) and the rest is handled through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_2_bwd):
+	/* %r9 = L(shl_2_bwd_loop_L1), the entry without the prefetch.  */
+	lea	(L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x02(%rsi), %xmm1
+	jb	L(L2_bwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
+L(L2_bwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_2_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_2_bwd_loop_L1):
+	movaps	-0x12(%rsi), %xmm2
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$0x40, %rdx
+	movaps	-0x22(%rsi), %xmm3
+	movaps	-0x32(%rsi), %xmm4
+	movaps	-0x42(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$2, %xmm2, %xmm1
+	palignr	$2, %xmm3, %xmm2
+	palignr	$2, %xmm4, %xmm3
+	palignr	$2, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	/* Carry the lowest block read into the next pass.  */
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	/* %rdi has just dropped by 0x40, so 0x10(%rdi) and (%rdi) are the
+	   two lower 16-byte slots of this 64-byte group.  */
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_2_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_2_bwd_end):
+	movaps	%xmm4, (%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Forward copy for a source 3 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 3).  Expects %r9 to hold
+   the address of L(shl_3), as left by BRANCH_TO_JMPTBL_ENTRY, and %rcx
+   to hold the threshold loaded before the dispatch.  Each pass reads
+   four aligned 16-byte blocks, realigns them with palignr $3 and
+   stores 64 aligned bytes; %xmm1 carries the last block read into the
+   next pass.  On exit %xmm0 is stored to (%r8) and the tail is handled
+   through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_3):
+	/* %r9 = L(shl_3_loop_L1), the loop entry without the prefetch.  */
+	lea	(L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x03(%rsi), %xmm1
+	jb	L(L3_fwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
+L(L3_fwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_3_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_3_loop_L1):
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$64, %rdx
+	movaps	0x0d(%rsi), %xmm2
+	movaps	0x1d(%rsi), %xmm3
+	movaps	0x2d(%rsi), %xmm4
+	movaps	0x3d(%rsi), %xmm5
+	/* Keep an unshifted copy of the last block for the next pass.  */
+	movdqa	%xmm5, %xmm6
+	palignr	$3, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$3, %xmm3, %xmm4
+	palignr	$3, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$3, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_3_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_3_end):
+	movaps	%xmm4, -0x20(%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Backward copy for a source end 3 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 3).  Expects %r9 to hold
+   the address of L(shl_3_bwd), as left by BRANCH_TO_JMPTBL_ENTRY, and
+   %rcx to hold the threshold loaded before the dispatch.  Each pass
+   reads four aligned 16-byte blocks below %rsi, realigns them with
+   palignr $3 and stores 64 aligned bytes below %rdi; %xmm1 carries the
+   lowest block read into the next pass.  On exit %xmm0 is stored to
+   (%r8) and the rest is handled through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_3_bwd):
+	/* %r9 = L(shl_3_bwd_loop_L1), the entry without the prefetch.  */
+	lea	(L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x03(%rsi), %xmm1
+	jb	L(L3_bwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
+L(L3_bwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_3_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_3_bwd_loop_L1):
+	movaps	-0x13(%rsi), %xmm2
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$0x40, %rdx
+	movaps	-0x23(%rsi), %xmm3
+	movaps	-0x33(%rsi), %xmm4
+	movaps	-0x43(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$3, %xmm2, %xmm1
+	palignr	$3, %xmm3, %xmm2
+	palignr	$3, %xmm4, %xmm3
+	palignr	$3, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	/* Carry the lowest block read into the next pass.  */
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	/* %rdi has just dropped by 0x40, so 0x10(%rdi) and (%rdi) are the
+	   two lower 16-byte slots of this 64-byte group.  */
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_3_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_3_bwd_end):
+	movaps	%xmm4, (%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Forward copy for a source 4 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 4).  Expects %r9 to hold
+   the address of L(shl_4), as left by BRANCH_TO_JMPTBL_ENTRY, and %rcx
+   to hold the threshold loaded before the dispatch.  Each pass reads
+   four aligned 16-byte blocks, realigns them with palignr $4 and
+   stores 64 aligned bytes; %xmm1 carries the last block read into the
+   next pass.  On exit %xmm0 is stored to (%r8) and the tail is handled
+   through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_4):
+	/* %r9 = L(shl_4_loop_L1), the loop entry without the prefetch.  */
+	lea	(L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x04(%rsi), %xmm1
+	jb	L(L4_fwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
+L(L4_fwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_4_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_4_loop_L1):
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$64, %rdx
+	movaps	0x0c(%rsi), %xmm2
+	movaps	0x1c(%rsi), %xmm3
+	movaps	0x2c(%rsi), %xmm4
+	movaps	0x3c(%rsi), %xmm5
+	/* Keep an unshifted copy of the last block for the next pass.  */
+	movdqa	%xmm5, %xmm6
+	palignr	$4, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$4, %xmm3, %xmm4
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$4, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_4_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_4_end):
+	movaps	%xmm4, -0x20(%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Backward copy for a source end 4 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 4).  Expects %r9 to hold
+   the address of L(shl_4_bwd), as left by BRANCH_TO_JMPTBL_ENTRY, and
+   %rcx to hold the threshold loaded before the dispatch.  Each pass
+   reads four aligned 16-byte blocks below %rsi, realigns them with
+   palignr $4 and stores 64 aligned bytes below %rdi; %xmm1 carries the
+   lowest block read into the next pass.  On exit %xmm0 is stored to
+   (%r8) and the rest is handled through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_4_bwd):
+	/* %r9 = L(shl_4_bwd_loop_L1), the entry without the prefetch.  */
+	lea	(L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x04(%rsi), %xmm1
+	jb	L(L4_bwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
+L(L4_bwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_4_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_4_bwd_loop_L1):
+	movaps	-0x14(%rsi), %xmm2
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$0x40, %rdx
+	movaps	-0x24(%rsi), %xmm3
+	movaps	-0x34(%rsi), %xmm4
+	movaps	-0x44(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$4, %xmm2, %xmm1
+	palignr	$4, %xmm3, %xmm2
+	palignr	$4, %xmm4, %xmm3
+	palignr	$4, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	/* Carry the lowest block read into the next pass.  */
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	/* %rdi has just dropped by 0x40, so 0x10(%rdi) and (%rdi) are the
+	   two lower 16-byte slots of this 64-byte group.  */
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_4_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_4_bwd_end):
+	movaps	%xmm4, (%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Forward copy for a source 5 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 5).  Expects %r9 to hold
+   the address of L(shl_5), as left by BRANCH_TO_JMPTBL_ENTRY, and %rcx
+   to hold the threshold loaded before the dispatch.  Each pass reads
+   four aligned 16-byte blocks, realigns them with palignr $5 and
+   stores 64 aligned bytes; %xmm1 carries the last block read into the
+   next pass.  On exit %xmm0 is stored to (%r8) and the tail is handled
+   through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_5):
+	/* %r9 = L(shl_5_loop_L1), the loop entry without the prefetch.  */
+	lea	(L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x05(%rsi), %xmm1
+	jb	L(L5_fwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
+L(L5_fwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_5_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_5_loop_L1):
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$64, %rdx
+	movaps	0x0b(%rsi), %xmm2
+	movaps	0x1b(%rsi), %xmm3
+	movaps	0x2b(%rsi), %xmm4
+	movaps	0x3b(%rsi), %xmm5
+	/* Keep an unshifted copy of the last block for the next pass.  */
+	movdqa	%xmm5, %xmm6
+	palignr	$5, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$5, %xmm3, %xmm4
+	palignr	$5, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$5, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_5_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_5_end):
+	movaps	%xmm4, -0x20(%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Backward copy for a source end 5 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 5).  Expects %r9 to hold
+   the address of L(shl_5_bwd), as left by BRANCH_TO_JMPTBL_ENTRY, and
+   %rcx to hold the threshold loaded before the dispatch.  Each pass
+   reads four aligned 16-byte blocks below %rsi, realigns them with
+   palignr $5 and stores 64 aligned bytes below %rdi; %xmm1 carries the
+   lowest block read into the next pass.  On exit %xmm0 is stored to
+   (%r8) and the rest is handled through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_5_bwd):
+	/* %r9 = L(shl_5_bwd_loop_L1), the entry without the prefetch.  */
+	lea	(L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x05(%rsi), %xmm1
+	jb	L(L5_bwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
+L(L5_bwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_5_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_5_bwd_loop_L1):
+	movaps	-0x15(%rsi), %xmm2
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$0x40, %rdx
+	movaps	-0x25(%rsi), %xmm3
+	movaps	-0x35(%rsi), %xmm4
+	movaps	-0x45(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$5, %xmm2, %xmm1
+	palignr	$5, %xmm3, %xmm2
+	palignr	$5, %xmm4, %xmm3
+	palignr	$5, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	/* Carry the lowest block read into the next pass.  */
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	/* %rdi has just dropped by 0x40, so 0x10(%rdi) and (%rdi) are the
+	   two lower 16-byte slots of this 64-byte group.  */
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_5_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_5_bwd_end):
+	movaps	%xmm4, (%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Forward copy for a source 6 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 6).  Expects %r9 to hold
+   the address of L(shl_6), as left by BRANCH_TO_JMPTBL_ENTRY, and %rcx
+   to hold the threshold loaded before the dispatch.  Each pass reads
+   four aligned 16-byte blocks, realigns them with palignr $6 and
+   stores 64 aligned bytes; %xmm1 carries the last block read into the
+   next pass.  On exit %xmm0 is stored to (%r8) and the tail is handled
+   through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_6):
+	/* %r9 = L(shl_6_loop_L1), the loop entry without the prefetch.  */
+	lea	(L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x06(%rsi), %xmm1
+	jb	L(L6_fwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
+L(L6_fwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_6_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_6_loop_L1):
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$64, %rdx
+	movaps	0x0a(%rsi), %xmm2
+	movaps	0x1a(%rsi), %xmm3
+	movaps	0x2a(%rsi), %xmm4
+	movaps	0x3a(%rsi), %xmm5
+	/* Keep an unshifted copy of the last block for the next pass.  */
+	movdqa	%xmm5, %xmm6
+	palignr	$6, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$6, %xmm3, %xmm4
+	palignr	$6, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$6, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_6_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_6_end):
+	movaps	%xmm4, -0x20(%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Backward copy for a source end 6 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 6).  Expects %r9 to hold
+   the address of L(shl_6_bwd), as left by BRANCH_TO_JMPTBL_ENTRY, and
+   %rcx to hold the threshold loaded before the dispatch.  Each pass
+   reads four aligned 16-byte blocks below %rsi, realigns them with
+   palignr $6 and stores 64 aligned bytes below %rdi; %xmm1 carries the
+   lowest block read into the next pass.  On exit %xmm0 is stored to
+   (%r8) and the rest is handled through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_6_bwd):
+	/* %r9 = L(shl_6_bwd_loop_L1), the entry without the prefetch.  */
+	lea	(L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x06(%rsi), %xmm1
+	jb	L(L6_bwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
+L(L6_bwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_6_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_6_bwd_loop_L1):
+	movaps	-0x16(%rsi), %xmm2
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$0x40, %rdx
+	movaps	-0x26(%rsi), %xmm3
+	movaps	-0x36(%rsi), %xmm4
+	movaps	-0x46(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$6, %xmm2, %xmm1
+	palignr	$6, %xmm3, %xmm2
+	palignr	$6, %xmm4, %xmm3
+	palignr	$6, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	/* Carry the lowest block read into the next pass.  */
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	/* %rdi has just dropped by 0x40, so 0x10(%rdi) and (%rdi) are the
+	   two lower 16-byte slots of this 64-byte group.  */
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_6_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_6_bwd_end):
+	movaps	%xmm4, (%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Forward copy for a source 7 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 7).  Expects %r9 to hold
+   the address of L(shl_7), as left by BRANCH_TO_JMPTBL_ENTRY, and %rcx
+   to hold the threshold loaded before the dispatch.  Each pass reads
+   four aligned 16-byte blocks, realigns them with palignr $7 and
+   stores 64 aligned bytes; %xmm1 carries the last block read into the
+   next pass.  On exit %xmm0 is stored to (%r8) and the tail is handled
+   through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_7):
+	/* %r9 = L(shl_7_loop_L1), the loop entry without the prefetch.  */
+	lea	(L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x07(%rsi), %xmm1
+	jb	L(L7_fwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
+L(L7_fwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_7_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_7_loop_L1):
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$64, %rdx
+	movaps	0x09(%rsi), %xmm2
+	movaps	0x19(%rsi), %xmm3
+	movaps	0x29(%rsi), %xmm4
+	movaps	0x39(%rsi), %xmm5
+	/* Keep an unshifted copy of the last block for the next pass.  */
+	movdqa	%xmm5, %xmm6
+	palignr	$7, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$7, %xmm3, %xmm4
+	palignr	$7, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$7, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_7_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_7_end):
+	movaps	%xmm4, -0x20(%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Backward copy for a source end 7 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 7).  Expects %r9 to hold
+   the address of L(shl_7_bwd), as left by BRANCH_TO_JMPTBL_ENTRY, and
+   %rcx to hold the threshold loaded before the dispatch.  Each pass
+   reads four aligned 16-byte blocks below %rsi, realigns them with
+   palignr $7 and stores 64 aligned bytes below %rdi; %xmm1 carries the
+   lowest block read into the next pass.  On exit %xmm0 is stored to
+   (%r8) and the rest is handled through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_7_bwd):
+	/* %r9 = L(shl_7_bwd_loop_L1), the entry without the prefetch.  */
+	lea	(L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x07(%rsi), %xmm1
+	jb	L(L7_bwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
+L(L7_bwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_7_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_7_bwd_loop_L1):
+	movaps	-0x17(%rsi), %xmm2
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$0x40, %rdx
+	movaps	-0x27(%rsi), %xmm3
+	movaps	-0x37(%rsi), %xmm4
+	movaps	-0x47(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$7, %xmm2, %xmm1
+	palignr	$7, %xmm3, %xmm2
+	palignr	$7, %xmm4, %xmm3
+	palignr	$7, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	/* Carry the lowest block read into the next pass.  */
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	/* %rdi has just dropped by 0x40, so 0x10(%rdi) and (%rdi) are the
+	   two lower 16-byte slots of this 64-byte group.  */
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_7_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_7_bwd_end):
+	movaps	%xmm4, (%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Forward copy for a source 8 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 8).  Expects %r9 to hold
+   the address of L(shl_8), as left by BRANCH_TO_JMPTBL_ENTRY, and %rcx
+   to hold the threshold loaded before the dispatch.  Each pass reads
+   four aligned 16-byte blocks, realigns them with palignr $8 and
+   stores 64 aligned bytes; %xmm1 carries the last block read into the
+   next pass.  On exit %xmm0 is stored to (%r8) and the tail is handled
+   through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_8):
+	/* %r9 = L(shl_8_loop_L1), the loop entry without the prefetch.  */
+	lea	(L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x08(%rsi), %xmm1
+	jb	L(L8_fwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
+L(L8_fwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	/* Every other indirect jump in this file is followed by ud2; this
+	   one is given the same guard.  It is never reached
+	   architecturally, and %r9 is built from label differences, so
+	   the two extra bytes do not disturb the computed targets.  */
+	ud2
+L(shl_8_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_8_loop_L1):
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$64, %rdx
+	movaps	0x08(%rsi), %xmm2
+	movaps	0x18(%rsi), %xmm3
+	movaps	0x28(%rsi), %xmm4
+	movaps	0x38(%rsi), %xmm5
+	/* Keep an unshifted copy of the last block for the next pass.  */
+	movdqa	%xmm5, %xmm6
+	palignr	$8, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$8, %xmm3, %xmm4
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$8, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_8_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+	.p2align 4
+L(shl_8_end):
+	/* Remove the bias; %rdx becomes the remaining count.  The two
+	   pending stores use the not-yet-advanced %rdi.  */
+	lea	64(%rdx), %rdx
+	movaps	%xmm4, -0x20(%rdi)
+	add	%rdx, %rsi
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Backward copy for a source end 8 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 8).  Expects %r9 to hold
+   the address of L(shl_8_bwd), as left by BRANCH_TO_JMPTBL_ENTRY, and
+   %rcx to hold the threshold loaded before the dispatch.  Each pass
+   reads four aligned 16-byte blocks below %rsi, realigns them with
+   palignr $8 and stores 64 aligned bytes below %rdi; %xmm1 carries the
+   lowest block read into the next pass.  On exit %xmm0 is stored to
+   (%r8) and the rest is handled through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_8_bwd):
+	/* %r9 = L(shl_8_bwd_loop_L1), the entry without the prefetch.  */
+	lea	(L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x08(%rsi), %xmm1
+	jb	L(L8_bwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
+L(L8_bwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_8_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_8_bwd_loop_L1):
+	movaps	-0x18(%rsi), %xmm2
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$0x40, %rdx
+	movaps	-0x28(%rsi), %xmm3
+	movaps	-0x38(%rsi), %xmm4
+	movaps	-0x48(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$8, %xmm2, %xmm1
+	palignr	$8, %xmm3, %xmm2
+	palignr	$8, %xmm4, %xmm3
+	palignr	$8, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	/* Carry the lowest block read into the next pass.  */
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	/* %rdi has just dropped by 0x40, so 0x10(%rdi) and (%rdi) are the
+	   two lower 16-byte slots of this 64-byte group.  */
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_8_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_8_bwd_end):
+	movaps	%xmm4, (%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Forward copy for a source 9 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 9).  Expects %r9 to hold
+   the address of L(shl_9), as left by BRANCH_TO_JMPTBL_ENTRY, and %rcx
+   to hold the threshold loaded before the dispatch.  Each pass reads
+   four aligned 16-byte blocks, realigns them with palignr $9 and
+   stores 64 aligned bytes; %xmm1 carries the last block read into the
+   next pass.  On exit %xmm0 is stored to (%r8) and the tail is handled
+   through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_9):
+	/* %r9 = L(shl_9_loop_L1), the loop entry without the prefetch.  */
+	lea	(L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x09(%rsi), %xmm1
+	jb	L(L9_fwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
+L(L9_fwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_9_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_9_loop_L1):
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$64, %rdx
+	movaps	0x07(%rsi), %xmm2
+	movaps	0x17(%rsi), %xmm3
+	movaps	0x27(%rsi), %xmm4
+	movaps	0x37(%rsi), %xmm5
+	/* Keep an unshifted copy of the last block for the next pass.  */
+	movdqa	%xmm5, %xmm6
+	palignr	$9, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$9, %xmm3, %xmm4
+	palignr	$9, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$9, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_9_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_9_end):
+	movaps	%xmm4, -0x20(%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Backward copy for a source end 9 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 9).  Expects %r9 to hold
+   the address of L(shl_9_bwd), as left by BRANCH_TO_JMPTBL_ENTRY, and
+   %rcx to hold the threshold loaded before the dispatch.  Each pass
+   reads four aligned 16-byte blocks below %rsi, realigns them with
+   palignr $9 and stores 64 aligned bytes below %rdi; %xmm1 carries the
+   lowest block read into the next pass.  On exit %xmm0 is stored to
+   (%r8) and the rest is handled through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_9_bwd):
+	/* %r9 = L(shl_9_bwd_loop_L1), the entry without the prefetch.  */
+	lea	(L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x09(%rsi), %xmm1
+	jb	L(L9_bwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
+L(L9_bwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_9_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_9_bwd_loop_L1):
+	movaps	-0x19(%rsi), %xmm2
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$0x40, %rdx
+	movaps	-0x29(%rsi), %xmm3
+	movaps	-0x39(%rsi), %xmm4
+	movaps	-0x49(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$9, %xmm2, %xmm1
+	palignr	$9, %xmm3, %xmm2
+	palignr	$9, %xmm4, %xmm3
+	palignr	$9, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	/* Carry the lowest block read into the next pass.  */
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	/* %rdi has just dropped by 0x40, so 0x10(%rdi) and (%rdi) are the
+	   two lower 16-byte slots of this 64-byte group.  */
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_9_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_9_bwd_end):
+	movaps	%xmm4, (%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Forward copy for a source 10 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 10).  Expects %r9 to hold
+   the address of L(shl_10), as left by BRANCH_TO_JMPTBL_ENTRY, and
+   %rcx to hold the threshold loaded before the dispatch.  Each pass
+   reads four aligned 16-byte blocks, realigns them with palignr $10
+   and stores 64 aligned bytes; %xmm1 carries the last block read into
+   the next pass.  On exit %xmm0 is stored to (%r8) and the tail is
+   handled through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_10):
+	/* %r9 = L(shl_10_loop_L1), the loop entry without the prefetch.  */
+	lea	(L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x0a(%rsi), %xmm1
+	jb	L(L10_fwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
+L(L10_fwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_10_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_10_loop_L1):
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$64, %rdx
+	movaps	0x06(%rsi), %xmm2
+	movaps	0x16(%rsi), %xmm3
+	movaps	0x26(%rsi), %xmm4
+	movaps	0x36(%rsi), %xmm5
+	/* Keep an unshifted copy of the last block for the next pass.  */
+	movdqa	%xmm5, %xmm6
+	palignr	$10, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$10, %xmm3, %xmm4
+	palignr	$10, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$10, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_10_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_10_end):
+	movaps	%xmm4, -0x20(%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Backward copy for a source end 10 bytes past a 16-byte boundary
+   (the aligned loads below require %rsi & 0xf == 10).  Expects %r9 to
+   hold the address of L(shl_10_bwd), as left by
+   BRANCH_TO_JMPTBL_ENTRY, and %rcx to hold the threshold loaded before
+   the dispatch.  Each pass reads four aligned 16-byte blocks below
+   %rsi, realigns them with palignr $10 and stores 64 aligned bytes
+   below %rdi; %xmm1 carries the lowest block read into the next pass.
+   On exit %xmm0 is stored to (%r8) and the rest is handled through
+   L(table_less_80bytes).  */
+	.p2align 4
+L(shl_10_bwd):
+	/* %r9 = L(shl_10_bwd_loop_L1), the entry without the prefetch.  */
+	lea	(L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x0a(%rsi), %xmm1
+	jb	L(L10_bwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
+L(L10_bwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_10_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_10_bwd_loop_L1):
+	movaps	-0x1a(%rsi), %xmm2
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$0x40, %rdx
+	movaps	-0x2a(%rsi), %xmm3
+	movaps	-0x3a(%rsi), %xmm4
+	movaps	-0x4a(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$10, %xmm2, %xmm1
+	palignr	$10, %xmm3, %xmm2
+	palignr	$10, %xmm4, %xmm3
+	palignr	$10, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	/* Carry the lowest block read into the next pass.  */
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	/* %rdi has just dropped by 0x40, so 0x10(%rdi) and (%rdi) are the
+	   two lower 16-byte slots of this 64-byte group.  */
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_10_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_10_bwd_end):
+	movaps	%xmm4, (%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Forward copy for a source 11 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 11).  Expects %r9 to hold
+   the address of L(shl_11), as left by BRANCH_TO_JMPTBL_ENTRY, and
+   %rcx to hold the threshold loaded before the dispatch.  Each pass
+   reads four aligned 16-byte blocks, realigns them with palignr $11
+   and stores 64 aligned bytes; %xmm1 carries the last block read into
+   the next pass.  On exit %xmm0 is stored to (%r8) and the tail is
+   handled through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_11):
+	/* %r9 = L(shl_11_loop_L1), the loop entry without the prefetch.  */
+	lea	(L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x0b(%rsi), %xmm1
+	jb	L(L11_fwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
+L(L11_fwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_11_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_11_loop_L1):
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$64, %rdx
+	movaps	0x05(%rsi), %xmm2
+	movaps	0x15(%rsi), %xmm3
+	movaps	0x25(%rsi), %xmm4
+	movaps	0x35(%rsi), %xmm5
+	/* Keep an unshifted copy of the last block for the next pass.  */
+	movdqa	%xmm5, %xmm6
+	palignr	$11, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$11, %xmm3, %xmm4
+	palignr	$11, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$11, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_11_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_11_end):
+	movaps	%xmm4, -0x20(%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Backward copy for a source end 11 bytes past a 16-byte boundary
+   (the aligned loads below require %rsi & 0xf == 11).  Expects %r9 to
+   hold the address of L(shl_11_bwd), as left by
+   BRANCH_TO_JMPTBL_ENTRY, and %rcx to hold the threshold loaded before
+   the dispatch.  Each pass reads four aligned 16-byte blocks below
+   %rsi, realigns them with palignr $11 and stores 64 aligned bytes
+   below %rdi; %xmm1 carries the lowest block read into the next pass.
+   On exit %xmm0 is stored to (%r8) and the rest is handled through
+   L(table_less_80bytes).  */
+	.p2align 4
+L(shl_11_bwd):
+	/* %r9 = L(shl_11_bwd_loop_L1), the entry without the prefetch.  */
+	lea	(L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x0b(%rsi), %xmm1
+	jb	L(L11_bwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
+L(L11_bwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_11_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_11_bwd_loop_L1):
+	movaps	-0x1b(%rsi), %xmm2
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$0x40, %rdx
+	movaps	-0x2b(%rsi), %xmm3
+	movaps	-0x3b(%rsi), %xmm4
+	movaps	-0x4b(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$11, %xmm2, %xmm1
+	palignr	$11, %xmm3, %xmm2
+	palignr	$11, %xmm4, %xmm3
+	palignr	$11, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	/* Carry the lowest block read into the next pass.  */
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	/* %rdi has just dropped by 0x40, so 0x10(%rdi) and (%rdi) are the
+	   two lower 16-byte slots of this 64-byte group.  */
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_11_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_11_bwd_end):
+	movaps	%xmm4, (%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+/* Forward copy for a source 12 bytes past a 16-byte boundary (the
+   aligned loads below require %rsi & 0xf == 12).  Expects %r9 to hold
+   the address of L(shl_12), as left by BRANCH_TO_JMPTBL_ENTRY, and
+   %rcx to hold the threshold loaded before the dispatch.  Each pass
+   reads four aligned 16-byte blocks, realigns them with palignr $12
+   and stores 64 aligned bytes; %xmm1 carries the last block read into
+   the next pass.  On exit %xmm0 is stored to (%r8) and the tail is
+   handled through L(table_less_80bytes).  */
+	.p2align 4
+L(shl_12):
+	/* %r9 = L(shl_12_loop_L1), the loop entry without the prefetch.  */
+	lea	(L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x0c(%rsi), %xmm1
+	jb	L(L12_fwd)
+	/* Count not below %rcx: loop through the prefetching entry.  */
+	lea	(L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
+L(L12_fwd):
+	/* Bias the count by -64 so the in-loop sub borrows on the last
+	   pass.  */
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_12_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_12_loop_L1):
+	/* The jb below tests the carry from this sub; the moves, palignr
+	   and lea in between do not change the flags.  */
+	sub	$64, %rdx
+	movaps	0x04(%rsi), %xmm2
+	movaps	0x14(%rsi), %xmm3
+	movaps	0x24(%rsi), %xmm4
+	movaps	0x34(%rsi), %xmm5
+	/* Keep an unshifted copy of the last block for the next pass.  */
+	movdqa	%xmm5, %xmm6
+	palignr	$12, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$12, %xmm3, %xmm4
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$12, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_12_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_12_end):
+	movaps	%xmm4, -0x20(%rdi)
+	/* Remove the bias; %rdx becomes the remaining count.  */
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	.p2align 4
+L(shl_12_bwd):	/* Backward copy for a source 12 bytes past 16-byte alignment: aligned loads realigned with palignr.  */
+	lea	(L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9	/* Presumably %r9 holds this label's address on entry, giving the L1 loop address -- confirm in the dispatcher.  */
+	cmp	%rcx, %rdx	/* Sets flags for the jb below (%rdx vs %rcx, unsigned).  */
+	movaps	-0x0c(%rsi), %xmm1	/* Aligned load, so %rsi-12 must be 16-byte aligned; leaves flags intact.  */
+	jb	L(L12_bwd)	/* %rdx < %rcx: keep the L1 entry (no prefetch).  */
+	lea	(L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9	/* Otherwise move %r9 to the L2 entry, which prefetches first.  */
+L(L12_bwd):
+	lea	-64(%rdx), %rdx	/* Bias the count by -64 so the loop's sub borrows on the last full block.  */
+	jmp	*%r9
+	ud2	/* Not reached: control always leaves through the indirect jump.  */
+L(shl_12_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)	/* Non-temporal prefetch 0x1c0 bytes below the read position.  */
+L(shl_12_bwd_loop_L1):
+	movaps	-0x1c(%rsi), %xmm2
+	sub	$0x40, %rdx	/* CF set on borrow; nothing before the jb below modifies flags.  */
+	movaps	-0x2c(%rsi), %xmm3
+	movaps	-0x3c(%rsi), %xmm4
+	movaps	-0x4c(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$12, %xmm2, %xmm1	/* Splice two adjacent aligned blocks into 16 realigned bytes.  */
+	palignr	$12, %xmm3, %xmm2
+	palignr	$12, %xmm4, %xmm3
+	palignr	$12, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1	/* Carry the lowest block loaded into the next iteration.  */
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)	/* Offset is relative to the already decremented %rdi.  */
+	jb	L(shl_12_bwd_end)	/* Borrow: this was the last full 64-byte block.  */
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9	/* Next iteration through the entry chosen during setup.  */
+	ud2
+L(shl_12_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx	/* Undo the -64 bias: %rdx is the byte count still to copy.  */
+	movdqu	%xmm0, (%r8)	/* Store %xmm0 at (%r8); both were set before this block -- TODO confirm their meaning.  */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)	/* Dispatch on %rdx through table_less_80bytes.  */
+
+	.p2align 4
+L(shl_13):	/* Forward copy for a source 13 bytes past 16-byte alignment: aligned loads realigned with palignr.  */
+	lea	(L(shl_13_loop_L1)-L(shl_13))(%r9), %r9	/* Presumably %r9 holds this label's address on entry, giving the L1 loop address -- confirm in the dispatcher.  */
+	cmp	%rcx, %rdx	/* Sets flags for the jb below (%rdx vs %rcx, unsigned).  */
+	movaps	-0x0d(%rsi), %xmm1	/* Aligned load, so %rsi-13 must be 16-byte aligned; leaves flags intact.  */
+	jb	L(L13_fwd)	/* %rdx < %rcx: keep the L1 entry (no prefetch).  */
+	lea	(L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9	/* Otherwise move %r9 to the L2 entry, which prefetches first.  */
+L(L13_fwd):
+	lea	-64(%rdx), %rdx	/* Bias the count by -64 so the loop's sub borrows on the last full block.  */
+	jmp	*%r9
+	ud2	/* Not reached: control always leaves through the indirect jump.  */
+L(shl_13_loop_L2):
+	prefetchnta 0x1c0(%rsi)	/* Non-temporal prefetch 0x1c0 bytes ahead of the read position.  */
+L(shl_13_loop_L1):
+	sub	$64, %rdx	/* CF set on borrow; nothing before the jb below modifies flags.  */
+	movaps	0x03(%rsi), %xmm2
+	movaps	0x13(%rsi), %xmm3
+	movaps	0x23(%rsi), %xmm4
+	movaps	0x33(%rsi), %xmm5
+	movdqa	%xmm5, %xmm6	/* Keep the highest block intact; palignr overwrites %xmm5.  */
+	palignr	$13, %xmm4, %xmm5	/* Splice two adjacent aligned blocks into 16 realigned bytes.  */
+	lea	64(%rsi), %rsi
+	palignr	$13, %xmm3, %xmm4
+	palignr	$13, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$13, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1	/* Carry the saved block into the next iteration.  */
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_13_end)	/* Borrow: this was the last full 64-byte block.  */
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9	/* Next iteration through the entry chosen during setup.  */
+	ud2
+L(shl_13_end):
+	movaps	%xmm4, -0x20(%rdi)
+	lea	64(%rdx), %rdx	/* Undo the -64 bias: %rdx is the byte count still to copy.  */
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi	/* Point %rdi past the tail; the write_<N>bytes code uses negative offsets.  */
+	movdqu	%xmm0, (%r8)	/* Store %xmm0 at (%r8); both were set before this block -- TODO confirm their meaning.  */
+	add	%rdx, %rsi	/* Likewise for the source pointer.  */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)	/* Dispatch on %rdx through table_less_80bytes.  */
+
+	.p2align 4
+L(shl_13_bwd):	/* Backward copy for a source 13 bytes past 16-byte alignment: aligned loads realigned with palignr.  */
+	lea	(L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9	/* Presumably %r9 holds this label's address on entry, giving the L1 loop address -- confirm in the dispatcher.  */
+	cmp	%rcx, %rdx	/* Sets flags for the jb below (%rdx vs %rcx, unsigned).  */
+	movaps	-0x0d(%rsi), %xmm1	/* Aligned load, so %rsi-13 must be 16-byte aligned; leaves flags intact.  */
+	jb	L(L13_bwd)	/* %rdx < %rcx: keep the L1 entry (no prefetch).  */
+	lea	(L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9	/* Otherwise move %r9 to the L2 entry, which prefetches first.  */
+L(L13_bwd):
+	lea	-64(%rdx), %rdx	/* Bias the count by -64 so the loop's sub borrows on the last full block.  */
+	jmp	*%r9
+	ud2	/* Not reached: control always leaves through the indirect jump.  */
+L(shl_13_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)	/* Non-temporal prefetch 0x1c0 bytes below the read position.  */
+L(shl_13_bwd_loop_L1):
+	movaps	-0x1d(%rsi), %xmm2
+	sub	$0x40, %rdx	/* CF set on borrow; nothing before the jb below modifies flags.  */
+	movaps	-0x2d(%rsi), %xmm3
+	movaps	-0x3d(%rsi), %xmm4
+	movaps	-0x4d(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$13, %xmm2, %xmm1	/* Splice two adjacent aligned blocks into 16 realigned bytes.  */
+	palignr	$13, %xmm3, %xmm2
+	palignr	$13, %xmm4, %xmm3
+	palignr	$13, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1	/* Carry the lowest block loaded into the next iteration.  */
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)	/* Offset is relative to the already decremented %rdi.  */
+	jb	L(shl_13_bwd_end)	/* Borrow: this was the last full 64-byte block.  */
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9	/* Next iteration through the entry chosen during setup.  */
+	ud2
+L(shl_13_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx	/* Undo the -64 bias: %rdx is the byte count still to copy.  */
+	movdqu	%xmm0, (%r8)	/* Store %xmm0 at (%r8); both were set before this block -- TODO confirm their meaning.  */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)	/* Dispatch on %rdx through table_less_80bytes.  */
+
+	.p2align 4
+L(shl_14):	/* Forward copy for a source 14 bytes past 16-byte alignment: aligned loads realigned with palignr.  */
+	lea	(L(shl_14_loop_L1)-L(shl_14))(%r9), %r9	/* Presumably %r9 holds this label's address on entry, giving the L1 loop address -- confirm in the dispatcher.  */
+	cmp	%rcx, %rdx	/* Sets flags for the jb below (%rdx vs %rcx, unsigned).  */
+	movaps	-0x0e(%rsi), %xmm1	/* Aligned load, so %rsi-14 must be 16-byte aligned; leaves flags intact.  */
+	jb	L(L14_fwd)	/* %rdx < %rcx: keep the L1 entry (no prefetch).  */
+	lea	(L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9	/* Otherwise move %r9 to the L2 entry, which prefetches first.  */
+L(L14_fwd):
+	lea	-64(%rdx), %rdx	/* Bias the count by -64 so the loop's sub borrows on the last full block.  */
+	jmp	*%r9
+	ud2	/* Not reached: control always leaves through the indirect jump.  */
+L(shl_14_loop_L2):
+	prefetchnta 0x1c0(%rsi)	/* Non-temporal prefetch 0x1c0 bytes ahead of the read position.  */
+L(shl_14_loop_L1):
+	sub	$64, %rdx	/* CF set on borrow; nothing before the jb below modifies flags.  */
+	movaps	0x02(%rsi), %xmm2
+	movaps	0x12(%rsi), %xmm3
+	movaps	0x22(%rsi), %xmm4
+	movaps	0x32(%rsi), %xmm5
+	movdqa	%xmm5, %xmm6	/* Keep the highest block intact; palignr overwrites %xmm5.  */
+	palignr	$14, %xmm4, %xmm5	/* Splice two adjacent aligned blocks into 16 realigned bytes.  */
+	lea	64(%rsi), %rsi
+	palignr	$14, %xmm3, %xmm4
+	palignr	$14, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$14, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1	/* Carry the saved block into the next iteration.  */
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_14_end)	/* Borrow: this was the last full 64-byte block.  */
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9	/* Next iteration through the entry chosen during setup.  */
+	ud2
+L(shl_14_end):
+	movaps	%xmm4, -0x20(%rdi)
+	lea	64(%rdx), %rdx	/* Undo the -64 bias: %rdx is the byte count still to copy.  */
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi	/* Point %rdi past the tail; the write_<N>bytes code uses negative offsets.  */
+	movdqu	%xmm0, (%r8)	/* Store %xmm0 at (%r8); both were set before this block -- TODO confirm their meaning.  */
+	add	%rdx, %rsi	/* Likewise for the source pointer.  */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)	/* Dispatch on %rdx through table_less_80bytes.  */
+
+	.p2align 4
+L(shl_14_bwd):	/* Backward copy for a source 14 bytes past 16-byte alignment: aligned loads realigned with palignr.  */
+	lea	(L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9	/* Presumably %r9 holds this label's address on entry, giving the L1 loop address -- confirm in the dispatcher.  */
+	cmp	%rcx, %rdx	/* Sets flags for the jb below (%rdx vs %rcx, unsigned).  */
+	movaps	-0x0e(%rsi), %xmm1	/* Aligned load, so %rsi-14 must be 16-byte aligned; leaves flags intact.  */
+	jb	L(L14_bwd)	/* %rdx < %rcx: keep the L1 entry (no prefetch).  */
+	lea	(L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9	/* Otherwise move %r9 to the L2 entry, which prefetches first.  */
+L(L14_bwd):
+	lea	-64(%rdx), %rdx	/* Bias the count by -64 so the loop's sub borrows on the last full block.  */
+	jmp	*%r9
+	ud2	/* Not reached: control always leaves through the indirect jump.  */
+L(shl_14_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)	/* Non-temporal prefetch 0x1c0 bytes below the read position.  */
+L(shl_14_bwd_loop_L1):
+	movaps	-0x1e(%rsi), %xmm2
+	sub	$0x40, %rdx	/* CF set on borrow; nothing before the jb below modifies flags.  */
+	movaps	-0x2e(%rsi), %xmm3
+	movaps	-0x3e(%rsi), %xmm4
+	movaps	-0x4e(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$14, %xmm2, %xmm1	/* Splice two adjacent aligned blocks into 16 realigned bytes.  */
+	palignr	$14, %xmm3, %xmm2
+	palignr	$14, %xmm4, %xmm3
+	palignr	$14, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1	/* Carry the lowest block loaded into the next iteration.  */
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)	/* Offset is relative to the already decremented %rdi.  */
+	jb	L(shl_14_bwd_end)	/* Borrow: this was the last full 64-byte block.  */
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9	/* Next iteration through the entry chosen during setup.  */
+	ud2
+L(shl_14_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx	/* Undo the -64 bias: %rdx is the byte count still to copy.  */
+	movdqu	%xmm0, (%r8)	/* Store %xmm0 at (%r8); both were set before this block -- TODO confirm their meaning.  */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)	/* Dispatch on %rdx through table_less_80bytes.  */
+
+	.p2align 4
+L(shl_15):	/* Forward copy for a source 15 bytes past 16-byte alignment: aligned loads realigned with palignr.  */
+	lea	(L(shl_15_loop_L1)-L(shl_15))(%r9), %r9	/* Presumably %r9 holds this label's address on entry, giving the L1 loop address -- confirm in the dispatcher.  */
+	cmp	%rcx, %rdx	/* Sets flags for the jb below (%rdx vs %rcx, unsigned).  */
+	movaps	-0x0f(%rsi), %xmm1	/* Aligned load, so %rsi-15 must be 16-byte aligned; leaves flags intact.  */
+	jb	L(L15_fwd)	/* %rdx < %rcx: keep the L1 entry (no prefetch).  */
+	lea	(L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9	/* Otherwise move %r9 to the L2 entry, which prefetches first.  */
+L(L15_fwd):
+	lea	-64(%rdx), %rdx	/* Bias the count by -64 so the loop's sub borrows on the last full block.  */
+	jmp	*%r9
+	ud2	/* Not reached: control always leaves through the indirect jump.  */
+L(shl_15_loop_L2):
+	prefetchnta 0x1c0(%rsi)	/* Non-temporal prefetch 0x1c0 bytes ahead of the read position.  */
+L(shl_15_loop_L1):
+	sub	$64, %rdx	/* CF set on borrow; nothing before the jb below modifies flags.  */
+	movaps	0x01(%rsi), %xmm2
+	movaps	0x11(%rsi), %xmm3
+	movaps	0x21(%rsi), %xmm4
+	movaps	0x31(%rsi), %xmm5
+	movdqa	%xmm5, %xmm6	/* Keep the highest block intact; palignr overwrites %xmm5.  */
+	palignr	$15, %xmm4, %xmm5	/* Splice two adjacent aligned blocks into 16 realigned bytes.  */
+	lea	64(%rsi), %rsi
+	palignr	$15, %xmm3, %xmm4
+	palignr	$15, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$15, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1	/* Carry the saved block into the next iteration.  */
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_15_end)	/* Borrow: this was the last full 64-byte block.  */
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9	/* Next iteration through the entry chosen during setup.  */
+	ud2
+L(shl_15_end):
+	movaps	%xmm4, -0x20(%rdi)
+	lea	64(%rdx), %rdx	/* Undo the -64 bias: %rdx is the byte count still to copy.  */
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi	/* Point %rdi past the tail; the write_<N>bytes code uses negative offsets.  */
+	movdqu	%xmm0, (%r8)	/* Store %xmm0 at (%r8); both were set before this block -- TODO confirm their meaning.  */
+	add	%rdx, %rsi	/* Likewise for the source pointer.  */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)	/* Dispatch on %rdx through table_less_80bytes.  */
+
+	.p2align 4
+L(shl_15_bwd):	/* Backward copy for a source 15 bytes past 16-byte alignment: aligned loads realigned with palignr.  */
+	lea	(L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9	/* Presumably %r9 holds this label's address on entry, giving the L1 loop address -- confirm in the dispatcher.  */
+	cmp	%rcx, %rdx	/* Sets flags for the jb below (%rdx vs %rcx, unsigned).  */
+	movaps	-0x0f(%rsi), %xmm1	/* Aligned load, so %rsi-15 must be 16-byte aligned; leaves flags intact.  */
+	jb	L(L15_bwd)	/* %rdx < %rcx: keep the L1 entry (no prefetch).  */
+	lea	(L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9	/* Otherwise move %r9 to the L2 entry, which prefetches first.  */
+L(L15_bwd):
+	lea	-64(%rdx), %rdx	/* Bias the count by -64 so the loop's sub borrows on the last full block.  */
+	jmp	*%r9
+	ud2	/* Not reached: control always leaves through the indirect jump.  */
+L(shl_15_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)	/* Non-temporal prefetch 0x1c0 bytes below the read position.  */
+L(shl_15_bwd_loop_L1):
+	movaps	-0x1f(%rsi), %xmm2
+	sub	$0x40, %rdx	/* CF set on borrow; nothing before the jb below modifies flags.  */
+	movaps	-0x2f(%rsi), %xmm3
+	movaps	-0x3f(%rsi), %xmm4
+	movaps	-0x4f(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$15, %xmm2, %xmm1	/* Splice two adjacent aligned blocks into 16 realigned bytes.  */
+	palignr	$15, %xmm3, %xmm2
+	palignr	$15, %xmm4, %xmm3
+	palignr	$15, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1	/* Carry the lowest block loaded into the next iteration.  */
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)	/* Offset is relative to the already decremented %rdi.  */
+	jb	L(shl_15_bwd_end)	/* Borrow: this was the last full 64-byte block.  */
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9	/* Next iteration through the entry chosen during setup.  */
+	ud2
+L(shl_15_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx	/* Undo the -64 bias: %rdx is the byte count still to copy.  */
+	movdqu	%xmm0, (%r8)	/* Store %xmm0 at (%r8); both were set before this block -- TODO confirm their meaning.  */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)	/* Dispatch on %rdx through table_less_80bytes.  */
+
+	.p2align 4
+L(write_72bytes):	/* Tail of 72 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-72(%rsi), %xmm0
+	movdqu	-56(%rsi), %xmm1
+	mov	-40(%rsi), %r8
+	mov	-32(%rsi), %r9
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rcx
+	movdqu	 %xmm0, -72(%rdi)	/* Every load is complete before the first store.  */
+	movdqu	 %xmm1, -56(%rdi)
+	mov	 %r8, -40(%rdi)
+	mov	 %r9, -32(%rdi)
+	mov	 %r10, -24(%rdi)
+	mov	 %r11, -16(%rdi)
+	mov	 %rcx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_64bytes):	/* Tail of 64 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-64(%rsi), %xmm0
+	mov	-48(%rsi), %rcx
+	mov	-40(%rsi), %r8
+	mov	-32(%rsi), %r9
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rdx
+	movdqu	 %xmm0, -64(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %rcx, -48(%rdi)
+	mov	 %r8, -40(%rdi)
+	mov	 %r9, -32(%rdi)
+	mov	 %r10, -24(%rdi)
+	mov	 %r11, -16(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_56bytes):	/* Tail of 56 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-56(%rsi), %xmm0
+	mov	-40(%rsi), %r8
+	mov	-32(%rsi), %r9
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rcx
+	movdqu	 %xmm0, -56(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r8, -40(%rdi)
+	mov	 %r9, -32(%rdi)
+	mov	 %r10, -24(%rdi)
+	mov	 %r11, -16(%rdi)
+	mov	 %rcx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_48bytes):	/* Tail of 48 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-48(%rsi), %rcx
+	mov	-40(%rsi), %r8
+	mov	-32(%rsi), %r9
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rdx
+	mov	 %rcx, -48(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r8, -40(%rdi)
+	mov	 %r9, -32(%rdi)
+	mov	 %r10, -24(%rdi)
+	mov	 %r11, -16(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_40bytes):	/* Tail of 40 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-40(%rsi), %r8
+	mov	-32(%rsi), %r9
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rdx
+	mov	 %r8, -40(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r9, -32(%rdi)
+	mov	 %r10, -24(%rdi)
+	mov	 %r11, -16(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_32bytes):	/* Tail of 32 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-32(%rsi), %r9
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rdx
+	mov	 %r9, -32(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r10, -24(%rdi)
+	mov	 %r11, -16(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_24bytes):	/* Tail of 24 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rdx
+	mov	 %r10, -24(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r11, -16(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_16bytes):	/* Tail of 16 bytes: two 8-byte loads relative to %rsi, then two stores relative to %rdi.  */
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rdx
+	mov	 %r11, -16(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_8bytes):	/* Tail of 8 bytes: one 8-byte move from the end of the source to the end of the destination.  */
+	mov	-8(%rsi), %rdx
+	mov	 %rdx, -8(%rdi)
+L(write_0bytes):	/* Nothing left to copy; this ret also ends write_8bytes.  */
+	ret
+
+	.p2align 4
+L(write_73bytes):	/* Tail of 73 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-73(%rsi), %xmm0
+	movdqu	-57(%rsi), %xmm1
+	mov	-41(%rsi), %rcx
+	mov	-33(%rsi), %r9
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %r8
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final byte.  */
+	movdqu	 %xmm0, -73(%rdi)	/* Every load is complete before the first store.  */
+	movdqu	 %xmm1, -57(%rdi)
+	mov	 %rcx, -41(%rdi)
+	mov	 %r9, -33(%rdi)
+	mov	 %r10, -25(%rdi)
+	mov	 %r11, -17(%rdi)
+	mov	 %r8, -9(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_65bytes):	/* Tail of 65 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-65(%rsi), %xmm0
+	movdqu	-49(%rsi), %xmm1
+	mov	-33(%rsi), %r9
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final byte.  */
+	movdqu	 %xmm0, -65(%rdi)	/* Every load is complete before the first store.  */
+	movdqu	 %xmm1, -49(%rdi)
+	mov	 %r9, -33(%rdi)
+	mov	 %r10, -25(%rdi)
+	mov	 %r11, -17(%rdi)
+	mov	 %rcx, -9(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_57bytes):	/* Tail of 57 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-57(%rsi), %xmm0
+	mov	-41(%rsi), %r8
+	mov	-33(%rsi), %r9
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final byte.  */
+	movdqu	 %xmm0, -57(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r8, -41(%rdi)
+	mov	 %r9, -33(%rdi)
+	mov	 %r10, -25(%rdi)
+	mov	 %r11, -17(%rdi)
+	mov	 %rcx, -9(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_49bytes):	/* Tail of 49 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-49(%rsi), %xmm0
+	mov	-33(%rsi), %r9
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final byte.  */
+	movdqu	 %xmm0, -49(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r9, -33(%rdi)
+	mov	 %r10, -25(%rdi)
+	mov	 %r11, -17(%rdi)
+	mov	 %rcx, -9(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_41bytes):	/* Tail of 41 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-41(%rsi), %r8
+	mov	-33(%rsi), %r9
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-1(%rsi), %dl	/* Single byte load for the final byte.  */
+	mov	 %r8, -41(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r9, -33(%rdi)
+	mov	 %r10, -25(%rdi)
+	mov	 %r11, -17(%rdi)
+	mov	 %rcx, -9(%rdi)
+	mov	 %dl, -1(%rdi)
+	ret
+
+	.p2align 4
+L(write_33bytes):	/* Tail of 33 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-33(%rsi), %r9
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-1(%rsi), %dl	/* Single byte load for the final byte.  */
+	mov	 %r9, -33(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r10, -25(%rdi)
+	mov	 %r11, -17(%rdi)
+	mov	 %rcx, -9(%rdi)
+	mov	 %dl, -1(%rdi)
+	ret
+
+	.p2align 4
+L(write_25bytes):	/* Tail of 25 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-1(%rsi), %dl	/* Single byte load for the final byte.  */
+	mov	 %r10, -25(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r11, -17(%rdi)
+	mov	 %rcx, -9(%rdi)
+	mov	 %dl, -1(%rdi)
+	ret
+
+	.p2align 4
+L(write_17bytes):	/* Tail of 17 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final byte.  */
+	mov	 %r11, -17(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %rcx, -9(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_9bytes):	/* Tail of 9 bytes: an 8-byte and an overlapping 4-byte move, loads before stores.  */
+	mov	-9(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final byte.  */
+	mov	 %rcx, -9(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_1bytes):	/* Tail of 1 byte: copy the byte just below %rsi to just below %rdi.  */
+	mov	-1(%rsi), %dl
+	mov	 %dl, -1(%rdi)
+	ret
+
+	.p2align 4
+L(write_74bytes):	/* Tail of 74 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-74(%rsi), %xmm0
+	movdqu	-58(%rsi), %xmm1
+	mov	-42(%rsi), %r8
+	mov	-34(%rsi), %r9
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final 2 bytes.  */
+	movdqu	 %xmm0, -74(%rdi)	/* Every load is complete before the first store.  */
+	movdqu	 %xmm1, -58(%rdi)
+	mov	 %r8, -42(%rdi)
+	mov	 %r9, -34(%rdi)
+	mov	 %r10, -26(%rdi)
+	mov	 %r11, -18(%rdi)
+	mov	 %rcx, -10(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_66bytes):	/* Tail of 66 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-66(%rsi), %xmm0
+	movdqu	-50(%rsi), %xmm1
+	mov	-42(%rsi), %r8	/* NOTE(review): these 8 bytes lie inside the 16 bytes already held in %xmm1, so this move is redundant.  */
+	mov	-34(%rsi), %r9
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final 2 bytes.  */
+	movdqu	 %xmm0, -66(%rdi)	/* Every load is complete before the first store.  */
+	movdqu	 %xmm1, -50(%rdi)
+	mov	 %r8, -42(%rdi)
+	mov	 %r9, -34(%rdi)
+	mov	 %r10, -26(%rdi)
+	mov	 %r11, -18(%rdi)
+	mov	 %rcx, -10(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_58bytes):	/* Tail of 58 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-58(%rsi), %xmm1
+	mov	-42(%rsi), %r8
+	mov	-34(%rsi), %r9
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final 2 bytes.  */
+	movdqu	 %xmm1, -58(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r8, -42(%rdi)
+	mov	 %r9, -34(%rdi)
+	mov	 %r10, -26(%rdi)
+	mov	 %r11, -18(%rdi)
+	mov	 %rcx, -10(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_50bytes):	/* Tail of 50 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-50(%rsi), %xmm0
+	mov	-34(%rsi), %r9
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final 2 bytes.  */
+	movdqu	 %xmm0, -50(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r9, -34(%rdi)
+	mov	 %r10, -26(%rdi)
+	mov	 %r11, -18(%rdi)
+	mov	 %rcx, -10(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_42bytes):	/* Tail of 42 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-42(%rsi), %r8
+	mov	-34(%rsi), %r9
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final 2 bytes.  */
+	mov	 %r8, -42(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r9, -34(%rdi)
+	mov	 %r10, -26(%rdi)
+	mov	 %r11, -18(%rdi)
+	mov	 %rcx, -10(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_34bytes):	/* Tail of 34 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-34(%rsi), %r9
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final 2 bytes.  */
+	mov	 %r9, -34(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r10, -26(%rdi)
+	mov	 %r11, -18(%rdi)
+	mov	 %rcx, -10(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_26bytes):	/* Tail of 26 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final 2 bytes.  */
+	mov	 %r10, -26(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r11, -18(%rdi)
+	mov	 %rcx, -10(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_18bytes):	/* Tail of 18 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final 2 bytes.  */
+	mov	 %r11, -18(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %rcx, -10(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_10bytes):	/* Tail of 10 bytes: an 8-byte and an overlapping 4-byte move, loads before stores.  */
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final 2 bytes.  */
+	mov	 %rcx, -10(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_2bytes):	/* Tail of 2 bytes: one 16-bit move from the end of the source to the end of the destination.  */
+	mov	-2(%rsi), %dx
+	mov	 %dx, -2(%rdi)
+	ret
+
+	.p2align 4
+L(write_75bytes):	/* Tail of 75 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-75(%rsi), %xmm0
+	movdqu	-59(%rsi), %xmm1
+	mov	-43(%rsi), %r8
+	mov	-35(%rsi), %r9
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final 3 bytes.  */
+	movdqu	 %xmm0, -75(%rdi)	/* Every load is complete before the first store.  */
+	movdqu	 %xmm1, -59(%rdi)
+	mov	 %r8, -43(%rdi)
+	mov	 %r9, -35(%rdi)
+	mov	 %r10, -27(%rdi)
+	mov	 %r11, -19(%rdi)
+	mov	 %rcx, -11(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_67bytes):	/* Tail of 67 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-67(%rsi), %xmm0
+	movdqu	-59(%rsi), %xmm1	/* Starts 8 bytes after %xmm0, so the two 16-byte loads overlap by 8 bytes.  */
+	mov	-43(%rsi), %r8
+	mov	-35(%rsi), %r9
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final 3 bytes.  */
+	movdqu	 %xmm0, -67(%rdi)	/* Every load is complete before the first store.  */
+	movdqu	 %xmm1, -59(%rdi)
+	mov	 %r8, -43(%rdi)
+	mov	 %r9, -35(%rdi)
+	mov	 %r10, -27(%rdi)
+	mov	 %r11, -19(%rdi)
+	mov	 %rcx, -11(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_59bytes):	/* Tail of 59 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-59(%rsi), %xmm0
+	mov	-43(%rsi), %r8
+	mov	-35(%rsi), %r9
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final 3 bytes.  */
+	movdqu	 %xmm0, -59(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r8, -43(%rdi)
+	mov	 %r9, -35(%rdi)
+	mov	 %r10, -27(%rdi)
+	mov	 %r11, -19(%rdi)
+	mov	 %rcx, -11(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_51bytes):	/* Tail of 51 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-51(%rsi), %xmm0
+	mov	-35(%rsi), %r9
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final 3 bytes.  */
+	movdqu	 %xmm0, -51(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r9, -35(%rdi)
+	mov	 %r10, -27(%rdi)
+	mov	 %r11, -19(%rdi)
+	mov	 %rcx, -11(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_43bytes):	/* Tail of 43 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-43(%rsi), %r8
+	mov	-35(%rsi), %r9
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final 3 bytes.  */
+	mov	 %r8, -43(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r9, -35(%rdi)
+	mov	 %r10, -27(%rdi)
+	mov	 %r11, -19(%rdi)
+	mov	 %rcx, -11(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_35bytes):	/* Tail of 35 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-35(%rsi), %r9
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final 3 bytes.  */
+	mov	 %r9, -35(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r10, -27(%rdi)
+	mov	 %r11, -19(%rdi)
+	mov	 %rcx, -11(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_27bytes):	/* Tail of 27 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final 3 bytes.  */
+	mov	 %r10, -27(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r11, -19(%rdi)
+	mov	 %rcx, -11(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_19bytes):	/* Tail of 19 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final 3 bytes.  */
+	mov	 %r11, -19(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %rcx, -11(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_11bytes):	/* Tail of 11 bytes: an 8-byte and an overlapping 4-byte move, loads before stores.  */
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx	/* Overlaps the previous load; adds the final 3 bytes.  */
+	mov	 %rcx, -11(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_3bytes):	/* Tail of 3 bytes: two 16-bit moves that overlap by one byte, loads before stores.  */
+	mov	-3(%rsi), %dx
+	mov	-2(%rsi), %cx
+	mov	 %dx, -3(%rdi)
+	mov	 %cx, -2(%rdi)
+	ret
+
+	.p2align 4
+L(write_76bytes):	/* Tail of 76 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-76(%rsi), %xmm0
+	movdqu	-60(%rsi), %xmm1
+	mov	-44(%rsi), %r8
+	mov	-36(%rsi), %r9
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -76(%rdi)	/* Every load is complete before the first store.  */
+	movdqu	 %xmm1, -60(%rdi)
+	mov	 %r8, -44(%rdi)
+	mov	 %r9, -36(%rdi)
+	mov	 %r10, -28(%rdi)
+	mov	 %r11, -20(%rdi)
+	mov	 %rcx, -12(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_68bytes):	/* Tail of 68 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-68(%rsi), %xmm0
+	movdqu	-52(%rsi), %xmm1
+	mov	-36(%rsi), %r9
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -68(%rdi)	/* Every load is complete before the first store.  */
+	movdqu	 %xmm1, -52(%rdi)
+	mov	 %r9, -36(%rdi)
+	mov	 %r10, -28(%rdi)
+	mov	 %r11, -20(%rdi)
+	mov	 %rcx, -12(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_60bytes):	/* Tail of 60 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-60(%rsi), %xmm0
+	mov	-44(%rsi), %r8
+	mov	-36(%rsi), %r9
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -60(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r8, -44(%rdi)
+	mov	 %r9, -36(%rdi)
+	mov	 %r10, -28(%rdi)
+	mov	 %r11, -20(%rdi)
+	mov	 %rcx, -12(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_52bytes):	/* Tail of 52 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-52(%rsi), %xmm0
+	mov	-36(%rsi), %r9
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -52(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r9, -36(%rdi)
+	mov	 %r10, -28(%rdi)
+	mov	 %r11, -20(%rdi)
+	mov	 %rcx, -12(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_44bytes):	/* Tail of 44 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-44(%rsi), %r8
+	mov	-36(%rsi), %r9
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %r8, -44(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r9, -36(%rdi)
+	mov	 %r10, -28(%rdi)
+	mov	 %r11, -20(%rdi)
+	mov	 %rcx, -12(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_36bytes):	/* Tail of 36 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-36(%rsi), %r9
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %r9, -36(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r10, -28(%rdi)
+	mov	 %r11, -20(%rdi)
+	mov	 %rcx, -12(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_28bytes):	/* Tail of 28 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %r10, -28(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r11, -20(%rdi)
+	mov	 %rcx, -12(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_20bytes):	/* Tail of 20 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %r11, -20(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %rcx, -12(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_12bytes):	/* Tail of 12 bytes: an 8-byte and a 4-byte move, loads before stores.  */
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %rcx, -12(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_4bytes):	/* Tail of 4 bytes: one 32-bit move from the end of the source to the end of the destination.  */
+	mov	-4(%rsi), %edx
+	mov	 %edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_77bytes):	/* Tail of 77 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-77(%rsi), %xmm0
+	movdqu	-61(%rsi), %xmm1
+	mov	-45(%rsi), %r8
+	mov	-37(%rsi), %r9
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 5 bytes.  */
+	movdqu	 %xmm0, -77(%rdi)	/* Every load is complete before the first store.  */
+	movdqu	 %xmm1, -61(%rdi)
+	mov	 %r8, -45(%rdi)
+	mov	 %r9, -37(%rdi)
+	mov	 %r10, -29(%rdi)
+	mov	 %r11, -21(%rdi)
+	mov	 %rcx, -13(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_69bytes):	/* Tail of 69 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-69(%rsi), %xmm0
+	movdqu	-53(%rsi), %xmm1
+	mov	-37(%rsi), %r9
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 5 bytes.  */
+	movdqu	 %xmm0, -69(%rdi)	/* Every load is complete before the first store.  */
+	movdqu	 %xmm1, -53(%rdi)
+	mov	 %r9, -37(%rdi)
+	mov	 %r10, -29(%rdi)
+	mov	 %r11, -21(%rdi)
+	mov	 %rcx, -13(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_61bytes):	/* Tail of 61 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-61(%rsi), %xmm0
+	mov	-45(%rsi), %r8
+	mov	-37(%rsi), %r9
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 5 bytes.  */
+	movdqu	 %xmm0, -61(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r8, -45(%rdi)
+	mov	 %r9, -37(%rdi)
+	mov	 %r10, -29(%rdi)
+	mov	 %r11, -21(%rdi)
+	mov	 %rcx, -13(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_53bytes):	/* Tail of 53 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-53(%rsi), %xmm0
+	mov	-45(%rsi), %r8	/* NOTE(review): %r8 is never stored in this block; these 8 bytes lie inside the 16 held in %xmm0, so the load is dead.  */
+	mov	-37(%rsi), %r9
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 5 bytes.  */
+	movdqu	 %xmm0, -53(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r9, -37(%rdi)
+	mov	 %r10, -29(%rdi)
+	mov	 %r11, -21(%rdi)
+	mov	 %rcx, -13(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_45bytes):	/* Tail of 45 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-45(%rsi), %r8
+	mov	-37(%rsi), %r9
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 5 bytes.  */
+	mov	 %r8, -45(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r9, -37(%rdi)
+	mov	 %r10, -29(%rdi)
+	mov	 %r11, -21(%rdi)
+	mov	 %rcx, -13(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_37bytes):	/* Tail of 37 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-37(%rsi), %r9
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 5 bytes.  */
+	mov	 %r9, -37(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r10, -29(%rdi)
+	mov	 %r11, -21(%rdi)
+	mov	 %rcx, -13(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_29bytes):	/* Tail of 29 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 5 bytes.  */
+	mov	 %r10, -29(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r11, -21(%rdi)
+	mov	 %rcx, -13(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_21bytes):	/* Tail of 21 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 5 bytes.  */
+	mov	 %r11, -21(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %rcx, -13(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_13bytes):	/* Tail of 13 bytes: two overlapping 8-byte moves, loads before stores.  */
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 5 bytes.  */
+	mov	 %rcx, -13(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_5bytes):	/* Tail of 5 bytes: two 32-bit moves that overlap by 3 bytes, loads before stores.  */
+	mov	-5(%rsi), %edx
+	mov	-4(%rsi), %ecx
+	mov	 %edx, -5(%rdi)
+	mov	 %ecx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_78bytes):	/* Tail of 78 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-78(%rsi), %xmm0
+	movdqu	-62(%rsi), %xmm1
+	mov	-46(%rsi), %r8
+	mov	-38(%rsi), %r9
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 6 bytes.  */
+	movdqu	 %xmm0, -78(%rdi)	/* Every load is complete before the first store.  */
+	movdqu	 %xmm1, -62(%rdi)
+	mov	 %r8, -46(%rdi)
+	mov	 %r9, -38(%rdi)
+	mov	 %r10, -30(%rdi)
+	mov	 %r11, -22(%rdi)
+	mov	 %rcx, -14(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_70bytes):	/* Tail of 70 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-70(%rsi), %xmm0
+	movdqu	-54(%rsi), %xmm1
+	mov	-38(%rsi), %r9
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 6 bytes.  */
+	movdqu	 %xmm0, -70(%rdi)	/* Every load is complete before the first store.  */
+	movdqu	 %xmm1, -54(%rdi)
+	mov	 %r9, -38(%rdi)
+	mov	 %r10, -30(%rdi)
+	mov	 %r11, -22(%rdi)
+	mov	 %rcx, -14(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_62bytes):	/* Tail of 62 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-62(%rsi), %xmm0
+	mov	-46(%rsi), %r8
+	mov	-38(%rsi), %r9
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 6 bytes.  */
+	movdqu	 %xmm0, -62(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r8, -46(%rdi)
+	mov	 %r9, -38(%rdi)
+	mov	 %r10, -30(%rdi)
+	mov	 %r11, -22(%rdi)
+	mov	 %rcx, -14(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_54bytes):	/* Tail of 54 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-54(%rsi), %xmm0
+	mov	-38(%rsi), %r9
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 6 bytes.  */
+	movdqu	 %xmm0, -54(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r9, -38(%rdi)
+	mov	 %r10, -30(%rdi)
+	mov	 %r11, -22(%rdi)
+	mov	 %rcx, -14(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_46bytes):	/* Tail of 46 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-46(%rsi), %r8
+	mov	-38(%rsi), %r9
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 6 bytes.  */
+	mov	 %r8, -46(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r9, -38(%rdi)
+	mov	 %r10, -30(%rdi)
+	mov	 %r11, -22(%rdi)
+	mov	 %rcx, -14(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_38bytes):	/* Tail of 38 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-38(%rsi), %r9
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 6 bytes.  */
+	mov	 %r9, -38(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r10, -30(%rdi)
+	mov	 %r11, -22(%rdi)
+	mov	 %rcx, -14(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_30bytes):	/* Tail of 30 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 6 bytes.  */
+	mov	 %r10, -30(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r11, -22(%rdi)
+	mov	 %rcx, -14(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_22bytes):	/* Tail of 22 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 6 bytes.  */
+	mov	 %r11, -22(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %rcx, -14(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_14bytes):	/* Tail of 14 bytes: two overlapping 8-byte moves, loads before stores.  */
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 6 bytes.  */
+	mov	 %rcx, -14(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_6bytes):	/* Tail of 6 bytes: two 32-bit moves that overlap by 2 bytes, loads before stores.  */
+	mov	-6(%rsi), %edx
+	mov	-4(%rsi), %ecx
+	mov	 %edx, -6(%rdi)
+	mov	 %ecx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(write_79bytes):	/* Tail of 79 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-79(%rsi), %xmm0
+	movdqu	-63(%rsi), %xmm1
+	mov	-47(%rsi), %r8
+	mov	-39(%rsi), %r9
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 7 bytes.  */
+	movdqu	 %xmm0, -79(%rdi)	/* Every load is complete before the first store.  */
+	movdqu	 %xmm1, -63(%rdi)
+	mov	 %r8, -47(%rdi)
+	mov	 %r9, -39(%rdi)
+	mov	 %r10, -31(%rdi)
+	mov	 %r11, -23(%rdi)
+	mov	 %rcx, -15(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_71bytes):	/* Tail of 71 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-71(%rsi), %xmm0
+	movdqu	-55(%rsi), %xmm1
+	mov	-39(%rsi), %r9
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 7 bytes.  */
+	movdqu	 %xmm0, -71(%rdi)	/* Every load is complete before the first store.  */
+	movdqu	 %xmm1, -55(%rdi)
+	mov	 %r9, -39(%rdi)
+	mov	 %r10, -31(%rdi)
+	mov	 %r11, -23(%rdi)
+	mov	 %rcx, -15(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_63bytes):	/* Tail of 63 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-63(%rsi), %xmm0
+	mov	-47(%rsi), %r8
+	mov	-39(%rsi), %r9
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 7 bytes.  */
+	movdqu	 %xmm0, -63(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r8, -47(%rdi)
+	mov	 %r9, -39(%rdi)
+	mov	 %r10, -31(%rdi)
+	mov	 %r11, -23(%rdi)
+	mov	 %rcx, -15(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_55bytes):	/* Tail of 55 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	movdqu	-55(%rsi), %xmm0
+	mov	-39(%rsi), %r9
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 7 bytes.  */
+	movdqu	 %xmm0, -55(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r9, -39(%rdi)
+	mov	 %r10, -31(%rdi)
+	mov	 %r11, -23(%rdi)
+	mov	 %rcx, -15(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_47bytes):	/* Tail of 47 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-47(%rsi), %r8
+	mov	-39(%rsi), %r9
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 7 bytes.  */
+	mov	 %r8, -47(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r9, -39(%rdi)
+	mov	 %r10, -31(%rdi)
+	mov	 %r11, -23(%rdi)
+	mov	 %rcx, -15(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_39bytes):	/* Tail of 39 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-39(%rsi), %r9
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 7 bytes.  */
+	mov	 %r9, -39(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r10, -31(%rdi)
+	mov	 %r11, -23(%rdi)
+	mov	 %rcx, -15(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_31bytes):	/* Tail of 31 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 7 bytes.  */
+	mov	 %r10, -31(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %r11, -23(%rdi)
+	mov	 %rcx, -15(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_23bytes):	/* Tail of 23 bytes: load them all relative to the end pointer %rsi, then store relative to %rdi.  */
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 7 bytes.  */
+	mov	 %r11, -23(%rdi)	/* Every load is complete before the first store.  */
+	mov	 %rcx, -15(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_15bytes):	/* Tail of 15 bytes: two overlapping 8-byte moves, loads before stores.  */
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx	/* Overlaps the previous load; adds the final 7 bytes.  */
+	mov	 %rcx, -15(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(write_7bytes):	/* Tail of 7 bytes: two 32-bit moves that overlap by 1 byte, loads before stores.  */
+	mov	-7(%rsi), %edx
+	mov	-4(%rsi), %ecx
+	mov	 %edx, -7(%rdi)
+	mov	 %ecx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(large_page_fwd):	/* Forward copy using non-temporal (movntdq) stores, 128 bytes per loop iteration.  */
+	movdqu	(%rsi), %xmm1
+	lea	16(%rsi), %rsi
+	movdqu	%xmm0, (%r8)	/* Store %xmm0 at (%r8); both were set before this block -- TODO confirm their meaning.  */
+	movntdq	%xmm1, (%rdi)	/* movntdq requires a 16-byte aligned destination.  */
+	lea	16(%rdi), %rdi
+	lea	-0x90(%rdx), %rdx	/* 0x90 = the 16 bytes just copied plus a 0x80 bias undone after the loop.  */
+#ifdef USE_AS_MEMMOVE
+	mov	%rsi, %r9
+	sub	%rdi, %r9	/* %r9 = %rsi - %rdi.  */
+	cmp	%rdx, %r9
+	jae	L(memmove_is_memcpy_fwd)	/* Distance >= biased count (unsigned): take the non-temporal loop.  */
+	shl	$2, %rcx	/* %rcx *= 4.  */
+	cmp	%rcx, %rdx
+	jb	L(ll_cache_copy_fwd_start)	/* Biased count < 4 * %rcx: use ordinary cached stores instead.  */
+L(memmove_is_memcpy_fwd):
+#endif
+L(large_page_loop):
+	movdqu	(%rsi), %xmm0	/* All eight loads are issued before any of the stores below.  */
+	movdqu	0x10(%rsi), %xmm1
+	movdqu	0x20(%rsi), %xmm2
+	movdqu	0x30(%rsi), %xmm3
+	movdqu	0x40(%rsi), %xmm4
+	movdqu	0x50(%rsi), %xmm5
+	movdqu	0x60(%rsi), %xmm6
+	movdqu	0x70(%rsi), %xmm7
+	lea	0x80(%rsi), %rsi
+
+	sub	$0x80, %rdx	/* CF set on borrow; the stores and lea below leave flags untouched.  */
+	movntdq	%xmm0, (%rdi)
+	movntdq	%xmm1, 0x10(%rdi)
+	movntdq	%xmm2, 0x20(%rdi)
+	movntdq	%xmm3, 0x30(%rdi)
+	movntdq	%xmm4, 0x40(%rdi)
+	movntdq	%xmm5, 0x50(%rdi)
+	movntdq	%xmm6, 0x60(%rdi)
+	movntdq	%xmm7, 0x70(%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(large_page_loop)	/* No borrow: at least one more 128-byte block remains.  */
+	cmp	$-0x40, %rdx	/* Flags for the jl below; the lea does not disturb them.  */
+	lea	0x80(%rdx), %rdx	/* Remove the 0x80 bias: %rdx = bytes remaining (below 128).  */
+	jl	L(large_page_less_64bytes)	/* Biased count < -64 (signed), i.e. fewer than 64 bytes left.  */
+
+	movdqu	(%rsi), %xmm0
+	movdqu	0x10(%rsi), %xmm1
+	movdqu	0x20(%rsi), %xmm2
+	movdqu	0x30(%rsi), %xmm3
+	lea	0x40(%rsi), %rsi
+
+	movntdq	%xmm0, (%rdi)
+	movntdq	%xmm1, 0x10(%rdi)
+	movntdq	%xmm2, 0x20(%rdi)
+	movntdq	%xmm3, 0x30(%rdi)
+	lea	0x40(%rdi), %rdi
+	sub	$0x40, %rdx
+L(large_page_less_64bytes):
+	add	%rdx, %rsi	/* Point both pointers past the tail; the write_<N>bytes code uses negative offsets.  */
+	add	%rdx, %rdi
+	sfence	/* Order the preceding non-temporal stores before continuing.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)	/* Dispatch on %rdx through table_less_80bytes.  */
+
+#ifdef USE_AS_MEMMOVE
+	.p2align 4
+L(ll_cache_copy_fwd_start):	/* Forward 128-byte loop with ordinary aligned (movaps) stores and prefetcht0; memmove builds only.  */
+	prefetcht0 0x1c0(%rsi)
+	prefetcht0 0x200(%rsi)
+	movdqu	(%rsi), %xmm0	/* All eight loads are issued before any of the stores below.  */
+	movdqu	0x10(%rsi), %xmm1
+	movdqu	0x20(%rsi), %xmm2
+	movdqu	0x30(%rsi), %xmm3
+	movdqu	0x40(%rsi), %xmm4
+	movdqu	0x50(%rsi), %xmm5
+	movdqu	0x60(%rsi), %xmm6
+	movdqu	0x70(%rsi), %xmm7
+	lea	0x80(%rsi), %rsi
+
+	sub	$0x80, %rdx	/* CF set on borrow; the stores and lea below leave flags untouched.  */
+	movaps	%xmm0, (%rdi)
+	movaps	%xmm1, 0x10(%rdi)
+	movaps	%xmm2, 0x20(%rdi)
+	movaps	%xmm3, 0x30(%rdi)
+	movaps	%xmm4, 0x40(%rdi)
+	movaps	%xmm5, 0x50(%rdi)
+	movaps	%xmm6, 0x60(%rdi)
+	movaps	%xmm7, 0x70(%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(ll_cache_copy_fwd_start)	/* No borrow: at least one more 128-byte block remains.  */
+	cmp	$-0x40, %rdx	/* Flags for the jl below; the lea does not disturb them.  */
+	lea	0x80(%rdx), %rdx	/* Remove the 0x80 bias: %rdx = bytes remaining (below 128).  */
+	jl	L(large_page_ll_less_fwd_64bytes)	/* Biased count < -64 (signed), i.e. fewer than 64 bytes left.  */
+
+	movdqu	(%rsi), %xmm0
+	movdqu	0x10(%rsi), %xmm1
+	movdqu	0x20(%rsi), %xmm2
+	movdqu	0x30(%rsi), %xmm3
+	lea	0x40(%rsi), %rsi
+
+	movaps	%xmm0, (%rdi)
+	movaps	%xmm1, 0x10(%rdi)
+	movaps	%xmm2, 0x20(%rdi)
+	movaps	%xmm3, 0x30(%rdi)
+	lea	0x40(%rdi), %rdi
+	sub	$0x40, %rdx
+L(large_page_ll_less_fwd_64bytes):
+	add	%rdx, %rsi	/* Point both pointers past the tail; the write_<N>bytes code uses negative offsets.  */
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)	/* Dispatch on %rdx through table_less_80bytes.  */
+
+#endif
+	.p2align 4
+L(large_page_bwd):	/* Backward copy using non-temporal (movntdq) stores, 128 bytes per loop iteration.  */
+	movdqu	-0x10(%rsi), %xmm1
+	lea	-16(%rsi), %rsi
+	movdqu	%xmm0, (%r8)	/* Store %xmm0 at (%r8); both were set before this block -- TODO confirm their meaning.  */
+	movdqa	%xmm1, -0x10(%rdi)	/* Ordinary aligned store here, whereas the forward variant uses movntdq.  */
+	lea	-16(%rdi), %rdi
+	lea	-0x90(%rdx), %rdx	/* 0x90 = the 16 bytes just copied plus a 0x80 bias undone after the loop.  */
+#ifdef USE_AS_MEMMOVE
+	mov	%rdi, %r9
+	sub	%rsi, %r9	/* %r9 = %rdi - %rsi.  */
+	cmp	%rdx, %r9
+	jae	L(memmove_is_memcpy_bwd)	/* Distance >= biased count (unsigned): take the non-temporal loop.  */
+	cmp	%rcx, %r9	/* NOTE(review): compares the distance with %rcx, while the forward path compares the count with 4 * %rcx -- confirm the asymmetry is intended.  */
+	jb	L(ll_cache_copy_bwd_start)	/* Distance < %rcx: use ordinary cached stores instead.  */
+L(memmove_is_memcpy_bwd):
+#endif
+L(large_page_bwd_loop):
+	movdqu	-0x10(%rsi), %xmm0	/* All eight loads are issued before any of the stores below.  */
+	movdqu	-0x20(%rsi), %xmm1
+	movdqu	-0x30(%rsi), %xmm2
+	movdqu	-0x40(%rsi), %xmm3
+	movdqu	-0x50(%rsi), %xmm4
+	movdqu	-0x60(%rsi), %xmm5
+	movdqu	-0x70(%rsi), %xmm6
+	movdqu	-0x80(%rsi), %xmm7
+	lea	-0x80(%rsi), %rsi
+
+	sub	$0x80, %rdx	/* CF set on borrow; the stores and lea below leave flags untouched.  */
+	movntdq	%xmm0, -0x10(%rdi)
+	movntdq	%xmm1, -0x20(%rdi)
+	movntdq	%xmm2, -0x30(%rdi)
+	movntdq	%xmm3, -0x40(%rdi)
+	movntdq	%xmm4, -0x50(%rdi)
+	movntdq	%xmm5, -0x60(%rdi)
+	movntdq	%xmm6, -0x70(%rdi)
+	movntdq	%xmm7, -0x80(%rdi)
+	lea	-0x80(%rdi), %rdi
+	jae	L(large_page_bwd_loop)	/* No borrow: at least one more 128-byte block remains.  */
+	cmp	$-0x40, %rdx	/* Flags for the jl below; the lea does not disturb them.  */
+	lea	0x80(%rdx), %rdx	/* Remove the 0x80 bias: %rdx = bytes remaining (below 128).  */
+	jl	L(large_page_less_bwd_64bytes)	/* Biased count < -64 (signed), i.e. fewer than 64 bytes left.  */
+
+	movdqu	-0x10(%rsi), %xmm0
+	movdqu	-0x20(%rsi), %xmm1
+	movdqu	-0x30(%rsi), %xmm2
+	movdqu	-0x40(%rsi), %xmm3
+	lea	-0x40(%rsi), %rsi
+
+	movntdq	%xmm0, -0x10(%rdi)
+	movntdq	%xmm1, -0x20(%rdi)
+	movntdq	%xmm2, -0x30(%rdi)
+	movntdq	%xmm3, -0x40(%rdi)
+	lea	-0x40(%rdi), %rdi
+	sub	$0x40, %rdx
+L(large_page_less_bwd_64bytes):
+	sfence	/* Order the preceding non-temporal stores before continuing.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)	/* Dispatch on %rdx; the pointers are used as they stand, with no adjustment.  */
+
+#ifdef USE_AS_MEMMOVE
+	.p2align 4
+L(ll_cache_copy_bwd_start):	/* Backward 128-byte loop with ordinary aligned (movaps) stores and prefetcht0; memmove builds only.  */
+	prefetcht0 -0x1c0(%rsi)
+	prefetcht0 -0x200(%rsi)
+	movdqu	-0x10(%rsi), %xmm0	/* All eight loads are issued before any of the stores below.  */
+	movdqu	-0x20(%rsi), %xmm1
+	movdqu	-0x30(%rsi), %xmm2
+	movdqu	-0x40(%rsi), %xmm3
+	movdqu	-0x50(%rsi), %xmm4
+	movdqu	-0x60(%rsi), %xmm5
+	movdqu	-0x70(%rsi), %xmm6
+	movdqu	-0x80(%rsi), %xmm7
+	lea	-0x80(%rsi), %rsi
+
+	sub	$0x80, %rdx	/* CF set on borrow; the stores and lea below leave flags untouched.  */
+	movaps	%xmm0, -0x10(%rdi)
+	movaps	%xmm1, -0x20(%rdi)
+	movaps	%xmm2, -0x30(%rdi)
+	movaps	%xmm3, -0x40(%rdi)
+	movaps	%xmm4, -0x50(%rdi)
+	movaps	%xmm5, -0x60(%rdi)
+	movaps	%xmm6, -0x70(%rdi)
+	movaps	%xmm7, -0x80(%rdi)
+	lea	-0x80(%rdi), %rdi
+	jae	L(ll_cache_copy_bwd_start)	/* No borrow: at least one more 128-byte block remains.  */
+	cmp	$-0x40, %rdx	/* Flags for the jl below; the lea does not disturb them.  */
+	lea	0x80(%rdx), %rdx	/* Remove the 0x80 bias: %rdx = bytes remaining (below 128).  */
+	jl	L(large_page_ll_less_bwd_64bytes)	/* Biased count < -64 (signed), i.e. fewer than 64 bytes left.  */
+
+	movdqu	-0x10(%rsi), %xmm0
+	movdqu	-0x20(%rsi), %xmm1
+	movdqu	-0x30(%rsi), %xmm2
+	movdqu	-0x40(%rsi), %xmm3
+	lea	-0x40(%rsi), %rsi
+
+	movaps	%xmm0, -0x10(%rdi)
+	movaps	%xmm1, -0x20(%rdi)
+	movaps	%xmm2, -0x30(%rdi)
+	movaps	%xmm3, -0x40(%rdi)
+	lea	-0x40(%rdi), %rdi
+	sub	$0x40, %rdx
+L(large_page_ll_less_bwd_64bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)	/* Dispatch on %rdx; the pointers are used as they stand, with no adjustment.  */
+#endif
+
+END (MEMCPY)
+
+	.section .rodata.ssse3,"a",@progbits	/* Allocatable ("a") section holding the jump tables below.  */
+	.p2align 3	/* Align the table to 8 bytes.  */
+L(table_less_80bytes):	/* 80 32-bit entries, one per L(write_Nbytes) for N = 0..79; used via BRANCH_TO_JMPTBL_ENTRY (..., %rdx, 4).  NOTE(review): JMPTBL is defined outside this view, presumably target minus table base.  */
+	.int	JMPTBL (L(write_0bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_1bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_2bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_3bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_4bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_5bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_6bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_7bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_8bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_9bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_10bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_11bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_12bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_13bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_14bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_15bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_16bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_17bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_18bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_19bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_20bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_21bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_22bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_23bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_24bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_25bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_26bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_27bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_28bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_29bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_30bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_31bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_32bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_33bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_34bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_35bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_36bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_37bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_38bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_39bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_40bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_41bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_42bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_43bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_44bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_45bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_46bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_47bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_48bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_49bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_50bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_51bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_52bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_53bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_54bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_55bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_56bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_57bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_58bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_59bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_60bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_61bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_62bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_63bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_64bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_65bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_66bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_67bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_68bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_69bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_70bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_71bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_72bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_73bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_74bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_75bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_76bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_77bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_78bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_79bytes), L(table_less_80bytes))
+
+	.p2align 3	/* Align the table to 8 bytes.  */
+L(shl_table):	/* 16 32-bit entries, one per L(shl_N) for N = 0..15.  NOTE(review): the lookup site is outside this view; N is presumably a source/destination misalignment in bytes.  */
+	.int	JMPTBL (L(shl_0), L(shl_table))
+	.int	JMPTBL (L(shl_1), L(shl_table))
+	.int	JMPTBL (L(shl_2), L(shl_table))
+	.int	JMPTBL (L(shl_3), L(shl_table))
+	.int	JMPTBL (L(shl_4), L(shl_table))
+	.int	JMPTBL (L(shl_5), L(shl_table))
+	.int	JMPTBL (L(shl_6), L(shl_table))
+	.int	JMPTBL (L(shl_7), L(shl_table))
+	.int	JMPTBL (L(shl_8), L(shl_table))
+	.int	JMPTBL (L(shl_9), L(shl_table))
+	.int	JMPTBL (L(shl_10), L(shl_table))
+	.int	JMPTBL (L(shl_11), L(shl_table))
+	.int	JMPTBL (L(shl_12), L(shl_table))
+	.int	JMPTBL (L(shl_13), L(shl_table))
+	.int	JMPTBL (L(shl_14), L(shl_table))
+	.int	JMPTBL (L(shl_15), L(shl_table))
+
+	.p2align 3	/* Align the table to 8 bytes.  */
+L(shl_table_bwd):	/* 16 32-bit entries, one per L(shl_N_bwd) for N = 0..15; counterpart of L(shl_table) for the _bwd handlers.  */
+	.int	JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
+
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy.S
new file mode 100644
index 0000000000..af2770397c
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy.S
@@ -0,0 +1,75 @@
+/* Multiple versions of memcpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for
+   DSO.  In static binaries we need memcpy before the initialization
+   has happened.  */
+#if defined SHARED && IS_IN (libc)
+	.text
+ENTRY(__new_memcpy)	/* IFUNC resolver: returns in RAX the address of the memcpy variant to use.  */
+	.type	__new_memcpy, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX	/* NOTE(review): HAS_*_FEATURE come from init-arch.h; comments below assume they leave ZF clear when the feature is set -- confirm there.  */
+	lea	__memcpy_erms(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_ERMS)
+	jnz	2f	/* Prefer_ERMS: return __memcpy_erms.  */
+	HAS_ARCH_FEATURE (Prefer_No_AVX512)
+	jnz	1f	/* AVX512 not wanted: try the AVX/SSE variants.  */
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f	/* AVX512F not usable: try the AVX/SSE variants.  */
+	lea	__memcpy_avx512_no_vzeroupper(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	2f	/* Return the no-vzeroupper AVX512 variant.  */
+	lea	__memcpy_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f	/* ERMS available: return the AVX512 unaligned ERMS variant.  */
+	lea	__memcpy_avx512_unaligned(%rip), %RAX_LP
+	ret
+1:	lea	__memcpy_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	L(Fast_Unaligned_Load)	/* Despite the label name, taken when the AVX_Fast_Unaligned_Load test is zero.  */
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f	/* No ERMS: return __memcpy_avx_unaligned.  */
+	lea	__memcpy_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
+	lea	__memcpy_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
+	jz	L(SSSE3)	/* Fast_Unaligned_Copy test is zero: consider the SSSE3 variants.  */
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f	/* No ERMS: return __memcpy_sse2_unaligned.  */
+	lea	__memcpy_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* No SSSE3: RAX still holds __memcpy_sse2_unaligned.  */
+	lea    __memcpy_ssse3_back(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	jnz	2f	/* Fast_Copy_Backward: return __memcpy_ssse3_back.  */
+	lea	__memcpy_ssse3(%rip), %RAX_LP
+2:	ret
+END(__new_memcpy)
+
+# undef memcpy
+# include <shlib-compat.h>
+versioned_symbol (libc, __new_memcpy, memcpy, GLIBC_2_14);	/* Export the resolver as memcpy at symbol version GLIBC_2_14.  */
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcpy_chk.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy_chk.S
new file mode 100644
index 0000000000..8737fb9755
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -0,0 +1,72 @@
+/* Multiple versions of __memcpy_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for
+   DSO.  There are no multiarch memcpy functions for static binaries.
+ */
+#if IS_IN (libc)
+# ifdef SHARED
+	.text
+ENTRY(__memcpy_chk)	/* IFUNC resolver: returns in RAX the address of the __memcpy_chk variant to use.  */
+	.type	__memcpy_chk, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX	/* NOTE(review): HAS_*_FEATURE come from init-arch.h; comments below assume they leave ZF clear when the feature is set -- confirm there.  */
+	HAS_ARCH_FEATURE (Prefer_No_AVX512)
+	jnz	1f	/* AVX512 not wanted: try the AVX/SSE variants.  */
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f	/* AVX512F not usable: try the AVX/SSE variants.  */
+	lea	__memcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	2f	/* Return the no-vzeroupper AVX512 variant.  */
+	lea	__memcpy_chk_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f	/* ERMS available: return the AVX512 unaligned ERMS variant.  */
+	lea	__memcpy_chk_avx512_unaligned(%rip), %RAX_LP
+	ret
+1:	lea	__memcpy_chk_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	L(Fast_Unaligned_Load)	/* Despite the label name, taken when the AVX_Fast_Unaligned_Load test is zero.  */
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f	/* No ERMS: return __memcpy_chk_avx_unaligned.  */
+	lea	__memcpy_chk_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
+	lea	__memcpy_chk_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
+	jz	L(SSSE3)	/* Fast_Unaligned_Copy test is zero: consider the SSSE3 variants.  */
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f	/* No ERMS: return __memcpy_chk_sse2_unaligned.  */
+	lea	__memcpy_chk_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* No SSSE3: RAX still holds __memcpy_chk_sse2_unaligned.  */
+	lea    __memcpy_chk_ssse3_back(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	jnz	2f	/* Fast_Copy_Backward: return __memcpy_chk_ssse3_back.  */
+	lea	__memcpy_chk_ssse3(%rip), %RAX_LP
+2:	ret
+END(__memcpy_chk)
+# else
+#  include "../memcpy_chk.S"	/* Non-SHARED build: use the implementation from the parent directory instead of a resolver.  */
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
new file mode 100644
index 0000000000..e195e93f15
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -0,0 +1,12 @@
+#if IS_IN (libc)	/* Parameterize memmove-vec-unaligned-erms.S for 32-byte ymm vectors.  */
+# define VEC_SIZE	32	/* Bytes per vector register.  */
+# define VEC(i)		ymm##i	/* Name of vector register number i.  */
+# define VMOVNT		vmovntdq	/* Non-temporal vector store.  */
+# define VMOVU		vmovdqu	/* Unaligned vector move.  */
+# define VMOVA		vmovdqa	/* Aligned vector move.  */
+
+# define SECTION(p)		p##.avx	/* Appends ".avx" to a section name.  */
+# define MEMMOVE_SYMBOL(p,s)	p##_avx_##s	/* Builds symbol names of the form <p>_avx_<s>.  */
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
new file mode 100644
index 0000000000..f3ef10577c
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
@@ -0,0 +1,420 @@
+/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+
+# include "asm-syntax.h"
+
+	.section .text.avx512,"ax",@progbits
+# if defined SHARED && !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (__mempcpy_chk_avx512_no_vzeroupper)	/* Checked mempcpy entry.  NOTE(review): RCX is presumably the destination object size -- confirm against callers.  */
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* RCX < RDX (unsigned): branch to __chk_fail.  */
+END (__mempcpy_chk_avx512_no_vzeroupper)	/* No ret above: execution continues into the next entry point.  */
+
+ENTRY (__mempcpy_avx512_no_vzeroupper)	/* mempcpy entry: shares the memmove body below.  */
+	movq	%rdi, %rax
+	addq	%rdx, %rax	/* RAX = RDI + RDX, i.e. destination plus length.  */
+	jmp	L(start)	/* Enter the shared body with RAX already set.  */
+END (__mempcpy_avx512_no_vzeroupper)
+# endif
+
+# ifdef SHARED
+ENTRY (__memmove_chk_avx512_no_vzeroupper)	/* Checked memmove entry.  NOTE(review): RCX is presumably the destination object size -- confirm against callers.  */
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* RCX < RDX (unsigned): branch to __chk_fail.  */
+END (__memmove_chk_avx512_no_vzeroupper)	/* No ret above: execution continues into the following code.  */
+# endif
+
+ENTRY (__memmove_avx512_no_vzeroupper)	/* Copy RDX bytes from RSI to RDI; this part handles entry and sizes up to 512 bytes.  */
+	mov	%rdi, %rax	/* RAX = destination.  */
+# ifdef USE_AS_MEMPCPY
+	add	%rdx, %rax	/* mempcpy build: RAX = destination + length.  */
+# endif
+L(start):
+	lea	(%rsi, %rdx), %rcx	/* RCX = one past the last source byte.  */
+	lea	(%rdi, %rdx), %r9	/* R9 = one past the last destination byte.  */
+	cmp	$512, %rdx
+	ja	L(512bytesormore)
+
+L(check):	/* Also the tail handler: the large-copy paths jump here with the remaining count in RDX.  */
+	cmp	$16, %rdx
+	jbe	L(less_16bytes)
+	cmp	$256, %rdx
+	jb	L(less_256bytes)
+	vmovups	(%rsi), %zmm0	/* 256..512 bytes: load the first and last 256 bytes, then store; the two halves may overlap.  */
+	vmovups 0x40(%rsi), %zmm1
+	vmovups 0x80(%rsi), %zmm2
+	vmovups 0xC0(%rsi), %zmm3
+	vmovups	-0x100(%rcx), %zmm4
+	vmovups -0xC0(%rcx), %zmm5
+	vmovups -0x80(%rcx), %zmm6
+	vmovups -0x40(%rcx), %zmm7
+	vmovups %zmm0, (%rdi)	/* All loads are done before the first store.  */
+	vmovups %zmm1, 0x40(%rdi)
+	vmovups %zmm2, 0x80(%rdi)
+	vmovups %zmm3, 0xC0(%rdi)
+	vmovups	%zmm4, -0x100(%r9)
+	vmovups %zmm5, -0xC0(%r9)
+	vmovups %zmm6, -0x80(%r9)
+	vmovups %zmm7, -0x40(%r9)
+	ret
+
+L(less_256bytes):
+	cmp	$128, %dl	/* RDX < 256 here, so DL holds the whole count.  */
+	jb	L(less_128bytes)
+	vmovups	(%rsi), %zmm0	/* 128..255 bytes: first 128 and last 128.  */
+	vmovups 0x40(%rsi), %zmm1
+	vmovups -0x80(%rcx), %zmm2
+	vmovups -0x40(%rcx), %zmm3
+	vmovups	%zmm0, (%rdi)
+	vmovups %zmm1, 0x40(%rdi)
+	vmovups %zmm2, -0x80(%r9)
+	vmovups %zmm3, -0x40(%r9)
+	ret
+
+L(less_128bytes):
+	cmp	$64, %dl
+	jb	L(less_64bytes)
+	vmovdqu (%rsi), %ymm0	/* 64..127 bytes: first 64 and last 64.  */
+	vmovdqu 0x20(%rsi), %ymm1
+	vmovdqu -0x40(%rcx), %ymm2
+	vmovdqu -0x20(%rcx), %ymm3
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm1, 0x20(%rdi)
+	vmovdqu %ymm2, -0x40(%r9)
+	vmovdqu %ymm3, -0x20(%r9)
+	ret
+
+L(less_64bytes):
+	cmp	$32, %dl
+	jb	L(less_32bytes)
+	vmovdqu	(%rsi), %ymm0	/* 32..63 bytes: first 32 and last 32.  */
+	vmovdqu -0x20(%rcx), %ymm1
+	vmovdqu	%ymm0, (%rdi)
+	vmovdqu	%ymm1, -0x20(%r9)
+	ret
+
+L(less_32bytes):
+	vmovdqu (%rsi), %xmm0	/* 17..31 bytes: first 16 and last 16.  */
+	vmovdqu -0x10(%rcx), %xmm1
+	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmm1, -0x10(%r9)
+	ret
+
+L(less_16bytes):
+	cmp	$8, %dl
+	jb	L(less_8bytes)
+	movq	(%rsi), %rsi	/* 8..16 bytes: first 8 and last 8; RSI and RCX are reused as data registers.  */
+	movq	-0x8(%rcx), %rcx
+	movq	%rsi, (%rdi)
+	movq	%rcx, -0x8(%r9)
+	ret
+
+L(less_8bytes):
+	cmp	$4, %dl
+	jb	L(less_4bytes)
+	mov	(%rsi), %esi	/* 4..7 bytes: first 4 and last 4.  */
+	mov	-0x4(%rcx), %ecx
+	mov	%esi, (%rdi)
+	mov	%ecx, -0x4(%r9)
+	ret
+
+L(less_4bytes):
+	cmp	$2, %dl
+	jb	L(less_2bytes)
+	mov	(%rsi), %si	/* 2..3 bytes: first 2 and last 2.  */
+	mov	-0x2(%rcx), %cx
+	mov	%si, (%rdi)
+	mov	%cx, -0x2(%r9)
+	ret
+
+L(less_2bytes):
+	cmp	$1, %dl
+	jb	L(less_1bytes)	/* Zero bytes: nothing to copy.  */
+	mov	(%rsi), %cl
+	mov	%cl, (%rdi)
+L(less_1bytes):
+	ret
+
+L(512bytesormore):	/* More than 512 bytes: pick the non-temporal path, the 512-byte loop, or the straight 513..1024-byte copy.  */
+# ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %r8
+# else
+	mov	__x86_shared_cache_size_half(%rip), %r8
+# endif
+	cmp	%r8, %rdx
+	jae	L(preloop_large)	/* Length >= R8 (half the shared cache size): use non-temporal stores.  */
+	cmp	$1024, %rdx
+	ja	L(1024bytesormore)
+	prefetcht1 (%rsi)	/* 513..1024 bytes: prefetch, then load the first and last 512 bytes.  */
+	prefetcht1 0x40(%rsi)
+	prefetcht1 0x80(%rsi)
+	prefetcht1 0xC0(%rsi)
+	prefetcht1 0x100(%rsi)
+	prefetcht1 0x140(%rsi)
+	prefetcht1 0x180(%rsi)
+	prefetcht1 0x1C0(%rsi)
+	prefetcht1 -0x200(%rcx)
+	prefetcht1 -0x1C0(%rcx)
+	prefetcht1 -0x180(%rcx)
+	prefetcht1 -0x140(%rcx)
+	prefetcht1 -0x100(%rcx)
+	prefetcht1 -0xC0(%rcx)
+	prefetcht1 -0x80(%rcx)
+	prefetcht1 -0x40(%rcx)
+	vmovups	(%rsi), %zmm0
+	vmovups 0x40(%rsi), %zmm1
+	vmovups 0x80(%rsi), %zmm2
+	vmovups 0xC0(%rsi), %zmm3
+	vmovups	0x100(%rsi), %zmm4
+	vmovups 0x140(%rsi), %zmm5
+	vmovups 0x180(%rsi), %zmm6
+	vmovups 0x1C0(%rsi), %zmm7
+	vmovups	-0x200(%rcx), %zmm8
+	vmovups -0x1C0(%rcx), %zmm9
+	vmovups -0x180(%rcx), %zmm10
+	vmovups -0x140(%rcx), %zmm11
+	vmovups	-0x100(%rcx), %zmm12
+	vmovups -0xC0(%rcx), %zmm13
+	vmovups -0x80(%rcx), %zmm14
+	vmovups -0x40(%rcx), %zmm15
+	vmovups %zmm0, (%rdi)	/* All 16 loads are done before the first store.  */
+	vmovups %zmm1, 0x40(%rdi)
+	vmovups %zmm2, 0x80(%rdi)
+	vmovups %zmm3, 0xC0(%rdi)
+	vmovups %zmm4, 0x100(%rdi)
+	vmovups %zmm5, 0x140(%rdi)
+	vmovups %zmm6, 0x180(%rdi)
+	vmovups %zmm7, 0x1C0(%rdi)
+	vmovups	%zmm8, -0x200(%r9)
+	vmovups %zmm9, -0x1C0(%r9)
+	vmovups %zmm10, -0x180(%r9)
+	vmovups %zmm11, -0x140(%r9)
+	vmovups	%zmm12, -0x100(%r9)
+	vmovups %zmm13, -0xC0(%r9)
+	vmovups %zmm14, -0x80(%r9)
+	vmovups %zmm15, -0x40(%r9)
+	ret
+
+L(1024bytesormore):	/* More than 1024 bytes, below the non-temporal threshold: 512-byte forward loop.  */
+	cmp	%rsi, %rdi
+	ja	L(1024bytesormore_bkw)	/* Destination above source (unsigned): copy backward instead.  */
+	sub	$512, %r9	/* R9 = destination end - 512: loop bound and address of the final block.  */
+	vmovups -0x200(%rcx), %zmm8	/* Load the last 512 source bytes before any store.  */
+	vmovups -0x1C0(%rcx), %zmm9
+	vmovups -0x180(%rcx), %zmm10
+	vmovups -0x140(%rcx), %zmm11
+	vmovups	-0x100(%rcx), %zmm12
+	vmovups -0xC0(%rcx), %zmm13
+	vmovups -0x80(%rcx), %zmm14
+	vmovups -0x40(%rcx), %zmm15
+	prefetcht1 (%rsi)
+	prefetcht1 0x40(%rsi)
+	prefetcht1 0x80(%rsi)
+	prefetcht1 0xC0(%rsi)
+	prefetcht1 0x100(%rsi)
+	prefetcht1 0x140(%rsi)
+	prefetcht1 0x180(%rsi)
+	prefetcht1 0x1C0(%rsi)
+
+/* Loop with unaligned memory access.  */
+L(gobble_512bytes_loop):
+	vmovups	(%rsi), %zmm0
+	vmovups 0x40(%rsi), %zmm1
+	vmovups 0x80(%rsi), %zmm2
+	vmovups 0xC0(%rsi), %zmm3
+	vmovups	0x100(%rsi), %zmm4
+	vmovups 0x140(%rsi), %zmm5
+	vmovups 0x180(%rsi), %zmm6
+	vmovups 0x1C0(%rsi), %zmm7
+	add	$512, %rsi
+	prefetcht1 (%rsi)	/* Prefetch the next 512-byte source block.  */
+	prefetcht1 0x40(%rsi)
+	prefetcht1 0x80(%rsi)
+	prefetcht1 0xC0(%rsi)
+	prefetcht1 0x100(%rsi)
+	prefetcht1 0x140(%rsi)
+	prefetcht1 0x180(%rsi)
+	prefetcht1 0x1C0(%rsi)
+	vmovups	%zmm0, (%rdi)
+	vmovups %zmm1, 0x40(%rdi)
+	vmovups %zmm2, 0x80(%rdi)
+	vmovups %zmm3, 0xC0(%rdi)
+	vmovups	%zmm4, 0x100(%rdi)
+	vmovups %zmm5, 0x140(%rdi)
+	vmovups %zmm6, 0x180(%rdi)
+	vmovups %zmm7, 0x1C0(%rdi)
+	add	$512, %rdi
+	cmp	%r9, %rdi
+	jb	L(gobble_512bytes_loop)	/* Continue while RDI is below destination end - 512.  */
+	vmovups %zmm8, (%r9)	/* Store the saved last 512 bytes; they may overlap what the loop wrote.  */
+	vmovups %zmm9, 0x40(%r9)
+	vmovups %zmm10, 0x80(%r9)
+	vmovups %zmm11, 0xC0(%r9)
+	vmovups %zmm12, 0x100(%r9)
+	vmovups %zmm13, 0x140(%r9)
+	vmovups %zmm14, 0x180(%r9)
+	vmovups %zmm15, 0x1C0(%r9)
+	ret
+
+L(1024bytesormore_bkw):	/* Backward counterpart of the 512-byte loop, used when the destination is above the source.  */
+	add	$512, %rdi	/* RDI = destination start + 512: loop bound and end of the first block.  */
+	vmovups	0x1C0(%rsi), %zmm8	/* Load the first 512 source bytes before any store.  */
+	vmovups 0x180(%rsi), %zmm9
+	vmovups 0x140(%rsi), %zmm10
+	vmovups 0x100(%rsi), %zmm11
+	vmovups	0xC0(%rsi), %zmm12
+	vmovups 0x80(%rsi), %zmm13
+	vmovups 0x40(%rsi), %zmm14
+	vmovups (%rsi), %zmm15
+	prefetcht1 -0x40(%rcx)
+	prefetcht1 -0x80(%rcx)
+	prefetcht1 -0xC0(%rcx)
+	prefetcht1 -0x100(%rcx)
+	prefetcht1 -0x140(%rcx)
+	prefetcht1 -0x180(%rcx)
+	prefetcht1 -0x1C0(%rcx)
+	prefetcht1 -0x200(%rcx)
+
+/* Backward loop with unaligned memory access.  */
+L(gobble_512bytes_loop_bkw):
+	vmovups -0x40(%rcx), %zmm0
+	vmovups -0x80(%rcx), %zmm1
+	vmovups -0xC0(%rcx), %zmm2
+	vmovups	-0x100(%rcx), %zmm3
+	vmovups -0x140(%rcx), %zmm4
+	vmovups -0x180(%rcx), %zmm5
+	vmovups -0x1C0(%rcx), %zmm6
+	vmovups	-0x200(%rcx), %zmm7
+	sub	$512, %rcx
+	prefetcht1 -0x40(%rcx)	/* Prefetch the next (lower) 512-byte source block.  */
+	prefetcht1 -0x80(%rcx)
+	prefetcht1 -0xC0(%rcx)
+	prefetcht1 -0x100(%rcx)
+	prefetcht1 -0x140(%rcx)
+	prefetcht1 -0x180(%rcx)
+	prefetcht1 -0x1C0(%rcx)
+	prefetcht1 -0x200(%rcx)
+	vmovups %zmm0, -0x40(%r9)
+	vmovups %zmm1, -0x80(%r9)
+	vmovups %zmm2, -0xC0(%r9)
+	vmovups	%zmm3, -0x100(%r9)
+	vmovups %zmm4, -0x140(%r9)
+	vmovups %zmm5, -0x180(%r9)
+	vmovups %zmm6, -0x1C0(%r9)
+	vmovups	%zmm7, -0x200(%r9)
+	sub	$512, %r9
+	cmp	%rdi, %r9
+	ja	L(gobble_512bytes_loop_bkw)	/* Continue while R9 is above destination start + 512.  */
+	vmovups %zmm8, -0x40(%rdi)	/* Store the saved first 512 bytes at the original destination start.  */
+	vmovups %zmm9, -0x80(%rdi)
+	vmovups %zmm10, -0xC0(%rdi)
+	vmovups %zmm11, -0x100(%rdi)
+	vmovups %zmm12, -0x140(%rdi)
+	vmovups %zmm13, -0x180(%rdi)
+	vmovups %zmm14, -0x1C0(%rdi)
+	vmovups %zmm15, -0x200(%rdi)
+	ret
+
+L(preloop_large):	/* Length >= half the shared cache size: copy with non-temporal stores.  */
+	cmp	%rsi, %rdi
+	ja	L(preloop_large_bkw)	/* Destination above source (unsigned): copy backward instead.  */
+	vmovups	(%rsi), %zmm4	/* Save the first 128 source bytes; they are stored after the loop.  */
+	vmovups	0x40(%rsi), %zmm5
+	mov	%rdi, %r11	/* Save the original destination: RAX is destination + length when entered from mempcpy.  */
+/* Align destination for access with non-temporal stores in the loop.  */
+	mov	%rdi, %r8
+	and	$-0x80, %rdi
+	add	$0x80, %rdi	/* RDI = next 128-byte boundary above the original destination.  */
+	sub	%rdi, %r8	/* R8 = original - aligned destination (negative skip).  */
+	sub	%r8, %rsi	/* Advance the source by the skipped bytes.  */
+	add	%r8, %rdx	/* Reduce the remaining length by the skipped bytes.  */
+L(gobble_256bytes_nt_loop):
+	prefetcht1 0x200(%rsi)
+	prefetcht1 0x240(%rsi)
+	prefetcht1 0x280(%rsi)
+	prefetcht1 0x2C0(%rsi)
+	prefetcht1 0x300(%rsi)
+	prefetcht1 0x340(%rsi)
+	prefetcht1 0x380(%rsi)
+	prefetcht1 0x3C0(%rsi)
+	vmovdqu64 (%rsi), %zmm0
+	vmovdqu64 0x40(%rsi), %zmm1
+	vmovdqu64 0x80(%rsi), %zmm2
+	vmovdqu64 0xC0(%rsi), %zmm3
+	vmovntdq %zmm0, (%rdi)
+	vmovntdq %zmm1, 0x40(%rdi)
+	vmovntdq %zmm2, 0x80(%rdi)
+	vmovntdq %zmm3, 0xC0(%rdi)
+	sub	$256, %rdx
+	add	$256, %rsi
+	add	$256, %rdi
+	cmp	$256, %rdx
+	ja	L(gobble_256bytes_nt_loop)	/* Loop while more than 256 bytes remain.  */
+	sfence	/* Order the non-temporal stores before the ordinary stores below.  */
+	vmovups	%zmm4, (%r11)	/* Store the saved head at the original destination, not through RAX.  */
+	vmovups	%zmm5, 0x40(%r11)
+	jmp	L(check)	/* Copy the remaining tail (at most 256 bytes).  */
+
+L(preloop_large_bkw):	/* Backward non-temporal copy for large lengths when the destination is above the source.  */
+	vmovups -0x80(%rcx), %zmm4	/* Save the last 128 source bytes; they are stored after the loop.  */
+	vmovups -0x40(%rcx), %zmm5
+
+/* Align end of destination for access with non-temporal stores.  */
+	mov	%r9, %r8
+	and	$-0x80, %r9	/* R9 = destination end rounded down to a 128-byte boundary.  */
+	sub	%r9, %r8	/* R8 = number of bytes above that boundary.  */
+	sub	%r8, %rcx	/* Move the source end down by the same amount.  */
+	sub	%r8, %rdx	/* Reduce the remaining length accordingly.  */
+	add	%r9, %r8	/* R8 = original destination end again.  */
+L(gobble_256bytes_nt_loop_bkw):
+	prefetcht1 -0x400(%rcx)
+	prefetcht1 -0x3C0(%rcx)
+	prefetcht1 -0x380(%rcx)
+	prefetcht1 -0x340(%rcx)
+	prefetcht1 -0x300(%rcx)
+	prefetcht1 -0x2C0(%rcx)
+	prefetcht1 -0x280(%rcx)
+	prefetcht1 -0x240(%rcx)
+	vmovdqu64 -0x100(%rcx), %zmm0
+	vmovdqu64 -0xC0(%rcx), %zmm1
+	vmovdqu64 -0x80(%rcx), %zmm2
+	vmovdqu64 -0x40(%rcx), %zmm3
+	vmovntdq %zmm0,	-0x100(%r9)
+	vmovntdq %zmm1,	-0xC0(%r9)
+	vmovntdq %zmm2,	-0x80(%r9)
+	vmovntdq %zmm3,	-0x40(%r9)
+	sub	$256, %rdx
+	sub	$256, %rcx
+	sub	$256, %r9
+	cmp	$256, %rdx
+	ja	L(gobble_256bytes_nt_loop_bkw)	/* Loop while more than 256 bytes remain.  */
+	sfence	/* Order the non-temporal stores before the ordinary stores below.  */
+	vmovups	%zmm4, -0x80(%r8)	/* Store the saved tail just below the original destination end.  */
+	vmovups	%zmm5, -0x40(%r8)
+	jmp	L(check)	/* Copy the remaining head (at most 256 bytes); RSI and RDI are unchanged on this path.  */
+END (__memmove_avx512_no_vzeroupper)
+
+# ifdef SHARED
+strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)	/* The memcpy name is an alias of the memmove implementation.  */
+strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)	/* Likewise for the checked entry point.  */
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
new file mode 100644
index 0000000000..aac1515cf6
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -0,0 +1,12 @@
+#if IS_IN (libc)	/* Parameterize memmove-vec-unaligned-erms.S for 64-byte zmm vectors.  */
+# define VEC_SIZE	64	/* Bytes per vector register.  */
+# define VEC(i)		zmm##i	/* Name of vector register number i.  */
+# define VMOVNT		vmovntdq	/* Non-temporal vector store.  */
+# define VMOVU		vmovdqu64	/* Unaligned vector move.  */
+# define VMOVA		vmovdqa64	/* Aligned vector move.  */
+
+# define SECTION(p)		p##.avx512	/* Appends ".avx512" to a section name.  */
+# define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s	/* Builds symbol names of the form <p>_avx512_<s>.  */
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
new file mode 100644
index 0000000000..f9a4e9aff9
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE	/* Build the included source in its memmove configuration.  */
+#define MEMCPY		__memmove_ssse3_back	/* NOTE(review): presumably the entry-point name used by the included file -- confirm there.  */
+#define MEMCPY_CHK	__memmove_chk_ssse3_back	/* NOTE(review): presumably the checked entry-point name -- confirm there.  */
+#include "memcpy-ssse3-back.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3.S
new file mode 100644
index 0000000000..295430b1ef
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE	/* Build the included source in its memmove configuration.  */
+#define MEMCPY		__memmove_ssse3	/* NOTE(review): presumably the entry-point name used by the included file -- confirm there.  */
+#define MEMCPY_CHK	__memmove_chk_ssse3	/* NOTE(review): presumably the checked entry-point name -- confirm there.  */
+#include "memcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
new file mode 100644
index 0000000000..dee3ec529c
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -0,0 +1,553 @@
+/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* memmove/memcpy/mempcpy is implemented as:
+   1. Use overlapping load and store to avoid branch.
+   2. Load all sources into registers and store them together to avoid
+      possible address overlap between source and destination.
+   3. If size is 8 * VEC_SIZE or less, load all sources into registers
+      and store them together.
+   4. If address of destination > address of source, backward copy
+      4 * VEC_SIZE at a time with unaligned load and aligned store.
+      Load the first 4 * VEC and last VEC before the loop and store
+      them after the loop to support overlapping addresses.
+   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
+      load and aligned store.  Load the last 4 * VEC and first VEC
+      before the loop and store them after the loop to support
+      overlapping addresses.
+   6. If size >= __x86_shared_non_temporal_threshold and there is no
+      overlap between destination and source, use non-temporal store
+      instead of aligned store.  */
+
+#include <sysdep.h>
+
+#ifndef MEMCPY_SYMBOL
+# define MEMCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
+#endif
+
+#ifndef MEMPCPY_SYMBOL
+# define MEMPCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
+#endif
+
+#ifndef MEMMOVE_CHK_SYMBOL
+# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
+#endif
+
+#ifndef VZEROUPPER
+# if VEC_SIZE > 16
+#  define VZEROUPPER vzeroupper
+# else
+#  define VZEROUPPER
+# endif
+#endif
+
+/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
+   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
+   memcpy micro benchmark in glibc shows that 2KB is the approximate
+   value above which REP MOVSB becomes faster than SSE2 optimization
+   on processors with Enhanced REP MOVSB.  Since larger register size
+   can move more data with a single load and store, the threshold is
+   higher with larger register size.  */
+#ifndef REP_MOVSB_THRESHOLD
+# define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
+#endif
+
+#ifndef PREFETCH
+# define PREFETCH(addr) prefetcht0 addr
+#endif
+
+/* Assume 64-byte prefetch size.  */
+#ifndef PREFETCH_SIZE
+# define PREFETCH_SIZE 64
+#endif
+
+#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
+
+/* Prefetch each PREFETCH_SIZE line of one load set; dir is 1 or -1.  */
+#if PREFETCH_SIZE == 64
+# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
+#  define PREFETCH_ONE_SET(dir, base, offset) \
+	PREFETCH ((offset)base)
+# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
+#  define PREFETCH_ONE_SET(dir, base, offset) \
+	PREFETCH ((offset)base); \
+	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
+# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
+#  define PREFETCH_ONE_SET(dir, base, offset) \
+	PREFETCH ((offset)base); \
+	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
+	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
+	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
+# else
+#  error Unsupported PREFETCHED_LOAD_SIZE!
+# endif
+#else
+# error Unsupported PREFETCH_SIZE!
+#endif
+
+#ifndef SECTION
+# error SECTION is not defined!
+#endif
+
+	.section SECTION(.text),"ax",@progbits
+#if defined SHARED && IS_IN (libc)
+ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
+	cmpq	%rdx, %rcx	/* %rcx: presumably the dst object size.  */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* Length %rdx exceeds %rcx.  */
+END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))	/* No ret: meant to fall through.  */
+#endif
+
+#if VEC_SIZE == 16 || defined SHARED
+ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
+	movq	%rdi, %rax
+	addq	%rdx, %rax	/* Return value: dst + length.  */
+	jmp	L(start)	/* Join memmove after it has set %rax.  */
+END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
+#endif
+
+#if defined SHARED && IS_IN (libc)
+ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
+#endif
+
+ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
+	movq	%rdi, %rax
+L(start):
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+#if !defined USE_MULTIARCH || !IS_IN (libc)
+L(last_2x_vec):
+#endif
+	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VZEROUPPER
+#if !defined USE_MULTIARCH || !IS_IN (libc)
+L(nop):
+#endif
+	ret
+#if defined USE_MULTIARCH && IS_IN (libc)
+END (MEMMOVE_SYMBOL (__memmove, unaligned))
+
+# if VEC_SIZE == 16
+#  if defined SHARED
+/* Only used to measure performance of REP MOVSB.  */
+ENTRY (__mempcpy_erms)
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(start_movsb)
+END (__mempcpy_erms)
+#  endif
+
+ENTRY (__memmove_erms)	/* memmove done purely with REP MOVSB.  */
+	movq	%rdi, %rax	/* Return value: destination.  */
+L(start_movsb):
+	movq	%rdx, %rcx	/* REP count = length.  */
+	cmpq	%rsi, %rdi
+	jb	1f	/* dst < src: forward copy is safe.  */
+	/* Source == destination is less common.  */
+	je	2f
+	leaq	(%rsi,%rcx), %rdx	/* %rdx = end of source.  */
+	cmpq	%rdx, %rdi
+	jb	L(movsb_backward)	/* dst inside source: copy backward.  */
+1:
+	rep movsb
+2:
+	ret
+L(movsb_backward):
+	leaq	-1(%rdi,%rcx), %rdi	/* Last byte of destination.  */
+	leaq	-1(%rsi,%rcx), %rsi	/* Last byte of source.  */
+	std	/* DF = 1: MOVSB decrements %rsi and %rdi.  */
+	rep movsb
+	cld	/* Restore DF = 0 before returning.  */
+	ret
+END (__memmove_erms)
+#  if defined SHARED
+strong_alias (__memmove_erms, __memcpy_erms)
+#  endif
+# endif
+
+# ifdef SHARED
+ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
+
+ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(start_erms)
+END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+
+ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+	movq	%rdi, %rax
+L(start_erms):
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(movsb_more_2x_vec)
+L(last_2x_vec):
+	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE. */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+L(return):
+	VZEROUPPER
+	ret
+
+L(movsb):	/* Reached with size > REP_MOVSB_THRESHOLD.  */
+	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+	jae	L(more_8x_vec)	/* At/above threshold: use the vector loops.  */
+	cmpq	%rsi, %rdi
+	jb	1f	/* dst < src: forward REP MOVSB is safe.  */
+	/* Source == destination is less common.  */
+	je	L(nop)
+	leaq	(%rsi,%rdx), %r9	/* %r9 = end of source.  */
+	cmpq	%r9, %rdi
+	/* Avoid slow backward REP MOVSB.  */
+# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)	/* Presumably the jb target needs > 8 * VEC_SIZE.  */
+#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
+# endif
+	jb	L(more_8x_vec_backward)	/* dst inside source: vector backward copy.  */
+1:
+	movq	%rdx, %rcx	/* REP count = length.  */
+	rep movsb
+L(nop):
+	ret
+#endif
+
+L(less_vec):
+	/* Less than 1 VEC, so the size (< 64) fits in %dl.  */
+#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+# error Unsupported VEC_SIZE!
+#endif
+#if VEC_SIZE > 32
+	cmpb	$32, %dl
+	jae	L(between_32_63)
+#endif
+#if VEC_SIZE > 16
+	cmpb	$16, %dl
+	jae	L(between_16_31)
+#endif
+	cmpb	$8, %dl
+	jae	L(between_8_15)
+	cmpb	$4, %dl
+	jae	L(between_4_7)
+	cmpb	$1, %dl	/* One compare separates sizes 0, 1 and 2..3.  */
+	ja	L(between_2_3)
+	jb	1f	/* Size 0: nothing to copy.  */
+	movzbl	(%rsi), %ecx	/* Size 1: copy the single byte.  */
+	movb	%cl, (%rdi)
+1:
+	ret
+#if VEC_SIZE > 32
+L(between_32_63):
+	/* From 32 to 63.  No branch when size == 32.  */
+	vmovdqu	(%rsi), %ymm0
+	vmovdqu	-32(%rsi,%rdx), %ymm1
+	vmovdqu	%ymm0, (%rdi)
+	vmovdqu	%ymm1, -32(%rdi,%rdx)
+	VZEROUPPER
+	ret
+#endif
+#if VEC_SIZE > 16
+	/* From 16 to 31.  No branch when size == 16.  */
+L(between_16_31):
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	-16(%rsi,%rdx), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -16(%rdi,%rdx)
+	ret
+#endif
+L(between_8_15):
+	/* From 8 to 15.  No branch when size == 8.  */
+	movq	-8(%rsi,%rdx), %rcx
+	movq	(%rsi), %rsi
+	movq	%rcx, -8(%rdi,%rdx)
+	movq	%rsi, (%rdi)
+	ret
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	-4(%rsi,%rdx), %ecx
+	movl	(%rsi), %esi
+	movl	%ecx, -4(%rdi,%rdx)
+	movl	%esi, (%rdi)
+	ret
+L(between_2_3):
+	/* From 2 to 3.  No branch when size == 2.  */
+	movzwl	-2(%rsi,%rdx), %ecx
+	movzwl	(%rsi), %esi
+	movw	%cx, -2(%rdi,%rdx)
+	movw	%si, (%rdi)
+	ret
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+L(movsb_more_2x_vec):
+	cmpq	$REP_MOVSB_THRESHOLD, %rdx
+	ja	L(movsb)
+#endif
+L(more_2x_vec):
+	/* More than 2 * VEC and there may be overlap between destination
+	   and source.  */
+	cmpq	$(VEC_SIZE * 8), %rdx
+	ja	L(more_8x_vec)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jb	L(last_4x_vec)
+	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+	VZEROUPPER
+	ret
+L(last_4x_vec):
+	/* Copy from 2 * VEC to 4 * VEC. */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VZEROUPPER
+	ret
+
+L(more_8x_vec):
+	cmpq	%rsi, %rdi
+	ja	L(more_8x_vec_backward)	/* dst > src: copy backward.  */
+	/* Source == destination is less common.  */
+	je	L(nop)
+	/* Load the first VEC and last 4 * VEC to support overlapping
+	   addresses.  */
+	VMOVU	(%rsi), %VEC(4)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+	/* Save start and stop of the destination buffer.  */
+	movq	%rdi, %r11
+	leaq	-VEC_SIZE(%rdi, %rdx), %rcx	/* Address of the last dst VEC.  */
+	/* Align destination for aligned stores in the loop.  Compute
+	   how much destination is misaligned.  */
+	movq	%rdi, %r8
+	andq	$(VEC_SIZE - 1), %r8
+	/* Get the negative of offset for alignment.  */
+	subq	$VEC_SIZE, %r8	/* %r8 is now in [-VEC_SIZE, -1].  */
+	/* Adjust source.  */
+	subq	%r8, %rsi
+	/* Adjust destination which should be aligned now.  */
+	subq	%r8, %rdi
+	/* Adjust length.  */
+	addq	%r8, %rdx	/* Skipped head is stored later from VEC(4).  */
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+	/* Check non-temporal store threshold.  */
+	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+	ja	L(large_forward)
+#endif
+L(loop_4x_vec_forward):
+	/* Copy 4 * VEC at a time forward.  */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	addq	$(VEC_SIZE * 4), %rsi
+	subq	$(VEC_SIZE * 4), %rdx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	addq	$(VEC_SIZE * 4), %rdi
+	cmpq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_4x_vec_forward)
+	/* Store the last 4 * VEC.  */
+	VMOVU	%VEC(5), (%rcx)
+	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
+	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+	/* Store the first VEC.  */
+	VMOVU	%VEC(4), (%r11)
+	VZEROUPPER
+	ret
+
+L(more_8x_vec_backward):
+	/* Load the first 4 * VEC and last VEC to support overlapping
+	   addresses.  */
+	VMOVU	(%rsi), %VEC(4)
+	VMOVU	VEC_SIZE(%rsi), %VEC(5)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
+	/* Save stop of the destination buffer.  */
+	leaq	-VEC_SIZE(%rdi, %rdx), %r11
+	/* Align destination end for aligned stores in the loop.  Compute
+	   how much destination end is misaligned.  */
+	leaq	-VEC_SIZE(%rsi, %rdx), %rcx	/* Address of the last src VEC.  */
+	movq	%r11, %r9
+	movq	%r11, %r8
+	andq	$(VEC_SIZE - 1), %r8	/* Misalignment of the last dst VEC.  */
+	/* Adjust source.  */
+	subq	%r8, %rcx
+	/* Adjust the end of destination which should be aligned now.  */
+	subq	%r8, %r9
+	/* Adjust length.  */
+	subq	%r8, %rdx	/* Skipped tail is stored later from VEC(8).  */
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+	/* Check non-temporal store threshold.  */
+	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+	ja	L(large_backward)
+#endif
+L(loop_4x_vec_backward):
+	/* Copy 4 * VEC at a time backward.  */
+	VMOVU	(%rcx), %VEC(0)
+	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
+	subq	$(VEC_SIZE * 4), %rcx
+	subq	$(VEC_SIZE * 4), %rdx
+	VMOVA	%VEC(0), (%r9)
+	VMOVA	%VEC(1), -VEC_SIZE(%r9)
+	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
+	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
+	subq	$(VEC_SIZE * 4), %r9
+	cmpq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_4x_vec_backward)
+	/* Store the first 4 * VEC.  */
+	VMOVU	%VEC(4), (%rdi)
+	VMOVU	%VEC(5), VEC_SIZE(%rdi)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+	/* Store the last VEC.  */
+	VMOVU	%VEC(8), (%r11)
+	VZEROUPPER
+	ret
+
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+L(large_forward):
+	/* Don't use non-temporal store if there is overlap between
+	   destination and source since destination may be in cache
+	   when source is loaded.  */
+	leaq    (%rdi, %rdx), %r10
+	cmpq    %r10, %rsi
+	jb	L(loop_4x_vec_forward)
+L(loop_large_forward):
+	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	addq	$PREFETCHED_LOAD_SIZE, %rsi
+	subq	$PREFETCHED_LOAD_SIZE, %rdx
+	VMOVNT	%VEC(0), (%rdi)
+	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
+	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	addq	$PREFETCHED_LOAD_SIZE, %rdi
+	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
+	ja	L(loop_large_forward)
+	sfence
+	/* Store the last 4 * VEC.  */
+	VMOVU	%VEC(5), (%rcx)
+	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
+	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+	/* Store the first VEC.  */
+	VMOVU	%VEC(4), (%r11)
+	VZEROUPPER
+	ret
+
+L(large_backward):
+	/* Don't use non-temporal store if there is overlap between
+	   destination and source since destination may be in cache
+	   when source is loaded.  */
+	leaq    (%rcx, %rdx), %r10
+	cmpq    %r10, %r9
+	jb	L(loop_4x_vec_backward)
+L(loop_large_backward):
+	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
+	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
+	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
+	VMOVU	(%rcx), %VEC(0)
+	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
+	subq	$PREFETCHED_LOAD_SIZE, %rcx
+	subq	$PREFETCHED_LOAD_SIZE, %rdx
+	VMOVNT	%VEC(0), (%r9)
+	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
+	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
+	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
+	subq	$PREFETCHED_LOAD_SIZE, %r9
+	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
+	ja	L(loop_large_backward)
+	sfence
+	/* Store the first 4 * VEC.  */
+	VMOVU	%VEC(4), (%rdi)
+	VMOVU	%VEC(5), VEC_SIZE(%rdi)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+	/* Store the last VEC.  */
+	VMOVU	%VEC(8), (%r11)
+	VZEROUPPER
+	ret
+#endif
+END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+
+#ifdef SHARED
+# if IS_IN (libc)
+#  ifdef USE_MULTIARCH
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
+	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
+strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
+	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
+#  endif
+strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
+	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
+# endif
+#endif
+#if VEC_SIZE == 16 || defined SHARED
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
+	      MEMCPY_SYMBOL (__memcpy, unaligned))
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove.S
new file mode 100644
index 0000000000..8c534e83e0
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove.S
@@ -0,0 +1,101 @@
+/* Multiple versions of memmove
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__libc_memmove)	/* IFUNC resolver: address of the choice in %RAX_LP.  */
+	.type	__libc_memmove, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	lea	__memmove_erms(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_ERMS)	/* Presumably ZF = 0 iff the bit is set.  */
+	jnz	2f	/* Prefer_ERMS: plain REP MOVSB version.  */
+	HAS_ARCH_FEATURE (Prefer_No_AVX512)
+	jnz	1f	/* AVX512 not wanted ...  */
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f	/* ... or not usable: try AVX.  */
+	lea	__memmove_avx512_no_vzeroupper(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	2f
+	lea	__memmove_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__memmove_avx512_unaligned(%rip), %RAX_LP
+	ret
+1:	lea	__memmove_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	L(Fast_Unaligned_Load)	/* Bit clear: fall back to SSE2/SSSE3.  */
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f	/* No ERMS: keep the non-ERMS AVX version.  */
+	lea	__memmove_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
+	lea	__memmove_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
+	jz	L(SSSE3)
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f	/* No ERMS: keep the non-ERMS SSE2 version.  */
+	lea	__memmove_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* No SSSE3: stay with __memmove_sse2_unaligned.  */
+	lea    __memmove_ssse3_back(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	jnz	2f
+	lea	__memmove_ssse3(%rip), %RAX_LP
+2:	ret
+END(__libc_memmove)
+#endif
+
+#if IS_IN (libc)
+# define MEMMOVE_SYMBOL(p,s)	p##_sse2_##s
+
+# ifdef SHARED
+libc_hidden_ver (__memmove_sse2_unaligned, memmove)
+libc_hidden_ver (__memcpy_sse2_unaligned, memcpy)
+libc_hidden_ver (__mempcpy_sse2_unaligned, mempcpy)
+libc_hidden_ver (__mempcpy_sse2_unaligned, __mempcpy)
+
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memmove calls through a PLT.
+   The speedup we get from using SSE2 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def
+# endif
+strong_alias (__libc_memmove, memmove)
+#endif
+
+#if !defined SHARED || !IS_IN (libc)
+weak_alias (__mempcpy, mempcpy)
+#endif
+
+#include "../memmove.S"
+
+#if defined SHARED && IS_IN (libc)
+# include <shlib-compat.h>
+# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
+/* Use __memmove_sse2_unaligned to support overlapping addresses.  */
+compat_symbol (libc, __memmove_sse2_unaligned, memcpy, GLIBC_2_2_5);
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove_chk.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove_chk.S
new file mode 100644
index 0000000000..7870dd0247
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove_chk.S
@@ -0,0 +1,71 @@
+/* Multiple versions of __memmove_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  There are no multiarch memmove functions for static binaries.
+ */
+#if IS_IN (libc)
+# ifdef SHARED
+	.text
+ENTRY(__memmove_chk)	/* IFUNC resolver: address of the choice in %RAX_LP.  */
+	.type	__memmove_chk, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_ARCH_FEATURE (Prefer_No_AVX512)	/* Presumably ZF = 0 iff the bit is set.  */
+	jnz	1f	/* AVX512 not wanted ...  */
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f	/* ... or not usable: try AVX.  */
+	lea	__memmove_chk_avx512_no_vzeroupper(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	2f
+	lea	__memmove_chk_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__memmove_chk_avx512_unaligned(%rip), %RAX_LP
+	ret
+1:	lea	__memmove_chk_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	L(Fast_Unaligned_Load)	/* Bit clear: fall back to SSE2/SSSE3.  */
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f	/* No ERMS: keep the non-ERMS AVX version.  */
+	lea	__memmove_chk_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
+	lea	__memmove_chk_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
+	jz	L(SSSE3)
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f	/* No ERMS: keep the non-ERMS SSE2 version.  */
+	lea	__memmove_chk_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* No SSSE3: stay with the SSE2 unaligned version.  */
+	lea    __memmove_chk_ssse3_back(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	jnz	2f
+	lea	__memmove_chk_ssse3(%rip), %RAX_LP
+2:	ret
+END(__memmove_chk)
+# else
+#  include "../memmove_chk.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy.S
new file mode 100644
index 0000000000..b8b2b28094
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy.S
@@ -0,0 +1,73 @@
+/* Multiple versions of mempcpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  In static binaries we need mempcpy before initialization
+   has happened.  */
+#if defined SHARED && IS_IN (libc)
+	.text
+ENTRY(__mempcpy)	/* IFUNC resolver: address of the choice in %RAX_LP.  */
+	.type	__mempcpy, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	lea	__mempcpy_erms(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_ERMS)	/* Presumably ZF = 0 iff the bit is set.  */
+	jnz	2f	/* Prefer_ERMS: plain REP MOVSB version.  */
+	HAS_ARCH_FEATURE (Prefer_No_AVX512)
+	jnz	1f	/* AVX512 not wanted ...  */
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f	/* ... or not usable: try AVX.  */
+	lea	__mempcpy_avx512_no_vzeroupper(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	2f
+	lea	__mempcpy_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__mempcpy_avx512_unaligned(%rip), %RAX_LP
+	ret
+1:	lea	__mempcpy_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	L(Fast_Unaligned_Load)	/* Bit clear: fall back to SSE2/SSSE3.  */
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f	/* No ERMS: keep the non-ERMS AVX version.  */
+	lea	__mempcpy_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
+	lea	__mempcpy_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
+	jz	L(SSSE3)
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f	/* No ERMS: keep the non-ERMS SSE2 version.  */
+	lea	__mempcpy_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* No SSSE3: stay with __mempcpy_sse2_unaligned.  */
+	lea    __mempcpy_ssse3_back(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	jnz	2f
+	lea	__mempcpy_ssse3(%rip), %RAX_LP
+2:	ret
+END(__mempcpy)
+
+weak_alias (__mempcpy, mempcpy)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy_chk.S b/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy_chk.S
new file mode 100644
index 0000000000..072b22c49f
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -0,0 +1,72 @@
+/* Multiple versions of __mempcpy_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  There are no multiarch mempcpy functions for static binaries.
+ */
+#if IS_IN (libc)
+# ifdef SHARED
+	.text
+ENTRY(__mempcpy_chk)	/* IFUNC resolver: address of the choice in %RAX_LP.  */
+	.type	__mempcpy_chk, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_ARCH_FEATURE (Prefer_No_AVX512)	/* Presumably ZF = 0 iff the bit is set.  */
+	jnz	1f	/* AVX512 not wanted ...  */
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f	/* ... or not usable: try AVX.  */
+	lea	__mempcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	2f
+	lea	__mempcpy_chk_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__mempcpy_chk_avx512_unaligned(%rip), %RAX_LP
+	ret
+1:	lea	__mempcpy_chk_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	L(Fast_Unaligned_Load)	/* Bit clear: fall back to SSE2/SSSE3.  */
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f	/* No ERMS: keep the non-ERMS AVX version.  */
+	lea	__mempcpy_chk_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
+	lea	__mempcpy_chk_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
+	jz	L(SSSE3)
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f	/* No ERMS: keep the non-ERMS SSE2 version.  */
+	lea	__mempcpy_chk_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* No SSSE3: stay with the SSE2 unaligned version.  */
+	lea    __mempcpy_chk_ssse3_back(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	jnz	2f
+	lea	__mempcpy_chk_ssse3(%rip), %RAX_LP
+2:	ret
+END(__mempcpy_chk)
+# else
+#  include "../mempcpy_chk.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
new file mode 100644
index 0000000000..7ab3d89849
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -0,0 +1,22 @@
+#if IS_IN (libc)
+# define VEC_SIZE	32
+# define VEC(i)		ymm##i
+# define VMOVU		vmovdqu
+# define VMOVA		vmovdqa
+
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  vmovd d, %xmm0; \
+  movq r, %rax; \
+  vpbroadcastb %xmm0, %ymm0
+
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  vmovd d, %xmm0; \
+  movq r, %rax; \
+  vpbroadcastd %xmm0, %ymm0
+
+# define SECTION(p)		p##.avx
+# define MEMSET_SYMBOL(p,s)	p##_avx2_##s
+# define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
+
+# include "memset-vec-unaligned-erms.S"
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
new file mode 100644
index 0000000000..1f66602398
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
@@ -0,0 +1,194 @@
+/* memset optimized with AVX512 for KNL hardware.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+
+#include "asm-syntax.h"
+#ifndef MEMSET
+# define MEMSET __memset_avx512_no_vzeroupper
+# define MEMSET_CHK __memset_chk_avx512_no_vzeroupper
+#endif
+
+	.section .text.avx512,"ax",@progbits
+#if defined PIC
+ENTRY (MEMSET_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMSET_CHK)
+#endif
+
+ENTRY (MEMSET)
+	vpxor	%xmm0, %xmm0, %xmm0	/* All-zero shuffle mask.  */
+	vmovd	%esi, %xmm1	/* Fill value in the low dword.  */
+	lea	(%rdi, %rdx), %rsi	/* %rsi = end of the buffer.  */
+	mov	%rdi, %rax	/* Return value: destination.  */
+	vpshufb	%xmm0, %xmm1, %xmm0	/* Replicate byte 0 into all 16 bytes.  */
+	cmp	$16, %rdx
+	jb	L(less_16bytes)
+	cmp	$512, %rdx
+	vbroadcastss	%xmm0, %zmm2	/* Pattern in all 64 bytes; flags unchanged.  */
+	ja	L(512bytesormore)
+	cmp	$256, %rdx
+	jb	L(less_256bytes)
+	vmovups	%zmm2, (%rdi)	/* 256..512 bytes: 4 stores from the start ...  */
+	vmovups %zmm2, 0x40(%rdi)
+	vmovups %zmm2, 0x80(%rdi)
+	vmovups %zmm2, 0xC0(%rdi)
+	vmovups %zmm2, -0x100(%rsi)	/* ... and 4 ending at the end; may overlap.  */
+	vmovups %zmm2, -0xC0(%rsi)
+	vmovups %zmm2, -0x80(%rsi)
+	vmovups %zmm2, -0x40(%rsi)
+	ret
+
+L(less_256bytes):
+	cmp	$128, %dl
+	jb	L(less_128bytes)
+	vmovups	%zmm2, (%rdi)
+	vmovups %zmm2, 0x40(%rdi)
+	vmovups %zmm2, -0x80(%rsi)
+	vmovups %zmm2, -0x40(%rsi)
+	ret
+
+L(less_128bytes):
+	cmp	$64, %dl
+	jb	L(less_64bytes)
+	vmovups	%zmm2, (%rdi)
+	vmovups	%zmm2, -0x40(%rsi)
+	ret
+
+L(less_64bytes):
+	cmp	$32, %dl
+	jb	L(less_32bytes)
+	vmovdqu	%ymm2, (%rdi)
+	vmovdqu %ymm2, -0x20(%rsi)
+	ret
+
+L(less_32bytes):
+	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmm0, -0x10(%rsi)
+	ret
+
+L(less_16bytes):
+	cmp	$8, %dl
+	jb	L(less_8bytes)
+	vmovq	%xmm0, (%rdi)
+	vmovq	%xmm0, -0x08(%rsi)
+	ret
+
+L(less_8bytes):
+	vmovd	%xmm0, %ecx
+	cmp	$4, %dl
+	jb	L(less_4bytes)
+	mov	%ecx, (%rdi)
+	mov	%ecx, -0x04(%rsi)
+	ret
+
+L(less_4bytes):
+	cmp	$2, %dl
+	jb	L(less_2bytes)
+	mov	%cx, (%rdi)
+	mov	%cx, -0x02(%rsi)
+	ret
+
+L(less_2bytes):
+	cmp	$1, %dl
+	jb	L(less_1bytes)
+	mov	%cl, (%rdi)
+L(less_1bytes):
+	ret
+
+L(512bytesormore):
+	mov	__x86_shared_cache_size_half(%rip), %rcx
+	cmp	%rcx, %rdx
+	ja	L(preloop_large)
+	cmp	$1024, %rdx
+	ja	L(1024bytesormore)
+
+	vmovups	%zmm2, (%rdi)
+	vmovups	%zmm2, 0x40(%rdi)
+	vmovups	%zmm2, 0x80(%rdi)
+	vmovups	%zmm2, 0xC0(%rdi)
+	vmovups	%zmm2, 0x100(%rdi)
+	vmovups	%zmm2, 0x140(%rdi)
+	vmovups	%zmm2, 0x180(%rdi)
+	vmovups	%zmm2, 0x1C0(%rdi)
+	vmovups %zmm2, -0x200(%rsi)
+	vmovups %zmm2, -0x1C0(%rsi)
+	vmovups %zmm2, -0x180(%rsi)
+	vmovups %zmm2, -0x140(%rsi)
+	vmovups %zmm2, -0x100(%rsi)
+	vmovups %zmm2, -0xC0(%rsi)
+	vmovups %zmm2, -0x80(%rsi)
+	vmovups %zmm2, -0x40(%rsi)
+	ret
+
+/* Align on 64 and loop with aligned stores.  */
+L(1024bytesormore):
+	sub	$0x100, %rsi	/* %rsi = start of the last 256 bytes.  */
+	vmovups	%zmm2, (%rax)	/* Unaligned head at the original dst.  */
+	and	$-0x40, %rdi
+	add	$0x40, %rdi	/* Next 64-byte boundary above dst.  */
+
+L(gobble_256bytes_loop):
+	vmovaps	%zmm2, (%rdi)
+	vmovaps	%zmm2, 0x40(%rdi)
+	vmovaps	%zmm2, 0x80(%rdi)
+	vmovaps	%zmm2, 0xC0(%rdi)
+	add	$0x100, %rdi
+	cmp	%rsi, %rdi
+	jb	L(gobble_256bytes_loop)
+	vmovups %zmm2, (%rsi)	/* Unaligned tail: the last 256 bytes.  */
+	vmovups %zmm2, 0x40(%rsi)
+	vmovups %zmm2, 0x80(%rsi)
+	vmovups %zmm2, 0xC0(%rsi)
+	ret
+
+/* Align on 128 and loop with non-temporal stores.  */
+L(preloop_large):
+	and	$-0x80, %rdi
+	add	$0x80, %rdi	/* Next 128-byte boundary above dst.  */
+	vmovups	%zmm2, (%rax)	/* Unaligned head: first 128 bytes ...  */
+	vmovups	%zmm2, 0x40(%rax)	/* ... at the original dst.  */
+	sub	$0x200, %rsi	/* %rsi = start of the last 512 bytes.  */
+
+L(gobble_512bytes_nt_loop):
+	vmovntdq %zmm2, (%rdi)
+	vmovntdq %zmm2, 0x40(%rdi)
+	vmovntdq %zmm2, 0x80(%rdi)
+	vmovntdq %zmm2, 0xC0(%rdi)
+	vmovntdq %zmm2, 0x100(%rdi)
+	vmovntdq %zmm2, 0x140(%rdi)
+	vmovntdq %zmm2, 0x180(%rdi)
+	vmovntdq %zmm2, 0x1C0(%rdi)
+	add	$0x200, %rdi
+	cmp	%rsi, %rdi
+	jb	L(gobble_512bytes_nt_loop)
+	sfence	/* Fence the non-temporal stores before the tail.  */
+	vmovups %zmm2, (%rsi)	/* Unaligned tail: the last 512 bytes.  */
+	vmovups %zmm2, 0x40(%rsi)
+	vmovups %zmm2, 0x80(%rsi)
+	vmovups %zmm2, 0xC0(%rsi)
+	vmovups	%zmm2, 0x100(%rsi)
+	vmovups	%zmm2, 0x140(%rsi)
+	vmovups	%zmm2, 0x180(%rsi)
+	vmovups	%zmm2, 0x1C0(%rsi)
+	ret
+END (MEMSET)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
new file mode 100644
index 0000000000..0783979ca5
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -0,0 +1,24 @@
+#if IS_IN (libc)	/* AVX-512 (zmm) parameters for the memset-vec-unaligned-erms.S template included below.  */
+# define VEC_SIZE	64	/* Bytes written by one VEC store.  */
+# define VEC(i)		zmm##i
+# define VMOVU		vmovdqu64	/* Store used at addresses that may be unaligned.  */
+# define VMOVA		vmovdqa64	/* Store used only at addresses the template has aligned.  */
+/* Replicate the low byte of D through all of %zmm0 and copy R to %rax.  */
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  vmovd d, %xmm0; \
+  movq r, %rax; \
+  vpbroadcastb %xmm0, %xmm0; \
+  vpbroadcastq %xmm0, %zmm0
+/* Replicate the 32-bit value D through all of %zmm0 and copy R to %rax.  */
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  vmovd d, %xmm0; \
+  movq r, %rax; \
+  vpbroadcastd %xmm0, %xmm0; \
+  vpbroadcastq %xmm0, %zmm0
+
+# define SECTION(p)		p##.avx512	/* SECTION(.text) becomes .text.avx512.  */
+# define MEMSET_SYMBOL(p,s)	p##_avx512_##s	/* E.g. __memset_avx512_unaligned.  */
+# define WMEMSET_SYMBOL(p,s)	p##_avx512_##s	/* E.g. __wmemset_avx512_unaligned.  */
+
+# include "memset-vec-unaligned-erms.S"
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
new file mode 100644
index 0000000000..2eb9e3744e
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -0,0 +1,263 @@
+/* memset/bzero with unaligned store and rep stosb
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* memset is implemented as:
+   1. Use overlapping store to avoid branch.
+   2. If size is less than VEC, use integer register stores.
+   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
+   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
+   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
+      4 VEC stores and store 4 * VEC at a time until done.  */
+
+#include <sysdep.h>
+
+#ifndef MEMSET_CHK_SYMBOL	/* By default the _chk entry points are named like the plain ones.  */
+# define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
+#endif
+
+#ifndef WMEMSET_CHK_SYMBOL
+# define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
+#endif
+
+#ifndef VZEROUPPER	/* Emitted before most returns and before rep stosb.  */
+# if VEC_SIZE > 16
+#  define VZEROUPPER			vzeroupper
+# else
+#  define VZEROUPPER	/* Expands to nothing for 16-byte vectors.  */
+# endif
+#endif
+
+#ifndef VZEROUPPER_SHORT_RETURN	/* Emitted right before the ret that follows L(loop).  */
+# if VEC_SIZE > 16
+#  define VZEROUPPER_SHORT_RETURN	vzeroupper
+# else
+#  define VZEROUPPER_SHORT_RETURN	rep	/* Combines with that ret to form rep ret.  */
+# endif
+#endif
+
+#ifndef MOVQ	/* Copies the low 8 bytes of %xmm0 to %rcx in L(less_vec).  */
+# if VEC_SIZE > 16
+#  define MOVQ				vmovq
+# else
+#  define MOVQ				movq
+# endif
+#endif
+
+/* Threshold to use Enhanced REP STOSB.  Since there is overhead to set
+   up REP STOSB operation, REP STOSB isn't faster on short data.  The
+   memset micro benchmark in glibc shows that 2KB is the approximate
+   value above which REP STOSB becomes faster on processors with
+   Enhanced REP STOSB.  Since the stored value is fixed, larger register
+   size has minimal impact on threshold.  */
+#ifndef REP_STOSB_THRESHOLD
+# define REP_STOSB_THRESHOLD		2048
+#endif
+
+#ifndef SECTION
+# error SECTION is not defined!
+#endif
+
+	.section SECTION(.text),"ax",@progbits
+#if VEC_SIZE == 16 && IS_IN (libc)
+ENTRY (__bzero)	/* Zero %rsi bytes at %rdi by entering the memset body with a zero pattern.  */
+	movq	%rdi, %rax /* Set return value.  */
+	movq	%rsi, %rdx /* Set n.  */
+	pxor	%xmm0, %xmm0	/* Fill pattern: all-zero vector.  */
+	jmp	L(entry_from_bzero)	/* %xmm0, %rax and %rdx now hold what the code at that label uses.  */
+END (__bzero)
+weak_alias (__bzero, bzero)
+#endif
+
+#if IS_IN (libc)
+# if defined SHARED
+ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+	cmpq	%rdx, %rcx	/* Flags from %rcx - %rdx.  */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* %rcx below %rdx (unsigned): fail; otherwise fall through into __wmemset.  */
+END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+# endif
+
+ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
+	shlq	$2, %rdx	/* Multiply the count in %rdx by 4; L(entry_from_bzero) works on a byte count.  */
+	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	jmp	L(entry_from_bzero)
+END (WMEMSET_SYMBOL (__wmemset, unaligned))
+#endif
+
+#if defined SHARED && IS_IN (libc)
+ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
+	cmpq	%rdx, %rcx	/* Flags from %rcx - %rdx.  */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* %rcx below %rdx (unsigned): fail; otherwise fall through into __memset.  */
+END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
+#endif
+
+ENTRY (MEMSET_SYMBOL (__memset, unaligned))	/* In: %rdi = destination, %esi = fill value, %rdx = byte count.  */
+	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+L(entry_from_bzero):	/* Also reached by jmp from __bzero and __wmemset.  */
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)	/* Last VEC_SIZE bytes of the buffer.  */
+	VMOVU	%VEC(0), (%rdi)	/* First VEC_SIZE bytes; overlaps the store above when size < 2 * VEC_SIZE.  */
+	VZEROUPPER
+	ret
+#if defined USE_MULTIARCH && IS_IN (libc)
+END (MEMSET_SYMBOL (__memset, unaligned))
+
+# if VEC_SIZE == 16
+/* Pure REP STOSB memset; memset.S returns it when Prefer_ERMS is set.  */
+ENTRY (__memset_erms)
+# else
+/* Provide a symbol to debugger.  */
+ENTRY (MEMSET_SYMBOL (__memset, erms))
+# endif
+L(stosb):
+	/* Issue vzeroupper before rep stosb.  */
+	VZEROUPPER
+	movq	%rdx, %rcx	/* Repeat count for rep stosb = byte count.  */
+	movzbl	%sil, %eax	/* %al = fill byte.  */
+	movq	%rdi, %rdx	/* Keep the destination; rep stosb advances %rdi.  */
+	rep stosb
+	movq	%rdx, %rax	/* Return the original destination.  */
+	ret
+# if VEC_SIZE == 16
+END (__memset_erms)
+# else
+END (MEMSET_SYMBOL (__memset, erms))
+# endif
+
+# if defined SHARED && IS_IN (libc)
+ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+	cmpq	%rdx, %rcx	/* Flags from %rcx - %rdx.  */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* %rcx below %rdx (unsigned): fail; otherwise fall through into __memset.  */
+END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+# endif
+
+ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))	/* Like the unaligned entry, but sizes above REP_STOSB_THRESHOLD use rep stosb.  */
+	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(stosb_more_2x_vec)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)	/* Last VEC_SIZE bytes of the buffer.  */
+	VMOVU	%VEC(0), (%rdi)	/* First VEC_SIZE bytes; overlaps the store above when size < 2 * VEC_SIZE.  */
+	VZEROUPPER
+	ret
+
+L(stosb_more_2x_vec):
+	cmpq	$REP_STOSB_THRESHOLD, %rdx
+	ja	L(stosb)	/* At or below the threshold: fall through to the vector code at L(more_2x_vec).  */
+#endif
+L(more_2x_vec):	/* Here size > 2 * VEC_SIZE; %rdi = start, %rdx = size.  */
+	cmpq  $(VEC_SIZE * 4), %rdx
+	ja	L(loop_start)
+	VMOVU	%VEC(0), (%rdi)	/* Two VECs from the start and two ending at the end cover up to 4 * VEC_SIZE.  */
+	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+L(return):
+	VZEROUPPER
+	ret
+
+L(loop_start):	/* Size > 4 * VEC_SIZE: store 4 VECs at each end, then fill the aligned middle.  */
+	leaq	(VEC_SIZE * 4)(%rdi), %rcx
+	VMOVU	%VEC(0), (%rdi)
+	andq	$-(VEC_SIZE * 4), %rcx	/* %rcx = (start + 4 * VEC_SIZE) rounded down to a 4 * VEC_SIZE boundary.  */
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
+	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
+	addq	%rdi, %rdx	/* %rdx = end address (start + size).  */
+	andq	$-(VEC_SIZE * 4), %rdx	/* Round the end down to a 4 * VEC_SIZE boundary.  */
+	cmpq	%rdx, %rcx
+	je	L(return)	/* Nothing left between the two aligned bounds.  */
+L(loop):	/* Aligned stores of 4 * VEC_SIZE per iteration over [%rcx, %rdx).  */
+	VMOVA	%VEC(0), (%rcx)
+	VMOVA	%VEC(0), VEC_SIZE(%rcx)
+	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
+	addq	$(VEC_SIZE * 4), %rcx
+	cmpq	%rcx, %rdx
+	jne	L(loop)
+	VZEROUPPER_SHORT_RETURN
+	ret
+L(less_vec):	/* Reached with %rdx < VEC_SIZE, so the size tests below only look at %dl.  */
+	/* Less than 1 VEC.  */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+#  error Unsupported VEC_SIZE!
+# endif
+# if VEC_SIZE > 32
+	cmpb	$32, %dl
+	jae	L(between_32_63)
+# endif
+# if VEC_SIZE > 16
+	cmpb	$16, %dl
+	jae	L(between_16_31)
+# endif
+	MOVQ	%xmm0, %rcx	/* %rcx = low 8 bytes of the fill pattern.  */
+	cmpb	$8, %dl
+	jae	L(between_8_15)
+	cmpb	$4, %dl
+	jae	L(between_4_7)
+	cmpb	$1, %dl
+	ja	L(between_2_3)
+	jb	1f	/* Size 0: store nothing.  */
+	movb	%cl, (%rdi)	/* Size 1.  */
+1:
+	VZEROUPPER
+	ret
+# if VEC_SIZE > 32
+	/* From 32 to 63.  No branch when size == 32.  */
+L(between_32_63):
+	vmovdqu	%ymm0, -32(%rdi,%rdx)	/* Last 32 bytes.  */
+	vmovdqu	%ymm0, (%rdi)	/* First 32 bytes; the two stores overlap when size < 64.  */
+	VZEROUPPER
+	ret
+# endif
+# if VEC_SIZE > 16
+	/* From 16 to 31.  No branch when size == 16.  */
+L(between_16_31):
+	vmovdqu	%xmm0, -16(%rdi,%rdx)
+	vmovdqu	%xmm0, (%rdi)
+	VZEROUPPER
+	ret
+# endif
+	/* From 8 to 15.  No branch when size == 8.  */
+L(between_8_15):
+	movq	%rcx, -8(%rdi,%rdx)
+	movq	%rcx, (%rdi)
+	VZEROUPPER
+	ret
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	%ecx, -4(%rdi,%rdx)
+	movl	%ecx, (%rdi)
+	VZEROUPPER
+	ret
+L(between_2_3):
+	/* From 2 to 3.  No branch when size == 2.  */
+	movw	%cx, -2(%rdi,%rdx)
+	movw	%cx, (%rdi)
+	VZEROUPPER
+	ret
+END (MEMSET_SYMBOL (__memset, unaligned_erms))
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset.S
new file mode 100644
index 0000000000..11f27378b0
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset.S
@@ -0,0 +1,82 @@
+/* Multiple versions of memset
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+ENTRY(memset)	/* IFUNC resolver: returns the address of the chosen memset variant in %RAX_LP.  */
+	.type	memset, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX	/* NOTE(review): defined outside this file; presumably sets up %rdx for the HAS_* tests - confirm.  */
+	lea	__memset_erms(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_ERMS)	/* NOTE(review): comments below assume HAS_* leaves ZF clear when the bit is set - confirm.  */
+	jnz	2f	/* Prefer_ERMS: return __memset_erms.  */
+	lea	__memset_sse2_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	1f
+	lea	__memset_sse2_unaligned(%rip), %RAX_LP	/* No ERMS: SSE2 variant without rep stosb.  */
+1:
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	2f	/* No usable AVX2: keep the SSE2 choice; AVX-512 is not examined.  */
+	lea	__memset_avx2_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	L(AVX512F)
+	lea	__memset_avx2_unaligned(%rip), %RAX_LP
+L(AVX512F):
+	HAS_ARCH_FEATURE (Prefer_No_AVX512)
+	jnz	2f	/* Prefer_No_AVX512: keep the AVX2 choice.  */
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	2f	/* No usable AVX512F: keep the AVX2 choice.  */
+	lea	__memset_avx512_no_vzeroupper(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	2f	/* Prefer_No_VZEROUPPER: return the no-vzeroupper AVX-512 variant.  */
+	lea	__memset_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__memset_avx512_unaligned(%rip), %RAX_LP
+2:	ret
+END(memset)
+#endif
+
+#if IS_IN (libc)	/* Macro overrides applied to "../memset.S", which is included at the end of this file.  */
+# define MEMSET_SYMBOL(p,s)	p##_sse2_##s	/* E.g. __memset_sse2_unaligned, as named by the resolver above.  */
+# define WMEMSET_SYMBOL(p,s)	p##_sse2_##s	/* E.g. __wmemset_sse2_unaligned.  */
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memset calls through a PLT.
+   The speedup we get from using SSE2 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memset; __GI_memset = __memset_sse2_unaligned; \
+	.globl __GI_wmemset; __GI_wmemset = __wmemset_sse2_unaligned; \
+	.globl __GI___wmemset; __GI___wmemset = __wmemset_sse2_unaligned
+# endif
+
+# undef weak_alias	/* From here on weak_alias ignores its arguments and always emits the bzero alias.  */
+# define weak_alias(original, alias) \
+	.weak bzero; bzero = __bzero
+
+# undef strong_alias	/* From here on strong_alias expands to nothing.  */
+# define strong_alias(original, alias)
+#endif
+
+#include "../memset.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset_chk.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset_chk.S
new file mode 100644
index 0000000000..7e08311cdf
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset_chk.S
@@ -0,0 +1,61 @@
+/* Multiple versions of memset_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# ifdef SHARED
+ENTRY(__memset_chk)	/* IFUNC resolver: returns the address of the chosen __memset_chk variant in %RAX_LP.  */
+	.type	__memset_chk, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX	/* NOTE(review): defined outside this file; presumably sets up %rdx for the HAS_* tests - confirm.  */
+	lea	__memset_chk_sse2_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)	/* NOTE(review): comments below assume HAS_* leaves ZF clear when the bit is set - confirm.  */
+	jnz	1f
+	lea	__memset_chk_sse2_unaligned(%rip), %RAX_LP	/* No ERMS: SSE2 variant without rep stosb.  */
+1:
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	2f	/* No usable AVX2: keep the SSE2 choice; AVX-512 is not examined.  */
+	lea	__memset_chk_avx2_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	L(AVX512F)
+	lea	__memset_chk_avx2_unaligned(%rip), %RAX_LP
+L(AVX512F):
+	HAS_ARCH_FEATURE (Prefer_No_AVX512)
+	jnz	2f	/* Prefer_No_AVX512: keep the AVX2 choice.  */
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	2f	/* No usable AVX512F: keep the AVX2 choice.  */
+	lea	__memset_chk_avx512_no_vzeroupper(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	2f	/* Prefer_No_VZEROUPPER: return the no-vzeroupper AVX-512 variant.  */
+	lea	__memset_chk_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__memset_chk_avx512_unaligned(%rip), %RAX_LP
+2:	ret
+END(__memset_chk)
+
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+	.section .gnu.warning.__memset_zero_constant_len_parameter
+	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
+# else
+#  include "../memset_chk.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/sched_cpucount.c b/REORG.TODO/sysdeps/x86_64/multiarch/sched_cpucount.c
new file mode 100644
index 0000000000..453f183747
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/sched_cpucount.c
@@ -0,0 +1,36 @@
+/* Count bits in CPU set.  x86-64 multi-arch version.
+   This file is part of the GNU C Library.
+   Copyright (C) 2008-2017 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@redhat.com>.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sched.h>
+#include "init-arch.h"
+
+#define __sched_cpucount static generic_cpucount	/* First inclusion: the name __sched_cpucount in the included file becomes static generic_cpucount.  */
+#include <posix/sched_cpucount.c>
+#undef __sched_cpucount
+/* POPCNT(l) yields the result of the popcnt instruction on l; input and output share one register.  */
+#define POPCNT(l) \
+  ({ __cpu_mask r; \
+     asm ("popcnt %1, %0" : "=r" (r) : "0" (l));\
+     r; })
+#define __sched_cpucount static popcount_cpucount	/* Second inclusion, with POPCNT defined; presumably the included file uses it - confirm.  */
+#include <posix/sched_cpucount.c>
+#undef __sched_cpucount
+/* Bind __sched_cpucount via libc_ifunc: popcount_cpucount if HAS_CPU_FEATURE (POPCOUNT), else generic_cpucount.  */
+libc_ifunc (__sched_cpucount,
+	    HAS_CPU_FEATURE (POPCOUNT) ? popcount_cpucount : generic_cpucount);
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..34231f8b46
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_sse2_unaligned
+#include "strcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
new file mode 100644
index 0000000000..d971c2da38
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy.S
new file mode 100644
index 0000000000..ee81ab6ae3
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy.S
@@ -0,0 +1,9 @@
+/* Multiple versions of stpcpy
+   All versions must be listed in ifunc-impl-list.c.  */
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy
+#include "strcpy.S"
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-c.c
new file mode 100644
index 0000000000..2fde77dcab
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-c.c
@@ -0,0 +1,8 @@
+#define STPNCPY __stpncpy_sse2
+#ifdef SHARED
+#undef libc_hidden_def
+#define libc_hidden_def(name) \
+  __hidden_ver1 (__stpncpy_sse2, __GI___stpncpy, __stpncpy_sse2);
+#endif
+
+#include "stpncpy.c"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
new file mode 100644
index 0000000000..658520f78f
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_sse2_unaligned
+#include "strcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
new file mode 100644
index 0000000000..14ed16f6b5
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy.S
new file mode 100644
index 0000000000..2698ca6a8c
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy.S
@@ -0,0 +1,8 @@
+/* Multiple versions of stpncpy
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCPY __stpncpy
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#include "strcpy.S"
+
+weak_alias (__stpncpy, stpncpy)
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
new file mode 100644
index 0000000000..fb2f9ae14a
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
@@ -0,0 +1,6 @@
+#define USE_SSSE3 1
+#define USE_AS_STRCASECMP_L
+#define NO_NOLOCALE_ALIAS
+#define STRCMP __strcasecmp_l_ssse3
+#define __strcasecmp __strcasecmp_ssse3
+#include "../strcmp.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l.S
new file mode 100644
index 0000000000..49f5b9fd95
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l.S
@@ -0,0 +1,8 @@
+/* Multiple versions of strcasecmp and strcasecmp_l
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCMP __strcasecmp_l
+#define USE_AS_STRCASECMP_L
+#include "strcmp.S"
+
+weak_alias (__strcasecmp_l, strcasecmp_l)
+libc_hidden_def (strcasecmp_l)
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
new file mode 100644
index 0000000000..d0a8a1518a
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
@@ -0,0 +1,279 @@
+/* strcat with SSE2
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_sse2_unaligned
+# endif
+
+# define USE_AS_STRCAT
+
+.text
+ENTRY (STRCAT)	/* In: %rdi = destination string, %rsi = source; with USE_AS_STRNCAT, %rdx = n.  */
+	mov	%rdi, %r9	/* Keep the destination; it becomes the return value.  */
+# ifdef USE_AS_STRNCAT
+	mov	%rdx, %r8	/* Keep n; %rdx is reused for match masks below.  */
+# endif
+
+/* Inline corresponding strlen file, temporary until new strcpy
+   implementation gets merged.  */
+
+	xor	%rax, %rax
+	mov	%edi, %ecx
+	and	$0x3f, %ecx	/* %ecx = offset of the destination within its 64-byte block.  */
+	pxor	%xmm0, %xmm0
+	cmp	$0x30, %ecx
+	ja	L(next)	/* Offset > 48: a 16-byte load at %rdi would run past the 64-byte block.  */
+	movdqu	(%rdi), %xmm1
+	pcmpeqb	%xmm1, %xmm0	/* 0xff in every byte position that holds a NUL.  */
+	pmovmskb %xmm0, %edx	/* One mask bit per byte.  */
+	test	%edx, %edx
+	jnz	L(exit_less16)	/* NUL in the first 16 bytes; %rax is still 0.  */
+	mov	%rdi, %rax
+	and	$-16, %rax	/* %rax = destination rounded down to 16 bytes.  */
+	jmp	L(align16_start)
+L(next):
+	mov	%rdi, %rax
+	and	$-16, %rax
+	pcmpeqb	(%rax), %xmm0	/* Aligned 16 bytes that contain the start of the string.  */
+	mov	$-1, %r10d
+	sub	%rax, %rcx
+	shl	%cl, %r10d	/* Mask with the low (%rdi & 15) bits cleared.  */
+	pmovmskb %xmm0, %edx
+	and	%r10d, %edx	/* Drop matches in bytes that precede the string.  */
+	jnz	L(exit)
+
+L(align16_start):	/* Unrolled search for the NUL, 16 aligned bytes per step, starting at 16(%rax).  */
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	pcmpeqb	16(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%rax), %xmm0	/* %xmm0 is all zero again after its no-match compare, so it is reused as is.  */
+	add	$64, %rax	/* Advance the base so the L(exit16..64) offsets apply again.  */
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$64, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$64, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	test	$0x3f, %rax	/* Keep checking 16 bytes at a time until %rax is 64-byte aligned.  */
+	jz	L(align64_loop)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$80, %rax	/* %rax now points at the 16 bytes just compared, as L(exit) expects.  */
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%rax), %xmm1
+	add	$16, %rax
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%rax), %xmm2
+	add	$16, %rax
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%rax), %xmm3
+	add	$16, %rax
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	add	$16, %rax
+	.p2align 4
+	L(align64_loop):	/* Scan 64 aligned bytes per iteration; %xmm0-%xmm3 are all zero on entry.  */
+	movaps	(%rax),	%xmm4
+	pminub	16(%rax),	%xmm4
+	movaps	32(%rax),	%xmm5
+	pminub	48(%rax),	%xmm5
+	add	$64,	%rax
+	pminub	%xmm4,	%xmm5	/* Bytewise minimum of the four 16-byte chunks.  */
+	pcmpeqb	%xmm0,	%xmm5	/* A zero in the minimum means one of the 64 bytes is NUL.  */
+	pmovmskb %xmm5,	%edx
+	test	%edx,	%edx
+	jz	L(align64_loop)
+
+	pcmpeqb	-64(%rax), %xmm0	/* Re-examine the block chunk by chunk to locate the NUL.  */
+	sub	$80,	%rax	/* %rax = block start - 16, so 16(%rax) is the first chunk.  */
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3	/* Last chunk: the NUL must be here, so no test is needed.  */
+	pmovmskb %xmm3, %edx
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$64, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit):	/* The exit paths leave in %rax the offset of the NUL from %rdi; %edx holds the match mask.  */
+	sub	%rdi, %rax
+L(exit_less16):
+	bsf	%rdx, %rdx	/* Index of the first NUL within the 16 bytes compared.  */
+	add	%rdx, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit16):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$16, %rax	/* The chunk compared was at 16(%rax).  */
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit32):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$32, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit48):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$48, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit64):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$64, %rax
+
+	.p2align 4
+L(StartStrcpyPart):	/* %rax = offset of the destination NUL; set up registers for the included copy code.  */
+	lea	(%r9, %rax), %rdi	/* %rdi = address of the destination NUL.  */
+	mov	%rsi, %rcx	/* %rcx = source.  */
+	mov	%r9, %rax      /* save result */
+
+# ifdef USE_AS_STRNCAT
+	test	%r8, %r8
+	jz	L(ExitZero)	/* n == 0; L(ExitZero) is not defined in this file, presumably in the include below.  */
+#  define USE_AS_STRNCPY
+# endif
+
+# include "strcpy-sse2-unaligned.S"
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcat-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcat-ssse3.S
new file mode 100644
index 0000000000..edd683d778
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcat-ssse3.S
@@ -0,0 +1,867 @@
+/* strcat with SSSE3
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_ssse3
+# endif
+
+# define USE_AS_STRCAT
+
+.text
+ENTRY (STRCAT)
+# ifdef USE_AS_STRNCAT
+	mov	%rdx, %r8
+# endif
+
+
+/* Inline corresponding strlen file, temporary until new strcpy
+   implementation gets merged.  */
+
+	xor	%eax, %eax
+	cmpb	$0, (%rdi)
+	jz	L(exit_tail0)
+	cmpb	$0, 1(%rdi)
+	jz	L(exit_tail1)
+	cmpb	$0, 2(%rdi)
+	jz	L(exit_tail2)
+	cmpb	$0, 3(%rdi)
+	jz	L(exit_tail3)
+
+	cmpb	$0, 4(%rdi)
+	jz	L(exit_tail4)
+	cmpb	$0, 5(%rdi)
+	jz	L(exit_tail5)
+	cmpb	$0, 6(%rdi)
+	jz	L(exit_tail6)
+	cmpb	$0, 7(%rdi)
+	jz	L(exit_tail7)
+
+	cmpb	$0, 8(%rdi)
+	jz	L(exit_tail8)
+	cmpb	$0, 9(%rdi)
+	jz	L(exit_tail9)
+	cmpb	$0, 10(%rdi)
+	jz	L(exit_tail10)
+	cmpb	$0, 11(%rdi)
+	jz	L(exit_tail11)
+
+	cmpb	$0, 12(%rdi)
+	jz	L(exit_tail12)
+	cmpb	$0, 13(%rdi)
+	jz	L(exit_tail13)
+	cmpb	$0, 14(%rdi)
+	jz	L(exit_tail14)
+	cmpb	$0, 15(%rdi)
+	jz	L(exit_tail15)
+	pxor	%xmm0, %xmm0
+	lea	16(%rdi), %rcx
+	lea	16(%rdi), %rax
+	and	$-16, %rax
+
+	pcmpeqb	(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	pxor	%xmm2, %xmm2
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	pxor	%xmm3, %xmm3
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	and	$-0x40, %rax
+
+	.p2align 4
+L(aligned_64):
+	pcmpeqb	(%rax), %xmm0
+	pcmpeqb	16(%rax), %xmm1
+	pcmpeqb	32(%rax), %xmm2
+	pcmpeqb	48(%rax), %xmm3
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %r11d
+	pmovmskb %xmm2, %r10d
+	pmovmskb %xmm3, %r9d
+	or	%edx, %r9d
+	or	%r11d, %r9d
+	or	%r10d, %r9d
+	lea	64(%rax), %rax
+	jz	L(aligned_64)
+
+	test	%edx, %edx
+	jnz	L(aligned_64_exit_16)
+	test	%r11d, %r11d
+	jnz	L(aligned_64_exit_32)
+	test	%r10d, %r10d
+	jnz	L(aligned_64_exit_48)
+
+L(aligned_64_exit_64):
+	pmovmskb %xmm3, %edx
+	jmp	L(exit)
+
+L(aligned_64_exit_48):
+	lea	-16(%rax), %rax
+	mov	%r10d, %edx
+	jmp	L(exit)
+
+L(aligned_64_exit_32):
+	lea	-32(%rax), %rax
+	mov	%r11d, %edx
+	jmp	L(exit)
+
+L(aligned_64_exit_16):
+	lea	-48(%rax), %rax
+
+L(exit):
+	sub	%rcx, %rax
+	test	%dl, %dl
+	jz	L(exit_high)
+	test	$0x01, %dl
+	jnz	L(exit_tail0)
+
+	test	$0x02, %dl
+	jnz	L(exit_tail1)
+
+	test	$0x04, %dl
+	jnz	L(exit_tail2)
+
+	test	$0x08, %dl
+	jnz	L(exit_tail3)
+
+	test	$0x10, %dl
+	jnz	L(exit_tail4)
+
+	test	$0x20, %dl
+	jnz	L(exit_tail5)
+
+	test	$0x40, %dl
+	jnz	L(exit_tail6)
+	add	$7, %eax
+L(exit_tail0):
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_high):
+	add	$8, %eax
+	test	$0x01, %dh
+	jnz	L(exit_tail0)
+
+	test	$0x02, %dh
+	jnz	L(exit_tail1)
+
+	test	$0x04, %dh
+	jnz	L(exit_tail2)
+
+	test	$0x08, %dh
+	jnz	L(exit_tail3)
+
+	test	$0x10, %dh
+	jnz	L(exit_tail4)
+
+	test	$0x20, %dh
+	jnz	L(exit_tail5)
+
+	test	$0x40, %dh
+	jnz	L(exit_tail6)
+	add	$7, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail1):
+	add	$1, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail2):
+	add	$2, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail3):
+	add	$3, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail4):
+	add	$4, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail5):
+	add	$5, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail6):
+	add	$6, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail7):
+	add	$7, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail8):
+	add	$8, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail9):
+	add	$9, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail10):
+	add	$10, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail11):
+	add	$11, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail12):
+	add	$12, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail13):
+	add	$13, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail14):
+	add	$14, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail15):
+	add	$15, %eax
+
+	.p2align 4
+L(StartStrcpyPart):
+	mov	%rsi, %rcx
+	lea	(%rdi, %rax), %rdx
+# ifdef USE_AS_STRNCAT
+	test	%r8, %r8
+	jz	L(StrncatExit0)
+	cmp	$8, %r8
+	jbe	L(StrncatExit8Bytes)
+# endif
+	cmpb	$0, (%rcx)
+	jz	L(Exit1)
+	cmpb	$0, 1(%rcx)
+	jz	L(Exit2)
+	cmpb	$0, 2(%rcx)
+	jz	L(Exit3)
+	cmpb	$0, 3(%rcx)
+	jz	L(Exit4)
+	cmpb	$0, 4(%rcx)
+	jz	L(Exit5)
+	cmpb	$0, 5(%rcx)
+	jz	L(Exit6)
+	cmpb	$0, 6(%rcx)
+	jz	L(Exit7)
+	cmpb	$0, 7(%rcx)
+	jz	L(Exit8)
+	cmpb	$0, 8(%rcx)
+	jz	L(Exit9)
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %r8
+	jb	L(StrncatExit15Bytes)
+# endif
+	cmpb	$0, 9(%rcx)
+	jz	L(Exit10)
+	cmpb	$0, 10(%rcx)
+	jz	L(Exit11)
+	cmpb	$0, 11(%rcx)
+	jz	L(Exit12)
+	cmpb	$0, 12(%rcx)
+	jz	L(Exit13)
+	cmpb	$0, 13(%rcx)
+	jz	L(Exit14)
+	cmpb	$0, 14(%rcx)
+	jz	L(Exit15)
+	cmpb	$0, 15(%rcx)
+	jz	L(Exit16)
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %r8
+	je	L(StrncatExit16)
+#  define USE_AS_STRNCPY
+# endif
+
+# include "strcpy-ssse3.S"
+
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%rsi, %rdx
+	add	%rsi, %rcx
+
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	movlpd	(%rcx), %xmm0
+	movlpd	8(%rcx), %xmm1
+	movlpd	%xmm0, (%rdx)
+	movlpd	%xmm1, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit1):
+	xor	%ah, %ah
+	movb	%ah, 1(%rdx)
+L(Exit1):
+	movb	(%rcx), %al
+	movb	%al, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit2):
+	xor	%ah, %ah
+	movb	%ah, 2(%rdx)
+L(Exit2):
+	movw	(%rcx), %ax
+	movw	%ax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit3):
+	xor	%ah, %ah
+	movb	%ah, 3(%rdx)
+L(Exit3):
+	movw	(%rcx), %ax
+	movw	%ax, (%rdx)
+	movb	2(%rcx), %al
+	movb	%al, 2(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit4):
+	xor	%ah, %ah
+	movb	%ah, 4(%rdx)
+L(Exit4):
+	mov	(%rcx), %eax
+	mov	%eax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit5):
+	xor	%ah, %ah
+	movb	%ah, 5(%rdx)
+L(Exit5):
+	mov	(%rcx), %eax
+	mov	%eax, (%rdx)
+	movb	4(%rcx), %al
+	movb	%al, 4(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit6):
+	xor	%ah, %ah
+	movb	%ah, 6(%rdx)
+L(Exit6):
+	mov	(%rcx), %eax
+	mov	%eax, (%rdx)
+	movw	4(%rcx), %ax
+	movw	%ax, 4(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit7):
+	xor	%ah, %ah
+	movb	%ah, 7(%rdx)
+L(Exit7):
+	mov	(%rcx), %eax
+	mov	%eax, (%rdx)
+	mov	3(%rcx), %eax
+	mov	%eax, 3(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit8):
+	xor	%ah, %ah
+	movb	%ah, 8(%rdx)
+L(Exit8):
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit9):
+	xor	%ah, %ah
+	movb	%ah, 9(%rdx)
+L(Exit9):
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	movb	8(%rcx), %al
+	movb	%al, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit10):
+	xor	%ah, %ah
+	movb	%ah, 10(%rdx)
+L(Exit10):
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	movw	8(%rcx), %ax
+	movw	%ax, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit11):
+	xor	%ah, %ah
+	movb	%ah, 11(%rdx)
+L(Exit11):
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	mov	7(%rcx), %eax
+	mov	%eax, 7(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit12):
+	xor	%ah, %ah
+	movb	%ah, 12(%rdx)
+L(Exit12):
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	mov	8(%rcx), %eax
+	mov	%eax, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit13):
+	xor	%ah, %ah
+	movb	%ah, 13(%rdx)
+L(Exit13):
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	movlpd	5(%rcx), %xmm1
+	movlpd	%xmm1, 5(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit14):
+	xor	%ah, %ah
+	movb	%ah, 14(%rdx)
+L(Exit14):
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	movlpd	6(%rcx), %xmm1
+	movlpd	%xmm1, 6(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit15):
+	xor	%ah, %ah
+	movb	%ah, 15(%rdx)
+L(Exit15):
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	movlpd	7(%rcx), %xmm1
+	movlpd	%xmm1, 7(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit16):
+	xor	%ah, %ah
+	movb	%ah, 16(%rdx)
+L(Exit16):
+	movlpd	(%rcx), %xmm0
+	movlpd	8(%rcx), %xmm1
+	movlpd	%xmm0, (%rdx)
+	movlpd	%xmm1, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+# ifdef USE_AS_STRNCPY
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %r8
+	add	%rsi, %rcx
+	lea	(%rsi, %rdx), %rsi
+	lea	-9(%r8), %rdx
+	and	$1<<7, %dh
+	or	%al, %dh
+	test	%dh, %dh
+	lea	(%rsi), %rdx
+	jz	L(ExitHighCase2)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	cmp	$1, %r8
+	je	L(StrncatExit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$2, %r8
+	je	L(StrncatExit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$3, %r8
+	je	L(StrncatExit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$4, %r8
+	je	L(StrncatExit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$5, %r8
+	je	L(StrncatExit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$6, %r8
+	je	L(StrncatExit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	cmp	$7, %r8
+	je	L(StrncatExit7)
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	lea	7(%rdx), %rax
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+	xor	%cl, %cl
+	movb	%cl, (%rax)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(ExitHighCase2):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	cmp	$9, %r8
+	je	L(StrncatExit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$10, %r8
+	je	L(StrncatExit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$11, %r8
+	je	L(StrncatExit11)
+	test	$0x8, %ah
+	jnz	L(Exit12)
+	cmp	$12, %r8
+	je	L(StrncatExit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$13, %r8
+	je	L(StrncatExit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$14, %r8
+	je	L(StrncatExit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	cmp	$15, %r8
+	je	L(StrncatExit15)
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	movlpd	8(%rcx), %xmm1
+	movlpd	%xmm1, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase3):
+	add	$16, %r8
+	add	%rsi, %rdx
+	add	%rsi, %rcx
+
+	cmp	$8, %r8
+	ja	L(ExitHighCase3)
+	cmp	$1, %r8
+	je	L(StrncatExit1)
+	cmp	$2, %r8
+	je	L(StrncatExit2)
+	cmp	$3, %r8
+	je	L(StrncatExit3)
+	cmp	$4, %r8
+	je	L(StrncatExit4)
+	cmp	$5, %r8
+	je	L(StrncatExit5)
+	cmp	$6, %r8
+	je	L(StrncatExit6)
+	cmp	$7, %r8
+	je	L(StrncatExit7)
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	xor	%ah, %ah
+	movb	%ah, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(ExitHighCase3):
+	cmp	$9, %r8
+	je	L(StrncatExit9)
+	cmp	$10, %r8
+	je	L(StrncatExit10)
+	cmp	$11, %r8
+	je	L(StrncatExit11)
+	cmp	$12, %r8
+	je	L(StrncatExit12)
+	cmp	$13, %r8
+	je	L(StrncatExit13)
+	cmp	$14, %r8
+	je	L(StrncatExit14)
+	cmp	$15, %r8
+	je	L(StrncatExit15)
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	movlpd	8(%rcx), %xmm1
+	movlpd	%xmm1, 8(%rdx)
+	xor	%ah, %ah
+	movb	%ah, 16(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit0):
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit15Bytes):
+	cmp	$9, %r8
+	je	L(StrncatExit9)
+	cmpb	$0, 9(%rcx)
+	jz	L(Exit10)
+	cmp	$10, %r8
+	je	L(StrncatExit10)
+	cmpb	$0, 10(%rcx)
+	jz	L(Exit11)
+	cmp	$11, %r8
+	je	L(StrncatExit11)
+	cmpb	$0, 11(%rcx)
+	jz	L(Exit12)
+	cmp	$12, %r8
+	je	L(StrncatExit12)
+	cmpb	$0, 12(%rcx)
+	jz	L(Exit13)
+	cmp	$13, %r8
+	je	L(StrncatExit13)
+	cmpb	$0, 13(%rcx)
+	jz	L(Exit14)
+	cmp	$14, %r8
+	je	L(StrncatExit14)
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	movlpd	7(%rcx), %xmm1
+	movlpd	%xmm1, 7(%rdx)
+	lea	14(%rdx), %rax
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+	xor	%cl, %cl
+	movb	%cl, (%rax)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit8Bytes):
+	cmpb	$0, (%rcx)
+	jz	L(Exit1)
+	cmp	$1, %r8
+	je	L(StrncatExit1)
+	cmpb	$0, 1(%rcx)
+	jz	L(Exit2)
+	cmp	$2, %r8
+	je	L(StrncatExit2)
+	cmpb	$0, 2(%rcx)
+	jz	L(Exit3)
+	cmp	$3, %r8
+	je	L(StrncatExit3)
+	cmpb	$0, 3(%rcx)
+	jz	L(Exit4)
+	cmp	$4, %r8
+	je	L(StrncatExit4)
+	cmpb	$0, 4(%rcx)
+	jz	L(Exit5)
+	cmp	$5, %r8
+	je	L(StrncatExit5)
+	cmpb	$0, 5(%rcx)
+	jz	L(Exit6)
+	cmp	$6, %r8
+	je	L(StrncatExit6)
+	cmpb	$0, 6(%rcx)
+	jz	L(Exit7)
+	cmp	$7, %r8
+	je	L(StrncatExit7)
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	lea	7(%rdx), %rax
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+	xor	%cl, %cl
+	movb	%cl, (%rax)
+	mov	%rdi, %rax
+	ret
+
+# endif
+END (STRCAT)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcat.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcat.S
new file mode 100644
index 0000000000..0e0e5dda9c
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcat.S
@@ -0,0 +1,85 @@
+/* Multiple versions of strcat
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef USE_AS_STRNCAT	/* Plain strcat build: default the public symbol name.  */
+# ifndef STRCAT
+#  define STRCAT strcat
+# endif
+#endif
+
+#ifdef USE_AS_STRNCAT	/* Symbol names used when this file is built as strncat.  */
+# define STRCAT_SSSE3	         	__strncat_ssse3
+# define STRCAT_SSE2	            	__strncat_sse2
+# define STRCAT_SSE2_UNALIGNED    	__strncat_sse2_unaligned
+# define __GI_STRCAT	            	__GI_strncat
+# define __GI___STRCAT              __GI___strncat
+#else	/* Symbol names used when this file is built as strcat.  */
+# define STRCAT_SSSE3	         	__strcat_ssse3
+# define STRCAT_SSE2	            	__strcat_sse2
+# define STRCAT_SSE2_UNALIGNED    	__strcat_sse2_unaligned
+# define __GI_STRCAT	            	__GI_strcat
+# define __GI___STRCAT              __GI___strcat
+#endif
+
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(STRCAT)	/* IFUNC resolver: returns in %rax the address of the implementation to bind.  */
+	.type	STRCAT, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX	/* Defined outside this file; presumably points %rdx at the CPU-feature data -- confirm in init-arch.h.  */
+	leaq	STRCAT_SSE2_UNALIGNED(%rip), %rax	/* Candidate 1: SSE2 with unaligned loads.  */
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* Feature test non-zero: keep candidate 1.  */
+	leaq	STRCAT_SSE2(%rip), %rax	/* Candidate 2: baseline SSE2.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* Feature test zero: keep candidate 2.  */
+	leaq	STRCAT_SSSE3(%rip), %rax	/* Otherwise select the SSSE3 version.  */
+2:	ret
+END(STRCAT)
+
+# undef ENTRY	/* From here on ENTRY/END ignore their argument and always emit the hidden symbol STRCAT_SSE2.  */
+# define ENTRY(name) \
+	.type STRCAT_SSE2, @function; \
+	.align 16; \
+	.globl STRCAT_SSE2; \
+	.hidden STRCAT_SSE2; \
+	STRCAT_SSE2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCAT_SSE2, .-STRCAT_SSE2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcat calls through a PLT.
+   The speedup we get from using SSSE3 instruction is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCAT; __GI_STRCAT = STRCAT_SSE2
+# undef libc_hidden_def	/* Same direct binding for the __GI___ alias.  */
+# define libc_hidden_def(name) \
+	.globl __GI___STRCAT; __GI___STRCAT = STRCAT_SSE2
+#endif
+
+#ifndef USE_AS_STRNCAT
+# include "../strcat.S"
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S b/REORG.TODO/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S
new file mode 100644
index 0000000000..cbbd0b33d3
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S
@@ -0,0 +1,280 @@
+/* strchr with SSE2 without bsf
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+	atom_text_section
+ENTRY (__strchr_sse2_no_bsf)	/* In: %rdi = s, %esi = c.  Out: %rax = first byte equal to c, or 0.  */
+	movd	%esi, %xmm1	/* Low byte of xmm1 = c.  */
+	movq	%rdi, %rcx
+	punpcklbw %xmm1, %xmm1
+	andq	$~15, %rdi	/* Round s down to a 16-byte boundary so movdqa can be used.  */
+	pxor	%xmm2, %xmm2	/* xmm2 = 0, compared against to find the terminator.  */
+	punpcklbw %xmm1, %xmm1
+	orl	$0xffffffff, %esi	/* esi = all ones; c has already been copied to xmm1.  */
+	movdqa	(%rdi), %xmm0
+	pshufd	$0, %xmm1, %xmm1	/* c is now replicated into all 16 bytes of xmm1.  */
+	subq	%rdi, %rcx	/* rcx = s - aligned base = number of leading bytes to ignore.  */
+	movdqa	%xmm0, %xmm3
+	leaq	16(%rdi), %rdi	/* rdi points one chunk past the chunk being examined.  */
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm3
+	shl	%cl, %esi	/* Mask whose low (s & 15) bits are clear.  */
+	pmovmskb %xmm0, %eax	/* eax: bit i set when byte i equals c.  */
+	pmovmskb %xmm3, %edx	/* edx: bit i set when byte i is 0.  */
+	andl	%esi, %eax	/* Discard hits located before s.  */
+	andl	%esi, %edx
+	test	%eax, %eax
+	jnz	L(matches)
+	test	%edx, %edx
+	jnz	L(return_null)	/* Terminator seen and no match in this chunk.  */
+
+L(loop):	/* Scan one aligned 16-byte chunk per iteration.  */
+	movdqa	(%rdi), %xmm0
+	leaq	16(%rdi), %rdi
+	movdqa	%xmm0, %xmm3
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm3
+	pmovmskb %xmm0, %eax
+	pmovmskb %xmm3, %edx
+	or	%eax, %edx	/* Zero only when the chunk has neither c nor a terminator.  */
+	jz	L(loop)
+
+	pmovmskb %xmm3, %edx	/* Rebuild the terminator mask clobbered by the or above.  */
+	test	%eax, %eax
+	jnz	L(matches)
+
+/* Return NULL.  */
+	.p2align 4
+L(return_null):
+	xor	%rax, %rax
+	ret
+
+L(matches):	/* eax = match mask (non-zero), edx = terminator mask for the same chunk.  */
+	/* There is a match.  First find where NULL is.  */
+	leaq	-16(%rdi), %rdi	/* Undo the post-increment: rdi = base of the chunk with the match.  */
+	test	%edx, %edx
+	jz	L(match_case1)	/* No terminator in this chunk.  */
+
+	.p2align 4
+L(match_case2):	/* Chunk holds both a match and a terminator: walk positions upward, testing match before terminator.  */
+	test	%al, %al
+	jz	L(match_high_case2)	/* No match in bytes 0-7.  */
+
+	mov	%al, %cl
+	and	$15, %cl
+	jnz	L(match_case2_4)	/* A match lies in bytes 0-3.  */
+
+	mov	%dl, %ch
+	and	$15, %ch
+	jnz	L(return_null)	/* Terminator in bytes 0-3 with no match there.  */
+
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x10, %dl
+	jnz	L(return_null)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x20, %dl
+	jnz	L(return_null)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	test	$0x40, %dl
+	jnz	L(return_null)
+	lea	7(%rdi), %rax	/* al is non-zero with bits 0-6 clear, so the match is byte 7.  */
+	ret
+
+	.p2align 4
+L(match_case2_4):	/* Low nibble of al is non-zero: resolve bytes 0-3.  */
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x01, %dl
+	jnz	L(return_null)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x02, %dl
+	jnz	L(return_null)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x04, %dl
+	jnz	L(return_null)
+	lea	3(%rdi), %rax	/* Bits 0-2 of al are clear, so the match is byte 3.  */
+	ret
+
+	.p2align 4
+L(match_high_case2):	/* Match is somewhere in bytes 8-15.  */
+	test	%dl, %dl
+	jnz	L(return_null)	/* Terminator in bytes 0-7 comes first.  */
+
+	mov	%ah, %cl
+	and	$15, %cl
+	jnz	L(match_case2_12)	/* A match lies in bytes 8-11.  */
+
+	mov	%dh, %ch
+	and	$15, %ch
+	jnz	L(return_null)	/* Terminator in bytes 8-11 with no match there.  */
+
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x10, %dh
+	jnz	L(return_null)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x20, %dh
+	jnz	L(return_null)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	test	$0x40, %dh
+	jnz	L(return_null)
+	lea	15(%rdi), %rax	/* Reached only when no lower position held a match or terminator.  */
+	ret
+
+	.p2align 4
+L(match_case2_12):	/* Low nibble of ah is non-zero: resolve bytes 8-11.  */
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x01, %dh
+	jnz	L(return_null)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x02, %dh
+	jnz	L(return_null)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x04, %dh
+	jnz	L(return_null)
+	lea	11(%rdi), %rax	/* Bits 0-2 of ah are clear, so the match is byte 11.  */
+	ret
+
+	.p2align 4
+L(match_case1):	/* Reached with edx == 0 (no terminator in the chunk): return the lowest set bit of eax.  */
+	test	%al, %al
+	jz	L(match_high_case1)	/* No match in bytes 0-7.  */
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	lea	7(%rdi), %rax	/* al non-zero with bits 0-6 clear: byte 7.  */
+	ret
+
+	.p2align 4
+L(match_high_case1):	/* Same bit walk over ah, i.e. bytes 8-15.  */
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	lea	15(%rdi), %rax	/* Bits 0-6 of ah are clear: byte 15.  */
+	ret
+
+	.p2align 4
+L(Exit1):	/* L(ExitN) returns %rdi + (N - 1).  */
+	lea	(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit2):
+	lea	1(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit3):
+	lea	2(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit4):
+	lea	3(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit5):
+	lea	4(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit6):
+	lea	5(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit7):
+	lea	6(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit9):	/* There is no L(Exit8) or L(Exit16); offsets 7 and 15 are returned inline by the callers.  */
+	lea	8(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit10):
+	lea	9(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit11):
+	lea	10(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit12):
+	lea	11(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit13):
+	lea	12(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit14):
+	lea	13(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit15):
+	lea	14(%rdi), %rax
+	ret
+
+END (__strchr_sse2_no_bsf)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strchr.S b/REORG.TODO/sysdeps/x86_64/multiarch/strchr.S
new file mode 100644
index 0000000000..c9f54ca2e2
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strchr.S
@@ -0,0 +1,57 @@
+/* Multiple versions of strchr
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(strchr)	/* IFUNC resolver: returns in %rax the address of the implementation to bind.  */
+	.type	strchr, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX	/* Defined outside this file; presumably points %rdx at the CPU-feature data -- confirm in init-arch.h.  */
+	leaq	__strchr_sse2(%rip), %rax	/* Default choice.  */
+2:	HAS_ARCH_FEATURE (Slow_BSF)	/* NOTE(review): nothing in this resolver branches to label 2.  */
+	jz	3f	/* Slow_BSF test zero: keep the default.  */
+	leaq    __strchr_sse2_no_bsf(%rip), %rax	/* Otherwise use the variant written without bsf.  */
+3:	ret
+END(strchr)
+
+
+
+# undef ENTRY	/* From here on ENTRY/END ignore their argument and always emit the hidden symbol __strchr_sse2.  */
+# define ENTRY(name) \
+	.type __strchr_sse2, @function; \
+	.align 16; \
+	.globl __strchr_sse2; \
+	.hidden __strchr_sse2; \
+	__strchr_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strchr_sse2, .-__strchr_sse2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strchr calls through a PLT.
+   The speedup we get from an IFUNC-selected variant is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strchr; __GI_strchr = __strchr_sse2
+#endif
+
+#include "../strchr.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
new file mode 100644
index 0000000000..b0992dce39
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -0,0 +1,213 @@
+/* strcmp with unaligned loads
+   Copyright (C) 2013-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+#include "sysdep.h"
+
+ENTRY ( __strcmp_sse2_unaligned)	/* In: %rdi = s1, %rsi = s2.  Out: %eax = difference of the first differing bytes (0 if equal).  */
+	movl	%edi, %eax
+	xorl	%edx, %edx	/* rdx = 0: starting byte index for L(cross_page).  */
+	pxor	%xmm7, %xmm7	/* xmm7 = 0; it is only ever read after this point.  */
+	orl	%esi, %eax
+	andl	$4095, %eax	/* (s1 | s2) & 4095 is >= the page offset of either pointer.  */
+	cmpl	$4032, %eax	/* 4032 = 4096 - 64.  */
+	jg	L(cross_page)	/* A 64-byte read could cross a page: compare bytewise first.  */
+	movdqu	(%rdi), %xmm1
+	movdqu	(%rsi), %xmm0
+	pcmpeqb	%xmm1, %xmm0	/* 0xff where the bytes are equal.  */
+	pminub	%xmm1, %xmm0	/* 0 where the bytes differ or the s1 byte is NUL.  */
+	pxor	%xmm1, %xmm1	/* xmm1 = 0; reused as the zero operand in L(next_48_bytes).  */
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb	%xmm0, %eax	/* Bit i set: mismatch or end of string at byte i.  */
+	testq	%rax, %rax
+	je	L(next_48_bytes)
+L(return):	/* rax = non-zero stop mask indexed from s1/s2.  */
+	bsfq	%rax, %rdx	/* Index of the first stop position.  */
+	movzbl	(%rdi, %rdx), %eax
+	movzbl	(%rsi, %rdx), %edx
+	subl	%edx, %eax
+	ret
+
+	.p2align 4
+L(next_48_bytes):	/* Same test for bytes 16-63; masks are merged into bits 16-63 of rax.  */
+	movdqu	16(%rdi), %xmm6
+	movdqu	16(%rsi), %xmm3
+	movdqu	32(%rdi), %xmm5
+	pcmpeqb	%xmm6, %xmm3
+	movdqu	32(%rsi), %xmm2
+	pminub	%xmm6, %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	movdqu	48(%rdi), %xmm4
+	pcmpeqb	%xmm5, %xmm2
+	pmovmskb	%xmm3, %edx	/* Mask for bytes 16-31.  */
+	movdqu	48(%rsi), %xmm0
+	pminub	%xmm5, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb	%xmm2, %eax	/* Mask for bytes 32-47.  */
+	salq	$16, %rdx
+	pminub	%xmm4, %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	salq	$32, %rax
+	orq	%rdx, %rax
+	pmovmskb	%xmm0, %ecx	/* Mask for bytes 48-63.  */
+	movq	%rcx, %rdx
+	salq	$48, %rdx
+	orq	%rdx, %rax	/* ZF from this or decides the branch below.  */
+	jne	L(return)
+L(main_loop_header):	/* Set up cursors: rax walks s1 (64-byte aligned), rdx walks s2.  */
+	leaq	64(%rdi), %rdx
+	movl	$4096, %ecx
+	pxor	%xmm9, %xmm9	/* NOTE(review): xmm9 is not referenced anywhere else in this function.  */
+	andq	$-64, %rdx
+	subq	%rdi, %rdx	/* rdx = distance from s1 to the next 64-byte boundary above it.  */
+	leaq	(%rdi, %rdx), %rax	/* rax = 64-byte aligned s1 cursor.  */
+	addq	%rsi, %rdx	/* rdx = s2 cursor, advanced by the same distance.  */
+	movq	%rdx, %rsi
+	andl	$4095, %esi
+	subq	%rsi, %rcx	/* rcx = bytes from the s2 cursor to the end of its 4096-byte page.  */
+	shrq	$6, %rcx
+	movq	%rcx, %rsi	/* rsi = whole 64-byte blocks that fit before that page end.  */
+	jmp	L(loop_start)
+
+	.p2align 4
+L(loop):
+	addq	$64, %rax
+	addq	$64, %rdx
+L(loop_start):
+	testq	%rsi, %rsi
+	leaq	-1(%rsi), %rsi	/* Decrement with lea so the flags from testq survive.  */
+	je	L(loop_cross_page)	/* Block counter was zero.  */
+L(back_to_loop):
+	movdqu	(%rdx), %xmm0
+	movdqu	16(%rdx), %xmm1
+	movdqa	(%rax), %xmm2
+	movdqa	16(%rax), %xmm3
+	pcmpeqb	%xmm2, %xmm0
+	movdqu	32(%rdx), %xmm5
+	pcmpeqb	%xmm3, %xmm1
+	pminub	%xmm2, %xmm0
+	movdqu	48(%rdx), %xmm6
+	pminub	%xmm3, %xmm1
+	movdqa	32(%rax), %xmm2
+	pminub	%xmm1, %xmm0
+	movdqa	48(%rax), %xmm3
+	pcmpeqb	%xmm2, %xmm5
+	pcmpeqb	%xmm3, %xmm6
+	pminub	%xmm2, %xmm5
+	pminub	%xmm3, %xmm6
+	pminub	%xmm5, %xmm0
+	pminub	%xmm6, %xmm0	/* xmm0 = bytewise minimum over all four 16-byte results.  */
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb	%xmm0, %ecx
+	testl	%ecx, %ecx
+	je	L(loop)	/* No zero byte: all 64 bytes equal and no NUL.  */
+	pcmpeqb	%xmm7, %xmm5
+	movdqu	(%rdx), %xmm0	/* xmm0 was overwritten by the combined minimum: redo bytes 0-15.  */
+	pcmpeqb	%xmm7, %xmm1
+	movdqa	(%rax), %xmm2
+	pcmpeqb	%xmm2, %xmm0
+	pminub	%xmm2, %xmm0
+	pcmpeqb	%xmm7, %xmm6
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb	%xmm1, %ecx	/* Mask for bytes 16-31.  */
+	pmovmskb	%xmm5, %r8d	/* Mask for bytes 32-47.  */
+	pmovmskb	%xmm0, %edi	/* Mask for bytes 0-15.  */
+	salq	$16, %rcx
+	salq	$32, %r8
+	pmovmskb	%xmm6, %esi	/* Mask for bytes 48-63.  */
+	orq	%r8, %rcx
+	orq	%rdi, %rcx
+	salq	$48, %rsi
+	orq	%rsi, %rcx	/* rcx = 64-bit stop mask for this block.  */
+	bsfq	%rcx, %rcx
+	movzbl	(%rax, %rcx), %eax
+	movzbl	(%rdx, %rcx), %edx
+	subl	%edx, %eax
+	ret
+
+	.p2align 4
+L(loop_cross_page):	/* Block counter ran out: check the 64-byte block that contains the s2 cursor.  */
+	xor	%r10, %r10
+	movq	%rdx, %r9
+	and	$63, %r9	/* r9 = offset of the s2 cursor inside its 64-byte block.  */
+	subq	%r9, %r10	/* r10 = -r9: both cursors are read r9 bytes earlier.  */
+
+	movdqa	(%rdx, %r10), %xmm0
+	movdqa	16(%rdx, %r10), %xmm1
+	movdqu	(%rax, %r10), %xmm2
+	movdqu	16(%rax, %r10), %xmm3
+	pcmpeqb	%xmm2, %xmm0
+	movdqa	32(%rdx, %r10), %xmm5
+	pcmpeqb	%xmm3, %xmm1
+	pminub	%xmm2, %xmm0
+	movdqa	48(%rdx, %r10), %xmm6
+	pminub	%xmm3, %xmm1
+	movdqu	32(%rax, %r10), %xmm2
+	movdqu	48(%rax, %r10), %xmm3
+	pcmpeqb	%xmm2, %xmm5
+	pcmpeqb	%xmm3, %xmm6
+	pminub	%xmm2, %xmm5
+	pminub	%xmm3, %xmm6
+
+	pcmpeqb	%xmm7, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pcmpeqb	%xmm7, %xmm5
+	pcmpeqb	%xmm7, %xmm6
+
+	pmovmskb	%xmm1, %ecx
+	pmovmskb	%xmm5, %r8d
+	pmovmskb	%xmm0, %edi
+	salq	$16, %rcx
+	salq	$32, %r8
+	pmovmskb	%xmm6, %esi
+	orq	%r8, %rdi
+	orq	%rcx, %rdi
+	salq	$48, %rsi
+	orq	%rsi, %rdi	/* rdi = 64-bit stop mask for the rewound block.  */
+	movq	%r9, %rcx
+	movq	$63, %rsi	/* Re-arm the block counter.  */
+	shrq	%cl, %rdi	/* Drop the r9 bits that belong to bytes before the cursors.  */
+	test	%rdi, %rdi
+	je	L(back_to_loop)
+	bsfq	%rdi, %rcx	/* After the shift the index is relative to rax/rdx.  */
+	movzbl	(%rax, %rcx), %eax
+	movzbl	(%rdx, %rcx), %edx
+	subl	%edx, %eax
+	ret
+
+	.p2align 4
+L(cross_page_loop):	/* Bytewise path: al = s1[rdx] (non-zero here), cl = s2[rdx].  */
+	cmpb	%cl, %al
+	jne	L(different)
+	addq	$1, %rdx
+	cmpq	$64, %rdx
+	je	L(main_loop_header)	/* 64 bytes compared equal with no NUL: switch to the vector loop.  */
+L(cross_page):	/* Entered from the function start with rdx = 0.  */
+	movzbl	(%rdi, %rdx), %eax
+	movzbl	(%rsi, %rdx), %ecx
+	testb	%al, %al
+	jne	L(cross_page_loop)
+	xorl	%eax, %eax	/* s1 ended: result is 0 - s2[rdx].  */
+L(different):
+	subl	%ecx, %eax
+	ret
+END (__strcmp_sse2_unaligned)
+
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse42.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse42.S
new file mode 100644
index 0000000000..ed26d4a8fb
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -0,0 +1,1792 @@
+/* strcmp with SSE4.2
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+/* We use 0x1a:
+	_SIDD_SBYTE_OPS
+	| _SIDD_CMP_EQUAL_EACH
+	| _SIDD_NEGATIVE_POLARITY
+	| _SIDD_LEAST_SIGNIFICANT
+   on pcmpistri to find out if two 16byte data elements are the same
+   and the offset of the first different byte.  There are 4 cases:
+
+   1. Both 16byte data elements are valid and identical.
+   2. Both 16byte data elements have EOS and are identical.
+   3. Both 16byte data elements are valid and they differ at offset X.
+   4. At least one 16byte data element has EOS at offset X.  Two 16byte
+      data elements must differ at or before offset X.
+
+   Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
+
+   case		ECX	CFlag	ZFlag	SFlag
+    1		16	  0	  0	  0
+    2		16	  0	  1	  1
+    3		 X	  1	  0	  0
+    4	       0 <= X	  1	 0/1	 0/1
+
+   We exit from the loop for cases 2, 3 and 4 with jbe which branches
+   when either CFlag or ZFlag is 1.  If CFlag == 0, we return 0 for
+   case 2.  */
+
+	/* Put all SSE 4.2 functions together.  */
+	.section .text.SECTION,"ax",@progbits
+	.align	16
+	.type	STRCMP_SSE42, @function
+	.globl	STRCMP_SSE42
+	.hidden	STRCMP_SSE42
+#ifdef USE_AS_STRCASECMP_L
+ENTRY (GLABEL(__strcasecmp))
+	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
+	mov	%fs:(%rax),%RDX_LP
+
+	// XXX 5 byte should be before the function
+	/* 5-byte NOP.  */
+	.byte	0x0f,0x1f,0x44,0x00,0x00
+END (GLABEL(__strcasecmp))
+	/* FALLTHROUGH to strcasecmp_l.  */
+#endif
+#ifdef USE_AS_STRNCASECMP_L
+ENTRY (GLABEL(__strncasecmp))
+	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
+	mov	%fs:(%rax),%RCX_LP
+
+	// XXX 5 byte should be before the function
+	/* 5-byte NOP.  */
+	.byte	0x0f,0x1f,0x44,0x00,0x00
+END (GLABEL(__strncasecmp))
+	/* FALLTHROUGH to strncasecmp_l.  */
+#endif
+
+
+#ifdef USE_AVX
+# define movdqa vmovdqa
+# define movdqu vmovdqu
+# define pmovmskb vpmovmskb
+# define pcmpistri vpcmpistri
+# define psubb vpsubb
+# define pcmpeqb vpcmpeqb
+# define psrldq vpsrldq
+# define pslldq vpslldq
+# define palignr vpalignr
+# define pxor vpxor
+# define D(arg) arg, arg
+#else
+# define D(arg) arg
+#endif
+
+STRCMP_SSE42:
+	cfi_startproc
+	CALL_MCOUNT
+
+/*
+ * This implementation uses SSE to compare up to 16 bytes at a time.
+ */
+#ifdef USE_AS_STRCASECMP_L
+	/* We have to fall back on the C implementation for locales
+	   with encodings not matching ASCII for single bytes.  */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
+# else
+	mov	(%rdx), %RAX_LP
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
+	jne	__strcasecmp_l_nonascii
+#endif
+#ifdef USE_AS_STRNCASECMP_L
+	/* We have to fall back on the C implementation for locales
+	   with encodings not matching ASCII for single bytes.  */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
+# else
+	mov	(%rcx), %RAX_LP
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
+	jne	__strncasecmp_l_nonascii
+#endif
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	test	%rdx, %rdx
+	je	LABEL(strcmp_exitz)
+	cmp	$1, %rdx
+	je	LABEL(Byte0)
+	mov	%rdx, %r11
+#endif
+	mov	%esi, %ecx
+	mov	%edi, %eax
+/* Use 64bit AND here to avoid long NOP padding.  */
+	and	$0x3f, %rcx		/* rsi alignment in cache line */
+	and	$0x3f, %rax		/* rdi alignment in cache line */
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	.section .rodata.cst16,"aM",@progbits,16
+	.align 16
+LABEL(belowupper):
+	.quad	0x4040404040404040
+	.quad	0x4040404040404040
+LABEL(topupper):
+# ifdef USE_AVX
+	.quad	0x5a5a5a5a5a5a5a5a
+	.quad	0x5a5a5a5a5a5a5a5a
+# else
+	.quad	0x5b5b5b5b5b5b5b5b
+	.quad	0x5b5b5b5b5b5b5b5b
+# endif
+LABEL(touppermask):
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+	movdqa	LABEL(belowupper)(%rip), %xmm4
+# define UCLOW_reg %xmm4
+	movdqa	LABEL(topupper)(%rip), %xmm5
+# define UCHIGH_reg %xmm5
+	movdqa	LABEL(touppermask)(%rip), %xmm6
+# define LCQWORD_reg %xmm6
+#endif
+	cmp	$0x30, %ecx
+	ja	LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
+	cmp	$0x30, %eax
+	ja	LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
+	movdqu	(%rdi), %xmm1
+	movdqu	(%rsi), %xmm2
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef USE_AVX
+#  define TOLOWER(reg1, reg2) \
+	vpcmpgtb UCLOW_reg, reg1, %xmm7;			\
+	vpcmpgtb UCHIGH_reg, reg1, %xmm8;			\
+	vpcmpgtb UCLOW_reg, reg2, %xmm9;			\
+	vpcmpgtb UCHIGH_reg, reg2, %xmm10;			\
+	vpandn	%xmm7, %xmm8, %xmm8;					\
+	vpandn	%xmm9, %xmm10, %xmm10;					\
+	vpand	LCQWORD_reg, %xmm8, %xmm8;				\
+	vpand	LCQWORD_reg, %xmm10, %xmm10;				\
+	vpor	reg1, %xmm8, reg1;					\
+	vpor	reg2, %xmm10, reg2
+# else
+#  define TOLOWER(reg1, reg2) \
+	movdqa	reg1, %xmm7;					\
+	movdqa	UCHIGH_reg, %xmm8;				\
+	movdqa	reg2, %xmm9;					\
+	movdqa	UCHIGH_reg, %xmm10;				\
+	pcmpgtb	UCLOW_reg, %xmm7;				\
+	pcmpgtb	reg1, %xmm8;					\
+	pcmpgtb	UCLOW_reg, %xmm9;				\
+	pcmpgtb	reg2, %xmm10;					\
+	pand	%xmm8, %xmm7;					\
+	pand	%xmm10, %xmm9;					\
+	pand	LCQWORD_reg, %xmm7;				\
+	pand	LCQWORD_reg, %xmm9;				\
+	por	%xmm7, reg1;					\
+	por	%xmm9, reg2
+# endif
+	TOLOWER (%xmm1, %xmm2)
+#else
+# define TOLOWER(reg1, reg2)
+#endif
+	pxor	%xmm0, D(%xmm0)		/* clear %xmm0 for null char checks */
+	pcmpeqb	%xmm1, D(%xmm0)		/* Any null chars? */
+	pcmpeqb	%xmm2, D(%xmm1)		/* compare first 16 bytes for equality */
+	psubb	%xmm0, D(%xmm1)		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
+	jnz	LABEL(less16bytes)/* If not, find different value or null char */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)/* finish comparison */
+#endif
+	add	$16, %rsi		/* prepare to search next 16 bytes */
+	add	$16, %rdi		/* prepare to search next 16 bytes */
+
+	/*
+	 * Determine source and destination string offsets from 16-byte
+	 * alignment.  Use relative offset difference between the two to
+	 * determine which case below to use.
+	 */
+	.p2align 4
+LABEL(crosscache):
+	and	$0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
+	and	$0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
+	mov	$0xffff, %edx		/* for equivalent offset */
+	xor	%r8d, %r8d
+	and	$0xf, %ecx		/* offset of rsi */
+	and	$0xf, %eax		/* offset of rdi */
+	pxor	%xmm0, D(%xmm0)		/* clear %xmm0 for null char check */
+	cmp	%eax, %ecx
+	je	LABEL(ashr_0)		/* rsi and rdi relative offset same */
+	ja	LABEL(bigger)
+	mov	%edx, %r8d		/* r8d is offset flag for exit tail */
+	xchg	%ecx, %eax
+	xchg	%rsi, %rdi
+LABEL(bigger):
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	lea	15(%rax), %r9
+	sub	%rcx, %r9
+	lea	LABEL(unaligned_table)(%rip), %r10
+	movslq	(%r10, %r9,4), %r9
+	pcmpeqb	%xmm1, D(%xmm0)		/* Any null chars? */
+	lea	(%r10, %r9), %r10
+	jmp	*%r10			/* jump to corresponding case */
+
+/*
+ * The following cases are handled by ashr_0 (relative offset = 15 + rax - rcx, i.e. rax == rcx):
+ *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
+ *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
+ */
+	.p2align 4
+LABEL(ashr_0):
+
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, D(%xmm0)		/* Any null chars? */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpeqb	(%rdi), D(%xmm1)	/* compare 16 bytes for equality */
+#else
+	movdqa	(%rdi), %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm2, D(%xmm1)		/* compare 16 bytes for equality */
+#endif
+	psubb	%xmm0, D(%xmm1)		/* packed sub of comparison results */
+	pmovmskb %xmm1, %r9d		/* %r9d = sign bit of each result byte */
+	shr	%cl, %edx		/* adjust 0xffff for offset */
+	shr	%cl, %r9d		/* adjust for 16-byte offset */
+	sub	%r9d, %edx		/* zero only if both masks agree */
+	/*
+	 * edx equals r9d only if the remaining (16-rcx) bytes of the two
+	 * strings are equal and no null char was seen.
+	 */
+	jne	LABEL(less32bytes)	/* mismatch or null char */
+	UPDATE_STRNCMP_COUNTER
+	mov	$16, %rcx
+	mov	$16, %r9
+
+	/*
+	 * Now both strings are aligned at a 16-byte boundary. Loop over the
+	 * strings, checking 32 bytes per iteration (two unrolled 16-byte steps).
+	 */
+	mov	%rcx, %rdx	/* load offset for the loop; pcmpistri overwrites %ecx */
+	.p2align 4
+LABEL(ashr_0_use):
+	movdqa	(%rdi,%rdx), %xmm0
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri      $0x1a,(%rsi,%rdx), %xmm0	/* %ecx = index of first differing byte */
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	lea	16(%rdx), %rdx		/* advance offset; lea leaves the pcmpistri flags intact */
+	jbe	LABEL(ashr_0_exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11		/* presumably %r11 = bytes left to compare -- confirm */
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	movdqa	(%rdi,%rdx), %xmm0	/* second, unrolled copy of the step above */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri      $0x1a,(%rsi,%rdx), %xmm0
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	lea	16(%rdx), %rdx		/* advance offset without touching the flags */
+	jbe	LABEL(ashr_0_exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	jmp	LABEL(ashr_0_use)
+
+
+	.p2align 4
+LABEL(ashr_0_exit_use):
+	jnc	LABEL(strcmp_exitz)	/* CF clear: no mismatch, so a null char ended the block */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	%rcx, %r11		/* is the mismatch index at or past %r11? */
+	jbe	LABEL(strcmp_exitz)
+#endif
+	lea	-16(%rdx, %rcx), %rcx	/* %rdx was already advanced by 16 */
+	movzbl	(%rdi, %rcx), %eax	/* byte at the mismatch position, via %rdi */
+	movzbl	(%rsi, %rcx), %edx	/* byte at the mismatch position, via %rsi */
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx	/* map both bytes through the tolower table */
+	movl	(%rcx,%rax,4), %eax
+	movl	(%rcx,%rdx,4), %edx
+#endif
+	sub	%edx, %eax		/* result = difference of the two bytes */
+	ret
+
+
+
+/*
+ * The following cases are handled by ashr_1 (relative offset = 15 + rax - rcx):
+ * rcx(offset of rsi)  rax(offset of rdi)   relative offset	corresponding case
+ *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
+ */
+	.p2align 4
+LABEL(ashr_1):
+	pslldq	$15, D(%xmm2)		/* shift first string left 15 bytes to align with second */
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, D(%xmm2)		/* compare 16 bytes for equality */
+	psubb	%xmm0, D(%xmm2)		/* packed sub of comparison results */
+	pmovmskb %xmm2, %r9d		/* %r9d = sign bit of each result byte */
+	shr	%cl, %edx		/* adjust 0xffff for offset */
+	shr	%cl, %r9d		/* adjust for 16-byte offset */
+	sub	%r9d, %edx		/* zero only if both masks agree */
+	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
+	movdqa	(%rdi), %xmm3		/* NOTE(review): %xmm3 is not read again in this case body -- confirm it is needed */
+	UPDATE_STRNCMP_COUNTER
+
+	mov	$16, %rcx		/* index for loads */
+	mov	$1, %r9d		/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so that we can detect crossing a page boundary.
+	 * When %r10 goes positive we have crossed a page boundary and
+	 * need to do a nibble.
+	 */
+	lea	1(%rdi), %r10
+	and	$0xfff, %r10		/* offset into 4K page */
+	sub	$0x1000, %r10		/* subtract 4K pagesize */
+	mov	%rcx, %rdx	/* load offset for the loop; pcmpistri overwrites %ecx */
+
+	.p2align 4
+LABEL(loop_ashr_1_use):
+	add	$16, %r10		/* account for the next 16-byte load */
+	jg	LABEL(nibble_ashr_1_use)	/* positive: that load would cross into a new page */
+
+LABEL(nibble_ashr_1_restart_use):
+	movdqa	(%rdi, %rdx), %xmm0
+	palignr $1, -16(%rdi, %rdx), D(%xmm0)	/* bytes 1..15 of previous block + byte 0 of this block */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0	/* %ecx = index of first differing byte */
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)		/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11		/* presumably %r11 = bytes left to compare -- confirm */
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rdx		/* next 16-byte block */
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_1_use)
+
+	movdqa	(%rdi, %rdx), %xmm0	/* second, unrolled copy of the step above */
+	palignr $1, -16(%rdi, %rdx), D(%xmm0)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)		/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	add	$16, %rdx		/* next 16-byte block */
+	jmp	LABEL(loop_ashr_1_use)
+
+	.p2align 4
+LABEL(nibble_ashr_1_use):
+	sub	$0x1000, %r10		/* re-bias the tracker for the next page */
+	movdqa	-16(%rdi, %rdx), %xmm0	/* last 16-byte block before the page boundary */
+	psrldq	$1, D(%xmm0)		/* discard the low 1 byte; zero fills the top byte */
+	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of first null byte in %xmm0 */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	%r11, %rcx
+	jae	LABEL(nibble_ashr_exit_use)
+#endif
+	cmp	$14, %ecx		/* is that null within the 15 real bytes (index <= 14)? */
+	ja	LABEL(nibble_ashr_1_restart_use)	/* no: string continues, resume across the page */
+
+	jmp	LABEL(nibble_ashr_exit_use)	/* yes: string ends before the page boundary */
+
+/*
+ * The following cases are handled by ashr_2 (relative offset = 15 + rax - rcx):
+ * rcx(offset of rsi)  rax(offset of rdi)   relative offset	corresponding case
+ *        n(14~15)            n -14         1(15 +(n-14) - n)         ashr_2
+ */
+	.p2align 4
+LABEL(ashr_2):
+	pslldq	$14, D(%xmm2)	/* shift first string left 14 bytes to align with second */
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, D(%xmm2)	/* compare 16 bytes for equality */
+	psubb	%xmm0, D(%xmm2)	/* packed sub of comparison results */
+	pmovmskb %xmm2, %r9d	/* %r9d = sign bit of each result byte */
+	shr	%cl, %edx	/* adjust 0xffff for offset */
+	shr	%cl, %r9d	/* adjust for 16-byte offset */
+	sub	%r9d, %edx	/* zero only if both masks agree */
+	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
+	movdqa	(%rdi), %xmm3	/* NOTE(review): %xmm3 is not read again in this case body -- confirm it is needed */
+	UPDATE_STRNCMP_COUNTER
+
+	mov	$16, %rcx	/* index for loads */
+	mov	$2, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so that we can detect crossing a page boundary.
+	 * When %r10 goes positive we have crossed a page boundary and
+	 * need to do a nibble.
+	 */
+	lea	2(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	mov	%rcx, %rdx	/* load offset for the loop; pcmpistri overwrites %ecx */
+
+	.p2align 4
+LABEL(loop_ashr_2_use):
+	add	$16, %r10	/* account for the next 16-byte load */
+	jg	LABEL(nibble_ashr_2_use)	/* positive: that load would cross into a new page */
+
+LABEL(nibble_ashr_2_restart_use):
+	movdqa	(%rdi, %rdx), %xmm0
+	palignr $2, -16(%rdi, %rdx), D(%xmm0)	/* bytes 2..15 of previous block + bytes 0..1 of this block */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0	/* %ecx = index of first differing byte */
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11	/* presumably %r11 = bytes left to compare -- confirm */
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rdx	/* next 16-byte block */
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_2_use)
+
+	movdqa	(%rdi, %rdx), %xmm0	/* second, unrolled copy of the step above */
+	palignr $2, -16(%rdi, %rdx), D(%xmm0)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	add	$16, %rdx	/* next 16-byte block */
+	jmp	LABEL(loop_ashr_2_use)
+
+	.p2align 4
+LABEL(nibble_ashr_2_use):
+	sub	$0x1000, %r10	/* re-bias the tracker for the next page */
+	movdqa	-16(%rdi, %rdx), %xmm0	/* last 16-byte block before the page boundary */
+	psrldq	$2, D(%xmm0)	/* discard the low 2 bytes; zeros fill the top 2 */
+	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of first null byte in %xmm0 */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	%r11, %rcx
+	jae	LABEL(nibble_ashr_exit_use)
+#endif
+	cmp	$13, %ecx	/* is that null within the 14 real bytes (index <= 13)? */
+	ja	LABEL(nibble_ashr_2_restart_use)	/* no: string continues, resume across the page */
+
+	jmp	LABEL(nibble_ashr_exit_use)	/* yes: string ends before the page boundary */
+
+/*
+ * The following cases are handled by ashr_3 (relative offset = 15 + rax - rcx):
+ *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
+ *        n(13~15)            n -13         2(15 +(n-13) - n)         ashr_3
+ */
+	.p2align 4
+LABEL(ashr_3):
+	pslldq	$13, D(%xmm2)	/* shift first string left 13 bytes to align with second */
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, D(%xmm2)	/* compare 16 bytes for equality */
+	psubb	%xmm0, D(%xmm2)	/* packed sub of comparison results */
+	pmovmskb %xmm2, %r9d	/* %r9d = sign bit of each result byte */
+	shr	%cl, %edx	/* adjust 0xffff for offset */
+	shr	%cl, %r9d	/* adjust for 16-byte offset */
+	sub	%r9d, %edx	/* zero only if both masks agree */
+	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
+	movdqa	(%rdi), %xmm3	/* NOTE(review): %xmm3 is not read again in this case body -- confirm it is needed */
+
+	UPDATE_STRNCMP_COUNTER
+
+	mov	$16, %rcx	/* index for loads */
+	mov	$3, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so that we can detect crossing a page boundary.
+	 * When %r10 goes positive we have crossed a page boundary and
+	 * need to do a nibble.
+	 */
+	lea	3(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	mov	%rcx, %rdx	/* load offset for the loop; pcmpistri overwrites %ecx */
+
+LABEL(loop_ashr_3_use):
+	add	$16, %r10	/* account for the next 16-byte load */
+	jg	LABEL(nibble_ashr_3_use)	/* positive: that load would cross into a new page */
+
+LABEL(nibble_ashr_3_restart_use):
+	movdqa	(%rdi, %rdx), %xmm0
+	palignr $3, -16(%rdi, %rdx), D(%xmm0)	/* bytes 3..15 of previous block + bytes 0..2 of this block */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0	/* %ecx = index of first differing byte */
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11	/* presumably %r11 = bytes left to compare -- confirm */
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rdx	/* next 16-byte block */
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_3_use)
+
+	movdqa	(%rdi, %rdx), %xmm0	/* second, unrolled copy of the step above */
+	palignr $3, -16(%rdi, %rdx), D(%xmm0)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	add	$16, %rdx	/* next 16-byte block */
+	jmp	LABEL(loop_ashr_3_use)
+
+	.p2align 4
+LABEL(nibble_ashr_3_use):
+	sub	$0x1000, %r10	/* re-bias the tracker for the next page */
+	movdqa	-16(%rdi, %rdx), %xmm0	/* last 16-byte block before the page boundary */
+	psrldq	$3, D(%xmm0)	/* discard the low 3 bytes; zeros fill the top 3 */
+	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of first null byte in %xmm0 */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	%r11, %rcx
+	jae	LABEL(nibble_ashr_exit_use)
+#endif
+	cmp	$12, %ecx	/* is that null within the 13 real bytes (index <= 12)? */
+	ja	LABEL(nibble_ashr_3_restart_use)	/* no: string continues, resume across the page */
+
+	jmp	LABEL(nibble_ashr_exit_use)	/* yes: string ends before the page boundary */
+
+/*
+ * The following cases are handled by ashr_4 (relative offset = 15 + rax - rcx):
+ *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
+ *        n(12~15)            n -12         3(15 +(n-12) - n)         ashr_4
+ */
+	.p2align 4
+LABEL(ashr_4):
+	pslldq	$12, D(%xmm2)	/* shift first string left 12 bytes to align with second */
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, D(%xmm2)	/* compare 16 bytes for equality */
+	psubb	%xmm0, D(%xmm2)	/* packed sub of comparison results */
+	pmovmskb %xmm2, %r9d	/* %r9d = sign bit of each result byte */
+	shr	%cl, %edx	/* adjust 0xffff for offset */
+	shr	%cl, %r9d	/* adjust for 16-byte offset */
+	sub	%r9d, %edx	/* zero only if both masks agree */
+	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
+	movdqa	(%rdi), %xmm3	/* NOTE(review): %xmm3 is not read again in this case body -- confirm it is needed */
+
+	UPDATE_STRNCMP_COUNTER
+
+	mov	$16, %rcx	/* index for loads */
+	mov	$4, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so that we can detect crossing a page boundary.
+	 * When %r10 goes positive we have crossed a page boundary and
+	 * need to do a nibble.
+	 */
+	lea	4(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	mov	%rcx, %rdx	/* load offset for the loop; pcmpistri overwrites %ecx */
+
+	.p2align 4
+LABEL(loop_ashr_4_use):
+	add	$16, %r10	/* account for the next 16-byte load */
+	jg	LABEL(nibble_ashr_4_use)	/* positive: that load would cross into a new page */
+
+LABEL(nibble_ashr_4_restart_use):
+	movdqa	(%rdi, %rdx), %xmm0
+	palignr $4, -16(%rdi, %rdx), D(%xmm0)	/* bytes 4..15 of previous block + bytes 0..3 of this block */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0	/* %ecx = index of first differing byte */
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11	/* presumably %r11 = bytes left to compare -- confirm */
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rdx	/* next 16-byte block */
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_4_use)
+
+	movdqa	(%rdi, %rdx), %xmm0	/* second, unrolled copy of the step above */
+	palignr $4, -16(%rdi, %rdx), D(%xmm0)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	add	$16, %rdx	/* next 16-byte block */
+	jmp	LABEL(loop_ashr_4_use)
+
+	.p2align 4
+LABEL(nibble_ashr_4_use):
+	sub	$0x1000, %r10	/* re-bias the tracker for the next page */
+	movdqa	-16(%rdi, %rdx), %xmm0	/* last 16-byte block before the page boundary */
+	psrldq	$4, D(%xmm0)	/* discard the low 4 bytes; zeros fill the top 4 */
+	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of first null byte in %xmm0 */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	%r11, %rcx
+	jae	LABEL(nibble_ashr_exit_use)
+#endif
+	cmp	$11, %ecx	/* is that null within the 12 real bytes (index <= 11)? */
+	ja	LABEL(nibble_ashr_4_restart_use)	/* no: string continues, resume across the page */
+
+	jmp	LABEL(nibble_ashr_exit_use)	/* yes: string ends before the page boundary */
+
+/*
+ * The following cases are handled by ashr_5 (relative offset = 15 + rax - rcx):
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
+ *        n(11~15)          n - 11		  4(15 +(n-11) - n)         ashr_5
+ */
+	.p2align 4
+LABEL(ashr_5):
+	pslldq	$11, D(%xmm2)	/* shift first string left 11 bytes to align with second */
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, D(%xmm2)	/* compare 16 bytes for equality */
+	psubb	%xmm0, D(%xmm2)	/* packed sub of comparison results */
+	pmovmskb %xmm2, %r9d	/* %r9d = sign bit of each result byte */
+	shr	%cl, %edx	/* adjust 0xffff for offset */
+	shr	%cl, %r9d	/* adjust for 16-byte offset */
+	sub	%r9d, %edx	/* zero only if both masks agree */
+	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
+	movdqa	(%rdi), %xmm3	/* NOTE(review): %xmm3 is not read again in this case body -- confirm it is needed */
+
+	UPDATE_STRNCMP_COUNTER
+
+	mov	$16, %rcx	/* index for loads */
+	mov	$5, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so that we can detect crossing a page boundary.
+	 * When %r10 goes positive we have crossed a page boundary and
+	 * need to do a nibble.
+	 */
+	lea	5(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	mov	%rcx, %rdx	/* load offset for the loop; pcmpistri overwrites %ecx */
+
+	.p2align 4
+LABEL(loop_ashr_5_use):
+	add	$16, %r10	/* account for the next 16-byte load */
+	jg	LABEL(nibble_ashr_5_use)	/* positive: that load would cross into a new page */
+
+LABEL(nibble_ashr_5_restart_use):
+	movdqa	(%rdi, %rdx), %xmm0
+	palignr $5, -16(%rdi, %rdx), D(%xmm0)	/* bytes 5..15 of previous block + bytes 0..4 of this block */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0	/* %ecx = index of first differing byte */
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11	/* presumably %r11 = bytes left to compare -- confirm */
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rdx	/* next 16-byte block */
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_5_use)
+
+	movdqa	(%rdi, %rdx), %xmm0	/* second, unrolled copy of the step above */
+
+	palignr $5, -16(%rdi, %rdx), D(%xmm0)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	add	$16, %rdx	/* next 16-byte block */
+	jmp	LABEL(loop_ashr_5_use)
+
+	.p2align 4
+LABEL(nibble_ashr_5_use):
+	sub	$0x1000, %r10	/* re-bias the tracker for the next page */
+	movdqa	-16(%rdi, %rdx), %xmm0	/* last 16-byte block before the page boundary */
+	psrldq	$5, D(%xmm0)	/* discard the low 5 bytes; zeros fill the top 5 */
+	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of first null byte in %xmm0 */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	%r11, %rcx
+	jae	LABEL(nibble_ashr_exit_use)
+#endif
+	cmp	$10, %ecx	/* is that null within the 11 real bytes (index <= 10)? */
+	ja	LABEL(nibble_ashr_5_restart_use)	/* no: string continues, resume across the page */
+
+	jmp	LABEL(nibble_ashr_exit_use)	/* yes: string ends before the page boundary */
+
+/*
+ * The following cases are handled by ashr_6 (relative offset = 15 + rax - rcx):
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
+ *        n(10~15)          n - 10		  5(15 +(n-10) - n)         ashr_6
+ */
+	.p2align 4
+LABEL(ashr_6):
+	pslldq	$10, D(%xmm2)	/* shift first string left 10 bytes to align with second */
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, D(%xmm2)	/* compare 16 bytes for equality */
+	psubb	%xmm0, D(%xmm2)	/* packed sub of comparison results */
+	pmovmskb %xmm2, %r9d	/* %r9d = sign bit of each result byte */
+	shr	%cl, %edx	/* adjust 0xffff for offset */
+	shr	%cl, %r9d	/* adjust for 16-byte offset */
+	sub	%r9d, %edx	/* zero only if both masks agree */
+	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
+	movdqa	(%rdi), %xmm3	/* NOTE(review): %xmm3 is not read again in this case body -- confirm it is needed */
+
+	UPDATE_STRNCMP_COUNTER
+
+	mov	$16, %rcx	/* index for loads */
+	mov	$6, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so that we can detect crossing a page boundary.
+	 * When %r10 goes positive we have crossed a page boundary and
+	 * need to do a nibble.
+	 */
+	lea	6(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	mov	%rcx, %rdx	/* load offset for the loop; pcmpistri overwrites %ecx */
+
+	.p2align 4
+LABEL(loop_ashr_6_use):
+	add	$16, %r10	/* account for the next 16-byte load */
+	jg	LABEL(nibble_ashr_6_use)	/* positive: that load would cross into a new page */
+
+LABEL(nibble_ashr_6_restart_use):
+	movdqa	(%rdi, %rdx), %xmm0
+	palignr $6, -16(%rdi, %rdx), D(%xmm0)	/* bytes 6..15 of previous block + bytes 0..5 of this block */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri $0x1a,(%rsi,%rdx), %xmm0	/* %ecx = index of first differing byte */
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11	/* presumably %r11 = bytes left to compare -- confirm */
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rdx	/* next 16-byte block */
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_6_use)
+
+	movdqa	(%rdi, %rdx), %xmm0	/* second, unrolled copy of the step above */
+	palignr $6, -16(%rdi, %rdx), D(%xmm0)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	add	$16, %rdx	/* next 16-byte block */
+	jmp	LABEL(loop_ashr_6_use)
+
+	.p2align 4
+LABEL(nibble_ashr_6_use):
+	sub	$0x1000, %r10	/* re-bias the tracker for the next page */
+	movdqa	-16(%rdi, %rdx), %xmm0	/* last 16-byte block before the page boundary */
+	psrldq	$6, D(%xmm0)	/* discard the low 6 bytes; zeros fill the top 6 */
+	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of first null byte in %xmm0 */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	%r11, %rcx
+	jae	LABEL(nibble_ashr_exit_use)
+#endif
+	cmp	$9, %ecx	/* is that null within the 10 real bytes (index <= 9)? */
+	ja	LABEL(nibble_ashr_6_restart_use)	/* no: string continues, resume across the page */
+
+	jmp	LABEL(nibble_ashr_exit_use)	/* yes: string ends before the page boundary */
+
+/*
+ * The following cases are handled by ashr_7 (relative offset = 15 + rax - rcx):
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
+ *        n(9~15)          n - 9		  6(15 +(n - 9) - n)         ashr_7
+ */
+	.p2align 4
+LABEL(ashr_7):
+	pslldq	$9, D(%xmm2)	/* shift first string left 9 bytes to align with second */
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, D(%xmm2)	/* compare 16 bytes for equality */
+	psubb	%xmm0, D(%xmm2)	/* packed sub of comparison results */
+	pmovmskb %xmm2, %r9d	/* %r9d = sign bit of each result byte */
+	shr	%cl, %edx	/* adjust 0xffff for offset */
+	shr	%cl, %r9d	/* adjust for 16-byte offset */
+	sub	%r9d, %edx	/* zero only if both masks agree */
+	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
+	movdqa	(%rdi), %xmm3	/* NOTE(review): %xmm3 is not read again in this case body -- confirm it is needed */
+
+	UPDATE_STRNCMP_COUNTER
+
+	mov	$16, %rcx	/* index for loads */
+	mov	$7, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so that we can detect crossing a page boundary.
+	 * When %r10 goes positive we have crossed a page boundary and
+	 * need to do a nibble.
+	 */
+	lea	7(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	mov	%rcx, %rdx	/* load offset for the loop; pcmpistri overwrites %ecx */
+
+	.p2align 4
+LABEL(loop_ashr_7_use):
+	add	$16, %r10	/* account for the next 16-byte load */
+	jg	LABEL(nibble_ashr_7_use)	/* positive: that load would cross into a new page */
+
+LABEL(nibble_ashr_7_restart_use):
+	movdqa	(%rdi, %rdx), %xmm0
+	palignr $7, -16(%rdi, %rdx), D(%xmm0)	/* bytes 7..15 of previous block + bytes 0..6 of this block */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0	/* %ecx = index of first differing byte */
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11	/* presumably %r11 = bytes left to compare -- confirm */
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rdx	/* next 16-byte block */
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_7_use)
+
+	movdqa	(%rdi, %rdx), %xmm0	/* second, unrolled copy of the step above */
+	palignr $7, -16(%rdi, %rdx), D(%xmm0)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	add	$16, %rdx	/* next 16-byte block */
+	jmp	LABEL(loop_ashr_7_use)
+
+	.p2align 4
+LABEL(nibble_ashr_7_use):
+	sub	$0x1000, %r10	/* re-bias the tracker for the next page */
+	movdqa	-16(%rdi, %rdx), %xmm0	/* last 16-byte block before the page boundary */
+	psrldq	$7, D(%xmm0)	/* discard the low 7 bytes; zeros fill the top 7 */
+	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of first null byte in %xmm0 */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	%r11, %rcx
+	jae	LABEL(nibble_ashr_exit_use)
+#endif
+	cmp	$8, %ecx	/* is that null within the 9 real bytes (index <= 8)? */
+	ja	LABEL(nibble_ashr_7_restart_use)	/* no: string continues, resume across the page */
+
+	jmp	LABEL(nibble_ashr_exit_use)	/* yes: string ends before the page boundary */
+
+/*
+ * The following cases are handled by ashr_8 (relative offset = 15 + rax - rcx):
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
+ *        n(8~15)          n - 8		  7(15 +(n - 8) - n)         ashr_8
+ */
+	.p2align 4
+LABEL(ashr_8):
+	pslldq	$8, D(%xmm2)	/* shift first string left 8 bytes to align with second */
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, D(%xmm2)	/* compare 16 bytes for equality */
+	psubb	%xmm0, D(%xmm2)	/* packed sub of comparison results */
+	pmovmskb %xmm2, %r9d	/* %r9d = sign bit of each result byte */
+	shr	%cl, %edx	/* adjust 0xffff for offset */
+	shr	%cl, %r9d	/* adjust for 16-byte offset */
+	sub	%r9d, %edx	/* zero only if both masks agree */
+	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
+	movdqa	(%rdi), %xmm3	/* NOTE(review): %xmm3 is not read again in this case body -- confirm it is needed */
+
+	UPDATE_STRNCMP_COUNTER
+
+	mov	$16, %rcx	/* index for loads */
+	mov	$8, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so that we can detect crossing a page boundary.
+	 * When %r10 goes positive we have crossed a page boundary and
+	 * need to do a nibble.
+	 */
+	lea	8(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	mov	%rcx, %rdx	/* load offset for the loop; pcmpistri overwrites %ecx */
+
+	.p2align 4
+LABEL(loop_ashr_8_use):
+	add	$16, %r10	/* account for the next 16-byte load */
+	jg	LABEL(nibble_ashr_8_use)	/* positive: that load would cross into a new page */
+
+LABEL(nibble_ashr_8_restart_use):
+	movdqa	(%rdi, %rdx), %xmm0
+	palignr $8, -16(%rdi, %rdx), D(%xmm0)	/* bytes 8..15 of previous block + bytes 0..7 of this block */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri $0x1a, (%rsi,%rdx), %xmm0	/* %ecx = index of first differing byte */
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11	/* presumably %r11 = bytes left to compare -- confirm */
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rdx	/* next 16-byte block */
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_8_use)
+
+	movdqa	(%rdi, %rdx), %xmm0	/* second, unrolled copy of the step above */
+	palignr $8, -16(%rdi, %rdx), D(%xmm0)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	add	$16, %rdx	/* next 16-byte block */
+	jmp	LABEL(loop_ashr_8_use)
+
+	.p2align 4
+LABEL(nibble_ashr_8_use):
+	sub	$0x1000, %r10	/* re-bias the tracker for the next page */
+	movdqa	-16(%rdi, %rdx), %xmm0	/* last 16-byte block before the page boundary */
+	psrldq	$8, D(%xmm0)	/* discard the low 8 bytes; zeros fill the top 8 */
+	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of first null byte in %xmm0 */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	%r11, %rcx
+	jae	LABEL(nibble_ashr_exit_use)
+#endif
+	cmp	$7, %ecx	/* is that null within the 8 real bytes (index <= 7)? */
+	ja	LABEL(nibble_ashr_8_restart_use)	/* no: string continues, resume across the page */
+
+	jmp	LABEL(nibble_ashr_exit_use)	/* yes: string ends before the page boundary */
+
+/*
+ * The following cases are handled by ashr_9 (relative offset = 15 + rax - rcx):
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
+ *        n(7~15)          n - 7		  8(15 +(n - 7) - n)         ashr_9
+ */
+	.p2align 4
+LABEL(ashr_9):
+	pslldq	$7, D(%xmm2)	/* shift first string left 7 bytes to align with second */
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, D(%xmm2)	/* compare 16 bytes for equality */
+	psubb	%xmm0, D(%xmm2)	/* packed sub of comparison results */
+	pmovmskb %xmm2, %r9d	/* %r9d = sign bit of each result byte */
+	shr	%cl, %edx	/* adjust 0xffff for offset */
+	shr	%cl, %r9d	/* adjust for 16-byte offset */
+	sub	%r9d, %edx	/* zero only if both masks agree */
+	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
+	movdqa	(%rdi), %xmm3	/* NOTE(review): %xmm3 is not read again in this case body -- confirm it is needed */
+
+	UPDATE_STRNCMP_COUNTER
+
+	mov	$16, %rcx	/* index for loads */
+	mov	$9, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so that we can detect crossing a page boundary.
+	 * When %r10 goes positive we have crossed a page boundary and
+	 * need to do a nibble.
+	 */
+	lea	9(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	mov	%rcx, %rdx	/* load offset for the loop; pcmpistri overwrites %ecx */
+
+	.p2align 4
+LABEL(loop_ashr_9_use):
+	add	$16, %r10	/* account for the next 16-byte load */
+	jg	LABEL(nibble_ashr_9_use)	/* positive: that load would cross into a new page */
+
+LABEL(nibble_ashr_9_restart_use):
+	movdqa	(%rdi, %rdx), %xmm0
+
+	palignr $9, -16(%rdi, %rdx), D(%xmm0)	/* bytes 9..15 of previous block + bytes 0..8 of this block */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri $0x1a, (%rsi,%rdx), %xmm0	/* %ecx = index of first differing byte */
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11	/* presumably %r11 = bytes left to compare -- confirm */
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rdx	/* next 16-byte block */
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_9_use)
+
+	movdqa	(%rdi, %rdx), %xmm0	/* second, unrolled copy of the step above */
+	palignr $9, -16(%rdi, %rdx), D(%xmm0)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	add	$16, %rdx	/* next 16-byte block */
+	jmp	LABEL(loop_ashr_9_use)
+
+	.p2align 4
+LABEL(nibble_ashr_9_use):
+	sub	$0x1000, %r10	/* re-bias the tracker for the next page */
+	movdqa	-16(%rdi, %rdx), %xmm0	/* last 16-byte block before the page boundary */
+	psrldq	$9, D(%xmm0)	/* discard the low 9 bytes; zeros fill the top 9 */
+	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of first null byte in %xmm0 */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	%r11, %rcx
+	jae	LABEL(nibble_ashr_exit_use)
+#endif
+	cmp	$6, %ecx	/* is that null within the 7 real bytes (index <= 6)? */
+	ja	LABEL(nibble_ashr_9_restart_use)	/* no: string continues, resume across the page */
+
+	jmp	LABEL(nibble_ashr_exit_use)	/* yes: string ends before the page boundary */
+
+/*
+ * The following cases are handled by ashr_10 (relative offset = 15 + rax - rcx):
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
+ *        n(6~15)          n - 6		  9(15 +(n - 6) - n)         ashr_10
+ */
+	.p2align 4
+LABEL(ashr_10):
+	pslldq	$6, D(%xmm2)	/* shift first string left 6 bytes to align with second */
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, D(%xmm2)	/* compare 16 bytes for equality */
+	psubb	%xmm0, D(%xmm2)	/* packed sub of comparison results */
+	pmovmskb %xmm2, %r9d	/* %r9d = sign bit of each result byte */
+	shr	%cl, %edx	/* adjust 0xffff for offset */
+	shr	%cl, %r9d	/* adjust for 16-byte offset */
+	sub	%r9d, %edx	/* zero only if both masks agree */
+	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
+	movdqa	(%rdi), %xmm3	/* NOTE(review): %xmm3 is not read again in this case body -- confirm it is needed */
+
+	UPDATE_STRNCMP_COUNTER
+
+	mov	$16, %rcx	/* index for loads */
+	mov	$10, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so that we can detect crossing a page boundary.
+	 * When %r10 goes positive we have crossed a page boundary and
+	 * need to do a nibble.
+	 */
+	lea	10(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	mov	%rcx, %rdx	/* load offset for the loop; pcmpistri overwrites %ecx */
+
+	.p2align 4
+LABEL(loop_ashr_10_use):
+	add	$16, %r10	/* account for the next 16-byte load */
+	jg	LABEL(nibble_ashr_10_use)	/* positive: that load would cross into a new page */
+
+LABEL(nibble_ashr_10_restart_use):
+	movdqa	(%rdi, %rdx), %xmm0
+	palignr $10, -16(%rdi, %rdx), D(%xmm0)	/* bytes 10..15 of previous block + bytes 0..9 of this block */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri $0x1a, (%rsi,%rdx), %xmm0	/* %ecx = index of first differing byte */
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11	/* presumably %r11 = bytes left to compare -- confirm */
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rdx	/* next 16-byte block */
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_10_use)
+
+	movdqa	(%rdi, %rdx), %xmm0	/* second, unrolled copy of the step above */
+	palignr $10, -16(%rdi, %rdx), D(%xmm0)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	add	$16, %rdx	/* next 16-byte block */
+	jmp	LABEL(loop_ashr_10_use)
+
+	.p2align 4
+LABEL(nibble_ashr_10_use):
+	sub	$0x1000, %r10	/* re-bias the tracker for the next page */
+	movdqa	-16(%rdi, %rdx), %xmm0	/* last 16-byte block before the page boundary */
+	psrldq	$10, D(%xmm0)	/* discard the low 10 bytes; zeros fill the top 10 */
+	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of first null byte in %xmm0 */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	%r11, %rcx
+	jae	LABEL(nibble_ashr_exit_use)
+#endif
+	cmp	$5, %ecx	/* is that null within the 6 real bytes (index <= 5)? */
+	ja	LABEL(nibble_ashr_10_restart_use)	/* no: string continues, resume across the page */
+
+	jmp	LABEL(nibble_ashr_exit_use)	/* yes: string ends before the page boundary */
+
+/*
+ * The following cases are handled by ashr_11 (relative offset = 15 + rax - rcx):
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
+ *        n(5~15)          n - 5		  10(15 +(n - 5) - n)         ashr_11
+ */
+	.p2align 4
+LABEL(ashr_11):
+	pslldq	$5, D(%xmm2)	/* shift first string left 5 bytes to align with second */
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, D(%xmm2)	/* compare 16 bytes for equality */
+	psubb	%xmm0, D(%xmm2)	/* packed sub of comparison results */
+	pmovmskb %xmm2, %r9d	/* %r9d = sign bit of each result byte */
+	shr	%cl, %edx	/* adjust 0xffff for offset */
+	shr	%cl, %r9d	/* adjust for 16-byte offset */
+	sub	%r9d, %edx	/* zero only if both masks agree */
+	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
+	movdqa	(%rdi), %xmm3	/* NOTE(review): %xmm3 is not read again in this case body -- confirm it is needed */
+
+	UPDATE_STRNCMP_COUNTER
+
+	mov	$16, %rcx	/* index for loads */
+	mov	$11, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so that we can detect crossing a page boundary.
+	 * When %r10 goes positive we have crossed a page boundary and
+	 * need to do a nibble.
+	 */
+	lea	11(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	mov	%rcx, %rdx	/* load offset for the loop; pcmpistri overwrites %ecx */
+
+	.p2align 4
+LABEL(loop_ashr_11_use):
+	add	$16, %r10	/* account for the next 16-byte load */
+	jg	LABEL(nibble_ashr_11_use)	/* positive: that load would cross into a new page */
+
+LABEL(nibble_ashr_11_restart_use):
+	movdqa	(%rdi, %rdx), %xmm0
+	palignr $11, -16(%rdi, %rdx), D(%xmm0)	/* bytes 11..15 of previous block + bytes 0..10 of this block */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri $0x1a, (%rsi,%rdx), %xmm0	/* %ecx = index of first differing byte */
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11	/* presumably %r11 = bytes left to compare -- confirm */
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rdx	/* next 16-byte block */
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_11_use)
+
+	movdqa	(%rdi, %rdx), %xmm0	/* second, unrolled copy of the step above */
+	palignr $11, -16(%rdi, %rdx), D(%xmm0)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	add	$16, %rdx	/* next 16-byte block */
+	jmp	LABEL(loop_ashr_11_use)
+
+	.p2align 4
+LABEL(nibble_ashr_11_use):
+	sub	$0x1000, %r10	/* re-bias the tracker for the next page */
+	movdqa	-16(%rdi, %rdx), %xmm0	/* last 16-byte block before the page boundary */
+	psrldq	$11, D(%xmm0)	/* discard the low 11 bytes; zeros fill the top 11 */
+	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of first null byte in %xmm0 */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	%r11, %rcx
+	jae	LABEL(nibble_ashr_exit_use)
+#endif
+	cmp	$4, %ecx	/* is that null within the 5 real bytes (index <= 4)? */
+	ja	LABEL(nibble_ashr_11_restart_use)	/* no: string continues, resume across the page */
+
+	jmp	LABEL(nibble_ashr_exit_use)	/* yes: string ends before the page boundary */
+
+/*
+ * The following cases are handled by ashr_12 (relative offset = 15 + rax - rcx):
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
+ *        n(4~15)          n - 4		  11(15 +(n - 4) - n)         ashr_12
+ */
+	.p2align 4
+LABEL(ashr_12):
+	pslldq	$4, D(%xmm2)	/* shift first string left 4 bytes to align with second */
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, D(%xmm2)	/* compare 16 bytes for equality */
+	psubb	%xmm0, D(%xmm2)	/* packed sub of comparison results */
+	pmovmskb %xmm2, %r9d	/* %r9d = sign bit of each result byte */
+	shr	%cl, %edx	/* adjust 0xffff for offset */
+	shr	%cl, %r9d	/* adjust for 16-byte offset */
+	sub	%r9d, %edx	/* zero only if both masks agree */
+	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
+	movdqa	(%rdi), %xmm3	/* NOTE(review): %xmm3 is not read again in this case body -- confirm it is needed */
+
+	UPDATE_STRNCMP_COUNTER
+
+	mov	$16, %rcx	/* index for loads */
+	mov	$12, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so that we can detect crossing a page boundary.
+	 * When %r10 goes positive we have crossed a page boundary and
+	 * need to do a nibble.
+	 */
+	lea	12(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	mov	%rcx, %rdx	/* load offset for the loop; pcmpistri overwrites %ecx */
+
+	.p2align 4
+LABEL(loop_ashr_12_use):
+	add	$16, %r10	/* account for the next 16-byte load */
+	jg	LABEL(nibble_ashr_12_use)	/* positive: that load would cross into a new page */
+
+LABEL(nibble_ashr_12_restart_use):
+	movdqa	(%rdi, %rdx), %xmm0
+	palignr $12, -16(%rdi, %rdx), D(%xmm0)	/* bytes 12..15 of previous block + bytes 0..11 of this block */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri $0x1a, (%rsi,%rdx), %xmm0	/* %ecx = index of first differing byte */
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11	/* presumably %r11 = bytes left to compare -- confirm */
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rdx	/* next 16-byte block */
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_12_use)
+
+	movdqa	(%rdi, %rdx), %xmm0	/* second, unrolled copy of the step above */
+	palignr $12, -16(%rdi, %rdx), D(%xmm0)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	add	$16, %rdx	/* next 16-byte block */
+	jmp	LABEL(loop_ashr_12_use)
+
+	.p2align 4
+LABEL(nibble_ashr_12_use):
+	sub	$0x1000, %r10	/* re-bias the tracker for the next page */
+	movdqa	-16(%rdi, %rdx), %xmm0	/* last 16-byte block before the page boundary */
+	psrldq	$12, D(%xmm0)	/* discard the low 12 bytes; zeros fill the top 12 */
+	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of first null byte in %xmm0 */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	%r11, %rcx
+	jae	LABEL(nibble_ashr_exit_use)
+#endif
+	cmp	$3, %ecx	/* is that null within the 4 real bytes (index <= 3)? */
+	ja	LABEL(nibble_ashr_12_restart_use)	/* no: string continues, resume across the page */
+
+	jmp	LABEL(nibble_ashr_exit_use)	/* yes: string ends before the page boundary */
+
+/*
+ * The following cases are handled by ashr_13 (relative offset = 15 + rax - rcx):
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
+ *        n(3~15)          n - 3		  12(15 +(n - 3) - n)         ashr_13
+ */
+	.p2align 4
+LABEL(ashr_13):
+	pslldq	$3, D(%xmm2)	/* shift first string left 3 bytes to align with second */
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm1, D(%xmm2)	/* compare 16 bytes for equality */
+	psubb	%xmm0, D(%xmm2)	/* packed sub of comparison results */
+	pmovmskb %xmm2, %r9d	/* %r9d = sign bit of each result byte */
+	shr	%cl, %edx	/* adjust 0xffff for offset */
+	shr	%cl, %r9d	/* adjust for 16-byte offset */
+	sub	%r9d, %edx	/* zero only if both masks agree */
+	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
+	movdqa	(%rdi), %xmm3	/* NOTE(review): %xmm3 is not read again in this case body -- confirm it is needed */
+
+	UPDATE_STRNCMP_COUNTER
+
+	mov	$16, %rcx	/* index for loads */
+	mov	$13, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so that we can detect crossing a page boundary.
+	 * When %r10 goes positive we have crossed a page boundary and
+	 * need to do a nibble.
+	 */
+	lea	13(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+
+	mov	%rcx, %rdx	/* load offset for the loop; pcmpistri overwrites %ecx */
+
+	.p2align 4
+LABEL(loop_ashr_13_use):
+	add	$16, %r10	/* account for the next 16-byte load */
+	jg	LABEL(nibble_ashr_13_use)	/* positive: that load would cross into a new page */
+
+LABEL(nibble_ashr_13_restart_use):
+	movdqa	(%rdi, %rdx), %xmm0
+	palignr $13, -16(%rdi, %rdx), D(%xmm0)	/* bytes 13..15 of previous block + bytes 0..12 of this block */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri $0x1a, (%rsi,%rdx), %xmm0	/* %ecx = index of first differing byte */
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11	/* presumably %r11 = bytes left to compare -- confirm */
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rdx	/* next 16-byte block */
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_13_use)
+
+	movdqa	(%rdi, %rdx), %xmm0	/* second, unrolled copy of the step above */
+	palignr $13, -16(%rdi, %rdx), D(%xmm0)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF: mismatch or null char found */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	add	$16, %rdx	/* next 16-byte block */
+	jmp	LABEL(loop_ashr_13_use)
+
+	.p2align 4
+LABEL(nibble_ashr_13_use):
+	sub	$0x1000, %r10	/* re-bias the tracker for the next page */
+	movdqa	-16(%rdi, %rdx), %xmm0	/* last 16-byte block before the page boundary */
+	psrldq	$13, D(%xmm0)	/* discard the low 13 bytes; zeros fill the top 13 */
+	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of first null byte in %xmm0 */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	%r11, %rcx
+	jae	LABEL(nibble_ashr_exit_use)
+#endif
+	cmp	$2, %ecx	/* is that null within the 3 real bytes (index <= 2)? */
+	ja	LABEL(nibble_ashr_13_restart_use)	/* no: string continues, resume across the page */
+
+	jmp	LABEL(nibble_ashr_exit_use)	/* yes: string ends before the page boundary */
+
+/*
+ *  The following cases will be handled by ashr_14
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
+ *        n(2~15)          n - 2		  13(15 +(n - 2) - n)         ashr_14
+ */
+	.p2align 4
+LABEL(ashr_14):	/* set-up for the ashr_14 case described in the table above */
+	pslldq  $2, D(%xmm2)	/* shift %xmm2 left by 2 bytes; presumably to line it up with %xmm1 */
+	TOLOWER (%xmm1, %xmm2)	/* macro defined outside this excerpt */
+	pcmpeqb	%xmm1, D(%xmm2)	/* per-byte equality of the two blocks */
+	psubb	%xmm0, D(%xmm2)	/* NOTE(review): %xmm0 is set before this entry, presumably the NUL-byte mask -- confirm */
+	pmovmskb %xmm2, %r9d	/* top bit of every byte -> 16-bit mask */
+	shr	%cl, %edx	/* drop the low %cl bits from both masks */
+	shr	%cl, %r9d
+	sub	%r9d, %edx	/* nonzero when the two masks disagree */
+	jnz	LABEL(less32bytes)	/* masks differ: finish in less32bytes */
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER	/* defined by the includer; strcmp.S leaves it empty for the variants without a length limit */
+
+	mov	$16, %rcx	/* index for loads */
+	mov	$14, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so that crossing a page boundary can be detected.
+	 * When %r10 goes positive we have crossed a page boundary and
+	 * need to do a nibble.
+	 */
+	lea	14(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+
+	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop */
+
+	.p2align 4
+LABEL(loop_ashr_14_use):	/* compare loop for byte position 14, unrolled twice; %rdx is the running offset */
+	add	$16, %r10	/* advance the page-boundary tracker by one 16-byte block */
+	jg	LABEL(nibble_ashr_14_use)	/* tracker went positive: do the page-crossing check first */
+
+LABEL(nibble_ashr_14_restart_use):	/* re-entry point used by the page-crossing check */
+	movdqa	(%rdi, %rdx), %xmm0
+	palignr $14, -16(%rdi, %rdx), D(%xmm0)	/* 16 bytes starting 14 bytes into the previous %rdi block */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri $0x1a, (%rsi,%rdx), %xmm0	/* imm 0x1a: equal-each, negative polarity; %ecx = index of first difference */
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)	/* macro defined outside this excerpt; presumably case-folds both blocks */
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF set by pcmpistri: a difference, or a NUL in the %rsi-side block */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11	/* length-limited variants: 16 more bytes consumed */
+	jbe	LABEL(strcmp_exitz)	/* limit used up without a difference: return 0 */
+#endif
+
+	add	$16, %rdx	/* second, identical half of the unrolled loop */
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_14_use)
+
+	movdqa	(%rdi, %rdx), %xmm0
+	palignr $14, -16(%rdi, %rdx), D(%xmm0)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	add	$16, %rdx
+	jmp	LABEL(loop_ashr_14_use)
+
+	.p2align 4
+LABEL(nibble_ashr_14_use):	/* page-crossing check, run before the next 16-byte %rdi block is loaded */
+	sub	$0x1000, %r10	/* re-arm the tracker for the next 4K page */
+	movdqa	-16(%rdi, %rdx), %xmm0	/* reload the previous %rdi block */
+	psrldq	$14, D(%xmm0)	/* keep only its last 2 bytes, zero-filled above them */
+	pcmpistri      $0x3a,%xmm0, %xmm0	/* imm 0x3a (equal-each, masked negative) against itself: %ecx = index of first NUL */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	%r11, %rcx
+	jae	LABEL(nibble_ashr_exit_use)	/* that index is not below the remaining count in %r11 */
+#endif
+	cmp	$1, %ecx
+	ja	LABEL(nibble_ashr_14_restart_use)	/* %ecx > 1: both leftover bytes are non-NUL, resume the loop */
+
+	jmp	LABEL(nibble_ashr_exit_use)	/* otherwise finish without loading the next block */
+
+/*
+ *  The following cases will be handled by ashr_15
+ *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
+ *        n(1~15)          n - 1		  14(15 +(n - 1) - n)         ashr_15
+ */
+	.p2align 4
+LABEL(ashr_15):	/* set-up for the ashr_15 case described in the table above */
+	pslldq	$1, D(%xmm2)	/* shift %xmm2 left by 1 byte; presumably to line it up with %xmm1 */
+	TOLOWER (%xmm1, %xmm2)	/* macro defined outside this excerpt */
+	pcmpeqb	%xmm1, D(%xmm2)	/* per-byte equality of the two blocks */
+	psubb	%xmm0, D(%xmm2)	/* NOTE(review): %xmm0 is set before this entry, presumably the NUL-byte mask -- confirm */
+	pmovmskb %xmm2, %r9d	/* top bit of every byte -> 16-bit mask */
+	shr	%cl, %edx	/* drop the low %cl bits from both masks */
+	shr	%cl, %r9d
+	sub	%r9d, %edx	/* nonzero when the two masks disagree */
+	jnz	LABEL(less32bytes)	/* masks differ: finish in less32bytes */
+
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER	/* defined by the includer; strcmp.S leaves it empty for the variants without a length limit */
+
+	mov	$16, %rcx	/* index for loads */
+	mov	$15, %r9d	/* byte position left over from less32bytes case */
+	/*
+	 * Set up %r10 so that crossing a page boundary can be detected.
+	 * When %r10 goes positive we have crossed a page boundary and
+	 * need to do a nibble.
+	 */
+	lea	15(%rdi), %r10
+	and	$0xfff, %r10	/* offset into 4K page */
+
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+
+	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop */
+
+	.p2align 4
+LABEL(loop_ashr_15_use):	/* compare loop for byte position 15, unrolled twice; %rdx is the running offset */
+	add	$16, %r10	/* advance the page-boundary tracker by one 16-byte block */
+	jg	LABEL(nibble_ashr_15_use)	/* tracker went positive: do the page-crossing check first */
+
+LABEL(nibble_ashr_15_restart_use):	/* re-entry point used by the page-crossing check */
+	movdqa	(%rdi, %rdx), %xmm0
+	palignr $15, -16(%rdi, %rdx), D(%xmm0)	/* 16 bytes starting 15 bytes into the previous %rdi block */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri $0x1a, (%rsi,%rdx), %xmm0	/* imm 0x1a: equal-each, negative polarity; %ecx = index of first difference */
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)	/* macro defined outside this excerpt; presumably case-folds both blocks */
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)	/* CF or ZF set by pcmpistri: a difference, or a NUL in the %rsi-side block */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11	/* length-limited variants: 16 more bytes consumed */
+	jbe	LABEL(strcmp_exitz)	/* limit used up without a difference: return 0 */
+#endif
+
+	add	$16, %rdx	/* second, identical half of the unrolled loop */
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_15_use)
+
+	movdqa	(%rdi, %rdx), %xmm0
+	palignr $15, -16(%rdi, %rdx), D(%xmm0)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	jbe	LABEL(exit_use)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	add	$16, %rdx
+	jmp	LABEL(loop_ashr_15_use)
+
+	.p2align 4
+LABEL(nibble_ashr_15_use):	/* page-crossing check, run before the next 16-byte %rdi block is loaded */
+	sub	$0x1000, %r10	/* re-arm the tracker for the next 4K page */
+	movdqa	-16(%rdi, %rdx), %xmm0	/* reload the previous %rdi block */
+	psrldq	$15, D(%xmm0)	/* keep only its last byte, zero-filled above it */
+	pcmpistri      $0x3a,%xmm0, %xmm0	/* imm 0x3a (equal-each, masked negative) against itself: %ecx = index of first NUL */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	%r11, %rcx
+	jae	LABEL(nibble_ashr_exit_use)	/* that index is not below the remaining count in %r11 */
+#endif
+	cmp	$0, %ecx
+	ja	LABEL(nibble_ashr_15_restart_use)	/* %ecx > 0: the leftover byte is non-NUL, resume the loop */
+
+LABEL(nibble_ashr_exit_use):	/* shared by all nibble_ashr_N_use blocks visible here; ashr_15 falls through into it */
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+	pcmpistri      $0x1a,(%rsi,%rdx), %xmm0	/* compare the leftover bytes in %xmm0 with the %rsi block */
+#else
+	movdqa	(%rsi,%rdx), %xmm1
+	TOLOWER (%xmm0, %xmm1)	/* macro defined outside this excerpt */
+	pcmpistri $0x1a, %xmm1, %xmm0
+#endif
+	.p2align 4
+LABEL(exit_use):	/* common loop exit; consumes the flags and %ecx left by the last pcmpistri */
+	jnc	LABEL(strcmp_exitz)	/* CF clear: no differing byte reported, return 0 */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	%rcx, %r11
+	jbe	LABEL(strcmp_exitz)	/* the difference lies at or beyond the remaining count: return 0 */
+#endif
+	add	%rcx, %rdx	/* block offset plus index of the differing byte */
+	lea	-16(%rdi, %r9), %rdi	/* rebase %rdi by the per-case byte position held in %r9 */
+	movzbl	(%rdi, %rdx), %eax
+	movzbl	(%rsi, %rdx), %edx
+	test	%r8d, %r8d
+	jz	LABEL(ret_use)
+	xchg	%eax, %edx	/* %r8d nonzero: swap the bytes; presumably the same order flag as in less32bytes */
+LABEL(ret_use):
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
+	movl	(%rcx,%rdx,4), %edx	/* map both bytes through the table (4-byte entries) before subtracting */
+	movl	(%rcx,%rax,4), %eax
+#endif
+
+	sub	%edx, %eax	/* return value: difference of the two bytes */
+	ret
+
+LABEL(less32bytes):	/* exit taken by the ashr_N set-up code when its mask check is nonzero */
+	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
+	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
+	test	%r8d, %r8d
+	jz	LABEL(ret)
+	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */
+
+	.p2align 4
+LABEL(ret):
+LABEL(less16bytes):	/* expects a nonzero bit mask in %rdx */
+	bsf	%rdx, %rdx		/* find and store bit index in %rdx */
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	%rdx, %r11
+	jbe	LABEL(strcmp_exitz)	/* that index is at or beyond the remaining count: return 0 */
+#endif
+	movzbl	(%rsi, %rdx), %ecx	/* fetch the byte at that index from each string */
+	movzbl	(%rdi, %rdx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
+	movl	(%rdx,%rcx,4), %ecx	/* map both bytes through the table (4-byte entries) before subtracting */
+	movl	(%rdx,%rax,4), %eax
+#endif
+
+	sub	%ecx, %eax	/* return value: %rdi-side byte minus %rsi-side byte */
+	ret
+
+LABEL(strcmp_exitz):	/* shared exit that returns 0 */
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+	// XXX Same as code above
+LABEL(Byte0):	/* return the difference of the bytes at (%rdi) and (%rsi) */
+	movzx	(%rsi), %ecx
+	movzx	(%rdi), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
+	movl	(%rdx,%rcx,4), %ecx	/* map both bytes through the table (4-byte entries) before subtracting */
+	movl	(%rdx,%rax,4), %eax
+#endif
+
+	sub	%ecx, %eax
+	ret
+	cfi_endproc	/* end of the STRCMP_SSE42 function body */
+	.size	STRCMP_SSE42, .-STRCMP_SSE42
+
+#undef UCLOW_reg
+#undef UCHIGH_reg
+#undef LCQWORD_reg
+#undef TOLOWER
+
+	/* Put all SSE 4.2 functions together.  */
+	.section .rodata.SECTION,"a",@progbits
+	.p2align 3
+LABEL(unaligned_table):	/* 16 32-bit entries, each the offset of an ashr_N label from this table */
+	.int	LABEL(ashr_1) - LABEL(unaligned_table)	/* entry 0 is ashr_1 ... */
+	.int	LABEL(ashr_2) - LABEL(unaligned_table)
+	.int	LABEL(ashr_3) - LABEL(unaligned_table)
+	.int	LABEL(ashr_4) - LABEL(unaligned_table)
+	.int	LABEL(ashr_5) - LABEL(unaligned_table)
+	.int	LABEL(ashr_6) - LABEL(unaligned_table)
+	.int	LABEL(ashr_7) - LABEL(unaligned_table)
+	.int	LABEL(ashr_8) - LABEL(unaligned_table)
+	.int	LABEL(ashr_9) - LABEL(unaligned_table)
+	.int	LABEL(ashr_10) - LABEL(unaligned_table)
+	.int	LABEL(ashr_11) - LABEL(unaligned_table)
+	.int	LABEL(ashr_12) - LABEL(unaligned_table)
+	.int	LABEL(ashr_13) - LABEL(unaligned_table)
+	.int	LABEL(ashr_14) - LABEL(unaligned_table)
+	.int	LABEL(ashr_15) - LABEL(unaligned_table)
+	.int	LABEL(ashr_0) - LABEL(unaligned_table)	/* ... and entry 15 is ashr_0 */
+
+#undef LABEL
+#undef GLABEL
+#undef SECTION
+#undef movdqa
+#undef movdqu
+#undef pmovmskb
+#undef pcmpistri
+#undef psubb
+#undef pcmpeqb
+#undef psrldq
+#undef pslldq
+#undef palignr
+#undef pxor
+#undef D
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-ssse3.S
new file mode 100644
index 0000000000..1b7fa33c91
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-ssse3.S
@@ -0,0 +1,5 @@
+#if IS_IN (libc)
+# define USE_SSSE3 1	/* presumably selects the SSSE3 code paths in ../strcmp.S -- confirm there */
+# define STRCMP __strcmp_ssse3	/* presumably the entry-point name used by ../strcmp.S */
+# include "../strcmp.S"	/* the implementation one directory up, built only inside libc */
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcmp.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp.S
new file mode 100644
index 0000000000..54f8f7dd44
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp.S
@@ -0,0 +1,209 @@
+/* Multiple versions of strcmp
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifdef USE_AS_STRNCMP	/* configuration for the strncmp build */
+/* New counter = %r11 + %rcx - 16 (computed in %r9).  As %r11 is unsigned,
+   branch to strcmp_exitz if the new counter > the old one or is 0.  */
+# define UPDATE_STRNCMP_COUNTER				\
+	/* calculate left number to compare */		\
+	lea	-16(%rcx, %r11), %r9;			\
+	cmp	%r9, %r11;				\
+	jb	LABEL(strcmp_exitz);			\
+	test	%r9, %r9;				\
+	je	LABEL(strcmp_exitz);			\
+	mov	%r9, %r11
+
+# define STRCMP_SSE42	__strncmp_sse42
+# define STRCMP_SSSE3	__strncmp_ssse3
+# define STRCMP_SSE2	__strncmp_sse2
+# define __GI_STRCMP	__GI_strncmp
+#elif defined USE_AS_STRCASECMP_L	/* configuration for the strcasecmp_l build */
+# include "locale-defines.h"
+
+# define UPDATE_STRNCMP_COUNTER	/* empty: this variant has no length counter */
+
+# define STRCMP_AVX	__strcasecmp_l_avx
+# define STRCMP_SSE42	__strcasecmp_l_sse42
+# define STRCMP_SSSE3	__strcasecmp_l_ssse3
+# define STRCMP_SSE2	__strcasecmp_l_sse2
+# define __GI_STRCMP	__GI___strcasecmp_l
+#elif defined USE_AS_STRNCASECMP_L	/* configuration for the strncasecmp_l build */
+# include "locale-defines.h"
+
+/* New counter = %r11 + %rcx - 16 (computed in %r9).  As %r11 is unsigned,
+   branch to strcmp_exitz if the new counter > the old one or is 0.  */
+# define UPDATE_STRNCMP_COUNTER				\
+	/* calculate left number to compare */		\
+	lea	-16(%rcx, %r11), %r9;			\
+	cmp	%r9, %r11;				\
+	jb	LABEL(strcmp_exitz);			\
+	test	%r9, %r9;				\
+	je	LABEL(strcmp_exitz);			\
+	mov	%r9, %r11
+
+# define STRCMP_AVX	__strncasecmp_l_avx
+# define STRCMP_SSE42	__strncasecmp_l_sse42
+# define STRCMP_SSSE3	__strncasecmp_l_ssse3
+# define STRCMP_SSE2	__strncasecmp_l_sse2
+# define __GI_STRCMP	__GI___strncasecmp_l
+#else	/* none of the USE_AS_* selectors above: plain strcmp */
+# define USE_AS_STRCMP
+# define UPDATE_STRNCMP_COUNTER	/* empty: this variant has no length counter */
+# ifndef STRCMP	/* the names below are set only when the includer has not chosen STRCMP itself */
+#  define STRCMP	strcmp
+#  define STRCMP_SSE42	__strcmp_sse42
+#  define STRCMP_SSSE3	__strcmp_ssse3
+#  define STRCMP_SSE2	__strcmp_sse2
+#  define __GI_STRCMP	__GI_strcmp
+# endif
+#endif
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncmp in the static library since we
+   need strncmp before initialization has happened.  */
+#if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc)
+	.text
+ENTRY(STRCMP)	/* IFUNC resolver: picks an implementation and returns its address in %rax */
+	.type	STRCMP, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX	/* macro from outside this file; presumably points %rdx at the CPU-feature data */
+#ifdef USE_AS_STRCMP
+	leaq	__strcmp_sse2_unaligned(%rip), %rax
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)	/* presumably leaves ZF clear when the feature bit is set */
+	jnz     3f	/* plain strcmp: the unaligned SSE2 version is tried first, SSE4.2 is not considered */
+#else
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
+	jnz	2f	/* Slow_SSE4_2 set: skip the SSE4.2 candidate */
+	leaq	STRCMP_SSE42(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_2)
+	jnz	3f
+#endif
+2:	leaq	STRCMP_SSSE3(%rip), %rax
+	HAS_CPU_FEATURE (SSSE3)
+	jnz	3f
+	leaq	STRCMP_SSE2(%rip), %rax	/* fallback when none of the tests above branched */
+3:	ret
+END(STRCMP)
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY(__strcasecmp)	/* IFUNC resolver for strcasecmp; candidates tried in order AVX, SSE4.2, SSSE3, SSE2 */
+	.type	__strcasecmp, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX	/* macro from outside this file; presumably points %rdx at the CPU-feature data */
+	leaq	__strcasecmp_avx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX_Usable)	/* presumably leaves ZF clear when the feature bit is set */
+	jnz	3f
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
+	jnz	2f	/* Slow_SSE4_2 set: skip the SSE4.2 candidate */
+	leaq	__strcasecmp_sse42(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_2)
+	jnz	3f
+2:	leaq	__strcasecmp_ssse3(%rip), %rax
+	HAS_CPU_FEATURE (SSSE3)
+	jnz	3f
+	leaq	__strcasecmp_sse2(%rip), %rax	/* fallback when none of the tests above branched */
+3:	ret
+END(__strcasecmp)
+weak_alias (__strcasecmp, strcasecmp)
+# endif
+# ifdef USE_AS_STRNCASECMP_L
+ENTRY(__strncasecmp)	/* IFUNC resolver for strncasecmp; candidates tried in order AVX, SSE4.2, SSSE3, SSE2 */
+	.type	__strncasecmp, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX	/* macro from outside this file; presumably points %rdx at the CPU-feature data */
+	leaq	__strncasecmp_avx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX_Usable)	/* presumably leaves ZF clear when the feature bit is set */
+	jnz	3f
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
+	jnz	2f	/* Slow_SSE4_2 set: skip the SSE4.2 candidate */
+	leaq	__strncasecmp_sse42(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_2)
+	jnz	3f
+2:	leaq	__strncasecmp_ssse3(%rip), %rax
+	HAS_CPU_FEATURE (SSSE3)
+	jnz	3f
+	leaq	__strncasecmp_sse2(%rip), %rax	/* fallback when none of the tests above branched */
+3:	ret
+END(__strncasecmp)
+weak_alias (__strncasecmp, strncasecmp)
+# endif
+
+# undef LABEL
+# define LABEL(l) .L##l##_sse42	/* local labels of the first copy get an _sse42 suffix */
+# define GLABEL(l) l##_sse42
+# define SECTION sse4.2	/* strcmp-sse42.S uses SECTION in its .rodata.SECTION name */
+# include "strcmp-sse42.S"
+
+
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#  define LABEL(l) .L##l##_avx	/* second copy, case-insensitive variants only; strcmp-sse42.S undefines LABEL at its end */
+#  define GLABEL(l) l##_avx
+#  define USE_AVX 1
+#  undef STRCMP_SSE42
+#  define STRCMP_SSE42 STRCMP_AVX	/* the included body names itself STRCMP_SSE42, so point that at the AVX symbol */
+#  define SECTION avx
+#  include "strcmp-sse42.S"
+# endif
+
+
+# undef ENTRY	/* ENTRY/END are overridden for the ../strcmp.S include at the end of this file */
+# define ENTRY(name) \
+	.type STRCMP_SSE2, @function; \
+	.align 16; \
+	.globl STRCMP_SSE2; \
+	.hidden STRCMP_SSE2; \
+	STRCMP_SSE2: cfi_startproc; \
+	CALL_MCOUNT	/* the name argument is ignored: the symbol is always the hidden STRCMP_SSE2 */
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2
+
+# ifdef USE_AS_STRCASECMP_L
+#  define ENTRY2(name) \
+	.type __strcasecmp_sse2, @function; \
+	.align 16; \
+	.globl __strcasecmp_sse2; \
+	.hidden __strcasecmp_sse2; \
+	__strcasecmp_sse2: cfi_startproc; \
+	CALL_MCOUNT	/* likewise fixed to the hidden __strcasecmp_sse2 */
+#  define END2(name) \
+	cfi_endproc; .size __strcasecmp_sse2, .-__strcasecmp_sse2
+# endif
+
+# ifdef USE_AS_STRNCASECMP_L
+#  define ENTRY2(name) \
+	.type __strncasecmp_sse2, @function; \
+	.align 16; \
+	.globl __strncasecmp_sse2; \
+	.hidden __strncasecmp_sse2; \
+	__strncasecmp_sse2: cfi_startproc; \
+	CALL_MCOUNT	/* likewise fixed to the hidden __strncasecmp_sse2 */
+#  define END2(name) \
+	cfi_endproc; .size __strncasecmp_sse2, .-__strncasecmp_sse2
+# endif
+
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcmp calls through a PLT.
+   The speedup we get from using SSE4.2 instruction is likely eaten away
+   by the indirect call in the PLT.  So alias __GI_STRCMP to STRCMP_SSE2.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2
+#endif
+
+#include "../strcmp.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..6a5ab7ab26
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
@@ -0,0 +1,1889 @@
+/* strcpy with SSE2 and unaligned load
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+#  include <sysdep.h>
+
+#  ifndef STRCPY
+#   define STRCPY  __strcpy_sse2_unaligned
+#  endif
+
+# endif
+
+# define JMPTBL(I, B)	I - B	/* jump-table entry: offset of label I from base B */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)             \
+	lea	TABLE(%rip), %r11;                              \
+	movslq	(%r11, INDEX, SCALE), %rcx;                     \
+	lea	(%r11, %rcx), %rcx;                             \
+	jmp	*%rcx	/* loads the signed 32-bit entry at TABLE + INDEX*SCALE, adds the address of TABLE and jumps there; clobbers %r11 and %rcx */
+
+# ifndef USE_AS_STRCAT
+
+.text
+ENTRY (STRCPY)
+#  ifdef USE_AS_STRNCPY
+	mov	%rdx, %r8
+	test	%r8, %r8
+	jz	L(ExitZero)
+#  endif
+	mov	%rsi, %rcx
+#  ifndef USE_AS_STPCPY
+	mov	%rdi, %rax      /* save result */
+#  endif
+
+# endif
+
+	and	$63, %rcx
+	cmp	$32, %rcx
+	jbe	L(SourceStringAlignmentLess32)
+
+	and	$-16, %rsi
+	and	$15, %rcx
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	(%rsi), %xmm1
+	pmovmskb %xmm1, %rdx
+	shr	%cl, %rdx
+
+# ifdef USE_AS_STRNCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	mov	$16, %r10
+	sub	%rcx, %r10
+	cmp	%r10, %r8
+#  else
+	mov	$17, %r10
+	sub	%rcx, %r10
+	cmp	%r10, %r8
+#  endif
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesTail)
+
+	pcmpeqb	16(%rsi), %xmm0
+	pmovmskb %xmm0, %rdx
+
+# ifdef USE_AS_STRNCPY
+	add	$16, %r10
+	cmp	%r10, %r8
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To32Bytes)
+
+	movdqu	(%rsi, %rcx), %xmm1   /* copy 16 bytes */
+	movdqu	%xmm1, (%rdi)
+
+/* If source address alignment != destination address alignment */
+	.p2align 4
+L(Unalign16Both):
+	sub	%rcx, %rdi
+# ifdef USE_AS_STRNCPY
+	add	%rcx, %r8
+	sbb	%rcx, %rcx
+	or	%rcx, %r8
+# endif
+	mov	$16, %rcx
+	movdqa	(%rsi, %rcx), %xmm1
+	movaps	16(%rsi, %rcx), %xmm2
+	movdqu	%xmm1, (%rdi, %rcx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$16, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$48, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+# else
+	jnz	L(CopyFrom1To16Bytes)
+# endif
+
+	movaps	16(%rsi, %rcx), %xmm3
+	movdqu	%xmm2, (%rdi, %rcx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$16, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+# else
+	jnz	L(CopyFrom1To16Bytes)
+# endif
+
+	movaps	16(%rsi, %rcx), %xmm4
+	movdqu	%xmm3, (%rdi, %rcx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$16, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+# else
+	jnz	L(CopyFrom1To16Bytes)
+# endif
+
+	movaps	16(%rsi, %rcx), %xmm1
+	movdqu	%xmm4, (%rdi, %rcx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$16, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyFrom1To16BytesUnalignedXmm1)
+# else
+	jnz	L(CopyFrom1To16Bytes)
+# endif
+
+	movaps	16(%rsi, %rcx), %xmm2
+	movdqu	%xmm1, (%rdi, %rcx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$16, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+# else
+	jnz	L(CopyFrom1To16Bytes)
+# endif
+
+	movaps	16(%rsi, %rcx), %xmm3
+	movdqu	%xmm2, (%rdi, %rcx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$16, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+# else
+	jnz	L(CopyFrom1To16Bytes)
+# endif
+
+	movdqu	%xmm3, (%rdi, %rcx)
+	mov	%rsi, %rdx
+	lea	16(%rsi, %rcx), %rsi
+	and	$-0x40, %rsi
+	sub	%rsi, %rdx
+	sub	%rdx, %rdi
+# ifdef USE_AS_STRNCPY
+	lea	128(%r8, %rdx), %r8
+# endif
+L(Unaligned64Loop):
+	movaps	(%rsi), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%rsi), %xmm5
+	movaps	32(%rsi), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%rsi), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %rdx
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+	jnz	L(Unaligned64Leave)
+
+L(Unaligned64Loop_start):
+	add	$64, %rdi
+	add	$64, %rsi
+	movdqu	%xmm4, -64(%rdi)
+	movaps	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%rdi)
+	movaps	16(%rsi), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%rsi), %xmm3
+	movdqu	%xmm6, -32(%rdi)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%rdi)
+	movaps	48(%rsi), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %rdx
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+	jz	L(Unaligned64Loop_start)
+
+L(Unaligned64Leave):
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	%xmm4, %xmm0
+	pcmpeqb	%xmm5, %xmm1
+	pmovmskb %xmm0, %rdx
+	pmovmskb %xmm1, %rcx
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesUnaligned_0)
+	test	%rcx, %rcx
+	jnz	L(CopyFrom1To16BytesUnaligned_16)
+
+	pcmpeqb	%xmm6, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pmovmskb %xmm0, %rdx
+	pmovmskb %xmm1, %rcx
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesUnaligned_32)
+
+	bsf	%rcx, %rdx
+	movdqu	%xmm4, (%rdi)
+	movdqu	%xmm5, 16(%rdi)
+	movdqu	%xmm6, 32(%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+	lea	48(%rdi, %rdx), %rax
+# endif
+	movdqu	%xmm7, 48(%rdi)
+	add	$15, %r8
+	sub	%rdx, %r8
+	lea	49(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	add	$48, %rsi
+	add	$48, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentLess32):
+	pxor	%xmm0, %xmm0
+	movdqu	(%rsi), %xmm1
+	movdqu	16(%rsi), %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %rdx
+
+# ifdef USE_AS_STRNCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	cmp	$16, %r8
+#  else
+	cmp	$17, %r8
+#  endif
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+# endif
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesTail1)
+
+	pcmpeqb	%xmm2, %xmm0
+	movdqu	%xmm1, (%rdi)
+	pmovmskb %xmm0, %rdx
+
+# ifdef USE_AS_STRNCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	cmp	$32, %r8
+#  else
+	cmp	$33, %r8
+#  endif
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To32Bytes1)
+
+	and	$-16, %rsi
+	and	$15, %rcx
+	jmp	L(Unalign16Both)
+
+/*------End of main part with loops---------------------*/
+
+/* Case1 */
+
+# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%rcx, %rdi
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+	.p2align 4
+L(CopyFrom1To16BytesTail):
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1):
+	add	$16, %rsi
+	add	$16, %rdi
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$16, %r8
+# endif
+L(CopyFrom1To16BytesTail1):
+	bsf	%rdx, %rdx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes):
+	bsf	%rdx, %rdx
+	add	%rcx, %rsi
+	add	$16, %rdx
+	sub	%rcx, %rdx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+	bsf	%rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax
+# endif
+	movdqu	%xmm4, (%rdi)
+	add	$63, %r8
+	sub	%rdx, %r8
+	lea	1(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+	bsf	%rcx, %rdx
+	movdqu	%xmm4, (%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+	lea	16(%rdi, %rdx), %rax
+# endif
+	movdqu	%xmm5, 16(%rdi)
+	add	$47, %r8
+	sub	%rdx, %r8
+	lea	17(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	add	$16, %rsi
+	add	$16, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+	bsf	%rdx, %rdx
+	movdqu	%xmm4, (%rdi)
+	movdqu	%xmm5, 16(%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+	lea	32(%rdi, %rdx), %rax
+# endif
+	movdqu	%xmm6, 32(%rdi)
+	add	$31, %r8
+	sub	%rdx, %r8
+	lea	33(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	add	$32, %rsi
+	add	$32, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+# ifdef USE_AS_STRNCPY
+#  ifndef USE_AS_STRCAT
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):
+	movdqu	%xmm6, (%rdi, %rcx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+	movdqu	%xmm5, (%rdi, %rcx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+	movdqu	%xmm4, (%rdi, %rcx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+	movdqu	%xmm3, (%rdi, %rcx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+	movdqu	%xmm1, (%rdi, %rcx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+#  endif
+
+	.p2align 4
+L(CopyFrom1To16BytesExit):
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+/* Case2 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %r8
+	add	%rcx, %rdi
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	cmp	%r8, %rdx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2):
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	add	$16, %rdx
+	sub	%rcx, %rdx
+	cmp	%r8, %rdx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	cmp	%r8, %rdx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+	bsf	%rdx, %rdx
+	cmp	%r8, %rdx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+/* Case2 or Case3,  Case3 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+	add	$16, %r8
+	add	%rcx, %rdi
+	add	%rcx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To32BytesCase2)
+	add	%rcx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesTailCase2)
+	add	%rcx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+	add	$16, %rdi
+	add	$16, %rsi
+	sub	$16, %r8
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesTail1Case2)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+# endif
+
+/*------------End labels for copying 1-16 bytes--and 1-32 bytes----*/
+
+	.p2align 4
+L(Exit1):
+	mov	%dh, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$1, %r8
+	lea	1(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit2):
+	mov	(%rsi), %dx
+	mov	%dx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	1(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$2, %r8
+	lea	2(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit3):
+	mov	(%rsi), %cx
+	mov	%cx, (%rdi)
+	mov	%dh, 2(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	2(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$3, %r8
+	lea	3(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit4):
+	mov	(%rsi), %edx
+	mov	%edx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	3(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$4, %r8
+	lea	4(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit5):
+	mov	(%rsi), %ecx
+	mov	%dh, 4(%rdi)
+	mov	%ecx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	4(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$5, %r8
+	lea	5(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit6):
+	mov	(%rsi), %ecx
+	mov	4(%rsi), %dx
+	mov	%ecx, (%rdi)
+	mov	%dx, 4(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	5(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$6, %r8
+	lea	6(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit7):
+	mov	(%rsi), %ecx
+	mov	3(%rsi), %edx
+	mov	%ecx, (%rdi)
+	mov	%edx, 3(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	6(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$7, %r8
+	lea	7(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit8):
+	mov	(%rsi), %rdx
+	mov	%rdx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	7(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$8, %r8
+	lea	8(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit9):
+	mov	(%rsi), %rcx
+	mov	%dh, 8(%rdi)
+	mov	%rcx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	8(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$9, %r8
+	lea	9(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit10):
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %dx
+	mov	%rcx, (%rdi)
+	mov	%dx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	9(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$10, %r8
+	lea	10(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit11):
+	mov	(%rsi), %rcx
+	mov	7(%rsi), %edx
+	mov	%rcx, (%rdi)
+	mov	%edx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	10(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$11, %r8
+	lea	11(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit12):
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %edx
+	mov	%rcx, (%rdi)
+	mov	%edx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	11(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$12, %r8
+	lea	12(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit13):
+	mov	(%rsi), %rcx
+	mov	5(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 5(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	12(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$13, %r8
+	lea	13(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit14):
+	mov	(%rsi), %rcx
+	mov	6(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 6(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	13(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$14, %r8
+	lea	14(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit15):
+	mov	(%rsi), %rcx
+	mov	7(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	14(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$15, %r8
+	lea	15(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit16):
+	movdqu	(%rsi), %xmm0
+	movdqu	%xmm0, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	15(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$16, %r8
+	lea	16(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit17):
+	movdqu	(%rsi), %xmm0
+	movdqu	%xmm0, (%rdi)
+	mov	%dh, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	16(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$17, %r8
+	lea	17(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit18):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %cx
+	movdqu	%xmm0, (%rdi)
+	mov	%cx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	17(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$18, %r8
+	lea	18(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit19):
+	movdqu	(%rsi), %xmm0
+	mov	15(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	18(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$19, %r8
+	lea	19(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit20):	/* Copy 20 bytes from (%rsi) to (%rdi): 16-byte SSE move plus a 4-byte move at offset 16.  */
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	19(%rdi), %rax	/* stpcpy result: address of the last byte written.  */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$20, %r8
+	lea	20(%rdi), %rdi	/* lea leaves the flags from sub untouched.  */
+	jnz	L(StrncpyFillTailWithZero)	/* r8 != 0 after the sub: zero-fill the tail.  */
+# endif
+	ret
+
+	.p2align 4
+L(Exit21):	/* Write 21 bytes to (%rdi): 20 copied from (%rsi) plus one byte taken from %dh.  */
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 16(%rdi)
+	mov	%dh, 20(%rdi)	/* Byte 20 is not loaded from the source; presumably %dh == 0 (the NUL) here -- TODO confirm against the dispatching code.  */
+# ifdef USE_AS_STPCPY
+	lea	20(%rdi), %rax	/* stpcpy result: address of the last byte written.  */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$21, %r8
+	lea	21(%rdi), %rdi	/* lea leaves the flags from sub untouched.  */
+	jnz	L(StrncpyFillTailWithZero)	/* r8 != 0 after the sub: zero-fill the tail.  */
+# endif
+	ret
+
+	.p2align 4
+L(Exit22):	/* Copy 22 bytes from (%rsi) to (%rdi): 16-byte SSE move plus an overlapping 8-byte move at offset 14.  */
+	movdqu	(%rsi), %xmm0
+	mov	14(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 14(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	21(%rdi), %rax	/* stpcpy result: address of the last byte written.  */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$22, %r8
+	lea	22(%rdi), %rdi	/* lea leaves the flags from sub untouched.  */
+	jnz	L(StrncpyFillTailWithZero)	/* r8 != 0 after the sub: zero-fill the tail.  */
+# endif
+	ret
+
+	.p2align 4
+L(Exit23):	/* Copy 23 bytes from (%rsi) to (%rdi): 16-byte SSE move plus an overlapping 8-byte move at offset 15.  */
+	movdqu	(%rsi), %xmm0
+	mov	15(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	22(%rdi), %rax	/* stpcpy result: address of the last byte written.  */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$23, %r8
+	lea	23(%rdi), %rdi	/* lea leaves the flags from sub untouched.  */
+	jnz	L(StrncpyFillTailWithZero)	/* r8 != 0 after the sub: zero-fill the tail.  */
+# endif
+	ret
+
+	.p2align 4
+L(Exit24):	/* Copy 24 bytes from (%rsi) to (%rdi): 16-byte SSE move plus an 8-byte move at offset 16.  */
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	23(%rdi), %rax	/* stpcpy result: address of the last byte written.  */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$24, %r8
+	lea	24(%rdi), %rdi	/* lea leaves the flags from sub untouched.  */
+	jnz	L(StrncpyFillTailWithZero)	/* r8 != 0 after the sub: zero-fill the tail.  */
+# endif
+	ret
+
+	.p2align 4
+L(Exit25):	/* Write 25 bytes to (%rdi): 24 copied from (%rsi) plus one byte taken from %dh.  */
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 16(%rdi)
+	mov	%dh, 24(%rdi)	/* Byte 24 is not loaded from the source; presumably %dh == 0 (the NUL) here -- TODO confirm against the dispatching code.  */
+# ifdef USE_AS_STPCPY
+	lea	24(%rdi), %rax	/* stpcpy result: address of the last byte written.  */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$25, %r8
+	lea	25(%rdi), %rdi	/* lea leaves the flags from sub untouched.  */
+	jnz	L(StrncpyFillTailWithZero)	/* r8 != 0 after the sub: zero-fill the tail.  */
+# endif
+	ret
+
+	.p2align 4
+L(Exit26):	/* Copy 26 bytes from (%rsi) to (%rdi): 16-byte SSE move, 8 bytes at offset 16, 2 bytes at offset 24.  */
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %cx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%cx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	25(%rdi), %rax	/* stpcpy result: address of the last byte written.  */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$26, %r8
+	lea	26(%rdi), %rdi	/* lea leaves the flags from sub untouched.  */
+	jnz	L(StrncpyFillTailWithZero)	/* r8 != 0 after the sub: zero-fill the tail.  */
+# endif
+	ret
+
+	.p2align 4
+L(Exit27):	/* Copy 27 bytes from (%rsi) to (%rdi): 16-byte SSE move, 8 bytes at offset 16, overlapping 4 bytes at offset 23.  */
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	23(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%ecx, 23(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	26(%rdi), %rax	/* stpcpy result: address of the last byte written.  */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$27, %r8
+	lea	27(%rdi), %rdi	/* lea leaves the flags from sub untouched.  */
+	jnz	L(StrncpyFillTailWithZero)	/* r8 != 0 after the sub: zero-fill the tail.  */
+# endif
+	ret
+
+	.p2align 4
+L(Exit28):	/* Copy 28 bytes from (%rsi) to (%rdi): 16-byte SSE move, 8 bytes at offset 16, 4 bytes at offset 24.  */
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%ecx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	27(%rdi), %rax	/* stpcpy result: address of the last byte written.  */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$28, %r8
+	lea	28(%rdi), %rdi	/* lea leaves the flags from sub untouched.  */
+	jnz	L(StrncpyFillTailWithZero)	/* r8 != 0 after the sub: zero-fill the tail.  */
+# endif
+	ret
+
+	.p2align 4
+L(Exit29):	/* Copy 29 bytes from (%rsi) to (%rdi) with two overlapping 16-byte SSE moves (offsets 0 and 13).  */
+	movdqu	(%rsi), %xmm0
+	movdqu	13(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 13(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	28(%rdi), %rax	/* stpcpy result: address of the last byte written.  */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$29, %r8
+	lea	29(%rdi), %rdi	/* lea leaves the flags from sub untouched.  */
+	jnz	L(StrncpyFillTailWithZero)	/* r8 != 0 after the sub: zero-fill the tail.  */
+# endif
+	ret
+
+	.p2align 4
+L(Exit30):	/* Copy 30 bytes from (%rsi) to (%rdi) with two overlapping 16-byte SSE moves (offsets 0 and 14).  */
+	movdqu	(%rsi), %xmm0
+	movdqu	14(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 14(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	29(%rdi), %rax	/* stpcpy result: address of the last byte written.  */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$30, %r8
+	lea	30(%rdi), %rdi	/* lea leaves the flags from sub untouched.  */
+	jnz	L(StrncpyFillTailWithZero)	/* r8 != 0 after the sub: zero-fill the tail.  */
+# endif
+	ret
+
+	.p2align 4
+L(Exit31):	/* Copy 31 bytes from (%rsi) to (%rdi) with two overlapping 16-byte SSE moves (offsets 0 and 15).  */
+	movdqu	(%rsi), %xmm0
+	movdqu	15(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 15(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	30(%rdi), %rax	/* stpcpy result: address of the last byte written.  */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$31, %r8
+	lea	31(%rdi), %rdi	/* lea leaves the flags from sub untouched.  */
+	jnz	L(StrncpyFillTailWithZero)	/* r8 != 0 after the sub: zero-fill the tail.  */
+# endif
+	ret
+
+	.p2align 4
+L(Exit32):	/* Copy 32 bytes from (%rsi) to (%rdi) with two 16-byte SSE moves.  */
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	31(%rdi), %rax	/* stpcpy result: address of the last byte written.  */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$32, %r8
+	lea	32(%rdi), %rdi	/* lea leaves the flags from sub untouched.  */
+	jnz	L(StrncpyFillTailWithZero)	/* r8 != 0 after the sub: zero-fill the tail.  */
+# endif
+	ret
+
+# ifdef USE_AS_STRNCPY
+
+	.p2align 4
+L(StrncpyExit0):	/* Length-limited exit that copies 0 bytes.  */
+#  ifdef USE_AS_STPCPY
+	mov	%rdi, %rax	/* stpcpy result: the unchanged destination pointer.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, (%rdi)	/* strcat variant stores a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit1):	/* Length-limited exit: copy exactly 1 byte from (%rsi) to (%rdi).  */
+	mov	(%rsi), %dl
+	mov	%dl, (%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	1(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 1(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit2):	/* Length-limited exit: copy exactly 2 bytes from (%rsi) to (%rdi).  */
+	mov	(%rsi), %dx
+	mov	%dx, (%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	2(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 2(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit3):	/* Length-limited exit: copy exactly 3 bytes (2 + 1) from (%rsi) to (%rdi).  */
+	mov	(%rsi), %cx
+	mov	2(%rsi), %dl
+	mov	%cx, (%rdi)
+	mov	%dl, 2(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	3(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %cx was already stored.  */
+	movb	%ch, 3(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit4):	/* Length-limited exit: copy exactly 4 bytes from (%rsi) to (%rdi).  */
+	mov	(%rsi), %edx
+	mov	%edx, (%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	4(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 4(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit5):	/* Length-limited exit: copy exactly 5 bytes (4 + 1) from (%rsi) to (%rdi).  */
+	mov	(%rsi), %ecx
+	mov	4(%rsi), %dl
+	mov	%ecx, (%rdi)
+	mov	%dl, 4(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	5(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %ecx was already stored.  */
+	movb	%ch, 5(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit6):	/* Length-limited exit: copy exactly 6 bytes (4 + 2) from (%rsi) to (%rdi).  */
+	mov	(%rsi), %ecx
+	mov	4(%rsi), %dx
+	mov	%ecx, (%rdi)
+	mov	%dx, 4(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	6(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %ecx was already stored.  */
+	movb	%ch, 6(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit7):	/* Length-limited exit: copy exactly 7 bytes with two overlapping 4-byte moves (offsets 0 and 3).  */
+	mov	(%rsi), %ecx
+	mov	3(%rsi), %edx
+	mov	%ecx, (%rdi)
+	mov	%edx, 3(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	7(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %ecx was already stored.  */
+	movb	%ch, 7(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit8):	/* Length-limited exit: copy exactly 8 bytes from (%rsi) to (%rdi).  */
+	mov	(%rsi), %rdx
+	mov	%rdx, (%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	8(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 8(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit9):	/* Length-limited exit: copy exactly 9 bytes (8 + 1) from (%rsi) to (%rdi).  */
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %dl
+	mov	%rcx, (%rdi)
+	mov	%dl, 8(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	9(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %rcx was already stored.  */
+	movb	%ch, 9(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit10):	/* Length-limited exit: copy exactly 10 bytes (8 + 2) from (%rsi) to (%rdi).  */
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %dx
+	mov	%rcx, (%rdi)
+	mov	%dx, 8(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	10(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %rcx was already stored.  */
+	movb	%ch, 10(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit11):	/* Length-limited exit: copy exactly 11 bytes (8 bytes, then overlapping 4 bytes at offset 7).  */
+	mov	(%rsi), %rcx
+	mov	7(%rsi), %edx
+	mov	%rcx, (%rdi)
+	mov	%edx, 7(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	11(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %rcx was already stored.  */
+	movb	%ch, 11(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit12):	/* Length-limited exit: copy exactly 12 bytes (8 + 4) from (%rsi) to (%rdi).  */
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %edx
+	mov	%rcx, (%rdi)
+	mov	%edx, 8(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	12(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %rcx was already stored.  */
+	movb	%ch, 12(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit13):	/* Length-limited exit: copy exactly 13 bytes with two overlapping 8-byte moves (offsets 0 and 5).  */
+	mov	(%rsi), %rcx
+	mov	5(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 5(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	13(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %rcx was already stored.  */
+	movb	%ch, 13(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit14):	/* Length-limited exit: copy exactly 14 bytes with two overlapping 8-byte moves (offsets 0 and 6).  */
+	mov	(%rsi), %rcx
+	mov	6(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 6(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	14(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %rcx was already stored.  */
+	movb	%ch, 14(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit15):	/* Length-limited exit: copy exactly 15 bytes with two overlapping 8-byte moves (offsets 0 and 7).  */
+	mov	(%rsi), %rcx
+	mov	7(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 7(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	15(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %rcx was already stored.  */
+	movb	%ch, 15(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit16):	/* Length-limited exit: copy exactly 16 bytes with one unaligned SSE move.  */
+	movdqu	(%rsi), %xmm0
+	movdqu	%xmm0, (%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	16(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 16(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit17):	/* Length-limited exit: copy exactly 17 bytes (16-byte SSE move + 1 byte).  */
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %cl
+	movdqu	%xmm0, (%rdi)
+	mov	%cl, 16(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	17(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 17(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit18):	/* Length-limited exit: copy exactly 18 bytes (16-byte SSE move + 2 bytes).  */
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %cx
+	movdqu	%xmm0, (%rdi)
+	mov	%cx, 16(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	18(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %cx was already stored.  */
+	movb	%ch, 18(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit19):	/* Length-limited exit: copy exactly 19 bytes (16-byte SSE move + overlapping 4 bytes at offset 15).  */
+	movdqu	(%rsi), %xmm0
+	mov	15(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 15(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	19(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %ecx was already stored.  */
+	movb	%ch, 19(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit20):	/* Length-limited exit: copy exactly 20 bytes (16-byte SSE move + 4 bytes).  */
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 16(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	20(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %ecx was already stored.  */
+	movb	%ch, 20(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit21):	/* Length-limited exit: copy exactly 21 bytes (16-byte SSE move + 4 bytes + 1 byte).  */
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %ecx
+	mov	20(%rsi), %dl
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 16(%rdi)
+	mov	%dl, 20(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	21(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %ecx was already stored.  */
+	movb	%ch, 21(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit22):	/* Length-limited exit: copy exactly 22 bytes (16-byte SSE move + overlapping 8 bytes at offset 14).  */
+	movdqu	(%rsi), %xmm0
+	mov	14(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 14(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	22(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %rcx was already stored.  */
+	movb	%ch, 22(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit23):	/* Length-limited exit: copy exactly 23 bytes (16-byte SSE move + overlapping 8 bytes at offset 15).  */
+	movdqu	(%rsi), %xmm0
+	mov	15(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 15(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	23(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %rcx was already stored.  */
+	movb	%ch, 23(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit24):	/* Length-limited exit: copy exactly 24 bytes (16-byte SSE move + 8 bytes).  */
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 16(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	24(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %rcx was already stored.  */
+	movb	%ch, 24(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit25):	/* Length-limited exit: copy exactly 25 bytes (16-byte SSE move + 8 bytes + 1 byte).  */
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %cl
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%cl, 24(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	25(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 25(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit26):	/* Length-limited exit: copy exactly 26 bytes (16-byte SSE move + 8 bytes + 2 bytes).  */
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %cx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%cx, 24(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	26(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %cx was already stored.  */
+	movb	%ch, 26(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit27):	/* Length-limited exit: copy exactly 27 bytes (16-byte SSE move + 8 bytes + overlapping 4 bytes at offset 23).  */
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	23(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%ecx, 23(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	27(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %ecx was already stored.  */
+	movb	%ch, 27(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit28):	/* Length-limited exit: copy exactly 28 bytes (16-byte SSE move + 8 bytes + 4 bytes).  */
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%ecx, 24(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	28(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch	/* Safe to clobber: the data held in %ecx was already stored.  */
+	movb	%ch, 28(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit29):	/* Length-limited exit: copy exactly 29 bytes with two overlapping 16-byte SSE moves (offsets 0 and 13).  */
+	movdqu	(%rsi), %xmm0
+	movdqu	13(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 13(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	29(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 29(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit30):	/* Length-limited exit: copy exactly 30 bytes with two overlapping 16-byte SSE moves (offsets 0 and 14).  */
+	movdqu	(%rsi), %xmm0
+	movdqu	14(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 14(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	30(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 30(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit31):	/* Length-limited exit: copy exactly 31 bytes with two overlapping 16-byte SSE moves (offsets 0 and 15).  */
+	movdqu	(%rsi), %xmm0
+	movdqu	15(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 15(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	31(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 31(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit32):	/* Length-limited exit: copy exactly 32 bytes with two 16-byte SSE moves.  */
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 16(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	32(%rdi), %rax	/* stpcpy result: one past the last byte copied.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 32(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit33):	/* Length-limited exit: copy exactly 33 bytes (two 16-byte SSE moves + 1 byte).  Unlike its siblings it has no USE_AS_STPCPY branch, so %rax is not set here.  */
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm2
+	mov	32(%rsi), %cl
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 16(%rdi)
+	mov	%cl, 32(%rdi)
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 33(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+#  ifndef USE_AS_STRCAT
+
+	.p2align 4
+L(Fill0):	/* L(FillTable) entry 0: nothing left to zero.  */
+	ret
+
+	.p2align 4
+L(Fill1):	/* Zero 1 byte at (%rdi); expects %rdx == 0, as cleared in L(StrncpyFillTailWithZero).  */
+	mov	%dl, (%rdi)
+	ret
+
+	.p2align 4
+L(Fill2):	/* Zero 2 bytes at (%rdi); expects %rdx == 0, as cleared in L(StrncpyFillTailWithZero).  */
+	mov	%dx, (%rdi)
+	ret
+
+	.p2align 4
+L(Fill3):	/* Zero 3 bytes at (%rdi) with one 4-byte store; expects %rdx == 0, as cleared in L(StrncpyFillTailWithZero).  */
+	mov	%edx, -1(%rdi)	/* Also rewrites -1(%rdi) with 0; presumably that byte is already 0 -- TODO confirm.  */
+	ret
+
+	.p2align 4
+L(Fill4):	/* Zero 4 bytes at (%rdi); expects %rdx == 0, as cleared in L(StrncpyFillTailWithZero).  */
+	mov	%edx, (%rdi)
+	ret
+
+	.p2align 4
+L(Fill5):	/* Zero 5 bytes at (%rdi) (4 + 1); expects %rdx == 0, as cleared in L(StrncpyFillTailWithZero).  */
+	mov	%edx, (%rdi)
+	mov	%dl, 4(%rdi)
+	ret
+
+	.p2align 4
+L(Fill6):	/* Zero 6 bytes at (%rdi) (4 + 2); expects %rdx == 0, as cleared in L(StrncpyFillTailWithZero).  */
+	mov	%edx, (%rdi)
+	mov	%dx, 4(%rdi)
+	ret
+
+	.p2align 4
+L(Fill7):	/* Zero 7 bytes at (%rdi) with one 8-byte store; expects %rdx == 0, as cleared in L(StrncpyFillTailWithZero).  */
+	mov	%rdx, -1(%rdi)	/* Also rewrites -1(%rdi) with 0; presumably that byte is already 0 -- TODO confirm.  */
+	ret
+
+	.p2align 4
+L(Fill8):	/* Zero 8 bytes at (%rdi); expects %rdx == 0, as cleared in L(StrncpyFillTailWithZero).  */
+	mov	%rdx, (%rdi)
+	ret
+
+	.p2align 4
+L(Fill9):	/* Zero 9 bytes at (%rdi) (8 + 1); expects %rdx == 0, as cleared in L(StrncpyFillTailWithZero).  */
+	mov	%rdx, (%rdi)
+	mov	%dl, 8(%rdi)
+	ret
+
+	.p2align 4
+L(Fill10):	/* Zero 10 bytes at (%rdi) (8 + 2); expects %rdx == 0, as cleared in L(StrncpyFillTailWithZero).  */
+	mov	%rdx, (%rdi)
+	mov	%dx, 8(%rdi)
+	ret
+
+	.p2align 4
+L(Fill11):	/* Zero 11 bytes at (%rdi) (8 bytes, then overlapping 4 bytes at offset 7); expects %rdx == 0, as cleared in L(StrncpyFillTailWithZero).  */
+	mov	%rdx, (%rdi)
+	mov	%edx, 7(%rdi)
+	ret
+
+	.p2align 4
+L(Fill12):	/* Zero 12 bytes at (%rdi) (8 + 4); expects %rdx == 0, as cleared in L(StrncpyFillTailWithZero).  */
+	mov	%rdx, (%rdi)
+	mov	%edx, 8(%rdi)
+	ret
+
+	.p2align 4
+L(Fill13):	/* Zero 13 bytes at (%rdi) with two overlapping 8-byte stores (offsets 0 and 5); expects %rdx == 0, as cleared in L(StrncpyFillTailWithZero).  */
+	mov	%rdx, (%rdi)
+	mov	%rdx, 5(%rdi)
+	ret
+
+	.p2align 4
+L(Fill14):	/* Zero 14 bytes at (%rdi) with two overlapping 8-byte stores (offsets 0 and 6); expects %rdx == 0, as cleared in L(StrncpyFillTailWithZero).  */
+	mov	%rdx, (%rdi)
+	mov	%rdx, 6(%rdi)
+	ret
+
+	.p2align 4
+L(Fill15):	/* Zero 15 bytes at (%rdi) with one 16-byte store; expects %xmm0 == 0, as cleared in L(StrncpyFillTailWithZero).  */
+	movdqu	%xmm0, -1(%rdi)	/* Also rewrites -1(%rdi) with 0; presumably that byte is already 0 -- TODO confirm.  */
+	ret
+
+	.p2align 4
+L(Fill16):	/* Zero 16 bytes at (%rdi); expects %xmm0 == 0, as cleared in L(StrncpyFillTailWithZero).  */
+	movdqu	%xmm0, (%rdi)
+	ret
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm2):	/* Store %xmm2 at %rdi + %rcx, then fall through into L(CopyFrom1To16BytesXmmExit).  */
+	movdqu	%xmm2, (%rdi, %rcx)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmmExit):	/* Turn the mask in %rdx into a bit index, advance %rdi past it, adjust %r8, then fall through into the zero-fill code.  */
+	bsf	%rdx, %rdx	/* rdx = index of the lowest set bit (presumably the NUL position in the current 16-byte block -- confirm at the callers).  */
+	add	$15, %r8
+	add	%rcx, %rdi
+#   ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax	/* stpcpy result: rdi + bit index.  */
+#   endif
+	sub	%rdx, %r8	/* Net effect: r8 = r8 + 15 - index.  */
+	lea	1(%rdi, %rdx), %rdi	/* rdi = first byte after the indexed one.  */
+
+	.p2align 4
+L(StrncpyFillTailWithZero):	/* Zero-fill: write %r8 zero bytes starting at (%rdi).  Clobbers %rsi, %rdx and %xmm0.  */
+	pxor	%xmm0, %xmm0	/* xmm0 = 0: source of the 16-byte zero stores.  */
+	xor	%rdx, %rdx	/* rdx = 0: source of the scalar zero stores in L(FillN).  */
+	sub	$16, %r8
+	jbe	L(StrncpyFillExit)	/* 16 bytes or fewer to write: a single L(FillN) does it.  */
+
+	movdqu	%xmm0, (%rdi)	/* Unaligned store covers the bytes up to the next 16-byte boundary.  */
+	add	$16, %rdi
+
+	mov	%rdi, %rsi
+	and	$0xf, %rsi	/* rsi = misalignment of the advanced pointer.  */
+	sub	%rsi, %rdi	/* Round rdi down to a 16-byte boundary (re-zeroes up to 15 bytes)...  */
+	add	%rsi, %r8	/* ...and give those bytes back to the remaining count.  */
+	sub	$64, %r8
+	jb	L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):	/* Main loop: 64 zero bytes per iteration with aligned stores.  */
+	movdqa	%xmm0, (%rdi)
+	movdqa	%xmm0, 16(%rdi)
+	movdqa	%xmm0, 32(%rdi)
+	movdqa	%xmm0, 48(%rdi)
+	add	$64, %rdi
+	sub	$64, %r8
+	jae	L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):	/* Fewer than 64 bytes remain; r8 holds (remaining - 64).  */
+	add	$32, %r8
+	jl	L(StrncpyFillLess32)
+	movdqa	%xmm0, (%rdi)
+	movdqa	%xmm0, 16(%rdi)
+	add	$32, %rdi
+	sub	$16, %r8
+	jl	L(StrncpyFillExit)
+	movdqa	%xmm0, (%rdi)
+	add	$16, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)	/* Macro defined outside this view; presumably jumps to L(FillTable) entry r8 (0..15 here).  */
+
+L(StrncpyFillLess32):	/* Fewer than 32 bytes remain; r8 holds (remaining - 32).  */
+	add	$16, %r8
+	jl	L(StrncpyFillExit)
+	movdqa	%xmm0, (%rdi)
+	add	$16, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+L(StrncpyFillExit):	/* r8 holds (remaining - 16); restore it and finish with one L(FillN).  */
+	add	$16, %r8
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+/* end of ifndef USE_AS_STRCAT */
+#  endif
+
+	.p2align 4
+L(UnalignedLeaveCase2OrCase3):	/* Leave path for a 64-byte step held in xmm4..xmm7: split on whether the mask in %rdx is non-zero.  */
+	test	%rdx, %rdx
+	jnz	L(Unaligned64LeaveCase2)	/* Mask non-zero: handled by Case2.  */
+L(Unaligned64LeaveCase3):	/* Mask zero: store xmm4..xmm7 one block at a time while r8 allows.  */
+	lea	64(%r8), %rcx
+	and	$-16, %rcx	/* rcx = (r8 + 64) rounded down to a multiple of 16.  */
+	add	$48, %r8
+	jl	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm4, (%rdi)
+	sub	$16, %r8
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm5, 16(%rdi)
+	sub	$16, %r8
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm6, 32(%rdi)
+	sub	$16, %r8
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm7, 48(%rdi)	/* All four blocks (64 bytes) stored.  */
+#  ifdef USE_AS_STPCPY
+	lea	64(%rdi), %rax	/* stpcpy result: one past the 64 bytes stored.  */
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 64(%rdi)	/* strcat variant appends a terminating zero byte.  */
+#  endif
+	ret
+
+	.p2align 4
+L(Unaligned64LeaveCase2):	/* Re-examine xmm4..xmm7 one 16-byte block at a time; %rcx tracks the block offset, %r8 the remaining length.  */
+	xor	%rcx, %rcx
+	pcmpeqb	%xmm4, %xmm0	/* Byte-wise compare against xmm0 (presumably all-zero on entry -- confirm at the caller).  */
+	pmovmskb %xmm0, %rdx
+	add	$48, %r8
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%rdx, %rdx
+#  ifndef USE_AS_STRCAT
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+#  else
+	jnz	L(CopyFrom1To16Bytes)
+#  endif
+	pcmpeqb	%xmm5, %xmm0	/* Mask was 0, so the previous compare left xmm0 all-zero again.  */
+	pmovmskb %xmm0, %rdx
+	movdqu	%xmm4, (%rdi)
+	add	$16, %rcx
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%rdx, %rdx
+#  ifndef USE_AS_STRCAT
+	jnz	L(CopyFrom1To16BytesUnalignedXmm5)
+#  else
+	jnz	L(CopyFrom1To16Bytes)
+#  endif
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %rdx
+	movdqu	%xmm5, 16(%rdi)
+	add	$16, %rcx
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%rdx, %rdx
+#  ifndef USE_AS_STRCAT
+	jnz	L(CopyFrom1To16BytesUnalignedXmm6)
+#  else
+	jnz	L(CopyFrom1To16Bytes)
+#  endif
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %rdx
+	movdqu	%xmm6, 32(%rdi)
+	lea	16(%rdi, %rcx), %rdi	/* rcx is 32 here: advance both pointers to the fourth block.  */
+	lea	16(%rsi, %rcx), %rsi
+	bsf	%rdx, %rdx	/* rdx = index of the lowest set mask bit.  */
+	cmp	%r8, %rdx
+	jb	L(CopyFrom1To16BytesExit)	/* Index below r8: the matching byte lies within the limit.  */
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)	/* Otherwise dispatch on r8 (macro defined outside this view).  */
+
+	.p2align 4
+L(ExitZero):	/* Return without copying anything.  */
+#  ifndef USE_AS_STRCAT
+	mov	%rdi, %rax	/* Non-strcat builds return the destination pointer.  */
+#  endif
+	ret
+
+# endif
+
+# ifndef USE_AS_STRCAT
+END (STRCPY)
+# else
+END (STRCAT)
+# endif
+	.p2align 4
+	.section .rodata
+L(ExitTable):	/* Jump table: entry i (0-based) refers to L(Exit<i+1>), covering Exit1..Exit32.  JMPTBL is defined outside this view.  */
+	.int	JMPTBL(L(Exit1), L(ExitTable))
+	.int	JMPTBL(L(Exit2), L(ExitTable))
+	.int	JMPTBL(L(Exit3), L(ExitTable))
+	.int	JMPTBL(L(Exit4), L(ExitTable))
+	.int	JMPTBL(L(Exit5), L(ExitTable))
+	.int	JMPTBL(L(Exit6), L(ExitTable))
+	.int	JMPTBL(L(Exit7), L(ExitTable))
+	.int	JMPTBL(L(Exit8), L(ExitTable))
+	.int	JMPTBL(L(Exit9), L(ExitTable))
+	.int	JMPTBL(L(Exit10), L(ExitTable))
+	.int	JMPTBL(L(Exit11), L(ExitTable))
+	.int	JMPTBL(L(Exit12), L(ExitTable))
+	.int	JMPTBL(L(Exit13), L(ExitTable))
+	.int	JMPTBL(L(Exit14), L(ExitTable))
+	.int	JMPTBL(L(Exit15), L(ExitTable))
+	.int	JMPTBL(L(Exit16), L(ExitTable))
+	.int	JMPTBL(L(Exit17), L(ExitTable))
+	.int	JMPTBL(L(Exit18), L(ExitTable))
+	.int	JMPTBL(L(Exit19), L(ExitTable))
+	.int	JMPTBL(L(Exit20), L(ExitTable))
+	.int	JMPTBL(L(Exit21), L(ExitTable))
+	.int	JMPTBL(L(Exit22), L(ExitTable))
+	.int    JMPTBL(L(Exit23), L(ExitTable))
+	.int	JMPTBL(L(Exit24), L(ExitTable))
+	.int	JMPTBL(L(Exit25), L(ExitTable))
+	.int	JMPTBL(L(Exit26), L(ExitTable))
+	.int	JMPTBL(L(Exit27), L(ExitTable))
+	.int	JMPTBL(L(Exit28), L(ExitTable))
+	.int	JMPTBL(L(Exit29), L(ExitTable))
+	.int	JMPTBL(L(Exit30), L(ExitTable))
+	.int	JMPTBL(L(Exit31), L(ExitTable))
+	.int	JMPTBL(L(Exit32), L(ExitTable))
+# ifdef USE_AS_STRNCPY
+L(ExitStrncpyTable):	/* Jump table: entry i refers to L(StrncpyExit<i>), covering StrncpyExit0..StrncpyExit33.  */
+	.int	JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+	.int    JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+#  ifndef USE_AS_STRCAT
+	.p2align 4
+L(FillTable):	/* Jump table: entry i refers to L(Fill<i>), i.e. the code that zeroes i bytes (0..16).  */
+	.int	JMPTBL(L(Fill0), L(FillTable))
+	.int	JMPTBL(L(Fill1), L(FillTable))
+	.int	JMPTBL(L(Fill2), L(FillTable))
+	.int	JMPTBL(L(Fill3), L(FillTable))
+	.int	JMPTBL(L(Fill4), L(FillTable))
+	.int	JMPTBL(L(Fill5), L(FillTable))
+	.int	JMPTBL(L(Fill6), L(FillTable))
+	.int	JMPTBL(L(Fill7), L(FillTable))
+	.int	JMPTBL(L(Fill8), L(FillTable))
+	.int	JMPTBL(L(Fill9), L(FillTable))
+	.int	JMPTBL(L(Fill10), L(FillTable))
+	.int	JMPTBL(L(Fill11), L(FillTable))
+	.int	JMPTBL(L(Fill12), L(FillTable))
+	.int	JMPTBL(L(Fill13), L(FillTable))
+	.int	JMPTBL(L(Fill14), L(FillTable))
+	.int	JMPTBL(L(Fill15), L(FillTable))
+	.int	JMPTBL(L(Fill16), L(FillTable))
+#  endif
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S
new file mode 100644
index 0000000000..47aaeae671
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S
@@ -0,0 +1,3551 @@
+/* strcpy with SSSE3
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT	/* Stand-alone entry point and byte-wise prologue; skipped when this file is included for strcat.  */
+#  include <sysdep.h>
+
+#  ifndef STRCPY
+#   define STRCPY  __strcpy_ssse3
+#  endif
+
+	.section .text.ssse3,"ax",@progbits
+ENTRY (STRCPY)	/* SysV AMD64 arguments: rdi = dst, rsi = src, rdx = n (strncpy builds only).  */
+
+	mov	%rsi, %rcx	/* rcx = source pointer.  */
+#  ifdef USE_AS_STRNCPY
+	mov	%rdx, %r8	/* r8 = length limit; saved before rdx is reused below.  */
+#  endif
+	mov	%rdi, %rdx	/* rdx = destination pointer.  */
+#  ifdef USE_AS_STRNCPY
+	test	%r8, %r8
+	jz	L(Exit0)	/* n == 0.  */
+	cmp	$8, %r8
+	jbe	L(StrncpyExit8Bytes)	/* n <= 8: handled by a dedicated path.  */
+#  endif
+	cmpb	$0, (%rcx)	/* Probe source bytes 0..7 for the NUL; L(ExitK) is taken when byte K-1 is 0.  */
+	jz	L(Exit1)
+	cmpb	$0, 1(%rcx)
+	jz	L(Exit2)
+	cmpb	$0, 2(%rcx)
+	jz	L(Exit3)
+	cmpb	$0, 3(%rcx)
+	jz	L(Exit4)
+	cmpb	$0, 4(%rcx)
+	jz	L(Exit5)
+	cmpb	$0, 5(%rcx)
+	jz	L(Exit6)
+	cmpb	$0, 6(%rcx)
+	jz	L(Exit7)
+	cmpb	$0, 7(%rcx)
+	jz	L(Exit8)
+#  ifdef USE_AS_STRNCPY
+	cmp	$16, %r8
+	jb	L(StrncpyExit15Bytes)	/* 8 < n < 16 and no NUL in the first 8 bytes.  */
+#  endif
+	cmpb	$0, 8(%rcx)	/* Probe source bytes 8..14.  */
+	jz	L(Exit9)
+	cmpb	$0, 9(%rcx)
+	jz	L(Exit10)
+	cmpb	$0, 10(%rcx)
+	jz	L(Exit11)
+	cmpb	$0, 11(%rcx)
+	jz	L(Exit12)
+	cmpb	$0, 12(%rcx)
+	jz	L(Exit13)
+	cmpb	$0, 13(%rcx)
+	jz	L(Exit14)
+	cmpb	$0, 14(%rcx)
+	jz	L(Exit15)
+#  ifdef USE_AS_STRNCPY
+	cmp	$16, %r8
+	je	L(Exit16)	/* n == 16: copy 16 bytes without probing byte 15.  */
+#  endif
+	cmpb	$0, 15(%rcx)
+	jz	L(Exit16)
+# endif
+
+# ifdef USE_AS_STRNCPY	/* No NUL in the first 16 source bytes: copy them, align the destination and dispatch on the source misalignment.  */
+	mov	%rcx, %rsi
+	sub	$16, %r8
+	and	$0xf, %rsi
+
+/* r8 = n - 16 + (rcx & 15): the first 16 bytes are accounted for and the
+   source misalignment is added back.  */
+
+	add	%rsi, %r8
+# endif
+	lea	16(%rcx), %rsi
+	and	$-16, %rsi	/* rsi = first 16-byte-aligned address above rcx.  */
+	pxor	%xmm0, %xmm0	/* xmm0 = 0, the comparand for NUL detection.  */
+	mov	(%rcx), %r9
+	mov	%r9, (%rdx)	/* Copy source bytes 0..7.  */
+	pcmpeqb	(%rsi), %xmm0	/* Look for a zero byte in the aligned block at rsi.  */
+	mov	8(%rcx), %r9
+	mov	%r9, 8(%rdx)	/* Copy source bytes 8..15.  */
+
+/* convert byte mask in xmm0 to bit mask */
+
+	pmovmskb %xmm0, %rax
+	sub	%rcx, %rsi	/* rsi = offset of that aligned block from rcx (1..16).  */
+
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%rdx, %rax
+	lea	16(%rdx), %rdx
+	and	$-16, %rdx	/* rdx = first 16-byte-aligned address above the old rdx.  */
+	sub	%rdx, %rax	/* rax = old rdx - new rdx, i.e. minus the destination advance (-16..-1).  */
+
+# ifdef USE_AS_STRNCPY
+	add	%rax, %rsi
+	lea	-1(%rsi), %rsi
+	and	$1<<31, %esi	/* Keep only bit 31: set iff the 32-bit value (rsi + rax - 1) is negative.  */
+	test	%rsi, %rsi	/* ZF is the same as after the and above (the 32-bit write zero-extends into rsi).  */
+	jnz	L(ContinueCopy)
+	lea	16(%r8), %r8	/* Value was non-negative: give 16 back to the remaining count.  */
+
+L(ContinueCopy):
+# endif
+	sub	%rax, %rcx	/* Advance the source by the same amount as the destination.  */
+	mov	%rcx, %rax
+	and	$0xf, %rax	/* rax = source misalignment relative to the now-aligned destination.  */
+	mov	$0, %rsi	/* mov, unlike xor, leaves the flags of the and above for the jz below.  */
+
+/* case: rcx_offset == rdx_offset */
+
+	jz	L(Align16Both)
+
+	cmp	$8, %rax
+	jae	L(ShlHigh8)	/* Misalignment 8..15; the flags of this cmp are reused at L(ShlHigh8).  */
+	cmp	$1, %rax
+	je	L(Shl1)
+	cmp	$2, %rax
+	je	L(Shl2)
+	cmp	$3, %rax
+	je	L(Shl3)
+	cmp	$4, %rax
+	je	L(Shl4)
+	cmp	$5, %rax
+	je	L(Shl5)
+	cmp	$6, %rax
+	je	L(Shl6)
+	jmp	L(Shl7)
+
+L(ShlHigh8):	/* Dispatch for source misalignment 8..15 held in %rax.  */
+	je	L(Shl8)	/* Still testing the flags of `cmp $8, %rax' executed just before the jae that led here.  */
+	cmp	$9, %rax
+	je	L(Shl9)
+	cmp	$10, %rax
+	je	L(Shl10)
+	cmp	$11, %rax
+	je	L(Shl11)
+	cmp	$12, %rax
+	je	L(Shl12)
+	cmp	$13, %rax
+	je	L(Shl13)
+	cmp	$14, %rax
+	je	L(Shl14)
+	jmp	L(Shl15)	/* Only 15 is left.  */
+
+L(Align16Both):	/* Source and destination share 16-byte alignment: copy 16 bytes per step with aligned moves; %rsi is the running offset.  */
+	movaps	(%rcx), %xmm1
+	movaps	16(%rcx), %xmm2
+	movaps	%xmm1, (%rdx)
+	pcmpeqb	%xmm2, %xmm0	/* Check the next block for a zero byte before it is stored.  */
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm3	/* Mask was 0, so xmm0 is all-zero again and can be reused as comparand.  */
+	movaps	%xmm2, (%rdx, %rsi)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm4
+	movaps	%xmm3, (%rdx, %rsi)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm1
+	movaps	%xmm4, (%rdx, %rsi)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm2
+	movaps	%xmm1, (%rdx, %rsi)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm3
+	movaps	%xmm2, (%rdx, %rsi)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%rdx, %rsi)	/* Store the last checked block, then set up the 64-byte loop.  */
+	mov	%rcx, %rax
+	lea	16(%rcx, %rsi), %rcx
+	and	$-0x40, %rcx	/* Round the source position down to a 64-byte boundary.  */
+	sub	%rcx, %rax	/* rax = old rcx - new rcx.  */
+	sub	%rax, %rdx	/* Move the destination by the same distance as the source.  */
+# ifdef USE_AS_STRNCPY
+	lea	112(%r8, %rax), %r8	/* Re-base the remaining count for the new position.  */
+# endif
+	mov	$-0x40, %rsi	/* rsi = -64: offset consumed by L(Aligned64Leave).  */
+
+	.p2align 4
+L(Aligned64Loop):	/* Main aligned loop: load 64 source bytes, test all of them for a zero byte at once, store them if none is found.  */
+	movaps	(%rcx), %xmm2
+	movaps	%xmm2, %xmm4	/* xmm4..xmm7 keep the four unmodified blocks for the stores.  */
+	movaps	16(%rcx), %xmm5
+	movaps	32(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%rcx), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3	/* Byte-wise minimum of all four blocks: it has a 0 byte iff some block has one.  */
+	pcmpeqb	%xmm0, %xmm3	/* xmm0 stays all-zero; the result goes to xmm3.  */
+	pmovmskb %xmm3, %rax
+	lea	64(%rdx), %rdx	/* Pointers are advanced before the stores, hence the negative offsets below.  */
+	lea	64(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeaveCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%rdx)
+	movaps	%xmm5, -48(%rdx)
+	movaps	%xmm6, -32(%rdx)
+	movaps	%xmm7, -16(%rdx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):	/* A zero byte lies somewhere in xmm4..xmm7: find the block, storing the earlier ones; %rsi (-64 on entry from the loop setup) grows by 16 per block.  */
+# ifdef USE_AS_STRNCPY
+	lea	48(%r8), %r8
+# endif
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %rax
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0	/* Previous mask was 0, so xmm0 is all-zero again.  */
+# ifdef USE_AS_STRNCPY
+	lea	-16(%r8), %r8
+# endif
+	pmovmskb %xmm0, %rax
+	movaps	%xmm4, -64(%rdx)
+	test	%rax, %rax
+	lea	16(%rsi), %rsi	/* lea does not disturb the flags set by test.  */
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%r8), %r8
+# endif
+	pmovmskb %xmm0, %rax
+	movaps	%xmm5, -48(%rdx)
+	test	%rax, %rax
+	lea	16(%rsi), %rsi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%rdx)
+	pcmpeqb	%xmm7, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%r8), %r8
+# endif
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+	jmp	L(CopyFrom1To16Bytes)	/* Unconditional: the first three blocks had no zero byte.  */
+
+	.p2align 4
+L(Shl1):	/* Source is 1 byte past a 16-byte boundary, destination aligned: read aligned blocks and realign them with palignr $1.  */
+	movaps	-1(%rcx), %xmm1	/* Aligned block containing the current source byte.  */
+	movaps	15(%rcx), %xmm2	/* Next aligned block.  */
+L(Shl1Start):	/* Also re-entered from L(Shl1LoopStart) once a zero byte is seen in a 64-byte chunk.  */
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3	/* Keep an unshifted copy: palignr overwrites xmm2.  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2	/* xmm2 = the 16 source bytes starting at (%rcx).  */
+	movaps	%xmm2, (%rdx)
+	movaps	31(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	31(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	31(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	31(%rcx), %rcx
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax	/* Prepare the 64-byte loop: round the source down to a 64-byte boundary.  */
+	and	$-0x40, %rcx
+	sub	%rcx, %rax	/* rax = distance rounded off.  */
+	lea	-15(%rcx), %rcx	/* Restore the "1 past a boundary" bias used by the offsets below.  */
+	sub	%rax, %rdx	/* Step the destination back by the same distance.  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* ...and return those bytes to the remaining count.  */
+# endif
+	movaps	-1(%rcx), %xmm1	/* Preceding aligned block, needed by the first palignr of the loop.  */
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl1LoopStart):	/* 64 bytes per iteration for source misalignment 1: four aligned loads, one combined zero-byte test, four realigned stores.  */
+	movaps	15(%rcx), %xmm2
+	movaps	31(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	47(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	63(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* Byte-wise minimum of the four blocks: has a 0 byte iff any block has one.  */
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* Save the unshifted last block; it becomes xmm1 of the next iteration.  */
+	palignr	$1, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$1, %xmm3, %xmm4	/* palignr does not modify the flags set by test.  */
+	jnz	L(Shl1Start)	/* Zero byte in this chunk: redo it 16 bytes at a time (xmm1/xmm2 are still intact).  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave1)
+# endif
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl1LoopStart)
+
+L(Shl1LoopExit):	/* Zero byte found in the next aligned block: flush the 16 bytes before it, then let the common tail code finish.  */
+	movdqu	-1(%rcx), %xmm1
+	mov	$15, %rsi	/* Offset handed to L(CopyFrom1To16Bytes) (defined outside this view): 16 minus the shift.  */
+	movdqu	%xmm1, -1(%rdx)	/* Unaligned store of source bytes -1..14 to the same offsets in the destination.  */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl2):	/* Source is 2 bytes past a 16-byte boundary, destination aligned: read aligned blocks and realign them with palignr $2.  */
+	movaps	-2(%rcx), %xmm1	/* Aligned block containing the current source byte.  */
+	movaps	14(%rcx), %xmm2	/* Next aligned block.  */
+L(Shl2Start):	/* Also re-entered from L(Shl2LoopStart) once a zero byte is seen in a 64-byte chunk.  */
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3	/* Keep an unshifted copy: palignr overwrites xmm2.  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2	/* xmm2 = the 16 source bytes starting at (%rcx).  */
+	movaps	%xmm2, (%rdx)
+	movaps	30(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	30(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	30(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	30(%rcx), %rcx
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax	/* Prepare the 64-byte loop: round the source down to a 64-byte boundary.  */
+	and	$-0x40, %rcx
+	sub	%rcx, %rax	/* rax = distance rounded off.  */
+	lea	-14(%rcx), %rcx	/* Restore the "2 past a boundary" bias used by the offsets below.  */
+	sub	%rax, %rdx	/* Step the destination back by the same distance.  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* ...and return those bytes to the remaining count.  */
+# endif
+	movaps	-2(%rcx), %xmm1	/* Preceding aligned block, needed by the first palignr of the loop.  */
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl2LoopStart):	/* 64 bytes per iteration for source misalignment 2: four aligned loads, one combined zero-byte test, four realigned stores.  */
+	movaps	14(%rcx), %xmm2
+	movaps	30(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	46(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	62(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* Byte-wise minimum of the four blocks: has a 0 byte iff any block has one.  */
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* Save the unshifted last block; it becomes xmm1 of the next iteration.  */
+	palignr	$2, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$2, %xmm3, %xmm4	/* palignr does not modify the flags set by test.  */
+	jnz	L(Shl2Start)	/* Zero byte in this chunk: redo it 16 bytes at a time (xmm1/xmm2 are still intact).  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave2)
+# endif
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl2LoopStart)
+
+L(Shl2LoopExit):	/* Zero byte found in the next aligned block: flush the 16 bytes before it, then let the common tail code finish.  */
+	movdqu	-2(%rcx), %xmm1
+	mov	$14, %rsi	/* Offset handed to L(CopyFrom1To16Bytes) (defined outside this view): 16 minus the shift.  */
+	movdqu	%xmm1, -2(%rdx)	/* Unaligned store of source bytes -2..13 to the same offsets in the destination.  */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl3):	/* Source is 3 bytes past a 16-byte boundary, destination aligned: read aligned blocks and realign them with palignr $3.  */
+	movaps	-3(%rcx), %xmm1	/* Aligned block containing the current source byte.  */
+	movaps	13(%rcx), %xmm2	/* Next aligned block.  */
+L(Shl3Start):	/* Also re-entered from L(Shl3LoopStart) once a zero byte is seen in a 64-byte chunk.  */
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3	/* Keep an unshifted copy: palignr overwrites xmm2.  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2	/* xmm2 = the 16 source bytes starting at (%rcx).  */
+	movaps	%xmm2, (%rdx)
+	movaps	29(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	29(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	29(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	29(%rcx), %rcx
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax	/* Prepare the 64-byte loop: round the source down to a 64-byte boundary.  */
+	and	$-0x40, %rcx
+	sub	%rcx, %rax	/* rax = distance rounded off.  */
+	lea	-13(%rcx), %rcx	/* Restore the "3 past a boundary" bias used by the offsets below.  */
+	sub	%rax, %rdx	/* Step the destination back by the same distance.  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* ...and return those bytes to the remaining count.  */
+# endif
+	movaps	-3(%rcx), %xmm1	/* Preceding aligned block, needed by the first palignr of the loop.  */
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl3LoopStart):	/* Each pass checks four aligned source vectors (64 bytes) and, if none holds a zero byte, stores 64 bytes.  */
+	movaps	13(%rcx), %xmm2
+	movaps	29(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	45(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	61(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* byte-wise minimum of all four vectors: a lane is 0 iff some vector has a 0 there  */
+	pcmpeqb	%xmm0, %xmm7	/* compared against %xmm0, presumably all zero here - TODO confirm  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* raw copy of the last vector, it becomes %xmm1 for the next pass  */
+	palignr	$3, %xmm4, %xmm5	/* palignr does not touch the flags, so it may sit around the test  */
+	test	%rax, %rax
+	palignr	$3, %xmm3, %xmm4
+	jnz	L(Shl3Start)	/* zero byte somewhere in these 64: redo them 16 at a time, %xmm1/%xmm2 are still unshifted  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave3)	/* target is outside this section  */
+# endif
+	palignr	$3, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$3, %xmm1, %xmm2	/* %xmm1 is the raw vector that precedes 13(%rcx)  */
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl3LoopStart)
+
+L(Shl3LoopExit):	/* A zero byte was flagged in the vector at 13(%rcx); first copy the 13 bytes in front of it.  */
+	movdqu	-3(%rcx), %xmm1	/* 16-byte move: the 13 bytes at %rcx plus the 3 bytes before it  */
+	mov	$13, %rsi	/* offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx (non-strcat definition)  */
+	movdqu	%xmm1, -3(%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl4):	/* %rcx must be 4 bytes past a 16-byte boundary: the movaps loads below fault on unaligned addresses.  */
+	movaps	-4(%rcx), %xmm1	/* aligned vector that holds the bytes at %rcx  */
+	movaps	12(%rcx), %xmm2	/* the next aligned vector  */
+L(Shl4Start):	/* Also entered from L(Shl4LoopStart) once it sees a zero byte.  */
+	pcmpeqb	%xmm2, %xmm0	/* flag zero bytes of %xmm2; presumes %xmm0 is all zero on entry - TODO confirm  */
+	pmovmskb %xmm0, %rax	/* one mask bit per flagged byte  */
+	movaps	%xmm2, %xmm3	/* keep a raw copy, palignr below overwrites %xmm2  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* %r8 appears to be the remaining length limit - confirm against the entry code  */
+	jbe	L(StrncpyExit4Case2OrCase3)	/* target is outside this section  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)	/* zero byte in the vector just checked  */
+
+	palignr	$4, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes that start at %rcx  */
+	movaps	%xmm2, (%rdx)	/* aligned store: %rdx has to be 16-byte aligned  */
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 take turns holding the previous raw vector  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)	/* fourth 16-byte store of the unrolled prologue  */
+	lea	28(%rcx), %rcx	/* skip the block just copied plus the 12-byte bias: %rcx is 16-byte aligned now  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source pointer down to a 64-byte boundary  */
+	sub	%rcx, %rax	/* %rax = how far the rounding stepped back  */
+	lea	-12(%rcx), %rcx	/* re-apply the bias expected by the loop's 12/28/44/60 load offsets  */
+	sub	%rax, %rdx	/* step the destination back equally; those bytes get stored again  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and hand the same amount back to the count  */
+# endif
+	movaps	-4(%rcx), %xmm1	/* raw vector preceding the loop's first load  */
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl4LoopStart):	/* Each pass checks four aligned source vectors (64 bytes) and, if none holds a zero byte, stores 64 bytes.  */
+	movaps	12(%rcx), %xmm2
+	movaps	28(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* byte-wise minimum of all four vectors: a lane is 0 iff some vector has a 0 there  */
+	pcmpeqb	%xmm0, %xmm7	/* compared against %xmm0, presumably all zero here - TODO confirm  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* raw copy of the last vector, it becomes %xmm1 for the next pass  */
+	palignr	$4, %xmm4, %xmm5	/* palignr does not touch the flags, so it may sit around the test  */
+	test	%rax, %rax
+	palignr	$4, %xmm3, %xmm4
+	jnz	L(Shl4Start)	/* zero byte somewhere in these 64: redo them 16 at a time, %xmm1/%xmm2 are still unshifted  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave4)	/* target is outside this section  */
+# endif
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$4, %xmm1, %xmm2	/* %xmm1 is the raw vector that precedes 12(%rcx)  */
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):	/* A zero byte was flagged in the vector at 12(%rcx); first copy the 12 bytes in front of it.  */
+	movdqu	-4(%rcx), %xmm1	/* 16-byte move: the 12 bytes at %rcx plus the 4 bytes before it  */
+	mov	$12, %rsi	/* offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx (non-strcat definition)  */
+	movdqu	%xmm1, -4(%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl5):	/* %rcx must be 5 bytes past a 16-byte boundary: the movaps loads below fault on unaligned addresses.  */
+	movaps	-5(%rcx), %xmm1	/* aligned vector that holds the bytes at %rcx  */
+	movaps	11(%rcx), %xmm2	/* the next aligned vector  */
+L(Shl5Start):	/* Also entered from L(Shl5LoopStart) once it sees a zero byte.  */
+	pcmpeqb	%xmm2, %xmm0	/* flag zero bytes of %xmm2; presumes %xmm0 is all zero on entry - TODO confirm  */
+	pmovmskb %xmm0, %rax	/* one mask bit per flagged byte  */
+	movaps	%xmm2, %xmm3	/* keep a raw copy, palignr below overwrites %xmm2  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* %r8 appears to be the remaining length limit - confirm against the entry code  */
+	jbe	L(StrncpyExit5Case2OrCase3)	/* target is outside this section  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl5LoopExit)	/* zero byte in the vector just checked  */
+
+	palignr	$5, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes that start at %rcx  */
+	movaps	%xmm2, (%rdx)	/* aligned store: %rdx has to be 16-byte aligned  */
+	movaps	27(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 take turns holding the previous raw vector  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	27(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	27(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)	/* fourth 16-byte store of the unrolled prologue  */
+	lea	27(%rcx), %rcx	/* skip the block just copied plus the 11-byte bias: %rcx is 16-byte aligned now  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source pointer down to a 64-byte boundary  */
+	sub	%rcx, %rax	/* %rax = how far the rounding stepped back  */
+	lea	-11(%rcx), %rcx	/* re-apply the bias expected by the loop's 11/27/43/59 load offsets  */
+	sub	%rax, %rdx	/* step the destination back equally; those bytes get stored again  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and hand the same amount back to the count  */
+# endif
+	movaps	-5(%rcx), %xmm1	/* raw vector preceding the loop's first load  */
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl5LoopStart):	/* Each pass checks four aligned source vectors (64 bytes) and, if none holds a zero byte, stores 64 bytes.  */
+	movaps	11(%rcx), %xmm2
+	movaps	27(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	43(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	59(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* byte-wise minimum of all four vectors: a lane is 0 iff some vector has a 0 there  */
+	pcmpeqb	%xmm0, %xmm7	/* compared against %xmm0, presumably all zero here - TODO confirm  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* raw copy of the last vector, it becomes %xmm1 for the next pass  */
+	palignr	$5, %xmm4, %xmm5	/* palignr does not touch the flags, so it may sit around the test  */
+	test	%rax, %rax
+	palignr	$5, %xmm3, %xmm4
+	jnz	L(Shl5Start)	/* zero byte somewhere in these 64: redo them 16 at a time, %xmm1/%xmm2 are still unshifted  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave5)	/* target is outside this section  */
+# endif
+	palignr	$5, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$5, %xmm1, %xmm2	/* %xmm1 is the raw vector that precedes 11(%rcx)  */
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl5LoopStart)
+
+L(Shl5LoopExit):	/* A zero byte was flagged in the vector at 11(%rcx); first copy the 11 bytes in front of it.  */
+	movdqu	-5(%rcx), %xmm1	/* 16-byte move: the 11 bytes at %rcx plus the 5 bytes before it  */
+	mov	$11, %rsi	/* offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx (non-strcat definition)  */
+	movdqu	%xmm1, -5(%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl6):	/* %rcx must be 6 bytes past a 16-byte boundary: the movaps loads below fault on unaligned addresses.  */
+	movaps	-6(%rcx), %xmm1	/* aligned vector that holds the bytes at %rcx  */
+	movaps	10(%rcx), %xmm2	/* the next aligned vector  */
+L(Shl6Start):	/* Also entered from L(Shl6LoopStart) once it sees a zero byte.  */
+	pcmpeqb	%xmm2, %xmm0	/* flag zero bytes of %xmm2; presumes %xmm0 is all zero on entry - TODO confirm  */
+	pmovmskb %xmm0, %rax	/* one mask bit per flagged byte  */
+	movaps	%xmm2, %xmm3	/* keep a raw copy, palignr below overwrites %xmm2  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* %r8 appears to be the remaining length limit - confirm against the entry code  */
+	jbe	L(StrncpyExit6Case2OrCase3)	/* target is outside this section  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl6LoopExit)	/* zero byte in the vector just checked  */
+
+	palignr	$6, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes that start at %rcx  */
+	movaps	%xmm2, (%rdx)	/* aligned store: %rdx has to be 16-byte aligned  */
+	movaps	26(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 take turns holding the previous raw vector  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	26(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	26(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)	/* fourth 16-byte store of the unrolled prologue  */
+	lea	26(%rcx), %rcx	/* skip the block just copied plus the 10-byte bias: %rcx is 16-byte aligned now  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source pointer down to a 64-byte boundary  */
+	sub	%rcx, %rax	/* %rax = how far the rounding stepped back  */
+	lea	-10(%rcx), %rcx	/* re-apply the bias expected by the loop's 10/26/42/58 load offsets  */
+	sub	%rax, %rdx	/* step the destination back equally; those bytes get stored again  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and hand the same amount back to the count  */
+# endif
+	movaps	-6(%rcx), %xmm1	/* raw vector preceding the loop's first load  */
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl6LoopStart):	/* Each pass checks four aligned source vectors (64 bytes) and, if none holds a zero byte, stores 64 bytes.  */
+	movaps	10(%rcx), %xmm2
+	movaps	26(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	42(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	58(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* byte-wise minimum of all four vectors: a lane is 0 iff some vector has a 0 there  */
+	pcmpeqb	%xmm0, %xmm7	/* compared against %xmm0, presumably all zero here - TODO confirm  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* raw copy of the last vector, it becomes %xmm1 for the next pass  */
+	palignr	$6, %xmm4, %xmm5	/* palignr does not touch the flags, so it may sit around the test  */
+	test	%rax, %rax
+	palignr	$6, %xmm3, %xmm4
+	jnz	L(Shl6Start)	/* zero byte somewhere in these 64: redo them 16 at a time, %xmm1/%xmm2 are still unshifted  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave6)	/* target is outside this section  */
+# endif
+	palignr	$6, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$6, %xmm1, %xmm2	/* %xmm1 is the raw vector that precedes 10(%rcx)  */
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl6LoopStart)
+
+L(Shl6LoopExit):	/* A zero byte was flagged in the vector at 10(%rcx); first copy the 10 bytes in front of it.  */
+	mov	(%rcx), %r9	/* bytes 0-7  */
+	mov	6(%rcx), %esi	/* bytes 6-9, overlapping the first move; %esi is only scratch here  */
+	mov	%r9, (%rdx)
+	mov	%esi, 6(%rdx)
+	mov	$10, %rsi	/* offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx (non-strcat definition)  */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl7):	/* %rcx must be 7 bytes past a 16-byte boundary: the movaps loads below fault on unaligned addresses.  */
+	movaps	-7(%rcx), %xmm1	/* aligned vector that holds the bytes at %rcx  */
+	movaps	9(%rcx), %xmm2	/* the next aligned vector  */
+L(Shl7Start):	/* Also entered from L(Shl7LoopStart) once it sees a zero byte.  */
+	pcmpeqb	%xmm2, %xmm0	/* flag zero bytes of %xmm2; presumes %xmm0 is all zero on entry - TODO confirm  */
+	pmovmskb %xmm0, %rax	/* one mask bit per flagged byte  */
+	movaps	%xmm2, %xmm3	/* keep a raw copy, palignr below overwrites %xmm2  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* %r8 appears to be the remaining length limit - confirm against the entry code  */
+	jbe	L(StrncpyExit7Case2OrCase3)	/* target is outside this section  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl7LoopExit)	/* zero byte in the vector just checked  */
+
+	palignr	$7, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes that start at %rcx  */
+	movaps	%xmm2, (%rdx)	/* aligned store: %rdx has to be 16-byte aligned  */
+	movaps	25(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 take turns holding the previous raw vector  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	25(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	25(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)	/* fourth 16-byte store of the unrolled prologue  */
+	lea	25(%rcx), %rcx	/* skip the block just copied plus the 9-byte bias: %rcx is 16-byte aligned now  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source pointer down to a 64-byte boundary  */
+	sub	%rcx, %rax	/* %rax = how far the rounding stepped back  */
+	lea	-9(%rcx), %rcx	/* re-apply the bias expected by the loop's 9/25/41/57 load offsets  */
+	sub	%rax, %rdx	/* step the destination back equally; those bytes get stored again  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and hand the same amount back to the count  */
+# endif
+	movaps	-7(%rcx), %xmm1	/* raw vector preceding the loop's first load  */
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl7LoopStart):	/* Each pass checks four aligned source vectors (64 bytes) and, if none holds a zero byte, stores 64 bytes.  */
+	movaps	9(%rcx), %xmm2
+	movaps	25(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	41(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	57(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* byte-wise minimum of all four vectors: a lane is 0 iff some vector has a 0 there  */
+	pcmpeqb	%xmm0, %xmm7	/* compared against %xmm0, presumably all zero here - TODO confirm  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* raw copy of the last vector, it becomes %xmm1 for the next pass  */
+	palignr	$7, %xmm4, %xmm5	/* palignr does not touch the flags, so it may sit around the test  */
+	test	%rax, %rax
+	palignr	$7, %xmm3, %xmm4
+	jnz	L(Shl7Start)	/* zero byte somewhere in these 64: redo them 16 at a time, %xmm1/%xmm2 are still unshifted  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave7)	/* target is outside this section  */
+# endif
+	palignr	$7, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$7, %xmm1, %xmm2	/* %xmm1 is the raw vector that precedes 9(%rcx)  */
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl7LoopStart)
+
+L(Shl7LoopExit):	/* A zero byte was flagged in the vector at 9(%rcx); first copy the 9 bytes in front of it.  */
+	mov	(%rcx), %r9	/* bytes 0-7  */
+	mov	5(%rcx), %esi	/* bytes 5-8, overlapping the first move; %esi is only scratch here  */
+	mov	%r9, (%rdx)
+	mov	%esi, 5(%rdx)
+	mov	$9, %rsi	/* offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx (non-strcat definition)  */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl8):	/* %rcx must be 8 bytes past a 16-byte boundary: the movaps loads below fault on unaligned addresses.  */
+	movaps	-8(%rcx), %xmm1	/* aligned vector that holds the bytes at %rcx  */
+	movaps	8(%rcx), %xmm2	/* the next aligned vector  */
+L(Shl8Start):	/* Also entered from L(Shl8LoopStart) once it sees a zero byte.  */
+	pcmpeqb	%xmm2, %xmm0	/* flag zero bytes of %xmm2; presumes %xmm0 is all zero on entry - TODO confirm  */
+	pmovmskb %xmm0, %rax	/* one mask bit per flagged byte  */
+	movaps	%xmm2, %xmm3	/* keep a raw copy, palignr below overwrites %xmm2  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* %r8 appears to be the remaining length limit - confirm against the entry code  */
+	jbe	L(StrncpyExit8Case2OrCase3)	/* target is outside this section  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)	/* zero byte in the vector just checked  */
+
+	palignr	$8, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes that start at %rcx  */
+	movaps	%xmm2, (%rdx)	/* aligned store: %rdx has to be 16-byte aligned  */
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 take turns holding the previous raw vector  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)	/* fourth 16-byte store of the unrolled prologue  */
+	lea	24(%rcx), %rcx	/* skip the block just copied plus the 8-byte bias: %rcx is 16-byte aligned now  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source pointer down to a 64-byte boundary  */
+	sub	%rcx, %rax	/* %rax = how far the rounding stepped back  */
+	lea	-8(%rcx), %rcx	/* re-apply the bias expected by the loop's 8/24/40/56 load offsets  */
+	sub	%rax, %rdx	/* step the destination back equally; those bytes get stored again  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and hand the same amount back to the count  */
+# endif
+	movaps	-8(%rcx), %xmm1	/* raw vector preceding the loop's first load  */
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl8LoopStart):	/* Each pass checks four aligned source vectors (64 bytes) and, if none holds a zero byte, stores 64 bytes.  */
+	movaps	8(%rcx), %xmm2
+	movaps	24(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* byte-wise minimum of all four vectors: a lane is 0 iff some vector has a 0 there  */
+	pcmpeqb	%xmm0, %xmm7	/* compared against %xmm0, presumably all zero here - TODO confirm  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* raw copy of the last vector, it becomes %xmm1 for the next pass  */
+	palignr	$8, %xmm4, %xmm5	/* palignr does not touch the flags, so it may sit around the test  */
+	test	%rax, %rax
+	palignr	$8, %xmm3, %xmm4
+	jnz	L(Shl8Start)	/* zero byte somewhere in these 64: redo them 16 at a time, %xmm1/%xmm2 are still unshifted  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave8)	/* target is outside this section  */
+# endif
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$8, %xmm1, %xmm2	/* %xmm1 is the raw vector that precedes 8(%rcx)  */
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):	/* A zero byte was flagged in the vector at 8(%rcx); first copy the 8 bytes in front of it.  */
+	mov	(%rcx), %r9	/* exactly those 8 bytes  */
+	mov	$8, %rsi	/* offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx (non-strcat definition)  */
+	mov	%r9, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl9):	/* %rcx must be 9 bytes past a 16-byte boundary: the movaps loads below fault on unaligned addresses.  */
+	movaps	-9(%rcx), %xmm1	/* aligned vector that holds the bytes at %rcx  */
+	movaps	7(%rcx), %xmm2	/* the next aligned vector  */
+L(Shl9Start):	/* Also entered from L(Shl9LoopStart) once it sees a zero byte.  */
+	pcmpeqb	%xmm2, %xmm0	/* flag zero bytes of %xmm2; presumes %xmm0 is all zero on entry - TODO confirm  */
+	pmovmskb %xmm0, %rax	/* one mask bit per flagged byte  */
+	movaps	%xmm2, %xmm3	/* keep a raw copy, palignr below overwrites %xmm2  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* %r8 appears to be the remaining length limit - confirm against the entry code  */
+	jbe	L(StrncpyExit9Case2OrCase3)	/* target is outside this section  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl9LoopExit)	/* zero byte in the vector just checked  */
+
+	palignr	$9, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes that start at %rcx  */
+	movaps	%xmm2, (%rdx)	/* aligned store: %rdx has to be 16-byte aligned  */
+	movaps	23(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 take turns holding the previous raw vector  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	23(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	23(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)	/* fourth 16-byte store of the unrolled prologue  */
+	lea	23(%rcx), %rcx	/* skip the block just copied plus the 7-byte bias: %rcx is 16-byte aligned now  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source pointer down to a 64-byte boundary  */
+	sub	%rcx, %rax	/* %rax = how far the rounding stepped back  */
+	lea	-7(%rcx), %rcx	/* re-apply the bias expected by the loop's 7/23/39/55 load offsets  */
+	sub	%rax, %rdx	/* step the destination back equally; those bytes get stored again  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and hand the same amount back to the count  */
+# endif
+	movaps	-9(%rcx), %xmm1	/* raw vector preceding the loop's first load  */
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl9LoopStart):	/* Each pass checks four aligned source vectors (64 bytes) and, if none holds a zero byte, stores 64 bytes.  */
+	movaps	7(%rcx), %xmm2
+	movaps	23(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	39(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	55(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* byte-wise minimum of all four vectors: a lane is 0 iff some vector has a 0 there  */
+	pcmpeqb	%xmm0, %xmm7	/* compared against %xmm0, presumably all zero here - TODO confirm  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* raw copy of the last vector, it becomes %xmm1 for the next pass  */
+	palignr	$9, %xmm4, %xmm5	/* palignr does not touch the flags, so it may sit around the test  */
+	test	%rax, %rax
+	palignr	$9, %xmm3, %xmm4
+	jnz	L(Shl9Start)	/* zero byte somewhere in these 64: redo them 16 at a time, %xmm1/%xmm2 are still unshifted  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave9)	/* target is outside this section  */
+# endif
+	palignr	$9, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$9, %xmm1, %xmm2	/* %xmm1 is the raw vector that precedes 7(%rcx)  */
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl9LoopStart)
+
+L(Shl9LoopExit):	/* A zero byte was flagged in the vector at 7(%rcx); first copy the 7 bytes in front of it.  */
+	mov	-1(%rcx), %r9	/* 8-byte move: the 7 bytes at %rcx plus the 1 byte before it  */
+	mov	$7, %rsi	/* offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx (non-strcat definition)  */
+	mov	%r9, -1(%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl10):	/* %rcx must be 10 bytes past a 16-byte boundary: the movaps loads below fault on unaligned addresses.  */
+	movaps	-10(%rcx), %xmm1	/* aligned vector that holds the bytes at %rcx  */
+	movaps	6(%rcx), %xmm2	/* the next aligned vector  */
+L(Shl10Start):	/* Also entered from L(Shl10LoopStart) once it sees a zero byte.  */
+	pcmpeqb	%xmm2, %xmm0	/* flag zero bytes of %xmm2; presumes %xmm0 is all zero on entry - TODO confirm  */
+	pmovmskb %xmm0, %rax	/* one mask bit per flagged byte  */
+	movaps	%xmm2, %xmm3	/* keep a raw copy, palignr below overwrites %xmm2  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* %r8 appears to be the remaining length limit - confirm against the entry code  */
+	jbe	L(StrncpyExit10Case2OrCase3)	/* target is outside this section  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl10LoopExit)	/* zero byte in the vector just checked  */
+
+	palignr	$10, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes that start at %rcx  */
+	movaps	%xmm2, (%rdx)	/* aligned store: %rdx has to be 16-byte aligned  */
+	movaps	22(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 take turns holding the previous raw vector  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	22(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	22(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)	/* fourth 16-byte store of the unrolled prologue  */
+	lea	22(%rcx), %rcx	/* skip the block just copied plus the 6-byte bias: %rcx is 16-byte aligned now  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source pointer down to a 64-byte boundary  */
+	sub	%rcx, %rax	/* %rax = how far the rounding stepped back  */
+	lea	-6(%rcx), %rcx	/* re-apply the bias expected by the loop's 6/22/38/54 load offsets  */
+	sub	%rax, %rdx	/* step the destination back equally; those bytes get stored again  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and hand the same amount back to the count  */
+# endif
+	movaps	-10(%rcx), %xmm1	/* raw vector preceding the loop's first load  */
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl10LoopStart):	/* Each pass checks four aligned source vectors (64 bytes) and, if none holds a zero byte, stores 64 bytes.  */
+	movaps	6(%rcx), %xmm2
+	movaps	22(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	38(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	54(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* byte-wise minimum of all four vectors: a lane is 0 iff some vector has a 0 there  */
+	pcmpeqb	%xmm0, %xmm7	/* compared against %xmm0, presumably all zero here - TODO confirm  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* raw copy of the last vector, it becomes %xmm1 for the next pass  */
+	palignr	$10, %xmm4, %xmm5	/* palignr does not touch the flags, so it may sit around the test  */
+	test	%rax, %rax
+	palignr	$10, %xmm3, %xmm4
+	jnz	L(Shl10Start)	/* zero byte somewhere in these 64: redo them 16 at a time, %xmm1/%xmm2 are still unshifted  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave10)	/* target is outside this section  */
+# endif
+	palignr	$10, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$10, %xmm1, %xmm2	/* %xmm1 is the raw vector that precedes 6(%rcx)  */
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl10LoopStart)
+
+L(Shl10LoopExit):	/* A zero byte was flagged in the vector at 6(%rcx); first copy the 6 bytes in front of it.  */
+	mov	-2(%rcx), %r9	/* 8-byte move: the 6 bytes at %rcx plus the 2 bytes before it  */
+	mov	$6, %rsi	/* offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx (non-strcat definition)  */
+	mov	%r9, -2(%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl11):	/* %rcx must be 11 bytes past a 16-byte boundary: the movaps loads below fault on unaligned addresses.  */
+	movaps	-11(%rcx), %xmm1	/* aligned vector that holds the bytes at %rcx  */
+	movaps	5(%rcx), %xmm2	/* the next aligned vector  */
+L(Shl11Start):	/* Also entered from L(Shl11LoopStart) once it sees a zero byte.  */
+	pcmpeqb	%xmm2, %xmm0	/* flag zero bytes of %xmm2; presumes %xmm0 is all zero on entry - TODO confirm  */
+	pmovmskb %xmm0, %rax	/* one mask bit per flagged byte  */
+	movaps	%xmm2, %xmm3	/* keep a raw copy, palignr below overwrites %xmm2  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* %r8 appears to be the remaining length limit - confirm against the entry code  */
+	jbe	L(StrncpyExit11Case2OrCase3)	/* target is outside this section  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl11LoopExit)	/* zero byte in the vector just checked  */
+
+	palignr	$11, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes that start at %rcx  */
+	movaps	%xmm2, (%rdx)	/* aligned store: %rdx has to be 16-byte aligned  */
+	movaps	21(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 take turns holding the previous raw vector  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	21(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	21(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)	/* fourth 16-byte store of the unrolled prologue  */
+	lea	21(%rcx), %rcx	/* skip the block just copied plus the 5-byte bias: %rcx is 16-byte aligned now  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source pointer down to a 64-byte boundary  */
+	sub	%rcx, %rax	/* %rax = how far the rounding stepped back  */
+	lea	-5(%rcx), %rcx	/* re-apply the bias expected by the loop's 5/21/37/53 load offsets  */
+	sub	%rax, %rdx	/* step the destination back equally; those bytes get stored again  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and hand the same amount back to the count  */
+# endif
+	movaps	-11(%rcx), %xmm1	/* raw vector preceding the loop's first load  */
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl11LoopStart):	/* Each pass checks four aligned source vectors (64 bytes) and, if none holds a zero byte, stores 64 bytes.  */
+	movaps	5(%rcx), %xmm2
+	movaps	21(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	37(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	53(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* byte-wise minimum of all four vectors: a lane is 0 iff some vector has a 0 there  */
+	pcmpeqb	%xmm0, %xmm7	/* compared against %xmm0, presumably all zero here - TODO confirm  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* raw copy of the last vector, it becomes %xmm1 for the next pass  */
+	palignr	$11, %xmm4, %xmm5	/* palignr does not touch the flags, so it may sit around the test  */
+	test	%rax, %rax
+	palignr	$11, %xmm3, %xmm4
+	jnz	L(Shl11Start)	/* zero byte somewhere in these 64: redo them 16 at a time, %xmm1/%xmm2 are still unshifted  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave11)	/* target is outside this section  */
+# endif
+	palignr	$11, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$11, %xmm1, %xmm2	/* %xmm1 is the raw vector that precedes 5(%rcx)  */
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl11LoopStart)
+
+L(Shl11LoopExit):	/* A zero byte was flagged in the vector at 5(%rcx); first copy the 5 bytes in front of it.  */
+	mov	-3(%rcx), %r9	/* 8-byte move: the 5 bytes at %rcx plus the 3 bytes before it  */
+	mov	$5, %rsi	/* offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx (non-strcat definition)  */
+	mov	%r9, -3(%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl12):	/* %rcx must be 12 bytes past a 16-byte boundary: the movaps loads below fault on unaligned addresses.  */
+	movaps	-12(%rcx), %xmm1	/* aligned vector that holds the bytes at %rcx  */
+	movaps	4(%rcx), %xmm2	/* the next aligned vector  */
+L(Shl12Start):	/* Also entered from L(Shl12LoopStart) once it sees a zero byte.  */
+	pcmpeqb	%xmm2, %xmm0	/* flag zero bytes of %xmm2; presumes %xmm0 is all zero on entry - TODO confirm  */
+	pmovmskb %xmm0, %rax	/* one mask bit per flagged byte  */
+	movaps	%xmm2, %xmm3	/* keep a raw copy, palignr below overwrites %xmm2  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* %r8 appears to be the remaining length limit - confirm against the entry code  */
+	jbe	L(StrncpyExit12Case2OrCase3)	/* target is outside this section  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)	/* zero byte in the vector just checked  */
+
+	palignr	$12, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes that start at %rcx  */
+	movaps	%xmm2, (%rdx)	/* aligned store: %rdx has to be 16-byte aligned  */
+	movaps	20(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 take turns holding the previous raw vector  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)	/* fourth 16-byte store of the unrolled prologue  */
+	lea	20(%rcx), %rcx	/* skip the block just copied plus the 4-byte bias: %rcx is 16-byte aligned now  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source pointer down to a 64-byte boundary  */
+	sub	%rcx, %rax	/* %rax = how far the rounding stepped back  */
+	lea	-4(%rcx), %rcx	/* re-apply the bias expected by the loop's 4/20/36/52 load offsets  */
+	sub	%rax, %rdx	/* step the destination back equally; those bytes get stored again  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and hand the same amount back to the count  */
+# endif
+	movaps	-12(%rcx), %xmm1	/* raw vector preceding the loop's first load  */
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl12LoopStart):	/* Each pass checks four aligned source vectors (64 bytes) and, if none holds a zero byte, stores 64 bytes.  */
+	movaps	4(%rcx), %xmm2
+	movaps	20(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* byte-wise minimum of all four vectors: a lane is 0 iff some vector has a 0 there  */
+	pcmpeqb	%xmm0, %xmm7	/* compared against %xmm0, presumably all zero here - TODO confirm  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* raw copy of the last vector, it becomes %xmm1 for the next pass  */
+	palignr	$12, %xmm4, %xmm5	/* palignr does not touch the flags, so it may sit around the test  */
+	test	%rax, %rax
+	palignr	$12, %xmm3, %xmm4
+	jnz	L(Shl12Start)	/* zero byte somewhere in these 64: redo them 16 at a time, %xmm1/%xmm2 are still unshifted  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave12)	/* target is outside this section  */
+# endif
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$12, %xmm1, %xmm2	/* %xmm1 is the raw vector that precedes 4(%rcx)  */
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):	/* A zero byte was flagged in the vector at 4(%rcx); first copy the 4 bytes in front of it.  */
+	mov	(%rcx), %r9d	/* exactly those 4 bytes  */
+	mov	$4, %rsi	/* offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx (non-strcat definition)  */
+	mov	%r9d, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl13):	/* %rcx must be 13 bytes past a 16-byte boundary: the movaps loads below fault on unaligned addresses.  */
+	movaps	-13(%rcx), %xmm1	/* aligned vector that holds the bytes at %rcx  */
+	movaps	3(%rcx), %xmm2	/* the next aligned vector  */
+L(Shl13Start):	/* Also entered from L(Shl13LoopStart) once it sees a zero byte.  */
+	pcmpeqb	%xmm2, %xmm0	/* flag zero bytes of %xmm2; presumes %xmm0 is all zero on entry - TODO confirm  */
+	pmovmskb %xmm0, %rax	/* one mask bit per flagged byte  */
+	movaps	%xmm2, %xmm3	/* keep a raw copy, palignr below overwrites %xmm2  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* %r8 appears to be the remaining length limit - confirm against the entry code  */
+	jbe	L(StrncpyExit13Case2OrCase3)	/* target is outside this section  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl13LoopExit)	/* zero byte in the vector just checked  */
+
+	palignr	$13, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes that start at %rcx  */
+	movaps	%xmm2, (%rdx)	/* aligned store: %rdx has to be 16-byte aligned  */
+	movaps	19(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 take turns holding the previous raw vector  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	19(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	19(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)	/* fourth 16-byte store of the unrolled prologue  */
+	lea	19(%rcx), %rcx	/* skip the block just copied plus the 3-byte bias: %rcx is 16-byte aligned now  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source pointer down to a 64-byte boundary  */
+	sub	%rcx, %rax	/* %rax = how far the rounding stepped back  */
+	lea	-3(%rcx), %rcx	/* re-apply the bias expected by the loop's 3/19/35/51 load offsets  */
+	sub	%rax, %rdx	/* step the destination back equally; those bytes get stored again  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and hand the same amount back to the count  */
+# endif
+	movaps	-13(%rcx), %xmm1	/* raw vector preceding the loop's first load  */
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl13LoopStart):	/* Each pass checks four aligned source vectors (64 bytes) and, if none holds a zero byte, stores 64 bytes.  */
+	movaps	3(%rcx), %xmm2
+	movaps	19(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	35(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	51(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* byte-wise minimum of all four vectors: a lane is 0 iff some vector has a 0 there  */
+	pcmpeqb	%xmm0, %xmm7	/* compared against %xmm0, presumably all zero here - TODO confirm  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* raw copy of the last vector, it becomes %xmm1 for the next pass  */
+	palignr	$13, %xmm4, %xmm5	/* palignr does not touch the flags, so it may sit around the test  */
+	test	%rax, %rax
+	palignr	$13, %xmm3, %xmm4
+	jnz	L(Shl13Start)	/* zero byte somewhere in these 64: redo them 16 at a time, %xmm1/%xmm2 are still unshifted  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave13)	/* target is outside this section  */
+# endif
+	palignr	$13, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$13, %xmm1, %xmm2	/* %xmm1 is the raw vector that precedes 3(%rcx)  */
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl13LoopStart)
+
+L(Shl13LoopExit):	/* A zero byte was flagged in the vector at 3(%rcx); first copy the 3 bytes in front of it.  */
+	mov	-1(%rcx), %r9d	/* 4-byte move: the 3 bytes at %rcx plus the 1 byte before it  */
+	mov	$3, %rsi	/* offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx (non-strcat definition)  */
+	mov	%r9d, -1(%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl14):	/* %rcx must be 14 bytes past a 16-byte boundary: the movaps loads below fault on unaligned addresses.  */
+	movaps	-14(%rcx), %xmm1	/* aligned vector that holds the bytes at %rcx  */
+	movaps	2(%rcx), %xmm2	/* the next aligned vector  */
+L(Shl14Start):	/* Also entered from L(Shl14LoopStart) once it sees a zero byte.  */
+	pcmpeqb	%xmm2, %xmm0	/* flag zero bytes of %xmm2; presumes %xmm0 is all zero on entry - TODO confirm  */
+	pmovmskb %xmm0, %rax	/* one mask bit per flagged byte  */
+	movaps	%xmm2, %xmm3	/* keep a raw copy, palignr below overwrites %xmm2  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* %r8 appears to be the remaining length limit - confirm against the entry code  */
+	jbe	L(StrncpyExit14Case2OrCase3)	/* target is outside this section  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl14LoopExit)	/* zero byte in the vector just checked  */
+
+	palignr	$14, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes that start at %rcx  */
+	movaps	%xmm2, (%rdx)	/* aligned store: %rdx has to be 16-byte aligned  */
+	movaps	18(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 take turns holding the previous raw vector  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	18(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	18(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)	/* fourth 16-byte store of the unrolled prologue  */
+	lea	18(%rcx), %rcx	/* skip the block just copied plus the 2-byte bias: %rcx is 16-byte aligned now  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source pointer down to a 64-byte boundary  */
+	sub	%rcx, %rax	/* %rax = how far the rounding stepped back  */
+	lea	-2(%rcx), %rcx	/* re-apply the bias expected by the loop's 2/18/34/50 load offsets  */
+	sub	%rax, %rdx	/* step the destination back equally; those bytes get stored again  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and hand the same amount back to the count  */
+# endif
+	movaps	-14(%rcx), %xmm1	/* raw vector preceding the loop's first load  */
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl14LoopStart):	/* Each pass checks four aligned source vectors (64 bytes) and, if none holds a zero byte, stores 64 bytes.  */
+	movaps	2(%rcx), %xmm2
+	movaps	18(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	34(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	50(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* byte-wise minimum of all four vectors: a lane is 0 iff some vector has a 0 there  */
+	pcmpeqb	%xmm0, %xmm7	/* compared against %xmm0, presumably all zero here - TODO confirm  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* raw copy of the last vector, it becomes %xmm1 for the next pass  */
+	palignr	$14, %xmm4, %xmm5	/* palignr does not touch the flags, so it may sit around the test  */
+	test	%rax, %rax
+	palignr	$14, %xmm3, %xmm4
+	jnz	L(Shl14Start)	/* zero byte somewhere in these 64: redo them 16 at a time, %xmm1/%xmm2 are still unshifted  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave14)	/* target is outside this section  */
+# endif
+	palignr	$14, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$14, %xmm1, %xmm2	/* %xmm1 is the raw vector that precedes 2(%rcx)  */
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl14LoopStart)
+
+L(Shl14LoopExit):	/* A zero byte was flagged in the vector at 2(%rcx); first copy the 2 bytes in front of it.  */
+	mov	-2(%rcx), %r9d	/* 4-byte move: the 2 bytes at %rcx plus the 2 bytes before it  */
+	mov	$2, %rsi	/* offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx (non-strcat definition)  */
+	mov	%r9d, -2(%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl15):	/* %rcx must be 15 bytes past a 16-byte boundary: the movaps loads below fault on unaligned addresses.  */
+	movaps	-15(%rcx), %xmm1	/* aligned vector that holds the byte at %rcx  */
+	movaps	1(%rcx), %xmm2	/* the next aligned vector  */
+L(Shl15Start):	/* Also entered from L(Shl15LoopStart) once it sees a zero byte.  */
+	pcmpeqb	%xmm2, %xmm0	/* flag zero bytes of %xmm2; presumes %xmm0 is all zero on entry - TODO confirm  */
+	pmovmskb %xmm0, %rax	/* one mask bit per flagged byte  */
+	movaps	%xmm2, %xmm3	/* keep a raw copy, palignr below overwrites %xmm2  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* %r8 appears to be the remaining length limit - confirm against the entry code  */
+	jbe	L(StrncpyExit15Case2OrCase3)	/* target is outside this section  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl15LoopExit)	/* zero byte in the vector just checked  */
+
+	palignr	$15, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes that start at %rcx  */
+	movaps	%xmm2, (%rdx)	/* aligned store: %rdx has to be 16-byte aligned  */
+	movaps	17(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 take turns holding the previous raw vector  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	17(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	17(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)	/* fourth 16-byte store of the unrolled prologue  */
+	lea	17(%rcx), %rcx	/* skip the block just copied plus the 1-byte bias: %rcx is 16-byte aligned now  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source pointer down to a 64-byte boundary  */
+	sub	%rcx, %rax	/* %rax = how far the rounding stepped back  */
+	lea	-1(%rcx), %rcx	/* re-apply the bias expected by the loop's 1/17/33/49 load offsets  */
+	sub	%rax, %rdx	/* step the destination back equally; those bytes get stored again  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and hand the same amount back to the count  */
+# endif
+	movaps	-15(%rcx), %xmm1	/* raw vector preceding the loop's first load  */
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl15LoopStart):	/* Each pass checks four aligned source vectors (64 bytes) and, if none holds a zero byte, stores 64 bytes.  */
+	movaps	1(%rcx), %xmm2
+	movaps	17(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	33(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	49(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* byte-wise minimum of all four vectors: a lane is 0 iff some vector has a 0 there  */
+	pcmpeqb	%xmm0, %xmm7	/* compared against %xmm0, presumably all zero here - TODO confirm  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* raw copy of the last vector, it becomes %xmm1 for the next pass  */
+	palignr	$15, %xmm4, %xmm5	/* palignr does not touch the flags, so it may sit around the test  */
+	test	%rax, %rax
+	palignr	$15, %xmm3, %xmm4
+	jnz	L(Shl15Start)	/* zero byte somewhere in these 64: redo them 16 at a time, %xmm1/%xmm2 are still unshifted  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave15)	/* target is outside this section  */
+# endif
+	palignr	$15, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$15, %xmm1, %xmm2	/* %xmm1 is the raw vector that precedes 1(%rcx)  */
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl15LoopStart)
+
+L(Shl15LoopExit):	/* A zero byte was flagged in the vector at 1(%rcx); first copy the single byte in front of it.  */
+	mov	-3(%rcx), %r9d	/* 4-byte move: the byte at %rcx plus the 3 bytes before it  */
+	mov	$1, %rsi	/* offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx (non-strcat definition)  */
+	mov	%r9d, -3(%rdx)
+# ifdef USE_AS_STRCAT
+	jmp	L(CopyFrom1To16Bytes)	/* without USE_AS_STRCAT execution simply falls into the L(CopyFrom1To16Bytes) defined next  */
+# endif
+
+# ifndef USE_AS_STRCAT
+
+	.p2align 4
+L(CopyFrom1To16Bytes):	/* In: %rax = zero-byte mask of one 16-byte vector, %rsi = offset of that vector from %rcx and %rdx (as set by the Shl*LoopExit stubs).  */
+#  ifdef USE_AS_STRNCPY
+	add	$16, %r8	/* the Shl*Start code subtracted 16 from %r8 before branching towards here  */
+#  endif
+	add	%rsi, %rdx
+	add	%rsi, %rcx
+
+	test	%al, %al
+	jz	L(ExitHigh)	/* nothing flagged in bytes 0-7: decide on mask bits 8-15  */
+	test	$0x01, %al	/* the lowest set bit k of %al selects L(Exit<k+1>)  */
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)	/* otherwise only bit 7 is left: fall through to L(Exit8) below  */
+
+	.p2align 4
+L(Exit8):	/* Copy 8 bytes from (%rcx) to (%rdx), set the return value and finish.  */
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	7(%rdx), %rax	/* return the address of the last byte stored  */
+#  else
+	mov	%rdi, %rax	/* return %rdi, presumably the caller's destination pointer - confirm against the entry code  */
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$8, %r8	/* count left after these 8 bytes  */
+	lea	8(%rdx), %rcx	/* first address after the stored bytes; lea keeps the flags of the sub  */
+	jnz	L(StrncpyFillTailWithZero1)	/* count not used up yet (target is outside this section)  */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)	/* carry is set only if the last stored byte is 0  */
+	sbb	$-1, %rax	/* %rax + 1 - carry: step past the last byte unless it is the terminator  */
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+L(ExitHigh):	/* Entered from L(CopyFrom1To16Bytes) when %al is zero: dispatch on mask bits 8-15 held in %ah.  */
+	test	$0x01, %ah	/* the lowest set bit k of %ah selects L(Exit<k+9>)  */
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)	/* none of bits 8-14 set: fall through to L(Exit16) below  */
+
+	.p2align 4
+L(Exit16):	/* Copy 16 bytes from (%rcx) to (%rdx) as two 8-byte moves, set the return value and finish.  */
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %rax
+	mov	%rax, 8(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	15(%rdx), %rax	/* return the address of the last byte stored  */
+#  else
+	mov	%rdi, %rax	/* return %rdi, presumably the caller's destination pointer - confirm against the entry code  */
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* count left after these 16 bytes  */
+	lea	16(%rdx), %rcx	/* first address after the stored bytes; lea keeps the flags of the sub  */
+	jnz	L(StrncpyFillTailWithZero1)	/* count not used up yet (target is outside this section)  */
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)	/* carry is set only if the last stored byte is 0  */
+	sbb	$-1, %rax	/* %rax + 1 - carry: step past the last byte unless it is the terminator  */
+#   endif
+#  endif
+	ret
+
+#  ifdef USE_AS_STRNCPY
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):	/* Reached from L(CopyFrom1To16BytesCase2OrCase3) with a non-zero zero-byte mask in %rax: stop at the count or at the zero byte, whichever is first.  */
+	add	$16, %r8
+	add	%rsi, %rcx
+	lea	(%rsi, %rdx), %rsi	/* park the adjusted destination in %rsi, %rdx is scratch for the next few lines  */
+	lea	-9(%r8), %rdx	/* negative exactly when %r8 < 9  */
+	and	$1<<7, %dh	/* keep bit 15 of %r8 - 9; set for small negative values (presumably %r8 <= 16 here - confirm)  */
+	or	%al, %dh	/* merge in the mask bits of bytes 0-7  */
+	test	%dh, %dh
+	lea	(%rsi), %rdx	/* restore the destination; lea leaves the flags of the test intact  */
+	jz	L(ExitHighCase2)	/* no zero in bytes 0-7 and that sign bit clear  */
+
+	cmp	$1, %r8	/* per byte k: leave through L(Exit<k>) if the count equals k or mask bit k-1 is set  */
+	je	L(Exit1)
+	test	$0x01, %al
+	jnz	L(Exit1)
+	cmp	$2, %r8
+	je	L(Exit2)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$3, %r8
+	je	L(Exit3)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$4, %r8
+	je	L(Exit4)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$5, %r8
+	je	L(Exit5)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$6, %r8
+	je	L(Exit6)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$7, %r8
+	je	L(Exit7)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	jmp	L(Exit8)
+
+	.p2align 4
+L(ExitHighCase2):	/* Entered from L(CopyFrom1To16BytesCase2) when bytes 0-7 decide nothing: choose among L(Exit9)..L(Exit16).  */
+	cmp	$9, %r8	/* per byte k: leave through L(Exit<k>) if the count equals k or mask bit k-1 (in %ah) is set  */
+	je	L(Exit9)
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	cmp	$10, %r8
+	je	L(Exit10)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$11, %r8
+	je	L(Exit11)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$12, %r8
+	je	L(Exit12)
+	test	$0x8, %ah
+	jnz	L(Exit12)
+	cmp	$13, %r8
+	je	L(Exit13)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$14, %r8
+	je	L(Exit14)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$15, %r8
+	je	L(Exit15)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	jmp	L(Exit16)
+
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase3):
+	add	$16, %r8
+	add	%rsi, %rdx
+	add	%rsi, %rcx
+
+	cmp	$16, %r8
+	je	L(Exit16)
+	cmp	$8, %r8
+	je	L(Exit8)
+	jg	L(More8Case3)
+	cmp	$4, %r8
+	je	L(Exit4)
+	jg	L(More4Case3)
+	cmp	$2, %r8
+	jl	L(Exit1)
+	je	L(Exit2)
+	jg	L(Exit3)
+L(More8Case3): /* but less than 16 */
+	cmp	$12, %r8
+	je	L(Exit12)
+	jl	L(Less12Case3)
+	cmp	$14, %r8
+	jl	L(Exit13)
+	je	L(Exit14)
+	jg	L(Exit15)
+L(More4Case3): /* but less than 8 */
+	cmp	$6, %r8
+	jl	L(Exit5)
+	je	L(Exit6)
+	jg	L(Exit7)
+L(Less12Case3): /* but more than 8 */
+	cmp	$10, %r8
+	jl	L(Exit9)
+	je	L(Exit10)
+	jg	L(Exit11)
+#  endif
+
+	.p2align 4
+L(Exit1):
+	movb	(%rcx), %al
+	movb	%al, (%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$1, %r8
+	lea	1(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+L(Exit2):
+	movw	(%rcx), %ax
+	movw	%ax, (%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	1(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$2, %r8
+	lea	2(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+L(Exit3):
+	movw	(%rcx), %ax
+	movw	%ax, (%rdx)
+	movb	2(%rcx), %al
+	movb	%al, 2(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	2(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$3, %r8
+	lea	3(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+L(Exit4):
+	movl	(%rcx), %eax
+	movl	%eax, (%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	3(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$4, %r8
+	lea	4(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+L(Exit5):
+	movl	(%rcx), %eax
+	movl	%eax, (%rdx)
+	movb	4(%rcx), %al
+	movb	%al, 4(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	4(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$5, %r8
+	lea	5(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#  endif
+#  endif
+	ret
+
+	.p2align 4
+L(Exit6):
+	movl	(%rcx), %eax
+	movl	%eax, (%rdx)
+	movw	4(%rcx), %ax
+	movw	%ax, 4(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	5(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$6, %r8
+	lea	6(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#  endif
+#  endif
+	ret
+
+	.p2align 4
+L(Exit7):
+	movl	(%rcx), %eax
+	movl	%eax, (%rdx)
+	movl	3(%rcx), %eax
+	movl	%eax, 3(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	6(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$7, %r8
+	lea	7(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+L(Exit9):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	5(%rcx), %eax
+	mov	%eax, 5(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	8(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$9, %r8
+	lea	9(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+L(Exit10):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	6(%rcx), %eax
+	mov	%eax, 6(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	9(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$10, %r8
+	lea	10(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+L(Exit11):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	7(%rcx), %eax
+	mov	%eax, 7(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	10(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$11, %r8
+	lea	11(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+L(Exit12):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %eax
+	mov	%eax, 8(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	11(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$12, %r8
+	lea	12(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#  endif
+#  endif
+	ret
+
+	.p2align 4
+L(Exit13):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	5(%rcx), %rax
+	mov	%rax, 5(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	12(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$13, %r8
+	lea	13(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+L(Exit14):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	6(%rcx), %rax
+	mov	%rax, 6(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	13(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$14, %r8
+	lea	14(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+L(Exit15):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	7(%rcx), %rax
+	mov	%rax, 7(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	14(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$15, %r8
+	lea	15(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+#  ifdef USE_AS_STRNCPY
+	.p2align 4
+L(Fill0):
+	ret
+
+	.p2align 4
+L(Fill1):
+	movb	%dl, (%rcx)
+	ret
+
+	.p2align 4
+L(Fill2):
+	movw	%dx, (%rcx)
+	ret
+
+	.p2align 4
+L(Fill3):
+	movw	%dx, (%rcx)
+	movb	%dl, 2(%rcx)
+	ret
+
+	.p2align 4
+L(Fill4):
+	movl	%edx, (%rcx)
+	ret
+
+	.p2align 4
+L(Fill5):
+	movl	%edx, (%rcx)
+	movb	%dl, 4(%rcx)
+	ret
+
+	.p2align 4
+L(Fill6):
+	movl	%edx, (%rcx)
+	movw	%dx, 4(%rcx)
+	ret
+
+	.p2align 4
+L(Fill7):
+	movl	%edx, (%rcx)
+	movl	%edx, 3(%rcx)
+	ret
+
+	.p2align 4
+L(Fill8):
+	mov	%rdx, (%rcx)
+	ret
+
+	.p2align 4
+L(Fill9):
+	mov	%rdx, (%rcx)
+	movb	%dl, 8(%rcx)
+	ret
+
+	.p2align 4
+L(Fill10):
+	mov	%rdx, (%rcx)
+	movw	%dx, 8(%rcx)
+	ret
+
+	.p2align 4
+L(Fill11):
+	mov	%rdx, (%rcx)
+	movl	%edx, 7(%rcx)
+	ret
+
+	.p2align 4
+L(Fill12):
+	mov	%rdx, (%rcx)
+	movl	%edx, 8(%rcx)
+	ret
+
+	.p2align 4
+L(Fill13):
+	mov	%rdx, (%rcx)
+	mov	%rdx, 5(%rcx)
+	ret
+
+	.p2align 4
+L(Fill14):
+	mov	%rdx, (%rcx)
+	mov	%rdx, 6(%rcx)
+	ret
+
+	.p2align 4
+L(Fill15):
+	mov	%rdx, (%rcx)
+	mov	%rdx, 7(%rcx)
+	ret
+
+	.p2align 4
+L(Fill16):
+	mov	%rdx, (%rcx)
+	mov	%rdx, 8(%rcx)
+	ret
+
+	.p2align 4
+L(StrncpyFillExit1):
+	lea	16(%r8), %r8
+L(FillFrom1To16Bytes):
+	test	%r8, %r8
+	jz	L(Fill0)
+	cmp	$16, %r8
+	je	L(Fill16)
+	cmp	$8, %r8
+	je	L(Fill8)
+	jg	L(FillMore8)
+	cmp	$4, %r8
+	je	L(Fill4)
+	jg	L(FillMore4)
+	cmp	$2, %r8
+	jl	L(Fill1)
+	je	L(Fill2)
+	jg	L(Fill3)
+L(FillMore8): /* but less than 16 */
+	cmp	$12, %r8
+	je	L(Fill12)
+	jl	L(FillLess12)
+	cmp	$14, %r8
+	jl	L(Fill13)
+	je	L(Fill14)
+	jg	L(Fill15)
+L(FillMore4): /* but less than 8 */
+	cmp	$6, %r8
+	jl	L(Fill5)
+	je	L(Fill6)
+	jg	L(Fill7)
+L(FillLess12): /* but more than 8 */
+	cmp	$10, %r8
+	jl	L(Fill9)
+	je	L(Fill10)
+	jmp	L(Fill11)
+
+	.p2align 4
+L(StrncpyFillTailWithZero1):
+	xor	%rdx, %rdx
+	sub	$16, %r8
+	jbe	L(StrncpyFillExit1)
+
+	pxor	%xmm0, %xmm0
+	mov	%rdx, (%rcx)
+	mov	%rdx, 8(%rcx)
+
+	lea	16(%rcx), %rcx
+
+	mov	%rcx, %rdx
+	and	$0xf, %rdx
+	sub	%rdx, %rcx
+	add	%rdx, %r8
+	xor	%rdx, %rdx
+	sub	$64, %r8
+	jb	L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+	movdqa	%xmm0, (%rcx)
+	movdqa	%xmm0, 16(%rcx)
+	movdqa	%xmm0, 32(%rcx)
+	movdqa	%xmm0, 48(%rcx)
+	lea	64(%rcx), %rcx
+	sub	$64, %r8
+	jae	L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+	add	$32, %r8
+	jl	L(StrncpyFillLess32)
+	movdqa	%xmm0, (%rcx)
+	movdqa	%xmm0, 16(%rcx)
+	lea	32(%rcx), %rcx
+	sub	$16, %r8
+	jl	L(StrncpyFillExit1)
+	movdqa	%xmm0, (%rcx)
+	lea	16(%rcx), %rcx
+	jmp	L(FillFrom1To16Bytes)
+
+L(StrncpyFillLess32):
+	add	$16, %r8
+	jl	L(StrncpyFillExit1)
+	movdqa	%xmm0, (%rcx)
+	lea	16(%rcx), %rcx
+	jmp	L(FillFrom1To16Bytes)
+
+	.p2align 4
+L(Exit0):
+	mov	%rdx, %rax
+	ret
+
+	.p2align 4
+L(StrncpyExit15Bytes):
+	cmp	$9, %r8
+	je	L(Exit9)
+	cmpb	$0, 8(%rcx)
+	jz	L(Exit9)
+	cmp	$10, %r8
+	je	L(Exit10)
+	cmpb	$0, 9(%rcx)
+	jz	L(Exit10)
+	cmp	$11, %r8
+	je	L(Exit11)
+	cmpb	$0, 10(%rcx)
+	jz	L(Exit11)
+	cmp	$12, %r8
+	je	L(Exit12)
+	cmpb	$0, 11(%rcx)
+	jz	L(Exit12)
+	cmp	$13, %r8
+	je	L(Exit13)
+	cmpb	$0, 12(%rcx)
+	jz	L(Exit13)
+	cmp	$14, %r8
+	je	L(Exit14)
+	cmpb	$0, 13(%rcx)
+	jz	L(Exit14)
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	7(%rcx), %rax
+	mov	%rax, 7(%rdx)
+#   ifdef USE_AS_STPCPY
+	lea	14(%rdx), %rax
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   else
+	mov	%rdi, %rax
+#   endif
+	ret
+
+	.p2align 4
+L(StrncpyExit8Bytes):
+	cmp	$1, %r8
+	je	L(Exit1)
+	cmpb	$0, (%rcx)
+	jz	L(Exit1)
+	cmp	$2, %r8
+	je	L(Exit2)
+	cmpb	$0, 1(%rcx)
+	jz	L(Exit2)
+	cmp	$3, %r8
+	je	L(Exit3)
+	cmpb	$0, 2(%rcx)
+	jz	L(Exit3)
+	cmp	$4, %r8
+	je	L(Exit4)
+	cmpb	$0, 3(%rcx)
+	jz	L(Exit4)
+	cmp	$5, %r8
+	je	L(Exit5)
+	cmpb	$0, 4(%rcx)
+	jz	L(Exit5)
+	cmp	$6, %r8
+	je	L(Exit6)
+	cmpb	$0, 5(%rcx)
+	jz	L(Exit6)
+	cmp	$7, %r8
+	je	L(Exit7)
+	cmpb	$0, 6(%rcx)
+	jz	L(Exit7)
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+#   ifdef USE_AS_STPCPY
+	lea	7(%rdx), %rax
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   else
+	mov	%rdi, %rax
+#   endif
+	ret
+
+#  endif
+# endif
+
+# ifdef USE_AS_STRNCPY
+	.p2align 4
+L(StrncpyLeaveCase2OrCase3):
+	test	%rax, %rax
+	jnz	L(Aligned64LeaveCase2)
+
+L(Aligned64LeaveCase3):
+	lea	64(%r8), %r8
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm4, -64(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm5, -48(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm6, -32(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(Aligned64LeaveCase2):
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %rax
+	add	$48, %r8
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm4, -64(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm5, -48(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm6, -32(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+	jmp	L(CopyFrom1To16BytesCase2)
+/*--------------------------------------------------*/
+	.p2align 4
+L(StrncpyExit1Case2OrCase3):
+	movdqu	-1(%rcx), %xmm0
+	movdqu	%xmm0, -1(%rdx)
+	mov	$15, %rsi
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit2Case2OrCase3):
+	movdqu	-2(%rcx), %xmm0
+	movdqu	%xmm0, -2(%rdx)
+	mov	$14, %rsi
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit3Case2OrCase3):
+	movdqu	-3(%rcx), %xmm0
+	movdqu	%xmm0, -3(%rdx)
+	mov	$13, %rsi
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit4Case2OrCase3):
+	movdqu	-4(%rcx), %xmm0
+	movdqu	%xmm0, -4(%rdx)
+	mov	$12, %rsi
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit5Case2OrCase3):
+	movdqu	-5(%rcx), %xmm0
+	movdqu	%xmm0, -5(%rdx)
+	mov	$11, %rsi
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit6Case2OrCase3):
+	mov	(%rcx), %rsi
+	mov	6(%rcx), %r9d
+	mov	%r9d, 6(%rdx)
+	mov	%rsi, (%rdx)
+	test	%rax, %rax
+	mov	$10, %rsi
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit7Case2OrCase3):
+	mov	(%rcx), %rsi
+	mov	5(%rcx), %r9d
+	mov	%r9d, 5(%rdx)
+	mov	%rsi, (%rdx)
+	test	%rax, %rax
+	mov	$9, %rsi
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit8Case2OrCase3):
+	mov	(%rcx), %r9
+	mov	$8, %rsi
+	mov	%r9, (%rdx)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit9Case2OrCase3):
+	mov	-1(%rcx), %r9
+	mov	$7, %rsi
+	mov	%r9, -1(%rdx)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit10Case2OrCase3):
+	mov	-2(%rcx), %r9
+	mov	$6, %rsi
+	mov	%r9, -2(%rdx)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit11Case2OrCase3):
+	mov	-3(%rcx), %r9
+	mov	$5, %rsi
+	mov	%r9, -3(%rdx)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit12Case2OrCase3):
+	mov	(%rcx), %r9d
+	mov	$4, %rsi
+	mov	%r9d, (%rdx)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit13Case2OrCase3):
+	mov	-1(%rcx), %r9d
+	mov	$3, %rsi
+	mov	%r9d, -1(%rdx)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit14Case2OrCase3):
+	mov	-2(%rcx), %r9d
+	mov	$2, %rsi
+	mov	%r9d, -2(%rdx)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyExit15Case2OrCase3):
+	mov	-3(%rcx), %r9d
+	mov	$1, %rsi
+	mov	%r9d, -3(%rdx)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave1):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit1)
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	31(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit1)
+	palignr	$1, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit1)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit1)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+L(StrncpyExit1):
+	lea	15(%rdx, %rsi), %rdx
+	lea	15(%rcx, %rsi), %rcx
+	mov	-15(%rcx), %rsi
+	mov	-8(%rcx), %rax
+	mov	%rsi, -15(%rdx)
+	mov	%rax, -8(%rdx)
+	xor	%rsi, %rsi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave2):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit2)
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	30(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit2)
+	palignr	$2, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit2)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit2)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+L(StrncpyExit2):
+	lea	14(%rdx, %rsi), %rdx
+	lea	14(%rcx, %rsi), %rcx
+	mov	-14(%rcx), %rsi
+	mov	-8(%rcx), %rax
+	mov	%rsi, -14(%rdx)
+	mov	%rax, -8(%rdx)
+	xor	%rsi, %rsi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave3):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit3)
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	29(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit3)
+	palignr	$3, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit3)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit3)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+L(StrncpyExit3):
+	lea	13(%rdx, %rsi), %rdx
+	lea	13(%rcx, %rsi), %rcx
+	mov	-13(%rcx), %rsi
+	mov	-8(%rcx), %rax
+	mov	%rsi, -13(%rdx)
+	mov	%rax, -8(%rdx)
+	xor	%rsi, %rsi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave4):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit4)
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit4)
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit4)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit4)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+L(StrncpyExit4):
+	lea	12(%rdx, %rsi), %rdx
+	lea	12(%rcx, %rsi), %rcx
+	mov	-12(%rcx), %rsi
+	mov	-4(%rcx), %eax
+	mov	%rsi, -12(%rdx)
+	mov	%eax, -4(%rdx)
+	xor	%rsi, %rsi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave5):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit5)
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	27(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit5)
+	palignr	$5, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit5)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit5)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+L(StrncpyExit5):
+	lea	11(%rdx, %rsi), %rdx
+	lea	11(%rcx, %rsi), %rcx
+	mov	-11(%rcx), %rsi
+	mov	-4(%rcx), %eax
+	mov	%rsi, -11(%rdx)
+	mov	%eax, -4(%rdx)
+	xor	%rsi, %rsi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave6):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit6)
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	26(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit6)
+	palignr	$6, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit6)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit6)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+L(StrncpyExit6):
+	lea	10(%rdx, %rsi), %rdx
+	lea	10(%rcx, %rsi), %rcx
+	mov	-10(%rcx), %rsi
+	movw	-2(%rcx), %ax
+	mov	%rsi, -10(%rdx)
+	movw	%ax, -2(%rdx)
+	xor	%rsi, %rsi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave7):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit7)
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	25(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit7)
+	palignr	$7, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit7)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit7)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+L(StrncpyExit7):
+	lea	9(%rdx, %rsi), %rdx
+	lea	9(%rcx, %rsi), %rcx
+	mov	-9(%rcx), %rsi
+	movb	-1(%rcx), %ah
+	mov	%rsi, -9(%rdx)
+	movb	%ah, -1(%rdx)
+	xor	%rsi, %rsi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave8):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit8)
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit8)
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit8)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit8)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+L(StrncpyExit8):
+	lea	8(%rdx, %rsi), %rdx
+	lea	8(%rcx, %rsi), %rcx
+	mov	-8(%rcx), %rax
+	xor	%rsi, %rsi
+	mov	%rax, -8(%rdx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave9):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit9)
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	23(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit9)
+	palignr	$9, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit9)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit9)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+L(StrncpyExit9):
+	lea	7(%rdx, %rsi), %rdx
+	lea	7(%rcx, %rsi), %rcx
+	mov	-8(%rcx), %rax
+	xor	%rsi, %rsi
+	mov	%rax, -8(%rdx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave10):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit10)
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	22(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit10)
+	palignr	$10, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit10)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit10)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+L(StrncpyExit10):
+	lea	6(%rdx, %rsi), %rdx
+	lea	6(%rcx, %rsi), %rcx
+	mov	-8(%rcx), %rax
+	xor	%rsi, %rsi
+	mov	%rax, -8(%rdx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave11):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit11)
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	21(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit11)
+	palignr	$11, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit11)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit11)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+L(StrncpyExit11):
+	lea	5(%rdx, %rsi), %rdx
+	lea	5(%rcx, %rsi), %rcx
+	mov	-8(%rcx), %rax
+	xor	%rsi, %rsi
+	mov	%rax, -8(%rdx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave12):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit12)
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit12)
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit12)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit12)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+L(StrncpyExit12):
+	lea	4(%rdx, %rsi), %rdx
+	lea	4(%rcx, %rsi), %rcx
+	mov	-4(%rcx), %eax
+	xor	%rsi, %rsi
+	mov	%eax, -4(%rdx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave13):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit13)
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	19(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit13)
+	palignr	$13, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit13)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit13)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+L(StrncpyExit13):
+	lea	3(%rdx, %rsi), %rdx
+	lea	3(%rcx, %rsi), %rcx
+	mov	-4(%rcx), %eax
+	xor	%rsi, %rsi
+	mov	%eax, -4(%rdx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave14):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit14)
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	18(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit14)
+	palignr	$14, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit14)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit14)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+L(StrncpyExit14):
+	lea	2(%rdx, %rsi), %rdx
+	lea	2(%rcx, %rsi), %rcx
+	movw	-2(%rcx), %ax
+	xor	%rsi, %rsi
+	movw	%ax, -2(%rdx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave15):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit15)
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	17(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit15)
+	palignr	$15, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit15)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit15)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+L(StrncpyExit15):
+	lea	1(%rdx, %rsi), %rdx
+	lea	1(%rcx, %rsi), %rcx
+	movb	-1(%rcx), %ah
+	xor	%rsi, %rsi
+	movb	%ah, -1(%rdx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+# endif
+# ifndef USE_AS_STRCAT
+END (STRCPY)
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy.S
new file mode 100644
index 0000000000..77819ddc50
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy.S
@@ -0,0 +1,99 @@
+/* Multiple versions of strcpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY)
+# ifndef STRCPY	/* Plain strcpy build: default the public name.  */
+#  define STRCPY strcpy
+# endif
+#endif /* !USE_AS_STPCPY && !USE_AS_STRNCPY */
+/* Map the generic STRCPY_* and __GI_* names onto the function being built.  */
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY	/* stpncpy */
+#  define STRCPY_SSSE3		__stpncpy_ssse3
+#  define STRCPY_SSE2		__stpncpy_sse2
+#  define STRCPY_SSE2_UNALIGNED __stpncpy_sse2_unaligned
+#  define __GI_STRCPY		__GI_stpncpy
+#  define __GI___STRCPY		__GI___stpncpy
+# else	/* stpcpy */
+#  define STRCPY_SSSE3		__stpcpy_ssse3
+#  define STRCPY_SSE2		__stpcpy_sse2
+#  define STRCPY_SSE2_UNALIGNED	__stpcpy_sse2_unaligned
+#  define __GI_STRCPY		__GI_stpcpy
+#  define __GI___STRCPY		__GI___stpcpy
+# endif /* USE_AS_STRNCPY */
+#else /* !USE_AS_STPCPY */
+# ifdef USE_AS_STRNCPY	/* strncpy */
+#  define STRCPY_SSSE3		__strncpy_ssse3
+#  define STRCPY_SSE2		__strncpy_sse2
+#  define STRCPY_SSE2_UNALIGNED	__strncpy_sse2_unaligned
+#  define __GI_STRCPY		__GI_strncpy
+# else	/* strcpy */
+#  define STRCPY_SSSE3		__strcpy_ssse3
+#  define STRCPY_SSE2		__strcpy_sse2
+#  define STRCPY_SSE2_UNALIGNED	__strcpy_sse2_unaligned
+#  define __GI_STRCPY		__GI_strcpy
+# endif /* USE_AS_STRNCPY */
+#endif /* USE_AS_STPCPY */
+
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(STRCPY)	/* IFUNC resolver: returns the selected variant in %rax.  */
+	.type	STRCPY, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX	/* Presumably sets up %rdx for the tests below; see init-arch.h.  */
+	leaq	STRCPY_SSE2_UNALIGNED(%rip), %rax	/* First candidate.  */
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f	/* Test non-zero: keep the unaligned SSE2 variant.  */
+	leaq	STRCPY_SSE2(%rip), %rax	/* Otherwise start from plain SSE2.  */
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f	/* Test zero: stay with SSE2.  */
+	leaq	STRCPY_SSSE3(%rip), %rax	/* Test non-zero: use SSSE3.  */
+2:	ret
+END(STRCPY)
+
+# undef ENTRY	/* From here on ENTRY ignores NAME and opens STRCPY_SSE2.  */
+# define ENTRY(name) \
+	.type STRCPY_SSE2, @function; \
+	.align 16; \
+	.globl STRCPY_SSE2; \
+	.hidden STRCPY_SSE2; \
+	STRCPY_SSE2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* END likewise closes STRCPY_SSE2 whatever NAME is.  */
+# define END(name) \
+	cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcpy calls through a PLT.
+   The speedup we get from using SSSE3 instructions is likely eaten away by
+   the indirect call in the PLT, so bind __GI_STRCPY straight to SSE2.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2
+# undef libc_hidden_def	/* Same direct binding for __GI___STRCPY.  */
+# define libc_hidden_def(name) \
+	.globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2
+#endif
+
+#ifndef USE_AS_STRNCPY
+#include "../strcpy.S"
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcspn-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strcspn-c.c
new file mode 100644
index 0000000000..67991b5ca7
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcspn-c.c
@@ -0,0 +1,173 @@
+/* strcspn with SSE4.2 intrinsics
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <nmmintrin.h>
+#include <string.h>
+#include "varshift.h"
+
+/* We use 0x2:
+	_SIDD_SBYTE_OPS
+	| _SIDD_CMP_EQUAL_ANY
+	| _SIDD_POSITIVE_POLARITY
+	| _SIDD_LEAST_SIGNIFICANT
+   on pcmpistri to compare xmm/mem128
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   X X X X X X X X X X X X X X X X
+
+   against xmm
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   A A A A A A A A A A A A A A A A
+
+   to find out if the first 16byte data element has any byte A and
+   the offset of the first byte.  There are 3 cases:
+
+   1. The first 16byte data element has the byte A at the offset X.
+   2. The first 16byte data element has EOS and doesn't have the byte A.
+   3. The first 16byte data element is valid and doesn't have the byte A.
+
+   Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases:
+
+    1		 X	  1	 0/1	  0
+    2		16	  0	  1	  0
+    3		16	  0	  0	  0
+
+   We exit from the loop for cases 1 and 2 with jbe which branches
+   when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
+   X for case 1.  */
+
+#ifndef STRCSPN_SSE2	/* Default to the strcspn names unless already set.  */
+# define STRCSPN_SSE2 __strcspn_sse2
+# define STRCSPN_SSE42 __strcspn_sse42
+#endif
+/* RETURN (ptr, len) returns PTR when building strpbrk, LEN otherwise.  */
+#ifdef USE_AS_STRPBRK
+# define RETURN(val1, val2) return val1
+#else
+# define RETURN(val1, val2) return val2
+#endif /* USE_AS_STRPBRK */
+/* Fallback implementation, called below when A is longer than 16 bytes.  */
+extern
+#ifdef USE_AS_STRPBRK
+char *
+#else
+size_t
+#endif
+STRCSPN_SSE2 (const char *, const char *);
+
+/* SSE4.2 strcspn/strpbrk: locate the first byte of S that occurs in A.  */
+#ifdef USE_AS_STRPBRK
+char *
+#else
+size_t
+#endif
+__attribute__ ((section (".text.sse4.2")))
+STRCSPN_SSE42 (const char *s, const char *a)
+{
+  if (*a == 0)
+    RETURN (NULL, strlen (s));
+  /* Build MASK, the set of bytes to search for, from A.  */
+  const char *aligned;
+  __m128i mask;
+  int offset = (int) ((size_t) a & 15);
+  if (offset != 0)
+    {
+      /* A is unaligned: load the aligned 16 bytes containing its start.  */
+      aligned = (const char *) ((size_t) a & -16L);
+      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
+
+      mask = __m128i_shift_right (mask0, offset);
+
+      /* Find where the NUL terminator is.  */
+      int length = _mm_cmpistri (mask, mask, 0x3a);
+      if (length == 16 - offset)
+	{
+	  /* No NUL terminator yet: examine the next 16 bytes of A.  */
+	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
+	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
+	  length += index;
+
+	  /* Don't use SSE4.2 if the length of A > 16; defer to STRCSPN_SSE2.  */
+	  if (length > 16)
+	    return STRCSPN_SSE2 (s, a);
+
+	  if (index != 0)
+	    {
+	      /* Combine mask0 and mask1.  We could play games with
+		 palignr, but frankly this data should be in L1 now
+		 so do the merge via an unaligned load.  */
+	      mask = _mm_loadu_si128 ((__m128i *) a);
+	    }
+	}
+    }
+  else
+    {
+      /* A is aligned.  */
+      mask = _mm_load_si128 ((__m128i *) a);
+
+      /* Find where the NUL terminator is.  */
+      int length = _mm_cmpistri (mask, mask, 0x3a);
+      if (length == 16)
+	{
+	  /* There is no NUL terminator.  Don't use SSE4.2 if the length
+	     of A > 16.  */
+	  if (a[16] != 0)
+	    return STRCSPN_SSE2 (s, a);
+	}
+    }
+  /* MASK is ready; now scan S.  */
+  offset = (int) ((size_t) s & 15);
+  if (offset != 0)
+    {
+      /* S is unaligned: check the bytes up to the next 16-byte boundary.  */
+      aligned = (const char *) ((size_t) s & -16L);
+      __m128i value = _mm_load_si128 ((__m128i *) aligned);
+
+      value = __m128i_shift_right (value, offset);
+
+      int length = _mm_cmpistri (mask, value, 0x2);
+      /* No need to check ZFlag since ZFlag is always 1.  */
+      int cflag = _mm_cmpistrc (mask, value, 0x2);
+      if (cflag)	/* Match at offset LENGTH from S.  */
+	RETURN ((char *) (s + length), length);
+      /* Find where the NUL terminator is.  */
+      int index = _mm_cmpistri (value, value, 0x3a);
+      if (index < 16 - offset)	/* S ends inside this partial block.  */
+	RETURN (NULL, index);
+      aligned += 16;
+    }
+  else
+    aligned = s;
+  /* ALIGNED is 16-byte aligned here; scan one block per iteration.  */
+  while (1)
+    {
+      __m128i value = _mm_load_si128 ((__m128i *) aligned);
+      int index = _mm_cmpistri (mask, value, 0x2);
+      int cflag = _mm_cmpistrc (mask, value, 0x2);
+      int zflag = _mm_cmpistrz (mask, value, 0x2);
+      if (cflag)	/* Case 1: a byte from A was found at INDEX.  */
+	RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
+      if (zflag)	/* Case 2: S ends in this block without a match.  */
+	RETURN (NULL,
+		/* Find where the NUL terminator is.  */
+		(size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
+      aligned += 16;
+    }
+}
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcspn.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcspn.S
new file mode 100644
index 0000000000..d102c7e80b
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcspn.S
@@ -0,0 +1,69 @@
+/* Multiple versions of strcspn
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifdef USE_AS_STRPBRK
+#define STRCSPN_SSE42	__strpbrk_sse42
+#define STRCSPN_SSE2	__strpbrk_sse2
+#define __GI_STRCSPN	__GI_strpbrk
+#else
+#ifndef STRCSPN
+#define STRCSPN		strcspn
+#define STRCSPN_SSE42	__strcspn_sse42
+#define STRCSPN_SSE2	__strcspn_sse2
+#define __GI_STRCSPN	__GI_strcspn
+#endif
+#endif
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strpbrk in the static library, since
+   strpbrk is needed before initialization has happened.  */
+#if (defined SHARED || !defined USE_AS_STRPBRK) && IS_IN (libc)
+	.text
+ENTRY(STRCSPN)
+	.type	STRCSPN, @gnu_indirect_function	/* Resolver: hands back the chosen implementation in %rax.  */
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	STRCSPN_SSE2(%rip), %rax	/* Default: the SSE2 version.  */
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f	/* NOTE(review): presumably ZF is set when SSE4.2 is absent - confirm in init-arch.h.  */
+	leaq	STRCSPN_SSE42(%rip), %rax	/* Otherwise pick the SSE4.2 version.  */
+2:	ret
+END(STRCSPN)
+
+# undef ENTRY	/* The ENTRY/END below ignore NAME and always emit STRCSPN_SSE2.  */
+# define ENTRY(name) \
+	.type STRCSPN_SSE2, @function; \
+	.globl STRCSPN_SSE2; \
+	.align 16; \
+	STRCSPN_SSE2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2
+#endif	/* Without the block above, the file included below keeps the stock ENTRY/END.  */
+
+#ifdef USE_AS_STRPBRK
+#include "../strpbrk.S"
+#else
+#include "../strcspn.S"
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
new file mode 100644
index 0000000000..6728678688
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
@@ -0,0 +1,6 @@
+#define USE_SSSE3 1
+#define USE_AS_STRNCASECMP_L
+#define NO_NOLOCALE_ALIAS
+#define STRCMP __strncasecmp_l_ssse3
+#define __strncasecmp __strncasecmp_ssse3
+#include "../strcmp.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l.S
new file mode 100644
index 0000000000..9c0149788e
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l.S
@@ -0,0 +1,8 @@
+/* Multiple versions of strncasecmp and strncasecmp_l
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCMP __strncasecmp_l
+#define USE_AS_STRNCASECMP_L
+#include "strcmp.S"
+
+weak_alias (__strncasecmp_l, strncasecmp_l)
+libc_hidden_def (strncasecmp_l)
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncat-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-c.c
new file mode 100644
index 0000000000..a3cdbff689
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-c.c
@@ -0,0 +1,8 @@
+#define STRNCAT __strncat_sse2
+#ifdef SHARED
+#undef libc_hidden_def
+#define libc_hidden_def(name) \
+  __hidden_ver1 (__strncat_sse2, __GI___strncat, __strncat_sse2);
+#endif
+
+#include "string/strncat.c"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
new file mode 100644
index 0000000000..133e1d20b0
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_sse2_unaligned
+#include "strcat-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncat-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-ssse3.S
new file mode 100644
index 0000000000..6c45ff3ec7
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_ssse3
+#include "strcat-ssse3.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncat.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncat.S
new file mode 100644
index 0000000000..5c1bf41453
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncat.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncat
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCAT strncat
+#define USE_AS_STRNCAT
+#include "strcat.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncmp-ssse3.S
new file mode 100644
index 0000000000..96380a46be
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncmp-ssse3.S
@@ -0,0 +1,6 @@
+#ifdef SHARED
+# define USE_SSSE3 1
+# define STRCMP __strncmp_ssse3
+# define USE_AS_STRNCMP
+# include "../strcmp.S"
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncmp.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncmp.S
new file mode 100644
index 0000000000..fd5eb1397c
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncmp.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncmp
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCMP strncmp
+#define USE_AS_STRNCMP
+#include "strcmp.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-c.c
new file mode 100644
index 0000000000..296c32cb5d
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-c.c
@@ -0,0 +1,8 @@
+#define STRNCPY __strncpy_sse2
+#ifdef SHARED
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name) \
+  __hidden_ver1 (__strncpy_sse2, __GI_strncpy, __strncpy_sse2);
+#endif
+
+#include "strncpy.c"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
new file mode 100644
index 0000000000..fcc23a754a
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_sse2_unaligned
+#include "strcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-ssse3.S
new file mode 100644
index 0000000000..bf82ee447d
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy.S
new file mode 100644
index 0000000000..6d87a0ba35
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncpy
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCPY strncpy
+#define USE_AS_STRNCPY
+#include "strcpy.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk-c.c
new file mode 100644
index 0000000000..bbf5c49d89
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk-c.c
@@ -0,0 +1,8 @@
+/* Don't define multiple versions for strpbrk in the static library, since
+   strpbrk is needed before initialization has happened.  */
+#ifdef SHARED
+# define USE_AS_STRPBRK
+# define STRCSPN_SSE2 __strpbrk_sse2
+# define STRCSPN_SSE42 __strpbrk_sse42
+# include "strcspn-c.c"
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk.S b/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk.S
new file mode 100644
index 0000000000..7201d6376f
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strpbrk
+   All versions must be listed in ifunc-impl-list.c.  */
+#define STRCSPN strpbrk
+#define USE_AS_STRPBRK
+#include "strcspn.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strspn-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strspn-c.c
new file mode 100644
index 0000000000..1704606b80
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strspn-c.c
@@ -0,0 +1,145 @@
+/* strspn with SSE4.2 intrinsics
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <nmmintrin.h>
+#include <string.h>
+#include "varshift.h"
+
+/* We use 0x12:
+	_SIDD_SBYTE_OPS
+	| _SIDD_CMP_EQUAL_ANY
+	| _SIDD_NEGATIVE_POLARITY
+	| _SIDD_LEAST_SIGNIFICANT
+   on pcmpistri to compare xmm/mem128
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   X X X X X X X X X X X X X X X X
+
+   against xmm
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   A A A A A A A A A A A A A A A A
+
+   to find out if the first 16byte data element has any non-A byte and
+   the offset of the first byte.  There are 2 cases:
+
+   1. The first 16byte data element has the non-A byte, including
+      EOS, at the offset X.
+   2. The first 16byte data element is valid and doesn't have the non-A
+      byte.
+
+   Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
+
+   case		ECX	CFlag	ZFlag	SFlag
+    1		 X	  1	 0/1	  0
+    2		16	  0	  0	  0
+
+   We exit from the loop for case 1.  */
+
+extern size_t __strspn_sse2 (const char *, const char *);
+
+
+size_t
+__attribute__ ((section (".text.sse4.2")))
+__strspn_sse42 (const char *s, const char *a)
+{
+  /* With an empty accept set the initial segment is empty too.  */
+  if (a[0] == '\0')
+    return 0;
+
+  const char *chunk;
+  __m128i accept;
+
+  /* Build the accept mask from the (at most 16) bytes of A.  */
+  int misalign = (int) ((size_t) a & 15);
+  if (misalign == 0)
+    {
+      /* A starts on a 16-byte boundary, so one aligned load gets it.  */
+      accept = _mm_load_si128 ((__m128i *) a);
+
+      /* No NUL in those 16 bytes and a non-NUL byte right after them
+	 means A is longer than 16 bytes: defer to the SSE2 version.  */
+      if (_mm_cmpistri (accept, accept, 0x3a) == 16 && a[16] != 0)
+	return __strspn_sse2 (s, a);
+    }
+  else
+    {
+      /* Load the aligned block that contains the start of A and
+	 shift it right by the misalignment.  */
+      chunk = (const char *) ((size_t) a & -16L);
+      __m128i head = _mm_load_si128 ((__m128i *) chunk);
+
+      accept = __m128i_shift_right (head, misalign);
+
+      /* Locate the NUL terminator of A within the shifted block.  */
+      int a_len = _mm_cmpistri (accept, accept, 0x3a);
+      if (a_len == 16 - misalign)
+	{
+	  /* None found: A continues into the next aligned block.  */
+	  __m128i tail = _mm_load_si128 ((__m128i *) (chunk + 16));
+	  int tail_len = _mm_cmpistri (tail, tail, 0x3a);
+	  a_len += tail_len;
+
+	  /* A is longer than 16 bytes: defer to the SSE2 version.  */
+	  if (a_len > 16)
+	    return __strspn_sse2 (s, a);
+
+	  /* Merge the two blocks.  palignr could do it, but the data
+	     should be in L1 by now, so use one unaligned load.  */
+	  if (tail_len != 0)
+	    accept = _mm_loadu_si128 ((__m128i *) a);
+	}
+    }
+
+  /* Now scan S against the mask.  */
+  misalign = (int) ((size_t) s & 15);
+  if (misalign == 0)
+    chunk = s;
+  else
+    {
+      /* S is unaligned: first examine only the bytes between S and
+	 the end of the aligned block that contains it.  */
+      chunk = (const char *) ((size_t) s & -16L);
+      __m128i data = _mm_load_si128 ((__m128i *) chunk);
+
+      data = __m128i_shift_right (data, misalign);
+
+      int span = _mm_cmpistri (accept, data, 0x12);
+      /* CFlag is always 1 here, so it is not checked.  */
+      if (span < 16 - misalign)
+	return span;
+      /* Locate the NUL terminator of S within the shifted block.  */
+      int nul_at = _mm_cmpistri (data, data, 0x3a);
+      if (nul_at < 16 - misalign)
+	return span;
+      chunk += 16;
+    }
+
+  /* From here on S is consumed one aligned 16-byte block at a time.
+     CFlag set means the block holds a byte outside A, EOS included
+     (case 1 of the table above); INDEX is that byte's offset.  */
+  for (;;)
+    {
+      __m128i data = _mm_load_si128 ((__m128i *) chunk);
+      int index = _mm_cmpistri (accept, data, 0x12);
+      if (_mm_cmpistrc (accept, data, 0x12))
+	return (size_t) (chunk + index - s);
+      chunk += 16;
+    }
+}
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strspn.S b/REORG.TODO/sysdeps/x86_64/multiarch/strspn.S
new file mode 100644
index 0000000000..adf7d9e533
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strspn.S
@@ -0,0 +1,50 @@
+/* Multiple versions of strspn
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(strspn)
+	.type	strspn, @gnu_indirect_function	/* Resolver: hands back the chosen implementation in %rax.  */
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__strspn_sse2(%rip), %rax	/* Default: the SSE2 version.  */
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f	/* NOTE(review): presumably ZF is set when SSE4.2 is absent - confirm in init-arch.h.  */
+	leaq	__strspn_sse42(%rip), %rax	/* Otherwise pick the SSE4.2 version.  */
+2:	ret
+END(strspn)
+
+# undef ENTRY	/* The ENTRY/END below ignore NAME and always emit __strspn_sse2.  */
+# define ENTRY(name) \
+	.type __strspn_sse2, @function; \
+	.globl __strspn_sse2; \
+	.align 16; \
+	__strspn_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strspn_sse2, .-__strspn_sse2
+#endif	/* Outside libc the file included below keeps the stock ENTRY/END.  */
+
+#include "../strspn.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
new file mode 100644
index 0000000000..138979d10a
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
@@ -0,0 +1,374 @@
+/* strstr with unaligned loads
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+ENTRY(__strstr_sse2_unaligned)
+	movzbl	(%rsi), %eax	/* First needle byte (%rdi: haystack, %rsi: needle).  */
+	testb	%al, %al
+	je	L(empty)	/* Empty needle.  */
+	movzbl	1(%rsi), %edx	/* Second needle byte.  */
+	testb	%dl, %dl
+	je	L(strchr)	/* One-byte needle.  */
+	movd	%eax, %xmm1
+	movd	%edx, %xmm2
+	movq	%rdi, %rax
+	andl	$4095, %eax	/* Low 12 bits of the haystack address.  */
+	punpcklbw	%xmm1, %xmm1
+	cmpq	$4031, %rax	/* These flags are consumed by the ja below.  */
+	punpcklbw	%xmm2, %xmm2
+	punpcklwd	%xmm1, %xmm1
+	punpcklwd	%xmm2, %xmm2
+	pshufd	$0, %xmm1, %xmm1	/* %xmm1: first needle byte in all 16 lanes.  */
+	pshufd	$0, %xmm2, %xmm2	/* %xmm2: second needle byte in all 16 lanes.  */
+	ja	L(cross_page)	/* Presumably the unaligned reads below could leave a 4096-byte page.  */
+	movdqu	(%rdi), %xmm3
+	pxor	%xmm5, %xmm5
+	movdqu	1(%rdi), %xmm4
+	movdqa	%xmm3, %xmm6
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm2, %xmm4
+	movdqu	16(%rdi), %xmm0
+	pcmpeqb	%xmm5, %xmm6	/* NUL bytes among haystack bytes 0-15.  */
+	pminub	%xmm4, %xmm3	/* Lanes where both needle bytes match.  */
+	movdqa	%xmm3, %xmm4
+	movdqu	17(%rdi), %xmm3
+	pcmpeqb	%xmm0, %xmm5	/* NUL bytes among haystack bytes 16-31.  */
+	pcmpeqb	%xmm2, %xmm3
+	por	%xmm6, %xmm4
+	pcmpeqb	%xmm1, %xmm0
+	pminub	%xmm3, %xmm0
+	por	%xmm5, %xmm0
+	pmovmskb	%xmm4, %r8d
+	pmovmskb	%xmm0, %eax
+	salq	$16, %rax
+	orq	%rax, %r8	/* Bit i: byte i is NUL or starts a two-byte match.  */
+	je	L(next_32_bytes)
+L(next_pair_index):
+	bsf	%r8, %rax	/* Offset of the lowest remaining candidate.  */
+	addq	%rdi, %rax
+	cmpb	$0, (%rax)
+	je	L(zero1)	/* Candidate is a NUL byte: return NULL.  */
+	movzbl	2(%rsi), %edx
+	testb	%dl, %dl
+	je	L(found1)	/* Needle has only two bytes: %rax is the match.  */
+	cmpb	2(%rax), %dl
+	jne	L(next_pair)
+	xorl	%edx, %edx
+	jmp	L(pair_loop_start)
+
+	.p2align 4
+L(strchr):
+	movzbl	%al, %esi	/* Pass the single needle byte as the second argument.  */
+	jmp	__strchr_sse2
+
+	.p2align 4
+L(pair_loop):
+	addq	$1, %rdx
+	cmpb	2(%rax,%rdx), %cl
+	jne	L(next_pair)
+L(pair_loop_start):
+	movzbl	3(%rsi,%rdx), %ecx	/* Next needle byte; NUL means everything matched.  */
+	testb	%cl, %cl
+	jne	L(pair_loop)
+L(found1):
+	ret
+L(zero1):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(next_pair):
+	leaq	-1(%r8), %rax
+	andq	%rax, %r8	/* Clear the lowest set bit of the candidate mask.  */
+	jne	L(next_pair_index)
+
+	.p2align 4
+L(next_32_bytes):
+	movdqu	32(%rdi), %xmm3
+	pxor	%xmm5, %xmm5
+	movdqu	33(%rdi), %xmm4
+	movdqa	%xmm3, %xmm6
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm2, %xmm4
+	movdqu	48(%rdi), %xmm0
+	pcmpeqb	%xmm5, %xmm6	/* NUL bytes among haystack bytes 32-47.  */
+	pminub	%xmm4, %xmm3	/* Lanes where both needle bytes match.  */
+	movdqa	%xmm3, %xmm4
+	movdqu	49(%rdi), %xmm3
+	pcmpeqb	%xmm0, %xmm5	/* NUL bytes among haystack bytes 48-63.  */
+	pcmpeqb	%xmm2, %xmm3
+	por	%xmm6, %xmm4
+	pcmpeqb	%xmm1, %xmm0
+	pminub	%xmm3, %xmm0
+	por	%xmm5, %xmm0
+	pmovmskb	%xmm4, %eax
+	salq	$32, %rax
+	pmovmskb	%xmm0, %r8d
+	salq	$48, %r8
+	orq	%rax, %r8	/* Bit i (32-63): byte i is NUL or starts a two-byte match.  */
+	je	L(loop_header)
+L(next_pair2_index):
+	bsfq	%r8, %rax	/* Offset of the lowest remaining candidate.  */
+	addq	%rdi, %rax
+	cmpb	$0, (%rax)
+	je	L(zero2)	/* Candidate is a NUL byte: return NULL.  */
+	movzbl	2(%rsi), %edx
+	testb	%dl, %dl
+	je	L(found2)	/* Needle has only two bytes: %rax is the match.  */
+	cmpb	2(%rax), %dl
+	jne	L(next_pair2)
+	xorl	%edx, %edx
+	jmp	L(pair_loop2_start)
+
+	.p2align 4
+L(pair_loop2):
+	addq	$1, %rdx
+	cmpb	2(%rax,%rdx), %cl
+	jne	L(next_pair2)
+L(pair_loop2_start):
+	movzbl	3(%rsi,%rdx), %ecx	/* Next needle byte; NUL means everything matched.  */
+	testb	%cl, %cl
+	jne	L(pair_loop2)
+L(found2):
+	ret
+	L(zero2):
+	xorl	%eax, %eax
+	ret
+L(empty):
+	mov %rdi, %rax	/* Return the first argument unchanged.  */
+	ret
+
+	.p2align 4
+L(next_pair2):
+	leaq	-1(%r8), %rax
+	andq	%rax, %r8	/* Clear the lowest set bit of the candidate mask.  */
+	jne	L(next_pair2_index)
+L(loop_header):
+	movq	$-512, %r11	/* Start value of the counter updated at L(next_pair3).  */
+	movq	%rdi, %r9	/* Keep the incoming %rdi for the check at L(next_pair3).  */
+
+	pxor	%xmm7, %xmm7
+	andq	$-64, %rdi	/* Round %rdi down to a 64-byte boundary.  */
+
+	.p2align 4
+L(loop):
+	movdqa	64(%rdi), %xmm3
+	movdqu	63(%rdi), %xmm6
+	movdqa	%xmm3, %xmm0
+	pxor	%xmm2, %xmm3	/* Zero where the byte equals the second needle byte.  */
+	pxor	%xmm1, %xmm6	/* Zero where the preceding byte equals the first.  */
+	movdqa	80(%rdi), %xmm10
+	por	%xmm3, %xmm6	/* Zero only where both of the above hold.  */
+	pminub	%xmm10, %xmm0
+	movdqu	79(%rdi), %xmm3
+	pxor	%xmm2, %xmm10
+	pxor	%xmm1, %xmm3
+	movdqa	96(%rdi), %xmm9
+	por	%xmm10, %xmm3
+	pminub	%xmm9, %xmm0
+	pxor	%xmm2, %xmm9
+	movdqa	112(%rdi), %xmm8
+	addq	$64, %rdi	/* %rdi now addresses the 64 bytes just loaded.  */
+	pminub	%xmm6, %xmm3
+	movdqu	31(%rdi), %xmm4
+	pminub	%xmm8, %xmm0
+	pxor	%xmm2, %xmm8
+	pxor	%xmm1, %xmm4
+	por	%xmm9, %xmm4
+	pminub	%xmm4, %xmm3
+	movdqu	47(%rdi), %xmm5
+	pxor	%xmm1, %xmm5
+	por	%xmm8, %xmm5
+	pminub	%xmm5, %xmm3
+	pminub	%xmm3, %xmm0
+	pcmpeqb	%xmm7, %xmm0	/* Set lane: a NUL or a two-byte match somewhere in the 64 bytes.  */
+	pmovmskb	%xmm0, %eax
+	testl	%eax, %eax
+	je	L(loop)
+	pminub (%rdi), %xmm6
+	pminub 32(%rdi),%xmm4
+	pminub 48(%rdi),%xmm5
+	pcmpeqb %xmm7, %xmm6
+	pcmpeqb %xmm7, %xmm5
+	pmovmskb	%xmm6, %edx
+	movdqa	16(%rdi), %xmm8
+	pcmpeqb %xmm7, %xmm4
+	movdqu  15(%rdi), %xmm0
+	pmovmskb	%xmm5, %r8d
+	movdqa  %xmm8, %xmm3
+	pmovmskb	%xmm4, %ecx
+	pcmpeqb %xmm1,%xmm0
+	pcmpeqb %xmm2,%xmm3
+	salq	$32, %rcx
+	pcmpeqb %xmm7,%xmm8
+	salq	$48, %r8
+	pminub  %xmm0,%xmm3
+	orq	%rcx, %rdx
+	por	%xmm3,%xmm8
+	orq	%rdx, %r8
+	pmovmskb	%xmm8, %eax
+	salq	$16, %rax
+	orq	%rax, %r8	/* Bit i: byte i is NUL, or is the second byte of a two-byte match.  */
+	je	L(loop)
+L(next_pair_index3):
+	bsfq	%r8, %rcx
+	addq	%rdi, %rcx	/* %rcx: the candidate's second byte, or a NUL.  */
+	cmpb	$0, (%rcx)
+	je	L(zero)
+	xorl	%eax, %eax
+	movzbl	2(%rsi), %edx
+	testb	%dl, %dl
+	je	L(success3)	/* Needle has only two bytes.  */
+	cmpb	1(%rcx), %dl
+	jne	L(next_pair3)
+	jmp	L(pair_loop_start3)
+
+	.p2align 4
+L(pair_loop3):
+	addq	$1, %rax
+	cmpb	1(%rcx,%rax), %dl
+	jne	L(next_pair3)
+L(pair_loop_start3):
+	movzbl	3(%rsi,%rax), %edx	/* Next needle byte; NUL means everything matched.  */
+	testb	%dl, %dl
+	jne	L(pair_loop3)
+L(success3):
+	lea	-1(%rcx), %rax	/* The match starts one byte before %rcx.  */
+	ret
+
+	.p2align 4
+L(next_pair3):
+	addq	%rax, %r11	/* Add the count of extra needle bytes just compared.  */
+	movq	%rdi,  %rax
+	subq	%r9, %rax	/* Distance of the current block from the address saved in %r9.  */
+	cmpq	%r11, %rax
+	jl	L(switch_strstr)	/* NOTE(review): presumably a guard against quadratic cost - confirm.  */
+	leaq	-1(%r8), %rax
+	andq	%rax, %r8	/* Clear the lowest set bit of the candidate mask.  */
+	jne	L(next_pair_index3)
+	jmp	L(loop)
+
+	.p2align 4
+L(switch_strstr):
+	movq	%rdi, %rdi	/* No-op: source and destination are the same register.  */
+	jmp	__strstr_sse2
+
+	.p2align 4
+L(cross_page):
+
+	movq	%rdi, %rax
+	pxor	%xmm0, %xmm0
+	andq	$-64, %rax	/* 64-byte aligned block containing %rdi.  */
+	movdqa	(%rax), %xmm3
+	movdqu	-1(%rax), %xmm4
+	movdqa	%xmm3, %xmm8
+	movdqa	16(%rax), %xmm5
+	pcmpeqb	%xmm1, %xmm4	/* Preceding byte equals the first needle byte.  */
+	pcmpeqb	%xmm0, %xmm8	/* NUL bytes among block bytes 0-15.  */
+	pcmpeqb	%xmm2, %xmm3	/* Byte equals the second needle byte.  */
+	movdqa	%xmm5, %xmm7
+	pminub	%xmm4, %xmm3
+	movdqu	15(%rax), %xmm4
+	pcmpeqb	%xmm0, %xmm7
+	por	%xmm3, %xmm8
+	movdqa	%xmm5, %xmm3
+	movdqa	32(%rax), %xmm5
+	pcmpeqb	%xmm1, %xmm4
+	pcmpeqb	%xmm2, %xmm3
+	movdqa	%xmm5, %xmm6
+	pmovmskb	%xmm8, %ecx
+	pminub	%xmm4, %xmm3
+	movdqu	31(%rax), %xmm4
+	por	%xmm3, %xmm7
+	movdqa	%xmm5, %xmm3
+	pcmpeqb	%xmm0, %xmm6
+	movdqa	48(%rax), %xmm5
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb	%xmm7, %r8d
+	pcmpeqb	%xmm2, %xmm3
+	pcmpeqb	%xmm5, %xmm0
+	pminub	%xmm4, %xmm3
+	movdqu	47(%rax), %xmm4
+	por	%xmm3, %xmm6
+	movdqa	%xmm5, %xmm3
+	salq	$16, %r8
+	pcmpeqb	%xmm1, %xmm4
+	pcmpeqb	%xmm2, %xmm3
+	pmovmskb	%xmm6, %r10d
+	pminub	%xmm4, %xmm3
+	por	%xmm3, %xmm0
+	salq	$32, %r10
+	orq	%r10, %r8
+	orq	%rcx, %r8
+	movl	%edi, %ecx
+	pmovmskb	%xmm0, %edx
+	subl	%eax, %ecx	/* %ecx = %rdi minus the aligned block start.  */
+	salq	$48, %rdx
+	orq	%rdx, %r8
+	shrq	%cl, %r8	/* Drop the bits for bytes in front of %rdi.  */
+	je	L(loop_header)
+L(next_pair_index4):
+	bsfq	%r8, %rax
+	addq	%rdi, %rax	/* %rax: the candidate's second byte, or a NUL.  */
+	cmpb	$0, (%rax)
+	je	L(zero)
+
+	cmpq	%rax,%rdi
+	je	L(next_pair4)	/* Bit 0 would mean a match starting at %rdi - 1.  */
+
+	movzbl	2(%rsi), %edx
+	testb	%dl, %dl
+	je	L(found3)	/* Needle has only two bytes.  */
+	cmpb	1(%rax), %dl
+	jne	L(next_pair4)
+	xorl	%edx, %edx
+	jmp	L(pair_loop_start4)
+
+	.p2align 4
+L(pair_loop4):
+	addq	$1, %rdx
+	cmpb	1(%rax,%rdx), %cl
+	jne	L(next_pair4)
+L(pair_loop_start4):
+	movzbl	3(%rsi,%rdx), %ecx	/* Next needle byte; NUL means everything matched.  */
+	testb	%cl, %cl
+	jne	L(pair_loop4)
+L(found3):
+	subq $1, %rax	/* The match starts one byte before %rax.  */
+	ret
+
+	.p2align 4
+L(next_pair4):
+	leaq	-1(%r8), %rax
+	andq	%rax, %r8	/* Clear the lowest set bit of the candidate mask.  */
+	jne	L(next_pair_index4)
+	jmp	L(loop_header)
+
+	.p2align 4
+L(found):	/* No branch inside this function targets this label.  */
+	rep
+	ret
+
+	.p2align 4
+L(zero):
+	xorl	%eax, %eax	/* Return NULL.  */
+	ret
+
+
+END(__strstr_sse2_unaligned)
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strstr.c b/REORG.TODO/sysdeps/x86_64/multiarch/strstr.c
new file mode 100644
index 0000000000..a7d181d797
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strstr.c
@@ -0,0 +1,50 @@
+/* Multiple versions of strstr.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Redefine strstr so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+#undef  strstr
+#define strstr __redirect_strstr
+#include <string.h>
+#undef  strstr
+
+#define STRSTR __strstr_sse2
+#ifdef SHARED
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+  __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
+#endif
+
+#include "string/strstr.c"
+
+extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
+extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
+
+#include "init-arch.h"
+
+/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
+   ifunc symbol properly.  */
+extern __typeof (__redirect_strstr) __libc_strstr;
+libc_ifunc (__libc_strstr,
+	    HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	    ? __strstr_sse2_unaligned	/* Picked when Fast_Unaligned_Load is reported.  */
+	    : __strstr_sse2)		/* Fallback in every other case.  */
+
+#undef strstr
+strong_alias (__libc_strstr, strstr)
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/test-multiarch.c b/REORG.TODO/sysdeps/x86_64/multiarch/test-multiarch.c
new file mode 100644
index 0000000000..597d64e1e8
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/test-multiarch.c
@@ -0,0 +1,96 @@
+/* Test CPU feature data.
+   This file is part of the GNU C Library.
+   Copyright (C) 2012-2017 Free Software Foundation, Inc.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <cpu-features.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static char *cpu_flags;
+
+/* Search for flags in /proc/cpuinfo and store a copy of that line in
+   cpu_flags.  Exit with status 1 if the file or the line is missing.  */
+void
+get_cpuinfo (void)
+{
+  char *line = NULL;
+  size_t len = 0;
+
+  FILE *f = fopen ("/proc/cpuinfo", "r");
+  if (f == NULL)
+    {
+      printf ("cannot open /proc/cpuinfo\n");
+      exit (1);
+    }
+
+  while (cpu_flags == NULL && getline (&line, &len, f) != -1)
+    if (strncmp (line, "flags", 5) == 0)
+      cpu_flags = strdup (line);
+  fclose (f);
+  free (line);
+
+  /* check_proc passes cpu_flags to strstr, which must not get NULL.  */
+  if (cpu_flags == NULL)
+    {
+      printf ("cannot find flags in /proc/cpuinfo\n");
+      exit (1);
+    }
+}
+
+int
+check_proc (const char *proc_name, int flag, const char *name)
+{
+  printf ("Checking %s:\n", name);
+  printf ("  init-arch %d\n", flag);
+
+  /* A substring hit anywhere in the saved flags line counts as present.  */
+  int found = strstr (cpu_flags, proc_name) != NULL;
+  printf ("  cpuinfo (%s) %d\n", proc_name, found);
+
+  /* Report a disagreement between the two sources and return 1 for it.  */
+  int mismatch = found != flag;
+  if (mismatch)
+    printf (" *** failure ***\n");
+  return mismatch;
+}
+
+static int
+do_test (int argc, char **argv)
+{
+  /* Load the flags line that each check_proc call below searches.  */
+  get_cpuinfo ();
+  /* Count the features on which /proc/cpuinfo and glibc disagree.  */
+  int diffs = 0;
+  diffs += check_proc ("avx", HAS_ARCH_FEATURE (AVX_Usable),
+		       "HAS_ARCH_FEATURE (AVX_Usable)");
+  diffs += check_proc ("fma4", HAS_ARCH_FEATURE (FMA4_Usable),
+		       "HAS_ARCH_FEATURE (FMA4_Usable)");
+  diffs += check_proc ("sse4_2", HAS_CPU_FEATURE (SSE4_2),
+		       "HAS_CPU_FEATURE (SSE4_2)");
+  diffs += check_proc ("sse4_1", HAS_CPU_FEATURE (SSE4_1),
+		       "HAS_CPU_FEATURE (SSE4_1)");
+  diffs += check_proc ("ssse3", HAS_CPU_FEATURE (SSSE3),
+		       "HAS_CPU_FEATURE (SSSE3)");
+  diffs += check_proc ("popcnt", HAS_CPU_FEATURE (POPCOUNT),
+		       "HAS_CPU_FEATURE (POPCOUNT)");
+
+  printf ("%d differences between /proc/cpuinfo and glibc code.\n", diffs);
+  return diffs != 0;
+}
+
+#include "../../../test-skeleton.c"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/varshift.c b/REORG.TODO/sysdeps/x86_64/multiarch/varshift.c
new file mode 100644
index 0000000000..1c3e34845d
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/varshift.c
@@ -0,0 +1,25 @@
+/* Helper for variable shifts of SSE registers.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "varshift.h"
+
+const int8_t ___m128i_shift_right[31] attribute_hidden =
+  { /* Byte indices 0-15 followed by fifteen -1 entries.  */
+    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+  }; /* 31 entries: a 16-byte read may start at any offset from 0 to 15.  */
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/varshift.h b/REORG.TODO/sysdeps/x86_64/multiarch/varshift.h
new file mode 100644
index 0000000000..07bb76c4bf
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/varshift.h
@@ -0,0 +1,30 @@
+/* Helper for variable shifts of SSE registers.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+#include <tmmintrin.h>
+
+extern const int8_t ___m128i_shift_right[31] attribute_hidden;
+
+static __inline__ __m128i
+__m128i_shift_right (__m128i value, unsigned long int offset)
+{
+  /* Shuffle control: the 16 table bytes that start at index OFFSET.  */
+  const int8_t *ctrl = ___m128i_shift_right + offset;
+  return _mm_shuffle_epi8 (value, _mm_loadu_si128 ((__m128i *) ctrl));
+}
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-c.c
new file mode 100644
index 0000000000..a51a83a9be
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-c.c
@@ -0,0 +1,5 @@
+#if IS_IN (libc)
+# define wcscpy  __wcscpy_sse2
+#endif
+
+#include "wcsmbs/wcscpy.c"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
new file mode 100644
index 0000000000..53857ce4f5
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
@@ -0,0 +1,552 @@
+/* wcscpy with SSSE3
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+
+	.section .text.ssse3,"ax",@progbits
+ENTRY (__wcscpy_ssse3)
+/* In: %rdi = destination, %rsi = source of 32-bit elements.  Out: %rax = %rdi.  */
+	mov	%rsi, %rcx	/* %rcx: read cursor.  */
+	mov	%rdi, %rdx	/* %rdx: write cursor; %rdi stays intact for the return value.  */
+
+	cmpl	$0, (%rcx)	/* Test the first four elements one at a time.  */
+	jz	L(Exit4)
+	cmpl	$0, 4(%rcx)
+	jz	L(Exit8)
+	cmpl	$0, 8(%rcx)
+	jz	L(Exit12)
+	cmpl	$0, 12(%rcx)
+	jz	L(Exit16)
+
+	lea	16(%rcx), %rsi
+	and	$-16, %rsi	/* %rsi: first 16-byte boundary above %rcx.  */
+
+	pxor	%xmm0, %xmm0	/* %xmm0 = 0, the operand every pcmpeqd compares against.  */
+	mov	(%rcx), %r9
+	mov	%r9, (%rdx)	/* Bytes 0-7, already known to hold no zero element.  */
+
+	pcmpeqd	(%rsi), %xmm0	/* Look for zero elements in the aligned block at %rsi.  */
+	mov	8(%rcx), %r9
+	mov	%r9, 8(%rdx)	/* Bytes 8-15.  */
+
+	pmovmskb %xmm0, %rax	/* One mask bit per byte, i.e. four bits per element.  */
+	sub	%rcx, %rsi	/* %rsi: offset of that block from %rcx.  */
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+/* A compare without a match leaves %xmm0 all-zero, so it keeps serving as the zero operand.  */
+	mov	%rdx, %rax
+	lea	16(%rdx), %rdx
+	and	$-16, %rdx	/* Advance %rdx to its next 16-byte boundary ...  */
+	sub	%rdx, %rax	/* %rax = old %rdx - new %rdx.  */
+	sub	%rax, %rcx	/* ... and advance %rcx by the same distance.  */
+	mov	%rcx, %rax
+	and	$0xf, %rax	/* %rax = %rcx mod 16; ZF is set when %rcx is aligned as well.  */
+	mov	$0, %rsi	/* mov leaves the flags alone: ZF survives until the jz below.  */
+
+/* case: rcx_offset == rdx_offset (ZF comes from the "and" above) */
+
+	jz	L(Align16Both)
+
+	cmp	$4, %rax
+	je	L(Shl4)
+	cmp	$8, %rax
+	je	L(Shl8)
+	jmp	L(Shl12)	/* Every other remainder is treated as 12.  */
+
+L(Align16Both):
+	movaps	(%rcx), %xmm1
+	movaps	16(%rcx), %xmm2
+	movaps	%xmm1, (%rdx)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi	/* %rsi tracks the offset of the block just tested.  */
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm3
+	movaps	%xmm2, (%rdx, %rsi)	/* The previous block had no terminator: store it.  */
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm4
+	movaps	%xmm3, (%rdx, %rsi)
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm1
+	movaps	%xmm4, (%rdx, %rsi)
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm2
+	movaps	%xmm1, (%rdx, %rsi)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm3
+	movaps	%xmm2, (%rdx, %rsi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%rdx, %rsi)
+	mov	%rcx, %rax
+	lea	16(%rcx, %rsi), %rcx
+	and	$-0x40, %rcx	/* Round the read cursor down to a 64-byte boundary.  */
+	sub	%rcx, %rax	/* %rax = old %rcx - new %rcx.  */
+	sub	%rax, %rdx	/* Shift %rdx by the same distance.  */
+
+	mov	$-0x40, %rsi	/* Exit offset: the loop bumps both cursors by 64 before leaving.  */
+
+	.p2align 4
+L(Aligned64Loop):
+	movaps	(%rcx), %xmm2
+	movaps	%xmm2, %xmm4	/* %xmm4-%xmm7 hold the four blocks unmodified.  */
+	movaps	16(%rcx), %xmm5
+	movaps	32(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%rcx), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3	/* Bytewise minimum over all four blocks.  */
+	pcmpeqd	%xmm0, %xmm3	/* A zero element in the minimum means: inspect the blocks.  */
+	pmovmskb %xmm3, %rax
+	lea	64(%rdx), %rdx
+	lea	64(%rcx), %rcx
+	test	%rax, %rax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%rdx)
+	movaps	%xmm5, -48(%rdx)
+	movaps	%xmm6, -32(%rdx)
+	movaps	%xmm7, -16(%rdx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):
+	pcmpeqd	%xmm4, %xmm0	/* Re-test each block on its own, in address order.  */
+	pmovmskb %xmm0, %rax
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm5, %xmm0
+
+	pmovmskb %xmm0, %rax
+	movaps	%xmm4, -64(%rdx)
+	test	%rax, %rax
+	lea	16(%rsi), %rsi	/* lea leaves the flags alone; the jnz still sees the test.  */
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm6, %xmm0
+
+	pmovmskb %xmm0, %rax
+	movaps	%xmm5, -48(%rdx)
+	test	%rax, %rax
+	lea	16(%rsi), %rsi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%rdx)
+	pcmpeqd	%xmm7, %xmm0
+
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	$-0x40, %rsi	/* No single block held a zero element: resume the loop.  */
+	movaps	%xmm7, -16(%rdx)
+	jmp	L(Aligned64Loop)
+/* Shl4: with %rdx aligned, %rcx sits 4 bytes past a 16-byte boundary.  */
+	.p2align 4
+L(Shl4):
+	movaps	-4(%rcx), %xmm1	/* Aligned block that contains the 12 bytes at %rcx.  */
+	movaps	12(%rcx), %xmm2	/* The aligned block after it.  */
+L(Shl4Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3	/* Keep the raw block; palignr below overwrites %xmm2.  */
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2	/* %xmm2 = the 16 bytes starting at %rcx.  */
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	28(%rcx), %rcx	/* Address of the next aligned block not yet loaded.  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* Round that address down to a 64-byte boundary.  */
+	sub	%rcx, %rax
+	lea	-12(%rcx), %rcx	/* Re-bias so that 12(%rcx) is the boundary.  */
+	sub	%rax, %rdx	/* Move %rdx back by the rounding distance.  */
+
+	movaps	-4(%rcx), %xmm1	/* Block just below the boundary, needed by the first palignr.  */
+
+	.p2align 4
+L(Shl4LoopStart):
+	movaps	12(%rcx), %xmm2
+	movaps	28(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* Bytewise minimum over the four aligned blocks.  */
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* Save the last raw block; it becomes %xmm1 next time.  */
+	palignr	$4, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$4, %xmm3, %xmm4	/* Between test and jnz; the jnz still acts on the test.  */
+	jnz	L(Shl4Start)	/* Possible terminator: redo these blocks 16 bytes at a time.  */
+
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+	movdqu	-4(%rcx), %xmm1
+	mov	$12, %rsi	/* The block holding the terminator starts at 12(%rcx).  */
+	movdqu	%xmm1, -4(%rdx)	/* Copy the bytes in front of that block (unaligned store).  */
+	jmp	L(CopyFrom1To16Bytes)
+/* Shl8: with %rdx aligned, %rcx sits 8 bytes past a 16-byte boundary.  */
+	.p2align 4
+L(Shl8):
+	movaps	-8(%rcx), %xmm1	/* Aligned block that contains the 8 bytes at %rcx.  */
+	movaps	8(%rcx), %xmm2	/* The aligned block after it.  */
+L(Shl8Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3	/* Keep the raw block; palignr below overwrites %xmm2.  */
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2	/* %xmm2 = the 16 bytes starting at %rcx.  */
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	24(%rcx), %rcx	/* Address of the next aligned block not yet loaded.  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* Round that address down to a 64-byte boundary.  */
+	sub	%rcx, %rax
+	lea	-8(%rcx), %rcx	/* Re-bias so that 8(%rcx) is the boundary.  */
+	sub	%rax, %rdx	/* Move %rdx back by the rounding distance.  */
+
+	movaps	-8(%rcx), %xmm1	/* Block just below the boundary, needed by the first palignr.  */
+
+	.p2align 4
+L(Shl8LoopStart):
+	movaps	8(%rcx), %xmm2
+	movaps	24(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* Bytewise minimum over the four aligned blocks.  */
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* Save the last raw block; it becomes %xmm1 next time.  */
+	palignr	$8, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$8, %xmm3, %xmm4	/* Between test and jnz; the jnz still acts on the test.  */
+	jnz	L(Shl8Start)	/* Possible terminator: redo these blocks 16 bytes at a time.  */
+
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+	mov	(%rcx), %r9
+	mov	$8, %rsi	/* The block holding the terminator starts at 8(%rcx).  */
+	mov	%r9, (%rdx)	/* Copy the 8 bytes in front of that block.  */
+	jmp	L(CopyFrom1To16Bytes)
+/* Shl12: reached for the remaining remainder, expected to be 12 bytes past a boundary.  */
+	.p2align 4
+L(Shl12):
+	movaps	-12(%rcx), %xmm1	/* Aligned block that contains the 4 bytes at %rcx.  */
+	movaps	4(%rcx), %xmm2	/* The aligned block after it.  */
+L(Shl12Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3	/* Keep the raw block; palignr below overwrites %xmm2.  */
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2	/* %xmm2 = the 16 bytes starting at %rcx.  */
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	20(%rcx), %rcx	/* Address of the next aligned block not yet loaded.  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* Round that address down to a 64-byte boundary.  */
+	sub	%rcx, %rax
+	lea	-4(%rcx), %rcx	/* Re-bias so that 4(%rcx) is the boundary.  */
+	sub	%rax, %rdx	/* Move %rdx back by the rounding distance.  */
+
+	movaps	-12(%rcx), %xmm1	/* Block just below the boundary, needed by the first palignr.  */
+
+	.p2align 4
+L(Shl12LoopStart):
+	movaps	4(%rcx), %xmm2
+	movaps	20(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* Bytewise minimum over the four aligned blocks.  */
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* Save the last raw block; it becomes %xmm1 next time.  */
+	palignr	$12, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$12, %xmm3, %xmm4	/* Between test and jnz; the jnz still acts on the test.  */
+	jnz	L(Shl12Start)	/* Possible terminator: redo these blocks 16 bytes at a time.  */
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+	mov	(%rcx), %r9d
+	mov	$4, %rsi	/* The block holding the terminator starts at 4(%rcx).  */
+	mov	%r9d, (%rdx)	/* Copy the 4 bytes in front of that block.  */
+	jmp	L(CopyFrom1To16Bytes)
+/* Tail.  In: %rsi = offset of the block with the terminator, %rax = its byte mask.  */
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%rsi, %rdx
+	add	%rsi, %rcx	/* Both cursors now address that block.  */
+
+	test	%al, %al	/* Mask bits 0-7 cover elements 0 and 1.  */
+	jz	L(ExitHigh)
+	test	$0x01, %al	/* Bit 0 set: element 0 is the terminator.  */
+	jnz	L(Exit4)
+
+	mov	(%rcx), %rax	/* Otherwise it is element 1: copy 8 bytes.  */
+	mov	%rax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah	/* Mask bit 8 set: element 2 is the terminator.  */
+	jnz	L(Exit12)
+
+	mov	(%rcx), %rax	/* Otherwise it is element 3: copy all 16 bytes.  */
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %rax
+	mov	%rax, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit4):
+	movl	(%rcx), %eax	/* Copy 4 bytes, then return the original destination.  */
+	movl	%eax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit8):
+	mov	(%rcx), %rax	/* Copy 8 bytes.  */
+	mov	%rax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit12):
+	mov	(%rcx), %rax	/* Copy 8 + 4 bytes.  */
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %eax
+	mov	%eax, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit16):
+	mov	(%rcx), %rax	/* Copy 8 + 8 bytes.  */
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %rax
+	mov	%rax, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+END(__wcscpy_ssse3)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy.S
new file mode 100644
index 0000000000..9150ab6d18
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy.S
@@ -0,0 +1,40 @@
+/* Multiple versions of wcscpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+
+	.text
+ENTRY(wcscpy)
+	.type	wcscpy, @gnu_indirect_function	/* This body returns the chosen implementation's address in %rax.  */
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_CPU_FEATURE (SSSE3)	/* Macro from init-arch.h (not shown); presumably sets the flags tested by jnz -- confirm.  */
+	jnz	2f
+	leaq	__wcscpy_sse2(%rip), %rax	/* Taken when the feature test yields zero.  */
+	ret
+
+2:	leaq	__wcscpy_ssse3(%rip), %rax	/* Taken when the feature test yields non-zero.  */
+	ret
+
+END(wcscpy)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-c.c
new file mode 100644
index 0000000000..e1ec7cfbb5
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-c.c
@@ -0,0 +1,9 @@
+#if IS_IN (libc)
+# include <wchar.h>
+
+# define WCSNLEN __wcsnlen_sse2	/* Presumably the name wcsmbs/wcsnlen.c gives its function -- confirm there.  */
+
+extern __typeof (wcsnlen) __wcsnlen_sse2;	/* Declared with the same type as wcsnlen.  */
+#endif
+
+#include "wcsmbs/wcsnlen.c"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
new file mode 100644
index 0000000000..a8cab0cb00
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
@@ -0,0 +1,5 @@
+#define AS_WCSLEN	/* AS_WCSLEN and AS_STRNLEN presumably select the wide, length-bounded variant in ../strlen.S -- confirm.  */
+#define AS_STRNLEN
+#define strlen	__wcsnlen_sse4_1	/* Every strlen token in the include below becomes __wcsnlen_sse4_1.  */
+
+#include "../strlen.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen.c b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen.c
new file mode 100644
index 0000000000..304f62eec3
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen.c
@@ -0,0 +1,45 @@
+/* Multiple versions of wcsnlen.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define __wcsnlen __redirect_wcsnlen	/* While <wchar.h> is read, any __wcsnlen token becomes __redirect_wcsnlen.  */
+# include <wchar.h>
+# undef __wcsnlen
+
+# define SYMBOL_NAME wcsnlen	/* Set before <init-arch.h>; presumably consumed by OPTIMIZE and IFUNC_SELECTOR -- confirm.  */
+# include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;	/* The two candidates the selector below chooses between.  */
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+
+/* Return the wcsnlen variant to bind: the sse4_1 one when the CPU reports
+   SSE4_1, the sse2 one otherwise.  */
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features *features = __get_cpu_features ();
+  return (CPU_FEATURES_CPU_P (features, SSE4_1)
+	  ? OPTIMIZE (sse4_1)
+	  : OPTIMIZE (sse2));
+}
+
+libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
+weak_alias (__wcsnlen, wcsnlen);	/* Exposes __wcsnlen under the public name wcsnlen.  */
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S
new file mode 100644
index 0000000000..bfa1a16a35
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S
@@ -0,0 +1,4 @@
+#define MEMCMP __wmemcmp_avx2_movbe	/* Symbol the wmemcmp selector refers to.  */
+#define USE_AS_WMEMCMP 1	/* Presumably switches memcmp-avx2-movbe.S to wide-character comparison -- confirm there.  */
+
+#include "memcmp-avx2-movbe.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-c.c
new file mode 100644
index 0000000000..46b6715e18
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-c.c
@@ -0,0 +1,9 @@
+#if IS_IN (libc)
+# include <wchar.h>
+
+# define WMEMCMP  __wmemcmp_sse2	/* Presumably the name wcsmbs/wmemcmp.c gives its function -- confirm there.  */
+
+extern __typeof (wmemcmp) __wmemcmp_sse2;	/* Declared with the same type as wmemcmp.  */
+#endif
+
+#include "wcsmbs/wmemcmp.c"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
new file mode 100644
index 0000000000..b07973a4f6
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1	/* Presumably switches memcmp-sse4.S to wide-character comparison -- confirm there.  */
+#define MEMCMP __wmemcmp_sse4_1	/* Symbol the wmemcmp selector refers to.  */
+
+#include "memcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
new file mode 100644
index 0000000000..a41ef95fc1
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1	/* Presumably switches memcmp-ssse3.S to wide-character comparison -- confirm there.  */
+#define MEMCMP __wmemcmp_ssse3	/* Symbol the wmemcmp selector refers to.  */
+
+#include "memcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp.S
new file mode 100644
index 0000000000..94b25a214c
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp.S
@@ -0,0 +1,55 @@
+/* Multiple versions of wmemcmp
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(wmemcmp)
+	.type	wmemcmp, @gnu_indirect_function	/* This body returns the chosen implementation's address in %rax.  */
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)	/* HAS_* macros come from init-arch.h (not shown); presumably they set the flags for the next jump.  */
+	jnz	1f	/* Non-zero here skips the AVX2 variant.  */
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_CPU_FEATURE (MOVBE)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__wmemcmp_avx2_movbe(%rip), %rax	/* Reached only when none of the four jumps above was taken.  */
+	ret
+
+1:	HAS_CPU_FEATURE (SSSE3)
+	jnz	2f
+	leaq	__wmemcmp_sse2(%rip), %rax	/* SSSE3 test yielded zero.  */
+	ret
+
+2:	HAS_CPU_FEATURE (SSE4_1)
+	jz	3f
+	leaq	__wmemcmp_sse4_1(%rip), %rax	/* SSSE3 and SSE4_1 tests both non-zero.  */
+	ret
+
+3:	leaq	__wmemcmp_ssse3(%rip), %rax	/* SSSE3 test non-zero, SSE4_1 test zero.  */
+	ret
+
+END(wmemcmp)
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemset.c b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset.c
new file mode 100644
index 0000000000..dd35be6e49
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset.c
@@ -0,0 +1,33 @@
+/* Multiple versions of wmemset.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define wmemset __redirect_wmemset	/* While <wchar.h> is read, both names are spelled with the __redirect_ prefix.  */
+# define __wmemset __redirect___wmemset
+# include <wchar.h>
+# undef wmemset
+# undef __wmemset
+
+# define SYMBOL_NAME wmemset
+# include "ifunc-wmemset.h"	/* Presumably supplies IFUNC_SELECTOR, which is not defined in this file -- confirm.  */
+
+libc_ifunc_redirected (__redirect_wmemset, __wmemset, IFUNC_SELECTOR ());
+weak_alias (__wmemset, wmemset)	/* Exposes __wmemset under the public name wmemset.  */
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S
new file mode 100644
index 0000000000..0a537fe272
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S
@@ -0,0 +1,21 @@
+/* Non-shared version of wmemset_chk for x86-64.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc) && !defined SHARED
+# include "../wmemset_chk.S"	/* Pulled in only for libc builds where SHARED is not defined.  */
+#endif
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk.c b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk.c
new file mode 100644
index 0000000000..d3ded5595b
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk.c
@@ -0,0 +1,31 @@
+/* Multiple versions of wmemset_chk.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.so. */
+#if IS_IN (libc) && defined SHARED
+# define __wmemset_chk __redirect_wmemset_chk	/* While <wchar.h> is read, any __wmemset_chk token becomes __redirect_wmemset_chk.  */
+# include <wchar.h>
+# undef __wmemset_chk
+
+# define SYMBOL_NAME wmemset_chk
+# include "ifunc-wmemset.h"	/* Presumably supplies IFUNC_SELECTOR, which is not defined in this file -- confirm.  */
+
+libc_ifunc_redirected (__redirect_wmemset_chk, __wmemset_chk,
+		       IFUNC_SELECTOR ());
+#endif