about summary refs log tree commit diff
diff options
context:
space:
mode:
author H.J. Lu <hongjiu.lu@intel.com> 2011-06-24 15:14:22 -0400
committer Ulrich Drepper <drepper@gmail.com> 2011-06-24 15:14:22 -0400
commit 8912479f9ea9f56dc188d3d00c4ba4259f600661 (patch)
tree fc91331de86b054859ce0dfe3fdec2a06812aa4c
parent d5495a116c6271c0ae8f6955b64b7b010b1b341a (diff)
download glibc-8912479f9ea9f56dc188d3d00c4ba4259f600661.tar.gz
glibc-8912479f9ea9f56dc188d3d00c4ba4259f600661.tar.xz
glibc-8912479f9ea9f56dc188d3d00c4ba4259f600661.zip
Improved st{r,p}{,n}cpy for SSE2 and SSSE3 on x86-64
-rw-r--r-- ChangeLog 17
-rw-r--r-- NEWS 3
-rw-r--r-- sysdeps/x86_64/multiarch/Makefile 7
-rw-r--r-- sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/stpcpy-ssse3.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S 4
-rw-r--r-- sysdeps/x86_64/multiarch/stpncpy-ssse3.S 4
-rw-r--r-- sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S 1718
-rw-r--r-- sysdeps/x86_64/multiarch/strcpy-ssse3.S 3721
-rw-r--r-- sysdeps/x86_64/multiarch/strcpy.S 1860
-rw-r--r-- sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/strncpy-ssse3.S 3
12 files changed, 5508 insertions, 1838 deletions
diff --git a/ChangeLog b/ChangeLog
index 8bf8eebb8f..b950dccfee 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,20 @@
+2011-06-22  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+	strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3
+	strcpy-sse2-unaligned strncpy-sse2-unaligned
+	stpcpy-sse2-unaligned stpncpy-sse2-unaligned.
+	* sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S: New file.
+	* sysdeps/x86_64/multiarch/stpcpy-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S: New file.
+	* sysdeps/x86_64/multiarch/stpncpy-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: New file.
+	* sysdeps/x86_64/multiarch/strcpy-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S: New file.
+	* sysdeps/x86_64/multiarch/strncpy-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/strcpy.S: Remove strcpy with SSSE3.
+	(STRCPY): Support SSE2 and SSSE3 versions.
+
 2011-06-24  Ulrich Drepper  <drepper@gmail.com>
 
 	[BZ #12874]
diff --git a/NEWS b/NEWS
index dd280043b9..bc77d2dcf7 100644
--- a/NEWS
+++ b/NEWS
@@ -20,6 +20,9 @@ Version 2.15
 
 * Optimized strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-32.
   Contributed by HJ Lu.
+
+* Improved strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-64.
+  Contributed by HJ Lu.
 
 Version 2.14
 
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 19aa4be4cf..88410b395a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -4,12 +4,15 @@ gen-as-const-headers += ifunc-defines.sym
 endif
 
 ifeq ($(subdir),string)
+
 sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
 		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
-		   strncase_l-ssse3 strlen-sse4 strlen-no-bsf \
-		   memset-x86-64
+		   strncase_l-ssse3 strlen-sse4 strlen-no-bsf memset-x86-64 \
+		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
+		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
+		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
 CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..34231f8b46
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_sse2_unaligned
+#include "strcpy-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
new file mode 100644
index 0000000000..d971c2da38
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
new file mode 100644
index 0000000000..658520f78f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_sse2_unaligned
+#include "strcpy-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
new file mode 100644
index 0000000000..14ed16f6b5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..9a8d1860a0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
@@ -0,0 +1,1718 @@
+/* strcpy with SSE2 and unaligned load
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef STRCPY
+#  define STRCPY  __strcpy_sse2_unaligned
+# endif
+
+# define JMPTBL(I, B)	I - B
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)             \
+	lea	TABLE(%rip), %r11;                              \
+	movslq	(%r11, INDEX, SCALE), %rcx;                     \
+	lea	(%r11, %rcx), %rcx;                             \
+	jmp	*%rcx
+
+	.text
+ENTRY (STRCPY)
+# ifdef USE_AS_STRNCPY
+	mov	%rdx, %r8
+	test	%r8, %r8
+	jz	L(ExitZero)
+# endif
+	mov	%rsi, %rcx
+# ifndef USE_AS_STPCPY
+	mov	%rdi, %rax      /* save result */
+# endif
+
+	and	$15, %rcx
+	jz	L(SourceStringAlignmentZero)
+
+	and	$-16, %rsi
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	(%rsi), %xmm1
+# ifdef USE_AS_STRNCPY
+	add	%rcx, %r8
+# endif
+	pmovmskb %xmm1, %rdx
+	shr	%cl, %rdx
+# ifdef USE_AS_STRNCPY
+#  if defined USE_AS_STPCPY
+	cmp	$16, %r8
+#  else
+	cmp	$17, %r8
+#  endif
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesTail)
+
+	pcmpeqb	16(%rsi), %xmm0
+	pmovmskb %xmm0, %rdx
+# ifdef USE_AS_STRNCPY
+#  if defined USE_AS_STPCPY
+	cmp	$32, %r8
+#  else
+	cmp	$33, %r8
+#  endif
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To32Bytes)
+
+	movdqu	(%rsi, %rcx), %xmm1   /* copy 16 bytes */
+	movdqu	%xmm1, (%rdi)
+
+	sub	%rcx, %rdi
+
+/* If source address alignment != destination address alignment */
+	.p2align 4
+L(Unalign16Both):
+	mov	$16, %rcx
+	movdqa	(%rsi, %rcx), %xmm1
+	movaps	16(%rsi, %rcx), %xmm2
+	movdqu	%xmm1, (%rdi, %rcx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$16, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$48, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+# if defined USE_AS_STRNCPY
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+# else
+	jnz	L(CopyFrom1To16Bytes)
+# endif
+
+	movaps	16(%rsi, %rcx), %xmm3
+	movdqu	%xmm2, (%rdi, %rcx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$16, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+# if defined USE_AS_STRNCPY
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+# else
+	jnz	L(CopyFrom1To16Bytes)
+# endif
+
+	movaps	16(%rsi, %rcx), %xmm4
+	movdqu	%xmm3, (%rdi, %rcx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$16, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+# if defined USE_AS_STRNCPY
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+# else
+	jnz	L(CopyFrom1To16Bytes)
+# endif
+
+	movaps	16(%rsi, %rcx), %xmm1
+	movdqu	%xmm4, (%rdi, %rcx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$16, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+# if defined USE_AS_STRNCPY
+	jnz	L(CopyFrom1To16BytesUnalignedXmm1)
+# else
+	jnz	L(CopyFrom1To16Bytes)
+# endif
+
+	movaps	16(%rsi, %rcx), %xmm2
+	movdqu	%xmm1, (%rdi, %rcx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$16, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+# if defined USE_AS_STRNCPY
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+# else
+	jnz	L(CopyFrom1To16Bytes)
+# endif
+
+	movaps	16(%rsi, %rcx), %xmm3
+	movdqu	%xmm2, (%rdi, %rcx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %rdx
+	add	$16, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+# if defined USE_AS_STRNCPY
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+# else
+	jnz	L(CopyFrom1To16Bytes)
+# endif
+
+	movdqu	%xmm3, (%rdi, %rcx)
+	mov	%rsi, %rdx
+	lea	16(%rsi, %rcx), %rsi
+	and	$-0x40, %rsi
+	sub	%rsi, %rdx
+	sub	%rdx, %rdi
+# ifdef USE_AS_STRNCPY
+	lea	128(%r8, %rdx), %r8
+# endif
+L(Unaligned64Loop):
+	movaps	(%rsi), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%rsi), %xmm5
+	movaps	32(%rsi), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%rsi), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %rdx
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+	jnz	L(Unaligned64Leave)
+
+L(Unaligned64Loop_start):
+	add	$64, %rdi
+	add	$64, %rsi
+	movdqu	%xmm4, -64(%rdi)
+	movaps	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%rdi)
+	movaps	16(%rsi), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%rsi), %xmm3
+	movdqu	%xmm6, -32(%rdi)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%rdi)
+	movaps	48(%rsi), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %rdx
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%rdx, %rdx
+	jz	L(Unaligned64Loop_start)
+
+L(Unaligned64Leave):
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	%xmm4, %xmm0
+	pcmpeqb	%xmm5, %xmm1
+	pmovmskb %xmm0, %rdx
+	pmovmskb %xmm1, %rcx
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesUnaligned_0)
+	test	%rcx, %rcx
+	jnz	L(CopyFrom1To16BytesUnaligned_16)
+
+	pcmpeqb	%xmm6, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pmovmskb %xmm0, %rdx
+	pmovmskb %xmm1, %rcx
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesUnaligned_32)
+
+	bsf	%rcx, %rdx
+	movdqu	%xmm4, (%rdi)
+	movdqu	%xmm5, 16(%rdi)
+	movdqu	%xmm6, 32(%rdi)
+# if defined USE_AS_STRNCPY
+#  ifdef USE_AS_STPCPY
+	lea	48(%rdi, %rdx), %rax
+#  endif
+	movdqu	%xmm7, 48(%rdi)
+	add	$15, %r8
+	sub	%rdx, %r8
+	lea	49(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	add	$48, %rsi
+	add	$48, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentZero):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %rdx
+
+# ifdef USE_AS_STRNCPY
+#  if defined USE_AS_STPCPY
+	cmp	$16, %r8
+#  else
+	cmp	$17, %r8
+#  endif
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+# endif
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesTail1)
+
+	pcmpeqb	16(%rsi), %xmm0
+	movdqu	%xmm1, (%rdi)
+	pmovmskb %xmm0, %rdx
+
+# ifdef USE_AS_STRNCPY
+#  if defined USE_AS_STPCPY
+	cmp	$32, %r8
+#  else
+	cmp	$33, %r8
+#  endif
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To32Bytes1)
+	jmp	L(Unalign16Both)
+
+/* ------End of main part with loops--------------------- */
+
+/* Case1 */
+
+# if (!defined USE_AS_STRNCPY)
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%rcx, %rdi
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+	.p2align 4
+L(CopyFrom1To16BytesTail):
+# if defined USE_AS_STRNCPY
+	sub	%rcx, %r8
+# endif
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1):
+	add	$16, %rsi
+	add	$16, %rdi
+# if defined USE_AS_STRNCPY
+	sub	$16, %r8
+# endif
+L(CopyFrom1To16BytesTail1):
+	bsf	%rdx, %rdx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes):
+# if defined USE_AS_STRNCPY
+	sub	%rcx, %r8
+# endif
+	bsf	%rdx, %rdx
+	add	%rcx, %rsi
+	add	$16, %rdx
+	sub	%rcx, %rdx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+	bsf	%rdx, %rdx
+# if defined USE_AS_STRNCPY
+#  ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax
+#  endif
+	movdqu	%xmm4, (%rdi)
+	add	$63, %r8
+	sub	%rdx, %r8
+	lea	1(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+	bsf	%rcx, %rdx
+	movdqu	%xmm4, (%rdi)
+# if defined USE_AS_STRNCPY
+#  ifdef USE_AS_STPCPY
+	lea	16(%rdi, %rdx), %rax
+#  endif
+	movdqu	%xmm5, 16(%rdi)
+	add	$47, %r8
+	sub	%rdx, %r8
+	lea	17(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	add	$16, %rsi
+	add	$16, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+	bsf	%rdx, %rdx
+	movdqu	%xmm4, (%rdi)
+	movdqu	%xmm5, 16(%rdi)
+# if defined USE_AS_STRNCPY
+#  ifdef USE_AS_STPCPY
+	lea	32(%rdi, %rdx), %rax
+#  endif
+	movdqu	%xmm6, 32(%rdi)
+	add	$31, %r8
+	sub	%rdx, %r8
+	lea	33(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	add	$32, %rsi
+	add	$32, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+# ifdef USE_AS_STRNCPY
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):
+	movdqu	%xmm6, (%rdi, %rcx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+	movdqu	%xmm5, (%rdi, %rcx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+	movdqu	%xmm4, (%rdi, %rcx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+	movdqu	%xmm3, (%rdi, %rcx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+	movdqu	%xmm1, (%rdi, %rcx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesExit):
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+/* Case2 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %r8
+	add	%rcx, %rdi
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	cmp	%r8, %rdx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2):
+	sub	%rcx, %r8
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	add	$16, %rdx
+	sub	%rcx, %rdx
+	cmp	%r8, %rdx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+	sub	%rcx, %r8
+	add	%rcx, %rsi
+	bsf	%rdx, %rdx
+	cmp	%r8, %rdx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+	bsf	%rdx, %rdx
+	cmp	%r8, %rdx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+/* Case2 or Case3,  Case3 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+	add	$16, %r8
+	add	%rcx, %rdi
+	add	%rcx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To32BytesCase2)
+	sub	%rcx, %r8
+	add	%rcx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesTailCase2)
+	sub	%rcx, %r8
+	add	%rcx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+	add	$16, %rdi
+	add	$16, %rsi
+	sub	$16, %r8
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesTail1Case2)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+# endif
+
+/* ----End of labels for copying 1-16 bytes--and 1-32 bytes---- */
+
+	.p2align 4
+L(Exit1):
+	mov	%dh, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$1, %r8
+	lea	1(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit2):
+	mov	(%rsi), %dx
+	mov	%dx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	1(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$2, %r8
+	lea	2(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit3):
+	mov	(%rsi), %cx
+	mov	%cx, (%rdi)
+	mov	%dh, 2(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	2(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$3, %r8
+	lea	3(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit4):
+	mov	(%rsi), %edx
+	mov	%edx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	3(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$4, %r8
+	lea	4(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit5):
+	mov	(%rsi), %ecx
+	mov	%dh, 4(%rdi)
+	mov	%ecx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	4(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$5, %r8
+	lea	5(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit6):
+	mov	(%rsi), %ecx
+	mov	4(%rsi), %dx
+	mov	%ecx, (%rdi)
+	mov	%dx, 4(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	5(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$6, %r8
+	lea	6(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit7):
+	mov	(%rsi), %ecx
+	mov	3(%rsi), %edx
+	mov	%ecx, (%rdi)
+	mov	%edx, 3(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	6(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$7, %r8
+	lea	7(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit8):
+	mov	(%rsi), %rdx
+	mov	%rdx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	7(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$8, %r8
+	lea	8(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit9):
+	mov	(%rsi), %rcx
+	mov	%dh, 8(%rdi)
+	mov	%rcx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	8(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$9, %r8
+	lea	9(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit10):
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %dx
+	mov	%rcx, (%rdi)
+	mov	%dx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	9(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$10, %r8
+	lea	10(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit11):
+	mov	(%rsi), %rcx
+	mov	7(%rsi), %edx
+	mov	%rcx, (%rdi)
+	mov	%edx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	10(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$11, %r8
+	lea	11(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit12):
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %edx
+	mov	%rcx, (%rdi)
+	mov	%edx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	11(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$12, %r8
+	lea	12(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit13):
+	mov	(%rsi), %rcx
+	mov	5(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 5(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	12(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$13, %r8
+	lea	13(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit14):
+	mov	(%rsi), %rcx
+	mov	6(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 6(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	13(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$14, %r8
+	lea	14(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit15):
+	mov	(%rsi), %rcx
+	mov	7(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	14(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$15, %r8
+	lea	15(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit16):
+	movdqu	(%rsi), %xmm0
+	movdqu	%xmm0, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	15(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$16, %r8
+	lea	16(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit17):
+	movdqu	(%rsi), %xmm0
+	movdqu	%xmm0, (%rdi)
+	mov	%dh, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	16(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$17, %r8
+	lea	17(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit18):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %cx
+	movdqu	%xmm0, (%rdi)
+	mov	%cx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	17(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$18, %r8
+	lea	18(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit19):
+	movdqu	(%rsi), %xmm0
+	mov	15(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	18(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$19, %r8
+	lea	19(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit20):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	19(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$20, %r8
+	lea	20(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit21):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 16(%rdi)
+	mov	%dh, 20(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	20(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$21, %r8
+	lea	21(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit22):
+	movdqu	(%rsi), %xmm0
+	mov	14(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 14(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	21(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$22, %r8
+	lea	22(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit23):
+	movdqu	(%rsi), %xmm0
+	mov	15(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	22(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$23, %r8
+	lea	23(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit24):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	23(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$24, %r8
+	lea	24(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit25):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 16(%rdi)
+	mov	%dh, 24(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	24(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$25, %r8
+	lea	25(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit26):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %cx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%cx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	25(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$26, %r8
+	lea	26(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit27):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	23(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%ecx, 23(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	26(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$27, %r8
+	lea	27(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit28):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%ecx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	27(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$28, %r8
+	lea	28(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit29):
+	movdqu	(%rsi), %xmm0
+	movdqu	13(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 13(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	28(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$29, %r8
+	lea	29(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit30):
+	movdqu	(%rsi), %xmm0
+	movdqu	14(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 14(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	29(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$30, %r8
+	lea	30(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit31):
+	movdqu	(%rsi), %xmm0
+	movdqu	15(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 15(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	30(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$31, %r8
+	lea	31(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit32):
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	31(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+	sub	$32, %r8
+	lea	32(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+# ifdef USE_AS_STRNCPY
+
+	.p2align 4
+L(StrncpyExit0):
+# ifdef USE_AS_STPCPY
+	mov	%rdi, %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit1):
+	mov	(%rsi), %dl
+	mov	%dl, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	1(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit2):
+	mov	(%rsi), %dx
+	mov	%dx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	2(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit3):
+	mov	(%rsi), %cx
+	mov	2(%rsi), %dl
+	mov	%cx, (%rdi)
+	mov	%dl, 2(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	3(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit4):
+	mov	(%rsi), %edx
+	mov	%edx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	4(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit5):
+	mov	(%rsi), %ecx
+	mov	4(%rsi), %dl
+	mov	%ecx, (%rdi)
+	mov	%dl, 4(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	5(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit6):
+	mov	(%rsi), %ecx
+	mov	4(%rsi), %dx
+	mov	%ecx, (%rdi)
+	mov	%dx, 4(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	6(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit7):
+	mov	(%rsi), %ecx
+	mov	3(%rsi), %edx
+	mov	%ecx, (%rdi)
+	mov	%edx, 3(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	7(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit8):
+	mov	(%rsi), %rdx
+	mov	%rdx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	8(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit9):
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %dl
+	mov	%rcx, (%rdi)
+	mov	%dl, 8(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	9(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit10):
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %dx
+	mov	%rcx, (%rdi)
+	mov	%dx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	10(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit11):
+	mov	(%rsi), %rcx
+	mov	7(%rsi), %edx
+	mov	%rcx, (%rdi)
+	mov	%edx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	11(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit12):
+	mov	(%rsi), %rcx
+	mov	8(%rsi), %edx
+	mov	%rcx, (%rdi)
+	mov	%edx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	12(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit13):
+	mov	(%rsi), %rcx
+	mov	5(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 5(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	13(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit14):
+	mov	(%rsi), %rcx
+	mov	6(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 6(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	14(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit15):
+	mov	(%rsi), %rcx
+	mov	7(%rsi), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	15(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit16):
+	movdqu	(%rsi), %xmm0
+	movdqu	%xmm0, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	16(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit17):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %cl
+	movdqu	%xmm0, (%rdi)
+	mov	%cl, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	17(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit18):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %cx
+	movdqu	%xmm0, (%rdi)
+	mov	%cx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	18(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit19):
+	movdqu	(%rsi), %xmm0
+	mov	15(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	19(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit20):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	20(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit21):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %ecx
+	mov	20(%rsi), %dl
+	movdqu	%xmm0, (%rdi)
+	mov	%ecx, 16(%rdi)
+	mov	%dl, 20(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	21(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit22):
+	movdqu	(%rsi), %xmm0
+	mov	14(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 14(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	22(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit23):
+	movdqu	(%rsi), %xmm0
+	mov	15(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	23(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit24):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rcx
+	movdqu	%xmm0, (%rdi)
+	mov	%rcx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	24(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit25):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %cl
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%cl, 24(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	25(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit26):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %cx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%cx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	26(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit27):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	23(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%ecx, 23(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	27(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit28):
+	movdqu	(%rsi), %xmm0
+	mov	16(%rsi), %rdx
+	mov	24(%rsi), %ecx
+	movdqu	%xmm0, (%rdi)
+	mov	%rdx, 16(%rdi)
+	mov	%ecx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	28(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit29):
+	movdqu	(%rsi), %xmm0
+	movdqu	13(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 13(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	29(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit30):
+	movdqu	(%rsi), %xmm0
+	movdqu	14(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 14(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	30(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit31):
+	movdqu	(%rsi), %xmm0
+	movdqu	15(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 15(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	31(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit32):
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm2
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 16(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	32(%rdi), %rax
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit33):
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm2
+	mov	32(%rsi), %cl
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm2, 16(%rdi)
+	mov	%cl, 32(%rdi)
+	ret
+
+	.p2align 4
+L(Fill0):	/* L(FillN), N = 0..16: store N bytes taken from %rdx / %xmm0 at (%rdi) and return.  L(StrncpyFillTailWithZero) clears both registers before dispatching here, so on that path these write zero padding.  */
+	ret	/* nothing left to write */
+
+	.p2align 4
+L(Fill1):
+	mov	%dl, (%rdi)
+	ret
+
+	.p2align 4
+L(Fill2):
+	mov	%dx, (%rdi)
+	ret
+
+	.p2align 4
+L(Fill3):
+	mov	%edx, -1(%rdi)	/* one 4-byte store covering rdi-1 .. rdi+2; presumably the byte at rdi-1 already holds the same value (zero) -- confirm against callers */
+	ret
+
+	.p2align 4
+L(Fill4):
+	mov	%edx, (%rdi)
+	ret
+
+	.p2align 4
+L(Fill5):
+	mov	%edx, (%rdi)
+	mov	%dl, 4(%rdi)
+	ret
+
+	.p2align 4
+L(Fill6):
+	mov	%edx, (%rdi)
+	mov	%dx, 4(%rdi)
+	ret
+
+	.p2align 4
+L(Fill7):
+	mov	%rdx, -1(%rdi)	/* one 8-byte store covering rdi-1 .. rdi+6 (same assumption as L(Fill3)) */
+	ret
+
+	.p2align 4
+L(Fill8):
+	mov	%rdx, (%rdi)
+	ret
+
+	.p2align 4
+L(Fill9):
+	mov	%rdx, (%rdi)
+	mov	%dl, 8(%rdi)
+	ret
+
+	.p2align 4
+L(Fill10):
+	mov	%rdx, (%rdi)
+	mov	%dx, 8(%rdi)
+	ret
+
+	.p2align 4
+L(Fill11):
+	mov	%rdx, (%rdi)
+	mov	%edx, 7(%rdi)	/* bytes 7..10; overlaps byte 7 of the first store */
+	ret
+
+	.p2align 4
+L(Fill12):
+	mov	%rdx, (%rdi)
+	mov	%edx, 8(%rdi)
+	ret
+
+	.p2align 4
+L(Fill13):
+	mov	%rdx, (%rdi)
+	mov	%rdx, 5(%rdi)	/* bytes 5..12; overlaps bytes 5..7 of the first store */
+	ret
+
+	.p2align 4
+L(Fill14):
+	mov	%rdx, (%rdi)
+	mov	%rdx, 6(%rdi)	/* bytes 6..13; overlaps bytes 6..7 of the first store */
+	ret
+
+	.p2align 4
+L(Fill15):
+	movdqu	%xmm0, -1(%rdi)	/* one 16-byte store covering rdi-1 .. rdi+14 (same assumption as L(Fill3)) */
+	ret
+
+	.p2align 4
+L(Fill16):
+	movdqu	%xmm0, (%rdi)
+	ret
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm2):	/* Store the 16 bytes held in %xmm2 at %rdi + %rcx, then fall through into L(CopyFrom1To16BytesXmmExit).  */
+	movdqu	%xmm2, (%rdi, %rcx)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmmExit):	/* %rdx holds a byte mask on entry; convert it to the index of its lowest set bit, adjust %r8/%rdi, then fall through into L(StrncpyFillTailWithZero).  */
+	bsf	%rdx, %rdx	/* rdx = index of the lowest set mask bit */
+	add	$15, %r8
+	add	%rcx, %rdi	/* rdi = start of the 16-byte block that was just handled */
+# ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax	/* rax = address of the byte selected by the mask */
+# endif
+	sub	%rdx, %r8	/* r8 = r8 + 15 - index */
+	lea	1(%rdi, %rdx), %rdi	/* rdi = first byte after the selected one */
+
+	.p2align 4
+L(StrncpyFillTailWithZero):	/* Write %r8 zero bytes starting at %rdi.  BRANCH_TO_JMPTBL_ENTRY is a macro defined outside this view; presumably it jumps through the given table at index %r8 -- confirm.  */
+	pxor	%xmm0, %xmm0	/* xmm0 = 0, the 16-byte fill pattern */
+	xor	%rdx, %rdx	/* rdx = 0, the fill pattern used by the L(FillN) tails */
+	sub	$16, %r8
+	jbe	L(StrncpyFillExit)	/* 16 bytes or fewer: only a tail store is needed */
+
+	movdqu	%xmm0, (%rdi)	/* first 16 bytes, unaligned */
+	add	$16, %rdi
+
+	mov	%rdi, %rsi
+	and	$0xf, %rsi	/* rsi = misalignment of rdi */
+	sub	%rsi, %rdi	/* round rdi down to a 16-byte boundary ... */
+	add	%rsi, %r8	/* ... and count the re-covered bytes again */
+	sub	$64, %r8
+	jb	L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):	/* aligned 64-byte chunks while at least 64 bytes remain */
+	movdqa	%xmm0, (%rdi)
+	movdqa	%xmm0, 16(%rdi)
+	movdqa	%xmm0, 32(%rdi)
+	movdqa	%xmm0, 48(%rdi)
+	add	$64, %rdi
+	sub	$64, %r8
+	jae	L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):	/* here r8 = remaining - 64 (negative) */
+	add	$32, %r8
+	jl	L(StrncpyFillLess32)	/* fewer than 32 bytes remain */
+	movdqa	%xmm0, (%rdi)
+	movdqa	%xmm0, 16(%rdi)
+	add	$32, %rdi
+	sub	$16, %r8
+	jl	L(StrncpyFillExit)	/* fewer than 16 bytes remain after the 32 just written */
+	movdqa	%xmm0, (%rdi)
+	add	$16, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+L(StrncpyFillLess32):	/* here r8 = remaining - 32 (negative) */
+	add	$16, %r8
+	jl	L(StrncpyFillExit)	/* fewer than 16 bytes remain */
+	movdqa	%xmm0, (%rdi)
+	add	$16, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+L(StrncpyFillExit):	/* here r8 = remaining - 16; undo the bias and let L(FillN) write the tail */
+	add	$16, %r8
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+	.p2align 4
+L(UnalignedLeaveCase2OrCase3):	/* Choose by the mask in %rdx: non-zero goes to L(Unaligned64LeaveCase2), zero falls into L(Unaligned64LeaveCase3).  */
+	test	%rdx, %rdx
+	jnz	L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):	/* mask is empty: store %xmm4..%xmm7 at (%rdi) one block at a time while %r8 allows */
+	lea	64(%r8), %rcx
+	and	$-16, %rcx	/* rcx = (r8 + 64) rounded down to a multiple of 16 */
+	add	$48, %r8
+	jl	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm4, (%rdi)
+	sub	$16, %r8
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm5, 16(%rdi)
+	sub	$16, %r8
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm6, 32(%rdi)
+	sub	$16, %r8
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm7, 48(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	64(%rdi), %rax	/* %rax = %rdi + 64 */
+# endif
+	ret
+
+	.p2align 4
+L(Unaligned64LeaveCase2):	/* Test %xmm4..%xmm7 in turn against %xmm0 (assumed all-zero on entry -- confirm), storing each block that passes; %rcx tracks the block offset.  */
+	xor	%rcx, %rcx	/* block offset = 0 */
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %rdx	/* rdx = per-byte equality mask for xmm4 */
+	add	$48, %r8
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+
+	pcmpeqb	%xmm5, %xmm0	/* the previous mask was empty, so xmm0 is unchanged from its entry value */
+	pmovmskb %xmm0, %rdx
+	movdqu	%xmm4, (%rdi)
+	add	$16, %rcx
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm5)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %rdx
+	movdqu	%xmm5, 16(%rdi)
+	add	$16, %rcx
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%rdx, %rdx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm6)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %rdx
+	movdqu	%xmm6, 32(%rdi)
+	lea	16(%rdi, %rcx), %rdi	/* rcx is 32 here: advance rdi and rsi by 48 to the last block */
+	lea	16(%rsi, %rcx), %rsi
+	bsf	%rdx, %rdx	/* rdx = index of the lowest set mask bit */
+	cmp	%r8, %rdx
+	jb	L(CopyFrom1To16BytesExit)	/* index < r8 */
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+	.p2align 4
+L(ExitZero):	/* Return immediately with %rax = %rdi; no bytes are written.  */
+	mov	%rdi, %rax
+	ret
+
+# endif
+
+END (STRCPY)
+
+	.section .rodata
+	.p2align 4	/* must follow the section switch: issued before it, the directive only pads the text section and leaves the table unaligned */
+L(ExitTable):	/* One .int per L(ExitN) label, N = 1..32, so entry i refers to L(Exit(i+1)).  JMPTBL is defined outside this view; presumably it yields the target's offset relative to the table base -- confirm.  */
+	.int	JMPTBL(L(Exit1), L(ExitTable))
+	.int	JMPTBL(L(Exit2), L(ExitTable))
+	.int	JMPTBL(L(Exit3), L(ExitTable))
+	.int	JMPTBL(L(Exit4), L(ExitTable))
+	.int	JMPTBL(L(Exit5), L(ExitTable))
+	.int	JMPTBL(L(Exit6), L(ExitTable))
+	.int	JMPTBL(L(Exit7), L(ExitTable))
+	.int	JMPTBL(L(Exit8), L(ExitTable))
+	.int	JMPTBL(L(Exit9), L(ExitTable))
+	.int	JMPTBL(L(Exit10), L(ExitTable))
+	.int	JMPTBL(L(Exit11), L(ExitTable))
+	.int	JMPTBL(L(Exit12), L(ExitTable))
+	.int	JMPTBL(L(Exit13), L(ExitTable))
+	.int	JMPTBL(L(Exit14), L(ExitTable))
+	.int	JMPTBL(L(Exit15), L(ExitTable))
+	.int	JMPTBL(L(Exit16), L(ExitTable))
+	.int	JMPTBL(L(Exit17), L(ExitTable))
+	.int	JMPTBL(L(Exit18), L(ExitTable))
+	.int	JMPTBL(L(Exit19), L(ExitTable))
+	.int	JMPTBL(L(Exit20), L(ExitTable))
+	.int	JMPTBL(L(Exit21), L(ExitTable))
+	.int	JMPTBL(L(Exit22), L(ExitTable))
+	.int	JMPTBL(L(Exit23), L(ExitTable))
+	.int	JMPTBL(L(Exit24), L(ExitTable))
+	.int	JMPTBL(L(Exit25), L(ExitTable))
+	.int	JMPTBL(L(Exit26), L(ExitTable))
+	.int	JMPTBL(L(Exit27), L(ExitTable))
+	.int	JMPTBL(L(Exit28), L(ExitTable))
+	.int	JMPTBL(L(Exit29), L(ExitTable))
+	.int	JMPTBL(L(Exit30), L(ExitTable))
+	.int	JMPTBL(L(Exit31), L(ExitTable))
+	.int	JMPTBL(L(Exit32), L(ExitTable))
+# ifdef USE_AS_STRNCPY
+L(ExitStrncpyTable):	/* One .int per L(StrncpyExitN) label, N = 0..33, so entry i refers to L(StrncpyExit(i)).  JMPTBL is defined outside this view; presumably it yields the target's offset relative to the table base -- confirm.  */
+	.int	JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+	.int    JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+	.p2align 4
+L(FillTable):	/* One .int per L(FillN) label, N = 0..16, so entry i refers to L(Fill(i)), which stores i bytes.  JMPTBL is defined outside this view; presumably it yields the target's offset relative to the table base -- confirm.  */
+	.int	JMPTBL(L(Fill0), L(FillTable))
+	.int	JMPTBL(L(Fill1), L(FillTable))
+	.int	JMPTBL(L(Fill2), L(FillTable))
+	.int	JMPTBL(L(Fill3), L(FillTable))
+	.int	JMPTBL(L(Fill4), L(FillTable))
+	.int	JMPTBL(L(Fill5), L(FillTable))
+	.int	JMPTBL(L(Fill6), L(FillTable))
+	.int	JMPTBL(L(Fill7), L(FillTable))
+	.int	JMPTBL(L(Fill8), L(FillTable))
+	.int	JMPTBL(L(Fill9), L(FillTable))
+	.int	JMPTBL(L(Fill10), L(FillTable))
+	.int	JMPTBL(L(Fill11), L(FillTable))
+	.int	JMPTBL(L(Fill12), L(FillTable))
+	.int	JMPTBL(L(Fill13), L(FillTable))
+	.int	JMPTBL(L(Fill14), L(FillTable))
+	.int	JMPTBL(L(Fill15), L(FillTable))
+	.int	JMPTBL(L(Fill16), L(FillTable))
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
new file mode 100644
index 0000000000..efbd3bfccb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
@@ -0,0 +1,3721 @@
+/* strcpy with SSSE3
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef STRCPY
+#  define STRCPY  __strcpy_ssse3
+# endif
+
+	.section .text.ssse3,"ax",@progbits
+ENTRY (STRCPY)
+	mov	%rsi, %rcx	/* rcx = source pointer (2nd argument in the SysV AMD64 ABI) */
+# ifdef USE_AS_STRNCPY
+	mov	%rdx, %r8	/* r8 = byte count n (3rd argument) */
+# endif
+	mov	%rdi, %rdx	/* rdx = destination pointer (1st argument) */
+# ifdef USE_AS_STRNCPY
+	test	%r8, %r8
+	jz	L(Exit0)	/* n == 0 */
+	cmp	$8, %r8
+	jbe	L(StrncpyExit8Bytes)	/* n <= 8 is handled separately */
+# endif
+	cmpb	$0, (%rcx)	/* test source bytes 0..7 one at a time; a zero at offset k leaves through L(Exit(k+1)) */
+	jz	L(Exit1)
+	cmpb	$0, 1(%rcx)
+	jz	L(Exit2)
+	cmpb	$0, 2(%rcx)
+	jz	L(Exit3)
+	cmpb	$0, 3(%rcx)
+	jz	L(Exit4)
+	cmpb	$0, 4(%rcx)
+	jz	L(Exit5)
+	cmpb	$0, 5(%rcx)
+	jz	L(Exit6)
+	cmpb	$0, 6(%rcx)
+	jz	L(Exit7)
+	cmpb	$0, 7(%rcx)
+	jz	L(Exit8)
+# ifdef USE_AS_STRNCPY
+	cmp	$16, %r8
+	jb	L(StrncpyExit15Bytes)	/* n < 16 */
+# endif
+	cmpb	$0, 8(%rcx)	/* same test for source bytes 8..14 */
+	jz	L(Exit9)
+	cmpb	$0, 9(%rcx)
+	jz	L(Exit10)
+	cmpb	$0, 10(%rcx)
+	jz	L(Exit11)
+	cmpb	$0, 11(%rcx)
+	jz	L(Exit12)
+	cmpb	$0, 12(%rcx)
+	jz	L(Exit13)
+	cmpb	$0, 13(%rcx)
+	jz	L(Exit14)
+	cmpb	$0, 14(%rcx)
+	jz	L(Exit15)
+# ifdef USE_AS_STRNCPY
+	cmp	$16, %r8
+	je	L(Exit16)	/* n == 16: take the 16-byte exit without testing byte 15 */
+# endif
+	cmpb	$0, 15(%rcx)
+	jz	L(Exit16)
+
+# ifdef USE_AS_STRNCPY
+	mov	%rcx, %rsi
+	and	$0xf, %rsi
+
+/* add rcx_shift (= rcx & 0xf) to r8 */
+
+	add	%rsi, %r8
+# endif
+	lea	16(%rcx), %rsi
+/* Now:
+	rsi	= alignment_16(rcx) + rcx_shift + 16;
+	rcx_shift = rcx - alignment_16(rcx)
+*/
+	and	$-16, %rsi
+/* Now:
+	rsi	= alignment_16(rcx) + 16
+*/
+	pxor	%xmm0, %xmm0	/* xmm0 = 0, the comparison pattern for the terminator */
+	mov	(%rcx), %r9	/* copy source bytes 0..7 (already known to be non-zero) */
+	mov	%r9, (%rdx)
+/*
+	look	if there is zero symbol in next 16 bytes of string
+	from	rsi to rsi + 15 and form mask in xmm0
+*/
+	pcmpeqb	(%rsi), %xmm0
+	mov	8(%rcx), %r9	/* copy source bytes 8..15 */
+	mov	%r9, 8(%rdx)
+
+/* convert byte mask in xmm0 to bit mask */
+
+	pmovmskb %xmm0, %rax
+	sub	%rcx, %rsi
+
+/* rsi = 16 - rcx_shift */
+
+/* rax = 0: there isn't end of string from position rsi to rsi+15 */
+
+# ifdef USE_AS_STRNCPY
+	sub	$32, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%rdx, %rax
+	lea	16(%rdx), %rdx
+/* Now:
+	rdx	= rdx + 16 = alignment_16(rdx) + rdx_shift + 16
+*/
+	and	$-16, %rdx
+
+/* Now: rdx = alignment_16(rdx) + 16 */
+
+	sub	%rdx, %rax
+
+/* Now: rax = rdx_shift - 16 */
+
+# ifdef USE_AS_STRNCPY
+	add	%rax, %rsi	/* rsi = rdx_shift - rcx_shift */
+	lea	-1(%rsi), %rsi
+	and	$1<<31, %esi	/* keep only bit 31 of (rdx_shift - rcx_shift - 1): set exactly when rdx_shift <= rcx_shift */
+	test	%rsi, %rsi
+	jnz	L(ContinueCopy)
+	lea	16(%r8), %r8	/* rdx_shift > rcx_shift: rcx ends up 16 bytes lower in that case (see below), so add 16 to r8 */
+
+L(ContinueCopy):
+# endif
+	sub	%rax, %rcx
+/* Now:
+	case	rcx_shift >= rdx_shift:
+	rcx	= alignment_16(rcx) + (rcx_shift  - rdx_shift) + 16
+	case	rcx_shift < rdx_shift:
+	rcx	= alignment_16(rcx) + (16 + rcx_shift  - rdx_shift)
+*/
+	mov	%rcx, %rax
+	and	$0xf, %rax
+/* Now:
+	case	rcx_shift >= rdx_shift: rax = rcx_shift  - rdx_shift
+	case	rcx_shift < rdx_shift: rax = (16 + rcx_shift  - rdx_shift)
+	rax	can be 0, 1,	..., 15
+*/
+	mov	$0, %rsi	/* mov leaves the flags alone: the jz below still tests the result of the and above */
+
+/* case: rcx_shift == rdx_shift */
+
+	jz	L(Align16Both)
+
+	cmp	$8, %rax	/* dispatch on the relative misalignment rax = 1..15 */
+	jae	L(ShlHigh8)
+	cmp	$1, %rax
+	je	L(Shl1)
+	cmp	$2, %rax
+	je	L(Shl2)
+	cmp	$3, %rax
+	je	L(Shl3)
+	cmp	$4, %rax
+	je	L(Shl4)
+	cmp	$5, %rax
+	je	L(Shl5)
+	cmp	$6, %rax
+	je	L(Shl6)
+	jmp	L(Shl7)
+
+L(ShlHigh8):	/* Second half of the misalignment dispatch: %rax is in 8..15 here.  */
+	je	L(Shl8)	/* still uses the flags of the "cmp $8, %rax" executed just before the jump here */
+	cmp	$9, %rax
+	je	L(Shl9)
+	cmp	$10, %rax
+	je	L(Shl10)
+	cmp	$11, %rax
+	je	L(Shl11)
+	cmp	$12, %rax
+	je	L(Shl12)
+	cmp	$13, %rax
+	je	L(Shl13)
+	cmp	$14, %rax
+	je	L(Shl14)
+	jmp	L(Shl15)	/* only 15 is left */
+
+L(Align16Both):	/* %rcx and %rdx are both 16-byte aligned and %rsi is 0: copy with aligned loads/stores, testing each block for a zero byte before it is stored.  */
+	movaps	(%rcx), %xmm1
+	movaps	16(%rcx), %xmm2
+	movaps	%xmm1, (%rdx)
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 (all-zero on this path) becomes the ==0 mask of the next block */
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi	/* rsi = running offset of the block just tested */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm3
+	movaps	%xmm2, (%rdx, %rsi)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm4
+	movaps	%xmm3, (%rdx, %rsi)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm1
+	movaps	%xmm4, (%rdx, %rsi)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm2
+	movaps	%xmm1, (%rdx, %rsi)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm3
+	movaps	%xmm2, (%rdx, %rsi)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%rdx, %rsi)
+	mov	%rcx, %rax
+	lea	16(%rcx, %rsi), %rcx
+	and	$-0x40, %rcx	/* rcx = next unread source position rounded down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = old rcx - new rcx */
+	sub	%rax, %rdx	/* move the destination by the same distance as the source */
+# ifdef USE_AS_STRNCPY
+	lea	48+64(%r8, %rax), %r8	/* r8 = r8 + rax + 112 */
+# endif
+	mov	$-0x40, %rsi	/* rsi = -64; falls through into L(Aligned64Loop) */
+
+L(Aligned64Loop):	/* Main aligned loop: load 64 source bytes, test all four blocks for a zero byte at once, and store them only if none is found.  */
+	movaps	(%rcx), %xmm2
+	movaps	%xmm2, %xmm4	/* xmm4..xmm7 keep the four unmodified blocks */
+	movaps	16(%rcx), %xmm5
+	movaps	32(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%rcx), %xmm7
+	pminub	%xmm5, %xmm2	/* fold with unsigned byte minimum: a lane is 0 iff it is 0 in at least one block */
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3	/* compare the folded minimum with xmm0 (all-zero on this path) */
+	pmovmskb %xmm3, %rax
+	lea	64(%rdx), %rdx	/* pointers are advanced before the stores, hence the negative offsets below */
+	lea	64(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeaveCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%rdx)
+	movaps	%xmm5, -48(%rdx)
+	movaps	%xmm6, -32(%rdx)
+	movaps	%xmm7, -16(%rdx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):	/* A zero byte lies somewhere in %xmm4..%xmm7: find the block that holds it, storing the blocks before it at -64(%rdx) onwards and advancing %rsi by 16 per block.  */
+# ifdef USE_AS_STRNCPY
+	lea	48(%r8), %r8
+# endif
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %rax
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%r8), %r8
+# endif
+	pmovmskb %xmm0, %rax
+	movaps	%xmm4, -64(%rdx)
+	test	%rax, %rax
+	lea	16(%rsi), %rsi	/* lea does not modify the flags set by test */
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%r8), %r8
+# endif
+	pmovmskb %xmm0, %rax
+	movaps	%xmm5, -48(%rdx)
+	test	%rax, %rax
+	lea	16(%rsi), %rsi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%rdx)
+	pcmpeqb	%xmm7, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%r8), %r8
+# endif
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+	jmp	L(CopyFrom1To16Bytes)	/* unconditional: the first three blocks were clean, so the zero byte found by the loop is in xmm7 */
+
+	.p2align 4
+L(Shl1):	/* Reached from the entry dispatch when (%rcx & 15) == 1 with %rdx 16-byte aligned: copy using aligned loads merged by palignr $1.  */
+	movaps	-1(%rcx), %xmm1	/* aligned block that contains the byte at %rcx */
+	movaps	15(%rcx), %xmm2	/* next aligned block */
+L(Shl1Start):	/* also re-entered from L(Shl1LoopStart) when a 64-byte group contains a zero byte */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 (all-zero on this path) becomes the ==0 mask of xmm2 */
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3	/* keep an unshifted copy; it becomes xmm1 for the next step */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2	/* xmm2 = the 16 source bytes starting at %rcx */
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	31(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	31(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	31(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	31(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source position down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back */
+	lea	-15(%rcx), %rcx	/* restore the 1-byte skew used by the loop's load offsets */
+	sub	%rax, %rdx	/* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and count those bytes again */
+# endif
+	movaps	-1(%rcx), %xmm1	/* aligned block preceding the loop's first block; falls through into L(Shl1LoopStart) */
+
+L(Shl1LoopStart):	/* Main loop for a 1-byte skew: test four aligned source blocks at once and store 64 realigned bytes; %xmm1 carries the previous block.  */
+	movaps	15(%rcx), %xmm2
+	movaps	31(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	47(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	63(%rcx), %xmm5
+	pminub	%xmm2, %xmm6	/* fold with unsigned byte minimum: a lane is 0 iff it is 0 in at least one block */
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7	/* compare the folded minimum with xmm0 (all-zero on this path) */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* unshifted last block; becomes xmm1 for the next iteration */
+	palignr	$1, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$1, %xmm3, %xmm4	/* palignr does not touch the flags set by test */
+	jnz	L(Shl1Start)	/* zero byte in this group: redo it 16 bytes at a time (xmm1 and xmm2 are still unmodified) */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave1)
+# endif
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl1LoopStart)
+
+L(Shl1LoopExit):	/* Write the 15 source bytes still held in %xmm1 to (%rdx) while keeping destination byte 15, then continue at L(CopyFrom1To16Bytes) with %rsi = 15.  */
+	movaps	(%rdx), %xmm6	/* current destination contents */
+	psrldq	$15, %xmm6	/* move destination byte 15 down to lane 0 */
+	mov	$15, %rsi
+	palignr	$1, %xmm1, %xmm6	/* xmm6 = xmm1 bytes 1..15 followed by the saved destination byte */
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl2):	/* Reached from the entry dispatch when (%rcx & 15) == 2 with %rdx 16-byte aligned: copy using aligned loads merged by palignr $2.  */
+	movaps	-2(%rcx), %xmm1	/* aligned block that contains the byte at %rcx */
+	movaps	14(%rcx), %xmm2	/* next aligned block */
+L(Shl2Start):	/* also re-entered from L(Shl2LoopStart) when a 64-byte group contains a zero byte */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 (all-zero on this path) becomes the ==0 mask of xmm2 */
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3	/* keep an unshifted copy; it becomes xmm1 for the next step */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2	/* xmm2 = the 16 source bytes starting at %rcx */
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	30(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	30(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	30(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	30(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source position down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back */
+	lea	-14(%rcx), %rcx	/* restore the 2-byte skew used by the loop's load offsets */
+	sub	%rax, %rdx	/* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and count those bytes again */
+# endif
+	movaps	-2(%rcx), %xmm1	/* aligned block preceding the loop's first block; falls through into L(Shl2LoopStart) */
+
+L(Shl2LoopStart):	/* Main loop for a 2-byte skew: test four aligned source blocks at once and store 64 realigned bytes; %xmm1 carries the previous block.  */
+	movaps	14(%rcx), %xmm2
+	movaps	30(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	46(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	62(%rcx), %xmm5
+	pminub	%xmm2, %xmm6	/* fold with unsigned byte minimum: a lane is 0 iff it is 0 in at least one block */
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7	/* compare the folded minimum with xmm0 (all-zero on this path) */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* unshifted last block; becomes xmm1 for the next iteration */
+	palignr	$2, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$2, %xmm3, %xmm4	/* palignr does not touch the flags set by test */
+	jnz	L(Shl2Start)	/* zero byte in this group: redo it 16 bytes at a time (xmm1 and xmm2 are still unmodified) */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave2)
+# endif
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl2LoopStart)
+
+L(Shl2LoopExit):	/* Write the 14 source bytes still held in %xmm1 to (%rdx) while keeping destination bytes 14..15, then continue at L(CopyFrom1To16Bytes) with %rsi = 14.  */
+	movaps	(%rdx), %xmm6	/* current destination contents */
+	psrldq	$14, %xmm6	/* move destination bytes 14..15 down to lanes 0..1 */
+	mov	$14, %rsi
+	palignr	$2, %xmm1, %xmm6	/* xmm6 = xmm1 bytes 2..15 followed by the saved destination bytes */
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl3):	/* Reached from the entry dispatch when (%rcx & 15) == 3 with %rdx 16-byte aligned: copy using aligned loads merged by palignr $3.  */
+	movaps	-3(%rcx), %xmm1	/* aligned block that contains the byte at %rcx */
+	movaps	13(%rcx), %xmm2	/* next aligned block */
+L(Shl3Start):	/* also re-entered from L(Shl3LoopStart) when a 64-byte group contains a zero byte */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 (all-zero on this path) becomes the ==0 mask of xmm2 */
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3	/* keep an unshifted copy; it becomes xmm1 for the next step */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2	/* xmm2 = the 16 source bytes starting at %rcx */
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	29(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	29(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	29(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	29(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source position down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back */
+	lea	-13(%rcx), %rcx	/* restore the 3-byte skew used by the loop's load offsets */
+	sub	%rax, %rdx	/* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and count those bytes again */
+# endif
+	movaps	-3(%rcx), %xmm1	/* aligned block preceding the loop's first block; falls through into L(Shl3LoopStart) */
+
+L(Shl3LoopStart):	/* Main loop for a 3-byte skew: test four aligned source blocks at once and store 64 realigned bytes; %xmm1 carries the previous block.  */
+	movaps	13(%rcx), %xmm2
+	movaps	29(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	45(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	61(%rcx), %xmm5
+	pminub	%xmm2, %xmm6	/* fold with unsigned byte minimum: a lane is 0 iff it is 0 in at least one block */
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7	/* compare the folded minimum with xmm0 (all-zero on this path) */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* unshifted last block; becomes xmm1 for the next iteration */
+	palignr	$3, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$3, %xmm3, %xmm4	/* palignr does not touch the flags set by test */
+	jnz	L(Shl3Start)	/* zero byte in this group: redo it 16 bytes at a time (xmm1 and xmm2 are still unmodified) */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave3)
+# endif
+	palignr	$3, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl3LoopStart)
+
+L(Shl3LoopExit):	/* Write the 13 source bytes still held in %xmm1 to (%rdx) while keeping destination bytes 13..15, then continue at L(CopyFrom1To16Bytes) with %rsi = 13.  */
+	movaps	(%rdx), %xmm6	/* current destination contents */
+	psrldq	$13, %xmm6	/* move destination bytes 13..15 down to lanes 0..2 */
+	mov	$13, %rsi
+	palignr	$3, %xmm1, %xmm6	/* xmm6 = xmm1 bytes 3..15 followed by the saved destination bytes */
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl4):	/* Reached from the entry dispatch when (%rcx & 15) == 4 with %rdx 16-byte aligned: copy using aligned loads merged by palignr $4.  */
+	movaps	-4(%rcx), %xmm1	/* aligned block that contains the byte at %rcx */
+	movaps	12(%rcx), %xmm2	/* next aligned block */
+L(Shl4Start):	/* also re-entered from L(Shl4LoopStart) when a 64-byte group contains a zero byte */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 (all-zero on this path) becomes the ==0 mask of xmm2 */
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3	/* keep an unshifted copy; it becomes xmm1 for the next step */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2	/* xmm2 = the 16 source bytes starting at %rcx */
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	28(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source position down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back */
+	lea	-12(%rcx), %rcx	/* restore the 4-byte skew used by the loop's load offsets */
+	sub	%rax, %rdx	/* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and count those bytes again */
+# endif
+	movaps	-4(%rcx), %xmm1	/* aligned block preceding the loop's first block; falls through into L(Shl4LoopStart) */
+
+L(Shl4LoopStart):	/* Main loop for a 4-byte skew: test four aligned source blocks at once and store 64 realigned bytes; %xmm1 carries the previous block.  */
+	movaps	12(%rcx), %xmm2
+	movaps	28(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%rcx), %xmm5
+	pminub	%xmm2, %xmm6	/* fold with unsigned byte minimum: a lane is 0 iff it is 0 in at least one block */
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7	/* compare the folded minimum with xmm0 (all-zero on this path) */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* unshifted last block; becomes xmm1 for the next iteration */
+	palignr	$4, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$4, %xmm3, %xmm4	/* palignr does not touch the flags set by test */
+	jnz	L(Shl4Start)	/* zero byte in this group: redo it 16 bytes at a time (xmm1 and xmm2 are still unmodified) */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave4)
+# endif
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):	/* Write the 12 source bytes still held in %xmm1 to (%rdx) while keeping destination bytes 12..15, then continue at L(CopyFrom1To16Bytes) with %rsi = 12.  */
+	movaps	(%rdx), %xmm6	/* current destination contents */
+	psrldq	$12, %xmm6	/* move destination bytes 12..15 down to lanes 0..3 */
+	mov	$12, %rsi
+	palignr	$4, %xmm1, %xmm6	/* xmm6 = xmm1 bytes 4..15 followed by the saved destination bytes */
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl5):	/* Reached from the entry dispatch when (%rcx & 15) == 5 with %rdx 16-byte aligned: copy using aligned loads merged by palignr $5.  */
+	movaps	-5(%rcx), %xmm1	/* aligned block that contains the byte at %rcx */
+	movaps	11(%rcx), %xmm2	/* next aligned block */
+L(Shl5Start):	/* also re-entered from L(Shl5LoopStart) when a 64-byte group contains a zero byte */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 (all-zero on this path) becomes the ==0 mask of xmm2 */
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3	/* keep an unshifted copy; it becomes xmm1 for the next step */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2	/* xmm2 = the 16 source bytes starting at %rcx */
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	27(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	27(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	27(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	27(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source position down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back */
+	lea	-11(%rcx), %rcx	/* restore the 5-byte skew used by the loop's load offsets */
+	sub	%rax, %rdx	/* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and count those bytes again */
+# endif
+	movaps	-5(%rcx), %xmm1	/* aligned block preceding the loop's first block; falls through into L(Shl5LoopStart) */
+
+L(Shl5LoopStart):	/* Main loop for a 5-byte skew: test four aligned source blocks at once and store 64 realigned bytes; %xmm1 carries the previous block.  */
+	movaps	11(%rcx), %xmm2
+	movaps	27(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	43(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	59(%rcx), %xmm5
+	pminub	%xmm2, %xmm6	/* fold with unsigned byte minimum: a lane is 0 iff it is 0 in at least one block */
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7	/* compare the folded minimum with xmm0 (all-zero on this path) */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* unshifted last block; becomes xmm1 for the next iteration */
+	palignr	$5, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$5, %xmm3, %xmm4	/* palignr does not touch the flags set by test */
+	jnz	L(Shl5Start)	/* zero byte in this group: redo it 16 bytes at a time (xmm1 and xmm2 are still unmodified) */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave5)
+# endif
+	palignr	$5, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl5LoopStart)
+
+L(Shl5LoopExit):	/* Write the 11 source bytes still held in %xmm1 to (%rdx) while keeping destination bytes 11..15, then continue at L(CopyFrom1To16Bytes) with %rsi = 11.  */
+	movaps	(%rdx), %xmm6	/* current destination contents */
+	psrldq	$11, %xmm6	/* move destination bytes 11..15 down to lanes 0..4 */
+	mov	$11, %rsi
+	palignr	$5, %xmm1, %xmm6	/* xmm6 = xmm1 bytes 5..15 followed by the saved destination bytes */
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl6):	/* Reached from the entry dispatch when (%rcx & 15) == 6 with %rdx 16-byte aligned: copy using aligned loads merged by palignr $6.  */
+	movaps	-6(%rcx), %xmm1	/* aligned block that contains the byte at %rcx */
+	movaps	10(%rcx), %xmm2	/* next aligned block */
+L(Shl6Start):	/* also re-entered from L(Shl6LoopStart) when a 64-byte group contains a zero byte */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 (all-zero on this path) becomes the ==0 mask of xmm2 */
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3	/* keep an unshifted copy; it becomes xmm1 for the next step */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2	/* xmm2 = the 16 source bytes starting at %rcx */
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	26(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	26(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	26(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	26(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source position down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back */
+	lea	-10(%rcx), %rcx	/* restore the 6-byte skew used by the loop's load offsets */
+	sub	%rax, %rdx	/* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and count those bytes again */
+# endif
+	movaps	-6(%rcx), %xmm1	/* aligned block preceding the loop's first block; falls through into L(Shl6LoopStart) */
+
+L(Shl6LoopStart):	/* Main loop for a 6-byte skew: test four aligned source blocks at once and store 64 realigned bytes; %xmm1 carries the previous block.  */
+	movaps	10(%rcx), %xmm2
+	movaps	26(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	42(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	58(%rcx), %xmm5
+	pminub	%xmm2, %xmm6	/* fold with unsigned byte minimum: a lane is 0 iff it is 0 in at least one block */
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7	/* compare the folded minimum with xmm0 (all-zero on this path) */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* unshifted last block; becomes xmm1 for the next iteration */
+	palignr	$6, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$6, %xmm3, %xmm4	/* palignr does not touch the flags set by test */
+	jnz	L(Shl6Start)	/* zero byte in this group: redo it 16 bytes at a time (xmm1 and xmm2 are still unmodified) */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave6)
+# endif
+	palignr	$6, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl6LoopStart)
+
+L(Shl6LoopExit):	/* Write the 10 source bytes still held in %xmm1 to (%rdx) while keeping destination bytes 10..15, then continue at L(CopyFrom1To16Bytes) with %rsi = 10.  */
+	movaps	(%rdx), %xmm6	/* current destination contents */
+	psrldq	$10, %xmm6	/* move destination bytes 10..15 down to lanes 0..5 */
+	mov	$10, %rsi
+	palignr	$6, %xmm1, %xmm6	/* xmm6 = xmm1 bytes 6..15 followed by the saved destination bytes */
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl7):	/* Reached from the entry dispatch when (%rcx & 15) == 7 with %rdx 16-byte aligned: copy using aligned loads merged by palignr $7.  */
+	movaps	-7(%rcx), %xmm1	/* aligned block that contains the byte at %rcx */
+	movaps	9(%rcx), %xmm2	/* next aligned block */
+L(Shl7Start):	/* also re-entered from L(Shl7LoopStart) when a 64-byte group contains a zero byte */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 (all-zero on this path) becomes the ==0 mask of xmm2 */
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3	/* keep an unshifted copy; it becomes xmm1 for the next step */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2	/* xmm2 = the 16 source bytes starting at %rcx */
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	25(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	25(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	25(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	25(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source position down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back */
+	lea	-9(%rcx), %rcx	/* restore the 7-byte skew used by the loop's load offsets */
+	sub	%rax, %rdx	/* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and count those bytes again */
+# endif
+	movaps	-7(%rcx), %xmm1	/* aligned block preceding the loop's first block; falls through into L(Shl7LoopStart) */
+
+L(Shl7LoopStart):	/* Main loop for a 7-byte skew: test four aligned source blocks at once and store 64 realigned bytes; %xmm1 carries the previous block.  */
+	movaps	9(%rcx), %xmm2
+	movaps	25(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	41(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	57(%rcx), %xmm5
+	pminub	%xmm2, %xmm6	/* fold with unsigned byte minimum: a lane is 0 iff it is 0 in at least one block */
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7	/* compare the folded minimum with xmm0 (all-zero on this path) */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* unshifted last block; becomes xmm1 for the next iteration */
+	palignr	$7, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$7, %xmm3, %xmm4	/* palignr does not touch the flags set by test */
+	jnz	L(Shl7Start)	/* zero byte in this group: redo it 16 bytes at a time (xmm1 and xmm2 are still unmodified) */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave7)
+# endif
+	palignr	$7, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl7LoopStart)
+
+L(Shl7LoopExit):	/* Write the 9 source bytes still held in %xmm1 to (%rdx) while keeping destination bytes 9..15, then continue at L(CopyFrom1To16Bytes) with %rsi = 9.  */
+	movaps	(%rdx), %xmm6	/* current destination contents */
+	psrldq	$9, %xmm6	/* move destination bytes 9..15 down to lanes 0..6 */
+	mov	$9, %rsi
+	palignr	$7, %xmm1, %xmm6	/* xmm6 = xmm1 bytes 7..15 followed by the saved destination bytes */
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl8):	/* Reached from the entry dispatch when (%rcx & 15) == 8 with %rdx 16-byte aligned: copy using aligned loads merged by palignr $8.  */
+	movaps	-8(%rcx), %xmm1	/* aligned block that contains the byte at %rcx */
+	movaps	8(%rcx), %xmm2	/* next aligned block */
+L(Shl8Start):	/* also re-entered from L(Shl8LoopStart) when a 64-byte group contains a zero byte */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 (all-zero on this path) becomes the ==0 mask of xmm2 */
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3	/* keep an unshifted copy; it becomes xmm1 for the next step */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2	/* xmm2 = the 16 source bytes starting at %rcx */
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	24(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the source position down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back */
+	lea	-8(%rcx), %rcx	/* restore the 8-byte skew used by the loop's load offsets */
+	sub	%rax, %rdx	/* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and count those bytes again */
+# endif
+	movaps	-8(%rcx), %xmm1	/* aligned block preceding the loop's first block; falls through into L(Shl8LoopStart) */
+
+L(Shl8LoopStart):
+	movaps	8(%rcx), %xmm2
+	movaps	24(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$8, %xmm3, %xmm4
+	jnz	L(Shl8Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave8)
+# endif
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+	movaps	(%rdx), %xmm6
+	psrldq	$8, %xmm6
+	mov	$8, %rsi
+	palignr	$8, %xmm1, %xmm6
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl9):	/* Copy path using palignr $9: movaps needs 16-byte-aligned addresses, so %rcx is expected to be 9 bytes past such a boundary and %rdx aligned.  */
+	movaps	-9(%rcx), %xmm1	/* Previous aligned chunk; its bytes 9..15 feed the next store.  */
+	movaps	7(%rcx), %xmm2	/* Following aligned chunk.  */
+L(Shl9Start):	/* Also entered from L(Shl9LoopStart) when the 64-byte scan finds a match.  */
+	pcmpeqb	%xmm2, %xmm0	/* Assumes %xmm0 is all-zero here (TODO confirm at callers): flags NUL bytes of %xmm2.  */
+	pmovmskb %xmm0, %rax	/* %rax = one bit per flagged byte.  */
+	movaps	%xmm2, %xmm3	/* Save the chunk; palignr below overwrites %xmm2.  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* Charge 16 bytes against the count kept in %r8.  */
+	jbe	L(StrncpyExit9Case2OrCase3)	/* Count ends within this chunk.  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl9LoopExit)	/* Match inside this chunk.  */
+
+	palignr	$9, %xmm1, %xmm2	/* %xmm2 = bytes 9..24 of the pair %xmm2:%xmm1.  */
+	movaps	%xmm3, %xmm1	/* Saved chunk becomes the new previous chunk.  */
+	movaps	%xmm2, (%rdx)
+	movaps	23(%rcx), %xmm2	/* Unrolled step 2 of 4.  */
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	23(%rcx), %xmm2	/* Unrolled step 3 of 4.  */
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	23(%rcx), %xmm2	/* Unrolled step 4 of 4.  */
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	23(%rcx), %rcx	/* %rcx = address of the next aligned source chunk.  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* Round the source pointer down to a 64-byte boundary.  */
+	sub	%rcx, %rax	/* %rax = number of bytes backed up.  */
+	lea	-7(%rcx), %rcx	/* Re-apply the bias assumed by the offsets below.  */
+	sub	%rax, %rdx	/* Move the destination back by the same amount.  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* Give the backed-up bytes back to the count.  */
+# endif
+	movaps	-9(%rcx), %xmm1	/* Reload the previous chunk for the new position.  */
+
+L(Shl9LoopStart):	/* Main loop: scans and copies 64 source bytes per iteration.  */
+	movaps	7(%rcx), %xmm2
+	movaps	23(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	39(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	55(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* %xmm7 = bytewise minimum of all four chunks.  */
+	pcmpeqb	%xmm0, %xmm7	/* Compare the minimum with %xmm0 (presumably zero - confirm).  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* Unshifted last chunk, needed as next iteration's previous chunk.  */
+	palignr	$9, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$9, %xmm3, %xmm4	/* palignr does not modify the flags set by test.  */
+	jnz	L(Shl9Start)	/* Match in these 64 bytes: redo them 16 at a time; %xmm1/%xmm2 are already in place.  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave9)	/* Count ends within these 64 bytes.  */
+# endif
+	palignr	$9, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl9LoopStart)
+
+L(Shl9LoopExit):	/* Match in the chunk at 7(%rcx): flush the 7 pending bytes of %xmm1, then copy the tail.  */
+	movaps	(%rdx), %xmm6	/* Current contents of the destination block.  */
+	psrldq	$7, %xmm6	/* Its bytes 7..15 move down to 0..8.  */
+	mov	$7, %rsi	/* Offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx.  */
+	palignr	$9, %xmm1, %xmm6	/* Low 7 bytes = %xmm1[9..15]; upper 9 bytes = the block's previous upper bytes.  */
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl10):	/* Copy path using palignr $10: movaps needs 16-byte-aligned addresses, so %rcx is expected to be 10 bytes past such a boundary and %rdx aligned.  */
+	movaps	-10(%rcx), %xmm1	/* Previous aligned chunk; its bytes 10..15 feed the next store.  */
+	movaps	6(%rcx), %xmm2	/* Following aligned chunk.  */
+L(Shl10Start):	/* Also entered from L(Shl10LoopStart) when the 64-byte scan finds a match.  */
+	pcmpeqb	%xmm2, %xmm0	/* Assumes %xmm0 is all-zero here (TODO confirm at callers): flags NUL bytes of %xmm2.  */
+	pmovmskb %xmm0, %rax	/* %rax = one bit per flagged byte.  */
+	movaps	%xmm2, %xmm3	/* Save the chunk; palignr below overwrites %xmm2.  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* Charge 16 bytes against the count kept in %r8.  */
+	jbe	L(StrncpyExit10Case2OrCase3)	/* Count ends within this chunk.  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl10LoopExit)	/* Match inside this chunk.  */
+
+	palignr	$10, %xmm1, %xmm2	/* %xmm2 = bytes 10..25 of the pair %xmm2:%xmm1.  */
+	movaps	%xmm3, %xmm1	/* Saved chunk becomes the new previous chunk.  */
+	movaps	%xmm2, (%rdx)
+	movaps	22(%rcx), %xmm2	/* Unrolled step 2 of 4.  */
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	22(%rcx), %xmm2	/* Unrolled step 3 of 4.  */
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	22(%rcx), %xmm2	/* Unrolled step 4 of 4.  */
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	22(%rcx), %rcx	/* %rcx = address of the next aligned source chunk.  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* Round the source pointer down to a 64-byte boundary.  */
+	sub	%rcx, %rax	/* %rax = number of bytes backed up.  */
+	lea	-6(%rcx), %rcx	/* Re-apply the bias assumed by the offsets below.  */
+	sub	%rax, %rdx	/* Move the destination back by the same amount.  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* Give the backed-up bytes back to the count.  */
+# endif
+	movaps	-10(%rcx), %xmm1	/* Reload the previous chunk for the new position.  */
+
+L(Shl10LoopStart):	/* Main loop: scans and copies 64 source bytes per iteration.  */
+	movaps	6(%rcx), %xmm2
+	movaps	22(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	38(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	54(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* %xmm7 = bytewise minimum of all four chunks.  */
+	pcmpeqb	%xmm0, %xmm7	/* Compare the minimum with %xmm0 (presumably zero - confirm).  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* Unshifted last chunk, needed as next iteration's previous chunk.  */
+	palignr	$10, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$10, %xmm3, %xmm4	/* palignr does not modify the flags set by test.  */
+	jnz	L(Shl10Start)	/* Match in these 64 bytes: redo them 16 at a time; %xmm1/%xmm2 are already in place.  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave10)	/* Count ends within these 64 bytes.  */
+# endif
+	palignr	$10, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl10LoopStart)
+
+L(Shl10LoopExit):	/* Match in the chunk at 6(%rcx): flush the 6 pending bytes of %xmm1, then copy the tail.  */
+	movaps	(%rdx), %xmm6	/* Current contents of the destination block.  */
+	psrldq	$6, %xmm6	/* Its bytes 6..15 move down to 0..9.  */
+	mov	$6, %rsi	/* Offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx.  */
+	palignr	$10, %xmm1, %xmm6	/* Low 6 bytes = %xmm1[10..15]; upper 10 bytes = the block's previous upper bytes.  */
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl11):	/* Copy path using palignr $11: movaps needs 16-byte-aligned addresses, so %rcx is expected to be 11 bytes past such a boundary and %rdx aligned.  */
+	movaps	-11(%rcx), %xmm1	/* Previous aligned chunk; its bytes 11..15 feed the next store.  */
+	movaps	5(%rcx), %xmm2	/* Following aligned chunk.  */
+L(Shl11Start):	/* Also entered from L(Shl11LoopStart) when the 64-byte scan finds a match.  */
+	pcmpeqb	%xmm2, %xmm0	/* Assumes %xmm0 is all-zero here (TODO confirm at callers): flags NUL bytes of %xmm2.  */
+	pmovmskb %xmm0, %rax	/* %rax = one bit per flagged byte.  */
+	movaps	%xmm2, %xmm3	/* Save the chunk; palignr below overwrites %xmm2.  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* Charge 16 bytes against the count kept in %r8.  */
+	jbe	L(StrncpyExit11Case2OrCase3)	/* Count ends within this chunk.  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl11LoopExit)	/* Match inside this chunk.  */
+
+	palignr	$11, %xmm1, %xmm2	/* %xmm2 = bytes 11..26 of the pair %xmm2:%xmm1.  */
+	movaps	%xmm3, %xmm1	/* Saved chunk becomes the new previous chunk.  */
+	movaps	%xmm2, (%rdx)
+	movaps	21(%rcx), %xmm2	/* Unrolled step 2 of 4.  */
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	21(%rcx), %xmm2	/* Unrolled step 3 of 4.  */
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	21(%rcx), %xmm2	/* Unrolled step 4 of 4.  */
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	21(%rcx), %rcx	/* %rcx = address of the next aligned source chunk.  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* Round the source pointer down to a 64-byte boundary.  */
+	sub	%rcx, %rax	/* %rax = number of bytes backed up.  */
+	lea	-5(%rcx), %rcx	/* Re-apply the bias assumed by the offsets below.  */
+	sub	%rax, %rdx	/* Move the destination back by the same amount.  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* Give the backed-up bytes back to the count.  */
+# endif
+	movaps	-11(%rcx), %xmm1	/* Reload the previous chunk for the new position.  */
+
+L(Shl11LoopStart):	/* Main loop: scans and copies 64 source bytes per iteration.  */
+	movaps	5(%rcx), %xmm2
+	movaps	21(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	37(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	53(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* %xmm7 = bytewise minimum of all four chunks.  */
+	pcmpeqb	%xmm0, %xmm7	/* Compare the minimum with %xmm0 (presumably zero - confirm).  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* Unshifted last chunk, needed as next iteration's previous chunk.  */
+	palignr	$11, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$11, %xmm3, %xmm4	/* palignr does not modify the flags set by test.  */
+	jnz	L(Shl11Start)	/* Match in these 64 bytes: redo them 16 at a time; %xmm1/%xmm2 are already in place.  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave11)	/* Count ends within these 64 bytes.  */
+# endif
+	palignr	$11, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl11LoopStart)
+
+L(Shl11LoopExit):	/* Match in the chunk at 5(%rcx): flush the 5 pending bytes of %xmm1, then copy the tail.  */
+	movaps	(%rdx), %xmm6	/* Current contents of the destination block.  */
+	psrldq	$5, %xmm6	/* Its bytes 5..15 move down to 0..10.  */
+	mov	$5, %rsi	/* Offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx.  */
+	palignr	$11, %xmm1, %xmm6	/* Low 5 bytes = %xmm1[11..15]; upper 11 bytes = the block's previous upper bytes.  */
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl12):	/* Copy path using palignr $12: movaps needs 16-byte-aligned addresses, so %rcx is expected to be 12 bytes past such a boundary and %rdx aligned.  */
+	movaps	-12(%rcx), %xmm1	/* Previous aligned chunk; its bytes 12..15 feed the next store.  */
+	movaps	4(%rcx), %xmm2	/* Following aligned chunk.  */
+L(Shl12Start):	/* Also entered from L(Shl12LoopStart) when the 64-byte scan finds a match.  */
+	pcmpeqb	%xmm2, %xmm0	/* Assumes %xmm0 is all-zero here (TODO confirm at callers): flags NUL bytes of %xmm2.  */
+	pmovmskb %xmm0, %rax	/* %rax = one bit per flagged byte.  */
+	movaps	%xmm2, %xmm3	/* Save the chunk; palignr below overwrites %xmm2.  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* Charge 16 bytes against the count kept in %r8.  */
+	jbe	L(StrncpyExit12Case2OrCase3)	/* Count ends within this chunk.  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)	/* Match inside this chunk.  */
+
+	palignr	$12, %xmm1, %xmm2	/* %xmm2 = bytes 12..27 of the pair %xmm2:%xmm1.  */
+	movaps	%xmm3, %xmm1	/* Saved chunk becomes the new previous chunk.  */
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2	/* Unrolled step 2 of 4.  */
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2	/* Unrolled step 3 of 4.  */
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2	/* Unrolled step 4 of 4.  */
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	20(%rcx), %rcx	/* %rcx = address of the next aligned source chunk.  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* Round the source pointer down to a 64-byte boundary.  */
+	sub	%rcx, %rax	/* %rax = number of bytes backed up.  */
+	lea	-4(%rcx), %rcx	/* Re-apply the bias assumed by the offsets below.  */
+	sub	%rax, %rdx	/* Move the destination back by the same amount.  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* Give the backed-up bytes back to the count.  */
+# endif
+	movaps	-12(%rcx), %xmm1	/* Reload the previous chunk for the new position.  */
+
+L(Shl12LoopStart):	/* Main loop: scans and copies 64 source bytes per iteration.  */
+	movaps	4(%rcx), %xmm2
+	movaps	20(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* %xmm7 = bytewise minimum of all four chunks.  */
+	pcmpeqb	%xmm0, %xmm7	/* Compare the minimum with %xmm0 (presumably zero - confirm).  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* Unshifted last chunk, needed as next iteration's previous chunk.  */
+	palignr	$12, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$12, %xmm3, %xmm4	/* palignr does not modify the flags set by test.  */
+	jnz	L(Shl12Start)	/* Match in these 64 bytes: redo them 16 at a time; %xmm1/%xmm2 are already in place.  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave12)	/* Count ends within these 64 bytes.  */
+# endif
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):	/* Match in the chunk at 4(%rcx): flush the 4 pending bytes of %xmm1, then copy the tail.  */
+	movaps	(%rdx), %xmm6	/* Current contents of the destination block.  */
+	psrldq	$4, %xmm6	/* Its bytes 4..15 move down to 0..11.  */
+	mov	$4, %rsi	/* Offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx.  */
+	palignr	$12, %xmm1, %xmm6	/* Low 4 bytes = %xmm1[12..15]; upper 12 bytes = the block's previous upper bytes.  */
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl13):	/* Copy path using palignr $13: movaps needs 16-byte-aligned addresses, so %rcx is expected to be 13 bytes past such a boundary and %rdx aligned.  */
+	movaps	-13(%rcx), %xmm1	/* Previous aligned chunk; its bytes 13..15 feed the next store.  */
+	movaps	3(%rcx), %xmm2	/* Following aligned chunk.  */
+L(Shl13Start):	/* Also entered from L(Shl13LoopStart) when the 64-byte scan finds a match.  */
+	pcmpeqb	%xmm2, %xmm0	/* Assumes %xmm0 is all-zero here (TODO confirm at callers): flags NUL bytes of %xmm2.  */
+	pmovmskb %xmm0, %rax	/* %rax = one bit per flagged byte.  */
+	movaps	%xmm2, %xmm3	/* Save the chunk; palignr below overwrites %xmm2.  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* Charge 16 bytes against the count kept in %r8.  */
+	jbe	L(StrncpyExit13Case2OrCase3)	/* Count ends within this chunk.  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl13LoopExit)	/* Match inside this chunk.  */
+
+	palignr	$13, %xmm1, %xmm2	/* %xmm2 = bytes 13..28 of the pair %xmm2:%xmm1.  */
+	movaps	%xmm3, %xmm1	/* Saved chunk becomes the new previous chunk.  */
+	movaps	%xmm2, (%rdx)
+	movaps	19(%rcx), %xmm2	/* Unrolled step 2 of 4.  */
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	19(%rcx), %xmm2	/* Unrolled step 3 of 4.  */
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	19(%rcx), %xmm2	/* Unrolled step 4 of 4.  */
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	19(%rcx), %rcx	/* %rcx = address of the next aligned source chunk.  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* Round the source pointer down to a 64-byte boundary.  */
+	sub	%rcx, %rax	/* %rax = number of bytes backed up.  */
+	lea	-3(%rcx), %rcx	/* Re-apply the bias assumed by the offsets below.  */
+	sub	%rax, %rdx	/* Move the destination back by the same amount.  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* Give the backed-up bytes back to the count.  */
+# endif
+	movaps	-13(%rcx), %xmm1	/* Reload the previous chunk for the new position.  */
+
+L(Shl13LoopStart):	/* Main loop: scans and copies 64 source bytes per iteration.  */
+	movaps	3(%rcx), %xmm2
+	movaps	19(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	35(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	51(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* %xmm7 = bytewise minimum of all four chunks.  */
+	pcmpeqb	%xmm0, %xmm7	/* Compare the minimum with %xmm0 (presumably zero - confirm).  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* Unshifted last chunk, needed as next iteration's previous chunk.  */
+	palignr	$13, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$13, %xmm3, %xmm4	/* palignr does not modify the flags set by test.  */
+	jnz	L(Shl13Start)	/* Match in these 64 bytes: redo them 16 at a time; %xmm1/%xmm2 are already in place.  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave13)	/* Count ends within these 64 bytes.  */
+# endif
+	palignr	$13, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl13LoopStart)
+
+L(Shl13LoopExit):	/* Match in the chunk at 3(%rcx): flush the 3 pending bytes of %xmm1, then copy the tail.  */
+	movaps	(%rdx), %xmm6	/* Current contents of the destination block.  */
+	psrldq	$3, %xmm6	/* Its bytes 3..15 move down to 0..12.  */
+	mov	$3, %rsi	/* Offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx.  */
+	palignr	$13, %xmm1, %xmm6	/* Low 3 bytes = %xmm1[13..15]; upper 13 bytes = the block's previous upper bytes.  */
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl14):	/* Copy path using palignr $14: movaps needs 16-byte-aligned addresses, so %rcx is expected to be 14 bytes past such a boundary and %rdx aligned.  */
+	movaps	-14(%rcx), %xmm1	/* Previous aligned chunk; its bytes 14..15 feed the next store.  */
+	movaps	2(%rcx), %xmm2	/* Following aligned chunk.  */
+L(Shl14Start):	/* Also entered from L(Shl14LoopStart) when the 64-byte scan finds a match.  */
+	pcmpeqb	%xmm2, %xmm0	/* Assumes %xmm0 is all-zero here (TODO confirm at callers): flags NUL bytes of %xmm2.  */
+	pmovmskb %xmm0, %rax	/* %rax = one bit per flagged byte.  */
+	movaps	%xmm2, %xmm3	/* Save the chunk; palignr below overwrites %xmm2.  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* Charge 16 bytes against the count kept in %r8.  */
+	jbe	L(StrncpyExit14Case2OrCase3)	/* Count ends within this chunk.  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl14LoopExit)	/* Match inside this chunk.  */
+
+	palignr	$14, %xmm1, %xmm2	/* %xmm2 = bytes 14..29 of the pair %xmm2:%xmm1.  */
+	movaps	%xmm3, %xmm1	/* Saved chunk becomes the new previous chunk.  */
+	movaps	%xmm2, (%rdx)
+	movaps	18(%rcx), %xmm2	/* Unrolled step 2 of 4.  */
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	18(%rcx), %xmm2	/* Unrolled step 3 of 4.  */
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	18(%rcx), %xmm2	/* Unrolled step 4 of 4.  */
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	18(%rcx), %rcx	/* %rcx = address of the next aligned source chunk.  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* Round the source pointer down to a 64-byte boundary.  */
+	sub	%rcx, %rax	/* %rax = number of bytes backed up.  */
+	lea	-2(%rcx), %rcx	/* Re-apply the bias assumed by the offsets below.  */
+	sub	%rax, %rdx	/* Move the destination back by the same amount.  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* Give the backed-up bytes back to the count.  */
+# endif
+	movaps	-14(%rcx), %xmm1	/* Reload the previous chunk for the new position.  */
+
+L(Shl14LoopStart):	/* Main loop: scans and copies 64 source bytes per iteration.  */
+	movaps	2(%rcx), %xmm2
+	movaps	18(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	34(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	50(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* %xmm7 = bytewise minimum of all four chunks.  */
+	pcmpeqb	%xmm0, %xmm7	/* Compare the minimum with %xmm0 (presumably zero - confirm).  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* Unshifted last chunk, needed as next iteration's previous chunk.  */
+	palignr	$14, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$14, %xmm3, %xmm4	/* palignr does not modify the flags set by test.  */
+	jnz	L(Shl14Start)	/* Match in these 64 bytes: redo them 16 at a time; %xmm1/%xmm2 are already in place.  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave14)	/* Count ends within these 64 bytes.  */
+# endif
+	palignr	$14, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl14LoopStart)
+
+L(Shl14LoopExit):	/* Match in the chunk at 2(%rcx): flush the 2 pending bytes of %xmm1, then copy the tail.  */
+	movaps	(%rdx), %xmm6	/* Current contents of the destination block.  */
+	psrldq	$2, %xmm6	/* Its bytes 2..15 move down to 0..13.  */
+	mov	$2, %rsi	/* Offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx.  */
+	palignr	$14, %xmm1, %xmm6	/* Low 2 bytes = %xmm1[14..15]; upper 14 bytes = the block's previous upper bytes.  */
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl15):	/* Copy path using palignr $15: movaps needs 16-byte-aligned addresses, so %rcx is expected to be 15 bytes past such a boundary and %rdx aligned.  */
+	movaps	-15(%rcx), %xmm1	/* Previous aligned chunk; its byte 15 feeds the next store.  */
+	movaps	1(%rcx), %xmm2	/* Following aligned chunk.  */
+L(Shl15Start):	/* Also entered from L(Shl15LoopStart) when the 64-byte scan finds a match.  */
+	pcmpeqb	%xmm2, %xmm0	/* Assumes %xmm0 is all-zero here (TODO confirm at callers): flags NUL bytes of %xmm2.  */
+	pmovmskb %xmm0, %rax	/* %rax = one bit per flagged byte.  */
+	movaps	%xmm2, %xmm3	/* Save the chunk; palignr below overwrites %xmm2.  */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* Charge 16 bytes against the count kept in %r8.  */
+	jbe	L(StrncpyExit15Case2OrCase3)	/* Count ends within this chunk.  */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl15LoopExit)	/* Match inside this chunk.  */
+
+	palignr	$15, %xmm1, %xmm2	/* %xmm2 = bytes 15..30 of the pair %xmm2:%xmm1.  */
+	movaps	%xmm3, %xmm1	/* Saved chunk becomes the new previous chunk.  */
+	movaps	%xmm2, (%rdx)
+	movaps	17(%rcx), %xmm2	/* Unrolled step 2 of 4.  */
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	17(%rcx), %xmm2	/* Unrolled step 3 of 4.  */
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	17(%rcx), %xmm2	/* Unrolled step 4 of 4.  */
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	17(%rcx), %rcx	/* %rcx = address of the next aligned source chunk.  */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* Round the source pointer down to a 64-byte boundary.  */
+	sub	%rcx, %rax	/* %rax = number of bytes backed up.  */
+	lea	-1(%rcx), %rcx	/* Re-apply the bias assumed by the offsets below.  */
+	sub	%rax, %rdx	/* Move the destination back by the same amount.  */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* Give the backed-up bytes back to the count.  */
+# endif
+	movaps	-15(%rcx), %xmm1	/* Reload the previous chunk for the new position.  */
+
+L(Shl15LoopStart):	/* Main loop: scans and copies 64 source bytes per iteration.  */
+	movaps	1(%rcx), %xmm2
+	movaps	17(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	33(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	49(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* %xmm7 = bytewise minimum of all four chunks.  */
+	pcmpeqb	%xmm0, %xmm7	/* Compare the minimum with %xmm0 (presumably zero - confirm).  */
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* Unshifted last chunk, needed as next iteration's previous chunk.  */
+	palignr	$15, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$15, %xmm3, %xmm4	/* palignr does not modify the flags set by test.  */
+	jnz	L(Shl15Start)	/* Match in these 64 bytes: redo them 16 at a time; %xmm1/%xmm2 are already in place.  */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave15)	/* Count ends within these 64 bytes.  */
+# endif
+	palignr	$15, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl15LoopStart)
+
+L(Shl15LoopExit):	/* Match in the chunk at 1(%rcx): flush the 1 pending byte of %xmm1, then copy the tail.  */
+	movaps	(%rdx), %xmm6	/* Current contents of the destination block.  */
+	psrldq	$1, %xmm6	/* Its bytes 1..15 move down to 0..14.  */
+	mov	$1, %rsi	/* Offset that L(CopyFrom1To16Bytes) adds to %rcx and %rdx.  */
+	palignr	$15, %xmm1, %xmm6	/* Low byte = %xmm1[15]; upper 15 bytes = the block's previous upper bytes.  */
+	movaps	%xmm6, (%rdx)
+# ifdef USE_AS_STRCAT
+	jmp	L(CopyFrom1To16Bytes)	/* Without USE_AS_STRCAT, execution falls through into L(CopyFrom1To16Bytes), which follows directly.  */
+# endif
+
+
+	.p2align 4
+L(CopyFrom1To16Bytes):	/* Tail copy: %rax holds the match mask, %rsi the offset to apply; dispatch on the lowest set bit of the mask.  */
+# ifdef USE_AS_STRNCPY
+	add	$16, %r8	/* Presumably undoes the "sub $16, %r8" done before jumping here - confirm at callers.  */
+# endif
+	add	%rsi, %rdx
+	add	%rsi, %rcx
+
+	test	%al, %al
+	jz	L(ExitHigh)	/* No bit set in mask bits 0..7: examine bits 8..15.  */
+	test	$0x01, %al
+	jnz	L(Exit1)	/* Mask bit k set -> L(Exit<k+1>), which copies k+1 bytes.  */
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)	/* Otherwise only bit 7 can be set: fall through into L(Exit8).  */
+
+	.p2align 4
+L(Exit8):	/* Copy 8 bytes from (%rcx) to (%rdx), set the return value, and return or hand off to the zero-fill.  */
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+# ifdef USE_AS_STPCPY
+	lea	7(%rdx), %rax	/* Return value: address of the last byte copied.  */
+# else
+	mov	%rdi, %rax	/* Return value: %rdi (presumably the caller's destination argument - confirm).  */
+# endif
+# ifdef USE_AS_STRNCPY
+	sub	$8, %r8	/* Count remaining after these 8 bytes.  */
+	lea	8(%rdx), %rcx	/* %rcx = first destination byte not yet written.  */
+	jnz	L(StrncpyFillTailWithZero1)	/* lea leaves the flags alone, so this tests the sub: count left over -> zero-fill.  */
+# ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)	/* CF = 1 iff the last byte copied is NUL.  */
+	sbb	$-1, %rax	/* %rax += 1 - CF: step past the last byte unless it is NUL.  */
+# endif
+# endif
+	ret
+
+	.p2align 4
+L(ExitHigh):	/* Continuation of L(CopyFrom1To16Bytes) for mask bits 8..15, held in %ah.  */
+	test	$0x01, %ah
+	jnz	L(Exit9)	/* Bit k of %ah set -> L(Exit<k+9>), which copies k+9 bytes.  */
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)	/* None of bits 0..6 set: fall through into L(Exit16).  */
+
+	.p2align 4
+L(Exit16):	/* Copy 16 bytes from (%rcx) to (%rdx) as two 8-byte moves, set the return value, and return or hand off to the zero-fill.  */
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %rax
+	mov	%rax, 8(%rdx)
+# ifdef USE_AS_STPCPY
+	lea	15(%rdx), %rax	/* Return value: address of the last byte copied.  */
+# else
+	mov	%rdi, %rax	/* Return value: %rdi (presumably the caller's destination argument - confirm).  */
+# endif
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8	/* Count remaining after these 16 bytes.  */
+	lea	16(%rdx), %rcx	/* %rcx = first destination byte not yet written.  */
+	jnz	L(StrncpyFillTailWithZero1)	/* lea leaves the flags alone, so this tests the sub: count left over -> zero-fill.  */
+# ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)	/* CF = 1 iff the last byte copied is NUL.  */
+	sbb	$-1, %rax	/* %rax += 1 - CF: step past the last byte unless it is NUL.  */
+# endif
+# endif
+	ret
+
+# ifdef USE_AS_STRNCPY
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):	/* strncpy tail with a match mask in %rax: for k = 1, 2, ... exit with k bytes as soon as the count in %r8 equals k or mask bit k-1 is set.  */
+	add	$16, %r8	/* Presumably undoes the "sub $16, %r8" done before jumping here - confirm at callers.  */
+	add	%rsi, %rcx
+	lea	(%rsi, %rdx), %rsi	/* Park the adjusted destination in %rsi while %rdx is used as scratch.  */
+	lea	-9(%r8), %rdx	/* Bit 15 of %r8 - 9 is set when %r8 < 9 (for the small counts expected here).  */
+	and	$1<<7, %dh	/* Keep only that bit (bit 7 of %dh).  */
+	or	%al, %dh	/* %dh != 0 iff that bit is set or the mask has a bit in 0..7.  */
+	test	%dh, %dh
+	lea	(%rsi), %rdx	/* Restore %rdx; lea does not alter the flags from test.  */
+	jz	L(ExitHighCase2)	/* Neither condition holds: the stop point is in bytes 8..15.  */
+
+	cmp	$1, %r8
+	je	L(Exit1)
+	test	$0x01, %al
+	jnz	L(Exit1)
+	cmp	$2, %r8
+	je	L(Exit2)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$3, %r8
+	je	L(Exit3)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$4, %r8
+	je	L(Exit4)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$5, %r8
+	je	L(Exit5)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$6, %r8
+	je	L(Exit6)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$7, %r8
+	je	L(Exit7)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	jmp	L(Exit8)	/* Nothing stopped within 7 bytes: copy 8.  */
+
+	.p2align 4
+L(ExitHighCase2):	/* Continuation of L(CopyFrom1To16BytesCase2) for lengths 9..16: same count-or-mask test using mask bits 8..15 in %ah.  */
+	cmp	$9, %r8
+	je	L(Exit9)
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	cmp	$10, %r8
+	je	L(Exit10)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$11, %r8
+	je	L(Exit11)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$12, %r8
+	je	L(Exit12)
+	test	$0x8, %ah
+	jnz	L(Exit12)
+	cmp	$13, %r8
+	je	L(Exit13)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$14, %r8
+	je	L(Exit14)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$15, %r8
+	je	L(Exit15)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	jmp	L(Exit16)	/* Nothing stopped within 15 bytes: copy 16.  */
+
+L(CopyFrom1To16BytesCase2OrCase3):	/* Selects the tail handler by whether the match mask in %rax is non-zero.  */
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)	/* Mask empty: fall through into L(CopyFrom1To16BytesCase3).  */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase3):	/* strncpy tail without consulting the mask: jump to L(Exit<k>) where k is the count left in %r8.  */
+	add	$16, %r8	/* Presumably undoes the "sub $16, %r8" done before jumping here - confirm at callers.  */
+	add	%rsi, %rdx
+	add	%rsi, %rcx
+
+	cmp	$16, %r8
+	je	L(Exit16)
+	cmp	$8, %r8	/* Decision tree on %r8; each je/jg/jl group reuses the flags of one cmp.  */
+	je	L(Exit8)
+	jg	L(More8Case3)
+	cmp	$4, %r8
+	je	L(Exit4)
+	jg	L(More4Case3)
+	cmp	$2, %r8
+	jl	L(Exit1)
+	je	L(Exit2)
+	jg	L(Exit3)	/* Always taken once jl and je have fallen through.  */
+L(More8Case3): /* but less than 16 */
+	cmp	$12, %r8
+	je	L(Exit12)
+	jl	L(Less12Case3)
+	cmp	$14, %r8
+	jl	L(Exit13)
+	je	L(Exit14)
+	jg	L(Exit15)
+L(More4Case3): /* but less than 8 */
+	cmp	$6, %r8
+	jl	L(Exit5)
+	je	L(Exit6)
+	jg	L(Exit7)
+L(Less12Case3): /* but more than 8 */
+	cmp	$10, %r8
+	jl	L(Exit9)
+	je	L(Exit10)
+	jg	L(Exit11)
+# endif
+
+	.p2align 4
+L(Exit1):	/* Copy 1 byte from (%rcx) to (%rdx), set the return value, and return or hand off to the zero-fill.  */
+	movb	(%rcx), %al
+	movb	%al, (%rdx)
+# ifdef USE_AS_STPCPY
+	lea	(%rdx), %rax	/* Return value: address of the byte just copied.  */
+# else
+	mov	%rdi, %rax	/* Return value: %rdi (presumably the caller's destination argument - confirm).  */
+# endif
+# ifdef USE_AS_STRNCPY
+	sub	$1, %r8	/* Count remaining after this byte.  */
+	lea	1(%rdx), %rcx	/* %rcx = first destination byte not yet written.  */
+	jnz	L(StrncpyFillTailWithZero1)	/* lea leaves the flags alone, so this tests the sub: count left over -> zero-fill.  */
+# ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)	/* CF = 1 iff the byte copied is NUL.  */
+	sbb	$-1, %rax	/* %rax += 1 - CF: step past the byte unless it is NUL.  */
+# endif
+# endif
+	ret
+
+	.p2align 4
+L(Exit2):	/* Copy 2 bytes from (%rcx) to (%rdx), set the return value, and return or hand off to the zero-fill.  */
+	movw	(%rcx), %ax
+	movw	%ax, (%rdx)
+# ifdef USE_AS_STPCPY
+	lea	1(%rdx), %rax	/* Return value: address of the last byte copied.  */
+# else
+	mov	%rdi, %rax	/* Return value: %rdi (presumably the caller's destination argument - confirm).  */
+# endif
+# ifdef USE_AS_STRNCPY
+	sub	$2, %r8	/* Count remaining after these 2 bytes.  */
+	lea	2(%rdx), %rcx	/* %rcx = first destination byte not yet written.  */
+	jnz	L(StrncpyFillTailWithZero1)	/* lea leaves the flags alone, so this tests the sub: count left over -> zero-fill.  */
+# ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)	/* CF = 1 iff the last byte copied is NUL.  */
+	sbb	$-1, %rax	/* %rax += 1 - CF: step past the last byte unless it is NUL.  */
+# endif
+# endif
+	ret
+
+	.p2align 4
+L(Exit3):	/* Copy 3 bytes from (%rcx) to (%rdx) (2 + 1), set the return value, and return or hand off to the zero-fill.  */
+	movw	(%rcx), %ax
+	movw	%ax, (%rdx)
+	movb	2(%rcx), %al
+	movb	%al, 2(%rdx)
+# ifdef USE_AS_STPCPY
+	lea	2(%rdx), %rax	/* Return value: address of the last byte copied.  */
+# else
+	mov	%rdi, %rax	/* Return value: %rdi (presumably the caller's destination argument - confirm).  */
+# endif
+# ifdef USE_AS_STRNCPY
+	sub	$3, %r8	/* Count remaining after these 3 bytes.  */
+	lea	3(%rdx), %rcx	/* %rcx = first destination byte not yet written.  */
+	jnz	L(StrncpyFillTailWithZero1)	/* lea leaves the flags alone, so this tests the sub: count left over -> zero-fill.  */
+# ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)	/* CF = 1 iff the last byte copied is NUL.  */
+	sbb	$-1, %rax	/* %rax += 1 - CF: step past the last byte unless it is NUL.  */
+# endif
+# endif
+	ret
+
+	.p2align 4
+L(Exit4):	/* Copy 4 bytes from (%rcx) to (%rdx), set the return value, and return or hand off to the zero-fill.  */
+	movl	(%rcx), %eax
+	movl	%eax, (%rdx)
+# ifdef USE_AS_STPCPY
+	lea	3(%rdx), %rax	/* Return value: address of the last byte copied.  */
+# else
+	mov	%rdi, %rax	/* Return value: %rdi (presumably the caller's destination argument - confirm).  */
+# endif
+# ifdef USE_AS_STRNCPY
+	sub	$4, %r8	/* Count remaining after these 4 bytes.  */
+	lea	4(%rdx), %rcx	/* %rcx = first destination byte not yet written.  */
+	jnz	L(StrncpyFillTailWithZero1)	/* lea leaves the flags alone, so this tests the sub: count left over -> zero-fill.  */
+# ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)	/* CF = 1 iff the last byte copied is NUL.  */
+	sbb	$-1, %rax	/* %rax += 1 - CF: step past the last byte unless it is NUL.  */
+# endif
+# endif
+	ret
+
+	.p2align 4
+L(Exit5):	/* Copy 5 bytes from (%rcx) to (%rdx) (4 + 1), set the return value, and return or hand off to the zero-fill.  */
+	movl	(%rcx), %eax
+	movl	%eax, (%rdx)
+	movb	4(%rcx), %al
+	movb	%al, 4(%rdx)
+# ifdef USE_AS_STPCPY
+	lea	4(%rdx), %rax	/* Return value: address of the last byte copied.  */
+# else
+	mov	%rdi, %rax	/* Return value: %rdi (presumably the caller's destination argument - confirm).  */
+# endif
+# ifdef USE_AS_STRNCPY
+	sub	$5, %r8	/* Count remaining after these 5 bytes.  */
+	lea	5(%rdx), %rcx	/* %rcx = first destination byte not yet written.  */
+	jnz	L(StrncpyFillTailWithZero1)	/* lea leaves the flags alone, so this tests the sub: count left over -> zero-fill.  */
+# ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)	/* CF = 1 iff the last byte copied is NUL.  */
+	sbb	$-1, %rax	/* %rax += 1 - CF: step past the last byte unless it is NUL.  */
+# endif
+# endif
+	ret
+
+	.p2align 4
+L(Exit6):	/* Copy 6 bytes from (%rcx) to (%rdx) (4 + 2), set the return value, and return or hand off to the zero-fill.  */
+	movl	(%rcx), %eax
+	movl	%eax, (%rdx)
+	movw	4(%rcx), %ax
+	movw	%ax, 4(%rdx)
+# ifdef USE_AS_STPCPY
+	lea	5(%rdx), %rax	/* Return value: address of the last byte copied.  */
+# else
+	mov	%rdi, %rax	/* Return value: %rdi (presumably the caller's destination argument - confirm).  */
+# endif
+# ifdef USE_AS_STRNCPY
+	sub	$6, %r8	/* Count remaining after these 6 bytes.  */
+	lea	6(%rdx), %rcx	/* %rcx = first destination byte not yet written.  */
+	jnz	L(StrncpyFillTailWithZero1)	/* lea leaves the flags alone, so this tests the sub: count left over -> zero-fill.  */
+# ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)	/* CF = 1 iff the last byte copied is NUL.  */
+	sbb	$-1, %rax	/* %rax += 1 - CF: step past the last byte unless it is NUL.  */
+# endif
+# endif
+	ret
+
+	.p2align 4
+L(Exit7):	/* Copy 7 bytes from (%rcx) to (%rdx) with two overlapping 4-byte moves, set the return value, and return or hand off to the zero-fill.  */
+	movl	(%rcx), %eax
+	movl	%eax, (%rdx)
+	movl	3(%rcx), %eax	/* Bytes 3..6; byte 3 is rewritten with the same value.  */
+	movl	%eax, 3(%rdx)
+# ifdef USE_AS_STPCPY
+	lea	6(%rdx), %rax	/* Return value: address of the last byte copied.  */
+# else
+	mov	%rdi, %rax	/* Return value: %rdi (presumably the caller's destination argument - confirm).  */
+# endif
+# ifdef USE_AS_STRNCPY
+	sub	$7, %r8	/* Count remaining after these 7 bytes.  */
+	lea	7(%rdx), %rcx	/* %rcx = first destination byte not yet written.  */
+	jnz	L(StrncpyFillTailWithZero1)	/* lea leaves the flags alone, so this tests the sub: count left over -> zero-fill.  */
+# ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)	/* CF = 1 iff the last byte copied is NUL.  */
+	sbb	$-1, %rax	/* %rax += 1 - CF: step past the last byte unless it is NUL.  */
+# endif
+# endif
+	ret
+
+	.p2align 4
+L(Exit9):	/* Copy 9 bytes from (%rcx) to (%rdx) with an 8-byte move plus an overlapping 4-byte move, set the return value, and return or hand off to the zero-fill.  */
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	5(%rcx), %eax	/* Bytes 5..8; bytes 5..7 are rewritten with the same values.  */
+	mov	%eax, 5(%rdx)
+# ifdef USE_AS_STPCPY
+	lea	8(%rdx), %rax	/* Return value: address of the last byte copied.  */
+# else
+	mov	%rdi, %rax	/* Return value: %rdi (presumably the caller's destination argument - confirm).  */
+# endif
+# ifdef USE_AS_STRNCPY
+	sub	$9, %r8	/* Count remaining after these 9 bytes.  */
+	lea	9(%rdx), %rcx	/* %rcx = first destination byte not yet written.  */
+	jnz	L(StrncpyFillTailWithZero1)	/* lea leaves the flags alone, so this tests the sub: count left over -> zero-fill.  */
+# ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)	/* CF = 1 iff the last byte copied is NUL.  */
+	sbb	$-1, %rax	/* %rax += 1 - CF: step past the last byte unless it is NUL.  */
+# endif
+# endif
+	ret
+
+	.p2align 4
+L(Exit10):	/* Copy 10 bytes from (%rcx) to (%rdx) with an 8-byte move plus an overlapping 4-byte move, set the return value, and return or hand off to the zero-fill.  */
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	6(%rcx), %eax	/* Bytes 6..9; bytes 6..7 are rewritten with the same values.  */
+	mov	%eax, 6(%rdx)
+# ifdef USE_AS_STPCPY
+	lea	9(%rdx), %rax	/* Return value: address of the last byte copied.  */
+# else
+	mov	%rdi, %rax	/* Return value: %rdi (presumably the caller's destination argument - confirm).  */
+# endif
+# ifdef USE_AS_STRNCPY
+	sub	$10, %r8	/* Count remaining after these 10 bytes.  */
+	lea	10(%rdx), %rcx	/* %rcx = first destination byte not yet written.  */
+	jnz	L(StrncpyFillTailWithZero1)	/* lea leaves the flags alone, so this tests the sub: count left over -> zero-fill.  */
+# ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)	/* CF = 1 iff the last byte copied is NUL.  */
+	sbb	$-1, %rax	/* %rax += 1 - CF: step past the last byte unless it is NUL.  */
+# endif
+# endif
+	ret
+
+	.p2align 4
+L(Exit11):	/* Copy 11 bytes from (%rcx) to (%rdx) with an 8-byte move plus an overlapping 4-byte move, set the return value, and return or hand off to the zero-fill.  */
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	7(%rcx), %eax	/* Bytes 7..10; byte 7 is rewritten with the same value.  */
+	mov	%eax, 7(%rdx)
+# ifdef USE_AS_STPCPY
+	lea	10(%rdx), %rax	/* Return value: address of the last byte copied.  */
+# else
+	mov	%rdi, %rax	/* Return value: %rdi (presumably the caller's destination argument - confirm).  */
+# endif
+# ifdef USE_AS_STRNCPY
+	sub	$11, %r8	/* Count remaining after these 11 bytes.  */
+	lea	11(%rdx), %rcx	/* %rcx = first destination byte not yet written.  */
+	jnz	L(StrncpyFillTailWithZero1)	/* lea leaves the flags alone, so this tests the sub: count left over -> zero-fill.  */
+# ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)	/* CF = 1 iff the last byte copied is NUL.  */
+	sbb	$-1, %rax	/* %rax += 1 - CF: step past the last byte unless it is NUL.  */
+# endif
+# endif
+	ret
+
+	.p2align 4
+L(Exit12):	/* Copy 12 bytes from (%rcx) to (%rdx) (8 + 4), set the return value, and return or hand off to the zero-fill.  */
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %eax
+	mov	%eax, 8(%rdx)
+# ifdef USE_AS_STPCPY
+	lea	11(%rdx), %rax	/* Return value: address of the last byte copied.  */
+# else
+	mov	%rdi, %rax	/* Return value: %rdi (presumably the caller's destination argument - confirm).  */
+# endif
+# ifdef USE_AS_STRNCPY
+	sub	$12, %r8	/* Count remaining after these 12 bytes.  */
+	lea	12(%rdx), %rcx	/* %rcx = first destination byte not yet written.  */
+	jnz	L(StrncpyFillTailWithZero1)	/* lea leaves the flags alone, so this tests the sub: count left over -> zero-fill.  */
+# ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)	/* CF = 1 iff the last byte copied is NUL.  */
+	sbb	$-1, %rax	/* %rax += 1 - CF: step past the last byte unless it is NUL.  */
+# endif
+# endif
+	ret
+
+	.p2align 4
+L(Exit13):	/* Copy 13 bytes from (%rcx) to (%rdx) with two overlapping 8-byte moves, set the return value, and return or hand off to the zero-fill.  */
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	5(%rcx), %rax	/* Bytes 5..12; bytes 5..7 are rewritten with the same values.  */
+	mov	%rax, 5(%rdx)
+# ifdef USE_AS_STPCPY
+	lea	12(%rdx), %rax	/* Return value: address of the last byte copied.  */
+# else
+	mov	%rdi, %rax	/* Return value: %rdi (presumably the caller's destination argument - confirm).  */
+# endif
+# ifdef USE_AS_STRNCPY
+	sub	$13, %r8	/* Count remaining after these 13 bytes.  */
+	lea	13(%rdx), %rcx	/* %rcx = first destination byte not yet written.  */
+	jnz	L(StrncpyFillTailWithZero1)	/* lea leaves the flags alone, so this tests the sub: count left over -> zero-fill.  */
+# ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)	/* CF = 1 iff the last byte copied is NUL.  */
+	sbb	$-1, %rax	/* %rax += 1 - CF: step past the last byte unless it is NUL.  */
+# endif
+# endif
+	ret
+
+	.p2align 4
+L(Exit14):	/* Copy 14 bytes from (%rcx) to (%rdx) with two overlapping 8-byte moves, set the return value, and return or hand off to the zero-fill.  */
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	6(%rcx), %rax	/* Bytes 6..13; bytes 6..7 are rewritten with the same values.  */
+	mov	%rax, 6(%rdx)
+# ifdef USE_AS_STPCPY
+	lea	13(%rdx), %rax	/* Return value: address of the last byte copied.  */
+# else
+	mov	%rdi, %rax	/* Return value: %rdi (presumably the caller's destination argument - confirm).  */
+# endif
+# ifdef USE_AS_STRNCPY
+	sub	$14, %r8	/* Count remaining after these 14 bytes.  */
+	lea	14(%rdx), %rcx	/* %rcx = first destination byte not yet written.  */
+	jnz	L(StrncpyFillTailWithZero1)	/* lea leaves the flags alone, so this tests the sub: count left over -> zero-fill.  */
+# ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)	/* CF = 1 iff the last byte copied is NUL.  */
+	sbb	$-1, %rax	/* %rax += 1 - CF: step past the last byte unless it is NUL.  */
+# endif
+# endif
+	ret
+
+	.p2align 4
+L(Exit15):	/* Copy 15 bytes from (%rcx) to (%rdx) with two overlapping 8-byte moves, set the return value, and return or hand off to the zero-fill.  */
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	7(%rcx), %rax	/* Bytes 7..14; byte 7 is rewritten with the same value.  */
+	mov	%rax, 7(%rdx)
+# ifdef USE_AS_STPCPY
+	lea	14(%rdx), %rax	/* Return value: address of the last byte copied.  */
+# else
+	mov	%rdi, %rax	/* Return value: %rdi (presumably the caller's destination argument - confirm).  */
+# endif
+# ifdef USE_AS_STRNCPY
+	sub	$15, %r8	/* Count remaining after these 15 bytes.  */
+	lea	15(%rdx), %rcx	/* %rcx = first destination byte not yet written.  */
+	jnz	L(StrncpyFillTailWithZero1)	/* lea leaves the flags alone, so this tests the sub: count left over -> zero-fill.  */
+# ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)	/* CF = 1 iff the last byte copied is NUL.  */
+	sbb	$-1, %rax	/* %rax += 1 - CF: step past the last byte unless it is NUL.  */
+# endif
+# endif
+	ret
+
+# ifdef USE_AS_STRNCPY
+	.p2align 4
+L(Fill0):	/* L(Fill<k>): store k bytes taken from %rdx at (%rcx) and return; %rax is left untouched.  L(StrncpyFillTailWithZero1) zeroes %rdx before dispatching here.  */
+	ret
+
+	.p2align 4
+L(Fill1):
+	movb	%dl, (%rcx)
+	ret
+
+	.p2align 4
+L(Fill2):
+	movw	%dx, (%rcx)
+	ret
+
+	.p2align 4
+L(Fill3):	/* 2 + 1 bytes.  */
+	movw	%dx, (%rcx)
+	movb	%dl, 2(%rcx)
+	ret
+
+	.p2align 4
+L(Fill4):
+	movl	%edx, (%rcx)
+	ret
+
+	.p2align 4
+L(Fill5):	/* 4 + 1 bytes.  */
+	movl	%edx, (%rcx)
+	movb	%dl, 4(%rcx)
+	ret
+
+	.p2align 4
+L(Fill6):	/* 4 + 2 bytes.  */
+	movl	%edx, (%rcx)
+	movw	%dx, 4(%rcx)
+	ret
+
+	.p2align 4
+L(Fill7):	/* Two 4-byte stores overlapping at byte 3.  */
+	movl	%edx, (%rcx)
+	movl	%edx, 3(%rcx)
+	ret
+
+	.p2align 4
+L(Fill8):
+	mov	%rdx, (%rcx)
+	ret
+
+	.p2align 4
+L(Fill9):	/* 8 + 1 bytes.  */
+	mov	%rdx, (%rcx)
+	movb	%dl, 8(%rcx)
+	ret
+
+	.p2align 4
+L(Fill10):	/* 8 + 2 bytes.  */
+	mov	%rdx, (%rcx)
+	movw	%dx, 8(%rcx)
+	ret
+
+	.p2align 4
+L(Fill11):	/* 8-byte store plus a 4-byte store overlapping at byte 7.  */
+	mov	%rdx, (%rcx)
+	movl	%edx, 7(%rcx)
+	ret
+
+	.p2align 4
+L(Fill12):	/* 8 + 4 bytes.  */
+	mov	%rdx, (%rcx)
+	movl	%edx, 8(%rcx)
+	ret
+
+	.p2align 4
+L(Fill13):	/* Two 8-byte stores overlapping at bytes 5..7.  */
+	mov	%rdx, (%rcx)
+	mov	%rdx, 5(%rcx)
+	ret
+
+	.p2align 4
+L(Fill14):	/* Two 8-byte stores overlapping at bytes 6..7.  */
+	mov	%rdx, (%rcx)
+	mov	%rdx, 6(%rcx)
+	ret
+
+	.p2align 4
+L(Fill15):	/* Two 8-byte stores overlapping at byte 7.  */
+	mov	%rdx, (%rcx)
+	mov	%rdx, 7(%rcx)
+	ret
+
+	.p2align 4
+L(Fill16):	/* 8 + 8 bytes.  */
+	mov	%rdx, (%rcx)
+	mov	%rdx, 8(%rcx)
+	ret
+
+	.p2align 4
+L(StrncpyFillExit1):	/* Entry for callers whose %r8 is 16 too low: add it back, then dispatch.  */
+	lea	16(%r8), %r8
+L(FillFrom1To16Bytes):	/* Jump to L(Fill<k>) where k is the value of %r8 (0..16).  */
+	test	%r8, %r8
+	jz	L(Fill0)
+	cmp	$16, %r8
+	je	L(Fill16)
+	cmp	$8, %r8	/* Decision tree on %r8; each je/jg/jl group reuses the flags of one cmp.  */
+	je	L(Fill8)
+	jg	L(FillMore8)
+	cmp	$4, %r8
+	je	L(Fill4)
+	jg	L(FillMore4)
+	cmp	$2, %r8
+	jl	L(Fill1)
+	je	L(Fill2)
+	jg	L(Fill3)	/* Always taken once jl and je have fallen through.  */
+L(FillMore8): /* but less than 16 */
+	cmp	$12, %r8
+	je	L(Fill12)
+	jl	L(FillLess12)
+	cmp	$14, %r8
+	jl	L(Fill13)
+	je	L(Fill14)
+	jg	L(Fill15)
+L(FillMore4): /* but less than 8 */
+	cmp	$6, %r8
+	jl	L(Fill5)
+	je	L(Fill6)
+	jg	L(Fill7)
+L(FillLess12): /* but more than 8 */
+	cmp	$10, %r8
+	jl	L(Fill9)
+	je	L(Fill10)
+	jmp	L(Fill11)
+
+	.p2align 4
+L(StrncpyFillTailWithZero1):	/* Write %r8 zero bytes starting at %rcx; %rax (the return value set by the L(Exit<k>) code) is not modified.  */
+	xor	%rdx, %rdx	/* %rdx = 0: the value the L(Fill<k>) stores write.  */
+	sub	$16, %r8
+	jbe	L(StrncpyFillExit1)	/* 16 bytes or fewer: finish with one L(Fill<k>).  */
+
+	pxor	%xmm0, %xmm0	/* Zero vector for the aligned stores below.  */
+	mov	%rdx, (%rcx)	/* First 16 zero bytes via two 8-byte stores (no alignment requirement).  */
+	mov	%rdx, 8(%rcx)
+
+	lea	16(%rcx), %rcx
+
+	mov	%rcx, %rdx
+	and	$0xf, %rdx	/* %rdx = misalignment of %rcx within 16 bytes.  */
+	sub	%rdx, %rcx	/* Round %rcx down to a 16-byte boundary (re-covers bytes already zeroed)...  */
+	add	%rdx, %r8	/* ...and add those bytes back to the count.  */
+	xor	%rdx, %rdx	/* %rdx must be zero again for the L(Fill<k>) stores.  */
+	sub	$64, %r8
+	jb	L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):	/* 64 zero bytes per iteration with aligned stores.  */
+	movdqa	%xmm0, (%rcx)
+	movdqa	%xmm0, 16(%rcx)
+	movdqa	%xmm0, 32(%rcx)
+	movdqa	%xmm0, 48(%rcx)
+	lea	64(%rcx), %rcx
+	sub	$64, %r8
+	jae	L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):	/* Here %r8 = bytes left - 64 (negative).  */
+	add	$32, %r8
+	jl	L(StrncpyFillLess32)	/* Fewer than 32 bytes left.  */
+	movdqa	%xmm0, (%rcx)
+	movdqa	%xmm0, 16(%rcx)
+	lea	32(%rcx), %rcx
+	sub	$16, %r8
+	jl	L(StrncpyFillExit1)	/* Fewer than 16 left; L(StrncpyFillExit1) adds the 16 back.  */
+	movdqa	%xmm0, (%rcx)
+	lea	16(%rcx), %rcx
+	jmp	L(FillFrom1To16Bytes)
+
+L(StrncpyFillLess32):	/* Here %r8 = bytes left - 32 (negative).  */
+	add	$16, %r8
+	jl	L(StrncpyFillExit1)	/* Fewer than 16 left; L(StrncpyFillExit1) adds the 16 back.  */
+	movdqa	%xmm0, (%rcx)
+	lea	16(%rcx), %rcx
+	jmp	L(FillFrom1To16Bytes)
+
+	.p2align 4
+L(Exit0):	/* Return %rdx without copying anything (presumably the zero-length case, with %rdx holding the destination - confirm at callers).  */
+	mov	%rdx, %rax
+	ret
+
+	.p2align 4
+L(StrncpyExit15Bytes):	/* For k = 9..14: go to L(Exit<k>) if the count in %r8 is k or source byte k-1 is NUL; otherwise copy 15 bytes and return.  Presumably reached with bytes 0..7 known non-NUL and %r8 at most 15 - confirm at callers.  */
+	cmp	$9, %r8
+	je	L(Exit9)
+	cmpb	$0, 8(%rcx)
+	jz	L(Exit9)
+	cmp	$10, %r8
+	je	L(Exit10)
+	cmpb	$0, 9(%rcx)
+	jz	L(Exit10)
+	cmp	$11, %r8
+	je	L(Exit11)
+	cmpb	$0, 10(%rcx)
+	jz	L(Exit11)
+	cmp	$12, %r8
+	je	L(Exit12)
+	cmpb	$0, 11(%rcx)
+	jz	L(Exit12)
+	cmp	$13, %r8
+	je	L(Exit13)
+	cmpb	$0, 12(%rcx)
+	jz	L(Exit13)
+	cmp	$14, %r8
+	je	L(Exit14)
+	cmpb	$0, 13(%rcx)
+	jz	L(Exit14)
+	mov	(%rcx), %rax	/* 15 bytes via two 8-byte moves overlapping at byte 7.  */
+	mov	%rax, (%rdx)
+	mov	7(%rcx), %rax
+	mov	%rax, 7(%rdx)
+# ifdef USE_AS_STPCPY
+	lea	14(%rdx), %rax	/* Address of the last byte copied.  */
+	cmpb	$1, (%rax)	/* CF = 1 iff that byte is NUL.  */
+	sbb	$-1, %rax	/* %rax += 1 - CF: step past it unless it is NUL.  */
+# else
+	mov	%rdi, %rax	/* Return value: %rdi (presumably the caller's destination argument - confirm).  */
+# endif
+	ret
+
+	.p2align 4
+L(StrncpyExit8Bytes):	/* For k = 1..7: go to L(Exit<k>) if the count in %r8 is k or source byte k-1 is NUL; otherwise copy 8 bytes and return.  Presumably reached with %r8 at most 8 - confirm at callers.  */
+	cmp	$1, %r8
+	je	L(Exit1)
+	cmpb	$0, (%rcx)
+	jz	L(Exit1)
+	cmp	$2, %r8
+	je	L(Exit2)
+	cmpb	$0, 1(%rcx)
+	jz	L(Exit2)
+	cmp	$3, %r8
+	je	L(Exit3)
+	cmpb	$0, 2(%rcx)
+	jz	L(Exit3)
+	cmp	$4, %r8
+	je	L(Exit4)
+	cmpb	$0, 3(%rcx)
+	jz	L(Exit4)
+	cmp	$5, %r8
+	je	L(Exit5)
+	cmpb	$0, 4(%rcx)
+	jz	L(Exit5)
+	cmp	$6, %r8
+	je	L(Exit6)
+	cmpb	$0, 5(%rcx)
+	jz	L(Exit6)
+	cmp	$7, %r8
+	je	L(Exit7)
+	cmpb	$0, 6(%rcx)
+	jz	L(Exit7)
+	mov	(%rcx), %rax	/* Copy 8 bytes.  */
+	mov	%rax, (%rdx)
+# ifdef USE_AS_STPCPY
+	lea	7(%rdx), %rax	/* Address of the last byte copied.  */
+	cmpb	$1, (%rax)	/* CF = 1 iff that byte is NUL.  */
+	sbb	$-1, %rax	/* %rax += 1 - CF: step past it unless it is NUL.  */
+# else
+	mov	%rdi, %rax	/* Return value: %rdi (presumably the caller's destination argument - confirm).  */
+# endif
+	ret
+
+# endif
+
+# ifdef USE_AS_STRNCPY
+
+L(StrncpyLeaveCase2OrCase3):	/* Selects the leave path by whether the mask in %rax is non-zero.  */
+	test	%rax, %rax
+	jnz	L(Aligned64LeaveCase2)	/* Mask empty: fall through into L(Aligned64LeaveCase3).  */
+
+L(Aligned64LeaveCase3):	/* Count-limited leave without a match: store %xmm4/%xmm5/%xmm6 16 bytes at a time until 16 or fewer bytes of count remain, then finish in L(CopyFrom1To16BytesCase3).  Assumes %rsi and %rdx were set up by the caller, with %rdx already 64 past the block - confirm.  */
+	lea	64(%r8), %r8	/* Presumably undoes a "sub $64, %r8" at the caller - confirm.  */
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm4, -64(%rdx)
+	lea	16(%rsi), %rsi	/* %rsi is the offset that L(CopyFrom1To16BytesCase3) adds to %rcx and %rdx.  */
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm5, -48(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm6, -32(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* The target adds 16 back to %r8.  */
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(Aligned64LeaveCase2):	/* Count-limited leave with a match somewhere in %xmm4..%xmm7: test the chunks in order, storing each one that is passed, until the match or the end of the count is reached.  Assumes %xmm0 is all-zero on entry - confirm at callers.  */
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %rax
+	add	$48, %r8	/* Signed adjustment; presumably %r8 arrives 64 too low, leaving it 16 low as the targets expect - confirm.  */
+	jle	L(CopyFrom1To16BytesCase2OrCase3)	/* Count ends within the first chunk.  */
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)	/* Match in the first chunk, count extends beyond it.  */
+
+	pcmpeqb	%xmm5, %xmm0	/* %xmm0 is all-zero again here because the previous compare found nothing.  */
+	pmovmskb %xmm0, %rax
+	movaps	%xmm4, -64(%rdx)
+	lea	16(%rsi), %rsi	/* Advance the offset the targets add to %rcx and %rdx.  */
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm5, -48(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm6, -32(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* The target adds 16 back to %r8.  */
+	jmp	L(CopyFrom1To16BytesCase2)
+/*--------------------------------------------------*/
+L(StrncpyExit1Case2OrCase3):	/* Write the upper 15 bytes of %xmm1 to the low 15 bytes of the block at (%rdx); its top byte is preserved.  */
+	movaps	(%rdx), %xmm6	/* Current destination block (aligned load).  */
+	psrldq	$15, %xmm6	/* Move its top byte down to byte 0.  */
+	mov	$15, %rsi	/* %rsi = 15, the number of bytes merged from %xmm1.  */
+	palignr	$1, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 1 byte.  */
+	movaps	%xmm6, (%rdx)
+	test	%rax, %rax	/* Non-zero %rax selects the Case2 tail, zero the Case3 tail.  */
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit2Case2OrCase3):	/* Write the upper 14 bytes of %xmm1 to the low 14 bytes of the block at (%rdx); its top 2 bytes are preserved.  */
+	movaps	(%rdx), %xmm6	/* Current destination block (aligned load).  */
+	psrldq	$14, %xmm6	/* Move its top 2 bytes down to the bottom.  */
+	mov	$14, %rsi	/* %rsi = 14, the number of bytes merged from %xmm1.  */
+	palignr	$2, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 2 bytes.  */
+	movaps	%xmm6, (%rdx)
+	test	%rax, %rax	/* Non-zero %rax selects the Case2 tail, zero the Case3 tail.  */
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit3Case2OrCase3):	/* Write the upper 13 bytes of %xmm1 to the low 13 bytes of the block at (%rdx); its top 3 bytes are preserved.  */
+	movaps	(%rdx), %xmm6	/* Current destination block (aligned load).  */
+	psrldq	$13, %xmm6	/* Move its top 3 bytes down to the bottom.  */
+	mov	$13, %rsi	/* %rsi = 13, the number of bytes merged from %xmm1.  */
+	palignr	$3, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 3 bytes.  */
+	movaps	%xmm6, (%rdx)
+	test	%rax, %rax	/* Non-zero %rax selects the Case2 tail, zero the Case3 tail.  */
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit4Case2OrCase3):	/* Write the upper 12 bytes of %xmm1 to the low 12 bytes of the block at (%rdx); its top 4 bytes are preserved.  */
+	movaps	(%rdx), %xmm6	/* Current destination block (aligned load).  */
+	psrldq	$12, %xmm6	/* Move its top 4 bytes down to the bottom.  */
+	mov	$12, %rsi	/* %rsi = 12, the number of bytes merged from %xmm1.  */
+	palignr	$4, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 4 bytes.  */
+	movaps	%xmm6, (%rdx)
+	test	%rax, %rax	/* Non-zero %rax selects the Case2 tail, zero the Case3 tail.  */
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit5Case2OrCase3):	/* Write the upper 11 bytes of %xmm1 to the low 11 bytes of the block at (%rdx); its top 5 bytes are preserved.  */
+	movaps	(%rdx), %xmm6	/* Current destination block (aligned load).  */
+	psrldq	$11, %xmm6	/* Move its top 5 bytes down to the bottom.  */
+	mov	$11, %rsi	/* %rsi = 11, the number of bytes merged from %xmm1.  */
+	palignr	$5, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 5 bytes.  */
+	movaps	%xmm6, (%rdx)
+	test	%rax, %rax	/* Non-zero %rax selects the Case2 tail, zero the Case3 tail.  */
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit6Case2OrCase3):	/* Write the upper 10 bytes of %xmm1 to the low 10 bytes of the block at (%rdx); its top 6 bytes are preserved.  */
+	movaps	(%rdx), %xmm6	/* Current destination block (aligned load).  */
+	psrldq	$10, %xmm6	/* Move its top 6 bytes down to the bottom.  */
+	mov	$10, %rsi	/* %rsi = 10, the number of bytes merged from %xmm1.  */
+	palignr	$6, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 6 bytes.  */
+	movaps	%xmm6, (%rdx)
+	test	%rax, %rax	/* Non-zero %rax selects the Case2 tail, zero the Case3 tail.  */
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit7Case2OrCase3):	/* Write the upper 9 bytes of %xmm1 to the low 9 bytes of the block at (%rdx); its top 7 bytes are preserved.  */
+	movaps	(%rdx), %xmm6	/* Current destination block (aligned load).  */
+	psrldq	$9, %xmm6	/* Move its top 7 bytes down to the bottom.  */
+	mov	$9, %rsi	/* %rsi = 9, the number of bytes merged from %xmm1.  */
+	palignr	$7, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 7 bytes.  */
+	movaps	%xmm6, (%rdx)
+	test	%rax, %rax	/* Non-zero %rax selects the Case2 tail, zero the Case3 tail.  */
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit8Case2OrCase3):	/* Write the upper 8 bytes of %xmm1 to the low 8 bytes of the block at (%rdx); its top 8 bytes are preserved.  */
+	movaps	(%rdx), %xmm6	/* Current destination block (aligned load).  */
+	psrldq	$8, %xmm6	/* Move its top 8 bytes down to the bottom.  */
+	mov	$8, %rsi	/* %rsi = 8, the number of bytes merged from %xmm1.  */
+	palignr	$8, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 8 bytes.  */
+	movaps	%xmm6, (%rdx)
+	test	%rax, %rax	/* Non-zero %rax selects the Case2 tail, zero the Case3 tail.  */
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit9Case2OrCase3):	/* Write the upper 7 bytes of %xmm1 to the low 7 bytes of the block at (%rdx); its top 9 bytes are preserved.  */
+	movaps	(%rdx), %xmm6	/* Current destination block (aligned load).  */
+	psrldq	$7, %xmm6	/* Move its top 9 bytes down to the bottom.  */
+	mov	$7, %rsi	/* %rsi = 7, the number of bytes merged from %xmm1.  */
+	palignr	$9, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 9 bytes.  */
+	movaps	%xmm6, (%rdx)
+	test	%rax, %rax	/* Non-zero %rax selects the Case2 tail, zero the Case3 tail.  */
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit10Case2OrCase3):	/* Write the upper 6 bytes of %xmm1 to the low 6 bytes of the block at (%rdx); its top 10 bytes are preserved.  */
+	movaps	(%rdx), %xmm6	/* Current destination block (aligned load).  */
+	psrldq	$6, %xmm6	/* Move its top 10 bytes down to the bottom.  */
+	mov	$6, %rsi	/* %rsi = 6, the number of bytes merged from %xmm1.  */
+	palignr	$10, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 10 bytes.  */
+	movaps	%xmm6, (%rdx)
+	test	%rax, %rax	/* Non-zero %rax selects the Case2 tail, zero the Case3 tail.  */
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit11Case2OrCase3):	/* Write the upper 5 bytes of %xmm1 to the low 5 bytes of the block at (%rdx); its top 11 bytes are preserved.  */
+	movaps	(%rdx), %xmm6	/* Current destination block (aligned load).  */
+	psrldq	$5, %xmm6	/* Move its top 11 bytes down to the bottom.  */
+	mov	$5, %rsi	/* %rsi = 5, the number of bytes merged from %xmm1.  */
+	palignr	$11, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 11 bytes.  */
+	movaps	%xmm6, (%rdx)
+	test	%rax, %rax	/* Non-zero %rax selects the Case2 tail, zero the Case3 tail.  */
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit12Case2OrCase3):	/* Write the upper 4 bytes of %xmm1 to the low 4 bytes of the block at (%rdx); its top 12 bytes are preserved.  */
+	movaps	(%rdx), %xmm6	/* Current destination block (aligned load).  */
+	psrldq	$4, %xmm6	/* Move its top 12 bytes down to the bottom.  */
+	mov	$4, %rsi	/* %rsi = 4, the number of bytes merged from %xmm1.  */
+	palignr	$12, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 12 bytes.  */
+	movaps	%xmm6, (%rdx)
+	test	%rax, %rax	/* Non-zero %rax selects the Case2 tail, zero the Case3 tail.  */
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit13Case2OrCase3):	/* Write the upper 3 bytes of %xmm1 to the low 3 bytes of the block at (%rdx); its top 13 bytes are preserved.  */
+	movaps	(%rdx), %xmm6	/* Current destination block (aligned load).  */
+	psrldq	$3, %xmm6	/* Move its top 13 bytes down to the bottom.  */
+	mov	$3, %rsi	/* %rsi = 3, the number of bytes merged from %xmm1.  */
+	palignr	$13, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 13 bytes.  */
+	movaps	%xmm6, (%rdx)
+	test	%rax, %rax	/* Non-zero %rax selects the Case2 tail, zero the Case3 tail.  */
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit14Case2OrCase3):	/* Write the upper 2 bytes of %xmm1 to the low 2 bytes of the block at (%rdx); its top 14 bytes are preserved.  */
+	movaps	(%rdx), %xmm6	/* Current destination block (aligned load).  */
+	psrldq	$2, %xmm6	/* Move its top 14 bytes down to the bottom.  */
+	mov	$2, %rsi	/* %rsi = 2, the number of bytes merged from %xmm1.  */
+	palignr	$14, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 14 bytes.  */
+	movaps	%xmm6, (%rdx)
+	test	%rax, %rax	/* Non-zero %rax selects the Case2 tail, zero the Case3 tail.  */
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit15Case2OrCase3):	/* Write the top byte of %xmm1 to byte 0 of the block at (%rdx); its top 15 bytes are preserved.  */
+	movaps	(%rdx), %xmm6	/* Current destination block (aligned load).  */
+	psrldq	$1, %xmm6	/* Move its top 15 bytes down to the bottom.  */
+	mov	$1, %rsi	/* %rsi = 1, the number of bytes merged from %xmm1.  */
+	palignr	$15, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 15 bytes.  */
+	movaps	%xmm6, (%rdx)
+	test	%rax, %rax	/* Non-zero %rax selects the Case2 tail, zero the Case3 tail.  */
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave1):	/* Store up to four more 16-byte blocks (palignr shift 1) while %r8 permits, then fall into L(StrncpyExit1).  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2 before palignr overwrites it.  */
+	add	$48, %r8
+	jle	L(StrncpyExit1)	/* Signed result <= 0: go straight to the partial-block exit.  */
+	palignr	$1, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 1 byte.  */
+	movaps	%xmm3, %xmm1	/* The saved block becomes the low half of the next merge.  */
+	movaps	%xmm2, (%rdx)
+	movaps	31(%rcx), %xmm2	/* Aligned load of the next source block.  */
+	lea	16(%rsi), %rsi	/* %rsi advances by 16 for every block stored.  */
+	movaps	%xmm2, %xmm3
+	sub	$16, %r8
+	jbe	L(StrncpyExit1)	/* Taken when %r8 was at most 16 (unsigned) before the subtraction.  */
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	movaps	31+16(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit1)
+	movaps	%xmm2, %xmm1
+	movaps	%xmm4, 32(%rdx)	/* %xmm4 and %xmm5 are stored as-is; presumably already merged by the caller -- confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit1)
+	movaps	%xmm7, %xmm1	/* %xmm7 supplies the low half for the final merge below.  */
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* Unconditional %r8 -= 16; no flag test follows.  */
+
+L(StrncpyExit1):	/* Merge the upper 15 bytes of %xmm1 into the block at (%rdx, %rsi), keeping that block's top byte.  */
+	movaps	(%rdx, %rsi), %xmm6
+	psrldq	$15, %xmm6	/* Move the destination block's top byte down to byte 0.  */
+	palignr	$1, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 1 byte.  */
+	movaps	%xmm6, (%rdx, %rsi)
+	lea	15(%rsi), %rsi	/* Advance %rsi by the 15 bytes just merged.  */
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave2):	/* Store up to four more 16-byte blocks (palignr shift 2) while %r8 permits, then fall into L(StrncpyExit2).  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2 before palignr overwrites it.  */
+	add	$48, %r8
+	jle	L(StrncpyExit2)	/* Signed result <= 0: go straight to the partial-block exit.  */
+	palignr	$2, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 2 bytes.  */
+	movaps	%xmm3, %xmm1	/* The saved block becomes the low half of the next merge.  */
+	movaps	%xmm2, (%rdx)
+	movaps	30(%rcx), %xmm2	/* Aligned load of the next source block.  */
+	lea	16(%rsi), %rsi	/* %rsi advances by 16 for every block stored.  */
+	movaps	%xmm2, %xmm3
+	sub	$16, %r8
+	jbe	L(StrncpyExit2)	/* Taken when %r8 was at most 16 (unsigned) before the subtraction.  */
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	movaps	30+16(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit2)
+	movaps	%xmm2, %xmm1
+	movaps	%xmm4, 32(%rdx)	/* %xmm4 and %xmm5 are stored as-is; presumably already merged by the caller -- confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit2)
+	movaps	%xmm7, %xmm1	/* %xmm7 supplies the low half for the final merge below.  */
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* Unconditional %r8 -= 16; no flag test follows.  */
+
+L(StrncpyExit2):	/* Merge the upper 14 bytes of %xmm1 into the block at (%rdx, %rsi), keeping that block's top 2 bytes.  */
+	movaps	(%rdx, %rsi), %xmm6
+	psrldq	$14, %xmm6	/* Move the destination block's top 2 bytes down to the bottom.  */
+	palignr	$2, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 2 bytes.  */
+	movaps	%xmm6, (%rdx, %rsi)
+	lea	14(%rsi), %rsi	/* Advance %rsi by the 14 bytes just merged.  */
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave3):	/* Store up to four more 16-byte blocks (palignr shift 3) while %r8 permits, then fall into L(StrncpyExit3).  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2 before palignr overwrites it.  */
+	add	$48, %r8
+	jle	L(StrncpyExit3)	/* Signed result <= 0: go straight to the partial-block exit.  */
+	palignr	$3, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 3 bytes.  */
+	movaps	%xmm3, %xmm1	/* The saved block becomes the low half of the next merge.  */
+	movaps	%xmm2, (%rdx)
+	movaps	29(%rcx), %xmm2	/* Aligned load of the next source block.  */
+	lea	16(%rsi), %rsi	/* %rsi advances by 16 for every block stored.  */
+	movaps	%xmm2, %xmm3
+	sub	$16, %r8
+	jbe	L(StrncpyExit3)	/* Taken when %r8 was at most 16 (unsigned) before the subtraction.  */
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	movaps	29+16(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit3)
+	movaps	%xmm2, %xmm1
+	movaps	%xmm4, 32(%rdx)	/* %xmm4 and %xmm5 are stored as-is; presumably already merged by the caller -- confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit3)
+	movaps	%xmm7, %xmm1	/* %xmm7 supplies the low half for the final merge below.  */
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* Unconditional %r8 -= 16; no flag test follows.  */
+
+L(StrncpyExit3):	/* Merge the upper 13 bytes of %xmm1 into the block at (%rdx, %rsi), keeping that block's top 3 bytes.  */
+	movaps	(%rdx, %rsi), %xmm6
+	psrldq	$13, %xmm6	/* Move the destination block's top 3 bytes down to the bottom.  */
+	palignr	$3, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 3 bytes.  */
+	movaps	%xmm6, (%rdx, %rsi)
+	lea	13(%rsi), %rsi	/* Advance %rsi by the 13 bytes just merged.  */
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave4):	/* Store up to four more 16-byte blocks (palignr shift 4) while %r8 permits, then fall into L(StrncpyExit4).  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2 before palignr overwrites it.  */
+	add	$48, %r8
+	jle	L(StrncpyExit4)	/* Signed result <= 0: go straight to the partial-block exit.  */
+	palignr	$4, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 4 bytes.  */
+	movaps	%xmm3, %xmm1	/* The saved block becomes the low half of the next merge.  */
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2	/* Aligned load of the next source block.  */
+	lea	16(%rsi), %rsi	/* %rsi advances by 16 for every block stored.  */
+	movaps	%xmm2, %xmm3
+	sub	$16, %r8
+	jbe	L(StrncpyExit4)	/* Taken when %r8 was at most 16 (unsigned) before the subtraction.  */
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	movaps	28+16(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit4)
+	movaps	%xmm2, %xmm1
+	movaps	%xmm4, 32(%rdx)	/* %xmm4 and %xmm5 are stored as-is; presumably already merged by the caller -- confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit4)
+	movaps	%xmm7, %xmm1	/* %xmm7 supplies the low half for the final merge below.  */
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* Unconditional %r8 -= 16; no flag test follows.  */
+
+L(StrncpyExit4):	/* Merge the upper 12 bytes of %xmm1 into the block at (%rdx, %rsi), keeping that block's top 4 bytes.  */
+	movaps	(%rdx, %rsi), %xmm6
+	psrldq	$12, %xmm6	/* Move the destination block's top 4 bytes down to the bottom.  */
+	palignr	$4, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 4 bytes.  */
+	movaps	%xmm6, (%rdx, %rsi)
+	lea	12(%rsi), %rsi	/* Advance %rsi by the 12 bytes just merged.  */
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave5):	/* Store up to four more 16-byte blocks (palignr shift 5) while %r8 permits, then fall into L(StrncpyExit5).  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2 before palignr overwrites it.  */
+	add	$48, %r8
+	jle	L(StrncpyExit5)	/* Signed result <= 0: go straight to the partial-block exit.  */
+	palignr	$5, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 5 bytes.  */
+	movaps	%xmm3, %xmm1	/* The saved block becomes the low half of the next merge.  */
+	movaps	%xmm2, (%rdx)
+	movaps	27(%rcx), %xmm2	/* Aligned load of the next source block.  */
+	lea	16(%rsi), %rsi	/* %rsi advances by 16 for every block stored.  */
+	movaps	%xmm2, %xmm3
+	sub	$16, %r8
+	jbe	L(StrncpyExit5)	/* Taken when %r8 was at most 16 (unsigned) before the subtraction.  */
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	movaps	27+16(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit5)
+	movaps	%xmm2, %xmm1
+	movaps	%xmm4, 32(%rdx)	/* %xmm4 and %xmm5 are stored as-is; presumably already merged by the caller -- confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit5)
+	movaps	%xmm7, %xmm1	/* %xmm7 supplies the low half for the final merge below.  */
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* Unconditional %r8 -= 16; no flag test follows.  */
+
+L(StrncpyExit5):	/* Merge the upper 11 bytes of %xmm1 into the block at (%rdx, %rsi), keeping that block's top 5 bytes.  */
+	movaps	(%rdx, %rsi), %xmm6
+	psrldq	$11, %xmm6	/* Move the destination block's top 5 bytes down to the bottom.  */
+	palignr	$5, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 5 bytes.  */
+	movaps	%xmm6, (%rdx, %rsi)
+	lea	11(%rsi), %rsi	/* Advance %rsi by the 11 bytes just merged.  */
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave6):	/* Store up to four more 16-byte blocks (palignr shift 6) while %r8 permits, then fall into L(StrncpyExit6).  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2 before palignr overwrites it.  */
+	add	$48, %r8
+	jle	L(StrncpyExit6)	/* Signed result <= 0: go straight to the partial-block exit.  */
+	palignr	$6, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 6 bytes.  */
+	movaps	%xmm3, %xmm1	/* The saved block becomes the low half of the next merge.  */
+	movaps	%xmm2, (%rdx)
+	movaps	26(%rcx), %xmm2	/* Aligned load of the next source block.  */
+	lea	16(%rsi), %rsi	/* %rsi advances by 16 for every block stored.  */
+	movaps	%xmm2, %xmm3
+	sub	$16, %r8
+	jbe	L(StrncpyExit6)	/* Taken when %r8 was at most 16 (unsigned) before the subtraction.  */
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	movaps	26+16(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit6)
+	movaps	%xmm2, %xmm1
+	movaps	%xmm4, 32(%rdx)	/* %xmm4 and %xmm5 are stored as-is; presumably already merged by the caller -- confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit6)
+	movaps	%xmm7, %xmm1	/* %xmm7 supplies the low half for the final merge below.  */
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* Unconditional %r8 -= 16; no flag test follows.  */
+
+L(StrncpyExit6):	/* Merge the upper 10 bytes of %xmm1 into the block at (%rdx, %rsi), keeping that block's top 6 bytes.  */
+	movaps	(%rdx, %rsi), %xmm6
+	psrldq	$10, %xmm6	/* Move the destination block's top 6 bytes down to the bottom.  */
+	palignr	$6, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 6 bytes.  */
+	movaps	%xmm6, (%rdx, %rsi)
+	lea	10(%rsi), %rsi	/* Advance %rsi by the 10 bytes just merged.  */
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave7):	/* Store up to four more 16-byte blocks (palignr shift 7) while %r8 permits, then fall into L(StrncpyExit7).  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2 before palignr overwrites it.  */
+	add	$48, %r8
+	jle	L(StrncpyExit7)	/* Signed result <= 0: go straight to the partial-block exit.  */
+	palignr	$7, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 7 bytes.  */
+	movaps	%xmm3, %xmm1	/* The saved block becomes the low half of the next merge.  */
+	movaps	%xmm2, (%rdx)
+	movaps	25(%rcx), %xmm2	/* Aligned load of the next source block.  */
+	lea	16(%rsi), %rsi	/* %rsi advances by 16 for every block stored.  */
+	movaps	%xmm2, %xmm3
+	sub	$16, %r8
+	jbe	L(StrncpyExit7)	/* Taken when %r8 was at most 16 (unsigned) before the subtraction.  */
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	movaps	25+16(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit7)
+	movaps	%xmm2, %xmm1
+	movaps	%xmm4, 32(%rdx)	/* %xmm4 and %xmm5 are stored as-is; presumably already merged by the caller -- confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit7)
+	movaps	%xmm7, %xmm1	/* %xmm7 supplies the low half for the final merge below.  */
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* Unconditional %r8 -= 16; no flag test follows.  */
+
+L(StrncpyExit7):	/* Merge the upper 9 bytes of %xmm1 into the block at (%rdx, %rsi), keeping that block's top 7 bytes.  */
+	movaps	(%rdx, %rsi), %xmm6
+	psrldq	$9, %xmm6	/* Move the destination block's top 7 bytes down to the bottom.  */
+	palignr	$7, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 7 bytes.  */
+	movaps	%xmm6, (%rdx, %rsi)
+	lea	9(%rsi), %rsi	/* Advance %rsi by the 9 bytes just merged.  */
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave8):	/* Store up to four more 16-byte blocks (palignr shift 8) while %r8 permits, then fall into L(StrncpyExit8).  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2 before palignr overwrites it.  */
+	add	$48, %r8
+	jle	L(StrncpyExit8)	/* Signed result <= 0: go straight to the partial-block exit.  */
+	palignr	$8, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 8 bytes.  */
+	movaps	%xmm3, %xmm1	/* The saved block becomes the low half of the next merge.  */
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2	/* Aligned load of the next source block.  */
+	lea	16(%rsi), %rsi	/* %rsi advances by 16 for every block stored.  */
+	movaps	%xmm2, %xmm3
+	sub	$16, %r8
+	jbe	L(StrncpyExit8)	/* Taken when %r8 was at most 16 (unsigned) before the subtraction.  */
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	movaps	24+16(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit8)
+	movaps	%xmm2, %xmm1
+	movaps	%xmm4, 32(%rdx)	/* %xmm4 and %xmm5 are stored as-is; presumably already merged by the caller -- confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit8)
+	movaps	%xmm7, %xmm1	/* %xmm7 supplies the low half for the final merge below.  */
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* Unconditional %r8 -= 16; no flag test follows.  */
+
+L(StrncpyExit8):	/* Merge the upper 8 bytes of %xmm1 into the block at (%rdx, %rsi), keeping that block's top 8 bytes.  */
+	movaps	(%rdx, %rsi), %xmm6
+	psrldq	$8, %xmm6	/* Move the destination block's top 8 bytes down to the bottom.  */
+	palignr	$8, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 8 bytes.  */
+	movaps	%xmm6, (%rdx, %rsi)
+	lea	8(%rsi), %rsi	/* Advance %rsi by the 8 bytes just merged.  */
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave9):	/* Store up to four more 16-byte blocks (palignr shift 9) while %r8 permits, then fall into L(StrncpyExit9).  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2 before palignr overwrites it.  */
+	add	$48, %r8
+	jle	L(StrncpyExit9)	/* Signed result <= 0: go straight to the partial-block exit.  */
+	palignr	$9, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 9 bytes.  */
+	movaps	%xmm3, %xmm1	/* The saved block becomes the low half of the next merge.  */
+	movaps	%xmm2, (%rdx)
+	movaps	23(%rcx), %xmm2	/* Aligned load of the next source block.  */
+	lea	16(%rsi), %rsi	/* %rsi advances by 16 for every block stored.  */
+	movaps	%xmm2, %xmm3
+	sub	$16, %r8
+	jbe	L(StrncpyExit9)	/* Taken when %r8 was at most 16 (unsigned) before the subtraction.  */
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	movaps	23+16(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit9)
+	movaps	%xmm2, %xmm1
+	movaps	%xmm4, 32(%rdx)	/* %xmm4 and %xmm5 are stored as-is; presumably already merged by the caller -- confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit9)
+	movaps	%xmm7, %xmm1	/* %xmm7 supplies the low half for the final merge below.  */
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* Unconditional %r8 -= 16; no flag test follows.  */
+
+L(StrncpyExit9):	/* Merge the upper 7 bytes of %xmm1 into the block at (%rdx, %rsi), keeping that block's top 9 bytes.  */
+	movaps	(%rdx, %rsi), %xmm6
+	psrldq	$7, %xmm6	/* Move the destination block's top 9 bytes down to the bottom.  */
+	palignr	$9, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 9 bytes.  */
+	movaps	%xmm6, (%rdx, %rsi)
+	lea	7(%rsi), %rsi	/* Advance %rsi by the 7 bytes just merged.  */
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave10):	/* Store up to four more 16-byte blocks (palignr shift 10) while %r8 permits, then fall into L(StrncpyExit10).  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2 before palignr overwrites it.  */
+	add	$48, %r8
+	jle	L(StrncpyExit10)	/* Signed result <= 0: go straight to the partial-block exit.  */
+	palignr	$10, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 10 bytes.  */
+	movaps	%xmm3, %xmm1	/* The saved block becomes the low half of the next merge.  */
+	movaps	%xmm2, (%rdx)
+	movaps	22(%rcx), %xmm2	/* Aligned load of the next source block.  */
+	lea	16(%rsi), %rsi	/* %rsi advances by 16 for every block stored.  */
+	movaps	%xmm2, %xmm3
+	sub	$16, %r8
+	jbe	L(StrncpyExit10)	/* Taken when %r8 was at most 16 (unsigned) before the subtraction.  */
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	movaps	22+16(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit10)
+	movaps	%xmm2, %xmm1
+	movaps	%xmm4, 32(%rdx)	/* %xmm4 and %xmm5 are stored as-is; presumably already merged by the caller -- confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit10)
+	movaps	%xmm7, %xmm1	/* %xmm7 supplies the low half for the final merge below.  */
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* Unconditional %r8 -= 16; no flag test follows.  */
+
+L(StrncpyExit10):	/* Merge the upper 6 bytes of %xmm1 into the block at (%rdx, %rsi), keeping that block's top 10 bytes.  */
+	movaps	(%rdx, %rsi), %xmm6
+	psrldq	$6, %xmm6	/* Move the destination block's top 10 bytes down to the bottom.  */
+	palignr	$10, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 10 bytes.  */
+	movaps	%xmm6, (%rdx, %rsi)
+	lea	6(%rsi), %rsi	/* Advance %rsi by the 6 bytes just merged.  */
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave11):	/* Store up to four more 16-byte blocks (palignr shift 11) while %r8 permits, then fall into L(StrncpyExit11).  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2 before palignr overwrites it.  */
+	add	$48, %r8
+	jle	L(StrncpyExit11)	/* Signed result <= 0: go straight to the partial-block exit.  */
+	palignr	$11, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 11 bytes.  */
+	movaps	%xmm3, %xmm1	/* The saved block becomes the low half of the next merge.  */
+	movaps	%xmm2, (%rdx)
+	movaps	21(%rcx), %xmm2	/* Aligned load of the next source block.  */
+	lea	16(%rsi), %rsi	/* %rsi advances by 16 for every block stored.  */
+	movaps	%xmm2, %xmm3
+	sub	$16, %r8
+	jbe	L(StrncpyExit11)	/* Taken when %r8 was at most 16 (unsigned) before the subtraction.  */
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	movaps	21+16(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit11)
+	movaps	%xmm2, %xmm1
+	movaps	%xmm4, 32(%rdx)	/* %xmm4 and %xmm5 are stored as-is; presumably already merged by the caller -- confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit11)
+	movaps	%xmm7, %xmm1	/* %xmm7 supplies the low half for the final merge below.  */
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* Unconditional %r8 -= 16; no flag test follows.  */
+
+L(StrncpyExit11):	/* Merge the upper 5 bytes of %xmm1 into the block at (%rdx, %rsi), keeping that block's top 11 bytes.  */
+	movaps	(%rdx, %rsi), %xmm6
+	psrldq	$5, %xmm6	/* Move the destination block's top 11 bytes down to the bottom.  */
+	palignr	$11, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 11 bytes.  */
+	movaps	%xmm6, (%rdx, %rsi)
+	lea	5(%rsi), %rsi	/* Advance %rsi by the 5 bytes just merged.  */
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave12):	/* Store up to four more 16-byte blocks (palignr shift 12) while %r8 permits, then fall into L(StrncpyExit12).  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2 before palignr overwrites it.  */
+	add	$48, %r8
+	jle	L(StrncpyExit12)	/* Signed result <= 0: go straight to the partial-block exit.  */
+	palignr	$12, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 12 bytes.  */
+	movaps	%xmm3, %xmm1	/* The saved block becomes the low half of the next merge.  */
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2	/* Aligned load of the next source block.  */
+	lea	16(%rsi), %rsi	/* %rsi advances by 16 for every block stored.  */
+	movaps	%xmm2, %xmm3
+	sub	$16, %r8
+	jbe	L(StrncpyExit12)	/* Taken when %r8 was at most 16 (unsigned) before the subtraction.  */
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	movaps	20+16(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit12)
+	movaps	%xmm2, %xmm1
+	movaps	%xmm4, 32(%rdx)	/* %xmm4 and %xmm5 are stored as-is; presumably already merged by the caller -- confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit12)
+	movaps	%xmm7, %xmm1	/* %xmm7 supplies the low half for the final merge below.  */
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* Unconditional %r8 -= 16; no flag test follows.  */
+
+L(StrncpyExit12):	/* Merge the upper 4 bytes of %xmm1 into the block at (%rdx, %rsi), keeping that block's top 12 bytes.  */
+	movaps	(%rdx, %rsi), %xmm6
+	psrldq	$4, %xmm6	/* Move the destination block's top 12 bytes down to the bottom.  */
+	palignr	$12, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 12 bytes.  */
+	movaps	%xmm6, (%rdx, %rsi)
+	lea	4(%rsi), %rsi	/* Advance %rsi by the 4 bytes just merged.  */
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave13):	/* Store up to four more 16-byte blocks (palignr shift 13) while %r8 permits, then fall into L(StrncpyExit13).  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2 before palignr overwrites it.  */
+	add	$48, %r8
+	jle	L(StrncpyExit13)	/* Signed result <= 0: go straight to the partial-block exit.  */
+	palignr	$13, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 13 bytes.  */
+	movaps	%xmm3, %xmm1	/* The saved block becomes the low half of the next merge.  */
+	movaps	%xmm2, (%rdx)
+	movaps	19(%rcx), %xmm2	/* Aligned load of the next source block.  */
+	lea	16(%rsi), %rsi	/* %rsi advances by 16 for every block stored.  */
+	movaps	%xmm2, %xmm3
+	sub	$16, %r8
+	jbe	L(StrncpyExit13)	/* Taken when %r8 was at most 16 (unsigned) before the subtraction.  */
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	movaps	19+16(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit13)
+	movaps	%xmm2, %xmm1
+	movaps	%xmm4, 32(%rdx)	/* %xmm4 and %xmm5 are stored as-is; presumably already merged by the caller -- confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit13)
+	movaps	%xmm7, %xmm1	/* %xmm7 supplies the low half for the final merge below.  */
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* Unconditional %r8 -= 16; no flag test follows.  */
+
+L(StrncpyExit13):	/* Merge the upper 3 bytes of %xmm1 into the block at (%rdx, %rsi), keeping that block's top 13 bytes.  */
+	movaps	(%rdx, %rsi), %xmm6
+	psrldq	$3, %xmm6	/* Move the destination block's top 13 bytes down to the bottom.  */
+	palignr	$13, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 13 bytes.  */
+	movaps	%xmm6, (%rdx, %rsi)
+	lea	3(%rsi), %rsi	/* Advance %rsi by the 3 bytes just merged.  */
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave14):	/* Store up to four more 16-byte blocks (palignr shift 14) while %r8 permits, then fall into L(StrncpyExit14).  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2 before palignr overwrites it.  */
+	add	$48, %r8
+	jle	L(StrncpyExit14)	/* Signed result <= 0: go straight to the partial-block exit.  */
+	palignr	$14, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 14 bytes.  */
+	movaps	%xmm3, %xmm1	/* The saved block becomes the low half of the next merge.  */
+	movaps	%xmm2, (%rdx)
+	movaps	18(%rcx), %xmm2	/* Aligned load of the next source block.  */
+	lea	16(%rsi), %rsi	/* %rsi advances by 16 for every block stored.  */
+	movaps	%xmm2, %xmm3
+	sub	$16, %r8
+	jbe	L(StrncpyExit14)	/* Taken when %r8 was at most 16 (unsigned) before the subtraction.  */
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	movaps	18+16(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit14)
+	movaps	%xmm2, %xmm1
+	movaps	%xmm4, 32(%rdx)	/* %xmm4 and %xmm5 are stored as-is; presumably already merged by the caller -- confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit14)
+	movaps	%xmm7, %xmm1	/* %xmm7 supplies the low half for the final merge below.  */
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* Unconditional %r8 -= 16; no flag test follows.  */
+
+L(StrncpyExit14):	/* Merge the upper 2 bytes of %xmm1 into the block at (%rdx, %rsi), keeping that block's top 14 bytes.  */
+	movaps	(%rdx, %rsi), %xmm6
+	psrldq	$2, %xmm6	/* Move the destination block's top 14 bytes down to the bottom.  */
+	palignr	$14, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 14 bytes.  */
+	movaps	%xmm6, (%rdx, %rsi)
+	lea	2(%rsi), %rsi	/* Advance %rsi by the 2 bytes just merged.  */
+	jmp	L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave15):	/* Store up to four more 16-byte blocks (palignr shift 15) while %r8 permits, then fall into L(StrncpyExit15).  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2 before palignr overwrites it.  */
+	add	$48, %r8
+	jle	L(StrncpyExit15)	/* Signed result <= 0: go straight to the partial-block exit.  */
+	palignr	$15, %xmm1, %xmm2	/* %xmm2 = (%xmm2:%xmm1) >> 15 bytes.  */
+	movaps	%xmm3, %xmm1	/* The saved block becomes the low half of the next merge.  */
+	movaps	%xmm2, (%rdx)
+	movaps	17(%rcx), %xmm2	/* Aligned load of the next source block.  */
+	lea	16(%rsi), %rsi	/* %rsi advances by 16 for every block stored.  */
+	movaps	%xmm2, %xmm3
+	sub	$16, %r8
+	jbe	L(StrncpyExit15)	/* Taken when %r8 was at most 16 (unsigned) before the subtraction.  */
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	movaps	17+16(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit15)
+	movaps	%xmm2, %xmm1
+	movaps	%xmm4, 32(%rdx)	/* %xmm4 and %xmm5 are stored as-is; presumably already merged by the caller -- confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit15)
+	movaps	%xmm7, %xmm1	/* %xmm7 supplies the low half for the final merge below.  */
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* Unconditional %r8 -= 16; no flag test follows.  */
+
+L(StrncpyExit15):	/* Merge the top byte of %xmm1 into byte 0 of the block at (%rdx, %rsi), keeping that block's top 15 bytes.  */
+	movaps	(%rdx, %rsi), %xmm6
+	psrldq	$1, %xmm6	/* Move the destination block's top 15 bytes down to the bottom.  */
+	palignr	$15, %xmm1, %xmm6	/* %xmm6 = (%xmm6:%xmm1) >> 15 bytes.  */
+	movaps	%xmm6, (%rdx, %rsi)
+	lea	1(%rsi), %rsi	/* Advance %rsi by the 1 byte just merged.  */
+	jmp	L(CopyFrom1To16BytesCase3)
+# endif
+
+END (STRCPY)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S
index 02fa8d0710..381060f643 100644
--- a/sysdeps/x86_64/multiarch/strcpy.S
+++ b/sysdeps/x86_64/multiarch/strcpy.S
@@ -1,5 +1,5 @@
-/* strcpy with SSSE3
-   Copyright (C) 2009 Free Software Foundation, Inc.
+/* Multiple versions of strcpy
+   Copyright (C) 2009, 2011 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -29,30 +29,32 @@
 
 #ifdef USE_AS_STPCPY
 # ifdef USE_AS_STRNCPY
-#  define STRCPY_SSSE3	__stpncpy_ssse3
-#  define STRCPY_SSE2	__stpncpy_sse2
-#  define __GI_STRCPY	__GI_stpncpy
+#  define STRCPY_SSSE3		__stpncpy_ssse3
+#  define STRCPY_SSE2		__stpncpy_sse2
+#  define STRCPY_SSE2_UNALIGNED __stpncpy_sse2_unaligned
+#  define __GI_STRCPY		__GI_stpncpy
+#  define __GI___STRCPY		__GI___stpncpy
 # else
-#  define STRCPY_SSSE3	__stpcpy_ssse3
-#  define STRCPY_SSE2	__stpcpy_sse2
-#  define __GI_STRCPY	__GI_stpcpy
-#  define __GI___STRCPY	__GI___stpcpy
+#  define STRCPY_SSSE3		__stpcpy_ssse3
+#  define STRCPY_SSE2		__stpcpy_sse2
+#  define STRCPY_SSE2_UNALIGNED	__stpcpy_sse2_unaligned
+#  define __GI_STRCPY		__GI_stpcpy
+#  define __GI___STRCPY		__GI___stpcpy
 # endif
 #else
 # ifdef USE_AS_STRNCPY
-#  define STRCPY_SSSE3	__strncpy_ssse3
-#  define STRCPY_SSE2	__strncpy_sse2
-#  define __GI_STRCPY	__GI_strncpy
+#  define STRCPY_SSSE3		__strncpy_ssse3
+#  define STRCPY_SSE2		__strncpy_sse2
+#  define STRCPY_SSE2_UNALIGNED	__strncpy_sse2_unaligned
+#  define __GI_STRCPY		__GI_strncpy
 # else
-#  define STRCPY_SSSE3	__strcpy_ssse3
-#  define STRCPY_SSE2	__strcpy_sse2
-#  define __GI_STRCPY	__GI_strcpy
+#  define STRCPY_SSSE3		__strcpy_ssse3
+#  define STRCPY_SSE2		__strcpy_sse2
+#  define STRCPY_SSE2_UNALIGNED	__strcpy_sse2_unaligned
+#  define __GI_STRCPY		__GI_strcpy
 # endif
 #endif
 
-#ifndef LABEL
-#define LABEL(l) L(l)
-#endif
 
 /* Define multiple versions only for the definition in libc.  */
 #ifndef NOT_IN_libc
@@ -62,1830 +64,16 @@ ENTRY(STRCPY)
 	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
 	jne	1f
 	call	__init_cpu_features
-1:	leaq	STRCPY_SSE2(%rip), %rax
+1:	leaq	STRCPY_SSE2_UNALIGNED(%rip), %rax
+	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	jnz	2f
+	leaq	STRCPY_SSE2(%rip), %rax
 	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
 	jz	2f
 	leaq	STRCPY_SSSE3(%rip), %rax
 2:	ret
 END(STRCPY)
 
-	.section .text.ssse3,"ax",@progbits
-STRCPY_SSSE3:
-	cfi_startproc
-	CALL_MCOUNT
-
-/*
- * This implementation uses SSE to copy up to 16 bytes at a time.
- */
-#ifdef USE_AS_STRNCPY
-	test    %rdx, %rdx
-	jz      LABEL(strncpy_exitz)
-	mov     %rdx, %r8
-#else
-	xor	%edx, %edx
-#endif
-	mov	%esi, %ecx
-	and	$0xfffffffffffffff0, %rsi	/*force rsi 16 byte align*/
-	and	$15, %ecx
-	mov	%rdi, %rax			/*store return parameter*/
-
-
-	pxor	%xmm0, %xmm0			/* clear %xmm0 */
-	pcmpeqb	(%rsi), %xmm0			/* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
-	pmovmskb %xmm0, %edx			/* move each byte mask of %xmm0 to edx*/
-	shr	%cl, %edx			/* get real bits left in edx*/
-	test	%edx, %edx			/* edx must be 0 if there is no null char from rsi+%rcx */
-	jnz	LABEL(less16bytes)
-
-#ifdef USE_AS_STRNCPY
-	lea	-16(%r8,%rcx), %r11
-	cmp	$0, %r11
-	jle	LABEL(less16bytes)		/* if r8 + rcx <= 16, branch to less16bytes.  */
-#endif
-
-	mov	%rcx, %r9
-	or	%edi, %ecx
-	and	$15, %ecx
-	lea	-16(%r9), %r10
-	jz	LABEL(ashr_0)			/* ecx must be 0 if offset of rsi and rdi is 16 byte align*/
-
-	neg	%r10				/* store the rest in rsi aligned 16 bytes for unaligned_exit*/
-
-	pxor	%xmm0, %xmm0			/* clear %xmm0, may be polluted by unaligned operation*/
-	pcmpeqb	16(%rsi), %xmm0			/* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(less32bytes)
-	/*
-	* at least 16 byte available to fill destination rdi
-	*/
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(less32bytes_strncpy_truncation)
-#endif
-	mov	(%rsi, %r9), %rdx
-	mov	%rdx, (%rdi)
-	mov	8(%rsi, %r9), %rdx
-	mov	%rdx, 8(%rdi)
-
-	/*
-	* so far destatination rdi may be aligned by 16, re-calculate rsi to jump
-	* crossponding case
-	* rcx is offset of rsi
-	* rax is offset of rdi
-	*/
-
-	and	$0xfffffffffffffff0, %rdi	/* force rdi 16 byte align */
-	mov	%rax, %rdx			/* rax store orignal rdi */
-	xor	%rdi, %rdx			/* equal to and $15, %rdx */
-#ifdef USE_AS_STRNCPY
-	add     %rdx, %r8
-#endif
-
-	add	$16, %rdi			/* next 16 bytes for rdi */
-	sub	%rdx, %r9
-
-	lea	16(%r9, %rsi), %rsi		/*re-calculate rsi by (16 - rdx)+ rcx */
-	mov	%esi, %ecx			/*store offset of rsi */
-	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */
-
-	and	$15, %ecx			/* ecx must be 0 if rdx is equal to rcx*/
-	jz	LABEL(ashr_0)
-
-	lea	-16(%rcx), %r10
-	mov	%rcx, %r9
-	neg	%r10
-	lea	LABEL(unaligned_table)(%rip), %r11
-	movslq  (%r11, %rcx,4), %rcx
-	lea	(%r11, %rcx), %rcx
-	jmp	*%rcx
-
- /*
- * The following cases will be handled by ashr_0 & ashr_0_start
- *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
- *	0		    0		  0		 ashr_0
- *	n(1~15)	     n(1~15)	   0		 ashr_0_start
- *
- */
-	.p2align 5
-LABEL(ashr_0):
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_aligned)
-#endif
-	movdqa  (%rsi), %xmm1	   /* fetch first 16 bytes from rsi */
-	movdqa  %xmm1, (%rdi)	   /* store first 16 bytes into rdi */
-	add     $16, %rsi
-	add     $16, %rdi
-	pcmpeqb  (%rsi), %xmm0		   /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */
-	pmovmskb  %xmm0, %edx		   /* move each byte mask of %xmm0 to edx*/
-
-	test    %edx, %edx		  /* edx must be 0 if there is no null char in rsi*/
-	jnz	LABEL(aligned_16bytes)
-
-LABEL(ashr_0_loop):
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_aligned)
-#endif
-	movdqa  (%rsi, %rcx), %xmm1
-	movdqa  %xmm1, (%rdi, %rcx)
-	add	$16, %rcx
-	pcmpeqb  (%rsi, %rcx), %xmm0
-	pmovmskb  %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(aligned_exit)
-
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_aligned)
-#endif
-	movdqa  (%rsi, %rcx), %xmm1
-	movdqa  %xmm1, (%rdi, %rcx)
-	add	$16, %rcx
-	pcmpeqb  (%rsi, %rcx), %xmm0
-	pmovmskb  %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(aligned_exit)
-
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_aligned)
-#endif
-	movdqa  (%rsi, %rcx), %xmm1
-	movdqa  %xmm1, (%rdi, %rcx)
-	add	$16, %rcx
-	pcmpeqb  (%rsi, %rcx), %xmm0
-	pmovmskb  %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(aligned_exit)
-
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_aligned)
-#endif
-	movdqa  (%rsi, %rcx), %xmm1
-	movdqa  %xmm1, (%rdi, %rcx)
-	add	$16, %rcx
-	pcmpeqb  (%rsi, %rcx), %xmm0
-	pmovmskb  %xmm0, %edx
-	test	%edx, %edx
-	jz	LABEL(ashr_0_loop)
-
-	jmp	LABEL(aligned_exit)
-        .p2align 4
-
-/*
- * The following cases will be handled by ashr_15
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(15)		n - 15		15((16 - (n -15) + n)%16	 ashr_15
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_15):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_15_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $15, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $15, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_15_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_14
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(14~15)		n - 14		14((16 - (n -14) + n)%16	 ashr_14
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_14):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_14_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $14, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $14, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_14_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_13
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(13~15)		n - 13		13((16 - (n -13) + n)%16	 ashr_13
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_13):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_13_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $13, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $13, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_13_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_12
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(12~15)		n - 12		12((16 - (n -12) + n)%16	 ashr_12
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_12):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_12_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $12, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $12, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_12_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_11
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(11~15)		n - 11		11((16 - (n -11) + n)%16	 ashr_11
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_11):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_11_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $11, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $11, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_11_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_10
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(10~15)		n - 10		10((16 - (n -10) + n)%16	 ashr_10
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_10):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_10_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $10, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $10, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_10_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_9
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(9~15)		n - 9		9((16 - (n -9) + n)%16	 ashr_9
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_9):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_9_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $9, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $9, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_9_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_8
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(8~15)		n - 8		8((16 - (n -8) + n)%16	 ashr_8
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_8):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_8_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $8, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $8, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_8_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_7
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(7~15)		n - 7		7((16 - (n -7) + n)%16	 ashr_7
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_7):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	.p2align 4
-
-LABEL(ashr_7_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $7, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $7, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_7_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_6
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(6~15)		n - 6		6((16 - (n -6) + n)%16	 ashr_6
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_6):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_6_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $6, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $6, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_6_use_ssse3)
-
- /*
- * The following cases will be handled by ashr_5
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(5~15)		n - 5		5((16 - (n -5) + n)%16	 ashr_5
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_5):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_5_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $5, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $5, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_5_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_4
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(4~15)		n - 4		4((16 - (n -4) + n)%16	 ashr_4
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_4):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_4_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $4, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $4, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_4_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_3
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(3~15)		n - 3		3((16 - (n -3) + n)%16	 ashr_3
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_3):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_3_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $3, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $3, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_3_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_2
- *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
- *      n(2~15)		n - 2		2((16 - (n -2) + n)%16	 ashr_2
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_2):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_2_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $2, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $2, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_2_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_1
- *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  		corresponding case
- *	n(1~15)		n - 1	   	1 ((16 - (n -1) + n)%16	 ashr_1
- *
- * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
-	.p2align 4
-LABEL(ashr_1):
-	xor	%ecx, %ecx				/*clear ecx */
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	.p2align 4
-LABEL(ashr_1_use_ssse3):
-	movdqa	16(%rsi, %rcx), %xmm3
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-
-	palignr $1, (%rsi, %rcx), %xmm3
-	movdqa	%xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-
-	movdqa  16(%rsi, %rcx), %xmm3
-	pcmpeqb %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	LABEL(strncpy_truncation_unaligned)
-#endif
-	palignr $1, (%rsi, %rcx), %xmm3
-	movdqa  %xmm3, (%rdi, %rcx)
-	add	$16, %rcx
-
-#ifdef USE_AS_STRNCPY
-	cmp	%r10, %r8
-	jbe	LABEL(unaligned_exit)
-#endif
-	jmp	LABEL(ashr_1_use_ssse3)
-
-	.p2align 4
-LABEL(less32bytes):
-	xor	%ecx, %ecx
-LABEL(unaligned_exit):
-	add	%r9, %rsi		/* r9 stores original offset of rsi*/
-	mov	%rcx, %r9
-	mov	%r10, %rcx
-	shl	%cl, %edx		/* after shl, calculate the exact number to be filled*/
-	mov	%r9, %rcx
-	.p2align 4
-LABEL(aligned_exit):
-	add	%rcx, %rdi		/*locate exact address for rdi */
-LABEL(less16bytes):
-	add	%rcx, %rsi		/*locate exact address for rsi */
-LABEL(aligned_16bytes):
-#ifdef USE_AS_STRNCPY
-	mov     $1, %r9d
-	lea     -1(%r8), %rcx
-	shl     %cl, %r9d
-	cmp     $32, %r8
-	ja      LABEL(strncpy_tail)
-	or      %r9d, %edx
-LABEL(strncpy_tail):
-#endif
-	bsf	%rdx, %rcx		/*If a least significant 1 bit in %rdx is found, its bit index is stored in %rcx*/
-	lea	LABEL(tail_table)(%rip), %r11
-	movslq	(%r11, %rcx,4), %rcx
-	lea	(%r11, %rcx), %rcx
-	jmp	*%rcx
-
-#ifdef USE_AS_STRNCPY
-	.p2align 4
-LABEL(less32bytes_strncpy_truncation):
-	xor     %ecx, %ecx
-LABEL(strncpy_truncation_unaligned):
-	add      %r9, %rsi
-LABEL(strncpy_truncation_aligned):
-	add      %rcx, %rdi
-	add      %rcx, %rsi
-	add     $16, %r8
-	lea     -1(%r8), %rcx
-	lea     LABEL(tail_table)(%rip), %r11
-	movslq  (%r11, %rcx,4), %rcx
-	lea     (%r11, %rcx), %rcx
-	jmp     *%rcx
-	.p2align 4
-LABEL(strncpy_exitz):
-	mov     %rdi, %rax
-	ret
-#endif
-
-#ifdef USE_AS_STRNCPY
-	.p2align 4
-LABEL(strncpy_fill_tail):
-	mov	%rax, %rdx
-	movzx	%cl, %rax
-	mov	%r8, %rcx
-	add	%rax, %rdi
-	xor	%eax, %eax
-	shr	$3, %ecx
-	jz	LABEL(strncpy_fill_less_8)
-
-	rep	stosq
-LABEL(strncpy_fill_less_8):
-	mov	%r8, %rcx
-	and	$7, %ecx
-	jz	LABEL(strncpy_fill_return)
-LABEL(strncpy_fill_less_7):
-	sub	$1, %ecx
-	mov	%al, (%rdi, %rcx)
-	jnz	LABEL(strncpy_fill_less_7)
-LABEL(strncpy_fill_return):
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rdx)
-	sbb	$-1, %rdx
-#endif
-	mov	%rdx, %rax
-	ret
-#endif
-	.p2align 4
-LABEL(tail_0):
-	mov	(%rsi), %cl
-	mov	%cl, (%rdi)
-#ifdef USE_AS_STPCPY
-	mov	%rdi, %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$1, %cl
-	sub	$1, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_1):
-	mov	(%rsi), %cx
-	mov	%cx, (%rdi)
-#ifdef USE_AS_STPCPY
-	lea	1(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$2, %cl
-	sub	$2, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_2):
-	mov	(%rsi), %cx
-	mov	%cx, (%rdi)
-	mov	1(%rsi), %cx
-	mov	%cx, 1(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	2(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$3, %cl
-	sub	$3, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_3):
-	mov	(%rsi), %ecx
-	mov	%ecx, (%rdi)
-#ifdef USE_AS_STPCPY
-	lea	3(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$4, %cl
-	sub	$4, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_4):
-	mov	(%rsi), %ecx
-	mov	%ecx, (%rdi)
-	mov	1(%rsi), %edx
-	mov	%edx, 1(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	4(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$5, %cl
-	sub	$5, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_5):
-	mov	(%rsi), %ecx
-	mov	%ecx, (%rdi)
-	mov	2(%rsi), %edx
-	mov	%edx, 2(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	5(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$6, %cl
-	sub	$6, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_6):
-	mov	(%rsi), %ecx
-	mov	%ecx, (%rdi)
-	mov	3(%rsi), %edx
-	mov	%edx,3(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	6(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$7, %cl
-	sub	$7, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_7):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-#ifdef USE_AS_STPCPY
-	lea	7(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$8, %cl
-	sub	$8, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_8):
-
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	5(%rsi), %edx
-	mov	%edx, 5(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	8(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$9, %cl
-	sub	$9, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_9):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	6(%rsi), %edx
-	mov	%edx, 6(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	9(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$10, %cl
-	sub	$10, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_10):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	7(%rsi), %edx
-	mov	%edx, 7(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	10(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$11, %cl
-	sub	$11, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_11):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %edx
-	mov	%edx, 8(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	11(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$12, %cl
-	sub	$12, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_12):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	5(%rsi), %rcx
-	mov	%rcx, 5(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	12(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$13, %cl
-	sub	$13, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_13):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	6(%rsi), %rcx
-	mov	%rcx, 6(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	13(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$14, %cl
-	sub	$14, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_14):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	7(%rsi), %rcx
-	mov	%rcx, 7(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	14(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$15, %cl
-	sub	$15, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-LABEL(tail_15):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	15(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$16, %cl
-	sub	$16, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-
-	ret
-
-	.p2align 4
-LABEL(tail_16):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %cl
-	mov	%cl, 16(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	16(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$17, %cl
-	sub	$17, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_17):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %cx
-	mov	%cx, 16(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	17(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$18, %cl
-	sub	$18, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_18):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	15(%rsi), %ecx
-	mov	%ecx,15(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	18(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$19, %cl
-	sub	$19, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_19):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %ecx
-	mov	%ecx, 16(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	19(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$20, %cl
-	sub	$20, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_20):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	13(%rsi), %rcx
-	mov	%rcx, 13(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	20(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$21, %cl
-	sub	$21, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_21):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	14(%rsi), %rcx
-	mov	%rcx, 14(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	21(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$22, %cl
-	sub	$22, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_22):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	15(%rsi), %rcx
-	mov	%rcx, 15(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	22(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$23, %cl
-	sub	$23, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_23):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %rcx
-	mov	%rcx, 16(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	23(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$24, %cl
-	sub	$24, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-
-	ret
-
-	.p2align 4
-LABEL(tail_24):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %rcx
-	mov	%rcx, 16(%rdi)
-	mov	21(%rsi), %edx
-	mov	%edx, 21(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	24(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$25, %cl
-	sub	$25, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_25):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %rcx
-	mov	%rcx, 16(%rdi)
-	mov	22(%rsi), %edx
-	mov	%edx, 22(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	25(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$26, %cl
-	sub	$26, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_26):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %rcx
-	mov	%rcx, 16(%rdi)
-	mov	23(%rsi), %edx
-	mov	%edx, 23(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	26(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$27, %cl
-	sub	$27, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_27):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %rcx
-	mov	%rcx, 16(%rdi)
-	mov	24(%rsi), %edx
-	mov	%edx, 24(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	27(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$28, %cl
-	sub	$28, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	.p2align 4
-LABEL(tail_28):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %rcx
-	mov	%rcx, 16(%rdi)
-	mov	21(%rsi), %rdx
-	mov	%rdx, 21(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	28(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$29, %cl
-	sub	$29, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-
-	ret
-
-	.p2align 4
-LABEL(tail_29):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %rcx
-	mov	%rcx, 16(%rdi)
-	mov	22(%rsi), %rdx
-	mov	%rdx, 22(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	29(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$30, %cl
-	sub	$30, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-
-	ret
-
-
-	.p2align 4
-LABEL(tail_30):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %rcx
-	mov	%rcx, 16(%rdi)
-	mov	23(%rsi), %rdx
-	mov	%rdx, 23(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	30(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$31, %cl
-	sub	$31, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-
-	.p2align 4
-LABEL(tail_31):
-	mov	(%rsi), %rcx
-	mov	%rcx, (%rdi)
-	mov	8(%rsi), %rdx
-	mov	%rdx, 8(%rdi)
-	mov	16(%rsi), %rcx
-	mov	%rcx, 16(%rdi)
-	mov	24(%rsi), %rdx
-	mov	%rdx, 24(%rdi)
-#ifdef USE_AS_STPCPY
-	lea	31(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
-	mov	$32, %cl
-	sub	$32, %r8
-	jnz	LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#endif
-#endif
-	ret
-	cfi_endproc
-	.size	STRCPY_SSSE3, .-STRCPY_SSSE3
-
-	.p2align 4
-	.section .rodata.ssse3,"a",@progbits
-LABEL(tail_table):
-	.int	LABEL(tail_0) - LABEL(tail_table)
-	.int	LABEL(tail_1) - LABEL(tail_table)
-	.int	LABEL(tail_2) - LABEL(tail_table)
-	.int	LABEL(tail_3) - LABEL(tail_table)
-	.int	LABEL(tail_4) - LABEL(tail_table)
-	.int	LABEL(tail_5) - LABEL(tail_table)
-	.int	LABEL(tail_6) - LABEL(tail_table)
-	.int	LABEL(tail_7) - LABEL(tail_table)
-	.int	LABEL(tail_8) - LABEL(tail_table)
-	.int	LABEL(tail_9) - LABEL(tail_table)
-	.int	LABEL(tail_10) - LABEL(tail_table)
-	.int	LABEL(tail_11) - LABEL(tail_table)
-	.int	LABEL(tail_12) - LABEL(tail_table)
-	.int	LABEL(tail_13) - LABEL(tail_table)
-	.int	LABEL(tail_14) - LABEL(tail_table)
-	.int	LABEL(tail_15) - LABEL(tail_table)
-	.int	LABEL(tail_16) - LABEL(tail_table)
-	.int	LABEL(tail_17) - LABEL(tail_table)
-	.int	LABEL(tail_18) - LABEL(tail_table)
-	.int	LABEL(tail_19) - LABEL(tail_table)
-	.int	LABEL(tail_20) - LABEL(tail_table)
-	.int	LABEL(tail_21) - LABEL(tail_table)
-	.int	LABEL(tail_22) - LABEL(tail_table)
-	.int	LABEL(tail_23) - LABEL(tail_table)
-	.int	LABEL(tail_24) - LABEL(tail_table)
-	.int	LABEL(tail_25) - LABEL(tail_table)
-	.int	LABEL(tail_26) - LABEL(tail_table)
-	.int	LABEL(tail_27) - LABEL(tail_table)
-	.int	LABEL(tail_28) - LABEL(tail_table)
-	.int	LABEL(tail_29) - LABEL(tail_table)
-	.int	LABEL(tail_30) - LABEL(tail_table)
-	.int	LABEL(tail_31) - LABEL(tail_table)
-
-	.p2align 4
-LABEL(unaligned_table):
-	.int	LABEL(ashr_0) - LABEL(unaligned_table)
-	.int	LABEL(ashr_1) - LABEL(unaligned_table)
-	.int	LABEL(ashr_2) - LABEL(unaligned_table)
-	.int	LABEL(ashr_3) - LABEL(unaligned_table)
-	.int	LABEL(ashr_4) - LABEL(unaligned_table)
-	.int	LABEL(ashr_5) - LABEL(unaligned_table)
-	.int	LABEL(ashr_6) - LABEL(unaligned_table)
-	.int	LABEL(ashr_7) - LABEL(unaligned_table)
-	.int	LABEL(ashr_8) - LABEL(unaligned_table)
-	.int	LABEL(ashr_9) - LABEL(unaligned_table)
-	.int	LABEL(ashr_10) - LABEL(unaligned_table)
-	.int	LABEL(ashr_11) - LABEL(unaligned_table)
-	.int	LABEL(ashr_12) - LABEL(unaligned_table)
-	.int	LABEL(ashr_13) - LABEL(unaligned_table)
-	.int	LABEL(ashr_14) - LABEL(unaligned_table)
-	.int	LABEL(ashr_15) - LABEL(unaligned_table)
-
 # undef ENTRY
 # define ENTRY(name) \
 	.type STRCPY_SSE2, @function; \
diff --git a/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
new file mode 100644
index 0000000000..fcc23a754a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_sse2_unaligned
+#include "strcpy-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
new file mode 100644
index 0000000000..bf82ee447d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_ssse3
+#include "strcpy-ssse3.S"