about summary refs log tree commit diff
path: root/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S
diff options
context:
space:
mode:
author Zack Weinberg <zackw@panix.com> 2017-06-08 15:39:03 -0400
committer Zack Weinberg <zackw@panix.com> 2017-06-08 15:39:03 -0400
commit 5046dbb4a7eba5eccfd258f92f4735c9ffc8d069 (patch)
tree 4470480d904b65cf14ca524f96f79eca818c3eaf /REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S
parent 199fc19d3aaaf57944ef036e15904febe877fc93 (diff)
download glibc-5046dbb4a7eba5eccfd258f92f4735c9ffc8d069.tar.gz
glibc-5046dbb4a7eba5eccfd258f92f4735c9ffc8d069.tar.xz
glibc-5046dbb4a7eba5eccfd258f92f4735c9ffc8d069.zip
Prepare for radical source tree reorganization. zack/build-layout-experiment
All top-level files and directories are moved into a temporary storage
directory, REORG.TODO, except for files that will certainly still
exist in their current form at top level when we're done (COPYING,
COPYING.LIB, LICENSES, NEWS, README), all old ChangeLog files (which
are moved to the new directory OldChangeLogs, instead), and the
generated file INSTALL (which is just deleted; in the new order, there
will be no generated files checked into version control).
Diffstat (limited to 'REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S')
-rw-r--r-- REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S 3551
1 files changed, 3551 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S
new file mode 100644
index 0000000000..47aaeae671
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S
@@ -0,0 +1,3551 @@
+/* strcpy with SSSE3
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+#  include <sysdep.h>
+
+#  ifndef STRCPY
+#   define STRCPY  __strcpy_ssse3
+#  endif
+
+	.section .text.ssse3,"ax",@progbits
+ENTRY (STRCPY)	/* Register roles set up below: rcx = source, rdx = destination, r8 = length limit (USE_AS_STRNCPY only). */
+
+	mov	%rsi, %rcx	/* rcx = source pointer (2nd argument) */
+#  ifdef USE_AS_STRNCPY
+	mov	%rdx, %r8	/* r8 = length limit (3rd argument); saved before rdx is overwritten below */
+#  endif
+	mov	%rdi, %rdx	/* rdx = destination pointer (1st argument) */
+#  ifdef USE_AS_STRNCPY
+	test	%r8, %r8
+	jz	L(Exit0)	/* length limit is 0 */
+	cmp	$8, %r8
+	jbe	L(StrncpyExit8Bytes)	/* length limit is 1..8 */
+# endif
+	cmpb	$0, (%rcx)	/* Probe source bytes 0..15 one at a time; a NUL at offset k branches to L(Exit<k+1>) (defined outside this view). */
+	jz	L(Exit1)
+	cmpb	$0, 1(%rcx)
+	jz	L(Exit2)
+	cmpb	$0, 2(%rcx)
+	jz	L(Exit3)
+	cmpb	$0, 3(%rcx)
+	jz	L(Exit4)
+	cmpb	$0, 4(%rcx)
+	jz	L(Exit5)
+	cmpb	$0, 5(%rcx)
+	jz	L(Exit6)
+	cmpb	$0, 6(%rcx)
+	jz	L(Exit7)
+	cmpb	$0, 7(%rcx)
+	jz	L(Exit8)
+# ifdef USE_AS_STRNCPY
+	cmp	$16, %r8
+	jb	L(StrncpyExit15Bytes)	/* length limit is 9..15 and bytes 0..7 hold no NUL */
+# endif
+	cmpb	$0, 8(%rcx)
+	jz	L(Exit9)
+	cmpb	$0, 9(%rcx)
+	jz	L(Exit10)
+	cmpb	$0, 10(%rcx)
+	jz	L(Exit11)
+	cmpb	$0, 11(%rcx)
+	jz	L(Exit12)
+	cmpb	$0, 12(%rcx)
+	jz	L(Exit13)
+	cmpb	$0, 13(%rcx)
+	jz	L(Exit14)
+	cmpb	$0, 14(%rcx)
+	jz	L(Exit15)
+# ifdef USE_AS_STRNCPY
+	cmp	$16, %r8
+	je	L(Exit16)	/* length limit is exactly 16: byte 15 is not tested, go straight to L(Exit16) */
+# endif
+	cmpb	$0, 15(%rcx)
+	jz	L(Exit16)
+# endif
+
+# ifdef USE_AS_STRNCPY
+	mov	%rcx, %rsi
+	sub	$16, %r8
+	and	$0xf, %rsi	/* rsi = rcx & 15 (misalignment of the source pointer) */
+
+/* add 16 bytes rcx_offset to r8 */
+
+	add	%rsi, %r8	/* net effect of this group: r8 = r8 - 16 + (rcx & 15) */
+# endif
+	lea	16(%rcx), %rsi
+	and	$-16, %rsi	/* rsi = first 16-byte-aligned address strictly above rcx */
+	pxor	%xmm0, %xmm0	/* xmm0 = 0; used as the NUL pattern for pcmpeqb */
+	mov	(%rcx), %r9
+	mov	%r9, (%rdx)	/* copy source bytes 0..7 to the destination */
+	pcmpeqb	(%rsi), %xmm0	/* xmm0 byte = 0xff wherever the aligned 16-byte block at rsi holds a NUL */
+	mov	8(%rcx), %r9
+	mov	%r9, 8(%rdx)	/* copy source bytes 8..15 to the destination */
+
+/* convert byte mask in xmm0 to bit mask */
+
+	pmovmskb %xmm0, %rax
+	sub	%rcx, %rsi	/* rsi = distance (1..16) from rcx to the aligned block just tested */
+
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)	/* taken when r8 was <= 16 before the subtraction */
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)	/* NUL found in the aligned block; target is defined outside this view */
+
+	mov	%rdx, %rax
+	lea	16(%rdx), %rdx
+	and	$-16, %rdx	/* rdx = first 16-byte-aligned address strictly above the old rdx */
+	sub	%rdx, %rax	/* rax = old rdx - new rdx, i.e. minus the number of bytes rdx advanced (-16..-1) */
+
+# ifdef USE_AS_STRNCPY
+	add	%rax, %rsi
+	lea	-1(%rsi), %rsi
+	and	$1<<31, %esi	/* keep only bit 31 of (rsi + rax - 1); the 32-bit write also clears the upper half of rsi */
+	test	%rsi, %rsi
+	jnz	L(ContinueCopy)
+	lea	16(%r8), %r8	/* bit 31 clear: add 16 back to r8 */
+
+L(ContinueCopy):
+# endif
+	sub	%rax, %rcx	/* rax is negative, so this advances rcx by the same byte count as rdx */
+	mov	%rcx, %rax
+	and	$0xf, %rax	/* rax = rcx & 15: source misalignment relative to the now-aligned destination; sets ZF for the jz below */
+	mov	$0, %rsi	/* rsi = 0 via mov (not xor) because mov leaves the flags from the and above intact */
+
+/* case: rcx_offset == rdx_offset */
+
+	jz	L(Align16Both)
+
+	cmp	$8, %rax	/* dispatch on rax = 1..15 to the matching L(ShlN) copy routine */
+	jae	L(ShlHigh8)
+	cmp	$1, %rax
+	je	L(Shl1)
+	cmp	$2, %rax
+	je	L(Shl2)
+	cmp	$3, %rax
+	je	L(Shl3)
+	cmp	$4, %rax
+	je	L(Shl4)
+	cmp	$5, %rax
+	je	L(Shl5)
+	cmp	$6, %rax
+	je	L(Shl6)
+	jmp	L(Shl7)
+
+L(ShlHigh8):
+	je	L(Shl8)	/* still using the flags of the cmp $8 that preceded the jae */
+	cmp	$9, %rax
+	je	L(Shl9)
+	cmp	$10, %rax
+	je	L(Shl10)
+	cmp	$11, %rax
+	je	L(Shl11)
+	cmp	$12, %rax
+	je	L(Shl12)
+	cmp	$13, %rax
+	je	L(Shl13)
+	cmp	$14, %rax
+	je	L(Shl14)
+	jmp	L(Shl15)
+
+L(Align16Both):	/* rcx and rdx are both 16-byte aligned here and rsi = 0: copy aligned 16-byte blocks, testing each following block for NUL before it is stored. */
+	movaps	(%rcx), %xmm1
+	movaps	16(%rcx), %xmm2
+	movaps	%xmm1, (%rdx)
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 is still all-zero (no NUL matched so far); it becomes the NUL byte mask of xmm2 */
+	pmovmskb %xmm0, %rax	/* rax = bit mask of NUL positions in the block just loaded */
+	lea	16(%rsi), %rsi	/* rsi = offset of the block that rax describes */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)	/* taken when r8 was <= 16 before the subtraction */
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)	/* NUL in the block at offset rsi; target is defined outside this view */
+
+	movaps	16(%rcx, %rsi), %xmm3	/* same step repeated, rotating through xmm3, xmm4, xmm1, xmm2, xmm3 */
+	movaps	%xmm2, (%rdx, %rsi)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm4
+	movaps	%xmm3, (%rdx, %rsi)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm1
+	movaps	%xmm4, (%rdx, %rsi)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm2
+	movaps	%xmm1, (%rdx, %rsi)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm3
+	movaps	%xmm2, (%rdx, %rsi)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%rdx, %rsi)	/* store the last block tested above (rsi = 96 here) */
+	mov	%rcx, %rax
+	lea	16(%rcx, %rsi), %rcx
+	and	$-0x40, %rcx	/* rcx = address of the next untested block, rounded down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = old rcx - new rcx */
+	sub	%rax, %rdx	/* shift rdx by the same distance so it still corresponds to rcx */
+# ifdef USE_AS_STRNCPY
+	lea	112(%r8, %rax), %r8	/* r8 = r8 + 112 + (old rcx - new rcx) */
+# endif
+	mov	$-0x40, %rsi	/* rsi = -64; L(Aligned64Leave) starts from this offset */
+
+	.p2align 4
+L(Aligned64Loop):	/* Main loop for equally aligned source/destination: test 64 source bytes for NUL per iteration and store them only if none is found. */
+	movaps	(%rcx), %xmm2
+	movaps	%xmm2, %xmm4	/* xmm4..xmm7 keep the four raw 16-byte blocks; xmm2/xmm3 are consumed by pminub */
+	movaps	16(%rcx), %xmm5
+	movaps	32(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%rcx), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3	/* xmm3 = bytewise unsigned minimum of all four blocks; it has a zero byte iff some block has one */
+	pcmpeqb	%xmm0, %xmm3	/* xmm0 is all-zero here */
+	pmovmskb %xmm3, %rax	/* rax != 0 iff a NUL lies somewhere in these 64 bytes */
+	lea	64(%rdx), %rdx	/* pointers advance before the stores, hence the negative store offsets below */
+	lea	64(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeaveCase2OrCase3)	/* taken when r8 was <= 64 before the subtraction */
+# endif
+	test	%rax, %rax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%rdx)
+	movaps	%xmm5, -48(%rdx)
+	movaps	%xmm6, -32(%rdx)
+	movaps	%xmm7, -16(%rdx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):	/* A NUL lies in the 64 bytes held in xmm4..xmm7 (rcx/rdx already advanced past them, rsi = -64): find which 16-byte block holds it, storing the NUL-free blocks on the way. */
+# ifdef USE_AS_STRNCPY
+	lea	48(%r8), %r8	/* add 48 back to r8; each later step below subtracts 16 again */
+# endif
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %rax
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)	/* NUL in the first block (offset rsi = -64); target is defined outside this view */
+
+	pcmpeqb	%xmm5, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%r8), %r8
+# endif
+	pmovmskb %xmm0, %rax
+	movaps	%xmm4, -64(%rdx)	/* first block is NUL-free, store it; movaps leaves flags untouched */
+	test	%rax, %rax
+	lea	16(%rsi), %rsi	/* lea does not modify flags, so the test result still reaches the jnz */
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%r8), %r8
+# endif
+	pmovmskb %xmm0, %rax
+	movaps	%xmm5, -48(%rdx)
+	test	%rax, %rax
+	lea	16(%rsi), %rsi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%rdx)
+	pcmpeqb	%xmm7, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%r8), %r8
+# endif
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+	jmp	L(CopyFrom1To16Bytes)	/* unconditional: the first three blocks were NUL-free, so the NUL is in xmm7 */
+
+	.p2align 4
+L(Shl1):	/* Reached from the dispatch with (rcx & 15) == 1 and rdx 16-byte aligned: read aligned source blocks and realign them by 1 byte with palignr. */
+	movaps	-1(%rcx), %xmm1	/* aligned block that contains the byte at rcx */
+	movaps	15(%rcx), %xmm2	/* next aligned source block */
+L(Shl1Start):	/* also re-entered from L(Shl1LoopStart) with xmm1/xmm2 already loaded */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 is all-zero on entry; it becomes the NUL byte mask of xmm2 */
+	pmovmskb %xmm0, %rax	/* rax = bit mask of NUL positions in xmm2 */
+	movaps	%xmm2, %xmm3	/* keep the raw block; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit1Case2OrCase3)	/* taken when r8 was <= 16 before the subtraction */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2	/* xmm2 = (xmm2:xmm1) shifted right 1 byte = the 16 source bytes starting at rcx */
+	movaps	%xmm2, (%rdx)
+	movaps	31(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* xmm1 and xmm3 alternate as the holder of the previous raw block */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	31(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	31(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	31(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back by the rounding */
+	lea	-15(%rcx), %rcx	/* re-bias rcx so that 15(%rcx) is the 64-byte-aligned block */
+	sub	%rax, %rdx	/* step rdx back by the same amount so it still corresponds to rcx */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and give the stepped-back bytes back to r8 */
+# endif
+	movaps	-1(%rcx), %xmm1
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl1LoopStart):	/* Invariant: 15(%rcx) is 64-byte aligned and xmm1 holds the aligned block at -1(%rcx). */
+	movaps	15(%rcx), %xmm2
+	movaps	31(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	47(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	63(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = bytewise unsigned minimum of the four blocks; zero byte iff a NUL is present */
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* save the raw last block; it becomes xmm1 for the next iteration */
+	palignr	$1, %xmm4, %xmm5
+	test	%rax, %rax	/* palignr does not modify flags, so this result is still valid at the jnz */
+	palignr	$1, %xmm3, %xmm4
+	jnz	L(Shl1Start)	/* NUL in these 64 bytes: redo them 16 at a time (xmm1 and xmm2 are still unshifted) */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave1)	/* taken when r8 was <= 64 before the subtraction */
+# endif
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl1LoopStart)
+
+L(Shl1LoopExit):	/* NUL found in the block at 15(%rcx): copy the 16 bytes before it, then hand over to L(CopyFrom1To16Bytes) (defined outside this view). */
+	movdqu	-1(%rcx), %xmm1
+	mov	$15, %rsi	/* rsi = offset from rcx of the block that rax describes; presumably consumed by the jump target */
+	movdqu	%xmm1, -1(%rdx)	/* unaligned store: rdx is 16-byte aligned, so rdx-1 is not */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl2):	/* Reached from the dispatch with (rcx & 15) == 2 and rdx 16-byte aligned: read aligned source blocks and realign them by 2 bytes with palignr. */
+	movaps	-2(%rcx), %xmm1	/* aligned block that contains the byte at rcx */
+	movaps	14(%rcx), %xmm2	/* next aligned source block */
+L(Shl2Start):	/* also re-entered from L(Shl2LoopStart) with xmm1/xmm2 already loaded */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 is all-zero on entry; it becomes the NUL byte mask of xmm2 */
+	pmovmskb %xmm0, %rax	/* rax = bit mask of NUL positions in xmm2 */
+	movaps	%xmm2, %xmm3	/* keep the raw block; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit2Case2OrCase3)	/* taken when r8 was <= 16 before the subtraction */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2	/* xmm2 = (xmm2:xmm1) shifted right 2 bytes = the 16 source bytes starting at rcx */
+	movaps	%xmm2, (%rdx)
+	movaps	30(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* xmm1 and xmm3 alternate as the holder of the previous raw block */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	30(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	30(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	30(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back by the rounding */
+	lea	-14(%rcx), %rcx	/* re-bias rcx so that 14(%rcx) is the 64-byte-aligned block */
+	sub	%rax, %rdx	/* step rdx back by the same amount so it still corresponds to rcx */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and give the stepped-back bytes back to r8 */
+# endif
+	movaps	-2(%rcx), %xmm1
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl2LoopStart):	/* Invariant: 14(%rcx) is 64-byte aligned and xmm1 holds the aligned block at -2(%rcx). */
+	movaps	14(%rcx), %xmm2
+	movaps	30(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	46(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	62(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = bytewise unsigned minimum of the four blocks; zero byte iff a NUL is present */
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* save the raw last block; it becomes xmm1 for the next iteration */
+	palignr	$2, %xmm4, %xmm5
+	test	%rax, %rax	/* palignr does not modify flags, so this result is still valid at the jnz */
+	palignr	$2, %xmm3, %xmm4
+	jnz	L(Shl2Start)	/* NUL in these 64 bytes: redo them 16 at a time (xmm1 and xmm2 are still unshifted) */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave2)	/* taken when r8 was <= 64 before the subtraction */
+# endif
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl2LoopStart)
+
+L(Shl2LoopExit):	/* NUL found in the block at 14(%rcx): copy the 16 bytes before it, then hand over to L(CopyFrom1To16Bytes) (defined outside this view). */
+	movdqu	-2(%rcx), %xmm1
+	mov	$14, %rsi	/* rsi = offset from rcx of the block that rax describes; presumably consumed by the jump target */
+	movdqu	%xmm1, -2(%rdx)	/* unaligned store: rdx is 16-byte aligned, so rdx-2 is not */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl3):	/* Reached from the dispatch with (rcx & 15) == 3 and rdx 16-byte aligned: read aligned source blocks and realign them by 3 bytes with palignr. */
+	movaps	-3(%rcx), %xmm1	/* aligned block that contains the byte at rcx */
+	movaps	13(%rcx), %xmm2	/* next aligned source block */
+L(Shl3Start):	/* also re-entered from L(Shl3LoopStart) with xmm1/xmm2 already loaded */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 is all-zero on entry; it becomes the NUL byte mask of xmm2 */
+	pmovmskb %xmm0, %rax	/* rax = bit mask of NUL positions in xmm2 */
+	movaps	%xmm2, %xmm3	/* keep the raw block; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit3Case2OrCase3)	/* taken when r8 was <= 16 before the subtraction */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2	/* xmm2 = (xmm2:xmm1) shifted right 3 bytes = the 16 source bytes starting at rcx */
+	movaps	%xmm2, (%rdx)
+	movaps	29(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* xmm1 and xmm3 alternate as the holder of the previous raw block */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	29(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	29(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	29(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back by the rounding */
+	lea	-13(%rcx), %rcx	/* re-bias rcx so that 13(%rcx) is the 64-byte-aligned block */
+	sub	%rax, %rdx	/* step rdx back by the same amount so it still corresponds to rcx */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and give the stepped-back bytes back to r8 */
+# endif
+	movaps	-3(%rcx), %xmm1
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl3LoopStart):	/* Invariant: 13(%rcx) is 64-byte aligned and xmm1 holds the aligned block at -3(%rcx). */
+	movaps	13(%rcx), %xmm2
+	movaps	29(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	45(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	61(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = bytewise unsigned minimum of the four blocks; zero byte iff a NUL is present */
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* save the raw last block; it becomes xmm1 for the next iteration */
+	palignr	$3, %xmm4, %xmm5
+	test	%rax, %rax	/* palignr does not modify flags, so this result is still valid at the jnz */
+	palignr	$3, %xmm3, %xmm4
+	jnz	L(Shl3Start)	/* NUL in these 64 bytes: redo them 16 at a time (xmm1 and xmm2 are still unshifted) */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave3)	/* taken when r8 was <= 64 before the subtraction */
+# endif
+	palignr	$3, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl3LoopStart)
+
+L(Shl3LoopExit):	/* NUL found in the block at 13(%rcx): copy the 16 bytes before it, then hand over to L(CopyFrom1To16Bytes) (defined outside this view). */
+	movdqu	-3(%rcx), %xmm1
+	mov	$13, %rsi	/* rsi = offset from rcx of the block that rax describes; presumably consumed by the jump target */
+	movdqu	%xmm1, -3(%rdx)	/* unaligned store: rdx is 16-byte aligned, so rdx-3 is not */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl4):	/* Reached from the dispatch with (rcx & 15) == 4 and rdx 16-byte aligned: read aligned source blocks and realign them by 4 bytes with palignr. */
+	movaps	-4(%rcx), %xmm1	/* aligned block that contains the byte at rcx */
+	movaps	12(%rcx), %xmm2	/* next aligned source block */
+L(Shl4Start):	/* also re-entered from L(Shl4LoopStart) with xmm1/xmm2 already loaded */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 is all-zero on entry; it becomes the NUL byte mask of xmm2 */
+	pmovmskb %xmm0, %rax	/* rax = bit mask of NUL positions in xmm2 */
+	movaps	%xmm2, %xmm3	/* keep the raw block; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit4Case2OrCase3)	/* taken when r8 was <= 16 before the subtraction */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2	/* xmm2 = (xmm2:xmm1) shifted right 4 bytes = the 16 source bytes starting at rcx */
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* xmm1 and xmm3 alternate as the holder of the previous raw block */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	28(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back by the rounding */
+	lea	-12(%rcx), %rcx	/* re-bias rcx so that 12(%rcx) is the 64-byte-aligned block */
+	sub	%rax, %rdx	/* step rdx back by the same amount so it still corresponds to rcx */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and give the stepped-back bytes back to r8 */
+# endif
+	movaps	-4(%rcx), %xmm1
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl4LoopStart):	/* Invariant: 12(%rcx) is 64-byte aligned and xmm1 holds the aligned block at -4(%rcx). */
+	movaps	12(%rcx), %xmm2
+	movaps	28(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = bytewise unsigned minimum of the four blocks; zero byte iff a NUL is present */
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* save the raw last block; it becomes xmm1 for the next iteration */
+	palignr	$4, %xmm4, %xmm5
+	test	%rax, %rax	/* palignr does not modify flags, so this result is still valid at the jnz */
+	palignr	$4, %xmm3, %xmm4
+	jnz	L(Shl4Start)	/* NUL in these 64 bytes: redo them 16 at a time (xmm1 and xmm2 are still unshifted) */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave4)	/* taken when r8 was <= 64 before the subtraction */
+# endif
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):	/* NUL found in the block at 12(%rcx): copy the 16 bytes before it, then hand over to L(CopyFrom1To16Bytes) (defined outside this view). */
+	movdqu	-4(%rcx), %xmm1
+	mov	$12, %rsi	/* rsi = offset from rcx of the block that rax describes; presumably consumed by the jump target */
+	movdqu	%xmm1, -4(%rdx)	/* unaligned store: rdx is 16-byte aligned, so rdx-4 is not */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl5):	/* Reached from the dispatch with (rcx & 15) == 5 and rdx 16-byte aligned: read aligned source blocks and realign them by 5 bytes with palignr. */
+	movaps	-5(%rcx), %xmm1	/* aligned block that contains the byte at rcx */
+	movaps	11(%rcx), %xmm2	/* next aligned source block */
+L(Shl5Start):	/* also re-entered from L(Shl5LoopStart) with xmm1/xmm2 already loaded */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 is all-zero on entry; it becomes the NUL byte mask of xmm2 */
+	pmovmskb %xmm0, %rax	/* rax = bit mask of NUL positions in xmm2 */
+	movaps	%xmm2, %xmm3	/* keep the raw block; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit5Case2OrCase3)	/* taken when r8 was <= 16 before the subtraction */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2	/* xmm2 = (xmm2:xmm1) shifted right 5 bytes = the 16 source bytes starting at rcx */
+	movaps	%xmm2, (%rdx)
+	movaps	27(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* xmm1 and xmm3 alternate as the holder of the previous raw block */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	27(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	27(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	27(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back by the rounding */
+	lea	-11(%rcx), %rcx	/* re-bias rcx so that 11(%rcx) is the 64-byte-aligned block */
+	sub	%rax, %rdx	/* step rdx back by the same amount so it still corresponds to rcx */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and give the stepped-back bytes back to r8 */
+# endif
+	movaps	-5(%rcx), %xmm1
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl5LoopStart):	/* Invariant: 11(%rcx) is 64-byte aligned and xmm1 holds the aligned block at -5(%rcx). */
+	movaps	11(%rcx), %xmm2
+	movaps	27(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	43(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	59(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = bytewise unsigned minimum of the four blocks; zero byte iff a NUL is present */
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* save the raw last block; it becomes xmm1 for the next iteration */
+	palignr	$5, %xmm4, %xmm5
+	test	%rax, %rax	/* palignr does not modify flags, so this result is still valid at the jnz */
+	palignr	$5, %xmm3, %xmm4
+	jnz	L(Shl5Start)	/* NUL in these 64 bytes: redo them 16 at a time (xmm1 and xmm2 are still unshifted) */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave5)	/* taken when r8 was <= 64 before the subtraction */
+# endif
+	palignr	$5, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl5LoopStart)
+
+L(Shl5LoopExit):	/* NUL found in the block at 11(%rcx): copy the 16 bytes before it, then hand over to L(CopyFrom1To16Bytes) (defined outside this view). */
+	movdqu	-5(%rcx), %xmm1
+	mov	$11, %rsi	/* rsi = offset from rcx of the block that rax describes; presumably consumed by the jump target */
+	movdqu	%xmm1, -5(%rdx)	/* unaligned store: rdx is 16-byte aligned, so rdx-5 is not */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl6):	/* Reached from the dispatch with (rcx & 15) == 6 and rdx 16-byte aligned: read aligned source blocks and realign them by 6 bytes with palignr. */
+	movaps	-6(%rcx), %xmm1	/* aligned block that contains the byte at rcx */
+	movaps	10(%rcx), %xmm2	/* next aligned source block */
+L(Shl6Start):	/* also re-entered from L(Shl6LoopStart) with xmm1/xmm2 already loaded */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 is all-zero on entry; it becomes the NUL byte mask of xmm2 */
+	pmovmskb %xmm0, %rax	/* rax = bit mask of NUL positions in xmm2 */
+	movaps	%xmm2, %xmm3	/* keep the raw block; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit6Case2OrCase3)	/* taken when r8 was <= 16 before the subtraction */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2	/* xmm2 = (xmm2:xmm1) shifted right 6 bytes = the 16 source bytes starting at rcx */
+	movaps	%xmm2, (%rdx)
+	movaps	26(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* xmm1 and xmm3 alternate as the holder of the previous raw block */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	26(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	26(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	26(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back by the rounding */
+	lea	-10(%rcx), %rcx	/* re-bias rcx so that 10(%rcx) is the 64-byte-aligned block */
+	sub	%rax, %rdx	/* step rdx back by the same amount so it still corresponds to rcx */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and give the stepped-back bytes back to r8 */
+# endif
+	movaps	-6(%rcx), %xmm1
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl6LoopStart):	/* Invariant: 10(%rcx) is 64-byte aligned and xmm1 holds the aligned block at -6(%rcx). */
+	movaps	10(%rcx), %xmm2
+	movaps	26(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	42(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	58(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = bytewise unsigned minimum of the four blocks; zero byte iff a NUL is present */
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* save the raw last block; it becomes xmm1 for the next iteration */
+	palignr	$6, %xmm4, %xmm5
+	test	%rax, %rax	/* palignr does not modify flags, so this result is still valid at the jnz */
+	palignr	$6, %xmm3, %xmm4
+	jnz	L(Shl6Start)	/* NUL in these 64 bytes: redo them 16 at a time (xmm1 and xmm2 are still unshifted) */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave6)	/* taken when r8 was <= 64 before the subtraction */
+# endif
+	palignr	$6, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl6LoopStart)
+
+L(Shl6LoopExit):	/* NUL found in the block at 10(%rcx): copy the 10 bytes before it, then hand over to L(CopyFrom1To16Bytes) (defined outside this view). */
+	mov	(%rcx), %r9	/* source bytes 0..7 */
+	mov	6(%rcx), %esi	/* source bytes 6..9 (overlaps the 8-byte load) */
+	mov	%r9, (%rdx)
+	mov	%esi, 6(%rdx)
+	mov	$10, %rsi	/* rsi = offset from rcx of the block that rax describes; presumably consumed by the jump target */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl7):	/* Reached from the dispatch with (rcx & 15) == 7 and rdx 16-byte aligned: read aligned source blocks and realign them by 7 bytes with palignr. */
+	movaps	-7(%rcx), %xmm1	/* aligned block that contains the byte at rcx */
+	movaps	9(%rcx), %xmm2	/* next aligned source block */
+L(Shl7Start):	/* also re-entered from L(Shl7LoopStart) with xmm1/xmm2 already loaded */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 is all-zero on entry; it becomes the NUL byte mask of xmm2 */
+	pmovmskb %xmm0, %rax	/* rax = bit mask of NUL positions in xmm2 */
+	movaps	%xmm2, %xmm3	/* keep the raw block; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit7Case2OrCase3)	/* taken when r8 was <= 16 before the subtraction */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2	/* xmm2 = (xmm2:xmm1) shifted right 7 bytes = the 16 source bytes starting at rcx */
+	movaps	%xmm2, (%rdx)
+	movaps	25(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* xmm1 and xmm3 alternate as the holder of the previous raw block */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	25(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	25(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	25(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back by the rounding */
+	lea	-9(%rcx), %rcx	/* re-bias rcx so that 9(%rcx) is the 64-byte-aligned block */
+	sub	%rax, %rdx	/* step rdx back by the same amount so it still corresponds to rcx */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and give the stepped-back bytes back to r8 */
+# endif
+	movaps	-7(%rcx), %xmm1
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl7LoopStart):	/* Invariant: 9(%rcx) is 64-byte aligned and xmm1 holds the aligned block at -7(%rcx). */
+	movaps	9(%rcx), %xmm2
+	movaps	25(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	41(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	57(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = bytewise unsigned minimum of the four blocks; zero byte iff a NUL is present */
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* save the raw last block; it becomes xmm1 for the next iteration */
+	palignr	$7, %xmm4, %xmm5
+	test	%rax, %rax	/* palignr does not modify flags, so this result is still valid at the jnz */
+	palignr	$7, %xmm3, %xmm4
+	jnz	L(Shl7Start)	/* NUL in these 64 bytes: redo them 16 at a time (xmm1 and xmm2 are still unshifted) */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave7)	/* taken when r8 was <= 64 before the subtraction */
+# endif
+	palignr	$7, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl7LoopStart)
+
+L(Shl7LoopExit):	/* NUL found in the block at 9(%rcx): copy the 9 bytes before it, then hand over to L(CopyFrom1To16Bytes) (defined outside this view). */
+	mov	(%rcx), %r9	/* source bytes 0..7 */
+	mov	5(%rcx), %esi	/* source bytes 5..8 (overlaps the 8-byte load) */
+	mov	%r9, (%rdx)
+	mov	%esi, 5(%rdx)
+	mov	$9, %rsi	/* rsi = offset from rcx of the block that rax describes; presumably consumed by the jump target */
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl8):	/* Reached from the dispatch with (rcx & 15) == 8 and rdx 16-byte aligned: read aligned source blocks and realign them by 8 bytes with palignr. */
+	movaps	-8(%rcx), %xmm1	/* aligned block that contains the byte at rcx */
+	movaps	8(%rcx), %xmm2	/* next aligned source block */
+L(Shl8Start):	/* also re-entered from L(Shl8LoopStart) with xmm1/xmm2 already loaded */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 is all-zero on entry; it becomes the NUL byte mask of xmm2 */
+	pmovmskb %xmm0, %rax	/* rax = bit mask of NUL positions in xmm2 */
+	movaps	%xmm2, %xmm3	/* keep the raw block; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit8Case2OrCase3)	/* taken when r8 was <= 16 before the subtraction */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2	/* xmm2 = (xmm2:xmm1) shifted right 8 bytes = the 16 source bytes starting at rcx */
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* xmm1 and xmm3 alternate as the holder of the previous raw block */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	24(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back by the rounding */
+	lea	-8(%rcx), %rcx	/* re-bias rcx so that 8(%rcx) is the 64-byte-aligned block */
+	sub	%rax, %rdx	/* step rdx back by the same amount so it still corresponds to rcx */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and give the stepped-back bytes back to r8 */
+# endif
+	movaps	-8(%rcx), %xmm1
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl8LoopStart):	/* Invariant: 8(%rcx) is 64-byte aligned and xmm1 holds the aligned block at -8(%rcx). */
+	movaps	8(%rcx), %xmm2
+	movaps	24(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = bytewise unsigned minimum of the four blocks; zero byte iff a NUL is present */
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* save the raw last block; it becomes xmm1 for the next iteration */
+	palignr	$8, %xmm4, %xmm5
+	test	%rax, %rax	/* palignr does not modify flags, so this result is still valid at the jnz */
+	palignr	$8, %xmm3, %xmm4
+	jnz	L(Shl8Start)	/* NUL in these 64 bytes: redo them 16 at a time (xmm1 and xmm2 are still unshifted) */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave8)	/* taken when r8 was <= 64 before the subtraction */
+# endif
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):	/* NUL found in the block at 8(%rcx): copy the 8 bytes before it, then hand over to L(CopyFrom1To16Bytes) (defined outside this view). */
+	mov	(%rcx), %r9	/* source bytes 0..7 */
+	mov	$8, %rsi	/* rsi = offset from rcx of the block that rax describes; presumably consumed by the jump target */
+	mov	%r9, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl9):	/* Reached from the dispatch with (rcx & 15) == 9 and rdx 16-byte aligned: read aligned source blocks and realign them by 9 bytes with palignr. */
+	movaps	-9(%rcx), %xmm1	/* aligned block that contains the byte at rcx */
+	movaps	7(%rcx), %xmm2	/* next aligned source block */
+L(Shl9Start):	/* also re-entered from L(Shl9LoopStart) with xmm1/xmm2 already loaded */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 is all-zero on entry; it becomes the NUL byte mask of xmm2 */
+	pmovmskb %xmm0, %rax	/* rax = bit mask of NUL positions in xmm2 */
+	movaps	%xmm2, %xmm3	/* keep the raw block; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit9Case2OrCase3)	/* taken when r8 was <= 16 before the subtraction */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2	/* xmm2 = (xmm2:xmm1) shifted right 9 bytes = the 16 source bytes starting at rcx */
+	movaps	%xmm2, (%rdx)
+	movaps	23(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* xmm1 and xmm3 alternate as the holder of the previous raw block */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	23(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	23(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	23(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back by the rounding */
+	lea	-7(%rcx), %rcx	/* re-bias rcx so that 7(%rcx) is the 64-byte-aligned block */
+	sub	%rax, %rdx	/* step rdx back by the same amount so it still corresponds to rcx */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and give the stepped-back bytes back to r8 */
+# endif
+	movaps	-9(%rcx), %xmm1
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl9LoopStart):	/* Invariant: 7(%rcx) is 64-byte aligned and xmm1 holds the aligned block at -9(%rcx). */
+	movaps	7(%rcx), %xmm2
+	movaps	23(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	39(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	55(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = bytewise unsigned minimum of the four blocks; zero byte iff a NUL is present */
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* save the raw last block; it becomes xmm1 for the next iteration */
+	palignr	$9, %xmm4, %xmm5
+	test	%rax, %rax	/* palignr does not modify flags, so this result is still valid at the jnz */
+	palignr	$9, %xmm3, %xmm4
+	jnz	L(Shl9Start)	/* NUL in these 64 bytes: redo them 16 at a time (xmm1 and xmm2 are still unshifted) */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave9)	/* taken when r8 was <= 64 before the subtraction */
+# endif
+	palignr	$9, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl9LoopStart)
+
+L(Shl9LoopExit):	/* NUL found in the block at 7(%rcx): copy the 8 bytes ending just before it, then hand over to L(CopyFrom1To16Bytes) (defined outside this view). */
+	mov	-1(%rcx), %r9	/* source bytes -1..6 */
+	mov	$7, %rsi	/* rsi = offset from rcx of the block that rax describes; presumably consumed by the jump target */
+	mov	%r9, -1(%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl10):	/* Reached from the dispatch with (rcx & 15) == 10 and rdx 16-byte aligned: read aligned source blocks and realign them by 10 bytes with palignr. */
+	movaps	-10(%rcx), %xmm1	/* aligned block that contains the byte at rcx */
+	movaps	6(%rcx), %xmm2	/* next aligned source block */
+L(Shl10Start):	/* also re-entered from L(Shl10LoopStart) with xmm1/xmm2 already loaded */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 is all-zero on entry; it becomes the NUL byte mask of xmm2 */
+	pmovmskb %xmm0, %rax	/* rax = bit mask of NUL positions in xmm2 */
+	movaps	%xmm2, %xmm3	/* keep the raw block; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit10Case2OrCase3)	/* taken when r8 was <= 16 before the subtraction */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2	/* xmm2 = (xmm2:xmm1) shifted right 10 bytes = the 16 source bytes starting at rcx */
+	movaps	%xmm2, (%rdx)
+	movaps	22(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* xmm1 and xmm3 alternate as the holder of the previous raw block */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	22(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	22(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	22(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back by the rounding */
+	lea	-6(%rcx), %rcx	/* re-bias rcx so that 6(%rcx) is the 64-byte-aligned block */
+	sub	%rax, %rdx	/* step rdx back by the same amount so it still corresponds to rcx */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and give the stepped-back bytes back to r8 */
+# endif
+	movaps	-10(%rcx), %xmm1
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl10LoopStart):	/* Invariant: 6(%rcx) is 64-byte aligned and xmm1 holds the aligned block at -10(%rcx). */
+	movaps	6(%rcx), %xmm2
+	movaps	22(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	38(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	54(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = bytewise unsigned minimum of the four blocks; zero byte iff a NUL is present */
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* save the raw last block; it becomes xmm1 for the next iteration */
+	palignr	$10, %xmm4, %xmm5
+	test	%rax, %rax	/* palignr does not modify flags, so this result is still valid at the jnz */
+	palignr	$10, %xmm3, %xmm4
+	jnz	L(Shl10Start)	/* NUL in these 64 bytes: redo them 16 at a time (xmm1 and xmm2 are still unshifted) */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave10)	/* taken when r8 was <= 64 before the subtraction */
+# endif
+	palignr	$10, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl10LoopStart)
+
+L(Shl10LoopExit):	/* NUL found in the block at 6(%rcx): copy the 8 bytes ending just before it, then hand over to L(CopyFrom1To16Bytes) (defined outside this view). */
+	mov	-2(%rcx), %r9	/* source bytes -2..5 */
+	mov	$6, %rsi	/* rsi = offset from rcx of the block that rax describes; presumably consumed by the jump target */
+	mov	%r9, -2(%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl11):	/* Reached from the dispatch with (rcx & 15) == 11 and rdx 16-byte aligned: read aligned source blocks and realign them by 11 bytes with palignr. */
+	movaps	-11(%rcx), %xmm1	/* aligned block that contains the byte at rcx */
+	movaps	5(%rcx), %xmm2	/* next aligned source block */
+L(Shl11Start):	/* also re-entered from L(Shl11LoopStart) with xmm1/xmm2 already loaded */
+	pcmpeqb	%xmm2, %xmm0	/* xmm0 is all-zero on entry; it becomes the NUL byte mask of xmm2 */
+	pmovmskb %xmm0, %rax	/* rax = bit mask of NUL positions in xmm2 */
+	movaps	%xmm2, %xmm3	/* keep the raw block; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit11Case2OrCase3)	/* taken when r8 was <= 16 before the subtraction */
+# endif
+	test	%rax, %rax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2	/* xmm2 = (xmm2:xmm1) shifted right 11 bytes = the 16 source bytes starting at rcx */
+	movaps	%xmm2, (%rdx)
+	movaps	21(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1	/* xmm1 and xmm3 alternate as the holder of the previous raw block */
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	21(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	21(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	21(%rcx), %rcx	/* rcx = address of the next aligned source block */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
+	sub	%rcx, %rax	/* rax = number of bytes stepped back by the rounding */
+	lea	-5(%rcx), %rcx	/* re-bias rcx so that 5(%rcx) is the 64-byte-aligned block */
+	sub	%rax, %rdx	/* step rdx back by the same amount so it still corresponds to rcx */
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8	/* and give the stepped-back bytes back to r8 */
+# endif
+	movaps	-11(%rcx), %xmm1
+
+/* 64 bytes loop */
+	.p2align 4
+L(Shl11LoopStart):	/* Invariant: 5(%rcx) is 64-byte aligned and xmm1 holds the aligned block at -11(%rcx). */
+	movaps	5(%rcx), %xmm2
+	movaps	21(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	37(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	53(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7	/* xmm7 = bytewise unsigned minimum of the four blocks; zero byte iff a NUL is present */
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7	/* save the raw last block; it becomes xmm1 for the next iteration */
+	palignr	$11, %xmm4, %xmm5
+	test	%rax, %rax	/* palignr does not modify flags, so this result is still valid at the jnz */
+	palignr	$11, %xmm3, %xmm4
+	jnz	L(Shl11Start)	/* NUL in these 64 bytes: redo them 16 at a time (xmm1 and xmm2 are still unshifted) */
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave11)	/* taken when r8 was <= 64 before the subtraction */
+# endif
+	palignr	$11, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl11LoopStart)
+
+L(Shl11LoopExit):	/* NUL found in the block at 5(%rcx): copy the 8 bytes ending just before it, then hand over to L(CopyFrom1To16Bytes) (defined outside this view). */
+	mov	-3(%rcx), %r9	/* source bytes -3..4 */
+	mov	$5, %rsi	/* rsi = offset from rcx of the block that rax describes; presumably consumed by the jump target */
+	mov	%r9, -3(%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+/* Copy path for a source whose 16-byte aligned chunks sit at 4(%rcx),
+   20(%rcx), ... (movaps requires these addresses to be aligned) while
+   the destination %rdx is 16-byte aligned.  Every output chunk is built
+   with palignr $12 from two neighbouring source chunks.
+   %xmm0 is assumed to be all-zero on entry (set outside this section --
+   TODO confirm); pcmpeqb leaves it zero as long as no NUL is found.
+   With USE_AS_STRNCPY, %r8 is the remaining byte count.  */
+L(Shl12):
+	movaps	-12(%rcx), %xmm1
+	movaps	4(%rcx), %xmm2
+/* Re-entered from the 64-byte loop below with %xmm1/%xmm2 already loaded
+   in order to locate the NUL chunk by chunk.  */
+L(Shl12Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+/* Four unrolled 16-byte steps follow; %xmm1 and %xmm3 alternate as the
+   previous source chunk fed to palignr.  */
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	20(%rcx), %rcx
+	lea	16(%rdx), %rdx
+
+/* Round the (now aligned) source chunk pointer down to a 64-byte
+   boundary, move %rdx back by the same distance (and give those bytes
+   back to the strncpy count), then re-apply the -4 bias so that chunks
+   are again addressed as 4(%rcx), 20(%rcx), ...  */
+	mov	%rcx, %rax
+	and	$-0x40, %rcx
+	sub	%rcx, %rax
+	lea	-4(%rcx), %rcx
+	sub	%rax, %rdx
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8
+# endif
+	movaps	-12(%rcx), %xmm1
+
+/* 64 bytes loop */
+/* pminub folds the four chunks so that a single pcmpeqb against the zero
+   register %xmm0 detects a NUL in any of them; the palignr merges are
+   interleaved with the test.  On a hit nothing has been stored yet and
+   Shl12Start rescans starting from the untouched %xmm1/%xmm2.  */
+	.p2align 4
+L(Shl12LoopStart):
+	movaps	4(%rcx), %xmm2
+	movaps	20(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$12, %xmm3, %xmm4
+	jnz	L(Shl12Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave12)
+# endif
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl12LoopStart)
+
+/* NUL found in the chunk at 4(%rcx).  Copy the 4 bytes in front of that
+   chunk, then let CopyFrom1To16Bytes advance both pointers by %rsi = 4
+   and copy the tail selected by the mask in %rax.  */
+L(Shl12LoopExit):
+	mov	(%rcx), %r9d
+	mov	$4, %rsi
+	mov	%r9d, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+/* Copy path for a source whose 16-byte aligned chunks sit at 3(%rcx),
+   19(%rcx), ... (movaps requires these addresses to be aligned) while
+   the destination %rdx is 16-byte aligned.  Every output chunk is built
+   with palignr $13 from two neighbouring source chunks.
+   %xmm0 is assumed to be all-zero on entry (set outside this section --
+   TODO confirm); pcmpeqb leaves it zero as long as no NUL is found.
+   With USE_AS_STRNCPY, %r8 is the remaining byte count.  */
+L(Shl13):
+	movaps	-13(%rcx), %xmm1
+	movaps	3(%rcx), %xmm2
+/* Re-entered from the 64-byte loop below with %xmm1/%xmm2 already loaded
+   in order to locate the NUL chunk by chunk.  */
+L(Shl13Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl13LoopExit)
+
+/* Four unrolled 16-byte steps follow; %xmm1 and %xmm3 alternate as the
+   previous source chunk fed to palignr.  */
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	19(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	19(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	19(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	19(%rcx), %rcx
+	lea	16(%rdx), %rdx
+
+/* Round the (now aligned) source chunk pointer down to a 64-byte
+   boundary, move %rdx back by the same distance (and give those bytes
+   back to the strncpy count), then re-apply the -3 bias so that chunks
+   are again addressed as 3(%rcx), 19(%rcx), ...  */
+	mov	%rcx, %rax
+	and	$-0x40, %rcx
+	sub	%rcx, %rax
+	lea	-3(%rcx), %rcx
+	sub	%rax, %rdx
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8
+# endif
+	movaps	-13(%rcx), %xmm1
+
+/* 64 bytes loop */
+/* pminub folds the four chunks so that a single pcmpeqb against the zero
+   register %xmm0 detects a NUL in any of them; the palignr merges are
+   interleaved with the test.  On a hit nothing has been stored yet and
+   Shl13Start rescans starting from the untouched %xmm1/%xmm2.  */
+	.p2align 4
+L(Shl13LoopStart):
+	movaps	3(%rcx), %xmm2
+	movaps	19(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	35(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	51(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7
+	palignr	$13, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$13, %xmm3, %xmm4
+	jnz	L(Shl13Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave13)
+# endif
+	palignr	$13, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl13LoopStart)
+
+/* NUL found in the chunk at 3(%rcx).  Copy the 4 bytes ending at 2(%rcx)
+   (the 3 bytes in front of that chunk plus the 1 before them), then let
+   CopyFrom1To16Bytes advance both pointers by %rsi = 3 and copy the tail
+   selected by the mask in %rax.  */
+L(Shl13LoopExit):
+	mov	-1(%rcx), %r9d
+	mov	$3, %rsi
+	mov	%r9d, -1(%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+/* Copy path for a source whose 16-byte aligned chunks sit at 2(%rcx),
+   18(%rcx), ... (movaps requires these addresses to be aligned) while
+   the destination %rdx is 16-byte aligned.  Every output chunk is built
+   with palignr $14 from two neighbouring source chunks.
+   %xmm0 is assumed to be all-zero on entry (set outside this section --
+   TODO confirm); pcmpeqb leaves it zero as long as no NUL is found.
+   With USE_AS_STRNCPY, %r8 is the remaining byte count.  */
+L(Shl14):
+	movaps	-14(%rcx), %xmm1
+	movaps	2(%rcx), %xmm2
+/* Re-entered from the 64-byte loop below with %xmm1/%xmm2 already loaded
+   in order to locate the NUL chunk by chunk.  */
+L(Shl14Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl14LoopExit)
+
+/* Four unrolled 16-byte steps follow; %xmm1 and %xmm3 alternate as the
+   previous source chunk fed to palignr.  */
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	18(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	18(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	18(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	18(%rcx), %rcx
+	lea	16(%rdx), %rdx
+
+/* Round the (now aligned) source chunk pointer down to a 64-byte
+   boundary, move %rdx back by the same distance (and give those bytes
+   back to the strncpy count), then re-apply the -2 bias so that chunks
+   are again addressed as 2(%rcx), 18(%rcx), ...  */
+	mov	%rcx, %rax
+	and	$-0x40, %rcx
+	sub	%rcx, %rax
+	lea	-2(%rcx), %rcx
+	sub	%rax, %rdx
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8
+# endif
+	movaps	-14(%rcx), %xmm1
+
+/* 64 bytes loop */
+/* pminub folds the four chunks so that a single pcmpeqb against the zero
+   register %xmm0 detects a NUL in any of them; the palignr merges are
+   interleaved with the test.  On a hit nothing has been stored yet and
+   Shl14Start rescans starting from the untouched %xmm1/%xmm2.  */
+	.p2align 4
+L(Shl14LoopStart):
+	movaps	2(%rcx), %xmm2
+	movaps	18(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	34(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	50(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7
+	palignr	$14, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$14, %xmm3, %xmm4
+	jnz	L(Shl14Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave14)
+# endif
+	palignr	$14, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl14LoopStart)
+
+/* NUL found in the chunk at 2(%rcx).  Copy the 4 bytes ending at 1(%rcx)
+   (the 2 bytes in front of that chunk plus the 2 before them), then let
+   CopyFrom1To16Bytes advance both pointers by %rsi = 2 and copy the tail
+   selected by the mask in %rax.  */
+L(Shl14LoopExit):
+	mov	-2(%rcx), %r9d
+	mov	$2, %rsi
+	mov	%r9d, -2(%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+/* Copy path for a source whose 16-byte aligned chunks sit at 1(%rcx),
+   17(%rcx), ... (movaps requires these addresses to be aligned) while
+   the destination %rdx is 16-byte aligned.  Every output chunk is built
+   with palignr $15 from two neighbouring source chunks.
+   %xmm0 is assumed to be all-zero on entry (set outside this section --
+   TODO confirm); pcmpeqb leaves it zero as long as no NUL is found.
+   With USE_AS_STRNCPY, %r8 is the remaining byte count.  */
+L(Shl15):
+	movaps	-15(%rcx), %xmm1
+	movaps	1(%rcx), %xmm2
+/* Re-entered from the 64-byte loop below with %xmm1/%xmm2 already loaded
+   in order to locate the NUL chunk by chunk.  */
+L(Shl15Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl15LoopExit)
+
+/* Four unrolled 16-byte steps follow; %xmm1 and %xmm3 alternate as the
+   previous source chunk fed to palignr.  */
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	17(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm1
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	17(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	17(%rcx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%rax, %rax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm3, %xmm2
+	movaps	%xmm2, (%rdx)
+	lea	17(%rcx), %rcx
+	lea	16(%rdx), %rdx
+
+/* Round the (now aligned) source chunk pointer down to a 64-byte
+   boundary, move %rdx back by the same distance (and give those bytes
+   back to the strncpy count), then re-apply the -1 bias so that chunks
+   are again addressed as 1(%rcx), 17(%rcx), ...  */
+	mov	%rcx, %rax
+	and	$-0x40, %rcx
+	sub	%rcx, %rax
+	lea	-1(%rcx), %rcx
+	sub	%rax, %rdx
+# ifdef USE_AS_STRNCPY
+	add	%rax, %r8
+# endif
+	movaps	-15(%rcx), %xmm1
+
+/* 64 bytes loop */
+/* pminub folds the four chunks so that a single pcmpeqb against the zero
+   register %xmm0 detects a NUL in any of them; the palignr merges are
+   interleaved with the test.  On a hit nothing has been stored yet and
+   Shl15Start rescans starting from the untouched %xmm1/%xmm2.  */
+	.p2align 4
+L(Shl15LoopStart):
+	movaps	1(%rcx), %xmm2
+	movaps	17(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	33(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	49(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7
+	palignr	$15, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$15, %xmm3, %xmm4
+	jnz	L(Shl15Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %r8
+	jbe	L(StrncpyLeave15)
+# endif
+	palignr	$15, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl15LoopStart)
+
+/* NUL found in the chunk at 1(%rcx).  Copy the 4 bytes ending at (%rcx)
+   (the 1 byte in front of that chunk plus the 3 before it) and set
+   %rsi = 1 for CopyFrom1To16Bytes.  When not built as strcat that
+   routine follows directly, so execution simply falls through into it;
+   the strcat build needs the explicit jump.  */
+L(Shl15LoopExit):
+	mov	-3(%rcx), %r9d
+	mov	$1, %rsi
+	mov	%r9d, -3(%rdx)
+# ifdef USE_AS_STRCAT
+	jmp	L(CopyFrom1To16Bytes)
+# endif
+
+# ifndef USE_AS_STRCAT
+
+	.p2align 4
+/* Final copy when a NUL lies within the next 16 bytes.
+   In:  %rax = byte mask from pmovmskb (bit k set: byte k is NUL),
+	%rsi = offset added to both %rcx (source) and %rdx (destination).
+   For strncpy the 16 bytes that the caller already subtracted from %r8
+   are added back first.  The lowest set mask bit k selects ExitN with
+   N = k + 1, which copies N bytes; bits 0..6 are tested here, bit 7
+   falls through into Exit8, and a clear low byte goes to ExitHigh.  */
+L(CopyFrom1To16Bytes):
+#  ifdef USE_AS_STRNCPY
+	add	$16, %r8
+#  endif
+	add	%rsi, %rdx
+	add	%rsi, %rcx
+
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+
+	.p2align 4
+/* Copy exactly 8 bytes from (%rcx) to (%rdx) with one 8-byte move.
+   %rax = %rdi, or for USE_AS_STPCPY the address of the last byte
+   written.  With USE_AS_STRNCPY, 8 is subtracted from %r8; if bytes
+   remain they are zero-filled from %rcx = %rdx + 8 by
+   StrncpyFillTailWithZero1, otherwise the stpncpy result is stepped
+   forward by one unless it points at a NUL (cmpb/sbb pair).  */
+L(Exit8):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	7(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$8, %r8
+	lea	8(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+/* Continuation of CopyFrom1To16Bytes when the low mask byte is clear:
+   test bits 8..14 of the mask (in %ah) and go to Exit9..Exit15 for the
+   lowest one set; otherwise fall through into Exit16.  */
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+
+	.p2align 4
+/* Copy exactly 16 bytes from (%rcx) to (%rdx) with two 8-byte moves.
+   %rax = %rdi, or for USE_AS_STPCPY the address of the last byte
+   written.  With USE_AS_STRNCPY, 16 is subtracted from %r8; if bytes
+   remain they are zero-filled from %rcx = %rdx + 16 by
+   StrncpyFillTailWithZero1, otherwise the stpncpy result is stepped
+   forward by one unless it points at a NUL (cmpb/sbb pair).  */
+L(Exit16):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %rax
+	mov	%rax, 8(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	15(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	lea	16(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+#  ifdef USE_AS_STRNCPY
+
+	.p2align 4
+/* strncpy tail, "case 2": a NUL is present in the mask %rax and the
+   count may also end within these 16 bytes; whichever comes first picks
+   the ExitN routine.  %r8 gets back the 16 the caller subtracted
+   (presumably leaving 1..16 -- TODO confirm), %rsi is added to both
+   pointers.  */
+L(CopyFrom1To16BytesCase2):
+	add	$16, %r8
+	add	%rsi, %rcx
+	lea	(%rsi, %rdx), %rsi
+/* %rdx is borrowed as scratch (the new destination waits in %rsi):
+   bit 7 of %dh is the sign of %r8 - 9, i.e. set when %r8 <= 8 for the
+   small counts expected here.  OR-ing in %al makes %dh nonzero when
+   either the count or a NUL falls in the low 8 bytes.  The lea that
+   restores %rdx does not touch the flags tested by jz.  */
+	lea	-9(%r8), %rdx
+	and	$1<<7, %dh
+	or	%al, %dh
+	test	%dh, %dh
+	lea	(%rsi), %rdx
+	jz	L(ExitHighCase2)
+
+/* For N = 1..7: stop at N if the count is N or mask bit N-1 is set.  */
+	cmp	$1, %r8
+	je	L(Exit1)
+	test	$0x01, %al
+	jnz	L(Exit1)
+	cmp	$2, %r8
+	je	L(Exit2)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$3, %r8
+	je	L(Exit3)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$4, %r8
+	je	L(Exit4)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$5, %r8
+	je	L(Exit5)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$6, %r8
+	je	L(Exit6)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$7, %r8
+	je	L(Exit7)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	jmp	L(Exit8)
+
+	.p2align 4
+/* Continuation of CopyFrom1To16BytesCase2 when neither the count nor a
+   NUL falls in the low 8 bytes.  For N = 9..15: stop at N if the count
+   %r8 is N or mask bit N-1 (in %ah) is set; otherwise copy all 16.  */
+L(ExitHighCase2):
+	cmp	$9, %r8
+	je	L(Exit9)
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	cmp	$10, %r8
+	je	L(Exit10)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$11, %r8
+	je	L(Exit11)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$12, %r8
+	je	L(Exit12)
+	test	$0x8, %ah
+	jnz	L(Exit12)
+	cmp	$13, %r8
+	je	L(Exit13)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$14, %r8
+	je	L(Exit14)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$15, %r8
+	je	L(Exit15)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	jmp	L(Exit16)
+
+/* Pick the strncpy tail handler: a nonzero NUL mask in %rax goes to
+   CopyFrom1To16BytesCase2, otherwise execution falls through (across
+   the alignment padding) into CopyFrom1To16BytesCase3.  */
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+
+	.p2align 4
+/* strncpy tail, "case 3": no NUL in sight, the count alone ends the
+   copy.  %r8 gets back the 16 the caller subtracted, %rsi is added to
+   both pointers, and the resulting count (expected to be 1..16 -- TODO
+   confirm against callers) selects the ExitN routine that copies
+   exactly that many bytes.
+   The last branch of every three-way split is always taken (the
+   preceding jl/je exhausted the other outcomes and jumps leave the
+   flags alone), so it is an unconditional jmp; nothing can fall
+   through into the label that follows.  */
+L(CopyFrom1To16BytesCase3):
+	add	$16, %r8
+	add	%rsi, %rdx
+	add	%rsi, %rcx
+
+	cmp	$16, %r8
+	je	L(Exit16)
+	cmp	$8, %r8
+	je	L(Exit8)
+	jg	L(More8Case3)
+	cmp	$4, %r8
+	je	L(Exit4)
+	jg	L(More4Case3)
+	cmp	$2, %r8
+	jl	L(Exit1)
+	je	L(Exit2)
+	jmp	L(Exit3)
+L(More8Case3): /* but less than 16 */
+	cmp	$12, %r8
+	je	L(Exit12)
+	jl	L(Less12Case3)
+	cmp	$14, %r8
+	jl	L(Exit13)
+	je	L(Exit14)
+	jmp	L(Exit15)
+L(More4Case3): /* but less than 8 */
+	cmp	$6, %r8
+	jl	L(Exit5)
+	je	L(Exit6)
+	jmp	L(Exit7)
+L(Less12Case3): /* but more than 8 */
+	cmp	$10, %r8
+	jl	L(Exit9)
+	je	L(Exit10)
+	jmp	L(Exit11)
+#  endif
+
+	.p2align 4
+/* Copy exactly 1 byte from (%rcx) to (%rdx).
+   %rax = %rdi, or for USE_AS_STPCPY the address of the byte written.
+   With USE_AS_STRNCPY, 1 is subtracted from %r8; if bytes remain they
+   are zero-filled from %rcx = %rdx + 1 by StrncpyFillTailWithZero1,
+   otherwise the stpncpy result is stepped forward by one unless it
+   points at a NUL (cmpb sets carry only for a zero byte; sbb $-1 adds
+   1 minus carry).  */
+L(Exit1):
+	movb	(%rcx), %al
+	movb	%al, (%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$1, %r8
+	lea	1(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+/* Copy exactly 2 bytes from (%rcx) to (%rdx) with one 2-byte move.
+   %rax = %rdi, or for USE_AS_STPCPY the address of the last byte
+   written.  With USE_AS_STRNCPY, 2 is subtracted from %r8; if bytes
+   remain they are zero-filled from %rcx = %rdx + 2 by
+   StrncpyFillTailWithZero1, otherwise the stpncpy result is stepped
+   forward by one unless it points at a NUL (cmpb/sbb pair).  */
+L(Exit2):
+	movw	(%rcx), %ax
+	movw	%ax, (%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	1(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$2, %r8
+	lea	2(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+/* Copy exactly 3 bytes from (%rcx) to (%rdx): a 2-byte move plus a
+   1-byte move at offset 2.
+   %rax = %rdi, or for USE_AS_STPCPY the address of the last byte
+   written.  With USE_AS_STRNCPY, 3 is subtracted from %r8; if bytes
+   remain they are zero-filled from %rcx = %rdx + 3 by
+   StrncpyFillTailWithZero1, otherwise the stpncpy result is stepped
+   forward by one unless it points at a NUL (cmpb/sbb pair).  */
+L(Exit3):
+	movw	(%rcx), %ax
+	movw	%ax, (%rdx)
+	movb	2(%rcx), %al
+	movb	%al, 2(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	2(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$3, %r8
+	lea	3(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+/* Copy exactly 4 bytes from (%rcx) to (%rdx) with one 4-byte move.
+   %rax = %rdi, or for USE_AS_STPCPY the address of the last byte
+   written.  With USE_AS_STRNCPY, 4 is subtracted from %r8; if bytes
+   remain they are zero-filled from %rcx = %rdx + 4 by
+   StrncpyFillTailWithZero1, otherwise the stpncpy result is stepped
+   forward by one unless it points at a NUL (cmpb/sbb pair).  */
+L(Exit4):
+	movl	(%rcx), %eax
+	movl	%eax, (%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	3(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$4, %r8
+	lea	4(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+/* Copy exactly 5 bytes from (%rcx) to (%rdx): a 4-byte move plus a
+   1-byte move at offset 4.
+   %rax = %rdi, or for USE_AS_STPCPY the address of the last byte
+   written.  With USE_AS_STRNCPY, 5 is subtracted from %r8; if bytes
+   remain they are zero-filled from %rcx = %rdx + 5 by
+   StrncpyFillTailWithZero1, otherwise the stpncpy result is stepped
+   forward by one unless it points at a NUL (cmpb/sbb pair).  */
+L(Exit5):
+	movl	(%rcx), %eax
+	movl	%eax, (%rdx)
+	movb	4(%rcx), %al
+	movb	%al, 4(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	4(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$5, %r8
+	lea	5(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+/* Copy exactly 6 bytes from (%rcx) to (%rdx): a 4-byte move plus a
+   2-byte move at offset 4.
+   %rax = %rdi, or for USE_AS_STPCPY the address of the last byte
+   written.  With USE_AS_STRNCPY, 6 is subtracted from %r8; if bytes
+   remain they are zero-filled from %rcx = %rdx + 6 by
+   StrncpyFillTailWithZero1, otherwise the stpncpy result is stepped
+   forward by one unless it points at a NUL (cmpb/sbb pair).  */
+L(Exit6):
+	movl	(%rcx), %eax
+	movl	%eax, (%rdx)
+	movw	4(%rcx), %ax
+	movw	%ax, 4(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	5(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$6, %r8
+	lea	6(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+/* Copy exactly 7 bytes from (%rcx) to (%rdx): a 4-byte move plus an
+   overlapping 4-byte move at offset 3.
+   %rax = %rdi, or for USE_AS_STPCPY the address of the last byte
+   written.  With USE_AS_STRNCPY, 7 is subtracted from %r8; if bytes
+   remain they are zero-filled from %rcx = %rdx + 7 by
+   StrncpyFillTailWithZero1, otherwise the stpncpy result is stepped
+   forward by one unless it points at a NUL (cmpb/sbb pair).  */
+L(Exit7):
+	movl	(%rcx), %eax
+	movl	%eax, (%rdx)
+	movl	3(%rcx), %eax
+	movl	%eax, 3(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	6(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$7, %r8
+	lea	7(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+/* Copy exactly 9 bytes from (%rcx) to (%rdx): an 8-byte move plus an
+   overlapping 4-byte move at offset 5.
+   %rax = %rdi, or for USE_AS_STPCPY the address of the last byte
+   written.  With USE_AS_STRNCPY, 9 is subtracted from %r8; if bytes
+   remain they are zero-filled from %rcx = %rdx + 9 by
+   StrncpyFillTailWithZero1, otherwise the stpncpy result is stepped
+   forward by one unless it points at a NUL (cmpb/sbb pair).  */
+L(Exit9):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	5(%rcx), %eax
+	mov	%eax, 5(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	8(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$9, %r8
+	lea	9(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+/* Copy exactly 10 bytes from (%rcx) to (%rdx): an 8-byte move plus an
+   overlapping 4-byte move at offset 6.
+   %rax = %rdi, or for USE_AS_STPCPY the address of the last byte
+   written.  With USE_AS_STRNCPY, 10 is subtracted from %r8; if bytes
+   remain they are zero-filled from %rcx = %rdx + 10 by
+   StrncpyFillTailWithZero1, otherwise the stpncpy result is stepped
+   forward by one unless it points at a NUL (cmpb/sbb pair).  */
+L(Exit10):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	6(%rcx), %eax
+	mov	%eax, 6(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	9(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$10, %r8
+	lea	10(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+/* Copy exactly 11 bytes from (%rcx) to (%rdx): an 8-byte move plus an
+   overlapping 4-byte move at offset 7.
+   %rax = %rdi, or for USE_AS_STPCPY the address of the last byte
+   written.  With USE_AS_STRNCPY, 11 is subtracted from %r8; if bytes
+   remain they are zero-filled from %rcx = %rdx + 11 by
+   StrncpyFillTailWithZero1, otherwise the stpncpy result is stepped
+   forward by one unless it points at a NUL (cmpb/sbb pair).  */
+L(Exit11):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	7(%rcx), %eax
+	mov	%eax, 7(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	10(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$11, %r8
+	lea	11(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+/* Copy exactly 12 bytes from (%rcx) to (%rdx): an 8-byte move plus a
+   4-byte move at offset 8.
+   %rax = %rdi, or for USE_AS_STPCPY the address of the last byte
+   written.  With USE_AS_STRNCPY, 12 is subtracted from %r8; if bytes
+   remain they are zero-filled from %rcx = %rdx + 12 by
+   StrncpyFillTailWithZero1, otherwise the stpncpy result is stepped
+   forward by one unless it points at a NUL (cmpb/sbb pair).  */
+L(Exit12):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %eax
+	mov	%eax, 8(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	11(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$12, %r8
+	lea	12(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+/* Copy exactly 13 bytes from (%rcx) to (%rdx): an 8-byte move plus an
+   overlapping 8-byte move at offset 5.
+   %rax = %rdi, or for USE_AS_STPCPY the address of the last byte
+   written.  With USE_AS_STRNCPY, 13 is subtracted from %r8; if bytes
+   remain they are zero-filled from %rcx = %rdx + 13 by
+   StrncpyFillTailWithZero1, otherwise the stpncpy result is stepped
+   forward by one unless it points at a NUL (cmpb/sbb pair).  */
+L(Exit13):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	5(%rcx), %rax
+	mov	%rax, 5(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	12(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$13, %r8
+	lea	13(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+/* Copy exactly 14 bytes from (%rcx) to (%rdx): an 8-byte move plus an
+   overlapping 8-byte move at offset 6.
+   %rax = %rdi, or for USE_AS_STPCPY the address of the last byte
+   written.  With USE_AS_STRNCPY, 14 is subtracted from %r8; if bytes
+   remain they are zero-filled from %rcx = %rdx + 14 by
+   StrncpyFillTailWithZero1, otherwise the stpncpy result is stepped
+   forward by one unless it points at a NUL (cmpb/sbb pair).  */
+L(Exit14):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	6(%rcx), %rax
+	mov	%rax, 6(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	13(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$14, %r8
+	lea	14(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+	.p2align 4
+/* Copy exactly 15 bytes from (%rcx) to (%rdx): an 8-byte move plus an
+   overlapping 8-byte move at offset 7.
+   %rax = %rdi, or for USE_AS_STPCPY the address of the last byte
+   written.  With USE_AS_STRNCPY, 15 is subtracted from %r8; if bytes
+   remain they are zero-filled from %rcx = %rdx + 15 by
+   StrncpyFillTailWithZero1, otherwise the stpncpy result is stepped
+   forward by one unless it points at a NUL (cmpb/sbb pair).  */
+L(Exit15):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	7(%rcx), %rax
+	mov	%rax, 7(%rdx)
+#  ifdef USE_AS_STPCPY
+	lea	14(%rdx), %rax
+#  else
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRNCPY
+	sub	$15, %r8
+	lea	15(%rdx), %rcx
+	jnz	L(StrncpyFillTailWithZero1)
+#   ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   endif
+#  endif
+	ret
+
+#  ifdef USE_AS_STRNCPY
+	.p2align 4
+/* Fill0 .. Fill16: store exactly N bytes taken from %rdx at (%rcx) and
+   return.  %rdx is expected to be zero when these are reached from
+   FillFrom1To16Bytes (StrncpyFillTailWithZero1 clears it), which makes
+   them zero-fill routines.  Sizes that are not a power of two use two
+   stores, the second one overlapping the first where needed.  %rax is
+   left untouched, so the return value prepared by ExitN survives.  */
+L(Fill0):
+	ret
+
+	.p2align 4
+L(Fill1):
+	movb	%dl, (%rcx)
+	ret
+
+	.p2align 4
+L(Fill2):
+	movw	%dx, (%rcx)
+	ret
+
+	.p2align 4
+/* 2 + 1 bytes.  */
+L(Fill3):
+	movw	%dx, (%rcx)
+	movb	%dl, 2(%rcx)
+	ret
+
+	.p2align 4
+L(Fill4):
+	movl	%edx, (%rcx)
+	ret
+
+	.p2align 4
+/* 4 + 1 bytes.  */
+L(Fill5):
+	movl	%edx, (%rcx)
+	movb	%dl, 4(%rcx)
+	ret
+
+	.p2align 4
+/* 4 + 2 bytes.  */
+L(Fill6):
+	movl	%edx, (%rcx)
+	movw	%dx, 4(%rcx)
+	ret
+
+	.p2align 4
+/* 4 + 4 bytes, overlapping at offset 3.  */
+L(Fill7):
+	movl	%edx, (%rcx)
+	movl	%edx, 3(%rcx)
+	ret
+
+	.p2align 4
+L(Fill8):
+	mov	%rdx, (%rcx)
+	ret
+
+	.p2align 4
+/* 8 + 1 bytes.  */
+L(Fill9):
+	mov	%rdx, (%rcx)
+	movb	%dl, 8(%rcx)
+	ret
+
+	.p2align 4
+/* 8 + 2 bytes.  */
+L(Fill10):
+	mov	%rdx, (%rcx)
+	movw	%dx, 8(%rcx)
+	ret
+
+	.p2align 4
+/* 8 + 4 bytes, overlapping at offset 7.  */
+L(Fill11):
+	mov	%rdx, (%rcx)
+	movl	%edx, 7(%rcx)
+	ret
+
+	.p2align 4
+/* 8 + 4 bytes.  */
+L(Fill12):
+	mov	%rdx, (%rcx)
+	movl	%edx, 8(%rcx)
+	ret
+
+	.p2align 4
+/* 8 + 8 bytes, overlapping at offset 5.  */
+L(Fill13):
+	mov	%rdx, (%rcx)
+	mov	%rdx, 5(%rcx)
+	ret
+
+	.p2align 4
+/* 8 + 8 bytes, overlapping at offset 6.  */
+L(Fill14):
+	mov	%rdx, (%rcx)
+	mov	%rdx, 6(%rcx)
+	ret
+
+	.p2align 4
+/* 8 + 8 bytes, overlapping at offset 7.  */
+L(Fill15):
+	mov	%rdx, (%rcx)
+	mov	%rdx, 7(%rcx)
+	ret
+
+	.p2align 4
+/* 8 + 8 bytes.  */
+L(Fill16):
+	mov	%rdx, (%rcx)
+	mov	%rdx, 8(%rcx)
+	ret
+
+	.p2align 4
+/* StrncpyFillExit1: entered after a 16-byte subtraction took %r8 to
+   zero or below; adding 16 back yields the number of bytes still to be
+   cleared.
+   FillFrom1To16Bytes: dispatch on %r8 (expected 0..16) to the FillN
+   routine that stores exactly N bytes of %rdx at (%rcx).  %rdx is
+   expected to be zero here (StrncpyFillTailWithZero1 clears it).
+   The last branch of every three-way split is always taken (the
+   preceding jl/je exhausted the other outcomes), so it is written as an
+   unconditional jmp throughout, as the FillLess12 tail already was.  */
+L(StrncpyFillExit1):
+	lea	16(%r8), %r8
+L(FillFrom1To16Bytes):
+	test	%r8, %r8
+	jz	L(Fill0)
+	cmp	$16, %r8
+	je	L(Fill16)
+	cmp	$8, %r8
+	je	L(Fill8)
+	jg	L(FillMore8)
+	cmp	$4, %r8
+	je	L(Fill4)
+	jg	L(FillMore4)
+	cmp	$2, %r8
+	jl	L(Fill1)
+	je	L(Fill2)
+	jmp	L(Fill3)
+L(FillMore8): /* but less than 16 */
+	cmp	$12, %r8
+	je	L(Fill12)
+	jl	L(FillLess12)
+	cmp	$14, %r8
+	jl	L(Fill13)
+	je	L(Fill14)
+	jmp	L(Fill15)
+L(FillMore4): /* but less than 8 */
+	cmp	$6, %r8
+	jl	L(Fill5)
+	je	L(Fill6)
+	jmp	L(Fill7)
+L(FillLess12): /* but more than 8 */
+	cmp	$10, %r8
+	jl	L(Fill9)
+	je	L(Fill10)
+	jmp	L(Fill11)
+
+	.p2align 4
+/* Zero-fill the rest of a strncpy destination.
+   In:  %rcx = first byte to clear, %r8 = number of bytes to clear
+	(nonzero at the jump sites in ExitN).
+   Clobbers %rdx, %rcx, %r8, %xmm0 and the flags; %rax is preserved so
+   the return value prepared by the caller survives.
+   %rdx is cleared with the 32-bit xor (writing %edx zero-extends into
+   %rdx); the flags it sets are overwritten by the sub that follows.  */
+L(StrncpyFillTailWithZero1):
+	xor	%edx, %edx
+	sub	$16, %r8
+	jbe	L(StrncpyFillExit1)
+
+/* More than 16 bytes: clear the first 16 with two unaligned 8-byte
+   stores, then round %rcx down to a 16-byte boundary and credit the
+   overlap back to %r8 so that the movdqa stores below are aligned.  */
+	pxor	%xmm0, %xmm0
+	mov	%rdx, (%rcx)
+	mov	%rdx, 8(%rcx)
+
+	lea	16(%rcx), %rcx
+
+	mov	%rcx, %rdx
+	and	$0xf, %rdx
+	sub	%rdx, %rcx
+	add	%rdx, %r8
+	xor	%edx, %edx
+	sub	$64, %r8
+	jb	L(StrncpyFillLess64)
+
+/* Clear 64 bytes per iteration while at least 64 remain.  */
+L(StrncpyFillLoopMovdqa):
+	movdqa	%xmm0, (%rcx)
+	movdqa	%xmm0, 16(%rcx)
+	movdqa	%xmm0, 32(%rcx)
+	movdqa	%xmm0, 48(%rcx)
+	lea	64(%rcx), %rcx
+	sub	$64, %r8
+	jae	L(StrncpyFillLoopMovdqa)
+
+/* %r8 is now (bytes left - 64), i.e. negative.  Peel off 32 and then 16
+   bytes if they fit; the FillN dispatch finishes the last 0..16.  */
+L(StrncpyFillLess64):
+	add	$32, %r8
+	jl	L(StrncpyFillLess32)
+	movdqa	%xmm0, (%rcx)
+	movdqa	%xmm0, 16(%rcx)
+	lea	32(%rcx), %rcx
+	sub	$16, %r8
+	jl	L(StrncpyFillExit1)
+	movdqa	%xmm0, (%rcx)
+	lea	16(%rcx), %rcx
+	jmp	L(FillFrom1To16Bytes)
+
+L(StrncpyFillLess32):
+	add	$16, %r8
+	jl	L(StrncpyFillExit1)
+	movdqa	%xmm0, (%rcx)
+	lea	16(%rcx), %rcx
+	jmp	L(FillFrom1To16Bytes)
+
+	.p2align 4
+/* Return %rdx in %rax without copying anything.  The jump sites are
+   outside this section; presumably the zero-length strncpy case --
+   TODO confirm.  */
+L(Exit0):
+	mov	%rdx, %rax
+	ret
+
+	.p2align 4
+/* strncpy helper for a copy of at most 15 bytes.  For N = 9..14, go to
+   ExitN as soon as the count %r8 equals N or source byte N-1 is NUL;
+   otherwise copy 15 bytes with two overlapping 8-byte moves and return
+   (%rdi, or for stpncpy the last byte written, stepped forward by one
+   unless it is NUL).  Bytes 0..7 are not examined here, so the caller
+   (outside this section) has presumably ruled out a NUL there and a
+   count below 9 -- TODO confirm.  */
+L(StrncpyExit15Bytes):
+	cmp	$9, %r8
+	je	L(Exit9)
+	cmpb	$0, 8(%rcx)
+	jz	L(Exit9)
+	cmp	$10, %r8
+	je	L(Exit10)
+	cmpb	$0, 9(%rcx)
+	jz	L(Exit10)
+	cmp	$11, %r8
+	je	L(Exit11)
+	cmpb	$0, 10(%rcx)
+	jz	L(Exit11)
+	cmp	$12, %r8
+	je	L(Exit12)
+	cmpb	$0, 11(%rcx)
+	jz	L(Exit12)
+	cmp	$13, %r8
+	je	L(Exit13)
+	cmpb	$0, 12(%rcx)
+	jz	L(Exit13)
+	cmp	$14, %r8
+	je	L(Exit14)
+	cmpb	$0, 13(%rcx)
+	jz	L(Exit14)
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	7(%rcx), %rax
+	mov	%rax, 7(%rdx)
+#   ifdef USE_AS_STPCPY
+	lea	14(%rdx), %rax
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   else
+	mov	%rdi, %rax
+#   endif
+	ret
+
+	.p2align 4
+/* strncpy helper for a copy of at most 8 bytes.  For N = 1..7, go to
+   ExitN as soon as the count %r8 equals N or source byte N-1 is NUL;
+   otherwise copy 8 bytes with one 8-byte move and return (%rdi, or for
+   stpncpy the last byte written, stepped forward by one unless it is
+   NUL).  The jump sites are outside this section.  */
+L(StrncpyExit8Bytes):
+	cmp	$1, %r8
+	je	L(Exit1)
+	cmpb	$0, (%rcx)
+	jz	L(Exit1)
+	cmp	$2, %r8
+	je	L(Exit2)
+	cmpb	$0, 1(%rcx)
+	jz	L(Exit2)
+	cmp	$3, %r8
+	je	L(Exit3)
+	cmpb	$0, 2(%rcx)
+	jz	L(Exit3)
+	cmp	$4, %r8
+	je	L(Exit4)
+	cmpb	$0, 3(%rcx)
+	jz	L(Exit4)
+	cmp	$5, %r8
+	je	L(Exit5)
+	cmpb	$0, 4(%rcx)
+	jz	L(Exit5)
+	cmp	$6, %r8
+	je	L(Exit6)
+	cmpb	$0, 5(%rcx)
+	jz	L(Exit6)
+	cmp	$7, %r8
+	je	L(Exit7)
+	cmpb	$0, 6(%rcx)
+	jz	L(Exit7)
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+#   ifdef USE_AS_STPCPY
+	lea	7(%rdx), %rax
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#   else
+	mov	%rdi, %rax
+#   endif
+	ret
+
+#  endif
+# endif
+
+# ifdef USE_AS_STRNCPY
+	.p2align 4
+/* Leave a 64-byte copy loop (outside this section -- presumably the
+   aligned one, judging by the label names) because the strncpy count
+   ran out inside the current block.  A nonzero %rax, presumably the NUL
+   mask for the block, selects Aligned64LeaveCase2; otherwise fall
+   through into Aligned64LeaveCase3.  */
+L(StrncpyLeaveCase2OrCase3):
+	test	%rax, %rax
+	jnz	L(Aligned64LeaveCase2)
+
+/* No NUL involved.  Add back the 64 bytes debited by the loop, then
+   store %xmm4, %xmm5, %xmm6 to -64, -48, -32(%rdx) for as long as more
+   than 16 bytes remain, advancing %rsi by 16 per store.
+   CopyFrom1To16BytesCase3 copies the final 1..16 bytes at offset
+   %rsi.  */
+L(Aligned64LeaveCase3):
+	lea	64(%r8), %r8
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm4, -64(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm5, -48(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase3)
+	movaps	%xmm6, -32(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+	jmp	L(CopyFrom1To16BytesCase3)
+
+/* A NUL is somewhere in the current 64-byte block and the strncpy count
+   ends in it too.  Walk the four chunks %xmm4..%xmm7 in order: compute
+   the NUL mask of the chunk (%xmm0 is assumed to be zero on entry and
+   stays zero while no NUL is found), store the previous chunk, and
+   - if the count ends within this chunk, let
+     CopyFrom1To16BytesCase2OrCase3 decide;
+   - else if the chunk holds a NUL, finish via CopyFrom1To16Bytes;
+   - else advance %rsi by 16 and continue.
+   The add $48 turns the 64-byte debit made by the loop into a 16-byte
+   one, matching what the CopyFrom1To16Bytes* handlers add back.  */
+L(Aligned64LeaveCase2):
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %rax
+	add	$48, %r8
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm4, -64(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm5, -48(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm6, -32(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+	jmp	L(CopyFrom1To16BytesCase2)
+/*--------------------------------------------------*/
+	.p2align 4
+/* The strncpy count ends within the current 16-byte chunk (jump sites
+   are outside this section, presumably the Shl1 path).  Copy 16 bytes
+   from -1(%rcx) to -1(%rdx) with unaligned moves (this overwrites
+   %xmm0), set %rsi = 15 as the offset the tail handlers add to both
+   pointers, and pick Case2 if the NUL mask in %rax is nonzero, else
+   Case3.  */
+L(StrncpyExit1Case2OrCase3):
+	movdqu	-1(%rcx), %xmm0
+	movdqu	%xmm0, -1(%rdx)
+	mov	$15, %rsi
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* The strncpy count ends within the current 16-byte chunk (jump sites
+   are outside this section, presumably the Shl2 path).  Copy 16 bytes
+   from -2(%rcx) to -2(%rdx) with unaligned moves (this overwrites
+   %xmm0), set %rsi = 14 as the offset the tail handlers add to both
+   pointers, and pick Case2 if the NUL mask in %rax is nonzero, else
+   Case3.  */
+L(StrncpyExit2Case2OrCase3):
+	movdqu	-2(%rcx), %xmm0
+	movdqu	%xmm0, -2(%rdx)
+	mov	$14, %rsi
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* The strncpy count ends within the current 16-byte chunk (jump sites
+   are outside this section, presumably the Shl3 path).  Copy 16 bytes
+   from -3(%rcx) to -3(%rdx) with unaligned moves (this overwrites
+   %xmm0), set %rsi = 13 as the offset the tail handlers add to both
+   pointers, and pick Case2 if the NUL mask in %rax is nonzero, else
+   Case3.  */
+L(StrncpyExit3Case2OrCase3):
+	movdqu	-3(%rcx), %xmm0
+	movdqu	%xmm0, -3(%rdx)
+	mov	$13, %rsi
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* The strncpy count ends within the current 16-byte chunk (jump sites
+   are outside this section, presumably the Shl4 path).  Copy 16 bytes
+   from -4(%rcx) to -4(%rdx) with unaligned moves (this overwrites
+   %xmm0), set %rsi = 12 as the offset the tail handlers add to both
+   pointers, and pick Case2 if the NUL mask in %rax is nonzero, else
+   Case3.  */
+L(StrncpyExit4Case2OrCase3):
+	movdqu	-4(%rcx), %xmm0
+	movdqu	%xmm0, -4(%rdx)
+	mov	$12, %rsi
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* The strncpy count ends within the current 16-byte chunk (jump sites
+   are outside this section, presumably the Shl5 path).  Copy 16 bytes
+   from -5(%rcx) to -5(%rdx) with unaligned moves (this overwrites
+   %xmm0), set %rsi = 11 as the offset the tail handlers add to both
+   pointers, and pick Case2 if the NUL mask in %rax is nonzero, else
+   Case3.  */
+L(StrncpyExit5Case2OrCase3):
+	movdqu	-5(%rcx), %xmm0
+	movdqu	%xmm0, -5(%rdx)
+	mov	$11, %rsi
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* The strncpy count ends within the current 16-byte chunk (jump sites
+   are outside this section, presumably the Shl6 path).  Copy 10 bytes
+   from (%rcx) to (%rdx) (8-byte move plus overlapping 4-byte move at
+   offset 6; %rsi is used as a temporary), then set %rsi = 10 as the
+   offset the tail handlers add to both pointers.  The mov of the
+   immediate sits between test and jnz, which is safe because mov does
+   not change the flags.  Case2 if the NUL mask in %rax is nonzero, else
+   Case3.  */
+L(StrncpyExit6Case2OrCase3):
+	mov	(%rcx), %rsi
+	mov	6(%rcx), %r9d
+	mov	%r9d, 6(%rdx)
+	mov	%rsi, (%rdx)
+	test	%rax, %rax
+	mov	$10, %rsi
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* The strncpy count ends within the current 16-byte chunk (jump sites
+   are outside this section, presumably the Shl7 path).  Copy 9 bytes
+   from (%rcx) to (%rdx) (8-byte move plus overlapping 4-byte move at
+   offset 5; %rsi is used as a temporary), then set %rsi = 9 as the
+   offset the tail handlers add to both pointers.  The mov of the
+   immediate sits between test and jnz, which is safe because mov does
+   not change the flags.  Case2 if the NUL mask in %rax is nonzero, else
+   Case3.  */
+L(StrncpyExit7Case2OrCase3):
+	mov	(%rcx), %rsi
+	mov	5(%rcx), %r9d
+	mov	%r9d, 5(%rdx)
+	mov	%rsi, (%rdx)
+	test	%rax, %rax
+	mov	$9, %rsi
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* The strncpy count ends within the current 16-byte chunk (jump sites
+   are outside this section, presumably the Shl8 path).  Copy 8 bytes
+   from (%rcx) to (%rdx), set %rsi = 8 as the offset the tail handlers
+   add to both pointers, and pick Case2 if the NUL mask in %rax is
+   nonzero, else Case3.  */
+L(StrncpyExit8Case2OrCase3):
+	mov	(%rcx), %r9
+	mov	$8, %rsi
+	mov	%r9, (%rdx)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* The strncpy count ends within the current 16-byte chunk (jump sites
+   are outside this section, presumably the Shl9 path).  Copy 8 bytes
+   from -1(%rcx) to -1(%rdx), set %rsi = 7 as the offset the tail
+   handlers add to both pointers, and pick Case2 if the NUL mask in %rax
+   is nonzero, else Case3.  */
+L(StrncpyExit9Case2OrCase3):
+	mov	-1(%rcx), %r9
+	mov	$7, %rsi
+	mov	%r9, -1(%rdx)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* The strncpy count ends within the current 16-byte chunk (jump sites
+   are outside this section, presumably the Shl10 path).  Copy 8 bytes
+   from -2(%rcx) to -2(%rdx), set %rsi = 6 as the offset the tail
+   handlers add to both pointers, and pick Case2 if the NUL mask in %rax
+   is nonzero, else Case3.  */
+L(StrncpyExit10Case2OrCase3):
+	mov	-2(%rcx), %r9
+	mov	$6, %rsi
+	mov	%r9, -2(%rdx)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* Reached from the Shl11 path when the strncpy count ends within the
+   current 16-byte chunk.  Copy 8 bytes from -3(%rcx) to -3(%rdx), set
+   %rsi = 5 as the offset the tail handlers add to both pointers, and
+   pick Case2 if the NUL mask in %rax is nonzero, else Case3.  */
+L(StrncpyExit11Case2OrCase3):
+	mov	-3(%rcx), %r9
+	mov	$5, %rsi
+	mov	%r9, -3(%rdx)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* Reached from the Shl12 path when the strncpy count ends within the
+   current 16-byte chunk.  Copy 4 bytes from (%rcx) to (%rdx), set
+   %rsi = 4 as the offset the tail handlers add to both pointers, and
+   pick Case2 if the NUL mask in %rax is nonzero, else Case3.  */
+L(StrncpyExit12Case2OrCase3):
+	mov	(%rcx), %r9d
+	mov	$4, %rsi
+	mov	%r9d, (%rdx)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* Reached from the Shl13 path when the strncpy count ends within the
+   current 16-byte chunk.  Copy 4 bytes from -1(%rcx) to -1(%rdx), set
+   %rsi = 3 as the offset the tail handlers add to both pointers, and
+   pick Case2 if the NUL mask in %rax is nonzero, else Case3.  */
+L(StrncpyExit13Case2OrCase3):
+	mov	-1(%rcx), %r9d
+	mov	$3, %rsi
+	mov	%r9d, -1(%rdx)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* Reached from the Shl14 path when the strncpy count ends within the
+   current 16-byte chunk.  Copy 4 bytes from -2(%rcx) to -2(%rdx), set
+   %rsi = 2 as the offset the tail handlers add to both pointers, and
+   pick Case2 if the NUL mask in %rax is nonzero, else Case3.  */
+L(StrncpyExit14Case2OrCase3):
+	mov	-2(%rcx), %r9d
+	mov	$2, %rsi
+	mov	%r9d, -2(%rdx)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* Reached from the Shl15 path when the strncpy count ends within the
+   current 16-byte chunk.  Copy 4 bytes from -3(%rcx) to -3(%rdx), set
+   %rsi = 1 as the offset the tail handlers add to both pointers, and
+   pick Case2 if the NUL mask in %rax is nonzero, else Case3.  */
+L(StrncpyExit15Case2OrCase3):
+	mov	-3(%rcx), %r9d
+	mov	$1, %rsi
+	mov	%r9d, -3(%rdx)
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* Leave the 64-byte loop of the shift-by-1 path (outside this section
+   -- presumably shaped like the ShlNLoopStart loops) because the
+   strncpy count %r8 ran out inside the current block.  The add $48
+   turns the 64-byte debit made by the loop into a 16-byte one; each
+   16-byte chunk that still fits entirely is stored, with %rsi advanced
+   by 16 per chunk.  The first two chunks are rebuilt with palignr, the
+   last two come from %xmm4/%xmm5, which are expected to hold data the
+   loop already merged.  The entry value of %rsi is set outside this
+   section.  */
+L(StrncpyLeave1):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit1)
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	31(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit1)
+	palignr	$1, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit1)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit1)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+/* Advance both pointers by %rsi + 15, copy the 15 bytes just skipped
+   (two overlapping 8-byte moves), clear %rsi and let
+   CopyFrom1To16BytesCase3 copy the remaining bytes given by %r8.  */
+L(StrncpyExit1):
+	lea	15(%rdx, %rsi), %rdx
+	lea	15(%rcx, %rsi), %rcx
+	mov	-15(%rcx), %rsi
+	mov	-8(%rcx), %rax
+	mov	%rsi, -15(%rdx)
+	mov	%rax, -8(%rdx)
+	xor	%rsi, %rsi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* Leave the 64-byte loop of the shift-by-2 path (outside this section
+   -- presumably shaped like the ShlNLoopStart loops) because the
+   strncpy count %r8 ran out inside the current block.  The add $48
+   turns the 64-byte debit made by the loop into a 16-byte one; each
+   16-byte chunk that still fits entirely is stored, with %rsi advanced
+   by 16 per chunk.  The first two chunks are rebuilt with palignr, the
+   last two come from %xmm4/%xmm5, which are expected to hold data the
+   loop already merged.  The entry value of %rsi is set outside this
+   section.  */
+L(StrncpyLeave2):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit2)
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	30(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit2)
+	palignr	$2, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit2)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit2)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+/* Advance both pointers by %rsi + 14, copy the 14 bytes just skipped
+   (two overlapping 8-byte moves), clear %rsi and let
+   CopyFrom1To16BytesCase3 copy the remaining bytes given by %r8.  */
+L(StrncpyExit2):
+	lea	14(%rdx, %rsi), %rdx
+	lea	14(%rcx, %rsi), %rcx
+	mov	-14(%rcx), %rsi
+	mov	-8(%rcx), %rax
+	mov	%rsi, -14(%rdx)
+	mov	%rax, -8(%rdx)
+	xor	%rsi, %rsi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* Leave the 64-byte loop of the shift-by-3 path (outside this section
+   -- presumably shaped like the ShlNLoopStart loops) because the
+   strncpy count %r8 ran out inside the current block.  The add $48
+   turns the 64-byte debit made by the loop into a 16-byte one; each
+   16-byte chunk that still fits entirely is stored, with %rsi advanced
+   by 16 per chunk.  The first two chunks are rebuilt with palignr, the
+   last two come from %xmm4/%xmm5, which are expected to hold data the
+   loop already merged.  The entry value of %rsi is set outside this
+   section.  */
+L(StrncpyLeave3):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit3)
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	29(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit3)
+	palignr	$3, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit3)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit3)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+/* Advance both pointers by %rsi + 13, copy the 13 bytes just skipped
+   (two overlapping 8-byte moves), clear %rsi and let
+   CopyFrom1To16BytesCase3 copy the remaining bytes given by %r8.  */
+L(StrncpyExit3):
+	lea	13(%rdx, %rsi), %rdx
+	lea	13(%rcx, %rsi), %rcx
+	mov	-13(%rcx), %rsi
+	mov	-8(%rcx), %rax
+	mov	%rsi, -13(%rdx)
+	mov	%rax, -8(%rdx)
+	xor	%rsi, %rsi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* Leave the 64-byte loop of the shift-by-4 path (outside this section
+   -- presumably shaped like the ShlNLoopStart loops) because the
+   strncpy count %r8 ran out inside the current block.  The add $48
+   turns the 64-byte debit made by the loop into a 16-byte one; each
+   16-byte chunk that still fits entirely is stored, with %rsi advanced
+   by 16 per chunk.  The first two chunks are rebuilt with palignr, the
+   last two come from %xmm4/%xmm5, which are expected to hold data the
+   loop already merged.  The entry value of %rsi is set outside this
+   section.  */
+L(StrncpyLeave4):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit4)
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit4)
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit4)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit4)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+/* Advance both pointers by %rsi + 12, copy the 12 bytes just skipped
+   (an 8-byte and a 4-byte move), clear %rsi and let
+   CopyFrom1To16BytesCase3 copy the remaining bytes given by %r8.  */
+L(StrncpyExit4):
+	lea	12(%rdx, %rsi), %rdx
+	lea	12(%rcx, %rsi), %rcx
+	mov	-12(%rcx), %rsi
+	mov	-4(%rcx), %eax
+	mov	%rsi, -12(%rdx)
+	mov	%eax, -4(%rdx)
+	xor	%rsi, %rsi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* Leave the 64-byte loop of the shift-by-5 path (outside this section
+   -- presumably shaped like the ShlNLoopStart loops) because the
+   strncpy count %r8 ran out inside the current block.  The add $48
+   turns the 64-byte debit made by the loop into a 16-byte one; each
+   16-byte chunk that still fits entirely is stored, with %rsi advanced
+   by 16 per chunk.  The first two chunks are rebuilt with palignr, the
+   last two come from %xmm4/%xmm5, which are expected to hold data the
+   loop already merged.  The entry value of %rsi is set outside this
+   section.  */
+L(StrncpyLeave5):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit5)
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	27(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit5)
+	palignr	$5, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit5)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit5)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+/* Advance both pointers by %rsi + 11, copy the 11 bytes just skipped
+   (an 8-byte move and an overlapping 4-byte move), clear %rsi and let
+   CopyFrom1To16BytesCase3 copy the remaining bytes given by %r8.  */
+L(StrncpyExit5):
+	lea	11(%rdx, %rsi), %rdx
+	lea	11(%rcx, %rsi), %rcx
+	mov	-11(%rcx), %rsi
+	mov	-4(%rcx), %eax
+	mov	%rsi, -11(%rdx)
+	mov	%eax, -4(%rdx)
+	xor	%rsi, %rsi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* Leave the 64-byte loop of the shift-by-6 path (outside this section
+   -- presumably shaped like the ShlNLoopStart loops) because the
+   strncpy count %r8 ran out inside the current block.  The add $48
+   turns the 64-byte debit made by the loop into a 16-byte one; each
+   16-byte chunk that still fits entirely is stored, with %rsi advanced
+   by 16 per chunk.  The first two chunks are rebuilt with palignr, the
+   last two come from %xmm4/%xmm5, which are expected to hold data the
+   loop already merged.  The entry value of %rsi is set outside this
+   section.  */
+L(StrncpyLeave6):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit6)
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	26(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit6)
+	palignr	$6, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit6)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit6)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+/* Advance both pointers by %rsi + 10, copy the 10 bytes just skipped
+   (an 8-byte and a 2-byte move), clear %rsi and let
+   CopyFrom1To16BytesCase3 copy the remaining bytes given by %r8.  */
+L(StrncpyExit6):
+	lea	10(%rdx, %rsi), %rdx
+	lea	10(%rcx, %rsi), %rcx
+	mov	-10(%rcx), %rsi
+	movw	-2(%rcx), %ax
+	mov	%rsi, -10(%rdx)
+	movw	%ax, -2(%rdx)
+	xor	%rsi, %rsi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* Leave the 64-byte loop of the shift-by-7 path (outside this section
+   -- presumably shaped like the ShlNLoopStart loops) because the
+   strncpy count %r8 ran out inside the current block.  The add $48
+   turns the 64-byte debit made by the loop into a 16-byte one; each
+   16-byte chunk that still fits entirely is stored, with %rsi advanced
+   by 16 per chunk.  The first two chunks are rebuilt with palignr, the
+   last two come from %xmm4/%xmm5, which are expected to hold data the
+   loop already merged.  The entry value of %rsi is set outside this
+   section.  */
+L(StrncpyLeave7):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit7)
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	25(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit7)
+	palignr	$7, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit7)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit7)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+/* Advance both pointers by %rsi + 9, copy the 9 bytes just skipped (an
+   8-byte move plus one byte carried through %ah), clear %rsi and let
+   CopyFrom1To16BytesCase3 copy the remaining bytes given by %r8.  */
+L(StrncpyExit7):
+	lea	9(%rdx, %rsi), %rdx
+	lea	9(%rcx, %rsi), %rcx
+	mov	-9(%rcx), %rsi
+	movb	-1(%rcx), %ah
+	mov	%rsi, -9(%rdx)
+	movb	%ah, -1(%rdx)
+	xor	%rsi, %rsi
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+/* Leave the 64-byte loop of the shift-by-8 path (outside this section
+   -- presumably shaped like the ShlNLoopStart loops) because the
+   strncpy count %r8 ran out inside the current block.  The add $48
+   turns the 64-byte debit made by the loop into a 16-byte one; each
+   16-byte chunk that still fits entirely is stored, with %rsi advanced
+   by 16 per chunk.  The first two chunks are rebuilt with palignr, the
+   last two come from %xmm4/%xmm5, which are expected to hold data the
+   loop already merged.  The entry value of %rsi is set outside this
+   section.  */
+L(StrncpyLeave8):
+	movaps	%xmm2, %xmm3
+	add	$48, %r8
+	jle	L(StrncpyExit8)
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit8)
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit8)
+	movaps	%xmm4, 32(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit8)
+	movaps	%xmm5, 48(%rdx)
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8
+
+/* Advance both pointers by %rsi + 8, copy the 8 bytes just skipped,
+   clear %rsi and let CopyFrom1To16BytesCase3 copy the remaining bytes
+   given by %r8.  */
+L(StrncpyExit8):
+	lea	8(%rdx, %rsi), %rdx
+	lea	8(%rcx, %rsi), %rcx
+	mov	-8(%rcx), %rax
+	xor	%rsi, %rsi
+	mov	%rax, -8(%rdx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave9):	/* Store up to four pending 16-byte chunks at %rdx,
+	   leaving early for L(StrncpyExit9) once the count in %r8 is used up.
+	   NOTE(review): %r8 looks like the remaining strncpy length, biased
+	   by the code that jumps here -- confirm at the jump site.  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2; it is overwritten by the load below.  */
+	add	$48, %r8
+	jle	L(StrncpyExit9)	/* Signed test (result <= 0), unlike the unsigned jbe below.  */
+	palignr	$9, %xmm1, %xmm2	/* Top 7 bytes of %xmm1, then low 9 of %xmm2.  */
+	movaps	%xmm2, (%rdx)
+	movaps	23(%rcx), %xmm2	/* movaps: %rcx + 23 must be 16-byte aligned.  */
+	lea	16(%rsi), %rsi	/* %rsi += 16 for the chunk just stored.  */
+	sub	$16, %r8
+	jbe	L(StrncpyExit9)	/* Leave when the subtraction borrows or reaches zero.  */
+	palignr	$9, %xmm3, %xmm2	/* Top 7 bytes of the saved chunk, low 9 of the new one.  */
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit9)
+	movaps	%xmm4, 32(%rdx)	/* Stored unshifted here; presumably pre-shifted by the caller -- TODO confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit9)
+	movaps	%xmm5, 48(%rdx)	/* Same assumption as for %xmm4.  */
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags; falls through.  */
+
+L(StrncpyExit9):	/* Advance %rcx (read side) and %rdx (write side) by
+	   %rsi + 7 and copy the 8 bytes that end at the new pointers, i.e.
+	   the 7-byte tail plus the 1 byte just before it; then clear %rsi
+	   and continue at L(CopyFrom1To16BytesCase3).  NOTE(review): the
+	   extra leading byte is presumably one that was already copied --
+	   confirm.  */
+	lea	7(%rdx, %rsi), %rdx
+	lea	7(%rcx, %rsi), %rcx
+	mov	-8(%rcx), %rax	/* One 8-byte load covers the 7-byte tail.  */
+	xor	%rsi, %rsi	/* The old offset is already folded into both pointers.  */
+	mov	%rax, -8(%rdx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave10):	/* Store up to four pending 16-byte chunks at %rdx,
+	   leaving early for L(StrncpyExit10) once the count in %r8 is used
+	   up.  NOTE(review): %r8 looks like the remaining strncpy length,
+	   biased by the code that jumps here -- confirm at the jump site.  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2; it is overwritten by the load below.  */
+	add	$48, %r8
+	jle	L(StrncpyExit10)	/* Signed test (result <= 0), unlike the unsigned jbe below.  */
+	palignr	$10, %xmm1, %xmm2	/* Top 6 bytes of %xmm1, then low 10 of %xmm2.  */
+	movaps	%xmm2, (%rdx)
+	movaps	22(%rcx), %xmm2	/* movaps: %rcx + 22 must be 16-byte aligned.  */
+	lea	16(%rsi), %rsi	/* %rsi += 16 for the chunk just stored.  */
+	sub	$16, %r8
+	jbe	L(StrncpyExit10)	/* Leave when the subtraction borrows or reaches zero.  */
+	palignr	$10, %xmm3, %xmm2	/* Top 6 bytes of the saved chunk, low 10 of the new one.  */
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit10)
+	movaps	%xmm4, 32(%rdx)	/* Stored unshifted here; presumably pre-shifted by the caller -- TODO confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit10)
+	movaps	%xmm5, 48(%rdx)	/* Same assumption as for %xmm4.  */
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags; falls through.  */
+
+L(StrncpyExit10):	/* Advance %rcx (read side) and %rdx (write side) by
+	   %rsi + 6 and copy the 8 bytes that end at the new pointers, i.e.
+	   the 6-byte tail plus the 2 bytes just before it; then clear %rsi
+	   and continue at L(CopyFrom1To16BytesCase3).  NOTE(review): the
+	   extra leading bytes are presumably ones that were already copied
+	   -- confirm.  */
+	lea	6(%rdx, %rsi), %rdx
+	lea	6(%rcx, %rsi), %rcx
+	mov	-8(%rcx), %rax	/* One 8-byte load covers the 6-byte tail.  */
+	xor	%rsi, %rsi	/* The old offset is already folded into both pointers.  */
+	mov	%rax, -8(%rdx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave11):	/* Store up to four pending 16-byte chunks at %rdx,
+	   leaving early for L(StrncpyExit11) once the count in %r8 is used
+	   up.  NOTE(review): %r8 looks like the remaining strncpy length,
+	   biased by the code that jumps here -- confirm at the jump site.  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2; it is overwritten by the load below.  */
+	add	$48, %r8
+	jle	L(StrncpyExit11)	/* Signed test (result <= 0), unlike the unsigned jbe below.  */
+	palignr	$11, %xmm1, %xmm2	/* Top 5 bytes of %xmm1, then low 11 of %xmm2.  */
+	movaps	%xmm2, (%rdx)
+	movaps	21(%rcx), %xmm2	/* movaps: %rcx + 21 must be 16-byte aligned.  */
+	lea	16(%rsi), %rsi	/* %rsi += 16 for the chunk just stored.  */
+	sub	$16, %r8
+	jbe	L(StrncpyExit11)	/* Leave when the subtraction borrows or reaches zero.  */
+	palignr	$11, %xmm3, %xmm2	/* Top 5 bytes of the saved chunk, low 11 of the new one.  */
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit11)
+	movaps	%xmm4, 32(%rdx)	/* Stored unshifted here; presumably pre-shifted by the caller -- TODO confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit11)
+	movaps	%xmm5, 48(%rdx)	/* Same assumption as for %xmm4.  */
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags; falls through.  */
+
+L(StrncpyExit11):	/* Advance %rcx (read side) and %rdx (write side) by
+	   %rsi + 5 and copy the 8 bytes that end at the new pointers, i.e.
+	   the 5-byte tail plus the 3 bytes just before it; then clear %rsi
+	   and continue at L(CopyFrom1To16BytesCase3).  NOTE(review): the
+	   extra leading bytes are presumably ones that were already copied
+	   -- confirm.  */
+	lea	5(%rdx, %rsi), %rdx
+	lea	5(%rcx, %rsi), %rcx
+	mov	-8(%rcx), %rax	/* One 8-byte load covers the 5-byte tail.  */
+	xor	%rsi, %rsi	/* The old offset is already folded into both pointers.  */
+	mov	%rax, -8(%rdx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave12):	/* Store up to four pending 16-byte chunks at %rdx,
+	   leaving early for L(StrncpyExit12) once the count in %r8 is used
+	   up.  NOTE(review): %r8 looks like the remaining strncpy length,
+	   biased by the code that jumps here -- confirm at the jump site.  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2; it is overwritten by the load below.  */
+	add	$48, %r8
+	jle	L(StrncpyExit12)	/* Signed test (result <= 0), unlike the unsigned jbe below.  */
+	palignr	$12, %xmm1, %xmm2	/* Top 4 bytes of %xmm1, then low 12 of %xmm2.  */
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2	/* movaps: %rcx + 20 must be 16-byte aligned.  */
+	lea	16(%rsi), %rsi	/* %rsi += 16 for the chunk just stored.  */
+	sub	$16, %r8
+	jbe	L(StrncpyExit12)	/* Leave when the subtraction borrows or reaches zero.  */
+	palignr	$12, %xmm3, %xmm2	/* Top 4 bytes of the saved chunk, low 12 of the new one.  */
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit12)
+	movaps	%xmm4, 32(%rdx)	/* Stored unshifted here; presumably pre-shifted by the caller -- TODO confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit12)
+	movaps	%xmm5, 48(%rdx)	/* Same assumption as for %xmm4.  */
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags; falls through.  */
+
+L(StrncpyExit12):	/* Advance %rcx (read side) and %rdx (write side) by
+	   %rsi + 4, copy the 4 bytes that end at the new pointers, then
+	   clear %rsi and continue at L(CopyFrom1To16BytesCase3).  */
+	lea	4(%rdx, %rsi), %rdx
+	lea	4(%rcx, %rsi), %rcx
+	mov	-4(%rcx), %eax	/* The whole 4-byte tail in one load.  */
+	xor	%rsi, %rsi	/* The old offset is already folded into both pointers.  */
+	mov	%eax, -4(%rdx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave13):	/* Store up to four pending 16-byte chunks at %rdx,
+	   leaving early for L(StrncpyExit13) once the count in %r8 is used
+	   up.  NOTE(review): %r8 looks like the remaining strncpy length,
+	   biased by the code that jumps here -- confirm at the jump site.  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2; it is overwritten by the load below.  */
+	add	$48, %r8
+	jle	L(StrncpyExit13)	/* Signed test (result <= 0), unlike the unsigned jbe below.  */
+	palignr	$13, %xmm1, %xmm2	/* Top 3 bytes of %xmm1, then low 13 of %xmm2.  */
+	movaps	%xmm2, (%rdx)
+	movaps	19(%rcx), %xmm2	/* movaps: %rcx + 19 must be 16-byte aligned.  */
+	lea	16(%rsi), %rsi	/* %rsi += 16 for the chunk just stored.  */
+	sub	$16, %r8
+	jbe	L(StrncpyExit13)	/* Leave when the subtraction borrows or reaches zero.  */
+	palignr	$13, %xmm3, %xmm2	/* Top 3 bytes of the saved chunk, low 13 of the new one.  */
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit13)
+	movaps	%xmm4, 32(%rdx)	/* Stored unshifted here; presumably pre-shifted by the caller -- TODO confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit13)
+	movaps	%xmm5, 48(%rdx)	/* Same assumption as for %xmm4.  */
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags; falls through.  */
+
+L(StrncpyExit13):	/* Advance %rcx (read side) and %rdx (write side) by
+	   %rsi + 3 and copy the 4 bytes that end at the new pointers, i.e.
+	   the 3-byte tail plus the 1 byte just before it; then clear %rsi
+	   and continue at L(CopyFrom1To16BytesCase3).  NOTE(review): the
+	   extra leading byte is presumably one that was already copied --
+	   confirm.  */
+	lea	3(%rdx, %rsi), %rdx
+	lea	3(%rcx, %rsi), %rcx
+	mov	-4(%rcx), %eax	/* One 4-byte load covers the 3-byte tail.  */
+	xor	%rsi, %rsi	/* The old offset is already folded into both pointers.  */
+	mov	%eax, -4(%rdx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave14):	/* Store up to four pending 16-byte chunks at %rdx,
+	   leaving early for L(StrncpyExit14) once the count in %r8 is used
+	   up.  NOTE(review): %r8 looks like the remaining strncpy length,
+	   biased by the code that jumps here -- confirm at the jump site.  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2; it is overwritten by the load below.  */
+	add	$48, %r8
+	jle	L(StrncpyExit14)	/* Signed test (result <= 0), unlike the unsigned jbe below.  */
+	palignr	$14, %xmm1, %xmm2	/* Top 2 bytes of %xmm1, then low 14 of %xmm2.  */
+	movaps	%xmm2, (%rdx)
+	movaps	18(%rcx), %xmm2	/* movaps: %rcx + 18 must be 16-byte aligned.  */
+	lea	16(%rsi), %rsi	/* %rsi += 16 for the chunk just stored.  */
+	sub	$16, %r8
+	jbe	L(StrncpyExit14)	/* Leave when the subtraction borrows or reaches zero.  */
+	palignr	$14, %xmm3, %xmm2	/* Top 2 bytes of the saved chunk, low 14 of the new one.  */
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit14)
+	movaps	%xmm4, 32(%rdx)	/* Stored unshifted here; presumably pre-shifted by the caller -- TODO confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit14)
+	movaps	%xmm5, 48(%rdx)	/* Same assumption as for %xmm4.  */
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags; falls through.  */
+
+L(StrncpyExit14):	/* Advance %rcx (read side) and %rdx (write side) by
+	   %rsi + 2, copy the 2 bytes that end at the new pointers, then
+	   clear %rsi and continue at L(CopyFrom1To16BytesCase3).  */
+	lea	2(%rdx, %rsi), %rdx
+	lea	2(%rcx, %rsi), %rcx
+	movw	-2(%rcx), %ax	/* The whole 2-byte tail in one load.  */
+	xor	%rsi, %rsi	/* The old offset is already folded into both pointers.  */
+	movw	%ax, -2(%rdx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+	.p2align 4
+L(StrncpyLeave15):	/* Store up to four pending 16-byte chunks at %rdx,
+	   leaving early for L(StrncpyExit15) once the count in %r8 is used
+	   up.  NOTE(review): %r8 looks like the remaining strncpy length,
+	   biased by the code that jumps here -- confirm at the jump site.  */
+	movaps	%xmm2, %xmm3	/* Save %xmm2; it is overwritten by the load below.  */
+	add	$48, %r8
+	jle	L(StrncpyExit15)	/* Signed test (result <= 0), unlike the unsigned jbe below.  */
+	palignr	$15, %xmm1, %xmm2	/* Top byte of %xmm1, then low 15 of %xmm2.  */
+	movaps	%xmm2, (%rdx)
+	movaps	17(%rcx), %xmm2	/* movaps: %rcx + 17 must be 16-byte aligned.  */
+	lea	16(%rsi), %rsi	/* %rsi += 16 for the chunk just stored.  */
+	sub	$16, %r8
+	jbe	L(StrncpyExit15)	/* Leave when the subtraction borrows or reaches zero.  */
+	palignr	$15, %xmm3, %xmm2	/* Top byte of the saved chunk, low 15 of the new one.  */
+	movaps	%xmm2, 16(%rdx)
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit15)
+	movaps	%xmm4, 32(%rdx)	/* Stored unshifted here; presumably pre-shifted by the caller -- TODO confirm.  */
+	lea	16(%rsi), %rsi
+	sub	$16, %r8
+	jbe	L(StrncpyExit15)
+	movaps	%xmm5, 48(%rdx)	/* Same assumption as for %xmm4.  */
+	lea	16(%rsi), %rsi
+	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags; falls through.  */
+
+L(StrncpyExit15):	/* Advance %rcx (read side) and %rdx (write side) by
+	   %rsi + 1, copy the single byte that ends at the new pointers, then
+	   clear %rsi and continue at L(CopyFrom1To16BytesCase3).  */
+	lea	1(%rdx, %rsi), %rdx
+	lea	1(%rcx, %rsi), %rcx
+	movb	-1(%rcx), %ah	/* The 1-byte tail, carried in %ah.  */
+	xor	%rsi, %rsi	/* The old offset is already folded into both pointers.  */
+	movb	%ah, -1(%rdx)
+	jmp	L(CopyFrom1To16BytesCase3)
+
+# endif
+# ifndef USE_AS_STRCAT
+END (STRCPY)
+# endif
+#endif