about summary refs log tree commit diff
path: root/sysdeps/x86_64/multiarch
diff options
context:
space:
mode:
author: Liubov Dmitrieva <liubov.dmitrieva@gmail.com> 2011-12-23 12:02:15 -0500
committer: Ulrich Drepper <drepper@gmail.com> 2011-12-23 12:02:15 -0500
commit: 15db4de19dc0043c25ff6a205bfbc25a180b1c48 (patch)
tree: cc9e4135e4c8a50e4b8512c1fc252b4eeda52713 /sysdeps/x86_64/multiarch
parent: 2b2596b1e94d9d51bd8febe81b759fa45a62e3cb (diff)
download: glibc-15db4de19dc0043c25ff6a205bfbc25a180b1c48.tar.gz
glibc-15db4de19dc0043c25ff6a205bfbc25a180b1c48.tar.xz
glibc-15db4de19dc0043c25ff6a205bfbc25a180b1c48.zip
Fix overrun in destination buffer
Diffstat (limited to 'sysdeps/x86_64/multiarch')
-rw-r--r-- sysdeps/x86_64/multiarch/strcpy-ssse3.S 767
-rw-r--r-- sysdeps/x86_64/multiarch/wcscpy-ssse3.S 64
2 files changed, 323 insertions, 508 deletions
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
index c4ec54cd21..b1047652d9 100644
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
@@ -29,6 +29,7 @@
 
 	.section .text.ssse3,"ax",@progbits
 ENTRY (STRCPY)
+
 	mov	%rsi, %rcx
 #  ifdef USE_AS_STRNCPY
 	mov	%rdx, %r8
@@ -39,7 +40,7 @@ ENTRY (STRCPY)
 	jz	L(Exit0)
 	cmp	$8, %r8
 	jbe	L(StrncpyExit8Bytes)
-#  endif
+# endif
 	cmpb	$0, (%rcx)
 	jz	L(Exit1)
 	cmpb	$0, 1(%rcx)
@@ -56,10 +57,10 @@ ENTRY (STRCPY)
 	jz	L(Exit7)
 	cmpb	$0, 7(%rcx)
 	jz	L(Exit8)
-#  ifdef USE_AS_STRNCPY
+# ifdef USE_AS_STRNCPY
 	cmp	$16, %r8
 	jb	L(StrncpyExit15Bytes)
-#  endif
+# endif
 	cmpb	$0, 8(%rcx)
 	jz	L(Exit9)
 	cmpb	$0, 9(%rcx)
@@ -74,10 +75,10 @@ ENTRY (STRCPY)
 	jz	L(Exit14)
 	cmpb	$0, 14(%rcx)
 	jz	L(Exit15)
-#  ifdef USE_AS_STRNCPY
+# ifdef USE_AS_STRNCPY
 	cmp	$16, %r8
 	je	L(Exit16)
-#  endif
+# endif
 	cmpb	$0, 15(%rcx)
 	jz	L(Exit16)
 # endif
@@ -87,25 +88,15 @@ ENTRY (STRCPY)
 	sub	$16, %r8
 	and	$0xf, %rsi
 
-/* add 16 bytes rcx_shift to r8 */
+/* add 16 bytes rcx_offset to r8 */
+
 	add	%rsi, %r8
 # endif
 	lea	16(%rcx), %rsi
-/* Now:
-	rsi	= alignment_16(rcx) + rcx_shift + 16;
-	rcx_shift = rcx - alignment_16(rcx)
-*/
 	and	$-16, %rsi
-/* Now:
-	rsi	= alignment_16(rcx) + 16
-*/
 	pxor	%xmm0, %xmm0
 	mov	(%rcx), %r9
 	mov	%r9, (%rdx)
-/*
-	look	if there is zero symbol in next 16 bytes of string
-	from	rsi to rsi + 15 and form mask in xmm0
-*/
 	pcmpeqb	(%rsi), %xmm0
 	mov	8(%rcx), %r9
 	mov	%r9, 8(%rdx)
@@ -115,10 +106,6 @@ ENTRY (STRCPY)
 	pmovmskb %xmm0, %rax
 	sub	%rcx, %rsi
 
-/* rsi = 16 - rcx_shift */
-
-/* rax = 0: there isn't end of string from position rsi to rsi+15 */
-
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
@@ -128,17 +115,9 @@ ENTRY (STRCPY)
 
 	mov	%rdx, %rax
 	lea	16(%rdx), %rdx
-/* Now:
-	rdx	= rdx + 16 = alignment_16(rdx) + rdx_shift + 16
-*/
 	and	$-16, %rdx
-
-/* Now: rdx = alignment_16(rdx) + 16 */
-
 	sub	%rdx, %rax
 
-/* Now: rax = rdx_shift - 16 */
-
 # ifdef USE_AS_STRNCPY
 	add	%rax, %rsi
 	lea	-1(%rsi), %rsi
@@ -150,22 +129,11 @@ ENTRY (STRCPY)
 L(ContinueCopy):
 # endif
 	sub	%rax, %rcx
-/* Now:
-	case	rcx_shift >= rdx_shift:
-	rcx	= alignment_16(rcx) + (rcx_shift  - rdx_shift) + 16
-	case	rcx_shift < rdx_shift:
-	rcx	= alignment_16(rcx) + (16 + rcx_shift  - rdx_shift)
-*/
 	mov	%rcx, %rax
 	and	$0xf, %rax
-/* Now:
-	case	rcx_shift >= rdx_shift: rax = rcx_shift  - rdx_shift
-	case	rcx_shift < rdx_shift: rax = (16 + rcx_shift  - rdx_shift)
-	rax	can be 0, 1,	..., 15
-*/
 	mov	$0, %rsi
 
-/* case: rcx_shift == rdx_shift */
+/* case: rcx_offset == rdx_offset */
 
 	jz	L(Align16Both)
 
@@ -282,10 +250,11 @@ L(Align16Both):
 	sub	%rcx, %rax
 	sub	%rax, %rdx
 # ifdef USE_AS_STRNCPY
-	lea	48+64(%r8, %rax), %r8
+	lea	112(%r8, %rax), %r8
 # endif
 	mov	$-0x40, %rsi
 
+	.p2align 4
 L(Aligned64Loop):
 	movaps	(%rcx), %xmm2
 	movaps	%xmm2, %xmm4
@@ -366,7 +335,6 @@ L(Shl1Start):
 	jnz	L(Shl1LoopExit)
 
 	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	31(%rcx), %xmm2
 
@@ -374,7 +342,7 @@ L(Shl1Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit1Case2OrCase3)
@@ -382,10 +350,9 @@ L(Shl1Start):
 	test	%rax, %rax
 	jnz	L(Shl1LoopExit)
 
-	palignr	$1, %xmm1, %xmm2
+	palignr	$1, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	31(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -400,7 +367,6 @@ L(Shl1Start):
 	jnz	L(Shl1LoopExit)
 
 	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	31(%rcx), %xmm2
 
@@ -408,7 +374,6 @@ L(Shl1Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit1Case2OrCase3)
@@ -416,8 +381,7 @@ L(Shl1Start):
 	test	%rax, %rax
 	jnz	L(Shl1LoopExit)
 
-	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$1, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	31(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -432,6 +396,8 @@ L(Shl1Start):
 # endif
 	movaps	-1(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl1LoopStart):
 	movaps	15(%rcx), %xmm2
 	movaps	31(%rcx), %xmm3
@@ -465,11 +431,9 @@ L(Shl1LoopStart):
 	jmp	L(Shl1LoopStart)
 
 L(Shl1LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$15, %xmm6
+	movdqu	-1(%rcx), %xmm1
 	mov	$15, %rsi
-	palignr	$1, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	movdqu	%xmm1, -1(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -488,7 +452,6 @@ L(Shl2Start):
 	jnz	L(Shl2LoopExit)
 
 	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	30(%rcx), %xmm2
 
@@ -496,7 +459,7 @@ L(Shl2Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit2Case2OrCase3)
@@ -504,10 +467,9 @@ L(Shl2Start):
 	test	%rax, %rax
 	jnz	L(Shl2LoopExit)
 
-	palignr	$2, %xmm1, %xmm2
+	palignr	$2, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	30(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -522,7 +484,6 @@ L(Shl2Start):
 	jnz	L(Shl2LoopExit)
 
 	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	30(%rcx), %xmm2
 
@@ -530,7 +491,6 @@ L(Shl2Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit2Case2OrCase3)
@@ -538,8 +498,7 @@ L(Shl2Start):
 	test	%rax, %rax
 	jnz	L(Shl2LoopExit)
 
-	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$2, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	30(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -554,6 +513,8 @@ L(Shl2Start):
 # endif
 	movaps	-2(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl2LoopStart):
 	movaps	14(%rcx), %xmm2
 	movaps	30(%rcx), %xmm3
@@ -587,11 +548,9 @@ L(Shl2LoopStart):
 	jmp	L(Shl2LoopStart)
 
 L(Shl2LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$14, %xmm6
+	movdqu	-2(%rcx), %xmm1
 	mov	$14, %rsi
-	palignr	$2, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	movdqu	%xmm1, -2(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -610,7 +569,6 @@ L(Shl3Start):
 	jnz	L(Shl3LoopExit)
 
 	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	29(%rcx), %xmm2
 
@@ -618,7 +576,7 @@ L(Shl3Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit3Case2OrCase3)
@@ -626,10 +584,9 @@ L(Shl3Start):
 	test	%rax, %rax
 	jnz	L(Shl3LoopExit)
 
-	palignr	$3, %xmm1, %xmm2
+	palignr	$3, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	29(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -644,7 +601,6 @@ L(Shl3Start):
 	jnz	L(Shl3LoopExit)
 
 	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	29(%rcx), %xmm2
 
@@ -652,7 +608,6 @@ L(Shl3Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit3Case2OrCase3)
@@ -660,8 +615,7 @@ L(Shl3Start):
 	test	%rax, %rax
 	jnz	L(Shl3LoopExit)
 
-	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$3, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	29(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -676,6 +630,8 @@ L(Shl3Start):
 # endif
 	movaps	-3(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl3LoopStart):
 	movaps	13(%rcx), %xmm2
 	movaps	29(%rcx), %xmm3
@@ -709,11 +665,9 @@ L(Shl3LoopStart):
 	jmp	L(Shl3LoopStart)
 
 L(Shl3LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$13, %xmm6
+	movdqu	-3(%rcx), %xmm1
 	mov	$13, %rsi
-	palignr	$3, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	movdqu	%xmm1, -3(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -732,7 +686,6 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
 
@@ -740,7 +693,7 @@ L(Shl4Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit4Case2OrCase3)
@@ -748,10 +701,9 @@ L(Shl4Start):
 	test	%rax, %rax
 	jnz	L(Shl4LoopExit)
 
-	palignr	$4, %xmm1, %xmm2
+	palignr	$4, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -766,7 +718,6 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
 
@@ -774,7 +725,6 @@ L(Shl4Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit4Case2OrCase3)
@@ -782,8 +732,7 @@ L(Shl4Start):
 	test	%rax, %rax
 	jnz	L(Shl4LoopExit)
 
-	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$4, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	28(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -798,6 +747,8 @@ L(Shl4Start):
 # endif
 	movaps	-4(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl4LoopStart):
 	movaps	12(%rcx), %xmm2
 	movaps	28(%rcx), %xmm3
@@ -831,11 +782,9 @@ L(Shl4LoopStart):
 	jmp	L(Shl4LoopStart)
 
 L(Shl4LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$12, %xmm6
+	movdqu	-4(%rcx), %xmm1
 	mov	$12, %rsi
-	palignr	$4, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	movdqu	%xmm1, -4(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -854,7 +803,6 @@ L(Shl5Start):
 	jnz	L(Shl5LoopExit)
 
 	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	27(%rcx), %xmm2
 
@@ -862,7 +810,7 @@ L(Shl5Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit5Case2OrCase3)
@@ -870,10 +818,9 @@ L(Shl5Start):
 	test	%rax, %rax
 	jnz	L(Shl5LoopExit)
 
-	palignr	$5, %xmm1, %xmm2
+	palignr	$5, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	27(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -888,7 +835,6 @@ L(Shl5Start):
 	jnz	L(Shl5LoopExit)
 
 	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	27(%rcx), %xmm2
 
@@ -896,7 +842,6 @@ L(Shl5Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit5Case2OrCase3)
@@ -904,8 +849,7 @@ L(Shl5Start):
 	test	%rax, %rax
 	jnz	L(Shl5LoopExit)
 
-	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$5, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	27(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -920,6 +864,8 @@ L(Shl5Start):
 # endif
 	movaps	-5(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl5LoopStart):
 	movaps	11(%rcx), %xmm2
 	movaps	27(%rcx), %xmm3
@@ -953,11 +899,9 @@ L(Shl5LoopStart):
 	jmp	L(Shl5LoopStart)
 
 L(Shl5LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$11, %xmm6
+	movdqu	-5(%rcx), %xmm1
 	mov	$11, %rsi
-	palignr	$5, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	movdqu	%xmm1, -5(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -976,7 +920,6 @@ L(Shl6Start):
 	jnz	L(Shl6LoopExit)
 
 	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	26(%rcx), %xmm2
 
@@ -984,7 +927,7 @@ L(Shl6Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit6Case2OrCase3)
@@ -992,10 +935,9 @@ L(Shl6Start):
 	test	%rax, %rax
 	jnz	L(Shl6LoopExit)
 
-	palignr	$6, %xmm1, %xmm2
+	palignr	$6, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	26(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -1010,7 +952,6 @@ L(Shl6Start):
 	jnz	L(Shl6LoopExit)
 
 	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	26(%rcx), %xmm2
 
@@ -1018,7 +959,6 @@ L(Shl6Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit6Case2OrCase3)
@@ -1026,8 +966,7 @@ L(Shl6Start):
 	test	%rax, %rax
 	jnz	L(Shl6LoopExit)
 
-	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$6, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	26(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -1042,6 +981,8 @@ L(Shl6Start):
 # endif
 	movaps	-6(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl6LoopStart):
 	movaps	10(%rcx), %xmm2
 	movaps	26(%rcx), %xmm3
@@ -1075,11 +1016,11 @@ L(Shl6LoopStart):
 	jmp	L(Shl6LoopStart)
 
 L(Shl6LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$10, %xmm6
+	mov	(%rcx), %r9
+	mov	6(%rcx), %esi
+	mov	%r9, (%rdx)
+	mov	%esi, 6(%rdx)
 	mov	$10, %rsi
-	palignr	$6, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -1098,7 +1039,6 @@ L(Shl7Start):
 	jnz	L(Shl7LoopExit)
 
 	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	25(%rcx), %xmm2
 
@@ -1106,7 +1046,7 @@ L(Shl7Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit7Case2OrCase3)
@@ -1114,10 +1054,9 @@ L(Shl7Start):
 	test	%rax, %rax
 	jnz	L(Shl7LoopExit)
 
-	palignr	$7, %xmm1, %xmm2
+	palignr	$7, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	25(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -1132,7 +1071,6 @@ L(Shl7Start):
 	jnz	L(Shl7LoopExit)
 
 	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	25(%rcx), %xmm2
 
@@ -1140,7 +1078,6 @@ L(Shl7Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit7Case2OrCase3)
@@ -1148,8 +1085,7 @@ L(Shl7Start):
 	test	%rax, %rax
 	jnz	L(Shl7LoopExit)
 
-	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$7, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	25(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -1164,6 +1100,8 @@ L(Shl7Start):
 # endif
 	movaps	-7(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl7LoopStart):
 	movaps	9(%rcx), %xmm2
 	movaps	25(%rcx), %xmm3
@@ -1197,11 +1135,11 @@ L(Shl7LoopStart):
 	jmp	L(Shl7LoopStart)
 
 L(Shl7LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$9, %xmm6
+	mov	(%rcx), %r9
+	mov	5(%rcx), %esi
+	mov	%r9, (%rdx)
+	mov	%esi, 5(%rdx)
 	mov	$9, %rsi
-	palignr	$7, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -1220,7 +1158,6 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
 
@@ -1228,7 +1165,7 @@ L(Shl8Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit8Case2OrCase3)
@@ -1236,10 +1173,9 @@ L(Shl8Start):
 	test	%rax, %rax
 	jnz	L(Shl8LoopExit)
 
-	palignr	$8, %xmm1, %xmm2
+	palignr	$8, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -1254,7 +1190,6 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
 
@@ -1262,7 +1197,6 @@ L(Shl8Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit8Case2OrCase3)
@@ -1270,8 +1204,7 @@ L(Shl8Start):
 	test	%rax, %rax
 	jnz	L(Shl8LoopExit)
 
-	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$8, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	24(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -1286,6 +1219,8 @@ L(Shl8Start):
 # endif
 	movaps	-8(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl8LoopStart):
 	movaps	8(%rcx), %xmm2
 	movaps	24(%rcx), %xmm3
@@ -1319,11 +1254,9 @@ L(Shl8LoopStart):
 	jmp	L(Shl8LoopStart)
 
 L(Shl8LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$8, %xmm6
+	mov	(%rcx), %r9
 	mov	$8, %rsi
-	palignr	$8, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, (%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -1342,7 +1275,6 @@ L(Shl9Start):
 	jnz	L(Shl9LoopExit)
 
 	palignr	$9, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	23(%rcx), %xmm2
 
@@ -1350,7 +1282,7 @@ L(Shl9Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit9Case2OrCase3)
@@ -1358,10 +1290,9 @@ L(Shl9Start):
 	test	%rax, %rax
 	jnz	L(Shl9LoopExit)
 
-	palignr	$9, %xmm1, %xmm2
+	palignr	$9, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	23(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -1376,7 +1307,6 @@ L(Shl9Start):
 	jnz	L(Shl9LoopExit)
 
 	palignr	$9, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	23(%rcx), %xmm2
 
@@ -1384,7 +1314,6 @@ L(Shl9Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit9Case2OrCase3)
@@ -1392,8 +1321,7 @@ L(Shl9Start):
 	test	%rax, %rax
 	jnz	L(Shl9LoopExit)
 
-	palignr	$9, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$9, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	23(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -1408,6 +1336,8 @@ L(Shl9Start):
 # endif
 	movaps	-9(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl9LoopStart):
 	movaps	7(%rcx), %xmm2
 	movaps	23(%rcx), %xmm3
@@ -1441,11 +1371,9 @@ L(Shl9LoopStart):
 	jmp	L(Shl9LoopStart)
 
 L(Shl9LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$7, %xmm6
+	mov	-1(%rcx), %r9
 	mov	$7, %rsi
-	palignr	$9, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, -1(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -1464,7 +1392,6 @@ L(Shl10Start):
 	jnz	L(Shl10LoopExit)
 
 	palignr	$10, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	22(%rcx), %xmm2
 
@@ -1472,7 +1399,7 @@ L(Shl10Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit10Case2OrCase3)
@@ -1480,10 +1407,9 @@ L(Shl10Start):
 	test	%rax, %rax
 	jnz	L(Shl10LoopExit)
 
-	palignr	$10, %xmm1, %xmm2
+	palignr	$10, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	22(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -1498,7 +1424,6 @@ L(Shl10Start):
 	jnz	L(Shl10LoopExit)
 
 	palignr	$10, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	22(%rcx), %xmm2
 
@@ -1506,7 +1431,6 @@ L(Shl10Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit10Case2OrCase3)
@@ -1514,8 +1438,7 @@ L(Shl10Start):
 	test	%rax, %rax
 	jnz	L(Shl10LoopExit)
 
-	palignr	$10, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$10, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	22(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -1530,6 +1453,8 @@ L(Shl10Start):
 # endif
 	movaps	-10(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl10LoopStart):
 	movaps	6(%rcx), %xmm2
 	movaps	22(%rcx), %xmm3
@@ -1563,11 +1488,9 @@ L(Shl10LoopStart):
 	jmp	L(Shl10LoopStart)
 
 L(Shl10LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$6, %xmm6
+	mov	-2(%rcx), %r9
 	mov	$6, %rsi
-	palignr	$10, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, -2(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -1586,7 +1509,6 @@ L(Shl11Start):
 	jnz	L(Shl11LoopExit)
 
 	palignr	$11, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	21(%rcx), %xmm2
 
@@ -1594,7 +1516,7 @@ L(Shl11Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit11Case2OrCase3)
@@ -1602,10 +1524,9 @@ L(Shl11Start):
 	test	%rax, %rax
 	jnz	L(Shl11LoopExit)
 
-	palignr	$11, %xmm1, %xmm2
+	palignr	$11, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	21(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -1620,7 +1541,6 @@ L(Shl11Start):
 	jnz	L(Shl11LoopExit)
 
 	palignr	$11, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	21(%rcx), %xmm2
 
@@ -1628,7 +1548,6 @@ L(Shl11Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit11Case2OrCase3)
@@ -1636,8 +1555,7 @@ L(Shl11Start):
 	test	%rax, %rax
 	jnz	L(Shl11LoopExit)
 
-	palignr	$11, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$11, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	21(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -1652,6 +1570,8 @@ L(Shl11Start):
 # endif
 	movaps	-11(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl11LoopStart):
 	movaps	5(%rcx), %xmm2
 	movaps	21(%rcx), %xmm3
@@ -1685,11 +1605,9 @@ L(Shl11LoopStart):
 	jmp	L(Shl11LoopStart)
 
 L(Shl11LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$5, %xmm6
+	mov	-3(%rcx), %r9
 	mov	$5, %rsi
-	palignr	$11, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, -3(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -1708,7 +1626,6 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
 
@@ -1716,7 +1633,7 @@ L(Shl12Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit12Case2OrCase3)
@@ -1724,10 +1641,9 @@ L(Shl12Start):
 	test	%rax, %rax
 	jnz	L(Shl12LoopExit)
 
-	palignr	$12, %xmm1, %xmm2
+	palignr	$12, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -1742,7 +1658,6 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
 
@@ -1750,7 +1665,6 @@ L(Shl12Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit12Case2OrCase3)
@@ -1758,8 +1672,7 @@ L(Shl12Start):
 	test	%rax, %rax
 	jnz	L(Shl12LoopExit)
 
-	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$12, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	20(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -1774,6 +1687,8 @@ L(Shl12Start):
 # endif
 	movaps	-12(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl12LoopStart):
 	movaps	4(%rcx), %xmm2
 	movaps	20(%rcx), %xmm3
@@ -1807,11 +1722,9 @@ L(Shl12LoopStart):
 	jmp	L(Shl12LoopStart)
 
 L(Shl12LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$4, %xmm6
+	mov	(%rcx), %r9d
 	mov	$4, %rsi
-	palignr	$12, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, (%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -1830,7 +1743,6 @@ L(Shl13Start):
 	jnz	L(Shl13LoopExit)
 
 	palignr	$13, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	19(%rcx), %xmm2
 
@@ -1838,7 +1750,7 @@ L(Shl13Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit13Case2OrCase3)
@@ -1846,10 +1758,9 @@ L(Shl13Start):
 	test	%rax, %rax
 	jnz	L(Shl13LoopExit)
 
-	palignr	$13, %xmm1, %xmm2
+	palignr	$13, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	19(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -1864,7 +1775,6 @@ L(Shl13Start):
 	jnz	L(Shl13LoopExit)
 
 	palignr	$13, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	19(%rcx), %xmm2
 
@@ -1872,7 +1782,6 @@ L(Shl13Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit13Case2OrCase3)
@@ -1880,8 +1789,7 @@ L(Shl13Start):
 	test	%rax, %rax
 	jnz	L(Shl13LoopExit)
 
-	palignr	$13, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$13, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	19(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -1896,6 +1804,8 @@ L(Shl13Start):
 # endif
 	movaps	-13(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl13LoopStart):
 	movaps	3(%rcx), %xmm2
 	movaps	19(%rcx), %xmm3
@@ -1929,11 +1839,9 @@ L(Shl13LoopStart):
 	jmp	L(Shl13LoopStart)
 
 L(Shl13LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$3, %xmm6
+	mov	-1(%rcx), %r9d
 	mov	$3, %rsi
-	palignr	$13, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, -1(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -1952,7 +1860,6 @@ L(Shl14Start):
 	jnz	L(Shl14LoopExit)
 
 	palignr	$14, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	18(%rcx), %xmm2
 
@@ -1960,7 +1867,7 @@ L(Shl14Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit14Case2OrCase3)
@@ -1968,10 +1875,9 @@ L(Shl14Start):
 	test	%rax, %rax
 	jnz	L(Shl14LoopExit)
 
-	palignr	$14, %xmm1, %xmm2
+	palignr	$14, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	18(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -1986,7 +1892,6 @@ L(Shl14Start):
 	jnz	L(Shl14LoopExit)
 
 	palignr	$14, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	18(%rcx), %xmm2
 
@@ -1994,7 +1899,6 @@ L(Shl14Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit14Case2OrCase3)
@@ -2002,8 +1906,7 @@ L(Shl14Start):
 	test	%rax, %rax
 	jnz	L(Shl14LoopExit)
 
-	palignr	$14, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$14, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	18(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -2018,6 +1921,8 @@ L(Shl14Start):
 # endif
 	movaps	-14(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl14LoopStart):
 	movaps	2(%rcx), %xmm2
 	movaps	18(%rcx), %xmm3
@@ -2051,11 +1956,9 @@ L(Shl14LoopStart):
 	jmp	L(Shl14LoopStart)
 
 L(Shl14LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$2, %xmm6
+	mov	-2(%rcx), %r9d
 	mov	$2, %rsi
-	palignr	$14, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, -2(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -2074,7 +1977,6 @@ L(Shl15Start):
 	jnz	L(Shl15LoopExit)
 
 	palignr	$15, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	17(%rcx), %xmm2
 
@@ -2082,7 +1984,7 @@ L(Shl15Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit15Case2OrCase3)
@@ -2090,10 +1992,9 @@ L(Shl15Start):
 	test	%rax, %rax
 	jnz	L(Shl15LoopExit)
 
-	palignr	$15, %xmm1, %xmm2
+	palignr	$15, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	17(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -2108,7 +2009,6 @@ L(Shl15Start):
 	jnz	L(Shl15LoopExit)
 
 	palignr	$15, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	17(%rcx), %xmm2
 
@@ -2116,7 +2016,6 @@ L(Shl15Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit15Case2OrCase3)
@@ -2124,8 +2023,7 @@ L(Shl15Start):
 	test	%rax, %rax
 	jnz	L(Shl15LoopExit)
 
-	palignr	$15, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$15, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	17(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -2140,6 +2038,8 @@ L(Shl15Start):
 # endif
 	movaps	-15(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl15LoopStart):
 	movaps	1(%rcx), %xmm2
 	movaps	17(%rcx), %xmm3
@@ -2173,16 +2073,15 @@ L(Shl15LoopStart):
 	jmp	L(Shl15LoopStart)
 
 L(Shl15LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$1, %xmm6
+	mov	-3(%rcx), %r9d
 	mov	$1, %rsi
-	palignr	$15, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, -3(%rdx)
 # ifdef USE_AS_STRCAT
 	jmp	L(CopyFrom1To16Bytes)
 # endif
 
 # ifndef USE_AS_STRCAT
+
 	.p2align 4
 L(CopyFrom1To16Bytes):
 #  ifdef USE_AS_STRNCPY
@@ -2463,7 +2362,7 @@ L(Exit4):
 #   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-#  endif
+#   endif
 #  endif
 	ret
 
@@ -2485,7 +2384,7 @@ L(Exit5):
 #   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-#   endif
+#  endif
 #  endif
 	ret
 
@@ -2507,7 +2406,7 @@ L(Exit6):
 #   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-#   endif
+#  endif
 #  endif
 	ret
 
@@ -2617,7 +2516,7 @@ L(Exit12):
 #   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-#   endif
+#  endif
 #  endif
 	ret
 
@@ -2955,11 +2854,10 @@ L(StrncpyExit8Bytes):
 	ret
 
 #  endif
-
 # endif
 
 # ifdef USE_AS_STRNCPY
-
+	.p2align 4
 L(StrncpyLeaveCase2OrCase3):
 	test	%rax, %rax
 	jnz	L(Aligned64LeaveCase2)
@@ -3014,710 +2912,639 @@ L(Aligned64LeaveCase2):
 	lea	-16(%r8), %r8
 	jmp	L(CopyFrom1To16BytesCase2)
 /*--------------------------------------------------*/
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyExit1Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$15, %xmm6
+	movdqu	-1(%rcx), %xmm0	/* unaligned load of the 16 source bytes starting at rcx-1 */
+	movdqu	%xmm0, -1(%rdx)	/* unaligned store to rdx-1; (%rdx) is no longer read back and merged */
 	mov	$15, %rsi
-	palignr	$1, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyExit2Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$14, %xmm6
+	movdqu	-2(%rcx), %xmm0	/* unaligned load of the 16 source bytes starting at rcx-2 */
+	movdqu	%xmm0, -2(%rdx)	/* unaligned store to rdx-2; (%rdx) is no longer read back and merged */
 	mov	$14, %rsi
-	palignr	$2, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyExit3Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$13, %xmm6
+	movdqu	-3(%rcx), %xmm0	/* unaligned load of the 16 source bytes starting at rcx-3 */
+	movdqu	%xmm0, -3(%rdx)	/* unaligned store to rdx-3; (%rdx) is no longer read back and merged */
 	mov	$13, %rsi
-	palignr	$3, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyExit4Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$12, %xmm6
+	movdqu	-4(%rcx), %xmm0	/* unaligned load of the 16 source bytes starting at rcx-4 */
+	movdqu	%xmm0, -4(%rdx)	/* unaligned store to rdx-4; (%rdx) is no longer read back and merged */
 	mov	$12, %rsi
-	palignr	$4, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyExit5Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$11, %xmm6
+	movdqu	-5(%rcx), %xmm0	/* unaligned load of the 16 source bytes starting at rcx-5 */
+	movdqu	%xmm0, -5(%rdx)	/* unaligned store to rdx-5; (%rdx) is no longer read back and merged */
 	mov	$11, %rsi
-	palignr	$5, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyExit6Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$10, %xmm6
-	mov	$10, %rsi
-	palignr	$6, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	(%rcx), %rsi	/* source bytes 0..7; rsi is only a scratch register here */
+	mov	6(%rcx), %r9d	/* source bytes 6..9 (overlaps the load above by two bytes) */
+	mov	%r9d, 6(%rdx)	/* together the two stores write exactly 10 bytes at (%rdx) */
+	mov	%rsi, (%rdx)
 	test	%rax, %rax
+	mov	$10, %rsi	/* set only after rsi's scratch use; mov leaves the flags from test untouched */
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyExit7Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$9, %xmm6
-	mov	$9, %rsi
-	palignr	$7, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	(%rcx), %rsi	/* source bytes 0..7; rsi is only a scratch register here */
+	mov	5(%rcx), %r9d	/* source bytes 5..8 (overlaps the load above by three bytes) */
+	mov	%r9d, 5(%rdx)	/* together the two stores write exactly 9 bytes at (%rdx) */
+	mov	%rsi, (%rdx)
 	test	%rax, %rax
+	mov	$9, %rsi	/* set only after rsi's scratch use; mov leaves the flags from test untouched */
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyExit8Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$8, %xmm6
+	mov	(%rcx), %r9	/* load the 8 source bytes at (%rcx) */
 	mov	$8, %rsi
-	palignr	$8, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, (%rdx)	/* store 8 bytes; the removed lines loaded and re-stored all 16 bytes at (%rdx) */
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyExit9Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$7, %xmm6
+	mov	-1(%rcx), %r9	/* load the 8 source bytes starting at rcx-1 */
 	mov	$7, %rsi
-	palignr	$9, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, -1(%rdx)	/* store them at rdx-1; nothing past rdx+6 is written */
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyExit10Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$6, %xmm6
+	mov	-2(%rcx), %r9	/* load the 8 source bytes starting at rcx-2 */
 	mov	$6, %rsi
-	palignr	$10, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, -2(%rdx)	/* store them at rdx-2; nothing past rdx+5 is written */
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyExit11Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$5, %xmm6
+	mov	-3(%rcx), %r9	/* load the 8 source bytes starting at rcx-3 */
 	mov	$5, %rsi
-	palignr	$11, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, -3(%rdx)	/* store them at rdx-3; nothing past rdx+4 is written */
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyExit12Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$4, %xmm6
+	mov	(%rcx), %r9d	/* load the 4 source bytes at (%rcx) */
 	mov	$4, %rsi
-	palignr	$12, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, (%rdx)	/* store 4 bytes; the removed lines loaded and re-stored all 16 bytes at (%rdx) */
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyExit13Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$3, %xmm6
+	mov	-1(%rcx), %r9d	/* load the 4 source bytes starting at rcx-1 */
 	mov	$3, %rsi
-	palignr	$13, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, -1(%rdx)	/* store them at rdx-1; nothing past rdx+2 is written */
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyExit14Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$2, %xmm6
+	mov	-2(%rcx), %r9d	/* load the 4 source bytes starting at rcx-2 */
 	mov	$2, %rsi
-	palignr	$14, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, -2(%rdx)	/* store them at rdx-2; nothing past rdx+1 is written */
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyExit15Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$1, %xmm6
+	mov	-3(%rcx), %r9d	/* load the 4 source bytes starting at rcx-3 */
 	mov	$1, %rsi
-	palignr	$15, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, -3(%rdx)	/* store them at rdx-3; nothing past (%rdx) itself is written */
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyLeave1):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit1)
 	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	31(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit1)
-	palignr	$1, %xmm1, %xmm2
+	palignr	$1, %xmm3, %xmm2	/* %xmm3 still holds the %xmm2 value saved at the top of this label */
 	movaps	%xmm2, 16(%rdx)
-	movaps	31+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit1)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit1)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit1):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$15, %xmm6
-	palignr	$1, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	15(%rsi), %rsi
+	lea	15(%rdx, %rsi), %rdx	/* rdx = rdx + rsi + 15 */
+	lea	15(%rcx, %rsi), %rcx	/* rcx = rcx + rsi + 15 */
+	mov	-15(%rcx), %rsi	/* 8 bytes at rcx-15; rsi is free now that its offset is folded into rdx/rcx */
+	mov	-8(%rcx), %rax	/* 8 bytes at rcx-8; both loads together cover the 15 bytes before rcx */
+	mov	%rsi, -15(%rdx)
+	mov	%rax, -8(%rdx)	/* last byte written is rdx-1 */
+	xor	%rsi, %rsi	/* zero rsi; the offset now lives in rdx/rcx */
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyLeave2):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit2)
 	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	30(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit2)
-	palignr	$2, %xmm1, %xmm2
+	palignr	$2, %xmm3, %xmm2	/* %xmm3 still holds the %xmm2 value saved at the top of this label */
 	movaps	%xmm2, 16(%rdx)
-	movaps	30+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit2)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit2)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit2):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$14, %xmm6
-	palignr	$2, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	14(%rsi), %rsi
+	lea	14(%rdx, %rsi), %rdx	/* rdx = rdx + rsi + 14 */
+	lea	14(%rcx, %rsi), %rcx	/* rcx = rcx + rsi + 14 */
+	mov	-14(%rcx), %rsi	/* 8 bytes at rcx-14; rsi is free now that its offset is folded into rdx/rcx */
+	mov	-8(%rcx), %rax	/* 8 bytes at rcx-8; both loads together cover the 14 bytes before rcx */
+	mov	%rsi, -14(%rdx)
+	mov	%rax, -8(%rdx)	/* last byte written is rdx-1 */
+	xor	%rsi, %rsi	/* zero rsi; the offset now lives in rdx/rcx */
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyLeave3):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit3)
 	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	29(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit3)
-	palignr	$3, %xmm1, %xmm2
+	palignr	$3, %xmm3, %xmm2	/* %xmm3 still holds the %xmm2 value saved at the top of this label */
 	movaps	%xmm2, 16(%rdx)
-	movaps	29+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit3)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit3)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit3):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$13, %xmm6
-	palignr	$3, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	13(%rsi), %rsi
+	lea	13(%rdx, %rsi), %rdx	/* rdx = rdx + rsi + 13 */
+	lea	13(%rcx, %rsi), %rcx	/* rcx = rcx + rsi + 13 */
+	mov	-13(%rcx), %rsi	/* 8 bytes at rcx-13; rsi is free now that its offset is folded into rdx/rcx */
+	mov	-8(%rcx), %rax	/* 8 bytes at rcx-8; both loads together cover the 13 bytes before rcx */
+	mov	%rsi, -13(%rdx)
+	mov	%rax, -8(%rdx)	/* last byte written is rdx-1 */
+	xor	%rsi, %rsi	/* zero rsi; the offset now lives in rdx/rcx */
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyLeave4):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit4)
 	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit4)
-	palignr	$4, %xmm1, %xmm2
+	palignr	$4, %xmm3, %xmm2	/* %xmm3 still holds the %xmm2 value saved at the top of this label */
 	movaps	%xmm2, 16(%rdx)
-	movaps	28+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit4)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit4)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit4):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$12, %xmm6
-	palignr	$4, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	12(%rsi), %rsi
+	lea	12(%rdx, %rsi), %rdx	/* rdx = rdx + rsi + 12 */
+	lea	12(%rcx, %rsi), %rcx	/* rcx = rcx + rsi + 12 */
+	mov	-12(%rcx), %rsi	/* 8 bytes at rcx-12; rsi is free now that its offset is folded into rdx/rcx */
+	mov	-4(%rcx), %eax	/* 4 bytes at rcx-4; both loads together cover the 12 bytes before rcx */
+	mov	%rsi, -12(%rdx)
+	mov	%eax, -4(%rdx)	/* last byte written is rdx-1 */
+	xor	%rsi, %rsi	/* zero rsi; the offset now lives in rdx/rcx */
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyLeave5):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit5)
 	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	27(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit5)
-	palignr	$5, %xmm1, %xmm2
+	palignr	$5, %xmm3, %xmm2	/* %xmm3 still holds the %xmm2 value saved at the top of this label */
 	movaps	%xmm2, 16(%rdx)
-	movaps	27+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit5)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit5)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit5):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$11, %xmm6
-	palignr	$5, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	11(%rsi), %rsi
+	lea	11(%rdx, %rsi), %rdx	/* rdx = rdx + rsi + 11 */
+	lea	11(%rcx, %rsi), %rcx	/* rcx = rcx + rsi + 11 */
+	mov	-11(%rcx), %rsi	/* 8 bytes at rcx-11; rsi is free now that its offset is folded into rdx/rcx */
+	mov	-4(%rcx), %eax	/* 4 bytes at rcx-4; both loads together cover the 11 bytes before rcx */
+	mov	%rsi, -11(%rdx)
+	mov	%eax, -4(%rdx)	/* last byte written is rdx-1 */
+	xor	%rsi, %rsi	/* zero rsi; the offset now lives in rdx/rcx */
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyLeave6):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit6)
 	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	26(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit6)
-	palignr	$6, %xmm1, %xmm2
+	palignr	$6, %xmm3, %xmm2	/* %xmm3 still holds the %xmm2 value saved at the top of this label */
 	movaps	%xmm2, 16(%rdx)
-	movaps	26+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit6)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit6)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit6):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$10, %xmm6
-	palignr	$6, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	10(%rsi), %rsi
+	lea	10(%rdx, %rsi), %rdx	/* rdx = rdx + rsi + 10 */
+	lea	10(%rcx, %rsi), %rcx	/* rcx = rcx + rsi + 10 */
+	mov	-10(%rcx), %rsi	/* 8 bytes at rcx-10; rsi is free now that its offset is folded into rdx/rcx */
+	movw	-2(%rcx), %ax	/* 2 bytes at rcx-2; both loads together cover the 10 bytes before rcx */
+	mov	%rsi, -10(%rdx)
+	movw	%ax, -2(%rdx)	/* last byte written is rdx-1 */
+	xor	%rsi, %rsi	/* zero rsi; the offset now lives in rdx/rcx */
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyLeave7):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit7)
 	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	25(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit7)
-	palignr	$7, %xmm1, %xmm2
+	palignr	$7, %xmm3, %xmm2	/* %xmm3 still holds the %xmm2 value saved at the top of this label */
 	movaps	%xmm2, 16(%rdx)
-	movaps	25+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit7)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit7)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit7):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$9, %xmm6
-	palignr	$7, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	9(%rsi), %rsi
+	lea	9(%rdx, %rsi), %rdx	/* rdx = rdx + rsi + 9 */
+	lea	9(%rcx, %rsi), %rcx	/* rcx = rcx + rsi + 9 */
+	mov	-9(%rcx), %rsi	/* 8 bytes at rcx-9; rsi is free now that its offset is folded into rdx/rcx */
+	movb	-1(%rcx), %ah	/* the single byte at rcx-1, carried in %ah; 9 bytes in total */
+	mov	%rsi, -9(%rdx)
+	movb	%ah, -1(%rdx)	/* last byte written is rdx-1 */
+	xor	%rsi, %rsi	/* zero rsi; the offset now lives in rdx/rcx */
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyLeave8):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit8)
 	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit8)
-	palignr	$8, %xmm1, %xmm2
+	palignr	$8, %xmm3, %xmm2	/* %xmm3 still holds the %xmm2 value saved at the top of this label */
 	movaps	%xmm2, 16(%rdx)
-	movaps	24+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit8)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit8)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit8):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$8, %xmm6
-	palignr	$8, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	8(%rsi), %rsi
+	lea	8(%rdx, %rsi), %rdx	/* rdx = rdx + rsi + 8 */
+	lea	8(%rcx, %rsi), %rcx	/* rcx = rcx + rsi + 8 */
+	mov	-8(%rcx), %rax	/* the 8 source bytes ending just before the advanced rcx */
+	xor	%rsi, %rsi	/* zero rsi; the offset now lives in rdx/rcx */
+	mov	%rax, -8(%rdx)	/* last byte written is rdx-1 */
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyLeave9):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit9)
 	palignr	$9, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	23(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit9)
-	palignr	$9, %xmm1, %xmm2
+	palignr	$9, %xmm3, %xmm2	/* %xmm3 still holds the %xmm2 value saved at the top of this label */
 	movaps	%xmm2, 16(%rdx)
-	movaps	23+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit9)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit9)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit9):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$7, %xmm6
-	palignr	$9, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	7(%rsi), %rsi
+	lea	7(%rdx, %rsi), %rdx	/* rdx = rdx + rsi + 7 */
+	lea	7(%rcx, %rsi), %rcx	/* rcx = rcx + rsi + 7 */
+	mov	-8(%rcx), %rax	/* 8 bytes ending just before the advanced rcx (one more than the 7-byte advance) */
+	xor	%rsi, %rsi	/* zero rsi; the offset now lives in rdx/rcx */
+	mov	%rax, -8(%rdx)	/* last byte written is rdx-1 */
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyLeave10):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit10)
 	palignr	$10, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	22(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit10)
-	palignr	$10, %xmm1, %xmm2
+	palignr	$10, %xmm3, %xmm2	/* %xmm3 still holds the %xmm2 value saved at the top of this label */
 	movaps	%xmm2, 16(%rdx)
-	movaps	22+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit10)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit10)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit10):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$6, %xmm6
-	palignr	$10, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	6(%rsi), %rsi
+	lea	6(%rdx, %rsi), %rdx	/* rdx = rdx + rsi + 6 */
+	lea	6(%rcx, %rsi), %rcx	/* rcx = rcx + rsi + 6 */
+	mov	-8(%rcx), %rax	/* 8 bytes ending just before the advanced rcx (two more than the 6-byte advance) */
+	xor	%rsi, %rsi	/* zero rsi; the offset now lives in rdx/rcx */
+	mov	%rax, -8(%rdx)	/* last byte written is rdx-1 */
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyLeave11):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit11)
 	palignr	$11, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	21(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit11)
-	palignr	$11, %xmm1, %xmm2
+	palignr	$11, %xmm3, %xmm2	/* %xmm3 still holds the %xmm2 value saved at the top of this label */
 	movaps	%xmm2, 16(%rdx)
-	movaps	21+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit11)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit11)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit11):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$5, %xmm6
-	palignr	$11, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	5(%rsi), %rsi
+	lea	5(%rdx, %rsi), %rdx	/* rdx = rdx + rsi + 5 */
+	lea	5(%rcx, %rsi), %rcx	/* rcx = rcx + rsi + 5 */
+	mov	-8(%rcx), %rax	/* 8 bytes ending just before the advanced rcx (three more than the 5-byte advance) */
+	xor	%rsi, %rsi	/* zero rsi; the offset now lives in rdx/rcx */
+	mov	%rax, -8(%rdx)	/* last byte written is rdx-1 */
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyLeave12):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit12)
 	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit12)
-	palignr	$12, %xmm1, %xmm2
+	palignr	$12, %xmm3, %xmm2	/* %xmm3 still holds the %xmm2 value saved at the top of this label */
 	movaps	%xmm2, 16(%rdx)
-	movaps	20+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit12)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit12)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit12):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$4, %xmm6
-	palignr	$12, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	4(%rsi), %rsi
+	lea	4(%rdx, %rsi), %rdx	/* rdx = rdx + rsi + 4 */
+	lea	4(%rcx, %rsi), %rcx	/* rcx = rcx + rsi + 4 */
+	mov	-4(%rcx), %eax	/* the 4 source bytes ending just before the advanced rcx */
+	xor	%rsi, %rsi	/* zero rsi; the offset now lives in rdx/rcx */
+	mov	%eax, -4(%rdx)	/* last byte written is rdx-1 */
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyLeave13):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit13)
 	palignr	$13, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	19(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit13)
-	palignr	$13, %xmm1, %xmm2
+	palignr	$13, %xmm3, %xmm2	/* %xmm3 still holds the %xmm2 value saved at the top of this label */
 	movaps	%xmm2, 16(%rdx)
-	movaps	19+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit13)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit13)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit13):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$3, %xmm6
-	palignr	$13, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	3(%rsi), %rsi
+	lea	3(%rdx, %rsi), %rdx	/* rdx = rdx + rsi + 3 */
+	lea	3(%rcx, %rsi), %rcx	/* rcx = rcx + rsi + 3 */
+	mov	-4(%rcx), %eax	/* 4 bytes ending just before the advanced rcx (one more than the 3-byte advance) */
+	xor	%rsi, %rsi	/* zero rsi; the offset now lives in rdx/rcx */
+	mov	%eax, -4(%rdx)	/* last byte written is rdx-1 */
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyLeave14):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit14)
 	palignr	$14, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	18(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit14)
-	palignr	$14, %xmm1, %xmm2
+	palignr	$14, %xmm3, %xmm2	/* %xmm3 still holds the %xmm2 value saved at the top of this label */
 	movaps	%xmm2, 16(%rdx)
-	movaps	18+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit14)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit14)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit14):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$2, %xmm6
-	palignr	$14, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	2(%rsi), %rsi
+	lea	2(%rdx, %rsi), %rdx	/* rdx = rdx + rsi + 2 */
+	lea	2(%rcx, %rsi), %rcx	/* rcx = rcx + rsi + 2 */
+	movw	-2(%rcx), %ax	/* the 2 source bytes ending just before the advanced rcx */
+	xor	%rsi, %rsi	/* zero rsi; the offset now lives in rdx/rcx */
+	movw	%ax, -2(%rdx)	/* last byte written is rdx-1 */
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4	/* place the label on a 16-byte boundary */
 L(StrncpyLeave15):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit15)
 	palignr	$15, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	17(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit15)
-	palignr	$15, %xmm1, %xmm2
+	palignr	$15, %xmm3, %xmm2	/* %xmm3 still holds the %xmm2 value saved at the top of this label */
 	movaps	%xmm2, 16(%rdx)
-	movaps	17+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit15)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit15)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit15):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$1, %xmm6
-	palignr	$15, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	1(%rsi), %rsi
+	lea	1(%rdx, %rsi), %rdx	/* rdx = rdx + rsi + 1 */
+	lea	1(%rcx, %rsi), %rcx	/* rcx = rcx + rsi + 1 */
+	movb	-1(%rcx), %ah	/* the single source byte just before the advanced rcx, carried in %ah */
+	xor	%rsi, %rsi	/* zero rsi; the offset now lives in rdx/rcx */
+	movb	%ah, -1(%rdx)	/* only this one byte is written */
 	jmp	L(CopyFrom1To16BytesCase3)
+
 # endif
 # ifndef USE_AS_STRCAT
 END (STRCPY)
diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
index 4e292f3c2b..477b2cb4ef 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
@@ -21,8 +21,9 @@
 #ifndef NOT_IN_libc
 # include <sysdep.h>
 
-.text
+	.section .text.ssse3,"ax",@progbits
 ENTRY (__wcscpy_ssse3)
+
 	mov	%rsi, %rcx
 	mov	%rdi, %rdx
 
@@ -136,6 +137,7 @@ L(Align16Both):
 
 	mov	$-0x40, %rsi
 
+	.p2align 4
 L(Aligned64Loop):
 	movaps	(%rcx), %xmm2
 	movaps	%xmm2, %xmm4
@@ -205,7 +207,6 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
 
@@ -213,15 +214,14 @@ L(Shl4Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 
 	test	%rax, %rax
 	jnz	L(Shl4LoopExit)
 
-	palignr	$4, %xmm1, %xmm2
+	palignr	$4, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqd	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -233,7 +233,6 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
 
@@ -245,8 +244,7 @@ L(Shl4Start):
 	test	%rax, %rax
 	jnz	L(Shl4LoopExit)
 
-	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$4, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	28(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -259,6 +257,7 @@ L(Shl4Start):
 
 	movaps	-4(%rcx), %xmm1
 
+	.p2align 4
 L(Shl4LoopStart):
 	movaps	12(%rcx), %xmm2
 	movaps	28(%rcx), %xmm3
@@ -289,11 +288,9 @@ L(Shl4LoopStart):
 	jmp	L(Shl4LoopStart)
 
 L(Shl4LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$12, %xmm6
+	movdqu	-4(%rcx), %xmm1	/* unaligned load of the 16 source bytes starting at rcx-4 */
 	mov	$12, %rsi
-	palignr	$4, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	movdqu	%xmm1, -4(%rdx)	/* unaligned store to rdx-4; (%rdx) is no longer read back and merged */
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -309,7 +306,6 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
 
@@ -317,15 +313,14 @@ L(Shl8Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 
 	test	%rax, %rax
 	jnz	L(Shl8LoopExit)
 
-	palignr	$8, %xmm1, %xmm2
+	palignr	$8, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqd	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -337,7 +332,6 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
 
@@ -345,13 +339,11 @@ L(Shl8Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 
 	test	%rax, %rax
 	jnz	L(Shl8LoopExit)
 
-	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$8, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	24(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -364,6 +356,7 @@ L(Shl8Start):
 
 	movaps	-8(%rcx), %xmm1
 
+	.p2align 4
 L(Shl8LoopStart):
 	movaps	8(%rcx), %xmm2
 	movaps	24(%rcx), %xmm3
@@ -394,11 +387,9 @@ L(Shl8LoopStart):
 	jmp	L(Shl8LoopStart)
 
 L(Shl8LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$8, %xmm6
+	mov	(%rcx), %r9	/* load the 8 source bytes at (%rcx) */
 	mov	$8, %rsi
-	palignr	$8, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, (%rdx)	/* store 8 bytes; the removed lines loaded and re-stored all 16 bytes at (%rdx) */
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -414,7 +405,6 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
 
@@ -422,15 +412,14 @@ L(Shl12Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 
 	test	%rax, %rax
 	jnz	L(Shl12LoopExit)
 
-	palignr	$12, %xmm1, %xmm2
+	palignr	$12, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqd	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -442,7 +431,6 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
 
@@ -450,13 +438,11 @@ L(Shl12Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 
 	test	%rax, %rax
 	jnz	L(Shl12LoopExit)
 
-	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$12, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	20(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -469,6 +455,7 @@ L(Shl12Start):
 
 	movaps	-12(%rcx), %xmm1
 
+	.p2align 4
 L(Shl12LoopStart):
 	movaps	4(%rcx), %xmm2
 	movaps	20(%rcx), %xmm3
@@ -498,11 +485,10 @@ L(Shl12LoopStart):
 	jmp	L(Shl12LoopStart)
 
 L(Shl12LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$4, %xmm6
+	mov	(%rcx), %r9d	/* load the 4 source bytes at (%rcx) */
 	mov	$4, %rsi
-	palignr	$12, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, (%rdx)	/* store 4 bytes; the removed lines loaded and re-stored all 16 bytes at (%rdx) */
+	jmp	L(CopyFrom1To16Bytes)	/* explicit jump instead of falling through the .p2align padding before that label */
 
 	.p2align 4
 L(CopyFrom1To16Bytes):
@@ -556,8 +542,10 @@ L(Exit12):
 
 	.p2align 4
 L(Exit16):
-	movdqu	(%rcx), %xmm0
-	movdqu	%xmm0, (%rdx)
+	mov	(%rcx), %rax	/* copy source bytes 0..7 through rax */
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %rax	/* copy source bytes 8..15 through rax */
+	mov	%rax, 8(%rdx)	/* same 16 bytes as the removed movdqu pair, without using %xmm0 */
 	mov	%rdi, %rax
 	ret