about summary refs log tree commit diff
path: root/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/x86_64/multiarch/wcscpy-ssse3.S')
-rw-r--r-- sysdeps/x86_64/multiarch/wcscpy-ssse3.S | 64
1 file changed, 26 insertions, 38 deletions
diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
index 4e292f3c2b..477b2cb4ef 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
@@ -21,8 +21,9 @@
 #ifndef NOT_IN_libc
 # include <sysdep.h>
 
-.text
+	.section .text.ssse3,"ax",@progbits
 ENTRY (__wcscpy_ssse3)
+
 	mov	%rsi, %rcx
 	mov	%rdi, %rdx
 
@@ -136,6 +137,7 @@ L(Align16Both):
 
 	mov	$-0x40, %rsi
 
+	.p2align 4
 L(Aligned64Loop):
 	movaps	(%rcx), %xmm2
 	movaps	%xmm2, %xmm4
@@ -205,7 +207,6 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
 
@@ -213,15 +214,14 @@ L(Shl4Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 
 	test	%rax, %rax
 	jnz	L(Shl4LoopExit)
 
-	palignr	$4, %xmm1, %xmm2
+	palignr	$4, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqd	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -233,7 +233,6 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
 
@@ -245,8 +244,7 @@ L(Shl4Start):
 	test	%rax, %rax
 	jnz	L(Shl4LoopExit)
 
-	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$4, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	28(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -259,6 +257,7 @@ L(Shl4Start):
 
 	movaps	-4(%rcx), %xmm1
 
+	.p2align 4
 L(Shl4LoopStart):
 	movaps	12(%rcx), %xmm2
 	movaps	28(%rcx), %xmm3
@@ -289,11 +288,9 @@ L(Shl4LoopStart):
 	jmp	L(Shl4LoopStart)
 
 L(Shl4LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$12, %xmm6
+	movdqu	-4(%rcx), %xmm1
 	mov	$12, %rsi
-	palignr	$4, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	movdqu	%xmm1, -4(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -309,7 +306,6 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
 
@@ -317,15 +313,14 @@ L(Shl8Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 
 	test	%rax, %rax
 	jnz	L(Shl8LoopExit)
 
-	palignr	$8, %xmm1, %xmm2
+	palignr	$8, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqd	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -337,7 +332,6 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
 
@@ -345,13 +339,11 @@ L(Shl8Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 
 	test	%rax, %rax
 	jnz	L(Shl8LoopExit)
 
-	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$8, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	24(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -364,6 +356,7 @@ L(Shl8Start):
 
 	movaps	-8(%rcx), %xmm1
 
+	.p2align 4
 L(Shl8LoopStart):
 	movaps	8(%rcx), %xmm2
 	movaps	24(%rcx), %xmm3
@@ -394,11 +387,9 @@ L(Shl8LoopStart):
 	jmp	L(Shl8LoopStart)
 
 L(Shl8LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$8, %xmm6
+	mov	(%rcx), %r9
 	mov	$8, %rsi
-	palignr	$8, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, (%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -414,7 +405,6 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
 
@@ -422,15 +412,14 @@ L(Shl12Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 
 	test	%rax, %rax
 	jnz	L(Shl12LoopExit)
 
-	palignr	$12, %xmm1, %xmm2
+	palignr	$12, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqd	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -442,7 +431,6 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
 
@@ -450,13 +438,11 @@ L(Shl12Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 
 	test	%rax, %rax
 	jnz	L(Shl12LoopExit)
 
-	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$12, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	20(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -469,6 +455,7 @@ L(Shl12Start):
 
 	movaps	-12(%rcx), %xmm1
 
+	.p2align 4
 L(Shl12LoopStart):
 	movaps	4(%rcx), %xmm2
 	movaps	20(%rcx), %xmm3
@@ -498,11 +485,10 @@ L(Shl12LoopStart):
 	jmp	L(Shl12LoopStart)
 
 L(Shl12LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$4, %xmm6
+	mov	(%rcx), %r9d
 	mov	$4, %rsi
-	palignr	$12, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
 L(CopyFrom1To16Bytes):
@@ -556,8 +542,10 @@ L(Exit12):
 
 	.p2align 4
 L(Exit16):
-	movdqu	(%rcx), %xmm0
-	movdqu	%xmm0, (%rdx)
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %rax
+	mov	%rax, 8(%rdx)
 	mov	%rdi, %rax
 	ret