author	Liubov Dmitrieva <liubov.dmitrieva@gmail.com>	2011-12-23 08:50:39 -0500
committer	Ulrich Drepper <drepper@gmail.com>	2011-12-23 08:50:39 -0500
commit	c044cf14b0238b6e866f4ef5f8907d6680230212 (patch)
tree	174941df1d04f8fd18b0cc93401b50a4a97d7a88	/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
parent	d455f537be05dac05b8fc67e58f13fd85d553c74 (diff)
Fix wrong handling of the last bytes in x86-32 wcscpy
The copy algorithm for the last bytes is wrong and not thread safe.
In some cases it performs a 16-byte load from destination memory
beyond the end of the string, merges in the part that belongs to the
destination string, and writes the whole 16-byte chunk back to memory.
I have a test case in which the memory beyond the string end contains
malloc/free metadata that appears corrupted when free() updates it
between the 16-byte read and the 16-byte write.
Diffstat (limited to 'sysdeps/i386/i686/multiarch/wcscpy-ssse3.S')
-rw-r--r--	sysdeps/i386/i686/multiarch/wcscpy-ssse3.S	62
1 file changed, 21 insertions, 41 deletions
diff --git a/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S b/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
index 84d92a8bde..abeea22266 100644
--- a/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
@@ -54,7 +54,6 @@ ENTRY (__wcscpy_ssse3)
 
 	PUSH	(%edi)
 	mov	%edx, %edi
-
 	PUSH	(%esi)
 	lea	16(%ecx), %esi
 
@@ -220,7 +219,6 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%edx)
 	movaps	28(%ecx), %xmm2
 
@@ -228,15 +226,14 @@ L(Shl4Start):
 	lea	16(%edx), %edx
 	pmovmskb %xmm0, %eax
 	lea	16(%ecx), %ecx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 
 	test	%eax, %eax
 	jnz	L(Shl4LoopExit)
 
-	palignr	$4, %xmm1, %xmm2
+	palignr	$4, %xmm3, %xmm2
 	movaps	%xmm2, (%edx)
 	movaps	28(%ecx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqd	%xmm2, %xmm0
 	lea	16(%edx), %edx
@@ -248,7 +245,6 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%edx)
 	movaps	28(%ecx), %xmm2
 
@@ -256,13 +252,11 @@ L(Shl4Start):
 	lea	16(%edx), %edx
 	pmovmskb %xmm0, %eax
 	lea	16(%ecx), %ecx
-	movaps	%xmm2, %xmm3
 
 	test	%eax, %eax
 	jnz	L(Shl4LoopExit)
 
-	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$4, %xmm3, %xmm2
 	movaps	%xmm2, (%edx)
 	lea	28(%ecx), %ecx
 	lea	16(%edx), %edx
@@ -305,14 +299,13 @@ L(Shl4LoopStart):
 	jmp	L(Shl4LoopStart)
 
 L(Shl4LoopExit):
-	movaps	(%edx), %xmm6
-	psrldq	$12, %xmm6
-	palignr	$4, %xmm1, %xmm6
-	movaps	%xmm6, (%edx)
+	movlpd	(%ecx), %xmm0
+	movl	8(%ecx), %esi
+	movlpd	%xmm0, (%edx)
+	movl	%esi, 8(%edx)
+	POP	(%esi)
 	add	$12, %edx
 	add	$12, %ecx
-
-	POP	(%esi)
 	test	%al, %al
 	jz	L(ExitHigh)
 	test	$0x01, %al
@@ -337,7 +330,6 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%edx)
 	movaps	24(%ecx), %xmm2
 
@@ -345,15 +337,14 @@ L(Shl8Start):
 	lea	16(%edx), %edx
 	pmovmskb %xmm0, %eax
 	lea	16(%ecx), %ecx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 
 	test	%eax, %eax
 	jnz	L(Shl8LoopExit)
 
-	palignr	$8, %xmm1, %xmm2
+	palignr	$8, %xmm3, %xmm2
 	movaps	%xmm2, (%edx)
 	movaps	24(%ecx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqd	%xmm2, %xmm0
 	lea	16(%edx), %edx
@@ -365,7 +356,6 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%edx)
 	movaps	24(%ecx), %xmm2
 
@@ -373,13 +363,11 @@ L(Shl8Start):
 	lea	16(%edx), %edx
 	pmovmskb %xmm0, %eax
 	lea	16(%ecx), %ecx
-	movaps	%xmm2, %xmm3
 
 	test	%eax, %eax
 	jnz	L(Shl8LoopExit)
 
-	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$8, %xmm3, %xmm2
 	movaps	%xmm2, (%edx)
 	lea	24(%ecx), %ecx
 	lea	16(%edx), %edx
@@ -422,14 +410,11 @@ L(Shl8LoopStart):
 	jmp	L(Shl8LoopStart)
 
 L(Shl8LoopExit):
-	movaps	(%edx), %xmm6
-	psrldq	$8, %xmm6
-	palignr	$8, %xmm1, %xmm6
-	movaps	%xmm6, (%edx)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	POP	(%esi)
 	add	$8, %edx
 	add	$8, %ecx
-
-	POP	(%esi)
 	test	%al, %al
 	jz	L(ExitHigh)
 	test	$0x01, %al
@@ -454,7 +439,6 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%edx)
 	movaps	20(%ecx), %xmm2
 
@@ -462,15 +446,14 @@ L(Shl12Start):
 	lea	16(%edx), %edx
 	pmovmskb %xmm0, %eax
 	lea	16(%ecx), %ecx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 
 	test	%eax, %eax
 	jnz	L(Shl12LoopExit)
 
-	palignr	$12, %xmm1, %xmm2
+	palignr	$12, %xmm3, %xmm2
 	movaps	%xmm2, (%edx)
 	movaps	20(%ecx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqd	%xmm2, %xmm0
 	lea	16(%edx), %edx
@@ -482,7 +465,6 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%edx)
 	movaps	20(%ecx), %xmm2
 
@@ -490,13 +472,11 @@ L(Shl12Start):
 	lea	16(%edx), %edx
 	pmovmskb %xmm0, %eax
 	lea	16(%ecx), %ecx
-	movaps	%xmm2, %xmm3
 
 	test	%eax, %eax
 	jnz	L(Shl12LoopExit)
 
-	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$12, %xmm3, %xmm2
 	movaps	%xmm2, (%edx)
 	lea	20(%ecx), %ecx
 	lea	16(%edx), %edx
@@ -539,11 +519,9 @@ L(Shl12LoopStart):
 	jmp	L(Shl12LoopStart)
 
 L(Shl12LoopExit):
-	movaps	(%edx), %xmm6
-	psrldq	$4, %xmm6
+	movl	(%ecx), %esi
+	movl	%esi, (%edx)
 	mov	$4, %esi
-	palignr	$12, %xmm1, %xmm6
-	movaps	%xmm6, (%edx)
 
 	.p2align 4
 L(CopyFrom1To16Bytes):
@@ -555,6 +533,7 @@ L(CopyFrom1To16Bytes):
 	jz	L(ExitHigh)
 	test	$0x01, %al
 	jnz	L(Exit4)
+L(Exit8):
 	movlpd	(%ecx), %xmm0
 	movlpd	%xmm0, (%edx)
 	movl	%edi, %eax
@@ -564,6 +543,7 @@ L(CopyFrom1To16Bytes):
 L(ExitHigh):
 	test	$0x01, %ah
 	jnz	L(Exit12)
+L(Exit16):
 	movdqu	(%ecx), %xmm0
 	movdqu	%xmm0, (%edx)
 	movl	%edi, %eax