about summary refs log tree commit diff
path: root/sysdeps/x86_64/memcpy.S
diff options
context:
space:
mode:
author	Ulrich Drepper <drepper@redhat.com>	2007-09-22 05:54:03 +0000
committer	Ulrich Drepper <drepper@redhat.com>	2007-09-22 05:54:03 +0000
commit	0435403c9d0c17c5de09b9a3e7e0d9b0002d422e (patch)
tree	c21734408ef49732e7054e334df36914ca6f1a69 /sysdeps/x86_64/memcpy.S
parent	715899d1221ca1336926ec2467288265a7888be6 (diff)
downloadglibc-0435403c9d0c17c5de09b9a3e7e0d9b0002d422e.tar.gz
glibc-0435403c9d0c17c5de09b9a3e7e0d9b0002d422e.tar.xz
glibc-0435403c9d0c17c5de09b9a3e7e0d9b0002d422e.zip
* sysdeps/x86_64/cacheinfo.c (__x86_64_data_cache_size_half): Renamed
	from __x86_64_core_cache_size_half.
	(init_cacheinfo): Compute shared cache size for AMD processors with
	shared L3 correctly.
	* sysdeps/x86_64/memcpy.S: Adjust for __x86_64_data_cache_size_half
	name change.
	Patch in large parts by Evandro Menezes.
Diffstat (limited to 'sysdeps/x86_64/memcpy.S')
-rw-r--r--	sysdeps/x86_64/memcpy.S	| 338 +++++++++++++++++++++---------------------
1 file changed, 169 insertions(+), 169 deletions(-)
diff --git a/sysdeps/x86_64/memcpy.S b/sysdeps/x86_64/memcpy.S
index 231329864f..b25646b8c5 100644
--- a/sysdeps/x86_64/memcpy.S
+++ b/sysdeps/x86_64/memcpy.S
@@ -114,15 +114,15 @@ L(1d):					/* 16-byte loop */
 	.p2align 4
 
 L(1loop):
-	movq	  (%rsi), %rcx
-	movq	8 (%rsi), %r8
-	movq	%rcx,   (%rdi)
-	movq	 %r8, 8 (%rdi)
+	movq	 (%rsi), %rcx
+	movq	8(%rsi), %r8
+	movq	%rcx,  (%rdi)
+	movq	 %r8, 8(%rdi)
 
 	subl	$16, %edx
 
-	leaq	16 (%rsi), %rsi
-	leaq	16 (%rdi), %rdi
+	leaq	16(%rsi), %rsi
+	leaq	16(%rdi), %rdi
 
 	jnz	L(1loop)
 
@@ -140,19 +140,19 @@ L(exit):				/* exit */
 
 L(1after):
 #ifndef USE_AS_MEMPCPY
-	movq	%rax, RETVAL (%rsp)	/* save return value */
+	movq	%rax, RETVAL(%rsp)	/* save return value */
 #endif
 
 /* Align to the natural word size. */
 
 L(aligntry):
-	movl	%esi, %ecx      	/* align by destination */
+	movl	%esi, %ecx      	/* align by source */
 
 	andl	$7, %ecx
 	jz	L(alignafter)  		/* already aligned */
 
 L(align):		      		/* align */
-	leaq	-8 (%rcx, %rdx), %rdx	/* calculate remaining bytes */
+	leaq	-8(%rcx, %rdx), %rdx	/* calculate remaining bytes */
 	subl	$8, %ecx
 
 	.p2align 4
@@ -163,8 +163,8 @@ L(alignloop):				/* 1-byte alignment loop */
 
 	incl	%ecx
 
-	leaq	1 (%rsi), %rsi
-	leaq	1 (%rdi), %rdi
+	leaq	1(%rsi), %rsi
+	leaq	1(%rdi), %rdi
 
 	jnz	L(alignloop)
 
@@ -172,7 +172,7 @@ L(alignloop):				/* 1-byte alignment loop */
 
 L(alignafter):
 
-/* Loop to handle mid-sized blocks. */
+/* Handle mid-sized blocks. */
 
 L(32try):				/* up to 1KB */
 	cmpq	$1024, %rdx
@@ -188,15 +188,15 @@ L(32):					/* 32-byte loop */
 L(32loop):
 	decl	%ecx
 
-	movq	(%rsi), %rax
-	movq	 8 (%rsi), %r8
-	movq	16 (%rsi), %r9
-	movq	24 (%rsi), %r10
+	movq	  (%rsi), %rax
+	movq	 8(%rsi), %r8
+	movq	16(%rsi), %r9
+	movq	24(%rsi), %r10
 
-	movq	%rax, (%rdi)
-	movq	 %r8,  8 (%rdi)
-	movq	 %r9, 16 (%rdi)
-	movq	%r10, 24 (%rdi)
+	movq	%rax,   (%rdi)
+	movq	 %r8,  8(%rdi)
+	movq	 %r9, 16(%rdi)
+	movq	%r10, 24(%rdi)
 
 	leaq	32(%rsi), %rsi
 	leaq	32(%rdi), %rdi
@@ -205,18 +205,18 @@ L(32loop):
 
 	decl	%ecx
 
-	movq	   (%rsi), %rax
-	movq	 8 (%rsi), %r8
-	movq	16 (%rsi), %r9
-	movq	24 (%rsi), %r10
+	movq	  (%rsi), %rax
+	movq	 8(%rsi), %r8
+	movq	16(%rsi), %r9
+	movq	24(%rsi), %r10
 
-	movq	%rax,    (%rdi)
-	movq	 %r8,  8 (%rdi)
-	movq	 %r9, 16 (%rdi)
-	movq	%r10, 24 (%rdi)
+	movq	%rax,   (%rdi)
+	movq	 %r8,  8(%rdi)
+	movq	 %r9, 16(%rdi)
+	movq	%r10, 24(%rdi)
 
-	leaq	32 (%rsi), %rsi
-	leaq	32 (%rdi), %rdi
+	leaq	32(%rsi), %rsi
+	leaq	32(%rdi), %rdi
 
 	jnz	L(32loop)
 
@@ -229,9 +229,9 @@ L(32skip):
 
 	movq	%rdi, %rax
 #else
-	movq	RETVAL (%rsp), %rax
+	movq	RETVAL(%rsp), %rax
 	jnz	L(1)
-	
+
 	rep
 #endif
 	retq				/* exit */
@@ -245,11 +245,11 @@ L(32after):
 	larger blocks are excluded when building for RTLD.
 */
 
-/* Handle large blocks smaller than 1/2 L1. */
+/* Handle blocks smaller than 1/2 L1. */
 
 L(fasttry):				/* first 1/2 L1 */
 #ifndef NOT_IN_libc			/* only up to this algorithm outside of libc.so */
-	movq	__x86_64_core_cache_size_half (%rip), %r11
+	movq	__x86_64_data_cache_size_half(%rip), %r11
 	cmpq	%rdx, %r11		/* calculate the smaller of */
 	cmovaq	%rdx, %r11		/* remaining bytes and 1/2 L1 */
 #endif
@@ -282,7 +282,7 @@ L(fastskip):
 
 	movq	%rdi, %rax
 #else
-	movq	RETVAL (%rsp), %rax
+	movq	RETVAL(%rsp), %rax
 	jnz	L(1)
 
 	rep
@@ -308,16 +308,16 @@ L(pre):					/* 64-byte with prefetching */
 	shrq	$6, %rcx
 	jz	L(preskip)
 
-	movq	%r14, SAVE0 (%rsp)
+	movq	%r14, SAVE0(%rsp)
 	cfi_rel_offset (%r14, SAVE0)
-	movq	%r13, SAVE1 (%rsp)
+	movq	%r13, SAVE1(%rsp)
 	cfi_rel_offset (%r13, SAVE1)
-	movq	%r12, SAVE2 (%rsp)
+	movq	%r12, SAVE2(%rsp)
 	cfi_rel_offset (%r12, SAVE2)
-	movq	%rbx, SAVE3 (%rsp)
+	movq	%rbx, SAVE3(%rsp)
 	cfi_rel_offset (%rbx, SAVE3)
 
-	cmpl	$0, __x86_64_prefetchw (%rip)
+	cmpl	$0, __x86_64_prefetchw(%rip)
 	jz	L(preloop)		/* check if PREFETCHW OK */
 
 	.p2align 4
@@ -339,45 +339,45 @@ L(prewloop):				/* cache-line in state M */
 	prefetcht0	 0 + 896 (%rsi)
 	prefetcht0	64 + 896 (%rsi)
 
-	movq	%rax,    (%rdi)
-	movq	%rbx,  8 (%rdi)
-	movq	 %r9, 16 (%rdi)
-	movq	%r10, 24 (%rdi)
-	movq	%r11, 32 (%rdi)
-	movq	%r12, 40 (%rdi)
-	movq	%r13, 48 (%rdi)
-	movq	%r14, 56 (%rdi)
+	movq	%rax,   (%rdi)
+	movq	%rbx,  8(%rdi)
+	movq	 %r9, 16(%rdi)
+	movq	%r10, 24(%rdi)
+	movq	%r11, 32(%rdi)
+	movq	%r12, 40(%rdi)
+	movq	%r13, 48(%rdi)
+	movq	%r14, 56(%rdi)
 
-	leaq	64 (%rsi), %rsi
-	leaq	64 (%rdi), %rdi
+	leaq	64(%rsi), %rsi
+	leaq	64(%rdi), %rdi
 
 	jz	L(prebail)
 
 	decq	%rcx
 
-	movq	   (%rsi), %rax
-	movq	 8 (%rsi), %rbx
-	movq	16 (%rsi), %r9
-	movq	24 (%rsi), %r10
-	movq	32 (%rsi), %r11
-	movq	40 (%rsi), %r12
-	movq	48 (%rsi), %r13
-	movq	56 (%rsi), %r14
-
-	movq	%rax,    (%rdi)
-	movq	%rbx,  8 (%rdi)
-	movq	 %r9, 16 (%rdi)
-	movq	%r10, 24 (%rdi)
-	movq	%r11, 32 (%rdi)
-	movq	%r12, 40 (%rdi)
-	movq	%r13, 48 (%rdi)
-	movq	%r14, 56 (%rdi)
-
-	prefetchw	896 - 64 (%rdi)
-	prefetchw	896 -  0 (%rdi)
-
-	leaq	64 (%rsi), %rsi
-	leaq	64 (%rdi), %rdi
+	movq	  (%rsi), %rax
+	movq	 8(%rsi), %rbx
+	movq	16(%rsi), %r9
+	movq	24(%rsi), %r10
+	movq	32(%rsi), %r11
+	movq	40(%rsi), %r12
+	movq	48(%rsi), %r13
+	movq	56(%rsi), %r14
+
+	movq	%rax,   (%rdi)
+	movq	%rbx,  8(%rdi)
+	movq	 %r9, 16(%rdi)
+	movq	%r10, 24(%rdi)
+	movq	%r11, 32(%rdi)
+	movq	%r12, 40(%rdi)
+	movq	%r13, 48(%rdi)
+	movq	%r14, 56(%rdi)
+
+	prefetchw	896 - 64(%rdi)
+	prefetchw	896 -  0(%rdi)
+
+	leaq	64(%rsi), %rsi
+	leaq	64(%rdi), %rdi
 
 	jnz	L(prewloop)
 	jmp	L(prebail)
@@ -389,26 +389,26 @@ L(prewloop):				/* cache-line in state M */
 L(preloop):				/* cache-line in state E */
 	decq	%rcx
 
-	movq	   (%rsi), %rax
-	movq	 8 (%rsi), %rbx
-	movq	16 (%rsi), %r9
-	movq	24 (%rsi), %r10
-	movq	32 (%rsi), %r11
-	movq	40 (%rsi), %r12
-	movq	48 (%rsi), %r13
-	movq	56 (%rsi), %r14
-
-	prefetcht0	896 +  0 (%rsi)
-	prefetcht0	896 + 64 (%rsi)
-
-	movq	%rax,    (%rdi)
-	movq	%rbx,  8 (%rdi)
-	movq	 %r9, 16 (%rdi)
-	movq	%r10, 24 (%rdi)
-	movq	%r11, 32 (%rdi)
-	movq	%r12, 40 (%rdi)
-	movq	%r13, 48 (%rdi)
-	movq	%r14, 56 (%rdi)
+	movq	  (%rsi), %rax
+	movq	 8(%rsi), %rbx
+	movq	16(%rsi), %r9
+	movq	24(%rsi), %r10
+	movq	32(%rsi), %r11
+	movq	40(%rsi), %r12
+	movq	48(%rsi), %r13
+	movq	56(%rsi), %r14
+
+	prefetcht0	896 +  0(%rsi)
+	prefetcht0	896 + 64(%rsi)
+
+	movq	%rax,   (%rdi)
+	movq	%rbx,  8(%rdi)
+	movq	 %r9, 16(%rdi)
+	movq	%r10, 24(%rdi)
+	movq	%r11, 32(%rdi)
+	movq	%r12, 40(%rdi)
+	movq	%r13, 48(%rdi)
+	movq	%r14, 56(%rdi)
 
 	leaq	64 (%rsi), %rsi
 	leaq	64 (%rdi), %rdi
@@ -417,40 +417,40 @@ L(preloop):				/* cache-line in state E */
 
 	decq	%rcx
 
-	movq	   (%rsi), %rax
-	movq	 8 (%rsi), %rbx
-	movq	16 (%rsi), %r9
-	movq	24 (%rsi), %r10
-	movq	32 (%rsi), %r11
-	movq	40 (%rsi), %r12
-	movq	48 (%rsi), %r13
-	movq	56 (%rsi), %r14
-
-	prefetcht0	896 - 64 (%rdi)
-	prefetcht0	896 -  0 (%rdi)
-
-	movq	%rax,    (%rdi)
-	movq	%rbx,  8 (%rdi)
-	movq	 %r9, 16 (%rdi)
-	movq	%r10, 24 (%rdi)
-	movq	%r11, 32 (%rdi)
-	movq	%r12, 40 (%rdi)
-	movq	%r13, 48 (%rdi)
-	movq	%r14, 56 (%rdi)
-
-	leaq	64 (%rsi), %rsi
-	leaq	64 (%rdi), %rdi
+	movq	  (%rsi), %rax
+	movq	 8(%rsi), %rbx
+	movq	16(%rsi), %r9
+	movq	24(%rsi), %r10
+	movq	32(%rsi), %r11
+	movq	40(%rsi), %r12
+	movq	48(%rsi), %r13
+	movq	56(%rsi), %r14
+
+	prefetcht0	896 - 64(%rdi)
+	prefetcht0	896 -  0(%rdi)
+
+	movq	%rax,   (%rdi)
+	movq	%rbx,  8(%rdi)
+	movq	 %r9, 16(%rdi)
+	movq	%r10, 24(%rdi)
+	movq	%r11, 32(%rdi)
+	movq	%r12, 40(%rdi)
+	movq	%r13, 48(%rdi)
+	movq	%r14, 56(%rdi)
+
+	leaq	64(%rsi), %rsi
+	leaq	64(%rdi), %rdi
 
 	jnz	L(preloop)
 
 L(prebail):
-	movq	SAVE3 (%rsp), %rbx
+	movq	SAVE3(%rsp), %rbx
 	cfi_restore (%rbx)
-	movq	SAVE2 (%rsp), %r12
+	movq	SAVE2(%rsp), %r12
 	cfi_restore (%r12)
-	movq	SAVE1 (%rsp), %r13
+	movq	SAVE1(%rsp), %r13
 	cfi_restore (%r13)
-	movq	SAVE0 (%rsp), %r14
+	movq	SAVE0(%rsp), %r14
 	cfi_restore (%r14)
 
 /*       .p2align 4 */
@@ -466,7 +466,7 @@ L(preskip):
 
 	movq	%rdi, %rax
 #else
-	movq	RETVAL (%rsp), %rax
+	movq	RETVAL(%rsp), %rax
 	jnz	L(1)
 
 	rep
@@ -477,7 +477,7 @@ L(preskip):
 
 L(preafter):
 
-/* Loop to handle huge blocks. */
+/* Handle huge blocks. */
 
 L(NTtry):
 
@@ -486,69 +486,69 @@ L(NT):					/* non-temporal 128-byte */
 	shrq	$7, %rcx
 	jz	L(NTskip)
 
-	movq	%r14, SAVE0 (%rsp)
+	movq	%r14, SAVE0(%rsp)
 	cfi_rel_offset (%r14, SAVE0)
-	movq	%r13, SAVE1 (%rsp)
+	movq	%r13, SAVE1(%rsp)
 	cfi_rel_offset (%r13, SAVE1)
-	movq	%r12, SAVE2 (%rsp)
+	movq	%r12, SAVE2(%rsp)
 	cfi_rel_offset (%r12, SAVE2)
 
        .p2align 4
 
 L(NTloop):
-	prefetchnta	768 (%rsi)
-	prefetchnta	832 (%rsi)
+	prefetchnta	768(%rsi)
+	prefetchnta	832(%rsi)
 
 	decq	%rcx
 
-	movq	   (%rsi), %rax
-	movq	 8 (%rsi), %r8
-	movq	16 (%rsi), %r9
-	movq	24 (%rsi), %r10
-	movq	32 (%rsi), %r11
-	movq	40 (%rsi), %r12
-	movq	48 (%rsi), %r13
-	movq	56 (%rsi), %r14
-
-	movntiq	%rax,    (%rdi)
-	movntiq	 %r8,  8 (%rdi)
-	movntiq	 %r9, 16 (%rdi)
-	movntiq	%r10, 24 (%rdi)
-	movntiq	%r11, 32 (%rdi)
-	movntiq	%r12, 40 (%rdi)
-	movntiq	%r13, 48 (%rdi)
-	movntiq	%r14, 56 (%rdi)
-
-	movq	 64 (%rsi), %rax
-	movq	 72 (%rsi), %r8
-	movq	 80 (%rsi), %r9
-	movq	 88 (%rsi), %r10
-	movq	 96 (%rsi), %r11
-	movq	104 (%rsi), %r12
-	movq	112 (%rsi), %r13
-	movq	120 (%rsi), %r14
-
-	movntiq	%rax,  64 (%rdi)
-	movntiq	 %r8,  72 (%rdi)
-	movntiq	 %r9,  80 (%rdi)
-	movntiq	%r10,  88 (%rdi)
-	movntiq	%r11,  96 (%rdi)
-	movntiq	%r12, 104 (%rdi)
-	movntiq	%r13, 112 (%rdi)
-	movntiq	%r14, 120 (%rdi)
-
-	leaq	128 (%rsi), %rsi
-	leaq	128 (%rdi), %rdi
+	movq	  (%rsi), %rax
+	movq	 8(%rsi), %r8
+	movq	16(%rsi), %r9
+	movq	24(%rsi), %r10
+	movq	32(%rsi), %r11
+	movq	40(%rsi), %r12
+	movq	48(%rsi), %r13
+	movq	56(%rsi), %r14
+
+	movntiq	%rax,   (%rdi)
+	movntiq	 %r8,  8(%rdi)
+	movntiq	 %r9, 16(%rdi)
+	movntiq	%r10, 24(%rdi)
+	movntiq	%r11, 32(%rdi)
+	movntiq	%r12, 40(%rdi)
+	movntiq	%r13, 48(%rdi)
+	movntiq	%r14, 56(%rdi)
+
+	movq	 64(%rsi), %rax
+	movq	 72(%rsi), %r8
+	movq	 80(%rsi), %r9
+	movq	 88(%rsi), %r10
+	movq	 96(%rsi), %r11
+	movq	104(%rsi), %r12
+	movq	112(%rsi), %r13
+	movq	120(%rsi), %r14
+
+	movntiq	%rax,  64(%rdi)
+	movntiq	 %r8,  72(%rdi)
+	movntiq	 %r9,  80(%rdi)
+	movntiq	%r10,  88(%rdi)
+	movntiq	%r11,  96(%rdi)
+	movntiq	%r12, 104(%rdi)
+	movntiq	%r13, 112(%rdi)
+	movntiq	%r14, 120(%rdi)
+
+	leaq	128(%rsi), %rsi
+	leaq	128(%rdi), %rdi
 
 	jnz	L(NTloop)
 
 	sfence				/* serialize memory stores */
 
-	movq	SAVE2 (%rsp), %r12
+	movq	SAVE2(%rsp), %r12
 	cfi_restore (%r12)
-	movq	SAVE1 (%rsp), %r13
+	movq	SAVE1(%rsp), %r13
 	cfi_restore (%r13)
-	movq	SAVE0 (%rsp), %r14
+	movq	SAVE0(%rsp), %r14
 	cfi_restore (%r14)
 
 L(NTskip):
@@ -558,7 +558,7 @@ L(NTskip):
 
 	movq	%rdi, %rax
 #else
-	movq	RETVAL (%rsp), %rax
+	movq	RETVAL(%rsp), %rax
 	jnz	L(1)
 
 	rep