author     H.J. Lu <hjl.tools@gmail.com>   2016-04-24 10:53:25 -0700
committer  H.J. Lu <hjl.tools@gmail.com>   2016-04-25 08:13:38 -0700
commit     0d9d7bc875a16911be7a93acdf7999c0023761ea (patch)
tree       b4aa0c86b1b6e25b335d4e915d19f3835c7741a8
parent     8dd19b0b3ca334060eec990f0afa502700939ad3 (diff)
Align to cacheline
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S  93
1 file changed, 93 insertions(+), 0 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 0a2bf4108f..aaee527dca 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -466,6 +466,26 @@ L(large_forward):
 	leaq    (%rdi, %rdx), %r10
 	cmpq    %r10, %rsi
 	jb	L(loop_4x_vec_forward)
+# if CACHELINE_SIZE != VEC_SIZE
+	movl	%edi, %r8d
+	andl	$(CACHELINE_SIZE - 1), %r8d
+	je	L(loop_large_forward)
+#  if CACHELINE_SIZE == (VEC_SIZE * 4)
+	/* Cacheline misaligned by VEC_SIZE, 2 * VEC_SIZE, or
+	   3 * VEC_SIZE.  */
+	cmpl	$(VEC_SIZE * 2), %r8d
+	je	L(misaligned_by_2x_vec_forward)
+	jb	L(misaligned_by_3x_vec_forward)
+#  elif CACHELINE_SIZE != (VEC_SIZE * 2)
+#   error Unsupported CACHELINE_SIZE!
+#  endif
+	/* Cacheline misaligned by VEC_SIZE.  */
+	VMOVU	(%rsi), %VEC(0)
+	addq	$VEC_SIZE, %rsi
+	subq	$VEC_SIZE, %rdx
+	VMOVA	%VEC(0), (%rdi)
+	addq	$VEC_SIZE, %rdi
+# endif
 L(loop_large_forward):
 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
 	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
@@ -494,6 +514,32 @@ L(loop_large_forward):
 	VZEROUPPER
 	ret
 
+# if CACHELINE_SIZE == (VEC_SIZE * 4)
+L(misaligned_by_2x_vec_forward):
+	/* Cacheline misaligned by 2 * VEC_SIZE.  */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	addq	$(VEC_SIZE * 2), %rsi
+	subq	$(VEC_SIZE * 2), %rdx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	addq	$(VEC_SIZE * 2), %rdi
+	jmp	L(loop_large_forward)
+
+L(misaligned_by_3x_vec_forward):
+	/* Cacheline misaligned by 3 * VEC_SIZE.  */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	addq	$(VEC_SIZE * 3), %rsi
+	subq	$(VEC_SIZE * 3), %rdx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	addq	$(VEC_SIZE * 3), %rdi
+	jmp	L(loop_large_forward)
+# endif
+
 L(large_backward):
 	/* Don't use non-temporal store if there is overlap between
 	   destination and source since destination may be in cache
@@ -501,6 +547,26 @@ L(large_backward):
 	leaq    (%rcx, %rdx), %r10
 	cmpq    %r10, %r9
 	jb	L(loop_4x_vec_backward)
+# if CACHELINE_SIZE != VEC_SIZE
+	movl	%r9d, %r8d
+	andl	$(CACHELINE_SIZE - 1), %r8d
+	je	L(loop_large_backward)
+#  if CACHELINE_SIZE == (VEC_SIZE * 4)
+	/* Cacheline misaligned by VEC_SIZE, 2 * VEC_SIZE, or
+	   3 * VEC_SIZE.  */
+	cmpl	$(VEC_SIZE * 2), %r8d
+	je	L(misaligned_by_2x_vec_backward)
+	jb	L(misaligned_by_3x_vec_backward)
+#  elif CACHELINE_SIZE != (VEC_SIZE * 2)
+#   error Unsupported CACHELINE_SIZE!
+#  endif
+	/* Cacheline misaligned by VEC_SIZE.  */
+	VMOVU	(%rcx), %VEC(0)
+	subq	$VEC_SIZE, %rcx
+	subq	$VEC_SIZE, %rdx
+	VMOVA	%VEC(0), (%r9)
+	subq	$VEC_SIZE, %r9
+# endif
 L(loop_large_backward):
 	/* Copy 4 * VEC a time backward with non-temporal stores.  */
 	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
@@ -528,6 +594,33 @@ L(loop_large_backward):
 	VMOVU	%VEC(8), (%r11)
 	VZEROUPPER
 	ret
+
+# if CACHELINE_SIZE == (VEC_SIZE * 4)
+L(misaligned_by_2x_vec_backward):
+	/* Cacheline misaligned by 2 * VEC_SIZE.  */
+	VMOVU	(%rcx), %VEC(0)
+	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+	subq	$(VEC_SIZE * 2), %rcx
+	subq	$(VEC_SIZE * 2), %rdx
+	VMOVA	%VEC(0), (%r9)
+	VMOVA	%VEC(1), -VEC_SIZE(%r9)
+	subq	$(VEC_SIZE * 2), %r9
+	jmp	L(loop_large_backward)
+
+L(misaligned_by_3x_vec_backward):
+	/* Cacheline misaligned by 3 * VEC_SIZE.  */
+	VMOVU	(%rcx), %VEC(0)
+	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+	subq	$(VEC_SIZE * 3), %rcx
+	subq	$(VEC_SIZE * 3), %rdx
+	VMOVA	%VEC(0), (%r9)
+	VMOVA	%VEC(1), -VEC_SIZE(%r9)
+	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
+	subq	$(VEC_SIZE * 3), %r9
+	jmp	L(loop_large_backward)
+# endif
+
 #endif
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
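
To make the patch's intent concrete: before entering the non-temporal loop_large_forward / loop_large_backward copy loops, the new code peels off one, two, or three VEC_SIZE chunks (unaligned loads, aligned stores) so that the destination pointer lands on a cacheline boundary, and every non-temporal store in the main loop then writes a full cacheline. What follows is a minimal C sketch of that head-alignment step for the forward path only; it is not the glibc implementation. The CACHELINE_SIZE and VEC_SIZE values and the helper name align_dst_to_cacheline are assumptions chosen for illustration.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #define CACHELINE_SIZE 64   /* assumed; the assembly takes this from a macro */
    #define VEC_SIZE       32   /* assumed; e.g. one AVX vector register */

    /* Hypothetical helper mirroring the patch's forward head-alignment step.
       On entry dst is assumed to be VEC_SIZE-aligned (earlier code in the
       memmove path arranges this) and len is large, so whole VEC_SIZE chunks
       can always be peeled off.  Copy chunks until dst sits on a cacheline
       boundary; the bulk loop that follows can then use non-temporal stores
       that always fill complete cachelines.  */
    static void align_dst_to_cacheline(char **dst, const char **src, size_t *len)
    {
        while (((uintptr_t)*dst & (CACHELINE_SIZE - 1)) != 0) {
            memcpy(*dst, *src, VEC_SIZE);   /* unaligned load, aligned store in the asm */
            *dst += VEC_SIZE;
            *src += VEC_SIZE;
            *len -= VEC_SIZE;
        }
    }

The motivation for aligning is that non-temporal (streaming) stores drain through write-combining buffers a cacheline at a time; starting the streaming loop on a cacheline boundary avoids partial-line writes at the front of each iteration. The backward path in the patch applies the same idea in reverse, peeling chunks from the top of the region before its descending loop.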