author    Anton Youdkevitch <anton.youdkevitch@bell-sw.com>  2018-10-16 11:00:27 -0700
committer Steve Ellcey <sellcey@caviumnetworks.com>          2018-10-16 11:00:27 -0700
commit    75c1aee500ac95bde2b800b3d787c0dd805a8a82 (patch)
tree      654659bd639a9d9e6cd3cb9313f7ee8cc03672dc /sysdeps/aarch64/multiarch/memcpy_thunderx.S
parent    bcdb1bfa0c700db25e0f355d912ec2309f9544a2 (diff)
aarch64: optimized memcpy implementation for thunderx2
Since aligned loads and stores are a huge performance
advantage, the implementation always tries to do aligned
accesses. Besides the cases where src and dst are both
aligned, or are unaligned by the same amount, there are
cases where src and dst are misaligned relative to each
other. For such cases (if the length is big enough) the EXT
instruction is used to merge and shift two chunks loaded
from adjacent aligned locations, and the combined chunk is
then stored to an aligned address.
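
For illustration, a minimal sketch of the merge-and-shift
idea (hypothetical code, not taken from this patch; the
shift amount 5 stands in for whatever src/dst misalignment
applies, and since EXT takes an immediate the real code has
to dispatch to a per-offset variant):

	ldr	q0, [src]		/* aligned chunk N */
	ldr	q1, [src, 16]		/* aligned chunk N+1 */
	ext	v0.16b, v0.16b, v1.16b, #5	/* bytes 5..20 of the pair */
	str	q0, [dst]		/* dst is 16-byte aligned */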

Performance gain against the current T2 implementation:
     memcpy-large: 65K-32M: +40% to +10%
     memcpy-walk:  128-32M: +20% to +2%
Diffstat (limited to 'sysdeps/aarch64/multiarch/memcpy_thunderx.S')
 sysdeps/aarch64/multiarch/memcpy_thunderx.S | 14 --------------
 1 file changed, 14 deletions(-)
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
index de494d933d..6000365e82 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
@@ -74,13 +74,10 @@
 
 #if IS_IN (libc)
 
-# ifndef USE_THUNDERX2
 #  undef MEMCPY
 #  define MEMCPY __memcpy_thunderx
 #  undef MEMMOVE
 #  define MEMMOVE __memmove_thunderx
-#  define USE_THUNDERX
-# endif
 
 ENTRY_ALIGN (MEMMOVE, 6)
 
@@ -182,8 +179,6 @@ L(copy96):
 	.p2align 4
 L(copy_long):
 
-# if defined(USE_THUNDERX) || defined (USE_THUNDERX2)
-
 	/* On thunderx, large memcpy's are helped by software prefetching.
 	   This loop is identical to the one below it but with prefetching
 	   instructions included.  For loops that are less than 32768 bytes,
@@ -196,11 +191,7 @@ L(copy_long):
 	bic	dst, dstin, 15
 	ldp	D_l, D_h, [src]
 	sub	src, src, tmp1
-#  if defined(USE_THUNDERX)
 	prfm	pldl1strm, [src, 384]
-#  elif defined(USE_THUNDERX2)
-	prfm	pldl1strm, [src, 256]
-#  endif
 	add	count, count, tmp1	/* Count is now 16 too large.  */
 	ldp	A_l, A_h, [src, 16]
 	stp	D_l, D_h, [dstin]
@@ -210,13 +201,9 @@ L(copy_long):
 	subs	count, count, 128 + 16	/* Test and readjust count.  */
 
 L(prefetch_loop64):
-#  if defined(USE_THUNDERX)
 	tbz	src, #6, 1f
 	prfm	pldl1strm, [src, 512]
 1:
-#  elif defined(USE_THUNDERX2)
-	prfm	pldl1strm, [src, 256]
-#  endif
 	stp	A_l, A_h, [dst, 16]
 	ldp	A_l, A_h, [src, 16]
 	stp	B_l, B_h, [dst, 32]
@@ -230,7 +217,6 @@ L(prefetch_loop64):
 	b	L(last64)
 
 L(copy_long_without_prefetch):
-# endif
 
 	and	tmp1, dstin, 15
 	bic	dst, dstin, 15
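
For context on the prefetch code that remains in the
ThunderX path: the pattern is to issue a streaming prefetch
a fixed distance ahead of the load pointer inside the copy
loop. A minimal standalone sketch (hypothetical labels;
src, dst and count stand for the register aliases the real
file defines via macros; 384 matches the retained prefetch
distance above):

	/* Hypothetical 32-byte-per-iteration copy loop with a
	   streaming prefetch issued 384 bytes ahead of src.  */
L(sketch_loop):
	prfm	pldl1strm, [src, 384]
	ldp	x6, x7, [src], 16
	ldp	x8, x9, [src], 16
	stp	x6, x7, [dst], 16
	stp	x8, x9, [dst], 16
	subs	count, count, 32
	b.hi	L(sketch_loop)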