author    | Anton Youdkevitch <anton.youdkevitch@bell-sw.com> | 2018-10-16 11:00:27 -0700
committer | Steve Ellcey <sellcey@caviumnetworks.com> | 2018-10-16 11:00:27 -0700
commit    | 75c1aee500ac95bde2b800b3d787c0dd805a8a82 (patch)
tree      | 654659bd639a9d9e6cd3cb9313f7ee8cc03672dc /sysdeps/aarch64/multiarch/memcpy_thunderx.S
parent    | bcdb1bfa0c700db25e0f355d912ec2309f9544a2 (diff)
aarch64: optimized memcpy implementation for thunderx2
Since aligned loads and stores are a big performance advantage, the implementation always tries to do aligned accesses. Besides the cases where src and dst are both aligned, or are misaligned by the same amount, there are cases where they are misaligned relative to each other. For such cases (if the length is big enough) the ext instruction is used to merge-and-shift two memory chunks loaded from adjacent aligned locations, and the adjusted chunk is then stored to an aligned address.

Performance gain over the current thunderx2 (T2) implementation:
  memcpy-large: 65K-32M: +40% to +10%
  memcpy-walk:  128-32M: +20% to +2%
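[Editor's illustration] To make the merge-and-shift idea concrete, here is a minimal C sketch using the NEON vextq_u8 intrinsic, which maps to the ext instruction. The patch itself implements this in hand-written AArch64 assembly (this page only shows the cleanup to memcpy_thunderx.S); the function name copy_shifted, the fixed OFFSET of 3, and the simplified length handling below are illustrative assumptions, not code from the patch.

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Example misalignment of src within its 16-byte block.  The ext immediate
   must be a compile-time constant, so real code needs a code path (or an
   instantiation) per offset.  */
#define OFFSET 3

/* Copy LEN bytes (assumed to be a multiple of 16) to a 16-byte-aligned DST
   from SRC, where SRC is OFFSET bytes past a 16-byte boundary.  The final
   load reads a few bytes past src + len; tail handling is omitted here for
   brevity.  */
static void
copy_shifted (uint8_t *dst, const uint8_t *src, size_t len)
{
  const uint8_t *asrc = (const uint8_t *) ((uintptr_t) src & ~(uintptr_t) 15);
  uint8x16_t prev = vld1q_u8 (asrc);               /* aligned chunk covering src[0] */

  for (size_t i = 0; i < len; i += 16)
    {
      uint8x16_t next = vld1q_u8 (asrc + i + 16);  /* next aligned 16-byte chunk */
      /* ext: take 16 bytes starting at byte OFFSET of the concatenation
         prev:next, i.e. exactly src[i] .. src[i + 15].  */
      uint8x16_t merged = vextq_u8 (prev, next, OFFSET);
      vst1q_u8 (dst + i, merged);                  /* store to an aligned address */
      prev = next;
    }
}

Every store hits an aligned address and every source byte is loaded exactly once from an aligned location, which is the property the commit message describes.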
Diffstat (limited to 'sysdeps/aarch64/multiarch/memcpy_thunderx.S')
-rw-r--r-- | sysdeps/aarch64/multiarch/memcpy_thunderx.S | 14
1 file changed, 0 insertions, 14 deletions
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
index de494d933d..6000365e82 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
@@ -74,13 +74,10 @@
 
 #if IS_IN (libc)
 
-# ifndef USE_THUNDERX2
 # undef MEMCPY
 # define MEMCPY __memcpy_thunderx
 # undef MEMMOVE
 # define MEMMOVE __memmove_thunderx
-# define USE_THUNDERX
-# endif
 
 ENTRY_ALIGN (MEMMOVE, 6)
 
@@ -182,8 +179,6 @@ L(copy96):
         .p2align 4
 L(copy_long):
 
-# if defined(USE_THUNDERX) || defined (USE_THUNDERX2)
-
         /* On thunderx, large memcpy's are helped by software prefetching.
            This loop is identical to the one below it but with prefetching
            instructions included.  For loops that are less than 32768 bytes,
@@ -196,11 +191,7 @@ L(copy_long):
         bic     dst, dstin, 15
         ldp     D_l, D_h, [src]
         sub     src, src, tmp1
-# if defined(USE_THUNDERX)
         prfm    pldl1strm, [src, 384]
-# elif defined(USE_THUNDERX2)
-        prfm    pldl1strm, [src, 256]
-# endif
         add     count, count, tmp1      /* Count is now 16 too large.  */
         ldp     A_l, A_h, [src, 16]
         stp     D_l, D_h, [dstin]
@@ -210,13 +201,9 @@ L(copy_long):
         subs    count, count, 128 + 16  /* Test and readjust count.  */
 
 L(prefetch_loop64):
-# if defined(USE_THUNDERX)
         tbz     src, #6, 1f
         prfm    pldl1strm, [src, 512]
 1:
-# elif defined(USE_THUNDERX2)
-        prfm    pldl1strm, [src, 256]
-# endif
         stp     A_l, A_h, [dst, 16]
         ldp     A_l, A_h, [src, 16]
         stp     B_l, B_h, [dst, 32]
@@ -230,7 +217,6 @@ L(prefetch_loop64):
         b       L(last64)
 
 L(copy_long_without_prefetch):
-# endif
 
         and     tmp1, dstin, 15
         bic     dst, dstin, 15
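[Editor's illustration] The comment retained in the diff notes that large copies on thunderx are helped by software prefetching issued several hundred bytes ahead of the loads (prfm pldl1strm at 384/512 bytes); the thunderx2-specific 256-byte prefetches are what this hunk removes. A rough C sketch of that loop shape, using the GCC/Clang __builtin_prefetch builtin instead of prfm; the names copy_with_prefetch, LINE and PREFETCH_DIST are assumptions for the example:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define LINE 64            /* bytes copied per iteration */
#define PREFETCH_DIST 512  /* assumed prefetch distance ahead of the loads */

/* Copy LEN bytes from SRC to DST, prefetching the source stream well ahead
   of the current read position.  Prefetching past the end of the buffer is
   harmless: prefetches are hints and do not fault.  */
static void
copy_with_prefetch (uint8_t *dst, const uint8_t *src, size_t len)
{
  size_t i = 0;

  for (; i + LINE <= len; i += LINE)
    {
      /* rw = 0 (read), locality = 0 (streaming, no need to keep in cache),
         which roughly corresponds to the pldl1strm hint in the assembly.  */
      __builtin_prefetch (src + i + PREFETCH_DIST, 0, 0);
      memcpy (dst + i, src + i, LINE);   /* copy one 64-byte block */
    }

  memcpy (dst + i, src + i, len - i);    /* remaining tail */
}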