-rw-r--r--  sysdeps/aarch64/multiarch/Makefile            |   2
-rw-r--r--  sysdeps/aarch64/multiarch/ifunc-impl-list.c   |   4
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy.c            |  51
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_sve.S        | 218
-rw-r--r--  sysdeps/aarch64/multiarch/memmove.c           |  51
5 files changed, 284 insertions, 42 deletions
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 7500cf1e93..490aaa0faf 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,6 +1,6 @@
 ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_advsimd memcpy_thunderx memcpy_thunderx2 \
-		   memcpy_falkor memcpy_a64fx \
+		   memcpy_falkor memcpy_a64fx memcpy_sve \
		   memset_generic memset_falkor memset_emag memset_kunpeng \
		   memset_a64fx \
		   memchr_generic memchr_nosimd \
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 483fbbdb52..f6c6d008da 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -25,7 +25,7 @@
 #include <stdio.h>
 
 /* Maximum number of IFUNC implementations.  */
-#define MAX_IFUNC	7
+#define MAX_IFUNC	8
 
 size_t
 __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
@@ -45,6 +45,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
 #if HAVE_AARCH64_SVE_ASM
	      IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx)
+	      IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_sve)
 #endif
	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
@@ -54,6 +55,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
 #if HAVE_AARCH64_SVE_ASM
	      IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx)
+	      IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_sve)
 #endif
	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
   IFUNC_IMPL (i, name, memset,
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index a476dd548f..0486213f08 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -33,27 +33,38 @@ extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
-# if HAVE_AARCH64_SVE_ASM
 extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
-# endif
-
-libc_ifunc (__libc_memcpy,
-            (IS_THUNDERX (midr)
-	     ? __memcpy_thunderx
-	     : (IS_FALKOR (midr) || IS_PHECDA (midr)
-		? __memcpy_falkor
-		: (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
-		  ? __memcpy_thunderx2
-		  : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
-		     || IS_NEOVERSE_V1 (midr)
-		     ? __memcpy_simd
-# if HAVE_AARCH64_SVE_ASM
-		     : (IS_A64FX (midr) && sve
-			? __memcpy_a64fx
-			: __memcpy_generic))))));
-# else
-		     : __memcpy_generic)))));
-# endif
+extern __typeof (__redirect_memcpy) __memcpy_sve attribute_hidden;
+
+static inline __typeof (__redirect_memcpy) *
+select_memcpy_ifunc (void)
+{
+  INIT_ARCH ();
+
+  if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr))
+    return __memcpy_simd;
+
+  if (sve && HAVE_AARCH64_SVE_ASM)
+    {
+      if (IS_A64FX (midr))
+	return __memcpy_a64fx;
+      return __memcpy_sve;
+    }
+
+  if (IS_THUNDERX (midr))
+    return __memcpy_thunderx;
+
+  if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr))
+    return __memcpy_thunderx2;
+
+  if (IS_FALKOR (midr) || IS_PHECDA (midr))
+    return __memcpy_falkor;
+
+  return __memcpy_generic;
+}
+
+libc_ifunc (__libc_memcpy, select_memcpy_ifunc ());
+
 # undef memcpy
 strong_alias (__libc_memcpy, memcpy);
 #endif
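The new resolver is easier to follow than the removed nested ternary: the selection runs once, at relocation time, and the returned pointer is what memcpy binds to for the rest of the process. As a minimal standalone sketch of the GNU IFUNC mechanism that libc_ifunc wraps (not part of this patch; the names my_copy, my_copy_fast and have_fast_copy are hypothetical, and GCC on a GNU/Linux target is assumed):

    #include <stddef.h>
    #include <string.h>

    static void *my_copy_generic (void *d, const void *s, size_t n)
    { return memcpy (d, s, n); }

    /* Stand-in for a tuned variant such as __memcpy_sve.  */
    static void *my_copy_fast (void *d, const void *s, size_t n)
    { return memcpy (d, s, n); }

    /* The resolver runs once during relocation and returns the variant
       that calls to my_copy will use from then on.  */
    static void *(*resolve_my_copy (void)) (void *, const void *, size_t)
    {
      int have_fast_copy = 0;   /* Replace with a real CPU/HWCAP check.  */
      return have_fast_copy ? my_copy_fast : my_copy_generic;
    }

    void *my_copy (void *d, const void *s, size_t n)
         __attribute__ ((ifunc ("resolve_my_copy")));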
diff --git a/sysdeps/aarch64/multiarch/memcpy_sve.S b/sysdeps/aarch64/multiarch/memcpy_sve.S
new file mode 100644
index 0000000000..a70907ec55
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_sve.S
@@ -0,0 +1,218 @@
+/* Optimized memcpy for SVE.
+   Copyright (C) 2021-2022 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
+ *
+ */
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define tmp1	x6
+#define vlen	x6
+
+#define A_q	q0
+#define B_q	q1
+#define C_q	q2
+#define D_q	q3
+#define E_q	q4
+#define F_q	q5
+#define G_q	q6
+#define H_q	q7
+
+/* This implementation supports both memcpy and memmove and shares most code.
+   It uses unaligned accesses and branchless sequences to keep the code small,
+   simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check in memmove is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per
+   iteration.  The source pointer is 16-byte aligned to minimize unaligned
+   accesses.  The loop tail is handled by always copying 64 bytes from the
+   end.  */
+
+#if HAVE_AARCH64_SVE_ASM
+
+	.arch armv8.2-a+sve
+
+ENTRY (__memcpy_sve)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+
+	cmp	count, 128
+	b.hi	L(copy_long)
+	cmp	count, 32
+	b.hi	L(copy32_128)
+
+	whilelo	p0.b, xzr, count
+	cntb	vlen
+	tbnz	vlen, 4, L(vlen128)
+	ld1b	z0.b, p0/z, [src]
+	st1b	z0.b, p0, [dstin]
+	ret
+
+	/* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+	add	srcend, src, count
+	add	dstend, dstin, count
+	ldp	A_q, B_q, [src]
+	ldp	C_q, D_q, [srcend, -32]
+	cmp	count, 64
+	b.hi	L(copy128)
+	stp	A_q, B_q, [dstin]
+	stp	C_q, D_q, [dstend, -32]
+	ret
+
+	/* Copy 65..128 bytes.  */
+L(copy128):
+	ldp	E_q, F_q, [src, 32]
+	cmp	count, 96
+	b.ls	L(copy96)
+	ldp	G_q, H_q, [srcend, -64]
+	stp	G_q, H_q, [dstend, -64]
+L(copy96):
+	stp	A_q, B_q, [dstin]
+	stp	E_q, F_q, [dstin, 32]
+	stp	C_q, D_q, [dstend, -32]
+	ret
+
+L(vlen128):
+	whilelo	p1.b, vlen, count
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p1/z, [src, 1, mul vl]
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z1.b, p1, [dstin, 1, mul vl]
+	ret
+
+	.p2align 4
+	/* Copy more than 128 bytes.  */
+L(copy_long):
+	add	srcend, src, count
+	add	dstend, dstin, count
+
+	/* Copy 16 bytes and then align src to 16-byte alignment.  */
+	ldr	D_q, [src]
+	and	tmp1, src, 15
+	bic	src, src, 15
+	sub	dst, dstin, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_q, B_q, [src, 16]
+	str	D_q, [dstin]
+	ldp	C_q, D_q, [src, 48]
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	L(copy64_from_end)
+L(loop64):
+	stp	A_q, B_q, [dst, 16]
+	ldp	A_q, B_q, [src, 80]
+	stp	C_q, D_q, [dst, 48]
+	ldp	C_q, D_q, [src, 112]
+	add	src, src, 64
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	L(loop64)
+
+	/* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
+	ldp	E_q, F_q, [srcend, -64]
+	stp	A_q, B_q, [dst, 16]
+	ldp	A_q, B_q, [srcend, -32]
+	stp	C_q, D_q, [dst, 48]
+	stp	E_q, F_q, [dstend, -64]
+	stp	A_q, B_q, [dstend, -32]
+	ret
+
+END (__memcpy_sve)
+libc_hidden_builtin_def (__memcpy_sve)
+
+
+ENTRY (__memmove_sve)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+
+	cmp	count, 128
+	b.hi	L(move_long)
+	cmp	count, 32
+	b.hi	L(copy32_128)
+
+	whilelo	p0.b, xzr, count
+	cntb	vlen
+	tbnz	vlen, 4, L(vlen128)
+	ld1b	z0.b, p0/z, [src]
+	st1b	z0.b, p0, [dstin]
+	ret
+
+	.p2align 4
+L(move_long):
+	add	srcend, src, count
+	add	dstend, dstin, count
+	/* Only use backward copy if there is an overlap.  */
+	sub	tmp1, dstin, src
+	cbz	tmp1, L(return)
+	cmp	tmp1, count
+	b.hs	L(copy_long)
+
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
+	ldr	D_q, [srcend, -16]
+	and	tmp1, srcend, 15
+	bic	srcend, srcend, 15
+	sub	count, count, tmp1
+	ldp	A_q, B_q, [srcend, -32]
+	str	D_q, [dstend, -16]
+	ldp	C_q, D_q, [srcend, -64]
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	L(copy64_from_start)
+
+L(loop64_backwards):
+	str	B_q, [dstend, -16]
+	str	A_q, [dstend, -32]
+	ldp	A_q, B_q, [srcend, -96]
+	str	D_q, [dstend, -48]
+	str	C_q, [dstend, -64]!
+	ldp	C_q, D_q, [srcend, -128]
+	sub	srcend, srcend, 64
+	subs	count, count, 64
+	b.hi	L(loop64_backwards)
+
+	/* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+	ldp	E_q, F_q, [src, 32]
+	stp	A_q, B_q, [dstend, -32]
+	ldp	A_q, B_q, [src]
+	stp	C_q, D_q, [dstend, -64]
+	stp	E_q, F_q, [dstin, 32]
+	stp	A_q, B_q, [dstin]
+L(return):
+	ret
+
+END (__memmove_sve)
+libc_hidden_builtin_def (__memmove_sve)
+#endif
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index 4f7d7eedfd..261996ecc4 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -33,27 +33,38 @@ extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
-# if HAVE_AARCH64_SVE_ASM
 extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
-# endif
-
-libc_ifunc (__libc_memmove,
-            (IS_THUNDERX (midr)
-	     ? __memmove_thunderx
-	     : (IS_FALKOR (midr) || IS_PHECDA (midr)
-		? __memmove_falkor
-		: (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
-		  ? __memmove_thunderx2
-		  : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
-		     || IS_NEOVERSE_V1 (midr)
-		     ? __memmove_simd
-# if HAVE_AARCH64_SVE_ASM
-		     : (IS_A64FX (midr) && sve
-			? __memmove_a64fx
-			: __memmove_generic))))));
-# else
-		     : __memmove_generic)))));
-# endif
+extern __typeof (__redirect_memmove) __memmove_sve attribute_hidden;
+
+static inline __typeof (__redirect_memmove) *
+select_memmove_ifunc (void)
+{
+  INIT_ARCH ();
+
+  if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr))
+    return __memmove_simd;
+
+  if (sve && HAVE_AARCH64_SVE_ASM)
+    {
+      if (IS_A64FX (midr))
+	return __memmove_a64fx;
+      return __memmove_sve;
+    }
+
+  if (IS_THUNDERX (midr))
+    return __memmove_thunderx;
+
+  if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr))
+    return __memmove_thunderx2;
+
+  if (IS_FALKOR (midr) || IS_PHECDA (midr))
+    return __memmove_falkor;
+
+  return __memmove_generic;
+}
+
+libc_ifunc (__libc_memmove, select_memmove_ifunc ());
+
 # undef memmove
 strong_alias (__libc_memmove, memmove);
 #endif
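For readers who find the assembly dense, here is a rough C sketch of the copy strategy the memcpy_sve.S comments describe: three size classes, all loads issued before any store in the medium case, the large-copy tail taken as the last 64 bytes relative to the end, and memmove's single unsigned compare to detect an overlap that forces a backward copy. This is an illustration only, not the implementation; it drops the SVE-predicated small-copy path and the 16-byte source alignment of the real loop, and the large path shown is the forward (memcpy) direction only.

    #include <stdint.h>
    #include <string.h>

    /* Overlap test from L(move_long): dst lies inside [src, src + n)
       exactly when the unsigned difference dst - src is smaller than n.  */
    static int
    must_copy_backwards (const char *dst, const char *src, size_t n)
    {
      return (uintptr_t) (dst - src) < n;
    }

    static void
    copy_sketch (char *dst, const char *src, size_t n)
    {
      if (n <= 32)
        {
          /* Small: the real code is a single predicated SVE load/store
             (whilelo + ld1b/st1b), with no branch on the exact length.  */
          for (size_t i = 0; i < n; i++)
            dst[i] = src[i];
        }
      else if (n <= 128)
        {
          /* Medium: read up to 64 bytes from each end before storing
             anything; the overlapping stores cover 33..128 bytes without
             a length-dependent loop.  */
          char head[64], tail[64];
          size_t h = n > 64 ? 64 : 32;
          size_t t = n > 96 ? 64 : 32;
          memcpy (head, src, h);
          memcpy (tail, src + n - t, t);
          memcpy (dst, head, h);
          memcpy (dst + n - t, tail, t);
        }
      else
        {
          /* Large: 64 bytes per iteration; the tail is handled by always
             storing the last 64 bytes relative to the end, which absorbs
             any remainder without a scalar cleanup loop.  */
          size_t i = 0;
          while (n - i > 64)
            {
              memcpy (dst + i, src + i, 64);
              i += 64;
            }
          memcpy (dst + n - 64, src + n - 64, 64);
        }
    }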