/* Optimized memcpy for Fujitsu A64FX processor.
   Copyright (C) 2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8.2-a, AArch64, unaligned accesses, sve
 *
 */

#define L2_SIZE		(8*1024*1024)/2	// L2 8MB/2
#define CACHE_LINE_SIZE	256
#define ZF_DIST		(CACHE_LINE_SIZE * 21)	// Zerofill distance
#define dest		x0
#define src		x1
#define n		x2	// size
#define tmp1		x3
#define tmp2		x4
#define tmp3		x5
#define rest		x6
#define dest_ptr	x7
#define src_ptr		x8
#define vector_length	x9
#define cl_remainder	x10	// CACHE_LINE_SIZE remainder

#if HAVE_AARCH64_SVE_ASM
# if IS_IN (libc)
#  define MEMCPY __memcpy_a64fx
#  define MEMMOVE __memmove_a64fx

	.arch armv8.2-a+sve

	// Recursively emit "dc zva" for \times consecutive cache lines,
	// advancing tmp1 by CACHE_LINE_SIZE each time.
	.macro dc_zva times
	dc	zva, tmp1
	add	tmp1, tmp1, CACHE_LINE_SIZE
	.if \times-1
	dc_zva "(\times-1)"
	.endif
	.endm

	.macro ld1b_unroll8
	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
	ld1b	z1.b, p0/z, [src_ptr, #1, mul vl]
	ld1b	z2.b, p0/z, [src_ptr, #2, mul vl]
	ld1b	z3.b, p0/z, [src_ptr, #3, mul vl]
	ld1b	z4.b, p0/z, [src_ptr, #4, mul vl]
	ld1b	z5.b, p0/z, [src_ptr, #5, mul vl]
	ld1b	z6.b, p0/z, [src_ptr, #6, mul vl]
	ld1b	z7.b, p0/z, [src_ptr, #7, mul vl]
	.endm

	// Software-pipelined store/load pairs: store the data loaded on
	// the previous iteration, then load the next iteration's data
	// into the registers just freed.
	.macro stld1b_unroll4a
	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
	st1b	z1.b, p0, [dest_ptr, #1, mul vl]
	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
	ld1b	z1.b, p0/z, [src_ptr, #1, mul vl]
	st1b	z2.b, p0, [dest_ptr, #2, mul vl]
	st1b	z3.b, p0, [dest_ptr, #3, mul vl]
	ld1b	z2.b, p0/z, [src_ptr, #2, mul vl]
	ld1b	z3.b, p0/z, [src_ptr, #3, mul vl]
	.endm

	.macro stld1b_unroll4b
	st1b	z4.b, p0, [dest_ptr, #4, mul vl]
	st1b	z5.b, p0, [dest_ptr, #5, mul vl]
	ld1b	z4.b, p0/z, [src_ptr, #4, mul vl]
	ld1b	z5.b, p0/z, [src_ptr, #5, mul vl]
	st1b	z6.b, p0, [dest_ptr, #6, mul vl]
	st1b	z7.b, p0, [dest_ptr, #7, mul vl]
	ld1b	z6.b, p0/z, [src_ptr, #6, mul vl]
	ld1b	z7.b, p0/z, [src_ptr, #7, mul vl]
	.endm

	.macro stld1b_unroll8
	stld1b_unroll4a
	stld1b_unroll4b
	.endm

	.macro st1b_unroll8
	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
	st1b	z1.b, p0, [dest_ptr, #1, mul vl]
	st1b	z2.b, p0, [dest_ptr, #2, mul vl]
	st1b	z3.b, p0, [dest_ptr, #3, mul vl]
	st1b	z4.b, p0, [dest_ptr, #4, mul vl]
	st1b	z5.b, p0, [dest_ptr, #5, mul vl]
	st1b	z6.b, p0, [dest_ptr, #6, mul vl]
	st1b	z7.b, p0, [dest_ptr, #7, mul vl]
	.endm

	.macro shortcut_for_small_size exit
	// if rest <= vector_length * 2
	whilelo	p0.b, xzr, n
	whilelo	p1.b, vector_length, n
	b.last	1f
	ld1b	z0.b, p0/z, [src, #0, mul vl]
	ld1b	z1.b, p1/z, [src, #1, mul vl]
	st1b	z0.b, p0, [dest, #0, mul vl]
	st1b	z1.b, p1, [dest, #1, mul vl]
	ret

1:	// if rest > vector_length * 8
	cmp	n, vector_length, lsl 3	// vector_length * 8
	b.hi	\exit

	// if rest <= vector_length * 4
	lsl	tmp1, vector_length, 1	// vector_length * 2
	whilelo	p2.b, tmp1, n
	incb	tmp1
	whilelo	p3.b, tmp1, n
	b.last	1f
	ld1b	z0.b, p0/z, [src, #0, mul vl]
	ld1b	z1.b, p1/z, [src, #1, mul vl]
	ld1b	z2.b, p2/z, [src, #2, mul vl]
	ld1b	z3.b, p3/z, [src, #3, mul vl]
	st1b	z0.b, p0, [dest, #0, mul vl]
	st1b	z1.b, p1, [dest, #1, mul vl]
	st1b	z2.b, p2, [dest, #2, mul vl]
	st1b	z3.b, p3, [dest, #3, mul vl]
	ret

1:	// if rest <= vector_length * 8
	lsl	tmp1, vector_length, 2	// vector_length * 4
	whilelo	p4.b, tmp1, n
	incb	tmp1
	whilelo	p5.b, tmp1, n
	b.last	1f
	ld1b	z0.b, p0/z, [src, #0, mul vl]
	ld1b	z1.b, p1/z, [src, #1, mul vl]
	ld1b	z2.b, p2/z, [src, #2, mul vl]
	ld1b	z3.b, p3/z, [src, #3, mul vl]
	ld1b	z4.b, p4/z, [src, #4, mul vl]
	ld1b	z5.b, p5/z, [src, #5, mul vl]
	st1b	z0.b, p0, [dest, #0, mul vl]
	st1b	z1.b, p1, [dest, #1, mul vl]
	st1b	z2.b, p2, [dest, #2, mul vl]
	st1b	z3.b, p3, [dest, #3, mul vl]
	st1b	z4.b, p4, [dest, #4, mul vl]
	st1b	z5.b, p5, [dest, #5, mul vl]
	ret

1:	lsl	tmp1, vector_length, 2	// vector_length * 4
	incb	tmp1			// vector_length * 5
	incb	tmp1			// vector_length * 6
	whilelo	p6.b, tmp1, n
	incb	tmp1
	whilelo	p7.b, tmp1, n
	ld1b	z0.b, p0/z, [src, #0, mul vl]
	ld1b	z1.b, p1/z, [src, #1, mul vl]
	ld1b	z2.b, p2/z, [src, #2, mul vl]
	ld1b	z3.b, p3/z, [src, #3, mul vl]
	ld1b	z4.b, p4/z, [src, #4, mul vl]
	ld1b	z5.b, p5/z, [src, #5, mul vl]
	ld1b	z6.b, p6/z, [src, #6, mul vl]
	ld1b	z7.b, p7/z, [src, #7, mul vl]
	st1b	z0.b, p0, [dest, #0, mul vl]
	st1b	z1.b, p1, [dest, #1, mul vl]
	st1b	z2.b, p2, [dest, #2, mul vl]
	st1b	z3.b, p3, [dest, #3, mul vl]
	st1b	z4.b, p4, [dest, #4, mul vl]
	st1b	z5.b, p5, [dest, #5, mul vl]
	st1b	z6.b, p6, [dest, #6, mul vl]
	st1b	z7.b, p7, [dest, #7, mul vl]
	ret
	.endm
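
/* Copy strategy, as implemented below:
   - n <= vector_length * 8: handled entirely by the predicated
     shortcut_for_small_size macro above, without any loop.
   - larger n: the 8-way unrolled, software-pipelined SVE loop at
     L(unroll8), with a predicated tail at L(last).
   - n >= L2_SIZE and vector_length == 64 (512-bit SVE): the
     cache-line-aware L(L2) path, which pre-zeroes destination
     lines with "dc zva" ahead of the stores.  */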

ENTRY (MEMCPY)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

L(memcpy):
	cntb	vector_length
	// shortcut for less than vector_length * 8
	// gives a free ptrue to p0.b for n >= vector_length
	shortcut_for_small_size L(vl_agnostic)
	// end of shortcut

L(vl_agnostic): // VL Agnostic
	mov	rest, n
	mov	dest_ptr, dest
	mov	src_ptr, src
	// if rest >= L2_SIZE && vector_length == 64 then L(L2)
	mov	tmp1, 64
	cmp	rest, L2_SIZE
	ccmp	vector_length, tmp1, 0, cs
	b.eq	L(L2)

L(unroll8): // unrolling and software pipeline
	lsl	tmp1, vector_length, 3	// vector_length * 8
	.p2align 3
	cmp	rest, tmp1
	b.cc	L(last)
	ld1b_unroll8
	add	src_ptr, src_ptr, tmp1
	sub	rest, rest, tmp1
	cmp	rest, tmp1
	b.cc	2f
	.p2align 3
1:	stld1b_unroll8
	add	dest_ptr, dest_ptr, tmp1
	add	src_ptr, src_ptr, tmp1
	sub	rest, rest, tmp1
	cmp	rest, tmp1
	b.ge	1b
2:	st1b_unroll8
	add	dest_ptr, dest_ptr, tmp1

	.p2align 3
L(last):
	whilelo	p0.b, xzr, rest
	whilelo	p1.b, vector_length, rest
	b.last	1f
	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
	ret

1:	lsl	tmp1, vector_length, 1	// vector_length * 2
	whilelo	p2.b, tmp1, rest
	incb	tmp1
	whilelo	p3.b, tmp1, rest
	b.last	1f
	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
	ret

1:	lsl	tmp1, vector_length, 2	// vector_length * 4
	whilelo	p4.b, tmp1, rest
	incb	tmp1
	whilelo	p5.b, tmp1, rest
	incb	tmp1
	whilelo	p6.b, tmp1, rest
	incb	tmp1
	whilelo	p7.b, tmp1, rest
	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
	ld1b	z4.b, p4/z, [src_ptr, #4, mul vl]
	ld1b	z5.b, p5/z, [src_ptr, #5, mul vl]
	ld1b	z6.b, p6/z, [src_ptr, #6, mul vl]
	ld1b	z7.b, p7/z, [src_ptr, #7, mul vl]
	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
	st1b	z4.b, p4, [dest_ptr, #4, mul vl]
	st1b	z5.b, p5, [dest_ptr, #5, mul vl]
	st1b	z6.b, p6, [dest_ptr, #6, mul vl]
	st1b	z7.b, p7, [dest_ptr, #7, mul vl]
	ret
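
	// Very large copies (see the L2_SIZE check above) take the path
	// below: dest_ptr is first brought up to a CACHE_LINE_SIZE
	// boundary, then the main loop issues "dc zva" ZF_DIST bytes
	// ahead of the stores, so each destination line is zeroed (and
	// thereby allocated in the cache) before it is overwritten,
	// instead of being fetched from memory only to be overwritten.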

L(L2):
	// align dest address at CACHE_LINE_SIZE byte boundary
	mov	tmp1, CACHE_LINE_SIZE
	ands	tmp2, dest_ptr, CACHE_LINE_SIZE - 1
	// if cl_remainder == 0
	b.eq	L(L2_dc_zva)
	sub	cl_remainder, tmp1, tmp2
	// process remainder until the first CACHE_LINE_SIZE boundary
	whilelo	p1.b, xzr, cl_remainder	// keep p0.b all true
	whilelo	p2.b, vector_length, cl_remainder
	b.last	1f
	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
	b	2f

1:	lsl	tmp1, vector_length, 1	// vector_length * 2
	whilelo	p3.b, tmp1, cl_remainder
	incb	tmp1
	whilelo	p4.b, tmp1, cl_remainder
	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
	ld1b	z3.b, p3/z, [src_ptr, #2, mul vl]
	ld1b	z4.b, p4/z, [src_ptr, #3, mul vl]
	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
	st1b	z3.b, p3, [dest_ptr, #2, mul vl]
	st1b	z4.b, p4, [dest_ptr, #3, mul vl]

2:	add	dest_ptr, dest_ptr, cl_remainder
	add	src_ptr, src_ptr, cl_remainder
	sub	rest, rest, cl_remainder

L(L2_dc_zva):
	// zero fill
	and	tmp1, dest, 0xffffffffffffff
	and	tmp2, src, 0xffffffffffffff
	subs	tmp1, tmp1, tmp2	// diff
	b.ge	1f
	neg	tmp1, tmp1
1:	mov	tmp3, ZF_DIST + CACHE_LINE_SIZE * 2
	cmp	tmp1, tmp3
	b.lo	L(unroll8)
	mov	tmp1, dest_ptr
	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
	// unroll
	ld1b_unroll8	// this line has to be after "b.lo L(unroll8)"
	add	src_ptr, src_ptr, CACHE_LINE_SIZE * 2
	sub	rest, rest, CACHE_LINE_SIZE * 2
	mov	tmp1, ZF_DIST
	.p2align 3
1:	stld1b_unroll4a
	add	tmp2, dest_ptr, tmp1	// dest_ptr + ZF_DIST
	dc	zva, tmp2
	stld1b_unroll4b
	add	tmp2, tmp2, CACHE_LINE_SIZE
	dc	zva, tmp2
	add	dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
	add	src_ptr, src_ptr, CACHE_LINE_SIZE * 2
	sub	rest, rest, CACHE_LINE_SIZE * 2
	cmp	rest, tmp3	// ZF_DIST + CACHE_LINE_SIZE * 2
	b.ge	1b
	st1b_unroll8
	add	dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
	b	L(unroll8)

END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)
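
/* memmove dispatches on the distance between the untagged addresses:
   if dest - src <= 0 or dest - src >= n, a forward copy can never
   overwrite source bytes that have not yet been read, so it falls
   through to the memcpy code at L(vl_agnostic); otherwise the buffers
   overlap with dest above src, and the copy runs backwards from the
   end (L(bwd_start)).  */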

ENTRY (MEMMOVE)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	// remove address tags
	// dest has to be immutable because it is the return value
	// src has to be immutable because it is used in L(bwd_last)
	and	tmp2, dest, 0xffffffffffffff	// save dest_notag into tmp2
	and	tmp3, src, 0xffffffffffffff	// save src_notag into tmp3
	cmp	n, 0
	ccmp	tmp2, tmp3, 4, ne
	b.ne	1f
	ret
1:	cntb	vector_length
	// shortcut for less than vector_length * 8
	// gives a free ptrue to p0.b for n >= vector_length
	// tmp2 and tmp3 should not be used in this macro to keep
	// notag addresses
	shortcut_for_small_size L(dispatch)
	// end of shortcut

L(dispatch):
	// tmp2 = dest_notag, tmp3 = src_notag
	// diff = dest_notag - src_notag
	sub	tmp1, tmp2, tmp3
	// if diff <= 0 || diff >= n then memcpy
	cmp	tmp1, 0
	ccmp	tmp1, n, 2, gt
	b.cs	L(vl_agnostic)

L(bwd_start):
	mov	rest, n
	add	dest_ptr, dest, n	// dest_end
	add	src_ptr, src, n		// src_end

L(bwd_unroll8): // unrolling and software pipeline
	lsl	tmp1, vector_length, 3	// vector_length * 8
	.p2align 3
	cmp	rest, tmp1
	b.cc	L(bwd_last)
	sub	src_ptr, src_ptr, tmp1
	ld1b_unroll8
	sub	rest, rest, tmp1
	cmp	rest, tmp1
	b.cc	2f
	.p2align 3
1:	sub	src_ptr, src_ptr, tmp1
	sub	dest_ptr, dest_ptr, tmp1
	stld1b_unroll8
	sub	rest, rest, tmp1
	cmp	rest, tmp1
	b.ge	1b
2:	sub	dest_ptr, dest_ptr, tmp1
	st1b_unroll8

L(bwd_last):
	mov	dest_ptr, dest
	mov	src_ptr, src
	b	L(last)

END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)

# endif /* IS_IN (libc) */
#endif /* HAVE_AARCH64_SVE_ASM */