diff options
Diffstat (limited to 'ports/sysdeps/aarch64/memmove.S')
-rw-r--r-- | ports/sysdeps/aarch64/memmove.S | 312 |
1 files changed, 0 insertions, 312 deletions
diff --git a/ports/sysdeps/aarch64/memmove.S b/ports/sysdeps/aarch64/memmove.S deleted file mode 100644 index 4227b7f729..0000000000 --- a/ports/sysdeps/aarch64/memmove.S +++ /dev/null @@ -1,312 +0,0 @@ -/* Copyright (C) 2012-2014 Free Software Foundation, Inc. - - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -/* Assumptions: - * - * ARMv8-a, AArch64 - * Unaligned accesses - */ - -/* Parameters and result. */ -#define dstin x0 -#define src x1 -#define count x2 -#define tmp1 x3 -#define tmp1w w3 -#define tmp2 x4 -#define tmp2w w4 -#define tmp3 x5 -#define tmp3w w5 -#define dst x6 - -#define A_l x7 -#define A_h x8 -#define B_l x9 -#define B_h x10 -#define C_l x11 -#define C_h x12 -#define D_l x13 -#define D_h x14 - -ENTRY_ALIGN (memmove, 6) - - cmp dstin, src - b.lo L(downwards) - add tmp1, src, count - cmp dstin, tmp1 - b.hs memcpy /* No overlap. */ - - /* Upwards move with potential overlap. - * Need to move from the tail backwards. SRC and DST point one - * byte beyond the remaining data to move. */ - add dst, dstin, count - add src, src, count - cmp count, #64 - b.ge L(mov_not_short_up) - - /* Deal with small moves quickly by dropping straight into the - * exit block. */ -L(tail63up): - /* Move up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. */ - ands tmp1, count, #0x30 - b.eq L(tail15up) - sub dst, dst, tmp1 - sub src, src, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src, #32] - stp A_l, A_h, [dst, #32] -1: - ldp A_l, A_h, [src, #16] - stp A_l, A_h, [dst, #16] -2: - ldp A_l, A_h, [src] - stp A_l, A_h, [dst] -L(tail15up): - /* Move up to 15 bytes of data. Does not assume additional data - * being moved. */ - tbz count, #3, 1f - ldr tmp1, [src, #-8]! - str tmp1, [dst, #-8]! -1: - tbz count, #2, 1f - ldr tmp1w, [src, #-4]! - str tmp1w, [dst, #-4]! -1: - tbz count, #1, 1f - ldrh tmp1w, [src, #-2]! - strh tmp1w, [dst, #-2]! -1: - tbz count, #0, 1f - ldrb tmp1w, [src, #-1] - strb tmp1w, [dst, #-1] -1: - RET - -L(mov_not_short_up): - /* We don't much care about the alignment of DST, but we want SRC - * to be 128-bit (16 byte) aligned so that we don't cross cache line - * boundaries on both loads and stores. */ - ands tmp2, src, #15 /* Bytes to reach alignment. */ - b.eq 2f - sub count, count, tmp2 - /* Move enough data to reach alignment; unlike memcpy, we have to - * be aware of the overlap, which means we can't move data twice. */ - tbz tmp2, #3, 1f - ldr tmp1, [src, #-8]! - str tmp1, [dst, #-8]! -1: - tbz tmp2, #2, 1f - ldr tmp1w, [src, #-4]! - str tmp1w, [dst, #-4]! -1: - tbz tmp2, #1, 1f - ldrh tmp1w, [src, #-2]! - strh tmp1w, [dst, #-2]! -1: - tbz tmp2, #0, 1f - ldrb tmp1w, [src, #-1]! - strb tmp1w, [dst, #-1]! -1: - - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le L(tail63up) -2: - subs count, count, #128 - b.ge L(mov_body_large_up) - /* Less than 128 bytes to move, so handle 64 here and then jump - * to the tail. */ - ldp A_l, A_h, [src, #-64]! - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48] - stp A_l, A_h, [dst, #-64]! - stp B_l, B_h, [dst, #16] - stp C_l, C_h, [dst, #32] - stp D_l, D_h, [dst, #48] - tst count, #0x3f - b.ne L(tail63up) - RET - - /* Critical loop. Start at a new Icache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. */ - .p2align 6 -L(mov_body_large_up): - /* There are at least 128 bytes to move. */ - ldp A_l, A_h, [src, #-16] - ldp B_l, B_h, [src, #-32] - ldp C_l, C_h, [src, #-48] - ldp D_l, D_h, [src, #-64]! -1: - stp A_l, A_h, [dst, #-16] - ldp A_l, A_h, [src, #-16] - stp B_l, B_h, [dst, #-32] - ldp B_l, B_h, [src, #-32] - stp C_l, C_h, [dst, #-48] - ldp C_l, C_h, [src, #-48] - stp D_l, D_h, [dst, #-64]! - ldp D_l, D_h, [src, #-64]! - subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst, #-16] - stp B_l, B_h, [dst, #-32] - stp C_l, C_h, [dst, #-48] - stp D_l, D_h, [dst, #-64]! - tst count, #0x3f - b.ne L(tail63up) - RET - -L(downwards): - /* For a downwards move we can safely use memcpy provided that - * DST is more than 16 bytes away from SRC. */ - sub tmp1, src, #16 - cmp dstin, tmp1 - b.ls memcpy /* May overlap, but not critically. */ - - mov dst, dstin /* Preserve DSTIN for return value. */ - cmp count, #64 - b.ge L(mov_not_short_down) - - /* Deal with small moves quickly by dropping straight into the - * exit block. */ -L(tail63down): - /* Move up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. */ - ands tmp1, count, #0x30 - b.eq L(tail15down) - add dst, dst, tmp1 - add src, src, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src, #-48] - stp A_l, A_h, [dst, #-48] -1: - ldp A_l, A_h, [src, #-32] - stp A_l, A_h, [dst, #-32] -2: - ldp A_l, A_h, [src, #-16] - stp A_l, A_h, [dst, #-16] -L(tail15down): - /* Move up to 15 bytes of data. Does not assume additional data - being moved. */ - tbz count, #3, 1f - ldr tmp1, [src], #8 - str tmp1, [dst], #8 -1: - tbz count, #2, 1f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 -1: - tbz count, #1, 1f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 -1: - tbz count, #0, 1f - ldrb tmp1w, [src] - strb tmp1w, [dst] -1: - RET - -L(mov_not_short_down): - /* We don't much care about the alignment of DST, but we want SRC - * to be 128-bit (16 byte) aligned so that we don't cross cache line - * boundaries on both loads and stores. */ - neg tmp2, src - ands tmp2, tmp2, #15 /* Bytes to reach alignment. */ - b.eq 2f - sub count, count, tmp2 - /* Move enough data to reach alignment; unlike memcpy, we have to - * be aware of the overlap, which means we can't move data twice. */ - tbz tmp2, #3, 1f - ldr tmp1, [src], #8 - str tmp1, [dst], #8 -1: - tbz tmp2, #2, 1f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 -1: - tbz tmp2, #1, 1f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 -1: - tbz tmp2, #0, 1f - ldrb tmp1w, [src], #1 - strb tmp1w, [dst], #1 -1: - - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le L(tail63down) -2: - subs count, count, #128 - b.ge L(mov_body_large_down) - /* Less than 128 bytes to move, so handle 64 here and then jump - * to the tail. */ - ldp A_l, A_h, [src] - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48] - stp A_l, A_h, [dst] - stp B_l, B_h, [dst, #16] - stp C_l, C_h, [dst, #32] - stp D_l, D_h, [dst, #48] - tst count, #0x3f - add src, src, #64 - add dst, dst, #64 - b.ne L(tail63down) - RET - - /* Critical loop. Start at a new cache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. */ - .p2align 6 -L(mov_body_large_down): - /* There are at least 128 bytes to move. */ - ldp A_l, A_h, [src, #0] - sub dst, dst, #16 /* Pre-bias. */ - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */ -1: - stp A_l, A_h, [dst, #16] - ldp A_l, A_h, [src, #16] - stp B_l, B_h, [dst, #32] - ldp B_l, B_h, [src, #32] - stp C_l, C_h, [dst, #48] - ldp C_l, C_h, [src, #48] - stp D_l, D_h, [dst, #64]! - ldp D_l, D_h, [src, #64]! - subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst, #16] - stp B_l, B_h, [dst, #32] - stp C_l, C_h, [dst, #48] - stp D_l, D_h, [dst, #64] - add src, src, #16 - add dst, dst, #64 + 16 - tst count, #0x3f - b.ne L(tail63down) - RET -END (memmove) - -libc_hidden_builtin_def (memmove) |