/* Optimized unaligned memcpy implementation using basic LoongArch instructions.
   Copyright (C) 2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <sys/regdef.h>
#include <sys/asm.h>

#if IS_IN (libc)

# define MEMCPY_NAME __memcpy_unaligned

/* Load the 64 bytes at [reg + n, reg + n + 64) into t0-t7.
   Clobbers t0-t7.  */
# define LD_64(reg, n)            \
    ld.d        t0, reg, n;      \
    ld.d        t1, reg, n + 8;  \
    ld.d        t2, reg, n + 16; \
    ld.d        t3, reg, n + 24; \
    ld.d        t4, reg, n + 32; \
    ld.d        t5, reg, n + 40; \
    ld.d        t6, reg, n + 48; \
    ld.d        t7, reg, n + 56;

/* Store t0-t7 to the 64 bytes at [reg + n, reg + n + 64).  */
# define ST_64(reg, n)            \
    st.d        t0, reg, n;      \
    st.d        t1, reg, n + 8;  \
    st.d        t2, reg, n + 16; \
    st.d        t3, reg, n + 24; \
    st.d        t4, reg, n + 32; \
    st.d        t5, reg, n + 40; \
    st.d        t6, reg, n + 48; \
    st.d        t7, reg, n + 56;

/* void *__memcpy_unaligned (void *dest, const void *src, size_t n)

   In:     a0 = dest, a1 = src, a2 = n (byte count).
   Out:    a0 is never written, so it still holds dest on return.
   Uses:   a1-a7, t0-t8 as scratch; no stack is used.

   Register roles inside the routine:
     a3 = dest + n (one past the last destination byte)
     a4 = src + n  (one past the last source byte)
     a6 = size threshold / shifted size used by the dispatch
     t8 = 8-byte aligned destination cursor (paths for n > 64)
     a7 = alignment adjustment, then loop counter (65..128 path)
     a5 = a3 - 128, loop bound of the 128-byte loop

   The routine issues ld.d/st.d (and ld.w/ld.h) on addresses of arbitrary
   alignment, so it relies on unaligned accesses being supported.

   Every size class copies a head chunk from the start and a tail chunk
   ending exactly at the end of the buffer; the two chunks may overlap,
   which removes the need for a byte-granular remainder loop.

   All size comparisons are signed (bge/blt).  */
LEAF(MEMCPY_NAME, 3)
    add.d       a4, a1, a2          /* a4 = src end.  */
    add.d       a3, a0, a2          /* a3 = dest end.  */
    li.w        a6, 16
    bge         a6, a2, L(less_16bytes)     /* n <= 16.  */

    li.w        a6, 128
    blt         a6, a2, L(long_bytes)       /* n > 128.  */
    li.w        a6, 64
    blt         a6, a2, L(more_64bytes)     /* 64 < n <= 128.  */

    li.w        a6, 32
    blt         a6, a2, L(more_32bytes)     /* 32 < n <= 64.  */

    /* 16 < n <= 32: first 16 bytes plus last 16 bytes.  All loads are
       issued before the first store.  */
    ld.d        t0, a1, 0
    ld.d        t1, a1, 8
    ld.d        t2, a4, -16
    ld.d        t3, a4, -8

    st.d        t0, a0, 0
    st.d        t1, a0, 8
    st.d        t2, a3, -16
    st.d        t3, a3, -8
    jr          ra

    /* 64 < n <= 128.  */
L(more_64bytes):
    /* t8 = (dest & ~7) + 8: the next 8-byte boundary strictly above dest
       (dest + 8 when dest is already aligned).  */
    srli.d      t8, a0, 3
    slli.d      t8, t8, 3
    addi.d      t8, t8, 0x8
    sub.d       a7, a0, t8          /* a7 = dest - t8, in [-8, -1].  */

    /* Copy the first 8 bytes unaligned, then advance src by the same
       distance dest was advanced to reach t8.  */
    ld.d        t0, a1, 0
    sub.d       a1, a1, a7
    st.d        t0, a0, 0
    /* a7 = (bytes remaining from t8) - 32.  */
    add.d       a7, a7, a2
    addi.d      a7, a7, -0x20

    /* Copy 32 bytes per iteration to the aligned cursor.  The loop is
       left once at most 32 bytes remain; those are handled below.  */
L(loop_32):
    ld.d        t0, a1, 0
    ld.d        t1, a1, 8
    ld.d        t2, a1, 16
    ld.d        t3, a1, 24

    st.d        t0, t8, 0
    st.d        t1, t8, 8
    st.d        t2, t8, 16
    st.d        t3, t8, 24

    addi.d      t8, t8, 0x20
    addi.d      a1, a1, 0x20
    addi.d      a7, a7, -0x20
    blt         zero, a7, L(loop_32)

    /* Tail: the last 32 bytes, addressed from the buffer ends (may
       overlap bytes already written by the loop).  */
    ld.d        t4, a4, -32
    ld.d        t5, a4, -24
    ld.d        t6, a4, -16
    ld.d        t7, a4, -8

    st.d        t4, a3, -32
    st.d        t5, a3, -24
    st.d        t6, a3, -16
    st.d        t7, a3, -8

    jr          ra

    /* 32 < n <= 64: first 32 bytes plus last 32 bytes.  All loads are
       issued before the first store.  */
L(more_32bytes):
    ld.d        t0, a1, 0
    ld.d        t1, a1, 8
    ld.d        t2, a1, 16
    ld.d        t3, a1, 24

    ld.d        t4, a4, -32
    ld.d        t5, a4, -24
    ld.d        t6, a4, -16
    ld.d        t7, a4, -8

    st.d        t0, a0, 0
    st.d        t1, a0, 8
    st.d        t2, a0, 16
    st.d        t3, a0, 24

    st.d        t4, a3, -32
    st.d        t5, a3, -24
    st.d        t6, a3, -16
    st.d        t7, a3, -8

    jr          ra

    /* n <= 16.  Each step tests n >> k and falls to the next smaller
       access width when the result is zero.  */
L(less_16bytes):
    srai.d      a6, a2, 3
    beqz        a6, L(less_8bytes)          /* n < 8.  */

    /* 8 <= n <= 16: first 8 bytes plus last 8 bytes.  */
    ld.d        t0, a1, 0
    ld.d        t1, a4, -8
    st.d        t0, a0, 0
    st.d        t1, a3, -8

    jr          ra

L(less_8bytes):
    srai.d      a6, a2, 2
    beqz        a6, L(less_4bytes)          /* n < 4.  */

    /* 4 <= n < 8: first 4 bytes plus last 4 bytes.  */
    ld.w        t0, a1, 0
    ld.w        t1, a4, -4
    st.w        t0, a0, 0
    st.w        t1, a3, -4

    jr          ra

L(less_4bytes):
    srai.d      a6, a2, 1
    beqz        a6, L(less_2bytes)          /* n < 2.  */

    /* 2 <= n < 4: first 2 bytes plus last 2 bytes.  */
    ld.h        t0, a1, 0
    ld.h        t1, a4, -2
    st.h        t0, a0, 0
    st.h        t1, a3, -2

    jr          ra

L(less_2bytes):
    beqz        a2, L(less_1bytes)          /* n == 0: nothing to copy.  */

    /* n == 1.  */
    ld.b        t0, a1, 0
    st.b        t0, a0, 0

    jr          ra

L(less_1bytes):
    jr          ra

    /* n > 128.  */
L(long_bytes):
    /* t8 = dest & ~7.  If dest is already 8-byte aligned, start the bulk
       copy right away with t8 == dest and src untouched.  */
    srli.d      t8, a0, 3
    slli.d      t8, t8, 3
    beq         a0, t8, L(start)
    ld.d        t0, a1, 0

    /* Otherwise copy the first 8 bytes unaligned, move t8 up to the next
       8-byte boundary and advance src by the same distance.  */
    addi.d      t8, t8, 0x8
    st.d        t0, a0, 0
    sub.d       a7, a0, t8          /* a7 = dest - t8, in [-7, -1].  */
    sub.d       a1, a1, a7

L(start):
    /* a5 = dest end - 128; skip the loop when fewer than 128 bytes
       remain from t8.  */
    addi.d      a5, a3, -0x80
    blt         a5, t8, L(align_end_proc)

    /* Copy 128 bytes per iteration while at least 128 bytes remain.  */
L(loop_128):
    LD_64(a1, 0)
    ST_64(t8, 0)
    LD_64(a1, 64)
    addi.d      a1, a1, 0x80
    ST_64(t8, 64)
    addi.d      t8, t8, 0x80
    bge         a5, t8, L(loop_128)

L(align_end_proc):
    /* a2 = bytes still to copy from t8 (0..127).
       t2 = a2 & 0x78 = that count rounded down to a multiple of 8.

       Computed jump into the unrolled copy below: pcaddi yields
       PC + 34 * 4, which is the address of the final "ld.d t0, a4, -8"
       (4 instructions here + 15 ld/st pairs = 34 instructions).  Each
       ld/st pair occupies 8 bytes of code and copies 8 bytes of data, so
       jumping t2 bytes before that address executes exactly t2 / 8 pairs.

       Do NOT insert, remove or reorder instructions between the pcaddi
       and the final tail copy without updating the constant 34.  */
    sub.d       a2, a3, t8
    pcaddi      t1, 34
    andi        t2, a2, 0x78
    sub.d       t1, t1, t2
    jr          t1

    ld.d        t0, a1, 112
    st.d        t0, t8, 112
    ld.d        t0, a1, 104
    st.d        t0, t8, 104
    ld.d        t0, a1, 96
    st.d        t0, t8, 96
    ld.d        t0, a1, 88
    st.d        t0, t8, 88
    ld.d        t0, a1, 80
    st.d        t0, t8, 80
    ld.d        t0, a1, 72
    st.d        t0, t8, 72
    ld.d        t0, a1, 64
    st.d        t0, t8, 64
    ld.d        t0, a1, 56
    st.d        t0, t8, 56
    ld.d        t0, a1, 48
    st.d        t0, t8, 48
    ld.d        t0, a1, 40
    st.d        t0, t8, 40
    ld.d        t0, a1, 32
    st.d        t0, t8, 32
    ld.d        t0, a1, 24
    st.d        t0, t8, 24
    ld.d        t0, a1, 16
    st.d        t0, t8, 16
    ld.d        t0, a1, 8
    st.d        t0, t8, 8
    ld.d        t0, a1, 0
    st.d        t0, t8, 0

    /* Tail: the last 8 bytes, addressed from the buffer ends.  This
       covers the remaining 0..7 bytes and may overlap bytes already
       written above.  */
    ld.d        t0, a4, -8
    st.d        t0, a3, -8

    jr          ra
END(MEMCPY_NAME)

libc_hidden_builtin_def (MEMCPY_NAME)
#endif