/* Optimized memmove implementation using LoongArch LSX instructions.
   Copyright (C) 2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <sys/regdef.h>
#include <sys/asm.h>

#if IS_IN (libc) && !defined __loongarch_soft_float

# define MEMCPY_NAME __memcpy_lsx
# define MEMMOVE_NAME __memmove_lsx

LEAF(MEMCPY_NAME, 6)
    li.d        t6, 16
    add.d       a3, a0, a2
    add.d       a4, a1, a2
    bgeu        t6, a2, L(less_16bytes)

    li.d        t8, 64
    li.d        t7, 32
    bltu        t8, a2, L(copy_long)
    bltu        t7, a2, L(more_32bytes)

    /* 17..32 bytes: two possibly overlapping 16-byte copies,
       one from each end.  */
    vld         vr0, a1, 0
    vld         vr1, a4, -16
    vst         vr0, a0, 0
    vst         vr1, a3, -16
    jr          ra
L(more_32bytes):
    vld         vr0, a1, 0
    vld         vr1, a1, 16
    vld         vr2, a4, -32
    vld         vr3, a4, -16
    vst         vr0, a0, 0
    vst         vr1, a0, 16
    vst         vr2, a3, -32
    vst         vr3, a3, -16
    jr          ra
L(less_16bytes):
    srli.d      t0, a2, 3
    beqz        t0, L(less_8bytes)
    vldrepl.d   vr0, a1, 0
    vldrepl.d   vr1, a4, -8
    vstelm.d    vr0, a0, 0, 0
    vstelm.d    vr1, a3, -8, 0
    jr          ra
L(less_8bytes):
    srli.d      t0, a2, 2
    beqz        t0, L(less_4bytes)
    vldrepl.w   vr0, a1, 0
    vldrepl.w   vr1, a4, -4
    vstelm.w    vr0, a0, 0, 0
    vstelm.w    vr1, a3, -4, 0
    jr          ra
L(less_4bytes):
    srli.d      t0, a2, 1
    beqz        t0, L(less_2bytes)
    vldrepl.h   vr0, a1, 0
    vldrepl.h   vr1, a4, -2
    vstelm.h    vr0, a0, 0, 0
    vstelm.h    vr1, a3, -2, 0
    jr          ra
L(less_2bytes):
    beqz        a2, L(less_1bytes)
    ld.b        t0, a1, 0
    st.b        t0, a0, 0
L(less_1bytes):
    jr          ra
    nop
END(MEMCPY_NAME)

LEAF(MEMMOVE_NAME, 6)
    li.d        t6, 16
    add.d       a3, a0, a2
    add.d       a4, a1, a2
    bgeu        t6, a2, L(less_16bytes)

    li.d        t8, 64
    li.d        t7, 32
    bltu        t8, a2, L(move_long)
    bltu        t7, a2, L(more_32bytes)

    vld         vr0, a1, 0
    vld         vr1, a4, -16
    vst         vr0, a0, 0
    vst         vr1, a3, -16
    jr          ra
    nop
L(move_long):
    /* Forward copy is safe unless dst lands inside [src, src + n);
       the unsigned test (a0 - a1) < a2 detects exactly that case.  */
    sub.d       t0, a0, a1
    bltu        t0, a2, L(copy_back)
L(copy_long):
    vld         vr2, a1, 0
    andi        t0, a0, 0xf
    sub.d       t0, t6, t0
    add.d       a1, a1, t0
    sub.d       a2, a2, t0
    andi        t1, a1, 0xf
    bnez        t1, L(unaligned)

    vld         vr0, a1, 0
    addi.d      a2, a2, -16
    vst         vr2, a0, 0
    andi        t2, a2, 0x7f
    add.d       a5, a0, t0
    beq         a2, t2, L(al_less_128)

    sub.d       t3, a2, t2
    move        a2, t2
    add.d       a6, a1, t3
L(al_loop):
    vld         vr1, a1, 16
    vld         vr2, a1, 32
    vld         vr3, a1, 48
    vld         vr4, a1, 64
    vld         vr5, a1, 80
    vld         vr6, a1, 96
    vld         vr7, a1, 112

    vst         vr0, a5, 0
    vld         vr0, a1, 128
    addi.d      a1, a1, 128
    vst         vr1, a5, 16
    vst         vr2, a5, 32
    vst         vr3, a5, 48
    vst         vr4, a5, 64
    vst         vr5, a5, 80
    vst         vr6, a5, 96
    vst         vr7, a5, 112
    addi.d      a5, a5, 128
    bne         a1, a6, L(al_loop)
L(al_less_128):
    blt         a2, t8, L(al_less_64)
    vld         vr1, a1, 16
    vld         vr2, a1, 32
    vld         vr3, a1, 48
    addi.d      a2, a2, -64
    vst         vr0, a5, 0
    vld         vr0, a1, 64
    addi.d      a1, a1, 64
    vst         vr1, a5, 16
    vst         vr2, a5, 32
    vst         vr3, a5, 48
    addi.d      a5, a5, 64
L(al_less_64):
    blt         a2, t7, L(al_less_32)
    vld         vr1, a1, 16
    addi.d      a2, a2, -32
    vst         vr0, a5, 0
    vld         vr0, a1, 32
    addi.d      a1, a1, 32
    vst         vr1, a5, 16
    addi.d      a5, a5, 32
L(al_less_32):
    blt         a2, t6, L(al_less_16)
    vst         vr0, a5, 0
    vld         vr0, a1, 16
    addi.d      a5, a5, 16
L(al_less_16):
    vld         vr1, a4, -16
    vst         vr0, a5, 0
    vst         vr1, a3, -16
    jr          ra
    nop
L(unaligned):
    pcalau12i   t2, %pc_hi20(L(INDEX))
    bstrins.d   a1, zero, 3, 0
    vld         vr8, t2, %pc_lo12(L(INDEX))
    vld         vr0, a1, 0
    vld         vr1, a1, 16
    addi.d      a2, a2, -16
    vst         vr2, a0, 0
    add.d       a5, a0, t0
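    /* Unaligned-source forward path: t1 is the source offset within a
       16-byte chunk, and a1 has been rounded down to an aligned address.
       Broadcast t1 and add the byte indices 0..15 from L(INDEX), so that
       vshuf.b extracts the 16 wanted bytes from each pair of adjacent
       aligned chunks; the main loop then needs no unaligned vector
       loads.  */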
    vreplgr2vr.b vr9, t1
    andi        t2, a2, 0x7f
    vadd.b      vr9, vr9, vr8
    addi.d      a1, a1, 32
    beq         t2, a2, L(un_less_128)

    sub.d       t3, a2, t2
    move        a2, t2
    add.d       a6, a1, t3
L(un_loop):
    vld         vr2, a1, 0
    vld         vr3, a1, 16
    vld         vr4, a1, 32
    vld         vr5, a1, 48
    vld         vr6, a1, 64
    vld         vr7, a1, 80

    vshuf.b     vr8, vr1, vr0, vr9
    vld         vr0, a1, 96
    vst         vr8, a5, 0
    vshuf.b     vr8, vr2, vr1, vr9
    vld         vr1, a1, 112
    vst         vr8, a5, 16
    addi.d      a1, a1, 128
    vshuf.b     vr2, vr3, vr2, vr9
    vshuf.b     vr3, vr4, vr3, vr9
    vst         vr2, a5, 32
    vshuf.b     vr4, vr5, vr4, vr9
    vst         vr3, a5, 48
    vshuf.b     vr5, vr6, vr5, vr9
    vst         vr4, a5, 64
    vshuf.b     vr6, vr7, vr6, vr9
    vst         vr5, a5, 80
    vshuf.b     vr7, vr0, vr7, vr9
    vst         vr6, a5, 96
    vst         vr7, a5, 112
    addi.d      a5, a5, 128
    bne         a1, a6, L(un_loop)
L(un_less_128):
    blt         a2, t8, L(un_less_64)
    vld         vr2, a1, 0
    vld         vr3, a1, 16

    vshuf.b     vr4, vr1, vr0, vr9
    vld         vr0, a1, 32
    vst         vr4, a5, 0
    addi.d      a2, a2, -64
    vshuf.b     vr4, vr2, vr1, vr9
    vld         vr1, a1, 48
    addi.d      a1, a1, 64
    vst         vr4, a5, 16
    vshuf.b     vr2, vr3, vr2, vr9
    vshuf.b     vr3, vr0, vr3, vr9
    vst         vr2, a5, 32
    vst         vr3, a5, 48
    addi.d      a5, a5, 64
L(un_less_64):
    blt         a2, t7, L(un_less_32)
    vshuf.b     vr3, vr1, vr0, vr9
    vld         vr0, a1, 0
    vst         vr3, a5, 0
    addi.d      a2, a2, -32
    vshuf.b     vr3, vr0, vr1, vr9
    vld         vr1, a1, 16
    addi.d      a1, a1, 32
    vst         vr3, a5, 16
    addi.d      a5, a5, 32
L(un_less_32):
    blt         a2, t6, L(un_less_16)
    vshuf.b     vr2, vr1, vr0, vr9
    vor.v       vr0, vr1, vr1
    vld         vr1, a1, 0
    vst         vr2, a5, 0
    addi.d      a5, a5, 16
L(un_less_16):
    vld         vr2, a4, -16
    vshuf.b     vr0, vr1, vr0, vr9
    vst         vr0, a5, 0
    vst         vr2, a3, -16
    jr          ra
L(copy_back):
    addi.d      t0, a3, -1
    vld         vr2, a4, -16
    andi        t0, t0, 0xf
    addi.d      t0, t0, 1
    sub.d       a4, a4, t0
    sub.d       a2, a2, t0
    andi        t1, a4, 0xf
    bnez        t1, L(back_unaligned)

    vld         vr0, a4, -16
    addi.d      a2, a2, -16
    vst         vr2, a3, -16
    andi        t2, a2, 0x7f
    sub.d       a3, a3, t0
    beq         t2, a2, L(back_al_less_128)

    sub.d       t3, a2, t2
    move        a2, t2
    sub.d       a6, a4, t3
L(back_al_loop):
    vld         vr1, a4, -32
    vld         vr2, a4, -48
    vld         vr3, a4, -64
    vld         vr4, a4, -80
    vld         vr5, a4, -96
    vld         vr6, a4, -112
    vld         vr7, a4, -128

    vst         vr0, a3, -16
    vld         vr0, a4, -144
    addi.d      a4, a4, -128
    vst         vr1, a3, -32
    vst         vr2, a3, -48
    vst         vr3, a3, -64
    vst         vr4, a3, -80
    vst         vr5, a3, -96
    vst         vr6, a3, -112
    vst         vr7, a3, -128
    addi.d      a3, a3, -128
    bne         a4, a6, L(back_al_loop)
L(back_al_less_128):
    blt         a2, t8, L(back_al_less_64)
    vld         vr1, a4, -32
    vld         vr2, a4, -48
    vld         vr3, a4, -64
    addi.d      a2, a2, -64
    vst         vr0, a3, -16
    vld         vr0, a4, -80
    addi.d      a4, a4, -64
    vst         vr1, a3, -32
    vst         vr2, a3, -48
    vst         vr3, a3, -64
    addi.d      a3, a3, -64
L(back_al_less_64):
    blt         a2, t7, L(back_al_less_32)
    vld         vr1, a4, -32
    addi.d      a2, a2, -32
    vst         vr0, a3, -16
    vld         vr0, a4, -48
    vst         vr1, a3, -32
    addi.d      a3, a3, -32
    addi.d      a4, a4, -32
L(back_al_less_32):
    blt         a2, t6, L(back_al_less_16)
    vst         vr0, a3, -16
    vld         vr0, a4, -32
    addi.d      a3, a3, -16
L(back_al_less_16):
    vld         vr1, a1, 0
    vst         vr0, a3, -16
    vst         vr1, a0, 0
    jr          ra
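    /* Backward variant of the unaligned path: walk the source from its
       end and assemble each 16-byte store with vshuf.b from two adjacent
       aligned chunks, mirroring L(unaligned) above.  */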
L(back_unaligned):
    pcalau12i   t2, %pc_hi20(L(INDEX))
    bstrins.d   a4, zero, 3, 0
    vld         vr8, t2, %pc_lo12(L(INDEX))
    vld         vr0, a4, 0
    vld         vr1, a4, -16
    addi.d      a2, a2, -16
    vst         vr2, a3, -16

    sub.d       a3, a3, t0
    vreplgr2vr.b vr9, t1
    andi        t2, a2, 0x7f
    vadd.b      vr9, vr9, vr8
    addi.d      a4, a4, -16
    beq         t2, a2, L(back_un_less_128)

    sub.d       t3, a2, t2
    move        a2, t2
    sub.d       a6, a4, t3
L(back_un_loop):
    vld         vr2, a4, -16
    vld         vr3, a4, -32
    vld         vr4, a4, -48
    vld         vr5, a4, -64
    vld         vr6, a4, -80
    vld         vr7, a4, -96

    vshuf.b     vr8, vr0, vr1, vr9
    vld         vr0, a4, -112
    vst         vr8, a3, -16
    vshuf.b     vr8, vr1, vr2, vr9
    vld         vr1, a4, -128
    vst         vr8, a3, -32
    addi.d      a4, a4, -128
    vshuf.b     vr2, vr2, vr3, vr9
    vshuf.b     vr3, vr3, vr4, vr9
    vst         vr2, a3, -48
    vshuf.b     vr4, vr4, vr5, vr9
    vst         vr3, a3, -64
    vshuf.b     vr5, vr5, vr6, vr9
    vst         vr4, a3, -80
    vshuf.b     vr6, vr6, vr7, vr9
    vst         vr5, a3, -96
    vshuf.b     vr7, vr7, vr0, vr9
    vst         vr6, a3, -112
    vst         vr7, a3, -128
    addi.d      a3, a3, -128
    bne         a4, a6, L(back_un_loop)
L(back_un_less_128):
    blt         a2, t8, L(back_un_less_64)
    vld         vr2, a4, -16
    vld         vr3, a4, -32

    vshuf.b     vr4, vr0, vr1, vr9
    vld         vr0, a4, -48
    vst         vr4, a3, -16
    addi.d      a2, a2, -64
    vshuf.b     vr4, vr1, vr2, vr9
    vld         vr1, a4, -64
    addi.d      a4, a4, -64
    vst         vr4, a3, -32
    vshuf.b     vr2, vr2, vr3, vr9
    vshuf.b     vr3, vr3, vr0, vr9
    vst         vr2, a3, -48
    vst         vr3, a3, -64
    addi.d      a3, a3, -64
L(back_un_less_64):
    blt         a2, t7, L(back_un_less_32)
    vshuf.b     vr3, vr0, vr1, vr9
    vld         vr0, a4, -16
    vst         vr3, a3, -16
    addi.d      a2, a2, -32
    vshuf.b     vr3, vr1, vr0, vr9
    vld         vr1, a4, -32
    addi.d      a4, a4, -32
    vst         vr3, a3, -32
    addi.d      a3, a3, -32
L(back_un_less_32):
    blt         a2, t6, L(back_un_less_16)
    vshuf.b     vr2, vr0, vr1, vr9
    vor.v       vr0, vr1, vr1
    vld         vr1, a4, -16
    vst         vr2, a3, -16
    addi.d      a3, a3, -16
L(back_un_less_16):
    vld         vr2, a1, 0
    vshuf.b     vr0, vr0, vr1, vr9
    vst         vr0, a3, -16
    vst         vr2, a0, 0
    jr          ra
END(MEMMOVE_NAME)

    .section .rodata.cst16,"M",@progbits,16
    .align 4
L(INDEX):
    .dword 0x0706050403020100
    .dword 0x0f0e0d0c0b0a0908

libc_hidden_builtin_def (MEMCPY_NAME)
libc_hidden_builtin_def (MEMMOVE_NAME)
#endif