/* Optimized strcpy/stpcpy implementation using LoongArch LASX instructions.
   Copyright (C) 2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <sys/regdef.h>
#include <sys/asm.h>

#if IS_IN (libc) && !defined __loongarch_soft_float

# ifndef STRCPY
#  define STRCPY __strcpy_lasx
# endif

/* stpcpy returns a pointer to the copied NUL terminator, so the end
   pointer is computed directly into the return register a0.  strcpy
   returns the original destination, so the end pointer goes into the
   scratch register a4 and a0 is left untouched.  */
# ifdef USE_AS_STPCPY
#  define dstend a0
# else
#  define dstend a4
# endif

/* a0 = dest, a1 = src.  The entry point is aligned to 64 bytes.  */
LEAF(STRCPY, 6)
    /* t8 = 0xfe0 = 4096 - 32: if (src & 0xfff) > 0xfe0, fewer than 32
       bytes remain in the page and a full 32-byte load could fault
       past the end of the string.  */
    ori             t8, zero, 0xfe0
    andi            t0, a1, 0xfff
    li.d            t7, -1
    move            a2, a0

    bltu            t8, t0, L(page_cross_start)
L(start_entry):
    /* Load the first 32 bytes and test the whole vector for a NUL.  */
    xvld            xr0, a1, 0
    li.d            t0, 32
    andi            t1, a2, 0x1f
    xvsetanyeqz.b   fcc0, xr0

    sub.d           t0, t0, t1
    bcnez           fcc0, L(end)
    /* No NUL: store 32 bytes, then advance both pointers by
       32 - (dest & 0x1f) so the destination becomes 32-byte
       aligned.  */
    add.d           a1, a1, t0
    xvst            xr0, a2, 0

    andi            a3, a1, 0x1f
    add.d           a2, a2, t0
    /* If the source did not also become aligned, take the slower
       loop that rechecks for page crossings on every load.  */
    bnez            a3, L(unaligned)
    xvld            xr0, a1, 0

    xvsetanyeqz.b   fcc0, xr0
    bcnez           fcc0, L(al_end)
    /* Both pointers aligned: copy 32 bytes per iteration until a
       vector containing a NUL byte is loaded.  */
L(al_loop):
    xvst            xr0, a2, 0
    xvld            xr0, a1, 32

    addi.d          a2, a2, 32
    addi.d          a1, a1, 32
    xvsetanyeqz.b   fcc0, xr0
    bceqz           fcc0, L(al_loop)

L(al_end):
    /* Build a 32-bit mask with one bit per byte (0 only for a NUL
       byte): xvmsknz.b gives a 16-bit mask per 128-bit lane, and the
       pickve/ilvl pair glues the two lane masks together.  Counting
       trailing ones then yields the index of the NUL.  */
    xvmsknz.b       xr0, xr0
    xvpickve.w      xr1, xr0, 4
    vilvl.h         vr0, vr1, vr0
    movfr2gr.s      t0, fa0

    cto.w           t0, t0
    add.d           a1, a1, t0
    /* Finish with one overlapping 32-byte copy ending exactly at
       the NUL.  */
    xvld            xr0, a1, -31
    add.d           dstend, a2, t0

    xvst            xr0, dstend, -31
    jr              ra
    nop

L(page_cross_start):
    /* The first load would cross a page: load from the previous
       32-byte aligned address instead, then shift the non-zero mask
       right by (src & 0x1f) so only bytes at or after src count.  The
       arithmetic shift fills with ones, so t0 == -1 (t7) means no NUL
       in the remainder of this page.  */
    move            a4, a1
    bstrins.d       a4, zero, 4, 0
    xvld            xr0, a4, 0
    xvmsknz.b       xr0, xr0

    xvpickve.w      xr1, xr0, 4
    vilvl.h         vr0, vr1, vr0
    movfr2gr.s      t0, fa0
    sra.w           t0, t0, a1

    beq             t0, t7, L(start_entry)
    b               L(tail)

L(unaligned):
    /* Source is not 32-byte aligned: every loop load must first check
       whether it would cross a page boundary.  */
    andi            t0, a1, 0xfff
    bltu            t8, t0, L(un_page_cross)

L(un_start_entry):
    xvld            xr0, a1, 0
    xvsetanyeqz.b   fcc0, xr0
    bcnez           fcc0, L(un_end)
    addi.d          a1, a1, 32

L(un_loop):
    xvst            xr0, a2, 0
    andi            t0, a1, 0xfff
    addi.d          a2, a2, 32
    bltu            t8, t0, L(page_cross_loop)

L(un_loop_entry):
    xvld            xr0, a1, 0
    addi.d          a1, a1, 32
    xvsetanyeqz.b   fcc0, xr0
    bceqz           fcc0, L(un_loop)

    addi.d          a1, a1, -32
L(un_end):
    xvmsknz.b       xr0, xr0
    xvpickve.w      xr1, xr0, 4
    vilvl.h         vr0, vr1, vr0

    movfr2gr.s      t0, fa0
L(un_tail):
    /* t0 holds the non-zero byte mask: locate the NUL and finish with
       one overlapping 32-byte copy ending at it.  */
    cto.w           t0, t0
    add.d           a1, a1, t0
    xvld            xr0, a1, -31

    add.d           dstend, a2, t0
    xvst            xr0, dstend, -31
    jr              ra

L(un_page_cross):
    /* a3 = src & 0x1f from above: back up to the previous 32-byte
       aligned address and mask off bytes before src, as in
       L(page_cross_start).  */
    sub.d           a4, a1, a3
    xvld            xr0, a4, 0
    xvmsknz.b       xr0, xr0
    xvpickve.w      xr1, xr0, 4

    vilvl.h         vr0, vr1, vr0
    movfr2gr.s      t0, fa0
    sra.w           t0, t0, a1
    beq             t0, t7, L(un_start_entry)

    b               L(un_tail)

L(page_cross_loop):
    /* Same page-cross handling for loads inside the unaligned loop.  */
    sub.d           a4, a1, a3
    xvld            xr0, a4, 0
    xvmsknz.b       xr0, xr0
    xvpickve.w      xr1, xr0, 4

    vilvl.h         vr0, vr1, vr0
    movfr2gr.s      t0, fa0
    sra.w           t0, t0, a1
    beq             t0, t7, L(un_loop_entry)

    b               L(un_tail)

L(end):
    /* A NUL lies within the first 32 bytes: build the byte mask, then
       copy t0 + 1 bytes with progressively smaller, possibly
       overlapping pairs of loads and stores.  */
    xvmsknz.b       xr0, xr0
    xvpickve.w      xr1, xr0, 4
    vilvl.h         vr0, vr1, vr0

    movfr2gr.s      t0, fa0
L(tail):
    /* t0 = index of the NUL; a5 and dstend point at the NUL in the
       source and destination respectively.  */
    cto.w           t0, t0
    add.d           dstend, a2, t0
    add.d           a5, a1, t0

L(less_32):
    /* 16 to 31 bytes: one 16-byte store from the start and one
       ending at the NUL.  */
    srli.d          t1, t0, 4
    beqz            t1, L(less_16)
    vld             vr0, a1, 0
    vld             vr1, a5, -15

    vst             vr0, a2, 0
    vst             vr1, dstend, -15
    jr              ra

L(less_16):
    /* 8 to 15 bytes: the same trick with two 8-byte stores.  */
    srli.d          t1, t0, 3
    beqz            t1, L(less_8)
    ld.d            t2, a1, 0
    ld.d            t3, a5, -7

    st.d            t2, a2, 0
    st.d            t3, dstend, -7
    jr              ra

L(less_8):
    /* 3 to 7 bytes: two overlapping 4-byte stores.  */
    li.d            t1, 3
    bltu            t0, t1, L(less_3)
    ld.w            t2, a1, 0
    ld.w            t3, a5, -3

    st.w            t2, a2, 0
    st.w            t3, dstend, -3
    jr              ra

L(less_3):
    /* 1 or 2 bytes: copy a halfword, then store the NUL explicitly;
       0 bytes: store the NUL alone.  */
    beqz            t0, L(zero_byte)
    ld.h            t2, a1, 0
    st.h            t2, a2, 0
L(zero_byte):
    st.b            zero, dstend, 0
    jr              ra
END(STRCPY)

libc_hidden_builtin_def (STRCPY)
#endif
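
/* For reference, a minimal C sketch of the behavior the routine above
   vectorizes.  strcpy_ref is a hypothetical name for illustration only;
   it is not part of glibc and the "#if 0" keeps it out of the build.
   The LASX body performs the same copy 32 bytes at a time, testing each
   vector for a NUL with xvsetanyeqz.b and finishing with overlapping
   vector stores instead of this byte loop.  */
#if 0
#include <stddef.h>

static char *
strcpy_ref (char *dst, const char *src)
{
  char *d = dst;

  /* Copy every byte up to and including the terminating NUL.  */
  while ((*d++ = *src++) != '\0')
    ;

  /* strcpy returns the original destination; the stpcpy variant
     (USE_AS_STPCPY above) instead returns d - 1, a pointer to the
     copied NUL terminator.  */
  return dst;
}
#endif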