diff options
Diffstat (limited to 'sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S')
-rw-r--r-- | sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S | 212 |
1 files changed, 212 insertions, 0 deletions
diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S new file mode 100644 index 0000000000..fc2498f7db --- /dev/null +++ b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S @@ -0,0 +1,212 @@ +/* Optimized strcpy stpcpy implementation using LoongArch LSX instructions. + Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <sys/regdef.h> +#include <sys/asm.h> + +#if IS_IN (libc) && !defined __loongarch_soft_float + +# ifndef STRCPY +# define STRCPY __strcpy_lsx +# endif + +LEAF(STRCPY, 6) + pcalau12i t0, %pc_hi20(L(INDEX)) + andi a4, a1, 0xf + vld vr1, t0, %pc_lo12(L(INDEX)) + move a2, a0 + + beqz a4, L(load_start) + xor t0, a1, a4 + vld vr0, t0, 0 + vreplgr2vr.b vr2, a4 + + vadd.b vr2, vr2, vr1 + vshuf.b vr0, vr2, vr0, vr2 + vsetanyeqz.b fcc0, vr0 + bcnez fcc0, L(end) + +L(load_start): + vld vr0, a1, 0 + li.d t1, 16 + andi a3, a2, 0xf + vsetanyeqz.b fcc0, vr0 + + + sub.d t0, t1, a3 + bcnez fcc0, L(end) + add.d a1, a1, t0 + vst vr0, a2, 0 + + andi a3, a1, 0xf + add.d a2, a2, t0 + bnez a3, L(unaligned) + vld vr0, a1, 0 + + vsetanyeqz.b fcc0, vr0 + bcnez fcc0, L(al_end) +L(al_loop): + vst vr0, a2, 0 + vld vr0, a1, 16 + + addi.d a2, a2, 16 + addi.d a1, a1, 16 + vsetanyeqz.b fcc0, vr0 + bceqz fcc0, L(al_loop) + + +L(al_end): + vmsknz.b vr1, vr0 + movfr2gr.s t0, fa1 + cto.w t0, t0 + add.d a1, a1, t0 + + vld vr0, a1, -15 +# ifdef USE_AS_STPCPY + add.d a0, a2, t0 + vst vr0, a0, -15 +# else + add.d a2, a2, t0 + vst vr0, a2, -15 +# endif + jr ra + +L(end): + vmsknz.b vr1, vr0 + movfr2gr.s t0, fa1 + cto.w t0, t0 + addi.d t0, t0, 1 + +L(end_16): + andi t1, t0, 16 + beqz t1, L(end_8) + vst vr0, a2, 0 +# ifdef USE_AS_STPCPY + addi.d a0, a2, 15 +# endif + jr ra + +L(end_8): + andi t2, t0, 8 + andi t3, t0, 4 + andi t4, t0, 2 + andi t5, t0, 1 + + beqz t2, L(end_4) + vstelm.d vr0, a2, 0, 0 + addi.d a2, a2, 8 + vbsrl.v vr0, vr0, 8 + +L(end_4): + beqz t3, L(end_2) + vstelm.w vr0, a2, 0, 0 + addi.d a2, a2, 4 + vbsrl.v vr0, vr0, 4 + +L(end_2): + beqz t4, L(end_1) + vstelm.h vr0, a2, 0, 0 + addi.d a2, a2, 2 + vbsrl.v vr0, vr0, 2 + + +L(end_1): + beqz t5, L(out) + vstelm.b vr0, a2, 0, 0 + addi.d a2, a2, 1 +L(out): +# ifdef USE_AS_STPCPY + addi.d a0, a2, -1 +# endif + jr ra + + .align 4 +L(unaligned): + bstrins.d a1, zero, 3, 0 + vld vr2, a1, 0 + vreplgr2vr.b vr3, a3 + vslt.b vr4, vr1, vr3 + + vor.v vr0, vr2, vr4 + vsetanyeqz.b fcc0, vr0 + bcnez fcc0, L(un_first_end) + vld vr0, a1, 16 + + vadd.b vr3, vr3, vr1 + vshuf.b vr4, vr0, vr2, vr3 + vsetanyeqz.b fcc0, vr0 + bcnez fcc0, L(un_end) + + + vor.v vr2, vr0, vr0 + addi.d a1, a1, 16 +L(un_loop): + vld vr0, a1, 16 + vst vr4, a2, 0 + + addi.d a2, a2, 16 + vshuf.b vr4, vr0, vr2, vr3 + vsetanyeqz.b fcc0, vr0 + bcnez fcc0, L(un_end) + + vld vr2, a1, 32 + vst vr4, a2, 0 + addi.d a1, a1, 32 + addi.d a2, a2, 16 + + vshuf.b vr4, vr2, vr0, vr3 + vsetanyeqz.b fcc0, vr2 + bceqz fcc0, L(un_loop) + vor.v vr0, vr2, vr2 + + + addi.d a1, a1, -16 +L(un_end): + vsetanyeqz.b fcc0, vr4 + bcnez fcc0, 1f + vst vr4, a2, 0 + +1: + vmsknz.b vr1, vr0 + movfr2gr.s t0, fa1 + cto.w t0, t0 + add.d a1, a1, t0 + + vld vr0, a1, 1 + add.d a2, a2, t0 + sub.d a2, a2, a3 + vst vr0, a2, 1 +# ifdef USE_AS_STPCPY + addi.d a0, a2, 16 +# endif + jr ra +L(un_first_end): + addi.d a2, a2, -16 + addi.d a1, a1, -16 + b 1b +END(STRCPY) + + .section .rodata.cst16,"M",@progbits,16 + .align 4 +L(INDEX): + .dword 0x0706050403020100 + .dword 0x0f0e0d0c0b0a0908 + +libc_hidden_builtin_def (STRCPY) +#endif |