Diffstat (limited to 'sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S')
 sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S | 212 ++++++++++++++++++++++++
 1 file changed, 212 insertions(+), 0 deletions(-)
diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S
new file mode 100644
index 0000000000..fc2498f7db
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S
@@ -0,0 +1,212 @@
+/* Optimized strcpy/stpcpy implementation using LoongArch LSX instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# ifndef STRCPY
+#  define STRCPY __strcpy_lsx
+# endif
+
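+/* strcpy (dest, src), and stpcpy when USE_AS_STPCPY is defined, copying
+   16 bytes per LSX operation.  a0 carries the return value (dest for
+   strcpy, the address of the copied NUL for stpcpy), a2 is the running
+   destination pointer, and vr1 holds the byte indices {0, 1, ..., 15}
+   loaded from L(INDEX), used to build vshuf.b control vectors.  */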
+LEAF(STRCPY, 6)
+    pcalau12i       t0, %pc_hi20(L(INDEX))
+    andi            a4, a1, 0xf
+    vld             vr1, t0, %pc_lo12(L(INDEX))
+    move            a2, a0
+
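+    /* When src is not 16-byte aligned (a4 != 0), load the aligned
+       block containing its first bytes and shuffle them to the front
+       of vr0.  Indices past the end of the block select bytes of vr2
+       itself, which are never zero, so only a genuine NUL in the head
+       takes the early exit.  The aligned load cannot touch the
+       following page.  */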
+    beqz            a4, L(load_start)
+    xor             t0, a1, a4
+    vld             vr0, t0, 0
+    vreplgr2vr.b    vr2, a4
+
+    vadd.b          vr2, vr2, vr1
+    vshuf.b         vr0, vr2, vr0, vr2
+    vsetanyeqz.b    fcc0, vr0
+    bcnez           fcc0, L(end)
+
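+/* Copy the first full 16 bytes with an unaligned store, then advance
+   both pointers by 16 - (dest & 0xf) so the destination becomes
+   16-byte aligned; the resulting src alignment selects the aligned or
+   the shuffling loop.  */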
+L(load_start):
+    vld             vr0, a1, 0
+    li.d            t1, 16
+    andi            a3, a2, 0xf
+    vsetanyeqz.b    fcc0, vr0
+
+
+    sub.d           t0, t1, a3
+    bcnez           fcc0, L(end)
+    add.d           a1, a1, t0
+    vst             vr0, a2, 0
+
+    andi            a3, a1, 0xf
+    add.d           a2, a2, t0
+    bnez            a3, L(unaligned)
+    vld             vr0, a1, 0
+
+    vsetanyeqz.b    fcc0, vr0
+    bcnez           fcc0, L(al_end)
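+/* src and dest are both 16-byte aligned here: copy one vector per
+   iteration until a chunk contains a NUL byte.  */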
+L(al_loop):
+    vst             vr0, a2, 0
+    vld             vr0, a1, 16
+
+    addi.d          a2, a2, 16
+    addi.d          a1, a1, 16
+    vsetanyeqz.b    fcc0, vr0
+    bceqz           fcc0, L(al_loop)
+
+
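+/* vmsknz.b sets bit i of a 16-bit mask iff byte i of vr0 is nonzero,
+   and fa1 overlaps the low half of vr1, so cto.w (count trailing ones)
+   on the mask yields the index of the first NUL.  The last 16 bytes,
+   ending exactly at the NUL, are copied with one overlapping unaligned
+   load/store pair.  */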
+L(al_end):
+    vmsknz.b        vr1, vr0
+    movfr2gr.s      t0, fa1
+    cto.w           t0, t0
+    add.d           a1, a1, t0
+
+    vld             vr0, a1, -15
+# ifdef USE_AS_STPCPY
+    add.d           a0, a2, t0
+    vst             vr0, a0, -15
+# else
+    add.d           a2, a2, t0
+    vst             vr0, a2, -15
+# endif
+    jr              ra
+
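+/* The NUL lies in the very first chunk, which is still in vr0.  t0
+   becomes the byte count to copy including the NUL; the stores below
+   decompose it into power-of-two pieces.  */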
+L(end):
+    vmsknz.b        vr1, vr0
+    movfr2gr.s      t0, fa1
+    cto.w           t0, t0
+    addi.d          t0, t0, 1
+
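+/* t0 == 16 only when the NUL is the last byte of the chunk: store the
+   whole vector and return.  */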
+L(end_16):
+    andi            t1, t0, 16
+    beqz            t1, L(end_8)
+    vst             vr0, a2, 0
+# ifdef USE_AS_STPCPY
+    addi.d          a0, a2, 15
+# endif
+    jr              ra
+
+L(end_8):
+    andi            t2, t0, 8
+    andi            t3, t0, 4
+    andi            t4, t0, 2
+    andi            t5, t0, 1
+
+    beqz            t2, L(end_4)
+    vstelm.d        vr0, a2, 0, 0
+    addi.d          a2, a2, 8
+    vbsrl.v         vr0, vr0, 8
+
+L(end_4):
+    beqz            t3, L(end_2)
+    vstelm.w        vr0, a2, 0, 0
+    addi.d          a2, a2, 4
+    vbsrl.v         vr0, vr0, 4
+
+L(end_2):
+    beqz            t4, L(end_1)
+    vstelm.h        vr0, a2, 0, 0
+    addi.d          a2, a2, 2
+    vbsrl.v         vr0, vr0, 2
+
+
+L(end_1):
+    beqz            t5, L(out)
+    vstelm.b        vr0, a2, 0, 0
+    addi.d          a2, a2, 1
+L(out):
+# ifdef USE_AS_STPCPY
+    addi.d          a0, a2, -1
+# endif
+    jr              ra
+
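+/* src and dest disagree in their 16-byte alignment.  Align src down
+   and work on aligned block pairs: vshuf.b with the control vector
+   vr3 = {a3, a3 + 1, ..., a3 + 15} merges two consecutive blocks into
+   one chunk aligned for the destination.  The vslt.b/vor.v pair forces
+   the a3 leading bytes of the first block, which precede the string,
+   to be nonzero so they cannot look like the terminator.  */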
+    .align          4
+L(unaligned):
+    bstrins.d       a1, zero, 3, 0
+    vld             vr2, a1, 0
+    vreplgr2vr.b    vr3, a3
+    vslt.b          vr4, vr1, vr3
+
+    vor.v           vr0, vr2, vr4
+    vsetanyeqz.b    fcc0, vr0
+    bcnez           fcc0, L(un_first_end)
+    vld             vr0, a1, 16
+
+    vadd.b          vr3, vr3, vr1
+    vshuf.b         vr4, vr0, vr2, vr3
+    vsetanyeqz.b    fcc0, vr0
+    bcnez           fcc0, L(un_end)
+
+
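+/* Steady state: vr2 and vr0 hold the previous and the current aligned
+   block and swap roles every 16 bytes, so each iteration copies 32
+   bytes with two load/shuffle/store rounds.  */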
+    vor.v           vr2, vr0, vr0
+    addi.d          a1, a1, 16
+L(un_loop):
+    vld             vr0, a1, 16
+    vst             vr4, a2, 0
+
+    addi.d          a2, a2, 16
+    vshuf.b         vr4, vr0, vr2, vr3
+    vsetanyeqz.b    fcc0, vr0
+    bcnez           fcc0, L(un_end)
+
+    vld             vr2, a1, 32
+    vst             vr4, a2, 0
+    addi.d          a1, a1, 32
+    addi.d          a2, a2, 16
+
+    vshuf.b         vr4, vr2, vr0, vr3
+    vsetanyeqz.b    fcc0, vr2
+    bceqz           fcc0, L(un_loop)
+    vor.v           vr0, vr2, vr2
+
+
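+/* The loop exited with the NUL in vr2, already copied to vr0, and a1
+   pointing at that block; step back 16 so the NUL block sits at
+   a1 + 16 as L(un_end) expects.  */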
+    addi.d          a1, a1, -16
+L(un_end):
+    vsetanyeqz.b    fcc0, vr4
+    bcnez           fcc0, 1f
+    vst             vr4, a2, 0
+
+1:
+    vmsknz.b        vr1, vr0
+    movfr2gr.s      t0, fa1
+    cto.w           t0, t0
+    add.d           a1, a1, t0
+
+    vld             vr0, a1, 1
+    add.d           a2, a2, t0
+    sub.d           a2, a2, a3
+    vst             vr0, a2, 1
+# ifdef USE_AS_STPCPY
+    addi.d          a0, a2, 16
+# endif
+    jr              ra
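+/* The NUL was already in the first partial block: bias a1 and a2 back
+   by one block so the shared tail code at 1b, which assumes the NUL
+   block lives at a1 + 16, computes the right addresses.  */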
+L(un_first_end):
+    addi.d          a2, a2, -16
+    addi.d          a1, a1, -16
+    b               1b
+END(STRCPY)
+
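+/* Byte indices {0, 1, ..., 15} used to build vshuf.b control
+   vectors.  */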
+    .section        .rodata.cst16,"M",@progbits,16
+    .align          4
+L(INDEX):
+    .dword          0x0706050403020100
+    .dword          0x0f0e0d0c0b0a0908
+
+libc_hidden_builtin_def (STRCPY)
+#endif