about summary refs log tree commit diff
path: root/sysdeps/loongarch/lp64/multiarch/strcpy-lasx.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/loongarch/lp64/multiarch/strcpy-lasx.S')
-rw-r--r--sysdeps/loongarch/lp64/multiarch/strcpy-lasx.S215
1 files changed, 215 insertions, 0 deletions
diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-lasx.S b/sysdeps/loongarch/lp64/multiarch/strcpy-lasx.S
new file mode 100644
index 0000000000..c282561253
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-lasx.S
@@ -0,0 +1,215 @@
+/* Optimized strcpy stpcpy implementation using LoongArch LASX instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# ifndef STRCPY
+#  define STRCPY __strcpy_lasx
+# endif
+
+# ifdef USE_AS_STPCPY
+#  define dstend a0
+# else
+#  define dstend a4
+# endif
+
+LEAF(STRCPY, 6)
+    ori             t8, zero, 0xfe0
+    andi            t0, a1, 0xfff
+    li.d            t7, -1
+    move            a2, a0
+
+    bltu            t8, t0, L(page_cross_start)
+L(start_entry):
+    xvld            xr0, a1, 0
+    li.d            t0, 32
+    andi            t1, a2, 0x1f
+
+    xvsetanyeqz.b   fcc0, xr0
+    sub.d           t0, t0, t1
+    bcnez           fcc0, L(end)
+    add.d           a1, a1, t0
+
+    xvst            xr0, a2, 0
+    andi            a3, a1, 0x1f
+    add.d           a2, a2, t0
+    bnez            a3, L(unaligned)
+
+
+    xvld            xr0, a1, 0
+    xvsetanyeqz.b   fcc0, xr0
+    bcnez           fcc0, L(al_end)
+L(al_loop):
+    xvst            xr0, a2, 0
+
+    xvld            xr0, a1, 32
+    addi.d          a2, a2, 32
+    addi.d          a1, a1, 32
+    xvsetanyeqz.b   fcc0, xr0
+
+    bceqz           fcc0, L(al_loop)
+L(al_end):
+    xvmsknz.b       xr0, xr0
+    xvpickve.w      xr1, xr0, 4
+    vilvl.h         vr0, vr1, vr0
+
+    movfr2gr.s      t0, fa0
+    cto.w           t0, t0
+    add.d           a1, a1, t0
+    xvld            xr0, a1, -31
+
+
+    add.d           dstend, a2, t0
+    xvst            xr0, dstend, -31
+    jr              ra
+    nop
+
+L(page_cross_start):
+    move            a4, a1
+    bstrins.d       a4, zero, 4, 0
+    xvld            xr0, a4, 0
+    xvmsknz.b       xr0, xr0
+
+    xvpickve.w      xr1, xr0, 4
+    vilvl.h         vr0, vr1, vr0
+    movfr2gr.s      t0, fa0
+    sra.w           t0, t0, a1
+
+    beq             t0, t7, L(start_entry)
+    b               L(tail)
+L(unaligned):
+    andi            t0, a1, 0xfff
+    bltu            t8, t0, L(un_page_cross)
+
+
+L(un_start_entry):
+    xvld            xr0, a1, 0
+    xvsetanyeqz.b   fcc0, xr0
+    bcnez           fcc0, L(un_end)
+    addi.d          a1, a1, 32
+
+L(un_loop):
+    xvst            xr0, a2, 0
+    andi            t0, a1, 0xfff
+    addi.d          a2, a2, 32
+    bltu            t8, t0, L(page_cross_loop)
+
+L(un_loop_entry):
+    xvld            xr0, a1, 0
+    addi.d          a1, a1, 32
+    xvsetanyeqz.b   fcc0, xr0
+    bceqz           fcc0, L(un_loop)
+
+    addi.d          a1, a1, -32
+L(un_end):
+    xvmsknz.b       xr0, xr0
+    xvpickve.w      xr1, xr0, 4
+    vilvl.h         vr0, vr1, vr0
+
+
+    movfr2gr.s      t0, fa0
+L(un_tail):
+    cto.w           t0, t0
+    add.d           a1, a1, t0
+    xvld            xr0, a1, -31
+
+    add.d           dstend, a2, t0
+    xvst            xr0, dstend, -31
+    jr              ra
+L(un_page_cross):
+    sub.d           a4, a1, a3
+
+    xvld            xr0, a4, 0
+    xvmsknz.b       xr0, xr0
+    xvpickve.w      xr1, xr0, 4
+    vilvl.h         vr0, vr1, vr0
+
+    movfr2gr.s      t0, fa0
+    sra.w           t0, t0, a1
+    beq             t0, t7, L(un_start_entry)
+    b               L(un_tail)
+
+
+L(page_cross_loop):
+    sub.d           a4, a1, a3
+    xvld            xr0, a4, 0
+    xvmsknz.b       xr0, xr0
+    xvpickve.w      xr1, xr0, 4
+
+    vilvl.h         vr0, vr1, vr0
+    movfr2gr.s      t0, fa0
+    sra.w           t0, t0, a1
+    beq             t0, t7, L(un_loop_entry)
+
+    b               L(un_tail)
+L(end):
+    xvmsknz.b       xr0, xr0
+    xvpickve.w      xr1, xr0, 4
+    vilvl.h         vr0, vr1, vr0
+
+    movfr2gr.s      t0, fa0
+L(tail):
+    cto.w           t0, t0
+    add.d           dstend, a2, t0
+    add.d           a5, a1, t0
+
+L(less_32):
+    srli.d          t1, t0, 4
+    beqz            t1, L(less_16)
+    vld             vr0, a1, 0
+    vld             vr1, a5, -15
+
+    vst             vr0, a2, 0
+    vst             vr1, dstend, -15
+    jr              ra
+L(less_16):
+    srli.d          t1, t0, 3
+
+    beqz            t1, L(less_8)
+    ld.d            t2, a1, 0
+    ld.d            t3, a5, -7
+    st.d            t2, a2, 0
+
+    st.d            t3, dstend, -7
+    jr              ra
+L(less_8):
+    li.d            t1, 3
+    bltu            t0, t1, L(less_3)
+
+    ld.w            t2, a1, 0
+    ld.w            t3, a5, -3
+    st.w            t2, a2, 0
+    st.w            t3, dstend, -3
+
+    jr              ra
+L(less_3):
+    beqz            t0, L(zero_byte)
+    ld.h            t2, a1, 0
+
+    st.h            t2, a2, 0
+L(zero_byte):
+    st.b            zero, dstend, 0
+    jr              ra
+END(STRCPY)
+
+libc_hidden_builtin_def (STRCPY)
+#endif