Diffstat (limited to 'sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S')
-rw-r--r--  sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S  202
1 file changed, 202 insertions(+), 0 deletions(-)
diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S
new file mode 100644
index 0000000000..4ed539fdb4
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S
@@ -0,0 +1,202 @@
+/* Optimized strcpy/stpcpy aligned implementation using basic LoongArch
+   instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# ifndef STRCPY
+#  define STRCPY __strcpy_aligned
+# endif
+#else
+# ifndef STRCPY
+#  define STRCPY strcpy
+# endif
+#endif
+
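+/* Arguments: a0 = destination, a1 = source.  a2 walks the destination.
+   Without USE_AS_STPCPY the return value a0 is left untouched; with it,
+   a0 is set on every exit path to the address of the copied NUL
+   terminator.  */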
+LEAF(STRCPY, 6)
+    andi        a3, a0, 0x7
+    move        a2, a0
+    beqz        a3, L(dest_align)
+    sub.d       a5, a1, a3
+    addi.d      a5, a5, 8
+
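+/* Destination is not 8-byte aligned: copy one byte at a time until it
+   is (a1 reaches a5) or until the NUL terminator has been copied.  */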
+L(make_dest_align):
+    ld.b        t0, a1, 0
+    addi.d      a1, a1, 1
+    st.b        t0, a2, 0
+    addi.d      a2, a2, 1
+    beqz        t0, L(al_out)
+
+    bne         a1, a5, L(make_dest_align)
+
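+/* Destination is aligned.  Remember the source misalignment in a4,
+   round the source down to an 8-byte boundary (clear bits 2:0), and
+   build the NUL-scan constants t5 = 0x0101010101010101 and
+   t6 = 0x8080808080808080: (x - t5) & ~x & t6 is nonzero iff the
+   word x contains a zero byte.  */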
+L(dest_align):
+    andi        a4, a1, 7
+    bstrins.d   a1, zero, 2, 0
+
+    lu12i.w     t5, 0x1010
+    ld.d        t0, a1, 0
+    ori         t5, t5, 0x101
+    bstrins.d   t5, t5, 63, 32
+
+    slli.d      t6, t5, 0x7
+    bnez        a4, L(unalign)
+    sub.d       t1, t0, t5
+    andn        t2, t6, t0
+
+    and         t3, t1, t2
+    bnez        t3, L(al_end)
+
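+/* Source and destination are both aligned: store a word and load the
+   next one until a word containing a NUL byte turns up.  */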
+L(al_loop):
+    st.d        t0, a2, 0
+    ld.d        t0, a1, 8
+
+    addi.d      a1, a1, 8
+    addi.d      a2, a2, 8
+    sub.d       t1, t0, t5
+    andn        t2, t6, t0
+
+    and         t3, t1, t2
+    beqz        t3, L(al_loop)
+
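+/* t3 marks the first NUL byte with its 0x80 bit.  Convert that into
+   t1 = number of bytes to copy including the NUL (1..8), then store the
+   tail as one 8-byte store or as a 4/2/1-byte sequence driven by the
+   bits of t1.  */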
+L(al_end):
+    ctz.d       t1, t3
+    srli.d      t1, t1, 3
+    addi.d      t1, t1, 1
+
+    andi        a3, t1, 8
+    andi        a4, t1, 4
+    andi        a5, t1, 2
+    andi        a6, t1, 1
+
+L(al_end_8):
+    beqz        a3, L(al_end_4)
+    st.d        t0, a2, 0
+#ifdef USE_AS_STPCPY
+    addi.d      a0, a2, 7
+#endif
+    jr          ra
+L(al_end_4):
+    beqz        a4, L(al_end_2)
+    st.w        t0, a2, 0
+    addi.d      a2, a2, 4
+    srli.d      t0, t0, 32
+L(al_end_2):
+    beqz        a5, L(al_end_1)
+    st.h        t0, a2, 0
+    addi.d      a2, a2, 2
+    srli.d      t0, t0, 16
+L(al_end_1):
+    beqz        a6, L(al_out)
+    st.b        t0, a2, 0
+    addi.d      a2, a2, 1
+L(al_out):
+#ifdef USE_AS_STPCPY
+    addi.d      a0, a2, -1
+#endif
+    jr          ra
+
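+/* Source and destination alignments differ: read aligned words from the
+   source and shift/merge adjacent pairs into whole destination words.
+   a5 is the misalignment in bits and a6 = -a5 acts as 64 - a5, since
+   shifts use only the low six bits of the count.  In the first, partial
+   word the positions that will be filled from the next source word are
+   forced to 0xff so they cannot be mistaken for a NUL.  */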
+    .align      4
+L(unalign):
+    slli.d      a5, a4, 3
+    li.d        t1, -1
+    sub.d       a6, zero, a5
+
+    srl.d       a7, t0, a5
+    sll.d       t7, t1, a6
+
+    or          t0, a7, t7
+    sub.d       t1, t0, t5
+    andn        t2, t6, t0
+    and         t3, t1, t2
+
+    bnez        t3, L(un_end)
+
+    ld.d        t4, a1, 8
+
+    sub.d       t1, t4, t5
+    andn        t2, t6, t4
+    sll.d       t0, t4, a6
+    and         t3, t1, t2
+
+    or          t0, t0, a7
+    bnez        t3, L(un_end_with_remaining)
+
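+/* Steady state: a7 carries the leftover top bytes of the previous
+   source word; merge them with the next word shifted up, store eight
+   bytes per iteration, and stop as soon as a freshly loaded word
+   contains a NUL byte.  */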
+L(un_loop):
+    srl.d       a7, t4, a5
+
+    ld.d        t4, a1, 16
+    addi.d      a1, a1, 8
+
+    st.d        t0, a2, 0
+    addi.d      a2, a2, 8
+
+    sub.d       t1, t4, t5
+    andn        t2, t6, t4
+    sll.d       t0, t4, a6
+    and         t3, t1, t2
+
+    or          t0, t0, a7
+    beqz        t3, L(un_loop)
+
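+/* The NUL byte is in the freshly loaded word t4.  After the subtract,
+   t1 = (bytes still to store) - 8: negative means the merged word t0
+   already ends past the NUL, so fewer than eight bytes remain (their
+   count is in the low three bits of t1); zero means t0 is exactly the
+   tail; positive means t0 is stored whole and t1 more bytes follow
+   from t4.  */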
+L(un_end_with_remaining):
+    ctz.d       t1, t3
+    srli.d      t1, t1, 3
+    addi.d      t1, t1, 1
+    sub.d       t1, t1, a4
+
+    blt         t1, zero, L(un_end_less_8)
+    st.d        t0, a2, 0
+    addi.d      a2, a2, 8
+    beqz        t1, L(un_out)
+    srl.d       t0, t4, a5
+    b           L(un_end_less_8)
+
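+/* The NUL byte already sits in the first partial word, so at most seven
+   bytes (including the NUL) remain to be stored.  */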
+L(un_end):
+    ctz.d       t1, t3
+    srli.d      t1, t1, 3
+    addi.d      t1, t1, 1
+
+L(un_end_less_8):
+    andi        a4, t1, 4
+    andi        a5, t1, 2
+    andi        a6, t1, 1
+L(un_end_4):
+    beqz        a4, L(un_end_2)
+    st.w        t0, a2, 0
+    addi.d      a2, a2, 4
+    srli.d      t0, t0, 32
+L(un_end_2):
+    beqz        a5, L(un_end_1)
+    st.h        t0, a2, 0
+    addi.d      a2, a2, 2
+    srli.d      t0, t0, 16
+L(un_end_1):
+    beqz        a6, L(un_out)
+    st.b        t0, a2, 0
+    addi.d      a2, a2, 1
+L(un_out):
+#ifdef USE_AS_STPCPY
+    addi.d      a0, a2, -1
+#endif
+    jr          ra
+END(STRCPY)
+
+libc_hidden_builtin_def (STRCPY)
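
For readers following the word-at-a-time scan: both loops rely on the classic
zero-byte bit trick that the assembly materializes in t5/t6.  A minimal C
sketch of the same check follows; the helper names are illustrative only and
are not part of the patch.

#include <stdint.h>

/* Nonzero iff the 64-bit word x contains a zero byte: subtracting 0x01
   from a 0x00 byte borrows into bit 7, and "& ~x" rejects bytes whose
   top bit was already set.  The least-significant set marker always
   corresponds to the first zero byte.  */
static inline uint64_t
has_zero_byte (uint64_t x)
{
  const uint64_t ones  = 0x0101010101010101ULL;  /* t5 in the assembly */
  const uint64_t highs = 0x8080808080808080ULL;  /* t6 = t5 << 7 */
  return (x - ones) & ~x & highs;
}

/* Byte index (0..7) of the first zero byte on a little-endian machine
   such as LoongArch; mirrors the ctz.d/srli.d pair.  Only valid when
   has_zero_byte returned a nonzero mask.  */
static inline unsigned
first_zero_index (uint64_t mask)
{
  return (unsigned) __builtin_ctzll (mask) >> 3;
}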