/* Optimized strcpy stpcpy implementation using LoongArch LSX instructions.
   Copyright (C) 2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */
#include <sysdep.h>
#include <sys/regdef.h>
#include <sys/asm.h>
#if IS_IN (libc) && !defined __loongarch_soft_float

# ifndef STRCPY
#  define STRCPY __strcpy_lsx
# endif
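
/* Copy a string in 16-byte LSX chunks.  If the source is not 16-byte
   aligned, the entry code loads from the aligned-down address and uses
   vshuf.b with an INDEX-derived pattern to shift the valid head bytes
   into place; lanes beyond the head pick up bytes of the (nonzero)
   pattern itself, so vsetanyeqz.b can only trigger on a real NUL.
   Once the destination is aligned, either a plain aligned loop runs
   or, for a misaligned source, a loop that re-splices consecutive
   source chunks with vshuf.b.  The terminator is located with
   vmsknz.b plus cto.w, and tails are finished either with an
   overlapping 16-byte store or, when nothing has been stored yet,
   with piecewise element stores.  */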
LEAF(STRCPY, 6)
        pcalau12i       t0, %pc_hi20(L(INDEX))
        andi            a4, a1, 0xf
        vld             vr1, t0, %pc_lo12(L(INDEX))
        move            a2, a0

        beqz            a4, L(load_start)
        xor             t0, a1, a4
        vld             vr0, t0, 0
        vreplgr2vr.b    vr2, a4

        vadd.b          vr2, vr2, vr1
        vshuf.b         vr0, vr2, vr0, vr2
        vsetanyeqz.b    fcc0, vr0
        bcnez           fcc0, L(end)
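
/* The head up to the next 16-byte boundary is NUL-free (or the source
   was aligned to begin with).  Load 16 bytes from the source; if they
   are also NUL-free, store them and advance both pointers by
   16 - (dest & 0xf) so that the destination becomes 16-byte aligned.  */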
L(load_start):
        vld             vr0, a1, 0
        li.d            t1, 16
        andi            a3, a2, 0xf
        vsetanyeqz.b    fcc0, vr0

        sub.d           t0, t1, a3
        bcnez           fcc0, L(end)
        add.d           a1, a1, t0
        vst             vr0, a2, 0

        andi            a3, a1, 0xf
        add.d           a2, a2, t0
        bnez            a3, L(unaligned)
        vld             vr0, a1, 0

        vsetanyeqz.b    fcc0, vr0
        bcnez           fcc0, L(al_end)
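
/* Source and destination are both 16-byte aligned: store the current
   chunk and fetch the next until one contains a zero byte.  */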
L(al_loop):
        vst             vr0, a2, 0
        vld             vr0, a1, 16
        addi.d          a2, a2, 16
        addi.d          a1, a1, 16

        vsetanyeqz.b    fcc0, vr0
        bceqz           fcc0, L(al_loop)
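
/* vr0 holds the chunk containing the terminator.  vmsknz.b sets one
   mask bit per nonzero byte, so cto.w (count trailing ones) gives the
   index of the NUL; finish with one overlapping 16-byte copy that ends
   exactly at the NUL.  */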
L(al_end):
        vmsknz.b        vr1, vr0
        movfr2gr.s      t0, fa1
        cto.w           t0, t0
        add.d           a1, a1, t0

        vld             vr0, a1, -15
# ifdef USE_AS_STPCPY
        add.d           a0, a2, t0
        vst             vr0, a0, -15
# else
        add.d           a2, a2, t0
        vst             vr0, a2, -15
# endif
        jr              ra
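
/* A NUL was found before anything was stored.  t0 + 1 is the number of
   bytes to copy (1..16); test the 16, 8, 4, 2 and 1 bits of that count,
   storing elements and shifting vr0 right (vbsrl.v) past the bytes
   already written.  */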
L(end):
        vmsknz.b        vr1, vr0
        movfr2gr.s      t0, fa1
        cto.w           t0, t0
        addi.d          t0, t0, 1

L(end_16):
        andi            t1, t0, 16
        beqz            t1, L(end_8)
        vst             vr0, a2, 0
# ifdef USE_AS_STPCPY
        addi.d          a0, a2, 15
# endif
        jr              ra

L(end_8):
        andi            t2, t0, 8
        andi            t3, t0, 4
        andi            t4, t0, 2
        andi            t5, t0, 1

        beqz            t2, L(end_4)
        vstelm.d        vr0, a2, 0, 0
        addi.d          a2, a2, 8
        vbsrl.v         vr0, vr0, 8

L(end_4):
        beqz            t3, L(end_2)
        vstelm.w        vr0, a2, 0, 0
        addi.d          a2, a2, 4
        vbsrl.v         vr0, vr0, 4

L(end_2):
        beqz            t4, L(end_1)
        vstelm.h        vr0, a2, 0, 0
        addi.d          a2, a2, 2
        vbsrl.v         vr0, vr0, 2

L(end_1):
        beqz            t5, L(out)
        vstelm.b        vr0, a2, 0, 0
        addi.d          a2, a2, 1

L(out):
# ifdef USE_AS_STPCPY
        addi.d          a0, a2, -1
# endif
        jr              ra
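
/* The destination is aligned but the source is not.  Align the source
   down to 16 bytes (bstrins.d clears its low four bits) and rebuild
   each output chunk from two consecutive source chunks with vshuf.b,
   using INDEX + a3 as the pattern.  In the first chunk, the bytes that
   precede the real string start are forced nonzero (vslt.b/vor.v) so
   they cannot be mistaken for the terminator.  */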
        .align          4
L(unaligned):
        bstrins.d       a1, zero, 3, 0
        vld             vr2, a1, 0
        vreplgr2vr.b    vr3, a3
        vslt.b          vr4, vr1, vr3

        vor.v           vr0, vr2, vr4
        vsetanyeqz.b    fcc0, vr0
        bcnez           fcc0, L(un_first_end)
        vld             vr0, a1, 16

        vadd.b          vr3, vr3, vr1
        vshuf.b         vr4, vr0, vr2, vr3
        vsetanyeqz.b    fcc0, vr0
        bcnez           fcc0, L(un_end)

        vor.v           vr2, vr0, vr0
        addi.d          a1, a1, 16

L(un_loop):
        vld             vr0, a1, 16
        vst             vr4, a2, 0
        addi.d          a2, a2, 16
        vshuf.b         vr4, vr0, vr2, vr3

        vsetanyeqz.b    fcc0, vr0
        bcnez           fcc0, L(un_end)
        vld             vr2, a1, 32
        vst             vr4, a2, 0

        addi.d          a1, a1, 32
        addi.d          a2, a2, 16
        vshuf.b         vr4, vr2, vr0, vr3
        vsetanyeqz.b    fcc0, vr2

        bceqz           fcc0, L(un_loop)
        vor.v           vr0, vr2, vr2
        addi.d          a1, a1, -16
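
/* Here vr0 is the raw source chunk containing the NUL (loaded from
   a1 + 16) and vr4 is the last re-spliced chunk.  Store vr4 only if it
   is NUL-free, then locate the terminator in vr0 and copy the tail
   with one overlapping 16-byte load/store ending at the NUL.  */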
L(un_end):
        vsetanyeqz.b    fcc0, vr4
        bcnez           fcc0, 1f
        vst             vr4, a2, 0
1:
        vmsknz.b        vr1, vr0
        movfr2gr.s      t0, fa1
        cto.w           t0, t0
        add.d           a1, a1, t0

        vld             vr0, a1, 1
        add.d           a2, a2, t0
        sub.d           a2, a2, a3
        vst             vr0, a2, 1
# ifdef USE_AS_STPCPY
        addi.d          a0, a2, 16
# endif
        jr              ra
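
/* The NUL sits in the first, partially valid source chunk.  Bias both
   pointers back by 16 so the shared tail code above computes the same
   final addresses.  */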
L(un_first_end):
        addi.d          a2, a2, -16
        addi.d          a1, a1, -16
        b               1b
END(STRCPY)
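
/* Identity byte indices 0..15; adding a 0..15 offset to this vector
   yields the vshuf.b patterns used above.  */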
        .section        .rodata.cst16,"M",@progbits,16
        .align          4
L(INDEX):
        .dword          0x0706050403020100
        .dword          0x0f0e0d0c0b0a0908

libc_hidden_builtin_def (STRCPY)
#endif