about summary refs log tree commit diff
path: root/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S
blob: fc2498f7dbabc389b7ad50e435c9944738664111 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
/* Optimized strcpy stpcpy implementation using LoongArch LSX instructions.
   Copyright (C) 2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <sys/regdef.h>
#include <sys/asm.h>

#if IS_IN (libc) && !defined __loongarch_soft_float

/* Default symbol name for this implementation.  The definition below is
   only applied when STRCPY has not already been defined before this
   point, so the same code can be assembled under a different name.  */
# ifndef STRCPY
#  define STRCPY __strcpy_lsx
# endif

/* STRCPY: copy a NUL-terminated string 16 bytes at a time with LSX
   vectors.  When USE_AS_STPCPY is defined the same code is built with
   stpcpy return semantics.

   Entry (standard argument registers assumed for strcpy (dst, src)):
     a0 = dst, a1 = src.
   Return:
     strcpy: a0 is never written, so the incoming dst is returned.
     stpcpy: a0 = address of the terminating NUL stored in dst.

   Register roles inside the function:
     a2       working destination pointer (a0 is kept for the return).
     a1       working source pointer.
     a3, a4   low four bits (16-byte misalignment) of a pointer.
     vr1      byte index vector 0..15 loaded from L(INDEX); later reused
              as scratch for the non-zero byte mask.
     vr0      block currently tested for a zero byte.
     fcc0     set by vsetanyeqz.b when some byte of the tested vector is 0.

   Overall strategy:
     1. Check the first (up to) 16 source bytes; if they hold the
        terminator, finish with the piecewise stores at L(end).
     2. Otherwise store those 16 bytes, advance both pointers so that the
        destination becomes 16-byte aligned, then run either the aligned
        loop (source also aligned) or the L(unaligned) loop, which
        rebuilds each destination block from two aligned source blocks.
     3. Finish with one 16-byte load/store that ends exactly on the
        terminator and overlaps bytes already copied.  This is in range
        because step 1 proved the string has at least 16 non-zero bytes.

   NOTE(review): the second LEAF argument is presumably the alignment
   exponent for the entry point; confirm against sys/asm.h.  */
LEAF(STRCPY, 6)
    /* vr1 = { 0, 1, ..., 15 }; a4 = src & 15; a2 = dst.  */
    pcalau12i       t0, %pc_hi20(L(INDEX))
    andi            a4, a1, 0xf
    vld             vr1, t0, %pc_lo12(L(INDEX))
    move            a2, a0

    /* Source already 16-byte aligned: no head check needed.  */
    beqz            a4, L(load_start)
    /* t0 = src rounded down to a 16-byte boundary (a4 holds exactly the
       low four bits of a1, so the xor clears them).  Load that aligned
       block and build the shuffle indices a4 + { 0 .. 15 }.  */
    xor             t0, a1, a4
    vld             vr0, t0, 0
    vreplgr2vr.b    vr2, a4

    /* Shuffle the block so that the byte at src lands in lane 0.
       Indices below 16 pick bytes of the loaded block; indices 16 and up
       pick bytes of vr2 itself (per the LSX vshuf.b definition), whose
       bytes are a4 + lane and therefore non-zero here because a4 != 0.
       A zero byte in the result thus means the terminator lies before
       the next 16-byte boundary.  */
    vadd.b          vr2, vr2, vr1
    vshuf.b         vr0, vr2, vr0, vr2
    vsetanyeqz.b    fcc0, vr0
    bcnez           fcc0, L(end)

L(load_start):
    /* Load the first 16 source bytes directly from src.  In the
       misaligned case the check above found no terminator before the
       next boundary, so the string continues into the next aligned
       block.  a3 = dst & 15.  */
    vld             vr0, a1, 0
    li.d            t1, 16
    andi            a3, a2, 0xf
    vsetanyeqz.b    fcc0, vr0


    /* t0 = 16 - (dst & 15): the advance (1..16) that makes the
       destination pointer 16-byte aligned.  */
    sub.d           t0, t1, a3
    bcnez           fcc0, L(end)
    /* No terminator in the first 16 bytes: store them all, then advance
       both pointers by t0.  Bytes between the new a2 and dst + 16 will
       be written a second time by the next store.  */
    add.d           a1, a1, t0
    vst             vr0, a2, 0

    /* a3 = misalignment of the advanced source pointer.  */
    andi            a3, a1, 0xf
    add.d           a2, a2, t0
    bnez            a3, L(unaligned)
    /* Source and destination are now both 16-byte aligned.  */
    vld             vr0, a1, 0

    vsetanyeqz.b    fcc0, vr0
    bcnez           fcc0, L(al_end)
L(al_loop):
    /* Store the terminator-free block in vr0, fetch the next one, advance
       both pointers, and repeat until a block contains a zero byte.  */
    vst             vr0, a2, 0
    vld             vr0, a1, 16

    addi.d          a2, a2, 16
    addi.d          a1, a1, 16
    vsetanyeqz.b    fcc0, vr0
    bceqz           fcc0, L(al_loop)


L(al_end):
    /* vr0 = block at a1 containing the terminator; a2 = matching
       destination address.  vmsknz.b builds a bit mask with one bit per
       non-zero byte, fa1 gives access to the low word of vr1, and the
       count of trailing ones is the lane index of the first zero byte.
       a1 then points at the terminator.  */
    vmsknz.b        vr1, vr0
    movfr2gr.s      t0, fa1
    cto.w           t0, t0
    add.d           a1, a1, t0

    /* Copy the 16 bytes that end exactly on the terminator.  */
    vld             vr0, a1, -15
# ifdef USE_AS_STPCPY
    /* stpcpy: a0 = address of the terminator in the destination.  */
    add.d           a0, a2, t0
    vst             vr0, a0, -15
# else
    add.d           a2, a2, t0
    vst             vr0, a2, -15
# endif
    jr              ra

L(end):
    /* Short string: vr0 holds the start of the string from lane 0 and
       includes the terminator; a2 = dst.  t0 = index of the first zero
       byte + 1, i.e. the number of bytes to store (1..16) including the
       terminator.  */
    vmsknz.b        vr1, vr0
    movfr2gr.s      t0, fa1
    cto.w           t0, t0
    addi.d          t0, t0, 1

L(end_16):
    /* t0 == 16: the terminator is the last lane, store the whole vector.  */
    andi            t1, t0, 16
    beqz            t1, L(end_8)
    vst             vr0, a2, 0
# ifdef USE_AS_STPCPY
    /* The terminator was stored at dst + 15.  */
    addi.d          a0, a2, 15
# endif
    jr              ra

L(end_8):
    /* t0 < 16: split the count into its 8/4/2/1 bits and store one
       element per set bit.  After each store the destination advances
       and the vector is shifted right by the same number of bytes, so
       the next unwritten byte is always in lane 0.  */
    andi            t2, t0, 8
    andi            t3, t0, 4
    andi            t4, t0, 2
    andi            t5, t0, 1

    beqz            t2, L(end_4)
    vstelm.d        vr0, a2, 0, 0
    addi.d          a2, a2, 8
    vbsrl.v         vr0, vr0, 8

L(end_4):
    beqz            t3, L(end_2)
    vstelm.w        vr0, a2, 0, 0
    addi.d          a2, a2, 4
    vbsrl.v         vr0, vr0, 4

L(end_2):
    beqz            t4, L(end_1)
    vstelm.h        vr0, a2, 0, 0
    addi.d          a2, a2, 2
    vbsrl.v         vr0, vr0, 2


L(end_1):
    beqz            t5, L(out)
    vstelm.b        vr0, a2, 0, 0
    addi.d          a2, a2, 1
L(out):
# ifdef USE_AS_STPCPY
    /* a2 is one past the stored terminator; step back onto it.  */
    addi.d          a0, a2, -1
# endif
    jr              ra

    .align          4
L(unaligned):
    /* Destination (a2) is 16-byte aligned, source is not; a3 = a1 & 15.
       Round a1 down to its aligned block and load that block into vr2.  */
    bstrins.d       a1, zero, 3, 0
    vld             vr2, a1, 0
    vreplgr2vr.b    vr3, a3
    /* vr4 = all-ones in the lanes whose index is below a3, i.e. the bytes
       in front of the current source position.  */
    vslt.b          vr4, vr1, vr3

    /* Force those leading lanes non-zero so that only bytes at or after
       the current source position can be seen as the terminator.  */
    vor.v           vr0, vr2, vr4
    vsetanyeqz.b    fcc0, vr0
    bcnez           fcc0, L(un_first_end)
    /* Next aligned source block.  */
    vld             vr0, a1, 16

    /* vr3 = a3 + { 0 .. 15 }: shuffle indices that extract the 16 bytes
       starting a3 bytes into the pair (low block, high block).
       vr4 = the next 16 source bytes, destined for a2.  */
    vadd.b          vr3, vr3, vr1
    vshuf.b         vr4, vr0, vr2, vr3
    vsetanyeqz.b    fcc0, vr0
    bcnez           fcc0, L(un_end)


    /* vr2 = vr0 (register copy); a1 advances to the block now in vr2.  */
    vor.v           vr2, vr0, vr0
    addi.d          a1, a1, 16
L(un_loop):
    /* Loop unrolled twice; vr0 and vr2 swap the roles of "previous" and
       "newest" aligned source block.  At the top of the loop:
         vr2 = block at a1, vr4 = 16 pending bytes to store at a2.  */
    vld             vr0, a1, 16
    vst             vr4, a2, 0

    addi.d          a2, a2, 16
    vshuf.b         vr4, vr0, vr2, vr3
    vsetanyeqz.b    fcc0, vr0
    bcnez           fcc0, L(un_end)

    /* Second half: the newest block goes to vr2, the previous one is vr0.  */
    vld             vr2, a1, 32
    vst             vr4, a2, 0
    addi.d          a1, a1, 32
    addi.d          a2, a2, 16

    vshuf.b         vr4, vr2, vr0, vr3
    vsetanyeqz.b    fcc0, vr2
    bceqz           fcc0, L(un_loop)
    /* Terminator found in vr2: move that block to vr0 and back a1 up so
       the state matches what L(un_end) expects.  */
    vor.v           vr0, vr2, vr2


    addi.d          a1, a1, -16
L(un_end):
    /* Here: vr0 = aligned block at a1 + 16 that contains the terminator,
       vr4 = pending 16 bytes for a2.  The pending block is stored only
       when it holds no zero byte; otherwise the final overlapping store
       below writes those bytes (presumably so that nothing is written
       past the terminator).  */
    vsetanyeqz.b    fcc0, vr4
    bcnez           fcc0, 1f
    vst             vr4, a2, 0

1:
    /* t0 = lane index of the first zero byte in vr0.  */
    vmsknz.b        vr1, vr0
    movfr2gr.s      t0, fa1
    cto.w           t0, t0
    add.d           a1, a1, t0

    /* The terminator is at (a1 + 16) after the add above, so the 16 bytes
       ending on it start at a1 + 1.  The destination is shifted by the
       same amount, less the source misalignment a3.  */
    vld             vr0, a1, 1
    add.d           a2, a2, t0
    sub.d           a2, a2, a3
    vst             vr0, a2, 1
# ifdef USE_AS_STPCPY
    /* The store above covered a2 + 1 .. a2 + 16; the terminator is the
       last of those bytes.  */
    addi.d          a0, a2, 16
# endif
    jr              ra
L(un_first_end):
    /* The terminator is in the very first aligned block (at a1).  Bias
       both pointers by -16 so that the block sits at a1 + 16 as the tail
       code at 1: expects, and skip the pending-block store.  */
    addi.d          a2, a2, -16
    addi.d          a1, a1, -16
    b               1b
END(STRCPY)

    /* 16-byte constant holding the byte values 0x00 .. 0x0f in ascending
       address order (little-endian doublewords).  STRCPY loads it into
       vr1 and uses it both as the base for vshuf.b byte indices and as
       the lane-number operand of the vslt.b comparison.  */
    .section        .rodata.cst16,"M",@progbits,16
    .align          4
L(INDEX):
    .dword          0x0706050403020100
    .dword          0x0f0e0d0c0b0a0908

/* glibc macro defined outside this file.
   NOTE(review): presumably provides the hidden internal alias for STRCPY
   used by calls from inside libc; confirm against include/libc-symbols.h.  */
libc_hidden_builtin_def (STRCPY)
#endif