/* Generic optimized memset using SIMD.
   Copyright (C) 2012-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */
#include <sysdep.h>

#ifndef MEMSET
# define MEMSET memset
#endif

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#define dstin	x0
#define valw	w1
#define count	x2
#define dst	x3
#define dstend	x4
#define zva_val	x5
#define off	x3
#define dstend2	x5
ENTRY (MEMSET)
	PTR_ARG (0)
	SIZE_ARG (2)
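
	/* Replicate the low byte of the value into all 16 lanes of v0.  */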
	dup	v0.16B, valw
	cmp	count, 16
	b.lo	L(set_small)

	add	dstend, dstin, count
	cmp	count, 64
	b.hs	L(set_128)

	/* Set 16..63 bytes.  */
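	/* off = 16 & (count >> 1) is 16 when count >= 32, otherwise 0,
	   so the four overlapping 16-byte stores below cover any length
	   in 16..63 without further branching.  */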
	mov	off, 16
	and	off, off, count, lsr 1
	sub	dstend2, dstend, off
	str	q0, [dstin]
	str	q0, [dstin, off]
	str	q0, [dstend2, -16]
	str	q0, [dstend, -16]
	ret

	.p2align 4
	/* Set 0..15 bytes.  */
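	/* For 4..15 bytes, off = count >> 3 selects a 0 or 4 byte offset;
	   four overlapping 4-byte stores then cover the whole range.  */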
L(set_small):
	add	dstend, dstin, count
	cmp	count, 4
	b.lo	2f
	lsr	off, count, 3
	sub	dstend2, dstend, off, lsl 2
	str	s0, [dstin]
	str	s0, [dstin, off, lsl 2]
	str	s0, [dstend2, -4]
	str	s0, [dstend, -4]
	ret

	/* Set 0..3 bytes.  */
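	/* One byte each at the start, middle and end covers 1..3 bytes;
	   zero-length calls return without storing.  */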
2:	cbz	count, 3f
	lsr	off, count, 1
	strb	valw, [dstin]
	strb	valw, [dstin, off]
	strb	valw, [dstend, -1]
3:	ret

	.p2align 4
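	/* Set 64..128 bytes with four overlapping 32-byte stores.
	   dst is also aligned down to 16 bytes here, ready for the
	   long path taken when count > 128.  */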
L(set_128):
	bic	dst, dstin, 15
	cmp	count, 128
	b.hi	L(set_long)
	stp	q0, q0, [dstin]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 4
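	/* Set more than 128 bytes.  Write an unaligned 16-byte chunk at
	   the start plus an aligned one at dst + 16, then use DC ZVA for
	   the bulk when the memset value is zero and the ZVA block size
	   allows it.  */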
L(set_long):
	str	q0, [dstin]
	str	q0, [dst, 16]
	tst	valw, 255
	b.ne	L(no_zva)
#ifndef ZVA64_ONLY
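	/* DCZID_EL0 bits [3:0] give log2 of the ZVA block size in 4-byte
	   words; masking with 31 also keeps bit 4 (DZP), which when set
	   makes the compares fail so DC ZVA is never used.  */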
	mrs	zva_val, dczid_el0
	and	zva_val, zva_val, 31
	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
	b.ne	L(zva_128)
#endif
	stp	q0, q0, [dst, 32]
	bic	dst, dstin, 63
	sub	count, dstend, dst	/* Count is now 64 too large.  */
	sub	count, count, 64 + 64	/* Adjust count and bias for loop.  */

	/* Write last bytes before ZVA loop.  */
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]

	.p2align 4
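	/* DC ZVA zeroes one whole aligned 64-byte block per iteration
	   without first reading it into the cache.  */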
L(zva64_loop):
	add	dst, dst, 64
	dc	zva, dst
	subs	count, count, 64
	b.hi	L(zva64_loop)
	ret

	.p2align 3
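	/* Fallback when DC ZVA cannot be used: a plain store loop
	   writing 64 bytes per iteration.  */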
L(no_zva):
	sub	count, dstend, dst	/* Count is 32 too large.  */
	sub	count, count, 64 + 32	/* Adjust count and bias for loop.  */
L(no_zva_loop):
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(no_zva_loop)
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

#ifndef ZVA64_ONLY
	.p2align 4
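	/* Same idea as the 64-byte ZVA loop, but for CPUs whose ZVA
	   block size is 128 bytes: write the first and last 128 bytes
	   with stores and zero whole blocks in between.  */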
L(zva_128):
	cmp	zva_val, 5		/* ZVA size is 128 bytes.  */
	b.ne	L(no_zva)

	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	bic	dst, dst, 127
	sub	count, dstend, dst	/* Count is now 128 too large.  */
	sub	count, count, 128 + 128	/* Adjust count and bias for loop.  */
1:	add	dst, dst, 128
	dc	zva, dst
	subs	count, count, 128
	b.hi	1b
	stp	q0, q0, [dstend, -128]
	stp	q0, q0, [dstend, -96]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret
#endif

END (MEMSET)
libc_hidden_builtin_def (MEMSET)