about summary refs log tree commit diff
path: root/sysdeps/aarch64/memset.S
blob: b76dde1557ed8fb195c24a13a06f8d0d9ab126fa (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
/* Generic optimized memset using SIMD.
   Copyright (C) 2012-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Name of the exported entry point.  A file that includes this one may
   define MEMSET beforehand to build the same code under another symbol;
   otherwise the routine is emitted as plain memset.  */
#ifndef MEMSET
# define MEMSET memset
#endif

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

/* Register roles.

   dstin    start of the destination; only read by the code below.
   valw     fill value; its low byte is replicated into v0 on entry.
   count    number of bytes to set; the long paths reuse it as the
	    biased loop counter.
   dst      working pointer in the paths for more than 63 bytes (dstin
	    rounded down to a 16, 64 or 128 byte boundary).
   dstend   one past the last byte to set (dstin + count).
   zva_val  low five bits of DCZID_EL0, used to select the DC ZVA path.
   off      small offset used by the paths for fewer than 64 bytes.
   dstend2  dstend minus that offset (second "from the end" anchor).

   Note that off shares x3 with dst, and dstend2 shares x5 with zva_val.
   This works because off/dstend2 are only used on the paths for fewer
   than 64 bytes while dst/zva_val are only used on the larger ones.  */
#define dstin	x0
#define valw	w1
#define count	x2
#define dst	x3
#define dstend	x4
#define zva_val	x5
#define off	x3
#define dstend2	x5

/* MEMSET (dstin, valw, count) -- fill COUNT bytes starting at DSTIN with
   the low byte of VALW.

   In:     x0 = dstin, w1 = valw, x2 = count.
   Out:    x0 is not written by any instruction below, so it still holds
	   the destination pointer on return.
   Writes: x2, x3, x4, x5, v0 and the condition flags.  No stack use.

   Strategy: every size class is handled with possibly overlapping,
   unaligned stores anchored at both the start and the end of the
   buffer, so no byte-by-byte tail loop is needed:

     0..3      up to three byte stores
     4..15     four 4-byte stores
     16..63    four 16-byte stores
     64..128   first 64 bytes + last 64 bytes
     > 128     aligned 64-byte loop; for a zero fill DC ZVA is used when
	       DCZID_EL0 reports a 64 or 128 byte block size.

   PTR_ARG and SIZE_ARG come from sysdep.h and are not visible in this
   file (presumably they sanitize pointer/size arguments for ILP32).  */
ENTRY (MEMSET)
	PTR_ARG (0)
	SIZE_ARG (2)

	/* Replicate the fill byte into all 16 lanes of v0; s0 and q0 below
	   are the 4 and 16 byte views of that pattern.  */
	dup	v0.16B, valw
	cmp	count, 16
	b.lo	L(set_small)

	add	dstend, dstin, count
	cmp	count, 64
	b.hs	L(set_128)

	/* Set 16..63 bytes.  off is 16 when count >= 32 (bit 5 of count
	   set) and 0 otherwise, so the four stores cover either
	   [0,32) + [end-32,end) or, for 16..31 bytes, just [0,16) +
	   [end-16,end) written twice.  */
	mov	off, 16
	and	off, off, count, lsr 1
	sub	dstend2, dstend, off
	str	q0, [dstin]
	str	q0, [dstin, off]
	str	q0, [dstend2, -16]
	str	q0, [dstend, -16]
	ret

	.p2align 4
	/* Set 0..15 bytes.  */
L(set_small):
	add	dstend, dstin, count
	cmp	count, 4
	b.lo	2f
	/* 4..15 bytes: off is 1 when count >= 8 and 0 otherwise; it is
	   scaled by 4 in the addressing, giving the same two-from-start,
	   two-from-end pattern as above with 4-byte stores.  */
	lsr	off, count, 3
	sub	dstend2, dstend, off, lsl 2
	str	s0, [dstin]
	str	s0, [dstin, off, lsl 2]
	str	s0, [dstend2, -4]
	str	s0, [dstend, -4]
	ret

	/* Set 0..3 bytes.  With off = count / 2 the three byte stores hit
	   offsets 0, count/2 and count-1, which covers 1, 2 and 3 bytes.
	   count == 0 must not store anything.  */
2:	cbz	count, 3f
	lsr	off, count, 1
	strb	valw, [dstin]
	strb	valw, [dstin, off]
	strb	valw, [dstend, -1]
3:	ret

	.p2align 4
	/* Set 64 bytes or more.  dst = dstin rounded down to 16 is computed
	   here because L(set_long) relies on it.  */
L(set_128):
	bic	dst, dstin, 15
	cmp	count, 128
	b.hi	L(set_long)
	/* 64..128 bytes: first 64 and last 64 bytes, possibly overlapping.  */
	stp	q0, q0, [dstin]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 4
	/* Set more than 128 bytes.  The two stores below fill
	   [dstin, dst + 32), so everything after them can use addresses
	   relative to the 16-byte aligned dst.  */
L(set_long):
	str	q0, [dstin]
	str	q0, [dst, 16]
	/* DC ZVA can only write zeros: use the store loop for any other
	   fill byte.  */
	tst	valw, 255
	b.ne	L(no_zva)
#ifndef ZVA64_ONLY
	/* Keep DCZID_EL0 bits [4:0].  Per the Arm architecture these are
	   the block size (log2 of the size in 4-byte words) and the DZP
	   "DC ZVA prohibited" bit, so a prohibited DC ZVA yields a value
	   of at least 16 and matches neither test.  */
	mrs	zva_val, dczid_el0
	and	zva_val, zva_val, 31
	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
	b.ne	L(zva_128)
#endif
	/* 64-byte DC ZVA.  [dstin, dst + 64) is filled by stores, which
	   reaches at least the first 64-byte boundary above dstin.  Since
	   count > 128 the biased counter below is at least 1, so the first
	   DC ZVA block always lies inside the buffer.  */
	stp	q0, q0, [dst, 32]
	bic	dst, dstin, 63
	sub	count, dstend, dst	/* Count is now 64 too large.  */
	sub	count, count, 64 + 64	/* Adjust count and bias for loop.  */

	/* Write last bytes before ZVA loop.  */
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]

	.p2align 4
	/* Each iteration zeroes one aligned 64-byte block.  The loop stops
	   once at most 64 bytes remain; those were written above.  */
L(zva64_loop):
	add	dst, dst, 64
	dc	zva, dst
	subs	count, count, 64
	b.hi	L(zva64_loop)
	ret

	.p2align 3
	/* Plain store loop.  Requires dst = dstin rounded down to 16 and
	   [dstin, dst + 32) already written.  Each iteration stores
	   [dst + 32, dst + 96); the final 64 bytes are written relative to
	   dstend after the loop.  */
L(no_zva):
	sub	count, dstend, dst	/* Count is 32 too large.  */
	sub	count, count, 64 + 32	/* Adjust count and bias for loop.  */
L(no_zva_loop):
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(no_zva_loop)
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

#ifndef ZVA64_ONLY
	.p2align 4
L(zva_128):
	cmp	zva_val, 5		/* ZVA size is 128 bytes.  */
	b.ne	L(no_zva)

	/* The loop below zeroes the aligned block [dst + 128, dst + 256)
	   (dst rounded down to 128) unconditionally and biases count by
	   256.  L(set_long) only guarantees count > 128, so for 129..255
	   bytes that block could extend past dstend and the biased count
	   could wrap to a huge unsigned value, making the loop run away.
	   count still holds the original length here and dst is still the
	   16-byte aligned base L(no_zva) expects, so simply use the store
	   loop for those sizes.  */
	cmp	count, 256
	b.lo	L(no_zva)

	/* Fill [dstin, dst + 128) with stores so that the first 128-byte
	   boundary above dstin is reached.  */
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	bic	dst, dst, 127
	sub	count, dstend, dst	/* Count is now 128 too large.  */
	sub	count, count, 128 + 128	/* Adjust count and bias for loop.  */
	/* Each iteration zeroes one aligned 128-byte block; at most 128
	   bytes remain on exit and are written relative to dstend.  */
1:	add	dst, dst, 128
	dc	zva, dst
	subs	count, count, 128
	b.hi	1b
	stp	q0, q0, [dstend, -128]
	stp	q0, q0, [dstend, -96]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret
#endif

END (MEMSET)
libc_hidden_builtin_def (MEMSET)