sysdeps/alpha/alphaev6/memset.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223

/* Copyright (C) 2000-2019 Free Software Foundation, Inc.
   Contributed by Richard Henderson (rth@tamu.edu)
   EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

	.arch ev6
	.set noat
	.set noreorder

ENTRY(memset)
#ifdef PROF
	ldgp	gp, 0(pv)
	lda	AT, _mcount
	jsr	AT, (AT), _mcount
	.prologue 1
#else
	.prologue 0
#endif

	/*
	 * Serious stalling happens.  The only way to mitigate this is to
	 * undertake a major re-write to interleave the constant materialization
	 * with other parts of the fall-through code.  This is important, even
	 * though it makes maintenance tougher.
	 * Do this later.
	 */
	and	$17, 255, $1	# E : 00000000000000ch
	insbl	$17, 1, $2	# U : 000000000000ch00
	mov	$16, $0		# E : return value
	ble	$18, $end	# U : zero length requested?

	addq	$18, $16, $6	# E : max address to write to
	or	$1, $2, $17	# E : 000000000000chch
	insbl	$1, 2, $3	# U : 0000000000ch0000
	insbl	$1, 3, $4	# U : 00000000ch000000

	or	$3, $4, $3	# E : 00000000chch0000
	inswl	$17, 4, $5	# U : 0000chch00000000
	xor	$16, $6, $1	# E : will complete write be within one quadword?
	inswl	$17, 6, $2	# U : chch000000000000

	or	$17, $3, $17	# E : 00000000chchchch
	or	$2, $5, $2	# E : chchchch00000000
	bic	$1, 7, $1	# E : fit within a single quadword?
	and	$16, 7, $3	# E : Target addr misalignment

	or	$17, $2, $17	# E : chchchchchchchch
	beq	$1, $within_quad # U :
	nop			# E :
	beq	$3, $aligned	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword.
	 */
	ldq_u	$4, 0($16)	# L : Fetch first partial
	mov	$16, $5		# E : Save the address
	insql	$17, $16, $2	# U : Insert new bytes
	subq	$3, 8, $3	# E : Invert (for addressing uses)

	addq	$18, $3, $18	# E : $18 is new count ($3 is negative)
	mskql	$4, $16, $4	# U : clear relevant parts of the quad
	subq	$16, $3, $16	# E : $16 is new aligned destination
	or	$2, $4, $1	# E : Final bytes

	nop
	stq_u	$1,0($5)	# L : Store result
	nop
	nop

	.align 4
$aligned:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra	$18, 3, $3	# U : Number of remaining quads to write
	and	$18, 7, $18	# E : Number of trailing bytes to write
	mov	$16, $5		# E : Save dest address
	beq	$3, $no_quad	# U : tail stuff only

	/*
	 * It's worth the effort to unroll this and use wh64 if possible.
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, $loop	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign	# U :

$alignmod64:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64 # U :

$bigalign:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * We know that we'll be taking a minimum of one trip through.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future.
	 * The wh64 is issued on for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64	# U :

	nop
	nop
	nop
	beq	$3, $no_quad	# U : Might have finished already

	.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
$loop:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : Decrement number quads left
	addq	$5, 8, $5	# E : Inc address
	bne	$3, $loop	# U : more?

$no_quad:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq	$18, $end	# U : All done?
	ldq	$7, 0($5)	# L :
	mskqh	$7, $6, $2	# U : Mask final quad

	insqh	$17, $6, $4	# U : New bits
	or	$2, $4, $1	# E : Put it all together
	stq	$1, 0($5)	# L : And back to memory
	ret	$31,($26),1	# L0 :

$within_quad:
	ldq_u	$1, 0($16)	# L :
	insql	$17, $16, $2	# U : New bits
	mskql	$1, $16, $4	# U : Clear old
	or	$2, $4, $2	# E : New result

	mskql	$2, $6, $4	# U :
	mskqh	$1, $6, $2	# U :
	or	$2, $4, $1	# E :
	stq_u	$1, 0($16)	# L :

$end:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :

	END(memset)
libc_hidden_builtin_def (memset)