sysdeps/powerpc/powerpc64/power8/strnlen.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425

/* Optimized strnlen implementation for POWER8 using a vmx loop.

   Copyright (C) 2017-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* It is implemented the following heuristic:
	1. Case maxlen <= 32: align the pointer to 8 bytes to loop through
	reading doublewords. Uses the POWER7 algorithm.
	2. Case maxlen > 32: check for null bytes in the first 16 bytes using
	unaligned accesses. Return length if found. Otherwise:
		2.1 Case maxlen < 64: deduct the bytes previously read, align
		the pointer to 16 bytes and loop through reading quadwords
		until find null bytes or reach maxlen.
		2.2 Case maxlen > 64: deduct the bytes previously read, align
		the pointer to 64 bytes and set up a counter to loop through
		reading in strides of 64 bytes. In case it finished the loop
		with null bytes not found, process the remainder bytes by
		switching to the loop to heuristic in 2.1.  */

#include <sysdep.h>

/* Define default page size to 4KB.  */
#define PAGE_SIZE 4096

/* The following macros implement Power ISA v2.07 opcodes
   that could not be used directly into this code to the keep
   compatibility with older binutils versions.  */

/* Move from vector register doubleword.  */
#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))

/* Move to vector register doubleword.  */
#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))

/* Vector Bit Permute Quadword.  */
#define VBPERMQ(t,a,b)	.long (0x1000054c	\
			       | ((t)<<(32-11))	\
			       | ((a)<<(32-16))	\
			       | ((b)<<(32-21)) )

/* Vector Population Count Halfword.  */
#define VPOPCNTH(t,b) .long (0x10000743 | ((t)<<(32-11)) | ((b)<<(32-21)))

/* Vector Count Leading Zeros Halfword.  */
#define VCLZH(t,b) .long (0x10000742 | ((t)<<(32-11)) | ((b)<<(32-21)))


/* int [r3] strnlen (char *s [r3], size_t maxlen [r4])  */
/* TODO: change to power8 when minimum required binutils allows it.  */
	.machine  power7
ENTRY_TOCLESS (__strnlen)
	CALL_MCOUNT 2
	dcbt	0,r3

	cmpldi	r4,32           /* Check if maxlen <= 32.  */
	ble	L(small_range)  /* If maxlen <= 32.  */

	/* Upcoming 16 bytes unaligned accesses cannot cross the page boundary
	   otherwise the processor throws an memory access error.
	   Use following code to check there is room for such as accesses:
	     (((size_t) s) % PAGE_SIZE > (PAGE_SIZE - 16)
	   If it is disallowed then switch to the code that handles
	   the string when maxlen <= 32.  */
	clrldi	r10,r3,52
	cmpldi  cr7,r10,PAGE_SIZE-16
	bgt     cr7,L(small_range)	/* If less than 16B of page end.  */

	/* Compute our permute constant r8.  */
	li	r7,0
	/* Compute a bpermd constant to move bit 0 of each word into
	   a halfword value, and count trailing zeros.  */
#ifdef __LITTLE_ENDIAN__
	li	r8,0x2820
	oris	r8,r8,0x3830
	sldi	r8,r8,32
	ori	r8,r8,0x0800
	oris	r8,r8,0x1810
#else
	li	r8,0x1018
	oris	r8,r8,0x0008
	sldi	r8,r8,32
	ori	r8,r8,0x3038
	oris	r8,r8,0x2028
#endif

	/* maxlen > 32. Optimistically check for null bytes in the first
	   16 bytes of the string using unaligned accesses.  */
	ld	r5,0(r3)
	ld	r6,8(r3)
	cmpb	r10,r7,r5		/* Check for null bytes in DWORD1.  */
	cmpb	r11,r7,r6		/* Check for null bytes in DWORD2.  */
	or.	r7,r10,r11
	bne	cr0, L(early_find)	/* If found null bytes.  */

	/* At this point maxlen > 32 and null bytes were not found at first
	   16 bytes. Prepare for loop using VMX.  */

	/* r3 == s, r4 == maxlen. All other volatile regs are unused now.  */

	addi	r5,r3,16	/* Align up, or just add the 16B we
				   already checked.  */
	li	r0,15
	and	r7,r5,r0	/* Find offset into 16B alignment.  */
	andc	r5,r5,r0	/* Quadword align up s to the next quadword.  */
	li	r0,16
	subf	r0,r7,r0
	subf	r4,r0,r4	/* Deduct unaligned bytes from maxlen.  */


	/* Compute offsets for vmx loads, and precompute the vbpermq
	   constants for both the 64B and 16B loops.  */
	li	r6,0
	vspltisb  v0,0
	vspltisb  v10,3
	lvsl	  v11,r6,r6
	vslb	  v10,v11,v10

	cmpldi  r4,64		/* Check maxlen < 64.  */
	blt	L(smaller)	/* If maxlen < 64 */

	/* In order to begin the 64B loop, it needs to be 64
	   bytes aligned. So read quadwords until it is aligned or found null
	   bytes. At worst case it will be aligned after the fourth iteration,
	   so unroll the loop to avoid counter checking.  */
	andi.   r7,r5,63		/* Check if is 64 bytes aligned.  */
	beq     cr0,L(preloop_64B)	/* If it is already 64B aligned.  */
	lvx     v1,r5,r6
	vcmpequb.       v1,v1,v0
	addi    r5,r5,16
	addi    r4,r4,-16		/* Decrement maxlen in 16 bytes. */
	bne     cr6,L(found_aligning64B) /* If found null bytes.  */

	/* Unroll 2x above code block until aligned or find null bytes.  */
	andi.   r7,r5,63
	beq     cr0,L(preloop_64B)
	lvx     v1,r5,r6
	vcmpequb.      v1,v1,v0
	addi    r5,r5,16
	addi    r4,r4,-16
	bne     cr6,L(found_aligning64B)

	andi.   r7,r5,63
	beq     cr0,L(preloop_64B)
	lvx     v1,r5,r6
	vcmpequb.      v1,v1,v0
	addi    r5,r5,16
	addi    r4,r4,-16
	bne     cr6,L(found_aligning64B)

	/* At this point it should be 16 bytes aligned.
	   Prepare for the 64B loop.  */
	.p2align 4
L(preloop_64B):
	/* Check if maxlen became is less than 64, therefore disallowing the
	   64B loop. If it happened switch to the 16B loop code.  */
	cmpldi  r4,64		/* Check if maxlen < 64.  */
	blt     L(smaller)	/* If maxlen < 64.  */
	/* Set some constant values.  */
	li      r7,16
	li      r10,32
	li      r9,48

	/* Compute the number of 64 bytes iterations needed.  */
	srdi	r11,r4,6	/* Compute loop count (maxlen / 64).  */
	andi.	r4,r4,63	/* Set maxlen the remainder (maxlen % 64).  */
	mtctr	r11		/* Move loop count to counter register.  */

	/* Handle maxlen > 64. Loop over the bytes in strides of 64B.  */
	.p2align 4
L(loop_64B):
	lvx	v1,r5,r6	/* r5 is the pointer to s.  */
	lvx	v2,r5,r7
	lvx	v3,r5,r10
	lvx	v4,r5,r9
	/* Compare the four 16B vectors to obtain the least 16 values.
	   Null bytes should emerge into v7, then check for null bytes.  */
	vminub	v5,v1,v2
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb. v7,v7,v0		/* Check for null bytes.  */
	addi	r5,r5,64		/* Add pointer to next iteraction.  */
	bne	cr6,L(found_64B)	/* If found null bytes.  */
	bdnz	L(loop_64B)		/* Continue the loop if count > 0. */

/* Hit loop end without null match. So branch to handle the remainder.  */

	/* Prepare a 16B loop to handle two cases:
		1. If 32 > maxlen < 64.
		2. If maxlen >= 64, and reached end of the 64B loop with null
		bytes not found. Thus handle the remainder bytes here. */
	.p2align 4
L(smaller):
        cmpldi  r4,0            /* Check maxlen is zero.  */
        beq     L(done)         /* If maxlen is zero.  */

	/* Place rounded up number of qw's to check into a vmx
	   register, and use some vector tricks to minimize
	   branching.  */
        MTVRD(v7,r4)            /* Copy maxlen from GPR to vector register. */
        vspltisb v5,1
        vspltisb v6,15
        vspltb   v2,v7,7
        vaddubs  v3,v5,v6

#ifdef __LITTLE_ENDIAN__
	vspltish v5,1           /* Compute 16 in each byte.  */
#endif

	/* Loop in 16B aligned incremements now. */
	.p2align 4
L(loop_16B):
	lvx     v1,r5,r6        /* Load quadword into vector register.  */
	addi    r5,r5,16        /* Increment address to next 16B block.  */
	vor     v7,v2,v2        /* Save loop count (v2) into v7. */
	vsububs v2,v2,v3        /* Subtract 16B from count, saturate at 0. */
	vminub  v4,v1,v2
	vcmpequb. v4,v4,v0      /* Checking for null bytes.  */
	beq     cr6,L(loop_16B) /* If null bytes not found.  */

	vcmpequb  v1,v1,v0
	VBPERMQ(v1,v1,v10)
#ifdef __LITTLE_ENDIAN__
	vsubuhm  v2,v1,v5       /* Form a mask of trailing zeros.  */
	vandc    v2,v2,v1
	VPOPCNTH(v1,v2)         /* Count of trailing zeros, 16 if none.  */
#else
	VCLZH(v1,v1)            /* Count the leading zeros, 16 if none.  */
#endif
	/* Truncate to maximum allowable offset.  */
	vcmpgtub v2,v1,v7       /* Compare and truncate for matches beyond
				   maxlen.  */
	vsel     v1,v1,v7,v2    /* 0-16 is now in byte 7.  */

	MFVRD(r0,v1)
	addi    r5,r5,-16       /* Undo speculative bump.  */
	extsb   r0,r0           /* Clear whatever gunk is in the high 56b.  */
	add     r5,r5,r0        /* Add the offset of whatever was found.  */
L(done):
	subf    r3,r3,r5        /* Length is equal to the offset of null byte
				   matched minus the pointer to s.  */
	blr                     /* Done.  */

	/* Handle case of maxlen > 64 and found null bytes in last block
	   of 64 bytes read.  */
	.p2align 4
L(found_64B):
	/* A zero was found. Reduce the result.  */
	vcmpequb  v1,v1,v0
	vcmpequb  v2,v2,v0
	vcmpequb  v3,v3,v0
	vcmpequb  v4,v4,v0

	/* Permute the first bit of each byte into bits 48-63.  */
	VBPERMQ(v1,v1,v10)
	VBPERMQ(v2,v2,v10)
	VBPERMQ(v3,v3,v10)
	VBPERMQ(v4,v4,v10)

	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v2,v2,v2,2
	vsldoi	v3,v3,v3,4
	vsldoi	v4,v4,v4,6
#else
	vsldoi	v1,v1,v1,6
	vsldoi	v2,v2,v2,4
	vsldoi	v3,v3,v3,2
#endif

	/* Merge the results and move to a GPR.  */
	vor	v1,v2,v1
	vor	v2,v3,v4
	vor	v4,v1,v2

	/* Adjust address to the start of the current 64B block.  */
	addi	r5,r5,-64

	MFVRD(r10,v4)
#ifdef __LITTLE_ENDIAN__
	addi	r9,r10,-1	/* Form a mask from trailing zeros.  */
	andc	r9,r9,r10
	popcntd	r0,r9		/* Count the bits in the mask.  */
#else
	cntlzd	r0,r10		/* Count leading zeros before the match.  */
#endif
	subf	r5,r3,r5
	add	r3,r5,r0	/* Compute final length.  */
	blr                     /* Done.  */

	/* Handle case where null bytes were found while aligning
	   as a preparation for the 64B loop.  */
	.p2align 4
L(found_aligning64B):
	VBPERMQ(v1,v1,v10)
#ifdef __LITTLE_ENDIAN__
	MFVRD(r10,v1)
	addi    r9,r10,-1       /* Form a mask from trailing zeros.  */
	andc    r9,r9,r10
	popcntd r0,r9           /* Count the bits in the mask.  */
#else
	vsldoi  v1,v1,v1,6
	MFVRD(r10,v1)
	cntlzd  r0,r10          /* Count leading zeros before the match.  */
#endif
	addi    r5,r5,-16	/* Adjust address to offset of last 16 bytes
				   read.  */
	/* Calculate length as subtracted the pointer to s of last 16 bytes
	   offset, added with the bytes before the match.  */
	subf    r5,r3,r5
	add     r3,r5,r0
	blr			/* Done.  */

	/* Handle case of maxlen > 32 and found a null bytes within the first
	   16 bytes of s.  */
	.p2align 4
L(early_find):
	bpermd	r5,r8,r10        /* r8 contains the bit permute constants.  */
	bpermd	r6,r8,r11
	sldi	r5,r5,8
	or	r5,r5,r6	/* r5 should hold a 16B mask of
				   a potential 0.  */
	cntlzd	r5,r5		/* Count leading zeros.  */
	addi	r3,r5,-48	/* Deduct the 48 leading zeros always
				   present.  */
	blr			/* Done.  */

	/* Handle case of maxlen <= 32. Use the POWER7 algorithm.  */
	.p2align 4
L(small_range):
	clrrdi	r8,r3,3  	/* Align the pointer to 8B.  */
	li	r0,0
	/* Register's content at this point:
	   r3 == pointer to s, r4 == maxlen, r8 == pointer to s aligned to 8B,
	   r7 == last acceptable address. */
	cmpldi	r4,0                 /* Check if maxlen is zero.  */
	beq	L(end_max)	     /* If maxlen is zero.  */

	/* Calculate the last acceptable address and check for possible
	   addition overflow by using satured math:
	   r7 = r3 + r4
	   r7 |= -(r7 < x)  */
	add     r7,r3,r4
	subfc   r6,r3,r7
	subfe   r9,r9,r9
	extsw   r6,r9
	or      r7,r7,r6
	addi    r7,r7,-1

	clrrdi	r7,r7,3              /* Align to 8B address of last
					acceptable address.  */

	rlwinm	r6,r3,3,26,28        /* Calculate padding.  */
	ld	r12,0(r8)            /* Load aligned doubleword.  */
	cmpb	r10,r12,r0           /* Check for null bytes. */
#ifdef __LITTLE_ENDIAN__
	srd	r10,r10,r6
	sld	r10,r10,r6
#else
	sld	r10,r10,r6
	srd	r10,r10,r6
#endif /* __LITTLE_ENDIAN__  */
	cmpldi	cr7,r10,0
	bne	cr7,L(done_small)    /* If found null byte.  */

	cmpld	r8,r7                /* Check if reached maxlen.  */
	beq	L(end_max)	     /* If reached maxlen.  */

	/* Still handling case of maxlen <= 32. Read doubleword aligned until
	   find null bytes or reach maxlen.  */
	.p2align 4
L(loop_small):
	ldu	r12,8(r8)         /* Load next doubleword and update r8.  */
	cmpb	r10,r12,r0        /* Check for null bytes.  */
	cmpldi	cr6,r10,0
	bne	cr6,L(done_small) /* If found null bytes.  */
	cmpld	r8,r7             /* Check if reached maxlen. */
	bne	L(loop_small)	  /* If it has more bytes to read.  */
	mr	r3,r4             /* Reached maxlen with null bytes not found.
				     Length is equal to maxlen.  */
	blr			  /* Done.  */

	/* Still handling case of maxlen <= 32. Found null bytes.
	   Registers: r10 == match bits within doubleword, r8 == address of
	   last doubleword read, r3 == pointer to s, r4 == maxlen.  */
	.p2align 4
L(done_small):
#ifdef __LITTLE_ENDIAN__
	/* Count trailing zeros.  */
	addi	r0,r10,-1
	andc	r0,r0,r10
	popcntd	r0,r0
#else
	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
#endif
	sub	r3,r8,r3      /* Calculate total of bytes before the match.  */
	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
	add	r3,r3,r0      /* Length until the match.  */
	cmpld	r3,r4         /* Check length is greater than maxlen.  */
	blelr
	mr	r3,r4	      /* If length is greater than maxlen, return
				 maxlen.  */
	blr

	/* Handle case of reached maxlen with null bytes not found.  */
	.p2align 4
L(end_max):
	mr	r3,r4	/* Length is equal to maxlen.  */
	blr		/* Done.  */


END (__strnlen)
libc_hidden_def (__strnlen)
weak_alias (__strnlen, strnlen)
libc_hidden_def (strnlen)