/* strrchr/wcsrchr optimized with AVX2.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)
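
/* Note: ISA level 3 corresponds to the x86-64-v3 feature baseline
   (AVX2, BMI1, BMI2, etc.), which this implementation depends on for
   its ymm operations and for blsmsk/shrx below.  */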

# include <sysdep.h>

# ifndef STRRCHR
#  define STRRCHR	__strrchr_avx2
# endif

# ifdef USE_AS_WCSRCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPCMPEQ	vpcmpeqd
#  define VPMIN	vpminud
#  define CHAR_SIZE	4
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPCMPEQ	vpcmpeqb
#  define VPMIN	vpminub
#  define CHAR_SIZE	1
# endif
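
/* With these macros the same body assembles for both strrchr (byte
   elements) and wcsrchr (dword elements); VPMIN is the unsigned
   element-wise minimum used below for null detection.  */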

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE	32
# define PAGE_SIZE	4096

	.section SECTION(.text), "ax", @progbits
ENTRY(STRRCHR)
	vmovd	%esi, %xmm7
	movl	%edi, %eax
	/* Broadcast CHAR to YMM7.  */
	VPBROADCAST %xmm7, %ymm7
	vpxor	%xmm0, %xmm0, %xmm0

	/* Shift here instead of `andl` to save code size (saves a fetch
	   block).  */
	sall	$20, %eax
	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
	ja	L(cross_page)
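	/* The check above is roughly equivalent to the following C,
	   shown only as an illustrative sketch (s is the string
	   argument in %rdi).  Shifting left by 20 discards all but the
	   low 12 bits (the page offset), so the unsigned compare asks
	   whether a full VEC_SIZE load from s would cross a page:

	       if (((uintptr_t) s & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE)
	           goto cross_page;
	 */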

L(page_cross_continue):
	vmovdqu	(%rdi), %ymm1
	/* Check end of string match.  */
	VPCMPEQ	%ymm1, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	testl	%ecx, %ecx
	jz	L(aligned_more)

	/* Only check match with search CHAR if needed.  */
	VPCMPEQ	%ymm1, %ymm7, %ymm1
	vpmovmskb %ymm1, %eax
	/* Check if match before first zero.  */
	blsmskl	%ecx, %ecx
	andl	%ecx, %eax
	jz	L(ret0)
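	/* Illustrative note: blsmskl sets every bit up to and
	   including the lowest set bit of %ecx, so the andl above
	   keeps only CHAR matches at or before the first null.  E.g.
	   with a null mask of 0b00101000, blsmsk yields 0b00001111 and
	   any CHAR match past the null at byte 3 is discarded.  */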
	bsrl	%eax, %eax
	addq	%rdi, %rax
	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
	   search CHAR is zero we are correct. Either way `andq
	   $-CHAR_SIZE, %rax` gets the correct result.  */
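	/* Worked example (for illustration): if the wchar match
	   occupies bytes 8-11 of the vector, vpmovmskb sets bits 8-11,
	   bsrl returns 11, and `andq $-4, %rax` rounds the result back
	   down to the start of the wide character at byte 8.  */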
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
L(ret0):
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN
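	/* ZERO_UPPER_VEC_REGISTERS_RETURN (see sysdep.h) returns after
	   clearing the upper ymm state, avoiding AVX/SSE transition
	   penalties in legacy-SSE code that may run afterwards.  */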

	/* Returns for first vec x1/x2 have a hard-coded backward
	   search path for earlier matches.  */
	.p2align 4,, 10
L(first_vec_x1):
	VPCMPEQ	%ymm2, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	blsmskl	%ecx, %ecx
	andl	%ecx, %eax
	jnz	L(first_vec_x1_return)

	.p2align 4,, 4
L(first_vec_x0_test):
	VPCMPEQ	%ymm1, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	testl	%eax, %eax
	jz	L(ret1)
	bsrl	%eax, %eax
	addq	%r8, %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
L(ret1):
	VZEROUPPER_RETURN

	.p2align 4,, 10
L(first_vec_x0_x1_test):
	VPCMPEQ	%ymm2, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	/* Check ymm2 for search CHAR match. If no match then check ymm1
	   before returning.  */
	testl	%eax, %eax
	jz	L(first_vec_x0_test)
	.p2align 4,, 4
L(first_vec_x1_return):
	bsrl	%eax, %eax
	leaq	1(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN


	.p2align 4,, 10
L(first_vec_x2):
	VPCMPEQ	%ymm3, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	blsmskl	%ecx, %ecx
	/* If no in-range search CHAR match in ymm3 then need to check
	   ymm1/ymm2 for an earlier match (we delay checking search
	   CHAR matches until needed).  */
	andl	%ecx, %eax
	jz	L(first_vec_x0_x1_test)
	bsrl	%eax, %eax
	leaq	(VEC_SIZE + 1)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN


	.p2align 4
L(aligned_more):
	/* Save original pointer in case the match is in VEC 0.  */
	movq	%rdi, %r8

	/* Align src.  */
	orq	$(VEC_SIZE - 1), %rdi
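	/* Illustrative note: the orq sets %rdi to the last byte of its
	   containing VEC_SIZE block, so `1(%rdi)` is the next aligned
	   boundary.  E.g. rdi = 0x1005 -> orq gives 0x101f and the
	   load at 1(%rdi) reads from 0x1020.  Later offsets of the
	   form `(N * VEC_SIZE + 1)(%rdi)` account for this bias.  */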
	vmovdqu	1(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	testl	%ecx, %ecx
	jnz	L(first_vec_x1)

	vmovdqu	(VEC_SIZE + 1)(%rdi), %ymm3
	VPCMPEQ	%ymm3, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	testl	%ecx, %ecx
	jnz	L(first_vec_x2)

	/* Save pointer again before realigning.  */
	movq	%rdi, %rsi
	addq	$(VEC_SIZE + 1), %rdi
	andq	$-(VEC_SIZE * 2), %rdi
	.p2align 4
L(first_aligned_loop):
	/* Do 2x VEC at a time. Any more and the cost of finding the
	   match outweighs loop benefit.  */
	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5

	VPCMPEQ	%ymm4, %ymm7, %ymm6
	VPMIN	%ymm4, %ymm5, %ymm8
	VPCMPEQ	%ymm5, %ymm7, %ymm10
	vpor	%ymm6, %ymm10, %ymm5
	VPCMPEQ	%ymm8, %ymm0, %ymm8
	vpor	%ymm5, %ymm8, %ymm9
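	/* Illustrative note: ymm8 = min(ymm4, ymm5) has a zero element
	   exactly where ymm4 or ymm5 does (an unsigned minimum is zero
	   iff either input is), so one compare against ymm0 tests both
	   vectors for the null terminator.  ymm9 then combines the
	   CHAR-match and null bits, so a single vpmovmskb/test below
	   covers every loop-exit condition.  */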

	vpmovmskb %ymm9, %eax
	addq	$(VEC_SIZE * 2), %rdi
	/* No zero or search CHAR.  */
	testl	%eax, %eax
	jz	L(first_aligned_loop)

	/* If no zero CHAR then go to second loop (this allows us to
	   throw away all prior work).  */
	vpmovmskb %ymm8, %ecx
	testl	%ecx, %ecx
	jz	L(second_aligned_loop_prep)
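	/* Illustrative note: the second loop only has to remember the
	   most recent (furthest) CHAR match, because the final answer
	   is whichever match lies closest to the null terminator; once
	   we know no null has been seen yet, none of this first loop's
	   bookkeeping needs to be kept.  */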

	/* Search char could be zero so we need to get the true match.  */
	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jnz	L(first_aligned_loop_return)

	.p2align 4,, 4
L(first_vec_x1_or_x2):
	VPCMPEQ	%ymm3, %ymm7, %ymm3
	VPCMPEQ	%ymm2, %ymm7, %ymm2
	vpmovmskb %ymm3, %eax
	vpmovmskb %ymm2, %edx
	/* Use add for macro-fusion.  */
	addq	%rax, %rdx
	jz	L(first_vec_x0_test)
	/* NB: We could move this shift to before the branch and save a
	   bit of code size / performance on the fall through. The
	   branch leads to the null case, which generally seems hotter
	   than a char match in the first 3x VEC.  */
	salq	$32, %rax
	addq	%rdx, %rax
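	/* Illustrative note: %rax now holds ymm3's match mask in bits
	   32-63 and ymm2's in bits 0-31, so a single bsrq locates the
	   last CHAR match across both vectors.  E.g. masks 0x100
	   (ymm2) and 0x4 (ymm3) combine to 0x400000100 and bsrq
	   returns 34, i.e. byte 2 of the second vector.  */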
	bsrq	%rax, %rax
	leaq	1(%rsi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(first_aligned_loop_return):
	VPCMPEQ	%ymm4, %ymm0, %ymm4
	vpmovmskb %ymm4, %edx
	salq	$32, %rcx
	orq	%rdx, %rcx

	vpmovmskb %ymm10, %eax
	vpmovmskb %ymm6, %edx
	salq	$32, %rax
	orq	%rdx, %rax
	blsmskq	%rcx, %rcx
	andq	%rcx, %rax
	jz	L(first_vec_x1_or_x2)

	bsrq	%rax, %rax
	leaq	-(VEC_SIZE * 2)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN

	/* Search char cannot be zero.  */
	.p2align 4
L(second_aligned_loop_set_furthest_match):
	/* Save VEC and pointer from most recent match.  */
L(second_aligned_loop_prep):
	movq	%rdi, %rsi
	vmovdqu	%ymm6, %ymm2
	vmovdqu	%ymm10, %ymm3
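	/* Illustrative note: each time the loop below exits with a
	   CHAR match but no null, the match masks (ymm6/ymm10) and the
	   just-advanced pointer are re-saved here, so ymm2/ymm3/%rsi
	   always describe the furthest match seen so far.  */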

	.p2align 4
L(second_aligned_loop):
	/* Search 2x VEC at a time.  */
	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5

	VPCMPEQ	%ymm4, %ymm7, %ymm6
	VPMIN	%ymm4, %ymm5, %ymm1
	VPCMPEQ	%ymm5, %ymm7, %ymm10
	vpor	%ymm6, %ymm10, %ymm5
	VPCMPEQ	%ymm1, %ymm0, %ymm1
	vpor	%ymm5, %ymm1, %ymm9

	vpmovmskb %ymm9, %eax
	addq	$(VEC_SIZE * 2), %rdi
	testl	%eax, %eax
	jz	L(second_aligned_loop)
	vpmovmskb %ymm1, %ecx
	testl	%ecx, %ecx
	jz	L(second_aligned_loop_set_furthest_match)
	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jnz	L(return_new_match)

	/* This is the hot path. We know CHAR is inbounds and that
	   ymm3/ymm2 hold the latest match.  */
	.p2align 4,, 4
L(return_old_match):
	vpmovmskb %ymm3, %eax
	vpmovmskb %ymm2, %edx
	salq	$32, %rax
	orq	%rdx, %rax
	bsrq	%rax, %rax
	/* Search char cannot be zero so safe to just use lea for
	   wcsrchr.  */
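	/* The displacement below undoes the loop's pointer advance
	   (%rsi was saved after `addq $(VEC_SIZE * 2), %rdi`, so the
	   saved masks describe the vectors at %rsi - 2 * VEC_SIZE)
	   and, for wcsrchr, the extra -(CHAR_SIZE - 1) steps bsrq's
	   last-byte result back to the first byte of the wide
	   character.  */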
	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
	VZEROUPPER_RETURN

	/* Last iteration also potentially has a match.  */
	.p2align 4,, 8
L(return_new_match):
	VPCMPEQ	%ymm4, %ymm0, %ymm4
	vpmovmskb %ymm4, %edx
	salq	$32, %rcx
	orq	%rdx, %rcx

	vpmovmskb %ymm10, %eax
	vpmovmskb %ymm6, %edx
	salq	$32, %rax
	orq	%rdx, %rax
	blsmskq	%rcx, %rcx
	andq	%rcx, %rax
	jz	L(return_old_match)
	bsrq	%rax, %rax
	/* Search char cannot be zero so safe to just use lea for
	   wcsrchr.  */
	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(cross_page):
	movq	%rdi, %rsi
	andq	$-VEC_SIZE, %rsi
	vmovdqu	(%rsi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	/* Shift out zero CHAR matches that are before the beginning of
	   src (rdi).  */
	shrxl	%edi, %ecx, %ecx
	testl	%ecx, %ecx
	jz	L(page_cross_continue)
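	/* Illustrative note: shrx's shift count is taken mod 32, which
	   equals %rdi's misalignment within the VEC_SIZE block, so the
	   shrxl above (and the one below) discards mask bits for bytes
	   that precede the real start of the string.  E.g. with rdi
	   misaligned by 5, bits 0-4 of each mask are shifted out.  */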
	VPCMPEQ	%ymm1, %ymm7, %ymm1
	vpmovmskb %ymm1, %eax

	/* Shift out search CHAR matches that are before the beginning of
	   src (rdi).  */
	shrxl	%edi, %eax, %eax
	blsmskl	%ecx, %ecx
	/* Check if any search CHAR match in range.  */
	andl	%ecx, %eax
	jz	L(ret2)
	bsrl	%eax, %eax
	addq	%rdi, %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
L(ret2):
	VZEROUPPER_RETURN
END(STRRCHR)
#endif