1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
|
/* strchr optimized with SSE2.
Copyright (C) 2009-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <isa-level.h>
/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
so we need this to build for ISA V2 builds. */
#if ISA_SHOULD_BUILD (2)
# ifndef STRCHR
# define STRCHR __strchr_sse2
# endif
# include <sysdep.h>
.text
/* char *strchr (const char *s, int c)
   ABI: SysV AMD64.  In: %rdi = s, %esi = c.
   Out: %rax = pointer to the first byte of s equal to (char) c, or
   NULL if the terminating NUL is reached first.  With AS_STRCHRNUL
   defined, returns a pointer to the terminating NUL instead of NULL.
   Clobbers: rax, rcx, rdx, r8, r9, xmm0-xmm6, flags.  */
ENTRY (STRCHR)
/* Broadcast the low byte of c to all 16 lanes of %xmm1
   (b -> bb -> bbbb -> pshufd replicates the dword), interleaved
   with the page-cross test so neither dependency chain stalls.  */
movd %esi, %xmm1
/* %eax = offset of s within its 4 KiB page.  */
movl %edi, %eax
andl $4095, %eax
punpcklbw %xmm1, %xmm1
/* If s lies within the last 64 bytes of its page (offset > 4032),
   the unaligned 64-byte probe below could fault on the next page;
   take the aligned slow path instead.  */
cmpl $4032, %eax
punpcklwd %xmm1, %xmm1
pshufd $0, %xmm1, %xmm1
jg L(cross_page)
/* Probe the first 16 bytes: a byte matches if it equals c (%xmm1)
   or NUL (%xmm3 == 0); OR the two compare results together.  */
movdqu (%rdi), %xmm0
pxor %xmm3, %xmm3
movdqa %xmm0, %xmm4
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm3, %xmm4
por %xmm4, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
je L(next_48_bytes)
/* Lowest set bit = index of the first byte equal to c or to NUL.  */
bsf %eax, %eax
# ifdef AS_STRCHRNUL
leaq (%rdi,%rax), %rax
# else
/* Plain strchr: if the byte found is not c, we stopped on the
   terminating NUL -- return NULL.  cmovne avoids a branch.  */
movl $0, %edx
leaq (%rdi,%rax), %rax
cmpb %sil, (%rax)
cmovne %rdx, %rax
# endif
ret
.p2align 3
L(next_48_bytes):
/* Scan bytes 16..63, accumulating a 64-bit match mask in %rax where
   bit i set means s[i] == c || s[i] == 0.  Bits 0..15 stay clear --
   the first 16 bytes were already checked above.  */
movdqu 16(%rdi), %xmm0
movdqa %xmm0, %xmm4
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm3, %xmm4
por %xmm4, %xmm0
pmovmskb %xmm0, %ecx
movdqu 32(%rdi), %xmm0
movdqa %xmm0, %xmm4
pcmpeqb %xmm1, %xmm0
salq $16, %rcx
pcmpeqb %xmm3, %xmm4
por %xmm4, %xmm0
pmovmskb %xmm0, %eax
movdqu 48(%rdi), %xmm0
/* Last chunk: %xmm3 (zero) is no longer needed afterwards, so it is
   clobbered directly instead of copying the data register.  */
pcmpeqb %xmm0, %xmm3
salq $32, %rax
pcmpeqb %xmm1, %xmm0
orq %rcx, %rax
por %xmm3, %xmm0
pmovmskb %xmm0, %ecx
salq $48, %rcx
orq %rcx, %rax
testq %rax, %rax
jne L(return)
L(loop_start):
/* We use this alignment to force the loop to be aligned to 8 but not
   16 bytes.  This gives better scheduling on AMD processors.  */
.p2align 4
pxor %xmm6, %xmm6
andq $-64, %rdi
.p2align 3
L(loop64):
/* Main loop: 64 aligned bytes per iteration (%rdi is advanced
   first, so the first iteration rescans part of the initial 64
   bytes -- harmless, they contained no match).  Trick: for each
   16-byte chunk compute min(data ^ c, data); a byte of that min is
   zero iff the data byte equals c or is NUL.  Folding the four mins
   into %xmm5 lets one pcmpeqb/pmovmskb test all 64 bytes.  */
addq $64, %rdi
movdqa (%rdi), %xmm5
movdqa 16(%rdi), %xmm2
movdqa 32(%rdi), %xmm3
pxor %xmm1, %xmm5
movdqa 48(%rdi), %xmm4
pxor %xmm1, %xmm2
pxor %xmm1, %xmm3
pminub (%rdi), %xmm5
pxor %xmm1, %xmm4
pminub 16(%rdi), %xmm2
pminub 32(%rdi), %xmm3
pminub %xmm2, %xmm5
pminub 48(%rdi), %xmm4
pminub %xmm3, %xmm5
pminub %xmm4, %xmm5
pcmpeqb %xmm6, %xmm5
pmovmskb %xmm5, %eax
testl %eax, %eax
je L(loop64)
/* Hit somewhere in these 64 bytes: rebuild the exact per-byte mask.
   %xmm2/%xmm3/%xmm4 still hold min(data ^ c, data) for chunks 1..3;
   chunk 0 is recomputed from memory with explicit compares.  */
movdqa (%rdi), %xmm5
movdqa %xmm5, %xmm0
pcmpeqb %xmm1, %xmm5
pcmpeqb %xmm6, %xmm0
por %xmm0, %xmm5
pcmpeqb %xmm6, %xmm2
pcmpeqb %xmm6, %xmm3
pcmpeqb %xmm6, %xmm4
pmovmskb %xmm5, %ecx
pmovmskb %xmm2, %eax
salq $16, %rax
pmovmskb %xmm3, %r8d
pmovmskb %xmm4, %edx
salq $32, %r8
orq %r8, %rax
orq %rcx, %rax
salq $48, %rdx
orq %rdx, %rax
.p2align 3
L(return):
/* %rax = 64-bit mask of matching bytes relative to %rdi; the lowest
   set bit is the first byte equal to c or to NUL.  */
bsfq %rax, %rax
# ifdef AS_STRCHRNUL
leaq (%rdi,%rax), %rax
# else
/* Return NULL if we stopped on the terminating NUL rather than c.  */
movl $0, %edx
leaq (%rdi,%rax), %rax
cmpb %sil, (%rax)
cmovne %rdx, %rax
# endif
ret
.p2align 4
L(cross_page):
/* s lies in the last 64 bytes of a page.  Read the enclosing
   64-byte-aligned block (%rdx = s & -64; aligned loads never cross
   the page boundary), build the 64-bit match mask for the whole
   block, then shift out the bits preceding s.  */
movq %rdi, %rdx
pxor %xmm2, %xmm2
andq $-64, %rdx
movdqa %xmm1, %xmm0
movdqa (%rdx), %xmm3
movdqa %xmm3, %xmm4
pcmpeqb %xmm1, %xmm3
pcmpeqb %xmm2, %xmm4
por %xmm4, %xmm3
pmovmskb %xmm3, %r8d
movdqa 16(%rdx), %xmm3
movdqa %xmm3, %xmm4
pcmpeqb %xmm1, %xmm3
pcmpeqb %xmm2, %xmm4
por %xmm4, %xmm3
pmovmskb %xmm3, %eax
movdqa 32(%rdx), %xmm3
movdqa %xmm3, %xmm4
pcmpeqb %xmm1, %xmm3
salq $16, %rax
pcmpeqb %xmm2, %xmm4
por %xmm4, %xmm3
pmovmskb %xmm3, %r9d
movdqa 48(%rdx), %xmm3
pcmpeqb %xmm3, %xmm2
salq $32, %r9
pcmpeqb %xmm3, %xmm0
orq %r9, %rax
orq %r8, %rax
por %xmm2, %xmm0
pmovmskb %xmm0, %ecx
salq $48, %rcx
orq %rcx, %rax
/* Discard matches before s: shift the mask right by s's offset
   within the 64-byte block (%cl = %rdi - %rdx, in 0..63).  After
   the shift, bit 0 corresponds to s[0], so L(return)'s
   leaq (%rdi,%rax) computes the correct pointer.  */
movl %edi, %ecx
subb %dl, %cl
shrq %cl, %rax
testq %rax, %rax
jne L(return)
/* No match in the rest of this page: enter the aligned main loop
   (it re-aligns %rdi and advances by 64 before the first scan).  */
jmp L(loop_start)
END (STRCHR)
#endif
|