1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
|
/* strchr optimized with SSE2.
Copyright (C) 2009-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <isa-level.h>
/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
so we need this to build for ISA V2 builds. */
#if ISA_SHOULD_BUILD (2)
# ifndef STRCHR
# define STRCHR __strchr_sse2
# endif
# include <sysdep.h>
.text
/* char *strchr (const char *s, int c)
   ABI: SysV AMD64.  In: %rdi = s, %esi = c.
   Out: %rax = pointer to the first byte of s equal to (char) c, or
   NULL if the terminating NUL is reached first.  With AS_STRCHRNUL
   defined, returns a pointer to the terminating NUL instead of NULL.
   Clobbers: rax, rcx, rdx, r8, r9, xmm0-xmm6, flags.  */
ENTRY (STRCHR)
/* Broadcast the low byte of c to all 16 lanes of %xmm1
   (b -> bb -> bbbb -> pshufd replicates the dword), interleaved
   with the page-cross test so neither dependency chain stalls.  */
movd %esi, %xmm1
/* %eax = offset of s within its 4 KiB page.  */
movl %edi, %eax
andl $4095, %eax
punpcklbw %xmm1, %xmm1
/* If s lies within the last 64 bytes of its page (offset > 4032),
   the unaligned 64-byte probe below could fault on the next page;
   take the aligned slow path instead.  */
cmpl $4032, %eax
punpcklwd %xmm1, %xmm1
pshufd $0, %xmm1, %xmm1
jg L(cross_page)
/* Probe the first 16 bytes: a byte matches if it equals c (%xmm1)
   or NUL (%xmm3 == 0); OR the two compare results together.  */
movdqu (%rdi), %xmm0
pxor %xmm3, %xmm3
movdqa %xmm0, %xmm4
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm3, %xmm4
por %xmm4, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
je L(next_48_bytes)
/* Lowest set bit = index of the first byte equal to c or to NUL.  */
bsf %eax, %eax
# ifdef AS_STRCHRNUL
leaq (%rdi,%rax), %rax
# else
/* Plain strchr: if the byte found is not c, we stopped on the
   terminating NUL -- return NULL.  cmovne avoids a branch.  */
movl $0, %edx
leaq (%rdi,%rax), %rax
cmpb %sil, (%rax)
cmovne %rdx, %rax
# endif
ret
.p2align 3
L(next_48_bytes):
/* Scan bytes 16..63, accumulating a 64-bit match mask in %rax where
   bit i set means s[i] == c || s[i] == 0.  Bits 0..15 stay clear --
   the first 16 bytes were already checked above.  */
movdqu 16(%rdi), %xmm0
movdqa %xmm0, %xmm4
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm3, %xmm4
por %xmm4, %xmm0
pmovmskb %xmm0, %ecx
movdqu 32(%rdi), %xmm0
movdqa %xmm0, %xmm4
pcmpeqb %xmm1, %xmm0
salq $16, %rcx
pcmpeqb %xmm3, %xmm4
por %xmm4, %xmm0
pmovmskb %xmm0, %eax
movdqu 48(%rdi), %xmm0
/* Last chunk: %xmm3 (zero) is no longer needed afterwards, so it is
   clobbered directly instead of copying the data register.  */
pcmpeqb %xmm0, %xmm3
salq $32, %rax
pcmpeqb %xmm1, %xmm0
orq %rcx, %rax
por %xmm3, %xmm0
pmovmskb %xmm0, %ecx
salq $48, %rcx
orq %rcx, %rax
testq %rax, %rax
jne L(return)
L(loop_start):
/* We use this alignment to force the loop to be aligned to 8 but not
   16 bytes.  This gives better scheduling on AMD processors.  */
.p2align 4
pxor %xmm6, %xmm6
andq $-64, %rdi
.p2align 3
L(loop64):
/* Main loop: 64 aligned bytes per iteration (%rdi is advanced
   first, so the first iteration rescans part of the initial 64
   bytes -- harmless, they contained no match).  Trick: for each
   16-byte chunk compute min(data ^ c, data); a byte of that min is
   zero iff the data byte equals c or is NUL.  Folding the four mins
   into %xmm5 lets one pcmpeqb/pmovmskb test all 64 bytes.  */
addq $64, %rdi
movdqa (%rdi), %xmm5
movdqa 16(%rdi), %xmm2
movdqa 32(%rdi), %xmm3
pxor %xmm1, %xmm5
movdqa 48(%rdi), %xmm4
pxor %xmm1, %xmm2
pxor %xmm1, %xmm3
pminub (%rdi), %xmm5
pxor %xmm1, %xmm4
pminub 16(%rdi), %xmm2
pminub 32(%rdi), %xmm3
pminub %xmm2, %xmm5
pminub 48(%rdi), %xmm4
pminub %xmm3, %xmm5
pminub %xmm4, %xmm5
pcmpeqb %xmm6, %xmm5
pmovmskb %xmm5, %eax
testl %eax, %eax
je L(loop64)
/* Hit somewhere in these 64 bytes: rebuild the exact per-byte mask.
   %xmm2/%xmm3/%xmm4 still hold min(data ^ c, data) for chunks 1..3;
   chunk 0 is recomputed from memory with explicit compares.  */
movdqa (%rdi), %xmm5
movdqa %xmm5, %xmm0
pcmpeqb %xmm1, %xmm5
pcmpeqb %xmm6, %xmm0
por %xmm0, %xmm5
pcmpeqb %xmm6, %xmm2
pcmpeqb %xmm6, %xmm3
pcmpeqb %xmm6, %xmm4
pmovmskb %xmm5, %ecx
pmovmskb %xmm2, %eax
salq $16, %rax
pmovmskb %xmm3, %r8d
pmovmskb %xmm4, %edx
salq $32, %r8
orq %r8, %rax
orq %rcx, %rax
salq $48, %rdx
orq %rdx, %rax
.p2align 3
L(return):
/* %rax = 64-bit mask of matching bytes relative to %rdi; the lowest
   set bit is the first byte equal to c or to NUL.  */
bsfq %rax, %rax
# ifdef AS_STRCHRNUL
leaq (%rdi,%rax), %rax
# else
/* Return NULL if we stopped on the terminating NUL rather than c.  */
movl $0, %edx
leaq (%rdi,%rax), %rax
cmpb %sil, (%rax)
cmovne %rdx, %rax
# endif
ret
.p2align 4
L(cross_page):
/* s lies in the last 64 bytes of a page.  Read the enclosing
   64-byte-aligned block (%rdx = s & -64; aligned loads never cross
   the page boundary), build the 64-bit match mask for the whole
   block, then shift out the bits preceding s.  */
movq %rdi, %rdx
pxor %xmm2, %xmm2
andq $-64, %rdx
movdqa %xmm1, %xmm0
movdqa (%rdx), %xmm3
movdqa %xmm3, %xmm4
pcmpeqb %xmm1, %xmm3
pcmpeqb %xmm2, %xmm4
por %xmm4, %xmm3
pmovmskb %xmm3, %r8d
movdqa 16(%rdx), %xmm3
movdqa %xmm3, %xmm4
pcmpeqb %xmm1, %xmm3
pcmpeqb %xmm2, %xmm4
por %xmm4, %xmm3
pmovmskb %xmm3, %eax
movdqa 32(%rdx), %xmm3
movdqa %xmm3, %xmm4
pcmpeqb %xmm1, %xmm3
salq $16, %rax
pcmpeqb %xmm2, %xmm4
por %xmm4, %xmm3
pmovmskb %xmm3, %r9d
movdqa 48(%rdx), %xmm3
pcmpeqb %xmm3, %xmm2
salq $32, %r9
pcmpeqb %xmm3, %xmm0
orq %r9, %rax
orq %r8, %rax
por %xmm2, %xmm0
pmovmskb %xmm0, %ecx
salq $48, %rcx
orq %rcx, %rax
/* Discard matches before s: shift the mask right by s's offset
   within the 64-byte block (%cl = %rdi - %rdx, in 0..63).  After
   the shift, bit 0 corresponds to s[0], so L(return)'s
   leaq (%rdi,%rax) computes the correct pointer.  */
movl %edi, %ecx
subb %dl, %cl
shrq %cl, %rax
testq %rax, %rax
jne L(return)
/* No match in the rest of this page: enter the aligned main loop
   (it re-aligns %rdi and advances by 64 before the first scan).  */
jmp L(loop_start)
END (STRCHR)
#endif
|