1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
|
/* Function sincosf vectorized with SSE4.
Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_trig_data.h"
.text
ENTRY (_ZGVbN4vl4l4_sincosf_sse4)
/*
 Vectorized sincosf, 4 lanes, SSE4, SysV AMD64.
 In:  xmm0 = 4 packed float arguments
      rdi  = pointer receiving 4 packed sin results (written with movaps,
             so it is presumably 16-byte aligned per the vl4l4 contract)
      rsi  = pointer receiving 4 packed cos results (written with movups)
 Lanes whose |x| exceeds __sRangeReductionVal fall back to scalar
 sinf/cosf calls (slow path below).

 ALGORITHM DESCRIPTION:
 1) Range reduction to [-Pi/4; +Pi/4] interval
 a) Grab sign from source argument and save it.
 b) Remove sign using AND operation
 c) Getting octant Y by 2/Pi multiplication
 d) Add "Right Shifter" value
 e) Treat obtained value as integer S for destination sign setting.
 SS = ((S-S&1)&2)<<30; For sin part
 SC = ((S+S&1)&2)<<30; For cos part
 f) Change destination sign if source sign is negative
 using XOR operation.
 g) Subtract "Right Shifter" (0x4B000000) value
 h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts:
 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
 a) Calculate X^2 = X * X
 b) Calculate 2 polynomials for sin and cos:
 RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
 RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))));
 c) Swap RS & RC if first bit of obtained value after
 Right Shifting is set to 1. Using And, Andnot & Or operations.
 3) Destination sign setting
 a) Set shifted destination sign using XOR operation:
 R1 = XOR( RS, SS );
 R2 = XOR( RC, SC ). */
/* Frame: rbp saved, rsp aligned down to 64, then 320 bytes of locals.
   Local area holds callee-saved xmm spills (0..176), the original
   argument (128), and the sin/cos result staging buffers (192/256)
   used by the scalar slow path.  */
pushq %rbp
cfi_adjust_cfa_offset (8)
cfi_rel_offset (%rbp, 0)
movq %rsp, %rbp
cfi_def_cfa_register (%rbp)
andq $-64, %rsp
subq $320, %rsp
/* rax = base of the SVML single-precision trig constant table.  */
movq __svml_s_trig_data@GOTPCREL(%rip), %rax
/* Spill xmm regs this routine reuses (restored on the fast exit path).  */
movups %xmm12, 176(%rsp)
movups %xmm9, 160(%rsp)
movups __sAbsMask(%rax), %xmm12
/* Absolute argument computation */
movaps %xmm12, %xmm5
andnps %xmm0, %xmm12
/* xmm12 now holds the 4 sign bits of the input; xmm5 = |x|.  */
movups __sInvPI(%rax), %xmm7
andps %xmm0, %xmm5
/* c) Getting octant Y by 2/Pi multiplication
 d) Add "Right Shifter" value. */
mulps %xmm5, %xmm7
movups %xmm10, 144(%rsp)
movups __sPI1(%rax), %xmm10
/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts:
 X = X - Y*PI1 - Y*PI2 - Y*PI3. */
movaps %xmm10, %xmm1
addps __sRShifter(%rax), %xmm7
/* e) Treat obtained value as integer S for destination sign setting */
movaps %xmm7, %xmm9
/* g) Subtract "Right Shifter" (0x4B000000) value */
subps __sRShifter(%rax), %xmm7
mulps %xmm7, %xmm1
/* Move the low bit of the octant integer into the float sign position.  */
pslld $31, %xmm9
movups __sPI2(%rax), %xmm6
movups %xmm13, 112(%rsp)
movaps %xmm5, %xmm13
movaps %xmm6, %xmm2
subps %xmm1, %xmm13
mulps %xmm7, %xmm2
movups __sSignMask(%rax), %xmm3
movaps %xmm5, %xmm1
movups __sOneHalf(%rax), %xmm4
subps %xmm2, %xmm13
/* Per-lane test |x| > __sRangeReductionVal; lanes that fail go to the
   scalar slow path via the movmskps mask below.  */
cmpnleps __sRangeReductionVal(%rax), %xmm5
movaps %xmm3, %xmm2
andps %xmm13, %xmm2
xorps %xmm2, %xmm4
/* Result sign calculations */
xorps %xmm2, %xmm3
xorps %xmm9, %xmm3
/* Add correction term 0.5 for cos() part */
addps %xmm7, %xmm4
/* ecx = 4-bit mask of out-of-range lanes.  */
movmskps %xmm5, %ecx
/* Reduce both arguments: xmm13 accumulates the sin-part reduced x
   (against octant y in xmm7), xmm1 the cos-part (against y+0.5 in xmm4),
   each subtracting y*PI1, y*PI2, y*PI3, y*PI4 in turn.  */
mulps %xmm4, %xmm10
mulps %xmm4, %xmm6
subps %xmm10, %xmm1
movups __sPI3(%rax), %xmm10
subps %xmm6, %xmm1
movaps %xmm10, %xmm6
mulps %xmm7, %xmm6
mulps %xmm4, %xmm10
subps %xmm6, %xmm13
subps %xmm10, %xmm1
movups __sPI4(%rax), %xmm6
mulps %xmm6, %xmm7
mulps %xmm6, %xmm4
subps %xmm7, %xmm13
subps %xmm4, %xmm1
/* Apply octant-derived signs to the reduced arguments.  */
xorps %xmm9, %xmm13
xorps %xmm3, %xmm1
movaps %xmm13, %xmm4
movaps %xmm1, %xmm2
mulps %xmm13, %xmm4
mulps %xmm1, %xmm2
movups __sA9(%rax), %xmm7
/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
 a) Calculate X^2 = X * X
 b) Calculate 2 polynomials for sin and cos:
 RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
 RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))))
 Horner evaluation with coefficients A9..A3, run twice in lockstep:
 xmm3/xmm4 on the sin-part square, xmm7/xmm2 on the cos-part square.  */
movaps %xmm7, %xmm3
mulps %xmm4, %xmm3
mulps %xmm2, %xmm7
addps __sA7(%rax), %xmm3
addps __sA7(%rax), %xmm7
mulps %xmm4, %xmm3
mulps %xmm2, %xmm7
addps __sA5(%rax), %xmm3
addps __sA5(%rax), %xmm7
mulps %xmm4, %xmm3
mulps %xmm2, %xmm7
addps __sA3(%rax), %xmm3
addps __sA3(%rax), %xmm7
mulps %xmm3, %xmm4
mulps %xmm7, %xmm2
mulps %xmm13, %xmm4
mulps %xmm1, %xmm2
addps %xmm4, %xmm13
addps %xmm2, %xmm1
/* Restore the original argument's sign on the sin result.  */
xorps %xmm12, %xmm13
testl %ecx, %ecx
jne .LBL_1_3
.LBL_1_2:
/* Fast-path exit: store results, restore spilled regs and frame.  */
cfi_remember_state
movups 160(%rsp), %xmm9
movaps %xmm13, (%rdi)
movups 144(%rsp), %xmm10
movups 176(%rsp), %xmm12
movups 112(%rsp), %xmm13
movups %xmm1, (%rsi)
movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
popq %rbp
cfi_adjust_cfa_offset (-8)
cfi_restore (%rbp)
ret
.LBL_1_3:
/* Slow path: at least one lane is out of range.  Stage the vector
   results and the original argument, then recompute the flagged lanes
   with scalar sinf/cosf.  */
cfi_restore_state
movups %xmm0, 128(%rsp)
movups %xmm13, 192(%rsp)
movups %xmm1, 256(%rsp)
/* NOTE(review): this je falls after a jne on the same flags, so it
   appears unreachable here — kept as in the original.  */
je .LBL_1_2
/* r12b = iteration counter, r13d = current lane-pair bit index,
   r14d = out-of-range mask, rbx/64(%rsp) preserve rdi/rsi across calls.  */
xorb %dl, %dl
xorl %eax, %eax
movups %xmm8, 48(%rsp)
movups %xmm11, 32(%rsp)
movups %xmm14, 16(%rsp)
movups %xmm15, (%rsp)
movq %rsi, 64(%rsp)
movq %r12, 104(%rsp)
cfi_offset_rel_rsp (12, 104)
movb %dl, %r12b
movq %r13, 96(%rsp)
cfi_offset_rel_rsp (13, 96)
movl %eax, %r13d
movq %r14, 88(%rsp)
cfi_offset_rel_rsp (14, 88)
movl %ecx, %r14d
movq %r15, 80(%rsp)
cfi_offset_rel_rsp (15, 80)
movq %rbx, 72(%rsp)
movq %rdi, %rbx
cfi_remember_state
.LBL_1_6:
/* Even lane of the current pair flagged?  */
btl %r13d, %r14d
jc .LBL_1_13
.LBL_1_7:
/* Odd lane of the current pair flagged?  */
lea 1(%r13), %esi
btl %esi, %r14d
jc .LBL_1_10
.LBL_1_8:
/* Advance to the next lane pair.  The loop runs 16 iterations although
   the movmskps mask only has 4 significant bits — presumably a generic
   template shared with wider vector variants; extra iterations are
   no-ops since the high mask bits are zero.  */
incb %r12b
addl $2, %r13d
cmpb $16, %r12b
jb .LBL_1_6
/* Slow-path done: restore everything and rejoin the fast exit with the
   patched results reloaded into xmm13/xmm1.  */
movups 48(%rsp), %xmm8
movq %rbx, %rdi
movups 32(%rsp), %xmm11
movups 16(%rsp), %xmm14
movups (%rsp), %xmm15
movq 64(%rsp), %rsi
movq 104(%rsp), %r12
cfi_restore (%r12)
movq 96(%rsp), %r13
cfi_restore (%r13)
movq 88(%rsp), %r14
cfi_restore (%r14)
movq 80(%rsp), %r15
cfi_restore (%r15)
movq 72(%rsp), %rbx
movups 192(%rsp), %xmm13
movups 256(%rsp), %xmm1
jmp .LBL_1_2
.LBL_1_10:
/* Odd lane: element at byte offset 132 + 8*k of the staged argument;
   write scalar sinf/cosf results into the staging buffers.  */
cfi_restore_state
movzbl %r12b, %r15d
movss 132(%rsp,%r15,8), %xmm0
call JUMPTARGET(sinf)
movss %xmm0, 196(%rsp,%r15,8)
movss 132(%rsp,%r15,8), %xmm0
call JUMPTARGET(cosf)
movss %xmm0, 260(%rsp,%r15,8)
jmp .LBL_1_8
.LBL_1_13:
/* Even lane: element at byte offset 128 + 8*k of the staged argument.  */
movzbl %r12b, %r15d
movss 128(%rsp,%r15,8), %xmm0
call JUMPTARGET(sinf)
movss %xmm0, 192(%rsp,%r15,8)
movss 128(%rsp,%r15,8), %xmm0
call JUMPTARGET(cosf)
movss %xmm0, 256(%rsp,%r15,8)
jmp .LBL_1_7
END (_ZGVbN4vl4l4_sincosf_sse4)
libmvec_hidden_def(_ZGVbN4vl4l4_sincosf_sse4)
/* vvv version implemented with wrapper to vl4l4 variant. */
ENTRY (_ZGVbN4vvv_sincosf_sse4)
/* vvv variant: each lane gets its own destination pointer.
   In:  xmm0 = 4 packed float arguments
        LP64: xmm1/xmm2 = 4 sin* pointers (2 qwords each),
              xmm3/xmm4 = 4 cos* pointers
        ILP32 (x32): xmm1 = 4 sin* pointers (dwords), xmm2 = 4 cos*.
   Strategy: stash the pointer vectors on the stack, call the vl4l4
   variant with stack buffers as rdi/rsi, then scatter each scalar
   result through its saved pointer.  */
#ifndef __ILP32__
subq $104, %rsp
.cfi_def_cfa_offset 112
/* Stack layout: 0..15 sin buffer, 16..31 cos buffer,
   32..63 sin pointers, 64..95 cos pointers.  */
movdqu %xmm1, 32(%rsp)
lea (%rsp), %rdi
movdqu %xmm2, 48(%rdi)
lea 16(%rsp), %rsi
movdqu %xmm3, 48(%rsi)
movdqu %xmm4, 64(%rsi)
call HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4)
/* Scatter the 4 sin results through their pointers...  */
movq 32(%rsp), %rdx
movq 40(%rsp), %rsi
movq 48(%rsp), %r8
movq 56(%rsp), %r10
movl (%rsp), %eax
movl 4(%rsp), %ecx
movl 8(%rsp), %edi
movl 12(%rsp), %r9d
movl %eax, (%rdx)
movl %ecx, (%rsi)
/* ...then the 4 cos results.  */
movq 64(%rsp), %rax
movq 72(%rsp), %rcx
movl %edi, (%r8)
movl %r9d, (%r10)
movq 80(%rsp), %rdi
movq 88(%rsp), %r9
movl 16(%rsp), %r11d
movl 20(%rsp), %edx
movl 24(%rsp), %esi
movl 28(%rsp), %r8d
movl %r11d, (%rax)
movl %edx, (%rcx)
movl %esi, (%rdi)
movl %r8d, (%r9)
addq $104, %rsp
.cfi_def_cfa_offset 8
ret
#else
/* x32: pointers are 32-bit.  Layout: 0..15 cos pointers,
   16..31 sin pointers, 32..47 sin buffer, 48..63 cos buffer.
   NOTE(review): lea uses %rsp while the moves use %esp — valid on x32
   where the high half of rsp is zero; kept as in the original.  */
subl $72, %esp
.cfi_def_cfa_offset 80
leal 48(%rsp), %esi
movaps %xmm1, 16(%esp)
leal 32(%rsp), %edi
movaps %xmm2, (%esp)
call HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4)
/* Scatter sin results (buffer at 32) through pointers at 16..28.  */
movl 16(%esp), %eax
movss 32(%esp), %xmm0
movss %xmm0, (%eax)
movl 20(%esp), %eax
movss 36(%esp), %xmm0
movss %xmm0, (%eax)
movl 24(%esp), %eax
movss 40(%esp), %xmm0
movss %xmm0, (%eax)
movl 28(%esp), %eax
movss 44(%esp), %xmm0
movss %xmm0, (%eax)
/* Scatter cos results (buffer at 48) through pointers at 0..12.  */
movl (%esp), %eax
movss 48(%esp), %xmm0
movss %xmm0, (%eax)
movl 4(%esp), %eax
movss 52(%esp), %xmm0
movss %xmm0, (%eax)
movl 8(%esp), %eax
movss 56(%esp), %xmm0
movss %xmm0, (%eax)
movl 12(%esp), %eax
movss 60(%esp), %xmm0
movss %xmm0, (%eax)
addl $72, %esp
.cfi_def_cfa_offset 8
ret
#endif
END (_ZGVbN4vvv_sincosf_sse4)
|