about summary refs log tree commit diff
path: root/sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core_avx2.S
blob: 4a02eb15f46dc12b7c38bfcc1afe2d55eb0cb7ba (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
/* Function atan vectorized with AVX2.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *      For    0.0    <= x <=  7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
 *      For  7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
 *      For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
 *      For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
 *      For 39.0/16.0 <= x <=    inf   : atan(x) = atan(inf) + atan(s), where s=-1.0/x
 *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
 *
 */

/* Byte offsets of the fields of the data table
   __svml_datan_data_internal_avx512, which is defined at the end of
   this file.  Consecutive fields are 32 bytes (four doubles) apart,
   except that Tbl_H and Tbl_L each span 256 bytes (32 doubles) and
   coeff spans six 32-byte vectors.  The values must stay in sync with
   the order and size of the data emitted for the table.
 */
#define AbsMask				0
#define Shifter				32
#define MaxThreshold			64
#define MOne				96
#define One				128
#define LargeX				160
#define Zero				192
#define Tbl_H				224
#define Tbl_L				480
#define dIndexMed			736
#define Pi2				768
#define Pi2_low				800
#define coeff				832

#include <sysdep.h>

	.section .text.avx2, "ax", @progbits
/*
 * _ZGVdN4v_atan_avx2: atan of four packed doubles.
 *
 *   In:    ymm0 = x (4 x double)
 *   Out:   ymm0 = result (4 x double)
 *   Uses:  ymm1-ymm15, rax, rcx, rdx, rsi, rdi.  No stack, no calls.
 *
 * Per-lane outline of the code below (ck is the vector at coeff + 32*k):
 *   a    = |x|, sign = x ^ a
 *   t    = a + Shifter; Shifter is 1.5 * 2^50, so for a below
 *          MaxThreshold t is a rounded onto a 0.25 grid and the low
 *          mantissa bits of t hold the grid index
 *   b    = t - Shifter
 *   idx  = low bits of t: bits 0-3 give the entry inside a 16-entry
 *          half of Tbl_H, the compare against dIndexMed picks the half
 *   R    = (a - b) / (1 + a*b)          if a <  MaxThreshold
 *   R    = -1 / min(LargeX, a)          if a >= MaxThreshold
 *   T    = Tbl_H[idx]                   (Pi2 if a >= MaxThreshold)
 *   P    = c0*R^10 + c1*R^8 + c2*R^6 + c3*R^4 + c4*R^2 + c5
 *   res  = (T + (R + R^3 * P)) ^ sign
 *
 * Note that the table step here is 0.25 with 32 entries, which is finer
 * than the 0.5-step breakpoints listed in the algorithm description at
 * the top of the file.
 */
ENTRY(_ZGVdN4v_atan_avx2)
	/* rdi -> Tbl_H + 128 (entry 16): displacement -128 reaches entries
	   0-15, displacement 0 reaches entries 16-31.  */
	lea	Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %rdi
	vmovupd	Shifter+__svml_datan_data_internal_avx512(%rip), %ymm4
	vmovupd	One+__svml_datan_data_internal_avx512(%rip), %ymm9

	/* saturate X range */
	vmovupd	LargeX+__svml_datan_data_internal_avx512(%rip), %ymm6
	/* ymm7 = a = |x| (AbsMask is offset 0 of the table) */
	vandpd	AbsMask+__svml_datan_data_internal_avx512(%rip), %ymm0, %ymm7
	/* ymm2 = t = a + Shifter */
	vaddpd	%ymm4, %ymm7, %ymm2
	/* ymm3 = lane mask: a >= MaxThreshold ("large" lanes) */
	vcmpge_oqpd MaxThreshold+__svml_datan_data_internal_avx512(%rip), %ymm7, %ymm3
	/* ymm10 = min(LargeX, a) */
	vminpd	%ymm7, %ymm6, %ymm10
	/* ymm5 = b = t - Shifter */
	vsubpd	%ymm4, %ymm2, %ymm5

	/*
	 * table lookup sequence
	 * VPERMUTE not available
	 */
	/* ymm13 = bits of t shifted left by 3 (index scaled to bytes) */
	vpsllq	$3, %ymm2, %ymm13
	/* ymm8 = a - b */
	vsubpd	%ymm5, %ymm7, %ymm8
	/* ymm2 = lane mask: t >= dIndexMed, i.e. index >= 16 */
	vcmpge_oqpd dIndexMed+__svml_datan_data_internal_avx512(%rip), %ymm2, %ymm2
	/* ymm9 = 1 + b * a */
	vfmadd231pd %ymm7, %ymm5, %ymm9
	/* ymm14 = (index & 15) * 8 in each 64-bit lane */
	vpand	.FLT_11(%rip), %ymm13, %ymm14
	/* ymm11 = numerator: -1 on large lanes, otherwise a - b */
	vblendvpd %ymm3, MOne+__svml_datan_data_internal_avx512(%rip), %ymm8, %ymm11
	/* ymm12 = denominator: min(LargeX, a) on large lanes, otherwise 1 + b*a */
	vblendvpd %ymm3, %ymm10, %ymm9, %ymm12
	/* ymm1 = x ^ |x| = sign bit of x */
	vxorpd	%ymm0, %ymm7, %ymm1

	/* R = DiffX/Y (ymm0) */
	vdivpd	%ymm12, %ymm11, %ymm0
	/* Move the four byte offsets into general registers:
	   rax = lane 0, rdx = lane 1, rcx = lane 2, rsi = lane 3.  */
	vextractf128 $1, %ymm14, %xmm4
	vmovd	%xmm14, %eax
	vmovd	%xmm4, %ecx
	movslq	%eax, %rax
	vpextrd	$2, %xmm14, %edx
	movslq	%ecx, %rcx
	vpextrd	$2, %xmm4, %esi
	movslq	%edx, %rdx
	movslq	%esi, %rsi
	/* For each lane load both candidates, the lower-half entry
	   (displacement -128) and the upper-half entry (displacement 0).
	   Lanes 0/1 are gathered into xmm15 (lower) and xmm8 (upper),
	   lanes 2/3 into xmm6 (lower) and xmm10 (upper).  */
	vmovsd	-128(%rax, %rdi), %xmm15
	vmovsd	(%rdi, %rax), %xmm7
	vmovsd	-128(%rcx, %rdi), %xmm5
	vmovsd	(%rdi, %rcx), %xmm9
	vmovhpd	-128(%rdx, %rdi), %xmm15, %xmm15
	vmovhpd	(%rdi, %rdx), %xmm7, %xmm8
	vmovhpd	-128(%rsi, %rdi), %xmm5, %xmm6
	vmovhpd	(%rdi, %rsi), %xmm9, %xmm10

	/* polynomial evaluation */
	/* ymm5 = R^2, ymm4 = R^4 */
	vmulpd	%ymm0, %ymm0, %ymm5
	vmulpd	%ymm5, %ymm5, %ymm4
	/* ymm11 = lower-half candidates, ymm12 = upper-half candidates */
	vinsertf128 $1, %xmm6, %ymm15, %ymm11
	vinsertf128 $1, %xmm10, %ymm8, %ymm12
	/* ymm13 = table value chosen by the index >= 16 mask */
	vblendvpd %ymm2, %ymm12, %ymm11, %ymm13
	/* ymm8 = c0, ymm2 = c2 */
	vmovupd	coeff+__svml_datan_data_internal_avx512(%rip), %ymm8
	vmovupd	coeff+64+__svml_datan_data_internal_avx512(%rip), %ymm2
	/* ymm6 = R^3 */
	vmulpd	%ymm5, %ymm0, %ymm6
	/* ymm8 = c0*R^2 + c1, ymm2 = c2*R^2 + c3 */
	vfmadd213pd coeff+32+__svml_datan_data_internal_avx512(%rip), %ymm5, %ymm8
	vfmadd213pd coeff+96+__svml_datan_data_internal_avx512(%rip), %ymm5, %ymm2

	/* set table value to Pi/2 for large X */
	vblendvpd %ymm3, Pi2+__svml_datan_data_internal_avx512(%rip), %ymm13, %ymm7
	/* ymm3 = c4 (the large-lane mask is no longer needed) */
	vmovupd	coeff+128+__svml_datan_data_internal_avx512(%rip), %ymm3
	/* ymm8 = (c0*R^2 + c1)*R^4 + (c2*R^2 + c3) */
	vfmadd213pd %ymm2, %ymm4, %ymm8
	/* ymm5 = c4*R^2 + c5 */
	vfmadd213pd coeff+160+__svml_datan_data_internal_avx512(%rip), %ymm3, %ymm5
	/* ymm8 = P */
	vfmadd213pd %ymm5, %ymm4, %ymm8
	/* ymm8 = R + R^3 * P */
	vfmadd213pd %ymm0, %ymm6, %ymm8
	/* add the table value, then put the sign of x back */
	vaddpd	%ymm8, %ymm7, %ymm0
	vxorpd	%ymm1, %ymm0, %ymm0
	ret

END(_ZGVdN4v_atan_avx2)

	.section .rodata, "a"
	.align	32

/* Mask with 0x78 in every 64-bit lane (four lanes, 32 bytes).
   _ZGVdN4v_atan_avx2 ANDs it with the table index shifted left by 3,
   which keeps (index & 15) * 8: the byte offset of a double inside a
   16-entry half of Tbl_H.  */
.FLT_11:
	.long	0x00000078, 0x00000000, 0x00000078, 0x00000000, 0x00000078, 0x00000000, 0x00000078, 0x00000000
	.type	.FLT_11, @object
	.size	.FLT_11, 32
	.align	32

/* Constant table read by _ZGVdN4v_atan_avx2 through the offset macros
   defined near the top of this file.  The C-style declaration inside
   the #ifdef below is only seen by the assembler if
   __svml_datan_data_internal_avx512_typedef is defined; it appears to
   serve as a description of the layout (each VUINT32 [4][2] field is
   32 bytes, each [32][2] field is 256 bytes).  */
#ifdef __svml_datan_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(32)) VUINT32 AbsMask[4][2];
	__declspec(align(32)) VUINT32 Shifter[4][2];
	__declspec(align(32)) VUINT32 MaxThreshold[4][2];
	__declspec(align(32)) VUINT32 MOne[4][2];
	__declspec(align(32)) VUINT32 One[4][2];
	__declspec(align(32)) VUINT32 LargeX[4][2];
	__declspec(align(32)) VUINT32 Zero[4][2];
	__declspec(align(32)) VUINT32 Tbl_H[32][2];
	__declspec(align(32)) VUINT32 Tbl_L[32][2];
	__declspec(align(32)) VUINT32 dIndexMed[4][2];
	__declspec(align(32)) VUINT32 Pi2[4][2];
	__declspec(align(32)) VUINT32 Pi2_low[4][2];
	__declspec(align(32)) VUINT32 coeff[6][4][2];
} __svml_datan_data_internal_avx512;
#endif
__svml_datan_data_internal_avx512:
	/* AbsMask: every bit set except the sign bit of a double */
	.quad	0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
	/* Shifter: 1.5 * 2^50; one ulp at this magnitude is 0.25 */
	.align	32
	.quad	0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000
	/* MaxThreshold: 7.875 */
	.align	32
	.quad	0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000
	/* MOne: -1.0 */
	.align	32
	.quad	0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
	/* One: 1.0 */
	.align	32
	.quad	0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
	/* LargeX: 2^128 */
	.align	32
	.quad	0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000
	/* Zero: 0.0 (not read by _ZGVdN4v_atan_avx2) */
	.align	32
	.quad	0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
	/* Tbl_H: 32 doubles indexed by the 0.25-grid index of |x|.
	   Entry 0 is 0.0 and entry 4 is pi/4 = atan(1.0); presumably
	   entry k is atan(k * 0.25) rounded to double.  */
	.align	32
	.quad	0x0000000000000000, 0x3fcf5b75f92c80dd
	.quad	0x3fddac670561bb4f, 0x3fe4978fa3269ee1
	.quad	0x3fe921fb54442d18, 0x3fecac7c57846f9e
	.quad	0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
	.quad	0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
	.quad	0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
	.quad	0x3ff3fc176b7a8560, 0x3ff45b54837351a0
	.quad	0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
	.quad	0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
	.quad	0x3ff5a25052114e60, 0x3ff5d013c41adabd
	.quad	0x3ff5f97315254857, 0x3ff61f06c6a92b89
	.quad	0x3ff6414d44094c7c, 0x3ff660b02c736a06
	.quad	0x3ff67d8863bc99bd, 0x3ff698213a9d5053
	.quad	0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
	.quad	0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
	.quad	0x3ff7030cf9403197, 0x3ff7145eac2088a4
	/* Tbl_L: 32 doubles, presumably the low-order correction words
	   paired with Tbl_H; not read by _ZGVdN4v_atan_avx2.  */
	.align	32
	.quad	0x0000000000000000, 0x3c68ab6e3cf7afbd
	.quad	0x3c7a2b7f222f65e2, 0x3c72419a87f2a458
	.quad	0x3c81a62633145c07, 0x3c80dae13ad18a6b
	.quad	0x3c7007887af0cbbd, 0xbc9bd0dc231bfd70
	.quad	0x3c9b1b466a88828e, 0xbc9a66b1af5f84fb
	.quad	0x3c96254cb03bb199, 0xbc812c77e8a80f5c
	.quad	0xbc4441a3bd3f1084, 0x3c79e4a72eedacc4
	.quad	0xbc93b03e8a27f555, 0x3c9934f9f2b0020e
	.quad	0xbc996f47948a99f1, 0xbc7df6edd6f1ec3b
	.quad	0x3c78c2d0c89de218, 0x3c9f82bba194dd5d
	.quad	0xbc831151a43b51ca, 0xbc8487d50bceb1a5
	.quad	0xbc9c5f60a65c7397, 0xbc7acb6afb332a0f
	.quad	0xbc99b7bd2e1e8c9c, 0xbc9b9839085189e3
	.quad	0xbc97d1ab82ffb70b, 0x3c99239ad620ffe2
	.quad	0xbc929c86447928e7, 0xbc8957a7170df016
	.quad	0xbc7cbe1896221608, 0xbc9fda5797b32a0b
	/* dIndexMed: the Shifter bit pattern plus 16 in the low mantissa
	   bits; (|x| + Shifter) >= dIndexMed means table index >= 16 */
	.align	32
	.quad	0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010
	/* Pi2: pi/2 rounded to double; table value used when
	   |x| >= MaxThreshold */
	.align	32
	.quad	0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
	/* Pi2_low: presumably the low-order part of pi/2; not read by
	   _ZGVdN4v_atan_avx2 */
	.align	32
	.quad	0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07
	/* coeff: six polynomial coefficients, each replicated across four
	   lanes.  The function reads them as coeff+0 .. coeff+160, with
	   coeff+0 multiplying the highest power and coeff+160 being the
	   constant term of the polynomial in R^2.  */
	.align	32
	.quad	0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
	.quad	0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc
	.quad	0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
	.quad	0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da
	.quad	0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e
	.quad	0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d
	.align	32
	.type	__svml_datan_data_internal_avx512, @object
	.size	__svml_datan_data_internal_avx512, .-__svml_datan_data_internal_avx512