/* Source path: root/sysdeps/x86_64/fpu/multiarch/svml_s_erff16_core_avx512.S
   Blob: 3bdc906ec875d8f8e8f7e58622a364dcdb7c1b8b  */
/* Function erff vectorized with AVX-512.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
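 *   erf(x) is computed with a simple polynomial evaluated in higher
 *   (double) precision, with no lookup table:
 *
 *     R = P0 + x^2*(P1 + x^2*(P2 + .... x^2*P12));
 *     erf(x) = R * R * x;
 *
 *   Lanes whose x^2 is not below _gf_MaxThreshold_LA (and is not NaN)
 *   return 1.0 with the sign of x instead.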
 *
 *   Special cases:
 *
 *   erf(0)    = 0
 *   erf(+INF) = +1
 *   erf(-INF) = -1
 *   erf(QNaN) = QNaN
 *   erf(SNaN) = QNaN
 *
 */

/* Byte offsets of the entries in the data table
   __svml_serf_data_internal.  Consecutive entries are 64 bytes apart,
   i.e. one full %zmm register each; the code addresses them as
   OFFSET+__svml_serf_data_internal(%rip).  */
/* Entries stored as 16 x 32-bit values.  */
#define _AbsMask			0
#define _One				64
#define _gf_MaxThreshold_LA		128
/* Polynomial coefficients P0..P12, stored as 8 x 64-bit values.  */
#define _gf_la_poly_0			192
#define _gf_la_poly_1			256
#define _gf_la_poly_2			320
#define _gf_la_poly_3			384
#define _gf_la_poly_4			448
#define _gf_la_poly_5			512
#define _gf_la_poly_6			576
#define _gf_la_poly_7			640
#define _gf_la_poly_8			704
#define _gf_la_poly_9			768
#define _gf_la_poly_10			832
#define _gf_la_poly_11			896
#define _gf_la_poly_12			960

#include <sysdep.h>

	.section .text.evex512, "ax", @progbits
/* _ZGVeN16v_erff_skx: erff on the 16 single-precision lanes of %zmm0.

   In:       %zmm0 = x
   Out:      %zmm0 = erf(x), per lane
   Clobbers: %zmm1-%zmm15, %k1
   Leaf routine: makes no calls and does not touch the stack.

   The 16 inputs are split into two groups of 8 and widened to double
   precision.  The polynomial R is evaluated for both groups with two
   interleaved FMA chains, then R*R is narrowed back to single
   precision and multiplied by x.  Lanes that fail the range check
   against _gf_MaxThreshold_LA keep 1.0 with the sign of x.  */
ENTRY(_ZGVeN16v_erff_skx)
	/* Save x: %zmm0 is recycled as a coefficient register below and
	   finally receives the result.  */
	vmovaps	%zmm0, %zmm8

	/* Single-precision x^2, consumed only by the range check.  */
	vmulps	{rn-sae}, %zmm8, %zmm8, %zmm11

	/* Preload coefficients P12..P2.  */
	vmovups	_gf_la_poly_11+__svml_serf_data_internal(%rip), %zmm15
	vmovups	_gf_la_poly_12+__svml_serf_data_internal(%rip), %zmm10
	vmovups	_gf_la_poly_10+__svml_serf_data_internal(%rip), %zmm9
	vmovups	_gf_la_poly_9+__svml_serf_data_internal(%rip), %zmm7
	vmovups	_gf_la_poly_8+__svml_serf_data_internal(%rip), %zmm0
	vmovups	_gf_la_poly_7+__svml_serf_data_internal(%rip), %zmm1
	vmovups	_gf_la_poly_6+__svml_serf_data_internal(%rip), %zmm2
	vmovups	_gf_la_poly_5+__svml_serf_data_internal(%rip), %zmm3
	vmovups	_gf_la_poly_4+__svml_serf_data_internal(%rip), %zmm4
	vmovups	_gf_la_poly_3+__svml_serf_data_internal(%rip), %zmm5
	vmovups	_gf_la_poly_2+__svml_serf_data_internal(%rip), %zmm6

	/* Widen x to double: lanes 0-7 -> %zmm12, lanes 8-15 -> %zmm14.
	   Then square: %zmm12 = x_lo^2, %zmm13 = x_hi^2.  */
	vextractf32x8 $1, %zmm8, %ymm13
	vcvtps2pd {sae}, %ymm8, %zmm12
	vcvtps2pd {sae}, %ymm13, %zmm14
	vmulpd	{rn-sae}, %zmm12, %zmm12, %zmm12
	vmulpd	{rn-sae}, %zmm14, %zmm14, %zmm13

	/* R = P0 + x^2*(P1 + x^2*(P2 + .... x^2*P12)); */
	/* Two interleaved Horner chains:
	   - low lanes:  the accumulator stays in %zmm14 (vfmadd213pd
	     multiplies it by %zmm12 and adds the next coefficient);
	   - high lanes: the accumulator is written into the register
	     holding the next coefficient (vfmadd231pd), so it moves
	     %zmm15 -> %zmm9 -> %zmm7 -> %zmm0 -> ... -> %zmm6 -> %zmm10.
	   Registers vacated by the high chain are reused for the
	   remaining table loads.  */
	vmovaps	%zmm15, %zmm14
	vfmadd231pd {rn-sae}, %zmm12, %zmm10, %zmm14
	vfmadd231pd {rn-sae}, %zmm13, %zmm10, %zmm15
	/* P12 has been consumed by both chains; reuse %zmm10 for P1.  */
	vmovups	_gf_la_poly_1+__svml_serf_data_internal(%rip), %zmm10
	vfmadd213pd {rn-sae}, %zmm9, %zmm12, %zmm14
	vfmadd231pd {rn-sae}, %zmm13, %zmm15, %zmm9
	vfmadd213pd {rn-sae}, %zmm7, %zmm12, %zmm14
	vfmadd231pd {rn-sae}, %zmm13, %zmm9, %zmm7
	vfmadd213pd {rn-sae}, %zmm0, %zmm12, %zmm14
	vfmadd231pd {rn-sae}, %zmm13, %zmm7, %zmm0
	/* High accumulator now lives in %zmm0, so %zmm7 is free.  */
	vmovups	_gf_MaxThreshold_LA+__svml_serf_data_internal(%rip), %zmm7
	vfmadd213pd {rn-sae}, %zmm1, %zmm12, %zmm14
	vfmadd231pd {rn-sae}, %zmm13, %zmm0, %zmm1
	/* High accumulator now lives in %zmm1, so %zmm0 can hold P0.  */
	vmovups	_gf_la_poly_0+__svml_serf_data_internal(%rip), %zmm0
	/* Predicate 22 is NLE_UQ: %k1 is set where the threshold is not
	   <= x^2, i.e. x^2 < threshold or x is NaN.  Only those lanes
	   receive the polynomial result at the end.  */
	vcmpps	$22, {sae}, %zmm11, %zmm7, %k1
	vfmadd213pd {rn-sae}, %zmm2, %zmm12, %zmm14
	vfmadd231pd {rn-sae}, %zmm13, %zmm1, %zmm2
	vfmadd213pd {rn-sae}, %zmm3, %zmm12, %zmm14
	vfmadd231pd {rn-sae}, %zmm13, %zmm2, %zmm3
	vfmadd213pd {rn-sae}, %zmm4, %zmm12, %zmm14
	vfmadd231pd {rn-sae}, %zmm13, %zmm3, %zmm4
	vfmadd213pd {rn-sae}, %zmm5, %zmm12, %zmm14
	vfmadd231pd {rn-sae}, %zmm13, %zmm4, %zmm5
	vfmadd213pd {rn-sae}, %zmm6, %zmm12, %zmm14
	vfmadd231pd {rn-sae}, %zmm13, %zmm5, %zmm6
	/* High accumulator now lives in %zmm6, so %zmm5 is free.  */
	vmovups	_AbsMask+__svml_serf_data_internal(%rip), %zmm5
	vfmadd213pd {rn-sae}, %zmm10, %zmm12, %zmm14
	vfmadd231pd {rn-sae}, %zmm13, %zmm6, %zmm10
	/* %zmm6 = x & ~_AbsMask: the sign bit of each input lane.  */
	vandnps	%zmm8, %zmm5, %zmm6
	/* Last Horner step, written over the x^2 registers:
	   %zmm12 = R for lanes 0-7, %zmm13 = R for lanes 8-15.  */
	vfmadd213pd {rn-sae}, %zmm0, %zmm14, %zmm12
	vfmadd213pd {rn-sae}, %zmm0, %zmm10, %zmm13
	/* Default result: _One with the sign of x.  */
	vorps	_One+__svml_serf_data_internal(%rip), %zmm6, %zmm0
	/* R*R in double, narrowed to single and merged back into one
	   16-lane vector in %zmm9.  */
	vmulpd	{rn-sae}, %zmm12, %zmm12, %zmm1
	vmulpd	{rn-sae}, %zmm13, %zmm13, %zmm3
	vcvtpd2ps {rn-sae}, %zmm1, %ymm2
	vcvtpd2ps {rn-sae}, %zmm3, %ymm4
	vinsertf32x8 $1, %ymm4, %zmm2, %zmm9

	/* erf(x) = R * R * x; */
	/* Merge-masked by %k1: lanes outside the mask keep the signed
	   1.0 already in %zmm0.  */
	vmulps	{rn-sae}, %zmm8, %zmm9, %zmm0{%k1}
	ret

END(_ZGVeN16v_erff_skx)

	.section .rodata, "a"
	.align	64

/* The C-style declaration below is assembled only if
   __svml_serf_data_internal_typedef is defined; it describes the
   layout of the table that follows.  */
#ifdef __svml_serf_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 _AbsMask[16][1];
	__declspec(align(64)) VUINT32 _One[16][1];
	__declspec(align(64)) VUINT32 _gf_MaxThreshold_LA[16][1];
	__declspec(align(64)) VUINT32 _gf_la_poly_0[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_1[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_2[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_3[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_4[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_5[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_6[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_7[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_8[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_9[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_10[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_11[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_12[8][2];
} __svml_serf_data_internal;
#endif

/* Emit one 64-byte table entry holding 16 copies of a 32-bit value,
   then realign to 64 bytes.  */
	.macro	serf_bcast32 bits
	.rept	16
	.long	\bits
	.endr
	.align	64
	.endm

/* Emit one 64-byte table entry holding 8 copies of a 64-bit value,
   then realign to 64 bytes.  */
	.macro	serf_bcast64 bits
	.rept	8
	.quad	\bits
	.endr
	.align	64
	.endm

__svml_serf_data_internal:
	serf_bcast32	0x7fffffff		/* _AbsMask */
	serf_bcast32	0x3f800000		/* _One */
	serf_bcast32	0x41558c5a		/* _gf_MaxThreshold_LA */
	serf_bcast64	0x3ff0fefbd933b903	/* _gf_la_poly_0 */
	serf_bcast64	0xbfc6a948101e6367	/* _gf_la_poly_1 */
	serf_bcast64	0x3fa3a334ce602c6b	/* _gf_la_poly_2 */
	serf_bcast64	0xbf799309ea0c81dc	/* _gf_la_poly_3 */
	serf_bcast64	0x3f476df64a40e392	/* _gf_la_poly_4 */
	serf_bcast64	0xbf0a5216b9508ede	/* _gf_la_poly_5 */
	serf_bcast64	0x3ea5794b95c8e8a0	/* _gf_la_poly_6 */
	serf_bcast64	0x3e94b6c0b485f30f	/* _gf_la_poly_7 */
	serf_bcast64	0xbe65806ce17f0523	/* _gf_la_poly_8 */
	serf_bcast64	0x3e2715640470db47	/* _gf_la_poly_9 */
	serf_bcast64	0xbdddcb2653d80f03	/* _gf_la_poly_10 */
	serf_bcast64	0x3d85eadfc762d3eb	/* _gf_la_poly_11 */
	serf_bcast64	0xbd1c668a2871f0f1	/* _gf_la_poly_12 */
	.type	__svml_serf_data_internal, @object
	.size	__svml_serf_data_internal, .-__svml_serf_data_internal