about summary refs log tree commit diff
path: root/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S
blob: b3477a346b08f4cadf62234b04c755f86c08f041 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
/* Function tanh vectorized with AVX-512.
   Copyright (C) 2021-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   NOTE: Since the hyperbolic tangent function is odd
 *         (tanh(x) = -tanh(-x)), below algorithm deals with the absolute
 *         value of the argument |x|: tanh(x) = sign(x) * tanh(|x|)
 *
 *   We use a table lookup method to compute tanh(|x|).
 *   The basic idea is to split the input range into a number of subintervals
 *   and to approximate tanh(.) with a polynomial on each of them.
 *
 *   IEEE SPECIAL CONDITIONS:
 *   x = [+, -]0, r = [+, -]0
 *   x = +Inf,   r = +1
 *   x = -Inf,   r = -1
 *   x = QNaN,   r = QNaN
 *   x = SNaN,   r = QNaN
 *
 *
 *   ALGORITHM DETAILS
 *   We handle special values in a callout function, aside from main path
 *   computations. "Special" for this algorithm are:
 *   INF, NAN, |x| > HUGE_THRESHOLD
 *
 *
 *   Main path computations are organized as follows:
 *   Actually we split the interval [0, SATURATION_THRESHOLD)
 *   into a number of subintervals.  On each subinterval we approximate tanh(.)
 *   with a minimax polynomial of pre-defined degree. Polynomial coefficients
 *   are computed beforehand and stored in table. We also use
 *
 *       y := |x| + B,
 *
 *   where B depends on the subinterval and is used to bring the argument
 *   closer to zero.  (The code below stores C = -B in the _dC table and
 *   computes y as |x| - C.)
 *   We also add large fake interval [SATURATION_THRESHOLD, HUGE_THRESHOLD],
 *   where 1.0 + 0.0*y + 0.0*y^2 ... coefficients are stored - just to
 *   preserve main path computation logic but return 1.0 for all arguments.
 *
 *   Hence reconstruction looks as follows:
 *   we extract proper polynomial and range reduction coefficients
 *        (Pj and B), corresponding to subinterval, to which |x| belongs,
 *        and return
 *
 *       r := sign(x) * (P0 + P1 * y + ... + Pn * y^n)
 *
 *   NOTE: we use multiprecision technique to multiply and sum the first
 *         K terms of the polynomial. So Pj, j = 0..K are stored in
 *         table each as a pair of target precision numbers (Pj and PLj) to
 *         achieve wider than target precision.
 *
 *
 */

/* Offsets for data table __svml_dtanh_data_internal
 *
 * Every value is a byte offset from the start of the table and must stay
 * in sync with the data emitted at the __svml_dtanh_data_internal label.
 *
 *   _dC, _dP0 .. _dP17
 *       Spaced 128 bytes apart: 16 doubles each, one entry per table
 *       index.  The code loads the first 64 bytes (entries 0..7) into a
 *       register and passes "offset + 64" (entries 8..15) as the second
 *       table of a vpermt2pd lookup.
 *   _iExpMantMask_UISA .. _iMaxIdxMask
 *       Spaced 64 bytes apart: one 64-byte vector of replicated constants
 *       each (16 dwords or 8 qwords).
 *
 * Of these, _dP1, _iExpMantMask, _iMinIdxOfsMask and _iMaxIdxMask are not
 * referenced by the function in this file; the code uses the *_UISA
 * variants together with _iExpMask.
 */
#define _dC				0
#define _dP0				128
#define _dP1				256
#define _dP2				384
#define _dP3				512
#define _dP4				640
#define _dP5				768
#define _dP6				896
#define _dP7				1024
#define _dP8				1152
#define _dP9				1280
#define _dP10				1408
#define _dP11				1536
#define _dP12				1664
#define _dP13				1792
#define _dP14				1920
#define _dP15				2048
#define _dP16				2176
#define _dP17				2304
#define _iExpMantMask_UISA		2432
#define _iMinIdxOfsMask_UISA		2496
#define _iMaxIdxMask_UISA		2560
#define _dbSignMask			2624
#define _dbAbsMask			2688
#define _iExpMantMask			2752
#define _iExpMask			2816
#define _iMinIdxOfsMask			2880
#define _iMaxIdxMask			2944

#include <sysdep.h>

	.section .text.evex512, "ax", @progbits
/* _ZGVeN8v_tanh_skx
 *
 * In:   %zmm0 = eight double-precision arguments x.
 * Out:  %zmm0 = eight results.
 *
 * Main path (all lanes at once):
 *   1. Derive a table index 0..15 per lane from the high 32 bits of x.
 *   2. Gather C and the coefficients P0, P2..P17 for that index with
 *      vpermt2pd (each table is split into two 8-entry halves).
 *   3. y = |x| - C, then evaluate by Horner's scheme
 *          P0 + y*(P2 + y*(P3 + ... + y*P17))
 *   4. OR the sign bits of x back into the result.
 * Lanes flagged in the special-input mask are afterwards recomputed one by
 * one through the scalar tanh.
 *
 * Stack frame (rsp aligned to 64, 320 bytes):
 *     0(%rsp)  copy of x; in the special path reused as the save area
 *              for r14 / r13 / r12 at 0 / 8 / 16
 *    64(%rsp)  sign bits of x; in the special path the copy of x
 *   128(%rsp)  gathered P0; in the special path the result vector
 *   192(%rsp)  gathered P2
 *   256(%rsp)  gathered P3
 */
ENTRY(_ZGVeN8v_tanh_skx)
	/* Frame pointer based prologue; rsp is realigned to 64 bytes so it
	   is restored from rbp on exit.  */
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-64, %rsp
	subq	$320, %rsp
	/* zmm4 = high dword of every input, moved to the low half of its
	   qword lane.  */
	vpsrlq	$32, %zmm0, %zmm4
	/* Keep the untouched input for the special-input path.  */
	vmovups	%zmm0, (%rsp)
	/* Low halves (entries 0..7) of the _dC and _dP0 tables.  */
	vmovups	__svml_dtanh_data_internal(%rip), %zmm14
	vmovups	_dP0+__svml_dtanh_data_internal(%rip), %zmm15
	/* ymm5 = the eight high dwords packed as 32-bit elements.  */
	vpmovqd	%zmm4, %ymm5

	/* Split x into |x| (zmm13) and its sign bits (zmm3).  */
	vandpd	_dbAbsMask+__svml_dtanh_data_internal(%rip), %zmm0, %zmm13
	vandpd	_dbSignMask+__svml_dtanh_data_internal(%rip), %zmm0, %zmm3

	/* ymm7 = high dword masked with _iExpMantMask_UISA.  It feeds both
	   the table index below and the special-input test further down
	   (huge arguments, INF and NaNs are filtered out to callout).  */
	vpand	_iExpMantMask_UISA+__svml_dtanh_data_internal(%rip), %ymm5, %ymm7
	/* Low halves of more coefficient tables; zmm0/zmm4/zmm5 are free for
	   reuse because the input was saved and split above.  */
	vmovups	_dP2+__svml_dtanh_data_internal(%rip), %zmm0
	vmovups	_dP16+__svml_dtanh_data_internal(%rip), %zmm4
	vmovups	_dP15+__svml_dtanh_data_internal(%rip), %zmm5
	/* Spill the sign bits so zmm3 can hold the _dP3 table.  */
	vmovups	%zmm3, 64(%rsp)
	vmovups	_dP3+__svml_dtanh_data_internal(%rip), %zmm3
	/* Rebase so that the first subinterval maps to zero.  */
	vpsubd	_iMinIdxOfsMask_UISA+__svml_dtanh_data_internal(%rip), %ymm7, %ymm8

	/* Clamp the rebased value to [0, _iMaxIdxMask_UISA] with signed
	   max/min, then shift right by 19 to get the table index.  */
	vxorps	%ymm9, %ymm9, %ymm9
	vpmaxsd	%ymm9, %ymm8, %ymm10
	vpminsd	_iMaxIdxMask_UISA+__svml_dtanh_data_internal(%rip), %ymm10, %ymm11
	vpsrld	$19, %ymm11, %ymm12
	vmovups	_dP12+__svml_dtanh_data_internal(%rip), %zmm8
	vmovups	_dP11+__svml_dtanh_data_internal(%rip), %zmm9
	vmovups	_dP10+__svml_dtanh_data_internal(%rip), %zmm10
	vmovups	_dP9+__svml_dtanh_data_internal(%rip), %zmm11
	/* zmm2 = indices widened to 64 bits, as vpermt2pd expects.  */
	vpmovzxdq %ymm12, %zmm2
	vmovups	_dP8+__svml_dtanh_data_internal(%rip), %zmm12
	/* Table lookups: the destination register holds entries 0..7, the
	   memory operand (+64) holds entries 8..15, zmm2 selects per lane.
	   The destination is overwritten with the selected entries.  */
	vpermt2pd _dP2+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm0
	vpermt2pd _dC+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm14
	vpermt2pd _dP16+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm4
	vpermt2pd _dP15+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm5
	/* zmm1 = y = |x| - C.  */
	vsubpd	{rn-sae}, %zmm14, %zmm13, %zmm1
	vpermt2pd _dP12+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm8
	vpermt2pd _dP11+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm9
	vpermt2pd _dP10+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm10
	vpermt2pd _dP9+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm11
	vpermt2pd _dP8+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm12
	vpermt2pd _dP3+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm3
	vpermt2pd _dP0+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm15
	/* Spill the gathered P2, P3 and P0 to free zmm0, zmm3 and zmm15 for
	   the next group of tables.  */
	vmovups	%zmm0, 192(%rsp)
	vmovups	_dP17+__svml_dtanh_data_internal(%rip), %zmm0
	vmovups	_dP7+__svml_dtanh_data_internal(%rip), %zmm13
	vmovups	_dP6+__svml_dtanh_data_internal(%rip), %zmm14
	vmovups	%zmm3, 256(%rsp)
	vmovups	_dP5+__svml_dtanh_data_internal(%rip), %zmm3
	vmovups	%zmm15, 128(%rsp)
	vmovups	_dP4+__svml_dtanh_data_internal(%rip), %zmm15
	vpermt2pd _dP17+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm0
	vpermt2pd _dP7+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm13
	vpermt2pd _dP6+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm14
	vpermt2pd _dP5+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm3
	vpermt2pd _dP4+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm15
	/* Horner evaluation.  zmm0 starts as P17 and every step computes
	   zmm0 = zmm0 * y + Pk.  This step adds P16.  */
	vfmadd213pd {rn-sae}, %zmm4, %zmm1, %zmm0
	/* Special-input mask: a lane is flagged when its masked high dword
	   (ymm7) is greater than _iExpMask.  edx receives one bit per
	   lane.  */
	vpcmpgtd _iExpMask+__svml_dtanh_data_internal(%rip), %ymm7, %ymm6
	vmovmskps %ymm6, %edx
	vmovups	_dP14+__svml_dtanh_data_internal(%rip), %zmm6
	/* + P15 */
	vfmadd213pd {rn-sae}, %zmm5, %zmm1, %zmm0
	vmovups	_dP13+__svml_dtanh_data_internal(%rip), %zmm7
	vpermt2pd _dP14+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm6
	vpermt2pd _dP13+64+__svml_dtanh_data_internal(%rip), %zmm2, %zmm7
	/* + P14 */
	vfmadd213pd {rn-sae}, %zmm6, %zmm1, %zmm0
	/* All lookups are done, so the index register is reused to reload
	   the spilled P3.  */
	vmovups	256(%rsp), %zmm2
	/* + P13, P12, P11, P10, P9, P8, P7, P6, P5 */
	vfmadd213pd {rn-sae}, %zmm7, %zmm1, %zmm0
	vfmadd213pd {rn-sae}, %zmm8, %zmm1, %zmm0
	vfmadd213pd {rn-sae}, %zmm9, %zmm1, %zmm0
	vfmadd213pd {rn-sae}, %zmm10, %zmm1, %zmm0
	vfmadd213pd {rn-sae}, %zmm11, %zmm1, %zmm0
	vfmadd213pd {rn-sae}, %zmm12, %zmm1, %zmm0
	vfmadd213pd {rn-sae}, %zmm13, %zmm1, %zmm0
	vfmadd213pd {rn-sae}, %zmm14, %zmm1, %zmm0
	vfmadd213pd {rn-sae}, %zmm3, %zmm1, %zmm0
	/* Reload the spilled P0 into zmm3 (P5 has just been consumed).  */
	vmovups	128(%rsp), %zmm3
	/* + P4, then + P3 (zmm2) */
	vfmadd213pd {rn-sae}, %zmm15, %zmm1, %zmm0
	vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
	/* Reload the spilled P2, then add P2 and finally P0.  _dP1 takes no
	   part in the evaluation.  */
	vmovups	192(%rsp), %zmm2
	vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
	vfmadd213pd {rn-sae}, %zmm3, %zmm1, %zmm0
	/* Apply the sign of x saved at 64(%rsp).  */
	vorpd	64(%rsp), %zmm0, %zmm0
	/* Any special lane?  */
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	/* Undo the aligned frame; zmm0 holds the result.  */
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	/* The code below still runs inside the frame, so re-establish the
	   rbp-based CFA for it.  */
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Move the input copy to 64(%rsp), because 0..23(%rsp) is about to
	   be used as a register save area, and park the vector result at
	   128(%rsp) so single lanes can be patched in memory.  */
	vmovups	(%rsp), %zmm1
	vmovups	%zmm0, 128(%rsp)
	vmovups	%zmm1, 64(%rsp)
	# LOE rbx r12 r13 r14 r15 edx zmm0

	/* eax = 0 becomes the initial lane counter.  */
	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	/* Clear the upper vector state before calling scalar code.  */
	vzeroupper
	/* Save r12, r13 and r14, then use them for state that has to
	   survive the scalar call:
	     r12d = lane counter, r13d = special-lane mask, r14 = lane index.
	   Each .cfi_escape describes the corresponding save slot.  */
	movq	%r12, 16(%rsp)
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -304; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd0, 0xfe, 0xff, 0xff, 0x22
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -312; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc8, 0xfe, 0xff, 0xff, 0x22
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -320; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xfe, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	/* CF = bit number r12d of the special-lane mask.  */
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	/* Next lane; the loop covers lanes 0..7.  */
	incl	%r12d
	cmpl	$8, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All lanes visited: restore the saved registers and reload the
	   patched result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	128(%rsp), %zmm0

	/* Go to exit */
	jmp	L(EXIT)
	/* The scalar-call code below runs with r12-r14 still saved on the
	   stack, so repeat their CFI descriptions for it.  */
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -304; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd0, 0xfe, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -312; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc8, 0xfe, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -320; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xfe, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	/* r14 = lane index, zero-extended by the 32-bit move.  Load that
	   lane of the saved input, call the scalar tanh and overwrite the
	   same lane of the result vector at 128(%rsp).  */
	movl	%r12d, %r14d
	vmovsd	64(%rsp, %r14, 8), %xmm0
	call	tanh@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovsd	%xmm0, 128(%rsp, %r14, 8)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN8v_tanh_skx)

	/* Read-only lookup table for the function above.  It is 64-byte
	   aligned and its layout must match the _dC .. _iMaxIdxMask offsets
	   defined at the top of the file.  */
	.section .rodata, "a"
	.align	64

/* C-style description of the table layout.  It is only visible to the
   preprocessor when __svml_dtanh_data_internal_typedef is defined
   (presumably kept as documentation -- confirm before relying on it).
   Every double is described as a pair of 32-bit words.  */
#ifdef __svml_dtanh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 _dC[16][2];
	__declspec(align(64)) VUINT32 _dP0[16][2];
	__declspec(align(64)) VUINT32 _dP1[16][2];
	__declspec(align(64)) VUINT32 _dP2[16][2];
	__declspec(align(64)) VUINT32 _dP3[16][2];
	__declspec(align(64)) VUINT32 _dP4[16][2];
	__declspec(align(64)) VUINT32 _dP5[16][2];
	__declspec(align(64)) VUINT32 _dP6[16][2];
	__declspec(align(64)) VUINT32 _dP7[16][2];
	__declspec(align(64)) VUINT32 _dP8[16][2];
	__declspec(align(64)) VUINT32 _dP9[16][2];
	__declspec(align(64)) VUINT32 _dP10[16][2];
	__declspec(align(64)) VUINT32 _dP11[16][2];
	__declspec(align(64)) VUINT32 _dP12[16][2];
	__declspec(align(64)) VUINT32 _dP13[16][2];
	__declspec(align(64)) VUINT32 _dP14[16][2];
	__declspec(align(64)) VUINT32 _dP15[16][2];
	__declspec(align(64)) VUINT32 _dP16[16][2];
	__declspec(align(64)) VUINT32 _dP17[16][2];
	__declspec(align(64)) VUINT32 _iExpMantMask_UISA[16][1];
	__declspec(align(64)) VUINT32 _iMinIdxOfsMask_UISA[16][1];
	__declspec(align(64)) VUINT32 _iMaxIdxMask_UISA[16][1];
	__declspec(align(64)) VUINT32 _dbSignMask[8][2];
	__declspec(align(64)) VUINT32 _dbAbsMask[8][2];
	__declspec(align(64)) VUINT32 _iExpMantMask[16][1];
	__declspec(align(64)) VUINT32 _iExpMask[16][1];
	__declspec(align(64)) VUINT32 _iMinIdxOfsMask[16][1];
	__declspec(align(64)) VUINT32 _iMaxIdxMask[16][1];
} __svml_dtanh_data_internal;
#endif
__svml_dtanh_data_internal:
	/* _dC: value subtracted from |x| to form y, one entry per table
	   index 0..15.  The first and last entries are zero.  */
	.quad	0x0000000000000000, 0x3fcc000000000000, 0x3fd4000000000000, 0x3fdc000000000000
	.quad	0x3fe4000000000000, 0x3fec000000000000, 0x3ff4000000000000, 0x3ffc000000000000
	.quad	0x4004000000000000, 0x400c000000000000, 0x4014000000000000, 0x401c000000000000
	.quad	0x4024000000000000, 0x402c000000000000, 0x4034000000000000, 0x0000000000000000
	/* _dP0: constant term of the polynomial.  The last entry is 1.0 and
	   every _dP2.._dP17 entry at that index is zero, so index 15 always
	   evaluates to 1.0.  */
	.align	64
	.quad	0x0000000000000000, 0x3fcb8fd0416a7c92, 0x3fd35f98a0ea650e, 0x3fda5729ee488037
	.quad	0x3fe1bf47eabb8f95, 0x3fe686650b8c2015, 0x3feb2523bb6b2dee, 0x3fee1fbf97e33527
	.quad	0x3fef9258260a71c2, 0x3feff112c63a9077, 0x3fefff419668df11, 0x3feffffc832750f2
	.quad	0x3feffffffdc96f35, 0x3fefffffffffcf58, 0x3ff0000000000000, 0x3ff0000000000000
	/* _dP1: not loaded by the function in this file.
	   NOTE(review): presumably the low-order part of _dP0 (the "PLj" of
	   the algorithm description) -- confirm.  */
	.align	64
	.quad	0x0000000000000000, 0x3c65e23ebcd3bcbe, 0xbc4c600bac3adf00, 0x3c6c44091785d040
	.quad	0x3c8221d7a6e3674b, 0x3c69f89d2cf6b85c, 0x3c73b3e9ec0b8f1c, 0xbc7f8d4b0428aada
	.quad	0xbc7c52d880cf43c0, 0x3c7dd36e37096480, 0x3c7b4f6380c442ca, 0xbc729755de470096
	.quad	0x3c84cf852845efbd, 0x3c6fc4fb440a5378, 0xbc63981083b55870, 0x0000000000000000
	/* _dP2: coefficient of y.  In general _dPk (k = 2..17) is the
	   coefficient of y^(k-1) in the Horner evaluation.  */
	.align	64
	.quad	0x3ff0000000000000, 0x3fee842ca3f08532, 0x3fed11574af58f1b, 0x3fea945b9c24e4f9
	.quad	0x3fe6284c3374f815, 0x3fe02500a09f8d6e, 0x3fd1f25131e3a8c0, 0x3fbd22ca1c24a139
	.quad	0x3f9b3afe1fba5c76, 0x3f6dd37d19b22b21, 0x3f27ccec13a9ef96, 0x3ecbe6c3f33250ae
	.quad	0x3e41b4865394f75f, 0x3d8853f01bda5f28, 0x3c73953c0197ef58, 0x0000000000000000
	/* _dP3 */
	.align	64
	.quad	0xbbf0b3ea3fdfaa19, 0xbfca48aaeb53bc21, 0xbfd19921f4329916, 0xbfd5e0f09bef8011
	.quad	0xbfd893b59c35c882, 0xbfd6ba7cb7576538, 0xbfce7291743d7555, 0xbfbb6d85a01efb80
	.quad	0xbf9addae58c7141a, 0xbf6dc59376c7aa19, 0xbf27cc5e74677410, 0xbecbe6c0e8b4cc87
	.quad	0xbe41b486526b0565, 0xbd8853f01bef63a4, 0xbc73955be519be31, 0x0000000000000000
	/* _dP4 */
	.align	64
	.quad	0xbfd5555555555555, 0xbfd183afc292ba11, 0xbfcc1a4b039c9bfa, 0xbfc16e1e6d8d0be6
	.quad	0xbf92426c751e48a2, 0x3fb4f152b2bad124, 0x3fbbba40cbef72be, 0x3fb01ba038be6a3d
	.quad	0x3f916df44871efc8, 0x3f63c6869dfc8870, 0x3f1fb9aef915d828, 0x3ec299d1e27c6e11
	.quad	0x3e379b5ddcca334c, 0x3d8037f57bc62c9a, 0x3c6a2d4b50a2cff7, 0x0000000000000000
	/* _dP5 */
	.align	64
	.quad	0xbce6863ee44ed636, 0x3fc04dcd0476c75e, 0x3fc43d3449a80f08, 0x3fc5c26f3699b7e7
	.quad	0x3fc1a686f6ab2533, 0x3faf203c316ce730, 0xbf89c7a02788557c, 0xbf98157e26e0d541
	.quad	0xbf807b55c1c7d278, 0xbf53a18d5843190f, 0xbf0fb6bbc89b1a5b, 0xbeb299c9c684a963
	.quad	0xbe279b5dd4fb3d01, 0xbd7037f57ae72aa6, 0xbc5a2ca2bba78e86, 0x0000000000000000
	/* _dP6 */
	.align	64
	.quad	0x3fc1111111112ab5, 0x3fb5c19efdfc08ad, 0x3fa74c98dc34fbac, 0xbf790d6a8eff0a77
	.quad	0xbfac3c021789a786, 0xbfae2196b7326859, 0xbf93a7a011ff8c2a, 0x3f6e4709c7e8430e
	.quad	0x3f67682afa611151, 0x3f3ef2ee77717cbf, 0x3ef95a4482f180b7, 0x3e9dc2c27da3b603
	.quad	0x3e12e2afd9f7433e, 0x3d59f320348679ba, 0x3c44b61d9bbcc940, 0x0000000000000000
	/* _dP7 */
	.align	64
	.quad	0xbda1ea19ddddb3b4, 0xbfb0b8df995ce4df, 0xbfb2955cf41e8164, 0xbfaf9d05c309f7c6
	.quad	0xbf987d27ccff4291, 0x3f8b2ca62572b098, 0x3f8f1cf6c7f5b00a, 0x3f60379811e43dd5
	.quad	0xbf4793826f78537e, 0xbf2405695e36240f, 0xbee0e08de39ce756, 0xbe83d709ba5f714e
	.quad	0xbdf92e3fc5ee63e0, 0xbd414cc030f2110e, 0xbc2ba022e8d82a87, 0x0000000000000000
	/* _dP8 */
	.align	64
	.quad	0xbfaba1ba1990520b, 0xbf96e37bba52f6fc, 0x3ecff7df18455399, 0x3f97362834d33a4e
	.quad	0x3f9e7f8380184b45, 0x3f869543e7c420d4, 0xbf7326bd4914222a, 0xbf5fc15b0a9d98fa
	.quad	0x3f14cffcfa69fbb6, 0x3f057e48e5b79d10, 0x3ec33b66d7d77264, 0x3e66ac4e578b9b10
	.quad	0x3ddcc74b8d3d5c42, 0x3d23c589137f92b4, 0x3c107f8e2c8707a1, 0x0000000000000000
	/* _dP9 */
	.align	64
	.quad	0xbe351ca7f096011f, 0x3f9eaaf3320c3851, 0x3f9cf823fe761fc1, 0x3f9022271754ff1f
	.quad	0xbf731fe77c9c60af, 0xbf84a6046865ec7d, 0xbf4ca3f1f2b9192b, 0x3f4c77dee0afd227
	.quad	0x3f04055bce68597a, 0xbee2bf0cb4a71647, 0xbea31eaafe73efd5, 0xbe46abb02c4368ed
	.quad	0xbdbcc749ca8079dd, 0xbd03c5883836b9d2, 0xbbf07a5416264aec, 0x0000000000000000
	/* _dP10 */
	.align	64
	.quad	0x3f9664f94e6ac14e, 0xbf94d3343bae39dd, 0xbf7bc748e60df843, 0xbf8c89372b43ba85
	.quad	0xbf8129a092de747a, 0x3f60c85b4d538746, 0x3f5be9392199ec18, 0xbf2a0c68a4489f10
	.quad	0xbf00462601dc2faa, 0x3eb7b6a219dea9f4, 0x3e80cbcc8d4c5c8a, 0x3e2425bb231a5e29
	.quad	0x3d9992a4beac8662, 0x3ce191ba5ed3fb67, 0x3bc892450bad44c4, 0x0000000000000000
	/* _dP11 */
	.align	64
	.quad	0xbea8c4c1fd7852fe, 0xbfccce16b1046f13, 0xbf81a16f224bb7b6, 0xbf62cbf00406bc09
	.quad	0x3f75b29bb02cf69b, 0x3f607df0f9f90c17, 0xbf4b852a6e0758d5, 0xbf0078c63d1b8445
	.quad	0x3eec12eadd55be7a, 0xbe6fa600f593181b, 0xbe5a3c935dce3f7d, 0xbe001c6d95e3ae96
	.quad	0xbd74755a00ea1fd3, 0xbcbc1c6c063bb7ac, 0xbba3be9a4460fe00, 0x0000000000000000
	/* _dP12 */
	.align	64
	.quad	0xbf822404577aa9dd, 0x403d8b07f7a82aa3, 0xbf9f44ab92fbab0a, 0x3fb2eac604473d6a
	.quad	0x3f45f87d903aaac8, 0xbf5e104671036300, 0x3f19bc98ddf0f340, 0x3f0d4304bc9246e8
	.quad	0xbed13c415f7b9d41, 0xbe722b8d9720cdb0, 0x3e322666d739bec0, 0x3dd76a553d7e7918
	.quad	0x3d4de0fa59416a39, 0x3c948716cf3681b4, 0x3b873f9f2d2fda99, 0x0000000000000000
	/* _dP13 */
	.align	64
	.quad	0xbefdd99a221ed573, 0x4070593a3735bab4, 0xbfccab654e44835e, 0x3fd13ed80037dbac
	.quad	0xbf6045b9076cc487, 0x3f2085ee7e8ac170, 0x3f23524622610430, 0xbeff12a6626911b4
	.quad	0x3eab9008bca408af, 0x3e634df71865f620, 0xbe05bb1bcf83ca73, 0xbdaf2ac143fb6762
	.quad	0xbd23eae52a3dbf57, 0xbc6b5e3e9ca0955e, 0xbb5eca68e2c1ba2e, 0x0000000000000000
	/* _dP14 */
	.align	64
	.quad	0x3f6e3be689423841, 0xc0d263511f5baac1, 0x40169f73b15ebe5c, 0xc025c1dd41cd6cb5
	.quad	0xbf58fd89fe05e0d1, 0x3f73f7af01d5af7a, 0xbf1e40bdead17e6b, 0x3ee224cd6c4513e5
	.quad	0xbe24b645e68eeaa3, 0xbe4abfebfb72bc83, 0x3dd51c38f8695ed3, 0x3d8313ac38c6832b
	.quad	0x3cf7787935626685, 0x3c401ffc49c6bc29, 0xbabf0b21acfa52ab, 0x0000000000000000
	/* _dP15 */
	.align	64
	.quad	0xbf2a1306713a4f3a, 0xc1045e509116b066, 0x4041fab9250984ce, 0xc0458d090ec3de95
	.quad	0xbf74949d60113d63, 0x3f7c9fd6200d0ade, 0x3f02cd40e0ad0a9f, 0xbe858ab8e019f311
	.quad	0xbe792fa6323b7cf8, 0x3e2df04d67876402, 0xbd95c72be95e4d2c, 0xbd55a89c30203106
	.quad	0xbccad6b3bb9eff65, 0xbc12705ccd3dd884, 0xba8e0a4c47ae75f5, 0x0000000000000000
	/* _dP16 */
	.align	64
	.quad	0xbf55d7e76dc56871, 0x41528c38809c90c7, 0xc076d57fb5190b02, 0x4085f09f888f8ada
	.quad	0x3fa246332a2fcba5, 0xbfb29d851a896fcd, 0x3ed9065ae369b212, 0xbeb8e1ba4c98a030
	.quad	0x3e6ffd0766ad4016, 0xbe0c63c29f505f5b, 0xbd7fab216b9e0e49, 0x3d2826b62056aa27
	.quad	0x3ca313e31762f523, 0x3bea37aa21895319, 0x3ae5c7f1fd871496, 0x0000000000000000
	/* _dP17: highest-order coefficient; the Horner evaluation starts
	   from it.  */
	.align	64
	.quad	0x3f35e67ab76a26e7, 0x41848ee0627d8206, 0xc0a216d618b489ec, 0x40a5b89107c8af4f
	.quad	0x3fb69d8374520eda, 0xbfbded519f981716, 0xbef02d288b5b3371, 0x3eb290981209c1a6
	.quad	0xbe567e924bf5ff6e, 0x3de3f7f7de6b0eb6, 0x3d69ed18bae3ebbc, 0xbcf7534c4f3dfa71
	.quad	0xbc730b73f1eaff20, 0xbbba2cff8135d462, 0xbab5a71b5f7d9035, 0x0000000000000000
	/* Masks and constants, each replicated across a 64-byte vector.  The
	   three *_UISA entries are applied to the high dword of each input:
	   AND with _iExpMantMask_UISA, subtract _iMinIdxOfsMask_UISA, clamp
	   to [0, _iMaxIdxMask_UISA], then shift right by 19, which gives a
	   table index in 0..15.  */
	.align	64
	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000 /* _iExpMantMask_UISA */
	.align	64
	.long	0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000, 0x3fc00000 /* _iMinIdxOfsMask_UISA */
	.align	64
	.long	0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000, 0x00780000 /* _iMaxIdxMask_UISA */
	/* Sign bit and absolute-value masks for doubles.  */
	.align	64
	.quad	0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000 /* _dbSignMask */
	.align	64
	.quad	0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff /* _dbAbsMask */
	/* Of the four entries below only _iExpMask is read by the function
	   in this file: a lane whose masked high dword is greater than it is
	   sent to the scalar fallback.  */
	.align	64
	.long	0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000, 0x7ffe0000 /* _iExpMantMask */
	.align	64
	.long	0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000 /* _iExpMask */
	.align	64
	.long	0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000, 0x3fbe0000 /* _iMinIdxOfsMask */
	.align	64
	.long	0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000, 0x00760000 /* _iMaxIdxMask */
	.align	64
	/* Symbol type and size; the size covers everything from the label to
	   this point, including the trailing alignment padding.  */
	.type	__svml_dtanh_data_internal, @object
	.size	__svml_dtanh_data_internal, .-__svml_dtanh_data_internal