about summary refs log tree commit diff
path: root/sysdeps/x86_64/fpu
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/x86_64/fpu')
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S 257
1 files changed, 128 insertions, 129 deletions
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
index 4285a4ba42..62d96d13ea 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
@@ -30,145 +30,144 @@
 
 /* Offsets for data table __svml_satan_data_internal_avx512
  */
-#define AbsMask                       	0
-#define Shifter                       	64
-#define MaxThreshold                  	128
-#define MOne                          	192
-#define One                           	256
-#define LargeX                        	320
-#define Zero                          	384
-#define Tbl_H                         	448
-#define Pi2                           	576
-#define coeff_1                       	640
-#define coeff_2                       	704
-#define coeff_3                       	768
+#define AbsMask				0
+#define Shifter				64
+#define MaxThreshold			128
+#define MOne				192
+#define One				256
+#define LargeX				320
+#define Zero				384
+#define Tbl_H				448
+#define Pi2				576
+#define coeff_1				640
+#define coeff_2				704
+#define coeff_3				768
 
 #include <sysdep.h>
 
-        .text
-	.section .text.exex512,"ax",@progbits
+	.section .text.exex512, "ax", @progbits
 ENTRY(_ZGVeN16v_atanf_skx)
-        vandps    __svml_satan_data_internal_avx512(%rip), %zmm0, %zmm7
-        vmovups   MaxThreshold+__svml_satan_data_internal_avx512(%rip), %zmm3
-        vmovups   One+__svml_satan_data_internal_avx512(%rip), %zmm8
-
-/* round to 2 bits after binary point */
-        vreduceps $40, {sae}, %zmm7, %zmm5
-
-/* saturate X range */
-        vmovups   LargeX+__svml_satan_data_internal_avx512(%rip), %zmm6
-        vmovups   Shifter+__svml_satan_data_internal_avx512(%rip), %zmm2
-        vcmpps    $29, {sae}, %zmm3, %zmm7, %k1
-
-/* table lookup sequence */
-        vmovups   Tbl_H+__svml_satan_data_internal_avx512(%rip), %zmm3
-        vsubps    {rn-sae}, %zmm5, %zmm7, %zmm4
-        vaddps    {rn-sae}, %zmm2, %zmm7, %zmm1
-        vxorps    %zmm0, %zmm7, %zmm0
-        vfmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8
-        vmovups   coeff_2+__svml_satan_data_internal_avx512(%rip), %zmm4
-
-/* if|X|>=MaxThreshold, set DiffX=-1 */
-        vblendmps MOne+__svml_satan_data_internal_avx512(%rip), %zmm5, %zmm9{%k1}
-        vmovups   coeff_3+__svml_satan_data_internal_avx512(%rip), %zmm5
-
-/* if|X|>=MaxThreshold, set Y=X */
-        vminps    {sae}, %zmm7, %zmm6, %zmm8{%k1}
-
-/* R+Rl = DiffX/Y */
-        vgetmantps $0, {sae}, %zmm9, %zmm12
-        vgetexpps {sae}, %zmm9, %zmm10
-        vpermt2ps Tbl_H+64+__svml_satan_data_internal_avx512(%rip), %zmm1, %zmm3
-        vgetmantps $0, {sae}, %zmm8, %zmm15
-        vgetexpps {sae}, %zmm8, %zmm11
-        vmovups   coeff_1+__svml_satan_data_internal_avx512(%rip), %zmm1
-
-/* set table value to Pi/2 for large X */
-        vblendmps Pi2+__svml_satan_data_internal_avx512(%rip), %zmm3, %zmm9{%k1}
-        vrcp14ps  %zmm15, %zmm13
-        vsubps    {rn-sae}, %zmm11, %zmm10, %zmm2
-        vmulps    {rn-sae}, %zmm13, %zmm12, %zmm14
-        vfnmadd213ps {rn-sae}, %zmm12, %zmm14, %zmm15
-        vfmadd213ps {rn-sae}, %zmm14, %zmm13, %zmm15
-        vscalefps {rn-sae}, %zmm2, %zmm15, %zmm7
-
-/* polynomial evaluation */
-        vmulps    {rn-sae}, %zmm7, %zmm7, %zmm8
-        vmulps    {rn-sae}, %zmm7, %zmm8, %zmm6
-        vfmadd231ps {rn-sae}, %zmm8, %zmm1, %zmm4
-        vfmadd213ps {rn-sae}, %zmm5, %zmm4, %zmm8
-        vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm8
-        vaddps    {rn-sae}, %zmm9, %zmm8, %zmm10
-        vxorps    %zmm0, %zmm10, %zmm0
-        ret
+	vandps	__svml_satan_data_internal_avx512(%rip), %zmm0, %zmm7	/* zmm7 = |x| (AbsMask sits at table offset 0) */
+	vmovups	MaxThreshold+__svml_satan_data_internal_avx512(%rip), %zmm3
+	vmovups	One+__svml_satan_data_internal_avx512(%rip), %zmm8
+
+	/* round to 2 bits after binary point; zmm5 = reduced remainder of |x| */
+	vreduceps $40, {sae}, %zmm7, %zmm5
+
+	/* saturate X range */
+	vmovups	LargeX+__svml_satan_data_internal_avx512(%rip), %zmm6
+	vmovups	Shifter+__svml_satan_data_internal_avx512(%rip), %zmm2
+	vcmpps	$29, {sae}, %zmm3, %zmm7, %k1	/* k1 = (|x| >= MaxThreshold), predicate 29 = GE_OQ */
+
+	/* table lookup sequence */
+	vmovups	Tbl_H+__svml_satan_data_internal_avx512(%rip), %zmm3	/* first 16 entries of Tbl_H */
+	vsubps	{rn-sae}, %zmm5, %zmm7, %zmm4	/* zmm4 = |x| - remainder, i.e. the rounded |x| */
+	vaddps	{rn-sae}, %zmm2, %zmm7, %zmm1	/* zmm1 = |x| + Shifter; later used as the vpermt2ps index */
+	vxorps	%zmm0, %zmm7, %zmm0	/* zmm0 = x ^ |x| = sign bit of x, reapplied at the end */
+	vfmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8	/* Y = 1 + |x| * rounded |x| */
+	vmovups	coeff_2+__svml_satan_data_internal_avx512(%rip), %zmm4
+
+	/* if |X| >= MaxThreshold, set DiffX = -1; otherwise DiffX = remainder (zmm5) */
+	vblendmps MOne+__svml_satan_data_internal_avx512(%rip), %zmm5, %zmm9{%k1}
+	vmovups	coeff_3+__svml_satan_data_internal_avx512(%rip), %zmm5
+
+	/* if |X| >= MaxThreshold, set Y = min(|X|, LargeX); other lanes keep Y (merge masking) */
+	vminps	{sae}, %zmm7, %zmm6, %zmm8{%k1}
+
+	/* R+Rl = DiffX/Y, computed from separate mantissa and exponent parts */
+	vgetmantps $0, {sae}, %zmm9, %zmm12	/* zmm12 = mantissa of DiffX */
+	vgetexpps {sae}, %zmm9, %zmm10	/* zmm10 = exponent of DiffX */
+	vpermt2ps Tbl_H+64+__svml_satan_data_internal_avx512(%rip), %zmm1, %zmm3	/* zmm3 = Tbl_H entry selected by zmm1 (second 16 entries from memory) */
+	vgetmantps $0, {sae}, %zmm8, %zmm15	/* zmm15 = mantissa of Y */
+	vgetexpps {sae}, %zmm8, %zmm11	/* zmm11 = exponent of Y */
+	vmovups	coeff_1+__svml_satan_data_internal_avx512(%rip), %zmm1
+
+	/* set table value to Pi/2 for large X (lanes selected by k1) */
+	vblendmps Pi2+__svml_satan_data_internal_avx512(%rip), %zmm3, %zmm9{%k1}
+	vrcp14ps %zmm15, %zmm13	/* zmm13 ~= 1 / mantissa(Y) */
+	vsubps	{rn-sae}, %zmm11, %zmm10, %zmm2	/* zmm2 = exponent(DiffX) - exponent(Y) */
+	vmulps	{rn-sae}, %zmm13, %zmm12, %zmm14	/* zmm14 = first quotient estimate q */
+	vfnmadd213ps {rn-sae}, %zmm12, %zmm14, %zmm15	/* zmm15 = mantissa(DiffX) - q * mantissa(Y) (residual) */
+	vfmadd213ps {rn-sae}, %zmm14, %zmm13, %zmm15	/* zmm15 = q + residual * reciprocal (refined quotient) */
+	vscalefps {rn-sae}, %zmm2, %zmm15, %zmm7	/* R = refined quotient scaled by 2^(exponent difference) */
+
+	/* polynomial evaluation: R + R^3 * ((coeff_1 * R^2 + coeff_2) * R^2 + coeff_3) */
+	vmulps	{rn-sae}, %zmm7, %zmm7, %zmm8	/* zmm8 = R^2 */
+	vmulps	{rn-sae}, %zmm7, %zmm8, %zmm6	/* zmm6 = R^3 */
+	vfmadd231ps {rn-sae}, %zmm8, %zmm1, %zmm4	/* zmm4 = coeff_1 * R^2 + coeff_2 */
+	vfmadd213ps {rn-sae}, %zmm5, %zmm4, %zmm8	/* zmm8 = zmm4 * R^2 + coeff_3 */
+	vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm8	/* zmm8 = R^3 * zmm8 + R */
+	vaddps	{rn-sae}, %zmm9, %zmm8, %zmm10	/* add table value (or Pi/2) from zmm9 */
+	vxorps	%zmm0, %zmm10, %zmm0	/* restore the sign of x; result returned in zmm0 */
+	ret
 
 END(_ZGVeN16v_atanf_skx)
 
-        .section .rodata, "a"
-        .align 64
+	.section .rodata, "a"
+	.align	64
 
 #ifdef __svml_satan_data_internal_avx512_typedef
 typedef unsigned int VUINT32;
 typedef struct {
-        __declspec(align(64)) VUINT32 AbsMask[16][1];
-        __declspec(align(64)) VUINT32 Shifter[16][1];
-        __declspec(align(64)) VUINT32 MaxThreshold[16][1];
-        __declspec(align(64)) VUINT32 MOne[16][1];
-        __declspec(align(64)) VUINT32 One[16][1];
-        __declspec(align(64)) VUINT32 LargeX[16][1];
-        __declspec(align(64)) VUINT32 Zero[16][1];
-        __declspec(align(64)) VUINT32 Tbl_H[32][1];
-        __declspec(align(64)) VUINT32 Pi2[16][1];
-        __declspec(align(64)) VUINT32 coeff[3][16][1];
-    } __svml_satan_data_internal_avx512;
+	__declspec(align(64)) VUINT32 AbsMask[16][1];
+	__declspec(align(64)) VUINT32 Shifter[16][1];
+	__declspec(align(64)) VUINT32 MaxThreshold[16][1];
+	__declspec(align(64)) VUINT32 MOne[16][1];
+	__declspec(align(64)) VUINT32 One[16][1];
+	__declspec(align(64)) VUINT32 LargeX[16][1];
+	__declspec(align(64)) VUINT32 Zero[16][1];
+	__declspec(align(64)) VUINT32 Tbl_H[32][1];
+	__declspec(align(64)) VUINT32 Pi2[16][1];
+	__declspec(align(64)) VUINT32 coeff[3][16][1];
+} __svml_satan_data_internal_avx512;
 #endif
 __svml_satan_data_internal_avx512:
-        /*== AbsMask ==*/
-        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-        /*== Shifter ==*/
-        .align 64
-        .long 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000
-        /*== MaxThreshold ==*/
-        .align 64
-        .long 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000
-        /*== MOne ==*/
-        .align 64
-        .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
-        /*== One ==*/
-        .align 64
-        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-        /*== LargeX ==*/
-        .align 64
-        .long 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000
-        /*== Zero ==*/
-        .align 64
-        .long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
-        /*== Tbl_H ==*/
-        .align 64
-        .long 0x00000000, 0x3e7adbb0
-        .long 0x3eed6338, 0x3f24bc7d
-        .long 0x3f490fdb, 0x3f6563e3
-        .long 0x3f7b985f, 0x3f869c79
-        .long 0x3f8db70d, 0x3f93877b
-        .long 0x3f985b6c, 0x3f9c6b53
-        .long 0x3f9fe0bb, 0x3fa2daa4
-        .long 0x3fa57088, 0x3fa7b46f
-        .long 0x3fa9b465, 0x3fab7b7a
-        .long 0x3fad1283, 0x3fae809e
-        .long 0x3fafcb99, 0x3fb0f836
-        .long 0x3fb20a6a, 0x3fb30581
-        .long 0x3fb3ec43, 0x3fb4c10a
-        .long 0x3fb585d7, 0x3fb63c64
-        .long 0x3fb6e62c, 0x3fb78478
-        .long 0x3fb81868, 0x3fb8a2f5
-        /*== Pi2 ==*/
-        .align 64
-        .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
-        /*== coeff3 ==*/
-        .align 64
-        .long 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de
-        .long 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2
-        .long 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa
-        .align 64
-        .type	__svml_satan_data_internal_avx512,@object
-        .size	__svml_satan_data_internal_avx512,.-__svml_satan_data_internal_avx512
+	/* AbsMask */
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	/* Shifter */
+	.align	64
+	.long	0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000
+	/* MaxThreshold */
+	.align	64
+	.long	0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000
+	/* MOne */
+	.align	64
+	.long	0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
+	/* One */
+	.align	64
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	/* LargeX */
+	.align	64
+	.long	0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000
+	/* Zero */
+	.align	64
+	.long	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
+	/* Tbl_H */
+	.align	64
+	.long	0x00000000, 0x3e7adbb0
+	.long	0x3eed6338, 0x3f24bc7d
+	.long	0x3f490fdb, 0x3f6563e3
+	.long	0x3f7b985f, 0x3f869c79
+	.long	0x3f8db70d, 0x3f93877b
+	.long	0x3f985b6c, 0x3f9c6b53
+	.long	0x3f9fe0bb, 0x3fa2daa4
+	.long	0x3fa57088, 0x3fa7b46f
+	.long	0x3fa9b465, 0x3fab7b7a
+	.long	0x3fad1283, 0x3fae809e
+	.long	0x3fafcb99, 0x3fb0f836
+	.long	0x3fb20a6a, 0x3fb30581
+	.long	0x3fb3ec43, 0x3fb4c10a
+	.long	0x3fb585d7, 0x3fb63c64
+	.long	0x3fb6e62c, 0x3fb78478
+	.long	0x3fb81868, 0x3fb8a2f5
+	/* Pi2 */
+	.align	64
+	.long	0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
+	/* coeff[3]: coeff_1, coeff_2, coeff_3 (one 64-byte row each) */
+	.align	64
+	.long	0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de
+	.long	0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2
+	.long	0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa
+	.align	64
+	.type	__svml_satan_data_internal_avx512, @object
+	.size	__svml_satan_data_internal_avx512, .-__svml_satan_data_internal_avx512