about summary refs log tree commit diff
path: root/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
diff options
context:
space:
mode:
authorSunil K Pandey <skpgkp2@gmail.com>2021-12-29 08:23:33 -0800
committerSunil K Pandey <skpgkp2@gmail.com>2021-12-29 11:36:46 -0800
commit146310177aa9f2c7d990ef856ed6e8bb94407f06 (patch)
tree1f81aa90d5cf28eab3442be8f3889aa5ff6a23d6 /sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
parent5d28a8962dcb6ec056b81d730e3c6fb57185a210 (diff)
downloadglibc-146310177aa9f2c7d990ef856ed6e8bb94407f06.tar.gz
glibc-146310177aa9f2c7d990ef856ed6e8bb94407f06.tar.xz
glibc-146310177aa9f2c7d990ef856ed6e8bb94407f06.zip
x86-64: Add vector atan/atanf implementation to libmvec
Implement vectorized atan/atanf containing SSE, AVX, AVX2 and
AVX512 versions for libmvec as per vector ABI.  It also contains
accuracy and ABI tests for vector atan/atanf with regenerated ulps.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Diffstat (limited to 'sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S')
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S174
1 files changed, 174 insertions, 0 deletions
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
new file mode 100644
index 0000000000..4a37f03e69
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
@@ -0,0 +1,174 @@
+/* Function atanf vectorized with AVX-512.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   https://www.gnu.org/licenses/.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      For    0.0    <= x <=  7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
+ *      For  7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
+ *      For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
+ *      For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
+ *      For 39.0/16.0 <= x <=    inf   : atan(x) = atan(inf) + atan(s), where s=-1.0/x
+ *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/0.16.
+ *
+ */
+
+/* Offsets for data table __svml_satan_data_internal_avx512
+ */
+#define AbsMask                       	0
+#define Shifter                       	64
+#define MaxThreshold                  	128
+#define MOne                          	192
+#define One                           	256
+#define LargeX                        	320
+#define Zero                          	384
+#define Tbl_H                         	448
+#define Pi2                           	576
+#define coeff_1                       	640
+#define coeff_2                       	704
+#define coeff_3                       	768
+
+#include <sysdep.h>
+
+        .text
+	.section .text.exex512,"ax",@progbits
+ENTRY(_ZGVeN16v_atanf_skx)
+        vandps    __svml_satan_data_internal_avx512(%rip), %zmm0, %zmm7
+        vmovups   MaxThreshold+__svml_satan_data_internal_avx512(%rip), %zmm3
+        vmovups   One+__svml_satan_data_internal_avx512(%rip), %zmm8
+
+/* round to 2 bits after binary point */
+        vreduceps $40, {sae}, %zmm7, %zmm5
+
+/* saturate X range */
+        vmovups   LargeX+__svml_satan_data_internal_avx512(%rip), %zmm6
+        vmovups   Shifter+__svml_satan_data_internal_avx512(%rip), %zmm2
+        vcmpps    $29, {sae}, %zmm3, %zmm7, %k1
+
+/* table lookup sequence */
+        vmovups   Tbl_H+__svml_satan_data_internal_avx512(%rip), %zmm3
+        vsubps    {rn-sae}, %zmm5, %zmm7, %zmm4
+        vaddps    {rn-sae}, %zmm2, %zmm7, %zmm1
+        vxorps    %zmm0, %zmm7, %zmm0
+        vfmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8
+        vmovups   coeff_2+__svml_satan_data_internal_avx512(%rip), %zmm4
+
+/* if|X|>=MaxThreshold, set DiffX=-1 */
+        vblendmps MOne+__svml_satan_data_internal_avx512(%rip), %zmm5, %zmm9{%k1}
+        vmovups   coeff_3+__svml_satan_data_internal_avx512(%rip), %zmm5
+
+/* if|X|>=MaxThreshold, set Y=X */
+        vminps    {sae}, %zmm7, %zmm6, %zmm8{%k1}
+
+/* R+Rl = DiffX/Y */
+        vgetmantps $0, {sae}, %zmm9, %zmm12
+        vgetexpps {sae}, %zmm9, %zmm10
+        vpermt2ps Tbl_H+64+__svml_satan_data_internal_avx512(%rip), %zmm1, %zmm3
+        vgetmantps $0, {sae}, %zmm8, %zmm15
+        vgetexpps {sae}, %zmm8, %zmm11
+        vmovups   coeff_1+__svml_satan_data_internal_avx512(%rip), %zmm1
+
+/* set table value to Pi/2 for large X */
+        vblendmps Pi2+__svml_satan_data_internal_avx512(%rip), %zmm3, %zmm9{%k1}
+        vrcp14ps  %zmm15, %zmm13
+        vsubps    {rn-sae}, %zmm11, %zmm10, %zmm2
+        vmulps    {rn-sae}, %zmm13, %zmm12, %zmm14
+        vfnmadd213ps {rn-sae}, %zmm12, %zmm14, %zmm15
+        vfmadd213ps {rn-sae}, %zmm14, %zmm13, %zmm15
+        vscalefps {rn-sae}, %zmm2, %zmm15, %zmm7
+
+/* polynomial evaluation */
+        vmulps    {rn-sae}, %zmm7, %zmm7, %zmm8
+        vmulps    {rn-sae}, %zmm7, %zmm8, %zmm6
+        vfmadd231ps {rn-sae}, %zmm8, %zmm1, %zmm4
+        vfmadd213ps {rn-sae}, %zmm5, %zmm4, %zmm8
+        vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm8
+        vaddps    {rn-sae}, %zmm9, %zmm8, %zmm10
+        vxorps    %zmm0, %zmm10, %zmm0
+        ret
+
+END(_ZGVeN16v_atanf_skx)
+
+        .section .rodata, "a"
+        .align 64
+
+#ifdef __svml_satan_data_internal_avx512_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+        __declspec(align(64)) VUINT32 AbsMask[16][1];
+        __declspec(align(64)) VUINT32 Shifter[16][1];
+        __declspec(align(64)) VUINT32 MaxThreshold[16][1];
+        __declspec(align(64)) VUINT32 MOne[16][1];
+        __declspec(align(64)) VUINT32 One[16][1];
+        __declspec(align(64)) VUINT32 LargeX[16][1];
+        __declspec(align(64)) VUINT32 Zero[16][1];
+        __declspec(align(64)) VUINT32 Tbl_H[32][1];
+        __declspec(align(64)) VUINT32 Pi2[16][1];
+        __declspec(align(64)) VUINT32 coeff[3][16][1];
+    } __svml_satan_data_internal_avx512;
+#endif
+__svml_satan_data_internal_avx512:
+        /*== AbsMask ==*/
+        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+        /*== Shifter ==*/
+        .align 64
+        .long 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000
+        /*== MaxThreshold ==*/
+        .align 64
+        .long 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000
+        /*== MOne ==*/
+        .align 64
+        .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
+        /*== One ==*/
+        .align 64
+        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+        /*== LargeX ==*/
+        .align 64
+        .long 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000
+        /*== Zero ==*/
+        .align 64
+        .long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
+        /*== Tbl_H ==*/
+        .align 64
+        .long 0x00000000, 0x3e7adbb0
+        .long 0x3eed6338, 0x3f24bc7d
+        .long 0x3f490fdb, 0x3f6563e3
+        .long 0x3f7b985f, 0x3f869c79
+        .long 0x3f8db70d, 0x3f93877b
+        .long 0x3f985b6c, 0x3f9c6b53
+        .long 0x3f9fe0bb, 0x3fa2daa4
+        .long 0x3fa57088, 0x3fa7b46f
+        .long 0x3fa9b465, 0x3fab7b7a
+        .long 0x3fad1283, 0x3fae809e
+        .long 0x3fafcb99, 0x3fb0f836
+        .long 0x3fb20a6a, 0x3fb30581
+        .long 0x3fb3ec43, 0x3fb4c10a
+        .long 0x3fb585d7, 0x3fb63c64
+        .long 0x3fb6e62c, 0x3fb78478
+        .long 0x3fb81868, 0x3fb8a2f5
+        /*== Pi2 ==*/
+        .align 64
+        .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
+        /*== coeff3 ==*/
+        .align 64
+        .long 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de
+        .long 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2
+        .long 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa
+        .align 64
+        .type	__svml_satan_data_internal_avx512,@object
+        .size	__svml_satan_data_internal_avx512,.-__svml_satan_data_internal_avx512