about summary refs log tree commit diff
path: root/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S')
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S164
1 files changed, 164 insertions, 0 deletions
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S
new file mode 100644
index 0000000000..c58a894e10
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S
@@ -0,0 +1,164 @@
+/* Function atanf vectorized with SSE4.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   https://www.gnu.org/licenses/.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      For    0.0    <= x <=  7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
+ *      For  7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
+ *      For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
+ *      For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
+ *      For 39.0/16.0 <= x <=    inf   : atan(x) = atan(inf) + atan(s), where s=-1.0/x
+ *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
+ *
+ */
+
+/* Byte offsets of each 16-byte row inside __svml_satan_data_internal;
+   they must stay in step with the row order of the table itself.  */
+#define _sSIGN_MASK                   	0
+#define _sABS_MASK                    	16
+#define _sONE                         	32
+#define _sPIO2                        	48
+#define _sPC8                         	64
+#define _sPC7                         	80
+#define _sPC6                         	96
+#define _sPC5                         	112
+#define _sPC4                         	128
+#define _sPC3                         	144
+#define _sPC2                         	160
+#define _sPC1                         	176
+#define _sPC0                         	192
+
+#include <sysdep.h>
+
+        .text
+	.section .text.sse4,"ax",@progbits
+ENTRY(_ZGVbN4v_atanf_sse4)
+/*
+ * Computes atanf on the four packed single-precision lanes of %xmm0 and
+ * returns the four results in %xmm0.  Straight-line code: no branches,
+ * no calls, no stack use; constants are read RIP-relative from
+ * __svml_satan_data_internal.
+ * Scratch registers written: %xmm1-%xmm10.
+ */
+        movups    _sABS_MASK+__svml_satan_data_internal(%rip), %xmm2   /* xmm2 = abs mask */
+
+/* Argument reduction (final result is PIO2 + r*P(r^2)):
+ * 1) If x>1,      then r=-1/x, PIO2=Pi/2
+ * 2) If -1<=x<=1, then r=x,    PIO2=0
+ * 3) If x<-1,     then r=-1/x, PIO2=-Pi/2
+ */
+        movups    _sONE+__svml_satan_data_internal(%rip), %xmm1   /* xmm1 = 1.0 */
+        andps     %xmm0, %xmm2   /* xmm2 = |x| */
+        movaps    %xmm2, %xmm9   /* xmm9 = |x| (turned into a mask next) */
+        movaps    %xmm1, %xmm3   /* xmm3 = 1.0 */
+        cmpleps   %xmm1, %xmm9   /* xmm9 = all-ones where |x| <= 1.0, else zero */
+        maxps     %xmm2, %xmm3   /* xmm3 = max(1.0, |x|): the divisor */
+        minps     %xmm2, %xmm1   /* xmm1 = min(1.0, |x|): the dividend */
+        divps     %xmm3, %xmm1   /* xmm1 = |r|: |x| when |x|<=1, otherwise 1/|x| */
+        movups    __svml_satan_data_internal(%rip), %xmm4   /* xmm4 = sign mask (offset _sSIGN_MASK = 0) */
+        movaps    %xmm9, %xmm10   /* xmm10 = copy of the |x|<=1 mask, used later to select PIO2 */
+        andps     %xmm4, %xmm0   /* xmm0 = sign bit of x only */
+        andnps    %xmm4, %xmm9   /* xmm9 = sign bit set where |x| > 1 (the minus of -1/x) */
+        pxor      %xmm0, %xmm9   /* xmm9 = sign(x) XOR (|x| > 1) */
+        pxor      %xmm1, %xmm9   /* xmm9 = r with its final sign */
+
+/* Polynomial P(t), t = r^2, degree 8 in t: two interleaved Horner chains in r^4. */
+        movaps    %xmm9, %xmm8   /* xmm8 = r */
+        mulps     %xmm9, %xmm8   /* xmm8 = r^2 */
+        movaps    %xmm8, %xmm7   /* xmm7 = r^2 */
+        mulps     %xmm8, %xmm7   /* xmm7 = r^4 */
+        movups    _sPC8+__svml_satan_data_internal(%rip), %xmm6   /* chain A (even coefficients) starts at PC8 */
+        mulps     %xmm7, %xmm6   /* A = PC8*r^4 */
+        movups    _sPC7+__svml_satan_data_internal(%rip), %xmm5   /* chain B (odd coefficients) starts at PC7 */
+        mulps     %xmm7, %xmm5   /* B = PC7*r^4 */
+        addps     _sPC6+__svml_satan_data_internal(%rip), %xmm6   /* A += PC6 */
+        mulps     %xmm7, %xmm6   /* A *= r^4 */
+        addps     _sPC5+__svml_satan_data_internal(%rip), %xmm5   /* B += PC5 */
+        mulps     %xmm7, %xmm5   /* B *= r^4 */
+        addps     _sPC4+__svml_satan_data_internal(%rip), %xmm6   /* A += PC4 */
+        mulps     %xmm7, %xmm6   /* A *= r^4 */
+        addps     _sPC3+__svml_satan_data_internal(%rip), %xmm5   /* B += PC3 */
+        mulps     %xmm5, %xmm7   /* xmm7 = B*r^4 (r^4 itself is no longer needed) */
+        addps     _sPC2+__svml_satan_data_internal(%rip), %xmm6   /* A += PC2 */
+        mulps     %xmm8, %xmm6   /* A *= r^2 */
+        addps     _sPC1+__svml_satan_data_internal(%rip), %xmm7   /* xmm7 = B*r^4 + PC1 */
+        andnps    _sPIO2+__svml_satan_data_internal(%rip), %xmm10   /* xmm10 = Pi/2 where |x| > 1, else 0 */
+        addps     %xmm6, %xmm7   /* xmm7 = merged chains = (P(t) - PC0) / t */
+        mulps     %xmm7, %xmm8   /* xmm8 = P(t) - PC0 */
+        pxor      %xmm0, %xmm10   /* give PIO2 the sign of x */
+        addps     _sPC0+__svml_satan_data_internal(%rip), %xmm8   /* xmm8 = P(t) */
+
+/* Reconstruction. */
+        mulps     %xmm8, %xmm9   /* xmm9 = r * P(r^2) */
+        addps     %xmm9, %xmm10   /* xmm10 = PIO2 + r * P(r^2) */
+        movaps    %xmm10, %xmm0   /* return value in xmm0 */
+        ret
+
+END(_ZGVbN4v_atanf_sse4)
+
+        .section .rodata, "a"
+        .align 16
+/* Constant table: 13 rows, each four identical 32-bit words (one per lane), 16 bytes apart. */
+#ifdef __svml_satan_data_internal_typedef /* layout reference; this macro is not defined in this file */
+typedef unsigned int VUINT32;
+typedef struct {
+        __declspec(align(16)) VUINT32 _sSIGN_MASK[4][1];
+        __declspec(align(16)) VUINT32 _sABS_MASK[4][1];
+        __declspec(align(16)) VUINT32 _sONE[4][1];
+        __declspec(align(16)) VUINT32 _sPIO2[4][1];
+        __declspec(align(16)) VUINT32 _sPC8[4][1];
+        __declspec(align(16)) VUINT32 _sPC7[4][1];
+        __declspec(align(16)) VUINT32 _sPC6[4][1];
+        __declspec(align(16)) VUINT32 _sPC5[4][1];
+        __declspec(align(16)) VUINT32 _sPC4[4][1];
+        __declspec(align(16)) VUINT32 _sPC3[4][1];
+        __declspec(align(16)) VUINT32 _sPC2[4][1];
+        __declspec(align(16)) VUINT32 _sPC1[4][1];
+        __declspec(align(16)) VUINT32 _sPC0[4][1];
+} __svml_satan_data_internal;
+#endif
+__svml_satan_data_internal:
+        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 // _sSIGN_MASK: sign bit of each float
+        .align 16
+        .long 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF // _sABS_MASK: every bit except the sign
+        .align 16
+        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 // _sONE: 1.0f
+        .align 16
+        .long 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB // _sPIO2: Pi/2 rounded to float
+        .align 16
+        .long 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 // _sPC8: coefficient of t^8 in P(t), t = r^2
+        .align 16
+        .long 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 // _sPC7: coefficient of t^7
+        .align 16
+        .long 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 // _sPC6: coefficient of t^6
+        .align 16
+        .long 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 // _sPC5: coefficient of t^5
+        .align 16
+        .long 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 // _sPC4: coefficient of t^4
+        .align 16
+        .long 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 // _sPC3: coefficient of t^3
+        .align 16
+        .long 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F // _sPC2: coefficient of t^2
+        .align 16
+        .long 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 // _sPC1: coefficient of t^1
+        .align 16
+        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 // _sPC0: constant term, 1.0f
+        .align 16
+        .type	__svml_satan_data_internal,@object
+        .size	__svml_satan_data_internal,.-__svml_satan_data_internal