diff options
Diffstat (limited to 'sysdeps/x86_64/fpu/multiarch')
130 files changed, 0 insertions, 15286 deletions
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile deleted file mode 100644 index 34542155aa..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/Makefile +++ /dev/null @@ -1,70 +0,0 @@ -ifeq ($(subdir),math) -libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \ - s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c - -libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \ - e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \ - mplog-fma4 mpa-fma4 slowexp-fma4 slowpow-fma4 \ - sincos32-fma4 doasin-fma4 dosincos-fma4 \ - halfulp-fma4 mpexp-fma4 \ - mpatan2-fma4 mpatan-fma4 mpsqrt-fma4 mptan-fma4 - -CFLAGS-doasin-fma4.c = -mfma4 -CFLAGS-dosincos-fma4.c = -mfma4 -CFLAGS-e_asin-fma4.c = -mfma4 -CFLAGS-e_atan2-fma4.c = -mfma4 -CFLAGS-e_exp-fma4.c = -mfma4 -CFLAGS-e_log-fma4.c = -mfma4 -CFLAGS-e_pow-fma4.c = -mfma4 $(config-cflags-nofma) -CFLAGS-halfulp-fma4.c = -mfma4 -CFLAGS-mpa-fma4.c = -mfma4 -CFLAGS-mpatan-fma4.c = -mfma4 -CFLAGS-mpatan2-fma4.c = -mfma4 -CFLAGS-mpexp-fma4.c = -mfma4 -CFLAGS-mplog-fma4.c = -mfma4 -CFLAGS-mpsqrt-fma4.c = -mfma4 -CFLAGS-mptan-fma4.c = -mfma4 -CFLAGS-s_atan-fma4.c = -mfma4 -CFLAGS-sincos32-fma4.c = -mfma4 -CFLAGS-slowexp-fma4.c = -mfma4 -CFLAGS-slowpow-fma4.c = -mfma4 -CFLAGS-s_sin-fma4.c = -mfma4 -CFLAGS-s_tan-fma4.c = -mfma4 - -libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \ - e_atan2-avx s_sin-avx s_tan-avx \ - mplog-avx mpa-avx slowexp-avx \ - mpexp-avx - -CFLAGS-e_atan2-avx.c = -msse2avx -DSSE2AVX -CFLAGS-e_exp-avx.c = -msse2avx -DSSE2AVX -CFLAGS-e_log-avx.c = -msse2avx -DSSE2AVX -CFLAGS-mpa-avx.c = -msse2avx -DSSE2AVX -CFLAGS-mpexp-avx.c = -msse2avx -DSSE2AVX -CFLAGS-mplog-avx.c = -msse2avx -DSSE2AVX -CFLAGS-s_atan-avx.c = -msse2avx -DSSE2AVX -CFLAGS-s_sin-avx.c = -msse2avx -DSSE2AVX -CFLAGS-slowexp-avx.c = -msse2avx -DSSE2AVX -CFLAGS-s_tan-avx.c = -msse2avx -DSSE2AVX -endif - -ifeq ($(subdir),mathvec) -libmvec-sysdep_routines += svml_d_cos2_core_sse4 
svml_d_cos4_core_avx2 \ - svml_d_cos8_core_avx512 svml_d_sin2_core_sse4 \ - svml_d_sin4_core_avx2 svml_d_sin8_core_avx512 \ - svml_d_log2_core_sse4 svml_d_log4_core_avx2 \ - svml_d_log8_core_avx512 svml_d_sincos2_core_sse4 \ - svml_d_sincos4_core_avx2 svml_d_sincos8_core_avx512 \ - svml_s_cosf4_core_sse4 svml_s_cosf8_core_avx2 \ - svml_s_cosf16_core_avx512 svml_s_sinf4_core_sse4 \ - svml_s_sinf8_core_avx2 svml_s_sinf16_core_avx512 \ - svml_s_logf4_core_sse4 svml_s_logf8_core_avx2 \ - svml_s_logf16_core_avx512 svml_d_exp2_core_sse4 \ - svml_d_exp4_core_avx2 svml_d_exp8_core_avx512 \ - svml_s_expf4_core_sse4 svml_s_expf8_core_avx2 \ - svml_s_expf16_core_avx512 svml_d_pow2_core_sse4 \ - svml_d_pow4_core_avx2 svml_d_pow8_core_avx512 \ - svml_s_powf4_core_sse4 svml_s_powf8_core_avx2 \ - svml_s_powf16_core_avx512 svml_s_sincosf4_core_sse4 \ - svml_s_sincosf8_core_avx2 svml_s_sincosf16_core_avx512 -endif diff --git a/sysdeps/x86_64/fpu/multiarch/doasin-fma4.c b/sysdeps/x86_64/fpu/multiarch/doasin-fma4.c deleted file mode 100644 index 53eb419472..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/doasin-fma4.c +++ /dev/null @@ -1,4 +0,0 @@ -#define __doasin __doasin_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/doasin.c> diff --git a/sysdeps/x86_64/fpu/multiarch/dosincos-fma4.c b/sysdeps/x86_64/fpu/multiarch/dosincos-fma4.c deleted file mode 100644 index 1578b2fce0..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/dosincos-fma4.c +++ /dev/null @@ -1,6 +0,0 @@ -#define __docos __docos_fma4 -#define __dubcos __dubcos_fma4 -#define __dubsin __dubsin_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/dosincos.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_asin-fma4.c b/sysdeps/x86_64/fpu/multiarch/e_asin-fma4.c deleted file mode 100644 index 2657c31f49..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/e_asin-fma4.c +++ /dev/null @@ -1,11 +0,0 @@ -#define __ieee754_acos __ieee754_acos_fma4 
-#define __ieee754_asin __ieee754_asin_fma4 -#define __cos32 __cos32_fma4 -#define __doasin __doasin_fma4 -#define __docos __docos_fma4 -#define __dubcos __dubcos_fma4 -#define __dubsin __dubsin_fma4 -#define __sin32 __sin32_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/e_asin.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_asin.c b/sysdeps/x86_64/fpu/multiarch/e_asin.c deleted file mode 100644 index 111a5b99bd..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/e_asin.c +++ /dev/null @@ -1,26 +0,0 @@ -#include <init-arch.h> -#include <math.h> -#include <math_private.h> - -extern double __ieee754_acos_sse2 (double); -extern double __ieee754_asin_sse2 (double); -extern double __ieee754_acos_fma4 (double); -extern double __ieee754_asin_fma4 (double); - -libm_ifunc (__ieee754_acos, - HAS_ARCH_FEATURE (FMA4_Usable) - ? __ieee754_acos_fma4 - : __ieee754_acos_sse2); -strong_alias (__ieee754_acos, __acos_finite) - -libm_ifunc (__ieee754_asin, - HAS_ARCH_FEATURE (FMA4_Usable) - ? 
__ieee754_asin_fma4 - : __ieee754_asin_sse2); -strong_alias (__ieee754_asin, __asin_finite) - -#define __ieee754_acos __ieee754_acos_sse2 -#define __ieee754_asin __ieee754_asin_sse2 - - -#include <sysdeps/ieee754/dbl-64/e_asin.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_atan2-avx.c b/sysdeps/x86_64/fpu/multiarch/e_atan2-avx.c deleted file mode 100644 index 3012afac37..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/e_atan2-avx.c +++ /dev/null @@ -1,9 +0,0 @@ -#define __ieee754_atan2 __ieee754_atan2_avx -#define __add __add_avx -#define __dbl_mp __dbl_mp_avx -#define __dvd __dvd_avx -#define __mul __mul_avx -#define __sub __sub_avx -#define SECTION __attribute__ ((section (".text.avx"))) - -#include <sysdeps/ieee754/dbl-64/e_atan2.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_atan2-fma4.c b/sysdeps/x86_64/fpu/multiarch/e_atan2-fma4.c deleted file mode 100644 index f4e986293e..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/e_atan2-fma4.c +++ /dev/null @@ -1,10 +0,0 @@ -#define __ieee754_atan2 __ieee754_atan2_fma4 -#define __add __add_fma4 -#define __dbl_mp __dbl_mp_fma4 -#define __dvd __dvd_fma4 -#define __mpatan2 __mpatan2_fma4 -#define __mul __mul_fma4 -#define __sub __sub_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/e_atan2.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_atan2.c b/sysdeps/x86_64/fpu/multiarch/e_atan2.c deleted file mode 100644 index 9ca3c02a44..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/e_atan2.c +++ /dev/null @@ -1,18 +0,0 @@ -#include <init-arch.h> -#include <math.h> -#include <math_private.h> - -extern double __ieee754_atan2_sse2 (double, double); -extern double __ieee754_atan2_avx (double, double); -extern double __ieee754_atan2_fma4 (double, double); - -libm_ifunc (__ieee754_atan2, - HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_atan2_fma4 - : (HAS_ARCH_FEATURE (AVX_Usable) - ? 
__ieee754_atan2_avx : __ieee754_atan2_sse2)); -strong_alias (__ieee754_atan2, __atan2_finite) - -#define __ieee754_atan2 __ieee754_atan2_sse2 - - -#include <sysdeps/ieee754/dbl-64/e_atan2.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_exp-avx.c b/sysdeps/x86_64/fpu/multiarch/e_exp-avx.c deleted file mode 100644 index ee5dd6d2dc..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/e_exp-avx.c +++ /dev/null @@ -1,6 +0,0 @@ -#define __ieee754_exp __ieee754_exp_avx -#define __exp1 __exp1_avx -#define __slowexp __slowexp_avx -#define SECTION __attribute__ ((section (".text.avx"))) - -#include <sysdeps/ieee754/dbl-64/e_exp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c b/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c deleted file mode 100644 index ae6eb67603..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c +++ /dev/null @@ -1,6 +0,0 @@ -#define __ieee754_exp __ieee754_exp_fma4 -#define __exp1 __exp1_fma4 -#define __slowexp __slowexp_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/e_exp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_exp.c b/sysdeps/x86_64/fpu/multiarch/e_exp.c deleted file mode 100644 index b7d7b5ff27..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/e_exp.c +++ /dev/null @@ -1,18 +0,0 @@ -#include <init-arch.h> -#include <math.h> -#include <math_private.h> - -extern double __ieee754_exp_sse2 (double); -extern double __ieee754_exp_avx (double); -extern double __ieee754_exp_fma4 (double); - -libm_ifunc (__ieee754_exp, - HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_exp_fma4 - : (HAS_ARCH_FEATURE (AVX_Usable) - ? 
__ieee754_exp_avx : __ieee754_exp_sse2)); -strong_alias (__ieee754_exp, __exp_finite) - -#define __ieee754_exp __ieee754_exp_sse2 - - -#include <sysdeps/ieee754/dbl-64/e_exp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_log-avx.c b/sysdeps/x86_64/fpu/multiarch/e_log-avx.c deleted file mode 100644 index c669019bc2..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/e_log-avx.c +++ /dev/null @@ -1,8 +0,0 @@ -#define __ieee754_log __ieee754_log_avx -#define __mplog __mplog_avx -#define __add __add_avx -#define __dbl_mp __dbl_mp_avx -#define __sub __sub_avx -#define SECTION __attribute__ ((section (".text.avx"))) - -#include <sysdeps/ieee754/dbl-64/e_log.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c b/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c deleted file mode 100644 index a2346cc618..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c +++ /dev/null @@ -1,8 +0,0 @@ -#define __ieee754_log __ieee754_log_fma4 -#define __mplog __mplog_fma4 -#define __add __add_fma4 -#define __dbl_mp __dbl_mp_fma4 -#define __sub __sub_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/e_log.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_log.c b/sysdeps/x86_64/fpu/multiarch/e_log.c deleted file mode 100644 index cf9533d6c0..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/e_log.c +++ /dev/null @@ -1,18 +0,0 @@ -#include <init-arch.h> -#include <math.h> -#include <math_private.h> - -extern double __ieee754_log_sse2 (double); -extern double __ieee754_log_avx (double); -extern double __ieee754_log_fma4 (double); - -libm_ifunc (__ieee754_log, - HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_log_fma4 - : (HAS_ARCH_FEATURE (AVX_Usable) - ? 
__ieee754_log_avx : __ieee754_log_sse2)); -strong_alias (__ieee754_log, __log_finite) - -#define __ieee754_log __ieee754_log_sse2 - - -#include <sysdeps/ieee754/dbl-64/e_log.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c b/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c deleted file mode 100644 index 5b3ea8e103..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c +++ /dev/null @@ -1,6 +0,0 @@ -#define __ieee754_pow __ieee754_pow_fma4 -#define __exp1 __exp1_fma4 -#define __slowpow __slowpow_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/e_pow.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_pow.c b/sysdeps/x86_64/fpu/multiarch/e_pow.c deleted file mode 100644 index a5c5d89c3e..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/e_pow.c +++ /dev/null @@ -1,17 +0,0 @@ -#include <init-arch.h> -#include <math.h> -#include <math_private.h> - -extern double __ieee754_pow_sse2 (double, double); -extern double __ieee754_pow_fma4 (double, double); - -libm_ifunc (__ieee754_pow, - HAS_ARCH_FEATURE (FMA4_Usable) - ? 
__ieee754_pow_fma4 - : __ieee754_pow_sse2); -strong_alias (__ieee754_pow, __pow_finite) - -#define __ieee754_pow __ieee754_pow_sse2 - - -#include <sysdeps/ieee754/dbl-64/e_pow.c> diff --git a/sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c b/sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c deleted file mode 100644 index a00c17c016..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c +++ /dev/null @@ -1,4 +0,0 @@ -#define __halfulp __halfulp_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/halfulp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mpa-avx.c b/sysdeps/x86_64/fpu/multiarch/mpa-avx.c deleted file mode 100644 index 366b0b7134..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/mpa-avx.c +++ /dev/null @@ -1,14 +0,0 @@ -#define __add __add_avx -#define __mul __mul_avx -#define __sqr __sqr_avx -#define __sub __sub_avx -#define __dbl_mp __dbl_mp_avx -#define __dvd __dvd_avx - -#define NO___CPY 1 -#define NO___MP_DBL 1 -#define NO___ACR 1 -#define NO__CONST 1 -#define SECTION __attribute__ ((section (".text.avx"))) - -#include <sysdeps/ieee754/dbl-64/mpa.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c b/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c deleted file mode 100644 index a4a759407e..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c +++ /dev/null @@ -1,14 +0,0 @@ -#define __add __add_fma4 -#define __mul __mul_fma4 -#define __sqr __sqr_fma4 -#define __sub __sub_fma4 -#define __dbl_mp __dbl_mp_fma4 -#define __dvd __dvd_fma4 - -#define NO___CPY 1 -#define NO___MP_DBL 1 -#define NO___ACR 1 -#define NO__CONST 1 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/mpa.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mpatan-fma4.c b/sysdeps/x86_64/fpu/multiarch/mpatan-fma4.c deleted file mode 100644 index fbd3bd49a2..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/mpatan-fma4.c +++ /dev/null @@ -1,10 +0,0 @@ -#define __mpatan __mpatan_fma4 -#define __add __add_fma4 -#define 
__dvd __dvd_fma4 -#define __mpsqrt __mpsqrt_fma4 -#define __mul __mul_fma4 -#define __sub __sub_fma4 -#define AVOID_MPATAN_H 1 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/mpatan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mpatan2-fma4.c b/sysdeps/x86_64/fpu/multiarch/mpatan2-fma4.c deleted file mode 100644 index e6e44d49b0..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/mpatan2-fma4.c +++ /dev/null @@ -1,9 +0,0 @@ -#define __mpatan2 __mpatan2_fma4 -#define __add __add_fma4 -#define __dvd __dvd_fma4 -#define __mpatan __mpatan_fma4 -#define __mpsqrt __mpsqrt_fma4 -#define __mul __mul_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/mpatan2.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mpexp-avx.c b/sysdeps/x86_64/fpu/multiarch/mpexp-avx.c deleted file mode 100644 index 87f29c96c9..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/mpexp-avx.c +++ /dev/null @@ -1,9 +0,0 @@ -#define __mpexp __mpexp_avx -#define __add __add_avx -#define __dbl_mp __dbl_mp_avx -#define __dvd __dvd_avx -#define __mul __mul_avx -#define AVOID_MPEXP_H 1 -#define SECTION __attribute__ ((section (".text.avx"))) - -#include <sysdeps/ieee754/dbl-64/mpexp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c b/sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c deleted file mode 100644 index 07ca6e9ad0..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c +++ /dev/null @@ -1,9 +0,0 @@ -#define __mpexp __mpexp_fma4 -#define __add __add_fma4 -#define __dbl_mp __dbl_mp_fma4 -#define __dvd __dvd_fma4 -#define __mul __mul_fma4 -#define AVOID_MPEXP_H 1 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/mpexp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mplog-avx.c b/sysdeps/x86_64/fpu/multiarch/mplog-avx.c deleted file mode 100644 index fd783d9a67..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/mplog-avx.c +++ /dev/null @@ -1,8 +0,0 @@ -#define __mplog __mplog_avx 
-#define __add __add_avx -#define __mpexp __mpexp_avx -#define __mul __mul_avx -#define __sub __sub_avx -#define SECTION __attribute__ ((section (".text.avx"))) - -#include <sysdeps/ieee754/dbl-64/mplog.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mplog-fma4.c b/sysdeps/x86_64/fpu/multiarch/mplog-fma4.c deleted file mode 100644 index b4733118d7..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/mplog-fma4.c +++ /dev/null @@ -1,8 +0,0 @@ -#define __mplog __mplog_fma4 -#define __add __add_fma4 -#define __mpexp __mpexp_fma4 -#define __mul __mul_fma4 -#define __sub __sub_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/mplog.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mpsqrt-fma4.c b/sysdeps/x86_64/fpu/multiarch/mpsqrt-fma4.c deleted file mode 100644 index f8a1ba2d92..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/mpsqrt-fma4.c +++ /dev/null @@ -1,8 +0,0 @@ -#define __mpsqrt __mpsqrt_fma4 -#define __dbl_mp __dbl_mp_fma4 -#define __mul __mul_fma4 -#define __sub __sub_fma4 -#define AVOID_MPSQRT_H 1 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/mpsqrt.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mptan-fma4.c b/sysdeps/x86_64/fpu/multiarch/mptan-fma4.c deleted file mode 100644 index fb4a9d48ca..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/mptan-fma4.c +++ /dev/null @@ -1,7 +0,0 @@ -#define __mptan __mptan_fma4 -#define __c32 __c32_fma4 -#define __dvd __dvd_fma4 -#define __mpranred __mpranred_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/mptan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_atan-avx.c b/sysdeps/x86_64/fpu/multiarch/s_atan-avx.c deleted file mode 100644 index b5cb9c3a75..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_atan-avx.c +++ /dev/null @@ -1,8 +0,0 @@ -#define atan __atan_avx -#define __add __add_avx -#define __dbl_mp __dbl_mp_avx -#define __mul __mul_avx -#define __sub __sub_avx -#define SECTION 
__attribute__ ((section (".text.avx"))) - -#include <sysdeps/ieee754/dbl-64/s_atan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c b/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c deleted file mode 100644 index 9e83e6cdab..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c +++ /dev/null @@ -1,9 +0,0 @@ -#define atan __atan_fma4 -#define __add __add_fma4 -#define __dbl_mp __dbl_mp_fma4 -#define __mpatan __mpatan_fma4 -#define __mul __mul_fma4 -#define __sub __sub_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/s_atan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_atan.c b/sysdeps/x86_64/fpu/multiarch/s_atan.c deleted file mode 100644 index 742e95cb96..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_atan.c +++ /dev/null @@ -1,15 +0,0 @@ -#include <init-arch.h> -#include <math.h> - -extern double __atan_sse2 (double); -extern double __atan_avx (double); -extern double __atan_fma4 (double); - -libm_ifunc (atan, (HAS_ARCH_FEATURE (FMA4_Usable) ? __atan_fma4 : - HAS_ARCH_FEATURE (AVX_Usable) - ? __atan_avx : __atan_sse2)); - -#define atan __atan_sse2 - - -#include <sysdeps/ieee754/dbl-64/s_atan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_ceil-c.c b/sysdeps/x86_64/fpu/multiarch/s_ceil-c.c deleted file mode 100644 index 6a5ea3ff27..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_ceil-c.c +++ /dev/null @@ -1,2 +0,0 @@ -#define __ceil __ceil_c -#include <sysdeps/ieee754/dbl-64/wordsize-64/s_ceil.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_ceil.S b/sysdeps/x86_64/fpu/multiarch/s_ceil.S deleted file mode 100644 index f8eef43eff..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_ceil.S +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (C) 2011-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <machine/asm.h> -#include <init-arch.h> - - -ENTRY(__ceil) - .type __ceil, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __ceil_sse41(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jnz 2f - leaq __ceil_c(%rip), %rax -2: ret -END(__ceil) -weak_alias (__ceil, ceil) - - -ENTRY(__ceil_sse41) - roundsd $10, %xmm0, %xmm0 - ret -END(__ceil_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_ceilf-c.c b/sysdeps/x86_64/fpu/multiarch/s_ceilf-c.c deleted file mode 100644 index 229a6273b2..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_ceilf-c.c +++ /dev/null @@ -1,2 +0,0 @@ -#define __ceilf __ceilf_c -#include <sysdeps/ieee754/flt-32/s_ceilf.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_ceilf.S b/sysdeps/x86_64/fpu/multiarch/s_ceilf.S deleted file mode 100644 index 076f10f0f0..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_ceilf.S +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (C) 2011-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <machine/asm.h> -#include <init-arch.h> - - -ENTRY(__ceilf) - .type __ceilf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __ceilf_sse41(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jnz 2f - leaq __ceilf_c(%rip), %rax -2: ret -END(__ceilf) -weak_alias (__ceilf, ceilf) - - -ENTRY(__ceilf_sse41) - roundss $10, %xmm0, %xmm0 - ret -END(__ceilf_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_floor-c.c b/sysdeps/x86_64/fpu/multiarch/s_floor-c.c deleted file mode 100644 index 68733b69ef..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_floor-c.c +++ /dev/null @@ -1,3 +0,0 @@ -#undef __floor -#define __floor __floor_c -#include <sysdeps/ieee754/dbl-64/wordsize-64/s_floor.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_floor.S b/sysdeps/x86_64/fpu/multiarch/s_floor.S deleted file mode 100644 index f519ab24f4..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_floor.S +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (C) 2011-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <machine/asm.h> -#include <init-arch.h> - - -ENTRY(__floor) - .type __floor, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __floor_sse41(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jnz 2f - leaq __floor_c(%rip), %rax -2: ret -END(__floor) -weak_alias (__floor, floor) - - -ENTRY(__floor_sse41) - roundsd $9, %xmm0, %xmm0 - ret -END(__floor_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_floorf-c.c b/sysdeps/x86_64/fpu/multiarch/s_floorf-c.c deleted file mode 100644 index 2386362328..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_floorf-c.c +++ /dev/null @@ -1,3 +0,0 @@ -#undef __floorf -#define __floorf __floorf_c -#include <sysdeps/ieee754/flt-32/s_floorf.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_floorf.S b/sysdeps/x86_64/fpu/multiarch/s_floorf.S deleted file mode 100644 index 8613f73acc..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_floorf.S +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (C) 2011-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <machine/asm.h> -#include <init-arch.h> - - -ENTRY(__floorf) - .type __floorf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __floorf_sse41(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jnz 2f - leaq __floorf_c(%rip), %rax -2: ret -END(__floorf) -weak_alias (__floorf, floorf) - - -ENTRY(__floorf_sse41) - roundss $9, %xmm0, %xmm0 - ret -END(__floorf_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_fma.c b/sysdeps/x86_64/fpu/multiarch/s_fma.c deleted file mode 100644 index 3ac4fed660..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_fma.c +++ /dev/null @@ -1,50 +0,0 @@ -/* FMA version of fma. - Copyright (C) 2009-2017 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <config.h> -#include <math.h> -#include <init-arch.h> - -extern double __fma_sse2 (double x, double y, double z) attribute_hidden; - - -static double -__fma_fma3 (double x, double y, double z) -{ - asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); - return x; -} - - -static double -__fma_fma4 (double x, double y, double z) -{ - asm ("vfmaddsd %3, %2, %1, %0" : "=x" (x) : "x" (x), "x" (y), "x" (z)); - return x; -} - - -libm_ifunc (__fma, HAS_ARCH_FEATURE (FMA_Usable) - ? __fma_fma3 : (HAS_ARCH_FEATURE (FMA4_Usable) - ? 
__fma_fma4 : __fma_sse2)); -weak_alias (__fma, fma) - -#define __fma __fma_sse2 - -#include <sysdeps/ieee754/dbl-64/s_fma.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_fmaf.c b/sysdeps/x86_64/fpu/multiarch/s_fmaf.c deleted file mode 100644 index 1ae227c1d4..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_fmaf.c +++ /dev/null @@ -1,49 +0,0 @@ -/* FMA version of fmaf. - Copyright (C) 2009-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <config.h> -#include <math.h> -#include <init-arch.h> - -extern float __fmaf_sse2 (float x, float y, float z) attribute_hidden; - - -static float -__fmaf_fma3 (float x, float y, float z) -{ - asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); - return x; -} - - -static float -__fmaf_fma4 (float x, float y, float z) -{ - asm ("vfmaddss %3, %2, %1, %0" : "=x" (x) : "x" (x), "x" (y), "x" (z)); - return x; -} - - -libm_ifunc (__fmaf, HAS_ARCH_FEATURE (FMA_Usable) - ? __fmaf_fma3 : (HAS_ARCH_FEATURE (FMA4_Usable) - ? 
__fmaf_fma4 : __fmaf_sse2)); -weak_alias (__fmaf, fmaf) - -#define __fmaf __fmaf_sse2 - -#include <sysdeps/ieee754/dbl-64/s_fmaf.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_nearbyint-c.c b/sysdeps/x86_64/fpu/multiarch/s_nearbyint-c.c deleted file mode 100644 index f897a2a6a6..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_nearbyint-c.c +++ /dev/null @@ -1,3 +0,0 @@ -#undef __nearbyint -#define __nearbyint __nearbyint_c -#include <sysdeps/ieee754/dbl-64/wordsize-64/s_nearbyint.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S b/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S deleted file mode 100644 index 5a734f6027..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (C) 2011-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <machine/asm.h> -#include <init-arch.h> - - -ENTRY(__nearbyint) - .type __nearbyint, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __nearbyint_sse41(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jnz 2f - leaq __nearbyint_c(%rip), %rax -2: ret -END(__nearbyint) -weak_alias (__nearbyint, nearbyint) - - -ENTRY(__nearbyint_sse41) - roundsd $0xc, %xmm0, %xmm0 - ret -END(__nearbyint_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_nearbyintf-c.c b/sysdeps/x86_64/fpu/multiarch/s_nearbyintf-c.c deleted file mode 100644 index aa7768233b..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_nearbyintf-c.c +++ /dev/null @@ -1,3 +0,0 @@ -#undef __nearbyintf -#define __nearbyintf __nearbyintf_c -#include <sysdeps/ieee754/flt-32/s_nearbyintf.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S b/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S deleted file mode 100644 index ad79fd6021..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (C) 2011-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <machine/asm.h> -#include <init-arch.h> - - -ENTRY(__nearbyintf) - .type __nearbyintf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __nearbyintf_sse41(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jnz 2f - leaq __nearbyintf_c(%rip), %rax -2: ret -END(__nearbyintf) -weak_alias (__nearbyintf, nearbyintf) - - -ENTRY(__nearbyintf_sse41) - roundss $0xc, %xmm0, %xmm0 - ret -END(__nearbyintf_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_rint-c.c b/sysdeps/x86_64/fpu/multiarch/s_rint-c.c deleted file mode 100644 index 162a630ff9..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_rint-c.c +++ /dev/null @@ -1,3 +0,0 @@ -#undef __rint -#define __rint __rint_c -#include <sysdeps/ieee754/dbl-64/wordsize-64/s_rint.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_rint.S b/sysdeps/x86_64/fpu/multiarch/s_rint.S deleted file mode 100644 index 4f628a93a4..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_rint.S +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (C) 2011-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <machine/asm.h> -#include <init-arch.h> - - -ENTRY(__rint) - .type __rint, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __rint_sse41(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jnz 2f - leaq __rint_c(%rip), %rax -2: ret -END(__rint) -weak_alias (__rint, rint) - - -ENTRY(__rint_sse41) - roundsd $4, %xmm0, %xmm0 - ret -END(__rint_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_rintf-c.c b/sysdeps/x86_64/fpu/multiarch/s_rintf-c.c deleted file mode 100644 index 8505249f34..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_rintf-c.c +++ /dev/null @@ -1,3 +0,0 @@ -#undef __rintf -#define __rintf __rintf_c -#include <sysdeps/ieee754/flt-32/s_rintf.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_rintf.S b/sysdeps/x86_64/fpu/multiarch/s_rintf.S deleted file mode 100644 index dee4ad794c..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_rintf.S +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (C) 2011-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <machine/asm.h> -#include <init-arch.h> - - -ENTRY(__rintf) - .type __rintf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __rintf_sse41(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jnz 2f - leaq __rintf_c(%rip), %rax -2: ret -END(__rintf) -weak_alias (__rintf, rintf) - - -ENTRY(__rintf_sse41) - roundss $4, %xmm0, %xmm0 - ret -END(__rintf_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_sin-avx.c b/sysdeps/x86_64/fpu/multiarch/s_sin-avx.c deleted file mode 100644 index e1c6de0259..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_sin-avx.c +++ /dev/null @@ -1,5 +0,0 @@ -#define __cos __cos_avx -#define __sin __sin_avx -#define SECTION __attribute__ ((section (".text.avx"))) - -#include <sysdeps/ieee754/dbl-64/s_sin.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sin-fma4.c b/sysdeps/x86_64/fpu/multiarch/s_sin-fma4.c deleted file mode 100644 index 4c35739dc9..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_sin-fma4.c +++ /dev/null @@ -1,11 +0,0 @@ -#define __cos __cos_fma4 -#define __sin __sin_fma4 -#define __docos __docos_fma4 -#define __dubsin __dubsin_fma4 -#define __mpcos __mpcos_fma4 -#define __mpcos1 __mpcos1_fma4 -#define __mpsin __mpsin_fma4 -#define __mpsin1 __mpsin1_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/s_sin.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sin.c b/sysdeps/x86_64/fpu/multiarch/s_sin.c deleted file mode 100644 index 8ffd3e7125..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_sin.c +++ /dev/null @@ -1,26 +0,0 @@ -#include <init-arch.h> -#include <math.h> -#undef NAN - -extern double __cos_sse2 (double); -extern double __sin_sse2 (double); -extern double __cos_avx (double); -extern double __sin_avx (double); -extern double __cos_fma4 (double); -extern double __sin_fma4 (double); - -libm_ifunc (__cos, (HAS_ARCH_FEATURE (FMA4_Usable) ? __cos_fma4 : - HAS_ARCH_FEATURE (AVX_Usable) - ? 
__cos_avx : __cos_sse2)); -weak_alias (__cos, cos) - -libm_ifunc (__sin, (HAS_ARCH_FEATURE (FMA4_Usable) ? __sin_fma4 : - HAS_ARCH_FEATURE (AVX_Usable) - ? __sin_avx : __sin_sse2)); -weak_alias (__sin, sin) - -#define __cos __cos_sse2 -#define __sin __sin_sse2 - - -#include <sysdeps/ieee754/dbl-64/s_sin.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_tan-avx.c b/sysdeps/x86_64/fpu/multiarch/s_tan-avx.c deleted file mode 100644 index 53de5d3c98..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_tan-avx.c +++ /dev/null @@ -1,6 +0,0 @@ -#define tan __tan_avx -#define __dbl_mp __dbl_mp_avx -#define __sub __sub_avx -#define SECTION __attribute__ ((section (".text.avx"))) - -#include <sysdeps/ieee754/dbl-64/s_tan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c b/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c deleted file mode 100644 index a805440b46..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c +++ /dev/null @@ -1,8 +0,0 @@ -#define tan __tan_fma4 -#define __dbl_mp __dbl_mp_fma4 -#define __mpranred __mpranred_fma4 -#define __mptan __mptan_fma4 -#define __sub __sub_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/s_tan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_tan.c b/sysdeps/x86_64/fpu/multiarch/s_tan.c deleted file mode 100644 index 25f3bca07e..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_tan.c +++ /dev/null @@ -1,15 +0,0 @@ -#include <init-arch.h> -#include <math.h> - -extern double __tan_sse2 (double); -extern double __tan_avx (double); -extern double __tan_fma4 (double); - -libm_ifunc (tan, (HAS_ARCH_FEATURE (FMA4_Usable) ? __tan_fma4 : - HAS_ARCH_FEATURE (AVX_Usable) - ? 
__tan_avx : __tan_sse2)); - -#define tan __tan_sse2 - - -#include <sysdeps/ieee754/dbl-64/s_tan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/sincos32-fma4.c b/sysdeps/x86_64/fpu/multiarch/sincos32-fma4.c deleted file mode 100644 index ebbfa18cca..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/sincos32-fma4.c +++ /dev/null @@ -1,15 +0,0 @@ -#define __cos32 __cos32_fma4 -#define __sin32 __sin32_fma4 -#define __c32 __c32_fma4 -#define __mpsin __mpsin_fma4 -#define __mpsin1 __mpsin1_fma4 -#define __mpcos __mpcos_fma4 -#define __mpcos1 __mpcos1_fma4 -#define __mpranred __mpranred_fma4 -#define __add __add_fma4 -#define __dbl_mp __dbl_mp_fma4 -#define __mul __mul_fma4 -#define __sub __sub_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/sincos32.c> diff --git a/sysdeps/x86_64/fpu/multiarch/slowexp-avx.c b/sysdeps/x86_64/fpu/multiarch/slowexp-avx.c deleted file mode 100644 index d01c6d71a4..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/slowexp-avx.c +++ /dev/null @@ -1,9 +0,0 @@ -#define __slowexp __slowexp_avx -#define __add __add_avx -#define __dbl_mp __dbl_mp_avx -#define __mpexp __mpexp_avx -#define __mul __mul_avx -#define __sub __sub_avx -#define SECTION __attribute__ ((section (".text.avx"))) - -#include <sysdeps/ieee754/dbl-64/slowexp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c b/sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c deleted file mode 100644 index 3bcde84233..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c +++ /dev/null @@ -1,9 +0,0 @@ -#define __slowexp __slowexp_fma4 -#define __add __add_fma4 -#define __dbl_mp __dbl_mp_fma4 -#define __mpexp __mpexp_fma4 -#define __mul __mul_fma4 -#define __sub __sub_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/slowexp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c b/sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c deleted file mode 100644 index 69d69823bb..0000000000 --- 
a/sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c +++ /dev/null @@ -1,11 +0,0 @@ -#define __slowpow __slowpow_fma4 -#define __add __add_fma4 -#define __dbl_mp __dbl_mp_fma4 -#define __mpexp __mpexp_fma4 -#define __mplog __mplog_fma4 -#define __mul __mul_fma4 -#define __sub __sub_fma4 -#define __halfulp __halfulp_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/slowpow.c> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S deleted file mode 100644 index b209492442..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized cos, vector length is 2. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN2v_cos) - .type _ZGVbN2v_cos, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN2v_cos_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN2v_cos_sse2(%rip), %rax - ret -END (_ZGVbN2v_cos) -libmvec_hidden_def (_ZGVbN2v_cos) - -#define _ZGVbN2v_cos _ZGVbN2v_cos_sse2 -#include "../svml_d_cos2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S deleted file mode 100644 index 858dc6532f..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S +++ /dev/null @@ -1,223 +0,0 @@ -/* Function cos vectorized with SSE4. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include "svml_d_trig_data.h" - - .text -ENTRY (_ZGVbN2v_cos_sse4) -/* ALGORITHM DESCRIPTION: - - ( low accuracy ( < 4ulp ) or enhanced performance - ( half of correct mantissa ) implementation ) - - Argument representation: - arg + Pi/2 = (N*Pi + R) - - Result calculation: - cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R) - sin(R) is approximated by corresponding polynomial - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $320, %rsp - movaps %xmm0, %xmm3 - movq __svml_d_trig_data@GOTPCREL(%rip), %rax - movups __dHalfPI(%rax), %xmm2 - -/* ARGUMENT RANGE REDUCTION: - Add Pi/2 to argument: X' = X+Pi/2 - */ - addpd %xmm3, %xmm2 - movups __dInvPI(%rax), %xmm5 - movups __dAbsMask(%rax), %xmm4 - -/* Get absolute argument value: X' = |X'| */ - andps %xmm2, %xmm4 - -/* Y = X'*InvPi + RS : right shifter add */ - mulpd %xmm5, %xmm2 - -/* Check for large arguments path */ - cmpnlepd __dRangeVal(%rax), %xmm4 - movups __dRShifter(%rax), %xmm6 - addpd %xmm6, %xmm2 - movmskpd %xmm4, %ecx - -/* N = Y - RS : right shifter sub */ - movaps %xmm2, %xmm1 - -/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ - psllq $63, %xmm2 - subpd %xmm6, %xmm1 - -/* N = N - 0.5 */ - subpd __dOneHalf(%rax), %xmm1 - movups __dPI1(%rax), %xmm7 - -/* R = X - N*Pi1 */ - mulpd %xmm1, %xmm7 - movups __dPI2(%rax), %xmm4 - -/* R = R - N*Pi2 */ - mulpd %xmm1, %xmm4 - subpd %xmm7, %xmm0 - movups __dPI3(%rax), %xmm5 - -/* R = R - N*Pi3 */ - mulpd %xmm1, %xmm5 - subpd %xmm4, %xmm0 - -/* R = R - N*Pi4 */ - movups __dPI4(%rax), %xmm6 - mulpd %xmm6, %xmm1 - subpd %xmm5, %xmm0 - subpd %xmm1, %xmm0 - -/* POLYNOMIAL APPROXIMATION: R2 = R*R */ - movaps %xmm0, %xmm4 - mulpd %xmm0, %xmm4 - movups __dC7(%rax), %xmm1 - mulpd %xmm4, %xmm1 - addpd __dC6(%rax), %xmm1 - mulpd %xmm4, %xmm1 - addpd __dC5(%rax), %xmm1 - mulpd %xmm4, %xmm1 - addpd __dC4(%rax), %xmm1 - -/* Poly = 
C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ - mulpd %xmm4, %xmm1 - addpd __dC3(%rax), %xmm1 - -/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ - mulpd %xmm4, %xmm1 - addpd __dC2(%rax), %xmm1 - mulpd %xmm4, %xmm1 - addpd __dC1(%rax), %xmm1 - mulpd %xmm1, %xmm4 - mulpd %xmm0, %xmm4 - addpd %xmm4, %xmm0 - -/* RECONSTRUCTION: - Final sign setting: Res = Poly^SignRes */ - xorps %xmm2, %xmm0 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - movups %xmm3, 192(%rsp) - movups %xmm0, 256(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - movups %xmm8, 112(%rsp) - movups %xmm9, 96(%rsp) - movups %xmm10, 80(%rsp) - movups %xmm11, 64(%rsp) - movups %xmm12, 48(%rsp) - movups %xmm13, 32(%rsp) - movups %xmm14, 16(%rsp) - movups %xmm15, (%rsp) - movq %rsi, 136(%rsp) - movq %rdi, 128(%rsp) - movq %r12, 168(%rsp) - cfi_offset_rel_rsp (12, 168) - movb %dl, %r12b - movq %r13, 160(%rsp) - cfi_offset_rel_rsp (13, 160) - movl %ecx, %r13d - movq %r14, 152(%rsp) - cfi_offset_rel_rsp (14, 152) - movl %eax, %r14d - movq %r15, 144(%rsp) - cfi_offset_rel_rsp (15, 144) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - movups 112(%rsp), %xmm8 - movups 96(%rsp), %xmm9 - movups 80(%rsp), %xmm10 - movups 64(%rsp), %xmm11 - movups 48(%rsp), %xmm12 - movups 32(%rsp), %xmm13 - movups 16(%rsp), %xmm14 - movups (%rsp), %xmm15 - movq 136(%rsp), %rsi - movq 128(%rsp), %rdi - movq 168(%rsp), %r12 - cfi_restore (%r12) - movq 160(%rsp), %r13 - cfi_restore (%r13) - movq 152(%rsp), %r14 - cfi_restore (%r14) - movq 144(%rsp), %r15 - cfi_restore (%r15) - movups 256(%rsp), %xmm0 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - movsd 200(%rsp,%r15), %xmm0 
- - call JUMPTARGET(cos) - - movsd %xmm0, 264(%rsp,%r15) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - shlq $4, %r15 - movsd 192(%rsp,%r15), %xmm0 - - call JUMPTARGET(cos) - - movsd %xmm0, 256(%rsp,%r15) - jmp .LBL_1_7 - -END (_ZGVbN2v_cos_sse4) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S deleted file mode 100644 index ff382e9c6c..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized cos, vector length is 4. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN4v_cos) - .type _ZGVdN4v_cos, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN4v_cos_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN4v_cos_sse_wrapper(%rip), %rax - ret -END (_ZGVdN4v_cos) -libmvec_hidden_def (_ZGVdN4v_cos) - -#define _ZGVdN4v_cos _ZGVdN4v_cos_sse_wrapper -#include "../svml_d_cos4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S deleted file mode 100644 index 4b6d09743b..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S +++ /dev/null @@ -1,207 +0,0 @@ -/* Function cos vectorized with AVX2. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include "svml_d_trig_data.h" - - .text -ENTRY (_ZGVdN4v_cos_avx2) - -/* ALGORITHM DESCRIPTION: - - ( low accuracy ( < 4ulp ) or enhanced performance - ( half of correct mantissa ) implementation ) - - Argument representation: - arg + Pi/2 = (N*Pi + R) - - Result calculation: - cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R) - sin(R) is approximated by corresponding polynomial - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $448, %rsp - movq __svml_d_trig_data@GOTPCREL(%rip), %rax - vmovapd %ymm0, %ymm1 - vmovupd __dInvPI(%rax), %ymm4 - vmovupd __dRShifter(%rax), %ymm5 - -/* - ARGUMENT RANGE REDUCTION: - Add Pi/2 to argument: X' = X+Pi/2 - */ - vaddpd __dHalfPI(%rax), %ymm1, %ymm7 - -/* Get absolute argument value: X' = |X'| */ - vandpd __dAbsMask(%rax), %ymm7, %ymm2 - -/* Y = X'*InvPi + RS : right shifter add */ - vfmadd213pd %ymm5, %ymm4, %ymm7 - vmovupd __dC7(%rax), %ymm4 - -/* Check for large arguments path */ - vcmpnle_uqpd __dRangeVal(%rax), %ymm2, %ymm3 - -/* N = Y - RS : right shifter sub */ - vsubpd %ymm5, %ymm7, %ymm6 - vmovupd __dPI1_FMA(%rax), %ymm2 - -/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ - vpsllq $63, %ymm7, %ymm7 - -/* N = N - 0.5 */ - vsubpd __dOneHalf(%rax), %ymm6, %ymm0 - vmovmskpd %ymm3, %ecx - -/* R = X - N*Pi1 */ - vmovapd %ymm1, %ymm3 - vfnmadd231pd %ymm0, %ymm2, %ymm3 - -/* R = R - N*Pi2 */ - vfnmadd231pd __dPI2_FMA(%rax), %ymm0, %ymm3 - -/* R = R - N*Pi3 */ - vfnmadd132pd __dPI3_FMA(%rax), %ymm3, %ymm0 - -/* POLYNOMIAL APPROXIMATION: R2 = R*R */ - vmulpd %ymm0, %ymm0, %ymm5 - vfmadd213pd __dC6(%rax), %ymm5, %ymm4 - vfmadd213pd __dC5(%rax), %ymm5, %ymm4 - vfmadd213pd __dC4(%rax), %ymm5, %ymm4 - -/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ - vfmadd213pd __dC3(%rax), %ymm5, %ymm4 - -/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ - vfmadd213pd __dC2(%rax), %ymm5, %ymm4 - vfmadd213pd 
__dC1(%rax), %ymm5, %ymm4 - vmulpd %ymm5, %ymm4, %ymm6 - vfmadd213pd %ymm0, %ymm0, %ymm6 - -/* - RECONSTRUCTION: - Final sign setting: Res = Poly^SignRes */ - vxorpd %ymm7, %ymm6, %ymm0 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovupd %ymm1, 320(%rsp) - vmovupd %ymm0, 384(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - vmovups %ymm8, 224(%rsp) - vmovups %ymm9, 192(%rsp) - vmovups %ymm10, 160(%rsp) - vmovups %ymm11, 128(%rsp) - vmovups %ymm12, 96(%rsp) - vmovups %ymm13, 64(%rsp) - vmovups %ymm14, 32(%rsp) - vmovups %ymm15, (%rsp) - movq %rsi, 264(%rsp) - movq %rdi, 256(%rsp) - movq %r12, 296(%rsp) - cfi_offset_rel_rsp (12, 296) - movb %dl, %r12b - movq %r13, 288(%rsp) - cfi_offset_rel_rsp (13, 288) - movl %ecx, %r13d - movq %r14, 280(%rsp) - cfi_offset_rel_rsp (14, 280) - movl %eax, %r14d - movq %r15, 272(%rsp) - cfi_offset_rel_rsp (15, 272) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - vmovups 224(%rsp), %ymm8 - vmovups 192(%rsp), %ymm9 - vmovups 160(%rsp), %ymm10 - vmovups 128(%rsp), %ymm11 - vmovups 96(%rsp), %ymm12 - vmovups 64(%rsp), %ymm13 - vmovups 32(%rsp), %ymm14 - vmovups (%rsp), %ymm15 - vmovupd 384(%rsp), %ymm0 - movq 264(%rsp), %rsi - movq 256(%rsp), %rdi - movq 296(%rsp), %r12 - cfi_restore (%r12) - movq 288(%rsp), %r13 - cfi_restore (%r13) - movq 280(%rsp), %r14 - cfi_restore (%r14) - movq 272(%rsp), %r15 - cfi_restore (%r15) - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 328(%rsp,%r15), %xmm0 - vzeroupper - - call JUMPTARGET(cos) - - vmovsd %xmm0, 392(%rsp,%r15) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 320(%rsp,%r15), 
%xmm0 - vzeroupper - - call JUMPTARGET(cos) - - vmovsd %xmm0, 384(%rsp,%r15) - jmp .LBL_1_7 - -END (_ZGVdN4v_cos_avx2) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S deleted file mode 100644 index 46d35a25d2..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized cos, vector length is 8. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN8v_cos) - .type _ZGVeN8v_cos, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX -1: leaq _ZGVeN8v_cos_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN8v_cos_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN8v_cos_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN8v_cos) - -#define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper -#include "../svml_d_cos8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S deleted file mode 100644 index e7af83c6d5..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S +++ /dev/null @@ -1,463 +0,0 @@ -/* Function cos vectorized with AVX-512, KNL and SKX versions. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include "svml_d_trig_data.h" -#include "svml_d_wrapper_impl.h" - - .text -ENTRY (_ZGVeN8v_cos_knl) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512 _ZGVdN4v_cos -#else -/* - ALGORITHM DESCRIPTION: - - ( low accuracy ( < 4ulp ) or enhanced performance - ( half of correct mantissa ) implementation ) - - Argument representation: - arg + Pi/2 = (N*Pi + R) - - Result calculation: - cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R) - sin(R) is approximated by corresponding polynomial - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1280, %rsp - movq __svml_d_trig_data@GOTPCREL(%rip), %rax - -/* R = X - N*Pi1 */ - vmovaps %zmm0, %zmm7 - -/* Check for large arguments path */ - movq $-1, %rcx - -/* - ARGUMENT RANGE REDUCTION: - Add Pi/2 to argument: X' = X+Pi/2 - */ - vaddpd __dHalfPI(%rax), %zmm0, %zmm5 - vmovups __dInvPI(%rax), %zmm3 - -/* Get absolute argument value: X' = |X'| */ - vpandq __dAbsMask(%rax), %zmm5, %zmm1 - -/* Y = X'*InvPi + RS : right shifter add */ - vfmadd213pd __dRShifter(%rax), %zmm3, %zmm5 - vmovups __dPI1_FMA(%rax), %zmm6 - -/* N = Y - RS : right shifter sub */ - vsubpd __dRShifter(%rax), %zmm5, %zmm4 - -/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ - vpsllq $63, %zmm5, %zmm12 - vmovups __dC7(%rax), %zmm8 - -/* N = N - 0.5 */ - vsubpd __dOneHalf(%rax), %zmm4, %zmm10 - vcmppd $22, __dRangeVal(%rax), %zmm1, %k1 - vpbroadcastq %rcx, %zmm2{%k1}{z} - vfnmadd231pd %zmm10, %zmm6, %zmm7 - vptestmq %zmm2, %zmm2, %k0 - -/* R = R - N*Pi2 */ - vfnmadd231pd __dPI2_FMA(%rax), %zmm10, %zmm7 - kmovw %k0, %ecx - movzbl %cl, %ecx - -/* R = R - N*Pi3 */ - vfnmadd132pd __dPI3_FMA(%rax), %zmm7, %zmm10 - -/* - POLYNOMIAL APPROXIMATION: - R2 = R*R - */ - vmulpd %zmm10, %zmm10, %zmm9 - vfmadd213pd __dC6(%rax), %zmm9, %zmm8 - vfmadd213pd __dC5(%rax), %zmm9, %zmm8 - vfmadd213pd __dC4(%rax), %zmm9, %zmm8 - -/* Poly = 
C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ - vfmadd213pd __dC3(%rax), %zmm9, %zmm8 - -/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ - vfmadd213pd __dC2(%rax), %zmm9, %zmm8 - vfmadd213pd __dC1(%rax), %zmm9, %zmm8 - vmulpd %zmm9, %zmm8, %zmm11 - vfmadd213pd %zmm10, %zmm10, %zmm11 - -/* - RECONSTRUCTION: - Final sign setting: Res = Poly^SignRes - */ - vpxorq %zmm12, %zmm11, %zmm1 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - vmovaps %zmm1, %zmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovups %zmm0, 1152(%rsp) - vmovups %zmm1, 1216(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - kmovw %k4, 1048(%rsp) - xorl %eax, %eax - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - addb $1, %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - kmovw 1048(%rsp), %k4 - movq 1064(%rsp), %rsi - kmovw 1040(%rsp), %k5 - movq 1056(%rsp), %rdi - kmovw 1032(%rsp), %k6 - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - 
cfi_restore (%r13) - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - vmovups 1216(%rsp), %zmm1 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1160(%rsp,%r15), %xmm0 - call JUMPTARGET(cos) - vmovsd %xmm0, 1224(%rsp,%r15) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1152(%rsp,%r15), %xmm0 - call JUMPTARGET(cos) - vmovsd %xmm0, 1216(%rsp,%r15) - jmp .LBL_1_7 -#endif -END (_ZGVeN8v_cos_knl) - -ENTRY (_ZGVeN8v_cos_skx) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512 _ZGVdN4v_cos -#else -/* - ALGORITHM DESCRIPTION: - - ( low accuracy ( < 4ulp ) or enhanced performance - ( half of correct mantissa ) implementation ) - - Argument representation: - arg + Pi/2 = (N*Pi + R) - - Result calculation: - cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R) - sin(R) is approximated by corresponding polynomial - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1280, %rsp - movq __svml_d_trig_data@GOTPCREL(%rip), %rax - -/* R = X - N*Pi1 */ - vmovaps %zmm0, %zmm8 - -/* Check for large arguments path */ - vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2 - -/* - ARGUMENT RANGE REDUCTION: - Add Pi/2 to argument: X' = X+Pi/2 - */ - vaddpd __dHalfPI(%rax), %zmm0, %zmm6 - vmovups __dInvPI(%rax), %zmm3 - vmovups __dRShifter(%rax), %zmm4 - vmovups __dPI1_FMA(%rax), %zmm7 - vmovups __dC7(%rax), %zmm9 - -/* 
Get absolute argument value: X' = |X'| */ - vandpd __dAbsMask(%rax), %zmm6, %zmm1 - -/* Y = X'*InvPi + RS : right shifter add */ - vfmadd213pd %zmm4, %zmm3, %zmm6 - vcmppd $18, __dRangeVal(%rax), %zmm1, %k1 - -/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ - vpsllq $63, %zmm6, %zmm13 - -/* N = Y - RS : right shifter sub */ - vsubpd %zmm4, %zmm6, %zmm5 - -/* N = N - 0.5 */ - vsubpd __dOneHalf(%rax), %zmm5, %zmm11 - vfnmadd231pd %zmm11, %zmm7, %zmm8 - -/* R = R - N*Pi2 */ - vfnmadd231pd __dPI2_FMA(%rax), %zmm11, %zmm8 - -/* R = R - N*Pi3 */ - vfnmadd132pd __dPI3_FMA(%rax), %zmm8, %zmm11 - -/* - POLYNOMIAL APPROXIMATION: - R2 = R*R - */ - vmulpd %zmm11, %zmm11, %zmm10 - vfmadd213pd __dC6(%rax), %zmm10, %zmm9 - vfmadd213pd __dC5(%rax), %zmm10, %zmm9 - vfmadd213pd __dC4(%rax), %zmm10, %zmm9 - -/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ - vfmadd213pd __dC3(%rax), %zmm10, %zmm9 - -/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ - vfmadd213pd __dC2(%rax), %zmm10, %zmm9 - vfmadd213pd __dC1(%rax), %zmm10, %zmm9 - vmulpd %zmm10, %zmm9, %zmm12 - vfmadd213pd %zmm11, %zmm11, %zmm12 - vpandnq %zmm1, %zmm1, %zmm2{%k1} - vcmppd $3, %zmm2, %zmm2, %k0 - -/* - RECONSTRUCTION: - Final sign setting: Res = Poly^SignRes - */ - vxorpd %zmm13, %zmm12, %zmm1 - kmovw %k0, %ecx - testl %ecx, %ecx - jne .LBL_2_3 - -.LBL_2_2: - cfi_remember_state - vmovaps %zmm1, %zmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_2_3: - cfi_restore_state - vmovups %zmm0, 1152(%rsp) - vmovups %zmm1, 1216(%rsp) - je .LBL_2_2 - - xorb %dl, %dl - xorl %eax, %eax - kmovw %k4, 1048(%rsp) - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 
384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - -.LBL_2_6: - btl %r14d, %r13d - jc .LBL_2_12 - -.LBL_2_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_2_10 - -.LBL_2_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_2_6 - - kmovw 1048(%rsp), %k4 - kmovw 1040(%rsp), %k5 - kmovw 1032(%rsp), %k6 - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - vmovups 1216(%rsp), %zmm1 - movq 1064(%rsp), %rsi - movq 1056(%rsp), %rdi - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - cfi_restore (%r13) - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - jmp .LBL_2_2 - -.LBL_2_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1160(%rsp,%r15), %xmm0 - vzeroupper - vmovsd 1160(%rsp,%r15), %xmm0 - - call JUMPTARGET(cos) - - vmovsd %xmm0, 1224(%rsp,%r15) - jmp .LBL_2_8 - -.LBL_2_12: - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1152(%rsp,%r15), %xmm0 - vzeroupper - vmovsd 1152(%rsp,%r15), %xmm0 - - call JUMPTARGET(cos) - - vmovsd %xmm0, 1216(%rsp,%r15) - jmp .LBL_2_7 -#endif -END (_ZGVeN8v_cos_skx) - - .section 
.rodata, "a" -.L_2il0floatpacket.16: - .long 0xffffffff,0xffffffff - .type .L_2il0floatpacket.16,@object diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S deleted file mode 100644 index 5a17e11a0f..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized exp. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN2v_exp) - .type _ZGVbN2v_exp, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN2v_exp_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN2v_exp_sse2(%rip), %rax - ret -END (_ZGVbN2v_exp) -libmvec_hidden_def (_ZGVbN2v_exp) - -#define _ZGVbN2v_exp _ZGVbN2v_exp_sse2 -#include "../svml_d_exp2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S deleted file mode 100644 index 864dc5ae9f..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S +++ /dev/null @@ -1,225 +0,0 @@ -/* Function exp vectorized with SSE4. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_d_exp_data.h" - - .text -ENTRY (_ZGVbN2v_exp_sse4) -/* - ALGORITHM DESCRIPTION: - - Argument representation: - N = rint(X*2^k/ln2) = 2^k*M+j - X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r - then -ln2/2^(k+1) < r < ln2/2^(k+1) - Alternatively: - N = trunc(X*2^k/ln2) - then 0 < r < ln2/2^k - - Result calculation: - exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) - = 2^M * 2^(j/2^k) * exp(r) - 2^M is calculated by bit manipulation - 2^(j/2^k) is stored in table - exp(r) is approximated by polynomial. - - The table lookup is skipped if k = 0. 
*/ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $320, %rsp - movaps %xmm0, %xmm3 - movq __svml_dexp_data@GOTPCREL(%rip), %r8 - -/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ - pshufd $221, %xmm3, %xmm7 - movups __dbInvLn2(%r8), %xmm0 - -/* dK = X*dbInvLn2 */ - mulpd %xmm3, %xmm0 - movq __iAbsMask(%r8), %xmm5 - movq __iDomainRange(%r8), %xmm6 - -/* iAbsX = iAbsX&iAbsMask */ - pand %xmm5, %xmm7 - -/* iRangeMask = (iAbsX>iDomainRange) */ - pcmpgtd %xmm6, %xmm7 - -/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ - movmskps %xmm7, %eax - -/* dN = rint(X*2^k/Ln2) */ - xorps %xmm7, %xmm7 - movups __dbLn2hi(%r8), %xmm5 - movups __dbLn2lo(%r8), %xmm6 - roundpd $0, %xmm0, %xmm7 - -/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ - mulpd %xmm7, %xmm5 - -/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ - mulpd %xmm6, %xmm7 - movups __dbShifter(%r8), %xmm4 - -/* dM = X*dbInvLn2+dbShifter */ - addpd %xmm0, %xmm4 - movaps %xmm3, %xmm0 - subpd %xmm5, %xmm0 - subpd %xmm7, %xmm0 - movups __dPC2(%r8), %xmm5 - -/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ - mulpd %xmm0, %xmm5 - addpd __dPC1(%r8), %xmm5 - mulpd %xmm0, %xmm5 - movups __dPC0(%r8), %xmm6 - addpd %xmm6, %xmm5 - mulpd %xmm5, %xmm0 - movdqu __lIndexMask(%r8), %xmm2 - -/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */ - movdqa %xmm2, %xmm1 - -/* lM = (*(longlong*)&dM)&(~lIndexMask) */ - pandn %xmm4, %xmm2 - pand %xmm4, %xmm1 - -/* lM = lM<<(52-K), 2^M */ - psllq $42, %xmm2 - -/* table lookup for dT[j] = 2^(j/2^k) */ - movd %xmm1, %edx - pextrw $4, %xmm1, %ecx - addpd %xmm0, %xmm6 - shll $3, %edx - shll $3, %ecx - movq (%r8,%rdx), %xmm0 - andl $3, %eax - movhpd (%r8,%rcx), %xmm0 - -/* 2^(j/2^k) * exp(r) */ - mulpd %xmm6, %xmm0 - -/* multiply by 2^M through integer add */ - paddq %xmm2, %xmm0 - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movq %rbp, %rsp 
- cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - movups %xmm3, 192(%rsp) - movups %xmm0, 256(%rsp) - je .LBL_1_2 - - xorb %cl, %cl - xorl %edx, %edx - movups %xmm8, 112(%rsp) - movups %xmm9, 96(%rsp) - movups %xmm10, 80(%rsp) - movups %xmm11, 64(%rsp) - movups %xmm12, 48(%rsp) - movups %xmm13, 32(%rsp) - movups %xmm14, 16(%rsp) - movups %xmm15, (%rsp) - movq %rsi, 136(%rsp) - movq %rdi, 128(%rsp) - movq %r12, 168(%rsp) - cfi_offset_rel_rsp (12, 168) - movb %cl, %r12b - movq %r13, 160(%rsp) - cfi_offset_rel_rsp (13, 160) - movl %eax, %r13d - movq %r14, 152(%rsp) - cfi_offset_rel_rsp (14, 152) - movl %edx, %r14d - movq %r15, 144(%rsp) - cfi_offset_rel_rsp (15, 144) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - movups 112(%rsp), %xmm8 - movups 96(%rsp), %xmm9 - movups 80(%rsp), %xmm10 - movups 64(%rsp), %xmm11 - movups 48(%rsp), %xmm12 - movups 32(%rsp), %xmm13 - movups 16(%rsp), %xmm14 - movups (%rsp), %xmm15 - movq 136(%rsp), %rsi - movq 128(%rsp), %rdi - movq 168(%rsp), %r12 - cfi_restore (%r12) - movq 160(%rsp), %r13 - cfi_restore (%r13) - movq 152(%rsp), %r14 - cfi_restore (%r14) - movq 144(%rsp), %r15 - cfi_restore (%r15) - movups 256(%rsp), %xmm0 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - movsd 200(%rsp,%r15), %xmm0 - - call JUMPTARGET(__exp_finite) - - movsd %xmm0, 264(%rsp,%r15) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - shlq $4, %r15 - movsd 192(%rsp,%r15), %xmm0 - - call JUMPTARGET(__exp_finite) - - movsd %xmm0, 256(%rsp,%r15) - jmp .LBL_1_7 - -END (_ZGVbN2v_exp_sse4) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S deleted file mode 100644 index b994a794cd..0000000000 --- 
a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized exp. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN4v_exp) - .type _ZGVdN4v_exp, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN4v_exp_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN4v_exp_sse_wrapper(%rip), %rax - ret -END (_ZGVdN4v_exp) -libmvec_hidden_def (_ZGVdN4v_exp) - -#define _ZGVdN4v_exp _ZGVdN4v_exp_sse_wrapper -#include "../svml_d_exp4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S deleted file mode 100644 index 937b3c09a6..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S +++ /dev/null @@ -1,212 +0,0 @@ -/* Function exp vectorized with AVX2. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_d_exp_data.h" - - .text -ENTRY (_ZGVdN4v_exp_avx2) -/* - ALGORITHM DESCRIPTION: - - Argument representation: - N = rint(X*2^k/ln2) = 2^k*M+j - X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r - then -ln2/2^(k+1) < r < ln2/2^(k+1) - Alternatively: - N = trunc(X*2^k/ln2) - then 0 < r < ln2/2^k - - Result calculation: - exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) - = 2^M * 2^(j/2^k) * exp(r) - 2^M is calculated by bit manipulation - 2^(j/2^k) is stored in table - exp(r) is approximated by polynomial - - The table lookup is skipped if k = 0. */ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $448, %rsp - movq __svml_dexp_data@GOTPCREL(%rip), %rax - vmovdqa %ymm0, %ymm2 - vmovupd __dbInvLn2(%rax), %ymm3 - vmovupd __dbShifter(%rax), %ymm1 - vmovupd __lIndexMask(%rax), %ymm4 - -/* dM = X*dbInvLn2+dbShifter, dbInvLn2 = 2^k/Ln2 */ - vfmadd213pd %ymm1, %ymm2, %ymm3 - -/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ - vextracti128 $1, %ymm2, %xmm5 - vshufps $221, %xmm5, %xmm2, %xmm6 - -/* iAbsX = iAbsX&iAbsMask */ - vandps __iAbsMask(%rax), %xmm6, %xmm7 - -/* dN = dM-dbShifter, dN = rint(X*2^k/Ln2) */ - vsubpd %ymm1, %ymm3, %ymm6 - -/* iRangeMask = (iAbsX>iDomainRange) */ - vpcmpgtd __iDomainRange(%rax), %xmm7, %xmm0 - vmovupd __dbLn2hi(%rax), %ymm1 - vmovupd __dPC0(%rax), %ymm7 - -/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ - vmovmskps %xmm0, %ecx - vmovupd __dPC2(%rax), %ymm0 - -/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of 
ln2/2^k */ - vmovdqa %ymm2, %ymm5 - vfnmadd231pd %ymm6, %ymm1, %ymm5 - -/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ - vfnmadd132pd __dbLn2lo(%rax), %ymm5, %ymm6 - -/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ - vfmadd213pd __dPC1(%rax), %ymm6, %ymm0 - vfmadd213pd %ymm7, %ymm6, %ymm0 - vfmadd213pd %ymm7, %ymm6, %ymm0 - -/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */ - vandps %ymm4, %ymm3, %ymm1 - -/* table lookup for dT[j] = 2^(j/2^k) */ - vxorpd %ymm6, %ymm6, %ymm6 - vpcmpeqd %ymm5, %ymm5, %ymm5 - vgatherqpd %ymm5, (%rax,%ymm1,8), %ymm6 - -/* lM = (*(longlong*)&dM)&(~lIndexMask) */ - vpandn %ymm3, %ymm4, %ymm3 - -/* 2^(j/2^k) * exp(r) */ - vmulpd %ymm0, %ymm6, %ymm0 - -/* lM = lM<<(52-K), 2^M */ - vpsllq $42, %ymm3, %ymm4 - -/* multiply by 2^M through integer add */ - vpaddq %ymm4, %ymm0, %ymm0 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovupd %ymm2, 320(%rsp) - vmovupd %ymm0, 384(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - vmovups %ymm8, 224(%rsp) - vmovups %ymm9, 192(%rsp) - vmovups %ymm10, 160(%rsp) - vmovups %ymm11, 128(%rsp) - vmovups %ymm12, 96(%rsp) - vmovups %ymm13, 64(%rsp) - vmovups %ymm14, 32(%rsp) - vmovups %ymm15, (%rsp) - movq %rsi, 264(%rsp) - movq %rdi, 256(%rsp) - movq %r12, 296(%rsp) - cfi_offset_rel_rsp (12, 296) - movb %dl, %r12b - movq %r13, 288(%rsp) - cfi_offset_rel_rsp (13, 288) - movl %ecx, %r13d - movq %r14, 280(%rsp) - cfi_offset_rel_rsp (14, 280) - movl %eax, %r14d - movq %r15, 272(%rsp) - cfi_offset_rel_rsp (15, 272) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - vmovups 224(%rsp), %ymm8 - vmovups 192(%rsp), %ymm9 - vmovups 160(%rsp), 
%ymm10 - vmovups 128(%rsp), %ymm11 - vmovups 96(%rsp), %ymm12 - vmovups 64(%rsp), %ymm13 - vmovups 32(%rsp), %ymm14 - vmovups (%rsp), %ymm15 - vmovupd 384(%rsp), %ymm0 - movq 264(%rsp), %rsi - movq 256(%rsp), %rdi - movq 296(%rsp), %r12 - cfi_restore (%r12) - movq 288(%rsp), %r13 - cfi_restore (%r13) - movq 280(%rsp), %r14 - cfi_restore (%r14) - movq 272(%rsp), %r15 - cfi_restore (%r15) - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 328(%rsp,%r15), %xmm0 - vzeroupper - - call JUMPTARGET(__exp_finite) - - vmovsd %xmm0, 392(%rsp,%r15) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 320(%rsp,%r15), %xmm0 - vzeroupper - - call JUMPTARGET(__exp_finite) - - vmovsd %xmm0, 384(%rsp,%r15) - jmp .LBL_1_7 - -END (_ZGVdN4v_exp_avx2) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S deleted file mode 100644 index 6189080fcc..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized exp. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN8v_exp) - .type _ZGVeN8v_exp, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN8v_exp_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN8v_exp_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN8v_exp_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN8v_exp) - -#define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper -#include "../svml_d_exp8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S deleted file mode 100644 index 97ba72c2a0..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S +++ /dev/null @@ -1,456 +0,0 @@ -/* Function exp vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include "svml_d_exp_data.h" -#include "svml_d_wrapper_impl.h" - - .text -ENTRY (_ZGVeN8v_exp_knl) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512 _ZGVdN4v_exp -#else -/* - ALGORITHM DESCRIPTION: - - Argument representation: - N = rint(X*2^k/ln2) = 2^k*M+j - X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r - then -ln2/2^(k+1) < r < ln2/2^(k+1) - Alternatively: - N = trunc(X*2^k/ln2) - then 0 < r < ln2/2^k - - Result calculation: - exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) - = 2^M * 2^(j/2^k) * exp(r) - 2^M is calculated by bit manipulation - 2^(j/2^k) is stored in table - exp(r) is approximated by polynomial - - The table lookup is skipped if k = 0. */ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1280, %rsp - movq __svml_dexp_data@GOTPCREL(%rip), %rax - -/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ - vmovaps %zmm0, %zmm8 - -/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ - vpsrlq $32, %zmm0, %zmm1 - -/* iAbsX = iAbsX&iAbsMask */ - movl $255, %edx - vpmovqd %zmm1, %ymm2 - kmovw %edx, %k2 - -/* iRangeMask = (iAbsX>iDomainRange) */ - movl $-1, %ecx - -/* table lookup for dT[j] = 2^(j/2^k) */ - vpxord %zmm11, %zmm11, %zmm11 - vmovups __dbInvLn2(%rax), %zmm5 - vmovups __dbLn2hi(%rax), %zmm7 - kxnorw %k3, %k3, %k3 - -/* dM = X*dbInvLn2+dbShifter, dbInvLn2 = 2^k/Ln2 */ - vfmadd213pd __dbShifter(%rax), %zmm0, %zmm5 - vmovups __dPC2(%rax), %zmm12 - -/* dN = dM-dbShifter, dN = rint(X*2^k/Ln2) */ - vsubpd __dbShifter(%rax), %zmm5, %zmm9 - vmovups __lIndexMask(%rax), %zmm4 - vfnmadd231pd %zmm9, %zmm7, %zmm8 - vpandd __iAbsMask(%rax), %zmm2, %zmm2{%k2} - -/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */ - vpandq %zmm4, %zmm5, %zmm10 - vgatherqpd (%rax,%zmm10,8), %zmm11{%k3} - vpcmpgtd __iDomainRange(%rax), %zmm2, %k1{%k2} - -/* lM = (*(longlong*)&dM)&(~lIndexMask) */ - vpandnq %zmm5, %zmm4, %zmm6 - 
vpbroadcastd %ecx, %zmm3{%k1}{z} - -/* lM = lM<<(52-K), 2^M */ - vpsllq $42, %zmm6, %zmm14 - -/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ - vfnmadd132pd __dbLn2lo(%rax), %zmm8, %zmm9 - -/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ - vptestmd %zmm3, %zmm3, %k0{%k2} - -/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ - vfmadd213pd __dPC1(%rax), %zmm9, %zmm12 - kmovw %k0, %ecx - movzbl %cl, %ecx - vfmadd213pd __dPC0(%rax), %zmm9, %zmm12 - vfmadd213pd __dPC0(%rax), %zmm9, %zmm12 - -/* 2^(j/2^k) * exp(r) */ - vmulpd %zmm12, %zmm11, %zmm13 - -/* multiply by 2^M through integer add */ - vpaddq %zmm14, %zmm13, %zmm1 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - vmovaps %zmm1, %zmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovups %zmm0, 1152(%rsp) - vmovups %zmm1, 1216(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - kmovw %k4, 1048(%rsp) - xorl %eax, %eax - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - 
-.LBL_1_8: - addb $1, %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - kmovw 1048(%rsp), %k4 - movq 1064(%rsp), %rsi - kmovw 1040(%rsp), %k5 - movq 1056(%rsp), %rdi - kmovw 1032(%rsp), %k6 - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - cfi_restore (%r13) - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - vmovups 1216(%rsp), %zmm1 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1160(%rsp,%r15), %xmm0 - call JUMPTARGET(__exp_finite) - vmovsd %xmm0, 1224(%rsp,%r15) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1152(%rsp,%r15), %xmm0 - call JUMPTARGET(__exp_finite) - vmovsd %xmm0, 1216(%rsp,%r15) - jmp .LBL_1_7 -#endif -END (_ZGVeN8v_exp_knl) - -ENTRY (_ZGVeN8v_exp_skx) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512 _ZGVdN4v_exp -#else -/* - ALGORITHM DESCRIPTION: - - Argument representation: - N = rint(X*2^k/ln2) = 2^k*M+j - X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r - then -ln2/2^(k+1) < r < ln2/2^(k+1) - Alternatively: - N = trunc(X*2^k/ln2) - then 0 < r < ln2/2^k - - Result calculation: - exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) - = 2^M * 2^(j/2^k) * exp(r) - 2^M is calculated by bit manipulation - 2^(j/2^k) is stored in table - exp(r) is approximated by polynomial - - The table lookup is skipped if k = 0. 
*/ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1280, %rsp - movq __svml_dexp_data@GOTPCREL(%rip), %rax - -/* table lookup for dT[j] = 2^(j/2^k) */ - kxnorw %k1, %k1, %k1 - -/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ - vpsrlq $32, %zmm0, %zmm1 - vmovups __dbInvLn2(%rax), %zmm7 - vmovups __dbShifter(%rax), %zmm5 - vmovups __lIndexMask(%rax), %zmm6 - vmovups __dbLn2hi(%rax), %zmm9 - vmovups __dPC0(%rax), %zmm12 - -/* dM = X*dbInvLn2+dbShifter, dbInvLn2 = 2^k/Ln2 */ - vfmadd213pd %zmm5, %zmm0, %zmm7 - vpmovqd %zmm1, %ymm2 - -/* dN = dM-dbShifter, dN = rint(X*2^k/Ln2) */ - vsubpd %zmm5, %zmm7, %zmm11 - -/* iAbsX = iAbsX&iAbsMask */ - vpand __iAbsMask(%rax), %ymm2, %ymm3 - -/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ - vmovaps %zmm0, %zmm10 - vfnmadd231pd %zmm11, %zmm9, %zmm10 - vmovups __dPC2(%rax), %zmm9 - -/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ - vfnmadd132pd __dbLn2lo(%rax), %zmm10, %zmm11 - -/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ - vfmadd213pd __dPC1(%rax), %zmm11, %zmm9 - vfmadd213pd %zmm12, %zmm11, %zmm9 - vfmadd213pd %zmm12, %zmm11, %zmm9 - -/* iRangeMask = (iAbsX>iDomainRange) */ - vpcmpgtd __iDomainRange(%rax), %ymm3, %ymm4 - -/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ - vmovmskps %ymm4, %ecx - -/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */ - vpandq %zmm6, %zmm7, %zmm13 - vpmovqd %zmm13, %ymm14 - vpxord %zmm15, %zmm15, %zmm15 - vgatherdpd (%rax,%ymm14,8), %zmm15{%k1} - -/* 2^(j/2^k) * exp(r) */ - vmulpd %zmm9, %zmm15, %zmm10 - -/* lM = (*(longlong*)&dM)&(~lIndexMask) */ - vpandnq %zmm7, %zmm6, %zmm8 - -/* lM = lM<<(52-K), 2^M */ - vpsllq $42, %zmm8, %zmm1 - -/* multiply by 2^M through integer add */ - vpaddq %zmm1, %zmm10, %zmm1 - testl %ecx, %ecx - jne .LBL_2_3 - -.LBL_2_2: - cfi_remember_state - vmovaps %zmm1, %zmm0 - movq %rbp, %rsp - 
cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_2_3: - cfi_restore_state - vmovups %zmm0, 1152(%rsp) - vmovups %zmm1, 1216(%rsp) - je .LBL_2_2 - - xorb %dl, %dl - xorl %eax, %eax - kmovw %k4, 1048(%rsp) - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - -.LBL_2_6: - btl %r14d, %r13d - jc .LBL_2_12 - -.LBL_2_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_2_10 - -.LBL_2_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_2_6 - - kmovw 1048(%rsp), %k4 - kmovw 1040(%rsp), %k5 - kmovw 1032(%rsp), %k6 - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - vmovups 1216(%rsp), %zmm1 - movq 1064(%rsp), %rsi - movq 1056(%rsp), %rdi - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 
- cfi_restore (%r13) - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - jmp .LBL_2_2 - -.LBL_2_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1160(%rsp,%r15), %xmm0 - vzeroupper - vmovsd 1160(%rsp,%r15), %xmm0 - call JUMPTARGET(__exp_finite) - vmovsd %xmm0, 1224(%rsp,%r15) - jmp .LBL_2_8 - -.LBL_2_12: - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1152(%rsp,%r15), %xmm0 - vzeroupper - vmovsd 1152(%rsp,%r15), %xmm0 - call JUMPTARGET(__exp_finite) - vmovsd %xmm0, 1216(%rsp,%r15) - jmp .LBL_2_7 - -#endif -END (_ZGVeN8v_exp_skx) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S deleted file mode 100644 index 5097add6b5..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized log. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN2v_log) - .type _ZGVbN2v_log, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN2v_log_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN2v_log_sse2(%rip), %rax - ret -END (_ZGVbN2v_log) -libmvec_hidden_def (_ZGVbN2v_log) - -#define _ZGVbN2v_log _ZGVbN2v_log_sse2 -#include "../svml_d_log2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S deleted file mode 100644 index 7d4b3c8850..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S +++ /dev/null @@ -1,229 +0,0 @@ -/* Function log vectorized with SSE4. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include "svml_d_log_data.h" - - .text -ENTRY (_ZGVbN2v_log_sse4) -/* - ALGORITHM DESCRIPTION: - - log(x) = -log(Rcp) + log(Rcp*x), - where Rcp ~ 1/x (accuracy ~9 bits, obtained by rounding - HW approximation to 1+9 mantissa bits) - - Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial - - log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp) - -log(mantissa_Rcp) is obtained from a lookup table, - accessed by a 9-bit index - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $320, %rsp - movaps %xmm0, %xmm6 - movq __svml_dlog_data@GOTPCREL(%rip), %r8 - movaps %xmm6, %xmm3 - movaps %xmm6, %xmm2 - -/* isolate exponent bits */ - movaps %xmm6, %xmm1 - psrlq $20, %xmm1 - movups _ExpMask(%r8), %xmm5 - -/* preserve mantissa, set input exponent to 2^(-10) */ - andps %xmm6, %xmm5 - orps _Two10(%r8), %xmm5 - -/* reciprocal approximation good to at least 11 bits */ - cvtpd2ps %xmm5, %xmm7 - cmpltpd _MinNorm(%r8), %xmm3 - cmpnlepd _MaxNorm(%r8), %xmm2 - movlhps %xmm7, %xmm7 - -/* combine and get argument value range mask */ - orps %xmm2, %xmm3 - rcpps %xmm7, %xmm0 - movmskpd %xmm3, %eax - movups _HalfMask(%r8), %xmm2 - -/* argument reduction started: R = Mantissa*Rcp - 1 */ - andps %xmm5, %xmm2 - cvtps2pd %xmm0, %xmm4 - subpd %xmm2, %xmm5 - -/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ - roundpd $0, %xmm4, %xmm4 - mulpd %xmm4, %xmm2 - mulpd %xmm4, %xmm5 - subpd _One(%r8), %xmm2 - addpd %xmm2, %xmm5 - movups _Threshold(%r8), %xmm2 - -/* calculate index for table lookup */ - movaps %xmm4, %xmm3 - cmpltpd %xmm4, %xmm2 - pshufd $221, %xmm1, %xmm7 - psrlq $40, %xmm3 - -/* convert biased exponent to DP format */ - cvtdq2pd %xmm7, %xmm0 - movd %xmm3, %edx - movups _poly_coeff_1(%r8), %xmm4 - -/* polynomial computation */ - mulpd %xmm5, %xmm4 - andps _Bias(%r8), %xmm2 - orps _Bias1(%r8), %xmm2 - -/* - Table stores 
-log(0.5*mantissa) for larger mantissas, - adjust exponent accordingly - */ - subpd %xmm2, %xmm0 - addpd _poly_coeff_2(%r8), %xmm4 - -/* exponent*log(2.0) */ - mulpd _L2(%r8), %xmm0 - movaps %xmm5, %xmm2 - mulpd %xmm5, %xmm2 - movups _poly_coeff_3(%r8), %xmm7 - mulpd %xmm5, %xmm7 - mulpd %xmm2, %xmm4 - addpd _poly_coeff_4(%r8), %xmm7 - addpd %xmm4, %xmm7 - mulpd %xmm7, %xmm2 - movslq %edx, %rdx - pextrd $2, %xmm3, %ecx - -/* - reconstruction: - (exponent*log(2)) + (LogRcp + (R+poly)) - */ - addpd %xmm2, %xmm5 - movslq %ecx, %rcx - movsd _LogRcp_lookup(%r8,%rdx), %xmm1 - movhpd _LogRcp_lookup(%r8,%rcx), %xmm1 - addpd %xmm5, %xmm1 - addpd %xmm1, %xmm0 - testl %eax, %eax - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - movups %xmm6, 192(%rsp) - movups %xmm0, 256(%rsp) - je .LBL_1_2 - - xorb %cl, %cl - xorl %edx, %edx - movups %xmm8, 112(%rsp) - movups %xmm9, 96(%rsp) - movups %xmm10, 80(%rsp) - movups %xmm11, 64(%rsp) - movups %xmm12, 48(%rsp) - movups %xmm13, 32(%rsp) - movups %xmm14, 16(%rsp) - movups %xmm15, (%rsp) - movq %rsi, 136(%rsp) - movq %rdi, 128(%rsp) - movq %r12, 168(%rsp) - cfi_offset_rel_rsp (12, 168) - movb %cl, %r12b - movq %r13, 160(%rsp) - cfi_offset_rel_rsp (13, 160) - movl %eax, %r13d - movq %r14, 152(%rsp) - cfi_offset_rel_rsp (14, 152) - movl %edx, %r14d - movq %r15, 144(%rsp) - cfi_offset_rel_rsp (15, 144) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - movups 112(%rsp), %xmm8 - movups 96(%rsp), %xmm9 - movups 80(%rsp), %xmm10 - movups 64(%rsp), %xmm11 - movups 48(%rsp), %xmm12 - movups 32(%rsp), %xmm13 - movups 16(%rsp), %xmm14 - movups (%rsp), %xmm15 - movq 136(%rsp), %rsi - movq 128(%rsp), %rdi - movq 168(%rsp), %r12 - cfi_restore 
(%r12) - movq 160(%rsp), %r13 - cfi_restore (%r13) - movq 152(%rsp), %r14 - cfi_restore (%r14) - movq 144(%rsp), %r15 - cfi_restore (%r15) - movups 256(%rsp), %xmm0 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - movsd 200(%rsp,%r15), %xmm0 - - call JUMPTARGET(__log_finite) - - movsd %xmm0, 264(%rsp,%r15) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - shlq $4, %r15 - movsd 192(%rsp,%r15), %xmm0 - - call JUMPTARGET(__log_finite) - - movsd %xmm0, 256(%rsp,%r15) - jmp .LBL_1_7 - -END (_ZGVbN2v_log_sse4) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S deleted file mode 100644 index 1e9a2f48a1..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized log. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN4v_log) - .type _ZGVdN4v_log, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN4v_log_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN4v_log_sse_wrapper(%rip), %rax - ret -END (_ZGVdN4v_log) -libmvec_hidden_def (_ZGVdN4v_log) - -#define _ZGVdN4v_log _ZGVdN4v_log_sse_wrapper -#include "../svml_d_log4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S deleted file mode 100644 index 04ea9e0071..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S +++ /dev/null @@ -1,210 +0,0 @@ -/* Function log vectorized with AVX2. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include "svml_d_log_data.h" - - .text -ENTRY (_ZGVdN4v_log_avx2) -/* ALGORITHM DESCRIPTION: - - log(x) = -log(Rcp) + log(Rcp*x), - where Rcp ~ 1/x (accuracy ~9 bits, obtained by rounding - HW approximation to 1+9 mantissa bits) - - Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial - - log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp) - -log(mantissa_Rcp) is obtained from a lookup table, - accessed by a 9-bit index - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $448, %rsp - movq __svml_dlog_data@GOTPCREL(%rip), %rax - vmovdqa %ymm0, %ymm5 - -/* isolate exponent bits */ - vpsrlq $20, %ymm5, %ymm0 - -/* preserve mantissa, set input exponent to 2^(-10) */ - vandpd _ExpMask(%rax), %ymm5, %ymm6 - vorpd _Two10(%rax), %ymm6, %ymm4 - -/* reciprocal approximation good to at least 11 bits */ - vcvtpd2ps %ymm4, %xmm7 - vrcpps %xmm7, %xmm1 - vcmplt_oqpd _MinNorm(%rax), %ymm5, %ymm7 - vcvtps2pd %xmm1, %ymm3 - vcmpnle_uqpd _MaxNorm(%rax), %ymm5, %ymm1 - vextracti128 $1, %ymm0, %xmm2 - vshufps $221, %xmm2, %xmm0, %xmm6 - -/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ - vroundpd $0, %ymm3, %ymm2 - -/* convert biased exponent to DP format */ - vcvtdq2pd %xmm6, %ymm0 - -/* combine and get argument value range mask */ - vorpd %ymm1, %ymm7, %ymm3 - vmovupd _One(%rax), %ymm1 - vmovmskpd %ymm3, %ecx - -/* calculate index for table lookup */ - vpsrlq $40, %ymm2, %ymm3 - -/* argument reduction started: R = Mantissa*Rcp - 1 */ - vfmsub213pd %ymm1, %ymm2, %ymm4 - vcmpgt_oqpd _Threshold(%rax), %ymm2, %ymm2 - vpcmpeqd %ymm6, %ymm6, %ymm6 - vxorpd %ymm1, %ymm1, %ymm1 - vgatherqpd %ymm6, _LogRcp_lookup(%rax,%ymm3), %ymm1 - -/* exponent*log(2.0) */ - vmovupd _poly_coeff_1(%rax), %ymm6 - vmulpd %ymm4, %ymm4, %ymm3 - -/* polynomial computation */ - vfmadd213pd _poly_coeff_2(%rax), %ymm4, %ymm6 - vandpd _Bias(%rax), %ymm2, 
%ymm7 - vorpd _Bias1(%rax), %ymm7, %ymm2 - -/* - Table stores -log(0.5*mantissa) for larger mantissas, - adjust exponent accordingly - */ - vsubpd %ymm2, %ymm0, %ymm0 - vmovupd _poly_coeff_3(%rax), %ymm2 - vfmadd213pd _poly_coeff_4(%rax), %ymm4, %ymm2 - vfmadd213pd %ymm2, %ymm3, %ymm6 - -/* - reconstruction: - (exponent*log(2)) + (LogRcp + (R+poly)) - */ - vfmadd213pd %ymm4, %ymm3, %ymm6 - vaddpd %ymm1, %ymm6, %ymm4 - vfmadd132pd _L2(%rax), %ymm4, %ymm0 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovupd %ymm5, 320(%rsp) - vmovupd %ymm0, 384(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - vmovups %ymm8, 224(%rsp) - vmovups %ymm9, 192(%rsp) - vmovups %ymm10, 160(%rsp) - vmovups %ymm11, 128(%rsp) - vmovups %ymm12, 96(%rsp) - vmovups %ymm13, 64(%rsp) - vmovups %ymm14, 32(%rsp) - vmovups %ymm15, (%rsp) - movq %rsi, 264(%rsp) - movq %rdi, 256(%rsp) - movq %r12, 296(%rsp) - cfi_offset_rel_rsp (12, 296) - movb %dl, %r12b - movq %r13, 288(%rsp) - cfi_offset_rel_rsp (13, 288) - movl %ecx, %r13d - movq %r14, 280(%rsp) - cfi_offset_rel_rsp (14, 280) - movl %eax, %r14d - movq %r15, 272(%rsp) - cfi_offset_rel_rsp (15, 272) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - vmovups 224(%rsp), %ymm8 - vmovups 192(%rsp), %ymm9 - vmovups 160(%rsp), %ymm10 - vmovups 128(%rsp), %ymm11 - vmovups 96(%rsp), %ymm12 - vmovups 64(%rsp), %ymm13 - vmovups 32(%rsp), %ymm14 - vmovups (%rsp), %ymm15 - vmovupd 384(%rsp), %ymm0 - movq 264(%rsp), %rsi - movq 256(%rsp), %rdi - movq 296(%rsp), %r12 - cfi_restore (%r12) - movq 288(%rsp), %r13 - cfi_restore (%r13) - movq 280(%rsp), %r14 - cfi_restore (%r14) - movq 272(%rsp), %r15 - cfi_restore (%r15) - jmp 
.LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 328(%rsp,%r15), %xmm0 - vzeroupper - - call JUMPTARGET(__log_finite) - - vmovsd %xmm0, 392(%rsp,%r15) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 320(%rsp,%r15), %xmm0 - vzeroupper - - call JUMPTARGET(__log_finite) - - vmovsd %xmm0, 384(%rsp,%r15) - jmp .LBL_1_7 - -END (_ZGVdN4v_log_avx2) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S deleted file mode 100644 index 43f572d36c..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized log. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN8v_log) - .type _ZGVeN8v_log, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN8v_log_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN8v_log_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN8v_log_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN8v_log) - -#define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper -#include "../svml_d_log8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S deleted file mode 100644 index d10d5114c6..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S +++ /dev/null @@ -1,468 +0,0 @@ -/* Function log vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include "svml_d_log_data.h" -#include "svml_d_wrapper_impl.h" - - .text -ENTRY (_ZGVeN8v_log_knl) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512 _ZGVdN4v_log -#else -/* - ALGORITHM DESCRIPTION: - - log(x) = -log(Rcp) + log(Rcp*x), - where Rcp ~ 1/x (accuracy ~9 bits, obtained by - rounding HW approximation to 1+9 mantissa bits) - - Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial - - log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp) - -log(mantissa_Rcp) is obtained from a lookup table, - accessed by a 9-bit index - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1280, %rsp - movq __svml_dlog_data@GOTPCREL(%rip), %rdx - movq $-1, %rax - -/* isolate exponent bits */ - vpsrlq $20, %zmm0, %zmm2 - vpsrlq $32, %zmm2, %zmm3 - vpxord %zmm2, %zmm2, %zmm2 - kxnorw %k3, %k3, %k3 - vmovups _Two10(%rdx), %zmm1 - vmovups _One(%rdx), %zmm9 - vpmovqd %zmm3, %ymm4 - -/* convert biased exponent to DP format */ - vcvtdq2pd %ymm4, %zmm13 - -/* preserve mantissa, set input exponent to 2^(-10) */ - vpternlogq $248, _ExpMask(%rdx), %zmm0, %zmm1 - vcmppd $17, _MinNorm(%rdx), %zmm0, %k1 - -/* reciprocal approximation good to at least 11 bits */ - vrcp28pd %zmm1, %zmm5 - vpbroadcastq %rax, %zmm6{%k1}{z} - vmovups _poly_coeff_3(%rdx), %zmm15 - vcmppd $22, _MaxNorm(%rdx), %zmm0, %k2 - vmovups _Bias1(%rdx), %zmm14 - -/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ - vrndscalepd $8, %zmm5, %zmm11 - vpbroadcastq %rax, %zmm7{%k2}{z} - -/* argument reduction started: R = Mantissa*Rcp - 1 */ - vfmsub213pd %zmm9, %zmm11, %zmm1 - -/* calculate index for table lookup */ - vpsrlq $40, %zmm11, %zmm10 - vgatherqpd _LogRcp_lookup(%rdx,%zmm10), %zmm2{%k3} - vcmppd $30, _Threshold(%rdx), %zmm11, %k1 - -/* combine and get argument value range mask */ - vporq %zmm7, %zmm6, %zmm8 - -/* exponent*log(2.0) */ - vmovups 
_poly_coeff_1(%rdx), %zmm11 - vmulpd %zmm1, %zmm1, %zmm10 - vptestmq %zmm8, %zmm8, %k0 - vfmadd213pd _poly_coeff_4(%rdx), %zmm1, %zmm15 - kmovw %k0, %ecx - -/* polynomial computation */ - vfmadd213pd _poly_coeff_2(%rdx), %zmm1, %zmm11 - movzbl %cl, %ecx - vpbroadcastq %rax, %zmm12{%k1}{z} - vfmadd213pd %zmm15, %zmm10, %zmm11 - vpternlogq $248, _Bias(%rdx), %zmm12, %zmm14 - -/* - Table stores -log(0.5*mantissa) for larger mantissas, - adjust exponent accordingly - */ - vsubpd %zmm14, %zmm13, %zmm3 - -/* - reconstruction: - (exponent*log(2)) + (LogRcp + (R+poly)) - */ - vfmadd213pd %zmm1, %zmm10, %zmm11 - vaddpd %zmm2, %zmm11, %zmm1 - vfmadd132pd _L2(%rdx), %zmm1, %zmm3 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - vmovaps %zmm3, %zmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovups %zmm0, 1152(%rsp) - vmovups %zmm3, 1216(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - kmovw %k4, 1048(%rsp) - xorl %eax, %eax - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl 
%esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - addb $1, %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - kmovw 1048(%rsp), %k4 - movq 1064(%rsp), %rsi - kmovw 1040(%rsp), %k5 - movq 1056(%rsp), %rdi - kmovw 1032(%rsp), %k6 - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - cfi_restore (%r13) - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - vmovups 1216(%rsp), %zmm3 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1160(%rsp,%r15), %xmm0 - call JUMPTARGET(__log_finite) - vmovsd %xmm0, 1224(%rsp,%r15) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1152(%rsp,%r15), %xmm0 - call JUMPTARGET(__log_finite) - vmovsd %xmm0, 1216(%rsp,%r15) - jmp .LBL_1_7 -#endif -END (_ZGVeN8v_log_knl) - -ENTRY (_ZGVeN8v_log_skx) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512 _ZGVdN4v_log -#else -/* - ALGORITHM DESCRIPTION: - - log(x) = -log(Rcp) + log(Rcp*x), - where Rcp ~ 1/x (accuracy ~9 bits, - obtained by rounding HW approximation to 1+9 mantissa bits) - - Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial - - log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp) - -log(mantissa_Rcp) is obtained from a lookup table, - accessed by a 9-bit index - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1280, %rsp - movq __svml_dlog_data@GOTPCREL(%rip), %rax - vmovaps %zmm0, 
%zmm3 - kxnorw %k3, %k3, %k3 - vmovups _Two10(%rax), %zmm2 - vmovups _Threshold(%rax), %zmm14 - vmovups _One(%rax), %zmm11 - vcmppd $21, _MinNorm(%rax), %zmm3, %k1 - vcmppd $18, _MaxNorm(%rax), %zmm3, %k2 - -/* isolate exponent bits */ - vpsrlq $20, %zmm3, %zmm4 - -/* preserve mantissa, set input exponent to 2^(-10) */ - vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2 - vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1 - vpsrlq $32, %zmm4, %zmm6 - -/* reciprocal approximation good to at least 11 bits */ - vrcp14pd %zmm2, %zmm5 - -/* exponent*log(2.0) */ - vmovups _poly_coeff_1(%rax), %zmm4 - vpmovqd %zmm6, %ymm7 - -/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ - vrndscalepd $8, %zmm5, %zmm0 - -/* calculate index for table lookup */ - vpsrlq $40, %zmm0, %zmm12 - -/* argument reduction started: R = Mantissa*Rcp - 1 */ - vfmsub213pd %zmm11, %zmm0, %zmm2 - vpmovqd %zmm12, %ymm13 - -/* polynomial computation */ - vfmadd213pd _poly_coeff_2(%rax), %zmm2, %zmm4 - vmovaps %zmm1, %zmm8 - vmovaps %zmm1, %zmm9 - vpxord %zmm5, %zmm5, %zmm5 - vgatherdpd _LogRcp_lookup(%rax,%ymm13), %zmm5{%k3} - vmovups _Bias1(%rax), %zmm13 - vpandnq %zmm3, %zmm3, %zmm8{%k1} - vcmppd $21, %zmm0, %zmm14, %k1 - vpandnq %zmm14, %zmm14, %zmm1{%k1} - vmulpd %zmm2, %zmm2, %zmm14 - vpternlogq $248, _Bias(%rax), %zmm1, %zmm13 - vmovups _poly_coeff_3(%rax), %zmm1 - vfmadd213pd _poly_coeff_4(%rax), %zmm2, %zmm1 - vfmadd213pd %zmm1, %zmm14, %zmm4 - -/* - reconstruction: - (exponent*log(2)) + (LogRcp + (R+poly)) - */ - vfmadd213pd %zmm2, %zmm14, %zmm4 - vaddpd %zmm5, %zmm4, %zmm2 - vpandnq %zmm3, %zmm3, %zmm9{%k2} - -/* combine and get argument value range mask */ - vorpd %zmm9, %zmm8, %zmm10 - vcmppd $3, %zmm10, %zmm10, %k0 - kmovw %k0, %ecx - -/* convert biased exponent to DP format */ - vcvtdq2pd %ymm7, %zmm15 - -/* - Table stores -log(0.5*mantissa) for larger mantissas, - adjust exponent accordingly - */ - vsubpd %zmm13, %zmm15, %zmm0 - vfmadd132pd _L2(%rax), %zmm2, %zmm0 - testl 
%ecx, %ecx - jne .LBL_2_3 - -.LBL_2_2: - cfi_remember_state - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_2_3: - cfi_restore_state - vmovups %zmm3, 1152(%rsp) - vmovups %zmm0, 1216(%rsp) - je .LBL_2_2 - - xorb %dl, %dl - xorl %eax, %eax - kmovw %k4, 1048(%rsp) - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - -.LBL_2_6: - btl %r14d, %r13d - jc .LBL_2_12 - -.LBL_2_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_2_10 - -.LBL_2_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_2_6 - - kmovw 1048(%rsp), %k4 - kmovw 1040(%rsp), %k5 - kmovw 1032(%rsp), %k6 - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - vmovups 1216(%rsp), %zmm0 - movq 1064(%rsp), %rsi - movq 
1056(%rsp), %rdi - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - cfi_restore (%r13) - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - jmp .LBL_2_2 - -.LBL_2_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1160(%rsp,%r15), %xmm0 - vzeroupper - vmovsd 1160(%rsp,%r15), %xmm0 - - call JUMPTARGET(__log_finite) - - vmovsd %xmm0, 1224(%rsp,%r15) - jmp .LBL_2_8 - -.LBL_2_12: - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1152(%rsp,%r15), %xmm0 - vzeroupper - vmovsd 1152(%rsp,%r15), %xmm0 - - call JUMPTARGET(__log_finite) - - vmovsd %xmm0, 1216(%rsp,%r15) - jmp .LBL_2_7 -#endif -END (_ZGVeN8v_log_skx) - - .section .rodata, "a" -.L_2il0floatpacket.12: - .long 0xffffffff,0xffffffff - .type .L_2il0floatpacket.12,@object diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S deleted file mode 100644 index adb0872e56..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized pow. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN2vv_pow) - .type _ZGVbN2vv_pow, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN2vv_pow_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN2vv_pow_sse2(%rip), %rax - ret -END (_ZGVbN2vv_pow) -libmvec_hidden_def (_ZGVbN2vv_pow) - -#define _ZGVbN2vv_pow _ZGVbN2vv_pow_sse2 -#include "../svml_d_pow2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S deleted file mode 100644 index ad7c215ff0..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S +++ /dev/null @@ -1,432 +0,0 @@ -/* Function pow vectorized with SSE4. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_d_pow_data.h" - - .text -ENTRY (_ZGVbN2vv_pow_sse4) -/* - ALGORITHM DESCRIPTION: - - 1) Calculating log2|x| - Here we use the following formula. - Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. - Let C ~= 1/ln(2), - Rcp1 ~= 1/X1, X2=Rcp1*X1, - Rcp2 ~= 1/X2, X3=Rcp2*X2, - Rcp3 ~= 1/X3, Rcp3C ~= C/X3. - Then - log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + - log2(X1*Rcp1*Rcp2*Rcp3C/C), - where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. 
- - The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), - Rcp3C, log2(C/Rcp3C) are taken from tables. - Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C - is exactly represented in target precision. - - log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = - = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = - = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = - = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., - where cq = X1*Rcp1*Rcp2*Rcp3C-C, - a1=1/(C*ln(2))-1 is small, - a2=1/(2*C^2*ln2), - a3=1/(3*C^3*ln2), - ... - We get 3 parts of log2 result: HH+HL+HLL ~= log2|x|. - - 2) Calculation of y*(HH+HL+HLL). - Split y into YHi+YLo. - Get high PH and medium PL parts of y*log2|x|. - Get low PLL part of y*log2|x|. - Now we have PH+PL+PLL ~= y*log2|x|. - - 3) Calculation of 2^(PH+PL+PLL). - Mathematical idea of computing 2^(PH+PL+PLL) is the following. - Let's represent PH+PL+PLL in the form N + j/2^expK + Z, - where expK=7 in this implementation, N and j are integers, - 0<=j<=2^expK-1, |Z|<2^(-expK-1). - Hence 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, - where 2^(j/2^expK) is stored in a table, and - 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. - - We compute 2^(PH+PL+PLL) as follows. - Break PH into PHH + PHL, where PHH = N + j/2^expK. - Z = PHL + PL + PLL - Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 - Get 2^(j/2^expK) from table in the form THI+TLO. - Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). - - Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: - ResHi := THI - ResLo := THI * Exp2Poly + TLO - - Get exponent ERes of the result: - Res := ResHi + ResLo: - Result := ex(Res) + N. 
*/ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $256, %rsp - movq __svml_dpow_data@GOTPCREL(%rip), %rdx - movups %xmm14, 80(%rsp) - movups %xmm9, 176(%rsp) - movaps %xmm1, %xmm9 - pshufd $221, %xmm0, %xmm1 - movq _iIndexMask(%rdx), %xmm14 - movq _iIndexAdd(%rdx), %xmm6 - -/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */ - pand %xmm1, %xmm14 - paddd %xmm6, %xmm14 - psrld $10, %xmm14 - movups %xmm13, 96(%rsp) - -/* Index for reciprocal table */ - movdqa %xmm14, %xmm13 - pslld $3, %xmm13 - -/* Index for log2 table */ - pslld $4, %xmm14 - movd %xmm13, %eax - movups %xmm10, 160(%rsp) - movups _iMantissaMask(%rdx), %xmm10 - movslq %eax, %rax - -/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */ - andps %xmm0, %xmm10 - pextrd $1, %xmm13, %ecx - movslq %ecx, %rcx - movups %xmm0, (%rsp) - movdqa %xmm1, %xmm0 - -/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */ - movq _i3fe7fe00(%rdx), %xmm6 - psubd %xmm6, %xmm0 - movups _iHighMask(%rdx), %xmm6 - psrad $20, %xmm0 - movups %xmm15, 48(%rsp) - movups %xmm12, 112(%rsp) - orps _dbOne(%rdx), %xmm10 - movsd 11712(%rdx,%rax), %xmm12 - movd %xmm14, %r8d - movq _i2p20_2p19(%rdx), %xmm15 - movhpd 11712(%rdx,%rcx), %xmm12 - paddd %xmm15, %xmm0 - pextrd $1, %xmm14, %r9d - -/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */ - movaps %xmm6, %xmm14 - andps %xmm10, %xmm14 - movaps %xmm10, %xmm15 - subpd %xmm14, %xmm15 - -/* r1 = x1*rcp1 */ - mulpd %xmm12, %xmm10 - -/* E = -r1+__fence(x1Hi*rcp1) */ - mulpd %xmm12, %xmm14 - -/* E=E+x1Lo*rcp1 */ - mulpd %xmm15, %xmm12 - subpd %xmm10, %xmm14 - pshufd $80, %xmm0, %xmm0 - movslq %r8d, %r8 - andps _iffffffff00000000(%rdx), %xmm0 - subpd _db2p20_2p19(%rdx), %xmm0 - addpd %xmm12, %xmm14 - movslq %r9d, %r9 - -/* T_Rh_Eh = T_Rh + E */ - movaps %xmm14, %xmm15 - movups %xmm8, 208(%rsp) - movups 19968(%rdx,%r8), %xmm8 - movups %xmm11, 144(%rsp) - movaps %xmm8, %xmm11 - -/* cq 
= c+r1 */ - movups _LHN(%rdx), %xmm13 - movhpd 19968(%rdx,%r9), %xmm11 - addpd %xmm10, %xmm13 - -/* T = k + L1hi */ - addpd %xmm0, %xmm11 - -/* T_Rh = T + cq */ - movaps %xmm13, %xmm12 - addpd %xmm11, %xmm12 - addpd %xmm12, %xmm15 - -/* Rl = T-T_Rh; -> -Rh */ - subpd %xmm12, %xmm11 - -/* HLL = T_Rh - T_Rh_Eh; -> -Eh */ - subpd %xmm15, %xmm12 - -/* Rl=Rl+cq; */ - addpd %xmm13, %xmm11 - -/* cq = cq + E */ - addpd %xmm14, %xmm13 - -/* HLL+=E; -> El */ - addpd %xmm14, %xmm12 - -/* HLL+=Rl */ - addpd %xmm12, %xmm11 - unpckhpd 19968(%rdx,%r9), %xmm8 - -/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */ - movaps %xmm15, %xmm14 - -/* HLL+=L1lo; */ - addpd %xmm11, %xmm8 - movups _clv_2(%rdx), %xmm11 - -/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */ - movaps %xmm6, %xmm12 - -/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */ - mulpd %xmm13, %xmm11 - addpd _clv_3(%rdx), %xmm11 - mulpd %xmm13, %xmm11 - addpd _clv_4(%rdx), %xmm11 - mulpd %xmm13, %xmm11 - addpd _clv_5(%rdx), %xmm11 - mulpd %xmm13, %xmm11 - addpd _clv_6(%rdx), %xmm11 - mulpd %xmm13, %xmm11 - addpd _clv_7(%rdx), %xmm11 - mulpd %xmm11, %xmm13 - addpd %xmm13, %xmm8 - addpd %xmm8, %xmm14 - -/* - 2^(y*(HH+HL+HLL)) starts here: - yH = y; Lo(yH)&=0xf8000000 - */ - andps %xmm9, %xmm6 - -/* yL = y-yH; */ - movaps %xmm9, %xmm11 - subpd %xmm6, %xmm11 - andps %xmm14, %xmm12 - -/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */ - movaps %xmm14, %xmm10 - -/* HL = T_Rh_Eh_HLLhi-HH; */ - subpd %xmm12, %xmm14 - subpd %xmm15, %xmm10 - movq _HIDELTA(%rdx), %xmm2 - -/* pH = yH*HH; */ - movaps %xmm6, %xmm13 - movq _LORANGE(%rdx), %xmm3 - paddd %xmm2, %xmm1 - pcmpgtd %xmm1, %xmm3 - -/* pL=yL*HL+yH*HL; pL+=yL*HH; */ - movaps %xmm11, %xmm1 - mulpd %xmm14, %xmm1 - mulpd %xmm14, %xmm6 - mulpd %xmm12, %xmm13 - mulpd %xmm11, %xmm12 - addpd %xmm6, %xmm1 - -/* HLL = HLL - HLLhi */ - subpd %xmm10, %xmm8 - addpd %xmm12, %xmm1 - -/* pLL = y*HLL */ - mulpd %xmm9, %xmm8 - movups _db2p45_2p44(%rdx), %xmm11 - -/* pHH = pH + *(double*)&db2p45_2p44 */ - 
movaps %xmm11, %xmm12 - addpd %xmm13, %xmm12 - -/* t=pL+pLL; t+=pHL */ - addpd %xmm8, %xmm1 - movq _ABSMASK(%rdx), %xmm5 - pshufd $221, %xmm9, %xmm4 - pand %xmm5, %xmm4 - movq _INF(%rdx), %xmm7 - movdqa %xmm4, %xmm2 - pcmpgtd %xmm7, %xmm2 - pcmpeqd %xmm7, %xmm4 - pshufd $136, %xmm12, %xmm7 - por %xmm4, %xmm2 - -/* pHH = pHH - *(double*)&db2p45_2p44 */ - subpd %xmm11, %xmm12 - pshufd $221, %xmm13, %xmm10 - por %xmm2, %xmm3 - -/* pHL = pH - pHH; */ - subpd %xmm12, %xmm13 - pand %xmm5, %xmm10 - movq _DOMAINRANGE(%rdx), %xmm5 - movdqa %xmm10, %xmm4 - addpd %xmm1, %xmm13 - pcmpgtd %xmm5, %xmm4 - pcmpeqd %xmm5, %xmm10 - por %xmm10, %xmm4 - movq _jIndexMask(%rdx), %xmm6 - por %xmm4, %xmm3 - movmskps %xmm3, %eax - -/* j = Lo(pHH)&0x0000007f */ - pand %xmm7, %xmm6 - movq _iOne(%rdx), %xmm3 - -/* _n = Lo(pHH); - _n = _n & 0xffffff80; - _n = _n >> 7; - Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n - */ - pslld $13, %xmm7 - paddd %xmm3, %xmm7 - pslld $4, %xmm6 - movups _cev_1(%rdx), %xmm3 - movaps %xmm13, %xmm4 - mulpd %xmm13, %xmm3 - -/* T1 = ((double*)exp2_tbl)[ 2*j ] */ - movd %xmm6, %r10d - pshufd $80, %xmm7, %xmm0 - andps _ifff0000000000000(%rdx), %xmm0 - addpd _cev_2(%rdx), %xmm3 - mulpd %xmm13, %xmm3 - addpd _cev_3(%rdx), %xmm3 - mulpd %xmm13, %xmm3 - movslq %r10d, %r10 - andl $3, %eax - pextrd $1, %xmm6, %r11d - movslq %r11d, %r11 - addpd _cev_4(%rdx), %xmm3 - movsd 36416(%rdx,%r10), %xmm2 - movhpd 36416(%rdx,%r11), %xmm2 - mulpd %xmm2, %xmm0 - mulpd %xmm3, %xmm13 - mulpd %xmm0, %xmm4 - addpd _cev_5(%rdx), %xmm13 - mulpd %xmm4, %xmm13 - addpd %xmm13, %xmm0 - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movups 208(%rsp), %xmm8 - movups 176(%rsp), %xmm9 - movups 160(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 112(%rsp), %xmm12 - movups 96(%rsp), %xmm13 - movups 80(%rsp), %xmm14 - movups 48(%rsp), %xmm15 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - movups 
(%rsp), %xmm1 - movups %xmm1, 64(%rsp) - movups %xmm9, 128(%rsp) - movups %xmm0, 192(%rsp) - je .LBL_1_2 - - xorb %cl, %cl - xorl %edx, %edx - movq %rsi, 8(%rsp) - movq %rdi, (%rsp) - movq %r12, 40(%rsp) - cfi_offset_rel_rsp (12, 40) - movb %cl, %r12b - movq %r13, 32(%rsp) - cfi_offset_rel_rsp (13, 32) - movl %eax, %r13d - movq %r14, 24(%rsp) - cfi_offset_rel_rsp (14, 24) - movl %edx, %r14d - movq %r15, 16(%rsp) - cfi_offset_rel_rsp (15, 16) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - movq 8(%rsp), %rsi - movq (%rsp), %rdi - movq 40(%rsp), %r12 - cfi_restore (%r12) - movq 32(%rsp), %r13 - cfi_restore (%r13) - movq 24(%rsp), %r14 - cfi_restore (%r14) - movq 16(%rsp), %r15 - cfi_restore (%r15) - movups 192(%rsp), %xmm0 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - movsd 72(%rsp,%r15), %xmm0 - movsd 136(%rsp,%r15), %xmm1 - - call JUMPTARGET(__pow_finite) - - movsd %xmm0, 200(%rsp,%r15) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - shlq $4, %r15 - movsd 64(%rsp,%r15), %xmm0 - movsd 128(%rsp,%r15), %xmm1 - - call JUMPTARGET(__pow_finite) - - movsd %xmm0, 192(%rsp,%r15) - jmp .LBL_1_7 - -END (_ZGVbN2vv_pow_sse4) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S deleted file mode 100644 index eea8af6638..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized pow. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN4vv_pow) - .type _ZGVdN4vv_pow, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN4vv_pow_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN4vv_pow_sse_wrapper(%rip), %rax - ret -END (_ZGVdN4vv_pow) -libmvec_hidden_def (_ZGVdN4vv_pow) - -#define _ZGVdN4vv_pow _ZGVdN4vv_pow_sse_wrapper -#include "../svml_d_pow4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S deleted file mode 100644 index 3092328909..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S +++ /dev/null @@ -1,387 +0,0 @@ -/* Function pow vectorized with AVX2. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include "svml_d_pow_data.h" - - .text -ENTRY (_ZGVdN4vv_pow_avx2) -/* - ALGORITHM DESCRIPTION: - - 1) Calculating log2|x| - Here we use the following formula. - Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. - Let C ~= 1/ln(2), - Rcp1 ~= 1/X1, X2=Rcp1*X1, - Rcp2 ~= 1/X2, X3=Rcp2*X2, - Rcp3 ~= 1/X3, Rcp3C ~= C/X3. - Then - log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + - log2(X1*Rcp1*Rcp2*Rcp3C/C), - where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. - - The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), - Rcp3C, log2(C/Rcp3C) are taken from tables. - Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C - is exactly represented in target precision. - - log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = - = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = - = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = - = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., - where cq = X1*Rcp1*Rcp2*Rcp3C-C, - a1=1/(C*ln(2))-1 is small, - a2=1/(2*C^2*ln2), - a3=1/(3*C^3*ln2), - ... - We get 3 parts of log2 result: HH+HL+HLL ~= log2|x|. - - 2) Calculation of y*(HH+HL+HLL). - Split y into YHi+YLo. - Get high PH and medium PL parts of y*log2|x|. - Get low PLL part of y*log2|x|. - Now we have PH+PL+PLL ~= y*log2|x|. - - 3) Calculation of 2^(PH+PL+PLL). - Mathematical idea of computing 2^(PH+PL+PLL) is the following. - Let's represent PH+PL+PLL in the form N + j/2^expK + Z, - where expK=7 in this implementation, N and j are integers, - 0<=j<=2^expK-1, |Z|<2^(-expK-1). - Hence 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, - where 2^(j/2^expK) is stored in a table, and - 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. - - We compute 2^(PH+PL+PLL) as follows. - Break PH into PHH + PHL, where PHH = N + j/2^expK. - Z = PHL + PL + PLL - Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 - Get 2^(j/2^expK) from table in the form THI+TLO. - Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). 
- - Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: - ResHi := THI - ResLo := THI * Exp2Poly + TLO - - Get exponent ERes of the result: - Res := ResHi + ResLo: - Result := ex(Res) + N. */ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $448, %rsp - movq __svml_dpow_data@GOTPCREL(%rip), %rax - vmovups %ymm11, 160(%rsp) - vmovups %ymm8, 224(%rsp) - vmovups %ymm10, 352(%rsp) - vmovups %ymm9, 384(%rsp) - vmovups %ymm13, 288(%rsp) - vmovapd %ymm1, %ymm11 - vxorpd %ymm1, %ymm1, %ymm1 - vextracti128 $1, %ymm0, %xmm5 - vshufps $221, %xmm5, %xmm0, %xmm5 - -/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */ - vandps _iIndexMask(%rax), %xmm5, %xmm3 - vpaddd _iIndexAdd(%rax), %xmm3, %xmm6 - vpsrld $10, %xmm6, %xmm8 - -/* Index for reciprocal table */ - vpslld $3, %xmm8, %xmm9 - -/* Index for log2 table */ - vpslld $4, %xmm8, %xmm6 - -/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */ - vandpd _iMantissaMask(%rax), %ymm0, %ymm4 - vorpd _dbOne(%rax), %ymm4, %ymm13 - vpcmpeqd %ymm4, %ymm4, %ymm4 - vpcmpeqd %ymm8, %ymm8, %ymm8 - -/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */ - vpsubd _i3fe7fe00(%rax), %xmm5, %xmm3 - vpaddd _HIDELTA(%rax), %xmm5, %xmm5 - vextracti128 $1, %ymm11, %xmm7 - vshufps $221, %xmm7, %xmm11, %xmm2 - vpand _ABSMASK(%rax), %xmm2, %xmm10 - vpcmpeqd %ymm2, %ymm2, %ymm2 - vgatherdpd %ymm2, 11712(%rax,%xmm9), %ymm1 - vmovups _LORANGE(%rax), %xmm7 - vxorpd %ymm2, %ymm2, %ymm2 - vgatherdpd %ymm4, 19968(%rax,%xmm6), %ymm2 - vxorpd %ymm4, %ymm4, %ymm4 - vgatherdpd %ymm8, 19976(%rax,%xmm6), %ymm4 - vpsrad $20, %xmm3, %xmm6 - vpaddd _i2p20_2p19(%rax), %xmm6, %xmm9 - vpshufd $80, %xmm9, %xmm8 - vpshufd $250, %xmm9, %xmm3 - -/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */ - vandpd _iHighMask(%rax), %ymm13, %ymm9 - vinserti128 $1, %xmm3, %ymm8, %ymm6 - vandpd _iffffffff00000000(%rax), %ymm6, %ymm8 - -/* r1 = x1*rcp1 */ - vmulpd 
%ymm1, %ymm13, %ymm6 - vsubpd %ymm9, %ymm13, %ymm3 - vsubpd _db2p20_2p19(%rax), %ymm8, %ymm8 - -/* cq = c+r1 */ - vaddpd _LHN(%rax), %ymm6, %ymm13 - -/* E = -r1+__fence(x1Hi*rcp1) */ - vfmsub213pd %ymm6, %ymm1, %ymm9 - -/* E=E+x1Lo*rcp1 */ - vfmadd213pd %ymm9, %ymm1, %ymm3 - -/* T = k + L1hi */ - vaddpd %ymm2, %ymm8, %ymm1 - -/* T_Rh = T + cq */ - vaddpd %ymm13, %ymm1, %ymm8 - -/* Rl = T-T_Rh; -> -Rh */ - vsubpd %ymm8, %ymm1, %ymm6 - -/* Rl=Rl+cq */ - vaddpd %ymm6, %ymm13, %ymm1 - -/* T_Rh_Eh = T_Rh + E */ - vaddpd %ymm3, %ymm8, %ymm6 - -/* cq = cq + E */ - vaddpd %ymm3, %ymm13, %ymm13 - -/* HLL = T_Rh - T_Rh_Eh; -> -Eh */ - vsubpd %ymm6, %ymm8, %ymm9 - -/* HLL+=E; -> El */ - vaddpd %ymm9, %ymm3, %ymm2 - -/* HLL+=Rl */ - vaddpd %ymm1, %ymm2, %ymm8 - -/* HLL+=L1lo */ - vaddpd %ymm4, %ymm8, %ymm4 - vmovupd _clv_2(%rax), %ymm8 - -/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */ - vfmadd213pd _clv_3(%rax), %ymm13, %ymm8 - vfmadd213pd _clv_4(%rax), %ymm13, %ymm8 - vfmadd213pd _clv_5(%rax), %ymm13, %ymm8 - vfmadd213pd _clv_6(%rax), %ymm13, %ymm8 - vfmadd213pd _clv_7(%rax), %ymm13, %ymm8 - vfmadd213pd %ymm4, %ymm13, %ymm8 - -/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */ - vaddpd %ymm8, %ymm6, %ymm9 - -/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */ - vandpd _iHighMask(%rax), %ymm9, %ymm2 - -/* - 2^(y*(HH+HL+HLL)) starts here: - yH = y; Lo(yH)&=0xf8000000; - */ - vandpd _iHighMask(%rax), %ymm11, %ymm1 - -/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */ - vsubpd %ymm6, %ymm9, %ymm13 - -/* HL = T_Rh_Eh_HLLhi-HH */ - vsubpd %ymm2, %ymm9, %ymm4 - -/* pH = yH*HH */ - vmulpd %ymm2, %ymm1, %ymm9 - -/* HLL = HLL - HLLhi */ - vsubpd %ymm13, %ymm8, %ymm6 - -/* yL = y-yH */ - vsubpd %ymm1, %ymm11, %ymm8 - vextracti128 $1, %ymm9, %xmm3 - vshufps $221, %xmm3, %xmm9, %xmm13 - vpand _ABSMASK(%rax), %xmm13, %xmm3 - vpcmpgtd %xmm5, %xmm7, %xmm13 - vpcmpgtd _INF(%rax), %xmm10, %xmm7 - vpcmpeqd _INF(%rax), %xmm10, %xmm10 - vpor %xmm10, %xmm7, %xmm7 - vpor %xmm7, %xmm13, %xmm5 - -/* 
pL=yL*HL+yH*HL; pL+=yL*HH */ - vmulpd %ymm4, %ymm8, %ymm7 - vpcmpgtd _DOMAINRANGE(%rax), %xmm3, %xmm13 - vpcmpeqd _DOMAINRANGE(%rax), %xmm3, %xmm10 - vpor %xmm10, %xmm13, %xmm3 - vpor %xmm3, %xmm5, %xmm13 - vfmadd213pd %ymm7, %ymm4, %ymm1 - -/* pLL = y*HLL; - pHH = pH + *(double*)&db2p45_2p44 - */ - vaddpd _db2p45_2p44(%rax), %ymm9, %ymm7 - vmovmskps %xmm13, %ecx - vfmadd213pd %ymm1, %ymm2, %ymm8 - -/* t=pL+pLL; t+=pHL */ - vfmadd231pd %ymm11, %ymm6, %ymm8 - vextracti128 $1, %ymm7, %xmm1 - vshufps $136, %xmm1, %xmm7, %xmm10 - -/* _n = Lo(pHH); - _n = _n & 0xffffff80; - _n = _n >> 7; - Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n - */ - vpslld $13, %xmm10, %xmm2 - vpaddd _iOne(%rax), %xmm2, %xmm13 - vpshufd $80, %xmm13, %xmm4 - vpshufd $250, %xmm13, %xmm1 - -/* j = Lo(pHH)&0x0000007f */ - vandps _jIndexMask(%rax), %xmm10, %xmm3 - -/* T1 = ((double*)exp2_tbl)[ 2*j ] */ - vpcmpeqd %ymm10, %ymm10, %ymm10 - vpslld $4, %xmm3, %xmm5 - -/* pHH = pHH - *(double*)&db2p45_2p44 */ - vsubpd _db2p45_2p44(%rax), %ymm7, %ymm7 - -/* pHL = pH - pHH */ - vsubpd %ymm7, %ymm9, %ymm9 - vaddpd %ymm9, %ymm8, %ymm6 - vinserti128 $1, %xmm1, %ymm4, %ymm2 - vxorpd %ymm1, %ymm1, %ymm1 - vgatherdpd %ymm10, 36416(%rax,%xmm5), %ymm1 - vandpd _ifff0000000000000(%rax), %ymm2, %ymm13 - vmovupd _cev_1(%rax), %ymm2 - vmulpd %ymm1, %ymm13, %ymm1 - vfmadd213pd _cev_2(%rax), %ymm6, %ymm2 - vmulpd %ymm6, %ymm1, %ymm8 - vfmadd213pd _cev_3(%rax), %ymm6, %ymm2 - vfmadd213pd _cev_4(%rax), %ymm6, %ymm2 - vfmadd213pd _cev_5(%rax), %ymm6, %ymm2 - vfmadd213pd %ymm1, %ymm8, %ymm2 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - vmovups 224(%rsp), %ymm8 - vmovups 384(%rsp), %ymm9 - vmovups 352(%rsp), %ymm10 - vmovups 160(%rsp), %ymm11 - vmovups 288(%rsp), %ymm13 - vmovdqa %ymm2, %ymm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovupd %ymm0, 192(%rsp) - vmovupd %ymm11, 256(%rsp) - 
vmovupd %ymm2, 320(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - vmovups %ymm12, 64(%rsp) - vmovups %ymm14, 32(%rsp) - vmovups %ymm15, (%rsp) - movq %rsi, 104(%rsp) - movq %rdi, 96(%rsp) - movq %r12, 136(%rsp) - cfi_offset_rel_rsp (12, 136) - movb %dl, %r12b - movq %r13, 128(%rsp) - cfi_offset_rel_rsp (13, 128) - movl %ecx, %r13d - movq %r14, 120(%rsp) - cfi_offset_rel_rsp (14, 120) - movl %eax, %r14d - movq %r15, 112(%rsp) - cfi_offset_rel_rsp (15, 112) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - vmovups 64(%rsp), %ymm12 - vmovups 32(%rsp), %ymm14 - vmovups (%rsp), %ymm15 - vmovupd 320(%rsp), %ymm2 - movq 104(%rsp), %rsi - movq 96(%rsp), %rdi - movq 136(%rsp), %r12 - cfi_restore (%r12) - movq 128(%rsp), %r13 - cfi_restore (%r13) - movq 120(%rsp), %r14 - cfi_restore (%r14) - movq 112(%rsp), %r15 - cfi_restore (%r15) - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 200(%rsp,%r15), %xmm0 - vmovsd 264(%rsp,%r15), %xmm1 - vzeroupper - - call JUMPTARGET(__pow_finite) - - vmovsd %xmm0, 328(%rsp,%r15) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 192(%rsp,%r15), %xmm0 - vmovsd 256(%rsp,%r15), %xmm1 - vzeroupper - - call JUMPTARGET(__pow_finite) - - vmovsd %xmm0, 320(%rsp,%r15) - jmp .LBL_1_7 - -END (_ZGVdN4vv_pow_avx2) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S deleted file mode 100644 index 68f12b2848..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized pow. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN8vv_pow) - .type _ZGVeN8vv_pow, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN8vv_pow_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN8vv_pow_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN8vv_pow) - -#define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper -#include "../svml_d_pow8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S deleted file mode 100644 index 2190c1f6b4..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S +++ /dev/null @@ -1,741 +0,0 @@ -/* Function pow vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_d_pow_data.h" -#include "svml_d_wrapper_impl.h" - -/* ALGORITHM DESCRIPTION: - - 1) Calculating log2|x| - Here we use the following formula. - Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. - Let C ~= 1/ln(2), - Rcp1 ~= 1/X1, X2=Rcp1*X1, - Rcp2 ~= 1/X2, X3=Rcp2*X2, - Rcp3 ~= 1/X3, Rcp3C ~= C/X3. - Then - log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + - log2(X1*Rcp1*Rcp2*Rcp3C/C), - where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. - - The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), - Rcp3C, log2(C/Rcp3C) are taken from tables. - Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C - is exactly represented in target precision. - - log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = - = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = - = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = - = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., - where cq = X1*Rcp1*Rcp2*Rcp3C-C, - a1=1/(C*ln(2))-1 is small, - a2=1/(2*C^2*ln2), - a3=1/(3*C^3*ln2), - ... - We get 3 parts of log2 result: HH+HL+HLL ~= log2|x|. - - 2) Calculation of y*(HH+HL+HLL). - Split y into YHi+YLo. - Get high PH and medium PL parts of y*log2|x|. - Get low PLL part of y*log2|x|. - Now we have PH+PL+PLL ~= y*log2|x|. - - 3) Calculation of 2^(PH+PL+PLL). - Mathematical idea of computing 2^(PH+PL+PLL) is the following. - Let's represent PH+PL+PLL in the form N + j/2^expK + Z, - where expK=7 in this implementation, N and j are integers, - 0<=j<=2^expK-1, |Z|<2^(-expK-1). 
- Hence 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, - where 2^(j/2^expK) is stored in a table, and - 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. - - We compute 2^(PH+PL+PLL) as follows. - Break PH into PHH + PHL, where PHH = N + j/2^expK. - Z = PHL + PL + PLL - Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 - Get 2^(j/2^expK) from table in the form THI+TLO. - Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). - - Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: - ResHi := THI - ResLo := THI * Exp2Poly + TLO - - Get exponent ERes of the result: - Res := ResHi + ResLo: - Result := ex(Res) + N. */ - - .text -ENTRY (_ZGVeN8vv_pow_knl) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow -#else - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1344, %rsp - vpsrlq $32, %zmm0, %zmm13 - vmovaps %zmm1, %zmm12 - movq __svml_dpow_data@GOTPCREL(%rip), %rax - movl $255, %edx - vpmovqd %zmm13, %ymm10 - vpsrlq $32, %zmm12, %zmm14 - kmovw %edx, %k1 - movl $-1, %ecx - vpmovqd %zmm14, %ymm15 - -/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */ - vmovups _dbOne(%rax), %zmm6 - -/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */ - vmovaps %zmm10, %zmm5 - -/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */ - vpsubd _i3fe7fe00(%rax), %zmm10, %zmm14{%k1} - vpandd _iIndexMask(%rax), %zmm10, %zmm5{%k1} - vpsrad $20, %zmm14, %zmm14{%k1} - vpxord %zmm9, %zmm9, %zmm9 - vpaddd _HIDELTA(%rax), %zmm10, %zmm3{%k1} - vpaddd _iIndexAdd(%rax), %zmm5, %zmm5{%k1} - vpxord %zmm7, %zmm7, %zmm7 - vpaddd _i2p20_2p19(%rax), %zmm14, %zmm14{%k1} - vpcmpd $1, _LORANGE(%rax), %zmm3, %k2{%k1} - vpsrld $10, %zmm5, %zmm5{%k1} - vpandd _ABSMASK(%rax), %zmm15, %zmm2{%k1} - vpbroadcastd %ecx, %zmm1{%k2}{z} - -/* Index for reciprocal table */ - vpslld $3, %zmm5, %zmm8{%k1} - kxnorw %k2, %k2, %k2 - vgatherdpd 11712(%rax,%ymm8), %zmm9{%k2} - vpmovzxdq %ymm14, %zmm10 - -/* 
Index for log2 table */ - vpslld $4, %zmm5, %zmm13{%k1} - kxnorw %k2, %k2, %k2 - vpsllq $32, %zmm10, %zmm3 - vpxord %zmm8, %zmm8, %zmm8 - vpcmpd $5, _INF(%rax), %zmm2, %k3{%k1} - vpbroadcastd %ecx, %zmm4{%k3}{z} - vpternlogq $248, _iMantissaMask(%rax), %zmm0, %zmm6 - kxnorw %k3, %k3, %k3 - vpternlogq $168, _iffffffff00000000(%rax), %zmm10, %zmm3 - -/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */ - vpandq _iHighMask(%rax), %zmm6, %zmm2 - vgatherdpd 19976(%rax,%ymm13), %zmm8{%k2} - vpord %zmm4, %zmm1, %zmm11{%k1} - vsubpd _db2p20_2p19(%rax), %zmm3, %zmm1 - vsubpd %zmm2, %zmm6, %zmm5 - -/* r1 = x1*rcp1 */ - vmulpd %zmm9, %zmm6, %zmm6 - vgatherdpd 19968(%rax,%ymm13), %zmm7{%k3} - -/* cq = c+r1 */ - vaddpd _LHN(%rax), %zmm6, %zmm4 - -/* E = -r1+__fence(x1Hi*rcp1) */ - vfmsub213pd %zmm6, %zmm9, %zmm2 - -/* T = k + L1hi */ - vaddpd %zmm7, %zmm1, %zmm7 - -/* E=E+x1Lo*rcp1 */ - vfmadd213pd %zmm2, %zmm9, %zmm5 - -/* T_Rh = T + cq */ - vaddpd %zmm4, %zmm7, %zmm3 - -/* Rl = T-T_Rh; -> -Rh */ - vsubpd %zmm3, %zmm7, %zmm9 - -/* Rl=Rl+cq */ - vaddpd %zmm9, %zmm4, %zmm6 - -/* T_Rh_Eh = T_Rh + E */ - vaddpd %zmm5, %zmm3, %zmm9 - -/* HLL = T_Rh - T_Rh_Eh; -> -Eh */ - vsubpd %zmm9, %zmm3, %zmm2 - -/* cq = cq + E; */ - vaddpd %zmm5, %zmm4, %zmm4 - -/* HLL+=E; -> El */ - vaddpd %zmm2, %zmm5, %zmm1 - vmovups _clv_2(%rax), %zmm5 - -/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */ - vfmadd213pd _clv_3(%rax), %zmm4, %zmm5 - -/* HLL+=Rl */ - vaddpd %zmm6, %zmm1, %zmm7 - -/* 2^(y*(HH+HL+HLL)) starts here: - yH = y; Lo(yH)&=0xf8000000 - */ - vpandq _iHighMask(%rax), %zmm12, %zmm6 - -/* yL = y-yH */ - vsubpd %zmm6, %zmm12, %zmm2 - vfmadd213pd _clv_4(%rax), %zmm4, %zmm5 - -/* HLL+=L1lo */ - vaddpd %zmm8, %zmm7, %zmm8 - vfmadd213pd _clv_5(%rax), %zmm4, %zmm5 - vfmadd213pd _clv_6(%rax), %zmm4, %zmm5 - vfmadd213pd _clv_7(%rax), %zmm4, %zmm5 - vfmadd213pd %zmm8, %zmm4, %zmm5 - -/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */ - vaddpd %zmm5, %zmm9, %zmm13 - -/* HLLhi = T_Rh_Eh_HLLhi 
- T_Rh_Eh */ - vsubpd %zmm9, %zmm13, %zmm10 - -/* HLL = HLL - HLLhi */ - vsubpd %zmm10, %zmm5, %zmm3 - -/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */ - vpandq _iHighMask(%rax), %zmm13, %zmm5 - -/* pH = yH*HH */ - vmulpd %zmm5, %zmm6, %zmm1 - -/* HL = T_Rh_Eh_HLLhi-HH */ - vsubpd %zmm5, %zmm13, %zmm4 - vpsrlq $32, %zmm1, %zmm14 - -/* pLL = y*HLL; - pHH = pH + *(double*)&db2p45_2p44 - */ - vaddpd _db2p45_2p44(%rax), %zmm1, %zmm10 - vpmovqd %zmm14, %ymm15 - vpandd _ABSMASK(%rax), %zmm15, %zmm14{%k1} - vpcmpd $5, _DOMAINRANGE(%rax), %zmm14, %k3{%k1} - -/* T1 = ((double*)exp2_tbl)[ 2*j ] */ - vpxord %zmm14, %zmm14, %zmm14 - vpbroadcastd %ecx, %zmm13{%k3}{z} - vpord %zmm13, %zmm11, %zmm11{%k1} - vptestmd %zmm11, %zmm11, %k0{%k1} - -/* pL=yL*HL+yH*HL; pL+=yL*HH */ - vmulpd %zmm4, %zmm2, %zmm11 - kmovw %k0, %ecx - vfmadd213pd %zmm11, %zmm4, %zmm6 - -/* pHH = pHH - *(double*)&db2p45_2p44 */ - vsubpd _db2p45_2p44(%rax), %zmm10, %zmm11 - vpmovqd %zmm10, %ymm4 - movzbl %cl, %ecx - -/* _n = Lo(pHH); - _n = _n & 0xffffff80; - _n = _n >> 7; - Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n - */ - vpslld $13, %zmm4, %zmm7{%k1} - -/* j = Lo(pHH)&0x0000007f */ - vpandd _jIndexMask(%rax), %zmm4, %zmm9{%k1} - vfmadd213pd %zmm6, %zmm5, %zmm2 - -/* pHL = pH - pHH */ - vsubpd %zmm11, %zmm1, %zmm1 - vpaddd _iOne(%rax), %zmm7, %zmm7{%k1} - -/* t=pL+pLL; t+=pHL */ - vfmadd231pd %zmm12, %zmm3, %zmm2 - vpslld $4, %zmm9, %zmm9{%k1} - kxnorw %k1, %k1, %k1 - vgatherdpd 36416(%rax,%ymm9), %zmm14{%k1} - vpmovzxdq %ymm7, %zmm8 - vaddpd %zmm1, %zmm2, %zmm2 - vmovups _cev_1(%rax), %zmm1 - vpsllq $32, %zmm8, %zmm13 - vpternlogq $168, _ifff0000000000000(%rax), %zmm8, %zmm13 - vfmadd213pd _cev_2(%rax), %zmm2, %zmm1 - vmulpd %zmm14, %zmm13, %zmm15 - vfmadd213pd _cev_3(%rax), %zmm2, %zmm1 - vmulpd %zmm2, %zmm15, %zmm3 - vfmadd213pd _cev_4(%rax), %zmm2, %zmm1 - vfmadd213pd _cev_5(%rax), %zmm2, %zmm1 - vfmadd213pd %zmm15, %zmm3, %zmm1 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - 
vmovaps %zmm1, %zmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovups %zmm0, 1152(%rsp) - vmovups %zmm12, 1216(%rsp) - vmovups %zmm1, 1280(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - kmovw %k4, 1048(%rsp) - xorl %eax, %eax - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - addb $1, %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - kmovw 1048(%rsp), %k4 - movq 1064(%rsp), %rsi - kmovw 1040(%rsp), %k5 - movq 1056(%rsp), %rdi - kmovw 1032(%rsp), %k6 - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - cfi_restore (%r13) - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 
128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - vmovups 1280(%rsp), %zmm1 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1160(%rsp,%r15), %xmm0 - vmovsd 1224(%rsp,%r15), %xmm1 - call JUMPTARGET(__pow_finite) - vmovsd %xmm0, 1288(%rsp,%r15) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1152(%rsp,%r15), %xmm0 - vmovsd 1216(%rsp,%r15), %xmm1 - call JUMPTARGET(__pow_finite) - vmovsd %xmm0, 1280(%rsp,%r15) - jmp .LBL_1_7 - -#endif -END (_ZGVeN8vv_pow_knl) - -ENTRY (_ZGVeN8vv_pow_skx) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow -#else - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1344, %rsp - vpsrlq $32, %zmm0, %zmm10 - kxnorw %k1, %k1, %k1 - kxnorw %k2, %k2, %k2 - kxnorw %k3, %k3, %k3 - vpmovqd %zmm10, %ymm7 - movq __svml_dpow_data@GOTPCREL(%rip), %rax - vmovaps %zmm1, %zmm6 - vpsrlq $32, %zmm6, %zmm13 - -/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */ - vpand _iIndexMask(%rax), %ymm7, %ymm15 - vpaddd _HIDELTA(%rax), %ymm7, %ymm2 - -/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */ - vpsubd _i3fe7fe00(%rax), %ymm7, %ymm7 - vmovdqu _ABSMASK(%rax), %ymm4 - vmovdqu _LORANGE(%rax), %ymm3 - -/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */ - vmovups _dbOne(%rax), %zmm11 - vmovdqu _INF(%rax), %ymm5 - vpaddd _iIndexAdd(%rax), %ymm15, %ymm12 - vpmovqd %zmm13, %ymm14 - vpternlogq $248, _iMantissaMask(%rax), %zmm0, %zmm11 - vpsrld $10, %ymm12, %ymm10 - vpsrad $20, %ymm7, %ymm13 - -/* Index for reciprocal table */ - vpslld $3, %ymm10, %ymm8 - -/* Index for log2 table */ - vpslld $4, %ymm10, %ymm1 - vpcmpgtd %ymm2, %ymm3, %ymm3 - vpand %ymm4, %ymm14, %ymm2 - vpaddd _i2p20_2p19(%rax), %ymm13, %ymm14 - vpmovzxdq %ymm14, %zmm15 - vpsllq 
$32, %zmm15, %zmm7 - vpternlogq $168, _iffffffff00000000(%rax), %zmm15, %zmm7 - vsubpd _db2p20_2p19(%rax), %zmm7, %zmm13 - vpxord %zmm9, %zmm9, %zmm9 - vgatherdpd 11712(%rax,%ymm8), %zmm9{%k1} - -/* T1 = ((double*)exp2_tbl)[ 2*j ] */ - kxnorw %k1, %k1, %k1 - vpxord %zmm12, %zmm12, %zmm12 - vpxord %zmm8, %zmm8, %zmm8 - vgatherdpd 19968(%rax,%ymm1), %zmm12{%k2} - vgatherdpd 19976(%rax,%ymm1), %zmm8{%k3} - vmovups _iHighMask(%rax), %zmm1 - -/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */ - vandpd %zmm1, %zmm11, %zmm10 - vsubpd %zmm10, %zmm11, %zmm15 - -/* r1 = x1*rcp1 */ - vmulpd %zmm9, %zmm11, %zmm11 - -/* E = -r1+__fence(x1Hi*rcp1) */ - vfmsub213pd %zmm11, %zmm9, %zmm10 - -/* cq = c+r1 */ - vaddpd _LHN(%rax), %zmm11, %zmm14 - -/* E=E+x1Lo*rcp1 */ - vfmadd213pd %zmm10, %zmm9, %zmm15 - -/* T = k + L1hi */ - vaddpd %zmm12, %zmm13, %zmm9 - -/* T_Rh = T + cq */ - vaddpd %zmm14, %zmm9, %zmm11 - -/* T_Rh_Eh = T_Rh + E */ - vaddpd %zmm15, %zmm11, %zmm13 - -/* Rl = T-T_Rh; -> -Rh */ - vsubpd %zmm11, %zmm9, %zmm12 - -/* HLL = T_Rh - T_Rh_Eh; -> -Eh */ - vsubpd %zmm13, %zmm11, %zmm9 - -/* Rl=Rl+cq */ - vaddpd %zmm12, %zmm14, %zmm10 - -/* HLL+=E; -> El */ - vaddpd %zmm9, %zmm15, %zmm7 - -/* HLL+=Rl */ - vaddpd %zmm10, %zmm7, %zmm12 - -/* 2^(y*(HH+HL+HLL)) starts here: - yH = y; Lo(yH)&=0xf8000000 - */ - vandpd %zmm1, %zmm6, %zmm7 - -/* HLL+=L1lo */ - vaddpd %zmm8, %zmm12, %zmm12 - -/* cq = cq + E */ - vaddpd %zmm15, %zmm14, %zmm8 - vmovups _clv_2(%rax), %zmm14 - -/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */ - vfmadd213pd _clv_3(%rax), %zmm8, %zmm14 - vfmadd213pd _clv_4(%rax), %zmm8, %zmm14 - vfmadd213pd _clv_5(%rax), %zmm8, %zmm14 - vfmadd213pd _clv_6(%rax), %zmm8, %zmm14 - vfmadd213pd _clv_7(%rax), %zmm8, %zmm14 - vfmadd213pd %zmm12, %zmm8, %zmm14 - -/* yL = y-yH */ - vsubpd %zmm7, %zmm6, %zmm8 - -/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */ - vaddpd %zmm14, %zmm13, %zmm15 - -/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */ - vandpd %zmm1, %zmm15, %zmm11 - 
-/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */ - vsubpd %zmm13, %zmm15, %zmm13 - -/* pH = yH*HH */ - vmulpd %zmm11, %zmm7, %zmm9 - -/* HLL = HLL - HLLhi */ - vsubpd %zmm13, %zmm14, %zmm12 - -/* HL = T_Rh_Eh_HLLhi-HH */ - vsubpd %zmm11, %zmm15, %zmm10 - vpsrlq $32, %zmm9, %zmm1 - vmovdqu _DOMAINRANGE(%rax), %ymm13 - vpmovqd %zmm1, %ymm1 - vpand %ymm4, %ymm1, %ymm1 - vpcmpgtd %ymm5, %ymm2, %ymm4 - vpcmpeqd %ymm5, %ymm2, %ymm5 - vpternlogd $254, %ymm5, %ymm4, %ymm3 - vpcmpgtd %ymm13, %ymm1, %ymm2 - vpcmpeqd %ymm13, %ymm1, %ymm4 - vpternlogd $254, %ymm4, %ymm2, %ymm3 - -/* pLL = y*HLL */ - vmovups _db2p45_2p44(%rax), %zmm2 - -/* pHH = pH + *(double*)&db2p45_2p44 */ - vaddpd %zmm2, %zmm9, %zmm1 - vpmovqd %zmm1, %ymm5 - -/* j = Lo(pHH)&0x0000007f */ - vpand _jIndexMask(%rax), %ymm5, %ymm14 - vpslld $4, %ymm14, %ymm15 - vmovmskps %ymm3, %ecx - -/* pL=yL*HL+yH*HL; pL+=yL*HH */ - vmulpd %zmm10, %zmm8, %zmm3 - vfmadd213pd %zmm3, %zmm10, %zmm7 - vfmadd213pd %zmm7, %zmm11, %zmm8 - -/* _n = Lo(pHH) - _n = _n & 0xffffff80 - _n = _n >> 7 - Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n - */ - vpslld $13, %ymm5, %ymm7 - -/* t=pL+pLL; t+=pHL */ - vfmadd231pd %zmm6, %zmm12, %zmm8 - vpaddd _iOne(%rax), %ymm7, %ymm10 - vpmovzxdq %ymm10, %zmm11 - vpsllq $32, %zmm11, %zmm3 - vpternlogq $168, _ifff0000000000000(%rax), %zmm11, %zmm3 - -/* pHH = pHH - *(double*)&db2p45_2p44 */ - vsubpd %zmm2, %zmm1, %zmm11 - vmovups _cev_1(%rax), %zmm2 - -/* pHL = pH - pHH */ - vsubpd %zmm11, %zmm9, %zmm9 - vaddpd %zmm9, %zmm8, %zmm8 - vfmadd213pd _cev_2(%rax), %zmm8, %zmm2 - vfmadd213pd _cev_3(%rax), %zmm8, %zmm2 - vfmadd213pd _cev_4(%rax), %zmm8, %zmm2 - vfmadd213pd _cev_5(%rax), %zmm8, %zmm2 - vpxord %zmm4, %zmm4, %zmm4 - vgatherdpd 36416(%rax,%ymm15), %zmm4{%k1} - vmulpd %zmm4, %zmm3, %zmm1 - vmulpd %zmm8, %zmm1, %zmm12 - vfmadd213pd %zmm1, %zmm12, %zmm2 - testl %ecx, %ecx - jne .LBL_2_3 - -.LBL_2_2: - cfi_remember_state - vmovaps %zmm2, %zmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - 
cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_2_3: - cfi_restore_state - vmovups %zmm0, 1152(%rsp) - vmovups %zmm6, 1216(%rsp) - vmovups %zmm2, 1280(%rsp) - je .LBL_2_2 - - xorb %dl, %dl - xorl %eax, %eax - kmovw %k4, 1048(%rsp) - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - -.LBL_2_6: - btl %r14d, %r13d - jc .LBL_2_12 - -.LBL_2_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_2_10 - -.LBL_2_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_2_6 - - kmovw 1048(%rsp), %k4 - kmovw 1040(%rsp), %k5 - kmovw 1032(%rsp), %k6 - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - vmovups 1280(%rsp), %zmm2 - movq 1064(%rsp), %rsi - movq 1056(%rsp), %rdi - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - cfi_restore 
(%r13) - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - jmp .LBL_2_2 - -.LBL_2_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1224(%rsp,%r15), %xmm1 - vzeroupper - vmovsd 1160(%rsp,%r15), %xmm0 - - call JUMPTARGET(__pow_finite) - - vmovsd %xmm0, 1288(%rsp,%r15) - jmp .LBL_2_8 - -.LBL_2_12: - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1216(%rsp,%r15), %xmm1 - vzeroupper - vmovsd 1152(%rsp,%r15), %xmm0 - - call JUMPTARGET(__pow_finite) - - vmovsd %xmm0, 1280(%rsp,%r15) - jmp .LBL_2_7 - -#endif -END (_ZGVeN8vv_pow_skx) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S deleted file mode 100644 index e35654be8d..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized sin. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN2v_sin) - .type _ZGVbN2v_sin, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN2v_sin_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN2v_sin_sse2(%rip), %rax - ret -END (_ZGVbN2v_sin) -libmvec_hidden_def (_ZGVbN2v_sin) - -#define _ZGVbN2v_sin _ZGVbN2v_sin_sse2 -#include "../svml_d_sin2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S deleted file mode 100644 index 393ba03b76..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S +++ /dev/null @@ -1,229 +0,0 @@ -/* Function sin vectorized with SSE4. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include "svml_d_trig_data.h" - - .text -ENTRY (_ZGVbN2v_sin_sse4) -/* ALGORITHM DESCRIPTION: - - ( low accuracy ( < 4ulp ) or enhanced performance - ( half of correct mantissa ) implementation ) - - Argument representation: - arg = N*Pi + R - - Result calculation: - sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) - sin(R) is approximated by corresponding polynomial - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $320, %rsp - movaps %xmm0, %xmm5 - movq __svml_d_trig_data@GOTPCREL(%rip), %rax - movups __dAbsMask(%rax), %xmm3 -/* - ARGUMENT RANGE REDUCTION: - X' = |X| - */ - movaps %xmm3, %xmm4 - -/* SignX - sign bit of X */ - andnps %xmm5, %xmm3 - movups __dInvPI(%rax), %xmm2 - andps %xmm5, %xmm4 - -/* Y = X'*InvPi + RS : right shifter add */ - mulpd %xmm4, %xmm2 - movups __dRShifter(%rax), %xmm6 - -/* R = X' - N*Pi1 */ - movaps %xmm4, %xmm0 - addpd %xmm6, %xmm2 - cmpnlepd __dRangeVal(%rax), %xmm4 - -/* N = Y - RS : right shifter sub */ - movaps %xmm2, %xmm1 - -/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ - psllq $63, %xmm2 - subpd %xmm6, %xmm1 - movmskpd %xmm4, %ecx - movups __dPI1(%rax), %xmm7 - mulpd %xmm1, %xmm7 - movups __dPI2(%rax), %xmm6 - -/* R = R - N*Pi2 */ - mulpd %xmm1, %xmm6 - subpd %xmm7, %xmm0 - movups __dPI3(%rax), %xmm7 - -/* R = R - N*Pi3 */ - mulpd %xmm1, %xmm7 - subpd %xmm6, %xmm0 - movups __dPI4(%rax), %xmm6 - -/* R = R - N*Pi4 */ - mulpd %xmm6, %xmm1 - subpd %xmm7, %xmm0 - subpd %xmm1, %xmm0 - -/* - POLYNOMIAL APPROXIMATION: - R2 = R*R - */ - movaps %xmm0, %xmm1 - mulpd %xmm0, %xmm1 - -/* R = R^SignRes : update sign of reduced argument */ - xorps %xmm2, %xmm0 - movups __dC7_sin(%rax), %xmm2 - mulpd %xmm1, %xmm2 - addpd __dC6_sin(%rax), %xmm2 - mulpd %xmm1, %xmm2 - addpd __dC5_sin(%rax), %xmm2 - mulpd %xmm1, %xmm2 - addpd __dC4_sin(%rax), %xmm2 - -/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ - mulpd %xmm1, 
%xmm2 - addpd __dC3_sin(%rax), %xmm2 - -/* Poly = R2*(C1+R2*(C2+R2*Poly)) */ - mulpd %xmm1, %xmm2 - addpd __dC2_sin(%rax), %xmm2 - mulpd %xmm1, %xmm2 - addpd __dC1_sin(%rax), %xmm2 - mulpd %xmm2, %xmm1 - -/* Poly = Poly*R + R */ - mulpd %xmm0, %xmm1 - addpd %xmm1, %xmm0 - -/* - RECONSTRUCTION: - Final sign setting: Res = Poly^SignX - */ - xorps %xmm3, %xmm0 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - movups %xmm5, 192(%rsp) - movups %xmm0, 256(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - movups %xmm8, 112(%rsp) - movups %xmm9, 96(%rsp) - movups %xmm10, 80(%rsp) - movups %xmm11, 64(%rsp) - movups %xmm12, 48(%rsp) - movups %xmm13, 32(%rsp) - movups %xmm14, 16(%rsp) - movups %xmm15, (%rsp) - movq %rsi, 136(%rsp) - movq %rdi, 128(%rsp) - movq %r12, 168(%rsp) - cfi_offset_rel_rsp (12, 168) - movb %dl, %r12b - movq %r13, 160(%rsp) - cfi_offset_rel_rsp (13, 160) - movl %ecx, %r13d - movq %r14, 152(%rsp) - cfi_offset_rel_rsp (14, 152) - movl %eax, %r14d - movq %r15, 144(%rsp) - cfi_offset_rel_rsp (15, 144) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - movups 112(%rsp), %xmm8 - movups 96(%rsp), %xmm9 - movups 80(%rsp), %xmm10 - movups 64(%rsp), %xmm11 - movups 48(%rsp), %xmm12 - movups 32(%rsp), %xmm13 - movups 16(%rsp), %xmm14 - movups (%rsp), %xmm15 - movq 136(%rsp), %rsi - movq 128(%rsp), %rdi - movq 168(%rsp), %r12 - cfi_restore (%r12) - movq 160(%rsp), %r13 - cfi_restore (%r13) - movq 152(%rsp), %r14 - cfi_restore (%r14) - movq 144(%rsp), %r15 - cfi_restore (%r15) - movups 256(%rsp), %xmm0 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - movsd 200(%rsp,%r15), %xmm0 - - call 
JUMPTARGET(sin) - - movsd %xmm0, 264(%rsp,%r15) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - shlq $4, %r15 - movsd 192(%rsp,%r15), %xmm0 - - call JUMPTARGET(sin) - - movsd %xmm0, 256(%rsp,%r15) - jmp .LBL_1_7 - -END (_ZGVbN2v_sin_sse4) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S deleted file mode 100644 index f4482d3a11..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized sin, vector length is 4. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN4v_sin) - .type _ZGVdN4v_sin, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN4v_sin_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN4v_sin_sse_wrapper(%rip), %rax - ret -END (_ZGVdN4v_sin) -libmvec_hidden_def (_ZGVdN4v_sin) - -#define _ZGVdN4v_sin _ZGVdN4v_sin_sse_wrapper -#include "../svml_d_sin4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S deleted file mode 100644 index b035fa1b15..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S +++ /dev/null @@ -1,210 +0,0 @@ -/* Function sin vectorized with AVX2. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include "svml_d_trig_data.h" - - .text -ENTRY (_ZGVdN4v_sin_avx2) -/* ALGORITHM DESCRIPTION: - - ( low accuracy ( < 4ulp ) or enhanced performance - ( half of correct mantissa ) implementation ) - - Argument representation: - arg = N*Pi + R - - Result calculation: - sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) - sin(R) is approximated by corresponding polynomial - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $448, %rsp - movq __svml_d_trig_data@GOTPCREL(%rip), %rax - vmovdqa %ymm0, %ymm4 - vmovupd __dAbsMask(%rax), %ymm2 - vmovupd __dInvPI(%rax), %ymm6 - vmovupd __dRShifter(%rax), %ymm5 - vmovupd __dPI1_FMA(%rax), %ymm7 -/* - ARGUMENT RANGE REDUCTION: - X' = |X| - */ - vandpd %ymm2, %ymm4, %ymm3 - -/* Y = X'*InvPi + RS : right shifter add */ - vfmadd213pd %ymm5, %ymm3, %ymm6 - -/* N = Y - RS : right shifter sub */ - vsubpd %ymm5, %ymm6, %ymm1 - -/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ - vpsllq $63, %ymm6, %ymm5 - -/* R = X' - N*Pi1 */ - vmovapd %ymm3, %ymm0 - vfnmadd231pd %ymm1, %ymm7, %ymm0 - vcmpnle_uqpd __dRangeVal(%rax), %ymm3, %ymm3 - -/* R = R - N*Pi2 */ - vfnmadd231pd __dPI2_FMA(%rax), %ymm1, %ymm0 - -/* R = R - N*Pi3 */ - vfnmadd132pd __dPI3_FMA(%rax), %ymm0, %ymm1 - -/* - POLYNOMIAL APPROXIMATION: - R2 = R*R - */ - vmulpd %ymm1, %ymm1, %ymm0 - -/* R = R^SignRes : update sign of reduced argument */ - vxorpd %ymm5, %ymm1, %ymm6 - vmovupd __dC7_sin(%rax), %ymm1 - vfmadd213pd __dC6_sin(%rax), %ymm0, %ymm1 - vfmadd213pd __dC5_sin(%rax), %ymm0, %ymm1 - vfmadd213pd __dC4_sin(%rax), %ymm0, %ymm1 - -/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ - vfmadd213pd __dC3_sin(%rax), %ymm0, %ymm1 - -/* Poly = R2*(C1+R2*(C2+R2*Poly)) */ - vfmadd213pd __dC2_sin(%rax), %ymm0, %ymm1 - vfmadd213pd __dC1_sin(%rax), %ymm0, %ymm1 - -/* SignX - sign bit of X */ - vandnpd %ymm4, %ymm2, %ymm7 - vmulpd %ymm0, %ymm1, %ymm2 - -/* Poly = 
Poly*R + R */ - vfmadd213pd %ymm6, %ymm6, %ymm2 - vmovmskpd %ymm3, %ecx - -/* - RECONSTRUCTION: - Final sign setting: Res = Poly^SignX - */ - vxorpd %ymm7, %ymm2, %ymm0 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovupd %ymm4, 320(%rsp) - vmovupd %ymm0, 384(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - vmovups %ymm8, 224(%rsp) - vmovups %ymm9, 192(%rsp) - vmovups %ymm10, 160(%rsp) - vmovups %ymm11, 128(%rsp) - vmovups %ymm12, 96(%rsp) - vmovups %ymm13, 64(%rsp) - vmovups %ymm14, 32(%rsp) - vmovups %ymm15, (%rsp) - movq %rsi, 264(%rsp) - movq %rdi, 256(%rsp) - movq %r12, 296(%rsp) - cfi_offset_rel_rsp (12, 296) - movb %dl, %r12b - movq %r13, 288(%rsp) - cfi_offset_rel_rsp (13, 288) - movl %ecx, %r13d - movq %r14, 280(%rsp) - cfi_offset_rel_rsp (14, 280) - movl %eax, %r14d - movq %r15, 272(%rsp) - cfi_offset_rel_rsp (15, 272) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - vmovups 224(%rsp), %ymm8 - vmovups 192(%rsp), %ymm9 - vmovups 160(%rsp), %ymm10 - vmovups 128(%rsp), %ymm11 - vmovups 96(%rsp), %ymm12 - vmovups 64(%rsp), %ymm13 - vmovups 32(%rsp), %ymm14 - vmovups (%rsp), %ymm15 - vmovupd 384(%rsp), %ymm0 - movq 264(%rsp), %rsi - movq 256(%rsp), %rdi - movq 296(%rsp), %r12 - cfi_restore (%r12) - movq 288(%rsp), %r13 - cfi_restore (%r13) - movq 280(%rsp), %r14 - cfi_restore (%r14) - movq 272(%rsp), %r15 - cfi_restore (%r15) - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 328(%rsp,%r15), %xmm0 - vzeroupper - - call JUMPTARGET(sin) - - vmovsd %xmm0, 392(%rsp,%r15) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 320(%rsp,%r15), %xmm0 - vzeroupper 
- - call JUMPTARGET(sin) - - vmovsd %xmm0, 384(%rsp,%r15) - jmp .LBL_1_7 - -END (_ZGVdN4v_sin_avx2) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S deleted file mode 100644 index 2b15889c71..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized sin. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN8v_sin) - .type _ZGVeN8v_sin, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN8v_sin_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN8v_sin_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN8v_sin_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN8v_sin) - -#define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper -#include "../svml_d_sin8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S deleted file mode 100644 index 7580e60636..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S +++ /dev/null @@ -1,465 +0,0 @@ -/* Function sin vectorized with AVX-512, KNL and SKX versions. 
- Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_d_trig_data.h" -#include "svml_d_wrapper_impl.h" - - .text -ENTRY (_ZGVeN8v_sin_knl) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512 _ZGVdN4v_sin -#else -/* - ALGORITHM DESCRIPTION: - - ( low accuracy ( < 4ulp ) or enhanced performance - ( half of correct mantissa ) implementation ) - - Argument representation: - arg = N*Pi + R - - Result calculation: - sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) - sin(R) is approximated by corresponding polynomial - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1280, %rsp - movq __svml_d_trig_data@GOTPCREL(%rip), %rax - movq $-1, %rdx - vmovups __dAbsMask(%rax), %zmm6 - vmovups __dInvPI(%rax), %zmm1 - -/* - ARGUMENT RANGE REDUCTION: - X' = |X| - */ - vpandq %zmm6, %zmm0, %zmm12 - vmovups __dPI1_FMA(%rax), %zmm2 - vmovups __dC7_sin(%rax), %zmm7 - -/* SignX - sign bit of X */ - vpandnq %zmm0, %zmm6, %zmm11 - -/* R = X' - N*Pi1 */ - vmovaps %zmm12, %zmm3 - -/* Y = X'*InvPi + RS : right shifter add */ - vfmadd213pd __dRShifter(%rax), %zmm12, %zmm1 - vcmppd $22, __dRangeVal(%rax), %zmm12, %k1 - vpbroadcastq %rdx, %zmm13{%k1}{z} - 
-/* N = Y - RS : right shifter sub */ - vsubpd __dRShifter(%rax), %zmm1, %zmm4 - -/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ - vpsllq $63, %zmm1, %zmm5 - vptestmq %zmm13, %zmm13, %k0 - vfnmadd231pd %zmm4, %zmm2, %zmm3 - kmovw %k0, %ecx - movzbl %cl, %ecx - -/* R = R - N*Pi2 */ - vfnmadd231pd __dPI2_FMA(%rax), %zmm4, %zmm3 - -/* R = R - N*Pi3 */ - vfnmadd132pd __dPI3_FMA(%rax), %zmm3, %zmm4 - -/* - POLYNOMIAL APPROXIMATION: - R2 = R*R - */ - vmulpd %zmm4, %zmm4, %zmm8 - -/* R = R^SignRes : update sign of reduced argument */ - vpxorq %zmm5, %zmm4, %zmm9 - vfmadd213pd __dC6_sin(%rax), %zmm8, %zmm7 - vfmadd213pd __dC5_sin(%rax), %zmm8, %zmm7 - vfmadd213pd __dC4_sin(%rax), %zmm8, %zmm7 - -/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ - vfmadd213pd __dC3_sin(%rax), %zmm8, %zmm7 - -/* Poly = R2*(C1+R2*(C2+R2*Poly)) */ - vfmadd213pd __dC2_sin(%rax), %zmm8, %zmm7 - vfmadd213pd __dC1_sin(%rax), %zmm8, %zmm7 - vmulpd %zmm8, %zmm7, %zmm10 - -/* Poly = Poly*R + R */ - vfmadd213pd %zmm9, %zmm9, %zmm10 - -/* - RECONSTRUCTION: - Final sign setting: Res = Poly^SignX - */ - vpxorq %zmm11, %zmm10, %zmm1 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - vmovaps %zmm1, %zmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovups %zmm0, 1152(%rsp) - vmovups %zmm1, 1216(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - kmovw %k4, 1048(%rsp) - xorl %eax, %eax - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups 
%zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - addb $1, %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - kmovw 1048(%rsp), %k4 - movq 1064(%rsp), %rsi - kmovw 1040(%rsp), %k5 - movq 1056(%rsp), %rdi - kmovw 1032(%rsp), %k6 - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - cfi_restore (%r13) - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - vmovups 1216(%rsp), %zmm1 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1160(%rsp,%r15), %xmm0 - call JUMPTARGET(sin) - vmovsd %xmm0, 1224(%rsp,%r15) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1152(%rsp,%r15), %xmm0 - call JUMPTARGET(sin) - vmovsd %xmm0, 1216(%rsp,%r15) - jmp .LBL_1_7 -#endif -END (_ZGVeN8v_sin_knl) - -ENTRY (_ZGVeN8v_sin_skx) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512 _ZGVdN4v_sin -#else -/* - ALGORITHM DESCRIPTION: - - ( low accuracy ( < 4ulp ) or enhanced performance - ( half of correct mantissa ) implementation ) - - Argument representation: - arg 
= N*Pi + R - - Result calculation: - sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) - sin(R) is approximated by corresponding polynomial - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1280, %rsp - movq __svml_d_trig_data@GOTPCREL(%rip), %rax - vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14 - vmovups __dAbsMask(%rax), %zmm7 - vmovups __dInvPI(%rax), %zmm2 - vmovups __dRShifter(%rax), %zmm1 - vmovups __dPI1_FMA(%rax), %zmm3 - vmovups __dC7_sin(%rax), %zmm8 - -/* - ARGUMENT RANGE REDUCTION: - X' = |X| - */ - vandpd %zmm7, %zmm0, %zmm13 - -/* SignX - sign bit of X */ - vandnpd %zmm0, %zmm7, %zmm12 - -/* Y = X'*InvPi + RS : right shifter add */ - vfmadd213pd %zmm1, %zmm13, %zmm2 - vcmppd $18, __dRangeVal(%rax), %zmm13, %k1 - -/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ - vpsllq $63, %zmm2, %zmm6 - -/* N = Y - RS : right shifter sub */ - vsubpd %zmm1, %zmm2, %zmm5 - -/* R = X' - N*Pi1 */ - vmovaps %zmm13, %zmm4 - vfnmadd231pd %zmm5, %zmm3, %zmm4 - -/* R = R - N*Pi2 */ - vfnmadd231pd __dPI2_FMA(%rax), %zmm5, %zmm4 - -/* R = R - N*Pi3 */ - vfnmadd132pd __dPI3_FMA(%rax), %zmm4, %zmm5 - -/* - POLYNOMIAL APPROXIMATION: - R2 = R*R - */ - vmulpd %zmm5, %zmm5, %zmm9 - -/* R = R^SignRes : update sign of reduced argument */ - vxorpd %zmm6, %zmm5, %zmm10 - vfmadd213pd __dC6_sin(%rax), %zmm9, %zmm8 - vfmadd213pd __dC5_sin(%rax), %zmm9, %zmm8 - vfmadd213pd __dC4_sin(%rax), %zmm9, %zmm8 - -/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ - vfmadd213pd __dC3_sin(%rax), %zmm9, %zmm8 - -/* Poly = R2*(C1+R2*(C2+R2*Poly)) */ - vfmadd213pd __dC2_sin(%rax), %zmm9, %zmm8 - vfmadd213pd __dC1_sin(%rax), %zmm9, %zmm8 - vmulpd %zmm9, %zmm8, %zmm11 - -/* Poly = Poly*R + R */ - vfmadd213pd %zmm10, %zmm10, %zmm11 - -/* - RECONSTRUCTION: - Final sign setting: Res = Poly^SignX - */ - vxorpd %zmm12, %zmm11, %zmm1 - vpandnq %zmm13, %zmm13, %zmm14{%k1} - vcmppd $3, %zmm14, %zmm14, %k0 - kmovw 
%k0, %ecx - testl %ecx, %ecx - jne .LBL_2_3 - -.LBL_2_2: - cfi_remember_state - vmovaps %zmm1, %zmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_2_3: - cfi_restore_state - vmovups %zmm0, 1152(%rsp) - vmovups %zmm1, 1216(%rsp) - je .LBL_2_2 - - xorb %dl, %dl - xorl %eax, %eax - kmovw %k4, 1048(%rsp) - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - -.LBL_2_6: - btl %r14d, %r13d - jc .LBL_2_12 - -.LBL_2_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_2_10 - -.LBL_2_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_2_6 - - kmovw 1048(%rsp), %k4 - kmovw 1040(%rsp), %k5 - kmovw 1032(%rsp), %k6 - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - vmovups 1216(%rsp), 
%zmm1 - movq 1064(%rsp), %rsi - movq 1056(%rsp), %rdi - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - cfi_restore (%r13) - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - jmp .LBL_2_2 - -.LBL_2_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1160(%rsp,%r15), %xmm0 - vzeroupper - vmovsd 1160(%rsp,%r15), %xmm0 - - call JUMPTARGET(sin) - - vmovsd %xmm0, 1224(%rsp,%r15) - jmp .LBL_2_8 - -.LBL_2_12: - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1152(%rsp,%r15), %xmm0 - vzeroupper - vmovsd 1152(%rsp,%r15), %xmm0 - - call JUMPTARGET(sin) - - vmovsd %xmm0, 1216(%rsp,%r15) - jmp .LBL_2_7 -#endif -END (_ZGVeN8v_sin_skx) - - .section .rodata, "a" -.L_2il0floatpacket.14: - .long 0xffffffff,0xffffffff - .type .L_2il0floatpacket.14,@object diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S deleted file mode 100644 index 13279e3fb7..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized sincos. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN2vvv_sincos) - .type _ZGVbN2vvv_sincos, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN2vvv_sincos_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN2vvv_sincos_sse2(%rip), %rax - ret -END (_ZGVbN2vvv_sincos) -libmvec_hidden_def (_ZGVbN2vvv_sincos) - -#define _ZGVbN2vvv_sincos _ZGVbN2vvv_sincos_sse2 -#include "../svml_d_sincos2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S deleted file mode 100644 index c46109f35d..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S +++ /dev/null @@ -1,368 +0,0 @@ -/* Function sincos vectorized with SSE4. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include "svml_d_trig_data.h" - - .text -ENTRY (_ZGVbN2vl8l8_sincos_sse4) -/* - ALGORITHM DESCRIPTION: - - ( low accuracy ( < 4ulp ) or enhanced performance - ( half of correct mantissa ) implementation ) - - Argument representation: - arg = N*Pi + R - - Result calculation: - sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) - arg + Pi/2 = (N'*Pi + R') - cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R') - sin(R), sin(R') are approximated by corresponding polynomial. */ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $320, %rsp - movq __svml_d_trig_data@GOTPCREL(%rip), %rax - movups %xmm11, 160(%rsp) - movups %xmm12, 144(%rsp) - movups __dSignMask(%rax), %xmm11 - -/* ARGUMENT RANGE REDUCTION: - Absolute argument: X' = |X| */ - movaps %xmm11, %xmm4 - -/* Grab sign bit from argument */ - movaps %xmm11, %xmm7 - movups __dInvPI(%rax), %xmm5 - andnps %xmm0, %xmm4 - -/* SinY = X'*InvPi + RS : right shifter add */ - mulpd %xmm4, %xmm5 - addpd __dRShifter(%rax), %xmm5 - -/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ - movaps %xmm5, %xmm12 - andps %xmm0, %xmm7 - -/* SinN = Y - RS : right shifter sub */ - subpd __dRShifter(%rax), %xmm5 - movups %xmm10, 176(%rsp) - psllq $63, %xmm12 - movups __dPI1(%rax), %xmm10 - -/* SinR = X' - SinN*Pi1 */ - movaps %xmm10, %xmm1 - mulpd %xmm5, %xmm1 - movups __dPI2(%rax), %xmm6 - -/* SinR = SinR - SinN*Pi1 */ - movaps %xmm6, %xmm2 - mulpd %xmm5, %xmm2 - movups %xmm13, 112(%rsp) - movaps %xmm4, %xmm13 - subpd %xmm1, %xmm13 - subpd %xmm2, %xmm13 - -/* Sine result sign: SinRSign = SignMask & SinR */ - movaps %xmm11, %xmm2 - -/* CosR = SinX - CosN*Pi1 */ - movaps %xmm4, %xmm1 - movups __dOneHalf(%rax), %xmm3 - andps %xmm13, %xmm2 - -/* Set SinRSign to 0.5 */ - orps %xmm2, %xmm3 - -/* Update CosRSign and CosSignRes signs */ - xorps %xmm11, %xmm2 - -/* CosN = SinN +(-)0.5 */ - addpd %xmm5, %xmm3 - 
cmpnlepd __dRangeVal(%rax), %xmm4 - mulpd %xmm3, %xmm10 - -/* CosR = CosR - CosN*Pi2 */ - mulpd %xmm3, %xmm6 - subpd %xmm10, %xmm1 - movmskpd %xmm4, %ecx - movups __dPI3(%rax), %xmm10 - xorps %xmm12, %xmm2 - subpd %xmm6, %xmm1 - -/* SinR = SinR - SinN*Pi3 */ - movaps %xmm10, %xmm6 - -/* Final reconstruction. - Combine Sin result's sign */ - xorps %xmm7, %xmm12 - mulpd %xmm5, %xmm6 - -/* CosR = CosR - CosN*Pi3 */ - mulpd %xmm3, %xmm10 - subpd %xmm6, %xmm13 - subpd %xmm10, %xmm1 - movups __dPI4(%rax), %xmm6 - -/* SinR = SinR - SinN*Pi4 */ - mulpd %xmm6, %xmm5 - -/* CosR = CosR - CosN*Pi4 */ - mulpd %xmm6, %xmm3 - subpd %xmm5, %xmm13 - subpd %xmm3, %xmm1 - -/* SinR2 = SinR^2 */ - movaps %xmm13, %xmm6 - -/* CosR2 = CosR^2 */ - movaps %xmm1, %xmm10 - mulpd %xmm13, %xmm6 - mulpd %xmm1, %xmm10 - -/* Polynomial approximation */ - movups __dC7(%rax), %xmm5 - movaps %xmm5, %xmm3 - mulpd %xmm6, %xmm3 - mulpd %xmm10, %xmm5 - addpd __dC6(%rax), %xmm3 - addpd __dC6(%rax), %xmm5 - mulpd %xmm6, %xmm3 - mulpd %xmm10, %xmm5 - addpd __dC5(%rax), %xmm3 - addpd __dC5(%rax), %xmm5 - mulpd %xmm6, %xmm3 - mulpd %xmm10, %xmm5 - addpd __dC4(%rax), %xmm3 - addpd __dC4(%rax), %xmm5 - -/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ - mulpd %xmm6, %xmm3 - -/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ - mulpd %xmm10, %xmm5 - addpd __dC3(%rax), %xmm3 - addpd __dC3(%rax), %xmm5 - -/* SinPoly = C2 + SinR2*SinPoly */ - mulpd %xmm6, %xmm3 - -/* CosPoly = C2 + CosR2*CosPoly */ - mulpd %xmm10, %xmm5 - addpd __dC2(%rax), %xmm3 - addpd __dC2(%rax), %xmm5 - -/* SinPoly = C1 + SinR2*SinPoly */ - mulpd %xmm6, %xmm3 - -/* CosPoly = C1 + CosR2*CosPoly */ - mulpd %xmm10, %xmm5 - addpd __dC1(%rax), %xmm3 - addpd __dC1(%rax), %xmm5 - -/* SinPoly = SinR2*SinPoly */ - mulpd %xmm3, %xmm6 - -/* CosPoly = CosR2*CosPoly */ - mulpd %xmm5, %xmm10 - -/* SinPoly = SinR*SinPoly */ - mulpd %xmm13, %xmm6 - -/* CosPoly = CosR*CosPoly */ - mulpd %xmm1, %xmm10 - addpd %xmm6, %xmm13 
- addpd %xmm10, %xmm1 - -/* Update Sin result's sign */ - xorps %xmm12, %xmm13 - -/* Update Cos result's sign */ - xorps %xmm2, %xmm1 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movups 176(%rsp), %xmm10 - movaps %xmm13, (%rdi) - movups 160(%rsp), %xmm11 - movups 144(%rsp), %xmm12 - movups 112(%rsp), %xmm13 - movups %xmm1, (%rsi) - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - movups %xmm0, 128(%rsp) - movups %xmm13, 192(%rsp) - movups %xmm1, 256(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - movups %xmm8, 48(%rsp) - movups %xmm9, 32(%rsp) - movups %xmm14, 16(%rsp) - movups %xmm15, (%rsp) - movq %rsi, 64(%rsp) - movq %r12, 104(%rsp) - cfi_offset_rel_rsp (12, 104) - movb %dl, %r12b - movq %r13, 96(%rsp) - cfi_offset_rel_rsp (13, 96) - movl %eax, %r13d - movq %r14, 88(%rsp) - cfi_offset_rel_rsp (14, 88) - movl %ecx, %r14d - movq %r15, 80(%rsp) - cfi_offset_rel_rsp (15, 80) - movq %rbx, 72(%rsp) - movq %rdi, %rbx - cfi_remember_state - -.LBL_1_6: - btl %r13d, %r14d - jc .LBL_1_13 - -.LBL_1_7: - lea 1(%r13), %esi - btl %esi, %r14d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r13d - cmpb $16, %r12b - jb .LBL_1_6 - - movups 48(%rsp), %xmm8 - movq %rbx, %rdi - movups 32(%rsp), %xmm9 - movups 16(%rsp), %xmm14 - movups (%rsp), %xmm15 - movq 64(%rsp), %rsi - movq 104(%rsp), %r12 - cfi_restore (%r12) - movq 96(%rsp), %r13 - cfi_restore (%r13) - movq 88(%rsp), %r14 - cfi_restore (%r14) - movq 80(%rsp), %r15 - cfi_restore (%r15) - movq 72(%rsp), %rbx - movups 192(%rsp), %xmm13 - movups 256(%rsp), %xmm1 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - movsd 136(%rsp,%r15), %xmm0 - - call JUMPTARGET(sin) - - movsd %xmm0, 200(%rsp,%r15) - movsd 136(%rsp,%r15), %xmm0 - - call JUMPTARGET(cos) - - movsd %xmm0, 264(%rsp,%r15) - jmp .LBL_1_8 - -.LBL_1_13: - movzbl %r12b, %r15d - shlq $4, %r15 - movsd 
128(%rsp,%r15), %xmm0 - - call JUMPTARGET(sin) - - movsd %xmm0, 192(%rsp,%r15) - movsd 128(%rsp,%r15), %xmm0 - - call JUMPTARGET(cos) - - movsd %xmm0, 256(%rsp,%r15) - jmp .LBL_1_7 -END (_ZGVbN2vl8l8_sincos_sse4) -libmvec_hidden_def(_ZGVbN2vl8l8_sincos_sse4) - -/* vvv version implemented with wrapper to vl8l8 variant. */ -ENTRY (_ZGVbN2vvv_sincos_sse4) -#ifndef __ILP32__ - subq $72, %rsp - .cfi_def_cfa_offset 80 - movdqu %xmm1, 32(%rsp) - lea (%rsp), %rdi - movdqu %xmm2, 48(%rdi) - lea 16(%rsp), %rsi - call HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4) - movq 32(%rsp), %rdx - movq 48(%rsp), %rsi - movq 40(%rsp), %r8 - movq 56(%rsp), %r10 - movq (%rsp), %rax - movq 16(%rsp), %rcx - movq 8(%rsp), %rdi - movq 24(%rsp), %r9 - movq %rax, (%rdx) - movq %rcx, (%rsi) - movq %rdi, (%r8) - movq %r9, (%r10) - addq $72, %rsp - .cfi_def_cfa_offset 8 - ret -#else - subl $72, %esp - .cfi_def_cfa_offset 80 - leal 48(%rsp), %esi - movaps %xmm1, 16(%esp) - leal 32(%rsp), %edi - movaps %xmm2, (%esp) - call HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4) - movdqa 16(%esp), %xmm1 - movsd 32(%esp), %xmm0 - movq %xmm1, %rax - movdqa (%esp), %xmm2 - movsd %xmm0, (%eax) - movsd 40(%esp), %xmm0 - pextrd $1, %xmm1, %eax - movsd %xmm0, (%eax) - movsd 48(%esp), %xmm0 - movq %xmm2, %rax - movsd %xmm0, (%eax) - movsd 56(%esp), %xmm0 - pextrd $1, %xmm2, %eax - movsd %xmm0, (%eax) - addl $72, %esp - .cfi_def_cfa_offset 8 - ret -#endif -END (_ZGVbN2vvv_sincos_sse4) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S deleted file mode 100644 index 8aacb8e76a..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized sincos. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN4vvv_sincos) - .type _ZGVdN4vvv_sincos, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN4vvv_sincos_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN4vvv_sincos_sse_wrapper(%rip), %rax - ret -END (_ZGVdN4vvv_sincos) -libmvec_hidden_def (_ZGVdN4vvv_sincos) - -#define _ZGVdN4vvv_sincos _ZGVdN4vvv_sincos_sse_wrapper -#include "../svml_d_sincos4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S deleted file mode 100644 index a6318c5ca6..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S +++ /dev/null @@ -1,373 +0,0 @@ -/* Function sincos vectorized with AVX2. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_d_trig_data.h" - - .text -ENTRY (_ZGVdN4vl8l8_sincos_avx2) -/* - ALGORITHM DESCRIPTION: - - ( low accuracy ( < 4ulp ) or enhanced performance - ( half of correct mantissa ) implementation ) - - Argument representation: - arg = N*Pi + R - - Result calculation: - sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) - arg + Pi/2 = (N'*Pi + R') - cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R') - sin(R), sin(R') are approximated by corresponding polynomial. */ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $448, %rsp - movq __svml_d_trig_data@GOTPCREL(%rip), %rax - vmovups %ymm14, 288(%rsp) - vmovups %ymm8, 352(%rsp) - vmovupd __dSignMask(%rax), %ymm6 - vmovupd __dInvPI(%rax), %ymm2 - vmovupd __dPI1_FMA(%rax), %ymm5 - vmovups %ymm9, 224(%rsp) - -/* ARGUMENT RANGE REDUCTION: - Absolute argument: X' = |X| */ - vandnpd %ymm0, %ymm6, %ymm1 - -/* SinY = X'*InvPi + RS : right shifter add */ - vfmadd213pd __dRShifter(%rax), %ymm1, %ymm2 - -/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ - vpsllq $63, %ymm2, %ymm4 - -/* SinN = Y - RS : right shifter sub */ - vsubpd __dRShifter(%rax), %ymm2, %ymm2 - -/* SinR = X' - SinN*Pi1 */ - vmovdqa %ymm1, %ymm14 - vfnmadd231pd %ymm2, %ymm5, %ymm14 - -/* SinR = SinR - SinN*Pi2 */ - vfnmadd231pd __dPI2_FMA(%rax), %ymm2, %ymm14 - -/* Sine result sign: SinRSign = SignMask & SinR */ - vandpd %ymm14, %ymm6, %ymm7 - -/* Set SinRSign to 0.5 */ - vorpd __dOneHalf(%rax), %ymm7, %ymm3 - -/* 
CosN = SinN +(-)0.5 */ - vaddpd %ymm3, %ymm2, %ymm3 - -/* CosR = X' - CosN*Pi1 */ - vmovdqa %ymm1, %ymm8 - vfnmadd231pd %ymm3, %ymm5, %ymm8 - vmovupd __dPI3_FMA(%rax), %ymm5 - vcmpnle_uqpd __dRangeVal(%rax), %ymm1, %ymm1 - -/* CosR = CosR - CosN*Pi2 */ - vfnmadd231pd __dPI2_FMA(%rax), %ymm3, %ymm8 - -/* SinR = SinR - SinN*Pi3 */ - vfnmadd213pd %ymm14, %ymm5, %ymm2 - -/* CosR = CosR - CosN*Pi3 */ - vfnmadd213pd %ymm8, %ymm5, %ymm3 - vmovupd __dC6(%rax), %ymm8 - -/* SinR2 = SinR^2 */ - vmulpd %ymm2, %ymm2, %ymm14 - -/* CosR2 = CosR^2 */ - vmulpd %ymm3, %ymm3, %ymm5 - -/* Grab SignX */ - vandpd %ymm0, %ymm6, %ymm9 - -/* Update CosRSign and CosSignRes signs */ - vxorpd %ymm6, %ymm7, %ymm6 - vxorpd %ymm6, %ymm4, %ymm7 - -/* Update sign SinSignRes */ - vxorpd %ymm9, %ymm4, %ymm6 - -/* Polynomial approximation */ - vmovupd __dC7(%rax), %ymm4 - vmovdqa %ymm8, %ymm9 - vfmadd231pd __dC7(%rax), %ymm14, %ymm9 - vfmadd213pd %ymm8, %ymm5, %ymm4 - vfmadd213pd __dC5(%rax), %ymm14, %ymm9 - vfmadd213pd __dC5(%rax), %ymm5, %ymm4 - vfmadd213pd __dC4(%rax), %ymm14, %ymm9 - vfmadd213pd __dC4(%rax), %ymm5, %ymm4 - -/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ - vfmadd213pd __dC3(%rax), %ymm14, %ymm9 - -/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ - vfmadd213pd __dC3(%rax), %ymm5, %ymm4 - -/* SinPoly = C2 + SinR2*SinPoly */ - vfmadd213pd __dC2(%rax), %ymm14, %ymm9 - -/* CosPoly = C2 + CosR2*CosPoly */ - vfmadd213pd __dC2(%rax), %ymm5, %ymm4 - -/* SinPoly = C1 + SinR2*SinPoly */ - vfmadd213pd __dC1(%rax), %ymm14, %ymm9 - -/* CosPoly = C1 + CosR2*CosPoly */ - vfmadd213pd __dC1(%rax), %ymm5, %ymm4 - -/* SinPoly = SinR2*SinPoly */ - vmulpd %ymm14, %ymm9, %ymm8 - -/* CosPoly = CosR2*CosPoly */ - vmulpd %ymm5, %ymm4, %ymm4 - -/* SinPoly = SinR*SinPoly */ - vfmadd213pd %ymm2, %ymm2, %ymm8 - -/* CosPoly = CosR*CosPoly */ - vfmadd213pd %ymm3, %ymm3, %ymm4 - vmovmskpd %ymm1, %ecx - -/* Final reconstruction - Update Sin result's sign */ - vxorpd 
%ymm6, %ymm8, %ymm3 - -/* Update Cos result's sign */ - vxorpd %ymm7, %ymm4, %ymm2 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - vmovups 352(%rsp), %ymm8 - vmovups 224(%rsp), %ymm9 - vmovups 288(%rsp), %ymm14 - vmovupd %ymm2, (%rsi) - vmovdqa %ymm3, (%rdi) - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovupd %ymm0, 256(%rsp) - vmovupd %ymm3, 320(%rsp) - vmovupd %ymm2, 384(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - vmovups %ymm10, 128(%rsp) - vmovups %ymm11, 96(%rsp) - vmovups %ymm12, 64(%rsp) - vmovups %ymm13, 32(%rsp) - vmovups %ymm15, (%rsp) - movq %rsi, 160(%rsp) - movq %r12, 200(%rsp) - cfi_offset_rel_rsp (12, 200) - movb %dl, %r12b - movq %r13, 192(%rsp) - cfi_offset_rel_rsp (13, 192) - movl %eax, %r13d - movq %r14, 184(%rsp) - cfi_offset_rel_rsp (14, 184) - movl %ecx, %r14d - movq %r15, 176(%rsp) - cfi_offset_rel_rsp (15, 176) - movq %rbx, 168(%rsp) - movq %rdi, %rbx - cfi_remember_state - -.LBL_1_6: - btl %r13d, %r14d - jc .LBL_1_13 - -.LBL_1_7: - lea 1(%r13), %esi - btl %esi, %r14d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r13d - cmpb $16, %r12b - jb .LBL_1_6 - - vmovups 128(%rsp), %ymm10 - movq %rbx, %rdi - vmovups 96(%rsp), %ymm11 - vmovups 64(%rsp), %ymm12 - vmovups 32(%rsp), %ymm13 - vmovups (%rsp), %ymm15 - vmovupd 320(%rsp), %ymm3 - vmovupd 384(%rsp), %ymm2 - movq 160(%rsp), %rsi - movq 200(%rsp), %r12 - cfi_restore (%r12) - movq 192(%rsp), %r13 - cfi_restore (%r13) - movq 184(%rsp), %r14 - cfi_restore (%r14) - movq 176(%rsp), %r15 - cfi_restore (%r15) - movq 168(%rsp), %rbx - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 264(%rsp,%r15), %xmm0 - vzeroupper - - call JUMPTARGET(sin) - - vmovsd %xmm0, 328(%rsp,%r15) - vmovsd 264(%rsp,%r15), %xmm0 - - call JUMPTARGET(cos) - - vmovsd %xmm0, 392(%rsp,%r15) - jmp .LBL_1_8 - -.LBL_1_13: - movzbl %r12b, %r15d 
- shlq $4, %r15 - vmovsd 256(%rsp,%r15), %xmm0 - vzeroupper - - call JUMPTARGET(sin) - - vmovsd %xmm0, 320(%rsp,%r15) - vmovsd 256(%rsp,%r15), %xmm0 - - call JUMPTARGET(cos) - - vmovsd %xmm0, 384(%rsp,%r15) - jmp .LBL_1_7 - -END (_ZGVdN4vl8l8_sincos_avx2) -libmvec_hidden_def(_ZGVdN4vl8l8_sincos_avx2) - -/* vvv version implemented with wrapper to vl8l8 variant. */ -ENTRY (_ZGVdN4vvv_sincos_avx2) -#ifndef __ILP32__ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-32, %rsp - subq $128, %rsp - vmovdqu %ymm1, 64(%rsp) - lea (%rsp), %rdi - vmovdqu %ymm2, 96(%rdi) - lea 32(%rsp), %rsi - call HIDDEN_JUMPTARGET(_ZGVdN4vl8l8_sincos_avx2) - movq 64(%rsp), %rdx - movq 96(%rsp), %rsi - movq 72(%rsp), %r8 - movq 104(%rsp), %r10 - movq (%rsp), %rax - movq 32(%rsp), %rcx - movq 8(%rsp), %rdi - movq 40(%rsp), %r9 - movq %rax, (%rdx) - movq %rcx, (%rsi) - movq 80(%rsp), %rax - movq 112(%rsp), %rcx - movq %rdi, (%r8) - movq %r9, (%r10) - movq 88(%rsp), %rdi - movq 120(%rsp), %r9 - movq 16(%rsp), %r11 - movq 48(%rsp), %rdx - movq 24(%rsp), %rsi - movq 56(%rsp), %r8 - movq %r11, (%rax) - movq %rdx, (%rcx) - movq %rsi, (%rdi) - movq %r8, (%r9) - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret -#else - leal 8(%rsp), %r10d - .cfi_def_cfa 10, 0 - andl $-32, %esp - pushq -8(%r10d) - pushq %rbp - .cfi_escape 0x10,0x6,0x2,0x76,0 - movl %esp, %ebp - pushq %r10 - .cfi_escape 0xf,0x3,0x76,0x78,0x6 - leal -48(%rbp), %esi - leal -80(%rbp), %edi - subl $104, %esp - vmovaps %xmm1, -96(%ebp) - vmovaps %xmm2, -112(%ebp) - call HIDDEN_JUMPTARGET(_ZGVdN4vl8l8_sincos_avx2) - movl -96(%ebp), %eax - vmovsd -80(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - movl -92(%ebp), %eax - vmovsd -72(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - movl -88(%ebp), %eax - vmovsd -64(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - movl -84(%ebp), %eax - vmovsd -56(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - movl 
-112(%ebp), %eax - vmovsd -48(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - movl -108(%ebp), %eax - vmovsd -40(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - movl -104(%ebp), %eax - vmovsd -32(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - movl -100(%ebp), %eax - vmovsd -24(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - addl $104, %esp - popq %r10 - .cfi_def_cfa 10, 0 - popq %rbp - leal -8(%r10), %esp - .cfi_def_cfa 7, 8 - ret -#endif -END (_ZGVdN4vvv_sincos_avx2) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S deleted file mode 100644 index 3c0abc379e..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized sincos. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN8vvv_sincos) - .type _ZGVeN8vvv_sincos, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN8vvv_sincos_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN8vvv_sincos_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN8vvv_sincos) - -#define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper -#include "../svml_d_sincos8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S deleted file mode 100644 index c9207558c5..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S +++ /dev/null @@ -1,763 +0,0 @@ -/* Function sincos vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include "svml_d_trig_data.h" -#include "svml_d_wrapper_impl.h" - -/* - ALGORITHM DESCRIPTION: - - ( low accuracy ( < 4ulp ) or enhanced performance - ( half of correct mantissa ) implementation ) - - Argument representation: - arg = N*Pi + R - - Result calculation: - sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) - arg + Pi/2 = (N'*Pi + R') - cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R') - sin(R), sin(R') are approximated by corresponding polynomial. */ - - .text -ENTRY (_ZGVeN8vl8l8_sincos_knl) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos -#else - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1344, %rsp - movq __svml_d_trig_data@GOTPCREL(%rip), %rax - vmovaps %zmm0, %zmm4 - movq $-1, %rdx - vmovups __dSignMask(%rax), %zmm12 - vmovups __dInvPI(%rax), %zmm5 - -/* ARGUMENT RANGE REDUCTION: - Absolute argument: X' = |X| */ - vpandnq %zmm4, %zmm12, %zmm3 - vmovups __dPI1_FMA(%rax), %zmm7 - vmovups __dPI3_FMA(%rax), %zmm9 - -/* SinR = X' - SinN*Pi1 */ - vmovaps %zmm3, %zmm8 - -/* CosR = X' - CosN*Pi1 */ - vmovaps %zmm3, %zmm10 - -/* SinY = X'*InvPi + RS : right shifter add */ - vfmadd213pd __dRShifter(%rax), %zmm3, %zmm5 - vmovups __dC6(%rax), %zmm13 - -/* SinN = Y - RS : right shifter sub */ - vsubpd __dRShifter(%rax), %zmm5, %zmm1 - vmovaps %zmm13, %zmm14 - -/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ - vpsllq $63, %zmm5, %zmm2 - vcmppd $22, __dRangeVal(%rax), %zmm3, %k1 - -/* Update CosRSign and CosSignRes signs */ - vmovaps %zmm12, %zmm5 - vfnmadd231pd %zmm1, %zmm7, %zmm8 - -/* SinR = SinR - SinN*Pi2 */ - vfnmadd231pd __dPI2_FMA(%rax), %zmm1, %zmm8 - -/* Sine result sign: SinRSign = SignMask & SinR */ - vpandq %zmm8, %zmm12, %zmm11 - -/* Set SinRSign to 0.5 */ - vporq __dOneHalf(%rax), %zmm11, %zmm6 - vpternlogq $150, %zmm2, %zmm11, %zmm5 - -/* Update sign SinSignRes */ - 
vpternlogq $120, %zmm4, %zmm12, %zmm2 - -/* Polynomial approximation */ - vmovups __dC7(%rax), %zmm11 - -/* CosN = SinN +(-)0.5 */ - vaddpd %zmm6, %zmm1, %zmm0 - -/* SinR = SinR - SinN*Pi3 */ - vfnmadd213pd %zmm8, %zmm9, %zmm1 - vfnmadd231pd %zmm0, %zmm7, %zmm10 - -/* SinR2 = SinR^2 */ - vmulpd %zmm1, %zmm1, %zmm15 - -/* Grab SignX - CosR = CosR - CosN*Pi2 */ - vfnmadd231pd __dPI2_FMA(%rax), %zmm0, %zmm10 - vfmadd231pd __dC7(%rax), %zmm15, %zmm14 - -/* CosR = CosR - CosN*Pi3 */ - vfnmadd213pd %zmm10, %zmm9, %zmm0 - vfmadd213pd __dC5(%rax), %zmm15, %zmm14 - -/* CosR2 = CosR^2 */ - vmulpd %zmm0, %zmm0, %zmm12 - vfmadd213pd __dC4(%rax), %zmm15, %zmm14 - vfmadd213pd %zmm13, %zmm12, %zmm11 - -/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ - vfmadd213pd __dC3(%rax), %zmm15, %zmm14 - vfmadd213pd __dC5(%rax), %zmm12, %zmm11 - -/* SinPoly = C2 + SinR2*SinPoly */ - vfmadd213pd __dC2(%rax), %zmm15, %zmm14 - vfmadd213pd __dC4(%rax), %zmm12, %zmm11 - -/* SinPoly = C1 + SinR2*SinPoly */ - vfmadd213pd __dC1(%rax), %zmm15, %zmm14 - -/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ - vfmadd213pd __dC3(%rax), %zmm12, %zmm11 - -/* SinPoly = SinR2*SinPoly */ - vmulpd %zmm15, %zmm14, %zmm13 - -/* CosPoly = C2 + CosR2*CosPoly */ - vfmadd213pd __dC2(%rax), %zmm12, %zmm11 - -/* SinPoly = SinR*SinPoly */ - vfmadd213pd %zmm1, %zmm1, %zmm13 - vpbroadcastq %rdx, %zmm1{%k1}{z} - -/* CosPoly = C1 + CosR2*CosPoly */ - vfmadd213pd __dC1(%rax), %zmm12, %zmm11 - vptestmq %zmm1, %zmm1, %k0 - kmovw %k0, %ecx - -/* CosPoly = CosR2*CosPoly */ - vmulpd %zmm12, %zmm11, %zmm14 - movzbl %cl, %ecx - -/* CosPoly = CosR*CosPoly */ - vfmadd213pd %zmm0, %zmm0, %zmm14 - -/* Final reconstruction. 
- Update Sin result's sign */ - vpxorq %zmm2, %zmm13, %zmm0 - -/* Update Cos result's sign */ - vpxorq %zmm5, %zmm14, %zmm2 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - vmovups %zmm0, (%rdi) - vmovups %zmm2, (%rsi) - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovups %zmm4, 1152(%rsp) - vmovups %zmm0, 1216(%rsp) - vmovups %zmm2, 1280(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - kmovw %k4, 1048(%rsp) - xorl %eax, %eax - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %eax, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %ecx, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - movq %rbx, 1064(%rsp) - movq %rdi, %rbx - cfi_remember_state - -.LBL_1_6: - btl %r13d, %r14d - jc .LBL_1_13 - -.LBL_1_7: - lea 1(%r13), %esi - btl %esi, %r14d - jc .LBL_1_10 - -.LBL_1_8: - addb $1, %r12b - addl $2, %r13d - cmpb $16, %r12b - jb .LBL_1_6 - - movq %rbx, %rdi - kmovw 1048(%rsp), %k4 - movq 1056(%rsp), %rsi - kmovw 1040(%rsp), %k5 - movq 1096(%rsp), %r12 - cfi_restore (%r12) - kmovw 1032(%rsp), %k6 - movq 1088(%rsp), %r13 - cfi_restore (%r13) - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - 
vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - movq 1064(%rsp), %rbx - vmovups 1216(%rsp), %zmm0 - vmovups 1280(%rsp), %zmm2 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1160(%rsp,%r15), %xmm0 - - call JUMPTARGET(sin) - - vmovsd %xmm0, 1224(%rsp,%r15) - vmovsd 1160(%rsp,%r15), %xmm0 - - call JUMPTARGET(cos) - - vmovsd %xmm0, 1288(%rsp,%r15) - jmp .LBL_1_8 - -.LBL_1_13: - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1152(%rsp,%r15), %xmm0 - - call JUMPTARGET(sin) - - vmovsd %xmm0, 1216(%rsp,%r15) - vmovsd 1152(%rsp,%r15), %xmm0 - - call JUMPTARGET(cos) - - vmovsd %xmm0, 1280(%rsp,%r15) - jmp .LBL_1_7 - -#endif -END (_ZGVeN8vl8l8_sincos_knl) -libmvec_hidden_def(_ZGVeN8vl8l8_sincos_knl) - -ENTRY (_ZGVeN8vl8l8_sincos_skx) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos -#else - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1344, %rsp - movq __svml_d_trig_data@GOTPCREL(%rip), %rax - vmovaps %zmm0, %zmm8 - vmovups __dSignMask(%rax), %zmm4 - vmovups __dInvPI(%rax), %zmm9 - vmovups __dRShifter(%rax), %zmm10 - vmovups __dPI1_FMA(%rax), %zmm13 - vmovups __dPI2_FMA(%rax), %zmm14 - vmovups __dOneHalf(%rax), %zmm11 - vmovups __dPI3_FMA(%rax), %zmm2 - -/* ARGUMENT RANGE REDUCTION: - Absolute argument: X' = |X| */ - vandnpd %zmm8, %zmm4, %zmm7 - -/* SinY = X'*InvPi + RS : right shifter add */ - vfmadd213pd %zmm10, %zmm7, %zmm9 - vcmppd $18, __dRangeVal(%rax), %zmm7, %k1 - -/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ - vpsllq $63, %zmm9, %zmm6 - 
-/* SinN = Y - RS : right shifter sub */ - vsubpd %zmm10, %zmm9, %zmm5 - vmovups __dC5(%rax), %zmm9 - vmovups __dC4(%rax), %zmm10 - -/* SinR = X' - SinN*Pi1 */ - vmovaps %zmm7, %zmm15 - vfnmadd231pd %zmm5, %zmm13, %zmm15 - -/* SinR = SinR - SinN*Pi2 */ - vfnmadd231pd %zmm5, %zmm14, %zmm15 - -/* Sine result sign: SinRSign = SignMask & SinR */ - vandpd %zmm15, %zmm4, %zmm1 - -/* Set SinRSign to 0.5 */ - vorpd %zmm1, %zmm11, %zmm12 - vmovups __dC3(%rax), %zmm11 - -/* CosN = SinN +(-)0.5 */ - vaddpd %zmm12, %zmm5, %zmm3 - -/* SinR = SinR - SinN*Pi3 */ - vfnmadd213pd %zmm15, %zmm2, %zmm5 - vmovups __dC2(%rax), %zmm12 - -/* SinR2 = SinR^2 */ - vmulpd %zmm5, %zmm5, %zmm15 - -/* CosR = X' - CosN*Pi1 */ - vmovaps %zmm7, %zmm0 - vfnmadd231pd %zmm3, %zmm13, %zmm0 - vmovups __dC1(%rax), %zmm13 - -/* Grab SignX - CosR = CosR - CosN*Pi2 */ - vfnmadd231pd %zmm3, %zmm14, %zmm0 - -/* CosR = CosR - CosN*Pi3 */ - vfnmadd213pd %zmm0, %zmm2, %zmm3 - -/* Polynomial approximation */ - vmovups __dC7(%rax), %zmm0 - -/* Update CosRSign and CosSignRes signs */ - vmovaps %zmm4, %zmm2 - vpternlogq $150, %zmm6, %zmm1, %zmm2 - -/* Update sign SinSignRes */ - vpternlogq $120, %zmm8, %zmm4, %zmm6 - -/* CosR2 = CosR^2 */ - vmulpd %zmm3, %zmm3, %zmm1 - vmovups __dC6(%rax), %zmm4 - vmovaps %zmm0, %zmm14 - vfmadd213pd %zmm4, %zmm1, %zmm0 - vfmadd213pd %zmm4, %zmm15, %zmm14 - vfmadd213pd %zmm9, %zmm1, %zmm0 - vfmadd213pd %zmm9, %zmm15, %zmm14 - vfmadd213pd %zmm10, %zmm1, %zmm0 - vfmadd213pd %zmm10, %zmm15, %zmm14 - -/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ - vfmadd213pd %zmm11, %zmm1, %zmm0 - -/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ - vfmadd213pd %zmm11, %zmm15, %zmm14 - -/* CosPoly = C2 + CosR2*CosPoly */ - vfmadd213pd %zmm12, %zmm1, %zmm0 - -/* SinPoly = C2 + SinR2*SinPoly */ - vfmadd213pd %zmm12, %zmm15, %zmm14 - -/* CosPoly = C1 + CosR2*CosPoly */ - vfmadd213pd %zmm13, %zmm1, %zmm0 - -/* SinPoly = C1 + SinR2*SinPoly */ - vfmadd213pd %zmm13, 
%zmm15, %zmm14 - -/* CosPoly = CosR2*CosPoly */ - vmulpd %zmm1, %zmm0, %zmm1 - -/* SinPoly = SinR2*SinPoly */ - vmulpd %zmm15, %zmm14, %zmm4 - -/* CosPoly = CosR*CosPoly */ - vfmadd213pd %zmm3, %zmm3, %zmm1 - -/* SinPoly = SinR*SinPoly */ - vfmadd213pd %zmm5, %zmm5, %zmm4 - vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 - -/* Update Cos result's sign */ - vxorpd %zmm2, %zmm1, %zmm1 - -/* Final reconstruction. - Update Sin result's sign */ - vxorpd %zmm6, %zmm4, %zmm0 - vpandnq %zmm7, %zmm7, %zmm3{%k1} - vcmppd $3, %zmm3, %zmm3, %k0 - kmovw %k0, %ecx - testl %ecx, %ecx - jne .LBL_2_3 - -.LBL_2_2: - cfi_remember_state - vmovups %zmm0, (%rdi) - vmovups %zmm1, (%rsi) - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_2_3: - cfi_restore_state - vmovups %zmm8, 1152(%rsp) - vmovups %zmm0, 1216(%rsp) - vmovups %zmm1, 1280(%rsp) - je .LBL_2_2 - - xorb %dl, %dl - xorl %eax, %eax - kmovw %k4, 1048(%rsp) - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %eax, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %ecx, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - movq %rbx, 1064(%rsp) - movq %rdi, %rbx - cfi_remember_state - -.LBL_2_6: - btl %r13d, %r14d - jc .LBL_2_13 - -.LBL_2_7: - lea 1(%r13), %esi - btl %esi, %r14d - jc .LBL_2_10 - -.LBL_2_8: - incb 
%r12b - addl $2, %r13d - cmpb $16, %r12b - jb .LBL_2_6 - - kmovw 1048(%rsp), %k4 - movq %rbx, %rdi - kmovw 1040(%rsp), %k5 - kmovw 1032(%rsp), %k6 - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - vmovups 1216(%rsp), %zmm0 - vmovups 1280(%rsp), %zmm1 - movq 1056(%rsp), %rsi - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - cfi_restore (%r13) - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - movq 1064(%rsp), %rbx - jmp .LBL_2_2 - -.LBL_2_10: - cfi_restore_state - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1160(%rsp,%r15), %xmm0 - vzeroupper - vmovsd 1160(%rsp,%r15), %xmm0 - - call JUMPTARGET(sin) - - vmovsd %xmm0, 1224(%rsp,%r15) - vmovsd 1160(%rsp,%r15), %xmm0 - - call JUMPTARGET(cos) - - vmovsd %xmm0, 1288(%rsp,%r15) - jmp .LBL_2_8 - -.LBL_2_13: - movzbl %r12b, %r15d - shlq $4, %r15 - vmovsd 1152(%rsp,%r15), %xmm0 - vzeroupper - vmovsd 1152(%rsp,%r15), %xmm0 - - call JUMPTARGET(sin) - - vmovsd %xmm0, 1216(%rsp,%r15) - vmovsd 1152(%rsp,%r15), %xmm0 - - call JUMPTARGET(cos) - - vmovsd %xmm0, 1280(%rsp,%r15) - jmp .LBL_2_7 - -#endif -END (_ZGVeN8vl8l8_sincos_skx) -libmvec_hidden_def(_ZGVeN8vl8l8_sincos_skx) - -/* Wrapper between vvv and vl8l8 vector variants. */ -.macro WRAPPER_AVX512_vvv_vl8l8 callee -#ifndef __ILP32__ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $256, %rsp - /* Encoding for vmovups %zmm1, 128(%rsp). 
*/ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x11 - .byte 0x4c - .byte 0x24 - .byte 0x02 - lea (%rsp), %rdi - /* Encoding for vmovups %zmm2, 192(%rdi). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x11 - .byte 0x57 - .byte 0x03 - lea 64(%rsp), %rsi - call HIDDEN_JUMPTARGET(\callee) - movq 128(%rsp), %rdx - movq 136(%rsp), %rsi - movq 144(%rsp), %r8 - movq 152(%rsp), %r10 - movq (%rsp), %rax - movq 8(%rsp), %rcx - movq 16(%rsp), %rdi - movq 24(%rsp), %r9 - movq %rax, (%rdx) - movq %rcx, (%rsi) - movq 160(%rsp), %rax - movq 168(%rsp), %rcx - movq %rdi, (%r8) - movq %r9, (%r10) - movq 176(%rsp), %rdi - movq 184(%rsp), %r9 - movq 32(%rsp), %r11 - movq 40(%rsp), %rdx - movq 48(%rsp), %rsi - movq 56(%rsp), %r8 - movq %r11, (%rax) - movq %rdx, (%rcx) - movq 192(%rsp), %r11 - movq 200(%rsp), %rdx - movq %rsi, (%rdi) - movq %r8, (%r9) - movq 208(%rsp), %rsi - movq 216(%rsp), %r8 - movq 64(%rsp), %r10 - movq 72(%rsp), %rax - movq 80(%rsp), %rcx - movq 88(%rsp), %rdi - movq %r10, (%r11) - movq %rax, (%rdx) - movq 224(%rsp), %r10 - movq 232(%rsp), %rax - movq %rcx, (%rsi) - movq %rdi, (%r8) - movq 240(%rsp), %rcx - movq 248(%rsp), %rdi - movq 96(%rsp), %r9 - movq 104(%rsp), %r11 - movq 112(%rsp), %rdx - movq 120(%rsp), %rsi - movq %r9, (%r10) - movq %r11, (%rax) - movq %rdx, (%rcx) - movq %rsi, (%rdi) - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret -#else - leal 8(%rsp), %r10d - .cfi_def_cfa 10, 0 - andl $-64, %esp - pushq -8(%r10d) - pushq %rbp - .cfi_escape 0x10,0x6,0x2,0x76,0 - movl %esp, %ebp - pushq %r10 - .cfi_escape 0xf,0x3,0x76,0x78,0x6 - leal -112(%rbp), %esi - leal -176(%rbp), %edi - subl $232, %esp - vmovdqa %ymm1, -208(%ebp) - vmovdqa %ymm2, -240(%ebp) - call HIDDEN_JUMPTARGET(\callee) - vmovdqa -208(%ebp), %xmm0 - vmovq %xmm0, %rax - vmovsd -176(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - shrq $32, %rax - vmovsd -168(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - movq 
-200(%ebp), %rax - vmovsd -160(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - shrq $32, %rax - vmovsd -152(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - movq -192(%ebp), %rax - vmovsd -144(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - shrq $32, %rax - vmovsd -136(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - movq -184(%ebp), %rax - vmovsd -128(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - shrq $32, %rax - vmovsd -120(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - vmovdqa -240(%ebp), %xmm0 - vmovq %xmm0, %rax - vmovsd -112(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - shrq $32, %rax - vmovsd -104(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - movq -232(%ebp), %rax - vmovsd -96(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - shrq $32, %rax - vmovsd -88(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - movq -224(%ebp), %rax - vmovsd -80(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - shrq $32, %rax - vmovsd -72(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - movq -216(%ebp), %rax - vmovsd -64(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - shrq $32, %rax - vmovsd -56(%ebp), %xmm0 - vmovsd %xmm0, (%eax) - addl $232, %esp - popq %r10 - .cfi_def_cfa 10, 0 - popq %rbp - leal -8(%r10), %esp - .cfi_def_cfa 7, 8 - ret -#endif -.endm - -ENTRY (_ZGVeN8vvv_sincos_knl) -WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_knl -END (_ZGVeN8vvv_sincos_knl) - -ENTRY (_ZGVeN8vvv_sincos_skx) -WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx -END (_ZGVeN8vvv_sincos_skx) - - .section .rodata, "a" -.L_2il0floatpacket.15: - .long 0xffffffff,0xffffffff - .type .L_2il0floatpacket.15,@object diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S deleted file mode 100644 index cd67665972..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized cosf. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN16v_cosf) - .type _ZGVeN16v_cosf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN16v_cosf_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN16v_cosf_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN16v_cosf) - -#define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper -#include "../svml_s_cosf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S deleted file mode 100644 index 611bb5dd2d..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S +++ /dev/null @@ -1,460 +0,0 @@ -/* Function cosf vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_s_trig_data.h" -#include "svml_s_wrapper_impl.h" - - .text -ENTRY (_ZGVeN16v_cosf_knl) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf -#else -/* - ALGORITHM DESCRIPTION: - - 1) Range reduction to [-Pi/2; +Pi/2] interval - a) We remove sign using AND operation - b) Add Pi/2 value to argument X for Cos to Sin transformation - c) Getting octant Y by 1/Pi multiplication - d) Add "Right Shifter" value - e) Treat obtained value as integer for destination sign setting. - Shift first bit of this value to the last (sign) position - f) Subtract "Right Shifter" value - g) Subtract 0.5 from result for octant correction - h) Subtract Y*PI from X argument, where PI divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; - 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) - a) Calculate X^2 = X * X - b) Calculate polynomial: - R = X + X * X^2 * (A3 + x^2 * (A5 + ..... 
- 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R = XOR( R, S ); - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1280, %rsp - movq __svml_s_trig_data@GOTPCREL(%rip), %rdx - -/* - h) Subtract Y*PI from X argument, where PI divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 - */ - vmovaps %zmm0, %zmm6 - movl $-1, %eax - -/* b) Add Pi/2 value to argument X for Cos to Sin transformation */ - vaddps __sHalfPI(%rdx), %zmm0, %zmm2 - vmovups __sRShifter(%rdx), %zmm3 - -/* - 1) Range reduction to [-Pi/2; +Pi/2] interval - c) Getting octant Y by 1/Pi multiplication - d) Add "Right Shifter" (0x4B000000) value - */ - vfmadd132ps __sInvPI(%rdx), %zmm3, %zmm2 - vmovups __sPI1_FMA(%rdx), %zmm5 - -/* f) Subtract "Right Shifter" (0x4B000000) value */ - vsubps %zmm3, %zmm2, %zmm4 - vmovups __sA9_FMA(%rdx), %zmm9 - -/* Check for large and special arguments */ - vpandd __sAbsMask(%rdx), %zmm0, %zmm1 - -/* - e) Treat obtained value as integer for destination sign setting. 
- Shift first bit of this value to the last (sign) position (S << 31) - */ - vpslld $31, %zmm2, %zmm8 - vcmpps $22, __sRangeReductionVal(%rdx), %zmm1, %k1 - vpbroadcastd %eax, %zmm12{%k1}{z} - -/* g) Subtract 0.5 from result for octant correction */ - vsubps __sOneHalf(%rdx), %zmm4, %zmm7 - vptestmd %zmm12, %zmm12, %k0 - vfnmadd231ps %zmm7, %zmm5, %zmm6 - kmovw %k0, %ecx - vfnmadd231ps __sPI2_FMA(%rdx), %zmm7, %zmm6 - vfnmadd132ps __sPI3_FMA(%rdx), %zmm6, %zmm7 - -/* a) Calculate X^2 = X * X */ - vmulps %zmm7, %zmm7, %zmm10 - -/* - 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R = XOR( R, S ); - */ - vpxord %zmm8, %zmm7, %zmm11 - -/* - b) Calculate polynomial: - R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9)))); - */ - vfmadd213ps __sA7_FMA(%rdx), %zmm10, %zmm9 - vfmadd213ps __sA5_FMA(%rdx), %zmm10, %zmm9 - vfmadd213ps __sA3(%rdx), %zmm10, %zmm9 - vmulps %zmm10, %zmm9, %zmm1 - vfmadd213ps %zmm11, %zmm11, %zmm1 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - vmovaps %zmm1, %zmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovups %zmm0, 1152(%rsp) - vmovups %zmm1, 1216(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - kmovw %k4, 1048(%rsp) - xorl %eax, %eax - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 
1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - addb $1, %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - kmovw 1048(%rsp), %k4 - movq 1064(%rsp), %rsi - kmovw 1040(%rsp), %k5 - movq 1056(%rsp), %rdi - kmovw 1032(%rsp), %k6 - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - cfi_restore (%r13) - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - vmovups 1216(%rsp), %zmm1 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - vmovss 1156(%rsp,%r15,8), %xmm0 - call JUMPTARGET(cosf) - vmovss %xmm0, 1220(%rsp,%r15,8) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - vmovss 1152(%rsp,%r15,8), %xmm0 - call JUMPTARGET(cosf) - vmovss %xmm0, 1216(%rsp,%r15,8) - jmp .LBL_1_7 -#endif -END (_ZGVeN16v_cosf_knl) - -ENTRY (_ZGVeN16v_cosf_skx) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf -#else -/* - ALGORITHM DESCRIPTION: - - 1) Range reduction to [-Pi/2; +Pi/2] interval - a) We remove sign using AND operation - b) Add Pi/2 value to argument X for Cos to Sin transformation - c) Getting octant Y by 1/Pi multiplication - d) Add "Right Shifter" value - e) Treat obtained value as integer for destination sign setting. 
- Shift first bit of this value to the last (sign) position - f) Subtract "Right Shifter" value - g) Subtract 0.5 from result for octant correction - h) Subtract Y*PI from X argument, where PI divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; - 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) - a) Calculate X^2 = X * X - b) Calculate polynomial: - R = X + X * X^2 * (A3 + x^2 * (A5 + ..... - 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R = XOR( R, S ); - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1280, %rsp - movq __svml_s_trig_data@GOTPCREL(%rip), %rax - -/* - h) Subtract Y*PI from X argument, where PI divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 - */ - vmovaps %zmm0, %zmm6 - vmovups .L_2il0floatpacket.13(%rip), %zmm12 - vmovups __sRShifter(%rax), %zmm3 - vmovups __sPI1_FMA(%rax), %zmm5 - vmovups __sA9_FMA(%rax), %zmm9 - -/* b) Add Pi/2 value to argument X for Cos to Sin transformation */ - vaddps __sHalfPI(%rax), %zmm0, %zmm2 - -/* Check for large and special arguments */ - vandps __sAbsMask(%rax), %zmm0, %zmm1 - -/* - 1) Range reduction to [-Pi/2; +Pi/2] interval - c) Getting octant Y by 1/Pi multiplication - d) Add "Right Shifter" (0x4B000000) value - */ - vfmadd132ps __sInvPI(%rax), %zmm3, %zmm2 - vcmpps $18, __sRangeReductionVal(%rax), %zmm1, %k1 - -/* - e) Treat obtained value as integer for destination sign setting. 
- Shift first bit of this value to the last (sign) position (S << 31) - */ - vpslld $31, %zmm2, %zmm8 - -/* f) Subtract "Right Shifter" (0x4B000000) value */ - vsubps %zmm3, %zmm2, %zmm4 - -/* g) Subtract 0.5 from result for octant correction */ - vsubps __sOneHalf(%rax), %zmm4, %zmm7 - vfnmadd231ps %zmm7, %zmm5, %zmm6 - vfnmadd231ps __sPI2_FMA(%rax), %zmm7, %zmm6 - vfnmadd132ps __sPI3_FMA(%rax), %zmm6, %zmm7 - -/* a) Calculate X^2 = X * X */ - vmulps %zmm7, %zmm7, %zmm10 - -/* - 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R = XOR( R, S ); - */ - vxorps %zmm8, %zmm7, %zmm11 - -/* - b) Calculate polynomial: - R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9)))); - */ - vfmadd213ps __sA7_FMA(%rax), %zmm10, %zmm9 - vfmadd213ps __sA5_FMA(%rax), %zmm10, %zmm9 - vfmadd213ps __sA3(%rax), %zmm10, %zmm9 - vpandnd %zmm1, %zmm1, %zmm12{%k1} - vmulps %zmm10, %zmm9, %zmm1 - vptestmd %zmm12, %zmm12, %k0 - vfmadd213ps %zmm11, %zmm11, %zmm1 - kmovw %k0, %ecx - testl %ecx, %ecx - jne .LBL_2_3 -.LBL_2_2: - cfi_remember_state - vmovaps %zmm1, %zmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_2_3: - cfi_restore_state - vmovups %zmm0, 1152(%rsp) - vmovups %zmm1, 1216(%rsp) - je .LBL_2_2 - - xorb %dl, %dl - xorl %eax, %eax - kmovw %k4, 1048(%rsp) - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - 
movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - -.LBL_2_6: - btl %r14d, %r13d - jc .LBL_2_12 -.LBL_2_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_2_10 -.LBL_2_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_2_6 - kmovw 1048(%rsp), %k4 - kmovw 1040(%rsp), %k5 - kmovw 1032(%rsp), %k6 - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - vmovups 1216(%rsp), %zmm1 - movq 1064(%rsp), %rsi - movq 1056(%rsp), %rdi - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - cfi_restore (%r13) - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - jmp .LBL_2_2 - -.LBL_2_10: - cfi_restore_state - movzbl %r12b, %r15d - vmovss 1156(%rsp,%r15,8), %xmm0 - vzeroupper - vmovss 1156(%rsp,%r15,8), %xmm0 - call JUMPTARGET(cosf) - vmovss %xmm0, 1220(%rsp,%r15,8) - jmp .LBL_2_8 -.LBL_2_12: - movzbl %r12b, %r15d - vmovss 1152(%rsp,%r15,8), %xmm0 - vzeroupper - vmovss 1152(%rsp,%r15,8), %xmm0 - call JUMPTARGET(cosf) - vmovss %xmm0, 1216(%rsp,%r15,8) - jmp .LBL_2_7 -#endif -END (_ZGVeN16v_cosf_skx) - - .section .rodata, "a" -.L_2il0floatpacket.13: - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff - .type .L_2il0floatpacket.13,@object diff --git 
a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S deleted file mode 100644 index d73d7c7e3f..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized cosf, vector length is 4. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN4v_cosf) - .type _ZGVbN4v_cosf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN4v_cosf_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN4v_cosf_sse2(%rip), %rax - ret -END (_ZGVbN4v_cosf) -libmvec_hidden_def (_ZGVbN4v_cosf) - -#define _ZGVbN4v_cosf _ZGVbN4v_cosf_sse2 -#include "../svml_s_cosf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S deleted file mode 100644 index 73797e1a93..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S +++ /dev/null @@ -1,227 +0,0 @@ -/* Function cosf vectorized with SSE4. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_s_trig_data.h" - - .text -ENTRY (_ZGVbN4v_cosf_sse4) -/* - ALGORITHM DESCRIPTION: - - 1) Range reduction to [-Pi/2; +Pi/2] interval - a) We remove sign using AND operation - b) Add Pi/2 value to argument X for Cos to Sin transformation - c) Getting octant Y by 1/Pi multiplication - d) Add "Right Shifter" value - e) Treat obtained value as integer for destination sign setting. - Shift first bit of this value to the last (sign) position - f) Subtract "Right Shifter" value - g) Subtract 0.5 from result for octant correction - h) Subtract Y*PI from X argument, where PI divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; - 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) - a) Calculate X^2 = X * X - b) Calculate polynomial: - R = X + X * X^2 * (A3 + x^2 * (A5 + ..... 
- 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R = XOR( R, S ); - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $320, %rsp - movaps %xmm0, %xmm4 - movq __svml_s_trig_data@GOTPCREL(%rip), %rax - movups __sHalfPI(%rax), %xmm1 - movups __sRShifter(%rax), %xmm5 - -/* b) Add Pi/2 value to argument X for Cos to Sin transformation */ - addps %xmm4, %xmm1 - -/* - 1) Range reduction to [-Pi/2; +Pi/2] interval - c) Getting octant Y by 1/Pi multiplication - d) Add "Right Shifter" (0x4B000000) value - */ - mulps __sInvPI(%rax), %xmm1 - movups __sPI1(%rax), %xmm6 - addps %xmm5, %xmm1 - -/* - e) Treat obtained value as integer for destination sign setting. - Shift first bit of this value to the last (sign) position (S << 31) - */ - movaps %xmm1, %xmm2 - -/* f) Subtract "Right Shifter" (0x4B000000) value */ - subps %xmm5, %xmm1 - movups __sPI2(%rax), %xmm7 - pslld $31, %xmm2 - movups __sPI3(%rax), %xmm5 - movups __sAbsMask(%rax), %xmm3 - -/* Check for large and special arguments */ - andps %xmm4, %xmm3 - -/* g) Subtract 0.5 from result for octant correction */ - subps __sOneHalf(%rax), %xmm1 - cmpnleps __sRangeReductionVal(%rax), %xmm3 - -/* - h) Subtract Y*PI from X argument, where PI divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; - */ - mulps %xmm1, %xmm6 - mulps %xmm1, %xmm7 - mulps %xmm1, %xmm5 - subps %xmm6, %xmm0 - movmskps %xmm3, %ecx - movups __sPI4(%rax), %xmm6 - subps %xmm7, %xmm0 - mulps %xmm6, %xmm1 - subps %xmm5, %xmm0 - subps %xmm1, %xmm0 - -/* a) Calculate X^2 = X * X */ - movaps %xmm0, %xmm1 - mulps %xmm0, %xmm1 - -/* - 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R = XOR( R, S ); - */ - xorps %xmm2, %xmm0 - movups __sA9(%rax), %xmm2 - -/* - b) Calculate polynomial: - R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9)))); - */ - mulps %xmm1, %xmm2 - addps __sA7(%rax), 
%xmm2 - mulps %xmm1, %xmm2 - addps __sA5(%rax), %xmm2 - mulps %xmm1, %xmm2 - addps __sA3(%rax), %xmm2 - mulps %xmm2, %xmm1 - mulps %xmm0, %xmm1 - addps %xmm1, %xmm0 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - movups %xmm4, 192(%rsp) - movups %xmm0, 256(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - movups %xmm8, 112(%rsp) - movups %xmm9, 96(%rsp) - movups %xmm10, 80(%rsp) - movups %xmm11, 64(%rsp) - movups %xmm12, 48(%rsp) - movups %xmm13, 32(%rsp) - movups %xmm14, 16(%rsp) - movups %xmm15, (%rsp) - movq %rsi, 136(%rsp) - movq %rdi, 128(%rsp) - movq %r12, 168(%rsp) - cfi_offset_rel_rsp (12, 168) - movb %dl, %r12b - movq %r13, 160(%rsp) - cfi_offset_rel_rsp (13, 160) - movl %ecx, %r13d - movq %r14, 152(%rsp) - cfi_offset_rel_rsp (14, 152) - movl %eax, %r14d - movq %r15, 144(%rsp) - cfi_offset_rel_rsp (15, 144) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - movups 112(%rsp), %xmm8 - movups 96(%rsp), %xmm9 - movups 80(%rsp), %xmm10 - movups 64(%rsp), %xmm11 - movups 48(%rsp), %xmm12 - movups 32(%rsp), %xmm13 - movups 16(%rsp), %xmm14 - movups (%rsp), %xmm15 - movq 136(%rsp), %rsi - movq 128(%rsp), %rdi - movq 168(%rsp), %r12 - cfi_restore (%r12) - movq 160(%rsp), %r13 - cfi_restore (%r13) - movq 152(%rsp), %r14 - cfi_restore (%r14) - movq 144(%rsp), %r15 - cfi_restore (%r15) - movups 256(%rsp), %xmm0 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - movss 196(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(cosf) - - movss %xmm0, 260(%rsp,%r15,8) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - movss 192(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(cosf) - - movss %xmm0, 256(%rsp,%r15,8) - jmp .LBL_1_7 -END 
(_ZGVbN4v_cosf_sse4) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S deleted file mode 100644 index f7530c138a..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized cosf, vector length is 8. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN8v_cosf) - .type _ZGVdN8v_cosf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN8v_cosf_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN8v_cosf_sse_wrapper(%rip), %rax - ret -END (_ZGVdN8v_cosf) -libmvec_hidden_def (_ZGVdN8v_cosf) - -#define _ZGVdN8v_cosf _ZGVdN8v_cosf_sse_wrapper -#include "../svml_s_cosf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S deleted file mode 100644 index c61add3bb9..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S +++ /dev/null @@ -1,215 +0,0 @@ -/* Function cosf vectorized with AVX2. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - - -#include <sysdep.h> -#include "svml_s_trig_data.h" - - .text -ENTRY (_ZGVdN8v_cosf_avx2) -/* - ALGORITHM DESCRIPTION: - - 1) Range reduction to [-Pi/2; +Pi/2] interval - a) We remove sign using AND operation - b) Add Pi/2 value to argument X for Cos to Sin transformation - c) Getting octant Y by 1/Pi multiplication - d) Add "Right Shifter" value - e) Treat obtained value as integer for destination sign setting. - Shift first bit of this value to the last (sign) position - f) Subtract "Right Shifter" value - g) Subtract 0.5 from result for octant correction - h) Subtract Y*PI from X argument, where PI divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; - 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) - a) Calculate X^2 = X * X - b) Calculate polynomial: - R = X + X * X^2 * (A3 + x^2 * (A5 + ..... 
- 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R = XOR( R, S ); - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $448, %rsp - movq __svml_s_trig_data@GOTPCREL(%rip), %rax - vmovaps %ymm0, %ymm2 - vmovups __sRShifter(%rax), %ymm5 - vmovups __sPI1_FMA(%rax), %ymm7 - -/* b) Add Pi/2 value to argument X for Cos to Sin transformation */ - vaddps __sHalfPI(%rax), %ymm2, %ymm4 - -/* - 1) Range reduction to [-Pi/2; +Pi/2] interval - c) Getting octant Y by 1/Pi multiplication - d) Add "Right Shifter" (0x4B000000) value - */ - vfmadd132ps __sInvPI(%rax), %ymm5, %ymm4 - -/* f) Subtract "Right Shifter" (0x4B000000) value */ - vsubps %ymm5, %ymm4, %ymm6 - -/* - e) Treat obtained value as integer for destination sign setting. - Shift first bit of this value to the last (sign) position (S << 31) - */ - vpslld $31, %ymm4, %ymm0 - -/* g) Subtract 0.5 from result for octant correction */ - vsubps __sOneHalf(%rax), %ymm6, %ymm4 - -/* Check for large and special arguments */ - vandps __sAbsMask(%rax), %ymm2, %ymm3 - vcmpnle_uqps __sRangeReductionVal(%rax), %ymm3, %ymm1 - -/* - h) Subtract Y*PI from X argument, where PI divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 - */ - vmovaps %ymm2, %ymm3 - vfnmadd231ps %ymm4, %ymm7, %ymm3 - vfnmadd231ps __sPI2_FMA(%rax), %ymm4, %ymm3 - vfnmadd132ps __sPI3_FMA(%rax), %ymm3, %ymm4 - -/* a) Calculate X^2 = X * X */ - vmulps %ymm4, %ymm4, %ymm5 - -/* - 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R = XOR( R, S ); - */ - vxorps %ymm0, %ymm4, %ymm6 - vmovups __sA9_FMA(%rax), %ymm0 - -/* - b) Calculate polynomial: - R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9)))) - */ - vfmadd213ps __sA7_FMA(%rax), %ymm5, %ymm0 - vfmadd213ps __sA5_FMA(%rax), %ymm5, %ymm0 - vfmadd213ps __sA3(%rax), %ymm5, %ymm0 - vmulps %ymm5, %ymm0, %ymm0 - vmovmskps %ymm1, %ecx - vfmadd213ps 
%ymm6, %ymm6, %ymm0 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovups %ymm2, 320(%rsp) - vmovups %ymm0, 384(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - vmovups %ymm8, 224(%rsp) - vmovups %ymm9, 192(%rsp) - vmovups %ymm10, 160(%rsp) - vmovups %ymm11, 128(%rsp) - vmovups %ymm12, 96(%rsp) - vmovups %ymm13, 64(%rsp) - vmovups %ymm14, 32(%rsp) - vmovups %ymm15, (%rsp) - movq %rsi, 264(%rsp) - movq %rdi, 256(%rsp) - movq %r12, 296(%rsp) - cfi_offset_rel_rsp (12, 296) - movb %dl, %r12b - movq %r13, 288(%rsp) - cfi_offset_rel_rsp (13, 288) - movl %ecx, %r13d - movq %r14, 280(%rsp) - cfi_offset_rel_rsp (14, 280) - movl %eax, %r14d - movq %r15, 272(%rsp) - cfi_offset_rel_rsp (15, 272) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - vmovups 224(%rsp), %ymm8 - vmovups 192(%rsp), %ymm9 - vmovups 160(%rsp), %ymm10 - vmovups 128(%rsp), %ymm11 - vmovups 96(%rsp), %ymm12 - vmovups 64(%rsp), %ymm13 - vmovups 32(%rsp), %ymm14 - vmovups (%rsp), %ymm15 - vmovups 384(%rsp), %ymm0 - movq 264(%rsp), %rsi - movq 256(%rsp), %rdi - movq 296(%rsp), %r12 - cfi_restore (%r12) - movq 288(%rsp), %r13 - cfi_restore (%r13) - movq 280(%rsp), %r14 - cfi_restore (%r14) - movq 272(%rsp), %r15 - cfi_restore (%r15) - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - vmovss 324(%rsp,%r15,8), %xmm0 - vzeroupper - - call JUMPTARGET(cosf) - - vmovss %xmm0, 388(%rsp,%r15,8) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - vmovss 320(%rsp,%r15,8), %xmm0 - vzeroupper - - call JUMPTARGET(cosf) - - vmovss %xmm0, 384(%rsp,%r15,8) - jmp .LBL_1_7 - -END (_ZGVdN8v_cosf_avx2) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S 
b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S deleted file mode 100644 index 3998f616aa..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized expf. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN16v_expf) - .type _ZGVeN16v_expf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN16v_expf_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN16v_expf_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN16v_expf_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN16v_expf) - -#define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper -#include "../svml_s_expf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S deleted file mode 100644 index e80b2be1a7..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S +++ /dev/null @@ -1,447 +0,0 @@ -/* Function expf vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_s_expf_data.h" -#include "svml_s_wrapper_impl.h" - - .text -ENTRY (_ZGVeN16v_expf_knl) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512 _ZGVdN8v_expf -#else -/* - ALGORITHM DESCRIPTION: - - Argument representation: - M = rint(X*2^k/ln2) = 2^k*N+j - X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r - then -ln2/2^(k+1) < r < ln2/2^(k+1) - Alternatively: - M = trunc(X*2^k/ln2) - then 0 < r < ln2/2^k - - Result calculation: - exp(X) = exp(N*ln2 + ln2*(j/2^k) + r) - = 2^N * 2^(j/2^k) * exp(r) - 2^N is calculated by bit manipulation - 2^(j/2^k) is computed from table lookup - exp(r) is approximated by polynomial - - The table lookup is skipped if k = 0. - For low accuracy approximation, exp(r) ~ 1 or 1+r. 
*/ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1280, %rsp - movq __svml_sexp_data@GOTPCREL(%rip), %rax - -/* r = x-n*ln2_hi/2^k */ - vmovaps %zmm0, %zmm6 - -/* compare against threshold */ - movl $-1, %ecx - vmovups __sInvLn2(%rax), %zmm3 - vmovups __sLn2hi(%rax), %zmm5 - -/* m = x*2^k/ln2 + shifter */ - vfmadd213ps __sShifter(%rax), %zmm0, %zmm3 - vmovups __sPC5(%rax), %zmm9 - -/* n = m - shifter = rint(x*2^k/ln2) */ - vsubps __sShifter(%rax), %zmm3, %zmm7 - -/* remove sign of x by "and" operation */ - vpandd __iAbsMask(%rax), %zmm0, %zmm1 - vpaddd __iBias(%rax), %zmm3, %zmm4 - vpcmpgtd __iDomainRange(%rax), %zmm1, %k1 - -/* compute 2^N with "shift" */ - vpslld $23, %zmm4, %zmm8 - vfnmadd231ps %zmm7, %zmm5, %zmm6 - vpbroadcastd %ecx, %zmm2{%k1}{z} - -/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */ - vfnmadd132ps __sLn2lo(%rax), %zmm6, %zmm7 - -/* set mask for overflow/underflow */ - vptestmd %zmm2, %zmm2, %k0 - kmovw %k0, %ecx - -/* c5*r+c4 */ - vfmadd213ps __sPC4(%rax), %zmm7, %zmm9 - -/* (c5*r+c4)*r+c3 */ - vfmadd213ps __sPC3(%rax), %zmm7, %zmm9 - -/* ((c5*r+c4)*r+c3)*r+c2 */ - vfmadd213ps __sPC2(%rax), %zmm7, %zmm9 - -/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */ - vfmadd213ps __sPC1(%rax), %zmm7, %zmm9 - -/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */ - vfmadd213ps __sPC0(%rax), %zmm7, %zmm9 - -/* 2^N*exp(r) */ - vmulps %zmm9, %zmm8, %zmm1 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - vmovaps %zmm1, %zmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovups %zmm0, 1152(%rsp) - vmovups %zmm1, 1216(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - kmovw %k4, 1048(%rsp) - xorl %eax, %eax - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 
768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - addb $1, %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - kmovw 1048(%rsp), %k4 - movq 1064(%rsp), %rsi - kmovw 1040(%rsp), %k5 - movq 1056(%rsp), %rdi - kmovw 1032(%rsp), %k6 - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - cfi_restore (%r13) - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - vmovups 1216(%rsp), %zmm1 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - vmovss 1156(%rsp,%r15,8), %xmm0 - call JUMPTARGET(__expf_finite) - vmovss %xmm0, 1220(%rsp,%r15,8) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - vmovss 1152(%rsp,%r15,8), %xmm0 - call JUMPTARGET(__expf_finite) - vmovss %xmm0, 
1216(%rsp,%r15,8) - jmp .LBL_1_7 - -#endif -END (_ZGVeN16v_expf_knl) - -ENTRY (_ZGVeN16v_expf_skx) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512 _ZGVdN8v_expf -#else -/* - ALGORITHM DESCRIPTION: - - Argument representation: - M = rint(X*2^k/ln2) = 2^k*N+j - X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r - then -ln2/2^(k+1) < r < ln2/2^(k+1) - Alternatively: - M = trunc(X*2^k/ln2) - then 0 < r < ln2/2^k - - Result calculation: - exp(X) = exp(N*ln2 + ln2*(j/2^k) + r) - = 2^N * 2^(j/2^k) * exp(r) - 2^N is calculated by bit manipulation - 2^(j/2^k) is computed from table lookup - exp(r) is approximated by polynomial - - The table lookup is skipped if k = 0. - For low accuracy approximation, exp(r) ~ 1 or 1+r. */ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1280, %rsp - movq __svml_sexp_data@GOTPCREL(%rip), %rax - -/* r = x-n*ln2_hi/2^k */ - vmovaps %zmm0, %zmm7 - -/* compare against threshold */ - vmovups .L_2il0floatpacket.13(%rip), %zmm3 - vmovups __sInvLn2(%rax), %zmm4 - vmovups __sShifter(%rax), %zmm1 - vmovups __sLn2hi(%rax), %zmm6 - vmovups __sPC5(%rax), %zmm10 - -/* m = x*2^k/ln2 + shifter */ - vfmadd213ps %zmm1, %zmm0, %zmm4 - -/* n = m - shifter = rint(x*2^k/ln2) */ - vsubps %zmm1, %zmm4, %zmm8 - vpaddd __iBias(%rax), %zmm4, %zmm5 - vfnmadd231ps %zmm8, %zmm6, %zmm7 - -/* compute 2^N with "shift" */ - vpslld $23, %zmm5, %zmm9 - -/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */ - vfnmadd132ps __sLn2lo(%rax), %zmm7, %zmm8 - -/* c5*r+c4 */ - vfmadd213ps __sPC4(%rax), %zmm8, %zmm10 - -/* (c5*r+c4)*r+c3 */ - vfmadd213ps __sPC3(%rax), %zmm8, %zmm10 - -/* ((c5*r+c4)*r+c3)*r+c2 */ - vfmadd213ps __sPC2(%rax), %zmm8, %zmm10 - -/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */ - vfmadd213ps __sPC1(%rax), %zmm8, %zmm10 - -/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */ - vfmadd213ps __sPC0(%rax), %zmm8, %zmm10 - -/* 2^N*exp(r) */ - vmulps %zmm10, %zmm9, %zmm1 - -/* remove sign of x by 
"and" operation */ - vpandd __iAbsMask(%rax), %zmm0, %zmm2 - vpcmpd $2, __iDomainRange(%rax), %zmm2, %k1 - vpandnd %zmm2, %zmm2, %zmm3{%k1} - -/* set mask for overflow/underflow */ - vptestmd %zmm3, %zmm3, %k0 - kmovw %k0, %ecx - testl %ecx, %ecx - jne .LBL_2_3 - -.LBL_2_2: - cfi_remember_state - vmovaps %zmm1, %zmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_2_3: - cfi_restore_state - vmovups %zmm0, 1152(%rsp) - vmovups %zmm1, 1216(%rsp) - je .LBL_2_2 - - xorb %dl, %dl - xorl %eax, %eax - kmovw %k4, 1048(%rsp) - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - - -.LBL_2_6: - btl %r14d, %r13d - jc .LBL_2_12 - -.LBL_2_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_2_10 - -.LBL_2_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_2_6 - - kmovw 1048(%rsp), %k4 - kmovw 1040(%rsp), %k5 - kmovw 1032(%rsp), %k6 - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), 
%zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - vmovups 1216(%rsp), %zmm1 - movq 1064(%rsp), %rsi - movq 1056(%rsp), %rdi - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - cfi_restore (%r13) - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - jmp .LBL_2_2 - -.LBL_2_10: - cfi_restore_state - movzbl %r12b, %r15d - vmovss 1156(%rsp,%r15,8), %xmm0 - vzeroupper - vmovss 1156(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(__expf_finite) - - vmovss %xmm0, 1220(%rsp,%r15,8) - jmp .LBL_2_8 - -.LBL_2_12: - movzbl %r12b, %r15d - vmovss 1152(%rsp,%r15,8), %xmm0 - vzeroupper - vmovss 1152(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(__expf_finite) - - vmovss %xmm0, 1216(%rsp,%r15,8) - jmp .LBL_2_7 - -#endif -END (_ZGVeN16v_expf_skx) - - .section .rodata, "a" -.L_2il0floatpacket.13: - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff - .type .L_2il0floatpacket.13,@object diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S deleted file mode 100644 index 8051720ec2..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized expf. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN4v_expf) - .type _ZGVbN4v_expf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN4v_expf_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN4v_expf_sse2(%rip), %rax - ret -END (_ZGVbN4v_expf) -libmvec_hidden_def (_ZGVbN4v_expf) - -#define _ZGVbN4v_expf _ZGVbN4v_expf_sse2 -#include "../svml_s_expf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S deleted file mode 100644 index 2bc510bbf7..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S +++ /dev/null @@ -1,212 +0,0 @@ -/* Function expf vectorized with SSE4. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include "svml_s_expf_data.h" - - .text -ENTRY (_ZGVbN4v_expf_sse4) -/* - ALGORITHM DESCRIPTION: - - Argument representation: - M = rint(X*2^k/ln2) = 2^k*N+j - X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r - then -ln2/2^(k+1) < r < ln2/2^(k+1) - Alternatively: - M = trunc(X*2^k/ln2) - then 0 < r < ln2/2^k - - Result calculation: - exp(X) = exp(N*ln2 + ln2*(j/2^k) + r) - = 2^N * 2^(j/2^k) * exp(r) - 2^N is calculated by bit manipulation - 2^(j/2^k) is computed from table lookup - exp(r) is approximated by polynomial - - The table lookup is skipped if k = 0. - For low accuracy approximation, exp(r) ~ 1 or 1+r. */ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $320, %rsp - movaps %xmm0, %xmm5 - movq __svml_sexp_data@GOTPCREL(%rip), %rax - movups __sInvLn2(%rax), %xmm0 - -/* m = x*2^k/ln2 + shifter */ - mulps %xmm5, %xmm0 - movups __sShifter(%rax), %xmm6 - movups __sLn2hi(%rax), %xmm4 - addps %xmm6, %xmm0 - -/* n = m - shifter = rint(x*2^k/ln2) */ - movaps %xmm0, %xmm2 - -/* remove sign of x by "and" operation */ - movdqu __iAbsMask(%rax), %xmm7 - subps %xmm6, %xmm2 - -/* r = x-n*ln2_hi/2^k */ - mulps %xmm2, %xmm4 - pand %xmm5, %xmm7 - -/* compare against threshold */ - pcmpgtd __iDomainRange(%rax), %xmm7 - movups __sLn2lo(%rax), %xmm1 - -/* set mask for overflow/underflow */ - movmskps %xmm7, %ecx - movaps %xmm5, %xmm7 - movups __sPC5(%rax), %xmm3 - subps %xmm4, %xmm7 - -/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */ - mulps %xmm1, %xmm2 - -/* compute 2^N with "shift" */ - movdqu __iBias(%rax), %xmm6 - subps %xmm2, %xmm7 - -/* c5*r+c4 */ - mulps %xmm7, %xmm3 - paddd %xmm6, %xmm0 - pslld $23, %xmm0 - addps __sPC4(%rax), %xmm3 - -/* (c5*r+c4)*r+c3 */ - mulps %xmm7, %xmm3 - addps __sPC3(%rax), %xmm3 - -/* ((c5*r+c4)*r+c3)*r+c2 */ - mulps %xmm7, %xmm3 - addps __sPC2(%rax), %xmm3 - -/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */ - mulps %xmm7, %xmm3 - addps __sPC1(%rax), 
%xmm3 - -/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */ - mulps %xmm3, %xmm7 - addps __sPC0(%rax), %xmm7 - -/* 2^N*exp(r) */ - mulps %xmm7, %xmm0 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - movups %xmm5, 192(%rsp) - movups %xmm0, 256(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - movups %xmm8, 112(%rsp) - movups %xmm9, 96(%rsp) - movups %xmm10, 80(%rsp) - movups %xmm11, 64(%rsp) - movups %xmm12, 48(%rsp) - movups %xmm13, 32(%rsp) - movups %xmm14, 16(%rsp) - movups %xmm15, (%rsp) - movq %rsi, 136(%rsp) - movq %rdi, 128(%rsp) - movq %r12, 168(%rsp) - cfi_offset_rel_rsp (12, 168) - movb %dl, %r12b - movq %r13, 160(%rsp) - cfi_offset_rel_rsp (13, 160) - movl %ecx, %r13d - movq %r14, 152(%rsp) - cfi_offset_rel_rsp (14, 152) - movl %eax, %r14d - movq %r15, 144(%rsp) - cfi_offset_rel_rsp (15, 144) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - movups 112(%rsp), %xmm8 - movups 96(%rsp), %xmm9 - movups 80(%rsp), %xmm10 - movups 64(%rsp), %xmm11 - movups 48(%rsp), %xmm12 - movups 32(%rsp), %xmm13 - movups 16(%rsp), %xmm14 - movups (%rsp), %xmm15 - movq 136(%rsp), %rsi - movq 128(%rsp), %rdi - movq 168(%rsp), %r12 - cfi_restore (%r12) - movq 160(%rsp), %r13 - cfi_restore (%r13) - movq 152(%rsp), %r14 - cfi_restore (%r14) - movq 144(%rsp), %r15 - cfi_restore (%r15) - movups 256(%rsp), %xmm0 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - movss 196(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(__expf_finite) - - movss %xmm0, 260(%rsp,%r15,8) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - movss 192(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(__expf_finite) - - movss %xmm0, 256(%rsp,%r15,8) - jmp .LBL_1_7 - 
-END (_ZGVbN4v_expf_sse4) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S deleted file mode 100644 index 6ffb1fd784..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized expf. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN8v_expf) - .type _ZGVdN8v_expf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN8v_expf_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN8v_expf_sse_wrapper(%rip), %rax - ret -END (_ZGVdN8v_expf) -libmvec_hidden_def (_ZGVdN8v_expf) - -#define _ZGVdN8v_expf _ZGVdN8v_expf_sse_wrapper -#include "../svml_s_expf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S deleted file mode 100644 index b4a070ac86..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S +++ /dev/null @@ -1,202 +0,0 @@ -/* Function expf vectorized with AVX2. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_s_expf_data.h" - - .text -ENTRY(_ZGVdN8v_expf_avx2) -/* - ALGORITHM DESCRIPTION: - - Argument representation: - M = rint(X*2^k/ln2) = 2^k*N+j - X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r - then -ln2/2^(k+1) < r < ln2/2^(k+1) - Alternatively: - M = trunc(X*2^k/ln2) - then 0 < r < ln2/2^k - - Result calculation: - exp(X) = exp(N*ln2 + ln2*(j/2^k) + r) - = 2^N * 2^(j/2^k) * exp(r) - 2^N is calculated by bit manipulation - 2^(j/2^k) is computed from table lookup - exp(r) is approximated by polynomial - - The table lookup is skipped if k = 0. - For low accuracy approximation, exp(r) ~ 1 or 1+r. 
*/ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $448, %rsp - movq __svml_sexp_data@GOTPCREL(%rip), %rax - vmovaps %ymm0, %ymm2 - vmovups __sInvLn2(%rax), %ymm7 - vmovups __sShifter(%rax), %ymm4 - vmovups __sLn2hi(%rax), %ymm3 - vmovups __sPC5(%rax), %ymm1 - -/* m = x*2^k/ln2 + shifter */ - vfmadd213ps %ymm4, %ymm2, %ymm7 - -/* n = m - shifter = rint(x*2^k/ln2) */ - vsubps %ymm4, %ymm7, %ymm0 - vpaddd __iBias(%rax), %ymm7, %ymm4 - -/* remove sign of x by "and" operation */ - vandps __iAbsMask(%rax), %ymm2, %ymm5 - -/* compare against threshold */ - vpcmpgtd __iDomainRange(%rax), %ymm5, %ymm6 - -/* r = x-n*ln2_hi/2^k */ - vmovaps %ymm2, %ymm5 - vfnmadd231ps %ymm0, %ymm3, %ymm5 - -/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */ - vfnmadd132ps __sLn2lo(%rax), %ymm5, %ymm0 - -/* c5*r+c4 */ - vfmadd213ps __sPC4(%rax), %ymm0, %ymm1 - -/* (c5*r+c4)*r+c3 */ - vfmadd213ps __sPC3(%rax), %ymm0, %ymm1 - -/* ((c5*r+c4)*r+c3)*r+c2 */ - vfmadd213ps __sPC2(%rax), %ymm0, %ymm1 - -/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */ - vfmadd213ps __sPC1(%rax), %ymm0, %ymm1 - -/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */ - vfmadd213ps __sPC0(%rax), %ymm0, %ymm1 - -/* set mask for overflow/underflow */ - vmovmskps %ymm6, %ecx - -/* compute 2^N with "shift" */ - vpslld $23, %ymm4, %ymm6 - -/* 2^N*exp(r) */ - vmulps %ymm1, %ymm6, %ymm0 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovups %ymm2, 320(%rsp) - vmovups %ymm0, 384(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - vmovups %ymm8, 224(%rsp) - vmovups %ymm9, 192(%rsp) - vmovups %ymm10, 160(%rsp) - vmovups %ymm11, 128(%rsp) - vmovups %ymm12, 96(%rsp) - vmovups %ymm13, 64(%rsp) - vmovups %ymm14, 32(%rsp) - vmovups %ymm15, (%rsp) - movq %rsi, 264(%rsp) - movq %rdi, 256(%rsp) - 
movq %r12, 296(%rsp) - cfi_offset_rel_rsp (12, 296) - movb %dl, %r12b - movq %r13, 288(%rsp) - cfi_offset_rel_rsp (13, 288) - movl %ecx, %r13d - movq %r14, 280(%rsp) - cfi_offset_rel_rsp (14, 280) - movl %eax, %r14d - movq %r15, 272(%rsp) - cfi_offset_rel_rsp (15, 272) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - vmovups 224(%rsp), %ymm8 - vmovups 192(%rsp), %ymm9 - vmovups 160(%rsp), %ymm10 - vmovups 128(%rsp), %ymm11 - vmovups 96(%rsp), %ymm12 - vmovups 64(%rsp), %ymm13 - vmovups 32(%rsp), %ymm14 - vmovups (%rsp), %ymm15 - vmovups 384(%rsp), %ymm0 - movq 264(%rsp), %rsi - movq 256(%rsp), %rdi - movq 296(%rsp), %r12 - cfi_restore (%r12) - movq 288(%rsp), %r13 - cfi_restore (%r13) - movq 280(%rsp), %r14 - cfi_restore (%r14) - movq 272(%rsp), %r15 - cfi_restore (%r15) - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - vmovss 324(%rsp,%r15,8), %xmm0 - vzeroupper - - call JUMPTARGET(__expf_finite) - - vmovss %xmm0, 388(%rsp,%r15,8) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - vmovss 320(%rsp,%r15,8), %xmm0 - vzeroupper - - call JUMPTARGET(__expf_finite) - - vmovss %xmm0, 384(%rsp,%r15,8) - jmp .LBL_1_7 - -END(_ZGVdN8v_expf_avx2) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S deleted file mode 100644 index 8ab03195c6..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized logf. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN16v_logf) - .type _ZGVeN16v_logf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN16v_logf_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN16v_logf_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN16v_logf_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN16v_logf) - -#define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper -#include "../svml_s_logf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S deleted file mode 100644 index 7ff6fff848..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S +++ /dev/null @@ -1,416 +0,0 @@ -/* Function logf vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_s_logf_data.h" -#include "svml_s_wrapper_impl.h" - - .text -ENTRY (_ZGVeN16v_logf_knl) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512 _ZGVdN8v_logf -#else -/* - ALGORITHM DESCRIPTION: - - log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3 - log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3 - - R = mantissa_x - 1, if mantissa_x<4/3 - R = 0.5*mantissa_x - 1, if mantissa_x>4/3 - |R|< 1/3 - - log(1+R) is approximated as a polynomial: degree 9 for 1-ulp, - degree 7 for 4-ulp, degree 3 for half-precision. */ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1280, %rsp - movq __svml_slog_data@GOTPCREL(%rip), %rax - movl $-1, %ecx - -/* reduction: compute r,n */ - vpsubd _iBrkValue(%rax), %zmm0, %zmm2 - vmovups _sPoly_7(%rax), %zmm7 - vpandd _iOffExpoMask(%rax), %zmm2, %zmm3 - -/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */ - vpsrad $23, %zmm2, %zmm4 - -/* check for working range, - set special argument mask (denormals/zero/Inf/NaN) - */ - vpaddd _iHiDelta(%rax), %zmm0, %zmm1 - -/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */ - vpaddd _iBrkValue(%rax), %zmm3, %zmm6 - vpcmpd $1, _iLoRange(%rax), %zmm1, %k1 - vcvtdq2ps {rn-sae}, %zmm4, %zmm1 - -/* reduced argument R */ - vsubps _sOne(%rax), %zmm6, %zmm8 - vpbroadcastd %ecx, %zmm5{%k1}{z} - -/* polynomial evaluation starts here */ - vfmadd213ps _sPoly_6(%rax), %zmm8, %zmm7 - vptestmd %zmm5, %zmm5, %k0 - 
kmovw %k0, %ecx - vfmadd213ps _sPoly_5(%rax), %zmm8, %zmm7 - vfmadd213ps _sPoly_4(%rax), %zmm8, %zmm7 - vfmadd213ps _sPoly_3(%rax), %zmm8, %zmm7 - vfmadd213ps _sPoly_2(%rax), %zmm8, %zmm7 - vfmadd213ps _sPoly_1(%rax), %zmm8, %zmm7 - vmulps %zmm8, %zmm7, %zmm9 - -/* polynomial evaluation end */ - vfmadd213ps %zmm8, %zmm8, %zmm9 - -/* - final reconstruction: - add exponent_value*log2 to polynomial result - */ - vfmadd132ps _sLn2(%rax), %zmm9, %zmm1 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - vmovaps %zmm1, %zmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovups %zmm0, 1152(%rsp) - vmovups %zmm1, 1216(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - kmovw %k4, 1048(%rsp) - xorl %eax, %eax - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - addb $1, %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - kmovw 1048(%rsp), %k4 - movq 1064(%rsp), %rsi - kmovw 1040(%rsp), %k5 - movq 1056(%rsp), %rdi - kmovw 1032(%rsp), %k6 
- movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - cfi_restore (%r13) - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - vmovups 1216(%rsp), %zmm1 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - vmovss 1156(%rsp,%r15,8), %xmm0 - call JUMPTARGET(__logf_finite) - vmovss %xmm0, 1220(%rsp,%r15,8) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - vmovss 1152(%rsp,%r15,8), %xmm0 - call JUMPTARGET(__logf_finite) - vmovss %xmm0, 1216(%rsp,%r15,8) - jmp .LBL_1_7 -#endif -END (_ZGVeN16v_logf_knl) - -ENTRY (_ZGVeN16v_logf_skx) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512 _ZGVdN8v_logf -#else -/* - ALGORITHM DESCRIPTION: - - log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3 - log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3 - - R = mantissa_x - 1, if mantissa_x<4/3 - R = 0.5*mantissa_x - 1, if mantissa_x>4/3 - |R|< 1/3 - - log(1+R) is approximated as a polynomial: degree 9 for 1-ulp, - degree 7 for 4-ulp, degree 3 for half-precision. 
*/ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1280, %rsp - movq __svml_slog_data@GOTPCREL(%rip), %rax - vmovups .L_2il0floatpacket.7(%rip), %zmm6 - vmovups _iBrkValue(%rax), %zmm4 - vmovups _sPoly_7(%rax), %zmm8 - -/* - check for working range, - set special argument mask (denormals/zero/Inf/NaN) - */ - vpaddd _iHiDelta(%rax), %zmm0, %zmm1 - -/* reduction: compute r,n */ - vpsubd %zmm4, %zmm0, %zmm2 - vpcmpd $5, _iLoRange(%rax), %zmm1, %k1 - -/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */ - vpsrad $23, %zmm2, %zmm5 - vpandd _iOffExpoMask(%rax), %zmm2, %zmm3 - -/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */ - vpaddd %zmm4, %zmm3, %zmm7 - -/* reduced argument R */ - vsubps _sOne(%rax), %zmm7, %zmm9 - -/* polynomial evaluation starts here */ - vfmadd213ps _sPoly_6(%rax), %zmm9, %zmm8 - vfmadd213ps _sPoly_5(%rax), %zmm9, %zmm8 - vfmadd213ps _sPoly_4(%rax), %zmm9, %zmm8 - vfmadd213ps _sPoly_3(%rax), %zmm9, %zmm8 - vfmadd213ps _sPoly_2(%rax), %zmm9, %zmm8 - vfmadd213ps _sPoly_1(%rax), %zmm9, %zmm8 - vmulps %zmm9, %zmm8, %zmm10 - -/* polynomial evaluation end */ - vfmadd213ps %zmm9, %zmm9, %zmm10 - vpandnd %zmm1, %zmm1, %zmm6{%k1} - vptestmd %zmm6, %zmm6, %k0 - vcvtdq2ps {rn-sae}, %zmm5, %zmm1 - kmovw %k0, %ecx - -/* - final reconstruction: - add exponent_value*log2 to polynomial result - */ - vfmadd132ps _sLn2(%rax), %zmm10, %zmm1 - testl %ecx, %ecx - jne .LBL_2_3 - -.LBL_2_2: - cfi_remember_state - vmovaps %zmm1, %zmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_2_3: - cfi_restore_state - vmovups %zmm0, 1152(%rsp) - vmovups %zmm1, 1216(%rsp) - je .LBL_2_2 - - xorb %dl, %dl - xorl %eax, %eax - kmovw %k4, 1048(%rsp) - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups 
%zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - -.LBL_2_6: - btl %r14d, %r13d - jc .LBL_2_12 - -.LBL_2_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_2_10 - -.LBL_2_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_2_6 - - kmovw 1048(%rsp), %k4 - kmovw 1040(%rsp), %k5 - kmovw 1032(%rsp), %k6 - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - vmovups 1216(%rsp), %zmm1 - movq 1064(%rsp), %rsi - movq 1056(%rsp), %rdi - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - cfi_restore (%r13) - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - jmp .LBL_2_2 - -.LBL_2_10: - cfi_restore_state - movzbl %r12b, %r15d - vmovss 1156(%rsp,%r15,8), %xmm0 - vzeroupper - vmovss 1156(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(__logf_finite) - - vmovss %xmm0, 1220(%rsp,%r15,8) - jmp .LBL_2_8 - -.LBL_2_12: - movzbl %r12b, %r15d - vmovss 
1152(%rsp,%r15,8), %xmm0 - vzeroupper - vmovss 1152(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(__logf_finite) - - vmovss %xmm0, 1216(%rsp,%r15,8) - jmp .LBL_2_7 - -#endif -END (_ZGVeN16v_logf_skx) - - .section .rodata, "a" -.L_2il0floatpacket.7: - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff - .type .L_2il0floatpacket.7,@object diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S deleted file mode 100644 index 4e0e36d5bd..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized logf. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN4v_logf) - .type _ZGVbN4v_logf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN4v_logf_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN4v_logf_sse2(%rip), %rax - ret -END (_ZGVbN4v_logf) -libmvec_hidden_def (_ZGVbN4v_logf) - -#define _ZGVbN4v_logf _ZGVbN4v_logf_sse2 -#include "../svml_s_logf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S deleted file mode 100644 index 156face181..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S +++ /dev/null @@ -1,194 +0,0 @@ -/* Function logf vectorized with SSE4. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_s_logf_data.h" - - .text -ENTRY (_ZGVbN4v_logf_sse4) -/* - ALGORITHM DESCRIPTION: - - log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3 - log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3 - - R = mantissa_x - 1, if mantissa_x<4/3 - R = 0.5*mantissa_x - 1, if mantissa_x>4/3 - |R|< 1/3 - - log(1+R) is approximated as a polynomial: degree 9 for 1-ulp, - degree 7 for 4-ulp, degree 3 for half-precision. 
*/ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $320, %rsp - -/* reduction: compute r,n */ - movaps %xmm0, %xmm2 - -/* check for working range, - set special argument mask (denormals/zero/Inf/NaN) */ - movq __svml_slog_data@GOTPCREL(%rip), %rax - movdqu _iHiDelta(%rax), %xmm1 - movdqu _iLoRange(%rax), %xmm4 - paddd %xmm0, %xmm1 - movdqu _iBrkValue(%rax), %xmm3 - pcmpgtd %xmm1, %xmm4 - movdqu _iOffExpoMask(%rax), %xmm1 - psubd %xmm3, %xmm2 - pand %xmm2, %xmm1 - -/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */ - psrad $23, %xmm2 - paddd %xmm3, %xmm1 - movups _sPoly_7(%rax), %xmm5 - -/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */ - cvtdq2ps %xmm2, %xmm6 - -/* reduced argument R */ - subps _sOne(%rax), %xmm1 - movmskps %xmm4, %ecx - -/* final reconstruction: - add exponent_value*log2 to polynomial result */ - mulps _sLn2(%rax), %xmm6 - -/* polynomial evaluation starts here */ - mulps %xmm1, %xmm5 - addps _sPoly_6(%rax), %xmm5 - mulps %xmm1, %xmm5 - addps _sPoly_5(%rax), %xmm5 - mulps %xmm1, %xmm5 - addps _sPoly_4(%rax), %xmm5 - mulps %xmm1, %xmm5 - addps _sPoly_3(%rax), %xmm5 - mulps %xmm1, %xmm5 - addps _sPoly_2(%rax), %xmm5 - mulps %xmm1, %xmm5 - addps _sPoly_1(%rax), %xmm5 - mulps %xmm1, %xmm5 - -/* polynomial evaluation end */ - mulps %xmm1, %xmm5 - addps %xmm5, %xmm1 - addps %xmm6, %xmm1 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movdqa %xmm1, %xmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - movups %xmm0, 192(%rsp) - movups %xmm1, 256(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - movups %xmm8, 112(%rsp) - movups %xmm9, 96(%rsp) - movups %xmm10, 80(%rsp) - movups %xmm11, 64(%rsp) - movups %xmm12, 48(%rsp) - movups %xmm13, 32(%rsp) - movups %xmm14, 16(%rsp) - movups %xmm15, (%rsp) - 
movq %rsi, 136(%rsp) - movq %rdi, 128(%rsp) - movq %r12, 168(%rsp) - cfi_offset_rel_rsp (12, 168) - movb %dl, %r12b - movq %r13, 160(%rsp) - cfi_offset_rel_rsp (13, 160) - movl %ecx, %r13d - movq %r14, 152(%rsp) - cfi_offset_rel_rsp (14, 152) - movl %eax, %r14d - movq %r15, 144(%rsp) - cfi_offset_rel_rsp (15, 144) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - movups 112(%rsp), %xmm8 - movups 96(%rsp), %xmm9 - movups 80(%rsp), %xmm10 - movups 64(%rsp), %xmm11 - movups 48(%rsp), %xmm12 - movups 32(%rsp), %xmm13 - movups 16(%rsp), %xmm14 - movups (%rsp), %xmm15 - movq 136(%rsp), %rsi - movq 128(%rsp), %rdi - movq 168(%rsp), %r12 - cfi_restore (%r12) - movq 160(%rsp), %r13 - cfi_restore (%r13) - movq 152(%rsp), %r14 - cfi_restore (%r14) - movq 144(%rsp), %r15 - cfi_restore (%r15) - movups 256(%rsp), %xmm1 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - movss 196(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(__logf_finite) - - movss %xmm0, 260(%rsp,%r15,8) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - movss 192(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(__logf_finite) - - movss %xmm0, 256(%rsp,%r15,8) - jmp .LBL_1_7 - -END (_ZGVbN4v_logf_sse4) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S deleted file mode 100644 index f4b82de3d4..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized logf. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN8v_logf) - .type _ZGVdN8v_logf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN8v_logf_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN8v_logf_sse_wrapper(%rip), %rax - ret -END (_ZGVdN8v_logf) -libmvec_hidden_def (_ZGVdN8v_logf) - -#define _ZGVdN8v_logf _ZGVdN8v_logf_sse_wrapper -#include "../svml_s_logf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S deleted file mode 100644 index 994af91ffe..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S +++ /dev/null @@ -1,184 +0,0 @@ -/* Function logf vectorized with AVX2. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_s_logf_data.h" - - .text -ENTRY(_ZGVdN8v_logf_avx2) -/* - ALGORITHM DESCRIPTION: - - log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3 - log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3 - - R = mantissa_x - 1, if mantissa_x<4/3 - R = 0.5*mantissa_x - 1, if mantissa_x>4/3 - |R|< 1/3 - - log(1+R) is approximated as a polynomial: degree 9 for 1-ulp, - degree 7 for 4-ulp, degree 3 for half-precision. */ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $448, %rsp - movq __svml_slog_data@GOTPCREL(%rip), %rax - vmovaps %ymm0, %ymm2 - vmovups _iBrkValue(%rax), %ymm6 - vmovups _iLoRange(%rax), %ymm1 -/* check for working range, - set special argument mask (denormals/zero/Inf/NaN) */ - vpaddd _iHiDelta(%rax), %ymm2, %ymm7 - -/* reduction: compute r,n */ - vpsubd %ymm6, %ymm2, %ymm4 - -/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */ - vpsrad $23, %ymm4, %ymm3 - vpand _iOffExpoMask(%rax), %ymm4, %ymm5 - vmovups _sPoly_7(%rax), %ymm4 - vcvtdq2ps %ymm3, %ymm0 - -/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */ - vpaddd %ymm6, %ymm5, %ymm3 - -/* reduced argument R */ - vsubps _sOne(%rax), %ymm3, %ymm5 - -/* polynomial evaluation starts here */ - vfmadd213ps _sPoly_6(%rax), %ymm5, %ymm4 - vfmadd213ps _sPoly_5(%rax), %ymm5, %ymm4 - vfmadd213ps _sPoly_4(%rax), %ymm5, %ymm4 - vfmadd213ps _sPoly_3(%rax), %ymm5, %ymm4 - vfmadd213ps _sPoly_2(%rax), %ymm5, %ymm4 - vfmadd213ps _sPoly_1(%rax), %ymm5, %ymm4 - vmulps %ymm5, %ymm4, %ymm6 - -/* polynomial evaluation end */ - vfmadd213ps %ymm5, %ymm5, %ymm6 - vpcmpgtd %ymm7, %ymm1, %ymm1 - vmovmskps %ymm1, %ecx - -/* final 
reconstruction: - add exponent_value*log2 to polynomial result */ - vfmadd132ps _sLn2(%rax), %ymm6, %ymm0 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovups %ymm2, 320(%rsp) - vmovups %ymm0, 384(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - vmovups %ymm8, 224(%rsp) - vmovups %ymm9, 192(%rsp) - vmovups %ymm10, 160(%rsp) - vmovups %ymm11, 128(%rsp) - vmovups %ymm12, 96(%rsp) - vmovups %ymm13, 64(%rsp) - vmovups %ymm14, 32(%rsp) - vmovups %ymm15, (%rsp) - movq %rsi, 264(%rsp) - movq %rdi, 256(%rsp) - movq %r12, 296(%rsp) - cfi_offset_rel_rsp (12, 296) - movb %dl, %r12b - movq %r13, 288(%rsp) - cfi_offset_rel_rsp (13, 288) - movl %ecx, %r13d - movq %r14, 280(%rsp) - cfi_offset_rel_rsp (14, 280) - movl %eax, %r14d - movq %r15, 272(%rsp) - cfi_offset_rel_rsp (15, 272) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - vmovups 224(%rsp), %ymm8 - vmovups 192(%rsp), %ymm9 - vmovups 160(%rsp), %ymm10 - vmovups 128(%rsp), %ymm11 - vmovups 96(%rsp), %ymm12 - vmovups 64(%rsp), %ymm13 - vmovups 32(%rsp), %ymm14 - vmovups (%rsp), %ymm15 - vmovups 384(%rsp), %ymm0 - movq 264(%rsp), %rsi - movq 256(%rsp), %rdi - movq 296(%rsp), %r12 - cfi_restore (%r12) - movq 288(%rsp), %r13 - cfi_restore (%r13) - movq 280(%rsp), %r14 - cfi_restore (%r14) - movq 272(%rsp), %r15 - cfi_restore (%r15) - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - vmovss 324(%rsp,%r15,8), %xmm0 - vzeroupper - - call JUMPTARGET(__logf_finite) - - vmovss %xmm0, 388(%rsp,%r15,8) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - vmovss 320(%rsp,%r15,8), %xmm0 - vzeroupper - - call JUMPTARGET(__logf_finite) - - vmovss %xmm0, 384(%rsp,%r15,8) - jmp 
.LBL_1_7 - -END(_ZGVdN8v_logf_avx2) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S deleted file mode 100644 index 6d10c7576f..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized powf. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN16vv_powf) - .type _ZGVeN16vv_powf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN16vv_powf_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN16vv_powf_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN16vv_powf) - -#define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper -#include "../svml_s_powf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S deleted file mode 100644 index fc91a092b0..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S +++ /dev/null @@ -1,653 +0,0 @@ -/* Function powf vectorized with AVX-512. KNL and SKX versions. 
- Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_s_powf_data.h" -#include "svml_s_wrapper_impl.h" - -/* - ALGORITHM DESCRIPTION: - - We are using the next identity : pow(x,y) = 2^(y * log2(x)). - - 1) log2(x) calculation - Here we use the following formula. - Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. - Let C ~= 1/ln(2), - Rcp1 ~= 1/X1, X2=Rcp1*X1, - Rcp2 ~= 1/X2, X3=Rcp2*X2, - Rcp3 ~= 1/X3, Rcp3C ~= C/X3. - Then - log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + - log2(X1*Rcp1*Rcp2*Rcp3C/C), - where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. - - The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), - Rcp3C, log2(C/Rcp3C) are taken from tables. - Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C - is exactly represented in target precision. - - log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = - = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = - = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = - = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., - where - cq=X1*Rcp1*Rcp2*Rcp3C-C, - a1=1/(C*ln(2))-1 is small, - a2=1/(2*C^2*ln2), - a3=1/(3*C^3*ln2), - ... - Log2 result is split by three parts: HH+HL+HLL - - 2) Calculation of y*log2(x) - Split y into YHi+YLo. 
- Get high PH and medium PL parts of y*log2|x|. - Get low PLL part of y*log2|x|. - Now we have PH+PL+PLL ~= y*log2|x|. - - 3) Calculation of 2^(y*log2(x)) - Let's represent PH+PL+PLL in the form N + j/2^expK + Z, - where expK=7 in this implementation, N and j are integers, - 0<=j<=2^expK-1, |Z|<2^(-expK-1). Hence - 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, - where 2^(j/2^expK) is stored in a table, and - 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. - We compute 2^(PH+PL+PLL) as follows: - Break PH into PHH + PHL, where PHH = N + j/2^expK. - Z = PHL + PL + PLL - Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 - Get 2^(j/2^expK) from table in the form THI+TLO. - Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). - Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: - ResHi := THI - ResLo := THI * Exp2Poly + TLO - Get exponent ERes of the result: - Res := ResHi + ResLo: - Result := ex(Res) + N. */ - - .text -ENTRY (_ZGVeN16vv_powf_knl) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf -#else - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1344, %rsp - movq __svml_spow_data@GOTPCREL(%rip), %rdx - vmovaps %zmm1, %zmm9 - vshuff32x4 $238, %zmm0, %zmm0, %zmm7 - kxnorw %k3, %k3, %k3 - vcvtps2pd %ymm0, %zmm14 - vcvtps2pd %ymm7, %zmm10 - movl $-1, %eax - movq $-1, %rcx - vpandd _ABSMASK(%rdx), %zmm9, %zmm4 - vmovups _ExpMask(%rdx), %zmm6 - -/* exponent bits selection */ - vpsrlq $20, %zmm14, %zmm13 - vshuff32x4 $238, %zmm9, %zmm9, %zmm8 - vpcmpd $5, _INF(%rdx), %zmm4, %k2 - vpsrlq $32, %zmm13, %zmm15 - vcvtps2pd %ymm8, %zmm2 - vmovups _Two10(%rdx), %zmm4 - vpmovqd %zmm15, %ymm12 - vcvtps2pd %ymm9, %zmm1 - vpsubd _NMINNORM(%rdx), %zmm0, %zmm3 - vpbroadcastd %eax, %zmm8{%k2}{z} - vpcmpd $5, _NMAXVAL(%rdx), %zmm3, %k1 - -/* preserve mantissa, set input exponent to 2^(-10) */ - vmovaps %zmm6, %zmm3 - vpternlogq $248, %zmm6, %zmm10, %zmm4 - vpsrlq $20, %zmm10, 
%zmm10 - vpternlogq $234, _Two10(%rdx), %zmm14, %zmm3 - -/* reciprocal approximation good to at least 11 bits */ - vrcp28pd %zmm4, %zmm11 - vpsrlq $32, %zmm10, %zmm14 - vpbroadcastd %eax, %zmm7{%k1}{z} - kxnorw %k1, %k1, %k1 - vrcp28pd %zmm3, %zmm5 - vpmovqd %zmm14, %ymm6 - vshufi32x4 $68, %zmm6, %zmm12, %zmm13 - vmovups _One(%rdx), %zmm6 - -/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ - vrndscalepd $8, %zmm5, %zmm14 - -/* biased exponent in DP format */ - vshuff32x4 $238, %zmm13, %zmm13, %zmm5 - vrndscalepd $8, %zmm11, %zmm11 - vcmppd $30, _Threshold(%rdx), %zmm14, %k2 - vcvtdq2pd %ymm13, %zmm10 - vcvtdq2pd %ymm5, %zmm15 - -/* table lookup */ - vpsrlq $40, %zmm14, %zmm13 - vpxord %zmm5, %zmm5, %zmm5 - vgatherqpd _Log2Rcp_lookup(%rdx,%zmm13), %zmm5{%k3} - vfmsub213pd %zmm6, %zmm14, %zmm3 - vfmsub213pd %zmm6, %zmm11, %zmm4 - vcmppd $30, _Threshold(%rdx), %zmm11, %k3 - vpbroadcastq %rcx, %zmm14{%k2}{z} - -/* dpP= _dbT+lJ*T_ITEM_GRAN */ - kxnorw %k2, %k2, %k2 - vpsrlq $40, %zmm11, %zmm12 - vpxord %zmm6, %zmm6, %zmm6 - vpbroadcastq %rcx, %zmm11{%k3}{z} - kxnorw %k3, %k3, %k3 - vgatherqpd _Log2Rcp_lookup(%rdx,%zmm12), %zmm6{%k1} - vmovups _Bias1(%rdx), %zmm12 - vpternlogq $236, _Bias(%rdx), %zmm12, %zmm14 - vpternlogq $248, _Bias(%rdx), %zmm11, %zmm12 - vsubpd %zmm14, %zmm10, %zmm13 - vsubpd %zmm12, %zmm15, %zmm10 - vmovups _poly_coeff_3(%rdx), %zmm11 - vmovups _poly_coeff_4(%rdx), %zmm15 - vfmadd213pd %zmm15, %zmm4, %zmm11 - vmulpd %zmm4, %zmm4, %zmm12 - vmovaps %zmm15, %zmm14 - vmulpd %zmm3, %zmm3, %zmm15 - vfmadd231pd _poly_coeff_3(%rdx), %zmm3, %zmm14 - -/* reconstruction */ - vfmadd213pd %zmm4, %zmm12, %zmm11 - vfmadd213pd %zmm3, %zmm15, %zmm14 - vaddpd %zmm6, %zmm11, %zmm11 - vaddpd %zmm5, %zmm14, %zmm3 - vfmadd231pd _L2(%rdx), %zmm10, %zmm11 - vfmadd132pd _L2(%rdx), %zmm3, %zmm13 - vmulpd %zmm2, %zmm11, %zmm12 - vmulpd %zmm1, %zmm13, %zmm10 - vmulpd __dbInvLn2(%rdx), %zmm12, %zmm6 - -/* hi bits */ - vpsrlq $32, %zmm12, %zmm12 - vmulpd 
__dbInvLn2(%rdx), %zmm10, %zmm1 - -/* to round down; if dR is an integer we will get R = 1, which is ok */ - vsubpd __dbHALF(%rdx), %zmm6, %zmm4 - vpsrlq $32, %zmm10, %zmm11 - vpmovqd %zmm11, %ymm3 - vsubpd __dbHALF(%rdx), %zmm1, %zmm2 - vaddpd __dbShifter(%rdx), %zmm4, %zmm14 - vpmovqd %zmm12, %ymm4 - vshufi32x4 $68, %zmm4, %zmm3, %zmm5 - vpxord %zmm4, %zmm4, %zmm4 - vaddpd __dbShifter(%rdx), %zmm2, %zmm2 - -/* iAbsX = iAbsX&iAbsMask; */ - vpandd __iAbsMask(%rdx), %zmm5, %zmm11 - vpxord %zmm5, %zmm5, %zmm5 - vsubpd __dbShifter(%rdx), %zmm14, %zmm13 - -/* iRangeMask = (iAbsX>iDomainRange) */ - vpcmpgtd __iDomainRange(%rdx), %zmm11, %k1 - vsubpd __dbShifter(%rdx), %zmm2, %zmm15 - vpbroadcastd %eax, %zmm10{%k1}{z} - vpternlogd $254, %zmm8, %zmm7, %zmm10 - -/* [0..1) */ - vsubpd %zmm15, %zmm1, %zmm1 - -/* low K bits */ - vpandq __lbLOWKBITS(%rdx), %zmm14, %zmm11 - vgatherqpd 13952(%rdx,%zmm11,8), %zmm5{%k3} - vsubpd %zmm13, %zmm6, %zmm7 - vptestmd %zmm10, %zmm10, %k0 - vpandq __lbLOWKBITS(%rdx), %zmm2, %zmm10 - vmulpd __dbC1(%rdx), %zmm1, %zmm1 - vmulpd __dbC1(%rdx), %zmm7, %zmm3 - vpsrlq $11, %zmm2, %zmm8 - vpsrlq $11, %zmm14, %zmm2 - -/* NB : including +/- sign for the exponent!! 
*/ - vpsllq $52, %zmm8, %zmm8 - kmovw %k0, %ecx - vpsllq $52, %zmm2, %zmm6 - vfmadd213pd %zmm5, %zmm3, %zmm5 - vgatherqpd 13952(%rdx,%zmm10,8), %zmm4{%k2} - vfmadd213pd %zmm4, %zmm1, %zmm4 - vpaddq %zmm6, %zmm5, %zmm10 - vcvtpd2ps %zmm10, %ymm12 - vpaddq %zmm8, %zmm4, %zmm7 - vcvtpd2ps %zmm7, %ymm11 - vshuff32x4 $68, %zmm12, %zmm11, %zmm1 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - vmovaps %zmm1, %zmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovups %zmm0, 1152(%rsp) - vmovups %zmm9, 1216(%rsp) - vmovups %zmm1, 1280(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - kmovw %k4, 1048(%rsp) - xorl %eax, %eax - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - addb $1, %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - kmovw 1048(%rsp), %k4 - movq 1064(%rsp), %rsi - kmovw 1040(%rsp), %k5 - movq 1056(%rsp), %rdi - kmovw 1032(%rsp), %k6 - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - 
cfi_restore (%r13) - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - vmovups 1280(%rsp), %zmm1 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - vmovss 1156(%rsp,%r15,8), %xmm0 - vmovss 1220(%rsp,%r15,8), %xmm1 - call JUMPTARGET(__powf_finite) - vmovss %xmm0, 1284(%rsp,%r15,8) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - vmovss 1152(%rsp,%r15,8), %xmm0 - vmovss 1216(%rsp,%r15,8), %xmm1 - call JUMPTARGET(__powf_finite) - vmovss %xmm0, 1280(%rsp,%r15,8) - jmp .LBL_1_7 -#endif -END (_ZGVeN16vv_powf_knl) - -ENTRY (_ZGVeN16vv_powf_skx) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf -#else - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1344, %rsp - movq __svml_spow_data@GOTPCREL(%rip), %rax - vextractf32x8 $1, %zmm1, %ymm14 - vextractf32x8 $1, %zmm0, %ymm15 - vpsubd _NMINNORM(%rax), %zmm0, %zmm9 - vmovups %zmm26, 1280(%rsp) - vmovups _ExpMask(%rax), %zmm6 - vpcmpd $1, _NMAXVAL(%rax), %zmm9, %k1 - vcvtps2pd %ymm0, %zmm5 - vcvtps2pd %ymm1, %zmm12 - kxnorw %k3, %k3, %k3 - -/* exponent bits selection */ - vpsrlq $20, %zmm5, %zmm3 - vpsrlq $32, %zmm3, %zmm2 - vpmovqd %zmm2, %ymm11 - vcvtps2pd %ymm14, %zmm13 - vmovups .L_2il0floatpacket.23(%rip), %zmm14 - vmovaps %zmm14, %zmm26 - vpandd _ABSMASK(%rax), %zmm1, %zmm8 - vpcmpd $1, _INF(%rax), %zmm8, %k2 - vpandnd %zmm9, %zmm9, %zmm26{%k1} - vmovups _Two10(%rax), %zmm9 - kxnorw 
%k1, %k1, %k1 - vcvtps2pd %ymm15, %zmm4 - vmovaps %zmm14, %zmm15 - -/* preserve mantissa, set input exponent to 2^(-10) */ - vpternlogq $248, %zmm6, %zmm4, %zmm9 - vpsrlq $20, %zmm4, %zmm4 - -/* reciprocal approximation good to at least 11 bits */ - vrcp14pd %zmm9, %zmm10 - -/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ - vrndscalepd $8, %zmm10, %zmm3 - vmovups _One(%rax), %zmm10 - vfmsub213pd %zmm10, %zmm3, %zmm9 - vpandnd %zmm8, %zmm8, %zmm15{%k2} - vmovaps %zmm6, %zmm8 - vpternlogq $234, _Two10(%rax), %zmm5, %zmm8 - vpsrlq $32, %zmm4, %zmm5 - vrcp14pd %zmm8, %zmm7 - vpmovqd %zmm5, %ymm6 - vrndscalepd $8, %zmm7, %zmm2 - vfmsub213pd %zmm10, %zmm2, %zmm8 - -/* table lookup */ - vpsrlq $40, %zmm2, %zmm10 - vinserti32x8 $1, %ymm6, %zmm11, %zmm4 - vpsrlq $40, %zmm3, %zmm11 - -/* biased exponent in DP format */ - vextracti32x8 $1, %zmm4, %ymm7 - vcvtdq2pd %ymm4, %zmm6 - vpmovqd %zmm10, %ymm4 - vpmovqd %zmm11, %ymm5 - vpxord %zmm10, %zmm10, %zmm10 - vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} - vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4 - vpxord %zmm11, %zmm11, %zmm11 - vcvtdq2pd %ymm7, %zmm7 - vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} - vmovups _Threshold(%rax), %zmm5 - vcmppd $21, %zmm2, %zmm5, %k2 - vcmppd $21, %zmm3, %zmm5, %k3 - vmovups _Bias1(%rax), %zmm3 - vmovaps %zmm4, %zmm2 - vpandnq %zmm5, %zmm5, %zmm2{%k2} - vpternlogq $236, _Bias(%rax), %zmm3, %zmm2 - -/* dpP= _dbT+lJ*T_ITEM_GRAN */ - kxnorw %k2, %k2, %k2 - vpandnq %zmm5, %zmm5, %zmm4{%k3} - vpternlogq $248, _Bias(%rax), %zmm4, %zmm3 - vsubpd %zmm2, %zmm6, %zmm4 - vmovups _poly_coeff_3(%rax), %zmm6 - vmovups _poly_coeff_4(%rax), %zmm2 - vsubpd %zmm3, %zmm7, %zmm5 - vmulpd %zmm8, %zmm8, %zmm7 - vfmadd213pd %zmm2, %zmm9, %zmm6 - kxnorw %k3, %k3, %k3 - vmovaps %zmm2, %zmm3 - vmulpd %zmm9, %zmm9, %zmm2 - vfmadd231pd _poly_coeff_3(%rax), %zmm8, %zmm3 - -/* reconstruction */ - vfmadd213pd %zmm9, %zmm2, %zmm6 - vfmadd213pd %zmm8, %zmm7, %zmm3 - vaddpd %zmm11, %zmm6, 
%zmm8 - vaddpd %zmm10, %zmm3, %zmm9 - vfmadd231pd _L2(%rax), %zmm5, %zmm8 - vfmadd132pd _L2(%rax), %zmm9, %zmm4 - vmulpd %zmm13, %zmm8, %zmm13 - vmulpd %zmm12, %zmm4, %zmm3 - vmulpd __dbInvLn2(%rax), %zmm13, %zmm10 - vmulpd __dbInvLn2(%rax), %zmm3, %zmm8 - -/* hi bits */ - vpsrlq $32, %zmm3, %zmm4 - vpsrlq $32, %zmm13, %zmm13 - -/* to round down; if dR is an integer we will get R = 1, which is ok */ - vsubpd __dbHALF(%rax), %zmm8, %zmm12 - vpmovqd %zmm4, %ymm5 - vpmovqd %zmm13, %ymm2 - vsubpd __dbHALF(%rax), %zmm10, %zmm9 - vaddpd __dbShifter(%rax), %zmm12, %zmm7 - vaddpd __dbShifter(%rax), %zmm9, %zmm9 - vsubpd __dbShifter(%rax), %zmm7, %zmm11 - vsubpd __dbShifter(%rax), %zmm9, %zmm12 - vinserti32x8 $1, %ymm2, %zmm5, %zmm3 - -/* iAbsX = iAbsX&iAbsMask */ - vpandd __iAbsMask(%rax), %zmm3, %zmm4 - -/* iRangeMask = (iAbsX>iDomainRange) */ - vpcmpd $2, __iDomainRange(%rax), %zmm4, %k1 - vpandnd %zmm4, %zmm4, %zmm14{%k1} - vpternlogd $254, %zmm15, %zmm26, %zmm14 - -/* [0..1) */ - vsubpd %zmm11, %zmm8, %zmm15 - vsubpd %zmm12, %zmm10, %zmm26 - vptestmd %zmm14, %zmm14, %k0 - vpsrlq $11, %zmm7, %zmm8 - vpsrlq $11, %zmm9, %zmm10 - vmulpd __dbC1(%rax), %zmm26, %zmm26 - vmulpd __dbC1(%rax), %zmm15, %zmm15 - -/* NB : including +/- sign for the exponent!! 
*/ - vpsllq $52, %zmm10, %zmm13 - vpsllq $52, %zmm8, %zmm12 - kmovw %k0, %ecx - -/* low K bits */ - vpandq __lbLOWKBITS(%rax), %zmm9, %zmm14 - vpandq __lbLOWKBITS(%rax), %zmm7, %zmm6 - vpmovqd %zmm14, %ymm7 - vpmovqd %zmm6, %ymm9 - vpxord %zmm2, %zmm2, %zmm2 - vgatherdpd 13952(%rax,%ymm7,8), %zmm2{%k3} - vfmadd213pd %zmm2, %zmm26, %zmm2 - vpaddq %zmm13, %zmm2, %zmm2 - vcvtpd2ps %zmm2, %ymm4 - vpxord %zmm11, %zmm11, %zmm11 - vgatherdpd 13952(%rax,%ymm9,8), %zmm11{%k2} - vfmadd213pd %zmm11, %zmm15, %zmm11 - vpaddq %zmm12, %zmm11, %zmm3 - vcvtpd2ps %zmm3, %ymm5 - vinsertf32x8 $1, %ymm4, %zmm5, %zmm2 - testl %ecx, %ecx - jne .LBL_2_3 - -.LBL_2_2: - cfi_remember_state - vmovups 1280(%rsp), %zmm26 - vmovaps %zmm2, %zmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_2_3: - cfi_restore_state - vmovups %zmm0, 1088(%rsp) - vmovups %zmm1, 1152(%rsp) - vmovups %zmm2, 1216(%rsp) - je .LBL_2_2 - - xorb %dl, %dl - xorl %eax, %eax - kmovw %k4, 984(%rsp) - kmovw %k5, 976(%rsp) - kmovw %k6, 968(%rsp) - kmovw %k7, 960(%rsp) - vmovups %zmm16, 896(%rsp) - vmovups %zmm17, 832(%rsp) - vmovups %zmm18, 768(%rsp) - vmovups %zmm19, 704(%rsp) - vmovups %zmm20, 640(%rsp) - vmovups %zmm21, 576(%rsp) - vmovups %zmm22, 512(%rsp) - vmovups %zmm23, 448(%rsp) - vmovups %zmm24, 384(%rsp) - vmovups %zmm25, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1000(%rsp) - movq %rdi, 992(%rsp) - movq %r12, 1032(%rsp) - cfi_offset_rel_rsp (12, 1032) - movb %dl, %r12b - movq %r13, 1024(%rsp) - cfi_offset_rel_rsp (13, 1024) - movl %ecx, %r13d - movq %r14, 1016(%rsp) - cfi_offset_rel_rsp (14, 1016) - movl %eax, %r14d - movq %r15, 1008(%rsp) - cfi_offset_rel_rsp (15, 1008) - cfi_remember_state - -.LBL_2_6: - btl %r14d, %r13d - jc .LBL_2_12 - -.LBL_2_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_2_10 - -.LBL_2_8: - incb 
%r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_2_6 - - kmovw 984(%rsp), %k4 - kmovw 976(%rsp), %k5 - kmovw 968(%rsp), %k6 - kmovw 960(%rsp), %k7 - vmovups 896(%rsp), %zmm16 - vmovups 832(%rsp), %zmm17 - vmovups 768(%rsp), %zmm18 - vmovups 704(%rsp), %zmm19 - vmovups 640(%rsp), %zmm20 - vmovups 576(%rsp), %zmm21 - vmovups 512(%rsp), %zmm22 - vmovups 448(%rsp), %zmm23 - vmovups 384(%rsp), %zmm24 - vmovups 320(%rsp), %zmm25 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - vmovups 1216(%rsp), %zmm2 - movq 1000(%rsp), %rsi - movq 992(%rsp), %rdi - movq 1032(%rsp), %r12 - cfi_restore (%r12) - movq 1024(%rsp), %r13 - cfi_restore (%r13) - movq 1016(%rsp), %r14 - cfi_restore (%r14) - movq 1008(%rsp), %r15 - cfi_restore (%r15) - jmp .LBL_2_2 - -.LBL_2_10: - cfi_restore_state - movzbl %r12b, %r15d - vmovss 1156(%rsp,%r15,8), %xmm1 - vzeroupper - vmovss 1092(%rsp,%r15,8), %xmm0 - call JUMPTARGET(__powf_finite) - vmovss %xmm0, 1220(%rsp,%r15,8) - jmp .LBL_2_8 - -.LBL_2_12: - movzbl %r12b, %r15d - vmovss 1152(%rsp,%r15,8), %xmm1 - vzeroupper - vmovss 1088(%rsp,%r15,8), %xmm0 - call JUMPTARGET(__powf_finite) - vmovss %xmm0, 1216(%rsp,%r15,8) - jmp .LBL_2_7 -#endif -END (_ZGVeN16vv_powf_skx) - - .section .rodata, "a" -.L_2il0floatpacket.23: - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff - .type .L_2il0floatpacket.23,@object -.L_2il0floatpacket.24: - .long 0xffffffff,0xffffffff - .type .L_2il0floatpacket.24,@object diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S deleted file mode 100644 index 785b549882..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized powf. 
- Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN4vv_powf) - .type _ZGVbN4vv_powf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN4vv_powf_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN4vv_powf_sse2(%rip), %rax - ret -END (_ZGVbN4vv_powf) -libmvec_hidden_def (_ZGVbN4vv_powf) - -#define _ZGVbN4vv_powf _ZGVbN4vv_powf_sse2 -#include "../svml_s_powf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S deleted file mode 100644 index 8b1b4e74bb..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S +++ /dev/null @@ -1,374 +0,0 @@ -/* Function powf vectorized with SSE4. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_s_powf_data.h" - - .text -ENTRY (_ZGVbN4vv_powf_sse4) -/* - ALGORITHM DESCRIPTION: - - We are using the next identity: pow(x,y) = 2^(y * log2(x)). - - 1) log2(x) calculation - Here we use the following formula. - Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. - Let C ~= 1/ln(2), - Rcp1 ~= 1/X1, X2=Rcp1*X1, - Rcp2 ~= 1/X2, X3=Rcp2*X2, - Rcp3 ~= 1/X3, Rcp3C ~= C/X3. - Then - log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + - log2(X1*Rcp1*Rcp2*Rcp3C/C), - where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. - - The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), - Rcp3C, log2(C/Rcp3C) are taken from tables. - Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C - is exactly represented in target precision. - - log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = - = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = - = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = - = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., - where - cq=X1*Rcp1*Rcp2*Rcp3C-C, - a1=1/(C*ln(2))-1 is small, - a2=1/(2*C^2*ln2), - a3=1/(3*C^3*ln2), - ... - Log2 result is split by three parts: HH+HL+HLL - - 2) Calculation of y*log2(x) - Split y into YHi+YLo. - Get high PH and medium PL parts of y*log2|x|. - Get low PLL part of y*log2|x|. - Now we have PH+PL+PLL ~= y*log2|x|. - - 3) Calculation of 2^(y*log2(x)) - Let's represent PH+PL+PLL in the form N + j/2^expK + Z, - where expK=7 in this implementation, N and j are integers, - 0<=j<=2^expK-1, |Z|<2^(-expK-1). 
Hence - 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, - where 2^(j/2^expK) is stored in a table, and - 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. - We compute 2^(PH+PL+PLL) as follows: - Break PH into PHH + PHL, where PHH = N + j/2^expK. - Z = PHL + PL + PLL - Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 - Get 2^(j/2^expK) from table in the form THI+TLO. - Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). - Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: - ResHi := THI - ResLo := THI * Exp2Poly + TLO - Get exponent ERes of the result: - Res := ResHi + ResLo: - Result := ex(Res) + N. */ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $256, %rsp - movaps %xmm0, %xmm3 - movhlps %xmm0, %xmm3 - movaps %xmm1, %xmm5 - movups %xmm8, 112(%rsp) - movaps %xmm5, %xmm2 - cvtps2pd %xmm3, %xmm8 - cvtps2pd %xmm5, %xmm7 - movups %xmm9, 96(%rsp) - movaps %xmm0, %xmm4 - cvtps2pd %xmm0, %xmm9 - movq __svml_spow_data@GOTPCREL(%rip), %rdx - movups %xmm10, 176(%rsp) - movups %xmm13, 48(%rsp) - movups _ExpMask(%rdx), %xmm6 - -/* preserve mantissa, set input exponent to 2^(-10) */ - movaps %xmm6, %xmm10 - andps %xmm8, %xmm6 - andps %xmm9, %xmm10 - -/* exponent bits selection */ - psrlq $20, %xmm9 - orps _Two10(%rdx), %xmm6 - psrlq $20, %xmm8 - orps _Two10(%rdx), %xmm10 - -/* reciprocal approximation good to at least 11 bits */ - cvtpd2ps %xmm6, %xmm13 - cvtpd2ps %xmm10, %xmm1 - movlhps %xmm13, %xmm13 - movhlps %xmm5, %xmm2 - movlhps %xmm1, %xmm1 - movups %xmm12, 208(%rsp) - rcpps %xmm13, %xmm12 - movups %xmm11, 80(%rsp) - cvtps2pd %xmm2, %xmm11 - rcpps %xmm1, %xmm2 - movups %xmm14, 144(%rsp) - cvtps2pd %xmm12, %xmm14 - movups %xmm15, 160(%rsp) - cvtps2pd %xmm2, %xmm15 - shufps $221, %xmm8, %xmm9 - -/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ - roundpd $0, %xmm14, %xmm14 - -/* biased exponent in DP format */ - pshufd $238, %xmm9, %xmm8 - roundpd $0, %xmm15, 
%xmm15 - cvtdq2pd %xmm8, %xmm1 - mulpd %xmm15, %xmm10 - mulpd %xmm14, %xmm6 - cvtdq2pd %xmm9, %xmm2 - subpd _One(%rdx), %xmm10 - subpd _One(%rdx), %xmm6 - -/* table lookup */ - movaps %xmm14, %xmm8 - movaps %xmm15, %xmm9 - psrlq $40, %xmm8 - psrlq $40, %xmm9 - movd %xmm8, %r8d - movd %xmm9, %eax - psubd _NMINNORM(%rdx), %xmm4 - movdqu _ABSMASK(%rdx), %xmm3 - pextrd $2, %xmm8, %r9d - pand %xmm5, %xmm3 - movups _Threshold(%rdx), %xmm8 - pextrd $2, %xmm9, %ecx - movaps %xmm8, %xmm9 - cmpltpd %xmm15, %xmm9 - cmpltpd %xmm14, %xmm8 - andps _Bias(%rdx), %xmm9 - movaps %xmm10, %xmm14 - andps _Bias(%rdx), %xmm8 - movaps %xmm6, %xmm15 - orps _Bias1(%rdx), %xmm9 - orps _Bias1(%rdx), %xmm8 - subpd %xmm9, %xmm2 - subpd %xmm8, %xmm1 - mulpd %xmm10, %xmm14 - mulpd %xmm6, %xmm15 - mulpd _L2(%rdx), %xmm2 - mulpd _L2(%rdx), %xmm1 - movups _poly_coeff_3(%rdx), %xmm9 - movaps %xmm9, %xmm8 - mulpd %xmm10, %xmm8 - mulpd %xmm6, %xmm9 - addpd _poly_coeff_4(%rdx), %xmm8 - addpd _poly_coeff_4(%rdx), %xmm9 - mulpd %xmm14, %xmm8 - mulpd %xmm15, %xmm9 - -/* reconstruction */ - addpd %xmm8, %xmm10 - addpd %xmm9, %xmm6 - movslq %eax, %rax - movslq %r8d, %r8 - movslq %ecx, %rcx - movslq %r9d, %r9 - movsd _Log2Rcp_lookup(%rdx,%rax), %xmm13 - movsd _Log2Rcp_lookup(%rdx,%r8), %xmm12 - movhpd _Log2Rcp_lookup(%rdx,%rcx), %xmm13 - movhpd _Log2Rcp_lookup(%rdx,%r9), %xmm12 - addpd %xmm10, %xmm13 - addpd %xmm6, %xmm12 - addpd %xmm13, %xmm2 - addpd %xmm12, %xmm1 - mulpd %xmm7, %xmm2 - mulpd %xmm11, %xmm1 - movups __dbInvLn2(%rdx), %xmm11 - movdqa %xmm4, %xmm12 - movaps %xmm11, %xmm10 - mulpd %xmm2, %xmm10 - mulpd %xmm1, %xmm11 - -/* to round down; if dR is an integer we will get R = 1, which is ok */ - movaps %xmm10, %xmm8 - movaps %xmm11, %xmm9 - subpd __dbHALF(%rdx), %xmm8 - subpd __dbHALF(%rdx), %xmm9 - addpd __dbShifter(%rdx), %xmm8 - addpd __dbShifter(%rdx), %xmm9 - movaps %xmm8, %xmm6 - movaps %xmm9, %xmm7 - subpd __dbShifter(%rdx), %xmm6 - subpd __dbShifter(%rdx), %xmm7 - -/* [0..1) */ - subpd 
%xmm6, %xmm10 - subpd %xmm7, %xmm11 - mulpd __dbC1(%rdx), %xmm10 - mulpd __dbC1(%rdx), %xmm11 - -/* hi bits */ - shufps $221, %xmm1, %xmm2 - movdqu _NMAXVAL(%rdx), %xmm1 - pcmpgtd %xmm1, %xmm12 - pcmpeqd %xmm1, %xmm4 - por %xmm4, %xmm12 - movdqa %xmm3, %xmm1 - movdqu _INF(%rdx), %xmm4 - pcmpgtd %xmm4, %xmm1 - pcmpeqd %xmm4, %xmm3 - -/* iAbsX = iAbsX&iAbsMask */ - pand __iAbsMask(%rdx), %xmm2 - por %xmm3, %xmm1 - -/* iRangeMask = (iAbsX>iDomainRange) */ - pcmpgtd __iDomainRange(%rdx), %xmm2 - por %xmm1, %xmm12 - movups __lbLOWKBITS(%rdx), %xmm3 - por %xmm2, %xmm12 - -/* low K bits */ - movaps %xmm3, %xmm2 - andps %xmm9, %xmm3 - andps %xmm8, %xmm2 - psrlq $11, %xmm8 - -/* dpP= _dbT+lJ*T_ITEM_GRAN */ - movd %xmm2, %r10d - psrlq $11, %xmm9 - movd %xmm3, %ecx - -/* NB : including +/- sign for the exponent!! */ - psllq $52, %xmm8 - psllq $52, %xmm9 - pextrw $4, %xmm2, %r11d - pextrw $4, %xmm3, %r8d - movmskps %xmm12, %eax - shll $3, %r10d - shll $3, %ecx - shll $3, %r11d - shll $3, %r8d - movq 13952(%rdx,%r10), %xmm6 - movq 13952(%rdx,%rcx), %xmm7 - movhpd 13952(%rdx,%r11), %xmm6 - movhpd 13952(%rdx,%r8), %xmm7 - mulpd %xmm6, %xmm10 - mulpd %xmm7, %xmm11 - addpd %xmm10, %xmm6 - addpd %xmm11, %xmm7 - paddq %xmm8, %xmm6 - paddq %xmm9, %xmm7 - cvtpd2ps %xmm6, %xmm1 - cvtpd2ps %xmm7, %xmm4 - movlhps %xmm4, %xmm1 - testl %eax, %eax - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movups 112(%rsp), %xmm8 - movaps %xmm1, %xmm0 - movups 96(%rsp), %xmm9 - movups 176(%rsp), %xmm10 - movups 80(%rsp), %xmm11 - movups 208(%rsp), %xmm12 - movups 48(%rsp), %xmm13 - movups 144(%rsp), %xmm14 - movups 160(%rsp), %xmm15 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - movups %xmm0, 64(%rsp) - movups %xmm5, 128(%rsp) - movups %xmm1, 192(%rsp) - je .LBL_1_2 - - xorb %cl, %cl - xorl %edx, %edx - movq %rsi, 8(%rsp) - movq %rdi, (%rsp) - movq %r12, 40(%rsp) - cfi_offset_rel_rsp (12, 40) - 
movb %cl, %r12b - movq %r13, 32(%rsp) - cfi_offset_rel_rsp (13, 32) - movl %eax, %r13d - movq %r14, 24(%rsp) - cfi_offset_rel_rsp (14, 24) - movl %edx, %r14d - movq %r15, 16(%rsp) - cfi_offset_rel_rsp (15, 16) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - movq 8(%rsp), %rsi - movq (%rsp), %rdi - movq 40(%rsp), %r12 - cfi_restore (%r12) - movq 32(%rsp), %r13 - cfi_restore (%r13) - movq 24(%rsp), %r14 - cfi_restore (%r14) - movq 16(%rsp), %r15 - cfi_restore (%r15) - movups 192(%rsp), %xmm1 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - movss 68(%rsp,%r15,8), %xmm0 - movss 132(%rsp,%r15,8), %xmm1 - - call JUMPTARGET(__powf_finite) - - movss %xmm0, 196(%rsp,%r15,8) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - movss 64(%rsp,%r15,8), %xmm0 - movss 128(%rsp,%r15,8), %xmm1 - - call JUMPTARGET(__powf_finite) - - movss %xmm0, 192(%rsp,%r15,8) - jmp .LBL_1_7 - -END (_ZGVbN4vv_powf_sse4) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S deleted file mode 100644 index 1f6a07315e..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized powf. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN8vv_powf) - .type _ZGVdN8vv_powf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN8vv_powf_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN8vv_powf_sse_wrapper(%rip), %rax - ret -END (_ZGVdN8vv_powf) -libmvec_hidden_def (_ZGVdN8vv_powf) - -#define _ZGVdN8vv_powf _ZGVdN8vv_powf_sse_wrapper -#include "../svml_s_powf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S deleted file mode 100644 index 683932f410..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S +++ /dev/null @@ -1,357 +0,0 @@ -/* Function powf vectorized with AVX2. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_s_powf_data.h" - - .text -ENTRY(_ZGVdN8vv_powf_avx2) -/* - ALGORITHM DESCRIPTION: - - We are using the next identity : pow(x,y) = 2^(y * log2(x)). - - 1) log2(x) calculation - Here we use the following formula. 
- Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. - Let C ~= 1/ln(2), - Rcp1 ~= 1/X1, X2=Rcp1*X1, - Rcp2 ~= 1/X2, X3=Rcp2*X2, - Rcp3 ~= 1/X3, Rcp3C ~= C/X3. - Then - log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + - log2(X1*Rcp1*Rcp2*Rcp3C/C), - where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. - - The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), - Rcp3C, log2(C/Rcp3C) are taken from tables. - Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C - is exactly represented in target precision. - - log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = - = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = - = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = - = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., - where - cq=X1*Rcp1*Rcp2*Rcp3C-C, - a1=1/(C*ln(2))-1 is small, - a2=1/(2*C^2*ln2), - a3=1/(3*C^3*ln2), - ... - Log2 result is split by three parts: HH+HL+HLL - - 2) Calculation of y*log2(x) - Split y into YHi+YLo. - Get high PH and medium PL parts of y*log2|x|. - Get low PLL part of y*log2|x|. - Now we have PH+PL+PLL ~= y*log2|x|. - - 3) Calculation of 2^(y*log2(x)) - Let's represent PH+PL+PLL in the form N + j/2^expK + Z, - where expK=7 in this implementation, N and j are integers, - 0<=j<=2^expK-1, |Z|<2^(-expK-1). Hence - 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, - where 2^(j/2^expK) is stored in a table, and - 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. - We compute 2^(PH+PL+PLL) as follows: - Break PH into PHH + PHL, where PHH = N + j/2^expK. - Z = PHL + PL + PLL - Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 - Get 2^(j/2^expK) from table in the form THI+TLO. - Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). - Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: - ResHi := THI - ResLo := THI * Exp2Poly + TLO - Get exponent ERes of the result: - Res := ResHi + ResLo: - Result := ex(Res) + N. 
*/ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $448, %rsp - lea __VPACK_ODD_ind.6357.0.1(%rip), %rcx - vmovups %ymm14, 320(%rsp) - -/* hi bits */ - lea __VPACK_ODD_ind.6358.0.1(%rip), %rax - vmovups %ymm12, 256(%rsp) - vmovups %ymm9, 96(%rsp) - vmovups %ymm13, 224(%rsp) - vmovups %ymm15, 352(%rsp) - vmovups %ymm11, 384(%rsp) - vmovups %ymm10, 288(%rsp) - vmovups (%rcx), %ymm10 - vmovups %ymm8, 160(%rsp) - vmovdqa %ymm1, %ymm9 - movq __svml_spow_data@GOTPCREL(%rip), %rdx - vextractf128 $1, %ymm0, %xmm7 - vcvtps2pd %xmm0, %ymm14 - vcvtps2pd %xmm7, %ymm12 - vpsubd _NMINNORM(%rdx), %ymm0, %ymm7 - -/* preserve mantissa, set input exponent to 2^(-10) */ - vandpd _ExpMask(%rdx), %ymm14, %ymm3 - vandpd _ExpMask(%rdx), %ymm12, %ymm13 - -/* exponent bits selection */ - vpsrlq $20, %ymm12, %ymm12 - vpsrlq $20, %ymm14, %ymm14 - vextractf128 $1, %ymm9, %xmm2 - vcvtps2pd %xmm9, %ymm1 - vpand _ABSMASK(%rdx), %ymm9, %ymm8 - vcvtps2pd %xmm2, %ymm6 - vorpd _Two10(%rdx), %ymm3, %ymm2 - vorpd _Two10(%rdx), %ymm13, %ymm3 - -/* reciprocal approximation good to at least 11 bits */ - vcvtpd2ps %ymm2, %xmm5 - vcvtpd2ps %ymm3, %xmm15 - vrcpps %xmm5, %xmm4 - vrcpps %xmm15, %xmm11 - vcvtps2pd %xmm4, %ymm13 - vcvtps2pd %xmm11, %ymm4 - vpermps %ymm12, %ymm10, %ymm11 - -/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ - vroundpd $0, %ymm13, %ymm12 - vpermps %ymm14, %ymm10, %ymm5 - vroundpd $0, %ymm4, %ymm14 - vmovupd _One(%rdx), %ymm4 - -/* table lookup */ - vpsrlq $40, %ymm12, %ymm10 - vfmsub213pd %ymm4, %ymm12, %ymm2 - vfmsub213pd %ymm4, %ymm14, %ymm3 - vcmpgt_oqpd _Threshold(%rdx), %ymm12, %ymm12 - vxorpd %ymm4, %ymm4, %ymm4 - vandpd _Bias(%rdx), %ymm12, %ymm12 - -/* biased exponent in DP format */ - vcvtdq2pd %xmm11, %ymm13 - vpcmpeqd %ymm11, %ymm11, %ymm11 - vgatherqpd %ymm11, _Log2Rcp_lookup(%rdx,%ymm10), %ymm4 - vpsrlq $40, %ymm14, %ymm10 - vcmpgt_oqpd _Threshold(%rdx), 
%ymm14, %ymm14 - vpcmpeqd %ymm11, %ymm11, %ymm11 - vandpd _Bias(%rdx), %ymm14, %ymm14 - vcvtdq2pd %xmm5, %ymm15 - vxorpd %ymm5, %ymm5, %ymm5 - vgatherqpd %ymm11, _Log2Rcp_lookup(%rdx,%ymm10), %ymm5 - vorpd _Bias1(%rdx), %ymm12, %ymm11 - vorpd _Bias1(%rdx), %ymm14, %ymm10 - vsubpd %ymm11, %ymm15, %ymm11 - vsubpd %ymm10, %ymm13, %ymm14 - vmovupd _poly_coeff_4(%rdx), %ymm15 - vmovupd _poly_coeff_3(%rdx), %ymm13 - vmulpd %ymm3, %ymm3, %ymm10 - vfmadd213pd %ymm15, %ymm3, %ymm13 - vmovdqa %ymm15, %ymm12 - vfmadd231pd _poly_coeff_3(%rdx), %ymm2, %ymm12 - vmulpd %ymm2, %ymm2, %ymm15 - -/* reconstruction */ - vfmadd213pd %ymm3, %ymm10, %ymm13 - vfmadd213pd %ymm2, %ymm15, %ymm12 - vaddpd %ymm5, %ymm13, %ymm13 - vaddpd %ymm4, %ymm12, %ymm2 - vfmadd231pd _L2(%rdx), %ymm14, %ymm13 - vfmadd132pd _L2(%rdx), %ymm2, %ymm11 - vmulpd %ymm6, %ymm13, %ymm2 - vmulpd %ymm1, %ymm11, %ymm10 - vmulpd __dbInvLn2(%rdx), %ymm2, %ymm6 - vmulpd __dbInvLn2(%rdx), %ymm10, %ymm15 - -/* to round down; if dR is an integer we will get R = 1, which is ok */ - vsubpd __dbHALF(%rdx), %ymm6, %ymm3 - vsubpd __dbHALF(%rdx), %ymm15, %ymm1 - vaddpd __dbShifter(%rdx), %ymm3, %ymm13 - vaddpd __dbShifter(%rdx), %ymm1, %ymm14 - vsubpd __dbShifter(%rdx), %ymm13, %ymm12 - vmovups (%rax), %ymm1 - vsubpd __dbShifter(%rdx), %ymm14, %ymm11 - -/* [0..1) */ - vsubpd %ymm12, %ymm6, %ymm6 - vpermps %ymm10, %ymm1, %ymm3 - vpermps %ymm2, %ymm1, %ymm10 - vpcmpgtd _NMAXVAL(%rdx), %ymm7, %ymm4 - vpcmpgtd _INF(%rdx), %ymm8, %ymm1 - vpcmpeqd _NMAXVAL(%rdx), %ymm7, %ymm7 - vpcmpeqd _INF(%rdx), %ymm8, %ymm8 - vpor %ymm7, %ymm4, %ymm2 - vpor %ymm8, %ymm1, %ymm1 - vsubpd %ymm11, %ymm15, %ymm7 - vinsertf128 $1, %xmm10, %ymm3, %ymm10 - vpor %ymm1, %ymm2, %ymm3 - -/* iAbsX = iAbsX&iAbsMask */ - vandps __iAbsMask(%rdx), %ymm10, %ymm10 - -/* iRangeMask = (iAbsX>iDomainRange) */ - vpcmpgtd __iDomainRange(%rdx), %ymm10, %ymm4 - vpor %ymm4, %ymm3, %ymm5 - vmulpd __dbC1(%rdx), %ymm7, %ymm4 - vmovmskps %ymm5, %ecx - vmulpd __dbC1(%rdx), %ymm6, 
%ymm5 - -/* low K bits */ - vandps __lbLOWKBITS(%rdx), %ymm14, %ymm6 - -/* dpP= _dbT+lJ*T_ITEM_GRAN */ - vxorpd %ymm7, %ymm7, %ymm7 - vpcmpeqd %ymm1, %ymm1, %ymm1 - vandps __lbLOWKBITS(%rdx), %ymm13, %ymm2 - vxorpd %ymm10, %ymm10, %ymm10 - vpcmpeqd %ymm3, %ymm3, %ymm3 - vgatherqpd %ymm1, 13952(%rdx,%ymm6,8), %ymm7 - vgatherqpd %ymm3, 13952(%rdx,%ymm2,8), %ymm10 - vpsrlq $11, %ymm14, %ymm14 - vpsrlq $11, %ymm13, %ymm13 - vfmadd213pd %ymm7, %ymm4, %ymm7 - vfmadd213pd %ymm10, %ymm5, %ymm10 - -/* NB : including +/- sign for the exponent!! */ - vpsllq $52, %ymm14, %ymm8 - vpsllq $52, %ymm13, %ymm11 - vpaddq %ymm8, %ymm7, %ymm12 - vpaddq %ymm11, %ymm10, %ymm1 - vcvtpd2ps %ymm12, %xmm15 - vcvtpd2ps %ymm1, %xmm2 - vinsertf128 $1, %xmm2, %ymm15, %ymm1 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - vmovups 160(%rsp), %ymm8 - vmovups 96(%rsp), %ymm9 - vmovups 288(%rsp), %ymm10 - vmovups 384(%rsp), %ymm11 - vmovups 256(%rsp), %ymm12 - vmovups 224(%rsp), %ymm13 - vmovups 320(%rsp), %ymm14 - vmovups 352(%rsp), %ymm15 - vmovdqa %ymm1, %ymm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovups %ymm0, 64(%rsp) - vmovups %ymm9, 128(%rsp) - vmovups %ymm1, 192(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - movq %rsi, 8(%rsp) - movq %rdi, (%rsp) - movq %r12, 40(%rsp) - cfi_offset_rel_rsp (12, 40) - movb %dl, %r12b - movq %r13, 32(%rsp) - cfi_offset_rel_rsp (13, 32) - movl %ecx, %r13d - movq %r14, 24(%rsp) - cfi_offset_rel_rsp (14, 24) - movl %eax, %r14d - movq %r15, 16(%rsp) - cfi_offset_rel_rsp (15, 16) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - movq 8(%rsp), %rsi - movq (%rsp), %rdi - movq 40(%rsp), %r12 - cfi_restore (%r12) - movq 32(%rsp), %r13 - cfi_restore (%r13) - movq 
24(%rsp), %r14 - cfi_restore (%r14) - movq 16(%rsp), %r15 - cfi_restore (%r15) - vmovups 192(%rsp), %ymm1 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - vmovss 68(%rsp,%r15,8), %xmm0 - vmovss 132(%rsp,%r15,8), %xmm1 - vzeroupper - - call JUMPTARGET(__powf_finite) - - vmovss %xmm0, 196(%rsp,%r15,8) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - vmovss 64(%rsp,%r15,8), %xmm0 - vmovss 128(%rsp,%r15,8), %xmm1 - vzeroupper - - call JUMPTARGET(__powf_finite) - - vmovss %xmm0, 192(%rsp,%r15,8) - jmp .LBL_1_7 - -END(_ZGVdN8vv_powf_avx2) - - .section .rodata, "a" -__VPACK_ODD_ind.6357.0.1: - .long 1 - .long 3 - .long 5 - .long 7 - .long 0 - .long 0 - .long 0 - .long 0 - .space 32, 0x00 -__VPACK_ODD_ind.6358.0.1: - .long 1 - .long 3 - .long 5 - .long 7 - .long 0 - .long 0 - .long 0 - .long 0 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S deleted file mode 100644 index 0545460952..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized sincosf. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN16vvv_sincosf) - .type _ZGVeN16vvv_sincosf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN16vvv_sincosf_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN16vvv_sincosf_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN16vvv_sincosf) - -#define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper -#include "../svml_s_sincosf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S deleted file mode 100644 index f73ab7de7c..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S +++ /dev/null @@ -1,806 +0,0 @@ -/* Function sincosf vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_s_trig_data.h" -#include "svml_s_wrapper_impl.h" - -/* - ALGORITHM DESCRIPTION: - - 1) Range reduction to [-Pi/4; +Pi/4] interval - a) Grab sign from source argument and save it. 
- b) Remove sign using AND operation - c) Getting octant Y by 2/Pi multiplication - d) Add "Right Shifter" value - e) Treat obtained value as integer S for destination sign setting. - SS = ((S-S&1)&2)<<30; For sin part - SC = ((S+S&1)&2)<<30; For cos part - f) Change destination sign if source sign is negative - using XOR operation. - g) Subtract "Right Shifter" (0x4B000000) value - h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; - 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) - a) Calculate X^2 = X * X - b) Calculate 2 polynomials for sin and cos: - RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); - RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))); - c) Swap RS & RC if if first bit of obtained value after - Right Shifting is set to 1. Using And, Andnot & Or operations. - 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R1 = XOR( RS, SS ); - R2 = XOR( RC, SC ). 
*/ - - .text -ENTRY (_ZGVeN16vl4l4_sincosf_knl) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf -#else - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1344, %rsp - movq __svml_s_trig_data@GOTPCREL(%rip), %rax - vmovaps %zmm0, %zmm2 - movl $-1, %edx - vmovups __sAbsMask(%rax), %zmm0 - vmovups __sInvPI(%rax), %zmm3 - -/* Absolute argument computation */ - vpandd %zmm0, %zmm2, %zmm1 - vmovups __sPI1_FMA(%rax), %zmm5 - vmovups __sSignMask(%rax), %zmm9 - vpandnd %zmm2, %zmm0, %zmm0 - -/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 */ - vmovaps %zmm1, %zmm6 - vmovaps %zmm1, %zmm8 - -/* c) Getting octant Y by 2/Pi multiplication - d) Add "Right Shifter" value */ - vfmadd213ps __sRShifter(%rax), %zmm1, %zmm3 - vmovups __sPI3_FMA(%rax), %zmm7 - -/* g) Subtract "Right Shifter" (0x4B000000) value */ - vsubps __sRShifter(%rax), %zmm3, %zmm12 - -/* e) Treat obtained value as integer S for destination sign setting */ - vpslld $31, %zmm3, %zmm13 - vmovups __sA7_FMA(%rax), %zmm14 - vfnmadd231ps %zmm12, %zmm5, %zmm6 - -/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) - a) Calculate X^2 = X * X - b) Calculate 2 polynomials for sin and cos: - RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); - RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */ - vmovaps %zmm14, %zmm15 - vmovups __sA9_FMA(%rax), %zmm3 - vcmpps $22, __sRangeReductionVal(%rax), %zmm1, %k1 - vpbroadcastd %edx, %zmm1{%k1}{z} - vfnmadd231ps __sPI2_FMA(%rax), %zmm12, %zmm6 - vptestmd %zmm1, %zmm1, %k0 - vpandd %zmm6, %zmm9, %zmm11 - kmovw %k0, %ecx - vpxord __sOneHalf(%rax), %zmm11, %zmm4 - -/* Result sign calculations */ - vpternlogd $150, %zmm13, %zmm9, %zmm11 - -/* Add correction term 0.5 for cos() part */ - vaddps %zmm4, %zmm12, %zmm10 - vfnmadd213ps %zmm6, %zmm7, %zmm12 - vfnmadd231ps %zmm10, %zmm5, %zmm8 
- vpxord %zmm13, %zmm12, %zmm13 - vmulps %zmm13, %zmm13, %zmm12 - vfnmadd231ps __sPI2_FMA(%rax), %zmm10, %zmm8 - vfmadd231ps __sA9_FMA(%rax), %zmm12, %zmm15 - vfnmadd213ps %zmm8, %zmm7, %zmm10 - vfmadd213ps __sA5_FMA(%rax), %zmm12, %zmm15 - vpxord %zmm11, %zmm10, %zmm5 - vmulps %zmm5, %zmm5, %zmm4 - vfmadd213ps __sA3(%rax), %zmm12, %zmm15 - vfmadd213ps %zmm14, %zmm4, %zmm3 - vmulps %zmm12, %zmm15, %zmm14 - vfmadd213ps __sA5_FMA(%rax), %zmm4, %zmm3 - vfmadd213ps %zmm13, %zmm13, %zmm14 - vfmadd213ps __sA3(%rax), %zmm4, %zmm3 - vpxord %zmm0, %zmm14, %zmm0 - vmulps %zmm4, %zmm3, %zmm3 - vfmadd213ps %zmm5, %zmm5, %zmm3 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - vmovups %zmm0, (%rdi) - vmovups %zmm3, (%rsi) - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovups %zmm2, 1152(%rsp) - vmovups %zmm0, 1216(%rsp) - vmovups %zmm3, 1280(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - kmovw %k4, 1048(%rsp) - xorl %eax, %eax - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %eax, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %ecx, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - movq %rbx, 1064(%rsp) - movq %rdi, %rbx - cfi_remember_state - -.LBL_1_6: - btl %r13d, %r14d - jc .LBL_1_13 - -.LBL_1_7: - lea 
1(%r13), %esi - btl %esi, %r14d - jc .LBL_1_10 - -.LBL_1_8: - addb $1, %r12b - addl $2, %r13d - cmpb $16, %r12b - jb .LBL_1_6 - - movq %rbx, %rdi - kmovw 1048(%rsp), %k4 - movq 1056(%rsp), %rsi - kmovw 1040(%rsp), %k5 - movq 1096(%rsp), %r12 - cfi_restore (%r12) - kmovw 1032(%rsp), %k6 - movq 1088(%rsp), %r13 - cfi_restore (%r13) - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - movq 1064(%rsp), %rbx - vmovups 1216(%rsp), %zmm0 - vmovups 1280(%rsp), %zmm3 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - vmovss 1156(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(sinf) - - vmovss %xmm0, 1220(%rsp,%r15,8) - vmovss 1156(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(cosf) - - vmovss %xmm0, 1284(%rsp,%r15,8) - jmp .LBL_1_8 - -.LBL_1_13: - movzbl %r12b, %r15d - vmovss 1152(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(sinf) - - vmovss %xmm0, 1216(%rsp,%r15,8) - vmovss 1152(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(cosf) - - vmovss %xmm0, 1280(%rsp,%r15,8) - jmp .LBL_1_7 -#endif -END (_ZGVeN16vl4l4_sincosf_knl) -libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_knl) - -ENTRY (_ZGVeN16vl4l4_sincosf_skx) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf /* same callee as the _knl fallback: this entry has the vl4l4 signature (results stored through %rdi/%rsi), so it wraps the vl4l4 AVX2 variant, not vvv */ -#else - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1344, %rsp - movq __svml_s_trig_data@GOTPCREL(%rip), %rax - vmovaps %zmm0, %zmm4 - vmovups __sAbsMask(%rax), %zmm3 - vmovups __sInvPI(%rax), %zmm5 - 
vmovups __sRShifter(%rax), %zmm6 - vmovups __sPI1_FMA(%rax), %zmm9 - vmovups __sPI2_FMA(%rax), %zmm10 - vmovups __sSignMask(%rax), %zmm14 - vmovups __sOneHalf(%rax), %zmm7 - vmovups __sPI3_FMA(%rax), %zmm12 - -/* Absolute argument computation */ - vandps %zmm3, %zmm4, %zmm2 - -/* c) Getting octant Y by 2/Pi multiplication - d) Add "Right Shifter" value */ - vfmadd213ps %zmm6, %zmm2, %zmm5 - vcmpps $18, __sRangeReductionVal(%rax), %zmm2, %k1 - -/* e) Treat obtained value as integer S for destination sign setting */ - vpslld $31, %zmm5, %zmm0 - -/* g) Subtract "Right Shifter" (0x4B000000) value */ - vsubps %zmm6, %zmm5, %zmm5 - vmovups __sA3(%rax), %zmm6 - -/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 */ - vmovaps %zmm2, %zmm11 - vfnmadd231ps %zmm5, %zmm9, %zmm11 - vfnmadd231ps %zmm5, %zmm10, %zmm11 - vandps %zmm11, %zmm14, %zmm1 - vxorps %zmm1, %zmm7, %zmm8 - -/* Result sign calculations */ - vpternlogd $150, %zmm0, %zmm14, %zmm1 - vmovups .L_2il0floatpacket.13(%rip), %zmm14 - -/* Add correction term 0.5 for cos() part */ - vaddps %zmm8, %zmm5, %zmm15 - vfnmadd213ps %zmm11, %zmm12, %zmm5 - vandnps %zmm4, %zmm3, %zmm11 - vmovups __sA7_FMA(%rax), %zmm3 - vmovaps %zmm2, %zmm13 - vfnmadd231ps %zmm15, %zmm9, %zmm13 - vxorps %zmm0, %zmm5, %zmm9 - vmovups __sA5_FMA(%rax), %zmm0 - vfnmadd231ps %zmm15, %zmm10, %zmm13 - vmulps %zmm9, %zmm9, %zmm8 - vfnmadd213ps %zmm13, %zmm12, %zmm15 - vmovups __sA9_FMA(%rax), %zmm12 - vxorps %zmm1, %zmm15, %zmm1 - vmulps %zmm1, %zmm1, %zmm13 - -/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) - a) Calculate X^2 = X * X - b) Calculate 2 polynomials for sin and cos: - RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); - RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */ - vmovaps %zmm12, %zmm7 - vfmadd213ps %zmm3, %zmm8, %zmm7 - vfmadd213ps %zmm3, %zmm13, %zmm12 - vfmadd213ps %zmm0, %zmm8, %zmm7 - vfmadd213ps %zmm0, %zmm13, %zmm12 - vfmadd213ps %zmm6, 
%zmm8, %zmm7 - vfmadd213ps %zmm6, %zmm13, %zmm12 - vmulps %zmm8, %zmm7, %zmm10 - vmulps %zmm13, %zmm12, %zmm3 - vfmadd213ps %zmm9, %zmm9, %zmm10 - vfmadd213ps %zmm1, %zmm1, %zmm3 - vxorps %zmm11, %zmm10, %zmm0 - vpandnd %zmm2, %zmm2, %zmm14{%k1} - vptestmd %zmm14, %zmm14, %k0 - kmovw %k0, %ecx - testl %ecx, %ecx - jne .LBL_2_3 - -.LBL_2_2: - cfi_remember_state - vmovups %zmm0, (%rdi) - vmovups %zmm3, (%rsi) - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_2_3: - cfi_restore_state - vmovups %zmm4, 1152(%rsp) - vmovups %zmm0, 1216(%rsp) - vmovups %zmm3, 1280(%rsp) - je .LBL_2_2 - - xorb %dl, %dl - xorl %eax, %eax - kmovw %k4, 1048(%rsp) - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %eax, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %ecx, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - movq %rbx, 1064(%rsp) - movq %rdi, %rbx - cfi_remember_state - -.LBL_2_6: - btl %r13d, %r14d - jc .LBL_2_13 - -.LBL_2_7: - lea 1(%r13), %esi - btl %esi, %r14d - jc .LBL_2_10 - -.LBL_2_8: - incb %r12b - addl $2, %r13d - cmpb $16, %r12b - jb .LBL_2_6 - - kmovw 1048(%rsp), %k4 - movq %rbx, %rdi - kmovw 1040(%rsp), %k5 - kmovw 1032(%rsp), %k6 - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 
768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - vmovups 1216(%rsp), %zmm0 - vmovups 1280(%rsp), %zmm3 - movq 1056(%rsp), %rsi - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - cfi_restore (%r13) - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - movq 1064(%rsp), %rbx - jmp .LBL_2_2 - -.LBL_2_10: - cfi_restore_state - movzbl %r12b, %r15d - vmovss 1156(%rsp,%r15,8), %xmm0 - vzeroupper - vmovss 1156(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(sinf) - - vmovss %xmm0, 1220(%rsp,%r15,8) - vmovss 1156(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(cosf) - - vmovss %xmm0, 1284(%rsp,%r15,8) - jmp .LBL_2_8 - -.LBL_2_13: - movzbl %r12b, %r15d - vmovss 1152(%rsp,%r15,8), %xmm0 - vzeroupper - vmovss 1152(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(sinf) - - vmovss %xmm0, 1216(%rsp,%r15,8) - vmovss 1152(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(cosf) - - vmovss %xmm0, 1280(%rsp,%r15,8) - jmp .LBL_2_7 -#endif -END (_ZGVeN16vl4l4_sincosf_skx) -libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx) - -/* Wrapper between vvv and vl4l4 vector variants. */ -.macro WRAPPER_AVX512_vvv_vl4l4 callee -#ifndef __ILP32__ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $384, %rsp - /* Encoding for vmovups %zmm1, 128(%rsp). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x11 - .byte 0x4c - .byte 0x24 - .byte 0x02 - lea (%rsp), %rdi - /* Encoding for vmovups %zmm2, 192(%rdi). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x11 - .byte 0x57 - .byte 0x03 - /* Encoding for vmovups %zmm3, 256(%rdi). 
*/ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x11 - .byte 0x5f - .byte 0x04 - /* Encoding for vmovups %zmm4, 320(%rdi). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x11 - .byte 0x67 - .byte 0x05 - lea 64(%rsp), %rsi - call HIDDEN_JUMPTARGET(\callee) - movq 128(%rsp), %rdx - movq 136(%rsp), %rsi - movq 144(%rsp), %r8 - movq 152(%rsp), %r10 - movl (%rsp), %eax - movl 4(%rsp), %ecx - movl 8(%rsp), %edi - movl 12(%rsp), %r9d - movl %eax, (%rdx) - movl %ecx, (%rsi) - movq 160(%rsp), %rax - movq 168(%rsp), %rcx - movl %edi, (%r8) - movl %r9d, (%r10) - movq 176(%rsp), %rdi - movq 184(%rsp), %r9 - movl 16(%rsp), %r11d - movl 20(%rsp), %edx - movl 24(%rsp), %esi - movl 28(%rsp), %r8d - movl %r11d, (%rax) - movl %edx, (%rcx) - movq 192(%rsp), %r11 - movq 200(%rsp), %rdx - movl %esi, (%rdi) - movl %r8d, (%r9) - movq 208(%rsp), %rsi - movq 216(%rsp), %r8 - movl 32(%rsp), %r10d - movl 36(%rsp), %eax - movl 40(%rsp), %ecx - movl 44(%rsp), %edi - movl %r10d, (%r11) - movl %eax, (%rdx) - movq 224(%rsp), %r10 - movq 232(%rsp), %rax - movl %ecx, (%rsi) - movl %edi, (%r8) - movq 240(%rsp), %rcx - movq 248(%rsp), %rdi - movl 48(%rsp), %r9d - movl 52(%rsp), %r11d - movl 56(%rsp), %edx - movl 60(%rsp), %esi - movl %r9d, (%r10) - movl %r11d, (%rax) - movq 256(%rsp), %r9 - movq 264(%rsp), %r11 - movl %edx, (%rcx) - movl %esi, (%rdi) - movq 272(%rsp), %rdx - movq 280(%rsp), %rsi - movl 64(%rsp), %r8d - movl 68(%rsp), %r10d - movl 72(%rsp), %eax - movl 76(%rsp), %ecx - movl %r8d, (%r9) - movl %r10d, (%r11) - movq 288(%rsp), %r8 - movq 296(%rsp), %r10 - movl %eax, (%rdx) - movl %ecx, (%rsi) - movq 304(%rsp), %rax - movq 312(%rsp), %rcx - movl 80(%rsp), %edi - movl 84(%rsp), %r9d - movl 88(%rsp), %r11d - movl 92(%rsp), %edx - movl %edi, (%r8) - movl %r9d, (%r10) - movq 320(%rsp), %rdi - movq 328(%rsp), %r9 - movl %r11d, (%rax) - movl %edx, (%rcx) - movq 336(%rsp), %r11 - movq 344(%rsp), %rdx - movl 96(%rsp), %esi - movl 100(%rsp), %r8d - movl 104(%rsp), 
%r10d - movl 108(%rsp), %eax - movl %esi, (%rdi) - movl %r8d, (%r9) - movq 352(%rsp), %rsi - movq 360(%rsp), %r8 - movl %r10d, (%r11) - movl %eax, (%rdx) - movq 368(%rsp), %r10 - movq 376(%rsp), %rax - movl 112(%rsp), %ecx - movl 116(%rsp), %edi - movl 120(%rsp), %r9d - movl 124(%rsp), %r11d - movl %ecx, (%rsi) - movl %edi, (%r8) - movl %r9d, (%r10) - movl %r11d, (%rax) - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret -#else - leal 8(%rsp), %r10d - .cfi_def_cfa 10, 0 - andl $-64, %esp - pushq -8(%r10d) - pushq %rbp - .cfi_escape 0x10,0x6,0x2,0x76,0 - movl %esp, %ebp - pushq %r10 - .cfi_escape 0xf,0x3,0x76,0x78,0x6 - leal -112(%rbp), %esi - leal -176(%rbp), %edi - subl $296, %esp - /* Encoding for vmovdqa64 %zmm1, -240(%ebp). */ - .byte 0x67 - .byte 0x62 - .byte 0xf1 - .byte 0xfd - .byte 0x48 - .byte 0x7f - .byte 0x8d - .byte 0x10 - .byte 0xff - .byte 0xff - .byte 0xff - /* Encoding for vmovdqa64 %zmm2, -304(%ebp). */ - .byte 0x67 - .byte 0x62 - .byte 0xf1 - .byte 0xfd - .byte 0x48 - .byte 0x7f - .byte 0x95 - .byte 0xd0 - .byte 0xfe - .byte 0xff - .byte 0xff - call HIDDEN_JUMPTARGET(\callee) - movl -240(%ebp), %eax - vmovss -176(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -236(%ebp), %eax - vmovss -172(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -232(%ebp), %eax - vmovss -168(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -228(%ebp), %eax - vmovss -164(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -224(%ebp), %eax - vmovss -160(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -220(%ebp), %eax - vmovss -156(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -216(%ebp), %eax - vmovss -152(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -212(%ebp), %eax - vmovss -148(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -208(%ebp), %eax - vmovss -144(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -204(%ebp), %eax - vmovss -140(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -200(%ebp), %eax - vmovss -136(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl 
-196(%ebp), %eax - vmovss -132(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -192(%ebp), %eax - vmovss -128(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -188(%ebp), %eax - vmovss -124(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -184(%ebp), %eax - vmovss -120(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -180(%ebp), %eax - vmovss -116(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -304(%ebp), %eax - vmovss -112(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -300(%ebp), %eax - vmovss -108(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -296(%ebp), %eax - vmovss -104(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -292(%ebp), %eax - vmovss -100(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -288(%ebp), %eax - vmovss -96(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -284(%ebp), %eax - vmovss -92(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -280(%ebp), %eax - vmovss -88(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -276(%ebp), %eax - vmovss -84(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -272(%ebp), %eax - vmovss -80(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -268(%ebp), %eax - vmovss -76(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -264(%ebp), %eax - vmovss -72(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -260(%ebp), %eax - vmovss -68(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -256(%ebp), %eax - vmovss -64(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -252(%ebp), %eax - vmovss -60(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -248(%ebp), %eax - vmovss -56(%ebp), %xmm0 - vmovss %xmm0, (%eax) - movl -244(%ebp), %eax - vmovss -52(%ebp), %xmm0 - vmovss %xmm0, (%eax) - addl $296, %esp - popq %r10 - .cfi_def_cfa 10, 0 - popq %rbp - leal -8(%r10), %esp - .cfi_def_cfa 7, 8 - ret -#endif -.endm - -ENTRY (_ZGVeN16vvv_sincosf_knl) -WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_knl -END (_ZGVeN16vvv_sincosf_knl) - -ENTRY (_ZGVeN16vvv_sincosf_skx) -WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx -END (_ZGVeN16vvv_sincosf_skx) - - .section .rodata, "a" -.L_2il0floatpacket.13: - .long 
0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff - .type .L_2il0floatpacket.13,@object diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S deleted file mode 100644 index a249be33d1..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized sincosf. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN4vvv_sincosf) - .type _ZGVbN4vvv_sincosf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN4vvv_sincosf_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN4vvv_sincosf_sse2(%rip), %rax - ret -END (_ZGVbN4vvv_sincosf) -libmvec_hidden_def (_ZGVbN4vvv_sincosf) - -#define _ZGVbN4vvv_sincosf _ZGVbN4vvv_sincosf_sse2 -#include "../svml_s_sincosf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S deleted file mode 100644 index 74a6ac1157..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S +++ /dev/null @@ -1,346 +0,0 @@ -/* Function sincosf vectorized with SSE4. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_s_trig_data.h" - - .text -ENTRY (_ZGVbN4vl4l4_sincosf_sse4) -/* - ALGORITHM DESCRIPTION: - - 1) Range reduction to [-Pi/4; +Pi/4] interval - a) Grab sign from source argument and save it. - b) Remove sign using AND operation - c) Getting octant Y by 2/Pi multiplication - d) Add "Right Shifter" value - e) Treat obtained value as integer S for destination sign setting. 
- SS = ((S-S&1)&2)<<30; For sin part - SC = ((S+S&1)&2)<<30; For cos part - f) Change destination sign if source sign is negative - using XOR operation. - g) Subtract "Right Shifter" (0x4B000000) value - h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; - 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) - a) Calculate X^2 = X * X - b) Calculate 2 polynomials for sin and cos: - RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); - RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))); - c) Swap RS & RC if first bit of obtained value after - Right Shifting is set to 1. Using And, Andnot & Or operations. - 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R1 = XOR( RS, SS ); - R2 = XOR( RC, SC ). */ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $320, %rsp - movq __svml_s_trig_data@GOTPCREL(%rip), %rax - movups %xmm12, 176(%rsp) - movups %xmm9, 160(%rsp) - movups __sAbsMask(%rax), %xmm12 - -/* Absolute argument computation */ - movaps %xmm12, %xmm5 - andnps %xmm0, %xmm12 - movups __sInvPI(%rax), %xmm7 - andps %xmm0, %xmm5 - -/* c) Getting octant Y by 2/Pi multiplication - d) Add "Right Shifter" value. */ - mulps %xmm5, %xmm7 - movups %xmm10, 144(%rsp) - movups __sPI1(%rax), %xmm10 - -/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4. 
*/ - movaps %xmm10, %xmm1 - addps __sRShifter(%rax), %xmm7 - -/* e) Treat obtained value as integer S for destination sign setting */ - movaps %xmm7, %xmm9 - -/* g) Subtract "Right Shifter" (0x4B000000) value */ - subps __sRShifter(%rax), %xmm7 - mulps %xmm7, %xmm1 - pslld $31, %xmm9 - movups __sPI2(%rax), %xmm6 - movups %xmm13, 112(%rsp) - movaps %xmm5, %xmm13 - movaps %xmm6, %xmm2 - subps %xmm1, %xmm13 - mulps %xmm7, %xmm2 - movups __sSignMask(%rax), %xmm3 - movaps %xmm5, %xmm1 - movups __sOneHalf(%rax), %xmm4 - subps %xmm2, %xmm13 - cmpnleps __sRangeReductionVal(%rax), %xmm5 - movaps %xmm3, %xmm2 - andps %xmm13, %xmm2 - xorps %xmm2, %xmm4 - -/* Result sign calculations */ - xorps %xmm2, %xmm3 - xorps %xmm9, %xmm3 - -/* Add correction term 0.5 for cos() part */ - addps %xmm7, %xmm4 - movmskps %xmm5, %ecx - mulps %xmm4, %xmm10 - mulps %xmm4, %xmm6 - subps %xmm10, %xmm1 - movups __sPI3(%rax), %xmm10 - subps %xmm6, %xmm1 - movaps %xmm10, %xmm6 - mulps %xmm7, %xmm6 - mulps %xmm4, %xmm10 - subps %xmm6, %xmm13 - subps %xmm10, %xmm1 - movups __sPI4(%rax), %xmm6 - mulps %xmm6, %xmm7 - mulps %xmm6, %xmm4 - subps %xmm7, %xmm13 - subps %xmm4, %xmm1 - xorps %xmm9, %xmm13 - xorps %xmm3, %xmm1 - movaps %xmm13, %xmm4 - movaps %xmm1, %xmm2 - mulps %xmm13, %xmm4 - mulps %xmm1, %xmm2 - movups __sA9(%rax), %xmm7 - -/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) - a) Calculate X^2 = X * X - b) Calculate 2 polynomials for sin and cos: - RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); - RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */ - movaps %xmm7, %xmm3 - mulps %xmm4, %xmm3 - mulps %xmm2, %xmm7 - addps __sA7(%rax), %xmm3 - addps __sA7(%rax), %xmm7 - mulps %xmm4, %xmm3 - mulps %xmm2, %xmm7 - addps __sA5(%rax), %xmm3 - addps __sA5(%rax), %xmm7 - mulps %xmm4, %xmm3 - mulps %xmm2, %xmm7 - addps __sA3(%rax), %xmm3 - addps __sA3(%rax), %xmm7 - mulps %xmm3, %xmm4 - mulps %xmm7, %xmm2 - mulps %xmm13, %xmm4 - mulps %xmm1, %xmm2 - addps %xmm4, %xmm13 - 
addps %xmm2, %xmm1 - xorps %xmm12, %xmm13 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movups 160(%rsp), %xmm9 - movaps %xmm13, (%rdi) - movups 144(%rsp), %xmm10 - movups 176(%rsp), %xmm12 - movups 112(%rsp), %xmm13 - movups %xmm1, (%rsi) - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - movups %xmm0, 128(%rsp) - movups %xmm13, 192(%rsp) - movups %xmm1, 256(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - movups %xmm8, 48(%rsp) - movups %xmm11, 32(%rsp) - movups %xmm14, 16(%rsp) - movups %xmm15, (%rsp) - movq %rsi, 64(%rsp) - movq %r12, 104(%rsp) - cfi_offset_rel_rsp (12, 104) - movb %dl, %r12b - movq %r13, 96(%rsp) - cfi_offset_rel_rsp (13, 96) - movl %eax, %r13d - movq %r14, 88(%rsp) - cfi_offset_rel_rsp (14, 88) - movl %ecx, %r14d - movq %r15, 80(%rsp) - cfi_offset_rel_rsp (15, 80) - movq %rbx, 72(%rsp) - movq %rdi, %rbx - cfi_remember_state - -.LBL_1_6: - btl %r13d, %r14d - jc .LBL_1_13 - -.LBL_1_7: - lea 1(%r13), %esi - btl %esi, %r14d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r13d - cmpb $16, %r12b - jb .LBL_1_6 - - movups 48(%rsp), %xmm8 - movq %rbx, %rdi - movups 32(%rsp), %xmm11 - movups 16(%rsp), %xmm14 - movups (%rsp), %xmm15 - movq 64(%rsp), %rsi - movq 104(%rsp), %r12 - cfi_restore (%r12) - movq 96(%rsp), %r13 - cfi_restore (%r13) - movq 88(%rsp), %r14 - cfi_restore (%r14) - movq 80(%rsp), %r15 - cfi_restore (%r15) - movq 72(%rsp), %rbx - movups 192(%rsp), %xmm13 - movups 256(%rsp), %xmm1 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - movss 132(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(sinf) - - movss %xmm0, 196(%rsp,%r15,8) - movss 132(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(cosf) - - movss %xmm0, 260(%rsp,%r15,8) - jmp .LBL_1_8 - -.LBL_1_13: - movzbl %r12b, %r15d - movss 128(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(sinf) - - movss %xmm0, 192(%rsp,%r15,8) - movss 128(%rsp,%r15,8), 
%xmm0 - - call JUMPTARGET(cosf) - - movss %xmm0, 256(%rsp,%r15,8) - jmp .LBL_1_7 - -END (_ZGVbN4vl4l4_sincosf_sse4) -libmvec_hidden_def(_ZGVbN4vl4l4_sincosf_sse4) - -/* vvv version implemented with wrapper to vl4l4 variant. */ -ENTRY (_ZGVbN4vvv_sincosf_sse4) -#ifndef __ILP32__ - subq $104, %rsp - .cfi_def_cfa_offset 112 - movdqu %xmm1, 32(%rsp) - lea (%rsp), %rdi - movdqu %xmm2, 48(%rdi) - lea 16(%rsp), %rsi - movdqu %xmm3, 48(%rsi) - movdqu %xmm4, 64(%rsi) - call HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4) - movq 32(%rsp), %rdx - movq 40(%rsp), %rsi - movq 48(%rsp), %r8 - movq 56(%rsp), %r10 - movl (%rsp), %eax - movl 4(%rsp), %ecx - movl 8(%rsp), %edi - movl 12(%rsp), %r9d - movl %eax, (%rdx) - movl %ecx, (%rsi) - movq 64(%rsp), %rax - movq 72(%rsp), %rcx - movl %edi, (%r8) - movl %r9d, (%r10) - movq 80(%rsp), %rdi - movq 88(%rsp), %r9 - movl 16(%rsp), %r11d - movl 20(%rsp), %edx - movl 24(%rsp), %esi - movl 28(%rsp), %r8d - movl %r11d, (%rax) - movl %edx, (%rcx) - movl %esi, (%rdi) - movl %r8d, (%r9) - addq $104, %rsp - .cfi_def_cfa_offset 8 - ret -#else - subl $72, %esp - .cfi_def_cfa_offset 80 - leal 48(%rsp), %esi - movaps %xmm1, 16(%esp) - leal 32(%rsp), %edi - movaps %xmm2, (%esp) - call HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4) - movl 16(%esp), %eax - movss 32(%esp), %xmm0 - movss %xmm0, (%eax) - movl 20(%esp), %eax - movss 36(%esp), %xmm0 - movss %xmm0, (%eax) - movl 24(%esp), %eax - movss 40(%esp), %xmm0 - movss %xmm0, (%eax) - movl 28(%esp), %eax - movss 44(%esp), %xmm0 - movss %xmm0, (%eax) - movl (%esp), %eax - movss 48(%esp), %xmm0 - movss %xmm0, (%eax) - movl 4(%esp), %eax - movss 52(%esp), %xmm0 - movss %xmm0, (%eax) - movl 8(%esp), %eax - movss 56(%esp), %xmm0 - movss %xmm0, (%eax) - movl 12(%esp), %eax - movss 60(%esp), %xmm0 - movss %xmm0, (%eax) - addl $72, %esp - .cfi_def_cfa_offset 8 - ret -#endif -END (_ZGVbN4vvv_sincosf_sse4) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S 
b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S deleted file mode 100644 index 320fd861a5..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized sincosf. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN8vvv_sincosf) - .type _ZGVdN8vvv_sincosf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN8vvv_sincosf_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN8vvv_sincosf_sse_wrapper(%rip), %rax - ret -END (_ZGVdN8vvv_sincosf) -libmvec_hidden_def (_ZGVdN8vvv_sincosf) - -#define _ZGVdN8vvv_sincosf _ZGVdN8vvv_sincosf_sse_wrapper -#include "../svml_s_sincosf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S deleted file mode 100644 index 9e4e2c71c5..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S +++ /dev/null @@ -1,389 +0,0 @@ -/* Function sincosf vectorized with AVX2. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_s_trig_data.h" - - .text -ENTRY (_ZGVdN8vl4l4_sincosf_avx2) -/* - ALGORITHM DESCRIPTION: - - 1) Range reduction to [-Pi/4; +Pi/4] interval - a) Grab sign from source argument and save it. - b) Remove sign using AND operation - c) Getting octant Y by 2/Pi multiplication - d) Add "Right Shifter" value - e) Treat obtained value as integer S for destination sign setting. - SS = ((S-S&1)&2)<<30; For sin part - SC = ((S+S&1)&2)<<30; For cos part - f) Change destination sign if source sign is negative - using XOR operation. - g) Subtract "Right Shifter" (0x4B000000) value - h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3; - 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) - a) Calculate X^2 = X * X - b) Calculate 2 polynomials for sin and cos: - RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); - RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))); - c) Swap RS & RC if first bit of obtained value after - Right Shifting is set to 1. Using And, Andnot & Or operations. - 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R1 = XOR( RS, SS ); - R2 = XOR( RC, SC ). 
*/ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $448, %rsp - movq __svml_s_trig_data@GOTPCREL(%rip), %rax - vmovdqa %ymm0, %ymm5 - vmovups %ymm13, 352(%rsp) - vmovups __sAbsMask(%rax), %ymm2 - vmovups __sInvPI(%rax), %ymm1 - vmovups __sPI1_FMA(%rax), %ymm13 - vmovups %ymm15, 288(%rsp) - -/* Absolute argument computation */ - vandps %ymm2, %ymm5, %ymm4 - -/* c) Getting octant Y by 2/Pi multiplication - d) Add "Right Shifter" value */ - vfmadd213ps __sRShifter(%rax), %ymm4, %ymm1 - -/* e) Treat obtained value as integer S for destination sign setting */ - vpslld $31, %ymm1, %ymm0 - -/* g) Subtract "Right Shifter" (0x4B000000) value */ - vsubps __sRShifter(%rax), %ymm1, %ymm1 - -/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 */ - vmovdqa %ymm4, %ymm7 - vfnmadd231ps %ymm1, %ymm13, %ymm7 - vfnmadd231ps __sPI2_FMA(%rax), %ymm1, %ymm7 - vandps __sSignMask(%rax), %ymm7, %ymm15 - vxorps __sOneHalf(%rax), %ymm15, %ymm6 - -/* Add correction term 0.5 for cos() part */ - vaddps %ymm6, %ymm1, %ymm6 - vmovdqa %ymm4, %ymm3 - vfnmadd231ps %ymm6, %ymm13, %ymm3 - vmovups __sPI3_FMA(%rax), %ymm13 - vcmpnle_uqps __sRangeReductionVal(%rax), %ymm4, %ymm4 - vfnmadd231ps __sPI2_FMA(%rax), %ymm6, %ymm3 - vfnmadd213ps %ymm7, %ymm13, %ymm1 - vfnmadd213ps %ymm3, %ymm13, %ymm6 - -/* Result sign calculations */ - vxorps __sSignMask(%rax), %ymm15, %ymm3 - vxorps %ymm0, %ymm3, %ymm7 - vxorps %ymm7, %ymm6, %ymm3 - vxorps %ymm0, %ymm1, %ymm15 - vandnps %ymm5, %ymm2, %ymm6 - vmovups __sA7_FMA(%rax), %ymm2 - vmulps %ymm15, %ymm15, %ymm13 - vmovups __sA9_FMA(%rax), %ymm7 - vmulps %ymm3, %ymm3, %ymm1 - -/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) - a) Calculate X^2 = X * X - b) Calculate 2 polynomials for sin and cos: - RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); - RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * 
(B4)))) */ - vmovdqa %ymm2, %ymm0 - vfmadd231ps __sA9_FMA(%rax), %ymm13, %ymm0 - vfmadd213ps %ymm2, %ymm1, %ymm7 - vfmadd213ps __sA5_FMA(%rax), %ymm13, %ymm0 - vfmadd213ps __sA5_FMA(%rax), %ymm1, %ymm7 - vfmadd213ps __sA3(%rax), %ymm13, %ymm0 - vfmadd213ps __sA3(%rax), %ymm1, %ymm7 - vmulps %ymm13, %ymm0, %ymm13 - vmulps %ymm1, %ymm7, %ymm1 - vfmadd213ps %ymm15, %ymm15, %ymm13 - vfmadd213ps %ymm3, %ymm3, %ymm1 - vmovmskps %ymm4, %ecx - vxorps %ymm6, %ymm13, %ymm0 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - vmovups 352(%rsp), %ymm13 - vmovups 288(%rsp), %ymm15 - vmovups %ymm0, (%rdi) - vmovups %ymm1, (%rsi) - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovups %ymm5, 256(%rsp) - vmovups %ymm0, 320(%rsp) - vmovups %ymm1, 384(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - vmovups %ymm8, 160(%rsp) - vmovups %ymm9, 128(%rsp) - vmovups %ymm10, 96(%rsp) - vmovups %ymm11, 64(%rsp) - vmovups %ymm12, 32(%rsp) - vmovups %ymm14, (%rsp) - movq %rsi, 192(%rsp) - movq %r12, 232(%rsp) - cfi_offset_rel_rsp (12, 232) - movb %dl, %r12b - movq %r13, 224(%rsp) - cfi_offset_rel_rsp (13, 224) - movl %eax, %r13d - movq %r14, 216(%rsp) - cfi_offset_rel_rsp (14, 216) - movl %ecx, %r14d - movq %r15, 208(%rsp) - cfi_offset_rel_rsp (15, 208) /* %r15 was just saved at 208(%rsp); 14 would re-describe %r14, already recorded at 216 */ - movq %rbx, 200(%rsp) - movq %rdi, %rbx - cfi_remember_state - -.LBL_1_6: - btl %r13d, %r14d - jc .LBL_1_13 - -.LBL_1_7: - lea 1(%r13), %esi - btl %esi, %r14d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r13d - cmpb $16, %r12b - jb .LBL_1_6 - - vmovups 160(%rsp), %ymm8 - movq %rbx, %rdi - vmovups 128(%rsp), %ymm9 - vmovups 96(%rsp), %ymm10 - vmovups 64(%rsp), %ymm11 - vmovups 32(%rsp), %ymm12 - vmovups (%rsp), %ymm14 - vmovups 320(%rsp), %ymm0 - vmovups 384(%rsp), %ymm1 - movq 192(%rsp), %rsi - movq 232(%rsp), %r12 - cfi_restore (%r12) - movq 224(%rsp), %r13 - cfi_restore (%r13) - movq 216(%rsp), %r14 - 
cfi_restore (%r14) - movq 208(%rsp), %r15 - cfi_restore (%r15) - movq 200(%rsp), %rbx - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - vmovss 260(%rsp,%r15,8), %xmm0 - vzeroupper - - call JUMPTARGET(sinf) - - vmovss %xmm0, 324(%rsp,%r15,8) - vmovss 260(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(cosf) - - vmovss %xmm0, 388(%rsp,%r15,8) - jmp .LBL_1_8 - -.LBL_1_13: - movzbl %r12b, %r15d - vmovss 256(%rsp,%r15,8), %xmm0 - vzeroupper - - call JUMPTARGET(sinf) - - vmovss %xmm0, 320(%rsp,%r15,8) - vmovss 256(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(cosf) - - vmovss %xmm0, 384(%rsp,%r15,8) - jmp .LBL_1_7 - -END (_ZGVdN8vl4l4_sincosf_avx2) -libmvec_hidden_def(_ZGVdN8vl4l4_sincosf_avx2) - -/* vvv version implemented with wrapper to vl4l4 variant. */ -ENTRY (_ZGVdN8vvv_sincosf_avx2) -#ifndef __ILP32__ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-32, %rsp - subq $192, %rsp - vmovdqu %ymm1, 64(%rsp) - lea (%rsp), %rdi - vmovdqu %ymm2, 96(%rdi) - vmovdqu %ymm3, 128(%rdi) - vmovdqu %ymm4, 160(%rdi) - lea 32(%rsp), %rsi - call HIDDEN_JUMPTARGET(_ZGVdN8vl4l4_sincosf_avx2) - movq 64(%rsp), %rdx - movq 72(%rsp), %rsi - movq 80(%rsp), %r8 - movq 88(%rsp), %r10 - movl (%rsp), %eax - movl 4(%rsp), %ecx - movl 8(%rsp), %edi - movl 12(%rsp), %r9d - movl %eax, (%rdx) - movl %ecx, (%rsi) - movq 96(%rsp), %rax - movq 104(%rsp), %rcx - movl %edi, (%r8) - movl %r9d, (%r10) - movq 112(%rsp), %rdi - movq 120(%rsp), %r9 - movl 16(%rsp), %r11d - movl 20(%rsp), %edx - movl 24(%rsp), %esi - movl 28(%rsp), %r8d - movl %r11d, (%rax) - movl %edx, (%rcx) - movq 128(%rsp), %r11 - movq 136(%rsp), %rdx - movl %esi, (%rdi) - movl %r8d, (%r9) - movq 144(%rsp), %rsi - movq 152(%rsp), %r8 - movl 32(%rsp), %r10d - movl 36(%rsp), %eax - movl 40(%rsp), %ecx - movl 44(%rsp), %edi - movl %r10d, (%r11) - movl %eax, (%rdx) - movq 160(%rsp), %r10 - movq 168(%rsp), %rax - movl %ecx, (%rsi) - movl %edi, (%r8) - movq 
176(%rsp), %rcx - movq 184(%rsp), %rdi - movl 48(%rsp), %r9d - movl 52(%rsp), %r11d - movl 56(%rsp), %edx - movl 60(%rsp), %esi - movl %r9d, (%r10) - movl %r11d, (%rax) - movl %edx, (%rcx) - movl %esi, (%rdi) - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret -#else - leal 8(%rsp), %r10d - .cfi_def_cfa 10, 0 - andl $-32, %esp - pushq -8(%r10d) - pushq %rbp - .cfi_escape 0x10,0x6,0x2,0x76,0 - movl %esp, %ebp - pushq %r10 - .cfi_escape 0xf,0x3,0x76,0x78,0x6 - leal -48(%rbp), %esi - leal -80(%rbp), %edi - subl $136, %esp - vmovdqa %ymm1, -112(%ebp) - vmovdqa %ymm2, -144(%ebp) - call HIDDEN_JUMPTARGET(_ZGVdN8vl4l4_sincosf_avx2) - vmovdqa -112(%ebp), %xmm0 - vmovq %xmm0, %rax - vmovss -80(%ebp), %xmm0 - vmovss %xmm0, (%eax) - vmovss -76(%ebp), %xmm0 - shrq $32, %rax - vmovss %xmm0, (%eax) - movq -104(%ebp), %rax - vmovss -72(%ebp), %xmm0 - vmovss %xmm0, (%eax) - vmovss -68(%ebp), %xmm0 - shrq $32, %rax - vmovss %xmm0, (%eax) - movq -96(%ebp), %rax - vmovss -64(%ebp), %xmm0 - vmovss %xmm0, (%eax) - vmovss -60(%ebp), %xmm0 - shrq $32, %rax - vmovss %xmm0, (%eax) - movq -88(%ebp), %rax - vmovss -56(%ebp), %xmm0 - vmovss %xmm0, (%eax) - vmovss -52(%ebp), %xmm0 - shrq $32, %rax - vmovss %xmm0, (%eax) - vmovdqa -144(%ebp), %xmm0 - vmovq %xmm0, %rax - vmovss -48(%ebp), %xmm0 - vmovss %xmm0, (%eax) - vmovss -44(%ebp), %xmm0 - shrq $32, %rax - vmovss %xmm0, (%eax) - movq -136(%ebp), %rax - vmovss -40(%ebp), %xmm0 - vmovss %xmm0, (%eax) - vmovss -36(%ebp), %xmm0 - shrq $32, %rax - vmovss %xmm0, (%eax) - movq -128(%ebp), %rax - vmovss -32(%ebp), %xmm0 - vmovss %xmm0, (%eax) - vmovss -28(%ebp), %xmm0 - shrq $32, %rax - vmovss %xmm0, (%eax) - movq -120(%ebp), %rax - vmovss -24(%ebp), %xmm0 - vmovss %xmm0, (%eax) - vmovss -20(%ebp), %xmm0 - shrq $32, %rax - vmovss %xmm0, (%eax) - addl $136, %esp - popq %r10 - .cfi_def_cfa 10, 0 - popq %rbp - leal -8(%r10), %esp - .cfi_def_cfa 7, 8 - ret -#endif -END 
(_ZGVdN8vvv_sincosf_avx2) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S deleted file mode 100644 index 2c18dbce53..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized sinf. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN16v_sinf) - .type _ZGVeN16v_sinf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN16v_sinf_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN16v_sinf_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN16v_sinf) - -#define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper -#include "../svml_s_sinf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S deleted file mode 100644 index 8670673a29..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S +++ /dev/null @@ -1,479 +0,0 @@ -/* Function sinf vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2017 Free Software Foundation, Inc. 
- This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_s_trig_data.h" -#include "svml_s_wrapper_impl.h" - - .text -ENTRY(_ZGVeN16v_sinf_knl) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf -#else -/* - ALGORITHM DESCRIPTION: - - 1) Range reduction to [-Pi/2; +Pi/2] interval - a) Grab sign from source argument and save it. - b) Remove sign using AND operation - c) Getting octant Y by 1/Pi multiplication - d) Add "Right Shifter" value - e) Treat obtained value as integer for destination sign setting. - Shift first bit of this value to the last (sign) position - f) Change destination sign if source sign is negative - using XOR operation. - g) Subtract "Right Shifter" value - h) Subtract Y*PI from X argument, where PI divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; - 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) - a) Calculate X^2 = X * X - b) Calculate polynomial: - R = X + X * X^2 * (A3 + x^2 * (A5 + ...... 
- 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R = XOR( R, S ); - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1280, %rsp - movq __svml_s_trig_data@GOTPCREL(%rip), %rax - -/* Check for large and special values */ - movl $-1, %edx - vmovups __sAbsMask(%rax), %zmm4 - vmovups __sInvPI(%rax), %zmm1 - -/* b) Remove sign using AND operation */ - vpandd %zmm4, %zmm0, %zmm12 - vmovups __sPI1_FMA(%rax), %zmm2 - vmovups __sA9(%rax), %zmm7 - -/* - f) Change destination sign if source sign is negative - using XOR operation. - */ - vpandnd %zmm0, %zmm4, %zmm11 - -/* - h) Subtract Y*PI from X argument, where PI divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3; - */ - vmovaps %zmm12, %zmm3 - -/* - c) Getting octant Y by 1/Pi multiplication - d) Add "Right Shifter" value - */ - vfmadd213ps __sRShifter(%rax), %zmm12, %zmm1 - vcmpps $22, __sRangeReductionVal(%rax), %zmm12, %k1 - vpbroadcastd %edx, %zmm13{%k1}{z} - -/* g) Subtract "Right Shifter" value */ - vsubps __sRShifter(%rax), %zmm1, %zmm5 - -/* - e) Treat obtained value as integer for destination sign setting. - Shift first bit of this value to the last (sign) position - */ - vpslld $31, %zmm1, %zmm6 - vptestmd %zmm13, %zmm13, %k0 - vfnmadd231ps %zmm5, %zmm2, %zmm3 - kmovw %k0, %ecx - vfnmadd231ps __sPI2_FMA(%rax), %zmm5, %zmm3 - vfnmadd132ps __sPI3_FMA(%rax), %zmm3, %zmm5 - -/* - 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) - a) Calculate X^2 = X * X - b) Calculate polynomial: - R = X + X * X^2 * (A3 + x^2 * (A5 + ...... 
- */ - vmulps %zmm5, %zmm5, %zmm8 - vpxord %zmm6, %zmm5, %zmm9 - vfmadd213ps __sA7(%rax), %zmm8, %zmm7 - vfmadd213ps __sA5(%rax), %zmm8, %zmm7 - vfmadd213ps __sA3(%rax), %zmm8, %zmm7 - vmulps %zmm8, %zmm7, %zmm10 - vfmadd213ps %zmm9, %zmm9, %zmm10 - -/* - 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R = XOR( R, S ); - */ - vpxord %zmm11, %zmm10, %zmm1 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - vmovaps %zmm1, %zmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovups %zmm0, 1152(%rsp) - vmovups %zmm1, 1216(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - kmovw %k4, 1048(%rsp) - xorl %eax, %eax - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - addb $1, %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - kmovw 1048(%rsp), %k4 - movq 1064(%rsp), %rsi - kmovw 1040(%rsp), %k5 - movq 1056(%rsp), %rdi - kmovw 1032(%rsp), %k6 - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 
1088(%rsp), %r13 - cfi_restore (%r13) - kmovw 1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - vmovups 1216(%rsp), %zmm1 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - vmovss 1156(%rsp,%r15,8), %xmm0 - call JUMPTARGET(sinf) - vmovss %xmm0, 1220(%rsp,%r15,8) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - vmovss 1152(%rsp,%r15,8), %xmm0 - call JUMPTARGET(sinf) - vmovss %xmm0, 1216(%rsp,%r15,8) - jmp .LBL_1_7 -#endif -END(_ZGVeN16v_sinf_knl) - -ENTRY (_ZGVeN16v_sinf_skx) -#ifndef HAVE_AVX512DQ_ASM_SUPPORT -WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf -#else -/* - ALGORITHM DESCRIPTION: - - 1) Range reduction to [-Pi/2; +Pi/2] interval - a) Grab sign from source argument and save it. - b) Remove sign using AND operation - c) Getting octant Y by 1/Pi multiplication - d) Add "Right Shifter" value - e) Treat obtained value as integer for destination sign setting. - Shift first bit of this value to the last (sign) position - f) Change destination sign if source sign is negative - using XOR operation. - g) Subtract "Right Shifter" value - h) Subtract Y*PI from X argument, where PI divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; - 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) - a) Calculate X^2 = X * X - b) Calculate polynomial: - R = X + X * X^2 * (A3 + x^2 * (A5 + ...... 
- 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R = XOR( R, S ); - */ - - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $1280, %rsp - movq __svml_s_trig_data@GOTPCREL(%rip), %rax - -/* Check for large and special values */ - vmovups .L_2il0floatpacket.11(%rip), %zmm14 - vmovups __sAbsMask(%rax), %zmm5 - vmovups __sInvPI(%rax), %zmm1 - vmovups __sRShifter(%rax), %zmm2 - vmovups __sPI1_FMA(%rax), %zmm3 - vmovups __sA9(%rax), %zmm8 - -/* b) Remove sign using AND operation */ - vandps %zmm5, %zmm0, %zmm13 - -/* - f) Change destination sign if source sign is negative - using XOR operation. - */ - vandnps %zmm0, %zmm5, %zmm12 - -/* - c) Getting octant Y by 1/Pi multiplication - d) Add "Right Shifter" value - */ - vfmadd213ps %zmm2, %zmm13, %zmm1 - vcmpps $18, __sRangeReductionVal(%rax), %zmm13, %k1 - -/* - e) Treat obtained value as integer for destination sign setting. - Shift first bit of this value to the last (sign) position - */ - vpslld $31, %zmm1, %zmm7 - -/* g) Subtract "Right Shifter" value */ - vsubps %zmm2, %zmm1, %zmm6 - -/* - h) Subtract Y*PI from X argument, where PI divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3; - */ - vmovaps %zmm13, %zmm4 - vfnmadd231ps %zmm6, %zmm3, %zmm4 - vfnmadd231ps __sPI2_FMA(%rax), %zmm6, %zmm4 - vfnmadd132ps __sPI3_FMA(%rax), %zmm4, %zmm6 - -/* - 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) - a) Calculate X^2 = X * X - b) Calculate polynomial: - R = X + X * X^2 * (A3 + x^2 * (A5 + ...... 
- */ - vmulps %zmm6, %zmm6, %zmm9 - vxorps %zmm7, %zmm6, %zmm10 - vfmadd213ps __sA7(%rax), %zmm9, %zmm8 - vfmadd213ps __sA5(%rax), %zmm9, %zmm8 - vfmadd213ps __sA3(%rax), %zmm9, %zmm8 - vmulps %zmm9, %zmm8, %zmm11 - vfmadd213ps %zmm10, %zmm10, %zmm11 - -/* - 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R = XOR( R, S ); - */ - vxorps %zmm12, %zmm11, %zmm1 - vpandnd %zmm13, %zmm13, %zmm14{%k1} - vptestmd %zmm14, %zmm14, %k0 - kmovw %k0, %ecx - testl %ecx, %ecx - jne .LBL_2_3 - -.LBL_2_2: - cfi_remember_state - vmovaps %zmm1, %zmm0 - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_2_3: - cfi_restore_state - vmovups %zmm0, 1152(%rsp) - vmovups %zmm1, 1216(%rsp) - je .LBL_2_2 - - xorb %dl, %dl - xorl %eax, %eax - kmovw %k4, 1048(%rsp) - kmovw %k5, 1040(%rsp) - kmovw %k6, 1032(%rsp) - kmovw %k7, 1024(%rsp) - vmovups %zmm16, 960(%rsp) - vmovups %zmm17, 896(%rsp) - vmovups %zmm18, 832(%rsp) - vmovups %zmm19, 768(%rsp) - vmovups %zmm20, 704(%rsp) - vmovups %zmm21, 640(%rsp) - vmovups %zmm22, 576(%rsp) - vmovups %zmm23, 512(%rsp) - vmovups %zmm24, 448(%rsp) - vmovups %zmm25, 384(%rsp) - vmovups %zmm26, 320(%rsp) - vmovups %zmm27, 256(%rsp) - vmovups %zmm28, 192(%rsp) - vmovups %zmm29, 128(%rsp) - vmovups %zmm30, 64(%rsp) - vmovups %zmm31, (%rsp) - movq %rsi, 1064(%rsp) - movq %rdi, 1056(%rsp) - movq %r12, 1096(%rsp) - cfi_offset_rel_rsp (12, 1096) - movb %dl, %r12b - movq %r13, 1088(%rsp) - cfi_offset_rel_rsp (13, 1088) - movl %ecx, %r13d - movq %r14, 1080(%rsp) - cfi_offset_rel_rsp (14, 1080) - movl %eax, %r14d - movq %r15, 1072(%rsp) - cfi_offset_rel_rsp (15, 1072) - cfi_remember_state - -.LBL_2_6: - btl %r14d, %r13d - jc .LBL_2_12 - -.LBL_2_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_2_10 - -.LBL_2_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_2_6 - - kmovw 1048(%rsp), %k4 - kmovw 1040(%rsp), %k5 - kmovw 1032(%rsp), %k6 - kmovw 
1024(%rsp), %k7 - vmovups 960(%rsp), %zmm16 - vmovups 896(%rsp), %zmm17 - vmovups 832(%rsp), %zmm18 - vmovups 768(%rsp), %zmm19 - vmovups 704(%rsp), %zmm20 - vmovups 640(%rsp), %zmm21 - vmovups 576(%rsp), %zmm22 - vmovups 512(%rsp), %zmm23 - vmovups 448(%rsp), %zmm24 - vmovups 384(%rsp), %zmm25 - vmovups 320(%rsp), %zmm26 - vmovups 256(%rsp), %zmm27 - vmovups 192(%rsp), %zmm28 - vmovups 128(%rsp), %zmm29 - vmovups 64(%rsp), %zmm30 - vmovups (%rsp), %zmm31 - vmovups 1216(%rsp), %zmm1 - movq 1064(%rsp), %rsi - movq 1056(%rsp), %rdi - movq 1096(%rsp), %r12 - cfi_restore (%r12) - movq 1088(%rsp), %r13 - cfi_restore (%r13) - movq 1080(%rsp), %r14 - cfi_restore (%r14) - movq 1072(%rsp), %r15 - cfi_restore (%r15) - jmp .LBL_2_2 - -.LBL_2_10: - cfi_restore_state - movzbl %r12b, %r15d - vmovss 1156(%rsp,%r15,8), %xmm0 - vzeroupper - vmovss 1156(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(sinf) - - vmovss %xmm0, 1220(%rsp,%r15,8) - jmp .LBL_2_8 - -.LBL_2_12: - movzbl %r12b, %r15d - vmovss 1152(%rsp,%r15,8), %xmm0 - vzeroupper - vmovss 1152(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(sinf) - - vmovss %xmm0, 1216(%rsp,%r15,8) - jmp .LBL_2_7 -#endif -END (_ZGVeN16v_sinf_skx) - - .section .rodata, "a" -.L_2il0floatpacket.11: - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff - .type .L_2il0floatpacket.11,@object diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S deleted file mode 100644 index 3556473899..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized sinf. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN4v_sinf) - .type _ZGVbN4v_sinf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN4v_sinf_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN4v_sinf_sse2(%rip), %rax - ret -END (_ZGVbN4v_sinf) -libmvec_hidden_def (_ZGVbN4v_sinf) - -#define _ZGVbN4v_sinf _ZGVbN4v_sinf_sse2 -#include "../svml_s_sinf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S deleted file mode 100644 index c690150964..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S +++ /dev/null @@ -1,224 +0,0 @@ -/* Function sinf vectorized with SSE4. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - - -#include <sysdep.h> -#include "svml_s_trig_data.h" - - .text -ENTRY(_ZGVbN4v_sinf_sse4) -/* - ALGORITHM DESCRIPTION: - - 1) Range reduction to [-Pi/2; +Pi/2] interval - a) Grab sign from source argument and save it. - b) Remove sign using AND operation - c) Getting octant Y by 1/Pi multiplication - d) Add "Right Shifter" value - e) Treat obtained value as integer for destination sign setting. - Shift first bit of this value to the last (sign) position - f) Change destination sign if source sign is negative - using XOR operation. - g) Subtract "Right Shifter" value - h) Subtract Y*PI from X argument, where PI divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; - 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) - a) Calculate X^2 = X * X - b) Calculate polynomial: - R = X + X * X^2 * (A3 + x^2 * (A5 + ...... - 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R = XOR( R, S ); - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $320, %rsp - movaps %xmm0, %xmm5 - movq __svml_s_trig_data@GOTPCREL(%rip), %rax - movups __sAbsMask(%rax), %xmm2 - -/* b) Remove sign using AND operation */ - movaps %xmm2, %xmm4 - -/* - f) Change destination sign if source sign is negative - using XOR operation. 
- */ - andnps %xmm5, %xmm2 - movups __sInvPI(%rax), %xmm1 - andps %xmm5, %xmm4 - -/* c) Getting octant Y by 1/Pi multiplication - d) Add "Right Shifter" value */ - mulps %xmm4, %xmm1 - -/* h) Subtract Y*PI from X argument, where PI divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4 */ - movaps %xmm4, %xmm0 - -/* Check for large and special values */ - cmpnleps __sRangeReductionVal(%rax), %xmm4 - movups __sRShifter(%rax), %xmm6 - movups __sPI1(%rax), %xmm7 - addps %xmm6, %xmm1 - movmskps %xmm4, %ecx - -/* e) Treat obtained value as integer for destination sign setting. - Shift first bit of this value to the last (sign) position */ - movaps %xmm1, %xmm3 - -/* g) Subtract "Right Shifter" value */ - subps %xmm6, %xmm1 - mulps %xmm1, %xmm7 - pslld $31, %xmm3 - movups __sPI2(%rax), %xmm6 - subps %xmm7, %xmm0 - mulps %xmm1, %xmm6 - movups __sPI3(%rax), %xmm7 - subps %xmm6, %xmm0 - mulps %xmm1, %xmm7 - movups __sPI4(%rax), %xmm6 - subps %xmm7, %xmm0 - mulps %xmm6, %xmm1 - subps %xmm1, %xmm0 - -/* 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) - a) Calculate X^2 = X * X - b) Calculate polynomial: - R = X + X * X^2 * (A3 + x^2 * (A5 + ...... 
*/ - movaps %xmm0, %xmm1 - mulps %xmm0, %xmm1 - xorps %xmm3, %xmm0 - movups __sA9(%rax), %xmm3 - mulps %xmm1, %xmm3 - addps __sA7(%rax), %xmm3 - mulps %xmm1, %xmm3 - addps __sA5(%rax), %xmm3 - mulps %xmm1, %xmm3 - addps __sA3(%rax), %xmm3 - mulps %xmm3, %xmm1 - mulps %xmm0, %xmm1 - addps %xmm1, %xmm0 - -/* 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R = XOR( R, S ); */ - xorps %xmm2, %xmm0 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - movups %xmm5, 192(%rsp) - movups %xmm0, 256(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - movups %xmm8, 112(%rsp) - movups %xmm9, 96(%rsp) - movups %xmm10, 80(%rsp) - movups %xmm11, 64(%rsp) - movups %xmm12, 48(%rsp) - movups %xmm13, 32(%rsp) - movups %xmm14, 16(%rsp) - movups %xmm15, (%rsp) - movq %rsi, 136(%rsp) - movq %rdi, 128(%rsp) - movq %r12, 168(%rsp) - cfi_offset_rel_rsp (12, 168) - movb %dl, %r12b - movq %r13, 160(%rsp) - cfi_offset_rel_rsp (13, 160) - movl %ecx, %r13d - movq %r14, 152(%rsp) - cfi_offset_rel_rsp (14, 152) - movl %eax, %r14d - movq %r15, 144(%rsp) - cfi_offset_rel_rsp (15, 144) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - movups 112(%rsp), %xmm8 - movups 96(%rsp), %xmm9 - movups 80(%rsp), %xmm10 - movups 64(%rsp), %xmm11 - movups 48(%rsp), %xmm12 - movups 32(%rsp), %xmm13 - movups 16(%rsp), %xmm14 - movups (%rsp), %xmm15 - movq 136(%rsp), %rsi - movq 128(%rsp), %rdi - movq 168(%rsp), %r12 - cfi_restore (%r12) - movq 160(%rsp), %r13 - cfi_restore (%r13) - movq 152(%rsp), %r14 - cfi_restore (%r14) - movq 144(%rsp), %r15 - cfi_restore (%r15) - movups 256(%rsp), %xmm0 - jmp .LBL_1_2 - -.LBL_1_10: - cfi_restore_state - movzbl 
%r12b, %r15d - movss 196(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(sinf) - - movss %xmm0, 260(%rsp,%r15,8) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - movss 192(%rsp,%r15,8), %xmm0 - - call JUMPTARGET(sinf) - - movss %xmm0, 256(%rsp,%r15,8) - jmp .LBL_1_7 - -END(_ZGVbN4v_sinf_sse4) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S deleted file mode 100644 index 674e88bd55..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized sinf, vector length is 8. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN8v_sinf) - .type _ZGVdN8v_sinf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX -1: leaq _ZGVdN8v_sinf_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN8v_sinf_sse_wrapper(%rip), %rax - ret -END (_ZGVdN8v_sinf) -libmvec_hidden_def (_ZGVdN8v_sinf) - -#define _ZGVdN8v_sinf _ZGVdN8v_sinf_sse_wrapper -#include "../svml_s_sinf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S deleted file mode 100644 index d34870fa3a..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S +++ /dev/null @@ -1,219 +0,0 @@ -/* Function sinf vectorized with AVX2. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "svml_s_trig_data.h" - - .text -ENTRY(_ZGVdN8v_sinf_avx2) -/* - ALGORITHM DESCRIPTION: - - 1) Range reduction to [-Pi/2; +Pi/2] interval - a) Grab sign from source argument and save it. - b) Remove sign using AND operation - c) Getting octant Y by 1/Pi multiplication - d) Add "Right Shifter" value - e) Treat obtained value as integer for destination sign setting. 
- Shift first bit of this value to the last (sign) position - f) Change destination sign if source sign is negative - using XOR operation. - g) Subtract "Right Shifter" value - h) Subtract Y*PI from X argument, where PI divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; - 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) - a) Calculate X^2 = X * X - b) Calculate polynomial: - R = X + X * X^2 * (A3 + x^2 * (A5 + ...... - 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R = XOR( R, S ); - */ - pushq %rbp - cfi_adjust_cfa_offset (8) - cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp - cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $448, %rsp - movq __svml_s_trig_data@GOTPCREL(%rip), %rax - vmovdqa %ymm0, %ymm5 - vmovups __sAbsMask(%rax), %ymm3 - vmovups __sInvPI(%rax), %ymm7 - vmovups __sRShifter(%rax), %ymm0 - vmovups __sPI1_FMA(%rax), %ymm1 - -/* b) Remove sign using AND operation */ - vandps %ymm3, %ymm5, %ymm4 - -/* - c) Getting octant Y by 1/Pi multiplication - d) Add "Right Shifter" value - */ - vfmadd213ps %ymm0, %ymm4, %ymm7 - -/* g) Subtract "Right Shifter" value */ - vsubps %ymm0, %ymm7, %ymm2 - -/* - e) Treat obtained value as integer for destination sign setting. - Shift first bit of this value to the last (sign) position - */ - vpslld $31, %ymm7, %ymm6 - -/* - h) Subtract Y*PI from X argument, where PI divided to 4 parts: - X = X - Y*PI1 - Y*PI2 - Y*PI3; - */ - vmovdqa %ymm4, %ymm0 - vfnmadd231ps %ymm2, %ymm1, %ymm0 - -/* Check for large and special values */ - vcmpnle_uqps __sRangeReductionVal(%rax), %ymm4, %ymm4 - vfnmadd231ps __sPI2_FMA(%rax), %ymm2, %ymm0 - vfnmadd132ps __sPI3_FMA(%rax), %ymm0, %ymm2 - -/* - 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) - a) Calculate X^2 = X * X - b) Calculate polynomial: - R = X + X * X^2 * (A3 + x^2 * (A5 + ...... - */ - vmulps %ymm2, %ymm2, %ymm1 - -/* - f) Change destination sign if source sign is negative - using XOR operation. 
- */ - vandnps %ymm5, %ymm3, %ymm0 - vxorps %ymm6, %ymm2, %ymm3 - vmovups __sA9(%rax), %ymm2 - vfmadd213ps __sA7(%rax), %ymm1, %ymm2 - vfmadd213ps __sA5(%rax), %ymm1, %ymm2 - vfmadd213ps __sA3(%rax), %ymm1, %ymm2 - vmulps %ymm1, %ymm2, %ymm6 - vfmadd213ps %ymm3, %ymm3, %ymm6 - vmovmskps %ymm4, %ecx - -/* - 3) Destination sign setting - a) Set shifted destination sign using XOR operation: - R = XOR( R, S ); - */ - vxorps %ymm0, %ymm6, %ymm0 - testl %ecx, %ecx - jne .LBL_1_3 - -.LBL_1_2: - cfi_remember_state - movq %rbp, %rsp - cfi_def_cfa_register (%rsp) - popq %rbp - cfi_adjust_cfa_offset (-8) - cfi_restore (%rbp) - ret - -.LBL_1_3: - cfi_restore_state - vmovups %ymm5, 320(%rsp) - vmovups %ymm0, 384(%rsp) - je .LBL_1_2 - - xorb %dl, %dl - xorl %eax, %eax - vmovups %ymm8, 224(%rsp) - vmovups %ymm9, 192(%rsp) - vmovups %ymm10, 160(%rsp) - vmovups %ymm11, 128(%rsp) - vmovups %ymm12, 96(%rsp) - vmovups %ymm13, 64(%rsp) - vmovups %ymm14, 32(%rsp) - vmovups %ymm15, (%rsp) - movq %rsi, 264(%rsp) - movq %rdi, 256(%rsp) - movq %r12, 296(%rsp) - cfi_offset_rel_rsp (12, 296) - movb %dl, %r12b - movq %r13, 288(%rsp) - cfi_offset_rel_rsp (13, 288) - movl %ecx, %r13d - movq %r14, 280(%rsp) - cfi_offset_rel_rsp (14, 280) - movl %eax, %r14d - movq %r15, 272(%rsp) - cfi_offset_rel_rsp (15, 272) - cfi_remember_state - -.LBL_1_6: - btl %r14d, %r13d - jc .LBL_1_12 - -.LBL_1_7: - lea 1(%r14), %esi - btl %esi, %r13d - jc .LBL_1_10 - -.LBL_1_8: - incb %r12b - addl $2, %r14d - cmpb $16, %r12b - jb .LBL_1_6 - - vmovups 224(%rsp), %ymm8 - vmovups 192(%rsp), %ymm9 - vmovups 160(%rsp), %ymm10 - vmovups 128(%rsp), %ymm11 - vmovups 96(%rsp), %ymm12 - vmovups 64(%rsp), %ymm13 - vmovups 32(%rsp), %ymm14 - vmovups (%rsp), %ymm15 - vmovups 384(%rsp), %ymm0 - movq 264(%rsp), %rsi - movq 256(%rsp), %rdi - movq 296(%rsp), %r12 - cfi_restore (%r12) - movq 288(%rsp), %r13 - cfi_restore (%r13) - movq 280(%rsp), %r14 - cfi_restore (%r14) - movq 272(%rsp), %r15 - cfi_restore (%r15) - jmp .LBL_1_2 - 
-.LBL_1_10: - cfi_restore_state - movzbl %r12b, %r15d - vmovss 324(%rsp,%r15,8), %xmm0 - vzeroupper - - call JUMPTARGET(sinf) - - vmovss %xmm0, 388(%rsp,%r15,8) - jmp .LBL_1_8 - -.LBL_1_12: - movzbl %r12b, %r15d - vmovss 320(%rsp,%r15,8), %xmm0 - vzeroupper - - call JUMPTARGET(sinf) - - vmovss %xmm0, 384(%rsp,%r15,8) - jmp .LBL_1_7 - -END(_ZGVdN8v_sinf_avx2) |