Diffstat (limited to 'REORG.TODO/sysdeps/x86_64/fpu/multiarch')
130 files changed, 15286 insertions, 0 deletions
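Note on the pattern used throughout the diff below: each optimized libm routine is produced by compiling the generic ieee754 source an extra time per ISA, with #define renames (e.g. __ieee754_exp -> __ieee754_exp_fma4) and a SECTION attribute that places the code in .text.fma4 or .text.avx, while a small dispatcher translation unit declares the variants and binds the exported symbol at load time through libm_ifunc and HAS_ARCH_FEATURE checks, i.e. an ELF IFUNC resolver. The sketch that follows illustrates that dispatch mechanism with plain GCC facilities only; the names my_exp and resolve_my_exp and the stub variant bodies are invented for illustration and are not glibc's actual identifiers or implementation.

/* Simplified sketch of the IFUNC dispatch used by the files below.
   All names here (my_exp, resolve_my_exp, the variant bodies) are
   illustrative stand-ins, not glibc's real code.  */

#include <math.h>

/* In glibc each variant is the same generic source compiled again with
   -mfma4 / -msse2avx and #define renames; here they are trivial stubs
   that just defer to libm's exp.  */
static double my_exp_sse2 (double x) { return exp (x); }
static double my_exp_avx  (double x) { return exp (x); }
static double my_exp_fma4 (double x) { return exp (x); }

/* The resolver is run by the dynamic linker when the symbol is first
   bound; it must not assume ordinary constructors have run yet, hence
   the explicit __builtin_cpu_init call before querying CPU features.  */
static double (*resolve_my_exp (void)) (double)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("fma4"))
    return my_exp_fma4;
  if (__builtin_cpu_supports ("avx"))
    return my_exp_avx;
  return my_exp_sse2;
}

/* GCC's ifunc attribute ties the public symbol to the resolver; glibc's
   libm_ifunc macro expands to the equivalent construct.  */
double my_exp (double x) __attribute__ ((ifunc ("resolve_my_exp")));

With this arrangement a caller reaches my_exp through the PLT, the resolver runs once when the symbol is first bound, and every later call goes straight to the FMA4, AVX, or SSE2 body. That is the shape of the e_exp.c, e_log.c, e_atan2.c, s_atan.c, s_sin.c and s_tan.c dispatchers in this patch; the s_ceil.S, s_floor.S, s_rint.S and s_nearbyint.S files implement the same resolver directly in assembly, selecting an SSE4.1 roundsd/roundss body when HAS_CPU_FEATURE (SSE4_1) is set and falling back to the C implementation otherwise.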
diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/Makefile b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/Makefile new file mode 100644 index 0000000000..34542155aa --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/Makefile @@ -0,0 +1,70 @@ +ifeq ($(subdir),math) +libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \ + s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c + +libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \ + e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \ + mplog-fma4 mpa-fma4 slowexp-fma4 slowpow-fma4 \ + sincos32-fma4 doasin-fma4 dosincos-fma4 \ + halfulp-fma4 mpexp-fma4 \ + mpatan2-fma4 mpatan-fma4 mpsqrt-fma4 mptan-fma4 + +CFLAGS-doasin-fma4.c = -mfma4 +CFLAGS-dosincos-fma4.c = -mfma4 +CFLAGS-e_asin-fma4.c = -mfma4 +CFLAGS-e_atan2-fma4.c = -mfma4 +CFLAGS-e_exp-fma4.c = -mfma4 +CFLAGS-e_log-fma4.c = -mfma4 +CFLAGS-e_pow-fma4.c = -mfma4 $(config-cflags-nofma) +CFLAGS-halfulp-fma4.c = -mfma4 +CFLAGS-mpa-fma4.c = -mfma4 +CFLAGS-mpatan-fma4.c = -mfma4 +CFLAGS-mpatan2-fma4.c = -mfma4 +CFLAGS-mpexp-fma4.c = -mfma4 +CFLAGS-mplog-fma4.c = -mfma4 +CFLAGS-mpsqrt-fma4.c = -mfma4 +CFLAGS-mptan-fma4.c = -mfma4 +CFLAGS-s_atan-fma4.c = -mfma4 +CFLAGS-sincos32-fma4.c = -mfma4 +CFLAGS-slowexp-fma4.c = -mfma4 +CFLAGS-slowpow-fma4.c = -mfma4 +CFLAGS-s_sin-fma4.c = -mfma4 +CFLAGS-s_tan-fma4.c = -mfma4 + +libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \ + e_atan2-avx s_sin-avx s_tan-avx \ + mplog-avx mpa-avx slowexp-avx \ + mpexp-avx + +CFLAGS-e_atan2-avx.c = -msse2avx -DSSE2AVX +CFLAGS-e_exp-avx.c = -msse2avx -DSSE2AVX +CFLAGS-e_log-avx.c = -msse2avx -DSSE2AVX +CFLAGS-mpa-avx.c = -msse2avx -DSSE2AVX +CFLAGS-mpexp-avx.c = -msse2avx -DSSE2AVX +CFLAGS-mplog-avx.c = -msse2avx -DSSE2AVX +CFLAGS-s_atan-avx.c = -msse2avx -DSSE2AVX +CFLAGS-s_sin-avx.c = -msse2avx -DSSE2AVX +CFLAGS-slowexp-avx.c = -msse2avx -DSSE2AVX +CFLAGS-s_tan-avx.c = -msse2avx -DSSE2AVX +endif + +ifeq ($(subdir),mathvec) +libmvec-sysdep_routines += svml_d_cos2_core_sse4 svml_d_cos4_core_avx2 \ + svml_d_cos8_core_avx512 svml_d_sin2_core_sse4 \ + svml_d_sin4_core_avx2 svml_d_sin8_core_avx512 \ + svml_d_log2_core_sse4 svml_d_log4_core_avx2 \ + svml_d_log8_core_avx512 svml_d_sincos2_core_sse4 \ + svml_d_sincos4_core_avx2 svml_d_sincos8_core_avx512 \ + svml_s_cosf4_core_sse4 svml_s_cosf8_core_avx2 \ + svml_s_cosf16_core_avx512 svml_s_sinf4_core_sse4 \ + svml_s_sinf8_core_avx2 svml_s_sinf16_core_avx512 \ + svml_s_logf4_core_sse4 svml_s_logf8_core_avx2 \ + svml_s_logf16_core_avx512 svml_d_exp2_core_sse4 \ + svml_d_exp4_core_avx2 svml_d_exp8_core_avx512 \ + svml_s_expf4_core_sse4 svml_s_expf8_core_avx2 \ + svml_s_expf16_core_avx512 svml_d_pow2_core_sse4 \ + svml_d_pow4_core_avx2 svml_d_pow8_core_avx512 \ + svml_s_powf4_core_sse4 svml_s_powf8_core_avx2 \ + svml_s_powf16_core_avx512 svml_s_sincosf4_core_sse4 \ + svml_s_sincosf8_core_avx2 svml_s_sincosf16_core_avx512 +endif diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/doasin-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/doasin-fma4.c new file mode 100644 index 0000000000..53eb419472 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/doasin-fma4.c @@ -0,0 +1,4 @@ +#define __doasin __doasin_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/doasin.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/dosincos-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/dosincos-fma4.c new file mode 100644 index 0000000000..1578b2fce0 --- /dev/null +++ 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/dosincos-fma4.c @@ -0,0 +1,6 @@ +#define __docos __docos_fma4 +#define __dubcos __dubcos_fma4 +#define __dubsin __dubsin_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/dosincos.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_asin-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_asin-fma4.c new file mode 100644 index 0000000000..2657c31f49 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_asin-fma4.c @@ -0,0 +1,11 @@ +#define __ieee754_acos __ieee754_acos_fma4 +#define __ieee754_asin __ieee754_asin_fma4 +#define __cos32 __cos32_fma4 +#define __doasin __doasin_fma4 +#define __docos __docos_fma4 +#define __dubcos __dubcos_fma4 +#define __dubsin __dubsin_fma4 +#define __sin32 __sin32_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/e_asin.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_asin.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_asin.c new file mode 100644 index 0000000000..111a5b99bd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_asin.c @@ -0,0 +1,26 @@ +#include <init-arch.h> +#include <math.h> +#include <math_private.h> + +extern double __ieee754_acos_sse2 (double); +extern double __ieee754_asin_sse2 (double); +extern double __ieee754_acos_fma4 (double); +extern double __ieee754_asin_fma4 (double); + +libm_ifunc (__ieee754_acos, + HAS_ARCH_FEATURE (FMA4_Usable) + ? __ieee754_acos_fma4 + : __ieee754_acos_sse2); +strong_alias (__ieee754_acos, __acos_finite) + +libm_ifunc (__ieee754_asin, + HAS_ARCH_FEATURE (FMA4_Usable) + ? __ieee754_asin_fma4 + : __ieee754_asin_sse2); +strong_alias (__ieee754_asin, __asin_finite) + +#define __ieee754_acos __ieee754_acos_sse2 +#define __ieee754_asin __ieee754_asin_sse2 + + +#include <sysdeps/ieee754/dbl-64/e_asin.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2-avx.c new file mode 100644 index 0000000000..3012afac37 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2-avx.c @@ -0,0 +1,9 @@ +#define __ieee754_atan2 __ieee754_atan2_avx +#define __add __add_avx +#define __dbl_mp __dbl_mp_avx +#define __dvd __dvd_avx +#define __mul __mul_avx +#define __sub __sub_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/e_atan2.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2-fma4.c new file mode 100644 index 0000000000..f4e986293e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2-fma4.c @@ -0,0 +1,10 @@ +#define __ieee754_atan2 __ieee754_atan2_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __dvd __dvd_fma4 +#define __mpatan2 __mpatan2_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/e_atan2.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2.c new file mode 100644 index 0000000000..9ca3c02a44 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_atan2.c @@ -0,0 +1,18 @@ +#include <init-arch.h> +#include <math.h> +#include <math_private.h> + +extern double __ieee754_atan2_sse2 (double, double); +extern double __ieee754_atan2_avx (double, double); +extern double __ieee754_atan2_fma4 (double, double); + +libm_ifunc (__ieee754_atan2, + HAS_ARCH_FEATURE 
(FMA4_Usable) ? __ieee754_atan2_fma4 + : (HAS_ARCH_FEATURE (AVX_Usable) + ? __ieee754_atan2_avx : __ieee754_atan2_sse2)); +strong_alias (__ieee754_atan2, __atan2_finite) + +#define __ieee754_atan2 __ieee754_atan2_sse2 + + +#include <sysdeps/ieee754/dbl-64/e_atan2.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp-avx.c new file mode 100644 index 0000000000..ee5dd6d2dc --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp-avx.c @@ -0,0 +1,6 @@ +#define __ieee754_exp __ieee754_exp_avx +#define __exp1 __exp1_avx +#define __slowexp __slowexp_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/e_exp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c new file mode 100644 index 0000000000..ae6eb67603 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c @@ -0,0 +1,6 @@ +#define __ieee754_exp __ieee754_exp_fma4 +#define __exp1 __exp1_fma4 +#define __slowexp __slowexp_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/e_exp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp.c new file mode 100644 index 0000000000..b7d7b5ff27 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_exp.c @@ -0,0 +1,18 @@ +#include <init-arch.h> +#include <math.h> +#include <math_private.h> + +extern double __ieee754_exp_sse2 (double); +extern double __ieee754_exp_avx (double); +extern double __ieee754_exp_fma4 (double); + +libm_ifunc (__ieee754_exp, + HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_exp_fma4 + : (HAS_ARCH_FEATURE (AVX_Usable) + ? __ieee754_exp_avx : __ieee754_exp_sse2)); +strong_alias (__ieee754_exp, __exp_finite) + +#define __ieee754_exp __ieee754_exp_sse2 + + +#include <sysdeps/ieee754/dbl-64/e_exp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log-avx.c new file mode 100644 index 0000000000..c669019bc2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log-avx.c @@ -0,0 +1,8 @@ +#define __ieee754_log __ieee754_log_avx +#define __mplog __mplog_avx +#define __add __add_avx +#define __dbl_mp __dbl_mp_avx +#define __sub __sub_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/e_log.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c new file mode 100644 index 0000000000..a2346cc618 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c @@ -0,0 +1,8 @@ +#define __ieee754_log __ieee754_log_fma4 +#define __mplog __mplog_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/e_log.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log.c new file mode 100644 index 0000000000..cf9533d6c0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_log.c @@ -0,0 +1,18 @@ +#include <init-arch.h> +#include <math.h> +#include <math_private.h> + +extern double __ieee754_log_sse2 (double); +extern double __ieee754_log_avx (double); +extern double __ieee754_log_fma4 (double); + +libm_ifunc (__ieee754_log, + HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_log_fma4 + : (HAS_ARCH_FEATURE (AVX_Usable) + ? 
__ieee754_log_avx : __ieee754_log_sse2)); +strong_alias (__ieee754_log, __log_finite) + +#define __ieee754_log __ieee754_log_sse2 + + +#include <sysdeps/ieee754/dbl-64/e_log.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c new file mode 100644 index 0000000000..5b3ea8e103 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c @@ -0,0 +1,6 @@ +#define __ieee754_pow __ieee754_pow_fma4 +#define __exp1 __exp1_fma4 +#define __slowpow __slowpow_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/e_pow.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_pow.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_pow.c new file mode 100644 index 0000000000..a5c5d89c3e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/e_pow.c @@ -0,0 +1,17 @@ +#include <init-arch.h> +#include <math.h> +#include <math_private.h> + +extern double __ieee754_pow_sse2 (double, double); +extern double __ieee754_pow_fma4 (double, double); + +libm_ifunc (__ieee754_pow, + HAS_ARCH_FEATURE (FMA4_Usable) + ? __ieee754_pow_fma4 + : __ieee754_pow_sse2); +strong_alias (__ieee754_pow, __pow_finite) + +#define __ieee754_pow __ieee754_pow_sse2 + + +#include <sysdeps/ieee754/dbl-64/e_pow.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c new file mode 100644 index 0000000000..a00c17c016 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c @@ -0,0 +1,4 @@ +#define __halfulp __halfulp_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/halfulp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpa-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpa-avx.c new file mode 100644 index 0000000000..366b0b7134 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpa-avx.c @@ -0,0 +1,14 @@ +#define __add __add_avx +#define __mul __mul_avx +#define __sqr __sqr_avx +#define __sub __sub_avx +#define __dbl_mp __dbl_mp_avx +#define __dvd __dvd_avx + +#define NO___CPY 1 +#define NO___MP_DBL 1 +#define NO___ACR 1 +#define NO__CONST 1 +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/mpa.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c new file mode 100644 index 0000000000..a4a759407e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c @@ -0,0 +1,14 @@ +#define __add __add_fma4 +#define __mul __mul_fma4 +#define __sqr __sqr_fma4 +#define __sub __sub_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __dvd __dvd_fma4 + +#define NO___CPY 1 +#define NO___MP_DBL 1 +#define NO___ACR 1 +#define NO__CONST 1 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mpa.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpatan-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpatan-fma4.c new file mode 100644 index 0000000000..fbd3bd49a2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpatan-fma4.c @@ -0,0 +1,10 @@ +#define __mpatan __mpatan_fma4 +#define __add __add_fma4 +#define __dvd __dvd_fma4 +#define __mpsqrt __mpsqrt_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define AVOID_MPATAN_H 1 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mpatan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpatan2-fma4.c 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpatan2-fma4.c new file mode 100644 index 0000000000..e6e44d49b0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpatan2-fma4.c @@ -0,0 +1,9 @@ +#define __mpatan2 __mpatan2_fma4 +#define __add __add_fma4 +#define __dvd __dvd_fma4 +#define __mpatan __mpatan_fma4 +#define __mpsqrt __mpsqrt_fma4 +#define __mul __mul_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mpatan2.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpexp-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpexp-avx.c new file mode 100644 index 0000000000..87f29c96c9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpexp-avx.c @@ -0,0 +1,9 @@ +#define __mpexp __mpexp_avx +#define __add __add_avx +#define __dbl_mp __dbl_mp_avx +#define __dvd __dvd_avx +#define __mul __mul_avx +#define AVOID_MPEXP_H 1 +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/mpexp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c new file mode 100644 index 0000000000..07ca6e9ad0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c @@ -0,0 +1,9 @@ +#define __mpexp __mpexp_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __dvd __dvd_fma4 +#define __mul __mul_fma4 +#define AVOID_MPEXP_H 1 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mpexp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mplog-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mplog-avx.c new file mode 100644 index 0000000000..fd783d9a67 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mplog-avx.c @@ -0,0 +1,8 @@ +#define __mplog __mplog_avx +#define __add __add_avx +#define __mpexp __mpexp_avx +#define __mul __mul_avx +#define __sub __sub_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/mplog.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mplog-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mplog-fma4.c new file mode 100644 index 0000000000..b4733118d7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mplog-fma4.c @@ -0,0 +1,8 @@ +#define __mplog __mplog_fma4 +#define __add __add_fma4 +#define __mpexp __mpexp_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mplog.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpsqrt-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpsqrt-fma4.c new file mode 100644 index 0000000000..f8a1ba2d92 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mpsqrt-fma4.c @@ -0,0 +1,8 @@ +#define __mpsqrt __mpsqrt_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define AVOID_MPSQRT_H 1 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mpsqrt.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mptan-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mptan-fma4.c new file mode 100644 index 0000000000..fb4a9d48ca --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/mptan-fma4.c @@ -0,0 +1,7 @@ +#define __mptan __mptan_fma4 +#define __c32 __c32_fma4 +#define __dvd __dvd_fma4 +#define __mpranred __mpranred_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/mptan.c> diff --git 
a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan-avx.c new file mode 100644 index 0000000000..b5cb9c3a75 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan-avx.c @@ -0,0 +1,8 @@ +#define atan __atan_avx +#define __add __add_avx +#define __dbl_mp __dbl_mp_avx +#define __mul __mul_avx +#define __sub __sub_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/s_atan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c new file mode 100644 index 0000000000..9e83e6cdab --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c @@ -0,0 +1,9 @@ +#define atan __atan_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mpatan __mpatan_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/s_atan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan.c new file mode 100644 index 0000000000..742e95cb96 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_atan.c @@ -0,0 +1,15 @@ +#include <init-arch.h> +#include <math.h> + +extern double __atan_sse2 (double); +extern double __atan_avx (double); +extern double __atan_fma4 (double); + +libm_ifunc (atan, (HAS_ARCH_FEATURE (FMA4_Usable) ? __atan_fma4 : + HAS_ARCH_FEATURE (AVX_Usable) + ? __atan_avx : __atan_sse2)); + +#define atan __atan_sse2 + + +#include <sysdeps/ieee754/dbl-64/s_atan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceil-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceil-c.c new file mode 100644 index 0000000000..6a5ea3ff27 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceil-c.c @@ -0,0 +1,2 @@ +#define __ceil __ceil_c +#include <sysdeps/ieee754/dbl-64/wordsize-64/s_ceil.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceil.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceil.S new file mode 100644 index 0000000000..f8eef43eff --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceil.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__ceil) + .type __ceil, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __ceil_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __ceil_c(%rip), %rax +2: ret +END(__ceil) +weak_alias (__ceil, ceil) + + +ENTRY(__ceil_sse41) + roundsd $10, %xmm0, %xmm0 + ret +END(__ceil_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceilf-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceilf-c.c new file mode 100644 index 0000000000..229a6273b2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceilf-c.c @@ -0,0 +1,2 @@ +#define __ceilf __ceilf_c +#include <sysdeps/ieee754/flt-32/s_ceilf.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceilf.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceilf.S new file mode 100644 index 0000000000..076f10f0f0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_ceilf.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__ceilf) + .type __ceilf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __ceilf_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __ceilf_c(%rip), %rax +2: ret +END(__ceilf) +weak_alias (__ceilf, ceilf) + + +ENTRY(__ceilf_sse41) + roundss $10, %xmm0, %xmm0 + ret +END(__ceilf_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floor-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floor-c.c new file mode 100644 index 0000000000..68733b69ef --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floor-c.c @@ -0,0 +1,3 @@ +#undef __floor +#define __floor __floor_c +#include <sysdeps/ieee754/dbl-64/wordsize-64/s_floor.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floor.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floor.S new file mode 100644 index 0000000000..f519ab24f4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floor.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__floor) + .type __floor, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __floor_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __floor_c(%rip), %rax +2: ret +END(__floor) +weak_alias (__floor, floor) + + +ENTRY(__floor_sse41) + roundsd $9, %xmm0, %xmm0 + ret +END(__floor_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floorf-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floorf-c.c new file mode 100644 index 0000000000..2386362328 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floorf-c.c @@ -0,0 +1,3 @@ +#undef __floorf +#define __floorf __floorf_c +#include <sysdeps/ieee754/flt-32/s_floorf.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floorf.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floorf.S new file mode 100644 index 0000000000..8613f73acc --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_floorf.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__floorf) + .type __floorf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __floorf_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __floorf_c(%rip), %rax +2: ret +END(__floorf) +weak_alias (__floorf, floorf) + + +ENTRY(__floorf_sse41) + roundss $9, %xmm0, %xmm0 + ret +END(__floorf_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_fma.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_fma.c new file mode 100644 index 0000000000..3ac4fed660 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_fma.c @@ -0,0 +1,50 @@ +/* FMA version of fma. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <config.h> +#include <math.h> +#include <init-arch.h> + +extern double __fma_sse2 (double x, double y, double z) attribute_hidden; + + +static double +__fma_fma3 (double x, double y, double z) +{ + asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} + + +static double +__fma_fma4 (double x, double y, double z) +{ + asm ("vfmaddsd %3, %2, %1, %0" : "=x" (x) : "x" (x), "x" (y), "x" (z)); + return x; +} + + +libm_ifunc (__fma, HAS_ARCH_FEATURE (FMA_Usable) + ? __fma_fma3 : (HAS_ARCH_FEATURE (FMA4_Usable) + ? __fma_fma4 : __fma_sse2)); +weak_alias (__fma, fma) + +#define __fma __fma_sse2 + +#include <sysdeps/ieee754/dbl-64/s_fma.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_fmaf.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_fmaf.c new file mode 100644 index 0000000000..1ae227c1d4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_fmaf.c @@ -0,0 +1,49 @@ +/* FMA version of fmaf. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> +#include <math.h> +#include <init-arch.h> + +extern float __fmaf_sse2 (float x, float y, float z) attribute_hidden; + + +static float +__fmaf_fma3 (float x, float y, float z) +{ + asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} + + +static float +__fmaf_fma4 (float x, float y, float z) +{ + asm ("vfmaddss %3, %2, %1, %0" : "=x" (x) : "x" (x), "x" (y), "x" (z)); + return x; +} + + +libm_ifunc (__fmaf, HAS_ARCH_FEATURE (FMA_Usable) + ? __fmaf_fma3 : (HAS_ARCH_FEATURE (FMA4_Usable) + ? __fmaf_fma4 : __fmaf_sse2)); +weak_alias (__fmaf, fmaf) + +#define __fmaf __fmaf_sse2 + +#include <sysdeps/ieee754/dbl-64/s_fmaf.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyint-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyint-c.c new file mode 100644 index 0000000000..f897a2a6a6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyint-c.c @@ -0,0 +1,3 @@ +#undef __nearbyint +#define __nearbyint __nearbyint_c +#include <sysdeps/ieee754/dbl-64/wordsize-64/s_nearbyint.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S new file mode 100644 index 0000000000..5a734f6027 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__nearbyint) + .type __nearbyint, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __nearbyint_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __nearbyint_c(%rip), %rax +2: ret +END(__nearbyint) +weak_alias (__nearbyint, nearbyint) + + +ENTRY(__nearbyint_sse41) + roundsd $0xc, %xmm0, %xmm0 + ret +END(__nearbyint_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyintf-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyintf-c.c new file mode 100644 index 0000000000..aa7768233b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyintf-c.c @@ -0,0 +1,3 @@ +#undef __nearbyintf +#define __nearbyintf __nearbyintf_c +#include <sysdeps/ieee754/flt-32/s_nearbyintf.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S new file mode 100644 index 0000000000..ad79fd6021 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__nearbyintf) + .type __nearbyintf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __nearbyintf_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __nearbyintf_c(%rip), %rax +2: ret +END(__nearbyintf) +weak_alias (__nearbyintf, nearbyintf) + + +ENTRY(__nearbyintf_sse41) + roundss $0xc, %xmm0, %xmm0 + ret +END(__nearbyintf_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rint-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rint-c.c new file mode 100644 index 0000000000..162a630ff9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rint-c.c @@ -0,0 +1,3 @@ +#undef __rint +#define __rint __rint_c +#include <sysdeps/ieee754/dbl-64/wordsize-64/s_rint.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rint.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rint.S new file mode 100644 index 0000000000..4f628a93a4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rint.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__rint) + .type __rint, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __rint_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __rint_c(%rip), %rax +2: ret +END(__rint) +weak_alias (__rint, rint) + + +ENTRY(__rint_sse41) + roundsd $4, %xmm0, %xmm0 + ret +END(__rint_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rintf-c.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rintf-c.c new file mode 100644 index 0000000000..8505249f34 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rintf-c.c @@ -0,0 +1,3 @@ +#undef __rintf +#define __rintf __rintf_c +#include <sysdeps/ieee754/flt-32/s_rintf.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rintf.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rintf.S new file mode 100644 index 0000000000..dee4ad794c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_rintf.S @@ -0,0 +1,38 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <machine/asm.h> +#include <init-arch.h> + + +ENTRY(__rintf) + .type __rintf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __rintf_sse41(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jnz 2f + leaq __rintf_c(%rip), %rax +2: ret +END(__rintf) +weak_alias (__rintf, rintf) + + +ENTRY(__rintf_sse41) + roundss $4, %xmm0, %xmm0 + ret +END(__rintf_sse41) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin-avx.c new file mode 100644 index 0000000000..e1c6de0259 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin-avx.c @@ -0,0 +1,5 @@ +#define __cos __cos_avx +#define __sin __sin_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/s_sin.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin-fma4.c new file mode 100644 index 0000000000..4c35739dc9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin-fma4.c @@ -0,0 +1,11 @@ +#define __cos __cos_fma4 +#define __sin __sin_fma4 +#define __docos __docos_fma4 +#define __dubsin __dubsin_fma4 +#define __mpcos __mpcos_fma4 +#define __mpcos1 __mpcos1_fma4 +#define __mpsin __mpsin_fma4 +#define __mpsin1 __mpsin1_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/s_sin.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin.c new file mode 100644 index 0000000000..8ffd3e7125 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_sin.c @@ -0,0 +1,26 @@ +#include <init-arch.h> +#include <math.h> +#undef NAN + +extern double __cos_sse2 (double); +extern double __sin_sse2 (double); +extern double __cos_avx (double); +extern double __sin_avx (double); +extern double __cos_fma4 (double); +extern double __sin_fma4 (double); + +libm_ifunc (__cos, (HAS_ARCH_FEATURE (FMA4_Usable) ? __cos_fma4 : + HAS_ARCH_FEATURE (AVX_Usable) + ? __cos_avx : __cos_sse2)); +weak_alias (__cos, cos) + +libm_ifunc (__sin, (HAS_ARCH_FEATURE (FMA4_Usable) ? __sin_fma4 : + HAS_ARCH_FEATURE (AVX_Usable) + ? 
__sin_avx : __sin_sse2)); +weak_alias (__sin, sin) + +#define __cos __cos_sse2 +#define __sin __sin_sse2 + + +#include <sysdeps/ieee754/dbl-64/s_sin.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan-avx.c new file mode 100644 index 0000000000..53de5d3c98 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan-avx.c @@ -0,0 +1,6 @@ +#define tan __tan_avx +#define __dbl_mp __dbl_mp_avx +#define __sub __sub_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/s_tan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c new file mode 100644 index 0000000000..a805440b46 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c @@ -0,0 +1,8 @@ +#define tan __tan_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mpranred __mpranred_fma4 +#define __mptan __mptan_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/s_tan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan.c new file mode 100644 index 0000000000..25f3bca07e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/s_tan.c @@ -0,0 +1,15 @@ +#include <init-arch.h> +#include <math.h> + +extern double __tan_sse2 (double); +extern double __tan_avx (double); +extern double __tan_fma4 (double); + +libm_ifunc (tan, (HAS_ARCH_FEATURE (FMA4_Usable) ? __tan_fma4 : + HAS_ARCH_FEATURE (AVX_Usable) + ? __tan_avx : __tan_sse2)); + +#define tan __tan_sse2 + + +#include <sysdeps/ieee754/dbl-64/s_tan.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/sincos32-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/sincos32-fma4.c new file mode 100644 index 0000000000..ebbfa18cca --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/sincos32-fma4.c @@ -0,0 +1,15 @@ +#define __cos32 __cos32_fma4 +#define __sin32 __sin32_fma4 +#define __c32 __c32_fma4 +#define __mpsin __mpsin_fma4 +#define __mpsin1 __mpsin1_fma4 +#define __mpcos __mpcos_fma4 +#define __mpcos1 __mpcos1_fma4 +#define __mpranred __mpranred_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/sincos32.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowexp-avx.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowexp-avx.c new file mode 100644 index 0000000000..d01c6d71a4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowexp-avx.c @@ -0,0 +1,9 @@ +#define __slowexp __slowexp_avx +#define __add __add_avx +#define __dbl_mp __dbl_mp_avx +#define __mpexp __mpexp_avx +#define __mul __mul_avx +#define __sub __sub_avx +#define SECTION __attribute__ ((section (".text.avx"))) + +#include <sysdeps/ieee754/dbl-64/slowexp.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c new file mode 100644 index 0000000000..3bcde84233 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c @@ -0,0 +1,9 @@ +#define __slowexp __slowexp_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mpexp __mpexp_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/slowexp.c> diff --git 
a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c new file mode 100644 index 0000000000..69d69823bb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c @@ -0,0 +1,11 @@ +#define __slowpow __slowpow_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mpexp __mpexp_fma4 +#define __mplog __mplog_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define __halfulp __halfulp_fma4 +#define SECTION __attribute__ ((section (".text.fma4"))) + +#include <sysdeps/ieee754/dbl-64/slowpow.c> diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S new file mode 100644 index 0000000000..b209492442 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized cos, vector length is 2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2v_cos) + .type _ZGVbN2v_cos, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN2v_cos_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN2v_cos_sse2(%rip), %rax + ret +END (_ZGVbN2v_cos) +libmvec_hidden_def (_ZGVbN2v_cos) + +#define _ZGVbN2v_cos _ZGVbN2v_cos_sse2 +#include "../svml_d_cos2_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S new file mode 100644 index 0000000000..858dc6532f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S @@ -0,0 +1,223 @@ +/* Function cos vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_d_trig_data.h" + + .text +ENTRY (_ZGVbN2v_cos_sse4) +/* ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg + Pi/2 = (N*Pi + R) + + Result calculation: + cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm3 + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + movups __dHalfPI(%rax), %xmm2 + +/* ARGUMENT RANGE REDUCTION: + Add Pi/2 to argument: X' = X+Pi/2 + */ + addpd %xmm3, %xmm2 + movups __dInvPI(%rax), %xmm5 + movups __dAbsMask(%rax), %xmm4 + +/* Get absolute argument value: X' = |X'| */ + andps %xmm2, %xmm4 + +/* Y = X'*InvPi + RS : right shifter add */ + mulpd %xmm5, %xmm2 + +/* Check for large arguments path */ + cmpnlepd __dRangeVal(%rax), %xmm4 + movups __dRShifter(%rax), %xmm6 + addpd %xmm6, %xmm2 + movmskpd %xmm4, %ecx + +/* N = Y - RS : right shifter sub */ + movaps %xmm2, %xmm1 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + psllq $63, %xmm2 + subpd %xmm6, %xmm1 + +/* N = N - 0.5 */ + subpd __dOneHalf(%rax), %xmm1 + movups __dPI1(%rax), %xmm7 + +/* R = X - N*Pi1 */ + mulpd %xmm1, %xmm7 + movups __dPI2(%rax), %xmm4 + +/* R = R - N*Pi2 */ + mulpd %xmm1, %xmm4 + subpd %xmm7, %xmm0 + movups __dPI3(%rax), %xmm5 + +/* R = R - N*Pi3 */ + mulpd %xmm1, %xmm5 + subpd %xmm4, %xmm0 + +/* R = R - N*Pi4 */ + movups __dPI4(%rax), %xmm6 + mulpd %xmm6, %xmm1 + subpd %xmm5, %xmm0 + subpd %xmm1, %xmm0 + +/* POLYNOMIAL APPROXIMATION: R2 = R*R */ + movaps %xmm0, %xmm4 + mulpd %xmm0, %xmm4 + movups __dC7(%rax), %xmm1 + mulpd %xmm4, %xmm1 + addpd __dC6(%rax), %xmm1 + mulpd %xmm4, %xmm1 + addpd __dC5(%rax), %xmm1 + mulpd %xmm4, %xmm1 + addpd __dC4(%rax), %xmm1 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + mulpd %xmm4, %xmm1 + addpd __dC3(%rax), %xmm1 + +/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ + mulpd %xmm4, %xmm1 + addpd __dC2(%rax), %xmm1 + mulpd %xmm4, %xmm1 + addpd __dC1(%rax), %xmm1 + mulpd %xmm1, %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm4, %xmm0 + +/* RECONSTRUCTION: + Final sign setting: Res = Poly^SignRes */ + xorps %xmm2, %xmm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm3, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %dl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %ecx, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %eax, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + 
movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 200(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + movsd %xmm0, 264(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 192(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + movsd %xmm0, 256(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVbN2v_cos_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S new file mode 100644 index 0000000000..ff382e9c6c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized cos, vector length is 4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4v_cos) + .type _ZGVdN4v_cos, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN4v_cos_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN4v_cos_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4v_cos) +libmvec_hidden_def (_ZGVdN4v_cos) + +#define _ZGVdN4v_cos _ZGVdN4v_cos_sse_wrapper +#include "../svml_d_cos4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S new file mode 100644 index 0000000000..4b6d09743b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S @@ -0,0 +1,207 @@ +/* Function cos vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_d_trig_data.h" + + .text +ENTRY (_ZGVdN4v_cos_avx2) + +/* ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg + Pi/2 = (N*Pi + R) + + Result calculation: + cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + vmovapd %ymm0, %ymm1 + vmovupd __dInvPI(%rax), %ymm4 + vmovupd __dRShifter(%rax), %ymm5 + +/* + ARGUMENT RANGE REDUCTION: + Add Pi/2 to argument: X' = X+Pi/2 + */ + vaddpd __dHalfPI(%rax), %ymm1, %ymm7 + +/* Get absolute argument value: X' = |X'| */ + vandpd __dAbsMask(%rax), %ymm7, %ymm2 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd %ymm5, %ymm4, %ymm7 + vmovupd __dC7(%rax), %ymm4 + +/* Check for large arguments path */ + vcmpnle_uqpd __dRangeVal(%rax), %ymm2, %ymm3 + +/* N = Y - RS : right shifter sub */ + vsubpd %ymm5, %ymm7, %ymm6 + vmovupd __dPI1_FMA(%rax), %ymm2 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %ymm7, %ymm7 + +/* N = N - 0.5 */ + vsubpd __dOneHalf(%rax), %ymm6, %ymm0 + vmovmskpd %ymm3, %ecx + +/* R = X - N*Pi1 */ + vmovapd %ymm1, %ymm3 + vfnmadd231pd %ymm0, %ymm2, %ymm3 + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %ymm0, %ymm3 + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %ymm3, %ymm0 + +/* POLYNOMIAL APPROXIMATION: R2 = R*R */ + vmulpd %ymm0, %ymm0, %ymm5 + vfmadd213pd __dC6(%rax), %ymm5, %ymm4 + vfmadd213pd __dC5(%rax), %ymm5, %ymm4 + vfmadd213pd __dC4(%rax), %ymm5, %ymm4 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3(%rax), %ymm5, %ymm4 + +/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ + vfmadd213pd __dC2(%rax), %ymm5, %ymm4 + vfmadd213pd __dC1(%rax), %ymm5, %ymm4 + vmulpd %ymm5, %ymm4, %ymm6 + vfmadd213pd %ymm0, %ymm0, %ymm6 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignRes */ + vxorpd %ymm7, %ymm6, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm1, 320(%rsp) + vmovupd %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovupd 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), 
%r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 328(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(cos) + + vmovsd %xmm0, 392(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 320(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(cos) + + vmovsd %xmm0, 384(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4v_cos_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S new file mode 100644 index 0000000000..46d35a25d2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized cos, vector length is 8. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8v_cos) + .type _ZGVeN8v_cos, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX +1: leaq _ZGVeN8v_cos_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN8v_cos_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN8v_cos_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN8v_cos) + +#define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper +#include "../svml_d_cos8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S new file mode 100644 index 0000000000..e7af83c6d5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S @@ -0,0 +1,463 @@ +/* Function cos vectorized with AVX-512, KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_d_trig_data.h" +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_cos_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_cos +#else +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg + Pi/2 = (N*Pi + R) + + Result calculation: + cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + +/* R = X - N*Pi1 */ + vmovaps %zmm0, %zmm7 + +/* Check for large arguments path */ + movq $-1, %rcx + +/* + ARGUMENT RANGE REDUCTION: + Add Pi/2 to argument: X' = X+Pi/2 + */ + vaddpd __dHalfPI(%rax), %zmm0, %zmm5 + vmovups __dInvPI(%rax), %zmm3 + +/* Get absolute argument value: X' = |X'| */ + vpandq __dAbsMask(%rax), %zmm5, %zmm1 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd __dRShifter(%rax), %zmm3, %zmm5 + vmovups __dPI1_FMA(%rax), %zmm6 + +/* N = Y - RS : right shifter sub */ + vsubpd __dRShifter(%rax), %zmm5, %zmm4 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm5, %zmm12 + vmovups __dC7(%rax), %zmm8 + +/* N = N - 0.5 */ + vsubpd __dOneHalf(%rax), %zmm4, %zmm10 + vcmppd $22, __dRangeVal(%rax), %zmm1, %k1 + vpbroadcastq %rcx, %zmm2{%k1}{z} + vfnmadd231pd %zmm10, %zmm6, %zmm7 + vptestmq %zmm2, %zmm2, %k0 + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm10, %zmm7 + kmovw %k0, %ecx + movzbl %cl, %ecx + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %zmm7, %zmm10 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + vmulpd %zmm10, %zmm10, %zmm9 + vfmadd213pd __dC6(%rax), %zmm9, %zmm8 + vfmadd213pd __dC5(%rax), %zmm9, %zmm8 + vfmadd213pd __dC4(%rax), %zmm9, %zmm8 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3(%rax), %zmm9, %zmm8 + +/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ + vfmadd213pd __dC2(%rax), %zmm9, %zmm8 + vfmadd213pd __dC1(%rax), %zmm9, %zmm8 + vmulpd %zmm9, %zmm8, %zmm11 + vfmadd213pd %zmm10, %zmm10, %zmm11 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignRes + */ + vpxorq %zmm12, %zmm11, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + 
cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + call JUMPTARGET(cos) + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + call JUMPTARGET(cos) + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_1_7 +#endif +END (_ZGVeN8v_cos_knl) + +ENTRY (_ZGVeN8v_cos_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_cos +#else +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg + Pi/2 = (N*Pi + R) + + Result calculation: + cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + +/* R = X - N*Pi1 */ + vmovaps %zmm0, %zmm8 + +/* Check for large arguments path */ + vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2 + +/* + ARGUMENT RANGE REDUCTION: + Add Pi/2 to argument: X' = X+Pi/2 + */ + vaddpd __dHalfPI(%rax), %zmm0, %zmm6 + vmovups __dInvPI(%rax), %zmm3 + vmovups __dRShifter(%rax), %zmm4 + vmovups __dPI1_FMA(%rax), %zmm7 + vmovups __dC7(%rax), %zmm9 + +/* Get absolute argument value: X' = |X'| */ + vandpd __dAbsMask(%rax), %zmm6, %zmm1 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd %zmm4, %zmm3, %zmm6 + vcmppd $18, __dRangeVal(%rax), %zmm1, %k1 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm6, %zmm13 + +/* N = Y - RS : right shifter sub */ + vsubpd %zmm4, %zmm6, %zmm5 + +/* N = N - 0.5 */ + vsubpd __dOneHalf(%rax), %zmm5, %zmm11 + vfnmadd231pd %zmm11, %zmm7, %zmm8 + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm11, %zmm8 + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %zmm8, %zmm11 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + vmulpd %zmm11, %zmm11, %zmm10 + vfmadd213pd __dC6(%rax), %zmm10, %zmm9 + vfmadd213pd __dC5(%rax), %zmm10, %zmm9 + vfmadd213pd __dC4(%rax), %zmm10, %zmm9 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3(%rax), %zmm10, %zmm9 + +/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ + vfmadd213pd __dC2(%rax), %zmm10, %zmm9 + vfmadd213pd __dC1(%rax), %zmm10, %zmm9 + vmulpd %zmm10, %zmm9, %zmm12 + vfmadd213pd %zmm11, %zmm11, %zmm12 + vpandnq %zmm1, 
%zmm1, %zmm2{%k1} + vcmppd $3, %zmm2, %zmm2, %k0 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignRes + */ + vxorpd %zmm13, %zmm12, %zmm1 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_2_7 +#endif +END (_ZGVeN8v_cos_skx) + + .section .rodata, "a" +.L_2il0floatpacket.16: + .long 0xffffffff,0xffffffff + .type .L_2il0floatpacket.16,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S new file mode 100644 index 0000000000..5a17e11a0f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized exp. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2v_exp) + .type _ZGVbN2v_exp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN2v_exp_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN2v_exp_sse2(%rip), %rax + ret +END (_ZGVbN2v_exp) +libmvec_hidden_def (_ZGVbN2v_exp) + +#define _ZGVbN2v_exp _ZGVbN2v_exp_sse2 +#include "../svml_d_exp2_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S new file mode 100644 index 0000000000..864dc5ae9f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S @@ -0,0 +1,225 @@ +/* Function exp vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_exp_data.h" + + .text +ENTRY (_ZGVbN2v_exp_sse4) +/* + ALGORITHM DESCRIPTION: + + Argument representation: + N = rint(X*2^k/ln2) = 2^k*M+j + X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + N = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) + = 2^M * 2^(j/2^k) * exp(r) + 2^M is calculated by bit manipulation + 2^(j/2^k) is stored in table + exp(r) is approximated by polynomial. + + The table lookup is skipped if k = 0. 
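
   As an illustration of the reduction above, here is a minimal scalar C
   sketch. It is not the code in this file: it assumes K = 7 index bits,
   builds its own 2^(j/2^K) table with exp2(), keeps ln2/2^K as a single
   constant (the real code splits it into hi and lo parts), and uses a
   degree-3 polynomial, so its accuracy is only illustrative.

     #include <math.h>
     #include <stdio.h>

     #define K   7              // table index bits (assumed, not the value used here)
     #define TBL (1 << K)       // 2^K table entries

     static const double ln2 = 0.6931471805599453;
     static double T[TBL];      // T[j] = 2^(j/2^K)

     static void init_table (void)
     {
       for (int j = 0; j < TBL; j++)
         T[j] = exp2 ((double) j / TBL);
     }

     static double exp_sketch (double x)
     {
       // N = rint(x * 2^K / ln2) = 2^K*M + j
       long long n = llrint (x * (TBL / ln2));
       int j = (int) (n & (TBL - 1));
       int m = (int) ((n - j) / TBL);

       // r = x - N*ln2/2^K; a single fma stands in for the hi/lo split
       double r = fma (-(double) n, ln2 / TBL, x);

       // exp(r) ~= 1 + r + r^2/2 + r^3/6 on the tiny reduced interval
       double p = 1.0 + r * (1.0 + r * (0.5 + r * (1.0 / 6.0)));

       // exp(x) = 2^M * 2^(j/2^K) * exp(r)
       return ldexp (T[j] * p, m);
     }

     int main (void)
     {
       init_table ();
       printf ("%.17g\n%.17g\n", exp_sketch (1.2345), exp (1.2345));
       return 0;
     }

   Compile with -lm; the two printed values agree to roughly 12 significant
   digits, in line with the low-accuracy goal stated above.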
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm3 + movq __svml_dexp_data@GOTPCREL(%rip), %r8 + +/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ + pshufd $221, %xmm3, %xmm7 + movups __dbInvLn2(%r8), %xmm0 + +/* dK = X*dbInvLn2 */ + mulpd %xmm3, %xmm0 + movq __iAbsMask(%r8), %xmm5 + movq __iDomainRange(%r8), %xmm6 + +/* iAbsX = iAbsX&iAbsMask */ + pand %xmm5, %xmm7 + +/* iRangeMask = (iAbsX>iDomainRange) */ + pcmpgtd %xmm6, %xmm7 + +/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ + movmskps %xmm7, %eax + +/* dN = rint(X*2^k/Ln2) */ + xorps %xmm7, %xmm7 + movups __dbLn2hi(%r8), %xmm5 + movups __dbLn2lo(%r8), %xmm6 + roundpd $0, %xmm0, %xmm7 + +/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ + mulpd %xmm7, %xmm5 + +/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ + mulpd %xmm6, %xmm7 + movups __dbShifter(%r8), %xmm4 + +/* dM = X*dbInvLn2+dbShifter */ + addpd %xmm0, %xmm4 + movaps %xmm3, %xmm0 + subpd %xmm5, %xmm0 + subpd %xmm7, %xmm0 + movups __dPC2(%r8), %xmm5 + +/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ + mulpd %xmm0, %xmm5 + addpd __dPC1(%r8), %xmm5 + mulpd %xmm0, %xmm5 + movups __dPC0(%r8), %xmm6 + addpd %xmm6, %xmm5 + mulpd %xmm5, %xmm0 + movdqu __lIndexMask(%r8), %xmm2 + +/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */ + movdqa %xmm2, %xmm1 + +/* lM = (*(longlong*)&dM)&(~lIndexMask) */ + pandn %xmm4, %xmm2 + pand %xmm4, %xmm1 + +/* lM = lM<<(52-K), 2^M */ + psllq $42, %xmm2 + +/* table lookup for dT[j] = 2^(j/2^k) */ + movd %xmm1, %edx + pextrw $4, %xmm1, %ecx + addpd %xmm0, %xmm6 + shll $3, %edx + shll $3, %ecx + movq (%r8,%rdx), %xmm0 + andl $3, %eax + movhpd (%r8,%rcx), %xmm0 + +/* 2^(j/2^k) * exp(r) */ + mulpd %xmm6, %xmm0 + +/* multiply by 2^M through integer add */ + paddq %xmm2, %xmm0 + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm3, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %cl, %cl + xorl %edx, %edx + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %cl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %eax, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %edx, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 200(%rsp,%r15), %xmm0 + + call 
JUMPTARGET(__exp_finite) + + movsd %xmm0, 264(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 192(%rsp,%r15), %xmm0 + + call JUMPTARGET(__exp_finite) + + movsd %xmm0, 256(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVbN2v_exp_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S new file mode 100644 index 0000000000..b994a794cd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized exp. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4v_exp) + .type _ZGVdN4v_exp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN4v_exp_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN4v_exp_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4v_exp) +libmvec_hidden_def (_ZGVdN4v_exp) + +#define _ZGVdN4v_exp _ZGVdN4v_exp_sse_wrapper +#include "../svml_d_exp4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S new file mode 100644 index 0000000000..937b3c09a6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S @@ -0,0 +1,212 @@ +/* Function exp vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_exp_data.h" + + .text +ENTRY (_ZGVdN4v_exp_avx2) +/* + ALGORITHM DESCRIPTION: + + Argument representation: + N = rint(X*2^k/ln2) = 2^k*M+j + X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + N = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) + = 2^M * 2^(j/2^k) * exp(r) + 2^M is calculated by bit manipulation + 2^(j/2^k) is stored in table + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. 
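
   The "multiply by 2^M through integer add" step below relies on the fact
   that, for a finite normal double, adding M << 52 to its bit pattern
   multiplies the value by 2^M. A scalar C illustration of just that trick
   (illustrative only: no overflow, underflow or NaN handling; out-of-range
   lanes in the vector code are sent to the scalar fallback instead):

     #include <stdint.h>
     #include <stdio.h>
     #include <string.h>

     // Multiply a finite, normal double by 2^m by adding m to its biased
     // exponent field; negative m works via modular arithmetic on uint64_t.
     static double scale_by_pow2 (double x, int64_t m)
     {
       uint64_t bits;
       memcpy (&bits, &x, sizeof bits);   // bit pattern of x
       bits += (uint64_t) m << 52;        // exponent field begins at bit 52
       memcpy (&x, &bits, sizeof bits);
       return x;
     }

     int main (void)
     {
       printf ("%g %g\n", scale_by_pow2 (1.5, 10), scale_by_pow2 (3.0, -2));
       return 0;                          // prints 1536 0.75
     }

   In the vector code the same add is done with vpaddq, after vpsllq has
   moved M from the masked mantissa bits up into the exponent field.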
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_dexp_data@GOTPCREL(%rip), %rax + vmovdqa %ymm0, %ymm2 + vmovupd __dbInvLn2(%rax), %ymm3 + vmovupd __dbShifter(%rax), %ymm1 + vmovupd __lIndexMask(%rax), %ymm4 + +/* dM = X*dbInvLn2+dbShifter, dbInvLn2 = 2^k/Ln2 */ + vfmadd213pd %ymm1, %ymm2, %ymm3 + +/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ + vextracti128 $1, %ymm2, %xmm5 + vshufps $221, %xmm5, %xmm2, %xmm6 + +/* iAbsX = iAbsX&iAbsMask */ + vandps __iAbsMask(%rax), %xmm6, %xmm7 + +/* dN = dM-dbShifter, dN = rint(X*2^k/Ln2) */ + vsubpd %ymm1, %ymm3, %ymm6 + +/* iRangeMask = (iAbsX>iDomainRange) */ + vpcmpgtd __iDomainRange(%rax), %xmm7, %xmm0 + vmovupd __dbLn2hi(%rax), %ymm1 + vmovupd __dPC0(%rax), %ymm7 + +/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ + vmovmskps %xmm0, %ecx + vmovupd __dPC2(%rax), %ymm0 + +/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ + vmovdqa %ymm2, %ymm5 + vfnmadd231pd %ymm6, %ymm1, %ymm5 + +/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ + vfnmadd132pd __dbLn2lo(%rax), %ymm5, %ymm6 + +/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ + vfmadd213pd __dPC1(%rax), %ymm6, %ymm0 + vfmadd213pd %ymm7, %ymm6, %ymm0 + vfmadd213pd %ymm7, %ymm6, %ymm0 + +/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */ + vandps %ymm4, %ymm3, %ymm1 + +/* table lookup for dT[j] = 2^(j/2^k) */ + vxorpd %ymm6, %ymm6, %ymm6 + vpcmpeqd %ymm5, %ymm5, %ymm5 + vgatherqpd %ymm5, (%rax,%ymm1,8), %ymm6 + +/* lM = (*(longlong*)&dM)&(~lIndexMask) */ + vpandn %ymm3, %ymm4, %ymm3 + +/* 2^(j/2^k) * exp(r) */ + vmulpd %ymm0, %ymm6, %ymm0 + +/* lM = lM<<(52-K), 2^M */ + vpsllq $42, %ymm3, %ymm4 + +/* multiply by 2^M through integer add */ + vpaddq %ymm4, %ymm0, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm2, 320(%rsp) + vmovupd %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovupd 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 328(%rsp,%r15), %xmm0 + vzeroupper + + call 
JUMPTARGET(__exp_finite) + + vmovsd %xmm0, 392(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 320(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(__exp_finite) + + vmovsd %xmm0, 384(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4v_exp_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S new file mode 100644 index 0000000000..6189080fcc --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized exp. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8v_exp) + .type _ZGVeN8v_exp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN8v_exp_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN8v_exp_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN8v_exp_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN8v_exp) + +#define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper +#include "../svml_d_exp8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S new file mode 100644 index 0000000000..97ba72c2a0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S @@ -0,0 +1,456 @@ +/* Function exp vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_d_exp_data.h" +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_exp_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_exp +#else +/* + ALGORITHM DESCRIPTION: + + Argument representation: + N = rint(X*2^k/ln2) = 2^k*M+j + X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + N = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) + = 2^M * 2^(j/2^k) * exp(r) + 2^M is calculated by bit manipulation + 2^(j/2^k) is stored in table + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_dexp_data@GOTPCREL(%rip), %rax + +/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ + vmovaps %zmm0, %zmm8 + +/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ + vpsrlq $32, %zmm0, %zmm1 + +/* iAbsX = iAbsX&iAbsMask */ + movl $255, %edx + vpmovqd %zmm1, %ymm2 + kmovw %edx, %k2 + +/* iRangeMask = (iAbsX>iDomainRange) */ + movl $-1, %ecx + +/* table lookup for dT[j] = 2^(j/2^k) */ + vpxord %zmm11, %zmm11, %zmm11 + vmovups __dbInvLn2(%rax), %zmm5 + vmovups __dbLn2hi(%rax), %zmm7 + kxnorw %k3, %k3, %k3 + +/* dM = X*dbInvLn2+dbShifter, dbInvLn2 = 2^k/Ln2 */ + vfmadd213pd __dbShifter(%rax), %zmm0, %zmm5 + vmovups __dPC2(%rax), %zmm12 + +/* dN = dM-dbShifter, dN = rint(X*2^k/Ln2) */ + vsubpd __dbShifter(%rax), %zmm5, %zmm9 + vmovups __lIndexMask(%rax), %zmm4 + vfnmadd231pd %zmm9, %zmm7, %zmm8 + vpandd __iAbsMask(%rax), %zmm2, %zmm2{%k2} + +/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */ + vpandq %zmm4, %zmm5, %zmm10 + vgatherqpd (%rax,%zmm10,8), %zmm11{%k3} + vpcmpgtd __iDomainRange(%rax), %zmm2, %k1{%k2} + +/* lM = (*(longlong*)&dM)&(~lIndexMask) */ + vpandnq %zmm5, %zmm4, %zmm6 + vpbroadcastd %ecx, %zmm3{%k1}{z} + +/* lM = lM<<(52-K), 2^M */ + vpsllq $42, %zmm6, %zmm14 + +/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ + vfnmadd132pd __dbLn2lo(%rax), %zmm8, %zmm9 + +/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ + vptestmd %zmm3, %zmm3, %k0{%k2} + +/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ + vfmadd213pd __dPC1(%rax), %zmm9, %zmm12 + kmovw %k0, %ecx + movzbl %cl, %ecx + vfmadd213pd __dPC0(%rax), %zmm9, %zmm12 + vfmadd213pd __dPC0(%rax), %zmm9, %zmm12 + +/* 2^(j/2^k) * exp(r) */ + vmulpd %zmm12, %zmm11, %zmm13 + +/* multiply by 2^M through integer add */ + vpaddq %zmm14, %zmm13, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + 
movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + call JUMPTARGET(__exp_finite) + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + call JUMPTARGET(__exp_finite) + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_1_7 +#endif +END (_ZGVeN8v_exp_knl) + +ENTRY (_ZGVeN8v_exp_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_exp +#else +/* + ALGORITHM DESCRIPTION: + + Argument representation: + N = rint(X*2^k/ln2) = 2^k*M+j + X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + N = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(M*ln2 + ln2*(j/2^k) + r) + = 2^M * 2^(j/2^k) * exp(r) + 2^M is calculated by bit manipulation + 2^(j/2^k) is stored in table + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. 
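
   The rint() above is not done with a rounding instruction here but with
   the "right shifter" constant (__dbShifter below): in round-to-nearest
   mode, adding a large power-of-two constant leaves the rounded integer in
   the low mantissa bits, and subtracting the constant back recovers
   rint(x). This also leaves N directly accessible for the table index (the
   vpandq with __lIndexMask below). A scalar C sketch of the trick, assuming
   a shifter of 2^52 + 2^51 (the actual value is whatever __dbShifter holds):

     #include <stdio.h>

     // Round to nearest with the "right shifter" trick; valid for |x| < 2^51
     // under the default round-to-nearest mode.
     static double rshifter_rint (double x)
     {
       const double shifter = 0x1.8p52;   // 2^52 + 2^51
       double y = x + shifter;            // integer value now in low mantissa bits
       return y - shifter;                // == rint(x)
     }

     int main (void)
     {
       printf ("%g %g %g\n", rshifter_rint (2.5), rshifter_rint (-2.5),
               rshifter_rint (3.7));      // prints 2 -2 4 (ties to even)
       return 0;
     }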
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_dexp_data@GOTPCREL(%rip), %rax + +/* table lookup for dT[j] = 2^(j/2^k) */ + kxnorw %k1, %k1, %k1 + +/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */ + vpsrlq $32, %zmm0, %zmm1 + vmovups __dbInvLn2(%rax), %zmm7 + vmovups __dbShifter(%rax), %zmm5 + vmovups __lIndexMask(%rax), %zmm6 + vmovups __dbLn2hi(%rax), %zmm9 + vmovups __dPC0(%rax), %zmm12 + +/* dM = X*dbInvLn2+dbShifter, dbInvLn2 = 2^k/Ln2 */ + vfmadd213pd %zmm5, %zmm0, %zmm7 + vpmovqd %zmm1, %ymm2 + +/* dN = dM-dbShifter, dN = rint(X*2^k/Ln2) */ + vsubpd %zmm5, %zmm7, %zmm11 + +/* iAbsX = iAbsX&iAbsMask */ + vpand __iAbsMask(%rax), %ymm2, %ymm3 + +/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */ + vmovaps %zmm0, %zmm10 + vfnmadd231pd %zmm11, %zmm9, %zmm10 + vmovups __dPC2(%rax), %zmm9 + +/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */ + vfnmadd132pd __dbLn2lo(%rax), %zmm10, %zmm11 + +/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */ + vfmadd213pd __dPC1(%rax), %zmm11, %zmm9 + vfmadd213pd %zmm12, %zmm11, %zmm9 + vfmadd213pd %zmm12, %zmm11, %zmm9 + +/* iRangeMask = (iAbsX>iDomainRange) */ + vpcmpgtd __iDomainRange(%rax), %ymm3, %ymm4 + +/* Mask = iRangeMask?1:0, set mask for overflow/underflow */ + vmovmskps %ymm4, %ecx + +/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */ + vpandq %zmm6, %zmm7, %zmm13 + vpmovqd %zmm13, %ymm14 + vpxord %zmm15, %zmm15, %zmm15 + vgatherdpd (%rax,%ymm14,8), %zmm15{%k1} + +/* 2^(j/2^k) * exp(r) */ + vmulpd %zmm9, %zmm15, %zmm10 + +/* lM = (*(longlong*)&dM)&(~lIndexMask) */ + vpandnq %zmm7, %zmm6, %zmm8 + +/* lM = lM<<(52-K), 2^M */ + vpsllq $42, %zmm8, %zmm1 + +/* multiply by 2^M through integer add */ + vpaddq %zmm1, %zmm10, %zmm1 + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), 
%zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + call JUMPTARGET(__exp_finite) + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + call JUMPTARGET(__exp_finite) + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_2_7 + +#endif +END (_ZGVeN8v_exp_skx) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S new file mode 100644 index 0000000000..5097add6b5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized log. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2v_log) + .type _ZGVbN2v_log, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN2v_log_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN2v_log_sse2(%rip), %rax + ret +END (_ZGVbN2v_log) +libmvec_hidden_def (_ZGVbN2v_log) + +#define _ZGVbN2v_log _ZGVbN2v_log_sse2 +#include "../svml_d_log2_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S new file mode 100644 index 0000000000..7d4b3c8850 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S @@ -0,0 +1,229 @@ +/* Function log vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_log_data.h" + + .text +ENTRY (_ZGVbN2v_log_sse4) +/* + ALGORITHM DESCRIPTION: + + log(x) = -log(Rcp) + log(Rcp*x), + where Rcp ~ 1/x (accuracy ~9 bits, obtained by rounding + HW approximation to 1+9 mantissa bits) + + Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial + + log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp) + -log(mantissa_Rcp) is obtained from a lookup table, + accessed by a 9-bit index + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm6 + movq __svml_dlog_data@GOTPCREL(%rip), %r8 + movaps %xmm6, %xmm3 + movaps %xmm6, %xmm2 + +/* isolate exponent bits */ + movaps %xmm6, %xmm1 + psrlq $20, %xmm1 + movups _ExpMask(%r8), %xmm5 + +/* preserve mantissa, set input exponent to 2^(-10) */ + andps %xmm6, %xmm5 + orps _Two10(%r8), %xmm5 + +/* reciprocal approximation good to at least 11 bits */ + cvtpd2ps %xmm5, %xmm7 + cmpltpd _MinNorm(%r8), %xmm3 + cmpnlepd _MaxNorm(%r8), %xmm2 + movlhps %xmm7, %xmm7 + +/* combine and get argument value range mask */ + orps %xmm2, %xmm3 + rcpps %xmm7, %xmm0 + movmskpd %xmm3, %eax + movups _HalfMask(%r8), %xmm2 + +/* argument reduction started: R = Mantissa*Rcp - 1 */ + andps %xmm5, %xmm2 + cvtps2pd %xmm0, %xmm4 + subpd %xmm2, %xmm5 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + roundpd $0, %xmm4, %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + subpd _One(%r8), %xmm2 + addpd %xmm2, %xmm5 + movups _Threshold(%r8), %xmm2 + +/* calculate index for table lookup */ + movaps %xmm4, %xmm3 + cmpltpd %xmm4, %xmm2 + pshufd $221, %xmm1, %xmm7 + psrlq $40, %xmm3 + +/* convert biased exponent to DP format */ + cvtdq2pd %xmm7, %xmm0 + movd %xmm3, %edx + movups _poly_coeff_1(%r8), %xmm4 + +/* polynomial computation */ + mulpd %xmm5, %xmm4 + andps _Bias(%r8), %xmm2 + orps _Bias1(%r8), %xmm2 + +/* + Table stores -log(0.5*mantissa) for larger mantissas, + adjust exponent accordingly + */ + subpd %xmm2, %xmm0 + addpd _poly_coeff_2(%r8), %xmm4 + +/* exponent*log(2.0) */ + mulpd _L2(%r8), %xmm0 + movaps %xmm5, %xmm2 + mulpd %xmm5, %xmm2 + movups _poly_coeff_3(%r8), %xmm7 + mulpd %xmm5, %xmm7 + mulpd %xmm2, %xmm4 + addpd _poly_coeff_4(%r8), %xmm7 + addpd %xmm4, %xmm7 + mulpd %xmm7, %xmm2 + movslq %edx, %rdx + pextrd $2, %xmm3, %ecx + +/* + reconstruction: + (exponent*log(2)) + (LogRcp + (R+poly)) + */ + addpd %xmm2, %xmm5 + movslq %ecx, %rcx + movsd _LogRcp_lookup(%r8,%rdx), %xmm1 + movhpd _LogRcp_lookup(%r8,%rcx), %xmm1 + addpd %xmm5, %xmm1 + addpd %xmm1, %xmm0 + testl %eax, %eax + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm6, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %cl, %cl + xorl %edx, %edx + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %cl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %eax, %r13d + movq %r14, 152(%rsp) + 
cfi_offset_rel_rsp (14, 152) + movl %edx, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 200(%rsp,%r15), %xmm0 + + call JUMPTARGET(__log_finite) + + movsd %xmm0, 264(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 192(%rsp,%r15), %xmm0 + + call JUMPTARGET(__log_finite) + + movsd %xmm0, 256(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVbN2v_log_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S new file mode 100644 index 0000000000..1e9a2f48a1 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized log. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4v_log) + .type _ZGVdN4v_log, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN4v_log_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN4v_log_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4v_log) +libmvec_hidden_def (_ZGVdN4v_log) + +#define _ZGVdN4v_log _ZGVdN4v_log_sse_wrapper +#include "../svml_d_log4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S new file mode 100644 index 0000000000..04ea9e0071 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S @@ -0,0 +1,210 @@ +/* Function log vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_log_data.h" + + .text +ENTRY (_ZGVdN4v_log_avx2) +/* ALGORITHM DESCRIPTION: + + log(x) = -log(Rcp) + log(Rcp*x), + where Rcp ~ 1/x (accuracy ~9 bits, obtained by rounding + HW approximation to 1+9 mantissa bits) + + Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial + + log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp) + -log(mantissa_Rcp) is obtained from a lookup table, + accessed by a 9-bit index + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_dlog_data@GOTPCREL(%rip), %rax + vmovdqa %ymm0, %ymm5 + +/* isolate exponent bits */ + vpsrlq $20, %ymm5, %ymm0 + +/* preserve mantissa, set input exponent to 2^(-10) */ + vandpd _ExpMask(%rax), %ymm5, %ymm6 + vorpd _Two10(%rax), %ymm6, %ymm4 + +/* reciprocal approximation good to at least 11 bits */ + vcvtpd2ps %ymm4, %xmm7 + vrcpps %xmm7, %xmm1 + vcmplt_oqpd _MinNorm(%rax), %ymm5, %ymm7 + vcvtps2pd %xmm1, %ymm3 + vcmpnle_uqpd _MaxNorm(%rax), %ymm5, %ymm1 + vextracti128 $1, %ymm0, %xmm2 + vshufps $221, %xmm2, %xmm0, %xmm6 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vroundpd $0, %ymm3, %ymm2 + +/* convert biased exponent to DP format */ + vcvtdq2pd %xmm6, %ymm0 + +/* combine and get argument value range mask */ + vorpd %ymm1, %ymm7, %ymm3 + vmovupd _One(%rax), %ymm1 + vmovmskpd %ymm3, %ecx + +/* calculate index for table lookup */ + vpsrlq $40, %ymm2, %ymm3 + +/* argument reduction started: R = Mantissa*Rcp - 1 */ + vfmsub213pd %ymm1, %ymm2, %ymm4 + vcmpgt_oqpd _Threshold(%rax), %ymm2, %ymm2 + vpcmpeqd %ymm6, %ymm6, %ymm6 + vxorpd %ymm1, %ymm1, %ymm1 + vgatherqpd %ymm6, _LogRcp_lookup(%rax,%ymm3), %ymm1 + +/* exponent*log(2.0) */ + vmovupd _poly_coeff_1(%rax), %ymm6 + vmulpd %ymm4, %ymm4, %ymm3 + +/* polynomial computation */ + vfmadd213pd _poly_coeff_2(%rax), %ymm4, %ymm6 + vandpd _Bias(%rax), %ymm2, %ymm7 + vorpd _Bias1(%rax), %ymm7, %ymm2 + +/* + Table stores -log(0.5*mantissa) for larger mantissas, + adjust exponent accordingly + */ + vsubpd %ymm2, %ymm0, %ymm0 + vmovupd _poly_coeff_3(%rax), %ymm2 + vfmadd213pd _poly_coeff_4(%rax), %ymm4, %ymm2 + vfmadd213pd %ymm2, %ymm3, %ymm6 + +/* + reconstruction: + (exponent*log(2)) + (LogRcp + (R+poly)) + */ + vfmadd213pd %ymm4, %ymm3, %ymm6 + vaddpd %ymm1, %ymm6, %ymm4 + vfmadd132pd _L2(%rax), %ymm4, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm5, 320(%rsp) + vmovupd %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + 
cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovupd 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 328(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(__log_finite) + + vmovsd %xmm0, 392(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 320(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(__log_finite) + + vmovsd %xmm0, 384(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4v_log_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S new file mode 100644 index 0000000000..43f572d36c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized log. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8v_log) + .type _ZGVeN8v_log, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN8v_log_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN8v_log_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN8v_log_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN8v_log) + +#define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper +#include "../svml_d_log8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S new file mode 100644 index 0000000000..d10d5114c6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S @@ -0,0 +1,468 @@ +/* Function log vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_log_data.h" +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_log_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_log +#else +/* + ALGORITHM DESCRIPTION: + + log(x) = -log(Rcp) + log(Rcp*x), + where Rcp ~ 1/x (accuracy ~9 bits, obtained by + rounding HW approximation to 1+9 mantissa bits) + + Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial + + log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp) + -log(mantissa_Rcp) is obtained from a lookup table, + accessed by a 9-bit index + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_dlog_data@GOTPCREL(%rip), %rdx + movq $-1, %rax + +/* isolate exponent bits */ + vpsrlq $20, %zmm0, %zmm2 + vpsrlq $32, %zmm2, %zmm3 + vpxord %zmm2, %zmm2, %zmm2 + kxnorw %k3, %k3, %k3 + vmovups _Two10(%rdx), %zmm1 + vmovups _One(%rdx), %zmm9 + vpmovqd %zmm3, %ymm4 + +/* convert biased exponent to DP format */ + vcvtdq2pd %ymm4, %zmm13 + +/* preserve mantissa, set input exponent to 2^(-10) */ + vpternlogq $248, _ExpMask(%rdx), %zmm0, %zmm1 + vcmppd $17, _MinNorm(%rdx), %zmm0, %k1 + +/* reciprocal approximation good to at least 11 bits */ + vrcp28pd %zmm1, %zmm5 + vpbroadcastq %rax, %zmm6{%k1}{z} + vmovups _poly_coeff_3(%rdx), %zmm15 + vcmppd $22, _MaxNorm(%rdx), %zmm0, %k2 + vmovups _Bias1(%rdx), %zmm14 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vrndscalepd $8, %zmm5, %zmm11 + vpbroadcastq %rax, %zmm7{%k2}{z} + +/* argument reduction started: R = Mantissa*Rcp - 1 */ + vfmsub213pd %zmm9, %zmm11, %zmm1 + +/* calculate index for table lookup */ + vpsrlq $40, %zmm11, %zmm10 + vgatherqpd _LogRcp_lookup(%rdx,%zmm10), %zmm2{%k3} + vcmppd $30, _Threshold(%rdx), %zmm11, %k1 + +/* combine and get argument value range mask */ + vporq %zmm7, %zmm6, %zmm8 + +/* exponent*log(2.0) */ + vmovups _poly_coeff_1(%rdx), %zmm11 + vmulpd %zmm1, %zmm1, %zmm10 + vptestmq %zmm8, %zmm8, %k0 + vfmadd213pd _poly_coeff_4(%rdx), %zmm1, %zmm15 + kmovw %k0, %ecx + +/* polynomial computation */ + vfmadd213pd _poly_coeff_2(%rdx), %zmm1, %zmm11 + movzbl %cl, %ecx + vpbroadcastq %rax, %zmm12{%k1}{z} + vfmadd213pd %zmm15, %zmm10, %zmm11 + vpternlogq $248, _Bias(%rdx), %zmm12, %zmm14 + +/* + Table stores -log(0.5*mantissa) for larger mantissas, + adjust exponent accordingly + */ + vsubpd %zmm14, %zmm13, %zmm3 + +/* + reconstruction: + (exponent*log(2)) + (LogRcp + (R+poly)) + */ + vfmadd213pd %zmm1, %zmm10, %zmm11 + vaddpd %zmm2, %zmm11, %zmm1 + vfmadd132pd _L2(%rdx), %zmm1, %zmm3 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm3, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: 
+ cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm3, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm3 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + call JUMPTARGET(__log_finite) + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + call JUMPTARGET(__log_finite) + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_1_7 +#endif +END (_ZGVeN8v_log_knl) + +ENTRY (_ZGVeN8v_log_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_log +#else +/* + ALGORITHM DESCRIPTION: + + log(x) = -log(Rcp) + log(Rcp*x), + where Rcp ~ 1/x (accuracy ~9 bits, + obtained by rounding HW approximation to 1+9 mantissa bits) + + Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial + + log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp) + -log(mantissa_Rcp) is obtained from a lookup table, + accessed by a 9-bit index + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_dlog_data@GOTPCREL(%rip), %rax + vmovaps %zmm0, %zmm3 + kxnorw %k3, %k3, %k3 + vmovups _Two10(%rax), %zmm2 + vmovups _Threshold(%rax), %zmm14 + vmovups _One(%rax), %zmm11 + vcmppd $21, _MinNorm(%rax), %zmm3, %k1 + vcmppd $18, _MaxNorm(%rax), %zmm3, %k2 + +/* isolate exponent bits */ + vpsrlq $20, %zmm3, %zmm4 + +/* preserve mantissa, set input exponent to 2^(-10) */ + vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2 + vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1 + vpsrlq $32, %zmm4, %zmm6 + +/* 
reciprocal approximation good to at least 11 bits */ + vrcp14pd %zmm2, %zmm5 + +/* exponent*log(2.0) */ + vmovups _poly_coeff_1(%rax), %zmm4 + vpmovqd %zmm6, %ymm7 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vrndscalepd $8, %zmm5, %zmm0 + +/* calculate index for table lookup */ + vpsrlq $40, %zmm0, %zmm12 + +/* argument reduction started: R = Mantissa*Rcp - 1 */ + vfmsub213pd %zmm11, %zmm0, %zmm2 + vpmovqd %zmm12, %ymm13 + +/* polynomial computation */ + vfmadd213pd _poly_coeff_2(%rax), %zmm2, %zmm4 + vmovaps %zmm1, %zmm8 + vmovaps %zmm1, %zmm9 + vpxord %zmm5, %zmm5, %zmm5 + vgatherdpd _LogRcp_lookup(%rax,%ymm13), %zmm5{%k3} + vmovups _Bias1(%rax), %zmm13 + vpandnq %zmm3, %zmm3, %zmm8{%k1} + vcmppd $21, %zmm0, %zmm14, %k1 + vpandnq %zmm14, %zmm14, %zmm1{%k1} + vmulpd %zmm2, %zmm2, %zmm14 + vpternlogq $248, _Bias(%rax), %zmm1, %zmm13 + vmovups _poly_coeff_3(%rax), %zmm1 + vfmadd213pd _poly_coeff_4(%rax), %zmm2, %zmm1 + vfmadd213pd %zmm1, %zmm14, %zmm4 + +/* + reconstruction: + (exponent*log(2)) + (LogRcp + (R+poly)) + */ + vfmadd213pd %zmm2, %zmm14, %zmm4 + vaddpd %zmm5, %zmm4, %zmm2 + vpandnq %zmm3, %zmm3, %zmm9{%k2} + +/* combine and get argument value range mask */ + vorpd %zmm9, %zmm8, %zmm10 + vcmppd $3, %zmm10, %zmm10, %k0 + kmovw %k0, %ecx + +/* convert biased exponent to DP format */ + vcvtdq2pd %ymm7, %zmm15 + +/* + Table stores -log(0.5*mantissa) for larger mantissas, + adjust exponent accordingly + */ + vsubpd %zmm13, %zmm15, %zmm0 + vfmadd132pd _L2(%rax), %zmm2, %zmm0 + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm3, 1152(%rsp) + vmovups %zmm0, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm0 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi 
+ movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(__log_finite) + + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(__log_finite) + + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_2_7 +#endif +END (_ZGVeN8v_log_skx) + + .section .rodata, "a" +.L_2il0floatpacket.12: + .long 0xffffffff,0xffffffff + .type .L_2il0floatpacket.12,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S new file mode 100644 index 0000000000..adb0872e56 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized pow. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2vv_pow) + .type _ZGVbN2vv_pow, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN2vv_pow_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN2vv_pow_sse2(%rip), %rax + ret +END (_ZGVbN2vv_pow) +libmvec_hidden_def (_ZGVbN2vv_pow) + +#define _ZGVbN2vv_pow _ZGVbN2vv_pow_sse2 +#include "../svml_d_pow2_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S new file mode 100644 index 0000000000..ad7c215ff0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S @@ -0,0 +1,432 @@ +/* Function pow vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_d_pow_data.h" + + .text +ENTRY (_ZGVbN2vv_pow_sse4) +/* + ALGORITHM DESCRIPTION: + + 1) Calculating log2|x| + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. + Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. + + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. + Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where cq = X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + We get 3 parts of log2 result: HH+HL+HLL ~= log2|x|. + + 2) Calculation of y*(HH+HL+HLL). + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(PH+PL+PLL). + Mathematical idea of computing 2^(PH+PL+PLL) is the following. + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). + Hence 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + + We compute 2^(PH+PL+PLL) as follows. + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. + Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). + + Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. 
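+
+      Note on the N + j/2^expK split: PH is rounded with the
+      right-shifter constant db2p45_2p44 = 2^45+2^44; after that add,
+      the low bits of the significand hold round(PH*2^expK) as an
+      integer, so j is its low 7 bits and the remaining bits give N,
+      while PHL = PH - PHH carries the rounding remainder into Z.
+      Lanes flagged by the range checks on x, y and y*log2|x| are
+      recomputed below with scalar calls to __pow_finite.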
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $256, %rsp + movq __svml_dpow_data@GOTPCREL(%rip), %rdx + movups %xmm14, 80(%rsp) + movups %xmm9, 176(%rsp) + movaps %xmm1, %xmm9 + pshufd $221, %xmm0, %xmm1 + movq _iIndexMask(%rdx), %xmm14 + movq _iIndexAdd(%rdx), %xmm6 + +/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */ + pand %xmm1, %xmm14 + paddd %xmm6, %xmm14 + psrld $10, %xmm14 + movups %xmm13, 96(%rsp) + +/* Index for reciprocal table */ + movdqa %xmm14, %xmm13 + pslld $3, %xmm13 + +/* Index for log2 table */ + pslld $4, %xmm14 + movd %xmm13, %eax + movups %xmm10, 160(%rsp) + movups _iMantissaMask(%rdx), %xmm10 + movslq %eax, %rax + +/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */ + andps %xmm0, %xmm10 + pextrd $1, %xmm13, %ecx + movslq %ecx, %rcx + movups %xmm0, (%rsp) + movdqa %xmm1, %xmm0 + +/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */ + movq _i3fe7fe00(%rdx), %xmm6 + psubd %xmm6, %xmm0 + movups _iHighMask(%rdx), %xmm6 + psrad $20, %xmm0 + movups %xmm15, 48(%rsp) + movups %xmm12, 112(%rsp) + orps _dbOne(%rdx), %xmm10 + movsd 11712(%rdx,%rax), %xmm12 + movd %xmm14, %r8d + movq _i2p20_2p19(%rdx), %xmm15 + movhpd 11712(%rdx,%rcx), %xmm12 + paddd %xmm15, %xmm0 + pextrd $1, %xmm14, %r9d + +/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */ + movaps %xmm6, %xmm14 + andps %xmm10, %xmm14 + movaps %xmm10, %xmm15 + subpd %xmm14, %xmm15 + +/* r1 = x1*rcp1 */ + mulpd %xmm12, %xmm10 + +/* E = -r1+__fence(x1Hi*rcp1) */ + mulpd %xmm12, %xmm14 + +/* E=E+x1Lo*rcp1 */ + mulpd %xmm15, %xmm12 + subpd %xmm10, %xmm14 + pshufd $80, %xmm0, %xmm0 + movslq %r8d, %r8 + andps _iffffffff00000000(%rdx), %xmm0 + subpd _db2p20_2p19(%rdx), %xmm0 + addpd %xmm12, %xmm14 + movslq %r9d, %r9 + +/* T_Rh_Eh = T_Rh + E */ + movaps %xmm14, %xmm15 + movups %xmm8, 208(%rsp) + movups 19968(%rdx,%r8), %xmm8 + movups %xmm11, 144(%rsp) + movaps %xmm8, %xmm11 + +/* cq = c+r1 */ + movups _LHN(%rdx), %xmm13 + movhpd 19968(%rdx,%r9), %xmm11 + addpd %xmm10, %xmm13 + +/* T = k + L1hi */ + addpd %xmm0, %xmm11 + +/* T_Rh = T + cq */ + movaps %xmm13, %xmm12 + addpd %xmm11, %xmm12 + addpd %xmm12, %xmm15 + +/* Rl = T-T_Rh; -> -Rh */ + subpd %xmm12, %xmm11 + +/* HLL = T_Rh - T_Rh_Eh; -> -Eh */ + subpd %xmm15, %xmm12 + +/* Rl=Rl+cq; */ + addpd %xmm13, %xmm11 + +/* cq = cq + E */ + addpd %xmm14, %xmm13 + +/* HLL+=E; -> El */ + addpd %xmm14, %xmm12 + +/* HLL+=Rl */ + addpd %xmm12, %xmm11 + unpckhpd 19968(%rdx,%r9), %xmm8 + +/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */ + movaps %xmm15, %xmm14 + +/* HLL+=L1lo; */ + addpd %xmm11, %xmm8 + movups _clv_2(%rdx), %xmm11 + +/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */ + movaps %xmm6, %xmm12 + +/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */ + mulpd %xmm13, %xmm11 + addpd _clv_3(%rdx), %xmm11 + mulpd %xmm13, %xmm11 + addpd _clv_4(%rdx), %xmm11 + mulpd %xmm13, %xmm11 + addpd _clv_5(%rdx), %xmm11 + mulpd %xmm13, %xmm11 + addpd _clv_6(%rdx), %xmm11 + mulpd %xmm13, %xmm11 + addpd _clv_7(%rdx), %xmm11 + mulpd %xmm11, %xmm13 + addpd %xmm13, %xmm8 + addpd %xmm8, %xmm14 + +/* + 2^(y*(HH+HL+HLL)) starts here: + yH = y; Lo(yH)&=0xf8000000 + */ + andps %xmm9, %xmm6 + +/* yL = y-yH; */ + movaps %xmm9, %xmm11 + subpd %xmm6, %xmm11 + andps %xmm14, %xmm12 + +/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */ + movaps %xmm14, %xmm10 + +/* HL = T_Rh_Eh_HLLhi-HH; */ + subpd %xmm12, %xmm14 + subpd %xmm15, %xmm10 + movq _HIDELTA(%rdx), %xmm2 + +/* pH = yH*HH; */ + movaps %xmm6, 
%xmm13 + movq _LORANGE(%rdx), %xmm3 + paddd %xmm2, %xmm1 + pcmpgtd %xmm1, %xmm3 + +/* pL=yL*HL+yH*HL; pL+=yL*HH; */ + movaps %xmm11, %xmm1 + mulpd %xmm14, %xmm1 + mulpd %xmm14, %xmm6 + mulpd %xmm12, %xmm13 + mulpd %xmm11, %xmm12 + addpd %xmm6, %xmm1 + +/* HLL = HLL - HLLhi */ + subpd %xmm10, %xmm8 + addpd %xmm12, %xmm1 + +/* pLL = y*HLL */ + mulpd %xmm9, %xmm8 + movups _db2p45_2p44(%rdx), %xmm11 + +/* pHH = pH + *(double*)&db2p45_2p44 */ + movaps %xmm11, %xmm12 + addpd %xmm13, %xmm12 + +/* t=pL+pLL; t+=pHL */ + addpd %xmm8, %xmm1 + movq _ABSMASK(%rdx), %xmm5 + pshufd $221, %xmm9, %xmm4 + pand %xmm5, %xmm4 + movq _INF(%rdx), %xmm7 + movdqa %xmm4, %xmm2 + pcmpgtd %xmm7, %xmm2 + pcmpeqd %xmm7, %xmm4 + pshufd $136, %xmm12, %xmm7 + por %xmm4, %xmm2 + +/* pHH = pHH - *(double*)&db2p45_2p44 */ + subpd %xmm11, %xmm12 + pshufd $221, %xmm13, %xmm10 + por %xmm2, %xmm3 + +/* pHL = pH - pHH; */ + subpd %xmm12, %xmm13 + pand %xmm5, %xmm10 + movq _DOMAINRANGE(%rdx), %xmm5 + movdqa %xmm10, %xmm4 + addpd %xmm1, %xmm13 + pcmpgtd %xmm5, %xmm4 + pcmpeqd %xmm5, %xmm10 + por %xmm10, %xmm4 + movq _jIndexMask(%rdx), %xmm6 + por %xmm4, %xmm3 + movmskps %xmm3, %eax + +/* j = Lo(pHH)&0x0000007f */ + pand %xmm7, %xmm6 + movq _iOne(%rdx), %xmm3 + +/* _n = Lo(pHH); + _n = _n & 0xffffff80; + _n = _n >> 7; + Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n + */ + pslld $13, %xmm7 + paddd %xmm3, %xmm7 + pslld $4, %xmm6 + movups _cev_1(%rdx), %xmm3 + movaps %xmm13, %xmm4 + mulpd %xmm13, %xmm3 + +/* T1 = ((double*)exp2_tbl)[ 2*j ] */ + movd %xmm6, %r10d + pshufd $80, %xmm7, %xmm0 + andps _ifff0000000000000(%rdx), %xmm0 + addpd _cev_2(%rdx), %xmm3 + mulpd %xmm13, %xmm3 + addpd _cev_3(%rdx), %xmm3 + mulpd %xmm13, %xmm3 + movslq %r10d, %r10 + andl $3, %eax + pextrd $1, %xmm6, %r11d + movslq %r11d, %r11 + addpd _cev_4(%rdx), %xmm3 + movsd 36416(%rdx,%r10), %xmm2 + movhpd 36416(%rdx,%r11), %xmm2 + mulpd %xmm2, %xmm0 + mulpd %xmm3, %xmm13 + mulpd %xmm0, %xmm4 + addpd _cev_5(%rdx), %xmm13 + mulpd %xmm4, %xmm13 + addpd %xmm13, %xmm0 + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movups 208(%rsp), %xmm8 + movups 176(%rsp), %xmm9 + movups 160(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 112(%rsp), %xmm12 + movups 96(%rsp), %xmm13 + movups 80(%rsp), %xmm14 + movups 48(%rsp), %xmm15 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups (%rsp), %xmm1 + movups %xmm1, 64(%rsp) + movups %xmm9, 128(%rsp) + movups %xmm0, 192(%rsp) + je .LBL_1_2 + + xorb %cl, %cl + xorl %edx, %edx + movq %rsi, 8(%rsp) + movq %rdi, (%rsp) + movq %r12, 40(%rsp) + cfi_offset_rel_rsp (12, 40) + movb %cl, %r12b + movq %r13, 32(%rsp) + cfi_offset_rel_rsp (13, 32) + movl %eax, %r13d + movq %r14, 24(%rsp) + cfi_offset_rel_rsp (14, 24) + movl %edx, %r14d + movq %r15, 16(%rsp) + cfi_offset_rel_rsp (15, 16) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movq 8(%rsp), %rsi + movq (%rsp), %rdi + movq 40(%rsp), %r12 + cfi_restore (%r12) + movq 32(%rsp), %r13 + cfi_restore (%r13) + movq 24(%rsp), %r14 + cfi_restore (%r14) + movq 16(%rsp), %r15 + cfi_restore (%r15) + movups 192(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 72(%rsp,%r15), %xmm0 + movsd 136(%rsp,%r15), %xmm1 + + call JUMPTARGET(__pow_finite) + + movsd %xmm0, 200(%rsp,%r15) + jmp 
.LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 64(%rsp,%r15), %xmm0 + movsd 128(%rsp,%r15), %xmm1 + + call JUMPTARGET(__pow_finite) + + movsd %xmm0, 192(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVbN2vv_pow_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S new file mode 100644 index 0000000000..eea8af6638 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized pow. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4vv_pow) + .type _ZGVdN4vv_pow, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN4vv_pow_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN4vv_pow_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4vv_pow) +libmvec_hidden_def (_ZGVdN4vv_pow) + +#define _ZGVdN4vv_pow _ZGVdN4vv_pow_sse_wrapper +#include "../svml_d_pow4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S new file mode 100644 index 0000000000..3092328909 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S @@ -0,0 +1,387 @@ +/* Function pow vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_pow_data.h" + + .text +ENTRY (_ZGVdN4vv_pow_avx2) +/* + ALGORITHM DESCRIPTION: + + 1) Calculating log2|x| + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. + Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. + + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. 
+ Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where cq = X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + We get 3 parts of log2 result: HH+HL+HLL ~= log2|x|. + + 2) Calculation of y*(HH+HL+HLL). + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(PH+PL+PLL). + Mathematical idea of computing 2^(PH+PL+PLL) is the following. + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). + Hence 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + + We compute 2^(PH+PL+PLL) as follows. + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. + Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). + + Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_dpow_data@GOTPCREL(%rip), %rax + vmovups %ymm11, 160(%rsp) + vmovups %ymm8, 224(%rsp) + vmovups %ymm10, 352(%rsp) + vmovups %ymm9, 384(%rsp) + vmovups %ymm13, 288(%rsp) + vmovapd %ymm1, %ymm11 + vxorpd %ymm1, %ymm1, %ymm1 + vextracti128 $1, %ymm0, %xmm5 + vshufps $221, %xmm5, %xmm0, %xmm5 + +/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */ + vandps _iIndexMask(%rax), %xmm5, %xmm3 + vpaddd _iIndexAdd(%rax), %xmm3, %xmm6 + vpsrld $10, %xmm6, %xmm8 + +/* Index for reciprocal table */ + vpslld $3, %xmm8, %xmm9 + +/* Index for log2 table */ + vpslld $4, %xmm8, %xmm6 + +/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */ + vandpd _iMantissaMask(%rax), %ymm0, %ymm4 + vorpd _dbOne(%rax), %ymm4, %ymm13 + vpcmpeqd %ymm4, %ymm4, %ymm4 + vpcmpeqd %ymm8, %ymm8, %ymm8 + +/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */ + vpsubd _i3fe7fe00(%rax), %xmm5, %xmm3 + vpaddd _HIDELTA(%rax), %xmm5, %xmm5 + vextracti128 $1, %ymm11, %xmm7 + vshufps $221, %xmm7, %xmm11, %xmm2 + vpand _ABSMASK(%rax), %xmm2, %xmm10 + vpcmpeqd %ymm2, %ymm2, %ymm2 + vgatherdpd %ymm2, 11712(%rax,%xmm9), %ymm1 + vmovups _LORANGE(%rax), %xmm7 + vxorpd %ymm2, %ymm2, %ymm2 + vgatherdpd %ymm4, 19968(%rax,%xmm6), %ymm2 + vxorpd %ymm4, %ymm4, %ymm4 + vgatherdpd %ymm8, 19976(%rax,%xmm6), %ymm4 + vpsrad $20, %xmm3, %xmm6 + vpaddd _i2p20_2p19(%rax), %xmm6, %xmm9 + vpshufd $80, %xmm9, %xmm8 + vpshufd $250, %xmm9, %xmm3 + +/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */ + vandpd _iHighMask(%rax), %ymm13, %ymm9 + vinserti128 $1, %xmm3, %ymm8, %ymm6 + vandpd _iffffffff00000000(%rax), %ymm6, %ymm8 + +/* r1 = x1*rcp1 */ + vmulpd %ymm1, %ymm13, %ymm6 + vsubpd %ymm9, %ymm13, %ymm3 + vsubpd _db2p20_2p19(%rax), %ymm8, %ymm8 + +/* cq = c+r1 */ + vaddpd _LHN(%rax), %ymm6, %ymm13 + +/* E = -r1+__fence(x1Hi*rcp1) */ + vfmsub213pd %ymm6, %ymm1, 
%ymm9 + +/* E=E+x1Lo*rcp1 */ + vfmadd213pd %ymm9, %ymm1, %ymm3 + +/* T = k + L1hi */ + vaddpd %ymm2, %ymm8, %ymm1 + +/* T_Rh = T + cq */ + vaddpd %ymm13, %ymm1, %ymm8 + +/* Rl = T-T_Rh; -> -Rh */ + vsubpd %ymm8, %ymm1, %ymm6 + +/* Rl=Rl+cq */ + vaddpd %ymm6, %ymm13, %ymm1 + +/* T_Rh_Eh = T_Rh + E */ + vaddpd %ymm3, %ymm8, %ymm6 + +/* cq = cq + E */ + vaddpd %ymm3, %ymm13, %ymm13 + +/* HLL = T_Rh - T_Rh_Eh; -> -Eh */ + vsubpd %ymm6, %ymm8, %ymm9 + +/* HLL+=E; -> El */ + vaddpd %ymm9, %ymm3, %ymm2 + +/* HLL+=Rl */ + vaddpd %ymm1, %ymm2, %ymm8 + +/* HLL+=L1lo */ + vaddpd %ymm4, %ymm8, %ymm4 + vmovupd _clv_2(%rax), %ymm8 + +/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */ + vfmadd213pd _clv_3(%rax), %ymm13, %ymm8 + vfmadd213pd _clv_4(%rax), %ymm13, %ymm8 + vfmadd213pd _clv_5(%rax), %ymm13, %ymm8 + vfmadd213pd _clv_6(%rax), %ymm13, %ymm8 + vfmadd213pd _clv_7(%rax), %ymm13, %ymm8 + vfmadd213pd %ymm4, %ymm13, %ymm8 + +/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */ + vaddpd %ymm8, %ymm6, %ymm9 + +/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */ + vandpd _iHighMask(%rax), %ymm9, %ymm2 + +/* + 2^(y*(HH+HL+HLL)) starts here: + yH = y; Lo(yH)&=0xf8000000; + */ + vandpd _iHighMask(%rax), %ymm11, %ymm1 + +/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */ + vsubpd %ymm6, %ymm9, %ymm13 + +/* HL = T_Rh_Eh_HLLhi-HH */ + vsubpd %ymm2, %ymm9, %ymm4 + +/* pH = yH*HH */ + vmulpd %ymm2, %ymm1, %ymm9 + +/* HLL = HLL - HLLhi */ + vsubpd %ymm13, %ymm8, %ymm6 + +/* yL = y-yH */ + vsubpd %ymm1, %ymm11, %ymm8 + vextracti128 $1, %ymm9, %xmm3 + vshufps $221, %xmm3, %xmm9, %xmm13 + vpand _ABSMASK(%rax), %xmm13, %xmm3 + vpcmpgtd %xmm5, %xmm7, %xmm13 + vpcmpgtd _INF(%rax), %xmm10, %xmm7 + vpcmpeqd _INF(%rax), %xmm10, %xmm10 + vpor %xmm10, %xmm7, %xmm7 + vpor %xmm7, %xmm13, %xmm5 + +/* pL=yL*HL+yH*HL; pL+=yL*HH */ + vmulpd %ymm4, %ymm8, %ymm7 + vpcmpgtd _DOMAINRANGE(%rax), %xmm3, %xmm13 + vpcmpeqd _DOMAINRANGE(%rax), %xmm3, %xmm10 + vpor %xmm10, %xmm13, %xmm3 + vpor %xmm3, %xmm5, %xmm13 + vfmadd213pd %ymm7, %ymm4, %ymm1 + +/* pLL = y*HLL; + pHH = pH + *(double*)&db2p45_2p44 + */ + vaddpd _db2p45_2p44(%rax), %ymm9, %ymm7 + vmovmskps %xmm13, %ecx + vfmadd213pd %ymm1, %ymm2, %ymm8 + +/* t=pL+pLL; t+=pHL */ + vfmadd231pd %ymm11, %ymm6, %ymm8 + vextracti128 $1, %ymm7, %xmm1 + vshufps $136, %xmm1, %xmm7, %xmm10 + +/* _n = Lo(pHH); + _n = _n & 0xffffff80; + _n = _n >> 7; + Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n + */ + vpslld $13, %xmm10, %xmm2 + vpaddd _iOne(%rax), %xmm2, %xmm13 + vpshufd $80, %xmm13, %xmm4 + vpshufd $250, %xmm13, %xmm1 + +/* j = Lo(pHH)&0x0000007f */ + vandps _jIndexMask(%rax), %xmm10, %xmm3 + +/* T1 = ((double*)exp2_tbl)[ 2*j ] */ + vpcmpeqd %ymm10, %ymm10, %ymm10 + vpslld $4, %xmm3, %xmm5 + +/* pHH = pHH - *(double*)&db2p45_2p44 */ + vsubpd _db2p45_2p44(%rax), %ymm7, %ymm7 + +/* pHL = pH - pHH */ + vsubpd %ymm7, %ymm9, %ymm9 + vaddpd %ymm9, %ymm8, %ymm6 + vinserti128 $1, %xmm1, %ymm4, %ymm2 + vxorpd %ymm1, %ymm1, %ymm1 + vgatherdpd %ymm10, 36416(%rax,%xmm5), %ymm1 + vandpd _ifff0000000000000(%rax), %ymm2, %ymm13 + vmovupd _cev_1(%rax), %ymm2 + vmulpd %ymm1, %ymm13, %ymm1 + vfmadd213pd _cev_2(%rax), %ymm6, %ymm2 + vmulpd %ymm6, %ymm1, %ymm8 + vfmadd213pd _cev_3(%rax), %ymm6, %ymm2 + vfmadd213pd _cev_4(%rax), %ymm6, %ymm2 + vfmadd213pd _cev_5(%rax), %ymm6, %ymm2 + vfmadd213pd %ymm1, %ymm8, %ymm2 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovups 224(%rsp), %ymm8 + vmovups 384(%rsp), %ymm9 + vmovups 352(%rsp), %ymm10 + vmovups 160(%rsp), %ymm11 + vmovups 288(%rsp), %ymm13 + vmovdqa 
%ymm2, %ymm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm0, 192(%rsp) + vmovupd %ymm11, 256(%rsp) + vmovupd %ymm2, 320(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm12, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 104(%rsp) + movq %rdi, 96(%rsp) + movq %r12, 136(%rsp) + cfi_offset_rel_rsp (12, 136) + movb %dl, %r12b + movq %r13, 128(%rsp) + cfi_offset_rel_rsp (13, 128) + movl %ecx, %r13d + movq %r14, 120(%rsp) + cfi_offset_rel_rsp (14, 120) + movl %eax, %r14d + movq %r15, 112(%rsp) + cfi_offset_rel_rsp (15, 112) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 64(%rsp), %ymm12 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovupd 320(%rsp), %ymm2 + movq 104(%rsp), %rsi + movq 96(%rsp), %rdi + movq 136(%rsp), %r12 + cfi_restore (%r12) + movq 128(%rsp), %r13 + cfi_restore (%r13) + movq 120(%rsp), %r14 + cfi_restore (%r14) + movq 112(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 200(%rsp,%r15), %xmm0 + vmovsd 264(%rsp,%r15), %xmm1 + vzeroupper + + call JUMPTARGET(__pow_finite) + + vmovsd %xmm0, 328(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 192(%rsp,%r15), %xmm0 + vmovsd 256(%rsp,%r15), %xmm1 + vzeroupper + + call JUMPTARGET(__pow_finite) + + vmovsd %xmm0, 320(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4vv_pow_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S new file mode 100644 index 0000000000..68f12b2848 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized pow. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8vv_pow) + .type _ZGVeN8vv_pow, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN8vv_pow_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN8vv_pow_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN8vv_pow) + +#define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper +#include "../svml_d_pow8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S new file mode 100644 index 0000000000..2190c1f6b4 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S @@ -0,0 +1,741 @@ +/* Function pow vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_pow_data.h" +#include "svml_d_wrapper_impl.h" + +/* ALGORITHM DESCRIPTION: + + 1) Calculating log2|x| + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. + Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. + + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. + Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where cq = X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + We get 3 parts of log2 result: HH+HL+HLL ~= log2|x|. + + 2) Calculation of y*(HH+HL+HLL). + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(PH+PL+PLL). + Mathematical idea of computing 2^(PH+PL+PLL) is the following. + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). + Hence 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + + We compute 2^(PH+PL+PLL) as follows. + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. 
+ Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). + + Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. */ + + .text +ENTRY (_ZGVeN8vv_pow_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + vpsrlq $32, %zmm0, %zmm13 + vmovaps %zmm1, %zmm12 + movq __svml_dpow_data@GOTPCREL(%rip), %rax + movl $255, %edx + vpmovqd %zmm13, %ymm10 + vpsrlq $32, %zmm12, %zmm14 + kmovw %edx, %k1 + movl $-1, %ecx + vpmovqd %zmm14, %ymm15 + +/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */ + vmovups _dbOne(%rax), %zmm6 + +/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */ + vmovaps %zmm10, %zmm5 + +/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */ + vpsubd _i3fe7fe00(%rax), %zmm10, %zmm14{%k1} + vpandd _iIndexMask(%rax), %zmm10, %zmm5{%k1} + vpsrad $20, %zmm14, %zmm14{%k1} + vpxord %zmm9, %zmm9, %zmm9 + vpaddd _HIDELTA(%rax), %zmm10, %zmm3{%k1} + vpaddd _iIndexAdd(%rax), %zmm5, %zmm5{%k1} + vpxord %zmm7, %zmm7, %zmm7 + vpaddd _i2p20_2p19(%rax), %zmm14, %zmm14{%k1} + vpcmpd $1, _LORANGE(%rax), %zmm3, %k2{%k1} + vpsrld $10, %zmm5, %zmm5{%k1} + vpandd _ABSMASK(%rax), %zmm15, %zmm2{%k1} + vpbroadcastd %ecx, %zmm1{%k2}{z} + +/* Index for reciprocal table */ + vpslld $3, %zmm5, %zmm8{%k1} + kxnorw %k2, %k2, %k2 + vgatherdpd 11712(%rax,%ymm8), %zmm9{%k2} + vpmovzxdq %ymm14, %zmm10 + +/* Index for log2 table */ + vpslld $4, %zmm5, %zmm13{%k1} + kxnorw %k2, %k2, %k2 + vpsllq $32, %zmm10, %zmm3 + vpxord %zmm8, %zmm8, %zmm8 + vpcmpd $5, _INF(%rax), %zmm2, %k3{%k1} + vpbroadcastd %ecx, %zmm4{%k3}{z} + vpternlogq $248, _iMantissaMask(%rax), %zmm0, %zmm6 + kxnorw %k3, %k3, %k3 + vpternlogq $168, _iffffffff00000000(%rax), %zmm10, %zmm3 + +/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */ + vpandq _iHighMask(%rax), %zmm6, %zmm2 + vgatherdpd 19976(%rax,%ymm13), %zmm8{%k2} + vpord %zmm4, %zmm1, %zmm11{%k1} + vsubpd _db2p20_2p19(%rax), %zmm3, %zmm1 + vsubpd %zmm2, %zmm6, %zmm5 + +/* r1 = x1*rcp1 */ + vmulpd %zmm9, %zmm6, %zmm6 + vgatherdpd 19968(%rax,%ymm13), %zmm7{%k3} + +/* cq = c+r1 */ + vaddpd _LHN(%rax), %zmm6, %zmm4 + +/* E = -r1+__fence(x1Hi*rcp1) */ + vfmsub213pd %zmm6, %zmm9, %zmm2 + +/* T = k + L1hi */ + vaddpd %zmm7, %zmm1, %zmm7 + +/* E=E+x1Lo*rcp1 */ + vfmadd213pd %zmm2, %zmm9, %zmm5 + +/* T_Rh = T + cq */ + vaddpd %zmm4, %zmm7, %zmm3 + +/* Rl = T-T_Rh; -> -Rh */ + vsubpd %zmm3, %zmm7, %zmm9 + +/* Rl=Rl+cq */ + vaddpd %zmm9, %zmm4, %zmm6 + +/* T_Rh_Eh = T_Rh + E */ + vaddpd %zmm5, %zmm3, %zmm9 + +/* HLL = T_Rh - T_Rh_Eh; -> -Eh */ + vsubpd %zmm9, %zmm3, %zmm2 + +/* cq = cq + E; */ + vaddpd %zmm5, %zmm4, %zmm4 + +/* HLL+=E; -> El */ + vaddpd %zmm2, %zmm5, %zmm1 + vmovups _clv_2(%rax), %zmm5 + +/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */ + vfmadd213pd _clv_3(%rax), %zmm4, %zmm5 + +/* HLL+=Rl */ + vaddpd %zmm6, %zmm1, %zmm7 + +/* 2^(y*(HH+HL+HLL)) starts here: + yH = y; Lo(yH)&=0xf8000000 + */ + vpandq _iHighMask(%rax), %zmm12, %zmm6 + +/* yL = y-yH */ + vsubpd %zmm6, %zmm12, %zmm2 + vfmadd213pd _clv_4(%rax), %zmm4, %zmm5 + +/* HLL+=L1lo */ + vaddpd %zmm8, %zmm7, %zmm8 + vfmadd213pd _clv_5(%rax), %zmm4, %zmm5 + vfmadd213pd _clv_6(%rax), %zmm4, %zmm5 + vfmadd213pd _clv_7(%rax), %zmm4, %zmm5 + vfmadd213pd %zmm8, %zmm4, 
%zmm5 + +/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */ + vaddpd %zmm5, %zmm9, %zmm13 + +/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */ + vsubpd %zmm9, %zmm13, %zmm10 + +/* HLL = HLL - HLLhi */ + vsubpd %zmm10, %zmm5, %zmm3 + +/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */ + vpandq _iHighMask(%rax), %zmm13, %zmm5 + +/* pH = yH*HH */ + vmulpd %zmm5, %zmm6, %zmm1 + +/* HL = T_Rh_Eh_HLLhi-HH */ + vsubpd %zmm5, %zmm13, %zmm4 + vpsrlq $32, %zmm1, %zmm14 + +/* pLL = y*HLL; + pHH = pH + *(double*)&db2p45_2p44 + */ + vaddpd _db2p45_2p44(%rax), %zmm1, %zmm10 + vpmovqd %zmm14, %ymm15 + vpandd _ABSMASK(%rax), %zmm15, %zmm14{%k1} + vpcmpd $5, _DOMAINRANGE(%rax), %zmm14, %k3{%k1} + +/* T1 = ((double*)exp2_tbl)[ 2*j ] */ + vpxord %zmm14, %zmm14, %zmm14 + vpbroadcastd %ecx, %zmm13{%k3}{z} + vpord %zmm13, %zmm11, %zmm11{%k1} + vptestmd %zmm11, %zmm11, %k0{%k1} + +/* pL=yL*HL+yH*HL; pL+=yL*HH */ + vmulpd %zmm4, %zmm2, %zmm11 + kmovw %k0, %ecx + vfmadd213pd %zmm11, %zmm4, %zmm6 + +/* pHH = pHH - *(double*)&db2p45_2p44 */ + vsubpd _db2p45_2p44(%rax), %zmm10, %zmm11 + vpmovqd %zmm10, %ymm4 + movzbl %cl, %ecx + +/* _n = Lo(pHH); + _n = _n & 0xffffff80; + _n = _n >> 7; + Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n + */ + vpslld $13, %zmm4, %zmm7{%k1} + +/* j = Lo(pHH)&0x0000007f */ + vpandd _jIndexMask(%rax), %zmm4, %zmm9{%k1} + vfmadd213pd %zmm6, %zmm5, %zmm2 + +/* pHL = pH - pHH */ + vsubpd %zmm11, %zmm1, %zmm1 + vpaddd _iOne(%rax), %zmm7, %zmm7{%k1} + +/* t=pL+pLL; t+=pHL */ + vfmadd231pd %zmm12, %zmm3, %zmm2 + vpslld $4, %zmm9, %zmm9{%k1} + kxnorw %k1, %k1, %k1 + vgatherdpd 36416(%rax,%ymm9), %zmm14{%k1} + vpmovzxdq %ymm7, %zmm8 + vaddpd %zmm1, %zmm2, %zmm2 + vmovups _cev_1(%rax), %zmm1 + vpsllq $32, %zmm8, %zmm13 + vpternlogq $168, _ifff0000000000000(%rax), %zmm8, %zmm13 + vfmadd213pd _cev_2(%rax), %zmm2, %zmm1 + vmulpd %zmm14, %zmm13, %zmm15 + vfmadd213pd _cev_3(%rax), %zmm2, %zmm1 + vmulpd %zmm2, %zmm15, %zmm3 + vfmadd213pd _cev_4(%rax), %zmm2, %zmm1 + vfmadd213pd _cev_5(%rax), %zmm2, %zmm1 + vfmadd213pd %zmm15, %zmm3, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm12, 1216(%rsp) + vmovups %zmm1, 1280(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), 
%k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1280(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vmovsd 1224(%rsp,%r15), %xmm1 + call JUMPTARGET(__pow_finite) + vmovsd %xmm0, 1288(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vmovsd 1216(%rsp,%r15), %xmm1 + call JUMPTARGET(__pow_finite) + vmovsd %xmm0, 1280(%rsp,%r15) + jmp .LBL_1_7 + +#endif +END (_ZGVeN8vv_pow_knl) + +ENTRY (_ZGVeN8vv_pow_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + vpsrlq $32, %zmm0, %zmm10 + kxnorw %k1, %k1, %k1 + kxnorw %k2, %k2, %k2 + kxnorw %k3, %k3, %k3 + vpmovqd %zmm10, %ymm7 + movq __svml_dpow_data@GOTPCREL(%rip), %rax + vmovaps %zmm1, %zmm6 + vpsrlq $32, %zmm6, %zmm13 + +/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */ + vpand _iIndexMask(%rax), %ymm7, %ymm15 + vpaddd _HIDELTA(%rax), %ymm7, %ymm2 + +/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */ + vpsubd _i3fe7fe00(%rax), %ymm7, %ymm7 + vmovdqu _ABSMASK(%rax), %ymm4 + vmovdqu _LORANGE(%rax), %ymm3 + +/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */ + vmovups _dbOne(%rax), %zmm11 + vmovdqu _INF(%rax), %ymm5 + vpaddd _iIndexAdd(%rax), %ymm15, %ymm12 + vpmovqd %zmm13, %ymm14 + vpternlogq $248, _iMantissaMask(%rax), %zmm0, %zmm11 + vpsrld $10, %ymm12, %ymm10 + vpsrad $20, %ymm7, %ymm13 + +/* Index for reciprocal table */ + vpslld $3, %ymm10, %ymm8 + +/* Index for log2 table */ + vpslld $4, %ymm10, %ymm1 + vpcmpgtd %ymm2, %ymm3, %ymm3 + vpand %ymm4, %ymm14, %ymm2 + vpaddd _i2p20_2p19(%rax), %ymm13, %ymm14 + vpmovzxdq %ymm14, %zmm15 + vpsllq $32, %zmm15, %zmm7 + vpternlogq $168, _iffffffff00000000(%rax), %zmm15, %zmm7 + vsubpd _db2p20_2p19(%rax), %zmm7, %zmm13 + vpxord %zmm9, %zmm9, %zmm9 + vgatherdpd 11712(%rax,%ymm8), %zmm9{%k1} + +/* T1 = ((double*)exp2_tbl)[ 2*j ] */ + kxnorw %k1, %k1, %k1 + vpxord %zmm12, %zmm12, %zmm12 + vpxord %zmm8, %zmm8, %zmm8 + vgatherdpd 19968(%rax,%ymm1), %zmm12{%k2} + vgatherdpd 19976(%rax,%ymm1), %zmm8{%k3} + vmovups _iHighMask(%rax), %zmm1 + +/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */ + vandpd %zmm1, %zmm11, %zmm10 + vsubpd %zmm10, %zmm11, %zmm15 + +/* r1 = x1*rcp1 */ + vmulpd %zmm9, %zmm11, %zmm11 + +/* E = -r1+__fence(x1Hi*rcp1) */ + vfmsub213pd %zmm11, %zmm9, %zmm10 + +/* cq = c+r1 */ + vaddpd _LHN(%rax), %zmm11, %zmm14 + +/* E=E+x1Lo*rcp1 */ + vfmadd213pd %zmm10, %zmm9, %zmm15 + +/* T = k + L1hi */ + vaddpd %zmm12, %zmm13, %zmm9 + +/* T_Rh = T + cq */ + vaddpd %zmm14, %zmm9, %zmm11 + +/* T_Rh_Eh = T_Rh + E */ + vaddpd %zmm15, %zmm11, %zmm13 + +/* Rl = T-T_Rh; -> -Rh */ + vsubpd %zmm11, %zmm9, 
%zmm12 + +/* HLL = T_Rh - T_Rh_Eh; -> -Eh */ + vsubpd %zmm13, %zmm11, %zmm9 + +/* Rl=Rl+cq */ + vaddpd %zmm12, %zmm14, %zmm10 + +/* HLL+=E; -> El */ + vaddpd %zmm9, %zmm15, %zmm7 + +/* HLL+=Rl */ + vaddpd %zmm10, %zmm7, %zmm12 + +/* 2^(y*(HH+HL+HLL)) starts here: + yH = y; Lo(yH)&=0xf8000000 + */ + vandpd %zmm1, %zmm6, %zmm7 + +/* HLL+=L1lo */ + vaddpd %zmm8, %zmm12, %zmm12 + +/* cq = cq + E */ + vaddpd %zmm15, %zmm14, %zmm8 + vmovups _clv_2(%rax), %zmm14 + +/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */ + vfmadd213pd _clv_3(%rax), %zmm8, %zmm14 + vfmadd213pd _clv_4(%rax), %zmm8, %zmm14 + vfmadd213pd _clv_5(%rax), %zmm8, %zmm14 + vfmadd213pd _clv_6(%rax), %zmm8, %zmm14 + vfmadd213pd _clv_7(%rax), %zmm8, %zmm14 + vfmadd213pd %zmm12, %zmm8, %zmm14 + +/* yL = y-yH */ + vsubpd %zmm7, %zmm6, %zmm8 + +/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */ + vaddpd %zmm14, %zmm13, %zmm15 + +/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */ + vandpd %zmm1, %zmm15, %zmm11 + +/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */ + vsubpd %zmm13, %zmm15, %zmm13 + +/* pH = yH*HH */ + vmulpd %zmm11, %zmm7, %zmm9 + +/* HLL = HLL - HLLhi */ + vsubpd %zmm13, %zmm14, %zmm12 + +/* HL = T_Rh_Eh_HLLhi-HH */ + vsubpd %zmm11, %zmm15, %zmm10 + vpsrlq $32, %zmm9, %zmm1 + vmovdqu _DOMAINRANGE(%rax), %ymm13 + vpmovqd %zmm1, %ymm1 + vpand %ymm4, %ymm1, %ymm1 + vpcmpgtd %ymm5, %ymm2, %ymm4 + vpcmpeqd %ymm5, %ymm2, %ymm5 + vpternlogd $254, %ymm5, %ymm4, %ymm3 + vpcmpgtd %ymm13, %ymm1, %ymm2 + vpcmpeqd %ymm13, %ymm1, %ymm4 + vpternlogd $254, %ymm4, %ymm2, %ymm3 + +/* pLL = y*HLL */ + vmovups _db2p45_2p44(%rax), %zmm2 + +/* pHH = pH + *(double*)&db2p45_2p44 */ + vaddpd %zmm2, %zmm9, %zmm1 + vpmovqd %zmm1, %ymm5 + +/* j = Lo(pHH)&0x0000007f */ + vpand _jIndexMask(%rax), %ymm5, %ymm14 + vpslld $4, %ymm14, %ymm15 + vmovmskps %ymm3, %ecx + +/* pL=yL*HL+yH*HL; pL+=yL*HH */ + vmulpd %zmm10, %zmm8, %zmm3 + vfmadd213pd %zmm3, %zmm10, %zmm7 + vfmadd213pd %zmm7, %zmm11, %zmm8 + +/* _n = Lo(pHH) + _n = _n & 0xffffff80 + _n = _n >> 7 + Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n + */ + vpslld $13, %ymm5, %ymm7 + +/* t=pL+pLL; t+=pHL */ + vfmadd231pd %zmm6, %zmm12, %zmm8 + vpaddd _iOne(%rax), %ymm7, %ymm10 + vpmovzxdq %ymm10, %zmm11 + vpsllq $32, %zmm11, %zmm3 + vpternlogq $168, _ifff0000000000000(%rax), %zmm11, %zmm3 + +/* pHH = pHH - *(double*)&db2p45_2p44 */ + vsubpd %zmm2, %zmm1, %zmm11 + vmovups _cev_1(%rax), %zmm2 + +/* pHL = pH - pHH */ + vsubpd %zmm11, %zmm9, %zmm9 + vaddpd %zmm9, %zmm8, %zmm8 + vfmadd213pd _cev_2(%rax), %zmm8, %zmm2 + vfmadd213pd _cev_3(%rax), %zmm8, %zmm2 + vfmadd213pd _cev_4(%rax), %zmm8, %zmm2 + vfmadd213pd _cev_5(%rax), %zmm8, %zmm2 + vpxord %zmm4, %zmm4, %zmm4 + vgatherdpd 36416(%rax,%ymm15), %zmm4{%k1} + vmulpd %zmm4, %zmm3, %zmm1 + vmulpd %zmm8, %zmm1, %zmm12 + vfmadd213pd %zmm1, %zmm12, %zmm2 + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm2, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm6, 1216(%rsp) + vmovups %zmm2, 1280(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups 
%zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1280(%rsp), %zmm2 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1224(%rsp,%r15), %xmm1 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(__pow_finite) + + vmovsd %xmm0, 1288(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1216(%rsp,%r15), %xmm1 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(__pow_finite) + + vmovsd %xmm0, 1280(%rsp,%r15) + jmp .LBL_2_7 + +#endif +END (_ZGVeN8vv_pow_skx) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S new file mode 100644 index 0000000000..e35654be8d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sin. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2v_sin) + .type _ZGVbN2v_sin, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN2v_sin_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN2v_sin_sse2(%rip), %rax + ret +END (_ZGVbN2v_sin) +libmvec_hidden_def (_ZGVbN2v_sin) + +#define _ZGVbN2v_sin _ZGVbN2v_sin_sse2 +#include "../svml_d_sin2_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S new file mode 100644 index 0000000000..393ba03b76 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S @@ -0,0 +1,229 @@ +/* Function sin vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_trig_data.h" + + .text +ENTRY (_ZGVbN2v_sin_sse4) +/* ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm5 + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + movups __dAbsMask(%rax), %xmm3 +/* + ARGUMENT RANGE REDUCTION: + X' = |X| + */ + movaps %xmm3, %xmm4 + +/* SignX - sign bit of X */ + andnps %xmm5, %xmm3 + movups __dInvPI(%rax), %xmm2 + andps %xmm5, %xmm4 + +/* Y = X'*InvPi + RS : right shifter add */ + mulpd %xmm4, %xmm2 + movups __dRShifter(%rax), %xmm6 + +/* R = X' - N*Pi1 */ + movaps %xmm4, %xmm0 + addpd %xmm6, %xmm2 + cmpnlepd __dRangeVal(%rax), %xmm4 + +/* N = Y - RS : right shifter sub */ + movaps %xmm2, %xmm1 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + psllq $63, %xmm2 + subpd %xmm6, %xmm1 + movmskpd %xmm4, %ecx + movups __dPI1(%rax), %xmm7 + mulpd %xmm1, %xmm7 + movups __dPI2(%rax), %xmm6 + +/* R = R - N*Pi2 */ + mulpd %xmm1, %xmm6 + subpd %xmm7, %xmm0 + movups __dPI3(%rax), %xmm7 + +/* R = R - N*Pi3 */ + mulpd %xmm1, %xmm7 + subpd %xmm6, %xmm0 + movups __dPI4(%rax), %xmm6 + +/* R = R - N*Pi4 */ + mulpd %xmm6, %xmm1 + subpd %xmm7, %xmm0 + subpd %xmm1, %xmm0 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + movaps %xmm0, %xmm1 + mulpd %xmm0, %xmm1 + +/* R = R^SignRes : update sign of reduced argument */ + xorps %xmm2, %xmm0 + movups __dC7_sin(%rax), %xmm2 + mulpd %xmm1, %xmm2 + addpd __dC6_sin(%rax), %xmm2 + mulpd %xmm1, %xmm2 + addpd __dC5_sin(%rax), %xmm2 + mulpd %xmm1, %xmm2 + addpd __dC4_sin(%rax), %xmm2 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + mulpd %xmm1, %xmm2 + addpd __dC3_sin(%rax), %xmm2 + +/* 
Poly = R2*(C1+R2*(C2+R2*Poly)) */ + mulpd %xmm1, %xmm2 + addpd __dC2_sin(%rax), %xmm2 + mulpd %xmm1, %xmm2 + addpd __dC1_sin(%rax), %xmm2 + mulpd %xmm2, %xmm1 + +/* Poly = Poly*R + R */ + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm0 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignX + */ + xorps %xmm3, %xmm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm5, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %dl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %ecx, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %eax, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 200(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + movsd %xmm0, 264(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 192(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + movsd %xmm0, 256(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVbN2v_sin_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S new file mode 100644 index 0000000000..f4482d3a11 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sin, vector length is 4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4v_sin) + .type _ZGVdN4v_sin, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN4v_sin_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN4v_sin_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4v_sin) +libmvec_hidden_def (_ZGVdN4v_sin) + +#define _ZGVdN4v_sin _ZGVdN4v_sin_sse_wrapper +#include "../svml_d_sin4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S new file mode 100644 index 0000000000..b035fa1b15 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S @@ -0,0 +1,210 @@ +/* Function sin vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_trig_data.h" + + .text +ENTRY (_ZGVdN4v_sin_avx2) +/* ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + vmovdqa %ymm0, %ymm4 + vmovupd __dAbsMask(%rax), %ymm2 + vmovupd __dInvPI(%rax), %ymm6 + vmovupd __dRShifter(%rax), %ymm5 + vmovupd __dPI1_FMA(%rax), %ymm7 +/* + ARGUMENT RANGE REDUCTION: + X' = |X| + */ + vandpd %ymm2, %ymm4, %ymm3 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd %ymm5, %ymm3, %ymm6 + +/* N = Y - RS : right shifter sub */ + vsubpd %ymm5, %ymm6, %ymm1 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %ymm6, %ymm5 + +/* R = X' - N*Pi1 */ + vmovapd %ymm3, %ymm0 + vfnmadd231pd %ymm1, %ymm7, %ymm0 + vcmpnle_uqpd __dRangeVal(%rax), %ymm3, %ymm3 + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %ymm1, %ymm0 + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %ymm0, %ymm1 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + vmulpd %ymm1, %ymm1, %ymm0 + +/* R = R^SignRes : update sign of reduced argument */ + vxorpd %ymm5, %ymm1, %ymm6 + vmovupd __dC7_sin(%rax), %ymm1 + vfmadd213pd __dC6_sin(%rax), %ymm0, %ymm1 + vfmadd213pd __dC5_sin(%rax), %ymm0, %ymm1 + vfmadd213pd __dC4_sin(%rax), %ymm0, %ymm1 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3_sin(%rax), %ymm0, %ymm1 + +/* Poly = R2*(C1+R2*(C2+R2*Poly)) */ + vfmadd213pd __dC2_sin(%rax), %ymm0, %ymm1 + vfmadd213pd __dC1_sin(%rax), %ymm0, %ymm1 + +/* SignX - sign bit of X */ + vandnpd %ymm4, %ymm2, %ymm7 + vmulpd %ymm0, %ymm1, %ymm2 + +/* Poly = Poly*R + R */ + 
vfmadd213pd %ymm6, %ymm6, %ymm2 + vmovmskpd %ymm3, %ecx + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignX + */ + vxorpd %ymm7, %ymm2, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm4, 320(%rsp) + vmovupd %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovupd 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 328(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(sin) + + vmovsd %xmm0, 392(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 320(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(sin) + + vmovsd %xmm0, 384(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4v_sin_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S new file mode 100644 index 0000000000..2b15889c71 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized sin. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8v_sin) + .type _ZGVeN8v_sin, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN8v_sin_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN8v_sin_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN8v_sin_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN8v_sin) + +#define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper +#include "../svml_d_sin8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S new file mode 100644 index 0000000000..7580e60636 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S @@ -0,0 +1,465 @@ +/* Function sin vectorized with AVX-512, KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_trig_data.h" +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_sin_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_sin +#else +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + movq $-1, %rdx + vmovups __dAbsMask(%rax), %zmm6 + vmovups __dInvPI(%rax), %zmm1 + +/* + ARGUMENT RANGE REDUCTION: + X' = |X| + */ + vpandq %zmm6, %zmm0, %zmm12 + vmovups __dPI1_FMA(%rax), %zmm2 + vmovups __dC7_sin(%rax), %zmm7 + +/* SignX - sign bit of X */ + vpandnq %zmm0, %zmm6, %zmm11 + +/* R = X' - N*Pi1 */ + vmovaps %zmm12, %zmm3 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd __dRShifter(%rax), %zmm12, %zmm1 + vcmppd $22, __dRangeVal(%rax), %zmm12, %k1 + vpbroadcastq %rdx, %zmm13{%k1}{z} + +/* N = Y - RS : right shifter sub */ + vsubpd __dRShifter(%rax), %zmm1, %zmm4 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm1, %zmm5 + vptestmq %zmm13, %zmm13, %k0 + vfnmadd231pd %zmm4, %zmm2, %zmm3 + kmovw %k0, %ecx + movzbl %cl, %ecx + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm4, %zmm3 + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %zmm3, %zmm4 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + vmulpd %zmm4, %zmm4, %zmm8 + +/* R = R^SignRes : update sign of reduced argument */ + vpxorq %zmm5, %zmm4, %zmm9 + vfmadd213pd __dC6_sin(%rax), %zmm8, %zmm7 + vfmadd213pd __dC5_sin(%rax), %zmm8, %zmm7 + vfmadd213pd __dC4_sin(%rax), %zmm8, %zmm7 + +/* 
Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3_sin(%rax), %zmm8, %zmm7 + +/* Poly = R2*(C1+R2*(C2+R2*Poly)) */ + vfmadd213pd __dC2_sin(%rax), %zmm8, %zmm7 + vfmadd213pd __dC1_sin(%rax), %zmm8, %zmm7 + vmulpd %zmm8, %zmm7, %zmm10 + +/* Poly = Poly*R + R */ + vfmadd213pd %zmm9, %zmm9, %zmm10 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignX + */ + vpxorq %zmm11, %zmm10, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + call JUMPTARGET(sin) + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + call JUMPTARGET(sin) + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_1_7 +#endif +END (_ZGVeN8v_sin_knl) + +ENTRY (_ZGVeN8v_sin_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN4v_sin +#else +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq 
$1280, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14 + vmovups __dAbsMask(%rax), %zmm7 + vmovups __dInvPI(%rax), %zmm2 + vmovups __dRShifter(%rax), %zmm1 + vmovups __dPI1_FMA(%rax), %zmm3 + vmovups __dC7_sin(%rax), %zmm8 + +/* + ARGUMENT RANGE REDUCTION: + X' = |X| + */ + vandpd %zmm7, %zmm0, %zmm13 + +/* SignX - sign bit of X */ + vandnpd %zmm0, %zmm7, %zmm12 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd %zmm1, %zmm13, %zmm2 + vcmppd $18, __dRangeVal(%rax), %zmm13, %k1 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm2, %zmm6 + +/* N = Y - RS : right shifter sub */ + vsubpd %zmm1, %zmm2, %zmm5 + +/* R = X' - N*Pi1 */ + vmovaps %zmm13, %zmm4 + vfnmadd231pd %zmm5, %zmm3, %zmm4 + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm5, %zmm4 + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %zmm4, %zmm5 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + vmulpd %zmm5, %zmm5, %zmm9 + +/* R = R^SignRes : update sign of reduced argument */ + vxorpd %zmm6, %zmm5, %zmm10 + vfmadd213pd __dC6_sin(%rax), %zmm9, %zmm8 + vfmadd213pd __dC5_sin(%rax), %zmm9, %zmm8 + vfmadd213pd __dC4_sin(%rax), %zmm9, %zmm8 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3_sin(%rax), %zmm9, %zmm8 + +/* Poly = R2*(C1+R2*(C2+R2*Poly)) */ + vfmadd213pd __dC2_sin(%rax), %zmm9, %zmm8 + vfmadd213pd __dC1_sin(%rax), %zmm9, %zmm8 + vmulpd %zmm9, %zmm8, %zmm11 + +/* Poly = Poly*R + R */ + vfmadd213pd %zmm10, %zmm10, %zmm11 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignX + */ + vxorpd %zmm12, %zmm11, %zmm1 + vpandnq %zmm13, %zmm13, %zmm14{%k1} + vcmppd $3, %zmm14, %zmm14, %k0 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), 
%zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_2_7 +#endif +END (_ZGVeN8v_sin_skx) + + .section .rodata, "a" +.L_2il0floatpacket.14: + .long 0xffffffff,0xffffffff + .type .L_2il0floatpacket.14,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S new file mode 100644 index 0000000000..13279e3fb7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sincos. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2vvv_sincos) + .type _ZGVbN2vvv_sincos, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN2vvv_sincos_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN2vvv_sincos_sse2(%rip), %rax + ret +END (_ZGVbN2vvv_sincos) +libmvec_hidden_def (_ZGVbN2vvv_sincos) + +#define _ZGVbN2vvv_sincos _ZGVbN2vvv_sincos_sse2 +#include "../svml_d_sincos2_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S new file mode 100644 index 0000000000..c46109f35d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S @@ -0,0 +1,368 @@ +/* Function sincos vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_trig_data.h" + + .text +ENTRY (_ZGVbN2vl8l8_sincos_sse4) +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + arg + Pi/2 = (N'*Pi + R') + cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R') + sin(R), sin(R') are approximated by corresponding polynomial. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + movups %xmm11, 160(%rsp) + movups %xmm12, 144(%rsp) + movups __dSignMask(%rax), %xmm11 + +/* ARGUMENT RANGE REDUCTION: + Absolute argument: X' = |X| */ + movaps %xmm11, %xmm4 + +/* Grab sign bit from argument */ + movaps %xmm11, %xmm7 + movups __dInvPI(%rax), %xmm5 + andnps %xmm0, %xmm4 + +/* SinY = X'*InvPi + RS : right shifter add */ + mulpd %xmm4, %xmm5 + addpd __dRShifter(%rax), %xmm5 + +/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ + movaps %xmm5, %xmm12 + andps %xmm0, %xmm7 + +/* SinN = Y - RS : right shifter sub */ + subpd __dRShifter(%rax), %xmm5 + movups %xmm10, 176(%rsp) + psllq $63, %xmm12 + movups __dPI1(%rax), %xmm10 + +/* SinR = X' - SinN*Pi1 */ + movaps %xmm10, %xmm1 + mulpd %xmm5, %xmm1 + movups __dPI2(%rax), %xmm6 + +/* SinR = SinR - SinN*Pi1 */ + movaps %xmm6, %xmm2 + mulpd %xmm5, %xmm2 + movups %xmm13, 112(%rsp) + movaps %xmm4, %xmm13 + subpd %xmm1, %xmm13 + subpd %xmm2, %xmm13 + +/* Sine result sign: SinRSign = SignMask & SinR */ + movaps %xmm11, %xmm2 + +/* CosR = SinX - CosN*Pi1 */ + movaps %xmm4, %xmm1 + movups __dOneHalf(%rax), %xmm3 + andps %xmm13, %xmm2 + +/* Set SinRSign to 0.5 */ + orps %xmm2, %xmm3 + +/* Update CosRSign and CosSignRes signs */ + xorps %xmm11, %xmm2 + +/* CosN = SinN +(-)0.5 */ + addpd %xmm5, %xmm3 + cmpnlepd __dRangeVal(%rax), %xmm4 + mulpd %xmm3, %xmm10 + +/* CosR = CosR - CosN*Pi2 */ + mulpd %xmm3, %xmm6 + subpd %xmm10, %xmm1 + movmskpd %xmm4, %ecx + movups __dPI3(%rax), %xmm10 + xorps %xmm12, %xmm2 + subpd %xmm6, %xmm1 + +/* SinR = SinR - SinN*Pi3 */ + movaps %xmm10, %xmm6 + +/* Final reconstruction. 
+ Combine Sin result's sign */ + xorps %xmm7, %xmm12 + mulpd %xmm5, %xmm6 + +/* CosR = CosR - CosN*Pi3 */ + mulpd %xmm3, %xmm10 + subpd %xmm6, %xmm13 + subpd %xmm10, %xmm1 + movups __dPI4(%rax), %xmm6 + +/* SinR = SinR - SinN*Pi4 */ + mulpd %xmm6, %xmm5 + +/* CosR = CosR - CosN*Pi4 */ + mulpd %xmm6, %xmm3 + subpd %xmm5, %xmm13 + subpd %xmm3, %xmm1 + +/* SinR2 = SinR^2 */ + movaps %xmm13, %xmm6 + +/* CosR2 = CosR^2 */ + movaps %xmm1, %xmm10 + mulpd %xmm13, %xmm6 + mulpd %xmm1, %xmm10 + +/* Polynomial approximation */ + movups __dC7(%rax), %xmm5 + movaps %xmm5, %xmm3 + mulpd %xmm6, %xmm3 + mulpd %xmm10, %xmm5 + addpd __dC6(%rax), %xmm3 + addpd __dC6(%rax), %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm10, %xmm5 + addpd __dC5(%rax), %xmm3 + addpd __dC5(%rax), %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm10, %xmm5 + addpd __dC4(%rax), %xmm3 + addpd __dC4(%rax), %xmm5 + +/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ + mulpd %xmm6, %xmm3 + +/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ + mulpd %xmm10, %xmm5 + addpd __dC3(%rax), %xmm3 + addpd __dC3(%rax), %xmm5 + +/* SinPoly = C2 + SinR2*SinPoly */ + mulpd %xmm6, %xmm3 + +/* CosPoly = C2 + CosR2*CosPoly */ + mulpd %xmm10, %xmm5 + addpd __dC2(%rax), %xmm3 + addpd __dC2(%rax), %xmm5 + +/* SinPoly = C1 + SinR2*SinPoly */ + mulpd %xmm6, %xmm3 + +/* CosPoly = C1 + CosR2*CosPoly */ + mulpd %xmm10, %xmm5 + addpd __dC1(%rax), %xmm3 + addpd __dC1(%rax), %xmm5 + +/* SinPoly = SinR2*SinPoly */ + mulpd %xmm3, %xmm6 + +/* CosPoly = CosR2*CosPoly */ + mulpd %xmm5, %xmm10 + +/* SinPoly = SinR*SinPoly */ + mulpd %xmm13, %xmm6 + +/* CosPoly = CosR*CosPoly */ + mulpd %xmm1, %xmm10 + addpd %xmm6, %xmm13 + addpd %xmm10, %xmm1 + +/* Update Sin result's sign */ + xorps %xmm12, %xmm13 + +/* Update Cos result's sign */ + xorps %xmm2, %xmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movups 176(%rsp), %xmm10 + movaps %xmm13, (%rdi) + movups 160(%rsp), %xmm11 + movups 144(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups %xmm1, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm0, 128(%rsp) + movups %xmm13, 192(%rsp) + movups %xmm1, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 48(%rsp) + movups %xmm9, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 64(%rsp) + movq %r12, 104(%rsp) + cfi_offset_rel_rsp (12, 104) + movb %dl, %r12b + movq %r13, 96(%rsp) + cfi_offset_rel_rsp (13, 96) + movl %eax, %r13d + movq %r14, 88(%rsp) + cfi_offset_rel_rsp (14, 88) + movl %ecx, %r14d + movq %r15, 80(%rsp) + cfi_offset_rel_rsp (15, 80) + movq %rbx, 72(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 48(%rsp), %xmm8 + movq %rbx, %rdi + movups 32(%rsp), %xmm9 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 64(%rsp), %rsi + movq 104(%rsp), %r12 + cfi_restore (%r12) + movq 96(%rsp), %r13 + cfi_restore (%r13) + movq 88(%rsp), %r14 + cfi_restore (%r14) + movq 80(%rsp), %r15 + cfi_restore (%r15) + movq 72(%rsp), %rbx + movups 192(%rsp), %xmm13 + movups 256(%rsp), %xmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 136(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + movsd %xmm0, 200(%rsp,%r15) + movsd 136(%rsp,%r15), %xmm0 
+ + call JUMPTARGET(cos) + + movsd %xmm0, 264(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_13: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 128(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + movsd %xmm0, 192(%rsp,%r15) + movsd 128(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + movsd %xmm0, 256(%rsp,%r15) + jmp .LBL_1_7 +END (_ZGVbN2vl8l8_sincos_sse4) +libmvec_hidden_def(_ZGVbN2vl8l8_sincos_sse4) + +/* vvv version implemented with wrapper to vl8l8 variant. */ +ENTRY (_ZGVbN2vvv_sincos_sse4) +#ifndef __ILP32__ + subq $72, %rsp + .cfi_def_cfa_offset 80 + movdqu %xmm1, 32(%rsp) + lea (%rsp), %rdi + movdqu %xmm2, 48(%rdi) + lea 16(%rsp), %rsi + call HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4) + movq 32(%rsp), %rdx + movq 48(%rsp), %rsi + movq 40(%rsp), %r8 + movq 56(%rsp), %r10 + movq (%rsp), %rax + movq 16(%rsp), %rcx + movq 8(%rsp), %rdi + movq 24(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq %rdi, (%r8) + movq %r9, (%r10) + addq $72, %rsp + .cfi_def_cfa_offset 8 + ret +#else + subl $72, %esp + .cfi_def_cfa_offset 80 + leal 48(%rsp), %esi + movaps %xmm1, 16(%esp) + leal 32(%rsp), %edi + movaps %xmm2, (%esp) + call HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4) + movdqa 16(%esp), %xmm1 + movsd 32(%esp), %xmm0 + movq %xmm1, %rax + movdqa (%esp), %xmm2 + movsd %xmm0, (%eax) + movsd 40(%esp), %xmm0 + pextrd $1, %xmm1, %eax + movsd %xmm0, (%eax) + movsd 48(%esp), %xmm0 + movq %xmm2, %rax + movsd %xmm0, (%eax) + movsd 56(%esp), %xmm0 + pextrd $1, %xmm2, %eax + movsd %xmm0, (%eax) + addl $72, %esp + .cfi_def_cfa_offset 8 + ret +#endif +END (_ZGVbN2vvv_sincos_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S new file mode 100644 index 0000000000..8aacb8e76a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sincos. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4vvv_sincos) + .type _ZGVdN4vvv_sincos, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN4vvv_sincos_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN4vvv_sincos_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4vvv_sincos) +libmvec_hidden_def (_ZGVdN4vvv_sincos) + +#define _ZGVdN4vvv_sincos _ZGVdN4vvv_sincos_sse_wrapper +#include "../svml_d_sincos4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S new file mode 100644 index 0000000000..a6318c5ca6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S @@ -0,0 +1,373 @@ +/* Function sincos vectorized with AVX2. 
+ Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_trig_data.h" + + .text +ENTRY (_ZGVdN4vl8l8_sincos_avx2) +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + arg + Pi/2 = (N'*Pi + R') + cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R') + sin(R), sin(R') are approximated by corresponding polynomial. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + vmovups %ymm14, 288(%rsp) + vmovups %ymm8, 352(%rsp) + vmovupd __dSignMask(%rax), %ymm6 + vmovupd __dInvPI(%rax), %ymm2 + vmovupd __dPI1_FMA(%rax), %ymm5 + vmovups %ymm9, 224(%rsp) + +/* ARGUMENT RANGE REDUCTION: + Absolute argument: X' = |X| */ + vandnpd %ymm0, %ymm6, %ymm1 + +/* SinY = X'*InvPi + RS : right shifter add */ + vfmadd213pd __dRShifter(%rax), %ymm1, %ymm2 + +/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %ymm2, %ymm4 + +/* SinN = Y - RS : right shifter sub */ + vsubpd __dRShifter(%rax), %ymm2, %ymm2 + +/* SinR = X' - SinN*Pi1 */ + vmovdqa %ymm1, %ymm14 + vfnmadd231pd %ymm2, %ymm5, %ymm14 + +/* SinR = SinR - SinN*Pi1 */ + vfnmadd231pd __dPI2_FMA(%rax), %ymm2, %ymm14 + +/* Sine result sign: SinRSign = SignMask & SinR */ + vandpd %ymm14, %ymm6, %ymm7 + +/* Set SinRSign to 0.5 */ + vorpd __dOneHalf(%rax), %ymm7, %ymm3 + +/* CosN = SinN +(-)0.5 */ + vaddpd %ymm3, %ymm2, %ymm3 + +/* CosR = SinX - CosN*Pi1 */ + vmovdqa %ymm1, %ymm8 + vfnmadd231pd %ymm3, %ymm5, %ymm8 + vmovupd __dPI3_FMA(%rax), %ymm5 + vcmpnle_uqpd __dRangeVal(%rax), %ymm1, %ymm1 + +/* CosR = CosR - CosN*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %ymm3, %ymm8 + +/* SinR = SinR - SinN*Pi3 */ + vfnmadd213pd %ymm14, %ymm5, %ymm2 + +/* CosR = CosR - CosN*Pi3 */ + vfnmadd213pd %ymm8, %ymm5, %ymm3 + vmovupd __dC6(%rax), %ymm8 + +/* SinR2 = SinR^2 */ + vmulpd %ymm2, %ymm2, %ymm14 + +/* CosR2 = CosR^2 */ + vmulpd %ymm3, %ymm3, %ymm5 + +/* Grab SignX */ + vandpd %ymm0, %ymm6, %ymm9 + +/* Update CosRSign and CosSignRes signs */ + vxorpd %ymm6, %ymm7, %ymm6 + vxorpd %ymm6, %ymm4, %ymm7 + +/* Update sign SinSignRes */ + vxorpd %ymm9, %ymm4, %ymm6 + +/* Polynomial approximation */ + vmovupd __dC7(%rax), %ymm4 + vmovdqa %ymm8, %ymm9 + vfmadd231pd __dC7(%rax), %ymm14, %ymm9 + vfmadd213pd %ymm8, %ymm5, %ymm4 + vfmadd213pd __dC5(%rax), %ymm14, %ymm9 + vfmadd213pd __dC5(%rax), %ymm5, %ymm4 + vfmadd213pd __dC4(%rax), %ymm14, %ymm9 + vfmadd213pd __dC4(%rax), %ymm5, %ymm4 + +/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ + 
vfmadd213pd __dC3(%rax), %ymm14, %ymm9 + +/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ + vfmadd213pd __dC3(%rax), %ymm5, %ymm4 + +/* SinPoly = C2 + SinR2*SinPoly */ + vfmadd213pd __dC2(%rax), %ymm14, %ymm9 + +/* CosPoly = C2 + CosR2*CosPoly */ + vfmadd213pd __dC2(%rax), %ymm5, %ymm4 + +/* SinPoly = C1 + SinR2*SinPoly */ + vfmadd213pd __dC1(%rax), %ymm14, %ymm9 + +/* CosPoly = C1 + CosR2*CosPoly */ + vfmadd213pd __dC1(%rax), %ymm5, %ymm4 + +/* SinPoly = SinR2*SinPoly */ + vmulpd %ymm14, %ymm9, %ymm8 + +/* CosPoly = CosR2*CosPoly */ + vmulpd %ymm5, %ymm4, %ymm4 + +/* SinPoly = SinR*SinPoly */ + vfmadd213pd %ymm2, %ymm2, %ymm8 + +/* CosPoly = CosR*CosPoly */ + vfmadd213pd %ymm3, %ymm3, %ymm4 + vmovmskpd %ymm1, %ecx + +/* Final reconstruction + Update Sin result's sign */ + vxorpd %ymm6, %ymm8, %ymm3 + +/* Update Cos result's sign */ + vxorpd %ymm7, %ymm4, %ymm2 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovups 352(%rsp), %ymm8 + vmovups 224(%rsp), %ymm9 + vmovups 288(%rsp), %ymm14 + vmovupd %ymm2, (%rsi) + vmovdqa %ymm3, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovupd %ymm0, 256(%rsp) + vmovupd %ymm3, 320(%rsp) + vmovupd %ymm2, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm10, 128(%rsp) + vmovups %ymm11, 96(%rsp) + vmovups %ymm12, 64(%rsp) + vmovups %ymm13, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 160(%rsp) + movq %r12, 200(%rsp) + cfi_offset_rel_rsp (12, 200) + movb %dl, %r12b + movq %r13, 192(%rsp) + cfi_offset_rel_rsp (13, 192) + movl %eax, %r13d + movq %r14, 184(%rsp) + cfi_offset_rel_rsp (14, 184) + movl %ecx, %r14d + movq %r15, 176(%rsp) + cfi_offset_rel_rsp (15, 176) + movq %rbx, 168(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 128(%rsp), %ymm10 + movq %rbx, %rdi + vmovups 96(%rsp), %ymm11 + vmovups 64(%rsp), %ymm12 + vmovups 32(%rsp), %ymm13 + vmovups (%rsp), %ymm15 + vmovupd 320(%rsp), %ymm3 + vmovupd 384(%rsp), %ymm2 + movq 160(%rsp), %rsi + movq 200(%rsp), %r12 + cfi_restore (%r12) + movq 192(%rsp), %r13 + cfi_restore (%r13) + movq 184(%rsp), %r14 + cfi_restore (%r14) + movq 176(%rsp), %r15 + cfi_restore (%r15) + movq 168(%rsp), %rbx + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 264(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(sin) + + vmovsd %xmm0, 328(%rsp,%r15) + vmovsd 264(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 392(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_13: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 256(%rsp,%r15), %xmm0 + vzeroupper + + call JUMPTARGET(sin) + + vmovsd %xmm0, 320(%rsp,%r15) + vmovsd 256(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 384(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4vl8l8_sincos_avx2) +libmvec_hidden_def(_ZGVdN4vl8l8_sincos_avx2) + +/* vvv version implemented with wrapper to vl8l8 variant. 
*/ +ENTRY (_ZGVdN4vvv_sincos_avx2) +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $128, %rsp + vmovdqu %ymm1, 64(%rsp) + lea (%rsp), %rdi + vmovdqu %ymm2, 96(%rdi) + lea 32(%rsp), %rsi + call HIDDEN_JUMPTARGET(_ZGVdN4vl8l8_sincos_avx2) + movq 64(%rsp), %rdx + movq 96(%rsp), %rsi + movq 72(%rsp), %r8 + movq 104(%rsp), %r10 + movq (%rsp), %rax + movq 32(%rsp), %rcx + movq 8(%rsp), %rdi + movq 40(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq 80(%rsp), %rax + movq 112(%rsp), %rcx + movq %rdi, (%r8) + movq %r9, (%r10) + movq 88(%rsp), %rdi + movq 120(%rsp), %r9 + movq 16(%rsp), %r11 + movq 48(%rsp), %rdx + movq 24(%rsp), %rsi + movq 56(%rsp), %r8 + movq %r11, (%rax) + movq %rdx, (%rcx) + movq %rsi, (%rdi) + movq %r8, (%r9) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x78,0x6 + leal -48(%rbp), %esi + leal -80(%rbp), %edi + subl $104, %esp + vmovaps %xmm1, -96(%ebp) + vmovaps %xmm2, -112(%ebp) + call HIDDEN_JUMPTARGET(_ZGVdN4vl8l8_sincos_avx2) + movl -96(%ebp), %eax + vmovsd -80(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -92(%ebp), %eax + vmovsd -72(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -88(%ebp), %eax + vmovsd -64(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -84(%ebp), %eax + vmovsd -56(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -112(%ebp), %eax + vmovsd -48(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -108(%ebp), %eax + vmovsd -40(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -104(%ebp), %eax + vmovsd -32(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -100(%ebp), %eax + vmovsd -24(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + addl $104, %esp + popq %r10 + .cfi_def_cfa 10, 0 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +END (_ZGVdN4vvv_sincos_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S new file mode 100644 index 0000000000..3c0abc379e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized sincos. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8vvv_sincos) + .type _ZGVeN8vvv_sincos, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN8vvv_sincos_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN8vvv_sincos_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN8vvv_sincos) + +#define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper +#include "../svml_d_sincos8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S new file mode 100644 index 0000000000..c9207558c5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S @@ -0,0 +1,763 @@ +/* Function sincos vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_trig_data.h" +#include "svml_d_wrapper_impl.h" + +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + arg + Pi/2 = (N'*Pi + R') + cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R') + sin(R), sin(R') are approximated by corresponding polynomial. 
*/ + + .text +ENTRY (_ZGVeN8vl8l8_sincos_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + vmovaps %zmm0, %zmm4 + movq $-1, %rdx + vmovups __dSignMask(%rax), %zmm12 + vmovups __dInvPI(%rax), %zmm5 + +/* ARGUMENT RANGE REDUCTION: + Absolute argument: X' = |X| */ + vpandnq %zmm4, %zmm12, %zmm3 + vmovups __dPI1_FMA(%rax), %zmm7 + vmovups __dPI3_FMA(%rax), %zmm9 + +/* SinR = X' - SinN*Pi1 */ + vmovaps %zmm3, %zmm8 + +/* CosR = SinX - CosN*Pi1 */ + vmovaps %zmm3, %zmm10 + +/* SinY = X'*InvPi + RS : right shifter add */ + vfmadd213pd __dRShifter(%rax), %zmm3, %zmm5 + vmovups __dC6(%rax), %zmm13 + +/* SinN = Y - RS : right shifter sub */ + vsubpd __dRShifter(%rax), %zmm5, %zmm1 + vmovaps %zmm13, %zmm14 + +/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm5, %zmm2 + vcmppd $22, __dRangeVal(%rax), %zmm3, %k1 + +/* Update CosRSign and CosSignRes signs */ + vmovaps %zmm12, %zmm5 + vfnmadd231pd %zmm1, %zmm7, %zmm8 + +/* SinR = SinR - SinN*Pi1 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm1, %zmm8 + +/* Sine result sign: SinRSign = SignMask & SinR */ + vpandq %zmm8, %zmm12, %zmm11 + +/* Set SinRSign to 0.5 */ + vporq __dOneHalf(%rax), %zmm11, %zmm6 + vpternlogq $150, %zmm2, %zmm11, %zmm5 + +/* Update sign SinSignRes */ + vpternlogq $120, %zmm4, %zmm12, %zmm2 + +/* Polynomial approximation */ + vmovups __dC7(%rax), %zmm11 + +/* CosN = SinN +(-)0.5 */ + vaddpd %zmm6, %zmm1, %zmm0 + +/* SinR = SinR - SinN*Pi3 */ + vfnmadd213pd %zmm8, %zmm9, %zmm1 + vfnmadd231pd %zmm0, %zmm7, %zmm10 + +/* SinR2 = SinR^2 */ + vmulpd %zmm1, %zmm1, %zmm15 + +/* Grab SignX + CosR = CosR - CosN*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm0, %zmm10 + vfmadd231pd __dC7(%rax), %zmm15, %zmm14 + +/* CosR = CosR - CosN*Pi3 */ + vfnmadd213pd %zmm10, %zmm9, %zmm0 + vfmadd213pd __dC5(%rax), %zmm15, %zmm14 + +/* CosR2 = CosR^2 */ + vmulpd %zmm0, %zmm0, %zmm12 + vfmadd213pd __dC4(%rax), %zmm15, %zmm14 + vfmadd213pd %zmm13, %zmm12, %zmm11 + +/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ + vfmadd213pd __dC3(%rax), %zmm15, %zmm14 + vfmadd213pd __dC5(%rax), %zmm12, %zmm11 + +/* SinPoly = C2 + SinR2*SinPoly */ + vfmadd213pd __dC2(%rax), %zmm15, %zmm14 + vfmadd213pd __dC4(%rax), %zmm12, %zmm11 + +/* SinPoly = C1 + SinR2*SinPoly */ + vfmadd213pd __dC1(%rax), %zmm15, %zmm14 + +/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ + vfmadd213pd __dC3(%rax), %zmm12, %zmm11 + +/* SinPoly = SinR2*SinPoly */ + vmulpd %zmm15, %zmm14, %zmm13 + +/* CosPoly = C2 + CosR2*CosPoly */ + vfmadd213pd __dC2(%rax), %zmm12, %zmm11 + +/* SinPoly = SinR*SinPoly */ + vfmadd213pd %zmm1, %zmm1, %zmm13 + vpbroadcastq %rdx, %zmm1{%k1}{z} + +/* CosPoly = C1 + CosR2*CosPoly */ + vfmadd213pd __dC1(%rax), %zmm12, %zmm11 + vptestmq %zmm1, %zmm1, %k0 + kmovw %k0, %ecx + +/* CosPoly = CosR2*CosPoly */ + vmulpd %zmm12, %zmm11, %zmm14 + movzbl %cl, %ecx + +/* CosPoly = CosR*CosPoly */ + vfmadd213pd %zmm0, %zmm0, %zmm14 + +/* Final reconstruction. 
+ Update Sin result's sign */ + vpxorq %zmm2, %zmm13, %zmm0 + +/* Update Cos result's sign */ + vpxorq %zmm5, %zmm14, %zmm2 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovups %zmm0, (%rdi) + vmovups %zmm2, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm4, 1152(%rsp) + vmovups %zmm0, 1216(%rsp) + vmovups %zmm2, 1280(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %eax, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %ecx, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + movq %rbx, 1064(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + movq %rbx, %rdi + kmovw 1048(%rsp), %k4 + movq 1056(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + kmovw 1032(%rsp), %k6 + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + movq 1064(%rsp), %rbx + vmovups 1216(%rsp), %zmm0 + vmovups 1280(%rsp), %zmm2 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + vmovsd %xmm0, 1224(%rsp,%r15) + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 1288(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_13: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + vmovsd %xmm0, 1216(%rsp,%r15) + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 1280(%rsp,%r15) + jmp .LBL_1_7 + +#endif +END (_ZGVeN8vl8l8_sincos_knl) +libmvec_hidden_def(_ZGVeN8vl8l8_sincos_knl) + +ENTRY (_ZGVeN8vl8l8_sincos_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax + vmovaps %zmm0, %zmm8 + vmovups __dSignMask(%rax), %zmm4 + vmovups __dInvPI(%rax), %zmm9 + vmovups __dRShifter(%rax), %zmm10 + vmovups 
__dPI1_FMA(%rax), %zmm13 + vmovups __dPI2_FMA(%rax), %zmm14 + vmovups __dOneHalf(%rax), %zmm11 + vmovups __dPI3_FMA(%rax), %zmm2 + +/* ARGUMENT RANGE REDUCTION: + Absolute argument: X' = |X| */ + vandnpd %zmm8, %zmm4, %zmm7 + +/* SinY = X'*InvPi + RS : right shifter add */ + vfmadd213pd %zmm10, %zmm7, %zmm9 + vcmppd $18, __dRangeVal(%rax), %zmm7, %k1 + +/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm9, %zmm6 + +/* SinN = Y - RS : right shifter sub */ + vsubpd %zmm10, %zmm9, %zmm5 + vmovups __dC5(%rax), %zmm9 + vmovups __dC4(%rax), %zmm10 + +/* SinR = X' - SinN*Pi1 */ + vmovaps %zmm7, %zmm15 + vfnmadd231pd %zmm5, %zmm13, %zmm15 + +/* SinR = SinR - SinN*Pi1 */ + vfnmadd231pd %zmm5, %zmm14, %zmm15 + +/* Sine result sign: SinRSign = SignMask & SinR */ + vandpd %zmm15, %zmm4, %zmm1 + +/* Set SinRSign to 0.5 */ + vorpd %zmm1, %zmm11, %zmm12 + vmovups __dC3(%rax), %zmm11 + +/* CosN = SinN +(-)0.5 */ + vaddpd %zmm12, %zmm5, %zmm3 + +/* SinR = SinR - SinN*Pi3 */ + vfnmadd213pd %zmm15, %zmm2, %zmm5 + vmovups __dC2(%rax), %zmm12 + +/* SinR2 = SinR^2 */ + vmulpd %zmm5, %zmm5, %zmm15 + +/* CosR = SinX - CosN*Pi1 */ + vmovaps %zmm7, %zmm0 + vfnmadd231pd %zmm3, %zmm13, %zmm0 + vmovups __dC1(%rax), %zmm13 + +/* Grab SignX + CosR = CosR - CosN*Pi2 */ + vfnmadd231pd %zmm3, %zmm14, %zmm0 + +/* CosR = CosR - CosN*Pi3 */ + vfnmadd213pd %zmm0, %zmm2, %zmm3 + +/* Polynomial approximation */ + vmovups __dC7(%rax), %zmm0 + +/* Update CosRSign and CosSignRes signs */ + vmovaps %zmm4, %zmm2 + vpternlogq $150, %zmm6, %zmm1, %zmm2 + +/* Update sign SinSignRes */ + vpternlogq $120, %zmm8, %zmm4, %zmm6 + +/* CosR2 = CosR^2 */ + vmulpd %zmm3, %zmm3, %zmm1 + vmovups __dC6(%rax), %zmm4 + vmovaps %zmm0, %zmm14 + vfmadd213pd %zmm4, %zmm1, %zmm0 + vfmadd213pd %zmm4, %zmm15, %zmm14 + vfmadd213pd %zmm9, %zmm1, %zmm0 + vfmadd213pd %zmm9, %zmm15, %zmm14 + vfmadd213pd %zmm10, %zmm1, %zmm0 + vfmadd213pd %zmm10, %zmm15, %zmm14 + +/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ + vfmadd213pd %zmm11, %zmm1, %zmm0 + +/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ + vfmadd213pd %zmm11, %zmm15, %zmm14 + +/* CosPoly = C2 + CosR2*CosPoly */ + vfmadd213pd %zmm12, %zmm1, %zmm0 + +/* SinPoly = C2 + SinR2*SinPoly */ + vfmadd213pd %zmm12, %zmm15, %zmm14 + +/* CosPoly = C1 + CosR2*CosPoly */ + vfmadd213pd %zmm13, %zmm1, %zmm0 + +/* SinPoly = C1 + SinR2*SinPoly */ + vfmadd213pd %zmm13, %zmm15, %zmm14 + +/* CosPoly = CosR2*CosPoly */ + vmulpd %zmm1, %zmm0, %zmm1 + +/* SinPoly = SinR2*SinPoly */ + vmulpd %zmm15, %zmm14, %zmm4 + +/* CosPoly = CosR*CosPoly */ + vfmadd213pd %zmm3, %zmm3, %zmm1 + +/* SinPoly = SinR*SinPoly */ + vfmadd213pd %zmm5, %zmm5, %zmm4 + vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 + +/* Update Cos result's sign */ + vxorpd %zmm2, %zmm1, %zmm1 + +/* Final reconstruction. 
+ Update Sin result's sign */ + vxorpd %zmm6, %zmm4, %zmm0 + vpandnq %zmm7, %zmm7, %zmm3{%k1} + vcmppd $3, %zmm3, %zmm3, %k0 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovups %zmm0, (%rdi) + vmovups %zmm1, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm8, 1152(%rsp) + vmovups %zmm0, 1216(%rsp) + vmovups %zmm1, 1280(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %eax, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %ecx, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + movq %rbx, 1064(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_2_6: + btl %r13d, %r14d + jc .LBL_2_13 + +.LBL_2_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + movq %rbx, %rdi + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm0 + vmovups 1280(%rsp), %zmm1 + movq 1056(%rsp), %rsi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + movq 1064(%rsp), %rbx + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + vmovsd %xmm0, 1224(%rsp,%r15) + vmovsd 1160(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 1288(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_13: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(sin) + + vmovsd %xmm0, 1216(%rsp,%r15) + vmovsd 1152(%rsp,%r15), %xmm0 + + call JUMPTARGET(cos) + + vmovsd %xmm0, 1280(%rsp,%r15) + jmp .LBL_2_7 + +#endif +END (_ZGVeN8vl8l8_sincos_skx) +libmvec_hidden_def(_ZGVeN8vl8l8_sincos_skx) + +/* Wrapper between vvv and vl8l8 vector variants. */ +.macro WRAPPER_AVX512_vvv_vl8l8 callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $256, %rsp + /* Encoding for vmovups %zmm1, 128(%rsp). 
*/ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x4c + .byte 0x24 + .byte 0x02 + lea (%rsp), %rdi + /* Encoding for vmovups %zmm2, 192(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x57 + .byte 0x03 + lea 64(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 128(%rsp), %rdx + movq 136(%rsp), %rsi + movq 144(%rsp), %r8 + movq 152(%rsp), %r10 + movq (%rsp), %rax + movq 8(%rsp), %rcx + movq 16(%rsp), %rdi + movq 24(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq 160(%rsp), %rax + movq 168(%rsp), %rcx + movq %rdi, (%r8) + movq %r9, (%r10) + movq 176(%rsp), %rdi + movq 184(%rsp), %r9 + movq 32(%rsp), %r11 + movq 40(%rsp), %rdx + movq 48(%rsp), %rsi + movq 56(%rsp), %r8 + movq %r11, (%rax) + movq %rdx, (%rcx) + movq 192(%rsp), %r11 + movq 200(%rsp), %rdx + movq %rsi, (%rdi) + movq %r8, (%r9) + movq 208(%rsp), %rsi + movq 216(%rsp), %r8 + movq 64(%rsp), %r10 + movq 72(%rsp), %rax + movq 80(%rsp), %rcx + movq 88(%rsp), %rdi + movq %r10, (%r11) + movq %rax, (%rdx) + movq 224(%rsp), %r10 + movq 232(%rsp), %rax + movq %rcx, (%rsi) + movq %rdi, (%r8) + movq 240(%rsp), %rcx + movq 248(%rsp), %rdi + movq 96(%rsp), %r9 + movq 104(%rsp), %r11 + movq 112(%rsp), %rdx + movq 120(%rsp), %rsi + movq %r9, (%r10) + movq %r11, (%rax) + movq %rdx, (%rcx) + movq %rsi, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-64, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x78,0x6 + leal -112(%rbp), %esi + leal -176(%rbp), %edi + subl $232, %esp + vmovdqa %ymm1, -208(%ebp) + vmovdqa %ymm2, -240(%ebp) + call HIDDEN_JUMPTARGET(\callee) + vmovdqa -208(%ebp), %xmm0 + vmovq %xmm0, %rax + vmovsd -176(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -168(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -200(%ebp), %rax + vmovsd -160(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -152(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -192(%ebp), %rax + vmovsd -144(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -136(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -184(%ebp), %rax + vmovsd -128(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -120(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovdqa -240(%ebp), %xmm0 + vmovq %xmm0, %rax + vmovsd -112(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -104(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -232(%ebp), %rax + vmovsd -96(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -88(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -224(%ebp), %rax + vmovsd -80(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -72(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -216(%ebp), %rax + vmovsd -64(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -56(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + addl $232, %esp + popq %r10 + .cfi_def_cfa 10, 0 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVeN8vvv_sincos_knl) +WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_knl +END (_ZGVeN8vvv_sincos_knl) + +ENTRY (_ZGVeN8vvv_sincos_skx) +WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx +END (_ZGVeN8vvv_sincos_skx) + + .section .rodata, "a" +.L_2il0floatpacket.15: + .long 0xffffffff,0xffffffff + .type .L_2il0floatpacket.15,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S new file mode 100644 index 0000000000..cd67665972 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized cosf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN16v_cosf) + .type _ZGVeN16v_cosf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN16v_cosf_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN16v_cosf_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN16v_cosf) + +#define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper +#include "../svml_s_cosf16_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S new file mode 100644 index 0000000000..611bb5dd2d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S @@ -0,0 +1,460 @@ +/* Function cosf vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_trig_data.h" +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16v_cosf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf +#else +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) We remove sign using AND operation + b) Add Pi/2 value to argument X for Cos to Sin transformation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. 
+ Shift first bit of this value to the last (sign) position + f) Subtract "Right Shifter" value + g) Subtract 0.5 from result for octant correction + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ..... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rdx + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 + */ + vmovaps %zmm0, %zmm6 + movl $-1, %eax + +/* b) Add Pi/2 value to argument X for Cos to Sin transformation */ + vaddps __sHalfPI(%rdx), %zmm0, %zmm2 + vmovups __sRShifter(%rdx), %zmm3 + +/* + 1) Range reduction to [-Pi/2; +Pi/2] interval + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" (0x4B000000) value + */ + vfmadd132ps __sInvPI(%rdx), %zmm3, %zmm2 + vmovups __sPI1_FMA(%rdx), %zmm5 + +/* f) Subtract "Right Shifter" (0x4B000000) value */ + vsubps %zmm3, %zmm2, %zmm4 + vmovups __sA9_FMA(%rdx), %zmm9 + +/* Check for large and special arguments */ + vpandd __sAbsMask(%rdx), %zmm0, %zmm1 + +/* + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position (S << 31) + */ + vpslld $31, %zmm2, %zmm8 + vcmpps $22, __sRangeReductionVal(%rdx), %zmm1, %k1 + vpbroadcastd %eax, %zmm12{%k1}{z} + +/* g) Subtract 0.5 from result for octant correction */ + vsubps __sOneHalf(%rdx), %zmm4, %zmm7 + vptestmd %zmm12, %zmm12, %k0 + vfnmadd231ps %zmm7, %zmm5, %zmm6 + kmovw %k0, %ecx + vfnmadd231ps __sPI2_FMA(%rdx), %zmm7, %zmm6 + vfnmadd132ps __sPI3_FMA(%rdx), %zmm6, %zmm7 + +/* a) Calculate X^2 = X * X */ + vmulps %zmm7, %zmm7, %zmm10 + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + vpxord %zmm8, %zmm7, %zmm11 + +/* + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9)))); + */ + vfmadd213ps __sA7_FMA(%rdx), %zmm10, %zmm9 + vfmadd213ps __sA5_FMA(%rdx), %zmm10, %zmm9 + vfmadd213ps __sA3(%rdx), %zmm10, %zmm9 + vmulps %zmm10, %zmm9, %zmm1 + vfmadd213ps %zmm11, %zmm11, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 
1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + call JUMPTARGET(cosf) + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + call JUMPTARGET(cosf) + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_1_7 +#endif +END (_ZGVeN16v_cosf_knl) + +ENTRY (_ZGVeN16v_cosf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf +#else +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) We remove sign using AND operation + b) Add Pi/2 value to argument X for Cos to Sin transformation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + f) Subtract "Right Shifter" value + g) Subtract 0.5 from result for octant correction + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ..... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 + */ + vmovaps %zmm0, %zmm6 + vmovups .L_2il0floatpacket.13(%rip), %zmm12 + vmovups __sRShifter(%rax), %zmm3 + vmovups __sPI1_FMA(%rax), %zmm5 + vmovups __sA9_FMA(%rax), %zmm9 + +/* b) Add Pi/2 value to argument X for Cos to Sin transformation */ + vaddps __sHalfPI(%rax), %zmm0, %zmm2 + +/* Check for large and special arguments */ + vandps __sAbsMask(%rax), %zmm0, %zmm1 + +/* + 1) Range reduction to [-Pi/2; +Pi/2] interval + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" (0x4B000000) value + */ + vfmadd132ps __sInvPI(%rax), %zmm3, %zmm2 + vcmpps $18, __sRangeReductionVal(%rax), %zmm1, %k1 + +/* + e) Treat obtained value as integer for destination sign setting. 
+ Shift first bit of this value to the last (sign) position (S << 31) + */ + vpslld $31, %zmm2, %zmm8 + +/* f) Subtract "Right Shifter" (0x4B000000) value */ + vsubps %zmm3, %zmm2, %zmm4 + +/* g) Subtract 0.5 from result for octant correction */ + vsubps __sOneHalf(%rax), %zmm4, %zmm7 + vfnmadd231ps %zmm7, %zmm5, %zmm6 + vfnmadd231ps __sPI2_FMA(%rax), %zmm7, %zmm6 + vfnmadd132ps __sPI3_FMA(%rax), %zmm6, %zmm7 + +/* a) Calculate X^2 = X * X */ + vmulps %zmm7, %zmm7, %zmm10 + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + vxorps %zmm8, %zmm7, %zmm11 + +/* + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9)))); + */ + vfmadd213ps __sA7_FMA(%rax), %zmm10, %zmm9 + vfmadd213ps __sA5_FMA(%rax), %zmm10, %zmm9 + vfmadd213ps __sA3(%rax), %zmm10, %zmm9 + vpandnd %zmm1, %zmm1, %zmm12{%k1} + vmulps %zmm10, %zmm9, %zmm1 + vptestmd %zmm12, %zmm12, %k0 + vfmadd213ps %zmm11, %zmm11, %zmm1 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1156(%rsp,%r15,8), %xmm0 + call JUMPTARGET(cosf) + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_2_8 +.LBL_2_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1152(%rsp,%r15,8), 
%xmm0 + call JUMPTARGET(cosf) + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_2_7 +#endif +END (_ZGVeN16v_cosf_skx) + + .section .rodata, "a" +.L_2il0floatpacket.13: + .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff + .type .L_2il0floatpacket.13,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S new file mode 100644 index 0000000000..d73d7c7e3f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized cosf, vector length is 4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN4v_cosf) + .type _ZGVbN4v_cosf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN4v_cosf_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN4v_cosf_sse2(%rip), %rax + ret +END (_ZGVbN4v_cosf) +libmvec_hidden_def (_ZGVbN4v_cosf) + +#define _ZGVbN4v_cosf _ZGVbN4v_cosf_sse2 +#include "../svml_s_cosf4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S new file mode 100644 index 0000000000..73797e1a93 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S @@ -0,0 +1,227 @@ +/* Function cosf vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_trig_data.h" + + .text +ENTRY (_ZGVbN4v_cosf_sse4) +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) We remove sign using AND operation + b) Add Pi/2 value to argument X for Cos to Sin transformation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. 
+ Shift first bit of this value to the last (sign) position + f) Subtract "Right Shifter" value + g) Subtract 0.5 from result for octant correction + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ..... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm4 + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + movups __sHalfPI(%rax), %xmm1 + movups __sRShifter(%rax), %xmm5 + +/* b) Add Pi/2 value to argument X for Cos to Sin transformation */ + addps %xmm4, %xmm1 + +/* + 1) Range reduction to [-Pi/2; +Pi/2] interval + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" (0x4B000000) value + */ + mulps __sInvPI(%rax), %xmm1 + movups __sPI1(%rax), %xmm6 + addps %xmm5, %xmm1 + +/* + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position (S << 31) + */ + movaps %xmm1, %xmm2 + +/* f) Subtract "Right Shifter" (0x4B000000) value */ + subps %xmm5, %xmm1 + movups __sPI2(%rax), %xmm7 + pslld $31, %xmm2 + movups __sPI3(%rax), %xmm5 + movups __sAbsMask(%rax), %xmm3 + +/* Check for large and special arguments */ + andps %xmm4, %xmm3 + +/* g) Subtract 0.5 from result for octant correction */ + subps __sOneHalf(%rax), %xmm1 + cmpnleps __sRangeReductionVal(%rax), %xmm3 + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + */ + mulps %xmm1, %xmm6 + mulps %xmm1, %xmm7 + mulps %xmm1, %xmm5 + subps %xmm6, %xmm0 + movmskps %xmm3, %ecx + movups __sPI4(%rax), %xmm6 + subps %xmm7, %xmm0 + mulps %xmm6, %xmm1 + subps %xmm5, %xmm0 + subps %xmm1, %xmm0 + +/* a) Calculate X^2 = X * X */ + movaps %xmm0, %xmm1 + mulps %xmm0, %xmm1 + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + xorps %xmm2, %xmm0 + movups __sA9(%rax), %xmm2 + +/* + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9)))); + */ + mulps %xmm1, %xmm2 + addps __sA7(%rax), %xmm2 + mulps %xmm1, %xmm2 + addps __sA5(%rax), %xmm2 + mulps %xmm1, %xmm2 + addps __sA3(%rax), %xmm2 + mulps %xmm2, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm4, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %dl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %ecx, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %eax, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc 
.LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + movss 196(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + movss %xmm0, 260(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + movss 192(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + movss %xmm0, 256(%rsp,%r15,8) + jmp .LBL_1_7 +END (_ZGVbN4v_cosf_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S new file mode 100644 index 0000000000..f7530c138a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized cosf, vector length is 8. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN8v_cosf) + .type _ZGVdN8v_cosf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN8v_cosf_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN8v_cosf_sse_wrapper(%rip), %rax + ret +END (_ZGVdN8v_cosf) +libmvec_hidden_def (_ZGVdN8v_cosf) + +#define _ZGVdN8v_cosf _ZGVdN8v_cosf_sse_wrapper +#include "../svml_s_cosf8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S new file mode 100644 index 0000000000..c61add3bb9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S @@ -0,0 +1,215 @@ +/* Function cosf vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
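The AVX2 kernel below implements the same algorithm as the SSE4 and AVX-512 cosf versions above.  As a plain-C reference for the per-lane computation (illustrative only: the multi-part pi reduction is collapsed to a single constant, the A3..A9 coefficients are replaced by Taylor terms, and the large-argument fallback is omitted; the real kernels read tuned values from __svml_s_trig_data):

#include <math.h>
#include <stdint.h>

// Scalar sketch of the vectorized cosf algorithm (not the tuned implementation).
static float
cosf_sketch (float x)
{
  const float pi = 3.14159265f;
  float y = (x + pi / 2) * (1.0f / pi);          // b) + c): cos->sin shift, octant estimate
  float n = rintf (y);                           // d) + f): done with the "right shifter" add/sub
  uint32_t sign = (uint32_t) (int32_t) n << 31;  // e): low bit of the octant becomes the sign
  n -= 0.5f;                                     // g): octant correction
  float r = x - n * pi;                          // h): the kernels split pi into 3-4 parts here
  float r2 = r * r;                              // 2a)
  float p = (-1.0f / 5040 * r2 + 1.0f / 120) * r2 - 1.0f / 6;   // 2b): A3..A9 stand-ins
  union { float f; uint32_t u; } res = { .f = r + r * r2 * p };
  res.u ^= sign;                                 // 3a): apply the shifted sign
  return res.f;
}

Splitting pi into PI1..PI4 in the real kernels keeps the reduced argument accurate when n is large, which a single-constant subtraction cannot do.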
*/ + + +#include <sysdep.h> +#include "svml_s_trig_data.h" + + .text +ENTRY (_ZGVdN8v_cosf_avx2) +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) We remove sign using AND operation + b) Add Pi/2 value to argument X for Cos to Sin transformation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + f) Subtract "Right Shifter" value + g) Subtract 0.5 from result for octant correction + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ..... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + vmovaps %ymm0, %ymm2 + vmovups __sRShifter(%rax), %ymm5 + vmovups __sPI1_FMA(%rax), %ymm7 + +/* b) Add Pi/2 value to argument X for Cos to Sin transformation */ + vaddps __sHalfPI(%rax), %ymm2, %ymm4 + +/* + 1) Range reduction to [-Pi/2; +Pi/2] interval + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" (0x4B000000) value + */ + vfmadd132ps __sInvPI(%rax), %ymm5, %ymm4 + +/* f) Subtract "Right Shifter" (0x4B000000) value */ + vsubps %ymm5, %ymm4, %ymm6 + +/* + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position (S << 31) + */ + vpslld $31, %ymm4, %ymm0 + +/* g) Subtract 0.5 from result for octant correction */ + vsubps __sOneHalf(%rax), %ymm6, %ymm4 + +/* Check for large and special arguments */ + vandps __sAbsMask(%rax), %ymm2, %ymm3 + vcmpnle_uqps __sRangeReductionVal(%rax), %ymm3, %ymm1 + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 + */ + vmovaps %ymm2, %ymm3 + vfnmadd231ps %ymm4, %ymm7, %ymm3 + vfnmadd231ps __sPI2_FMA(%rax), %ymm4, %ymm3 + vfnmadd132ps __sPI3_FMA(%rax), %ymm3, %ymm4 + +/* a) Calculate X^2 = X * X */ + vmulps %ymm4, %ymm4, %ymm5 + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + vxorps %ymm0, %ymm4, %ymm6 + vmovups __sA9_FMA(%rax), %ymm0 + +/* + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9)))) + */ + vfmadd213ps __sA7_FMA(%rax), %ymm5, %ymm0 + vfmadd213ps __sA5_FMA(%rax), %ymm5, %ymm0 + vfmadd213ps __sA3(%rax), %ymm5, %ymm0 + vmulps %ymm5, %ymm0, %ymm0 + vmovmskps %ymm1, %ecx + vfmadd213ps %ymm6, %ymm6, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %ymm2, 320(%rsp) + vmovups %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + 
cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovups 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 324(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(cosf) + + vmovss %xmm0, 388(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 320(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(cosf) + + vmovss %xmm0, 384(%rsp,%r15,8) + jmp .LBL_1_7 + +END (_ZGVdN8v_cosf_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S new file mode 100644 index 0000000000..3998f616aa --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized expf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN16v_expf) + .type _ZGVeN16v_expf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN16v_expf_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN16v_expf_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN16v_expf_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN16v_expf) + +#define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper +#include "../svml_s_expf16_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S new file mode 100644 index 0000000000..e80b2be1a7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S @@ -0,0 +1,447 @@ +/* Function expf vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_expf_data.h" +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16v_expf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_expf +#else +/* + ALGORITHM DESCRIPTION: + + Argument representation: + M = rint(X*2^k/ln2) = 2^k*N+j + X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + M = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(N*ln2 + ln2*(j/2^k) + r) + = 2^N * 2^(j/2^k) * exp(r) + 2^N is calculated by bit manipulation + 2^(j/2^k) is computed from table lookup + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. + For low accuracy approximation, exp(r) ~ 1 or 1+r. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_sexp_data@GOTPCREL(%rip), %rax + +/* r = x-n*ln2_hi/2^k */ + vmovaps %zmm0, %zmm6 + +/* compare against threshold */ + movl $-1, %ecx + vmovups __sInvLn2(%rax), %zmm3 + vmovups __sLn2hi(%rax), %zmm5 + +/* m = x*2^k/ln2 + shifter */ + vfmadd213ps __sShifter(%rax), %zmm0, %zmm3 + vmovups __sPC5(%rax), %zmm9 + +/* n = m - shifter = rint(x*2^k/ln2) */ + vsubps __sShifter(%rax), %zmm3, %zmm7 + +/* remove sign of x by "and" operation */ + vpandd __iAbsMask(%rax), %zmm0, %zmm1 + vpaddd __iBias(%rax), %zmm3, %zmm4 + vpcmpgtd __iDomainRange(%rax), %zmm1, %k1 + +/* compute 2^N with "shift" */ + vpslld $23, %zmm4, %zmm8 + vfnmadd231ps %zmm7, %zmm5, %zmm6 + vpbroadcastd %ecx, %zmm2{%k1}{z} + +/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */ + vfnmadd132ps __sLn2lo(%rax), %zmm6, %zmm7 + +/* set mask for overflow/underflow */ + vptestmd %zmm2, %zmm2, %k0 + kmovw %k0, %ecx + +/* c5*r+c4 */ + vfmadd213ps __sPC4(%rax), %zmm7, %zmm9 + +/* (c5*r+c4)*r+c3 */ + vfmadd213ps __sPC3(%rax), %zmm7, %zmm9 + +/* ((c5*r+c4)*r+c3)*r+c2 */ + vfmadd213ps __sPC2(%rax), %zmm7, %zmm9 + +/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */ + vfmadd213ps __sPC1(%rax), %zmm7, %zmm9 + +/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */ + vfmadd213ps __sPC0(%rax), %zmm7, %zmm9 + +/* 2^N*exp(r) */ + vmulps %zmm9, %zmm8, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups 
%zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + call JUMPTARGET(__expf_finite) + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + call JUMPTARGET(__expf_finite) + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_1_7 + +#endif +END (_ZGVeN16v_expf_knl) + +ENTRY (_ZGVeN16v_expf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_expf +#else +/* + ALGORITHM DESCRIPTION: + + Argument representation: + M = rint(X*2^k/ln2) = 2^k*N+j + X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + M = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(N*ln2 + ln2*(j/2^k) + r) + = 2^N * 2^(j/2^k) * exp(r) + 2^N is calculated by bit manipulation + 2^(j/2^k) is computed from table lookup + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. + For low accuracy approximation, exp(r) ~ 1 or 1+r. 
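As a plain-C reference for the k = 0 path used here (illustrative only: ln2 is not split into hi/lo parts and the PC0..PC5 coefficients are replaced by Taylor terms; the real constants come from __svml_sexp_data):

#include <math.h>
#include <stdint.h>

// Scalar sketch of the vectorized expf kernel (k = 0, no table lookup).
static float
expf_sketch (float x)
{
  float n = rintf (x * 1.44269504f);             // n = rint(x/ln2); the kernel uses the "shifter" add/sub
  float r = x - n * 0.693147182f;                // reduced argument; the kernel uses ln2_hi + ln2_lo
  union { uint32_t u; float f; } two_n;
  two_n.u = (uint32_t) ((int32_t) n + 127) << 23;   // 2^N by building the exponent field directly
  float p = ((((1.0f / 120 * r + 1.0f / 24) * r + 1.0f / 6) * r + 0.5f) * r
             + 1.0f) * r + 1.0f;                 // ~exp(r); stands in for the c5..c0 polynomial
  return two_n.f * p;                            // 2^N * exp(r); out-of-range lanes fall back to __expf_finite
}

The hi/lo split of ln2 in the real kernel keeps r accurate after the multiplication by n, which a single ln2 constant would lose.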
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_sexp_data@GOTPCREL(%rip), %rax + +/* r = x-n*ln2_hi/2^k */ + vmovaps %zmm0, %zmm7 + +/* compare against threshold */ + vmovups .L_2il0floatpacket.13(%rip), %zmm3 + vmovups __sInvLn2(%rax), %zmm4 + vmovups __sShifter(%rax), %zmm1 + vmovups __sLn2hi(%rax), %zmm6 + vmovups __sPC5(%rax), %zmm10 + +/* m = x*2^k/ln2 + shifter */ + vfmadd213ps %zmm1, %zmm0, %zmm4 + +/* n = m - shifter = rint(x*2^k/ln2) */ + vsubps %zmm1, %zmm4, %zmm8 + vpaddd __iBias(%rax), %zmm4, %zmm5 + vfnmadd231ps %zmm8, %zmm6, %zmm7 + +/* compute 2^N with "shift" */ + vpslld $23, %zmm5, %zmm9 + +/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */ + vfnmadd132ps __sLn2lo(%rax), %zmm7, %zmm8 + +/* c5*r+c4 */ + vfmadd213ps __sPC4(%rax), %zmm8, %zmm10 + +/* (c5*r+c4)*r+c3 */ + vfmadd213ps __sPC3(%rax), %zmm8, %zmm10 + +/* ((c5*r+c4)*r+c3)*r+c2 */ + vfmadd213ps __sPC2(%rax), %zmm8, %zmm10 + +/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */ + vfmadd213ps __sPC1(%rax), %zmm8, %zmm10 + +/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */ + vfmadd213ps __sPC0(%rax), %zmm8, %zmm10 + +/* 2^N*exp(r) */ + vmulps %zmm10, %zmm9, %zmm1 + +/* remove sign of x by "and" operation */ + vpandd __iAbsMask(%rax), %zmm0, %zmm2 + vpcmpd $2, __iDomainRange(%rax), %zmm2, %k1 + vpandnd %zmm2, %zmm2, %zmm3{%k1} + +/* set mask for overflow/underflow */ + vptestmd %zmm3, %zmm3, %k0 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 
1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__expf_finite) + + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__expf_finite) + + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_2_7 + +#endif +END (_ZGVeN16v_expf_skx) + + .section .rodata, "a" +.L_2il0floatpacket.13: + .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff + .type .L_2il0floatpacket.13,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S new file mode 100644 index 0000000000..8051720ec2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized expf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN4v_expf) + .type _ZGVbN4v_expf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN4v_expf_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN4v_expf_sse2(%rip), %rax + ret +END (_ZGVbN4v_expf) +libmvec_hidden_def (_ZGVbN4v_expf) + +#define _ZGVbN4v_expf _ZGVbN4v_expf_sse2 +#include "../svml_s_expf4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S new file mode 100644 index 0000000000..2bc510bbf7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S @@ -0,0 +1,212 @@ +/* Function expf vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
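The _ZGV* entry points in this patch (such as _ZGVbN4v_expf above) are @gnu_indirect_function resolvers that all follow the same shape.  A C sketch of that dispatch, with cpu_has and SSE4_1 as hypothetical stand-ins for the HAS_CPU_FEATURE / HAS_ARCH_FEATURE machinery:

#include <immintrin.h>

// Sketch of the dispatch done by the indirect-function stubs (illustrative only).
typedef __m128 (*expf4_fn) (__m128);

extern __m128 _ZGVbN4v_expf_sse4 (__m128);
extern __m128 _ZGVbN4v_expf_sse2 (__m128);
extern int cpu_has (int feature);   // hypothetical stand-in for HAS_CPU_FEATURE
#define SSE4_1 1                    // hypothetical feature id for the sketch

static expf4_fn
resolve_ZGVbN4v_expf (void)
{
  return cpu_has (SSE4_1) ? _ZGVbN4v_expf_sse4 : _ZGVbN4v_expf_sse2;
}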
*/ + +#include <sysdep.h> +#include "svml_s_expf_data.h" + + .text +ENTRY (_ZGVbN4v_expf_sse4) +/* + ALGORITHM DESCRIPTION: + + Argument representation: + M = rint(X*2^k/ln2) = 2^k*N+j + X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + M = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(N*ln2 + ln2*(j/2^k) + r) + = 2^N * 2^(j/2^k) * exp(r) + 2^N is calculated by bit manipulation + 2^(j/2^k) is computed from table lookup + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. + For low accuracy approximation, exp(r) ~ 1 or 1+r. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm5 + movq __svml_sexp_data@GOTPCREL(%rip), %rax + movups __sInvLn2(%rax), %xmm0 + +/* m = x*2^k/ln2 + shifter */ + mulps %xmm5, %xmm0 + movups __sShifter(%rax), %xmm6 + movups __sLn2hi(%rax), %xmm4 + addps %xmm6, %xmm0 + +/* n = m - shifter = rint(x*2^k/ln2) */ + movaps %xmm0, %xmm2 + +/* remove sign of x by "and" operation */ + movdqu __iAbsMask(%rax), %xmm7 + subps %xmm6, %xmm2 + +/* r = x-n*ln2_hi/2^k */ + mulps %xmm2, %xmm4 + pand %xmm5, %xmm7 + +/* compare against threshold */ + pcmpgtd __iDomainRange(%rax), %xmm7 + movups __sLn2lo(%rax), %xmm1 + +/* set mask for overflow/underflow */ + movmskps %xmm7, %ecx + movaps %xmm5, %xmm7 + movups __sPC5(%rax), %xmm3 + subps %xmm4, %xmm7 + +/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */ + mulps %xmm1, %xmm2 + +/* compute 2^N with "shift" */ + movdqu __iBias(%rax), %xmm6 + subps %xmm2, %xmm7 + +/* c5*r+c4 */ + mulps %xmm7, %xmm3 + paddd %xmm6, %xmm0 + pslld $23, %xmm0 + addps __sPC4(%rax), %xmm3 + +/* (c5*r+c4)*r+c3 */ + mulps %xmm7, %xmm3 + addps __sPC3(%rax), %xmm3 + +/* ((c5*r+c4)*r+c3)*r+c2 */ + mulps %xmm7, %xmm3 + addps __sPC2(%rax), %xmm3 + +/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */ + mulps %xmm7, %xmm3 + addps __sPC1(%rax), %xmm3 + +/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */ + mulps %xmm3, %xmm7 + addps __sPC0(%rax), %xmm7 + +/* 2^N*exp(r) */ + mulps %xmm7, %xmm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm5, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %dl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %ecx, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %eax, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 
152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + movss 196(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__expf_finite) + + movss %xmm0, 260(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + movss 192(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__expf_finite) + + movss %xmm0, 256(%rsp,%r15,8) + jmp .LBL_1_7 + +END (_ZGVbN4v_expf_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S new file mode 100644 index 0000000000..6ffb1fd784 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized expf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN8v_expf) + .type _ZGVdN8v_expf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN8v_expf_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN8v_expf_sse_wrapper(%rip), %rax + ret +END (_ZGVdN8v_expf) +libmvec_hidden_def (_ZGVdN8v_expf) + +#define _ZGVdN8v_expf _ZGVdN8v_expf_sse_wrapper +#include "../svml_s_expf8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S new file mode 100644 index 0000000000..b4a070ac86 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S @@ -0,0 +1,202 @@ +/* Function expf vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
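Each kernel above, and the AVX2 expf version below, ends with the same special-case tail: spill the live registers, walk the mask of lanes that failed the range check with btl, and redo each such lane with the scalar routine through JUMPTARGET.  In outline (a sketch only; the real code spills/restores the full register state and processes two lanes per loop iteration):

#include <math.h>

// Outline of the shared .LBL_*_3 / *_6 / *_7 fallback tail.
static void
handle_special_lanes (const float *input, float *result, unsigned int mask, int nlanes)
{
  for (int i = 0; i < nlanes; i++)
    if (mask & (1u << i))
      result[i] = expf (input[i]);   // per-lane call to the scalar routine
}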
*/ + +#include <sysdep.h> +#include "svml_s_expf_data.h" + + .text +ENTRY(_ZGVdN8v_expf_avx2) +/* + ALGORITHM DESCRIPTION: + + Argument representation: + M = rint(X*2^k/ln2) = 2^k*N+j + X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r + then -ln2/2^(k+1) < r < ln2/2^(k+1) + Alternatively: + M = trunc(X*2^k/ln2) + then 0 < r < ln2/2^k + + Result calculation: + exp(X) = exp(N*ln2 + ln2*(j/2^k) + r) + = 2^N * 2^(j/2^k) * exp(r) + 2^N is calculated by bit manipulation + 2^(j/2^k) is computed from table lookup + exp(r) is approximated by polynomial + + The table lookup is skipped if k = 0. + For low accuracy approximation, exp(r) ~ 1 or 1+r. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_sexp_data@GOTPCREL(%rip), %rax + vmovaps %ymm0, %ymm2 + vmovups __sInvLn2(%rax), %ymm7 + vmovups __sShifter(%rax), %ymm4 + vmovups __sLn2hi(%rax), %ymm3 + vmovups __sPC5(%rax), %ymm1 + +/* m = x*2^k/ln2 + shifter */ + vfmadd213ps %ymm4, %ymm2, %ymm7 + +/* n = m - shifter = rint(x*2^k/ln2) */ + vsubps %ymm4, %ymm7, %ymm0 + vpaddd __iBias(%rax), %ymm7, %ymm4 + +/* remove sign of x by "and" operation */ + vandps __iAbsMask(%rax), %ymm2, %ymm5 + +/* compare against threshold */ + vpcmpgtd __iDomainRange(%rax), %ymm5, %ymm6 + +/* r = x-n*ln2_hi/2^k */ + vmovaps %ymm2, %ymm5 + vfnmadd231ps %ymm0, %ymm3, %ymm5 + +/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */ + vfnmadd132ps __sLn2lo(%rax), %ymm5, %ymm0 + +/* c5*r+c4 */ + vfmadd213ps __sPC4(%rax), %ymm0, %ymm1 + +/* (c5*r+c4)*r+c3 */ + vfmadd213ps __sPC3(%rax), %ymm0, %ymm1 + +/* ((c5*r+c4)*r+c3)*r+c2 */ + vfmadd213ps __sPC2(%rax), %ymm0, %ymm1 + +/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */ + vfmadd213ps __sPC1(%rax), %ymm0, %ymm1 + +/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */ + vfmadd213ps __sPC0(%rax), %ymm0, %ymm1 + +/* set mask for overflow/underflow */ + vmovmskps %ymm6, %ecx + +/* compute 2^N with "shift" */ + vpslld $23, %ymm4, %ymm6 + +/* 2^N*exp(r) */ + vmulps %ymm1, %ymm6, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %ymm2, 320(%rsp) + vmovups %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovups 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 
272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 324(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(__expf_finite) + + vmovss %xmm0, 388(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 320(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(__expf_finite) + + vmovss %xmm0, 384(%rsp,%r15,8) + jmp .LBL_1_7 + +END(_ZGVdN8v_expf_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S new file mode 100644 index 0000000000..8ab03195c6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized logf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN16v_logf) + .type _ZGVeN16v_logf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN16v_logf_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN16v_logf_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN16v_logf_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN16v_logf) + +#define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper +#include "../svml_s_logf16_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S new file mode 100644 index 0000000000..7ff6fff848 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S @@ -0,0 +1,416 @@ +/* Function logf vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
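   Note on the resolver above: the 16-lane entry points choose between three
   kernels rather than two.  AVX512DQ machines get the _skx variant, plain
   AVX512F machines (Knights Landing) get the _knl variant, and everything
   else falls back to the AVX2 wrapper pulled in through the trailing
   #define/#include pair.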
*/ + +#include <sysdep.h> +#include "svml_s_logf_data.h" +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16v_logf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_logf +#else +/* + ALGORITHM DESCRIPTION: + + log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3 + log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3 + + R = mantissa_x - 1, if mantissa_x<4/3 + R = 0.5*mantissa_x - 1, if mantissa_x>4/3 + |R|< 1/3 + + log(1+R) is approximated as a polynomial: degree 9 for 1-ulp, + degree 7 for 4-ulp, degree 3 for half-precision. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_slog_data@GOTPCREL(%rip), %rax + movl $-1, %ecx + +/* reduction: compute r,n */ + vpsubd _iBrkValue(%rax), %zmm0, %zmm2 + vmovups _sPoly_7(%rax), %zmm7 + vpandd _iOffExpoMask(%rax), %zmm2, %zmm3 + +/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */ + vpsrad $23, %zmm2, %zmm4 + +/* check for working range, + set special argument mask (denormals/zero/Inf/NaN) + */ + vpaddd _iHiDelta(%rax), %zmm0, %zmm1 + +/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */ + vpaddd _iBrkValue(%rax), %zmm3, %zmm6 + vpcmpd $1, _iLoRange(%rax), %zmm1, %k1 + vcvtdq2ps {rn-sae}, %zmm4, %zmm1 + +/* reduced argument R */ + vsubps _sOne(%rax), %zmm6, %zmm8 + vpbroadcastd %ecx, %zmm5{%k1}{z} + +/* polynomial evaluation starts here */ + vfmadd213ps _sPoly_6(%rax), %zmm8, %zmm7 + vptestmd %zmm5, %zmm5, %k0 + kmovw %k0, %ecx + vfmadd213ps _sPoly_5(%rax), %zmm8, %zmm7 + vfmadd213ps _sPoly_4(%rax), %zmm8, %zmm7 + vfmadd213ps _sPoly_3(%rax), %zmm8, %zmm7 + vfmadd213ps _sPoly_2(%rax), %zmm8, %zmm7 + vfmadd213ps _sPoly_1(%rax), %zmm8, %zmm7 + vmulps %zmm8, %zmm7, %zmm9 + +/* polynomial evaluation end */ + vfmadd213ps %zmm8, %zmm8, %zmm9 + +/* + final reconstruction: + add exponent_value*log2 to polynomial result + */ + vfmadd132ps _sLn2(%rax), %zmm9, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 
1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + call JUMPTARGET(__logf_finite) + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + call JUMPTARGET(__logf_finite) + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_1_7 +#endif +END (_ZGVeN16v_logf_knl) + +ENTRY (_ZGVeN16v_logf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_logf +#else +/* + ALGORITHM DESCRIPTION: + + log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3 + log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3 + + R = mantissa_x - 1, if mantissa_x<4/3 + R = 0.5*mantissa_x - 1, if mantissa_x>4/3 + |R|< 1/3 + + log(1+R) is approximated as a polynomial: degree 9 for 1-ulp, + degree 7 for 4-ulp, degree 3 for half-precision. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_slog_data@GOTPCREL(%rip), %rax + vmovups .L_2il0floatpacket.7(%rip), %zmm6 + vmovups _iBrkValue(%rax), %zmm4 + vmovups _sPoly_7(%rax), %zmm8 + +/* + check for working range, + set special argument mask (denormals/zero/Inf/NaN) + */ + vpaddd _iHiDelta(%rax), %zmm0, %zmm1 + +/* reduction: compute r,n */ + vpsubd %zmm4, %zmm0, %zmm2 + vpcmpd $5, _iLoRange(%rax), %zmm1, %k1 + +/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */ + vpsrad $23, %zmm2, %zmm5 + vpandd _iOffExpoMask(%rax), %zmm2, %zmm3 + +/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */ + vpaddd %zmm4, %zmm3, %zmm7 + +/* reduced argument R */ + vsubps _sOne(%rax), %zmm7, %zmm9 + +/* polynomial evaluation starts here */ + vfmadd213ps _sPoly_6(%rax), %zmm9, %zmm8 + vfmadd213ps _sPoly_5(%rax), %zmm9, %zmm8 + vfmadd213ps _sPoly_4(%rax), %zmm9, %zmm8 + vfmadd213ps _sPoly_3(%rax), %zmm9, %zmm8 + vfmadd213ps _sPoly_2(%rax), %zmm9, %zmm8 + vfmadd213ps _sPoly_1(%rax), %zmm9, %zmm8 + vmulps %zmm9, %zmm8, %zmm10 + +/* polynomial evaluation end */ + vfmadd213ps %zmm9, %zmm9, %zmm10 + vpandnd %zmm1, %zmm1, %zmm6{%k1} + vptestmd %zmm6, %zmm6, %k0 + vcvtdq2ps {rn-sae}, %zmm5, %zmm1 + kmovw %k0, %ecx + +/* + final reconstruction: + add exponent_value*log2 to polynomial result + */ + vfmadd132ps _sLn2(%rax), %zmm10, %zmm1 + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups 
%zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__logf_finite) + + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__logf_finite) + + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_2_7 + +#endif +END (_ZGVeN16v_logf_skx) + + .section .rodata, "a" +.L_2il0floatpacket.7: + .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff + .type .L_2il0floatpacket.7,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S new file mode 100644 index 0000000000..4e0e36d5bd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized logf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN4v_logf) + .type _ZGVbN4v_logf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN4v_logf_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN4v_logf_sse2(%rip), %rax + ret +END (_ZGVbN4v_logf) +libmvec_hidden_def (_ZGVbN4v_logf) + +#define _ZGVbN4v_logf _ZGVbN4v_logf_sse2 +#include "../svml_s_logf4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S new file mode 100644 index 0000000000..156face181 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S @@ -0,0 +1,194 @@ +/* Function logf vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_logf_data.h" + + .text +ENTRY (_ZGVbN4v_logf_sse4) +/* + ALGORITHM DESCRIPTION: + + log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3 + log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3 + + R = mantissa_x - 1, if mantissa_x<4/3 + R = 0.5*mantissa_x - 1, if mantissa_x>4/3 + |R|< 1/3 + + log(1+R) is approximated as a polynomial: degree 9 for 1-ulp, + degree 7 for 4-ulp, degree 3 for half-precision. 
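
   The scalar C model below restates this reduction for readability.  It
   assumes _iBrkValue is the encoding of 2/3 (0x3f2aaaab) and _iOffExpoMask
   is 0x007fffff, and it substitutes plain Taylor terms for the minimax
   _sPoly_* coefficients kept in svml_s_logf_data.h, so it illustrates the
   scheme rather than reproducing the exact results.  Special arguments
   (zero, negative, denormal, Inf, NaN) are assumed to have been filtered to
   __logf_finite by the mask logic and are not handled here.

   #include <stdint.h>
   #include <string.h>

   static float
   logf_model (float x)
   {
     uint32_t ix, u, im;
     float m, r, p;
     int32_t n;

     memcpy (&ix, &x, sizeof ix);
     u = ix - 0x3f2aaaabu;                  // assumed _iBrkValue (bits of 2/3)
     n = (int32_t) u >> 23;                 // exponent_x, or exponent_x + 1 past 4/3
     im = (u & 0x007fffffu) + 0x3f2aaaabu;  // mantissa_x or 0.5*mantissa_x, in [2/3, 4/3)
     memcpy (&m, &im, sizeof m);
     r = m - 1.0f;                          // reduced argument R, |R| < 1/3

     p = -1.0f / 8.0f;                      // Taylor stand-ins for _sPoly_7.._sPoly_1
     p = p * r + 1.0f / 7.0f;
     p = p * r - 1.0f / 6.0f;
     p = p * r + 1.0f / 5.0f;
     p = p * r - 1.0f / 4.0f;
     p = p * r + 1.0f / 3.0f;
     p = p * r - 1.0f / 2.0f;
     p = r + r * r * p;                     // log(1+R)

     return p + (float) n * 0.693147182f;   // add exponent_value * log(2)
   }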
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + +/* reduction: compute r,n */ + movaps %xmm0, %xmm2 + +/* check for working range, + set special argument mask (denormals/zero/Inf/NaN) */ + movq __svml_slog_data@GOTPCREL(%rip), %rax + movdqu _iHiDelta(%rax), %xmm1 + movdqu _iLoRange(%rax), %xmm4 + paddd %xmm0, %xmm1 + movdqu _iBrkValue(%rax), %xmm3 + pcmpgtd %xmm1, %xmm4 + movdqu _iOffExpoMask(%rax), %xmm1 + psubd %xmm3, %xmm2 + pand %xmm2, %xmm1 + +/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */ + psrad $23, %xmm2 + paddd %xmm3, %xmm1 + movups _sPoly_7(%rax), %xmm5 + +/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */ + cvtdq2ps %xmm2, %xmm6 + +/* reduced argument R */ + subps _sOne(%rax), %xmm1 + movmskps %xmm4, %ecx + +/* final reconstruction: + add exponent_value*log2 to polynomial result */ + mulps _sLn2(%rax), %xmm6 + +/* polynomial evaluation starts here */ + mulps %xmm1, %xmm5 + addps _sPoly_6(%rax), %xmm5 + mulps %xmm1, %xmm5 + addps _sPoly_5(%rax), %xmm5 + mulps %xmm1, %xmm5 + addps _sPoly_4(%rax), %xmm5 + mulps %xmm1, %xmm5 + addps _sPoly_3(%rax), %xmm5 + mulps %xmm1, %xmm5 + addps _sPoly_2(%rax), %xmm5 + mulps %xmm1, %xmm5 + addps _sPoly_1(%rax), %xmm5 + mulps %xmm1, %xmm5 + +/* polynomial evaluation end */ + mulps %xmm1, %xmm5 + addps %xmm5, %xmm1 + addps %xmm6, %xmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movdqa %xmm1, %xmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm0, 192(%rsp) + movups %xmm1, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %dl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %ecx, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %eax, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore (%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + movss 196(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__logf_finite) + + movss %xmm0, 260(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + movss 192(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(__logf_finite) + + movss %xmm0, 256(%rsp,%r15,8) + jmp .LBL_1_7 + +END (_ZGVbN4v_logf_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S new file mode 100644 index 0000000000..f4b82de3d4 --- /dev/null +++ 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized logf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN8v_logf) + .type _ZGVdN8v_logf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN8v_logf_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN8v_logf_sse_wrapper(%rip), %rax + ret +END (_ZGVdN8v_logf) +libmvec_hidden_def (_ZGVdN8v_logf) + +#define _ZGVdN8v_logf _ZGVdN8v_logf_sse_wrapper +#include "../svml_s_logf8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S new file mode 100644 index 0000000000..994af91ffe --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S @@ -0,0 +1,184 @@ +/* Function logf vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_logf_data.h" + + .text +ENTRY(_ZGVdN8v_logf_avx2) +/* + ALGORITHM DESCRIPTION: + + log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3 + log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3 + + R = mantissa_x - 1, if mantissa_x<4/3 + R = 0.5*mantissa_x - 1, if mantissa_x>4/3 + |R|< 1/3 + + log(1+R) is approximated as a polynomial: degree 9 for 1-ulp, + degree 7 for 4-ulp, degree 3 for half-precision. 
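
   One detail shared by every logf variant in this patch but not spelled out
   above is the cheap special-argument test: adding _iHiDelta biases the
   input so that a single signed compare against _iLoRange flags denormals,
   zero, Inf and NaN in one step, and only the flagged lanes take the
   call-out to __logf_finite.  A sketch, with the two constants treated as
   opaque table values:

   #include <stdint.h>

   // Nonzero when the lane holding the float with bit pattern ix must be
   // recomputed by the scalar __logf_finite fallback.
   static int
   logf_needs_slow_path (uint32_t ix, uint32_t hi_delta, int32_t lo_range)
   {
     return (int32_t) (ix + hi_delta) < lo_range;   // mirrors paddd + pcmpgtd
   }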
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_slog_data@GOTPCREL(%rip), %rax + vmovaps %ymm0, %ymm2 + vmovups _iBrkValue(%rax), %ymm6 + vmovups _iLoRange(%rax), %ymm1 +/* check for working range, + set special argument mask (denormals/zero/Inf/NaN) */ + vpaddd _iHiDelta(%rax), %ymm2, %ymm7 + +/* reduction: compute r,n */ + vpsubd %ymm6, %ymm2, %ymm4 + +/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */ + vpsrad $23, %ymm4, %ymm3 + vpand _iOffExpoMask(%rax), %ymm4, %ymm5 + vmovups _sPoly_7(%rax), %ymm4 + vcvtdq2ps %ymm3, %ymm0 + +/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */ + vpaddd %ymm6, %ymm5, %ymm3 + +/* reduced argument R */ + vsubps _sOne(%rax), %ymm3, %ymm5 + +/* polynomial evaluation starts here */ + vfmadd213ps _sPoly_6(%rax), %ymm5, %ymm4 + vfmadd213ps _sPoly_5(%rax), %ymm5, %ymm4 + vfmadd213ps _sPoly_4(%rax), %ymm5, %ymm4 + vfmadd213ps _sPoly_3(%rax), %ymm5, %ymm4 + vfmadd213ps _sPoly_2(%rax), %ymm5, %ymm4 + vfmadd213ps _sPoly_1(%rax), %ymm5, %ymm4 + vmulps %ymm5, %ymm4, %ymm6 + +/* polynomial evaluation end */ + vfmadd213ps %ymm5, %ymm5, %ymm6 + vpcmpgtd %ymm7, %ymm1, %ymm1 + vmovmskps %ymm1, %ecx + +/* final reconstruction: + add exponent_value*log2 to polynomial result */ + vfmadd132ps _sLn2(%rax), %ymm6, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %ymm2, 320(%rsp) + vmovups %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovups 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 324(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(__logf_finite) + + vmovss %xmm0, 388(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 320(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(__logf_finite) + + vmovss %xmm0, 384(%rsp,%r15,8) + jmp .LBL_1_7 + +END(_ZGVdN8v_logf_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S new file mode 100644 index 0000000000..6d10c7576f --- /dev/null +++ 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized powf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN16vv_powf) + .type _ZGVeN16vv_powf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN16vv_powf_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN16vv_powf_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN16vv_powf) + +#define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper +#include "../svml_s_powf16_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S new file mode 100644 index 0000000000..fc91a092b0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S @@ -0,0 +1,653 @@ +/* Function powf vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_powf_data.h" +#include "svml_s_wrapper_impl.h" + +/* + ALGORITHM DESCRIPTION: + + We are using the next identity : pow(x,y) = 2^(y * log2(x)). + + 1) log2(x) calculation + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. + Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. + + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. + Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... 
= + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where + cq=X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + Log2 result is split by three parts: HH+HL+HLL + + 2) Calculation of y*log2(x) + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(y*log2(x)) + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). Hence + 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + We compute 2^(PH+PL+PLL) as follows: + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. + Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). + Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. */ + + .text +ENTRY (_ZGVeN16vv_powf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_spow_data@GOTPCREL(%rip), %rdx + vmovaps %zmm1, %zmm9 + vshuff32x4 $238, %zmm0, %zmm0, %zmm7 + kxnorw %k3, %k3, %k3 + vcvtps2pd %ymm0, %zmm14 + vcvtps2pd %ymm7, %zmm10 + movl $-1, %eax + movq $-1, %rcx + vpandd _ABSMASK(%rdx), %zmm9, %zmm4 + vmovups _ExpMask(%rdx), %zmm6 + +/* exponent bits selection */ + vpsrlq $20, %zmm14, %zmm13 + vshuff32x4 $238, %zmm9, %zmm9, %zmm8 + vpcmpd $5, _INF(%rdx), %zmm4, %k2 + vpsrlq $32, %zmm13, %zmm15 + vcvtps2pd %ymm8, %zmm2 + vmovups _Two10(%rdx), %zmm4 + vpmovqd %zmm15, %ymm12 + vcvtps2pd %ymm9, %zmm1 + vpsubd _NMINNORM(%rdx), %zmm0, %zmm3 + vpbroadcastd %eax, %zmm8{%k2}{z} + vpcmpd $5, _NMAXVAL(%rdx), %zmm3, %k1 + +/* preserve mantissa, set input exponent to 2^(-10) */ + vmovaps %zmm6, %zmm3 + vpternlogq $248, %zmm6, %zmm10, %zmm4 + vpsrlq $20, %zmm10, %zmm10 + vpternlogq $234, _Two10(%rdx), %zmm14, %zmm3 + +/* reciprocal approximation good to at least 11 bits */ + vrcp28pd %zmm4, %zmm11 + vpsrlq $32, %zmm10, %zmm14 + vpbroadcastd %eax, %zmm7{%k1}{z} + kxnorw %k1, %k1, %k1 + vrcp28pd %zmm3, %zmm5 + vpmovqd %zmm14, %ymm6 + vshufi32x4 $68, %zmm6, %zmm12, %zmm13 + vmovups _One(%rdx), %zmm6 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vrndscalepd $8, %zmm5, %zmm14 + +/* biased exponent in DP format */ + vshuff32x4 $238, %zmm13, %zmm13, %zmm5 + vrndscalepd $8, %zmm11, %zmm11 + vcmppd $30, _Threshold(%rdx), %zmm14, %k2 + vcvtdq2pd %ymm13, %zmm10 + vcvtdq2pd %ymm5, %zmm15 + +/* table lookup */ + vpsrlq $40, %zmm14, %zmm13 + vpxord %zmm5, %zmm5, %zmm5 + vgatherqpd _Log2Rcp_lookup(%rdx,%zmm13), %zmm5{%k3} + vfmsub213pd %zmm6, %zmm14, %zmm3 + vfmsub213pd %zmm6, %zmm11, %zmm4 + vcmppd $30, _Threshold(%rdx), %zmm11, %k3 + vpbroadcastq %rcx, %zmm14{%k2}{z} + +/* dpP= _dbT+lJ*T_ITEM_GRAN */ + kxnorw %k2, %k2, %k2 + vpsrlq $40, %zmm11, %zmm12 + vpxord %zmm6, %zmm6, %zmm6 + vpbroadcastq %rcx, %zmm11{%k3}{z} + kxnorw %k3, %k3, %k3 + vgatherqpd _Log2Rcp_lookup(%rdx,%zmm12), %zmm6{%k1} + vmovups _Bias1(%rdx), %zmm12 + vpternlogq $236, _Bias(%rdx), %zmm12, %zmm14 + vpternlogq $248, _Bias(%rdx), %zmm11, %zmm12 + 
vsubpd %zmm14, %zmm10, %zmm13 + vsubpd %zmm12, %zmm15, %zmm10 + vmovups _poly_coeff_3(%rdx), %zmm11 + vmovups _poly_coeff_4(%rdx), %zmm15 + vfmadd213pd %zmm15, %zmm4, %zmm11 + vmulpd %zmm4, %zmm4, %zmm12 + vmovaps %zmm15, %zmm14 + vmulpd %zmm3, %zmm3, %zmm15 + vfmadd231pd _poly_coeff_3(%rdx), %zmm3, %zmm14 + +/* reconstruction */ + vfmadd213pd %zmm4, %zmm12, %zmm11 + vfmadd213pd %zmm3, %zmm15, %zmm14 + vaddpd %zmm6, %zmm11, %zmm11 + vaddpd %zmm5, %zmm14, %zmm3 + vfmadd231pd _L2(%rdx), %zmm10, %zmm11 + vfmadd132pd _L2(%rdx), %zmm3, %zmm13 + vmulpd %zmm2, %zmm11, %zmm12 + vmulpd %zmm1, %zmm13, %zmm10 + vmulpd __dbInvLn2(%rdx), %zmm12, %zmm6 + +/* hi bits */ + vpsrlq $32, %zmm12, %zmm12 + vmulpd __dbInvLn2(%rdx), %zmm10, %zmm1 + +/* to round down; if dR is an integer we will get R = 1, which is ok */ + vsubpd __dbHALF(%rdx), %zmm6, %zmm4 + vpsrlq $32, %zmm10, %zmm11 + vpmovqd %zmm11, %ymm3 + vsubpd __dbHALF(%rdx), %zmm1, %zmm2 + vaddpd __dbShifter(%rdx), %zmm4, %zmm14 + vpmovqd %zmm12, %ymm4 + vshufi32x4 $68, %zmm4, %zmm3, %zmm5 + vpxord %zmm4, %zmm4, %zmm4 + vaddpd __dbShifter(%rdx), %zmm2, %zmm2 + +/* iAbsX = iAbsX&iAbsMask; */ + vpandd __iAbsMask(%rdx), %zmm5, %zmm11 + vpxord %zmm5, %zmm5, %zmm5 + vsubpd __dbShifter(%rdx), %zmm14, %zmm13 + +/* iRangeMask = (iAbsX>iDomainRange) */ + vpcmpgtd __iDomainRange(%rdx), %zmm11, %k1 + vsubpd __dbShifter(%rdx), %zmm2, %zmm15 + vpbroadcastd %eax, %zmm10{%k1}{z} + vpternlogd $254, %zmm8, %zmm7, %zmm10 + +/* [0..1) */ + vsubpd %zmm15, %zmm1, %zmm1 + +/* low K bits */ + vpandq __lbLOWKBITS(%rdx), %zmm14, %zmm11 + vgatherqpd 13952(%rdx,%zmm11,8), %zmm5{%k3} + vsubpd %zmm13, %zmm6, %zmm7 + vptestmd %zmm10, %zmm10, %k0 + vpandq __lbLOWKBITS(%rdx), %zmm2, %zmm10 + vmulpd __dbC1(%rdx), %zmm1, %zmm1 + vmulpd __dbC1(%rdx), %zmm7, %zmm3 + vpsrlq $11, %zmm2, %zmm8 + vpsrlq $11, %zmm14, %zmm2 + +/* NB : including +/- sign for the exponent!! 
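   A note on the shift just below: vpsllq $52 plants the signed integer part
   N of y*log2|x| directly in the exponent field of a binary64, so the later
   vpaddq is what realises the 2^N factor from step 3 of the description at
   the top of this file; lanes whose scaled argument is out of range were
   already gathered into the iRangeMask and are redone through __powf_finite.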
*/ + vpsllq $52, %zmm8, %zmm8 + kmovw %k0, %ecx + vpsllq $52, %zmm2, %zmm6 + vfmadd213pd %zmm5, %zmm3, %zmm5 + vgatherqpd 13952(%rdx,%zmm10,8), %zmm4{%k2} + vfmadd213pd %zmm4, %zmm1, %zmm4 + vpaddq %zmm6, %zmm5, %zmm10 + vcvtpd2ps %zmm10, %ymm12 + vpaddq %zmm8, %zmm4, %zmm7 + vcvtpd2ps %zmm7, %ymm11 + vshuff32x4 $68, %zmm12, %zmm11, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm9, 1216(%rsp) + vmovups %zmm1, 1280(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1280(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + vmovss 1220(%rsp,%r15,8), %xmm1 + call JUMPTARGET(__powf_finite) + vmovss %xmm0, 1284(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + vmovss 1216(%rsp,%r15,8), %xmm1 + call JUMPTARGET(__powf_finite) + vmovss %xmm0, 1280(%rsp,%r15,8) + jmp .LBL_1_7 +#endif +END (_ZGVeN16vv_powf_knl) + +ENTRY (_ZGVeN16vv_powf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_spow_data@GOTPCREL(%rip), %rax + vextractf32x8 $1, %zmm1, %ymm14 + vextractf32x8 $1, %zmm0, %ymm15 + vpsubd _NMINNORM(%rax), %zmm0, %zmm9 + vmovups %zmm26, 1280(%rsp) + vmovups _ExpMask(%rax), %zmm6 + vpcmpd $1, _NMAXVAL(%rax), %zmm9, %k1 
+ vcvtps2pd %ymm0, %zmm5 + vcvtps2pd %ymm1, %zmm12 + kxnorw %k3, %k3, %k3 + +/* exponent bits selection */ + vpsrlq $20, %zmm5, %zmm3 + vpsrlq $32, %zmm3, %zmm2 + vpmovqd %zmm2, %ymm11 + vcvtps2pd %ymm14, %zmm13 + vmovups .L_2il0floatpacket.23(%rip), %zmm14 + vmovaps %zmm14, %zmm26 + vpandd _ABSMASK(%rax), %zmm1, %zmm8 + vpcmpd $1, _INF(%rax), %zmm8, %k2 + vpandnd %zmm9, %zmm9, %zmm26{%k1} + vmovups _Two10(%rax), %zmm9 + kxnorw %k1, %k1, %k1 + vcvtps2pd %ymm15, %zmm4 + vmovaps %zmm14, %zmm15 + +/* preserve mantissa, set input exponent to 2^(-10) */ + vpternlogq $248, %zmm6, %zmm4, %zmm9 + vpsrlq $20, %zmm4, %zmm4 + +/* reciprocal approximation good to at least 11 bits */ + vrcp14pd %zmm9, %zmm10 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vrndscalepd $8, %zmm10, %zmm3 + vmovups _One(%rax), %zmm10 + vfmsub213pd %zmm10, %zmm3, %zmm9 + vpandnd %zmm8, %zmm8, %zmm15{%k2} + vmovaps %zmm6, %zmm8 + vpternlogq $234, _Two10(%rax), %zmm5, %zmm8 + vpsrlq $32, %zmm4, %zmm5 + vrcp14pd %zmm8, %zmm7 + vpmovqd %zmm5, %ymm6 + vrndscalepd $8, %zmm7, %zmm2 + vfmsub213pd %zmm10, %zmm2, %zmm8 + +/* table lookup */ + vpsrlq $40, %zmm2, %zmm10 + vinserti32x8 $1, %ymm6, %zmm11, %zmm4 + vpsrlq $40, %zmm3, %zmm11 + +/* biased exponent in DP format */ + vextracti32x8 $1, %zmm4, %ymm7 + vcvtdq2pd %ymm4, %zmm6 + vpmovqd %zmm10, %ymm4 + vpmovqd %zmm11, %ymm5 + vpxord %zmm10, %zmm10, %zmm10 + vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} + vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4 + vpxord %zmm11, %zmm11, %zmm11 + vcvtdq2pd %ymm7, %zmm7 + vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} + vmovups _Threshold(%rax), %zmm5 + vcmppd $21, %zmm2, %zmm5, %k2 + vcmppd $21, %zmm3, %zmm5, %k3 + vmovups _Bias1(%rax), %zmm3 + vmovaps %zmm4, %zmm2 + vpandnq %zmm5, %zmm5, %zmm2{%k2} + vpternlogq $236, _Bias(%rax), %zmm3, %zmm2 + +/* dpP= _dbT+lJ*T_ITEM_GRAN */ + kxnorw %k2, %k2, %k2 + vpandnq %zmm5, %zmm5, %zmm4{%k3} + vpternlogq $248, _Bias(%rax), %zmm4, %zmm3 + vsubpd %zmm2, %zmm6, %zmm4 + vmovups _poly_coeff_3(%rax), %zmm6 + vmovups _poly_coeff_4(%rax), %zmm2 + vsubpd %zmm3, %zmm7, %zmm5 + vmulpd %zmm8, %zmm8, %zmm7 + vfmadd213pd %zmm2, %zmm9, %zmm6 + kxnorw %k3, %k3, %k3 + vmovaps %zmm2, %zmm3 + vmulpd %zmm9, %zmm9, %zmm2 + vfmadd231pd _poly_coeff_3(%rax), %zmm8, %zmm3 + +/* reconstruction */ + vfmadd213pd %zmm9, %zmm2, %zmm6 + vfmadd213pd %zmm8, %zmm7, %zmm3 + vaddpd %zmm11, %zmm6, %zmm8 + vaddpd %zmm10, %zmm3, %zmm9 + vfmadd231pd _L2(%rax), %zmm5, %zmm8 + vfmadd132pd _L2(%rax), %zmm9, %zmm4 + vmulpd %zmm13, %zmm8, %zmm13 + vmulpd %zmm12, %zmm4, %zmm3 + vmulpd __dbInvLn2(%rax), %zmm13, %zmm10 + vmulpd __dbInvLn2(%rax), %zmm3, %zmm8 + +/* hi bits */ + vpsrlq $32, %zmm3, %zmm4 + vpsrlq $32, %zmm13, %zmm13 + +/* to round down; if dR is an integer we will get R = 1, which is ok */ + vsubpd __dbHALF(%rax), %zmm8, %zmm12 + vpmovqd %zmm4, %ymm5 + vpmovqd %zmm13, %ymm2 + vsubpd __dbHALF(%rax), %zmm10, %zmm9 + vaddpd __dbShifter(%rax), %zmm12, %zmm7 + vaddpd __dbShifter(%rax), %zmm9, %zmm9 + vsubpd __dbShifter(%rax), %zmm7, %zmm11 + vsubpd __dbShifter(%rax), %zmm9, %zmm12 + vinserti32x8 $1, %ymm2, %zmm5, %zmm3 + +/* iAbsX = iAbsX&iAbsMask */ + vpandd __iAbsMask(%rax), %zmm3, %zmm4 + +/* iRangeMask = (iAbsX>iDomainRange) */ + vpcmpd $2, __iDomainRange(%rax), %zmm4, %k1 + vpandnd %zmm4, %zmm4, %zmm14{%k1} + vpternlogd $254, %zmm15, %zmm26, %zmm14 + +/* [0..1) */ + vsubpd %zmm11, %zmm8, %zmm15 + vsubpd %zmm12, %zmm10, %zmm26 + vptestmd %zmm14, %zmm14, %k0 + vpsrlq $11, %zmm7, %zmm8 + vpsrlq $11, 
%zmm9, %zmm10 + vmulpd __dbC1(%rax), %zmm26, %zmm26 + vmulpd __dbC1(%rax), %zmm15, %zmm15 + +/* NB : including +/- sign for the exponent!! */ + vpsllq $52, %zmm10, %zmm13 + vpsllq $52, %zmm8, %zmm12 + kmovw %k0, %ecx + +/* low K bits */ + vpandq __lbLOWKBITS(%rax), %zmm9, %zmm14 + vpandq __lbLOWKBITS(%rax), %zmm7, %zmm6 + vpmovqd %zmm14, %ymm7 + vpmovqd %zmm6, %ymm9 + vpxord %zmm2, %zmm2, %zmm2 + vgatherdpd 13952(%rax,%ymm7,8), %zmm2{%k3} + vfmadd213pd %zmm2, %zmm26, %zmm2 + vpaddq %zmm13, %zmm2, %zmm2 + vcvtpd2ps %zmm2, %ymm4 + vpxord %zmm11, %zmm11, %zmm11 + vgatherdpd 13952(%rax,%ymm9,8), %zmm11{%k2} + vfmadd213pd %zmm11, %zmm15, %zmm11 + vpaddq %zmm12, %zmm11, %zmm3 + vcvtpd2ps %zmm3, %ymm5 + vinsertf32x8 $1, %ymm4, %zmm5, %zmm2 + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovups 1280(%rsp), %zmm26 + vmovaps %zmm2, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1088(%rsp) + vmovups %zmm1, 1152(%rsp) + vmovups %zmm2, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 984(%rsp) + kmovw %k5, 976(%rsp) + kmovw %k6, 968(%rsp) + kmovw %k7, 960(%rsp) + vmovups %zmm16, 896(%rsp) + vmovups %zmm17, 832(%rsp) + vmovups %zmm18, 768(%rsp) + vmovups %zmm19, 704(%rsp) + vmovups %zmm20, 640(%rsp) + vmovups %zmm21, 576(%rsp) + vmovups %zmm22, 512(%rsp) + vmovups %zmm23, 448(%rsp) + vmovups %zmm24, 384(%rsp) + vmovups %zmm25, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1000(%rsp) + movq %rdi, 992(%rsp) + movq %r12, 1032(%rsp) + cfi_offset_rel_rsp (12, 1032) + movb %dl, %r12b + movq %r13, 1024(%rsp) + cfi_offset_rel_rsp (13, 1024) + movl %ecx, %r13d + movq %r14, 1016(%rsp) + cfi_offset_rel_rsp (14, 1016) + movl %eax, %r14d + movq %r15, 1008(%rsp) + cfi_offset_rel_rsp (15, 1008) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 984(%rsp), %k4 + kmovw 976(%rsp), %k5 + kmovw 968(%rsp), %k6 + kmovw 960(%rsp), %k7 + vmovups 896(%rsp), %zmm16 + vmovups 832(%rsp), %zmm17 + vmovups 768(%rsp), %zmm18 + vmovups 704(%rsp), %zmm19 + vmovups 640(%rsp), %zmm20 + vmovups 576(%rsp), %zmm21 + vmovups 512(%rsp), %zmm22 + vmovups 448(%rsp), %zmm23 + vmovups 384(%rsp), %zmm24 + vmovups 320(%rsp), %zmm25 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm2 + movq 1000(%rsp), %rsi + movq 992(%rsp), %rdi + movq 1032(%rsp), %r12 + cfi_restore (%r12) + movq 1024(%rsp), %r13 + cfi_restore (%r13) + movq 1016(%rsp), %r14 + cfi_restore (%r14) + movq 1008(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm1 + vzeroupper + vmovss 1092(%rsp,%r15,8), %xmm0 + call JUMPTARGET(__powf_finite) + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm1 + vzeroupper + vmovss 1088(%rsp,%r15,8), %xmm0 + call JUMPTARGET(__powf_finite) + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_2_7 +#endif +END (_ZGVeN16vv_powf_skx) + + .section .rodata, "a" +.L_2il0floatpacket.23: + .long 
0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff + .type .L_2il0floatpacket.23,@object +.L_2il0floatpacket.24: + .long 0xffffffff,0xffffffff + .type .L_2il0floatpacket.24,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S new file mode 100644 index 0000000000..785b549882 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized powf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN4vv_powf) + .type _ZGVbN4vv_powf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN4vv_powf_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN4vv_powf_sse2(%rip), %rax + ret +END (_ZGVbN4vv_powf) +libmvec_hidden_def (_ZGVbN4vv_powf) + +#define _ZGVbN4vv_powf _ZGVbN4vv_powf_sse2 +#include "../svml_s_powf4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S new file mode 100644 index 0000000000..8b1b4e74bb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S @@ -0,0 +1,374 @@ +/* Function powf vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_powf_data.h" + + .text +ENTRY (_ZGVbN4vv_powf_sse4) +/* + ALGORITHM DESCRIPTION: + + We are using the next identity: pow(x,y) = 2^(y * log2(x)). + + 1) log2(x) calculation + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. + Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. 
+ + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. + Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where + cq=X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + Log2 result is split by three parts: HH+HL+HLL + + 2) Calculation of y*log2(x) + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(y*log2(x)) + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). Hence + 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + We compute 2^(PH+PL+PLL) as follows: + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. + Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). + Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $256, %rsp + movaps %xmm0, %xmm3 + movhlps %xmm0, %xmm3 + movaps %xmm1, %xmm5 + movups %xmm8, 112(%rsp) + movaps %xmm5, %xmm2 + cvtps2pd %xmm3, %xmm8 + cvtps2pd %xmm5, %xmm7 + movups %xmm9, 96(%rsp) + movaps %xmm0, %xmm4 + cvtps2pd %xmm0, %xmm9 + movq __svml_spow_data@GOTPCREL(%rip), %rdx + movups %xmm10, 176(%rsp) + movups %xmm13, 48(%rsp) + movups _ExpMask(%rdx), %xmm6 + +/* preserve mantissa, set input exponent to 2^(-10) */ + movaps %xmm6, %xmm10 + andps %xmm8, %xmm6 + andps %xmm9, %xmm10 + +/* exponent bits selection */ + psrlq $20, %xmm9 + orps _Two10(%rdx), %xmm6 + psrlq $20, %xmm8 + orps _Two10(%rdx), %xmm10 + +/* reciprocal approximation good to at least 11 bits */ + cvtpd2ps %xmm6, %xmm13 + cvtpd2ps %xmm10, %xmm1 + movlhps %xmm13, %xmm13 + movhlps %xmm5, %xmm2 + movlhps %xmm1, %xmm1 + movups %xmm12, 208(%rsp) + rcpps %xmm13, %xmm12 + movups %xmm11, 80(%rsp) + cvtps2pd %xmm2, %xmm11 + rcpps %xmm1, %xmm2 + movups %xmm14, 144(%rsp) + cvtps2pd %xmm12, %xmm14 + movups %xmm15, 160(%rsp) + cvtps2pd %xmm2, %xmm15 + shufps $221, %xmm8, %xmm9 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + roundpd $0, %xmm14, %xmm14 + +/* biased exponent in DP format */ + pshufd $238, %xmm9, %xmm8 + roundpd $0, %xmm15, %xmm15 + cvtdq2pd %xmm8, %xmm1 + mulpd %xmm15, %xmm10 + mulpd %xmm14, %xmm6 + cvtdq2pd %xmm9, %xmm2 + subpd _One(%rdx), %xmm10 + subpd _One(%rdx), %xmm6 + +/* table lookup */ + movaps %xmm14, %xmm8 + movaps %xmm15, %xmm9 + psrlq $40, %xmm8 + psrlq $40, %xmm9 + movd %xmm8, %r8d + movd %xmm9, %eax + psubd _NMINNORM(%rdx), %xmm4 + movdqu _ABSMASK(%rdx), %xmm3 + pextrd $2, %xmm8, %r9d + pand %xmm5, %xmm3 + movups _Threshold(%rdx), %xmm8 + pextrd $2, %xmm9, %ecx + movaps %xmm8, %xmm9 + cmpltpd %xmm15, %xmm9 + cmpltpd %xmm14, %xmm8 + andps _Bias(%rdx), %xmm9 + movaps %xmm10, %xmm14 + andps 
_Bias(%rdx), %xmm8 + movaps %xmm6, %xmm15 + orps _Bias1(%rdx), %xmm9 + orps _Bias1(%rdx), %xmm8 + subpd %xmm9, %xmm2 + subpd %xmm8, %xmm1 + mulpd %xmm10, %xmm14 + mulpd %xmm6, %xmm15 + mulpd _L2(%rdx), %xmm2 + mulpd _L2(%rdx), %xmm1 + movups _poly_coeff_3(%rdx), %xmm9 + movaps %xmm9, %xmm8 + mulpd %xmm10, %xmm8 + mulpd %xmm6, %xmm9 + addpd _poly_coeff_4(%rdx), %xmm8 + addpd _poly_coeff_4(%rdx), %xmm9 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm9 + +/* reconstruction */ + addpd %xmm8, %xmm10 + addpd %xmm9, %xmm6 + movslq %eax, %rax + movslq %r8d, %r8 + movslq %ecx, %rcx + movslq %r9d, %r9 + movsd _Log2Rcp_lookup(%rdx,%rax), %xmm13 + movsd _Log2Rcp_lookup(%rdx,%r8), %xmm12 + movhpd _Log2Rcp_lookup(%rdx,%rcx), %xmm13 + movhpd _Log2Rcp_lookup(%rdx,%r9), %xmm12 + addpd %xmm10, %xmm13 + addpd %xmm6, %xmm12 + addpd %xmm13, %xmm2 + addpd %xmm12, %xmm1 + mulpd %xmm7, %xmm2 + mulpd %xmm11, %xmm1 + movups __dbInvLn2(%rdx), %xmm11 + movdqa %xmm4, %xmm12 + movaps %xmm11, %xmm10 + mulpd %xmm2, %xmm10 + mulpd %xmm1, %xmm11 + +/* to round down; if dR is an integer we will get R = 1, which is ok */ + movaps %xmm10, %xmm8 + movaps %xmm11, %xmm9 + subpd __dbHALF(%rdx), %xmm8 + subpd __dbHALF(%rdx), %xmm9 + addpd __dbShifter(%rdx), %xmm8 + addpd __dbShifter(%rdx), %xmm9 + movaps %xmm8, %xmm6 + movaps %xmm9, %xmm7 + subpd __dbShifter(%rdx), %xmm6 + subpd __dbShifter(%rdx), %xmm7 + +/* [0..1) */ + subpd %xmm6, %xmm10 + subpd %xmm7, %xmm11 + mulpd __dbC1(%rdx), %xmm10 + mulpd __dbC1(%rdx), %xmm11 + +/* hi bits */ + shufps $221, %xmm1, %xmm2 + movdqu _NMAXVAL(%rdx), %xmm1 + pcmpgtd %xmm1, %xmm12 + pcmpeqd %xmm1, %xmm4 + por %xmm4, %xmm12 + movdqa %xmm3, %xmm1 + movdqu _INF(%rdx), %xmm4 + pcmpgtd %xmm4, %xmm1 + pcmpeqd %xmm4, %xmm3 + +/* iAbsX = iAbsX&iAbsMask */ + pand __iAbsMask(%rdx), %xmm2 + por %xmm3, %xmm1 + +/* iRangeMask = (iAbsX>iDomainRange) */ + pcmpgtd __iDomainRange(%rdx), %xmm2 + por %xmm1, %xmm12 + movups __lbLOWKBITS(%rdx), %xmm3 + por %xmm2, %xmm12 + +/* low K bits */ + movaps %xmm3, %xmm2 + andps %xmm9, %xmm3 + andps %xmm8, %xmm2 + psrlq $11, %xmm8 + +/* dpP= _dbT+lJ*T_ITEM_GRAN */ + movd %xmm2, %r10d + psrlq $11, %xmm9 + movd %xmm3, %ecx + +/* NB : including +/- sign for the exponent!! 
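   The same exponent-insertion trick as in the wider kernels is used here
   with psllq $52 on the packed shifter words; and because SSE4 has no
   gather instruction, the two table lookups that follow are stitched
   together from scalar movq/movhpd loads at the indices extracted with
   movd/pextrw.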
*/ + psllq $52, %xmm8 + psllq $52, %xmm9 + pextrw $4, %xmm2, %r11d + pextrw $4, %xmm3, %r8d + movmskps %xmm12, %eax + shll $3, %r10d + shll $3, %ecx + shll $3, %r11d + shll $3, %r8d + movq 13952(%rdx,%r10), %xmm6 + movq 13952(%rdx,%rcx), %xmm7 + movhpd 13952(%rdx,%r11), %xmm6 + movhpd 13952(%rdx,%r8), %xmm7 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + addpd %xmm10, %xmm6 + addpd %xmm11, %xmm7 + paddq %xmm8, %xmm6 + paddq %xmm9, %xmm7 + cvtpd2ps %xmm6, %xmm1 + cvtpd2ps %xmm7, %xmm4 + movlhps %xmm4, %xmm1 + testl %eax, %eax + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movups 112(%rsp), %xmm8 + movaps %xmm1, %xmm0 + movups 96(%rsp), %xmm9 + movups 176(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 208(%rsp), %xmm12 + movups 48(%rsp), %xmm13 + movups 144(%rsp), %xmm14 + movups 160(%rsp), %xmm15 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm0, 64(%rsp) + movups %xmm5, 128(%rsp) + movups %xmm1, 192(%rsp) + je .LBL_1_2 + + xorb %cl, %cl + xorl %edx, %edx + movq %rsi, 8(%rsp) + movq %rdi, (%rsp) + movq %r12, 40(%rsp) + cfi_offset_rel_rsp (12, 40) + movb %cl, %r12b + movq %r13, 32(%rsp) + cfi_offset_rel_rsp (13, 32) + movl %eax, %r13d + movq %r14, 24(%rsp) + cfi_offset_rel_rsp (14, 24) + movl %edx, %r14d + movq %r15, 16(%rsp) + cfi_offset_rel_rsp (15, 16) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movq 8(%rsp), %rsi + movq (%rsp), %rdi + movq 40(%rsp), %r12 + cfi_restore (%r12) + movq 32(%rsp), %r13 + cfi_restore (%r13) + movq 24(%rsp), %r14 + cfi_restore (%r14) + movq 16(%rsp), %r15 + cfi_restore (%r15) + movups 192(%rsp), %xmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + movss 68(%rsp,%r15,8), %xmm0 + movss 132(%rsp,%r15,8), %xmm1 + + call JUMPTARGET(__powf_finite) + + movss %xmm0, 196(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + movss 64(%rsp,%r15,8), %xmm0 + movss 128(%rsp,%r15,8), %xmm1 + + call JUMPTARGET(__powf_finite) + + movss %xmm0, 192(%rsp,%r15,8) + jmp .LBL_1_7 + +END (_ZGVbN4vv_powf_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S new file mode 100644 index 0000000000..1f6a07315e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized powf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
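
   The step-3 reconstruction that the powf kernels above implement with the
   dbShifter/lbLOWKBITS bit games can be restated in scalar C as below.
   expK = 7 as stated in the algorithm comments, the 2^(j/2^expK) table
   entry is recomputed with exp2 instead of being read from
   svml_s_powf_data.h, and plain Taylor terms stand in for the minimax
   B1..B5, so this is an illustration of the scheme only.  t is y*log2(x)
   as produced by steps 1 and 2, assumed already range-checked the way the
   iDomainRange test does it.

   #include <math.h>

   static double
   pow2_model (double t)                        // returns ~2^t for in-range t
   {
     const int expk = 7;
     double m = nearbyint (t * (1 << expk));    // N*2^expK + j
     int im = (int) m;
     int j = im & ((1 << expk) - 1);            // table index, 0..127
     int n = (im - j) >> expk;                  // 2^N goes into the exponent
     double z = t - m / (1 << expk);            // Z, |Z| <= 2^(-expK-1)

     double thi = exp2 ((double) j / (1 << expk));   // stands in for THI+TLO
     double u = z * 0.6931471805599453;              // ln(2): 2^Z = e^(Z*ln2)
     double p = 1.0 + u * (1.0 + u * (0.5 + u * (1.0 / 6.0
                 + u * (1.0 / 24.0 + u / 120.0))));  // degree-5, like B1..B5

     return ldexp (thi * p, n);                 // significand scaled by 2^N
   }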
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN8vv_powf) + .type _ZGVdN8vv_powf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN8vv_powf_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN8vv_powf_sse_wrapper(%rip), %rax + ret +END (_ZGVdN8vv_powf) +libmvec_hidden_def (_ZGVdN8vv_powf) + +#define _ZGVdN8vv_powf _ZGVdN8vv_powf_sse_wrapper +#include "../svml_s_powf8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S new file mode 100644 index 0000000000..683932f410 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S @@ -0,0 +1,357 @@ +/* Function powf vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_powf_data.h" + + .text +ENTRY(_ZGVdN8vv_powf_avx2) +/* + ALGORITHM DESCRIPTION: + + We are using the next identity : pow(x,y) = 2^(y * log2(x)). + + 1) log2(x) calculation + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. + Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. + + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. + Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where + cq=X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + Log2 result is split by three parts: HH+HL+HLL + + 2) Calculation of y*log2(x) + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(y*log2(x)) + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). Hence + 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + We compute 2^(PH+PL+PLL) as follows: + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. + Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). 
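In outline, the computation described above is pow(x,y) = 2^(y*log2|x|), with the log2 and 2^Z pieces driven by table lookups plus short polynomials (degree 5 for 2^Z, expK = 7). A minimal scalar C sketch of the same shape, with illustrative names only; libm's log2 and exp2 stand in for the table-driven steps, and only finite x > 0 is considered:

  #include <math.h>

  static float powf_sketch (float x, float y)
  {
    // PH + PL + PLL ~= y*log2|x|; one double carries enough precision
    // for a sketch.
    double p = (double) y * log2 ((double) x);

    // Represent p as N + Z, N integer and |Z| <= 1/2, so that
    // 2^p = 2^N * 2^Z; the vector code further splits off j/2^expK,
    // takes 2^(j/2^expK) from a table and evaluates a short polynomial
    // for the remaining 2^Z factor.
    double n = nearbyint (p);
    double z = p - n;
    return (float) ldexp (exp2 (z), (int) n);
  }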
+ Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + lea __VPACK_ODD_ind.6357.0.1(%rip), %rcx + vmovups %ymm14, 320(%rsp) + +/* hi bits */ + lea __VPACK_ODD_ind.6358.0.1(%rip), %rax + vmovups %ymm12, 256(%rsp) + vmovups %ymm9, 96(%rsp) + vmovups %ymm13, 224(%rsp) + vmovups %ymm15, 352(%rsp) + vmovups %ymm11, 384(%rsp) + vmovups %ymm10, 288(%rsp) + vmovups (%rcx), %ymm10 + vmovups %ymm8, 160(%rsp) + vmovdqa %ymm1, %ymm9 + movq __svml_spow_data@GOTPCREL(%rip), %rdx + vextractf128 $1, %ymm0, %xmm7 + vcvtps2pd %xmm0, %ymm14 + vcvtps2pd %xmm7, %ymm12 + vpsubd _NMINNORM(%rdx), %ymm0, %ymm7 + +/* preserve mantissa, set input exponent to 2^(-10) */ + vandpd _ExpMask(%rdx), %ymm14, %ymm3 + vandpd _ExpMask(%rdx), %ymm12, %ymm13 + +/* exponent bits selection */ + vpsrlq $20, %ymm12, %ymm12 + vpsrlq $20, %ymm14, %ymm14 + vextractf128 $1, %ymm9, %xmm2 + vcvtps2pd %xmm9, %ymm1 + vpand _ABSMASK(%rdx), %ymm9, %ymm8 + vcvtps2pd %xmm2, %ymm6 + vorpd _Two10(%rdx), %ymm3, %ymm2 + vorpd _Two10(%rdx), %ymm13, %ymm3 + +/* reciprocal approximation good to at least 11 bits */ + vcvtpd2ps %ymm2, %xmm5 + vcvtpd2ps %ymm3, %xmm15 + vrcpps %xmm5, %xmm4 + vrcpps %xmm15, %xmm11 + vcvtps2pd %xmm4, %ymm13 + vcvtps2pd %xmm11, %ymm4 + vpermps %ymm12, %ymm10, %ymm11 + +/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vroundpd $0, %ymm13, %ymm12 + vpermps %ymm14, %ymm10, %ymm5 + vroundpd $0, %ymm4, %ymm14 + vmovupd _One(%rdx), %ymm4 + +/* table lookup */ + vpsrlq $40, %ymm12, %ymm10 + vfmsub213pd %ymm4, %ymm12, %ymm2 + vfmsub213pd %ymm4, %ymm14, %ymm3 + vcmpgt_oqpd _Threshold(%rdx), %ymm12, %ymm12 + vxorpd %ymm4, %ymm4, %ymm4 + vandpd _Bias(%rdx), %ymm12, %ymm12 + +/* biased exponent in DP format */ + vcvtdq2pd %xmm11, %ymm13 + vpcmpeqd %ymm11, %ymm11, %ymm11 + vgatherqpd %ymm11, _Log2Rcp_lookup(%rdx,%ymm10), %ymm4 + vpsrlq $40, %ymm14, %ymm10 + vcmpgt_oqpd _Threshold(%rdx), %ymm14, %ymm14 + vpcmpeqd %ymm11, %ymm11, %ymm11 + vandpd _Bias(%rdx), %ymm14, %ymm14 + vcvtdq2pd %xmm5, %ymm15 + vxorpd %ymm5, %ymm5, %ymm5 + vgatherqpd %ymm11, _Log2Rcp_lookup(%rdx,%ymm10), %ymm5 + vorpd _Bias1(%rdx), %ymm12, %ymm11 + vorpd _Bias1(%rdx), %ymm14, %ymm10 + vsubpd %ymm11, %ymm15, %ymm11 + vsubpd %ymm10, %ymm13, %ymm14 + vmovupd _poly_coeff_4(%rdx), %ymm15 + vmovupd _poly_coeff_3(%rdx), %ymm13 + vmulpd %ymm3, %ymm3, %ymm10 + vfmadd213pd %ymm15, %ymm3, %ymm13 + vmovdqa %ymm15, %ymm12 + vfmadd231pd _poly_coeff_3(%rdx), %ymm2, %ymm12 + vmulpd %ymm2, %ymm2, %ymm15 + +/* reconstruction */ + vfmadd213pd %ymm3, %ymm10, %ymm13 + vfmadd213pd %ymm2, %ymm15, %ymm12 + vaddpd %ymm5, %ymm13, %ymm13 + vaddpd %ymm4, %ymm12, %ymm2 + vfmadd231pd _L2(%rdx), %ymm14, %ymm13 + vfmadd132pd _L2(%rdx), %ymm2, %ymm11 + vmulpd %ymm6, %ymm13, %ymm2 + vmulpd %ymm1, %ymm11, %ymm10 + vmulpd __dbInvLn2(%rdx), %ymm2, %ymm6 + vmulpd __dbInvLn2(%rdx), %ymm10, %ymm15 + +/* to round down; if dR is an integer we will get R = 1, which is ok */ + vsubpd __dbHALF(%rdx), %ymm6, %ymm3 + vsubpd __dbHALF(%rdx), %ymm15, %ymm1 + vaddpd __dbShifter(%rdx), %ymm3, %ymm13 + vaddpd __dbShifter(%rdx), %ymm1, %ymm14 + vsubpd __dbShifter(%rdx), %ymm13, %ymm12 + vmovups (%rax), %ymm1 + vsubpd __dbShifter(%rdx), %ymm14, %ymm11 + +/* [0..1) */ + vsubpd %ymm12, %ymm6, %ymm6 + vpermps 
%ymm10, %ymm1, %ymm3 + vpermps %ymm2, %ymm1, %ymm10 + vpcmpgtd _NMAXVAL(%rdx), %ymm7, %ymm4 + vpcmpgtd _INF(%rdx), %ymm8, %ymm1 + vpcmpeqd _NMAXVAL(%rdx), %ymm7, %ymm7 + vpcmpeqd _INF(%rdx), %ymm8, %ymm8 + vpor %ymm7, %ymm4, %ymm2 + vpor %ymm8, %ymm1, %ymm1 + vsubpd %ymm11, %ymm15, %ymm7 + vinsertf128 $1, %xmm10, %ymm3, %ymm10 + vpor %ymm1, %ymm2, %ymm3 + +/* iAbsX = iAbsX&iAbsMask */ + vandps __iAbsMask(%rdx), %ymm10, %ymm10 + +/* iRangeMask = (iAbsX>iDomainRange) */ + vpcmpgtd __iDomainRange(%rdx), %ymm10, %ymm4 + vpor %ymm4, %ymm3, %ymm5 + vmulpd __dbC1(%rdx), %ymm7, %ymm4 + vmovmskps %ymm5, %ecx + vmulpd __dbC1(%rdx), %ymm6, %ymm5 + +/* low K bits */ + vandps __lbLOWKBITS(%rdx), %ymm14, %ymm6 + +/* dpP= _dbT+lJ*T_ITEM_GRAN */ + vxorpd %ymm7, %ymm7, %ymm7 + vpcmpeqd %ymm1, %ymm1, %ymm1 + vandps __lbLOWKBITS(%rdx), %ymm13, %ymm2 + vxorpd %ymm10, %ymm10, %ymm10 + vpcmpeqd %ymm3, %ymm3, %ymm3 + vgatherqpd %ymm1, 13952(%rdx,%ymm6,8), %ymm7 + vgatherqpd %ymm3, 13952(%rdx,%ymm2,8), %ymm10 + vpsrlq $11, %ymm14, %ymm14 + vpsrlq $11, %ymm13, %ymm13 + vfmadd213pd %ymm7, %ymm4, %ymm7 + vfmadd213pd %ymm10, %ymm5, %ymm10 + +/* NB : including +/- sign for the exponent!! */ + vpsllq $52, %ymm14, %ymm8 + vpsllq $52, %ymm13, %ymm11 + vpaddq %ymm8, %ymm7, %ymm12 + vpaddq %ymm11, %ymm10, %ymm1 + vcvtpd2ps %ymm12, %xmm15 + vcvtpd2ps %ymm1, %xmm2 + vinsertf128 $1, %xmm2, %ymm15, %ymm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovups 160(%rsp), %ymm8 + vmovups 96(%rsp), %ymm9 + vmovups 288(%rsp), %ymm10 + vmovups 384(%rsp), %ymm11 + vmovups 256(%rsp), %ymm12 + vmovups 224(%rsp), %ymm13 + vmovups 320(%rsp), %ymm14 + vmovups 352(%rsp), %ymm15 + vmovdqa %ymm1, %ymm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %ymm0, 64(%rsp) + vmovups %ymm9, 128(%rsp) + vmovups %ymm1, 192(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movq %rsi, 8(%rsp) + movq %rdi, (%rsp) + movq %r12, 40(%rsp) + cfi_offset_rel_rsp (12, 40) + movb %dl, %r12b + movq %r13, 32(%rsp) + cfi_offset_rel_rsp (13, 32) + movl %ecx, %r13d + movq %r14, 24(%rsp) + cfi_offset_rel_rsp (14, 24) + movl %eax, %r14d + movq %r15, 16(%rsp) + cfi_offset_rel_rsp (15, 16) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movq 8(%rsp), %rsi + movq (%rsp), %rdi + movq 40(%rsp), %r12 + cfi_restore (%r12) + movq 32(%rsp), %r13 + cfi_restore (%r13) + movq 24(%rsp), %r14 + cfi_restore (%r14) + movq 16(%rsp), %r15 + cfi_restore (%r15) + vmovups 192(%rsp), %ymm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 68(%rsp,%r15,8), %xmm0 + vmovss 132(%rsp,%r15,8), %xmm1 + vzeroupper + + call JUMPTARGET(__powf_finite) + + vmovss %xmm0, 196(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 64(%rsp,%r15,8), %xmm0 + vmovss 128(%rsp,%r15,8), %xmm1 + vzeroupper + + call JUMPTARGET(__powf_finite) + + vmovss %xmm0, 192(%rsp,%r15,8) + jmp .LBL_1_7 + +END(_ZGVdN8vv_powf_avx2) + + .section .rodata, "a" +__VPACK_ODD_ind.6357.0.1: + .long 1 + .long 3 + .long 5 + .long 7 + .long 0 + .long 0 + .long 0 + .long 0 + .space 32, 0x00 +__VPACK_ODD_ind.6358.0.1: + .long 1 + .long 3 + .long 5 + .long 7 + .long 0 + .long 0 + .long 0 + .long 0 diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S new file mode 100644 index 0000000000..0545460952 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized sincosf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN16vvv_sincosf) + .type _ZGVeN16vvv_sincosf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN16vvv_sincosf_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN16vvv_sincosf_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN16vvv_sincosf) + +#define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper +#include "../svml_s_sincosf16_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S new file mode 100644 index 0000000000..f73ab7de7c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S @@ -0,0 +1,806 @@ +/* Function sincosf vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_trig_data.h" +#include "svml_s_wrapper_impl.h" + +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/4; +Pi/4] interval + a) Grab sign from source argument and save it. + b) Remove sign using AND operation + c) Getting octant Y by 2/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer S for destination sign setting. + SS = ((S-S&1)&2)<<30; For sin part + SC = ((S+S&1)&2)<<30; For cos part + f) Change destination sign if source sign is negative + using XOR operation. 
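Steps g)-h) and the polynomial part continue below; taken together, the in-range path amounts to the following scalar scheme, sketched here in C with libm's sin and cos standing in for the two minimax polynomials. Lanes whose magnitude exceeds __sRangeReductionVal instead take the per-lane scalar sinf/cosf fallback seen later in the file.

  #include <math.h>

  static void sincosf_sketch (float x, float *s, float *c)
  {
    // c)-g): octant by 2/Pi multiplication, rounded to an integer n.
    double n = nearbyint ((double) x * M_2_PI);
    int octant = (int) n & 3;

    // h): subtract n*(Pi/2); the vector code splits Pi/2 into PI1..PI3
    // so every product is exact, one double multiply is enough here.
    double r = (double) x - n * M_PI_2;

    // 2): two polynomials over [-Pi/4, +Pi/4].
    double rs = sin (r), rc = cos (r);

    // 2c)/3): swap sin/cos and set signs from the octant bits, which is
    // what the And/Andnot/Or/Xor masking does in the vector code.
    *s = (float) ((octant & 1) ? rc : rs);
    *c = (float) ((octant & 1) ? -rs : rc);
    if (octant & 2)
      {
        *s = -*s;
        *c = -*c;
      }
  }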
+ g) Subtract "Right Shifter" (0x4B000000) value + h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) + a) Calculate X^2 = X * X + b) Calculate 2 polynomials for sin and cos: + RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); + RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))); + c) Swap RS & RC if if first bit of obtained value after + Right Shifting is set to 1. Using And, Andnot & Or operations. + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R1 = XOR( RS, SS ); + R2 = XOR( RC, SC ). */ + + .text +ENTRY (_ZGVeN16vl4l4_sincosf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + vmovaps %zmm0, %zmm2 + movl $-1, %edx + vmovups __sAbsMask(%rax), %zmm0 + vmovups __sInvPI(%rax), %zmm3 + +/* Absolute argument computation */ + vpandd %zmm0, %zmm2, %zmm1 + vmovups __sPI1_FMA(%rax), %zmm5 + vmovups __sSignMask(%rax), %zmm9 + vpandnd %zmm2, %zmm0, %zmm0 + +/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 */ + vmovaps %zmm1, %zmm6 + vmovaps %zmm1, %zmm8 + +/* c) Getting octant Y by 2/Pi multiplication + d) Add "Right Shifter" value */ + vfmadd213ps __sRShifter(%rax), %zmm1, %zmm3 + vmovups __sPI3_FMA(%rax), %zmm7 + +/* g) Subtract "Right Shifter" (0x4B000000) value */ + vsubps __sRShifter(%rax), %zmm3, %zmm12 + +/* e) Treat obtained value as integer S for destination sign setting */ + vpslld $31, %zmm3, %zmm13 + vmovups __sA7_FMA(%rax), %zmm14 + vfnmadd231ps %zmm12, %zmm5, %zmm6 + +/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) + a) Calculate X^2 = X * X + b) Calculate 2 polynomials for sin and cos: + RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); + RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */ + vmovaps %zmm14, %zmm15 + vmovups __sA9_FMA(%rax), %zmm3 + vcmpps $22, __sRangeReductionVal(%rax), %zmm1, %k1 + vpbroadcastd %edx, %zmm1{%k1}{z} + vfnmadd231ps __sPI2_FMA(%rax), %zmm12, %zmm6 + vptestmd %zmm1, %zmm1, %k0 + vpandd %zmm6, %zmm9, %zmm11 + kmovw %k0, %ecx + vpxord __sOneHalf(%rax), %zmm11, %zmm4 + +/* Result sign calculations */ + vpternlogd $150, %zmm13, %zmm9, %zmm11 + +/* Add correction term 0.5 for cos() part */ + vaddps %zmm4, %zmm12, %zmm10 + vfnmadd213ps %zmm6, %zmm7, %zmm12 + vfnmadd231ps %zmm10, %zmm5, %zmm8 + vpxord %zmm13, %zmm12, %zmm13 + vmulps %zmm13, %zmm13, %zmm12 + vfnmadd231ps __sPI2_FMA(%rax), %zmm10, %zmm8 + vfmadd231ps __sA9_FMA(%rax), %zmm12, %zmm15 + vfnmadd213ps %zmm8, %zmm7, %zmm10 + vfmadd213ps __sA5_FMA(%rax), %zmm12, %zmm15 + vpxord %zmm11, %zmm10, %zmm5 + vmulps %zmm5, %zmm5, %zmm4 + vfmadd213ps __sA3(%rax), %zmm12, %zmm15 + vfmadd213ps %zmm14, %zmm4, %zmm3 + vmulps %zmm12, %zmm15, %zmm14 + vfmadd213ps __sA5_FMA(%rax), %zmm4, %zmm3 + vfmadd213ps %zmm13, %zmm13, %zmm14 + vfmadd213ps __sA3(%rax), %zmm4, %zmm3 + vpxord %zmm0, %zmm14, %zmm0 + vmulps %zmm4, %zmm3, %zmm3 + vfmadd213ps %zmm5, %zmm5, %zmm3 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovups %zmm0, (%rdi) + vmovups %zmm3, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + 
cfi_restore_state + vmovups %zmm2, 1152(%rsp) + vmovups %zmm0, 1216(%rsp) + vmovups %zmm3, 1280(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %eax, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %ecx, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + movq %rbx, 1064(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + movq %rbx, %rdi + kmovw 1048(%rsp), %k4 + movq 1056(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + kmovw 1032(%rsp), %k6 + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + movq 1064(%rsp), %rbx + vmovups 1216(%rsp), %zmm0 + vmovups 1280(%rsp), %zmm3 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + vmovss %xmm0, 1220(%rsp,%r15,8) + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + vmovss %xmm0, 1284(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_13: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + vmovss %xmm0, 1216(%rsp,%r15,8) + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + vmovss %xmm0, 1280(%rsp,%r15,8) + jmp .LBL_1_7 +#endif +END (_ZGVeN16vl4l4_sincosf_knl) +libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_knl) + +ENTRY (_ZGVeN16vl4l4_sincosf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + vmovaps %zmm0, %zmm4 + vmovups __sAbsMask(%rax), %zmm3 + vmovups __sInvPI(%rax), %zmm5 + vmovups __sRShifter(%rax), %zmm6 + vmovups __sPI1_FMA(%rax), %zmm9 + vmovups __sPI2_FMA(%rax), %zmm10 + vmovups __sSignMask(%rax), %zmm14 + vmovups __sOneHalf(%rax), %zmm7 + vmovups __sPI3_FMA(%rax), %zmm12 + +/* Absolute argument computation */ + vandps %zmm3, %zmm4, %zmm2 + +/* c) Getting octant Y by 2/Pi multiplication + d) Add "Right Shifter" value */ + vfmadd213ps %zmm6, %zmm2, %zmm5 + vcmpps $18, __sRangeReductionVal(%rax), 
%zmm2, %k1 + +/* e) Treat obtained value as integer S for destination sign setting */ + vpslld $31, %zmm5, %zmm0 + +/* g) Subtract "Right Shifter" (0x4B000000) value */ + vsubps %zmm6, %zmm5, %zmm5 + vmovups __sA3(%rax), %zmm6 + +/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 */ + vmovaps %zmm2, %zmm11 + vfnmadd231ps %zmm5, %zmm9, %zmm11 + vfnmadd231ps %zmm5, %zmm10, %zmm11 + vandps %zmm11, %zmm14, %zmm1 + vxorps %zmm1, %zmm7, %zmm8 + +/* Result sign calculations */ + vpternlogd $150, %zmm0, %zmm14, %zmm1 + vmovups .L_2il0floatpacket.13(%rip), %zmm14 + +/* Add correction term 0.5 for cos() part */ + vaddps %zmm8, %zmm5, %zmm15 + vfnmadd213ps %zmm11, %zmm12, %zmm5 + vandnps %zmm4, %zmm3, %zmm11 + vmovups __sA7_FMA(%rax), %zmm3 + vmovaps %zmm2, %zmm13 + vfnmadd231ps %zmm15, %zmm9, %zmm13 + vxorps %zmm0, %zmm5, %zmm9 + vmovups __sA5_FMA(%rax), %zmm0 + vfnmadd231ps %zmm15, %zmm10, %zmm13 + vmulps %zmm9, %zmm9, %zmm8 + vfnmadd213ps %zmm13, %zmm12, %zmm15 + vmovups __sA9_FMA(%rax), %zmm12 + vxorps %zmm1, %zmm15, %zmm1 + vmulps %zmm1, %zmm1, %zmm13 + +/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) + a) Calculate X^2 = X * X + b) Calculate 2 polynomials for sin and cos: + RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); + RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */ + vmovaps %zmm12, %zmm7 + vfmadd213ps %zmm3, %zmm8, %zmm7 + vfmadd213ps %zmm3, %zmm13, %zmm12 + vfmadd213ps %zmm0, %zmm8, %zmm7 + vfmadd213ps %zmm0, %zmm13, %zmm12 + vfmadd213ps %zmm6, %zmm8, %zmm7 + vfmadd213ps %zmm6, %zmm13, %zmm12 + vmulps %zmm8, %zmm7, %zmm10 + vmulps %zmm13, %zmm12, %zmm3 + vfmadd213ps %zmm9, %zmm9, %zmm10 + vfmadd213ps %zmm1, %zmm1, %zmm3 + vxorps %zmm11, %zmm10, %zmm0 + vpandnd %zmm2, %zmm2, %zmm14{%k1} + vptestmd %zmm14, %zmm14, %k0 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovups %zmm0, (%rdi) + vmovups %zmm3, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm4, 1152(%rsp) + vmovups %zmm0, 1216(%rsp) + vmovups %zmm3, 1280(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %eax, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %ecx, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + movq %rbx, 1064(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_2_6: + btl %r13d, %r14d + jc .LBL_2_13 + +.LBL_2_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + movq %rbx, %rdi + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + 
vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm0 + vmovups 1280(%rsp), %zmm3 + movq 1056(%rsp), %rsi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + movq 1064(%rsp), %rbx + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + vmovss %xmm0, 1220(%rsp,%r15,8) + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + vmovss %xmm0, 1284(%rsp,%r15,8) + jmp .LBL_2_8 + +.LBL_2_13: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + vmovss %xmm0, 1216(%rsp,%r15,8) + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + vmovss %xmm0, 1280(%rsp,%r15,8) + jmp .LBL_2_7 +#endif +END (_ZGVeN16vl4l4_sincosf_skx) +libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx) + +/* Wrapper between vvv and vl4l4 vector variants. */ +.macro WRAPPER_AVX512_vvv_vl4l4 callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $384, %rsp + /* Encoding for vmovups %zmm1, 128(%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x4c + .byte 0x24 + .byte 0x02 + lea (%rsp), %rdi + /* Encoding for vmovups %zmm2, 192(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x57 + .byte 0x03 + /* Encoding for vmovups %zmm3, 256(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x5f + .byte 0x04 + /* Encoding for vmovups %zmm4, 320(%rdi). 
*/ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x67 + .byte 0x05 + lea 64(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 128(%rsp), %rdx + movq 136(%rsp), %rsi + movq 144(%rsp), %r8 + movq 152(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 160(%rsp), %rax + movq 168(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 176(%rsp), %rdi + movq 184(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 192(%rsp), %r11 + movq 200(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 208(%rsp), %rsi + movq 216(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 224(%rsp), %r10 + movq 232(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 240(%rsp), %rcx + movq 248(%rsp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movq 256(%rsp), %r9 + movq 264(%rsp), %r11 + movl %edx, (%rcx) + movl %esi, (%rdi) + movq 272(%rsp), %rdx + movq 280(%rsp), %rsi + movl 64(%rsp), %r8d + movl 68(%rsp), %r10d + movl 72(%rsp), %eax + movl 76(%rsp), %ecx + movl %r8d, (%r9) + movl %r10d, (%r11) + movq 288(%rsp), %r8 + movq 296(%rsp), %r10 + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 304(%rsp), %rax + movq 312(%rsp), %rcx + movl 80(%rsp), %edi + movl 84(%rsp), %r9d + movl 88(%rsp), %r11d + movl 92(%rsp), %edx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 320(%rsp), %rdi + movq 328(%rsp), %r9 + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 336(%rsp), %r11 + movq 344(%rsp), %rdx + movl 96(%rsp), %esi + movl 100(%rsp), %r8d + movl 104(%rsp), %r10d + movl 108(%rsp), %eax + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 352(%rsp), %rsi + movq 360(%rsp), %r8 + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 368(%rsp), %r10 + movq 376(%rsp), %rax + movl 112(%rsp), %ecx + movl 116(%rsp), %edi + movl 120(%rsp), %r9d + movl 124(%rsp), %r11d + movl %ecx, (%rsi) + movl %edi, (%r8) + movl %r9d, (%r10) + movl %r11d, (%rax) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-64, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x78,0x6 + leal -112(%rbp), %esi + leal -176(%rbp), %edi + subl $296, %esp + /* Encoding for vmovdqa64 %zmm1, -240(%ebp). */ + .byte 0x67 + .byte 0x62 + .byte 0xf1 + .byte 0xfd + .byte 0x48 + .byte 0x7f + .byte 0x8d + .byte 0x10 + .byte 0xff + .byte 0xff + .byte 0xff + /* Encoding for vmovdqa64 %zmm2, -304(%ebp). 
*/ + .byte 0x67 + .byte 0x62 + .byte 0xf1 + .byte 0xfd + .byte 0x48 + .byte 0x7f + .byte 0x95 + .byte 0xd0 + .byte 0xfe + .byte 0xff + .byte 0xff + call HIDDEN_JUMPTARGET(\callee) + movl -240(%ebp), %eax + vmovss -176(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -236(%ebp), %eax + vmovss -172(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -232(%ebp), %eax + vmovss -168(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -228(%ebp), %eax + vmovss -164(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -224(%ebp), %eax + vmovss -160(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -220(%ebp), %eax + vmovss -156(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -216(%ebp), %eax + vmovss -152(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -212(%ebp), %eax + vmovss -148(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -208(%ebp), %eax + vmovss -144(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -204(%ebp), %eax + vmovss -140(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -200(%ebp), %eax + vmovss -136(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -196(%ebp), %eax + vmovss -132(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -192(%ebp), %eax + vmovss -128(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -188(%ebp), %eax + vmovss -124(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -184(%ebp), %eax + vmovss -120(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -180(%ebp), %eax + vmovss -116(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -304(%ebp), %eax + vmovss -112(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -300(%ebp), %eax + vmovss -108(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -296(%ebp), %eax + vmovss -104(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -292(%ebp), %eax + vmovss -100(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -288(%ebp), %eax + vmovss -96(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -284(%ebp), %eax + vmovss -92(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -280(%ebp), %eax + vmovss -88(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -276(%ebp), %eax + vmovss -84(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -272(%ebp), %eax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -268(%ebp), %eax + vmovss -76(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -264(%ebp), %eax + vmovss -72(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -260(%ebp), %eax + vmovss -68(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -256(%ebp), %eax + vmovss -64(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -252(%ebp), %eax + vmovss -60(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -248(%ebp), %eax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -244(%ebp), %eax + vmovss -52(%ebp), %xmm0 + vmovss %xmm0, (%eax) + addl $296, %esp + popq %r10 + .cfi_def_cfa 10, 0 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVeN16vvv_sincosf_knl) +WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_knl +END (_ZGVeN16vvv_sincosf_knl) + +ENTRY (_ZGVeN16vvv_sincosf_skx) +WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx +END (_ZGVeN16vvv_sincosf_skx) + + .section .rodata, "a" +.L_2il0floatpacket.13: + .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff + .type .L_2il0floatpacket.13,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S new file mode 100644 index 0000000000..a249be33d1 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sincosf. 
+ Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN4vvv_sincosf) + .type _ZGVbN4vvv_sincosf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN4vvv_sincosf_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN4vvv_sincosf_sse2(%rip), %rax + ret +END (_ZGVbN4vvv_sincosf) +libmvec_hidden_def (_ZGVbN4vvv_sincosf) + +#define _ZGVbN4vvv_sincosf _ZGVbN4vvv_sincosf_sse2 +#include "../svml_s_sincosf4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S new file mode 100644 index 0000000000..74a6ac1157 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S @@ -0,0 +1,346 @@ +/* Function sincosf vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_trig_data.h" + + .text +ENTRY (_ZGVbN4vl4l4_sincosf_sse4) +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/4; +Pi/4] interval + a) Grab sign from source argument and save it. + b) Remove sign using AND operation + c) Getting octant Y by 2/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer S for destination sign setting. + SS = ((S-S&1)&2)<<30; For sin part + SC = ((S+S&1)&2)<<30; For cos part + f) Change destination sign if source sign is negative + using XOR operation. + g) Subtract "Right Shifter" (0x4B000000) value + h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) + a) Calculate X^2 = X * X + b) Calculate 2 polynomials for sin and cos: + RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); + RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))); + c) Swap RS & RC if if first bit of obtained value after + Right Shifting is set to 1. Using And, Andnot & Or operations. 
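The "Right Shifter" steps d), e) and g) above are the usual float rounding trick: 0x4B000000 is 2^23 as a single-precision value, and adding it to a non-negative argument (the input has already been masked with __sAbsMask) rounds it to an integer whose low bits land directly in the mantissa. A small C illustration, assuming round-to-nearest; names are illustrative only:

  #include <stdint.h>
  #include <string.h>

  static float right_shifter_demo (float y, uint32_t *sign_out)
  {
    const float rshifter = 8388608.0f;     // 0x4B000000 == 2^23
    float shifted = y + rshifter;          // d): y rounded to an integer

    uint32_t bits;                         // e): reuse the value as an integer
    memcpy (&bits, &shifted, sizeof bits);
    *sign_out = bits << 31;                // lowest bit moved to the sign slot,
                                           // as the pslld $31 above does

    return shifted - rshifter;             // g): the rounded octant value
  }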
+ 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R1 = XOR( RS, SS ); + R2 = XOR( RC, SC ). */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + movups %xmm12, 176(%rsp) + movups %xmm9, 160(%rsp) + movups __sAbsMask(%rax), %xmm12 + +/* Absolute argument computation */ + movaps %xmm12, %xmm5 + andnps %xmm0, %xmm12 + movups __sInvPI(%rax), %xmm7 + andps %xmm0, %xmm5 + +/* c) Getting octant Y by 2/Pi multiplication + d) Add "Right Shifter" value. */ + mulps %xmm5, %xmm7 + movups %xmm10, 144(%rsp) + movups __sPI1(%rax), %xmm10 + +/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3. */ + movaps %xmm10, %xmm1 + addps __sRShifter(%rax), %xmm7 + +/* e) Treat obtained value as integer S for destination sign setting */ + movaps %xmm7, %xmm9 + +/* g) Subtract "Right Shifter" (0x4B000000) value */ + subps __sRShifter(%rax), %xmm7 + mulps %xmm7, %xmm1 + pslld $31, %xmm9 + movups __sPI2(%rax), %xmm6 + movups %xmm13, 112(%rsp) + movaps %xmm5, %xmm13 + movaps %xmm6, %xmm2 + subps %xmm1, %xmm13 + mulps %xmm7, %xmm2 + movups __sSignMask(%rax), %xmm3 + movaps %xmm5, %xmm1 + movups __sOneHalf(%rax), %xmm4 + subps %xmm2, %xmm13 + cmpnleps __sRangeReductionVal(%rax), %xmm5 + movaps %xmm3, %xmm2 + andps %xmm13, %xmm2 + xorps %xmm2, %xmm4 + +/* Result sign calculations */ + xorps %xmm2, %xmm3 + xorps %xmm9, %xmm3 + +/* Add correction term 0.5 for cos() part */ + addps %xmm7, %xmm4 + movmskps %xmm5, %ecx + mulps %xmm4, %xmm10 + mulps %xmm4, %xmm6 + subps %xmm10, %xmm1 + movups __sPI3(%rax), %xmm10 + subps %xmm6, %xmm1 + movaps %xmm10, %xmm6 + mulps %xmm7, %xmm6 + mulps %xmm4, %xmm10 + subps %xmm6, %xmm13 + subps %xmm10, %xmm1 + movups __sPI4(%rax), %xmm6 + mulps %xmm6, %xmm7 + mulps %xmm6, %xmm4 + subps %xmm7, %xmm13 + subps %xmm4, %xmm1 + xorps %xmm9, %xmm13 + xorps %xmm3, %xmm1 + movaps %xmm13, %xmm4 + movaps %xmm1, %xmm2 + mulps %xmm13, %xmm4 + mulps %xmm1, %xmm2 + movups __sA9(%rax), %xmm7 + +/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) + a) Calculate X^2 = X * X + b) Calculate 2 polynomials for sin and cos: + RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); + RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */ + movaps %xmm7, %xmm3 + mulps %xmm4, %xmm3 + mulps %xmm2, %xmm7 + addps __sA7(%rax), %xmm3 + addps __sA7(%rax), %xmm7 + mulps %xmm4, %xmm3 + mulps %xmm2, %xmm7 + addps __sA5(%rax), %xmm3 + addps __sA5(%rax), %xmm7 + mulps %xmm4, %xmm3 + mulps %xmm2, %xmm7 + addps __sA3(%rax), %xmm3 + addps __sA3(%rax), %xmm7 + mulps %xmm3, %xmm4 + mulps %xmm7, %xmm2 + mulps %xmm13, %xmm4 + mulps %xmm1, %xmm2 + addps %xmm4, %xmm13 + addps %xmm2, %xmm1 + xorps %xmm12, %xmm13 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movups 160(%rsp), %xmm9 + movaps %xmm13, (%rdi) + movups 144(%rsp), %xmm10 + movups 176(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups %xmm1, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm0, 128(%rsp) + movups %xmm13, 192(%rsp) + movups %xmm1, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 48(%rsp) + movups %xmm11, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 64(%rsp) + movq %r12, 104(%rsp) + cfi_offset_rel_rsp (12, 104) + movb 
%dl, %r12b + movq %r13, 96(%rsp) + cfi_offset_rel_rsp (13, 96) + movl %eax, %r13d + movq %r14, 88(%rsp) + cfi_offset_rel_rsp (14, 88) + movl %ecx, %r14d + movq %r15, 80(%rsp) + cfi_offset_rel_rsp (15, 80) + movq %rbx, 72(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 48(%rsp), %xmm8 + movq %rbx, %rdi + movups 32(%rsp), %xmm11 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 64(%rsp), %rsi + movq 104(%rsp), %r12 + cfi_restore (%r12) + movq 96(%rsp), %r13 + cfi_restore (%r13) + movq 88(%rsp), %r14 + cfi_restore (%r14) + movq 80(%rsp), %r15 + cfi_restore (%r15) + movq 72(%rsp), %rbx + movups 192(%rsp), %xmm13 + movups 256(%rsp), %xmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + movss 132(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + movss %xmm0, 196(%rsp,%r15,8) + movss 132(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + movss %xmm0, 260(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_13: + movzbl %r12b, %r15d + movss 128(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + movss %xmm0, 192(%rsp,%r15,8) + movss 128(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + movss %xmm0, 256(%rsp,%r15,8) + jmp .LBL_1_7 + +END (_ZGVbN4vl4l4_sincosf_sse4) +libmvec_hidden_def(_ZGVbN4vl4l4_sincosf_sse4) + +/* vvv version implemented with wrapper to vl4l4 variant. */ +ENTRY (_ZGVbN4vvv_sincosf_sse4) +#ifndef __ILP32__ + subq $104, %rsp + .cfi_def_cfa_offset 112 + movdqu %xmm1, 32(%rsp) + lea (%rsp), %rdi + movdqu %xmm2, 48(%rdi) + lea 16(%rsp), %rsi + movdqu %xmm3, 48(%rsi) + movdqu %xmm4, 64(%rsi) + call HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4) + movq 32(%rsp), %rdx + movq 40(%rsp), %rsi + movq 48(%rsp), %r8 + movq 56(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 64(%rsp), %rax + movq 72(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 80(%rsp), %rdi + movq 88(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movl %esi, (%rdi) + movl %r8d, (%r9) + addq $104, %rsp + .cfi_def_cfa_offset 8 + ret +#else + subl $72, %esp + .cfi_def_cfa_offset 80 + leal 48(%rsp), %esi + movaps %xmm1, 16(%esp) + leal 32(%rsp), %edi + movaps %xmm2, (%esp) + call HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4) + movl 16(%esp), %eax + movss 32(%esp), %xmm0 + movss %xmm0, (%eax) + movl 20(%esp), %eax + movss 36(%esp), %xmm0 + movss %xmm0, (%eax) + movl 24(%esp), %eax + movss 40(%esp), %xmm0 + movss %xmm0, (%eax) + movl 28(%esp), %eax + movss 44(%esp), %xmm0 + movss %xmm0, (%eax) + movl (%esp), %eax + movss 48(%esp), %xmm0 + movss %xmm0, (%eax) + movl 4(%esp), %eax + movss 52(%esp), %xmm0 + movss %xmm0, (%eax) + movl 8(%esp), %eax + movss 56(%esp), %xmm0 + movss %xmm0, (%eax) + movl 12(%esp), %eax + movss 60(%esp), %xmm0 + movss %xmm0, (%eax) + addl $72, %esp + .cfi_def_cfa_offset 8 + ret +#endif +END (_ZGVbN4vvv_sincosf_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S new file mode 100644 index 0000000000..320fd861a5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sincosf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN8vvv_sincosf) + .type _ZGVdN8vvv_sincosf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVdN8vvv_sincosf_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN8vvv_sincosf_sse_wrapper(%rip), %rax + ret +END (_ZGVdN8vvv_sincosf) +libmvec_hidden_def (_ZGVdN8vvv_sincosf) + +#define _ZGVdN8vvv_sincosf _ZGVdN8vvv_sincosf_sse_wrapper +#include "../svml_s_sincosf8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S new file mode 100644 index 0000000000..9e4e2c71c5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S @@ -0,0 +1,389 @@ +/* Function sincosf vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_trig_data.h" + + .text +ENTRY (_ZGVdN8vl4l4_sincosf_avx2) +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/4; +Pi/4] interval + a) Grab sign from source argument and save it. + b) Remove sign using AND operation + c) Getting octant Y by 2/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer S for destination sign setting. + SS = ((S-S&1)&2)<<30; For sin part + SC = ((S+S&1)&2)<<30; For cos part + f) Change destination sign if source sign is negative + using XOR operation. + g) Subtract "Right Shifter" (0x4B000000) value + h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) + a) Calculate X^2 = X * X + b) Calculate 2 polynomials for sin and cos: + RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); + RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))); + c) Swap RS & RC if if first bit of obtained value after + Right Shifting is set to 1. Using And, Andnot & Or operations. + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R1 = XOR( RS, SS ); + R2 = XOR( RC, SC ). 
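In scalar terms the two XORs in step 3 simply flip (or keep) the IEEE-754 sign bit, so one branch-free path serves both signs; SS and SC are either 0x00000000 or 0x80000000. A one-function C sketch:

  #include <stdint.h>
  #include <string.h>

  // R = XOR (R, S): apply a precomputed sign word to a float result.
  static float apply_sign (float r, uint32_t sign_word)
  {
    uint32_t bits;
    memcpy (&bits, &r, sizeof bits);
    bits ^= sign_word;            // 0x80000000 negates, 0 leaves r unchanged
    memcpy (&r, &bits, sizeof bits);
    return r;
  }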
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + vmovdqa %ymm0, %ymm5 + vmovups %ymm13, 352(%rsp) + vmovups __sAbsMask(%rax), %ymm2 + vmovups __sInvPI(%rax), %ymm1 + vmovups __sPI1_FMA(%rax), %ymm13 + vmovups %ymm15, 288(%rsp) + +/* Absolute argument computation */ + vandps %ymm2, %ymm5, %ymm4 + +/* c) Getting octant Y by 2/Pi multiplication + d) Add "Right Shifter" value */ + vfmadd213ps __sRShifter(%rax), %ymm4, %ymm1 + +/* e) Treat obtained value as integer S for destination sign setting */ + vpslld $31, %ymm1, %ymm0 + +/* g) Subtract "Right Shifter" (0x4B000000) value */ + vsubps __sRShifter(%rax), %ymm1, %ymm1 + +/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 */ + vmovdqa %ymm4, %ymm7 + vfnmadd231ps %ymm1, %ymm13, %ymm7 + vfnmadd231ps __sPI2_FMA(%rax), %ymm1, %ymm7 + vandps __sSignMask(%rax), %ymm7, %ymm15 + vxorps __sOneHalf(%rax), %ymm15, %ymm6 + +/* Add correction term 0.5 for cos() part */ + vaddps %ymm6, %ymm1, %ymm6 + vmovdqa %ymm4, %ymm3 + vfnmadd231ps %ymm6, %ymm13, %ymm3 + vmovups __sPI3_FMA(%rax), %ymm13 + vcmpnle_uqps __sRangeReductionVal(%rax), %ymm4, %ymm4 + vfnmadd231ps __sPI2_FMA(%rax), %ymm6, %ymm3 + vfnmadd213ps %ymm7, %ymm13, %ymm1 + vfnmadd213ps %ymm3, %ymm13, %ymm6 + +/* Result sign calculations */ + vxorps __sSignMask(%rax), %ymm15, %ymm3 + vxorps %ymm0, %ymm3, %ymm7 + vxorps %ymm7, %ymm6, %ymm3 + vxorps %ymm0, %ymm1, %ymm15 + vandnps %ymm5, %ymm2, %ymm6 + vmovups __sA7_FMA(%rax), %ymm2 + vmulps %ymm15, %ymm15, %ymm13 + vmovups __sA9_FMA(%rax), %ymm7 + vmulps %ymm3, %ymm3, %ymm1 + +/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) + a) Calculate X^2 = X * X + b) Calculate 2 polynomials for sin and cos: + RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); + RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */ + vmovdqa %ymm2, %ymm0 + vfmadd231ps __sA9_FMA(%rax), %ymm13, %ymm0 + vfmadd213ps %ymm2, %ymm1, %ymm7 + vfmadd213ps __sA5_FMA(%rax), %ymm13, %ymm0 + vfmadd213ps __sA5_FMA(%rax), %ymm1, %ymm7 + vfmadd213ps __sA3(%rax), %ymm13, %ymm0 + vfmadd213ps __sA3(%rax), %ymm1, %ymm7 + vmulps %ymm13, %ymm0, %ymm13 + vmulps %ymm1, %ymm7, %ymm1 + vfmadd213ps %ymm15, %ymm15, %ymm13 + vfmadd213ps %ymm3, %ymm3, %ymm1 + vmovmskps %ymm4, %ecx + vxorps %ymm6, %ymm13, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovups 352(%rsp), %ymm13 + vmovups 288(%rsp), %ymm15 + vmovups %ymm0, (%rdi) + vmovups %ymm1, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %ymm5, 256(%rsp) + vmovups %ymm0, 320(%rsp) + vmovups %ymm1, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 160(%rsp) + vmovups %ymm9, 128(%rsp) + vmovups %ymm10, 96(%rsp) + vmovups %ymm11, 64(%rsp) + vmovups %ymm12, 32(%rsp) + vmovups %ymm14, (%rsp) + movq %rsi, 192(%rsp) + movq %r12, 232(%rsp) + cfi_offset_rel_rsp (12, 232) + movb %dl, %r12b + movq %r13, 224(%rsp) + cfi_offset_rel_rsp (13, 224) + movl %eax, %r13d + movq %r14, 216(%rsp) + cfi_offset_rel_rsp (14, 216) + movl %ecx, %r14d + movq %r15, 208(%rsp) + cfi_offset_rel_rsp (14, 208) + movq %rbx, 200(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + 
+.LBL_1_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 160(%rsp), %ymm8 + movq %rbx, %rdi + vmovups 128(%rsp), %ymm9 + vmovups 96(%rsp), %ymm10 + vmovups 64(%rsp), %ymm11 + vmovups 32(%rsp), %ymm12 + vmovups (%rsp), %ymm14 + vmovups 320(%rsp), %ymm0 + vmovups 384(%rsp), %ymm1 + movq 192(%rsp), %rsi + movq 232(%rsp), %r12 + cfi_restore (%r12) + movq 224(%rsp), %r13 + cfi_restore (%r13) + movq 216(%rsp), %r14 + cfi_restore (%r14) + movq 208(%rsp), %r15 + cfi_restore (%r15) + movq 200(%rsp), %rbx + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 260(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(sinf) + + vmovss %xmm0, 324(%rsp,%r15,8) + vmovss 260(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + vmovss %xmm0, 388(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_13: + movzbl %r12b, %r15d + vmovss 256(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(sinf) + + vmovss %xmm0, 320(%rsp,%r15,8) + vmovss 256(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(cosf) + + vmovss %xmm0, 384(%rsp,%r15,8) + jmp .LBL_1_7 + +END (_ZGVdN8vl4l4_sincosf_avx2) +libmvec_hidden_def(_ZGVdN8vl4l4_sincosf_avx2) + +/* vvv version implemented with wrapper to vl4l4 variant. */ +ENTRY (_ZGVdN8vvv_sincosf_avx2) +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $192, %rsp + vmovdqu %ymm1, 64(%rsp) + lea (%rsp), %rdi + vmovdqu %ymm2, 96(%rdi) + vmovdqu %ymm3, 128(%rdi) + vmovdqu %ymm4, 160(%rdi) + lea 32(%rsp), %rsi + call HIDDEN_JUMPTARGET(_ZGVdN8vl4l4_sincosf_avx2) + movq 64(%rsp), %rdx + movq 72(%rsp), %rsi + movq 80(%rsp), %r8 + movq 88(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 96(%rsp), %rax + movq 104(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 112(%rsp), %rdi + movq 120(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 128(%rsp), %r11 + movq 136(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 144(%rsp), %rsi + movq 152(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 160(%rsp), %r10 + movq 168(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 176(%rsp), %rcx + movq 184(%rsp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movl %edx, (%rcx) + movl %esi, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x78,0x6 + leal -48(%rbp), %esi + leal -80(%rbp), %edi + subl $136, %esp + vmovdqa %ymm1, -112(%ebp) + vmovdqa %ymm2, -144(%ebp) + call HIDDEN_JUMPTARGET(_ZGVdN8vl4l4_sincosf_avx2) + vmovdqa -112(%ebp), %xmm0 + vmovq %xmm0, %rax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -76(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -104(%ebp), %rax + vmovss -72(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -68(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -96(%ebp), %rax + vmovss -64(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -60(%ebp), %xmm0 + shrq $32, %rax + 
vmovss %xmm0, (%eax) + movq -88(%ebp), %rax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -52(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + vmovdqa -144(%ebp), %xmm0 + vmovq %xmm0, %rax + vmovss -48(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -44(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -136(%ebp), %rax + vmovss -40(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -36(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -128(%ebp), %rax + vmovss -32(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -28(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -120(%ebp), %rax + vmovss -24(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -20(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + addl $136, %esp + popq %r10 + .cfi_def_cfa 10, 0 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +END (_ZGVdN8vvv_sincosf_avx2) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S new file mode 100644 index 0000000000..2c18dbce53 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S @@ -0,0 +1,37 @@ +/* Multiple versions of vectorized sinf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN16v_sinf) + .type _ZGVeN16v_sinf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVeN16v_sinf_skx(%rip), %rax + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + leaq _ZGVeN16v_sinf_knl(%rip), %rax + HAS_ARCH_FEATURE (AVX512F_Usable) + jnz 2f + leaq _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax +2: ret +END (_ZGVeN16v_sinf) + +#define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper +#include "../svml_s_sinf16_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S new file mode 100644 index 0000000000..8670673a29 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S @@ -0,0 +1,479 @@ +/* Function sinf vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_trig_data.h" +#include "svml_s_wrapper_impl.h" + + .text +ENTRY(_ZGVeN16v_sinf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf +#else +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) Grab sign from source argument and save it. + b) Remove sign using AND operation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + f) Change destination sign if source sign is negative + using XOR operation. + g) Subtract "Right Shifter" value + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + +/* Check for large and special values */ + movl $-1, %edx + vmovups __sAbsMask(%rax), %zmm4 + vmovups __sInvPI(%rax), %zmm1 + +/* b) Remove sign using AND operation */ + vpandd %zmm4, %zmm0, %zmm12 + vmovups __sPI1_FMA(%rax), %zmm2 + vmovups __sA9(%rax), %zmm7 + +/* + f) Change destination sign if source sign is negative + using XOR operation. + */ + vpandnd %zmm0, %zmm4, %zmm11 + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3; + */ + vmovaps %zmm12, %zmm3 + +/* + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + */ + vfmadd213ps __sRShifter(%rax), %zmm12, %zmm1 + vcmpps $22, __sRangeReductionVal(%rax), %zmm12, %k1 + vpbroadcastd %edx, %zmm13{%k1}{z} + +/* g) Subtract "Right Shifter" value */ + vsubps __sRShifter(%rax), %zmm1, %zmm5 + +/* + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + */ + vpslld $31, %zmm1, %zmm6 + vptestmd %zmm13, %zmm13, %k0 + vfnmadd231ps %zmm5, %zmm2, %zmm3 + kmovw %k0, %ecx + vfnmadd231ps __sPI2_FMA(%rax), %zmm5, %zmm3 + vfnmadd132ps __sPI3_FMA(%rax), %zmm3, %zmm5 + +/* + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... 
+ */ + vmulps %zmm5, %zmm5, %zmm8 + vpxord %zmm6, %zmm5, %zmm9 + vfmadd213ps __sA7(%rax), %zmm8, %zmm7 + vfmadd213ps __sA5(%rax), %zmm8, %zmm7 + vfmadd213ps __sA3(%rax), %zmm8, %zmm7 + vmulps %zmm8, %zmm7, %zmm10 + vfmadd213ps %zmm9, %zmm9, %zmm10 + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + vpxord %zmm11, %zmm10, %zmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + kmovw 1048(%rsp), %k4 + movq 1064(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1056(%rsp), %rdi + kmovw 1032(%rsp), %k6 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + vmovups 1216(%rsp), %zmm1 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + call JUMPTARGET(sinf) + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + call JUMPTARGET(sinf) + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_1_7 +#endif +END(_ZGVeN16v_sinf_knl) + +ENTRY (_ZGVeN16v_sinf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf +#else +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) Grab sign from source argument and save it. + b) Remove sign using AND operation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + f) Change destination sign if source sign is negative + using XOR operation. 
+ g) Subtract "Right Shifter" value + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1280, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + +/* Check for large and special values */ + vmovups .L_2il0floatpacket.11(%rip), %zmm14 + vmovups __sAbsMask(%rax), %zmm5 + vmovups __sInvPI(%rax), %zmm1 + vmovups __sRShifter(%rax), %zmm2 + vmovups __sPI1_FMA(%rax), %zmm3 + vmovups __sA9(%rax), %zmm8 + +/* b) Remove sign using AND operation */ + vandps %zmm5, %zmm0, %zmm13 + +/* + f) Change destination sign if source sign is negative + using XOR operation. + */ + vandnps %zmm0, %zmm5, %zmm12 + +/* + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + */ + vfmadd213ps %zmm2, %zmm13, %zmm1 + vcmpps $18, __sRangeReductionVal(%rax), %zmm13, %k1 + +/* + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + */ + vpslld $31, %zmm1, %zmm7 + +/* g) Subtract "Right Shifter" value */ + vsubps %zmm2, %zmm1, %zmm6 + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3; + */ + vmovaps %zmm13, %zmm4 + vfnmadd231ps %zmm6, %zmm3, %zmm4 + vfnmadd231ps __sPI2_FMA(%rax), %zmm6, %zmm4 + vfnmadd132ps __sPI3_FMA(%rax), %zmm4, %zmm6 + +/* + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... 
+ */ + vmulps %zmm6, %zmm6, %zmm9 + vxorps %zmm7, %zmm6, %zmm10 + vfmadd213ps __sA7(%rax), %zmm9, %zmm8 + vfmadd213ps __sA5(%rax), %zmm9, %zmm8 + vfmadd213ps __sA3(%rax), %zmm9, %zmm8 + vmulps %zmm9, %zmm8, %zmm11 + vfmadd213ps %zmm10, %zmm10, %zmm11 + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + vxorps %zmm12, %zmm11, %zmm1 + vpandnd %zmm13, %zmm13, %zmm14{%k1} + vptestmd %zmm14, %zmm14, %k0 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 1156(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1156(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + vmovss %xmm0, 1220(%rsp,%r15,8) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + vmovss 1152(%rsp,%r15,8), %xmm0 + vzeroupper + vmovss 1152(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + vmovss %xmm0, 1216(%rsp,%r15,8) + jmp .LBL_2_7 +#endif +END (_ZGVeN16v_sinf_skx) + + .section .rodata, "a" +.L_2il0floatpacket.11: + .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff + .type .L_2il0floatpacket.11,@object diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S 
b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S new file mode 100644 index 0000000000..3556473899 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sinf. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN4v_sinf) + .type _ZGVbN4v_sinf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq _ZGVbN4v_sinf_sse4(%rip), %rax + HAS_CPU_FEATURE (SSE4_1) + jz 2f + ret +2: leaq _ZGVbN4v_sinf_sse2(%rip), %rax + ret +END (_ZGVbN4v_sinf) +libmvec_hidden_def (_ZGVbN4v_sinf) + +#define _ZGVbN4v_sinf _ZGVbN4v_sinf_sse2 +#include "../svml_s_sinf4_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S new file mode 100644 index 0000000000..c690150964 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S @@ -0,0 +1,224 @@ +/* Function sinf vectorized with SSE4. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#include <sysdep.h> +#include "svml_s_trig_data.h" + + .text +ENTRY(_ZGVbN4v_sinf_sse4) +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) Grab sign from source argument and save it. + b) Remove sign using AND operation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + f) Change destination sign if source sign is negative + using XOR operation. + g) Subtract "Right Shifter" value + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... 
+ 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm5 + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + movups __sAbsMask(%rax), %xmm2 + +/* b) Remove sign using AND operation */ + movaps %xmm2, %xmm4 + +/* + f) Change destination sign if source sign is negative + using XOR operation. + */ + andnps %xmm5, %xmm2 + movups __sInvPI(%rax), %xmm1 + andps %xmm5, %xmm4 + +/* c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value */ + mulps %xmm4, %xmm1 + +/* h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4 */ + movaps %xmm4, %xmm0 + +/* Check for large and special values */ + cmpnleps __sRangeReductionVal(%rax), %xmm4 + movups __sRShifter(%rax), %xmm6 + movups __sPI1(%rax), %xmm7 + addps %xmm6, %xmm1 + movmskps %xmm4, %ecx + +/* e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position */ + movaps %xmm1, %xmm3 + +/* g) Subtract "Right Shifter" value */ + subps %xmm6, %xmm1 + mulps %xmm1, %xmm7 + pslld $31, %xmm3 + movups __sPI2(%rax), %xmm6 + subps %xmm7, %xmm0 + mulps %xmm1, %xmm6 + movups __sPI3(%rax), %xmm7 + subps %xmm6, %xmm0 + mulps %xmm1, %xmm7 + movups __sPI4(%rax), %xmm6 + subps %xmm7, %xmm0 + mulps %xmm6, %xmm1 + subps %xmm1, %xmm0 + +/* 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... */ + movaps %xmm0, %xmm1 + mulps %xmm0, %xmm1 + xorps %xmm3, %xmm0 + movups __sA9(%rax), %xmm3 + mulps %xmm1, %xmm3 + addps __sA7(%rax), %xmm3 + mulps %xmm1, %xmm3 + addps __sA5(%rax), %xmm3 + mulps %xmm1, %xmm3 + addps __sA3(%rax), %xmm3 + mulps %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm0 + +/* 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); */ + xorps %xmm2, %xmm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + movups %xmm5, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + cfi_offset_rel_rsp (12, 168) + movb %dl, %r12b + movq %r13, 160(%rsp) + cfi_offset_rel_rsp (13, 160) + movl %ecx, %r13d + movq %r14, 152(%rsp) + cfi_offset_rel_rsp (14, 152) + movl %eax, %r14d + movq %r15, 144(%rsp) + cfi_offset_rel_rsp (15, 144) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + cfi_restore (%r12) + movq 160(%rsp), %r13 + cfi_restore (%r13) + movq 152(%rsp), %r14 + cfi_restore 
(%r14) + movq 144(%rsp), %r15 + cfi_restore (%r15) + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + movss 196(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + movss %xmm0, 260(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + movss 192(%rsp,%r15,8), %xmm0 + + call JUMPTARGET(sinf) + + movss %xmm0, 256(%rsp,%r15,8) + jmp .LBL_1_7 + +END(_ZGVbN4v_sinf_sse4) diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S new file mode 100644 index 0000000000..674e88bd55 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S @@ -0,0 +1,36 @@ +/* Multiple versions of vectorized sinf, vector length is 8. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN8v_sinf) + .type _ZGVdN8v_sinf, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX +1: leaq _ZGVdN8v_sinf_avx2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + ret +2: leaq _ZGVdN8v_sinf_sse_wrapper(%rip), %rax + ret +END (_ZGVdN8v_sinf) +libmvec_hidden_def (_ZGVdN8v_sinf) + +#define _ZGVdN8v_sinf _ZGVdN8v_sinf_sse_wrapper +#include "../svml_s_sinf8_core.S" diff --git a/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S new file mode 100644 index 0000000000..d34870fa3a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S @@ -0,0 +1,219 @@ +/* Function sinf vectorized with AVX2. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_trig_data.h" + + .text +ENTRY(_ZGVdN8v_sinf_avx2) +/* + ALGORITHM DESCRIPTION: + + 1) Range reduction to [-Pi/2; +Pi/2] interval + a) Grab sign from source argument and save it. + b) Remove sign using AND operation + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + e) Treat obtained value as integer for destination sign setting. 
+ Shift first bit of this value to the last (sign) position + f) Change destination sign if source sign is negative + using XOR operation. + g) Subtract "Right Shifter" value + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + vmovdqa %ymm0, %ymm5 + vmovups __sAbsMask(%rax), %ymm3 + vmovups __sInvPI(%rax), %ymm7 + vmovups __sRShifter(%rax), %ymm0 + vmovups __sPI1_FMA(%rax), %ymm1 + +/* b) Remove sign using AND operation */ + vandps %ymm3, %ymm5, %ymm4 + +/* + c) Getting octant Y by 1/Pi multiplication + d) Add "Right Shifter" value + */ + vfmadd213ps %ymm0, %ymm4, %ymm7 + +/* g) Subtract "Right Shifter" value */ + vsubps %ymm0, %ymm7, %ymm2 + +/* + e) Treat obtained value as integer for destination sign setting. + Shift first bit of this value to the last (sign) position + */ + vpslld $31, %ymm7, %ymm6 + +/* + h) Subtract Y*PI from X argument, where PI divided to 4 parts: + X = X - Y*PI1 - Y*PI2 - Y*PI3; + */ + vmovdqa %ymm4, %ymm0 + vfnmadd231ps %ymm2, %ymm1, %ymm0 + +/* Check for large and special values */ + vcmpnle_uqps __sRangeReductionVal(%rax), %ymm4, %ymm4 + vfnmadd231ps __sPI2_FMA(%rax), %ymm2, %ymm0 + vfnmadd132ps __sPI3_FMA(%rax), %ymm0, %ymm2 + +/* + 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval) + a) Calculate X^2 = X * X + b) Calculate polynomial: + R = X + X * X^2 * (A3 + x^2 * (A5 + ...... + */ + vmulps %ymm2, %ymm2, %ymm1 + +/* + f) Change destination sign if source sign is negative + using XOR operation. 
+ */ + vandnps %ymm5, %ymm3, %ymm0 + vxorps %ymm6, %ymm2, %ymm3 + vmovups __sA9(%rax), %ymm2 + vfmadd213ps __sA7(%rax), %ymm1, %ymm2 + vfmadd213ps __sA5(%rax), %ymm1, %ymm2 + vfmadd213ps __sA3(%rax), %ymm1, %ymm2 + vmulps %ymm1, %ymm2, %ymm6 + vfmadd213ps %ymm3, %ymm3, %ymm6 + vmovmskps %ymm4, %ecx + +/* + 3) Destination sign setting + a) Set shifted destination sign using XOR operation: + R = XOR( R, S ); + */ + vxorps %ymm0, %ymm6, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + cfi_remember_state + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: + cfi_restore_state + vmovups %ymm5, 320(%rsp) + vmovups %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + cfi_offset_rel_rsp (12, 296) + movb %dl, %r12b + movq %r13, 288(%rsp) + cfi_offset_rel_rsp (13, 288) + movl %ecx, %r13d + movq %r14, 280(%rsp) + cfi_offset_rel_rsp (14, 280) + movl %eax, %r14d + movq %r15, 272(%rsp) + cfi_offset_rel_rsp (15, 272) + cfi_remember_state + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovups 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + cfi_restore (%r12) + movq 288(%rsp), %r13 + cfi_restore (%r13) + movq 280(%rsp), %r14 + cfi_restore (%r14) + movq 272(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_1_2 + +.LBL_1_10: + cfi_restore_state + movzbl %r12b, %r15d + vmovss 324(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(sinf) + + vmovss %xmm0, 388(%rsp,%r15,8) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + vmovss 320(%rsp,%r15,8), %xmm0 + vzeroupper + + call JUMPTARGET(sinf) + + vmovss %xmm0, 384(%rsp,%r15,8) + jmp .LBL_1_7 + +END(_ZGVdN8v_sinf_avx2) |
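
For reference, the ALGORITHM DESCRIPTION comment shared by the SSE4, AVX2 and AVX-512 sinf kernels above boils down to the scalar model below. This is an illustrative sketch, not part of the patch: the name sinf_sketch, the 1.5*2^23 "right shifter" value, the two-part Pi split and the plain Taylor coefficients are stand-ins chosen for readability, not the minimax values the kernels actually load from the __svml_s_trig_data table (__sA3..__sA9, __sPI1_FMA.., __sRShifter, __sInvPI), and it ignores the large-argument path that the vector code routes through the scalar JUMPTARGET(sinf) calls.

/* Scalar sketch of the vectorized sinf algorithm (illustrative only).
   Valid for |x| below the range-reduction threshold; the real kernels
   fall back to scalar sinf beyond __sRangeReductionVal.  */
#include <math.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static float
sinf_sketch (float x)
{
  static const double pi_d = 3.14159265358979323846;

  /* 1a/1b) Save the sign of the argument and strip it.  */
  uint32_t xbits;
  memcpy (&xbits, &x, sizeof xbits);
  uint32_t arg_sign = xbits & 0x80000000u;
  float ax = fabsf (x);

  /* 1c/1d) Octant: Y = |x| * (1/Pi) + RightShifter.  Adding the large
     shifter (assumed 1.5 * 2^23 here) forces Y to round so that
     N = round(|x|/Pi) sits in its low mantissa bits.  */
  const float rshifter = 0x1.8p23f;
  float y = ax * (float) (1.0 / pi_d) + rshifter;

  /* 1e/1f) Bit 0 of Y is the parity of N; shifted to bit 31 it becomes
     the sign flip applied to the reduced argument.  */
  uint32_t ybits;
  memcpy (&ybits, &y, sizeof ybits);
  uint32_t octant_sign = ybits << 31;

  /* 1g/1h) Recover N and reduce: R = |x| - N*Pi, applying Pi in parts
     so the subtraction stays accurate (the kernels use 3-4 parts;
     two parts are enough for this sketch).  */
  float n = y - rshifter;
  const float pi1 = (float) pi_d;
  const float pi2 = (float) (pi_d - (double) pi1);
  float r = (ax - n * pi1) - n * pi2;

  /* Apply the octant sign to the reduced argument (the vpslld/vpxord
     pair in the vector code).  */
  uint32_t rbits;
  memcpy (&rbits, &r, sizeof rbits);
  rbits ^= octant_sign;
  float rs;
  memcpy (&rs, &rbits, sizeof rs);

  /* 2) Odd polynomial R + R*R^2*(A3 + R^2*(A5 + R^2*(A7 + R^2*A9))),
     here with Taylor terms standing in for the minimax coefficients.  */
  float r2 = r * r;
  float p = 2.7557319e-6f;            /* ~ +1/9!  stand-in for __sA9 */
  p = p * r2 - 1.9841270e-4f;         /* ~ -1/7!  stand-in for __sA7 */
  p = p * r2 + 8.3333333e-3f;         /* ~ +1/5!  stand-in for __sA5 */
  p = p * r2 - 1.6666667e-1f;         /* ~ -1/3!  stand-in for __sA3 */
  float res = rs + rs * (r2 * p);

  /* 3) Restore the sign of the original argument (sin is odd).  */
  uint32_t resbits;
  memcpy (&resbits, &res, sizeof resbits);
  resbits ^= arg_sign;
  memcpy (&res, &resbits, sizeof res);
  return res;
}

int
main (void)
{
  /* Quick sanity check against the libm scalar routine (link with -lm).  */
  printf ("%f vs %f\n", sinf_sketch (2.0f), sinf (2.0f));
  return 0;
}

The vector versions perform exactly these steps lane-wise; lanes whose |x| exceeds the reduction threshold are collected into a mask (movmskps / kmovw into %ecx), and the .LBL_*_10 / .LBL_*_12 tails above spill the vector state and recompute those lanes one at a time through the scalar sinf call.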