diff options
author | Sunil K Pandey <skpgkp2@gmail.com> | 2021-12-29 09:11:23 -0800 |
---|---|---|
committer | Sunil K Pandey <skpgkp2@gmail.com> | 2021-12-29 11:38:02 -0800 |
commit | 2bf02c5843896c5c109b1467c64ecf11cbc2ad7b (patch) | |
tree | a313322b85bfd099aafaffd55c2f96abdd8e584c /sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf16_core_avx512.S | |
parent | aa1809a1dfde88e5df73edba14b30e488b267343 (diff) | |
download | glibc-2bf02c5843896c5c109b1467c64ecf11cbc2ad7b.tar.gz glibc-2bf02c5843896c5c109b1467c64ecf11cbc2ad7b.tar.xz glibc-2bf02c5843896c5c109b1467c64ecf11cbc2ad7b.zip |
x86-64: Add vector cbrt/cbrtf implementation to libmvec
Implement vectorized cbrt/cbrtf containing SSE, AVX, AVX2 and AVX512 versions for libmvec as per vector ABI. It also contains accuracy and ABI tests for vector cbrt/cbrtf with regenerated ulps. Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Diffstat (limited to 'sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf16_core_avx512.S')
-rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf16_core_avx512.S | 235 |
1 files changed, 235 insertions, 0 deletions
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf16_core_avx512.S new file mode 100644 index 0000000000..55b017682b --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf16_core_avx512.S @@ -0,0 +1,235 @@ +/* Function cbrtf vectorized with AVX-512. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + https://www.gnu.org/licenses/. */ + +/* + * ALGORITHM DESCRIPTION: + * + * x=2^{3*k+j} * 1.b1 b2 ... b5 b6 ... b52 + * Let r=(x*2^{-3k-j} - 1.b1 b2 ... b5 1)* rcp[b1 b2 ..b5], + * where rcp[b1 b2 .. b5]=1/(1.b1 b2 b3 b4 b5 1) in single precision + * cbrtf(2^j * 1. b1 b2 .. b5 1) is approximated as T[j][b1..b5]+D[j][b1..b5] + * (T stores the high 24 bits, D stores the low order bits) + * Result=2^k*T+(2^k*T*r)*P+2^k*D + * where P=p1+p2*r+.. + * + */ + +/* Offsets for data table __svml_scbrt_data_internal_avx512 + */ +#define etbl_H 0 +#define etbl_L 64 +#define cbrt_tbl_H 128 +#define BiasL 256 +#define SZero 320 +#define OneThird 384 +#define Bias3 448 +#define Three 512 +#define One 576 +#define poly_coeff3 640 +#define poly_coeff2 704 +#define poly_coeff1 768 + +#include <sysdep.h> + + .text + .section .text.exex512,"ax",@progbits +ENTRY(_ZGVeN16v_cbrtf_skx) + vgetmantps $0, {sae}, %zmm0, %zmm8 + +/* GetExp(x) */ + vgetexpps {sae}, %zmm0, %zmm1 + vmovups BiasL+__svml_scbrt_data_internal_avx512(%rip), %zmm2 + +/* exponent/3 */ + vmovups OneThird+__svml_scbrt_data_internal_avx512(%rip), %zmm3 + vmovups Bias3+__svml_scbrt_data_internal_avx512(%rip), %zmm4 + vmovups One+__svml_scbrt_data_internal_avx512(%rip), %zmm15 + +/* exponent%3 (to be used as index) */ + vmovups Three+__svml_scbrt_data_internal_avx512(%rip), %zmm5 + +/* polynomial */ + vmovups poly_coeff3+__svml_scbrt_data_internal_avx512(%rip), %zmm11 + vmovups poly_coeff1+__svml_scbrt_data_internal_avx512(%rip), %zmm14 + +/* Table lookup */ + vmovups cbrt_tbl_H+__svml_scbrt_data_internal_avx512(%rip), %zmm12 + +/* DblRcp ~ 1/Mantissa */ + vrcp14ps %zmm8, %zmm7 + vaddps {rn-sae}, %zmm2, %zmm1, %zmm6 + vandps SZero+__svml_scbrt_data_internal_avx512(%rip), %zmm0, %zmm0 + +/* round DblRcp to 3 fractional bits (RN mode, no Precision exception) */ + vrndscaleps $88, {sae}, %zmm7, %zmm9 + vfmsub231ps {rn-sae}, %zmm6, %zmm3, %zmm4 + vmovups poly_coeff2+__svml_scbrt_data_internal_avx512(%rip), %zmm7 + +/* Reduced argument: R = DblRcp*Mantissa - 1 */ + vfmsub231ps {rn-sae}, %zmm9, %zmm8, %zmm15 + vrndscaleps $9, {sae}, %zmm4, %zmm13 + +/* Prepare table index */ + vpsrld $19, %zmm9, %zmm10 + vfmadd231ps {rn-sae}, %zmm15, %zmm11, %zmm7 + vfnmadd231ps {rn-sae}, %zmm13, %zmm5, %zmm6 + vpermt2ps cbrt_tbl_H+64+__svml_scbrt_data_internal_avx512(%rip), %zmm10, %zmm12 + vfmadd213ps {rn-sae}, %zmm14, %zmm15, %zmm7 + vscalefps {rn-sae}, %zmm13, %zmm12, %zmm2 + +/* Table lookup: 2^(exponent%3) */ + vpermps __svml_scbrt_data_internal_avx512(%rip), %zmm6, %zmm1 + vpermps etbl_L+__svml_scbrt_data_internal_avx512(%rip), %zmm6, %zmm6 + +/* Sh*R */ + vmulps {rn-sae}, %zmm15, %zmm1, %zmm14 + +/* Sl + (Sh*R)*Poly */ + vfmadd213ps {rn-sae}, %zmm6, %zmm7, %zmm14 + +/* + * branch-free + * scaled_Th*(Sh+Sl+Sh*R*Poly) + */ + vaddps {rn-sae}, %zmm1, %zmm14, %zmm15 + vmulps {rn-sae}, %zmm2, %zmm15, %zmm3 + vorps %zmm0, %zmm3, %zmm0 + ret + +END(_ZGVeN16v_cbrtf_skx) + + .section .rodata, "a" + .align 64 + +#ifdef __svml_scbrt_data_internal_avx512_typedef +typedef unsigned int VUINT32; +typedef struct { + __declspec(align(64)) VUINT32 etbl_H[16][1]; + __declspec(align(64)) VUINT32 etbl_L[16][1]; + __declspec(align(64)) VUINT32 cbrt_tbl_H[32][1]; + __declspec(align(64)) VUINT32 BiasL[16][1]; + __declspec(align(64)) VUINT32 SZero[16][1]; + __declspec(align(64)) VUINT32 OneThird[16][1]; + __declspec(align(64)) VUINT32 Bias3[16][1]; + __declspec(align(64)) VUINT32 Three[16][1]; + __declspec(align(64)) VUINT32 One[16][1]; + __declspec(align(64)) VUINT32 poly_coeff3[16][1]; + __declspec(align(64)) VUINT32 poly_coeff2[16][1]; + __declspec(align(64)) VUINT32 poly_coeff1[16][1]; + } __svml_scbrt_data_internal_avx512; +#endif +__svml_scbrt_data_internal_avx512: + /*== etbl_H ==*/ + .long 0x3f800000 + .long 0x3fa14518 + .long 0x3fcb2ff5 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + /*== etbl_L ==*/ + .align 64 + .long 0x00000000 + .long 0xb2ce51af + .long 0x32a7adc8 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + /*== cbrt_tbl_H ==*/ + .align 64 + .long 0x3fa14518 + .long 0x3f9e0b2b + .long 0x3f9b0f9b + .long 0x3f984a9a + .long 0x3f95b5af + .long 0x3f934b6c + .long 0x3f910737 + .long 0x3f8ee526 + .long 0x3f8ce1da + .long 0x3f8afa6a + .long 0x3f892c4e + .long 0x3f87754e + .long 0x3f85d377 + .long 0x3f844510 + .long 0x3f82c892 + .long 0x3f815c9f + .long 0x3f800000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + /*== BiasL ==*/ + .align 64 + .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 + /*== Zero ==*/ + .align 64 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 + /*== OneThird ==*/ + .align 64 + .long 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab + /*== Bias3 ==*/ + .align 64 + .long 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000 + /*== Three ==*/ + .align 64 + .long 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000 + /*==One ==*/ + .align 64 + .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 + /*== poly_coeff3 ==*/ + .align 64 + .long 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c + /*== poly_coeff2 ==*/ + .align 64 + .long 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363 + /*== poly_coeff1 ==*/ + .align 64 + .long 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa + .align 64 + .type __svml_scbrt_data_internal_avx512,@object + .size __svml_scbrt_data_internal_avx512,.-__svml_scbrt_data_internal_avx512 |