diff options
author | Sunil K Pandey <skpgkp2@gmail.com> | 2022-03-07 10:47:10 -0800 |
---|---|---|
committer | Sunil K Pandey <skpgkp2@gmail.com> | 2022-03-07 21:14:10 -0800 |
commit | 5e837461dcbbe25153db3b8437ac4c0030292b51 (patch) | |
tree | 61854571d05b6cc95708bb4cd122448a5cbeafda /sysdeps/x86_64/fpu | |
parent | 994266f5019560f26e8d07be7fdf8621903339a1 (diff) | |
download | glibc-5e837461dcbbe25153db3b8437ac4c0030292b51.tar.gz glibc-5e837461dcbbe25153db3b8437ac4c0030292b51.tar.xz glibc-5e837461dcbbe25153db3b8437ac4c0030292b51.zip |
x86_64: Fix svml_s_cbrtf16_core_avx512.S code formatting
This commit contains the following formatting changes: 1. Instructions preceded by a tab. 2. Instructions less than 8 characters in length have a tab between them and the first operand. 3. Instructions greater than 7 characters in length have a space between them and the first operand. 4. Tabs after `#define`d names and their values. 5. 8 spaces at the beginning of a line replaced by a tab. 6. Indent comments with code. 7. Remove redundant .text section. 8. 1 space between line content and line comment. 9. Space after all commas. Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
Diffstat (limited to 'sysdeps/x86_64/fpu')
-rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf16_core_avx512.S | 377 |
1 files changed, 188 insertions, 189 deletions
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf16_core_avx512.S index 9cf7918019..ce10cf177b 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf16_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf16_core_avx512.S @@ -31,205 +31,204 @@ /* Offsets for data table __svml_scbrt_data_internal_avx512 */ -#define etbl_H 0 -#define etbl_L 64 -#define cbrt_tbl_H 128 -#define BiasL 256 -#define SZero 320 -#define OneThird 384 -#define Bias3 448 -#define Three 512 -#define One 576 -#define poly_coeff3 640 -#define poly_coeff2 704 -#define poly_coeff1 768 +#define etbl_H 0 +#define etbl_L 64 +#define cbrt_tbl_H 128 +#define BiasL 256 +#define SZero 320 +#define OneThird 384 +#define Bias3 448 +#define Three 512 +#define One 576 +#define poly_coeff3 640 +#define poly_coeff2 704 +#define poly_coeff1 768 #include <sysdep.h> - .text - .section .text.exex512,"ax",@progbits + .section .text.exex512, "ax", @progbits ENTRY(_ZGVeN16v_cbrtf_skx) - vgetmantps $0, {sae}, %zmm0, %zmm8 - -/* GetExp(x) */ - vgetexpps {sae}, %zmm0, %zmm1 - vmovups BiasL+__svml_scbrt_data_internal_avx512(%rip), %zmm2 - -/* exponent/3 */ - vmovups OneThird+__svml_scbrt_data_internal_avx512(%rip), %zmm3 - vmovups Bias3+__svml_scbrt_data_internal_avx512(%rip), %zmm4 - vmovups One+__svml_scbrt_data_internal_avx512(%rip), %zmm15 - -/* exponent%3 (to be used as index) */ - vmovups Three+__svml_scbrt_data_internal_avx512(%rip), %zmm5 - -/* polynomial */ - vmovups poly_coeff3+__svml_scbrt_data_internal_avx512(%rip), %zmm11 - vmovups poly_coeff1+__svml_scbrt_data_internal_avx512(%rip), %zmm14 - -/* Table lookup */ - vmovups cbrt_tbl_H+__svml_scbrt_data_internal_avx512(%rip), %zmm12 - -/* DblRcp ~ 1/Mantissa */ - vrcp14ps %zmm8, %zmm7 - vaddps {rn-sae}, %zmm2, %zmm1, %zmm6 - vandps SZero+__svml_scbrt_data_internal_avx512(%rip), %zmm0, %zmm0 - -/* round DblRcp to 3 fractional bits (RN mode, no Precision exception) */ - vrndscaleps $88, 
{sae}, %zmm7, %zmm9 - vfmsub231ps {rn-sae}, %zmm6, %zmm3, %zmm4 - vmovups poly_coeff2+__svml_scbrt_data_internal_avx512(%rip), %zmm7 - -/* Reduced argument: R = DblRcp*Mantissa - 1 */ - vfmsub231ps {rn-sae}, %zmm9, %zmm8, %zmm15 - vrndscaleps $9, {sae}, %zmm4, %zmm13 - -/* Prepare table index */ - vpsrld $19, %zmm9, %zmm10 - vfmadd231ps {rn-sae}, %zmm15, %zmm11, %zmm7 - vfnmadd231ps {rn-sae}, %zmm13, %zmm5, %zmm6 - vpermt2ps cbrt_tbl_H+64+__svml_scbrt_data_internal_avx512(%rip), %zmm10, %zmm12 - vfmadd213ps {rn-sae}, %zmm14, %zmm15, %zmm7 - vscalefps {rn-sae}, %zmm13, %zmm12, %zmm2 - -/* Table lookup: 2^(exponent%3) */ - vpermps __svml_scbrt_data_internal_avx512(%rip), %zmm6, %zmm1 - vpermps etbl_L+__svml_scbrt_data_internal_avx512(%rip), %zmm6, %zmm6 - -/* Sh*R */ - vmulps {rn-sae}, %zmm15, %zmm1, %zmm14 - -/* Sl + (Sh*R)*Poly */ - vfmadd213ps {rn-sae}, %zmm6, %zmm7, %zmm14 - -/* - * branch-free - * scaled_Th*(Sh+Sl+Sh*R*Poly) - */ - vaddps {rn-sae}, %zmm1, %zmm14, %zmm15 - vmulps {rn-sae}, %zmm2, %zmm15, %zmm3 - vorps %zmm0, %zmm3, %zmm0 - ret + vgetmantps $0, {sae}, %zmm0, %zmm8 + + /* GetExp(x) */ + vgetexpps {sae}, %zmm0, %zmm1 + vmovups BiasL+__svml_scbrt_data_internal_avx512(%rip), %zmm2 + + /* exponent/3 */ + vmovups OneThird+__svml_scbrt_data_internal_avx512(%rip), %zmm3 + vmovups Bias3+__svml_scbrt_data_internal_avx512(%rip), %zmm4 + vmovups One+__svml_scbrt_data_internal_avx512(%rip), %zmm15 + + /* exponent%3 (to be used as index) */ + vmovups Three+__svml_scbrt_data_internal_avx512(%rip), %zmm5 + + /* polynomial */ + vmovups poly_coeff3+__svml_scbrt_data_internal_avx512(%rip), %zmm11 + vmovups poly_coeff1+__svml_scbrt_data_internal_avx512(%rip), %zmm14 + + /* Table lookup */ + vmovups cbrt_tbl_H+__svml_scbrt_data_internal_avx512(%rip), %zmm12 + + /* DblRcp ~ 1/Mantissa */ + vrcp14ps %zmm8, %zmm7 + vaddps {rn-sae}, %zmm2, %zmm1, %zmm6 + vandps SZero+__svml_scbrt_data_internal_avx512(%rip), %zmm0, %zmm0 + + /* round DblRcp to 3 fractional bits (RN mode, 
no Precision exception) */ + vrndscaleps $88, {sae}, %zmm7, %zmm9 + vfmsub231ps {rn-sae}, %zmm6, %zmm3, %zmm4 + vmovups poly_coeff2+__svml_scbrt_data_internal_avx512(%rip), %zmm7 + + /* Reduced argument: R = DblRcp*Mantissa - 1 */ + vfmsub231ps {rn-sae}, %zmm9, %zmm8, %zmm15 + vrndscaleps $9, {sae}, %zmm4, %zmm13 + + /* Prepare table index */ + vpsrld $19, %zmm9, %zmm10 + vfmadd231ps {rn-sae}, %zmm15, %zmm11, %zmm7 + vfnmadd231ps {rn-sae}, %zmm13, %zmm5, %zmm6 + vpermt2ps cbrt_tbl_H+64+__svml_scbrt_data_internal_avx512(%rip), %zmm10, %zmm12 + vfmadd213ps {rn-sae}, %zmm14, %zmm15, %zmm7 + vscalefps {rn-sae}, %zmm13, %zmm12, %zmm2 + + /* Table lookup: 2^(exponent%3) */ + vpermps __svml_scbrt_data_internal_avx512(%rip), %zmm6, %zmm1 + vpermps etbl_L+__svml_scbrt_data_internal_avx512(%rip), %zmm6, %zmm6 + + /* Sh*R */ + vmulps {rn-sae}, %zmm15, %zmm1, %zmm14 + + /* Sl + (Sh*R)*Poly */ + vfmadd213ps {rn-sae}, %zmm6, %zmm7, %zmm14 + + /* + * branch-free + * scaled_Th*(Sh+Sl+Sh*R*Poly) + */ + vaddps {rn-sae}, %zmm1, %zmm14, %zmm15 + vmulps {rn-sae}, %zmm2, %zmm15, %zmm3 + vorps %zmm0, %zmm3, %zmm0 + ret END(_ZGVeN16v_cbrtf_skx) - .section .rodata, "a" - .align 64 + .section .rodata, "a" + .align 64 #ifdef __svml_scbrt_data_internal_avx512_typedef typedef unsigned int VUINT32; typedef struct { - __declspec(align(64)) VUINT32 etbl_H[16][1]; - __declspec(align(64)) VUINT32 etbl_L[16][1]; - __declspec(align(64)) VUINT32 cbrt_tbl_H[32][1]; - __declspec(align(64)) VUINT32 BiasL[16][1]; - __declspec(align(64)) VUINT32 SZero[16][1]; - __declspec(align(64)) VUINT32 OneThird[16][1]; - __declspec(align(64)) VUINT32 Bias3[16][1]; - __declspec(align(64)) VUINT32 Three[16][1]; - __declspec(align(64)) VUINT32 One[16][1]; - __declspec(align(64)) VUINT32 poly_coeff3[16][1]; - __declspec(align(64)) VUINT32 poly_coeff2[16][1]; - __declspec(align(64)) VUINT32 poly_coeff1[16][1]; - } __svml_scbrt_data_internal_avx512; + __declspec(align(64)) VUINT32 etbl_H[16][1]; + __declspec(align(64)) 
VUINT32 etbl_L[16][1]; + __declspec(align(64)) VUINT32 cbrt_tbl_H[32][1]; + __declspec(align(64)) VUINT32 BiasL[16][1]; + __declspec(align(64)) VUINT32 SZero[16][1]; + __declspec(align(64)) VUINT32 OneThird[16][1]; + __declspec(align(64)) VUINT32 Bias3[16][1]; + __declspec(align(64)) VUINT32 Three[16][1]; + __declspec(align(64)) VUINT32 One[16][1]; + __declspec(align(64)) VUINT32 poly_coeff3[16][1]; + __declspec(align(64)) VUINT32 poly_coeff2[16][1]; + __declspec(align(64)) VUINT32 poly_coeff1[16][1]; +} __svml_scbrt_data_internal_avx512; #endif __svml_scbrt_data_internal_avx512: - /*== etbl_H ==*/ - .long 0x3f800000 - .long 0x3fa14518 - .long 0x3fcb2ff5 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - /*== etbl_L ==*/ - .align 64 - .long 0x00000000 - .long 0xb2ce51af - .long 0x32a7adc8 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - /*== cbrt_tbl_H ==*/ - .align 64 - .long 0x3fa14518 - .long 0x3f9e0b2b - .long 0x3f9b0f9b - .long 0x3f984a9a - .long 0x3f95b5af - .long 0x3f934b6c - .long 0x3f910737 - .long 0x3f8ee526 - .long 0x3f8ce1da - .long 0x3f8afa6a - .long 0x3f892c4e - .long 0x3f87754e - .long 0x3f85d377 - .long 0x3f844510 - .long 0x3f82c892 - .long 0x3f815c9f - .long 0x3f800000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - .long 0x00000000 - /*== BiasL ==*/ - .align 64 - .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 
0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 - /*== Zero ==*/ - .align 64 - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 - /*== OneThird ==*/ - .align 64 - .long 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab - /*== Bias3 ==*/ - .align 64 - .long 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000 - /*== Three ==*/ - .align 64 - .long 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000 - /*==One ==*/ - .align 64 - .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 - /*== poly_coeff3 ==*/ - .align 64 - .long 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c - /*== poly_coeff2 ==*/ - .align 64 - .long 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363 - /*== poly_coeff1 ==*/ - .align 64 - .long 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa - .align 64 - .type 
__svml_scbrt_data_internal_avx512,@object - .size __svml_scbrt_data_internal_avx512,.-__svml_scbrt_data_internal_avx512 + /* etbl_H */ + .long 0x3f800000 + .long 0x3fa14518 + .long 0x3fcb2ff5 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + /* etbl_L */ + .align 64 + .long 0x00000000 + .long 0xb2ce51af + .long 0x32a7adc8 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + /* cbrt_tbl_H */ + .align 64 + .long 0x3fa14518 + .long 0x3f9e0b2b + .long 0x3f9b0f9b + .long 0x3f984a9a + .long 0x3f95b5af + .long 0x3f934b6c + .long 0x3f910737 + .long 0x3f8ee526 + .long 0x3f8ce1da + .long 0x3f8afa6a + .long 0x3f892c4e + .long 0x3f87754e + .long 0x3f85d377 + .long 0x3f844510 + .long 0x3f82c892 + .long 0x3f815c9f + .long 0x3f800000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + .long 0x00000000 + /* BiasL */ + .align 64 + .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 + /* Zero */ + .align 64 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 + /* OneThird */ + .align 64 + .long 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 
0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab + /* Bias3 */ + .align 64 + .long 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000 + /* Three */ + .align 64 + .long 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000 + /* One */ + .align 64 + .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 + /* poly_coeff3 */ + .align 64 + .long 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c + /* poly_coeff2 */ + .align 64 + .long 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363 + /* poly_coeff1 */ + .align 64 + .long 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa + .align 64 + .type __svml_scbrt_data_internal_avx512, @object + .size __svml_scbrt_data_internal_avx512, .-__svml_scbrt_data_internal_avx512 |