/* Function erff vectorized with AVX-512.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   erf(x) is computed as higher precision simple polynomial
 *   with no lookup table:
 *
 *     R = P0 + x^2*(P1 + x^2*(P2 + .... x^2*P12));
 *     erf(x) = R * R * x;
 *
 *   The polynomial is evaluated in double precision: the 16 float
 *   inputs are split into two groups of 8 doubles and both groups
 *   run the same Horner chain.  Lanes whose x^2 (computed in single
 *   precision) is not below _gf_MaxThreshold_LA receive 1.0 with the
 *   sign of x instead of the polynomial result.
 *
 *   Special cases:
 *
 *   erf(0)    = 0
 *   erf(+INF) = +1
 *   erf(-INF) = -1
 *   erf(QNaN) = QNaN
 *   erf(SNaN) = QNaN
 *
 */

/* Offsets for data table __svml_serf_data_internal.
   Every entry is one 64-byte vector: the first three hold 16 x 32-bit
   values, the polynomial entries hold 8 x 64-bit values.  */
#define _AbsMask			0
#define _One				64
#define _gf_MaxThreshold_LA		128
#define _gf_la_poly_0			192
#define _gf_la_poly_1			256
#define _gf_la_poly_2			320
#define _gf_la_poly_3			384
#define _gf_la_poly_4			448
#define _gf_la_poly_5			512
#define _gf_la_poly_6			576
#define _gf_la_poly_7			640
#define _gf_la_poly_8			704
#define _gf_la_poly_9			768
#define _gf_la_poly_10			832
#define _gf_la_poly_11			896
#define _gf_la_poly_12			960

/* Provides the ENTRY/END macros used below.  */
#include <sysdep.h>

	.text
	.section .text.evex512,"ax",@progbits

/* _ZGVeN16v_erff_skx: 16-lane single-precision erf.

   In:	    %zmm0 = 16 packed float arguments x.
   Out:	    %zmm0 = 16 packed float results.
   Clobbers: %zmm1-%zmm15, %k1.
   Leaf routine: makes no calls and does not touch the stack.  All
   constants are read RIP-relative from __svml_serf_data_internal.

   Register roles:
     %zmm8	  x, preserved for the whole routine
     %zmm11	  x^2 in single precision (range test only)
     %zmm12	  x^2 of lanes 0-7 as doubles, finally R for lanes 0-7
     %zmm13	  x^2 of lanes 8-15 as doubles, finally R for lanes 8-15
     %zmm14	  Horner accumulator for lanes 0-7
     others	  coefficients; the lanes 8-15 accumulator moves from
		  one coefficient register to the next at each step
		  (vfmadd231pd writes into the coefficient operand).  */
ENTRY(_ZGVeN16v_erff_skx)
	vmovaps	%zmm0, %zmm8
	/* zmm11 = x * x (single precision).  */
	vmulps	{rn-sae}, %zmm8, %zmm8, %zmm11

	/* Preload coefficients P12 .. P2.  */
	vmovups	_gf_la_poly_11+__svml_serf_data_internal(%rip), %zmm15
	vmovups	_gf_la_poly_12+__svml_serf_data_internal(%rip), %zmm10
	vmovups	_gf_la_poly_10+__svml_serf_data_internal(%rip), %zmm9
	vmovups	_gf_la_poly_9+__svml_serf_data_internal(%rip), %zmm7
	vmovups	_gf_la_poly_8+__svml_serf_data_internal(%rip), %zmm0
	vmovups	_gf_la_poly_7+__svml_serf_data_internal(%rip), %zmm1
	vmovups	_gf_la_poly_6+__svml_serf_data_internal(%rip), %zmm2
	vmovups	_gf_la_poly_5+__svml_serf_data_internal(%rip), %zmm3
	vmovups	_gf_la_poly_4+__svml_serf_data_internal(%rip), %zmm4
	vmovups	_gf_la_poly_3+__svml_serf_data_internal(%rip), %zmm5
	vmovups	_gf_la_poly_2+__svml_serf_data_internal(%rip), %zmm6

	/* Widen x to double: lanes 0-7 -> zmm12, lanes 8-15 -> zmm14,
	   then square: zmm12 = x^2 (lanes 0-7), zmm13 = x^2 (lanes 8-15).  */
	vextractf32x8	$1, %zmm8, %ymm13
	vcvtps2pd	{sae}, %ymm8, %zmm12
	vcvtps2pd	{sae}, %ymm13, %zmm14
	vmulpd	{rn-sae}, %zmm12, %zmm12, %zmm12
	vmulpd	{rn-sae}, %zmm14, %zmm14, %zmm13

	/* R = P0 + x^2*(P1 + x^2*(P2 + .... x^2*P12));  */
	/* First step: P11 + x^2*P12 (zmm14 for lanes 0-7, zmm15 for 8-15).  */
	vmovaps	%zmm15, %zmm14
	vfmadd231pd	{rn-sae}, %zmm12, %zmm10, %zmm14
	vfmadd231pd	{rn-sae}, %zmm13, %zmm10, %zmm15
	/* zmm10 (P12) is dead now; reuse it for P1.  */
	vmovups	_gf_la_poly_1+__svml_serf_data_internal(%rip), %zmm10
	/* Each pair below adds the next coefficient.  The lanes 0-7 step
	   must read the coefficient register before the lanes 8-15 step
	   overwrites it.  */
	vfmadd213pd	{rn-sae}, %zmm9, %zmm12, %zmm14		/* + P10 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm15, %zmm9
	vfmadd213pd	{rn-sae}, %zmm7, %zmm12, %zmm14		/* + P9 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm9, %zmm7
	vfmadd213pd	{rn-sae}, %zmm0, %zmm12, %zmm14		/* + P8 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm7, %zmm0
	/* zmm7 is free again: load the range threshold.  */
	vmovups	_gf_MaxThreshold_LA+__svml_serf_data_internal(%rip), %zmm7
	vfmadd213pd	{rn-sae}, %zmm1, %zmm12, %zmm14		/* + P7 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm0, %zmm1
	/* zmm0 is free again: load P0.  */
	vmovups	_gf_la_poly_0+__svml_serf_data_internal(%rip), %zmm0
	/* Predicate 22 is NLE_UQ: k1 is set where !(threshold <= x^2),
	   i.e. x^2 < threshold or x is NaN.  Those lanes take the
	   polynomial result.  */
	vcmpps	$22, {sae}, %zmm11, %zmm7, %k1
	vfmadd213pd	{rn-sae}, %zmm2, %zmm12, %zmm14		/* + P6 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm1, %zmm2
	vfmadd213pd	{rn-sae}, %zmm3, %zmm12, %zmm14		/* + P5 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm2, %zmm3
	vfmadd213pd	{rn-sae}, %zmm4, %zmm12, %zmm14		/* + P4 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm3, %zmm4
	vfmadd213pd	{rn-sae}, %zmm5, %zmm12, %zmm14		/* + P3 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm4, %zmm5
	vfmadd213pd	{rn-sae}, %zmm6, %zmm12, %zmm14		/* + P2 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm5, %zmm6
	/* zmm5 is free again: load the absolute-value mask.  */
	vmovups	_AbsMask+__svml_serf_data_internal(%rip), %zmm5
	vfmadd213pd	{rn-sae}, %zmm10, %zmm12, %zmm14	/* + P1 */
	vfmadd231pd	{rn-sae}, %zmm13, %zmm6, %zmm10
	/* zmm6 = x & ~AbsMask = sign bit of x.  */
	vandnps	%zmm8, %zmm5, %zmm6
	/* Last step adds P0: zmm12 = R (lanes 0-7), zmm13 = R (lanes 8-15).  */
	vfmadd213pd	{rn-sae}, %zmm0, %zmm14, %zmm12
	vfmadd213pd	{rn-sae}, %zmm0, %zmm10, %zmm13
	/* zmm0 = sign(x) | 1.0f: the result for lanes outside k1.  */
	vorps	_One+__svml_serf_data_internal(%rip), %zmm6, %zmm0
	/* R * R in double, narrowed to float and merged into zmm9.  */
	vmulpd	{rn-sae}, %zmm12, %zmm12, %zmm1
	vmulpd	{rn-sae}, %zmm13, %zmm13, %zmm3
	vcvtpd2ps	{rn-sae}, %zmm1, %ymm2
	vcvtpd2ps	{rn-sae}, %zmm3, %ymm4
	vinsertf32x8	$1, %ymm4, %zmm2, %zmm9

	/* erf(x) = R * R * x;  written only to the lanes selected by k1,
	   the remaining lanes keep the signed 1.0 already in zmm0.  */
	vmulps	{rn-sae}, %zmm8, %zmm9, %zmm0{%k1}
	ret

END(_ZGVeN16v_erff_skx)

	.section .rodata, "a"
	.align 64

#ifdef __svml_serf_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 _AbsMask[16][1];
	__declspec(align(64)) VUINT32 _One[16][1];
	__declspec(align(64)) VUINT32 _gf_MaxThreshold_LA[16][1];
	__declspec(align(64)) VUINT32 _gf_la_poly_0[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_1[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_2[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_3[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_4[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_5[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_6[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_7[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_8[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_9[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_10[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_11[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_12[8][2];
} __svml_serf_data_internal;
#endif

/* Constant table.  The order and 64-byte spacing of the entries must
   match the offset #defines at the top of the file.  */
__svml_serf_data_internal:
	/* Single-precision constants, 16 lanes each.  */
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _AbsMask */
	.align 64
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 /* _One */
	.align 64
	.long	0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a /* _gf_MaxThreshold_LA */
	.align 64
	/* Double-precision polynomial coefficients P0 .. P12, 8 lanes each.  */
	.quad	0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903 /* _gf_la_poly_0 */
	.align 64
	.quad	0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367 /* _gf_la_poly_1 */
	.align 64
	.quad	0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b /* _gf_la_poly_2 */
	.align 64
	.quad	0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc /* _gf_la_poly_3 */
	.align 64
	.quad	0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392 /* _gf_la_poly_4 */
	.align 64
	.quad	0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede /* _gf_la_poly_5 */
	.align 64
	.quad	0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0 /* _gf_la_poly_6 */
	.align 64
	.quad	0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f /* _gf_la_poly_7 */
	.align 64
	.quad	0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523 /* _gf_la_poly_8 */
	.align 64
	.quad	0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47 /* _gf_la_poly_9 */
	.align 64
	.quad	0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03 /* _gf_la_poly_10 */
	.align 64
	.quad	0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb /* _gf_la_poly_11 */
	.align 64
	.quad	0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1 /* _gf_la_poly_12 */
	.align 64
	.type	__svml_serf_data_internal,@object
	.size	__svml_serf_data_internal,.-__svml_serf_data_internal