/* Function acosf vectorized with AVX-512. Copyright (C) 2021-2023 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, see https://www.gnu.org/licenses/. */ /* * ALGORITHM DESCRIPTION: * * SelMask = (|x| >= 0.5) ? 1 : 0; * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x| * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R)) * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|) * * */ /* Offsets for data table __svml_sacos_data_internal */ #define SgnBit 0 #define OneHalf 64 #define SmallNorm 128 #define MOne 192 #define Two 256 #define sqrt_coeff_1 320 #define sqrt_coeff_2 384 #define poly_coeff_1 448 #define poly_coeff_2 512 #define poly_coeff_3 576 #define poly_coeff_4 640 #define poly_coeff_5 704 #define Pi2H 768 #define PiH 832 #include .section .text.evex512, "ax", @progbits ENTRY(_ZGVeN16v_acosf_skx) pushq %rbp cfi_def_cfa_offset(16) movq %rsp, %rbp cfi_def_cfa(6, 16) cfi_offset(6, -16) andq $-64, %rsp subq $192, %rsp vmovups __svml_sacos_data_internal(%rip), %zmm5 vmovups OneHalf+__svml_sacos_data_internal(%rip), %zmm6 /* SQ ~ 2*sqrt(Y) */ vmovups SmallNorm+__svml_sacos_data_internal(%rip), %zmm9 vmovups MOne+__svml_sacos_data_internal(%rip), %zmm8 vmovups Two+__svml_sacos_data_internal(%rip), %zmm12 vmovups sqrt_coeff_1+__svml_sacos_data_internal(%rip), %zmm13 vmovaps %zmm0, %zmm4 /* x = -|arg| */ vorps %zmm4, %zmm5, %zmm3 vandps %zmm4, %zmm5, %zmm2 vmovups sqrt_coeff_2+__svml_sacos_data_internal(%rip), %zmm0 /* Y = 0.5 + 0.5*(-x) */ vfmadd231ps {rn-sae}, %zmm3, %zmm6, %zmm6 /* x^2 */ vmulps {rn-sae}, %zmm3, %zmm3, %zmm7 vrsqrt14ps %zmm6, %zmm10 vcmpps $17, {sae}, %zmm9, %zmm6, %k1 vcmpps $22, {sae}, %zmm3, %zmm8, %k0 vmovups poly_coeff_4+__svml_sacos_data_internal(%rip), %zmm9 vminps {sae}, %zmm6, %zmm7, %zmm1 vmovups poly_coeff_3+__svml_sacos_data_internal(%rip), %zmm7 vxorps %zmm10, %zmm10, %zmm10{%k1} vaddps {rn-sae}, %zmm6, %zmm6, %zmm14 vmulps {rn-sae}, %zmm1, %zmm1, %zmm8 vmulps {rn-sae}, %zmm10, %zmm10, %zmm11 vmulps {rn-sae}, %zmm10, %zmm14, %zmm5 vcmpps $21, {sae}, %zmm6, %zmm1, %k4 /* X