/* Function acosf vectorized with AVX2.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *      SelMask = (|x| >= 0.5) ? 1 : 0;
 *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
 *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
 *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
 *
 */

/* Offsets for data table __svml_sacos_data_internal.
   The code below loads entries as 32-byte (%ymm) vectors.  Consecutive
   offsets are 32 bytes apart, except sqrt_coeff (64 bytes to the next
   entry, i.e. room for 2 vectors) and poly_coeff (160 bytes, i.e. room
   for 5 vectors).  */
#define SgnBit				0
#define OneHalf				32
#define SmallNorm			64
#define MOne				96
#define Two				128
#define sqrt_coeff			160
#define poly_coeff			224
#define Pi2H				384
#define PiH				416

/* glibc's sysdep.h supplies the ENTRY and cfi_* macros used below.  */
#include <sysdep.h>

	.section .text.avx2, "ax", @progbits

/* _ZGVdN8v_acosf_avx2
   Vector-ABI name: 'd' = AVX2, 'N' = unmasked, '8' = eight lanes,
   'v' = one vector argument.

   Syntax:  GNU as, AT&T operand order (sources first, destination last).
   In:      %ymm0 = eight single-precision arguments (copied to %ymm5
	    before %ymm0 is reused as a scratch register).
   Stack:   %rbp is saved and used as frame pointer; %rsp is aligned down
	    to 32 bytes and 96 bytes are reserved (their use is not visible
	    in this part of the function).

   Register roles established by the instructions below:
     %ymm5  original argument
     %ymm6  SgnBit table entry
     %ymm4  arg | SgnBit  (= -|arg|)
     %ymm7  Y = 0.5 - 0.5*|arg|, later overwritten with %ymm2^2
     %ymm14 2*Y
     %ymm2  min(arg^2, Y)
     %ymm1  lanes where Y is the selected minimum (SelMask above)
     %ymm10 lanes where Y < SmallNorm
     %ymm11 approximate 1/sqrt(Y); %ymm12 = same with small-Y lanes zeroed
     %ymm13 %ymm12^2
     %ymm8, %ymm9 running polynomial terms in %ymm2
     %edx   one bit per lane: !(-|arg| >= MOne), including NaN lanes.  */
ENTRY(_ZGVdN8v_acosf_avx2)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-32, %rsp		/* 32-byte align the frame.  */
	subq	$96, %rsp

	/*
	 * 2*sqrt(X) ~ Sh - Sl  (to 24+ bits)
	 * SQ ~ 2*sqrt(X)
	 */
	/* Table offset 0 is the SgnBit entry.  */
	vmovups	__svml_sacos_data_internal(%rip), %ymm6
	vmovups	OneHalf+__svml_sacos_data_internal(%rip), %ymm7
	/* Keep the argument; %ymm0 is overwritten further down.  */
	vmovaps	%ymm0, %ymm5

	/* x = -|arg|  (force the sign bit on).  */
	vorps	%ymm5, %ymm6, %ymm4

	/* Y = 0.5 + 0.5*(-|arg|) = 0.5 - 0.5*|arg|
	   (%ymm7 = %ymm7 * %ymm4 + %ymm7, with %ymm7 = OneHalf).  */
	vfmadd231ps %ymm4, %ymm7, %ymm7

	/* x^2 */
	vmulps	%ymm4, %ymm4, %ymm8

	/* SQ ~ 2*sqrt(Y) */
	vmovups	sqrt_coeff+__svml_sacos_data_internal(%rip), %ymm0
	/* %ymm9 = !(-|arg| >= MOne); the unordered predicate also sets
	   NaN lanes.  */
	vcmpnge_uqps MOne+__svml_sacos_data_internal(%rip), %ymm4, %ymm9
	/* %ymm10 = (Y < SmallNorm), ordered compare.  */
	vcmplt_oqps SmallNorm+__svml_sacos_data_internal(%rip), %ymm7, %ymm10
	/* %ymm2 = min(x^2, Y).  */
	vminps	%ymm7, %ymm8, %ymm2
	/* %ymm14 = 2*Y.  */
	vaddps	%ymm7, %ymm7, %ymm14
	/* %ymm11 ~ 1/sqrt(Y) (hardware approximation).  */
	vrsqrtps %ymm7, %ymm11
	vmovups	poly_coeff+64+__svml_sacos_data_internal(%rip), %ymm8
	/* %ymm1 = !(min(x^2, Y) < Y): lanes where Y <= x^2, which is
	   equivalent to |arg| >= 0.5 (SelMask), plus unordered lanes.  */
	vcmpnlt_uqps %ymm7, %ymm2, %ymm1
	/* %ymm7 = %ymm2^2; Y itself is no longer needed (2*Y is in %ymm14).  */
	vmulps	%ymm2, %ymm2, %ymm7
	/* %ymm8 = %ymm2 * %ymm8 + [poly_coeff+96].  */
	vfmadd213ps poly_coeff+96+__svml_sacos_data_internal(%rip), %ymm2, %ymm8
	/* Pack the per-lane range mask into %edx before %ymm9 is reused.  */
	vmovmskps %ymm9, %edx

	/* polynomial */
	vmovups	poly_coeff+__svml_sacos_data_internal(%rip), %ymm9
	/* %ymm12 = ~%ymm10 & %ymm11: rsqrt estimate, zeroed where
	   Y < SmallNorm.  */
	vandnps	%ymm11, %ymm10, %ymm12
	vmulps	%ymm12, %ymm12, %ymm13
	/* %ymm9 = %ymm2 * %ymm9 + [poly_coeff+32].  */
	vfmadd213ps poly_coeff+32+__svml_sacos_data_internal(%rip), %ymm2, %ymm9

	/* X