/* Function acos vectorized with AVX-512. Copyright (C) 2021-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, see https://www.gnu.org/licenses/. */ /* * ALGORITHM DESCRIPTION: * * SelMask = (|x| >= 0.5) ? 1 : 0; * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x| * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R)) * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|) * */ /* Offsets for data table __svml_dacos_data_internal */ #define SgnBit 0 #define OneHalf 64 #define SmallNorm 128 #define MOne 192 #define Two 256 #define sqrt_coeff_1 320 #define sqrt_coeff_2 384 #define sqrt_coeff_3 448 #define sqrt_coeff_4 512 #define poly_coeff_1 576 #define poly_coeff_2 640 #define poly_coeff_3 704 #define poly_coeff_4 768 #define poly_coeff_5 832 #define poly_coeff_6 896 #define poly_coeff_7 960 #define poly_coeff_8 1024 #define poly_coeff_9 1088 #define poly_coeff_10 1152 #define poly_coeff_11 1216 #define poly_coeff_12 1280 #define PiH 1344 #define Pi2H 1408 #include .section .text.evex512, "ax", @progbits ENTRY(_ZGVeN8v_acos_skx) pushq %rbp cfi_def_cfa_offset(16) movq %rsp, %rbp cfi_def_cfa(6, 16) cfi_offset(6, -16) andq $-64, %rsp subq $192, %rsp vmovups __svml_dacos_data_internal(%rip), %zmm7 vmovups OneHalf+__svml_dacos_data_internal(%rip), %zmm8 /* S ~ 2*sqrt(Y) */ vmovups SmallNorm+__svml_dacos_data_internal(%rip), %zmm11 vmovups Two+__svml_dacos_data_internal(%rip), %zmm14 vmovups sqrt_coeff_1+__svml_dacos_data_internal(%rip), %zmm15 vmovups sqrt_coeff_2+__svml_dacos_data_internal(%rip), %zmm2 vmovups sqrt_coeff_3+__svml_dacos_data_internal(%rip), %zmm1 vmovups MOne+__svml_dacos_data_internal(%rip), %zmm10 vmovaps %zmm0, %zmm6 /* x = -|arg| */ vorpd %zmm6, %zmm7, %zmm5 vandpd %zmm6, %zmm7, %zmm4 /* Y = 0.5 + 0.5*(-x) */ vfmadd231pd {rn-sae}, %zmm5, %zmm8, %zmm8 /* x^2 */ vmulpd {rn-sae}, %zmm5, %zmm5, %zmm9 vrsqrt14pd %zmm8, %zmm12 vcmppd $17, {sae}, %zmm11, %zmm8, %k1 vcmppd $17, {sae}, %zmm10, %zmm5, %k0 vmovups poly_coeff_5+__svml_dacos_data_internal(%rip), %zmm10 vmovups poly_coeff_7+__svml_dacos_data_internal(%rip), %zmm11 vminpd {sae}, %zmm8, %zmm9, %zmm3 vmovups poly_coeff_3+__svml_dacos_data_internal(%rip), %zmm9 vxorpd %zmm12, %zmm12, %zmm12{%k1} vaddpd {rn-sae}, %zmm8, %zmm8, %zmm0 vcmppd $21, {sae}, %zmm8, %zmm3, %k4 /* X