/* Function acos vectorized with SSE4. Copyright (C) 2021-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, see https://www.gnu.org/licenses/. */ /* * ALGORITHM DESCRIPTION: * * SelMask = (|x| >= 0.5) ? 1 : 0; * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x| * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R)) * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|) * */ /* Offsets for data table __svml_dacos_data_internal */ #define SgnBit 0 #define OneHalf 16 #define SmallNorm 32 #define MOne 48 #define Two 64 #define sqrt_coeff 80 #define poly_coeff 144 #define PiH 336 #define Pi2H 352 #include .section .text.sse4, "ax", @progbits ENTRY(_ZGVbN2v_acos_sse4) subq $72, %rsp cfi_def_cfa_offset(80) movaps %xmm0, %xmm5 movups __svml_dacos_data_internal(%rip), %xmm3 movups OneHalf+__svml_dacos_data_internal(%rip), %xmm6 /* x = -|arg| */ movaps %xmm3, %xmm4 orps %xmm5, %xmm4 /* Y = 0.5 + 0.5*(-x) */ movaps %xmm6, %xmm7 mulpd %xmm4, %xmm7 addpd %xmm7, %xmm6 /* S ~ 2*sqrt(Y) */ cvtpd2ps %xmm6, %xmm9 movlhps %xmm9, %xmm9 /* x^2 */ movaps %xmm4, %xmm0 rsqrtps %xmm9, %xmm10 mulpd %xmm4, %xmm0 cvtps2pd %xmm10, %xmm11 minpd %xmm6, %xmm0 movaps %xmm6, %xmm1 movaps %xmm0, %xmm2 cmpltpd SmallNorm+__svml_dacos_data_internal(%rip), %xmm1 cmpnltpd %xmm6, %xmm2 addpd %xmm6, %xmm6 andnps %xmm11, %xmm1 movaps %xmm0, %xmm11 movaps %xmm1, %xmm12 andps %xmm5, %xmm3 mulpd %xmm1, %xmm12 mulpd %xmm6, %xmm1 mulpd %xmm12, %xmm6 mulpd %xmm0, %xmm11 subpd Two+__svml_dacos_data_internal(%rip), %xmm6 movups sqrt_coeff+__svml_dacos_data_internal(%rip), %xmm13 movaps %xmm6, %xmm14 mulpd %xmm6, %xmm13 mulpd %xmm1, %xmm14 addpd sqrt_coeff+16+__svml_dacos_data_internal(%rip), %xmm13 mulpd %xmm6, %xmm13 addpd sqrt_coeff+32+__svml_dacos_data_internal(%rip), %xmm13 mulpd %xmm13, %xmm6 /* polynomial */ movups poly_coeff+__svml_dacos_data_internal(%rip), %xmm15 movaps %xmm11, %xmm7 mulpd %xmm0, %xmm15 addpd sqrt_coeff+48+__svml_dacos_data_internal(%rip), %xmm6 addpd poly_coeff+16+__svml_dacos_data_internal(%rip), %xmm15 mulpd %xmm11, %xmm7 mulpd %xmm6, %xmm14 mulpd %xmm11, %xmm15 subpd %xmm14, %xmm1 movups MOne+__svml_dacos_data_internal(%rip), %xmm8 andps %xmm2, %xmm1 /* NaN processed in special branch (so wind test passed) */ cmpnlepd %xmm4, %xmm8 movmskpd %xmm8, %edx /* X