/* Function cosh vectorized with AVX2.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *      Compute cosh(x) as (exp(x)+exp(-x))/2,
 *      where exp is calculated as
 *      exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
 *
 *      Special cases:
 *
 *      cosh(NaN) = quiet NaN, and raises the invalid exception
 *      cosh(INF) = that INF
 *      cosh(0)   = 1
 *      cosh(x) overflows for big |x| and returns MAXLOG+log(2)
 *
 */

/* Offsets for data table __svml_dcosh_data_internal.  */
#define _dbT			0
#define _dbInvLn2		2080
#define _dbLn2hi		2112
#define _dbLn2lo		2144
#define _dbShifter		2176
#define _iIndexMask		2208
#define _dPC2			2240
#define _dPC3			2272
#define _dPC4			2304
#define _iMaxIndex		2336
#define _lExpMask		2368
#define _dSign			2400
#define _iDomainRange		2432

#include <sysdep.h>

	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN4v_cosh_avx2)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-32, %rsp
	subq	$96, %rsp
	lea	_dbT+__svml_dcosh_data_internal(%rip), %rax
	vmovupd	_dSign+__svml_dcosh_data_internal(%rip), %ymm8
	vmovupd	_dbShifter+__svml_dcosh_data_internal(%rip), %ymm6

	/*
	 * Load argument
	 * dM = x*2^K/log(2) + RShifter
	 */
	vmovupd	_dbInvLn2+__svml_dcosh_data_internal(%rip), %ymm3

	/*
	 * trick
	 * 256=-iIndex
	 */
	vmovups	_iMaxIndex+__svml_dcosh_data_internal(%rip), %xmm14

	/* dXSign=0x0010000000000000 */
	vpsrlq	$11, %ymm8, %ymm5
	vmovapd	%ymm0, %ymm7

	/* Abs argument */
	vandnpd	%ymm7, %ymm8, %ymm4
	vfmadd213pd %ymm6, %ymm4, %ymm3

	/* Index and lookup */
	vextractf128 $1, %ymm3, %xmm12
	vshufps	$136, %xmm12, %xmm3, %xmm13
	vpand	_iIndexMask+__svml_dcosh_data_internal(%rip), %xmm13, %xmm15
	vpsubd	%xmm15, %xmm14, %xmm0

	/* iDomainRange <<= 3 (scale the index to a byte offset) */
	vpslld	$3, %xmm0, %xmm2
	vmovd	%xmm2, %r9d
	vpextrd	$2, %xmm2, %r11d
	movslq	%r9d, %r9
	vpextrd	$1, %xmm2, %r10d
	movslq	%r11d, %r11
	movslq	%r10d, %r10
	vmovsd	(%rax, %r9), %xmm12

	/*
	 * Check for overflow/underflow
	 */
	vextractf128 $1, %ymm4, %xmm9
	vmovsd	(%rax, %r11), %xmm14
	vmovhpd	(%rax, %r10), %xmm12, %xmm13
	vshufps	$221, %xmm9, %xmm4, %xmm10

	/* iIndex <<= 3 (scale the index to a byte offset) */
	vpslld	$3, %xmm15, %xmm9

	/*
	 * R
	 * dN = dM - RShifter
	 */
	vsubpd	%ymm6, %ymm3, %ymm15
	vmovd	%xmm9, %ecx
	vpcmpgtd _iDomainRange+__svml_dcosh_data_internal(%rip), %xmm10, %xmm11
	vmovupd	_dbLn2hi+__svml_dcosh_data_internal(%rip), %ymm6
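	/* Reference model (illustrative, not part of the build): per
	   lane, the kernel above and below computes roughly the
	   following scalar C.  The constants inv_ln2, ln2_hi, ln2_lo,
	   shifter and the Taylor coefficients are stand-ins for the
	   __svml_dcosh_data_internal table entries, and exp2() stands
	   in for the _dbT lookup; the names are hypothetical.

	     #include <math.h>
	     #include <stdint.h>

	     #define K 8                    // table covers 2^K subintervals

	     static double
	     cosh_lane_model (double x)
	     {
	       const double inv_ln2 = 0x1.71547652b82fep+0; // 1/ln(2)
	       const double ln2_hi = 0x1.62e42fefa38p-1;    // ln(2), high bits
	       const double ln2_lo = 0x1.ef35793c7673p-45;  // ln(2), low bits
	       const double shifter = 0x1.8p+52;            // rounds dM to an integer
	       // Taylor stand-ins for the tuned _dPC2, _dPC3, _dPC4 terms.
	       const double a2 = 1.0 / 2, a3 = 1.0 / 6, a4 = 1.0 / 24;

	       double ax = fabs (x);                        // cosh is even
	       // dM = ax*2^K/ln2 + RShifter; integer part is M*2^K + j.
	       double dm = ax * (inv_ln2 * (1 << K)) + shifter;
	       double dn = dm - shifter;            // round(ax * 2^K / ln2)
	       int64_t n = (int64_t) dn;
	       int j = (int) (n & ((1 << K) - 1));  // table index
	       int m = (int) (n >> K);              // binary exponent
	       // Two-step subtraction keeps r accurate; scaling the
	       // ln(2) parts by 2^-K is exact in binary floating point.
	       double r = ax - dn * (ln2_hi / (1 << K));
	       r -= dn * (ln2_lo / (1 << K));

	       double t = exp2 ((double) j / (1 << K)); // models _dbT[j]
	       double tp = ldexp (t, m);                // 2^(M + j/2^K)
	       double tn = ldexp (1.0 / t, -m);         // 2^-(M + j/2^K)

	       // cosh(ax) = (tp+tn)/2 * cosh(r) + (tp-tn)/2 * sinh(r)
	       double r2 = r * r;
	       double sinh_r = r + r * r2 * a3;
	       double cosh_r = 1.0 + r2 * (a2 + r2 * a4);
	       return 0.5 * ((tp + tn) * cosh_r + (tp - tn) * sinh_r);
	     }
	 */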
	/*
	 * G1, G2, G3: dTdif, dTn * 2^N, 2^(-N)
	 * NB: copied from sinh_la - to be optimized!!!!!
	 */
	vpsllq	$44, %ymm3, %ymm3
	vmovmskps %xmm11, %edx

	/* dR = dX - dN*Log2_hi/2^K */
	vfnmadd231pd %ymm6, %ymm15, %ymm4

	/* lM now is an EXP(2^N) */
	vpand	_lExpMask+__svml_dcosh_data_internal(%rip), %ymm3, %ymm3

	/* dR = (dX - dN*Log2_hi/2^K) - dN*Log2_lo/2^K */
	vfnmadd231pd _dbLn2lo+__svml_dcosh_data_internal(%rip), %ymm15, %ymm4
	movslq	%ecx, %rcx
	vpextrd	$2, %xmm9, %edi
	vpextrd	$1, %xmm9, %esi
	movslq	%edi, %rdi
	vmovsd	(%rax, %rcx), %xmm1
	vpextrd	$3, %xmm9, %r8d
	vpextrd	$3, %xmm2, %ecx
	movslq	%esi, %rsi
	movslq	%r8d, %r8
	movslq	%ecx, %rcx

	/* dR2 = dR^2 */
	vmulpd	%ymm4, %ymm4, %ymm0
	vmovsd	(%rax, %rdi), %xmm10
	vmovhpd	(%rax, %rsi), %xmm1, %xmm8
	vmovhpd	(%rax, %r8), %xmm10, %xmm11
	vmovhpd	(%rax, %rcx), %xmm14, %xmm2
	vinsertf128 $1, %xmm11, %ymm8, %ymm1
	vinsertf128 $1, %xmm2, %ymm13, %ymm2
	vpaddq	%ymm3, %ymm1, %ymm6
	vpsubq	%ymm3, %ymm2, %ymm1

	/*
	 * sinh(r) = r + r*r^2*a3 + ...
	 * dSinh_r = r^2*a3
	 */
	vmulpd	_dPC3+__svml_dcosh_data_internal(%rip), %ymm0, %ymm2

	/* lX- = EXP(1/2) */
	vpsubq	%ymm5, %ymm1, %ymm5

	/* dSinh_r = r + r*r^2*a3 */
	vfmadd213pd %ymm4, %ymm4, %ymm2

	/* poly(r) = dTp + dTn*sinh(dR) + dTp*dR2*(a2 + a4*dR2) */
	vmovupd	_dPC4+__svml_dcosh_data_internal(%rip), %ymm4

	/* dTn = dTn*2^N - dTn*2^-N */
	vsubpd	%ymm5, %ymm6, %ymm1

	/* dTp = dTn*2^N + dTn*2^-N */
	vaddpd	%ymm5, %ymm6, %ymm3
	vfmadd213pd _dPC2+__svml_dcosh_data_internal(%rip), %ymm0, %ymm4
	vmulpd	%ymm2, %ymm1, %ymm1
	vmulpd	%ymm4, %ymm0, %ymm0

	/* dTn*sinh(dR) + dTp*dR2*(a2 + a4*dR2) */
	vfmadd213pd %ymm1, %ymm3, %ymm0

	/* _VRES1 = dTp + dTn*sinh(dR) + dTp*dR2*(a2 + a4*dR2) */
	vaddpd	%ymm0, %ymm3, %ymm0

	/* Ret H */
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm7

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	vmovupd	%ymm7, 32(%rsp)
	vmovupd	%ymm0, 64(%rsp)
	# LOE rbx r12 r13 r14 r15 edx ymm0

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	vzeroupper
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$4, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovupd	64(%rsp), %ymm0

	/* Go to exit */
	jmp	L(EXIT)
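	/* A scalar model of the special-input fixup below (illustrative,
	   not part of the build): the 4-bit mask produced by vmovmskps
	   flags lanes whose |x| exceeded _iDomainRange, and each flagged
	   lane is recomputed with scalar cosh from the stack copies of
	   the input and result vectors, exactly as the
	   L(RANGEMASK_CHECK) and L(SCALAR_MATH_CALL) loop does.  The
	   function name is hypothetical.

	     #include <math.h>

	     static void
	     fixup_special_lanes (const double x[4], double res[4],
	                          unsigned mask)
	     {
	       for (int i = 0; i < 4; i++)     // one mask bit per lane
	         if (mask & (1u << i))
	           res[i] = cosh (x[i]);       // handles NaN/Inf/overflow
	     }
	 */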
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 ymm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	vmovsd	32(%rsp, %r14, 8), %xmm0
	call	cosh@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovsd	%xmm0, 64(%rsp, %r14, 8)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVdN4v_cosh_avx2)

	.section .rodata, "a"
	.align	32

#ifdef __svml_dcosh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
	__declspec(align(32)) VUINT32 _dbT[(1+(1<<8))][2];	// dTpj ONLY!
	__declspec(align(32)) VUINT32 _dbInvLn2[4][2];
	__declspec(align(32)) VUINT32 _dbLn2hi[4][2];
	__declspec(align(32)) VUINT32 _dbLn2lo[4][2];
	__declspec(align(32)) VUINT32 _dbShifter[4][2];
	__declspec(align(32)) VUINT32 _iIndexMask[8][1];	// (1<<K)-1
	__declspec(align(32)) VUINT32 _dPC2[4][2];
	__declspec(align(32)) VUINT32 _dPC3[4][2];
	__declspec(align(32)) VUINT32 _dPC4[4][2];
	__declspec(align(32)) VUINT32 _iMaxIndex[8][1];	// (1<<K)
	__declspec(align(32)) VUINT32 _lExpMask[4][2];
	__declspec(align(32)) VUINT32 _dSign[4][2];	// 0x8000000000000000
	__declspec(align(32)) VUINT32 _iDomainRange[8][1];
} __svml_dcosh_data_internal;
#endif
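	/* Layout cross-check (hypothetical, standalone sketch; no such
	   check exists in this file): a C11 mirror of the structure
	   above, verified against the offset #defines at the top.  The
	   _dbT array is padded from 257 entries to 260 so that the next
	   field lands on the 32-byte boundary the offsets imply.

	     #include <stddef.h>
	     #include <stdint.h>

	     typedef struct
	     {
	       _Alignas (32) uint64_t dbT[260]; // 257 entries + padding
	       _Alignas (32) uint64_t dbInvLn2[4];
	       _Alignas (32) uint64_t dbLn2hi[4];
	       _Alignas (32) uint64_t dbLn2lo[4];
	       _Alignas (32) uint64_t dbShifter[4];
	       _Alignas (32) uint32_t iIndexMask[8];
	       _Alignas (32) uint64_t dPC2[4];
	       _Alignas (32) uint64_t dPC3[4];
	       _Alignas (32) uint64_t dPC4[4];
	       _Alignas (32) uint32_t iMaxIndex[8];
	       _Alignas (32) uint64_t lExpMask[4];
	       _Alignas (32) uint64_t dSign[4];
	       _Alignas (32) uint32_t iDomainRange[8];
	     } dcosh_data_model;

	     _Static_assert (offsetof (dcosh_data_model, dbInvLn2) == 2080,
	                     "dbInvLn2 offset drift");
	     _Static_assert (offsetof (dcosh_data_model, dPC2) == 2240,
	                     "dPC2 offset drift");
	     _Static_assert (offsetof (dcosh_data_model, iDomainRange) == 2432,
	                     "iDomainRange offset drift");
	 */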