/* Function expm1 vectorized with AVX2.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *    N = (int)(x*2^k/log(2.0)), R = x - N*log(2)/2^k
 *    exp(x) = 2^(N/2^k) * poly(R) is computed in high-low parts
 *    expm1(x) = exp(x)-1 is then obtained via multi-precision computation
 *
 *    In this file k = 7: the Log2e constant is 2^7*log2(e), L2H/L2L are the
 *    high/low parts of log(2)/2^7, and the lookup table has 2^7 entries.
 *
 */

/* Offsets for data table __svml_dexpm1_data_internal.
   Each offset is the byte distance from the table label; the table itself
   is 128 entries * 16 bytes = 2048 bytes, every later field is 32 bytes
   (poly_coeff is 4 * 32 bytes).  */
#define Expm1_HA_table			0
#define poly_coeff			2048
#define Log2e				2176
#define L2H				2208
#define L2L				2240
#define ExpAddConst			2272
#define IndexMask			2304
#define ExpMask				2336
#define MOne				2368
#define AbsMask				2400
#define Threshold			2432
#define L2				2464

/* Provides ENTRY, END, L() and the cfi_* macros used below.  */
#include <sysdep.h>

	.section .text.avx2, "ax", @progbits

/* _ZGVdN4v_expm1_avx2: expm1 on four packed doubles.
   Syntax: GNU as, AT&T operand order (source(s) first, destination last).

   In:    ymm0 = four double-precision arguments x.
   Out:   ymm0 = four results.
   Stack: rbp frame, rsp realigned to 32 bytes, 96 bytes of locals:
	    0(%rsp)  saved r14   (special path only)
	    8(%rsp)  saved r13   (special path only)
	   16(%rsp)  saved r12   (special path only)
	   32(%rsp)  copy of the input vector  (special path only)
	   64(%rsp)  copy of the result vector (special path only)

   Register roles on the main path:
	   r8    = address of __svml_dexpm1_data_internal (table base)
	   ymm3  = x (kept intact for the special path)
	   ymm8  = N as a double, later R*R
	   ymm2  = R, later Th-1
	   ymm0  = N + ExpAddConst (integer N in the low mantissa bits),
		   then the exponent field, then the final result
	   ymm1  = sign bit of x
	   eax   = 4-bit mask of lanes that must be redone by scalar expm1  */
ENTRY(_ZGVdN4v_expm1_avx2)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	/* Realign so the 32-byte vector spills below are naturally aligned
	   and rsp stays 16-byte aligned at the scalar call.  */
	andq	$-32, %rsp
	subq	$96, %rsp
	lea	__svml_dexpm1_data_internal(%rip), %r8
	vmovapd	%ymm0, %ymm3
	/* ymm4 = x * 2^7*log2(e) */
	vmulpd	Log2e+__svml_dexpm1_data_internal(%rip), %ymm3, %ymm4

	/* argument reduction */
	vmovupd	L2H+__svml_dexpm1_data_internal(%rip), %ymm2
	vmovupd	AbsMask+__svml_dexpm1_data_internal(%rip), %ymm5
	/* ymm8 = N = ymm4 rounded to nearest (rounding-mode immediate 0).  */
	vroundpd $0, %ymm4, %ymm8
	/* Adding ExpAddConst leaves the integer N (plus a bias) in the low
	   mantissa bits of ymm0, where IndexMask and ExpMask pick it apart.  */
	vaddpd	ExpAddConst+__svml_dexpm1_data_internal(%rip), %ymm8, %ymm0
	/* ymm2 = x - N*L2H */
	vfnmadd213pd %ymm3, %ymm8, %ymm2

	/* table lookup */
	/* ymm9 = byte offset of each lane's 16-byte table entry (bits 4..10).  */
	vandps	IndexMask+__svml_dexpm1_data_internal(%rip), %ymm0, %ymm9
	/* ymm6 = |x| */
	vandpd	%ymm5, %ymm3, %ymm6
	/* ymm7 = lanes where |x| > Threshold or x is NaN (NLE, unordered).  */
	vcmpnle_uqpd Threshold+__svml_dexpm1_data_internal(%rip), %ymm6, %ymm7
	/* ymm2 = R = (x - N*L2H) - N*L2L */
	vfnmadd231pd L2L+__svml_dexpm1_data_internal(%rip), %ymm8, %ymm2
	/* ymm1 = x & ~AbsMask, i.e. only the sign bit of x.  */
	vandnpd	%ymm3, %ymm5, %ymm1
	/* eax = one bit per lane that needs the special path.  */
	vmovmskpd %ymm7, %eax
	/* ymm7 = coeff3 */
	vmovupd	poly_coeff+64+__svml_dexpm1_data_internal(%rip), %ymm7
	/* ymm8 = R*R */
	vmulpd	%ymm2, %ymm2, %ymm8
	/* ymm7 = coeff3*R + coeff2 */
	vfmadd213pd poly_coeff+96+__svml_dexpm1_data_internal(%rip), %ymm2, %ymm7
	/* ymm0 = bits 11..21 of N + ExpAddConst (the biased exponent part).  */
	vandps	ExpMask+__svml_dexpm1_data_internal(%rip), %ymm0, %ymm0
	/* Pull the four table offsets (low dword of each 64-bit lane) into
	   general-purpose registers: rdx, rcx = lanes 0, 1; rsi, rdi = lanes
	   2, 3.  */
	vextractf128 $1, %ymm9, %xmm10
	vmovd	%xmm9, %edx
	vmovd	%xmm10, %esi
	vpextrd	$2, %xmm9, %ecx
	vpextrd	$2, %xmm10, %edi
	movslq	%edx, %rdx
	movslq	%ecx, %rcx
	movslq	%esi, %rsi
	movslq	%edi, %rdi
	/* Each load fetches one 16-byte table entry: {first quad, second quad}.
	   Below the first quads are gathered as Th and the second as Tl.  */
	vmovupd	(%r8, %rdx), %xmm13
	vmovupd	(%r8, %rcx), %xmm14
	vmovupd	(%r8, %rsi), %xmm4
	vmovupd	(%r8, %rdi), %xmm5
	/* Transpose: low halves -> Th (xmm11: lanes 0-1, xmm12: lanes 2-3).  */
	vunpcklpd %xmm14, %xmm13, %xmm11
	vunpcklpd %xmm5, %xmm4, %xmm12
	/* ymm10 = exponent bits moved from 11..21 up to 52..62.  */
	vpsllq	$41, %ymm0, %ymm10
	/* High halves -> Tl (xmm15: lanes 0-1, xmm13: lanes 2-3).  */
	vunpckhpd %xmm14, %xmm13, %xmm15
	vunpckhpd %xmm5, %xmm4, %xmm13
	/* ymm6 = table first quads for all four lanes.  */
	vinsertf128 $1, %xmm12, %ymm11, %ymm6

	/* polynomial */
	/* ymm12 = coeff5 */
	vmovupd	poly_coeff+__svml_dexpm1_data_internal(%rip), %ymm12

	/* T-1 */
	vmovupd	MOne+__svml_dexpm1_data_internal(%rip), %ymm11
	/* ymm12 = coeff5*R + coeff4 */
	vfmadd213pd poly_coeff+32+__svml_dexpm1_data_internal(%rip), %ymm2, %ymm12
	/* ymm12 = (coeff5*R + coeff4)*R^2 + (coeff3*R + coeff2) */
	vfmadd213pd %ymm7, %ymm8, %ymm12
	/* ymm9 = Th: table bits OR-ed with the exponent field.  */
	vorpd	%ymm10, %ymm6, %ymm9
	/* ymm12 = poly = ymm12*R^2 + R */
	vfmadd213pd %ymm2, %ymm8, %ymm12
	/* ymm2 = Th + (-1) */
	vaddpd	%ymm11, %ymm9, %ymm2
	/* ymm14 = table second quads (Tl) for all four lanes.  */
	vinsertf128 $1, %xmm13, %ymm15, %ymm14

	/* Th1 = (Th-1) + Tl */
	/* Tl is multiplied by ymm10 (the exponent field as a double) first.  */
	vfmadd213pd %ymm2, %ymm10, %ymm14

	/* T = Th+Tl */
	/* ymm0 = Th1 - (-1) */
	vsubpd	%ymm11, %ymm14, %ymm0
	/* ymm0 = poly*T + Th1 */
	vfmadd213pd %ymm14, %ymm12, %ymm0
	/* OR the sign bit of x into the result (presumably so that a
	   negative-zero input yields negative zero -- confirm).  */
	vorpd	%ymm1, %ymm0, %ymm0
	testl	%eax, %eax

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 eax ymm0 ymm3

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill inputs and the vector results so individual lanes can be
	   read and overwritten as scalars.  */
	vmovupd	%ymm3, 32(%rsp)
	vmovupd	%ymm0, 64(%rsp)
	# LOE rbx r12 r13 r14 r15 eax ymm0

	xorl	%edx, %edx
	# LOE rbx r12 r13 r14 r15 eax edx

	/* Clear the upper ymm halves before running the scalar routine;
	   both live vectors were stored to the stack above.  */
	vzeroupper
	/* r12, r13 and r14 are callee-saved: save them so they can carry the
	   lane index, the lane mask and the index copy across the call.  */
	movq	%r12, 16(%rsp)
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	/* r12d = lane index, starting at 0.  */
	movl	%edx, %r12d
	movq	%r13, 8(%rsp)
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	/* r13d = mask of lanes that need the scalar routine.  */
	movl	%eax, %r13d
	movq	%r14, (%rsp)
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	/* CF = bit r12d of the lane mask.  */
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$4, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All four lanes visited: restore the callee-saved registers and
	   reload the (patched) result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovupd	64(%rsp), %ymm0

	/* Go to exit */
	jmp	L(EXIT)
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 ymm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	/* The 32-bit move zero-extends the lane index into r14 so it can be
	   used as a 64-bit index register.  */
	movl	%r12d, %r14d
	/* xmm0 = input lane r14 from the spilled argument vector.  */
	vmovsd	32(%rsp, %r14, 8), %xmm0
	call	expm1@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	/* Overwrite the same lane of the spilled result vector.  */
	vmovsd	%xmm0, 64(%rsp, %r14, 8)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVdN4v_expm1_avx2)

	.section .rodata, "a"
	.align	32

#ifdef __svml_dexpm1_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(32)) VUINT32 Expm1_HA_table[(1<<8)][2];
	__declspec(align(32)) VUINT32 poly_coeff[4][4][2];
	__declspec(align(32)) VUINT32 Log2e[4][2];
	__declspec(align(32)) VUINT32 L2H[4][2];
	__declspec(align(32)) VUINT32 L2L[4][2];
	__declspec(align(32)) VUINT32 ExpAddConst[4][2];
	__declspec(align(32)) VUINT32 IndexMask[4][2];
	__declspec(align(32)) VUINT32 ExpMask[4][2];
	__declspec(align(32)) VUINT32 MOne[4][2];
	__declspec(align(32)) VUINT32 AbsMask[4][2];
	__declspec(align(32)) VUINT32 Threshold[4][2];
	__declspec(align(32)) VUINT32 L2[4][2];
} __svml_dexpm1_data_internal;
#endif
__svml_dexpm1_data_internal:
	/* Expm1_HA_table */
	/* 128 entries of 16 bytes.  The code above ORs an exponent field into
	   the first quad of an entry (Th) and uses the second quad as a
	   low-order correction (Tl).  */
	.quad	0x0000000000000000, 0x0000000000000000
	.quad	0x0000163da8000000, 0x3e3fb33356d84a67
	.quad	0x00002c9a40000000, 0xbe3887f9f1190835
	.quad	0x00004315e8000000, 0x3e1b9fe12f5ce3e7
	.quad	0x000059b0d0000000, 0x3e48ac2ba1d73e2a
	.quad	0x0000706b28000000, 0x3e3ddf6ddc6dc404
	.quad	0x0000874518000000, 0x3e1d66f20230d7c9
	.quad	0x00009e3ec8000000, 0x3e46379c1a290f03
	.quad	0x0000b55870000000, 0xbe4833b784eb3a37
	.quad	0x0000cc9228000000, 0x3e4b923fba03db83
	.quad	0x0000e3ec30000000, 0x3e469e8d10103a17
	.quad	0x0000fb66b0000000, 0xbdb2ce50dcdf6e22
	.quad	0x00011301d0000000, 0x3df25b50a4ebbf1b
	.quad	0x00012abdc0000000, 0x3e1b0c72fee4aeb5
	.quad	0x0001429ab0000000, 0xbe356d2204cbefe7
	.quad	0x00015a98c8000000, 0x3e24b1ca24901aae
	.quad	0x000172b840000000, 0xbe4c15742919041c
	.quad	0x00018af938000000, 0x3e2191bd3777ee17
	.quad	0x0001a35be8000000, 0x3e4b7e5ba9e5b4c8
	.quad	0x0001bbe088000000, 0xbe4fdd19632a70c7
	.quad	0x0001d48730000000, 0x3e368b9aa7805b80
	.quad	0x0001ed5020000000, 0x3e47e6c8e5c40d00
	.quad	0x0002063b88000000, 0x3e18a3358ee3bac1
	.quad	0x00021f4990000000, 0x3e37ddc962552fd3
	.quad	0x0002387a70000000, 0xbe38a9dc7993e052
	.quad	0x000251ce50000000, 0xbe135670329f5521
	.quad	0x00026b4568000000, 0xbe40ec1916d42cc6
	.quad	0x000284dfe0000000, 0x3e3f5638096cf15d
	.quad	0x00029e9df8000000, 0xbe470108f69ed175
	.quad	0x0002b87fd0000000, 0x3e2b5b31ffbbd48d
	.quad	0x0002d285a8000000, 0xbe31bfcf4bff6e2b
	.quad	0x0002ecafa8000000, 0x3e33e2f5611ca0f4
	.quad	0x000306fe08000000, 0x3e418db8a96f46ad
	.quad	0x0003217100000000, 0xbe4d993e76563187
	.quad	0x00033c08b0000000, 0x3e4320b7fa64e431
	.quad	0x000356c560000000, 0xbe1b5803cdae772e
	.quad	0x000371a738000000, 0xbe28aac6ab1d7560
	.quad	0x00038cae70000000, 0xbe47d13cd3d2b1a8
	.quad	0x0003a7db38000000, 0xbe48d30048af21b7
	.quad	0x0003c32dc0000000, 0x3e489d47242000f9
	.quad	0x0003dea650000000, 0xbe4f6e5eee525f6f
	.quad	0x0003fa4508000000, 0xbe4a9bff22fa047f
	.quad	0x0004160a20000000, 0x3e3f72e29f84325c
	.quad	0x000431f5d8000000, 0x3e350a896dc70444
	.quad	0x00044e0860000000, 0x3e18624b40c4dbd0
	.quad	0x00046a41f0000000, 0xbe4717fd446d7686
	.quad	0x000486a2b8000000, 0xbe41f6197f61f2e2
	.quad	0x0004a32af0000000, 0x3e2afa7bcce5b17a
	.quad	0x0004bfdad8000000, 0xbe464eaec715e343
	.quad	0x0004dcb298000000, 0x3e3fddd0d63b36ef
	.quad	0x0004f9b278000000, 0xbe362d35952cc275
	.quad	0x000516daa0000000, 0x3e467b320e0897a9
	.quad	0x0005342b58000000, 0xbe362b07e20f57c4
	.quad	0x000551a4c8000000, 0x3e42ec9076297631
	.quad	0x00056f4738000000, 0xbe34ad8259913500
	.quad	0x00058d12d8000000, 0xbe4b41c016d6a1ea
	.quad	0x0005ab07e0000000, 0xbe45bd5eb539b67f
	.quad	0x0005c92688000000, 0x3e42ca35b80e258e
	.quad	0x0005e76f18000000, 0xbe4296f5bc8b20da
	.quad	0x000605e1b8000000, 0x3e376dc08b076f59
	.quad	0x0006247eb0000000, 0x3e0d2ac258f87d03
	.quad	0x0006434638000000, 0xbe4999e701c483c7
	.quad	0x0006623880000000, 0x3e42a91124893ecf
	.quad	0x00068155d8000000, 0xbe4d9ab467bf1d47
	.quad	0x0006a09e68000000, 0xbe380c4336f74d05
	.quad	0x0006c01278000000, 0xbe47a12a08944ab3
	.quad	0x0006dfb240000000, 0xbe4cd72e886ef8ea
	.quad	0x0006ff7df8000000, 0x3e3519483cf87e1b
	.quad	0x00071f75e8000000, 0x3e2d8bee7ba46e1e
	.quad	0x00073f9a48000000, 0x3e24b02e77ab934a
	.quad	0x00075feb58000000, 0xbe3bd98374091656
	.quad	0x0007806950000000, 0xbe00d1604f328fec
	.quad	0x0007a11470000000, 0x3e4f580c36bea881
	.quad	0x0007c1ed00000000, 0x3e330c1327c49334
	.quad	0x0007e2f338000000, 0xbe330b19defa2fd4
	.quad	0x0008042758000000, 0xbe4e0f2f724f90cc
	.quad	0x0008258998000000, 0x3e34cce128acf88b
	.quad	0x0008471a48000000, 0xbe3dc385331ad094
	.quad	0x000868d998000000, 0x3e4a2497640720ed
	.quad	0x00088ac7d8000000, 0x3e38a669966530bd
	.quad	0x0008ace540000000, 0x3e415506dadd3e2b
	.quad	0x0008cf3218000000, 0xbe34abb7410d55e3
	.quad	0x0008f1ae98000000, 0x3e31577362b98274
	.quad	0x0009145b08000000, 0x3e4c8ffe2c4530da
	.quad	0x00093737b0000000, 0x3e29b8bc9e8a0388
	.quad	0x00095a44c8000000, 0x3e4e4290774da41b
	.quad	0x00097d82a0000000, 0xbe00d8d83a30b6f8
	.quad	0x0009a0f170000000, 0x3e2940f737462137
	.quad	0x0009c49180000000, 0x3e451f8480e3e236
	.quad	0x0009e86318000000, 0x3e3e323231824ca8
	.quad	0x000a0c6678000000, 0x3e4aef2b2594d6d4
	.quad	0x000a309bf0000000, 0xbe4dae966539f470
	.quad	0x000a5503b0000000, 0x3e41f12ae45a1225
	.quad	0x000a799e10000000, 0x3e49859ac3796fd9
	.quad	0x000a9e6b58000000, 0xbe44301205e0a6de
	.quad	0x000ac36bc0000000, 0xbe0606431f9234cb
	.quad	0x000ae89f98000000, 0x3e35ad3ad5e8734d
	.quad	0x000b0e0728000000, 0x3e38db66590842ad
	.quad	0x000b33a2b8000000, 0x3e13c57ebdaff43a
	.quad	0x000b597290000000, 0xbe40d536338e3bf7
	.quad	0x000b7f76f0000000, 0x3e47daf237553d84
	.quad	0x000ba5b030000000, 0x3e2420c930819679
	.quad	0x000bcc1e90000000, 0x3e12f074891ee83d
	.quad	0x000bf2c258000000, 0x3e4eb8f0442046b8
	.quad	0x000c199be0000000, 0xbe43d56b1eeef9a7
	.quad	0x000c40ab60000000, 0xbd87c2c975903ef8
	.quad	0x000c67f130000000, 0xbe3a82eb4b5dec80
	.quad	0x000c8f6d98000000, 0xbe4fc8c257729a1e
	.quad	0x000cb720e0000000, 0xbe48837cb757e1a1
	.quad	0x000cdf0b58000000, 0xbe4511e031dd83b5
	.quad	0x000d072d48000000, 0x3e403c4bdc687918
	.quad	0x000d2f8708000000, 0x3deb13e315bc2473
	.quad	0x000d5818e0000000, 0xbe4822dbc6d12fd3
	.quad	0x000d80e318000000, 0xbe3367c68447b063
	.quad	0x000da9e600000000, 0x3e4ed9942b84600d
	.quad	0x000dd321f0000000, 0x3e480da3025b4aef
	.quad	0x000dfc9730000000, 0x3e4bdcdaf5cb4656
	.quad	0x000e264618000000, 0xbe4852f6baf6c4f0
	.quad	0x000e502ee8000000, 0xbe1d30027630bb40
	.quad	0x000e7a51f8000000, 0x3e4e3a641a5aa459
	.quad	0x000ea4afa0000000, 0x3e452486cc2c7b9d
	.quad	0x000ecf4830000000, 0xbe438cc07b927e77
	.quad	0x000efa1bf0000000, 0xbe39ea5d888e02de
	.quad	0x000f252b38000000, 0xbe2288ad162f2d20
	.quad	0x000f507658000000, 0x3e4b722a033a7c26
	.quad	0x000f7bfdb0000000, 0xbe431a0f63b7625a
	.quad	0x000fa7c180000000, 0x3e39e90d82e90a7e
	.quad	0x000fd3c228000000, 0x3e4c7b8f884badd2
	/* poly_coeff[4] */
	.align	32
	.quad	0x3f81111168877F38, 0x3f81111168877F38, 0x3f81111168877F38, 0x3f81111168877F38 /* coeff5 */
	.quad	0x3fa55555C2A9C0F3, 0x3fa55555C2A9C0F3, 0x3fa55555C2A9C0F3, 0x3fa55555C2A9C0F3 /* coeff4 */
	.quad	0x3fc555555555541D, 0x3fc555555555541D, 0x3fc555555555541D, 0x3fc555555555541D /* coeff3 */
	.quad	0x3fdFFFFFFFFFFE5C, 0x3fdFFFFFFFFFFE5C, 0x3fdFFFFFFFFFFE5C, 0x3fdFFFFFFFFFFE5C /* coeff2 */
	/* Log2e */
	/* 2^7 * log2(e) */
	.align	32
	.quad	0x40671547652B82FE, 0x40671547652B82FE, 0x40671547652B82FE, 0x40671547652B82FE
	/* L2H */
	/* High part of log(2)/2^7 (low mantissa bits zeroed).  */
	.align	32
	.quad	0x3f762e42fef80000, 0x3f762e42fef80000, 0x3f762e42fef80000, 0x3f762e42fef80000
	/* L2L */
	/* Low-order part paired with L2H.  */
	.align	32
	.quad	0x3d41cf79abc9e3b4, 0x3d41cf79abc9e3b4, 0x3d41cf79abc9e3b4, 0x3d41cf79abc9e3b4
	/* ExpAddConst */
	.align	32
	.quad	0x42f80000001ff800, 0x42f80000001ff800, 0x42f80000001ff800, 0x42f80000001ff800
	/* IndexMask */
	/* Bits 4..10: table index already scaled by the 16-byte entry size.  */
	.align	32
	.quad	0x00000000000007f0, 0x00000000000007f0, 0x00000000000007f0, 0x00000000000007f0
	/* ExpMask */
	/* Bits 11..21: shifted left by 41 in the code to form bits 52..62.  */
	.align	32
	.quad	0x00000000003ff800, 0x00000000003ff800, 0x00000000003ff800, 0x00000000003ff800
	/* MOne */
	/* -1.0 */
	.align	32
	.quad	0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
	/* AbsMask */
	/* Everything except the sign bit.  */
	.align	32
	.quad	0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
	/* Threshold */
	/* |x| above this (or NaN) is routed to the scalar expm1 call.  */
	.align	32
	.quad	0x40861DA04CBAFE43, 0x40861DA04CBAFE43, 0x40861DA04CBAFE43, 0x40861DA04CBAFE43
	/* L2 */
	/* log(2)/2^7 in full precision; not referenced by the code in this
	   file.  */
	.align	32
	.quad	0x3f762e42fefa39ef, 0x3f762e42fefa39ef, 0x3f762e42fefa39ef, 0x3f762e42fefa39ef
	.align	32
	.type	__svml_dexpm1_data_internal, @object
	.size	__svml_dexpm1_data_internal, .-__svml_dexpm1_data_internal