diff options
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S')
-rw-r--r-- | sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S | 508 |
1 files changed, 0 insertions, 508 deletions
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S deleted file mode 100644 index 8dfa0076e0..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S +++ /dev/null @@ -1,508 +0,0 @@ -/* Optimized cosf(). PowerPC64/POWER8 version. - Copyright (C) 2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#define _ERRNO_H 1 -#include <bits/errno.h> - -#define FRAMESIZE (FRAME_MIN_SIZE+16) - -#define FLOAT_EXPONENT_SHIFT 23 -#define FLOAT_EXPONENT_BIAS 127 -#define INTEGER_BITS 3 - -#define PI_4 0x3f490fdb /* PI/4 */ -#define NINEPI_4 0x40e231d6 /* 9 * PI/4 */ -#define TWO_PN5 0x3d000000 /* 2^-5 */ -#define TWO_PN27 0x32000000 /* 2^-27 */ -#define INFINITY 0x7f800000 -#define TWO_P23 0x4b000000 /* 2^23 */ -#define FX_FRACTION_1_28 0x9249250 /* 0x100000000 / 28 + 1 */ - - /* Implements the function - - float [fp1] cosf (float [fp1] x) */ - - .machine power8 -EALIGN(__cosf, 4, 0) - addis r9,r2,L(anchor)@toc@ha - addi r9,r9,L(anchor)@toc@l - - lis r4,PI_4@h - ori r4,r4,PI_4@l - - xscvdpspn v0,v1 - mfvsrd r8,v0 - rldicl r3,r8,32,33 /* Remove sign bit. */ - - cmpw r3,r4 - bge L(greater_or_equal_pio4) - - lis r4,TWO_PN5@h - ori r4,r4,TWO_PN5@l - - cmpw r3,r4 - blt L(less_2pn5) - - /* Chebyshev polynomial of the form: - * 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */ - - lfd fp9,(L(C0)-L(anchor))(r9) - lfd fp10,(L(C1)-L(anchor))(r9) - lfd fp11,(L(C2)-L(anchor))(r9) - lfd fp12,(L(C3)-L(anchor))(r9) - lfd fp13,(L(C4)-L(anchor))(r9) - - fmul fp2,fp1,fp1 /* x^2 */ - lfd fp3,(L(DPone)-L(anchor))(r9) - - fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */ - fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */ - fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */ - fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */ - fmadd fp1,fp2,fp4,fp3 /* 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */ - frsp fp1,fp1 /* Round to single precision. */ - - blr - - .balign 16 -L(greater_or_equal_pio4): - lis r4,NINEPI_4@h - ori r4,r4,NINEPI_4@l - cmpw r3,r4 - bge L(greater_or_equal_9pio4) - - /* Calculate quotient of |x|/(PI/4). */ - lfd fp2,(L(invpio4)-L(anchor))(r9) - fabs fp1,fp1 /* |x| */ - fmul fp2,fp1,fp2 /* |x|/(PI/4) */ - fctiduz fp2,fp2 - mfvsrd r3,v2 /* n = |x| mod PI/4 */ - - /* Now use that quotient to find |x| mod (PI/2). */ - addi r7,r3,1 - rldicr r5,r7,2,60 /* ((n+1) >> 1) << 3 */ - addi r6,r9,(L(pio2_table)-L(anchor)) - lfdx fp4,r5,r6 - fsub fp1,fp1,fp4 - - .balign 16 -L(reduced): - /* Now we are in the range -PI/4 to PI/4. */ - - /* Work out if we are in a positive or negative primary interval. */ - addi r7,r7,2 - rldicl r4,r7,62,63 /* ((n+3) >> 2) & 1 */ - - /* Load a 1.0 or -1.0. */ - addi r5,r9,(L(ones)-L(anchor)) - sldi r4,r4,3 - lfdx fp0,r4,r5 - - /* Are we in the primary interval of sin or cos? */ - andi. r4,r7,0x2 - bne L(cos) - - /* Chebyshev polynomial of the form: - x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */ - - lfd fp9,(L(S0)-L(anchor))(r9) - lfd fp10,(L(S1)-L(anchor))(r9) - lfd fp11,(L(S2)-L(anchor))(r9) - lfd fp12,(L(S3)-L(anchor))(r9) - lfd fp13,(L(S4)-L(anchor))(r9) - - fmul fp2,fp1,fp1 /* x^2 */ - fmul fp3,fp2,fp1 /* x^3 */ - - fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */ - fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */ - fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */ - fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */ - fmadd fp4,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */ - fmul fp4,fp4,fp0 /* Add in the sign. */ - frsp fp1,fp4 /* Round to single precision. */ - - blr - - .balign 16 -L(cos): - /* Chebyshev polynomial of the form: - 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */ - - lfd fp9,(L(C0)-L(anchor))(r9) - lfd fp10,(L(C1)-L(anchor))(r9) - lfd fp11,(L(C2)-L(anchor))(r9) - lfd fp12,(L(C3)-L(anchor))(r9) - lfd fp13,(L(C4)-L(anchor))(r9) - - fmul fp2,fp1,fp1 /* x^2 */ - lfd fp3,(L(DPone)-L(anchor))(r9) - - fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */ - fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */ - fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */ - fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */ - fmadd fp4,fp2,fp4,fp3 /* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */ - fmul fp4,fp4,fp0 /* Add in the sign. */ - frsp fp1,fp4 /* Round to single precision. */ - - blr - - .balign 16 -L(greater_or_equal_9pio4): - lis r4,INFINITY@h - ori r4,r4,INFINITY@l - cmpw r3,r4 - bge L(inf_or_nan) - - lis r4,TWO_P23@h - ori r4,r4,TWO_P23@l - cmpw r3,r4 - bge L(greater_or_equal_2p23) - - fabs fp1,fp1 /* |x| */ - - /* Calculate quotient of |x|/(PI/4). */ - lfd fp2,(L(invpio4)-L(anchor))(r9) - - lfd fp3,(L(DPone)-L(anchor))(r9) - lfd fp4,(L(DPhalf)-L(anchor))(r9) - fmul fp2,fp1,fp2 /* |x|/(PI/4) */ - friz fp2,fp2 /* n = floor(|x|/(PI/4)) */ - - /* Calculate (n + 1) / 2. */ - fadd fp2,fp2,fp3 /* n + 1 */ - fmul fp3,fp2,fp4 /* (n + 1) / 2 */ - friz fp3,fp3 - - lfd fp4,(L(pio2hi)-L(anchor))(r9) - lfd fp5,(L(pio2lo)-L(anchor))(r9) - - fmul fp6,fp4,fp3 - fadd fp6,fp6,fp1 - fmadd fp1,fp5,fp3,fp6 - - fctiduz fp2,fp2 - mfvsrd r7,v2 /* n + 1 */ - - b L(reduced) - - .balign 16 -L(inf_or_nan): - bne L(skip_errno_setting) /* Is a NAN? */ - - /* We delayed the creation of the stack frame, as well as the saving of - the link register, because only at this point, we are sure that - doing so is actually needed. */ - - stfd fp1,-8(r1) - - /* Save the link register. */ - mflr r0 - std r0,16(r1) - cfi_offset(lr, 16) - - /* Create the stack frame. */ - stdu r1,-FRAMESIZE(r1) - cfi_adjust_cfa_offset(FRAMESIZE) - - bl JUMPTARGET(__errno_location) - nop - - /* Restore the stack frame. */ - addi r1,r1,FRAMESIZE - cfi_adjust_cfa_offset(-FRAMESIZE) - /* Restore the link register. */ - ld r0,16(r1) - mtlr r0 - - lfd fp1,-8(r1) - - /* errno = EDOM */ - li r4,EDOM - stw r4,0(r3) - -L(skip_errno_setting): - fsub fp1,fp1,fp1 /* x - x */ - blr - - .balign 16 -L(greater_or_equal_2p23): - fabs fp1,fp1 - - srwi r4,r3,FLOAT_EXPONENT_SHIFT - subi r4,r4,FLOAT_EXPONENT_BIAS - - /* We reduce the input modulo pi/4, so we need 3 bits of integer - to determine where in 2*pi we are. Index into our array - accordingly. */ - addi r4,r4,INTEGER_BITS - - /* To avoid an expensive divide, for the range we care about (0 - 127) - we can transform x/28 into: - - x/28 = (x * ((0x100000000 / 28) + 1)) >> 32 - - mulhwu returns the top 32 bits of the 64 bit result, doing the - shift for us in the same instruction. The top 32 bits are undefined, - so we have to mask them. */ - - lis r6,FX_FRACTION_1_28@h - ori r6,r6,FX_FRACTION_1_28@l - mulhwu r5,r4,r6 - clrldi r5,r5,32 - - /* Get our pointer into the invpio4_table array. */ - sldi r4,r5,3 - addi r6,r9,(L(invpio4_table)-L(anchor)) - add r4,r4,r6 - - lfd fp2,0(r4) - lfd fp3,8(r4) - lfd fp4,16(r4) - lfd fp5,24(r4) - - fmul fp6,fp2,fp1 - fmul fp7,fp3,fp1 - fmul fp8,fp4,fp1 - fmul fp9,fp5,fp1 - - /* Mask off larger integer bits in highest double word that we don't - care about to avoid losing precision when combining with smaller - values. */ - fctiduz fp10,fp6 - mfvsrd r7,v10 - rldicr r7,r7,0,(63-INTEGER_BITS) - mtvsrd v10,r7 - fcfidu fp10,fp10 /* Integer bits. */ - - fsub fp6,fp6,fp10 /* highest -= integer bits */ - - /* Work out the integer component, rounded down. Use the top two - limbs for this. */ - fadd fp10,fp6,fp7 /* highest + higher */ - - fctiduz fp10,fp10 - mfvsrd r7,v10 - andi. r0,r7,1 - fcfidu fp10,fp10 - - /* Subtract integer component from highest limb. */ - fsub fp12,fp6,fp10 - - beq L(even_integer) - - /* Our integer component is odd, so we are in the -PI/4 to 0 primary - region. We need to shift our result down by PI/4, and to do this - in the mod (4/PI) space we simply subtract 1. */ - lfd fp11,(L(DPone)-L(anchor))(r9) - fsub fp12,fp12,fp11 - - /* Now add up all the limbs in order. */ - fadd fp12,fp12,fp7 - fadd fp12,fp12,fp8 - fadd fp12,fp12,fp9 - - /* And finally multiply by pi/4. */ - lfd fp13,(L(pio4)-L(anchor))(r9) - fmul fp1,fp12,fp13 - - addi r7,r7,1 - b L(reduced) - -L(even_integer): - lfd fp11,(L(DPone)-L(anchor))(r9) - - /* Now add up all the limbs in order. */ - fadd fp12,fp12,fp7 - fadd fp12,r12,fp8 - fadd fp12,r12,fp9 - - /* We need to check if the addition of all the limbs resulted in us - overflowing 1.0. */ - fcmpu 0,fp12,fp11 - bgt L(greater_than_one) - - /* And finally multiply by pi/4. */ - lfd fp13,(L(pio4)-L(anchor))(r9) - fmul fp1,fp12,fp13 - - addi r7,r7,1 - b L(reduced) - -L(greater_than_one): - /* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our - integer, and subtract 1.0 from our result. Since that makes the - integer component odd, we need to subtract another 1.0 as - explained above. */ - addi r7,r7,1 - - lfd fp11,(L(DPtwo)-L(anchor))(r9) - fsub fp12,fp12,fp11 - - /* And finally multiply by pi/4. */ - lfd fp13,(L(pio4)-L(anchor))(r9) - fmul fp1,fp12,fp13 - - addi r7,r7,1 - b L(reduced) - - .balign 16 -L(less_2pn5): - lis r4,TWO_PN27@h - ori r4,r4,TWO_PN27@l - - cmpw r3,r4 - blt L(less_2pn27) - - /* A simpler Chebyshev approximation is close enough for this range: - 1.0+x^2*(CC0+x^3*CC1). */ - - lfd fp10,(L(CC0)-L(anchor))(r9) - lfd fp11,(L(CC1)-L(anchor))(r9) - - fmul fp2,fp1,fp1 /* x^2 */ - fmul fp3,fp2,fp1 /* x^3 */ - lfd fp1,(L(DPone)-L(anchor))(r9) - - fmadd fp4,fp3,fp11,fp10 /* CC0+x^3*CC1 */ - fmadd fp1,fp2,fp4,fp1 /* 1.0+x^2*(CC0+x^3*CC1) */ - - frsp fp1,fp1 /* Round to single precision. */ - - blr - - .balign 16 -L(less_2pn27): - /* Handle some special cases: - - cosf(subnormal) raises inexact - cosf(min_normalized) raises inexact - cosf(normalized) raises inexact. */ - - lfd fp2,(L(DPone)-L(anchor))(r9) - - fabs fp1,fp1 /* |x| */ - fsub fp1,fp2,fp1 /* 1.0-|x| */ - - frsp fp1,fp1 - - blr - -END (__cosf) - - .section .rodata, "a" - - .balign 8 - -L(anchor): - - /* Chebyshev constants for sin, range -PI/4 - PI/4. */ -L(S0): .8byte 0xbfc5555555551cd9 -L(S1): .8byte 0x3f81111110c2688b -L(S2): .8byte 0xbf2a019f8b4bd1f9 -L(S3): .8byte 0x3ec71d7264e6b5b4 -L(S4): .8byte 0xbe5a947e1674b58a - - /* Chebyshev constants for cos, range 2^-27 - 2^-5. */ -L(CC0): .8byte 0xbfdfffffff5cc6fd -L(CC1): .8byte 0x3fa55514b178dac5 - - /* Chebyshev constants for cos, range -PI/4 - PI/4. */ -L(C0): .8byte 0xbfdffffffffe98ae -L(C1): .8byte 0x3fa55555545c50c7 -L(C2): .8byte 0xbf56c16b348b6874 -L(C3): .8byte 0x3efa00eb9ac43cc0 -L(C4): .8byte 0xbe923c97dd8844d7 - -L(invpio2): - .8byte 0x3fe45f306dc9c883 /* 2/PI */ - -L(invpio4): - .8byte 0x3ff45f306dc9c883 /* 4/PI */ - -L(invpio4_table): - .8byte 0x0000000000000000 - .8byte 0x3ff45f306c000000 - .8byte 0x3e3c9c882a000000 - .8byte 0x3c54fe13a8000000 - .8byte 0x3aaf47d4d0000000 - .8byte 0x38fbb81b6c000000 - .8byte 0x3714acc9e0000000 - .8byte 0x3560e4107c000000 - .8byte 0x33bca2c756000000 - .8byte 0x31fbd778ac000000 - .8byte 0x300b7246e0000000 - .8byte 0x2e5d2126e8000000 - .8byte 0x2c97003248000000 - .8byte 0x2ad77504e8000000 - .8byte 0x290921cfe0000000 - .8byte 0x274deb1cb0000000 - .8byte 0x25829a73e0000000 - .8byte 0x23fd1046be000000 - .8byte 0x2224baed10000000 - .8byte 0x20709d338e000000 - .8byte 0x1e535a2f80000000 - .8byte 0x1cef904e64000000 - .8byte 0x1b0d639830000000 - .8byte 0x1964ce7d24000000 - .8byte 0x17b908bf16000000 - -L(pio4): - .8byte 0x3fe921fb54442d18 /* PI/4 */ - -/* PI/2 as a sum of two doubles. We only use 32 bits of the upper limb - to avoid losing significant bits when multiplying with up to - (2^22)/(pi/2). */ -L(pio2hi): - .8byte 0xbff921fb54400000 - -L(pio2lo): - .8byte 0xbdd0b4611a626332 - -L(pio2_table): - .8byte 0 - .8byte 0x3ff921fb54442d18 /* 1 * PI/2 */ - .8byte 0x400921fb54442d18 /* 2 * PI/2 */ - .8byte 0x4012d97c7f3321d2 /* 3 * PI/2 */ - .8byte 0x401921fb54442d18 /* 4 * PI/2 */ - .8byte 0x401f6a7a2955385e /* 5 * PI/2 */ - .8byte 0x4022d97c7f3321d2 /* 6 * PI/2 */ - .8byte 0x4025fdbbe9bba775 /* 7 * PI/2 */ - .8byte 0x402921fb54442d18 /* 8 * PI/2 */ - .8byte 0x402c463abeccb2bb /* 9 * PI/2 */ - .8byte 0x402f6a7a2955385e /* 10 * PI/2 */ - -L(small): - .8byte 0x3cd0000000000000 /* 2^-50 */ - -L(ones): - .8byte 0x3ff0000000000000 /* +1.0 */ - .8byte 0xbff0000000000000 /* -1.0 */ - -L(DPhalf): - .8byte 0x3fe0000000000000 /* 0.5 */ - -L(DPone): - .8byte 0x3ff0000000000000 /* 1.0 */ - -L(DPtwo): - .8byte 0x4000000000000000 /* 2.0 */ - -weak_alias(__cosf, cosf) |