From a334319f6530564d22e775935d9c91663623a1b4 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 22 Dec 2004 20:10:10 +0000 Subject: (CFLAGS-tst-align.c): Add -mpreferred-stack-boundary=4. --- sysdeps/ia64/fpu/s_tanl.S | 3036 +++++++++++++++++++++------------------------ 1 file changed, 1429 insertions(+), 1607 deletions(-) (limited to 'sysdeps/ia64/fpu/s_tanl.S') diff --git a/sysdeps/ia64/fpu/s_tanl.S b/sysdeps/ia64/fpu/s_tanl.S index 607a271545..e13e6c6cbd 100644 --- a/sysdeps/ia64/fpu/s_tanl.S +++ b/sysdeps/ia64/fpu/s_tanl.S @@ -1,10 +1,10 @@ -.file "tancotl.s" +.file "tanl.s" - -// Copyright (c) 2000 - 2004, Intel Corporation +// Copyright (C) 2000, 2001, Intel Corporation // All rights reserved. -// -// Contributed 2000 by the Intel Numerics Group, Intel Corporation +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. - +// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,78 +35,50 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://www.intel.com/software/products/opensource/libraries/num.htm. +// http://developer.intel.com/opensource. // -//********************************************************************* +// ********************************************************************* // // History: // -// 02/02/00 (hand-optimized) -// 04/04/00 Unwind support added +// 2/02/2000 (hand-optimized) +// 4/04/00 Unwind support added // 12/28/00 Fixed false invalid flags -// 02/06/02 Improved speed -// 05/07/02 Changed interface to __libm_pi_by_2_reduce -// 05/30/02 Added cotl -// 02/10/03 Reordered header: .section, .global, .proc, .align; -// used data8 for long double table values -// 05/15/03 Reformatted data tables -// 10/26/04 Avoided using r14-31 as scratch so not clobbered by dynamic loader // -//********************************************************************* +// ********************************************************************* // -// Functions: tanl(x) = tangent(x), for double-extended precision x values -// cotl(x) = cotangent(x), for double-extended precision x values +// Function: tanl(x) = tangent(x), for double-extended precision x values // -//********************************************************************* +// ********************************************************************* // // Resources Used: // // Floating-Point Registers: f8 (Input and Return Value) // f9-f15 -// f32-f121 +// f32-f112 // // General Purpose Registers: -// r32-r70 +// r32-r48 +// r49-r50 (Used to pass arguments to pi_by_2 reduce routine) // // Predicate Registers: p6-p15 // -//********************************************************************* +// ********************************************************************* // -// IEEE Special Conditions for tanl: +// IEEE Special Conditions: // // Denormal fault raised on denormal inputs // Overflow exceptions do not occur -// Underflow exceptions raised when appropriate for tan +// Underflow exceptions raised when appropriate for tan // (No specialized error handling for this routine) // Inexact raised when appropriate by algorithm // -// tanl(SNaN) = QNaN -// tanl(QNaN) = QNaN -// tanl(inf) = QNaN -// tanl(+/-0) = +/-0 -// -//********************************************************************* -// -// IEEE Special Conditions for cotl: -// -// Denormal fault raised on denormal inputs -// Overflow exceptions occur at zero and near zero -// Underflow exceptions do not occur -// Inexact raised when appropriate by algorithm -// -// cotl(SNaN) = QNaN -// cotl(QNaN) = QNaN -// cotl(inf) = QNaN -// cotl(+/-0) = +/-Inf and error handling is called -// -//********************************************************************* +// tan(SNaN) = QNaN +// tan(QNaN) = QNaN +// tan(inf) = QNaN +// tan(+/-0) = +/-0 // -// Below are mathematical and algorithmic descriptions for tanl. -// For cotl we use next identity cot(x) = -tan(x + Pi/2). -// So, to compute cot(x) we just need to increment N (N = N + 1) -// and invert sign of the computed result. -// -//********************************************************************* +// ********************************************************************* // // Mathematical Description // @@ -134,13 +106,13 @@ // ------- // // tan(r + c) = r + c + r^3/3 ...accurately -// -cot(r + c) = -1/(r+c) + r/3 ...accurately +// -cot(r + c) = -1/(r+c) + r/3 ...accurately // // Case 4: // ------- // // tan(r + c) = r + c + r^3/3 + 2r^5/15 ...accurately -// -cot(r + c) = -1/(r+c) + r/3 + r^3/45 ...accurately +// -cot(r + c) = -1/(r+c) + r/3 + r^3/45 ...accurately // // // The only cases left are Cases 1 and 3 of the argument reduction @@ -171,13 +143,13 @@ // Since Arg = N pi/4 + r + c accurately, we have // // tan(Arg) = tan(r+c) for N even, -// = -cot(r+c) otherwise. +// = -cot(r+c) otherwise. // // Here for this case, both tan(r) and -cot(r) can be approximated // by simple polynomials: // // tan(r) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19 -// -cot(r) = -1/r + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13 +// -cot(r) = -1/r + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13 // // accurately. Since |r| is relatively small, tan(r+c) and // -cot(r+c) can be accurately approximated by replacing r with @@ -206,21 +178,21 @@ // The required calculation is either // // tan(r + c) = tan(r) + correction, or -// -cot(r + c) = -cot(r) + correction. +// -cot(r + c) = -cot(r) + correction. // // Specifically, // // tan(r + c) = tan(r) + c tan'(r) + O(c^2) -// = tan(r) + c sec^2(r) + O(c^2) -// = tan(r) + c SEC_sq ...accurately +// = tan(r) + c sec^2(r) + O(c^2) +// = tan(r) + c SEC_sq ...accurately // as long as SEC_sq approximates sec^2(r) // to, say, 5 bits or so. // // Similarly, // -// -cot(r + c) = -cot(r) - c cot'(r) + O(c^2) -// = -cot(r) + c csc^2(r) + O(c^2) -// = -cot(r) + c CSC_sq ...accurately +// -cot(r + c) = -cot(r) - c cot'(r) + O(c^2) +// = -cot(r) + c csc^2(r) + O(c^2) +// = -cot(r) + c CSC_sq ...accurately // as long as CSC_sq approximates csc^2(r) // to, say, 5 bits or so. // @@ -236,14 +208,14 @@ // where // // B = 2^k * 1.b_1 b_2 ... b_5 1 -// x = |r| - B +// x = |r| - B // // Now, // tan(B) + tan(x) // tan( B + x ) = ------------------------ // 1 - tan(B)*tan(x) // -// / \ +// / \ // | tan(B) + tan(x) | // = tan(B) + | ------------------------ - tan(B) | @@ -276,7 +248,7 @@ // cot( B + x ) = ------------------------ // tan(B) + tan(x) // -// / \ +// / \ // | 1 - tan(B)*tan(x) | // = cot(B) + | ----------------------- - cot(B) | @@ -301,7 +273,7 @@ // Arg = N * pi/2 + r + c ...accurately // // tan(Arg) = tan(r) + correction if N is even; -// = -cot(r) + correction otherwise. +// = -cot(r) + correction otherwise. // // For Cases 2 and 4, // @@ -320,8 +292,8 @@ // tan(Arg) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19 // + c*(1 + r^2) N even // -// = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13 -// + Q1_1*c N odd +// = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13 +// + Q1_1*c N odd // // Case normal_r: 2^(-2) <= |r| <= pi/4 // @@ -332,15 +304,15 @@ // // tan(Arg) = tan(r) + c*sec^2(r) // = tan( sgn_r * (B+x) ) + c * sec^2(|r|) -// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(|r|) ) -// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(B) ) +// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(|r|) ) +// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(B) ) // // since B approximates |r| to 2^(-6) in relative accuracy. // // / (1/[sin(B)*cos(B)]) * tan(x) // tan(Arg) = sgn_r * | tan(B) + -------------------------------- // \ cot(B) - tan(x) -// \ +// \ // + CORR | // / @@ -352,15 +324,15 @@ // // tan(Arg) = -cot(r) + c*csc^2(r) // = -cot( sgn_r * (B+x) ) + c * csc^2(|r|) -// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(|r|) ) -// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(B) ) +// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(|r|) ) +// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(B) ) // // since B approximates |r| to 2^(-6) in relative accuracy. // // / (1/[sin(B)*cos(B)]) * tan(x) // tan(Arg) = sgn_r * | -cot(B) + -------------------------------- // \ tan(B) + tan(x) -// \ +// \ // + CORR | // / @@ -384,8 +356,8 @@ // For N even, // // rsq := r * r -// Poly := c + r * rsq * P1_1 -// Result := r + Poly ...in user-defined rounding +// Result := c + r * rsq * P1_1 +// Result := r + Result ...in user-defined rounding // // For N odd, // S_hi := -frcpa(r) ...8 bits @@ -403,8 +375,8 @@ // For N even, // // rsq := r * r -// Poly := c + r * rsq * (P1_1 + rsq * P1_2) -// Result := r + Poly ...in user-defined rounding +// Result := c + r * rsq * (P1_1 + rsq * P1_2) +// Result := r + Result ...in user-defined rounding // // For N odd, // S_hi := -frcpa(r) ...8 bits @@ -442,8 +414,8 @@ // Poly2 := P1_4 + rsq*(P1_5 + rsq*(P1_6 + ... rsq*P1_9)) // CORR := c * ( 1 + rsq ) // Poly := Poly1 + r_to_the_8*Poly2 -// Poly := r*Poly + CORR -// Result := r + Poly ...in user-defined rounding +// Result := r*Poly + CORR +// Result := r + Result ...in user-defined rounding // ...note that Poly1 and r_to_the_8 can be computed in parallel // ...with Poly2 (Poly1 is intentionally set to be much // ...shorter than Poly2 so that r_to_the_8 and CORR can be hidden) @@ -462,8 +434,8 @@ // rsq := r*r // P := Q1_1 + rsq*(Q1_2 + rsq*(Q1_3 + ... + rsq*Q1_7)) // -// Poly := r*P + S_lo -// Result := S_hi + Poly ...in user-defined rounding +// Result := r*P + S_lo +// Result := S_hi + Result ...in user-defined rounding // // // Algorithm for the case of normal_r @@ -482,7 +454,7 @@ // / (1/[sin(B)*cos(B)]) * tan(x) // sgn_r * | tan(B) + -------------------------------- + // \ cot(B) - tan(x) -// \ +// \ // CORR | // / @@ -491,7 +463,7 @@ // calculated beforehand and stored in a table. Specifically, // the table values are // -// tan(B) as T_hi + T_lo; +// tan(B) as T_hi + T_lo; // cot(B) as C_hi + C_lo; // 1/[sin(B)*cos(B)] as SC_inv // @@ -587,7 +559,7 @@ // / (1/[sin(B)*cos(B)]) * tan(x) // sgn_r * | -cot(B) + -------------------------------- + // \ tan(B) + tan(x) -// \ +// \ // CORR | // / @@ -596,7 +568,7 @@ // calculated beforehand and stored in a table. Specifically, // the table values are // -// tan(B) as T_hi + T_lo; +// tan(B) as T_hi + T_lo; // cot(B) as C_hi + C_lo; // 1/[sin(B)*cos(B)] as SC_inv // @@ -703,382 +675,254 @@ // // -RODATA -.align 16 - -LOCAL_OBJECT_START(TANL_BASE_CONSTANTS) - -tanl_table_1: -data8 0xA2F9836E4E44152A, 0x00003FFE // two_by_pi -data8 0xC84D32B0CE81B9F1, 0x00004016 // P_0 -data8 0xC90FDAA22168C235, 0x00003FFF // P_1 -data8 0xECE675D1FC8F8CBB, 0x0000BFBD // P_2 -data8 0xB7ED8FBBACC19C60, 0x0000BF7C // P_3 -LOCAL_OBJECT_END(TANL_BASE_CONSTANTS) - -LOCAL_OBJECT_START(tanl_table_2) -data8 0xC90FDAA22168C234, 0x00003FFE // PI_BY_4 -data8 0xA397E5046EC6B45A, 0x00003FE7 // Inv_P_0 -data8 0x8D848E89DBD171A1, 0x0000BFBF // d_1 -data8 0xD5394C3618A66F8E, 0x0000BF7C // d_2 -data4 0x3E800000 // two**-2 -data4 0xBE800000 // -two**-2 -data4 0x00000000 // pad -data4 0x00000000 // pad -LOCAL_OBJECT_END(tanl_table_2) - -LOCAL_OBJECT_START(tanl_table_p1) -data8 0xAAAAAAAAAAAAAABD, 0x00003FFD // P1_1 -data8 0x8888888888882E6A, 0x00003FFC // P1_2 -data8 0xDD0DD0DD0F0177B6, 0x00003FFA // P1_3 -data8 0xB327A440646B8C6D, 0x00003FF9 // P1_4 -data8 0x91371B251D5F7D20, 0x00003FF8 // P1_5 -data8 0xEB69A5F161C67914, 0x00003FF6 // P1_6 -data8 0xBEDD37BE019318D2, 0x00003FF5 // P1_7 -data8 0x9979B1463C794015, 0x00003FF4 // P1_8 -data8 0x8EBD21A38C6EB58A, 0x00003FF3 // P1_9 -LOCAL_OBJECT_END(tanl_table_p1) - -LOCAL_OBJECT_START(tanl_table_q1) -data8 0xAAAAAAAAAAAAAAB4, 0x00003FFD // Q1_1 -data8 0xB60B60B60B5FC93E, 0x00003FF9 // Q1_2 -data8 0x8AB355E00C9BBFBF, 0x00003FF6 // Q1_3 -data8 0xDDEBBC89CBEE3D4C, 0x00003FF2 // Q1_4 -data8 0xB3548A685F80BBB6, 0x00003FEF // Q1_5 -data8 0x913625604CED5BF1, 0x00003FEC // Q1_6 -data8 0xF189D95A8EE92A83, 0x00003FE8 // Q1_7 -LOCAL_OBJECT_END(tanl_table_q1) - -LOCAL_OBJECT_START(tanl_table_p2) -data8 0xAAAAAAAAAAAB362F, 0x00003FFD // P2_1 -data8 0x88888886E97A6097, 0x00003FFC // P2_2 -data8 0xDD108EE025E716A1, 0x00003FFA // P2_3 -LOCAL_OBJECT_END(tanl_table_p2) - -LOCAL_OBJECT_START(tanl_table_tm2) +#include "libm_support.h" + +#ifdef _LIBC +.rodata +#else +.data +#endif +.align 128 + +TANL_BASE_CONSTANTS: +ASM_TYPE_DIRECTIVE(TANL_BASE_CONSTANTS,@object) +data4 0x4B800000, 0xCB800000, 0x38800000, 0xB8800000 // two**24, -two**24 + // two**-14, -two**-14 +data4 0x4E44152A, 0xA2F9836E, 0x00003FFE, 0x00000000 // two_by_pi +data4 0xCE81B9F1, 0xC84D32B0, 0x00004016, 0x00000000 // P_0 +data4 0x2168C235, 0xC90FDAA2, 0x00003FFF, 0x00000000 // P_1 +data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD, 0x00000000 // P_2 +data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C, 0x00000000 // P_3 +data4 0x5F000000, 0xDF000000, 0x00000000, 0x00000000 // two_to_63, -two_to_63 +data4 0x6EC6B45A, 0xA397E504, 0x00003FE7, 0x00000000 // Inv_P_0 +data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF, 0x00000000 // d_1 +data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C, 0x00000000 // d_2 +data4 0x2168C234, 0xC90FDAA2, 0x00003FFE, 0x00000000 // PI_BY_4 +data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE, 0x00000000 // MPI_BY_4 +data4 0x3E800000, 0xBE800000, 0x00000000, 0x00000000 // two**-2, -two**-2 +data4 0x2F000000, 0xAF000000, 0x00000000, 0x00000000 // two**-33, -two**-33 +data4 0xAAAAAABD, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P1_1 +data4 0x88882E6A, 0x88888888, 0x00003FFC, 0x00000000 // P1_2 +data4 0x0F0177B6, 0xDD0DD0DD, 0x00003FFA, 0x00000000 // P1_3 +data4 0x646B8C6D, 0xB327A440, 0x00003FF9, 0x00000000 // P1_4 +data4 0x1D5F7D20, 0x91371B25, 0x00003FF8, 0x00000000 // P1_5 +data4 0x61C67914, 0xEB69A5F1, 0x00003FF6, 0x00000000 // P1_6 +data4 0x019318D2, 0xBEDD37BE, 0x00003FF5, 0x00000000 // P1_7 +data4 0x3C794015, 0x9979B146, 0x00003FF4, 0x00000000 // P1_8 +data4 0x8C6EB58A, 0x8EBD21A3, 0x00003FF3, 0x00000000 // P1_9 +data4 0xAAAAAAB4, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // Q1_1 +data4 0x0B5FC93E, 0xB60B60B6, 0x00003FF9, 0x00000000 // Q1_2 +data4 0x0C9BBFBF, 0x8AB355E0, 0x00003FF6, 0x00000000 // Q1_3 +data4 0xCBEE3D4C, 0xDDEBBC89, 0x00003FF2, 0x00000000 // Q1_4 +data4 0x5F80BBB6, 0xB3548A68, 0x00003FEF, 0x00000000 // Q1_5 +data4 0x4CED5BF1, 0x91362560, 0x00003FEC, 0x00000000 // Q1_6 +data4 0x8EE92A83, 0xF189D95A, 0x00003FE8, 0x00000000 // Q1_7 +data4 0xAAAB362F, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P2_1 +data4 0xE97A6097, 0x88888886, 0x00003FFC, 0x00000000 // P2_2 +data4 0x25E716A1, 0xDD108EE0, 0x00003FFA, 0x00000000 // P2_3 // // Entries T_hi double-precision memory format // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) // Entries T_lo single-precision memory format // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) // -data8 0x3FD09BC362400794 -data4 0x23A05C32, 0x00000000 -data8 0x3FD124A9DFFBC074 -data4 0x240078B2, 0x00000000 -data8 0x3FD1AE235BD4920F -data4 0x23826B8E, 0x00000000 -data8 0x3FD2383515E2701D -data4 0x22D31154, 0x00000000 -data8 0x3FD2C2E463739C2D -data4 0x2265C9E2, 0x00000000 -data8 0x3FD34E36AFEEA48B -data4 0x245C05EB, 0x00000000 -data8 0x3FD3DA317DBB35D1 -data4 0x24749F2D, 0x00000000 -data8 0x3FD466DA67321619 -data4 0x2462CECE, 0x00000000 -data8 0x3FD4F4371F94A4D5 -data4 0x246D0DF1, 0x00000000 -data8 0x3FD5824D740C3E6D -data4 0x240A85B5, 0x00000000 -data8 0x3FD611234CB1E73D -data4 0x23F96E33, 0x00000000 -data8 0x3FD6A0BEAD9EA64B -data4 0x247C5393, 0x00000000 -data8 0x3FD73125B804FD01 -data4 0x241F3B29, 0x00000000 -data8 0x3FD7C25EAB53EE83 -data4 0x2479989B, 0x00000000 -data8 0x3FD8546FE6640EED -data4 0x23B343BC, 0x00000000 -data8 0x3FD8E75FE8AF1892 -data4 0x241454D1, 0x00000000 -data8 0x3FD97B3553928BDA -data4 0x238613D9, 0x00000000 -data8 0x3FDA0FF6EB9DE4DE -data4 0x22859FA7, 0x00000000 -data8 0x3FDAA5AB99ECF92D -data4 0x237A6D06, 0x00000000 -data8 0x3FDB3C5A6D8F1796 -data4 0x23952F6C, 0x00000000 -data8 0x3FDBD40A9CFB8BE4 -data4 0x2280FC95, 0x00000000 -data8 0x3FDC6CC387943100 -data4 0x245D2EC0, 0x00000000 -data8 0x3FDD068CB736C500 -data4 0x23C4AD7D, 0x00000000 -data8 0x3FDDA16DE1DDBC31 -data4 0x23D076E6, 0x00000000 -data8 0x3FDE3D6EEB515A93 -data4 0x244809A6, 0x00000000 -data8 0x3FDEDA97E6E9E5F1 -data4 0x220856C8, 0x00000000 -data8 0x3FDF78F11963CE69 -data4 0x244BE993, 0x00000000 -data8 0x3FE00C417D635BCE -data4 0x23D21799, 0x00000000 -data8 0x3FE05CAB1C302CD3 -data4 0x248A1B1D, 0x00000000 -data8 0x3FE0ADB9DB6A1FA0 -data4 0x23D53E33, 0x00000000 -data8 0x3FE0FF724A20BA81 -data4 0x24DB9ED5, 0x00000000 -data8 0x3FE151D9153FA6F5 -data4 0x24E9E451, 0x00000000 -LOCAL_OBJECT_END(tanl_table_tm2) - -LOCAL_OBJECT_START(tanl_table_tm1) +data4 0x62400794, 0x3FD09BC3, 0x23A05C32, 0x00000000 +data4 0xDFFBC074, 0x3FD124A9, 0x240078B2, 0x00000000 +data4 0x5BD4920F, 0x3FD1AE23, 0x23826B8E, 0x00000000 +data4 0x15E2701D, 0x3FD23835, 0x22D31154, 0x00000000 +data4 0x63739C2D, 0x3FD2C2E4, 0x2265C9E2, 0x00000000 +data4 0xAFEEA48B, 0x3FD34E36, 0x245C05EB, 0x00000000 +data4 0x7DBB35D1, 0x3FD3DA31, 0x24749F2D, 0x00000000 +data4 0x67321619, 0x3FD466DA, 0x2462CECE, 0x00000000 +data4 0x1F94A4D5, 0x3FD4F437, 0x246D0DF1, 0x00000000 +data4 0x740C3E6D, 0x3FD5824D, 0x240A85B5, 0x00000000 +data4 0x4CB1E73D, 0x3FD61123, 0x23F96E33, 0x00000000 +data4 0xAD9EA64B, 0x3FD6A0BE, 0x247C5393, 0x00000000 +data4 0xB804FD01, 0x3FD73125, 0x241F3B29, 0x00000000 +data4 0xAB53EE83, 0x3FD7C25E, 0x2479989B, 0x00000000 +data4 0xE6640EED, 0x3FD8546F, 0x23B343BC, 0x00000000 +data4 0xE8AF1892, 0x3FD8E75F, 0x241454D1, 0x00000000 +data4 0x53928BDA, 0x3FD97B35, 0x238613D9, 0x00000000 +data4 0xEB9DE4DE, 0x3FDA0FF6, 0x22859FA7, 0x00000000 +data4 0x99ECF92D, 0x3FDAA5AB, 0x237A6D06, 0x00000000 +data4 0x6D8F1796, 0x3FDB3C5A, 0x23952F6C, 0x00000000 +data4 0x9CFB8BE4, 0x3FDBD40A, 0x2280FC95, 0x00000000 +data4 0x87943100, 0x3FDC6CC3, 0x245D2EC0, 0x00000000 +data4 0xB736C500, 0x3FDD068C, 0x23C4AD7D, 0x00000000 +data4 0xE1DDBC31, 0x3FDDA16D, 0x23D076E6, 0x00000000 +data4 0xEB515A93, 0x3FDE3D6E, 0x244809A6, 0x00000000 +data4 0xE6E9E5F1, 0x3FDEDA97, 0x220856C8, 0x00000000 +data4 0x1963CE69, 0x3FDF78F1, 0x244BE993, 0x00000000 +data4 0x7D635BCE, 0x3FE00C41, 0x23D21799, 0x00000000 +data4 0x1C302CD3, 0x3FE05CAB, 0x248A1B1D, 0x00000000 +data4 0xDB6A1FA0, 0x3FE0ADB9, 0x23D53E33, 0x00000000 +data4 0x4A20BA81, 0x3FE0FF72, 0x24DB9ED5, 0x00000000 +data4 0x153FA6F5, 0x3FE151D9, 0x24E9E451, 0x00000000 // // Entries T_hi double-precision memory format // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) // Entries T_lo single-precision memory format // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) // -data8 0x3FE1CEC4BA1BE39E -data4 0x24B60F9E, 0x00000000 -data8 0x3FE277E45ABD9B2D -data4 0x248C2474, 0x00000000 -data8 0x3FE324180272B110 -data4 0x247B8311, 0x00000000 -data8 0x3FE3D38B890E2DF0 -data4 0x24C55751, 0x00000000 -data8 0x3FE4866D46236871 -data4 0x24E5BC34, 0x00000000 -data8 0x3FE53CEE45E044B0 -data4 0x24001BA4, 0x00000000 -data8 0x3FE5F74282EC06E4 -data4 0x24B973DC, 0x00000000 -data8 0x3FE6B5A125DF43F9 -data4 0x24895440, 0x00000000 -data8 0x3FE77844CAFD348C -data4 0x240021CA, 0x00000000 -data8 0x3FE83F6BCEED6B92 -data4 0x24C45372, 0x00000000 -data8 0x3FE90B58A34F3665 -data4 0x240DAD33, 0x00000000 -data8 0x3FE9DC522C1E56B4 -data4 0x24F846CE, 0x00000000 -data8 0x3FEAB2A427041578 -data4 0x2323FB6E, 0x00000000 -data8 0x3FEB8E9F9DD8C373 -data4 0x24B3090B, 0x00000000 -data8 0x3FEC709B65C9AA7B -data4 0x2449F611, 0x00000000 -data8 0x3FED58F4ACCF8435 -data4 0x23616A7E, 0x00000000 -data8 0x3FEE480F97635082 -data4 0x24C2FEAE, 0x00000000 -data8 0x3FEF3E57F0ACC544 -data4 0x242CE964, 0x00000000 -data8 0x3FF01E20F7E06E4B -data4 0x2480D3EE, 0x00000000 -data8 0x3FF0A1258A798A69 -data4 0x24DB8967, 0x00000000 -LOCAL_OBJECT_END(tanl_table_tm1) - -LOCAL_OBJECT_START(tanl_table_cm2) +data4 0xBA1BE39E, 0x3FE1CEC4, 0x24B60F9E, 0x00000000 +data4 0x5ABD9B2D, 0x3FE277E4, 0x248C2474, 0x00000000 +data4 0x0272B110, 0x3FE32418, 0x247B8311, 0x00000000 +data4 0x890E2DF0, 0x3FE3D38B, 0x24C55751, 0x00000000 +data4 0x46236871, 0x3FE4866D, 0x24E5BC34, 0x00000000 +data4 0x45E044B0, 0x3FE53CEE, 0x24001BA4, 0x00000000 +data4 0x82EC06E4, 0x3FE5F742, 0x24B973DC, 0x00000000 +data4 0x25DF43F9, 0x3FE6B5A1, 0x24895440, 0x00000000 +data4 0xCAFD348C, 0x3FE77844, 0x240021CA, 0x00000000 +data4 0xCEED6B92, 0x3FE83F6B, 0x24C45372, 0x00000000 +data4 0xA34F3665, 0x3FE90B58, 0x240DAD33, 0x00000000 +data4 0x2C1E56B4, 0x3FE9DC52, 0x24F846CE, 0x00000000 +data4 0x27041578, 0x3FEAB2A4, 0x2323FB6E, 0x00000000 +data4 0x9DD8C373, 0x3FEB8E9F, 0x24B3090B, 0x00000000 +data4 0x65C9AA7B, 0x3FEC709B, 0x2449F611, 0x00000000 +data4 0xACCF8435, 0x3FED58F4, 0x23616A7E, 0x00000000 +data4 0x97635082, 0x3FEE480F, 0x24C2FEAE, 0x00000000 +data4 0xF0ACC544, 0x3FEF3E57, 0x242CE964, 0x00000000 +data4 0xF7E06E4B, 0x3FF01E20, 0x2480D3EE, 0x00000000 +data4 0x8A798A69, 0x3FF0A125, 0x24DB8967, 0x00000000 // // Entries C_hi double-precision memory format // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) // Entries C_lo single-precision memory format // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) // -data8 0x400ED3E2E63EFBD0 -data4 0x259D94D4, 0x00000000 -data8 0x400DDDB4C515DAB5 -data4 0x245F0537, 0x00000000 -data8 0x400CF57ABE19A79F -data4 0x25D4EA9F, 0x00000000 -data8 0x400C1A06D15298ED -data4 0x24AE40A0, 0x00000000 -data8 0x400B4A4C164B2708 -data4 0x25A5AAB6, 0x00000000 -data8 0x400A855A5285B068 -data4 0x25524F18, 0x00000000 -data8 0x4009CA5A3FFA549F -data4 0x24C999C0, 0x00000000 -data8 0x4009188A646AF623 -data4 0x254FD801, 0x00000000 -data8 0x40086F3C6084D0E7 -data4 0x2560F5FD, 0x00000000 -data8 0x4007CDD2A29A76EE -data4 0x255B9D19, 0x00000000 -data8 0x400733BE6C8ECA95 -data4 0x25CB021B, 0x00000000 -data8 0x4006A07E1F8DDC52 -data4 0x24AB4722, 0x00000000 -data8 0x4006139BC298AD58 -data4 0x252764E2, 0x00000000 -data8 0x40058CABBAD7164B -data4 0x24DAF5DB, 0x00000000 -data8 0x40050B4BAE31A5D3 -data4 0x25EA20F4, 0x00000000 -data8 0x40048F2189F85A8A -data4 0x2583A3E8, 0x00000000 -data8 0x400417DAA862380D -data4 0x25DCC4CC, 0x00000000 -data8 0x4003A52B1088FCFE -data4 0x2430A492, 0x00000000 -data8 0x400336CCCD3527D5 -data4 0x255F77CF, 0x00000000 -data8 0x4002CC7F5760766D -data4 0x25DA0BDA, 0x00000000 -data8 0x4002660711CE02E3 -data4 0x256FF4A2, 0x00000000 -data8 0x4002032CD37BBE04 -data4 0x25208AED, 0x00000000 -data8 0x4001A3BD7F050775 -data4 0x24B72DD6, 0x00000000 -data8 0x40014789A554848A -data4 0x24AB4DAA, 0x00000000 -data8 0x4000EE65323E81B7 -data4 0x2584C440, 0x00000000 -data8 0x4000982721CF1293 -data4 0x25C9428D, 0x00000000 -data8 0x400044A93D415EEB -data4 0x25DC8482, 0x00000000 -data8 0x3FFFE78FBD72C577 -data4 0x257F5070, 0x00000000 -data8 0x3FFF4AC375EFD28E -data4 0x23EBBF7A, 0x00000000 -data8 0x3FFEB2AF60B52DDE -data4 0x22EECA07, 0x00000000 -data8 0x3FFE1F1935204180 -data4 0x24191079, 0x00000000 -data8 0x3FFD8FCA54F7E60A -data4 0x248D3058, 0x00000000 -LOCAL_OBJECT_END(tanl_table_cm2) - -LOCAL_OBJECT_START(tanl_table_cm1) +data4 0xE63EFBD0, 0x400ED3E2, 0x259D94D4, 0x00000000 +data4 0xC515DAB5, 0x400DDDB4, 0x245F0537, 0x00000000 +data4 0xBE19A79F, 0x400CF57A, 0x25D4EA9F, 0x00000000 +data4 0xD15298ED, 0x400C1A06, 0x24AE40A0, 0x00000000 +data4 0x164B2708, 0x400B4A4C, 0x25A5AAB6, 0x00000000 +data4 0x5285B068, 0x400A855A, 0x25524F18, 0x00000000 +data4 0x3FFA549F, 0x4009CA5A, 0x24C999C0, 0x00000000 +data4 0x646AF623, 0x4009188A, 0x254FD801, 0x00000000 +data4 0x6084D0E7, 0x40086F3C, 0x2560F5FD, 0x00000000 +data4 0xA29A76EE, 0x4007CDD2, 0x255B9D19, 0x00000000 +data4 0x6C8ECA95, 0x400733BE, 0x25CB021B, 0x00000000 +data4 0x1F8DDC52, 0x4006A07E, 0x24AB4722, 0x00000000 +data4 0xC298AD58, 0x4006139B, 0x252764E2, 0x00000000 +data4 0xBAD7164B, 0x40058CAB, 0x24DAF5DB, 0x00000000 +data4 0xAE31A5D3, 0x40050B4B, 0x25EA20F4, 0x00000000 +data4 0x89F85A8A, 0x40048F21, 0x2583A3E8, 0x00000000 +data4 0xA862380D, 0x400417DA, 0x25DCC4CC, 0x00000000 +data4 0x1088FCFE, 0x4003A52B, 0x2430A492, 0x00000000 +data4 0xCD3527D5, 0x400336CC, 0x255F77CF, 0x00000000 +data4 0x5760766D, 0x4002CC7F, 0x25DA0BDA, 0x00000000 +data4 0x11CE02E3, 0x40026607, 0x256FF4A2, 0x00000000 +data4 0xD37BBE04, 0x4002032C, 0x25208AED, 0x00000000 +data4 0x7F050775, 0x4001A3BD, 0x24B72DD6, 0x00000000 +data4 0xA554848A, 0x40014789, 0x24AB4DAA, 0x00000000 +data4 0x323E81B7, 0x4000EE65, 0x2584C440, 0x00000000 +data4 0x21CF1293, 0x40009827, 0x25C9428D, 0x00000000 +data4 0x3D415EEB, 0x400044A9, 0x25DC8482, 0x00000000 +data4 0xBD72C577, 0x3FFFE78F, 0x257F5070, 0x00000000 +data4 0x75EFD28E, 0x3FFF4AC3, 0x23EBBF7A, 0x00000000 +data4 0x60B52DDE, 0x3FFEB2AF, 0x22EECA07, 0x00000000 +data4 0x35204180, 0x3FFE1F19, 0x24191079, 0x00000000 +data4 0x54F7E60A, 0x3FFD8FCA, 0x248D3058, 0x00000000 // // Entries C_hi double-precision memory format // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) // Entries C_lo single-precision memory format // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) // -data8 0x3FFCC06A79F6FADE -data4 0x239C7886, 0x00000000 -data8 0x3FFBB91F891662A6 -data4 0x250BD191, 0x00000000 -data8 0x3FFABFB6529F155D -data4 0x256CC3E6, 0x00000000 -data8 0x3FF9D3002E964AE9 -data4 0x250843E3, 0x00000000 -data8 0x3FF8F1EF89DCB383 -data4 0x2277C87E, 0x00000000 -data8 0x3FF81B937C87DBD6 -data4 0x256DA6CF, 0x00000000 -data8 0x3FF74F141042EDE4 -data4 0x2573D28A, 0x00000000 -data8 0x3FF68BAF1784B360 -data4 0x242E489A, 0x00000000 -data8 0x3FF5D0B57C923C4C -data4 0x2532D940, 0x00000000 -data8 0x3FF51D88F418EF20 -data4 0x253C7DD6, 0x00000000 -data8 0x3FF4719A02F88DAE -data4 0x23DB59BF, 0x00000000 -data8 0x3FF3CC6649DA0788 -data4 0x252B4756, 0x00000000 -data8 0x3FF32D770B980DB8 -data4 0x23FE585F, 0x00000000 -data8 0x3FF2945FE56C987A -data4 0x25378A63, 0x00000000 -data8 0x3FF200BDB16523F6 -data4 0x247BB2E0, 0x00000000 -data8 0x3FF172358CE27778 -data4 0x24446538, 0x00000000 -data8 0x3FF0E873FDEFE692 -data4 0x2514638F, 0x00000000 -data8 0x3FF0632C33154062 -data4 0x24A7FC27, 0x00000000 -data8 0x3FEFC42EB3EF115F -data4 0x248FD0FE, 0x00000000 -data8 0x3FEEC9E8135D26F6 -data4 0x2385C719, 0x00000000 -LOCAL_OBJECT_END(tanl_table_cm1) - -LOCAL_OBJECT_START(tanl_table_scim2) +data4 0x79F6FADE, 0x3FFCC06A, 0x239C7886, 0x00000000 +data4 0x891662A6, 0x3FFBB91F, 0x250BD191, 0x00000000 +data4 0x529F155D, 0x3FFABFB6, 0x256CC3E6, 0x00000000 +data4 0x2E964AE9, 0x3FF9D300, 0x250843E3, 0x00000000 +data4 0x89DCB383, 0x3FF8F1EF, 0x2277C87E, 0x00000000 +data4 0x7C87DBD6, 0x3FF81B93, 0x256DA6CF, 0x00000000 +data4 0x1042EDE4, 0x3FF74F14, 0x2573D28A, 0x00000000 +data4 0x1784B360, 0x3FF68BAF, 0x242E489A, 0x00000000 +data4 0x7C923C4C, 0x3FF5D0B5, 0x2532D940, 0x00000000 +data4 0xF418EF20, 0x3FF51D88, 0x253C7DD6, 0x00000000 +data4 0x02F88DAE, 0x3FF4719A, 0x23DB59BF, 0x00000000 +data4 0x49DA0788, 0x3FF3CC66, 0x252B4756, 0x00000000 +data4 0x0B980DB8, 0x3FF32D77, 0x23FE585F, 0x00000000 +data4 0xE56C987A, 0x3FF2945F, 0x25378A63, 0x00000000 +data4 0xB16523F6, 0x3FF200BD, 0x247BB2E0, 0x00000000 +data4 0x8CE27778, 0x3FF17235, 0x24446538, 0x00000000 +data4 0xFDEFE692, 0x3FF0E873, 0x2514638F, 0x00000000 +data4 0x33154062, 0x3FF0632C, 0x24A7FC27, 0x00000000 +data4 0xB3EF115F, 0x3FEFC42E, 0x248FD0FE, 0x00000000 +data4 0x135D26F6, 0x3FEEC9E8, 0x2385C719, 0x00000000 // // Entries SC_inv in Swapped IEEE format (extended) // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) // -data8 0x839D6D4A1BF30C9E, 0x00004001 -data8 0x80092804554B0EB0, 0x00004001 -data8 0xF959F94CA1CF0DE9, 0x00004000 -data8 0xF3086BA077378677, 0x00004000 -data8 0xED154515CCD4723C, 0x00004000 -data8 0xE77909441C27CF25, 0x00004000 -data8 0xE22D037D8DDACB88, 0x00004000 -data8 0xDD2B2D8A89C73522, 0x00004000 -data8 0xD86E1A23BB2C1171, 0x00004000 -data8 0xD3F0E288DFF5E0F9, 0x00004000 -data8 0xCFAF16B1283BEBD5, 0x00004000 -data8 0xCBA4AFAA0D88DD53, 0x00004000 -data8 0xC7CE03CCCA67C43D, 0x00004000 -data8 0xC427BC820CA0DDB0, 0x00004000 -data8 0xC0AECD57F13D8CAB, 0x00004000 -data8 0xBD606C3871ECE6B1, 0x00004000 -data8 0xBA3A0A96A44C4929, 0x00004000 -data8 0xB7394F6FE5CCCEC1, 0x00004000 -data8 0xB45C12039637D8BC, 0x00004000 -data8 0xB1A0552892CB051B, 0x00004000 -data8 0xAF04432B6BA2FFD0, 0x00004000 -data8 0xAC862A237221235F, 0x00004000 -data8 0xAA2478AF5F00A9D1, 0x00004000 -data8 0xA7DDBB0C81E082BF, 0x00004000 -data8 0xA5B0987D45684FEE, 0x00004000 -data8 0xA39BD0F5627A8F53, 0x00004000 -data8 0xA19E3B036EC5C8B0, 0x00004000 -data8 0x9FB6C1F091CD7C66, 0x00004000 -data8 0x9DE464101FA3DF8A, 0x00004000 -data8 0x9C263139A8F6B888, 0x00004000 -data8 0x9A7B4968C27B0450, 0x00004000 -data8 0x98E2DB7E5EE614EE, 0x00004000 -LOCAL_OBJECT_END(tanl_table_scim2) - -LOCAL_OBJECT_START(tanl_table_scim1) +data4 0x1BF30C9E, 0x839D6D4A, 0x00004001, 0x00000000 +data4 0x554B0EB0, 0x80092804, 0x00004001, 0x00000000 +data4 0xA1CF0DE9, 0xF959F94C, 0x00004000, 0x00000000 +data4 0x77378677, 0xF3086BA0, 0x00004000, 0x00000000 +data4 0xCCD4723C, 0xED154515, 0x00004000, 0x00000000 +data4 0x1C27CF25, 0xE7790944, 0x00004000, 0x00000000 +data4 0x8DDACB88, 0xE22D037D, 0x00004000, 0x00000000 +data4 0x89C73522, 0xDD2B2D8A, 0x00004000, 0x00000000 +data4 0xBB2C1171, 0xD86E1A23, 0x00004000, 0x00000000 +data4 0xDFF5E0F9, 0xD3F0E288, 0x00004000, 0x00000000 +data4 0x283BEBD5, 0xCFAF16B1, 0x00004000, 0x00000000 +data4 0x0D88DD53, 0xCBA4AFAA, 0x00004000, 0x00000000 +data4 0xCA67C43D, 0xC7CE03CC, 0x00004000, 0x00000000 +data4 0x0CA0DDB0, 0xC427BC82, 0x00004000, 0x00000000 +data4 0xF13D8CAB, 0xC0AECD57, 0x00004000, 0x00000000 +data4 0x71ECE6B1, 0xBD606C38, 0x00004000, 0x00000000 +data4 0xA44C4929, 0xBA3A0A96, 0x00004000, 0x00000000 +data4 0xE5CCCEC1, 0xB7394F6F, 0x00004000, 0x00000000 +data4 0x9637D8BC, 0xB45C1203, 0x00004000, 0x00000000 +data4 0x92CB051B, 0xB1A05528, 0x00004000, 0x00000000 +data4 0x6BA2FFD0, 0xAF04432B, 0x00004000, 0x00000000 +data4 0x7221235F, 0xAC862A23, 0x00004000, 0x00000000 +data4 0x5F00A9D1, 0xAA2478AF, 0x00004000, 0x00000000 +data4 0x81E082BF, 0xA7DDBB0C, 0x00004000, 0x00000000 +data4 0x45684FEE, 0xA5B0987D, 0x00004000, 0x00000000 +data4 0x627A8F53, 0xA39BD0F5, 0x00004000, 0x00000000 +data4 0x6EC5C8B0, 0xA19E3B03, 0x00004000, 0x00000000 +data4 0x91CD7C66, 0x9FB6C1F0, 0x00004000, 0x00000000 +data4 0x1FA3DF8A, 0x9DE46410, 0x00004000, 0x00000000 +data4 0xA8F6B888, 0x9C263139, 0x00004000, 0x00000000 +data4 0xC27B0450, 0x9A7B4968, 0x00004000, 0x00000000 +data4 0x5EE614EE, 0x98E2DB7E, 0x00004000, 0x00000000 // // Entries SC_inv in Swapped IEEE format (extended) // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) // -data8 0x969F335C13B2B5BA, 0x00004000 -data8 0x93D446D9D4C0F548, 0x00004000 -data8 0x9147094F61B798AF, 0x00004000 -data8 0x8EF317CC758787AC, 0x00004000 -data8 0x8CD498B3B99EEFDB, 0x00004000 -data8 0x8AE82A7DDFF8BC37, 0x00004000 -data8 0x892AD546E3C55D42, 0x00004000 -data8 0x8799FEA9D15573C1, 0x00004000 -data8 0x86335F88435A4B4C, 0x00004000 -data8 0x84F4FB6E3E93A87B, 0x00004000 -data8 0x83DD195280A382FB, 0x00004000 -data8 0x82EA3D7FA4CB8C9E, 0x00004000 -data8 0x821B247C6861D0A8, 0x00004000 -data8 0x816EBED163E8D244, 0x00004000 -data8 0x80E42D9127E4CFC6, 0x00004000 -data8 0x807ABF8D28E64AFD, 0x00004000 -data8 0x8031EF26863B4FD8, 0x00004000 -data8 0x800960ADAE8C11FD, 0x00004000 -data8 0x8000E1475FDBEC21, 0x00004000 -data8 0x80186650A07791FA, 0x00004000 -LOCAL_OBJECT_END(tanl_table_scim1) - -Arg = f8 -Save_Norm_Arg = f8 // For input to reduction routine +data4 0x13B2B5BA, 0x969F335C, 0x00004000, 0x00000000 +data4 0xD4C0F548, 0x93D446D9, 0x00004000, 0x00000000 +data4 0x61B798AF, 0x9147094F, 0x00004000, 0x00000000 +data4 0x758787AC, 0x8EF317CC, 0x00004000, 0x00000000 +data4 0xB99EEFDB, 0x8CD498B3, 0x00004000, 0x00000000 +data4 0xDFF8BC37, 0x8AE82A7D, 0x00004000, 0x00000000 +data4 0xE3C55D42, 0x892AD546, 0x00004000, 0x00000000 +data4 0xD15573C1, 0x8799FEA9, 0x00004000, 0x00000000 +data4 0x435A4B4C, 0x86335F88, 0x00004000, 0x00000000 +data4 0x3E93A87B, 0x84F4FB6E, 0x00004000, 0x00000000 +data4 0x80A382FB, 0x83DD1952, 0x00004000, 0x00000000 +data4 0xA4CB8C9E, 0x82EA3D7F, 0x00004000, 0x00000000 +data4 0x6861D0A8, 0x821B247C, 0x00004000, 0x00000000 +data4 0x63E8D244, 0x816EBED1, 0x00004000, 0x00000000 +data4 0x27E4CFC6, 0x80E42D91, 0x00004000, 0x00000000 +data4 0x28E64AFD, 0x807ABF8D, 0x00004000, 0x00000000 +data4 0x863B4FD8, 0x8031EF26, 0x00004000, 0x00000000 +data4 0xAE8C11FD, 0x800960AD, 0x00004000, 0x00000000 +data4 0x5FDBEC21, 0x8000E147, 0x00004000, 0x00000000 +data4 0xA07791FA, 0x80186650, 0x00004000, 0x00000000 +ASM_SIZE_DIRECTIVE(TANL_BASE_CONSTANTS) + +Arg = f8 Result = f8 -r = f8 // For output from reduction routine -c = f9 // For output from reduction routine +fp_tmp = f9 U_2 = f10 -rsq = f11 +rsq = f11 C_hi = f12 C_lo = f13 T_hi = f14 T_lo = f15 +N_0 = f32 d_1 = f33 -N_0 = f34 +MPI_BY_4 = f34 tail = f35 tanx = f36 Cx = f37 @@ -1105,6 +949,8 @@ P1_7 = f51 P1_8 = f52 P1_9 = f53 +TWO_TO_63 = f54 +NEGTWO_TO_63 = f55 x = f56 xsq = f57 Tx = f58 @@ -1120,10 +966,12 @@ B = f67 SC_inv = f68 Pos_r = f69 N_0_fix = f70 -d_2 = f71 -PI_BY_4 = f72 +PI_BY_4 = f71 +NEGTWO_TO_NEG2 = f72 +TWO_TO_24 = f73 TWO_TO_NEG14 = f74 TWO_TO_NEG33 = f75 +NEGTWO_TO_24 = f76 NEGTWO_TO_NEG14 = f76 NEGTWO_TO_NEG33 = f77 two_by_PI = f78 @@ -1134,14 +982,13 @@ P_2 = f82 P_3 = f83 s_val = f84 w = f85 -B_mask1 = f86 -B_mask2 = f87 -w2 = f88 +c = f86 +r = f87 A = f89 a = f90 t = f91 U_1 = f92 -NEGTWO_TO_NEG2 = f93 +d_2 = f93 TWO_TO_NEG2 = f94 Q1_1 = f95 Q1_2 = f96 @@ -1162,643 +1009,609 @@ V_hiabs = f110 V = f111 Inv_P_0 = f112 -FR_inv_pi_2to63 = f113 -FR_rshf_2to64 = f114 -FR_2tom64 = f115 -FR_rshf = f116 -Norm_Arg = f117 -Abs_Arg = f118 -TWO_TO_NEG65 = f119 -fp_tmp = f120 -mOne = f121 - GR_SAVE_B0 = r33 GR_SAVE_GP = r34 GR_SAVE_PFS = r35 -table_base = r36 +delta1 = r36 table_ptr1 = r37 table_ptr2 = r38 -table_ptr3 = r39 -lookup = r40 -N_fix_gr = r41 -GR_exp_2tom2 = r42 -GR_exp_2tom65 = r43 -exp_r = r44 -sig_r = r45 -bmask1 = r46 -table_offset = r47 -bmask2 = r48 +i_0 = r39 +i_1 = r40 +N_fix_gr = r41 +N_inc = r42 +exp_Arg = r43 +exp_r = r44 +sig_r = r45 +lookup = r46 +table_offset = r47 +Create_B = r48 gr_tmp = r49 -cot_flag = r50 - -GR_sig_inv_pi = r51 -GR_rshf_2to64 = r52 -GR_exp_2tom64 = r53 -GR_rshf = r54 -GR_exp_2_to_63 = r55 -GR_exp_2_to_24 = r56 -GR_signexp_x = r57 -GR_exp_x = r58 -GR_exp_mask = r59 -GR_exp_2tom14 = r60 -GR_exp_m2tom14 = r61 -GR_exp_2tom33 = r62 -GR_exp_m2tom33 = r63 - -GR_SAVE_B0 = r64 -GR_SAVE_PFS = r65 -GR_SAVE_GP = r66 - -GR_Parameter_X = r67 -GR_Parameter_Y = r68 -GR_Parameter_RESULT = r69 -GR_Parameter_Tag = r70 - .section .text -.global __libm_tanl# -.global __libm_cotl# - -.proc __libm_cotl# -__libm_cotl: -.endp __libm_cotl# -LOCAL_LIBM_ENTRY(cotl) - -{ .mlx - alloc r32 = ar.pfs, 0,35,4,0 - movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi -} -{ .mlx - mov GR_exp_mask = 0x1ffff // Exponent mask - movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64) -} -;; - -// Check for NatVals, Infs , NaNs, and Zeros -{ .mfi - getf.exp GR_signexp_x = Arg // Get sign and exponent of x - fclass.m p6,p0 = Arg, 0x1E7 // Test for natval, nan, inf, zero - mov cot_flag = 0x1 -} -{ .mfb - addl table_base = @ltoff(TANL_BASE_CONSTANTS), gp // Pointer to table ptr - fnorm.s1 Norm_Arg = Arg // Normalize x - br.cond.sptk COMMON_PATH +.global tanl +.proc tanl +tanl: +#ifdef _LIBC +.global __tanl +.proc __tanl +__tanl: +#endif +{ .mfi +alloc r32 = ar.pfs, 0,17,2,0 +(p0) fclass.m.unc p6,p0 = Arg, 0x1E7 + addl gr_tmp = -1,r0 +} +{ .mfi + nop.m 0 +(p0) fclass.nm.unc p7,p0 = Arg, 0x1FF + nop.i 0 };; -LOCAL_LIBM_END(cotl) - - -.proc __libm_tanl# -__libm_tanl: -.endp __libm_tanl# -GLOBAL_IEEE754_ENTRY(tanl) - -{ .mlx - alloc r32 = ar.pfs, 0,35,4,0 - movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi -} -{ .mlx - mov GR_exp_mask = 0x1ffff // Exponent mask - movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64) -} -;; - -// Check for NatVals, Infs , NaNs, and Zeros -{ .mfi - getf.exp GR_signexp_x = Arg // Get sign and exponent of x - fclass.m p6,p0 = Arg, 0x1E7 // Test for natval, nan, inf, zero - mov cot_flag = 0x0 -} { .mfi - addl table_base = @ltoff(TANL_BASE_CONSTANTS), gp // Pointer to table ptr - fnorm.s1 Norm_Arg = Arg // Normalize x +(p0) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp + nop.f 999 nop.i 0 -};; - -// Common path for both tanl and cotl -COMMON_PATH: -{ .mfi - setf.sig FR_inv_pi_2to63 = GR_sig_inv_pi // Form 1/pi * 2^63 - fclass.m p9, p0 = Arg, 0x0b // Test x denormal - mov GR_exp_2tom64 = 0xffff - 64 // Scaling constant to compute N -} -{ .mlx - setf.d FR_rshf_2to64 = GR_rshf_2to64 // Form const 1.1000 * 2^(63+64) - movl GR_rshf = 0x43e8000000000000 // Form const 1.1000 * 2^63 -} -;; - -// Check for everything - if false, then must be pseudo-zero or pseudo-nan. -// Branch out to deal with special values. -{ .mfi - addl gr_tmp = -1,r0 - fclass.nm p7,p0 = Arg, 0x1FF // Test x unsupported - mov GR_exp_2_to_63 = 0xffff + 63 // Exponent of 2^63 -} -{ .mfb - ld8 table_base = [table_base] // Get pointer to constant table - fms.s1 mOne = f0, f0, f1 -(p6) br.cond.spnt TANL_SPECIAL // Branch if x natval, nan, inf, zero } ;; - -{ .mmb +{ .mmi +(p0) ld8 table_ptr1 = [table_ptr1] setf.sig fp_tmp = gr_tmp // Make a constant so fmpy produces inexact - mov GR_exp_2_to_24 = 0xffff + 24 // Exponent of 2^24 -(p9) br.cond.spnt TANL_DENORMAL // Branch if x denormal + nop.i 999 } ;; -TANL_COMMON: -// Return to here if x denormal // -// Do fcmp to generate Denormal exception -// - can't do FNORM (will generate Underflow when U is unmasked!) -// Branch out to deal with unsupporteds values. -{ .mfi - setf.exp FR_2tom64 = GR_exp_2tom64 // Form 2^-64 for scaling N_float - fcmp.eq.s0 p0, p6 = Arg, f1 // Dummy to flag denormals - add table_ptr1 = 0, table_base // Point to tanl_table_1 +// Check for NatVals, Infs , NaNs, and Zeros +// Check for everything - if false, then must be pseudo-zero +// or pseudo-nan. +// Local table pointer +// +{ .mbb +(p0) add table_ptr2 = 96, table_ptr1 +(p6) br.cond.spnt L(TANL_SPECIAL) +(p7) br.cond.spnt L(TANL_SPECIAL) ;; } -{ .mib - setf.d FR_rshf = GR_rshf // Form right shift const 1.1000 * 2^63 - add table_ptr2 = 80, table_base // Point to tanl_table_2 -(p7) br.cond.spnt TANL_UNSUPPORTED // Branch if x unsupported type +// +// Point to Inv_P_0 +// Branch out to deal with unsupporteds and special values. +// +{ .mmf +(p0) ldfs TWO_TO_24 = [table_ptr1],4 +(p0) ldfs TWO_TO_63 = [table_ptr2],4 +// +// Load -2**24, load -2**63. +// +(p0) fcmp.eq.s0 p0, p6 = Arg, f1 ;; } -;; - { .mfi - and GR_exp_x = GR_exp_mask, GR_signexp_x // Get exponent of x - fmpy.s1 Save_Norm_Arg = Norm_Arg, f1 // Save x if large arg reduction - dep.z bmask1 = 0x7c, 56, 8 // Form mask to get 5 msb of r - // bmask1 = 0x7c00000000000000 +(p0) ldfs NEGTWO_TO_63 = [table_ptr2],12 +(p0) fnorm.s1 Arg = Arg + nop.i 999 +} +// +// Load 2**24, Load 2**63. +// +{ .mmi +(p0) ldfs NEGTWO_TO_24 = [table_ptr1],12 ;; +// +// Do fcmp to generate Denormal exception +// - can't do FNORM (will generate Underflow when U is unmasked!) +// Normalize input argument. +// +(p0) ldfe two_by_PI = [table_ptr1],16 + nop.i 999 +} +{ .mmi +(p0) ldfe Inv_P_0 = [table_ptr2],16 ;; +(p0) ldfe d_1 = [table_ptr2],16 + nop.i 999 } -;; - // // Decide about the paths to take: -// Set PR_6 if |Arg| >= 2**63 -// Set PR_9 if |Arg| < 2**24 - CASE 1 OR 2 -// OTHERWISE Set PR_8 - CASE 3 OR 4 +// PR_1 and PR_3 set if -2**24 < Arg < 2**24 - CASE 1 OR 2 +// OTHERWISE - CASE 3 OR 4 +// Load inverse of P_0 . +// Set PR_6 if Arg <= -2**63 +// Are there any Infs, NaNs, or zeros? // -// Branch out if the magnitude of the input argument is >= 2^63 -// - do this branch before the next. -{ .mfi - ldfe two_by_PI = [table_ptr1],16 // Load 2/pi - nop.f 999 - dep.z bmask2 = 0x41, 57, 7 // Form mask to OR to produce B - // bmask2 = 0x8200000000000000 +{ .mmi +(p0) ldfe P_0 = [table_ptr1],16 ;; +(p0) ldfe d_2 = [table_ptr2],16 + nop.i 999 } -{ .mib - ldfe PI_BY_4 = [table_ptr2],16 // Load pi/4 - cmp.ge p6,p0 = GR_exp_x, GR_exp_2_to_63 // Is |x| >= 2^63 -(p6) br.cond.spnt TANL_ARG_TOO_LARGE // Branch if |x| >= 2^63 +// +// Set PR_8 if Arg <= -2**24 +// Set PR_6 if Arg >= 2**63 +// +{ .mmi +(p0) ldfe P_1 = [table_ptr1],16 ;; +(p0) ldfe PI_BY_4 = [table_ptr2],16 + nop.i 999 } -;; - +// +// Set PR_8 if Arg >= 2**24 +// { .mmi - ldfe P_0 = [table_ptr1],16 // Load P_0 - ldfe Inv_P_0 = [table_ptr2],16 // Load Inv_P_0 - nop.i 999 +(p0) ldfe P_2 = [table_ptr1],16 ;; +(p0) ldfe MPI_BY_4 = [table_ptr2],16 + nop.i 999 } -;; - +// +// Load P_2 and PI_BY_4 +// { .mfi - ldfe P_1 = [table_ptr1],16 // Load P_1 - fmerge.s Abs_Arg = f0, Norm_Arg // Get |x| - mov GR_exp_m2tom33 = 0x2ffff - 33 // Form signexp of -2^-33 +(p0) ldfe P_3 = [table_ptr1],16 + nop.f 999 + nop.i 999 ;; } { .mfi - ldfe d_1 = [table_ptr2],16 // Load d_1 for 2^24 <= |x| < 2^63 - nop.f 999 - mov GR_exp_2tom33 = 0xffff - 33 // Form signexp of 2^-33 + nop.m 999 +(p0) fcmp.le.unc.s1 p6,p7 = Arg,NEGTWO_TO_63 + nop.i 999 } -;; - -{ .mmi - ldfe P_2 = [table_ptr1],16 // Load P_2 - ldfe d_2 = [table_ptr2],16 // Load d_2 for 2^24 <= |x| < 2^63 - cmp.ge p8,p0 = GR_exp_x, GR_exp_2_to_24 // Is |x| >= 2^24 +{ .mfi + nop.m 999 +(p0) fcmp.le.unc.s1 p8,p9 = Arg,NEGTWO_TO_24 + nop.i 999 ;; } -;; - -// Use special scaling to right shift so N=Arg * 2/pi is in rightmost bits -// Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24 -{ .mfb - ldfe P_3 = [table_ptr1],16 // Load P_3 - fma.s1 N_fix = Norm_Arg, FR_inv_pi_2to63, FR_rshf_2to64 -(p8) br.cond.spnt TANL_LARGER_ARG // Branch if 2^24 <= |x| < 2^63 +{ .mfi + nop.m 999 +(p7) fcmp.ge.s1 p6,p0 = Arg,TWO_TO_63 + nop.i 999 } -;; - -// Here if 0 < |x| < 2^24 -// ARGUMENT REDUCTION CODE - CASE 1 and 2 +{ .mfi + nop.m 999 +(p9) fcmp.ge.s1 p8,p0 = Arg,TWO_TO_24 + nop.i 999 ;; +} +{ .mib + nop.m 999 + nop.i 999 // -{ .mmf - setf.exp TWO_TO_NEG33 = GR_exp_2tom33 // Form 2^-33 - setf.exp NEGTWO_TO_NEG33 = GR_exp_m2tom33 // Form -2^-33 - fmerge.s r = Norm_Arg,Norm_Arg // Assume r=x, ok if |x| < pi/4 +// Load P_3 and -PI_BY_4 +// +(p6) br.cond.spnt L(TANL_ARG_TOO_LARGE) ;; } -;; - +{ .mib + nop.m 999 + nop.i 999 // -// If |Arg| < pi/4, set PR_8, else pi/4 <=|Arg| < 2^24 - set PR_9. +// Load 2**(-2). +// Load -2**(-2). +// Branch out if we have a special argument. +// Branch out if the magnitude of the input argument is too large +// - do this branch before the next. +// +(p8) br.cond.spnt L(TANL_LARGER_ARG) ;; +} +// +// Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24 // -// Case 2: Convert integer N_fix back to normalized floating-point value. { .mfi - getf.sig sig_r = Norm_Arg // Get sig_r if 1/4 <= |x| < pi/4 - fcmp.lt.s1 p8,p9= Abs_Arg,PI_BY_4 // Test |x| < pi/4 - mov GR_exp_2tom2 = 0xffff - 2 // Form signexp of 2^-2 +(p0) ldfs TWO_TO_NEG2 = [table_ptr2],4 +// ARGUMENT REDUCTION CODE - CASE 1 and 2 +// Load 2**(-2). +// Load -2**(-2). +(p0) fmpy.s1 N = Arg,two_by_PI + nop.i 999 ;; } { .mfi - ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2] // Load 2^-2, -2^-2 - fms.s1 N = N_fix, FR_2tom64, FR_rshf // Use scaling to get N floated - mov N_fix_gr = r0 // Assume N=0, ok if |x| < pi/4 +(p0) ldfs NEGTWO_TO_NEG2 = [table_ptr2],12 +// +// N = Arg * 2/pi +// +(p0) fcmp.lt.unc.s1 p8,p9= Arg,PI_BY_4 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// if Arg < pi/4, set PR_8. +// +(p8) fcmp.gt.s1 p8,p9= Arg,MPI_BY_4 + nop.i 999 ;; } -;; - // // Case 1: Is |r| < 2**(-2). // Arg is the same as r in this case. // r = Arg // c = 0 // -// Case 2: Place integer part of N in GP register. { .mfi -(p9) getf.sig N_fix_gr = N_fix - fmerge.s c = f0, f0 // Assume c=0, ok if |x| < pi/4 - cmp.lt p10, p0 = GR_exp_x, GR_exp_2tom2 // Test if |x| < 1/4 +(p8) mov N_fix_gr = r0 +// +// if Arg > -pi/4, reset PR_8. +// Select the case when |Arg| < pi/4 - set PR[8] = true. +// Else Select the case when |Arg| >= pi/4 - set PR[9] = true. +// +(p0) fcvt.fx.s1 N_fix = N + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Grab the integer part of N . +// +(p8) mov r = Arg + nop.i 999 } -;; - { .mfi - setf.sig B_mask1 = bmask1 // Form mask to get 5 msb of r - nop.f 999 - mov exp_r = GR_exp_x // Get exp_r if 1/4 <= |x| < pi/4 + nop.m 999 +(p8) mov c = f0 + nop.i 999 ;; } -{ .mbb - setf.sig B_mask2 = bmask2 // Form mask to form B from r -(p10) br.cond.spnt TANL_SMALL_R // Branch if 0 < |x| < 1/4 -(p8) br.cond.spnt TANL_NORMAL_R // Branch if 1/4 <= |x| < pi/4 +{ .mfi + nop.m 999 +(p8) fcmp.lt.unc.s1 p10, p11 = Arg, TWO_TO_NEG2 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p10) fcmp.gt.s1 p10,p0 = Arg, NEGTWO_TO_NEG2 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Case 2: Place integer part of N in GP register. +// +(p9) fcvt.xf N = N_fix + nop.i 999 ;; +} +{ .mib +(p9) getf.sig N_fix_gr = N_fix + nop.i 999 +// +// Case 2: Convert integer N_fix back to normalized floating-point value. +// +(p10) br.cond.spnt L(TANL_SMALL_R) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p8) br.cond.sptk L(TANL_NORMAL_R) ;; } -;; - -// Here if pi/4 <= |x| < 2^24 // // Case 1: PR_3 is only affected when PR_1 is set. // +{ .mmi +(p9) ldfs TWO_TO_NEG33 = [table_ptr2], 4 ;; // -// Case 2: w = N * P_2 -// Case 2: s_val = -N * P_1 + Arg +// Case 2: Load 2**(-33). // - -{ .mfi - nop.m 999 - fnma.s1 s_val = N, P_1, Norm_Arg - nop.i 999 +(p9) ldfs NEGTWO_TO_NEG33 = [table_ptr2], 4 + nop.i 999 ;; } { .mfi - nop.m 999 - fmpy.s1 w = N, P_2 // w = N * P_2 for |s| >= 2^-33 - nop.i 999 + nop.m 999 +// +// Case 2: Load -2**(-33). +// +(p9) fnma.s1 s_val = N, P_1, Arg + nop.i 999 } -;; - -// Case 2_reduce: w = N * P_3 (change sign) { .mfi - nop.m 999 - fmpy.s1 w2 = N, P_3 // w = N * P_3 for |s| < 2^-33 - nop.i 999 + nop.m 999 +(p9) fmpy.s1 w = N, P_2 + nop.i 999 ;; } -;; - -// Case 1_reduce: r = s + w (change sign) { .mfi - nop.m 999 - fsub.s1 r = s_val, w // r = s_val - w for |s| >= 2^-33 - nop.i 999 + nop.m 999 +// +// Case 2: w = N * P_2 +// Case 2: s_val = -N * P_1 + Arg +// +(p0) fcmp.lt.unc.s1 p9,p8 = s_val, TWO_TO_NEG33 + nop.i 999 ;; } -;; - -// Case 2_reduce: U_1 = N * P_2 + w { .mfi - nop.m 999 - fma.s1 U_1 = N, P_2, w2 // U_1 = N * P_2 + w for |s| < 2^-33 - nop.i 999 -} -;; - + nop.m 999 // // Decide between case_1 and case_2 reduce: -// Case 1_reduce: |s| >= 2**(-33) -// Case 2_reduce: |s| < 2**(-33) // -{ .mfi - nop.m 999 - fcmp.lt.s1 p9, p8 = s_val, TWO_TO_NEG33 - nop.i 999 +(p9) fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 -(p9) fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33 - nop.i 999 + nop.m 999 +// +// Case 1_reduce: s <= -2**(-33) or s >= 2**(-33) +// Case 2_reduce: -2**(-33) < s < 2**(-33) +// +(p8) fsub.s1 r = s_val, w + nop.i 999 } -;; - -// Case 1_reduce: c = s - r { .mfi - nop.m 999 - fsub.s1 c = s_val, r // c = s_val - r for |s| >= 2^-33 - nop.i 999 + nop.m 999 +(p9) fmpy.s1 w = N, P_3 + nop.i 999 ;; } -;; - -// Case 2_reduce: r is complete here - continue to calculate c . -// r = s - U_1 { .mfi - nop.m 999 -(p9) fsub.s1 r = s_val, U_1 - nop.i 999 + nop.m 999 +(p9) fma.s1 U_1 = N, P_2, w + nop.i 999 } { .mfi - nop.m 999 -(p9) fms.s1 U_2 = N, P_2, U_1 - nop.i 999 -} -;; - + nop.m 999 // // Case 1_reduce: Is |r| < 2**(-2), if so set PR_10 -// else set PR_13. +// else set PR_11. // - -{ .mfi - nop.m 999 - fand B = B_mask1, r - nop.i 999 +(p8) fsub.s1 c = s_val, r + nop.i 999 ;; } { .mfi - nop.m 999 -(p8) fcmp.lt.unc.s1 p10, p13 = r, TWO_TO_NEG2 - nop.i 999 + nop.m 999 +// +// Case 1_reduce: r = s + w (change sign) +// Case 2_reduce: w = N * P_3 (change sign) +// +(p8) fcmp.lt.unc.s1 p10, p11 = r, TWO_TO_NEG2 + nop.i 999 ;; } -;; - { .mfi -(p8) getf.sig sig_r = r // Get signif of r if |s| >= 2^-33 - nop.f 999 - nop.i 999 + nop.m 999 +(p10) fcmp.gt.s1 p10, p11 = r, NEGTWO_TO_NEG2 + nop.i 999 ;; } -;; - { .mfi -(p8) getf.exp exp_r = r // Extract signexp of r if |s| >= 2^-33 -(p10) fcmp.gt.s1 p10, p13 = r, NEGTWO_TO_NEG2 - nop.i 999 + nop.m 999 +(p9) fsub.s1 r = s_val, U_1 + nop.i 999 } -;; - +{ .mfi + nop.m 999 +// // Case 1_reduce: c is complete here. -// Case 1: Branch to SMALL_R or NORMAL_R. // c = c + w (w has not been negated.) -{ .mfi - nop.m 999 -(p8) fsub.s1 c = c, w // c = c - w for |s| >= 2^-33 - nop.i 999 -} -{ .mbb - nop.m 999 -(p10) br.cond.spnt TANL_SMALL_R // Branch if pi/4 < |x| < 2^24 and |r|<1/4 -(p13) br.cond.sptk TANL_NORMAL_R_A // Branch if pi/4 < |x| < 2^24 and |r|>=1/4 +// Case 2_reduce: r is complete here - continue to calculate c . +// r = s - U_1 +// +(p9) fms.s1 U_2 = N, P_2, U_1 + nop.i 999 ;; } -;; - - -// Here if pi/4 < |x| < 2^24 and |s| < 2^-33 +{ .mfi + nop.m 999 // -// Is i_1 = lsb of N_fix_gr even or odd? -// if i_1 == 0, set p11, else set p12. +// Case 1_reduce: c = s - r +// Case 2_reduce: U_1 = N * P_2 + w // -{ .mfi - nop.m 999 - fsub.s1 s_val = s_val, r - add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl) +(p8) fsub.s1 c = c, w + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 +(p9) fsub.s1 s_val = s_val, r + nop.i 999 +} +{ .mfb + nop.m 999 // // Case 2_reduce: // U_2 = N * P_2 - U_1 // Not needed until later. // - fadd.s1 U_2 = U_2, w2 +(p9) fadd.s1 U_2 = U_2, w // // Case 2_reduce: // s = s - r // U_2 = U_2 + w // - nop.i 999 +(p10) br.cond.spnt L(TANL_SMALL_R) ;; } -;; - +{ .mib + nop.m 999 + nop.i 999 +(p11) br.cond.sptk L(TANL_NORMAL_R) ;; +} +{ .mii + nop.m 999 // // Case 2_reduce: // c = c - U_2 // c is complete here // Argument reduction ends here. // +(p9) extr.u i_1 = N_fix_gr, 0, 1 ;; +(p9) cmp.eq.unc p11, p12 = 0x0000,i_1 ;; +} { .mfi - nop.m 999 - fmpy.s1 rsq = r, r - tbit.z p11, p12 = N_fix_gr, 0 ;; // Set p11 if N even, p12 if odd + nop.m 999 +// +// Is i_1 even or odd? +// if i_1 == 0, set p11, else set p12. +// +(p11) fmpy.s1 rsq = r, r + nop.i 999 ;; } - { .mfi - nop.m 999 -(p12) frcpa.s1 S_hi,p0 = f1, r - nop.i 999 + nop.m 999 +(p12) frcpa.s1 S_hi,p0 = f1, r + nop.i 999 } + + + +// +// Case 1: Branch to SMALL_R or NORMAL_R. +// Case 1 is done now. +// + { .mfi +(p9) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp +(p9) fsub.s1 c = s_val, U_1 + nop.i 999 ;; +} +;; + +{ .mmi +(p9) ld8 table_ptr1 = [table_ptr1] nop.m 999 - fsub.s1 c = s_val, U_1 nop.i 999 } ;; + { .mmi - add table_ptr1 = 160, table_base ;; // Point to tanl_table_p1 - ldfe P1_1 = [table_ptr1],144 - nop.i 999 ;; +(p9) add table_ptr1 = 224, table_ptr1 ;; +(p9) ldfe P1_1 = [table_ptr1],144 + nop.i 999 ;; } // +// Get [i_1] - lsb of N_fix_gr . // Load P1_1 and point to Q1_1 . // { .mfi - ldfe Q1_1 = [table_ptr1] +(p9) ldfe Q1_1 = [table_ptr1] , 0 // // N even: rsq = r * Z // N odd: S_hi = frcpa(r) // -(p12) fmerge.ns S_hi = S_hi, S_hi - nop.i 999 +(p12) fmerge.ns S_hi = S_hi, S_hi + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // // Case 2_reduce: // c = s - U_1 // -(p9) fsub.s1 c = c, U_2 - nop.i 999 ;; +(p9) fsub.s1 c = c, U_2 + nop.i 999 ;; } { .mfi - nop.m 999 -(p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 ;; + nop.m 999 +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: Change sign of S_hi // -(p11) fmpy.s1 rsq = rsq, P1_1 - nop.i 999 ;; +(p11) fmpy.s1 rsq = rsq, P1_1 + nop.i 999 ;; } { .mfi - nop.m 999 -(p12) fma.s1 S_hi = S_hi, poly1, S_hi - nop.i 999 ;; + nop.m 999 +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: rsq = rsq * P1_1 // N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary // -(p11) fma.s1 Poly = r, rsq, c - nop.i 999 ;; +(p11) fma.s1 Result = r, rsq, c + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // -// N even: Poly = c + r * rsq +// N even: Result = c + r * rsq // N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary // -(p12) fma.s1 poly1 = S_hi, r, f1 -(p11) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // -// N even: Result = Poly + r +// N even: Result = Result + r // N odd: poly1 = 1.0 + S_hi * r 32 bits partial // -(p14) fadd.s0 Result = r, Poly // for tanl - nop.i 999 -} -{ .mfi - nop.m 999 -(p15) fms.s0 Result = r, mOne, Poly // for cotl - nop.i 999 +(p11) fadd.s0 Result = r, Result + nop.i 999 ;; } -;; - { .mfi - nop.m 999 -(p12) fma.s1 S_hi = S_hi, poly1, S_hi - nop.i 999 ;; + nop.m 999 +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: Result1 = Result + r // N odd: S_hi = S_hi * poly1 + S_hi 32 bits // -(p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 ;; +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: poly1 = S_hi * r + 1.0 64 bits partial // -(p12) fma.s1 S_hi = S_hi, poly1, S_hi - nop.i 999 ;; +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: poly1 = S_hi * poly + 1.0 64 bits // -(p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 ;; +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: poly1 = S_hi * r + 1.0 // -(p12) fma.s1 poly1 = S_hi, c, poly1 - nop.i 999 ;; +(p12) fma.s1 poly1 = S_hi, c, poly1 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: poly1 = S_hi * c + poly1 // -(p12) fmpy.s1 S_lo = S_hi, poly1 - nop.i 999 ;; +(p12) fmpy.s1 S_lo = S_hi, poly1 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: S_lo = S_hi * poly1 // -(p12) fma.s1 S_lo = Q1_1, r, S_lo -(p12) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl +(p12) fma.s1 S_lo = Q1_1, r, S_lo + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // // N odd: Result = S_hi + S_lo // - fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact - nop.i 999 ;; +(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact + nop.i 999 ;; } -{ .mfi - nop.m 999 +{ .mfb + nop.m 999 // // N odd: S_lo = S_lo + Q1_1 * r // -(p14) fadd.s0 Result = S_hi, S_lo // for tanl - nop.i 999 -} -{ .mfb - nop.m 999 -(p15) fms.s0 Result = S_hi, mOne, S_lo // for cotl - br.ret.sptk b0 ;; // Exit for pi/4 <= |x| < 2^24 and |s| < 2^-33 +(p12) fadd.s0 Result = S_hi, S_lo +(p0) br.ret.sptk b0 ;; } -TANL_LARGER_ARG: -// Here if 2^24 <= |x| < 2^63 +L(TANL_LARGER_ARG): + // // ARGUMENT REDUCTION CODE - CASE 3 and 4 // -{ .mmf - mov GR_exp_2tom14 = 0xffff - 14 // Form signexp of 2^-14 - mov GR_exp_m2tom14 = 0x2ffff - 14 // Form signexp of -2^-14 - fmpy.s1 N_0 = Norm_Arg, Inv_P_0 +{ .mfi +(p0) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp +(p0) fmpy.s1 N_0 = Arg, Inv_P_0 + nop.i 999 } ;; { .mmi - setf.exp TWO_TO_NEG14 = GR_exp_2tom14 // Form 2^-14 - setf.exp NEGTWO_TO_NEG14 = GR_exp_m2tom14// Form -2^-14 +(p0) ld8 table_ptr1 = [table_ptr1] + nop.m 999 nop.i 999 } ;; @@ -1809,605 +1622,661 @@ TANL_LARGER_ARG: // N_0 = Arg * Inv_P_0 // { .mmi - add table_ptr2 = 144, table_base ;; // Point to 2^-2 - ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2] - nop.i 999 +(p0) add table_ptr1 = 8, table_ptr1 ;; +// +// Point to 2*-14 +// +(p0) ldfs TWO_TO_NEG14 = [table_ptr1], 4 + nop.i 999 ;; } -;; - +// +// Load 2**(-14). +// +{ .mmi +(p0) ldfs NEGTWO_TO_NEG14 = [table_ptr1], 180 ;; // // N_0_fix = integer part of N_0 . +// Adjust table_ptr1 to beginning of table. // +(p0) ldfs TWO_TO_NEG2 = [table_ptr1], 4 + nop.i 999 ;; +} // // Make N_0 the integer part. // { .mfi - nop.m 999 - fcvt.fx.s1 N_0_fix = N_0 - nop.i 999 ;; +(p0) ldfs NEGTWO_TO_NEG2 = [table_ptr1] +// +// Load -2**(-14). +// +(p0) fcvt.fx.s1 N_0_fix = N_0 + nop.i 999 ;; } { .mfi - setf.sig B_mask1 = bmask1 // Form mask to get 5 msb of r - fcvt.xf N_0 = N_0_fix - nop.i 999 ;; + nop.m 999 +(p0) fcvt.xf N_0 = N_0_fix + nop.i 999 ;; } { .mfi - setf.sig B_mask2 = bmask2 // Form mask to form B from r - fnma.s1 ArgPrime = N_0, P_0, Norm_Arg - nop.i 999 + nop.m 999 +(p0) fnma.s1 ArgPrime = N_0, P_0, Arg + nop.i 999 } { .mfi - nop.m 999 - fmpy.s1 w = N_0, d_1 - nop.i 999 ;; + nop.m 999 +(p0) fmpy.s1 w = N_0, d_1 + nop.i 999 ;; } +{ .mfi + nop.m 999 // // ArgPrime = -N_0 * P_0 + Arg // w = N_0 * d_1 // +(p0) fmpy.s1 N = ArgPrime, two_by_PI + nop.i 999 ;; +} +{ .mfi + nop.m 999 // // N = ArgPrime * 2/pi // -// fcvt.fx.s1 N_fix = N -// Use special scaling to right shift so N=Arg * 2/pi is in rightmost bits -// Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24 +(p0) fcvt.fx.s1 N_fix = N + nop.i 999 ;; +} { .mfi - nop.m 999 - fma.s1 N_fix = ArgPrime, FR_inv_pi_2to63, FR_rshf_2to64 - - nop.i 999 ;; + nop.m 999 +// +// N_fix is the integer part. +// +(p0) fcvt.xf N = N_fix + nop.i 999 ;; } -// Convert integer N_fix back to normalized floating-point value. { .mfi - nop.m 999 - fms.s1 N = N_fix, FR_2tom64, FR_rshf // Use scaling to get N floated - nop.i 999 +(p0) getf.sig N_fix_gr = N_fix + nop.f 999 + nop.i 999 ;; } -;; - +{ .mfi + nop.m 999 // // N is the integer part of the reduced-reduced argument. // Put the integer in a GP register. // +(p0) fnma.s1 s_val = N, P_1, ArgPrime + nop.i 999 +} { .mfi - getf.sig N_fix_gr = N_fix - nop.f 999 - nop.i 999 + nop.m 999 +(p0) fnma.s1 w = N, P_2, w + nop.i 999 ;; } -;; - +{ .mfi + nop.m 999 // // s_val = -N*P_1 + ArgPrime // w = -N*P_2 + w // -{ .mfi - nop.m 999 - fnma.s1 s_val = N, P_1, ArgPrime - nop.i 999 +(p0) fcmp.lt.unc.s1 p11, p10 = s_val, TWO_TO_NEG14 + nop.i 999 ;; } { .mfi - nop.m 999 - fnma.s1 w = N, P_2, w - nop.i 999 + nop.m 999 +(p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14 + nop.i 999 ;; } -;; - -// Case 4: V_hi = N * P_2 -// Case 4: U_hi = N_0 * d_1 { .mfi - nop.m 999 - fmpy.s1 V_hi = N, P_2 // V_hi = N * P_2 for |s| < 2^-14 - nop.i 999 + nop.m 999 +// +// Case 3: r = s_val + w (Z complete) +// Case 4: U_hi = N_0 * d_1 +// +(p10) fmpy.s1 V_hi = N, P_2 + nop.i 999 } { .mfi - nop.m 999 - fmpy.s1 U_hi = N_0, d_1 // U_hi = N_0 * d_1 for |s| < 2^-14 - nop.i 999 + nop.m 999 +(p11) fmpy.s1 U_hi = N_0, d_1 + nop.i 999 ;; } -;; - -// Case 3: r = s_val + w (Z complete) -// Case 4: w = N * P_3 { .mfi - nop.m 999 - fadd.s1 r = s_val, w // r = s_val + w for |s| >= 2^-14 - nop.i 999 + nop.m 999 +// +// Case 3: r = s_val + w (Z complete) +// Case 4: U_hi = N_0 * d_1 +// +(p11) fmpy.s1 V_hi = N, P_2 + nop.i 999 } { .mfi - nop.m 999 - fmpy.s1 w2 = N, P_3 // w = N * P_3 for |s| < 2^-14 - nop.i 999 + nop.m 999 +(p11) fmpy.s1 U_hi = N_0, d_1 + nop.i 999 ;; } -;; - -// Case 4: A = U_hi + V_hi -// Note: Worry about switched sign of V_hi, so subtract instead of add. -// Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup) -// Note: the (-) is still missing for V_hi. { .mfi - nop.m 999 - fsub.s1 A = U_hi, V_hi // A = U_hi - V_hi for |s| < 2^-14 - nop.i 999 + nop.m 999 +// +// Decide between case 3 and 4: +// Case 3: s <= -2**(-14) or s >= 2**(-14) +// Case 4: -2**(-14) < s < 2**(-14) +// +(p10) fadd.s1 r = s_val, w + nop.i 999 } { .mfi - nop.m 999 - fnma.s1 V_lo = N, P_2, V_hi // V_lo = V_hi - N * P_2 for |s| < 2^-14 - nop.i 999 + nop.m 999 +(p11) fmpy.s1 w = N, P_3 + nop.i 999 ;; } -;; - -// Decide between case 3 and 4: -// Case 3: |s| >= 2**(-14) Set p10 -// Case 4: |s| < 2**(-14) Set p11 +{ .mfi + nop.m 999 // -// Case 4: U_lo = N_0 * d_1 - U_hi +// Case 4: We need abs of both U_hi and V_hi - dont +// worry about switched sign of V_hi . +// +(p11) fsub.s1 A = U_hi, V_hi + nop.i 999 +} { .mfi - nop.m 999 - fms.s1 U_lo = N_0, d_1, U_hi // U_lo = N_0*d_1 - U_hi for |s| < 2^-14 - nop.i 999 + nop.m 999 +// +// Case 4: A = U_hi + V_hi +// Note: Worry about switched sign of V_hi, so subtract instead of add. +// +(p11) fnma.s1 V_lo = N, P_2, V_hi + nop.i 999 ;; } { .mfi - nop.m 999 - fcmp.lt.s1 p11, p10 = s_val, TWO_TO_NEG14 - nop.i 999 + nop.m 999 +(p11) fms.s1 U_lo = N_0, d_1, U_hi + nop.i 999 ;; } -;; - -// Case 4: We need abs of both U_hi and V_hi - dont -// worry about switched sign of V_hi. { .mfi - nop.m 999 - fabs V_hiabs = V_hi // |V_hi| for |s| < 2^-14 - nop.i 999 + nop.m 999 +(p11) fabs V_hiabs = V_hi + nop.i 999 } { .mfi - nop.m 999 -(p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14 - nop.i 999 + nop.m 999 +// +// Case 4: V_hi = N * P_2 +// w = N * P_3 +// Note the product does not include the (-) as in the writeup +// so (-) missing for V_hi and w . +(p10) fadd.s1 r = s_val, w + nop.i 999 ;; } -;; - +{ .mfi + nop.m 999 +// // Case 3: c = s_val - r +// Case 4: U_lo = N_0 * d_1 - U_hi +// +(p11) fabs U_hiabs = U_hi + nop.i 999 +} { .mfi - nop.m 999 - fabs U_hiabs = U_hi // |U_hi| for |s| < 2^-14 - nop.i 999 + nop.m 999 +(p11) fmpy.s1 w = N, P_3 + nop.i 999 ;; } { .mfi - nop.m 999 - fsub.s1 c = s_val, r // c = s_val - r for |s| >= 2^-14 - nop.i 999 + nop.m 999 +// +// Case 4: Set P_12 if U_hiabs >= V_hiabs +// +(p11) fadd.s1 C_hi = s_val, A + nop.i 999 ;; } -;; - -// For Case 3, |s| >= 2^-14, determine if |r| < 1/4 +{ .mfi + nop.m 999 // // Case 4: C_hi = s_val + A // -{ .mfi - nop.m 999 -(p11) fadd.s1 C_hi = s_val, A // C_hi = s_val + A for |s| < 2^-14 - nop.i 999 +(p11) fadd.s1 t = U_lo, V_lo + nop.i 999 ;; } { .mfi - nop.m 999 -(p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2 - nop.i 999 + nop.m 999 +// +// Case 3: Is |r| < 2**(-2), if so set PR_7 +// else set PR_8. +// Case 3: If PR_7 is set, prepare to branch to Small_R. +// Case 3: If PR_8 is set, prepare to branch to Normal_R. +// +(p10) fsub.s1 c = s_val, r + nop.i 999 ;; } -;; - { .mfi - getf.sig sig_r = r // Get signif of r if |s| >= 2^-33 - fand B = B_mask1, r - nop.i 999 + nop.m 999 +// +// Case 3: c = (s - r) + w (c complete) +// +(p11) fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs + nop.i 999 } -;; - -// Case 4: t = U_lo + V_lo { .mfi - getf.exp exp_r = r // Extract signexp of r if |s| >= 2^-33 -(p11) fadd.s1 t = U_lo, V_lo // t = U_lo + V_lo for |s| < 2^-14 - nop.i 999 + nop.m 999 +(p11) fms.s1 w = N_0, d_2, w + nop.i 999 ;; } { .mfi - nop.m 999 -(p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2 - nop.i 999 + nop.m 999 +// +// Case 4: V_hi = N * P_2 +// w = N * P_3 +// Note the product does not include the (-) as in the writeup +// so (-) missing for V_hi and w . +// +(p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2 + nop.i 999 ;; } -;; - -// Case 3: c = (s - r) + w (c complete) { .mfi - nop.m 999 -(p10) fadd.s1 c = c, w // c = c + w for |s| >= 2^-14 - nop.i 999 -} -{ .mbb - nop.m 999 -(p14) br.cond.spnt TANL_SMALL_R // Branch if 2^24 <= |x| < 2^63 and |r|< 1/4 -(p15) br.cond.sptk TANL_NORMAL_R_A // Branch if 2^24 <= |x| < 2^63 and |r|>=1/4 + nop.m 999 +(p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2 + nop.i 999 ;; } -;; - - -// Here if 2^24 <= |x| < 2^63 and |s| < 2^-14 >>>>>>> Case 4. +{ .mfb + nop.m 999 // -// Case 4: Set P_12 if U_hiabs >= V_hiabs +// Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup) +// Note: the (-) is still missing for V_hi . // Case 4: w = w + N_0 * d_2 // Note: the (-) is now incorporated in w . -{ .mfi - add table_ptr1 = 160, table_base // Point to tanl_table_p1 - fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs - nop.i 999 +// +(p10) fadd.s1 c = c, w +// +// Case 4: t = U_lo + V_lo +// Note: remember V_lo should be (-), subtract instead of add. NO +// +(p14) br.cond.spnt L(TANL_SMALL_R) ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p15) br.cond.spnt L(TANL_NORMAL_R) ;; } { .mfi - nop.m 999 - fms.s1 w2 = N_0, d_2, w2 - nop.i 999 + nop.m 999 +// +// Case 3: Vector off when |r| < 2**(-2). Recall that PR_3 will be true. +// The remaining stuff is for Case 4. +// +(p12) fsub.s1 a = U_hi, A +(p11) extr.u i_1 = N_fix_gr, 0, 1 ;; } -;; - +{ .mfi + nop.m 999 +// // Case 4: C_lo = s_val - C_hi +// +(p11) fadd.s1 t = t, w + nop.i 999 +} { .mfi - ldfe P1_1 = [table_ptr1], 16 // Load P1_1 - fsub.s1 C_lo = s_val, C_hi - nop.i 999 + nop.m 999 +(p13) fadd.s1 a = V_hi, A + nop.i 999 ;; } -;; + + // // Case 4: a = U_hi - A // a = V_hi - A (do an add to account for missing (-) on V_hi // -{ .mfi - ldfe P1_2 = [table_ptr1], 128 // Load P1_2 -(p12) fsub.s1 a = U_hi, A - nop.i 999 -} -{ .mfi - nop.m 999 -(p13) fadd.s1 a = V_hi, A - nop.i 999 -} -;; -// Case 4: t = U_lo + V_lo + w { .mfi - ldfe Q1_1 = [table_ptr1], 16 // Load Q1_1 - fadd.s1 t = t, w2 - nop.i 999 +(p11) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp +(p11) fsub.s1 C_lo = s_val, C_hi + nop.i 999 } ;; + + +// // Case 4: a = (U_hi - A) + V_hi // a = (V_hi - A) + U_hi // In each case account for negative missing form V_hi . // -{ .mfi - ldfe Q1_2 = [table_ptr1], 16 // Load Q1_2 -(p12) fsub.s1 a = a, V_hi - nop.i 999 -} -{ .mfi + + +{ .mmi +(p11) ld8 table_ptr1 = [table_ptr1] nop.m 999 -(p13) fsub.s1 a = U_hi, a nop.i 999 } ;; + // // Case 4: C_lo = (s_val - C_hi) + A // +{ .mmi +(p11) add table_ptr1 = 224, table_ptr1 ;; +(p11) ldfe P1_1 = [table_ptr1], 16 + nop.i 999 ;; +} { .mfi - nop.m 999 - fadd.s1 C_lo = C_lo, A - nop.i 999 ;; +(p11) ldfe P1_2 = [table_ptr1], 128 +// +// Case 4: w = U_lo + V_lo + w +// +(p12) fsub.s1 a = a, V_hi + nop.i 999 ;; } // -// Case 4: t = t + a +// Case 4: r = C_hi + C_lo // { .mfi - nop.m 999 - fadd.s1 t = t, a - nop.i 999 +(p11) ldfe Q1_1 = [table_ptr1], 16 +(p11) fadd.s1 C_lo = C_lo, A + nop.i 999 ;; } -;; - -// Case 4: C_lo = C_lo + t -// Case 4: r = C_hi + C_lo +// +// Case 4: c = C_hi - r +// Get [i_1] - lsb of N_fix_gr. +// { .mfi - nop.m 999 - fadd.s1 C_lo = C_lo, t - nop.i 999 +(p11) ldfe Q1_2 = [table_ptr1], 16 + nop.f 999 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fadd.s1 r = C_hi, C_lo - nop.i 999 + nop.m 999 +(p13) fsub.s1 a = U_hi, a + nop.i 999 ;; } -;; - +{ .mfi + nop.m 999 +(p11) fadd.s1 t = t, a + nop.i 999 ;; +} +{ .mfi + nop.m 999 // -// Case 4: c = C_hi - r +// Case 4: t = t + a // +(p11) fadd.s1 C_lo = C_lo, t + nop.i 999 ;; +} { .mfi - nop.m 999 - fsub.s1 c = C_hi, r - nop.i 999 + nop.m 999 +// +// Case 4: C_lo = C_lo + t +// +(p11) fadd.s1 r = C_hi, C_lo + nop.i 999 ;; } { .mfi - nop.m 999 - fmpy.s1 rsq = r, r - add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl) + nop.m 999 +(p11) fsub.s1 c = C_hi, r + nop.i 999 } -;; - -// Case 4: c = c + C_lo finished. +{ .mfi + nop.m 999 // -// Is i_1 = lsb of N_fix_gr even or odd? -// if i_1 == 0, set PR_11, else set PR_12. +// Case 4: c = c + C_lo finished. +// Is i_1 even or odd? +// if i_1 == 0, set PR_4, else set PR_5. // +// r and c have been computed. +// We known whether this is the sine or cosine routine. +// Make sure ftz mode is set - should be automatic when using wre +(p0) fmpy.s1 rsq = r, r + nop.i 999 ;; +} { .mfi - nop.m 999 - fadd.s1 c = c , C_lo - tbit.z p11, p12 = N_fix_gr, 0 + nop.m 999 +(p11) fadd.s1 c = c , C_lo +(p11) cmp.eq.unc p11, p12 = 0x0000, i_1 ;; } -;; - -// r and c have been computed. { .mfi - nop.m 999 + nop.m 999 (p12) frcpa.s1 S_hi, p0 = f1, r - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // // N odd: Change sign of S_hi // -(p11) fma.s1 Poly = rsq, P1_2, P1_1 - nop.i 999 ;; +(p11) fma.s1 Result = rsq, P1_2, P1_1 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 P = rsq, Q1_2, Q1_1 - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // // N odd: Result = S_hi + S_lo (User supplied rounding mode for C1) // - fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact - nop.i 999 ;; +(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: rsq = r * r // N odd: S_hi = frcpa(r) // (p12) fmerge.ns S_hi = S_hi, S_hi - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // // N even: rsq = rsq * P1_2 + P1_1 // N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary // -(p11) fmpy.s1 Poly = rsq, Poly - nop.i 999 ;; +(p11) fmpy.s1 Result = rsq, Result + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 poly1 = S_hi, r,f1 -(p11) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // -// N even: Poly = Poly * rsq +// N even: Result = Result * rsq // N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary // -(p11) fma.s1 Poly = r, Poly, c - nop.i 999 ;; +(p11) fma.s1 Result = r, Result, c + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 S_hi = S_hi, poly1, S_hi - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // // N odd: S_hi = S_hi * poly1 + S_hi 32 bits // -(p14) fadd.s0 Result = r, Poly // for tanl - nop.i 999 ;; +(p11) fadd.s0 Result= r, Result + nop.i 999 ;; } - -.pred.rel "mutex",p15,p12 { .mfi - nop.m 999 -(p15) fms.s0 Result = r, mOne, Poly // for cotl - nop.i 999 -} -{ .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // -// N even: Poly = Poly * r + c +// N even: Result = Result * r + c // N odd: poly1 = 1.0 + S_hi * r 32 bits partial // (p12) fma.s1 S_hi = S_hi, poly1, S_hi - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // -// N even: Result = Poly + r (Rounding mode S0) +// N even: Result1 = Result + r (Rounding mode S0) // N odd: poly1 = S_hi * r + 1.0 64 bits partial // (p12) fma.s1 S_hi = S_hi, poly1, S_hi - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: poly1 = S_hi * poly + S_hi 64 bits // (p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: poly1 = S_hi * r + 1.0 // (p12) fma.s1 poly1 = S_hi, c, poly1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: poly1 = S_hi * c + poly1 // (p12) fmpy.s1 S_lo = S_hi, poly1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: S_lo = S_hi * poly1 // (p12) fma.s1 S_lo = P, r, S_lo -(p12) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl -} - -{ .mfi - nop.m 999 -(p14) fadd.s0 Result = S_hi, S_lo // for tanl - nop.i 999 + nop.i 999 ;; } { .mfb - nop.m 999 + nop.m 999 // // N odd: S_lo = S_lo + r * P // -(p15) fms.s0 Result = S_hi, mOne, S_lo // for cotl - br.ret.sptk b0 ;; // Exit for 2^24 <= |x| < 2^63 and |s| < 2^-14 +(p12) fadd.s0 Result = S_hi, S_lo +(p0) br.ret.sptk b0 ;; } -TANL_SMALL_R: -// Here if |r| < 1/4 -// r and c have been computed. -// ***************************************************************** -// ***************************************************************** -// ***************************************************************** -// N odd: S_hi = frcpa(r) -// Get [i_1] - lsb of N_fix_gr. Set p11 if N even, p12 if N odd. -// N even: rsq = r * r +L(TANL_SMALL_R): +{ .mii + nop.m 999 +(p0) extr.u i_1 = N_fix_gr, 0, 1 ;; +(p0) cmp.eq.unc p11, p12 = 0x0000, i_1 +} { .mfi - add table_ptr1 = 160, table_base // Point to tanl_table_p1 - frcpa.s1 S_hi, p0 = f1, r // S_hi for N odd - add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl) + nop.m 999 +(p0) fmpy.s1 rsq = r, r + nop.i 999 ;; } { .mfi - add table_ptr2 = 400, table_base // Point to Q1_7 - fmpy.s1 rsq = r, r - nop.i 999 +(p0) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp +(p12) frcpa.s1 S_hi, p0 = f1, r + nop.i 999 } ;; + { .mmi - ldfe P1_1 = [table_ptr1], 16 -;; - ldfe P1_2 = [table_ptr1], 16 - tbit.z p11, p12 = N_fix_gr, 0 +(p0) ld8 table_ptr1 = [table_ptr1] + nop.m 999 + nop.i 999 } ;; +// ***************************************************************** +// ***************************************************************** +// ***************************************************************** + +{ .mmi +(p0) add table_ptr1 = 224, table_ptr1 ;; +(p0) ldfe P1_1 = [table_ptr1], 16 + nop.i 999 ;; +} +// r and c have been computed. +// We known whether this is the sine or cosine routine. +// Make sure ftz mode is set - should be automatic when using wre +// |r| < 2**(-2) { .mfi - ldfe P1_3 = [table_ptr1], 96 - nop.f 999 - nop.i 999 +(p0) ldfe P1_2 = [table_ptr1], 16 +(p11) fmpy.s1 r_to_the_8 = rsq, rsq + nop.i 999 ;; } -;; - +// +// Set table_ptr1 to beginning of constant table. +// Get [i_1] - lsb of N_fix_gr. +// { .mfi -(p11) ldfe P1_9 = [table_ptr1], -16 +(p0) ldfe P1_3 = [table_ptr1], 96 +// +// N even: rsq = r * r +// N odd: S_hi = frcpa(r) +// (p12) fmerge.ns S_hi = S_hi, S_hi - nop.i 999 + nop.i 999 ;; } +// +// Is i_1 even or odd? +// if i_1 == 0, set PR_11. +// if i_1 != 0, set PR_12. +// { .mfi - nop.m 999 -(p11) fmpy.s1 r_to_the_8 = rsq, rsq - nop.i 999 -} -;; - +(p11) ldfe P1_9 = [table_ptr1], -16 // // N even: Poly2 = P1_7 + Poly2 * rsq // N odd: poly2 = Q1_5 + poly2 * rsq // -{ .mfi -(p11) ldfe P1_8 = [table_ptr1], -16 (p11) fadd.s1 CORR = rsq, f1 - nop.i 999 + nop.i 999 ;; } -;; - +{ .mmi +(p11) ldfe P1_8 = [table_ptr1], -16 ;; // // N even: Poly1 = P1_2 + P1_3 * rsq -// N odd: poly1 = 1.0 + S_hi * r +// N odd: poly1 = 1.0 + S_hi * r // 16 bits partial account for necessary (-1) // -{ .mmi (p11) ldfe P1_7 = [table_ptr1], -16 -;; -(p11) ldfe P1_6 = [table_ptr1], -16 - nop.i 999 + nop.i 999 ;; } -;; - // // N even: Poly1 = P1_1 + Poly1 * rsq // N odd: S_hi = S_hi + S_hi * poly1) 16 bits account for necessary // +{ .mfi +(p11) ldfe P1_6 = [table_ptr1], -16 // // N even: Poly2 = P1_5 + Poly2 * rsq // N odd: poly2 = Q1_3 + poly2 * rsq // -{ .mfi -(p11) ldfe P1_5 = [table_ptr1], -16 (p11) fmpy.s1 r_to_the_8 = r_to_the_8, r_to_the_8 - nop.i 999 + nop.i 999 ;; } -{ .mfi - nop.m 999 -(p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 -} -;; - // // N even: Poly1 = Poly1 * rsq // N odd: poly1 = 1.0 + S_hi * r 32 bits partial // +{ .mfi +(p11) ldfe P1_5 = [table_ptr1], -16 +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; +} // // N even: CORR = CORR * c @@ -2421,30 +2290,44 @@ TANL_SMALL_R: { .mmf (p11) ldfe P1_4 = [table_ptr1], -16 - nop.m 999 +(p0) addl table_ptr2 = @ltoff(TANL_BASE_CONSTANTS), gp (p11) fmpy.s1 CORR = CORR, c } ;; -{ .mfi + +{ .mmi +(p0) ld8 table_ptr2 = [table_ptr2] nop.m 999 + nop.i 999 +} +;; + + +{ .mii +(p0) add table_ptr2 = 464, table_ptr2 + nop.i 999 ;; + nop.i 999 +} +{ .mfi + nop.m 999 (p11) fma.s1 Poly1 = P1_3, rsq, P1_2 - nop.i 999 ;; + nop.i 999 ;; } { .mfi -(p12) ldfe Q1_7 = [table_ptr2], -16 +(p0) ldfe Q1_7 = [table_ptr2], -16 (p12) fma.s1 S_hi = S_hi, poly1, S_hi - nop.i 999 ;; + nop.i 999 ;; } { .mfi -(p12) ldfe Q1_6 = [table_ptr2], -16 +(p0) ldfe Q1_6 = [table_ptr2], -16 (p11) fma.s1 Poly2 = P1_9, rsq, P1_8 - nop.i 999 ;; + nop.i 999 ;; } { .mmi -(p12) ldfe Q1_5 = [table_ptr2], -16 ;; +(p0) ldfe Q1_5 = [table_ptr2], -16 ;; (p12) ldfe Q1_4 = [table_ptr2], -16 - nop.i 999 ;; + nop.i 999 ;; } { .mfi (p12) ldfe Q1_3 = [table_ptr2], -16 @@ -2453,796 +2336,735 @@ TANL_SMALL_R: // N odd: poly2 = Q1_6 + Q1_7 * rsq // (p11) fma.s1 Poly1 = Poly1, rsq, P1_1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi (p12) ldfe Q1_2 = [table_ptr2], -16 (p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi (p12) ldfe Q1_1 = [table_ptr2], -16 (p11) fma.s1 Poly2 = Poly2, rsq, P1_7 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: CORR = rsq + 1 // N even: r_to_the_8 = rsq * rsq // (p11) fmpy.s1 Poly1 = Poly1, rsq - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 S_hi = S_hi, poly1, S_hi - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 poly2 = Q1_7, rsq, Q1_6 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p11) fma.s1 Poly2 = Poly2, rsq, P1_6 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 poly2 = poly2, rsq, Q1_5 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p11) fma.s1 Poly2= Poly2, rsq, P1_5 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 S_hi = S_hi, poly1, S_hi - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 poly2 = poly2, rsq, Q1_4 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: r_to_the_8 = r_to_the_8 * r_to_the_8 // N odd: poly1 = S_hi * r + 1.0 64 bits partial // (p11) fma.s1 Poly2 = Poly2, rsq, P1_4 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // -// N even: Poly = CORR + Poly * r +// N even: Result = CORR + Poly * r // N odd: P = Q1_1 + poly2 * rsq // (p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 poly2 = poly2, rsq, Q1_3 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: Poly2 = P1_4 + Poly2 * rsq // N odd: poly2 = Q1_2 + poly2 * rsq // (p11) fma.s1 Poly = Poly2, r_to_the_8, Poly1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 poly1 = S_hi, c, poly1 - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 poly2 = poly2, rsq, Q1_2 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: Poly = Poly1 + Poly2 * r_to_the_8 // N odd: S_hi = S_hi * poly1 + S_hi 64 bits // -(p11) fma.s1 Poly = Poly, r, CORR - nop.i 999 ;; +(p11) fma.s1 Result = Poly, r, CORR + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // -// N even: Result = r + Poly (User supplied rounding mode) +// N even: Result = r + Result (User supplied rounding mode) // N odd: poly1 = S_hi * c + poly1 // (p12) fmpy.s1 S_lo = S_hi, poly1 -(p11) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 P = poly2, rsq, Q1_1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: poly1 = S_hi * r + 1.0 // // // N odd: S_lo = S_hi * poly1 // -(p14) fadd.s0 Result = Poly, r // for tanl - nop.i 999 +(p11) fadd.s0 Result = Result, r + nop.i 999 ;; } { .mfi - nop.m 999 -(p15) fms.s0 Result = Poly, mOne, r // for cotl - nop.i 999 ;; -} - -{ .mfi - nop.m 999 + nop.m 999 // // N odd: S_lo = Q1_1 * c + S_lo // (p12) fma.s1 S_lo = Q1_1, c, S_lo - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 - fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact - nop.i 999 ;; + nop.m 999 +(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: Result = S_lo + r * P // (p12) fma.s1 Result = P, r, S_lo -(p12) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl -} - -// -// N odd: Result = Result + S_hi (user supplied rounding mode) -// -{ .mfi - nop.m 999 -(p14) fadd.s0 Result = Result, S_hi // for tanl - nop.i 999 + nop.i 999 ;; } { .mfb - nop.m 999 -(p15) fms.s0 Result = Result, mOne, S_hi // for cotl - br.ret.sptk b0 ;; // Exit |r| < 1/4 path + nop.m 999 +// +// N odd: Result = Result + S_hi (user supplied rounding mode) +// +(p12) fadd.s0 Result = Result, S_hi +(p0) br.ret.sptk b0 ;; } -TANL_NORMAL_R: -// Here if 1/4 <= |x| < pi/4 or if |x| >= 2^63 and |r| >= 1/4 +L(TANL_NORMAL_R): +{ .mfi +(p0) getf.sig sig_r = r // ******************************************************************* // ******************************************************************* // ******************************************************************* // // r and c have been computed. +// Make sure ftz mode is set - should be automatic when using wre +// // +// Get [i_1] - lsb of N_fix_gr alone. +// +(p0) fmerge.s Pos_r = f1, r +(p0) extr.u i_1 = N_fix_gr, 0, 1 ;; +} { .mfi - nop.m 999 - fand B = B_mask1, r - nop.i 999 + nop.m 999 +(p0) fmerge.s sgn_r = r, f1 +(p0) cmp.eq.unc p11, p12 = 0x0000, i_1 ;; +} +{ .mfi + nop.m 999 + nop.f 999 +(p0) extr.u lookup = sig_r, 58, 5 +} +{ .mlx + nop.m 999 +(p0) movl Create_B = 0x8200000000000000 ;; +} +{ .mfi +(p0) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp + nop.f 999 +(p0) dep Create_B = lookup, Create_B, 58, 5 } ;; -TANL_NORMAL_R_A: -// Enter here if pi/4 <= |x| < 2^63 and |r| >= 1/4 -// Get the 5 bits or r for the lookup. 1.xxxxx .... + +// +// Get [i_1] - lsb of N_fix_gr alone. +// Pos_r = abs (r) +// + + { .mmi - add table_ptr1 = 416, table_base // Point to tanl_table_p2 - mov GR_exp_2tom65 = 0xffff - 65 // Scaling constant for B - extr.u lookup = sig_r, 58, 5 +(p0) ld8 table_ptr1 = [table_ptr1] + nop.m 999 + nop.i 999 } ;; + { .mmi - ldfe P2_1 = [table_ptr1], 16 - setf.exp TWO_TO_NEG65 = GR_exp_2tom65 // 2^-65 for scaling B if exp_r=-2 - add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl) + nop.m 999 +(p0) setf.sig B = Create_B +// +// Set table_ptr1 and table_ptr2 to base address of +// constant table. +// +(p0) add table_ptr1 = 480, table_ptr1 ;; } -;; - -.pred.rel "mutex",p11,p12 -// B = 2^63 * 1.xxxxx 100...0 -{ .mfi - ldfe P2_2 = [table_ptr1], 16 - for B = B_mask2, B - mov table_offset = 512 // Assume table offset is 512 +{ .mmb + nop.m 999 +// +// Is i_1 or i_0 == 0 ? +// Create the constant 1 00000 1000000000000000000000... +// +(p0) ldfe P2_1 = [table_ptr1], 16 + nop.b 999 } -;; - -{ .mfi - ldfe P2_3 = [table_ptr1], 16 - fmerge.s Pos_r = f1, r - tbit.nz p8,p9 = exp_r, 0 +{ .mmi + nop.m 999 ;; +(p0) getf.exp exp_r = Pos_r + nop.i 999 } -;; - +// +// Get r's exponent +// Get r's significand +// +{ .mmi +(p0) ldfe P2_2 = [table_ptr1], 16 ;; +// +// Get the 5 bits or r for the lookup. 1.xxxxx .... +// from sig_r. +// Grab lsb of exp of B +// +(p0) ldfe P2_3 = [table_ptr1], 16 + nop.i 999 ;; +} +{ .mii + nop.m 999 +(p0) andcm table_offset = 0x0001, exp_r ;; +(p0) shl table_offset = table_offset, 9 ;; +} +{ .mii + nop.m 999 +// +// Deposit 0 00000 1000000000000000000000... on +// 1 xxxxx yyyyyyyyyyyyyyyyyyyyyy..., +// getting rid of the ys. // Is B = 2** -2 or B= 2** -1? If 2**-1, then // we want an offset of 512 for table addressing. -{ .mii - add table_ptr2 = 1296, table_base // Point to tanl_table_cm2 -(p9) shladd table_offset = lookup, 4, table_offset -(p8) shladd table_offset = lookup, 4, r0 +// +(p0) shladd table_offset = lookup, 4, table_offset ;; +// +// B = ........ 1xxxxx 1000000000000000000... +// +(p0) add table_ptr1 = table_ptr1, table_offset ;; } -;; - -{ .mmi - add table_ptr1 = table_ptr1, table_offset // Point to T_hi - add table_ptr2 = table_ptr2, table_offset // Point to C_hi - add table_ptr3 = 2128, table_base // Point to tanl_table_scim2 +{ .mmb + nop.m 999 +// +// B = ........ 1xxxxx 1000000000000000000... +// Convert B so it has the same exponent as Pos_r +// +(p0) ldfd T_hi = [table_ptr1], 8 + nop.b 999 ;; } -;; -{ .mmi - ldfd T_hi = [table_ptr1], 8 // Load T_hi -;; - ldfd C_hi = [table_ptr2], 8 // Load C_hi - add table_ptr3 = table_ptr3, table_offset // Point to SC_inv -} -;; + // // x = |r| - B +// Load T_hi. +// Load C_hi. // -// Convert B so it has the same exponent as Pos_r before subtracting -{ .mfi - ldfs T_lo = [table_ptr1] // Load T_lo -(p9) fnma.s1 x = B, FR_2tom64, Pos_r - nop.i 999 -} -{ .mfi - nop.m 999 -(p8) fnma.s1 x = B, TWO_TO_NEG65, Pos_r - nop.i 999 + +{ .mmf +(p0) addl table_ptr2 = @ltoff(TANL_BASE_CONSTANTS), gp +(p0) ldfs T_lo = [table_ptr1] +(p0) fmerge.se B = Pos_r, B } ;; -{ .mfi - ldfs C_lo = [table_ptr2] // Load C_lo - nop.f 999 + +{ .mmi +(p0) ld8 table_ptr2 = [table_ptr2] + nop.m 999 nop.i 999 } ;; -{ .mfi - ldfe SC_inv = [table_ptr3] // Load SC_inv - fmerge.s sgn_r = r, f1 - tbit.z p11, p12 = N_fix_gr, 0 // p11 if N even, p12 if odd +{ .mii +(p0) add table_ptr2 = 1360, table_ptr2 + nop.i 999 ;; +(p0) add table_ptr2 = table_ptr2, table_offset ;; } -;; - +{ .mfi +(p0) ldfd C_hi = [table_ptr2], 8 +(p0) fsub.s1 x = Pos_r, B + nop.i 999 ;; +} +{ .mii +(p0) ldfs C_lo = [table_ptr2],255 + nop.i 999 ;; // // xsq = x * x // N even: Tx = T_hi * x +// Load T_lo. +// Load C_lo - increment pointer to get SC_inv +// - cant get all the way, do an add later. +// +(p0) add table_ptr2 = 569, table_ptr2 ;; +} // // N even: Tx1 = Tx + 1 // N odd: Cx1 = 1 - Cx // - { .mfi - nop.m 999 - fmpy.s1 xsq = x, x - nop.i 999 +(p0) ldfe SC_inv = [table_ptr2], 0 + nop.f 999 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 +(p0) fmpy.s1 xsq = x, x + nop.i 999 +} +{ .mfi + nop.m 999 (p11) fmpy.s1 Tx = T_hi, x - nop.i 999 + nop.i 999 ;; } -;; - -// -// N odd: Cx = C_hi * x -// { .mfi - nop.m 999 + nop.m 999 (p12) fmpy.s1 Cx = C_hi, x - nop.i 999 + nop.i 999 ;; } -;; +{ .mfi + nop.m 999 // -// N even and odd: P = P2_3 + P2_2 * xsq +// N odd: Cx = C_hi * x // -{ .mfi - nop.m 999 - fma.s1 P = P2_3, xsq, P2_2 - nop.i 999 +(p0) fma.s1 P = P2_3, xsq, P2_2 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 +// +// N even and odd: P = P2_3 + P2_2 * xsq +// (p11) fadd.s1 Tx1 = Tx, f1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: D = C_hi - tanx // N odd: D = T_hi + tanx // (p11) fmpy.s1 CORR = SC_inv, T_hi - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 - fmpy.s1 Sx = SC_inv, x - nop.i 999 ;; + nop.m 999 +(p0) fmpy.s1 Sx = SC_inv, x + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fmpy.s1 CORR = SC_inv, C_hi - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fsub.s1 V_hi = f1, Cx - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 - fma.s1 P = P, xsq, P2_1 - nop.i 999 + nop.m 999 +(p0) fma.s1 P = P, xsq, P2_1 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // // N even and odd: P = P2_1 + P * xsq // (p11) fma.s1 V_hi = Tx, Tx1, f1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: Result = sgn_r * tail + T_hi (user rounding mode for C1) // N odd: Result = sgn_r * tail + C_hi (user rounding mode for C1) // - fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact - nop.i 999 ;; +(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact + nop.i 999 ;; } { .mfi - nop.m 999 - fmpy.s1 CORR = CORR, c - nop.i 999 ;; + nop.m 999 +(p0) fmpy.s1 CORR = CORR, c + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fnma.s1 V_hi = Cx,V_hi,f1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: V_hi = Tx * Tx1 + 1 // N odd: Cx1 = 1 - Cx * Cx1 // - fmpy.s1 P = P, xsq - nop.i 999 +(p0) fmpy.s1 P = P, xsq + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // // N even and odd: P = P * xsq // (p11) fmpy.s1 V_hi = V_hi, T_hi - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even and odd: tail = P * tail + V_lo // (p11) fmpy.s1 T_hi = sgn_r, T_hi - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 - fmpy.s1 CORR = CORR, sgn_r - nop.i 999 ;; + nop.m 999 +(p0) fmpy.s1 CORR = CORR, sgn_r + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fmpy.s1 V_hi = V_hi,C_hi - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: V_hi = T_hi * V_hi // N odd: V_hi = C_hi * V_hi // - fma.s1 tanx = P, x, x - nop.i 999 +(p0) fma.s1 tanx = P, x, x + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fnmpy.s1 C_hi = sgn_r, C_hi - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: V_lo = 1 - V_hi + C_hi // N odd: V_lo = 1 - V_hi + T_hi // (p11) fadd.s1 CORR = CORR, T_lo - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fsub.s1 CORR = CORR, C_lo - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even and odd: tanx = x + x * P // N even and odd: Sx = SC_inv * x // (p11) fsub.s1 D = C_hi, tanx - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fadd.s1 D = T_hi, tanx - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: CORR = SC_inv * C_hi // N even: CORR = SC_inv * T_hi // - fnma.s1 D = V_hi, D, f1 - nop.i 999 ;; +(p0) fnma.s1 D = V_hi, D, f1 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even and odd: D = 1 - V_hi * D // N even and odd: CORR = CORR * c // - fma.s1 V_hi = V_hi, D, V_hi - nop.i 999 ;; +(p0) fma.s1 V_hi = V_hi, D, V_hi + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even and odd: V_hi = V_hi + V_hi * D // N even and odd: CORR = sgn_r * CORR // (p11) fnma.s1 V_lo = V_hi, C_hi, f1 - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fnma.s1 V_lo = V_hi, T_hi, f1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: CORR = COOR + T_lo // N odd: CORR = CORR - C_lo // (p11) fma.s1 V_lo = tanx, V_hi, V_lo - tbit.nz p15, p0 = cot_flag, 0 // p15=1 if we compute cotl + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fnma.s1 V_lo = tanx, V_hi, V_lo - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p15) fms.s1 T_hi = f0, f0, T_hi // to correct result's sign for cotl - nop.i 999 + nop.i 999 ;; } { .mfi - nop.m 999 -(p15) fms.s1 C_hi = f0, f0, C_hi // to correct result's sign for cotl - nop.i 999 -};; - -{ .mfi - nop.m 999 -(p15) fms.s1 sgn_r = f0, f0, sgn_r // to correct result's sign for cotl - nop.i 999 -};; - -{ .mfi - nop.m 999 + nop.m 999 // // N even: V_lo = V_lo + V_hi * tanx // N odd: V_lo = V_lo - V_hi * tanx // (p11) fnma.s1 V_lo = C_lo, V_hi, V_lo - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fnma.s1 V_lo = T_lo, V_hi, V_lo - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: V_lo = V_lo - V_hi * C_lo // N odd: V_lo = V_lo - V_hi * T_lo // - fmpy.s1 V_lo = V_hi, V_lo - nop.i 999 ;; +(p0) fmpy.s1 V_lo = V_hi, V_lo + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even and odd: V_lo = V_lo * V_hi // - fadd.s1 tail = V_hi, V_lo - nop.i 999 ;; +(p0) fadd.s1 tail = V_hi, V_lo + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even and odd: tail = V_hi + V_lo // - fma.s1 tail = tail, P, V_lo - nop.i 999 ;; +(p0) fma.s1 tail = tail, P, V_lo + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: T_hi = sgn_r * T_hi // N odd : C_hi = -sgn_r * C_hi // - fma.s1 tail = tail, Sx, CORR - nop.i 999 ;; +(p0) fma.s1 tail = tail, Sx, CORR + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even and odd: tail = Sx * tail + CORR // - fma.s1 tail = V_hi, Sx, tail - nop.i 999 ;; +(p0) fma.s1 tail = V_hi, Sx, tail + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even an odd: tail = Sx * V_hi + tail // (p11) fma.s0 Result = sgn_r, tail, T_hi - nop.i 999 + nop.i 999 } { .mfb - nop.m 999 + nop.m 999 (p12) fma.s0 Result = sgn_r, tail, C_hi - br.ret.sptk b0 ;; // Exit for 1/4 <= |r| < pi/4 +(p0) br.ret.sptk b0 ;; } -TANL_DENORMAL: -// Here if x denormal +L(TANL_SPECIAL): { .mfb - getf.exp GR_signexp_x = Norm_Arg // Get sign and exponent of x - nop.f 999 - br.cond.sptk TANL_COMMON // Return to common code + nop.m 999 +(p0) fmpy.s0 Arg = Arg, f0 +(p0) br.ret.sptk b0 ;; } -;; - - -TANL_SPECIAL: -TANL_UNSUPPORTED: // // Code for NaNs, Unsupporteds, Infs, or +/- zero ? // Invalid raised for Infs and SNaNs. // -{ .mfi - nop.m 999 - fmerge.s f10 = f8, f8 // Save input for error call - tbit.nz p6, p7 = cot_flag, 0 // p6=1 if we compute cotl -} -;; - -{ .mfi - nop.m 999 -(p6) fclass.m p6, p7 = f8, 0x7 // Test for zero (cotl only) - nop.i 999 -} -;; - -.pred.rel "mutex", p6, p7 -{ .mfi -(p6) mov GR_Parameter_Tag = 225 // (cotl) -(p6) frcpa.s0 f8, p0 = f1, f8 // cotl(+-0) = +-Inf - nop.i 999 -} -{ .mfb - nop.m 999 -(p7) fmpy.s0 f8 = f8, f0 -(p7) br.ret.sptk b0 -} -;; - -GLOBAL_IEEE754_END(tanl) +.endp tanl +ASM_SIZE_DIRECTIVE(tanl) +// ******************************************************************* +// ******************************************************************* +// ******************************************************************* +// +// Special Code to handle very large argument case. +// Call int pi_by_2_reduce(&x,&r,&c) +// for |arguments| >= 2**63 +// (Arg or x) is in f8 +// Address to save r and c as double +// ******************************************************************* +// ******************************************************************* +// ******************************************************************* -LOCAL_LIBM_ENTRY(__libm_error_region) +.proc __libm_callout +__libm_callout: +L(TANL_ARG_TOO_LARGE): .prologue - -// (1) { .mfi - add GR_Parameter_Y=-32,sp // Parameter 2 value - nop.f 0 + add r50=-32,sp // Parameter: r address + nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack - nop.f 0 - mov GR_SAVE_GP=gp // Save gp + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp };; - -// (2) { .mmi - stfe [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + stfe [r50] = f0,16 // Clear Parameter r on stack + add r49 = 16,sp // Parameter x address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; - .body -// (3) { .mib - stfe [GR_Parameter_X] = f10 // STORE Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address - nop.b 0 + stfe [r50] = f0,-16 // Clear Parameter c on stack + nop.i 0 + nop.b 0 } { .mib - stfe [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack - add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function -};; -{ .mmi - nop.m 0 - nop.m 0 - add GR_Parameter_RESULT = 48,sp -};; - -// (4) -{ .mmi - ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack -.restore sp - add sp = 64,sp // Restore stack pointer - mov b0 = GR_SAVE_B0 // Restore return address -};; -{ .mib - mov gp = GR_SAVE_GP // Restore gp - mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - br.ret.sptk b0 // Return + stfe [r49] = Arg // Store Parameter x on stack + nop.i 0 +(p0) br.call.sptk b0=__libm_pi_by_2_reduce# ;; };; - -LOCAL_LIBM_END(__libm_error_region) - -.type __libm_error_support#,@function -.global __libm_error_support# - - -// ******************************************************************* -// ******************************************************************* -// ******************************************************************* // -// Special Code to handle very large argument case. -// Call int __libm_pi_by_2_reduce(x,r,c) for |arguments| >= 2**63 -// The interface is custom: -// On input: -// (Arg or x) is in f8 -// On output: -// r is in f8 -// c is in f9 -// N is in r8 -// We know also that __libm_pi_by_2_reduce preserves f10-15, f71-127. We -// use this to eliminate save/restore of key fp registers in this calling -// function. +// Load 2^-2 // -// ******************************************************************* -// ******************************************************************* -// ******************************************************************* - -LOCAL_LIBM_ENTRY(__libm_callout) -TANL_ARG_TOO_LARGE: -.prologue -{ .mfi - add table_ptr2 = 144, table_base // Point to 2^-2 - nop.f 999 -.save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs -} -;; - -// Load 2^-2, -2^-2 { .mmi - ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2] - setf.sig B_mask1 = bmask1 // Form mask to get 5 msb of r -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 -};; - -.body +(p0) ldfe Arg =[r49],16 // -// Call argument reduction with x in f8 -// Returns with N in r8, r in f8, c in f9 -// Assumes f71-127 are preserved across the call +// Call argument reduction // -{ .mib - setf.sig B_mask2 = bmask2 // Form mask to form B from r - mov GR_SAVE_GP=gp // Save gp - br.call.sptk b0=__libm_pi_by_2_reduce# +(p0) ldfs TWO_TO_NEG2 = [table_ptr2],4 +// Get Arg off stack +// Get r off stack - hi order part +// Get c off stack - lo order part +(p0) mov N_fix_gr = r8 ;; } -;; - -// -// Is |r| < 2**(-2) -// -{ .mfi - getf.sig sig_r = r // Extract significand of r - fcmp.lt.s1 p6, p0 = r, TWO_TO_NEG2 - mov gp = GR_SAVE_GP // Restore gp +{ .mmb +(p0) ldfe r =[r50],16 +(p0) ldfs NEGTWO_TO_NEG2 = [table_ptr2],4 + nop.b 999 ;; } -;; - { .mfi - getf.exp exp_r = r // Extract signexp of r - nop.f 999 - mov b0 = GR_SAVE_B0 // Restore return address +(p0) ldfe c =[r50],-32 + nop.f 999 + nop.i 999 ;; } -;; - +{ .mfi +.restore sp + add sp = 64,sp // Restore stack pointer // -// Get N_fix_gr +// Is |r| < 2**(-2) // +(p0) fcmp.lt.unc.s1 p6, p0 = r, TWO_TO_NEG2 +mov b0 = GR_SAVE_B0 // Restore return address +};; { .mfi - mov N_fix_gr = r8 -(p6) fcmp.gt.unc.s1 p6, p0 = r, NEGTWO_TO_NEG2 - mov ar.pfs = GR_SAVE_PFS // Restore pfs -} -;; - + mov gp = GR_SAVE_GP // Restore gp +(p6) fcmp.gt.unc.s1 p6, p0 = r, NEGTWO_TO_NEG2 + mov ar.pfs = GR_SAVE_PFS // Restore gp +};; { .mbb - nop.m 999 -(p6) br.cond.spnt TANL_SMALL_R // Branch if |r| < 1/4 - br.cond.sptk TANL_NORMAL_R // Branch if 1/4 <= |r| < pi/4 + nop.m 999 +(p6) br.cond.spnt L(TANL_SMALL_R) +(p0) br.cond.sptk L(TANL_NORMAL_R) ;; } -;; -LOCAL_LIBM_END(__libm_callout) +.endp __libm_callout +ASM_SIZE_DIRECTIVE(__libm_callout) .type __libm_pi_by_2_reduce#,@function .global __libm_pi_by_2_reduce# -- cgit 1.4.1