diff options
Diffstat (limited to 'sysdeps/ia64/fpu/s_expm1.S')
-rw-r--r-- | sysdeps/ia64/fpu/s_expm1.S | 2150 |
1 files changed, 634 insertions, 1516 deletions
diff --git a/sysdeps/ia64/fpu/s_expm1.S b/sysdeps/ia64/fpu/s_expm1.S index 19a237990c..09a22bbbdd 100644 --- a/sysdeps/ia64/fpu/s_expm1.S +++ b/sysdeps/ia64/fpu/s_expm1.S @@ -1,10 +1,10 @@ .file "exp_m1.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2005, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,1694 +20,821 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// HISTORY -// 2/02/00 Initial Version -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. +// +// History +//============================================================== +// 02/02/00 Initial Version +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. +// 07/07/01 Improved speed of all paths +// 05/20/02 Cleaned up namespace and sf0 syntax +// 11/20/02 Improved speed, algorithm based on exp +// 03/31/05 Reformatted delimiters between data tables + +// API +//============================================================== +// double expm1(double) + +// Overview of operation +//============================================================== +// 1. Inputs of Nan, Inf, Zero, NatVal handled with special paths +// +// 2. |x| < 2^-60 +// Result = x, computed by x + x*x to handle appropriate flags and rounding +// +// 3. 2^-60 <= |x| < 2^-2 +// Result determined by 13th order Taylor series polynomial +// expm1f(x) = x + Q2*x^2 + ... + Q13*x^13 +// +// 4. x < -48.0 +// Here we know result is essentially -1 + eps, where eps only affects +// rounded result. Set I. +// +// 5. x >= 709.7827 +// Result overflows. Set I, O, and call error support // -// ********************************************************************* -// -// Function: Combined exp(x) and expm1(x), where -// x -// exp(x) = e , for double precision x values -// x -// expm1(x) = e - 1 for double precision x values -// -// ********************************************************************* -// -// Accuracy: Within .7 ulps for 80-bit floating point values -// Very accurate for double precision values -// -// ********************************************************************* -// -// Resources Used: -// -// Floating-Point Registers: f8 (Input and Return Value) -// f9,f32-f61, f99-f102 -// -// General Purpose Registers: -// r32-r61 -// r62-r65 (Used to pass arguments to error handling routine) -// -// Predicate Registers: p6-p15 -// -// ********************************************************************* -// -// IEEE Special Conditions: -// -// Denormal fault raised on denormal inputs -// Overflow exceptions raised when appropriate for exp and expm1 -// Underflow exceptions raised when appropriate for exp and expm1 -// (Error Handling Routine called for overflow and Underflow) -// Inexact raised when appropriate by algorithm -// -// exp(inf) = inf -// exp(-inf) = +0 -// exp(SNaN) = QNaN -// exp(QNaN) = QNaN -// exp(0) = 1 -// exp(EM_special Values) = QNaN -// exp(inf) = inf -// expm1(-inf) = -1 -// expm1(SNaN) = QNaN -// expm1(QNaN) = QNaN -// expm1(0) = 0 -// expm1(EM_special Values) = QNaN -// -// ********************************************************************* -// -// Implementation and Algorithm Notes: -// -// ker_exp_64( in_FR : X, -// in_GR : Flag, -// in_GR : Expo_Range -// out_FR : Y_hi, -// out_FR : Y_lo, -// out_FR : scale, -// out_PR : Safe ) -// -// On input, X is in register format and -// Flag = 0 for exp, -// Flag = 1 for expm1, -// -// On output, provided X and X_cor are real numbers, then -// -// scale*(Y_hi + Y_lo) approximates exp(X) if Flag is 0 -// scale*(Y_hi + Y_lo) approximates exp(X)-1 if Flag is 1 -// -// The accuracy is sufficient for a highly accurate 64 sig. -// bit implementation. Safe is set if there is no danger of -// overflow/underflow when the result is composed from scale, -// Y_hi and Y_lo. Thus, we can have a fast return if Safe is set. -// Otherwise, one must prepare to handle the possible exception -// appropriately. Note that SAFE not set (false) does not mean -// that overflow/underflow will occur; only the setting of SAFE -// guarantees the opposite. -// -// **** High Level Overview **** -// -// The method consists of three cases. -// -// If |X| < Tiny use case exp_tiny; -// else if |X| < 2^(-6) use case exp_small; -// else use case exp_regular; -// -// Case exp_tiny: -// -// 1 + X can be used to approximate exp(X) or exp(X+X_cor); -// X + X^2/2 can be used to approximate exp(X) - 1 -// -// Case exp_small: -// -// Here, exp(X), exp(X+X_cor), and exp(X) - 1 can all be -// appproximated by a relatively simple polynomial. -// -// This polynomial resembles the truncated Taylor series -// -// exp(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n! -// -// Case exp_regular: -// -// Here we use a table lookup method. The basic idea is that in -// order to compute exp(X), we accurately decompose X into -// -// X = N * log(2)/(2^12) + r, |r| <= log(2)/2^13. -// -// Hence -// -// exp(X) = 2^( N / 2^12 ) * exp(r). -// -// The value 2^( N / 2^12 ) is obtained by simple combinations -// of values calculated beforehand and stored in table; exp(r) -// is approximated by a short polynomial because |r| is small. -// -// We elaborate this method in 4 steps. -// -// Step 1: Reduction -// -// The value 2^12/log(2) is stored as a double-extended number -// L_Inv. -// -// N := round_to_nearest_integer( X * L_Inv ) -// -// The value log(2)/2^12 is stored as two numbers L_hi and L_lo so -// that r can be computed accurately via -// -// r := (X - N*L_hi) - N*L_lo -// -// We pick L_hi such that N*L_hi is representable in 64 sig. bits -// and thus the FMA X - N*L_hi is error free. So r is the -// 1 rounding error from an exact reduction with respect to -// -// L_hi + L_lo. -// -// In particular, L_hi has 30 significant bit and can be stored -// as a double-precision number; L_lo has 64 significant bits and -// stored as a double-extended number. -// -// In the case Flag = 2, we further modify r by -// -// r := r + X_cor. -// -// Step 2: Approximation -// -// exp(r) - 1 is approximated by a short polynomial of the form -// -// r + A_1 r^2 + A_2 r^3 + A_3 r^4 . -// -// Step 3: Composition from Table Values -// -// The value 2^( N / 2^12 ) can be composed from a couple of tables -// of precalculated values. First, express N as three integers -// K, M_1, and M_2 as -// -// N = K * 2^12 + M_1 * 2^6 + M_2 -// -// Where 0 <= M_1, M_2 < 2^6; and K can be positive or negative. -// When N is represented in 2's complement, M_2 is simply the 6 -// lsb's, M_1 is the next 6, and K is simply N shifted right -// arithmetically (sign extended) by 12 bits. -// -// Now, 2^( N / 2^12 ) is simply -// -// 2^K * 2^( M_1 / 2^6 ) * 2^( M_2 / 2^12 ) -// -// Clearly, 2^K needs no tabulation. The other two values are less -// trivial because if we store each accurately to more than working -// precision, than its product is too expensive to calculate. We -// use the following method. -// -// Define two mathematical values, delta_1 and delta_2, implicitly -// such that -// -// T_1 = exp( [M_1 log(2)/2^6] - delta_1 ) -// T_2 = exp( [M_2 log(2)/2^12] - delta_2 ) -// -// are representable as 24 significant bits. To illustrate the idea, -// we show how we define delta_1: -// -// T_1 := round_to_24_bits( exp( M_1 log(2)/2^6 ) ) -// delta_1 = (M_1 log(2)/2^6) - log( T_1 ) -// -// The last equality means mathematical equality. We then tabulate -// -// W_1 := exp(delta_1) - 1 -// W_2 := exp(delta_2) - 1 -// -// Both in double precision. -// -// From the tabulated values T_1, T_2, W_1, W_2, we compose the values -// T and W via -// -// T := T_1 * T_2 ...exactly -// W := W_1 + (1 + W_1)*W_2 -// -// W approximates exp( delta ) - 1 where delta = delta_1 + delta_2. -// The mathematical product of T and (W+1) is an accurate representation -// of 2^(M_1/2^6) * 2^(M_2/2^12). -// -// Step 4. Reconstruction -// -// Finally, we can reconstruct exp(X), exp(X) - 1. -// Because -// -// X = K * log(2) + (M_1*log(2)/2^6 - delta_1) -// + (M_2*log(2)/2^12 - delta_2) -// + delta_1 + delta_2 + r ...accurately -// We have -// -// exp(X) ~=~ 2^K * ( T + T*[exp(delta_1+delta_2+r) - 1] ) -// ~=~ 2^K * ( T + T*[exp(delta + r) - 1] ) -// ~=~ 2^K * ( T + T*[(exp(delta)-1) -// + exp(delta)*(exp(r)-1)] ) -// ~=~ 2^K * ( T + T*( W + (1+W)*poly(r) ) ) -// ~=~ 2^K * ( Y_hi + Y_lo ) -// -// where Y_hi = T and Y_lo = T*(W + (1+W)*poly(r)) -// -// For exp(X)-1, we have -// -// exp(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1 -// ~=~ 2^K * ( Y_hi + Y_lo - 2^(-K) ) -// -// and we combine Y_hi + Y_lo - 2^(-N) into the form of two -// numbers Y_hi + Y_lo carefully. -// -// **** Algorithm Details **** -// -// A careful algorithm must be used to realize the mathematical ideas -// accurately. We describe each of the three cases. We assume SAFE -// is preset to be TRUE. -// -// Case exp_tiny: -// -// The important points are to ensure an accurate result under -// different rounding directions and a correct setting of the SAFE -// flag. -// -// If Flag is 1, then -// SAFE := False ...possibility of underflow -// Scale := 1.0 -// Y_hi := X -// Y_lo := 2^(-17000) -// Else -// Scale := 1.0 -// Y_hi := 1.0 -// Y_lo := X ...for different rounding modes -// Endif -// -// Case exp_small: -// -// Here we compute a simple polynomial. To exploit parallelism, we split -// the polynomial into several portions. -// -// Let r = X -// -// If Flag is not 1 ...i.e. exp( argument ) -// -// rsq := r * r; -// r4 := rsq*rsq -// poly_lo := P_3 + r*(P_4 + r*(P_5 + r*P_6)) -// poly_hi := r + rsq*(P_1 + r*P_2) -// Y_lo := poly_hi + r4 * poly_lo -// set lsb(Y_lo) to 1 -// Y_hi := 1.0 -// Scale := 1.0 -// -// Else ...i.e. exp( argument ) - 1 -// -// rsq := r * r -// r4 := rsq * rsq -// r6 := rsq * r4 -// poly_lo := r6*(Q_5 + r*(Q_6 + r*Q_7)) -// poly_hi := Q_1 + r*(Q_2 + r*(Q_3 + r*Q_4)) -// Y_lo := rsq*poly_hi + poly_lo -// set lsb(Y_lo) to 1 -// Y_hi := X -// Scale := 1.0 -// -// Endif -// -// Case exp_regular: -// -// The previous description contain enough information except the -// computation of poly and the final Y_hi and Y_lo in the case for -// exp(X)-1. -// -// The computation of poly for Step 2: -// -// rsq := r*r -// poly := r + rsq*(A_1 + r*(A_2 + r*A_3)) -// -// For the case exp(X) - 1, we need to incorporate 2^(-K) into -// Y_hi and Y_lo at the end of Step 4. -// -// If K > 10 then -// Y_lo := Y_lo - 2^(-K) -// Else -// If K < -10 then -// Y_lo := Y_hi + Y_lo -// Y_hi := -2^(-K) -// Else -// Y_hi := Y_hi - 2^(-K) -// End If -// End If -// - -#include "libm_support.h" - -GR_SAVE_PFS = r59 -GR_SAVE_B0 = r60 -GR_SAVE_GP = r61 - -GR_Parameter_X = r62 -GR_Parameter_Y = r63 -GR_Parameter_RESULT = r64 - -FR_X = f9 -FR_Y = f1 -FR_RESULT = f99 - -#ifdef _LIBC -.rodata -#else -.data -#endif - -.align 64 -Constants_exp_64_Arg: -ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object) -data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000 -data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000 -data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000 -// /* Inv_L, L_hi, L_lo */ -ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg) - -.align 64 -Constants_exp_64_Exponents: -ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object) -data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF -data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF -data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF -data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF -data4 0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF -data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF -ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents) - -.align 64 -Constants_exp_64_A: -ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object) -data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000 -data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000 -data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000 -// /* Reversed */ -ASM_SIZE_DIRECTIVE(Constants_exp_64_A) - -.align 64 -Constants_exp_64_P: -ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object) -data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000 -data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000 -data4 0x7474C518,0x88888888,0x00003FF8,0x00000000 -data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000 -data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000 -data4 0x000004C7,0x80000000,0x00003FFE,0x00000000 -// /* Reversed */ -ASM_SIZE_DIRECTIVE(Constants_exp_64_P) - -.align 64 -Constants_exp_64_Q: -ASM_TYPE_DIRECTIVE(Constants_exp_64_Q,@object) -data4 0xA49EF6CA,0xD00D56F7,0x00003FEF,0x00000000 -data4 0x1C63493D,0xD00D59AB,0x00003FF2,0x00000000 -data4 0xFB50CDD2,0xB60B60B5,0x00003FF5,0x00000000 -data4 0x7BA68DC8,0x88888888,0x00003FF8,0x00000000 -data4 0xAAAAAC8D,0xAAAAAAAA,0x00003FFA,0x00000000 -data4 0xAAAAACCA,0xAAAAAAAA,0x00003FFC,0x00000000 -data4 0x00000000,0x80000000,0x00003FFE,0x00000000 -// /* Reversed */ -ASM_SIZE_DIRECTIVE(Constants_exp_64_Q) - -.align 64 -Constants_exp_64_T1: -ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object) -data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29 -data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5 -data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC -data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D -data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA -data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516 -data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A -data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4 -data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B -data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD -data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15 -data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B -data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5 -data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A -data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177 -data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C -ASM_SIZE_DIRECTIVE(Constants_exp_64_T1) - -.align 64 -Constants_exp_64_T2: -ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object) -data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4 -data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7 -data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E -data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349 -data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987 -data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA -data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610 -data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A -data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8 -data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA -data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50 -data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA -data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07 -data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269 -data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE -data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37 -ASM_SIZE_DIRECTIVE(Constants_exp_64_T2) - -.align 64 -Constants_exp_64_W1: -ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object) -data4 0x00000000,0x00000000,0x171EC4B4,0xBE384454 -data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6 -data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA -data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50 -data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2 -data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE -data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B -data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04 -data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419 -data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376 -data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A -data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB -data4 0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E -data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA -data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08 -data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B -data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75 -data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79 -data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7 -data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087 -data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB -data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643 -data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C -data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D -data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873 -data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F -data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861 -data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0 -data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC -data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB -data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB -data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148 -ASM_SIZE_DIRECTIVE(Constants_exp_64_W1) - -.align 64 -Constants_exp_64_W2: -ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object) -data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25 -data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8 -data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A -data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E -data4 0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9 -data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2 -data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0 -data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509 -data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33 -data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D -data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87 -data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3 -data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9 -data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F -data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82 -data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4 -data4 0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D -data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030 -data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29 -data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED -data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B -data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893 -data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35 -data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C -data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313 -data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE -data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426 -data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550 -data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4 -data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31 -data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE -data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9 -ASM_SIZE_DIRECTIVE(Constants_exp_64_W2) - -.section .text -.proc expm1# -.global expm1# -.align 64 +// 6. 2^-2 <= x < 709.7827 or -48.0 <= x < -2^-2 +// This is the main path. The algorithm is described below: -expm1: -#ifdef _LIBC -.global __expm1# -__expm1: -#endif - - -{ .mii - alloc r32 = ar.pfs,0,30,4,0 -(p0) add r33 = 1, r0 -(p0) cmp.eq.unc p7, p0 = r0, r0 -} -;; - - -// -// Set p7 true for expm1 -// Set Flag = r33 = 1 for expm1 -// These are really no longer necesary, but are a remnant -// when this file had multiple entry points. -// They should be carefully removed +// Take the input x. w is "how many log2/128 in x?" +// w = x * 128/log2 +// n = int(w) +// x = n log2/128 + r + delta + +// n = 128M + index_1 + 2^4 index_2 +// x = M log2 + (log2/128) index_1 + (log2/8) index_2 + r + delta + +// exp(x) = 2^M 2^(index_1/128) 2^(index_2/8) exp(r) exp(delta) +// Construct 2^M +// Get 2^(index_1/128) from table_1; +// Get 2^(index_2/8) from table_2; +// Calculate exp(r) by series by 5th order polynomial +// r = x - n (log2/128)_high +// delta = - n (log2/128)_low +// Calculate exp(delta) as 1 + delta + + +// Special values +//============================================================== +// expm1(+0) = +0.0 +// expm1(-0) = -0.0 + +// expm1(+qnan) = +qnan +// expm1(-qnan) = -qnan +// expm1(+snan) = +qnan +// expm1(-snan) = -qnan + +// expm1(-inf) = -1.0 +// expm1(+inf) = +inf + +// Overflow and Underflow +//======================= +// expm1(x) = largest double normal when +// x = 709.7827 = 40862e42fefa39ef +// +// Underflow is handled as described in case 2 above. + + +// Registers used +//============================================================== +// Floating Point registers used: +// f8, input +// f9 -> f15, f32 -> f75 + +// General registers used: +// r14 -> r40 + +// Predicate registers used: +// p6 -> p15 + +// Assembly macros +//============================================================== + +rRshf = r14 +rAD_TB1 = r15 +rAD_T1 = r15 +rAD_TB2 = r16 +rAD_T2 = r16 +rAD_Ln2_lo = r17 +rAD_P = r17 + +rN = r18 +rIndex_1 = r19 +rIndex_2_16 = r20 + +rM = r21 +rBiased_M = r21 +rIndex_1_16 = r22 +rSignexp_x = r23 +rExp_x = r24 +rSig_inv_ln2 = r25 + +rAD_Q1 = r26 +rAD_Q2 = r27 +rTmp = r27 +rExp_bias = r28 +rExp_mask = r29 +rRshf_2to56 = r30 + +rGt_ln = r31 +rExp_2tom56 = r31 + + +GR_SAVE_B0 = r33 +GR_SAVE_PFS = r34 +GR_SAVE_GP = r35 +GR_SAVE_SP = r36 + +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 + + +FR_X = f10 +FR_Y = f1 +FR_RESULT = f8 + +fRSHF_2TO56 = f6 +fINV_LN2_2TO63 = f7 +fW_2TO56_RSH = f9 +f2TOM56 = f11 +fP5 = f12 +fP54 = f50 +fP5432 = f50 +fP4 = f13 +fP3 = f14 +fP32 = f14 +fP2 = f15 + +fLn2_by_128_hi = f33 +fLn2_by_128_lo = f34 + +fRSHF = f35 +fNfloat = f36 +fW = f37 +fR = f38 +fF = f39 + +fRsq = f40 +fRcube = f41 + +f2M = f42 +fS1 = f43 +fT1 = f44 + +fMIN_DBL_OFLOW_ARG = f45 +fMAX_DBL_MINUS_1_ARG = f46 +fMAX_DBL_NORM_ARG = f47 +fP_lo = f51 +fP_hi = f52 +fP = f53 +fS = f54 + +fNormX = f56 + +fWre_urm_f8 = f57 + +fGt_pln = f58 +fTmp = f58 + +fS2 = f59 +fT2 = f60 +fSm1 = f61 + +fXsq = f62 +fX6 = f63 +fX4 = f63 +fQ7 = f64 +fQ76 = f64 +fQ7654 = f64 +fQ765432 = f64 +fQ6 = f65 +fQ5 = f66 +fQ54 = f66 +fQ4 = f67 +fQ3 = f68 +fQ32 = f68 +fQ2 = f69 +fQD = f70 +fQDC = f70 +fQDCBA = f70 +fQDCBA98 = f70 +fQDCBA98765432 = f70 +fQC = f71 +fQB = f72 +fQBA = f72 +fQA = f73 +fQ9 = f74 +fQ98 = f74 +fQ8 = f75 + +// Data tables +//============================================================== + +RODATA +.align 16 + +// ************* DO NOT CHANGE ORDER OF THESE TABLES ******************** + +// double-extended 1/ln(2) +// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88 +// 3fff b8aa 3b29 5c17 f0bc +// For speed the significand will be loaded directly with a movl and setf.sig +// and the exponent will be bias+63 instead of bias+0. Thus subsequent +// computations need to scale appropriately. +// The constant 128/ln(2) is needed for the computation of w. This is also +// obtained by scaling the computations. +// +// Two shifting constants are loaded directly with movl and setf.d. +// 1. fRSHF_2TO56 = 1.1000..00 * 2^(63-7) +// This constant is added to x*1/ln2 to shift the integer part of +// x*128/ln2 into the rightmost bits of the significand. +// The result of this fma is fW_2TO56_RSH. +// 2. fRSHF = 1.1000..00 * 2^(63) +// This constant is subtracted from fW_2TO56_RSH * 2^(-56) to give +// the integer part of w, n, as a floating-point number. +// The result of this fms is fNfloat. + + +LOCAL_OBJECT_START(exp_Table_1) +data8 0x40862e42fefa39f0 // smallest dbl overflow arg +data8 0xc048000000000000 // approx largest arg for minus one result +data8 0x40862e42fefa39ef // largest dbl arg to give normal dbl result +data8 0x0 // pad +data8 0xb17217f7d1cf79ab , 0x00003ff7 // ln2/128 hi +data8 0xc9e3b39803f2f6af , 0x00003fb7 // ln2/128 lo +// +// Table 1 is 2^(index_1/128) where +// index_1 goes from 0 to 15 +// +data8 0x8000000000000000 , 0x00003FFF +data8 0x80B1ED4FD999AB6C , 0x00003FFF +data8 0x8164D1F3BC030773 , 0x00003FFF +data8 0x8218AF4373FC25EC , 0x00003FFF +data8 0x82CD8698AC2BA1D7 , 0x00003FFF +data8 0x8383594EEFB6EE37 , 0x00003FFF +data8 0x843A28C3ACDE4046 , 0x00003FFF +data8 0x84F1F656379C1A29 , 0x00003FFF +data8 0x85AAC367CC487B15 , 0x00003FFF +data8 0x8664915B923FBA04 , 0x00003FFF +data8 0x871F61969E8D1010 , 0x00003FFF +data8 0x87DB357FF698D792 , 0x00003FFF +data8 0x88980E8092DA8527 , 0x00003FFF +data8 0x8955EE03618E5FDD , 0x00003FFF +data8 0x8A14D575496EFD9A , 0x00003FFF +data8 0x8AD4C6452C728924 , 0x00003FFF +LOCAL_OBJECT_END(exp_Table_1) + +// Table 2 is 2^(index_1/8) where +// index_2 goes from 0 to 7 +LOCAL_OBJECT_START(exp_Table_2) +data8 0x8000000000000000 , 0x00003FFF +data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF +data8 0x9837F0518DB8A96F , 0x00003FFF +data8 0xA5FED6A9B15138EA , 0x00003FFF +data8 0xB504F333F9DE6484 , 0x00003FFF +data8 0xC5672A115506DADD , 0x00003FFF +data8 0xD744FCCAD69D6AF4 , 0x00003FFF +data8 0xEAC0C6E7DD24392F , 0x00003FFF +LOCAL_OBJECT_END(exp_Table_2) + + +LOCAL_OBJECT_START(exp_p_table) +data8 0x3f8111116da21757 //P5 +data8 0x3fa55555d787761c //P4 +data8 0x3fc5555555555414 //P3 +data8 0x3fdffffffffffd6a //P2 +LOCAL_OBJECT_END(exp_p_table) + +LOCAL_OBJECT_START(exp_Q1_table) +data8 0x3de6124613a86d09 // QD = 1/13! +data8 0x3e21eed8eff8d898 // QC = 1/12! +data8 0x3ec71de3a556c734 // Q9 = 1/9! +data8 0x3efa01a01a01a01a // Q8 = 1/8! +data8 0x8888888888888889,0x3ff8 // Q5 = 1/5! +data8 0xaaaaaaaaaaaaaaab,0x3ffc // Q3 = 1/3! +data8 0x0,0x0 // Pad to avoid bank conflicts +LOCAL_OBJECT_END(exp_Q1_table) + +LOCAL_OBJECT_START(exp_Q2_table) +data8 0x3e5ae64567f544e4 // QB = 1/11! +data8 0x3e927e4fb7789f5c // QA = 1/10! +data8 0x3f2a01a01a01a01a // Q7 = 1/7! +data8 0x3f56c16c16c16c17 // Q6 = 1/6! +data8 0xaaaaaaaaaaaaaaab,0x3ffa // Q4 = 1/4! +data8 0x8000000000000000,0x3ffe // Q2 = 1/2! +LOCAL_OBJECT_END(exp_Q2_table) +.section .text +GLOBAL_IEEE754_ENTRY(expm1) -{ .mfi -(p0) add r32 = 1,r0 -(p0) fnorm.s1 f9 = f8 - nop.i 999 +{ .mlx + getf.exp rSignexp_x = f8 // Must recompute if x unorm + movl rSig_inv_ln2 = 0xb8aa3b295c17f0bc // signif of 1/ln2 } - - -{ .mfi - nop.m 999 -(p0) fclass.m.unc p6, p8 = f8, 0x1E7 - nop.i 999 +{ .mlx + addl rAD_TB1 = @ltoff(exp_Table_1), gp + movl rRshf_2to56 = 0x4768000000000000 // 1.10000 2^(63+56) } +;; +// We do this fnorm right at the beginning to normalize +// any input unnormals so that SWA is not taken. { .mfi - nop.m 999 -(p0) fclass.nm.unc p9, p0 = f8, 0x1FF - nop.i 999 + ld8 rAD_TB1 = [rAD_TB1] + fclass.m p6,p0 = f8,0x0b // Test for x=unorm + mov rExp_mask = 0x1ffff } - { .mfi - nop.m 999 -(p0) mov f36 = f1 - nop.i 999 ;; -} - -// -// Identify NatVals, NaNs, Infs, and Zeros. -// Identify EM unsupporteds. -// Save special input registers -// -// Create FR_X_cor = 0.0 -// GR_Flag = 0 -// GR_Expo_Range = 1 -// FR_Scale = 1.0 -// - -{ .mfb - nop.m 999 -(p0) mov f32 = f0 -(p6) br.cond.spnt EXP_64_SPECIAL ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.spnt EXP_64_UNSUPPORTED ;; -} - -// -// Branch out for special input values -// - -{ .mfi -(p0) cmp.ne.unc p12, p13 = 0x01, r33 -(p0) fcmp.lt.unc.s0 p9,p0 = f8, f0 -(p0) cmp.eq.unc p15, p0 = r0, r0 -} - -// -// Raise possible denormal operand exception -// Normalize x -// -// This function computes exp( x + x_cor) -// Input FR 1: FR_X -// Input FR 2: FR_X_cor -// Input GR 1: GR_Flag -// Input GR 2: GR_Expo_Range -// Output FR 3: FR_Y_hi -// Output FR 4: FR_Y_lo -// Output FR 5: FR_Scale -// Output PR 1: PR_Safe - -// -// Prepare to load constants -// Set Safe = True -// - -{ .mmi -(p0) addl r34 = @ltoff(Constants_exp_64_Arg#), gp -(p0) addl r40 = @ltoff(Constants_exp_64_W1#), gp -(p0) addl r41 = @ltoff(Constants_exp_64_W2#), gp -} -;; - -{ .mmi - ld8 r34 = [r34] - ld8 r40 = [r40] -(p0) addl r50 = @ltoff(Constants_exp_64_T1#), gp -} -;; - - -{ .mmi - ld8 r41 = [r41] -(p0) ldfe f37 = [r34],16 -(p0) addl r51 = @ltoff(Constants_exp_64_T2#), gp -} -;; - -// -// N = fcvt.fx(float_N) -// Set p14 if -6 > expo_X -// - - -// -// Bias = 0x0FFFF -// expo_X = expo_X and Mask -// - -// -// Load L_lo -// Set p10 if 14 < expo_X -// - -{ .mmi - ld8 r50 = [r50] -(p0) ldfe f40 = [r34],16 - nop.i 999 -} -;; - -{ .mlx - nop.m 999 -(p0) movl r58 = 0x0FFFF + mov rExp_bias = 0xffff + fnorm.s1 fNormX = f8 + mov rExp_2tom56 = 0xffff-56 } ;; -// -// Load W2_ptr -// Branch to SMALL is expo_X < -6 -// +// Form two constants we need +// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128 +// 1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand -// -// float_N = X * L_Inv -// expo_X = exponent of X -// Mask = 0x1FFFF -// - -{ .mmi - ld8 r51 = [r51] -(p0) ldfe f41 = [r34],16 +{ .mfi + setf.sig fINV_LN2_2TO63 = rSig_inv_ln2 // form 1/ln2 * 2^63 + fclass.m p8,p0 = f8,0x07 // Test for x=0 + nop.i 0 } -;; - { .mlx -(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp -(p0) movl r39 = 0x1FFFF -} -;; - -{ .mmi - ld8 r34 = [r34] -(p0) getf.exp r37 = f9 - nop.i 999 + setf.d fRSHF_2TO56 = rRshf_2to56 // Form 1.100 * 2^(63+56) + movl rRshf = 0x43e8000000000000 // 1.10000 2^63 for rshift } ;; -{ .mii - nop.m 999 - nop.i 999 -(p0) and r37 = r37, r39 ;; -} - -{ .mmi -(p0) sub r37 = r37, r58 ;; -(p0) cmp.gt.unc p14, p0 = -6, r37 -(p0) cmp.lt.unc p10, p0 = 14, r37 ;; -} - { .mfi - nop.m 999 -// -// Load L_inv -// Set p12 true for Flag = 0 (exp) -// Set p13 true for Flag = 1 (expm1) -// -(p0) fmpy.s1 f38 = f9, f37 - nop.i 999 ;; -} - -{ .mfb - nop.m 999 -// -// Load L_hi -// expo_X = expo_X - Bias -// get W1_ptr -// -(p0) fcvt.fx.s1 f39 = f38 -(p14) br.cond.spnt EXP_SMALL ;; + setf.exp f2TOM56 = rExp_2tom56 // form 2^-56 for scaling Nfloat + fclass.m p9,p0 = f8,0x22 // Test for x=-inf + add rAD_TB2 = 0x140, rAD_TB1 // Point to Table 2 } - { .mib - nop.m 999 - nop.i 999 -(p10) br.cond.spnt EXP_HUGE ;; -} - -{ .mmi -(p0) shladd r34 = r32,4,r34 -(p0) addl r35 = @ltoff(Constants_exp_64_A#), gp - nop.i 999 -} -;; - -{ .mmi - ld8 r35 = [r35] - nop.m 999 - nop.i 999 + add rAD_Q1 = 0x1e0, rAD_TB1 // Point to Q table for small path + add rAD_Ln2_lo = 0x30, rAD_TB1 // Point to ln2_by_128_lo +(p6) br.cond.spnt EXPM1_UNORM // Branch if x unorm } ;; -// -// Load T_1,T_2 -// - -{ .mmb -(p0) ldfe f51 = [r35],16 -(p0) ld8 r45 = [r34],8 - nop.b 999 ;; -} -// -// Set Safe = True if k >= big_expo_neg -// Set Safe = False if k < big_expo_neg -// - -{ .mmb -(p0) ldfe f49 = [r35],16 -(p0) ld8 r48 = [r34],0 - nop.b 999 ;; -} - -{ .mfi - nop.m 999 -// -// Branch to HUGE is expo_X > 14 -// -(p0) fcvt.xf f38 = f39 - nop.i 999 ;; -} - -{ .mfi -(p0) getf.sig r52 = f39 - nop.f 999 - nop.i 999 ;; -} - -{ .mii - nop.m 999 -(p0) extr.u r43 = r52, 6, 6 ;; -// -// r = r - float_N * L_lo -// K = extr(N_fix,12,52) -// -(p0) shladd r40 = r43,3,r40 ;; -} - +EXPM1_COMMON: { .mfi -(p0) shladd r50 = r43,2,r50 -(p0) fnma.s1 f42 = f40, f38, f9 -// -// float_N = float(N) -// N_fix = signficand N -// -(p0) extr.u r42 = r52, 0, 6 -} - -{ .mmi -(p0) ldfd f43 = [r40],0 ;; -(p0) shladd r41 = r42,3,r41 -(p0) shladd r51 = r42,2,r51 + ldfpd fMIN_DBL_OFLOW_ARG, fMAX_DBL_MINUS_1_ARG = [rAD_TB1],16 + fclass.m p10,p0 = f8,0x1e1 // Test for x=+inf, NaN, NaT + add rAD_Q2 = 0x50, rAD_Q1 // Point to Q table for small path } -// -// W_1_p1 = 1 + W_1 -// - -{ .mmi -(p0) ldfs f44 = [r50],0 ;; -(p0) ldfd f45 = [r41],0 -// -// M_2 = extr(N_fix,0,6) -// M_1 = extr(N_fix,6,6) -// r = X - float_N * L_hi -// -(p0) extr r44 = r52, 12, 52 -} - -{ .mmi -(p0) ldfs f46 = [r51],0 ;; -(p0) sub r46 = r58, r44 -(p0) cmp.gt.unc p8, p15 = r44, r45 -} -// -// W = W_1 + W_1_p1*W_2 -// Load A_2 -// Bias_m_K = Bias - K -// - -{ .mii -(p0) ldfe f40 = [r35],16 -// -// load A_1 -// poly = A_2 + r*A_3 -// rsq = r * r -// neg_2_mK = exponent of Bias_m_k -// -(p0) add r47 = r58, r44 ;; -// -// Set Safe = True if k <= big_expo_pos -// Set Safe = False if k > big_expo_pos -// Load A_3 -// -(p15) cmp.lt p8,p15 = r44,r48 ;; -} - -{ .mmf -(p0) setf.exp f61 = r46 -// -// Bias_p + K = Bias + K -// T = T_1 * T_2 -// -(p0) setf.exp f36 = r47 -(p0) fnma.s1 f42 = f41, f38, f42 ;; -} - -{ .mfi - nop.m 999 -// -// Load W_1,W_2 -// Load big_exp_pos, load big_exp_neg -// -(p0) fadd.s1 f47 = f43, f1 - nop.i 999 ;; +{ .mfb + nop.m 0 + nop.f 0 +(p8) br.ret.spnt b0 // Exit for x=0, return x } +;; { .mfi - nop.m 999 -(p0) fma.s1 f52 = f42, f51, f49 - nop.i 999 + ldfd fMAX_DBL_NORM_ARG = [rAD_TB1],16 + nop.f 0 + and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x } - -{ .mfi - nop.m 999 -(p0) fmpy.s1 f48 = f42, f42 - nop.i 999 ;; +{ .mfb + setf.d fRSHF = rRshf // Form right shift const 1.100 * 2^63 +(p9) fms.d.s0 f8 = f0,f0,f1 // quick exit for x=-inf +(p9) br.ret.spnt b0 } +;; { .mfi - nop.m 999 -(p0) fmpy.s1 f53 = f44, f46 - nop.i 999 ;; + ldfpd fQD, fQC = [rAD_Q1], 16 // Load coeff for small path + nop.f 0 + sub rExp_x = rExp_x, rExp_bias // True exponent of x } - -{ .mfi - nop.m 999 -(p0) fma.s1 f54 = f45, f47, f43 - nop.i 999 +{ .mfb + ldfpd fQB, fQA = [rAD_Q2], 16 // Load coeff for small path +(p10) fma.d.s0 f8 = f8, f1, f0 // For x=+inf, NaN, NaT +(p10) br.ret.spnt b0 // Exit for x=+inf, NaN, NaT } +;; { .mfi - nop.m 999 -(p0) fneg f61 = f61 - nop.i 999 ;; + ldfpd fQ9, fQ8 = [rAD_Q1], 16 // Load coeff for small path + fma.s1 fXsq = fNormX, fNormX, f0 // x*x for small path + cmp.gt p7, p8 = -2, rExp_x // Test |x| < 2^(-2) } - { .mfi - nop.m 999 -(p0) fma.s1 f52 = f42, f52, f40 - nop.i 999 ;; + ldfpd fQ7, fQ6 = [rAD_Q2], 16 // Load coeff for small path + nop.f 0 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fadd.s1 f55 = f54, f1 - nop.i 999 + ldfe fQ5 = [rAD_Q1], 16 // Load coeff for small path + nop.f 0 + nop.i 0 } - -{ .mfi - nop.m 999 -// -// W + Wp1 * poly -// -(p0) mov f34 = f53 - nop.i 999 ;; +{ .mib + ldfe fQ4 = [rAD_Q2], 16 // Load coeff for small path +(p7) cmp.gt.unc p6, p7 = -60, rExp_x // Test |x| < 2^(-60) +(p7) br.cond.spnt EXPM1_SMALL // Branch if 2^-60 <= |x| < 2^-2 } +;; -{ .mfi - nop.m 999 -// -// A_1 + r * poly -// Scale = setf_exp(Bias_p_k) -// -(p0) fma.s1 f52 = f48, f52, f42 - nop.i 999 ;; -} +// W = X * Inv_log2_by_128 +// By adding 1.10...0*2^63 we shift and get round_int(W) in significand. +// We actually add 1.10...0*2^56 to X * Inv_log2 to do the same thing. { .mfi - nop.m 999 -// -// poly = r + rsq(A_1 + r*poly) -// Wp1 = 1 + W -// neg_2_mK = -neg_2_mK -// -(p0) fma.s1 f35 = f55, f52, f54 - nop.i 999 ;; + ldfe fLn2_by_128_hi = [rAD_TB1],32 + fma.s1 fW_2TO56_RSH = fNormX, fINV_LN2_2TO63, fRSHF_2TO56 + nop.i 0 } - { .mfb - nop.m 999 -(p0) fmpy.s1 f35 = f35, f53 -// -// Y_hi = T -// Y_lo = T * (W + Wp1*poly) -// -(p12) br.cond.sptk EXP_MAIN ;; + ldfe fLn2_by_128_lo = [rAD_Ln2_lo] +(p6) fma.d.s0 f8 = f8, f8, f8 // If x < 2^-60, result=x+x*x +(p6) br.ret.spnt b0 // Exit if x < 2^-60 } -// -// Branch if exp(x) -// Continue for exp(x-1) -// +;; -{ .mii -(p0) cmp.lt.unc p12, p13 = 10, r44 - nop.i 999 ;; -// -// Set p12 if 10 < K, Else p13 -// -(p13) cmp.gt.unc p13, p14 = -10, r44 ;; -} +// Divide arguments into the following categories: +// Certain minus one p11 - -inf < x <= MAX_DBL_MINUS_1_ARG +// Possible Overflow p14 - MAX_DBL_NORM_ARG < x < MIN_DBL_OFLOW_ARG +// Certain Overflow p15 - MIN_DBL_OFLOW_ARG <= x < +inf // -// K > 10: Y_lo = Y_lo + neg_2_mK -// K <=10: Set p13 if -10 > K, Else set p14 +// If the input is really a double arg, then there will never be "Possible +// Overflow" arguments. // -{ .mfi -(p13) cmp.eq p15, p0 = r0, r0 -(p14) fadd.s1 f34 = f61, f34 - nop.i 999 ;; -} +// After that last load, rAD_TB1 points to the beginning of table 1 { .mfi - nop.m 999 -(p12) fadd.s1 f35 = f35, f61 - nop.i 999 ;; + nop.m 0 + fcmp.ge.s1 p15,p14 = fNormX,fMIN_DBL_OFLOW_ARG + nop.i 0 } +;; { .mfi - nop.m 999 -(p13) fadd.s1 f35 = f35, f34 - nop.i 999 + add rAD_P = 0x80, rAD_TB2 + fcmp.le.s1 p11,p0 = fNormX,fMAX_DBL_MINUS_1_ARG + nop.i 0 } +;; { .mfb - nop.m 999 -// -// K <= 10 and K < -10, Set Safe = True -// K <= 10 and K < 10, Y_lo = Y_hi + Y_lo -// K <= 10 and K > =-10, Y_hi = Y_hi + neg_2_mk -// -(p13) mov f34 = f61 -(p0) br.cond.sptk EXP_MAIN ;; -} -EXP_SMALL: - -{ .mmi -(p12) addl r35 = @ltoff(Constants_exp_64_P#), gp -(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp - nop.i 999 + ldfpd fP5, fP4 = [rAD_P] ,16 +(p14) fcmp.gt.unc.s1 p14,p0 = fNormX,fMAX_DBL_NORM_ARG +(p15) br.cond.spnt EXPM1_CERTAIN_OVERFLOW } ;; -{ .mmi -(p12) ld8 r35 = [r35] - ld8 r34 = [r34] - nop.i 999 -} -;; +// Nfloat = round_int(W) +// The signficand of fW_2TO56_RSH contains the rounded integer part of W, +// as a twos complement number in the lower bits (that is, it may be negative). +// That twos complement number (called N) is put into rN. +// Since fW_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56 +// before the shift constant 1.10000 * 2^63 is subtracted to yield fNfloat. +// Thus, fNfloat contains the floating point version of N -{ .mmi -(p13) addl r35 = @ltoff(Constants_exp_64_Q#), gp - nop.m 999 - nop.i 999 +{ .mfb + ldfpd fP3, fP2 = [rAD_P] + fms.s1 fNfloat = fW_2TO56_RSH, f2TOM56, fRSHF +(p11) br.cond.spnt EXPM1_CERTAIN_MINUS_ONE } ;; - -// -// Return -// K <= 10 and K < 10, Y_hi = neg_2_mk -// -// /*******************************************************/ -// /*********** Branch EXP_SMALL *************************/ -// /*******************************************************/ - { .mfi -(p13) ld8 r35 = [r35] -(p0) mov f42 = f9 -(p0) add r34 = 0x48,r34 + getf.sig rN = fW_2TO56_RSH + nop.f 0 + nop.i 0 } ;; -// -// Flag = 0 -// r4 = rsq * rsq -// - -{ .mfi -(p0) ld8 r49 =[r34],0 - nop.f 999 - nop.i 999 ;; -} - -{ .mii - nop.m 999 - nop.i 999 ;; -// -// Flag = 1 -// -(p0) cmp.lt.unc p14, p0 = r37, r49 ;; -} - -{ .mfi - nop.m 999 -// -// r = X -// -(p0) fmpy.s1 f48 = f42, f42 - nop.i 999 ;; -} - -{ .mfb - nop.m 999 -// -// rsq = r * r -// -(p0) fmpy.s1 f50 = f48, f48 -// -// Is input very small? -// -(p14) br.cond.spnt EXP_VERY_SMALL ;; -} -// -// Flag_not1: Y_hi = 1.0 -// Flag is 1: r6 = rsq * r4 -// +// rIndex_1 has index_1 +// rIndex_2_16 has index_2 * 16 +// rBiased_M has M +// rIndex_1_16 has index_1 * 16 +// r = x - Nfloat * ln2_by_128_hi +// f = 1 - Nfloat * ln2_by_128_lo { .mfi -(p12) ldfe f52 = [r35],16 -(p12) mov f34 = f1 -(p0) add r53 = 0x1,r0 ;; + and rIndex_1 = 0x0f, rN + fnma.s1 fR = fNfloat, fLn2_by_128_hi, fNormX + shr rM = rN, 0x7 } - { .mfi -(p13) ldfe f51 = [r35],16 -// -// Flag_not_1: Y_lo = poly_hi + r4 * poly_lo -// -(p13) mov f34 = f9 - nop.i 999 ;; -} - -{ .mmf -(p12) ldfe f53 = [r35],16 -// -// For Flag_not_1, Y_hi = X -// Scale = 1 -// Create 0x000...01 -// -(p0) setf.sig f37 = r53 -(p0) mov f36 = f1 ;; -} - -{ .mmi -(p13) ldfe f52 = [r35],16 ;; -(p12) ldfe f54 = [r35],16 - nop.i 999 ;; + and rIndex_2_16 = 0x70, rN + fnma.s1 fF = fNfloat, fLn2_by_128_lo, f1 + nop.i 0 } +;; -{ .mfi -(p13) ldfe f53 = [r35],16 -(p13) fmpy.s1 f58 = f48, f50 - nop.i 999 ;; -} -// -// Flag_not1: poly_lo = P_5 + r*P_6 -// Flag_1: poly_lo = Q_6 + r*Q_7 -// - -{ .mmi -(p13) ldfe f54 = [r35],16 ;; -(p12) ldfe f55 = [r35],16 - nop.i 999 ;; -} +// rAD_T1 has address of T1 +// rAD_T2 has address if T2 { .mmi -(p12) ldfe f56 = [r35],16 ;; -(p13) ldfe f55 = [r35],16 - nop.i 999 ;; + add rBiased_M = rExp_bias, rM + add rAD_T2 = rAD_TB2, rIndex_2_16 + shladd rAD_T1 = rIndex_1, 4, rAD_TB1 } +;; +// Create Scale = 2^M +// Load T1 and T2 { .mmi -(p12) ldfe f57 = [r35],0 ;; -(p13) ldfe f56 = [r35],16 - nop.i 999 ;; -} - -{ .mfi -(p13) ldfe f57 = [r35],0 - nop.f 999 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// For Flag_not_1, load p5,p6,p1,p2 -// Else load p5,p6,p1,p2 -// -(p12) fma.s1 f60 = f52, f42, f53 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p13) fma.s1 f60 = f51, f42, f52 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p12) fma.s1 f60 = f60, f42, f54 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p12) fma.s1 f59 = f56, f42, f57 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p13) fma.s1 f60 = f42, f60, f53 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p12) fma.s1 f59 = f59, f48, f42 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// Flag_1: poly_lo = Q_5 + r*(Q_6 + r*Q_7) -// Flag_not1: poly_lo = P_4 + r*(P_5 + r*P_6) -// Flag_not1: poly_hi = (P_1 + r*P_2) -// -(p13) fmpy.s1 f60 = f60, f58 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p12) fma.s1 f60 = f60, f42, f55 - nop.i 999 ;; + setf.exp f2M = rBiased_M + ldfe fT2 = [rAD_T2] + nop.i 0 } +;; { .mfi - nop.m 999 -// -// Flag_1: poly_lo = r6 *(Q_5 + ....) -// Flag_not1: poly_hi = r + rsq *(P_1 + r*P_2) -// -(p12) fma.s1 f35 = f60, f50, f59 - nop.i 999 + ldfe fT1 = [rAD_T1] + fmpy.s0 fTmp = fLn2_by_128_lo, fLn2_by_128_lo // Force inexact + nop.i 0 } +;; { .mfi - nop.m 999 -(p13) fma.s1 f59 = f54, f42, f55 - nop.i 999 ;; + nop.m 0 + fma.s1 fP54 = fR, fP5, fP4 + nop.i 0 } - { .mfi - nop.m 999 -// -// Flag_not1: Y_lo = rsq* poly_hi + poly_lo -// Flag_1: poly_lo = rsq* poly_hi + poly_lo -// -(p13) fma.s1 f59 = f59, f42, f56 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// Flag_not_1: (P_1 + r*P_2) -// -(p13) fma.s1 f59 = f59, f42, f57 - nop.i 999 ;; + nop.m 0 + fma.s1 fP32 = fR, fP3, fP2 + nop.i 0 } +;; { .mfi - nop.m 999 -// -// Flag_not_1: poly_hi = r + rsq * (P_1 + r*P_2) -// -(p13) fma.s1 f35 = f59, f48, f60 - nop.i 999 ;; + nop.m 0 + fma.s1 fRsq = fR, fR, f0 + nop.i 0 } +;; { .mfi - nop.m 999 -// -// Create 0.000...01 -// -(p0) for f37 = f35, f37 - nop.i 999 ;; -} - -{ .mfb - nop.m 999 -// -// Set lsb of Y_lo to 1 -// -(p0) fmerge.se f35 = f35,f37 -(p0) br.cond.sptk EXP_MAIN ;; -} -EXP_VERY_SMALL: - -{ .mmi - nop.m 999 -(p13) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp - nop.i 999;; + nop.m 0 + fma.s1 fP5432 = fRsq, fP54, fP32 + nop.i 0 } +;; { .mfi -(p13) ld8 r34 = [r34]; -(p12) mov f35 = f9 - nop.i 999 ;; -} - -{ .mfb - nop.m 999 -(p12) mov f34 = f1 -(p12) br.cond.sptk EXP_MAIN ;; -} - -{ .mlx -(p13) add r34 = 8,r34 -(p13) movl r39 = 0x0FFFE ;; + nop.m 0 + fma.s1 fS2 = fF,fT2,f0 + nop.i 0 } -// -// Load big_exp_neg -// Create 1/2's exponent -// - -{ .mii -(p13) setf.exp f56 = r39 -(p13) shladd r34 = r32,4,r34 ;; - nop.i 999 -} -// -// Negative exponents are stored after positive -// - { .mfi -(p13) ld8 r45 = [r34],0 -// -// Y_hi = x -// Scale = 1 -// -(p13) fmpy.s1 f35 = f9, f9 - nop.i 999 ;; + nop.m 0 + fma.s1 fS1 = f2M,fT1,f0 + nop.i 0 } +;; { .mfi - nop.m 999 -// -// Reset Safe if necessary -// Create 1/2 -// -(p13) mov f34 = f9 - nop.i 999 ;; + nop.m 0 + fma.s1 fP = fRsq, fP5432, fR + nop.i 0 } +;; { .mfi -(p13) cmp.lt.unc p0, p15 = r37, r45 -(p13) mov f36 = f1 - nop.i 999 ;; + nop.m 0 + fms.s1 fSm1 = fS1,fS2,f1 // S - 1.0 + nop.i 0 } - { .mfb - nop.m 999 -// -// Y_lo = x * x -// -(p13) fmpy.s1 f35 = f35, f56 -// -// Y_lo = x*x/2 -// -(p13) br.cond.sptk EXP_MAIN ;; -} -EXP_HUGE: - -{ .mfi - nop.m 999 -(p0) fcmp.gt.unc.s1 p14, p0 = f9, f0 - nop.i 999 -} - -{ .mlx - nop.m 999 -(p0) movl r39 = 0x15DC0 ;; -} - -{ .mfi -(p14) setf.exp f34 = r39 -(p14) mov f35 = f1 -(p14) cmp.eq p0, p15 = r0, r0 ;; + nop.m 0 + fma.s1 fS = fS1,fS2,f0 +(p14) br.cond.spnt EXPM1_POSSIBLE_OVERFLOW } +;; { .mfb - nop.m 999 -(p14) mov f36 = f34 -// -// If x > 0, Set Safe = False -// If x > 0, Y_hi = 2**(24,000) -// If x > 0, Y_lo = 1.0 -// If x > 0, Scale = 2**(24,000) -// -(p14) br.cond.sptk EXP_MAIN ;; -} - -{ .mlx - nop.m 999 -(p12) movl r39 = 0xA240 -} - -{ .mlx - nop.m 999 -(p12) movl r38 = 0xA1DC ;; -} - -{ .mmb -(p13) cmp.eq p15, p14 = r0, r0 -(p12) setf.exp f34 = r39 - nop.b 999 ;; -} - -{ .mlx -(p12) setf.exp f35 = r38 -(p13) movl r39 = 0xFF9C + nop.m 0 + fma.d.s0 f8 = fS, fP, fSm1 + br.ret.sptk b0 // Normal path exit } +;; -{ .mfi - nop.m 999 -(p13) fsub.s1 f34 = f0, f1 - nop.i 999 ;; +// Here if 2^-60 <= |x| <2^-2 +// Compute 13th order polynomial +EXPM1_SMALL: +{ .mmf + ldfe fQ3 = [rAD_Q1], 16 + ldfe fQ2 = [rAD_Q2], 16 + fma.s1 fX4 = fXsq, fXsq, f0 } +;; { .mfi - nop.m 999 -(p12) mov f36 = f34 -(p12) cmp.eq p0, p15 = r0, r0 ;; + nop.m 0 + fma.s1 fQDC = fQD, fNormX, fQC + nop.i 0 } - { .mfi -(p13) setf.exp f35 = r39 -(p13) mov f36 = f1 - nop.i 999 ;; -} -EXP_MAIN: - -{ .mfi -(p0) cmp.ne.unc p12, p0 = 0x01, r33 -(p0) fmpy.s1 f101 = f36, f35 - nop.i 999 ;; -} - -{ .mfb - nop.m 999 -(p0) fma.d.s0 f99 = f34, f36, f101 -(p15) br.cond.sptk EXP_64_RETURN;; + nop.m 0 + fma.s1 fQBA = fQB, fNormX, fQA + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x01 - nop.i 999 + nop.m 0 + fma.s1 fQ98 = fQ9, fNormX, fQ8 + nop.i 0 } - -{ .mlx - nop.m 999 -(p0) movl r50 = 0x000000000103FF ;; -} -// -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// S3 user supplied status + RZ + TD (Underflows) -// -// -// If (Safe) is true, then -// Compute result using user supplied status field. -// No overflow or underflow here, but perhaps inexact. -// Return -// Else -// Determine if overflow or underflow was raised. -// Fetch +/- overflow threshold for IEEE single, double, -// double extended -// - { .mfi -(p0) setf.exp f60 = r50 -(p0) fma.d.s3 f102 = f34, f36, f101 - nop.i 999 + nop.m 0 + fma.s1 fQ76= fQ7, fNormX, fQ6 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x40 - nop.i 999 ;; + nop.m 0 + fma.s1 fQ54 = fQ5, fNormX, fQ4 + nop.i 0 } +;; { .mfi - nop.m 999 -// -// For Safe, no need to check for over/under. -// For expm1, handle errors like exp. -// -(p0) fsetc.s2 0x7F,0x42 - nop.i 999;; + nop.m 0 + fma.s1 fX6 = fX4, fXsq, f0 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.d.s2 f100 = f34, f36, f101 - nop.i 999 ;; + nop.m 0 + fma.s1 fQ32= fQ3, fNormX, fQ2 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fsetc.s2 0x7F,0x40 - nop.i 999 ;; + nop.m 0 + fma.s1 fQDCBA = fQDC, fXsq, fQBA + nop.i 0 } - { .mfi - nop.m 999 -(p7) fclass.m.unc p12, p0 = f102, 0x00F - nop.i 999 + nop.m 0 + fma.s1 fQ7654 = fQ76, fXsq, fQ54 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fclass.m.unc p11, p0 = f102, 0x00F - nop.i 999 ;; + nop.m 0 + fma.s1 fQDCBA98 = fQDCBA, fXsq, fQ98 + nop.i 0 } - { .mfi - nop.m 999 -(p7) fcmp.ge.unc.s1 p10, p0 = f100, f60 - nop.i 999 + nop.m 0 + fma.s1 fQ765432 = fQ7654, fXsq, fQ32 + nop.i 0 } +;; { .mfi - nop.m 999 -// -// Create largest double exponent + 1. -// Create smallest double exponent - 1. -// -(p0) fcmp.ge.unc.s1 p8, p0 = f100, f60 - nop.i 999 ;; -} -// -// fcmp: resultS2 >= + overflow threshold -> set (a) if true -// fcmp: resultS2 <= - overflow threshold -> set (b) if true -// fclass: resultS3 is denorm/unorm/0 -> set (d) if true -// - -{ .mib -(p10) mov r65 = 41 - nop.i 999 -(p10) br.cond.sptk __libm_error_region ;; -} - -{ .mib -(p8) mov r65 = 14 - nop.i 999 -(p8) br.cond.sptk __libm_error_region ;; -} -// -// Report that exp overflowed -// - -{ .mib -(p12) mov r65 = 42 - nop.i 999 -(p12) br.cond.sptk __libm_error_region ;; + nop.m 0 + fma.s1 fQDCBA98765432 = fQDCBA98, fX6, fQ765432 + nop.i 0 } +;; -{ .mib -(p11) mov r65 = 15 - nop.i 999 -(p11) br.cond.sptk __libm_error_region ;; +{ .mfb + nop.m 0 + fma.d.s0 f8 = fQDCBA98765432, fXsq, fNormX + br.ret.sptk b0 // Exit small branch } +;; -{ .mib - nop.m 999 - nop.i 999 -// -// Report that exp underflowed -// -(p0) br.cond.sptk EXP_64_RETURN;; -} -EXP_64_SPECIAL: -{ .mfi - nop.m 999 -(p0) fclass.m.unc p6, p0 = f8, 0x0c3 - nop.i 999 -} +EXPM1_POSSIBLE_OVERFLOW: -{ .mfi - nop.m 999 -(p0) fclass.m.unc p13, p8 = f8, 0x007 - nop.i 999 ;; -} +// Here if fMAX_DBL_NORM_ARG < x < fMIN_DBL_OFLOW_ARG +// This cannot happen if input is a double, only if input higher precision. +// Overflow is a possibility, not a certainty. -{ .mfi - nop.m 999 -(p7) fclass.m.unc p14, p0 = f8, 0x007 - nop.i 999 -} +// Recompute result using status field 2 with user's rounding mode, +// and wre set. If result is larger than largest double, then we have +// overflow { .mfi - nop.m 999 -(p0) fclass.m.unc p12, p9 = f8, 0x021 - nop.i 999 ;; + mov rGt_ln = 0x103ff // Exponent for largest dbl + 1 ulp + fsetc.s2 0x7F,0x42 // Get user's round mode, set wre + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fclass.m.unc p11, p0 = f8, 0x022 - nop.i 999 + setf.exp fGt_pln = rGt_ln // Create largest double + 1 ulp + fma.d.s2 fWre_urm_f8 = fS, fP, fSm1 // Result with wre set + nop.i 0 } +;; { .mfi - nop.m 999 -(p7) fclass.m.unc p10, p0 = f8, 0x022 - nop.i 999 ;; + nop.m 0 + fsetc.s2 0x7F,0x40 // Turn off wre in sf2 + nop.i 0 } +;; { .mfi - nop.m 999 -// -// Identify +/- 0, Inf, or -Inf -// Generate the right kind of NaN. -// -(p13) fadd.d.s0 f99 = f0, f1 - nop.i 999 ;; + nop.m 0 + fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow + nop.i 0 } +;; -{ .mfi - nop.m 999 -(p14) mov f99 = f8 - nop.i 999 ;; +{ .mfb + nop.m 0 + nop.f 0 +(p6) br.cond.spnt EXPM1_CERTAIN_OVERFLOW // Branch if overflow } +;; { .mfb - nop.m 999 -(p6) fadd.d.s0 f99 = f8, f1 -// -// exp(+/-0) = 1 -// expm1(+/-0) = +/-0 -// No exceptions raised -// -(p6) br.cond.sptk EXP_64_RETURN;; + nop.m 0 + fma.d.s0 f8 = fS, fP, fSm1 + br.ret.sptk b0 // Exit if really no overflow } +;; -{ .mib - nop.m 999 - nop.i 999 -(p14) br.cond.sptk EXP_64_RETURN;; +EXPM1_CERTAIN_OVERFLOW: +{ .mmi + sub rTmp = rExp_mask, r0, 1 +;; + setf.exp fTmp = rTmp + nop.i 0 } +;; { .mfi - nop.m 999 -(p11) mov f99 = f0 - nop.i 999 ;; + alloc r32=ar.pfs,1,4,4,0 + fmerge.s FR_X = f8,f8 + nop.i 0 } - { .mfb - nop.m 999 -(p10) fsub.d.s1 f99 = f0, f1 -// -// exp(-Inf) = 0 -// expm1(-Inf) = -1 -// No exceptions raised. -// -(p10) br.cond.sptk EXP_64_RETURN;; + mov GR_Parameter_TAG = 41 + fma.d.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result + br.cond.sptk __libm_error_region } +;; +// Here if x unorm +EXPM1_UNORM: { .mfb - nop.m 999 -(p12) fmpy.d.s1 f99 = f8, f1 -// -// exp(+Inf) = Inf -// No exceptions raised. -// -(p0) br.cond.sptk EXP_64_RETURN;; + getf.exp rSignexp_x = fNormX // Must recompute if x unorm + fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag + br.cond.sptk EXPM1_COMMON } +;; - -EXP_64_UNSUPPORTED: - -{ .mfb - nop.m 999 -(p0) fmpy.d.s0 f99 = f8, f0 - nop.b 0;; +// here if result will be -1 and inexact, x <= -48.0 +EXPM1_CERTAIN_MINUS_ONE: +{ .mmi + mov rTmp = 1 +;; + setf.exp fTmp = rTmp + nop.i 0 } +;; -EXP_64_RETURN: { .mfb - nop.m 999 -(p0) mov f8 = f99 -(p0) br.ret.sptk b0 + nop.m 0 + fms.d.s0 FR_RESULT = fTmp, fTmp, f1 // Set I, rounded -1+eps result + br.ret.sptk b0 } -.endp expm1 -ASM_SIZE_DIRECTIVE(expm1) +;; + +GLOBAL_IEEE754_END(expm1) -.proc __libm_error_region -__libm_error_region: + +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue -// (1) { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value nop.f 0 @@ -1716,38 +843,32 @@ __libm_error_region: } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + add sp=-64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP=gp // Save gp };; - -// (2) { .mmi stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; - .body -// (3) { .mib - stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack + stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address - nop.b 0 + nop.b 0 } { .mib - stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack + stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; - -// (4) { .mmi ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp @@ -1760,9 +881,6 @@ __libm_error_region: br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) - - +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# |