diff options
author | Ulrich Drepper <drepper@redhat.com> | 2004-12-22 20:10:10 +0000 |
---|---|---|
committer | Ulrich Drepper <drepper@redhat.com> | 2004-12-22 20:10:10 +0000 |
commit | a334319f6530564d22e775935d9c91663623a1b4 (patch) | |
tree | b5877475619e4c938e98757d518bb1e9cbead751 /sysdeps/ia64/fpu/s_expm1.S | |
parent | 0ecb606cb6cf65de1d9fc8a919bceb4be476c602 (diff) | |
download | glibc-a334319f6530564d22e775935d9c91663623a1b4.tar.gz glibc-a334319f6530564d22e775935d9c91663623a1b4.tar.xz glibc-a334319f6530564d22e775935d9c91663623a1b4.zip |
(CFLAGS-tst-align.c): Add -mpreferred-stack-boundary=4.
Diffstat (limited to 'sysdeps/ia64/fpu/s_expm1.S')
-rw-r--r-- | sysdeps/ia64/fpu/s_expm1.S | 2150 |
1 files changed, 1516 insertions, 634 deletions
diff --git a/sysdeps/ia64/fpu/s_expm1.S b/sysdeps/ia64/fpu/s_expm1.S index 09a22bbbdd..19a237990c 100644 --- a/sysdeps/ia64/fpu/s_expm1.S +++ b/sysdeps/ia64/fpu/s_expm1.S @@ -1,10 +1,10 @@ .file "exp_m1.s" - -// Copyright (c) 2000 - 2005, Intel Corporation +// Copyright (C) 2000, 2001, Intel Corporation // All rights reserved. -// -// Contributed 2000 by the Intel Numerics Group, Intel Corporation +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,821 +20,1694 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. - -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://www.intel.com/software/products/opensource/libraries/num.htm. -// -// History -//============================================================== -// 02/02/00 Initial Version -// 04/04/00 Unwind support added -// 08/15/00 Bundle added after call to __libm_error_support to properly +// problem reports or change requests be submitted to it directly at +// http://developer.intel.com/opensource. +// +// HISTORY +// 2/02/00 Initial Version +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. -// 07/07/01 Improved speed of all paths -// 05/20/02 Cleaned up namespace and sf0 syntax -// 11/20/02 Improved speed, algorithm based on exp -// 03/31/05 Reformatted delimiters between data tables - -// API -//============================================================== -// double expm1(double) - -// Overview of operation -//============================================================== -// 1. Inputs of Nan, Inf, Zero, NatVal handled with special paths -// -// 2. |x| < 2^-60 -// Result = x, computed by x + x*x to handle appropriate flags and rounding -// -// 3. 2^-60 <= |x| < 2^-2 -// Result determined by 13th order Taylor series polynomial -// expm1f(x) = x + Q2*x^2 + ... + Q13*x^13 -// -// 4. x < -48.0 -// Here we know result is essentially -1 + eps, where eps only affects -// rounded result. Set I. -// -// 5. x >= 709.7827 -// Result overflows. Set I, O, and call error support // -// 6. 2^-2 <= x < 709.7827 or -48.0 <= x < -2^-2 -// This is the main path. The algorithm is described below: - -// Take the input x. w is "how many log2/128 in x?" -// w = x * 128/log2 -// n = int(w) -// x = n log2/128 + r + delta - -// n = 128M + index_1 + 2^4 index_2 -// x = M log2 + (log2/128) index_1 + (log2/8) index_2 + r + delta - -// exp(x) = 2^M 2^(index_1/128) 2^(index_2/8) exp(r) exp(delta) -// Construct 2^M -// Get 2^(index_1/128) from table_1; -// Get 2^(index_2/8) from table_2; -// Calculate exp(r) by series by 5th order polynomial -// r = x - n (log2/128)_high -// delta = - n (log2/128)_low -// Calculate exp(delta) as 1 + delta - - -// Special values -//============================================================== -// expm1(+0) = +0.0 -// expm1(-0) = -0.0 - -// expm1(+qnan) = +qnan -// expm1(-qnan) = -qnan -// expm1(+snan) = +qnan -// expm1(-snan) = -qnan - -// expm1(-inf) = -1.0 -// expm1(+inf) = +inf - -// Overflow and Underflow -//======================= -// expm1(x) = largest double normal when -// x = 709.7827 = 40862e42fefa39ef -// -// Underflow is handled as described in case 2 above. - - -// Registers used -//============================================================== -// Floating Point registers used: -// f8, input -// f9 -> f15, f32 -> f75 - -// General registers used: -// r14 -> r40 - -// Predicate registers used: -// p6 -> p15 - -// Assembly macros -//============================================================== - -rRshf = r14 -rAD_TB1 = r15 -rAD_T1 = r15 -rAD_TB2 = r16 -rAD_T2 = r16 -rAD_Ln2_lo = r17 -rAD_P = r17 - -rN = r18 -rIndex_1 = r19 -rIndex_2_16 = r20 - -rM = r21 -rBiased_M = r21 -rIndex_1_16 = r22 -rSignexp_x = r23 -rExp_x = r24 -rSig_inv_ln2 = r25 - -rAD_Q1 = r26 -rAD_Q2 = r27 -rTmp = r27 -rExp_bias = r28 -rExp_mask = r29 -rRshf_2to56 = r30 - -rGt_ln = r31 -rExp_2tom56 = r31 - - -GR_SAVE_B0 = r33 -GR_SAVE_PFS = r34 -GR_SAVE_GP = r35 -GR_SAVE_SP = r36 - -GR_Parameter_X = r37 -GR_Parameter_Y = r38 -GR_Parameter_RESULT = r39 -GR_Parameter_TAG = r40 - - -FR_X = f10 -FR_Y = f1 -FR_RESULT = f8 - -fRSHF_2TO56 = f6 -fINV_LN2_2TO63 = f7 -fW_2TO56_RSH = f9 -f2TOM56 = f11 -fP5 = f12 -fP54 = f50 -fP5432 = f50 -fP4 = f13 -fP3 = f14 -fP32 = f14 -fP2 = f15 - -fLn2_by_128_hi = f33 -fLn2_by_128_lo = f34 - -fRSHF = f35 -fNfloat = f36 -fW = f37 -fR = f38 -fF = f39 - -fRsq = f40 -fRcube = f41 - -f2M = f42 -fS1 = f43 -fT1 = f44 - -fMIN_DBL_OFLOW_ARG = f45 -fMAX_DBL_MINUS_1_ARG = f46 -fMAX_DBL_NORM_ARG = f47 -fP_lo = f51 -fP_hi = f52 -fP = f53 -fS = f54 - -fNormX = f56 - -fWre_urm_f8 = f57 - -fGt_pln = f58 -fTmp = f58 - -fS2 = f59 -fT2 = f60 -fSm1 = f61 - -fXsq = f62 -fX6 = f63 -fX4 = f63 -fQ7 = f64 -fQ76 = f64 -fQ7654 = f64 -fQ765432 = f64 -fQ6 = f65 -fQ5 = f66 -fQ54 = f66 -fQ4 = f67 -fQ3 = f68 -fQ32 = f68 -fQ2 = f69 -fQD = f70 -fQDC = f70 -fQDCBA = f70 -fQDCBA98 = f70 -fQDCBA98765432 = f70 -fQC = f71 -fQB = f72 -fQBA = f72 -fQA = f73 -fQ9 = f74 -fQ98 = f74 -fQ8 = f75 - -// Data tables -//============================================================== - -RODATA -.align 16 - -// ************* DO NOT CHANGE ORDER OF THESE TABLES ******************** - -// double-extended 1/ln(2) -// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88 -// 3fff b8aa 3b29 5c17 f0bc -// For speed the significand will be loaded directly with a movl and setf.sig -// and the exponent will be bias+63 instead of bias+0. Thus subsequent -// computations need to scale appropriately. -// The constant 128/ln(2) is needed for the computation of w. This is also -// obtained by scaling the computations. -// -// Two shifting constants are loaded directly with movl and setf.d. -// 1. fRSHF_2TO56 = 1.1000..00 * 2^(63-7) -// This constant is added to x*1/ln2 to shift the integer part of -// x*128/ln2 into the rightmost bits of the significand. -// The result of this fma is fW_2TO56_RSH. -// 2. fRSHF = 1.1000..00 * 2^(63) -// This constant is subtracted from fW_2TO56_RSH * 2^(-56) to give -// the integer part of w, n, as a floating-point number. -// The result of this fms is fNfloat. - - -LOCAL_OBJECT_START(exp_Table_1) -data8 0x40862e42fefa39f0 // smallest dbl overflow arg -data8 0xc048000000000000 // approx largest arg for minus one result -data8 0x40862e42fefa39ef // largest dbl arg to give normal dbl result -data8 0x0 // pad -data8 0xb17217f7d1cf79ab , 0x00003ff7 // ln2/128 hi -data8 0xc9e3b39803f2f6af , 0x00003fb7 // ln2/128 lo -// -// Table 1 is 2^(index_1/128) where -// index_1 goes from 0 to 15 -// -data8 0x8000000000000000 , 0x00003FFF -data8 0x80B1ED4FD999AB6C , 0x00003FFF -data8 0x8164D1F3BC030773 , 0x00003FFF -data8 0x8218AF4373FC25EC , 0x00003FFF -data8 0x82CD8698AC2BA1D7 , 0x00003FFF -data8 0x8383594EEFB6EE37 , 0x00003FFF -data8 0x843A28C3ACDE4046 , 0x00003FFF -data8 0x84F1F656379C1A29 , 0x00003FFF -data8 0x85AAC367CC487B15 , 0x00003FFF -data8 0x8664915B923FBA04 , 0x00003FFF -data8 0x871F61969E8D1010 , 0x00003FFF -data8 0x87DB357FF698D792 , 0x00003FFF -data8 0x88980E8092DA8527 , 0x00003FFF -data8 0x8955EE03618E5FDD , 0x00003FFF -data8 0x8A14D575496EFD9A , 0x00003FFF -data8 0x8AD4C6452C728924 , 0x00003FFF -LOCAL_OBJECT_END(exp_Table_1) - -// Table 2 is 2^(index_1/8) where -// index_2 goes from 0 to 7 -LOCAL_OBJECT_START(exp_Table_2) -data8 0x8000000000000000 , 0x00003FFF -data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF -data8 0x9837F0518DB8A96F , 0x00003FFF -data8 0xA5FED6A9B15138EA , 0x00003FFF -data8 0xB504F333F9DE6484 , 0x00003FFF -data8 0xC5672A115506DADD , 0x00003FFF -data8 0xD744FCCAD69D6AF4 , 0x00003FFF -data8 0xEAC0C6E7DD24392F , 0x00003FFF -LOCAL_OBJECT_END(exp_Table_2) - - -LOCAL_OBJECT_START(exp_p_table) -data8 0x3f8111116da21757 //P5 -data8 0x3fa55555d787761c //P4 -data8 0x3fc5555555555414 //P3 -data8 0x3fdffffffffffd6a //P2 -LOCAL_OBJECT_END(exp_p_table) - -LOCAL_OBJECT_START(exp_Q1_table) -data8 0x3de6124613a86d09 // QD = 1/13! -data8 0x3e21eed8eff8d898 // QC = 1/12! -data8 0x3ec71de3a556c734 // Q9 = 1/9! -data8 0x3efa01a01a01a01a // Q8 = 1/8! -data8 0x8888888888888889,0x3ff8 // Q5 = 1/5! -data8 0xaaaaaaaaaaaaaaab,0x3ffc // Q3 = 1/3! -data8 0x0,0x0 // Pad to avoid bank conflicts -LOCAL_OBJECT_END(exp_Q1_table) - -LOCAL_OBJECT_START(exp_Q2_table) -data8 0x3e5ae64567f544e4 // QB = 1/11! -data8 0x3e927e4fb7789f5c // QA = 1/10! -data8 0x3f2a01a01a01a01a // Q7 = 1/7! -data8 0x3f56c16c16c16c17 // Q6 = 1/6! -data8 0xaaaaaaaaaaaaaaab,0x3ffa // Q4 = 1/4! -data8 0x8000000000000000,0x3ffe // Q2 = 1/2! -LOCAL_OBJECT_END(exp_Q2_table) +// ********************************************************************* +// +// Function: Combined exp(x) and expm1(x), where +// x +// exp(x) = e , for double precision x values +// x +// expm1(x) = e - 1 for double precision x values +// +// ********************************************************************* +// +// Accuracy: Within .7 ulps for 80-bit floating point values +// Very accurate for double precision values +// +// ********************************************************************* +// +// Resources Used: +// +// Floating-Point Registers: f8 (Input and Return Value) +// f9,f32-f61, f99-f102 +// +// General Purpose Registers: +// r32-r61 +// r62-r65 (Used to pass arguments to error handling routine) +// +// Predicate Registers: p6-p15 +// +// ********************************************************************* +// +// IEEE Special Conditions: +// +// Denormal fault raised on denormal inputs +// Overflow exceptions raised when appropriate for exp and expm1 +// Underflow exceptions raised when appropriate for exp and expm1 +// (Error Handling Routine called for overflow and Underflow) +// Inexact raised when appropriate by algorithm +// +// exp(inf) = inf +// exp(-inf) = +0 +// exp(SNaN) = QNaN +// exp(QNaN) = QNaN +// exp(0) = 1 +// exp(EM_special Values) = QNaN +// exp(inf) = inf +// expm1(-inf) = -1 +// expm1(SNaN) = QNaN +// expm1(QNaN) = QNaN +// expm1(0) = 0 +// expm1(EM_special Values) = QNaN +// +// ********************************************************************* +// +// Implementation and Algorithm Notes: +// +// ker_exp_64( in_FR : X, +// in_GR : Flag, +// in_GR : Expo_Range +// out_FR : Y_hi, +// out_FR : Y_lo, +// out_FR : scale, +// out_PR : Safe ) +// +// On input, X is in register format and +// Flag = 0 for exp, +// Flag = 1 for expm1, +// +// On output, provided X and X_cor are real numbers, then +// +// scale*(Y_hi + Y_lo) approximates exp(X) if Flag is 0 +// scale*(Y_hi + Y_lo) approximates exp(X)-1 if Flag is 1 +// +// The accuracy is sufficient for a highly accurate 64 sig. +// bit implementation. Safe is set if there is no danger of +// overflow/underflow when the result is composed from scale, +// Y_hi and Y_lo. Thus, we can have a fast return if Safe is set. +// Otherwise, one must prepare to handle the possible exception +// appropriately. Note that SAFE not set (false) does not mean +// that overflow/underflow will occur; only the setting of SAFE +// guarantees the opposite. +// +// **** High Level Overview **** +// +// The method consists of three cases. +// +// If |X| < Tiny use case exp_tiny; +// else if |X| < 2^(-6) use case exp_small; +// else use case exp_regular; +// +// Case exp_tiny: +// +// 1 + X can be used to approximate exp(X) or exp(X+X_cor); +// X + X^2/2 can be used to approximate exp(X) - 1 +// +// Case exp_small: +// +// Here, exp(X), exp(X+X_cor), and exp(X) - 1 can all be +// appproximated by a relatively simple polynomial. +// +// This polynomial resembles the truncated Taylor series +// +// exp(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n! +// +// Case exp_regular: +// +// Here we use a table lookup method. The basic idea is that in +// order to compute exp(X), we accurately decompose X into +// +// X = N * log(2)/(2^12) + r, |r| <= log(2)/2^13. +// +// Hence +// +// exp(X) = 2^( N / 2^12 ) * exp(r). +// +// The value 2^( N / 2^12 ) is obtained by simple combinations +// of values calculated beforehand and stored in table; exp(r) +// is approximated by a short polynomial because |r| is small. +// +// We elaborate this method in 4 steps. +// +// Step 1: Reduction +// +// The value 2^12/log(2) is stored as a double-extended number +// L_Inv. +// +// N := round_to_nearest_integer( X * L_Inv ) +// +// The value log(2)/2^12 is stored as two numbers L_hi and L_lo so +// that r can be computed accurately via +// +// r := (X - N*L_hi) - N*L_lo +// +// We pick L_hi such that N*L_hi is representable in 64 sig. bits +// and thus the FMA X - N*L_hi is error free. So r is the +// 1 rounding error from an exact reduction with respect to +// +// L_hi + L_lo. +// +// In particular, L_hi has 30 significant bit and can be stored +// as a double-precision number; L_lo has 64 significant bits and +// stored as a double-extended number. +// +// In the case Flag = 2, we further modify r by +// +// r := r + X_cor. +// +// Step 2: Approximation +// +// exp(r) - 1 is approximated by a short polynomial of the form +// +// r + A_1 r^2 + A_2 r^3 + A_3 r^4 . +// +// Step 3: Composition from Table Values +// +// The value 2^( N / 2^12 ) can be composed from a couple of tables +// of precalculated values. First, express N as three integers +// K, M_1, and M_2 as +// +// N = K * 2^12 + M_1 * 2^6 + M_2 +// +// Where 0 <= M_1, M_2 < 2^6; and K can be positive or negative. +// When N is represented in 2's complement, M_2 is simply the 6 +// lsb's, M_1 is the next 6, and K is simply N shifted right +// arithmetically (sign extended) by 12 bits. +// +// Now, 2^( N / 2^12 ) is simply +// +// 2^K * 2^( M_1 / 2^6 ) * 2^( M_2 / 2^12 ) +// +// Clearly, 2^K needs no tabulation. The other two values are less +// trivial because if we store each accurately to more than working +// precision, than its product is too expensive to calculate. We +// use the following method. +// +// Define two mathematical values, delta_1 and delta_2, implicitly +// such that +// +// T_1 = exp( [M_1 log(2)/2^6] - delta_1 ) +// T_2 = exp( [M_2 log(2)/2^12] - delta_2 ) +// +// are representable as 24 significant bits. To illustrate the idea, +// we show how we define delta_1: +// +// T_1 := round_to_24_bits( exp( M_1 log(2)/2^6 ) ) +// delta_1 = (M_1 log(2)/2^6) - log( T_1 ) +// +// The last equality means mathematical equality. We then tabulate +// +// W_1 := exp(delta_1) - 1 +// W_2 := exp(delta_2) - 1 +// +// Both in double precision. +// +// From the tabulated values T_1, T_2, W_1, W_2, we compose the values +// T and W via +// +// T := T_1 * T_2 ...exactly +// W := W_1 + (1 + W_1)*W_2 +// +// W approximates exp( delta ) - 1 where delta = delta_1 + delta_2. +// The mathematical product of T and (W+1) is an accurate representation +// of 2^(M_1/2^6) * 2^(M_2/2^12). +// +// Step 4. Reconstruction +// +// Finally, we can reconstruct exp(X), exp(X) - 1. +// Because +// +// X = K * log(2) + (M_1*log(2)/2^6 - delta_1) +// + (M_2*log(2)/2^12 - delta_2) +// + delta_1 + delta_2 + r ...accurately +// We have +// +// exp(X) ~=~ 2^K * ( T + T*[exp(delta_1+delta_2+r) - 1] ) +// ~=~ 2^K * ( T + T*[exp(delta + r) - 1] ) +// ~=~ 2^K * ( T + T*[(exp(delta)-1) +// + exp(delta)*(exp(r)-1)] ) +// ~=~ 2^K * ( T + T*( W + (1+W)*poly(r) ) ) +// ~=~ 2^K * ( Y_hi + Y_lo ) +// +// where Y_hi = T and Y_lo = T*(W + (1+W)*poly(r)) +// +// For exp(X)-1, we have +// +// exp(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1 +// ~=~ 2^K * ( Y_hi + Y_lo - 2^(-K) ) +// +// and we combine Y_hi + Y_lo - 2^(-N) into the form of two +// numbers Y_hi + Y_lo carefully. +// +// **** Algorithm Details **** +// +// A careful algorithm must be used to realize the mathematical ideas +// accurately. We describe each of the three cases. We assume SAFE +// is preset to be TRUE. +// +// Case exp_tiny: +// +// The important points are to ensure an accurate result under +// different rounding directions and a correct setting of the SAFE +// flag. +// +// If Flag is 1, then +// SAFE := False ...possibility of underflow +// Scale := 1.0 +// Y_hi := X +// Y_lo := 2^(-17000) +// Else +// Scale := 1.0 +// Y_hi := 1.0 +// Y_lo := X ...for different rounding modes +// Endif +// +// Case exp_small: +// +// Here we compute a simple polynomial. To exploit parallelism, we split +// the polynomial into several portions. +// +// Let r = X +// +// If Flag is not 1 ...i.e. exp( argument ) +// +// rsq := r * r; +// r4 := rsq*rsq +// poly_lo := P_3 + r*(P_4 + r*(P_5 + r*P_6)) +// poly_hi := r + rsq*(P_1 + r*P_2) +// Y_lo := poly_hi + r4 * poly_lo +// set lsb(Y_lo) to 1 +// Y_hi := 1.0 +// Scale := 1.0 +// +// Else ...i.e. exp( argument ) - 1 +// +// rsq := r * r +// r4 := rsq * rsq +// r6 := rsq * r4 +// poly_lo := r6*(Q_5 + r*(Q_6 + r*Q_7)) +// poly_hi := Q_1 + r*(Q_2 + r*(Q_3 + r*Q_4)) +// Y_lo := rsq*poly_hi + poly_lo +// set lsb(Y_lo) to 1 +// Y_hi := X +// Scale := 1.0 +// +// Endif +// +// Case exp_regular: +// +// The previous description contain enough information except the +// computation of poly and the final Y_hi and Y_lo in the case for +// exp(X)-1. +// +// The computation of poly for Step 2: +// +// rsq := r*r +// poly := r + rsq*(A_1 + r*(A_2 + r*A_3)) +// +// For the case exp(X) - 1, we need to incorporate 2^(-K) into +// Y_hi and Y_lo at the end of Step 4. +// +// If K > 10 then +// Y_lo := Y_lo - 2^(-K) +// Else +// If K < -10 then +// Y_lo := Y_hi + Y_lo +// Y_hi := -2^(-K) +// Else +// Y_hi := Y_hi - 2^(-K) +// End If +// End If +// +#include "libm_support.h" + +GR_SAVE_PFS = r59 +GR_SAVE_B0 = r60 +GR_SAVE_GP = r61 + +GR_Parameter_X = r62 +GR_Parameter_Y = r63 +GR_Parameter_RESULT = r64 + +FR_X = f9 +FR_Y = f1 +FR_RESULT = f99 + +#ifdef _LIBC +.rodata +#else +.data +#endif + +.align 64 +Constants_exp_64_Arg: +ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object) +data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000 +data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000 +data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000 +// /* Inv_L, L_hi, L_lo */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg) + +.align 64 +Constants_exp_64_Exponents: +ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object) +data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF +data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF +data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF +data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF +data4 0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF +data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF +ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents) + +.align 64 +Constants_exp_64_A: +ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object) +data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000 +data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000 +data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000 +// /* Reversed */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_A) + +.align 64 +Constants_exp_64_P: +ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object) +data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000 +data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000 +data4 0x7474C518,0x88888888,0x00003FF8,0x00000000 +data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000 +data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000 +data4 0x000004C7,0x80000000,0x00003FFE,0x00000000 +// /* Reversed */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_P) + +.align 64 +Constants_exp_64_Q: +ASM_TYPE_DIRECTIVE(Constants_exp_64_Q,@object) +data4 0xA49EF6CA,0xD00D56F7,0x00003FEF,0x00000000 +data4 0x1C63493D,0xD00D59AB,0x00003FF2,0x00000000 +data4 0xFB50CDD2,0xB60B60B5,0x00003FF5,0x00000000 +data4 0x7BA68DC8,0x88888888,0x00003FF8,0x00000000 +data4 0xAAAAAC8D,0xAAAAAAAA,0x00003FFA,0x00000000 +data4 0xAAAAACCA,0xAAAAAAAA,0x00003FFC,0x00000000 +data4 0x00000000,0x80000000,0x00003FFE,0x00000000 +// /* Reversed */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_Q) + +.align 64 +Constants_exp_64_T1: +ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object) +data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29 +data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5 +data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC +data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D +data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA +data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516 +data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A +data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4 +data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B +data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD +data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15 +data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B +data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5 +data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A +data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177 +data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C +ASM_SIZE_DIRECTIVE(Constants_exp_64_T1) + +.align 64 +Constants_exp_64_T2: +ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object) +data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4 +data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7 +data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E +data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349 +data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987 +data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA +data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610 +data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A +data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8 +data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA +data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50 +data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA +data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07 +data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269 +data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE +data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37 +ASM_SIZE_DIRECTIVE(Constants_exp_64_T2) + +.align 64 +Constants_exp_64_W1: +ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object) +data4 0x00000000,0x00000000,0x171EC4B4,0xBE384454 +data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6 +data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA +data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50 +data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2 +data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE +data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B +data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04 +data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419 +data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376 +data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A +data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB +data4 0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E +data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA +data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08 +data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B +data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75 +data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79 +data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7 +data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087 +data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB +data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643 +data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C +data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D +data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873 +data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F +data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861 +data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0 +data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC +data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB +data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB +data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148 +ASM_SIZE_DIRECTIVE(Constants_exp_64_W1) + +.align 64 +Constants_exp_64_W2: +ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object) +data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25 +data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8 +data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A +data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E +data4 0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9 +data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2 +data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0 +data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509 +data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33 +data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D +data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87 +data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3 +data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9 +data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F +data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82 +data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4 +data4 0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D +data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030 +data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29 +data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED +data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B +data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893 +data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35 +data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C +data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313 +data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE +data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426 +data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550 +data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4 +data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31 +data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE +data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9 +ASM_SIZE_DIRECTIVE(Constants_exp_64_W2) .section .text -GLOBAL_IEEE754_ENTRY(expm1) +.proc expm1# +.global expm1# +.align 64 -{ .mlx - getf.exp rSignexp_x = f8 // Must recompute if x unorm - movl rSig_inv_ln2 = 0xb8aa3b295c17f0bc // signif of 1/ln2 -} -{ .mlx - addl rAD_TB1 = @ltoff(exp_Table_1), gp - movl rRshf_2to56 = 0x4768000000000000 // 1.10000 2^(63+56) +expm1: +#ifdef _LIBC +.global __expm1# +__expm1: +#endif + + +{ .mii + alloc r32 = ar.pfs,0,30,4,0 +(p0) add r33 = 1, r0 +(p0) cmp.eq.unc p7, p0 = r0, r0 } ;; -// We do this fnorm right at the beginning to normalize -// any input unnormals so that SWA is not taken. + +// +// Set p7 true for expm1 +// Set Flag = r33 = 1 for expm1 +// These are really no longer necesary, but are a remnant +// when this file had multiple entry points. +// They should be carefully removed + + + { .mfi - ld8 rAD_TB1 = [rAD_TB1] - fclass.m p6,p0 = f8,0x0b // Test for x=unorm - mov rExp_mask = 0x1ffff +(p0) add r32 = 1,r0 +(p0) fnorm.s1 f9 = f8 + nop.i 999 } + + { .mfi - mov rExp_bias = 0xffff - fnorm.s1 fNormX = f8 - mov rExp_2tom56 = 0xffff-56 + nop.m 999 +(p0) fclass.m.unc p6, p8 = f8, 0x1E7 + nop.i 999 } -;; -// Form two constants we need -// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128 -// 1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand +{ .mfi + nop.m 999 +(p0) fclass.nm.unc p9, p0 = f8, 0x1FF + nop.i 999 +} { .mfi - setf.sig fINV_LN2_2TO63 = rSig_inv_ln2 // form 1/ln2 * 2^63 - fclass.m p8,p0 = f8,0x07 // Test for x=0 - nop.i 0 + nop.m 999 +(p0) mov f36 = f1 + nop.i 999 ;; +} + +// +// Identify NatVals, NaNs, Infs, and Zeros. +// Identify EM unsupporteds. +// Save special input registers +// +// Create FR_X_cor = 0.0 +// GR_Flag = 0 +// GR_Expo_Range = 1 +// FR_Scale = 1.0 +// + +{ .mfb + nop.m 999 +(p0) mov f32 = f0 +(p6) br.cond.spnt EXP_64_SPECIAL ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p9) br.cond.spnt EXP_64_UNSUPPORTED ;; +} + +// +// Branch out for special input values +// + +{ .mfi +(p0) cmp.ne.unc p12, p13 = 0x01, r33 +(p0) fcmp.lt.unc.s0 p9,p0 = f8, f0 +(p0) cmp.eq.unc p15, p0 = r0, r0 +} + +// +// Raise possible denormal operand exception +// Normalize x +// +// This function computes exp( x + x_cor) +// Input FR 1: FR_X +// Input FR 2: FR_X_cor +// Input GR 1: GR_Flag +// Input GR 2: GR_Expo_Range +// Output FR 3: FR_Y_hi +// Output FR 4: FR_Y_lo +// Output FR 5: FR_Scale +// Output PR 1: PR_Safe + +// +// Prepare to load constants +// Set Safe = True +// + +{ .mmi +(p0) addl r34 = @ltoff(Constants_exp_64_Arg#), gp +(p0) addl r40 = @ltoff(Constants_exp_64_W1#), gp +(p0) addl r41 = @ltoff(Constants_exp_64_W2#), gp } +;; + +{ .mmi + ld8 r34 = [r34] + ld8 r40 = [r40] +(p0) addl r50 = @ltoff(Constants_exp_64_T1#), gp +} +;; + + +{ .mmi + ld8 r41 = [r41] +(p0) ldfe f37 = [r34],16 +(p0) addl r51 = @ltoff(Constants_exp_64_T2#), gp +} +;; + +// +// N = fcvt.fx(float_N) +// Set p14 if -6 > expo_X +// + + +// +// Bias = 0x0FFFF +// expo_X = expo_X and Mask +// + +// +// Load L_lo +// Set p10 if 14 < expo_X +// + +{ .mmi + ld8 r50 = [r50] +(p0) ldfe f40 = [r34],16 + nop.i 999 +} +;; + { .mlx - setf.d fRSHF_2TO56 = rRshf_2to56 // Form 1.100 * 2^(63+56) - movl rRshf = 0x43e8000000000000 // 1.10000 2^63 for rshift + nop.m 999 +(p0) movl r58 = 0x0FFFF } ;; -{ .mfi - setf.exp f2TOM56 = rExp_2tom56 // form 2^-56 for scaling Nfloat - fclass.m p9,p0 = f8,0x22 // Test for x=-inf - add rAD_TB2 = 0x140, rAD_TB1 // Point to Table 2 +// +// Load W2_ptr +// Branch to SMALL is expo_X < -6 +// + +// +// float_N = X * L_Inv +// expo_X = exponent of X +// Mask = 0x1FFFF +// + +{ .mmi + ld8 r51 = [r51] +(p0) ldfe f41 = [r34],16 } -{ .mib - add rAD_Q1 = 0x1e0, rAD_TB1 // Point to Q table for small path - add rAD_Ln2_lo = 0x30, rAD_TB1 // Point to ln2_by_128_lo -(p6) br.cond.spnt EXPM1_UNORM // Branch if x unorm +;; + +{ .mlx +(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp +(p0) movl r39 = 0x1FFFF +} +;; + +{ .mmi + ld8 r34 = [r34] +(p0) getf.exp r37 = f9 + nop.i 999 } ;; -EXPM1_COMMON: +{ .mii + nop.m 999 + nop.i 999 +(p0) and r37 = r37, r39 ;; +} + +{ .mmi +(p0) sub r37 = r37, r58 ;; +(p0) cmp.gt.unc p14, p0 = -6, r37 +(p0) cmp.lt.unc p10, p0 = 14, r37 ;; +} + { .mfi - ldfpd fMIN_DBL_OFLOW_ARG, fMAX_DBL_MINUS_1_ARG = [rAD_TB1],16 - fclass.m p10,p0 = f8,0x1e1 // Test for x=+inf, NaN, NaT - add rAD_Q2 = 0x50, rAD_Q1 // Point to Q table for small path + nop.m 999 +// +// Load L_inv +// Set p12 true for Flag = 0 (exp) +// Set p13 true for Flag = 1 (expm1) +// +(p0) fmpy.s1 f38 = f9, f37 + nop.i 999 ;; } + { .mfb - nop.m 0 - nop.f 0 -(p8) br.ret.spnt b0 // Exit for x=0, return x + nop.m 999 +// +// Load L_hi +// expo_X = expo_X - Bias +// get W1_ptr +// +(p0) fcvt.fx.s1 f39 = f38 +(p14) br.cond.spnt EXP_SMALL ;; +} + +{ .mib + nop.m 999 + nop.i 999 +(p10) br.cond.spnt EXP_HUGE ;; +} + +{ .mmi +(p0) shladd r34 = r32,4,r34 +(p0) addl r35 = @ltoff(Constants_exp_64_A#), gp + nop.i 999 } ;; +{ .mmi + ld8 r35 = [r35] + nop.m 999 + nop.i 999 +} +;; + +// +// Load T_1,T_2 +// + +{ .mmb +(p0) ldfe f51 = [r35],16 +(p0) ld8 r45 = [r34],8 + nop.b 999 ;; +} +// +// Set Safe = True if k >= big_expo_neg +// Set Safe = False if k < big_expo_neg +// + +{ .mmb +(p0) ldfe f49 = [r35],16 +(p0) ld8 r48 = [r34],0 + nop.b 999 ;; +} + { .mfi - ldfd fMAX_DBL_NORM_ARG = [rAD_TB1],16 - nop.f 0 - and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x + nop.m 999 +// +// Branch to HUGE is expo_X > 14 +// +(p0) fcvt.xf f38 = f39 + nop.i 999 ;; } -{ .mfb - setf.d fRSHF = rRshf // Form right shift const 1.100 * 2^63 -(p9) fms.d.s0 f8 = f0,f0,f1 // quick exit for x=-inf -(p9) br.ret.spnt b0 + +{ .mfi +(p0) getf.sig r52 = f39 + nop.f 999 + nop.i 999 ;; +} + +{ .mii + nop.m 999 +(p0) extr.u r43 = r52, 6, 6 ;; +// +// r = r - float_N * L_lo +// K = extr(N_fix,12,52) +// +(p0) shladd r40 = r43,3,r40 ;; } -;; { .mfi - ldfpd fQD, fQC = [rAD_Q1], 16 // Load coeff for small path - nop.f 0 - sub rExp_x = rExp_x, rExp_bias // True exponent of x +(p0) shladd r50 = r43,2,r50 +(p0) fnma.s1 f42 = f40, f38, f9 +// +// float_N = float(N) +// N_fix = signficand N +// +(p0) extr.u r42 = r52, 0, 6 } -{ .mfb - ldfpd fQB, fQA = [rAD_Q2], 16 // Load coeff for small path -(p10) fma.d.s0 f8 = f8, f1, f0 // For x=+inf, NaN, NaT -(p10) br.ret.spnt b0 // Exit for x=+inf, NaN, NaT + +{ .mmi +(p0) ldfd f43 = [r40],0 ;; +(p0) shladd r41 = r42,3,r41 +(p0) shladd r51 = r42,2,r51 +} +// +// W_1_p1 = 1 + W_1 +// + +{ .mmi +(p0) ldfs f44 = [r50],0 ;; +(p0) ldfd f45 = [r41],0 +// +// M_2 = extr(N_fix,0,6) +// M_1 = extr(N_fix,6,6) +// r = X - float_N * L_hi +// +(p0) extr r44 = r52, 12, 52 +} + +{ .mmi +(p0) ldfs f46 = [r51],0 ;; +(p0) sub r46 = r58, r44 +(p0) cmp.gt.unc p8, p15 = r44, r45 +} +// +// W = W_1 + W_1_p1*W_2 +// Load A_2 +// Bias_m_K = Bias - K +// + +{ .mii +(p0) ldfe f40 = [r35],16 +// +// load A_1 +// poly = A_2 + r*A_3 +// rsq = r * r +// neg_2_mK = exponent of Bias_m_k +// +(p0) add r47 = r58, r44 ;; +// +// Set Safe = True if k <= big_expo_pos +// Set Safe = False if k > big_expo_pos +// Load A_3 +// +(p15) cmp.lt p8,p15 = r44,r48 ;; +} + +{ .mmf +(p0) setf.exp f61 = r46 +// +// Bias_p + K = Bias + K +// T = T_1 * T_2 +// +(p0) setf.exp f36 = r47 +(p0) fnma.s1 f42 = f41, f38, f42 ;; } -;; { .mfi - ldfpd fQ9, fQ8 = [rAD_Q1], 16 // Load coeff for small path - fma.s1 fXsq = fNormX, fNormX, f0 // x*x for small path - cmp.gt p7, p8 = -2, rExp_x // Test |x| < 2^(-2) + nop.m 999 +// +// Load W_1,W_2 +// Load big_exp_pos, load big_exp_neg +// +(p0) fadd.s1 f47 = f43, f1 + nop.i 999 ;; } + { .mfi - ldfpd fQ7, fQ6 = [rAD_Q2], 16 // Load coeff for small path - nop.f 0 - nop.i 0 + nop.m 999 +(p0) fma.s1 f52 = f42, f51, f49 + nop.i 999 } -;; { .mfi - ldfe fQ5 = [rAD_Q1], 16 // Load coeff for small path - nop.f 0 - nop.i 0 + nop.m 999 +(p0) fmpy.s1 f48 = f42, f42 + nop.i 999 ;; } -{ .mib - ldfe fQ4 = [rAD_Q2], 16 // Load coeff for small path -(p7) cmp.gt.unc p6, p7 = -60, rExp_x // Test |x| < 2^(-60) -(p7) br.cond.spnt EXPM1_SMALL // Branch if 2^-60 <= |x| < 2^-2 + +{ .mfi + nop.m 999 +(p0) fmpy.s1 f53 = f44, f46 + nop.i 999 ;; } -;; -// W = X * Inv_log2_by_128 -// By adding 1.10...0*2^63 we shift and get round_int(W) in significand. -// We actually add 1.10...0*2^56 to X * Inv_log2 to do the same thing. +{ .mfi + nop.m 999 +(p0) fma.s1 f54 = f45, f47, f43 + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fneg f61 = f61 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fma.s1 f52 = f42, f52, f40 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fadd.s1 f55 = f54, f1 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// W + Wp1 * poly +// +(p0) mov f34 = f53 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// A_1 + r * poly +// Scale = setf_exp(Bias_p_k) +// +(p0) fma.s1 f52 = f48, f52, f42 + nop.i 999 ;; +} { .mfi - ldfe fLn2_by_128_hi = [rAD_TB1],32 - fma.s1 fW_2TO56_RSH = fNormX, fINV_LN2_2TO63, fRSHF_2TO56 - nop.i 0 + nop.m 999 +// +// poly = r + rsq(A_1 + r*poly) +// Wp1 = 1 + W +// neg_2_mK = -neg_2_mK +// +(p0) fma.s1 f35 = f55, f52, f54 + nop.i 999 ;; } + { .mfb - ldfe fLn2_by_128_lo = [rAD_Ln2_lo] -(p6) fma.d.s0 f8 = f8, f8, f8 // If x < 2^-60, result=x+x*x -(p6) br.ret.spnt b0 // Exit if x < 2^-60 + nop.m 999 +(p0) fmpy.s1 f35 = f35, f53 +// +// Y_hi = T +// Y_lo = T * (W + Wp1*poly) +// +(p12) br.cond.sptk EXP_MAIN ;; } -;; +// +// Branch if exp(x) +// Continue for exp(x-1) +// -// Divide arguments into the following categories: -// Certain minus one p11 - -inf < x <= MAX_DBL_MINUS_1_ARG -// Possible Overflow p14 - MAX_DBL_NORM_ARG < x < MIN_DBL_OFLOW_ARG -// Certain Overflow p15 - MIN_DBL_OFLOW_ARG <= x < +inf +{ .mii +(p0) cmp.lt.unc p12, p13 = 10, r44 + nop.i 999 ;; +// +// Set p12 if 10 < K, Else p13 +// +(p13) cmp.gt.unc p13, p14 = -10, r44 ;; +} // -// If the input is really a double arg, then there will never be "Possible -// Overflow" arguments. +// K > 10: Y_lo = Y_lo + neg_2_mK +// K <=10: Set p13 if -10 > K, Else set p14 // -// After that last load, rAD_TB1 points to the beginning of table 1 +{ .mfi +(p13) cmp.eq p15, p0 = r0, r0 +(p14) fadd.s1 f34 = f61, f34 + nop.i 999 ;; +} { .mfi - nop.m 0 - fcmp.ge.s1 p15,p14 = fNormX,fMIN_DBL_OFLOW_ARG - nop.i 0 + nop.m 999 +(p12) fadd.s1 f35 = f35, f61 + nop.i 999 ;; } -;; { .mfi - add rAD_P = 0x80, rAD_TB2 - fcmp.le.s1 p11,p0 = fNormX,fMAX_DBL_MINUS_1_ARG - nop.i 0 + nop.m 999 +(p13) fadd.s1 f35 = f35, f34 + nop.i 999 } -;; { .mfb - ldfpd fP5, fP4 = [rAD_P] ,16 -(p14) fcmp.gt.unc.s1 p14,p0 = fNormX,fMAX_DBL_NORM_ARG -(p15) br.cond.spnt EXPM1_CERTAIN_OVERFLOW + nop.m 999 +// +// K <= 10 and K < -10, Set Safe = True +// K <= 10 and K < 10, Y_lo = Y_hi + Y_lo +// K <= 10 and K > =-10, Y_hi = Y_hi + neg_2_mk +// +(p13) mov f34 = f61 +(p0) br.cond.sptk EXP_MAIN ;; +} +EXP_SMALL: + +{ .mmi +(p12) addl r35 = @ltoff(Constants_exp_64_P#), gp +(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp + nop.i 999 } ;; -// Nfloat = round_int(W) -// The signficand of fW_2TO56_RSH contains the rounded integer part of W, -// as a twos complement number in the lower bits (that is, it may be negative). -// That twos complement number (called N) is put into rN. +{ .mmi +(p12) ld8 r35 = [r35] + ld8 r34 = [r34] + nop.i 999 +} +;; -// Since fW_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56 -// before the shift constant 1.10000 * 2^63 is subtracted to yield fNfloat. -// Thus, fNfloat contains the floating point version of N -{ .mfb - ldfpd fP3, fP2 = [rAD_P] - fms.s1 fNfloat = fW_2TO56_RSH, f2TOM56, fRSHF -(p11) br.cond.spnt EXPM1_CERTAIN_MINUS_ONE +{ .mmi +(p13) addl r35 = @ltoff(Constants_exp_64_Q#), gp + nop.m 999 + nop.i 999 } ;; + +// +// Return +// K <= 10 and K < 10, Y_hi = neg_2_mk +// +// /*******************************************************/ +// /*********** Branch EXP_SMALL *************************/ +// /*******************************************************/ + { .mfi - getf.sig rN = fW_2TO56_RSH - nop.f 0 - nop.i 0 +(p13) ld8 r35 = [r35] +(p0) mov f42 = f9 +(p0) add r34 = 0x48,r34 } ;; -// rIndex_1 has index_1 -// rIndex_2_16 has index_2 * 16 -// rBiased_M has M -// rIndex_1_16 has index_1 * 16 +// +// Flag = 0 +// r4 = rsq * rsq +// + +{ .mfi +(p0) ld8 r49 =[r34],0 + nop.f 999 + nop.i 999 ;; +} + +{ .mii + nop.m 999 + nop.i 999 ;; +// +// Flag = 1 +// +(p0) cmp.lt.unc p14, p0 = r37, r49 ;; +} + +{ .mfi + nop.m 999 +// +// r = X +// +(p0) fmpy.s1 f48 = f42, f42 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +// +// rsq = r * r +// +(p0) fmpy.s1 f50 = f48, f48 +// +// Is input very small? +// +(p14) br.cond.spnt EXP_VERY_SMALL ;; +} +// +// Flag_not1: Y_hi = 1.0 +// Flag is 1: r6 = rsq * r4 +// -// r = x - Nfloat * ln2_by_128_hi -// f = 1 - Nfloat * ln2_by_128_lo { .mfi - and rIndex_1 = 0x0f, rN - fnma.s1 fR = fNfloat, fLn2_by_128_hi, fNormX - shr rM = rN, 0x7 +(p12) ldfe f52 = [r35],16 +(p12) mov f34 = f1 +(p0) add r53 = 0x1,r0 ;; } + { .mfi - and rIndex_2_16 = 0x70, rN - fnma.s1 fF = fNfloat, fLn2_by_128_lo, f1 - nop.i 0 +(p13) ldfe f51 = [r35],16 +// +// Flag_not_1: Y_lo = poly_hi + r4 * poly_lo +// +(p13) mov f34 = f9 + nop.i 999 ;; } -;; -// rAD_T1 has address of T1 -// rAD_T2 has address if T2 +{ .mmf +(p12) ldfe f53 = [r35],16 +// +// For Flag_not_1, Y_hi = X +// Scale = 1 +// Create 0x000...01 +// +(p0) setf.sig f37 = r53 +(p0) mov f36 = f1 ;; +} { .mmi - add rBiased_M = rExp_bias, rM - add rAD_T2 = rAD_TB2, rIndex_2_16 - shladd rAD_T1 = rIndex_1, 4, rAD_TB1 +(p13) ldfe f52 = [r35],16 ;; +(p12) ldfe f54 = [r35],16 + nop.i 999 ;; } -;; -// Create Scale = 2^M -// Load T1 and T2 +{ .mfi +(p13) ldfe f53 = [r35],16 +(p13) fmpy.s1 f58 = f48, f50 + nop.i 999 ;; +} +// +// Flag_not1: poly_lo = P_5 + r*P_6 +// Flag_1: poly_lo = Q_6 + r*Q_7 +// + { .mmi - setf.exp f2M = rBiased_M - ldfe fT2 = [rAD_T2] - nop.i 0 +(p13) ldfe f54 = [r35],16 ;; +(p12) ldfe f55 = [r35],16 + nop.i 999 ;; +} + +{ .mmi +(p12) ldfe f56 = [r35],16 ;; +(p13) ldfe f55 = [r35],16 + nop.i 999 ;; +} + +{ .mmi +(p12) ldfe f57 = [r35],0 ;; +(p13) ldfe f56 = [r35],16 + nop.i 999 ;; } -;; { .mfi - ldfe fT1 = [rAD_T1] - fmpy.s0 fTmp = fLn2_by_128_lo, fLn2_by_128_lo // Force inexact - nop.i 0 +(p13) ldfe f57 = [r35],0 + nop.f 999 + nop.i 999 ;; } -;; { .mfi - nop.m 0 - fma.s1 fP54 = fR, fP5, fP4 - nop.i 0 + nop.m 999 +// +// For Flag_not_1, load p5,p6,p1,p2 +// Else load p5,p6,p1,p2 +// +(p12) fma.s1 f60 = f52, f42, f53 + nop.i 999 ;; } + { .mfi - nop.m 0 - fma.s1 fP32 = fR, fP3, fP2 - nop.i 0 + nop.m 999 +(p13) fma.s1 f60 = f51, f42, f52 + nop.i 999 ;; } -;; { .mfi - nop.m 0 - fma.s1 fRsq = fR, fR, f0 - nop.i 0 + nop.m 999 +(p12) fma.s1 f60 = f60, f42, f54 + nop.i 999 ;; } -;; { .mfi - nop.m 0 - fma.s1 fP5432 = fRsq, fP54, fP32 - nop.i 0 + nop.m 999 +(p12) fma.s1 f59 = f56, f42, f57 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p13) fma.s1 f60 = f42, f60, f53 + nop.i 999 ;; } -;; { .mfi - nop.m 0 - fma.s1 fS2 = fF,fT2,f0 - nop.i 0 + nop.m 999 +(p12) fma.s1 f59 = f59, f48, f42 + nop.i 999 ;; } + { .mfi - nop.m 0 - fma.s1 fS1 = f2M,fT1,f0 - nop.i 0 + nop.m 999 +// +// Flag_1: poly_lo = Q_5 + r*(Q_6 + r*Q_7) +// Flag_not1: poly_lo = P_4 + r*(P_5 + r*P_6) +// Flag_not1: poly_hi = (P_1 + r*P_2) +// +(p13) fmpy.s1 f60 = f60, f58 + nop.i 999 ;; } -;; { .mfi - nop.m 0 - fma.s1 fP = fRsq, fP5432, fR - nop.i 0 + nop.m 999 +(p12) fma.s1 f60 = f60, f42, f55 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Flag_1: poly_lo = r6 *(Q_5 + ....) +// Flag_not1: poly_hi = r + rsq *(P_1 + r*P_2) +// +(p12) fma.s1 f35 = f60, f50, f59 + nop.i 999 } -;; { .mfi - nop.m 0 - fms.s1 fSm1 = fS1,fS2,f1 // S - 1.0 - nop.i 0 + nop.m 999 +(p13) fma.s1 f59 = f54, f42, f55 + nop.i 999 ;; } + +{ .mfi + nop.m 999 +// +// Flag_not1: Y_lo = rsq* poly_hi + poly_lo +// Flag_1: poly_lo = rsq* poly_hi + poly_lo +// +(p13) fma.s1 f59 = f59, f42, f56 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Flag_not_1: (P_1 + r*P_2) +// +(p13) fma.s1 f59 = f59, f42, f57 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Flag_not_1: poly_hi = r + rsq * (P_1 + r*P_2) +// +(p13) fma.s1 f35 = f59, f48, f60 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +// +// Create 0.000...01 +// +(p0) for f37 = f35, f37 + nop.i 999 ;; +} + { .mfb - nop.m 0 - fma.s1 fS = fS1,fS2,f0 -(p14) br.cond.spnt EXPM1_POSSIBLE_OVERFLOW + nop.m 999 +// +// Set lsb of Y_lo to 1 +// +(p0) fmerge.se f35 = f35,f37 +(p0) br.cond.sptk EXP_MAIN ;; +} +EXP_VERY_SMALL: + +{ .mmi + nop.m 999 +(p13) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp + nop.i 999;; +} + +{ .mfi +(p13) ld8 r34 = [r34]; +(p12) mov f35 = f9 + nop.i 999 ;; } -;; { .mfb - nop.m 0 - fma.d.s0 f8 = fS, fP, fSm1 - br.ret.sptk b0 // Normal path exit + nop.m 999 +(p12) mov f34 = f1 +(p12) br.cond.sptk EXP_MAIN ;; } -;; -// Here if 2^-60 <= |x| <2^-2 -// Compute 13th order polynomial -EXPM1_SMALL: -{ .mmf - ldfe fQ3 = [rAD_Q1], 16 - ldfe fQ2 = [rAD_Q2], 16 - fma.s1 fX4 = fXsq, fXsq, f0 +{ .mlx +(p13) add r34 = 8,r34 +(p13) movl r39 = 0x0FFFE ;; } -;; +// +// Load big_exp_neg +// Create 1/2's exponent +// + +{ .mii +(p13) setf.exp f56 = r39 +(p13) shladd r34 = r32,4,r34 ;; + nop.i 999 +} +// +// Negative exponents are stored after positive +// { .mfi - nop.m 0 - fma.s1 fQDC = fQD, fNormX, fQC - nop.i 0 +(p13) ld8 r45 = [r34],0 +// +// Y_hi = x +// Scale = 1 +// +(p13) fmpy.s1 f35 = f9, f9 + nop.i 999 ;; } + { .mfi - nop.m 0 - fma.s1 fQBA = fQB, fNormX, fQA - nop.i 0 + nop.m 999 +// +// Reset Safe if necessary +// Create 1/2 +// +(p13) mov f34 = f9 + nop.i 999 ;; } -;; { .mfi - nop.m 0 - fma.s1 fQ98 = fQ9, fNormX, fQ8 - nop.i 0 +(p13) cmp.lt.unc p0, p15 = r37, r45 +(p13) mov f36 = f1 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +// +// Y_lo = x * x +// +(p13) fmpy.s1 f35 = f35, f56 +// +// Y_lo = x*x/2 +// +(p13) br.cond.sptk EXP_MAIN ;; } +EXP_HUGE: + { .mfi - nop.m 0 - fma.s1 fQ76= fQ7, fNormX, fQ6 - nop.i 0 + nop.m 999 +(p0) fcmp.gt.unc.s1 p14, p0 = f9, f0 + nop.i 999 +} + +{ .mlx + nop.m 999 +(p0) movl r39 = 0x15DC0 ;; } -;; { .mfi - nop.m 0 - fma.s1 fQ54 = fQ5, fNormX, fQ4 - nop.i 0 +(p14) setf.exp f34 = r39 +(p14) mov f35 = f1 +(p14) cmp.eq p0, p15 = r0, r0 ;; +} + +{ .mfb + nop.m 999 +(p14) mov f36 = f34 +// +// If x > 0, Set Safe = False +// If x > 0, Y_hi = 2**(24,000) +// If x > 0, Y_lo = 1.0 +// If x > 0, Scale = 2**(24,000) +// +(p14) br.cond.sptk EXP_MAIN ;; +} + +{ .mlx + nop.m 999 +(p12) movl r39 = 0xA240 +} + +{ .mlx + nop.m 999 +(p12) movl r38 = 0xA1DC ;; +} + +{ .mmb +(p13) cmp.eq p15, p14 = r0, r0 +(p12) setf.exp f34 = r39 + nop.b 999 ;; +} + +{ .mlx +(p12) setf.exp f35 = r38 +(p13) movl r39 = 0xFF9C } -;; { .mfi - nop.m 0 - fma.s1 fX6 = fX4, fXsq, f0 - nop.i 0 + nop.m 999 +(p13) fsub.s1 f34 = f0, f1 + nop.i 999 ;; } + { .mfi - nop.m 0 - fma.s1 fQ32= fQ3, fNormX, fQ2 - nop.i 0 + nop.m 999 +(p12) mov f36 = f34 +(p12) cmp.eq p0, p15 = r0, r0 ;; } -;; { .mfi - nop.m 0 - fma.s1 fQDCBA = fQDC, fXsq, fQBA - nop.i 0 +(p13) setf.exp f35 = r39 +(p13) mov f36 = f1 + nop.i 999 ;; } +EXP_MAIN: + { .mfi - nop.m 0 - fma.s1 fQ7654 = fQ76, fXsq, fQ54 - nop.i 0 +(p0) cmp.ne.unc p12, p0 = 0x01, r33 +(p0) fmpy.s1 f101 = f36, f35 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 +(p0) fma.d.s0 f99 = f34, f36, f101 +(p15) br.cond.sptk EXP_64_RETURN;; } -;; { .mfi - nop.m 0 - fma.s1 fQDCBA98 = fQDCBA, fXsq, fQ98 - nop.i 0 + nop.m 999 +(p0) fsetc.s3 0x7F,0x01 + nop.i 999 } + +{ .mlx + nop.m 999 +(p0) movl r50 = 0x000000000103FF ;; +} +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S3 user supplied status + RZ + TD (Underflows) +// +// +// If (Safe) is true, then +// Compute result using user supplied status field. +// No overflow or underflow here, but perhaps inexact. +// Return +// Else +// Determine if overflow or underflow was raised. +// Fetch +/- overflow threshold for IEEE single, double, +// double extended +// + { .mfi - nop.m 0 - fma.s1 fQ765432 = fQ7654, fXsq, fQ32 - nop.i 0 +(p0) setf.exp f60 = r50 +(p0) fma.d.s3 f102 = f34, f36, f101 + nop.i 999 } -;; { .mfi - nop.m 0 - fma.s1 fQDCBA98765432 = fQDCBA98, fX6, fQ765432 - nop.i 0 + nop.m 999 +(p0) fsetc.s3 0x7F,0x40 + nop.i 999 ;; } -;; -{ .mfb - nop.m 0 - fma.d.s0 f8 = fQDCBA98765432, fXsq, fNormX - br.ret.sptk b0 // Exit small branch +{ .mfi + nop.m 999 +// +// For Safe, no need to check for over/under. +// For expm1, handle errors like exp. +// +(p0) fsetc.s2 0x7F,0x42 + nop.i 999;; +} + +{ .mfi + nop.m 999 +(p0) fma.d.s2 f100 = f34, f36, f101 + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p0) fsetc.s2 0x7F,0x40 + nop.i 999 ;; } -;; +{ .mfi + nop.m 999 +(p7) fclass.m.unc p12, p0 = f102, 0x00F + nop.i 999 +} + +{ .mfi + nop.m 999 +(p0) fclass.m.unc p11, p0 = f102, 0x00F + nop.i 999 ;; +} + +{ .mfi + nop.m 999 +(p7) fcmp.ge.unc.s1 p10, p0 = f100, f60 + nop.i 999 +} + +{ .mfi + nop.m 999 +// +// Create largest double exponent + 1. +// Create smallest double exponent - 1. +// +(p0) fcmp.ge.unc.s1 p8, p0 = f100, f60 + nop.i 999 ;; +} +// +// fcmp: resultS2 >= + overflow threshold -> set (a) if true +// fcmp: resultS2 <= - overflow threshold -> set (b) if true +// fclass: resultS3 is denorm/unorm/0 -> set (d) if true +// + +{ .mib +(p10) mov r65 = 41 + nop.i 999 +(p10) br.cond.sptk __libm_error_region ;; +} + +{ .mib +(p8) mov r65 = 14 + nop.i 999 +(p8) br.cond.sptk __libm_error_region ;; +} +// +// Report that exp overflowed +// -EXPM1_POSSIBLE_OVERFLOW: +{ .mib +(p12) mov r65 = 42 + nop.i 999 +(p12) br.cond.sptk __libm_error_region ;; +} -// Here if fMAX_DBL_NORM_ARG < x < fMIN_DBL_OFLOW_ARG -// This cannot happen if input is a double, only if input higher precision. -// Overflow is a possibility, not a certainty. +{ .mib +(p11) mov r65 = 15 + nop.i 999 +(p11) br.cond.sptk __libm_error_region ;; +} -// Recompute result using status field 2 with user's rounding mode, -// and wre set. If result is larger than largest double, then we have -// overflow +{ .mib + nop.m 999 + nop.i 999 +// +// Report that exp underflowed +// +(p0) br.cond.sptk EXP_64_RETURN;; +} +EXP_64_SPECIAL: { .mfi - mov rGt_ln = 0x103ff // Exponent for largest dbl + 1 ulp - fsetc.s2 0x7F,0x42 // Get user's round mode, set wre - nop.i 0 + nop.m 999 +(p0) fclass.m.unc p6, p0 = f8, 0x0c3 + nop.i 999 } -;; { .mfi - setf.exp fGt_pln = rGt_ln // Create largest double + 1 ulp - fma.d.s2 fWre_urm_f8 = fS, fP, fSm1 // Result with wre set - nop.i 0 + nop.m 999 +(p0) fclass.m.unc p13, p8 = f8, 0x007 + nop.i 999 ;; } -;; { .mfi - nop.m 0 - fsetc.s2 0x7F,0x40 // Turn off wre in sf2 - nop.i 0 + nop.m 999 +(p7) fclass.m.unc p14, p0 = f8, 0x007 + nop.i 999 } -;; { .mfi - nop.m 0 - fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow - nop.i 0 + nop.m 999 +(p0) fclass.m.unc p12, p9 = f8, 0x021 + nop.i 999 ;; } -;; -{ .mfb - nop.m 0 - nop.f 0 -(p6) br.cond.spnt EXPM1_CERTAIN_OVERFLOW // Branch if overflow +{ .mfi + nop.m 999 +(p0) fclass.m.unc p11, p0 = f8, 0x022 + nop.i 999 } -;; -{ .mfb - nop.m 0 - fma.d.s0 f8 = fS, fP, fSm1 - br.ret.sptk b0 // Exit if really no overflow +{ .mfi + nop.m 999 +(p7) fclass.m.unc p10, p0 = f8, 0x022 + nop.i 999 ;; } -;; -EXPM1_CERTAIN_OVERFLOW: -{ .mmi - sub rTmp = rExp_mask, r0, 1 -;; - setf.exp fTmp = rTmp - nop.i 0 +{ .mfi + nop.m 999 +// +// Identify +/- 0, Inf, or -Inf +// Generate the right kind of NaN. +// +(p13) fadd.d.s0 f99 = f0, f1 + nop.i 999 ;; } -;; { .mfi - alloc r32=ar.pfs,1,4,4,0 - fmerge.s FR_X = f8,f8 - nop.i 0 + nop.m 999 +(p14) mov f99 = f8 + nop.i 999 ;; } + { .mfb - mov GR_Parameter_TAG = 41 - fma.d.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result - br.cond.sptk __libm_error_region + nop.m 999 +(p6) fadd.d.s0 f99 = f8, f1 +// +// exp(+/-0) = 1 +// expm1(+/-0) = +/-0 +// No exceptions raised +// +(p6) br.cond.sptk EXP_64_RETURN;; } -;; -// Here if x unorm -EXPM1_UNORM: -{ .mfb - getf.exp rSignexp_x = fNormX // Must recompute if x unorm - fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag - br.cond.sptk EXPM1_COMMON +{ .mib + nop.m 999 + nop.i 999 +(p14) br.cond.sptk EXP_64_RETURN;; } -;; -// here if result will be -1 and inexact, x <= -48.0 -EXPM1_CERTAIN_MINUS_ONE: -{ .mmi - mov rTmp = 1 -;; - setf.exp fTmp = rTmp - nop.i 0 +{ .mfi + nop.m 999 +(p11) mov f99 = f0 + nop.i 999 ;; } -;; { .mfb - nop.m 0 - fms.d.s0 FR_RESULT = fTmp, fTmp, f1 // Set I, rounded -1+eps result - br.ret.sptk b0 + nop.m 999 +(p10) fsub.d.s1 f99 = f0, f1 +// +// exp(-Inf) = 0 +// expm1(-Inf) = -1 +// No exceptions raised. +// +(p10) br.cond.sptk EXP_64_RETURN;; } -;; -GLOBAL_IEEE754_END(expm1) +{ .mfb + nop.m 999 +(p12) fmpy.d.s1 f99 = f8, f1 +// +// exp(+Inf) = Inf +// No exceptions raised. +// +(p0) br.cond.sptk EXP_64_RETURN;; +} -LOCAL_LIBM_ENTRY(__libm_error_region) +EXP_64_UNSUPPORTED: + +{ .mfb + nop.m 999 +(p0) fmpy.d.s0 f99 = f8, f0 + nop.b 0;; +} + +EXP_64_RETURN: +{ .mfb + nop.m 999 +(p0) mov f8 = f99 +(p0) br.ret.sptk b0 +} +.endp expm1 +ASM_SIZE_DIRECTIVE(expm1) + +.proc __libm_error_region +__libm_error_region: .prologue +// (1) { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value nop.f 0 @@ -843,32 +1716,38 @@ LOCAL_LIBM_ENTRY(__libm_error_region) } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + add sp=-64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP=gp // Save gp };; + +// (2) { .mmi stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; + .body +// (3) { .mib - stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack + stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address - nop.b 0 + nop.b 0 } { .mib - stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack + stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi - add GR_Parameter_RESULT = 48,sp nop.m 0 - nop.i 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp };; + +// (4) { .mmi ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp @@ -881,6 +1760,9 @@ LOCAL_LIBM_ENTRY(__libm_error_region) br.ret.sptk b0 // Return };; -LOCAL_LIBM_END(__libm_error_region) +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) + + .type __libm_error_support#,@function .global __libm_error_support# |