diff options
Diffstat (limited to 'sysdeps/ia64/fpu/libm_reduce.S')
-rw-r--r-- | sysdeps/ia64/fpu/libm_reduce.S | 1492 |
1 files changed, 765 insertions, 727 deletions
diff --git a/sysdeps/ia64/fpu/libm_reduce.S b/sysdeps/ia64/fpu/libm_reduce.S index 1c7f4e1e88..8bdf91d6de 100644 --- a/sysdeps/ia64/fpu/libm_reduce.S +++ b/sysdeps/ia64/fpu/libm_reduce.S @@ -1,10 +1,10 @@ .file "libm_reduce.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,304 +20,310 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // -// History: 02/02/00 Initial Version +// History: +// 02/02/00 Initial Version +// 05/13/02 Rescheduled for speed, changed interface to pass +// parameters in fp registers +// 02/10/03 Reordered header: .section, .global, .proc, .align; +// used data8 for long double data storage // -// ********************************************************************* -// ********************************************************************* +//********************************************************************* +//********************************************************************* // // Function: __libm_pi_by_two_reduce(x) return r, c, and N where // x = N * pi/4 + (r+c) , where |r+c| <= pi/4. // This function is not designed to be used by the // general user. // -// ********************************************************************* +//********************************************************************* // // Accuracy: Returns double-precision values // -// ********************************************************************* +//********************************************************************* // // Resources Used: // -// Floating-Point Registers: f32-f70 +// Floating-Point Registers: +// f8 = Input x, return value r +// f9 = return value c +// f32-f70 // // General Purpose Registers: // r8 = return value N -// r32 = Address of x -// r33 = Address of where to place r and then c // r34-r64 // // Predicate Registers: p6-p14 // -// ********************************************************************* +//********************************************************************* // // IEEE Special Conditions: // -// No condions should be raised. +// No condions should be raised. // -// ********************************************************************* +//********************************************************************* // // I. Introduction // =============== // // For the forward trigonometric functions sin, cos, sincos, and -// tan, the original algorithms for IA 64 handle arguments up to +// tan, the original algorithms for IA 64 handle arguments up to // 1 ulp less than 2^63 in magnitude. For double-extended arguments x, -// |x| >= 2^63, this routine returns CASE, N and r_hi, r_lo where -// +// |x| >= 2^63, this routine returns N and r_hi, r_lo where +// // x is accurately approximated by // 2*K*pi + N * pi/2 + r_hi + r_lo, |r_hi+r_lo| <= pi/4. // CASE = 1 or 2. // CASE is 1 unless |r_hi + r_lo| < 2^(-33). -// +// // The exact value of K is not determined, but that information is // not required in trigonometric function computations. -// -// We first assume the argument x in question satisfies x >= 2^(63). +// +// We first assume the argument x in question satisfies x >= 2^(63). // In particular, it is positive. Negative x can be handled by symmetry: -// +// // -x is accurately approximated by // -2*K*pi + (-N) * pi/2 - (r_hi + r_lo), |r_hi+r_lo| <= pi/4. -// +// // The idea of the reduction is that -// -// x * 2/pi = N_big + N + f, |f| <= 1/2 -// +// +// x * 2/pi = N_big + N + f, |f| <= 1/2 +// // Moreover, for double extended x, |f| >= 2^(-75). (This is an // non-obvious fact found by enumeration using a special algorithm -// involving continued fraction.) The algorithm described below +// involving continued fraction.) The algorithm described below // calculates N and an accurate approximation of f. -// -// Roughly speaking, an appropriate 256-bit (4 X 64) portion of +// +// Roughly speaking, an appropriate 256-bit (4 X 64) portion of // 2/pi is multiplied with x to give the desired information. -// +// // II. Representation of 2/PI // ========================== -// +// // The value of 2/pi in binary fixed-point is -// +// // .101000101111100110...... -// +// // We store 2/pi in a table, starting at the position corresponding -// to bit position 63 -// +// to bit position 63 +// // bit position 63 62 ... 0 -1 -2 -3 -4 -5 -6 -7 .... -16576 -// -// 0 0 ... 0 . 1 0 1 0 1 0 1 .... X -// +// +// 0 0 ... 0 . 1 0 1 0 1 0 1 .... X +// // ^ -// |__ implied binary pt -// +// |__ implied binary pt +// // III. Algorithm // ============== -// +// // This describes the algorithm in the most natural way using -// unsigned interger multiplication. The implementation section +// unsigned interger multiplication. The implementation section // describes how the integer arithmetic is simulated. -// +// // STEP 0. Initialization // ---------------------- -// -// Let the input argument x be -// +// +// Let the input argument x be +// // x = 2^m * ( 1. b_1 b_2 b_3 ... b_63 ), 63 <= m <= 16383. -// -// The first crucial step is to fetch four 64-bit portions of 2/pi. +// +// The first crucial step is to fetch four 64-bit portions of 2/pi. // To fulfill this goal, we calculate the bit position L of the // beginning of these 256-bit quantity by -// +// // L := 62 - m. -// -// Note that -16321 <= L <= -1 because 63 <= m <= 16383; and that +// +// Note that -16321 <= L <= -1 because 63 <= m <= 16383; and that // the storage of 2/pi is adequate. -// +// // Fetch P_1, P_2, P_3, P_4 beginning at bit position L thus: -// +// // bit position L L-1 L-2 ... L-63 -// +// // P_1 = b b b ... b -// +// // each b can be 0 or 1. Also, let P_0 be the two bits correspoding to // bit positions L+2 and L+1. So, when each of the P_j is interpreted // with appropriate scaling, we have // // 2/pi = P_big + P_0 + (P_1 + P_2 + P_3 + P_4) + P_small -// +// // Note that P_big and P_small can be ignored. The reasons are as follow. // First, consider P_big. If P_big = 0, we can certainly ignore it. -// Otherwise, P_big >= 2^(L+3). Now, -// +// Otherwise, P_big >= 2^(L+3). Now, +// // P_big * ulp(x) >= 2^(L+3) * 2^(m-63) -// >= 2^(65-m + m-63 ) -// >= 2^2 -// +// >= 2^(65-m + m-63 ) +// >= 2^2 +// // Thus, P_big * x is an integer of the form 4*K. So -// -// x = 4*K * (pi/2) + x*(P_0 + P_1 + P_2 + P_3 + P_4)*(pi/2) +// +// x = 4*K * (pi/2) + x*(P_0 + P_1 + P_2 + P_3 + P_4)*(pi/2) // + x*P_small*(pi/2). -// +// // Hence, P_big*x corresponds to information that can be ignored for // trigonometic function evaluation. -// +// // Next, we must estimate the effect of ignoring P_small. The absolute // error made by ignoring P_small is bounded by -// +// // |P_small * x| <= ulp(P_4) * x -// <= 2^(L-255) * 2^(m+1) -// <= 2^(62-m-255 + m + 1) -// <= 2^(-192) -// -// Since for double-extended precision, x * 2/pi = integer + f, +// <= 2^(L-255) * 2^(m+1) +// <= 2^(62-m-255 + m + 1) +// <= 2^(-192) +// +// Since for double-extended precision, x * 2/pi = integer + f, // 0.5 >= |f| >= 2^(-75), the relative error introduced by ignoring // P_small is bounded by 2^(-192+75) <= 2^(-117), which is acceptable. -// +// // Further note that if x is split into x_hi + x_lo where x_lo is the // two bits corresponding to bit positions 2^(m-62) and 2^(m-63); then -// -// P_0 * x_hi -// +// +// P_0 * x_hi +// // is also an integer of the form 4*K; and thus can also be ignored. // Let M := P_0 * x_lo which is a small integer. The main part of the // calculation is really the multiplication of x with the four pieces // P_1, P_2, P_3, and P_4. -// +// // Unless the reduced argument is extremely small in magnitude, it // suffices to carry out the multiplication of x with P_1, P_2, and -// P_3. x*P_4 will be carried out and added on as a correction only +// P_3. x*P_4 will be carried out and added on as a correction only // when it is found to be needed. Note also that x*P_4 need not be // computed exactly. A straightforward multiplication suffices since // the rounding error thus produced would be bounded by 2^(-3*64), // that is 2^(-192) which is small enough as the reduced argument // is bounded from below by 2^(-75). -// +// // Now that we have four 64-bit data representing 2/pi and a // 64-bit x. We first need to calculate a highly accurate product // of x and P_1, P_2, P_3. This is best understood as integer // multiplication. -// -// +// +// // STEP 1. Multiplication // ---------------------- -// -// +// +// // --------- --------- --------- -// | P_1 | | P_2 | | P_3 | -// --------- --------- --------- -// +// | P_1 | | P_2 | | P_3 | +// --------- --------- --------- +// +// --------- +// X | X | // --------- -// X | X | -// --------- // ---------------------------------------------------- // // --------- --------- -// | A_hi | | A_lo | -// --------- --------- +// | A_hi | | A_lo | +// --------- --------- // // // --------- --------- -// | B_hi | | B_lo | -// --------- --------- +// | B_hi | | B_lo | +// --------- --------- // // -// --------- --------- -// | C_hi | | C_lo | -// --------- --------- +// --------- --------- +// | C_hi | | C_lo | +// --------- --------- // // ==================================================== // --------- --------- --------- --------- -// | S_0 | | S_1 | | S_2 | | S_3 | -// --------- --------- --------- --------- +// | S_0 | | S_1 | | S_2 | | S_3 | +// --------- --------- --------- --------- // // // // STEP 2. Get N and f // ------------------- -// +// // Conceptually, after the individual pieces S_0, S_1, ..., are obtained, // we have to sum them and obtain an integer part, N, and a fraction, f. // Here, |f| <= 1/2, and N is an integer. Note also that N need only to // be known to module 2^k, k >= 2. In the case when |f| is small enough, // we would need to add in the value x*P_4. -// -// +// +// // STEP 3. Get reduced argument // ---------------------------- -// +// // The value f is not yet the reduced argument that we seek. The // equation -// -// x * 2/pi = 4K + N + f -// +// +// x * 2/pi = 4K + N + f +// // says that -// +// // x = 2*K*pi + N * pi/2 + f * (pi/2). -// +// // Thus, the reduced argument is given by -// -// reduced argument = f * pi/2. -// +// +// reduced argument = f * pi/2. +// // This multiplication must be performed to extra precision. -// +// // IV. Implementation // ================== -// +// // Step 0. Initialization // ---------------------- -// +// // Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x. -// +// // In memory, 2/pi is stored contigously as -// +// // 0x00000000 0x00000000 0xA2F.... // ^ // |__ implied binary bit -// +// // Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m. Thus // -1 <= L <= -16321. We fetch from memory 5 integer pieces of data. -// +// // P_0 is the two bits corresponding to bit positions L+2 and L+1 // P_1 is the 64-bit starting at bit position L // P_2 is the 64-bit starting at bit position L-64 // P_3 is the 64-bit starting at bit position L-128 // P_4 is the 64-bit starting at bit position L-192 -// +// // For example, if m = 63, P_0 would be 0 and P_1 would look like // 0xA2F... -// +// // If m = 65, P_0 would be the two msb of 0xA, thus, P_0 is 10 in binary. -// P_1 in binary would be 1 0 0 0 1 0 1 1 1 1 .... -// +// P_1 in binary would be 1 0 0 0 1 0 1 1 1 1 .... +// // Step 1. Multiplication // ---------------------- -// +// // At this point, P_1, P_2, P_3, P_4 are integers. They are // supposed to be interpreted as -// +// // 2^(L-63) * P_1; // 2^(L-63-64) * P_2; // 2^(L-63-128) * P_3; // 2^(L-63-192) * P_4; -// +// // Since each of them need to be multiplied to x, we would scale // both x and the P_j's by some convenient factors: scale each // of P_j's up by 2^(63-L), and scale x down by 2^(L-63). -// +// // p_1 := fcvt.xf ( P_1 ) // p_2 := fcvt.xf ( P_2 ) * 2^(-64) // p_3 := fcvt.xf ( P_3 ) * 2^(-128) @@ -325,30 +331,30 @@ // x := replace exponent of x by -1 // because 2^m * 1.xxxx...xxx * 2^(L-63) // is 2^(-1) * 1.xxxx...xxx -// +// // We are now faced with the task of computing the following -// +// // --------- --------- --------- -// | P_1 | | P_2 | | P_3 | -// --------- --------- --------- -// +// | P_1 | | P_2 | | P_3 | +// --------- --------- --------- +// // --------- -// X | X | -// --------- +// X | X | +// --------- // ---------------------------------------------------- -// +// // --------- --------- -// | A_hi | | A_lo | -// --------- --------- -// +// | A_hi | | A_lo | +// --------- --------- +// // --------- --------- -// | B_hi | | B_lo | -// --------- --------- -// -// --------- --------- -// | C_hi | | C_lo | -// --------- --------- -// +// | B_hi | | B_lo | +// --------- --------- +// +// --------- --------- +// | C_hi | | C_lo | +// --------- --------- +// // ==================================================== // ----------- --------- --------- --------- // | S_0 | | S_1 | | S_2 | | S_3 | @@ -357,108 +363,108 @@ // | |___ binary point // | // |___ possibly one more bit -// +// // Let FPSR3 be set to round towards zero with widest precision -// and exponent range. Unless an explicit FPSR is given, +// and exponent range. Unless an explicit FPSR is given, // round-to-nearest with widest precision and exponent range is // used. -// +// // Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_C := 2^(-65). -// +// // Tmp_C := fmpy.fpsr3( x, p_1 ); // If Tmp_C >= sigma_C then // C_hi := Tmp_C; // C_lo := x*p_1 - C_hi ...fma, exact // Else // C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C -// ...subtraction is exact, regardless -// ...of rounding direction +// ...subtraction is exact, regardless +// ...of rounding direction // C_lo := x*p_1 - C_hi ...fma, exact // End If -// +// // Tmp_B := fmpy.fpsr3( x, p_2 ); // If Tmp_B >= sigma_B then // B_hi := Tmp_B; // B_lo := x*p_2 - B_hi ...fma, exact // Else // B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B -// ...subtraction is exact, regardless -// ...of rounding direction +// ...subtraction is exact, regardless +// ...of rounding direction // B_lo := x*p_2 - B_hi ...fma, exact // End If -// +// // Tmp_A := fmpy.fpsr3( x, p_3 ); // If Tmp_A >= sigma_A then // A_hi := Tmp_A; // A_lo := x*p_3 - A_hi ...fma, exact // Else // A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A -// ...subtraction is exact, regardless -// ...of rounding direction +// ...subtraction is exact, regardless +// ...of rounding direction // A_lo := x*p_3 - A_hi ...fma, exact // End If -// +// // ...Note that C_hi is of integer value. We need only the -// ...last few bits. Thus we can ensure C_hi is never a big +// ...last few bits. Thus we can ensure C_hi is never a big // ...integer, freeing us from overflow worry. -// +// // Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70); // ...Tmp_C is the upper portion of C_hi // C_hi := C_hi - Tmp_C // ...0 <= C_hi < 2^7 -// +// // Step 2. Get N and f // ------------------- -// -// At this point, we have all the components to obtain +// +// At this point, we have all the components to obtain // S_0, S_1, S_2, S_3 and thus N and f. We start by adding // C_lo and B_hi. This sum together with C_hi gives a good -// estimation of N and f. -// +// estimation of N and f. +// // A := fadd.fpsr3( B_hi, C_lo ) // B := max( B_hi, C_lo ) // b := min( B_hi, C_lo ) -// -// a := (B - A) + b ...exact. Note that a is either 0 -// ...or 2^(-64). -// +// +// a := (B - A) + b ...exact. Note that a is either 0 +// ...or 2^(-64). +// // N := round_to_nearest_integer_value( A ); -// f := A - N; ...exact because lsb(A) >= 2^(-64) -// ...and |f| <= 1/2. -// -// f := f + a ...exact because a is 0 or 2^(-64); -// ...the msb of the sum is <= 1/2 -// ...lsb >= 2^(-64). -// +// f := A - N; ...exact because lsb(A) >= 2^(-64) +// ...and |f| <= 1/2. +// +// f := f + a ...exact because a is 0 or 2^(-64); +// ...the msb of the sum is <= 1/2 +// ...lsb >= 2^(-64). +// // N := convert to integer format( C_hi + N ); // M := P_0 * x_lo; // N := N + M; -// +// // If sgn_x == 1 (that is original x was negative) // N := 2^10 - N // ...this maintains N to be non-negative, but still // ...equivalent to the (negated N) mod 4. // End If -// +// // If |f| >= 2^(-33) -// +// // ...Case 1 // CASE := 1 // g := A_hi + B_lo; // s_hi := f + g; // s_lo := (f - s_hi) + g; -// +// // Else -// +// // ...Case 2 // CASE := 2 // A := fadd.fpsr3( A_hi, B_lo ) // B := max( A_hi, B_lo ) // b := min( A_hi, B_lo ) -// -// a := (B - A) + b ...exact. Note that a is either 0 -// ...or 2^(-128). -// +// +// a := (B - A) + b ...exact. Note that a is either 0 +// ...or 2^(-128). +// // f_hi := A + f; // f_lo := (f - f_hi) + A; // ...this is exact. @@ -468,9 +474,9 @@ // ...If f = 2^(-64), f-f_hi involves cancellation and is // ...exact. If f = -2^(-64), then A + f is exact. Hence // ...f-f_hi is -A exactly, giving f_lo = 0. -// +// // f_lo := f_lo + a; -// +// // If |f| >= 2^(-50) then // s_hi := f_hi; // s_lo := f_lo; @@ -479,117 +485,111 @@ // s_hi := f_hi + f_lo // s_lo := (f_hi - s_hi) + f_lo // End If -// +// // End If -// +// // Step 3. Get reduced argument // ---------------------------- -// +// // If sgn_x == 0 (that is original x is positive) -// +// // D_hi := Pi_by_2_hi // D_lo := Pi_by_2_lo // ...load from table -// +// // Else -// +// // D_hi := neg_Pi_by_2_hi // D_lo := neg_Pi_by_2_lo // ...load from table // End If -// +// // r_hi := s_hi*D_hi -// r_lo := s_hi*D_hi - r_hi ...fma +// r_lo := s_hi*D_hi - r_hi ...fma // r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi -// -// Return CASE, N, r_hi, r_lo -// - -#include "libm_support.h" - -FR_X = f32 -FR_N = f33 -FR_p_1 = f34 -FR_TWOM33 = f35 -FR_TWOM50 = f36 -FR_g = f37 -FR_p_2 = f38 -FR_f = f39 -FR_s_lo = f40 -FR_p_3 = f41 -FR_f_abs = f42 -FR_D_lo = f43 -FR_p_4 = f44 -FR_D_hi = f45 -FR_Tmp2_C = f46 -FR_s_hi = f47 -FR_sigma_A = f48 -FR_A = f49 -FR_sigma_B = f50 -FR_B = f51 -FR_sigma_C = f52 -FR_b = f53 -FR_ScaleP2 = f54 -FR_ScaleP3 = f55 -FR_ScaleP4 = f56 -FR_Tmp_A = f57 -FR_Tmp_B = f58 -FR_Tmp_C = f59 -FR_A_hi = f60 -FR_f_hi = f61 -FR_r_hi = f62 -FR_A_lo = f63 -FR_B_hi = f64 -FR_a = f65 -FR_B_lo = f66 +// +// Return N, r_hi, r_lo +// +FR_input_X = f8 +FR_r_hi = f8 +FR_r_lo = f9 + +FR_X = f32 +FR_N = f33 +FR_p_1 = f34 +FR_TWOM33 = f35 +FR_TWOM50 = f36 +FR_g = f37 +FR_p_2 = f38 +FR_f = f39 +FR_s_lo = f40 +FR_p_3 = f41 +FR_f_abs = f42 +FR_D_lo = f43 +FR_p_4 = f44 +FR_D_hi = f45 +FR_Tmp2_C = f46 +FR_s_hi = f47 +FR_sigma_A = f48 +FR_A = f49 +FR_sigma_B = f50 +FR_B = f51 +FR_sigma_C = f52 +FR_b = f53 +FR_ScaleP2 = f54 +FR_ScaleP3 = f55 +FR_ScaleP4 = f56 +FR_Tmp_A = f57 +FR_Tmp_B = f58 +FR_Tmp_C = f59 +FR_A_hi = f60 +FR_f_hi = f61 +FR_RSHF = f62 +FR_A_lo = f63 +FR_B_hi = f64 +FR_a = f65 +FR_B_lo = f66 FR_f_lo = f67 -FR_r_lo = f68 -FR_C_hi = f69 -FR_C_lo = f70 +FR_N_fix = f68 +FR_C_hi = f69 +FR_C_lo = f70 GR_N = r8 -GR_Address_of_Input = r32 -GR_Address_of_Outputs = r33 -GR_Exp_x = r36 -GR_Temp = r37 -GR_BIASL63 = r38 +GR_Exp_x = r36 +GR_Temp = r37 +GR_BIASL63 = r38 GR_CASE = r39 -GR_x_lo = r40 -GR_sgn_x = r41 +GR_x_lo = r40 +GR_sgn_x = r41 GR_M = r42 GR_BASE = r43 GR_LENGTH1 = r44 GR_LENGTH2 = r45 GR_ASUB = r46 GR_P_0 = r47 -GR_P_1 = r48 -GR_P_2 = r49 -GR_P_3 = r50 -GR_P_4 = r51 +GR_P_1 = r48 +GR_P_2 = r49 +GR_P_3 = r50 +GR_P_4 = r51 GR_START = r52 GR_SEGMENT = r53 GR_A = r54 -GR_B = r55 +GR_B = r55 GR_C = r56 GR_D = r57 GR_E = r58 -GR_TEMP1 = r59 -GR_TEMP2 = r60 -GR_TEMP3 = r61 -GR_TEMP4 = r62 +GR_TEMP1 = r59 +GR_TEMP2 = r60 +GR_TEMP3 = r61 +GR_TEMP4 = r62 GR_TEMP5 = r63 GR_TEMP6 = r64 +GR_rshf = r64 +RODATA .align 64 -#ifdef _LIBC -.rodata -#else -.data -#endif - -Constants_Bits_of_2_by_pi: -ASM_TYPE_DIRECTIVE(Constants_Bits_of_2_by_pi,@object) +LOCAL_OBJECT_START(Constants_Bits_of_2_by_pi) data8 0x0000000000000000,0xA2F9836E4E441529 data8 0xFC2757D1F534DDC0,0xDB6295993C439041 data8 0xFE5163ABDEBBC561,0xB7246E3A424DD2E0 @@ -721,34 +721,33 @@ data8 0xB5D6DF8261DD9602,0x36169F3AC4A1A283 data8 0x6DED727A8D39A9B8,0x825C326B5B2746ED data8 0x34007700D255F4FC,0x4D59018071E0E13F data8 0x89B295F364A8F1AE,0xA74B38FC4CEAB2BB -ASM_SIZE_DIRECTIVE(Constants_Bits_of_2_by_pi) +LOCAL_OBJECT_END(Constants_Bits_of_2_by_pi) -Constants_Bits_of_pi_by_2: -ASM_TYPE_DIRECTIVE(Constants_Bits_of_pi_by_2,@object) -data4 0x2168C234,0xC90FDAA2,0x00003FFF,0x00000000 -data4 0x80DC1CD1,0xC4C6628B,0x00003FBF,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_Bits_of_pi_by_2) +LOCAL_OBJECT_START(Constants_Bits_of_pi_by_2) +data8 0xC90FDAA22168C234,0x00003FFF +data8 0xC4C6628B80DC1CD1,0x00003FBF +LOCAL_OBJECT_END(Constants_Bits_of_pi_by_2) .section .text -.proc __libm_pi_by_2_reduce# .global __libm_pi_by_2_reduce# -.align 64 +.proc __libm_pi_by_2_reduce# +.align 32 -__libm_pi_by_2_reduce: +__libm_pi_by_2_reduce: -// X is at the address in Address_of_Input -// Place the two-piece result at the address in Address_of_Outputs -// r followed by c -// N is returned +// X is in f8 +// Place the two-piece result r (r_hi) in f8 and c (r_lo) in f9 +// N is returned in r8 -{ .mmf -alloc r34 = ar.pfs,2,34,0,0 -(p0) ldfe FR_X = [GR_Address_of_Input] -(p0) fsetc.s3 0x00,0x7F ;; +{ .mfi + alloc r34 = ar.pfs,2,34,0,0 + fsetc.s3 0x00,0x7F // Set sf3 to round to zero, 82-bit prec, td, ftz + nop.i 999 } -{ .mlx - nop.m 999 -(p0) movl GR_BIASL63 = 0x1003E +{ .mfi + addl GR_BASE = @ltoff(Constants_Bits_of_2_by_pi#), gp + nop.f 999 + mov GR_BIASL63 = 0x1003E } ;; @@ -765,73 +764,61 @@ alloc r34 = ar.pfs,2,34,0,0 // Address_BASE = shladd(SEGMENT,3) + BASE - { .mmi - nop.m 999 -(p0) addl GR_BASE = @ltoff(Constants_Bits_of_2_by_pi#), gp - nop.i 999 + getf.exp GR_Exp_x = FR_input_X + ld8 GR_BASE = [GR_BASE] + mov GR_TEMP5 = 0x0FFFE } ;; +// Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65). { .mmi - ld8 GR_BASE = [GR_BASE] - nop.m 999 + getf.sig GR_x_lo = FR_input_X + mov GR_TEMP6 = 0x0FFBE nop.i 999 } ;; - -{ .mlx - nop.m 999 -(p0) movl GR_TEMP5 = 0x000000000000FFFE -} -{ .mmi - nop.m 999 ;; -(p0) setf.exp FR_sigma_B = GR_TEMP5 - nop.i 999 -} -{ .mlx - nop.m 999 -(p0) movl GR_TEMP6 = 0x000000000000FFBE ;; -} -// Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65). -{ .mfi -(p0) setf.exp FR_sigma_A = GR_TEMP6 - nop.f 999 - nop.i 999 ;; -} -// Special Code for testing DE arguments -// (p0) movl GR_BIASL63 = 0x0000000000013FFE -// (p0) movl GR_x_lo = 0xFFFFFFFFFFFFFFFF -// (p0) setf.exp FR_X = GR_BIASL63 -// (p0) setf.sig FR_ScaleP3 = GR_x_lo -// (p0) fmerge.se FR_X = FR_X,FR_ScaleP3 +// Special Code for testing DE arguments +// movl GR_BIASL63 = 0x0000000000013FFE +// movl GR_x_lo = 0xFFFFFFFFFFFFFFFF +// setf.exp FR_X = GR_BIASL63 +// setf.sig FR_ScaleP3 = GR_x_lo +// fmerge.se FR_X = FR_X,FR_ScaleP3 // Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x. // 2/pi is stored contigously as // 0x00000000 0x00000000.0xA2F.... // M = EXP - BIAS ( M >= 63) // Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m. // Thus -1 <= L <= -16321. -{ .mmf -(p0) getf.exp GR_Exp_x = FR_X -(p0) getf.sig GR_x_lo = FR_X -(p0) fabs FR_X = FR_X ;; +{ .mmi + setf.exp FR_sigma_B = GR_TEMP5 + setf.exp FR_sigma_A = GR_TEMP6 + extr.u GR_M = GR_Exp_x,0,17 } +;; + { .mii -(p0) and GR_x_lo = 0x03,GR_x_lo -(p0) extr.u GR_M = GR_Exp_x,0,17 ;; -(p0) sub GR_START = GR_M,GR_BIASL63 + and GR_x_lo = 0x03,GR_x_lo + sub GR_START = GR_M,GR_BIASL63 + add GR_BASE = 8,GR_BASE // To effectively add 1 to SEGMENT } -{ .mmi - nop.m 999 ;; -(p0) and GR_LENGTH1 = 0x3F,GR_START -(p0) shr.u GR_SEGMENT = GR_START,6 +;; + +{ .mii + and GR_LENGTH1 = 0x3F,GR_START + shr.u GR_SEGMENT = GR_START,6 + nop.i 999 } +;; + { .mmi - nop.m 999 ;; -(p0) add GR_SEGMENT = 0x1,GR_SEGMENT -(p0) sub GR_LENGTH2 = 0x40,GR_LENGTH1 + shladd GR_BASE = GR_SEGMENT,3,GR_BASE + sub GR_LENGTH2 = 0x40,GR_LENGTH1 + cmp.le p6,p7 = 0x2,GR_LENGTH1 } +;; + // P_0 is the two bits corresponding to bit positions L+2 and L+1 // P_1 is the 64-bit starting at bit position L // P_2 is the 64-bit starting at bit position L-64 @@ -849,13 +836,13 @@ alloc r34 = ar.pfs,2,34,0,0 // P_4 is made up of Clo and Dhi // P_4 = deposit Dlo, position 0, length2 into P_4, position length1 // deposit Ehi, position length2, length1 into P_4, position 0 -{ .mmi -(p0) cmp.le.unc p6,p7 = 0x2,GR_LENGTH1 ;; -(p0) shladd GR_BASE = GR_SEGMENT,3,GR_BASE -(p7) cmp.eq.unc p8,p9 = 0x1,GR_LENGTH1 ;; +{ .mfi + ld8 GR_A = [GR_BASE],8 + fabs FR_X = FR_input_X +(p7) cmp.eq.unc p8,p9 = 0x1,GR_LENGTH1 } -{ .mmi - nop.m 999 +;; + // ld_64 A at Base and increment Base by 8 // ld_64 B at Base and increment Base by 8 // ld_64 C at Base and increment Base by 8 @@ -866,31 +853,35 @@ alloc r34 = ar.pfs,2,34,0,0 // A, B, C, D, and E look like | length1 | length2 | // --------------------- // hi lo -(p0) ld8 GR_A = [GR_BASE],8 -(p0) extr.u GR_sgn_x = GR_Exp_x,17,1 ;; -} -{ .mmf - nop.m 999 -(p0) ld8 GR_B = [GR_BASE],8 -(p0) fmerge.se FR_X = FR_sigma_B,FR_X ;; +{ .mlx + ld8 GR_B = [GR_BASE],8 + movl GR_rshf = 0x43e8000000000000 // 1.10000 2^63 for right shift N_fix } -{ .mii -(p0) ld8 GR_C = [GR_BASE],8 -(p8) extr.u GR_Temp = GR_A,63,1 ;; -(p0) shl GR_TEMP1 = GR_A,GR_LENGTH1 +;; + +{ .mmi + ld8 GR_C = [GR_BASE],8 + nop.m 999 +(p8) extr.u GR_Temp = GR_A,63,1 } -{ .mii -(p0) ld8 GR_D = [GR_BASE],8 +;; + // If length1 >= 2, // P_0 = deposit Ahi, position length2, 2 bit into P_0 at position 0. -(p6) shr.u GR_P_0 = GR_A,GR_LENGTH2 ;; -(p0) shl GR_TEMP2 = GR_B,GR_LENGTH1 +{ .mii + ld8 GR_D = [GR_BASE],8 + shl GR_TEMP1 = GR_A,GR_LENGTH1 // MM instruction +(p6) shr.u GR_P_0 = GR_A,GR_LENGTH2 // MM instruction } +;; + { .mii -(p0) ld8 GR_E = [GR_BASE],-40 -(p0) shr.u GR_P_1 = GR_B,GR_LENGTH2 ;; -(p0) shr.u GR_P_2 = GR_C,GR_LENGTH2 + ld8 GR_E = [GR_BASE],-40 + shl GR_TEMP2 = GR_B,GR_LENGTH1 // MM instruction + shr.u GR_P_1 = GR_B,GR_LENGTH2 // MM instruction } +;; + // Else // Load 16 bit of ASUB from (Base_Address_of_A - 2) // P_0 = ASUB & 0x3 @@ -900,43 +891,56 @@ alloc r34 = ar.pfs,2,34,0,0 // Deposit element 63 from Ahi and place in element 0 of P_0. // Endif // Endif + { .mii (p7) ld2 GR_ASUB = [GR_BASE],8 -(p0) shl GR_TEMP3 = GR_C,GR_LENGTH1 ;; -(p0) shl GR_TEMP4 = GR_D,GR_LENGTH1 + shl GR_TEMP3 = GR_C,GR_LENGTH1 // MM instruction + shr.u GR_P_2 = GR_C,GR_LENGTH2 // MM instruction } +;; + { .mii - nop.m 999 -(p0) shr.u GR_P_3 = GR_D,GR_LENGTH2 ;; -(p0) shr.u GR_P_4 = GR_E,GR_LENGTH2 + setf.d FR_RSHF = GR_rshf // Form right shift const 1.100 * 2^63 + shl GR_TEMP4 = GR_D,GR_LENGTH1 // MM instruction + shr.u GR_P_3 = GR_D,GR_LENGTH2 // MM instruction } -{ .mii +;; + +{ .mmi (p7) and GR_P_0 = 0x03,GR_ASUB -(p6) and GR_P_0 = 0x03,GR_P_0 ;; -(p0) or GR_P_1 = GR_P_1,GR_TEMP1 +(p6) and GR_P_0 = 0x03,GR_P_0 + shr.u GR_P_4 = GR_E,GR_LENGTH2 // MM instruction } +;; + { .mmi -(p8) and GR_P_0 = 0x1,GR_P_0 ;; -(p0) or GR_P_2 = GR_P_2,GR_TEMP2 -(p8) shl GR_P_0 = GR_P_0,0x1 ;; -} -{ .mii - nop.m 999 -(p0) or GR_P_3 = GR_P_3,GR_TEMP3 -(p8) or GR_P_0 = GR_P_0,GR_Temp + nop.m 999 + or GR_P_1 = GR_P_1,GR_TEMP1 +(p8) and GR_P_0 = 0x1,GR_P_0 } +;; + { .mmi -(p0) setf.sig FR_p_1 = GR_P_1 ;; -(p0) setf.sig FR_p_2 = GR_P_2 -(p0) or GR_P_4 = GR_P_4,GR_TEMP4 ;; + setf.sig FR_p_1 = GR_P_1 + or GR_P_2 = GR_P_2,GR_TEMP2 +(p8) shladd GR_P_0 = GR_P_0,1,GR_Temp } +;; + +{ .mmf + setf.sig FR_p_2 = GR_P_2 + or GR_P_3 = GR_P_3,GR_TEMP3 + fmerge.se FR_X = FR_sigma_B,FR_X +} +;; + { .mmi - nop.m 999 ;; -(p0) setf.sig FR_p_3 = GR_P_3 -(p0) pmpy2.r GR_M = GR_P_0,GR_x_lo + setf.sig FR_p_3 = GR_P_3 + or GR_P_4 = GR_P_4,GR_TEMP4 + pmpy2.r GR_M = GR_P_0,GR_x_lo } -{ .mlx -(p0) setf.sig FR_p_4 = GR_P_4 +;; + // P_1, P_2, P_3, P_4 are integers. They should be // 2^(L-63) * P_1; // 2^(L-63-64) * P_2; @@ -954,18 +958,18 @@ alloc r34 = ar.pfs,2,34,0,0 // | P_1 | | P_2 | | P_3 | // --------- --------- --------- // --------- -// X | X | -// --------- +// X | X | +// --------- // ---------------------------------------------------- // --------- --------- -// | A_hi | | A_lo | -// --------- --------- +// | A_hi | | A_lo | +// --------- --------- // --------- --------- -// | B_hi | | B_lo | -// --------- --------- +// | B_hi | | B_lo | +// --------- --------- +// --------- --------- +// | C_hi | | C_lo | // --------- --------- -// | C_hi | | C_lo | -// --------- --------- // ==================================================== // ----------- --------- --------- --------- // | S_0 | | S_1 | | S_2 | | S_3 | @@ -977,52 +981,55 @@ alloc r34 = ar.pfs,2,34,0,0 // and exponent range. Unless an explicit FPSR is given, // round-to-nearest with widest precision and exponent range is // used. -(p0) movl GR_TEMP1 = 0x000000000000FFBF -} { .mmi - nop.m 999 ;; -(p0) setf.exp FR_ScaleP2 = GR_TEMP1 - nop.i 999 -} -{ .mlx - nop.m 999 -(p0) movl GR_TEMP4 = 0x000000000001003E + setf.sig FR_p_4 = GR_P_4 + mov GR_TEMP1 = 0x0FFBF + nop.i 999 } +;; + { .mmi - nop.m 999 ;; -(p0) setf.exp FR_sigma_C = GR_TEMP4 - nop.i 999 + setf.exp FR_ScaleP2 = GR_TEMP1 + mov GR_TEMP2 = 0x0FF7F + nop.i 999 } -{ .mlx - nop.m 999 -(p0) movl GR_TEMP2 = 0x000000000000FF7F ;; +;; + +{ .mmi + setf.exp FR_ScaleP3 = GR_TEMP2 + mov GR_TEMP4 = 0x1003E + nop.i 999 } +;; + { .mmf - nop.m 999 -(p0) setf.exp FR_ScaleP3 = GR_TEMP2 -(p0) fcvt.xuf.s1 FR_p_1 = FR_p_1 ;; + setf.exp FR_sigma_C = GR_TEMP4 + mov GR_Temp = 0x0FFDE + fcvt.xuf.s1 FR_p_1 = FR_p_1 } +;; + { .mfi - nop.m 999 -(p0) fcvt.xuf.s1 FR_p_2 = FR_p_2 - nop.i 999 -} -{ .mlx - nop.m 999 -(p0) movl GR_Temp = 0x000000000000FFDE ;; -} -{ .mmf - nop.m 999 -(p0) setf.exp FR_TWOM33 = GR_Temp -(p0) fcvt.xuf.s1 FR_p_3 = FR_p_3 ;; + setf.exp FR_TWOM33 = GR_Temp + fcvt.xuf.s1 FR_p_2 = FR_p_2 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fcvt.xuf.s1 FR_p_4 = FR_p_4 - nop.i 999 ;; + nop.m 999 + fcvt.xuf.s1 FR_p_3 = FR_p_3 + nop.i 999 } +;; + { .mfi - nop.m 999 + nop.m 999 + fcvt.xuf.s1 FR_p_4 = FR_p_4 + nop.i 999 +} +;; + // Tmp_C := fmpy.fpsr3( x, p_1 ); // Tmp_B := fmpy.fpsr3( x, p_2 ); // Tmp_A := fmpy.fpsr3( x, p_3 ); @@ -1048,55 +1055,62 @@ alloc r34 = ar.pfs,2,34,0,0 // Exact, regardless ...of rounding direction // A_lo := x*p_3 - A_hi ...fma, exact // Endif -(p0) fmpy.s3 FR_Tmp_C = FR_X,FR_p_1 - nop.i 999 ;; -} { .mfi - nop.m 999 -(p0) fmpy.s1 FR_p_2 = FR_p_2,FR_ScaleP2 - nop.i 999 -} -{ .mlx - nop.m 999 -(p0) movl GR_Temp = 0x0000000000000400 + nop.m 999 + fmpy.s3 FR_Tmp_C = FR_X,FR_p_1 + nop.i 999 } -{ .mlx - nop.m 999 -(p0) movl GR_TEMP3 = 0x000000000000FF3F ;; +;; + +{ .mfi + mov GR_TEMP3 = 0x0FF3F + fmpy.s1 FR_p_2 = FR_p_2,FR_ScaleP2 + nop.i 999 } +;; + { .mmf - nop.m 999 -(p0) setf.exp FR_ScaleP4 = GR_TEMP3 -(p0) fmpy.s1 FR_p_3 = FR_p_3,FR_ScaleP3 ;; + setf.exp FR_ScaleP4 = GR_TEMP3 + mov GR_TEMP4 = 0x10045 + fmpy.s1 FR_p_3 = FR_p_3,FR_ScaleP3 } -{ .mlx - nop.m 999 -(p0) movl GR_TEMP4 = 0x0000000000010045 ;; +;; + +{ .mfi + nop.m 999 + fadd.s3 FR_C_hi = FR_sigma_C,FR_Tmp_C // For Tmp_C < sigma_C case + nop.i 999 } +;; + { .mmf - nop.m 999 -(p0) setf.exp FR_Tmp2_C = GR_TEMP4 -(p0) fmpy.s3 FR_Tmp_B = FR_X,FR_p_2 ;; + setf.exp FR_Tmp2_C = GR_TEMP4 + nop.m 999 + fmpy.s3 FR_Tmp_B = FR_X,FR_p_2 } +;; + { .mfi - nop.m 999 -(p0) fcmp.ge.unc.s1 p12, p9 = FR_Tmp_C,FR_sigma_C - nop.i 999 ;; + addl GR_BASE = @ltoff(Constants_Bits_of_pi_by_2#), gp + fcmp.ge.s1 p12, p9 = FR_Tmp_C,FR_sigma_C + nop.i 999 } { .mfi - nop.m 999 -(p0) fmpy.s3 FR_Tmp_A = FR_X,FR_p_3 - nop.i 999 ;; + nop.m 999 + fmpy.s3 FR_Tmp_A = FR_X,FR_p_3 + nop.i 99 } +;; + { .mfi - nop.m 999 + ld8 GR_BASE = [GR_BASE] (p12) mov FR_C_hi = FR_Tmp_C - nop.i 999 ;; + nop.i 999 } { .mfi -(p0) addl GR_BASE = @ltoff(Constants_Bits_of_pi_by_2#), gp -(p9) fadd.s3 FR_C_hi = FR_sigma_C,FR_Tmp_C - nop.i 999 + nop.m 999 +(p9) fsub.s1 FR_C_hi = FR_C_hi,FR_sigma_C + nop.i 999 } ;; @@ -1114,97 +1128,106 @@ alloc r34 = ar.pfs,2,34,0,0 // Load from table // End If - -{ .mmi - ld8 GR_BASE = [GR_BASE] +{ .mfi nop.m 999 + fmpy.s1 FR_p_4 = FR_p_4,FR_ScaleP4 nop.i 999 } -;; - - { .mfi -(p0) ldfe FR_D_hi = [GR_BASE],16 -(p0) fmpy.s1 FR_p_4 = FR_p_4,FR_ScaleP4 - nop.i 999 ;; + nop.m 999 + fadd.s3 FR_B_hi = FR_sigma_B,FR_Tmp_B // For Tmp_B < sigma_B case + nop.i 999 } +;; + { .mfi -(p0) ldfe FR_D_lo = [GR_BASE],0 -(p0) fcmp.ge.unc.s1 p13, p10 = FR_Tmp_B,FR_sigma_B - nop.i 999 ;; + nop.m 999 + fadd.s3 FR_A_hi = FR_sigma_A,FR_Tmp_A // For Tmp_A < sigma_A case + nop.i 999 } +;; + { .mfi - nop.m 999 -(p13) mov FR_B_hi = FR_Tmp_B - nop.i 999 + nop.m 999 + fcmp.ge.s1 p13, p10 = FR_Tmp_B,FR_sigma_B + nop.i 999 } { .mfi - nop.m 999 -(p12) fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi - nop.i 999 ;; + nop.m 999 + fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fadd.s3 FR_B_hi = FR_sigma_B,FR_Tmp_B - nop.i 999 + ldfe FR_D_hi = [GR_BASE],16 + fcmp.ge.s1 p14, p11 = FR_Tmp_A,FR_sigma_A + nop.i 999 } +;; + { .mfi - nop.m 999 -(p9) fsub.s1 FR_C_hi = FR_C_hi,FR_sigma_C - nop.i 999 ;; + ldfe FR_D_lo = [GR_BASE] +(p13) mov FR_B_hi = FR_Tmp_B + nop.i 999 } { .mfi - nop.m 999 -(p0) fcmp.ge.unc.s1 p14, p11 = FR_Tmp_A,FR_sigma_A - nop.i 999 ;; + nop.m 999 +(p10) fsub.s1 FR_B_hi = FR_B_hi,FR_sigma_B + nop.i 999 } +;; + { .mfi - nop.m 999 + nop.m 999 (p14) mov FR_A_hi = FR_Tmp_A - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p11) fadd.s3 FR_A_hi = FR_sigma_A,FR_Tmp_A - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p9) fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi -(p0) cmp.eq.unc p12,p9 = 0x1,GR_sgn_x -} -{ .mfi - nop.m 999 -(p13) fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi - nop.i 999 ;; + nop.i 999 } { .mfi - nop.m 999 -(p10) fsub.s1 FR_B_hi = FR_B_hi,FR_sigma_B - nop.i 999 + nop.m 999 +(p11) fsub.s1 FR_A_hi = FR_A_hi,FR_sigma_A + nop.i 999 } -{ .mfi - nop.m 999 +;; + // Note that C_hi is of integer value. We need only the // last few bits. Thus we can ensure C_hi is never a big // integer, freeing us from overflow worry. // Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70); // Tmp_C is the upper portion of C_hi -(p0) fadd.s3 FR_Tmp_C = FR_C_hi,FR_Tmp2_C - nop.i 999 ;; +{ .mfi + nop.m 999 + fadd.s3 FR_Tmp_C = FR_C_hi,FR_Tmp2_C + tbit.z p12,p9 = GR_Exp_x, 17 } +;; + { .mfi - nop.m 999 -(p14) fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi - nop.i 999 + nop.m 999 + fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi + nop.i 999 } { .mfi - nop.m 999 -(p11) fsub.s1 FR_A_hi = FR_A_hi,FR_sigma_A - nop.i 999 ;; + nop.m 999 + fadd.s3 FR_A = FR_B_hi,FR_C_lo + nop.i 999 } +;; + +{ .mfi + nop.m 999 + fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi + nop.i 999 +} +;; + { .mfi - nop.m 999 + nop.m 999 + fsub.s1 FR_Tmp_C = FR_Tmp_C,FR_Tmp2_C + nop.i 999 +} +;; + // ******************* // Step 2. Get N and f // ******************* @@ -1215,168 +1238,213 @@ alloc r34 = ar.pfs,2,34,0,0 // A := fadd.fpsr3( B_hi, C_lo ) // B := max( B_hi, C_lo ) // b := min( B_hi, C_lo ) -(p0) fadd.s3 FR_A = FR_B_hi,FR_C_lo - nop.i 999 -} { .mfi - nop.m 999 -(p10) fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi - nop.i 999 ;; + nop.m 999 + fmax.s1 FR_B = FR_B_hi,FR_C_lo + nop.i 999 } +;; + +// We use a right-shift trick to get the integer part of A into the rightmost +// bits of the significand by adding 1.1000..00 * 2^63. This operation is good +// if |A| < 2^61, which it is in this case. We are doing this to save a few +// cycles over using fcvt.fx followed by fnorm. The second step of the trick +// is to subtract the same constant to float the rounded integer into a fp reg. + { .mfi - nop.m 999 -(p0) fsub.s1 FR_Tmp_C = FR_Tmp_C,FR_Tmp2_C - nop.i 999 ;; + nop.m 999 +// N := round_to_nearest_integer_value( A ); + fma.s1 FR_N_fix = FR_A, f1, FR_RSHF + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fmax.s1 FR_B = FR_B_hi,FR_C_lo - nop.i 999 ;; + nop.m 999 + fmin.s1 FR_b = FR_B_hi,FR_C_lo + nop.i 999 } { .mfi - nop.m 999 -(p0) fmin.s1 FR_b = FR_B_hi,FR_C_lo - nop.i 999 + nop.m 999 +// C_hi := C_hi - Tmp_C ...0 <= C_hi < 2^7 + fsub.s1 FR_C_hi = FR_C_hi,FR_Tmp_C + nop.i 999 } +;; + { .mfi - nop.m 999 -(p11) fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi - nop.i 999 ;; + nop.m 999 +// a := (B - A) + b: Exact - note that a is either 0 or 2^(-64). + fsub.s1 FR_a = FR_B,FR_A + nop.i 999 } +;; + { .mfi - nop.m 999 -// N := round_to_nearest_integer_value( A ); -(p0) fcvt.fx.s1 FR_N = FR_A - nop.i 999 ;; + nop.m 999 + fms.s1 FR_N = FR_N_fix, f1, FR_RSHF + nop.i 999 } +;; + { .mfi - nop.m 999 -// C_hi := C_hi - Tmp_C ...0 <= C_hi < 2^7 -(p0) fsub.s1 FR_C_hi = FR_C_hi,FR_Tmp_C - nop.i 999 ;; + nop.m 999 + fadd.s1 FR_a = FR_a,FR_b + nop.i 999 } +;; + +// f := A - N; Exact because lsb(A) >= 2^(-64) and |f| <= 1/2. +// N := convert to integer format( C_hi + N ); +// M := P_0 * x_lo; +// N := N + M; { .mfi - nop.m 999 -// a := (B - A) + b: Exact - note that a is either 0 or 2^(-64). -(p0) fsub.s1 FR_a = FR_B,FR_A - nop.i 999 ;; + nop.m 999 + fsub.s1 FR_f = FR_A,FR_N + nop.i 999 } { .mfi - nop.m 999 -// f := A - N; Exact because lsb(A) >= 2^(-64) and |f| <= 1/2. -(p0) fnorm.s1 FR_N = FR_N - nop.i 999 + nop.m 999 + fadd.s1 FR_N = FR_N,FR_C_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fadd.s1 FR_a = FR_a,FR_b - nop.i 999 ;; + nop.m 999 +(p9) fsub.s1 FR_D_hi = f0, FR_D_hi + nop.i 999 } { .mfi - nop.m 999 -(p0) fsub.s1 FR_f = FR_A,FR_N - nop.i 999 + nop.m 999 +(p9) fsub.s1 FR_D_lo = f0, FR_D_lo + nop.i 999 } +;; + { .mfi - nop.m 999 -// N := convert to integer format( C_hi + N ); -// M := P_0 * x_lo; -// N := N + M; -(p0) fadd.s1 FR_N = FR_N,FR_C_hi - nop.i 999 ;; + nop.m 999 + fadd.s1 FR_g = FR_A_hi,FR_B_lo // For Case 1, g=A_hi+B_lo + nop.i 999 } { .mfi - nop.m 999 -// f = f + a Exact because a is 0 or 2^(-64); -// the msb of the sum is <= 1/2 and lsb >= 2^(-64). -(p0) fadd.s1 FR_f = FR_f,FR_a - nop.i 999 + nop.m 999 + fadd.s3 FR_A = FR_A_hi,FR_B_lo // For Case 2, A=A_hi+B_lo w/ sf3 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Create 2**(-33) -// -(p0) fcvt.fx.s1 FR_N = FR_N - nop.i 999 ;; + mov GR_Temp = 0x0FFCD // For Case 2, exponent of 2^-50 + fmax.s1 FR_B = FR_A_hi,FR_B_lo // For Case 2, B=max(A_hi,B_lo) + nop.i 999 } +;; + +// f = f + a Exact because a is 0 or 2^(-64); +// the msb of the sum is <= 1/2 and lsb >= 2^(-64). { .mfi - nop.m 999 -(p0) fabs FR_f_abs = FR_f - nop.i 999 ;; + setf.exp FR_TWOM50 = GR_Temp // For Case 2, form 2^-50 + fcvt.fx.s1 FR_N = FR_N + nop.i 999 } { .mfi -(p0) getf.sig GR_N = FR_N - nop.f 999 - nop.i 999 ;; + nop.m 999 + fadd.s1 FR_f = FR_f,FR_a + nop.i 999 } -{ .mii - nop.m 999 - nop.i 999 ;; -(p0) add GR_N = GR_N,GR_M ;; +;; + +{ .mfi + nop.m 999 + fmin.s1 FR_b = FR_A_hi,FR_B_lo // For Case 2, b=min(A_hi,B_lo) + nop.i 999 } -// If sgn_x == 1 (that is original x was negative) -// N := 2^10 - N -// this maintains N to be non-negative, but still -// equivalent to the (negated N) mod 4. -// End If -{ .mii -(p12) sub GR_N = GR_Temp,GR_N -(p0) cmp.eq.unc p12,p9 = 0x0,GR_sgn_x ;; - nop.i 999 +;; + +{ .mfi + nop.m 999 + fsub.s1 FR_a = FR_B,FR_A // For Case 2, a=B-A + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fcmp.ge.unc.s1 p13, p10 = FR_f_abs,FR_TWOM33 - nop.i 999 ;; + nop.m 999 + fadd.s1 FR_s_hi = FR_f,FR_g // For Case 1, s_hi=f+g + nop.i 999 } { .mfi - nop.m 999 -(p9) fsub.s1 FR_D_hi = f0, FR_D_hi - nop.i 999 ;; + nop.m 999 + fadd.s1 FR_f_hi = FR_A,FR_f // For Case 2, f_hi=A+f + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fadd.s3 FR_A = FR_A_hi,FR_B_lo - nop.i 999 + nop.m 999 + fabs FR_f_abs = FR_f + nop.i 999 } +;; + { .mfi - nop.m 999 -(p13) fadd.s1 FR_g = FR_A_hi,FR_B_lo - nop.i 999 ;; + getf.sig GR_N = FR_N + fsetc.s3 0x7F,0x40 // Reset sf3 to user settings + td + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fmax.s1 FR_B = FR_A_hi,FR_B_lo - nop.i 999 + nop.m 999 + fsub.s1 FR_s_lo = FR_f,FR_s_hi // For Case 1, s_lo=f-s_hi + nop.i 999 } { .mfi - nop.m 999 -(p9) fsub.s1 FR_D_lo = f0, FR_D_lo - nop.i 999 ;; + nop.m 999 + fsub.s1 FR_f_lo = FR_f,FR_f_hi // For Case 2, f_lo=f-f_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fmin.s1 FR_b = FR_A_hi,FR_B_lo - nop.i 999 ;; + nop.m 999 + fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi // For Case 1, r_hi=s_hi*D_hi + nop.i 999 } { .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x40 - nop.i 999 + nop.m 999 + fadd.s1 FR_a = FR_a,FR_b // For Case 2, a=a+b + nop.i 999 } -{ .mlx - nop.m 999 -(p10) movl GR_Temp = 0x000000000000FFCD ;; +;; + + +// If sgn_x == 1 (that is original x was negative) +// N := 2^10 - N +// this maintains N to be non-negative, but still +// equivalent to the (negated N) mod 4. +// End If +{ .mfi + add GR_N = GR_N,GR_M + fcmp.ge.s1 p13, p10 = FR_f_abs,FR_TWOM33 + mov GR_Temp = 0x00400 } -{ .mmf - nop.m 999 -(p10) setf.exp FR_TWOM50 = GR_Temp -(p10) fadd.s1 FR_f_hi = FR_A,FR_f ;; +;; + +{ .mfi +(p9) sub GR_N = GR_Temp,GR_N + fadd.s1 FR_s_lo = FR_s_lo,FR_g // For Case 1, s_lo=s_lo+g + nop.i 999 } { .mfi - nop.m 999 -// a := (B - A) + b Exact. + nop.m 999 + fadd.s1 FR_f_lo = FR_f_lo,FR_A // For Case 2, f_lo=f_lo+A + nop.i 999 +} +;; + +// a := (B - A) + b Exact. // Note that a is either 0 or 2^(-128). // f_hi := A + f; // f_lo := (f - f_hi) + A @@ -1387,68 +1455,32 @@ alloc r34 = ar.pfs,2,34,0,0 // exact. If f = -2^(-64), then A + f is exact. Hence // f-f_hi is -A exactly, giving f_lo = 0. // f_lo := f_lo + a; -(p10) fsub.s1 FR_a = FR_B,FR_A - nop.i 999 -} -{ .mfi - nop.m 999 -(p13) fadd.s1 FR_s_hi = FR_f,FR_g - nop.i 999 ;; -} -{ .mlx - nop.m 999 + // If |f| >= 2^(-33) // Case 1 // CASE := 1 // g := A_hi + B_lo; // s_hi := f + g; // s_lo := (f - s_hi) + g; -(p13) movl GR_CASE = 0x1 ;; -} -{ .mlx - nop.m 999 // Else // Case 2 // CASE := 2 // A := fadd.fpsr3( A_hi, B_lo ) // B := max( A_hi, B_lo ) // b := min( A_hi, B_lo ) -(p10) movl GR_CASE = 0x2 -} -{ .mfi - nop.m 999 -(p10) fsub.s1 FR_f_lo = FR_f,FR_f_hi - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p10) fadd.s1 FR_a = FR_a,FR_b - nop.i 999 -} -{ .mfi - nop.m 999 -(p13) fsub.s1 FR_s_lo = FR_f,FR_s_hi - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p13) fadd.s1 FR_s_lo = FR_s_lo,FR_g - nop.i 999 ;; -} + { .mfi - nop.m 999 -(p10) fcmp.ge.unc.s1 p14, p11 = FR_f_abs,FR_TWOM50 - nop.i 999 ;; + nop.m 999 +(p10) fcmp.ge.unc.s1 p14, p11 = FR_f_abs,FR_TWOM50 + nop.i 999 } { .mfi - nop.m 999 -// -// Create 2**(-50) -(p10) fadd.s1 FR_f_lo = FR_f_lo,FR_A - nop.i 999 ;; + nop.m 999 +(p13) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi //For Case 1, r_lo=s_hi*D_hi+r_hi + nop.i 999 } -{ .mfi - nop.m 999 +;; + // If |f| >= 2^(-50) then // s_hi := f_hi; // s_lo := f_lo; @@ -1457,84 +1489,90 @@ alloc r34 = ar.pfs,2,34,0,0 // s_hi := f_hi + f_lo // s_lo := (f_hi - s_hi) + f_lo // End If -(p14) mov FR_s_hi = FR_f_hi - nop.i 999 ;; +{ .mfi + nop.m 999 +(p14) mov FR_s_hi = FR_f_hi + nop.i 999 } { .mfi - nop.m 999 -(p10) fadd.s1 FR_f_lo = FR_f_lo,FR_a - nop.i 999 ;; + nop.m 999 +(p10) fadd.s1 FR_f_lo = FR_f_lo,FR_a + nop.i 999 } +;; + { .mfi - nop.m 999 -(p14) mov FR_s_lo = FR_f_lo - nop.i 999 + nop.m 999 +(p14) mov FR_s_lo = FR_f_lo + nop.i 999 } { .mfi - nop.m 999 -(p11) fadd.s1 FR_f_lo = FR_f_lo,FR_A_lo - nop.i 999 ;; + nop.m 999 +(p11) fadd.s1 FR_f_lo = FR_f_lo,FR_A_lo + nop.i 999 } +;; + { .mfi - nop.m 999 -(p11) fma.s1 FR_f_lo = FR_X,FR_p_4,FR_f_lo - nop.i 999 ;; + nop.m 999 +(p11) fma.s1 FR_f_lo = FR_X,FR_p_4,FR_f_lo + nop.i 999 } +;; + { .mfi - nop.m 999 -(p11) fadd.s1 FR_s_hi = FR_f_hi,FR_f_lo - nop.i 999 ;; + nop.m 999 +(p13) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo //For Case 1, r_lo=s_hi*D_lo+r_lo + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 +(p11) fadd.s1 FR_s_hi = FR_f_hi,FR_f_lo + nop.i 999 +} +;; + // r_hi := s_hi*D_hi // r_lo := s_hi*D_hi - r_hi with fma // r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi -(p0) fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi - nop.i 999 -} { .mfi - nop.m 999 -(p11) fsub.s1 FR_s_lo = FR_f_hi,FR_s_hi - nop.i 999 ;; + nop.m 999 +(p10) fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi + nop.i 999 } { .mfi - nop.m 999 -(p0) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi - nop.i 999 + nop.m 999 +(p11) fsub.s1 FR_s_lo = FR_f_hi,FR_s_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -(p11) fadd.s1 FR_s_lo = FR_s_lo,FR_f_lo - nop.i 999 ;; -} -{ .mmi - nop.m 999 ;; -// Return N, r_hi, r_lo -// We do not return CASE -(p0) stfe [GR_Address_of_Outputs] = FR_r_hi,16 - nop.i 999 ;; + nop.m 999 +(p10) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo - nop.i 999 ;; + nop.m 999 +(p11) fadd.s1 FR_s_lo = FR_s_lo,FR_f_lo + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fma.s1 FR_r_lo = FR_s_lo,FR_D_hi,FR_r_lo - nop.i 999 ;; -} -{ .mmi - nop.m 999 ;; -(p0) stfe [GR_Address_of_Outputs] = FR_r_lo,-16 - nop.i 999 + nop.m 999 +(p10) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo + nop.i 999 } -{ .mib - nop.m 999 - nop.i 999 -(p0) br.ret.sptk b0 ;; +;; + +// Return N, r_hi, r_lo +// We do not return CASE +{ .mfb + nop.m 999 + fma.s1 FR_r_lo = FR_s_lo,FR_D_hi,FR_r_lo + br.ret.sptk b0 } +;; -.endp __libm_pi_by_2_reduce -ASM_SIZE_DIRECTIVE(__libm_pi_by_2_reduce) +.endp __libm_pi_by_2_reduce# |