about summary refs log tree commit diff
path: root/sysdeps/ia64/fpu/libm_reduce.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/ia64/fpu/libm_reduce.S')
-rw-r--r--sysdeps/ia64/fpu/libm_reduce.S1492
1 files changed, 765 insertions, 727 deletions
diff --git a/sysdeps/ia64/fpu/libm_reduce.S b/sysdeps/ia64/fpu/libm_reduce.S
index 1c7f4e1e88..8bdf91d6de 100644
--- a/sysdeps/ia64/fpu/libm_reduce.S
+++ b/sysdeps/ia64/fpu/libm_reduce.S
@@ -1,10 +1,10 @@
 .file "libm_reduce.s"
 
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
 // All rights reserved.
-// 
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
@@ -20,304 +20,310 @@
 // * The name of Intel Corporation may not be used to endorse or promote
 // products derived from this software without specific prior written
 // permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// 
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
 // Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at 
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
 //
-// History:  02/02/00 Initial Version
+// History:
+// 02/02/00 Initial Version
+// 05/13/02 Rescheduled for speed, changed interface to pass
+//          parameters in fp registers
+// 02/10/03 Reordered header: .section, .global, .proc, .align;
+//          used data8 for long double data storage
 //
-// *********************************************************************
-// *********************************************************************
+//*********************************************************************
+//*********************************************************************
 //
 // Function:   __libm_pi_by_two_reduce(x) return r, c, and N where
 //             x = N * pi/4 + (r+c) , where |r+c| <= pi/4.
 //             This function is not designed to be used by the
 //             general user.
 //
-// *********************************************************************
+//*********************************************************************
 //
 // Accuracy:       Returns double-precision values
 //
-// *********************************************************************
+//*********************************************************************
 //
 // Resources Used:
 //
-//    Floating-Point Registers: f32-f70
+//    Floating-Point Registers:
+//      f8  = Input x, return value r
+//      f9  = return value c
+//      f32-f70
 //
 //    General Purpose Registers:
 //      r8  = return value N
-//      r32 = Address of x
-//      r33 = Address of where to place r and then c 
 //      r34-r64
 //
 //    Predicate Registers:      p6-p14
 //
-// *********************************************************************
+//*********************************************************************
 //
 // IEEE Special Conditions:
 //
-//    No condions should be raised. 
+//    No condions should be raised.
 //
-// *********************************************************************
+//*********************************************************************
 //
 // I. Introduction
 // ===============
 //
 // For the forward trigonometric functions sin, cos, sincos, and
-// tan, the original algorithms for IA 64 handle arguments up to 
+// tan, the original algorithms for IA 64 handle arguments up to
 // 1 ulp less than 2^63 in magnitude. For double-extended arguments x,
-// |x| >= 2^63, this routine returns CASE, N and r_hi, r_lo where
-// 
+// |x| >= 2^63, this routine returns N and r_hi, r_lo where
+//
 //    x  is accurately approximated by
 //    2*K*pi  +  N * pi/2  +  r_hi + r_lo,  |r_hi+r_lo| <= pi/4.
 //    CASE = 1 or 2.
 //    CASE is 1 unless |r_hi + r_lo| < 2^(-33).
-// 
+//
 // The exact value of K is not determined, but that information is
 // not required in trigonometric function computations.
-// 
-// We first assume the argument x in question satisfies x >= 2^(63). 
+//
+// We first assume the argument x in question satisfies x >= 2^(63).
 // In particular, it is positive. Negative x can be handled by symmetry:
-// 
+//
 //   -x  is accurately approximated by
 //         -2*K*pi  +  (-N) * pi/2  -  (r_hi + r_lo),  |r_hi+r_lo| <= pi/4.
-// 
+//
 // The idea of the reduction is that
-// 
-// 	x  *  2/pi   =   N_big  +  N  +  f,	|f| <= 1/2
-// 
+//
+//       x  *  2/pi   =   N_big  +  N  +  f,      |f| <= 1/2
+//
 // Moreover, for double extended x, |f| >= 2^(-75). (This is an
 // non-obvious fact found by enumeration using a special algorithm
-// involving continued fraction.) The algorithm described below 
+// involving continued fraction.) The algorithm described below
 // calculates N and an accurate approximation of f.
-// 
-// Roughly speaking, an appropriate 256-bit (4 X 64) portion of 
+//
+// Roughly speaking, an appropriate 256-bit (4 X 64) portion of
 // 2/pi is multiplied with x to give the desired information.
-// 
+//
 // II. Representation of 2/PI
 // ==========================
-// 
+//
 // The value of 2/pi in binary fixed-point is
-// 
+//
 //            .101000101111100110......
-// 
+//
 // We store 2/pi in a table, starting at the position corresponding
-// to bit position 63 
-// 
+// to bit position 63
+//
 //   bit position  63 62 ... 0   -1 -2 -3 -4 -5 -6 -7  ....  -16576
-// 
-// 	 	0  0  ... 0  . 1  0  1  0  1  0  1  ....    X
-//                 
+//
+//              0  0  ... 0  . 1  0  1  0  1  0  1  ....    X
+//
 //                              ^
-// 	     	             |__ implied binary pt 
-// 
+//                               |__ implied binary pt
+//
 // III. Algorithm
 // ==============
-// 
+//
 // This describes the algorithm in the most natural way using
-// unsigned interger multiplication. The implementation section 
+// unsigned interger multiplication. The implementation section
 // describes how the integer arithmetic is simulated.
-// 
+//
 // STEP 0. Initialization
 // ----------------------
-// 
-// Let the input argument x be 
-// 
+//
+// Let the input argument x be
+//
 //     x = 2^m * ( 1. b_1 b_2 b_3 ... b_63 ),  63 <= m <= 16383.
-// 
-// The first crucial step is to fetch four 64-bit portions of 2/pi. 
+//
+// The first crucial step is to fetch four 64-bit portions of 2/pi.
 // To fulfill this goal, we calculate the bit position L of the
 // beginning of these 256-bit quantity by
-// 
+//
 //     L :=  62 - m.
-// 
-// Note that -16321 <= L <= -1 because 63 <= m <= 16383; and that 
+//
+// Note that -16321 <= L <= -1 because 63 <= m <= 16383; and that
 // the storage of 2/pi is adequate.
-// 
+//
 // Fetch P_1, P_2, P_3, P_4 beginning at bit position L thus:
-// 
+//
 //      bit position  L  L-1  L-2    ...  L-63
-// 
+//
 //      P_1    =      b   b    b     ...    b
-// 
+//
 // each b can be 0 or 1. Also, let P_0 be the two bits correspoding to
 // bit positions L+2 and L+1. So, when each of the P_j is interpreted
 // with appropriate scaling, we have
 //
 //      2/pi  =  P_big  + P_0 + (P_1 + P_2 + P_3 + P_4)  +  P_small
-// 
+//
 // Note that P_big and P_small can be ignored. The reasons are as follow.
 // First, consider P_big. If P_big = 0, we can certainly ignore it.
-// Otherwise, P_big >= 2^(L+3). Now, 
-// 
+// Otherwise, P_big >= 2^(L+3). Now,
+//
 //        P_big * ulp(x) >=  2^(L+3) * 2^(m-63)
-// 		      >=  2^(65-m  +  m-63 )
-// 		      >=  2^2
-// 
+//                   >=  2^(65-m  +  m-63 )
+//                   >=  2^2
+//
 // Thus, P_big * x is an integer of the form 4*K. So
-// 
-// 	x = 4*K * (pi/2) + x*(P_0 + P_1 + P_2 + P_3 + P_4)*(pi/2)
+//
+//       x = 4*K * (pi/2) + x*(P_0 + P_1 + P_2 + P_3 + P_4)*(pi/2)
 //                + x*P_small*(pi/2).
-// 
+//
 // Hence, P_big*x corresponds to information that can be ignored for
 // trigonometic function evaluation.
-// 
+//
 // Next, we must estimate the effect of ignoring P_small. The absolute
 // error made by ignoring P_small is bounded by
-// 
+//
 //       |P_small * x|  <=  ulp(P_4) * x
-// 		     <=  2^(L-255) * 2^(m+1)
-// 		     <=  2^(62-m-255 + m + 1)
-// 		     <=  2^(-192)
-// 
-// Since for double-extended precision, x * 2/pi = integer + f, 
+//                  <=  2^(L-255) * 2^(m+1)
+//                  <=  2^(62-m-255 + m + 1)
+//                  <=  2^(-192)
+//
+// Since for double-extended precision, x * 2/pi = integer + f,
 // 0.5 >= |f| >= 2^(-75), the relative error introduced by ignoring
 // P_small is bounded by 2^(-192+75) <= 2^(-117), which is acceptable.
-// 
+//
 // Further note that if x is split into x_hi + x_lo where x_lo is the
 // two bits corresponding to bit positions 2^(m-62) and 2^(m-63); then
-// 
-// 	P_0 * x_hi 
-// 
+//
+//       P_0 * x_hi
+//
 // is also an integer of the form 4*K; and thus can also be ignored.
 // Let M := P_0 * x_lo which is a small integer. The main part of the
 // calculation is really the multiplication of x with the four pieces
 // P_1, P_2, P_3, and P_4.
-// 
+//
 // Unless the reduced argument is extremely small in magnitude, it
 // suffices to carry out the multiplication of x with P_1, P_2, and
-// P_3. x*P_4 will be carried out and added on as a correction only 
+// P_3. x*P_4 will be carried out and added on as a correction only
 // when it is found to be needed. Note also that x*P_4 need not be
 // computed exactly. A straightforward multiplication suffices since
 // the rounding error thus produced would be bounded by 2^(-3*64),
 // that is 2^(-192) which is small enough as the reduced argument
 // is bounded from below by 2^(-75).
-// 
+//
 // Now that we have four 64-bit data representing 2/pi and a
 // 64-bit x. We first need to calculate a highly accurate product
 // of x and P_1, P_2, P_3. This is best understood as integer
 // multiplication.
-// 
-// 
+//
+//
 // STEP 1. Multiplication
 // ----------------------
-// 
-// 
+//
+//
 //                     ---------   ---------   ---------
-// 	             |  P_1  |   |  P_2  |   |  P_3  |
-// 	             ---------   ---------   ---------
-// 
+//                    |  P_1  |   |  P_2  |   |  P_3  |
+//                    ---------   ---------   ---------
+//
+//                                            ---------
+//             X                              |   X   |
 //                                            ---------
-// 	      X                              |   X   |
-// 	                                     ---------
 //      ----------------------------------------------------
 //
 //                                 ---------   ---------
-//	                         |  A_hi |   |  A_lo |
-//	                         ---------   ---------
+//                               |  A_hi |   |  A_lo |
+//                               ---------   ---------
 //
 //
 //                    ---------   ---------
-//	             |  B_hi |   |  B_lo |
-//	             ---------   ---------
+//                   |  B_hi |   |  B_lo |
+//                   ---------   ---------
 //
 //
-//        ---------   ---------  
-//	 |  C_hi |   |  C_lo |  
-//	 ---------   ---------  
+//        ---------   ---------
+//       |  C_hi |   |  C_lo |
+//       ---------   ---------
 //
 //      ====================================================
 //       ---------   ---------   ---------   ---------
-//	 |  S_0  |   |  S_1  |   |  S_2  |   |  S_3  |
-//	 ---------   ---------   ---------   ---------
+//       |  S_0  |   |  S_1  |   |  S_2  |   |  S_3  |
+//       ---------   ---------   ---------   ---------
 //
 //
 //
 // STEP 2. Get N and f
 // -------------------
-// 
+//
 // Conceptually, after the individual pieces S_0, S_1, ..., are obtained,
 // we have to sum them and obtain an integer part, N, and a fraction, f.
 // Here, |f| <= 1/2, and N is an integer. Note also that N need only to
 // be known to module 2^k, k >= 2. In the case when |f| is small enough,
 // we would need to add in the value x*P_4.
-// 
-// 
+//
+//
 // STEP 3. Get reduced argument
 // ----------------------------
-// 
+//
 // The value f is not yet the reduced argument that we seek. The
 // equation
-// 
-// 	x * 2/pi = 4K  + N  + f
-// 
+//
+//       x * 2/pi = 4K  + N  + f
+//
 // says that
-// 
+//
 //         x   =  2*K*pi  + N * pi/2  +  f * (pi/2).
-// 
+//
 // Thus, the reduced argument is given by
-// 
-// 	reduced argument =  f * pi/2.
-// 
+//
+//       reduced argument =  f * pi/2.
+//
 // This multiplication must be performed to extra precision.
-// 
+//
 // IV. Implementation
 // ==================
-// 
+//
 // Step 0. Initialization
 // ----------------------
-// 
+//
 // Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x.
-// 
+//
 // In memory, 2/pi is stored contigously as
-// 
+//
 //  0x00000000 0x00000000 0xA2F....
 //                       ^
 //                       |__ implied binary bit
-// 
+//
 // Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m. Thus
 // -1 <= L <= -16321. We fetch from memory 5 integer pieces of data.
-// 
+//
 // P_0 is the two bits corresponding to bit positions L+2 and L+1
 // P_1 is the 64-bit starting at bit position  L
 // P_2 is the 64-bit starting at bit position  L-64
 // P_3 is the 64-bit starting at bit position  L-128
 // P_4 is the 64-bit starting at bit position  L-192
-// 
+//
 // For example, if m = 63, P_0 would be 0 and P_1 would look like
 // 0xA2F...
-// 
+//
 // If m = 65, P_0 would be the two msb of 0xA, thus, P_0 is 10 in binary.
-// P_1 in binary would be  1 0 0 0 1 0 1 1 1 1 .... 
-//  
+// P_1 in binary would be  1 0 0 0 1 0 1 1 1 1 ....
+//
 // Step 1. Multiplication
 // ----------------------
-// 
+//
 // At this point, P_1, P_2, P_3, P_4 are integers. They are
 // supposed to be interpreted as
-// 
+//
 //  2^(L-63)     * P_1;
 //  2^(L-63-64)  * P_2;
 //  2^(L-63-128) * P_3;
 // 2^(L-63-192) * P_4;
-// 
+//
 // Since each of them need to be multiplied to x, we would scale
 // both x and the P_j's by some convenient factors: scale each
 // of P_j's up by 2^(63-L), and scale x down by 2^(L-63).
-// 
+//
 //   p_1 := fcvt.xf ( P_1 )
 //   p_2 := fcvt.xf ( P_2 ) * 2^(-64)
 //   p_3 := fcvt.xf ( P_3 ) * 2^(-128)
@@ -325,30 +331,30 @@
 //   x   := replace exponent of x by -1
 //          because 2^m    * 1.xxxx...xxx  * 2^(L-63)
 //          is      2^(-1) * 1.xxxx...xxx
-// 
+//
 // We are now faced with the task of computing the following
-// 
+//
 //                     ---------   ---------   ---------
-// 	             |  P_1  |   |  P_2  |   |  P_3  |
-// 	             ---------   ---------   ---------
-// 
+//                    |  P_1  |   |  P_2  |   |  P_3  |
+//                    ---------   ---------   ---------
+//
 //                                             ---------
-// 	      X                              |   X   |
-// 	                                     ---------
+//             X                              |   X   |
+//                                            ---------
 //       ----------------------------------------------------
-// 
+//
 //                                 ---------   ---------
-// 	                         |  A_hi |   |  A_lo |
-// 	                         ---------   ---------
-// 
+//                                |  A_hi |   |  A_lo |
+//                                ---------   ---------
+//
 //                     ---------   ---------
-// 	             |  B_hi |   |  B_lo |
-// 	             ---------   ---------
-// 
-//         ---------   ---------  
-// 	 |  C_hi |   |  C_lo |  
-// 	 ---------   ---------  
-// 
+//                    |  B_hi |   |  B_lo |
+//                    ---------   ---------
+//
+//         ---------   ---------
+//        |  C_hi |   |  C_lo |
+//        ---------   ---------
+//
 //      ====================================================
 //       -----------   ---------   ---------   ---------
 //       |    S_0  |   |  S_1  |   |  S_2  |   |  S_3  |
@@ -357,108 +363,108 @@
 //        |          |___ binary point
 //        |
 //        |___ possibly one more bit
-// 
+//
 // Let FPSR3 be set to round towards zero with widest precision
-// and exponent range. Unless an explicit FPSR is given, 
+// and exponent range. Unless an explicit FPSR is given,
 // round-to-nearest with widest precision and exponent range is
 // used.
-// 
+//
 // Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_C := 2^(-65).
-// 
+//
 // Tmp_C := fmpy.fpsr3( x, p_1 );
 // If Tmp_C >= sigma_C then
 //    C_hi := Tmp_C;
 //    C_lo := x*p_1 - C_hi ...fma, exact
 // Else
 //    C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C
-// 			...subtraction is exact, regardless
-// 			...of rounding direction
+//                   ...subtraction is exact, regardless
+//                   ...of rounding direction
 //    C_lo := x*p_1 - C_hi ...fma, exact
 // End If
-// 
+//
 // Tmp_B := fmpy.fpsr3( x, p_2 );
 // If Tmp_B >= sigma_B then
 //    B_hi := Tmp_B;
 //    B_lo := x*p_2 - B_hi ...fma, exact
 // Else
 //    B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B
-// 			...subtraction is exact, regardless
-// 			...of rounding direction
+//                   ...subtraction is exact, regardless
+//                   ...of rounding direction
 //    B_lo := x*p_2 - B_hi ...fma, exact
 // End If
-// 
+//
 // Tmp_A := fmpy.fpsr3( x, p_3 );
 // If Tmp_A >= sigma_A then
 //    A_hi := Tmp_A;
 //    A_lo := x*p_3 - A_hi ...fma, exact
 // Else
 //    A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A
-// 			...subtraction is exact, regardless
-// 			...of rounding direction
+//                   ...subtraction is exact, regardless
+//                   ...of rounding direction
 //    A_lo := x*p_3 - A_hi ...fma, exact
 // End If
-// 
+//
 // ...Note that C_hi is of integer value. We need only the
-// ...last few bits. Thus we can ensure C_hi is never a big 
+// ...last few bits. Thus we can ensure C_hi is never a big
 // ...integer, freeing us from overflow worry.
-// 
+//
 // Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70);
 // ...Tmp_C is the upper portion of C_hi
 // C_hi := C_hi - Tmp_C
 // ...0 <= C_hi < 2^7
-// 
+//
 // Step 2. Get N and f
 // -------------------
-// 
-// At this point, we have all the components to obtain 
+//
+// At this point, we have all the components to obtain
 // S_0, S_1, S_2, S_3 and thus N and f. We start by adding
 // C_lo and B_hi. This sum together with C_hi gives a good
-// estimation of N and f. 
-// 
+// estimation of N and f.
+//
 // A := fadd.fpsr3( B_hi, C_lo )
 // B := max( B_hi, C_lo )
 // b := min( B_hi, C_lo )
-// 
-// a := (B - A) + b	...exact. Note that a is either 0
-// 			...or 2^(-64).
-// 
+//
+// a := (B - A) + b      ...exact. Note that a is either 0
+//                   ...or 2^(-64).
+//
 // N := round_to_nearest_integer_value( A );
-// f := A - N;		...exact because lsb(A) >= 2^(-64)
-// 			...and |f| <= 1/2.
-// 
-// f := f + a		...exact because a is 0 or 2^(-64);
-// 			...the msb of the sum is <= 1/2
-// 			...lsb >= 2^(-64).
-// 
+// f := A - N;            ...exact because lsb(A) >= 2^(-64)
+//                   ...and |f| <= 1/2.
+//
+// f := f + a            ...exact because a is 0 or 2^(-64);
+//                   ...the msb of the sum is <= 1/2
+//                   ...lsb >= 2^(-64).
+//
 // N := convert to integer format( C_hi + N );
 // M := P_0 * x_lo;
 // N := N + M;
-// 
+//
 // If sgn_x == 1 (that is original x was negative)
 // N := 2^10 - N
 // ...this maintains N to be non-negative, but still
 // ...equivalent to the (negated N) mod 4.
 // End If
-// 
+//
 // If |f| >= 2^(-33)
-// 
+//
 // ...Case 1
 // CASE := 1
 // g := A_hi + B_lo;
 // s_hi := f + g;
 // s_lo := (f - s_hi) + g;
-// 
+//
 // Else
-// 
+//
 // ...Case 2
 // CASE := 2
 // A := fadd.fpsr3( A_hi, B_lo )
 // B := max( A_hi, B_lo )
 // b := min( A_hi, B_lo )
-// 
-// a := (B - A) + b	...exact. Note that a is either 0
-// 			...or 2^(-128).
-// 
+//
+// a := (B - A) + b      ...exact. Note that a is either 0
+//                   ...or 2^(-128).
+//
 // f_hi := A + f;
 // f_lo := (f - f_hi) + A;
 // ...this is exact.
@@ -468,9 +474,9 @@
 // ...If f = 2^(-64), f-f_hi involves cancellation and is
 // ...exact. If f = -2^(-64), then A + f is exact. Hence
 // ...f-f_hi is -A exactly, giving f_lo = 0.
-// 
+//
 // f_lo := f_lo + a;
-// 
+//
 // If |f| >= 2^(-50) then
 //    s_hi := f_hi;
 //    s_lo := f_lo;
@@ -479,117 +485,111 @@
 //    s_hi := f_hi + f_lo
 //    s_lo := (f_hi - s_hi) + f_lo
 // End If
-// 
+//
 // End If
-// 
+//
 // Step 3. Get reduced argument
 // ----------------------------
-// 
+//
 // If sgn_x == 0 (that is original x is positive)
-// 
+//
 // D_hi := Pi_by_2_hi
 // D_lo := Pi_by_2_lo
 // ...load from table
-// 
+//
 // Else
-// 
+//
 // D_hi := neg_Pi_by_2_hi
 // D_lo := neg_Pi_by_2_lo
 // ...load from table
 // End If
-// 
+//
 // r_hi :=  s_hi*D_hi
-// r_lo :=  s_hi*D_hi - r_hi   	...fma
+// r_lo :=  s_hi*D_hi - r_hi         ...fma
 // r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi
-// 
-// Return  CASE, N, r_hi, r_lo
-//
-
-#include "libm_support.h"
-
-FR_X       = f32 
-FR_N       = f33 
-FR_p_1     = f34 
-FR_TWOM33  = f35 
-FR_TWOM50  = f36 
-FR_g       = f37 
-FR_p_2     = f38 
-FR_f       = f39 
-FR_s_lo    = f40 
-FR_p_3     = f41 
-FR_f_abs   = f42 
-FR_D_lo    = f43 
-FR_p_4     = f44 
-FR_D_hi    = f45 
-FR_Tmp2_C  = f46 
-FR_s_hi    = f47 
-FR_sigma_A = f48 
-FR_A       = f49 
-FR_sigma_B = f50 
-FR_B       = f51 
-FR_sigma_C = f52 
-FR_b       = f53 
-FR_ScaleP2 = f54 
-FR_ScaleP3 = f55 
-FR_ScaleP4 = f56 
-FR_Tmp_A   = f57 
-FR_Tmp_B   = f58 
-FR_Tmp_C   = f59 
-FR_A_hi    = f60 
-FR_f_hi    = f61 
-FR_r_hi    = f62 
-FR_A_lo    = f63 
-FR_B_hi    = f64 
-FR_a       = f65 
-FR_B_lo    = f66 
+//
+// Return  N, r_hi, r_lo
+//
+FR_input_X = f8
+FR_r_hi    = f8
+FR_r_lo    = f9
+
+FR_X       = f32
+FR_N       = f33
+FR_p_1     = f34
+FR_TWOM33  = f35
+FR_TWOM50  = f36
+FR_g       = f37
+FR_p_2     = f38
+FR_f       = f39
+FR_s_lo    = f40
+FR_p_3     = f41
+FR_f_abs   = f42
+FR_D_lo    = f43
+FR_p_4     = f44
+FR_D_hi    = f45
+FR_Tmp2_C  = f46
+FR_s_hi    = f47
+FR_sigma_A = f48
+FR_A       = f49
+FR_sigma_B = f50
+FR_B       = f51
+FR_sigma_C = f52
+FR_b       = f53
+FR_ScaleP2 = f54
+FR_ScaleP3 = f55
+FR_ScaleP4 = f56
+FR_Tmp_A   = f57
+FR_Tmp_B   = f58
+FR_Tmp_C   = f59
+FR_A_hi    = f60
+FR_f_hi    = f61
+FR_RSHF    = f62
+FR_A_lo    = f63
+FR_B_hi    = f64
+FR_a       = f65
+FR_B_lo    = f66
 FR_f_lo    = f67
-FR_r_lo    = f68 
-FR_C_hi    = f69 
-FR_C_lo    = f70 
+FR_N_fix   = f68
+FR_C_hi    = f69
+FR_C_lo    = f70
 
 GR_N       = r8
-GR_Address_of_Input  = r32 
-GR_Address_of_Outputs = r33 
-GR_Exp_x   = r36 
-GR_Temp    = r37 
-GR_BIASL63 = r38 
+GR_Exp_x   = r36
+GR_Temp    = r37
+GR_BIASL63 = r38
 GR_CASE    = r39
-GR_x_lo    = r40 
-GR_sgn_x   = r41 
+GR_x_lo    = r40
+GR_sgn_x   = r41
 GR_M       = r42
 GR_BASE    = r43
 GR_LENGTH1 = r44
 GR_LENGTH2 = r45
 GR_ASUB    = r46
 GR_P_0     = r47
-GR_P_1     = r48 
-GR_P_2     = r49 
-GR_P_3     = r50 
-GR_P_4     = r51 
+GR_P_1     = r48
+GR_P_2     = r49
+GR_P_3     = r50
+GR_P_4     = r51
 GR_START   = r52
 GR_SEGMENT = r53
 GR_A       = r54
-GR_B       = r55 
+GR_B       = r55
 GR_C       = r56
 GR_D       = r57
 GR_E       = r58
-GR_TEMP1   = r59 
-GR_TEMP2   = r60 
-GR_TEMP3   = r61 
-GR_TEMP4   = r62 
+GR_TEMP1   = r59
+GR_TEMP2   = r60
+GR_TEMP3   = r61
+GR_TEMP4   = r62
 GR_TEMP5   = r63
 GR_TEMP6   = r64
+GR_rshf    = r64
 
+RODATA
 .align 64
 
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-
-Constants_Bits_of_2_by_pi:
-ASM_TYPE_DIRECTIVE(Constants_Bits_of_2_by_pi,@object)
+LOCAL_OBJECT_START(Constants_Bits_of_2_by_pi)
 data8 0x0000000000000000,0xA2F9836E4E441529
 data8 0xFC2757D1F534DDC0,0xDB6295993C439041
 data8 0xFE5163ABDEBBC561,0xB7246E3A424DD2E0
@@ -721,34 +721,33 @@ data8 0xB5D6DF8261DD9602,0x36169F3AC4A1A283
 data8 0x6DED727A8D39A9B8,0x825C326B5B2746ED
 data8 0x34007700D255F4FC,0x4D59018071E0E13F
 data8 0x89B295F364A8F1AE,0xA74B38FC4CEAB2BB
-ASM_SIZE_DIRECTIVE(Constants_Bits_of_2_by_pi)
+LOCAL_OBJECT_END(Constants_Bits_of_2_by_pi)
 
-Constants_Bits_of_pi_by_2:
-ASM_TYPE_DIRECTIVE(Constants_Bits_of_pi_by_2,@object)
-data4 0x2168C234,0xC90FDAA2,0x00003FFF,0x00000000
-data4 0x80DC1CD1,0xC4C6628B,0x00003FBF,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_Bits_of_pi_by_2)
+LOCAL_OBJECT_START(Constants_Bits_of_pi_by_2)
+data8 0xC90FDAA22168C234,0x00003FFF
+data8 0xC4C6628B80DC1CD1,0x00003FBF
+LOCAL_OBJECT_END(Constants_Bits_of_pi_by_2)
 
 .section .text
-.proc __libm_pi_by_2_reduce#
 .global __libm_pi_by_2_reduce#
-.align 64 
+.proc __libm_pi_by_2_reduce#
+.align 32
 
-__libm_pi_by_2_reduce: 
+__libm_pi_by_2_reduce:
 
-//    X is at the address in Address_of_Input
-//    Place the two-piece result at the address in Address_of_Outputs
-//    r followed by c
-//    N is returned
+//    X is in f8
+//    Place the two-piece result r (r_hi) in f8 and c (r_lo) in f9
+//    N is returned in r8
 
-{ .mmf
-alloc  r34 = ar.pfs,2,34,0,0
-(p0)  ldfe  FR_X = [GR_Address_of_Input]
-(p0)  fsetc.s3 0x00,0x7F ;;
+{ .mfi
+      alloc  r34 = ar.pfs,2,34,0,0
+      fsetc.s3 0x00,0x7F     // Set sf3 to round to zero, 82-bit prec, td, ftz
+      nop.i 999
 }
-{ .mlx
-	nop.m 999
-(p0)  movl GR_BIASL63 = 0x1003E
+{ .mfi
+      addl           GR_BASE   = @ltoff(Constants_Bits_of_2_by_pi#), gp
+      nop.f 999
+      mov GR_BIASL63 = 0x1003E
 }
 ;;
 
@@ -765,73 +764,61 @@ alloc  r34 = ar.pfs,2,34,0,0
 //    Address_BASE = shladd(SEGMENT,3) + BASE
 
 
-
 { .mmi
-      nop.m 999
-(p0)  addl           GR_BASE   = @ltoff(Constants_Bits_of_2_by_pi#), gp
-      nop.i 999
+      getf.exp GR_Exp_x = FR_input_X
+      ld8 GR_BASE = [GR_BASE]
+      mov GR_TEMP5 = 0x0FFFE
 }
 ;;
 
+//    Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65).
 { .mmi
-      ld8 GR_BASE = [GR_BASE]
-      nop.m 999
+      getf.sig GR_x_lo = FR_input_X
+      mov GR_TEMP6 = 0x0FFBE
       nop.i 999
 }
 ;;
 
-
-{ .mlx
-	nop.m 999
-(p0)  movl GR_TEMP5 = 0x000000000000FFFE
-}
-{ .mmi
-	nop.m 999 ;;
-(p0)  setf.exp FR_sigma_B = GR_TEMP5
-	nop.i 999
-}
-{ .mlx
-	nop.m 999
-(p0)  movl GR_TEMP6 = 0x000000000000FFBE ;;
-}
-//    Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65).
-{ .mfi
-(p0)  setf.exp FR_sigma_A = GR_TEMP6
-	nop.f 999
-	nop.i 999 ;;
-}
-//    Special Code for testing DE arguments 
-//    (p0)  movl GR_BIASL63 = 0x0000000000013FFE
-//    (p0)  movl GR_x_lo = 0xFFFFFFFFFFFFFFFF
-//    (p0)  setf.exp FR_X = GR_BIASL63
-//    (p0)  setf.sig FR_ScaleP3 = GR_x_lo
-//    (p0)  fmerge.se FR_X = FR_X,FR_ScaleP3
+//    Special Code for testing DE arguments
+//          movl GR_BIASL63 = 0x0000000000013FFE
+//          movl GR_x_lo = 0xFFFFFFFFFFFFFFFF
+//          setf.exp FR_X = GR_BIASL63
+//          setf.sig FR_ScaleP3 = GR_x_lo
+//          fmerge.se FR_X = FR_X,FR_ScaleP3
 //    Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x.
 //    2/pi is stored contigously as
 //    0x00000000 0x00000000.0xA2F....
 //    M = EXP - BIAS  ( M >= 63)
 //    Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m.
 //    Thus -1 <= L <= -16321.
-{ .mmf
-(p0)  getf.exp GR_Exp_x = FR_X
-(p0)  getf.sig GR_x_lo = FR_X
-(p0)  fabs FR_X = FR_X ;;
+{ .mmi
+      setf.exp FR_sigma_B = GR_TEMP5
+      setf.exp FR_sigma_A = GR_TEMP6
+      extr.u GR_M = GR_Exp_x,0,17
 }
+;;
+
 { .mii
-(p0)  and  GR_x_lo = 0x03,GR_x_lo
-(p0)  extr.u GR_M = GR_Exp_x,0,17 ;;
-(p0)  sub  GR_START = GR_M,GR_BIASL63
+      and  GR_x_lo = 0x03,GR_x_lo
+      sub  GR_START = GR_M,GR_BIASL63
+      add  GR_BASE = 8,GR_BASE           // To effectively add 1 to SEGMENT
 }
-{ .mmi
-	nop.m 999 ;;
-(p0)  and  GR_LENGTH1 = 0x3F,GR_START
-(p0)  shr.u  GR_SEGMENT = GR_START,6
+;;
+
+{ .mii
+      and  GR_LENGTH1 = 0x3F,GR_START
+      shr.u  GR_SEGMENT = GR_START,6
+      nop.i 999
 }
+;;
+
 { .mmi
-	nop.m 999 ;;
-(p0)  add  GR_SEGMENT = 0x1,GR_SEGMENT
-(p0)  sub  GR_LENGTH2 = 0x40,GR_LENGTH1
+      shladd GR_BASE = GR_SEGMENT,3,GR_BASE
+      sub  GR_LENGTH2 = 0x40,GR_LENGTH1
+      cmp.le p6,p7 = 0x2,GR_LENGTH1
 }
+;;
+
 //    P_0 is the two bits corresponding to bit positions L+2 and L+1
 //    P_1 is the 64-bit starting at bit position  L
 //    P_2 is the 64-bit starting at bit position  L-64
@@ -849,13 +836,13 @@ alloc  r34 = ar.pfs,2,34,0,0
 //    P_4 is made up of Clo and Dhi
 //    P_4 = deposit Dlo, position 0, length2  into P_4, position length1
 //          deposit Ehi, position length2, length1 into P_4, position 0
-{ .mmi
-(p0)  cmp.le.unc p6,p7 = 0x2,GR_LENGTH1 ;;
-(p0)  shladd GR_BASE = GR_SEGMENT,3,GR_BASE
-(p7)  cmp.eq.unc p8,p9 = 0x1,GR_LENGTH1 ;;
+{ .mfi
+      ld8 GR_A = [GR_BASE],8
+      fabs FR_X = FR_input_X
+(p7)  cmp.eq.unc p8,p9 = 0x1,GR_LENGTH1
 }
-{ .mmi
-	nop.m 999
+;;
+
 //    ld_64 A at Base and increment Base by 8
 //    ld_64 B at Base and increment Base by 8
 //    ld_64 C at Base and increment Base by 8
@@ -866,31 +853,35 @@ alloc  r34 = ar.pfs,2,34,0,0
 //    A, B, C, D, and E look like    | length1 | length2   |
 //                                    ---------------------
 //                                       hi        lo
-(p0)  ld8 GR_A = [GR_BASE],8
-(p0)  extr.u GR_sgn_x = GR_Exp_x,17,1 ;;
-}
-{ .mmf
-	nop.m 999
-(p0)  ld8 GR_B = [GR_BASE],8
-(p0)  fmerge.se FR_X = FR_sigma_B,FR_X ;;
+{ .mlx
+      ld8 GR_B = [GR_BASE],8
+      movl GR_rshf = 0x43e8000000000000   // 1.10000 2^63 for right shift N_fix
 }
-{ .mii
-(p0)  ld8 GR_C = [GR_BASE],8
-(p8)  extr.u GR_Temp = GR_A,63,1 ;;
-(p0)  shl GR_TEMP1 = GR_A,GR_LENGTH1
+;;
+
+{ .mmi
+      ld8 GR_C = [GR_BASE],8
+      nop.m 999
+(p8)  extr.u GR_Temp = GR_A,63,1
 }
-{ .mii
-(p0)  ld8 GR_D = [GR_BASE],8
+;;
+
 //    If length1 >= 2,
 //       P_0 = deposit Ahi, position length2, 2 bit into P_0 at position 0.
-(p6)     shr.u GR_P_0 = GR_A,GR_LENGTH2 ;;
-(p0)  shl GR_TEMP2 = GR_B,GR_LENGTH1
+{ .mii
+      ld8 GR_D = [GR_BASE],8
+      shl GR_TEMP1 = GR_A,GR_LENGTH1   // MM instruction
+(p6)  shr.u GR_P_0 = GR_A,GR_LENGTH2   // MM instruction
 }
+;;
+
 { .mii
-(p0)  ld8 GR_E = [GR_BASE],-40
-(p0)  shr.u GR_P_1 = GR_B,GR_LENGTH2 ;;
-(p0)  shr.u GR_P_2 = GR_C,GR_LENGTH2
+      ld8 GR_E = [GR_BASE],-40
+      shl GR_TEMP2 = GR_B,GR_LENGTH1   // MM instruction
+      shr.u GR_P_1 = GR_B,GR_LENGTH2   // MM instruction
 }
+;;
+
 //    Else
 //       Load 16 bit of ASUB from (Base_Address_of_A - 2)
 //       P_0 = ASUB & 0x3
@@ -900,43 +891,56 @@ alloc  r34 = ar.pfs,2,34,0,0
 //          Deposit element 63 from Ahi and place in element 0 of P_0.
 //       Endif
 //    Endif
+
 { .mii
 (p7)  ld2 GR_ASUB = [GR_BASE],8
-(p0)  shl GR_TEMP3 = GR_C,GR_LENGTH1 ;;
-(p0)  shl GR_TEMP4 = GR_D,GR_LENGTH1
+      shl GR_TEMP3 = GR_C,GR_LENGTH1   // MM instruction
+      shr.u GR_P_2 = GR_C,GR_LENGTH2   // MM instruction
 }
+;;
+
 { .mii
-	nop.m 999
-(p0)  shr.u GR_P_3 = GR_D,GR_LENGTH2 ;;
-(p0)  shr.u GR_P_4 = GR_E,GR_LENGTH2
+      setf.d FR_RSHF = GR_rshf         // Form right shift const 1.100 * 2^63
+      shl GR_TEMP4 = GR_D,GR_LENGTH1   // MM instruction
+      shr.u GR_P_3 = GR_D,GR_LENGTH2   // MM instruction
 }
-{ .mii
+;;
+
+{ .mmi
 (p7)  and GR_P_0 = 0x03,GR_ASUB
-(p6)     and GR_P_0 = 0x03,GR_P_0 ;;
-(p0)  or GR_P_1 = GR_P_1,GR_TEMP1
+(p6)  and GR_P_0 = 0x03,GR_P_0
+      shr.u GR_P_4 = GR_E,GR_LENGTH2   // MM instruction
 }
+;;
+
 { .mmi
-(p8)  and GR_P_0 = 0x1,GR_P_0 ;;
-(p0)  or GR_P_2 = GR_P_2,GR_TEMP2
-(p8)  shl GR_P_0 = GR_P_0,0x1 ;;
-}
-{ .mii
-	nop.m 999
-(p0)  or GR_P_3 = GR_P_3,GR_TEMP3
-(p8)  or GR_P_0 = GR_P_0,GR_Temp
+      nop.m 999
+      or GR_P_1 = GR_P_1,GR_TEMP1
+(p8)  and GR_P_0 = 0x1,GR_P_0
 }
+;;
+
 { .mmi
-(p0)  setf.sig FR_p_1 = GR_P_1 ;;
-(p0)  setf.sig FR_p_2 = GR_P_2
-(p0)  or GR_P_4 = GR_P_4,GR_TEMP4 ;;
+      setf.sig FR_p_1 = GR_P_1
+      or GR_P_2 = GR_P_2,GR_TEMP2
+(p8)  shladd GR_P_0 = GR_P_0,1,GR_Temp
 }
+;;
+
+{ .mmf
+      setf.sig FR_p_2 = GR_P_2
+      or GR_P_3 = GR_P_3,GR_TEMP3
+      fmerge.se FR_X = FR_sigma_B,FR_X
+}
+;;
+
 { .mmi
-	nop.m 999 ;;
-(p0)  setf.sig FR_p_3 = GR_P_3
-(p0)  pmpy2.r GR_M = GR_P_0,GR_x_lo
+      setf.sig FR_p_3 = GR_P_3
+      or GR_P_4 = GR_P_4,GR_TEMP4
+      pmpy2.r GR_M = GR_P_0,GR_x_lo
 }
-{ .mlx
-(p0)  setf.sig FR_p_4 = GR_P_4
+;;
+
 //    P_1, P_2, P_3, P_4 are integers. They should be
 //    2^(L-63)     * P_1;
 //    2^(L-63-64)  * P_2;
@@ -954,18 +958,18 @@ alloc  r34 = ar.pfs,2,34,0,0
 //             |  P_1  |   |  P_2  |   |  P_3  |
 //             ---------   ---------   ---------
 //                                           ---------
-//	      X                              |   X   |
-//	                                     ---------
+//            X                              |   X   |
+//                                           ---------
 //      ----------------------------------------------------
 //                               ---------   ---------
-//	                         |  A_hi |   |  A_lo |
-//	                         ---------   ---------
+//                               |  A_hi |   |  A_lo |
+//                               ---------   ---------
 //                   ---------   ---------
-//	             |  B_hi |   |  B_lo |
-//	             ---------   ---------
+//                   |  B_hi |   |  B_lo |
+//                   ---------   ---------
+//       ---------   ---------
+//       |  C_hi |   |  C_lo |
 //       ---------   ---------
-//	 |  C_hi |   |  C_lo |
-//	 ---------   ---------
 //     ====================================================
 //    -----------   ---------   ---------   ---------
 //    |    S_0  |   |  S_1  |   |  S_2  |   |  S_3  |
@@ -977,52 +981,55 @@ alloc  r34 = ar.pfs,2,34,0,0
 //    and exponent range. Unless an explicit FPSR is given,
 //    round-to-nearest with widest precision and exponent range is
 //    used.
-(p0)  movl GR_TEMP1 = 0x000000000000FFBF
-}
 { .mmi
-	nop.m 999 ;;
-(p0)  setf.exp FR_ScaleP2 = GR_TEMP1
-	nop.i 999
-}
-{ .mlx
-	nop.m 999
-(p0)  movl GR_TEMP4 = 0x000000000001003E
+      setf.sig FR_p_4 = GR_P_4
+      mov GR_TEMP1 = 0x0FFBF
+      nop.i 999
 }
+;;
+
 { .mmi
-	nop.m 999 ;;
-(p0)  setf.exp FR_sigma_C = GR_TEMP4
-	nop.i 999
+      setf.exp FR_ScaleP2 = GR_TEMP1
+      mov GR_TEMP2 = 0x0FF7F
+      nop.i 999
 }
-{ .mlx
-	nop.m 999
-(p0)  movl GR_TEMP2 = 0x000000000000FF7F ;;
+;;
+
+{ .mmi
+      setf.exp FR_ScaleP3 = GR_TEMP2
+      mov GR_TEMP4 = 0x1003E
+      nop.i 999
 }
+;;
+
 { .mmf
-	nop.m 999
-(p0)  setf.exp FR_ScaleP3 = GR_TEMP2
-(p0)  fcvt.xuf.s1 FR_p_1 = FR_p_1 ;;
+      setf.exp FR_sigma_C = GR_TEMP4
+      mov GR_Temp = 0x0FFDE
+      fcvt.xuf.s1 FR_p_1 = FR_p_1
 }
+;;
+
 { .mfi
-	nop.m 999
-(p0)  fcvt.xuf.s1 FR_p_2 = FR_p_2
-	nop.i 999
-}
-{ .mlx
-	nop.m 999
-(p0)  movl GR_Temp = 0x000000000000FFDE ;;
-}
-{ .mmf
-	nop.m 999
-(p0)  setf.exp FR_TWOM33 = GR_Temp
-(p0)  fcvt.xuf.s1 FR_p_3 = FR_p_3 ;;
+      setf.exp FR_TWOM33 = GR_Temp
+      fcvt.xuf.s1 FR_p_2 = FR_p_2
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p0)  fcvt.xuf.s1 FR_p_4 = FR_p_4
-	nop.i 999 ;;
+      nop.m 999
+      fcvt.xuf.s1 FR_p_3 = FR_p_3
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
+      nop.m 999
+      fcvt.xuf.s1 FR_p_4 = FR_p_4
+      nop.i 999
+}
+;;
+
 //    Tmp_C := fmpy.fpsr3( x, p_1 );
 //    Tmp_B := fmpy.fpsr3( x, p_2 );
 //    Tmp_A := fmpy.fpsr3( x, p_3 );
@@ -1048,55 +1055,62 @@ alloc  r34 = ar.pfs,2,34,0,0
 //      Exact, regardless ...of rounding direction
 //      A_lo := x*p_3 - A_hi ...fma, exact
 //    Endif
-(p0)  fmpy.s3 FR_Tmp_C = FR_X,FR_p_1
-	nop.i 999 ;;
-}
 { .mfi
-	nop.m 999
-(p0)  fmpy.s1 FR_p_2 = FR_p_2,FR_ScaleP2
-	nop.i 999
-}
-{ .mlx
-	nop.m 999
-(p0)  movl GR_Temp = 0x0000000000000400
+      nop.m 999
+      fmpy.s3 FR_Tmp_C = FR_X,FR_p_1
+      nop.i 999
 }
-{ .mlx
-	nop.m 999
-(p0)  movl GR_TEMP3 = 0x000000000000FF3F ;;
+;;
+
+{ .mfi
+      mov GR_TEMP3 = 0x0FF3F
+      fmpy.s1 FR_p_2 = FR_p_2,FR_ScaleP2
+      nop.i 999
 }
+;;
+
 { .mmf
-	nop.m 999
-(p0)  setf.exp FR_ScaleP4 = GR_TEMP3
-(p0)  fmpy.s1 FR_p_3 = FR_p_3,FR_ScaleP3 ;;
+      setf.exp FR_ScaleP4 = GR_TEMP3
+      mov GR_TEMP4 = 0x10045
+      fmpy.s1 FR_p_3 = FR_p_3,FR_ScaleP3
 }
-{ .mlx
-	nop.m 999
-(p0)  movl GR_TEMP4 = 0x0000000000010045 ;;
+;;
+
+{ .mfi
+      nop.m 999
+      fadd.s3 FR_C_hi = FR_sigma_C,FR_Tmp_C   // For Tmp_C < sigma_C case
+      nop.i 999
 }
+;;
+
 { .mmf
-	nop.m 999
-(p0)  setf.exp FR_Tmp2_C = GR_TEMP4
-(p0)  fmpy.s3 FR_Tmp_B = FR_X,FR_p_2 ;;
+      setf.exp FR_Tmp2_C = GR_TEMP4
+      nop.m 999
+      fmpy.s3 FR_Tmp_B = FR_X,FR_p_2
 }
+;;
+
 { .mfi
-	nop.m 999
-(p0)  fcmp.ge.unc.s1 p12,  p9 = FR_Tmp_C,FR_sigma_C
-	nop.i 999 ;;
+      addl           GR_BASE   = @ltoff(Constants_Bits_of_pi_by_2#), gp
+      fcmp.ge.s1 p12,  p9 = FR_Tmp_C,FR_sigma_C
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p0)  fmpy.s3 FR_Tmp_A = FR_X,FR_p_3
-	nop.i 999 ;;
+      nop.m 999
+      fmpy.s3 FR_Tmp_A = FR_X,FR_p_3
+      nop.i 99
 }
+;;
+
 { .mfi
-	nop.m 999
+      ld8 GR_BASE = [GR_BASE]
 (p12) mov FR_C_hi = FR_Tmp_C
-	nop.i 999 ;;
+      nop.i 999
 }
 { .mfi
-(p0)  addl           GR_BASE   = @ltoff(Constants_Bits_of_pi_by_2#), gp
-(p9)  fadd.s3 FR_C_hi = FR_sigma_C,FR_Tmp_C
-	nop.i 999
+      nop.m 999
+(p9)  fsub.s1 FR_C_hi = FR_C_hi,FR_sigma_C
+      nop.i 999
 }
 ;;
 
@@ -1114,97 +1128,106 @@ alloc  r34 = ar.pfs,2,34,0,0
 //      Load from table
 //   End If
 
-
-{ .mmi
-      ld8 GR_BASE = [GR_BASE]
+{ .mfi
       nop.m 999
+      fmpy.s1 FR_p_4 = FR_p_4,FR_ScaleP4
       nop.i 999
 }
-;;
-
-
 { .mfi
-(p0) ldfe FR_D_hi = [GR_BASE],16
-(p0)  fmpy.s1 FR_p_4 = FR_p_4,FR_ScaleP4
-	nop.i 999 ;;
+      nop.m 999
+      fadd.s3 FR_B_hi = FR_sigma_B,FR_Tmp_B     // For Tmp_B < sigma_B case
+      nop.i 999
 }
+;;
+
 { .mfi
-(p0) ldfe FR_D_lo = [GR_BASE],0
-(p0)  fcmp.ge.unc.s1 p13, p10 = FR_Tmp_B,FR_sigma_B
-	nop.i 999 ;;
+      nop.m 999
+      fadd.s3 FR_A_hi = FR_sigma_A,FR_Tmp_A     // For Tmp_A < sigma_A case
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p13) mov FR_B_hi = FR_Tmp_B
-	nop.i 999
+      nop.m 999
+      fcmp.ge.s1 p13, p10 = FR_Tmp_B,FR_sigma_B
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p12) fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi
-	nop.i 999 ;;
+      nop.m 999
+      fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p10) fadd.s3 FR_B_hi = FR_sigma_B,FR_Tmp_B
-	nop.i 999
+      ldfe FR_D_hi = [GR_BASE],16
+      fcmp.ge.s1 p14, p11 = FR_Tmp_A,FR_sigma_A
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p9)  fsub.s1 FR_C_hi = FR_C_hi,FR_sigma_C
-	nop.i 999 ;;
+      ldfe FR_D_lo = [GR_BASE]
+(p13) mov FR_B_hi = FR_Tmp_B
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p0)  fcmp.ge.unc.s1 p14, p11 = FR_Tmp_A,FR_sigma_A
-	nop.i 999 ;;
+      nop.m 999
+(p10) fsub.s1 FR_B_hi = FR_B_hi,FR_sigma_B
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
+      nop.m 999
 (p14) mov FR_A_hi = FR_Tmp_A
-	nop.i 999 ;;
-}
-{ .mfi
-	nop.m 999
-(p11) fadd.s3 FR_A_hi = FR_sigma_A,FR_Tmp_A
-	nop.i 999 ;;
-}
-{ .mfi
-	nop.m 999
-(p9)  fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi
-(p0)  cmp.eq.unc p12,p9 = 0x1,GR_sgn_x
-}
-{ .mfi
-	nop.m 999
-(p13) fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi
-	nop.i 999 ;;
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p10) fsub.s1 FR_B_hi = FR_B_hi,FR_sigma_B
-	nop.i 999
+      nop.m 999
+(p11) fsub.s1 FR_A_hi = FR_A_hi,FR_sigma_A
+      nop.i 999
 }
-{ .mfi
-	nop.m 999
+;;
+
 //    Note that C_hi is of integer value. We need only the
 //    last few bits. Thus we can ensure C_hi is never a big
 //    integer, freeing us from overflow worry.
 //    Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70);
 //    Tmp_C is the upper portion of C_hi
-(p0)  fadd.s3 FR_Tmp_C = FR_C_hi,FR_Tmp2_C
-	nop.i 999 ;;
+{ .mfi
+      nop.m 999
+      fadd.s3 FR_Tmp_C = FR_C_hi,FR_Tmp2_C
+      tbit.z p12,p9 = GR_Exp_x, 17
 }
+;;
+
 { .mfi
-	nop.m 999
-(p14) fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi
-	nop.i 999
+      nop.m 999
+      fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p11) fsub.s1 FR_A_hi = FR_A_hi,FR_sigma_A
-	nop.i 999 ;;
+      nop.m 999
+      fadd.s3 FR_A = FR_B_hi,FR_C_lo
+      nop.i 999
 }
+;;
+
+{ .mfi
+      nop.m 999
+      fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi
+      nop.i 999
+}
+;;
+
 { .mfi
-	nop.m 999
+      nop.m 999
+      fsub.s1 FR_Tmp_C = FR_Tmp_C,FR_Tmp2_C
+      nop.i 999
+}
+;;
+
 //    *******************
 //    Step 2. Get N and f
 //    *******************
@@ -1215,168 +1238,213 @@ alloc  r34 = ar.pfs,2,34,0,0
 //    A := fadd.fpsr3( B_hi, C_lo )
 //    B := max( B_hi, C_lo )
 //    b := min( B_hi, C_lo )
-(p0)  fadd.s3 FR_A = FR_B_hi,FR_C_lo
-	nop.i 999
-}
 { .mfi
-	nop.m 999
-(p10) fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi
-	nop.i 999 ;;
+      nop.m 999
+      fmax.s1 FR_B = FR_B_hi,FR_C_lo
+      nop.i 999
 }
+;;
+
+// We use a right-shift trick to get the integer part of A into the rightmost
+// bits of the significand by adding 1.1000..00 * 2^63.  This operation is good
+// if |A| < 2^61, which it is in this case.  We are doing this to save a few
+// cycles over using fcvt.fx followed by fnorm.  The second step of the trick
+// is to subtract the same constant to float the rounded integer into a fp reg.
+
 { .mfi
-	nop.m 999
-(p0)  fsub.s1 FR_Tmp_C = FR_Tmp_C,FR_Tmp2_C
-	nop.i 999 ;;
+      nop.m 999
+//    N := round_to_nearest_integer_value( A );
+      fma.s1 FR_N_fix = FR_A, f1, FR_RSHF
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p0)  fmax.s1 FR_B = FR_B_hi,FR_C_lo
-	nop.i 999 ;;
+      nop.m 999
+      fmin.s1 FR_b = FR_B_hi,FR_C_lo
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p0)  fmin.s1 FR_b = FR_B_hi,FR_C_lo
-	nop.i 999
+      nop.m 999
+//    C_hi := C_hi - Tmp_C ...0 <= C_hi < 2^7
+      fsub.s1 FR_C_hi = FR_C_hi,FR_Tmp_C
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p11) fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi
-	nop.i 999 ;;
+      nop.m 999
+//    a := (B - A) + b: Exact - note that a is either 0 or 2^(-64).
+      fsub.s1 FR_a = FR_B,FR_A
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-//    N := round_to_nearest_integer_value( A );
-(p0)  fcvt.fx.s1 FR_N = FR_A
-	nop.i 999 ;;
+      nop.m 999
+      fms.s1 FR_N = FR_N_fix, f1, FR_RSHF
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-//    C_hi := C_hi - Tmp_C ...0 <= C_hi < 2^7
-(p0)  fsub.s1 FR_C_hi = FR_C_hi,FR_Tmp_C
-	nop.i 999 ;;
+      nop.m 999
+      fadd.s1 FR_a = FR_a,FR_b
+      nop.i 999
 }
+;;
+
+//    f := A - N; Exact because lsb(A) >= 2^(-64) and |f| <= 1/2.
+//    N := convert to integer format( C_hi + N );
+//    M := P_0 * x_lo;
+//    N := N + M;
 { .mfi
-	nop.m 999
-//    a := (B - A) + b: Exact - note that a is either 0 or 2^(-64).
-(p0)  fsub.s1 FR_a = FR_B,FR_A
-	nop.i 999 ;;
+      nop.m 999
+      fsub.s1 FR_f = FR_A,FR_N
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-//    f := A - N; Exact because lsb(A) >= 2^(-64) and |f| <= 1/2.
-(p0)  fnorm.s1 FR_N = FR_N
-	nop.i 999
+      nop.m 999
+      fadd.s1 FR_N = FR_N,FR_C_hi
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p0)  fadd.s1 FR_a = FR_a,FR_b
-	nop.i 999 ;;
+      nop.m 999
+(p9)  fsub.s1 FR_D_hi = f0, FR_D_hi
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p0)  fsub.s1 FR_f = FR_A,FR_N
-	nop.i 999
+      nop.m 999
+(p9)  fsub.s1 FR_D_lo = f0, FR_D_lo
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-//    N := convert to integer format( C_hi + N );
-//    M := P_0 * x_lo;
-//    N := N + M;
-(p0)  fadd.s1 FR_N = FR_N,FR_C_hi
-	nop.i 999 ;;
+      nop.m 999
+      fadd.s1 FR_g = FR_A_hi,FR_B_lo          // For Case 1, g=A_hi+B_lo
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-//    f = f + a	Exact because a is 0 or 2^(-64);
-//    the msb of the sum is <= 1/2 and lsb >= 2^(-64).
-(p0)  fadd.s1 FR_f = FR_f,FR_a
-	nop.i 999
+      nop.m 999
+      fadd.s3 FR_A = FR_A_hi,FR_B_lo          // For Case 2, A=A_hi+B_lo w/ sf3
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-//
-//    Create 2**(-33)
-//
-(p0)  fcvt.fx.s1 FR_N = FR_N
-	nop.i 999 ;;
+      mov GR_Temp = 0x0FFCD                   // For Case 2, exponent of 2^-50
+      fmax.s1 FR_B = FR_A_hi,FR_B_lo          // For Case 2, B=max(A_hi,B_lo)
+      nop.i 999
 }
+;;
+
+//    f = f + a      Exact because a is 0 or 2^(-64);
+//    the msb of the sum is <= 1/2 and lsb >= 2^(-64).
 { .mfi
-	nop.m 999
-(p0)  fabs FR_f_abs = FR_f
-	nop.i 999 ;;
+      setf.exp FR_TWOM50 = GR_Temp            // For Case 2, form 2^-50
+      fcvt.fx.s1 FR_N = FR_N
+      nop.i 999
 }
 { .mfi
-(p0)  getf.sig GR_N = FR_N
-	nop.f 999
-	nop.i 999 ;;
+      nop.m 999
+      fadd.s1 FR_f = FR_f,FR_a
+      nop.i 999
 }
-{ .mii
-	nop.m 999
-	nop.i 999 ;;
-(p0)  add GR_N = GR_N,GR_M ;;
+;;
+
+{ .mfi
+      nop.m 999
+      fmin.s1 FR_b = FR_A_hi,FR_B_lo          // For Case 2, b=min(A_hi,B_lo)
+      nop.i 999
 }
-//    If sgn_x == 1 (that is original x was negative)
-//       N := 2^10 - N
-//       this maintains N to be non-negative, but still
-//       equivalent to the (negated N) mod 4.
-//    End If
-{ .mii
-(p12) sub GR_N = GR_Temp,GR_N
-(p0) cmp.eq.unc p12,p9 = 0x0,GR_sgn_x ;;
-	nop.i 999
+;;
+
+{ .mfi
+      nop.m 999
+      fsub.s1 FR_a = FR_B,FR_A                // For Case 2, a=B-A
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p0)  fcmp.ge.unc.s1 p13, p10 = FR_f_abs,FR_TWOM33
-	nop.i 999 ;;
+      nop.m 999
+      fadd.s1 FR_s_hi = FR_f,FR_g             // For Case 1, s_hi=f+g
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p9) fsub.s1 FR_D_hi = f0, FR_D_hi
-	nop.i 999 ;;
+      nop.m 999
+      fadd.s1 FR_f_hi = FR_A,FR_f             // For Case 2, f_hi=A+f
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p10)    fadd.s3 FR_A = FR_A_hi,FR_B_lo
-	nop.i 999
+      nop.m 999
+      fabs FR_f_abs = FR_f
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p13)    fadd.s1 FR_g = FR_A_hi,FR_B_lo
-	nop.i 999 ;;
+      getf.sig GR_N = FR_N
+      fsetc.s3 0x7F,0x40                 // Reset sf3 to user settings + td
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p10)    fmax.s1 FR_B = FR_A_hi,FR_B_lo
-	nop.i 999
+      nop.m 999
+      fsub.s1 FR_s_lo = FR_f,FR_s_hi          // For Case 1, s_lo=f-s_hi
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p9) fsub.s1 FR_D_lo = f0, FR_D_lo
-	nop.i 999 ;;
+      nop.m 999
+      fsub.s1 FR_f_lo = FR_f,FR_f_hi          // For Case 2, f_lo=f-f_hi
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p10)    fmin.s1 FR_b = FR_A_hi,FR_B_lo
-	nop.i 999 ;;
+      nop.m 999
+      fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi       // For Case 1, r_hi=s_hi*D_hi
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p0) fsetc.s3 0x7F,0x40
-	nop.i 999
+      nop.m 999
+      fadd.s1 FR_a = FR_a,FR_b                // For Case 2, a=a+b
+      nop.i 999
 }
-{ .mlx
-	nop.m 999
-(p10)    movl GR_Temp = 0x000000000000FFCD ;;
+;;
+
+
+//    If sgn_x == 1 (that is original x was negative)
+//       N := 2^10 - N
+//       this maintains N to be non-negative, but still
+//       equivalent to the (negated N) mod 4.
+//    End If
+{ .mfi
+      add GR_N = GR_N,GR_M
+      fcmp.ge.s1 p13, p10 = FR_f_abs,FR_TWOM33
+      mov GR_Temp = 0x00400
 }
-{ .mmf
-	nop.m 999
-(p10)    setf.exp FR_TWOM50 = GR_Temp
-(p10)    fadd.s1 FR_f_hi = FR_A,FR_f ;;
+;;
+
+{ .mfi
+(p9)  sub GR_N = GR_Temp,GR_N
+      fadd.s1 FR_s_lo = FR_s_lo,FR_g           // For Case 1, s_lo=s_lo+g
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-//       a := (B - A) + b	Exact.
+      nop.m 999
+      fadd.s1 FR_f_lo = FR_f_lo,FR_A           // For Case 2, f_lo=f_lo+A
+      nop.i 999
+}
+;;
+
+//       a := (B - A) + b      Exact.
 //       Note that a is either 0 or 2^(-128).
 //       f_hi := A + f;
 //       f_lo := (f - f_hi) + A
@@ -1387,68 +1455,32 @@ alloc  r34 = ar.pfs,2,34,0,0
 //       exact. If f = -2^(-64), then A + f is exact. Hence
 //       f-f_hi is -A exactly, giving f_lo = 0.
 //       f_lo := f_lo + a;
-(p10)    fsub.s1 FR_a = FR_B,FR_A
-	nop.i 999
-}
-{ .mfi
-	nop.m 999
-(p13)    fadd.s1 FR_s_hi = FR_f,FR_g
-	nop.i 999 ;;
-}
-{ .mlx
-	nop.m 999
+
 //    If |f| >= 2^(-33)
 //       Case 1
 //       CASE := 1
 //       g := A_hi + B_lo;
 //       s_hi := f + g;
 //       s_lo := (f - s_hi) + g;
-(p13)    movl GR_CASE = 0x1 ;;
-}
-{ .mlx
-	nop.m 999
 //   Else
 //       Case 2
 //       CASE := 2
 //       A := fadd.fpsr3( A_hi, B_lo )
 //       B := max( A_hi, B_lo )
 //       b := min( A_hi, B_lo )
-(p10)    movl GR_CASE = 0x2
-}
-{ .mfi
-	nop.m 999
-(p10)    fsub.s1 FR_f_lo = FR_f,FR_f_hi
-	nop.i 999 ;;
-}
-{ .mfi
-	nop.m 999
-(p10)    fadd.s1 FR_a = FR_a,FR_b
-	nop.i 999
-}
-{ .mfi
-	nop.m 999
-(p13)    fsub.s1 FR_s_lo = FR_f,FR_s_hi
-	nop.i 999 ;;
-}
-{ .mfi
-	nop.m 999
-(p13)    fadd.s1 FR_s_lo = FR_s_lo,FR_g
-	nop.i 999 ;;
-}
+
 { .mfi
-	nop.m 999
-(p10)    fcmp.ge.unc.s1 p14, p11 = FR_f_abs,FR_TWOM50
-	nop.i 999 ;;
+      nop.m 999
+(p10) fcmp.ge.unc.s1 p14, p11 = FR_f_abs,FR_TWOM50
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-//
-//       Create 2**(-50)
-(p10)    fadd.s1 FR_f_lo = FR_f_lo,FR_A
-	nop.i 999 ;;
+      nop.m 999
+(p13) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi //For Case 1, r_lo=s_hi*D_hi+r_hi
+      nop.i 999
 }
-{ .mfi
-	nop.m 999
+;;
+
 //       If |f| >= 2^(-50) then
 //          s_hi := f_hi;
 //          s_lo := f_lo;
@@ -1457,84 +1489,90 @@ alloc  r34 = ar.pfs,2,34,0,0
 //          s_hi := f_hi + f_lo
 //          s_lo := (f_hi - s_hi) + f_lo
 //       End If
-(p14)  mov FR_s_hi = FR_f_hi
-	nop.i 999 ;;
+{ .mfi
+      nop.m 999
+(p14) mov FR_s_hi = FR_f_hi
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p10)    fadd.s1 FR_f_lo = FR_f_lo,FR_a
-	nop.i 999 ;;
+      nop.m 999
+(p10) fadd.s1 FR_f_lo = FR_f_lo,FR_a
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p14)  mov FR_s_lo = FR_f_lo
-	nop.i 999
+      nop.m 999
+(p14) mov FR_s_lo = FR_f_lo
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p11)  fadd.s1 FR_f_lo = FR_f_lo,FR_A_lo
-	nop.i 999 ;;
+      nop.m 999
+(p11) fadd.s1 FR_f_lo = FR_f_lo,FR_A_lo
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p11)  fma.s1 FR_f_lo = FR_X,FR_p_4,FR_f_lo
-	nop.i 999 ;;
+      nop.m 999
+(p11) fma.s1 FR_f_lo = FR_X,FR_p_4,FR_f_lo
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p11)  fadd.s1 FR_s_hi = FR_f_hi,FR_f_lo
-	nop.i 999 ;;
+      nop.m 999
+(p13) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo //For Case 1, r_lo=s_hi*D_lo+r_lo
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
+(p11) fadd.s1 FR_s_hi = FR_f_hi,FR_f_lo
+      nop.i 999
+}
+;;
+
 //   r_hi :=  s_hi*D_hi
 //   r_lo :=  s_hi*D_hi - r_hi  with fma
 //   r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi
-(p0) fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi
-	nop.i 999
-}
 { .mfi
-	nop.m 999
-(p11)  fsub.s1 FR_s_lo = FR_f_hi,FR_s_hi
-	nop.i 999 ;;
+      nop.m 999
+(p10) fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p0) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi
-	nop.i 999
+      nop.m 999
+(p11) fsub.s1 FR_s_lo = FR_f_hi,FR_s_hi
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p11)  fadd.s1 FR_s_lo = FR_s_lo,FR_f_lo
-	nop.i 999 ;;
-}
-{ .mmi
-	nop.m 999 ;;
-//   Return  N, r_hi, r_lo
-//   We do not return CASE
-(p0) stfe [GR_Address_of_Outputs] = FR_r_hi,16
-	nop.i 999 ;;
+      nop.m 999
+(p10) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p0) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo
-	nop.i 999 ;;
+      nop.m 999
+(p11) fadd.s1 FR_s_lo = FR_s_lo,FR_f_lo
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p0) fma.s1 FR_r_lo = FR_s_lo,FR_D_hi,FR_r_lo
-	nop.i 999 ;;
-}
-{ .mmi
-	nop.m 999 ;;
-(p0) stfe [GR_Address_of_Outputs] = FR_r_lo,-16
-	nop.i 999
+      nop.m 999
+(p10) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo
+      nop.i 999
 }
-{ .mib
-	nop.m 999
-	nop.i 999
-(p0) br.ret.sptk   b0 ;;
+;;
+
+//   Return  N, r_hi, r_lo
+//   We do not return CASE
+{ .mfb
+      nop.m 999
+      fma.s1 FR_r_lo = FR_s_lo,FR_D_hi,FR_r_lo
+      br.ret.sptk   b0
 }
+;;
 
-.endp __libm_pi_by_2_reduce
-ASM_SIZE_DIRECTIVE(__libm_pi_by_2_reduce)
+.endp __libm_pi_by_2_reduce#