about summary refs log tree commit diff
path: root/sysdeps/ia64/fpu/libm_reduce.S
diff options
context:
space:
mode:
authorUlrich Drepper <drepper@redhat.com>2001-02-19 09:09:18 +0000
committerUlrich Drepper <drepper@redhat.com>2001-02-19 09:09:18 +0000
commit8da2915d5dcfa51cb5f9e55f7716b49858c1d59d (patch)
treeaa219472cc41fcb82789b12723628cb6a33cc774 /sysdeps/ia64/fpu/libm_reduce.S
parente208f556cad11f729533385e46e4191fcc49aa0a (diff)
downloadglibc-8da2915d5dcfa51cb5f9e55f7716b49858c1d59d.tar.gz
glibc-8da2915d5dcfa51cb5f9e55f7716b49858c1d59d.tar.xz
glibc-8da2915d5dcfa51cb5f9e55f7716b49858c1d59d.zip
Update.
2001-02-19  Ulrich Drepper  <drepper@redhat.com>

	* libio/iogetline.c: Move return until after last statement.

	* localedata/show-ucs-data.c: Don't show < > for better readability.

	* sysdeps/ia64/fpu/Dist: New file.
	* sysdeps/ia64/fpu/Makefile: New file.
	* sysdeps/ia64/fpu/Versions: New file.
	* sysdeps/ia64/fpu/e_acos.S: New file.
	* sysdeps/ia64/fpu/e_acosf.S: New file.
	* sysdeps/ia64/fpu/e_acosl.S: New file.
	* sysdeps/ia64/fpu/e_asin.S: New file.
	* sysdeps/ia64/fpu/e_asinf.S: New file.
	* sysdeps/ia64/fpu/e_asinl.S: New file.
	* sysdeps/ia64/fpu/e_atan2.S: New file.
	* sysdeps/ia64/fpu/e_atan2f.S: New file.
	* sysdeps/ia64/fpu/e_atan2l.c: New file.
	* sysdeps/ia64/fpu/e_cosh.S: New file.
	* sysdeps/ia64/fpu/e_coshf.S: New file.
	* sysdeps/ia64/fpu/e_coshl.S: New file.
	* sysdeps/ia64/fpu/e_exp.S: New file.
	* sysdeps/ia64/fpu/e_expf.S: New file.
	* sysdeps/ia64/fpu/e_expl.c: New file.
	* sysdeps/ia64/fpu/e_fmod.S: New file.
	* sysdeps/ia64/fpu/e_fmodf.S: New file.
	* sysdeps/ia64/fpu/e_fmodl.S: New file.
	* sysdeps/ia64/fpu/e_hypot.S: New file.
	* sysdeps/ia64/fpu/e_hypotf.S: New file.
	* sysdeps/ia64/fpu/e_hypotl.S: New file.
	* sysdeps/ia64/fpu/e_log.S: New file.
	* sysdeps/ia64/fpu/e_log10.c: New file.
	* sysdeps/ia64/fpu/e_log10f.c: New file.
	* sysdeps/ia64/fpu/e_log10l.c: New file.
	* sysdeps/ia64/fpu/e_logf.S: New file.
	* sysdeps/ia64/fpu/e_logl.c: New file.
	* sysdeps/ia64/fpu/e_pow.S: New file.
	* sysdeps/ia64/fpu/e_powf.S: New file.
	* sysdeps/ia64/fpu/e_powl.S: New file.
	* sysdeps/ia64/fpu/e_rem_pio2.c: New file.
	* sysdeps/ia64/fpu/e_rem_pio2f.c: New file.
	* sysdeps/ia64/fpu/e_remainder.S: New file.
	* sysdeps/ia64/fpu/e_remainderf.S: New file.
	* sysdeps/ia64/fpu/e_remainderl.S: New file.
	* sysdeps/ia64/fpu/e_scalb.S: New file.
	* sysdeps/ia64/fpu/e_scalbf.S: New file.
	* sysdeps/ia64/fpu/e_scalbl.S: New file.
	* sysdeps/ia64/fpu/e_sinh.S: New file.
	* sysdeps/ia64/fpu/e_sinhf.S: New file.
	* sysdeps/ia64/fpu/e_sinhl.S: New file.
	* sysdeps/ia64/fpu/e_sqrt.S: New file.
	* sysdeps/ia64/fpu/e_sqrtf.S: New file.
	* sysdeps/ia64/fpu/e_sqrtl.S: New file.
	* sysdeps/ia64/fpu/k_rem_pio2.c: New file.
	* sysdeps/ia64/fpu/k_rem_pio2f.c: New file.
	* sysdeps/ia64/fpu/k_rem_pio2l.c: New file.
	* sysdeps/ia64/fpu/libm_atan2_reg.S: New file.
	* sysdeps/ia64/fpu/libm_error.c: New file.
	* sysdeps/ia64/fpu/libm_frexp4.S: New file.
	* sysdeps/ia64/fpu/libm_frexp4f.S: New file.
	* sysdeps/ia64/fpu/libm_frexp4l.S: New file.
	* sysdeps/ia64/fpu/libm_reduce.S: New file.
	* sysdeps/ia64/fpu/libm_support.h: New file.
	* sysdeps/ia64/fpu/libm_tan.S: New file.
	* sysdeps/ia64/fpu/s_atan.S: New file.
	* sysdeps/ia64/fpu/s_atanf.S: New file.
	* sysdeps/ia64/fpu/s_atanl.S: New file.
	* sysdeps/ia64/fpu/s_cbrt.S: New file.
	* sysdeps/ia64/fpu/s_cbrtf.S: New file.
	* sysdeps/ia64/fpu/s_cbrtl.S: New file.
	* sysdeps/ia64/fpu/s_ceil.S: New file.
	* sysdeps/ia64/fpu/s_ceilf.S: New file.
	* sysdeps/ia64/fpu/s_ceill.S: New file.
	* sysdeps/ia64/fpu/s_cos.S: New file.
	* sysdeps/ia64/fpu/s_cosf.S: New file.
	* sysdeps/ia64/fpu/s_cosl.S: New file.
	* sysdeps/ia64/fpu/s_expm1.S: New file.
	* sysdeps/ia64/fpu/s_expm1f.S: New file.
	* sysdeps/ia64/fpu/s_expm1l.S: New file.
	* sysdeps/ia64/fpu/s_floor.S: New file.
	* sysdeps/ia64/fpu/s_floorf.S: New file.
	* sysdeps/ia64/fpu/s_floorl.S: New file.
	* sysdeps/ia64/fpu/s_frexp.c: New file.
	* sysdeps/ia64/fpu/s_frexpf.c: New file.
	* sysdeps/ia64/fpu/s_frexpl.c: New file.
	* sysdeps/ia64/fpu/s_ilogb.S: New file.
	* sysdeps/ia64/fpu/s_ilogbf.S: New file.
	* sysdeps/ia64/fpu/s_ilogbl.S: New file.
	* sysdeps/ia64/fpu/s_ldexp.S: New file.
	* sysdeps/ia64/fpu/s_ldexpf.S: New file.
	* sysdeps/ia64/fpu/s_ldexpl.S: New file.
	* sysdeps/ia64/fpu/s_log1p.S: New file.
	* sysdeps/ia64/fpu/s_log1pf.S: New file.
	* sysdeps/ia64/fpu/s_log1pl.S: New file.
	* sysdeps/ia64/fpu/s_logb.S: New file.
	* sysdeps/ia64/fpu/s_logbf.S: New file.
	* sysdeps/ia64/fpu/s_logbl.S: New file.
	* sysdeps/ia64/fpu/s_matherrf.c: New file.
	* sysdeps/ia64/fpu/s_matherrl.c: New file.
	* sysdeps/ia64/fpu/s_modf.S: New file.
	* sysdeps/ia64/fpu/s_modff.S: New file.
	* sysdeps/ia64/fpu/s_modfl.S: New file.
	* sysdeps/ia64/fpu/s_nearbyint.S: New file.
	* sysdeps/ia64/fpu/s_nearbyintf.S: New file.
	* sysdeps/ia64/fpu/s_nearbyintl.S: New file.
	* sysdeps/ia64/fpu/s_rint.S: New file.
	* sysdeps/ia64/fpu/s_rintf.S: New file.
	* sysdeps/ia64/fpu/s_rintl.S: New file.
	* sysdeps/ia64/fpu/s_round.S: New file.
	* sysdeps/ia64/fpu/s_roundf.S: New file.
	* sysdeps/ia64/fpu/s_roundl.S: New file.
	* sysdeps/ia64/fpu/s_scalbn.S: New file.
	* sysdeps/ia64/fpu/s_scalbnf.S: New file.
	* sysdeps/ia64/fpu/s_scalbnl.S: New file.
	* sysdeps/ia64/fpu/s_significand.S: New file.
	* sysdeps/ia64/fpu/s_significandf.S: New file.
	* sysdeps/ia64/fpu/s_significandl.S: New file.
	* sysdeps/ia64/fpu/s_sin.c: New file.
	* sysdeps/ia64/fpu/s_sincos.c: New file.
	* sysdeps/ia64/fpu/s_sincosf.c: New file.
	* sysdeps/ia64/fpu/s_sincosl.c: New file.
	* sysdeps/ia64/fpu/s_sinf.c: New file.
	* sysdeps/ia64/fpu/s_sinl.c: New file.
	* sysdeps/ia64/fpu/s_tan.S: New file.
	* sysdeps/ia64/fpu/s_tanf.S: New file.
	* sysdeps/ia64/fpu/s_tanl.S: New file.
	* sysdeps/ia64/fpu/s_trunc.S: New file.
	* sysdeps/ia64/fpu/s_truncf.S: New file.
	* sysdeps/ia64/fpu/s_truncl.S: New file.
	* sysdeps/ia64/fpu/w_acos.c: New file.
	* sysdeps/ia64/fpu/w_acosf.c: New file.
	* sysdeps/ia64/fpu/w_acosl.c: New file.
	* sysdeps/ia64/fpu/w_asin.c: New file.
	* sysdeps/ia64/fpu/w_asinf.c: New file.
	* sysdeps/ia64/fpu/w_asinl.c: New file.
	* sysdeps/ia64/fpu/w_atan2.c: New file.
	* sysdeps/ia64/fpu/w_atan2f.c: New file.
	* sysdeps/ia64/fpu/w_atan2l.c: New file.
	* sysdeps/ia64/fpu/w_cosh.c: New file.
	* sysdeps/ia64/fpu/w_coshf.c: New file.
	* sysdeps/ia64/fpu/w_coshl.c: New file.
	* sysdeps/ia64/fpu/w_exp.c: New file.
	* sysdeps/ia64/fpu/w_expf.c: New file.
	* sysdeps/ia64/fpu/w_fmod.c: New file.
	* sysdeps/ia64/fpu/w_fmodf.c: New file.
	* sysdeps/ia64/fpu/w_fmodl.c: New file.
	* sysdeps/ia64/fpu/w_hypot.c: New file.
	* sysdeps/ia64/fpu/w_hypotf.c: New file.
	* sysdeps/ia64/fpu/w_hypotl.c: New file.
	* sysdeps/ia64/fpu/w_log.c: New file.
	* sysdeps/ia64/fpu/w_log10.c: New file.
	* sysdeps/ia64/fpu/w_log10f.c: New file.
	* sysdeps/ia64/fpu/w_log10l.c: New file.
	* sysdeps/ia64/fpu/w_logf.c: New file.
	* sysdeps/ia64/fpu/w_logl.c: New file.
	* sysdeps/ia64/fpu/w_pow.c: New file.
	* sysdeps/ia64/fpu/w_powf.c: New file.
	* sysdeps/ia64/fpu/w_powl.c: New file.
	* sysdeps/ia64/fpu/w_remainder.c: New file.
	* sysdeps/ia64/fpu/w_remainderf.c: New file.
	* sysdeps/ia64/fpu/w_remainderl.c: New file.
	* sysdeps/ia64/fpu/w_scalb.c: New file.
	* sysdeps/ia64/fpu/w_scalbf.c: New file.
	* sysdeps/ia64/fpu/w_scalbl.c: New file.
	* sysdeps/ia64/fpu/w_sqrt.c: New file.
	* sysdeps/ia64/fpu/w_sqrtf.c: New file.
	* sysdeps/ia64/fpu/w_sqrtl.c: New file.
	* sysdeps/ia64/fpu/libm-test-ulps: Adjust for long double
	implementation.
	* sysdeps/ia64/fpu/bits/mathdef.h: Correct float_t and double_t types.
	Change FP_ILOGBNAN for new implementation.
	* Verions.def: Add 2.2.3 versions.
Diffstat (limited to 'sysdeps/ia64/fpu/libm_reduce.S')
-rw-r--r--sysdeps/ia64/fpu/libm_reduce.S1527
1 files changed, 1527 insertions, 0 deletions
diff --git a/sysdeps/ia64/fpu/libm_reduce.S b/sysdeps/ia64/fpu/libm_reduce.S
new file mode 100644
index 0000000000..fb04d36840
--- /dev/null
+++ b/sysdeps/ia64/fpu/libm_reduce.S
@@ -0,0 +1,1527 @@
+.file "libm_reduce.s"
+
+// Copyright (c) 2000, 2001, Intel Corporation
+// All rights reserved.
+// 
+// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
+// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+// 
+// WARRANTY DISCLAIMER
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// 
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at 
+// http://developer.intel.com/opensource.
+//
+// History:  02/02/00 Initial Version
+//
+// *********************************************************************
+// *********************************************************************
+//
+// Function:   __libm_pi_by_two_reduce(x) return r, c, and N where
+//             x = N * pi/4 + (r+c) , where |r+c| <= pi/4.
+//             This function is not designed to be used by the
+//             general user.
+//
+// *********************************************************************
+//
+// Accuracy:       Returns double-precision values
+//
+// *********************************************************************
+//
+// Resources Used:
+//
+//    Floating-Point Registers: f32-f70
+//
+//    General Purpose Registers:
+//      r8  = return value N
+//      r32 = Address of x
+//      r33 = Address of where to place r and then c 
+//      r34-r64
+//
+//    Predicate Registers:      p6-p14
+//
+// *********************************************************************
+//
+// IEEE Special Conditions:
+//
+//    No condions should be raised. 
+//
+// *********************************************************************
+//
+// I. Introduction
+// ===============
+//
+// For the forward trigonometric functions sin, cos, sincos, and
+// tan, the original algorithms for IA 64 handle arguments up to 
+// 1 ulp less than 2^63 in magnitude. For double-extended arguments x,
+// |x| >= 2^63, this routine returns CASE, N and r_hi, r_lo where
+// 
+//    x  is accurately approximated by
+//    2*K*pi  +  N * pi/2  +  r_hi + r_lo,  |r_hi+r_lo| <= pi/4.
+//    CASE = 1 or 2.
+//    CASE is 1 unless |r_hi + r_lo| < 2^(-33).
+// 
+// The exact value of K is not determined, but that information is
+// not required in trigonometric function computations.
+// 
+// We first assume the argument x in question satisfies x >= 2^(63). 
+// In particular, it is positive. Negative x can be handled by symmetry:
+// 
+//   -x  is accurately approximated by
+//         -2*K*pi  +  (-N) * pi/2  -  (r_hi + r_lo),  |r_hi+r_lo| <= pi/4.
+// 
+// The idea of the reduction is that
+// 
+// 	x  *  2/pi   =   N_big  +  N  +  f,	|f| <= 1/2
+// 
+// Moreover, for double extended x, |f| >= 2^(-75). (This is an
+// non-obvious fact found by enumeration using a special algorithm
+// involving continued fraction.) The algorithm described below 
+// calculates N and an accurate approximation of f.
+// 
+// Roughly speaking, an appropriate 256-bit (4 X 64) portion of 
+// 2/pi is multiplied with x to give the desired information.
+// 
+// II. Representation of 2/PI
+// ==========================
+// 
+// The value of 2/pi in binary fixed-point is
+// 
+//            .101000101111100110......
+// 
+// We store 2/pi in a table, starting at the position corresponding
+// to bit position 63 
+// 
+//   bit position  63 62 ... 0   -1 -2 -3 -4 -5 -6 -7  ....  -16576
+// 
+// 	 	0  0  ... 0  . 1  0  1  0  1  0  1  ....    X
+//                 
+//                              ^
+// 	     	             |__ implied binary pt 
+// 
+// III. Algorithm
+// ==============
+// 
+// This describes the algorithm in the most natural way using
+// unsigned interger multiplication. The implementation section 
+// describes how the integer arithmetic is simulated.
+// 
+// STEP 0. Initialization
+// ----------------------
+// 
+// Let the input argument x be 
+// 
+//     x = 2^m * ( 1. b_1 b_2 b_3 ... b_63 ),  63 <= m <= 16383.
+// 
+// The first crucial step is to fetch four 64-bit portions of 2/pi. 
+// To fulfill this goal, we calculate the bit position L of the
+// beginning of these 256-bit quantity by
+// 
+//     L :=  62 - m.
+// 
+// Note that -16321 <= L <= -1 because 63 <= m <= 16383; and that 
+// the storage of 2/pi is adequate.
+// 
+// Fetch P_1, P_2, P_3, P_4 beginning at bit position L thus:
+// 
+//      bit position  L  L-1  L-2    ...  L-63
+// 
+//      P_1    =      b   b    b     ...    b
+// 
+// each b can be 0 or 1. Also, let P_0 be the two bits correspoding to
+// bit positions L+2 and L+1. So, when each of the P_j is interpreted
+// with appropriate scaling, we have
+//
+//      2/pi  =  P_big  + P_0 + (P_1 + P_2 + P_3 + P_4)  +  P_small
+// 
+// Note that P_big and P_small can be ignored. The reasons are as follow.
+// First, consider P_big. If P_big = 0, we can certainly ignore it.
+// Otherwise, P_big >= 2^(L+3). Now, 
+// 
+//        P_big * ulp(x) >=  2^(L+3) * 2^(m-63)
+// 		      >=  2^(65-m  +  m-63 )
+// 		      >=  2^2
+// 
+// Thus, P_big * x is an integer of the form 4*K. So
+// 
+// 	x = 4*K * (pi/2) + x*(P_0 + P_1 + P_2 + P_3 + P_4)*(pi/2)
+//                + x*P_small*(pi/2).
+// 
+// Hence, P_big*x corresponds to information that can be ignored for
+// trigonometic function evaluation.
+// 
+// Next, we must estimate the effect of ignoring P_small. The absolute
+// error made by ignoring P_small is bounded by
+// 
+//       |P_small * x|  <=  ulp(P_4) * x
+// 		     <=  2^(L-255) * 2^(m+1)
+// 		     <=  2^(62-m-255 + m + 1)
+// 		     <=  2^(-192)
+// 
+// Since for double-extended precision, x * 2/pi = integer + f, 
+// 0.5 >= |f| >= 2^(-75), the relative error introduced by ignoring
+// P_small is bounded by 2^(-192+75) <= 2^(-117), which is acceptable.
+// 
+// Further note that if x is split into x_hi + x_lo where x_lo is the
+// two bits corresponding to bit positions 2^(m-62) and 2^(m-63); then
+// 
+// 	P_0 * x_hi 
+// 
+// is also an integer of the form 4*K; and thus can also be ignored.
+// Let M := P_0 * x_lo which is a small integer. The main part of the
+// calculation is really the multiplication of x with the four pieces
+// P_1, P_2, P_3, and P_4.
+// 
+// Unless the reduced argument is extremely small in magnitude, it
+// suffices to carry out the multiplication of x with P_1, P_2, and
+// P_3. x*P_4 will be carried out and added on as a correction only 
+// when it is found to be needed. Note also that x*P_4 need not be
+// computed exactly. A straightforward multiplication suffices since
+// the rounding error thus produced would be bounded by 2^(-3*64),
+// that is 2^(-192) which is small enough as the reduced argument
+// is bounded from below by 2^(-75).
+// 
+// Now that we have four 64-bit data representing 2/pi and a
+// 64-bit x. We first need to calculate a highly accurate product
+// of x and P_1, P_2, P_3. This is best understood as integer
+// multiplication.
+// 
+// 
+// STEP 1. Multiplication
+// ----------------------
+// 
+// 
+//                     ---------   ---------   ---------
+// 	             |  P_1  |   |  P_2  |   |  P_3  |
+// 	             ---------   ---------   ---------
+// 
+//                                            ---------
+// 	      X                              |   X   |
+// 	                                     ---------
+//      ----------------------------------------------------
+//
+//                                 ---------   ---------
+//	                         |  A_hi |   |  A_lo |
+//	                         ---------   ---------
+//
+//
+//                    ---------   ---------
+//	             |  B_hi |   |  B_lo |
+//	             ---------   ---------
+//
+//
+//        ---------   ---------  
+//	 |  C_hi |   |  C_lo |  
+//	 ---------   ---------  
+//
+//      ====================================================
+//       ---------   ---------   ---------   ---------
+//	 |  S_0  |   |  S_1  |   |  S_2  |   |  S_3  |
+//	 ---------   ---------   ---------   ---------
+//
+//
+//
+// STEP 2. Get N and f
+// -------------------
+// 
+// Conceptually, after the individual pieces S_0, S_1, ..., are obtained,
+// we have to sum them and obtain an integer part, N, and a fraction, f.
+// Here, |f| <= 1/2, and N is an integer. Note also that N need only to
+// be known to module 2^k, k >= 2. In the case when |f| is small enough,
+// we would need to add in the value x*P_4.
+// 
+// 
+// STEP 3. Get reduced argument
+// ----------------------------
+// 
+// The value f is not yet the reduced argument that we seek. The
+// equation
+// 
+// 	x * 2/pi = 4K  + N  + f
+// 
+// says that
+// 
+//         x   =  2*K*pi  + N * pi/2  +  f * (pi/2).
+// 
+// Thus, the reduced argument is given by
+// 
+// 	reduced argument =  f * pi/2.
+// 
+// This multiplication must be performed to extra precision.
+// 
+// IV. Implementation
+// ==================
+// 
+// Step 0. Initialization
+// ----------------------
+// 
+// Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x.
+// 
+// In memory, 2/pi is stored contigously as
+// 
+//  0x00000000 0x00000000 0xA2F....
+//                       ^
+//                       |__ implied binary bit
+// 
+// Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m. Thus
+// -1 <= L <= -16321. We fetch from memory 5 integer pieces of data.
+// 
+// P_0 is the two bits corresponding to bit positions L+2 and L+1
+// P_1 is the 64-bit starting at bit position  L
+// P_2 is the 64-bit starting at bit position  L-64
+// P_3 is the 64-bit starting at bit position  L-128
+// P_4 is the 64-bit starting at bit position  L-192
+// 
+// For example, if m = 63, P_0 would be 0 and P_1 would look like
+// 0xA2F...
+// 
+// If m = 65, P_0 would be the two msb of 0xA, thus, P_0 is 10 in binary.
+// P_1 in binary would be  1 0 0 0 1 0 1 1 1 1 .... 
+//  
+// Step 1. Multiplication
+// ----------------------
+// 
+// At this point, P_1, P_2, P_3, P_4 are integers. They are
+// supposed to be interpreted as
+// 
+//  2^(L-63)     * P_1;
+//  2^(L-63-64)  * P_2;
+//  2^(L-63-128) * P_3;
+// 2^(L-63-192) * P_4;
+// 
+// Since each of them need to be multiplied to x, we would scale
+// both x and the P_j's by some convenient factors: scale each
+// of P_j's up by 2^(63-L), and scale x down by 2^(L-63).
+// 
+//   p_1 := fcvt.xf ( P_1 )
+//   p_2 := fcvt.xf ( P_2 ) * 2^(-64)
+//   p_3 := fcvt.xf ( P_3 ) * 2^(-128)
+//   p_4 := fcvt.xf ( P_4 ) * 2^(-192)
+//   x   := replace exponent of x by -1
+//          because 2^m    * 1.xxxx...xxx  * 2^(L-63)
+//          is      2^(-1) * 1.xxxx...xxx
+// 
+// We are now faced with the task of computing the following
+// 
+//                     ---------   ---------   ---------
+// 	             |  P_1  |   |  P_2  |   |  P_3  |
+// 	             ---------   ---------   ---------
+// 
+//                                             ---------
+// 	      X                              |   X   |
+// 	                                     ---------
+//       ----------------------------------------------------
+// 
+//                                 ---------   ---------
+// 	                         |  A_hi |   |  A_lo |
+// 	                         ---------   ---------
+// 
+//                     ---------   ---------
+// 	             |  B_hi |   |  B_lo |
+// 	             ---------   ---------
+// 
+//         ---------   ---------  
+// 	 |  C_hi |   |  C_lo |  
+// 	 ---------   ---------  
+// 
+//      ====================================================
+//       -----------   ---------   ---------   ---------
+//       |    S_0  |   |  S_1  |   |  S_2  |   |  S_3  |
+//       -----------   ---------   ---------   ---------
+//        ^          ^
+//        |          |___ binary point
+//        |
+//        |___ possibly one more bit
+// 
+// Let FPSR3 be set to round towards zero with widest precision
+// and exponent range. Unless an explicit FPSR is given, 
+// round-to-nearest with widest precision and exponent range is
+// used.
+// 
+// Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_C := 2^(-65).
+// 
+// Tmp_C := fmpy.fpsr3( x, p_1 );
+// If Tmp_C >= sigma_C then
+//    C_hi := Tmp_C;
+//    C_lo := x*p_1 - C_hi ...fma, exact
+// Else
+//    C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C
+// 			...subtraction is exact, regardless
+// 			...of rounding direction
+//    C_lo := x*p_1 - C_hi ...fma, exact
+// End If
+// 
+// Tmp_B := fmpy.fpsr3( x, p_2 );
+// If Tmp_B >= sigma_B then
+//    B_hi := Tmp_B;
+//    B_lo := x*p_2 - B_hi ...fma, exact
+// Else
+//    B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B
+// 			...subtraction is exact, regardless
+// 			...of rounding direction
+//    B_lo := x*p_2 - B_hi ...fma, exact
+// End If
+// 
+// Tmp_A := fmpy.fpsr3( x, p_3 );
+// If Tmp_A >= sigma_A then
+//    A_hi := Tmp_A;
+//    A_lo := x*p_3 - A_hi ...fma, exact
+// Else
+//    A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A
+// 			...subtraction is exact, regardless
+// 			...of rounding direction
+//    A_lo := x*p_3 - A_hi ...fma, exact
+// End If
+// 
+// ...Note that C_hi is of integer value. We need only the
+// ...last few bits. Thus we can ensure C_hi is never a big 
+// ...integer, freeing us from overflow worry.
+// 
+// Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70);
+// ...Tmp_C is the upper portion of C_hi
+// C_hi := C_hi - Tmp_C
+// ...0 <= C_hi < 2^7
+// 
+// Step 2. Get N and f
+// -------------------
+// 
+// At this point, we have all the components to obtain 
+// S_0, S_1, S_2, S_3 and thus N and f. We start by adding
+// C_lo and B_hi. This sum together with C_hi gives a good
+// estimation of N and f. 
+// 
+// A := fadd.fpsr3( B_hi, C_lo )
+// B := max( B_hi, C_lo )
+// b := min( B_hi, C_lo )
+// 
+// a := (B - A) + b	...exact. Note that a is either 0
+// 			...or 2^(-64).
+// 
+// N := round_to_nearest_integer_value( A );
+// f := A - N;		...exact because lsb(A) >= 2^(-64)
+// 			...and |f| <= 1/2.
+// 
+// f := f + a		...exact because a is 0 or 2^(-64);
+// 			...the msb of the sum is <= 1/2
+// 			...lsb >= 2^(-64).
+// 
+// N := convert to integer format( C_hi + N );
+// M := P_0 * x_lo;
+// N := N + M;
+// 
+// If sgn_x == 1 (that is original x was negative)
+// N := 2^10 - N
+// ...this maintains N to be non-negative, but still
+// ...equivalent to the (negated N) mod 4.
+// End If
+// 
+// If |f| >= 2^(-33)
+// 
+// ...Case 1
+// CASE := 1
+// g := A_hi + B_lo;
+// s_hi := f + g;
+// s_lo := (f - s_hi) + g;
+// 
+// Else
+// 
+// ...Case 2
+// CASE := 2
+// A := fadd.fpsr3( A_hi, B_lo )
+// B := max( A_hi, B_lo )
+// b := min( A_hi, B_lo )
+// 
+// a := (B - A) + b	...exact. Note that a is either 0
+// 			...or 2^(-128).
+// 
+// f_hi := A + f;
+// f_lo := (f - f_hi) + A;
+// ...this is exact.
+// ...f-f_hi is exact because either |f| >= |A|, in which
+// ...case f-f_hi is clearly exact; or otherwise, 0<|f|<|A|
+// ...means msb(f) <= msb(A) = 2^(-64) => |f| = 2^(-64).
+// ...If f = 2^(-64), f-f_hi involves cancellation and is
+// ...exact. If f = -2^(-64), then A + f is exact. Hence
+// ...f-f_hi is -A exactly, giving f_lo = 0.
+// 
+// f_lo := f_lo + a;
+// 
+// If |f| >= 2^(-50) then
+//    s_hi := f_hi;
+//    s_lo := f_lo;
+// Else
+//    f_lo := (f_lo + A_lo) + x*p_4
+//    s_hi := f_hi + f_lo
+//    s_lo := (f_hi - s_hi) + f_lo
+// End If
+// 
+// End If
+// 
+// Step 3. Get reduced argument
+// ----------------------------
+// 
+// If sgn_x == 0 (that is original x is positive)
+// 
+// D_hi := Pi_by_2_hi
+// D_lo := Pi_by_2_lo
+// ...load from table
+// 
+// Else
+// 
+// D_hi := neg_Pi_by_2_hi
+// D_lo := neg_Pi_by_2_lo
+// ...load from table
+// End If
+// 
+// r_hi :=  s_hi*D_hi
+// r_lo :=  s_hi*D_hi - r_hi   	...fma
+// r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi
+// 
+// Return  CASE, N, r_hi, r_lo
+//
+
+#include "libm_support.h"
+
+FR_X       = f32 
+FR_N       = f33 
+FR_p_1     = f34 
+FR_TWOM33  = f35 
+FR_TWOM50  = f36 
+FR_g       = f37 
+FR_p_2     = f38 
+FR_f       = f39 
+FR_s_lo    = f40 
+FR_p_3     = f41 
+FR_f_abs   = f42 
+FR_D_lo    = f43 
+FR_p_4     = f44 
+FR_D_hi    = f45 
+FR_Tmp2_C  = f46 
+FR_s_hi    = f47 
+FR_sigma_A = f48 
+FR_A       = f49 
+FR_sigma_B = f50 
+FR_B       = f51 
+FR_sigma_C = f52 
+FR_b       = f53 
+FR_ScaleP2 = f54 
+FR_ScaleP3 = f55 
+FR_ScaleP4 = f56 
+FR_Tmp_A   = f57 
+FR_Tmp_B   = f58 
+FR_Tmp_C   = f59 
+FR_A_hi    = f60 
+FR_f_hi    = f61 
+FR_r_hi    = f62 
+FR_A_lo    = f63 
+FR_B_hi    = f64 
+FR_a       = f65 
+FR_B_lo    = f66 
+FR_f_lo    = f67
+FR_r_lo    = f68 
+FR_C_hi    = f69 
+FR_C_lo    = f70 
+
+GR_N       = r8
+GR_Address_of_Input  = r32 
+GR_Address_of_Outputs = r33 
+GR_Exp_x   = r36 
+GR_Temp    = r37 
+GR_BIASL63 = r38 
+GR_CASE    = r39
+GR_x_lo    = r40 
+GR_sgn_x   = r41 
+GR_M       = r42
+GR_BASE    = r43
+GR_LENGTH1 = r44
+GR_LENGTH2 = r45
+GR_ASUB    = r46
+GR_P_0     = r47
+GR_P_1     = r48 
+GR_P_2     = r49 
+GR_P_3     = r50 
+GR_P_4     = r51 
+GR_START   = r52
+GR_SEGMENT = r53
+GR_A       = r54
+GR_B       = r55 
+GR_C       = r56
+GR_D       = r57
+GR_E       = r58
+GR_TEMP1   = r59 
+GR_TEMP2   = r60 
+GR_TEMP3   = r61 
+GR_TEMP4   = r62 
+GR_TEMP5   = r63
+GR_TEMP6   = r64
+
+.align 64
+
+#ifdef _LIBC
+.rodata
+#else
+.data
+#endif
+
+Constants_Bits_of_2_by_pi:
+ASM_TYPE_DIRECTIVE(Constants_Bits_of_2_by_pi,@object)
+data8 0x0000000000000000,0xA2F9836E4E441529
+data8 0xFC2757D1F534DDC0,0xDB6295993C439041
+data8 0xFE5163ABDEBBC561,0xB7246E3A424DD2E0
+data8 0x06492EEA09D1921C,0xFE1DEB1CB129A73E
+data8 0xE88235F52EBB4484,0xE99C7026B45F7E41
+data8 0x3991D639835339F4,0x9C845F8BBDF9283B
+data8 0x1FF897FFDE05980F,0xEF2F118B5A0A6D1F
+data8 0x6D367ECF27CB09B7,0x4F463F669E5FEA2D
+data8 0x7527BAC7EBE5F17B,0x3D0739F78A5292EA
+data8 0x6BFB5FB11F8D5D08,0x56033046FC7B6BAB
+data8 0xF0CFBC209AF4361D,0xA9E391615EE61B08
+data8 0x6599855F14A06840,0x8DFFD8804D732731
+data8 0x06061556CA73A8C9,0x60E27BC08C6B47C4
+data8 0x19C367CDDCE8092A,0x8359C4768B961CA6
+data8 0xDDAF44D15719053E,0xA5FF07053F7E33E8
+data8 0x32C2DE4F98327DBB,0xC33D26EF6B1E5EF8
+data8 0x9F3A1F35CAF27F1D,0x87F121907C7C246A
+data8 0xFA6ED5772D30433B,0x15C614B59D19C3C2
+data8 0xC4AD414D2C5D000C,0x467D862D71E39AC6
+data8 0x9B0062337CD2B497,0xA7B4D55537F63ED7
+data8 0x1810A3FC764D2A9D,0x64ABD770F87C6357
+data8 0xB07AE715175649C0,0xD9D63B3884A7CB23
+data8 0x24778AD623545AB9,0x1F001B0AF1DFCE19
+data8 0xFF319F6A1E666157,0x9947FBACD87F7EB7
+data8 0x652289E83260BFE6,0xCDC4EF09366CD43F
+data8 0x5DD7DE16DE3B5892,0x9BDE2822D2E88628
+data8 0x4D58E232CAC616E3,0x08CB7DE050C017A7
+data8 0x1DF35BE01834132E,0x6212830148835B8E
+data8 0xF57FB0ADF2E91E43,0x4A48D36710D8DDAA
+data8 0x425FAECE616AA428,0x0AB499D3F2A6067F
+data8 0x775C83C2A3883C61,0x78738A5A8CAFBDD7
+data8 0x6F63A62DCBBFF4EF,0x818D67C12645CA55
+data8 0x36D9CAD2A8288D61,0xC277C9121426049B
+data8 0x4612C459C444C5C8,0x91B24DF31700AD43
+data8 0xD4E5492910D5FDFC,0xBE00CC941EEECE70
+data8 0xF53E1380F1ECC3E7,0xB328F8C79405933E
+data8 0x71C1B3092EF3450B,0x9C12887B20AB9FB5
+data8 0x2EC292472F327B6D,0x550C90A7721FE76B
+data8 0x96CB314A1679E279,0x4189DFF49794E884
+data8 0xE6E29731996BED88,0x365F5F0EFDBBB49A
+data8 0x486CA46742727132,0x5D8DB8159F09E5BC
+data8 0x25318D3974F71C05,0x30010C0D68084B58
+data8 0xEE2C90AA4702E774,0x24D6BDA67DF77248
+data8 0x6EEF169FA6948EF6,0x91B45153D1F20ACF
+data8 0x3398207E4BF56863,0xB25F3EDD035D407F
+data8 0x8985295255C06437,0x10D86D324832754C
+data8 0x5BD4714E6E5445C1,0x090B69F52AD56614
+data8 0x9D072750045DDB3B,0xB4C576EA17F9877D
+data8 0x6B49BA271D296996,0xACCCC65414AD6AE2
+data8 0x9089D98850722CBE,0xA4049407777030F3
+data8 0x27FC00A871EA49C2,0x663DE06483DD9797
+data8 0x3FA3FD94438C860D,0xDE41319D39928C70
+data8 0xDDE7B7173BDF082B,0x3715A0805C93805A
+data8 0x921110D8E80FAF80,0x6C4BFFDB0F903876
+data8 0x185915A562BBCB61,0xB989C7BD401004F2
+data8 0xD2277549F6B6EBBB,0x22DBAA140A2F2689
+data8 0x768364333B091A94,0x0EAA3A51C2A31DAE
+data8 0xEDAF12265C4DC26D,0x9C7A2D9756C0833F
+data8 0x03F6F0098C402B99,0x316D07B43915200C
+data8 0x5BC3D8C492F54BAD,0xC6A5CA4ECD37A736
+data8 0xA9E69492AB6842DD,0xDE6319EF8C76528B
+data8 0x6837DBFCABA1AE31,0x15DFA1AE00DAFB0C
+data8 0x664D64B705ED3065,0x29BF56573AFF47B9
+data8 0xF96AF3BE75DF9328,0x3080ABF68C6615CB
+data8 0x040622FA1DE4D9A4,0xB33D8F1B5709CD36
+data8 0xE9424EA4BE13B523,0x331AAAF0A8654FA5
+data8 0xC1D20F3F0BCD785B,0x76F923048B7B7217
+data8 0x8953A6C6E26E6F00,0xEBEF584A9BB7DAC4
+data8 0xBA66AACFCF761D02,0xD12DF1B1C1998C77
+data8 0xADC3DA4886A05DF7,0xF480C62FF0AC9AEC
+data8 0xDDBC5C3F6DDED01F,0xC790B6DB2A3A25A3
+data8 0x9AAF009353AD0457,0xB6B42D297E804BA7
+data8 0x07DA0EAA76A1597B,0x2A12162DB7DCFDE5
+data8 0xFAFEDB89FDBE896C,0x76E4FCA90670803E
+data8 0x156E85FF87FD073E,0x2833676186182AEA
+data8 0xBD4DAFE7B36E6D8F,0x3967955BBF3148D7
+data8 0x8416DF30432DC735,0x6125CE70C9B8CB30
+data8 0xFD6CBFA200A4E46C,0x05A0DD5A476F21D2
+data8 0x1262845CB9496170,0xE0566B0152993755
+data8 0x50B7D51EC4F1335F,0x6E13E4305DA92E85
+data8 0xC3B21D3632A1A4B7,0x08D4B1EA21F716E4
+data8 0x698F77FF2780030C,0x2D408DA0CD4F99A5
+data8 0x20D3A2B30A5D2F42,0xF9B4CBDA11D0BE7D
+data8 0xC1DB9BBD17AB81A2,0xCA5C6A0817552E55
+data8 0x0027F0147F8607E1,0x640B148D4196DEBE
+data8 0x872AFDDAB6256B34,0x897BFEF3059EBFB9
+data8 0x4F6A68A82A4A5AC4,0x4FBCF82D985AD795
+data8 0xC7F48D4D0DA63A20,0x5F57A4B13F149538
+data8 0x800120CC86DD71B6,0xDEC9F560BF11654D
+data8 0x6B0701ACB08CD0C0,0xB24855510EFB1EC3
+data8 0x72953B06A33540C0,0x7BDC06CC45E0FA29
+data8 0x4EC8CAD641F3E8DE,0x647CD8649B31BED9
+data8 0xC397A4D45877C5E3,0x6913DAF03C3ABA46
+data8 0x18465F7555F5BDD2,0xC6926E5D2EACED44
+data8 0x0E423E1C87C461E9,0xFD29F3D6E7CA7C22
+data8 0x35916FC5E0088DD7,0xFFE26A6EC6FDB0C1
+data8 0x0893745D7CB2AD6B,0x9D6ECD7B723E6A11
+data8 0xC6A9CFF7DF7329BA,0xC9B55100B70DB2E2
+data8 0x24BA74607DE58AD8,0x742C150D0C188194
+data8 0x667E162901767A9F,0xBEFDFDEF4556367E
+data8 0xD913D9ECB9BA8BFC,0x97C427A831C36EF1
+data8 0x36C59456A8D8B5A8,0xB40ECCCF2D891234
+data8 0x576F89562CE3CE99,0xB920D6AA5E6B9C2A
+data8 0x3ECC5F114A0BFDFB,0xF4E16D3B8E2C86E2
+data8 0x84D4E9A9B4FCD1EE,0xEFC9352E61392F44
+data8 0x2138C8D91B0AFC81,0x6A4AFBD81C2F84B4
+data8 0x538C994ECC2254DC,0x552AD6C6C096190B
+data8 0xB8701A649569605A,0x26EE523F0F117F11
+data8 0xB5F4F5CBFC2DBC34,0xEEBC34CC5DE8605E
+data8 0xDD9B8E67EF3392B8,0x17C99B5861BC57E1
+data8 0xC68351103ED84871,0xDDDD1C2DA118AF46
+data8 0x2C21D7F359987AD9,0xC0549EFA864FFC06
+data8 0x56AE79E536228922,0xAD38DC9367AAE855
+data8 0x3826829BE7CAA40D,0x51B133990ED7A948
+data8 0x0569F0B265A7887F,0x974C8836D1F9B392
+data8 0x214A827B21CF98DC,0x9F405547DC3A74E1
+data8 0x42EB67DF9DFE5FD4,0x5EA4677B7AACBAA2
+data8 0xF65523882B55BA41,0x086E59862A218347
+data8 0x39E6E389D49EE540,0xFB49E956FFCA0F1C
+data8 0x8A59C52BFA94C5C1,0xD3CFC50FAE5ADB86
+data8 0xC5476243853B8621,0x94792C8761107B4C
+data8 0x2A1A2C8012BF4390,0x2688893C78E4C4A8
+data8 0x7BDBE5C23AC4EAF4,0x268A67F7BF920D2B
+data8 0xA365B1933D0B7CBD,0xDC51A463DD27DDE1
+data8 0x6919949A9529A828,0xCE68B4ED09209F44
+data8 0xCA984E638270237C,0x7E32B90F8EF5A7E7
+data8 0x561408F1212A9DB5,0x4D7E6F5119A5ABF9
+data8 0xB5D6DF8261DD9602,0x36169F3AC4A1A283
+data8 0x6DED727A8D39A9B8,0x825C326B5B2746ED
+data8 0x34007700D255F4FC,0x4D59018071E0E13F
+data8 0x89B295F364A8F1AE,0xA74B38FC4CEAB2BB
+ASM_SIZE_DIRECTIVE(Constants_Bits_of_2_by_pi)
+
+Constants_Bits_of_pi_by_2:
+ASM_TYPE_DIRECTIVE(Constants_Bits_of_pi_by_2,@object)
+data4 0x2168C234,0xC90FDAA2,0x00003FFF,0x00000000
+data4 0x80DC1CD1,0xC4C6628B,0x00003FBF,0x00000000
+ASM_SIZE_DIRECTIVE(Constants_Bits_of_pi_by_2)
+
+.section .text
+.proc __libm_pi_by_2_reduce#
+.global __libm_pi_by_2_reduce#
+.align 64 
+
+__libm_pi_by_2_reduce: 
+
+//    X is at the address in Address_of_Input
+//    Place the two-piece result at the address in Address_of_Outputs
+//    r followed by c
+//    N is returned
+
+{ .mmf
+alloc  r34 = ar.pfs,2,34,0,0
+(p0)  ldfe  FR_X = [GR_Address_of_Input]
+(p0)  fsetc.s3 0x00,0x7F ;;
+}
+{ .mlx
+	nop.m 999
+(p0)  movl GR_BIASL63 = 0x1003E
+}
+;;
+
+
+//    L         -1-2-3-4
+//    0 0 0 0 0. 1 0 1 0
+//    M          0 1 2 .... 63, 64 65 ... 127, 128
+//     ---------------------------------------------
+//    Segment 0.        1     ,      2       ,    3
+//    START = M - 63                        M = 128 becomes 65
+//    LENGTH1  = START & 0x3F               65 become position 1
+//    SEGMENT  = shr(START,6) + 1      0 maps to 1,   64 maps to 2,
+//    LENGTH2  = 64 - LENGTH1
+//    Address_BASE = shladd(SEGMENT,3) + BASE
+
+
+
+{ .mmi
+      nop.m 999
+(p0)  addl           GR_BASE   = @ltoff(Constants_Bits_of_2_by_pi#), gp
+      nop.i 999
+}
+;;
+
+{ .mmi
+      ld8 GR_BASE = [GR_BASE]
+      nop.m 999
+      nop.i 999
+}
+;;
+
+
+{ .mlx
+	nop.m 999
+(p0)  movl GR_TEMP5 = 0x000000000000FFFE
+}
+{ .mmi
+	nop.m 999 ;;
+(p0)  setf.exp FR_sigma_B = GR_TEMP5
+	nop.i 999
+}
+{ .mlx
+	nop.m 999
+(p0)  movl GR_TEMP6 = 0x000000000000FFBE ;;
+}
+//    Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65).
+{ .mfi
+(p0)  setf.exp FR_sigma_A = GR_TEMP6
+	nop.f 999
+	nop.i 999 ;;
+}
+//    Special Code for testing DE arguments 
+//    (p0)  movl GR_BIASL63 = 0x0000000000013FFE
+//    (p0)  movl GR_x_lo = 0xFFFFFFFFFFFFFFFF
+//    (p0)  setf.exp FR_X = GR_BIASL63
+//    (p0)  setf.sig FR_ScaleP3 = GR_x_lo
+//    (p0)  fmerge.se FR_X = FR_X,FR_ScaleP3
+//    Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x.
+//    2/pi is stored contigously as
+//    0x00000000 0x00000000.0xA2F....
+//    M = EXP - BIAS  ( M >= 63)
+//    Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m.
+//    Thus -1 <= L <= -16321.
+{ .mmf
+(p0)  getf.exp GR_Exp_x = FR_X
+(p0)  getf.sig GR_x_lo = FR_X
+(p0)  fabs FR_X = FR_X ;;
+}
+{ .mii
+(p0)  and  GR_x_lo = 0x03,GR_x_lo
+(p0)  extr.u GR_M = GR_Exp_x,0,17 ;;
+(p0)  sub  GR_START = GR_M,GR_BIASL63
+}
+{ .mmi
+	nop.m 999 ;;
+(p0)  and  GR_LENGTH1 = 0x3F,GR_START
+(p0)  shr.u  GR_SEGMENT = GR_START,6
+}
+{ .mmi
+	nop.m 999 ;;
+(p0)  add  GR_SEGMENT = 0x1,GR_SEGMENT
+(p0)  sub  GR_LENGTH2 = 0x40,GR_LENGTH1
+}
+//    P_0 is the two bits corresponding to bit positions L+2 and L+1
+//    P_1 is the 64-bit starting at bit position  L
+//    P_2 is the 64-bit starting at bit position  L-64
+//    P_3 is the 64-bit starting at bit position  L-128
+//    P_4 is the 64-bit starting at bit position  L-192
+//    P_1 is made up of Alo and Bhi
+//    P_1 = deposit Alo, position 0, length2  into P_1,position length1
+//          deposit Bhi, position length2, length1 into P_1, position 0
+//    P_2 is made up of Blo and Chi
+//    P_2 = deposit Blo, position 0, length2  into P_2, position length1
+//          deposit Chi, position length2, length1 into P_2, position 0
+//    P_3 is made up of Clo and Dhi
+//    P_3 = deposit Clo, position 0, length2  into P_3, position length1
+//          deposit Dhi, position length2, length1 into P_3, position 0
+//    P_4 is made up of Clo and Dhi
+//    P_4 = deposit Dlo, position 0, length2  into P_4, position length1
+//          deposit Ehi, position length2, length1 into P_4, position 0
+{ .mmi
+(p0)  cmp.le.unc p6,p7 = 0x2,GR_LENGTH1 ;;
+(p0)  shladd GR_BASE = GR_SEGMENT,3,GR_BASE
+(p7)  cmp.eq.unc p8,p9 = 0x1,GR_LENGTH1 ;;
+}
+{ .mmi
+	nop.m 999
+//    ld_64 A at Base and increment Base by 8
+//    ld_64 B at Base and increment Base by 8
+//    ld_64 C at Base and increment Base by 8
+//    ld_64 D at Base and increment Base by 8
+//    ld_64 E at Base and increment Base by 8
+//                                          A/B/C/D
+//                                    ---------------------
+//    A, B, C, D, and E look like    | length1 | length2   |
+//                                    ---------------------
+//                                       hi        lo
+(p0)  ld8 GR_A = [GR_BASE],8
+(p0)  extr.u GR_sgn_x = GR_Exp_x,17,1 ;;
+}
+{ .mmf
+	nop.m 999
+(p0)  ld8 GR_B = [GR_BASE],8
+(p0)  fmerge.se FR_X = FR_sigma_B,FR_X ;;
+}
+{ .mii
+(p0)  ld8 GR_C = [GR_BASE],8
+(p8)  extr.u GR_Temp = GR_A,63,1 ;;
+(p0)  shl GR_TEMP1 = GR_A,GR_LENGTH1
+}
+{ .mii
+(p0)  ld8 GR_D = [GR_BASE],8
+//    If length1 >= 2,
+//       P_0 = deposit Ahi, position length2, 2 bit into P_0 at position 0.
+(p6)     shr.u GR_P_0 = GR_A,GR_LENGTH2 ;;
+(p0)  shl GR_TEMP2 = GR_B,GR_LENGTH1
+}
+{ .mii
+(p0)  ld8 GR_E = [GR_BASE],-40
+(p0)  shr.u GR_P_1 = GR_B,GR_LENGTH2 ;;
+(p0)  shr.u GR_P_2 = GR_C,GR_LENGTH2
+}
+//    Else
+//       Load 16 bit of ASUB from (Base_Address_of_A - 2)
+//       P_0 = ASUB & 0x3
+//       If length1 == 0,
+//          P_0 complete
+//       Else
+//          Deposit element 63 from Ahi and place in element 0 of P_0.
+//       Endif
+//    Endif
+{ .mii
+(p7)  ld2 GR_ASUB = [GR_BASE],8
+(p0)  shl GR_TEMP3 = GR_C,GR_LENGTH1 ;;
+(p0)  shl GR_TEMP4 = GR_D,GR_LENGTH1
+}
+{ .mii
+	nop.m 999
+(p0)  shr.u GR_P_3 = GR_D,GR_LENGTH2 ;;
+(p0)  shr.u GR_P_4 = GR_E,GR_LENGTH2
+}
+{ .mii
+(p7)  and GR_P_0 = 0x03,GR_ASUB
+(p6)     and GR_P_0 = 0x03,GR_P_0 ;;
+(p0)  or GR_P_1 = GR_P_1,GR_TEMP1
+}
+{ .mmi
+(p8)  and GR_P_0 = 0x1,GR_P_0 ;;
+(p0)  or GR_P_2 = GR_P_2,GR_TEMP2
+(p8)  shl GR_P_0 = GR_P_0,0x1 ;;
+}
+{ .mii
+	nop.m 999
+(p0)  or GR_P_3 = GR_P_3,GR_TEMP3
+(p8)  or GR_P_0 = GR_P_0,GR_Temp
+}
+{ .mmi
+(p0)  setf.sig FR_p_1 = GR_P_1 ;;
+(p0)  setf.sig FR_p_2 = GR_P_2
+(p0)  or GR_P_4 = GR_P_4,GR_TEMP4 ;;
+}
+{ .mmi
+	nop.m 999 ;;
+(p0)  setf.sig FR_p_3 = GR_P_3
+(p0)  pmpy2.r GR_M = GR_P_0,GR_x_lo
+}
+{ .mlx
+(p0)  setf.sig FR_p_4 = GR_P_4
+//    P_1, P_2, P_3, P_4 are integers. They should be
+//    2^(L-63)     * P_1;
+//    2^(L-63-64)  * P_2;
+//    2^(L-63-128) * P_3;
+//    2^(L-63-192) * P_4;
+//    Since each of them need to be multiplied to x, we would scale
+//    both x and the P_j's by some convenient factors: scale each
+//    of P_j's up by 2^(63-L), and scale x down by 2^(L-63).
+//    p_1 := fcvt.xf ( P_1 )
+//    p_2 := fcvt.xf ( P_2 ) * 2^(-64)
+//    p_3 := fcvt.xf ( P_3 ) * 2^(-128)
+//    p_4 := fcvt.xf ( P_4 ) * 2^(-192)
+//    x= Set x's exp to -1 because 2^m*1.x...x *2^(L-63)=2^(-1)*1.x...xxx
+//             ---------   ---------   ---------
+//             |  P_1  |   |  P_2  |   |  P_3  |
+//             ---------   ---------   ---------
+//                                           ---------
+//	      X                              |   X   |
+//	                                     ---------
+//      ----------------------------------------------------
+//                               ---------   ---------
+//	                         |  A_hi |   |  A_lo |
+//	                         ---------   ---------
+//                   ---------   ---------
+//	             |  B_hi |   |  B_lo |
+//	             ---------   ---------
+//       ---------   ---------
+//	 |  C_hi |   |  C_lo |
+//	 ---------   ---------
+//     ====================================================
+//    -----------   ---------   ---------   ---------
+//    |    S_0  |   |  S_1  |   |  S_2  |   |  S_3  |
+//    -----------   ---------   ---------   ---------
+//    |            |___ binary point
+//    |___ possibly one more bit
+//
+//    Let FPSR3 be set to round towards zero with widest precision
+//    and exponent range. Unless an explicit FPSR is given,
+//    round-to-nearest with widest precision and exponent range is
+//    used.
+(p0)  movl GR_TEMP1 = 0x000000000000FFBF
+}
+{ .mmi
+	nop.m 999 ;;
+(p0)  setf.exp FR_ScaleP2 = GR_TEMP1
+	nop.i 999
+}
+{ .mlx
+	nop.m 999
+(p0)  movl GR_TEMP4 = 0x000000000001003E
+}
+{ .mmi
+	nop.m 999 ;;
+(p0)  setf.exp FR_sigma_C = GR_TEMP4
+	nop.i 999
+}
+{ .mlx
+	nop.m 999
+(p0)  movl GR_TEMP2 = 0x000000000000FF7F ;;
+}
+{ .mmf
+	nop.m 999
+(p0)  setf.exp FR_ScaleP3 = GR_TEMP2
+(p0)  fcvt.xuf.s1 FR_p_1 = FR_p_1 ;;
+}
+{ .mfi
+	nop.m 999
+(p0)  fcvt.xuf.s1 FR_p_2 = FR_p_2
+	nop.i 999
+}
+{ .mlx
+	nop.m 999
+(p0)  movl GR_Temp = 0x000000000000FFDE ;;
+}
+{ .mmf
+	nop.m 999
+(p0)  setf.exp FR_TWOM33 = GR_Temp
+(p0)  fcvt.xuf.s1 FR_p_3 = FR_p_3 ;;
+}
+{ .mfi
+	nop.m 999
+(p0)  fcvt.xuf.s1 FR_p_4 = FR_p_4
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+//    Tmp_C := fmpy.fpsr3( x, p_1 );
+//    Tmp_B := fmpy.fpsr3( x, p_2 );
+//    Tmp_A := fmpy.fpsr3( x, p_3 );
+//    If Tmp_C >= sigma_C then
+//      C_hi := Tmp_C;
+//      C_lo := x*p_1 - C_hi ...fma, exact
+//    Else
+//      C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C
+//      C_lo := x*p_1 - C_hi ...fma, exact
+//    End If
+//    If Tmp_B >= sigma_B then
+//      B_hi := Tmp_B;
+//      B_lo := x*p_2 - B_hi ...fma, exact
+//    Else
+//      B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B
+//      B_lo := x*p_2 - B_hi ...fma, exact
+//    End If
+//    If Tmp_A >= sigma_A then
+//      A_hi := Tmp_A;
+//      A_lo := x*p_3 - A_hi ...fma, exact
+//    Else
+//      A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A
+//      Exact, regardless ...of rounding direction
+//      A_lo := x*p_3 - A_hi ...fma, exact
+//    Endif
+(p0)  fmpy.s3 FR_Tmp_C = FR_X,FR_p_1
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p0)  fmpy.s1 FR_p_2 = FR_p_2,FR_ScaleP2
+	nop.i 999
+}
+{ .mlx
+	nop.m 999
+(p0)  movl GR_Temp = 0x0000000000000400
+}
+{ .mlx
+	nop.m 999
+(p0)  movl GR_TEMP3 = 0x000000000000FF3F ;;
+}
+{ .mmf
+	nop.m 999
+(p0)  setf.exp FR_ScaleP4 = GR_TEMP3
+(p0)  fmpy.s1 FR_p_3 = FR_p_3,FR_ScaleP3 ;;
+}
+{ .mlx
+	nop.m 999
+(p0)  movl GR_TEMP4 = 0x0000000000010045 ;;
+}
+{ .mmf
+	nop.m 999
+(p0)  setf.exp FR_Tmp2_C = GR_TEMP4
+(p0)  fmpy.s3 FR_Tmp_B = FR_X,FR_p_2 ;;
+}
+{ .mfi
+	nop.m 999
+(p0)  fcmp.ge.unc.s1 p12,  p9 = FR_Tmp_C,FR_sigma_C
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p0)  fmpy.s3 FR_Tmp_A = FR_X,FR_p_3
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p12) mov FR_C_hi = FR_Tmp_C
+	nop.i 999 ;;
+}
+{ .mfi
+(p0)  addl           GR_BASE   = @ltoff(Constants_Bits_of_pi_by_2#), gp
+(p9)  fadd.s3 FR_C_hi = FR_sigma_C,FR_Tmp_C
+	nop.i 999
+}
+;;
+
+
+
+//   End If
+//   Step 3. Get reduced argument
+//   If sgn_x == 0 (that is original x is positive)
+//      D_hi := Pi_by_2_hi
+//      D_lo := Pi_by_2_lo
+//      Load from table
+//   Else
+//      D_hi := neg_Pi_by_2_hi
+//      D_lo := neg_Pi_by_2_lo
+//      Load from table
+//   End If
+
+
+{ .mmi
+      ld8 GR_BASE = [GR_BASE]
+      nop.m 999
+      nop.i 999
+}
+;;
+
+
+{ .mfi
+(p0) ldfe FR_D_hi = [GR_BASE],16
+(p0)  fmpy.s1 FR_p_4 = FR_p_4,FR_ScaleP4
+	nop.i 999 ;;
+}
+{ .mfi
+(p0) ldfe FR_D_lo = [GR_BASE],0
+(p0)  fcmp.ge.unc.s1 p13, p10 = FR_Tmp_B,FR_sigma_B
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p13) mov FR_B_hi = FR_Tmp_B
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+(p12) fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p10) fadd.s3 FR_B_hi = FR_sigma_B,FR_Tmp_B
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+(p9)  fsub.s1 FR_C_hi = FR_C_hi,FR_sigma_C
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p0)  fcmp.ge.unc.s1 p14, p11 = FR_Tmp_A,FR_sigma_A
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p14) mov FR_A_hi = FR_Tmp_A
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p11) fadd.s3 FR_A_hi = FR_sigma_A,FR_Tmp_A
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p9)  fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi
+(p0)  cmp.eq.unc p12,p9 = 0x1,GR_sgn_x
+}
+{ .mfi
+	nop.m 999
+(p13) fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p10) fsub.s1 FR_B_hi = FR_B_hi,FR_sigma_B
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+//    Note that C_hi is of integer value. We need only the
+//    last few bits. Thus we can ensure C_hi is never a big
+//    integer, freeing us from overflow worry.
+//    Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70);
+//    Tmp_C is the upper portion of C_hi
+(p0)  fadd.s3 FR_Tmp_C = FR_C_hi,FR_Tmp2_C
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p14) fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+(p11) fsub.s1 FR_A_hi = FR_A_hi,FR_sigma_A
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+//    *******************
+//    Step 2. Get N and f
+//    *******************
+//    We have all the components to obtain
+//    S_0, S_1, S_2, S_3 and thus N and f. We start by adding
+//    C_lo and B_hi. This sum together with C_hi estimates
+//    N and f well.
+//    A := fadd.fpsr3( B_hi, C_lo )
+//    B := max( B_hi, C_lo )
+//    b := min( B_hi, C_lo )
+(p0)  fadd.s3 FR_A = FR_B_hi,FR_C_lo
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+(p10) fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p0)  fsub.s1 FR_Tmp_C = FR_Tmp_C,FR_Tmp2_C
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p0)  fmax.s1 FR_B = FR_B_hi,FR_C_lo
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p0)  fmin.s1 FR_b = FR_B_hi,FR_C_lo
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+(p11) fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+//    N := round_to_nearest_integer_value( A );
+(p0)  fcvt.fx.s1 FR_N = FR_A
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+//    C_hi := C_hi - Tmp_C ...0 <= C_hi < 2^7
+(p0)  fsub.s1 FR_C_hi = FR_C_hi,FR_Tmp_C
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+//    a := (B - A) + b: Exact - note that a is either 0 or 2^(-64).
+(p0)  fsub.s1 FR_a = FR_B,FR_A
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+//    f := A - N; Exact because lsb(A) >= 2^(-64) and |f| <= 1/2.
+(p0)  fnorm.s1 FR_N = FR_N
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+(p0)  fadd.s1 FR_a = FR_a,FR_b
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p0)  fsub.s1 FR_f = FR_A,FR_N
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+//    N := convert to integer format( C_hi + N );
+//    M := P_0 * x_lo;
+//    N := N + M;
+(p0)  fadd.s1 FR_N = FR_N,FR_C_hi
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+//    f = f + a	Exact because a is 0 or 2^(-64);
+//    the msb of the sum is <= 1/2 and lsb >= 2^(-64).
+(p0)  fadd.s1 FR_f = FR_f,FR_a
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+//
+//    Create 2**(-33)
+//
+(p0)  fcvt.fx.s1 FR_N = FR_N
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p0)  fabs FR_f_abs = FR_f
+	nop.i 999 ;;
+}
+{ .mfi
+(p0)  getf.sig GR_N = FR_N
+	nop.f 999
+	nop.i 999 ;;
+}
+{ .mii
+	nop.m 999
+	nop.i 999 ;;
+(p0)  add GR_N = GR_N,GR_M ;;
+}
+//    If sgn_x == 1 (that is original x was negative)
+//       N := 2^10 - N
+//       this maintains N to be non-negative, but still
+//       equivalent to the (negated N) mod 4.
+//    End If
+{ .mii
+(p12) sub GR_N = GR_Temp,GR_N
+(p0) cmp.eq.unc p12,p9 = 0x0,GR_sgn_x ;;
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+(p0)  fcmp.ge.unc.s1 p13, p10 = FR_f_abs,FR_TWOM33
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p9) fsub.s1 FR_D_hi = f0, FR_D_hi
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p10)    fadd.s3 FR_A = FR_A_hi,FR_B_lo
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+(p13)    fadd.s1 FR_g = FR_A_hi,FR_B_lo
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p10)    fmax.s1 FR_B = FR_A_hi,FR_B_lo
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+(p9) fsub.s1 FR_D_lo = f0, FR_D_lo
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p10)    fmin.s1 FR_b = FR_A_hi,FR_B_lo
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p0) fsetc.s3 0x7F,0x40
+	nop.i 999
+}
+{ .mlx
+	nop.m 999
+(p10)    movl GR_Temp = 0x000000000000FFCD ;;
+}
+{ .mmf
+	nop.m 999
+(p10)    setf.exp FR_TWOM50 = GR_Temp
+(p10)    fadd.s1 FR_f_hi = FR_A,FR_f ;;
+}
+{ .mfi
+	nop.m 999
+//       a := (B - A) + b	Exact.
+//       Note that a is either 0 or 2^(-128).
+//       f_hi := A + f;
+//       f_lo := (f - f_hi) + A
+//       f_lo=f-f_hi is exact because either |f| >= |A|, in which
+//       case f-f_hi is clearly exact; or otherwise, 0<|f|<|A|
+//       means msb(f) <= msb(A) = 2^(-64) => |f| = 2^(-64).
+//       If f = 2^(-64), f-f_hi involves cancellation and is
+//       exact. If f = -2^(-64), then A + f is exact. Hence
+//       f-f_hi is -A exactly, giving f_lo = 0.
+//       f_lo := f_lo + a;
+(p10)    fsub.s1 FR_a = FR_B,FR_A
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+(p13)    fadd.s1 FR_s_hi = FR_f,FR_g
+	nop.i 999 ;;
+}
+{ .mlx
+	nop.m 999
+//    If |f| >= 2^(-33)
+//       Case 1
+//       CASE := 1
+//       g := A_hi + B_lo;
+//       s_hi := f + g;
+//       s_lo := (f - s_hi) + g;
+(p13)    movl GR_CASE = 0x1 ;;
+}
+{ .mlx
+	nop.m 999
+//   Else
+//       Case 2
+//       CASE := 2
+//       A := fadd.fpsr3( A_hi, B_lo )
+//       B := max( A_hi, B_lo )
+//       b := min( A_hi, B_lo )
+(p10)    movl GR_CASE = 0x2
+}
+{ .mfi
+	nop.m 999
+(p10)    fsub.s1 FR_f_lo = FR_f,FR_f_hi
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p10)    fadd.s1 FR_a = FR_a,FR_b
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+(p13)    fsub.s1 FR_s_lo = FR_f,FR_s_hi
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p13)    fadd.s1 FR_s_lo = FR_s_lo,FR_g
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p10)    fcmp.ge.unc.s1 p14, p11 = FR_f_abs,FR_TWOM50
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+//
+//       Create 2**(-50)
+(p10)    fadd.s1 FR_f_lo = FR_f_lo,FR_A
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+//       If |f| >= 2^(-50) then
+//          s_hi := f_hi;
+//          s_lo := f_lo;
+//       Else
+//          f_lo := (f_lo + A_lo) + x*p_4
+//          s_hi := f_hi + f_lo
+//          s_lo := (f_hi - s_hi) + f_lo
+//       End If
+(p14)  mov FR_s_hi = FR_f_hi
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p10)    fadd.s1 FR_f_lo = FR_f_lo,FR_a
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p14)  mov FR_s_lo = FR_f_lo
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+(p11)  fadd.s1 FR_f_lo = FR_f_lo,FR_A_lo
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p11)  fma.s1 FR_f_lo = FR_X,FR_p_4,FR_f_lo
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p11)  fadd.s1 FR_s_hi = FR_f_hi,FR_f_lo
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+//   r_hi :=  s_hi*D_hi
+//   r_lo :=  s_hi*D_hi - r_hi  with fma
+//   r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi
+(p0) fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+(p11)  fsub.s1 FR_s_lo = FR_f_hi,FR_s_hi
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p0) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+(p11)  fadd.s1 FR_s_lo = FR_s_lo,FR_f_lo
+	nop.i 999 ;;
+}
+{ .mmi
+	nop.m 999 ;;
+//   Return  N, r_hi, r_lo
+//   We do not return CASE
+(p0) stfe [GR_Address_of_Outputs] = FR_r_hi,16
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p0) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p0) fma.s1 FR_r_lo = FR_s_lo,FR_D_hi,FR_r_lo
+	nop.i 999 ;;
+}
+{ .mmi
+	nop.m 999 ;;
+(p0) stfe [GR_Address_of_Outputs] = FR_r_lo,-16
+	nop.i 999
+}
+{ .mib
+	nop.m 999
+	nop.i 999
+(p0) br.ret.sptk   b0 ;;
+}
+
+.endp __libm_pi_by_2_reduce
+ASM_SIZE_DIRECTIVE(__libm_pi_by_2_reduce)