summary refs log tree commit diff
path: root/sysdeps/ia64/fpu/s_tanl.S
diff options
context:
space:
mode:
author Jakub Jelinek <jakub@redhat.com> 2007-07-12 18:26:36 +0000
committer Jakub Jelinek <jakub@redhat.com> 2007-07-12 18:26:36 +0000
commit 0ecb606cb6cf65de1d9fc8a919bceb4be476c602 (patch)
tree 2ea1f8305970753e4a657acb2ccc15ca3eec8e2c /sysdeps/ia64/fpu/s_tanl.S
parent 7d58530341304d403a6626d7f7a1913165fe2f32 (diff)
download glibc-0ecb606cb6cf65de1d9fc8a919bceb4be476c602.tar.gz
glibc-0ecb606cb6cf65de1d9fc8a919bceb4be476c602.tar.xz
glibc-0ecb606cb6cf65de1d9fc8a919bceb4be476c602.zip
2.5-18.1
Diffstat (limited to 'sysdeps/ia64/fpu/s_tanl.S')
-rw-r--r-- sysdeps/ia64/fpu/s_tanl.S 3032
1 files changed, 1605 insertions, 1427 deletions
diff --git a/sysdeps/ia64/fpu/s_tanl.S b/sysdeps/ia64/fpu/s_tanl.S
index e13e6c6cbd..607a271545 100644
--- a/sysdeps/ia64/fpu/s_tanl.S
+++ b/sysdeps/ia64/fpu/s_tanl.S
@@ -1,10 +1,10 @@
-.file "tanl.s"
+.file "tancotl.s"
 
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2004, Intel Corporation
 // All rights reserved.
-// 
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
 // * The name of Intel Corporation may not be used to endorse or promote
 // products derived from this software without specific prior written
 // permission.
-//
+
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,50 +35,78 @@
 // 
 // Intel Corporation is the author of this code, and requests that all
 // problem reports or change requests be submitted to it directly at 
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
 //
-// *********************************************************************
+//*********************************************************************
 //
 // History: 
 //
-// 2/02/2000 (hand-optimized)
-// 4/04/00  Unwind support added
+// 02/02/00 (hand-optimized)
+// 04/04/00 Unwind support added
 // 12/28/00 Fixed false invalid flags
+// 02/06/02 Improved speed
+// 05/07/02 Changed interface to __libm_pi_by_2_reduce
+// 05/30/02 Added cotl
+// 02/10/03 Reordered header: .section, .global, .proc, .align;
+//          used data8 for long double table values
+// 05/15/03 Reformatted data tables
+// 10/26/04 Avoided using r14-31 as scratch so not clobbered by dynamic loader
 //
-// *********************************************************************
+//*********************************************************************
 //
-// Function:   tanl(x) = tangent(x), for double-extended precision x values
+// Functions:   tanl(x) = tangent(x), for double-extended precision x values
+//              cotl(x) = cotangent(x), for double-extended precision x values
 //
-// *********************************************************************
+//*********************************************************************
 //
 // Resources Used:
 //
 //    Floating-Point Registers: f8 (Input and Return Value)
 //                              f9-f15
-//                              f32-f112
+//                              f32-f121
 //
 //    General Purpose Registers:
-//      r32-r48
-//      r49-r50 (Used to pass arguments to pi_by_2 reduce routine)
+//      r32-r70
 //
 //    Predicate Registers:      p6-p15
 //
-// *********************************************************************
+//*********************************************************************
 //
-// IEEE Special Conditions:
+// IEEE Special Conditions for tanl:
 //
 //    Denormal  fault raised on denormal inputs
 //    Overflow exceptions do not occur
-//    Underflow exceptions raised when appropriate for tan 
+//    Underflow exceptions raised when appropriate for tan
 //    (No specialized error handling for this routine)
 //    Inexact raised when appropriate by algorithm
 //
-//    tan(SNaN) = QNaN
-//    tan(QNaN) = QNaN
-//    tan(inf) = QNaN
-//    tan(+/-0) = +/-0
+//    tanl(SNaN) = QNaN
+//    tanl(QNaN) = QNaN
+//    tanl(inf) = QNaN
+//    tanl(+/-0) = +/-0
+//
+//*********************************************************************
+//
+// IEEE Special Conditions for cotl:
+//
+//    Denormal  fault raised on denormal inputs
+//    Overflow exceptions occur at zero and near zero
+//    Underflow exceptions do not occur
+//    Inexact raised when appropriate by algorithm
+//
+//    cotl(SNaN) = QNaN
+//    cotl(QNaN) = QNaN
+//    cotl(inf) = QNaN
+//    cotl(+/-0) = +/-Inf and error handling is called
+//
+//*********************************************************************
 //
-// *********************************************************************
+//    Below are mathematical and algorithmic descriptions for tanl.
+//    For cotl we use the identity cot(x) = -tan(x + Pi/2).
+//    So, to compute cot(x) we just need to increment N (N = N + 1)
+//    and invert the sign of the computed result.
+//
+//*********************************************************************
 //
 // Mathematical Description
 //
@@ -106,13 +134,13 @@
 // -------
 //
 //      tan(r + c) = r + c + r^3/3          ...accurately
-//        -cot(r + c) = -1/(r+c) + r/3          ...accurately
+//     -cot(r + c) = -1/(r+c) + r/3          ...accurately
 //
 // Case 4:
 // -------
 //
 //      tan(r + c) = r + c + r^3/3 + 2r^5/15     ...accurately
-//        -cot(r + c) = -1/(r+c) + r/3 + r^3/45     ...accurately
+//     -cot(r + c) = -1/(r+c) + r/3 + r^3/45     ...accurately
 //
 //
 // The only cases left are Cases 1 and 3 of the argument reduction
@@ -143,13 +171,13 @@
 // Since Arg = N pi/2 + r + c accurately, we have
 //
 //      tan(Arg) =  tan(r+c)            for N even,
-//            = -cot(r+c)          otherwise.
+//               = -cot(r+c)            otherwise.
 //
 // Here for this case, both tan(r) and -cot(r) can be approximated
 // by simple polynomials:
 //
 //      tan(r) =    r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
-//        -cot(r) = -1/r + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
+//     -cot(r) = -1/r + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
 //
 // accurately. Since |r| is relatively small, tan(r+c) and
 // -cot(r+c) can be accurately approximated by replacing r with
@@ -178,21 +206,21 @@
 // The required calculation is either
 //
 //      tan(r + c)  =  tan(r)  +  correction,  or
-//        -cot(r + c)  = -cot(r)  +  correction.
+//     -cot(r + c)  = -cot(r)  +  correction.
 //
 // Specifically,
 //
 //      tan(r + c) =  tan(r) + c tan'(r)  + O(c^2)
-//              =  tan(r) + c sec^2(r) + O(c^2)
-//              =  tan(r) + c SEC_sq     ...accurately
+//                 =  tan(r) + c sec^2(r) + O(c^2)
+//                 =  tan(r) + c SEC_sq     ...accurately
 //                as long as SEC_sq approximates sec^2(r)
 //                to, say, 5 bits or so.
 //
 // Similarly,
 //
-//        -cot(r + c) = -cot(r) - c cot'(r)  + O(c^2)
-//              = -cot(r) + c csc^2(r) + O(c^2)
-//              = -cot(r) + c CSC_sq     ...accurately
+//     -cot(r + c) = -cot(r) - c cot'(r)  + O(c^2)
+//                 = -cot(r) + c csc^2(r) + O(c^2)
+//                 = -cot(r) + c CSC_sq     ...accurately
 //                as long as CSC_sq approximates csc^2(r)
 //                to, say, 5 bits or so.
 //
@@ -208,14 +236,14 @@
 // where
 //
 //      B = 2^k * 1.b_1 b_2 ... b_5 1
-//         x = |r| - B
+//      x = |r| - B
 //
 // Now,
 //                   tan(B)  +   tan(x)
 //      tan( B + x ) =  ------------------------
 //                   1 -  tan(B)*tan(x)
 //
-//               /                         \ 
+//               /                         \
 //               |   tan(B)  +   tan(x)          |
 
 //      = tan(B) +  | ------------------------ - tan(B) |
@@ -248,7 +276,7 @@
 //      cot( B + x ) =  ------------------------
 //                   tan(B)  +  tan(x)
 //
-//               /                           \ 
+//               /                           \
 //               |   1 - tan(B)*tan(x)              |
 
 //      = cot(B) +  | ----------------------- - cot(B) |
@@ -273,7 +301,7 @@
 //      Arg = N * pi/2 +  r + c          ...accurately
 //
 //      tan(Arg) =  tan(r) + correction    if N is even;
-//            = -cot(r) + correction    otherwise.
+//               = -cot(r) + correction    otherwise.
 //
 // For Cases 2 and 4,
 //
@@ -292,8 +320,8 @@
 //      tan(Arg) =  r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
 //                     + c*(1 + r^2)               N even
 //
-//                  = -1/(r+c) + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
-//               + Q1_1*c                    N odd
+//               = -1/(r+c) + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
+//                     + Q1_1*c                    N odd
 //
 //     Case normal_r: 2^(-2) <= |r| <= pi/4
 //
@@ -304,15 +332,15 @@
 //
 //      tan(Arg) = tan(r) + c*sec^2(r)
 //               = tan( sgn_r * (B+x) ) + c * sec^2(|r|)
-//                  = sgn_r * ( tan(B+x)  + sgn_r*c*sec^2(|r|) )
-//                  = sgn_r * ( tan(B+x)  + sgn_r*c*sec^2(B) )
+//               = sgn_r * ( tan(B+x)  + sgn_r*c*sec^2(|r|) )
+//               = sgn_r * ( tan(B+x)  + sgn_r*c*sec^2(B) )
 //
 // since B approximates |r| to 2^(-6) in relative accuracy.
 //
 //                 /            (1/[sin(B)*cos(B)]) * tan(x)
 //    tan(Arg) = sgn_r * | tan(B) + --------------------------------
 //                 \                     cot(B)  -  tan(x)
-//                                        \ 
+//                                        \
 //                       + CORR  |
 
 //                                     /
@@ -324,15 +352,15 @@
 //
 //      tan(Arg) = -cot(r) + c*csc^2(r)
 //               = -cot( sgn_r * (B+x) ) + c * csc^2(|r|)
-//                  = sgn_r * ( -cot(B+x)  + sgn_r*c*csc^2(|r|) )
-//                  = sgn_r * ( -cot(B+x)  + sgn_r*c*csc^2(B) )
+//               = sgn_r * ( -cot(B+x)  + sgn_r*c*csc^2(|r|) )
+//               = sgn_r * ( -cot(B+x)  + sgn_r*c*csc^2(B) )
 //
 // since B approximates |r| to 2^(-6) in relative accuracy.
 //
 //                 /            (1/[sin(B)*cos(B)]) * tan(x)
 //    tan(Arg) = sgn_r * | -cot(B) + --------------------------------
 //                 \                     tan(B)  +  tan(x)
-//                                        \ 
+//                                        \
 //                       + CORR  |
 
 //                                     /
@@ -356,8 +384,8 @@
 //    For N even,
 //
 //    rsq := r * r
-//    Result := c + r * rsq * P1_1
-//    Result := r + Result          ...in user-defined rounding
+//    Poly := c + r * rsq * P1_1
+//    Result := r + Poly          ...in user-defined rounding
 //
 //    For N odd,
 //    S_hi  := -frcpa(r)               ...8 bits
@@ -375,8 +403,8 @@
 //    For N even,
 //
 //    rsq := r * r
-//    Result := c + r * rsq * (P1_1 + rsq * P1_2)
-//    Result := r + Result          ...in user-defined rounding
+//    Poly := c + r * rsq * (P1_1 + rsq * P1_2)
+//    Result := r + Poly          ...in user-defined rounding
 //
 //    For N odd,
 //    S_hi  := -frcpa(r)               ...8 bits
@@ -414,8 +442,8 @@
 //      Poly2 := P1_4 + rsq*(P1_5 + rsq*(P1_6 + ... rsq*P1_9))
 //      CORR  := c * ( 1 + rsq )
 //      Poly  := Poly1 + r_to_the_8*Poly2
-//      Result := r*Poly + CORR
-//      Result := r + Result     ...in user-defined rounding
+//      Poly := r*Poly + CORR
+//      Result := r + Poly     ...in user-defined rounding
 //      ...note that Poly1 and r_to_the_8 can be computed in parallel
 //      ...with Poly2 (Poly1 is intentionally set to be much
 //      ...shorter than Poly2 so that r_to_the_8 and CORR can be hidden)
@@ -434,8 +462,8 @@
 //      rsq := r*r
 //      P   := Q1_1 + rsq*(Q1_2 + rsq*(Q1_3 + ... + rsq*Q1_7))
 //
-//      Result :=  r*P + S_lo
-//      Result :=  S_hi  +  Result      ...in user-defined rounding
+//      Poly :=  r*P + S_lo
+//      Result :=  S_hi  +  Poly      ...in user-defined rounding
 //
 //
 // Algorithm for the case of normal_r
@@ -454,7 +482,7 @@
 //           /           (1/[sin(B)*cos(B)]) * tan(x)
 //      sgn_r * | tan(B) + --------------------------------  +
 //           \                     cot(B)  -  tan(x)
-//                                \ 
+//                                \
 //                          CORR  |
 
 //                                /
@@ -463,7 +491,7 @@
 // calculated beforehand and stored in a table. Specifically,
 // the table values are
 //
-//      tan(B)                as  T_hi  +  T_lo;
+//      tan(B)             as  T_hi  +  T_lo;
 //      cot(B)             as  C_hi  +  C_lo;
 //      1/[sin(B)*cos(B)]  as  SC_inv
 //
@@ -559,7 +587,7 @@
 //           /             (1/[sin(B)*cos(B)]) * tan(x)
 //      sgn_r * | -cot(B) + --------------------------------  +
 //           \                     tan(B)  +  tan(x)
-//                                \ 
+//                                \
 //                          CORR  |
 
 //                                /
@@ -568,7 +596,7 @@
 // calculated beforehand and stored in a table. Specifically,
 // the table values are
 //
-//      tan(B)                as  T_hi  +  T_lo;
+//      tan(B)             as  T_hi  +  T_lo;
 //      cot(B)             as  C_hi  +  C_lo;
 //      1/[sin(B)*cos(B)]  as  SC_inv
 //
@@ -675,254 +703,382 @@
 //
 //
 
-#include "libm_support.h"
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-.align 128
-
-TANL_BASE_CONSTANTS:
-ASM_TYPE_DIRECTIVE(TANL_BASE_CONSTANTS,@object)
-data4    0x4B800000, 0xCB800000, 0x38800000, 0xB8800000 // two**24, -two**24
-                                                        // two**-14, -two**-14
-data4    0x4E44152A, 0xA2F9836E, 0x00003FFE, 0x00000000 // two_by_pi
-data4    0xCE81B9F1, 0xC84D32B0, 0x00004016, 0x00000000 // P_0
-data4    0x2168C235, 0xC90FDAA2, 0x00003FFF, 0x00000000 // P_1
-data4    0xFC8F8CBB, 0xECE675D1, 0x0000BFBD, 0x00000000 // P_2
-data4    0xACC19C60, 0xB7ED8FBB, 0x0000BF7C, 0x00000000 // P_3
-data4    0x5F000000, 0xDF000000, 0x00000000, 0x00000000 // two_to_63, -two_to_63
-data4    0x6EC6B45A, 0xA397E504, 0x00003FE7, 0x00000000 // Inv_P_0
-data4    0xDBD171A1, 0x8D848E89, 0x0000BFBF, 0x00000000 // d_1
-data4    0x18A66F8E, 0xD5394C36, 0x0000BF7C, 0x00000000 // d_2
-data4    0x2168C234, 0xC90FDAA2, 0x00003FFE, 0x00000000 // PI_BY_4
-data4    0x2168C234, 0xC90FDAA2, 0x0000BFFE, 0x00000000 // MPI_BY_4
-data4    0x3E800000, 0xBE800000, 0x00000000, 0x00000000 // two**-2, -two**-2
-data4    0x2F000000, 0xAF000000, 0x00000000, 0x00000000 // two**-33, -two**-33
-data4    0xAAAAAABD, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P1_1
-data4    0x88882E6A, 0x88888888, 0x00003FFC, 0x00000000 // P1_2
-data4    0x0F0177B6, 0xDD0DD0DD, 0x00003FFA, 0x00000000 // P1_3
-data4    0x646B8C6D, 0xB327A440, 0x00003FF9, 0x00000000 // P1_4
-data4    0x1D5F7D20, 0x91371B25, 0x00003FF8, 0x00000000 // P1_5
-data4    0x61C67914, 0xEB69A5F1, 0x00003FF6, 0x00000000 // P1_6
-data4    0x019318D2, 0xBEDD37BE, 0x00003FF5, 0x00000000 // P1_7
-data4    0x3C794015, 0x9979B146, 0x00003FF4, 0x00000000 // P1_8
-data4    0x8C6EB58A, 0x8EBD21A3, 0x00003FF3, 0x00000000 // P1_9
-data4    0xAAAAAAB4, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // Q1_1
-data4    0x0B5FC93E, 0xB60B60B6, 0x00003FF9, 0x00000000 // Q1_2
-data4    0x0C9BBFBF, 0x8AB355E0, 0x00003FF6, 0x00000000 // Q1_3
-data4    0xCBEE3D4C, 0xDDEBBC89, 0x00003FF2, 0x00000000 // Q1_4
-data4    0x5F80BBB6, 0xB3548A68, 0x00003FEF, 0x00000000 // Q1_5
-data4    0x4CED5BF1, 0x91362560, 0x00003FEC, 0x00000000 // Q1_6
-data4    0x8EE92A83, 0xF189D95A, 0x00003FE8, 0x00000000 // Q1_7
-data4    0xAAAB362F, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P2_1
-data4    0xE97A6097, 0x88888886, 0x00003FFC, 0x00000000 // P2_2
-data4    0x25E716A1, 0xDD108EE0, 0x00003FFA, 0x00000000 // P2_3
+RODATA
+.align 16
+
+LOCAL_OBJECT_START(TANL_BASE_CONSTANTS)
+
+tanl_table_1:
+data8    0xA2F9836E4E44152A, 0x00003FFE // two_by_pi
+data8    0xC84D32B0CE81B9F1, 0x00004016 // P_0
+data8    0xC90FDAA22168C235, 0x00003FFF // P_1
+data8    0xECE675D1FC8F8CBB, 0x0000BFBD // P_2
+data8    0xB7ED8FBBACC19C60, 0x0000BF7C // P_3
+LOCAL_OBJECT_END(TANL_BASE_CONSTANTS)
+
+LOCAL_OBJECT_START(tanl_table_2)
+data8    0xC90FDAA22168C234, 0x00003FFE // PI_BY_4
+data8    0xA397E5046EC6B45A, 0x00003FE7 // Inv_P_0
+data8    0x8D848E89DBD171A1, 0x0000BFBF // d_1
+data8    0xD5394C3618A66F8E, 0x0000BF7C // d_2
+data4    0x3E800000 // two**-2
+data4    0xBE800000 // -two**-2
+data4    0x00000000 // pad
+data4    0x00000000 // pad
+LOCAL_OBJECT_END(tanl_table_2)
+
+LOCAL_OBJECT_START(tanl_table_p1)
+data8    0xAAAAAAAAAAAAAABD, 0x00003FFD // P1_1
+data8    0x8888888888882E6A, 0x00003FFC // P1_2
+data8    0xDD0DD0DD0F0177B6, 0x00003FFA // P1_3
+data8    0xB327A440646B8C6D, 0x00003FF9 // P1_4
+data8    0x91371B251D5F7D20, 0x00003FF8 // P1_5
+data8    0xEB69A5F161C67914, 0x00003FF6 // P1_6
+data8    0xBEDD37BE019318D2, 0x00003FF5 // P1_7
+data8    0x9979B1463C794015, 0x00003FF4 // P1_8
+data8    0x8EBD21A38C6EB58A, 0x00003FF3 // P1_9
+LOCAL_OBJECT_END(tanl_table_p1)
+
+LOCAL_OBJECT_START(tanl_table_q1)
+data8    0xAAAAAAAAAAAAAAB4, 0x00003FFD // Q1_1
+data8    0xB60B60B60B5FC93E, 0x00003FF9 // Q1_2
+data8    0x8AB355E00C9BBFBF, 0x00003FF6 // Q1_3
+data8    0xDDEBBC89CBEE3D4C, 0x00003FF2 // Q1_4
+data8    0xB3548A685F80BBB6, 0x00003FEF // Q1_5
+data8    0x913625604CED5BF1, 0x00003FEC // Q1_6
+data8    0xF189D95A8EE92A83, 0x00003FE8 // Q1_7
+LOCAL_OBJECT_END(tanl_table_q1)
+
+LOCAL_OBJECT_START(tanl_table_p2)
+data8    0xAAAAAAAAAAAB362F, 0x00003FFD // P2_1
+data8    0x88888886E97A6097, 0x00003FFC // P2_2
+data8    0xDD108EE025E716A1, 0x00003FFA // P2_3
+LOCAL_OBJECT_END(tanl_table_p2)
+
+LOCAL_OBJECT_START(tanl_table_tm2)
 //
 //  Entries T_hi   double-precision memory format
 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 //  Entries T_lo  single-precision memory format
 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 //
-data4    0x62400794, 0x3FD09BC3, 0x23A05C32, 0x00000000
-data4    0xDFFBC074, 0x3FD124A9, 0x240078B2, 0x00000000
-data4    0x5BD4920F, 0x3FD1AE23, 0x23826B8E, 0x00000000
-data4    0x15E2701D, 0x3FD23835, 0x22D31154, 0x00000000
-data4    0x63739C2D, 0x3FD2C2E4, 0x2265C9E2, 0x00000000
-data4    0xAFEEA48B, 0x3FD34E36, 0x245C05EB, 0x00000000
-data4    0x7DBB35D1, 0x3FD3DA31, 0x24749F2D, 0x00000000
-data4    0x67321619, 0x3FD466DA, 0x2462CECE, 0x00000000
-data4    0x1F94A4D5, 0x3FD4F437, 0x246D0DF1, 0x00000000
-data4    0x740C3E6D, 0x3FD5824D, 0x240A85B5, 0x00000000
-data4    0x4CB1E73D, 0x3FD61123, 0x23F96E33, 0x00000000
-data4    0xAD9EA64B, 0x3FD6A0BE, 0x247C5393, 0x00000000
-data4    0xB804FD01, 0x3FD73125, 0x241F3B29, 0x00000000
-data4    0xAB53EE83, 0x3FD7C25E, 0x2479989B, 0x00000000
-data4    0xE6640EED, 0x3FD8546F, 0x23B343BC, 0x00000000
-data4    0xE8AF1892, 0x3FD8E75F, 0x241454D1, 0x00000000
-data4    0x53928BDA, 0x3FD97B35, 0x238613D9, 0x00000000
-data4    0xEB9DE4DE, 0x3FDA0FF6, 0x22859FA7, 0x00000000
-data4    0x99ECF92D, 0x3FDAA5AB, 0x237A6D06, 0x00000000
-data4    0x6D8F1796, 0x3FDB3C5A, 0x23952F6C, 0x00000000
-data4    0x9CFB8BE4, 0x3FDBD40A, 0x2280FC95, 0x00000000
-data4    0x87943100, 0x3FDC6CC3, 0x245D2EC0, 0x00000000
-data4    0xB736C500, 0x3FDD068C, 0x23C4AD7D, 0x00000000
-data4    0xE1DDBC31, 0x3FDDA16D, 0x23D076E6, 0x00000000
-data4    0xEB515A93, 0x3FDE3D6E, 0x244809A6, 0x00000000
-data4    0xE6E9E5F1, 0x3FDEDA97, 0x220856C8, 0x00000000
-data4    0x1963CE69, 0x3FDF78F1, 0x244BE993, 0x00000000
-data4    0x7D635BCE, 0x3FE00C41, 0x23D21799, 0x00000000
-data4    0x1C302CD3, 0x3FE05CAB, 0x248A1B1D, 0x00000000
-data4    0xDB6A1FA0, 0x3FE0ADB9, 0x23D53E33, 0x00000000
-data4    0x4A20BA81, 0x3FE0FF72, 0x24DB9ED5, 0x00000000
-data4    0x153FA6F5, 0x3FE151D9, 0x24E9E451, 0x00000000
+data8 0x3FD09BC362400794
+data4 0x23A05C32, 0x00000000
+data8 0x3FD124A9DFFBC074
+data4 0x240078B2, 0x00000000
+data8 0x3FD1AE235BD4920F
+data4 0x23826B8E, 0x00000000
+data8 0x3FD2383515E2701D
+data4 0x22D31154, 0x00000000
+data8 0x3FD2C2E463739C2D
+data4 0x2265C9E2, 0x00000000
+data8 0x3FD34E36AFEEA48B
+data4 0x245C05EB, 0x00000000
+data8 0x3FD3DA317DBB35D1
+data4 0x24749F2D, 0x00000000
+data8 0x3FD466DA67321619
+data4 0x2462CECE, 0x00000000
+data8 0x3FD4F4371F94A4D5
+data4 0x246D0DF1, 0x00000000
+data8 0x3FD5824D740C3E6D
+data4 0x240A85B5, 0x00000000
+data8 0x3FD611234CB1E73D
+data4 0x23F96E33, 0x00000000
+data8 0x3FD6A0BEAD9EA64B
+data4 0x247C5393, 0x00000000
+data8 0x3FD73125B804FD01
+data4 0x241F3B29, 0x00000000
+data8 0x3FD7C25EAB53EE83
+data4 0x2479989B, 0x00000000
+data8 0x3FD8546FE6640EED
+data4 0x23B343BC, 0x00000000
+data8 0x3FD8E75FE8AF1892
+data4 0x241454D1, 0x00000000
+data8 0x3FD97B3553928BDA
+data4 0x238613D9, 0x00000000
+data8 0x3FDA0FF6EB9DE4DE
+data4 0x22859FA7, 0x00000000
+data8 0x3FDAA5AB99ECF92D
+data4 0x237A6D06, 0x00000000
+data8 0x3FDB3C5A6D8F1796
+data4 0x23952F6C, 0x00000000
+data8 0x3FDBD40A9CFB8BE4
+data4 0x2280FC95, 0x00000000
+data8 0x3FDC6CC387943100
+data4 0x245D2EC0, 0x00000000
+data8 0x3FDD068CB736C500
+data4 0x23C4AD7D, 0x00000000
+data8 0x3FDDA16DE1DDBC31
+data4 0x23D076E6, 0x00000000
+data8 0x3FDE3D6EEB515A93
+data4 0x244809A6, 0x00000000
+data8 0x3FDEDA97E6E9E5F1
+data4 0x220856C8, 0x00000000
+data8 0x3FDF78F11963CE69
+data4 0x244BE993, 0x00000000
+data8 0x3FE00C417D635BCE
+data4 0x23D21799, 0x00000000
+data8 0x3FE05CAB1C302CD3
+data4 0x248A1B1D, 0x00000000
+data8 0x3FE0ADB9DB6A1FA0
+data4 0x23D53E33, 0x00000000
+data8 0x3FE0FF724A20BA81
+data4 0x24DB9ED5, 0x00000000
+data8 0x3FE151D9153FA6F5
+data4 0x24E9E451, 0x00000000
+LOCAL_OBJECT_END(tanl_table_tm2)
+
+LOCAL_OBJECT_START(tanl_table_tm1)
 //
 //  Entries T_hi   double-precision memory format
 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 //  Entries T_lo  single-precision memory format
 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 //
-data4    0xBA1BE39E, 0x3FE1CEC4, 0x24B60F9E, 0x00000000
-data4    0x5ABD9B2D, 0x3FE277E4, 0x248C2474, 0x00000000
-data4    0x0272B110, 0x3FE32418, 0x247B8311, 0x00000000
-data4    0x890E2DF0, 0x3FE3D38B, 0x24C55751, 0x00000000
-data4    0x46236871, 0x3FE4866D, 0x24E5BC34, 0x00000000
-data4    0x45E044B0, 0x3FE53CEE, 0x24001BA4, 0x00000000
-data4    0x82EC06E4, 0x3FE5F742, 0x24B973DC, 0x00000000
-data4    0x25DF43F9, 0x3FE6B5A1, 0x24895440, 0x00000000
-data4    0xCAFD348C, 0x3FE77844, 0x240021CA, 0x00000000
-data4    0xCEED6B92, 0x3FE83F6B, 0x24C45372, 0x00000000
-data4    0xA34F3665, 0x3FE90B58, 0x240DAD33, 0x00000000
-data4    0x2C1E56B4, 0x3FE9DC52, 0x24F846CE, 0x00000000
-data4    0x27041578, 0x3FEAB2A4, 0x2323FB6E, 0x00000000
-data4    0x9DD8C373, 0x3FEB8E9F, 0x24B3090B, 0x00000000
-data4    0x65C9AA7B, 0x3FEC709B, 0x2449F611, 0x00000000
-data4    0xACCF8435, 0x3FED58F4, 0x23616A7E, 0x00000000
-data4    0x97635082, 0x3FEE480F, 0x24C2FEAE, 0x00000000
-data4    0xF0ACC544, 0x3FEF3E57, 0x242CE964, 0x00000000
-data4    0xF7E06E4B, 0x3FF01E20, 0x2480D3EE, 0x00000000
-data4    0x8A798A69, 0x3FF0A125, 0x24DB8967, 0x00000000
+data8 0x3FE1CEC4BA1BE39E
+data4 0x24B60F9E, 0x00000000
+data8 0x3FE277E45ABD9B2D
+data4 0x248C2474, 0x00000000
+data8 0x3FE324180272B110
+data4 0x247B8311, 0x00000000
+data8 0x3FE3D38B890E2DF0
+data4 0x24C55751, 0x00000000
+data8 0x3FE4866D46236871
+data4 0x24E5BC34, 0x00000000
+data8 0x3FE53CEE45E044B0
+data4 0x24001BA4, 0x00000000
+data8 0x3FE5F74282EC06E4
+data4 0x24B973DC, 0x00000000
+data8 0x3FE6B5A125DF43F9
+data4 0x24895440, 0x00000000
+data8 0x3FE77844CAFD348C
+data4 0x240021CA, 0x00000000
+data8 0x3FE83F6BCEED6B92
+data4 0x24C45372, 0x00000000
+data8 0x3FE90B58A34F3665
+data4 0x240DAD33, 0x00000000
+data8 0x3FE9DC522C1E56B4
+data4 0x24F846CE, 0x00000000
+data8 0x3FEAB2A427041578
+data4 0x2323FB6E, 0x00000000
+data8 0x3FEB8E9F9DD8C373
+data4 0x24B3090B, 0x00000000
+data8 0x3FEC709B65C9AA7B
+data4 0x2449F611, 0x00000000
+data8 0x3FED58F4ACCF8435
+data4 0x23616A7E, 0x00000000
+data8 0x3FEE480F97635082
+data4 0x24C2FEAE, 0x00000000
+data8 0x3FEF3E57F0ACC544
+data4 0x242CE964, 0x00000000
+data8 0x3FF01E20F7E06E4B
+data4 0x2480D3EE, 0x00000000
+data8 0x3FF0A1258A798A69
+data4 0x24DB8967, 0x00000000
+LOCAL_OBJECT_END(tanl_table_tm1)
+
+LOCAL_OBJECT_START(tanl_table_cm2)
 //
 //  Entries C_hi   double-precision memory format
 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 //  Entries C_lo  single-precision memory format
 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 //
-data4    0xE63EFBD0, 0x400ED3E2, 0x259D94D4, 0x00000000
-data4    0xC515DAB5, 0x400DDDB4, 0x245F0537, 0x00000000
-data4    0xBE19A79F, 0x400CF57A, 0x25D4EA9F, 0x00000000
-data4    0xD15298ED, 0x400C1A06, 0x24AE40A0, 0x00000000
-data4    0x164B2708, 0x400B4A4C, 0x25A5AAB6, 0x00000000
-data4    0x5285B068, 0x400A855A, 0x25524F18, 0x00000000
-data4    0x3FFA549F, 0x4009CA5A, 0x24C999C0, 0x00000000
-data4    0x646AF623, 0x4009188A, 0x254FD801, 0x00000000
-data4    0x6084D0E7, 0x40086F3C, 0x2560F5FD, 0x00000000
-data4    0xA29A76EE, 0x4007CDD2, 0x255B9D19, 0x00000000
-data4    0x6C8ECA95, 0x400733BE, 0x25CB021B, 0x00000000
-data4    0x1F8DDC52, 0x4006A07E, 0x24AB4722, 0x00000000
-data4    0xC298AD58, 0x4006139B, 0x252764E2, 0x00000000
-data4    0xBAD7164B, 0x40058CAB, 0x24DAF5DB, 0x00000000
-data4    0xAE31A5D3, 0x40050B4B, 0x25EA20F4, 0x00000000
-data4    0x89F85A8A, 0x40048F21, 0x2583A3E8, 0x00000000
-data4    0xA862380D, 0x400417DA, 0x25DCC4CC, 0x00000000
-data4    0x1088FCFE, 0x4003A52B, 0x2430A492, 0x00000000
-data4    0xCD3527D5, 0x400336CC, 0x255F77CF, 0x00000000
-data4    0x5760766D, 0x4002CC7F, 0x25DA0BDA, 0x00000000
-data4    0x11CE02E3, 0x40026607, 0x256FF4A2, 0x00000000
-data4    0xD37BBE04, 0x4002032C, 0x25208AED, 0x00000000
-data4    0x7F050775, 0x4001A3BD, 0x24B72DD6, 0x00000000
-data4    0xA554848A, 0x40014789, 0x24AB4DAA, 0x00000000
-data4    0x323E81B7, 0x4000EE65, 0x2584C440, 0x00000000
-data4    0x21CF1293, 0x40009827, 0x25C9428D, 0x00000000
-data4    0x3D415EEB, 0x400044A9, 0x25DC8482, 0x00000000
-data4    0xBD72C577, 0x3FFFE78F, 0x257F5070, 0x00000000
-data4    0x75EFD28E, 0x3FFF4AC3, 0x23EBBF7A, 0x00000000
-data4    0x60B52DDE, 0x3FFEB2AF, 0x22EECA07, 0x00000000
-data4    0x35204180, 0x3FFE1F19, 0x24191079, 0x00000000
-data4    0x54F7E60A, 0x3FFD8FCA, 0x248D3058, 0x00000000
+data8 0x400ED3E2E63EFBD0
+data4 0x259D94D4, 0x00000000
+data8 0x400DDDB4C515DAB5
+data4 0x245F0537, 0x00000000
+data8 0x400CF57ABE19A79F
+data4 0x25D4EA9F, 0x00000000
+data8 0x400C1A06D15298ED
+data4 0x24AE40A0, 0x00000000
+data8 0x400B4A4C164B2708
+data4 0x25A5AAB6, 0x00000000
+data8 0x400A855A5285B068
+data4 0x25524F18, 0x00000000
+data8 0x4009CA5A3FFA549F
+data4 0x24C999C0, 0x00000000
+data8 0x4009188A646AF623
+data4 0x254FD801, 0x00000000
+data8 0x40086F3C6084D0E7
+data4 0x2560F5FD, 0x00000000
+data8 0x4007CDD2A29A76EE
+data4 0x255B9D19, 0x00000000
+data8 0x400733BE6C8ECA95
+data4 0x25CB021B, 0x00000000
+data8 0x4006A07E1F8DDC52
+data4 0x24AB4722, 0x00000000
+data8 0x4006139BC298AD58
+data4 0x252764E2, 0x00000000
+data8 0x40058CABBAD7164B
+data4 0x24DAF5DB, 0x00000000
+data8 0x40050B4BAE31A5D3
+data4 0x25EA20F4, 0x00000000
+data8 0x40048F2189F85A8A
+data4 0x2583A3E8, 0x00000000
+data8 0x400417DAA862380D
+data4 0x25DCC4CC, 0x00000000
+data8 0x4003A52B1088FCFE
+data4 0x2430A492, 0x00000000
+data8 0x400336CCCD3527D5
+data4 0x255F77CF, 0x00000000
+data8 0x4002CC7F5760766D
+data4 0x25DA0BDA, 0x00000000
+data8 0x4002660711CE02E3
+data4 0x256FF4A2, 0x00000000
+data8 0x4002032CD37BBE04
+data4 0x25208AED, 0x00000000
+data8 0x4001A3BD7F050775
+data4 0x24B72DD6, 0x00000000
+data8 0x40014789A554848A
+data4 0x24AB4DAA, 0x00000000
+data8 0x4000EE65323E81B7
+data4 0x2584C440, 0x00000000
+data8 0x4000982721CF1293
+data4 0x25C9428D, 0x00000000
+data8 0x400044A93D415EEB
+data4 0x25DC8482, 0x00000000
+data8 0x3FFFE78FBD72C577
+data4 0x257F5070, 0x00000000
+data8 0x3FFF4AC375EFD28E
+data4 0x23EBBF7A, 0x00000000
+data8 0x3FFEB2AF60B52DDE
+data4 0x22EECA07, 0x00000000
+data8 0x3FFE1F1935204180
+data4 0x24191079, 0x00000000
+data8 0x3FFD8FCA54F7E60A
+data4 0x248D3058, 0x00000000
+LOCAL_OBJECT_END(tanl_table_cm2)
+
+LOCAL_OBJECT_START(tanl_table_cm1)
 //
 //  Entries C_hi   double-precision memory format
 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 //  Entries C_lo  single-precision memory format
 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 //
-data4    0x79F6FADE, 0x3FFCC06A, 0x239C7886, 0x00000000
-data4    0x891662A6, 0x3FFBB91F, 0x250BD191, 0x00000000
-data4    0x529F155D, 0x3FFABFB6, 0x256CC3E6, 0x00000000
-data4    0x2E964AE9, 0x3FF9D300, 0x250843E3, 0x00000000
-data4    0x89DCB383, 0x3FF8F1EF, 0x2277C87E, 0x00000000
-data4    0x7C87DBD6, 0x3FF81B93, 0x256DA6CF, 0x00000000
-data4    0x1042EDE4, 0x3FF74F14, 0x2573D28A, 0x00000000
-data4    0x1784B360, 0x3FF68BAF, 0x242E489A, 0x00000000
-data4    0x7C923C4C, 0x3FF5D0B5, 0x2532D940, 0x00000000
-data4    0xF418EF20, 0x3FF51D88, 0x253C7DD6, 0x00000000
-data4    0x02F88DAE, 0x3FF4719A, 0x23DB59BF, 0x00000000
-data4    0x49DA0788, 0x3FF3CC66, 0x252B4756, 0x00000000
-data4    0x0B980DB8, 0x3FF32D77, 0x23FE585F, 0x00000000
-data4    0xE56C987A, 0x3FF2945F, 0x25378A63, 0x00000000
-data4    0xB16523F6, 0x3FF200BD, 0x247BB2E0, 0x00000000
-data4    0x8CE27778, 0x3FF17235, 0x24446538, 0x00000000
-data4    0xFDEFE692, 0x3FF0E873, 0x2514638F, 0x00000000
-data4    0x33154062, 0x3FF0632C, 0x24A7FC27, 0x00000000
-data4    0xB3EF115F, 0x3FEFC42E, 0x248FD0FE, 0x00000000
-data4    0x135D26F6, 0x3FEEC9E8, 0x2385C719, 0x00000000
+data8 0x3FFCC06A79F6FADE
+data4 0x239C7886, 0x00000000
+data8 0x3FFBB91F891662A6
+data4 0x250BD191, 0x00000000
+data8 0x3FFABFB6529F155D
+data4 0x256CC3E6, 0x00000000
+data8 0x3FF9D3002E964AE9
+data4 0x250843E3, 0x00000000
+data8 0x3FF8F1EF89DCB383
+data4 0x2277C87E, 0x00000000
+data8 0x3FF81B937C87DBD6
+data4 0x256DA6CF, 0x00000000
+data8 0x3FF74F141042EDE4
+data4 0x2573D28A, 0x00000000
+data8 0x3FF68BAF1784B360
+data4 0x242E489A, 0x00000000
+data8 0x3FF5D0B57C923C4C
+data4 0x2532D940, 0x00000000
+data8 0x3FF51D88F418EF20
+data4 0x253C7DD6, 0x00000000
+data8 0x3FF4719A02F88DAE
+data4 0x23DB59BF, 0x00000000
+data8 0x3FF3CC6649DA0788
+data4 0x252B4756, 0x00000000
+data8 0x3FF32D770B980DB8
+data4 0x23FE585F, 0x00000000
+data8 0x3FF2945FE56C987A
+data4 0x25378A63, 0x00000000
+data8 0x3FF200BDB16523F6
+data4 0x247BB2E0, 0x00000000
+data8 0x3FF172358CE27778
+data4 0x24446538, 0x00000000
+data8 0x3FF0E873FDEFE692
+data4 0x2514638F, 0x00000000
+data8 0x3FF0632C33154062
+data4 0x24A7FC27, 0x00000000
+data8 0x3FEFC42EB3EF115F
+data4 0x248FD0FE, 0x00000000
+data8 0x3FEEC9E8135D26F6
+data4 0x2385C719, 0x00000000
+LOCAL_OBJECT_END(tanl_table_cm1)
+
+LOCAL_OBJECT_START(tanl_table_scim2)
 //
 //  Entries SC_inv in Swapped IEEE format (extended)
 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 //
-data4    0x1BF30C9E, 0x839D6D4A, 0x00004001, 0x00000000
-data4    0x554B0EB0, 0x80092804, 0x00004001, 0x00000000
-data4    0xA1CF0DE9, 0xF959F94C, 0x00004000, 0x00000000
-data4    0x77378677, 0xF3086BA0, 0x00004000, 0x00000000
-data4    0xCCD4723C, 0xED154515, 0x00004000, 0x00000000
-data4    0x1C27CF25, 0xE7790944, 0x00004000, 0x00000000
-data4    0x8DDACB88, 0xE22D037D, 0x00004000, 0x00000000
-data4    0x89C73522, 0xDD2B2D8A, 0x00004000, 0x00000000
-data4    0xBB2C1171, 0xD86E1A23, 0x00004000, 0x00000000
-data4    0xDFF5E0F9, 0xD3F0E288, 0x00004000, 0x00000000
-data4    0x283BEBD5, 0xCFAF16B1, 0x00004000, 0x00000000
-data4    0x0D88DD53, 0xCBA4AFAA, 0x00004000, 0x00000000
-data4    0xCA67C43D, 0xC7CE03CC, 0x00004000, 0x00000000
-data4    0x0CA0DDB0, 0xC427BC82, 0x00004000, 0x00000000
-data4    0xF13D8CAB, 0xC0AECD57, 0x00004000, 0x00000000
-data4    0x71ECE6B1, 0xBD606C38, 0x00004000, 0x00000000
-data4    0xA44C4929, 0xBA3A0A96, 0x00004000, 0x00000000
-data4    0xE5CCCEC1, 0xB7394F6F, 0x00004000, 0x00000000
-data4    0x9637D8BC, 0xB45C1203, 0x00004000, 0x00000000
-data4    0x92CB051B, 0xB1A05528, 0x00004000, 0x00000000
-data4    0x6BA2FFD0, 0xAF04432B, 0x00004000, 0x00000000
-data4    0x7221235F, 0xAC862A23, 0x00004000, 0x00000000
-data4    0x5F00A9D1, 0xAA2478AF, 0x00004000, 0x00000000
-data4    0x81E082BF, 0xA7DDBB0C, 0x00004000, 0x00000000
-data4    0x45684FEE, 0xA5B0987D, 0x00004000, 0x00000000
-data4    0x627A8F53, 0xA39BD0F5, 0x00004000, 0x00000000
-data4    0x6EC5C8B0, 0xA19E3B03, 0x00004000, 0x00000000
-data4    0x91CD7C66, 0x9FB6C1F0, 0x00004000, 0x00000000
-data4    0x1FA3DF8A, 0x9DE46410, 0x00004000, 0x00000000
-data4    0xA8F6B888, 0x9C263139, 0x00004000, 0x00000000
-data4    0xC27B0450, 0x9A7B4968, 0x00004000, 0x00000000
-data4    0x5EE614EE, 0x98E2DB7E, 0x00004000, 0x00000000
+data8    0x839D6D4A1BF30C9E, 0x00004001
+data8    0x80092804554B0EB0, 0x00004001
+data8    0xF959F94CA1CF0DE9, 0x00004000
+data8    0xF3086BA077378677, 0x00004000
+data8    0xED154515CCD4723C, 0x00004000
+data8    0xE77909441C27CF25, 0x00004000
+data8    0xE22D037D8DDACB88, 0x00004000
+data8    0xDD2B2D8A89C73522, 0x00004000
+data8    0xD86E1A23BB2C1171, 0x00004000
+data8    0xD3F0E288DFF5E0F9, 0x00004000
+data8    0xCFAF16B1283BEBD5, 0x00004000
+data8    0xCBA4AFAA0D88DD53, 0x00004000
+data8    0xC7CE03CCCA67C43D, 0x00004000
+data8    0xC427BC820CA0DDB0, 0x00004000
+data8    0xC0AECD57F13D8CAB, 0x00004000
+data8    0xBD606C3871ECE6B1, 0x00004000
+data8    0xBA3A0A96A44C4929, 0x00004000
+data8    0xB7394F6FE5CCCEC1, 0x00004000
+data8    0xB45C12039637D8BC, 0x00004000
+data8    0xB1A0552892CB051B, 0x00004000
+data8    0xAF04432B6BA2FFD0, 0x00004000
+data8    0xAC862A237221235F, 0x00004000
+data8    0xAA2478AF5F00A9D1, 0x00004000
+data8    0xA7DDBB0C81E082BF, 0x00004000
+data8    0xA5B0987D45684FEE, 0x00004000
+data8    0xA39BD0F5627A8F53, 0x00004000
+data8    0xA19E3B036EC5C8B0, 0x00004000
+data8    0x9FB6C1F091CD7C66, 0x00004000
+data8    0x9DE464101FA3DF8A, 0x00004000
+data8    0x9C263139A8F6B888, 0x00004000
+data8    0x9A7B4968C27B0450, 0x00004000
+data8    0x98E2DB7E5EE614EE, 0x00004000
+LOCAL_OBJECT_END(tanl_table_scim2)
+
+LOCAL_OBJECT_START(tanl_table_scim1)
 //
 //  Entries SC_inv in Swapped IEEE format (extended)
 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 //
-data4    0x13B2B5BA, 0x969F335C, 0x00004000, 0x00000000
-data4    0xD4C0F548, 0x93D446D9, 0x00004000, 0x00000000
-data4    0x61B798AF, 0x9147094F, 0x00004000, 0x00000000
-data4    0x758787AC, 0x8EF317CC, 0x00004000, 0x00000000
-data4    0xB99EEFDB, 0x8CD498B3, 0x00004000, 0x00000000
-data4    0xDFF8BC37, 0x8AE82A7D, 0x00004000, 0x00000000
-data4    0xE3C55D42, 0x892AD546, 0x00004000, 0x00000000
-data4    0xD15573C1, 0x8799FEA9, 0x00004000, 0x00000000
-data4    0x435A4B4C, 0x86335F88, 0x00004000, 0x00000000
-data4    0x3E93A87B, 0x84F4FB6E, 0x00004000, 0x00000000
-data4    0x80A382FB, 0x83DD1952, 0x00004000, 0x00000000
-data4    0xA4CB8C9E, 0x82EA3D7F, 0x00004000, 0x00000000
-data4    0x6861D0A8, 0x821B247C, 0x00004000, 0x00000000
-data4    0x63E8D244, 0x816EBED1, 0x00004000, 0x00000000
-data4    0x27E4CFC6, 0x80E42D91, 0x00004000, 0x00000000
-data4    0x28E64AFD, 0x807ABF8D, 0x00004000, 0x00000000
-data4    0x863B4FD8, 0x8031EF26, 0x00004000, 0x00000000
-data4    0xAE8C11FD, 0x800960AD, 0x00004000, 0x00000000
-data4    0x5FDBEC21, 0x8000E147, 0x00004000, 0x00000000
-data4    0xA07791FA, 0x80186650, 0x00004000, 0x00000000
-ASM_SIZE_DIRECTIVE(TANL_BASE_CONSTANTS)
-
-Arg                 = f8   
+data8    0x969F335C13B2B5BA, 0x00004000
+data8    0x93D446D9D4C0F548, 0x00004000
+data8    0x9147094F61B798AF, 0x00004000
+data8    0x8EF317CC758787AC, 0x00004000
+data8    0x8CD498B3B99EEFDB, 0x00004000
+data8    0x8AE82A7DDFF8BC37, 0x00004000
+data8    0x892AD546E3C55D42, 0x00004000
+data8    0x8799FEA9D15573C1, 0x00004000
+data8    0x86335F88435A4B4C, 0x00004000
+data8    0x84F4FB6E3E93A87B, 0x00004000
+data8    0x83DD195280A382FB, 0x00004000
+data8    0x82EA3D7FA4CB8C9E, 0x00004000
+data8    0x821B247C6861D0A8, 0x00004000
+data8    0x816EBED163E8D244, 0x00004000
+data8    0x80E42D9127E4CFC6, 0x00004000
+data8    0x807ABF8D28E64AFD, 0x00004000
+data8    0x8031EF26863B4FD8, 0x00004000
+data8    0x800960ADAE8C11FD, 0x00004000
+data8    0x8000E1475FDBEC21, 0x00004000
+data8    0x80186650A07791FA, 0x00004000
+LOCAL_OBJECT_END(tanl_table_scim1)
+
+Arg                 = f8
+Save_Norm_Arg       = f8        // For input to reduction routine
 Result              = f8
-fp_tmp              = f9
+r                   = f8        // For output from reduction routine
+c                   = f9        // For output from reduction routine
 U_2                 = f10
-rsq                =  f11
+rsq                 = f11
 C_hi                = f12
 C_lo                = f13
 T_hi                = f14
 T_lo                = f15
 
-N_0                 = f32
 d_1                 = f33
-MPI_BY_4            = f34
+N_0                 = f34
 tail                = f35
 tanx                = f36
 Cx                  = f37
@@ -949,8 +1105,6 @@ P1_7                = f51
 P1_8                = f52
 P1_9                = f53
 
-TWO_TO_63           = f54
-NEGTWO_TO_63        = f55
 x                   = f56
 xsq                 = f57
 Tx                  = f58
@@ -966,12 +1120,10 @@ B                   = f67
 SC_inv              = f68
 Pos_r               = f69
 N_0_fix             = f70
-PI_BY_4             = f71
-NEGTWO_TO_NEG2      = f72
-TWO_TO_24           = f73
+d_2                 = f71
+PI_BY_4             = f72
 TWO_TO_NEG14        = f74
 TWO_TO_NEG33        = f75
-NEGTWO_TO_24        = f76
 NEGTWO_TO_NEG14     = f76
 NEGTWO_TO_NEG33     = f77
 two_by_PI           = f78
@@ -982,13 +1134,14 @@ P_2                 = f82
 P_3                 = f83
 s_val               = f84
 w                   = f85
-c                   = f86
-r                   = f87
+B_mask1             = f86
+B_mask2             = f87
+w2                  = f88
 A                   = f89
 a                   = f90
 t                   = f91
 U_1                 = f92
-d_2                 = f93
+NEGTWO_TO_NEG2      = f93
 TWO_TO_NEG2         = f94
 Q1_1                = f95
 Q1_2                = f96
@@ -1009,609 +1162,643 @@ V_hiabs             = f110
 V                   = f111
 Inv_P_0             = f112
 
+FR_inv_pi_2to63     = f113
+FR_rshf_2to64       = f114
+FR_2tom64           = f115
+FR_rshf             = f116
+Norm_Arg            = f117
+Abs_Arg             = f118
+TWO_TO_NEG65        = f119
+fp_tmp              = f120
+mOne                = f121
+
 GR_SAVE_B0     = r33
 GR_SAVE_GP     = r34
 GR_SAVE_PFS    = r35
-delta1         = r36
+table_base     = r36
 table_ptr1     = r37
 table_ptr2     = r38
-i_0            = r39
-i_1            = r40 
-N_fix_gr       = r41 
-N_inc          = r42 
-exp_Arg        = r43 
-exp_r          = r44 
-sig_r          = r45 
-lookup         = r46   
-table_offset   = r47 
-Create_B       = r48 
+table_ptr3     = r39
+lookup         = r40
+N_fix_gr       = r41
+GR_exp_2tom2   = r42
+GR_exp_2tom65  = r43
+exp_r          = r44
+sig_r          = r45
+bmask1         = r46
+table_offset   = r47
+bmask2         = r48
 gr_tmp         = r49
+cot_flag       = r50
+
+GR_sig_inv_pi  = r51
+GR_rshf_2to64  = r52
+GR_exp_2tom64  = r53
+GR_rshf        = r54
+GR_exp_2_to_63 = r55
+GR_exp_2_to_24 = r56
+GR_signexp_x   = r57
+GR_exp_x       = r58
+GR_exp_mask    = r59
+GR_exp_2tom14  = r60
+GR_exp_m2tom14 = r61
+GR_exp_2tom33  = r62
+GR_exp_m2tom33 = r63
+
+GR_SAVE_B0                  = r64
+GR_SAVE_PFS                 = r65
+GR_SAVE_GP                  = r66
+
+GR_Parameter_X              = r67
+GR_Parameter_Y              = r68
+GR_Parameter_RESULT         = r69
+GR_Parameter_Tag            = r70
+
 
 .section .text
-.global tanl
-.proc tanl
-tanl:
-#ifdef _LIBC
-.global __tanl
-.proc __tanl
-__tanl:
-#endif
-{ .mfi
-alloc r32 = ar.pfs, 0,17,2,0
-(p0)   fclass.m.unc  p6,p0 = Arg, 0x1E7
-      addl gr_tmp = -1,r0             
-}
-{ .mfi
-  nop.m 0
-(p0)   fclass.nm.unc  p7,p0 = Arg, 0x1FF
-  nop.i 0
+.global __libm_tanl#
+.global __libm_cotl#
+
+.proc __libm_cotl#
+__libm_cotl:
+.endp __libm_cotl#
+LOCAL_LIBM_ENTRY(cotl)
+
+{ .mlx
+      alloc r32 = ar.pfs, 0,35,4,0
+      movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi
+}
+{ .mlx
+      mov GR_exp_mask = 0x1ffff            // Exponent mask
+      movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64)
+}
+;;
+
+//     Check for NatVals, Infs, NaNs, and Zeros
+{ .mfi
+      getf.exp GR_signexp_x = Arg          // Get sign and exponent of x
+      fclass.m  p6,p0 = Arg, 0x1E7         // Test for natval, nan, inf, zero
+      mov cot_flag = 0x1
+}
+{ .mfb
+      addl table_base = @ltoff(TANL_BASE_CONSTANTS), gp // Pointer to table ptr
+      fnorm.s1 Norm_Arg = Arg              // Normalize x
+      br.cond.sptk COMMON_PATH
 };;
 
+LOCAL_LIBM_END(cotl)
+
+
+.proc __libm_tanl#
+__libm_tanl:
+.endp __libm_tanl#
+GLOBAL_IEEE754_ENTRY(tanl)
+
+{ .mlx
+      alloc r32 = ar.pfs, 0,35,4,0
+      movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi
+}
+{ .mlx
+      mov GR_exp_mask = 0x1ffff            // Exponent mask
+      movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64)
+}
+;;
+
+//     Check for NatVals, Infs, NaNs, and Zeros
 { .mfi
-(p0)  addl           table_ptr1   = @ltoff(TANL_BASE_CONSTANTS), gp
-	nop.f 999
+      getf.exp GR_signexp_x = Arg          // Get sign and exponent of x
+      fclass.m  p6,p0 = Arg, 0x1E7         // Test for natval, nan, inf, zero
+      mov cot_flag = 0x0
+}
+{ .mfi
+      addl table_base = @ltoff(TANL_BASE_CONSTANTS), gp // Pointer to table ptr
+      fnorm.s1 Norm_Arg = Arg              // Normalize x
       nop.i 0
+};;
+
+// Common path for both tanl and cotl
+COMMON_PATH:
+{ .mfi
+      setf.sig FR_inv_pi_2to63 = GR_sig_inv_pi // Form 1/pi * 2^63
+      fclass.m p9, p0 = Arg, 0x0b          // Test x denormal
+      mov GR_exp_2tom64 = 0xffff - 64      // Scaling constant to compute N
+}
+{ .mlx
+      setf.d FR_rshf_2to64 = GR_rshf_2to64 // Form const 1.1000 * 2^(63+64)
+      movl GR_rshf = 0x43e8000000000000    // Form const 1.1000 * 2^63
 }
 ;;
-{ .mmi
-(p0)  ld8 table_ptr1 = [table_ptr1]
-      setf.sig fp_tmp = gr_tmp   // Make a constant so fmpy produces inexact
-      nop.i 999
+
+// Check for everything - if false, then must be pseudo-zero or pseudo-nan.
+// Branch out to deal with special values.
+{ .mfi
+      addl gr_tmp = -1,r0
+      fclass.nm  p7,p0 = Arg, 0x1FF        // Test x unsupported
+      mov GR_exp_2_to_63 = 0xffff + 63     // Exponent of 2^63
+}
+{ .mfb
+      ld8 table_base = [table_base]        // Get pointer to constant table
+      fms.s1 mOne = f0, f0, f1
+(p6)  br.cond.spnt TANL_SPECIAL            // Branch if x natval, nan, inf, zero
 }
 ;;
 
-//
-//     Check for NatVals, Infs , NaNs, and Zeros 
-//     Check for everything - if false, then must be pseudo-zero
-//     or pseudo-nan.
-//     Local table pointer
-//
-{ .mbb
-(p0)   add table_ptr2 = 96, table_ptr1
-(p6)   br.cond.spnt L(TANL_SPECIAL) 
-(p7)   br.cond.spnt L(TANL_SPECIAL) ;;
+{ .mmb
+      setf.sig fp_tmp = gr_tmp   // Make a constant so fmpy produces inexact
+      mov GR_exp_2_to_24 = 0xffff + 24     // Exponent of 2^24
+(p9)  br.cond.spnt TANL_DENORMAL           // Branch if x denormal
 }
+;;
+
+TANL_COMMON:
+// Return to here if x denormal
 //
-//     Point to Inv_P_0
-//     Branch out to deal with unsupporteds and special values. 
-//
-{ .mmf
-(p0)   ldfs TWO_TO_24 = [table_ptr1],4
-(p0)   ldfs TWO_TO_63 = [table_ptr2],4
-//
-//     Load -2**24, load -2**63.
-//
-(p0)   fcmp.eq.s0 p0, p6 = Arg, f1 ;;
-}
+// Do fcmp to generate Denormal exception
+//  - can't do FNORM (will generate Underflow when U is unmasked!)
+// Branch out to deal with unsupported values.
 { .mfi
-(p0)   ldfs NEGTWO_TO_63 = [table_ptr2],12
-(p0)   fnorm.s1     Arg = Arg
-	nop.i 999
+      setf.exp FR_2tom64 = GR_exp_2tom64 // Form 2^-64 for scaling N_float
+      fcmp.eq.s0 p0, p6 = Arg, f1        // Dummy to flag denormals
+      add table_ptr1 = 0, table_base     // Point to tanl_table_1
 }
-//
-//     Load 2**24, Load 2**63.
-//
-{ .mmi
-(p0)   ldfs NEGTWO_TO_24 = [table_ptr1],12 ;;
-//
-//     Do fcmp to generate Denormal exception 
-//     - can't do FNORM (will generate Underflow when U is unmasked!)
-//     Normalize input argument.
-//
-(p0)   ldfe two_by_PI = [table_ptr1],16
-	nop.i 999
+{ .mib
+      setf.d FR_rshf = GR_rshf           // Form right shift const 1.1000 * 2^63
+      add table_ptr2 = 80, table_base    // Point to tanl_table_2
+(p7)  br.cond.spnt TANL_UNSUPPORTED      // Branch if x unsupported type
 }
-{ .mmi
-(p0)   ldfe Inv_P_0 = [table_ptr2],16 ;;
-(p0)   ldfe d_1 = [table_ptr2],16
-	nop.i 999
+;;
+
+{ .mfi
+      and GR_exp_x = GR_exp_mask, GR_signexp_x // Get exponent of x
+      fmpy.s1 Save_Norm_Arg = Norm_Arg, f1     // Save x if large arg reduction
+      dep.z bmask1 = 0x7c, 56, 8               // Form mask to get 5 msb of r
+                                               // bmask1 = 0x7c00000000000000
 }
+;;
+
 //
 //     Decide about the paths to take:
-//     PR_1 and PR_3 set if -2**24 < Arg < 2**24 - CASE 1 OR 2
-//     OTHERWISE - CASE 3 OR 4
-//     Load inverse of P_0 .
-//     Set PR_6 if Arg <= -2**63
-//     Are there any Infs, NaNs, or zeros?
+//     Set PR_6 if |Arg| >= 2**63
+//     Set PR_8 if |Arg| >= 2**24 - CASE 3 OR 4
+//     OTHERWISE (PR_8 clear) - CASE 1 OR 2
 //
-{ .mmi
-(p0)   ldfe P_0 = [table_ptr1],16 ;;
-(p0)   ldfe d_2 = [table_ptr2],16
-	nop.i 999
+//     Branch out if the magnitude of the input argument is >= 2^63
+//     - do this branch before the next.
+{ .mfi
+      ldfe two_by_PI = [table_ptr1],16        // Load 2/pi
+      nop.f 999
+      dep.z bmask2 = 0x41, 57, 7              // Form mask to OR to produce B
+                                              // bmask2 = 0x8200000000000000
 }
-//
-//     Set PR_8 if Arg <= -2**24
-//     Set PR_6 if Arg >=  2**63
-//
-{ .mmi
-(p0)   ldfe P_1 = [table_ptr1],16 ;;
-(p0)   ldfe PI_BY_4 = [table_ptr2],16
-	nop.i 999
+{ .mib
+      ldfe PI_BY_4 = [table_ptr2],16          // Load pi/4
+      cmp.ge p6,p0 = GR_exp_x, GR_exp_2_to_63 // Is |x| >= 2^63
+(p6)  br.cond.spnt TANL_ARG_TOO_LARGE         // Branch if |x| >= 2^63
 }
-//
-//     Set PR_8 if Arg >= 2**24
-//
+;;
+
 { .mmi
-(p0)   ldfe P_2 = [table_ptr1],16 ;;
-(p0)   ldfe   MPI_BY_4 = [table_ptr2],16
-	nop.i 999
-}
-//
-//     Load  P_2 and PI_BY_4
-//
-{ .mfi
-(p0)   ldfe   P_3 = [table_ptr1],16
-	nop.f 999
-	nop.i 999 ;;
-}
-{ .mfi
-	nop.m 999
-(p0)   fcmp.le.unc.s1 p6,p7 = Arg,NEGTWO_TO_63
-	nop.i 999
+      ldfe P_0 = [table_ptr1],16              // Load P_0
+      ldfe Inv_P_0 = [table_ptr2],16          // Load Inv_P_0
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p0)   fcmp.le.unc.s1 p8,p9 = Arg,NEGTWO_TO_24
-	nop.i 999 ;;
+      ldfe P_1 = [table_ptr1],16              // Load P_1
+      fmerge.s Abs_Arg = f0, Norm_Arg         // Get |x|
+      mov GR_exp_m2tom33 = 0x2ffff - 33       // Form signexp of -2^-33
 }
 { .mfi
-	nop.m 999
-(p7)   fcmp.ge.s1 p6,p0 = Arg,TWO_TO_63
-	nop.i 999
+      ldfe d_1 = [table_ptr2],16              // Load d_1 for 2^24 <= |x| < 2^63
+      nop.f 999
+      mov GR_exp_2tom33 = 0xffff - 33         // Form signexp of 2^-33
 }
-{ .mfi
-	nop.m 999
-(p9)   fcmp.ge.s1 p8,p0 = Arg,TWO_TO_24
-	nop.i 999 ;;
+;;
+
+{ .mmi
+      ldfe P_2 = [table_ptr1],16              // Load P_2
+      ldfe d_2 = [table_ptr2],16              // Load d_2 for 2^24 <= |x| < 2^63
+      cmp.ge p8,p0 = GR_exp_x, GR_exp_2_to_24 // Is |x| >= 2^24
 }
-{ .mib
-	nop.m 999
-	nop.i 999
-//
-//     Load  P_3 and -PI_BY_4
-//
-(p6)   br.cond.spnt L(TANL_ARG_TOO_LARGE) ;;
+;;
+
+// Use special scaling to right shift so N=Arg * 2/pi is in rightmost bits
+// Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24
+{ .mfb
+      ldfe   P_3 = [table_ptr1],16            // Load P_3
+      fma.s1      N_fix = Norm_Arg, FR_inv_pi_2to63, FR_rshf_2to64
+(p8)  br.cond.spnt TANL_LARGER_ARG            // Branch if 2^24 <= |x| < 2^63
 }
-{ .mib
-	nop.m 999
-	nop.i 999
-//
-//     Load 2**(-2).
-//     Load -2**(-2).
-//     Branch out if we have a special argument.
-//     Branch out if the magnitude of the input argument is too large
-//     - do this branch before the next.
+;;
+
+// Here if 0 < |x| < 2^24
+//     ARGUMENT REDUCTION CODE - CASE 1 and 2
 //
-(p8)   br.cond.spnt L(TANL_LARGER_ARG) ;;
+{ .mmf
+      setf.exp TWO_TO_NEG33 = GR_exp_2tom33      // Form 2^-33
+      setf.exp NEGTWO_TO_NEG33 = GR_exp_m2tom33  // Form -2^-33
+      fmerge.s r = Norm_Arg,Norm_Arg          // Assume r=x, ok if |x| < pi/4
 }
+;;
+
 //
-//     Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24
+// If |Arg| < pi/4,  set PR_8, else  pi/4 <=|Arg| < 2^24 - set PR_9.
 //
+//     Case 2: Convert integer N_fix back to normalized floating-point value.
 { .mfi
-(p0)   ldfs TWO_TO_NEG2 = [table_ptr2],4
-//     ARGUMENT REDUCTION CODE - CASE 1 and 2
-//     Load 2**(-2).
-//     Load -2**(-2).
-(p0)   fmpy.s1 N = Arg,two_by_PI
-	nop.i 999 ;;
+      getf.sig sig_r = Norm_Arg               // Get sig_r if 1/4 <= |x| < pi/4
+      fcmp.lt.s1 p8,p9= Abs_Arg,PI_BY_4       // Test |x| < pi/4
+      mov GR_exp_2tom2 = 0xffff - 2           // Form signexp of 2^-2
 }
 { .mfi
-(p0)   ldfs NEGTWO_TO_NEG2 = [table_ptr2],12
-//
-//     N = Arg * 2/pi
-//
-(p0)   fcmp.lt.unc.s1 p8,p9= Arg,PI_BY_4
-	nop.i 999 ;;
-}
-{ .mfi
-	nop.m 999
-//
-//     if Arg < pi/4,  set PR_8.
-//
-(p8)   fcmp.gt.s1 p8,p9= Arg,MPI_BY_4
-	nop.i 999 ;;
+      ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2] // Load 2^-2, -2^-2
+      fms.s1 N = N_fix, FR_2tom64, FR_rshf    // Use scaling to get N floated
+      mov N_fix_gr = r0                       // Assume N=0, ok if |x| < pi/4
 }
+;;
+
 //
 //     Case 1: Is |r| < 2**(-2).
 //     Arg is the same as r in this case.
 //     r = Arg
 //     c = 0
 //
+//     Case 2: Place integer part of N in GP register.
 { .mfi
-(p8)   mov N_fix_gr = r0
-//
-//     if Arg > -pi/4, reset PR_8.
-//     Select the case when |Arg| < pi/4 - set PR[8] = true.
-//     Else Select the case when |Arg| >= pi/4 - set PR[9] = true.
-//
-(p0)   fcvt.fx.s1 N_fix = N
-	nop.i 999 ;;
-}
-{ .mfi
-	nop.m 999
-//
-//     Grab the integer part of N .
-//
-(p8)   mov r = Arg
-	nop.i 999
-}
-{ .mfi
-	nop.m 999
-(p8)   mov c = f0
-	nop.i 999 ;;
-}
-{ .mfi
-	nop.m 999
-(p8)   fcmp.lt.unc.s1 p10, p11 = Arg, TWO_TO_NEG2
-	nop.i 999 ;;
+(p9)  getf.sig N_fix_gr = N_fix
+      fmerge.s c = f0, f0                     // Assume c=0, ok if |x| < pi/4
+      cmp.lt p10, p0 = GR_exp_x, GR_exp_2tom2 // Test if |x| < 1/4
 }
+;;
+
 { .mfi
-	nop.m 999
-(p10)  fcmp.gt.s1 p10,p0 = Arg, NEGTWO_TO_NEG2
-	nop.i 999 ;;
+      setf.sig B_mask1 = bmask1               // Form mask to get 5 msb of r
+      nop.f 999
+      mov exp_r = GR_exp_x                    // Get exp_r if 1/4 <= |x| < pi/4
 }
-{ .mfi
-	nop.m 999
-//
-//     Case 2: Place integer part of N in GP register.
-//
-(p9)   fcvt.xf N = N_fix
-	nop.i 999 ;;
-}
-{ .mib
-(p9)   getf.sig N_fix_gr = N_fix
-	nop.i 999
-//
-//     Case 2: Convert integer N_fix back to normalized floating-point value.
-//
-(p10)  br.cond.spnt L(TANL_SMALL_R) ;;
-}
-{ .mib
-	nop.m 999
-	nop.i 999
-(p8)   br.cond.sptk L(TANL_NORMAL_R) ;;
+{ .mbb
+      setf.sig B_mask2 = bmask2               // Form mask to form B from r
+(p10) br.cond.spnt TANL_SMALL_R               // Branch if 0 < |x| < 1/4
+(p8)  br.cond.spnt TANL_NORMAL_R              // Branch if 1/4 <= |x| < pi/4
 }
+;;
+
+// Here if pi/4 <= |x| < 2^24
 //
 //     Case 1: PR_3 is only affected  when PR_1 is set.
 //
-{ .mmi
-(p9)   ldfs TWO_TO_NEG33 = [table_ptr2], 4 ;;
 //
-//     Case 2: Load 2**(-33).
+//     Case 2: w = N * P_2
+//     Case 2: s_val = -N * P_1  + Arg
 //
-(p9)   ldfs NEGTWO_TO_NEG33 = [table_ptr2], 4
-	nop.i 999 ;;
+
+{ .mfi
+      nop.m 999
+      fnma.s1 s_val = N, P_1, Norm_Arg
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-//
-//     Case 2: Load -2**(-33).
-//
-(p9)   fnma.s1 s_val = N, P_1, Arg
-	nop.i 999
+      nop.m 999
+      fmpy.s1 w = N, P_2                     // w = N * P_2 for |s| >= 2^-33
+      nop.i 999
 }
+;;
+
+//     Case 2_reduce: w2 = N * P_3 (change sign)
 { .mfi
-	nop.m 999
-(p9)   fmpy.s1 w = N, P_2
-	nop.i 999 ;;
+      nop.m 999
+      fmpy.s1 w2 = N, P_3                    // w = N * P_3 for |s| < 2^-33
+      nop.i 999
 }
+;;
+
+//     Case 1_reduce: r = s - w (i.e. s + w with the sign of w changed)
 { .mfi
-	nop.m 999
-//
-//     Case 2: w = N * P_2
-//     Case 2: s_val = -N * P_1  + Arg
-//
-(p0)   fcmp.lt.unc.s1 p9,p8 = s_val, TWO_TO_NEG33
-	nop.i 999 ;;
+      nop.m 999
+      fsub.s1 r = s_val, w                   // r = s_val - w for |s| >= 2^-33
+      nop.i 999
 }
+;;
+
+//     Case 2_reduce: U_1 = N * P_2 + w2
 { .mfi
-	nop.m 999
+      nop.m 999
+      fma.s1  U_1 = N, P_2, w2              // U_1 = N * P_2 + w for |s| < 2^-33
+      nop.i 999
+}
+;;
+
 //
 //     Decide between case_1 and case_2 reduce:
+//     Case 1_reduce:  |s| >= 2**(-33)
+//     Case 2_reduce:  |s| < 2**(-33)
 //
-(p9)   fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33
-	nop.i 999 ;;
+{ .mfi
+      nop.m 999
+      fcmp.lt.s1 p9, p8 = s_val, TWO_TO_NEG33
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-//
-//     Case 1_reduce:  s <= -2**(-33) or s >= 2**(-33)
-//     Case 2_reduce: -2**(-33) < s < 2**(-33)
-//
-(p8)   fsub.s1 r = s_val, w
-	nop.i 999
+      nop.m 999
+(p9)  fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33
+      nop.i 999
 }
+;;
+
+//     Case 1_reduce: c = s - r
 { .mfi
-	nop.m 999
-(p9)   fmpy.s1 w = N, P_3
-	nop.i 999 ;;
+      nop.m 999
+      fsub.s1 c = s_val, r                     // c = s_val - r for |s| >= 2^-33
+      nop.i 999
 }
+;;
+
+//     Case 2_reduce: r is complete here - continue to calculate c .
+//     r = s - U_1
 { .mfi
-	nop.m 999
-(p9)   fma.s1  U_1 = N, P_2, w
-	nop.i 999
+      nop.m 999
+(p9)  fsub.s1 r = s_val, U_1
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
+(p9)  fms.s1 U_2 = N, P_2, U_1
+      nop.i 999
+}
+;;
+
 //
 //     Case 1_reduce: Is |r| < 2**(-2), if so set PR_10
-//     else set PR_11.
+//     else set PR_13.
 //
-(p8)   fsub.s1 c = s_val, r
-	nop.i 999 ;;
-}
+
 { .mfi
-	nop.m 999
-//
-//     Case 1_reduce: r = s + w (change sign)
-//     Case 2_reduce: w = N * P_3 (change sign)
-//
-(p8)   fcmp.lt.unc.s1 p10, p11 = r, TWO_TO_NEG2
-	nop.i 999 ;;
+      nop.m 999
+      fand B = B_mask1, r
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p10)  fcmp.gt.s1 p10, p11 = r, NEGTWO_TO_NEG2
-	nop.i 999 ;;
+      nop.m 999
+(p8)  fcmp.lt.unc.s1 p10, p13 = r, TWO_TO_NEG2
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p9)   fsub.s1 r = s_val, U_1
-	nop.i 999
+(p8)  getf.sig sig_r = r               // Get signif of r if |s| >= 2^-33
+      nop.f 999
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-//
+(p8)  getf.exp exp_r = r               // Extract signexp of r if |s| >= 2^-33
+(p10) fcmp.gt.s1 p10, p13 = r, NEGTWO_TO_NEG2
+      nop.i 999
+}
+;;
+
 //     Case 1_reduce: c is complete here.
+//     Case 1: Branch to SMALL_R or NORMAL_R.
 //     c = c + w (w has not been negated.)
-//     Case 2_reduce: r is complete here - continue to calculate c .
-//     r = s - U_1
-//
-(p9)   fms.s1 U_2 = N, P_2, U_1
-	nop.i 999 ;;
-}
 { .mfi
-	nop.m 999
+      nop.m 999
+(p8)  fsub.s1 c = c, w                         // c = c - w for |s| >= 2^-33
+      nop.i 999
+}
+{ .mbb
+      nop.m 999
+(p10) br.cond.spnt TANL_SMALL_R     // Branch if pi/4 < |x| < 2^24 and |r|<1/4
+(p13) br.cond.sptk TANL_NORMAL_R_A  // Branch if pi/4 < |x| < 2^24 and |r|>=1/4
+}
+;;
+
+
+// Here if pi/4 <= |x| < 2^24 and |s| < 2^-33
 //
-//     Case 1_reduce: c = s - r
-//     Case 2_reduce: U_1 = N * P_2 + w
+//     Is i_1 = lsb of N_fix_gr even or odd?
+//     if i_1 == 0, set p11, else set p12.
 //
-(p8)   fsub.s1 c = c, w
-	nop.i 999 ;;
-}
 { .mfi
-	nop.m 999
-(p9)   fsub.s1 s_val = s_val, r
-	nop.i 999
+      nop.m 999
+      fsub.s1 s_val = s_val, r
+      add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl)
 }
-{ .mfb
-	nop.m 999
+{ .mfi
+      nop.m 999
 //
 //     Case 2_reduce:
 //     U_2 = N * P_2 - U_1
 //     Not needed until later.
 //
-(p9)   fadd.s1 U_2 = U_2, w
+      fadd.s1 U_2 = U_2, w2
 //
 //     Case 2_reduce:
 //     s = s - r
 //     U_2 = U_2 + w
 //
-(p10)  br.cond.spnt L(TANL_SMALL_R) ;;
-}
-{ .mib
-	nop.m 999
-	nop.i 999
-(p11)  br.cond.sptk L(TANL_NORMAL_R) ;;
+      nop.i 999
 }
-{ .mii
-	nop.m 999
+;;
+
 //
 //     Case 2_reduce:
 //     c = c - U_2
 //     c is complete here
 //     Argument reduction ends here.
 //
-(p9)   extr.u i_1 = N_fix_gr, 0, 1 ;;
-(p9)   cmp.eq.unc p11, p12 = 0x0000,i_1 ;;
-}
-{ .mfi
-	nop.m 999
-//
-//     Is i_1  even or odd?
-//     if i_1 == 0, set p11, else set p12.
-//
-(p11)  fmpy.s1 rsq = r, r
-	nop.i 999 ;;
-}
 { .mfi
-	nop.m 999
-(p12)  frcpa.s1 S_hi,p0 = f1, r
-	nop.i 999
+      nop.m 999
+      fmpy.s1 rsq = r, r
+      tbit.z p11, p12 = N_fix_gr, 0 ;;    // Set p11 if N even, p12 if odd
 }
 
-
-
-//
-//     Case 1: Branch to SMALL_R or NORMAL_R.
-//     Case 1 is done now.
-//
-
 { .mfi
-(p9)   addl           table_ptr1   = @ltoff(TANL_BASE_CONSTANTS), gp
-(p9)   fsub.s1 c = s_val, U_1
-       nop.i 999 ;;
+      nop.m 999
+(p12) frcpa.s1 S_hi,p0 = f1, r
+      nop.i 999
 }
-;;
-
-{ .mmi
-(p9)  ld8 table_ptr1 = [table_ptr1]
+{ .mfi
       nop.m 999
+      fsub.s1 c = s_val, U_1
       nop.i 999
 }
 ;;
 
-
 { .mmi
-(p9)   add table_ptr1 = 224, table_ptr1 ;;
-(p9)   ldfe P1_1 = [table_ptr1],144
-	nop.i 999 ;;
+      add table_ptr1 = 160, table_base ;;  // Point to tanl_table_p1
+      ldfe P1_1 = [table_ptr1],144
+      nop.i 999 ;;
 }
 //
-//     Get [i_1] -  lsb of N_fix_gr .
 //     Load P1_1 and point to Q1_1 .
 //
 { .mfi
-(p9)   ldfe Q1_1 = [table_ptr1] , 0
+      ldfe Q1_1 = [table_ptr1]
 //
 //     N even: rsq = r * Z
 //     N odd:  S_hi = frcpa(r)
 //
-(p12)  fmerge.ns S_hi = S_hi, S_hi
-	nop.i 999
+(p12) fmerge.ns S_hi = S_hi, S_hi
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //     Case 2_reduce:
 //     c = s - U_1
 //
-(p9)   fsub.s1 c = c, U_2
-	nop.i 999 ;;
+(p9)  fsub.s1 c = c, U_2
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
-(p12)  fma.s1  poly1 = S_hi, r, f1
-	nop.i 999 ;;
+      nop.m 999
+(p12) fma.s1  poly1 = S_hi, r, f1
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //     N odd:  Change sign of S_hi
 //
-(p11)  fmpy.s1 rsq = rsq, P1_1
-	nop.i 999 ;;
+(p11) fmpy.s1 rsq = rsq, P1_1
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
-(p12)  fma.s1 S_hi = S_hi, poly1, S_hi
-	nop.i 999 ;;
+      nop.m 999
+(p12) fma.s1 S_hi = S_hi, poly1, S_hi
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //     N even: rsq = rsq * P1_1
 //     N odd:  poly1 =  1.0 +  S_hi * r    16 bits partial  account for necessary
 //
-(p11)  fma.s1 Result = r, rsq, c
-	nop.i 999 ;;
+(p11) fma.s1 Poly = r, rsq, c
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
-//     N even: Result = c  + r * rsq
+//     N even: Poly = c  + r * rsq
 //     N odd:  S_hi  = S_hi + S_hi*poly1  16 bits account for necessary
 //
-(p12)  fma.s1 poly1 = S_hi, r, f1
-	nop.i 999 ;;
+(p12) fma.s1 poly1 = S_hi, r, f1
+(p11) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
-//     N even: Result = Result + r
+//     N even: Result = Poly + r
 //     N odd:  poly1  = 1.0 + S_hi * r        32 bits partial
 //
-(p11)  fadd.s0 Result = r, Result
-	nop.i 999 ;;
+(p14) fadd.s0 Result = r, Poly             // for tanl
+      nop.i 999
+}
+{ .mfi
+      nop.m 999
+(p15) fms.s0 Result = r, mOne, Poly        // for cotl
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p12)  fma.s1  S_hi = S_hi, poly1, S_hi
-	nop.i 999 ;;
+      nop.m 999
+(p12) fma.s1  S_hi = S_hi, poly1, S_hi
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //     N even: Result1 = Result + r
 //     N odd:   S_hi  = S_hi * poly1 + S_hi   32 bits
 //
-(p12)  fma.s1 poly1 = S_hi, r, f1
-	nop.i 999 ;;
+(p12) fma.s1 poly1 = S_hi, r, f1
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //     N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
 //
-(p12)  fma.s1 S_hi = S_hi, poly1, S_hi
-	nop.i 999 ;;
+(p12) fma.s1 S_hi = S_hi, poly1, S_hi
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //     N odd:  poly1  =  S_hi * poly + 1.0    64 bits
 //
-(p12)  fma.s1 poly1 = S_hi, r, f1
-	nop.i 999 ;;
+(p12) fma.s1 poly1 = S_hi, r, f1
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //     N odd:  poly1  =  S_hi * r + 1.0
 //
-(p12)  fma.s1 poly1 = S_hi, c, poly1
-	nop.i 999 ;;
+(p12) fma.s1 poly1 = S_hi, c, poly1
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //     N odd:  poly1  =  S_hi * c + poly1
 //
-(p12)  fmpy.s1 S_lo = S_hi, poly1
-	nop.i 999 ;;
+(p12) fmpy.s1 S_lo = S_hi, poly1
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //     N odd:  S_lo  =  S_hi *  poly1
 //
-(p12)  fma.s1 S_lo = Q1_1, r, S_lo
-	nop.i 999
+(p12) fma.s1 S_lo = Q1_1, r, S_lo
+(p12) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //     N odd:  Result =  S_hi + S_lo
 //
-(p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
-	nop.i 999 ;;
+      fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
+      nop.i 999 ;;
 }
-{ .mfb
-	nop.m 999
+{ .mfi
+      nop.m 999
 //
 //     N odd:  S_lo  =  S_lo + Q1_1 * r
 //
-(p12)  fadd.s0 Result = S_hi, S_lo
-(p0)   br.ret.sptk b0 ;;
+(p14) fadd.s0 Result = S_hi, S_lo          // for tanl
+      nop.i 999
+}
+{ .mfb
+      nop.m 999
+(p15) fms.s0 Result = S_hi, mOne, S_lo     // for cotl
+      br.ret.sptk b0 ;;          // Exit for pi/4 <= |x| < 2^24 and |s| < 2^-33
 }
 
 
-L(TANL_LARGER_ARG): 
-
+TANL_LARGER_ARG:
+// Here if 2^24 <= |x| < 2^63
 //
 // ARGUMENT REDUCTION CODE - CASE 3 and 4
 //
 
-{ .mfi
-(p0)  addl           table_ptr1   = @ltoff(TANL_BASE_CONSTANTS), gp
-(p0)  fmpy.s1 N_0 = Arg, Inv_P_0 
-	nop.i 999
+{ .mmf
+      mov GR_exp_2tom14 = 0xffff - 14          // Form signexp of 2^-14
+      mov GR_exp_m2tom14 = 0x2ffff - 14        // Form signexp of -2^-14
+      fmpy.s1 N_0 = Norm_Arg, Inv_P_0
 }
 ;;
 
 { .mmi
-(p0)  ld8 table_ptr1 = [table_ptr1]
-      nop.m 999
+      setf.exp TWO_TO_NEG14 = GR_exp_2tom14    // Form 2^-14
+      setf.exp NEGTWO_TO_NEG14 = GR_exp_m2tom14// Form -2^-14
       nop.i 999
 }
 ;;
@@ -1622,661 +1809,605 @@ L(TANL_LARGER_ARG):
 //    N_0 = Arg * Inv_P_0
 //
 { .mmi
-(p0)  add table_ptr1 = 8, table_ptr1 ;;
-//
-//    Point to  2*-14
-//
-(p0)  ldfs TWO_TO_NEG14 = [table_ptr1], 4
-	nop.i 999 ;;
+      add table_ptr2 = 144, table_base ;;     // Point to 2^-2
+      ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2]
+      nop.i 999
 }
-//
-//    Load 2**(-14).
-//
-{ .mmi
-(p0)  ldfs NEGTWO_TO_NEG14 = [table_ptr1], 180 ;;
+;;
+
 //
 //    N_0_fix  = integer part of N_0 .
-//    Adjust table_ptr1 to beginning of table.
 //
-(p0)  ldfs TWO_TO_NEG2 = [table_ptr1], 4
-	nop.i 999 ;;
-}
 //
 //    Make N_0 the integer part.
 //
 { .mfi
-(p0)  ldfs NEGTWO_TO_NEG2 = [table_ptr1]
-//
-//    Load -2**(-14).
-//
-(p0)  fcvt.fx.s1 N_0_fix = N_0
-	nop.i 999 ;;
+      nop.m 999
+      fcvt.fx.s1 N_0_fix = N_0
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
-(p0)  fcvt.xf N_0 = N_0_fix
-	nop.i 999 ;;
+      setf.sig B_mask1 = bmask1               // Form mask to get 5 msb of r
+      fcvt.xf N_0 = N_0_fix
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
-(p0)  fnma.s1 ArgPrime = N_0, P_0, Arg
-	nop.i 999
+      setf.sig B_mask2 = bmask2               // Form mask to form B from r
+      fnma.s1 ArgPrime = N_0, P_0, Norm_Arg
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p0)  fmpy.s1 w = N_0, d_1
-	nop.i 999 ;;
+      nop.m 999
+      fmpy.s1 w = N_0, d_1
+      nop.i 999 ;;
 }
-{ .mfi
-	nop.m 999
 //
 //    ArgPrime = -N_0 * P_0 + Arg
 //    w  = N_0 * d_1
 //
-(p0)  fmpy.s1 N = ArgPrime, two_by_PI
-	nop.i 999 ;;
-}
-{ .mfi
-	nop.m 999
 //
 //    N = ArgPrime * 2/pi
 //
-(p0)  fcvt.fx.s1 N_fix = N
-	nop.i 999 ;;
-}
+//      fcvt.fx.s1 N_fix = N
+// Use special scaling to right shift so N=Arg * 2/pi is in rightmost bits
+// Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24
 { .mfi
-	nop.m 999
-//
-//    N_fix is the integer part.
-//
-(p0)  fcvt.xf N = N_fix
-	nop.i 999 ;;
+      nop.m 999
+      fma.s1      N_fix = ArgPrime, FR_inv_pi_2to63, FR_rshf_2to64
+
+      nop.i 999 ;;
 }
+//     Convert integer N_fix back to normalized floating-point value.
 { .mfi
-(p0)  getf.sig N_fix_gr = N_fix
-	nop.f 999
-	nop.i 999 ;;
+      nop.m 999
+      fms.s1 N = N_fix, FR_2tom64, FR_rshf    // Use scaling to get N floated
+      nop.i 999
 }
-{ .mfi
-	nop.m 999
+;;
+
 //
 //    N is the integer part of the reduced-reduced argument.
 //    Put the integer in a GP register.
 //
-(p0)  fnma.s1 s_val = N, P_1, ArgPrime
-	nop.i 999
-}
 { .mfi
-	nop.m 999
-(p0)  fnma.s1 w = N, P_2, w
-	nop.i 999 ;;
+      getf.sig N_fix_gr = N_fix
+      nop.f 999
+      nop.i 999
 }
-{ .mfi
-	nop.m 999
+;;
+
 //
 //    s_val = -N*P_1 + ArgPrime
 //    w = -N*P_2 + w
 //
-(p0)  fcmp.lt.unc.s1 p11, p10 = s_val, TWO_TO_NEG14
-	nop.i 999 ;;
-}
-{ .mfi
-	nop.m 999
-(p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14
-	nop.i 999 ;;
-}
 { .mfi
-	nop.m 999
-//
-//    Case 3: r = s_val + w (Z complete)
-//    Case 4: U_hi = N_0 * d_1
-//
-(p10) fmpy.s1 V_hi = N, P_2
-	nop.i 999
+      nop.m 999
+      fnma.s1 s_val = N, P_1, ArgPrime
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p11) fmpy.s1 U_hi = N_0, d_1
-	nop.i 999 ;;
+      nop.m 999
+      fnma.s1 w = N, P_2, w
+      nop.i 999
 }
-{ .mfi
-	nop.m 999
-//
-//    Case 3: r = s_val + w (Z complete)
+;;
+
+//    Case 4: V_hi = N * P_2
 //    Case 4: U_hi = N_0 * d_1
-//
-(p11) fmpy.s1 V_hi = N, P_2
-	nop.i 999
-}
 { .mfi
-	nop.m 999
-(p11) fmpy.s1 U_hi = N_0, d_1
-	nop.i 999 ;;
+      nop.m 999
+      fmpy.s1 V_hi = N, P_2               // V_hi = N * P_2 for |s| < 2^-14
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-//
-//    Decide between case 3 and 4:
-//    Case 3:  s <= -2**(-14) or s >= 2**(-14)
-//    Case 4: -2**(-14) < s < 2**(-14)
-//
-(p10) fadd.s1 r = s_val, w
-	nop.i 999
+      nop.m 999
+      fmpy.s1 U_hi = N_0, d_1             // U_hi = N_0 * d_1 for |s| < 2^-14
+      nop.i 999
 }
+;;
+
+//    Case 3: r = s_val + w (Z complete)
+//    Case 4: w = N * P_3
 { .mfi
-	nop.m 999
-(p11) fmpy.s1 w = N, P_3
-	nop.i 999 ;;
+      nop.m 999
+      fadd.s1 r = s_val, w                // r = s_val + w for |s| >= 2^-14
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-//
-//    Case 4: We need abs of both U_hi and V_hi - dont
-//    worry about switched sign of V_hi .
-//
-(p11) fsub.s1 A = U_hi, V_hi
-	nop.i 999
+      nop.m 999
+      fmpy.s1 w2 = N, P_3                 // w2 = N * P_3 for |s| < 2^-14
+      nop.i 999
 }
-{ .mfi
-	nop.m 999
-//
+;;
+
 //    Case 4: A =  U_hi + V_hi
 //    Note: Worry about switched sign of V_hi, so subtract instead of add.
-//
-(p11) fnma.s1 V_lo = N, P_2, V_hi
-	nop.i 999 ;;
+//    Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup)
+//    Note: the (-) is still missing for V_hi.
+{ .mfi
+      nop.m 999
+      fsub.s1 A = U_hi, V_hi           // A = U_hi - V_hi for |s| < 2^-14
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p11) fms.s1 U_lo = N_0, d_1, U_hi
-	nop.i 999 ;;
+      nop.m 999
+      fnma.s1 V_lo = N, P_2, V_hi      // V_lo = V_hi - N * P_2 for |s| < 2^-14
+      nop.i 999
 }
+;;
+
+//    Decide between case 3 and 4:
+//    Case 3:  |s| >= 2**(-14)     Set p10
+//    Case 4:  |s| <  2**(-14)     Set p11
+//
+//    Case 4: U_lo = N_0 * d_1 - U_hi
 { .mfi
-	nop.m 999
-(p11) fabs V_hiabs = V_hi
-	nop.i 999
+      nop.m 999
+      fms.s1 U_lo = N_0, d_1, U_hi     // U_lo = N_0*d_1 - U_hi for |s| < 2^-14
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-//
-//    Case 4: V_hi = N * P_2
-//            w = N * P_3
-//    Note the product does not include the (-) as in the writeup
-//    so (-) missing for V_hi and w .
-(p10) fadd.s1 r = s_val, w
-	nop.i 999 ;;
+      nop.m 999
+      fcmp.lt.s1 p11, p10 = s_val, TWO_TO_NEG14
+      nop.i 999
 }
+;;
+
+//    Case 4: We need abs of both U_hi and V_hi - don't
+//    worry about switched sign of V_hi.
 { .mfi
-	nop.m 999
-//
-//    Case 3: c = s_val - r
-//    Case 4: U_lo = N_0 * d_1 - U_hi
-//
-(p11) fabs U_hiabs = U_hi
-	nop.i 999
+      nop.m 999
+      fabs V_hiabs = V_hi              // |V_hi| for |s| < 2^-14
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p11) fmpy.s1 w = N, P_3
-	nop.i 999 ;;
+      nop.m 999
+(p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14
+      nop.i 999
 }
+;;
+
+//    Case 3: c = s_val - r
 { .mfi
-	nop.m 999
-//
-//    Case 4: Set P_12 if U_hiabs >= V_hiabs
-//
-(p11) fadd.s1 C_hi = s_val, A
-	nop.i 999 ;;
+      nop.m 999
+      fabs U_hiabs = U_hi              // |U_hi| for |s| < 2^-14
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
+      fsub.s1 c = s_val, r             // c = s_val - r    for |s| >= 2^-14
+      nop.i 999
+}
+;;
+
+// For Case 3, |s| >= 2^-14, determine if |r| < 1/4
 //
 //    Case 4: C_hi = s_val + A
 //
-(p11) fadd.s1 t = U_lo, V_lo
-	nop.i 999 ;;
-}
 { .mfi
-	nop.m 999
-//
-//    Case 3: Is |r| < 2**(-2), if so set PR_7
-//    else set PR_8.
-//    Case 3: If PR_7 is set, prepare to branch to Small_R.
-//    Case 3: If PR_8 is set, prepare to branch to Normal_R.
-//
-(p10) fsub.s1 c = s_val, r
-	nop.i 999 ;;
+      nop.m 999
+(p11) fadd.s1 C_hi = s_val, A              // C_hi = s_val + A for |s| < 2^-14
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-//
-//    Case 3: c = (s - r) + w (c complete)
-//
-(p11) fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs
-	nop.i 999
+      nop.m 999
+(p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p11) fms.s1 w = N_0, d_2, w
-	nop.i 999 ;;
+      getf.sig sig_r = r               // Get signif of r if |s| >= 2^-14
+      fand B = B_mask1, r
+      nop.i 999
 }
+;;
+
+//    Case 4: t = U_lo + V_lo
 { .mfi
-	nop.m 999
-//
-//    Case 4: V_hi = N * P_2
-//            w = N * P_3
-//    Note the product does not include the (-) as in the writeup
-//    so (-) missing for V_hi and w .
-//
-(p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2
-	nop.i 999 ;;
+      getf.exp exp_r = r               // Extract signexp of r if |s| >= 2^-14
+(p11) fadd.s1 t = U_lo, V_lo               // t = U_lo + V_lo for |s| < 2^-14
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2
-	nop.i 999 ;;
+      nop.i 999
 }
-{ .mfb
-	nop.m 999
+;;
+
+//    Case 3: c = (s - r) + w (c complete)
+{ .mfi
+      nop.m 999
+(p10) fadd.s1 c = c, w              // c = c + w for |s| >= 2^-14
+      nop.i 999
+}
+{ .mbb
+      nop.m 999
+(p14) br.cond.spnt TANL_SMALL_R     // Branch if 2^24 <= |x| < 2^63 and |r|< 1/4
+(p15) br.cond.sptk TANL_NORMAL_R_A  // Branch if 2^24 <= |x| < 2^63 and |r|>=1/4
+}
+;;
+
+
+// Here if 2^24 <= |x| < 2^63 and |s| < 2^-14  >>>>>>>  Case 4.
 //
-//    Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup)
-//    Note: the (-) is still missing for V_hi .
+//    Case 4: Set P_12 if U_hiabs >= V_hiabs
 //    Case 4: w = w + N_0 * d_2
 //    Note: the (-) is now incorporated in w .
-//
-(p10) fadd.s1 c = c, w
-//
-//    Case 4: t = U_lo + V_lo
-//    Note: remember V_lo should be (-), subtract instead of add. NO
-//
-(p14) br.cond.spnt L(TANL_SMALL_R) ;;
-}
-{ .mib
-	nop.m 999
-	nop.i 999
-(p15) br.cond.spnt L(TANL_NORMAL_R) ;;
-}
 { .mfi
-	nop.m 999
-//
-//    Case 3: Vector off when |r| < 2**(-2).  Recall that PR_3 will be true.
-//    The remaining stuff is for Case 4.
-//
-(p12) fsub.s1 a = U_hi, A
-(p11) extr.u i_1 = N_fix_gr, 0, 1 ;;
+      add table_ptr1 = 160, table_base           // Point to tanl_table_p1
+      fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-//
-//    Case 4: C_lo = s_val - C_hi
-//
-(p11) fadd.s1 t = t, w
-	nop.i 999
+      nop.m 999
+      fms.s1 w2 = N_0, d_2, w2
+      nop.i 999
 }
+;;
+
+//    Case 4: C_lo = s_val - C_hi
 { .mfi
-	nop.m 999
-(p13) fadd.s1 a = V_hi, A
-	nop.i 999 ;;
+      ldfe P1_1 = [table_ptr1], 16               // Load P1_1
+      fsub.s1 C_lo = s_val, C_hi
+      nop.i 999
 }
-
-
+;;
 
 //
 //    Case 4: a = U_hi - A
 //            a = V_hi - A (do an add to account for missing (-) on V_hi
 //
-
 { .mfi
-(p11)  addl           table_ptr1   = @ltoff(TANL_BASE_CONSTANTS), gp
-(p11) fsub.s1 C_lo = s_val, C_hi
-	nop.i 999
+      ldfe P1_2 = [table_ptr1], 128              // Load P1_2
+(p12) fsub.s1 a = U_hi, A
+      nop.i 999
+}
+{ .mfi
+      nop.m 999
+(p13) fadd.s1 a = V_hi, A
+      nop.i 999
 }
 ;;
 
+//    Case 4: t = U_lo + V_lo  + w
+{ .mfi
+      ldfe Q1_1 = [table_ptr1], 16               // Load Q1_1
+      fadd.s1 t = t, w2
+      nop.i 999
+}
+;;
 
-
-//
 //    Case 4: a = (U_hi - A)  + V_hi
 //            a = (V_hi - A)  + U_hi
 //    In each case account for negative missing form V_hi .
 //
-
-
-{ .mmi
-(p11)  ld8 table_ptr1 = [table_ptr1]
+{ .mfi
+      ldfe Q1_2 = [table_ptr1], 16               // Load Q1_2
+(p12) fsub.s1 a = a, V_hi
+      nop.i 999
+}
+{ .mfi
       nop.m 999
+(p13) fsub.s1 a = U_hi, a
       nop.i 999
 }
 ;;
 
-
 //
 //    Case 4: C_lo = (s_val - C_hi) + A
 //
-{ .mmi
-(p11) add table_ptr1 = 224, table_ptr1 ;;
-(p11) ldfe P1_1 = [table_ptr1], 16
-	nop.i 999 ;;
-}
-{ .mfi
-(p11) ldfe P1_2 = [table_ptr1], 128
-//
-//    Case 4: w = U_lo + V_lo  + w
-//
-(p12) fsub.s1 a = a, V_hi
-	nop.i 999 ;;
-}
-//
-//    Case 4: r = C_hi + C_lo
-//
 { .mfi
-(p11) ldfe Q1_1 = [table_ptr1], 16
-(p11) fadd.s1 C_lo = C_lo, A
-	nop.i 999 ;;
+      nop.m 999
+      fadd.s1 C_lo = C_lo, A
+      nop.i 999 ;;
 }
 //
-//    Case 4: c = C_hi - r
-//    Get [i_1] - lsb of N_fix_gr.
+//    Case 4: t = t + a
 //
 { .mfi
-(p11) ldfe Q1_2 = [table_ptr1], 16
-	nop.f 999
-	nop.i 999 ;;
+      nop.m 999
+      fadd.s1 t = t, a
+      nop.i 999
 }
+;;
+
+//    Case 4: C_lo = C_lo + t
+//    Case 4: r = C_hi + C_lo
 { .mfi
-	nop.m 999
-(p13) fsub.s1 a = U_hi, a
-	nop.i 999 ;;
+      nop.m 999
+      fadd.s1 C_lo = C_lo, t
+      nop.i 999
 }
+;;
+
 { .mfi
-	nop.m 999
-(p11) fadd.s1 t = t, a
-	nop.i 999 ;;
+      nop.m 999
+      fadd.s1 r = C_hi, C_lo
+      nop.i 999
 }
-{ .mfi
-	nop.m 999
+;;
+
 //
-//    Case 4: t = t + a
+//    Case 4: c = C_hi - r
 //
-(p11) fadd.s1 C_lo = C_lo, t
-	nop.i 999 ;;
-}
 { .mfi
-	nop.m 999
-//
-//    Case 4: C_lo = C_lo + t
-//
-(p11) fadd.s1 r = C_hi, C_lo
-	nop.i 999 ;;
+      nop.m 999
+      fsub.s1 c = C_hi, r
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p11) fsub.s1 c = C_hi, r
-	nop.i 999
+      nop.m 999
+      fmpy.s1 rsq = r, r
+      add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl)
 }
-{ .mfi
-	nop.m 999
-//
+;;
+
 //    Case 4: c = c + C_lo  finished.
-//    Is i_1  even or odd?
-//    if i_1 == 0, set PR_4, else set PR_5.
 //
-// r and c have been computed.
-// We known whether this is the sine or cosine routine.
-// Make sure ftz mode is set - should be automatic when using wre
-(p0)  fmpy.s1 rsq = r, r
-	nop.i 999 ;;
-}
+//    Is i_1 = lsb of N_fix_gr even or odd?
+//    if i_1 == 0, set PR_11, else set PR_12.
+//
 { .mfi
-	nop.m 999
-(p11) fadd.s1 c = c , C_lo
-(p11) cmp.eq.unc p11, p12 =  0x0000, i_1 ;;
+      nop.m 999
+      fadd.s1 c = c , C_lo
+      tbit.z p11, p12 =  N_fix_gr, 0
 }
+;;
+
+// r and c have been computed.
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) frcpa.s1 S_hi, p0 = f1, r
-	nop.i 999
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N odd: Change sign of S_hi
 //
-(p11) fma.s1 Result = rsq, P1_2, P1_1
-	nop.i 999 ;;
+(p11) fma.s1 Poly = rsq, P1_2, P1_1
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fma.s1 P = rsq, Q1_2, Q1_1
-	nop.i 999
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N odd:  Result  =  S_hi + S_lo      (User supplied rounding mode for C1)
 //
-(p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
-	nop.i 999 ;;
+       fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even: rsq = r * r
 //    N odd:  S_hi = frcpa(r)
 //
 (p12) fmerge.ns S_hi = S_hi, S_hi
-	nop.i 999
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even: rsq = rsq * P1_2 + P1_1
 //    N odd:  poly1 =  1.0 +  S_hi * r    16 bits partial  account for necessary
 //
-(p11) fmpy.s1 Result = rsq, Result
-	nop.i 999 ;;
+(p11) fmpy.s1 Poly = rsq, Poly
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fma.s1 poly1 = S_hi, r,f1
-	nop.i 999
+(p11) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
-//    N even: Result =  Result * rsq
+//    N even: Poly =  Poly * rsq
 //    N odd:  S_hi  = S_hi + S_hi*poly1  16 bits account for necessary
 //
-(p11) fma.s1 Result = r, Result, c
-	nop.i 999 ;;
+(p11) fma.s1 Poly = r, Poly, c
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
-	nop.i 999
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N odd:   S_hi  = S_hi * poly1 + S_hi   32 bits
 //
-(p11) fadd.s0 Result= r, Result
-	nop.i 999 ;;
+(p14) fadd.s0 Result = r, Poly          // for tanl
+      nop.i 999 ;;
+}
+
+.pred.rel "mutex",p15,p12
+{ .mfi
+      nop.m 999
+(p15) fms.s0 Result = r, mOne, Poly     // for cotl
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fma.s1 poly1 =  S_hi, r, f1
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
-//    N even: Result = Result * r + c
+//    N even: Poly = Poly * r + c
 //    N odd:  poly1  = 1.0 + S_hi * r        32 bits partial
 //
 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fma.s1 poly1 = S_hi, r, f1
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
-//    N even: Result1 = Result + r  (Rounding mode S0)
+//    N even: Result = Poly + r  (Rounding mode S0)
 //    N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
 //
 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N odd:  poly1  =  S_hi * poly + S_hi    64 bits
 //
 (p12) fma.s1 poly1 = S_hi, r, f1
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N odd:  poly1  =  S_hi * r + 1.0
 //
 (p12) fma.s1 poly1 = S_hi, c, poly1
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N odd:  poly1  =  S_hi * c + poly1
 //
 (p12) fmpy.s1 S_lo = S_hi, poly1
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N odd:  S_lo  =  S_hi *  poly1
 //
 (p12) fma.s1 S_lo = P, r, S_lo
-	nop.i 999 ;;
+(p12) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl
+}
+
+{ .mfi
+      nop.m 999
+(p14) fadd.s0 Result = S_hi, S_lo           // for tanl
+      nop.i 999
 }
 { .mfb
-	nop.m 999
+      nop.m 999
 //
 //    N odd:  S_lo  =  S_lo + r * P
 //
-(p12) fadd.s0 Result = S_hi, S_lo
-(p0)   br.ret.sptk b0 ;;
+(p15) fms.s0 Result = S_hi, mOne, S_lo      // for cotl
+      br.ret.sptk b0 ;;      // Exit for 2^24 <= |x| < 2^63 and |s| < 2^-14
 }
 
 
-L(TANL_SMALL_R): 
-{ .mii
-	nop.m 999
-(p0)  extr.u i_1 = N_fix_gr, 0, 1 ;;
-(p0)  cmp.eq.unc p11, p12 = 0x0000, i_1
-}
+TANL_SMALL_R:
+// Here if |r| < 1/4
+// r and c have been computed.
+// *****************************************************************
+// *****************************************************************
+// *****************************************************************
+//    N odd:  S_hi = frcpa(r)
+//    Get [i_1] - lsb of N_fix_gr.  Set p11 if N even, p12 if N odd.
+//    N even: rsq = r * r
 { .mfi
-	nop.m 999
-(p0)  fmpy.s1 rsq = r, r
-	nop.i 999 ;;
+      add table_ptr1 = 160, table_base    // Point to tanl_table_p1
+      frcpa.s1 S_hi, p0 = f1, r           // S_hi for N odd
+      add N_fix_gr = N_fix_gr, cot_flag   // N = N + 1 (for cotl)
 }
 { .mfi
-(p0)  addl           table_ptr1   = @ltoff(TANL_BASE_CONSTANTS), gp
-(p12) frcpa.s1 S_hi, p0 = f1, r
-	nop.i 999
+      add table_ptr2 = 400, table_base    // Point to Q1_7
+      fmpy.s1 rsq = r, r
+      nop.i 999
 }
 ;;
 
-
 { .mmi
-(p0)  ld8 table_ptr1 = [table_ptr1]
-      nop.m 999
-      nop.i 999
+      ldfe P1_1 = [table_ptr1], 16
+;;
+      ldfe P1_2 = [table_ptr1], 16
+      tbit.z p11, p12 = N_fix_gr, 0
 }
 ;;
 
-// *****************************************************************
-// *****************************************************************
-// *****************************************************************
-
 
-{ .mmi
-(p0)  add table_ptr1 = 224, table_ptr1 ;;
-(p0)  ldfe P1_1 = [table_ptr1], 16
-	nop.i 999 ;;
-}
-//    r and c have been computed.
-//    We known whether this is the sine or cosine routine.
-//    Make sure ftz mode is set - should be automatic when using wre
-//    |r| < 2**(-2)
 { .mfi
-(p0)  ldfe P1_2 = [table_ptr1], 16
-(p11) fmpy.s1 r_to_the_8 = rsq, rsq
-	nop.i 999 ;;
+      ldfe P1_3 = [table_ptr1], 96
+      nop.f 999
+      nop.i 999
 }
-//
-//    Set table_ptr1 to beginning of constant table.
-//    Get [i_1] - lsb of N_fix_gr.
-//
+;;
+
 { .mfi
-(p0)  ldfe P1_3 = [table_ptr1], 96
-//
-//    N even: rsq = r * r
-//    N odd:  S_hi = frcpa(r)
-//
+(p11) ldfe P1_9 = [table_ptr1], -16
 (p12) fmerge.ns S_hi = S_hi, S_hi
-	nop.i 999 ;;
+      nop.i 999
 }
-//
-//    Is i_1  even or odd?
-//    if i_1 == 0, set PR_11.
-//    if i_1 != 0, set PR_12.
-//
 { .mfi
-(p11) ldfe P1_9 = [table_ptr1], -16
+      nop.m 999
+(p11) fmpy.s1 r_to_the_8 = rsq, rsq
+      nop.i 999
+}
+;;
+
 //
 //    N even: Poly2 = P1_7 + Poly2 * rsq
 //    N odd:  poly2 = Q1_5 + poly2 * rsq
 //
+{ .mfi
+(p11) ldfe P1_8 = [table_ptr1], -16
 (p11) fadd.s1 CORR = rsq, f1
-	nop.i 999 ;;
+      nop.i 999
 }
-{ .mmi
-(p11) ldfe P1_8 = [table_ptr1], -16 ;;
+;;
+
 //
 //    N even: Poly1 = P1_2 + P1_3 * rsq
-//    N odd:  poly1 =  1.0 +  S_hi * r     
+//    N odd:  poly1 =  1.0 +  S_hi * r
 //    16 bits partial  account for necessary (-1)
 //
+{ .mmi
 (p11) ldfe P1_7 = [table_ptr1], -16
-	nop.i 999 ;;
+;;
+(p11) ldfe P1_6 = [table_ptr1], -16
+      nop.i 999
 }
+;;
+
 //
 //    N even: Poly1 = P1_1 + Poly1 * rsq
 //    N odd:  S_hi  =  S_hi + S_hi * poly1)     16 bits account for necessary
 //
-{ .mfi
-(p11) ldfe P1_6 = [table_ptr1], -16
 //
 //    N even: Poly2 = P1_5 + Poly2 * rsq
 //    N odd:  poly2 = Q1_3 + poly2 * rsq
 //
+{ .mfi
+(p11) ldfe P1_5 = [table_ptr1], -16
 (p11) fmpy.s1 r_to_the_8 = r_to_the_8, r_to_the_8
-	nop.i 999 ;;
+      nop.i 999
 }
+{ .mfi
+      nop.m 999
+(p12) fma.s1 poly1 =  S_hi, r, f1
+      nop.i 999
+}
+;;
+
 //
 //    N even: Poly1 =  Poly1 * rsq
 //    N odd:  poly1  = 1.0 + S_hi * r         32 bits partial
 //
-{ .mfi
-(p11) ldfe P1_5 = [table_ptr1], -16
-(p12) fma.s1 poly1 =  S_hi, r, f1
-	nop.i 999 ;;
-}
 
 //
 //    N even: CORR =  CORR * c
@@ -2290,44 +2421,30 @@ L(TANL_SMALL_R):
 
 { .mmf
 (p11) ldfe P1_4 = [table_ptr1], -16
-(p0)  addl           table_ptr2   = @ltoff(TANL_BASE_CONSTANTS), gp
-(p11) fmpy.s1 CORR =  CORR, c
-}
-;;
-
-
-{ .mmi
-(p0)  ld8 table_ptr2 = [table_ptr2]
       nop.m 999
-      nop.i 999
+(p11) fmpy.s1 CORR =  CORR, c
 }
 ;;
 
-
-{ .mii
-(p0)  add table_ptr2 = 464, table_ptr2
-	nop.i 999 ;;
-	nop.i 999
-}
 { .mfi
-	nop.m 999
+      nop.m 999
 (p11) fma.s1 Poly1 = P1_3, rsq, P1_2
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-(p0)  ldfe Q1_7 = [table_ptr2], -16
+(p12) ldfe Q1_7 = [table_ptr2], -16
 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-(p0)  ldfe Q1_6 = [table_ptr2], -16
+(p12) ldfe Q1_6 = [table_ptr2], -16
 (p11) fma.s1 Poly2 = P1_9, rsq, P1_8
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mmi
-(p0)  ldfe Q1_5 = [table_ptr2], -16 ;;
+(p12) ldfe Q1_5 = [table_ptr2], -16 ;;
 (p12) ldfe Q1_4 = [table_ptr2], -16
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
 (p12) ldfe Q1_3 = [table_ptr2], -16
@@ -2336,735 +2453,796 @@ L(TANL_SMALL_R):
 //    N odd:  poly2 = Q1_6 + Q1_7 * rsq
 //
 (p11) fma.s1 Poly1 = Poly1, rsq, P1_1
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
 (p12) ldfe Q1_2 = [table_ptr2], -16
 (p12) fma.s1 poly1 = S_hi, r, f1
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
 (p12) ldfe Q1_1 = [table_ptr2], -16
 (p11) fma.s1 Poly2 = Poly2, rsq, P1_7
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even: CORR =  rsq + 1
 //    N even: r_to_the_8 =  rsq * rsq
 //
 (p11) fmpy.s1 Poly1 = Poly1, rsq
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
-	nop.i 999
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fma.s1 poly2 = Q1_7, rsq, Q1_6
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p11) fma.s1 Poly2 = Poly2, rsq, P1_6
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fma.s1 poly1 = S_hi, r, f1
-	nop.i 999
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fma.s1 poly2 = poly2, rsq, Q1_5
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p11) fma.s1 Poly2= Poly2, rsq, P1_5
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fma.s1 S_hi =  S_hi, poly1, S_hi
-	nop.i 999
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fma.s1 poly2 = poly2, rsq, Q1_4
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even: r_to_the_8 = r_to_the_8 * r_to_the_8
 //    N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
 //
 (p11) fma.s1 Poly2 = Poly2, rsq, P1_4
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
-//    N even: Result = CORR + Poly * r
+//    N even: Poly = CORR + Poly * r
 //    N odd:  P = Q1_1 + poly2 * rsq
 //
 (p12) fma.s1 poly1 = S_hi, r, f1
-	nop.i 999
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fma.s1 poly2 = poly2, rsq, Q1_3
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even: Poly2 = P1_4 + Poly2 * rsq
 //    N odd:  poly2 = Q1_2 + poly2 * rsq
 //
 (p11) fma.s1 Poly = Poly2, r_to_the_8, Poly1
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fma.s1 poly1 = S_hi, c, poly1
-	nop.i 999
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fma.s1 poly2 = poly2, rsq, Q1_2
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even: Poly = Poly1 + Poly2 * r_to_the_8
 //    N odd:  S_hi =  S_hi * poly1 + S_hi    64 bits
 //
-(p11) fma.s1 Result = Poly, r, CORR
-	nop.i 999 ;;
+(p11) fma.s1 Poly = Poly, r, CORR
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
-//    N even: Result =  r + Result  (User supplied rounding mode)
+//    N even: Result =  r + Poly  (User supplied rounding mode)
 //    N odd:  poly1  =  S_hi * c + poly1
 //
 (p12) fmpy.s1 S_lo = S_hi, poly1
-	nop.i 999
+(p11) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fma.s1 P = poly2, rsq, Q1_1
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N odd:  poly1  =  S_hi * r + 1.0
 //
 //
 //    N odd:  S_lo  =  S_hi *  poly1
 //
-(p11) fadd.s0 Result = Result, r
-	nop.i 999 ;;
+(p14) fadd.s0 Result = Poly, r          // for tanl
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
+(p15) fms.s0 Result = Poly, mOne, r     // for cotl
+      nop.i 999 ;;
+}
+
+{ .mfi
+      nop.m 999
 //
 //    N odd:  S_lo  =  Q1_1 * c + S_lo
 //
 (p12) fma.s1 S_lo = Q1_1, c, S_lo
-	nop.i 999
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
-	nop.i 999 ;;
+      nop.m 999
+      fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N odd:  Result =  S_lo + r * P
 //
 (p12) fma.s1 Result = P, r, S_lo
-	nop.i 999 ;;
+(p12) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl
 }
-{ .mfb
-	nop.m 999
+
 //
 //    N odd:  Result = Result + S_hi  (user supplied rounding mode)
 //
-(p12) fadd.s0 Result = Result, S_hi
-(p0)   br.ret.sptk b0 ;;
+{ .mfi
+      nop.m 999
+(p14) fadd.s0 Result = Result, S_hi         // for tanl
+      nop.i 999
+}
+{ .mfb
+      nop.m 999
+(p15) fms.s0 Result = Result, mOne, S_hi    // for cotl
+      br.ret.sptk b0 ;;              // Exit |r| < 1/4 path
 }
 
 
-L(TANL_NORMAL_R): 
-{ .mfi
-(p0)  getf.sig sig_r = r
+TANL_NORMAL_R:
+// Here if 1/4 <= |x| < pi/4  or  if |x| >= 2^63 and |r| >= 1/4
 // *******************************************************************
 // *******************************************************************
 // *******************************************************************
 //
 //    r and c have been computed.
-//    Make sure ftz mode is set - should be automatic when using wre
 //
-//
-//    Get [i_1] -  lsb of N_fix_gr alone.
-//
-(p0)  fmerge.s  Pos_r = f1, r
-(p0)  extr.u i_1 = N_fix_gr, 0, 1 ;;
-}
 { .mfi
-	nop.m 999
-(p0)  fmerge.s  sgn_r =  r, f1
-(p0)  cmp.eq.unc p11, p12 = 0x0000, i_1 ;;
-}
-{ .mfi
-	nop.m 999
-	nop.f 999
-(p0)  extr.u lookup = sig_r, 58, 5
-}
-{ .mlx
-	nop.m 999
-(p0)  movl Create_B = 0x8200000000000000 ;;
-}
-{ .mfi
-(p0)  addl           table_ptr1   = @ltoff(TANL_BASE_CONSTANTS), gp
-	nop.f 999
-(p0)  dep Create_B = lookup, Create_B, 58, 5
-}
-;;
-
-
-//
-//    Get [i_1] -  lsb of N_fix_gr alone.
-//    Pos_r = abs (r)
-//
-
-
-{ .mmi
-(p0)  ld8 table_ptr1 = [table_ptr1]
       nop.m 999
+      fand B = B_mask1, r
       nop.i 999
 }
 ;;
 
-
+TANL_NORMAL_R_A:
+// Enter here if pi/4 <= |x| < 2^63 and |r| >= 1/4
+//    Get the 5 bits of r for the lookup.   1.xxxxx ....
 { .mmi
-	nop.m 999
-(p0)  setf.sig B = Create_B
-//
-//    Set table_ptr1 and table_ptr2 to base address of
-//    constant table.
-//
-(p0)  add table_ptr1 = 480, table_ptr1 ;;
-}
-{ .mmb
-	nop.m 999
-//
-//    Is i_1 or i_0  == 0 ?
-//    Create the constant  1 00000 1000000000000000000000...
-//
-(p0)  ldfe P2_1 = [table_ptr1], 16
-	nop.b 999
+      add table_ptr1 = 416, table_base     // Point to tanl_table_p2
+      mov GR_exp_2tom65 = 0xffff - 65      // Scaling constant for B
+      extr.u lookup = sig_r, 58, 5
 }
+;;
+
 { .mmi
-	nop.m 999 ;;
-(p0)  getf.exp exp_r = Pos_r
-	nop.i 999
+      ldfe P2_1 = [table_ptr1], 16
+      setf.exp TWO_TO_NEG65 = GR_exp_2tom65  // 2^-65 for scaling B if exp_r=-2
+      add N_fix_gr = N_fix_gr, cot_flag      // N = N + 1 (for cotl)
 }
-//
-//    Get r's exponent
-//    Get r's significand
-//
-{ .mmi
-(p0)  ldfe P2_2 = [table_ptr1], 16 ;;
-//
-//    Get the 5 bits or r for the lookup.   1.xxxxx ....
-//    from sig_r.
-//    Grab  lsb of exp of B
-//
-(p0)  ldfe P2_3 = [table_ptr1], 16
-	nop.i 999 ;;
+;;
+
+.pred.rel "mutex",p11,p12
+//    B =  2^63 * 1.xxxxx 100...0
+{ .mfi
+      ldfe P2_2 = [table_ptr1], 16
+      for B = B_mask2, B
+      mov table_offset = 512               // Assume table offset is 512
 }
-{ .mii
-	nop.m 999
-(p0)  andcm table_offset = 0x0001, exp_r ;;
-(p0)  shl table_offset = table_offset, 9 ;;
+;;
+
+{ .mfi
+      ldfe P2_3 = [table_ptr1], 16         // Load P2_3, advance pointer by 16
+      fmerge.s  Pos_r = f1, r              // Pos_r = |r| (sign taken from +1.0)
+      tbit.nz p8,p9 = exp_r, 0             // p8=1 if lsb of exp_r is 1; else p9=1
 }
-{ .mii
-	nop.m 999
-//
-//    Deposit   0 00000 1000000000000000000000... on
-//              1 xxxxx yyyyyyyyyyyyyyyyyyyyyy...,
-//    getting rid of the ys.
+;;
+
 //    Is  B = 2** -2 or  B= 2** -1? If 2**-1, then
 //    we want an offset of 512 for table addressing.
-//
-(p0)  shladd table_offset = lookup, 4, table_offset ;;
-//
-//    B =  ........ 1xxxxx 1000000000000000000...
-//
-(p0)  add table_ptr1 = table_ptr1, table_offset ;;
-}
-{ .mmb
-	nop.m 999
-//
-//   B =  ........ 1xxxxx 1000000000000000000...
-//   Convert B so it has the same exponent as Pos_r
-//
-(p0)  ldfd T_hi = [table_ptr1], 8
-	nop.b 999 ;;
+{ .mii
+      add table_ptr2 = 1296, table_base     // Point to tanl_table_cm2
+(p9)  shladd table_offset = lookup, 4, table_offset // offset = 16*lookup + 512
+(p8)  shladd table_offset = lookup, 4, r0           // offset = 16*lookup
 }
+;;
 
+{ .mmi
+      add table_ptr1 = table_ptr1, table_offset  // Point to T_hi
+      add table_ptr2 = table_ptr2, table_offset  // Point to C_hi
+      add table_ptr3 = 2128, table_base     // Point to tanl_table_scim2
+}
+;;
 
+{ .mmi
+      ldfd T_hi = [table_ptr1], 8                // Load T_hi
+;;
+      ldfd C_hi = [table_ptr2], 8                // Load C_hi
+      add table_ptr3 = table_ptr3, table_offset  // Point to SC_inv
+}
+;;
 
 //
 //    x = |r| - B
-//    Load T_hi.
-//    Load C_hi.
 //
-
-{ .mmf
-(p0)  addl           table_ptr2   = @ltoff(TANL_BASE_CONSTANTS), gp
-(p0)  ldfs T_lo = [table_ptr1]
-(p0)  fmerge.se B = Pos_r, B
+//   Convert B so it has the same exponent as Pos_r before subtracting
+{ .mfi
+      ldfs T_lo = [table_ptr1]                   // Load T_lo
+(p9)  fnma.s1 x = B, FR_2tom64, Pos_r
+      nop.i 999
 }
-;;
-
-
-{ .mmi
-(p0)  ld8 table_ptr2 = [table_ptr2]
+{ .mfi
       nop.m 999
+(p8)  fnma.s1 x = B, TWO_TO_NEG65, Pos_r
       nop.i 999
 }
 ;;
 
-
-{ .mii
-(p0)  add table_ptr2 = 1360, table_ptr2
-	nop.i 999 ;;
-(p0)  add table_ptr2 = table_ptr2, table_offset ;;
+{ .mfi
+      ldfs C_lo = [table_ptr2]                   // Load C_lo
+      nop.f 999
+      nop.i 999
 }
+;;
+
 { .mfi
-(p0)  ldfd C_hi = [table_ptr2], 8
-(p0)  fsub.s1 x = Pos_r, B
-	nop.i 999 ;;
+      ldfe SC_inv = [table_ptr3]                 // Load SC_inv
+      fmerge.s  sgn_r = r, f1
+      tbit.z p11, p12 = N_fix_gr, 0              // p11 if N even, p12 if odd
+
 }
-{ .mii
-(p0)  ldfs C_lo = [table_ptr2],255
-	nop.i 999 ;;
+;;
+
 //
 //    xsq = x * x
 //    N even: Tx = T_hi * x
-//    Load T_lo.
-//    Load C_lo - increment pointer to get SC_inv 
-//    - cant get all the way, do an add later.
-//
-(p0)  add table_ptr2 = 569, table_ptr2 ;;
-}
 //
 //    N even: Tx1 = Tx + 1
 //    N odd:  Cx1 = 1 - Cx
 //
+
 { .mfi
-(p0)  ldfe SC_inv = [table_ptr2], 0
-	nop.f 999
-	nop.i 999 ;;
-}
-{ .mfi
-	nop.m 999
-(p0)  fmpy.s1 xsq = x, x
-	nop.i 999
+      nop.m 999
+      fmpy.s1 xsq = x, x
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p11) fmpy.s1 Tx = T_hi, x
-	nop.i 999 ;;
-}
-{ .mfi
-	nop.m 999
-(p12) fmpy.s1 Cx = C_hi, x
-	nop.i 999 ;;
+      nop.i 999
 }
-{ .mfi
-	nop.m 999
+;;
+
 //
 //    N odd: Cx = C_hi * x
 //
-(p0)  fma.s1 P = P2_3, xsq, P2_2
-	nop.i 999
-}
 { .mfi
-	nop.m 999
+      nop.m 999
+(p12) fmpy.s1 Cx = C_hi, x
+      nop.i 999
+}
+;;
 //
 //    N even and odd: P = P2_2 + P2_3 * xsq
 //
+{ .mfi
+      nop.m 999
+      fma.s1 P = P2_3, xsq, P2_2
+      nop.i 999
+}
+{ .mfi
+      nop.m 999
 (p11) fadd.s1 Tx1 = Tx, f1
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even: D = C_hi - tanx
 //    N odd: D = T_hi + tanx
 //
 (p11) fmpy.s1 CORR = SC_inv, T_hi
-	nop.i 999
+      nop.i 999
 }
 { .mfi
-	nop.m 999
-(p0)  fmpy.s1 Sx = SC_inv, x
-	nop.i 999 ;;
+      nop.m 999
+      fmpy.s1 Sx = SC_inv, x
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fmpy.s1 CORR = SC_inv, C_hi
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fsub.s1 V_hi = f1, Cx
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
-(p0)  fma.s1 P = P, xsq, P2_1
-	nop.i 999
+      nop.m 999
+      fma.s1 P = P, xsq, P2_1
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even and odd: P = P2_1 + P * xsq
 //
 (p11) fma.s1 V_hi = Tx, Tx1, f1
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even: Result  = sgn_r * tail + T_hi (user rounding mode for C1)
 //    N odd:  Result  = sgn_r * tail + C_hi (user rounding mode for C1)
 //
-(p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
-	nop.i 999 ;;
+      fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
-(p0)  fmpy.s1 CORR = CORR, c
-	nop.i 999 ;;
+      nop.m 999
+      fmpy.s1 CORR = CORR, c
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fnma.s1 V_hi = Cx,V_hi,f1
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even: V_hi = Tx * Tx1 + 1
 //    N odd: V_hi = 1 - Cx * V_hi
 //
-(p0)  fmpy.s1 P = P, xsq
-	nop.i 999
+      fmpy.s1 P = P, xsq
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even and odd: P = P * xsq
 //
 (p11) fmpy.s1 V_hi = V_hi, T_hi
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even and odd: tail = P * tail + V_lo
 //
 (p11) fmpy.s1 T_hi = sgn_r, T_hi
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
-(p0)  fmpy.s1 CORR = CORR, sgn_r
-	nop.i 999 ;;
+      nop.m 999
+      fmpy.s1 CORR = CORR, sgn_r
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fmpy.s1 V_hi = V_hi,C_hi
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even: V_hi = T_hi * V_hi
 //    N odd: V_hi  = C_hi * V_hi
 //
-(p0)  fma.s1 tanx = P, x, x
-	nop.i 999
+      fma.s1 tanx = P, x, x
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fnmpy.s1 C_hi = sgn_r, C_hi
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even: V_lo = 1 - V_hi + C_hi
 //    N odd: V_lo = 1 - V_hi + T_hi
 //
 (p11) fadd.s1 CORR = CORR, T_lo
-	nop.i 999
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fsub.s1 CORR = CORR, C_lo
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even and odd: tanx = x + x * P
 //    N even and odd: Sx = SC_inv * x
 //
 (p11) fsub.s1 D = C_hi, tanx
-	nop.i 999
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fadd.s1 D = T_hi, tanx
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N odd: CORR = SC_inv * C_hi
 //    N even: CORR = SC_inv * T_hi
 //
-(p0)  fnma.s1 D = V_hi, D, f1
-	nop.i 999 ;;
+      fnma.s1 D = V_hi, D, f1
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even and odd: D = 1 - V_hi * D
 //    N even and odd: CORR = CORR * c
 //
-(p0)  fma.s1 V_hi = V_hi, D, V_hi
-	nop.i 999 ;;
+      fma.s1 V_hi = V_hi, D, V_hi
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even and odd: V_hi = V_hi + V_hi * D
 //    N even and odd: CORR = sgn_r * CORR
 //
 (p11) fnma.s1 V_lo = V_hi, C_hi, f1
-	nop.i 999
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fnma.s1 V_lo = V_hi, T_hi, f1
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even: CORR = CORR + T_lo
 //    N odd: CORR = CORR - C_lo
 //
 (p11) fma.s1 V_lo = tanx, V_hi, V_lo
-	nop.i 999
+      tbit.nz p15, p0 = cot_flag, 0       // p15=1 if we compute cotl
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fnma.s1 V_lo = tanx, V_hi, V_lo
-	nop.i 999 ;;
+      nop.i 999 ;;
+}
+
+{ .mfi
+      nop.m 999
+(p15) fms.s1 T_hi = f0, f0, T_hi        // to correct result's sign for cotl
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
+(p15) fms.s1 C_hi = f0, f0, C_hi        // to correct result's sign for cotl
+      nop.i 999
+};;
+
+{ .mfi
+      nop.m 999
+(p15) fms.s1 sgn_r = f0, f0, sgn_r      // to correct result's sign for cotl
+      nop.i 999
+};;
+
+{ .mfi
+      nop.m 999
 //
 //    N even: V_lo = V_lo + V_hi * tanx
 //    N odd: V_lo = V_lo - V_hi * tanx
 //
 (p11) fnma.s1 V_lo = C_lo, V_hi, V_lo
-	nop.i 999
+      nop.i 999
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 (p12) fnma.s1 V_lo = T_lo, V_hi, V_lo
-	nop.i 999 ;;
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N  even: V_lo = V_lo - V_hi * C_lo
 //    N  odd: V_lo = V_lo - V_hi * T_lo
 //
-(p0)  fmpy.s1 V_lo = V_hi, V_lo
-	nop.i 999 ;;
+      fmpy.s1 V_lo = V_hi, V_lo
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even and odd: V_lo = V_lo * V_hi
 //
-(p0)  fadd.s1 tail = V_hi, V_lo
-	nop.i 999 ;;
+      fadd.s1 tail = V_hi, V_lo
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even and odd: tail = V_hi + V_lo
 //
-(p0)  fma.s1 tail = tail, P, V_lo
-	nop.i 999 ;;
+      fma.s1 tail = tail, P, V_lo
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even: T_hi = sgn_r * T_hi
 //    N odd : C_hi = -sgn_r * C_hi
 //
-(p0)  fma.s1 tail = tail, Sx, CORR
-	nop.i 999 ;;
+      fma.s1 tail = tail, Sx, CORR
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even and odd: tail = Sx * tail + CORR
 //
-(p0)  fma.s1 tail = V_hi, Sx, tail
-	nop.i 999 ;;
+      fma.s1 tail = V_hi, Sx, tail
+      nop.i 999 ;;
 }
 { .mfi
-	nop.m 999
+      nop.m 999
 //
 //    N even and odd: tail = Sx * V_hi + tail
 //
 (p11) fma.s0 Result = sgn_r, tail, T_hi
-	nop.i 999
+      nop.i 999
 }
 { .mfb
-	nop.m 999
+      nop.m 999
 (p12) fma.s0 Result = sgn_r, tail, C_hi
-(p0)   br.ret.sptk b0 ;;
+      br.ret.sptk b0 ;;                 // Exit for 1/4 <= |r| < pi/4
 }
 
-L(TANL_SPECIAL):
+TANL_DENORMAL:
+// Here if x denormal
 { .mfb
-        nop.m 999
-(p0)   fmpy.s0 Arg = Arg, f0
-(p0)   br.ret.sptk b0 ;;
+      getf.exp GR_signexp_x = Norm_Arg          // Get sign and exponent of x
+      nop.f 999
+      br.cond.sptk TANL_COMMON                  // Return to common code
 }
+;;
+
+
+TANL_SPECIAL:
+TANL_UNSUPPORTED:
 //
 //     Code for NaNs, Unsupporteds, Infs, or +/- zero ?
 //     Invalid raised for Infs and SNaNs.
 //
 
-.endp  tanl
-ASM_SIZE_DIRECTIVE(tanl)
+{ .mfi
+      nop.m 999
+      fmerge.s  f10 = f8, f8            // Save input for error call
+      tbit.nz p6, p7 = cot_flag, 0      // p6=1 if we compute cotl
+}
+;;
 
-// *******************************************************************
-// *******************************************************************
-// *******************************************************************
-//
-//     Special Code to handle very large argument case.
-//     Call int pi_by_2_reduce(&x,&r,&c)
-//     for |arguments| >= 2**63
-//     (Arg or x) is in f8
-//     Address to save r and c as double
-// *******************************************************************
-// *******************************************************************
-// *******************************************************************
+{ .mfi
+      nop.m 999
+(p6)  fclass.m p6, p7 = f8, 0x7         // Test for zero (cotl only)
+      nop.i 999
+}
+;;
+
+.pred.rel "mutex", p6, p7
+{ .mfi
+(p6)  mov GR_Parameter_Tag = 225        // (cotl)
+(p6)  frcpa.s0  f8, p0 = f1, f8         // cotl(+-0) = +-Inf
+      nop.i 999
+}
+{ .mfb
+      nop.m 999
+(p7)  fmpy.s0 f8 = f8, f0
+(p7)  br.ret.sptk b0
+}
+;;
 
-.proc __libm_callout
-__libm_callout:
-L(TANL_ARG_TOO_LARGE): 
+GLOBAL_IEEE754_END(tanl)
+
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
 .prologue
+
+// (1)
 { .mfi
-        add   r50=-32,sp                        // Parameter: r address
-        nop.f 0
+      add           GR_Parameter_Y=-32,sp        // Parameter 2 value
+      nop.f         0
 .save   ar.pfs,GR_SAVE_PFS
-        mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
+      mov           GR_SAVE_PFS=ar.pfs           // Save ar.pfs
 }
 { .mfi
 .fframe 64
-        add sp=-64,sp                           // Create new stack
-        nop.f 0
-        mov GR_SAVE_GP=gp                       // Save gp
+      add sp=-64,sp                              // Create new stack
+      nop.f 0
+      mov GR_SAVE_GP=gp                          // Save gp
 };;
+
+// (2)
 { .mmi
-        stfe [r50] = f0,16                      // Clear Parameter r on stack
-        add  r49 = 16,sp                        // Parameter x address
+      stfe [GR_Parameter_Y] = f1,16              // STORE Parameter 2 on stack
+      add GR_Parameter_X = 16,sp                 // Parameter 1 address
 .save   b0, GR_SAVE_B0
-        mov GR_SAVE_B0=b0                       // Save b0
+      mov GR_SAVE_B0=b0                          // Save b0
 };;
+
 .body
+// (3)
 { .mib
-        stfe [r50] = f0,-16                     // Clear Parameter c on stack
-        nop.i 0
-        nop.b 0
+      stfe [GR_Parameter_X] = f10                // STORE Parameter 1 on stack
+      add   GR_Parameter_RESULT = 0,GR_Parameter_Y  // Parameter 3 address
+      nop.b 0
 }
 { .mib
-        stfe [r49] = Arg                        // Store Parameter x on stack
-        nop.i 0
-(p0)    br.call.sptk b0=__libm_pi_by_2_reduce# ;;
+      stfe [GR_Parameter_Y] = f8                 // STORE Parameter 3 on stack
+      add   GR_Parameter_Y = -16,GR_Parameter_Y
+      br.call.sptk b0=__libm_error_support#      // Call error handling function
+};;
+{ .mmi
+      nop.m 0
+      nop.m 0
+      add   GR_Parameter_RESULT = 48,sp
+};;
+
+// (4)
+{ .mmi
+      ldfe  f8 = [GR_Parameter_RESULT]           // Get return result off stack
+.restore sp
+      add   sp = 64,sp                           // Restore stack pointer
+      mov   b0 = GR_SAVE_B0                      // Restore return address
 };;
+{ .mib
+      mov   gp = GR_SAVE_GP                      // Restore gp
+      mov   ar.pfs = GR_SAVE_PFS                 // Restore ar.pfs
+      br.ret.sptk     b0                         // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+
+.type   __libm_error_support#,@function
+.global __libm_error_support#
+
+
+// *******************************************************************
+// *******************************************************************
+// *******************************************************************
 //
-//     Load 2^-2
+//     Special Code to handle very large argument case.
+//     Call int __libm_pi_by_2_reduce(x,r,c) for |arguments| >= 2**63
+//     The interface is custom:
+//       On input:
+//         (Arg or x) is in f8
+//       On output:
+//         r is in f8
+//         c is in f9
+//         N is in r8
+//     We know also that __libm_pi_by_2_reduce preserves f10-15, f71-127.  We
+//     use this to eliminate save/restore of key fp registers in this calling
+//     function.
 //
+// *******************************************************************
+// *******************************************************************
+// *******************************************************************
+
+LOCAL_LIBM_ENTRY(__libm_callout)
+TANL_ARG_TOO_LARGE:
+.prologue
+{ .mfi
+      add table_ptr2 = 144, table_base        // Point to 2^-2
+      nop.f 999
+.save   ar.pfs,GR_SAVE_PFS
+      mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
+}
+;;
+
+//     Load 2^-2, -2^-2
 { .mmi
-(p0)   ldfe  Arg =[r49],16   
+      ldfps  TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2]
+      setf.sig B_mask1 = bmask1               // Form mask to get 5 msb of r
+.save   b0, GR_SAVE_B0
+      mov GR_SAVE_B0=b0                       // Save b0
+};;
+
+.body
 //
-//     Call argument reduction
+//     Call argument reduction with x in f8
+//     Returns with N in r8, r in f8, c in f9
+//     Assumes f71-127 are preserved across the call
 //
-(p0)   ldfs  TWO_TO_NEG2 = [table_ptr2],4
-//     Get Arg off stack
-//     Get r off stack - hi order part
-//     Get c off stack - lo order part
-(p0)   mov   N_fix_gr = r8 ;;
-}
-{ .mmb
-(p0)   ldfe  r =[r50],16  
-(p0)   ldfs  NEGTWO_TO_NEG2 = [table_ptr2],4
-	nop.b 999 ;;
+{ .mib
+      setf.sig B_mask2 = bmask2               // Form mask to form B from r
+      mov GR_SAVE_GP=gp                       // Save gp
+      br.call.sptk b0=__libm_pi_by_2_reduce#
 }
+;;
+
+//
+//     Is |r| < 2**(-2)
+//
 { .mfi
-(p0)   ldfe  c =[r50],-32  
-	nop.f 999
-	nop.i 999 ;;
+      getf.sig sig_r = r                     // Extract significand of r
+      fcmp.lt.s1  p6, p0 = r, TWO_TO_NEG2
+      mov   gp = GR_SAVE_GP                  // Restore gp
 }
+;;
+
 { .mfi
-.restore sp
-       add   sp = 64,sp                       // Restore stack pointer
+      getf.exp exp_r = r                     // Extract signexp of r
+      nop.f 999
+      mov    b0 = GR_SAVE_B0                 // Restore return address
+}
+;;
+
 //
-//     Is |r| < 2**(-2)
+//     Get N_fix_gr
 //
-(p0)   fcmp.lt.unc.s1  p6, p0 = r, TWO_TO_NEG2 
-mov    b0 = GR_SAVE_B0                        // Restore return address
-};;
 { .mfi
-       mov   gp = GR_SAVE_GP                  // Restore gp
-(p6)   fcmp.gt.unc.s1  p6, p0 = r, NEGTWO_TO_NEG2 
-       mov   ar.pfs = GR_SAVE_PFS             // Restore gp
-};;
+      mov   N_fix_gr = r8
+(p6)  fcmp.gt.unc.s1  p6, p0 = r, NEGTWO_TO_NEG2
+      mov   ar.pfs = GR_SAVE_PFS             // Restore pfs
+}
+;;
+
 { .mbb
-	nop.m 999
-(p6)   br.cond.spnt L(TANL_SMALL_R)
-(p0)   br.cond.sptk L(TANL_NORMAL_R) ;;
+      nop.m 999
+(p6)  br.cond.spnt TANL_SMALL_R              // Branch if |r| < 1/4
+      br.cond.sptk TANL_NORMAL_R             // Branch if 1/4 <= |r| < pi/4
 }
+;;
 
-.endp __libm_callout
-ASM_SIZE_DIRECTIVE(__libm_callout)
+LOCAL_LIBM_END(__libm_callout)
 
 .type __libm_pi_by_2_reduce#,@function
 .global __libm_pi_by_2_reduce#