about summary refs log tree commit diff
path: root/sysdeps/ia64/fpu/s_atanl.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/ia64/fpu/s_atanl.S')
-rw-r--r--sysdeps/ia64/fpu/s_atanl.S2158
1 files changed, 1079 insertions, 1079 deletions
diff --git a/sysdeps/ia64/fpu/s_atanl.S b/sysdeps/ia64/fpu/s_atanl.S
index 1a23611307..28d44c1850 100644
--- a/sysdeps/ia64/fpu/s_atanl.S
+++ b/sysdeps/ia64/fpu/s_atanl.S
@@ -1,10 +1,10 @@
 .file "atanl.s"
 
-
-// Copyright (c) 2000 - 2005, Intel Corporation
+// Copyright (C) 2000, 2001, Intel Corporation
 // All rights reserved.
-//
-// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+// 
+// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
+// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
 // * The name of Intel Corporation may not be used to endorse or promote
 // products derived from this software without specific prior written
 // permission.
-
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,53 +35,41 @@
 // 
 // Intel Corporation is the author of this code, and requests that all
 // problem reports or change requests be submitted to it directly at 
-// http://www.intel.com/software/products/opensource/libraries/num.htm.
+// http://developer.intel.com/opensource.
 //
 //
-//*********************************************************************
+// *********************************************************************
 //
 // History
-// 02/02/00 (hand-optimized)
-// 04/04/00 Unwind support added
-// 08/15/00 Bundle added after call to __libm_error_support to properly
+// 2/02/00  (hand-optimized)
+// 4/04/00  Unwind support added
+// 8/15/00  Bundle added after call to __libm_error_support to properly
 //          set [the previously overwritten] GR_Parameter_RESULT.
-// 03/13/01 Fixed flags when denormal raised on intermediate result
-// 01/08/02 Improved speed.
-// 02/06/02 Corrected .section statement
-// 05/20/02 Cleaned up namespace and sf0 syntax
-// 02/10/03 Reordered header: .section, .global, .proc, .align;
-//          used data8 for long double table values
-// 03/31/05 Reformatted delimiters between data tables
 //
-//*********************************************************************
+// *********************************************************************
 //
 // Function:   atanl(x) = inverse tangent(x), for double extended x values
-// Function:   atan2l(y,x) = atan(y/x), for double extended y, x values
-//
-// API
-//
-//  long double atanl  (long double x)
-//  long double atan2l (long double y, long double x)
+// Function:   atan2l(y,x) = atan(y/x), for double extended y, x values
 //
-//*********************************************************************
+// *********************************************************************
 //
 // Resources Used:
 //
 //    Floating-Point Registers: f8 (Input and Return Value)
-//                              f9 (Input for atan2l)
-//                              f10-f15, f32-f83
+//                              f9-f15
+//                              f32-f79
 //
 //    General Purpose Registers:
-//      r32-r51
-//      r49-r52 (Arguments to error support for 0,0 case)
+//      r32-r48
+//      r49,r50,r51,r52 (Arguments to error support for 0,0 case)
 //
 //    Predicate Registers:      p6-p15
 //
-//*********************************************************************
+// *********************************************************************
 //
 // IEEE Special Conditions:
 //
-//    Denormal fault raised on denormal inputs
+//    Denormal  fault raised on denormal inputs
 //    Underflow exceptions may occur 
 //    Special error handling for the y=0 and x=0 case
 //    Inexact raised when appropriate by algorithm
@@ -104,7 +92,7 @@
 //    atan2l(+/-Inf, Inf) = +/-pi/4
 //    atan2l(+/-Inf, -Inf) = +/-3pi/4
 //
-//*********************************************************************
+// *********************************************************************
 //
 // Mathematical Description
 // ---------------------------
@@ -120,16 +108,16 @@
 //
 //
 //        (Arg_X, Arg_Y) x
-//                        \
-//                \
-//                 \
-//                  \
+//                        \ 
+//                \ 
+//                 \ 
+//                  \ 
 //                   \ angle between is ATANL(Arg_Y,Arg_X)
 
 
 
 
-//                    \
+//                    \ 
 //                     ------------------> X-axis
 
 //                   Origin
@@ -244,14 +232,14 @@
 //       z_hi = 2^k * 1.b_1 b_2 b_3 b_4 1
 //
 // then
-//                                            /                \
+//                                            /                \ 
 //                                            |  (V/U) - z_hi  |
 
//      arctan(V/U) = arctan(z_hi) + arctan| -------------- |
 //                                            | 1 + (V/U)*z_hi |
 //                                            \                /
 //
-//                                            /                \
+//                                            /                \ 
 //                                            |   V - z_hi*U   |
 
//                  = arctan(z_hi) + arctan| -------------- |
@@ -307,7 +295,7 @@
 //    U      := max( |Arg_X|, |Arg_Y| )
 //    V      := min( |Arg_X|, |Arg_Y| )
 //
-//    execute: frcpa E, pred, V, U
+//    execute: frcpa E, pred, V, U
 //    If pred is 0, go to Step 5 for special cases handling.
 //
 // Step 2. Decide on branch.
@@ -411,7 +399,7 @@
 //
 //    z := V * E     ...z approximates V/U to roughly working precision
 //    zsq := z * z
-//    z4 := zsq * zsq; z8 := z4 * z4
+//    z8 := zsq * zsq; z8 := z8 * z8
 //
 //    poly1 := P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
 //    poly2 := zsq*(P_1 + zsq*(P_2 + zsq*P_3))
@@ -450,11 +438,12 @@
 //
 // Step 5. Special Cases
 //
-//    These are detected early in the function by fclass instructions.
+//    If pred is 0 where pred is obtained in
+//        frcpa E, pred, V, U
 //
-//    We are in one of those special cases when X or Y is 0,+-inf or NaN
+//    we are in one of those special cases of 0,+-inf or NaN
 //
-//    If one of X and Y is NaN, return X+Y (which will generate
+//    If one of U and V is NaN, return U+V (which will generate
 //    invalid in case one is a signaling NaN). Otherwise,
 //    return the Result as described in the table
 //
@@ -480,6 +469,8 @@
 //
 //
 
+#include "libm_support.h"
+
 ArgY_orig   =   f8
 Result      =   f8
 FR_RESULT   =   f8
@@ -513,7 +504,6 @@ Res_hi      =   f49
 Res_lo      =   f50
 Z           =   f52
 zsq         =   f53
-z4          =   f54
 z8          =   f54
 poly1       =   f55
 poly2       =   f56
@@ -531,8 +521,8 @@ P_5         =   f67
 P_6         =   f68
 P_7         =   f69
 P_8         =   f70
-U_hold      =   f71
-TWO_TO_NEG3 =   f72
+TWO_TO_NEG3 =   f71
+U_hold      =   f72
 C_hi_hold   =   f73
 E_hold      =   f74
 M           =   f75
@@ -540,11 +530,6 @@ ArgX_abs    =   f76
 ArgY_abs    =   f77
 Result_lo   =   f78
 A_temp      =   f79
-FR_temp     =   f80
-Xsq         =   f81
-Ysq         =   f82
-tmp_small   =   f83
-
 GR_SAVE_PFS   = r33
 GR_SAVE_B0    = r34
 GR_SAVE_GP    = r35
@@ -560,1401 +545,1415 @@ exp_ArgY      = r44
 exponent_Q    = r45 
 significand_Q = r46 
 special       = r47 
-sp_exp_Q      = r48 
-sp_exp_4sig_Q = r49 
-table_base    = r50 
-int_temp      = r51
-
+special1      = r48 
 GR_Parameter_X      = r49
 GR_Parameter_Y      = r50
 GR_Parameter_RESULT = r51
 GR_Parameter_TAG    = r52
-GR_temp             = r52
-
-RODATA
-.align 16 
-
-LOCAL_OBJECT_START(Constants_atan)
-//       double pi/2
-data8 0x3FF921FB54442D18
-//       single lo_pi/2, two**(-3)
-data4 0x248D3132, 0x3E000000
-data8 0xAAAAAAAAAAAAAAA3, 0xBFFD // P_1
-data8 0xCCCCCCCCCCCC54B2, 0x3FFC // P_2
-data8 0x9249249247E4D0C2, 0xBFFC // P_3
-data8 0xE38E38E058870889, 0x3FFB // P_4
-data8 0xBA2E895B290149F8, 0xBFFB // P_5
-data8 0x9D88E6D4250F733D, 0x3FFB // P_6
-data8 0x884E51FFFB8745A0, 0xBFFB // P_7
-data8 0xE1C7412B394396BD, 0x3FFA // P_8
-data8 0xAAAAAAAAAAAAA52F, 0xBFFD // Q_1
-data8 0xCCCCCCCCC75B60D3, 0x3FFC // Q_2
-data8 0x924923AD011F1940, 0xBFFC // Q_3
-data8 0xE36F716D2A5F89BD, 0x3FFB // Q_4
+int_temp            = r52
+
+#ifdef _LIBC
+.rodata
+#else
+.data
+#endif
+.align 64 
+
+Constants_atan:
+ASM_TYPE_DIRECTIVE(Constants_atan,@object)
+data4    0x54442D18, 0x3FF921FB, 0x248D3132, 0x3E000000
+//       double pi/2, single lo_pi/2, two**(-3)
+data4    0xAAAAAAA3, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // P_1
+data4    0xCCCC54B2, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // P_2
+data4    0x47E4D0C2, 0x92492492, 0x0000BFFC, 0x00000000 // P_3
+data4    0x58870889, 0xE38E38E0, 0x00003FFB, 0x00000000 // P_4
+data4    0x290149F8, 0xBA2E895B, 0x0000BFFB, 0x00000000 // P_5
+data4    0x250F733D, 0x9D88E6D4, 0x00003FFB, 0x00000000 // P_6
+data4    0xFB8745A0, 0x884E51FF, 0x0000BFFB, 0x00000000 // P_7
+data4    0x394396BD, 0xE1C7412B, 0x00003FFA, 0x00000000 // P_8
+data4    0xAAAAA52F, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // Q_1
+data4    0xC75B60D3, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // Q_2
+data4    0x011F1940, 0x924923AD, 0x0000BFFC, 0x00000000 // Q_3
+data4    0x2A5F89BD, 0xE36F716D, 0x00003FFB, 0x00000000 // Q_4
 //
 //    Entries Tbl_hi  (double precision)
 //    B = 1+Index/16+1/32  Index = 0
 //    Entries Tbl_lo (single precision)
 //    B = 1+Index/16+1/32  Index = 0
 //
-data8 0x3FE9A000A935BD8E 
-data4 0x23ACA08F, 0x00000000
+data4   0xA935BD8E, 0x3FE9A000, 0x23ACA08F, 0x00000000
 //
 //    Entries Tbl_hi  (double precision) Index = 0,1,...,15
 //    B = 2^(-1)*(1+Index/16+1/32)
 //    Entries Tbl_lo (single precision)
 //    Index = 0,1,...,15  B = 2^(-1)*(1+Index/16+1/32)
 //
-data8 0x3FDE77EB7F175A34 
-data4 0x238729EE, 0x00000000
-data8 0x3FE0039C73C1A40B 
-data4 0x249334DB, 0x00000000
-data8 0x3FE0C6145B5B43DA 
-data4 0x22CBA7D1, 0x00000000
-data8 0x3FE1835A88BE7C13 
-data4 0x246310E7, 0x00000000
-data8 0x3FE23B71E2CC9E6A 
-data4 0x236210E5, 0x00000000
-data8 0x3FE2EE628406CBCA 
-data4 0x2462EAF5, 0x00000000
-data8 0x3FE39C391CD41719 
-data4 0x24B73EF3, 0x00000000
-data8 0x3FE445065B795B55 
-data4 0x24C11260, 0x00000000
-data8 0x3FE4E8DE5BB6EC04 
-data4 0x242519EE, 0x00000000
-data8 0x3FE587D81F732FBA 
-data4 0x24D4346C, 0x00000000
-data8 0x3FE6220D115D7B8D 
-data4 0x24ED487B, 0x00000000
-data8 0x3FE6B798920B3D98 
-data4 0x2495FF1E, 0x00000000
-data8 0x3FE748978FBA8E0F 
-data4 0x223D9531, 0x00000000
-data8 0x3FE7D528289FA093 
-data4 0x242B0411, 0x00000000
-data8 0x3FE85D69576CC2C5 
-data4 0x2335B374, 0x00000000
-data8 0x3FE8E17AA99CC05D 
-data4 0x24C27CFB, 0x00000000
+data4   0x7F175A34, 0x3FDE77EB, 0x238729EE, 0x00000000
+data4   0x73C1A40B, 0x3FE0039C, 0x249334DB, 0x00000000
+data4   0x5B5B43DA, 0x3FE0C614, 0x22CBA7D1, 0x00000000
+data4   0x88BE7C13, 0x3FE1835A, 0x246310E7, 0x00000000
+data4   0xE2CC9E6A, 0x3FE23B71, 0x236210E5, 0x00000000
+data4   0x8406CBCA, 0x3FE2EE62, 0x2462EAF5, 0x00000000
+data4   0x1CD41719, 0x3FE39C39, 0x24B73EF3, 0x00000000
+data4   0x5B795B55, 0x3FE44506, 0x24C11260, 0x00000000
+data4   0x5BB6EC04, 0x3FE4E8DE, 0x242519EE, 0x00000000
+data4   0x1F732FBA, 0x3FE587D8, 0x24D4346C, 0x00000000
+data4   0x115D7B8D, 0x3FE6220D, 0x24ED487B, 0x00000000
+data4   0x920B3D98, 0x3FE6B798, 0x2495FF1E, 0x00000000
+data4   0x8FBA8E0F, 0x3FE74897, 0x223D9531, 0x00000000
+data4   0x289FA093, 0x3FE7D528, 0x242B0411, 0x00000000
+data4   0x576CC2C5, 0x3FE85D69, 0x2335B374, 0x00000000
+data4   0xA99CC05D, 0x3FE8E17A, 0x24C27CFB, 0x00000000
 //
 //    Entries Tbl_hi  (double precision) Index = 0,1,...,15
 //    B = 2^(-2)*(1+Index/16+1/32)
 //    Entries Tbl_lo (single precision)
 //    Index = 0,1,...,15  B = 2^(-2)*(1+Index/16+1/32)
 //
-data8 0x3FD025FA510665B5 
-data4 0x24263482, 0x00000000
-data8 0x3FD1151A362431C9
-data4 0x242C8DC9, 0x00000000
-data8 0x3FD2025567E47C95
-data4 0x245CF9BA, 0x00000000
-data8 0x3FD2ED987A823CFE
-data4 0x235C892C, 0x00000000
-data8 0x3FD3D6D129271134
-data4 0x2389BE52, 0x00000000
-data8 0x3FD4BDEE586890E6
-data4 0x24436471, 0x00000000
-data8 0x3FD5A2E0175E0F4E
-data4 0x2389DBD4, 0x00000000
-data8 0x3FD685979F5FA6FD
-data4 0x2476D43F, 0x00000000
-data8 0x3FD7660752817501
-data4 0x24711774, 0x00000000
-data8 0x3FD84422B8DF95D7
-data4 0x23EBB501, 0x00000000
-data8 0x3FD91FDE7CD0C662
-data4 0x23883A0C, 0x00000000
-data8 0x3FD9F93066168001
-data4 0x240DF63F, 0x00000000
-data8 0x3FDAD00F5422058B
-data4 0x23FE261A, 0x00000000
-data8 0x3FDBA473378624A5
-data4 0x23A8CD0E, 0x00000000
-data8 0x3FDC76550AAD71F8
-data4 0x2422D1D0, 0x00000000
-data8 0x3FDD45AEC9EC862B
-data4 0x2344A109, 0x00000000
+data4    0x510665B5, 0x3FD025FA, 0x24263482, 0x00000000
+data4    0x362431C9, 0x3FD1151A, 0x242C8DC9, 0x00000000
+data4    0x67E47C95, 0x3FD20255, 0x245CF9BA, 0x00000000
+data4    0x7A823CFE, 0x3FD2ED98, 0x235C892C, 0x00000000
+data4    0x29271134, 0x3FD3D6D1, 0x2389BE52, 0x00000000
+data4    0x586890E6, 0x3FD4BDEE, 0x24436471, 0x00000000
+data4    0x175E0F4E, 0x3FD5A2E0, 0x2389DBD4, 0x00000000
+data4    0x9F5FA6FD, 0x3FD68597, 0x2476D43F, 0x00000000
+data4    0x52817501, 0x3FD76607, 0x24711774, 0x00000000
+data4    0xB8DF95D7, 0x3FD84422, 0x23EBB501, 0x00000000
+data4    0x7CD0C662, 0x3FD91FDE, 0x23883A0C, 0x00000000
+data4    0x66168001, 0x3FD9F930, 0x240DF63F, 0x00000000
+data4    0x5422058B, 0x3FDAD00F, 0x23FE261A, 0x00000000
+data4    0x378624A5, 0x3FDBA473, 0x23A8CD0E, 0x00000000
+data4    0x0AAD71F8, 0x3FDC7655, 0x2422D1D0, 0x00000000
+data4    0xC9EC862B, 0x3FDD45AE, 0x2344A109, 0x00000000
 //
 //    Entries Tbl_hi  (double precision) Index = 0,1,...,15
 //    B = 2^(-3)*(1+Index/16+1/32)
 //    Entries Tbl_lo (single precision)
 //    Index = 0,1,...,15  B = 2^(-3)*(1+Index/16+1/32)
 //
-data8 0x3FC068D584212B3D
-data4 0x239874B6, 0x00000000
-data8 0x3FC1646541060850
-data4 0x2335E774, 0x00000000
-data8 0x3FC25F6E171A535C
-data4 0x233E36BE, 0x00000000
-data8 0x3FC359E8EDEB99A3
-data4 0x239680A3, 0x00000000
-data8 0x3FC453CEC6092A9E
-data4 0x230FB29E, 0x00000000
-data8 0x3FC54D18BA11570A
-data4 0x230C1418, 0x00000000
-data8 0x3FC645BFFFB3AA73
-data4 0x23F0564A, 0x00000000
-data8 0x3FC73DBDE8A7D201
-data4 0x23D4A5E1, 0x00000000
-data8 0x3FC8350BE398EBC7
-data4 0x23D4ADDA, 0x00000000
-data8 0x3FC92BA37D050271
-data4 0x23BCB085, 0x00000000
-data8 0x3FCA217E601081A5
-data4 0x23BC841D, 0x00000000
-data8 0x3FCB1696574D780B
-data4 0x23CF4A8E, 0x00000000
-data8 0x3FCC0AE54D768466
-data4 0x23BECC90, 0x00000000
-data8 0x3FCCFE654E1D5395
-data4 0x2323DCD2, 0x00000000
-data8 0x3FCDF110864C9D9D
-data4 0x23F53F3A, 0x00000000
-data8 0x3FCEE2E1451D980C
-data4 0x23CCB11F, 0x00000000
-//
-data8 0x400921FB54442D18, 0x3CA1A62633145C07 // PI two doubles
-data8 0x3FF921FB54442D18, 0x3C91A62633145C07 // PI_by_2 two dbles
-data8 0x3FE921FB54442D18, 0x3C81A62633145C07 // PI_by_4 two dbles
-data8 0x4002D97C7F3321D2, 0x3C9A79394C9E8A0A // 3PI_by_4 two dbles
-LOCAL_OBJECT_END(Constants_atan)
-
-
-.section .text
-GLOBAL_IEEE754_ENTRY(atanl)
-
-// Use common code with atan2l after setting x=1.0
-{ .mfi
-      alloc r32 = ar.pfs, 0, 17, 4, 0
-      fma.s1 Ysq = ArgY_orig, ArgY_orig, f0          // Form y*y
-      nop.i 999
-}
-{ .mfi
-      addl table_ptr1 = @ltoff(Constants_atan#), gp  // Address of table pointer
-      fma.s1 Xsq = f1, f1, f0                        // Form x*x
-      nop.i 999
-}
-;;
-
+data4    0x84212B3D, 0x3FC068D5, 0x239874B6, 0x00000000
+data4    0x41060850, 0x3FC16465, 0x2335E774, 0x00000000
+data4    0x171A535C, 0x3FC25F6E, 0x233E36BE, 0x00000000
+data4    0xEDEB99A3, 0x3FC359E8, 0x239680A3, 0x00000000
+data4    0xC6092A9E, 0x3FC453CE, 0x230FB29E, 0x00000000
+data4    0xBA11570A, 0x3FC54D18, 0x230C1418, 0x00000000
+data4    0xFFB3AA73, 0x3FC645BF, 0x23F0564A, 0x00000000
+data4    0xE8A7D201, 0x3FC73DBD, 0x23D4A5E1, 0x00000000
+data4    0xE398EBC7, 0x3FC8350B, 0x23D4ADDA, 0x00000000
+data4    0x7D050271, 0x3FC92BA3, 0x23BCB085, 0x00000000
+data4    0x601081A5, 0x3FCA217E, 0x23BC841D, 0x00000000
+data4    0x574D780B, 0x3FCB1696, 0x23CF4A8E, 0x00000000
+data4    0x4D768466, 0x3FCC0AE5, 0x23BECC90, 0x00000000
+data4    0x4E1D5395, 0x3FCCFE65, 0x2323DCD2, 0x00000000
+data4    0x864C9D9D, 0x3FCDF110, 0x23F53F3A, 0x00000000
+data4    0x451D980C, 0x3FCEE2E1, 0x23CCB11F, 0x00000000
+
+data4    0x54442D18, 0x400921FB, 0x33145C07, 0x3CA1A626 // PI two doubles
+data4    0x54442D18, 0x3FF921FB, 0x33145C07, 0x3C91A626 // PI_by_2 two dbles
+data4    0x54442D18, 0x3FE921FB, 0x33145C07, 0x3C81A626 // PI_by_4 two dbles
+data4    0x7F3321D2, 0x4002D97C, 0x4C9E8A0A, 0x3C9A7939 // 3PI_by_4 two dbles
+ASM_SIZE_DIRECTIVE(Constants_atan)
+
+
+.text
+.proc atanl#
+.global atanl#
+.align 64
+
+atanl: 
+{ .mfb
+	nop.m 999
+(p0)   mov ArgX_orig = f1 
+(p0)   br.cond.sptk atan2l ;;
+}
+.endp atanl
+ASM_SIZE_DIRECTIVE(atanl)
+
+.text
+.proc atan2l#
+.global atan2l#
+#ifdef _LIBC
+.proc __atan2l#
+.global __atan2l#
+.proc __ieee754_atan2l#
+.global __ieee754_atan2l#
+#endif
+.align 64 
+
+
+atan2l:
+#ifdef _LIBC
+__atan2l:
+__ieee754_atan2l:
+#endif
+{ .mfi
+alloc r32 = ar.pfs, 0, 17 , 4, 0
+(p0)  mov   ArgY = ArgY_orig
+}
+{ .mfi
+	nop.m 999
+(p0)  mov   ArgX = ArgX_orig
+	nop.i 999
+};;
 { .mfi
-      ld8 table_ptr1 = [table_ptr1]                  // Get table pointer
-      fnorm.s1 ArgY = ArgY_orig
-      nop.i 999
+	nop.m 999
+(p0)   fclass.m.unc p7,p0 = ArgY_orig, 0x103
+	nop.i 999 
 }
 { .mfi
-      nop.m 999
-      fnorm.s1 ArgX = f1
-      nop.i 999
-}
-;;
-
+	nop.m 999
+//
+//
+//  Save original input args and load table ptr.
+//
+(p0)   fclass.m.unc p6,p0 = ArgX_orig, 0x103
+	nop.i 999
+};;
 { .mfi
-      getf.exp sign_X = f1               // Get signexp of x
-      fmerge.s ArgX_abs = f0, f1         // Form |x|
-      nop.i 999
+(p0)   addl      table_ptr1   = @ltoff(Constants_atan#), gp
+(p0)   fclass.m.unc p0,p9 = ArgY_orig, 0x1FF
+	nop.i 999 ;;
 }
 { .mfi
-      nop.m 999
-      fnorm.s1 ArgX_orig = f1
-      nop.i 999
+       ld8 table_ptr1 = [table_ptr1]
+(p0)   fclass.m.unc p0,p8 = ArgX_orig, 0x1FF
+	nop.i 999
 }
-;;
-
 { .mfi
-      getf.exp sign_Y = ArgY_orig        // Get signexp of y
-      fmerge.s ArgY_abs = f0, ArgY_orig  // Form |y|
-      mov table_base = table_ptr1        // Save base pointer to tables
+	nop.m 999
+(p0)   fclass.m.unc p13,p0 = ArgY_orig, 0x0C3
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      ldfd P_hi = [table_ptr1],8         // Load double precision hi part of pi
-      fclass.m p8,p0 = ArgY_orig, 0x1e7  // Test y natval, nan, inf, zero
-      nop.i 999 
+(p0)   fclass.m.unc p12,p0 = ArgX_orig, 0x0C3
+	nop.i 999
 }
-;;
 
-{ .mfi
-      ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3
-      nop.f 999 
-      nop.i 999 
-}
-{ .mfi
-      nop.m 999
-      fma.s1 M = f1, f1, f0              // Set M = 1.0
-      nop.i 999 
-}
-;;
 
 //
+//     Check for NatVals.
 //     Check for everything - if false, then must be pseudo-zero
 //     or pseudo-nan (IA unsupporteds).
 //
-{ .mfb
-      nop.m 999
-      fclass.m p0,p12 = f1, 0x1FF        // Test x unsupported
-(p8)  br.cond.spnt ATANL_Y_SPECIAL       // Branch if y natval, nan, inf, zero
+{ .mib
+	nop.m 999
+	nop.i 999
+(p6)   br.cond.spnt L(ATANL_NATVAL) ;;
 }
-;;
 
-//     U = max(ArgX_abs,ArgY_abs)
-//     V = min(ArgX_abs,ArgY_abs)
-{ .mfi
-      nop.m 999
-      fcmp.ge.s1 p6,p7 = Xsq, Ysq        // Test for |x| >= |y| using squares
-      nop.i 999 
+{ .mib
+	nop.m 999
+	nop.i 999
+(p7)   br.cond.spnt L(ATANL_NATVAL) ;;
 }
-{ .mfb
-      nop.m 999
-      fma.s1 V = ArgX_abs, f1, f0        // Set V assuming |x| < |y|
-      br.cond.sptk ATANL_COMMON          // Branch to common code
+{ .mib
+(p0)   ldfd P_hi = [table_ptr1],8
+	nop.i 999
+(p8)   br.cond.spnt L(ATANL_UNSUPPORTED) ;;
 }
-;;
-
-GLOBAL_IEEE754_END(atanl)
-
-GLOBAL_IEEE754_ENTRY(atan2l)
-
-{ .mfi
-      alloc r32 = ar.pfs, 0, 17, 4, 0
-      fma.s1 Ysq = ArgY_orig, ArgY_orig, f0          // Form y*y
-      nop.i 999
+{ .mbb
+(p0)   add table_ptr2 = 96, table_ptr1
+(p9)   br.cond.spnt L(ATANL_UNSUPPORTED)
+//
+//     Load double precision high-order part of pi
+//
+(p12)  br.cond.spnt L(ATANL_NAN) ;;
 }
-{ .mfi
-      addl table_ptr1 = @ltoff(Constants_atan#), gp  // Address of table pointer
-      fma.s1 Xsq = ArgX_orig, ArgX_orig, f0          // Form x*x
-      nop.i 999
+{ .mfb
+	nop.m 999
+(p0)   fnorm.s1 ArgX = ArgX
+(p13)  br.cond.spnt L(ATANL_NAN) ;;
 }
-;;
-
-{ .mfi
-      ld8 table_ptr1 = [table_ptr1]                  // Get table pointer
-      fnorm.s1 ArgY = ArgY_orig
-      nop.i 999
+//
+//     Normalize the input argument.
+//     Branch out if NaN inputs
+//
+{ .mmf
+(p0)   ldfs P_lo = [table_ptr1], 4
+	nop.m 999
+(p0)   fnorm.s1 ArgY = ArgY ;;
 }
-{ .mfi
-      nop.m 999
-      fnorm.s1 ArgX = ArgX_orig
-      nop.i 999
+{ .mmf
+	nop.m 999
+(p0)   ldfs TWO_TO_NEG3 = [table_ptr1], 180
+//
+//     U = max(ArgX_abs,ArgY_abs)
+//     V = min(ArgX_abs,ArgY_abs)
+//     if PR1, swap = 0
+//     if PR2, swap = 1
+//
+(p0)   mov M = f1 ;;
 }
-;;
-
 { .mfi
-      getf.exp sign_X = ArgX_orig        // Get signexp of x
-      fmerge.s ArgX_abs = f0, ArgX_orig  // Form |x|
-      nop.i 999
+	nop.m 999
+//
+//     Get exp and sign of ArgX
+//     Get exp and sign of ArgY
+//     Load 2**(-3) and increment ptr to Q_4.
+//
+(p0)   fmerge.s ArgX_abs = f1, ArgX
+	nop.i 999 ;;
 }
-;;
-
+//
+//     load single precision low-order part of pi = P_lo
+//
 { .mfi
-      getf.exp sign_Y = ArgY_orig        // Get signexp of y
-      fmerge.s ArgY_abs = f0, ArgY_orig  // Form |y|
-      mov table_base = table_ptr1        // Save base pointer to tables
+(p0)   getf.exp sign_X = ArgX
+(p0)   fmerge.s ArgY_abs = f1, ArgY
+	nop.i 999 ;;
 }
-;;
-
-{ .mfi
-      ldfd P_hi = [table_ptr1],8         // Load double precision hi part of pi
-      fclass.m p8,p0 = ArgY_orig, 0x1e7  // Test y natval, nan, inf, zero
-      nop.i 999 
+{ .mii
+(p0)   getf.exp sign_Y = ArgY
+	nop.i 999 ;;
+(p0)   shr sign_X = sign_X, 17 ;;
 }
-;;
-
-{ .mfi
-      ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3
-      fclass.m p9,p0 = ArgX_orig, 0x1e7  // Test x natval, nan, inf, zero
-      nop.i 999 
+{ .mii
+	nop.m 999
+(p0)   shr sign_Y = sign_Y, 17 ;;
+(p0)   cmp.eq.unc p8, p9 = 0x00000, sign_Y ;;
 }
 { .mfi
-      nop.m 999
-      fma.s1 M = f1, f1, f0              // Set M = 1.0
-      nop.i 999 
-}
-;;
-
+	nop.m 999
 //
-//     Check for everything - if false, then must be pseudo-zero
-//     or pseudo-nan (IA unsupporteds).
+//     Is ArgX_abs >= ArgY_abs
+//     Is sign_Y == 0?
 //
-{ .mfb
-      nop.m 999
-      fclass.m p0,p12 = ArgX_orig, 0x1FF // Test x unsupported
-(p8)  br.cond.spnt ATANL_Y_SPECIAL       // Branch if y natval, nan, inf, zero
+(p0)   fmax.s1 U = ArgX_abs, ArgY_abs
+	nop.i 999
 }
-;;
-
-//     U = max(ArgX_abs,ArgY_abs)
-//     V = min(ArgX_abs,ArgY_abs)
 { .mfi
-      nop.m 999
-      fcmp.ge.s1 p6,p7 = Xsq, Ysq        // Test for |x| >= |y| using squares
-      nop.i 999 
-}
-{ .mfb
-      nop.m 999
-      fma.s1 V = ArgX_abs, f1, f0        // Set V assuming |x| < |y|
-(p9)  br.cond.spnt ATANL_X_SPECIAL       // Branch if x natval, nan, inf, zero
-}
-;;
-
-// Now common code for atanl and atan2l
-ATANL_COMMON:
-{ .mfi
-      nop.m 999
-      fclass.m p0,p13 = ArgY_orig, 0x1FF // Test y unsupported
-      shr sign_X = sign_X, 17            // Get sign bit of x
+	nop.m 999
+//
+//     ArgX_abs = |ArgX|
+//     ArgY_abs = |ArgY|
+//     sign_X is sign bit of ArgX
+//     sign_Y is sign bit of ArgY
+//
+(p0)   fcmp.ge.s1 p6, p7 = ArgX_abs, ArgY_abs
+	nop.i 999 ;;
 }
 { .mfi
-      nop.m 999
-      fma.s1 U = ArgY_abs, f1, f0        // Set U assuming |x| < |y|
-      adds table_ptr1 = 176, table_ptr1  // Point to Q4
+	nop.m 999
+(p0)   fmin.s1 V = ArgX_abs, ArgY_abs
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-(p6)  add swap = r0, r0                  // Set swap=0 if |x| >= |y|
-(p6)  frcpa.s1 E, p0 = ArgY_abs, ArgX_abs // Compute E if |x| >= |y|
-      shr sign_Y = sign_Y, 17            // Get sign bit of y
+	nop.m 999
+(p8)   fadd.s1 s_Y = f0, f1
+(p6)   cmp.eq.unc p10, p11 = 0x00000, sign_X
 }
-{ .mfb
-      nop.m 999
-(p6)  fma.s1 V = ArgY_abs, f1, f0        // Set V if |x| >= |y|
-(p12) br.cond.spnt ATANL_UNSUPPORTED     // Branch if x unsupported
+{ .mii
+(p6)   add swap = r0, r0
+	nop.i 999 ;;
+(p7)   add swap = 1, r0
 }
-;;
-
-// Set p8 if y >=0
-// Set p9 if y < 0
-// Set p10 if |x| >= |y| and x >=0
-// Set p11 if |x| >= |y| and x < 0
 { .mfi
-      cmp.eq p8, p9 = 0, sign_Y          // Test for y >= 0
-(p7)  frcpa.s1 E, p0 = ArgX_abs, ArgY_abs // Compute E if |x| < |y|
-(p7)  add swap = 1, r0                   // Set swap=1 if |x| < |y|
-}
-{ .mfb
-(p6)  cmp.eq.unc p10, p11 = 0, sign_X    // If |x| >= |y|, test for x >= 0
-(p6)  fma.s1 U = ArgX_abs, f1, f0        // Set U if |x| >= |y|
-(p13) br.cond.spnt ATANL_UNSUPPORTED     // Branch if y unsupported
-}
-;;
-
+	nop.m 999
 //
+//     Let M = 1.0
 //     if p8, s_Y = 1.0
 //     if p9, s_Y = -1.0
 //
-.pred.rel "mutex",p8,p9
+(p10)  fsub.s1 M = M, f1
+	nop.i 999 ;;
+}
 { .mfi
-      nop.m 999
-(p8)  fadd.s1 s_Y = f0, f1               // If y >= 0 set s_Y = 1.0
-      nop.i 999
+	nop.m 999
+(p9)   fsub.s1 s_Y = f0, f1
+	nop.i 999 ;;
 }
 { .mfi
-      nop.m 999
-(p9)  fsub.s1 s_Y = f0, f1               // If y < 0 set s_Y = -1.0
-      nop.i 999
+	nop.m 999
+(p0)   frcpa.s1 E, p6 = V, U
+	nop.i 999 ;;
 }
-;;
-
-.pred.rel "mutex",p10,p11
+{ .mbb
+	nop.m 999
+//
+//     E = frcpa(V,U)
+//
+(p6)   br.cond.sptk L(ATANL_STEP2)
+(p0)   br.cond.spnt L(ATANL_SPECIAL_HANDLING) ;;
+}
+L(ATANL_STEP2): 
 { .mfi
-      nop.m 999
-(p10) fsub.s1 M = M, f1                  // If |x| >= |y| and x >=0, set M=0
-      nop.i 999
+	nop.m 999
+(p0)   fmpy.s1 Q = E, V
+	nop.i 999
 }
 { .mfi
-      nop.m 999
-(p11) fadd.s1 M = M, f1                  // If |x| >= |y| and x < 0, set M=2.0
-      nop.i 999
+	nop.m 999
+(p0)   fcmp.eq.s0     p0, p9 = f1, ArgY_orig
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig // Dummy to set denormal flag
-      nop.i 999
+	nop.m 999
+//
+//     Is Q < 2**(-3)?
+//
+(p0)   fcmp.eq.s0     p0, p8 = f1, ArgX_orig
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+(p11)  fadd.s1 M = M, f1
+	nop.i 999 ;;
 }
+{ .mlx
+	nop.m 999
 // *************************************************
 // ********************* STEP2 *********************
 // *************************************************
+(p0)   movl special = 0x8400000000000000
+}
+{ .mlx
+	nop.m 999
 //
-//     Q = E * V
+//     lookup = b_1 b_2 b_3 b_4
 //
-{ .mfi
-      nop.m 999
-      fmpy.s1 Q = E, V
-      nop.i 999
+(p0)   movl special1 = 0x0000000000000100 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fnma.s1 E_hold = E, U, f1           // E_hold = 1.0 - E*U (1) if POLY path
-      nop.i 999
-}
-;;
-
-// Create a single precision representation of the signexp of Q with the 
-// 4 most significant bits of the significand followed by a 1 and then 18 0's
-{ .mfi
-      nop.m 999
-      fmpy.s1 P_hi = M, P_hi
-      dep.z special = 0x1, 18, 1           // Form 0x0000000000040000
-}
-{ .mfi
-      nop.m 999
-      fmpy.s1 P_lo = M, P_lo
-      add table_ptr2 = 32, table_ptr1
+	nop.m 999
+//
+//     Do fnorms to raise any denormal operand
+//     exceptions.
+//
+(p0)   fmpy.s1 P_hi = M, P_hi
+	nop.i 999
 }
-;;
-
 { .mfi
-      nop.m 999
-      fma.s1 A_temp = Q, f1, f0            // Set A_temp if POLY path
-      nop.i 999
+	nop.m 999
+(p0)   fmpy.s1 P_lo = M, P_lo
+	nop.i 999 ;;
 }
 { .mfi
-      nop.m 999
-      fma.s1 E = E, E_hold, E              // E = E + E*E_hold (1) if POLY path
-      nop.i 999
-}
-;;
-
+	nop.m 999
 //
-//     Is Q < 2**(-3)?
-//     swap = xor(swap,sign_X)
+//     Q = E * V
 //
-{ .mfi
-      nop.m 999
-      fcmp.lt.s1 p9, p0 = Q, TWO_TO_NEG3    // Test Q < 2^-3
-      xor swap = sign_X, swap
+(p0)   fcmp.lt.unc.s1 p6, p7 = Q, TWO_TO_NEG3
+	nop.i 999 ;;
 }
-;;
-
-//     P_hi = s_Y * P_hi
-{ .mmf
-      getf.exp exponent_Q =  Q              // Get signexp of Q
-      cmp.eq.unc p7, p6 = 0x00000, swap
-      fmpy.s1 P_hi = s_Y, P_hi
+{ .mmb
+(p0)   getf.sig significand_Q = Q
+(p0)   getf.exp exponent_Q =  Q
+	nop.b 999 ;;
 }
-;;
-
+{ .mmi
+	nop.m 999 ;;
+(p0)   andcm k = 0x0003, exponent_Q
+(p0)   extr.u lookup = significand_Q, 59, 4 ;;
+}
+{ .mib
+	nop.m 999
+(p0)   dep special = lookup, special, 59, 4
 //
-//     if (PR_1) sigma = -1.0
-//     if (PR_2) sigma =  1.0
+//     Generate 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
 //
-{ .mfi
-      getf.sig significand_Q = Q            // Get significand of Q
-(p6)  fsub.s1 sigma = f0, f1
-      nop.i 999
+(p6)   br.cond.spnt L(ATANL_POLY) ;;
 }
-{ .mfb
-(p9)  add table_ptr1 = 128, table_base      // Point to P8 if POLY path
-(p7)  fadd.s1 sigma = f0, f1
-(p9)  br.cond.spnt ATANL_POLY               // Branch to POLY if 0 < Q < 2^-3
-}
-;;
-
+{ .mfi
+(p0)   cmp.eq.unc p8, p9 = 0x0000, k
+(p0)   fmpy.s1 P_hi = s_Y, P_hi
+//
+//     We waited a few extra cycles so P_lo and P_hi could be calculated.
+//     Load the constant 256 for loading up table entries.
 //
 // *************************************************
 // ******************** STEP3 **********************
 // *************************************************
+(p0)   add table_ptr2 = 16, table_ptr1
+}
 //
-//     lookup = b_1 b_2 b_3 B_4
+//     Let z_hi have exponent and sign of original Q
+//     Load the Tbl_hi(0) else, increment pointer.
 //
+{ .mii
+(p0)   ldfe Q_4 = [table_ptr1], -16
+(p0)   xor swap = sign_X, swap ;;
+(p9)   sub k = k, r0, 1
+}
 { .mmi
-      nop.m 999
-      nop.m 999
-      andcm k = 0x0003, exponent_Q  // k=0,1,2,3 for exp_Q=0,-1,-2,-3
+(p0)   setf.sig z_hi = special
+(p0)   ldfe Q_3 = [table_ptr1], -16
+(p9)   add table_ptr2 = 16, table_ptr2 ;;
 }
-;;
-
 //
-//  Generate sign_exp_Q b_1 b_2 b_3 b_4 1 0 0 0 ... 0  in single precision 
-//  representation.  Note sign of Q is always 0.
+//     U_hold = U - U_prime_hi
+//     k = k * 256 - Result can be 0, 256, or 512.
 //
-{ .mfi
-      cmp.eq p8, p9 = 0x0000, k             // Test k=0
-      nop.f 999
-      extr.u lookup = significand_Q, 59, 4  // Extract b_1 b_2 b_3 b_4 for index
+{ .mmb
+(p0)   ldfe Q_2 = [table_ptr1], -16
+(p8)   ldfd Tbl_hi = [table_ptr2], 8
+	nop.b 999 ;;
 }
-{ .mfi
-      sub sp_exp_Q = 0x7f, k                // Form single prec biased exp of Q
-      nop.f 999
-      sub k = k, r0, 1                      // Decrement k
+//
+//     U_prime_lo =  U_hold + V * z_hi
+//     lookup -> lookup * 16 + k
+//
+{ .mmi
+(p0)   ldfe Q_1 = [table_ptr1], -16 ;;
+(p8)   ldfs Tbl_lo = [table_ptr2], 8
+//
+//     U_prime_hi = U + V * z_hi
+//     Load the Tbl_lo(0)
+//
+(p9)   pmpy2.r k = k, special1 ;;
 }
-;;
-
-//     Form pointer to B index table
-{ .mfi
-      ldfe Q_4 = [table_ptr1], -16          // Load Q_4
-      nop.f 999
-(p9)  shl k = k, 8                          // k = 0, 256, or 512
+{ .mii
+	nop.m 999
+	nop.i 999 
+	nop.i 999 ;;
 }
-{ .mfi
-(p9)  shladd table_ptr2 = lookup, 4, table_ptr2
-      nop.f 999
-      shladd sp_exp_4sig_Q = sp_exp_Q, 4, lookup // Shift and add in 4 high bits
+{ .mii
+	nop.m 999
+	nop.i 999 
+	nop.i 999 ;;
 }
-;;
-
-{ .mmi
-(p8)  add table_ptr2 = -16, table_ptr2      // Pointer if original k was 0
-(p9)  add table_ptr2 = k, table_ptr2        // Pointer if k was 1, 2, 3
-      dep special = sp_exp_4sig_Q, special, 19, 13 // Form z_hi as single prec
+{ .mii
+	nop.m 999
+	nop.i 999 
+	nop.i 999 ;;
 }
-;;
-
-//     z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
-{ .mmi
-      ldfd Tbl_hi = [table_ptr2], 8         // Load Tbl_hi from index table
-;;
-      setf.s z_hi = special                 // Form z_hi
-      nop.i 999
+{ .mii
+	nop.m 999
+	nop.i 999 ;;
+(p9)   shladd lookup = lookup, 0x0004, k ;;
 }
 { .mmi
-      ldfs Tbl_lo = [table_ptr2], 8         // Load Tbl_lo from index table
-;;
-      ldfe Q_3 = [table_ptr1], -16          // Load Q_3
-      nop.i 999
-}
-;;
-
-{ .mmi
-      ldfe Q_2 = [table_ptr1], -16          // Load Q_2
-      nop.m 999
-      nop.i 999
+(p9)   add table_ptr2 = table_ptr2, lookup ;;
+//
+//     V_prime =  V - U * z_hi
+//
+(p9)   ldfd Tbl_hi = [table_ptr2], 8
+	nop.i 999 ;;
 }
-;;
-
 { .mmf
-      ldfe Q_1 = [table_ptr1], -16          // Load Q_1
-      nop.m 999
-      nop.f 999
+	nop.m 999
+//
+//     C_hi = frcpa(1,U_prime_hi)
+//
+(p9)   ldfs Tbl_lo = [table_ptr2], 8
+//
+//     z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
+//     Point to beginning of Tbl_hi entries - k = 0.
+//
+(p0)   fmerge.se z_hi = Q, z_hi ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fma.s1 U_prime_hi = V, z_hi, U        // U_prime_hi = U + V * z_hi
-      nop.i 999
+	nop.m 999
+(p0)   fma.s1 U_prime_hi = V, z_hi, U
+	nop.i 999
 }
 { .mfi
-      nop.m 999
-      fnma.s1 V_prime = U, z_hi, V          // V_prime =  V - U * z_hi
-      nop.i 999
+	nop.m 999
+(p0)   fnma.s1 V_prime = U, z_hi, V
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      mov A_hi = Tbl_hi                     // Start with A_hi = Tbl_hi
-      nop.i 999
+	nop.m 999
+(p0)   mov A_hi = Tbl_hi
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fsub.s1 U_hold = U, U_prime_hi        // U_hold = U - U_prime_hi
-      nop.i 999
+	nop.m 999
+(p0)   fsub.s1 U_hold = U, U_prime_hi
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      frcpa.s1 C_hi, p0 = f1, U_prime_hi    // C_hi = frcpa(1,U_prime_hi)
-      nop.i 999
+	nop.m 999
+(p0)   frcpa.s1 C_hi, p6 = f1, U_prime_hi
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fmpy.s1 A_hi = s_Y, A_hi              // A_hi = s_Y * A_hi
-      nop.i 999
+(p0)   cmp.eq.unc p7, p6 = 0x00000, swap
+(p0)   fmpy.s1 A_hi = s_Y, A_hi
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fma.s1 U_prime_lo = z_hi, V, U_hold   // U_prime_lo =  U_hold + V * z_hi
-      nop.i 999
+	nop.m 999
+//
+//     poly = wsq * poly
+//
+(p7)   fadd.s1 sigma = f0, f1
+	nop.i 999 ;;
 }
-;;
-
-//     C_hi_hold = 1 - C_hi * U_prime_hi (1)
 { .mfi
-      nop.m 999
-      fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 
-      nop.i 999
+	nop.m 999
+(p0)   fma.s1 U_prime_lo = z_hi, V, U_hold
+	nop.i 999
 }
-;;
-
 { .mfi
-      nop.m 999
-      fma.s1 Res_hi = sigma, A_hi, P_hi   // Res_hi = P_hi + sigma * A_hi
-      nop.i 999
+	nop.m 999
+(p6)   fsub.s1 sigma = f0, f1
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (1)
-      nop.i 999
+	nop.m 999
+(p0)   fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
+	nop.i 999 ;;
 }
-;;
-
-//     C_hi_hold = 1 - C_hi * U_prime_hi (2)
 { .mfi
-      nop.m 999
-      fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
-      nop.i 999
+	nop.m 999
+//
+//     A_lo = A_lo + w_hi
+//     A_hi = s_Y * A_hi
+//
+(p0)   fma.s1 Res_hi = sigma, A_hi, P_hi
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (2)
-      nop.i 999
+	nop.m 999
+//
+//     C_hi_hold = 1 - C_hi * U_prime_hi (1)
+//
+(p0)   fma.s1 C_hi = C_hi_hold, C_hi, C_hi
+	nop.i 999 ;;
 }
-;;
-
-//     C_hi_hold = 1 - C_hi * U_prime_hi (3)
 { .mfi
-      nop.m 999
-      fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 
-      nop.i 999
+	nop.m 999
+//
+//     C_hi = C_hi + C_hi * C_hi_hold    (1)
+//
+(p0)   fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (3)
-      nop.i 999
+	nop.m 999
+//
+//     C_hi_hold = 1 - C_hi * U_prime_hi (2)
+//
+(p0)   fma.s1 C_hi = C_hi_hold, C_hi, C_hi
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fmpy.s1 w_hi = V_prime, C_hi           // w_hi = V_prime * C_hi
-      nop.i 999
+	nop.m 999
+//
+//     C_hi = C_hi + C_hi * C_hi_hold    (2)
+//
+(p0)   fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fmpy.s1 wsq = w_hi, w_hi               // wsq = w_hi * w_hi
-      nop.i 999
+	nop.m 999
+//
+//     C_hi_hold = 1 - C_hi * U_prime_hi (3)
+//
+(p0)   fma.s1 C_hi = C_hi_hold, C_hi, C_hi
+	nop.i 999 ;;
 }
 { .mfi
-      nop.m 999
-      fnma.s1 w_lo = w_hi, U_prime_hi, V_prime // w_lo = V_prime-w_hi*U_prime_hi
-      nop.i 999
+	nop.m 999
+//
+//     C_hi = C_hi + C_hi * C_hi_hold    (3)
+//
+(p0)   fmpy.s1 w_hi = V_prime, C_hi
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fma.s1 poly =  wsq, Q_4, Q_3           // poly = Q_3 + wsq * Q_4
-      nop.i 999
+	nop.m 999
+//
+//     w_hi = V_prime * C_hi
+//
+(p0)   fmpy.s1 wsq = w_hi, w_hi
+	nop.i 999
 }
 { .mfi
-      nop.m 999
-      fnma.s1 w_lo = w_hi, U_prime_lo, w_lo  // w_lo = w_lo - w_hi * U_prime_lo
-      nop.i 999
+	nop.m 999
+(p0)   fnma.s1 w_lo = w_hi, U_prime_hi, V_prime
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fma.s1 poly = wsq, poly, Q_2           // poly = Q_2 + wsq * poly
-      nop.i 999
+	nop.m 999
+//
+//     wsq = w_hi * w_hi
+//     w_lo = V_prime - w_hi * U_prime_hi
+//
+(p0)   fma.s1 poly =  wsq, Q_4, Q_3
+	nop.i 999
 }
 { .mfi
-      nop.m 999
-      fmpy.s1 w_lo = C_hi, w_lo              // w_lo =  = w_lo * C_hi
-      nop.i 999
+	nop.m 999
+(p0)   fnma.s1 w_lo = w_hi, U_prime_lo, w_lo
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fma.s1 poly = wsq, poly, Q_1           // poly = Q_1 + wsq * poly
-      nop.i 999
+	nop.m 999
+//
+//     poly = Q_3 + wsq * Q_4
+//     w_lo = w_lo - w_hi * U_prime_lo
+//
+(p0)   fma.s1 poly = wsq, poly, Q_2
+	nop.i 999
 }
 { .mfi
-      nop.m 999
-      fadd.s1 A_lo = Tbl_lo, w_lo            // A_lo = Tbl_lo + w_lo
-      nop.i 999
+	nop.m 999
+(p0)   fmpy.s1 w_lo = C_hi, w_lo
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fmpy.s0 Q_1 =  Q_1, Q_1                // Dummy operation to raise inexact
-      nop.i 999
+	nop.m 999
+//
+//     poly = Q_2 + wsq * poly
+//     w_lo = w_lo * C_hi
+//
+(p0)   fma.s1 poly = wsq, poly, Q_1
+	nop.i 999
 }
-;;
-
 { .mfi
-      nop.m 999
-      fmpy.s1 poly = wsq, poly               // poly = wsq * poly
-      nop.i 999
+	nop.m 999
+(p0)   fadd.s1 A_lo = Tbl_lo, w_lo
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fmpy.s1 poly = w_hi, poly              // poly = w_hi * poly
-      nop.i 999
+	nop.m 999
+//
+//     Result  =  Res_hi + Res_lo * s_Y  (User Supplied Rounding Mode)
+//
+(p0)   fmpy.s0 Q_1 =  Q_1, Q_1
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fadd.s1 A_lo = A_lo, poly              // A_lo = A_lo + poly
-      nop.i 999
+	nop.m 999
+//
+//     poly = Q_1 + wsq * poly
+//     A_lo = Tbl_lo + w_lo
+//     swap = xor(swap,sign_X)
+//
+(p0)   fmpy.s1 poly = wsq, poly
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fadd.s1 A_lo = A_lo, w_hi              // A_lo = A_lo + w_hi
-      nop.i 999
+	nop.m 999
+//
+//     Is (swap) != 0 ?
+//     poly = wsq * poly
+//     A_hi = Tbl_hi
+//
+(p0)   fmpy.s1 poly = w_hi, poly
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fma.s1 Res_lo = sigma, A_lo, P_lo      // Res_lo = P_lo + sigma * A_lo
-      nop.i 999
+	nop.m 999
+//
+//     if (PR_1) sigma = -1.0
+//     if (PR_2) sigma =  1.0
+//
+(p0)   fadd.s1 A_lo = A_lo, poly
+	nop.i 999 ;;
 }
-;;
-
+{ .mfi
+	nop.m 999
 //
-//     Result  =  Res_hi + Res_lo * s_Y  (User Supplied Rounding Mode)
+//     P_hi = s_Y * P_hi
+//     A_lo = A_lo + poly
 //
+(p0)   fadd.s1 A_lo = A_lo, w_hi
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p0)   fma.s1 Res_lo = sigma, A_lo, P_lo
+	nop.i 999 ;;
+}
 { .mfb
-      nop.m 999
-      fma.s0 Result = Res_lo, s_Y, Res_hi
-      br.ret.sptk   b0                        // Exit table path 2^-3 <= V/U < 1
+	nop.m 999
+//
+//     Res_hi = P_hi + sigma * A_hi
+//     Res_lo = P_lo + sigma * A_lo
+//
+(p0)   fma.s0 Result = Res_lo, s_Y, Res_hi
+//
+//     Raise inexact.
+//
+br.ret.sptk   b0 ;;
 }
-;;
-
-
-ATANL_POLY: 
-// Here if 0 < V/U < 2^-3
 //
-// ***********************************************
-// ******************** STEP4 ********************
-// ***********************************************
-
+//     poly1 = P_5 + zsq * poly1
+//     poly2 = zsq * poly2
 //
-//     Following:
-//     Iterate 3 times E = E + E*(1.0 - E*U)
-//     Also load P_8, P_7, P_6, P_5, P_4
+L(ATANL_POLY): 
+{ .mmf
+(p0)   xor swap = sign_X, swap
+	nop.m 999
+(p0)   fnma.s1 E_hold = E, U, f1 ;;
+}
+{ .mfi
+	nop.m 999
+(p0)   mov A_temp = Q
+//
+//     poly1 = P_4 + zsq * poly1
+//     swap = xor(swap,sign_X)
+//
+//     sign_X            gr_002
+//     swap              gr_004
+//     poly1 = poly1 <== Done with poly1
+//     poly1 = P_4 + zsq * poly1
+//     swap = xor(swap,sign_X)
 //
+(p0)   cmp.eq.unc p7, p6 = 0x00000, swap
+}
 { .mfi
-      ldfe P_8 = [table_ptr1], -16            // Load P_8
-      fnma.s1 z_lo = A_temp, U, V             // z_lo = V - A_temp * U
-      nop.i 999
+	nop.m 999
+(p0)   fmpy.s1 P_hi = s_Y, P_hi
+	nop.i 999 ;;
 }
 { .mfi
-      nop.m 999
-      fnma.s1 E_hold = E, U, f1               // E_hold = 1.0 - E*U (2)
-      nop.i 999
+	nop.m 999
+(p6)   fsub.s1 sigma = f0, f1
+	nop.i 999
 }
-;;
-
-{ .mmi
-      ldfe P_7 = [table_ptr1], -16            // Load P_7
-;;
-      ldfe P_6 = [table_ptr1], -16            // Load P_6
-      nop.i 999
+{ .mfi
+	nop.m 999
+(p7)   fadd.s1 sigma = f0, f1
+	nop.i 999 ;;
 }
-;;
 
-{ .mfi
-      ldfe P_5 = [table_ptr1], -16            // Load P_5
-      fma.s1 E = E, E_hold, E                 // E = E + E_hold*E (2)
+// ***********************************************
+// ******************** STEP4 ********************
+// ***********************************************
+
+{ .mmi
+      nop.m 999
+(p0)  addl           table_ptr1   = @ltoff(Constants_atan#), gp
       nop.i 999
 }
 ;;
 
 { .mmi
-      ldfe P_4 = [table_ptr1], -16            // Load P_4
-;;
-      ldfe P_3 = [table_ptr1], -16            // Load P_3
+      ld8 table_ptr1 = [table_ptr1]
+      nop.m 999
       nop.i 999
 }
 ;;
 
+
 { .mfi
-      ldfe P_2 = [table_ptr1], -16            // Load P_2
-      fnma.s1 E_hold = E, U, f1               // E_hold = 1.0 - E*U (3)
-      nop.i 999
-}
-{ .mlx
-      nop.m 999
-      movl         int_temp = 0x24005         // Signexp for small neg number
+	nop.m 999
+(p0)   fma.s1 E = E, E_hold, E
+//
+//     Following:
+//     Iterate 3 times E = E + E*(1.0 - E*U)
+//     Also load P_8, P_7, P_6, P_5, P_4
+//     E_hold = 1.0 - E * U     (1)
+//     A_temp = Q
+//
+(p0)   add table_ptr1 = 128, table_ptr1 ;;
 }
-;;
-
 { .mmf
-      ldfe P_1 = [table_ptr1], -16            // Load P_1
-      setf.exp     tmp_small = int_temp       // Form small neg number
-      fma.s1 E = E, E_hold, E                 // E = E + E_hold*E (3)
+	nop.m 999
+//
+//     E = E + E_hold*E         (1)
+//     Point to P_8.
+//
+(p0)   ldfe P_8 = [table_ptr1], -16
+//
+//     poly = z8*poly1 + poly2  (Typo in writeup)
+//     Is (swap) != 0 ?
+//
+(p0)   fnma.s1 z_lo = A_temp, U, V ;;
 }
-;;
-
+{ .mmb
+	nop.m 999
+//
+//     E_hold = 1.0 - E * U     (2)
+//
+(p0)   ldfe P_7 = [table_ptr1], -16
+	nop.b 999 ;;
+}
+{ .mmb
+	nop.m 999
+//
+//     E = E + E_hold*E         (2)
+//
+(p0)   ldfe P_6 = [table_ptr1], -16
+	nop.b 999 ;;
+}
+{ .mmb
+	nop.m 999
+//
+//     E_hold = 1.0 - E * U     (3)
+//
+(p0)   ldfe P_5 = [table_ptr1], -16
+	nop.b 999 ;;
+}
+{ .mmf
+	nop.m 999
+//
+//     E = E + E_hold*E         (3)
 //
 //
 // At this point E approximates 1/U to roughly working precision
-// Z = V*E approximates V/U
+// z = V*E approximates V/U
 //
-{ .mfi
-      nop.m 999
-      fmpy.s1 Z = V, E                         // Z = V * E
-      nop.i 999
+(p0)   ldfe P_4 = [table_ptr1], -16
+(p0)   fnma.s1 E_hold = E, U, f1 ;;
 }
-{ .mfi
-      nop.m 999
-      fmpy.s1 z_lo = z_lo, E                   // z_lo = z_lo * E
-      nop.i 999
+{ .mmb
+	nop.m 999
+//
+//     Z =   V * E
+//
+(p0)   ldfe P_3 = [table_ptr1], -16
+	nop.b 999 ;;
 }
-;;
-
+{ .mmb
+	nop.m 999
 //
-//     Now what we want to do is
-//     poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
-//     poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3))
+//     zsq = Z * Z
 //
+(p0)   ldfe P_2 = [table_ptr1], -16
+	nop.b 999 ;;
+}
+{ .mmb
+	nop.m 999
 //
-//     Fixup added to force inexact later -
-//     A_hi = A_temp + z_lo
-//     z_lo = (A_temp - A_hi) + z_lo
+//     z8 = zsq * zsq
 //
-{ .mfi
-      nop.m 999
-      fmpy.s1 zsq = Z, Z                        // zsq = Z * Z
-      nop.i 999
+(p0)   ldfe P_1 = [table_ptr1], -16
+	nop.b 999 ;;
+}
+{ .mlx
+	nop.m 999
+(p0)   movl         int_temp = 0x24005
 }
 { .mfi
-      nop.m 999
-      fadd.s1 A_hi = A_temp, z_lo               // A_hi = A_temp + z_lo
-      nop.i 999
+	nop.m 999
+(p0)   fma.s1 E = E, E_hold, E
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fma.s1 poly1 = zsq, P_8, P_7              // poly1 = P_7 + zsq * P_8
-      nop.i 999
+	nop.m 999
+(p0)   fnma.s1 E_hold = E, U, f1
+	nop.i 999 ;;
 }
 { .mfi
-      nop.m 999
-      fma.s1 poly2 = zsq, P_3, P_2              // poly2 = P_2 + zsq * P_3
-      nop.i 999
+	nop.m 999
+(p0)   fma.s1 E = E, E_hold, E
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fmpy.s1 z4 = zsq, zsq                     // z4 = zsq * zsq
-      nop.i 999
+	nop.m 999
+(p0)   fmpy.s1 Z = V, E
+	nop.i 999
 }
 { .mfi
-      nop.m 999
-      fsub.s1 A_temp = A_temp, A_hi             // A_temp = A_temp - A_hi
-      nop.i 999
+	nop.m 999
+//
+//     z_lo = V - A_temp * U
+//     if (PR_2) sigma =  1.0
+//
+(p0)   fmpy.s1 z_lo = z_lo, E
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fmerge.s     tmp = A_hi, A_hi             // Copy tmp = A_hi
-      nop.i 999
+	nop.m 999
+(p0)   fmpy.s1 zsq = Z, Z
+	nop.i 999
 }
-;;
-
 { .mfi
-      nop.m 999
-      fma.s1 poly1 = zsq, poly1, P_6            // poly1 = P_6 + zsq * poly1
-      nop.i 999
+	nop.m 999
+//
+//     z_lo = z_lo * E
+//     if (PR_1) sigma = -1.0
+//
+(p0)   fadd.s1 A_hi = A_temp, z_lo
+	nop.i 999 ;;
 }
 { .mfi
-      nop.m 999
-      fma.s1 poly2 = zsq, poly2, P_1            // poly2 = P_2 + zsq * poly2
-      nop.i 999
+	nop.m 999
+//
+//     z8 = z8 * z8
+//
+//
+//     Now what we want to do is
+//     poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
+//     poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3))
+//
+(p0)   fma.s1 poly1 = zsq, P_8, P_7
+	nop.i 999
 }
-;;
-
 { .mfi
-      nop.m 999
-      fmpy.s1 z8 = z4, z4                       // z8 = z4 * z4
-      nop.i 999
+	nop.m 999
+(p0)   fma.s1 poly2 = zsq, P_3, P_2
+	nop.i 999 ;;
 }
 { .mfi
-      nop.m 999
-      fadd.s1 z_lo = A_temp, z_lo               // z_lo = (A_temp - A_hi) + z_lo
-      nop.i 999
+	nop.m 999
+(p0)   fmpy.s1 z8 = zsq, zsq
+	nop.i 999
 }
-;;
-
 { .mfi
-      nop.m 999
-      fma.s1 poly1 = zsq, poly1, P_5            // poly1 = P_5 + zsq * poly1
-      nop.i 999
+	nop.m 999
+(p0)   fsub.s1 A_temp = A_temp, A_hi
+	nop.i 999 ;;
 }
 { .mfi
-      nop.m 999
-      fmpy.s1 poly2 = poly2, zsq                // poly2 = zsq * poly2
-      nop.i 999
+	nop.m 999
+//
+//     A_lo = Z * poly + z_lo
+//
+(p0)   fmerge.s     tmp = A_hi, A_hi
+	nop.i 999 ;;
 }
-;;
-
-//     Create small GR double in case need to raise underflow
 { .mfi
-      nop.m 999
-      fma.s1 poly1 = zsq, poly1, P_4            // poly1 = P_4 + zsq * poly1
-      dep GR_temp = -1,r0,0,53
+	nop.m 999
+//
+//     poly1 = P_7 + zsq * P_8
+//     poly2 = P_2 + zsq * P_3
+//
+(p0)   fma.s1 poly1 = zsq, poly1, P_6
+	nop.i 999
 }
-;;
-
-//     Create small double in case need to raise underflow
 { .mfi
-      setf.d FR_temp = GR_temp	
-      fma.s1 poly = z8, poly1, poly2            // poly = poly2 + z8 * poly1
-      nop.i 999
+	nop.m 999
+(p0)   fma.s1 poly2 = zsq, poly2, P_1
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fma.s1 A_lo = Z, poly, z_lo               // A_lo = z_lo + Z * poly
-      nop.i 999
+	nop.m 999
+(p0)   fmpy.s1 z8 = z8, z8
+	nop.i 999
 }
-;;
-
 { .mfi
-      nop.m 999
-      fadd.s1      A_hi = tmp, A_lo             // A_hi = tmp + A_lo
-      nop.i 999
+	nop.m 999
+(p0)   fadd.s1 z_lo = A_temp, z_lo
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fsub.s1      tmp = tmp, A_hi              // tmp = tmp - A_hi
-      nop.i 999
+	nop.m 999
+//
+//     poly1 = P_6 + zsq * poly1
+//     poly2 = P_1 + zsq * poly2
+//
+(p0)   fma.s1 poly1 = zsq, poly1, P_5
+	nop.i 999
 }
 { .mfi
-      nop.m 999
-      fmpy.s1 A_hi = s_Y, A_hi                  // A_hi = s_Y * A_hi
-      nop.i 999
+	nop.m 999
+(p0)   fmpy.s1 poly2 = poly2, zsq
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fadd.s1      A_lo = tmp, A_lo             // A_lo = tmp + A_lo
-      nop.i 999
+	nop.m 999
+//
+//     Result  =  Res_hi + Res_lo  (User Supplied Rounding Mode)
+//
+(p0)   fmpy.s1 P_5 = P_5, P_5
+	nop.i 999 ;;
 }
 { .mfi
-      nop.m 999
-      fma.s1 Res_hi = sigma, A_hi, P_hi         // Res_hi = P_hi + sigma * A_hi
-      nop.i 999
+	nop.m 999
+(p0)   fma.s1 poly1 = zsq, poly1, P_4
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fsub.s1 tmp =  P_hi, Res_hi               // tmp = P_hi - Res_hi
-      nop.i 999
+	nop.m 999
+(p0)   fma.s1 poly = z8, poly1, poly2
+	nop.i 999 ;;
 }
-;;
-
+{ .mfi
+	nop.m 999
 //
-//     Test if A_lo is zero
+//     Fixup added to force inexact later -
+//     A_hi = A_temp + z_lo
+//     z_lo = (A_temp - A_hi) + z_lo
 //
-{ .mfi
-      nop.m 999
-      fclass.m p6,p0 = A_lo, 0x007              // Test A_lo = 0
-      nop.i 999
+(p0)   fma.s1 A_lo = Z, poly, z_lo
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-(p6)  mov          A_lo = tmp_small             // If A_lo zero, make very small
-      nop.i 999
+	nop.m 999
+(p0)   fadd.s1      A_hi = tmp, A_lo
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fma.s1 tmp = A_hi, sigma, tmp             // tmp = sigma * A_hi  + tmp
-      nop.i 999
+	nop.m 999
+(p0)   fsub.s1      tmp = tmp, A_hi
+	nop.i 999
 }
 { .mfi
-      nop.m 999
-      fma.s1 sigma =  A_lo, sigma, P_lo         // sigma = A_lo * sigma  + P_lo
-      nop.i 999
+	nop.m 999
+(p0)   fmpy.s1 A_hi = s_Y, A_hi
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fma.s1 Res_lo = s_Y, sigma, tmp           // Res_lo = s_Y * sigma + tmp
-      nop.i 999
+	nop.m 999
+(p0)   fadd.s1      A_lo = tmp, A_lo
+	nop.i 999
 }
-;;
-
+{ .mfi
+(p0)   setf.exp     tmp = int_temp
 //
-//     Test if Res_lo is denormal
+//     P_hi = s_Y * P_hi
+//     A_hi = s_Y * A_hi
 //
+(p0)   fma.s1 Res_hi = sigma, A_hi, P_hi
+	nop.i 999 ;;
+}
 { .mfi
-      nop.m 999
-      fclass.m p14, p15 = Res_lo, 0x0b
-      nop.i 999
+	nop.m 999
+(p0)   fclass.m.unc p6,p0 = A_lo, 0x007
+	nop.i 999 ;;
 }
-;;
-
+{ .mfi
+	nop.m 999
+(p6)   mov          A_lo = tmp
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
 //
-//     Compute Result = Res_lo + Res_hi.  Use s3 if Res_lo is denormal.
+//     Res_hi = P_hi + sigma * A_hi
 //
-{ .mfi
-      nop.m 999
-(p14) fadd.s3 Result = Res_lo, Res_hi     // Result for Res_lo denormal
-      nop.i 999
+(p0)   fsub.s1 tmp =  P_hi, Res_hi
+	nop.i 999 ;;
 }
 { .mfi
-      nop.m 999
-(p15) fadd.s0 Result = Res_lo, Res_hi     // Result for Res_lo normal
-      nop.i 999
+	nop.m 999
+//
+//     tmp = P_hi - Res_hi
+//
+(p0)   fma.s1 tmp = A_hi, sigma, tmp
+	nop.i 999
 }
-;;
-
-//	
-//     If Res_lo is denormal test if Result equals zero
-//	
 { .mfi
-      nop.m 999
-(p14) fclass.m.unc p14, p0 = Result, 0x07
-      nop.i 999
+	nop.m 999
+(p0)   fma.s1 sigma =  A_lo, sigma, P_lo
+	nop.i 999 ;;
 }
-;;
-
+{ .mfi
+	nop.m 999
 //
-//     If Res_lo is denormal and Result equals zero, raise inexact, underflow
-//     by squaring small double
+//     tmp   = sigma * A_hi  + tmp
+//     sigma = A_lo * sigma  + P_lo
 //
+(p0)   fma.s1 Res_lo = s_Y, sigma, tmp
+	nop.i 999 ;;
+}
 { .mfb
-      nop.m 999
-(p14) fmpy.d.s0 FR_temp = FR_temp, FR_temp
-      br.ret.sptk   b0                     // Exit POLY path, 0 < Q < 2^-3
+	nop.m 999
+//
+//     Res_lo = s_Y * sigma + tmp
+//
+(p0)   fadd.s0 Result = Res_lo, Res_hi
+br.ret.sptk   b0 ;;
 }
-;;
-
-
-ATANL_UNSUPPORTED: 
+L(ATANL_NATVAL): 
+L(ATANL_UNSUPPORTED): 
+L(ATANL_NAN): 
 { .mfb
-      nop.m 999
-      fmpy.s0 Result = ArgX,ArgY 
-      br.ret.sptk   b0
+	nop.m 999
+(p0)   fmpy.s0 Result = ArgX,ArgY 
+(p0)   br.ret.sptk   b0 ;;
 }
-;;
-
-// Here if y natval, nan, inf, zero
-ATANL_Y_SPECIAL:
-// Here if x natval, nan, inf, zero
-ATANL_X_SPECIAL:
+L(ATANL_SPECIAL_HANDLING): 
 { .mfi
-      nop.m 999
-      fclass.m p13,p12 = ArgY_orig, 0x0c3  // Test y nan
-      nop.i 999
+	nop.m 999
+(p0)   fcmp.eq.s0     p0, p6 = f1, ArgY_orig
+	nop.i 999
 }
-;;
-
 { .mfi
-      nop.m 999
-      fclass.m p15,p14 = ArgY_orig, 0x103  // Test y natval
-      nop.i 999
+	nop.m 999
+(p0)   fcmp.eq.s0     p0, p5 = f1, ArgX_orig
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-(p12) fclass.m p13,p0 = ArgX_orig, 0x0c3  // Test x nan
-      nop.i 999
+	nop.m 999
+(p0)   fclass.m.unc p6, p7 = ArgY, 0x007
+	nop.i 999
 }
-;;
-
-{ .mfi
-      nop.m 999
-(p14) fclass.m p15,p0 = ArgX_orig, 0x103  // Test x natval
-      nop.i 999
+{ .mlx
+	nop.m 999
+(p0)   movl special = 992
 }
 ;;
 
-{ .mfb
-      nop.m 999
-(p13) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result nan if x or y nan
-(p13) br.ret.spnt b0                      // Exit if x or y nan
-}
-;;
 
-{ .mfb
+{ .mmi
       nop.m 999
-(p15) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result natval if x or y natval
-(p15) br.ret.spnt b0                      // Exit if x or y natval
+(p0)  addl           table_ptr1   = @ltoff(Constants_atan#), gp
+      nop.i 999
 }
 ;;
 
-
-// Here if x or y inf or zero
-ATANL_SPECIAL_HANDLING: 
-{ .mfi
+{ .mmi
+      ld8 table_ptr1 = [table_ptr1]
       nop.m 999
-      fclass.m p6, p7 = ArgY_orig, 0x007        // Test y zero
-      mov special = 992                         // Offset to table
+      nop.i 999
 }
 ;;
 
-{ .mfb
-      add table_ptr1 = table_base, special      // Point to 3pi/4
-      fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig  // Dummy to set denormal flag
-(p7)  br.cond.spnt ATANL_ArgY_Not_ZERO          // Branch if y not zero
-}
-;;
 
-// Here if y zero
+{ .mib
+(p0)   add table_ptr1 = table_ptr1, special
+	nop.i 999
+(p7)   br.cond.spnt L(ATANL_ArgY_Not_ZERO) ;;
+}
 { .mmf
-      ldfd  Result = [table_ptr1], 8            // Get pi high
-      nop.m 999
-      fclass.m p14, p0 = ArgX, 0x035            // Test for x>=+0
+(p0)   ldfd  Result = [table_ptr1], 8
+	nop.m 999
+(p6)   fclass.m.unc p14, p0 = ArgX, 0x035 ;;
 }
-;;
-
 { .mmf
-      nop.m 999
-      ldfd  Result_lo = [table_ptr1], -8        // Get pi lo
-      fclass.m p15, p0 = ArgX, 0x036            // Test for x<=-0
+	nop.m 999
+(p0)   ldfd  Result_lo = [table_ptr1], -8
+(p6)   fclass.m.unc p15, p0 = ArgX, 0x036 ;;
 }
-;;
-
-//
-//     Return sign_Y * 0 when  ArgX > +0
-//
 { .mfi
-      nop.m 999
-(p14) fmerge.s Result = ArgY, f0               // If x>=+0, y=0, hi sgn(y)*0
-      nop.i 999
+	nop.m 999
+(p14)  fmerge.s Result = ArgY, f0
+	nop.i 999
 }
-;;
-
 { .mfi
-      nop.m 999
-      fclass.m p13, p0 = ArgX, 0x007           // Test for x=0
-      nop.i 999
+	nop.m 999
+(p6)   fclass.m.unc p13, p0 = ArgX, 0x007
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-(p14) fmerge.s Result_lo = ArgY, f0            // If x>=+0, y=0, lo sgn(y)*0
-      nop.i 999
+	nop.m 999
+(p14)  fmerge.s Result_lo = ArgY, f0
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-(p13) mov GR_Parameter_TAG = 36                // Error tag for x=0, y=0
-      nop.f 999
-      nop.i 999
+(p13)  mov GR_Parameter_TAG = 36 
+	nop.f 999
+	nop.i 999 ;;
 }
-;;
-
+{ .mfi
+	nop.m 999
 //
-//     Return sign_Y * pi when  ArgX < -0
+//     Return sign_Y * 0 when  ArgX > +0
 //
-{ .mfi
-      nop.m 999
-(p15) fmerge.s Result = ArgY, Result           // If x<0, y=0, hi=sgn(y)*pi
-      nop.i 999
+(p15)  fmerge.s Result = ArgY, Result
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-(p15) fmerge.s Result_lo = ArgY, Result_lo     // If x<0, y=0, lo=sgn(y)*pi
-      nop.i 999
+	nop.m 999
+(p15)  fmerge.s Result_lo = ArgY, Result_lo
+	nop.i 999 ;;
 }
-;;
-
+{ .mfb
+	nop.m 999
 //
-//     Call error support function for atan(0,0)
+//     Return sign_Y * pi when  ArgX < -0
 //
-{ .mfb
-      nop.m 999
-      fadd.s0 Result = Result, Result_lo
-(p13) br.cond.spnt __libm_error_region         // Branch if atan(0,0)
+(p0)   fadd.s0 Result = Result, Result_lo
+(p13)  br.cond.spnt __libm_error_region ;;
 }
-;;
-
 { .mib
-      nop.m 999
-      nop.i 999
-      br.ret.sptk   b0                         // Exit for y=0, x not 0
+	nop.m 999
+	nop.i 999
+//
+//     Call error support function for atan(0,0)
+//
+(p0)    br.ret.sptk   b0 ;;
 }
-;;
-
-// Here if y not zero
-ATANL_ArgY_Not_ZERO: 
+L(ATANL_ArgY_Not_ZERO): 
 { .mfi
-      nop.m 999
-      fclass.m p0, p10 = ArgY, 0x023           // Test y inf
-      nop.i 999
+	nop.m 999
+(p0)   fclass.m.unc p9, p10 = ArgY, 0x023
+	nop.i 999 ;;
+}
+{ .mib
+	nop.m 999
+	nop.i 999
+(p10)  br.cond.spnt  L(ATANL_ArgY_Not_INF) ;;
+}
+{ .mfi
+	nop.m 999
+(p9)   fclass.m.unc p6, p0 = ArgX, 0x017
+	nop.i 999
+}
+{ .mfi
+	nop.m 999
+(p9)   fclass.m.unc p7, p0 = ArgX, 0x021
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p9)   fclass.m.unc p8, p0 = ArgX, 0x022
+	nop.i 999 ;;
+}
+{ .mmi
+(p6)   add table_ptr1 =  16, table_ptr1 ;;
+(p0)   ldfd Result = [table_ptr1], 8
+	nop.i 999 ;;
+}
+{ .mfi
+(p0)   ldfd Result_lo = [table_ptr1], -8
+	nop.f 999
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p6)   fmerge.s Result = ArgY, Result
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p6)   fmerge.s Result_lo = ArgY, Result_lo
+	nop.i 999 ;;
 }
-;;
-
 { .mfb
-      nop.m 999
-      fclass.m p6, p0 = ArgX, 0x017            // Test for 0 <= |x| < inf
-(p10) br.cond.spnt  ATANL_ArgY_Not_INF         // Branch if 0 < |y| < inf
+	nop.m 999
+(p6)    fadd.s0 Result = Result, Result_lo
+(p6)    br.ret.sptk   b0 ;;
 }
-;;
-
-// Here if y=inf
 //
+//     Load PI/2 and adjust its sign.
 //     Return +PI/2 when ArgY = +Inf and ArgX = +/-0 or normal
 //     Return -PI/2 when ArgY = -Inf and ArgX = +/-0 or normal
-//     Return +PI/4 when ArgY = +Inf and ArgX = +Inf
-//     Return -PI/4 when ArgY = -Inf and ArgX = +Inf
-//     Return +3PI/4 when ArgY = +Inf and ArgX = -Inf
-//     Return -3PI/4 when ArgY = -Inf and ArgX = -Inf
 //
+{ .mmi
+(p7)   add table_ptr1 = 32, table_ptr1 ;;
+(p7)   ldfd Result = [table_ptr1], 8
+	nop.i 999 ;;
+}
 { .mfi
-      nop.m 999
-      fclass.m p7, p0 = ArgX, 0x021            // Test for x=+inf
-      nop.i 999
+(p7)   ldfd Result_lo = [table_ptr1], -8
+	nop.f 999
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-(p6)  add table_ptr1 =  16, table_ptr1         // Point to pi/2, if x finite 
-      fclass.m p8, p0 = ArgX, 0x022            // Test for x=-inf
-      nop.i 999
+	nop.m 999
+(p7)   fmerge.s Result = ArgY, Result
+	nop.i 999 ;;
 }
-;;
-
-{ .mmi
-(p7)  add table_ptr1 =  32, table_ptr1         // Point to pi/4 if x=+inf
-;;
-(p8)  add table_ptr1 =  48, table_ptr1         // Point to 3pi/4 if x=-inf
-
-      nop.i 999
+{ .mfi
+	nop.m 999
+(p7)   fmerge.s Result_lo = ArgY, Result_lo
+	nop.i 999 ;;
 }
-;;
-
+{ .mfb
+	nop.m 999
+(p7)    fadd.s0 Result = Result, Result_lo
+(p7)    br.ret.sptk   b0 ;;
+}
+//
+//     Load PI/4 and adjust its sign.
+//     Return +PI/4 when ArgY = +Inf and ArgX = +Inf
+//     Return -PI/4 when ArgY = -Inf and ArgX = +Inf
+//
 { .mmi
-      ldfd Result = [table_ptr1], 8            // Load pi/2, pi/4, or 3pi/4 hi
-;;
-      ldfd Result_lo = [table_ptr1], -8        // Load pi/2, pi/4, or 3pi/4 lo
-      nop.i 999
+(p8)   add table_ptr1 = 48, table_ptr1 ;;
+(p8)   ldfd Result = [table_ptr1], 8
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fmerge.s Result = ArgY, Result           // Merge sgn(y) in hi
-      nop.i 999
+(p8)   ldfd Result_lo = [table_ptr1], -8
+	nop.f 999
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fmerge.s Result_lo = ArgY, Result_lo     // Merge sgn(y) in lo
-      nop.i 999
+	nop.m 999
+(p8)   fmerge.s Result = ArgY, Result
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p8)   fmerge.s Result_lo = ArgY, Result_lo
+	nop.i 999 ;;
 }
-;;
-
 { .mfb
-      nop.m 999
-      fadd.s0 Result = Result, Result_lo       // Compute complete result
-      br.ret.sptk   b0                         // Exit for y=inf
+	nop.m 999
+(p8)   fadd.s0 Result = Result, Result_lo
+(p8)   br.ret.sptk   b0 ;; 
 }
-;;
-
-// Here if y not INF, and x=0 or INF
-ATANL_ArgY_Not_INF: 
+L(ATANL_ArgY_Not_INF): 
+{ .mfi
+	nop.m 999
 //
-//     Return +PI/2 when ArgY NOT Inf, ArgY > 0 and ArgX = +/-0
-//     Return -PI/2 when ArgY NOT Inf, ArgY < 0 and ArgX = +/-0
-//     Return +0    when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf
-//     Return -0    when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf
-//     Return +PI   when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf
-//     Return -PI   when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf
+//     Load 3PI/4 and adjust its sign.
+//     Return +3PI/4 when ArgY = +Inf and ArgX = -Inf
+//     Return -3PI/4 when ArgY = -Inf and ArgX = -Inf
 //
+(p0)  fclass.m.unc p6, p0 = ArgX, 0x007
+	nop.i 999
+}
 { .mfi
-      nop.m 999
-      fclass.m p7, p9 = ArgX, 0x021            // Test for x=+inf
-      nop.i 999
+	nop.m 999
+(p0)  fclass.m.unc p7, p0 = ArgX, 0x021
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-      fclass.m p6, p0 = ArgX, 0x007            // Test for x=0
-      nop.i 999
+	nop.m 999
+(p0)  fclass.m.unc p8, p0 = ArgX, 0x022
+	nop.i 999 ;;
+}
+{ .mmi
+(p6)  add table_ptr1 = 16, table_ptr1 ;;
+(p6)  ldfd Result = [table_ptr1], 8
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-(p6)  add table_ptr1 = 16, table_ptr1          // Point to pi/2
-      fclass.m p8, p0 = ArgX, 0x022            // Test for x=-inf
-      nop.i 999
+(p6)  ldfd Result_lo = [table_ptr1], -8
+	nop.f 999
+	nop.i 999 ;;
 }
-;;
-
-.pred.rel "mutex",p7,p9
 { .mfi
-(p9)  ldfd Result = [table_ptr1], 8           // Load pi or pi/2 hi
-(p7)  fmerge.s Result = ArgY, f0              // If y not inf, x=+inf, sgn(y)*0
-      nop.i 999
+	nop.m 999
+(p6)  fmerge.s Result = ArgY, Result
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-(p9)  ldfd Result_lo = [table_ptr1], -8       // Load pi or pi/2 lo
-(p7)  fnorm.s0 Result = Result                // If y not inf, x=+inf normalize
-      nop.i 999
+	nop.m 999
+(p6)  fmerge.s Result_lo = ArgY, Result_lo
+	nop.i 999 ;;
+}
+{ .mfb
+	nop.m 999
+(p6)  fadd.s0 Result = Result, Result_lo
+(p6)  br.ret.spnt   b0 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-(p9)  fmerge.s Result = ArgY, Result          // Merge sgn(y) in hi
-      nop.i 999
+	nop.m 999
+//
+//    return = sign_Y * PI/2 when ArgX = 0
+//
+(p7)  fmerge.s Result = ArgY, f0
+	nop.i 999 ;;
+}
+{ .mfb
+	nop.m 999
+(p7)  fnorm.s0 Result = Result
+(p7)  br.ret.spnt   b0 ;;
+}
+//
+//    return = sign_Y * 0 when ArgX = Inf
+//
+{ .mmi
+(p8)  ldfd Result = [table_ptr1], 8 ;;
+(p8)  ldfd Result_lo = [table_ptr1], -8
+	nop.i 999 ;;
 }
-;;
-
 { .mfi
-      nop.m 999
-(p9)  fmerge.s Result_lo = ArgY, Result_lo    // Merge sgn(y) in lo
-      nop.i 999
+	nop.m 999
+(p8)  fmerge.s Result = ArgY, Result
+	nop.i 999 ;;
+}
+{ .mfi
+	nop.m 999
+(p8)  fmerge.s Result_lo = ArgY, Result_lo
+	nop.i 999 ;;
 }
-;;
-
 { .mfb
-      nop.m 999
-(p9)  fadd.s0 Result = Result, Result_lo      // Compute complete result
-      br.ret.spnt   b0                        // Exit for y not inf, x=0,inf
+	nop.m 999
+(p8)  fadd.s0 Result = Result, Result_lo
+(p8)  br.ret.sptk   b0 ;;
 }
-;;
-
-GLOBAL_IEEE754_END(atan2l)
+//
+//    return = sign_Y * PI when ArgX = -Inf
+//
+.endp atan2l
+ASM_SIZE_DIRECTIVE(atan2l)
+ASM_SIZE_DIRECTIVE(__atan2l)
+ASM_SIZE_DIRECTIVE(__ieee754_atan2l)
  
-LOCAL_LIBM_ENTRY(__libm_error_region)
+.proc __libm_error_region
+__libm_error_region:
 .prologue
 { .mfi
         add   GR_Parameter_Y=-32,sp             // Parameter 2 value
@@ -2002,6 +2001,7 @@ LOCAL_LIBM_ENTRY(__libm_error_region)
         br.ret.sptk     b0                     // Return
 };;
 
-LOCAL_LIBM_END(__libm_error_region#)
+.endp __libm_error_region
+ASM_SIZE_DIRECTIVE(__libm_error_region) 
 .type   __libm_error_support#,@function
 .global __libm_error_support#