diff options
Diffstat (limited to 'sysdeps/ia64/fpu')
109 files changed, 35124 insertions, 43870 deletions
diff --git a/sysdeps/ia64/fpu/Makefile b/sysdeps/ia64/fpu/Makefile index 6d1b0c1717..7ec30c43d3 100644 --- a/sysdeps/ia64/fpu/Makefile +++ b/sysdeps/ia64/fpu/Makefile @@ -1,8 +1,33 @@ ifeq ($(subdir),math) -libm-sysdep_routines += libm_atan2_reg s_matherrf s_matherrl libm_reduce \ - libm_tan libm_error \ - libm_frexp4 libm_frexp4f libm_frexp4l +# +# Some files which need to go both into libc and libm have external +# dependencies which need to be resolved differently for libc +# vs. libm. For example, inside libc, __libm_error_support needs to +# resolve to HIDDEN_JUMPTARGET(__libm_error_support) whereas within +# libm it always resolves to __libm_error_support. Such files need to +# be compiled twice. Fortunately, math/Makefile already has logic to +# support this: if a file starts with "s_", make will automatically +# generate a matching file whose name starts with "m_" which simply +# includes the corresponding "s_" file. +# +duplicated-routines = s_libm_ldexp s_libm_ldexpf s_libm_ldexpl \ + s_libm_scalbn s_libm_scalbnf s_libm_scalbnl -sysdep_routines += libm_frexp4 libm_frexp4f libm_frexp4l libc_libm_error -sysdep-CPPFLAGS += -DSIZE_INT_32 +libm-sysdep_routines += s_erfc s_erfcf s_erfcl \ + s_matherrf s_matherrl libm_reduce \ + libm_error \ + libm_frexp libm_frexpf libm_frexpl \ + libm_sincos libm_sincosf libm_sincosl \ + libm_sincos_large \ + libm_lgamma libm_lgammaf libm_lgammal \ + libm_scalblnf \ + $(duplicated-routines:s_%=m_%) + +sysdep_routines += libc_libm_error libm_frexp libm_frexpf libm_frexpl \ + $(duplicated-routines) + +sysdep-CPPFLAGS += -include libm-symbols.h \ + -D__POSIX__ \ + -D_LIB_VERSIONIMF=_LIB_VERSION \ + -DSIZE_INT_32 -DSIZE_LONG_INT_64 -DSIZE_LONG_LONG_INT_64 endif diff --git a/sysdeps/ia64/fpu/e_acos.S b/sysdeps/ia64/fpu/e_acos.S index 7e83811727..b515f01a1e 100644 --- a/sysdeps/ia64/fpu/e_acos.S +++ b/sysdeps/ia64/fpu/e_acos.S @@ -1,10 +1,10 @@ .file "acos.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 
2003 Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,9 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// WARRANTY DISCLAIMER -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -37,838 +35,800 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // History //============================================================== -// 2/02/00 Initial version -// 8/17/00 New and much faster algorithm. -// 8/30/00 Avoided bank conflicts on loads, shortened |x|=1 and x=0 paths, +// 02/02/00 Initial version +// 08/17/00 New and much faster algorithm. +// 08/30/00 Avoided bank conflicts on loads, shortened |x|=1 and x=0 paths, // fixed mfb split issue stalls. +// 05/20/02 Cleaned up namespace and sf0 syntax +// 08/02/02 New and much faster algorithm II +// 02/06/03 Reordered header: .section, .global, .proc, .align // Description //========================================= -// The acos function computes the principle value of the arc sine of x. +// The acos function computes the principal value of the arc cosine of x. +// acos(0) returns Pi/2, acos(1) returns 0, acos(-1) returns Pi. 
// A doman error occurs for arguments not in the range [-1,+1]. +// +// The acos function returns the arc cosine in the range [0, Pi] radians. +// +// There are 8 paths: +// 1. x = +/-0.0 +// Return acos(x) = Pi/2 + x +// +// 2. 0.0 < |x| < 0.625 +// Return acos(x) = Pi/2 - x - x^3 *PolA(x^2) +// where PolA(x^2) = A3 + A5*x^2 + A7*x^4 +...+ A35*x^32 +// +// 3. 0.625 <=|x| < 1.0 +// Return acos(x) = Pi/2 - asin(x) = +// = Pi/2 - sign(x) * ( Pi/2 - sqrt(R) * PolB(R)) +// Where R = 1 - |x|, +// PolB(R) = B0 + B1*R + B2*R^2 +...+B12*R^12 +// +// sqrt(R) is approximated using the following sequence: +// y0 = (1 + eps)/sqrt(R) - initial approximation by frsqrta, +// |eps| < 2^(-8) +// Then 3 iterations are used to refine the result: +// H0 = 0.5*y0 +// S0 = R*y0 +// +// d0 = 0.5 - H0*S0 +// H1 = H0 + d0*H0 +// S1 = S0 + d0*S0 +// +// d1 = 0.5 - H1*S1 +// H2 = H1 + d1*H1 +// S2 = S1 + d1*S1 +// +// d2 = 0.5 - H2*S2 +// S3 = S2 + d2*S2 +// +// S3 approximates sqrt(R) with enough accuracy for this algorithm +// +// So, the result should be reconstructed as follows: +// acos(x) = Pi/2 - sign(x) * (Pi/2 - S3*PolB(R)) +// +// But for optimization purposes the reconstruction step is slightly +// changed: +// acos(x) = Cpi + sign(x)*PolB(R)*S2 + sign(x)*d2*S2*PolB(R) +// where Cpi = 0 if x > 0 and Cpi = Pi if x < 0 +// +// 4. |x| = 1.0 +// Return acos(1.0) = 0.0, acos(-1.0) = Pi +// +// 5. 1.0 < |x| <= +INF +// A domain error occurs for arguments not in the range [-1,+1] +// +// 6. x = [S,Q]NaN +// Return acos(x) = QNaN +// +// 7. x is denormal +// Return acos(x) = Pi/2 - x, +// +// 8. x is unnormal +// Normalize input in f8 and return to the very beginning of the function +// +// Registers used +//============================================================== +// Floating Point registers used: +// f8, input, output +// f6, f7, f9 -> f15, f32 -> f64 -// The acos function returns the arc cosine in the range [0, +pi] radians.
-// acos(1) returns +0, acos(-1) returns pi, acos(0) returns pi/2. -// acos(x) returns a Nan and raises the invalid exception for |x| >1 +// General registers used: +// r3, r21 -> r31, r32 -> r38 -// The acos function is just like asin except that pi/2 is added at the end. +// Predicate registers used: +// p0, p6 -> p14 // // Assembly macros //========================================= - -#include "libm_support.h" - -// predicate registers -//acos_pred_LEsqrt2by2 = p7 -//acos_pred_GTsqrt2by2 = p8 - -// integer registers -ASIN_Addr1 = r33 -ASIN_Addr2 = r34 -ASIN_FFFE = r35 - -GR_SAVE_B0 = r36 -GR_SAVE_PFS = r37 -GR_SAVE_GP = r38 - -GR_Parameter_X = r39 -GR_Parameter_Y = r40 -GR_Parameter_RESULT = r41 -GR_Parameter_Tag = r42 - -// floating point registers -acos_coeff_P1 = f32 -acos_coeff_P2 = f33 -acos_coeff_P3 = f34 -acos_coeff_P4 = f35 - -acos_coeff_P5 = f36 -acos_coeff_P6 = f37 -acos_coeff_P7 = f38 -acos_coeff_P8 = f39 -acos_coeff_P9 = f40 - -acos_coeff_P10 = f41 -acos_coeff_P11 = f42 -acos_coeff_P12 = f43 -acos_coeff_P13 = f44 -acos_coeff_P14 = f45 - -acos_coeff_P15 = f46 -acos_coeff_P16 = f47 -acos_coeff_P17 = f48 -acos_coeff_P18 = f49 -acos_coeff_P19 = f50 - -acos_coeff_P20 = f51 -acos_coeff_P21 = f52 -acos_const_sqrt2by2 = f53 -acos_const_piby2 = f54 -acos_abs_x = f55 - -acos_tx = f56 -acos_tx2 = f57 -acos_tx3 = f58 -acos_tx4 = f59 -acos_tx8 = f60 - -acos_tx11 = f61 -acos_1poly_p8 = f62 -acos_1poly_p19 = f63 -acos_1poly_p4 = f64 -acos_1poly_p15 = f65 - -acos_1poly_p6 = f66 -acos_1poly_p17 = f67 -acos_1poly_p0 = f68 -acos_1poly_p11 = f69 -acos_1poly_p2 = f70 - -acos_1poly_p13 = f71 -acos_series_tx = f72 -acos_t = f73 -acos_t2 = f74 -acos_t3 = f75 - -acos_t4 = f76 -acos_t8 = f77 -acos_t11 = f78 -acos_poly_p8 = f79 -acos_poly_p19 = f80 - -acos_poly_p4 = f81 -acos_poly_p15 = f82 -acos_poly_p6 = f83 -acos_poly_p17 = f84 -acos_poly_p0 = f85 - -acos_poly_p11 = f86 -acos_poly_p2 = f87 -acos_poly_p13 = f88 -acos_series_t = f89 -acos_1by2 = f90 - -acos_3by2 = f91 
-acos_5by2 = f92 -acos_11by4 = f93 -acos_35by8 = f94 -acos_63by8 = f95 - -acos_231by16 = f96 -acos_y0 = f97 -acos_H0 = f98 -acos_S0 = f99 -acos_d = f100 - -acos_l1 = f101 -acos_d2 = f102 -acos_T0 = f103 -acos_d1 = f104 -acos_e0 = f105 - -acos_l2 = f106 -acos_d3 = f107 -acos_T3 = f108 -acos_S1 = f109 -acos_e1 = f110 - -acos_z = f111 -answer2 = f112 -acos_sgn_x = f113 -acos_429by16 = f114 -acos_18by4 = f115 - -acos_3by4 = f116 -acos_l3 = f117 -acos_T6 = f118 -acos_const_add = f119 +// integer registers used +// scratch +rTblAddr = r3 + +rPiBy2Ptr = r21 +rTmpPtr3 = r22 +rDenoBound = r23 +rOne = r24 +rAbsXBits = r25 +rHalf = r26 +r0625 = r27 +rSign = r28 +rXBits = r29 +rTmpPtr2 = r30 +rTmpPtr1 = r31 + +// stacked +GR_SAVE_PFS = r32 +GR_SAVE_B0 = r33 +GR_SAVE_GP = r34 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Parameter_TAG = r38 + +// floating point registers used +FR_X = f10 +FR_Y = f1 +FR_RESULT = f8 + + +// scratch +fXSqr = f6 +fXCube = f7 +fXQuadr = f9 +f1pX = f10 +f1mX = f11 +f1pXRcp = f12 +f1mXRcp = f13 +fH = f14 +fS = f15 +// stacked +fA3 = f32 +fB1 = f32 +fA5 = f33 +fB2 = f33 +fA7 = f34 +fPiBy2 = f34 +fA9 = f35 +fA11 = f36 +fB10 = f35 +fB11 = f36 +fA13 = f37 +fA15 = f38 +fB4 = f37 +fB5 = f38 +fA17 = f39 +fA19 = f40 +fB6 = f39 +fB7 = f40 +fA21 = f41 +fA23 = f42 +fB3 = f41 +fB8 = f42 +fA25 = f43 +fA27 = f44 +fB9 = f43 +fB12 = f44 +fA29 = f45 +fA31 = f46 +fA33 = f47 +fA35 = f48 +fBaseP = f49 +fB0 = f50 +fSignedS = f51 +fD = f52 +fHalf = f53 +fR = f54 +fCloseTo1Pol = f55 +fSignX = f56 +fDenoBound = f57 +fNormX = f58 +fX8 = f59 +fRSqr = f60 +fRQuadr = f61 +fR8 = f62 +fX16 = f63 +fCpi = f64 // Data tables //============================================================== - -#ifdef _LIBC -.rodata -#else -.data -#endif - +RODATA .align 16 - -acos_coeff_1_table: -ASM_TYPE_DIRECTIVE(acos_coeff_1_table,@object) -data8 0xE4E7E0A423A21249 , 0x00003FF8 //P7 -data8 0xC2F7EE0200FCE2A5 , 0x0000C003 //P18 -data8 0xB745D7F6C65C20E0 , 0x00003FF9 //P5 
-data8 0xF75E381A323D4D94 , 0x0000C002 //P16 -data8 0x8959C2629C1024C0 , 0x0000C002 //P20 -data8 0xAFF68E7D241292C5 , 0x00003FF8 //P9 -data8 0xB6DB6DB7260AC30D , 0x00003FFA //P3 -data8 0xD0417CE2B41CB7BF , 0x0000C000 //P14 -data8 0x81D570FEA724E3E4 , 0x0000BFFD //P12 -data8 0xAAAAAAAAAAAAC277 , 0x00003FFC //P1 -data8 0xF534912FF3E7B76F , 0x00003FFF //P21 -data8 0xc90fdaa22168c235 , 0x00003fff // pi/2 -data8 0x0000000000000000 , 0x00000000 // pad to avoid bank conflicts -ASM_SIZE_DIRECTIVE(acos_coeff_1_table) - - -acos_coeff_2_table: -ASM_TYPE_DIRECTIVE(acos_coeff_2_table,@object) -data8 0x8E26AF5F29B39A2A , 0x00003FF9 //P6 -data8 0xB4F118A4B1015470 , 0x00004003 //P17 -data8 0xF8E38E10C25990E0 , 0x00003FF9 //P4 -data8 0x80F50489AEF1CAC6 , 0x00004002 //P15 -data8 0x92728015172CFE1C , 0x00004003 //P19 -data8 0xBBC3D831D4595971 , 0x00003FF8 //P8 -data8 0x999999999952A5C3 , 0x00003FFB //P2 -data8 0x855576BE6F0975EC , 0x00003FFF //P13 -data8 0xF12420E778077D89 , 0x00003FFA //P11 -data8 0xB6590FF4D23DE003 , 0x00003FF3 //P10 -data8 0xb504f333f9de6484 , 0x00003ffe // sqrt(2)/2 -ASM_SIZE_DIRECTIVE(acos_coeff_2_table) - - -.align 32 -.global acos -ASM_TYPE_DIRECTIVE(acos,@function) +LOCAL_OBJECT_START(acos_base_range_table) +// Ai: Polynomial coefficients for the acos(x), |x| < .625000 +// Bi: Polynomial coefficients for the acos(x), |x| > .625000 +data8 0xBFDAAB56C01AE468 //A29 +data8 0x3FE1C470B76A5B2B //A31 +data8 0xBFDC5FF82A0C4205 //A33 +data8 0x3FC71FD88BFE93F0 //A35 +data8 0xB504F333F9DE6487, 0x00003FFF //B0 +data8 0xAAAAAAAAAAAAFC18, 0x00003FFC //A3 +data8 0x3F9F1C71BC4A7823 //A9 +data8 0x3F96E8BBAAB216B2 //A11 +data8 0x3F91C4CA1F9F8A98 //A13 +data8 0x3F8C9DDCEDEBE7A6 //A15 +data8 0x3F877784442B1516 //A17 +data8 0x3F859C0491802BA2 //A19 +data8 0x9999999998C88B8F, 0x00003FFB //A5 +data8 0x3F6BD7A9A660BF5E //A21 +data8 0x3F9FC1659340419D //A23 +data8 0xB6DB6DB798149BDF, 0x00003FFA //A7 +data8 0xBFB3EF18964D3ED3 //A25 +data8 0x3FCD285315542CF2 //A27 +data8 
0xF15BEEEFF7D2966A, 0x00003FFB //B1 +data8 0x3EF0DDA376D10FB3 //B10 +data8 0xBEB83CAFE05EBAC9 //B11 +data8 0x3F65FFB67B513644 //B4 +data8 0x3F5032FBB86A4501 //B5 +data8 0x3F392162276C7CBA //B6 +data8 0x3F2435949FD98BDF //B7 +data8 0xD93923D7FA08341C, 0x00003FF9 //B2 +data8 0x3F802995B6D90BDB //B3 +data8 0x3F10DF86B341A63F //B8 +data8 0xC90FDAA22168C235, 0x00003FFF // Pi/2 +data8 0x3EFA3EBD6B0ECB9D //B9 +data8 0x3EDE18BA080E9098 //B12 +LOCAL_OBJECT_END(acos_base_range_table) .section .text -.proc acos -.align 32 - - -acos: - -{ .mfi - alloc r32 = ar.pfs,1,6,4,0 - fma.s1 acos_tx = f8,f8,f0 - addl ASIN_Addr2 = @ltoff(acos_coeff_2_table),gp -} -{ .mfi - mov ASIN_FFFE = 0xFFFE - fnma.s1 acos_t = f8,f8,f1 - addl ASIN_Addr1 = @ltoff(acos_coeff_1_table),gp +GLOBAL_LIBM_ENTRY(acos) +acos_unnormal_back: +{ .mfi + getf.d rXBits = f8 // grab bits of input value + // set p12 = 1 if x is a NaN, denormal, or zero + fclass.m p12, p0 = f8, 0xcf + adds rSign = 1, r0 +} +{ .mfi + addl rTblAddr = @ltoff(acos_base_range_table),gp + // 1 - x = 1 - |x| for positive x + fms.s1 f1mX = f1, f1, f8 + addl rHalf = 0xFFFE, r0 // exponent of 1/2 } ;; - - -{ .mfi - setf.exp acos_1by2 = ASIN_FFFE - fmerge.s acos_abs_x = f1,f8 - nop.i 999 ;; -} - - -{ .mmf - ld8 ASIN_Addr1 = [ASIN_Addr1] - ld8 ASIN_Addr2 = [ASIN_Addr2] - fmerge.s acos_sgn_x = f8,f1 -} -;; - - -{ .mfi - nop.m 999 - fcmp.lt.s1 p11,p12 = f8, f0 - nop.i 999 ;; -} - - -{ .mfi - ldfe acos_coeff_P7 = [ASIN_Addr1],16 - fma.s1 acos_tx2 = acos_tx,acos_tx,f0 - nop.i 999 -} -{ .mfi - ldfe acos_coeff_P6 = [ASIN_Addr2],16 - fma.s1 acos_t2 = acos_t,acos_t,f0 - nop.i 999;; +{ .mfi + addl r0625 = 0x3FE4, r0 // high 16 bits of 0.625 + // set p8 = 1 if x < 0 + fcmp.lt.s1 p8, p9 = f8, f0 + shl rSign = rSign, 63 // sign bit +} +{ .mfi + // point to the beginning of the table + ld8 rTblAddr = [rTblAddr] + // 1 + x = 1 - |x| for negative x + fma.s1 f1pX = f1, f1, f8 + adds rOne = 0x3FF, r0 } - - -{ .mmf - ldfe acos_coeff_P18 = [ASIN_Addr1],16 - ldfe 
acos_coeff_P17 = [ASIN_Addr2],16 - fclass.m.unc p8,p0 = f8, 0xc3 //@qnan |@snan -} ;; - - -{ .mmf - ldfe acos_coeff_P5 = [ASIN_Addr1],16 - ldfe acos_coeff_P4 = [ASIN_Addr2],16 - frsqrta.s1 acos_y0,p0 = acos_t -} +{ .mfi + andcm rAbsXBits = rXBits, rSign // bits of |x| + fmerge.s fSignX = f8, f1 // signum(x) + shl r0625 = r0625, 48 // bits of DP representation of 0.625 +} +{ .mfb + setf.exp fHalf = rHalf // load 1/2 to FP reg + fma.s1 fXSqr = f8, f8, f0 // x^2 + // branch on special path if x is a NaN, denormal, or zero +(p12) br.cond.spnt acos_special +} ;; - - -{ .mfi - ldfe acos_coeff_P16 = [ASIN_Addr1],16 - fcmp.gt.s1 p9,p0 = acos_abs_x,f1 - nop.i 999 -} -{ .mfb - ldfe acos_coeff_P15 = [ASIN_Addr2],16 -(p8) fma.d f8 = f8,f1,f0 -(p8) br.ret.spnt b0 +{ .mfi + adds rPiBy2Ptr = 272, rTblAddr + nop.f 0 + shl rOne = rOne, 52 // bits of 1.0 +} +{ .mfi + adds rTmpPtr1 = 16, rTblAddr + nop.f 0 + // set p6 = 1 if |x| < 0.625 + cmp.lt p6, p7 = rAbsXBits, r0625 } ;; - - -{ .mmf - ldfe acos_coeff_P20 = [ASIN_Addr1],16 - ldfe acos_coeff_P19 = [ASIN_Addr2],16 - fclass.m.unc p10,p0 = f8, 0x07 //@zero -} +{ .mfi + ldfpd fA29, fA31 = [rTblAddr] // A29, fA31 + // 1 - x = 1 - |x| for positive x +(p9) fms.s1 fR = f1, f1, f8 + // point to coefficient of "near 1" polynomial +(p7) adds rTmpPtr2 = 176, rTblAddr +} +{ .mfi + ldfpd fA33, fA35 = [rTmpPtr1], 16 // A33, fA35 + // 1 + x = 1 - |x| for negative x +(p8) fma.s1 fR = f1, f1, f8 +(p6) adds rTmpPtr2 = 48, rTblAddr +} ;; - - -{ .mfi - ldfe acos_coeff_P9 = [ASIN_Addr1],16 - fma.s1 acos_t4 = acos_t2,acos_t2,f0 -(p9) mov GR_Parameter_Tag = 58 -} -{ .mfi - ldfe acos_coeff_P8 = [ASIN_Addr2],16 - fma.s1 acos_3by2 = acos_1by2,f1,f1 - nop.i 999;; +{ .mfi + ldfe fB0 = [rTmpPtr1], 16 // B0 + nop.f 0 + nop.i 0 } - - -{ .mfi - ldfe acos_coeff_P2 = [ASIN_Addr2],16 - fma.s1 acos_tx4 = acos_tx2,acos_tx2,f0 - nop.i 999 -} -{ .mfb - ldfe acos_coeff_P3 = [ASIN_Addr1],16 - fma.s1 acos_t3 = acos_t,acos_t2,f0 -(p9) br.cond.spnt __libm_error_region +{ .mib
+ adds rTmpPtr3 = 16, rTmpPtr2 + // set p10 = 1 if |x| = 1.0 + cmp.eq p10, p0 = rAbsXBits, rOne + // branch on special path for |x| = 1.0 +(p10) br.cond.spnt acos_abs_1 } ;; - - -{ .mfi - ldfe acos_coeff_P13 = [ASIN_Addr2],16 - fma.s1 acos_H0 = acos_y0,acos_1by2,f0 - nop.i 999 -} -{ .mfi - ldfe acos_coeff_P14 = [ASIN_Addr1],16 - fma.s1 acos_S0 = acos_y0,acos_t,f0 - nop.i 999;; +{ .mfi + ldfe fA3 = [rTmpPtr2], 48 // A3 or B1 + nop.f 0 + adds rTmpPtr1 = 64, rTmpPtr3 } - - -{ .mfi - ldfe acos_coeff_P11 = [ASIN_Addr2],16 - fcmp.eq.s1 p6,p0 = acos_abs_x, f1 - nop.i 999 -} -{ .mfi - ldfe acos_coeff_P12 = [ASIN_Addr1],16 - fma.s1 acos_tx3 = acos_tx,acos_tx2,f0 - nop.i 999 +{ .mib + ldfpd fA9, fA11 = [rTmpPtr3], 16 // A9, A11 or B10, B11 + // set p11 = 1 if |x| > 1.0 + cmp.gt p11, p0 = rAbsXBits, rOne + // branch on special path for |x| > 1.0 +(p11) br.cond.spnt acos_abs_gt_1 } ;; - - -{ .mfi - ldfe acos_coeff_P10 = [ASIN_Addr2],16 - fma.s1 acos_1poly_p6 = acos_tx,acos_coeff_P7,acos_coeff_P6 - nop.i 999 -} -{ .mfi - ldfe acos_coeff_P1 = [ASIN_Addr1],16 - fma.s1 acos_poly_p6 = acos_t,acos_coeff_P7,acos_coeff_P6 - nop.i 999;; +{ .mfi + ldfpd fA17, fA19 = [rTmpPtr2], 16 // A17, A19 or B6, B7 + // initial approximation of 1 / sqrt(1 - x) + frsqrta.s1 f1mXRcp, p0 = f1mX + nop.i 0 } - - -{ .mfi - ldfe acos_const_sqrt2by2 = [ASIN_Addr2],16 - fma.s1 acos_5by2 = acos_3by2,f1,f1 - nop.i 999 -} -{ .mfi - ldfe acos_coeff_P21 = [ASIN_Addr1],16 - fma.s1 acos_11by4 = acos_3by2,acos_3by2,acos_1by2 - nop.i 999;; +{ .mfi + ldfpd fA13, fA15 = [rTmpPtr3] // A13, A15 or B4, B5 + fma.s1 fXCube = fXSqr, f8, f0 // x^3 + nop.i 0 } - - -{ .mfi - ldfe acos_const_piby2 = [ASIN_Addr1],16 - fma.s1 acos_poly_p17 = acos_t,acos_coeff_P18,acos_coeff_P17 - nop.i 999 -} -{ .mfb - nop.m 999 - fma.s1 acos_3by4 = acos_3by2,acos_1by2,f0 -(p10) br.cond.spnt L(ACOS_ZERO) // Branch to short path if x=0 +;; +{ .mfi + ldfe fA5 = [rTmpPtr2], 48 // A5 or B2 + // initial approximation of 1 / sqrt(1 + x) + frsqrta.s1 
f1pXRcp, p0 = f1pX + nop.i 0 +} +{ .mfi + ldfpd fA21, fA23 = [rTmpPtr1], 16 // A21, A23 or B3, B8 + fma.s1 fXQuadr = fXSqr, fXSqr, f0 // x^4 + nop.i 0 } ;; - - -{ .mfi - nop.m 999 - fma.s1 acos_poly_p15 = acos_t,acos_coeff_P16,acos_coeff_P15 - nop.i 999 -} -{ .mfb - nop.m 999 - fnma.s1 acos_d = acos_S0,acos_H0,acos_1by2 -(p6) br.cond.spnt L(ACOS_ABS_ONE) // Branch to short path if |x|=1 +{ .mfi + ldfe fA7 = [rTmpPtr1] // A7 or Pi/2 + fma.s1 fRSqr = fR, fR, f0 // R^2 + nop.i 0 +} +{ .mfb + ldfpd fA25, fA27 = [rTmpPtr2] // A25, A27 or B9, B12 + nop.f 0 +(p6) br.cond.spnt acos_base_range; } ;; - -{ .mfi - nop.m 999 - fma.s1 acos_poly_p19 = acos_t,acos_coeff_P20,acos_coeff_P19 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_poly_p4 = acos_t,acos_coeff_P5,acos_coeff_P4 - nop.i 999;; +{ .mfi + nop.m 0 +(p9) fma.s1 fH = fHalf, f1mXRcp, f0 // H0 for x > 0 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p17 = acos_tx,acos_coeff_P18,acos_coeff_P17 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_poly_p8 = acos_t,acos_coeff_P9,acos_coeff_P8 - nop.i 999;; +{ .mfi + nop.m 0 +(p9) fma.s1 fS = f1mX, f1mXRcp, f0 // S0 for x > 0 + nop.i 0 } - - -{ .mfi - nop.m 999 - fms.s1 acos_35by8 = acos_5by2,acos_11by4,acos_5by2 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_63by8 = acos_5by2,acos_11by4,f1 - nop.i 999;; +;; +{ .mfi + nop.m 0 +(p8) fma.s1 fH = fHalf, f1pXRcp, f0 // H0 for x < 0 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_poly_p13 = acos_t,acos_coeff_P14,acos_coeff_P13 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_18by4 = acos_3by2,acos_5by2,acos_3by4 - nop.i 999;; +{ .mfi + nop.m 0 +(p8) fma.s1 fS = f1pX, f1pXRcp, f0 // S0 for x < 0 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_l1 = acos_5by2,acos_d,acos_3by2 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_d2 = acos_d,acos_d,f0 - nop.i 999;; +;; +{ .mfi + nop.m 0 + fma.s1 fRQuadr = fRSqr, fRSqr, f0 // R^4 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_poly_p15 = acos_t2,acos_poly_p17,acos_poly_p15 -
nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_T0 = acos_d,acos_S0,f0 - nop.i 999;; +;; +{ .mfi + nop.m 0 + fma.s1 fB11 = fB11, fR, fB10 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_poly_p19 = acos_t2,acos_coeff_P21,acos_poly_p19 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_poly_p4 = acos_t2,acos_poly_p6,acos_poly_p4 - nop.i 999;; +{ .mfi + nop.m 0 + fma.s1 fB1 = fB1, fR, fB0 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_d1 = acos_35by8,acos_d,f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_231by16 = acos_3by2,acos_35by8,acos_63by8 - nop.i 999;; +;; +{ .mfi + nop.m 0 + fma.s1 fB5 = fB5, fR, fB4 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_poly_p2 = acos_t,acos_coeff_P3,acos_coeff_P2 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_poly_p8 = acos_t2,acos_coeff_P10,acos_poly_p8 - nop.i 999;; +{ .mfi + nop.m 0 + fma.s1 fB7 = fB7, fR, fB6 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_poly_p11 = acos_t,acos_coeff_P12,acos_coeff_P11 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_e0 = acos_d2,acos_l1,acos_d - nop.i 999;; +;; +{ .mfi + nop.m 0 + fma.s1 fB3 = fB3, fR, fB2 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p15 = acos_tx,acos_coeff_P16,acos_coeff_P15 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_poly_p0 = acos_t,acos_coeff_P1,f1 - nop.i 999;; +;; +{ .mfi + nop.m 0 + fnma.s1 fD = fH, fS, fHalf // d0 = 1/2 - H0*S0 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p19 = acos_tx,acos_coeff_P20,acos_coeff_P19 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p4 = acos_tx,acos_coeff_P5,acos_coeff_P4 - nop.i 999;; +;; +{ .mfi + nop.m 0 + fma.s1 fR8 = fRQuadr, fRQuadr, f0 // R^8 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p8 = acos_tx,acos_coeff_P9,acos_coeff_P8 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_l2 = acos_231by16,acos_d,acos_63by8 - nop.i 999;; +{ .mfi + nop.m 0 + fma.s1 fB9 = fB9, fR, fB8 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_d3 = acos_d2,acos_d,f0 - nop.i 999 -} -{ .mfi - nop.m
999 - fma.s1 acos_T3 = acos_d2,acos_T0,f0 - nop.i 999;; +;; +{.mfi + nop.m 0 + fma.s1 fB12 = fB12, fRSqr, fB11 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_429by16 = acos_18by4,acos_11by4,acos_231by16 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_S1 = acos_e0,acos_S0,acos_S0 - nop.i 999;; +{.mfi + nop.m 0 + fma.s1 fB7 = fB7, fRSqr, fB5 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_poly_p4 = acos_t4,acos_poly_p8,acos_poly_p4 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_poly_p15 = acos_t4,acos_poly_p19,acos_poly_p15 - nop.i 999;; +;; +{.mfi + nop.m 0 + fma.s1 fB3 = fB3, fRSqr, fB1 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_poly_p0 = acos_t2,acos_poly_p2,acos_poly_p0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_poly_p11 = acos_t2,acos_poly_p13,acos_poly_p11 - nop.i 999;; +;; +{ .mfi + nop.m 0 + fma.s1 fH = fH, fD, fH // H1 = H0 + H0*d0 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_t8 = acos_t4,acos_t4,f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_e1 = acos_d2,acos_l2,acos_d1 - nop.i 999;; +{ .mfi + nop.m 0 + fma.s1 fS = fS, fD, fS // S1 = S0 + S0*d0 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p4 = acos_tx2,acos_1poly_p6,acos_1poly_p4 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p15 = acos_tx2,acos_1poly_p17,acos_1poly_p15 - nop.i 999;; +;; +{.mfi + nop.m 0 +(p9) fma.s1 fCpi = f1, f0, f0 // Cpi = 0 if x > 0 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p8 = acos_tx2,acos_coeff_P10,acos_1poly_p8 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p19 = acos_tx2,acos_coeff_P21,acos_1poly_p19 - nop.i 999;; +{ .mfi + nop.m 0 +(p8) fma.s1 fCpi = fPiBy2, f1, fPiBy2 // Cpi = Pi if x < 0 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p2 = acos_tx,acos_coeff_P3,acos_coeff_P2 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p13 = acos_tx,acos_coeff_P14,acos_coeff_P13 - nop.i 999;; +;; +{ .mfi + nop.m 0 + fma.s1 fB12 = fB12, fRSqr, fB9 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p0 
= acos_tx,acos_coeff_P1,f1 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p11 = acos_tx,acos_coeff_P12,acos_coeff_P11 - nop.i 999;; +{ .mfi + nop.m 0 + fma.s1 fB7 = fB7, fRQuadr, fB3 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_l3 = acos_429by16,acos_d,f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_z = acos_e1,acos_T3,acos_S1 - nop.i 999;; +;; +{.mfi + nop.m 0 + fnma.s1 fD = fH, fS, fHalf // d1 = 1/2 - H1*S1 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_poly_p11 = acos_t4,acos_poly_p15,acos_poly_p11 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_T6 = acos_T3,acos_d3,f0 - nop.i 999;; +{ .mfi + nop.m 0 + fnma.s1 fSignedS = fSignX, fS, f0 // -signum(x)*S1 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_t11 = acos_t8,acos_t3,f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_poly_p0 = acos_t4,acos_poly_p4,acos_poly_p0 - nop.i 999;; +;; +{ .mfi + nop.m 0 + fma.s1 fCloseTo1Pol = fB12, fR8, fB7 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p4 = acos_tx4,acos_1poly_p8,acos_1poly_p4 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p15 = acos_tx4,acos_1poly_p19,acos_1poly_p15 - nop.i 999;; +;; +{ .mfi + nop.m 0 + fma.s1 fH = fH, fD, fH // H2 = H1 + H1*d1 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p0 = acos_tx2,acos_1poly_p2,acos_1poly_p0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p11 = acos_tx2,acos_1poly_p13,acos_1poly_p11 - nop.i 999;; +{ .mfi + nop.m 0 + fma.s1 fS = fS, fD, fS // S2 = S1 + S1*d1 + nop.i 0 } - - -{ .mfi - nop.m 999 -// fcmp.le.s1 acos_pred_LEsqrt2by2,acos_pred_GTsqrt2by2 = acos_abs_x,acos_const_sqrt2by2 - fcmp.le.s1 p7,p8 = acos_abs_x,acos_const_sqrt2by2 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_tx8 = acos_tx4,acos_tx4,f0 - nop.i 999;; +;; +{ .mfi + nop.m 0 + // -signum(x)* S2 = -signum(x)*(S1 + S1*d1) + fma.s1 fSignedS = fSignedS, fD, fSignedS + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_z = acos_l3,acos_T6,acos_z - nop.i 999;; -} - -{ .mfi - nop.m 999 - fma.s1 acos_series_t = 
acos_t11,acos_poly_p11,acos_poly_p0 - nop.i 999 -} -{ .mfi - nop.m 999 -(p11) fma.s1 acos_const_add = acos_const_piby2, f1, acos_const_piby2 - nop.i 999 +;; +{.mfi + nop.m 0 + fnma.s1 fD = fH, fS, fHalf // d2 = 1/2 - H2*S2 + nop.i 0 } ;; - { .mfi - nop.m 999 -(p12) fma.s1 acos_const_add = f1,f0,f0 - nop.i 999 + nop.m 0 + // Cpi + signum(x)*PolB*S2 + fnma.s1 fCpi = fSignedS, fCloseTo1Pol, fCpi + nop.i 0 +} +{ .mfi + nop.m 0 + // signum(x)*PolB * S2 + fnma.s1 fCloseTo1Pol = fSignedS, fCloseTo1Pol, f0 + nop.i 0 } ;; - -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p0 = acos_tx4,acos_1poly_p4,acos_1poly_p0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 acos_1poly_p11 = acos_tx4,acos_1poly_p15,acos_1poly_p11 - nop.i 999;; +{ .mfb + nop.m 0 + // final result for 0.625 <= |x| < 1 + fma.d.s0 f8 = fCloseTo1Pol, fD, fCpi + // exit here for 0.625 <= |x| < 1 + br.ret.sptk b0 } - - -{ .mfi - nop.m 999 - fma.s1 acos_tx11 = acos_tx8,acos_tx3,f0 - nop.i 999;; -} - -{ .mfi - nop.m 999 -//(acos_pred_GTsqrt2by2) fnma.s1 answer2 = acos_z,acos_series_t,acos_const_piby2 -(p8) fnma.s1 answer2 = acos_z,acos_series_t,f0 - nop.i 999;; -} - -{ .mfi - nop.m 999 - fma.s1 acos_series_tx = acos_tx11,acos_1poly_p11,acos_1poly_p0 - nop.i 999;; -} - -{ .mfi - nop.m 999 -//(acos_pred_GTsqrt2by2) fnma.d f8 = acos_sgn_x,answer2,acos_const_piby2 -(p8) fnma.d f8 = acos_sgn_x,answer2,acos_const_add - nop.i 999;; -} - -{ .mfb - nop.m 999 -//(acos_pred_LEsqrt2by2) fnma.d f8 = f8,acos_series_tx,acos_const_piby2 -(p7) fnma.d f8 = f8,acos_series_tx,acos_const_piby2 - br.ret.sptk b0 ;; -} +;; -L(ACOS_ZERO): -// Here if x=0 -{ .mfb - nop.m 999 - fma.d f8 = acos_const_piby2,f1,f0 - br.ret.sptk b0 ;; -} +// here if |x| < 0.625 +.align 32 +acos_base_range: +{ .mfi + ldfe fCpi = [rPiBy2Ptr] // Pi/2 + fma.s1 fA33 = fA33, fXSqr, fA31 + nop.i 0 +} +{ .mfi + nop.m 0 + fma.s1 fA15 = fA15, fXSqr, fA13 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fA29 = fA29, fXSqr, fA27 + nop.i 0 +} +{ .mfi + nop.m 0 + fma.s1 fA25 = fA25, fXSqr, 
fA23 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fA21 = fA21, fXSqr, fA19 + nop.i 0 +} +{ .mfi + nop.m 0 + fma.s1 fA9 = fA9, fXSqr, fA7 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fA5 = fA5, fXSqr, fA3 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fA35 = fA35, fXQuadr, fA33 + nop.i 0 +} +{ .mfi + nop.m 0 + fma.s1 fA17 = fA17, fXQuadr, fA15 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fX8 = fXQuadr, fXQuadr, f0 // x^8 + nop.i 0 +} +{ .mfi + nop.m 0 + fma.s1 fA25 = fA25, fXQuadr, fA21 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fA9 = fA9, fXQuadr, fA5 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fms.s1 fCpi = fCpi, f1, f8 // Pi/2 - x + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fA35 = fA35, fXQuadr, fA29 + nop.i 0 +} +{ .mfi + nop.m 0 + fma.s1 fA17 = fA17, fXSqr, fA11 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fX16 = fX8, fX8, f0 // x^16 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fA35 = fA35, fX8, fA25 + nop.i 0 +} +{ .mfi + nop.m 0 + fma.s1 fA17 = fA17, fX8, fA9 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fBaseP = fA35, fX16, fA17 + nop.i 0 +} +;; +{ .mfb + nop.m 0 + // final result for |x| < 0.625 + fnma.d.s0 f8 = fBaseP, fXCube, fCpi + // exit here for |x| < 0.625 path + br.ret.sptk b0 +} +;; +// here if |x| = 1 +// acos(1) = 0 +// acos(-1) = Pi +.align 32 +acos_abs_1: +{ .mfi + ldfe fPiBy2 = [rPiBy2Ptr] // Pi/2 + nop.f 0 + nop.i 0 +} +;; +.pred.rel "mutex", p8, p9 +{ .mfi + nop.m 0 + // result for x = 1.0 +(p9) fma.d.s0 f8 = f1, f0, f0 // 0.0 + nop.i 0 +} +{.mfb + nop.m 0 + // result for x = -1.0 +(p8) fma.d.s0 f8 = fPiBy2, f1, fPiBy2 // Pi + // exit here for |x| = 1.0 + br.ret.sptk b0 +} +;; -L(ACOS_ABS_ONE): -.pred.rel "mutex",p11,p12 -// Here if |x|=1 -{ .mfi - nop.m 999 -(p11) fma.d f8 = acos_const_piby2,f1,acos_const_piby2 // acos(-1)=pi - nop.i 999 -} -{ .mfb - nop.m 999 -(p12) fma.d f8 = f1,f0,f0 // acos(1)=0 - br.ret.sptk b0 ;; -} +// here if x is a NaN, denormal, or zero +.align 32 +acos_special: +{ .mfi + // point to Pi/2 + adds rPiBy2Ptr = 272, rTblAddr + // set 
p12 = 1 if x is a NaN + fclass.m p12, p0 = f8, 0xc3 + nop.i 0 +} +{ .mlx + nop.m 0 + // smallest positive DP normalized number + movl rDenoBound = 0x0010000000000000 +} +;; +{ .mfi + ldfe fPiBy2 = [rPiBy2Ptr] // Pi/2 + // set p13 = 1 if x = 0.0 + fclass.m p13, p0 = f8, 0x07 + nop.i 0 +} +{ .mfi + nop.m 0 + fnorm.s1 fNormX = f8 + nop.i 0 +} +;; +{ .mfb + // load smallest normal to FP reg + setf.d fDenoBound = rDenoBound + // answer if x is a NaN +(p12) fma.d.s0 f8 = f8,f1,f0 + // exit here if x is a NaN +(p12) br.ret.spnt b0 +} +;; +{ .mfi + nop.m 0 + // absolute value of normalized x + fmerge.s fNormX = f1, fNormX + nop.i 0 +} +;; +{ .mfb + nop.m 0 + // final result for x = 0 +(p13) fma.d.s0 f8 = fPiBy2, f1, f8 + // exit here if x = 0.0 +(p13) br.ret.spnt b0 +} +;; +// if we are still here then x is denormal or unnormal +{ .mfi + nop.m 0 + // set p14 = 1 if normalized x is greater than or + // equal to the smallest normalized value + // So, if p14 is set to 1 it means that we deal with + // unnormal rather than with "true" denormal + fcmp.ge.s1 p14, p0 = fNormX, fDenoBound + nop.i 0 +} +;; +{ .mfi + nop.m 0 +(p14) fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag if x unnormal + nop.i 0 +} +{ .mfb + nop.m 0 + // normalize unnormal input +(p14) fnorm.s1 f8 = f8 + // return to the main path +(p14) br.cond.sptk acos_unnormal_back +} +;; +// if we are still here it means that input is "true" denormal +{ .mfb + nop.m 0 + // final result if x is denormal + fms.d.s0 f8 = fPiBy2, f1, f8 // Pi/2 - x + // exit here if x is denormal + br.ret.sptk b0 +} +;; +// here if |x| > 1.0 +// error handler should be called +.align 32 +acos_abs_gt_1: +{ .mfi + alloc r32 = ar.pfs, 0, 3, 4, 0 // get some registers + fmerge.s FR_X = f8,f8 + nop.i 0 +} +{ .mfb + mov GR_Parameter_TAG = 58 // error code + frcpa.s0 FR_RESULT, p0 = f0,f0 + // call error handler routine + br.cond.sptk __libm_error_region +} +;; +GLOBAL_LIBM_END(acos) -.endp acos -ASM_SIZE_DIRECTIVE(acos) -.proc __libm_error_region
-__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value - nop.f 999 + nop.f 0 .save ar.pfs,GR_SAVE_PFS mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } @@ -879,28 +839,29 @@ __libm_error_region: mov GR_SAVE_GP=gp // Save gp };; { .mmi - stfs [GR_Parameter_Y] = f1,16 // Store Parameter 2 on stack + stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 mov GR_SAVE_B0=b0 // Save b0 };; - .body - frcpa.s0 f9,p0 = f0,f0 -;; - { .mib - stfd [GR_Parameter_X] = f8 // Store Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y - nop.b 0 // Parameter 3 address + stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 } { .mib - stfd [GR_Parameter_Y] = f9,-16 // Store Parameter 3 on stack - adds r32 = 48,sp - br.call.sptk b0=__libm_error_support# // Call error handling function + stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi - ldfd f8 = [r32] // Get return result off stack + add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 +};; +{ .mmi + ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp add sp = 64,sp // Restore stack pointer mov b0 = GR_SAVE_B0 // Restore return address @@ -909,11 +870,8 @@ __libm_error_region: mov gp = GR_SAVE_GP // Restore gp mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs br.ret.sptk b0 // Return - };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) - -.type __libm_error_support,@function -.global __libm_error_support +LOCAL_LIBM_END(__libm_error_region) +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_acosf.S b/sysdeps/ia64/fpu/e_acosf.S index a3425414cf..417f5b7ddc 100644 --- 
a/sysdeps/ia64/fpu/e_acosf.S +++ b/sysdeps/ia64/fpu/e_acosf.S @@ -1,10 +1,10 @@ .file "acosf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,19 +35,23 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // History //============================================================== -// 2/02/00 Initial revision -// 6/28/00 Improved speed -// 6/31/00 Changed register allocation because of some duplicate macros +// 02/02/00 Initial version +// 06/28/00 Improved speed +// 06/31/00 Changed register allocation because of some duplicate macros // moved nan exit bundle up to gain a cycle. -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. 
-// 8/17/00 Changed predicate register macro-usage to direct predicate +// 08/17/00 Changed predicate register macro-usage to direct predicate // names due to an assembler bug. // 10/17/00 Improved speed of x=0 and x=1 paths, set D flag if x denormal. +// 03/13/01 Corrected sign of imm1 value in dep instruction. +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/06/03 Reordered header: .section, .global, .proc, .align +// 04/17/03 Moved mutex after label // Description @@ -115,7 +119,6 @@ // answer2 = sign(x) z P(t) if x>0 // = sign(x) z P(t) + pi if x<0 -#include "libm_support.h" // // Assembly macros @@ -222,42 +225,30 @@ acosf_poly_p1a = f90 // Data tables //============================================================== -#ifdef _LIBC -.rodata -#else -.data -#endif +RODATA .align 16 -acosf_coeff_1_table: -ASM_TYPE_DIRECTIVE(acosf_coeff_1_table,@object) +LOCAL_OBJECT_START(acosf_coeff_1_table) data8 0x3FC5555607DCF816 // P1 data8 0x3F9CF81AD9BAB2C6 // P4 data8 0x3FC59E0975074DF3 // P7 data8 0xBFA6F4CC2780AA1D // P6 data8 0x3FC2DD45292E93CB // P9 data8 0x3fe6a09e667f3bcd // sqrt(2)/2 -ASM_SIZE_DIRECTIVE(acosf_coeff_1_table) +LOCAL_OBJECT_END(acosf_coeff_1_table) -acosf_coeff_2_table: -ASM_TYPE_DIRECTIVE(acosf_coeff_2_table,@object) +LOCAL_OBJECT_START(acosf_coeff_2_table) data8 0x3FA6F108E31EFBA6 // P3 data8 0xBFCA31BF175D82A0 // P8 data8 0x3FA30C0337F6418B // P5 data8 0x3FB332C9266CB1F9 // P2 data8 0x3ff921fb54442d18 // pi_by_2 -ASM_SIZE_DIRECTIVE(acosf_coeff_2_table) +LOCAL_OBJECT_END(acosf_coeff_2_table) -.align 32 -.global acosf -ASM_TYPE_DIRECTIVE(acosf,@function) .section .text -.proc acosf -.align 32 - -acosf: +GLOBAL_LIBM_ENTRY(acosf) // Load the addresses of the two tables. // Then, load the coefficients and other constants. 
@@ -342,7 +333,7 @@ acosf: } { .mfb nop.m 999 -(p8) fma.s f8 = f8,f1,f0 +(p8) fma.s.s0 f8 = f8,f1,f0 (p8) br.ret.spnt b0 ;; // Exit if x=nan } @@ -350,7 +341,7 @@ acosf: { .mfb nop.m 999 fcmp.eq.s1 p6,p0 = acosf_abs_x,f1 -(p10) br.cond.spnt L(ACOSF_ZERO) ;; // Branch if x=0 +(p10) br.cond.spnt ACOSF_ZERO ;; // Branch if x=0 } { .mfi @@ -367,7 +358,7 @@ acosf: { .mfb nop.m 999 fma.s1 acosf_t4 = acosf_t2,acosf_t2,f0 -(p6) br.cond.spnt L(ACOSF_ABS_ONE) ;; // Branch if |x|=1 +(p6) br.cond.spnt ACOSF_ABS_ONE ;; // Branch if |x|=1 } { .mfi @@ -575,42 +566,40 @@ acosf: .pred.rel "mutex",p8,p7 //acosf_pred_GTsqrt2by2,acosf_pred_LEsqrt2by2 { .mfi nop.m 999 -(p8) fma.s f8 = acosf_z,acosf_Pt,acosf_sgn_x_piby2 +(p8) fma.s.s0 f8 = acosf_z,acosf_Pt,acosf_sgn_x_piby2 nop.i 999 } { .mfb nop.m 999 -(p7) fms.s f8 = acosf_const_piby2,f1,acosf_sinf1 +(p7) fms.s.s0 f8 = acosf_const_piby2,f1,acosf_sinf1 br.ret.sptk b0 ;; } -L(ACOSF_ZERO): +ACOSF_ZERO: // Here if x=0 { .mfb nop.m 999 - fma.s f8 = acosf_const_piby2,f1,f0 // acosf(0)=pi/2 + fma.s.s0 f8 = acosf_const_piby2,f1,f0 // acosf(0)=pi/2 br.ret.sptk b0 ;; } -L(ACOSF_ABS_ONE): +ACOSF_ABS_ONE: .pred.rel "mutex",p11,p12 // Here if |x|=1 { .mfi nop.m 999 -(p11) fma.s f8 = acosf_const_piby2,f1,acosf_const_piby2 // acosf(-1)=pi +(p11) fma.s.s0 f8 = acosf_const_piby2,f1,acosf_const_piby2 // acosf(-1)=pi nop.i 999 } { .mfb nop.m 999 -(p12) fma.s f8 = f1,f0,f0 // acosf(1)=0 +(p12) fma.s.s0 f8 = f1,f0,f0 // acosf(1)=0 br.ret.sptk b0 ;; } -.endp acosf -ASM_SIZE_DIRECTIVE(acosf) - +GLOBAL_LIBM_END(acosf) // Stack operations when calling error support. 
// (1) (2) @@ -642,8 +631,7 @@ ASM_SIZE_DIRECTIVE(acosf) // restore ar.pfs -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value @@ -699,8 +687,7 @@ __libm_error_region: br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_acosl.S b/sysdeps/ia64/fpu/e_acosl.S index ab1bbf41a7..daa75b18a5 100644 --- a/sysdeps/ia64/fpu/e_acosl.S +++ b/sysdeps/ia64/fpu/e_acosl.S @@ -1,10 +1,10 @@ .file "acosl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2001 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2001 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,1027 +20,2469 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial version -// 2/07/00 Modified calculation of acos_corr to correct acosl -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly -// set [the previously overwritten] GR_Parameter_RESULT. -// 12/20/00 Set denormal flag properly. 
+// 08/28/01 New version +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/06/03 Reordered header: .section, .global, .proc, .align // // API //============================================================== -// double-extended = acosl (double-extended) -// input floating point f8 -// output floating point f8 +// long double acosl(long double) // -// Registers used +// Overview of operation //============================================================== +// Background // -// predicate registers used: -// p6 -> p12 +// Implementation // -// floating-point registers used: -// f8 has input, then output -// f8 -> f15, f32 ->f99 +// For |s| in [2^{-4}, sqrt(2)/2]: +// Let t= 2^k*1.b1 b2..b6 1, where s= 2^k*1.b1 b2.. b52 +// acos(s)= pi/2-asin(t)-asin(r), where r= s*sqrt(1-t^2)-t*sqrt(1-s^2), i.e. +// r= (s-t)*sqrt(1-t^2)-t*sqrt(1-t^2)*(sqrt((1-s^2)/(1-t^2))-1) +// asin(r)-r evaluated as 9-degree polynomial (c3*r^3+c5*r^5+c7*r^7+c9*r^9) +// The 64-bit significands of sqrt(1-t^2), 1/(1-t^2) are read from the table, +// along with the high and low parts of asin(t) (stored as two double precision +// values) // -// general registers used: -// r32 -> r48 +// |s| in (sqrt(2)/2, sqrt(255/256)): +// Let t= 2^k*1.b1 b2..b6 1, where (1-s^2)*frsqrta(1-s^2)= 2^k*1.b1 b2..b6.. 
+// acos(|s|)= asin(t)-asin(r) +// acos(-|s|)=pi-asin(t)+asin(r), r= s*t-sqrt(1-s^2)*sqrt(1-t^2) +// To minimize accumulated errors, r is computed as +// r= (t*s)_s-t^2*y*z+z*y*(t^2-1+s^2)_s+z*y*(1-s^2)_s*x+z'*y*(1-s^2)*PS29+ +// +(t*s-(t*s)_s)+z*y*((t^2-1-(t^2-1+s^2)_s)+s^2)+z*y*(1-s^2-(1-s^2)_s)+ +// +ez*z'*y*(1-s^2)*(1-x), +// where y= frsqrta(1-s^2), z= (sqrt(1-t^2))_s (rounded to 24 significant bits) +// z'= sqrt(1-t^2), x= ((1-s^2)*y^2-1)/2 +// +// |s|<2^{-4}: evaluate asin(s) as 17-degree polynomial, return pi/2-asin(s) +// (or simply return pi/2-s, if|s|<2^{-64}) +// +// |s| in [sqrt(255/256), 1): acos(|s|)= asin(sqrt(1-s^2)) +// acos(-|s|)= pi-asin(sqrt(1-s^2)) +// use 17-degree polynomial for asin(sqrt(1-s^2)), +// 9-degree polynomial to evaluate sqrt(1-s^2) +// High order term is (pi)_high-(y*(1-s^2))_high, for s<0, +// or y*(1-s^2)_s, for s>0 // -// Overview of operation -//============================================================== -// There are three paths -// 1. |x| < 2^-25 ACOS_TINY -// 2. 2^-25 <= |x| < 1/4 ACOS_POLY -// 3. 1/4 <= |x| < 1 ACOS_ATAN -#include "libm_support.h" -// Assembly macros + +// Registers used //============================================================== +// f6-f15, f32-f36 +// r2-r3, r23-r23 +// p6, p7, p8, p12 +// -// f8 is input, but acos_V must be put in f8 -// when __libm_atan2_reg is called, f8 must get V -// f9 gets U when __libm_atan2_reg is called + GR_SAVE_B0= r33 + GR_SAVE_PFS= r34 + GR_SAVE_GP= r35 // This reg. 
can safely be used + GR_SAVE_SP= r36 -// __libm_atan2_reg returns -// f8 = Z_hi -// f10 = Z_lo -// f11 = s_lo + GR_Parameter_X= r37 + GR_Parameter_Y= r38 + GR_Parameter_RESULT= r39 + GR_Parameter_TAG= r40 -acos_Z_hi = f8 -acos_Z_lo = f10 -acos_S_lo = f11 + FR_X= f10 + FR_Y= f1 + FR_RESULT= f8 -// When we call __libm_atan2_reg, we must save -// the following: -acos_corr = f12 -acos_X = f13 -acos_pi_hi = f14 -acos_pi_lo = f15 -// The rest of the assembly macros - -acos_P79 = f32 -acos_P59 = f33 -acos_P39 = f34 -acos_P19 = f35 +RODATA -acos_P810 = f36 -acos_P610 = f37 -acos_P410 = f38 -acos_P210 = f39 +.align 16 -acos_A1 = f41 -acos_A2 = f42 -acos_A3 = f43 -acos_A4 = f44 -acos_A5 = f45 -acos_A6 = f46 -acos_A7 = f47 -acos_A8 = f48 -acos_A9 = f49 -acos_A10 = f50 +LOCAL_OBJECT_START(T_table) + +// stores 64-bit significand of 1/(1-t^2), 64-bit significand of sqrt(1-t^2), +// asin(t)_high (double precision), asin(t)_low (double precision) + +data8 0x80828692b71c4391, 0xff7ddcec2d87e879 +data8 0x3fb022bc0ae531a0, 0x3c9f599c7bb42af6 +data8 0x80869f0163d0b082, 0xff79cad2247914d3 +data8 0x3fb062dd26afc320, 0x3ca4eff21bd49c5c +data8 0x808ac7d5a8690705, 0xff75a89ed6b626b9 +data8 0x3fb0a2ff4a1821e0, 0x3cb7e33b58f164cc +data8 0x808f0112ad8ad2e0, 0xff7176517c2cc0cb +data8 0x3fb0e32279319d80, 0x3caee31546582c43 +data8 0x80934abba8a1da0a, 0xff6d33e949b1ed31 +data8 0x3fb12346b8101da0, 0x3cb8bfe463d087cd +data8 0x8097a4d3dbe63d8f, 0xff68e16571015c63 +data8 0x3fb1636c0ac824e0, 0x3c8870a7c5a3556f +data8 0x809c0f5e9662b3dd, 0xff647ec520bca0f0 +data8 0x3fb1a392756ed280, 0x3c964f1a927461ae +data8 0x80a08a5f33fadc66, 0xff600c07846a6830 +data8 0x3fb1e3b9fc19e580, 0x3c69eb3576d56332 +data8 0x80a515d91d71acd4, 0xff5b892bc475affa +data8 0x3fb223e2a2dfbe80, 0x3c6a4e19fd972fb6 +data8 0x80a9b1cfc86ff7cd, 0xff56f631062cf93d +data8 0x3fb2640c6dd76260, 0x3c62041160e0849e +data8 0x80ae5e46b78b0d68, 0xff5253166bc17794 +data8 0x3fb2a43761187c80, 0x3cac61651af678c0 +data8 0x80b31b417a4b756b, 
0xff4d9fdb14463dc8 +data8 0x3fb2e46380bb6160, 0x3cb06ef23eeba7a1 +data8 0x80b7e8c3ad33c369, 0xff48dc7e1baf6738 +data8 0x3fb32490d0d910c0, 0x3caa05f480b300d5 +data8 0x80bcc6d0f9c784d6, 0xff4408fe9ad13e37 +data8 0x3fb364bf558b3820, 0x3cb01e7e403aaab9 +data8 0x80c1b56d1692492d, 0xff3f255ba75f5f4e +data8 0x3fb3a4ef12ec3540, 0x3cb4fe8fcdf5f5f1 +data8 0x80c6b49bc72ec446, 0xff3a319453ebd961 +data8 0x3fb3e5200d171880, 0x3caf2dc089b2b7e2 +data8 0x80cbc460dc4e0ae8, 0xff352da7afe64ac6 +data8 0x3fb425524827a720, 0x3cb75a855e7c6053 +data8 0x80d0e4c033bee9c4, 0xff301994c79afb32 +data8 0x3fb46585c83a5e00, 0x3cb3264981c019ab +data8 0x80d615bdb87556db, 0xff2af55aa431f291 +data8 0x3fb4a5ba916c73c0, 0x3c994251d94427b5 +data8 0x80db575d6291fd8a, 0xff25c0f84bae0cb9 +data8 0x3fb4e5f0a7dbdb20, 0x3cbee2fcc4c786cb +data8 0x80e0a9a33769e535, 0xff207c6cc0ec09fd +data8 0x3fb526280fa74620, 0x3c940656e5549b91 +data8 0x80e60c93498e32cd, 0xff1b27b703a19c98 +data8 0x3fb56660ccee2740, 0x3ca7082374d7b2cd +data8 0x80eb8031b8d4052d, 0xff15c2d6105c72f8 +data8 0x3fb5a69ae3d0b520, 0x3c7c4d46e09ac68a +data8 0x80f10482b25c6c8a, 0xff104dc8e0813ed4 +data8 0x3fb5e6d6586fec20, 0x3c9aa84ffd9b4958 +data8 0x80f6998a709c7cfb, 0xff0ac88e6a4ab926 +data8 0x3fb627132eed9140, 0x3cbced2cbbbe7d16 +data8 0x80fc3f4d3b657c44, 0xff053325a0c8a2ec +data8 0x3fb667516b6c34c0, 0x3c6489c5fc68595a +data8 0x8101f5cf67ed2af8, 0xfeff8d8d73dec2bb +data8 0x3fb6a791120f33a0, 0x3cbe12acf159dfad +data8 0x8107bd1558d6291f, 0xfef9d7c4d043df29 +data8 0x3fb6e7d226fabba0, 0x3ca386d099cd0dc7 +data8 0x810d95237e38766a, 0xfef411ca9f80b5f7 +data8 0x3fb72814ae53cc20, 0x3cb9f35731e71dd6 +data8 0x81137dfe55aa0e29, 0xfeee3b9dc7eef009 +data8 0x3fb76858ac403a00, 0x3c74df3dd959141a +data8 0x811977aa6a479f0f, 0xfee8553d2cb8122c +data8 0x3fb7a89e24e6b0e0, 0x3ca6034406ee42bc +data8 0x811f822c54bd5ef8, 0xfee25ea7add46a91 +data8 0x3fb7e8e51c6eb6a0, 0x3cb82f8f78e68ed7 +data8 0x81259d88bb4ffac1, 0xfedc57dc2809fb1d +data8 0x3fb8292d9700ad60, 0x3cbebb73c0e653f9 
+data8 0x812bc9c451e5a257, 0xfed640d974eb6068 +data8 0x3fb8697798c5d620, 0x3ca2feee76a9701b +data8 0x813206e3da0f3124, 0xfed0199e6ad6b585 +data8 0x3fb8a9c325e852e0, 0x3cb9e88f2f4d0efe +data8 0x813854ec231172f9, 0xfec9e229dcf4747d +data8 0x3fb8ea1042932a00, 0x3ca5ff40d81f66fd +data8 0x813eb3e209ee858f, 0xfec39a7a9b36538b +data8 0x3fb92a5ef2f247c0, 0x3cb5e3bece4d6b07 +data8 0x814523ca796f56ce, 0xfebd428f72561efe +data8 0x3fb96aaf3b3281a0, 0x3cb7b9e499436d7c +data8 0x814ba4aa6a2d3ff9, 0xfeb6da672bd48fe4 +data8 0x3fb9ab011f819860, 0x3cb9168143cc1a7f +data8 0x81523686e29bbdd7, 0xfeb062008df81f50 +data8 0x3fb9eb54a40e3ac0, 0x3cb6e544197eb1e1 +data8 0x8158d964f7124614, 0xfea9d95a5bcbd65a +data8 0x3fba2ba9cd080800, 0x3ca9a717be8f7446 +data8 0x815f8d49c9d639e4, 0xfea34073551e1ac8 +data8 0x3fba6c009e9f9260, 0x3c741e989a60938a +data8 0x8166523a8b24f626, 0xfe9c974a367f785c +data8 0x3fbaac591d0661a0, 0x3cb2c1290107e57d +data8 0x816d283c793e0114, 0xfe95ddddb94166cb +data8 0x3fbaecb34c6ef600, 0x3c9c7d5fbaec405d +data8 0x81740f54e06d55bd, 0xfe8f142c93750c50 +data8 0x3fbb2d0f310cca00, 0x3cbc09479a9cbcfb +data8 0x817b07891b15cd5e, 0xfe883a3577e9fceb +data8 0x3fbb6d6ccf1455e0, 0x3cb9450bff4ee307 +data8 0x818210de91bba6c8, 0xfe814ff7162cf62f +data8 0x3fbbadcc2abb1180, 0x3c9227fda12a8d24 +data8 0x81892b5abb0f2bf9, 0xfe7a55701a8697b1 +data8 0x3fbbee2d48377700, 0x3cb6fad72acfe356 +data8 0x819057031bf7760e, 0xfe734a9f2dfa1810 +data8 0x3fbc2e902bc10600, 0x3cb4465b588d16ad +data8 0x819793dd479d4fbe, 0xfe6c2f82f643f68b +data8 0x3fbc6ef4d9904580, 0x3c8b9ac54823960d +data8 0x819ee1eedf76367a, 0xfe65041a15d8a92c +data8 0x3fbcaf5b55dec6a0, 0x3ca2b8d28a954db2 +data8 0x81a6413d934f7a66, 0xfe5dc8632be3477f +data8 0x3fbcefc3a4e727a0, 0x3c9380da83713ab4 +data8 0x81adb1cf21597d4b, 0xfe567c5cd44431d5 +data8 0x3fbd302dcae51600, 0x3ca995b83421756a +data8 0x81b533a9563310b8, 0xfe4f2005a78fb50f +data8 0x3fbd7099cc155180, 0x3caefa2f7a817d5f +data8 0x81bcc6d20cf4f373, 0xfe47b35c3b0caaeb +data8 
0x3fbdb107acb5ae80, 0x3cb455fc372dd026 +data8 0x81c46b4f2f3d6e68, 0xfe40365f20b316d6 +data8 0x3fbdf177710518c0, 0x3cbee3dcc5b01434 +data8 0x81cc2126b53c1144, 0xfe38a90ce72abf36 +data8 0x3fbe31e91d439620, 0x3cb3e131c950aebd +data8 0x81d3e85ea5bd8ee2, 0xfe310b6419c9c33a +data8 0x3fbe725cb5b24900, 0x3c01d3fac6029027 +data8 0x81dbc0fd1637b9c1, 0xfe295d6340932d15 +data8 0x3fbeb2d23e937300, 0x3c6304cc44aeedd1 +data8 0x81e3ab082ad5a0a4, 0xfe219f08e03580b3 +data8 0x3fbef349bc2a77e0, 0x3cac1d2d6abe9c72 +data8 0x81eba6861683cb97, 0xfe19d0537a0946e2 +data8 0x3fbf33c332bbe020, 0x3ca0909dba4e96ca +data8 0x81f3b37d1afc9979, 0xfe11f1418c0f94e2 +data8 0x3fbf743ea68d5b60, 0x3c937fc12a2a779a +data8 0x81fbd1f388d4be45, 0xfe0a01d190f09063 +data8 0x3fbfb4bc1be5c340, 0x3cbf51a504b55813 +data8 0x820401efbf87e248, 0xfe020201fff9efea +data8 0x3fbff53b970d1e80, 0x3ca625444b260078 +data8 0x82106ad2ffdca049, 0xfdf5e3940a49135e +data8 0x3fc02aff52065460, 0x3c9125d113e22a57 +data8 0x8221343d6ea1d3e2, 0xfde581a45429b0a0 +data8 0x3fc06b84f8e03220, 0x3caccf362295894b +data8 0x82324434adbf99c2, 0xfdd4de1a001fb775 +data8 0x3fc0ac0ed1fe7240, 0x3cc22f676096b0af +data8 0x82439aee8d0c7747, 0xfdc3f8e8269d1f03 +data8 0x3fc0ec9cee9e4820, 0x3cca147e2886a628 +data8 0x825538a1d0fcb2f0, 0xfdb2d201a9b1ba66 +data8 0x3fc12d2f6006f0a0, 0x3cc72b36633bc2d4 +data8 0x82671d86345c5cee, 0xfda1695934d723e7 +data8 0x3fc16dc63789de60, 0x3cb11f9c47c7b83f +data8 0x827949d46a121770, 0xfd8fbee13cbbb823 +data8 0x3fc1ae618682e620, 0x3cce1b59020cef8e +data8 0x828bbdc61eeab9ba, 0xfd7dd28bff0c9f34 +data8 0x3fc1ef015e586c40, 0x3cafec043e0225ee +data8 0x829e7995fb6de9e1, 0xfd6ba44b823ee1ca +data8 0x3fc22fa5d07b90c0, 0x3cba905409caf8e3 +data8 0x82b17d7fa5bbc982, 0xfd5934119557883a +data8 0x3fc2704eee685da0, 0x3cb5ef21838a823e +data8 0x82c4c9bfc373d276, 0xfd4681cfcfb2c161 +data8 0x3fc2b0fcc9a5f3e0, 0x3ccc7952c5e0e312 +data8 0x82d85e93fba50136, 0xfd338d7790ca0f41 +data8 0x3fc2f1af73c6ba00, 0x3cbecf5f977d1ca9 +data8 0x82ec3c3af8c76b32, 
0xfd2056f9fff97727 +data8 0x3fc33266fe6889a0, 0x3c9d329c022ebdb5 +data8 0x830062f46abf6022, 0xfd0cde480c43b327 +data8 0x3fc373237b34de60, 0x3cc95806d4928adb +data8 0x8314d30108ea35f0, 0xfcf923526c1562b2 +data8 0x3fc3b3e4fbe10520, 0x3cbc299fe7223d54 +data8 0x83298ca29434df97, 0xfce526099d0737ed +data8 0x3fc3f4ab922e4a60, 0x3cb59d8bb8fdbccc +data8 0x833e901bd93c7009, 0xfcd0e65de39f1f7c +data8 0x3fc435774fea2a60, 0x3c9ec18b43340914 +data8 0x8353ddb0b278aad8, 0xfcbc643f4b106055 +data8 0x3fc4764846ee80a0, 0x3cb90402efd87ed6 +data8 0x836975a60a70c52e, 0xfca79f9da4fab13a +data8 0x3fc4b71e8921b860, 0xbc58f23449ed6365 +data8 0x837f5841ddfa7a46, 0xfc92986889284148 +data8 0x3fc4f7fa2876fca0, 0xbc6294812bf43acd +data8 0x839585cb3e839773, 0xfc7d4e8f554ab12f +data8 0x3fc538db36ee6960, 0x3cb910b773d4c578 +data8 0x83abfe8a5466246f, 0xfc67c2012cb6fa68 +data8 0x3fc579c1c6953cc0, 0x3cc5ede909fc47fc +data8 0x83c2c2c861474d91, 0xfc51f2acf82041d5 +data8 0x3fc5baade9860880, 0x3cac63cdfc3588e5 +data8 0x83d9d2cfc2813637, 0xfc3be08165519325 +data8 0x3fc5fb9fb1e8e3a0, 0x3cbf7c8466578c29 +data8 0x83f12eebf397daac, 0xfc258b6ce6e6822f +data8 0x3fc63c9731f39d40, 0x3cb6d2a7ffca3e9e +data8 0x8408d76990b9296e, 0xfc0ef35db402af94 +data8 0x3fc67d947be9eec0, 0x3cb1980da09e6566 +data8 0x8420cc9659487cd7, 0xfbf81841c8082dc4 +data8 0x3fc6be97a21daf00, 0x3cc2ac8330e59aa5 +data8 0x84390ec132759ecb, 0xfbe0fa06e24cc390 +data8 0x3fc6ffa0b6ef05e0, 0x3ccc1a030fee56c4 +data8 0x84519e3a29df811a, 0xfbc9989a85ce0954 +data8 0x3fc740afcccca000, 0x3cc19692a5301ca6 +data8 0x846a7b527842d61b, 0xfbb1f3e9f8e45dc4 +data8 0x3fc781c4f633e2c0, 0x3cc0e98f3868a508 +data8 0x8483a65c8434b5f0, 0xfb9a0be244f4af45 +data8 0x3fc7c2e045b12140, 0x3cb2a8d309754420 +data8 0x849d1fabe4e97dd7, 0xfb81e070362116d1 +data8 0x3fc80401cddfd120, 0x3ca7a44544aa4ce6 +data8 0x84b6e795650817ea, 0xfb6971805af8411e +data8 0x3fc84529a16ac020, 0x3c9e3b709c7d6f94 +data8 0x84d0fe6f0589da92, 0xfb50beff0423a2f5 +data8 0x3fc88657d30c49e0, 0x3cc60d65a7f0a278 
+data8 0x84eb649000a73014, 0xfb37c8d84414755c +data8 0x3fc8c78c758e8e80, 0x3cc94b2ee984c2b7 +data8 0x85061a50ccd13781, 0xfb1e8ef7eeaf764b +data8 0x3fc908c79bcba900, 0x3cc8540ae794a2fe +data8 0x8521200b1fb8916e, 0xfb05114998f76a83 +data8 0x3fc94a0958ade6c0, 0x3ca127f49839fa9c +data8 0x853c7619f1618bf6, 0xfaeb4fb898b65d19 +data8 0x3fc98b51bf2ffee0, 0x3c8c9ba7a803909a +data8 0x85581cd97f45e274, 0xfad14a3004259931 +data8 0x3fc9cca0e25d4ac0, 0x3cba458e91d3bf54 +data8 0x857414a74f8446b4, 0xfab7009ab1945a54 +data8 0x3fca0df6d551fe80, 0x3cc78ea1d329d2b2 +data8 0x85905de2341dea46, 0xfa9c72e3370d2fbc +data8 0x3fca4f53ab3b6200, 0x3ccf60dca86d57ef +data8 0x85acf8ea4e423ff8, 0xfa81a0f3e9fa0ee9 +data8 0x3fca90b777580aa0, 0x3ca4c4e2ec8a867e +data8 0x85c9e62111a92e7d, 0xfa668ab6dec711b1 +data8 0x3fcad2224cf814e0, 0x3c303de5980d071c +data8 0x85e725e947fbee97, 0xfa4b3015e883dbfe +data8 0x3fcb13943f7d5f80, 0x3cc29d4eefa5cb1e +data8 0x8604b8a7144cd054, 0xfa2f90fa9883a543 +data8 0x3fcb550d625bc6a0, 0x3c9e01a746152daf +data8 0x86229ebff69e2415, 0xfa13ad4e3dfbe1c1 +data8 0x3fcb968dc9195ea0, 0x3ccc091bd73ae518 +data8 0x8640d89acf78858c, 0xf9f784f9e5a1877b +data8 0x3fcbd815874eb160, 0x3cb5f4b89875e187 +data8 0x865f669fe390c7f5, 0xf9db17e65944eacf +data8 0x3fcc19a4b0a6f9c0, 0x3cc5c0bc2b0bbf14 +data8 0x867e4938df7dc45f, 0xf9be65fc1f6c2e6e +data8 0x3fcc5b3b58e061e0, 0x3cc1ca70df8f57e7 +data8 0x869d80d0db7e4c0c, 0xf9a16f237aec427a +data8 0x3fcc9cd993cc4040, 0x3cbae93acc85eccf +data8 0x86bd0dd45f4f8265, 0xf98433446a806e70 +data8 0x3fccde7f754f5660, 0x3cb22f70e64568d0 +data8 0x86dcf0b16613e37a, 0xf966b246a8606170 +data8 0x3fcd202d11620fa0, 0x3c962030e5d4c849 +data8 0x86fd29d7624b3d5d, 0xf948ec11a9d4c45b +data8 0x3fcd61e27c10c0a0, 0x3cc7083c91d59217 +data8 0x871db9b741dbe44a, 0xf92ae08c9eca4941 +data8 0x3fcda39fc97be7c0, 0x3cc9258579e57211 +data8 0x873ea0c3722d6af2, 0xf90c8f9e71633363 +data8 0x3fcde5650dd86d60, 0x3ca4755a9ea582a9 +data8 0x875fdf6fe45529e8, 0xf8edf92dc5875319 +data8 
0x3fce27325d6fe520, 0x3cbc1e2b6c1954f9 +data8 0x878176321154e2bc, 0xf8cf1d20f87270b8 +data8 0x3fce6907cca0d060, 0x3cb6ca4804750830 +data8 0x87a36580fe6bccf5, 0xf8affb5e20412199 +data8 0x3fceaae56fdee040, 0x3cad6b310d6fd46c +data8 0x87c5add5417a5cb9, 0xf89093cb0b7c0233 +data8 0x3fceeccb5bb33900, 0x3cc16e99cedadb20 +data8 0x87e84fa9057914ca, 0xf870e64d40a15036 +data8 0x3fcf2eb9a4bcb600, 0x3cc75ee47c8b09e9 +data8 0x880b4b780f02b709, 0xf850f2c9fdacdf78 +data8 0x3fcf70b05fb02e20, 0x3cad6350d379f41a +data8 0x882ea1bfc0f228ac, 0xf830b926379e6465 +data8 0x3fcfb2afa158b8a0, 0x3cce0ccd9f829985 +data8 0x885252ff21146108, 0xf810394699fe0e8e +data8 0x3fcff4b77e97f3e0, 0x3c9b30faa7a4c703 +data8 0x88765fb6dceebbb3, 0xf7ef730f865f6df0 +data8 0x3fd01b6406332540, 0x3cdc5772c9e0b9bd +data8 0x88ad1f69be2cc730, 0xf7bdc59bc9cfbd97 +data8 0x3fd04cf8ad203480, 0x3caeef44fe21a74a +data8 0x88f763f70ae2245e, 0xf77a91c868a9c54e +data8 0x3fd08f23ce0162a0, 0x3cd6290ab3fe5889 +data8 0x89431fc7bc0c2910, 0xf73642973c91298e +data8 0x3fd0d1610f0c1ec0, 0x3cc67401a01f08cf +data8 0x8990573407c7738e, 0xf6f0d71d1d7a2dd6 +data8 0x3fd113b0c65d88c0, 0x3cc7aa4020fe546f +data8 0x89df0eb108594653, 0xf6aa4e6a05cfdef2 +data8 0x3fd156134ada6fe0, 0x3cc87369da09600c +data8 0x8a2f4ad16e0ed78a, 0xf662a78900c35249 +data8 0x3fd19888f43427a0, 0x3cc62b220f38e49c +data8 0x8a811046373e0819, 0xf619e180181d97cc +data8 0x3fd1db121aed7720, 0x3ca3ede7490b52f4 +data8 0x8ad463df6ea0fa2c, 0xf5cffb504190f9a2 +data8 0x3fd21daf185fa360, 0x3caafad98c1d6c1b +data8 0x8b294a8cf0488daf, 0xf584f3f54b8604e6 +data8 0x3fd2606046bf95a0, 0x3cdb2d704eeb08fa +data8 0x8b7fc95f35647757, 0xf538ca65c960b582 +data8 0x3fd2a32601231ec0, 0x3cc661619fa2f126 +data8 0x8bd7e588272276f8, 0xf4eb7d92ff39fccb +data8 0x3fd2e600a3865760, 0x3c8a2a36a99aca4a +data8 0x8c31a45bf8e9255e, 0xf49d0c68cd09b689 +data8 0x3fd328f08ad12000, 0x3cb9efaf1d7ab552 +data8 0x8c8d0b520a35eb18, 0xf44d75cd993cfad2 +data8 0x3fd36bf614dcc040, 0x3ccacbb590bef70d +data8 0x8cea2005d068f23d, 
0xf3fcb8a23ab4942b +data8 0x3fd3af11a079a6c0, 0x3cd9775872cf037d +data8 0x8d48e837c8cd5027, 0xf3aad3c1e2273908 +data8 0x3fd3f2438d754b40, 0x3ca03304f667109a +data8 0x8da969ce732f3ac7, 0xf357c60202e2fd7e +data8 0x3fd4358c3ca032e0, 0x3caecf2504ff1a9d +data8 0x8e0baad75555e361, 0xf3038e323ae9463a +data8 0x3fd478ec0fd419c0, 0x3cc64bdc3d703971 +data8 0x8e6fb18807ba877e, 0xf2ae2b1c3a6057f7 +data8 0x3fd4bc6369fa40e0, 0x3cbb7122ec245cf2 +data8 0x8ed5843f4bda74d5, 0xf2579b83aa556f0c +data8 0x3fd4fff2af11e2c0, 0x3c9cfa2dc792d394 +data8 0x8f3d29862c861fef, 0xf1ffde2612ca1909 +data8 0x3fd5439a4436d000, 0x3cc38d46d310526b +data8 0x8fa6a81128940b2d, 0xf1a6f1bac0075669 +data8 0x3fd5875a8fa83520, 0x3cd8bf59b8153f8a +data8 0x901206c1686317a6, 0xf14cd4f2a730d480 +data8 0x3fd5cb33f8cf8ac0, 0x3c9502b5c4d0e431 +data8 0x907f4ca5fe9cf739, 0xf0f186784a125726 +data8 0x3fd60f26e847b120, 0x3cc8a1a5e0acaa33 +data8 0x90ee80fd34aeda5e, 0xf09504ef9a212f18 +data8 0x3fd65333c7e43aa0, 0x3cae5b029cb1f26e +data8 0x915fab35e37421c6, 0xf0374ef5daab5c45 +data8 0x3fd6975b02b8e360, 0x3cd5aa1c280c45e6 +data8 0x91d2d2f0d894d73c, 0xefd86321822dbb51 +data8 0x3fd6db9d05213b20, 0x3cbecf2c093ccd8b +data8 0x9248000249200009, 0xef7840021aca5a72 +data8 0x3fd71ffa3cc87fc0, 0x3cb8d273f08d00d9 +data8 0x92bf3a7351f081d2, 0xef16e42021d7cbd5 +data8 0x3fd7647318b1ad20, 0x3cbce099d79cdc46 +data8 0x93388a8386725713, 0xeeb44dfce6820283 +data8 0x3fd7a908093fc1e0, 0x3ccb033ec17a30d9 +data8 0x93b3f8aa8e653812, 0xee507c126774fa45 +data8 0x3fd7edb9803e3c20, 0x3cc10aedb48671eb +data8 0x94318d99d341ade4, 0xedeb6cd32f891afb +data8 0x3fd83287f0e9cf80, 0x3c994c0c1505cd2a +data8 0x94b1523e3dedc630, 0xed851eaa3168f43c +data8 0x3fd87773cff956e0, 0x3cda3b7bce6a6b16 +data8 0x95334fc20577563f, 0xed1d8ffaa2279669 +data8 0x3fd8bc7d93a70440, 0x3cd4922edc792ce2 +data8 0x95b78f8e8f92f274, 0xecb4bf1fd2be72da +data8 0x3fd901a5b3b9cf40, 0x3cd3fea1b00f9d0d +data8 0x963e1b4e63a87c3f, 0xec4aaa6d08694cc1 +data8 0x3fd946eca98f2700, 0x3cdba4032d968ff1 
+data8 0x96c6fcef314074fc, 0xebdf502d53d65fea +data8 0x3fd98c52f024e800, 0x3cbe7be1ab8c95c9 +data8 0x97523ea3eab028b2, 0xeb72aea36720793e +data8 0x3fd9d1d904239860, 0x3cd72d08a6a22b70 +data8 0x97dfeae6f4ee4a9a, 0xeb04c4096a884e94 +data8 0x3fda177f63e8ef00, 0x3cd818c3c1ebfac7 +data8 0x98700c7c6d85d119, 0xea958e90cfe1efd7 +data8 0x3fda5d468f92a540, 0x3cdf45fbfaa080fe +data8 0x9902ae7487a9caa1, 0xea250c6224aab21a +data8 0x3fdaa32f090998e0, 0x3cd715a9353cede4 +data8 0x9997dc2e017a9550, 0xe9b33b9ce2bb7638 +data8 0x3fdae939540d3f00, 0x3cc545c014943439 +data8 0x9a2fa158b29b649b, 0xe9401a573f8aa706 +data8 0x3fdb2f65f63f6c60, 0x3cd4a63c2f2ca8e2 +data8 0x9aca09f835466186, 0xe8cba69df9f0bf35 +data8 0x3fdb75b5773075e0, 0x3cda310ce1b217ec +data8 0x9b672266ab1e0136, 0xe855de74266193d4 +data8 0x3fdbbc28606babc0, 0x3cdc84b75cca6c44 +data8 0x9c06f7579f0b7bd5, 0xe7debfd2f98c060b +data8 0x3fdc02bf3d843420, 0x3cd225d967ffb922 +data8 0x9ca995db058cabdc, 0xe76648a991511c6e +data8 0x3fdc497a9c224780, 0x3cde08101c5b825b +data8 0x9d4f0b605ce71e88, 0xe6ec76dcbc02d9a7 +data8 0x3fdc905b0c10d420, 0x3cb1abbaa3edf120 +data8 0x9df765b9eecad5e6, 0xe6714846bdda7318 +data8 0x3fdcd7611f4b8a00, 0x3cbf6217ae80aadf +data8 0x9ea2b320350540fe, 0xe5f4bab71494cd6b +data8 0x3fdd1e8d6a0d56c0, 0x3cb726e048cc235c +data8 0x9f51023562fc5676, 0xe576cbf239235ecb +data8 0x3fdd65e082df5260, 0x3cd9e66872bd5250 +data8 0xa002620915c2a2f6, 0xe4f779b15f5ec5a7 +data8 0x3fddad5b02a82420, 0x3c89743b0b57534b +data8 0xa0b6e21c2caf9992, 0xe476c1a233a7873e +data8 0x3fddf4fd84bbe160, 0x3cbf7adea9ee3338 +data8 0xa16e9264cc83a6b2, 0xe3f4a16696608191 +data8 0x3fde3cc8a6ec6ee0, 0x3cce46f5a51f49c6 +data8 0xa22983528f3d8d49, 0xe3711694552da8a8 +data8 0x3fde84bd099a6600, 0x3cdc78f6490a2d31 +data8 0xa2e7c5d2e2e69460, 0xe2ec1eb4e1e0a5fb +data8 0x3fdeccdb4fc685c0, 0x3cdd3aedb56a4825 +data8 0xa3a96b5599bd2532, 0xe265b74506fbe1c9 +data8 0x3fdf15241f23b3e0, 0x3cd440f3c6d65f65 +data8 0xa46e85d1ae49d7de, 0xe1ddddb499b3606f +data8 
0x3fdf5d98202994a0, 0x3cd6c44bd3fb745a +data8 0xa53727ca3e11b99e, 0xe1548f662951b00d +data8 0x3fdfa637fe27bf60, 0x3ca8ad1cd33054dd +data8 0xa6036453bdc20186, 0xe0c9c9aeabe5e481 +data8 0x3fdfef0467599580, 0x3cc0f1ac0685d78a +data8 0xa6d34f1969dda338, 0xe03d89d5281e4f81 +data8 0x3fe01bff067d6220, 0x3cc0731e8a9ef057 +data8 0xa7a6fc62f7246ff3, 0xdfafcd125c323f54 +data8 0x3fe04092d1ae3b40, 0x3ccabda24b59906d +data8 0xa87e811a861df9b9, 0xdf20909061bb9760 +data8 0x3fe0653df0fd9fc0, 0x3ce94c8dcc722278 +data8 0xa959f2d2dd687200, 0xde8fd16a4e5f88bd +data8 0x3fe08a00c1cae320, 0x3ce6b888bb60a274 +data8 0xaa3967cdeea58bda, 0xddfd8cabd1240d22 +data8 0x3fe0aedba3221c00, 0x3ced5941cd486e46 +data8 0xab904fd587263c84, 0xdd1f4472e1cf64ed +data8 0x3fe0e651e85229c0, 0x3cdb6701042299b1 +data8 0xad686d44dd5a74bb, 0xdbf173e1f6b46e92 +data8 0x3fe1309cbf4cdb20, 0x3cbf1be7bb3f0ec5 +data8 0xaf524e15640ebee4, 0xdabd54896f1029f6 +data8 0x3fe17b4ee1641300, 0x3ce81dd055b792f1 +data8 0xb14eca24ef7db3fa, 0xd982cb9ae2f47e41 +data8 0x3fe1c66b9ffd6660, 0x3cd98ea31eb5ddc7 +data8 0xb35ec807669920ce, 0xd841bd1b8291d0b6 +data8 0x3fe211f66db3a5a0, 0x3ca480c35a27b4a2 +data8 0xb5833e4755e04dd1, 0xd6fa0bd3150b6930 +data8 0x3fe25df2e05b6c40, 0x3ca4bc324287a351 +data8 0xb7bd34c8000b7bd3, 0xd5ab9939a7d23aa1 +data8 0x3fe2aa64b32f7780, 0x3cba67314933077c +data8 0xba0dc64d126cc135, 0xd4564563ce924481 +data8 0x3fe2f74fc9289ac0, 0x3cec1a1dc0efc5ec +data8 0xbc76222cbbfa74a6, 0xd2f9eeed501125a8 +data8 0x3fe344b82f859ac0, 0x3ceeef218de413ac +data8 0xbef78e31985291a9, 0xd19672e2182f78be +data8 0x3fe392a22087b7e0, 0x3cd2619ba201204c +data8 0xc19368b2b0629572, 0xd02baca5427e436a +data8 0x3fe3e11206694520, 0x3cb5d0b3143fe689 +data8 0xc44b2ae8c6733e51, 0xceb975d60b6eae5d +data8 0x3fe4300c7e945020, 0x3cbd367143da6582 +data8 0xc7206b894212dfef, 0xcd3fa6326ff0ac9a +data8 0x3fe47f965d201d60, 0x3ce797c7a4ec1d63 +data8 0xca14e1b0622de526, 0xcbbe13773c3c5338 +data8 0x3fe4cfb4b09d1a20, 0x3cedfadb5347143c +data8 0xcd2a6825eae65f82, 
0xca34913d425a5ae9 +data8 0x3fe5206cc637e000, 0x3ce2798b38e54193 +data8 0xd06301095e1351ee, 0xc8a2f0d3679c08c0 +data8 0x3fe571c42e3d0be0, 0x3ccd7cb9c6c2ca68 +data8 0xd3c0d9f50057adda, 0xc70901152d59d16b +data8 0x3fe5c3c0c108f940, 0x3ceb6c13563180ab +data8 0xd74650a98cc14789, 0xc5668e3d4cbf8828 +data8 0x3fe61668a46ffa80, 0x3caa9092e9e3c0e5 +data8 0xdaf5f8579dcc8f8f, 0xc3bb61b3eed42d02 +data8 0x3fe669c251ad69e0, 0x3cccf896ef3b4fee +data8 0xded29f9f9a6171b4, 0xc20741d7f8e8e8af +data8 0x3fe6bdd49bea05c0, 0x3cdc6b29937c575d +data8 0xe2df5765854ccdb0, 0xc049f1c2d1b8014b +data8 0x3fe712a6b76c6e80, 0x3ce1ddc6f2922321 +data8 0xe71f7a9b94fcb4c3, 0xbe833105ec291e91 +data8 0x3fe76840418978a0, 0x3ccda46e85432c3d +data8 0xeb96b72d3374b91e, 0xbcb2bb61493b28b3 +data8 0x3fe7bea9496d5a40, 0x3ce37b42ec6e17d3 +data8 0xf049183c3f53c39b, 0xbad848720223d3a8 +data8 0x3fe815ea59dab0a0, 0x3cb03ad41bfc415b +data8 0xf53b11ec7f415f15, 0xb8f38b57c53c9c48 +data8 0x3fe86e0c84010760, 0x3cc03bfcfb17fe1f +data8 0xfa718f05adbf2c33, 0xb70432500286b185 +data8 0x3fe8c7196b9225c0, 0x3ced99fcc6866ba9 +data8 0xfff200c3f5489608, 0xb509e6454dca33cc +data8 0x3fe9211b54441080, 0x3cb789cb53515688 +// The following table entries are not used +//data8 0x82e138a0fac48700, 0xb3044a513a8e6132 +//data8 0x3fe97c1d30f5b7c0, 0x3ce1eb765612d1d0 +//data8 0x85f4cc7fc670d021, 0xb0f2fb2ea6cbbc88 +//data8 0x3fe9d82ab4b5fde0, 0x3ced3fe6f27e8039 +//data8 0x89377c1387d5b908, 0xaed58e9a09014d5c +//data8 0x3fea355065f87fa0, 0x3cbef481d25f5b58 +//data8 0x8cad7a2c98dec333, 0xacab929ce114d451 +//data8 0x3fea939bb451e2a0, 0x3c8e92b4fbf4560f +//data8 0x905b7dfc99583025, 0xaa748cc0dbbbc0ec +//data8 0x3feaf31b11270220, 0x3cdced8c61bd7bd5 +//data8 0x9446d8191f80dd42, 0xa82ff92687235baf +//data8 0x3feb53de0bcffc20, 0x3cbe1722fb47509e +//data8 0x98758ba086e4000a, 0xa5dd497a9c184f58 +//data8 0x3febb5f571cb0560, 0x3ce0c7774329a613 +//data8 0x9cee6c7bf18e4e24, 0xa37be3c3cd1de51b +//data8 0x3fec197373bc7be0, 0x3ce08ebdb55c3177 +//data8 
0xa1b944000a1b9440, 0xa10b2101b4f27e03 +//data8 0x3fec7e6bd023da60, 0x3ce5fc5fd4995959 +//data8 0xa6defd8ba04d3e38, 0x9e8a4b93cad088ec +//data8 0x3fece4f404e29b20, 0x3cea3413401132b5 +//data8 0xac69dd408a10c62d, 0x9bf89d5d17ddae8c +//data8 0x3fed4d2388f63600, 0x3cd5a7fb0d1d4276 +//data8 0xb265c39cbd80f97a, 0x99553d969fec7beb +//data8 0x3fedb714101e0a00, 0x3cdbda21f01193f2 +//data8 0xb8e081a16ae4ae73, 0x969f3e3ed2a0516c +//data8 0x3fee22e1da97bb00, 0x3ce7231177f85f71 +//data8 0xbfea427678945732, 0x93d5990f9ee787af +//data8 0x3fee90ac13b18220, 0x3ce3c8a5453363a5 +//data8 0xc79611399b8c90c5, 0x90f72bde80febc31 +//data8 0x3fef009542b712e0, 0x3ce218fd79e8cb56 +//data8 0xcffa8425040624d7, 0x8e02b4418574ebed +//data8 0x3fef72c3d2c57520, 0x3cd32a717f82203f +//data8 0xd93299cddcf9cf23, 0x8af6ca48e9c44024 +//data8 0x3fefe762b77744c0, 0x3ce53478a6bbcf94 +//data8 0xe35eda760af69ad9, 0x87d1da0d7f45678b +//data8 0x3ff02f511b223c00, 0x3ced6e11782c28fc +//data8 0xeea6d733421da0a6, 0x84921bbe64ae029a +//data8 0x3ff06c5c6f8ce9c0, 0x3ce71fc71c1ffc02 +//data8 0xfb3b2c73fc6195cc, 0x813589ba3a5651b6 +//data8 0x3ff0aaf2613700a0, 0x3cf2a72d2fd94ef3 +//data8 0x84ac1fcec4203245, 0xfb73a828893df19e +//data8 0x3ff0eb367c3fd600, 0x3cf8054c158610de +//data8 0x8ca50621110c60e6, 0xf438a14c158d867c +//data8 0x3ff12d51caa6b580, 0x3ce6bce9748739b6 +//data8 0x95b8c2062d6f8161, 0xecb3ccdd37b369da +//data8 0x3ff1717418520340, 0x3ca5c2732533177c +//data8 0xa0262917caab4ad1, 0xe4dde4ddc81fd119 +//data8 0x3ff1b7d59dd40ba0, 0x3cc4c7c98e870ff5 +//data8 0xac402c688b72f3f4, 0xdcae469be46d4c8d +//data8 0x3ff200b93cc5a540, 0x3c8dd6dc1bfe865a +//data8 0xba76968b9eabd9ab, 0xd41a8f3df1115f7f +//data8 0x3ff24c6f8f6affa0, 0x3cf1acb6d2a7eff7 +//data8 0xcb63c87c23a71dc5, 0xcb161074c17f54ec +//data8 0x3ff29b5b338b7c80, 0x3ce9b5845f6ec746 +//data8 0xdfe323b8653af367, 0xc19107d99ab27e42 +//data8 0x3ff2edf6fac7f5a0, 0x3cf77f961925fa02 +//data8 0xf93746caaba3e1f1, 0xb777744a9df03bff +//data8 0x3ff344df237486c0, 
0x3cf6ddf5f6ddda43 +//data8 0x8ca77052f6c340f0, 0xacaf476f13806648 +//data8 0x3ff3a0dfa4bb4ae0, 0x3cfee01bbd761bff +//data8 0xa1a48604a81d5c62, 0xa11575d30c0aae50 +//data8 0x3ff4030b73c55360, 0x3cf1cf0e0324d37c +//data8 0xbe45074b05579024, 0x9478e362a07dd287 +//data8 0x3ff46ce4c738c4e0, 0x3ce3179555367d12 +//data8 0xe7a08b5693d214ec, 0x8690e3575b8a7c3b +//data8 0x3ff4e0a887c40a80, 0x3cfbd5d46bfefe69 +//data8 0x94503d69396d91c7, 0xedd2ce885ff04028 +//data8 0x3ff561ebd9c18cc0, 0x3cf331bd176b233b +//data8 0xced1d96c5bb209e6, 0xc965278083808702 +//data8 0x3ff5f71d7ff42c80, 0x3ce3301cc0b5a48c +//data8 0xabac2cee0fc24e20, 0x9c4eb1136094cbbd +//data8 0x3ff6ae4c63222720, 0x3cf5ff46874ee51e +//data8 0x8040201008040201, 0xb4d7ac4d9acb1bf4 +//data8 0x3ff7b7d33b928c40, 0x3cfacdee584023bb +LOCAL_OBJECT_END(T_table) -acos_X2 = f51 -acos_X4 = f52 -acos_B = f53 -acos_Bb = f54 -acos_A = f55 -acos_Aa = f56 -acos_1mA = f57 +.align 16 -acos_W = f58 -acos_Ww = f59 +LOCAL_OBJECT_START(poly_coeffs) + // C_3 +data8 0xaaaaaaaaaaaaaaab, 0x0000000000003ffc + // C_5 +data8 0x999999999999999a, 0x0000000000003ffb + // C_7, C_9 +data8 0x3fa6db6db6db6db7, 0x3f9f1c71c71c71c8 + // pi/2 (low, high) +data8 0x3C91A62633145C07, 0x3FF921FB54442D18 + // C_11, C_13 +data8 0x3f96e8ba2e8ba2e9, 0x3f91c4ec4ec4ec4e + // C_15, C_17 +data8 0x3f8c99999999999a, 0x3f87a87878787223 + // pi (low, high) +data8 0x3CA1A62633145C07, 0x400921FB54442D18 +LOCAL_OBJECT_END(poly_coeffs) + + +R_DBL_S = r21 +R_EXP0 = r22 +R_EXP = r15 +R_SGNMASK = r23 +R_TMP = r24 +R_TMP2 = r25 +R_INDEX = r26 +R_TMP3 = r27 +R_TMP03 = r27 +R_TMP4 = r28 +R_TMP5 = r23 +R_TMP6 = r22 +R_TMP7 = r21 +R_T = r29 +R_BIAS = r20 + +F_T = f6 +F_1S2 = f7 +F_1S2_S = f9 +F_INV_1T2 = f10 +F_SQRT_1T2 = f11 +F_S2T2 = f12 +F_X = f13 +F_D = f14 +F_2M64 = f15 + +F_CS2 = f32 +F_CS3 = f33 +F_CS4 = f34 +F_CS5 = f35 +F_CS6 = f36 +F_CS7 = f37 +F_CS8 = f38 +F_CS9 = f39 +F_S23 = f40 +F_S45 = f41 +F_S67 = f42 +F_S89 = f43 +F_S25 = f44 +F_S69 = f45 +F_S29 = f46 +F_X2 = f47 
+F_X4 = f48 +F_TSQRT = f49 +F_DTX = f50 +F_R = f51 +F_R2 = f52 +F_R3 = f53 +F_R4 = f54 + +F_C3 = f55 +F_C5 = f56 +F_C7 = f57 +F_C9 = f58 +F_P79 = f59 +F_P35 = f60 +F_P39 = f61 + +F_ATHI = f62 +F_ATLO = f63 + +F_T1 = f64 +F_Y = f65 +F_Y2 = f66 +F_ANDMASK = f67 +F_ORMASK = f68 +F_S = f69 +F_05 = f70 +F_SQRT_1S2 = f71 +F_DS = f72 +F_Z = f73 +F_1T2 = f74 +F_DZ = f75 +F_ZE = f76 +F_YZ = f77 +F_Y1S2 = f78 +F_Y1S2X = f79 +F_1X = f80 +F_ST = f81 +F_1T2_ST = f82 +F_TSS = f83 +F_Y1S2X2 = f84 +F_DZ_TERM = f85 +F_DTS = f86 +F_DS2X = f87 +F_T2 = f88 +F_ZY1S2S = f89 +F_Y1S2_1X = f90 +F_TS = f91 +F_PI2_LO = f92 +F_PI2_HI = f93 +F_S19 = f94 +F_INV1T2_2 = f95 +F_CORR = f96 +F_DZ0 = f97 + +F_C11 = f98 +F_C13 = f99 +F_C15 = f100 +F_C17 = f101 +F_P1113 = f102 +F_P1517 = f103 +F_P1117 = f104 +F_P317 = f105 +F_R8 = f106 +F_HI = f107 +F_1S2_HI = f108 +F_DS2 = f109 +F_Y2_2 = f110 +//F_S2 = f111 +//F_S_DS2 = f112 +F_S_1S2S = f113 +F_XL = f114 +F_2M128 = f115 +F_1AS = f116 +F_AS = f117 -acos_y0 = f60 -acos_y1 = f61 -acos_y2 = f62 -acos_H = f63 -acos_Hh = f64 -acos_t1 = f65 -acos_t2 = f66 -acos_t3 = f67 -acos_t4 = f68 -acos_t5 = f69 +.section .text +GLOBAL_LIBM_ENTRY(acosl) -acos_Pseries = f70 -acos_NORM_f8 = f71 -acos_ABS_NORM_f8 = f72 +{.mfi + // get exponent, mantissa (rounded to double precision) of s + getf.d R_DBL_S = f8 + // 1-s^2 + fnma.s1 F_1S2 = f8, f8, f1 + // r2 = pointer to T_table + addl r2 = @ltoff(T_table), gp +} -acos_2 = f73 -acos_P1P2 = f74 -acos_HALF = f75 -acos_U = f76 +{.mfi + // sign mask + mov R_SGNMASK = 0x20000 + nop.f 0 + // bias-63-1 + mov R_TMP03 = 0xffff-64;; +} -acos_1mB = f77 -acos_V = f78 -acos_S = f79 -acos_BmUU = f80 -acos_BmUUpb = f81 -acos_2U = f82 -acos_1d2U = f83 +{.mfi + // get exponent of s + getf.exp R_EXP = f8 + nop.f 0 + // R_TMP4 = 2^45 + shl R_TMP4 = R_SGNMASK, 45-17 +} -acos_Dd = f84 +{.mlx + // load bias-4 + mov R_TMP = 0xffff-4 + // load RU(sqrt(2)/2) to integer register (in double format, shifted left by 1) + movl R_TMP2 = 
0x7fcd413cccfe779a;; +} -acos_pi_by_2_hi = f85 -acos_pi_by_2_lo = f86 -acos_xmpi_by_2_lo = f87 -acos_xPmw = f88 -acos_Uu = f89 -acos_AmVV = f90 -acos_AmVVpa = f91 +{.mfi + // load 2^{-64} in FP register + setf.exp F_2M64 = R_TMP03 + nop.f 0 + // index = (0x7-exponent)|b1 b2.. b6 + extr.u R_INDEX = R_DBL_S, 46, 9 +} -acos_2V = f92 -acos_1d2V = f93 -acos_Vv = f94 +{.mfi + // get t = sign|exponent|b1 b2.. b6 1 x.. x + or R_T = R_DBL_S, R_TMP4 + nop.f 0 + // R_TMP4 = 2^45-1 + sub R_TMP4 = R_TMP4, r0, 1;; +} -acos_Vu = f95 -acos_Uv = f96 - -acos_2_Z_hi = f97 -acos_s_lo_Z_lo = f98 -acos_result_lo = f99 - -acos_Z_hi = f8 -acos_Z_lo = f10 -acos_s_lo = f11 - -acos_GR_17_ones = r33 -acos_GR_16_ones = r34 -acos_GR_signexp_f8 = r35 -acos_GR_exp = r36 -acos_GR_true_exp = r37 -acos_GR_fffe = r38 - -GR_SAVE_PFS = r43 -GR_SAVE_B0 = r39 -GR_SAVE_GP = r41 - -// r40 is address of table of coefficients -// r42 - -GR_Parameter_X = r44 -GR_Parameter_Y = r45 -GR_Parameter_RESULT = r46 -GR_Parameter_TAG = r47 - - -// 2^-40: -// A true exponent of -40 is -// : -40 + register_bias -// : -28 + ffff = ffd7 -// A true exponent of 1 is -// : 1 + register_bias -// : 1 + ffff = 10000 +{.mfi + // get t = sign|exponent|b1 b2.. b6 1 0.. 
0 + andcm R_T = R_T, R_TMP4 + nop.f 0 + // eliminate sign from R_DBL_S (shift left by 1) + shl R_TMP3 = R_DBL_S, 1 +} -// Data tables -//============================================================== +{.mfi + // R_BIAS = 3*2^6 + mov R_BIAS = 0xc0 + nop.f 0 + // eliminate sign from R_EXP + andcm R_EXP0 = R_EXP, R_SGNMASK;; +} -#ifdef _LIBC -.rodata -#else -.data -#endif -.align 16 -acos_coefficients: -ASM_TYPE_DIRECTIVE(acos_coefficients,@object) -data8 0xc90fdaa22168c234, 0x00003FFF // pi_by_2_hi -data8 0xc4c6628b80dc1cd1, 0x00003FBF // pi_by_2_lo -data8 0xc90fdaa22168c234, 0x00004000 // pi_hi -data8 0xc4c6628b80dc1cd1, 0x00003FC0 // pi_lo - -data8 0xBB08911F2013961E, 0x00003FF8 // A10 -data8 0x981F1095A23A87D3, 0x00003FF8 // A9 -data8 0xBDF09C6C4177BCC6, 0x00003FF8 // A8 -data8 0xE4C3A60B049ACCEA, 0x00003FF8 // A7 -data8 0x8E2789F4E8A8F1AD, 0x00003FF9 // A6 -data8 0xB745D09B2B0E850B, 0x00003FF9 // A5 -data8 0xF8E38E3BC4C50920, 0x00003FF9 // A4 -data8 0xB6DB6DB6D89FCD81, 0x00003FFA // A3 -data8 0x99999999999AF376, 0x00003FFB // A2 -data8 0xAAAAAAAAAAAAAA71, 0x00003FFC // A1 -ASM_SIZE_DIRECTIVE(acos_coefficients) - - -.align 32 -.global acosl# -ASM_TYPE_DIRECTIVE(acosl#,@function) +{.mfi + // load start address for T_table + ld8 r2 = [r2] + nop.f 0 + // p8 = 1 if |s|> = sqrt(2)/2 + cmp.geu p8, p0 = R_TMP3, R_TMP2 +} -.section .text -.proc acosl# -.align 32 +{.mlx + // p7 = 1 if |s|<2^{-4} (exponent of s<bias-4) + cmp.lt p7, p0 = R_EXP0, R_TMP + // sqrt coefficient cs8 = -33*13/128 + movl R_TMP2 = 0xc0568000;; +} -acosl: -// After normalizing f8, get its true exponent -{ .mfi - alloc r32 = ar.pfs,1,11,4,0 -(p0) fnorm.s1 acos_NORM_f8 = f8 -(p0) mov acos_GR_17_ones = 0x1ffff +{.mbb + // load t in FP register + setf.d F_T = R_T + // if |s|<2^{-4}, take alternate path + (p7) br.cond.spnt SMALL_S + // if |s|> = sqrt(2)/2, take alternate path + (p8) br.cond.sptk LARGE_S } -{ .mmi -(p0) mov acos_GR_16_ones = 0xffff -(p0) addl r40 = @ltoff(acos_coefficients), gp - nop.i 999 
+{.mlx + // index = (4-exponent)|b1 b2.. b6 + sub R_INDEX = R_INDEX, R_BIAS + // sqrt coefficient cs9 = 55*13/128 + movl R_TMP = 0x40b2c000;; } -;; -// Set denormal flag on denormal input with fcmp -{ .mfi - ld8 r40 = [r40] - fcmp.eq p6,p0 = f8,f0 - nop.i 999 + +{.mfi + // sqrt coefficient cs8 = -33*13/128 + setf.s F_CS8 = R_TMP2 + nop.f 0 + // shift R_INDEX by 5 + shl R_INDEX = R_INDEX, 5 } -;; +{.mfi + // sqrt coefficient cs3 = 0.5 (set exponent = bias-1) + mov R_TMP4 = 0xffff - 1 + nop.f 0 + // sqrt coefficient cs6 = -21/16 + mov R_TMP6 = 0xbfa8;; +} -// Load the constants pi_by_2 and pi. -// Each is stored as hi and lo values -// Also load the coefficients for ACOS_POLY -{ .mmi -(p0) ldfe acos_pi_by_2_hi = [r40],16 ;; -(p0) ldfe acos_pi_by_2_lo = [r40],16 - nop.i 999 ;; +{.mlx + // table index + add r2 = r2, R_INDEX + // sqrt coefficient cs7 = 33/16 + movl R_TMP2 = 0x40040000;; } -{ .mmi -(p0) ldfe acos_pi_hi = [r40],16 ;; -(p0) ldfe acos_pi_lo = [r40],16 - nop.i 999 ;; + +{.mmi + // load cs9 = 55*13/128 + setf.s F_CS9 = R_TMP + // sqrt coefficient cs5 = 7/8 + mov R_TMP3 = 0x3f60 + // sqrt coefficient cs6 = 21/16 + shl R_TMP6 = R_TMP6, 16;; } -{ .mmi -(p0) ldfe acos_A10 = [r40],16 ;; -(p0) ldfe acos_A9 = [r40],16 - nop.i 999 ;; + +{.mmi + // load significand of 1/(1-t^2) + ldf8 F_INV_1T2 = [r2], 8 + // sqrt coefficient cs7 = 33/16 + setf.s F_CS7 = R_TMP2 + // sqrt coefficient cs4 = -5/8 + mov R_TMP5 = 0xbf20;; } -// Take the absolute value of f8 -{ .mmf - nop.m 999 -(p0) getf.exp acos_GR_signexp_f8 = acos_NORM_f8 -(p0) fmerge.s acos_ABS_NORM_f8 = f0, acos_NORM_f8 + +{.mmi + // load significand of sqrt(1-t^2) + ldf8 F_SQRT_1T2 = [r2], 8 + // sqrt coefficient cs6 = 21/16 + setf.s F_CS6 = R_TMP6 + // sqrt coefficient cs5 = 7/8 + shl R_TMP3 = R_TMP3, 16;; } -{ .mii -(p0) ldfe acos_A8 = [r40],16 - nop.i 999 ;; -(p0) and acos_GR_exp = acos_GR_signexp_f8, acos_GR_17_ones ;; + +{.mmi + // sqrt coefficient cs3 = 0.5 (set exponent = bias-1) + setf.exp F_CS3 = R_TMP4 + // 
r3 = pointer to polynomial coefficients + addl r3 = @ltoff(poly_coeffs), gp + // sqrt coefficient cs4 = -5/8 + shl R_TMP5 = R_TMP5, 16;; } -// case 1: |x| < 2^-25 ==> p6 ACOS_TINY -// case 2: 2^-25 <= |x| < 2^-2 ==> p8 ACOS_POLY -// case 3: 2^-2 <= |x| < 1 ==> p9 ACOS_ATAN -// case 4: 1 <= |x| ==> p11 ACOS_ERROR_RETURN -// Admittedly |x| = 1 is not an error but this is where that case is -// handled. -{ .mii -(p0) ldfe acos_A7 = [r40],16 -(p0) sub acos_GR_true_exp = acos_GR_exp, acos_GR_16_ones ;; -(p0) cmp.ge.unc p6, p7 = -26, acos_GR_true_exp ;; +{.mfi + // sqrt coefficient cs5 = 7/8 + setf.s F_CS5 = R_TMP3 + // d = s-t + fms.s1 F_D = f8, f1, F_T + // set p6 = 1 if s<0, p11 = 1 if s> = 0 + cmp.ge p6, p11 = R_EXP, R_DBL_S } -{ .mii -(p0) ldfe acos_A6 = [r40],16 -(p7) cmp.ge.unc p8, p9 = -3, acos_GR_true_exp ;; -(p9) cmp.ge.unc p10, p11 = -1, acos_GR_true_exp +{.mfi + // r3 = load start address to polynomial coefficients + ld8 r3 = [r3] + // s+t + fma.s1 F_S2T2 = f8, f1, F_T + nop.i 0;; } -{ .mmi -(p0) ldfe acos_A5 = [r40],16 ;; -(p0) ldfe acos_A4 = [r40],16 - nop.i 999 ;; + +{.mfi + // sqrt coefficient cs4 = -5/8 + setf.s F_CS4 = R_TMP5 + // s^2-t^2 + fma.s1 F_S2T2 = F_S2T2, F_D, f0 + nop.i 0;; } -{ .mmi -(p0) ldfe acos_A3 = [r40],16 ;; -(p0) ldfe acos_A2 = [r40],16 - nop.i 999 ;; + +{.mfi + // load C3 + ldfe F_C3 = [r3], 16 + // 0.5/(1-t^2) = 2^{-64}*(2^63/(1-t^2)) + fma.s1 F_INV_1T2 = F_INV_1T2, F_2M64, f0 + nop.i 0;; } -// ACOS_ERROR_RETURN ==> p11 is true -// case 4: |x| >= 1 -{ .mib -(p0) ldfe acos_A1 = [r40],16 - nop.i 999 -(p11) br.spnt L(ACOS_ERROR_RETURN) ;; +{.mfi + // load C_5 + ldfe F_C5 = [r3], 16 + // set correct exponent for sqrt(1-t^2) + fma.s1 F_SQRT_1T2 = F_SQRT_1T2, F_2M64, f0 + nop.i 0;; } -// ACOS_TINY ==> p6 is true -// case 1: |x| < 2^-25 -{ .mfi - nop.m 999 -(p6) fms.s1 acos_xmpi_by_2_lo = acos_NORM_f8,f1, acos_pi_by_2_lo - nop.i 999 ;; + +{.mfi + // load C_7, C_9 + ldfpd F_C7, F_C9 = [r3], 16 + // x = -(s^2-t^2)/(1-t^2)/2 + fnma.s1 F_X = 
F_INV_1T2, F_S2T2, f0 + nop.i 0;; } -{ .mfb - nop.m 999 -(p6) fms.s0 f8 = acos_pi_by_2_hi,f1, acos_xmpi_by_2_lo -(p6) br.ret.spnt b0 ;; + +{.mmf + // load asin(t)_high, asin(t)_low + ldfpd F_ATHI, F_ATLO = [r2] + // load pi/2 + ldfpd F_PI2_LO, F_PI2_HI = [r3] + // t*sqrt(1-t^2) + fma.s1 F_TSQRT = F_T, F_SQRT_1T2, f0;; } +{.mfi + nop.m 0 + // cs9*x+cs8 + fma.s1 F_S89 = F_CS9, F_X, F_CS8 + nop.i 0 +} -// ACOS_POLY ==> p8 is true -// case 2: 2^-25 <= |x| < 2^-2 -{ .mfi - nop.m 999 -(p8) fms.s1 acos_W = acos_pi_by_2_hi, f1, acos_NORM_f8 - nop.i 999 ;; +{.mfi + nop.m 0 + // cs7*x+cs6 + fma.s1 F_S67 = F_CS7, F_X, F_CS6 + nop.i 0;; } -{ .mfi - nop.m 999 -(p8) fma.s1 acos_X2 = f8,f8, f0 - nop.i 999 ;; +{.mfi + nop.m 0 + // cs5*x+cs4 + fma.s1 F_S45 = F_CS5, F_X, F_CS4 + nop.i 0 } -{ .mfi - nop.m 999 -(p8) fms.s1 acos_Ww = acos_pi_by_2_hi, f1, acos_W - nop.i 999 ;; +{.mfi + nop.m 0 + // x*x + fma.s1 F_X2 = F_X, F_X, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p8) fma.s1 acos_X4 = acos_X2,acos_X2, f0 - nop.i 999 ;; + +{.mfi + nop.m 0 + // (s-t)-t*x + fnma.s1 F_DTX = F_T, F_X, F_D + nop.i 0 } -{ .mfi - nop.m 999 -(p8) fms.s1 acos_Ww = acos_Ww, f1, acos_NORM_f8 - nop.i 999 ;; +{.mfi + nop.m 0 + // cs3*x+cs2 (cs2 = -0.5 = -cs3) + fms.s1 F_S23 = F_CS3, F_X, F_CS3 + nop.i 0;; } -{ .mfi - nop.m 999 -(p8) fma.s1 acos_P810 = acos_X4, acos_A10, acos_A8 - nop.i 999 +{.mfi + nop.m 0 + // if sign is negative, negate table values: asin(t)_low + (p6) fnma.s1 F_ATLO = F_ATLO, f1, f0 + nop.i 0 } -// acos_P79 = X4*A9 + A7 -// acos_P810 = X4*A10 + A8 -{ .mfi - nop.m 999 -(p8) fma.s1 acos_P79 = acos_X4, acos_A9, acos_A7 - nop.i 999 ;; +{.mfi + nop.m 0 + // if sign is negative, negate table values: asin(t)_high + (p6) fnma.s1 F_ATHI = F_ATHI, f1, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p8) fma.s1 acos_Ww = acos_Ww, f1, acos_pi_by_2_lo - nop.i 999 ;; + +{.mfi + nop.m 0 + // cs9*x^3+cs8*x^2+cs7*x+cs6 + fma.s1 F_S69 = F_S89, F_X2, F_S67 + nop.i 0 } -{ .mfi - nop.m 999 -(p8) fma.s1 acos_P610 = acos_X4, 
acos_P810, acos_A6 - nop.i 999 +{.mfi + nop.m 0 + // x^4 + fma.s1 F_X4 = F_X2, F_X2, f0 + nop.i 0;; } -// acos_P59 = X4*(X4*A9 + A7) + A5 -// acos_P610 = X4*(X4*A10 + A8) + A6 -{ .mfi - nop.m 999 -(p8) fma.s1 acos_P59 = acos_X4, acos_P79, acos_A5 - nop.i 999 ;; +{.mfi + nop.m 0 + // t*sqrt(1-t^2)*x^2 + fma.s1 F_TSQRT = F_TSQRT, F_X2, f0 + nop.i 0 } -{ .mfi - nop.m 999 -(p8) fma.s1 acos_P410 = acos_X4, acos_P610, acos_A4 - nop.i 999 +{.mfi + nop.m 0 + // cs5*x^3+cs4*x^2+cs3*x+cs2 + fma.s1 F_S25 = F_S45, F_X2, F_S23 + nop.i 0;; } -// acos_P39 = X4*(X4*(X4*A9 + A7) + A5) + A3 -// acos_P410 = X4*(X4*(X4*A10 + A8) + A6) + A4 -{ .mfi - nop.m 999 -(p8) fma.s1 acos_P39 = acos_X4, acos_P59, acos_A3 - nop.i 999 ;; + +{.mfi + nop.m 0 + // ((s-t)-t*x)*sqrt(1-t^2) + fma.s1 F_DTX = F_DTX, F_SQRT_1T2, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p8) fma.s1 acos_P210 = acos_X4, acos_P410, acos_A2 - nop.i 999 +{.mfi + nop.m 0 + // (pi/2)_high - asin(t)_high + fnma.s1 F_ATHI = F_ATHI, f1, F_PI2_HI + nop.i 0 } -// acos_P19 = X4*(X4*(X4*(X4*A9 + A7) + A5) + A3) + A1 = P1 -// acos_P210 = X4*(X4*(X4*(X4*A10 + A8) + A6) + A4) + A2 = P2 -{ .mfi - nop.m 999 -(p8) fma.s1 acos_P19 = acos_X4, acos_P39, acos_A1 - nop.i 999 ;; +{.mfi + nop.m 0 + // asin(t)_low - (pi/2)_low + fnma.s1 F_ATLO = F_PI2_LO, f1, F_ATLO + nop.i 0;; } -// acos_P1P2 = Xsq*P2 + P1 -// acos_P1P2 = Xsq*(Xsq*P2 + P1) -{ .mfi - nop.m 999 -(p8) fma.s1 acos_P1P2 = acos_X2, acos_P210, acos_P19 - nop.i 999 ;; + +{.mfi + nop.m 0 + // PS29 = cs9*x^7+..+cs5*x^3+cs4*x^2+cs3*x+cs2 + fma.s1 F_S29 = F_S69, F_X4, F_S25 + nop.i 0;; } -{ .mfi - nop.m 999 -(p8) fma.s1 acos_P1P2 = acos_X2, acos_P1P2, f0 - nop.i 999 ;; + + +{.mfi + nop.m 0 + // R = ((s-t)-t*x)*sqrt(1-t^2)-t*sqrt(1-t^2)*x^2*PS29 + fnma.s1 F_R = F_S29, F_TSQRT, F_DTX + nop.i 0;; } -{ .mfi - nop.m 999 -(p8) fms.s1 acos_xPmw = acos_NORM_f8, acos_P1P2, acos_Ww - nop.i 999 ;; + +{.mfi + nop.m 0 + // R^2 + fma.s1 F_R2 = F_R, F_R, f0 + nop.i 0;; } -{ .mfb - nop.m 999 -(p8) fms.s0 f8 = 
acos_W, f1, acos_xPmw -(p8) br.ret.spnt b0 ;; + +{.mfi + nop.m 0 + // c7+c9*R^2 + fma.s1 F_P79 = F_C9, F_R2, F_C7 + nop.i 0 } +{.mfi + nop.m 0 + // c3+c5*R^2 + fma.s1 F_P35 = F_C5, F_R2, F_C3 + nop.i 0;; +} -// ACOS_ATAN -// case 3: 2^-2 <= |x| < 1 -// case 3: 2^-2 <= |x| < 1 ==> p9 ACOS_ATAN +{.mfi + nop.m 0 + // R^3 + fma.s1 F_R4 = F_R2, F_R2, f0 + nop.i 0;; +} -// Step 1.1: Get A,B and a,b -// A + a = 1- |X| -// B + b = 1+ |X| -// Note also that we will use acos_corr (f13) -// and acos_W +{.mfi + nop.m 0 + // R^3 + fma.s1 F_R3 = F_R2, F_R, f0 + nop.i 0;; +} -// Step 2 -// Call __libm_atan2_reg -{ .mfi -(p0) mov acos_GR_fffe = 0xfffe -(p0) fma.s1 acos_B = f1,f1, acos_ABS_NORM_f8 -(p0) mov GR_SAVE_B0 = b0 ;; +{.mfi + nop.m 0 + // c3+c5*R^2+c7*R^4+c9*R^6 + fma.s1 F_P39 = F_P79, F_R4, F_P35 + nop.i 0;; } -{ .mmf -(p0) mov GR_SAVE_GP = gp - nop.m 999 -(p0) fms.s1 acos_A = f1,f1, acos_ABS_NORM_f8 + +{.mfi + nop.m 0 + // asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6) + fma.s1 F_P39 = F_P39, F_R3, F_ATLO + nop.i 0;; } -{ .mfi -(p0) setf.exp acos_HALF = acos_GR_fffe - nop.f 999 - nop.i 999 ;; + +{.mfi + nop.m 0 + // R+asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6) + fma.s1 F_P39 = F_P39, f1, F_R + nop.i 0;; } -{ .mfi - nop.m 999 -(p0) fms.s1 acos_1mB = f1,f1, acos_B - nop.i 999 ;; + +{.mfb + nop.m 0 + // result = (pi/2)-asin(t)_high+R+asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6) + fnma.s0 f8 = F_P39, f1, F_ATHI + // return + br.ret.sptk b0;; } -// We want atan2(V,U) -// so put V in f8 and U in f9 -// but save X in acos_X -{ .mfi - nop.m 999 -(p0) fmerge.se acos_X = f8, f8 - nop.i 999 ;; + + +LARGE_S: + +{.mfi + // bias-1 + mov R_TMP3 = 0xffff - 1 + // y ~ 1/sqrt(1-s^2) + frsqrta.s1 F_Y, p7 = F_1S2 + // c9 = 55*13*17/128 + mov R_TMP4 = 0x10af7b } -// Step 1.2: -///////////////////////// -// Get U = sqrt(B) -///////////////////////// +{.mlx + // c8 = -33*13*15/128 + mov R_TMP5 = 0x184923 + movl R_TMP2 = 0xff00000000000000;; +} -{ .mfi - nop.m 999 -(p0) frsqrta.s1 acos_y0,p8 = acos_B - 
nop.i 999 +{.mfi + // set p6 = 1 if s<0, p11 = 1 if s>0 + cmp.ge p6, p11 = R_EXP, R_DBL_S + // 1-s^2 + fnma.s1 F_1S2 = f8, f8, f1 + // set p9 = 1 + cmp.eq p9, p0 = r0, r0;; } -{ .mfi - nop.m 999 -(p0) fms.s1 acos_1mA = f1,f1, acos_A - nop.i 999 ;; + +{.mfi + // load 0.5 + setf.exp F_05 = R_TMP3 + // (1-s^2) rounded to single precision + fnma.s.s1 F_1S2_S = f8, f8, f1 + // c9 = 55*13*17/128 + shl R_TMP4 = R_TMP4, 10 } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_Bb = acos_1mB,f1, acos_ABS_NORM_f8 - nop.i 999 ;; +{.mlx + // AND mask for getting t ~ sqrt(1-s^2) + setf.sig F_ANDMASK = R_TMP2 + // OR mask + movl R_TMP2 = 0x0100000000000000;; } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_Hh = acos_HALF, acos_B, f0 - nop.i 999 ;; +.pred.rel "mutex", p6, p11 +{.mfi + nop.m 0 + // 1-|s| + (p6) fma.s1 F_1AS = f8, f1, f1 + nop.i 0 } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_t1 = acos_y0, acos_y0, f0 - nop.i 999 +{.mfi + nop.m 0 + // 1-|s| + (p11) fnma.s1 F_1AS = f8, f1, f1 + nop.i 0;; } -{ .mfi - nop.m 999 -(p0) fms.s1 acos_Aa = acos_1mA,f1, acos_ABS_NORM_f8 - nop.i 999 ;; + +{.mfi + // c9 = 55*13*17/128 + setf.s F_CS9 = R_TMP4 + // |s| + (p6) fnma.s1 F_AS = f8, f1, f0 + // c8 = -33*13*15/128 + shl R_TMP5 = R_TMP5, 11 } -{ .mfi - nop.m 999 -(p0) fnma.s1 acos_t2 = acos_t1, acos_Hh, acos_HALF - nop.i 999 ;; +{.mfi + // c7 = 33*13/16 + mov R_TMP4 = 0x41d68 + // |s| + (p11) fma.s1 F_AS = f8, f1, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_y1 = acos_t2, acos_y0, acos_y0 - nop.i 999 + +{.mfi + setf.sig F_ORMASK = R_TMP2 + // y^2 + fma.s1 F_Y2 = F_Y, F_Y, f0 + // c7 = 33*13/16 + shl R_TMP4 = R_TMP4, 12 } +{.mfi + // c6 = -33*7/16 + mov R_TMP6 = 0xc1670 + // y' ~ sqrt(1-s^2) + fma.s1 F_T1 = F_Y, F_1S2, f0 + // c5 = 63/8 + mov R_TMP7 = 0x40fc;; +} -// Step 1.2: -///////////////////////// -// Get V = sqrt(A) -///////////////////////// -{ .mfi - nop.m 999 -(p0) frsqrta.s1 acos_y0,p8 = acos_A - nop.i 999 ;; + +{.mlx + // load c8 = -33*13*15/128 + setf.s F_CS8 = R_TMP5 + // c4 = -35/8 + movl 
R_TMP5 = 0xc08c0000;; } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_t3 = acos_y1, acos_Hh, f0 - nop.i 999 ;; +{.mfi + // r3 = pointer to polynomial coefficients + addl r3 = @ltoff(poly_coeffs), gp + // 1-s-(1-s^2)_s + fnma.s1 F_DS = F_1S2_S, f1, F_1AS + // p9 = 0 if p7 = 1 (p9 = 1 for special cases only) + (p7) cmp.ne p9, p0 = r0, r0 } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_t1 = acos_y0, acos_y0, f0 - nop.i 999 ;; +{.mlx + // load c7 = 33*13/16 + setf.s F_CS7 = R_TMP4 + // c3 = 5/2 + movl R_TMP4 = 0x40200000;; } -{ .mfi - nop.m 999 -(p0) fnma.s1 acos_t4 = acos_t3, acos_y1, acos_HALF - nop.i 999 ;; + +{.mlx + // load c4 = -35/8 + setf.s F_CS4 = R_TMP5 + // c2 = -3/2 + movl R_TMP5 = 0xbfc00000;; } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_y2 = acos_t4, acos_y1, acos_y1 - nop.i 999 ;; + +{.mfi + // load c3 = 5/2 + setf.s F_CS3 = R_TMP4 + // x = (1-s^2)_s*y^2-1 + fms.s1 F_X = F_1S2_S, F_Y2, f1 + // c6 = -33*7/16 + shl R_TMP6 = R_TMP6, 12 } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_S = acos_B, acos_y2, f0 - nop.i 999 +{.mfi + nop.m 0 + // y^2/2 + fma.s1 F_Y2_2 = F_Y2, F_05, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_H = acos_y2, acos_HALF, f0 - nop.i 999 ;; + +{.mfi + // load c6 = -33*7/16 + setf.s F_CS6 = R_TMP6 + // eliminate lower bits from y' + fand F_T = F_T1, F_ANDMASK + // c5 = 63/8 + shl R_TMP7 = R_TMP7, 16 } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_t5 = acos_Hh, acos_y2, f0 - nop.i 999 + +{.mfb + // r3 = load start address to polynomial coefficients + ld8 r3 = [r3] + // 1-(1-s^2)_s-s^2 + fma.s1 F_DS = F_AS, F_1AS, F_DS + // p9 = 1 if s is a special input (NaN, or |s|> = 1) + (p9) br.cond.spnt acosl_SPECIAL_CASES;; } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_Hh = acos_HALF, acos_A, f0 - nop.i 999 ;; +{.mmf + // get exponent, significand of y' (in single prec.) 
+ getf.s R_TMP = F_T1 + // load c3 = -3/2 + setf.s F_CS2 = R_TMP5 + // y*(1-s^2) + fma.s1 F_Y1S2 = F_Y, F_1S2, f0;; } -{ .mfi - nop.m 999 -(p0) fnma.s1 acos_Dd = acos_S, acos_S, acos_B - nop.i 999 ;; + + +{.mfi + nop.m 0 + // if s<0, set s = -s + (p6) fnma.s1 f8 = f8, f1, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p0) fnma.s1 acos_t2 = acos_t1, acos_Hh, acos_HALF - nop.i 999 ;; + +{.mfi + // load c5 = 63/8 + setf.s F_CS5 = R_TMP7 + // x = (1-s^2)_s*y^2-1+(1-(1-s^2)_s-s^2)*y^2 + fma.s1 F_X = F_DS, F_Y2, F_X + // for t = 2^k*1.b1 b2.., get 7-k|b1.. b6 + extr.u R_INDEX = R_TMP, 17, 9;; } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_U = acos_Dd, acos_H, acos_S - nop.i 999 ;; + +{.mmi + // index = (4-exponent)|b1 b2.. b6 + sub R_INDEX = R_INDEX, R_BIAS + nop.m 0 + // get exponent of y + shr.u R_TMP2 = R_TMP, 23;; } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_y1 = acos_t2, acos_y0, acos_y0 - nop.i 999 ;; +{.mmi + // load C3 + ldfe F_C3 = [r3], 16 + // set p8 = 1 if y'<2^{-4} + cmp.gt p8, p0 = 0x7b, R_TMP2 + // shift R_INDEX by 5 + shl R_INDEX = R_INDEX, 5;; } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_2U = acos_U, f1, acos_U - nop.i 999 ;; + +{.mfb + // get table index for sqrt(1-t^2) + add r2 = r2, R_INDEX + // get t = 2^k*1.b1 b2.. 
b7 1 + for F_T = F_T, F_ORMASK + (p8) br.cond.spnt VERY_LARGE_INPUT;; } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_t3 = acos_y1, acos_Hh, f0 - nop.i 999 + + +{.mmf + // load C5 + ldfe F_C5 = [r3], 16 + // load 1/(1-t^2) + ldfp8 F_INV_1T2, F_SQRT_1T2 = [r2], 16 + // x = ((1-s^2)*y^2-1)/2 + fma.s1 F_X = F_X, F_05, f0;; } -// Step 1.3: -// sqrt(A + a) = V + v -// sqrt(B + b) = U + u -///////////////////////// -// Get u -///////////////////////// +{.mmf + nop.m 0 + // C7, C9 + ldfpd F_C7, F_C9 = [r3], 16 + // set correct exponent for t + fmerge.se F_T = F_T1, F_T;; +} -// acos_BmUU = B - UU -// acos_BmUUpb = (B - UU) + b -{ .mfi - nop.m 999 -(p0) fnma.s1 acos_BmUU = acos_U, acos_U, acos_B - nop.i 999 ;; + +{.mfi + // get address for loading pi + add r3 = 48, r3 + // c9*x+c8 + fma.s1 F_S89 = F_X, F_CS9, F_CS8 + nop.i 0 } -{ .mfi - nop.m 999 -(p0) fmerge.se f9 = acos_U, acos_U - nop.i 999 ;; +{.mfi + nop.m 0 + // x^2 + fma.s1 F_X2 = F_X, F_X, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p0) fnma.s1 acos_t4 = acos_t3, acos_y1, acos_HALF - nop.i 999 ;; + +{.mfi + // pi (low, high) + ldfpd F_PI2_LO, F_PI2_HI = [r3] + // y*(1-s^2)*x + fma.s1 F_Y1S2X = F_Y1S2, F_X, f0 + nop.i 0 } -// acos_1d2U = frcpa(2U) -{ .mfi - nop.m 999 -(p0) frcpa.s1 acos_1d2U,p9 = f1, acos_2U - nop.i 999 +{.mfi + nop.m 0 + // c7*x+c6 + fma.s1 F_S67 = F_X, F_CS7, F_CS6 + nop.i 0;; } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_BmUUpb = acos_BmUU, f1, acos_Bb - nop.i 999 ;; + +{.mfi + nop.m 0 + // 1-x + fnma.s1 F_1X = F_X, f1, f1 + nop.i 0 } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_y2 = acos_t4, acos_y1, acos_y1 - nop.i 999 ;; +{.mfi + nop.m 0 + // c3*x+c2 + fma.s1 F_S23 = F_X, F_CS3, F_CS2 + nop.i 0;; } -{ .mfi - nop.m 999 -// acos_Uu = ((B - UU) + b) * frcpa(2U) -(p0) fma.s1 acos_Uu = acos_BmUUpb, acos_1d2U, f0 - nop.i 999 ;; + +{.mfi + nop.m 0 + // 1-t^2 + fnma.s1 F_1T2 = F_T, F_T, f1 + nop.i 0 } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_S = acos_A, acos_y2, f0 - nop.i 999 +{.mfi + // load asin(t)_high, asin(t)_low + ldfpd 
F_ATHI, F_ATLO = [r2] + // c5*x+c4 + fma.s1 F_S45 = F_X, F_CS5, F_CS4 + nop.i 0;; } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_H = acos_y2, acos_HALF, f0 - nop.i 999 ;; + + +{.mfi + nop.m 0 + // t*s + fma.s1 F_TS = F_T, f8, f0 + nop.i 0 } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_t5 = acos_Hh, acos_y2, f0 - nop.i 999 ;; +{.mfi + nop.m 0 + // 0.5/(1-t^2) + fma.s1 F_INV_1T2 = F_INV_1T2, F_2M64, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p0) fnma.s1 acos_Dd = acos_S, acos_S, acos_A - nop.i 999 ;; +{.mfi + nop.m 0 + // z~sqrt(1-t^2), rounded to 24 significant bits + fma.s.s1 F_Z = F_SQRT_1T2, F_2M64, f0 + nop.i 0 } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_V = acos_Dd, acos_H, acos_S - nop.i 999 ;; +{.mfi + nop.m 0 + // sqrt(1-t^2) + fma.s1 F_SQRT_1T2 = F_SQRT_1T2, F_2M64, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_2V = acos_V, f1, acos_V - nop.i 999 + +{.mfi + nop.m 0 + // y*(1-s^2)*x^2 + fma.s1 F_Y1S2X2 = F_Y1S2, F_X2, f0 + nop.i 0 } -// Step 3 -///////////////////////// -// Calculate the correction, acos_corr -///////////////////////// -// acos_corr = U*v - (V*u) +{.mfi + nop.m 0 + // x^4 + fma.s1 F_X4 = F_X2, F_X2, f0 + nop.i 0;; +} -{ .mfi - nop.m 999 -(p0) fma.s1 acos_Vu = acos_V,acos_Uu, f0 - nop.i 999 ;; + +{.mfi + nop.m 0 + // s*t rounded to 24 significant bits + fma.s.s1 F_TSS = F_T, f8, f0 + nop.i 0 } -///////////////////////// -// Get v -///////////////////////// -// acos_AmVV = A - VV -// acos_AmVVpa = (A - VV) + a +{.mfi + nop.m 0 + // c9*x^3+..+c6 + fma.s1 F_S69 = F_X2, F_S89, F_S67 + nop.i 0;; +} -{ .mfi - nop.m 999 -(p0) fnma.s1 acos_AmVV = acos_V, acos_V, acos_A - nop.i 999 ;; + +{.mfi + nop.m 0 + // ST = (t^2-1+s^2) rounded to 24 significant bits + fms.s.s1 F_ST = f8, f8, F_1T2 + nop.i 0 } -{ .mfi - nop.m 999 -(p0) fmerge.se f8 = acos_V, acos_V - nop.i 999 ;; +{.mfi + nop.m 0 + // c5*x^3+..+c2 + fma.s1 F_S25 = F_X2, F_S45, F_S23 + nop.i 0;; } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_AmVVpa = acos_AmVV, f1, acos_Aa - nop.i 999 ;; + +{.mfi + nop.m 0 + // 
0.25/(1-t^2) + fma.s1 F_INV1T2_2 = F_05, F_INV_1T2, f0 + nop.i 0 } -// acos_1d2V = frcpa(2V) -{ .mfi - nop.m 999 -(p0) frcpa.s1 acos_1d2V,p9 = f1, acos_2V - nop.i 999 ;; +{.mfi + nop.m 0 + // t*s-sqrt(1-t^2)*(1-s^2)*y + fnma.s1 F_TS = F_Y1S2, F_SQRT_1T2, F_TS + nop.i 0;; } -// acos_Vv = ((A - VV) + a) * frcpa(2V) -{ .mfi - nop.m 999 -(p0) fma.s1 acos_Vv = acos_AmVVpa, acos_1d2V, f0 - nop.i 999 ;; + +{.mfi + nop.m 0 + // z*0.5/(1-t^2) + fma.s1 F_ZE = F_INV_1T2, F_SQRT_1T2, f0 + nop.i 0 } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_Uv = acos_U,acos_Vv, f0 - nop.i 999 ;; +{.mfi + nop.m 0 + // z^2+t^2-1 + fms.s1 F_DZ0 = F_Z, F_Z, F_1T2 + nop.i 0;; } -.endp acosl# -ASM_SIZE_DIRECTIVE(acosl#) +{.mfi + nop.m 0 + // (1-s^2-(1-s^2)_s)*x + fma.s1 F_DS2X = F_X, F_DS, f0 + nop.i 0;; +} -.proc __libm_callout -__libm_callout: -.prologue -{ .mfi - nop.m 0 - nop.f 0 -.save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs +{.mfi + nop.m 0 + // t*s-(t*s)_s + fms.s1 F_DTS = F_T, f8, F_TSS + nop.i 0 } -;; -{ .mfi - mov GR_SAVE_GP=gp - nop.f 0 -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 +{.mfi + nop.m 0 + // c9*x^7+..+c2 + fma.s1 F_S29 = F_X4, F_S69, F_S25 + nop.i 0;; } -.body -{ .mfb - nop.m 999 -(p0) fms.s1 acos_corr = acos_Uv,f1, acos_Vu -(p0) br.call.sptk.many b0=__libm_atan2_reg# ;; + +{.mfi + nop.m 0 + // y*z + fma.s1 F_YZ = F_Z, F_Y, f0 + nop.i 0 } +{.mfi + nop.m 0 + // t^2 + fma.s1 F_T2 = F_T, F_T, f0 + nop.i 0;; +} -// p6 ==> X is negative -// p7 ==> x is positive -// We know that |X| >= 1/4 -{ .mfi -(p0) mov gp = GR_SAVE_GP -(p0) fcmp.lt.unc p6,p7 = acos_X , f0 -(p0) mov b0 = GR_SAVE_B0 ;; +{.mfi + nop.m 0 + // 1-t^2+ST + fma.s1 F_1T2_ST = F_ST, f1, F_1T2 + nop.i 0;; } -// acos_2_Z_hi = 2 * acos_Z_hi -// acos_s_lo_Z_lo = s_lo * Z_lo -{ .mfi - nop.m 999 -(p0) fma.s1 acos_2_Z_hi = acos_Z_hi, f1, acos_Z_hi -(p0) mov ar.pfs = GR_SAVE_PFS +{.mfi + nop.m 0 + // y*(1-s^2)(1-x) + fma.s1 F_Y1S2_1X = F_Y1S2, F_1X, f0 + nop.i 0 } -{ .mfi - nop.m 999 -(p0) fma.s1 acos_s_lo_Z_lo = acos_s_lo, 
acos_Z_lo, f0 - nop.i 999 ;; +{.mfi + nop.m 0 + // dz ~ sqrt(1-t^2)-z + fma.s1 F_DZ = F_DZ0, F_ZE, f0 + nop.i 0;; } -// 2 is a constant needed later -{ .mfi - nop.m 999 -(p0) fma.s1 acos_2 = f1,f1,f1 - nop.i 999 ;; + +{.mfi + nop.m 0 + // -1+correction for sqrt(1-t^2)-z + fnma.s1 F_CORR = F_INV1T2_2, F_DZ0, f0 + nop.i 0;; } -// X >= 1/4 -// acos_result_lo = 2(s_lo * Z_lo) - corr -// f8 = (2*Z_hi) + (2(s_lo * Z_lo) - corr) -{ .mfi - nop.m 999 -(p7) fma.s1 acos_result_lo = acos_s_lo_Z_lo, acos_2, acos_corr - nop.i 999 ;; +{.mfi + nop.m 0 + // (PS29*x^2+x)*y*(1-s^2) + fma.s1 F_S19 = F_Y1S2X2, F_S29, F_Y1S2X + nop.i 0;; } -{ .mfi - nop.m 999 -(p7) fma.s0 f8 = acos_2_Z_hi, f1, acos_result_lo - nop.i 999 +{.mfi + nop.m 0 + // z*y*(1-s^2)_s + fma.s1 F_ZY1S2S = F_YZ, F_1S2_S, f0 + nop.i 0 } -// acos_result_lo = (pi_lo - corr) -// acos_result_lo = (pi_lo - corr) + acos_Ww -{ .mfi - nop.m 999 -(p6) fms.s1 acos_result_lo = acos_pi_lo, f1, acos_corr - nop.i 999 ;; +{.mfi + nop.m 0 + // s^2-(1-t^2+ST) + fms.s1 F_1T2_ST = f8, f8, F_1T2_ST + nop.i 0;; } -// X <= -1/4 -// acos_W = pi_hi - 2 * Z_hi -{ .mfi - nop.m 999 -(p6) fnma.s1 acos_W = acos_2, acos_Z_hi, acos_pi_hi - nop.i 999 ;; + +{.mfi + nop.m 0 + // (t*s-(t*s)_s)+z*y*(1-s^2-(1-s^2)_s)*x + fma.s1 F_DTS = F_YZ, F_DS2X, F_DTS + nop.i 0 } -// acos_Ww = pi_hi - W -// acos_Ww = (pi_hi - W) + (2 * Z_hi) -{ .mfi - nop.m 999 -(p6) fms.s1 acos_Ww = acos_pi_hi, f1, acos_W - nop.i 999 ;; +{.mfi + nop.m 0 + // dz*y*(1-s^2)*(1-x) + fma.s1 F_DZ_TERM = F_DZ, F_Y1S2_1X, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p6) fms.s1 acos_Ww = acos_Ww, f1, acos_2_Z_hi - nop.i 999 ;; + +{.mfi + nop.m 0 + // R = t*s-sqrt(1-t^2)*(1-s^2)*y+sqrt(1-t^2)*(1-s^2)*y*PS19 + // (used for polynomial evaluation) + fma.s1 F_R = F_S19, F_SQRT_1T2, F_TS + nop.i 0;; } -{ .mfi - nop.m 999 -(p6) fma.s1 acos_result_lo = acos_result_lo, f1, acos_Ww - nop.i 999 ;; + +{.mfi + nop.m 0 + // (PS29*x^2)*y*(1-s^2) + fma.s1 F_S29 = F_Y1S2X2, F_S29, f0 + nop.i 0 } -// acos_Z_lo = 
((pi_lo - corr) + acos_Ww) - 2 * (s_lo * Z_lo) -{ .mfi - nop.m 999 -(p6) fnma.s1 acos_Z_lo = acos_s_lo_Z_lo, acos_2, acos_result_lo - nop.i 999 ;; +{.mfi + nop.m 0 + // apply correction to dz*y*(1-s^2)*(1-x) + fma.s1 F_DZ_TERM = F_DZ_TERM, F_CORR, F_DZ_TERM + nop.i 0;; } -{ .mfb - nop.m 999 -(p6) fma.s0 f8 = acos_W, f1, acos_Z_lo -(p0) br.ret.sptk b0 ;; + +{.mfi + nop.m 0 + // R^2 + fma.s1 F_R2 = F_R, F_R, f0 + nop.i 0;; } -.endp __libm_callout -ASM_SIZE_DIRECTIVE(__libm_callout) -.proc SPECIAL -SPECIAL: -L(ACOS_NAN): -{ .mfb - nop.m 999 -(p0) fma.s0 f8 = f8,f1,f0 -(p0) br.ret.sptk b0 ;; + +{.mfi + nop.m 0 + // (t*s-(t*s)_s)+z*y*(1-s^2-(1-s^2)_s)*x+dz*y*(1-s^2)*(1-x) + fma.s1 F_DZ_TERM = F_DZ_TERM, f1, F_DTS + nop.i 0;; } -L(ACOS_ERROR_RETURN): -// Save ar.pfs, b0, and gp; restore on exit -// qnan snan inf norm unorm 0 -+ -// 1 1 0 0 0 0 11 = 0xc3 +{.mfi + nop.m 0 + // c7+c9*R^2 + fma.s1 F_P79 = F_C9, F_R2, F_C7 + nop.i 0 +} -// Coming in as X = +- 1 -// What should we return? +{.mfi + nop.m 0 + // c3+c5*R^2 + fma.s1 F_P35 = F_C5, F_R2, F_C3 + nop.i 0;; +} -// If X is 1, return (sign of X)pi/2 +{.mfi + nop.m 0 + // asin(t)_low-(pi)_low (if s<0) + (p6) fms.s1 F_ATLO = F_ATLO, f1, F_PI2_LO + nop.i 0 +} +{.mfi + nop.m 0 + // R^4 + fma.s1 F_R4 = F_R2, F_R2, f0 + nop.i 0;; +} -{ .mfi - nop.m 999 -(p0) fcmp.eq.unc p6,p7 = acos_ABS_NORM_f8,f1 - nop.i 999 ;; +{.mfi + nop.m 0 + // R^3 + fma.s1 F_R3 = F_R2, F_R, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p6) fcmp.lt.unc p8,p9 = f8,f0 - nop.i 999 ;; + +{.mfi + nop.m 0 + // (t*s)_s-t^2*y*z + fnma.s1 F_TSS = F_T2, F_YZ, F_TSS + nop.i 0 } -{ .mfi - nop.m 999 -(p8) fma.s0 f8 = acos_pi_hi, f1, acos_pi_lo - nop.i 999 +{.mfi + nop.m 0 + // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST) + fma.s1 F_DZ_TERM = F_YZ, F_1T2_ST, F_DZ_TERM + nop.i 0;; } -{ .mfb - nop.m 999 -(p9) fmerge.s f8 = f8,f0 -(p6) br.ret.spnt b0 ;; + +{.mfi + nop.m 0 + // (pi)_hi-asin(t)_hi (if s<0) + (p6) fms.s1 F_ATHI = F_PI2_HI, f1, F_ATHI + nop.i 0 } -// 
If X is a NAN, leave -{ .mfi - nop.m 999 -(p0) fclass.m.unc p12,p0 = f8, 0xc3 - nop.i 999 ;; +{.mfi + nop.m 0 + // c3+c5*R^2+c7*R^4+c9*R^6 + fma.s1 F_P39 = F_P79, F_R4, F_P35 + nop.i 0;; } -{ .mfb - nop.m 999 -(p12) fma.s0 f8 = f8,f1,f0 -(p12) br.ret.spnt b0 ;; + +{.mfi + nop.m 0 + // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST)+ + // + sqrt(1-t^2)*y*(1-s^2)*x^2*PS29 + fma.s1 F_DZ_TERM = F_SQRT_1T2, F_S29, F_DZ_TERM + nop.i 0;; } -{ .mfi -(p0) mov GR_Parameter_TAG = 57 -(p0) frcpa f10, p6 = f0, f0 -nop.i 999 -};; -.endp SPECIAL -ASM_SIZE_DIRECTIVE(SPECIAL) +{.mfi + nop.m 0 + // (t*s)_s-t^2*y*z+z*y*ST + fma.s1 F_TSS = F_YZ, F_ST, F_TSS + nop.i 0 +} -.proc __libm_error_region -__libm_error_region: +{.mfi + nop.m 0 + // -asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6) + fms.s1 F_P39 = F_P39, F_R3, F_ATLO + nop.i 0;; +} + + +{.mfi + nop.m 0 + // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST) + + // + sqrt(1-t^2)*y*(1-s^2)*x^2*PS29 + + // - asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6) + fma.s1 F_DZ_TERM = F_P39, f1, F_DZ_TERM + nop.i 0;; +} + + +{.mfi + nop.m 0 + // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST) + + // + sqrt(1-t^2)*y*(1-s^2)*x^2*PS29 + z*y*(1-s^2)_s*x + + // - asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6) + fma.s1 F_DZ_TERM = F_ZY1S2S, F_X, F_DZ_TERM + nop.i 0;; +} + + +{.mfi + nop.m 0 + // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST) + + // + sqrt(1-t^2)*y*(1-s^2)*x^2*PS29 + z*y*(1-s^2)_s*x + + // - asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6) + + // + (t*s)_s-t^2*y*z+z*y*ST + fma.s1 F_DZ_TERM = F_TSS, f1, F_DZ_TERM + nop.i 0;; +} + + +.pred.rel "mutex", p6, p11 +{.mfi + nop.m 0 + // result: add high part of table value + // s>0 in this case + (p11) fnma.s0 f8 = F_DZ_TERM, f1, F_ATHI + nop.i 0 +} + +{.mfb + nop.m 0 + // result: add high part of pi-table value + // if s<0 + (p6) fma.s0 f8 = F_DZ_TERM, f1, F_ATHI + br.ret.sptk b0;; +} + + + + + + +SMALL_S: + + // use 15-term polynomial approximation + +{.mmi + // r3 = 
pointer to polynomial coefficients + addl r3 = @ltoff(poly_coeffs), gp;; + // load start address for coefficients + ld8 r3 = [r3] + mov R_TMP = 0x3fbf;; +} + + +{.mmi + add r2 = 64, r3 + ldfe F_C3 = [r3], 16 + // p7 = 1 if |s|<2^{-64} (exponent of s<bias-64) + cmp.lt p7, p0 = R_EXP0, R_TMP;; +} + +{.mmf + ldfe F_C5 = [r3], 16 + ldfpd F_C11, F_C13 = [r2], 16 + nop.f 0;; +} + +{.mmf + ldfpd F_C7, F_C9 = [r3], 16 + ldfpd F_C15, F_C17 = [r2] + nop.f 0;; +} + + + +{.mfb + // load pi/2 + ldfpd F_PI2_LO, F_PI2_HI = [r3] + // s^2 + fma.s1 F_R2 = f8, f8, f0 + // |s|<2^{-64} + (p7) br.cond.spnt RETURN_PI2;; +} + + +{.mfi + nop.m 0 + // s^3 + fma.s1 F_R3 = f8, F_R2, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + // s^4 + fma.s1 F_R4 = F_R2, F_R2, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c3+c5*s^2 + fma.s1 F_P35 = F_C5, F_R2, F_C3 + nop.i 0 +} + +{.mfi + nop.m 0 + // c11+c13*s^2 + fma.s1 F_P1113 = F_C13, F_R2, F_C11 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c7+c9*s^2 + fma.s1 F_P79 = F_C9, F_R2, F_C7 + nop.i 0 +} + +{.mfi + nop.m 0 + // c15+c17*s^2 + fma.s1 F_P1517 = F_C17, F_R2, F_C15 + nop.i 0;; +} + +{.mfi + nop.m 0 + // (pi/2)_high-s_high + fnma.s1 F_T = f8, f1, F_PI2_HI + nop.i 0 +} +{.mfi + nop.m 0 + // s^8 + fma.s1 F_R8 = F_R4, F_R4, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c3+c5*s^2+c7*s^4+c9*s^6 + fma.s1 F_P39 = F_P79, F_R4, F_P35 + nop.i 0 +} + +{.mfi + nop.m 0 + // c11+c13*s^2+c15*s^4+c17*s^6 + fma.s1 F_P1117 = F_P1517, F_R4, F_P1113 + nop.i 0;; +} + +{.mfi + nop.m 0 + // -s_high + fms.s1 F_S = F_T, f1, F_PI2_HI + nop.i 0;; +} + +{.mfi + nop.m 0 + // c3+..+c17*s^14 + fma.s1 F_P317 = F_R8, F_P1117, F_P39 + nop.i 0;; +} + +{.mfi + nop.m 0 + // s_low + fma.s1 F_DS = f8, f1, F_S + nop.i 0;; +} + +{.mfi + nop.m 0 + // (pi/2)_low-s^3*(c3+..+c17*s^14) + fnma.s0 F_P317 = F_P317, F_R3, F_PI2_LO + nop.i 0;; +} + +{.mfi + nop.m 0 + // (pi/2)_low-s_low-s^3*(c3+..+c17*s^14) + fms.s1 F_P317 = F_P317, f1, F_DS + nop.i 0;; +} + +{.mfb + nop.m 0 + // result: pi/2-s-c3*s^3-..-c17*s^17 + 
fma.s0 f8 = F_T, f1, F_P317 + br.ret.sptk b0;; +} + + + + + +RETURN_PI2: + +{.mfi + nop.m 0 + // (pi/2)_low-s + fms.s0 F_PI2_LO = F_PI2_LO, f1, f8 + nop.i 0;; +} + +{.mfb + nop.m 0 + // (pi/2)-s + fma.s0 f8 = F_PI2_HI, f1, F_PI2_LO + br.ret.sptk b0;; +} + + + + + +VERY_LARGE_INPUT: + + +{.mmf + // pointer to pi_low, pi_high + add r2 = 80, r3 + // load C5 + ldfe F_C5 = [r3], 16 + // x = ((1-(s^2)_s)*y^2-1)/2-(s^2-(s^2)_s)*y^2/2 + fma.s1 F_X = F_X, F_05, f0;; +} + +.pred.rel "mutex", p6, p11 +{.mmf + // load pi (low, high), if s<0 + (p6) ldfpd F_PI2_LO, F_PI2_HI = [r2] + // C7, C9 + ldfpd F_C7, F_C9 = [r3], 16 + // if s>0, set F_PI2_LO=0 + (p11) fma.s1 F_PI2_HI = f0, f0, f0;; +} + +{.mfi + nop.m 0 + (p11) fma.s1 F_PI2_LO = f0, f0, f0 + nop.i 0;; +} + +{.mfi + // adjust address for C_11 + add r3 = 16, r3 + // c9*x+c8 + fma.s1 F_S89 = F_X, F_CS9, F_CS8 + nop.i 0 +} + +{.mfi + nop.m 0 + // x^2 + fma.s1 F_X2 = F_X, F_X, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // y*(1-s^2)*x + fma.s1 F_Y1S2X = F_Y1S2, F_X, f0 + nop.i 0 +} + +{.mfi + // C11, C13 + ldfpd F_C11, F_C13 = [r3], 16 + // c7*x+c6 + fma.s1 F_S67 = F_X, F_CS7, F_CS6 + nop.i 0;; +} + + +{.mfi + // C15, C17 + ldfpd F_C15, F_C17 = [r3], 16 + // c3*x+c2 + fma.s1 F_S23 = F_X, F_CS3, F_CS2 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c5*x+c4 + fma.s1 F_S45 = F_X, F_CS5, F_CS4 + nop.i 0;; +} + + + + +{.mfi + nop.m 0 + // y*(1-s^2)*x^2 + fma.s1 F_Y1S2X2 = F_Y1S2, F_X2, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + // x^4 + fma.s1 F_X4 = F_X2, F_X2, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c9*x^3+..+c6 + fma.s1 F_S69 = F_X2, F_S89, F_S67 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c5*x^3+..+c2 + fma.s1 F_S25 = F_X2, F_S45, F_S23 + nop.i 0;; +} + + + +{.mfi + nop.m 0 + // (pi)_high-y*(1-s^2)_s + fnma.s1 F_HI = F_Y, F_1S2_S, F_PI2_HI + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c9*x^7+..+c2 + fma.s1 F_S29 = F_X4, F_S69, F_S25 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // -(y*(1-s^2)_s)_high + fms.s1 F_1S2_HI = F_HI, f1, F_PI2_HI + nop.i 0;; +} + 
+ +{.mfi + nop.m 0 + // (PS29*x^2+x)*y*(1-s^2) + fma.s1 F_S19 = F_Y1S2X2, F_S29, F_Y1S2X + nop.i 0;; +} + + +{.mfi + nop.m 0 + // y*(1-s^2)_s-(y*(1-s^2))_high + fma.s1 F_DS2 = F_Y, F_1S2_S, F_1S2_HI + nop.i 0;; +} + + + +{.mfi + nop.m 0 + // R ~ sqrt(1-s^2) + // (used for polynomial evaluation) + fnma.s1 F_R = F_S19, f1, F_Y1S2 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // y*(1-s^2)-(y*(1-s^2))_high + fma.s1 F_DS2 = F_Y, F_DS, F_DS2 + nop.i 0 +} + +{.mfi + nop.m 0 + // (pi)_low+(PS29*x^2)*y*(1-s^2) + fma.s1 F_S29 = F_Y1S2X2, F_S29, F_PI2_LO + nop.i 0;; +} + + +{.mfi + nop.m 0 + // R^2 + fma.s1 F_R2 = F_R, F_R, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // if s<0 + // (pi)_low+(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)-(y*(1-s^2))_high) + fms.s1 F_S29 = F_S29, f1, F_DS2 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c7+c9*R^2 + fma.s1 F_P79 = F_C9, F_R2, F_C7 + nop.i 0 +} + +{.mfi + nop.m 0 + // c3+c5*R^2 + fma.s1 F_P35 = F_C5, F_R2, F_C3 + nop.i 0;; +} + + + +{.mfi + nop.m 0 + // R^4 + fma.s1 F_R4 = F_R2, F_R2, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + // R^3 + fma.s1 F_R3 = F_R2, F_R, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c11+c13*R^2 + fma.s1 F_P1113 = F_C13, F_R2, F_C11 + nop.i 0 +} + +{.mfi + nop.m 0 + // c15+c17*R^2 + fma.s1 F_P1517 = F_C17, F_R2, F_C15 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // (pi)_low+(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)-(y*(1-s^2))_high)+y*(1-s^2)*x + fma.s1 F_S29 = F_Y1S2, F_X, F_S29 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c11+c13*R^2+c15*R^4+c17*R^6 + fma.s1 F_P1117 = F_P1517, F_R4, F_P1113 + nop.i 0 +} + +{.mfi + nop.m 0 + // c3+c5*R^2+c7*R^4+c9*R^6 + fma.s1 F_P39 = F_P79, F_R4, F_P35 + nop.i 0;; +} + + + +{.mfi + nop.m 0 + // R^8 + fma.s1 F_R8 = F_R4, F_R4, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c3+c5*R^2+c7*R^4+c9*R^6+..+c17*R^14 + fma.s1 F_P317 = F_P1117, F_R8, F_P39 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // (pi)_low-(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)- + // -(y*(1-s^2))_high)+y*(1-s^2)*x - P3, 17 + fnma.s1 F_S29 = F_P317, F_R3, F_S29 + nop.i 0;; +} + +.pred.rel 
"mutex", p6, p11 +{.mfi + nop.m 0 + // Result (if s<0): + // (pi)_low-(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)- + // -(y*(1-s^2))_high)+y*(1-s^2)*x - P3, 17 + // +(pi)_high-(y*(1-s^2))_high + (p6) fma.s0 f8 = F_S29, f1, F_HI + nop.i 0 +} + +{.mfb + nop.m 0 + // Result (if s>0): + // (PS29*x^2)*y*(1-s^2)- + // -y*(1-s^2)*x + P3, 17 + // +(y*(1-s^2)) + (p11) fms.s0 f8 = F_Y, F_1S2_S, F_S29 + br.ret.sptk b0;; +} + + + + + + +acosl_SPECIAL_CASES: + +{.mfi + alloc r32 = ar.pfs, 1, 4, 4, 0 + // check if the input is a NaN, or unsupported format + // (i.e. not infinity or normal/denormal) + fclass.nm p7, p8 = f8, 0x3f + // pointer to pi/2 + add r3 = 96, r3;; +} + + +{.mfi + // load pi/2 + ldfpd F_PI2_HI, F_PI2_LO = [r3] + // get |s| + fmerge.s F_S = f0, f8 + nop.i 0 +} + +{.mfb + nop.m 0 + // if NaN, quietize it, and return + (p7) fma.s0 f8 = f8, f1, f0 + (p7) br.ret.spnt b0;; +} + + +{.mfi + nop.m 0 + // |s| = 1 ? + fcmp.eq.s0 p9, p10 = F_S, f1 + nop.i 0 +} + +{.mfi + nop.m 0 + // load FR_X + fma.s1 FR_X = f8, f1, f0 + // load error tag + mov GR_Parameter_TAG = 57;; +} + + +{.mfi + nop.m 0 + // if s = 1, result is 0 + (p9) fma.s0 f8 = f0, f0, f0 + // set p6=0 for |s|>1 + (p10) cmp.ne p6, p0 = r0, r0;; +} + + +{.mfb + nop.m 0 + // if s = -1, result is pi + (p6) fma.s0 f8 = F_PI2_HI, f1, F_PI2_LO + // return if |s| = 1 + (p9) br.ret.sptk b0;; +} + + +{.mfi + nop.m 0 + // get Infinity + frcpa.s1 FR_RESULT, p0 = f1, f0 + nop.i 0;; +} + + +{.mfb + nop.m 0 + // return QNaN indefinite (0*Infinity) + fma.s0 FR_RESULT = f0, FR_RESULT, f0 + nop.b 0;; +} + + +GLOBAL_LIBM_END(acosl) + +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue // (1) { .mfi @@ -1068,12 +2510,12 @@ __libm_error_region: .body // (3) { .mib - stfe [GR_Parameter_X] = f8 // Store Parameter 1 on stack + stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack add GR_Parameter_RESULT = 0,GR_Parameter_Y nop.b 0 // Parameter 3 address } { .mib - stfe [GR_Parameter_Y] = f10 // Store Parameter 3 on stack + stfe [GR_Parameter_Y] 
= FR_RESULT // Store Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y br.call.sptk b0=__libm_error_support# // Call error handling function };; @@ -1097,11 +2539,13 @@ __libm_error_region: br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# -.type __libm_atan2_reg#,@function -.global __libm_atan2_reg# + + + + + diff --git a/sysdeps/ia64/fpu/e_asin.S b/sysdeps/ia64/fpu/e_asin.S index bb4c242fb2..398079eae4 100644 --- a/sysdeps/ia64/fpu/e_asin.S +++ b/sysdeps/ia64/fpu/e_asin.S @@ -1,10 +1,10 @@ .file "asin.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003 Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,818 +35,776 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // History //============================================================== -// 2/02/00 Initial version -// 8/17/00 New and much faster algorithm. 
-// 8/31/00 Avoided bank conflicts on loads, shortened |x|=1 path, +// 02/02/00 Initial version +// 08/17/00 New and much faster algorithm. +// 08/31/00 Avoided bank conflicts on loads, shortened |x|=1 path, // fixed mfb split issue stalls. // 12/19/00 Fixed small arg cases to force inexact, or inexact and underflow. +// 08/02/02 New and much faster algorithm II +// 02/06/03 Reordered header: .section, .global, .proc, .align // Description //========================================= -// The asin function computes the principle value of the arc sine of x. +// The asin function computes the principal value of the arc sine of x. // asin(0) returns 0, asin(1) returns pi/2, asin(-1) returns -pi/2. // A domain error occurs for arguments not in the range [-1,+1]. - +// // The asin function returns the arc sine in the range [-pi/2, +pi/2] radians. +// +// There are 8 paths: +// 1. x = +/-0.0 +// Return asin(x) = +/-0.0 +// +// 2. 0.0 < |x| < 0.625 +// Return asin(x) = x + x^3 *PolA(x^2) +// where PolA(x^2) = A3 + A5*x^2 + A7*x^4 +...+ A35*x^32 +// +// 3. 0.625 <=|x| < 1.0 +// Return asin(x) = sign(x) * ( Pi/2 - sqrt(R) * PolB(R)) +// Where R = 1 - |x|, +// PolB(R) = B0 + B1*R + B2*R^2 +...+B12*R^12 +// +// sqrt(R) is approximated using the following sequence: +// y0 = (1 + eps)/sqrt(R) - initial approximation by frsqrta, +// |eps| < 2^(-8) +// Then 3 iterations are used to refine the result: +// H0 = 0.5*y0 +// S0 = R*y0 +// +// d0 = 0.5 - H0*S0 +// H1 = H0 + d0*H0 +// S1 = S0 + d0*S0 +// +// d1 = 0.5 - H1*S1 +// H2 = H1 + d1*H1 +// S2 = S1 + d1*S1 +// +// d2 = 0.5 - H2*S2 +// S3 = S2 + d2*S2 +// +// S3 approximates sqrt(R) with enough accuracy for this algorithm +// +// So, the result should be reconstructed as follows: +// asin(x) = sign(x) * (Pi/2 - S3*PolB(R)) +// +// But for optimization purposes the reconstruction step is slightly +// changed: +// asin(x) = sign(x)*(Pi/2 - PolB(R)*S2) - sign(x)*d2*S2*PolB(R) +// +// 4. 
|x| = 1.0 +// Return asin(x) = sign(x)*Pi/2 +// +// 5. 1.0 < |x| <= +INF +// A domain error occurs for arguments not in the range [-1,+1] +// +// 6. x = [S,Q]NaN +// Return asin(x) = QNaN +// +// 7. x is denormal +// Return asin(x) = x + x^3, +// +// 8. x is unnormal +// Normalize input in f8 and return to the very beginning of the function +// +// Registers used +//============================================================== +// Floating Point registers used: +// f8, input, output +// f6, f7, f9 -> f15, f32 -> f63 -#include "libm_support.h" +// General registers used: +// r3, r21 -> r31, r32 -> r38 + +// Predicate registers used: +// p0, p6 -> p14 // // Assembly macros //========================================= - - -// predicate registers -//asin_pred_LEsqrt2by2 = p7 -//asin_pred_GTsqrt2by2 = p8 - -// integer registers -ASIN_Addr1 = r33 -ASIN_Addr2 = r34 -ASIN_FFFE = r35 -ASIN_lnorm_sig = r36 -ASIN_snorm_exp = r37 - -GR_SAVE_B0 = r36 -GR_SAVE_PFS = r37 -GR_SAVE_GP = r38 - -GR_Parameter_X = r39 -GR_Parameter_Y = r40 -GR_Parameter_RESULT = r41 -GR_Parameter_Tag = r42 - -// floating point registers -asin_coeff_P1 = f32 -asin_coeff_P2 = f33 -asin_coeff_P3 = f34 -asin_coeff_P4 = f35 - -asin_coeff_P5 = f36 -asin_coeff_P6 = f37 -asin_coeff_P7 = f38 -asin_coeff_P8 = f39 -asin_coeff_P9 = f40 - -asin_coeff_P10 = f41 -asin_coeff_P11 = f42 -asin_coeff_P12 = f43 -asin_coeff_P13 = f44 -asin_coeff_P14 = f45 - -asin_coeff_P15 = f46 -asin_coeff_P16 = f47 -asin_coeff_P17 = f48 -asin_coeff_P18 = f49 -asin_coeff_P19 = f50 - -asin_coeff_P20 = f51 -asin_coeff_P21 = f52 -asin_const_sqrt2by2 = f53 -asin_const_piby2 = f54 -asin_abs_x = f55 - -asin_tx = f56 -asin_tx2 = f57 -asin_tx3 = f58 -asin_tx4 = f59 -asin_tx8 = f60 - -asin_tx11 = f61 -asin_1poly_p8 = f62 -asin_1poly_p19 = f63 -asin_1poly_p4 = f64 -asin_1poly_p15 = f65 - -asin_1poly_p6 = f66 -asin_1poly_p17 = f67 -asin_1poly_p0 = f68 -asin_1poly_p11 = f69 -asin_1poly_p2 = f70 - -asin_1poly_p13 = f71 -asin_series_tx = f72 -asin_t = 
f73 -asin_t2 = f74 -asin_t3 = f75 - -asin_t4 = f76 -asin_t8 = f77 -asin_t11 = f78 -asin_poly_p8 = f79 -asin_poly_p19 = f80 - -asin_poly_p4 = f81 -asin_poly_p15 = f82 -asin_poly_p6 = f83 -asin_poly_p17 = f84 -asin_poly_p0 = f85 - -asin_poly_p11 = f86 -asin_poly_p2 = f87 -asin_poly_p13 = f88 -asin_series_t = f89 -asin_1by2 = f90 - -asin_3by2 = f91 -asin_5by2 = f92 -asin_11by4 = f93 -asin_35by8 = f94 -asin_63by8 = f95 - -asin_231by16 = f96 -asin_y0 = f97 -asin_H0 = f98 -asin_S0 = f99 -asin_d = f100 - -asin_l1 = f101 -asin_d2 = f102 -asin_T0 = f103 -asin_d1 = f104 -asin_e0 = f105 - -asin_l2 = f106 -asin_d3 = f107 -asin_T3 = f108 -asin_S1 = f109 -asin_e1 = f110 - -asin_z = f111 -answer2 = f112 -asin_sgn_x = f113 -asin_429by16 = f114 -asin_18by4 = f115 - -asin_3by4 = f116 -asin_l3 = f117 -asin_T6 = f118 -asin_eps_exp = f119 -asin_eps_sig = f120 -asin_eps = f120 - +// integer registers used +// scratch +rTblAddr = r3 + +rPiBy2Ptr = r21 +rTmpPtr3 = r22 +rDenoBound = r23 +rOne = r24 +rAbsXBits = r25 +rHalf = r26 +r0625 = r27 +rSign = r28 +rXBits = r29 +rTmpPtr2 = r30 +rTmpPtr1 = r31 + +// stacked +GR_SAVE_PFS = r32 +GR_SAVE_B0 = r33 +GR_SAVE_GP = r34 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Parameter_TAG = r38 + +// floating point registers used +FR_X = f10 +FR_Y = f1 +FR_RESULT = f8 + + +// scratch +fXSqr = f6 +fXCube = f7 +fXQuadr = f9 +f1pX = f10 +f1mX = f11 +f1pXRcp = f12 +f1mXRcp = f13 +fH = f14 +fS = f15 +// stacked +fA3 = f32 +fB1 = f32 +fA5 = f33 +fB2 = f33 +fA7 = f34 +fPiBy2 = f34 +fA9 = f35 +fA11 = f36 +fB10 = f35 +fB11 = f36 +fA13 = f37 +fA15 = f38 +fB4 = f37 +fB5 = f38 +fA17 = f39 +fA19 = f40 +fB6 = f39 +fB7 = f40 +fA21 = f41 +fA23 = f42 +fB3 = f41 +fB8 = f42 +fA25 = f43 +fA27 = f44 +fB9 = f43 +fB12 = f44 +fA29 = f45 +fA31 = f46 +fA33 = f47 +fA35 = f48 +fBaseP = f49 +fB0 = f50 +fSignedS = f51 +fD = f52 +fHalf = f53 +fR = f54 +fCloseTo1Pol = f55 +fSignX = f56 +fDenoBound = f57 +fNormX = f58 +fX8 = f59 +fRSqr = f60 +fRQuadr = f61 
+fR8 = f62 +fX16 = f63 // Data tables //============================================================== - -#ifdef _LIBC -.rodata -#else -.data -#endif - +RODATA .align 16 - -asin_coeff_1_table: -ASM_TYPE_DIRECTIVE(asin_coeff_1_table,@object) -data8 0xE4E7E0A423A21249 , 0x00003FF8 //P7 -data8 0xC2F7EE0200FCE2A5 , 0x0000C003 //P18 -data8 0xB745D7F6C65C20E0 , 0x00003FF9 //P5 -data8 0xF75E381A323D4D94 , 0x0000C002 //P16 -data8 0x8959C2629C1024C0 , 0x0000C002 //P20 -data8 0xAFF68E7D241292C5 , 0x00003FF8 //P9 -data8 0xB6DB6DB7260AC30D , 0x00003FFA //P3 -data8 0xD0417CE2B41CB7BF , 0x0000C000 //P14 -data8 0x81D570FEA724E3E4 , 0x0000BFFD //P12 -data8 0xAAAAAAAAAAAAC277 , 0x00003FFC //P1 -data8 0xF534912FF3E7B76F , 0x00003FFF //P21 -data8 0xc90fdaa22168c235 , 0x00003fff // pi/2 -data8 0x0000000000000000 , 0x00000000 // pad to avoid data bank conflict -ASM_SIZE_DIRECTIVE(asin_coeff_1_table) - - -asin_coeff_2_table: -ASM_TYPE_DIRECTIVE(asin_coeff_2_table,@object) -data8 0x8E26AF5F29B39A2A , 0x00003FF9 //P6 -data8 0xB4F118A4B1015470 , 0x00004003 //P17 -data8 0xF8E38E10C25990E0 , 0x00003FF9 //P4 -data8 0x80F50489AEF1CAC6 , 0x00004002 //P15 -data8 0x92728015172CFE1C , 0x00004003 //P19 -data8 0xBBC3D831D4595971 , 0x00003FF8 //P8 -data8 0x999999999952A5C3 , 0x00003FFB //P2 -data8 0x855576BE6F0975EC , 0x00003FFF //P13 -data8 0xF12420E778077D89 , 0x00003FFA //P11 -data8 0xB6590FF4D23DE003 , 0x00003FF3 //P10 -data8 0xb504f333f9de6484 , 0x00003ffe // sqrt(2)/2 -ASM_SIZE_DIRECTIVE(asin_coeff_2_table) +LOCAL_OBJECT_START(asin_base_range_table) +// Ai: Polynomial coefficients for the asin(x), |x| < .625000 +// Bi: Polynomial coefficients for the asin(x), |x| > .625000 +data8 0xBFDAAB56C01AE468 //A29 +data8 0x3FE1C470B76A5B2B //A31 +data8 0xBFDC5FF82A0C4205 //A33 +data8 0x3FC71FD88BFE93F0 //A35 +data8 0xB504F333F9DE6487, 0x00003FFF //B0 +data8 0xAAAAAAAAAAAAFC18, 0x00003FFC //A3 +data8 0x3F9F1C71BC4A7823 //A9 +data8 0x3F96E8BBAAB216B2 //A11 +data8 0x3F91C4CA1F9F8A98 //A13 +data8 
0x3F8C9DDCEDEBE7A6 //A15 +data8 0x3F877784442B1516 //A17 +data8 0x3F859C0491802BA2 //A19 +data8 0x9999999998C88B8F, 0x00003FFB //A5 +data8 0x3F6BD7A9A660BF5E //A21 +data8 0x3F9FC1659340419D //A23 +data8 0xB6DB6DB798149BDF, 0x00003FFA //A7 +data8 0xBFB3EF18964D3ED3 //A25 +data8 0x3FCD285315542CF2 //A27 +data8 0xF15BEEEFF7D2966A, 0x00003FFB //B1 +data8 0x3EF0DDA376D10FB3 //B10 +data8 0xBEB83CAFE05EBAC9 //B11 +data8 0x3F65FFB67B513644 //B4 +data8 0x3F5032FBB86A4501 //B5 +data8 0x3F392162276C7CBA //B6 +data8 0x3F2435949FD98BDF //B7 +data8 0xD93923D7FA08341C, 0x00003FF9 //B2 +data8 0x3F802995B6D90BDB //B3 +data8 0x3F10DF86B341A63F //B8 +data8 0xC90FDAA22168C235, 0x00003FFF // Pi/2 +data8 0x3EFA3EBD6B0ECB9D //B9 +data8 0x3EDE18BA080E9098 //B12 +LOCAL_OBJECT_END(asin_base_range_table) - -.align 32 -.global asin - .section .text -.proc asin -.align 32 - - -asin: - -{ .mfi - alloc r32 = ar.pfs,1,6,4,0 - fma.s1 asin_tx = f8,f8,f0 - addl ASIN_Addr2 = @ltoff(asin_coeff_2_table),gp -} -{ .mfi - mov ASIN_FFFE = 0xFFFE - fnma.s1 asin_t = f8,f8,f1 - addl ASIN_Addr1 = @ltoff(asin_coeff_1_table),gp +GLOBAL_LIBM_ENTRY(asin) +asin_unnormal_back: +{ .mfi + getf.d rXBits = f8 // grab bits of input value + // set p12 = 1 if x is a NaN, denormal, or zero + fclass.m p12, p0 = f8, 0xcf + adds rSign = 1, r0 +} +{ .mfi + addl rTblAddr = @ltoff(asin_base_range_table),gp + // 1 - x = 1 - |x| for positive x + fms.s1 f1mX = f1, f1, f8 + addl rHalf = 0xFFFE, r0 // exponent of 1/2 } ;; - - -{ .mfi - setf.exp asin_1by2 = ASIN_FFFE - fmerge.s asin_abs_x = f1,f8 - nop.i 999 ;; -} - -{ .mmf - ld8 ASIN_Addr1 = [ASIN_Addr1] - ld8 ASIN_Addr2 = [ASIN_Addr2] - fmerge.s asin_sgn_x = f8,f1 ;; -} - - -{ .mfi - ldfe asin_coeff_P7 = [ASIN_Addr1],16 - fma.s1 asin_tx2 = asin_tx,asin_tx,f0 - nop.i 999 -} -{ .mfi - ldfe asin_coeff_P6 = [ASIN_Addr2],16 - fma.s1 asin_t2 = asin_t,asin_t,f0 - nop.i 999;; +{ .mfi + addl r0625 = 0x3FE4, r0 // high 16 bits of 0.625 + // set p8 = 1 if x < 0 + fcmp.lt.s1 p8, p9 = f8, f0 + 
shl rSign = rSign, 63 // sign bit } - - -{ .mmf - ldfe asin_coeff_P18 = [ASIN_Addr1],16 - ldfe asin_coeff_P17 = [ASIN_Addr2],16 - fclass.m.unc p8,p0 = f8, 0xc3 //@qnan |@snan -} -;; - -{ .mmf - ldfe asin_coeff_P5 = [ASIN_Addr1],16 - ldfe asin_coeff_P4 = [ASIN_Addr2],16 - frsqrta.s1 asin_y0,p0 = asin_t -} -;; - -{ .mfi - ldfe asin_coeff_P16 = [ASIN_Addr1],16 - fcmp.gt.s1 p9,p0 = asin_abs_x,f1 - nop.i 999 -} -{ .mfb - ldfe asin_coeff_P15 = [ASIN_Addr2],16 -(p8) fma.d f8 = f8,f1,f0 -(p8) br.ret.spnt b0 +{ .mfi + // point to the beginning of the table + ld8 rTblAddr = [rTblAddr] + // 1 + x = 1 - |x| for negative x + fma.s1 f1pX = f1, f1, f8 + adds rOne = 0x3FF, r0 } ;; - - -{ .mmf - ldfe asin_coeff_P20 = [ASIN_Addr1],16 - ldfe asin_coeff_P19 = [ASIN_Addr2],16 - fclass.m.unc p8,p0 = f8, 0x07 //@zero -} -;; - - -{ .mfi - ldfe asin_coeff_P9 = [ASIN_Addr1],16 - fma.s1 asin_t4 = asin_t2,asin_t2,f0 -(p9) mov GR_Parameter_Tag = 61 -} -{ .mfi - ldfe asin_coeff_P8 = [ASIN_Addr2],16 - fma.s1 asin_3by2 = asin_1by2,f1,f1 - nop.i 999;; +{ .mfi + andcm rAbsXBits = rXBits, rSign // bits of |x| + fmerge.s fSignX = f8, f1 // signum(x) + shl r0625 = r0625, 48 // bits of DP representation of 0.625 } - - -{ .mfi - ldfe asin_coeff_P2 = [ASIN_Addr2],16 - fma.s1 asin_tx4 = asin_tx2,asin_tx2,f0 - nop.i 999 -} -{ .mfb - ldfe asin_coeff_P3 = [ASIN_Addr1],16 - fma.s1 asin_t3 = asin_t,asin_t2,f0 -(p8) br.ret.spnt b0 +{ .mfb + setf.exp fHalf = rHalf // load A2 to FP reg + fma.s1 fXSqr = f8, f8, f0 // x^2 + // branch on special path if x is a NaN, denormal, or zero +(p12) br.cond.spnt asin_special } ;; - - -{ .mfi - ldfe asin_coeff_P13 = [ASIN_Addr2],16 - fma.s1 asin_H0 = asin_y0,asin_1by2,f0 - nop.i 999 -} -{ .mfb - ldfe asin_coeff_P14 = [ASIN_Addr1],16 - fma.s1 asin_S0 = asin_y0,asin_t,f0 -(p9) br.cond.spnt __libm_error_region +{ .mfi + adds rPiBy2Ptr = 272, rTblAddr + nop.f 0 + shl rOne = rOne, 52 // bits of 1.0 +} +{ .mfi + adds rTmpPtr1 = 16, rTblAddr + nop.f 0 + // set p6 = 1 if |x| < 0.625 + 
cmp.lt p6, p7 = rAbsXBits, r0625 } ;; - - -{ .mfi - ldfe asin_coeff_P11 = [ASIN_Addr2],16 - fcmp.eq.s1 p6,p0 = asin_abs_x,f1 - nop.i 999 -} -{ .mfi - ldfe asin_coeff_P12 = [ASIN_Addr1],16 - fma.s1 asin_tx3 = asin_tx,asin_tx2,f0 - nop.i 999;; +{ .mfi + ldfpd fA29, fA31 = [rTblAddr] // A29, fA31 + // 1 - x = 1 - |x| for positive x +(p9) fms.s1 fR = f1, f1, f8 + // point to coefficient of "near 1" polynomial +(p7) adds rTmpPtr2 = 176, rTblAddr } - - -{ .mfi - ldfe asin_coeff_P10 = [ASIN_Addr2],16 - fma.s1 asin_1poly_p6 = asin_tx,asin_coeff_P7,asin_coeff_P6 - nop.i 999 -} -{ .mfi - ldfe asin_coeff_P1 = [ASIN_Addr1],16 - fma.s1 asin_poly_p6 = asin_t,asin_coeff_P7,asin_coeff_P6 - nop.i 999;; +{ .mfi + ldfpd fA33, fA35 = [rTmpPtr1], 16 // A33, fA35 + // 1 + x = 1 - |x| for negative x +(p8) fma.s1 fR = f1, f1, f8 +(p6) adds rTmpPtr2 = 48, rTblAddr } - - -{ .mfi - ldfe asin_const_sqrt2by2 = [ASIN_Addr2],16 - fma.s1 asin_5by2 = asin_3by2,f1,f1 - nop.i 999 -} -{ .mfi - ldfe asin_coeff_P21 = [ASIN_Addr1],16 - fma.s1 asin_11by4 = asin_3by2,asin_3by2,asin_1by2 - nop.i 999;; +;; +{ .mfi + ldfe fB0 = [rTmpPtr1], 16 // B0 + nop.f 0 + nop.i 0 } - - -{ .mfi - ldfe asin_const_piby2 = [ASIN_Addr1],16 - fma.s1 asin_poly_p17 = asin_t,asin_coeff_P18,asin_coeff_P17 - nop.i 999 -} -{ .mfb - nop.m 999 - fma.s1 asin_3by4 = asin_3by2,asin_1by2,f0 -(p6) br.cond.spnt L(ASIN_ABS_1) // Branch to short exit if |x|=1 +{ .mib + adds rTmpPtr3 = 16, rTmpPtr2 + // set p10 = 1 if |x| = 1.0 + cmp.eq p10, p0 = rAbsXBits, rOne + // branch on special path for |x| = 1.0 +(p10) br.cond.spnt asin_abs_1 } ;; - - -{ .mfi - addl ASIN_lnorm_sig = -0x1,r0 // Form significand 0xffffffffffffffff - fma.s1 asin_poly_p15 = asin_t,asin_coeff_P16,asin_coeff_P15 - nop.i 999 -} -{ .mfi - addl ASIN_snorm_exp = 0x0c001,r0 // Form small exponent - fnma.s1 asin_d = asin_S0,asin_H0,asin_1by2 - nop.i 999;; +{ .mfi + ldfe fA3 = [rTmpPtr2], 48 // A3 or B1 + nop.f 0 + adds rTmpPtr1 = 64, rTmpPtr3 } - - -// Form the exponent and 
significand of a small number -{ .mfi - setf.sig asin_eps_sig = ASIN_lnorm_sig - fma.s1 asin_poly_p19 = asin_t,asin_coeff_P20,asin_coeff_P19 - nop.i 999 -} -{ .mfi - setf.exp asin_eps_exp = ASIN_snorm_exp - fma.s1 asin_poly_p4 = asin_t,asin_coeff_P5,asin_coeff_P4 - nop.i 999;; +{ .mib + ldfpd fA9, fA11 = [rTmpPtr3], 16 // A9, A11 or B10, B11 + // set p11 = 1 if |x| > 1.0 + cmp.gt p11, p0 = rAbsXBits, rOne + // branch on special path for |x| > 1.0 +(p11) br.cond.spnt asin_abs_gt_1 } - - -{ .mfi - nop.m 999 - fma.s1 asin_1poly_p17 = asin_tx,asin_coeff_P18,asin_coeff_P17 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_poly_p8 = asin_t,asin_coeff_P9,asin_coeff_P8 - nop.i 999;; +;; +{ .mfi + ldfpd fA17, fA19 = [rTmpPtr2], 16 // A17, A19 or B6, B7 + // initial approximation of 1 / sqrt(1 - x) + frsqrta.s1 f1mXRcp, p0 = f1mX + nop.i 0 } - - -{ .mfi - nop.m 999 - fms.s1 asin_35by8 = asin_5by2,asin_11by4,asin_5by2 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_63by8 = asin_5by2,asin_11by4,f1 - nop.i 999;; +{ .mfi + ldfpd fA13, fA15 = [rTmpPtr3] // A13, A15 or B4, B5 + fma.s1 fXCube = fXSqr, f8, f0 // x^3 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_poly_p13 = asin_t,asin_coeff_P14,asin_coeff_P13 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_18by4 = asin_3by2,asin_5by2,asin_3by4 - nop.i 999;; +;; +{ .mfi + ldfe fA5 = [rTmpPtr2], 48 // A5 or B2 + // initial approximation of 1 / sqrt(1 + x) + frsqrta.s1 f1pXRcp, p0 = f1pX + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_l1 = asin_5by2,asin_d,asin_3by2 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_d2 = asin_d,asin_d,f0 - nop.i 999;; +{ .mfi + ldfpd fA21, fA23 = [rTmpPtr1], 16 // A21, A23 or B3, B8 + fma.s1 fXQuadr = fXSqr, fXSqr, f0 // x^4 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_poly_p15 = asin_t2,asin_poly_p17,asin_poly_p15 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_T0 = asin_d,asin_S0,f0 - nop.i 999;; +;; +{ .mfi + ldfe fA7 = [rTmpPtr1] // A7 or Pi/2 + fma.s1 fRSqr = fR, fR, f0 // R^2 + nop.i 0 
} - - -{ .mfi - nop.m 999 - fma.s1 asin_poly_p19 = asin_t2,asin_coeff_P21,asin_poly_p19 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_poly_p4 = asin_t2,asin_poly_p6,asin_poly_p4 - nop.i 999;; +{ .mfb + ldfpd fA25, fA27 = [rTmpPtr2] // A25, A27 or B9, B12 + nop.f 0 +(p6) br.cond.spnt asin_base_range; } +;; - -{ .mfi - nop.m 999 - fma.s1 asin_d1 = asin_35by8,asin_d,f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_231by16 = asin_3by2,asin_35by8,asin_63by8 - nop.i 999;; +{ .mfi + nop.m 0 +(p9) fma.s1 fH = fHalf, f1mXRcp, f0 // H0 for x > 0 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_poly_p2 = asin_t,asin_coeff_P3,asin_coeff_P2 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_poly_p8 = asin_t2,asin_coeff_P10,asin_poly_p8 - nop.i 999;; +{ .mfi + nop.m 0 +(p9) fma.s1 fS = f1mX, f1mXRcp, f0 // S0 for x > 0 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_poly_p11 = asin_t,asin_coeff_P12,asin_coeff_P11 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_e0 = asin_d2,asin_l1,asin_d - nop.i 999;; +;; +{ .mfi + nop.m 0 +(p8) fma.s1 fH = fHalf, f1pXRcp, f0 // H0 for x < 0 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_1poly_p15 = asin_tx,asin_coeff_P16,asin_coeff_P15 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_poly_p0 = asin_t,asin_coeff_P1,f1 - nop.i 999;; +{ .mfi + nop.m 0 +(p8) fma.s1 fS = f1pX, f1pXRcp, f0 // S0 for x > 0 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_1poly_p19 = asin_tx,asin_coeff_P20,asin_coeff_P19 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_1poly_p4 = asin_tx,asin_coeff_P5,asin_coeff_P4 - nop.i 999;; +;; +{ .mfi + nop.m 0 + fma.s1 fRQuadr = fRSqr, fRSqr, f0 // R^4 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_1poly_p8 = asin_tx,asin_coeff_P9,asin_coeff_P8 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_l2 = asin_231by16,asin_d,asin_63by8 - nop.i 999;; +;; +{ .mfi + nop.m 0 + fma.s1 fB11 = fB11, fR, fB10 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_d3 = asin_d2,asin_d,f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 
asin_T3 = asin_d2,asin_T0,f0 - nop.i 999;; +{ .mfi + nop.m 0 + fma.s1 fB1 = fB1, fR, fB0 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_429by16 = asin_18by4,asin_11by4,asin_231by16 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_S1 = asin_e0,asin_S0,asin_S0 - nop.i 999;; +;; +{ .mfi + nop.m 0 + fma.s1 fB5 = fB5, fR, fB4 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_poly_p4 = asin_t4,asin_poly_p8,asin_poly_p4 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_poly_p15 = asin_t4,asin_poly_p19,asin_poly_p15 - nop.i 999;; +{ .mfi + nop.m 0 + fma.s1 fB7 = fB7, fR, fB6 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_poly_p0 = asin_t2,asin_poly_p2,asin_poly_p0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_poly_p11 = asin_t2,asin_poly_p13,asin_poly_p11 - nop.i 999;; +;; +{ .mfi + nop.m 0 + fma.s1 fB3 = fB3, fR, fB2 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_t8 = asin_t4,asin_t4,f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_e1 = asin_d2,asin_l2,asin_d1 - nop.i 999;; +;; +{ .mfi + nop.m 0 + fnma.s1 fD = fH, fS, fHalf // d0 = 1/2 - H0*S0 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_1poly_p4 = asin_tx2,asin_1poly_p6,asin_1poly_p4 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_1poly_p15 = asin_tx2,asin_1poly_p17,asin_1poly_p15 - nop.i 999;; +;; +{ .mfi + nop.m 0 + fma.s1 fR8 = fRQuadr, fRQuadr, f0 // R^4 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_1poly_p8 = asin_tx2,asin_coeff_P10,asin_1poly_p8 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_1poly_p19 = asin_tx2,asin_coeff_P21,asin_1poly_p19 - nop.i 999;; +{ .mfi + nop.m 0 + fma.s1 fB9 = fB9, fR, fB8 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_1poly_p2 = asin_tx,asin_coeff_P3,asin_coeff_P2 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_1poly_p13 = asin_tx,asin_coeff_P14,asin_coeff_P13 - nop.i 999;; +;; +{.mfi + nop.m 0 + fma.s1 fB12 = fB12, fRSqr, fB11 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_1poly_p0 = asin_tx,asin_coeff_P1,f1 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 
asin_1poly_p11 = asin_tx,asin_coeff_P12,asin_coeff_P11 - nop.i 999;; +{.mfi + nop.m 0 + fma.s1 fB7 = fB7, fRSqr, fB5 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_l3 = asin_429by16,asin_d,f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_z = asin_e1,asin_T3,asin_S1 - nop.i 999;; +;; +{.mfi + nop.m 0 + fma.s1 fB3 = fB3, fRSqr, fB1 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_poly_p11 = asin_t4,asin_poly_p15,asin_poly_p11 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_T6 = asin_T3,asin_d3,f0 - nop.i 999;; +;; +{ .mfi + nop.m 0 + fma.s1 fH = fH, fD, fH // H1 = H0 + H0*d0 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_t11 = asin_t8,asin_t3,f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_poly_p0 = asin_t4,asin_poly_p4,asin_poly_p0 - nop.i 999;; +{ .mfi + nop.m 0 + fma.s1 fS = fS, fD, fS // S1 = S0 + S0*d0 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_1poly_p4 = asin_tx4,asin_1poly_p8,asin_1poly_p4 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_1poly_p15 = asin_tx4,asin_1poly_p19,asin_1poly_p15 - nop.i 999;; +;; +{.mfi + nop.m 0 + fma.s1 fPiBy2 = fPiBy2, fSignX, f0 // signum(x)*Pi/2 + nop.i 0 } - - -{ .mfi - nop.m 999 - fma.s1 asin_1poly_p0 = asin_tx2,asin_1poly_p2,asin_1poly_p0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_1poly_p11 = asin_tx2,asin_1poly_p13,asin_1poly_p11 - nop.i 999;; +;; +{ .mfi + nop.m 0 + fma.s1 fB12 = fB12, fRSqr, fB9 + nop.i 0 } - - -{ .mfi - nop.m 999 -// fcmp.le.s1 asin_pred_LEsqrt2by2,asin_pred_GTsqrt2by2 = asin_abs_x,asin_const_sqrt2by2 - fcmp.le.s1 p7,p8 = asin_abs_x,asin_const_sqrt2by2 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_tx8 = asin_tx4,asin_tx4,f0 - nop.i 999;; +{ .mfi + nop.m 0 + fma.s1 fB7 = fB7, fRQuadr, fB3 + nop.i 0 } - - -// Form a small number to force inexact flag for small args -{ .mfi - nop.m 999 - fmerge.se asin_eps = asin_eps_exp,asin_eps_sig - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_z = asin_l3,asin_T6,asin_z - nop.i 999;; -} - -{ .mfi - nop.m 999 - fma.s1 asin_series_t = 
asin_t11,asin_poly_p11,asin_poly_p0 - nop.i 999;; -} - -{ .mfi - nop.m 999 - fma.s1 asin_1poly_p0 = asin_tx4,asin_1poly_p4,asin_1poly_p0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 asin_1poly_p11 = asin_tx4,asin_1poly_p15,asin_1poly_p11 - nop.i 999;; +;; +{.mfi + nop.m 0 + fnma.s1 fD = fH, fS, fHalf // d1 = 1/2 - H1*S1 + nop.i 0 +} +{ .mfi + nop.m 0 + fnma.s1 fSignedS = fSignX, fS, f0 // -signum(x)*S1 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fCloseTo1Pol = fB12, fR8, fB7 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fH = fH, fD, fH // H2 = H1 + H1*d1 + nop.i 0 +} +{ .mfi + nop.m 0 + fma.s1 fS = fS, fD, fS // S2 = S1 + S1*d1 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + // -signum(x)* S2 = -signum(x)*(S1 + S1*d1) + fma.s1 fSignedS = fSignedS, fD, fSignedS + nop.i 0 +} +;; +{.mfi + nop.m 0 + fnma.s1 fD = fH, fS, fHalf // d2 = 1/2 - H2*S2 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + // signum(x)*(Pi/2 - PolB*S2) + fma.s1 fPiBy2 = fSignedS, fCloseTo1Pol, fPiBy2 + nop.i 0 +} +{ .mfi + nop.m 0 + // -signum(x)*PolB * S2 + fma.s1 fCloseTo1Pol = fSignedS, fCloseTo1Pol, f0 + nop.i 0 +} +;; +{ .mfb + nop.m 0 + // final result for 0.625 <= |x| < 1 + fma.d.s0 f8 = fCloseTo1Pol, fD, fPiBy2 + // exit here for 0.625 <= |x| < 1 + br.ret.sptk b0 } +;; - -{ .mfi - nop.m 999 - fma.s1 asin_tx11 = asin_tx8,asin_tx3,f0 - nop.i 999;; -} - -{ .mfi - nop.m 999 -//(asin_pred_GTsqrt2by2) fnma.s1 answer2 = asin_z,asin_series_t,asin_const_piby2 -(p8) fnma.s1 answer2 = asin_z,asin_series_t,asin_const_piby2 - nop.i 999;; -} - -{ .mfi - nop.m 999 - fma.s1 asin_series_tx = asin_tx11,asin_1poly_p11,asin_1poly_p0 - nop.i 999;; -} - -{ .mfi - nop.m 999 -//(asin_pred_GTsqrt2by2) fma.d f8 = asin_sgn_x,answer2,f0 -(p8) fma.d f8 = asin_sgn_x,answer2,f0 - nop.i 999;; -} - -// asin_eps is added only to force inexact and possibly underflow flag -// in case asin_series_tx is zero -// -{ .mfi - nop.m 999 -(p7) fma.d asin_eps = f8,asin_series_tx,asin_eps - nop.i 999 -} -{ .mfb - nop.m 999 -//(asin_pred_LEsqrt2by2) fma.d f8 = 
f8,asin_series_tx,f0 -(p7) fma.d f8 = f8,asin_series_tx,f0 - br.ret.sptk b0 -} + +// here if |x| < 0.625 +.align 32 +asin_base_range: +{ .mfi + nop.m 0 + fma.s1 fA33 = fA33, fXSqr, fA31 + nop.i 0 +} +{ .mfi + nop.m 0 + fma.s1 fA15 = fA15, fXSqr, fA13 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fA29 = fA29, fXSqr, fA27 + nop.i 0 +} +{ .mfi + nop.m 0 + fma.s1 fA25 = fA25, fXSqr, fA23 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fA21 = fA21, fXSqr, fA19 + nop.i 0 +} +{ .mfi + nop.m 0 + fma.s1 fA9 = fA9, fXSqr, fA7 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fA5 = fA5, fXSqr, fA3 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fA35 = fA35, fXQuadr, fA33 + nop.i 0 +} +{ .mfi + nop.m 0 + fma.s1 fA17 = fA17, fXQuadr, fA15 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fX8 = fXQuadr, fXQuadr, f0 // x^8 + nop.i 0 +} +{ .mfi + nop.m 0 + fma.s1 fA25 = fA25, fXQuadr, fA21 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fA9 = fA9, fXQuadr, fA5 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fA35 = fA35, fXQuadr, fA29 + nop.i 0 +} +{ .mfi + nop.m 0 + fma.s1 fA17 = fA17, fXSqr, fA11 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fX16 = fX8, fX8, f0 // x^16 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fA35 = fA35, fX8, fA25 + nop.i 0 +} +{ .mfi + nop.m 0 + fma.s1 fA17 = fA17, fX8, fA9 + nop.i 0 +} +;; +{ .mfi + nop.m 0 + fma.s1 fBaseP = fA35, fX16, fA17 + nop.i 0 +} +;; +{ .mfb + nop.m 0 + // final result for |x| < 0.625 + fma.d.s0 f8 = fBaseP, fXCube, f8 + // exit here for |x| < 0.625 path + br.ret.sptk b0 +} ;; +// here if |x| = 1 +// asin(x) = sign(x) * Pi/2 +.align 32 +asin_abs_1: +{ .mfi + ldfe fPiBy2 = [rPiBy2Ptr] // Pi/2 + nop.f 0 + nop.i 0 +} +;; +{.mfb + nop.m 0 + // result for |x| = 1.0 + fma.d.s0 f8 = fPiBy2, fSignX, f0 + // exit here for |x| = 1.0 + br.ret.sptk b0 +} +;; -L(ASIN_ABS_1): -// Here for short exit if |x|=1 -{ .mfb - nop.m 999 - fma.d f8 = asin_sgn_x,asin_const_piby2,f0 - br.ret.sptk b0 -} +// here if x is a NaN, denormal, or zero +.align 32 +asin_special: +{ .mfi + nop.m 0 + 
// set p12 = 1 if x is a NaN + fclass.m p12, p0 = f8, 0xc3 + nop.i 0 +} +{ .mlx + nop.m 0 + // smallest positive DP normalized number + movl rDenoBound = 0x0010000000000000 +} +;; +{ .mfi + nop.m 0 + // set p13 = 1 if x = 0.0 + fclass.m p13, p0 = f8, 0x07 + nop.i 0 +} +{ .mfi + nop.m 0 + fnorm.s1 fNormX = f8 + nop.i 0 +} +;; +{ .mfb + // load smallest normal to FP reg + setf.d fDenoBound = rDenoBound + // answer if x is a NaN +(p12) fma.d.s0 f8 = f8,f1,f0 + // exit here if x is a NaN +(p12) br.ret.spnt b0 +} +;; +{ .mfb + nop.m 0 + nop.f 0 + // exit here if x = 0.0 +(p13) br.ret.spnt b0 +} +;; +// if we are still here then x is denormal or unnormal +{ .mfi + nop.m 0 + // absolute value of normalized x + fmerge.s fNormX = f1, fNormX + nop.i 0 +} +;; +{ .mfi + nop.m 0 + // set p14 = 1 if normalized x is greater than or + // equal to the smallest normalized value + // So, if p14 is set to 1 it means that we deal with + // unnormal rather than with "true" denormal + fcmp.ge.s1 p14, p0 = fNormX, fDenoBound + nop.i 0 +} +;; +{ .mfi + nop.m 0 +(p14) fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag if x unnormal + nop.i 0 +} +{ .mfb + nop.m 0 + // normalize unnormal input +(p14) fnorm.s1 f8 = f8 + // return to the main path +(p14) br.cond.sptk asin_unnormal_back +} +;; +// if we are still here it means that the input is "true" denormal +{ .mfb + nop.m 0 + // final result if x is denormal + fma.d.s0 f8 = f8, fXSqr, f8 + // exit here if x is denormal + br.ret.sptk b0 +} ;; +// here if |x| > 1.0 +// error handler should be called +.align 32 +asin_abs_gt_1: +{ .mfi + alloc r32 = ar.pfs, 0, 3, 4, 0 // get some registers + fmerge.s FR_X = f8,f8 + nop.i 0 +} +{ .mfb + mov GR_Parameter_TAG = 61 // error code + frcpa.s0 FR_RESULT, p0 = f0,f0 + // call error handler routine + br.cond.sptk __libm_error_region +} +;; +GLOBAL_LIBM_END(asin) -.endp asin -ASM_SIZE_DIRECTIVE(asin) -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add 
GR_Parameter_Y=-32,sp // Parameter 2 value - nop.f 999 + nop.f 0 .save ar.pfs,GR_SAVE_PFS mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } @@ -857,28 +815,29 @@ __libm_error_region: mov GR_SAVE_GP=gp // Save gp };; { .mmi - stfs [GR_Parameter_Y] = f1,16 // Store Parameter 2 on stack + stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 mov GR_SAVE_B0=b0 // Save b0 };; - .body - frcpa.s0 f9,p0 = f0,f0 -;; - { .mib - stfd [GR_Parameter_X] = f8 // Store Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y - nop.b 0 // Parameter 3 address + stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 } { .mib - stfd [GR_Parameter_Y] = f9,-16 // Store Parameter 3 on stack - adds r32 = 48,sp - br.call.sptk b0=__libm_error_support# // Call error handling function + stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi - ldfd f8 = [r32] // Get return result off stack + add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 +};; +{ .mmi + ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp add sp = 64,sp // Restore stack pointer mov b0 = GR_SAVE_B0 // Restore return address @@ -887,11 +846,8 @@ __libm_error_region: mov gp = GR_SAVE_GP // Restore gp mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs br.ret.sptk b0 // Return - };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) - -.type __libm_error_support,@function -.global __libm_error_support +LOCAL_LIBM_END(__libm_error_region) +.type __libm_error_support#,@function +.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_asinf.S b/sysdeps/ia64/fpu/e_asinf.S index ddae85880b..f9a1312b26 100644 --- a/sysdeps/ia64/fpu/e_asinf.S +++ b/sysdeps/ia64/fpu/e_asinf.S @@ -1,10 +1,10 @@ .file "asinf.s" 
-// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. // -// Contributed 2/02/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,21 +35,25 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // History //============================================================== -// 2/02/00 Initial revision -// 6/28/00 Improved speed -// 6/31/00 Changed register allocation because of some duplicate macros +// 02/02/00 Initial version +// 06/28/00 Improved speed +// 06/31/00 Changed register allocation because of some duplicate macros // moved nan exit bundle up to gain a cycle. -// 8/08/00 Improved speed by avoiding SIR flush. -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 08/08/00 Improved speed by avoiding SIR flush. +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. 
-// 8/17/00 Changed predicate register macro-usage to direct predicate +// 08/17/00 Changed predicate register macro-usage to direct predicate // names due to an assembler bug. // 10/17/00 Improved speed of x=0 and x=1 paths, set D flag if x denormal. +// 03/13/01 Corrected sign of imm1 value in dep instruction. +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/06/03 Reordered header: .section, .global, .proc, .align + // Description //========================================= // The asinf function computes the arc sine of x in the range [-pi/2,+pi/2]. @@ -119,7 +123,6 @@ // answer2 = - sign(x) z P(t) + (sign(x) pi/2) // -#include "libm_support.h" // Assembly macros //========================================= @@ -225,42 +228,30 @@ asinf_poly_p1a = f90 // Data tables //============================================================== -#ifdef _LIBC -.rodata -#else -.data -#endif +RODATA .align 16 -asinf_coeff_1_table: -ASM_TYPE_DIRECTIVE(asinf_coeff_1_table,@object) +LOCAL_OBJECT_START(asinf_coeff_1_table) data8 0x3FC5555607DCF816 // P1 data8 0x3F9CF81AD9BAB2C6 // P4 data8 0x3FC59E0975074DF3 // P7 data8 0xBFA6F4CC2780AA1D // P6 data8 0x3FC2DD45292E93CB // P9 data8 0x3fe6a09e667f3bcd // sqrt(2)/2 -ASM_SIZE_DIRECTIVE(asinf_coeff_1_table) +LOCAL_OBJECT_END(asinf_coeff_1_table) -asinf_coeff_2_table: -ASM_TYPE_DIRECTIVE(asinf_coeff_2_table,@object) +LOCAL_OBJECT_START(asinf_coeff_2_table) data8 0x3FA6F108E31EFBA6 // P3 data8 0xBFCA31BF175D82A0 // P8 data8 0x3FA30C0337F6418B // P5 data8 0x3FB332C9266CB1F9 // P2 data8 0x3ff921fb54442d18 // pi_by_2 -ASM_SIZE_DIRECTIVE(asinf_coeff_2_table) +LOCAL_OBJECT_END(asinf_coeff_2_table) -.align 32 -.global asinf - .section .text -.proc asinf -.align 32 - -asinf: +GLOBAL_LIBM_ENTRY(asinf) // Load the addresses of the two tables. // Then, load the coefficients and other constants. 
@@ -345,7 +336,7 @@ asinf: } { .mfb nop.m 999 -(p8) fma.s f8 = f8,f1,f0 +(p8) fma.s.s0 f8 = f8,f1,f0 (p8) br.ret.spnt b0 ;; // Exit if x=nan } @@ -370,7 +361,7 @@ asinf: { .mfb nop.m 999 fma.s1 asinf_t4 = asinf_t2,asinf_t2,f0 -(p6) br.cond.spnt L(ASINF_ABS_ONE) ;; // Branch if |x|=1 +(p6) br.cond.spnt ASINF_ABS_ONE ;; // Branch if |x|=1 } { .mfi @@ -572,28 +563,26 @@ asinf: .pred.rel "mutex",p8,p7 //asinf_pred_GTsqrt2by2,asinf_pred_LEsqrt2by2 { .mfi nop.m 999 -(p8) fnma.s f8 = asinf_z,asinf_Pt,asinf_sgn_x_piby2 +(p8) fnma.s.s0 f8 = asinf_z,asinf_Pt,asinf_sgn_x_piby2 nop.i 999 } { .mfb nop.m 999 -(p7) fma.s f8 = asinf_x11,asinf_poly_Bx,asinf_poly_Ax +(p7) fma.s.s0 f8 = asinf_x11,asinf_poly_Bx,asinf_poly_Ax br.ret.sptk b0 ;; } -L(ASINF_ABS_ONE): +ASINF_ABS_ONE: // Here for short exit if |x|=1 { .mfb nop.m 999 - fma.s f8 = asinf_sgn_x,asinf_const_piby2,f0 + fma.s.s0 f8 = asinf_sgn_x,asinf_const_piby2,f0 br.ret.sptk b0 } ;; -.endp asinf -ASM_SIZE_DIRECTIVE(asinf) - +GLOBAL_LIBM_END(asinf) // Stack operations when calling error support. // (1) (2) // sp -> + psp -> + @@ -623,8 +612,7 @@ ASM_SIZE_DIRECTIVE(asinf) // restore gp // restore ar.pfs -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value @@ -680,8 +668,7 @@ __libm_error_region: br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_asinl.S b/sysdeps/ia64/fpu/e_asinl.S index 9153832090..bf5feba155 100644 --- a/sysdeps/ia64/fpu/e_asinl.S +++ b/sysdeps/ia64/fpu/e_asinl.S @@ -1,10 +1,10 @@ .file "asinl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2001 - 2003, Intel Corporation // All rights reserved. 
-// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2001 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,720 +20,2448 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial version -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly -// set [the previously overwritten] GR_Parameter_RESULT. +// 08/28/01 New version +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/06/03 Reordered header: .section, .global, .proc, .align // // API //============================================================== -// long double = asinl(long double) -// input floating point f8 -// output floating point f8 +// long double asinl(long double) // -// Registers used +// Overview of operation //============================================================== +// Background // -// predicate registers used: -// p6 -> p12 +// Implementation // -// floating-point registers used: -// f8 has input, then output -// f32 -> f87, f8 -> f13, f32 -> f87 +// For |s| in [2^{-4}, sqrt(2)/2]: +// Let t= 2^k*1.b1 b2..b6 1, where s= 2^k*1.b1 b2.. b52 +// asin(s)= asin(t)+asin(r), where r= s*sqrt(1-t^2)-t*sqrt(1-s^2), i.e. +// r= (s-t)*sqrt(1-t^2)-t*sqrt(1-t^2)*(sqrt((1-s^2)/(1-t^2))-1) +// asin(r)-r evaluated as 9-degree polynomial (c3*r^3+c5*r^5+c7*r^7+c9*r^9) +// The 64-bit significands of sqrt(1-t^2), 1/(1-t^2) are read from the table, +// along with the high and low parts of asin(t) (stored as two double precision +// values) // -// general registers used: -// r32 -> r47 +// |s| in (sqrt(2)/2, sqrt(255/256)): +// Let t= 2^k*1.b1 b2..b6 1, where (1-s^2)*frsqrta(1-s^2)= 2^k*1.b1 b2..b6.. 
+// asin(|s|)= pi/2-asin(t)+asin(r), r= s*t-sqrt(1-s^2)*sqrt(1-t^2) +// To minimize accumulated errors, r is computed as +// r= (t*s)_s-t^2*y*z+z*y*(t^2-1+s^2)_s+z*y*(1-s^2)_s*x+z'*y*(1-s^2)*PS29+ +// +(t*s-(t*s)_s)+z*y*((t^2-1-(t^2-1+s^2)_s)+s^2)+z*y*(1-s^2-(1-s^2)_s)+ +// +ez*z'*y*(1-s^2)*(1-x), +// where y= frsqrta(1-s^2), z= (sqrt(1-t^2))_s (rounded to 24 significant bits) +// z'= sqrt(1-t^2), x= ((1-s^2)*y^2-1)/2 +// +// |s|<2^{-4}: evaluate as 17-degree polynomial +// (or simply return s, if|s|<2^{-64}) +// +// |s| in [sqrt(255/256), 1): asin(|s|)= pi/2-asin(sqrt(1-s^2)) +// use 17-degree polynomial for asin(sqrt(1-s^2)), +// 9-degree polynomial to evaluate sqrt(1-s^2) +// High order term is (pi/2)_high-(y*(1-s^2))_high // -// Overview of operation -//============================================================== -// There are three paths -// 1. |x| < 2^-40 ASIN_TINY -// 2. 2^-40 <= |x| < 1/4 ASIN_POLY -// 3. 1/4 <= |x| < 1 ASIN_ATAN -#include "libm_support.h" -// Assembly macros -//============================================================== -FR_RESULT = f10 -FR_X = f8 -FR_Y = f1 -asin_P79 = f32 -asin_P59 = f33 -asin_P39 = f34 -asin_P19 = f35 - -asin_P810 = f36 -asin_P610 = f37 -asin_P410 = f38 -asin_P210 = f39 - -asin_A1 = f41 -asin_A2 = f42 -asin_A3 = f43 -asin_A4 = f44 -asin_A5 = f45 -asin_A6 = f46 -asin_A7 = f47 -asin_A8 = f48 -asin_A9 = f49 -asin_A10 = f50 - -asin_X2 = f51 -asin_X4 = f52 - -asin_B = f53 -asin_Bb = f54 -asin_C = f55 -asin_Cc = f56 -asin_D = f57 - -asin_W = f58 -asin_Ww = f59 - -asin_y0 = f60 -asin_y1 = f61 -asin_y2 = f62 - -asin_H = f63 -asin_Hh = f64 - -asin_t1 = f65 -asin_t2 = f66 -asin_t3 = f67 -asin_t4 = f68 -asin_t5 = f69 - -asin_Pseries = f70 -asin_NORM_f8 = f71 -asin_ABS_NORM_f8 = f72 - -asin_2m100 = f73 -asin_P1P2 = f74 -asin_HALF = f75 -asin_1mD = f76 - -asin_1mB = f77 -asin_1mBmC = f78 -asin_S = f79 - -asin_BmWW = f80 -asin_BmWWpb = f81 -asin_2W = f82 -asin_1d2W = f83 -asin_Dd = f84 - -asin_XWw = f85 -asin_low = f86 - 
-asin_pi_by_2 = f87 -asin_pi_by_2_lo = f88 - -asin_GR_17_ones = r33 -asin_GR_16_ones = r34 -asin_GR_signexp_f8 = r35 -asin_GR_exp = r36 -asin_GR_true_exp = r37 -asin_GR_ff9b = r38 - -GR_SAVE_B0 = r39 -GR_SAVE_SP = r40 -GR_SAVE_PFS = r33 -// r33 can be used safely. -// r40 is address of table of coefficients -// Later it is used to save sp across calls -GR_SAVE_GP = r41 -asin_GR_fffe = r42 -asin_GR_retval = r43 - -GR_Parameter_X = r44 -GR_Parameter_Y = r45 -GR_Parameter_RESULT = r46 -GR_Parameter_TAG = r47 - - -// 2^-40: -// A true exponent of -40 is -// : -40 + register_bias -// : -28 + ffff = ffd7 - -// A true exponent of -100 is -// : -100 + register_bias -// : -64 + ffff = ff9b - -// Data tables + +// Registers used //============================================================== +// f6-f15, f32-f36 +// r2-r3, r23-r23 +// p6, p7, p8, p12 +// + + + GR_SAVE_B0= r33 + GR_SAVE_PFS= r34 + GR_SAVE_GP= r35 // This reg. can safely be used + GR_SAVE_SP= r36 + + GR_Parameter_X= r37 + GR_Parameter_Y= r38 + GR_Parameter_RESULT= r39 + GR_Parameter_TAG= r40 -#ifdef _LIBC -.rodata -#else -.data -#endif + FR_X= f10 + FR_Y= f1 + FR_RESULT= f8 + + + +RODATA .align 16 -asin_coefficients: -ASM_TYPE_DIRECTIVE(asin_coefficients,@object) -data8 0xBB08911F2013961E, 0x00003FF8 // A10 -data8 0x981F1095A23A87D3, 0x00003FF8 // A9 -data8 0xBDF09C6C4177BCC6, 0x00003FF8 // A8 -data8 0xE4C3A60B049ACCEA, 0x00003FF8 // A7 -data8 0x8E2789F4E8A8F1AD, 0x00003FF9 // A6 -data8 0xB745D09B2B0E850B, 0x00003FF9 // A5 -data8 0xF8E38E3BC4C50920, 0x00003FF9 // A4 -data8 0xB6DB6DB6D89FCD81, 0x00003FFA // A3 -data8 0x99999999999AF376, 0x00003FFB // A2 -data8 0xAAAAAAAAAAAAAA71, 0x00003FFC // A1 - -data8 0xc90fdaa22168c234, 0x00003FFF // pi_by_2_hi -data8 0xc4c6628b80dc1cd1, 0x00003FBF // pi_by_2_lo -ASM_SIZE_DIRECTIVE(asin_coefficients) - -.align 32 -.global asinl# + + +LOCAL_OBJECT_START(T_table) + +// stores 64-bit significand of 1/(1-t^2), 64-bit significand of sqrt(1-t^2), +// asin(t)_high (double 
precision), asin(t)_low (double precision) + +data8 0x80828692b71c4391, 0xff7ddcec2d87e879 +data8 0x3fb022bc0ae531a0, 0x3c9f599c7bb42af6 +data8 0x80869f0163d0b082, 0xff79cad2247914d3 +data8 0x3fb062dd26afc320, 0x3ca4eff21bd49c5c +data8 0x808ac7d5a8690705, 0xff75a89ed6b626b9 +data8 0x3fb0a2ff4a1821e0, 0x3cb7e33b58f164cc +data8 0x808f0112ad8ad2e0, 0xff7176517c2cc0cb +data8 0x3fb0e32279319d80, 0x3caee31546582c43 +data8 0x80934abba8a1da0a, 0xff6d33e949b1ed31 +data8 0x3fb12346b8101da0, 0x3cb8bfe463d087cd +data8 0x8097a4d3dbe63d8f, 0xff68e16571015c63 +data8 0x3fb1636c0ac824e0, 0x3c8870a7c5a3556f +data8 0x809c0f5e9662b3dd, 0xff647ec520bca0f0 +data8 0x3fb1a392756ed280, 0x3c964f1a927461ae +data8 0x80a08a5f33fadc66, 0xff600c07846a6830 +data8 0x3fb1e3b9fc19e580, 0x3c69eb3576d56332 +data8 0x80a515d91d71acd4, 0xff5b892bc475affa +data8 0x3fb223e2a2dfbe80, 0x3c6a4e19fd972fb6 +data8 0x80a9b1cfc86ff7cd, 0xff56f631062cf93d +data8 0x3fb2640c6dd76260, 0x3c62041160e0849e +data8 0x80ae5e46b78b0d68, 0xff5253166bc17794 +data8 0x3fb2a43761187c80, 0x3cac61651af678c0 +data8 0x80b31b417a4b756b, 0xff4d9fdb14463dc8 +data8 0x3fb2e46380bb6160, 0x3cb06ef23eeba7a1 +data8 0x80b7e8c3ad33c369, 0xff48dc7e1baf6738 +data8 0x3fb32490d0d910c0, 0x3caa05f480b300d5 +data8 0x80bcc6d0f9c784d6, 0xff4408fe9ad13e37 +data8 0x3fb364bf558b3820, 0x3cb01e7e403aaab9 +data8 0x80c1b56d1692492d, 0xff3f255ba75f5f4e +data8 0x3fb3a4ef12ec3540, 0x3cb4fe8fcdf5f5f1 +data8 0x80c6b49bc72ec446, 0xff3a319453ebd961 +data8 0x3fb3e5200d171880, 0x3caf2dc089b2b7e2 +data8 0x80cbc460dc4e0ae8, 0xff352da7afe64ac6 +data8 0x3fb425524827a720, 0x3cb75a855e7c6053 +data8 0x80d0e4c033bee9c4, 0xff301994c79afb32 +data8 0x3fb46585c83a5e00, 0x3cb3264981c019ab +data8 0x80d615bdb87556db, 0xff2af55aa431f291 +data8 0x3fb4a5ba916c73c0, 0x3c994251d94427b5 +data8 0x80db575d6291fd8a, 0xff25c0f84bae0cb9 +data8 0x3fb4e5f0a7dbdb20, 0x3cbee2fcc4c786cb +data8 0x80e0a9a33769e535, 0xff207c6cc0ec09fd +data8 0x3fb526280fa74620, 0x3c940656e5549b91 +data8 
0x80e60c93498e32cd, 0xff1b27b703a19c98 +data8 0x3fb56660ccee2740, 0x3ca7082374d7b2cd +data8 0x80eb8031b8d4052d, 0xff15c2d6105c72f8 +data8 0x3fb5a69ae3d0b520, 0x3c7c4d46e09ac68a +data8 0x80f10482b25c6c8a, 0xff104dc8e0813ed4 +data8 0x3fb5e6d6586fec20, 0x3c9aa84ffd9b4958 +data8 0x80f6998a709c7cfb, 0xff0ac88e6a4ab926 +data8 0x3fb627132eed9140, 0x3cbced2cbbbe7d16 +data8 0x80fc3f4d3b657c44, 0xff053325a0c8a2ec +data8 0x3fb667516b6c34c0, 0x3c6489c5fc68595a +data8 0x8101f5cf67ed2af8, 0xfeff8d8d73dec2bb +data8 0x3fb6a791120f33a0, 0x3cbe12acf159dfad +data8 0x8107bd1558d6291f, 0xfef9d7c4d043df29 +data8 0x3fb6e7d226fabba0, 0x3ca386d099cd0dc7 +data8 0x810d95237e38766a, 0xfef411ca9f80b5f7 +data8 0x3fb72814ae53cc20, 0x3cb9f35731e71dd6 +data8 0x81137dfe55aa0e29, 0xfeee3b9dc7eef009 +data8 0x3fb76858ac403a00, 0x3c74df3dd959141a +data8 0x811977aa6a479f0f, 0xfee8553d2cb8122c +data8 0x3fb7a89e24e6b0e0, 0x3ca6034406ee42bc +data8 0x811f822c54bd5ef8, 0xfee25ea7add46a91 +data8 0x3fb7e8e51c6eb6a0, 0x3cb82f8f78e68ed7 +data8 0x81259d88bb4ffac1, 0xfedc57dc2809fb1d +data8 0x3fb8292d9700ad60, 0x3cbebb73c0e653f9 +data8 0x812bc9c451e5a257, 0xfed640d974eb6068 +data8 0x3fb8697798c5d620, 0x3ca2feee76a9701b +data8 0x813206e3da0f3124, 0xfed0199e6ad6b585 +data8 0x3fb8a9c325e852e0, 0x3cb9e88f2f4d0efe +data8 0x813854ec231172f9, 0xfec9e229dcf4747d +data8 0x3fb8ea1042932a00, 0x3ca5ff40d81f66fd +data8 0x813eb3e209ee858f, 0xfec39a7a9b36538b +data8 0x3fb92a5ef2f247c0, 0x3cb5e3bece4d6b07 +data8 0x814523ca796f56ce, 0xfebd428f72561efe +data8 0x3fb96aaf3b3281a0, 0x3cb7b9e499436d7c +data8 0x814ba4aa6a2d3ff9, 0xfeb6da672bd48fe4 +data8 0x3fb9ab011f819860, 0x3cb9168143cc1a7f +data8 0x81523686e29bbdd7, 0xfeb062008df81f50 +data8 0x3fb9eb54a40e3ac0, 0x3cb6e544197eb1e1 +data8 0x8158d964f7124614, 0xfea9d95a5bcbd65a +data8 0x3fba2ba9cd080800, 0x3ca9a717be8f7446 +data8 0x815f8d49c9d639e4, 0xfea34073551e1ac8 +data8 0x3fba6c009e9f9260, 0x3c741e989a60938a +data8 0x8166523a8b24f626, 0xfe9c974a367f785c +data8 0x3fbaac591d0661a0, 
0x3cb2c1290107e57d +data8 0x816d283c793e0114, 0xfe95ddddb94166cb +data8 0x3fbaecb34c6ef600, 0x3c9c7d5fbaec405d +data8 0x81740f54e06d55bd, 0xfe8f142c93750c50 +data8 0x3fbb2d0f310cca00, 0x3cbc09479a9cbcfb +data8 0x817b07891b15cd5e, 0xfe883a3577e9fceb +data8 0x3fbb6d6ccf1455e0, 0x3cb9450bff4ee307 +data8 0x818210de91bba6c8, 0xfe814ff7162cf62f +data8 0x3fbbadcc2abb1180, 0x3c9227fda12a8d24 +data8 0x81892b5abb0f2bf9, 0xfe7a55701a8697b1 +data8 0x3fbbee2d48377700, 0x3cb6fad72acfe356 +data8 0x819057031bf7760e, 0xfe734a9f2dfa1810 +data8 0x3fbc2e902bc10600, 0x3cb4465b588d16ad +data8 0x819793dd479d4fbe, 0xfe6c2f82f643f68b +data8 0x3fbc6ef4d9904580, 0x3c8b9ac54823960d +data8 0x819ee1eedf76367a, 0xfe65041a15d8a92c +data8 0x3fbcaf5b55dec6a0, 0x3ca2b8d28a954db2 +data8 0x81a6413d934f7a66, 0xfe5dc8632be3477f +data8 0x3fbcefc3a4e727a0, 0x3c9380da83713ab4 +data8 0x81adb1cf21597d4b, 0xfe567c5cd44431d5 +data8 0x3fbd302dcae51600, 0x3ca995b83421756a +data8 0x81b533a9563310b8, 0xfe4f2005a78fb50f +data8 0x3fbd7099cc155180, 0x3caefa2f7a817d5f +data8 0x81bcc6d20cf4f373, 0xfe47b35c3b0caaeb +data8 0x3fbdb107acb5ae80, 0x3cb455fc372dd026 +data8 0x81c46b4f2f3d6e68, 0xfe40365f20b316d6 +data8 0x3fbdf177710518c0, 0x3cbee3dcc5b01434 +data8 0x81cc2126b53c1144, 0xfe38a90ce72abf36 +data8 0x3fbe31e91d439620, 0x3cb3e131c950aebd +data8 0x81d3e85ea5bd8ee2, 0xfe310b6419c9c33a +data8 0x3fbe725cb5b24900, 0x3c01d3fac6029027 +data8 0x81dbc0fd1637b9c1, 0xfe295d6340932d15 +data8 0x3fbeb2d23e937300, 0x3c6304cc44aeedd1 +data8 0x81e3ab082ad5a0a4, 0xfe219f08e03580b3 +data8 0x3fbef349bc2a77e0, 0x3cac1d2d6abe9c72 +data8 0x81eba6861683cb97, 0xfe19d0537a0946e2 +data8 0x3fbf33c332bbe020, 0x3ca0909dba4e96ca +data8 0x81f3b37d1afc9979, 0xfe11f1418c0f94e2 +data8 0x3fbf743ea68d5b60, 0x3c937fc12a2a779a +data8 0x81fbd1f388d4be45, 0xfe0a01d190f09063 +data8 0x3fbfb4bc1be5c340, 0x3cbf51a504b55813 +data8 0x820401efbf87e248, 0xfe020201fff9efea +data8 0x3fbff53b970d1e80, 0x3ca625444b260078 +data8 0x82106ad2ffdca049, 0xfdf5e3940a49135e 
+data8 0x3fc02aff52065460, 0x3c9125d113e22a57 +data8 0x8221343d6ea1d3e2, 0xfde581a45429b0a0 +data8 0x3fc06b84f8e03220, 0x3caccf362295894b +data8 0x82324434adbf99c2, 0xfdd4de1a001fb775 +data8 0x3fc0ac0ed1fe7240, 0x3cc22f676096b0af +data8 0x82439aee8d0c7747, 0xfdc3f8e8269d1f03 +data8 0x3fc0ec9cee9e4820, 0x3cca147e2886a628 +data8 0x825538a1d0fcb2f0, 0xfdb2d201a9b1ba66 +data8 0x3fc12d2f6006f0a0, 0x3cc72b36633bc2d4 +data8 0x82671d86345c5cee, 0xfda1695934d723e7 +data8 0x3fc16dc63789de60, 0x3cb11f9c47c7b83f +data8 0x827949d46a121770, 0xfd8fbee13cbbb823 +data8 0x3fc1ae618682e620, 0x3cce1b59020cef8e +data8 0x828bbdc61eeab9ba, 0xfd7dd28bff0c9f34 +data8 0x3fc1ef015e586c40, 0x3cafec043e0225ee +data8 0x829e7995fb6de9e1, 0xfd6ba44b823ee1ca +data8 0x3fc22fa5d07b90c0, 0x3cba905409caf8e3 +data8 0x82b17d7fa5bbc982, 0xfd5934119557883a +data8 0x3fc2704eee685da0, 0x3cb5ef21838a823e +data8 0x82c4c9bfc373d276, 0xfd4681cfcfb2c161 +data8 0x3fc2b0fcc9a5f3e0, 0x3ccc7952c5e0e312 +data8 0x82d85e93fba50136, 0xfd338d7790ca0f41 +data8 0x3fc2f1af73c6ba00, 0x3cbecf5f977d1ca9 +data8 0x82ec3c3af8c76b32, 0xfd2056f9fff97727 +data8 0x3fc33266fe6889a0, 0x3c9d329c022ebdb5 +data8 0x830062f46abf6022, 0xfd0cde480c43b327 +data8 0x3fc373237b34de60, 0x3cc95806d4928adb +data8 0x8314d30108ea35f0, 0xfcf923526c1562b2 +data8 0x3fc3b3e4fbe10520, 0x3cbc299fe7223d54 +data8 0x83298ca29434df97, 0xfce526099d0737ed +data8 0x3fc3f4ab922e4a60, 0x3cb59d8bb8fdbccc +data8 0x833e901bd93c7009, 0xfcd0e65de39f1f7c +data8 0x3fc435774fea2a60, 0x3c9ec18b43340914 +data8 0x8353ddb0b278aad8, 0xfcbc643f4b106055 +data8 0x3fc4764846ee80a0, 0x3cb90402efd87ed6 +data8 0x836975a60a70c52e, 0xfca79f9da4fab13a +data8 0x3fc4b71e8921b860, 0xbc58f23449ed6365 +data8 0x837f5841ddfa7a46, 0xfc92986889284148 +data8 0x3fc4f7fa2876fca0, 0xbc6294812bf43acd +data8 0x839585cb3e839773, 0xfc7d4e8f554ab12f +data8 0x3fc538db36ee6960, 0x3cb910b773d4c578 +data8 0x83abfe8a5466246f, 0xfc67c2012cb6fa68 +data8 0x3fc579c1c6953cc0, 0x3cc5ede909fc47fc +data8 
0x83c2c2c861474d91, 0xfc51f2acf82041d5 +data8 0x3fc5baade9860880, 0x3cac63cdfc3588e5 +data8 0x83d9d2cfc2813637, 0xfc3be08165519325 +data8 0x3fc5fb9fb1e8e3a0, 0x3cbf7c8466578c29 +data8 0x83f12eebf397daac, 0xfc258b6ce6e6822f +data8 0x3fc63c9731f39d40, 0x3cb6d2a7ffca3e9e +data8 0x8408d76990b9296e, 0xfc0ef35db402af94 +data8 0x3fc67d947be9eec0, 0x3cb1980da09e6566 +data8 0x8420cc9659487cd7, 0xfbf81841c8082dc4 +data8 0x3fc6be97a21daf00, 0x3cc2ac8330e59aa5 +data8 0x84390ec132759ecb, 0xfbe0fa06e24cc390 +data8 0x3fc6ffa0b6ef05e0, 0x3ccc1a030fee56c4 +data8 0x84519e3a29df811a, 0xfbc9989a85ce0954 +data8 0x3fc740afcccca000, 0x3cc19692a5301ca6 +data8 0x846a7b527842d61b, 0xfbb1f3e9f8e45dc4 +data8 0x3fc781c4f633e2c0, 0x3cc0e98f3868a508 +data8 0x8483a65c8434b5f0, 0xfb9a0be244f4af45 +data8 0x3fc7c2e045b12140, 0x3cb2a8d309754420 +data8 0x849d1fabe4e97dd7, 0xfb81e070362116d1 +data8 0x3fc80401cddfd120, 0x3ca7a44544aa4ce6 +data8 0x84b6e795650817ea, 0xfb6971805af8411e +data8 0x3fc84529a16ac020, 0x3c9e3b709c7d6f94 +data8 0x84d0fe6f0589da92, 0xfb50beff0423a2f5 +data8 0x3fc88657d30c49e0, 0x3cc60d65a7f0a278 +data8 0x84eb649000a73014, 0xfb37c8d84414755c +data8 0x3fc8c78c758e8e80, 0x3cc94b2ee984c2b7 +data8 0x85061a50ccd13781, 0xfb1e8ef7eeaf764b +data8 0x3fc908c79bcba900, 0x3cc8540ae794a2fe +data8 0x8521200b1fb8916e, 0xfb05114998f76a83 +data8 0x3fc94a0958ade6c0, 0x3ca127f49839fa9c +data8 0x853c7619f1618bf6, 0xfaeb4fb898b65d19 +data8 0x3fc98b51bf2ffee0, 0x3c8c9ba7a803909a +data8 0x85581cd97f45e274, 0xfad14a3004259931 +data8 0x3fc9cca0e25d4ac0, 0x3cba458e91d3bf54 +data8 0x857414a74f8446b4, 0xfab7009ab1945a54 +data8 0x3fca0df6d551fe80, 0x3cc78ea1d329d2b2 +data8 0x85905de2341dea46, 0xfa9c72e3370d2fbc +data8 0x3fca4f53ab3b6200, 0x3ccf60dca86d57ef +data8 0x85acf8ea4e423ff8, 0xfa81a0f3e9fa0ee9 +data8 0x3fca90b777580aa0, 0x3ca4c4e2ec8a867e +data8 0x85c9e62111a92e7d, 0xfa668ab6dec711b1 +data8 0x3fcad2224cf814e0, 0x3c303de5980d071c +data8 0x85e725e947fbee97, 0xfa4b3015e883dbfe +data8 0x3fcb13943f7d5f80, 
0x3cc29d4eefa5cb1e +data8 0x8604b8a7144cd054, 0xfa2f90fa9883a543 +data8 0x3fcb550d625bc6a0, 0x3c9e01a746152daf +data8 0x86229ebff69e2415, 0xfa13ad4e3dfbe1c1 +data8 0x3fcb968dc9195ea0, 0x3ccc091bd73ae518 +data8 0x8640d89acf78858c, 0xf9f784f9e5a1877b +data8 0x3fcbd815874eb160, 0x3cb5f4b89875e187 +data8 0x865f669fe390c7f5, 0xf9db17e65944eacf +data8 0x3fcc19a4b0a6f9c0, 0x3cc5c0bc2b0bbf14 +data8 0x867e4938df7dc45f, 0xf9be65fc1f6c2e6e +data8 0x3fcc5b3b58e061e0, 0x3cc1ca70df8f57e7 +data8 0x869d80d0db7e4c0c, 0xf9a16f237aec427a +data8 0x3fcc9cd993cc4040, 0x3cbae93acc85eccf +data8 0x86bd0dd45f4f8265, 0xf98433446a806e70 +data8 0x3fccde7f754f5660, 0x3cb22f70e64568d0 +data8 0x86dcf0b16613e37a, 0xf966b246a8606170 +data8 0x3fcd202d11620fa0, 0x3c962030e5d4c849 +data8 0x86fd29d7624b3d5d, 0xf948ec11a9d4c45b +data8 0x3fcd61e27c10c0a0, 0x3cc7083c91d59217 +data8 0x871db9b741dbe44a, 0xf92ae08c9eca4941 +data8 0x3fcda39fc97be7c0, 0x3cc9258579e57211 +data8 0x873ea0c3722d6af2, 0xf90c8f9e71633363 +data8 0x3fcde5650dd86d60, 0x3ca4755a9ea582a9 +data8 0x875fdf6fe45529e8, 0xf8edf92dc5875319 +data8 0x3fce27325d6fe520, 0x3cbc1e2b6c1954f9 +data8 0x878176321154e2bc, 0xf8cf1d20f87270b8 +data8 0x3fce6907cca0d060, 0x3cb6ca4804750830 +data8 0x87a36580fe6bccf5, 0xf8affb5e20412199 +data8 0x3fceaae56fdee040, 0x3cad6b310d6fd46c +data8 0x87c5add5417a5cb9, 0xf89093cb0b7c0233 +data8 0x3fceeccb5bb33900, 0x3cc16e99cedadb20 +data8 0x87e84fa9057914ca, 0xf870e64d40a15036 +data8 0x3fcf2eb9a4bcb600, 0x3cc75ee47c8b09e9 +data8 0x880b4b780f02b709, 0xf850f2c9fdacdf78 +data8 0x3fcf70b05fb02e20, 0x3cad6350d379f41a +data8 0x882ea1bfc0f228ac, 0xf830b926379e6465 +data8 0x3fcfb2afa158b8a0, 0x3cce0ccd9f829985 +data8 0x885252ff21146108, 0xf810394699fe0e8e +data8 0x3fcff4b77e97f3e0, 0x3c9b30faa7a4c703 +data8 0x88765fb6dceebbb3, 0xf7ef730f865f6df0 +data8 0x3fd01b6406332540, 0x3cdc5772c9e0b9bd +data8 0x88ad1f69be2cc730, 0xf7bdc59bc9cfbd97 +data8 0x3fd04cf8ad203480, 0x3caeef44fe21a74a +data8 0x88f763f70ae2245e, 0xf77a91c868a9c54e 
+data8 0x3fd08f23ce0162a0, 0x3cd6290ab3fe5889 +data8 0x89431fc7bc0c2910, 0xf73642973c91298e +data8 0x3fd0d1610f0c1ec0, 0x3cc67401a01f08cf +data8 0x8990573407c7738e, 0xf6f0d71d1d7a2dd6 +data8 0x3fd113b0c65d88c0, 0x3cc7aa4020fe546f +data8 0x89df0eb108594653, 0xf6aa4e6a05cfdef2 +data8 0x3fd156134ada6fe0, 0x3cc87369da09600c +data8 0x8a2f4ad16e0ed78a, 0xf662a78900c35249 +data8 0x3fd19888f43427a0, 0x3cc62b220f38e49c +data8 0x8a811046373e0819, 0xf619e180181d97cc +data8 0x3fd1db121aed7720, 0x3ca3ede7490b52f4 +data8 0x8ad463df6ea0fa2c, 0xf5cffb504190f9a2 +data8 0x3fd21daf185fa360, 0x3caafad98c1d6c1b +data8 0x8b294a8cf0488daf, 0xf584f3f54b8604e6 +data8 0x3fd2606046bf95a0, 0x3cdb2d704eeb08fa +data8 0x8b7fc95f35647757, 0xf538ca65c960b582 +data8 0x3fd2a32601231ec0, 0x3cc661619fa2f126 +data8 0x8bd7e588272276f8, 0xf4eb7d92ff39fccb +data8 0x3fd2e600a3865760, 0x3c8a2a36a99aca4a +data8 0x8c31a45bf8e9255e, 0xf49d0c68cd09b689 +data8 0x3fd328f08ad12000, 0x3cb9efaf1d7ab552 +data8 0x8c8d0b520a35eb18, 0xf44d75cd993cfad2 +data8 0x3fd36bf614dcc040, 0x3ccacbb590bef70d +data8 0x8cea2005d068f23d, 0xf3fcb8a23ab4942b +data8 0x3fd3af11a079a6c0, 0x3cd9775872cf037d +data8 0x8d48e837c8cd5027, 0xf3aad3c1e2273908 +data8 0x3fd3f2438d754b40, 0x3ca03304f667109a +data8 0x8da969ce732f3ac7, 0xf357c60202e2fd7e +data8 0x3fd4358c3ca032e0, 0x3caecf2504ff1a9d +data8 0x8e0baad75555e361, 0xf3038e323ae9463a +data8 0x3fd478ec0fd419c0, 0x3cc64bdc3d703971 +data8 0x8e6fb18807ba877e, 0xf2ae2b1c3a6057f7 +data8 0x3fd4bc6369fa40e0, 0x3cbb7122ec245cf2 +data8 0x8ed5843f4bda74d5, 0xf2579b83aa556f0c +data8 0x3fd4fff2af11e2c0, 0x3c9cfa2dc792d394 +data8 0x8f3d29862c861fef, 0xf1ffde2612ca1909 +data8 0x3fd5439a4436d000, 0x3cc38d46d310526b +data8 0x8fa6a81128940b2d, 0xf1a6f1bac0075669 +data8 0x3fd5875a8fa83520, 0x3cd8bf59b8153f8a +data8 0x901206c1686317a6, 0xf14cd4f2a730d480 +data8 0x3fd5cb33f8cf8ac0, 0x3c9502b5c4d0e431 +data8 0x907f4ca5fe9cf739, 0xf0f186784a125726 +data8 0x3fd60f26e847b120, 0x3cc8a1a5e0acaa33 +data8 
0x90ee80fd34aeda5e, 0xf09504ef9a212f18 +data8 0x3fd65333c7e43aa0, 0x3cae5b029cb1f26e +data8 0x915fab35e37421c6, 0xf0374ef5daab5c45 +data8 0x3fd6975b02b8e360, 0x3cd5aa1c280c45e6 +data8 0x91d2d2f0d894d73c, 0xefd86321822dbb51 +data8 0x3fd6db9d05213b20, 0x3cbecf2c093ccd8b +data8 0x9248000249200009, 0xef7840021aca5a72 +data8 0x3fd71ffa3cc87fc0, 0x3cb8d273f08d00d9 +data8 0x92bf3a7351f081d2, 0xef16e42021d7cbd5 +data8 0x3fd7647318b1ad20, 0x3cbce099d79cdc46 +data8 0x93388a8386725713, 0xeeb44dfce6820283 +data8 0x3fd7a908093fc1e0, 0x3ccb033ec17a30d9 +data8 0x93b3f8aa8e653812, 0xee507c126774fa45 +data8 0x3fd7edb9803e3c20, 0x3cc10aedb48671eb +data8 0x94318d99d341ade4, 0xedeb6cd32f891afb +data8 0x3fd83287f0e9cf80, 0x3c994c0c1505cd2a +data8 0x94b1523e3dedc630, 0xed851eaa3168f43c +data8 0x3fd87773cff956e0, 0x3cda3b7bce6a6b16 +data8 0x95334fc20577563f, 0xed1d8ffaa2279669 +data8 0x3fd8bc7d93a70440, 0x3cd4922edc792ce2 +data8 0x95b78f8e8f92f274, 0xecb4bf1fd2be72da +data8 0x3fd901a5b3b9cf40, 0x3cd3fea1b00f9d0d +data8 0x963e1b4e63a87c3f, 0xec4aaa6d08694cc1 +data8 0x3fd946eca98f2700, 0x3cdba4032d968ff1 +data8 0x96c6fcef314074fc, 0xebdf502d53d65fea +data8 0x3fd98c52f024e800, 0x3cbe7be1ab8c95c9 +data8 0x97523ea3eab028b2, 0xeb72aea36720793e +data8 0x3fd9d1d904239860, 0x3cd72d08a6a22b70 +data8 0x97dfeae6f4ee4a9a, 0xeb04c4096a884e94 +data8 0x3fda177f63e8ef00, 0x3cd818c3c1ebfac7 +data8 0x98700c7c6d85d119, 0xea958e90cfe1efd7 +data8 0x3fda5d468f92a540, 0x3cdf45fbfaa080fe +data8 0x9902ae7487a9caa1, 0xea250c6224aab21a +data8 0x3fdaa32f090998e0, 0x3cd715a9353cede4 +data8 0x9997dc2e017a9550, 0xe9b33b9ce2bb7638 +data8 0x3fdae939540d3f00, 0x3cc545c014943439 +data8 0x9a2fa158b29b649b, 0xe9401a573f8aa706 +data8 0x3fdb2f65f63f6c60, 0x3cd4a63c2f2ca8e2 +data8 0x9aca09f835466186, 0xe8cba69df9f0bf35 +data8 0x3fdb75b5773075e0, 0x3cda310ce1b217ec +data8 0x9b672266ab1e0136, 0xe855de74266193d4 +data8 0x3fdbbc28606babc0, 0x3cdc84b75cca6c44 +data8 0x9c06f7579f0b7bd5, 0xe7debfd2f98c060b +data8 0x3fdc02bf3d843420, 
0x3cd225d967ffb922 +data8 0x9ca995db058cabdc, 0xe76648a991511c6e +data8 0x3fdc497a9c224780, 0x3cde08101c5b825b +data8 0x9d4f0b605ce71e88, 0xe6ec76dcbc02d9a7 +data8 0x3fdc905b0c10d420, 0x3cb1abbaa3edf120 +data8 0x9df765b9eecad5e6, 0xe6714846bdda7318 +data8 0x3fdcd7611f4b8a00, 0x3cbf6217ae80aadf +data8 0x9ea2b320350540fe, 0xe5f4bab71494cd6b +data8 0x3fdd1e8d6a0d56c0, 0x3cb726e048cc235c +data8 0x9f51023562fc5676, 0xe576cbf239235ecb +data8 0x3fdd65e082df5260, 0x3cd9e66872bd5250 +data8 0xa002620915c2a2f6, 0xe4f779b15f5ec5a7 +data8 0x3fddad5b02a82420, 0x3c89743b0b57534b +data8 0xa0b6e21c2caf9992, 0xe476c1a233a7873e +data8 0x3fddf4fd84bbe160, 0x3cbf7adea9ee3338 +data8 0xa16e9264cc83a6b2, 0xe3f4a16696608191 +data8 0x3fde3cc8a6ec6ee0, 0x3cce46f5a51f49c6 +data8 0xa22983528f3d8d49, 0xe3711694552da8a8 +data8 0x3fde84bd099a6600, 0x3cdc78f6490a2d31 +data8 0xa2e7c5d2e2e69460, 0xe2ec1eb4e1e0a5fb +data8 0x3fdeccdb4fc685c0, 0x3cdd3aedb56a4825 +data8 0xa3a96b5599bd2532, 0xe265b74506fbe1c9 +data8 0x3fdf15241f23b3e0, 0x3cd440f3c6d65f65 +data8 0xa46e85d1ae49d7de, 0xe1ddddb499b3606f +data8 0x3fdf5d98202994a0, 0x3cd6c44bd3fb745a +data8 0xa53727ca3e11b99e, 0xe1548f662951b00d +data8 0x3fdfa637fe27bf60, 0x3ca8ad1cd33054dd +data8 0xa6036453bdc20186, 0xe0c9c9aeabe5e481 +data8 0x3fdfef0467599580, 0x3cc0f1ac0685d78a +data8 0xa6d34f1969dda338, 0xe03d89d5281e4f81 +data8 0x3fe01bff067d6220, 0x3cc0731e8a9ef057 +data8 0xa7a6fc62f7246ff3, 0xdfafcd125c323f54 +data8 0x3fe04092d1ae3b40, 0x3ccabda24b59906d +data8 0xa87e811a861df9b9, 0xdf20909061bb9760 +data8 0x3fe0653df0fd9fc0, 0x3ce94c8dcc722278 +data8 0xa959f2d2dd687200, 0xde8fd16a4e5f88bd +data8 0x3fe08a00c1cae320, 0x3ce6b888bb60a274 +data8 0xaa3967cdeea58bda, 0xddfd8cabd1240d22 +data8 0x3fe0aedba3221c00, 0x3ced5941cd486e46 +data8 0xab904fd587263c84, 0xdd1f4472e1cf64ed +data8 0x3fe0e651e85229c0, 0x3cdb6701042299b1 +data8 0xad686d44dd5a74bb, 0xdbf173e1f6b46e92 +data8 0x3fe1309cbf4cdb20, 0x3cbf1be7bb3f0ec5 +data8 0xaf524e15640ebee4, 0xdabd54896f1029f6 
+data8 0x3fe17b4ee1641300, 0x3ce81dd055b792f1 +data8 0xb14eca24ef7db3fa, 0xd982cb9ae2f47e41 +data8 0x3fe1c66b9ffd6660, 0x3cd98ea31eb5ddc7 +data8 0xb35ec807669920ce, 0xd841bd1b8291d0b6 +data8 0x3fe211f66db3a5a0, 0x3ca480c35a27b4a2 +data8 0xb5833e4755e04dd1, 0xd6fa0bd3150b6930 +data8 0x3fe25df2e05b6c40, 0x3ca4bc324287a351 +data8 0xb7bd34c8000b7bd3, 0xd5ab9939a7d23aa1 +data8 0x3fe2aa64b32f7780, 0x3cba67314933077c +data8 0xba0dc64d126cc135, 0xd4564563ce924481 +data8 0x3fe2f74fc9289ac0, 0x3cec1a1dc0efc5ec +data8 0xbc76222cbbfa74a6, 0xd2f9eeed501125a8 +data8 0x3fe344b82f859ac0, 0x3ceeef218de413ac +data8 0xbef78e31985291a9, 0xd19672e2182f78be +data8 0x3fe392a22087b7e0, 0x3cd2619ba201204c +data8 0xc19368b2b0629572, 0xd02baca5427e436a +data8 0x3fe3e11206694520, 0x3cb5d0b3143fe689 +data8 0xc44b2ae8c6733e51, 0xceb975d60b6eae5d +data8 0x3fe4300c7e945020, 0x3cbd367143da6582 +data8 0xc7206b894212dfef, 0xcd3fa6326ff0ac9a +data8 0x3fe47f965d201d60, 0x3ce797c7a4ec1d63 +data8 0xca14e1b0622de526, 0xcbbe13773c3c5338 +data8 0x3fe4cfb4b09d1a20, 0x3cedfadb5347143c +data8 0xcd2a6825eae65f82, 0xca34913d425a5ae9 +data8 0x3fe5206cc637e000, 0x3ce2798b38e54193 +data8 0xd06301095e1351ee, 0xc8a2f0d3679c08c0 +data8 0x3fe571c42e3d0be0, 0x3ccd7cb9c6c2ca68 +data8 0xd3c0d9f50057adda, 0xc70901152d59d16b +data8 0x3fe5c3c0c108f940, 0x3ceb6c13563180ab +data8 0xd74650a98cc14789, 0xc5668e3d4cbf8828 +data8 0x3fe61668a46ffa80, 0x3caa9092e9e3c0e5 +data8 0xdaf5f8579dcc8f8f, 0xc3bb61b3eed42d02 +data8 0x3fe669c251ad69e0, 0x3cccf896ef3b4fee +data8 0xded29f9f9a6171b4, 0xc20741d7f8e8e8af +data8 0x3fe6bdd49bea05c0, 0x3cdc6b29937c575d +data8 0xe2df5765854ccdb0, 0xc049f1c2d1b8014b +data8 0x3fe712a6b76c6e80, 0x3ce1ddc6f2922321 +data8 0xe71f7a9b94fcb4c3, 0xbe833105ec291e91 +data8 0x3fe76840418978a0, 0x3ccda46e85432c3d +data8 0xeb96b72d3374b91e, 0xbcb2bb61493b28b3 +data8 0x3fe7bea9496d5a40, 0x3ce37b42ec6e17d3 +data8 0xf049183c3f53c39b, 0xbad848720223d3a8 +data8 0x3fe815ea59dab0a0, 0x3cb03ad41bfc415b +data8 
0xf53b11ec7f415f15, 0xb8f38b57c53c9c48 +data8 0x3fe86e0c84010760, 0x3cc03bfcfb17fe1f +data8 0xfa718f05adbf2c33, 0xb70432500286b185 +data8 0x3fe8c7196b9225c0, 0x3ced99fcc6866ba9 +data8 0xfff200c3f5489608, 0xb509e6454dca33cc +data8 0x3fe9211b54441080, 0x3cb789cb53515688 +// The following table entries are not used +//data8 0x82e138a0fac48700, 0xb3044a513a8e6132 +//data8 0x3fe97c1d30f5b7c0, 0x3ce1eb765612d1d0 +//data8 0x85f4cc7fc670d021, 0xb0f2fb2ea6cbbc88 +//data8 0x3fe9d82ab4b5fde0, 0x3ced3fe6f27e8039 +//data8 0x89377c1387d5b908, 0xaed58e9a09014d5c +//data8 0x3fea355065f87fa0, 0x3cbef481d25f5b58 +//data8 0x8cad7a2c98dec333, 0xacab929ce114d451 +//data8 0x3fea939bb451e2a0, 0x3c8e92b4fbf4560f +//data8 0x905b7dfc99583025, 0xaa748cc0dbbbc0ec +//data8 0x3feaf31b11270220, 0x3cdced8c61bd7bd5 +//data8 0x9446d8191f80dd42, 0xa82ff92687235baf +//data8 0x3feb53de0bcffc20, 0x3cbe1722fb47509e +//data8 0x98758ba086e4000a, 0xa5dd497a9c184f58 +//data8 0x3febb5f571cb0560, 0x3ce0c7774329a613 +//data8 0x9cee6c7bf18e4e24, 0xa37be3c3cd1de51b +//data8 0x3fec197373bc7be0, 0x3ce08ebdb55c3177 +//data8 0xa1b944000a1b9440, 0xa10b2101b4f27e03 +//data8 0x3fec7e6bd023da60, 0x3ce5fc5fd4995959 +//data8 0xa6defd8ba04d3e38, 0x9e8a4b93cad088ec +//data8 0x3fece4f404e29b20, 0x3cea3413401132b5 +//data8 0xac69dd408a10c62d, 0x9bf89d5d17ddae8c +//data8 0x3fed4d2388f63600, 0x3cd5a7fb0d1d4276 +//data8 0xb265c39cbd80f97a, 0x99553d969fec7beb +//data8 0x3fedb714101e0a00, 0x3cdbda21f01193f2 +//data8 0xb8e081a16ae4ae73, 0x969f3e3ed2a0516c +//data8 0x3fee22e1da97bb00, 0x3ce7231177f85f71 +//data8 0xbfea427678945732, 0x93d5990f9ee787af +//data8 0x3fee90ac13b18220, 0x3ce3c8a5453363a5 +//data8 0xc79611399b8c90c5, 0x90f72bde80febc31 +//data8 0x3fef009542b712e0, 0x3ce218fd79e8cb56 +//data8 0xcffa8425040624d7, 0x8e02b4418574ebed +//data8 0x3fef72c3d2c57520, 0x3cd32a717f82203f +//data8 0xd93299cddcf9cf23, 0x8af6ca48e9c44024 +//data8 0x3fefe762b77744c0, 0x3ce53478a6bbcf94 +//data8 0xe35eda760af69ad9, 0x87d1da0d7f45678b 
+//data8 0x3ff02f511b223c00, 0x3ced6e11782c28fc +//data8 0xeea6d733421da0a6, 0x84921bbe64ae029a +//data8 0x3ff06c5c6f8ce9c0, 0x3ce71fc71c1ffc02 +//data8 0xfb3b2c73fc6195cc, 0x813589ba3a5651b6 +//data8 0x3ff0aaf2613700a0, 0x3cf2a72d2fd94ef3 +//data8 0x84ac1fcec4203245, 0xfb73a828893df19e +//data8 0x3ff0eb367c3fd600, 0x3cf8054c158610de +//data8 0x8ca50621110c60e6, 0xf438a14c158d867c +//data8 0x3ff12d51caa6b580, 0x3ce6bce9748739b6 +//data8 0x95b8c2062d6f8161, 0xecb3ccdd37b369da +//data8 0x3ff1717418520340, 0x3ca5c2732533177c +//data8 0xa0262917caab4ad1, 0xe4dde4ddc81fd119 +//data8 0x3ff1b7d59dd40ba0, 0x3cc4c7c98e870ff5 +//data8 0xac402c688b72f3f4, 0xdcae469be46d4c8d +//data8 0x3ff200b93cc5a540, 0x3c8dd6dc1bfe865a +//data8 0xba76968b9eabd9ab, 0xd41a8f3df1115f7f +//data8 0x3ff24c6f8f6affa0, 0x3cf1acb6d2a7eff7 +//data8 0xcb63c87c23a71dc5, 0xcb161074c17f54ec +//data8 0x3ff29b5b338b7c80, 0x3ce9b5845f6ec746 +//data8 0xdfe323b8653af367, 0xc19107d99ab27e42 +//data8 0x3ff2edf6fac7f5a0, 0x3cf77f961925fa02 +//data8 0xf93746caaba3e1f1, 0xb777744a9df03bff +//data8 0x3ff344df237486c0, 0x3cf6ddf5f6ddda43 +//data8 0x8ca77052f6c340f0, 0xacaf476f13806648 +//data8 0x3ff3a0dfa4bb4ae0, 0x3cfee01bbd761bff +//data8 0xa1a48604a81d5c62, 0xa11575d30c0aae50 +//data8 0x3ff4030b73c55360, 0x3cf1cf0e0324d37c +//data8 0xbe45074b05579024, 0x9478e362a07dd287 +//data8 0x3ff46ce4c738c4e0, 0x3ce3179555367d12 +//data8 0xe7a08b5693d214ec, 0x8690e3575b8a7c3b +//data8 0x3ff4e0a887c40a80, 0x3cfbd5d46bfefe69 +//data8 0x94503d69396d91c7, 0xedd2ce885ff04028 +//data8 0x3ff561ebd9c18cc0, 0x3cf331bd176b233b +//data8 0xced1d96c5bb209e6, 0xc965278083808702 +//data8 0x3ff5f71d7ff42c80, 0x3ce3301cc0b5a48c +//data8 0xabac2cee0fc24e20, 0x9c4eb1136094cbbd +//data8 0x3ff6ae4c63222720, 0x3cf5ff46874ee51e +//data8 0x8040201008040201, 0xb4d7ac4d9acb1bf4 +//data8 0x3ff7b7d33b928c40, 0x3cfacdee584023bb +LOCAL_OBJECT_END(T_table) + + + +.align 16 + +LOCAL_OBJECT_START(poly_coeffs) + // C_3 +data8 0xaaaaaaaaaaaaaaab, 
0x0000000000003ffc + // C_5 +data8 0x999999999999999a, 0x0000000000003ffb + // C_7, C_9 +data8 0x3fa6db6db6db6db7, 0x3f9f1c71c71c71c8 + // pi/2 (low, high) +data8 0x3C91A62633145C07, 0x3FF921FB54442D18 + // C_11, C_13 +data8 0x3f96e8ba2e8ba2e9, 0x3f91c4ec4ec4ec4e + // C_15, C_17 +data8 0x3f8c99999999999a, 0x3f87a87878787223 +LOCAL_OBJECT_END(poly_coeffs) + + +R_DBL_S = r21 +R_EXP0 = r22 +R_EXP = r15 +R_SGNMASK = r23 +R_TMP = r24 +R_TMP2 = r25 +R_INDEX = r26 +R_TMP3 = r27 +R_TMP03 = r27 +R_TMP4 = r28 +R_TMP5 = r23 +R_TMP6 = r22 +R_TMP7 = r21 +R_T = r29 +R_BIAS = r20 + +F_T = f6 +F_1S2 = f7 +F_1S2_S = f9 +F_INV_1T2 = f10 +F_SQRT_1T2 = f11 +F_S2T2 = f12 +F_X = f13 +F_D = f14 +F_2M64 = f15 + +F_CS2 = f32 +F_CS3 = f33 +F_CS4 = f34 +F_CS5 = f35 +F_CS6 = f36 +F_CS7 = f37 +F_CS8 = f38 +F_CS9 = f39 +F_S23 = f40 +F_S45 = f41 +F_S67 = f42 +F_S89 = f43 +F_S25 = f44 +F_S69 = f45 +F_S29 = f46 +F_X2 = f47 +F_X4 = f48 +F_TSQRT = f49 +F_DTX = f50 +F_R = f51 +F_R2 = f52 +F_R3 = f53 +F_R4 = f54 + +F_C3 = f55 +F_C5 = f56 +F_C7 = f57 +F_C9 = f58 +F_P79 = f59 +F_P35 = f60 +F_P39 = f61 + +F_ATHI = f62 +F_ATLO = f63 + +F_T1 = f64 +F_Y = f65 +F_Y2 = f66 +F_ANDMASK = f67 +F_ORMASK = f68 +F_S = f69 +F_05 = f70 +F_SQRT_1S2 = f71 +F_DS = f72 +F_Z = f73 +F_1T2 = f74 +F_DZ = f75 +F_ZE = f76 +F_YZ = f77 +F_Y1S2 = f78 +F_Y1S2X = f79 +F_1X = f80 +F_ST = f81 +F_1T2_ST = f82 +F_TSS = f83 +F_Y1S2X2 = f84 +F_DZ_TERM = f85 +F_DTS = f86 +F_DS2X = f87 +F_T2 = f88 +F_ZY1S2S = f89 +F_Y1S2_1X = f90 +F_TS = f91 +F_PI2_LO = f92 +F_PI2_HI = f93 +F_S19 = f94 +F_INV1T2_2 = f95 +F_CORR = f96 +F_DZ0 = f97 + +F_C11 = f98 +F_C13 = f99 +F_C15 = f100 +F_C17 = f101 +F_P1113 = f102 +F_P1517 = f103 +F_P1117 = f104 +F_P317 = f105 +F_R8 = f106 +F_HI = f107 +F_1S2_HI = f108 +F_DS2 = f109 +F_Y2_2 = f110 +F_S2 = f111 +F_S_DS2 = f112 +F_S_1S2S = f113 +F_XL = f114 +F_2M128 = f115 + .section .text -.proc asinl# -.align 32 +GLOBAL_LIBM_ENTRY(asinl) + +{.mfi + // get exponent, mantissa (rounded to double precision) of s + getf.d 
R_DBL_S = f8 + // 1-s^2 + fnma.s1 F_1S2 = f8, f8, f1 + // r2 = pointer to T_table + addl r2 = @ltoff(T_table), gp +} +{.mfi + // sign mask + mov R_SGNMASK = 0x20000 + nop.f 0 + // bias-63-1 + mov R_TMP03 = 0xffff-64;; +} -asinl: -{ .mfi - alloc r32 = ar.pfs,1,11,4,0 -(p0) fnorm asin_NORM_f8 = f8 -(p0) mov asin_GR_17_ones = 0x1ffff +{.mfi + // get exponent of s + getf.exp R_EXP = f8 + nop.f 0 + // R_TMP4 = 2^45 + shl R_TMP4 = R_SGNMASK, 45-17 } -{ .mii -(p0) mov asin_GR_16_ones = 0xffff -(p0) mov asin_GR_ff9b = 0xff9b ;; - nop.i 999 +{.mlx + // load bias-4 + mov R_TMP = 0xffff-4 + // load RU(sqrt(2)/2) to integer register (in double format, shifted left by 1) + movl R_TMP2 = 0x7fcd413cccfe779a;; } -{ .mmi -(p0) setf.exp asin_2m100 = asin_GR_ff9b -(p0) addl r40 = @ltoff(asin_coefficients), gp - nop.i 999 +{.mfi + // load 2^{-64} in FP register + setf.exp F_2M64 = R_TMP03 + nop.f 0 + // index = (0x7-exponent)|b1 b2.. b6 + extr.u R_INDEX = R_DBL_S, 46, 9 } -;; -{ .mmi - ld8 r40 = [r40] - nop.m 999 - nop.i 999 +{.mfi + // get t = sign|exponent|b1 b2.. b6 1 x.. x + or R_T = R_DBL_S, R_TMP4 + nop.f 0 + // R_TMP4 = 2^45-1 + sub R_TMP4 = R_TMP4, r0, 1;; } -;; +{.mfi + // get t = sign|exponent|b1 b2.. b6 1 0.. 
0 + andcm R_T = R_T, R_TMP4 + nop.f 0 + // eliminate sign from R_DBL_S (shift left by 1) + shl R_TMP3 = R_DBL_S, 1 +} -// Load the constants +{.mfi + // R_BIAS = 3*2^6 + mov R_BIAS = 0xc0 + nop.f 0 + // eliminate sign from R_EXP + andcm R_EXP0 = R_EXP, R_SGNMASK;; +} -{ .mmi -(p0) ldfe asin_A10 = [r40],16 ;; -(p0) ldfe asin_A9 = [r40],16 - nop.i 999 ;; + + +{.mfi + // load start address for T_table + ld8 r2 = [r2] + nop.f 0 + // p8 = 1 if |s|> = sqrt(2)/2 + cmp.geu p8, p0 = R_TMP3, R_TMP2 } -{ .mmi -(p0) ldfe asin_A8 = [r40],16 ;; -(p0) ldfe asin_A7 = [r40],16 - nop.i 999 ;; +{.mlx + // p7 = 1 if |s|<2^{-4} (exponent of s<bias-4) + cmp.lt p7, p0 = R_EXP0, R_TMP + // sqrt coefficient cs8 = -33*13/128 + movl R_TMP2 = 0xc0568000;; } -{ .mmi -(p0) ldfe asin_A6 = [r40],16 ;; -(p0) getf.exp asin_GR_signexp_f8 = asin_NORM_f8 - nop.i 999 + + +{.mbb + // load t in FP register + setf.d F_T = R_T + // if |s|<2^{-4}, take alternate path + (p7) br.cond.spnt SMALL_S + // if |s|> = sqrt(2)/2, take alternate path + (p8) br.cond.sptk LARGE_S } -{ .mmi -(p0) ldfe asin_A5 = [r40],16 ;; -(p0) ldfe asin_A4 = [r40],16 - nop.i 999 ;; +{.mlx + // index = (4-exponent)|b1 b2.. 
b6 + sub R_INDEX = R_INDEX, R_BIAS + // sqrt coefficient cs9 = 55*13/128 + movl R_TMP = 0x40b2c000;; } -{ .mfi - nop.m 999 -(p0) fmerge.s asin_ABS_NORM_f8 = f0, asin_NORM_f8 -(p0) and asin_GR_exp = asin_GR_signexp_f8, asin_GR_17_ones ;; + +{.mfi + // sqrt coefficient cs8 = -33*13/128 + setf.s F_CS8 = R_TMP2 + nop.f 0 + // shift R_INDEX by 5 + shl R_INDEX = R_INDEX, 5 +} + +{.mfi + // sqrt coefficient cs3 = 0.5 (set exponent = bias-1) + mov R_TMP4 = 0xffff - 1 + nop.f 0 + // sqrt coefficient cs6 = -21/16 + mov R_TMP6 = 0xbfa8;; } -// case 1: |x| < 2^-40 ==> p6 (includes x = +-0) -// case 2: 2^-40 <= |x| < 2^-2 ==> p8 -// case 3: 2^-2 <= |x| < 1 ==> p9 -// case 4: 1 <= |x| ==> p11 -// In case 4, we pick up the special case x = +-1 and return +-pi/2 -{ .mii -(p0) ldfe asin_A3 = [r40],16 -(p0) sub asin_GR_true_exp = asin_GR_exp, asin_GR_16_ones ;; -(p0) cmp.ge.unc p6, p7 = -41, asin_GR_true_exp ;; +{.mlx + // table index + add r2 = r2, R_INDEX + // sqrt coefficient cs7 = 33/16 + movl R_TMP2 = 0x40040000;; } -{ .mii -(p0) ldfe asin_A2 = [r40],16 -(p7) cmp.ge.unc p8, p9 = -3, asin_GR_true_exp ;; -(p9) cmp.ge.unc p10, p11 = -1, asin_GR_true_exp + +{.mmi + // load cs9 = 55*13/128 + setf.s F_CS9 = R_TMP + // sqrt coefficient cs5 = 7/8 + mov R_TMP3 = 0x3f60 + // sqrt coefficient cs6 = 21/16 + shl R_TMP6 = R_TMP6, 16;; } -{ .mmi -(p0) ldfe asin_A1 = [r40],16 ;; -(p0) ldfe asin_pi_by_2 = [r40],16 - nop.i 999 + +{.mmi + // load significand of 1/(1-t^2) + ldf8 F_INV_1T2 = [r2], 8 + // sqrt coefficient cs7 = 33/16 + setf.s F_CS7 = R_TMP2 + // sqrt coefficient cs4 = -5/8 + mov R_TMP5 = 0xbf20;; } -// case 4: |x| >= 1 -{ .mib - nop.m 999 - nop.i 999 -(p11) br.spnt L(ASIN_ERROR_RETURN) ;; + +{.mmi + // load significand of sqrt(1-t^2) + ldf8 F_SQRT_1T2 = [r2], 8 + // sqrt coefficient cs6 = 21/16 + setf.s F_CS6 = R_TMP6 + // sqrt coefficient cs5 = 7/8 + shl R_TMP3 = R_TMP3, 16;; } -// case 1: |x| < 2^-40 -{ .mfb - nop.m 999 -(p6) fma.s0 f8 = asin_2m100,f8,f8 -(p6) br.ret.spnt b0 ;; + 
+{.mmi + // sqrt coefficient cs3 = 0.5 (set exponent = bias-1) + setf.exp F_CS3 = R_TMP4 + // r3 = pointer to polynomial coefficients + addl r3 = @ltoff(poly_coeffs), gp + // sqrt coefficient cs4 = -5/8 + shl R_TMP5 = R_TMP5, 16;; } -// case 2: 2^-40 <= |x| < 2^-2 ==> p8 -{ .mfi - nop.m 999 -(p8) fma.s1 asin_X2 = f8,f8, f0 - nop.i 999 ;; +{.mfi + // sqrt coefficient cs5 = 7/8 + setf.s F_CS5 = R_TMP3 + // d = s-t + fms.s1 F_D = f8, f1, F_T + // set p6 = 1 if s<0, p11 = 1 if s> = 0 + cmp.ge p6, p11 = R_EXP, R_DBL_S } -{ .mfi - nop.m 999 -(p8) fma.s1 asin_X4 = asin_X2,asin_X2, f0 - nop.i 999 ;; +{.mfi + // r3 = load start address to polynomial coefficients + ld8 r3 = [r3] + // s+t + fma.s1 F_S2T2 = f8, f1, F_T + nop.i 0;; } -{ .mfi - nop.m 999 -(p8) fma.s1 asin_P810 = asin_X4, asin_A10, asin_A8 - nop.i 999 + +{.mfi + // sqrt coefficient cs4 = -5/8 + setf.s F_CS4 = R_TMP5 + // s^2-t^2 + fma.s1 F_S2T2 = F_S2T2, F_D, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p8) fma.s1 asin_P79 = asin_X4, asin_A9, asin_A7 - nop.i 999 ;; + +{.mfi + // load C3 + ldfe F_C3 = [r3], 16 + // 0.5/(1-t^2) = 2^{-64}*(2^63/(1-t^2)) + fma.s1 F_INV_1T2 = F_INV_1T2, F_2M64, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p8) fma.s1 asin_P610 = asin_X4, asin_P810, asin_A6 - nop.i 999 +{.mfi + // load C_5 + ldfe F_C5 = [r3], 16 + // set correct exponent for sqrt(1-t^2) + fma.s1 F_SQRT_1T2 = F_SQRT_1T2, F_2M64, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p8) fma.s1 asin_P59 = asin_X4, asin_P79, asin_A5 - nop.i 999 ;; + +{.mfi + // load C_7, C_9 + ldfpd F_C7, F_C9 = [r3] + // x = -(s^2-t^2)/(1-t^2)/2 + fnma.s1 F_X = F_INV_1T2, F_S2T2, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p8) fma.s1 asin_P410 = asin_X4, asin_P610, asin_A4 - nop.i 999 + +{.mfi + // load asin(t)_high, asin(t)_low + ldfpd F_ATHI, F_ATLO = [r2] + // t*sqrt(1-t^2) + fma.s1 F_TSQRT = F_T, F_SQRT_1T2, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p8) fma.s1 asin_P39 = asin_X4, asin_P59, asin_A3 - nop.i 999 ;; + +{.mfi + nop.m 0 + // cs9*x+cs8 + fma.s1 F_S89 = F_CS9, 
F_X, F_CS8 + nop.i 0 } -{ .mfi - nop.m 999 -(p8) fma.s1 asin_P210 = asin_X4, asin_P410, asin_A2 - nop.i 999 +{.mfi + nop.m 0 + // cs7*x+cs6 + fma.s1 F_S67 = F_CS7, F_X, F_CS6 + nop.i 0;; } -{ .mfi - nop.m 999 -(p8) fma.s1 asin_P19 = asin_X4, asin_P39, asin_A1 - nop.i 999 ;; +{.mfi + nop.m 0 + // cs5*x+cs4 + fma.s1 F_S45 = F_CS5, F_X, F_CS4 + nop.i 0 } -{ .mfi - nop.m 999 -(p8) fma.s1 asin_P1P2 = asin_X2, asin_P210, asin_P19 - nop.i 999 ;; +{.mfi + nop.m 0 + // x*x + fma.s1 F_X2 = F_X, F_X, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p8) fma.s1 asin_P1P2 = asin_X2, asin_P1P2, f0 - nop.i 999 ;; + +{.mfi + nop.m 0 + // (s-t)-t*x + fnma.s1 F_DTX = F_T, F_X, F_D + nop.i 0 } -{ .mfb - nop.m 999 -(p8) fma.s0 f8 = asin_NORM_f8, asin_P1P2, asin_NORM_f8 -(p8) br.ret.spnt b0 ;; +{.mfi + nop.m 0 + // cs3*x+cs2 (cs2 = -0.5 = -cs3) + fms.s1 F_S23 = F_CS3, F_X, F_CS3 + nop.i 0;; } -// case 3: 2^-2 <= |x| < 1 -// 1- X*X is computed as B + b -// Step 1.1: Get B and b -// atan2 will return -// f8 = Z_hi -// f10 = Z_lo -// f11 = s_lo +{.mfi + nop.m 0 + // cs9*x^3+cs8*x^2+cs7*x+cs6 + fma.s1 F_S69 = F_S89, F_X2, F_S67 + nop.i 0 +} +{.mfi + nop.m 0 + // x^4 + fma.s1 F_X4 = F_X2, F_X2, f0 + nop.i 0;; +} -{ .mfi -(p0) mov asin_GR_fffe = 0xfffe -(p0) fmerge.se f8 = asin_ABS_NORM_f8, asin_ABS_NORM_f8 -nop.i 0 -};; -{ .mmf -nop.m 0 -(p0) setf.exp asin_HALF = asin_GR_fffe -(p0) fmerge.se f12 = asin_NORM_f8, asin_NORM_f8 ;; +{.mfi + nop.m 0 + // t*sqrt(1-t^2)*x^2 + fma.s1 F_TSQRT = F_TSQRT, F_X2, f0 + nop.i 0 } +{.mfi + nop.m 0 + // cs5*x^3+cs4*x^2+cs3*x+cs2 + fma.s1 F_S25 = F_S45, F_X2, F_S23 + nop.i 0;; +} -{ .mfi - nop.m 999 -(p0) fcmp.lt.unc.s1 p6,p7 = asin_ABS_NORM_f8, asin_HALF - nop.i 999 ;; + +{.mfi + nop.m 0 + // ((s-t)-t*x)*sqrt(1-t^2) + fma.s1 F_DTX = F_DTX, F_SQRT_1T2, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p7) fma.s1 asin_D = f1,f1,asin_ABS_NORM_f8 - nop.i 999 + +{.mfi + nop.m 0 + // if sign is negative, negate table values: asin(t)_low + (p6) fnma.s1 F_ATLO = F_ATLO, f1, f0 + nop.i 0 
} -{ .mfi - nop.m 999 -(p7) fms.s1 asin_C = f1,f1,asin_ABS_NORM_f8 - nop.i 999 ;; +{.mfi + nop.m 0 + // PS29 = cs9*x^7+..+cs5*x^3+cs4*x^2+cs3*x+cs2 + fma.s1 F_S29 = F_S69, F_X4, F_S25 + nop.i 0;; } -{ .mfi - nop.m 999 -(p7) fma.s1 asin_B = asin_C, asin_D, f0 - nop.i 999 + +{.mfi + nop.m 0 + // if sign is negative, negate table values: asin(t)_high + (p6) fnma.s1 F_ATHI = F_ATHI, f1, f0 + nop.i 0 } -{ .mfi - nop.m 999 -(p7) fms.s1 asin_1mD = f1,f1,asin_D - nop.i 999 ;; +{.mfi + nop.m 0 + // R = ((s-t)-t*x)*sqrt(1-t^2)-t*sqrt(1-t^2)*x^2*PS29 + fnma.s1 F_R = F_S29, F_TSQRT, F_DTX + nop.i 0;; } -{ .mfi - nop.m 999 -(p7) fma.s1 asin_Dd = asin_1mD,f1, asin_ABS_NORM_f8 - nop.i 999 + +{.mfi + nop.m 0 + // R^2 + fma.s1 F_R2 = F_R, F_R, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p7) fms.s1 asin_Bb = asin_C, asin_D, asin_B - nop.i 999 ;; + +{.mfi + nop.m 0 + // c7+c9*R^2 + fma.s1 F_P79 = F_C9, F_R2, F_C7 + nop.i 0 } -{ .mfi - nop.m 999 -(p7) fma.s1 asin_Bb = asin_C, asin_Dd, asin_Bb - nop.i 999 +{.mfi + nop.m 0 + // c3+c5*R^2 + fma.s1 F_P35 = F_C5, F_R2, F_C3 + nop.i 0;; } -{ .mfi - nop.m 999 -(p6) fma.s1 asin_C = asin_ABS_NORM_f8, asin_ABS_NORM_f8, f0 - nop.i 999 ;; +{.mfi + nop.m 0 + // R^3 + fma.s1 F_R4 = F_R2, F_R2, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p6) fms.s1 asin_B = f1, f1, asin_C - nop.i 999 +{.mfi + nop.m 0 + // R^3 + fma.s1 F_R3 = F_R2, F_R, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p6) fms.s1 asin_Cc = asin_ABS_NORM_f8, asin_ABS_NORM_f8, asin_C - nop.i 999 ;; + + +{.mfi + nop.m 0 + // c3+c5*R^2+c7*R^4+c9*R^6 + fma.s1 F_P39 = F_P79, F_R4, F_P35 + nop.i 0;; } -{ .mfi - nop.m 999 -(p0) fma.s1 asin_Hh = asin_HALF, asin_B, f0 - nop.i 999 + +{.mfi + nop.m 0 + // asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6) + fma.s1 F_P39 = F_P39, F_R3, F_ATLO + nop.i 0;; } -{ .mfi - nop.m 999 -(p6) fms.s1 asin_1mB = f1, f1, asin_B - nop.i 999 ;; + +{.mfi + nop.m 0 + // R+asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6) + fma.s1 F_P39 = F_P39, f1, F_R + nop.i 0;; } -// Step 1.2: -// sqrt(B + b) is 
computed as W + w -// Get W -{ .mfi - nop.m 999 -(p0) frsqrta.s1 asin_y0,p8 = asin_B - nop.i 999 ;; +{.mfb + nop.m 0 + // result = asin(t)_high+R+asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6) + fma.s0 f8 = F_ATHI, f1, F_P39 + // return + br.ret.sptk b0;; } -{ .mfi - nop.m 999 -(p6) fms.s1 asin_1mBmC = asin_1mB, f1, asin_C - nop.i 999 ;; + + + +LARGE_S: + +{.mfi + // bias-1 + mov R_TMP3 = 0xffff - 1 + // y ~ 1/sqrt(1-s^2) + frsqrta.s1 F_Y, p7 = F_1S2 + // c9 = 55*13*17/128 + mov R_TMP4 = 0x10af7b } -{ .mfi - nop.m 999 -(p0) fma.s1 asin_t1 = asin_y0, asin_y0, f0 - nop.i 999 ;; +{.mlx + // c8 = -33*13*15/128 + mov R_TMP5 = 0x184923 + movl R_TMP2 = 0xff00000000000000;; } -{ .mfi - nop.m 999 -(p6) fms.s1 asin_Bb = asin_1mBmC, f1, asin_Cc - nop.i 999 ;; +{.mfi + // set p6 = 1 if s<0, p11 = 1 if s>0 + cmp.ge p6, p11 = R_EXP, R_DBL_S + // 1-s^2 + fnma.s1 F_1S2 = f8, f8, f1 + // set p9 = 1 + cmp.eq p9, p0 = r0, r0;; } -{ .mfi - nop.m 999 -(p0) fnma.s1 asin_t2 = asin_t1, asin_Hh, asin_HALF - nop.i 999 ;; + +{.mfi + // load 0.5 + setf.exp F_05 = R_TMP3 + // (1-s^2) rounded to single precision + fnma.s.s1 F_1S2_S = f8, f8, f1 + // c9 = 55*13*17/128 + shl R_TMP4 = R_TMP4, 10 } -{ .mfi - nop.m 999 -(p0) fma.s1 asin_y1 = asin_t2, asin_y0, asin_y0 - nop.i 999 ;; +{.mlx + // AND mask for getting t ~ sqrt(1-s^2) + setf.sig F_ANDMASK = R_TMP2 + // OR mask + movl R_TMP2 = 0x0100000000000000;; } -{ .mfi - nop.m 999 -(p0) fma.s1 asin_t3 = asin_y1, asin_Hh, f0 - nop.i 999 ;; + +{.mfi + nop.m 0 + // (s^2)_s + fma.s.s1 F_S2 = f8, f8, f0 + nop.i 0;; } -{ .mfi - nop.m 999 -(p0) fnma.s1 asin_t4 = asin_t3, asin_y1, asin_HALF - nop.i 999 ;; + +{.mmi + // c9 = 55*13*17/128 + setf.s F_CS9 = R_TMP4 + // c7 = 33*13/16 + mov R_TMP4 = 0x41d68 + // c8 = -33*13*15/128 + shl R_TMP5 = R_TMP5, 11;; } -{ .mfi - nop.m 999 -(p0) fma.s1 asin_y2 = asin_t4, asin_y1, asin_y1 - nop.i 999 ;; + +{.mfi + setf.sig F_ORMASK = R_TMP2 + // y^2 + fma.s1 F_Y2 = F_Y, F_Y, f0 + // c7 = 33*13/16 + shl R_TMP4 = R_TMP4, 12 } -{ 
.mfi - nop.m 999 -(p0) fma.s1 asin_S = asin_B, asin_y2, f0 - nop.i 999 +{.mfi + // c6 = -33*7/16 + mov R_TMP6 = 0xc1670 + // y' ~ sqrt(1-s^2) + fma.s1 F_T1 = F_Y, F_1S2, f0 + // c5 = 63/8 + mov R_TMP7 = 0x40fc;; } -{ .mfi - nop.m 999 -(p0) fma.s1 asin_H = asin_y2, asin_HALF, f0 - nop.i 999 ;; + +{.mlx + // load c8 = -33*13*15/128 + setf.s F_CS8 = R_TMP5 + // c4 = -35/8 + movl R_TMP5 = 0xc08c0000;; } -{ .mfi - nop.m 999 -(p0) fma.s1 asin_t5 = asin_Hh, asin_y2, f0 - nop.i 999 ;; +{.mfi + // r3 = pointer to polynomial coefficients + addl r3 = @ltoff(poly_coeffs), gp + // 1-(1-s^2)_s + fnma.s1 F_DS = F_1S2_S, f1, f1 + // p9 = 0 if p7 = 1 (p9 = 1 for special cases only) + (p7) cmp.ne p9, p0 = r0, r0 } -{ .mfi - nop.m 999 -(p0) fnma.s1 asin_Dd = asin_S, asin_S, asin_B - nop.i 999 ;; +{.mlx + // load c7 = 33*13/16 + setf.s F_CS7 = R_TMP4 + // c3 = 5/2 + movl R_TMP4 = 0x40200000;; } -{ .mfi - nop.m 999 -(p0) fma.s1 asin_W = asin_Dd, asin_H, asin_S - nop.i 999 ;; + +{.mfi + nop.m 0 + // 1-(s^2)_s + fnma.s1 F_S_1S2S = F_S2, f1, f1 + nop.i 0 } -{ .mfi - nop.m 999 -(p0) fma.s1 asin_2W = asin_W, f1, asin_W - nop.i 999 +{.mlx + // load c4 = -35/8 + setf.s F_CS4 = R_TMP5 + // c2 = -3/2 + movl R_TMP5 = 0xbfc00000;; } -// Step 1.3 -// Get w -{ .mfi - nop.m 999 -(p0) fnma.s1 asin_BmWW = asin_W, asin_W, asin_B - nop.i 999 ;; + +{.mfi + // load c3 = 5/2 + setf.s F_CS3 = R_TMP4 + // x = (1-s^2)_s*y^2-1 + fms.s1 F_X = F_1S2_S, F_Y2, f1 + // c6 = -33*7/16 + shl R_TMP6 = R_TMP6, 12 } -// Step 2 -// asin(x) = atan2(X,sqrt(1-X*X)) -// = atan2(X, W) -Xw -// corr = Xw -// asin(x) = Z_hi + (s_lo*Z_lo - corr) -// Call atan2(X, W) -// Save W in f9 -// Save X in f12 -// Save w in f13 +{.mfi + nop.m 0 + // y^2/2 + fma.s1 F_Y2_2 = F_Y2, F_05, f0 + nop.i 0;; +} -{ .mfi - nop.m 999 -(p0) fmerge.se f9 = asin_W, asin_W - nop.i 999 ;; + +{.mfi + // load c6 = -33*7/16 + setf.s F_CS6 = R_TMP6 + // eliminate lower bits from y' + fand F_T = F_T1, F_ANDMASK + // c5 = 63/8 + shl R_TMP7 = R_TMP7, 16 } -{ .mfi 
- nop.m 999 -(p0) fma.s1 asin_BmWWpb = asin_BmWW, f1, asin_Bb - nop.i 999 ;; +{.mfb + // r3 = load start address to polynomial coefficients + ld8 r3 = [r3] + // 1-(1-s^2)_s-s^2 + fnma.s1 F_DS = f8, f8, F_DS + // p9 = 1 if s is a special input (NaN, or |s|> = 1) + (p9) br.cond.spnt ASINL_SPECIAL_CASES;; } -{ .mfi - nop.m 999 -(p0) frcpa.s1 asin_1d2W,p9 = f1, asin_2W - nop.i 999 ;; +{.mmf + // get exponent, significand of y' (in single prec.) + getf.s R_TMP = F_T1 + // load c3 = -3/2 + setf.s F_CS2 = R_TMP5 + // y*(1-s^2) + fma.s1 F_Y1S2 = F_Y, F_1S2, f0;; } -{ .mfi - nop.m 999 -(p0) fma.s1 asin_Ww = asin_BmWWpb, asin_1d2W, f0 - nop.i 999 ;; + +{.mfi + nop.m 0 + // x' = (y^2/2)*(1-(s^2)_s)-0.5 + fms.s1 F_XL = F_Y2_2, F_S_1S2S, F_05 + nop.i 0 } -.endp asinl -ASM_SIZE_DIRECTIVE(asinl) -.proc __libm_callout -__libm_callout: -.prologue -{ .mfi - nop.m 0 - nop.f 0 -.save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs -};; -{ .mfi - mov GR_SAVE_GP=gp // Save gp - nop.f 0 -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 +{.mfi + nop.m 0 + // s^2-(s^2)_s + fms.s1 F_S_DS2 = f8, f8, F_S2 + nop.i 0;; } -.body + + +{.mfi + nop.m 0 + // if s<0, set s = -s + (p6) fnma.s1 f8 = f8, f1, f0 + nop.i 0;; +} + +{.mfi + // load c5 = 63/8 + setf.s F_CS5 = R_TMP7 + // x = (1-s^2)_s*y^2-1+(1-(1-s^2)_s-s^2)*y^2 + fma.s1 F_X = F_DS, F_Y2, F_X + // for t = 2^k*1.b1 b2.., get 7-k|b1.. b6 + extr.u R_INDEX = R_TMP, 17, 9;; +} + + +{.mmi + // index = (4-exponent)|b1 b2.. 
b6 + sub R_INDEX = R_INDEX, R_BIAS + nop.m 0 + // get exponent of y + shr.u R_TMP2 = R_TMP, 23;; +} + +{.mmi + // load C3 + ldfe F_C3 = [r3], 16 + // set p8 = 1 if y'<2^{-4} + cmp.gt p8, p0 = 0x7b, R_TMP2 + // shift R_INDEX by 5 + shl R_INDEX = R_INDEX, 5;; +} + + {.mfb - nop.m 0 -(p0) fmerge.se f13 = asin_Ww, asin_Ww -(p0) br.call.sptk.many b0=__libm_atan2_reg# -};; -{ .mfi - mov gp = GR_SAVE_GP // Restore gp -(p0) fma.s1 asin_XWw = asin_ABS_NORM_f8,f13,f0 - mov b0 = GR_SAVE_B0 // Restore return address -};; -// asin_XWw = Xw = corr -// asin_low = (s_lo * Z_lo - corr) -// f8 = Z_hi + (s_lo * Z_lo - corr) + // get table index for sqrt(1-t^2) + add r2 = r2, R_INDEX + // get t = 2^k*1.b1 b2.. b7 1 + for F_T = F_T, F_ORMASK + (p8) br.cond.spnt VERY_LARGE_INPUT;; +} -{ .mfi - nop.m 999 -(p0) fms.s1 asin_low = f11, f10, asin_XWw - mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs -};; -{ .mfi - nop.m 999 -(p0) fma.s0 f8 = f8, f1, asin_low - nop.i 999 ;; + +{.mmf + // load C5 + ldfe F_C5 = [r3], 16 + // load 1/(1-t^2) + ldfp8 F_INV_1T2, F_SQRT_1T2 = [r2], 16 + // x = ((1-s^2)*y^2-1)/2 + fma.s1 F_X = F_X, F_05, f0;; } -{ .mfb - nop.m 999 -(p0) fmerge.s f8 = f12,f8 -(p0) br.ret.sptk b0 ;; + + +{.mmf + nop.m 0 + // C7, C9 + ldfpd F_C7, F_C9 = [r3], 16 + // set correct exponent for t + fmerge.se F_T = F_T1, F_T;; } -.endp __libm_callout -ASM_SIZE_DIRECTIVE(__libm_callout) -.proc SPECIAL -SPECIAL: -L(ASIN_ERROR_RETURN): -// If X is 1, return (sign of X)pi/2 -{ .mfi - nop.m 999 -(p0) fcmp.eq.unc p6,p7 = asin_ABS_NORM_f8,f1 - nop.i 999 ;; +{.mfi + // pi/2 (low, high) + ldfpd F_PI2_LO, F_PI2_HI = [r3] + // c9*x+c8 + fma.s1 F_S89 = F_X, F_CS9, F_CS8 + nop.i 0 } -{ .mfb -(p6) ldfe asin_pi_by_2_lo = [r40] -(p6) fmerge.s asin_pi_by_2 = f8,asin_pi_by_2 - nop.b 0;; +{.mfi + nop.m 0 + // x^2 + fma.s1 F_X2 = F_X, F_X, f0 + nop.i 0;; } -// If X is a NAN, leave -// qnan snan inf norm unorm 0 -+ -// 1 1 0 0 0 0 11 -{ .mfb - nop.m 999 -(p6) fma.s0 f8 = f8,asin_pi_by_2_lo,asin_pi_by_2 -(p6) 
br.ret.spnt b0 + +{.mfi + nop.m 0 + // y*(1-s^2)*x + fma.s1 F_Y1S2X = F_Y1S2, F_X, f0 + nop.i 0 } -{ .mfi - nop.m 999 -(p0) fclass.m.unc p12,p0 = f8, 0xc3 - nop.i 999 ;; + +{.mfi + nop.m 0 + // c7*x+c6 + fma.s1 F_S67 = F_X, F_CS7, F_CS6 + nop.i 0;; } -{ .mfb - nop.m 999 -(p12) fma.s0 f8 = f8,f1,f0 -(p12) br.ret.spnt b0 ;; + +{.mfi + nop.m 0 + // 1-x + fnma.s1 F_1X = F_X, f1, f1 + nop.i 0 +} + +{.mfi + nop.m 0 + // c3*x+c2 + fma.s1 F_S23 = F_X, F_CS3, F_CS2 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // 1-t^2 + fnma.s1 F_1T2 = F_T, F_T, f1 + nop.i 0 +} + +{.mfi + // load asin(t)_high, asin(t)_low + ldfpd F_ATHI, F_ATLO = [r2] + // c5*x+c4 + fma.s1 F_S45 = F_X, F_CS5, F_CS4 + nop.i 0;; +} + + + +{.mfi + nop.m 0 + // t*s + fma.s1 F_TS = F_T, f8, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + // 0.5/(1-t^2) + fma.s1 F_INV_1T2 = F_INV_1T2, F_2M64, f0 + nop.i 0;; +} + +{.mfi + nop.m 0 + // z~sqrt(1-t^2), rounded to 24 significant bits + fma.s.s1 F_Z = F_SQRT_1T2, F_2M64, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + // sqrt(1-t^2) + fma.s1 F_SQRT_1T2 = F_SQRT_1T2, F_2M64, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // y*(1-s^2)*x^2 + fma.s1 F_Y1S2X2 = F_Y1S2, F_X2, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + // x^4 + fma.s1 F_X4 = F_X2, F_X2, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // s*t rounded to 24 significant bits + fma.s.s1 F_TSS = F_T, f8, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + // c9*x^3+..+c6 + fma.s1 F_S69 = F_X2, F_S89, F_S67 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // ST = (t^2-1+s^2) rounded to 24 significant bits + fms.s.s1 F_ST = f8, f8, F_1T2 + nop.i 0 +} + +{.mfi + nop.m 0 + // c5*x^3+..+c2 + fma.s1 F_S25 = F_X2, F_S45, F_S23 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // 0.25/(1-t^2) + fma.s1 F_INV1T2_2 = F_05, F_INV_1T2, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + // t*s-sqrt(1-t^2)*(1-s^2)*y + fnma.s1 F_TS = F_Y1S2, F_SQRT_1T2, F_TS + nop.i 0;; +} + + +{.mfi + nop.m 0 + // z*0.5/(1-t^2) + fma.s1 F_ZE = F_INV_1T2, F_SQRT_1T2, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + // z^2+t^2-1 + fms.s1 F_DZ0 = F_Z, F_Z, 
F_1T2 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // (1-s^2-(1-s^2)_s)*x + fma.s1 F_DS2X = F_X, F_DS, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // t*s-(t*s)_s + fms.s1 F_DTS = F_T, f8, F_TSS + nop.i 0 +} + +{.mfi + nop.m 0 + // c9*x^7+..+c2 + fma.s1 F_S29 = F_X4, F_S69, F_S25 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // y*z + fma.s1 F_YZ = F_Z, F_Y, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + // t^2 + fma.s1 F_T2 = F_T, F_T, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // 1-t^2+ST + fma.s1 F_1T2_ST = F_ST, f1, F_1T2 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // y*(1-s^2)(1-x) + fma.s1 F_Y1S2_1X = F_Y1S2, F_1X, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + // dz ~ sqrt(1-t^2)-z + fma.s1 F_DZ = F_DZ0, F_ZE, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // -1+correction for sqrt(1-t^2)-z + fnma.s1 F_CORR = F_INV1T2_2, F_DZ0, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // (PS29*x^2+x)*y*(1-s^2) + fma.s1 F_S19 = F_Y1S2X2, F_S29, F_Y1S2X + nop.i 0;; +} + + +{.mfi + nop.m 0 + // z*y*(1-s^2)_s + fma.s1 F_ZY1S2S = F_YZ, F_1S2_S, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + // s^2-(1-t^2+ST) + fms.s1 F_1T2_ST = f8, f8, F_1T2_ST + nop.i 0;; +} + + +{.mfi + nop.m 0 + // (t*s-(t*s)_s)+z*y*(1-s^2-(1-s^2)_s)*x + fma.s1 F_DTS = F_YZ, F_DS2X, F_DTS + nop.i 0 +} + +{.mfi + nop.m 0 + // dz*y*(1-s^2)*(1-x) + fma.s1 F_DZ_TERM = F_DZ, F_Y1S2_1X, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // R = t*s-sqrt(1-t^2)*(1-s^2)*y+sqrt(1-t^2)*(1-s^2)*y*PS19 + // (used for polynomial evaluation) + fma.s1 F_R = F_S19, F_SQRT_1T2, F_TS + nop.i 0;; +} + + +{.mfi + nop.m 0 + // (PS29*x^2)*y*(1-s^2) + fma.s1 F_S29 = F_Y1S2X2, F_S29, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + // apply correction to dz*y*(1-s^2)*(1-x) + fma.s1 F_DZ_TERM = F_DZ_TERM, F_CORR, F_DZ_TERM + nop.i 0;; +} + + +{.mfi + nop.m 0 + // R^2 + fma.s1 F_R2 = F_R, F_R, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // (t*s-(t*s)_s)+z*y*(1-s^2-(1-s^2)_s)*x+dz*y*(1-s^2)*(1-x) + fma.s1 F_DZ_TERM = F_DZ_TERM, f1, F_DTS + nop.i 0;; } -{ .mfi -(p0) mov GR_Parameter_TAG = 60 -(p0) frcpa f10, p6 = f0, f0 
-nop.i 0 -};; -.endp SPECIAL -ASM_SIZE_DIRECTIVE(SPECIAL) -.proc __libm_error_region -__libm_error_region: + +{.mfi + nop.m 0 + // c7+c9*R^2 + fma.s1 F_P79 = F_C9, F_R2, F_C7 + nop.i 0 +} + +{.mfi + nop.m 0 + // c3+c5*R^2 + fma.s1 F_P35 = F_C5, F_R2, F_C3 + nop.i 0;; +} + +{.mfi + nop.m 0 + // asin(t)_low-(pi/2)_low + fms.s1 F_ATLO = F_ATLO, f1, F_PI2_LO + nop.i 0 +} + +{.mfi + nop.m 0 + // R^4 + fma.s1 F_R4 = F_R2, F_R2, f0 + nop.i 0;; +} + +{.mfi + nop.m 0 + // R^3 + fma.s1 F_R3 = F_R2, F_R, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // (t*s)_s-t^2*y*z + fnma.s1 F_TSS = F_T2, F_YZ, F_TSS + nop.i 0 +} + +{.mfi + nop.m 0 + // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST) + fma.s1 F_DZ_TERM = F_YZ, F_1T2_ST, F_DZ_TERM + nop.i 0;; +} + + +{.mfi + nop.m 0 + // (pi/2)_hi-asin(t)_hi + fms.s1 F_ATHI = F_PI2_HI, f1, F_ATHI + nop.i 0 +} + +{.mfi + nop.m 0 + // c3+c5*R^2+c7*R^4+c9*R^6 + fma.s1 F_P39 = F_P79, F_R4, F_P35 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST)+ + // + sqrt(1-t^2)*y*(1-s^2)*x^2*PS29 + fma.s1 F_DZ_TERM = F_SQRT_1T2, F_S29, F_DZ_TERM + nop.i 0;; +} + + +{.mfi + nop.m 0 + // (t*s)_s-t^2*y*z+z*y*ST + fma.s1 F_TSS = F_YZ, F_ST, F_TSS + nop.i 0 +} + +{.mfi + nop.m 0 + // -asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6) + fms.s1 F_P39 = F_P39, F_R3, F_ATLO + nop.i 0;; +} + + +{.mfi + nop.m 0 + // if s<0, change sign of F_ATHI + (p6) fnma.s1 F_ATHI = F_ATHI, f1, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST) + + // + sqrt(1-t^2)*y*(1-s^2)*x^2*PS29 + + // - asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6) + fma.s1 F_DZ_TERM = F_P39, f1, F_DZ_TERM + nop.i 0;; +} + + +{.mfi + nop.m 0 + // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST) + + // + sqrt(1-t^2)*y*(1-s^2)*x^2*PS29 + z*y*(1-s^2)_s*x + + // - asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6) + fma.s1 F_DZ_TERM = F_ZY1S2S, F_X, F_DZ_TERM + nop.i 0;; +} + + +{.mfi + nop.m 0 + // 
d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST) + + // + sqrt(1-t^2)*y*(1-s^2)*x^2*PS29 + z*y*(1-s^2)_s*x + + // - asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6) + + // + (t*s)_s-t^2*y*z+z*y*ST + fma.s1 F_DZ_TERM = F_TSS, f1, F_DZ_TERM + nop.i 0;; +} + + +.pred.rel "mutex", p6, p11 +{.mfi + nop.m 0 + // result: add high part of pi/2-table value + // s>0 in this case + (p11) fma.s0 f8 = F_DZ_TERM, f1, F_ATHI + nop.i 0 +} + +{.mfb + nop.m 0 + // result: add high part of pi/2-table value + // if s<0 + (p6) fnma.s0 f8 = F_DZ_TERM, f1, F_ATHI + br.ret.sptk b0;; +} + + + + + + +SMALL_S: + + // use 15-term polynomial approximation + +{.mmi + // r3 = pointer to polynomial coefficients + addl r3 = @ltoff(poly_coeffs), gp;; + // load start address for coefficients + ld8 r3 = [r3] + mov R_TMP = 0x3fbf;; +} + + +{.mmi + add r2 = 64, r3 + ldfe F_C3 = [r3], 16 + // p7 = 1 if |s|<2^{-64} (exponent of s<bias-64) + cmp.lt p7, p0 = R_EXP0, R_TMP;; +} + +{.mmf + ldfe F_C5 = [r3], 16 + ldfpd F_C11, F_C13 = [r2], 16 + // 2^{-128} + fma.s1 F_2M128 = F_2M64, F_2M64, f0;; +} + +{.mmf + ldfpd F_C7, F_C9 = [r3] + ldfpd F_C15, F_C17 = [r2] + // if |s|<2^{-64}, return s+2^{-128}*s + (p7) fma.s0 f8 = f8, F_2M128, f8;; +} + + + +{.mfb + nop.m 0 + // s^2 + fma.s1 F_R2 = f8, f8, f0 + // if |s|<2^{-64}, return s + (p7) br.ret.spnt b0;; +} + + +{.mfi + nop.m 0 + // s^3 + fma.s1 F_R3 = f8, F_R2, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + // s^4 + fma.s1 F_R4 = F_R2, F_R2, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c3+c5*s^2 + fma.s1 F_P35 = F_C5, F_R2, F_C3 + nop.i 0 +} + +{.mfi + nop.m 0 + // c11+c13*s^2 + fma.s1 F_P1113 = F_C13, F_R2, F_C11 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c7+c9*s^2 + fma.s1 F_P79 = F_C9, F_R2, F_C7 + nop.i 0 +} + +{.mfi + nop.m 0 + // c15+c17*s^2 + fma.s1 F_P1517 = F_C17, F_R2, F_C15 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // s^8 + fma.s1 F_R8 = F_R4, F_R4, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c3+c5*s^2+c7*s^4+c9*s^6 + fma.s1 F_P39 = F_P79, F_R4, F_P35 + nop.i 0 +} 
+ +{.mfi + nop.m 0 + // c11+c13*s^2+c15*s^4+c17*s^6 + fma.s1 F_P1117 = F_P1517, F_R4, F_P1113 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c3+..+c17*s^14 + fma.s1 F_P317 = F_R8, F_P1117, F_P39 + nop.i 0;; +} + + +{.mfb + nop.m 0 + // result + fma.s0 f8 = F_P317, F_R3, f8 + br.ret.sptk b0;; +} + + +{.mfb + nop.m 0 + fma.s0 f8 = F_P317, F_R3, f0//F_P317, F_R3, F_S29 + // nop.f 0//fma.s0 f8 = f13, f6, f0 + br.ret.sptk b0;; +} + + + + + + VERY_LARGE_INPUT: + +{.mfi + nop.m 0 + // s rounded to 24 significant bits + fma.s.s1 F_S = f8, f1, f0 + nop.i 0 +} + +{.mfi + // load C5 + ldfe F_C5 = [r3], 16 + // x = ((1-(s^2)_s)*y^2-1)/2-(s^2-(s^2)_s)*y^2/2 + fnma.s1 F_X = F_S_DS2, F_Y2_2, F_XL + nop.i 0;; +} + + + +{.mmf + nop.m 0 + // C7, C9 + ldfpd F_C7, F_C9 = [r3], 16 + nop.f 0;; +} + + + +{.mfi + // pi/2 (low, high) + ldfpd F_PI2_LO, F_PI2_HI = [r3], 16 + // c9*x+c8 + fma.s1 F_S89 = F_X, F_CS9, F_CS8 + nop.i 0 +} + +{.mfi + nop.m 0 + // x^2 + fma.s1 F_X2 = F_X, F_X, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // y*(1-s^2)*x + fma.s1 F_Y1S2X = F_Y1S2, F_X, f0 + nop.i 0 +} + +{.mfi + // C11, C13 + ldfpd F_C11, F_C13 = [r3], 16 + // c7*x+c6 + fma.s1 F_S67 = F_X, F_CS7, F_CS6 + nop.i 0;; +} + + +{.mfi + // C15, C17 + ldfpd F_C15, F_C17 = [r3], 16 + // c3*x+c2 + fma.s1 F_S23 = F_X, F_CS3, F_CS2 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c5*x+c4 + fma.s1 F_S45 = F_X, F_CS5, F_CS4 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // (s_s)^2 + fma.s1 F_DS = F_S, F_S, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + // 1-(s_s)^2 + fnma.s1 F_1S2_S = F_S, F_S, f1 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // y*(1-s^2)*x^2 + fma.s1 F_Y1S2X2 = F_Y1S2, F_X2, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + // x^4 + fma.s1 F_X4 = F_X2, F_X2, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c9*x^3+..+c6 + fma.s1 F_S69 = F_X2, F_S89, F_S67 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c5*x^3+..+c2 + fma.s1 F_S25 = F_X2, F_S45, F_S23 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // ((s_s)^2-s^2) + fnma.s1 F_DS = f8, f8, F_DS + nop.i 0 +} + +{.mfi + nop.m 0 + 
// (pi/2)_high-y*(1-(s_s)^2) + fnma.s1 F_HI = F_Y, F_1S2_S, F_PI2_HI + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c9*x^7+..+c2 + fma.s1 F_S29 = F_X4, F_S69, F_S25 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // -(y*(1-(s_s)^2))_high + fms.s1 F_1S2_HI = F_HI, f1, F_PI2_HI + nop.i 0;; +} + + +{.mfi + nop.m 0 + // (PS29*x^2+x)*y*(1-s^2) + fma.s1 F_S19 = F_Y1S2X2, F_S29, F_Y1S2X + nop.i 0;; +} + + +{.mfi + nop.m 0 + // y*(1-(s_s)^2)-(y*(1-s^2))_high + fma.s1 F_DS2 = F_Y, F_1S2_S, F_1S2_HI + nop.i 0;; +} + + + +{.mfi + nop.m 0 + // R ~ sqrt(1-s^2) + // (used for polynomial evaluation) + fnma.s1 F_R = F_S19, f1, F_Y1S2 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // y*(1-s^2)-(y*(1-s^2))_high + fma.s1 F_DS2 = F_Y, F_DS, F_DS2 + nop.i 0 +} + +{.mfi + nop.m 0 + // (pi/2)_low+(PS29*x^2)*y*(1-s^2) + fma.s1 F_S29 = F_Y1S2X2, F_S29, F_PI2_LO + nop.i 0;; +} + + + +{.mfi + nop.m 0 + // R^2 + fma.s1 F_R2 = F_R, F_R, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // (pi/2)_low+(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)-(y*(1-s^2))_high) + fms.s1 F_S29 = F_S29, f1, F_DS2 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c7+c9*R^2 + fma.s1 F_P79 = F_C9, F_R2, F_C7 + nop.i 0 +} + +{.mfi + nop.m 0 + // c3+c5*R^2 + fma.s1 F_P35 = F_C5, F_R2, F_C3 + nop.i 0;; +} + + + +{.mfi + nop.m 0 + // R^4 + fma.s1 F_R4 = F_R2, F_R2, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + // R^3 + fma.s1 F_R3 = F_R2, F_R, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c11+c13*R^2 + fma.s1 F_P1113 = F_C13, F_R2, F_C11 + nop.i 0 +} + +{.mfi + nop.m 0 + // c15+c17*R^2 + fma.s1 F_P1517 = F_C17, F_R2, F_C15 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // (pi/2)_low+(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)-(y*(1-s^2))_high)+y*(1-s^2)*x + fma.s1 F_S29 = F_Y1S2, F_X, F_S29 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // c11+c13*R^2+c15*R^4+c17*R^6 + fma.s1 F_P1117 = F_P1517, F_R4, F_P1113 + nop.i 0 +} + +{.mfi + nop.m 0 + // c3+c5*R^2+c7*R^4+c9*R^6 + fma.s1 F_P39 = F_P79, F_R4, F_P35 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // R^8 + fma.s1 F_R8 = F_R4, F_R4, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 
+ // c3+c5*R^2+c7*R^4+c9*R^6+..+c17*R^14 + fma.s1 F_P317 = F_P1117, F_R8, F_P39 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // (pi/2)_low-(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)- + // -(y*(1-s^2))_high)+y*(1-s^2)*x - P3, 17 + fnma.s1 F_S29 = F_P317, F_R3, F_S29 + nop.i 0;; +} + +{.mfi + nop.m 0 + // set sign + (p6) fnma.s1 F_S29 = F_S29, f1, f0 + nop.i 0 +} + +{.mfi + nop.m 0 + (p6) fnma.s1 F_HI = F_HI, f1, f0 + nop.i 0;; +} + + +{.mfb + nop.m 0 + // Result: + // (pi/2)_low-(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)- + // -(y*(1-s^2))_high)+y*(1-s^2)*x - P3, 17 + // +(pi/2)_high-(y*(1-s^2))_high + fma.s0 f8 = F_S29, f1, F_HI + br.ret.sptk b0;; +} + + + + + + + + + + ASINL_SPECIAL_CASES: + +{.mfi + alloc r32 = ar.pfs, 1, 4, 4, 0 + // check if the input is a NaN, or unsupported format + // (i.e. not infinity or normal/denormal) + fclass.nm p7, p8 = f8, 0x3f + // pointer to pi/2 + add r3 = 48, r3;; +} + + +{.mfi + // load pi/2 + ldfpd F_PI2_HI, F_PI2_LO = [r3] + // get |s| + fmerge.s F_S = f0, f8 + nop.i 0 +} + +{.mfb + nop.m 0 + // if NaN, quietize it, and return + (p7) fma.s0 f8 = f8, f1, f0 + (p7) br.ret.spnt b0;; +} + + +{.mfi + nop.m 0 + // |s| = 1 ? 
+ fcmp.eq.s0 p9, p0 = F_S, f1 + nop.i 0 +} + +{.mfi + nop.m 0 + // load FR_X + fma.s1 FR_X = f8, f1, f0 + // load error tag + mov GR_Parameter_TAG = 60;; +} + + +{.mfb + nop.m 0 + // change sign if s = -1 + (p6) fnma.s1 F_PI2_HI = F_PI2_HI, f1, f0 + nop.b 0 +} + +{.mfb + nop.m 0 + // change sign if s = -1 + (p6) fnma.s1 F_PI2_LO = F_PI2_LO, f1, f0 + nop.b 0;; +} + +{.mfb + nop.m 0 + // if s = 1, result is pi/2 + (p9) fma.s0 f8 = F_PI2_HI, f1, F_PI2_LO + // return if |s| = 1 + (p9) br.ret.sptk b0;; +} + + +{.mfi + nop.m 0 + // get Infinity + frcpa.s1 FR_RESULT, p0 = f1, f0 + nop.i 0;; +} + + +{.mfi + nop.m 0 + // return QNaN indefinite (0*Infinity) + fma.s0 FR_RESULT = f0, FR_RESULT, f0 + nop.i 0;; +} + + +GLOBAL_LIBM_END(asinl) + + +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue +// (1) { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value nop.f 0 @@ -742,24 +2470,29 @@ __libm_error_region: } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + add sp=-64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP=gp // Save gp };; + + +// (2) { .mmi - stfe [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + stfe [GR_Parameter_Y] = f1,16 // Store Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; + .body +// (3) { .mib - stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack add GR_Parameter_RESULT = 0,GR_Parameter_Y nop.b 0 // Parameter 3 address } { .mib - stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y br.call.sptk b0=__libm_error_support# // Call error handling function };; @@ -768,23 +2501,27 @@ __libm_error_region: nop.m 0 add GR_Parameter_RESULT = 48,sp };; + +// (4) { .mmi ldfe f8 = 
[GR_Parameter_RESULT] // Get return result off stack .restore sp add sp = 64,sp // Restore stack pointer mov b0 = GR_SAVE_B0 // Restore return address };; + { .mib - mov gp = GR_SAVE_GP // Restore gp + mov gp = GR_SAVE_GP // Restore gp mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs br.ret.sptk b0 // Return -};; +};; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# -.type __libm_atan2_reg#,@function -.global __libm_atan2_reg# + + + + diff --git a/sysdeps/ia64/fpu/e_atan2.S b/sysdeps/ia64/fpu/e_atan2.S index 38dd2f749a..8be7c6cec5 100644 --- a/sysdeps/ia64/fpu/e_atan2.S +++ b/sysdeps/ia64/fpu/e_atan2.S @@ -1,10 +1,10 @@ .file "atan2.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,33 +20,38 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial version -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly -// set [the previously overwritten] GR_Parameter_RESULT. -// 8/17/00 Changed predicate register macro-usage to direct predicate -// names due to an assembler bug. -// 9/28/00 Updated to set invalid on SNaN inputs -// 1/19/01 Fixed flags for small results +// 02/02/00 Initial version +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly +// set [the previously overwritten] GR_Parameter_RESULT. +// 08/17/00 Changed predicate register macro-usage to direct predicate +// names due to an assembler bug. 
+// 09/28/00 Updated to set invalid on SNaN inputs +// 01/19/01 Fixed flags for small results +// 04/13/01 Rescheduled to make all paths faster +// 05/20/02 Cleaned up namespace and sf0 syntax +// 08/20/02 Corrected inexact flag and directed rounding symmetry bugs +// 02/06/03 Reordered header: .section, .global, .proc, .align +// 04/17/03 Added missing mutex directive // // API //============================================================== @@ -55,10 +60,12 @@ // Overview of operation //============================================================== // +// The atan2 function returns values in the interval [-pi,+pi]. +// // There are two basic paths: swap true and swap false. // atan2(Y,X) ==> atan2(V/U) where U >= V. If Y > X, we must swap. // -// p6 swap True |Y| > |X| +// p6 swap True |Y| > |X| // p7 swap False |Y| <= |X| // p8 X+ (If swap=True p8=p9=0) // p9 X- @@ -66,21 +73,21 @@ // all the other predicates p10 thru p15 are false for the main path // // Simple trigonometric identities show -// Region 1 (-45 to +45 degrees): +// Region 1 (-45 to +45 degrees): // X>0, |Y|<=X, V=Y, U=X atan2(Y,X) = sgnY * (0 + atan(V/U)) // -// Region 2 (-90 to -45 degrees, and +45 to +90 degrees): +// Region 2 (-90 to -45 degrees, and +45 to +90 degrees): // X>0, |Y|>X, V=X, U=Y atan2(Y,X) = sgnY * (pi/2 - atan(V/U)) // -// Region 3 (-135 to -90 degrees, and +90 to +135 degrees): +// Region 3 (-135 to -90 degrees, and +90 to +135 degrees): // X<0, |Y|>X, V=X, U=Y atan2(Y,X) = sgnY * (pi/2 + atan(V/U)) // -// Region 4 (-180 to -135 degrees, and +135 to +180 degrees): +// Region 4 (-180 to -135 degrees, and +135 to +180 degrees): // X<0, |Y|<=X, V=Y, U=X atan2(Y,X) = sgnY * (pi - atan(V/U)) // // So the result is always of the form atan2(Y,X) = P + sgnXY * atan(V/U) // -// We compute atan(V/U) from the identity +// We compute atan(V/U) from the identity // atan(z) + atan([(V/U)-z] / [1+(V/U)z]) // where z is a limited precision approximation (16 bits) to V/U // @@ -124,13 +131,13 
@@ // +number -0 +pi/2 // -number -0 -pi/2 // -// +0 +number +0 -// -0 +number -0 +// +0 +number +0 +// -0 +number -0 // +0 -number +pi // -0 -number -pi // -// +0 +0 +0 -// -0 +0 -0 +// +0 +0 +0 +// -0 +0 -0 // +0 -0 +pi // -0 -0 -pi // @@ -138,16 +145,26 @@ // anything NaN quiet X // atan2(+-0/+-0) sets double error tag to 37 -// atan2(+-0/+-0) sets single error tag to 38 -#include "libm_support.h" +// Registers used +//============================================================== + +// predicate registers used: +// p6 -> p15 + +// floating-point registers used: +// f8, f9 input +// f32 -> f119 + +// general registers used +// r32 -> r41 // Assembly macros //============================================================== EXP_AD_P1 = r33 EXP_AD_P2 = r34 -atan2_GR_sml_exp = r35 +rsig_near_one = r35 GR_SAVE_B0 = r35 @@ -159,22 +176,23 @@ GR_Parameter_Y = r39 GR_Parameter_RESULT = r40 atan2_GR_tag = r41 - -atan2_X = f9 atan2_Y = f8 +atan2_X = f9 atan2_u1_X = f32 atan2_u1_Y = f33 -atan2_Umax = f34 -atan2_Vmin = f35 +atan2_z2_X = f34 +atan2_z2_Y = f35 + atan2_two = f36 -atan2_absX = f37 +atan2_B1sq_Y = f37 atan2_z1_X = f38 atan2_z1_Y = f39 atan2_B1X = f40 + atan2_B1Y = f41 -atan2_wp = f42 -atan2_B1sq = f43 +atan2_wp_X = f42 +atan2_B1sq_X = f43 atan2_z = f44 atan2_w = f45 @@ -183,178 +201,149 @@ atan2_P1 = f47 atan2_P2 = f48 atan2_P3 = f49 atan2_P4 = f50 + atan2_P5 = f51 atan2_P6 = f52 atan2_P7 = f53 atan2_P8 = f54 atan2_P9 = f55 + atan2_P10 = f56 atan2_P11 = f57 atan2_P12 = f58 atan2_P13 = f59 atan2_P14 = f60 + atan2_P15 = f61 atan2_P16 = f62 atan2_P17 = f63 atan2_P18 = f64 atan2_P19 = f65 + atan2_P20 = f66 atan2_P21 = f67 atan2_P22 = f68 -atan2_Pi_by_2 = f69 - +atan2_tmp = f68 +atan2_pi_by_2 = f69 +atan2_sgn_pi_by_2 = f69 atan2_V13 = f70 + atan2_W11 = f71 atan2_E = f72 -atan2_gamma = f73 +atan2_wp_Y = f73 atan2_V11 = f74 atan2_V12 = f75 + atan2_V7 = f76 atan2_V8 = f77 atan2_W7 = f78 atan2_W8 = f79 atan2_W3 = f80 + atan2_W4 = f81 atan2_V3 = f82 atan2_V4 = f83 atan2_F = 
f84 atan2_gV = f85 + atan2_V10 = f86 atan2_zcub = f87 atan2_V6 = f88 atan2_V9 = f89 atan2_W10 = f90 + atan2_W6 = f91 atan2_W2 = f92 atan2_V2 = f93 - atan2_alpha = f94 atan2_alpha_1 = f95 + atan2_gVF = f96 atan2_V5 = f97 atan2_W12 = f98 atan2_W5 = f99 atan2_alpha_sq = f100 + atan2_Cp = f101 atan2_V1 = f102 - -atan2_sml_norm = f103 -atan2_FR_tmp = f103 - +atan2_ysq = f103 atan2_W1 = f104 atan2_alpha_cub = f105 + atan2_C = f106 -atan2_P = f107 +atan2_xsq = f107 atan2_d = f108 atan2_A_hi = f109 atan2_dsq = f110 + atan2_pd = f111 atan2_A_lo = f112 atan2_A = f113 - atan2_Pp = f114 +atan2_sgnY = f115 -atan2_sgnY = f116 +atan2_sig_near_one = f116 +atan2_near_one = f116 atan2_pi = f117 -atan2_sgnX = f118 -atan2_sgnXY = f119 - -atan2_3pi_by_4 = f120 -atan2_pi_by_4 = f121 - -//atan2_sF = p7 -//atan2_sT = p6 +atan2_sgn_pi = f117 +atan2_3pi_by_4 = f118 +atan2_pi_by_4 = f119 -// These coefficients are for atan2. -// You can also use this set to substitute those used in the |X| <= 1 case for atan; -// BUT NOT vice versa. 
///////////////////////////////////////////////////////////// -#ifdef _LIBC -.rodata -#else -.data -#endif +RODATA .align 16 -atan2_tb1: -ASM_TYPE_DIRECTIVE(atan2_tb1,@object) -data8 0xB199DD6D2675C40F , 0x0000BFFA // P10 +LOCAL_OBJECT_START(atan2_tb1) data8 0xA21922DC45605EA1 , 0x00003FFA // P11 -data8 0xD78F28FC2A592781 , 0x0000BFFA // P8 +data8 0xB199DD6D2675C40F , 0x0000BFFA // P10 data8 0xC2F01E5DDD100DBE , 0x00003FFA // P9 -data8 0x9D89D7D55C3287A5 , 0x00003FFB // P5 +data8 0xD78F28FC2A592781 , 0x0000BFFA // P8 data8 0xF0F03ADB3FC930D3 , 0x00003FFA // P7 -data8 0xF396268151CFB11C , 0x00003FF7 // P17 -data8 0x9D3436AABE218776 , 0x00003FF5 // P19 -data8 0x80D601879218B53A , 0x00003FFA // P13 -data8 0xA2270D30A90AA220 , 0x00003FF9 // P15 -data8 0xCCCCCCCCCCC906CD , 0x00003FFC // P1 +data8 0x88887EBB209E3543 , 0x0000BFFB // P6 +data8 0x9D89D7D55C3287A5 , 0x00003FFB // P5 +data8 0xBA2E8B9793955C77 , 0x0000BFFB // P4 data8 0xE38E38E320A8A098 , 0x00003FFB // P3 -data8 0xFE7E52D2A89995B3 , 0x0000BFEC // P22 -data8 0xC90FDAA22168C235 , 0x00003FFE // pi/4 -ASM_SIZE_DIRECTIVE(atan2_tb1) +data8 0x9249249247E37913 , 0x0000BFFC // P2 +data8 0xCCCCCCCCCCC906CD , 0x00003FFC // P1 +data8 0xAAAAAAAAAAAAA8A9 , 0x0000BFFD // P0 +data8 0xC90FDAA22168C235 , 0x00004000 // pi +LOCAL_OBJECT_END(atan2_tb1) -atan2_tb2: -ASM_TYPE_DIRECTIVE(atan2_tb2,@object) -data8 0x9F90FB984D8E39D0 , 0x0000BFF3 // P20 +LOCAL_OBJECT_START(atan2_tb2) data8 0xCE585A259BD8374C , 0x00003FF0 // P21 -data8 0xBA2E8B9793955C77 , 0x0000BFFB // P4 -data8 0x88887EBB209E3543 , 0x0000BFFB // P6 -data8 0xD818B4BB43D84BF2 , 0x0000BFF8 // P16 +data8 0x9F90FB984D8E39D0 , 0x0000BFF3 // P20 +data8 0x9D3436AABE218776 , 0x00003FF5 // P19 data8 0xDEC343E068A6D2A8 , 0x0000BFF6 // P18 -data8 0x9297B23CCFFB291F , 0x0000BFFA // P12 +data8 0xF396268151CFB11C , 0x00003FF7 // P17 +data8 0xD818B4BB43D84BF2 , 0x0000BFF8 // P16 +data8 0xA2270D30A90AA220 , 0x00003FF9 // P15 data8 0xD5F4F2182E7A8725 , 0x0000BFF9 // P14 -data8 
0xAAAAAAAAAAAAA8A9 , 0x0000BFFD // P0 -data8 0x9249249247E37913 , 0x0000BFFC // P2 +data8 0x80D601879218B53A , 0x00003FFA // P13 +data8 0x9297B23CCFFB291F , 0x0000BFFA // P12 +data8 0xFE7E52D2A89995B3 , 0x0000BFEC // P22 data8 0xC90FDAA22168C235 , 0x00003FFF // pi/2 -data8 0xC90FDAA22168C235 , 0x00004000 // pi +data8 0xC90FDAA22168C235 , 0x00003FFE // pi/4 data8 0x96cbe3f9990e91a8 , 0x00004000 // 3pi/4 -ASM_SIZE_DIRECTIVE(atan2_tb2) - - +LOCAL_OBJECT_END(atan2_tb2) -.align 32 -.global atan2# -#ifdef _LIBC -.global __atan2# -.global __ieee754_atan2# -#endif -//////////////////////////////////////////////////////// .section .text -.align 32 - -.proc atan2# -atan2: -#ifdef _LIBC -.proc __atan2# -__atan2: -.proc __ieee754_atan2# -__ieee754_atan2: -#endif -// qnan snan inf norm unorm 0 -+ -// 0 0 1 0 0 0 11 - - -// Y NAN? p10 p11 -// p10 ==> quiet Y and return -// p11 X NAN? p12, p13 -// p12 ==> quiet X and return +GLOBAL_IEEE754_ENTRY(atan2) { .mfi alloc r32 = ar.pfs,1,5,4,0 frcpa.s1 atan2_u1_X,p6 = f1,atan2_X - addl EXP_AD_P2 = @ltoff(atan2_tb2), gp + nop.i 999 } { .mfi addl EXP_AD_P1 = @ltoff(atan2_tb1), gp - fclass.m.unc p10,p11 = f8, 0xc3 + fma.s1 atan2_two = f1,f1,f1 nop.i 999 ;; } @@ -366,256 +355,233 @@ __ieee754_atan2: } { .mfi nop.m 999 - fma.s1 atan2_two = f1,f1,f1 + fma.s1 atan2_xsq = atan2_X,atan2_X,f0 nop.i 999 ;; } - { .mfi - ld8 EXP_AD_P2 = [ EXP_AD_P2] - famax.s1 atan2_Umax = f8,f9 + nop.m 999 + fclass.m p10,p0 = atan2_Y, 0xc3 // Test for y=nan nop.i 999 } -;; - { .mfi nop.m 999 - fmerge.s atan2_absX = f0,atan2_X + fma.s1 atan2_ysq = atan2_Y,atan2_Y,f0 nop.i 999 } ;; -// p10 Y NAN, quiet and return { .mfi - ldfe atan2_P10 = [EXP_AD_P1],16 - fmerge.s atan2_sgnY = atan2_Y,f1 + add EXP_AD_P2 = 0xd0,EXP_AD_P1 + fclass.m p12,p0 = atan2_X, 0xc3 // Test for x nan nop.i 999 } -{ .mfb - nop.m 999 -(p10) fma.d f8 = f8,f9,f0 -(p10) br.ret.spnt b0 ;; -} -{ .mmf +// p10 Y NAN, quiet and return +{ .mfi ldfe atan2_P11 = [EXP_AD_P1],16 - ldfe atan2_P20 = [EXP_AD_P2],16 
- fmerge.s atan2_sgnX = atan2_X,f1 + fmerge.s atan2_sgnY = atan2_Y,f1 + nop.i 999 +} +{ .mfb + ldfe atan2_P21 = [EXP_AD_P2],16 +(p10) fma.d.s0 f8 = atan2_Y,atan2_X,f0 // If y=nan, result quietized y +(p10) br.ret.spnt b0 // Exit if y=nan ;; } -{ .mfi - ldfe atan2_P8 = [EXP_AD_P1],16 +{ .mfi + ldfe atan2_P10 = [EXP_AD_P1],16 fma.s1 atan2_z1_X = atan2_u1_X, atan2_Y, f0 nop.i 999 } -{ .mfi - - ldfe atan2_P21 = [EXP_AD_P2],16 - fma.s1 atan2_z1_Y = atan2_u1_Y, atan2_X, f0 +{ .mfi + ldfe atan2_P20 = [EXP_AD_P2],16 + fnma.s1 atan2_B1X = atan2_u1_X, atan2_X, atan2_two nop.i 999 ;; } -{ .mfi +{ .mfi ldfe atan2_P9 = [EXP_AD_P1],16 - fnma.s1 atan2_B1X = atan2_u1_X, atan2_X, atan2_two + fma.s1 atan2_z1_Y = atan2_u1_Y, atan2_X, f0 nop.i 999 } -{ .mfi - - ldfe atan2_P4 = [EXP_AD_P2],16 +{ .mfi + ldfe atan2_P19 = [EXP_AD_P2],16 fnma.s1 atan2_B1Y = atan2_u1_Y, atan2_Y, atan2_two nop.i 999 -;; } - -// p6 (atan2_sT) true if swap -// p7 (atan2_sF) true if no swap -// p11 ==> Y !NAN; X NAN? +;; { .mfi - ldfe atan2_P5 = [EXP_AD_P1],16 -// fcmp.eq.unc.s1 atan2_sF,atan2_sT = atan2_Umax, atan2_X - fcmp.eq.unc.s1 p7,p6 = atan2_Umax, atan2_X + ldfe atan2_P8 = [EXP_AD_P1],16 + fma.s1 atan2_z2_X = atan2_u1_X, atan2_ysq, f0 nop.i 999 } { .mfi - ldfe atan2_P6 = [EXP_AD_P2],16 -(p11) fclass.m.unc p12,p13 = f9, 0xc3 + ldfe atan2_P18 = [EXP_AD_P2],16 + fma.s1 atan2_z2_Y = atan2_u1_Y, atan2_xsq, f0 nop.i 999 -;; } - -{ .mmf - ldfe atan2_P7 = [EXP_AD_P1],16 - ldfe atan2_P16 = [EXP_AD_P2],16 - famin.s1 atan2_Vmin = f8,f9 ;; -} -// p8 true if X positive -// p9 true if X negative -// both are false is swap is true +// p10 ==> x inf y ? +// p11 ==> x !inf y ? 
{ .mfi - ldfe atan2_P17 = [EXP_AD_P1],16 -//(atan2_sF) fcmp.eq.unc.s1 p8,p9 = atan2_sgnX,f1 -(p7) fcmp.eq.unc.s1 p8,p9 = atan2_sgnX,f1 + ldfe atan2_P7 = [EXP_AD_P1],16 + fclass.m p10,p11 = atan2_X, 0x23 // test for x inf nop.i 999 } -{ .mfi - ldfe atan2_P18 = [EXP_AD_P2],16 - fma.s1 atan2_sgnXY = atan2_sgnX, atan2_sgnY, f0 - nop.i 999 +{ .mfb + ldfe atan2_P17 = [EXP_AD_P2],16 +(p12) fma.d.s0 f8 = atan2_X,atan2_Y,f0 // If x nan, result quiet x +(p12) br.ret.spnt b0 // Exit for x nan ;; } +// p6 true if swap, means |y| > |x| or ysq > xsq +// p7 true if no swap, means |x| >= |y| or xsq >= ysq +{ .mmf + ldfe atan2_P6 = [EXP_AD_P1],16 + ldfe atan2_P16 = [EXP_AD_P2],16 + fcmp.ge.s1 p7,p6 = atan2_xsq, atan2_ysq +;; +} { .mfi - ldfe atan2_P19 = [EXP_AD_P1],16 -//(atan2_sF) fma.s1 atan2_wp = atan2_z1_X, atan2_z1_X, f0 -(p7) fma.s1 atan2_wp = atan2_z1_X, atan2_z1_X, f0 + ldfe atan2_P5 = [EXP_AD_P1],16 + fma.s1 atan2_wp_X = atan2_z1_X, atan2_z1_X, f0 nop.i 999 } { .mfi - ldfe atan2_P12 = [EXP_AD_P2],16 -//(atan2_sT) fma.s1 atan2_wp = atan2_z1_Y, atan2_z1_Y, f0 -(p6) fma.s1 atan2_wp = atan2_z1_Y, atan2_z1_Y, f0 + ldfe atan2_P15 = [EXP_AD_P2],16 + fma.s1 atan2_B1sq_X = atan2_B1X, atan2_B1X, f0 nop.i 999 ;; } - { .mfi - ldfe atan2_P13 = [EXP_AD_P1],16 -//(atan2_sF) fma.s1 atan2_z = atan2_z1_X, atan2_B1X, f0 -(p7) fma.s1 atan2_z = atan2_z1_X, atan2_B1X, f0 + ldfe atan2_P4 = [EXP_AD_P1],16 +(p6) fma.s1 atan2_wp_Y = atan2_z1_Y, atan2_z1_Y, f0 nop.i 999 } { .mfi ldfe atan2_P14 = [EXP_AD_P2],16 -//(atan2_sT) fma.s1 atan2_z = atan2_z1_Y, atan2_B1Y, f0 -(p6) fma.s1 atan2_z = atan2_z1_Y, atan2_B1Y, f0 +(p6) fma.s1 atan2_B1sq_Y = atan2_B1Y, atan2_B1Y, f0 nop.i 999 ;; } - { .mfi - ldfe atan2_P15 = [EXP_AD_P1],16 -//(atan2_sF) fma.s1 atan2_B1sq = atan2_B1X, atan2_B1X, f0 -(p7) fma.s1 atan2_B1sq = atan2_B1X, atan2_B1X, f0 + ldfe atan2_P3 = [EXP_AD_P1],16 +(p6) fma.s1 atan2_E = atan2_z2_Y, atan2_B1Y, atan2_Y nop.i 999 } { .mfi - ldfe atan2_P0 = [EXP_AD_P2],16 -//(atan2_sT) fma.s1 atan2_B1sq 
= atan2_B1Y, atan2_B1Y, f0 -(p6) fma.s1 atan2_B1sq = atan2_B1Y, atan2_B1Y, f0 + ldfe atan2_P13 = [EXP_AD_P2],16 +(p7) fma.s1 atan2_E = atan2_z2_X, atan2_B1X, atan2_X nop.i 999 ;; } -// p12 ==> X NAN, quiet and return { .mfi - ldfe atan2_P1 = [EXP_AD_P1],16 - fmerge.s atan2_Umax = f0,atan2_Umax + ldfe atan2_P2 = [EXP_AD_P1],16 +(p6) fma.s1 atan2_z = atan2_z1_Y, atan2_B1Y, f0 nop.i 999 } -{ .mfb - ldfe atan2_P2 = [EXP_AD_P2],16 -(p12) fma.d f8 = f9,f8,f0 -(p12) br.ret.spnt b0 +{ .mfi + ldfe atan2_P12 = [EXP_AD_P2],16 +(p7) fma.s1 atan2_z = atan2_z1_X, atan2_B1X, f0 + nop.i 999 ;; } -// p10 ==> x inf y ? -// p11 ==> x !inf y ? { .mfi - ldfe atan2_P3 = [EXP_AD_P1],16 - fmerge.s atan2_Vmin = f0,atan2_Vmin + ldfe atan2_P1 = [EXP_AD_P1],16 + fcmp.eq.s0 p14,p15=atan2_X,atan2_Y // Dummy for denorm and invalid nop.i 999 } -{ .mfi - ldfe atan2_Pi_by_2 = [EXP_AD_P2],16 - fclass.m.unc p10,p11 = f9, 0x23 - nop.i 999 +{ .mlx + ldfe atan2_P22 = [EXP_AD_P2],16 + movl rsig_near_one = 0x8000000000000001 // signif near 1.0 ;; } +// p12 ==> x inf y inf +// p13 ==> x inf y !inf { .mmf - ldfe atan2_P22 = [EXP_AD_P1],16 - ldfe atan2_pi = [EXP_AD_P2],16 - nop.f 999 + ldfe atan2_P0 = [EXP_AD_P1],16 + ldfe atan2_pi_by_2 = [EXP_AD_P2],16 +(p10) fclass.m.unc p12,p13 = atan2_Y, 0x23 // x inf, test if y inf ;; } { .mfi - nop.m 999 - fcmp.eq.s0 p12,p13=f9,f8 // Dummy to catch denormal and invalid + ldfe atan2_pi = [EXP_AD_P1],16 +(p6) fma.s1 atan2_w = atan2_wp_Y, atan2_B1sq_Y,f0 nop.i 999 -;; } - - { .mfi - ldfe atan2_pi_by_4 = [EXP_AD_P1],16 -//(atan2_sT) fmerge.ns atan2_sgnXY = atan2_sgnXY, atan2_sgnXY -(p6) fmerge.ns atan2_sgnXY = atan2_sgnXY, atan2_sgnXY + ldfe atan2_pi_by_4 = [EXP_AD_P2],16 +(p7) fma.s1 atan2_w = atan2_wp_X, atan2_B1sq_X,f0 nop.i 999 +;; } + { .mfi ldfe atan2_3pi_by_4 = [EXP_AD_P2],16 - fma.s1 atan2_w = atan2_wp, atan2_B1sq,f0 +(p11) fclass.m.unc p9,p0 = atan2_Y, 0x23 // x not inf, test if y inf nop.i 999 ;; } -// p12 ==> x inf y inf -// p13 ==> x inf y !inf +{ .mfi + 
setf.sig atan2_sig_near_one = rsig_near_one +(p12) fcmp.gt.unc.s1 p10,p11 = atan2_X,f0 // x inf, y inf, test if x +inf + nop.i 999 +} { .mfi nop.m 999 - fmerge.s atan2_z = f0, atan2_z +(p6) fnma.s1 atan2_gV = atan2_Y, atan2_z, atan2_X nop.i 999 ;; } { .mfi - nop.m 99 -(p10) fclass.m.unc p12,p13 = f8, 0x23 + nop.m 999 + frcpa.s1 atan2_F,p0 = f1, atan2_E nop.i 999 } { .mfi - nop.m 99 -(p11) fclass.m.unc p14,p15 = f8, 0x23 + nop.m 999 +(p7) fnma.s1 atan2_gV = atan2_X, atan2_z, atan2_Y nop.i 999 ;; } +// p13 ==> x inf y !inf { .mfi nop.m 999 -(p12) fcmp.eq.unc.s1 p10,p11 = atan2_sgnX,f1 - nop.i 99 -;; +(p13) fcmp.gt.unc.s1 p14,p15 = atan2_X,f0 // x inf, y !inf, test if x +inf + nop.i 999 } - - { .mfb - mov atan2_GR_sml_exp = 0x1 // Small exponent for making small norm -(p14) fma.d f8 = atan2_sgnY, atan2_Pi_by_2, f0 -(p14) br.ret.spnt b0 + nop.m 999 +(p9) fma.d.s0 f8 = atan2_sgnY, atan2_pi_by_2, f0 // +-pi/2 if x !inf, y inf +(p9) br.ret.spnt b0 // exit if x not inf, y inf, result is +-pi/2 ;; } -// Make a very small normal in case need to force inexact and underflow { .mfi - setf.exp atan2_sml_norm = atan2_GR_sml_exp + nop.m 999 fma.s1 atan2_V13 = atan2_w, atan2_P11, atan2_P10 nop.i 999 } @@ -626,58 +592,58 @@ __ieee754_atan2: ;; } - { .mfi nop.m 999 - fma.s1 atan2_E = atan2_Vmin, atan2_z, atan2_Umax + fma.s1 atan2_V11 = atan2_w, atan2_P9, atan2_P8 nop.i 999 } { .mfi nop.m 999 - fnma.s1 atan2_gamma = atan2_Umax, atan2_z, f1 + fma.s1 atan2_V12 = atan2_w, atan2_w, f0 nop.i 999 ;; } { .mfi nop.m 999 - fma.s1 atan2_V11 = atan2_w, atan2_P9, atan2_P8 + fma.s1 atan2_V8 = atan2_w, atan2_P7 , atan2_P6 nop.i 999 } { .mfi nop.m 999 - fma.s1 atan2_V12 = atan2_w, atan2_w, f0 + fma.s1 atan2_W8 = atan2_w, atan2_P19, atan2_P18 nop.i 999 ;; } { .mfi nop.m 999 - fma.s1 atan2_V7 = atan2_w, atan2_P5 , atan2_P4 + fnma.s1 atan2_alpha = atan2_E, atan2_F, f1 nop.i 999 } { .mfi nop.m 999 - fma.s1 atan2_V8 = atan2_w, atan2_P7 , atan2_P6 + fnma.s1 atan2_alpha_1 = atan2_E, atan2_F, atan2_two 
nop.i 999 ;; } + { .mfi nop.m 999 - fma.s1 atan2_W7 = atan2_w, atan2_P17, atan2_P16 + fma.s1 atan2_V7 = atan2_w, atan2_P5 , atan2_P4 nop.i 999 } { .mfi nop.m 999 - fma.s1 atan2_W8 = atan2_w, atan2_P19, atan2_P18 + fma.s1 atan2_W7 = atan2_w, atan2_P17, atan2_P16 nop.i 999 ;; } { .mfi nop.m 999 - fma.s1 atan2_W3 = atan2_w, atan2_P13, atan2_P12 + fma.s1 atan2_V4 = atan2_w, atan2_P3 , atan2_P2 nop.i 999 } { .mfi @@ -689,55 +655,55 @@ __ieee754_atan2: { .mfi nop.m 999 - fma.s1 atan2_V3 = atan2_w, atan2_P1 , atan2_P0 + fma.s1 atan2_V3 = atan2_w, atan2_P1 , atan2_P0 nop.i 999 } { .mfi nop.m 999 - fma.s1 atan2_V4 = atan2_w, atan2_P3 , atan2_P2 + fma.s1 atan2_W3 = atan2_w, atan2_P13, atan2_P12 nop.i 999 ;; } { .mfi nop.m 999 - fma.s1 atan2_zcub = atan2_z, atan2_w, f0 + fma.s1 atan2_V10 = atan2_V12, atan2_V13, atan2_V11 nop.i 999 } { .mfi nop.m 999 - fnma.s1 atan2_gV = atan2_Umax, atan2_z, atan2_Vmin + fma.s1 atan2_gVF = atan2_gV, atan2_F, f0 nop.i 999 ;; } { .mfi nop.m 999 - frcpa.s1 atan2_F,p15 = f1, atan2_E + fma.s1 atan2_alpha_sq = atan2_alpha, atan2_alpha, f0 nop.i 999 } { .mfi nop.m 999 - fma.s1 atan2_V10 = atan2_V12, atan2_V13, atan2_V11 + fma.s1 atan2_Cp = atan2_alpha, atan2_alpha_1, f1 nop.i 999 ;; } { .mfi nop.m 999 - fma.s1 atan2_V6 = atan2_V12, atan2_V8 , atan2_V7 + fma.s1 atan2_V9 = atan2_V12, atan2_V12, f0 nop.i 999 } { .mfi nop.m 999 - fma.s1 atan2_V9 = atan2_V12, atan2_V12, f0 + fma.s1 atan2_W10 = atan2_V12, atan2_P22 , atan2_W11 nop.i 999 ;; } { .mfi nop.m 999 - fma.s1 atan2_W10 = atan2_V12, atan2_P22 , atan2_W11 + fma.s1 atan2_V6 = atan2_V12, atan2_V8 , atan2_V7 nop.i 999 } { .mfi @@ -749,65 +715,47 @@ __ieee754_atan2: { .mfi nop.m 999 - fma.s1 atan2_W2 = atan2_V12, atan2_W4 , atan2_W3 + fma.s1 atan2_V2 = atan2_V12, atan2_V4 , atan2_V3 nop.i 999 } { .mfi nop.m 999 - fma.s1 atan2_V2 = atan2_V12, atan2_V4 , atan2_V3 + fma.s1 atan2_W2 = atan2_V12, atan2_W4 , atan2_W3 nop.i 999 ;; } - -// Both X and Y are INF -// p10 ==> X + -// p11 ==> X - -.pred.rel 
"mutex",p10,p11 -{ .mfb - nop.m 999 -(p10) fma.d f8 = atan2_sgnY, atan2_pi_by_4, f0 -(p10) br.ret.spnt b0 -} -{ .mfb - nop.m 999 -(p11) fma.d f8 = atan2_sgnY, atan2_3pi_by_4, f0 -(p11) br.ret.spnt b0 -;; -} - - -.pred.rel "mutex",p8,p9,p6 +// p8 ==> y 0 x? +// p9 ==> y !0 x? { .mfi nop.m 999 - fnma.s1 atan2_alpha = atan2_E, atan2_F, f1 + fclass.m p8,p9 = atan2_Y, 0x07 // Test for y=0 nop.i 999 } { .mfi nop.m 999 - fnma.s1 atan2_alpha_1 = atan2_E, atan2_F, atan2_two + fma.s1 atan2_zcub = atan2_z, atan2_w, f0 nop.i 999 ;; } - { .mfi nop.m 999 -//(atan2_sT) fmerge.s atan2_P = atan2_Y, atan2_Pi_by_2 -(p6) fmerge.s atan2_P = atan2_Y, atan2_Pi_by_2 + fma.s1 atan2_alpha_cub = atan2_alpha, atan2_alpha_sq, f0 nop.i 999 } { .mfi nop.m 999 - fma.s1 atan2_gVF = atan2_gV, atan2_F, f0 + fma.s1 atan2_C = atan2_gVF, atan2_Cp, f0 nop.i 999 ;; } - +// p12 ==> y0 x0 +// p13 ==> y0 x!0 { .mfi nop.m 999 - fma.s1 atan2_V5 = atan2_V9, atan2_V10, atan2_V6 +(p8) fclass.m.unc p12,p13 = atan2_X, 0x07 // y=0, test if x is 0 nop.i 999 } { .mfi @@ -817,11 +765,9 @@ __ieee754_atan2: ;; } - - { .mfi nop.m 999 -(p8) fmerge.s atan2_P = atan2_sgnY, f0 + fma.s1 atan2_V5 = atan2_V9, atan2_V10, atan2_V6 nop.i 999 } { .mfi @@ -832,249 +778,214 @@ __ieee754_atan2: } - - +// p9 ==> y!0 x0 { .mfi nop.m 999 -(p9) fmerge.s atan2_P = atan2_sgnY, atan2_pi +(p9) fclass.m.unc p9,p0 = atan2_X, 0x07 // y not 0, test if x is 0 nop.i 999 +} +// p10 ==> X +INF, Y +-INF +{ .mfb + nop.m 999 +(p10) fma.d.s0 f8 = atan2_sgnY, atan2_pi_by_4, f0 // x=+inf, y=inf +(p10) br.ret.spnt b0 // Exit for x=+inf, y=inf, result is +-pi/4 ;; } - +.pred.rel "mutex",p11,p14 { .mfi nop.m 999 - fma.s1 atan2_alpha_sq = atan2_alpha, atan2_alpha, f0 +(p14) fmerge.s f8 = atan2_sgnY, f0 // x=+inf, y !inf, result +-0 nop.i 999 } -{ .mfi +// p11 ==> X -INF, Y +-INF +{ .mfb nop.m 999 - fma.s1 atan2_Cp = atan2_alpha, atan2_alpha_1, f1 - nop.i 999 +(p11) fma.d.s0 f8 = atan2_sgnY, atan2_3pi_by_4, f0 // x=-inf, y=inf +(p11) br.ret.spnt b0 // Exit for 
x=-inf, y=inf, result is +-3pi/4 ;; } - { .mfi nop.m 999 - fma.s1 atan2_V1 = atan2_V9, atan2_V5, atan2_V2 +(p13) fcmp.gt.unc.s1 p10,p11 = atan2_X,f0 // x not 0, y=0, test if x>0 nop.i 999 } -{ .mfi +{ .mfb nop.m 999 - fma.s1 atan2_W12 = atan2_V9, atan2_W12, f0 - nop.i 999 + fma.s1 atan2_d = atan2_alpha_cub, atan2_C, atan2_C +(p14) br.ret.spnt b0 // Exit if x=+inf, y !inf, result +-0 ;; } - -// p13 ==> x inf y !inf { .mfi nop.m 999 - fma.s1 atan2_W1 = atan2_V9, atan2_W5, atan2_W2 + fma.s1 atan2_W12 = atan2_V9, atan2_W12, f0 nop.i 999 } -{ .mfi +{ .mfb nop.m 999 -(p13) fcmp.eq.unc.s1 p10,p11 = atan2_sgnX,f1 - nop.i 999 +(p9) fma.d.s0 f8 = atan2_sgnY, atan2_pi_by_2, f0 // x=0, y not 0 +(p9) br.ret.spnt b0 // Exit if x=0 and y not 0, result is +-pi/2 ;; } - { .mfi nop.m 999 - fma.s1 atan2_alpha_cub = atan2_alpha, atan2_alpha_sq, f0 + fma.s1 atan2_V1 = atan2_V9, atan2_V5, atan2_V2 nop.i 999 } -{ .mfi +{ .mfb nop.m 999 - fma.s1 atan2_C = atan2_gVF, atan2_Cp, f0 - nop.i 999 + fma.s1 atan2_W1 = atan2_V9, atan2_W5, atan2_W2 +(p12) br.spnt ATAN2_ERROR // Branch if x=0 and y=0 ;; } -.pred.rel "mutex",p10,p11 -// x inf y !inf -{ .mfb +{ .mfi nop.m 999 -(p10) fmerge.s f8 = atan2_sgnY, f0 -(p10) br.ret.spnt b0 +(p10) fmerge.s f8 = atan2_sgnY, f0 // +-0 if x>0, y=0 + nop.i 999 } { .mfb nop.m 999 -(p11) fma.d f8 = atan2_sgnY, atan2_pi, f0 -(p11) br.ret.spnt b0 +(p11) fma.d.s0 f8 = atan2_sgnY, atan2_pi, f0 // +-pi if x<0, y=0 +(p13) br.ret.spnt b0 // Exit if x!0 and y=0 ;; } - -// p10 ==> y 0 x? -// p11 ==> y !0 x? 
{ .mfi nop.m 999 - fclass.m.unc p10,p11 = f8, 0x07 + fma.s1 atan2_pd = atan2_P0, atan2_d, f0 nop.i 999 -;; } - { .mfi nop.m 999 -(p8) fmerge.s atan2_sml_norm = atan2_sgnY, atan2_sml_norm + fma.s1 atan2_dsq = atan2_d, atan2_d, f0 nop.i 999 ;; } + { .mfi nop.m 999 - fma.s1 atan2_Pp = atan2_W12, atan2_W1, atan2_V1 + fmerge.se atan2_near_one = f1, atan2_sig_near_one // Const ~1.0 nop.i 999 } { .mfi nop.m 999 - fma.s1 atan2_d = atan2_alpha_cub, atan2_C, atan2_C + fma.s1 atan2_Pp = atan2_W12, atan2_W1, atan2_V1 nop.i 999 ;; } -// p12 ==> y0 x0 -// p13 ==> y0 x!0 -// p14 ==> y!0 x0 -// p15 ==> y!0 x!0 -{ .mfi - nop.m 999 -(p10) fclass.m.unc p12,p13 = f9, 0x07 - nop.i 999 -} +// p8 true if no swap and X positive +// p9 true if no swap and X negative +// both are false is swap is true { .mfi nop.m 999 -(p11) fclass.m.unc p14,p15 = f9, 0x07 +(p7) fcmp.ge.unc.s1 p8,p9 = atan2_X,f0 nop.i 999 -;; } - - - - { .mfb nop.m 999 -(p13) fcmp.eq.unc.s1 p10,p11 = atan2_sgnX,f1 -(p12) br.spnt ATAN2_ERROR +(p15) fma.d.s0 f8 = atan2_sgnY, atan2_pi, f0 +(p15) br.ret.spnt b0 // Exit if x=-inf, y !inf, result +-pi ;; } - - { .mfi nop.m 999 - fma.s1 atan2_pd = atan2_P0, atan2_d, f0 + fma.s1 atan2_sgn_pi_by_2 = atan2_pi_by_2, atan2_sgnY, f0 nop.i 999 } { .mfi nop.m 999 - fma.s1 atan2_dsq = atan2_d, atan2_d, f0 + fma.s1 atan2_A_lo = atan2_pd, atan2_dsq, atan2_d nop.i 999 ;; } + { .mfi nop.m 999 - fma.s1 atan2_A_hi = atan2_zcub, atan2_Pp, atan2_z + fma.s1 atan2_sgn_pi = atan2_pi, atan2_sgnY, f0 nop.i 999 } -{ .mfb +{ .mfi nop.m 999 -(p14) fma.d f8 = atan2_sgnY, atan2_Pi_by_2, f0 -(p14) br.ret.spnt b0 + fma.s1 atan2_A_hi = atan2_zcub, atan2_Pp, atan2_z + nop.i 999 ;; } - -{ .mfb - nop.m 999 -(p10) fmerge.s f8 = atan2_sgnY, f0 -(p10) br.ret.spnt b0 -} -{ .mfb +// For |Y| <= |X| and X > 0, force inexact in case A_lo is zero +{ .mfi nop.m 999 -(p11) fma.d f8 = atan2_sgnY, atan2_pi, f0 -(p11) br.ret.spnt b0 +(p8) fmpy.s0 atan2_tmp = atan2_P22, atan2_P22 + nop.i 999 ;; } - - { .mfi nop.m 999 - fma.s1 
atan2_A_lo = atan2_pd, atan2_dsq, atan2_d + fma.s1 atan2_A = atan2_A_hi, f1, atan2_A_lo nop.i 999 -;; } - - +// For |Y| <= |X| and X > 0, result is A_hi + A_lo { .mfi nop.m 999 - fma.s1 atan2_A = atan2_A_hi, f1, atan2_A_lo +(p8) fma.d.s0 f8 = atan2_A_hi, f1, atan2_A_lo nop.i 999 ;; } -// Force inexact and possibly underflow if very small results +.pred.rel "mutex",p6,p9 +// We perturb A by multiplying by 1.0+1ulp as we produce the result +// in order to get symmetrically rounded results in directed rounding modes. +// If we don't do this, there are a few cases where the trailing 11 bits of +// the significand of the result, before converting to double, are zero. These +// cases do not round symmetrically in round to +infinity or round to -infinity. +// The perturbation also insures that the inexact flag is set. +// For |Y| > |X|, result is +- pi/2 - (A_hi + A_lo) { .mfi nop.m 999 -(p8) fma.d atan2_FR_tmp = atan2_sgnXY, atan2_A, atan2_sml_norm +(p6) fnma.d.s0 f8 = atan2_A, atan2_near_one, atan2_sgn_pi_by_2 nop.i 999 } +// For |Y| <= |X|, and X < 0, result is +- pi + (A_hi + A_lo) { .mfb nop.m 999 - fma.d f8 = atan2_sgnXY, atan2_A, atan2_P - br.ret.sptk b0 +(p9) fma.d.s0 f8 = atan2_A, atan2_near_one, atan2_sgn_pi + br.ret.sptk b0 ;; } ATAN2_ERROR: - +// Here if x=0 and y=0 { .mfi nop.m 999 - fcmp.eq.unc.s1 p10,p11 = atan2_sgnX,f1 + fclass.m p10,p11 = atan2_X,0x05 // Test if x=+0 nop.i 999 } ;; { .mfi - mov atan2_GR_tag = 37 -(p10) fmerge.s f10 = atan2_sgnY, f0 - nop.i 999 + mov atan2_GR_tag = 37 +(p10) fmerge.s f10 = atan2_sgnY, f0 // x=+0, y=0 + nop.i 999 } { .mfi nop.m 999 -(p11) fma.d f10 = atan2_sgnY, atan2_pi, f0 +(p11) fma.d.s0 f10 = atan2_sgnY, atan2_pi, f0 // x=-0, y=0 nop.i 999 ;; } -.endp atan2# -ASM_SIZE_DIRECTIVE(atan2#) - - -// Stack operations when calling error support. 
-// (1) (2) (3) (call) (4) -// sp -> + psp -> + psp -> + sp -> + -// | | | | -// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 -// | | | | -// | <-GR_Y Y2->| Y2 ->| <- GR_Y | -// | | | | -// | | <- GR_X X1 ->| | -// | | | | -// sp-64 -> + sp -> + sp -> + + -// save ar.pfs save b0 restore gp -// save gp restore ar.pfs +GLOBAL_IEEE754_END(atan2) - -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue // (1) { .mfi @@ -1102,19 +1013,19 @@ __libm_error_region: .body // (3) { .mib - stfd [GR_Parameter_X] = f9 // STORE Parameter 1 on stack + stfd [GR_Parameter_X] = f9 // STORE Parameter 1 on stack add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address - nop.b 0 + nop.b 0 } { .mib - stfd [GR_Parameter_Y] = f10 // STORE Parameter 3 on stack + stfd [GR_Parameter_Y] = f10 // STORE Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; // (4) @@ -1130,8 +1041,7 @@ __libm_error_region: br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_atan2f.S b/sysdeps/ia64/fpu/e_atan2f.S index 03a4fed82f..c483a7ad34 100644 --- a/sysdeps/ia64/fpu/e_atan2f.S +++ b/sysdeps/ia64/fpu/e_atan2f.S @@ -1,10 +1,10 @@ .file "atan2f.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. // -// Contributed 6/1/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. 
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,18 +35,21 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // History //============================================================== -// 6/01/00 Initial version -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 06/01/00 Initial version +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. -// 8/17/00 Changed predicate register macro-usage to direct predicate +// 08/17/00 Changed predicate register macro-usage to direct predicate // names due to an assembler bug. -// 1/05/01 Fixed flag settings for denormal input. -// 1/19/01 Added documentation -// 1/30/01 Improved speed +// 01/05/01 Fixed flag settings for denormal input. +// 01/19/01 Added documentation +// 01/30/01 Improved speed +// 02/06/02 Corrected .section statement +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/06/03 Reordered header: .section, .global, .proc, .align // Description //========================================= @@ -226,7 +229,6 @@ // atan2f(+-0/+-0) sets single error tag to 38 // These are domain errors. 
-#include "libm_support.h" // // Assembly macros @@ -324,22 +326,20 @@ atan2f_poly_atan_U = f88 //atan2f_Pred_Xneg = p9 // x < 0 -.data +RODATA .align 16 -atan2f_coef_table1: -ASM_TYPE_DIRECTIVE(atan2f_coef_table1,@object) +LOCAL_OBJECT_START(atan2f_coef_table1) data8 0xBFD5555512191621 // p1 data8 0x3F522E5D33BC9BAA // p10 data8 0xBFA6E10BA401393F // p7 data8 0x3FB142A73D7C54E3 // p6 data8 0xBFC2473C5145EE38 // p3 data8 0x3FC9997E7AFBFF4E // p2 -ASM_SIZE_DIRECTIVE(atan2f_coef_table1) +LOCAL_OBJECT_END(atan2f_coef_table1) -atan2f_coef_table2: -ASM_TYPE_DIRECTIVE(atan2f_coef_table2,@object) +LOCAL_OBJECT_START(atan2f_coef_table2) data8 0xBF7DEAADAA336451 // p9 data8 0x3F97105B4160F86B // p8 data8 0xBFB68EED6A8CFA32 // p5 @@ -348,29 +348,12 @@ data8 0x3ff921fb54442d18 // pi/2 data8 0x400921fb54442d18 // pi data8 0x3fe921fb54442d18 // pi/4 data8 0x4002d97c7f3321d2 // 3pi/4 -ASM_SIZE_DIRECTIVE(atan2f_coef_table2) - +LOCAL_OBJECT_END(atan2f_coef_table2) -.global atan2f -#ifdef _LIBC -.global __atan2f -.global __ieee754_atan2f -#endif - -.text -.align 32 - -atan2f: -.proc atan2f -#ifdef _LIBC -.proc __atan2f -__atan2f: -.proc __ieee754_atan2f -__ieee754_atan2f: -#endif - +.section .text +GLOBAL_IEEE754_ENTRY(atan2f) { .mfi alloc r32 = ar.pfs,1,5,4,0 @@ -724,7 +707,7 @@ ATAN2F_XY_INF_NAN_ZERO: } { .mfb nop.m 999 -(p10) fma.s f8 = f9,f8,f0 // Result quietized y if y is nan +(p10) fma.s.s0 f8 = f9,f8,f0 // Result quietized y if y is nan (p10) br.ret.spnt b0 // Exit if y is nan } ;; @@ -737,7 +720,7 @@ ATAN2F_XY_INF_NAN_ZERO: } { .mfb nop.m 999 -(p12) fnorm.s f8 = f9 // Result quietized x if x is nan, y not nan +(p12) fnorm.s.s0 f8 = f9 // Result quietized x if x is nan, y not nan (p12) br.ret.spnt b0 // Exit if x is nan, y not nan } ;; @@ -757,7 +740,7 @@ ATAN2F_XY_INF_NAN_ZERO: } { .mfb nop.m 999 -(p7) fma.s f8 = atan2f_sgn_Y, atan2f_const_piby4,f0 // Result +-pi/4 +(p7) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_piby4,f0 // Result +-pi/4 (p7) br.ret.spnt b0 // Exit if x 
+inf and y inf } ;; @@ -790,19 +773,19 @@ ATAN2F_XY_INF_NAN_ZERO: } { .mfb nop.m 999 -(p13) fma.s f8 = atan2f_sgn_Y, atan2f_const_piby2,f0 // Result +-pi/2 +(p13) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_piby2,f0 // Result +-pi/2 (p13) br.ret.spnt b0 // Exit if x not -inf and y inf } ;; { .mfi nop.m 999 -(p14) fma.s f8 = atan2f_sgn_Y, atan2f_const_3piby4,f0 // Result +-3pi/4 +(p14) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_3piby4,f0 // Result +-3pi/4 nop.i 999 } { .mfb nop.m 999 -(p15) fma.s f8 = atan2f_sgn_Y, atan2f_const_pi,f0 // Result +-pi +(p15) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_pi,f0 // Result +-pi (p11) br.ret.spnt b0 // Exit if x -inf } ;; @@ -829,31 +812,28 @@ ATAN2F_XY_INF_NAN_ZERO: } { .mfb nop.m 999 -(p9) fma.s f8 = atan2f_sgn_Y, atan2f_const_pi,f0 // x < 0, y 0, result +-pi +(p9) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_pi,f0 // x < 0, y 0, result +-pi (p10) br.cond.spnt __libm_error_region // Branch if x zero and y zero } ;; { .mfb nop.m 999 -(p11) fma.s f8 = atan2f_sgn_Y, atan2f_const_piby2,f0 // x zero, y not zero +(p11) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_piby2,f0 // x zero, y not zero br.ret.sptk b0 // Final special case exit } ;; -.endp atan2f -ASM_SIZE_DIRECTIVE(atan2f) - +GLOBAL_IEEE754_END(atan2f) -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue mov GR_Parameter_TAG = 38 fclass.m p10,p11 = f9,0x5 // @zero | @pos ;; (p10) fmerge.s f10 = f8, f0 -(p11) fma.s f10 = atan2f_sgn_Y, atan2f_const_pi,f0 +(p11) fma.s.s0 f10 = atan2f_sgn_Y, atan2f_const_pi,f0 ;; { .mfi @@ -913,8 +893,7 @@ __libm_error_region: } ;; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_cosh.S b/sysdeps/ia64/fpu/e_cosh.S index 205653d4bf..0c6c5b451e 100644 --- a/sysdeps/ia64/fpu/e_cosh.S +++ b/sysdeps/ia64/fpu/e_cosh.S @@ -1,10 +1,10 @@ .file "cosh.s" -// Copyright 
(C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2002, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,1081 +20,799 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial version -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial version +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. -// +// 05/07/01 Reworked to improve speed of all paths +// 05/20/02 Cleaned up namespace and sf0 syntax +// 11/15/02 Improved speed with new algorithm + // API //============================================================== -// double = cosh(double) -// input floating point f8 -// output floating point f8 - +// double cosh(double) // Overview of operation //============================================================== -// There are four paths +// Case 1: 0 < |x| < 0.25 +// Evaluate cosh(x) by a 12th order polynomial +// Care is taken with the order of multiplication; and A2 is not exactly 1/4!, +// A3 is not exactly 1/6!, etc. +// cosh(x) = 1 + (A1*x^2 + A2*x^4 + A3*x^6 + A4*x^8 + A5*x^10 + A6*x^12) +// +// Case 2: 0.25 < |x| < 710.47586 +// Algorithm is based on the identity cosh(x) = ( exp(x) + exp(-x) ) / 2. +// The algorithm for exp is described as below. There are a number of +// economies from evaluating both exp(x) and exp(-x). Although we +// are evaluating both quantities, only where the quantities diverge do we +// duplicate the computations.
The basic algorithm for exp(x) is described +// below. +// +// Take the input x. w is "how many log2/128 in x?" +// w = x * 128/log2 +// n = int(w) +// x = n log2/128 + r + delta -// 1. |x| < 0.25 COSH_BY_POLY -// 2. |x| < 32 COSH_BY_TBL -// 3. |x| < 2^14 COSH_BY_EXP -// 4. |x_ >= 2^14 COSH_HUGE +// n = 128M + index_1 + 2^4 index_2 +// x = M log2 + (log2/128) index_1 + (log2/8) index_2 + r + delta -// For paths 1, and 2 SAFE is always 1. -// For path 4, Safe is always 0. -// SAFE = 1 means we cannot overflow. +// exp(x) = 2^M 2^(index_1/128) 2^(index_2/8) exp(r) exp(delta) +// Construct 2^M +// Get 2^(index_1/128) from table_1; +// Get 2^(index_2/8) from table_2; +// Calculate exp(r) by 5th order polynomial +// r = x - n (log2/128)_high +// delta = - n (log2/128)_low +// Calculate exp(delta) as 1 + delta -#include "libm_support.h" -// Assembly macros +// Special values //============================================================== -cosh_FR_X = f44 -cosh_FR_SGNX = f40 - -cosh_FR_Inv_log2by64 = f9 -cosh_FR_log2by64_lo = f11 -cosh_FR_log2by64_hi = f10 - -cosh_FR_A1 = f9 -cosh_FR_A2 = f10 -cosh_FR_A3 = f11 - -cosh_FR_Rcub = f12 -cosh_FR_M_temp = f13 -cosh_FR_R_temp = f13 -cosh_FR_Rsq = f13 -cosh_FR_R = f14 - -cosh_FR_M = f38 - -cosh_FR_B1 = f15 -cosh_FR_B2 = f32 -cosh_FR_B3 = f33 - -cosh_FR_peven_temp1 = f34 -cosh_FR_peven_temp2 = f35 -cosh_FR_peven = f36 - -cosh_FR_podd_temp1 = f34 -cosh_FR_podd_temp2 = f35 -cosh_FR_podd = f37 - -cosh_FR_J_temp = f9 -cosh_FR_J = f10 +// cosh(+0) = 1.0 +// cosh(-0) = 1.0 -cosh_FR_Mmj = f39 +// cosh(+qnan) = +qnan +// cosh(-qnan) = -qnan +// cosh(+snan) = +qnan +// cosh(-snan) = -qnan -cosh_FR_N_temp1 = f11 -cosh_FR_N_temp2 = f12 -cosh_FR_N = f13 +// cosh(-inf) = +inf +// cosh(+inf) = +inf -cosh_FR_spos = f14 -cosh_FR_sneg = f15 - -cosh_FR_Tjhi = f32 -cosh_FR_Tjlo = f33 -cosh_FR_Tmjhi = f34 -cosh_FR_Tmjlo = f35 - -GR_mJ = r35 -GR_J = r36 - -AD_mJ = r38 -AD_J = r39 - -cosh_FR_C_hi = f9 -cosh_FR_C_hi_temp = f10 -cosh_FR_C_lo_temp1 = f11 
-cosh_FR_C_lo_temp2 = f12 -cosh_FR_C_lo_temp3 = f13 - -cosh_FR_C_lo = f38 -cosh_FR_S_hi = f39 - -cosh_FR_S_hi_temp1 = f10 -cosh_FR_Y_hi = f11 -cosh_FR_Y_lo_temp = f12 -cosh_FR_Y_lo = f13 -cosh_FR_COSH = f9 - -cosh_FR_X2 = f9 -cosh_FR_X4 = f10 - -cosh_FR_P1 = f14 -cosh_FR_P2 = f15 -cosh_FR_P3 = f32 -cosh_FR_P4 = f33 -cosh_FR_P5 = f34 -cosh_FR_P6 = f35 - -cosh_FR_TINY_THRESH = f9 - -cosh_FR_COSH_temp = f10 -cosh_FR_SCALE = f11 +// Overflow and Underflow +//======================= +// cosh(x) = largest double normal when +// x = 710.47586 = 0x408633ce8fb9f87d +// +// There is no underflow. -cosh_FR_hi_lo = f10 +// Registers used +//============================================================== +// Floating Point registers used: +// f8, input, output +// f6 -> f15, f32 -> f61 -cosh_FR_poly_podd_temp1 = f11 -cosh_FR_poly_podd_temp2 = f13 -cosh_FR_poly_peven_temp1 = f11 -cosh_FR_poly_peven_temp2 = f13 +// General registers used: +// r14 -> r40 -GR_SAVE_PFS = r41 -GR_SAVE_B0 = r42 -GR_SAVE_GP = r43 +// Predicate registers used: +// p6 -> p15 -GR_Parameter_X = r44 -GR_Parameter_Y = r45 -GR_Parameter_RESULT = r46 +// Assembly macros +//============================================================== +rRshf = r14 +rN_neg = r14 +rAD_TB1 = r15 +rAD_TB2 = r16 +rAD_P = r17 +rN = r18 +rIndex_1 = r19 +rIndex_2_16 = r20 +rM = r21 +rBiased_M = r21 +rSig_inv_ln2 = r22 +rIndex_1_neg = r22 +rExp_bias = r23 +rExp_bias_minus_1 = r23 +rExp_mask = r24 +rTmp = r24 +rGt_ln = r24 +rIndex_2_16_neg = r24 +rM_neg = r25 +rBiased_M_neg = r25 +rRshf_2to56 = r26 +rAD_T1_neg = r26 +rExp_2tom56 = r28 +rAD_T2_neg = r28 +rAD_T1 = r29 +rAD_T2 = r30 +rSignexp_x = r31 +rExp_x = r31 + +GR_SAVE_B0 = r33 +GR_SAVE_PFS = r34 +GR_SAVE_GP = r35 +GR_SAVE_SP = r36 + +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 + + +FR_X = f10 +FR_Y = f1 +FR_RESULT = f8 + +fRSHF_2TO56 = f6 +fINV_LN2_2TO63 = f7 +fW_2TO56_RSH = f9 +f2TOM56 = f11 +fP5 = f12 +fP4 = f13 +fP3 = f14 +fP2 = 
f15 + +fLn2_by_128_hi = f33 +fLn2_by_128_lo = f34 + +fRSHF = f35 +fNfloat = f36 +fNormX = f37 +fR = f38 +fF = f39 + +fRsq = f40 +f2M = f41 +fS1 = f42 +fT1 = f42 +fS2 = f43 +fT2 = f43 +fS = f43 +fWre_urm_f8 = f44 +fAbsX = f44 + +fMIN_DBL_OFLOW_ARG = f45 +fMAX_DBL_NORM_ARG = f46 +fXsq = f47 +fX4 = f48 +fGt_pln = f49 +fTmp = f49 + +fP54 = f50 +fP5432 = f50 +fP32 = f51 +fP = f52 +fP54_neg = f53 +fP5432_neg = f53 +fP32_neg = f54 +fP_neg = f55 +fF_neg = f56 + +f2M_neg = f57 +fS1_neg = f58 +fT1_neg = f58 +fS2_neg = f59 +fT2_neg = f59 +fS_neg = f59 +fExp = f60 +fExp_neg = f61 + +fA6 = f50 +fA65 = f50 +fA6543 = f50 +fA654321 = f50 +fA5 = f51 +fA4 = f52 +fA43 = f52 +fA3 = f53 +fA2 = f54 +fA21 = f54 +fA1 = f55 // Data tables //============================================================== -#ifdef _LIBC -.rodata -#else -.data -#endif - +RODATA .align 16 -double_cosh_arg_reduction: -ASM_TYPE_DIRECTIVE(double_cosh_arg_reduction,@object) - data8 0xB8AA3B295C17F0BC, 0x00004005 - data8 0xB17217F7D1000000, 0x00003FF8 - data8 0xCF79ABC9E3B39804, 0x00003FD0 -ASM_SIZE_DIRECTIVE(double_cosh_arg_reduction) - -double_cosh_p_table: -ASM_TYPE_DIRECTIVE(double_cosh_p_table,@object) - data8 0x8000000000000000, 0x00003FFE - data8 0xAAAAAAAAAAAAAB80, 0x00003FFA - data8 0xB60B60B60B4FE884, 0x00003FF5 - data8 0xD00D00D1021D7370, 0x00003FEF - data8 0x93F27740C0C2F1CC, 0x00003FE9 - data8 0x8FA02AC65BCBD5BC, 0x00003FE2 -ASM_SIZE_DIRECTIVE(double_cosh_p_table) - -double_cosh_ab_table: -ASM_TYPE_DIRECTIVE(double_cosh_ab_table,@object) - data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC - data8 0x88888888884ECDD5, 0x00003FF8 - data8 0xD00D0C6DCC26A86B, 0x00003FF2 - data8 0x8000000000000002, 0x00003FFE - data8 0xAAAAAAAAAA402C77, 0x00003FFA - data8 0xB60B6CC96BDB144D, 0x00003FF5 -ASM_SIZE_DIRECTIVE(double_cosh_ab_table) - -double_cosh_j_table: -ASM_TYPE_DIRECTIVE(double_cosh_j_table,@object) - data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000 - data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000 
- data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000 - data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000 - data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000 - data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000 - data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000 - data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000 - data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000 - data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000 - data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000 - data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000 - data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000 - data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000 - data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000 - data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000 - data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000 - data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000 - data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000 - data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000 - data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000 - data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000 - data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000 - data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000 - data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000 - data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000 - data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000 - data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000 - data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000 - data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000 - data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000 - data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000 - data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000 - data8 0x8164D1F3BC030773, 0x00003FFF, 
0x1F77CACA, 0x00000000 - data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000 - data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000 - data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000 - data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000 - data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000 - data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000 - data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000 - data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000 - data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000 - data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000 - data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000 - data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000 - data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000 - data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000 - data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000 - data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000 - data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000 - data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000 - data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000 - data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000 - data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000 - data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000 - data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000 - data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000 - data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000 - data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000 - data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000 - data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000 - data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000 - data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000 - data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000 
-ASM_SIZE_DIRECTIVE(double_cosh_j_table) - -.align 32 -.global cosh# -.section .text -.proc cosh# -.align 32 +// ************* DO NOT CHANGE ORDER OF THESE TABLES ******************** -cosh: +// double-extended 1/ln(2) +// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88 +// 3fff b8aa 3b29 5c17 f0bc +// For speed the significand will be loaded directly with a movl and setf.sig +// and the exponent will be bias+63 instead of bias+0. Thus subsequent +// computations need to scale appropriately. +// The constant 128/ln(2) is needed for the computation of w. This is also +// obtained by scaling the computations. +// +// Two shifting constants are loaded directly with movl and setf.d. +// 1. fRSHF_2TO56 = 1.1000..00 * 2^(63+63-7) +// This constant is added to x*1/ln2 to shift the integer part of +// x*128/ln2 into the rightmost bits of the significand. +// The result of this fma is fW_2TO56_RSH. +// 2. fRSHF = 1.1000..00 * 2^(63) +// This constant is subtracted from fW_2TO56_RSH * 2^(-56) to give +// the integer part of w, n, as a floating-point number. +// The result of this fms is fNfloat.
+ + +LOCAL_OBJECT_START(exp_table_1) +data8 0x408633ce8fb9f87e // smallest dbl overflow arg +data8 0x408633ce8fb9f87d // largest dbl arg to give normal dbl result +data8 0xb17217f7d1cf79ab , 0x00003ff7 // ln2/128 hi +data8 0xc9e3b39803f2f6af , 0x00003fb7 // ln2/128 lo +// +// Table 1 is 2^(index_1/128) where +// index_1 goes from 0 to 15 +// +data8 0x8000000000000000 , 0x00003FFF +data8 0x80B1ED4FD999AB6C , 0x00003FFF +data8 0x8164D1F3BC030773 , 0x00003FFF +data8 0x8218AF4373FC25EC , 0x00003FFF +data8 0x82CD8698AC2BA1D7 , 0x00003FFF +data8 0x8383594EEFB6EE37 , 0x00003FFF +data8 0x843A28C3ACDE4046 , 0x00003FFF +data8 0x84F1F656379C1A29 , 0x00003FFF +data8 0x85AAC367CC487B15 , 0x00003FFF +data8 0x8664915B923FBA04 , 0x00003FFF +data8 0x871F61969E8D1010 , 0x00003FFF +data8 0x87DB357FF698D792 , 0x00003FFF +data8 0x88980E8092DA8527 , 0x00003FFF +data8 0x8955EE03618E5FDD , 0x00003FFF +data8 0x8A14D575496EFD9A , 0x00003FFF +data8 0x8AD4C6452C728924 , 0x00003FFF +LOCAL_OBJECT_END(exp_table_1) + +// Table 2 is 2^(index_2/8) where +// index_2 goes from 0 to 7 +LOCAL_OBJECT_START(exp_table_2) +data8 0x8000000000000000 , 0x00003FFF +data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF +data8 0x9837F0518DB8A96F , 0x00003FFF +data8 0xA5FED6A9B15138EA , 0x00003FFF +data8 0xB504F333F9DE6484 , 0x00003FFF +data8 0xC5672A115506DADD , 0x00003FFF +data8 0xD744FCCAD69D6AF4 , 0x00003FFF +data8 0xEAC0C6E7DD24392F , 0x00003FFF +LOCAL_OBJECT_END(exp_table_2) + +LOCAL_OBJECT_START(exp_p_table) +data8 0x3f8111116da21757 //P5 +data8 0x3fa55555d787761c //P4 +data8 0x3fc5555555555414 //P3 +data8 0x3fdffffffffffd6a //P2 +LOCAL_OBJECT_END(exp_p_table) + +LOCAL_OBJECT_START(cosh_p_table) +data8 0x8FA02AC65BCBD5BC, 0x00003FE2 // A6 +data8 0xD00D00D1021D7370, 0x00003FEF // A4 +data8 0xAAAAAAAAAAAAAB80, 0x00003FFA // A2 +data8 0x93F27740C0C2F1CC, 0x00003FE9 // A5 +data8 0xB60B60B60B4FE884, 0x00003FF5 // A3 +data8 0x8000000000000000, 0x00003FFE // A1 +LOCAL_OBJECT_END(cosh_p_table) -#ifdef _LIBC -.global
__ieee754_cosh# -.proc __ieee754_cosh# -__ieee754_cosh: -#endif -// X NAN? +.section .text +GLOBAL_IEEE754_ENTRY(cosh) -{ .mfi - alloc r32 = ar.pfs,0,12,4,0 -(p0) fclass.m.unc p6,p7 = f8, 0xc3 //@snan | @qnan - nop.i 999 +{ .mlx + getf.exp rSignexp_x = f8 // Must recompute if x unorm + movl rSig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2 } -;; - - -{ .mfb - nop.m 999 -(p6) fma.d.s0 f8 = f8,f1,f8 -(p6) br.ret.spnt b0 ;; +{ .mlx + addl rAD_TB1 = @ltoff(exp_table_1), gp + movl rRshf_2to56 = 0x4768000000000000 // 1.10000 2^(63+56) } +;; - -// X infinity { .mfi - nop.m 999 -(p0) fclass.m.unc p6,p0 = f8, 0x23 //@inf - nop.i 999 ;; -} - -{ .mfb - nop.m 999 -(p6) fmerge.s f8 = f0,f8 -(p6) br.ret.spnt b0 ;; + ld8 rAD_TB1 = [rAD_TB1] + fclass.m p6,p0 = f8,0x0b // Test for x=unorm + mov rExp_mask = 0x1ffff } - - - -// Put 0.25 in f9; p6 true if x < 0.25 -{ .mlx - nop.m 999 -(p0) movl r32 = 0x000000000000fffd ;; -} - { .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + mov rExp_bias = 0xffff + fnorm.s1 fNormX = f8 + mov rExp_2tom56 = 0xffff-56 } +;; + +// Form two constants we need +// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128 +// 1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand { .mfi - nop.m 999 -(p0) fmerge.s cosh_FR_X = f0,f8 + setf.sig fINV_LN2_2TO63 = rSig_inv_ln2 // form 1/ln2 * 2^63 + fclass.m p8,p0 = f8,0x07 // Test for x=0 nop.i 999 } - -{ .mfi - nop.m 999 -(p0) fmerge.s cosh_FR_SGNX = f8,f1 - nop.i 999 ;; +{ .mlx + setf.d fRSHF_2TO56 = rRshf_2to56 // Form const 1.100 * 2^(63+56) + movl rRshf = 0x43e8000000000000 // 1.10000 2^63 for right shift } +;; { .mfi - nop.m 999 -(p0) fcmp.lt.unc p0,p7 = cosh_FR_X,f9 - nop.i 999 ;; + ldfpd fMIN_DBL_OFLOW_ARG, fMAX_DBL_NORM_ARG = [rAD_TB1],16 + fclass.m p10,p0 = f8,0x1e3 // Test for x=inf, nan, NaT + nop.i 0 } - -{ .mib - nop.m 999 - nop.i 999 -(p7) br.cond.sptk L(COSH_BY_TBL) +{ .mfb + setf.exp f2TOM56 = rExp_2tom56 // form 2^-56 for scaling Nfloat + nop.f 0 +(p6) br.cond.spnt COSH_UNORM 
// Branch if x=unorm } ;; - -// COSH_BY_POLY: -// POLY cannot overflow so there is no need to call __libm_error_support -// Get the values of P_x from the table - -{ .mmi - nop.m 999 -(p0) addl r34 = @ltoff(double_cosh_p_table), gp - nop.i 999 +COSH_COMMON: +{ .mfi + ldfe fLn2_by_128_hi = [rAD_TB1],16 + nop.f 0 + nop.i 0 } -;; - -{ .mmi - ld8 r34 = [r34] - nop.m 999 - nop.i 999 +{ .mfb + setf.d fRSHF = rRshf // Form right shift const 1.100 * 2^63 +(p8) fma.d.s0 f8 = f1,f1,f0 // quick exit for x=0 +(p8) br.ret.spnt b0 } ;; - -// Calculate cosh_FR_X2 = ax*ax and cosh_FR_X4 = ax*ax*ax*ax -{ .mmf - nop.m 999 -(p0) ldfe cosh_FR_P1 = [r34],16 -(p0) fma.s1 cosh_FR_X2 = cosh_FR_X, cosh_FR_X, f0 ;; -} - -{ .mmi -(p0) ldfe cosh_FR_P2 = [r34],16 ;; -(p0) ldfe cosh_FR_P3 = [r34],16 - nop.i 999 ;; +{ .mfi + ldfe fLn2_by_128_lo = [rAD_TB1],16 + nop.f 0 + nop.i 0 } - -{ .mmi -(p0) ldfe cosh_FR_P4 = [r34],16 ;; -(p0) ldfe cosh_FR_P5 = [r34],16 - nop.i 999 ;; +{ .mfb + and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x +(p10) fma.d.s0 f8 = f8,f8,f0 // Result if x=inf, nan, NaT +(p10) br.ret.spnt b0 // quick exit for x=inf, nan, NaT } +;; +// After that last load rAD_TB1 points to the beginning of table 1 { .mfi -(p0) ldfe cosh_FR_P6 = [r34],16 -(p0) fma.s1 cosh_FR_X4 = cosh_FR_X2, cosh_FR_X2, f0 - nop.i 999 ;; + nop.m 0 + fcmp.eq.s0 p6,p0 = f8, f0 // Dummy to set D + sub rExp_x = rExp_x, rExp_bias // True exponent of x } +;; -// Calculate cosh_FR_podd = x4 *(x4 * P_5 + P_3) + P_1 { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_poly_podd_temp1 = cosh_FR_X4, cosh_FR_P5, cosh_FR_P3 - nop.i 999 ;; + nop.m 0 + fmerge.s fAbsX = f0, fNormX // Form |x| + nop.i 0 } - -{ .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_podd = cosh_FR_X4, cosh_FR_poly_podd_temp1, cosh_FR_P1 - nop.i 999 +{ .mfb + cmp.gt p7, p0 = -2, rExp_x // Test |x| < 2^(-2) + fma.s1 fXsq = fNormX, fNormX, f0 // x*x for small path +(p7) br.cond.spnt COSH_SMALL // Branch if 0 < |x| < 2^-2 } +;; -// Calculate cosh_FR_peven = p_even = x4 
*(x4 * (x4 * P_6 + P_4) + P_2) -{ .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_poly_peven_temp1 = cosh_FR_X4, cosh_FR_P6, cosh_FR_P4 - nop.i 999 ;; -} +// W = X * Inv_log2_by_128 +// By adding 1.10...0*2^63 we shift and get round_int(W) in significand. +// We actually add 1.10...0*2^56 to X * Inv_log2 to do the same thing. { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_poly_peven_temp2 = cosh_FR_X4, cosh_FR_poly_peven_temp1, cosh_FR_P2 - nop.i 999 ;; + add rAD_P = 0x180, rAD_TB1 + fma.s1 fW_2TO56_RSH = fNormX, fINV_LN2_2TO63, fRSHF_2TO56 + add rAD_TB2 = 0x100, rAD_TB1 } +;; + +// Divide arguments into the following categories: +// Certain Safe - 0.25 <= |x| <= MAX_DBL_NORM_ARG +// Possible Overflow p14 - MAX_DBL_NORM_ARG < |x| < MIN_DBL_OFLOW_ARG +// Certain Overflow p15 - MIN_DBL_OFLOW_ARG <= |x| < +inf +// +// If the input is really a double arg, then there will never be +// "Possible Overflow" arguments. +// { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_peven = cosh_FR_X4, cosh_FR_poly_peven_temp2, f0 - nop.i 999 ;; + ldfpd fP5, fP4 = [rAD_P] ,16 + fcmp.ge.s1 p15,p14 = fAbsX,fMIN_DBL_OFLOW_ARG + nop.i 0 } +;; + +// Nfloat = round_int(W) +// The significand of fW_2TO56_RSH contains the rounded integer part of W, +// as a twos complement number in the lower bits (that is, it may be negative). +// That twos complement number (called N) is put into rN. + +// Since fW_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56 +// before the shift constant 1.10000 * 2^63 is subtracted to yield fNfloat.
+// Thus, fNfloat contains the floating point version of N -// Y_lo = x2*p_odd + p_even -// Calculate f8 = Y_hi + Y_lo { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_Y_lo = cosh_FR_X2, cosh_FR_podd, cosh_FR_peven - nop.i 999 ;; + ldfpd fP3, fP2 = [rAD_P] +(p14) fcmp.gt.unc.s1 p14,p0 = fAbsX,fMAX_DBL_NORM_ARG + nop.i 0 } - { .mfb - nop.m 999 -(p0) fma.d.s0 f8 = f1, f1, cosh_FR_Y_lo -(p0) br.ret.sptk b0 ;; + nop.m 0 + fms.s1 fNfloat = fW_2TO56_RSH, f2TOM56, fRSHF +(p15) br.cond.spnt COSH_CERTAIN_OVERFLOW } +;; - -L(COSH_BY_TBL): - -// Now that we are at TBL; so far all we know is that |x| >= 0.25. -// The first two steps are the same for TBL and EXP, but if we are HUGE -// Double -// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true) -// Single -// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true) -// we want to leave now. Go to HUGE if |x| >= 2^14 -// 1000d (register-biased) is e = 14 (true) - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000010009 ;; +{ .mfi + getf.sig rN = fW_2TO56_RSH + nop.f 0 + mov rExp_bias_minus_1 = 0xfffe } +;; + +// rIndex_1 has index_1 +// rIndex_2_16 has index_2 * 16 +// rBiased_M has M +// rM has true M +// r = x - Nfloat * ln2_by_128_hi +// f = 1 - Nfloat * ln2_by_128_lo { .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + and rIndex_1 = 0x0f, rN + fnma.s1 fR = fNfloat, fLn2_by_128_hi, fNormX + shr rM = rN, 0x7 } - { .mfi - nop.m 999 -(p0) fcmp.ge.unc p6,p7 = cosh_FR_X,f9 - nop.i 999 ;; + and rIndex_2_16 = 0x70, rN + fnma.s1 fF = fNfloat, fLn2_by_128_lo, f1 + sub rN_neg = r0, rN } +;; -{ .mib - nop.m 999 - nop.i 999 -(p6) br.cond.spnt L(COSH_HUGE) ;; +{ .mmi + and rIndex_1_neg = 0x0f, rN_neg + add rBiased_M = rExp_bias_minus_1, rM + shr rM_neg = rN_neg, 0x7 } - -// r32 = 1 -// r34 = N-1 -// r35 = N -// r36 = j -// r37 = N+1 - -// TBL can never overflow -// cosh(x) = cosh(B+R) -// = cosh(B) cosh(R) + sinh(B) sinh(R) -// cosh(R) can be approximated by 1 + p_even -// sinh(R) can be approximated by p_odd - 
-// ****************************************************** -// STEP 1 (TBL and EXP) -// ****************************************************** -// Get the following constants. -// f9 = Inv_log2by64 -// f10 = log2by64_hi -// f11 = log2by64_lo - { .mmi -(p0) adds r32 = 0x1,r0 -(p0) addl r34 = @ltoff(double_cosh_arg_reduction), gp - nop.i 999 + and rIndex_2_16_neg = 0x70, rN_neg + add rAD_T2 = rAD_TB2, rIndex_2_16 + shladd rAD_T1 = rIndex_1, 4, rAD_TB1 } ;; -// We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and -// put them in an exponent. -// cosh_FR_spos = 2^(N-1) and cosh_FR_sneg = 2^(-N-1) -// r39 = 0xffff + (N-1) = 0xffff +N -1 -// r40 = 0xffff - (N +1) = 0xffff -N -1 - -{ .mlx - ld8 r34 = [r34] -(p0) movl r38 = 0x000000000000fffe ;; -} +// rAD_T1 has address of T1 +// rAD_T2 has address of T2 { .mmi -(p0) ldfe cosh_FR_Inv_log2by64 = [r34],16 ;; -(p0) ldfe cosh_FR_log2by64_hi = [r34],16 - nop.i 999 ;; + setf.exp f2M = rBiased_M + ldfe fT2 = [rAD_T2] + nop.i 0 } - -{ .mbb -(p0) ldfe cosh_FR_log2by64_lo = [r34],16 - nop.b 999 - nop.b 999 ;; -} - -// Get the A coefficients -// f9 = A_1 -// f10 = A_2 -// f11 = A_3 - { .mmi - nop.m 999 -(p0) addl r34 = @ltoff(double_cosh_ab_table), gp - nop.i 999 + add rBiased_M_neg = rExp_bias_minus_1, rM_neg + add rAD_T2_neg = rAD_TB2, rIndex_2_16_neg + shladd rAD_T1_neg = rIndex_1_neg, 4, rAD_TB1 } ;; +// Create Scale = 2^M +// Load T1 and T2 { .mmi - ld8 r34 = [r34] - nop.m 999 - nop.i 999 + ldfe fT1 = [rAD_T1] + nop.m 0 + nop.i 0 +} +{ .mmf + setf.exp f2M_neg = rBiased_M_neg + ldfe fT2_neg = [rAD_T2_neg] + fma.s1 fF_neg = fNfloat, fLn2_by_128_lo, f1 } ;; - -// Calculate M and keep it as integer and floating point.
-// M = round-to-integer(x*Inv_log2by64) -// cosh_FR_M = M = truncate(ax/(log2/64)) -// Put the significand of M in r35 -// and the floating point representation of M in cosh_FR_M - { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_M = cosh_FR_X, cosh_FR_Inv_log2by64, f0 - nop.i 999 + nop.m 0 + fma.s1 fRsq = fR, fR, f0 + nop.i 0 } - { .mfi -(p0) ldfe cosh_FR_A1 = [r34],16 - nop.f 999 - nop.i 999 ;; + ldfe fT1_neg = [rAD_T1_neg] + fma.s1 fP54 = fR, fP5, fP4 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fcvt.fx.s1 cosh_FR_M_temp = cosh_FR_M - nop.i 999 ;; + nop.m 0 + fma.s1 fP32 = fR, fP3, fP2 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fnorm.s1 cosh_FR_M = cosh_FR_M_temp - nop.i 999 ;; + nop.m 0 + fnma.s1 fP54_neg = fR, fP5, fP4 + nop.i 0 } +;; { .mfi -(p0) getf.sig r35 = cosh_FR_M_temp - nop.f 999 - nop.i 999 ;; -} - -// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It -// has a range of -32 thru 31. -// r35 = M -// r36 = j -{ .mii - nop.m 999 - nop.i 999 ;; -(p0) and r36 = 0x3f, r35 ;; + nop.m 0 + fnma.s1 fP32_neg = fR, fP3, fP2 + nop.i 0 } - -// Calculate R -// f13 = f44 - f12*f10 = x - M*log2by64_hi -// f14 = f13 - f8*f11 = R = (x - M*log2by64_hi) - M*log2by64_lo +;; { .mfi - nop.m 999 -(p0) fnma.s1 cosh_FR_R_temp = cosh_FR_M, cosh_FR_log2by64_hi, cosh_FR_X - nop.i 999 + nop.m 0 + fma.s1 fP5432 = fRsq, fP54, fP32 + nop.i 0 } - { .mfi -(p0) ldfe cosh_FR_A2 = [r34],16 - nop.f 999 - nop.i 999 ;; + nop.m 0 + fma.s1 fS2 = fF,fT2,f0 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fnma.s1 cosh_FR_R = cosh_FR_M, cosh_FR_log2by64_lo, cosh_FR_R_temp - nop.i 999 + nop.m 0 + fma.s1 fS1 = f2M,fT1,f0 + nop.i 0 } - -// Get the B coefficients -// f15 = B_1 -// f32 = B_2 -// f33 = B_3 - -{ .mmi -(p0) ldfe cosh_FR_A3 = [r34],16 ;; -(p0) ldfe cosh_FR_B1 = [r34],16 - nop.i 999 ;; -} - -{ .mmi -(p0) ldfe cosh_FR_B2 = [r34],16 ;; -(p0) ldfe cosh_FR_B3 = [r34],16 - nop.i 999 ;; -} - -{ .mii - nop.m 999 -(p0) shl r34 = r36, 0x2 ;; -(p0) sxt1 r37 = r34 ;; -} - -// 
****************************************************** -// STEP 2 (TBL and EXP) -// ****************************************************** -// Calculate Rsquared and Rcubed in preparation for p_even and p_odd -// f12 = R*R*R -// f13 = R*R -// f14 = R <== from above - { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_Rsq = cosh_FR_R, cosh_FR_R, f0 -(p0) shr r36 = r37, 0x2 ;; -} - -// r34 = M-j = r35 - r36 -// r35 = N = (M-j)/64 - -{ .mii -(p0) sub r34 = r35, r36 - nop.i 999 ;; -(p0) shr r35 = r34, 0x6 ;; -} - -{ .mii -(p0) sub r40 = r38, r35 -(p0) adds r37 = 0x1, r35 -(p0) add r39 = r38, r35 ;; -} - -// Get the address of the J table, add the offset, -// addresses are sinh_AD_mJ and sinh_AD_J, get the T value -// f32 = T(j)_hi -// f33 = T(j)_lo -// f34 = T(-j)_hi -// f35 = T(-j)_lo - -{ .mmi -(p0) sub r34 = r35, r32 -(p0) addl r37 = @ltoff(double_cosh_j_table), gp - nop.i 999 + nop.m 0 + fma.s1 fP5432_neg = fRsq, fP54_neg, fP32_neg + nop.i 0 } ;; { .mfi - ld8 r37 = [r37] -(p0) fma.s1 cosh_FR_Rcub = cosh_FR_Rsq, cosh_FR_R, f0 - nop.i 999 + nop.m 0 + fma.s1 fS1_neg = f2M_neg,fT1_neg,f0 + nop.i 0 } - -// ****************************************************** -// STEP 3 Now decide if we need to branch to EXP -// ****************************************************** -// Put 32 in f9; p6 true if x < 32 - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000010004 ;; -} - -// Calculate p_even -// f34 = B_2 + Rsq *B_3 -// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3) -// f36 = peven = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3)) - { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_peven_temp1 = cosh_FR_Rsq, cosh_FR_B3, cosh_FR_B2 - nop.i 999 ;; + nop.m 0 + fma.s1 fS2_neg = fF_neg,fT2_neg,f0 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_peven_temp2 = cosh_FR_Rsq, cosh_FR_peven_temp1, cosh_FR_B1 - nop.i 999 + nop.m 0 + fma.s1 fP = fRsq, fP5432, fR + nop.i 0 } - -// Calculate p_odd -// f34 = A_2 + Rsq *A_3 -// f35 = A_1 + Rsq * (A_2 + Rsq *A_3) -// f37 = podd = R + Rcub * (A_1 + 
Rsq * (A_2 + Rsq *A_3)) - { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_podd_temp1 = cosh_FR_Rsq, cosh_FR_A3, cosh_FR_A2 - nop.i 999 ;; + nop.m 0 + fma.s1 fS = fS1,fS2,f0 + nop.i 0 } +;; { .mfi -(p0) setf.exp cosh_FR_N_temp1 = r39 - nop.f 999 - nop.i 999 ;; + nop.m 0 + fms.s1 fP_neg = fRsq, fP5432_neg, fR + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_peven = cosh_FR_Rsq, cosh_FR_peven_temp2, f0 - nop.i 999 + nop.m 0 + fma.s1 fS_neg = fS1_neg,fS2_neg,f0 + nop.i 0 } +;; -{ .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_podd_temp2 = cosh_FR_Rsq, cosh_FR_podd_temp1, cosh_FR_A1 - nop.i 999 ;; +{ .mfb + nop.m 0 + fmpy.s0 fTmp = fLn2_by_128_lo, fLn2_by_128_lo // Force inexact +(p14) br.cond.spnt COSH_POSSIBLE_OVERFLOW } +;; { .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + nop.m 0 + fma.s1 fExp = fS, fP, fS + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_podd = cosh_FR_podd_temp2, cosh_FR_Rcub, cosh_FR_R - nop.i 999 + nop.m 0 + fma.s1 fExp_neg = fS_neg, fP_neg, fS_neg + nop.i 0 } +;; -// sinh_GR_mj contains the table offset for -j -// sinh_GR_j contains the table offset for +j -// p6 is true when j <= 0 - -{ .mlx -(p0) setf.exp cosh_FR_N_temp2 = r40 -(p0) movl r40 = 0x0000000000000020 ;; +{ .mfb + nop.m 0 + fma.d.s0 f8 = fExp, f1, fExp_neg + br.ret.sptk b0 // Normal path exit } +;; -{ .mfi -(p0) sub GR_mJ = r40, r36 -(p0) fmerge.se cosh_FR_spos = cosh_FR_N_temp1, f1 -(p0) adds GR_J = 0x20, r36 ;; +// Here if 0 < |x| < 0.25 +COSH_SMALL: +{ .mmf + add rAD_T1 = 0x1a0, rAD_TB1 + add rAD_T2 = 0x1d0, rAD_TB1 } +;; -{ .mii - nop.m 999 -(p0) shl GR_mJ = GR_mJ, 5 ;; -(p0) add AD_mJ = r37, GR_mJ ;; +{ .mmf + ldfe fA6 = [rAD_T1],16 + ldfe fA5 = [rAD_T2],16 + nop.f 0 } +;; { .mmi - nop.m 999 -(p0) ldfe cosh_FR_Tmjhi = [AD_mJ],16 -(p0) shl GR_J = GR_J, 5 ;; -} - -{ .mfi -(p0) ldfs cosh_FR_Tmjlo = [AD_mJ],16 -(p0) fcmp.lt.unc.s1 p6,p7 = cosh_FR_X,f9 -(p0) add AD_J = r37, GR_J ;; + ldfe fA4 = [rAD_T1],16 + ldfe fA3 = [rAD_T2],16 + nop.i 0 } +;; { .mmi -(p0) ldfe cosh_FR_Tjhi = 
[AD_J],16 ;; -(p0) ldfs cosh_FR_Tjlo = [AD_J],16 - nop.i 999 ;; + ldfe fA2 = [rAD_T1],16 + ldfe fA1 = [rAD_T2],16 + nop.i 0 } - -{ .mfb - nop.m 999 -(p0) fmerge.se cosh_FR_sneg = cosh_FR_N_temp2, f1 -(p7) br.cond.spnt L(COSH_BY_EXP) ;; -} - -// ****************************************************** -// If NOT branch to EXP -// ****************************************************** -// Calculate C_hi -// ****************************************************** -// cosh_FR_C_hi_temp = cosh_FR_sneg * cosh_FR_Tmjhi -// cosh_FR_C_hi = cosh_FR_spos * cosh_FR_Tjhi + (cosh_FR_sneg * cosh_FR_Tmjhi) - -{ .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_C_hi_temp = cosh_FR_sneg, cosh_FR_Tmjhi, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_C_hi = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_C_hi_temp - nop.i 999 -} - -// ****************************************************** -// Calculate S_hi -// ****************************************************** -// cosh_FR_S_hi_temp1 = cosh_FR_sneg * cosh_FR_Tmjhi -// cosh_FR_S_hi = cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi_temp1 - -{ .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_S_hi_temp1 = cosh_FR_sneg, cosh_FR_Tmjhi, f0 - nop.i 999 ;; -} - -// ****************************************************** -// Calculate C_lo -// ****************************************************** -// cosh_FR_C_lo_temp1 = cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi -// cosh_FR_C_lo_temp2 = cosh_FR_sneg * cosh_FR_Tmjlo + (cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi) -// cosh_FR_C_lo_temp1 = cosh_FR_sneg * cosh_FR_Tmjlo -// cosh_FR_C_lo_temp3 = cosh_FR_spos * cosh_FR_Tjlo + (cosh_FR_sneg * cosh_FR_Tmjlo) -// cosh_FR_C_lo = cosh_FR_C_lo_temp3 + cosh_FR_C_lo_temp2 +;; { .mfi - nop.m 999 -(p0) fms.s1 cosh_FR_C_lo_temp1 = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_C_hi - nop.i 999 + nop.m 0 + fma.s1 fX4 = fXsq, fXsq, f0 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fms.s1 cosh_FR_S_hi = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_S_hi_temp1 - nop.i 999 ;; + nop.m 0 + fma.s1 fA65 = fXsq, fA6, 
fA5 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_C_lo_temp2 = cosh_FR_sneg, cosh_FR_Tmjhi, cosh_FR_C_lo_temp1 - nop.i 999 -} - -{ .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_C_lo_temp1 = cosh_FR_sneg, cosh_FR_Tmjlo, f0 - nop.i 999 ;; + nop.m 0 + fma.s1 fA43 = fXsq, fA4, fA3 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_C_lo_temp3 = cosh_FR_spos, cosh_FR_Tjlo, cosh_FR_C_lo_temp1 - nop.i 999 ;; + nop.m 0 + fma.s1 fA21 = fXsq, fA2, fA1 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_C_lo = cosh_FR_C_lo_temp3, f1, cosh_FR_C_lo_temp2 - nop.i 999 ;; + nop.m 0 + fma.s1 fA6543 = fX4, fA65, fA43 + nop.i 0 } - -// ****************************************************** -// cosh_FR_Y_lo_temp = cosh_FR_C_hi * cosh_FR_peven + cosh_FR_C_lo -// cosh_FR_Y_lo = cosh_FR_S_hi * cosh_FR_podd + cosh_FR_Y_lo_temp -// cosh_FR_COSH = Y_hi + Y_lo +;; { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_Y_lo_temp = cosh_FR_C_hi, cosh_FR_peven, cosh_FR_C_lo - nop.i 999 ;; + nop.m 0 + fma.s1 fA654321 = fX4, fA6543, fA21 + nop.i 0 } +;; +// Dummy multiply to generate inexact { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_Y_lo = cosh_FR_S_hi, cosh_FR_podd, cosh_FR_Y_lo_temp - nop.i 999 ;; + nop.m 0 + fmpy.s0 fTmp = fA6, fA6 + nop.i 0 } - { .mfb - nop.m 999 -(p0) fma.d.s0 f8 = cosh_FR_C_hi, f1, cosh_FR_Y_lo -(p0) br.ret.sptk b0 ;; + nop.m 0 + fma.d.s0 f8 = fA654321, fXsq, f1 + br.ret.sptk b0 // Exit if 0 < |x| < 0.25 } +;; -L(COSH_BY_EXP): -// When p7 is true, we know that an overflow is not going to happen -// When p7 is false, we must check for possible overflow -// p7 is the over_SAFE flag -// f44 = Scale * (Y_hi + Y_lo) -// = cosh_FR_spos * (cosh_FR_Tjhi + cosh_FR_Y_lo) +COSH_POSSIBLE_OVERFLOW: -{ .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_Y_lo_temp = cosh_FR_peven, f1, cosh_FR_podd - nop.i 999 -} - -// Now we are in EXP. This is the only path where an overflow is possible -// but not for certain. So this is the only path where over_SAFE has any use. 
-// r34 still has N-1 -// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe -// There is a danger of double overflow if N-1 > 0x3fe = 1022 +// Here if fMAX_DBL_NORM_ARG < |x| < fMIN_DBL_OFLOW_ARG +// This cannot happen if input is a double, only if input higher precision. +// Overflow is a possibility, not a certainty. -{ .mlx - nop.m 999 -(p0) movl r32 = 0x00000000000003fe ;; -} +// Recompute result using status field 2 with user's rounding mode, +// and wre set. If result is larger than largest double, then we have +// overflow { .mfi -(p0) cmp.gt.unc p0,p7 = r34, r32 - nop.f 999 - nop.i 999 ;; + mov rGt_ln = 0x103ff // Exponent for largest dbl + 1 ulp + fsetc.s2 0x7F,0x42 // Get user's round mode, set wre + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_Y_lo = cosh_FR_Tjhi, cosh_FR_Y_lo_temp, cosh_FR_Tjlo - nop.i 999 ;; + setf.exp fGt_pln = rGt_ln // Create largest double + 1 ulp + fma.d.s2 fWre_urm_f8 = fS, fP, fS // Result with wre set + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_COSH_temp = cosh_FR_Y_lo, f1, cosh_FR_Tjhi - nop.i 999 ;; + nop.m 0 + fsetc.s2 0x7F,0x40 // Turn off wre in sf2 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.d.s0 f44 = cosh_FR_spos, cosh_FR_COSH_temp, f0 - nop.i 999 ;; + nop.m 0 + fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow + nop.i 0 } +;; -// If over_SAFE is set, return { .mfb - nop.m 999 -(p7) fmerge.s f8 = f44,f44 -(p7) br.ret.sptk b0 ;; -} - -// Else see if we overflowed -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// If WRE is set then an overflow will not occur in EXP. -// The input value that would cause a register (WRE) value to overflow is about 2^15 -// and this input would go into the HUGE path. -// Answer with WRE is in f43. 
- -{ .mfi - nop.m 999 -(p0) fsetc.s2 0x7F,0x42 - nop.i 999;; -} - -{ .mfi - nop.m 999 -(p0) fma.d.s2 f43 = cosh_FR_spos, cosh_FR_COSH_temp, f0 - nop.i 999 ;; -} - -// 103FF => 103FF -FFFF = 400(true) -// 400 + 3FF = 7FF, which is 1 more that the exponent of the largest -// double (7FE). So 0 103FF 8000000000000000 is one ulp more than -// largest double in register bias -// Now set p8 if the answer with WRE is greater than or equal this value -// Also set p9 if the answer with WRE is less than or equal to negative this value - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x00000000000103ff ;; + nop.m 0 + nop.f 0 +(p6) br.cond.spnt COSH_CERTAIN_OVERFLOW // Branch if overflow } +;; -{ .mmf - nop.m 999 -(p0) setf.exp f41 = r32 -(p0) fsetc.s2 0x7F,0x40 ;; +{ .mfb + nop.m 0 + fma.d.s0 f8 = fS, fP, fS + br.ret.sptk b0 // Exit if really no overflow } +;; -{ .mfi - nop.m 999 -(p0) fcmp.ge.unc.s1 p8, p0 = f43, f41 - nop.i 999 +COSH_CERTAIN_OVERFLOW: +{ .mmi + sub rTmp = rExp_mask, r0, 1 +;; + setf.exp fTmp = rTmp + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fmerge.ns f42 = f41, f41 - nop.i 999 ;; + alloc r32=ar.pfs,1,4,4,0 + fmerge.s FR_X = f8,f8 + nop.i 0 } - -// The error tag for overflow is 64 -{ .mii - nop.m 999 - nop.i 999 ;; -(p8) mov r47 = 64 ;; -} - { .mfb - nop.m 999 -(p0) fcmp.le.unc.s1 p9, p0 = f43, f42 -(p8) br.cond.spnt __libm_error_region ;; -} - -{ .mii - nop.m 999 - nop.i 999 ;; -(p9) mov r47 = 64 -} - -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.spnt __libm_error_region ;; + mov GR_Parameter_TAG = 64 + fma.d.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result + br.cond.sptk __libm_error_region } +;; +// Here if x unorm +COSH_UNORM: { .mfb - nop.m 999 -(p0) fmerge.s f8 = f44,f44 -(p0) br.ret.sptk b0 ;; -} - - -// for COSH_HUGE, put 24000 in exponent; take sign from input; add 1 -// SAFE: SAFE is always 0 for HUGE - -L(COSH_HUGE): - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000015dbf ;; -} - -{ .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; -} - -{ 
.mfi - nop.m 999 -(p0) fma.s1 cosh_FR_hi_lo = f1, f9, f1 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.d.s0 f44 = f9, cosh_FR_hi_lo, f0 -(p0) mov r47 = 64 + getf.exp rSignexp_x = fNormX // Must recompute if x unorm + fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag + br.cond.sptk COSH_COMMON } ;; -.endp cosh# -ASM_SIZE_DIRECTIVE(cosh#) - -// Stack operations when calling error support. -// (1) (2) (3) (call) (4) -// sp -> + psp -> + psp -> + sp -> + -// | | | | -// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 -// | | | | -// | <-GR_Y Y2->| Y2 ->| <- GR_Y | -// | | | | -// | | <- GR_X X1 ->| | -// | | | | -// sp-64 -> + sp -> + sp -> + + -// save ar.pfs save b0 restore gp -// save gp restore ar.pfs - -.proc __libm_error_region -__libm_error_region: +GLOBAL_IEEE754_END(cosh) + +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue -// (1) { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value nop.f 0 @@ -1103,39 +821,32 @@ __libm_error_region: } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + add sp=-64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP=gp // Save gp };; - - -// (2) { .mmi - stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; - .body -// (3) { .mib - stfd [GR_Parameter_X] = f8 // STORE Parameter 1 on stack + stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address - nop.b 0 + nop.b 0 } { .mib - stfd [GR_Parameter_Y] = f44 // STORE Parameter 3 on stack + stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; { 
.mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; - -// (4) { .mmi ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp @@ -1148,8 +859,6 @@ __libm_error_region: br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) - +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_coshf.S b/sysdeps/ia64/fpu/e_coshf.S index 969abc4ff6..91846e4717 100644 --- a/sysdeps/ia64/fpu/e_coshf.S +++ b/sysdeps/ia64/fpu/e_coshf.S @@ -1,10 +1,10 @@ .file "coshf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2002, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,1127 +20,690 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. + // History -//============================================================== -// 2/02/00 Initial version -// 2/16/00 The error tag for coshf overflow changed to 65 (from 64). -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +//********************************************************************* +// 02/02/00 Initial version +// 02/16/00 The error tag for coshf overflow changed to 65 (from 64). +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. 
+// 05/07/01 Reworked to improve speed of all paths +// 05/20/02 Cleaned up namespace and sf0 syntax +// 11/15/02 Improved algorithm based on expf // // API -//============================================================== -// float = coshf(float) -// input floating point f8 -// output floating point f8 - - +//********************************************************************* +// float coshf(float) +// // Overview of operation -//============================================================== -// There are four paths - -// 1. |x| < 0.25 COSH_BY_POLY -// 2. |x| < 32 COSH_BY_TBL -// 3. |x| < 2^14 COSH_BY_EXP -// 4. |x_ >= 2^14 COSH_HUGE - -// For paths 1, and 2 SAFE is always 1. -// For path 4, Safe is always 0. -// SAFE = 1 means we cannot overflow. - -#include "libm_support.h" - -// Assembly macros -//============================================================== -coshf_FR_X = f44 -coshf_FR_SGNX = f40 - -coshf_FR_Inv_log2by64 = f9 -coshf_FR_log2by64_lo = f11 -coshf_FR_log2by64_hi = f10 - -coshf_FR_A1 = f9 -coshf_FR_A2 = f10 -coshf_FR_A3 = f11 - -coshf_FR_Rcub = f12 -coshf_FR_M_temp = f13 -coshf_FR_R_temp = f13 -coshf_FR_Rsq = f13 -coshf_FR_R = f14 - -coshf_FR_M = f38 - -coshf_FR_B1 = f15 -coshf_FR_B2 = f32 -coshf_FR_B3 = f33 - -coshf_FR_peven_temp1 = f34 -coshf_FR_peven_temp2 = f35 -coshf_FR_peven = f36 - -coshf_FR_podd_temp1 = f34 -coshf_FR_podd_temp2 = f35 -coshf_FR_podd = f37 - -coshf_FR_J_temp = f9 -coshf_FR_J = f10 - -coshf_FR_Mmj = f39 - -coshf_FR_N_temp1 = f11 -coshf_FR_N_temp2 = f12 -coshf_FR_N = f13 - -coshf_FR_spos = f14 -coshf_FR_sneg = f15 - -coshf_FR_Tjhi = f32 -coshf_FR_Tjlo = f33 -coshf_FR_Tmjhi = f34 -coshf_FR_Tmjlo = f35 - -GR_mJ = r35 -GR_J = r36 - -AD_mJ = r38 -AD_J = r39 - - -GR_SAVE_B0 = r42 -GR_SAVE_PFS = r41 -GR_SAVE_GP = r43 - -GR_Parameter_X = r44 -GR_Parameter_Y = r45 -GR_Parameter_RESULT = r46 -GR_Parameter_TAG = r47 - -FR_X = f8 -FR_Y = f0 -FR_RESULT = f44 - - -coshf_FR_C_hi = f9 -coshf_FR_C_hi_temp = f10 -coshf_FR_C_lo_temp1 = f11 
-coshf_FR_C_lo_temp2 = f12 -coshf_FR_C_lo_temp3 = f13 - -coshf_FR_C_lo = f38 -coshf_FR_S_hi = f39 +//********************************************************************* +// Case 1: 0 < |x| < 0.25 +// Evaluate cosh(x) by an 8th order polynomial +// Care is taken with the order of multiplication; and A2 is not exactly 1/4!, +// A3 is not exactly 1/6!, etc. +// cosh(x) = 1 + (A1*x^2 + A2*x^4 + A3*x^6 + A4*x^8) +// +// Case 2: 0.25 <= |x| < 89.41598 +// Algorithm is based on the identity cosh(x) = ( exp(x) + exp(-x) ) / 2. +// The algorithm for exp is described as below. There are a number of +// economies from evaluating both exp(x) and exp(-x). Although we +// are evaluating both quantities, only where the quantities diverge do we +// duplicate the computations. The basic algorithm for exp(x) is described +// below. +// +// Take the input x. w is "how many log2/64 in x?" +// w = x * 64/log2 +// NJ = int(w) +// x = NJ*log2/64 + R -coshf_FR_S_hi_temp1 = f10 -coshf_FR_Y_hi = f11 -coshf_FR_Y_lo_temp = f12 -coshf_FR_Y_lo = f13 -coshf_FR_COSH = f9 +// NJ = 64*n + j +// x = n*log2 + (log2/64)*j + R +// +// So, exp(x) = 2^n * 2^(j/64)* exp(R) +// +// T = 2^n * 2^(j/64) +// Construct 2^n +// Get 2^(j/64) table +// actually all the entries of 2^(j/64) table are stored in DP and +// with exponent bits set to 0 -> multiplication on 2^n can be +// performed by doing logical "or" operation with bits representing 2^n + +// exp(R) = 1 + (exp(R) - 1) +// P = exp(R) - 1 approximated by Taylor series of 3rd degree +// P = A3*R^3 + A2*R^2 + R, A3 = 1/6, A2 = 1/2 +// -coshf_FR_X2 = f9 -coshf_FR_X4 = f10 +// The final result is reconstructed as follows +// exp(x) = T + T*P -coshf_FR_P1 = f14 -coshf_FR_P2 = f15 -coshf_FR_P3 = f32 -coshf_FR_P4 = f33 -coshf_FR_P5 = f34 -coshf_FR_P6 = f35 +// Special values +//********************************************************************* +// coshf(+0) = 1.0 +// coshf(-0) = 1.0 -coshf_FR_TINY_THRESH = f9 +// coshf(+qnan) = +qnan +// coshf(-qnan) = -qnan 
+// coshf(+snan) = +qnan +// coshf(-snan) = -qnan -coshf_FR_COSH_temp = f10 -coshf_FR_SCALE = f11 +// coshf(-inf) = +inf +// coshf(+inf) = +inf -coshf_FR_hi_lo = f10 +// Overflow and Underflow +//********************************************************************* +// coshf(x) = largest single normal when +// x = 89.41598 = 0x42b2d4fc +// +// There is no underflow. -coshf_FR_poly_podd_temp1 = f11 -coshf_FR_poly_podd_temp2 = f13 -coshf_FR_poly_peven_temp1 = f11 -coshf_FR_poly_peven_temp2 = f13 +// Registers used +//********************************************************************* +// Floating Point registers used: +// f8 input, output +// f6,f7, f9 -> f15, f32 -> f45 -// Data tables -//============================================================== +// General registers used: +// r2, r3, r16 -> r38 -#ifdef _LIBC -.rodata -#else -.data -#endif +// Predicate registers used: +// p6 -> p15 +// Assembly macros +//********************************************************************* +// integer registers used +// scratch +rNJ = r2 +rNJ_neg = r3 + +rJ_neg = r16 +rN_neg = r17 +rSignexp_x = r18 +rExp_x = r18 +rExp_mask = r19 +rExp_bias = r20 +rAd1 = r21 +rAd2 = r22 +rJ = r23 +rN = r24 +rTblAddr = r25 +rA3 = r26 +rExpHalf = r27 +rLn2Div64 = r28 +rGt_ln = r29 +r17ones_m1 = r29 +rRightShifter = r30 +rJ_mask = r30 +r64DivLn2 = r31 +rN_mask = r31 +// stacked +GR_SAVE_PFS = r32 +GR_SAVE_B0 = r33 +GR_SAVE_GP = r34 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Parameter_TAG = r38 + +// floating point registers used +FR_X = f10 +FR_Y = f1 +FR_RESULT = f8 +// scratch +fRightShifter = f6 +f64DivLn2 = f7 +fNormX = f9 +fNint = f10 +fN = f11 +fR = f12 +fLn2Div64 = f13 +fA2 = f14 +fA3 = f15 +// stacked +fP = f32 +fT = f33 +fMIN_SGL_OFLOW_ARG = f34 +fMAX_SGL_NORM_ARG = f35 +fRSqr = f36 +fA1 = f37 +fA21 = f37 +fA4 = f38 +fA43 = f38 +fA4321 = f38 +fX4 = f39 +fTmp = f39 +fGt_pln = f39 +fWre_urm_f8 = f40 +fXsq = f40 +fP_neg = f41 +fT_neg = f42 +fExp = f43 
+fExp_neg = f44 +fAbsX = f45 + + +RODATA .align 16 -single_coshf_arg_reduction: -ASM_TYPE_DIRECTIVE(single_coshf_arg_reduction,@object) - data8 0xB8AA3B295C17F0BC, 0x00004005 - data8 0xB17217F7D1000000, 0x00003FF8 - data8 0xCF79ABC9E3B39804, 0x00003FD0 -ASM_SIZE_DIRECTIVE(single_coshf_arg_reduction) - -single_coshf_p_table: -ASM_TYPE_DIRECTIVE(single_coshf_p_table,@object) - data8 0x8000000000000000, 0x00003FFE - data8 0xAAAAAAAAAAAAAB80, 0x00003FFA - data8 0xB60B60B60B4FE884, 0x00003FF5 - data8 0xD00D00D1021D7370, 0x00003FEF - data8 0x93F27740C0C2F1CC, 0x00003FE9 - data8 0x8FA02AC65BCBD5BC, 0x00003FE2 -ASM_SIZE_DIRECTIVE(single_coshf_p_table) - -single_coshf_ab_table: -ASM_TYPE_DIRECTIVE(single_coshf_ab_table,@object) - data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC - data8 0x88888888884ECDD5, 0x00003FF8 - data8 0xD00D0C6DCC26A86B, 0x00003FF2 - data8 0x8000000000000002, 0x00003FFE - data8 0xAAAAAAAAAA402C77, 0x00003FFA - data8 0xB60B6CC96BDB144D, 0x00003FF5 -ASM_SIZE_DIRECTIVE(single_coshf_ab_table) - -single_coshf_j_table: -ASM_TYPE_DIRECTIVE(single_coshf_j_table,@object) - data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000 - data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000 - data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000 - data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000 - data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000 - data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000 - data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000 - data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000 - data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000 - data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000 - data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000 - data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000 - data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000 - data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000 - data8 0xD2A81D91F12AE45A, 
0x00003FFE, 0x1D912473, 0x00000000 - data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000 - data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000 - data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000 - data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000 - data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000 - data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000 - data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000 - data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000 - data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000 - data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000 - data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000 - data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000 - data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000 - data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000 - data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000 - data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000 - data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000 - data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000 - data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000 - data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000 - data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000 - data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000 - data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000 - data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000 - data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000 - data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000 - data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000 - data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000 - data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000 - data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000 - data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000 - data8 
0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000 - data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000 - data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000 - data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000 - data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000 - data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000 - data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000 - data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000 - data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000 - data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000 - data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000 - data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000 - data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000 - data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000 - data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000 - data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000 - data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000 - data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000 - data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000 -ASM_SIZE_DIRECTIVE(single_coshf_j_table) - -.align 32 -.global coshf# - -.section .text -.proc coshf# -.align 32 - -coshf: - -#ifdef _LIBC -.global __ieee754_coshf# -.proc __ieee754_coshf# -__ieee754_coshf: -#endif -// X NAN? 
- - -{ .mfi - alloc r32 = ar.pfs,0,12,4,0 -(p0) fclass.m.unc p6,p7 = f8, 0xc3 - nop.i 999 ;; -} -{ .mfb - nop.m 999 -(p6) fma.s.s0 f8 = f8,f1,f8 -(p6) br.ret.spnt b0 ;; -} - -{ .mfi - nop.m 999 - nop.f 999 - nop.i 999 ;; -} +LOCAL_OBJECT_START(_coshf_table) +data4 0x42b2d4fd // Smallest single arg to overflow single result +data4 0x42b2d4fc // Largest single arg to give normal single result +data4 0x00000000 // pad +data4 0x00000000 // pad +// +// 2^(j/64) table, j goes from 0 to 63 +data8 0x0000000000000000 // 2^(0/64) +data8 0x00002C9A3E778061 // 2^(1/64) +data8 0x000059B0D3158574 // 2^(2/64) +data8 0x0000874518759BC8 // 2^(3/64) +data8 0x0000B5586CF9890F // 2^(4/64) +data8 0x0000E3EC32D3D1A2 // 2^(5/64) +data8 0x00011301D0125B51 // 2^(6/64) +data8 0x0001429AAEA92DE0 // 2^(7/64) +data8 0x000172B83C7D517B // 2^(8/64) +data8 0x0001A35BEB6FCB75 // 2^(9/64) +data8 0x0001D4873168B9AA // 2^(10/64) +data8 0x0002063B88628CD6 // 2^(11/64) +data8 0x0002387A6E756238 // 2^(12/64) +data8 0x00026B4565E27CDD // 2^(13/64) +data8 0x00029E9DF51FDEE1 // 2^(14/64) +data8 0x0002D285A6E4030B // 2^(15/64) +data8 0x000306FE0A31B715 // 2^(16/64) +data8 0x00033C08B26416FF // 2^(17/64) +data8 0x000371A7373AA9CB // 2^(18/64) +data8 0x0003A7DB34E59FF7 // 2^(19/64) +data8 0x0003DEA64C123422 // 2^(20/64) +data8 0x0004160A21F72E2A // 2^(21/64) +data8 0x00044E086061892D // 2^(22/64) +data8 0x000486A2B5C13CD0 // 2^(23/64) +data8 0x0004BFDAD5362A27 // 2^(24/64) +data8 0x0004F9B2769D2CA7 // 2^(25/64) +data8 0x0005342B569D4F82 // 2^(26/64) +data8 0x00056F4736B527DA // 2^(27/64) +data8 0x0005AB07DD485429 // 2^(28/64) +data8 0x0005E76F15AD2148 // 2^(29/64) +data8 0x0006247EB03A5585 // 2^(30/64) +data8 0x0006623882552225 // 2^(31/64) +data8 0x0006A09E667F3BCD // 2^(32/64) +data8 0x0006DFB23C651A2F // 2^(33/64) +data8 0x00071F75E8EC5F74 // 2^(34/64) +data8 0x00075FEB564267C9 // 2^(35/64) +data8 0x0007A11473EB0187 // 2^(36/64) +data8 0x0007E2F336CF4E62 // 2^(37/64) +data8 0x00082589994CCE13 // 2^(38/64) 
+data8 0x000868D99B4492ED // 2^(39/64) +data8 0x0008ACE5422AA0DB // 2^(40/64) +data8 0x0008F1AE99157736 // 2^(41/64) +data8 0x00093737B0CDC5E5 // 2^(42/64) +data8 0x00097D829FDE4E50 // 2^(43/64) +data8 0x0009C49182A3F090 // 2^(44/64) +data8 0x000A0C667B5DE565 // 2^(45/64) +data8 0x000A5503B23E255D // 2^(46/64) +data8 0x000A9E6B5579FDBF // 2^(47/64) +data8 0x000AE89F995AD3AD // 2^(48/64) +data8 0x000B33A2B84F15FB // 2^(49/64) +data8 0x000B7F76F2FB5E47 // 2^(50/64) +data8 0x000BCC1E904BC1D2 // 2^(51/64) +data8 0x000C199BDD85529C // 2^(52/64) +data8 0x000C67F12E57D14B // 2^(53/64) +data8 0x000CB720DCEF9069 // 2^(54/64) +data8 0x000D072D4A07897C // 2^(55/64) +data8 0x000D5818DCFBA487 // 2^(56/64) +data8 0x000DA9E603DB3285 // 2^(57/64) +data8 0x000DFC97337B9B5F // 2^(58/64) +data8 0x000E502EE78B3FF6 // 2^(59/64) +data8 0x000EA4AFA2A490DA // 2^(60/64) +data8 0x000EFA1BEE615A27 // 2^(61/64) +data8 0x000F50765B6E4540 // 2^(62/64) +data8 0x000FA7C1819E90D8 // 2^(63/64) +LOCAL_OBJECT_END(_coshf_table) + +LOCAL_OBJECT_START(cosh_p_table) +data8 0x3efa3001dcf5905b // A4 +data8 0x3f56c1437543543e // A3 +data8 0x3fa5555572601504 // A2 +data8 0x3fdfffffffe2f097 // A1 +LOCAL_OBJECT_END(cosh_p_table) -// X infinity -{ .mfi - nop.m 999 -(p0) fclass.m.unc p6,p0 = f8, 0x23 - nop.i 999 ;; -} -{ .mfb - nop.m 999 -(p6) fmerge.s f8 = f0,f8 -(p6) br.ret.spnt b0 ;; -} +.section .text +GLOBAL_IEEE754_ENTRY(coshf) -// Put 0.25 in f9; p6 true if x < 0.25 { .mlx - nop.m 999 -(p0) movl r32 = 0x000000000000fffd ;; -} - -{ .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + getf.exp rSignexp_x = f8 // Must recompute if x unorm + movl r64DivLn2 = 0x40571547652B82FE // 64/ln(2) } - -{ .mfi - nop.m 999 -(p0) fmerge.s coshf_FR_X = f0,f8 - nop.i 999 +{ .mlx + addl rTblAddr = @ltoff(_coshf_table),gp + movl rRightShifter = 0x43E8000000000000 // DP Right Shifter } +;; { .mfi - nop.m 999 -(p0) fmerge.s coshf_FR_SGNX = f8,f1 - nop.i 999 ;; + // point to the beginning of the table + ld8 rTblAddr = 
[rTblAddr] + fclass.m p6, p0 = f8, 0x0b // Test for x=unorm + addl rA3 = 0x3E2AA, r0 // high bits of 1.0/6.0 rounded to SP } - { .mfi - nop.m 999 -(p0) fcmp.lt.unc p0,p7 = coshf_FR_X,f9 - nop.i 999 ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p7) br.cond.sptk L(COSH_BY_TBL) ;; -} - - -// COSH_BY_POLY: - -// POLY cannot overflow so there is no need to call __libm_error_support -// Get the values of P_x from the table - -{ .mmi - nop.m 999 -(p0) addl r34 = @ltoff(single_coshf_p_table), gp - nop.i 999 + nop.m 0 + fnorm.s1 fNormX = f8 // normalized x + addl rExpHalf = 0xFFFE, r0 // exponent of 1/2 } ;; -{ .mmi - ld8 r34 = [r34] - nop.m 999 - nop.i 999 -} -;; - -// Calculate coshf_FR_X2 = ax*ax and coshf_FR_X4 = ax*ax*ax*ax -{ .mmf - nop.m 999 -(p0) ldfe coshf_FR_P1 = [r34],16 -(p0) fma.s1 coshf_FR_X2 = coshf_FR_X, coshf_FR_X, f0 ;; -} - -{ .mmi -(p0) ldfe coshf_FR_P2 = [r34],16 ;; -(p0) ldfe coshf_FR_P3 = [r34],16 - nop.i 999 ;; -} - -{ .mmi -(p0) ldfe coshf_FR_P4 = [r34],16 ;; -(p0) ldfe coshf_FR_P5 = [r34],16 - nop.i 999 ;; -} - { .mfi -(p0) ldfe coshf_FR_P6 = [r34],16 -(p0) fma.s1 coshf_FR_X4 = coshf_FR_X2, coshf_FR_X2, f0 - nop.i 999 ;; + setf.d f64DivLn2 = r64DivLn2 // load 64/ln(2) to FP reg + fclass.m p15, p0 = f8, 0x1e3 // test for NaT,NaN,Inf + nop.i 0 } - -// Calculate coshf_FR_podd = x4 *(x4 * P_5 + P_3) + P_1 -{ .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_poly_podd_temp1 = coshf_FR_X4, coshf_FR_P5, coshf_FR_P3 - nop.i 999 ;; +{ .mlx + // load Right Shifter to FP reg + setf.d fRightShifter = rRightShifter + movl rLn2Div64 = 0x3F862E42FEFA39EF // DP ln(2)/64 in GR } +;; { .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_podd = coshf_FR_X4, coshf_FR_poly_podd_temp1, coshf_FR_P1 - nop.i 999 + mov rExp_mask = 0x1ffff + fcmp.eq.s1 p13, p0 = f0, f8 // test for x = 0.0 + shl rA3 = rA3, 12 // 0x3E2AA000, approx to 1.0/6.0 in SP } - -// Calculate coshf_FR_peven = p_even = x4 *(x4 * (x4 * P_6 + P_4) + P_2) -{ .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_poly_peven_temp1 = coshf_FR_X4, 
coshf_FR_P6, coshf_FR_P4 - nop.i 999 ;; +{ .mfb + nop.m 0 + nop.f 0 +(p6) br.cond.spnt COSH_UNORM // Branch if x=unorm } +;; +COSH_COMMON: { .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_poly_peven_temp2 = coshf_FR_X4, coshf_FR_poly_peven_temp1, coshf_FR_P2 - nop.i 999 ;; + setf.exp fA2 = rExpHalf // load A2 to FP reg + nop.f 0 + mov rExp_bias = 0xffff } - -{ .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_peven = coshf_FR_X4, coshf_FR_poly_peven_temp2, f0 - nop.i 999 ;; +{ .mfb + setf.d fLn2Div64 = rLn2Div64 // load ln(2)/64 to FP reg +(p15) fma.s.s0 f8 = f8, f8, f0 // result if x = NaT,NaN,Inf +(p15) br.ret.spnt b0 // exit here if x = NaT,NaN,Inf } - -// Y_lo = x2*p_odd + p_even -// Calculate f8 = Y_hi + Y_lo +;; { .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_Y_lo = coshf_FR_X2, coshf_FR_podd, coshf_FR_peven - nop.i 999 ;; + // min overflow and max normal threshold + ldfps fMIN_SGL_OFLOW_ARG, fMAX_SGL_NORM_ARG = [rTblAddr], 8 + nop.f 0 + and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x } - { .mfb - nop.m 999 -(p0) fma.s.s0 f8 = f1, f1, coshf_FR_Y_lo -(p0) br.ret.sptk b0 ;; -} - - -L(COSH_BY_TBL): - -// Now that we are at TBL; so far all we know is that |x| >= 0.25. -// The first two steps are the same for TBL and EXP, but if we are HUGE -// Double -// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true) -// Single -// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true) -// we want to leave now. 
Go to HUGE if |x| >= 2^14 -// 1000d (register-biased) is e = 14 (true) - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000010006 ;; + setf.s fA3 = rA3 // load A3 to FP reg +(p13) fma.s.s0 f8 = f1, f1, f0 // result if x = 0.0 +(p13) br.ret.spnt b0 // exit here if x =0.0 } +;; { .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + sub rExp_x = rExp_x, rExp_bias // True exponent of x + fmerge.s fAbsX = f0, fNormX // Form |x| + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fcmp.ge.unc p6,p7 = coshf_FR_X,f9 - nop.i 999 ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p6) br.cond.spnt L(COSH_HUGE) ;; + nop.m 0 + // x*(64/ln(2)) + Right Shifter + fma.s1 fNint = fNormX, f64DivLn2, fRightShifter + add rTblAddr = 8, rTblAddr } - -// r32 = 1 -// r34 = N-1 -// r35 = N -// r36 = j -// r37 = N+1 - -// TBL can never overflow -// coshf(x) = coshf(B+R) -// = coshf(B) coshf(R) + sinh(B) sinh(R) -// coshf(R) can be approximated by 1 + p_even -// sinh(R) can be approximated by p_odd - -// ****************************************************** -// STEP 1 (TBL and EXP) -// ****************************************************** -// Get the following constants. -// f9 = Inv_log2by64 -// f10 = log2by64_hi -// f11 = log2by64_lo - -{ .mmi -(p0) adds r32 = 0x1,r0 -(p0) addl r34 = @ltoff(single_coshf_arg_reduction), gp - nop.i 999 +{ .mfb + cmp.gt p7, p0 = -2, rExp_x // Test |x| < 2^(-2) + fma.s1 fXsq = fNormX, fNormX, f0 // x*x for small path +(p7) br.cond.spnt COSH_SMALL // Branch if 0 < |x| < 2^-2 } ;; - -// We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and -// put them in an exponent. 
-// coshf_FR_spos = 2^(N-1) and coshf_FR_sneg = 2^(-N-1) -// r39 = 0xffff + (N-1) = 0xffff +N -1 -// r40 = 0xffff - (N +1) = 0xffff -N -1 - -{ .mlx - ld8 r34 = [r34] -(p0) movl r38 = 0x000000000000fffe ;; -} - -{ .mmi -(p0) ldfe coshf_FR_Inv_log2by64 = [r34],16 ;; -(p0) ldfe coshf_FR_log2by64_hi = [r34],16 - nop.i 999 ;; -} - -{ .mbb -(p0) ldfe coshf_FR_log2by64_lo = [r34],16 - nop.b 999 - nop.b 999 ;; -} - -// Get the A coefficients -// f9 = A_1 -// f10 = A_2 -// f11 = A_3 - -{ .mmi - nop.m 999 -(p0) addl r34 = @ltoff(single_coshf_ab_table), gp - nop.i 999 +{ .mfi + nop.m 0 + // check for overflow + fcmp.ge.s1 p12, p13 = fAbsX, fMIN_SGL_OFLOW_ARG + mov rJ_mask = 0x3f // 6-bit mask for J } ;; -{ .mmi - ld8 r34 = [r34] - nop.m 999 - nop.i 999 +{ .mfb + nop.m 0 + fms.s1 fN = fNint, f1, fRightShifter // n in FP register + // branch out if overflow +(p12) br.cond.spnt COSH_CERTAIN_OVERFLOW } ;; - -// Calculate M and keep it as integer and floating point. -// M = round-to-integer(x*Inv_log2by64) -// coshf_FR_M = M = truncate(ax/(log2/64)) -// Put the significand of M in r35 -// and the floating point representation of M in coshf_FR_M - -{ .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_M = coshf_FR_X, coshf_FR_Inv_log2by64, f0 - nop.i 999 -} - -{ .mfi -(p0) ldfe coshf_FR_A1 = [r34],16 - nop.f 999 - nop.i 999 ;; -} - { .mfi - nop.m 999 -(p0) fcvt.fx.s1 coshf_FR_M_temp = coshf_FR_M - nop.i 999 ;; + getf.sig rNJ = fNint // bits of n, j + // check for possible overflow + fcmp.gt.s1 p13, p0 = fAbsX, fMAX_SGL_NORM_ARG + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fnorm.s1 coshf_FR_M = coshf_FR_M_temp - nop.i 999 ;; + addl rN = 0xFFBF - 63, rNJ // biased and shifted n-1,j + fnma.s1 fR = fLn2Div64, fN, fNormX // R = x - N*ln(2)/64 + and rJ = rJ_mask, rNJ // bits of j } - { .mfi -(p0) getf.sig r35 = coshf_FR_M_temp - nop.f 999 - nop.i 999 ;; -} - -// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It -// has a range of -32 thru 31. 
-// r35 = M -// r36 = j - -{ .mii - nop.m 999 - nop.i 999 ;; -(p0) and r36 = 0x3f, r35 ;; + sub rNJ_neg = r0, rNJ // bits of n, j for -x + nop.f 0 + andcm rN_mask = -1, rJ_mask // 0xff...fc0 to mask N } - -// Calculate R -// f13 = f44 - f12*f10 = x - M*log2by64_hi -// f14 = f13 - f8*f11 = R = (x - M*log2by64_hi) - M*log2by64_lo +;; { .mfi - nop.m 999 -(p0) fnma.s1 coshf_FR_R_temp = coshf_FR_M, coshf_FR_log2by64_hi, coshf_FR_X - nop.i 999 + shladd rJ = rJ, 3, rTblAddr // address in the 2^(j/64) table + nop.f 0 + and rN = rN_mask, rN // biased, shifted n-1 } - { .mfi -(p0) ldfe coshf_FR_A2 = [r34],16 - nop.f 999 - nop.i 999 ;; + addl rN_neg = 0xFFBF - 63, rNJ_neg // -x biased, shifted n-1,j + nop.f 0 + and rJ_neg = rJ_mask, rNJ_neg // bits of j for -x } +;; { .mfi - nop.m 999 -(p0) fnma.s1 coshf_FR_R = coshf_FR_M, coshf_FR_log2by64_lo, coshf_FR_R_temp - nop.i 999 + ld8 rJ = [rJ] // Table value + nop.f 0 + shl rN = rN, 46 // 2^(n-1) bits in DP format } - -// Get the B coefficients -// f15 = B_1 -// f32 = B_2 -// f33 = B_3 - -{ .mmi -(p0) ldfe coshf_FR_A3 = [r34],16 ;; -(p0) ldfe coshf_FR_B1 = [r34],16 - nop.i 999 ;; -} - -{ .mmi -(p0) ldfe coshf_FR_B2 = [r34],16 ;; -(p0) ldfe coshf_FR_B3 = [r34],16 - nop.i 999 ;; -} - -{ .mii - nop.m 999 -(p0) shl r34 = r36, 0x2 ;; -(p0) sxt1 r37 = r34 ;; -} - -// ****************************************************** -// STEP 2 (TBL and EXP) -// ****************************************************** -// Calculate Rsquared and Rcubed in preparation for p_even and p_odd -// f12 = R*R*R -// f13 = R*R -// f14 = R <== from above - { .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_Rsq = coshf_FR_R, coshf_FR_R, f0 -(p0) shr r36 = r37, 0x2 ;; -} - -// r34 = M-j = r35 - r36 -// r35 = N = (M-j)/64 - -{ .mii -(p0) sub r34 = r35, r36 - nop.i 999 ;; -(p0) shr r35 = r34, 0x6 ;; -} - -{ .mii -(p0) sub r40 = r38, r35 -(p0) adds r37 = 0x1, r35 -(p0) add r39 = r38, r35 ;; -} - -// Get the address of the J table, add the offset, -// addresses are sinh_AD_mJ and 
sinh_AD_J, get the T value -// f32 = T(j)_hi -// f33 = T(j)_lo -// f34 = T(-j)_hi -// f35 = T(-j)_lo - -{ .mmi -(p0) sub r34 = r35, r32 -(p0) addl r37 = @ltoff(single_coshf_j_table), gp - nop.i 999 + shladd rJ_neg = rJ_neg, 3, rTblAddr // addr in 2^(j/64) table -x + nop.f 0 + and rN_neg = rN_mask, rN_neg // biased, shifted n-1 for -x } ;; { .mfi - ld8 r37 = [r37] -(p0) fma.s1 coshf_FR_Rcub = coshf_FR_Rsq, coshf_FR_R, f0 - nop.i 999 -} - -// ****************************************************** -// STEP 3 Now decide if we need to branch to EXP -// ****************************************************** -// Put 32 in f9; p6 true if x < 32 - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000010004 ;; + ld8 rJ_neg = [rJ_neg] // Table value for -x + nop.f 0 + shl rN_neg = rN_neg, 46 // 2^(n-1) bits in DP format for -x } - -// Calculate p_even -// f34 = B_2 + Rsq *B_3 -// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3) -// f36 = peven = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3)) +;; { .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_peven_temp1 = coshf_FR_Rsq, coshf_FR_B3, coshf_FR_B2 - nop.i 999 ;; + or rN = rN, rJ // bits of 2^n * 2^(j/64) in DP format + nop.f 0 + nop.i 0 } +;; -{ .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_peven_temp2 = coshf_FR_Rsq, coshf_FR_peven_temp1, coshf_FR_B1 - nop.i 999 +{ .mmf + setf.d fT = rN // 2^(n-1) * 2^(j/64) + or rN_neg = rN_neg, rJ_neg // -x bits of 2^n * 2^(j/64) in DP + fma.s1 fRSqr = fR, fR, f0 // R^2 } - -// Calculate p_odd -// f34 = A_2 + Rsq *A_3 -// f35 = A_1 + Rsq * (A_2 + Rsq *A_3) -// f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3)) +;; { .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_podd_temp1 = coshf_FR_Rsq, coshf_FR_A3, coshf_FR_A2 - nop.i 999 ;; + setf.d fT_neg = rN_neg // 2^(n-1) * 2^(j/64) for -x + fma.s1 fP = fA3, fR, fA2 // A3*R + A2 + nop.i 0 } - { .mfi -(p0) setf.exp coshf_FR_N_temp1 = r39 - nop.f 999 - nop.i 999 ;; + nop.m 0 + fnma.s1 fP_neg = fA3, fR, fA2 // A3*R + A2 for -x + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) 
fma.s1 coshf_FR_peven = coshf_FR_Rsq, coshf_FR_peven_temp2, f0 - nop.i 999 + nop.m 0 + fma.s1 fP = fP, fRSqr, fR // P = (A3*R + A2)*R^2 + R + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_podd_temp2 = coshf_FR_Rsq, coshf_FR_podd_temp1, coshf_FR_A1 - nop.i 999 ;; + nop.m 0 + fms.s1 fP_neg = fP_neg, fRSqr, fR // P = (A3*R + A2)*R^2 + R, -x + nop.i 0 } +;; { .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + nop.m 0 + fmpy.s0 fTmp = fLn2Div64, fLn2Div64 // Force inexact + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_podd = coshf_FR_podd_temp2, coshf_FR_Rcub, coshf_FR_R - nop.i 999 -} - -// sinh_GR_mj contains the table offset for -j -// sinh_GR_j contains the table offset for +j -// p6 is true when j <= 0 - -{ .mlx -(p0) setf.exp coshf_FR_N_temp2 = r40 -(p0) movl r40 = 0x0000000000000020 ;; + nop.m 0 + fma.s1 fExp = fP, fT, fT // exp(x)/2 + nop.i 0 } - -{ .mfi -(p0) sub GR_mJ = r40, r36 -(p0) fmerge.se coshf_FR_spos = coshf_FR_N_temp1, f1 -(p0) adds GR_J = 0x20, r36 ;; +{ .mfb + nop.m 0 + fma.s1 fExp_neg = fP_neg, fT_neg, fT_neg // exp(-x)/2 + // branch out if possible overflow result +(p13) br.cond.spnt COSH_POSSIBLE_OVERFLOW } +;; -{ .mii - nop.m 999 -(p0) shl GR_mJ = GR_mJ, 5 ;; -(p0) add AD_mJ = r37, GR_mJ ;; +{ .mfb + nop.m 0 + // final result in the absence of overflow + fma.s.s0 f8 = fExp, f1, fExp_neg // result = (exp(x)+exp(-x))/2 + // exit here in the absence of overflow + br.ret.sptk b0 // Exit main path, 0.25 <= |x| < 89.41598 } +;; +// Here if 0 < |x| < 0.25. Evaluate 8th order polynomial. 
+COSH_SMALL: { .mmi - nop.m 999 -(p0) ldfe coshf_FR_Tmjhi = [AD_mJ],16 -(p0) shl GR_J = GR_J, 5 ;; -} - -{ .mfi -(p0) ldfs coshf_FR_Tmjlo = [AD_mJ],16 -(p0) fcmp.lt.unc.s1 p6,p7 = coshf_FR_X,f9 -(p0) add AD_J = r37, GR_J ;; + add rAd1 = 0x200, rTblAddr + add rAd2 = 0x210, rTblAddr + nop.i 0 } +;; { .mmi -(p0) ldfe coshf_FR_Tjhi = [AD_J],16 ;; -(p0) ldfs coshf_FR_Tjlo = [AD_J],16 - nop.i 999 ;; -} - -{ .mfb - nop.m 999 -(p0) fmerge.se coshf_FR_sneg = coshf_FR_N_temp2, f1 -(p7) br.cond.spnt L(COSH_BY_EXP) ;; -} - -// ****************************************************** -// If NOT branch to EXP -// ****************************************************** -// Calculate C_hi -// ****************************************************** -// coshf_FR_C_hi_temp = coshf_FR_sneg * coshf_FR_Tmjhi -// coshf_FR_C_hi = coshf_FR_spos * coshf_FR_Tjhi + (coshf_FR_sneg * coshf_FR_Tmjhi) - -{ .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_C_hi_temp = coshf_FR_sneg, coshf_FR_Tmjhi, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_C_hi = coshf_FR_spos, coshf_FR_Tjhi, coshf_FR_C_hi_temp - nop.i 999 -} - -// ****************************************************** -// Calculate S_hi -// ****************************************************** -// coshf_FR_S_hi_temp1 = coshf_FR_sneg * coshf_FR_Tmjhi -// coshf_FR_S_hi = coshf_FR_spos * coshf_FR_Tjhi - coshf_FR_C_hi_temp1 - -{ .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_S_hi_temp1 = coshf_FR_sneg, coshf_FR_Tmjhi, f0 - nop.i 999 ;; -} - -// ****************************************************** -// Calculate C_lo -// ****************************************************** -// coshf_FR_C_lo_temp1 = coshf_FR_spos * coshf_FR_Tjhi - coshf_FR_C_hi -// coshf_FR_C_lo_temp2 = coshf_FR_sneg * coshf_FR_Tmjlo + (coshf_FR_spos * coshf_FR_Tjhi - coshf_FR_C_hi) -// coshf_FR_C_lo_temp1 = coshf_FR_sneg * coshf_FR_Tmjlo -// coshf_FR_C_lo_temp3 = coshf_FR_spos * coshf_FR_Tjlo + (coshf_FR_sneg * coshf_FR_Tmjlo) -// coshf_FR_C_lo = coshf_FR_C_lo_temp3 + 
coshf_FR_C_lo_temp2 - -{ .mfi - nop.m 999 -(p0) fms.s1 coshf_FR_C_lo_temp1 = coshf_FR_spos, coshf_FR_Tjhi, coshf_FR_C_hi - nop.i 999 -} - -{ .mfi - nop.m 999 -(p0) fms.s1 coshf_FR_S_hi = coshf_FR_spos, coshf_FR_Tjhi, coshf_FR_S_hi_temp1 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_C_lo_temp2 = coshf_FR_sneg, coshf_FR_Tmjhi, coshf_FR_C_lo_temp1 - nop.i 999 + ldfpd fA4, fA3 = [rAd1] + ldfpd fA2, fA1 = [rAd2] + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_C_lo_temp1 = coshf_FR_sneg, coshf_FR_Tmjlo, f0 - nop.i 999 ;; + nop.m 0 + fma.s1 fX4 = fXsq, fXsq, f0 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_C_lo_temp3 = coshf_FR_spos, coshf_FR_Tjlo, coshf_FR_C_lo_temp1 - nop.i 999 ;; + nop.m 0 + fma.s1 fA43 = fXsq, fA4, fA3 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_C_lo = coshf_FR_C_lo_temp3, f1, coshf_FR_C_lo_temp2 - nop.i 999 ;; + nop.m 0 + fma.s1 fA21 = fXsq, fA2, fA1 + nop.i 0 } - -// ****************************************************** -// coshf_FR_Y_lo_temp = coshf_FR_C_hi * coshf_FR_peven + coshf_FR_C_lo -// coshf_FR_Y_lo = coshf_FR_S_hi * coshf_FR_podd + coshf_FR_Y_lo_temp -// coshf_FR_COSH = Y_hi + Y_lo +;; { .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_Y_lo_temp = coshf_FR_C_hi, coshf_FR_peven, coshf_FR_C_lo - nop.i 999 ;; + nop.m 0 + fma.s1 fA4321 = fX4, fA43, fA21 + nop.i 0 } +;; +// Dummy multiply to generate inexact { .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_Y_lo = coshf_FR_S_hi, coshf_FR_podd, coshf_FR_Y_lo_temp - nop.i 999 ;; + nop.m 0 + fmpy.s0 fTmp = fA4, fA4 + nop.i 0 } - { .mfb - nop.m 999 -(p0) fma.s.s0 f8 = coshf_FR_C_hi, f1, coshf_FR_Y_lo -(p0) br.ret.sptk b0 ;; + nop.m 0 + fma.s.s0 f8 = fA4321, fXsq, f1 + br.ret.sptk b0 // Exit if 0 < |x| < 0.25 } +;; +COSH_POSSIBLE_OVERFLOW: -L(COSH_BY_EXP): +// Here if fMAX_SGL_NORM_ARG < |x| < fMIN_SGL_OFLOW_ARG +// This cannot happen if input is a single, only if the input has higher precision. +// Overflow is a possibility, not a certainty. 
-// When p7 is true, we know that an overflow is not going to happen -// When p7 is false, we must check for possible overflow -// p7 is the over_SAFE flag -// f44 = Scale * (Y_hi + Y_lo) -// = coshf_FR_spos * (coshf_FR_Tjhi + coshf_FR_Y_lo) +// Recompute result using status field 2 with user's rounding mode, +// and wre set. If result is larger than largest single, then we have +// overflow { .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_Y_lo_temp = coshf_FR_peven, f1, coshf_FR_podd - nop.i 999 -} - -// Now we are in EXP. This is the only path where an overflow is possible -// but not for certain. So this is the only path where over_SAFE has any use. -// r34 still has N-1 -// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe -// There is a danger of double overflow if N-1 > 0x3fe = 1022 -// There is a danger of single overflow if N-1 > 0x7e = 126 - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x000000000000007e ;; -} - -{ .mfi -(p0) cmp.gt.unc p0,p7 = r34, r32 - nop.f 999 - nop.i 999 ;; + mov rGt_ln = 0x1007f // Exponent for largest single + 1 ulp + fsetc.s2 0x7F,0x42 // Get user's round mode, set wre + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_Y_lo = coshf_FR_Tjhi, coshf_FR_Y_lo_temp, coshf_FR_Tjlo - nop.i 999 ;; + setf.exp fGt_pln = rGt_ln // Create largest single + 1 ulp + fma.s.s2 fWre_urm_f8 = fP, fT, fT // Result with wre set + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_COSH_temp = coshf_FR_Y_lo, f1, coshf_FR_Tjhi - nop.i 999 ;; + nop.m 0 + fsetc.s2 0x7F,0x40 // Turn off wre in sf2 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s.s0 f44 = coshf_FR_spos, coshf_FR_COSH_temp, f0 - nop.i 999 ;; + nop.m 0 + fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow + nop.i 0 } +;; -// If over_SAFE is set, return { .mfb - nop.m 999 -(p7) fmerge.s f8 = f44,f44 -(p7) br.ret.sptk b0 ;; -} - -// Else see if we overflowed -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// If WRE is set then an overflow will 
not occur in EXP. -// The input value that would cause a register (WRE) value to overflow is about 2^15 -// and this input would go into the HUGE path. -// Answer with WRE is in f43. - -{ .mfi - nop.m 999 -(p0) fsetc.s2 0x7F,0x42 - nop.i 999;; -} - -{ .mfi - nop.m 999 -(p0) fma.s.s2 f43 = coshf_FR_spos, coshf_FR_COSH_temp, f0 - nop.i 999 ;; -} - -// 1 more that the exponent of the largest double (7FE) = 7FF -// 7FF - 3FF = 400 (true); 400 + FFFF = 103FF (register-biased) -// So 0 103FF 8000000000000000 is one ulp more than -// largest double in register bias -// 1 more that the exponent of the largest single (FE) = FF -// FF - 7F = 80 (true); 80 + FFFF = 1007F (register-biased) -// Now set p8 if the answer with WRE is greater than or equal this value -// Also set p9 if the answer with WRE is less than or equal to negative this value - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x000000000001007f ;; + nop.m 0 + nop.f 0 +(p6) br.cond.spnt COSH_CERTAIN_OVERFLOW // Branch if overflow } +;; -{ .mmf - nop.m 999 -(p0) setf.exp f41 = r32 -(p0) fsetc.s2 0x7F,0x40 ;; +{ .mfb + nop.m 0 + fma.s.s0 f8 = fP, fT, fT + br.ret.sptk b0 // Exit if really no overflow } +;; -{ .mfi - nop.m 999 -(p0) fcmp.ge.unc.s1 p8, p0 = f43, f41 - nop.i 999 +// here if overflow +COSH_CERTAIN_OVERFLOW: +{ .mmi + addl r17ones_m1 = 0x1FFFE, r0 +;; + setf.exp fTmp = r17ones_m1 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fmerge.ns f42 = f41, f41 - nop.i 999 ;; -} - -// The error tag for overflow is 65 -{ .mii - nop.m 999 - nop.i 999 ;; -(p8) mov GR_Parameter_TAG = 65 ;; + alloc r32 = ar.pfs, 0, 3, 4, 0 // get some registers + fmerge.s FR_X = f8,f8 + nop.i 0 } - { .mfb - nop.m 999 -(p0) fcmp.le.unc.s1 p9, p0 = f43, f42 -(p8) br.cond.spnt __libm_error_region ;; -} - -{ .mii - nop.m 999 - nop.i 999 ;; -(p9) mov GR_Parameter_TAG = 64 -} - -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.spnt __libm_error_region ;; + mov GR_Parameter_TAG = 65 + fma.s.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result + br.cond.sptk 
__libm_error_region } +;; +// Here if x unorm +COSH_UNORM: { .mfb - nop.m 999 -(p0) fmerge.s f8 = f44,f44 -(p0) br.ret.sptk b0 ;; + getf.exp rSignexp_x = fNormX // Must recompute if x unorm + fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag + br.cond.sptk COSH_COMMON // Return to main path } +;; +GLOBAL_IEEE754_END(coshf) -L(COSH_HUGE): - -// for COSH_HUGE, put 24000 in exponent; take sign from input; add 1 -// SAFE: SAFE is always 0 for HUGE - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000015dbf ;; -} - -{ .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 coshf_FR_hi_lo = f1, f9, f1 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s.s0 f44 = f9, coshf_FR_hi_lo, f0 -(p0) mov GR_Parameter_TAG = 65 -} -.endp coshf -ASM_SIZE_DIRECTIVE(coshf) - - -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi - add GR_Parameter_Y=-32,sp // Parameter 2 value - nop.f 0 + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi -.fframe 64 - add sp=-64,sp // Create new stack - nop.f 0 - mov GR_SAVE_GP=gp // Save gp +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp };; { .mmi - stfs [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 };; .body -{ .mib - stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y - nop.b 0 // Parameter 3 address +{ .mfi + stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + nop.f 0 + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address } { .mib - stfs [GR_Parameter_Y] = 
FR_RESULT // Store Parameter 3 on stack - add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk.many b0=__libm_error_support# // Call error handling function + stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function };; + { .mmi - nop.m 0 - nop.m 0 - add GR_Parameter_RESULT = 48,sp + add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; + { .mmi - ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp - add sp = 64,sp // Restore stack pointer - mov b0 = GR_SAVE_B0 // Restore return address + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address };; { .mib - mov gp = GR_SAVE_GP // Restore gp - mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - br.ret.sptk b0 // Return -};; + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +LOCAL_LIBM_END(__libm_error_region) -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_coshl.S b/sysdeps/ia64/fpu/e_coshl.S index daac20d9a3..cef8be0b1a 100644 --- a/sysdeps/ia64/fpu/e_coshl.S +++ b/sysdeps/ia64/fpu/e_coshl.S @@ -1,10 +1,10 @@ .file "coshl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2002, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. 
+// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,1129 +35,1060 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial version -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial version +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. -// 1/23/01 Set inexact flag for large args. +// 01/23/01 Set inexact flag for large args. 
+// 05/07/01 Reworked to improve speed of all paths +// 05/20/02 Cleaned up namespace and sf0 syntax +// 12/06/02 Improved performance // // API //============================================================== -// float = cosh(float) -// double = cosh(double) // long double = coshl(long double) // input floating point f8 // output floating point f8 - - +// +// Registers used +//============================================================== +// general registers: +// r14 -> r40 +// predicate registers used: +// p6 -> p11 +// floating-point registers used: +// f9 -> f15; f32 -> f90; +// f8 has input, then output +// // Overview of operation //============================================================== -// There are four paths - -// 1. |x| < 0.25 COSH_BY_POLY -// 2. |x| < 32 COSH_BY_TBL -// 3. |x| < 2^14 COSH_BY_EXP -// 4. |x| >= 2^14 COSH_HUGE - -// For paths 1, and 2 SAFE is always 1. -// For path 4, Safe is always 0. -// SAFE = 1 means we cannot overflow. - -#include "libm_support.h" - +// There are seven paths +// 1. 0 < |x| < 0.25 COSH_BY_POLY +// 2. 0.25 <=|x| < 32 COSH_BY_TBL +// 3. 32 <= |x| < 11357.21655 COSH_BY_EXP (merged path with COSH_BY_TBL) +// 4. |x| >= 11357.21655 COSH_HUGE +// 5. x=0 Done with early exit +// 6. x=inf,nan Done with early exit +// 7. x=denormal COSH_DENORM +// +// For double extended we get overflow for x >= 400c b174 ddc0 31ae c0ea +// >= 11357.21655 +// +// +// 1. COSH_BY_POLY 0 < |x| < 0.25 +// =============== +// Evaluate cosh(x) by a 12th order polynomial +// Care is taken for the order of multiplication; and P2 is not exactly 1/4!, +// P3 is not exactly 1/6!, etc. +// cosh(x) = 1 + (P1*x^2 + P2*x^4 + P3*x^6 + P4*x^8 + P5*x^10 + P6*x^12) +// +// 2. COSH_BY_TBL 0.25 <= |x| < 32.0 +// ============= +// cosh(x) = cosh(B+R) +// = cosh(B)cosh(R) + sinh(B)sinh(R) +// +// ax = |x| = M*log2/64 + R +// B = M*log2/64 +// M = 64*N + j +// We will calculate M and get N as (M-j)/64 +// The division is a shift. 
+// exp(B) = exp(N*log2 + j*log2/64) +// = 2^N * 2^(j*log2/64) +// cosh(B) = 1/2(e^B + e^-B) +// = 1/2(2^N * 2^(j*log2/64) + 2^-N * 2^(-j*log2/64)) +// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64)) +// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64)) +// 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32 +// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit) +// +// R = ax - M*log2/64 +// R = ax - M*log2_by_64_hi - M*log2_by_64_lo +// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...) +// = 1 + p_odd + p_even +// where the p_odd uses the A coefficients and the p_even uses +// the B coefficients +// +// So sinh(R) = ((1 + p_odd + p_even) - (1 - p_odd + p_even))/2 = p_odd +// cosh(R) = 1 + p_even +// cosh(B) = C_hi + C_lo +// sinh(B) = S_hi +// cosh(x) = cosh(B)cosh(R) + sinh(B)sinh(R) +// +// 3. COSH_BY_EXP 32.0 <= |x| < 11357.21655 ( 400c b174 ddc0 31ae c0ea ) +// ============== +// Can approximate result by exp(x)/2 in this region. +// Y_hi = Tjhi +// Y_lo = Tjhi * (p_odd + p_even) + Tjlo +// cosh(x) = Y_hi + Y_lo +// +// 4. 
COSH_HUGE |x| >= 11357.21655 ( 400c b174 ddc0 31ae c0ea ) +// ============ +// Set error tag and call error support +// +// // Assembly macros //============================================================== -cosh_FR_X = f44 -FR_RESULT = f44 -cosh_FR_SGNX = f40 -cosh_FR_all_ones = f45 - -FR_X = f8 -FR_Y = f0 -cosh_FR_Inv_log2by64 = f9 -cosh_FR_log2by64_lo = f11 -cosh_FR_log2by64_hi = f10 - -cosh_FR_A1 = f9 -cosh_FR_A2 = f10 -cosh_FR_A3 = f11 - -cosh_FR_Rcub = f12 -cosh_FR_M_temp = f13 -cosh_FR_R_temp = f13 -cosh_FR_Rsq = f13 -cosh_FR_R = f14 - -cosh_FR_M = f38 - -cosh_FR_tmp = f15 -cosh_FR_B1 = f15 -cosh_FR_B2 = f32 -cosh_FR_B3 = f33 - -cosh_FR_peven_temp1 = f34 -cosh_FR_peven_temp2 = f35 -cosh_FR_peven = f36 - -cosh_FR_podd_temp1 = f34 -cosh_FR_podd_temp2 = f35 -cosh_FR_podd = f37 - -cosh_FR_J_temp = f9 -cosh_FR_J = f10 - -cosh_FR_Mmj = f39 - -cosh_FR_N_temp1 = f11 -cosh_FR_N_temp2 = f12 -cosh_FR_N = f13 - -cosh_FR_spos = f14 -cosh_FR_sneg = f15 - -cosh_FR_Tjhi = f32 -cosh_FR_Tjlo = f33 -cosh_FR_Tmjhi = f34 -cosh_FR_Tmjlo = f35 - -GR_mJ = r35 -GR_J = r36 - -AD_mJ = r38 -AD_J = r39 - -cosh_GR_all_ones = r40 - -GR_SAVE_PFS = r41 -GR_SAVE_B0 = r42 -GR_SAVE_GP = r43 -GR_Parameter_X = r44 -GR_Parameter_Y = r45 -GR_Parameter_RESULT = r46 -GR_Parameter_TAG = r47 +r_ad5 = r14 +r_rshf_2to57 = r15 +r_exp_denorm = r15 +r_ad_mJ_lo = r15 +r_ad_J_lo = r16 +r_2Nm1 = r17 +r_2mNm1 = r18 +r_exp_x = r18 +r_ad_J_hi = r19 +r_ad2o = r19 +r_ad_mJ_hi = r20 +r_mj = r21 +r_ad2e = r22 +r_ad3 = r23 +r_ad1 = r24 +r_Mmj = r24 +r_rshf = r25 +r_M = r25 +r_N = r25 +r_jshf = r26 +r_exp_2tom57 = r26 +r_j = r26 +r_exp_mask = r27 +r_signexp_x = r28 +r_signexp_0_5 = r28 +r_exp_0_25 = r29 +r_sig_inv_ln2 = r30 +r_exp_32 = r30 +r_exp_huge = r30 +r_ad4 = r31 + +GR_SAVE_PFS = r34 +GR_SAVE_B0 = r35 +GR_SAVE_GP = r36 + +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 + + +f_ABS_X = f9 +f_X2 = f10 +f_X4 = f11 +f_tmp = f14 +f_RSHF = f15 + +f_Inv_log2by64 = f32 
+f_log2by64_lo = f33 +f_log2by64_hi = f34 +f_A1 = f35 + +f_A2 = f36 +f_A3 = f37 +f_Rcub = f38 +f_M_temp = f39 +f_R_temp = f40 + +f_Rsq = f41 +f_R = f42 +f_M = f43 +f_B1 = f44 +f_B2 = f45 + +f_B3 = f46 +f_peven_temp1 = f47 +f_peven_temp2 = f48 +f_peven = f49 +f_podd_temp1 = f50 + +f_podd_temp2 = f51 +f_podd = f52 +f_poly65 = f53 +f_poly6543 = f53 +f_poly6to1 = f53 +f_poly43 = f54 +f_poly21 = f55 + +f_X3 = f56 +f_INV_LN2_2TO63 = f57 +f_RSHF_2TO57 = f58 +f_2TOM57 = f59 +f_smlst_oflow_input = f60 + +f_pre_result = f61 +f_huge = f62 +f_spos = f63 +f_sneg = f64 +f_Tjhi = f65 + +f_Tjlo = f66 +f_Tmjhi = f67 +f_Tmjlo = f68 +f_S_hi = f69 +f_SC_hi_temp = f70 + +f_C_lo_temp1 = f71 +f_C_lo_temp2 = f72 +f_C_lo_temp3 = f73 +f_C_lo_temp4 = f73 +f_C_lo = f74 +f_C_hi = f75 + +f_Y_hi = f77 +f_Y_lo_temp = f78 +f_Y_lo = f79 +f_NORM_X = f80 + +f_P1 = f81 +f_P2 = f82 +f_P3 = f83 +f_P4 = f84 +f_P5 = f85 + +f_P6 = f86 +f_Tjhi_spos = f87 +f_Tjlo_spos = f88 +f_huge = f89 +f_signed_hi_lo = f90 -cosh_FR_C_hi = f9 -cosh_FR_C_hi_temp = f10 -cosh_FR_C_lo_temp1 = f11 -cosh_FR_C_lo_temp2 = f12 -cosh_FR_C_lo_temp3 = f13 - -cosh_FR_C_lo = f38 -cosh_FR_S_hi = f39 - -cosh_FR_S_hi_temp1 = f10 -cosh_FR_Y_hi = f11 -cosh_FR_Y_lo_temp = f12 -cosh_FR_Y_lo = f13 -cosh_FR_COSH = f9 - -cosh_FR_X2 = f9 -cosh_FR_X4 = f10 - -cosh_FR_P1 = f14 -cosh_FR_P2 = f15 -cosh_FR_P3 = f32 -cosh_FR_P4 = f33 -cosh_FR_P5 = f34 -cosh_FR_P6 = f35 - -cosh_FR_TINY_THRESH = f9 - -cosh_FR_COSH_temp = f10 -cosh_FR_SCALE = f11 - -cosh_FR_hi_lo = f10 - -cosh_FR_poly_podd_temp1 = f11 -cosh_FR_poly_podd_temp2 = f13 -cosh_FR_poly_peven_temp1 = f11 -cosh_FR_poly_peven_temp2 = f13 // Data tables //============================================================== -#ifdef _LIBC -.rodata -#else -.data -#endif +// DO NOT CHANGE ORDER OF THESE TABLES +RODATA .align 16 -double_cosh_arg_reduction: -ASM_TYPE_DIRECTIVE(double_cosh_arg_reduction,@object) - data8 0xB8AA3B295C17F0BC, 0x00004005 - data8 0xB17217F7D1000000, 0x00003FF8 - data8 
0xCF79ABC9E3B39804, 0x00003FD0 -ASM_SIZE_DIRECTIVE(double_cosh_arg_reduction) - -double_cosh_p_table: -ASM_TYPE_DIRECTIVE(double_cosh_p_table,@object) - data8 0x8000000000000000, 0x00003FFE - data8 0xAAAAAAAAAAAAAB80, 0x00003FFA - data8 0xB60B60B60B4FE884, 0x00003FF5 - data8 0xD00D00D1021D7370, 0x00003FEF - data8 0x93F27740C0C2F1CC, 0x00003FE9 - data8 0x8FA02AC65BCBD5BC, 0x00003FE2 -ASM_SIZE_DIRECTIVE(double_cosh_p_table) - -double_cosh_ab_table: -ASM_TYPE_DIRECTIVE(double_cosh_ab_table,@object) - data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC - data8 0x88888888884ECDD5, 0x00003FF8 - data8 0xD00D0C6DCC26A86B, 0x00003FF2 - data8 0x8000000000000002, 0x00003FFE - data8 0xAAAAAAAAAA402C77, 0x00003FFA - data8 0xB60B6CC96BDB144D, 0x00003FF5 -ASM_SIZE_DIRECTIVE(double_cosh_ab_table) - -double_cosh_j_table: -ASM_TYPE_DIRECTIVE(double_cosh_j_table,@object) - data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000 - data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000 - data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000 - data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000 - data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000 - data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000 - data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000 - data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000 - data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000 - data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000 - data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000 - data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000 - data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000 - data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000 - data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000 - data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000 - data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000 - data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000 - data8 
0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000 - data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000 - data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000 - data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000 - data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000 - data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000 - data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000 - data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000 - data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000 - data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000 - data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000 - data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000 - data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000 - data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000 - data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000 - data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000 - data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000 - data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000 - data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000 - data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000 - data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000 - data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000 - data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000 - data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000 - data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000 - data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000 - data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000 - data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000 - data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000 - data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000 - data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000 - data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 
0x00000000 - data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000 - data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000 - data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000 - data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000 - data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000 - data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000 - data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000 - data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000 - data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000 - data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000 - data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000 - data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000 - data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000 - data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000 - data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000 -ASM_SIZE_DIRECTIVE(double_cosh_j_table) - -.align 32 -.global coshl# - -.section .text -.proc coshl# -.align 32 - -coshl: - -#ifdef _LIBC -.global __ieee754_coshl# -.proc __ieee754_coshl# -__ieee754_coshl: -#endif - -// X NAN? - -{ .mfi - alloc r32 = ar.pfs,0,12,4,0 -(p0) fclass.m.unc p6,p7 = f8, 0xc3 - mov cosh_GR_all_ones = -1 -};; - -// This is more than we need but it is in preparation -// for the values we add for error support. 
We push three -// addresses on the stack (3*8) = 24 bytes and one tag - -{ .mfb - nop.m 999 -(p6) fma.s0 f8 = f8,f1,f8 -(p6) br.ret.spnt b0 ;; -} - - -// Make constant that will generate inexact when squared -// X infinity -{ .mfi - setf.sig cosh_FR_all_ones = cosh_GR_all_ones -(p0) fclass.m.unc p6,p0 = f8, 0x23 - nop.i 999 ;; -} - -{ .mfb - nop.m 999 -(p6) fmerge.s f8 = f0,f8 -(p6) br.ret.spnt b0 ;; -} +LOCAL_OBJECT_START(cosh_arg_reduction) +// data8 0xB8AA3B295C17F0BC, 0x00004005 // 64/log2 -- signif loaded with setf + data8 0xB17217F7D1000000, 0x00003FF8 // log2/64 high part + data8 0xCF79ABC9E3B39804, 0x00003FD0 // log2/64 low part + data8 0xb174ddc031aec0ea, 0x0000400c // Smallest x to overflow (11357.21655) +LOCAL_OBJECT_END(cosh_arg_reduction) + +LOCAL_OBJECT_START(cosh_p_table) + data8 0x8FA02AC65BCBD5BC, 0x00003FE2 // P6 + data8 0xD00D00D1021D7370, 0x00003FEF // P4 + data8 0xAAAAAAAAAAAAAB80, 0x00003FFA // P2 + data8 0x93F27740C0C2F1CC, 0x00003FE9 // P5 + data8 0xB60B60B60B4FE884, 0x00003FF5 // P3 + data8 0x8000000000000000, 0x00003FFE // P1 +LOCAL_OBJECT_END(cosh_p_table) + +LOCAL_OBJECT_START(cosh_ab_table) + data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC // A1 + data8 0x88888888884ECDD5, 0x00003FF8 // A2 + data8 0xD00D0C6DCC26A86B, 0x00003FF2 // A3 + data8 0x8000000000000002, 0x00003FFE // B1 + data8 0xAAAAAAAAAA402C77, 0x00003FFA // B2 + data8 0xB60B6CC96BDB144D, 0x00003FF5 // B3 +LOCAL_OBJECT_END(cosh_ab_table) + +LOCAL_OBJECT_START(cosh_j_hi_table) + data8 0xB504F333F9DE6484, 0x00003FFE + data8 0xB6FD91E328D17791, 0x00003FFE + data8 0xB8FBAF4762FB9EE9, 0x00003FFE + data8 0xBAFF5AB2133E45FB, 0x00003FFE + data8 0xBD08A39F580C36BF, 0x00003FFE + data8 0xBF1799B67A731083, 0x00003FFE + data8 0xC12C4CCA66709456, 0x00003FFE + data8 0xC346CCDA24976407, 0x00003FFE + data8 0xC5672A115506DADD, 0x00003FFE + data8 0xC78D74C8ABB9B15D, 0x00003FFE + data8 0xC9B9BD866E2F27A3, 0x00003FFE + data8 0xCBEC14FEF2727C5D, 0x00003FFE + data8 0xCE248C151F8480E4, 0x00003FFE + data8 
0xD06333DAEF2B2595, 0x00003FFE + data8 0xD2A81D91F12AE45A, 0x00003FFE + data8 0xD4F35AABCFEDFA1F, 0x00003FFE + data8 0xD744FCCAD69D6AF4, 0x00003FFE + data8 0xD99D15C278AFD7B6, 0x00003FFE + data8 0xDBFBB797DAF23755, 0x00003FFE + data8 0xDE60F4825E0E9124, 0x00003FFE + data8 0xE0CCDEEC2A94E111, 0x00003FFE + data8 0xE33F8972BE8A5A51, 0x00003FFE + data8 0xE5B906E77C8348A8, 0x00003FFE + data8 0xE8396A503C4BDC68, 0x00003FFE + data8 0xEAC0C6E7DD24392F, 0x00003FFE + data8 0xED4F301ED9942B84, 0x00003FFE + data8 0xEFE4B99BDCDAF5CB, 0x00003FFE + data8 0xF281773C59FFB13A, 0x00003FFE + data8 0xF5257D152486CC2C, 0x00003FFE + data8 0xF7D0DF730AD13BB9, 0x00003FFE + data8 0xFA83B2DB722A033A, 0x00003FFE + data8 0xFD3E0C0CF486C175, 0x00003FFE + data8 0x8000000000000000, 0x00003FFF // Center of table + data8 0x8164D1F3BC030773, 0x00003FFF + data8 0x82CD8698AC2BA1D7, 0x00003FFF + data8 0x843A28C3ACDE4046, 0x00003FFF + data8 0x85AAC367CC487B15, 0x00003FFF + data8 0x871F61969E8D1010, 0x00003FFF + data8 0x88980E8092DA8527, 0x00003FFF + data8 0x8A14D575496EFD9A, 0x00003FFF + data8 0x8B95C1E3EA8BD6E7, 0x00003FFF + data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF + data8 0x8EA4398B45CD53C0, 0x00003FFF + data8 0x9031DC431466B1DC, 0x00003FFF + data8 0x91C3D373AB11C336, 0x00003FFF + data8 0x935A2B2F13E6E92C, 0x00003FFF + data8 0x94F4EFA8FEF70961, 0x00003FFF + data8 0x96942D3720185A00, 0x00003FFF + data8 0x9837F0518DB8A96F, 0x00003FFF + data8 0x99E0459320B7FA65, 0x00003FFF + data8 0x9B8D39B9D54E5539, 0x00003FFF + data8 0x9D3ED9A72CFFB751, 0x00003FFF + data8 0x9EF5326091A111AE, 0x00003FFF + data8 0xA0B0510FB9714FC2, 0x00003FFF + data8 0xA27043030C496819, 0x00003FFF + data8 0xA43515AE09E6809E, 0x00003FFF + data8 0xA5FED6A9B15138EA, 0x00003FFF + data8 0xA7CD93B4E965356A, 0x00003FFF + data8 0xA9A15AB4EA7C0EF8, 0x00003FFF + data8 0xAB7A39B5A93ED337, 0x00003FFF + data8 0xAD583EEA42A14AC6, 0x00003FFF + data8 0xAF3B78AD690A4375, 0x00003FFF + data8 0xB123F581D2AC2590, 0x00003FFF + data8 0xB311C412A9112489, 0x00003FFF 
+ data8 0xB504F333F9DE6484, 0x00003FFF +LOCAL_OBJECT_END(cosh_j_hi_table) + +LOCAL_OBJECT_START(cosh_j_lo_table) + data4 0x1EB2FB13 + data4 0x1CE2CBE2 + data4 0x1DDC3CBC + data4 0x1EE9AA34 + data4 0x9EAEFDC1 + data4 0x9DBF517B + data4 0x1EF88AFB + data4 0x1E03B216 + data4 0x1E78AB43 + data4 0x9E7B1747 + data4 0x9EFE3C0E + data4 0x9D36F837 + data4 0x9DEE53E4 + data4 0x9E24AE8E + data4 0x1D912473 + data4 0x1EB243BE + data4 0x1E669A2F + data4 0x9BBC610A + data4 0x1E761035 + data4 0x9E0BE175 + data4 0x1CCB12A1 + data4 0x1D1BFE90 + data4 0x1DF2F47A + data4 0x1EF22F22 + data4 0x9E3F4A29 + data4 0x1EC01A5B + data4 0x1E8CAC3A + data4 0x9DBB3FAB + data4 0x1EF73A19 + data4 0x9BB795B5 + data4 0x1EF84B76 + data4 0x9EF5818B + data4 0x00000000 // Center of table + data4 0x1F77CACA + data4 0x1EF8A91D + data4 0x1E57C976 + data4 0x9EE8DA92 + data4 0x1EE85C9F + data4 0x1F3BF1AF + data4 0x1D80CA1E + data4 0x9D0373AF + data4 0x9F167097 + data4 0x1EB70051 + data4 0x1F6EB029 + data4 0x1DFD6D8E + data4 0x9EB319B0 + data4 0x1EBA2BEB + data4 0x1F11D537 + data4 0x1F0D5A46 + data4 0x9E5E7BCA + data4 0x9F3AAFD1 + data4 0x9E86DACC + data4 0x9F3EDDC2 + data4 0x1E496E3D + data4 0x9F490BF6 + data4 0x1DD1DB48 + data4 0x1E65EBFB + data4 0x9F427496 + data4 0x1F283C4A + data4 0x1F4B0047 + data4 0x1F130152 + data4 0x9E8367C0 + data4 0x9F705F90 + data4 0x1EFB3C53 + data4 0x1F32FB13 +LOCAL_OBJECT_END(cosh_j_lo_table) +.section .text +GLOBAL_IEEE754_ENTRY(coshl) -// Put 0.25 in f9; p6 true if x < 0.25 { .mlx - nop.m 999 -(p0) movl r32 = 0x000000000000fffd ;; -} - -{ .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + getf.exp r_signexp_x = f8 // Get signexp of x, must redo if unorm + movl r_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2 } - -{ .mfi - nop.m 999 -(p0) fmerge.s cosh_FR_X = f0,f8 - nop.i 999 +{ .mlx + addl r_ad1 = @ltoff(cosh_arg_reduction), gp + movl r_rshf_2to57 = 0x4778000000000000 // 1.10000 2^(63+57) } +;; { .mfi - nop.m 999 -(p0) fmerge.s cosh_FR_SGNX = f8,f1 - nop.i 
999 ;; + ld8 r_ad1 = [r_ad1] + fmerge.s f_ABS_X = f0,f8 + mov r_exp_0_25 = 0x0fffd // Form exponent for 0.25 } - { .mfi - nop.m 999 -(p0) fcmp.lt.unc p0,p7 = cosh_FR_X,f9 - nop.i 999 ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p7) br.cond.sptk L(COSH_BY_TBL) + nop.m 0 + fnorm.s1 f_NORM_X = f8 + mov r_exp_2tom57 = 0xffff-57 } ;; - -// COSH_BY_POLY: -// POLY cannot overflow so there is no need to call __libm_error_support -// Get the values of P_x from the table - -{ .mmi - nop.m 999 -(p0) addl r34 = @ltoff(double_cosh_p_table), gp - nop.i 999 +{ .mfi + setf.d f_RSHF_2TO57 = r_rshf_2to57 // Form const 1.100 * 2^120 + fclass.m p10,p0 = f8, 0x0b // Test for denorm + mov r_exp_mask = 0x1ffff } -;; - -{ .mmi - ld8 r34 = [r34] - nop.m 999 - nop.i 999 +{ .mlx + setf.sig f_INV_LN2_2TO63 = r_sig_inv_ln2 // Form 1/ln2 * 2^63 + movl r_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift } ;; - -// Calculate cosh_FR_X2 = ax*ax and cosh_FR_X4 = ax*ax*ax*ax -{ .mmf - nop.m 999 -(p0) ldfe cosh_FR_P1 = [r34],16 -(p0) fma.s1 cosh_FR_X2 = cosh_FR_X, cosh_FR_X, f0 ;; -} - -{ .mmi -(p0) ldfe cosh_FR_P2 = [r34],16 ;; -(p0) ldfe cosh_FR_P3 = [r34],16 - nop.i 999 ;; -} - -{ .mmi -(p0) ldfe cosh_FR_P4 = [r34],16 ;; -(p0) ldfe cosh_FR_P5 = [r34],16 - nop.i 999 ;; -} - { .mfi -(p0) ldfe cosh_FR_P6 = [r34],16 -(p0) fma.s1 cosh_FR_X4 = cosh_FR_X2, cosh_FR_X2, f0 - nop.i 999 ;; + nop.m 0 + fclass.m p7,p0 = f8, 0x07 // Test if x=0 + nop.i 0 } - -// Calculate cosh_FR_podd = x4 *(x4 * P_5 + P_3) + P_1 { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_poly_podd_temp1 = cosh_FR_X4, cosh_FR_P5, cosh_FR_P3 - nop.i 999 ;; + setf.exp f_2TOM57 = r_exp_2tom57 // Form 2^-57 for scaling + nop.f 0 + add r_ad3 = 0x90, r_ad1 // Point to ab_table } +;; { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_podd = cosh_FR_X4, cosh_FR_poly_podd_temp1, cosh_FR_P1 - nop.i 999 + setf.d f_RSHF = r_rshf // Form right shift const 1.100 * 2^63 + fclass.m p6,p0 = f8, 0xe3 // Test if x nan, inf + add r_ad4 = 0x2f0, r_ad1 // Point to j_hi_table 
midpoint } - -// Calculate cosh_FR_peven = p_even = x4 *(x4 * (x4 * P_6 + P_4) + P_2) -{ .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_poly_peven_temp1 = cosh_FR_X4, cosh_FR_P6, cosh_FR_P4 - nop.i 999 ;; +{ .mib + add r_ad2e = 0x20, r_ad1 // Point to p_table + nop.i 0 +(p10) br.cond.spnt COSH_DENORM // Branch if x denorm } +;; +// Common path -- return here from COSH_DENORM if x is unnorm +COSH_COMMON: { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_poly_peven_temp2 = cosh_FR_X4, cosh_FR_poly_peven_temp1, cosh_FR_P2 - nop.i 999 ;; + ldfe f_smlst_oflow_input = [r_ad2e],16 +(p7) fma.s0 f8 = f1, f1, f0 // Result = 1.0 if x=0 + add r_ad5 = 0x580, r_ad1 // Point to j_lo_table midpoint } - -{ .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_peven = cosh_FR_X4, cosh_FR_poly_peven_temp2, f0 - nop.i 999 ;; +{ .mib + ldfe f_log2by64_hi = [r_ad1],16 + and r_exp_x = r_exp_mask, r_signexp_x +(p7) br.ret.spnt b0 // Exit if x=0 } +;; -// Y_lo = x2*p_odd + p_even -// Calculate f8 = Y_hi + Y_lo +// Get the A coefficients for COSH_BY_TBL { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_Y_lo = cosh_FR_X2, cosh_FR_podd, cosh_FR_peven - nop.i 999 ;; + ldfe f_A1 = [r_ad3],16 + fcmp.lt.s1 p8,p9 = f8,f0 // Test for x<0 + cmp.lt p7,p0 = r_exp_x, r_exp_0_25 // Test x < 0.25 } - { .mfb - nop.m 999 -(p0) fma.s0 f8 = f1, f1, cosh_FR_Y_lo -(p0) br.ret.sptk b0 ;; -} - - -L(COSH_BY_TBL): - -// Now that we are at TBL; so far all we know is that |x| >= 0.25. -// The first two steps are the same for TBL and EXP, but if we are HUGE -// Double Extended -// Go to HUGE if |x| >= 2^14, 1000d (register-biased) is e = 14 (true) -// Double -// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true) -// Single -// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true) -// we want to leave now. 
Go to HUGE if |x| >= 2^14 -// 1000d (register-biased) is e = 14 (true) - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x000000000001000d ;; -} - -{ .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + add r_ad2o = 0x30, r_ad2e // Point to p_table odd coeffs +(p6) fma.s0 f8 = f8,f8,f0 // Result for x nan, inf +(p6) br.ret.spnt b0 // Exit for x nan, inf } +;; +// Calculate X2 = ax*ax for COSH_BY_POLY { .mfi - nop.m 999 -(p0) fcmp.ge.unc p6,p7 = cosh_FR_X,f9 - nop.i 999 ;; + ldfe f_log2by64_lo = [r_ad1],16 + nop.f 0 + nop.i 0 } - -{ .mib - nop.m 999 - nop.i 999 -(p6) br.cond.spnt L(COSH_HUGE) ;; +{ .mfb + ldfe f_A2 = [r_ad3],16 + fma.s1 f_X2 = f_NORM_X, f_NORM_X, f0 +(p7) br.cond.spnt COSH_BY_POLY } +;; -// r32 = 1 -// r34 = N-1 -// r35 = N -// r36 = j -// r37 = N+1 - -// TBL can never overflow -// cosh(x) = cosh(B+R) -// = cosh(B) cosh(R) + sinh(B) sinh(R) -// cosh(R) can be approximated by 1 + p_even -// sinh(R) can be approximated by p_odd - +// Here if |x| >= 0.25 +COSH_BY_TBL: // ****************************************************** -// STEP 1 (TBL and EXP) +// STEP 1 (TBL and EXP) - Argument reduction // ****************************************************** -// Get the following constants. -// f9 = Inv_log2by64 -// f10 = log2by64_hi -// f11 = log2by64_lo +// Get the following constants. +// Inv_log2by64 +// log2by64_hi +// log2by64_lo -{ .mmi -(p0) adds r32 = 0x1,r0 -(p0) addl r34 = @ltoff(double_cosh_arg_reduction), gp - nop.i 999 -} -;; // We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and // put them in an exponent. 
-// cosh_FR_spos = 2^(N-1) and cosh_FR_sneg = 2^(-N-1) -// r39 = 0xffff + (N-1) = 0xffff +N -1 -// r40 = 0xffff - (N +1) = 0xffff -N -1 - -{ .mlx - ld8 r34 = [r34] -(p0) movl r38 = 0x000000000000fffe ;; -} +// f_spos = 2^(N-1) and f_sneg = 2^(-N-1) +// 0xffff + (N-1) = 0xffff +N -1 +// 0xffff - (N +1) = 0xffff -N -1 -{ .mmi -(p0) ldfe cosh_FR_Inv_log2by64 = [r34],16 ;; -(p0) ldfe cosh_FR_log2by64_hi = [r34],16 - nop.i 999 ;; -} - -{ .mbb -(p0) ldfe cosh_FR_log2by64_lo = [r34],16 - nop.b 999 - nop.b 999 ;; -} - -// Get the A coefficients -// f9 = A_1 -// f10 = A_2 -// f11 = A_3 -{ .mmi - nop.m 999 -(p0) addl r34 = @ltoff(double_cosh_ab_table), gp - nop.i 999 -} -;; +// Calculate M and keep it as integer and floating point. +// M = round-to-integer(x*Inv_log2by64) +// f_M = M = truncate(ax/(log2/64)) +// Put the integer representation of M in r_M +// and the floating point representation of M in f_M +// Get the remaining A,B coefficients { .mmi - ld8 r34 = [r34] - nop.m 999 - nop.i 999 + ldfe f_A3 = [r_ad3],16 + nop.m 0 + nop.i 0 } ;; - -// Calculate M and keep it as integer and floating point. 
-// M = round-to-integer(x*Inv_log2by64) -// cosh_FR_M = M = truncate(ax/(log2/64)) -// Put the significand of M in r35 -// and the floating point representation of M in cosh_FR_M - +// Use constant (1.100*2^(63-6)) to get rounded M into rightmost significand +// |x| * 64 * 1/ln2 * 2^(63-6) + 1.1000 * 2^(63+(63-6)) { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_M = cosh_FR_X, cosh_FR_Inv_log2by64, f0 - nop.i 999 + nop.m 0 + fma.s1 f_M_temp = f_ABS_X, f_INV_LN2_2TO63, f_RSHF_2TO57 + mov r_signexp_0_5 = 0x0fffe // signexp of +0.5 } +;; +// Test for |x| >= overflow limit { .mfi -(p0) ldfe cosh_FR_A1 = [r34],16 - nop.f 999 - nop.i 999 ;; + ldfe f_B1 = [r_ad3],16 + fcmp.ge.s1 p6,p0 = f_ABS_X, f_smlst_oflow_input + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fcvt.fx.s1 cosh_FR_M_temp = cosh_FR_M - nop.i 999 ;; + ldfe f_B2 = [r_ad3],16 + nop.f 0 + mov r_exp_32 = 0x10004 } +;; -{ .mfi - nop.m 999 -(p0) fnorm.s1 cosh_FR_M = cosh_FR_M_temp - nop.i 999 ;; +// Subtract RSHF constant to get rounded M as a floating point value +// M_temp * 2^(63-6) - 2^63 +{ .mfb + ldfe f_B3 = [r_ad3],16 + fms.s1 f_M = f_M_temp, f_2TOM57, f_RSHF +(p6) br.cond.spnt COSH_HUGE // Branch if result will overflow } +;; { .mfi -(p0) getf.sig r35 = cosh_FR_M_temp - nop.f 999 - nop.i 999 ;; + getf.sig r_M = f_M_temp + nop.f 0 + cmp.ge p7,p6 = r_exp_x, r_exp_32 // Test if x >= 32 } +;; -// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It +// Calculate j. j is the signed extension of the six lsb of M. It // has a range of -32 thru 31. 
-// r35 = M -// r36 = j -{ .mii - nop.m 999 - nop.i 999 ;; -(p0) and r36 = 0x3f, r35 ;; -} // Calculate R -// f13 = f44 - f12*f10 = x - M*log2by64_hi -// f14 = f13 - f8*f11 = R = (x - M*log2by64_hi) - M*log2by64_lo - -{ .mfi - nop.m 999 -(p0) fnma.s1 cosh_FR_R_temp = cosh_FR_M, cosh_FR_log2by64_hi, cosh_FR_X - nop.i 999 -} +// ax - M*log2by64_hi +// R = (ax - M*log2by64_hi) - M*log2by64_lo { .mfi -(p0) ldfe cosh_FR_A2 = [r34],16 - nop.f 999 - nop.i 999 ;; + nop.m 0 + fnma.s1 f_R_temp = f_M, f_log2by64_hi, f_ABS_X + and r_j = 0x3f, r_M } +;; -{ .mfi - nop.m 999 -(p0) fnma.s1 cosh_FR_R = cosh_FR_M, cosh_FR_log2by64_lo, cosh_FR_R_temp - nop.i 999 +{ .mii + nop.m 0 + shl r_jshf = r_j, 0x2 // Shift j so can sign extend it +;; + sxt1 r_jshf = r_jshf } +;; -// Get the B coefficients -// f15 = B_1 -// f32 = B_2 -// f33 = B_3 - -{ .mmi -(p0) ldfe cosh_FR_A3 = [r34],16 ;; -(p0) ldfe cosh_FR_B1 = [r34],16 - nop.i 999 ;; +{ .mii + nop.m 0 + shr r_j = r_jshf, 0x2 // Now j has range -32 to 31 + nop.i 0 } +;; { .mmi -(p0) ldfe cosh_FR_B2 = [r34],16 ;; -(p0) ldfe cosh_FR_B3 = [r34],16 - nop.i 999 ;; + shladd r_ad_J_hi = r_j, 4, r_ad4 // pointer to Tjhi + sub r_Mmj = r_M, r_j // M-j + sub r_mj = r0, r_j // Form -j } +;; -{ .mii - nop.m 999 -(p0) shl r34 = r36, 0x2 ;; -(p0) sxt1 r37 = r34 ;; +// The TBL and EXP branches are merged and predicated +// If TBL, p6 true, 0.25 <= |x| < 32 +// If EXP, p7 true, 32 <= |x| < overflow_limit +// +// N = (M-j)/64 +{ .mfi + ldfe f_Tjhi = [r_ad_J_hi] + fnma.s1 f_R = f_M, f_log2by64_lo, f_R_temp + shr r_N = r_Mmj, 0x6 // N = (M-j)/64 } - -// ****************************************************** -// STEP 2 (TBL and EXP) -// ****************************************************** -// Calculate Rsquared and Rcubed in preparation for p_even and p_odd -// f12 = R*R*R -// f13 = R*R -// f14 = R <== from above - { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_Rsq = cosh_FR_R, cosh_FR_R, f0 -(p0) shr r36 = r37, 0x2 ;; + shladd r_ad_mJ_hi = r_mj, 4, r_ad4 // pointer 
to Tmjhi + nop.f 0 + shladd r_ad_mJ_lo = r_mj, 2, r_ad5 // pointer to Tmjlo } +;; -// r34 = M-j = r35 - r36 -// r35 = N = (M-j)/64 - -{ .mii -(p0) sub r34 = r35, r36 - nop.i 999 ;; -(p0) shr r35 = r34, 0x6 ;; +{ .mfi + sub r_2mNm1 = r_signexp_0_5, r_N // signexp 2^(-N-1) + nop.f 0 + shladd r_ad_J_lo = r_j, 2, r_ad5 // pointer to Tjlo } - -{ .mii -(p0) sub r40 = r38, r35 -(p0) adds r37 = 0x1, r35 -(p0) add r39 = r38, r35 ;; +{ .mfi + ldfe f_Tmjhi = [r_ad_mJ_hi] + nop.f 0 + add r_2Nm1 = r_signexp_0_5, r_N // signexp 2^(N-1) } +;; -// Get the address of the J table, add the offset, -// addresses are sinh_AD_mJ and sinh_AD_J, get the T value -// f32 = T(j)_hi -// f33 = T(j)_lo -// f34 = T(-j)_hi -// f35 = T(-j)_lo - -{ .mmi -(p0) sub r34 = r35, r32 -(p0) addl r37 = @ltoff(double_cosh_j_table), gp - nop.i 999 +{ .mmf + ldfs f_Tmjlo = [r_ad_mJ_lo] + setf.exp f_sneg = r_2mNm1 // Form 2^(-N-1) + nop.f 0 } ;; -{ .mfi - ld8 r37 = [r37] -(p0) fma.s1 cosh_FR_Rcub = cosh_FR_Rsq, cosh_FR_R, f0 - nop.i 999 +{ .mmf + ldfs f_Tjlo = [r_ad_J_lo] + setf.exp f_spos = r_2Nm1 // Form 2^(N-1) + nop.f 0 } +;; // ****************************************************** -// STEP 3 Now decide if we need to branch to EXP +// STEP 2 (TBL and EXP) // ****************************************************** -// Put 32 in f9; p6 true if x < 32 +// Calculate Rsquared and Rcubed in preparation for p_even and p_odd -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000010004 ;; +{ .mmf + nop.m 0 + nop.m 0 + fma.s1 f_Rsq = f_R, f_R, f0 } +;; -// Calculate p_even -// f34 = B_2 + Rsq *B_3 -// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3) -// f36 = peven = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3)) - -{ .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_peven_temp1 = cosh_FR_Rsq, cosh_FR_B3, cosh_FR_B2 - nop.i 999 ;; -} +// Calculate p_even +// B_2 + Rsq *B_3 +// B_1 + Rsq * (B_2 + Rsq *B_3) +// p_even = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3)) { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_peven_temp2 = cosh_FR_Rsq, 
cosh_FR_peven_temp1, cosh_FR_B1 - nop.i 999 + nop.m 0 + fma.s1 f_peven_temp1 = f_Rsq, f_B3, f_B2 + nop.i 0 } - // Calculate p_odd -// f34 = A_2 + Rsq *A_3 -// f35 = A_1 + Rsq * (A_2 + Rsq *A_3) -// f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3)) - +// A_2 + Rsq *A_3 +// A_1 + Rsq * (A_2 + Rsq *A_3) +// podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3)) { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_podd_temp1 = cosh_FR_Rsq, cosh_FR_A3, cosh_FR_A2 - nop.i 999 ;; + nop.m 0 + fma.s1 f_podd_temp1 = f_Rsq, f_A3, f_A2 + nop.i 0 } +;; { .mfi -(p0) setf.exp cosh_FR_N_temp1 = r39 - nop.f 999 - nop.i 999 ;; + nop.m 0 + fma.s1 f_Rcub = f_Rsq, f_R, f0 + nop.i 0 } +;; -{ .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_peven = cosh_FR_Rsq, cosh_FR_peven_temp2, f0 - nop.i 999 -} +// +// If TBL, +// Calculate S_hi and S_lo, and C_hi +// SC_hi_temp = sneg * Tmjhi +// S_hi = spos * Tjhi - SC_hi_temp +// S_hi = spos * Tjhi - (sneg * Tmjhi) +// C_hi = spos * Tjhi + SC_hi_temp +// C_hi = spos * Tjhi + (sneg * Tmjhi) { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_podd_temp2 = cosh_FR_Rsq, cosh_FR_podd_temp1, cosh_FR_A1 - nop.i 999 ;; + nop.m 0 +(p6) fma.s1 f_SC_hi_temp = f_sneg, f_Tmjhi, f0 + nop.i 0 } +;; +// If TBL, +// C_lo_temp3 = sneg * Tmjlo +// C_lo_temp4 = spos * Tjlo + C_lo_temp3 +// C_lo_temp4 = spos * Tjlo + (sneg * Tmjlo) { .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + nop.m 0 +(p6) fma.s1 f_C_lo_temp3 = f_sneg, f_Tmjlo, f0 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_podd = cosh_FR_podd_temp2, cosh_FR_Rcub, cosh_FR_R - nop.i 999 -} - -// sinh_GR_mj contains the table offset for -j -// sinh_GR_j contains the table offset for +j -// p6 is true when j <= 0 - -{ .mlx -(p0) setf.exp cosh_FR_N_temp2 = r40 -(p0) movl r40 = 0x0000000000000020 ;; + nop.m 0 + fma.s1 f_peven_temp2 = f_Rsq, f_peven_temp1, f_B1 + nop.i 0 } - { .mfi -(p0) sub GR_mJ = r40, r36 -(p0) fmerge.se cosh_FR_spos = cosh_FR_N_temp1, f1 -(p0) adds GR_J = 0x20, r36 ;; + nop.m 0 + fma.s1 f_podd_temp2 = f_Rsq, 
f_podd_temp1, f_A1 + nop.i 0 } +;; -{ .mii - nop.m 999 -(p0) shl GR_mJ = GR_mJ, 5 ;; -(p0) add AD_mJ = r37, GR_mJ ;; +// If EXP, +// Compute 2^(N-1) * Tjhi and 2^(N-1) * Tjlo +{ .mfi + nop.m 0 +(p7) fma.s1 f_Tjhi_spos = f_Tjhi, f_spos, f0 + nop.i 0 } - -{ .mmi - nop.m 999 -(p0) ldfe cosh_FR_Tmjhi = [AD_mJ],16 -(p0) shl GR_J = GR_J, 5 ;; +{ .mfi + nop.m 0 +(p7) fma.s1 f_Tjlo_spos = f_Tjlo, f_spos, f0 + nop.i 0 } +;; { .mfi -(p0) ldfs cosh_FR_Tmjlo = [AD_mJ],16 -(p0) fcmp.lt.unc.s1 p6,p7 = cosh_FR_X,f9 -(p0) add AD_J = r37, GR_J ;; + nop.m 0 +(p6) fma.s1 f_C_hi = f_spos, f_Tjhi, f_SC_hi_temp + nop.i 0 } +;; -{ .mmi -(p0) ldfe cosh_FR_Tjhi = [AD_J],16 ;; -(p0) ldfs cosh_FR_Tjlo = [AD_J],16 - nop.i 999 ;; +{ .mfi + nop.m 0 +(p6) fms.s1 f_S_hi = f_spos, f_Tjhi, f_SC_hi_temp + nop.i 0 } - -{ .mfb - nop.m 999 -(p0) fmerge.se cosh_FR_sneg = cosh_FR_N_temp2, f1 -(p7) br.cond.spnt L(COSH_BY_EXP) ;; +{ .mfi + nop.m 0 +(p6) fma.s1 f_C_lo_temp4 = f_spos, f_Tjlo, f_C_lo_temp3 + nop.i 0 } - -// ****************************************************** -// If NOT branch to EXP -// ****************************************************** -// Calculate C_hi -// ****************************************************** -// cosh_FR_C_hi_temp = cosh_FR_sneg * cosh_FR_Tmjhi -// cosh_FR_C_hi = cosh_FR_spos * cosh_FR_Tjhi + (cosh_FR_sneg * cosh_FR_Tmjhi) +;; { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_C_hi_temp = cosh_FR_sneg, cosh_FR_Tmjhi, f0 - nop.i 999 ;; + nop.m 0 + fma.s1 f_peven = f_Rsq, f_peven_temp2, f0 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_C_hi = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_C_hi_temp - nop.i 999 + nop.m 0 + fma.s1 f_podd = f_podd_temp2, f_Rcub, f_R + nop.i 0 } +;; -// ****************************************************** -// Calculate S_hi -// ****************************************************** -// cosh_FR_S_hi_temp1 = cosh_FR_sneg * cosh_FR_Tmjhi -// cosh_FR_S_hi = cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi_temp1 +// If TBL, +// C_lo_temp1 = spos * Tjhi - C_hi 
+// C_lo_temp2 = sneg * Tmjlo + C_lo_temp1 +// C_lo_temp2 = sneg * Tmjlo + (spos * Tjhi - C_hi) { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_S_hi_temp1 = cosh_FR_sneg, cosh_FR_Tmjhi, f0 - nop.i 999 ;; + nop.m 0 +(p6) fms.s1 f_C_lo_temp1 = f_spos, f_Tjhi, f_C_hi + nop.i 0 } - -// ****************************************************** -// Calculate C_lo -// ****************************************************** -// cosh_FR_C_lo_temp1 = cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi -// cosh_FR_C_lo_temp2 = cosh_FR_sneg * cosh_FR_Tmjlo + (cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi) -// cosh_FR_C_lo_temp1 = cosh_FR_sneg * cosh_FR_Tmjlo -// cosh_FR_C_lo_temp3 = cosh_FR_spos * cosh_FR_Tjlo + (cosh_FR_sneg * cosh_FR_Tmjlo) -// cosh_FR_C_lo = cosh_FR_C_lo_temp3 + cosh_FR_C_lo_temp2 +;; { .mfi - nop.m 999 -(p0) fms.s1 cosh_FR_C_lo_temp1 = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_C_hi - nop.i 999 + nop.m 0 +(p6) fma.s1 f_C_lo_temp2 = f_sneg, f_Tmjhi, f_C_lo_temp1 + nop.i 0 } +;; +// If EXP, +// Y_hi = 2^(N-1) * Tjhi +// Y_lo = 2^(N-1) * Tjhi * (p_odd + p_even) + 2^(N-1) * Tjlo { .mfi - nop.m 999 -(p0) fms.s1 cosh_FR_S_hi = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_S_hi_temp1 - nop.i 999 ;; + nop.m 0 +(p7) fma.s1 f_Y_lo_temp = f_peven, f1, f_podd + nop.i 0 } +;; +// If TBL, +// C_lo = C_lo_temp4 + C_lo_temp2 { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_C_lo_temp2 = cosh_FR_sneg, cosh_FR_Tmjhi, cosh_FR_C_lo_temp1 - nop.i 999 + nop.m 0 +(p6) fma.s1 f_C_lo = f_C_lo_temp4, f1, f_C_lo_temp2 + nop.i 0 } +;; +// If TBL, +// Y_hi = C_hi +// Y_lo = S_hi*p_odd + (C_hi*p_even + C_lo) { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_C_lo_temp1 = cosh_FR_sneg, cosh_FR_Tmjlo, f0 - nop.i 999 ;; + nop.m 0 +(p6) fma.s1 f_Y_lo_temp = f_C_hi, f_peven, f_C_lo + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_C_lo_temp3 = cosh_FR_spos, cosh_FR_Tjlo, cosh_FR_C_lo_temp1 - nop.i 999 ;; + nop.m 0 +(p7) fma.s1 f_Y_lo = f_Tjhi_spos, f_Y_lo_temp, f_Tjlo_spos + nop.i 0 } +;; +// Dummy multiply to generate inexact { .mfi - nop.m 999 
-(p0) fma.s1 cosh_FR_C_lo = cosh_FR_C_lo_temp3, f1, cosh_FR_C_lo_temp2 - nop.i 999 ;; + nop.m 0 + fmpy.s0 f_tmp = f_B2, f_B2 + nop.i 0 } - -// ****************************************************** -// cosh_FR_Y_lo_temp = cosh_FR_C_hi * cosh_FR_peven + cosh_FR_C_lo -// cosh_FR_Y_lo = cosh_FR_S_hi * cosh_FR_podd + cosh_FR_Y_lo_temp -// cosh_FR_COSH = Y_hi + Y_lo - { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_Y_lo_temp = cosh_FR_C_hi, cosh_FR_peven, cosh_FR_C_lo - nop.i 999 ;; + nop.m 0 +(p6) fma.s1 f_Y_lo = f_S_hi, f_podd, f_Y_lo_temp + nop.i 0 } +;; +// f8 = answer = Y_hi + Y_lo { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_Y_lo = cosh_FR_S_hi, cosh_FR_podd, cosh_FR_Y_lo_temp - nop.i 999 ;; + nop.m 0 +(p7) fma.s0 f8 = f_Y_lo, f1, f_Tjhi_spos + nop.i 0 } +;; +// f8 = answer = Y_hi + Y_lo { .mfb - nop.m 999 -(p0) fma.s0 f8 = cosh_FR_C_hi, f1, cosh_FR_Y_lo -(p0) br.ret.sptk b0 ;; + nop.m 0 +(p6) fma.s0 f8 = f_Y_lo, f1, f_C_hi + br.ret.sptk b0 // Exit for COSH_BY_TBL and COSH_BY_EXP } +;; -L(COSH_BY_EXP): - -// When p7 is true, we know that an overflow is not going to happen -// When p7 is false, we must check for possible overflow -// p7 is the over_SAFE flag -// f44 = Scale * (Y_hi + Y_lo) -// = cosh_FR_spos * (cosh_FR_Tjhi + cosh_FR_Y_lo) -{ .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_Y_lo_temp = cosh_FR_peven, f1, cosh_FR_podd - nop.i 999 +// Here if 0 < |x| < 0.25 +COSH_BY_POLY: +{ .mmf + ldfe f_P6 = [r_ad2e],16 + ldfe f_P5 = [r_ad2o],16 + nop.f 0 } +;; -// Now we are in EXP. This is the only path where an overflow is possible -// but not for certain. So this is the only path where over_SAFE has any use. 
-// r34 still has N-1 -// There is a danger of double-extended overflow if N-1 > 0x3ffe = 16382 -// There is a danger of double overflow if N-1 > 0x3fe = 1022 -// There is a danger of single overflow if N-1 > 0x7e = 126 +{ .mmi + ldfe f_P4 = [r_ad2e],16 + ldfe f_P3 = [r_ad2o],16 + nop.i 0 +} +;; -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000003ffe ;; +{ .mmi + ldfe f_P2 = [r_ad2e],16 + ldfe f_P1 = [r_ad2o],16 + nop.i 0 } +;; { .mfi -(p0) cmp.gt.unc p0,p7 = r34, r32 - nop.f 999 - nop.i 999 ;; + nop.m 0 + fma.s1 f_X3 = f_NORM_X, f_X2, f0 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_Y_lo = cosh_FR_Tjhi, cosh_FR_Y_lo_temp, cosh_FR_Tjlo - nop.i 999 ;; + nop.m 0 + fma.s1 f_X4 = f_X2, f_X2, f0 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_COSH_temp = cosh_FR_Y_lo, f1, cosh_FR_Tjhi - nop.i 999 ;; + nop.m 0 + fma.s1 f_poly65 = f_X2, f_P6, f_P5 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s0 f44 = cosh_FR_spos, cosh_FR_COSH_temp, f0 - nop.i 999 ;; + nop.m 0 + fma.s1 f_poly43 = f_X2, f_P4, f_P3 + nop.i 0 } +;; -// Dummy multiply to generate inexact { .mfi - nop.m 999 -(p7) fmpy.s0 cosh_FR_tmp = cosh_FR_all_ones, cosh_FR_all_ones - nop.i 999 ;; + nop.m 0 + fma.s1 f_poly21 = f_X2, f_P2, f_P1 + nop.i 0 } +;; -// If over_SAFE is set, return -{ .mfb - nop.m 999 -(p7) fmerge.s f8 = f44,f44 -(p7) br.ret.sptk b0 ;; +{ .mfi + nop.m 0 + fma.s1 f_poly6543 = f_X4, f_poly65, f_poly43 + nop.i 0 } - -// Else see if we overflowed -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// If WRE is set then an overflow will not occur in EXP. -// The input value that would cause a register (WRE) value to overflow is about 2^15 -// and this input would go into the HUGE path. -// Answer with WRE is in f43. 
+;; { .mfi - nop.m 999 -(p0) fsetc.s2 0x7F,0x42 - nop.i 999;; + nop.m 0 + fma.s1 f_poly6to1 = f_X4, f_poly6543, f_poly21 + nop.i 0 } +;; +// Dummy multiply to generate inexact { .mfi - nop.m 999 -(p0) fma.s2 f43 = cosh_FR_spos, cosh_FR_COSH_temp, f0 - nop.i 999 ;; + nop.m 0 + fmpy.s0 f_tmp = f_P6, f_P6 + nop.i 0 } - -// 103FF => 103FF -FFFF = 400(true) -// 400 + 3FF = 7FF, which is 1 more than the exponent of the largest -// double (7FE). So 0 103FF 8000000000000000 is one ulp more than -// largest double in register bias - -// 13FFF => 13FFF -FFFF = 4000(true) - -// Now set p8 if the answer with WRE is greater than or equal this value -// Also set p9 if the answer with WRE is less than or equal to negative this value - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000013fff ;; +{ .mfb + nop.m 0 + fma.s0 f8 = f_poly6to1, f_X2, f1 + br.ret.sptk b0 // Exit COSH_BY_POLY } +;; -{ .mmf - nop.m 999 -(p0) setf.exp f41 = r32 -(p0) fsetc.s2 0x7F,0x40 ;; -} -{ .mfi - nop.m 999 -(p0) fcmp.ge.unc.s1 p8, p0 = f43, f41 - nop.i 999 +// Here if x denorm or unorm +COSH_DENORM: +// Determine if x really a denorm and not a unorm +{ .mmf + getf.exp r_signexp_x = f_NORM_X + mov r_exp_denorm = 0x0c001 // Real denorms have exp < this + fmerge.s f_ABS_X = f0, f_NORM_X } +;; { .mfi - nop.m 999 -(p0) fmerge.ns f42 = f41, f41 - nop.i 999 ;; + nop.m 0 + fcmp.eq.s0 p10,p0 = f8, f0 // Set denorm flag + nop.i 0 } +;; -// The error tag for overflow is 63 -{ .mii - nop.m 999 - nop.i 999 ;; -(p8) mov GR_Parameter_TAG = 63 ;; +// Set p8 if really a denorm +{ .mmi + and r_exp_x = r_exp_mask, r_signexp_x +;; + cmp.lt p8,p9 = r_exp_x, r_exp_denorm + nop.i 0 } +;; +// Identify denormal operands. 
{ .mfb - nop.m 999 -(p0) fcmp.le.unc.s1 p9, p0 = f43, f42 -(p8) br.cond.spnt __libm_error_region ;; -} - -{ .mii - nop.m 999 - nop.i 999 ;; -(p9) mov GR_Parameter_TAG = 63 -} - -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.spnt __libm_error_region ;; -} - -// Dummy multiply to generate inexact -{ .mfi - nop.m 999 -(p0) fmpy.s0 cosh_FR_tmp = cosh_FR_all_ones, cosh_FR_all_ones - nop.i 999 ;; + nop.m 0 +(p8) fma.s0 f8 = f8,f8,f1 // If x denorm, result=1+x^2 +(p9) br.cond.sptk COSH_COMMON // Return to main path if x unorm } +;; { .mfb - nop.m 999 -(p0) fmerge.s f8 = f44,f44 -(p0) br.ret.sptk b0 ;; + nop.m 0 + nop.f 0 + br.ret.sptk b0 // Exit if x denorm } +;; -// for COSH_HUGE, put 24000 in exponent; take sign from input; add 1 -// SAFE: SAFE is always 0 for HUGE - -L(COSH_HUGE): - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000015dbf ;; +// Here if |x| >= overflow limit +COSH_HUGE: +// for COSH_HUGE, put 24000 in exponent; take sign from input +{ .mmi + mov r_exp_huge = 0x15dbf +;; + setf.exp f_huge = r_exp_huge + nop.i 0 } +;; { .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + alloc r32 = ar.pfs,0,5,4,0 + fma.s1 f_signed_hi_lo = f_huge, f1, f1 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 cosh_FR_hi_lo = f1, f9, f1 - nop.i 999 ;; + nop.m 0 + fma.s0 f_pre_result = f_signed_hi_lo, f_huge, f0 + mov GR_Parameter_TAG = 63 } +;; -{ .mfi - nop.m 999 -(p0) fma.s0 f44 = f9, cosh_FR_hi_lo, f0 -(p0) mov GR_Parameter_TAG = 63 -} -.endp coshl -ASM_SIZE_DIRECTIVE(coshl) +GLOBAL_IEEE754_END(coshl) -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue + { .mfi - add GR_Parameter_Y=-32,sp // Parameter 2 value + add GR_Parameter_Y=-32,sp // Parameter 2 value nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + add sp=-64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP=gp // Save gp 
};; + { .mmi - stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + stfe [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; + .body { .mib - stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y - nop.b 0 // Parameter 3 address + stfe [GR_Parameter_X] = f8 // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 } { .mib - stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + stfe [GR_Parameter_Y] = f_pre_result // STORE Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; + { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; + { .mmi - ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack + ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp - add sp = 64,sp // Restore stack pointer - mov b0 = GR_SAVE_B0 // Restore return address + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address };; + { .mib - mov gp = GR_SAVE_GP // Restore gp - mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - br.ret.sptk b0 // Return + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) + .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_exp.S b/sysdeps/ia64/fpu/e_exp.S index db02336ecf..5ae8afeb99 100644 --- a/sysdeps/ia64/fpu/e_exp.S +++ b/sysdeps/ia64/fpu/e_exp.S @@ -1,10 +1,10 @@ .file "exp.s" -// Copyright (C) 2000, 
2001, Intel Corporation + +// Copyright (c) 2000 - 2002, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,26 +20,26 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial version +// 2/02/00 Initial version // 3/07/00 exp(inf) = inf but now does NOT call error support // exp(-inf) = 0 but now does NOT call error support // 4/04/00 Unwind support added @@ -48,6 +48,10 @@ // 11/30/00 Reworked to shorten main path, widen main path to include all // args in normal range, and add quick exit for 0, nan, inf. // 12/05/00 Loaded constants earlier with setf to save 2 cycles. +// 02/05/02 Corrected uninitialize predicate in POSSIBLE_UNDERFLOW path +// 05/20/02 Cleaned up namespace and sf0 syntax +// 09/07/02 Force inexact flag +// 11/15/02 Split underflow path into zero/nonzero; eliminated fma in main path // API //============================================================== @@ -67,187 +71,167 @@ // Construct 2^M // Get 2^(index_1/128) from table_1; // Get 2^(index_2/8) from table_2; -// Calculate exp(r) by series +// Calculate exp(r) by 5th order polynomial // r = x - n (log2/128)_high // delta = - n (log2/128)_low // Calculate exp(delta) as 1 + delta -// Special values +// Special values //============================================================== // exp(+0) = 1.0 // exp(-0) = 1.0 -// exp(+qnan) = +qnan -// exp(-qnan) = -qnan -// exp(+snan) = +qnan -// exp(-snan) = -qnan +// exp(+qnan) = +qnan +// exp(-qnan) = -qnan +// exp(+snan) = +qnan +// exp(-snan) = -qnan -// exp(-inf) = +0 +// exp(-inf) = +0 // exp(+inf) = +inf -// Overfow and Underfow +// Overflow and Underflow 
//======================= -// exp(-x) = smallest double normal when -// x = -708.396 = c086232bdd7abcd2 - // exp(x) = largest double normal when -// x = 709.7827 = 40862e42fefa39ef +// x = 709.7827 = 0x40862e42fefa39ef + +// exp(x) = smallest double normal when +// x = -708.396 = 0xc086232bdd7abcd2 +// exp(x) = largest round-to-nearest single zero when +// x = -745.1332 = 0xc0874910d52d3052 // Registers used //============================================================== -// Floating Point registers used: -// f8, input -// f9 -> f15, f32 -> f60 +// Floating Point registers used: +// f8, input, output +// f6 -> f15, f32 -> f49 -// General registers used: -// r32 -> r60 +// General registers used: +// r14 -> r40 // Predicate registers used: // p6 -> p15 -#include "libm_support.h" - // Assembly macros //============================================================== -exp_GR_rshf = r33 -EXP_AD_TB1 = r34 -EXP_AD_TB2 = r35 -EXP_AD_P = r36 - -exp_GR_N = r37 -exp_GR_index_1 = r38 -exp_GR_index_2_16 = r39 - -exp_GR_biased_M = r40 -exp_GR_index_1_16 = r41 -EXP_AD_T1 = r42 -EXP_AD_T2 = r43 -exp_GR_sig_inv_ln2 = r44 - -exp_GR_17ones = r45 -exp_GR_one = r46 -exp_TB1_size = r47 -exp_TB2_size = r48 -exp_GR_rshf_2to56 = r49 - -exp_GR_gt_ln = r50 -exp_GR_exp_2tom56 = r51 - -exp_GR_17ones_m1 = r52 - -GR_SAVE_B0 = r53 -GR_SAVE_PFS = r54 -GR_SAVE_GP = r55 -GR_SAVE_SP = r56 - -GR_Parameter_X = r57 -GR_Parameter_Y = r58 -GR_Parameter_RESULT = r59 -GR_Parameter_TAG = r60 - - -FR_X = f10 -FR_Y = f1 -FR_RESULT = f8 - -EXP_RSHF_2TO56 = f6 -EXP_INV_LN2_2TO63 = f7 -EXP_W_2TO56_RSH = f9 -EXP_2TOM56 = f11 -exp_P4 = f12 -exp_P3 = f13 -exp_P2 = f14 -exp_P1 = f15 - -exp_ln2_by_128_hi = f33 -exp_ln2_by_128_lo = f34 - -EXP_RSHF = f35 -EXP_Nfloat = f36 -exp_W = f37 -exp_r = f38 -exp_f = f39 - -exp_rsq = f40 -exp_rcube = f41 - -EXP_2M = f42 -exp_S1 = f43 -exp_T1 = f44 - -EXP_MIN_DBL_OFLOW_ARG = f45 -EXP_MAX_DBL_ZERO_ARG = f46 -EXP_MAX_DBL_NORM_ARG = f47 -EXP_MAX_DBL_UFLOW_ARG = f48 
-EXP_MIN_DBL_NORM_ARG = f49 -exp_rP4pP3 = f50 -exp_P_lo = f51 -exp_P_hi = f52 -exp_P = f53 -exp_S = f54 - -EXP_NORM_f8 = f56 - -exp_wre_urm_f8 = f57 -exp_ftz_urm_f8 = f57 - -exp_gt_pln = f58 - -exp_S2 = f59 -exp_T2 = f60 +rRshf = r14 +rAD_TB1 = r15 +rAD_T1 = r15 +rAD_TB2 = r16 +rAD_T2 = r16 +rAD_P = r17 +rN = r18 +rIndex_1 = r19 +rIndex_2_16 = r20 +rM = r21 +rBiased_M = r21 +rIndex_1_16 = r21 +rSig_inv_ln2 = r22 +rExp_bias = r23 +rExp_mask = r24 +rTmp = r25 +rRshf_2to56 = r26 +rGt_ln = r27 +rExp_2tom56 = r28 + + +GR_SAVE_B0 = r33 +GR_SAVE_PFS = r34 +GR_SAVE_GP = r35 +GR_SAVE_SP = r36 + +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 + + +FR_X = f10 +FR_Y = f1 +FR_RESULT = f8 + +fRSHF_2TO56 = f6 +fINV_LN2_2TO63 = f7 +fW_2TO56_RSH = f9 +f2TOM56 = f11 +fP5 = f12 +fP54 = f12 +fP5432 = f12 +fP4 = f13 +fP3 = f14 +fP32 = f14 +fP2 = f15 +fP = f15 + +fLn2_by_128_hi = f33 +fLn2_by_128_lo = f34 + +fRSHF = f35 +fNfloat = f36 +fNormX = f37 +fR = f38 +fF = f39 + +fRsq = f40 +f2M = f41 +fS1 = f42 +fT1 = f42 +fS2 = f43 +fT2 = f43 +fS = f43 +fWre_urm_f8 = f44 +fFtz_urm_f8 = f44 + +fMIN_DBL_OFLOW_ARG = f45 +fMAX_DBL_ZERO_ARG = f46 +fMAX_DBL_NORM_ARG = f47 +fMIN_DBL_NORM_ARG = f48 +fGt_pln = f49 +fTmp = f49 // Data tables //============================================================== -#ifdef _LIBC -.rodata -#else -.data -#endif - +RODATA .align 16 // ************* DO NOT CHANGE ORDER OF THESE TABLES ******************** // double-extended 1/ln(2) // 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88 -// 3fff b8aa 3b29 5c17 f0bc +// 3fff b8aa 3b29 5c17 f0bc // For speed the significand will be loaded directly with a movl and setf.sig // and the exponent will be bias+63 instead of bias+0. Thus subsequent // computations need to scale appropriately. -// The constant 128/ln(2) is needed for the computation of w. This is also +// The constant 128/ln(2) is needed for the computation of w. This is also // obtained by scaling the computations. 
// -// Two shifting constants are loaded directly with movl and setf.d. -// 1. EXP_RSHF_2TO56 = 1.1000..00 * 2^(63-7) +// Two shifting constants are loaded directly with movl and setf.d. +// 1. fRSHF_2TO56 = 1.1000..00 * 2^(63-7) // This constant is added to x*1/ln2 to shift the integer part of // x*128/ln2 into the rightmost bits of the significand. -// The result of this fma is EXP_W_2TO56_RSH. -// 2. EXP_RSHF = 1.1000..00 * 2^(63) -// This constant is subtracted from EXP_W_2TO56_RSH * 2^(-56) to give +// The result of this fma is fW_2TO56_RSH. +// 2. fRSHF = 1.1000..00 * 2^(63) +// This constant is subtracted from fW_2TO56_RSH * 2^(-56) to give // the integer part of w, n, as a floating-point number. -// The result of this fms is EXP_Nfloat. +// The result of this fms is fNfloat. -exp_table_1: -ASM_TYPE_DIRECTIVE(exp_table_1,@object) -data8 0x40862e42fefa39f0 // smallest dbl overflow arg -data8 0xc0874c0000000000 // approx largest arg for zero result -data8 0x40862e42fefa39ef // largest dbl arg to give normal dbl result -data8 0xc086232bdd7abcd3 // largest dbl underflow arg -data8 0xc086232bdd7abcd2 // smallest dbl arg to give normal dbl result -data8 0x0 // pad +LOCAL_OBJECT_START(exp_table_1) +data8 0x40862e42fefa39f0 // smallest dbl overflow arg, +709.7827 +data8 0xc0874910d52d3052 // largest arg for rnd-to-nearest 0 result, -745.133 +data8 0x40862e42fefa39ef // largest dbl arg to give normal dbl result, +709.7827 +data8 0xc086232bdd7abcd2 // smallest dbl arg to give normal dbl result, -708.396 data8 0xb17217f7d1cf79ab , 0x00003ff7 // ln2/128 hi data8 0xc9e3b39803f2f6af , 0x00003fb7 // ln2/128 lo - +// // Table 1 is 2^(index_1/128) where // index_1 goes from 0 to 15 - +// data8 0x8000000000000000 , 0x00003FFF data8 0x80B1ED4FD999AB6C , 0x00003FFF data8 0x8164D1F3BC030773 , 0x00003FFF @@ -264,12 +248,11 @@ data8 0x88980E8092DA8527 , 0x00003FFF data8 0x8955EE03618E5FDD , 0x00003FFF data8 0x8A14D575496EFD9A , 0x00003FFF data8 0x8AD4C6452C728924 , 0x00003FFF 
-ASM_SIZE_DIRECTIVE(exp_table_1) +LOCAL_OBJECT_END(exp_table_1) // Table 2 is 2^(index_1/8) where // index_2 goes from 0 to 7 -exp_table_2: -ASM_TYPE_DIRECTIVE(exp_table_2,@object) +LOCAL_OBJECT_START(exp_table_2) data8 0x8000000000000000 , 0x00003FFF data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF data8 0x9837F0518DB8A96F , 0x00003FFF @@ -278,413 +261,356 @@ data8 0xB504F333F9DE6484 , 0x00003FFF data8 0xC5672A115506DADD , 0x00003FFF data8 0xD744FCCAD69D6AF4 , 0x00003FFF data8 0xEAC0C6E7DD24392F , 0x00003FFF -ASM_SIZE_DIRECTIVE (exp_table_2) - +LOCAL_OBJECT_END(exp_table_2) -exp_p_table: -ASM_TYPE_DIRECTIVE(exp_p_table,@object) -data8 0x3f8111116da21757 //P_4 -data8 0x3fa55555d787761c //P_3 -data8 0x3fc5555555555414 //P_2 -data8 0x3fdffffffffffd6a //P_1 -ASM_SIZE_DIRECTIVE(exp_p_table) +LOCAL_OBJECT_START(exp_p_table) +data8 0x3f8111116da21757 //P5 +data8 0x3fa55555d787761c //P4 +data8 0x3fc5555555555414 //P3 +data8 0x3fdffffffffffd6a //P2 +LOCAL_OBJECT_END(exp_p_table) -.align 32 -.global exp# .section .text -.proc exp# -.align 32 -exp: -#ifdef _LIBC -.global __ieee754_exp# -__ieee754_exp: -#endif +GLOBAL_IEEE754_ENTRY(exp) { .mlx - alloc r32=ar.pfs,1,24,4,0 - movl exp_GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2 + nop.m 0 + movl rSig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2 } { .mlx - addl EXP_AD_TB1 = @ltoff(exp_table_1), gp - movl exp_GR_rshf_2to56 = 0x4768000000000000 ;; // 1.10000 2^(63+56) + addl rAD_TB1 = @ltoff(exp_table_1), gp + movl rRshf_2to56 = 0x4768000000000000 // 1.10000 2^(63+56) } ;; -// We do this fnorm right at the beginning to take any enabled -// faults and to normalize any input unnormals so that SWA is not taken. 
{ .mfi - ld8 EXP_AD_TB1 = [EXP_AD_TB1] - fclass.m p8,p0 = f8,0x07 // Test for x=0 - mov exp_GR_17ones = 0x1FFFF + ld8 rAD_TB1 = [rAD_TB1] + fclass.m p8,p0 = f8,0x07 // Test for x=0 + mov rExp_mask = 0x1ffff } { .mfi - mov exp_TB1_size = 0x100 - fnorm EXP_NORM_f8 = f8 - mov exp_GR_exp_2tom56 = 0xffff-56 + mov rExp_bias = 0xffff + fnorm.s1 fNormX = f8 + mov rExp_2tom56 = 0xffff-56 } ;; // Form two constants we need -// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128 +// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128 // 1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand -{ .mmf - setf.sig EXP_INV_LN2_2TO63 = exp_GR_sig_inv_ln2 // form 1/ln2 * 2^63 - setf.d EXP_RSHF_2TO56 = exp_GR_rshf_2to56 // Form const 1.100 * 2^(63+56) - fclass.m p9,p0 = f8,0x22 // Test for x=-inf +{ .mfi + setf.sig fINV_LN2_2TO63 = rSig_inv_ln2 // form 1/ln2 * 2^63 + fclass.m p9,p0 = f8,0x22 // Test for x=-inf + nop.i 0 +} +{ .mlx + setf.d fRSHF_2TO56 = rRshf_2to56 // Form const 1.100 * 2^(63+56) + movl rRshf = 0x43e8000000000000 // 1.10000 2^63 for right shift } ;; -{ .mlx - setf.exp EXP_2TOM56 = exp_GR_exp_2tom56 // form 2^-56 for scaling Nfloat - movl exp_GR_rshf = 0x43e8000000000000 // 1.10000 2^63 for right shift +{ .mfi + ldfpd fMIN_DBL_OFLOW_ARG, fMAX_DBL_ZERO_ARG = [rAD_TB1],16 + fclass.m p10,p0 = f8,0x1e1 // Test for x=+inf, nan, NaT + nop.i 0 } { .mfb - mov exp_TB2_size = 0x80 -(p8) fma.d f8 = f1,f1,f0 // quick exit for x=0 -(p8) br.ret.spnt b0 -;; + setf.exp f2TOM56 = rExp_2tom56 // form 2^-56 for scaling Nfloat +(p9) fma.d.s0 f8 = f0,f0,f0 // quick exit for x=-inf +(p9) br.ret.spnt b0 } +;; { .mfi - ldfpd EXP_MIN_DBL_OFLOW_ARG, EXP_MAX_DBL_ZERO_ARG = [EXP_AD_TB1],16 - fclass.m p10,p0 = f8,0x21 // Test for x=+inf - nop.i 999 + ldfpd fMAX_DBL_NORM_ARG, fMIN_DBL_NORM_ARG = [rAD_TB1],16 + nop.f 0 + nop.i 0 } { .mfb - nop.m 999 -(p9) fma.d f8 = f0,f0,f0 // quick exit for x=-inf -(p9) br.ret.spnt b0 -;; + setf.d fRSHF = rRshf // Form right shift const 1.100 * 2^63 +(p8) fma.d.s0 
f8 = f1,f1,f0 // quick exit for x=0 +(p8) br.ret.spnt b0 } - -{ .mmf - ldfpd EXP_MAX_DBL_NORM_ARG, EXP_MAX_DBL_UFLOW_ARG = [EXP_AD_TB1],16 - setf.d EXP_RSHF = exp_GR_rshf // Form right shift const 1.100 * 2^63 - fclass.m p11,p0 = f8,0xc3 // Test for x=nan ;; -} { .mfb - ldfd EXP_MIN_DBL_NORM_ARG = [EXP_AD_TB1],16 - nop.f 999 -(p10) br.ret.spnt b0 // quick exit for x=+inf -;; + ldfe fLn2_by_128_hi = [rAD_TB1],16 +(p10) fma.d.s0 f8 = f8,f8,f0 // Result if x=+inf, nan, NaT +(p10) br.ret.spnt b0 // quick exit for x=+inf, nan, NaT } +;; { .mfi - ldfe exp_ln2_by_128_hi = [EXP_AD_TB1],16 - nop.f 999 - nop.i 999 -;; + ldfe fLn2_by_128_lo = [rAD_TB1],16 + fcmp.eq.s0 p6,p0 = f8, f0 // Dummy to set D + nop.i 0 } - - -{ .mfb - ldfe exp_ln2_by_128_lo = [EXP_AD_TB1],16 -(p11) fmerge.s f8 = EXP_NORM_f8, EXP_NORM_f8 -(p11) br.ret.spnt b0 // quick exit for x=nan ;; -} -// After that last load, EXP_AD_TB1 points to the beginning of table 1 +// After that last load, rAD_TB1 points to the beginning of table 1 // W = X * Inv_log2_by_128 // By adding 1.10...0*2^63 we shift and get round_int(W) in significand. // We actually add 1.10...0*2^56 to X * Inv_log2 to do the same thing. 
{ .mfi - nop.m 999 - fma.s1 EXP_W_2TO56_RSH = EXP_NORM_f8, EXP_INV_LN2_2TO63, EXP_RSHF_2TO56 - nop.i 999 -;; + nop.m 0 + fma.s1 fW_2TO56_RSH = fNormX, fINV_LN2_2TO63, fRSHF_2TO56 + nop.i 0 } - +;; // Divide arguments into the following categories: -// Certain Underflow/zero p11 - -inf < x <= MAX_DBL_ZERO_ARG -// Certain Underflow p12 - MAX_DBL_ZERO_ARG < x <= MAX_DBL_UFLOW_ARG -// Possible Underflow p13 - MAX_DBL_UFLOW_ARG < x < MIN_DBL_NORM_ARG +// Certain Underflow p11 - -inf < x <= MAX_DBL_ZERO_ARG +// Possible Underflow p13 - MAX_DBL_ZERO_ARG < x < MIN_DBL_NORM_ARG // Certain Safe - MIN_DBL_NORM_ARG <= x <= MAX_DBL_NORM_ARG // Possible Overflow p14 - MAX_DBL_NORM_ARG < x < MIN_DBL_OFLOW_ARG // Certain Overflow p15 - MIN_DBL_OFLOW_ARG <= x < +inf // -// If the input is really a double arg, then there will never be "Possible -// Underflow" or "Possible Overflow" arguments. +// If the input is really a double arg, then there will never be +// "Possible Overflow" arguments. // { .mfi - add EXP_AD_TB2 = exp_TB1_size, EXP_AD_TB1 - fcmp.ge.s1 p15,p14 = EXP_NORM_f8,EXP_MIN_DBL_OFLOW_ARG - nop.i 999 -;; + add rAD_TB2 = 0x100, rAD_TB1 + fcmp.ge.s1 p15,p0 = fNormX,fMIN_DBL_OFLOW_ARG + nop.i 0 } +;; { .mfi - add EXP_AD_P = exp_TB2_size, EXP_AD_TB2 - fcmp.le.s1 p11,p12 = EXP_NORM_f8,EXP_MAX_DBL_ZERO_ARG - nop.i 999 -;; + add rAD_P = 0x80, rAD_TB2 + fcmp.le.s1 p11,p0 = fNormX,fMAX_DBL_ZERO_ARG + nop.i 0 } +;; { .mfb - ldfpd exp_P4, exp_P3 = [EXP_AD_P] ,16 -(p14) fcmp.gt.unc.s1 p14,p0 = EXP_NORM_f8,EXP_MAX_DBL_NORM_ARG -(p15) br.cond.spnt L(EXP_CERTAIN_OVERFLOW) -;; + ldfpd fP5, fP4 = [rAD_P] ,16 + fcmp.gt.s1 p14,p0 = fNormX,fMAX_DBL_NORM_ARG +(p15) br.cond.spnt EXP_CERTAIN_OVERFLOW } +;; - -// Nfloat = round_int(W) -// The signficand of EXP_W_2TO56_RSH contains the rounded integer part of W, +// Nfloat = round_int(W) +// The signficand of fW_2TO56_RSH contains the rounded integer part of W, // as a twos complement number in the lower bits (that is, it may be negative). 
-// That twos complement number (called N) is put into exp_GR_N. +// That twos complement number (called N) is put into rN. -// Since EXP_W_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56 -// before the shift constant 1.10000 * 2^63 is subtracted to yield EXP_Nfloat. -// Thus, EXP_Nfloat contains the floating point version of N +// Since fW_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56 +// before the shift constant 1.10000 * 2^63 is subtracted to yield fNfloat. +// Thus, fNfloat contains the floating point version of N - -{ .mfi - nop.m 999 -(p12) fcmp.le.unc p12,p0 = EXP_NORM_f8,EXP_MAX_DBL_UFLOW_ARG - nop.i 999 -} { .mfb - ldfpd exp_P2, exp_P1 = [EXP_AD_P] - fms.s1 EXP_Nfloat = EXP_W_2TO56_RSH, EXP_2TOM56, EXP_RSHF -(p11) br.cond.spnt L(EXP_CERTAIN_UNDERFLOW_ZERO) -;; + ldfpd fP3, fP2 = [rAD_P] + fms.s1 fNfloat = fW_2TO56_RSH, f2TOM56, fRSHF +(p11) br.cond.spnt EXP_CERTAIN_UNDERFLOW } +;; { .mfi - getf.sig exp_GR_N = EXP_W_2TO56_RSH -(p13) fcmp.lt.unc p13,p0 = EXP_NORM_f8,EXP_MIN_DBL_NORM_ARG - nop.i 999 -;; + getf.sig rN = fW_2TO56_RSH + nop.f 0 + nop.i 0 } +;; +// rIndex_1 has index_1 +// rIndex_2_16 has index_2 * 16 +// rBiased_M has M +// rIndex_1_16 has index_1 * 16 -// exp_GR_index_1 has index_1 -// exp_GR_index_2_16 has index_2 * 16 -// exp_GR_biased_M has M -// exp_GR_index_1_16 has index_1 * 16 - -// r2 has true M +// rM has true M +// r = x - Nfloat * ln2_by_128_hi +// f = 1 - Nfloat * ln2_by_128_lo { .mfi - and exp_GR_index_1 = 0x0f, exp_GR_N - fnma.s1 exp_r = EXP_Nfloat, exp_ln2_by_128_hi, EXP_NORM_f8 - shr r2 = exp_GR_N, 0x7 + and rIndex_1 = 0x0f, rN + fnma.s1 fR = fNfloat, fLn2_by_128_hi, fNormX + shr rM = rN, 0x7 } { .mfi - and exp_GR_index_2_16 = 0x70, exp_GR_N - fnma.s1 exp_f = EXP_Nfloat, exp_ln2_by_128_lo, f1 - nop.i 999 -;; + and rIndex_2_16 = 0x70, rN + fnma.s1 fF = fNfloat, fLn2_by_128_lo, f1 + nop.i 0 } +;; - -// EXP_AD_T1 has address of T1 -// EXP_AD_T2 has address if T2 +// rAD_T1 has address of T1 +// rAD_T2 has 
address if T2 { .mmi - addl exp_GR_biased_M = 0xffff, r2 - add EXP_AD_T2 = EXP_AD_TB2, exp_GR_index_2_16 - shladd EXP_AD_T1 = exp_GR_index_1, 4, EXP_AD_TB1 -;; + add rBiased_M = rExp_bias, rM + add rAD_T2 = rAD_TB2, rIndex_2_16 + shladd rAD_T1 = rIndex_1, 4, rAD_TB1 } - +;; // Create Scale = 2^M -// r = x - Nfloat * ln2_by_128_hi -// f = 1 - Nfloat * ln2_by_128_lo - { .mmi - setf.exp EXP_2M = exp_GR_biased_M - ldfe exp_T2 = [EXP_AD_T2] - nop.i 999 -;; + setf.exp f2M = rBiased_M + ldfe fT2 = [rAD_T2] + nop.i 0 } +;; // Load T1 and T2 { .mfi - ldfe exp_T1 = [EXP_AD_T1] - nop.f 999 - nop.i 999 -;; + ldfe fT1 = [rAD_T1] + fmpy.s0 fTmp = fLn2_by_128_lo, fLn2_by_128_lo // Force inexact + nop.i 0 } - +;; { .mfi - nop.m 999 - fma.s1 exp_rsq = exp_r, exp_r, f0 - nop.i 999 + nop.m 0 + fma.s1 fRsq = fR, fR, f0 + nop.i 0 } { .mfi - nop.m 999 - fma.s1 exp_rP4pP3 = exp_r, exp_P4, exp_P3 - nop.i 999 -;; + nop.m 0 + fma.s1 fP54 = fR, fP5, fP4 + nop.i 0 } - - +;; { .mfi - nop.m 999 - fma.s1 exp_rcube = exp_r, exp_rsq, f0 - nop.i 999 + nop.m 0 + fcmp.lt.s1 p13,p0 = fNormX,fMIN_DBL_NORM_ARG + nop.i 0 } { .mfi - nop.m 999 - fma.s1 exp_P_lo = exp_r, exp_rP4pP3, exp_P2 - nop.i 999 -;; + nop.m 0 + fma.s1 fP32 = fR, fP3, fP2 + nop.i 0 } - +;; { .mfi - nop.m 999 - fma.s1 exp_P_hi = exp_rsq, exp_P1, exp_r - nop.i 999 + nop.m 0 + fma.s1 fP5432 = fRsq, fP54, fP32 + nop.i 0 } -{ .mfi - nop.m 999 - fma.s1 exp_S2 = exp_f,exp_T2,f0 - nop.i 999 ;; -} { .mfi - nop.m 999 - fma.s1 exp_S1 = EXP_2M,exp_T1,f0 - nop.i 999 -;; + nop.m 0 + fma.s1 fS1 = f2M,fT1,f0 + nop.i 0 } - - { .mfi - nop.m 999 - fma.s1 exp_P = exp_rcube, exp_P_lo, exp_P_hi - nop.i 999 -;; + nop.m 0 + fma.s1 fS2 = fF,fT2,f0 + nop.i 0 } +;; { .mfi - nop.m 999 - fma.s1 exp_S = exp_S1,exp_S2,f0 - nop.i 999 -;; + nop.m 0 + fma.s1 fP = fRsq, fP5432, fR + nop.i 0 } - -{ .bbb -(p12) br.cond.spnt L(EXP_CERTAIN_UNDERFLOW) -(p13) br.cond.spnt L(EXP_POSSIBLE_UNDERFLOW) -(p14) br.cond.spnt L(EXP_POSSIBLE_OVERFLOW) -;; +{ .mfi + nop.m 0 + fma.s1 fS 
= fS1,fS2,f0 + nop.i 0 } +;; +{ .mbb + nop.m 0 +(p13) br.cond.spnt EXP_POSSIBLE_UNDERFLOW +(p14) br.cond.spnt EXP_POSSIBLE_OVERFLOW +} +;; { .mfb - nop.m 999 - fma.d f8 = exp_S, exp_P, exp_S - br.ret.sptk b0 ;; // Normal path exit + nop.m 0 + fma.d.s0 f8 = fS, fP, fS + br.ret.sptk b0 // Normal path exit } +;; -L(EXP_POSSIBLE_OVERFLOW): +EXP_POSSIBLE_OVERFLOW: -// We got an answer. EXP_MAX_DBL_NORM_ARG < x < EXP_MIN_DBL_OFLOW_ARG -// overflow is a possibility, not a certainty +// Here if fMAX_DBL_NORM_ARG < x < fMIN_DBL_OFLOW_ARG +// This cannot happen if input is a double, only if input higher precision. +// Overflow is a possibility, not a certainty. -{ .mfi - nop.m 999 - fsetc.s2 0x7F,0x42 - nop.i 999 ;; -} +// Recompute result using status field 2 with user's rounding mode, +// and wre set. If result is larger than largest double, then we have +// overflow { .mfi - nop.m 999 - fma.d.s2 exp_wre_urm_f8 = exp_S, exp_P, exp_S - nop.i 999 ;; + mov rGt_ln = 0x103ff // Exponent for largest dbl + 1 ulp + fsetc.s2 0x7F,0x42 // Get user's round mode, set wre + nop.i 0 } - -// We define an overflow when the answer with -// WRE set -// user-defined rounding mode -// is ldn +1 - -// Is the exponent 1 more than the largest double? -// If so, go to ERROR RETURN, else get the answer and -// leave. - -// Largest double is 7FE (biased double) -// 7FE - 3FF + FFFF = 103FE -// Create + largest_double_plus_ulp -// Create - largest_double_plus_ulp -// Calculate answer with WRE set. 
- -// Cases when answer is ldn+1 are as follows: -// ldn ldn+1 -// --+----------|----------+------------ -// | -// +inf +inf -inf -// RN RN -// RZ +;; { .mfi - nop.m 999 - fsetc.s2 0x7F,0x40 - mov exp_GR_gt_ln = 0x103ff ;; + setf.exp fGt_pln = rGt_ln // Create largest double + 1 ulp + fma.d.s2 fWre_urm_f8 = fS, fP, fS // Result with wre set + nop.i 0 } +;; { .mfi - setf.exp exp_gt_pln = exp_GR_gt_ln - nop.f 999 - nop.i 999 ;; + nop.m 0 + fsetc.s2 0x7F,0x40 // Turn off wre in sf2 + nop.i 0 } +;; { .mfi - nop.m 999 - fcmp.ge.unc.s1 p6, p0 = exp_wre_urm_f8, exp_gt_pln - nop.i 999 ;; + nop.m 0 + fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow + nop.i 0 } +;; { .mfb - nop.m 999 - nop.f 999 -(p6) br.cond.spnt L(EXP_CERTAIN_OVERFLOW) ;; // Branch if really overflow + nop.m 0 + nop.f 0 +(p6) br.cond.spnt EXP_CERTAIN_OVERFLOW // Branch if overflow } +;; { .mfb - nop.m 999 - fma.d f8 = exp_S, exp_P, exp_S - br.ret.sptk b0 ;; // Exit if really no overflow + nop.m 0 + fma.d.s0 f8 = fS, fP, fS + br.ret.sptk b0 // Exit if really no overflow } +;; -L(EXP_CERTAIN_OVERFLOW): +EXP_CERTAIN_OVERFLOW: { .mmi - sub exp_GR_17ones_m1 = exp_GR_17ones, r0, 1 ;; - setf.exp f9 = exp_GR_17ones_m1 - nop.i 999 ;; + sub rTmp = rExp_mask, r0, 1 +;; + setf.exp fTmp = rTmp + nop.i 0 } +;; { .mfi - nop.m 999 - fmerge.s FR_X = f8,f8 - nop.i 999 + alloc r32=ar.pfs,1,4,4,0 + fmerge.s FR_X = f8,f8 + nop.i 0 } { .mfb - mov GR_Parameter_TAG = 14 - fma.d FR_RESULT = f9, f9, f0 // Set I,O and +INF result - br.cond.sptk __libm_error_region ;; + mov GR_Parameter_TAG = 14 + fma.d.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result + br.cond.sptk __libm_error_region } +;; -L(EXP_POSSIBLE_UNDERFLOW): +EXP_POSSIBLE_UNDERFLOW: -// We got an answer. 
EXP_MAX_DBL_UFLOW_ARG < x < EXP_MIN_DBL_NORM_ARG -// underflow is a possibility, not a certainty +// Here if fMAX_DBL_ZERO_ARG < x < fMIN_DBL_NORM_ARG +// Underflow is a possibility, not a certainty // We define an underflow when the answer with // ftz set @@ -709,81 +635,111 @@ L(EXP_POSSIBLE_UNDERFLOW): // largest dn smallest normal { .mfi - nop.m 999 - fsetc.s2 0x7F,0x41 - nop.i 999 ;; + nop.m 0 + fsetc.s2 0x7F,0x41 // Get user's round mode, set ftz + nop.i 0 } +;; + { .mfi - nop.m 999 - fma.d.s2 exp_ftz_urm_f8 = exp_S, exp_P, exp_S - nop.i 999 ;; + nop.m 0 + fma.d.s2 fFtz_urm_f8 = fS, fP, fS // Result with ftz set + nop.i 0 } +;; + { .mfi - nop.m 999 - fsetc.s2 0x7F,0x40 - nop.i 999 ;; + nop.m 0 + fsetc.s2 0x7F,0x40 // Turn off ftz in sf2 + nop.i 0 } +;; + { .mfi - nop.m 999 - fcmp.eq.unc.s1 p6, p0 = exp_ftz_urm_f8, f0 - nop.i 999 ;; + nop.m 0 + fcmp.eq.s1 p6, p7 = fFtz_urm_f8, f0 // Test for underflow + nop.i 0 } -{ .mfb - nop.m 999 - nop.f 999 -(p6) br.cond.spnt L(EXP_CERTAIN_UNDERFLOW) ;; // Branch if really underflow +{ .mfi + nop.m 0 + fma.d.s0 f8 = fS, fP, fS // Compute result, set I, maybe U + nop.i 0 } -{ .mfb - nop.m 999 - fma.d f8 = exp_S, exp_P, exp_S - br.ret.sptk b0 ;; // Exit if really no underflow +;; + +{ .mbb + nop.m 0 +(p6) br.cond.spnt EXP_UNDERFLOW_COMMON // Branch if really underflow +(p7) br.ret.sptk b0 // Exit if really no underflow } +;; -L(EXP_CERTAIN_UNDERFLOW): -{ .mfi - nop.m 999 - fmerge.s FR_X = f8,f8 - nop.i 999 +EXP_CERTAIN_UNDERFLOW: +// Here if x < fMAX_DBL_ZERO_ARG +// Result will be zero (or smallest denorm if round to +inf) with I, U set +{ .mmi + mov rTmp = 1 +;; + setf.exp fTmp = rTmp // Form small normal + nop.i 0 } +;; + { .mfb - mov GR_Parameter_TAG = 15 - fma.d FR_RESULT = exp_S, exp_P, exp_S // Set I,U and tiny result - br.cond.sptk __libm_error_region ;; + nop.m 0 + fma.d.s0 f8 = fTmp, fTmp, f0 // Set I,U, tiny (+0.0) result + br.cond.sptk EXP_UNDERFLOW_COMMON } +;; -L(EXP_CERTAIN_UNDERFLOW_ZERO): -{ .mmi - mov 
exp_GR_one = 1 ;; - setf.exp f9 = exp_GR_one - nop.i 999 ;; +EXP_UNDERFLOW_COMMON: +// Determine if underflow result is zero or nonzero +{ .mfi + alloc r32=ar.pfs,1,4,4,0 + fcmp.eq.s1 p6, p0 = f8, f0 + nop.i 0 } +;; -{ .mfi - nop.m 999 - fmerge.s FR_X = f8,f8 - nop.i 999 +{ .mfb + nop.m 0 + fmerge.s FR_X = fNormX,fNormX +(p6) br.cond.spnt EXP_UNDERFLOW_ZERO } +;; + +EXP_UNDERFLOW_NONZERO: +// Here if x < fMIN_DBL_NORM_ARG and result nonzero; +// I, U are set { .mfb - mov GR_Parameter_TAG = 15 - fma.d FR_RESULT = f9, f9, f0 // Set I,U and tiny (+0.0) result - br.cond.sptk __libm_error_region ;; + mov GR_Parameter_TAG = 15 + nop.f 0 // FR_RESULT already set + br.cond.sptk __libm_error_region } +;; -.endp exp -ASM_SIZE_DIRECTIVE(exp) +EXP_UNDERFLOW_ZERO: +// Here if x < fMIN_DBL_NORM_ARG and result zero; +// I, U are set +{ .mfb + mov GR_Parameter_TAG = 15 + nop.f 0 // FR_RESULT already set + br.cond.sptk __libm_error_region +} +;; +GLOBAL_IEEE754_END(exp) -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi -.fframe 64 +.fframe 64 add sp=-64,sp // Create new stack nop.f 0 mov GR_SAVE_GP=gp // Save gp @@ -791,24 +747,24 @@ __libm_error_region: { .mmi stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack add GR_Parameter_X = 16,sp // Parameter 1 address -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 };; .body { .mib - stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address - nop.b 0 + stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 } { .mib - stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack - add GR_Parameter_Y = 
-16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; { .mmi ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack @@ -817,12 +773,11 @@ __libm_error_region: mov b0 = GR_SAVE_B0 // Restore return address };; { .mib - mov gp = GR_SAVE_GP // Restore gp + mov gp = GR_SAVE_GP // Restore gp mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs br.ret.sptk b0 // Return -};; +};; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_expf.S b/sysdeps/ia64/fpu/e_expf.S index 2aad021335..8d620b6ffa 100644 --- a/sysdeps/ia64/fpu/e_expf.S +++ b/sysdeps/ia64/fpu/e_expf.S @@ -1,10 +1,10 @@ .file "expf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2002, Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,589 +35,501 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // History -//============================================================== -// 4/04/00 Unwind update -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +//********************************************************************* +// 02/02/00 Original version +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. -// 8/21/00 Improvements to save 2 cycles on main path, and shorten x=0 case +// 08/21/00 Improvements to save 2 cycles on main path, and shorten x=0 case // 12/07/00 Widen main path, shorten x=inf, nan paths +// 03/15/01 Fix monotonicity problem around x=0 for round to +inf +// 02/05/02 Corrected uninitialize predicate in POSSIBLE_UNDERFLOW path +// 05/20/02 Cleaned up namespace and sf0 syntax +// 07/26/02 Algorithm changed, accuracy improved +// 09/26/02 support of higher precision inputs added, underflow threshold +// corrected +// 11/15/02 Improved performance on Itanium 2, added possible over/under paths +// +// +// API +//********************************************************************* +// float expf(float) +// +// Overview of operation +//********************************************************************* +// Take the input x. w is "how many log2/128 in x?" 
+// w = x * 64/log2 +// NJ = int(w) +// x = NJ*log2/64 + R + +// NJ = 64*n + j +// x = n*log2 + (log2/64)*j + R +// +// So, exp(x) = 2^n * 2^(j/64)* exp(R) +// +// T = 2^n * 2^(j/64) +// Construct 2^n +// Get 2^(j/64) table +// actually all the entries of 2^(j/64) table are stored in DP and +// with exponent bits set to 0 -> multiplication on 2^n can be +// performed by doing logical "or" operation with bits presenting 2^n + +// exp(R) = 1 + (exp(R) - 1) +// P = exp(R) - 1 approximated by Taylor series of 3rd degree +// P = A3*R^3 + A2*R^2 + R, A3 = 1/6, A2 = 1/2 // -#include "libm_support.h" - -// Assembly macros -//============================================================== -// integer registers used - - exp_GR_0x0f = r33 - exp_GR_0xf0 = r34 +// The final result is reconstructed as follows +// exp(x) = T + T*P - EXP_AD_P_1 = r36 - EXP_AD_P_2 = r37 - EXP_AD_T1 = r38 - EXP_AD_T2 = r39 - exp_GR_Mint = r40 +// Special values +//********************************************************************* +// expf(+0) = 1.0 +// expf(-0) = 1.0 - exp_GR_Mint_p_128 = r41 - exp_GR_Ind1 = r42 - EXP_AD_M1 = r43 - exp_GR_Ind2 = r44 - EXP_AD_M2 = r45 +// expf(+qnan) = +qnan +// expf(-qnan) = -qnan +// expf(+snan) = +qnan +// expf(-snan) = -qnan - exp_GR_min_oflow = r46 - exp_GR_max_zero = r47 - exp_GR_max_norm = r48 - exp_GR_max_uflow = r49 - exp_GR_min_norm = r50 +// expf(-inf) = +0 +// expf(+inf) = +inf - exp_GR_17ones = r51 - exp_GR_gt_ln = r52 - exp_GR_T2_size = r53 +// Overflow and Underflow +//********************************************************************* +// expf(x) = largest single normal when +// x = 88.72283 = 0x42b17217 - exp_GR_17ones_m1 = r56 - exp_GR_one = r57 +// expf(x) = smallest single normal when +// x = -87.33654 = 0xc2aeac4f +// expf(x) = largest round-to-nearest single zero when +// x = -103.97208 = 0xc2cff1b5 -GR_SAVE_B0 = r53 -GR_SAVE_PFS = r55 -GR_SAVE_GP = r54 +// Registers used 
+//********************************************************************* +// Floating Point registers used: +// f8, input +// f6,f7, f9 -> f15, f32 -> f40 -GR_Parameter_X = r59 -GR_Parameter_Y = r60 -GR_Parameter_RESULT = r61 -GR_Parameter_TAG = r62 +// General registers used: +// r3, r23 -> r38 -FR_X = f10 -FR_Y = f1 -FR_RESULT = f8 +// Predicate registers used: +// p10 -> p15 +// Assembly macros +//********************************************************************* +// integer registers used +// scratch +rNJ = r3 + +rTmp = r23 +rJ = r23 +rN = r24 +rTblAddr = r25 +rA3 = r26 +rExpHalf = r27 +rLn2Div64 = r28 +r17ones_m1 = r29 +rGt_ln = r29 +rRightShifter = r30 +r64DivLn2 = r31 +// stacked +GR_SAVE_PFS = r32 +GR_SAVE_B0 = r33 +GR_SAVE_GP = r34 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Parameter_TAG = r38 // floating point registers used - - EXP_MIN_SGL_OFLOW_ARG = f11 - EXP_MAX_SGL_ZERO_ARG = f12 - EXP_MAX_SGL_NORM_ARG = f13 - EXP_MAX_SGL_UFLOW_ARG = f14 - EXP_MIN_SGL_NORM_ARG = f15 - - exp_coeff_P5 = f32 - exp_coeff_P6 = f33 - exp_coeff_P3 = f34 - exp_coeff_P4 = f35 - - exp_coeff_P1 = f36 - exp_coeff_P2 = f37 - exp_Mx = f38 - exp_Mfloat = f39 - exp_R = f40 - - exp_P1 = f41 - exp_P2 = f42 - exp_P3 = f43 - exp_Rsq = f44 - exp_R4 = f45 - - exp_P4 = f46 - exp_P5 = f47 - exp_P6 = f48 - exp_P7 = f49 - exp_T1 = f50 - - exp_T2 = f51 - exp_T = f52 - exp_A = f53 - exp_norm_f8 = f54 - exp_wre_urm_f8 = f55 - - exp_ftz_urm_f8 = f56 - exp_gt_pln = f57 - - -#ifdef _LIBC -.rodata -#else -.data -#endif - +FR_X = f10 +FR_Y = f1 +FR_RESULT = f8 +// scratch +fRightShifter = f6 +f64DivLn2 = f7 +fNormX = f9 +fNint = f10 +fN = f11 +fR = f12 +fLn2Div64 = f13 +fA2 = f14 +fA3 = f15 +// stacked +fP = f32 +fT = f33 +fMIN_SGL_OFLOW_ARG = f34 +fMAX_SGL_ZERO_ARG = f35 +fMAX_SGL_NORM_ARG = f36 +fMIN_SGL_NORM_ARG = f37 +fRSqr = f38 +fTmp = f39 +fGt_pln = f39 +fWre_urm_f8 = f40 +fFtz_urm_f8 = f40 + + +RODATA .align 16 -exp_coeff_1_table: 
-ASM_TYPE_DIRECTIVE(exp_coeff_1_table,@object) -data8 0x3F56F35FDE4F8563 // p5 -data8 0x3F2A378BEFECCFDD // p6 -data8 0x3FE00000258C581D // p1 -data8 0x3FC555557AE7B3D4 // p2 -ASM_SIZE_DIRECTIVE(exp_coeff_1_table) - - -exp_coeff_2_table: -ASM_TYPE_DIRECTIVE(exp_coeff_2_table,@object) -data8 0x3FA5551BB6592FAE // p3 -data8 0x3F8110E8EBFFD485 // p4 -ASM_SIZE_DIRECTIVE(exp_coeff_2_table) - - -exp_T2_table: -ASM_TYPE_DIRECTIVE(exp_T2_table,@object) -data8 0xa175cf9cd7d85844 , 0x00003f46 // exp(-128) -data8 0xdb7279415a1f9eed , 0x00003f47 // exp(-127) -data8 0x95213b242bd8ca5f , 0x00003f49 // exp(-126) -data8 0xcab03c968c989f83 , 0x00003f4a // exp(-125) -data8 0x89bdb674702961ad , 0x00003f4c // exp(-124) -data8 0xbb35a2eec278be35 , 0x00003f4d // exp(-123) -data8 0xfe71b17f373e7e7a , 0x00003f4e // exp(-122) -data8 0xace9a6ec52a39b63 , 0x00003f50 // exp(-121) -data8 0xeb03423fe393cf1c , 0x00003f51 // exp(-120) -data8 0x9fb52c5bcaef1693 , 0x00003f53 // exp(-119) -data8 0xd910b6377ed60bf1 , 0x00003f54 // exp(-118) -data8 0x9382dad8a9fdbfe4 , 0x00003f56 // exp(-117) -data8 0xc87d0a84dea869a3 , 0x00003f57 // exp(-116) -data8 0x883efb4c6d1087b0 , 0x00003f59 // exp(-115) -data8 0xb92d7373dce9a502 , 0x00003f5a // exp(-114) -data8 0xfbaeb020577fb0cb , 0x00003f5b // exp(-113) -ASM_SIZE_DIRECTIVE(exp_T2_table) - - -exp_T1_table: -ASM_TYPE_DIRECTIVE(exp_T1_table,@object) -data8 0x8000000000000000 , 0x00003fff // exp(16 * 0) -data8 0x87975e8540010249 , 0x00004016 // exp(16 * 1) -data8 0x8fa1fe625b3163ec , 0x0000402d // exp(16 * 2) -data8 0x9826b576512a59d7 , 0x00004044 // exp(16 * 3) -data8 0xa12cc167acbe6902 , 0x0000405b // exp(16 * 4) -data8 0xaabbcdcc279f59e4 , 0x00004072 // exp(16 * 5) -data8 0xb4dbfaadc045d16f , 0x00004089 // exp(16 * 6) -data8 0xbf95e372ccdbf146 , 0x000040a0 // exp(16 * 7) -data8 0xcaf2a62eea10bbfb , 0x000040b7 // exp(16 * 8) -data8 0xd6fbeb62fddbd340 , 0x000040ce // exp(16 * 9) -data8 0xe3bbee32e4a440ea , 0x000040e5 // exp(16 * 10) -data8 0xf13d8517c34199a8 , 
0x000040fc // exp(16 * 11) -data8 0xff8c2b166241eedd , 0x00004113 // exp(16 * 12) -data8 0x875a04c0b38d6129 , 0x0000412b // exp(16 * 13) -data8 0x8f610127db6774d7 , 0x00004142 // exp(16 * 14) -data8 0x97e1dd87e5c20bb6 , 0x00004159 // exp(16 * 15) -ASM_SIZE_DIRECTIVE(exp_T1_table) - -// Argument Reduction -// exp_Mx = (int)f8 ==> The value of f8 rounded to int is placed into the -// significand of exp_Mx as a two's -// complement number. - -// Later we want to have exp_Mx in a general register. Do this with a getf.sig -// and call the general register exp_GR_Mint - -// exp_Mfloat = (float)(int)f8 ==> the two's complement number in -// significand of exp_Mx is turned -// into a floating point number. -// R = 1 - exp_Mfloat ==> reduced argument - -// Core Approximation -// Calculate a series in R -// R * p6 + p5 -// R * p4 + p3 -// R * p2 + p1 -// R^2 -// R^4 -// R^2(R * p6 + p5) + (R * p4 + p3) -// R^2(R * p2 + p1) -// R^4(R^2(R * p6 + p5) + (R * p4 + p3)) + (R^2(R * p2 + p1)) -// R + 1 -// exp(R) = (1 + R) + R^4(R^2(R * p6 + p5) + (R * p4 + p3)) + (R^2(R * p2 + p1)) -// exp(R) = 1 + R + R^2 * p1 + R^3 * p2 + R^4 * p3 + R^5 * p4 + R^6 * p5 + R^7 * p6 - -// Reconstruction -// signficand of exp_Mx is two's complement, -// -103 < x < 89 -// The smallest single denormal is 2^-149 = ssdn -// For e^x = ssdn -// x = log(ssdn) = -103.279 -// But with rounding result goes to ssdn until -103.972079 -// The largest single normal is 1.<23 1's> 2^126 ~ 2^127 = lsn -// For e^x = lsn -// x = log(lsn) = 88.7228 +LOCAL_OBJECT_START(_expf_table) +data4 0x42b17218 // Smallest sgl arg to overflow sgl result, +88.7228 +data4 0xc2cff1b5 // Largest sgl for rnd-to-nearest 0 result, -103.9720 +data4 0x42b17217 // Largest sgl arg to give normal sgl result, +88.7228 +data4 0xc2aeac4f // Smallest sgl arg to give normal sgl result, -87.3365 // -// expf overflows when x > 42b17218 = 88.7228 -// expf returns largest single denormal when x = c2aeac50 -// expf goes to zero when x < c2cff1b5 - -// 
Consider range of 8-bit two's complement, -128 ---> 127 -// Add 128; range becomes 0 ---> 255 - -// The number (=i) in 0 ---> 255 is used as offset into two tables. - -// i = abcd efgh = abcd * 16 + efgh = i1 * 16 + i2 - -// i1 = (exp_GR_Mint + 128) & 0xf0 (show 0xf0 as -0x10 to avoid assembler error) -// (The immediate in the AND is an 8-bit two's complement) -// i1 = i1 + start of T1 table (EXP_AD_T1) -// Note that the entries in T1 are double-extended numbers on 16-byte boundaries -// and that i1 is already shifted left by 16 after the AND. - -// i2 must be shifted left by 4 before adding to the start of the table. -// i2 = ((exp_GR_Mint + 128) & 0x0f) << 4 -// i2 = i2 + start of T2 table (EXP_AD_T2) - -// T = T1 * T2 -// A = T * (1 + R) -// answer = T * (R^2 * p1 + R^3 * p2 + R^4 * p3 + R^5 * p4 + R^6 * p5 + R^7 * p6) + -// T * (1 + R) -// = T * exp(R) - +// 2^(j/64) table, j goes from 0 to 63 +data8 0x0000000000000000 // 2^(0/64) +data8 0x00002C9A3E778061 // 2^(1/64) +data8 0x000059B0D3158574 // 2^(2/64) +data8 0x0000874518759BC8 // 2^(3/64) +data8 0x0000B5586CF9890F // 2^(4/64) +data8 0x0000E3EC32D3D1A2 // 2^(5/64) +data8 0x00011301D0125B51 // 2^(6/64) +data8 0x0001429AAEA92DE0 // 2^(7/64) +data8 0x000172B83C7D517B // 2^(8/64) +data8 0x0001A35BEB6FCB75 // 2^(9/64) +data8 0x0001D4873168B9AA // 2^(10/64) +data8 0x0002063B88628CD6 // 2^(11/64) +data8 0x0002387A6E756238 // 2^(12/64) +data8 0x00026B4565E27CDD // 2^(13/64) +data8 0x00029E9DF51FDEE1 // 2^(14/64) +data8 0x0002D285A6E4030B // 2^(15/64) +data8 0x000306FE0A31B715 // 2^(16/64) +data8 0x00033C08B26416FF // 2^(17/64) +data8 0x000371A7373AA9CB // 2^(18/64) +data8 0x0003A7DB34E59FF7 // 2^(19/64) +data8 0x0003DEA64C123422 // 2^(20/64) +data8 0x0004160A21F72E2A // 2^(21/64) +data8 0x00044E086061892D // 2^(22/64) +data8 0x000486A2B5C13CD0 // 2^(23/64) +data8 0x0004BFDAD5362A27 // 2^(24/64) +data8 0x0004F9B2769D2CA7 // 2^(25/64) +data8 0x0005342B569D4F82 // 2^(26/64) +data8 0x00056F4736B527DA // 2^(27/64) +data8 
0x0005AB07DD485429 // 2^(28/64) +data8 0x0005E76F15AD2148 // 2^(29/64) +data8 0x0006247EB03A5585 // 2^(30/64) +data8 0x0006623882552225 // 2^(31/64) +data8 0x0006A09E667F3BCD // 2^(32/64) +data8 0x0006DFB23C651A2F // 2^(33/64) +data8 0x00071F75E8EC5F74 // 2^(34/64) +data8 0x00075FEB564267C9 // 2^(35/64) +data8 0x0007A11473EB0187 // 2^(36/64) +data8 0x0007E2F336CF4E62 // 2^(37/64) +data8 0x00082589994CCE13 // 2^(38/64) +data8 0x000868D99B4492ED // 2^(39/64) +data8 0x0008ACE5422AA0DB // 2^(40/64) +data8 0x0008F1AE99157736 // 2^(41/64) +data8 0x00093737B0CDC5E5 // 2^(42/64) +data8 0x00097D829FDE4E50 // 2^(43/64) +data8 0x0009C49182A3F090 // 2^(44/64) +data8 0x000A0C667B5DE565 // 2^(45/64) +data8 0x000A5503B23E255D // 2^(46/64) +data8 0x000A9E6B5579FDBF // 2^(47/64) +data8 0x000AE89F995AD3AD // 2^(48/64) +data8 0x000B33A2B84F15FB // 2^(49/64) +data8 0x000B7F76F2FB5E47 // 2^(50/64) +data8 0x000BCC1E904BC1D2 // 2^(51/64) +data8 0x000C199BDD85529C // 2^(52/64) +data8 0x000C67F12E57D14B // 2^(53/64) +data8 0x000CB720DCEF9069 // 2^(54/64) +data8 0x000D072D4A07897C // 2^(55/64) +data8 0x000D5818DCFBA487 // 2^(56/64) +data8 0x000DA9E603DB3285 // 2^(57/64) +data8 0x000DFC97337B9B5F // 2^(58/64) +data8 0x000E502EE78B3FF6 // 2^(59/64) +data8 0x000EA4AFA2A490DA // 2^(60/64) +data8 0x000EFA1BEE615A27 // 2^(61/64) +data8 0x000F50765B6E4540 // 2^(62/64) +data8 0x000FA7C1819E90D8 // 2^(63/64) +LOCAL_OBJECT_END(_expf_table) -.global expf# .section .text -.proc expf# -.align 32 -expf: -#ifdef _LIBC -.global __ieee754_expf# -__ieee754_expf: -#endif - -{ .mfi - alloc r32 = ar.pfs,1,26,4,0 - fcvt.fx.s1 exp_Mx = f8 - mov exp_GR_17ones = 0x1FFFF +GLOBAL_IEEE754_ENTRY(expf) + +{ .mlx + addl rTblAddr = @ltoff(_expf_table),gp + movl r64DivLn2 = 0x40571547652B82FE // 64/ln(2) } { .mlx - addl EXP_AD_P_1 = @ltoff(exp_coeff_1_table),gp - movl exp_GR_min_oflow = 0x42b17218 + addl rA3 = 0x3E2AA, r0 // high bits of 1.0/6.0 rounded to SP + movl rRightShifter = 0x43E8000000000000 // DP Right Shifter } 
;; -// Fnorm done to take any enabled faults { .mfi - ld8 EXP_AD_P_1 = [EXP_AD_P_1] - fclass.m p6,p0 = f8, 0x07 //@zero - nop.i 999 + // point to the beginning of the table + ld8 rTblAddr = [rTblAddr] + fclass.m p14, p0 = f8, 0x22 // test for -INF + shl rA3 = rA3, 12 // 0x3E2AA000, approx to 1.0/6.0 in SP } { .mfi - add exp_GR_max_norm = -1, exp_GR_min_oflow // 0x42b17217 - fnorm exp_norm_f8 = f8 - nop.i 999 + nop.m 0 + fnorm.s1 fNormX = f8 // normalized x + addl rExpHalf = 0xFFFE, r0 // exponent of 1/2 } ;; { .mfi - setf.s EXP_MIN_SGL_OFLOW_ARG = exp_GR_min_oflow // 0x42b17218 - fclass.m p7,p0 = f8, 0x22 // Test for x=-inf - mov exp_GR_0xf0 = 0x0f0 + setf.d f64DivLn2 = r64DivLn2 // load 64/ln(2) to FP reg + fclass.m p15, p0 = f8, 0x1e1 // test for NaT,NaN,+Inf + nop.i 0 } { .mlx - setf.s EXP_MAX_SGL_NORM_ARG = exp_GR_max_norm - movl exp_GR_max_zero = 0xc2cff1b5 + // load Right Shifter to FP reg + setf.d fRightShifter = rRightShifter + movl rLn2Div64 = 0x3F862E42FEFA39EF // DP ln(2)/64 in GR } ;; - -{ .mlx - mov exp_GR_0x0f = 0x00f - movl exp_GR_max_uflow = 0xc2aeac50 +{ .mfi + nop.m 0 + fcmp.eq.s1 p13, p0 = f0, f8 // test for x = 0.0 + nop.i 0 } { .mfb - nop.m 999 -(p6) fma.s f8 = f1,f1,f0 -(p6) br.ret.spnt b0 // quick exit for x=0 + setf.s fA3 = rA3 // load A3 to FP reg +(p14) fma.s.s0 f8 = f0, f1, f0 // result if x = -inf +(p14) br.ret.spnt b0 // exit here if x = -inf } ;; { .mfi - setf.s EXP_MAX_SGL_ZERO_ARG = exp_GR_max_zero - fclass.m p8,p0 = f8, 0x21 // Test for x=+inf - adds exp_GR_min_norm = 1, exp_GR_max_uflow // 0xc2aeac51 + setf.exp fA2 = rExpHalf // load A2 to FP reg + fcmp.eq.s0 p6, p0 = f8, f0 // Dummy to flag denorm + nop.i 0 } { .mfb - ldfpd exp_coeff_P5,exp_coeff_P6 = [EXP_AD_P_1],16 -(p7) fma.s f8 = f0,f0,f0 -(p7) br.ret.spnt b0 // quick exit for x=-inf + setf.d fLn2Div64 = rLn2Div64 // load ln(2)/64 to FP reg +(p15) fma.s.s0 f8 = f8, f1, f0 // result if x = NaT,NaN,+Inf +(p15) br.ret.spnt b0 // exit here if x = NaT,NaN,+Inf } ;; -{ .mmf - ldfpd 
exp_coeff_P1,exp_coeff_P2 = [EXP_AD_P_1],16 - setf.s EXP_MAX_SGL_UFLOW_ARG = exp_GR_max_uflow - fclass.m p9,p0 = f8, 0xc3 // Test for x=nan -} -;; - -{ .mmb - ldfpd exp_coeff_P3,exp_coeff_P4 = [EXP_AD_P_1],16 - setf.s EXP_MIN_SGL_NORM_ARG = exp_GR_min_norm -(p8) br.ret.spnt b0 // quick exit for x=+inf +{ .mfb + // overflow and underflow_zero threshold + ldfps fMIN_SGL_OFLOW_ARG, fMAX_SGL_ZERO_ARG = [rTblAddr], 8 +(p13) fma.s.s0 f8 = f1, f1, f0 // result if x = 0.0 +(p13) br.ret.spnt b0 // exit here if x =0.0 } ;; -// EXP_AD_P_1 now points to exp_T2_table + // max normal and underflow_denorm threshold { .mfi - mov exp_GR_T2_size = 0x100 - fcvt.xf exp_Mfloat = exp_Mx - nop.i 999 + ldfps fMAX_SGL_NORM_ARG, fMIN_SGL_NORM_ARG = [rTblAddr], 8 + nop.f 0 + nop.i 0 } ;; -{ .mfb - getf.sig exp_GR_Mint = exp_Mx -(p9) fmerge.s f8 = exp_norm_f8, exp_norm_f8 -(p9) br.ret.spnt b0 // quick exit for x=nan +{ .mfi + nop.m 0 + // x*(64/ln(2)) + Right Shifter + fma.s1 fNint = fNormX, f64DivLn2, fRightShifter + nop.i 0 } ;; -{ .mmi - nop.m 999 - mov EXP_AD_T2 = EXP_AD_P_1 - add EXP_AD_T1 = exp_GR_T2_size,EXP_AD_P_1 ;; -} - - -{ .mmi - adds exp_GR_Mint_p_128 = 0x80,exp_GR_Mint ;; - and exp_GR_Ind1 = exp_GR_Mint_p_128, exp_GR_0xf0 - and exp_GR_Ind2 = exp_GR_Mint_p_128, exp_GR_0x0f ;; -} - // Divide arguments into the following categories: -// Certain Underflow/zero p11 - -inf < x <= MAX_SGL_ZERO_ARG -// Certain Underflow p12 - MAX_SGL_ZERO_ARG < x <= MAX_SGL_UFLOW_ARG -// Possible Underflow p13 - MAX_SGL_UFLOW_ARG < x < MIN_SGL_NORM_ARG +// Certain Underflow p11 - -inf < x <= MAX_SGL_ZERO_ARG +// Possible Underflow p13 - MAX_SGL_ZERO_ARG < x < MIN_SGL_NORM_ARG // Certain Safe - MIN_SGL_NORM_ARG <= x <= MAX_SGL_NORM_ARG // Possible Overflow p14 - MAX_SGL_NORM_ARG < x < MIN_SGL_OFLOW_ARG // Certain Overflow p15 - MIN_SGL_OFLOW_ARG <= x < +inf // -// If the input is really a single arg, then there will never be "Possible -// Underflow" or "Possible Overflow" arguments. 
+// If the input is really a single arg, then there will never be +// "Possible Overflow" arguments. // { .mfi - add EXP_AD_M1 = exp_GR_Ind1,EXP_AD_T1 - fcmp.ge.s1 p15,p14 = exp_norm_f8,EXP_MIN_SGL_OFLOW_ARG - nop.i 999 -} -{ .mfi - shladd EXP_AD_M2 = exp_GR_Ind2,4,EXP_AD_T2 - fms.s1 exp_R = f1,f8,exp_Mfloat - nop.i 999 ;; + nop.m 0 + // check for overflow + fcmp.ge.s1 p15, p0 = fNormX, fMIN_SGL_OFLOW_ARG + nop.i 0 } +;; { .mfi - ldfe exp_T1 = [EXP_AD_M1] - fcmp.le.s1 p11,p12 = exp_norm_f8,EXP_MAX_SGL_ZERO_ARG - nop.i 999 ;; + nop.m 0 + // check for underflow and tiny (+0) result + fcmp.le.s1 p11, p0 = fNormX, fMAX_SGL_ZERO_ARG + nop.i 0 } - { .mfb - ldfe exp_T2 = [EXP_AD_M2] -(p14) fcmp.gt.s1 p14,p0 = exp_norm_f8,EXP_MAX_SGL_NORM_ARG -(p15) br.cond.spnt L(EXP_CERTAIN_OVERFLOW) ;; -} - -{ .mfb - nop.m 999 -(p12) fcmp.le.s1 p12,p0 = exp_norm_f8,EXP_MAX_SGL_UFLOW_ARG -(p11) br.cond.spnt L(EXP_CERTAIN_UNDERFLOW_ZERO) + nop.m 0 + fms.s1 fN = fNint, f1, fRightShifter // n in FP register + // branch out if overflow +(p15) br.cond.spnt EXP_CERTAIN_OVERFLOW } ;; -{ .mfi - nop.m 999 -(p13) fcmp.lt.s1 p13,p0 = exp_norm_f8,EXP_MIN_SGL_NORM_ARG - nop.i 999 +{ .mfb + getf.sig rNJ = fNint // bits of n, j + // check for underflow and deno result + fcmp.lt.s1 p13, p0 = fNormX, fMIN_SGL_NORM_ARG + // branch out if underflow and tiny (+0) result +(p11) br.cond.spnt EXP_CERTAIN_UNDERFLOW } ;; - { .mfi - nop.m 999 - fma.s1 exp_Rsq = exp_R,exp_R,f0 - nop.i 999 + nop.m 0 + // check for possible overflow + fcmp.gt.s1 p14, p0 = fNormX, fMAX_SGL_NORM_ARG + extr.u rJ = rNJ, 0, 6 // bits of j } { .mfi - nop.m 999 - fma.s1 exp_P3 = exp_R,exp_coeff_P2,exp_coeff_P1 - nop.i 999 + addl rN = 0xFFFF - 63, rNJ // biased and shifted n + fnma.s1 fR = fLn2Div64, fN, fNormX // R = x - N*ln(2)/64 + nop.i 0 } ;; { .mfi - nop.m 999 - fma.s1 exp_P1 = exp_R,exp_coeff_P6,exp_coeff_P5 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 exp_P2 = exp_R,exp_coeff_P4,exp_coeff_P3 - nop.i 999 + shladd rJ = rJ, 3, rTblAddr 
// address in the 2^(j/64) table + nop.f 0 + shr rN = rN, 6 // biased n } ;; - { .mfi - nop.m 999 - fma.s1 exp_P7 = f1,exp_R,f1 - nop.i 999 + ld8 rJ = [rJ] + nop.f 0 + shl rN = rN, 52 // 2^n bits in DP format } ;; - -{ .mfi - nop.m 999 - fma.s1 exp_P5 = exp_Rsq,exp_P3,f0 - nop.i 999 -} { .mfi - nop.m 999 - fma.s1 exp_R4 = exp_Rsq,exp_Rsq,f0 - nop.i 999 + or rN = rN, rJ // bits of 2^n * 2^(j/64) in DP format + nop.f 0 + nop.i 0 } ;; { .mfi - nop.m 999 - fma.s1 exp_T = exp_T1,exp_T2,f0 - nop.i 999 + setf.d fT = rN // 2^n * 2^(j/64) + fma.s1 fP = fA3, fR, fA2 // A3*R + A2 + nop.i 0 } { .mfi - nop.m 999 - fma.s1 exp_P4 = exp_Rsq,exp_P1,exp_P2 - nop.i 999 + nop.m 0 + fma.s1 fRSqr = fR, fR, f0 // R^2 + nop.i 0 } ;; { .mfi - nop.m 999 - fma.s1 exp_A = exp_T,exp_P7,f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 exp_P6 = exp_R4,exp_P4,exp_P5 - nop.i 999 + nop.m 0 + fma.s1 fP = fP, fRSqr, fR // P = (A3*R + A2)*R^2 + R + nop.i 0 } ;; -{ .bbb -(p12) br.cond.spnt L(EXP_CERTAIN_UNDERFLOW) -(p13) br.cond.spnt L(EXP_POSSIBLE_UNDERFLOW) -(p14) br.cond.spnt L(EXP_POSSIBLE_OVERFLOW) +{ .mbb + nop.m 0 + // branch out if possible underflow +(p13) br.cond.spnt EXP_POSSIBLE_UNDERFLOW + // branch out if possible overflow result +(p14) br.cond.spnt EXP_POSSIBLE_OVERFLOW } ;; { .mfb - nop.m 999 - fma.s f8 = exp_T,exp_P6,exp_A - br.ret.sptk b0 + nop.m 0 + // final result in the absence of over- and underflow + fma.s.s0 f8 = fP, fT, fT + // exit here in the absence of over- and underflow + br.ret.sptk b0 } ;; -L(EXP_POSSIBLE_OVERFLOW): - -// We got an answer. EXP_MAX_SGL_NORM_ARG < x < EXP_MIN_SGL_OFLOW_ARG -// overflow is a possibility, not a certainty -// Set wre in s2 and perform the last operation with s2 - -// We define an overflow when the answer with -// WRE set -// user-defined rounding mode -// is lsn +1 - -// Is the exponent 1 more than the largest single? -// If so, go to ERROR RETURN, else (no overflow) get the answer and -// leave. 
- -// Largest single is FE (biased single) -// FE - 7F + FFFF = 1007E +EXP_POSSIBLE_OVERFLOW: -// Create + largest_single_plus_ulp -// Create - largest_single_plus_ulp +// Here if fMAX_SGL_NORM_ARG < x < fMIN_SGL_OFLOW_ARG +// This cannot happen if input is a single, only if input higher precision. +// Overflow is a possibility, not a certainty. -// Calculate answer with WRE set. - -// Cases when answer is lsn+1 are as follows: - -// midpoint -// | -// lsn | lsn+1 -// --+----------|----------+------------ -// | -// +inf +inf -inf -// RN RN -// RZ -// exp_gt_pln contains the floating point number lsn+1. -// The setf.exp puts 0x1007f in the exponent and 0x800... in the significand. - -// If the answer is >= lsn+1, we have overflowed. -// Then p6 is TRUE. Set the overflow tag, save input in FR_X, -// do the final calculation for IEEE result, and branch to error return. +// Recompute result using status field 2 with user's rounding mode, +// and wre set. If result is larger than largest single, then we have +// overflow { .mfi - mov exp_GR_gt_ln = 0x1007F - fsetc.s2 0x7F,0x42 - nop.i 999 + mov rGt_ln = 0x1007f // Exponent for largest single + 1 ulp + fsetc.s2 0x7F,0x42 // Get user's round mode, set wre + nop.i 0 } ;; { .mfi - setf.exp exp_gt_pln = exp_GR_gt_ln - fma.s.s2 exp_wre_urm_f8 = exp_T, exp_P6, exp_A - nop.i 999 + setf.exp fGt_pln = rGt_ln // Create largest single + 1 ulp + fma.s.s2 fWre_urm_f8 = fP, fT, fT // Result with wre set + nop.i 0 } ;; { .mfi - nop.m 999 - fsetc.s2 0x7F,0x40 - nop.i 999 + nop.m 0 + fsetc.s2 0x7F,0x40 // Turn off wre in sf2 + nop.i 0 } ;; { .mfi - nop.m 999 - fcmp.ge.unc.s1 p6, p0 = exp_wre_urm_f8, exp_gt_pln - nop.i 999 + nop.m 0 + fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow + nop.i 0 } ;; { .mfb - nop.m 999 - nop.f 999 -(p6) br.cond.spnt L(EXP_CERTAIN_OVERFLOW) // Branch if really overflow + nop.m 0 + nop.f 0 +(p6) br.cond.spnt EXP_CERTAIN_OVERFLOW // Branch if overflow } ;; { .mfb - nop.m 999 - fma.s f8 = exp_T, 
exp_P6, exp_A - br.ret.sptk b0 // Exit if really no overflow + nop.m 0 + fma.s.s0 f8 = fP, fT, fT + br.ret.sptk b0 // Exit if really no overflow } ;; -L(EXP_CERTAIN_OVERFLOW): +// here if overflow +EXP_CERTAIN_OVERFLOW: { .mmi - sub exp_GR_17ones_m1 = exp_GR_17ones, r0, 1 ;; - setf.exp f9 = exp_GR_17ones_m1 - nop.i 999 ;; + addl r17ones_m1 = 0x1FFFE, r0 +;; + setf.exp fTmp = r17ones_m1 + nop.i 0 } +;; { .mfi - nop.m 999 - fmerge.s FR_X = f8,f8 - nop.i 999 + alloc r32=ar.pfs,0,3,4,0 + fmerge.s FR_X = f8,f8 + nop.i 0 } { .mfb - mov GR_Parameter_TAG = 16 - fma.s FR_RESULT = f9, f9, f0 // Set I,O and +INF result - br.cond.sptk __libm_error_region ;; + mov GR_Parameter_TAG = 16 + fma.s.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result + br.cond.sptk __libm_error_region } +;; -L(EXP_POSSIBLE_UNDERFLOW): +EXP_POSSIBLE_UNDERFLOW: -// We got an answer. EXP_MAX_SGL_UFLOW_ARG < x < EXP_MIN_SGL_NORM_ARG -// underflow is a possibility, not a certainty +// Here if fMAX_SGL_ZERO_ARG < x < fMIN_SGL_NORM_ARG +// Underflow is a possibility, not a certainty // We define an underflow when the answer with // ftz set @@ -637,144 +549,157 @@ L(EXP_POSSIBLE_UNDERFLOW): // E // -----+--------------------+--------------------+----- // | | | -// 1.1...10 2^-7f 1.1...11 2^-7f 1.0...00 2^-7e -// 0.1...11 2^-7e (biased, 1) +// 1.1...10 2^-3fff 1.1...11 2^-3fff 1.0...00 2^-3ffe +// 0.1...11 2^-3ffe (biased, 1) // largest dn smallest normal -// If the answer is = 0, we have underflowed. -// Then p6 is TRUE. Set the underflow tag, save input in FR_X, -// do the final calculation for IEEE result, and branch to error return. 
- { .mfi - nop.m 999 - fsetc.s2 0x7F,0x41 - nop.i 999 + nop.m 0 + fsetc.s2 0x7F,0x41 // Get user's round mode, set ftz + nop.i 0 } ;; { .mfi - nop.m 999 - fma.s.s2 exp_ftz_urm_f8 = exp_T, exp_P6, exp_A - nop.i 999 + nop.m 0 + fma.s.s2 fFtz_urm_f8 = fP, fT, fT // Result with ftz set + nop.i 0 } ;; - { .mfi - nop.m 999 - fsetc.s2 0x7F,0x40 - nop.i 999 + nop.m 0 + fsetc.s2 0x7F,0x40 // Turn off ftz in sf2 + nop.i 0 } ;; { .mfi - nop.m 999 - fcmp.eq.unc.s1 p6, p0 = exp_ftz_urm_f8, f0 - nop.i 999 + nop.m 0 + fcmp.eq.s1 p6, p7 = fFtz_urm_f8, f0 // Test for underflow + nop.i 0 +} +{ .mfi + nop.m 0 + fma.s.s0 f8 = fP, fT, fT // Compute result, set I, maybe U + nop.i 0 } ;; -{ .mfb - nop.m 999 - nop.f 999 -(p6) br.cond.spnt L(EXP_CERTAIN_UNDERFLOW) // Branch if really underflow +{ .mbb + nop.m 0 +(p6) br.cond.spnt EXP_UNDERFLOW_COMMON // Branch if really underflow +(p7) br.ret.sptk b0 // Exit if really no underflow +} +;; + +EXP_CERTAIN_UNDERFLOW: +// Here if x < fMAX_SGL_ZERO_ARG +// Result will be zero (or smallest denorm if round to +inf) with I, U set +{ .mmi + mov rTmp = 1 +;; + setf.exp fTmp = rTmp // Form small normal + nop.i 0 } ;; { .mfb - nop.m 999 - fma.s f8 = exp_T, exp_P6, exp_A - br.ret.sptk b0 // Exit if really no underflow + nop.m 0 + fma.s.s0 f8 = fTmp, fTmp, f0 // Set I,U, tiny (+0.0) result + br.cond.sptk EXP_UNDERFLOW_COMMON } ;; -L(EXP_CERTAIN_UNDERFLOW): +EXP_UNDERFLOW_COMMON: +// Determine if underflow result is zero or nonzero { .mfi - nop.m 999 - fmerge.s FR_X = f8,f8 - nop.i 999 + alloc r32=ar.pfs,0,3,4,0 + fcmp.eq.s1 p6, p0 = f8, f0 + nop.i 0 } +;; + { .mfb - mov GR_Parameter_TAG = 17 - fma.s FR_RESULT = exp_T, exp_P6, exp_A // Set I,U and tiny result - br.cond.sptk __libm_error_region ;; + nop.m 0 + fmerge.s FR_X = fNormX,fNormX +(p6) br.cond.spnt EXP_UNDERFLOW_ZERO } +;; -L(EXP_CERTAIN_UNDERFLOW_ZERO): -{ .mmi - mov exp_GR_one = 1 ;; - setf.exp f9 = exp_GR_one - nop.i 999 ;; +EXP_UNDERFLOW_NONZERO: +// Here if x < fMIN_SGL_NORM_ARG and result 
nonzero; +// I, U are set +{ .mfb + mov GR_Parameter_TAG = 17 + nop.f 0 // FR_RESULT already set + br.cond.sptk __libm_error_region } +;; -{ .mfi - nop.m 999 - fmerge.s FR_X = f8,f8 - nop.i 999 -} +EXP_UNDERFLOW_ZERO: +// Here if x < fMIN_SGL_NORM_ARG and result zero; +// I, U are set { .mfb - mov GR_Parameter_TAG = 17 - fma.s FR_RESULT = f9, f9, f0 // Set I,U and tiny (+0.0) result - br.cond.sptk __libm_error_region ;; + mov GR_Parameter_TAG = 17 + nop.f 0 // FR_RESULT already set + br.cond.sptk __libm_error_region } +;; -.endp expf -ASM_SIZE_DIRECTIVE(expf) - +GLOBAL_IEEE754_END(expf) -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi - add GR_Parameter_Y=-32,sp // Parameter 2 value - nop.f 999 + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack - nop.f 0 - mov GR_SAVE_GP=gp // Save gp + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp };; { .mmi - stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; .body { .mfi - stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack - nop.f 0 - add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + nop.f 0 + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address } { .mib - stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack - add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = 
-16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi - nop.m 0 - nop.m 0 - add GR_Parameter_RESULT = 48,sp + add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; { .mmi - ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp - add sp = 64,sp // Restore stack pointer - mov b0 = GR_SAVE_B0 // Restore return address + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address };; { .mib - mov gp = GR_SAVE_GP // Restore gp - mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - br.ret.sptk b0 // Return -};; + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function diff --git a/sysdeps/ia64/fpu/e_fmod.S b/sysdeps/ia64/fpu/e_fmod.S index 2b3ee9610f..d801e0c128 100644 --- a/sysdeps/ia64/fpu/e_fmod.S +++ b/sysdeps/ia64/fpu/e_fmod.S @@ -1,11 +1,10 @@ .file "fmod.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Shane Story, and Ping Tak Peter Tang of the Computational -// Software Lab, Intel Corporation. +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -21,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -36,38 +35,42 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //==================================================================== -// 2/02/00 Initial version -// 3/02/00 New Algorithm -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial version +// 03/02/00 New Algorithm +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. -//11/28/00 Set FR_Y to f9 +// 11/28/00 Set FR_Y to f9 +// 03/11/02 Fixed flags for fmod(qnan,zero) +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align +// 04/28/03 Fix: fmod(sNaN,0) no longer sets errno // // API //==================================================================== -// double fmod(double,double); +// double fmod(double,double); // // Overview of operation //==================================================================== // fmod(a,b)=a-i*b, -// where i is an integer such that, if b!=0, +// where i is an integer such that, if b!=0, // |i|<|a/b| and |a/b-i|<1 // // Algorithm //==================================================================== // a). if |a|<|b|, return a -// b). get quotient and reciprocal overestimates accurate to +// b). get quotient and reciprocal overestimates accurate to // 33 bits (q2,y2) // c). 
if the exponent difference (exponent(a)-exponent(b)) // is less than 32, truncate quotient to integer and // finish in one iteration // d). if exponent(a)-exponent(b)>=32 (q2>=2^32) // round quotient estimate to single precision (k=RN(q2)), -// calculate partial remainder (a'=a-k*b), +// calculate partial remainder (a'=a-k*b), // get quotient estimate (a'*y2), and repeat from c). // // Special cases @@ -81,14 +84,9 @@ // General registers: r2,r29,r32 (ar.pfs), r33-r39 // Floating point registers: f6-f15 -#include "libm_support.h" - -.section .text - - GR_SAVE_B0 = r33 GR_SAVE_PFS = r34 -GR_SAVE_GP = r35 +GR_SAVE_GP = r35 GR_SAVE_SP = r36 GR_Parameter_X = r37 @@ -101,17 +99,9 @@ FR_Y = f9 FR_RESULT = f8 -.proc fmod# -.align 32 -.global fmod# -.align 32 +.section .text +GLOBAL_IEEE754_ENTRY(fmod) -fmod: -#ifdef _LIBC -.global __ieee754_fmod -.type __ieee754_fmod,@function -__ieee754_fmod: -#endif // inputs in f8, f9 // result in f8 @@ -133,12 +123,12 @@ __ieee754_fmod: // (1) y0 frcpa.s1 f10,p6=f6,f7 nop.i 0 -} +} // Y +-NAN, +-inf, +-0? p7 { .mfi nop.m 999 -(p0) fclass.m.unc p7,p0 = f9, 0xe7 + fclass.m.unc p7,p0 = f9, 0xe7 nop.i 999;; } @@ -149,14 +139,14 @@ __ieee754_fmod: { .mfi nop.m 999 -(p0) fclass.m.unc p9,p0 = f8, 0xe3 - nop.i 999 + fclass.m.unc p9,p0 = f8, 0xe3 + nop.i 999 } // |x| < |y|? 
Return x p8 { .mfi nop.m 999 -(p0) fcmp.lt.unc.s1 p8,p0 = f6,f7 + fcmp.lt.unc.s1 p8,p0 = f6,f7 nop.i 999 ;; } @@ -172,33 +162,33 @@ __ieee754_fmod: // (2) q0=a*y0 (p6) fma.s1 f13=f6,f10,f0 nop.i 0 -} +} { .mfi nop.m 0 // (3) e0 = 1 - b * y0 (p6) fnma.s1 f12=f7,f10,f1 nop.i 0;; -} +} {.mfi nop.m 0 // normalize x (if |x|<|y|) (p8) fma.d.s0 f8=f8,f1,f0 nop.i 0 -} +} {.bbb - (p9) br.cond.spnt L(FMOD_X_NAN_INF) - (p7) br.cond.spnt L(FMOD_Y_NAN_INF_ZERO) + (p9) br.cond.spnt FMOD_X_NAN_INF + (p7) br.cond.spnt FMOD_Y_NAN_INF_ZERO // if |x|<|y|, return (p8) br.ret.spnt b0;; } - {.mfi + {.mfi nop.m 0 // normalize x fma.s0 f6=f6,f1,f0 nop.i 0 -} +} {.mfi nop.m 0 // normalize y @@ -212,45 +202,45 @@ __ieee754_fmod: // (4) q1=q0+e0*q0 (p6) fma.s1 f13=f12,f13,f13 nop.i 0 -} +} { .mfi nop.m 0 // (5) e1 = e0 * e0 + 2^-34 (p6) fma.s1 f14=f12,f12,f11 nop.i 0;; -} +} {.mlx nop.m 0 movl r2=0x33a00000;; -} +} { .mfi nop.m 0 // (6) y1 = y0 + e0 * y0 (p6) fma.s1 f10=f12,f10,f10 nop.i 0;; -} +} {.mfi // set f12=1.25*2^{-24} setf.s f12=r2 // (7) q2=q1+e1*q1 (p6) fma.s1 f13=f13,f14,f13 nop.i 0;; -} +} {.mfi nop.m 0 fmerge.s f9=f8,f9 nop.i 0 -} +} { .mfi nop.m 0 // (8) y2 = y1 + e1 * y1 (p6) fma.s1 f10=f14,f10,f10 // set p6=0, p10=0 cmp.ne.and p6,p10=r0,r0;; -} +} .align 32 -L(loop53): +loop53: {.mfi nop.m 0 // compare q2, 2^32 @@ -280,7 +270,7 @@ L(loop53): // normalize truncated quotient (p8) fcvt.xf f13=f11 nop.i 0;; -} +} { .mfi nop.m 0 // calculate remainder (assuming f13=RZ(Q)) @@ -289,7 +279,7 @@ L(loop53): } {.mfi nop.m 0 - // also if exponent>32, round quotient to single precision + // also if exponent>32, round quotient to single precision // and subtract 1 ulp: q=q-q*(1.25*2^{-24}) (p7) fnma.s.s1 f11=f13,f12,f13 nop.i 0;; @@ -332,7 +322,7 @@ L(loop53): .pred.rel "mutex",p6,p10 {.mfb nop.m 0 - // add b to estimated remainder (to cover the case when the quotient was overestimated) + // add b to estimated remainder (to cover the case when the quotient was overestimated) // also set 
correct sign by using f9=|b|*sgn(a), f12=sgn(a) (p6) fma.d.s0 f8=f11,f12,f9 nop.b 0 @@ -354,97 +344,114 @@ L(loop53): nop.m 0 // if f14 was RZ(Q), set remainder to f14 (p9) mov f6=f14 - br.cond.sptk L(loop53);; + br.cond.sptk loop53;; } -L(FMOD_X_NAN_INF): +FMOD_X_NAN_INF: // Y zero ? -{.mfi +{.mfi + nop.m 0 + fclass.m p10,p0=f8,0xc3 // Test x=nan + nop.i 0 +} +{.mfi nop.m 0 fma.s1 f10=f9,f1,f0 nop.i 0;; } + {.mfi + nop.m 0 + fma.s0 f8=f8,f1,f0 + nop.i 0 +} +{.mfi + nop.m 0 +(p10) fclass.m p10,p0=f9,0x07 // Test x=nan, and y=zero + nop.i 0;; +} + +{.mfb nop.m 0 fcmp.eq.unc.s1 p11,p0=f10,f0 - nop.i 0;; +(p10) br.ret.spnt b0;; // Exit with result=x if x=nan and y=zero } {.mib nop.m 0 nop.i 0 // if Y zero - (p11) br.cond.spnt L(FMOD_Y_ZERO);; + (p11) br.cond.spnt FMOD_Y_ZERO;; } // X infinity? Return QNAN indefinite { .mfi nop.m 999 -(p0) fclass.m.unc p8,p9 = f8, 0x23 - nop.i 999;; + fclass.m.unc p8,p9 = f8, 0x23 + nop.i 999;; } // Y NaN ? {.mfi - nop.m 999 + nop.m 999 (p8) fclass.m p9,p8=f9,0xc3 - nop.i 0;; + nop.i 0;; } {.mfi - nop.m 999 -(p8) frcpa.s0 f8,p0 = f8,f8 + nop.m 999 +(p8) frcpa.s0 f8,p0 = f8,f8 nop.i 0 -} +} { .mfi nop.m 999 - // also set Denormal flag if necessary + // also set Denormal flag if necessary (p8) fma.s0 f9=f9,f1,f0 nop.i 999 ;; } { .mfb nop.m 999 -(p8) fma.d f8=f8,f1,f0 - nop.b 999 ;; +(p8) fma.d.s0 f8=f8,f1,f0 + nop.b 999 ;; } { .mfb nop.m 999 -(p9) frcpa.s0 f8,p7=f8,f9 - br.ret.sptk b0 ;; +(p9) frcpa.s0 f8,p7=f8,f9 + br.ret.sptk b0 ;; } -L(FMOD_Y_NAN_INF_ZERO): +FMOD_Y_NAN_INF_ZERO: // Y INF { .mfi nop.m 999 -(p0) fclass.m.unc p7,p0 = f9, 0x23 + fclass.m.unc p7,p0 = f9, 0x23 nop.i 999 ;; } { .mfb nop.m 999 -(p7) fma.d f8=f8,f1,f0 -(p7) br.ret.spnt b0 ;; +(p7) fma.d.s0 f8=f8,f1,f0 +(p7) br.ret.spnt b0 ;; } // Y NAN? 
{ .mfi nop.m 999 -(p0) fclass.m.unc p9,p0 = f9, 0xc3 + fclass.m.unc p9,p0 = f9, 0xc3 nop.i 999 ;; } { .mfb nop.m 999 -(p9) fma.d f8=f9,f1,f0 -(p9) br.ret.spnt b0 ;; +(p9) fma.d.s0 f8=f9,f1,f0 +(p9) br.ret.spnt b0 ;; } -L(FMOD_Y_ZERO): +FMOD_Y_ZERO: // Y zero? Must be zero at this point // because it is the only choice left. // Return QNAN indefinite @@ -452,60 +459,56 @@ L(FMOD_Y_ZERO): {.mfi nop.m 0 // set Invalid - frcpa f12,p0=f0,f0 + frcpa.s0 f12,p0=f0,f0 nop.i 0 } // X NAN? { .mfi nop.m 999 -(p0) fclass.m.unc p9,p10 = f8, 0xc3 + fclass.m.unc p9,p10 = f8, 0xc3 nop.i 999 ;; } { .mfi nop.m 999 -(p10) fclass.nm p9,p10 = f8, 0xff +(p10) fclass.nm p9,p10 = f8, 0xff nop.i 999 ;; } {.mfi nop.m 999 - (p9) frcpa f11,p7=f8,f0 + (p9) frcpa.s0 f11,p7=f8,f0 nop.i 0;; } { .mfi nop.m 999 -(p10) frcpa f11,p7 = f9,f9 -(p0) mov GR_Parameter_TAG = 121 ;; +(p10) frcpa.s0 f11,p7 = f9,f9 + mov GR_Parameter_TAG = 121 ;; } { .mfi nop.m 999 -(p0) fmerge.s f10 = f8, f8 + fmerge.s f10 = f8, f8 nop.i 999 } { .mfb nop.m 999 -(p0) fma.d f8=f11,f1,f0 -(p0) br.sptk __libm_error_region;; + fma.d.s0 f8=f11,f1,f0 + br.sptk __libm_error_region;; } -.endp fmod -ASM_SIZE_DIRECTIVE(fmod) -ASM_SIZE_DIRECTIVE(__ieee754_fmod) - -.proc __libm_error_region -__libm_error_region: +GLOBAL_IEEE754_END(fmod) +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi -.fframe 64 +.fframe 64 add sp=-64,sp // Create new stack nop.f 0 mov GR_SAVE_GP=gp // Save gp @@ -513,18 +516,18 @@ __libm_error_region: { .mmi stfd [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack add GR_Parameter_X = 16,sp // Parameter 1 address -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 };; .body { .mib - stfd [GR_Parameter_X] = FR_X // Store Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y - 
nop.b 0 // Parameter 3 address + stfd [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address } { .mib stfd [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack - add GR_Parameter_Y = -16,GR_Parameter_Y + add GR_Parameter_Y = -16,GR_Parameter_Y br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi @@ -539,13 +542,17 @@ __libm_error_region: mov b0 = GR_SAVE_B0 // Restore return address };; { .mib - mov gp = GR_SAVE_GP // Restore gp + mov gp = GR_SAVE_GP // Restore gp mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs br.ret.sptk b0 // Return -};; +};; + +LOCAL_LIBM_END(__libm_error_region) -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# + + + + diff --git a/sysdeps/ia64/fpu/e_fmodf.S b/sysdeps/ia64/fpu/e_fmodf.S index 5b6390eeec..fe1ec0304d 100644 --- a/sysdeps/ia64/fpu/e_fmodf.S +++ b/sysdeps/ia64/fpu/e_fmodf.S @@ -1,10 +1,10 @@ .file "fmodf.s" -// Copyright (c) 2000, 2001, Intel Corporation + + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Shane Story, and Ping Tak Peter Tang of the Computational -// Software Lab, Intel Corporation. +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,9 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// -// WARRANTY DISCLAIMER -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -37,38 +35,42 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //==================================================================== -// 2/02/00 Initial version -// 3/02/00 New Algorithm -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial version +// 03/02/00 New Algorithm +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. -//11/28/00 Set FR_Y to f9 +// 11/28/00 Set FR_Y to f9 +// 03/11/02 Fixed flags for fmodf(qnan,zero) +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align +// 04/28/03 Fix: fmod(sNaN,0) no longer sets errno // // API //==================================================================== -// float fmodf(float,float); +// float fmodf(float,float); // // Overview of operation //==================================================================== // fmod(a,b)=a-i*b, -// where i is an integer such that, if b!=0, +// where i is an integer such that, if b!=0, // |i|<|a/b| and |a/b-i|<1 // Algorithm //==================================================================== // a). if |a|<|b|, return a -// b). get quotient and reciprocal overestimates accurate to +// b). get quotient and reciprocal overestimates accurate to // 33 bits (q2,y2) // c). 
if the exponent difference (exponent(a)-exponent(b)) // is less than 32, truncate quotient to integer and // finish in one iteration // d). if exponent(a)-exponent(b)>=32 (q2>=2^32) // round quotient estimate to single precision (k=RN(q2)), -// calculate partial remainder (a'=a-k*b), +// calculate partial remainder (a'=a-k*b), // get quotient estimate (a'*y2), and repeat from c). // Special cases @@ -82,13 +84,9 @@ // General registers: r2,r29,r32 (ar.pfs), r33-r39 // Floating point registers: f6-f15 -#include "libm_support.h" - -.section .text - GR_SAVE_B0 = r33 GR_SAVE_PFS = r34 -GR_SAVE_GP = r35 +GR_SAVE_GP = r35 GR_SAVE_SP = r36 GR_Parameter_X = r37 @@ -101,18 +99,9 @@ FR_Y = f9 FR_RESULT = f8 +.section .text +GLOBAL_IEEE754_ENTRY(fmodf) -.proc fmodf# -.align 32 -.global fmodf# -.align 32 - -fmodf: -#ifdef _LIBC -.global __ieee754_fmodf -.type __ieee754_fmodf,@function -__ieee754_fmodf: -#endif // inputs in f8, f9 // result in f8 @@ -134,13 +123,13 @@ __ieee754_fmodf: // (1) y0 frcpa.s1 f10,p6=f6,f7 nop.i 0 -} +} // eliminate special cases // Y +-NAN, +-inf, +-0? p7 { .mfi nop.m 999 -(p0) fclass.m.unc p7,p0 = f9, 0xe7 + fclass.m.unc p7,p0 = f9, 0xe7 nop.i 999;; } @@ -151,14 +140,14 @@ __ieee754_fmodf: { .mfi nop.m 999 -(p0) fclass.m.unc p9,p0 = f8, 0xe3 - nop.i 999 + fclass.m.unc p9,p0 = f8, 0xe3 + nop.i 999 } // |x| < |y|? 
Return x p8 { .mfi nop.m 999 -(p0) fcmp.lt.unc.s1 p8,p0 = f6,f7 + fcmp.lt.unc.s1 p8,p0 = f6,f7 nop.i 999 ;; } @@ -174,33 +163,33 @@ __ieee754_fmodf: // (2) q0=a*y0 (p6) fma.s1 f13=f6,f10,f0 nop.i 0 -} +} { .mfi nop.m 0 // (3) e0 = 1 - b * y0 (p6) fnma.s1 f12=f7,f10,f1 nop.i 0;; -} +} {.mfi nop.m 0 // normalize x (if |x|<|y|) (p8) fma.s.s0 f8=f8,f1,f0 nop.i 0 -} +} {.bbb - (p9) br.cond.spnt L(FMOD_X_NAN_INF) - (p7) br.cond.spnt L(FMOD_Y_NAN_INF_ZERO) + (p9) br.cond.spnt FMOD_X_NAN_INF + (p7) br.cond.spnt FMOD_Y_NAN_INF_ZERO // if |x|<|y|, return (p8) br.ret.spnt b0;; } - {.mfi + {.mfi nop.m 0 // normalize x fma.s0 f6=f6,f1,f0 nop.i 0 -} +} {.mfi nop.m 0 // normalize y @@ -215,45 +204,45 @@ __ieee754_fmodf: // (4) q1=q0+e0*q0 (p6) fma.s1 f13=f12,f13,f13 nop.i 0 -} +} { .mfi nop.m 0 // (5) e1 = e0 * e0 + 2^-34 (p6) fma.s1 f14=f12,f12,f11 nop.i 0;; -} +} {.mlx nop.m 0 movl r2=0x33a00000;; -} +} { .mfi nop.m 0 // (6) y1 = y0 + e0 * y0 (p6) fma.s1 f10=f12,f10,f10 nop.i 0;; -} +} {.mfi // set f12=1.25*2^{-24} setf.s f12=r2 // (7) q2=q1+e1*q1 (p6) fma.s1 f13=f13,f14,f13 nop.i 0;; -} +} {.mfi nop.m 0 fmerge.s f9=f8,f9 nop.i 0 -} +} { .mfi nop.m 0 // (8) y2 = y1 + e1 * y1 (p6) fma.s1 f10=f14,f10,f10 // set p6=0, p10=0 cmp.ne.and p6,p10=r0,r0;; -} +} .align 32 -L(loop24): +loop24: {.mfi nop.m 0 // compare q2, 2^32 @@ -283,7 +272,7 @@ L(loop24): // normalize truncated quotient (p8) fcvt.xf f13=f11 nop.i 0;; -} +} { .mfi nop.m 0 // calculate remainder (assuming f13=RZ(Q)) @@ -292,7 +281,7 @@ L(loop24): } {.mfi nop.m 0 - // also if exponent>32, round quotient to single precision + // also if exponent>32, round quotient to single precision // and subtract 1 ulp: q=q-q*(1.25*2^{-24}) (p7) fnma.s.s1 f11=f13,f12,f13 nop.i 0;; @@ -335,7 +324,7 @@ L(loop24): .pred.rel "mutex",p6,p10 {.mfb nop.m 0 - // add b to estimated remainder (to cover the case when the quotient was overestimated) + // add b to estimated remainder (to cover the case when the quotient was overestimated) // also set 
correct sign by using f9=|b|*sgn(a), f12=sgn(a) (p6) fma.s.s0 f8=f11,f12,f9 nop.b 0 @@ -357,102 +346,118 @@ L(loop24): nop.m 0 // if f14 was RZ(Q), set remainder to f14 (p9) mov f6=f14 - br.cond.sptk L(loop24);; + br.cond.sptk loop24;; } { .mmb - nop.m 0 - nop.m 0 - br.ret.sptk b0;; + nop.m 0 + nop.m 0 + br.ret.sptk b0;; } -L(FMOD_X_NAN_INF): +FMOD_X_NAN_INF: // Y zero ? -{.mfi +{.mfi + nop.m 0 + fclass.m p10,p0=f8,0xc3 // Test x=nan + nop.i 0 +} +{.mfi nop.m 0 fma.s1 f10=f9,f1,f0 nop.i 0;; } + {.mfi + nop.m 0 + fma.s0 f8=f8,f1,f0 + nop.i 0 +} +{.mfi + nop.m 0 +(p10) fclass.m p10,p0=f9,0x07 // Test x=nan, and y=zero + nop.i 0;; +} +{.mfb nop.m 0 fcmp.eq.unc.s1 p11,p0=f10,f0 - nop.i 0;; +(p10) br.ret.spnt b0;; // Exit with result=x if x=nan and y=zero } {.mib nop.m 0 nop.i 0 // if Y zero - (p11) br.cond.spnt L(FMOD_Y_ZERO);; + (p11) br.cond.spnt FMOD_Y_ZERO;; } // X infinity? Return QNAN indefinite { .mfi nop.m 999 -(p0) fclass.m.unc p8,p9 = f8, 0x23 - nop.i 999;; + fclass.m.unc p8,p9 = f8, 0x23 + nop.i 999;; } // Y NaN ? {.mfi - nop.m 999 + nop.m 999 (p8) fclass.m p9,p8=f9,0xc3 - nop.i 0;; + nop.i 0;; } {.mfi - nop.m 999 -(p8) frcpa.s0 f8,p0 = f8,f8 + nop.m 999 +(p8) frcpa.s0 f8,p0 = f8,f8 nop.i 0 -} +} { .mfi nop.m 999 - // also set Denormal flag if necessary + // also set Denormal flag if necessary (p8) fma.s0 f9=f9,f1,f0 nop.i 999 ;; } { .mfb nop.m 999 -(p8) fma.s f8=f8,f1,f0 - nop.b 999 ;; +(p8) fma.s.s0 f8=f8,f1,f0 + nop.b 999 ;; } { .mfb nop.m 999 -(p9) frcpa.s0 f8,p7=f8,f9 - br.ret.sptk b0 ;; +(p9) frcpa.s0 f8,p7=f8,f9 + br.ret.sptk b0 ;; } -L(FMOD_Y_NAN_INF_ZERO): +FMOD_Y_NAN_INF_ZERO: // Y INF { .mfi nop.m 999 -(p0) fclass.m.unc p7,p0 = f9, 0x23 + fclass.m.unc p7,p0 = f9, 0x23 nop.i 999 ;; } { .mfb nop.m 999 -(p7) fma.s f8=f8,f1,f0 -(p7) br.ret.spnt b0 ;; +(p7) fma.s.s0 f8=f8,f1,f0 +(p7) br.ret.spnt b0 ;; } // Y NAN? 
{ .mfi nop.m 999 -(p0) fclass.m.unc p9,p0 = f9, 0xc3 + fclass.m.unc p9,p0 = f9, 0xc3 nop.i 999 ;; } { .mfb nop.m 999 -(p9) fma.s f8=f9,f1,f0 -(p9) br.ret.spnt b0 ;; +(p9) fma.s.s0 f8=f9,f1,f0 +(p9) br.ret.spnt b0 ;; } -L(FMOD_Y_ZERO): +FMOD_Y_ZERO: // Y zero? Must be zero at this point // because it is the only choice left. // Return QNAN indefinite @@ -460,69 +465,65 @@ L(FMOD_Y_ZERO): {.mfi nop.m 0 // set Invalid - frcpa f12,p0=f0,f0 + frcpa.s0 f12,p0=f0,f0 nop.i 999 } // X NAN? { .mfi nop.m 999 -(p0) fclass.m.unc p9,p10 = f8, 0xc3 + fclass.m.unc p9,p10 = f8, 0xc3 nop.i 999 ;; } { .mfi nop.m 999 -(p10) fclass.nm p9,p10 = f8, 0xff +(p10) fclass.nm p9,p10 = f8, 0xff nop.i 999 ;; } {.mfi nop.m 999 - (p9) frcpa f11,p7=f8,f0 + (p9) frcpa.s0 f11,p7=f8,f0 nop.i 0;; } { .mfi nop.m 999 -(p10) frcpa f11,p7 = f0,f0 +(p10) frcpa.s0 f11,p7 = f0,f0 nop.i 999;; } { .mfi nop.m 999 -(p0) fmerge.s f10 = f8, f8 + fmerge.s f10 = f8, f8 nop.i 999 } { .mfi nop.m 999 -(p0) fma.s f8=f11,f1,f0 + fma.s.s0 f8=f11,f1,f0 nop.i 999;; } -L(EXP_ERROR_RETURN): +EXP_ERROR_RETURN: { .mib nop.m 0 -(p0) mov GR_Parameter_TAG=122 -(p0) br.sptk __libm_error_region;; + mov GR_Parameter_TAG=122 + br.sptk __libm_error_region;; } -.endp fmodf -ASM_SIZE_DIRECTIVE(fmodf) -ASM_SIZE_DIRECTIVE(__ieee754_fmodf) - -.proc __libm_error_region -__libm_error_region: +GLOBAL_IEEE754_END(fmodf) +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi -.fframe 64 +.fframe 64 add sp=-64,sp // Create new stack nop.f 0 mov GR_SAVE_GP=gp // Save gp @@ -530,18 +531,18 @@ __libm_error_region: { .mmi stfs [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack add GR_Parameter_X = 16,sp // Parameter 1 address -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 };; .body { .mib - stfs [GR_Parameter_X] = FR_X 
// Store Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y - nop.b 0 // Parameter 3 address + stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address } { .mib stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack - add GR_Parameter_Y = -16,GR_Parameter_Y + add GR_Parameter_Y = -16,GR_Parameter_Y br.call.sptk b0=__libm_error_support#;; // Call error handling function } { .mmi @@ -556,13 +557,14 @@ __libm_error_region: mov b0 = GR_SAVE_B0 // Restore return address };; { .mib - mov gp = GR_SAVE_GP // Restore gp + mov gp = GR_SAVE_GP // Restore gp mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs br.ret.sptk b0 // Return -};; +};; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# + + diff --git a/sysdeps/ia64/fpu/e_fmodl.S b/sysdeps/ia64/fpu/e_fmodl.S index 85c9f6ef82..da08ae3f5c 100644 --- a/sysdeps/ia64/fpu/e_fmodl.S +++ b/sysdeps/ia64/fpu/e_fmodl.S @@ -1,11 +1,10 @@ .file "fmodl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Shane Story, and Ping Tak Peter Tang of the Computational -// Software Lab, Intel Corporation. +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -21,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -36,38 +35,42 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //==================================================================== -// 2/02/00 Initial version -// 3/02/00 New Algorithm -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial version +// 03/02/00 New Algorithm +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. -//11/28/00 Set FR_Y to f9 +// 11/28/00 Set FR_Y to f9 +// 03/11/02 Fixed flags for fmodl(qnan,zero) +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align +// 04/28/03 Fix: fmod(sNaN,0) no longer sets errno // // API //==================================================================== -// long double fmodl(long double,long double); +// long double fmodl(long double,long double); // // Overview of operation //==================================================================== // fmod(a,b)=a-i*b, -// where i is an integer such that, if b!=0, +// where i is an integer such that, if b!=0, // |i|<|a/b| and |a/b-i|<1 // // Algorithm //==================================================================== // a). if |a|<|b|, return a -// b). get quotient and reciprocal overestimates accurate to +// b). get quotient and reciprocal overestimates accurate to // 33 bits (q2,y2) // c). 
if the exponent difference (exponent(a)-exponent(b)) // is less than 32, truncate quotient to integer and // finish in one iteration // d). if exponent(a)-exponent(b)>=32 (q2>=2^32) // round quotient estimate to single precision (k=RN(q2)), -// calculate partial remainder (a'=a-k*b), +// calculate partial remainder (a'=a-k*b), // get quotient estimate (a'*y2), and repeat from c). // // Registers used @@ -76,13 +79,9 @@ // General registers: r2,r29,r32 (ar.pfs), r33-r39 // Floating point registers: f6-f15 -#include "libm_support.h" - -.section .text - GR_SAVE_B0 = r33 GR_SAVE_PFS = r34 -GR_SAVE_GP = r35 +GR_SAVE_GP = r35 GR_SAVE_SP = r36 GR_Parameter_X = r37 @@ -95,18 +94,9 @@ FR_Y = f9 FR_RESULT = f8 +.section .text +GLOBAL_IEEE754_ENTRY(fmodl) -.proc fmodl# -.align 32 -.global fmodl# -.align 32 - -fmodl: -#ifdef _LIBC -.global __ieee754_fmodl -.type __ieee754_fmodl,@function -__ieee754_fmodl: -#endif // inputs in f8, f9 // result in f8 @@ -128,7 +118,7 @@ __ieee754_fmodl: // (1) y0 frcpa.s1 f10,p6=f6,f7 nop.i 0;; -} +} // eliminate special cases {.mmi @@ -141,7 +131,7 @@ cmp.eq p7,p10=r29,r0;; // Y +-NAN, +-inf, +-0? p7 { .mfi nop.m 999 -(p10) fclass.m p7,p10 = f9, 0xe7 +(p10) fclass.m p7,p10 = f9, 0xe7 nop.i 999;; } @@ -152,14 +142,14 @@ cmp.eq p7,p10=r29,r0;; { .mfi nop.m 999 -(p0) fclass.m.unc p9,p11 = f8, 0xe3 - nop.i 999 + fclass.m.unc p9,p11 = f8, 0xe3 + nop.i 999 } // |x| < |y|? Return x p8 { .mfi nop.m 999 -(p10) fcmp.lt.unc.s1 p8,p0 = f6,f7 +(p10) fcmp.lt.unc.s1 p8,p0 = f6,f7 nop.i 999 ;; } @@ -173,13 +163,13 @@ cmp.eq p7,p10=r29,r0;; // (3) e0 = 1 - b * y0 (p6) fnma.s1 f12=f7,f10,f1 nop.i 0;; -} +} // Y +-NAN, +-inf, +-0? p7 { .mfi nop.m 999 - // pseudo-NaN ? -(p10) fclass.nm p7,p0 = f9, 0xff + // pseudo-NaN ? 
+(p10) fclass.nm p7,p0 = f9, 0xff nop.i 999 } @@ -190,7 +180,7 @@ cmp.eq p7,p10=r29,r0;; { .mfi nop.m 999 -(p11) fclass.nm p9,p0 = f8, 0xff +(p11) fclass.nm p9,p0 = f8, 0xff nop.i 999;; } @@ -209,18 +199,18 @@ cmp.eq p7,p10=r29,r0;; nop.i 0 } {.bbb - (p9) br.cond.spnt L(FMOD_X_NAN_INF) - (p7) br.cond.spnt L(FMOD_Y_NAN_INF_ZERO) + (p9) br.cond.spnt FMOD_X_NAN_INF + (p7) br.cond.spnt FMOD_Y_NAN_INF_ZERO // if |x|<|y|, return (p8) br.ret.spnt b0;; } - {.mfi + {.mfi nop.m 0 // x denormal ? set D flag fnma.s0 f32=f6,f1,f6 nop.i 0 -} +} {.mfi nop.m 0 // y denormal ? set D flag @@ -234,46 +224,46 @@ cmp.eq p7,p10=r29,r0;; // (4) q1=q0+e0*q0 (p6) fma.s1 f13=f12,f13,f13 nop.i 0 -} +} { .mfi nop.m 0 // (5) e1 = e0 * e0 + 2^-34 (p6) fma.s1 f14=f12,f12,f11 nop.i 0;; -} +} {.mlx nop.m 0 movl r2=0x33a00000;; -} +} { .mfi nop.m 0 // (6) y1 = y0 + e0 * y0 (p6) fma.s1 f10=f12,f10,f10 nop.i 0;; -} +} {.mfi // set f12=1.25*2^{-24} setf.s f12=r2 // (7) q2=q1+e1*q1 (p6) fma.s1 f13=f13,f14,f13 nop.i 0;; -} +} {.mfi nop.m 0 fmerge.s f9=f8,f9 nop.i 0 -} +} { .mfi nop.m 0 // (8) y2 = y1 + e1 * y1 (p6) fma.s1 f10=f14,f10,f10 // set p6=0, p10=0 cmp.ne.and p6,p10=r0,r0;; -} +} .align 32 -L(loop64): +loop64: {.mfi nop.m 0 // compare q2, 2^32 @@ -305,7 +295,7 @@ L(loop64): // normalize truncated quotient (p8) fcvt.xf f13=f11 nop.i 0;; -} +} { .mfi nop.m 0 // calculate remainder (assuming f13=RZ(Q)) @@ -314,7 +304,7 @@ L(loop64): } {.mfi nop.m 0 - // also if exponent>32, round quotient to single precision + // also if exponent>32, round quotient to single precision // and subtract 1 ulp: q=q-q*(1.25*2^{-24}) (p7) fnma.s.s1 f11=f13,f12,f13 nop.i 0;; @@ -357,7 +347,7 @@ L(loop64): .pred.rel "mutex",p6,p10 {.mfb nop.m 0 - // add b to estimated remainder (to cover the case when the quotient was overestimated) + // add b to estimated remainder (to cover the case when the quotient was overestimated) // also set correct sign by using f9=|b|*sgn(a), f12=sgn(a) (p6) fma.s0 f8=f11,f12,f9 nop.b 0 @@ 
-378,43 +368,59 @@ L(loop64): nop.m 0 // if f14 was RZ(Q), set remainder to f14 (p9) mov f6=f14 - br.cond.sptk L(loop64);; + br.cond.sptk loop64;; } -L(FMOD_X_NAN_INF): +FMOD_X_NAN_INF: // Y zero ? -{.mfi +{.mfi + nop.m 0 + fclass.m p10,p0=f8,0xc3 // Test x=nan + nop.i 0 +} +{.mfi nop.m 0 fma.s1 f10=f9,f1,f0 nop.i 0;; } + +{.mfi + nop.m 0 + fma.s0 f8=f8,f1,f0 + nop.i 0 +} {.mfi + nop.m 0 +(p10) fclass.m p10,p0=f9,0x07 // Test x=nan, and y=zero + nop.i 0;; +} +{.mfb nop.m 0 fcmp.eq.unc.s1 p11,p0=f10,f0 - nop.i 0;; +(p10) br.ret.spnt b0;; // Exit with result=x if x=nan and y=zero } {.mib nop.m 0 nop.i 0 // if Y zero - (p11) br.cond.spnt L(FMOD_Y_ZERO);; + (p11) br.cond.spnt FMOD_Y_ZERO;; } // X infinity? Return QNAN indefinite { .mfi - // set p7 t0 0 - cmp.ne p7,p0=r0,r0 -(p0) fclass.m.unc p8,p9 = f8, 0x23 - nop.i 999;; + // set p7 t0 0 + cmp.ne p7,p0=r0,r0 + fclass.m.unc p8,p9 = f8, 0x23 + nop.i 999;; } // Y NaN ? {.mfi nop.m 999 (p8) fclass.m p9,p8=f9,0xc3 - nop.i 0;; + nop.i 0;; } // Y not pseudo-zero ? (r29 holds significand) {.mii @@ -423,63 +429,63 @@ L(FMOD_X_NAN_INF): nop.i 0;; } {.mfi - nop.m 999 -(p8) frcpa.s0 f8,p0 = f8,f8 + nop.m 999 +(p8) frcpa.s0 f8,p0 = f8,f8 nop.i 0 -} +} { .mfi nop.m 999 - // also set Denormal flag if necessary + // also set Denormal flag if necessary (p7) fnma.s0 f9=f9,f1,f9 nop.i 999 ;; } { .mfb nop.m 999 -(p8) fma.s0 f8=f8,f1,f0 - nop.b 999 ;; +(p8) fma.s0 f8=f8,f1,f0 + nop.b 999 ;; } { .mfb nop.m 999 -(p9) frcpa.s0 f8,p7=f8,f9 - br.ret.sptk b0 ;; +(p9) frcpa.s0 f8,p7=f8,f9 + br.ret.sptk b0 ;; } -L(FMOD_Y_NAN_INF_ZERO): +FMOD_Y_NAN_INF_ZERO: // Y INF { .mfi nop.m 999 -(p0) fclass.m.unc p7,p0 = f9, 0x23 + fclass.m.unc p7,p0 = f9, 0x23 nop.i 999 ;; } { .mfb nop.m 999 -(p7) fma f8=f8,f1,f0 -(p7) br.ret.spnt b0 ;; +(p7) fma.s0 f8=f8,f1,f0 +(p7) br.ret.spnt b0 ;; } // Y NAN? 
{ .mfi nop.m 999 -(p0) fclass.m.unc p9,p10 = f9, 0xc3 + fclass.m.unc p9,p10 = f9, 0xc3 nop.i 999 ;; } { .mfi nop.m 999 -(p10) fclass.nm p9,p0 = f9, 0xff +(p10) fclass.nm p9,p0 = f9, 0xff nop.i 999 ;; } { .mfb nop.m 999 -(p9) fma f8=f9,f1,f0 -(p9) br.ret.spnt b0 ;; +(p9) fma.s0 f8=f9,f1,f0 +(p9) br.ret.spnt b0 ;; } -L(FMOD_Y_ZERO): +FMOD_Y_ZERO: // Y zero? Must be zero at this point // because it is the only choice left. // Return QNAN indefinite @@ -487,62 +493,59 @@ L(FMOD_Y_ZERO): {.mfi nop.m 0 // set Invalid - frcpa f12,p0=f0,f0 + frcpa.s0 f12,p0=f0,f0 nop.i 0 } // X NAN? { .mfi nop.m 999 -(p0) fclass.m.unc p9,p10 = f8, 0xc3 + fclass.m.unc p9,p10 = f8, 0xc3 nop.i 999 ;; } { .mfi nop.m 999 -(p10) fclass.nm p9,p10 = f8, 0xff +(p10) fclass.nm p9,p10 = f8, 0xff nop.i 999 ;; } {.mfi nop.m 999 - (p9) frcpa f11,p7=f8,f0 + (p9) frcpa.s0 f11,p7=f8,f0 nop.i 0;; } { .mfi nop.m 999 -(p10) frcpa f11,p7 = f9,f9 -(p0) mov GR_Parameter_TAG = 120 ;; +(p10) frcpa.s0 f11,p7 = f9,f9 + mov GR_Parameter_TAG = 120 ;; } { .mfi nop.m 999 -(p0) fmerge.s f10 = f8, f8 + fmerge.s f10 = f8, f8 nop.i 999 } { .mfb nop.m 999 -(p0) fma f8=f11,f1,f0 -(p0) br.sptk __libm_error_region;; + fma.s0 f8=f11,f1,f0 + br.sptk __libm_error_region;; } -.endp fmodl -ASM_SIZE_DIRECTIVE(fmodl) -ASM_SIZE_DIRECTIVE(__ieee754_fmodl) +GLOBAL_IEEE754_END(fmodl) -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi -.fframe 64 +.fframe 64 add sp=-64,sp // Create new stack nop.f 0 mov GR_SAVE_GP=gp // Save gp @@ -550,18 +553,18 @@ __libm_error_region: { .mmi stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack add GR_Parameter_X = 16,sp // Parameter 1 address -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 };; .body { .mib - stfe 
[GR_Parameter_X] = FR_X // Store Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y - nop.b 0 // Parameter 3 address + stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y + nop.b 0 // Parameter 3 address } { .mib stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack - add GR_Parameter_Y = -16,GR_Parameter_Y + add GR_Parameter_Y = -16,GR_Parameter_Y br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi @@ -576,15 +579,17 @@ __libm_error_region: mov b0 = GR_SAVE_B0 // Restore return address };; { .mib - mov gp = GR_SAVE_GP // Restore gp + mov gp = GR_SAVE_GP // Restore gp mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs br.ret.sptk b0 // Return -};; +};; + +LOCAL_LIBM_END(__libm_error_region) -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# + + diff --git a/sysdeps/ia64/fpu/e_hypot.S b/sysdeps/ia64/fpu/e_hypot.S index 113aac3461..885c819326 100644 --- a/sysdeps/ia64/fpu/e_hypot.S +++ b/sysdeps/ia64/fpu/e_hypot.S @@ -1,11 +1,10 @@ -.file "hypot.asm" +.file "hypot.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Shane Story, and Ping Tak Peter Tang of the -// Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -21,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -36,24 +35,27 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // -// ********************************************************************* +//********************************************************************* // // History: -// 2/02/00 hand-optimized -// 4/04/00 Unwind support added -// 6/20/00 new version -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 hand-optimized +// 04/04/00 Unwind support added +// 06/20/00 new version +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align +// 04/17/03 Added missing mutex directive // -// ********************************************************************* +//********************************************************************* // ___________ // Function: hypot(x,y) = |(x^2 + y^2) = for double precision values // x and y // Also provides cabs functionality. 
// -// ********************************************************************* +//********************************************************************* // // Resources Used: // @@ -68,7 +70,7 @@ // // Predicate Registers: p6 - p10 // -// ********************************************************************* +//********************************************************************* // // IEEE Special Conditions: // @@ -78,7 +80,7 @@ // hypot(QNaN and anything) = QNaN // hypot(SNaN and anything ) = QNaN // -// ********************************************************************* +//********************************************************************* // // Implementation: // x2 = x * x in double-extended @@ -86,9 +88,7 @@ // temp = x2 + y2 in double-extended // sqrt(temp) rounded to double // -// ********************************************************************* - -#include "libm_support.h" +//********************************************************************* GR_SAVE_PFS = r33 GR_SAVE_B0 = r34 @@ -103,23 +103,11 @@ FR_Y = f33 FR_RESULT = f8 .section .text -#ifndef _LIBC -.proc cabs# -.global cabs# -cabs: -.endp cabs -#endif -.proc hypot# -.global hypot# -.align 64 -hypot: -#ifdef _LIBC -.global __hypot -__hypot: -.global __ieee754_hypot -__ieee754_hypot: -#endif +LOCAL_LIBM_ENTRY(cabs) +LOCAL_LIBM_END(cabs) +GLOBAL_IEEE754_ENTRY(hypot) + {.mfi alloc r32= ar.pfs,0,4,4,0 // Compute x*x @@ -221,6 +209,7 @@ __ieee754_hypot: mov r2=0x107fb;; } +.pred.rel "mutex",p7,p8 {.mfb nop.m 0 // if f8=Infinity or f9=Zero, return |f8| @@ -394,11 +383,8 @@ __ieee754_hypot: // No overflow (p9) br.ret.sptk b0;; } -.endp hypot -ASM_SIZE_DIRECTIVE(hypot) - -.proc __libm_error_region -__libm_error_region: +GLOBAL_IEEE754_END(hypot) +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value @@ -445,7 +431,8 @@ __libm_error_region: mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs br.ret.sptk b0 // Return };; -.endp __libm_error_region 
-ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region#) .type __libm_error_support#,@function .global __libm_error_support# + + diff --git a/sysdeps/ia64/fpu/e_hypotf.S b/sysdeps/ia64/fpu/e_hypotf.S index 0a11ec5b41..633bb67e59 100644 --- a/sysdeps/ia64/fpu/e_hypotf.S +++ b/sysdeps/ia64/fpu/e_hypotf.S @@ -1,11 +1,10 @@ -.file "hypotf.asm" +.file "hypotf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Shane Story, and Ping Tak Peter Tang of the -// Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -21,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -36,24 +35,27 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
// -// ********************************************************************* +//********************************************************************* // // History: -// 2/02/00 hand-optimized -// 4/04/00 Unwind support added -// 6/26/00 new version -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 hand-optimized +// 04/04/00 Unwind support added +// 06/26/00 new version +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align +// 04/17/03 Added missing mutex directive // -// ********************************************************************* +//********************************************************************* // ___________ // Function: hypotf(x,y) = |(x^2 + y^2) = for single precision values // x and y // Also provides cabsf functionality. // -// ********************************************************************* +//********************************************************************* // // Resources Used: // @@ -68,7 +70,7 @@ // // Predicate Registers: p6 - p10 // -// ********************************************************************* +//********************************************************************* // // IEEE Special Conditions: // @@ -78,7 +80,7 @@ // hypotf(QNaN and anything) = QNaN // hypotf(SNaN and anything ) = QNaN // -// ********************************************************************* +//********************************************************************* // // Implementation: // x2 = x * x in double-extended @@ -86,9 +88,7 @@ // temp = x2 + y2 in double-extended // sqrt(temp) rounded to single precision // -// ********************************************************************* - -#include "libm_support.h" +//********************************************************************* GR_SAVE_PFS = r33 GR_SAVE_B0 = r34 
@@ -103,23 +103,10 @@ FR_Y = f15 FR_RESULT = f8 .section .text -#ifndef _LIBC -.proc cabsf# -.global cabsf# -cabsf: -.endp cabsf -#endif -.proc hypotf# -.global hypotf# -.align 64 -hypotf: -#ifdef _LIBC -.global __hypotf -__hypotf: -.global __ieee754_hypotf -__ieee754_hypotf: -#endif +LOCAL_LIBM_ENTRY(cabsf) +LOCAL_LIBM_END(cabsf) +GLOBAL_IEEE754_ENTRY(hypotf) {.mfi alloc r32= ar.pfs,0,4,4,0 // Compute x*x @@ -207,6 +194,7 @@ __ieee754_hypotf: nop.i 0;; } +.pred.rel "mutex",p7,p8 {.mfb nop.m 0 // if f8=Infinity or f9=Zero, return |f8| @@ -348,15 +336,12 @@ __ieee754_hypotf: // No overflow (p9) br.ret.sptk b0;; } -.endp hypotf -ASM_SIZE_DIRECTIVE(hypotf) - -.proc __libm_error_region -__libm_error_region: +GLOBAL_IEEE754_END(hypotf) +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mii add GR_Parameter_Y=-32,sp // Parameter 2 value -(p0) mov GR_Parameter_TAG = 47 + mov GR_Parameter_TAG = 47 .save ar.pfs,GR_SAVE_PFS mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } @@ -400,8 +385,9 @@ __libm_error_region: br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) + .type __libm_error_support#,@function .global __libm_error_support# + diff --git a/sysdeps/ia64/fpu/e_hypotl.S b/sysdeps/ia64/fpu/e_hypotl.S index 986faf6fcc..0aa94b69b8 100644 --- a/sysdeps/ia64/fpu/e_hypotl.S +++ b/sysdeps/ia64/fpu/e_hypotl.S @@ -1,11 +1,10 @@ -.file "hypotl.asm" +.file "hypotl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Shane Story, and Ping Tak Peter Tang of the -// Computational Software Lab, Intel Corporation. 
+// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -21,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -36,24 +35,26 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // -// ********************************************************************* +//********************************************************************* // // History: -// 2/02/00 hand-optimized -// 4/04/00 Unwind support added -// 6/20/00 new version -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 hand-optimized +// 04/04/00 Unwind support added +// 06/20/00 new version +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align // -// ********************************************************************* +//********************************************************************* // ___________ // Function: hypotl(x,y) = |(x^2 + y^2) = for double extended values // x and y // Also provides cabsl functionality. 
// -// ********************************************************************* +//********************************************************************* // // Resources Used: // @@ -68,7 +69,7 @@ // // Predicate Registers: p6 - p10 // -// ********************************************************************* +//********************************************************************* // // IEEE Special Conditions: // @@ -78,7 +79,7 @@ // hypotl(QNaN and anything) = QNaN // hypotl(SNaN and anything ) = QNaN // -// ********************************************************************* +//********************************************************************* // // Implementation: // x2 = x * x in double-extended @@ -86,9 +87,7 @@ // temp = x2 + y2 in double-extended // sqrt(temp) rounded to double extended // -// ********************************************************************* - -#include "libm_support.h" +//********************************************************************* GR_SAVE_PFS = r33 GR_SAVE_B0 = r34 @@ -103,23 +102,10 @@ FR_Y = f33 FR_RESULT = f8 .section .text -#ifndef _LIBC -.proc cabsl# -.global cabsl# -cabsl: -.endp cabsl -#endif -.proc hypotl# -.global hypotl# -.align 64 -hypotl: -#ifdef _LIBC -.global __hypotl -__hypotl: -.global __ieee754_hypotl -__ieee754_hypotl: -#endif +LOCAL_LIBM_ENTRY(cabsl) +LOCAL_LIBM_END(cabsl) +GLOBAL_IEEE754_ENTRY(hypotl) {.mfi alloc r32= ar.pfs,0,4,4,0 // Compute x*x @@ -434,11 +420,8 @@ __ieee754_hypotl: // No overflow (p9) br.ret.sptk b0;; } -.endp hypotl -ASM_SIZE_DIRECTIVE(hypotl) - -.proc __libm_error_region -__libm_error_region: +GLOBAL_IEEE754_END(hypotl) +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value @@ -485,7 +468,9 @@ __libm_error_region: mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region#) .type __libm_error_support#,@function .global 
__libm_error_support# + + + diff --git a/sysdeps/ia64/fpu/e_log.S b/sysdeps/ia64/fpu/e_log.S index 9ad1e5fe56..f80f153679 100644 --- a/sysdeps/ia64/fpu/e_log.S +++ b/sysdeps/ia64/fpu/e_log.S @@ -1,10 +1,10 @@ .file "log.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2002, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,1085 +20,1707 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial version -// 4/04/00 Unwind support added -// 6/16/00 Updated table to be rounded correctly -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial version +// 04/04/00 Unwind support added +// 06/16/00 Updated table to be rounded correctly +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. 
-// 8/17/00 Improved speed of main path by 5 cycles +// 08/17/00 Improved speed of main path by 5 cycles // Shortened path for x=1.0 -// 1/09/01 Improved speed, fixed flags for neg denormals -// +// 01/09/01 Improved speed, fixed flags for neg denormals +// 05/20/02 Cleaned up namespace and sf0 syntax +// 05/23/02 Modified algorithm. Now only one polynomial is used +// for |x-1| >= 1/256 and for |x-1| < 1/256 +// 12/11/02 Improved performance for Itanium 2 // // API //============================================================== // double log(double) // double log10(double) // +// // Overview of operation //============================================================== // Background +// ---------- // -// Consider x = 2^N 1.f1 f2 f3 f4...f63 -// Log(x) = log(frcpa(x) x/frcpa(x)) -// = log(1/frcpa(x)) + log(frcpa(x) x) -// = -log(frcpa(x)) + log(frcpa(x) x) +// This algorithm is based on fact that +// log(a b) = log(a) + log(b). +// In our case we have x = 2^N f, where 1 <= f < 2. +// So +// log(x) = log(2^N f) = log(2^N) + log(f) = n*log(2) + log(f) // -// frcpa(x) = 2^-N frcpa((1.f1 f2 ... f63) +// To calculate log(f) we do following +// log(f) = log(f * frcpa(f) / frcpa(f)) = +// = log(f * frcpa(f)) + log(1/frcpa(f)) // -// -log(frcpa(x)) = -log(C) -// = -log(2^-N) - log(frcpa(1.f1 f2 ... f63)) +// According to definition of IA-64's frcpa instruction it's a +// floating point that approximates 1/f using a lookup on the +// top of 8 bits of the input number's significand with relative +// error < 2^(-8.886). So we have following // -// -log(frcpa(x)) = -log(C) -// = +Nlog2 - log(frcpa(1.f1 f2 ... f63)) +// |(1/f - frcpa(f)) / (1/f))| = |1 - f*frcpa(f)| < 1/256 // -// -log(frcpa(x)) = -log(C) -// = +Nlog2 + log(frcpa(1.f1 f2 ... f63)) +// and // -// Log(x) = log(1/frcpa(x)) + log(frcpa(x) x) - -// Log(x) = +Nlog2 + log(1./frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x) -// Log(x) = +Nlog2 - log(/frcpa(1.f1 f2 ... 
f63)) + log(frcpa(x) x) -// Log(x) = +Nlog2 + T + log(frcpa(x) x) +// log(f) = log(f * frcpa(f)) + log(1/frcpa(f)) = +// = log(1 + r) + T +// +// The first value can be computed by polynomial P(r) approximating +// log(1 + r) on |r| < 1/256 and the second is a precomputed tabular +// value defined by the top 8 bits of f. // -// Log(x) = +Nlog2 + T + log(C x) +// Finally we have that log(x) ~ (N*log(2) + T) + P(r) // -// Cx = 1 + r +// Note that if the input argument is close to 1.0 (in our case it means +// that |1 - x| < 1/256) we can use just the polynomial approximation +// because x = 2^0 * f = f = 1 + r and +// log(x) = log(1 + r) ~ P(r) // -// Log(x) = +Nlog2 + T + log(1+r) -// Log(x) = +Nlog2 + T + Series( r - r^2/2 + r^3/3 - r^4/4 ....) // -// 1.f1 f2 ... f8 has 256 entries. -// They are 1 + k/2^8, k = 0 ... 255 -// These 256 values are the table entries. +// To compute log10(x) we use the simple identity +// +// log10(x) = log(x)/log(10) +// +// so we have that +// +// log10(x) = (N*log(2) + T + log(1+r)) / log(10) = +// = N*(log(2)/log(10)) + (T/log(10)) + log(1 + r)/log(10) +// // // Implementation -//=============== -// CASE 1: |x-1| >= 2^-6 -// C = frcpa(x) -// r = C * x - 1 +// -------------- +// It can be seen that the formulas for log and log10 differ from one another +// only by coefficients and tabular values. Namely, both log and log10 are +// calculated as (N*L1 + T) + L2*Series(r) where in case of log +// L1 = log(2) +// T = log(1/frcpa(x)) +// L2 = 1.0 +// and in case of log10 +// L1 = log(2)/log(10) +// T = log(1/frcpa(x))/log(10) +// L2 = 1.0/log(10) // -// Form rseries = r + P1*r^2 + P2*r^3 + P3*r^4 + P4*r^5 + P5*r^6 +// So the code is common, with two different entry points that set pointers +// to the base address of the corresponding data sets containing values +// of L2,T and prepare the integer representation of L1 needed for the following +// setf instruction.
// -// x = f * 2*n where f is 1.f_1f_2f_3....f_63 -// Nfloat = float(n) where n is the true unbiased exponent -// pre-index = f_1f_2....f_8 -// index = pre_index * 16 -// get the dxt table entry at index + offset = T +// Note that both log and log10 use common approximation polynomial +// it means we need only one set of coefficients of approximation. // -// result = (T + Nfloat * log(2)) + rseries // -// The T table is calculated as follows -// Form x_k = 1 + k/2^8 where k goes from 0... 255 -// y_k = frcpa(x_k) -// log(1/y_k) in quad and round to double-extended - -// CASE 2: |x-1| < 2^-6 -// w = x - 1 +// 1. |x-1| >= 1/256 +// InvX = frcpa(x) +// r = InvX*x - 1 +// P(r) = r*((r*A3 - A2) + r^4*((A4 + r*A5) + r^2*(A6 + r*A7)), +// all coefficients are calculated in quad and rounded to double +// precision. A7,A6,A5,A4 are stored in memory whereas A3 and A2 +// are created with setf. +// +// N = float(n) where n is true unbiased exponent of x +// +// T is tabular value of log(1/frcpa(x)) calculated in quad precision +// and represented by two floating-point numbers 64-bit Thi and 32-bit Tlo. +// To load Thi,Tlo we get bits from 55 to 62 of register format significand +// as index and calculate two addresses +// ad_Thi = Thi_table_base_addr + 8 * index +// ad_Tlo = Tlo_table_base_addr + 4 * index +// +// L2 (1.0 or 1.0/log(10) depending on function) is calculated in quad +// precision and rounded to double extended; it's loaded from memory. +// +// L1 (log(2) or log10(2) depending on function) is calculated in quad +// precision and represented by two floating-point 64-bit numbers L1hi,L1lo +// stored in memory. // -// Form wseries = w + Q1*w^2 + Q2*w^3 + ... + Q7*w^8 + Q8*w^9 +// And final result = ((L1hi*N + Thi) + (N*L1lo + Tlo)) + L2*P(r) +// +// +// 2.
|x-1| < 1/256 +// r = x - 1 +// P(r) = r*((r*A3 - A2) + r^4*((A4 + r*A5) + r^2*(A6 + r*A7)), +// A7,A6,A5,A4,A3,A2 are the same as in case |x-1| >= 1/256 +// +// And final results +// log(x) = P(r) +// log10(x) = L2*P(r) +// +// 3. How we determine whether the input argument satisfies |x-1| < 1/256 or not. +// +// To do it we analyze biased exponent and integer representation of +// input argument +// +// a) First we test whether the biased exponent is equal to 0xFFFE or 0xFFFF (i.e. +// we test whether 0.5 <= x < 2). This comparison can be performed using +// unsigned version of cmp instruction in such a way +// biased_exponent_of_x - 0xFFFE < 2 +// +// +// b) Second (in case when result of a) is true) we need to compare x +// with 1-1/256 and 1+1/256 or in double precision memory representation +// with 0x3FEFE00000000000 and 0x3FF0100000000000 correspondingly. +// This comparison can be made like in a), using unsigned +// version of cmp i.e. ix - 0x3FEFE00000000000 < 0x0000300000000000. +// 0x0000300000000000 is difference between 0x3FF0100000000000 and +// 0x3FEFE00000000000 +// +// Note: NaT, any NaNs, +/-INF, +/-0, negatives and unnormalized numbers are +// filtered and processed on special branches.
// -// result = wseries -// Special values +// +// Special values //============================================================== - - +// // log(+0) = -inf // log(-0) = -inf - -// log(+qnan) = +qnan -// log(-qnan) = -qnan -// log(+snan) = +qnan -// log(-snan) = -qnan - +// +// log(+qnan) = +qnan +// log(-qnan) = -qnan +// log(+snan) = +qnan +// log(-snan) = -qnan +// // log(-n) = QNAN Indefinite -// log(-inf) = QNAN Indefinite - +// log(-inf) = QNAN Indefinite +// // log(+inf) = +inf - +// +// // Registers used //============================================================== -// Floating Point registers used: +// Floating Point registers used: // f8, input -// f9 -> f15, f32 -> f68 - -// General registers used: -// r32 -> r51 - +// f7 -> f15, f32 -> f42 +// +// General registers used: +// r8 -> r11 +// r14 -> r23 +// // Predicate registers used: // p6 -> p15 -// p8 log base e -// p6 log base e special -// p9 used in the frcpa -// p13 log base e large W -// p14 log base e small w - -// p7 log base 10 -// p10 log base 10 large W -// p11 log base 10 small w -// p12 log base 10 special - -#include "libm_support.h" - // Assembly macros //============================================================== - -log_int_Nfloat = f9 -log_Nfloat = f10 - -log_P5 = f11 -log_P4 = f12 -log_P3 = f13 -log_P2 = f14 -log_half = f15 - -log_log2 = f32 -log_T = f33 - -log_rp_p4 = f34 -log_rp_p32 = f35 -log_rp_p2 = f36 -log_w6 = f37 -log_rp_p10 = f38 -log_rcube = f39 -log_rsq = f40 - -log_T_plus_Nlog2 = f41 -log_w3 = f42 - -log_r = f43 -log_C = f44 - -log_w = f45 -log_Q8 = f46 -log_Q7 = f47 -log_Q4 = f48 -log_Q3 = f49 -log_Q6 = f50 -log_Q5 = f51 -log_Q2 = f52 -log_Q1 = f53 -log_P1 = f53 - -log_rp_q7 = f54 -log_rp_q65 = f55 -log_Qlo = f56 - -log_rp_q3 = f57 -log_rp_q21 = f58 -log_Qhi = f59 - -log_wsq = f60 -log_w4 = f61 -log_Q = f62 - -log_inv_ln10 = f63 -log_log10_hi = f64 -log_log10_lo = f65 -log_rp_q10 = f66 -log_NORM_f8 = f67 -log_r2P_r = f68 - -// =================================== - 
-log_GR_exp_17_ones = r33 -log_GR_exp_16_ones = r34 -log_GR_exp_f8 = r35 -log_GR_signexp_f8 = r36 -log_GR_true_exp_f8 = r37 -log_GR_significand_f8 = r38 -log_GR_half_exp = r39 -log_GR_index = r39 -log_AD_1 = r40 -log_GR_signexp_w = r41 -log_GR_fff9 = r42 -log_AD_2 = r43 -log_GR_exp_w = r44 - -GR_SAVE_B0 = r45 -GR_SAVE_GP = r46 -GR_SAVE_PFS = r47 - -GR_Parameter_X = r48 -GR_Parameter_Y = r49 -GR_Parameter_RESULT = r50 -log_GR_tag = r51 - - -// Data tables +GR_TAG = r8 +GR_ad_1 = r8 +GR_ad_2 = r9 +GR_Exp = r10 +GR_N = r11 + +GR_x = r14 +GR_dx = r15 +GR_NearOne = r15 +GR_xorg = r16 +GR_mask = r16 +GR_05 = r17 +GR_A3 = r18 +GR_Sig = r19 +GR_Ind = r19 +GR_Nm1 = r20 +GR_bias = r21 +GR_ad_3 = r22 +GR_rexp = r23 + + +GR_SAVE_B0 = r33 +GR_SAVE_PFS = r34 +GR_SAVE_GP = r35 +GR_SAVE_SP = r36 + +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 + + + +FR_NormX = f7 +FR_RcpX = f9 +FR_tmp = f9 +FR_r = f10 +FR_r2 = f11 +FR_r4 = f12 +FR_N = f13 +FR_Ln2hi = f14 +FR_Ln2lo = f15 + +FR_A7 = f32 +FR_A6 = f33 +FR_A5 = f34 +FR_A4 = f35 +FR_A3 = f36 +FR_A2 = f37 + +FR_Thi = f38 +FR_NxLn2hipThi = f38 +FR_NxLn2pT = f38 +FR_Tlo = f39 +FR_NxLn2lopTlo = f39 + +FR_InvLn10 = f40 +FR_A32 = f41 +FR_A321 = f42 + + +FR_Y = f1 +FR_X = f10 +FR_RESULT = f8 + + +// Data //============================================================== - -#ifdef _LIBC -.rodata -#else -.data -#endif - +RODATA .align 16 -log_table_1: -ASM_TYPE_DIRECTIVE(log_table_1,@object) -data8 0xBFC5555DA7212371 // P5 -data8 0x3FC999A19EEF5826 // P4 -data8 0x3FBC756AC654273B // Q8 -data8 0xBFC001A42489AB4D // Q7 -data8 0x3FC99999999A169B // Q4 -data8 0xBFD00000000019AC // Q3 -ASM_SIZE_DIRECTIVE(log_table_1) -log_table_2: -ASM_TYPE_DIRECTIVE(log_table_2,@object) -data8 0xBFCFFFFFFFFEF009 // P3 -data8 0x3FD555555554ECB2 // P2 -data8 0x3FC2492479AA0DF8 // Q6 -data8 0xBFC5555544986F52 // Q5 -data8 0x3FD5555555555555 // Q2 -data8 0xBFE0000000000000 // Q1, P1 = -0.5 - - -data8 0xde5bd8a937287195, 
0x00003ffd // double-extended 1/ln(10) -data8 0xb17217f7d1cf79ac, 0x00003ffe // log2 -// b17217f7d1cf79ab c9e3b39803f2f6a - - -data8 0x80200aaeac44ef38 , 0x00003ff6 // log(1/frcpa(1+ 0/2^-8)) - -data8 0xc09090a2c35aa070 , 0x00003ff7 // log(1/frcpa(1+ 1/2^-8)) -data8 0xa0c94fcb41977c75 , 0x00003ff8 // log(1/frcpa(1+ 2/2^-8)) -data8 0xe18b9c263af83301 , 0x00003ff8 // log(1/frcpa(1+ 3/2^-8)) -data8 0x8d35c8d6399c30ea , 0x00003ff9 // log(1/frcpa(1+ 4/2^-8)) -data8 0xadd4d2ecd601cbb8 , 0x00003ff9 // log(1/frcpa(1+ 5/2^-8)) - -data8 0xce95403a192f9f01 , 0x00003ff9 // log(1/frcpa(1+ 6/2^-8)) -data8 0xeb59392cbcc01096 , 0x00003ff9 // log(1/frcpa(1+ 7/2^-8)) -data8 0x862c7d0cefd54c5d , 0x00003ffa // log(1/frcpa(1+ 8/2^-8)) -data8 0x94aa63c65e70d499 , 0x00003ffa // log(1/frcpa(1+ 9/2^-8)) -data8 0xa54a696d4b62b382 , 0x00003ffa // log(1/frcpa(1+ 10/2^-8)) - -data8 0xb3e4a796a5dac208 , 0x00003ffa // log(1/frcpa(1+ 11/2^-8)) -data8 0xc28c45b1878340a9 , 0x00003ffa // log(1/frcpa(1+ 12/2^-8)) -data8 0xd35c55f39d7a6235 , 0x00003ffa // log(1/frcpa(1+ 13/2^-8)) -data8 0xe220f037b954f1f5 , 0x00003ffa // log(1/frcpa(1+ 14/2^-8)) -data8 0xf0f3389b036834f3 , 0x00003ffa // log(1/frcpa(1+ 15/2^-8)) - -data8 0xffd3488d5c980465 , 0x00003ffa // log(1/frcpa(1+ 16/2^-8)) -data8 0x87609ce2ed300490 , 0x00003ffb // log(1/frcpa(1+ 17/2^-8)) -data8 0x8ede9321e8c85927 , 0x00003ffb // log(1/frcpa(1+ 18/2^-8)) -data8 0x96639427f2f8e2f4 , 0x00003ffb // log(1/frcpa(1+ 19/2^-8)) -data8 0x9defad3e8f73217b , 0x00003ffb // log(1/frcpa(1+ 20/2^-8)) - -data8 0xa582ebd50097029c , 0x00003ffb // log(1/frcpa(1+ 21/2^-8)) -data8 0xac06dbe75ab80fee , 0x00003ffb // log(1/frcpa(1+ 22/2^-8)) -data8 0xb3a78449b2d3ccca , 0x00003ffb // log(1/frcpa(1+ 23/2^-8)) -data8 0xbb4f79635ab46bb2 , 0x00003ffb // log(1/frcpa(1+ 24/2^-8)) -data8 0xc2fec93a83523f3f , 0x00003ffb // log(1/frcpa(1+ 25/2^-8)) - -data8 0xc99af2eaca4c4571 , 0x00003ffb // log(1/frcpa(1+ 26/2^-8)) -data8 0xd1581106472fa653 , 0x00003ffb // log(1/frcpa(1+ 
27/2^-8)) -data8 0xd8002560d4355f2e , 0x00003ffb // log(1/frcpa(1+ 28/2^-8)) -data8 0xdfcb43b4fe508632 , 0x00003ffb // log(1/frcpa(1+ 29/2^-8)) -data8 0xe67f6dff709d4119 , 0x00003ffb // log(1/frcpa(1+ 30/2^-8)) - -data8 0xed393b1c22351280 , 0x00003ffb // log(1/frcpa(1+ 31/2^-8)) -data8 0xf5192bff087bcc35 , 0x00003ffb // log(1/frcpa(1+ 32/2^-8)) -data8 0xfbdf4ff6dfef2fa3 , 0x00003ffb // log(1/frcpa(1+ 33/2^-8)) -data8 0x81559a97f92f9cc7 , 0x00003ffc // log(1/frcpa(1+ 34/2^-8)) -data8 0x84be72bce90266e8 , 0x00003ffc // log(1/frcpa(1+ 35/2^-8)) - -data8 0x88bc74113f23def2 , 0x00003ffc // log(1/frcpa(1+ 36/2^-8)) -data8 0x8c2ba3edf6799d11 , 0x00003ffc // log(1/frcpa(1+ 37/2^-8)) -data8 0x8f9dc92f92ea08b1 , 0x00003ffc // log(1/frcpa(1+ 38/2^-8)) -data8 0x9312e8f36efab5a7 , 0x00003ffc // log(1/frcpa(1+ 39/2^-8)) -data8 0x968b08643409ceb6 , 0x00003ffc // log(1/frcpa(1+ 40/2^-8)) - -data8 0x9a062cba08a1708c , 0x00003ffc // log(1/frcpa(1+ 41/2^-8)) -data8 0x9d845b3abf95485c , 0x00003ffc // log(1/frcpa(1+ 42/2^-8)) -data8 0xa06fd841bc001bb4 , 0x00003ffc // log(1/frcpa(1+ 43/2^-8)) -data8 0xa3f3a74652fbe0db , 0x00003ffc // log(1/frcpa(1+ 44/2^-8)) -data8 0xa77a8fb2336f20f5 , 0x00003ffc // log(1/frcpa(1+ 45/2^-8)) - -data8 0xab0497015d28b0a0 , 0x00003ffc // log(1/frcpa(1+ 46/2^-8)) -data8 0xae91c2be6ba6a615 , 0x00003ffc // log(1/frcpa(1+ 47/2^-8)) -data8 0xb189d1b99aebb20b , 0x00003ffc // log(1/frcpa(1+ 48/2^-8)) -data8 0xb51cced5de9c1b2c , 0x00003ffc // log(1/frcpa(1+ 49/2^-8)) -data8 0xb819bee9e720d42f , 0x00003ffc // log(1/frcpa(1+ 50/2^-8)) - -data8 0xbbb2a0947b093a5d , 0x00003ffc // log(1/frcpa(1+ 51/2^-8)) -data8 0xbf4ec1505811684a , 0x00003ffc // log(1/frcpa(1+ 52/2^-8)) -data8 0xc2535bacfa8975ff , 0x00003ffc // log(1/frcpa(1+ 53/2^-8)) -data8 0xc55a3eafad187eb8 , 0x00003ffc // log(1/frcpa(1+ 54/2^-8)) -data8 0xc8ff2484b2c0da74 , 0x00003ffc // log(1/frcpa(1+ 55/2^-8)) - -data8 0xcc0b1a008d53ab76 , 0x00003ffc // log(1/frcpa(1+ 56/2^-8)) -data8 0xcfb6203844b3209b , 
0x00003ffc // log(1/frcpa(1+ 57/2^-8)) -data8 0xd2c73949a47a19f5 , 0x00003ffc // log(1/frcpa(1+ 58/2^-8)) -data8 0xd5daae18b49d6695 , 0x00003ffc // log(1/frcpa(1+ 59/2^-8)) -data8 0xd8f08248cf7e8019 , 0x00003ffc // log(1/frcpa(1+ 60/2^-8)) - -data8 0xdca7749f1b3e540e , 0x00003ffc // log(1/frcpa(1+ 61/2^-8)) -data8 0xdfc28e033aaaf7c7 , 0x00003ffc // log(1/frcpa(1+ 62/2^-8)) -data8 0xe2e012a5f91d2f55 , 0x00003ffc // log(1/frcpa(1+ 63/2^-8)) -data8 0xe600064ed9e292a8 , 0x00003ffc // log(1/frcpa(1+ 64/2^-8)) -data8 0xe9226cce42b39f60 , 0x00003ffc // log(1/frcpa(1+ 65/2^-8)) - -data8 0xec4749fd97a28360 , 0x00003ffc // log(1/frcpa(1+ 66/2^-8)) -data8 0xef6ea1bf57780495 , 0x00003ffc // log(1/frcpa(1+ 67/2^-8)) -data8 0xf29877ff38809091 , 0x00003ffc // log(1/frcpa(1+ 68/2^-8)) -data8 0xf5c4d0b245cb89be , 0x00003ffc // log(1/frcpa(1+ 69/2^-8)) -data8 0xf8f3afd6fcdef3aa , 0x00003ffc // log(1/frcpa(1+ 70/2^-8)) - -data8 0xfc2519756be1abc7 , 0x00003ffc // log(1/frcpa(1+ 71/2^-8)) -data8 0xff59119f503e6832 , 0x00003ffc // log(1/frcpa(1+ 72/2^-8)) -data8 0x8147ce381ae0e146 , 0x00003ffd // log(1/frcpa(1+ 73/2^-8)) -data8 0x82e45f06cb1ad0f2 , 0x00003ffd // log(1/frcpa(1+ 74/2^-8)) -data8 0x842f5c7c573cbaa2 , 0x00003ffd // log(1/frcpa(1+ 75/2^-8)) - -data8 0x85ce471968c8893a , 0x00003ffd // log(1/frcpa(1+ 76/2^-8)) -data8 0x876e8305bc04066d , 0x00003ffd // log(1/frcpa(1+ 77/2^-8)) -data8 0x891012678031fbb3 , 0x00003ffd // log(1/frcpa(1+ 78/2^-8)) -data8 0x8a5f1493d766a05f , 0x00003ffd // log(1/frcpa(1+ 79/2^-8)) -data8 0x8c030c778c56fa00 , 0x00003ffd // log(1/frcpa(1+ 80/2^-8)) - -data8 0x8da85df17e31d9ae , 0x00003ffd // log(1/frcpa(1+ 81/2^-8)) -data8 0x8efa663e7921687e , 0x00003ffd // log(1/frcpa(1+ 82/2^-8)) -data8 0x90a22b6875c6a1f8 , 0x00003ffd // log(1/frcpa(1+ 83/2^-8)) -data8 0x91f62cc8f5d24837 , 0x00003ffd // log(1/frcpa(1+ 84/2^-8)) -data8 0x93a06cfc3857d980 , 0x00003ffd // log(1/frcpa(1+ 85/2^-8)) - -data8 0x94f66d5e6fd01ced , 0x00003ffd // log(1/frcpa(1+ 86/2^-8)) 
-data8 0x96a330156e6772f2 , 0x00003ffd // log(1/frcpa(1+ 87/2^-8)) -data8 0x97fb3582754ea25b , 0x00003ffd // log(1/frcpa(1+ 88/2^-8)) -data8 0x99aa8259aad1bbf2 , 0x00003ffd // log(1/frcpa(1+ 89/2^-8)) -data8 0x9b0492f6227ae4a8 , 0x00003ffd // log(1/frcpa(1+ 90/2^-8)) - -data8 0x9c5f8e199bf3a7a5 , 0x00003ffd // log(1/frcpa(1+ 91/2^-8)) -data8 0x9e1293b9998c1daa , 0x00003ffd // log(1/frcpa(1+ 92/2^-8)) -data8 0x9f6fa31e0b41f308 , 0x00003ffd // log(1/frcpa(1+ 93/2^-8)) -data8 0xa0cda11eaf46390e , 0x00003ffd // log(1/frcpa(1+ 94/2^-8)) -data8 0xa22c8f029cfa45aa , 0x00003ffd // log(1/frcpa(1+ 95/2^-8)) - -data8 0xa3e48badb7856b34 , 0x00003ffd // log(1/frcpa(1+ 96/2^-8)) -data8 0xa5459a0aa95849f9 , 0x00003ffd // log(1/frcpa(1+ 97/2^-8)) -data8 0xa6a79c84480cfebd , 0x00003ffd // log(1/frcpa(1+ 98/2^-8)) -data8 0xa80a946d0fcb3eb2 , 0x00003ffd // log(1/frcpa(1+ 99/2^-8)) -data8 0xa96e831a3ea7b314 , 0x00003ffd // log(1/frcpa(1+100/2^-8)) - -data8 0xaad369e3dc544e3b , 0x00003ffd // log(1/frcpa(1+101/2^-8)) -data8 0xac92e9588952c815 , 0x00003ffd // log(1/frcpa(1+102/2^-8)) -data8 0xadfa035aa1ed8fdc , 0x00003ffd // log(1/frcpa(1+103/2^-8)) -data8 0xaf6219eae1ad6e34 , 0x00003ffd // log(1/frcpa(1+104/2^-8)) -data8 0xb0cb2e6d8160f753 , 0x00003ffd // log(1/frcpa(1+105/2^-8)) - -data8 0xb2354249ad950f72 , 0x00003ffd // log(1/frcpa(1+106/2^-8)) -data8 0xb3a056e98ef4a3b4 , 0x00003ffd // log(1/frcpa(1+107/2^-8)) -data8 0xb50c6dba52c6292a , 0x00003ffd // log(1/frcpa(1+108/2^-8)) -data8 0xb679882c33876165 , 0x00003ffd // log(1/frcpa(1+109/2^-8)) -data8 0xb78c07429785cedc , 0x00003ffd // log(1/frcpa(1+110/2^-8)) - -data8 0xb8faeb8dc4a77d24 , 0x00003ffd // log(1/frcpa(1+111/2^-8)) -data8 0xba6ad77eb36ae0d6 , 0x00003ffd // log(1/frcpa(1+112/2^-8)) -data8 0xbbdbcc915e9bee50 , 0x00003ffd // log(1/frcpa(1+113/2^-8)) -data8 0xbd4dcc44f8cf12ef , 0x00003ffd // log(1/frcpa(1+114/2^-8)) -data8 0xbec0d81bf5b531fa , 0x00003ffd // log(1/frcpa(1+115/2^-8)) - -data8 0xc034f19c139186f4 , 0x00003ffd // 
log(1/frcpa(1+116/2^-8)) -data8 0xc14cb69f7c5e55ab , 0x00003ffd // log(1/frcpa(1+117/2^-8)) -data8 0xc2c2abbb6e5fd56f , 0x00003ffd // log(1/frcpa(1+118/2^-8)) -data8 0xc439b2c193e6771e , 0x00003ffd // log(1/frcpa(1+119/2^-8)) -data8 0xc553acb9d5c67733 , 0x00003ffd // log(1/frcpa(1+120/2^-8)) - -data8 0xc6cc96e441272441 , 0x00003ffd // log(1/frcpa(1+121/2^-8)) -data8 0xc8469753eca88c30 , 0x00003ffd // log(1/frcpa(1+122/2^-8)) -data8 0xc962cf3ce072b05c , 0x00003ffd // log(1/frcpa(1+123/2^-8)) -data8 0xcadeba8771f694aa , 0x00003ffd // log(1/frcpa(1+124/2^-8)) -data8 0xcc5bc08d1f72da94 , 0x00003ffd // log(1/frcpa(1+125/2^-8)) - -data8 0xcd7a3f99ea035c29 , 0x00003ffd // log(1/frcpa(1+126/2^-8)) -data8 0xcef93860c8a53c35 , 0x00003ffd // log(1/frcpa(1+127/2^-8)) -data8 0xd0192f68a7ed23df , 0x00003ffd // log(1/frcpa(1+128/2^-8)) -data8 0xd19a201127d3c645 , 0x00003ffd // log(1/frcpa(1+129/2^-8)) -data8 0xd2bb92f4061c172c , 0x00003ffd // log(1/frcpa(1+130/2^-8)) - -data8 0xd43e80b2ee8cc8fc , 0x00003ffd // log(1/frcpa(1+131/2^-8)) -data8 0xd56173601fc4ade4 , 0x00003ffd // log(1/frcpa(1+132/2^-8)) -data8 0xd6e6637efb54086f , 0x00003ffd // log(1/frcpa(1+133/2^-8)) -data8 0xd80ad9f58f3c8193 , 0x00003ffd // log(1/frcpa(1+134/2^-8)) -data8 0xd991d1d31aca41f8 , 0x00003ffd // log(1/frcpa(1+135/2^-8)) - -data8 0xdab7d02231484a93 , 0x00003ffd // log(1/frcpa(1+136/2^-8)) -data8 0xdc40d532cde49a54 , 0x00003ffd // log(1/frcpa(1+137/2^-8)) -data8 0xdd685f79ed8b265e , 0x00003ffd // log(1/frcpa(1+138/2^-8)) -data8 0xde9094bbc0e17b1d , 0x00003ffd // log(1/frcpa(1+139/2^-8)) -data8 0xe01c91b78440c425 , 0x00003ffd // log(1/frcpa(1+140/2^-8)) - -data8 0xe14658f26997e729 , 0x00003ffd // log(1/frcpa(1+141/2^-8)) -data8 0xe270cdc2391e0d23 , 0x00003ffd // log(1/frcpa(1+142/2^-8)) -data8 0xe3ffce3a2aa64922 , 0x00003ffd // log(1/frcpa(1+143/2^-8)) -data8 0xe52bdb274ed82887 , 0x00003ffd // log(1/frcpa(1+144/2^-8)) -data8 0xe6589852e75d7df6 , 0x00003ffd // log(1/frcpa(1+145/2^-8)) - -data8 
0xe786068c79937a7d , 0x00003ffd // log(1/frcpa(1+146/2^-8)) -data8 0xe91903adad100911 , 0x00003ffd // log(1/frcpa(1+147/2^-8)) -data8 0xea481236f7d35bb0 , 0x00003ffd // log(1/frcpa(1+148/2^-8)) -data8 0xeb77d48c692e6b14 , 0x00003ffd // log(1/frcpa(1+149/2^-8)) -data8 0xeca84b83d7297b87 , 0x00003ffd // log(1/frcpa(1+150/2^-8)) - -data8 0xedd977f4962aa158 , 0x00003ffd // log(1/frcpa(1+151/2^-8)) -data8 0xef7179a22f257754 , 0x00003ffd // log(1/frcpa(1+152/2^-8)) -data8 0xf0a450d139366ca7 , 0x00003ffd // log(1/frcpa(1+153/2^-8)) -data8 0xf1d7e0524ff9ffdb , 0x00003ffd // log(1/frcpa(1+154/2^-8)) -data8 0xf30c29036a8b6cae , 0x00003ffd // log(1/frcpa(1+155/2^-8)) - -data8 0xf4412bc411ea8d92 , 0x00003ffd // log(1/frcpa(1+156/2^-8)) -data8 0xf576e97564c8619d , 0x00003ffd // log(1/frcpa(1+157/2^-8)) -data8 0xf6ad62fa1b5f172f , 0x00003ffd // log(1/frcpa(1+158/2^-8)) -data8 0xf7e499368b55c542 , 0x00003ffd // log(1/frcpa(1+159/2^-8)) -data8 0xf91c8d10abaffe22 , 0x00003ffd // log(1/frcpa(1+160/2^-8)) - -data8 0xfa553f7018c966f3 , 0x00003ffd // log(1/frcpa(1+161/2^-8)) -data8 0xfb8eb13e185d802c , 0x00003ffd // log(1/frcpa(1+162/2^-8)) -data8 0xfcc8e3659d9bcbed , 0x00003ffd // log(1/frcpa(1+163/2^-8)) -data8 0xfe03d6d34d487fd2 , 0x00003ffd // log(1/frcpa(1+164/2^-8)) -data8 0xff3f8c7581e9f0ae , 0x00003ffd // log(1/frcpa(1+165/2^-8)) - -data8 0x803e029e280173ae , 0x00003ffe // log(1/frcpa(1+166/2^-8)) -data8 0x80dca10cc52d0757 , 0x00003ffe // log(1/frcpa(1+167/2^-8)) -data8 0x817ba200632755a1 , 0x00003ffe // log(1/frcpa(1+168/2^-8)) -data8 0x821b05f3b01d6774 , 0x00003ffe // log(1/frcpa(1+169/2^-8)) -data8 0x82bacd623ff19d06 , 0x00003ffe // log(1/frcpa(1+170/2^-8)) - -data8 0x835af8c88e7a8f47 , 0x00003ffe // log(1/frcpa(1+171/2^-8)) -data8 0x83c5f8299e2b4091 , 0x00003ffe // log(1/frcpa(1+172/2^-8)) -data8 0x8466cb43f3d87300 , 0x00003ffe // log(1/frcpa(1+173/2^-8)) -data8 0x850803a67c80ca4b , 0x00003ffe // log(1/frcpa(1+174/2^-8)) -data8 0x85a9a1d11a23b461 , 0x00003ffe // 
log(1/frcpa(1+175/2^-8)) - -data8 0x864ba644a18e6e05 , 0x00003ffe // log(1/frcpa(1+176/2^-8)) -data8 0x86ee1182dcc432f7 , 0x00003ffe // log(1/frcpa(1+177/2^-8)) -data8 0x875a925d7e48c316 , 0x00003ffe // log(1/frcpa(1+178/2^-8)) -data8 0x87fdaa109d23aef7 , 0x00003ffe // log(1/frcpa(1+179/2^-8)) -data8 0x88a129ed4becfaf2 , 0x00003ffe // log(1/frcpa(1+180/2^-8)) - -data8 0x89451278ecd7f9cf , 0x00003ffe // log(1/frcpa(1+181/2^-8)) -data8 0x89b29295f8432617 , 0x00003ffe // log(1/frcpa(1+182/2^-8)) -data8 0x8a572ac5a5496882 , 0x00003ffe // log(1/frcpa(1+183/2^-8)) -data8 0x8afc2d0ce3b2dadf , 0x00003ffe // log(1/frcpa(1+184/2^-8)) -data8 0x8b6a69c608cfd3af , 0x00003ffe // log(1/frcpa(1+185/2^-8)) - -data8 0x8c101e106e899a83 , 0x00003ffe // log(1/frcpa(1+186/2^-8)) -data8 0x8cb63de258f9d626 , 0x00003ffe // log(1/frcpa(1+187/2^-8)) -data8 0x8d2539c5bd19e2b1 , 0x00003ffe // log(1/frcpa(1+188/2^-8)) -data8 0x8dcc0e064b29e6f1 , 0x00003ffe // log(1/frcpa(1+189/2^-8)) -data8 0x8e734f45d88357ae , 0x00003ffe // log(1/frcpa(1+190/2^-8)) - -data8 0x8ee30cef034a20db , 0x00003ffe // log(1/frcpa(1+191/2^-8)) -data8 0x8f8b0515686d1d06 , 0x00003ffe // log(1/frcpa(1+192/2^-8)) -data8 0x90336bba039bf32f , 0x00003ffe // log(1/frcpa(1+193/2^-8)) -data8 0x90a3edd23d1c9d58 , 0x00003ffe // log(1/frcpa(1+194/2^-8)) -data8 0x914d0de2f5d61b32 , 0x00003ffe // log(1/frcpa(1+195/2^-8)) - -data8 0x91be0c20d28173b5 , 0x00003ffe // log(1/frcpa(1+196/2^-8)) -data8 0x9267e737c06cd34a , 0x00003ffe // log(1/frcpa(1+197/2^-8)) -data8 0x92d962ae6abb1237 , 0x00003ffe // log(1/frcpa(1+198/2^-8)) -data8 0x9383fa6afbe2074c , 0x00003ffe // log(1/frcpa(1+199/2^-8)) -data8 0x942f0421651c1c4e , 0x00003ffe // log(1/frcpa(1+200/2^-8)) - -data8 0x94a14a3845bb985e , 0x00003ffe // log(1/frcpa(1+201/2^-8)) -data8 0x954d133857f861e7 , 0x00003ffe // log(1/frcpa(1+202/2^-8)) -data8 0x95bfd96468e604c4 , 0x00003ffe // log(1/frcpa(1+203/2^-8)) -data8 0x9632d31cafafa858 , 0x00003ffe // log(1/frcpa(1+204/2^-8)) -data8 
0x96dfaabd86fa1647 , 0x00003ffe // log(1/frcpa(1+205/2^-8)) - -data8 0x9753261fcbb2a594 , 0x00003ffe // log(1/frcpa(1+206/2^-8)) -data8 0x9800c11b426b996d , 0x00003ffe // log(1/frcpa(1+207/2^-8)) -data8 0x9874bf4d45ae663c , 0x00003ffe // log(1/frcpa(1+208/2^-8)) -data8 0x99231f5ee9a74f79 , 0x00003ffe // log(1/frcpa(1+209/2^-8)) -data8 0x9997a18a56bcad28 , 0x00003ffe // log(1/frcpa(1+210/2^-8)) - -data8 0x9a46c873a3267e79 , 0x00003ffe // log(1/frcpa(1+211/2^-8)) -data8 0x9abbcfc621eb6cb6 , 0x00003ffe // log(1/frcpa(1+212/2^-8)) -data8 0x9b310cb0d354c990 , 0x00003ffe // log(1/frcpa(1+213/2^-8)) -data8 0x9be14cf9e1b3515c , 0x00003ffe // log(1/frcpa(1+214/2^-8)) -data8 0x9c5710b8cbb73a43 , 0x00003ffe // log(1/frcpa(1+215/2^-8)) - -data8 0x9ccd0abd301f399c , 0x00003ffe // log(1/frcpa(1+216/2^-8)) -data8 0x9d7e67f3bdce8888 , 0x00003ffe // log(1/frcpa(1+217/2^-8)) -data8 0x9df4ea81a99daa01 , 0x00003ffe // log(1/frcpa(1+218/2^-8)) -data8 0x9e6ba405a54514ba , 0x00003ffe // log(1/frcpa(1+219/2^-8)) -data8 0x9f1e21c8c7bb62b3 , 0x00003ffe // log(1/frcpa(1+220/2^-8)) - -data8 0x9f956593f6b6355c , 0x00003ffe // log(1/frcpa(1+221/2^-8)) -data8 0xa00ce1092e5498c3 , 0x00003ffe // log(1/frcpa(1+222/2^-8)) -data8 0xa0c08309c4b912c1 , 0x00003ffe // log(1/frcpa(1+223/2^-8)) -data8 0xa1388a8c6faa2afa , 0x00003ffe // log(1/frcpa(1+224/2^-8)) -data8 0xa1b0ca7095b5f985 , 0x00003ffe // log(1/frcpa(1+225/2^-8)) - -data8 0xa22942eb47534a00 , 0x00003ffe // log(1/frcpa(1+226/2^-8)) -data8 0xa2de62326449d0a3 , 0x00003ffe // log(1/frcpa(1+227/2^-8)) -data8 0xa357690f88bfe345 , 0x00003ffe // log(1/frcpa(1+228/2^-8)) -data8 0xa3d0a93f45169a4b , 0x00003ffe // log(1/frcpa(1+229/2^-8)) -data8 0xa44a22f7ffe65f30 , 0x00003ffe // log(1/frcpa(1+230/2^-8)) - -data8 0xa500c5e5b4c1aa36 , 0x00003ffe // log(1/frcpa(1+231/2^-8)) -data8 0xa57ad064eb2ebbc2 , 0x00003ffe // log(1/frcpa(1+232/2^-8)) -data8 0xa5f5152dedf4384e , 0x00003ffe // log(1/frcpa(1+233/2^-8)) -data8 0xa66f9478856233ec , 0x00003ffe // 
log(1/frcpa(1+234/2^-8)) -data8 0xa6ea4e7cca02c32e , 0x00003ffe // log(1/frcpa(1+235/2^-8)) - -data8 0xa765437325341ccf , 0x00003ffe // log(1/frcpa(1+236/2^-8)) -data8 0xa81e21e6c75b4020 , 0x00003ffe // log(1/frcpa(1+237/2^-8)) -data8 0xa899ab333fe2b9ca , 0x00003ffe // log(1/frcpa(1+238/2^-8)) -data8 0xa9157039c51ebe71 , 0x00003ffe // log(1/frcpa(1+239/2^-8)) -data8 0xa991713433c2b999 , 0x00003ffe // log(1/frcpa(1+240/2^-8)) - -data8 0xaa0dae5cbcc048b3 , 0x00003ffe // log(1/frcpa(1+241/2^-8)) -data8 0xaa8a27ede5eb13ad , 0x00003ffe // log(1/frcpa(1+242/2^-8)) -data8 0xab06de228a9e3499 , 0x00003ffe // log(1/frcpa(1+243/2^-8)) -data8 0xab83d135dc633301 , 0x00003ffe // log(1/frcpa(1+244/2^-8)) -data8 0xac3fb076adc7fe7a , 0x00003ffe // log(1/frcpa(1+245/2^-8)) - -data8 0xacbd3cbbe47988f1 , 0x00003ffe // log(1/frcpa(1+246/2^-8)) -data8 0xad3b06b1a5dc57c3 , 0x00003ffe // log(1/frcpa(1+247/2^-8)) -data8 0xadb90e94af887717 , 0x00003ffe // log(1/frcpa(1+248/2^-8)) -data8 0xae3754a218f7c816 , 0x00003ffe // log(1/frcpa(1+249/2^-8)) -data8 0xaeb5d9175437afa2 , 0x00003ffe // log(1/frcpa(1+250/2^-8)) - -data8 0xaf349c322e9c7cee , 0x00003ffe // log(1/frcpa(1+251/2^-8)) -data8 0xafb39e30d1768d1c , 0x00003ffe // log(1/frcpa(1+252/2^-8)) -data8 0xb032df51c2c93116 , 0x00003ffe // log(1/frcpa(1+253/2^-8)) -data8 0xb0b25fd3e6035ad9 , 0x00003ffe // log(1/frcpa(1+254/2^-8)) -data8 0xb1321ff67cba178c , 0x00003ffe // log(1/frcpa(1+255/2^-8)) -ASM_SIZE_DIRECTIVE(log_table_2) - - -.align 32 -.global log# -.global log10# +LOCAL_OBJECT_START(log_data) +// coefficients of polynomial approximation +data8 0x3FC2494104381A8E // A7 +data8 0xBFC5556D556BBB69 // A6 +// +// two parts of ln(2) +data8 0x3FE62E42FEF00000,0x3DD473DE6AF278ED +// +data8 0x8000000000000000,0x3FFF // 1.0 +// +data8 0x3FC999999988B5E9 // A5 +data8 0xBFCFFFFFFFF6FFF5 // A4 +// +// hi parts of ln(1/frcpa(1+i/256)), i=0...255 +data8 0x3F60040155D5889D // 0 +data8 0x3F78121214586B54 // 1 +data8 0x3F841929F96832EF // 2 +data8 
0x3F8C317384C75F06 // 3 +data8 0x3F91A6B91AC73386 // 4 +data8 0x3F95BA9A5D9AC039 // 5 +data8 0x3F99D2A8074325F3 // 6 +data8 0x3F9D6B2725979802 // 7 +data8 0x3FA0C58FA19DFAA9 // 8 +data8 0x3FA2954C78CBCE1A // 9 +data8 0x3FA4A94D2DA96C56 // 10 +data8 0x3FA67C94F2D4BB58 // 11 +data8 0x3FA85188B630F068 // 12 +data8 0x3FAA6B8ABE73AF4C // 13 +data8 0x3FAC441E06F72A9E // 14 +data8 0x3FAE1E6713606D06 // 15 +data8 0x3FAFFA6911AB9300 // 16 +data8 0x3FB0EC139C5DA600 // 17 +data8 0x3FB1DBD2643D190B // 18 +data8 0x3FB2CC7284FE5F1C // 19 +data8 0x3FB3BDF5A7D1EE64 // 20 +data8 0x3FB4B05D7AA012E0 // 21 +data8 0x3FB580DB7CEB5701 // 22 +data8 0x3FB674F089365A79 // 23 +data8 0x3FB769EF2C6B568D // 24 +data8 0x3FB85FD927506A47 // 25 +data8 0x3FB9335E5D594988 // 26 +data8 0x3FBA2B0220C8E5F4 // 27 +data8 0x3FBB0004AC1A86AB // 28 +data8 0x3FBBF968769FCA10 // 29 +data8 0x3FBCCFEDBFEE13A8 // 30 +data8 0x3FBDA727638446A2 // 31 +data8 0x3FBEA3257FE10F79 // 32 +data8 0x3FBF7BE9FEDBFDE5 // 33 +data8 0x3FC02AB352FF25F3 // 34 +data8 0x3FC097CE579D204C // 35 +data8 0x3FC1178E8227E47B // 36 +data8 0x3FC185747DBECF33 // 37 +data8 0x3FC1F3B925F25D41 // 38 +data8 0x3FC2625D1E6DDF56 // 39 +data8 0x3FC2D1610C868139 // 40 +data8 0x3FC340C59741142E // 41 +data8 0x3FC3B08B6757F2A9 // 42 +data8 0x3FC40DFB08378003 // 43 +data8 0x3FC47E74E8CA5F7C // 44 +data8 0x3FC4EF51F6466DE4 // 45 +data8 0x3FC56092E02BA516 // 46 +data8 0x3FC5D23857CD74D4 // 47 +data8 0x3FC6313A37335D76 // 48 +data8 0x3FC6A399DABBD383 // 49 +data8 0x3FC70337DD3CE41A // 50 +data8 0x3FC77654128F6127 // 51 +data8 0x3FC7E9D82A0B022D // 52 +data8 0x3FC84A6B759F512E // 53 +data8 0x3FC8AB47D5F5A30F // 54 +data8 0x3FC91FE49096581B // 55 +data8 0x3FC981634011AA75 // 56 +data8 0x3FC9F6C407089664 // 57 +data8 0x3FCA58E729348F43 // 58 +data8 0x3FCABB55C31693AC // 59 +data8 0x3FCB1E104919EFD0 // 60 +data8 0x3FCB94EE93E367CA // 61 +data8 0x3FCBF851C067555E // 62 +data8 0x3FCC5C0254BF23A5 // 63 +data8 0x3FCCC000C9DB3C52 // 64 +data8 0x3FCD244D99C85673 // 
65 +data8 0x3FCD88E93FB2F450 // 66 +data8 0x3FCDEDD437EAEF00 // 67 +data8 0x3FCE530EFFE71012 // 68 +data8 0x3FCEB89A1648B971 // 69 +data8 0x3FCF1E75FADF9BDE // 70 +data8 0x3FCF84A32EAD7C35 // 71 +data8 0x3FCFEB2233EA07CD // 72 +data8 0x3FD028F9C7035C1C // 73 +data8 0x3FD05C8BE0D9635A // 74 +data8 0x3FD085EB8F8AE797 // 75 +data8 0x3FD0B9C8E32D1911 // 76 +data8 0x3FD0EDD060B78080 // 77 +data8 0x3FD122024CF0063F // 78 +data8 0x3FD14BE2927AECD4 // 79 +data8 0x3FD180618EF18ADF // 80 +data8 0x3FD1B50BBE2FC63B // 81 +data8 0x3FD1DF4CC7CF242D // 82 +data8 0x3FD214456D0EB8D4 // 83 +data8 0x3FD23EC5991EBA49 // 84 +data8 0x3FD2740D9F870AFB // 85 +data8 0x3FD29ECDABCDFA03 // 86 +data8 0x3FD2D46602ADCCEE // 87 +data8 0x3FD2FF66B04EA9D4 // 88 +data8 0x3FD335504B355A37 // 89 +data8 0x3FD360925EC44F5C // 90 +data8 0x3FD38BF1C3337E74 // 91 +data8 0x3FD3C25277333183 // 92 +data8 0x3FD3EDF463C1683E // 93 +data8 0x3FD419B423D5E8C7 // 94 +data8 0x3FD44591E0539F48 // 95 +data8 0x3FD47C9175B6F0AD // 96 +data8 0x3FD4A8B341552B09 // 97 +data8 0x3FD4D4F39089019F // 98 +data8 0x3FD501528DA1F967 // 99 +data8 0x3FD52DD06347D4F6 // 100 +data8 0x3FD55A6D3C7B8A89 // 101 +data8 0x3FD5925D2B112A59 // 102 +data8 0x3FD5BF406B543DB1 // 103 +data8 0x3FD5EC433D5C35AD // 104 +data8 0x3FD61965CDB02C1E // 105 +data8 0x3FD646A84935B2A1 // 106 +data8 0x3FD6740ADD31DE94 // 107 +data8 0x3FD6A18DB74A58C5 // 108 +data8 0x3FD6CF31058670EC // 109 +data8 0x3FD6F180E852F0B9 // 110 +data8 0x3FD71F5D71B894EF // 111 +data8 0x3FD74D5AEFD66D5C // 112 +data8 0x3FD77B79922BD37D // 113 +data8 0x3FD7A9B9889F19E2 // 114 +data8 0x3FD7D81B037EB6A6 // 115 +data8 0x3FD8069E33827230 // 116 +data8 0x3FD82996D3EF8BCA // 117 +data8 0x3FD85855776DCBFA // 118 +data8 0x3FD8873658327CCE // 119 +data8 0x3FD8AA75973AB8CE // 120 +data8 0x3FD8D992DC8824E4 // 121 +data8 0x3FD908D2EA7D9511 // 122 +data8 0x3FD92C59E79C0E56 // 123 +data8 0x3FD95BD750EE3ED2 // 124 +data8 0x3FD98B7811A3EE5B // 125 +data8 0x3FD9AF47F33D406B // 126 +data8 
0x3FD9DF270C1914A7 // 127 +data8 0x3FDA0325ED14FDA4 // 128 +data8 0x3FDA33440224FA78 // 129 +data8 0x3FDA57725E80C382 // 130 +data8 0x3FDA87D0165DD199 // 131 +data8 0x3FDAAC2E6C03F895 // 132 +data8 0x3FDADCCC6FDF6A81 // 133 +data8 0x3FDB015B3EB1E790 // 134 +data8 0x3FDB323A3A635948 // 135 +data8 0x3FDB56FA04462909 // 136 +data8 0x3FDB881AA659BC93 // 137 +data8 0x3FDBAD0BEF3DB164 // 138 +data8 0x3FDBD21297781C2F // 139 +data8 0x3FDC039236F08818 // 140 +data8 0x3FDC28CB1E4D32FC // 141 +data8 0x3FDC4E19B84723C1 // 142 +data8 0x3FDC7FF9C74554C9 // 143 +data8 0x3FDCA57B64E9DB05 // 144 +data8 0x3FDCCB130A5CEBAF // 145 +data8 0x3FDCF0C0D18F326F // 146 +data8 0x3FDD232075B5A201 // 147 +data8 0x3FDD490246DEFA6B // 148 +data8 0x3FDD6EFA918D25CD // 149 +data8 0x3FDD9509707AE52F // 150 +data8 0x3FDDBB2EFE92C554 // 151 +data8 0x3FDDEE2F3445E4AE // 152 +data8 0x3FDE148A1A2726CD // 153 +data8 0x3FDE3AFC0A49FF3F // 154 +data8 0x3FDE6185206D516D // 155 +data8 0x3FDE882578823D51 // 156 +data8 0x3FDEAEDD2EAC990C // 157 +data8 0x3FDED5AC5F436BE2 // 158 +data8 0x3FDEFC9326D16AB8 // 159 +data8 0x3FDF2391A21575FF // 160 +data8 0x3FDF4AA7EE03192C // 161 +data8 0x3FDF71D627C30BB0 // 162 +data8 0x3FDF991C6CB3B379 // 163 +data8 0x3FDFC07ADA69A90F // 164 +data8 0x3FDFE7F18EB03D3E // 165 +data8 0x3FE007C053C5002E // 166 +data8 0x3FE01B942198A5A0 // 167 +data8 0x3FE02F74400C64EA // 168 +data8 0x3FE04360BE7603AC // 169 +data8 0x3FE05759AC47FE33 // 170 +data8 0x3FE06B5F1911CF51 // 171 +data8 0x3FE078BF0533C568 // 172 +data8 0x3FE08CD9687E7B0E // 173 +data8 0x3FE0A10074CF9019 // 174 +data8 0x3FE0B5343A234476 // 175 +data8 0x3FE0C974C89431CD // 176 +data8 0x3FE0DDC2305B9886 // 177 +data8 0x3FE0EB524BAFC918 // 178 +data8 0x3FE0FFB54213A475 // 179 +data8 0x3FE114253DA97D9F // 180 +data8 0x3FE128A24F1D9AFF // 181 +data8 0x3FE1365252BF0864 // 182 +data8 0x3FE14AE558B4A92D // 183 +data8 0x3FE15F85A19C765B // 184 +data8 0x3FE16D4D38C119FA // 185 +data8 0x3FE18203C20DD133 // 186 +data8 0x3FE196C7BC4B1F3A 
// 187 +data8 0x3FE1A4A738B7A33C // 188 +data8 0x3FE1B981C0C9653C // 189 +data8 0x3FE1CE69E8BB106A // 190 +data8 0x3FE1DC619DE06944 // 191 +data8 0x3FE1F160A2AD0DA3 // 192 +data8 0x3FE2066D7740737E // 193 +data8 0x3FE2147DBA47A393 // 194 +data8 0x3FE229A1BC5EBAC3 // 195 +data8 0x3FE237C1841A502E // 196 +data8 0x3FE24CFCE6F80D9A // 197 +data8 0x3FE25B2C55CD5762 // 198 +data8 0x3FE2707F4D5F7C40 // 199 +data8 0x3FE285E0842CA383 // 200 +data8 0x3FE294294708B773 // 201 +data8 0x3FE2A9A2670AFF0C // 202 +data8 0x3FE2B7FB2C8D1CC0 // 203 +data8 0x3FE2C65A6395F5F5 // 204 +data8 0x3FE2DBF557B0DF42 // 205 +data8 0x3FE2EA64C3F97654 // 206 +data8 0x3FE3001823684D73 // 207 +data8 0x3FE30E97E9A8B5CC // 208 +data8 0x3FE32463EBDD34E9 // 209 +data8 0x3FE332F4314AD795 // 210 +data8 0x3FE348D90E7464CF // 211 +data8 0x3FE35779F8C43D6D // 212 +data8 0x3FE36621961A6A99 // 213 +data8 0x3FE37C299F3C366A // 214 +data8 0x3FE38AE2171976E7 // 215 +data8 0x3FE399A157A603E7 // 216 +data8 0x3FE3AFCCFE77B9D1 // 217 +data8 0x3FE3BE9D503533B5 // 218 +data8 0x3FE3CD7480B4A8A2 // 219 +data8 0x3FE3E3C43918F76C // 220 +data8 0x3FE3F2ACB27ED6C6 // 221 +data8 0x3FE4019C2125CA93 // 222 +data8 0x3FE4181061389722 // 223 +data8 0x3FE42711518DF545 // 224 +data8 0x3FE436194E12B6BF // 225 +data8 0x3FE445285D68EA69 // 226 +data8 0x3FE45BCC464C893A // 227 +data8 0x3FE46AED21F117FC // 228 +data8 0x3FE47A1527E8A2D3 // 229 +data8 0x3FE489445EFFFCCB // 230 +data8 0x3FE4A018BCB69835 // 231 +data8 0x3FE4AF5A0C9D65D7 // 232 +data8 0x3FE4BEA2A5BDBE87 // 233 +data8 0x3FE4CDF28F10AC46 // 234 +data8 0x3FE4DD49CF994058 // 235 +data8 0x3FE4ECA86E64A683 // 236 +data8 0x3FE503C43CD8EB68 // 237 +data8 0x3FE513356667FC57 // 238 +data8 0x3FE522AE0738A3D7 // 239 +data8 0x3FE5322E26867857 // 240 +data8 0x3FE541B5CB979809 // 241 +data8 0x3FE55144FDBCBD62 // 242 +data8 0x3FE560DBC45153C6 // 243 +data8 0x3FE5707A26BB8C66 // 244 +data8 0x3FE587F60ED5B8FF // 245 +data8 0x3FE597A7977C8F31 // 246 +data8 0x3FE5A760D634BB8A // 247 +data8 
0x3FE5B721D295F10E // 248 +data8 0x3FE5C6EA94431EF9 // 249 +data8 0x3FE5D6BB22EA86F5 // 250 +data8 0x3FE5E6938645D38F // 251 +data8 0x3FE5F673C61A2ED1 // 252 +data8 0x3FE6065BEA385926 // 253 +data8 0x3FE6164BFA7CC06B // 254 +data8 0x3FE62643FECF9742 // 255 +// +// lo parts of ln(1/frcpa(1+i/256)), i=0...255 +data4 0x20E70672 // 0 +data4 0x1F60A5D0 // 1 +data4 0x218EABA0 // 2 +data4 0x21403104 // 3 +data4 0x20E9B54E // 4 +data4 0x21EE1382 // 5 +data4 0x226014E3 // 6 +data4 0x2095E5C9 // 7 +data4 0x228BA9D4 // 8 +data4 0x22932B86 // 9 +data4 0x22608A57 // 10 +data4 0x220209F3 // 11 +data4 0x212882CC // 12 +data4 0x220D46E2 // 13 +data4 0x21FA4C28 // 14 +data4 0x229E5BD9 // 15 +data4 0x228C9838 // 16 +data4 0x2311F954 // 17 +data4 0x221365DF // 18 +data4 0x22BD0CB3 // 19 +data4 0x223D4BB7 // 20 +data4 0x22A71BBE // 21 +data4 0x237DB2FA // 22 +data4 0x23194C9D // 23 +data4 0x22EC639E // 24 +data4 0x2367E669 // 25 +data4 0x232E1D5F // 26 +data4 0x234A639B // 27 +data4 0x2365C0E0 // 28 +data4 0x234646C1 // 29 +data4 0x220CBF9C // 30 +data4 0x22A00FD4 // 31 +data4 0x2306A3F2 // 32 +data4 0x23745A9B // 33 +data4 0x2398D756 // 34 +data4 0x23DD0B6A // 35 +data4 0x23DE338B // 36 +data4 0x23A222DF // 37 +data4 0x223164F8 // 38 +data4 0x23B4E87B // 39 +data4 0x23D6CCB8 // 40 +data4 0x220C2099 // 41 +data4 0x21B86B67 // 42 +data4 0x236D14F1 // 43 +data4 0x225A923F // 44 +data4 0x22748723 // 45 +data4 0x22200D13 // 46 +data4 0x23C296EA // 47 +data4 0x2302AC38 // 48 +data4 0x234B1996 // 49 +data4 0x2385E298 // 50 +data4 0x23175BE5 // 51 +data4 0x2193F482 // 52 +data4 0x23BFEA90 // 53 +data4 0x23D70A0C // 54 +data4 0x231CF30A // 55 +data4 0x235D9E90 // 56 +data4 0x221AD0CB // 57 +data4 0x22FAA08B // 58 +data4 0x23D29A87 // 59 +data4 0x20C4B2FE // 60 +data4 0x2381B8B7 // 61 +data4 0x23F8D9FC // 62 +data4 0x23EAAE7B // 63 +data4 0x2329E8AA // 64 +data4 0x23EC0322 // 65 +data4 0x2357FDCB // 66 +data4 0x2392A9AD // 67 +data4 0x22113B02 // 68 +data4 0x22DEE901 // 69 +data4 0x236A6D14 // 
70 +data4 0x2371D33E // 71 +data4 0x2146F005 // 72 +data4 0x23230B06 // 73 +data4 0x22F1C77D // 74 +data4 0x23A89FA3 // 75 +data4 0x231D1241 // 76 +data4 0x244DA96C // 77 +data4 0x23ECBB7D // 78 +data4 0x223E42B4 // 79 +data4 0x23801BC9 // 80 +data4 0x23573263 // 81 +data4 0x227C1158 // 82 +data4 0x237BD749 // 83 +data4 0x21DDBAE9 // 84 +data4 0x23401735 // 85 +data4 0x241D9DEE // 86 +data4 0x23BC88CB // 87 +data4 0x2396D5F1 // 88 +data4 0x23FC89CF // 89 +data4 0x2414F9A2 // 90 +data4 0x2474A0F5 // 91 +data4 0x24354B60 // 92 +data4 0x23C1EB40 // 93 +data4 0x2306DD92 // 94 +data4 0x24353B6B // 95 +data4 0x23CD1701 // 96 +data4 0x237C7A1C // 97 +data4 0x245793AA // 98 +data4 0x24563695 // 99 +data4 0x23C51467 // 100 +data4 0x24476B68 // 101 +data4 0x212585A9 // 102 +data4 0x247B8293 // 103 +data4 0x2446848A // 104 +data4 0x246A53F8 // 105 +data4 0x246E496D // 106 +data4 0x23ED1D36 // 107 +data4 0x2314C258 // 108 +data4 0x233244A7 // 109 +data4 0x245B7AF0 // 110 +data4 0x24247130 // 111 +data4 0x22D67B38 // 112 +data4 0x2449F620 // 113 +data4 0x23BBC8B8 // 114 +data4 0x237D3BA0 // 115 +data4 0x245E8F13 // 116 +data4 0x2435573F // 117 +data4 0x242DE666 // 118 +data4 0x2463BC10 // 119 +data4 0x2466587D // 120 +data4 0x2408144B // 121 +data4 0x2405F0E5 // 122 +data4 0x22381CFF // 123 +data4 0x24154F9B // 124 +data4 0x23A4E96E // 125 +data4 0x24052967 // 126 +data4 0x2406963F // 127 +data4 0x23F7D3CB // 128 +data4 0x2448AFF4 // 129 +data4 0x24657A21 // 130 +data4 0x22FBC230 // 131 +data4 0x243C8DEA // 132 +data4 0x225DC4B7 // 133 +data4 0x23496EBF // 134 +data4 0x237C2B2B // 135 +data4 0x23A4A5B1 // 136 +data4 0x2394E9D1 // 137 +data4 0x244BC950 // 138 +data4 0x23C7448F // 139 +data4 0x2404A1AD // 140 +data4 0x246511D5 // 141 +data4 0x24246526 // 142 +data4 0x23111F57 // 143 +data4 0x22868951 // 144 +data4 0x243EB77F // 145 +data4 0x239F3DFF // 146 +data4 0x23089666 // 147 +data4 0x23EBFA6A // 148 +data4 0x23C51312 // 149 +data4 0x23E1DD5E // 150 +data4 0x232C0944 // 151 
+data4 0x246A741F // 152 +data4 0x2414DF8D // 153 +data4 0x247B5546 // 154 +data4 0x2415C980 // 155 +data4 0x24324ABD // 156 +data4 0x234EB5E5 // 157 +data4 0x2465E43E // 158 +data4 0x242840D1 // 159 +data4 0x24444057 // 160 +data4 0x245E56F0 // 161 +data4 0x21AE30F8 // 162 +data4 0x23FB3283 // 163 +data4 0x247A4D07 // 164 +data4 0x22AE314D // 165 +data4 0x246B7727 // 166 +data4 0x24EAD526 // 167 +data4 0x24B41DC9 // 168 +data4 0x24EE8062 // 169 +data4 0x24A0C7C4 // 170 +data4 0x24E8DA67 // 171 +data4 0x231120F7 // 172 +data4 0x24401FFB // 173 +data4 0x2412DD09 // 174 +data4 0x248C131A // 175 +data4 0x24C0A7CE // 176 +data4 0x243DD4C8 // 177 +data4 0x24457FEB // 178 +data4 0x24DEEFBB // 179 +data4 0x243C70AE // 180 +data4 0x23E7A6FA // 181 +data4 0x24C2D311 // 182 +data4 0x23026255 // 183 +data4 0x2437C9B9 // 184 +data4 0x246BA847 // 185 +data4 0x2420B448 // 186 +data4 0x24C4CF5A // 187 +data4 0x242C4981 // 188 +data4 0x24DE1525 // 189 +data4 0x24F5CC33 // 190 +data4 0x235A85DA // 191 +data4 0x24A0B64F // 192 +data4 0x244BA0A4 // 193 +data4 0x24AAF30A // 194 +data4 0x244C86F9 // 195 +data4 0x246D5B82 // 196 +data4 0x24529347 // 197 +data4 0x240DD008 // 198 +data4 0x24E98790 // 199 +data4 0x2489B0CE // 200 +data4 0x22BC29AC // 201 +data4 0x23F37C7A // 202 +data4 0x24987FE8 // 203 +data4 0x22AFE20B // 204 +data4 0x24C8D7C2 // 205 +data4 0x24B28B7D // 206 +data4 0x23B6B271 // 207 +data4 0x24C77CB6 // 208 +data4 0x24EF1DCA // 209 +data4 0x24A4F0AC // 210 +data4 0x24CF113E // 211 +data4 0x2496BBAB // 212 +data4 0x23C7CC8A // 213 +data4 0x23AE3961 // 214 +data4 0x2410A895 // 215 +data4 0x23CE3114 // 216 +data4 0x2308247D // 217 +data4 0x240045E9 // 218 +data4 0x24974F60 // 219 +data4 0x242CB39F // 220 +data4 0x24AB8D69 // 221 +data4 0x23436788 // 222 +data4 0x24305E9E // 223 +data4 0x243E71A9 // 224 +data4 0x23C2A6B3 // 225 +data4 0x23FFE6CF // 226 +data4 0x2322D801 // 227 +data4 0x24515F21 // 228 +data4 0x2412A0D6 // 229 +data4 0x24E60D44 // 230 +data4 0x240D9251 // 231 
+data4 0x247076E2 // 232 +data4 0x229B101B // 233 +data4 0x247B12DE // 234 +data4 0x244B9127 // 235 +data4 0x2499EC42 // 236 +data4 0x21FC3963 // 237 +data4 0x23E53266 // 238 +data4 0x24CE102D // 239 +data4 0x23CC45D2 // 240 +data4 0x2333171D // 241 +data4 0x246B3533 // 242 +data4 0x24931129 // 243 +data4 0x24405FFA // 244 +data4 0x24CF464D // 245 +data4 0x237095CD // 246 +data4 0x24F86CBD // 247 +data4 0x24E2D84B // 248 +data4 0x21ACBB44 // 249 +data4 0x24F43A8C // 250 +data4 0x249DB931 // 251 +data4 0x24A385EF // 252 +data4 0x238B1279 // 253 +data4 0x2436213E // 254 +data4 0x24F18A3B // 255 +LOCAL_OBJECT_END(log_data) + + +LOCAL_OBJECT_START(log10_data) +// coefficients of polynomial approximation +data8 0x3FC2494104381A8E // A7 +data8 0xBFC5556D556BBB69 // A6 +// +// two parts of ln(2)/ln(10) +data8 0x3FD3441350900000, 0x3DCEF3FDE623E256 +// +data8 0xDE5BD8A937287195,0x3FFD // 1/ln(10) +// +data8 0x3FC999999988B5E9 // A5 +data8 0xBFCFFFFFFFF6FFF5 // A4 +// +// Hi parts of ln(1/frcpa(1+i/256))/ln(10), i=0...255 +data8 0x3F4BD27045BFD024 // 0 +data8 0x3F64E84E793A474A // 1 +data8 0x3F7175085AB85FF0 // 2 +data8 0x3F787CFF9D9147A5 // 3 +data8 0x3F7EA9D372B89FC8 // 4 +data8 0x3F82DF9D95DA961C // 5 +data8 0x3F866DF172D6372B // 6 +data8 0x3F898D79EF5EEDEF // 7 +data8 0x3F8D22ADF3F9579C // 8 +data8 0x3F9024231D30C398 // 9 +data8 0x3F91F23A98897D49 // 10 +data8 0x3F93881A7B818F9E // 11 +data8 0x3F951F6E1E759E35 // 12 +data8 0x3F96F2BCE7ADC5B4 // 13 +data8 0x3F988D362CDF359E // 14 +data8 0x3F9A292BAF010981 // 15 +data8 0x3F9BC6A03117EB97 // 16 +data8 0x3F9D65967DE3AB08 // 17 +data8 0x3F9F061167FC31E7 // 18 +data8 0x3FA05409E4F7819B // 19 +data8 0x3FA125D0432EA20D // 20 +data8 0x3FA1F85D440D299B // 21 +data8 0x3FA2AD755749617C // 22 +data8 0x3FA381772A00E603 // 23 +data8 0x3FA45643E165A70A // 24 +data8 0x3FA52BDD034475B8 // 25 +data8 0x3FA5E3966B7E9295 // 26 +data8 0x3FA6BAAF47C5B244 // 27 +data8 0x3FA773B3E8C4F3C7 // 28 +data8 0x3FA84C51EBEE8D15 // 29 +data8 
0x3FA906A6786FC1CA // 30 +data8 0x3FA9C197ABF00DD6 // 31 +data8 0x3FAA9C78712191F7 // 32 +data8 0x3FAB58C09C8D637C // 33 +data8 0x3FAC15A8BCDD7B7E // 34 +data8 0x3FACD331E2C2967B // 35 +data8 0x3FADB11ED766ABF4 // 36 +data8 0x3FAE70089346A9E6 // 37 +data8 0x3FAF2F96C6754AED // 38 +data8 0x3FAFEFCA8D451FD5 // 39 +data8 0x3FB0585283764177 // 40 +data8 0x3FB0B913AAC7D3A6 // 41 +data8 0x3FB11A294F2569F5 // 42 +data8 0x3FB16B51A2696890 // 43 +data8 0x3FB1CD03ADACC8BD // 44 +data8 0x3FB22F0BDD7745F5 // 45 +data8 0x3FB2916ACA38D1E7 // 46 +data8 0x3FB2F4210DF7663C // 47 +data8 0x3FB346A6C3C49065 // 48 +data8 0x3FB3A9FEBC605409 // 49 +data8 0x3FB3FD0C10A3AA54 // 50 +data8 0x3FB46107D3540A81 // 51 +data8 0x3FB4C55DD16967FE // 52 +data8 0x3FB51940330C000A // 53 +data8 0x3FB56D620EE7115E // 54 +data8 0x3FB5D2ABCF26178D // 55 +data8 0x3FB6275AA5DEBF81 // 56 +data8 0x3FB68D4EAF26D7EE // 57 +data8 0x3FB6E28C5C54A28D // 58 +data8 0x3FB7380B9665B7C7 // 59 +data8 0x3FB78DCCC278E85B // 60 +data8 0x3FB7F50C2CF25579 // 61 +data8 0x3FB84B5FD5EAEFD7 // 62 +data8 0x3FB8A1F6BAB2B226 // 63 +data8 0x3FB8F8D144557BDF // 64 +data8 0x3FB94FEFDCD61D92 // 65 +data8 0x3FB9A752EF316149 // 66 +data8 0x3FB9FEFAE7611EDF // 67 +data8 0x3FBA56E8325F5C86 // 68 +data8 0x3FBAAF1B3E297BB3 // 69 +data8 0x3FBB079479C372AC // 70 +data8 0x3FBB6054553B12F7 // 71 +data8 0x3FBBB95B41AB5CE5 // 72 +data8 0x3FBC12A9B13FE079 // 73 +data8 0x3FBC6C4017382BEA // 74 +data8 0x3FBCB41FBA42686C // 75 +data8 0x3FBD0E38CE73393E // 76 +data8 0x3FBD689B2193F132 // 77 +data8 0x3FBDC3472B1D285F // 78 +data8 0x3FBE0C06300D528B // 79 +data8 0x3FBE6738190E394B // 80 +data8 0x3FBEC2B50D208D9A // 81 +data8 0x3FBF0C1C2B936827 // 82 +data8 0x3FBF68216C9CC726 // 83 +data8 0x3FBFB1F6381856F3 // 84 +data8 0x3FC00742AF4CE5F8 // 85 +data8 0x3FC02C64906512D2 // 86 +data8 0x3FC05AF1E63E03B4 // 87 +data8 0x3FC0804BEA723AA8 // 88 +data8 0x3FC0AF1FD6711526 // 89 +data8 0x3FC0D4B2A88059FF // 90 +data8 0x3FC0FA5EF136A06C // 91 +data8 
0x3FC1299A4FB3E305 // 92 +data8 0x3FC14F806253C3EC // 93 +data8 0x3FC175805D1587C1 // 94 +data8 0x3FC19B9A637CA294 // 95 +data8 0x3FC1CB5FC26EDE16 // 96 +data8 0x3FC1F1B4E65F2590 // 97 +data8 0x3FC218248B5DC3E5 // 98 +data8 0x3FC23EAED62ADC76 // 99 +data8 0x3FC26553EBD337BC // 100 +data8 0x3FC28C13F1B118FF // 101 +data8 0x3FC2BCAA14381385 // 102 +data8 0x3FC2E3A740B7800E // 103 +data8 0x3FC30ABFD8F333B6 // 104 +data8 0x3FC331F403985096 // 105 +data8 0x3FC35943E7A6068F // 106 +data8 0x3FC380AFAC6E7C07 // 107 +data8 0x3FC3A8377997B9E5 // 108 +data8 0x3FC3CFDB771C9ADB // 109 +data8 0x3FC3EDA90D39A5DE // 110 +data8 0x3FC4157EC09505CC // 111 +data8 0x3FC43D7113FB04C0 // 112 +data8 0x3FC4658030AD1CCE // 113 +data8 0x3FC48DAC404638F5 // 114 +data8 0x3FC4B5F56CBBB869 // 115 +data8 0x3FC4DE5BE05E7582 // 116 +data8 0x3FC4FCBC0776FD85 // 117 +data8 0x3FC525561E9256EE // 118 +data8 0x3FC54E0DF3198865 // 119 +data8 0x3FC56CAB7112BDE2 // 120 +data8 0x3FC59597BA735B15 // 121 +data8 0x3FC5BEA23A506FD9 // 122 +data8 0x3FC5DD7E08DE382E // 123 +data8 0x3FC606BDD3F92355 // 124 +data8 0x3FC6301C518A501E // 125 +data8 0x3FC64F3770618915 // 126 +data8 0x3FC678CC14C1E2D7 // 127 +data8 0x3FC6981005ED2947 // 128 +data8 0x3FC6C1DB5F9BB335 // 129 +data8 0x3FC6E1488ECD2880 // 130 +data8 0x3FC70B4B2E7E41B8 // 131 +data8 0x3FC72AE209146BF8 // 132 +data8 0x3FC7551C81BD8DCF // 133 +data8 0x3FC774DD76CC43BD // 134 +data8 0x3FC79F505DB00E88 // 135 +data8 0x3FC7BF3BDE099F30 // 136 +data8 0x3FC7E9E7CAC437F8 // 137 +data8 0x3FC809FE4902D00D // 138 +data8 0x3FC82A2757995CBD // 139 +data8 0x3FC85525C625E098 // 140 +data8 0x3FC8757A79831887 // 141 +data8 0x3FC895E2058D8E02 // 142 +data8 0x3FC8C13437695531 // 143 +data8 0x3FC8E1C812EF32BE // 144 +data8 0x3FC9026F112197E8 // 145 +data8 0x3FC923294888880A // 146 +data8 0x3FC94EEA4B8334F2 // 147 +data8 0x3FC96FD1B639FC09 // 148 +data8 0x3FC990CCA66229AB // 149 +data8 0x3FC9B1DB33334842 // 150 +data8 0x3FC9D2FD740E6606 // 151 +data8 0x3FC9FF49EEDCB553 // 152 
+data8 0x3FCA209A84FBCFF7 // 153 +data8 0x3FCA41FF1E43F02B // 154 +data8 0x3FCA6377D2CE9377 // 155 +data8 0x3FCA8504BAE0D9F5 // 156 +data8 0x3FCAA6A5EEEBEFE2 // 157 +data8 0x3FCAC85B878D7878 // 158 +data8 0x3FCAEA259D8FFA0B // 159 +data8 0x3FCB0C0449EB4B6A // 160 +data8 0x3FCB2DF7A5C50299 // 161 +data8 0x3FCB4FFFCA70E4D1 // 162 +data8 0x3FCB721CD17157E2 // 163 +data8 0x3FCB944ED477D4EC // 164 +data8 0x3FCBB695ED655C7C // 165 +data8 0x3FCBD8F2364AEC0F // 166 +data8 0x3FCBFB63C969F4FF // 167 +data8 0x3FCC1DEAC134D4E9 // 168 +data8 0x3FCC4087384F4F80 // 169 +data8 0x3FCC6339498F09E1 // 170 +data8 0x3FCC86010FFC076B // 171 +data8 0x3FCC9D3D065C5B41 // 172 +data8 0x3FCCC029375BA079 // 173 +data8 0x3FCCE32B66978BA4 // 174 +data8 0x3FCD0643AFD51404 // 175 +data8 0x3FCD29722F0DEA45 // 176 +data8 0x3FCD4CB70070FE43 // 177 +data8 0x3FCD6446AB3F8C95 // 178 +data8 0x3FCD87B0EF71DB44 // 179 +data8 0x3FCDAB31D1FE99A6 // 180 +data8 0x3FCDCEC96FDC888E // 181 +data8 0x3FCDE69088763579 // 182 +data8 0x3FCE0A4E4A25C1FF // 183 +data8 0x3FCE2E2315755E32 // 184 +data8 0x3FCE461322D1648A // 185 +data8 0x3FCE6A0E95C7787B // 186 +data8 0x3FCE8E216243DD60 // 187 +data8 0x3FCEA63AF26E007C // 188 +data8 0x3FCECA74ED15E0B7 // 189 +data8 0x3FCEEEC692CCD259 // 190 +data8 0x3FCF070A36B8D9C0 // 191 +data8 0x3FCF2B8393E34A2D // 192 +data8 0x3FCF5014EF538A5A // 193 +data8 0x3FCF68833AF1B17F // 194 +data8 0x3FCF8D3CD9F3F04E // 195 +data8 0x3FCFA5C61ADD93E9 // 196 +data8 0x3FCFCAA8567EBA79 // 197 +data8 0x3FCFE34CC8743DD8 // 198 +data8 0x3FD0042BFD74F519 // 199 +data8 0x3FD016BDF6A18017 // 200 +data8 0x3FD023262F907322 // 201 +data8 0x3FD035CCED8D32A1 // 202 +data8 0x3FD042430E869FFB // 203 +data8 0x3FD04EBEC842B2DF // 204 +data8 0x3FD06182E84FD4AB // 205 +data8 0x3FD06E0CB609D383 // 206 +data8 0x3FD080E60BEC8F12 // 207 +data8 0x3FD08D7E0D894735 // 208 +data8 0x3FD0A06CC96A2055 // 209 +data8 0x3FD0AD131F3B3C55 // 210 +data8 0x3FD0C01771E775FB // 211 +data8 0x3FD0CCCC3CAD6F4B // 212 +data8 
0x3FD0D986D91A34A8 // 213 +data8 0x3FD0ECA9B8861A2D // 214 +data8 0x3FD0F972F87FF3D5 // 215 +data8 0x3FD106421CF0E5F7 // 216 +data8 0x3FD11983EBE28A9C // 217 +data8 0x3FD12661E35B7859 // 218 +data8 0x3FD13345D2779D3B // 219 +data8 0x3FD146A6F597283A // 220 +data8 0x3FD15399E81EA83D // 221 +data8 0x3FD16092E5D3A9A6 // 222 +data8 0x3FD17413C3B7AB5D // 223 +data8 0x3FD1811BF629D6FA // 224 +data8 0x3FD18E2A47B46685 // 225 +data8 0x3FD19B3EBE1A4418 // 226 +data8 0x3FD1AEE9017CB450 // 227 +data8 0x3FD1BC0CED7134E1 // 228 +data8 0x3FD1C93712ABC7FF // 229 +data8 0x3FD1D66777147D3E // 230 +data8 0x3FD1EA3BD1286E1C // 231 +data8 0x3FD1F77BED932C4C // 232 +data8 0x3FD204C25E1B031F // 233 +data8 0x3FD2120F28CE69B1 // 234 +data8 0x3FD21F6253C48D00 // 235 +data8 0x3FD22CBBE51D60A9 // 236 +data8 0x3FD240CE4C975444 // 237 +data8 0x3FD24E37F8ECDAE7 // 238 +data8 0x3FD25BA8215AF7FC // 239 +data8 0x3FD2691ECC29F042 // 240 +data8 0x3FD2769BFFAB2DFF // 241 +data8 0x3FD2841FC23952C9 // 242 +data8 0x3FD291AA1A384978 // 243 +data8 0x3FD29F3B0E15584A // 244 +data8 0x3FD2B3A0EE479DF7 // 245 +data8 0x3FD2C142842C09E5 // 246 +data8 0x3FD2CEEACCB7BD6C // 247 +data8 0x3FD2DC99CE82FF20 // 248 +data8 0x3FD2EA4F902FD7D9 // 249 +data8 0x3FD2F80C186A25FC // 250 +data8 0x3FD305CF6DE7B0F6 // 251 +data8 0x3FD3139997683CE7 // 252 +data8 0x3FD3216A9BB59E7C // 253 +data8 0x3FD32F4281A3CEFE // 254 +data8 0x3FD33D2150110091 // 255 +// +// Lo parts of ln(1/frcpa(1+i/256))/ln(10), i=0...255 +data4 0x1FB0EB5A // 0 +data4 0x206E5EE3 // 1 +data4 0x208F3609 // 2 +data4 0x2070EB03 // 3 +data4 0x1F314BAE // 4 +data4 0x217A889D // 5 +data4 0x21E63650 // 6 +data4 0x21C2F4A3 // 7 +data4 0x2192A10C // 8 +data4 0x1F84B73E // 9 +data4 0x2243FBCA // 10 +data4 0x21BD9C51 // 11 +data4 0x213C542B // 12 +data4 0x21047386 // 13 +data4 0x21217D8F // 14 +data4 0x226791B7 // 15 +data4 0x204CCE66 // 16 +data4 0x2234CE9F // 17 +data4 0x220675E2 // 18 +data4 0x22B8E5BA // 19 +data4 0x22C12D14 // 20 +data4 0x211D41F0 // 21 +data4 
0x228507F3 // 22 +data4 0x22F7274B // 23 +data4 0x22A7FDD1 // 24 +data4 0x2244A06E // 25 +data4 0x215DCE69 // 26 +data4 0x22F5C961 // 27 +data4 0x22EBEF29 // 28 +data4 0x222A2CB6 // 29 +data4 0x22B9FE00 // 30 +data4 0x22E79EB7 // 31 +data4 0x222F9607 // 32 +data4 0x2189D87F // 33 +data4 0x2236DB45 // 34 +data4 0x22ED77FB // 35 +data4 0x21CB70F0 // 36 +data4 0x21B8ACE8 // 37 +data4 0x22EC58C1 // 38 +data4 0x22CFCC1C // 39 +data4 0x2343E77A // 40 +data4 0x237FBC7F // 41 +data4 0x230D472E // 42 +data4 0x234686FB // 43 +data4 0x23770425 // 44 +data4 0x223977EC // 45 +data4 0x2345800A // 46 +data4 0x237BC351 // 47 +data4 0x23191502 // 48 +data4 0x232BAC12 // 49 +data4 0x22692421 // 50 +data4 0x234D409D // 51 +data4 0x22EC3214 // 52 +data4 0x2376C916 // 53 +data4 0x22B00DD1 // 54 +data4 0x2309D910 // 55 +data4 0x22F925FD // 56 +data4 0x22A63A7B // 57 +data4 0x2106264A // 58 +data4 0x234227F9 // 59 +data4 0x1ECB1978 // 60 +data4 0x23460A62 // 61 +data4 0x232ED4B1 // 62 +data4 0x226DDC38 // 63 +data4 0x1F101A73 // 64 +data4 0x21B1F82B // 65 +data4 0x22752F19 // 66 +data4 0x2320BC15 // 67 +data4 0x236EEC5E // 68 +data4 0x23404D3E // 69 +data4 0x2304C517 // 70 +data4 0x22F7441A // 71 +data4 0x230D3D7A // 72 +data4 0x2264A9DF // 73 +data4 0x22410CC8 // 74 +data4 0x2342CCCB // 75 +data4 0x23560BD4 // 76 +data4 0x237BBFFE // 77 +data4 0x2373A206 // 78 +data4 0x22C871B9 // 79 +data4 0x2354B70C // 80 +data4 0x232EDB33 // 81 +data4 0x235DB680 // 82 +data4 0x230EF422 // 83 +data4 0x235316CA // 84 +data4 0x22EEEE8B // 85 +data4 0x2375C88C // 86 +data4 0x235ABD21 // 87 +data4 0x23A0D232 // 88 +data4 0x23F5FFB5 // 89 +data4 0x23D3CEC8 // 90 +data4 0x22A92204 // 91 +data4 0x238C64DF // 92 +data4 0x23B82896 // 93 +data4 0x22D633B8 // 94 +data4 0x23861E93 // 95 +data4 0x23CB594B // 96 +data4 0x2330387E // 97 +data4 0x21CD4702 // 98 +data4 0x2284C505 // 99 +data4 0x23D6995C // 100 +data4 0x23F6C807 // 101 +data4 0x239CEF5C // 102 +data4 0x239442B0 // 103 +data4 0x22B35EE5 // 104 +data4 
0x2391E9A4 // 105 +data4 0x23A390F5 // 106 +data4 0x2349AC9C // 107 +data4 0x23FA5535 // 108 +data4 0x21E3A46A // 109 +data4 0x23B44ABA // 110 +data4 0x23CEA8E0 // 111 +data4 0x23F647DC // 112 +data4 0x2390D1A8 // 113 +data4 0x23D0CFA2 // 114 +data4 0x236E0872 // 115 +data4 0x23B88B91 // 116 +data4 0x2283C359 // 117 +data4 0x232F647F // 118 +data4 0x23122CD7 // 119 +data4 0x232CF564 // 120 +data4 0x232630FD // 121 +data4 0x23BEE1C8 // 122 +data4 0x23B2BD30 // 123 +data4 0x2301F1C0 // 124 +data4 0x23CE4D67 // 125 +data4 0x23A353C9 // 126 +data4 0x238086E8 // 127 +data4 0x22D0D29E // 128 +data4 0x23A3B3C8 // 129 +data4 0x23F69F4B // 130 +data4 0x23EA3C21 // 131 +data4 0x23951C88 // 132 +data4 0x2372AFFC // 133 +data4 0x23A6D1A8 // 134 +data4 0x22BBBAF4 // 135 +data4 0x227FA3DD // 136 +data4 0x23804D9B // 137 +data4 0x232D771F // 138 +data4 0x239CB57B // 139 +data4 0x2303CF34 // 140 +data4 0x22218C2A // 141 +data4 0x23991BEE // 142 +data4 0x23EB3596 // 143 +data4 0x230487FA // 144 +data4 0x2135DF4C // 145 +data4 0x2380FD2D // 146 +data4 0x23EB75E9 // 147 +data4 0x211C62C8 // 148 +data4 0x23F518F1 // 149 +data4 0x23FEF882 // 150 +data4 0x239097C7 // 151 +data4 0x223E2BDA // 152 +data4 0x23988F89 // 153 +data4 0x22E4A4AD // 154 +data4 0x23F03D9C // 155 +data4 0x23F5018F // 156 +data4 0x23E1E250 // 157 +data4 0x23FD3D90 // 158 +data4 0x22DEE2FF // 159 +data4 0x238342AB // 160 +data4 0x22E6736F // 161 +data4 0x233AFC28 // 162 +data4 0x2395F661 // 163 +data4 0x23D8B991 // 164 +data4 0x23CD58D5 // 165 +data4 0x21941FD6 // 166 +data4 0x23352915 // 167 +data4 0x235D09EE // 168 +data4 0x22DC7EF9 // 169 +data4 0x238BC9F3 // 170 +data4 0x2397DF8F // 171 +data4 0x2380A7BB // 172 +data4 0x23EFF48C // 173 +data4 0x21E67408 // 174 +data4 0x236420F7 // 175 +data4 0x22C8DFB5 // 176 +data4 0x239B5D35 // 177 +data4 0x23BDC09D // 178 +data4 0x239E822C // 179 +data4 0x23984F0A // 180 +data4 0x23EF2119 // 181 +data4 0x23F738B8 // 182 +data4 0x23B66187 // 183 +data4 0x23B06AD7 // 184 +data4 
0x2369140F // 185 +data4 0x218DACE6 // 186 +data4 0x21DF23F1 // 187 +data4 0x235D8B34 // 188 +data4 0x23460333 // 189 +data4 0x23F11D62 // 190 +data4 0x23C37147 // 191 +data4 0x22B2AE2A // 192 +data4 0x23949211 // 193 +data4 0x23B69799 // 194 +data4 0x23DBEC75 // 195 +data4 0x229A6FB3 // 196 +data4 0x23FC6C60 // 197 +data4 0x22D01FFC // 198 +data4 0x235985F0 // 199 +data4 0x23F7ECA5 // 200 +data4 0x23F924D3 // 201 +data4 0x2381B92F // 202 +data4 0x243A0FBE // 203 +data4 0x24712D72 // 204 +data4 0x24594E2F // 205 +data4 0x220CD12A // 206 +data4 0x23D87FB0 // 207 +data4 0x2338288A // 208 +data4 0x242BB2CC // 209 +data4 0x220F6265 // 210 +data4 0x23BB7FE3 // 211 +data4 0x2301C0A2 // 212 +data4 0x246709AB // 213 +data4 0x23A619E2 // 214 +data4 0x24030E3B // 215 +data4 0x233C36CC // 216 +data4 0x241AAB77 // 217 +data4 0x243D41A3 // 218 +data4 0x23834A60 // 219 +data4 0x236AC7BF // 220 +data4 0x23B6D597 // 221 +data4 0x210E9474 // 222 +data4 0x242156E6 // 223 +data4 0x243A1D68 // 224 +data4 0x2472187C // 225 +data4 0x23834E86 // 226 +data4 0x23CA0807 // 227 +data4 0x24745887 // 228 +data4 0x23E2B0E1 // 229 +data4 0x2421EB67 // 230 +data4 0x23DCC64E // 231 +data4 0x22DF71D1 // 232 +data4 0x238D5ECA // 233 +data4 0x23CDE86F // 234 +data4 0x24131F45 // 235 +data4 0x240FE4E2 // 236 +data4 0x2317731A // 237 +data4 0x24015C76 // 238 +data4 0x2301A4E8 // 239 +data4 0x23E52A6D // 240 +data4 0x247D8A0D // 241 +data4 0x23DFEEBA // 242 +data4 0x22139FEC // 243 +data4 0x2454A112 // 244 +data4 0x23C21E28 // 245 +data4 0x2460D813 // 246 +data4 0x24258924 // 247 +data4 0x2425680F // 248 +data4 0x24194D1E // 249 +data4 0x24242C2F // 250 +data4 0x243DDE5E // 251 +data4 0x23DEB388 // 252 +data4 0x23E0E6EB // 253 +data4 0x24393E74 // 254 +data4 0x241B1863 // 255 +LOCAL_OBJECT_END(log10_data) + + + +// Code +//============================================================== -// log10 has p7 true, p8 false -// log has p8 true, p7 false +// log has p13 true, p14 false +// log10 has p14 true, 
p13 false .section .text -.proc log10# -.align 32 - -log10: -#ifdef _LIBC -.global __ieee754_log10 -.type __ieee754_log10,@function -__ieee754_log10: -#endif +GLOBAL_IEEE754_ENTRY(log10) { .mfi - alloc r32=ar.pfs,1,15,4,0 - frcpa.s1 log_C,p9 = f1,f8 - cmp.eq.unc p7,p8 = r0, r0 -} -{ .mfb - addl log_AD_1 = @ltoff(log_table_1), gp - fnorm.s1 log_NORM_f8 = f8 - br.sptk L(LOG_LOG10_X) + getf.exp GR_Exp = f8 // if x is unorm then must recompute + frcpa.s1 FR_RcpX,p0 = f1,f8 + mov GR_05 = 0xFFFE // biased exponent of A2=0.5 } -;; - -.endp log10 -ASM_SIZE_DIRECTIVE(log10) -ASM_SIZE_DIRECTIVE(__ieee754_log10) - - -.section .text -.proc log# -.align 32 -log: -#ifdef _LIBC -.global __ieee754_log -.type __ieee754_log,@function -__ieee754_log: -#endif +{ .mlx + addl GR_ad_1 = @ltoff(log10_data),gp + movl GR_A3 = 0x3fd5555555555557 // double precision memory + // representation of A3 +};; { .mfi - alloc r32=ar.pfs,1,15,4,0 - frcpa.s1 log_C,p9 = f1,f8 - cmp.eq.unc p8,p7 = r0, r0 + getf.sig GR_Sig = f8 // get significand to calculate index + fclass.m p8,p0 = f8,9 // is x positive unorm? 
+ mov GR_xorg = 0x3fefe // double precision memory msb of 255/256 } -{ .mfi - addl log_AD_1 = @ltoff(log_table_1), gp - fnorm.s1 log_NORM_f8 = f8 - nop.i 999 -} -;; - -L(LOG_LOG10_X): +{ .mib + ld8 GR_ad_1 = [GR_ad_1] + cmp.eq p14,p13 = r0,r0 // set p14 to 1 for log10 + br.cond.sptk log_log10_common +};; +GLOBAL_IEEE754_END(log10) +GLOBAL_IEEE754_ENTRY(log) { .mfi - ld8 log_AD_1 = [log_AD_1] - fclass.m.unc p15,p0 = f8, 0x0b // Test for x=unorm - mov log_GR_fff9 = 0xfff9 -} -{ .mfi - mov log_GR_half_exp = 0x0fffe - fms.s1 log_w = f8,f1,f1 - mov log_GR_exp_17_ones = 0x1ffff -} -;; - -{ .mmi - getf.exp log_GR_signexp_f8 = f8 // If x unorm then must recompute - setf.exp log_half = log_GR_half_exp // Form 0.5 = -Q1 - nop.i 999 -} -;; - -{ .mmb - adds log_AD_2 = 0x30, log_AD_1 - mov log_GR_exp_16_ones = 0xffff -(p15) br.cond.spnt L(LOG_DENORM) -} -;; - -L(LOG_COMMON): -{.mfi - ldfpd log_P5,log_P4 = [log_AD_1],16 - fclass.m.unc p6,p0 = f8, 0xc3 // Test for x=nan - and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones + getf.exp GR_Exp = f8 // if x is unorm then must recompute + frcpa.s1 FR_RcpX,p0 = f1,f8 + mov GR_05 = 0xfffe } -{.mfi - ldfpd log_P3,log_P2 = [log_AD_2],16 - nop.f 999 - nop.i 999 -} -;; +{ .mlx + addl GR_ad_1 = @ltoff(log_data),gp + movl GR_A3 = 0x3fd5555555555557 // double precision memory + // representation of A3 +};; { .mfi - ldfpd log_Q8,log_Q7 = [log_AD_1],16 - fclass.m.unc p11,p0 = f8, 0x21 // Test for x=+inf - sub log_GR_true_exp_f8 = log_GR_exp_f8, log_GR_exp_16_ones + getf.sig GR_Sig = f8 // get significand to calculate index + fclass.m p8,p0 = f8,9 // is x positive unorm? 
+ mov GR_xorg = 0x3fefe // double precision memory msb of 255/256 } { .mfi - ldfpd log_Q6,log_Q5 = [log_AD_2],16 - nop.f 999 - nop.i 999 -} -;; - + ld8 GR_ad_1 = [GR_ad_1] + nop.f 0 + cmp.eq p13,p14 = r0,r0 // set p13 to 1 for log +};; +log_log10_common: { .mfi - ldfpd log_Q4,log_Q3 = [log_AD_1],16 - fma.s1 log_wsq = log_w, log_w, f0 - nop.i 999 -} -{ .mfb - ldfpd log_Q2,log_Q1 = [log_AD_2],16 -(p6) fma.d.s0 f8 = f8,f1,f0 // quietize nan result if x=nan -(p6) br.ret.spnt b0 // Exit for x=nan + getf.d GR_x = f8 // double precision memory representation of x + fclass.m p9,p0 = f8,0x1E1 // is x NaN, NaT or +Inf? + dep.z GR_dx = 3, 44, 2 // Create 0x0000300000000000 + // Difference between double precision + // memory representations of 257/256 and + // 255/256 } -;; - - { .mfi - setf.sig log_int_Nfloat = log_GR_true_exp_f8 - fcmp.eq.s1 p10,p0 = log_NORM_f8, f1 // Test for x=+1.0 - nop.i 999 -} -{ .mfb - nop.m 999 - fms.s1 log_r = log_C,f8,f1 -(p11) br.ret.spnt b0 // Exit for x=+inf -} -;; - - -{ .mmf - getf.sig log_GR_significand_f8 = log_NORM_f8 - ldfe log_inv_ln10 = [log_AD_2],16 - fclass.m.unc p6,p0 = f8, 0x07 // Test for x=0 -} -;; - - -{ .mfb - nop.m 999 -(p10) fmerge.s f8 = f0, f0 -(p10) br.ret.spnt b0 // Exit for x=1.0 -;; -} - + setf.exp FR_A2 = GR_05 // create A2 + fnorm.s1 FR_NormX = f8 + mov GR_bias = 0xffff +};; + { .mfi - getf.exp log_GR_signexp_w = log_w - fclass.m.unc p12,p0 = f8, 0x3a // Test for x neg norm, unorm, inf - shl log_GR_index = log_GR_significand_f8,1 + setf.d FR_A3 = GR_A3 // create A3 + fcmp.eq.s1 p12,p0 = f1,f8 // is x equal to 1.0? 
+ dep.z GR_xorg = GR_xorg, 44, 19 // 0x3fefe00000000000 + // double precision memory + // representation of 255/256 } -;; +{ .mib + add GR_ad_2 = 0x30,GR_ad_1 // address of A5,A4 + add GR_ad_3 = 0x840,GR_ad_1 // address of ln(1/frcpa) lo parts +(p8) br.cond.spnt log_positive_unorms +};; +log_core: { .mfi - ldfe log_log2 = [log_AD_2],16 - fnma.s1 log_rp_q10 = log_half, log_wsq, log_w - shr.u log_GR_index = log_GR_index,56 + ldfpd FR_A7,FR_A6 = [GR_ad_1],16 + fclass.m p10,p0 = f8,0x3A // is x < 0? + sub GR_Nm1 = GR_Exp,GR_05 // unbiased_exponent_of_x - 1 } -{ .mfb - nop.m 999 - fma.s1 log_w3 = log_wsq, log_w, f0 -(p6) br.cond.spnt L(LOG_ZERO_NEG) // Branch if x=0 -;; -} - - { .mfi - and log_GR_exp_w = log_GR_exp_17_ones, log_GR_signexp_w - fma.s1 log_w4 = log_wsq, log_wsq, f0 - nop.i 999 -} -{ .mfb - shladd log_AD_2 = log_GR_index,4,log_AD_2 - fma.s1 log_rsq = log_r, log_r, f0 -(p12) br.cond.spnt L(LOG_ZERO_NEG) // Branch if x<0 -;; -} + ldfpd FR_A5,FR_A4 = [GR_ad_2],16 +(p9) fma.d.s0 f8 = f8,f1,f0 // set V-flag + sub GR_N = GR_Exp,GR_bias // unbiased_exponent_of_x +};; { .mfi - ldfe log_T = [log_AD_2] - fma.s1 log_rp_p4 = log_P5, log_r, log_P4 - nop.i 999 + setf.sig FR_N = GR_N // copy unbiased exponent of x to significand + fms.s1 FR_r = FR_RcpX,f8,f1 // range reduction for |x-1|>=1/256 + extr.u GR_Ind = GR_Sig,55,8 // get bits from 55 to 62 as index } -{ .mfi - nop.m 999 - fma.s1 log_rp_p32 = log_P3, log_r, log_P2 - nop.i 999 -;; -} - +{ .mib + sub GR_x = GR_x, GR_xorg // get diff between x and 255/256 + cmp.gtu p6, p7 = 2, GR_Nm1 // p6 true if 0.5 <= x < 2 +(p9) br.ret.spnt b0 // exit for NaN, NaT and +Inf +};; { .mfi - nop.m 999 - fma.s1 log_rp_q7 = log_Q8, log_w, log_Q7 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 log_rp_q65 = log_Q6, log_w, log_Q5 - nop.i 999 -;; + ldfpd FR_Ln2hi,FR_Ln2lo = [GR_ad_1],16 + fclass.m p11,p0 = f8,0x07 // is x = 0? 
+ shladd GR_ad_3 = GR_Ind,2,GR_ad_3 // address of Tlo } +{ .mib + shladd GR_ad_2 = GR_Ind,3,GR_ad_2 // address of Thi +(p6) cmp.leu p6, p7 = GR_x, GR_dx // 255/256 <= x <= 257/256 +(p10) br.cond.spnt log_negatives // jump if x is negative +};; -// p13 <== large w log -// p14 <== small w log +// p6 is true if |x-1| < 1/256 +// p7 is true if |x-1| >= 1/256 { .mfi -(p8) cmp.ge.unc p13,p14 = log_GR_exp_w, log_GR_fff9 - fma.s1 log_rp_q3 = log_Q4, log_w, log_Q3 - nop.i 999 -;; -} + ldfd FR_Thi = [GR_ad_2] +(p6) fms.s1 FR_r = f8,f1,f1 // range reduction for |x-1|<1/256 + nop.i 0 +};; -// p10 <== large w log10 -// p11 <== small w log10 -{ .mfi -(p7) cmp.ge.unc p10,p11 = log_GR_exp_w, log_GR_fff9 - fcvt.xf log_Nfloat = log_int_Nfloat - nop.i 999 +{ .mmi +(p7) ldfs FR_Tlo = [GR_ad_3] + nop.m 0 + nop.i 0 } +{ .mfb + nop.m 0 +(p12) fma.d.s0 f8 = f0,f0,f0 +(p12) br.ret.spnt b0 // exit for +1.0 +};; +.pred.rel "mutex",p6,p7 { .mfi - nop.m 999 - fma.s1 log_rp_q21 = log_Q2, log_w3, log_rp_q10 - nop.i 999 ;; +(p6) mov GR_NearOne = 1 + fms.s1 FR_A32 = FR_A3,FR_r,FR_A2 // A3*r-A2 +(p7) mov GR_NearOne = 0 } +{ .mfb + ldfe FR_InvLn10 = [GR_ad_1],16 + fma.s1 FR_r2 = FR_r,FR_r,f0 // r^2 +(p11) br.cond.spnt log_zeroes // jump if x is zero +};; { .mfi - nop.m 999 - fma.s1 log_rcube = log_rsq, log_r, f0 - nop.i 999 + nop.m 0 + fma.s1 FR_A6 = FR_A7,FR_r,FR_A6 // A7*r+A6 + nop.i 0 } { .mfi - nop.m 999 - fma.s1 log_rp_p10 = log_rsq, log_P1, log_r - nop.i 999 -;; -} +(p7) cmp.eq.unc p9,p0 = r0,r0 // set p9 if |x-1| > 1/256 + fma.s1 FR_A4 = FR_A5,FR_r,FR_A4 // A5*r+A4 +(p14) cmp.eq.unc p8,p0 = 1,GR_NearOne // set p8 to 1 if it's log10 + // and argument near 1.0 +};; { .mfi - nop.m 999 - fcmp.eq.s0 p6,p0 = f8,f0 // Sets flag on +denormal input - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 log_rp_p2 = log_rp_p4, log_rsq, log_rp_p32 - nop.i 999 -;; -} - +(p6) getf.exp GR_rexp = FR_r // Get signexp of x-1 +(p7) fcvt.xf FR_N = FR_N +(p8) cmp.eq p9,p6 = r0,r0 // Also set p9 and clear p6 if log10 + // 
and arg near 1 +};; { .mfi - nop.m 999 - fma.s1 log_w6 = log_w3, log_w3, f0 - nop.i 999 + nop.m 0 + fma.s1 FR_r4 = FR_r2,FR_r2,f0 // r^4 + nop.i 0 } { .mfi - nop.m 999 - fma.s1 log_Qlo = log_rp_q7, log_wsq, log_rp_q65 - nop.i 999 -} -;; + nop.m 0 +(p8) fma.s1 FR_NxLn2pT = f0,f0,f0 // Clear NxLn2pT if log10 near 1 + nop.i 0 +};; { .mfi - nop.m 999 - fma.s1 log_Qhi = log_rp_q3, log_w4, log_rp_q21 - nop.i 999 ;; + nop.m 0 + // (A3*r+A2)*r^2+r + fma.s1 FR_A321 = FR_A32,FR_r2,FR_r + mov GR_mask = 0x1ffff } - - { .mfi - nop.m 999 - fma.s1 log_T_plus_Nlog2 = log_Nfloat,log_log2, log_T - nop.i 999 ;; -} + nop.m 0 + // (A7*r+A6)*r^2+(A5*r+A4) + fma.s1 FR_A4 = FR_A6,FR_r2,FR_A4 + nop.i 0 +};; { .mfi - nop.m 999 - fma.s1 log_r2P_r = log_rp_p2, log_rcube, log_rp_p10 - nop.i 999 ;; +(p6) and GR_rexp = GR_rexp, GR_mask + // N*Ln2hi+Thi +(p7) fma.s1 FR_NxLn2hipThi = FR_N,FR_Ln2hi,FR_Thi + nop.i 0 } +{ .mfi + nop.m 0 + // N*Ln2lo+Tlo +(p7) fma.s1 FR_NxLn2lopTlo = FR_N,FR_Ln2lo,FR_Tlo + nop.i 0 +};; - -// small w, log <== p14 { .mfi - nop.m 999 -(p14) fma.d f8 = log_Qlo, log_w6, log_Qhi - nop.i 999 +(p6) sub GR_rexp = GR_rexp, GR_bias // unbiased exponent of x-1 +(p9) fma.s1 f8 = FR_A4,FR_r4,FR_A321 // P(r) if |x-1| >= 1/256 or + // log10 and |x-1| < 1/256 + nop.i 0 } { .mfi - nop.m 999 - fma.s1 log_Q = log_Qlo, log_w6, log_Qhi - nop.i 999 ;; -} - + nop.m 0 + // (N*Ln2hi+Thi) + (N*Ln2lo+Tlo) +(p7) fma.s1 FR_NxLn2pT = FR_NxLn2hipThi,f1,FR_NxLn2lopTlo + nop.i 0 +};; { .mfi - nop.m 999 -(p10) fma.s1 log_log10_hi = log_T_plus_Nlog2, log_inv_ln10,f0 - nop.i 999 ;; -} +(p6) cmp.gt.unc p10, p6 = -40, GR_rexp // Test |x-1| < 2^-40 + nop.f 0 + nop.i 0 +};; -// large w, log <== p13 -.pred.rel "mutex",p13,p10 { .mfi - nop.m 999 -(p13) fadd.d f8 = log_T_plus_Nlog2, log_r2P_r - nop.i 999 -} -{ .mfi - nop.m 999 -(p10) fma.s1 log_log10_lo = log_inv_ln10, log_r2P_r,f0 - nop.i 999 ;; -} - + nop.m 0 +(p10) fma.d.s0 f8 = FR_A32,FR_r2,FR_r // log(x) if |x-1| < 2^-40 + nop.i 0 +};; -// small w, log10 
<== p11 +.pred.rel "mutex",p6,p9 { .mfi - nop.m 999 -(p11) fma.d f8 = log_inv_ln10,log_Q,f0 - nop.i 999 ;; -} - -// large w, log10 <== p10 -{ .mfb - nop.m 999 -(p10) fma.d f8 = log_log10_hi, f1, log_log10_lo - br.ret.sptk b0 -;; + nop.m 0 +(p6) fma.d.s0 f8 = FR_A4,FR_r4,FR_A321 // log(x) if 2^-40 <= |x-1| < 1/256 + nop.i 0 } - -L(LOG_DENORM): { .mfb - getf.exp log_GR_signexp_f8 = log_NORM_f8 - nop.f 999 - br.cond.sptk L(LOG_COMMON) -} -;; - -L(LOG_ZERO_NEG): - -// qnan snan inf norm unorm 0 -+ -// 0 0 0 0 0 1 11 0x7 -// 0 0 1 1 1 0 10 0x3a - -// Save x (f8) in f10 -{ .mfi - nop.m 999 - fmerge.s f10 = f8,f8 - nop.i 999 ;; -} - -// p8 p9 means ln(+-0) = -inf -// p7 p10 means log(+-0) = -inf - -// p13 means ln(-) -// p14 means log(-) - + nop.m 0 +(p9) fma.d.s0 f8 = f8,FR_InvLn10,FR_NxLn2pT // result if |x-1| >= 1/256 + // or log10 and |x-1| < 1/256 + br.ret.sptk b0 +};; -{ .mfi - nop.m 999 - fmerge.ns f6 = f1,f1 // Form -1.0 - nop.i 999 ;; -} +.align 32 +log_positive_unorms: +{ .mmf + getf.exp GR_Exp = FR_NormX // recompute biased exponent + getf.d GR_x = FR_NormX // recompute double precision x + fcmp.eq.s1 p12,p0 = f1,FR_NormX // is x equal to 1.0? 
+};; -// p9 means ln(+-0) = -inf -// p10 means log(+-0) = -inf -// Log(+-0) = -inf +{ .mfb + getf.sig GR_Sig = FR_NormX // recompute significand + fcmp.eq.s0 p15, p0 = f8, f0 // set denormal flag + br.cond.sptk log_core +};; +.align 32 +log_zeroes: { .mfi - nop.m 999 -(p8) fclass.m.unc p9,p0 = f10, 0x07 - nop.i 999 + nop.m 0 + fmerge.s FR_X = f8,f8 // keep input argument for subsequent + // call of __libm_error_support# + nop.i 0 } { .mfi - nop.m 999 -(p7) fclass.m.unc p10,p0 = f10, 0x07 - nop.i 999 ;; -} - - -// p13 ln(-) -// p14 log(-) + nop.m 0 + fms.s1 FR_tmp = f0,f0,f1 // -1.0 + nop.i 0 +};; -// Log(-inf, -normal, -unnormal) = QNAN indefinite -{ .mfi - nop.m 999 -(p8) fclass.m.unc p13,p0 = f10, 0x3a - nop.i 999 -} +.pred.rel "mutex",p13,p14 { .mfi - nop.m 999 -(p7) fclass.m.unc p14,p0 = f10, 0x3a - nop.i 999 ;; +(p13) mov GR_TAG = 2 // set libm error in case of log + frcpa.s0 f8,p0 = FR_tmp,f0 // log(+/-0) should be equal to -INF. + // We can get it using frcpa because it + // sets result to the IEEE-754 mandated + // quotient of FR_tmp/f0. + // As far as FR_tmp is -1 it'll be -INF + nop.i 0 } +{ .mib +(p14) mov GR_TAG = 8 // set libm error in case of log10 + nop.i 0 + br.cond.sptk log_libm_err +};; - -.pred.rel "mutex",p9,p10 -{ .mfi -(p9) mov log_GR_tag = 2 -(p9) frcpa f8,p11 = f6,f0 - nop.i 999 -} +.align 32 +log_negatives: { .mfi -(p10) mov log_GR_tag = 8 -(p10) frcpa f8,p12 = f6,f0 - nop.i 999 ;; -} + nop.m 0 + fmerge.s FR_X = f8,f8 + nop.i 0 +};; .pred.rel "mutex",p13,p14 { .mfi -(p13) mov log_GR_tag = 3 -(p13) frcpa f8,p11 = f0,f0 - nop.i 999 -} -{ .mfb -(p14) mov log_GR_tag = 9 -(p14) frcpa f8,p12 = f0,f0 - br.cond.sptk __libm_error_region ;; -} -.endp log -ASM_SIZE_DIRECTIVE(log) -ASM_SIZE_DIRECTIVE(__ieee754_log) - - -// Stack operations when calling error support. 
-// (1) (2) (3) (call) (4) -// sp -> + psp -> + psp -> + sp -> + -// | | | | -// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 -// | | | | -// | <-GR_Y Y2->| Y2 ->| <- GR_Y | -// | | | | -// | | <- GR_X X1 ->| | -// | | | | -// sp-64 -> + sp -> + sp -> + + -// save ar.pfs save b0 restore gp -// save gp restore ar.pfs - +(p13) mov GR_TAG = 3 // set libm error in case of log + frcpa.s0 f8,p0 = f0,f0 // log(negatives) should be equal to NaN. + // We can get it using frcpa because it + // sets result to the IEEE-754 mandated + // quotient of f0/f0 i.e. NaN. +(p14) mov GR_TAG = 9 // set libm error in case of log10 +};; +.align 32 +log_libm_err: +{ .mmi + alloc r32 = ar.pfs,1,4,4,0 + mov GR_Parameter_TAG = GR_TAG + nop.i 0 +};; +GLOBAL_IEEE754_END(log) -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue - -// (1) { .mfi - add GR_Parameter_Y=-32,sp // Parameter 2 value + add GR_Parameter_Y = -32,sp // Parameter 2 value nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS = ar.pfs // Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + add sp = -64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP = gp // Save gp };; - -// (2) { .mmi - stfd [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack + stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0 = b0 // Save b0 };; .body -// (3) { .mib - stfd [GR_Parameter_X] = f10 // STORE Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address - nop.b 0 + stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 } { .mib - stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack + stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack add GR_Parameter_Y = 
-16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; -// (4) { .mmi - ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack + ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp - add sp = 64,sp // Restore stack pointer - mov b0 = GR_SAVE_B0 // Restore return address + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address };; + { .mib - mov gp = GR_SAVE_GP // Restore gp - mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - br.ret.sptk b0 // Return + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return };; - -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) - +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# + diff --git a/sysdeps/ia64/fpu/e_logf.S b/sysdeps/ia64/fpu/e_logf.S index 829d0abed0..0ca6d3f2c8 100644 --- a/sysdeps/ia64/fpu/e_logf.S +++ b/sysdeps/ia64/fpu/e_logf.S @@ -1,10 +1,10 @@ .file "logf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,861 +20,1072 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
// // History //============================================================== -// 3/01/00 Initial version -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 03/01/00 Initial version +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. -// 1/10/01 Improved speed, fixed flags for neg denormals -// +// 01/10/01 Improved speed, fixed flags for neg denormals +// 05/20/02 Cleaned up namespace and sf0 syntax +// 05/23/02 Modified algorithm. Now only one polynomial is used +// for |x-1| >= 1/256 and for |x-1| < 1/256 +// 02/10/03 Reordered header: .section, .global, .proc, .align // // API //============================================================== // float logf(float) // float log10f(float) // +// // Overview of operation //============================================================== // Background +// ---------- // -// Consider x = 2^N 1.f1 f2 f3 f4...f63 -// Log(x) = log(frcpa(x) x/frcpa(x)) -// = log(1/frcpa(x)) + log(frcpa(x) x) -// = -log(frcpa(x)) + log(frcpa(x) x) +// This algorithm is based on fact that +// log(a b) = log(a) + log(b). // -// frcpa(x) = 2^-N frcpa((1.f1 f2 ... f63) +// In our case we have x = 2^N f, where 1 <= f < 2. +// So +// log(x) = log(2^N f) = log(2^N) + log(f) = n*log(2) + log(f) // -// -log(frcpa(x)) = -log(C) -// = -log(2^-N) - log(frcpa(1.f1 f2 ... f63)) +// To calculate log(f) we do following +// log(f) = log(f * frcpa(f) / frcpa(f)) = +// = log(f * frcpa(f)) + log(1/frcpa(f)) // -// -log(frcpa(x)) = -log(C) -// = +Nlog2 - log(frcpa(1.f1 f2 ... f63)) +// According to definition of IA-64's frcpa instruction it's a +// floating point that approximates 1/f using a lookup on the +// top of 8 bits of the input number's significand with relative +// error < 2^(-8.886). So we have following // -// -log(frcpa(x)) = -log(C) -// = +Nlog2 + log(frcpa(1.f1 f2 ... 
f63)) +// |(1/f - frcpa(f)) / (1/f)| = |1 - f*frcpa(f)| < 1/256 // -// Log(x) = log(1/frcpa(x)) + log(frcpa(x) x) - -// Log(x) = +Nlog2 + log(1./frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x) -// Log(x) = +Nlog2 - log(/frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x) -// Log(x) = +Nlog2 + T + log(frcpa(x) x) +// and +// +// log(f) = log(f * frcpa(f)) + log(1/frcpa(f)) = +// = log(1 + r) + T +// +// The first value can be computed by polynomial P(r) approximating +// log(1 + r) on |r| < 1/256 and the second is precomputed tabular +// value defined by top 8 bits of f. +// +// Finally we have that log(x) ~ (N*log(2) + T) + P(r) +// +// Note that if input argument is close to 1.0 (in our case it means +// that |1 - x| < 1/256) we can use just polynomial approximation +// because x = 2^0 * f = f = 1 + r and +// log(x) = log(1 + r) ~ P(r) +// +// +// To compute log10(x) we just use identity: // -// Log(x) = +Nlog2 + T + log(C x) +// log10(x) = log(x)/log(10) // -// Cx = 1 + r +// so we have that // -// Log(x) = +Nlog2 + T + Series( r - r^2/2 + r^3/3 - r^4/4 ....) +// log10(x) = (N*log(2) + T + log(1+r)) / log(10) = +// = N*(log(2)/log(10)) + (T/log(10)) + log(1 + r)/log(10) // -// 1.f1 f2 ... f8 has 256 entries. -// They are 1 + k/2^8, k = 0 ... 255 -// These 256 values are the table entries. // // Implementation -//=============== -// CASE 1: |x-1| >= 2^-8 -// C = frcpa(x) -// r = C * x - 1 +// -------------- +// It can be seen that formulas for log and log10 differ from one another +// only by coefficients and tabular values.
Namely, both log and log10 are +// calculated as (N*L1 + T) + L2*Series(r) where in case of log +// L1 = log(2) +// T = log(1/frcpa(x)) +// L2 = 1.0 +// and in case of log10 +// L1 = log(2)/log(10) +// T = log(1/frcpa(x))/log(10) +// L2 = 1.0/log(10) // -// Form rseries = r + P1*r^2 + P2*r^3 + P3*r^4 +// So common code can be used, with two different entry points that set pointers +// to the base address of the corresponding data sets containing values +// of L2,T and prepare the integer representation of L1 needed for the following +// setf instruction. // -// x = f * 2*n where f is 1.f_1f_2f_3....f_63 -// Nfloat = float(n) where n is the true unbiased exponent -// pre-index = f_1f_2....f_8 -// index = pre_index * 16 -// get the dxt table entry at index + offset = T +// Note that both log and log10 use a common approximation polynomial, +// which means we need only one set of coefficients of approximation. // -// result = (T + Nfloat * log(2)) + rseries +// 1. Computation of log(x) for |x-1| >= 1/256 +// InvX = frcpa(x) +// r = InvX*x - 1 +// P(r) = r*((1 - A2*r) + r^2*(A3 - A4*r)) = r*P2(r), +// A4,A3,A2 are created with setf instruction. +// We use Taylor series and so A4 = 1/4, A3 = 1/3, +// A2 = 1/2 rounded to double. // -// The T table is calculated as follows -// Form x_k = 1 + k/2^8 where k goes from 0... 255 -// y_k = frcpa(x_k) -// log(1/y_k) in quad and round to double - -// CASE 2: |x-1| < 2^-6 -// w = x - 1 +// N = float(n) where n is true unbiased exponent of x // -// Form wseries = w + Q1*w^2 + Q2*w^3 + Q3*w^4 +// T is tabular value of log(1/frcpa(x)) calculated in quad precision +// and rounded to double.
To find T we take bits from 55 to 62 of register +// format significand of x and calculate address +// ad_T = table_base_addr + 8 * index // -// result = wseries - -// Special values +// L2 (1.0 or 1.0/log(10) depending on function) is calculated in quad +// precision and rounded to double; it's loaded from memory +// +// L1 (log(2) or log10(2) depending on function) is calculated in quad +// precision and rounded to double; it's created with setf. +// +// And final result = P2(r)*(r*L2) + (T + N*L1) +// +// +// 2. Computation of log(x) for |x-1| < 1/256 +// r = x - 1 +// P(r) = r*((1 - A2*r) + r^2*(A3 - A4*r)) = r*P2(r), +// A4,A3,A2 are the same as in case |x-1| >= 1/256 +// +// And final result = P2(r)*(r*L2) +// +// 3. How we determine whether the input argument satisfies |x-1| < 1/256 or not. +// +// To do it we analyze biased exponent and significand of input argument. +// +// a) First we test whether the biased exponent is equal to 0xFFFE or 0xFFFF (i.e. +// we test whether 0.5 <= x < 2). This comparison can be performed using +// unsigned version of cmp instruction in such a way +// biased_exponent_of_x - 0xFFFE < 2 +// +// +// b) Second (in case when result of a) is true) we need to compare x +// with 1-1/256 and 1+1/256 or in register format representation with +// 0xFFFEFF00000000000000 and 0xFFFF8080000000000000 correspondingly. +// Since the biased exponent of x here can be equal only to 0xFFFE or +// 0xFFFF we need to test only last bit of it. Also significand always +// has implicit bit set to 1 that can be excluded from comparison. +// Thus it's quite enough to generate a 64-bit integer ix whose bits are +// ix[63] = biased_exponent_of_x[0] and ix[62-0] = significand_of_x[62-0] +// and compare it with 0x7F00000000000000 and 0x8080000000000000 (those +// obtained like ix from register representations of 255/256 and +// 257/256). This comparison can be made like in a), using unsigned +// version of cmp i.e. ix - 0x7F00000000000000 < 0x0180000000000000.
+// 0x0180000000000000 is the difference between 0x8080000000000000 and +// 0x7F00000000000000. +// +// Note: NaT, any NaNs, +/-INF, +/-0, negatives and unnormalized numbers are +// filtered and processed on special branches. +// +// +// Special values //============================================================== - - -// log(+0) = -inf -// log(-0) = -inf - -// log(+qnan) = +qnan -// log(-qnan) = -qnan -// log(+snan) = +qnan -// log(-snan) = -qnan - -// log(-n) = QNAN Indefinite -// log(-inf) = QNAN Indefinite - -// log(+inf) = +inf - +// +// logf(+0) = -inf +// logf(-0) = -inf +// +// logf(+qnan) = +qnan +// logf(-qnan) = -qnan +// logf(+snan) = +qnan +// logf(-snan) = -qnan +// +// logf(-n) = QNAN Indefinite +// logf(-inf) = QNAN Indefinite +// +// logf(+inf) = +inf +// // Registers used //============================================================== -// Floating Point registers used: +// Floating Point registers used: // f8, input -// f9 -> f15, f32 -> f47 - -// General registers used: -// r32 -> r51 - +// f12 -> f14, f33 -> f39 +// +// General registers used: +// r8 -> r11 +// r14 -> r19 +// // Predicate registers used: -// p6 -> p15 +// p6 -> p12 -// p8 log base e -// p6 log base e special -// p9 used in the frcpa -// p13 log base e large W -// p14 log base e small w - -// p7 log base 10 -// p10 log base 10 large W -// p11 log base 10 small w -// p12 log base 10 special - -#include "libm_support.h" // Assembly macros //============================================================== -log_int_Nfloat = f9 -log_Nfloat = f10 - -log_P3 = f11 -log_P2 = f12 -log_P1 = f13 -log_inv_ln10 = f14 -log_log2 = f15 - -log_w = f32 -log_T = f33 -log_rp_p32 = f34 -log_rp_p2 = f35 -log_rp_p10 = f36 -log_rsq = f37 -log_T_plus_Nlog2 = f38 -log_r = f39 -log_C = f40 -log_rp_q32 = f41 -log_rp_q2 = f42 -log_rp_q10 = f43 -log_wsq = f44 -log_Q = f45 -log_inv_ln10 = f46 -log_NORM_f8 = f47 - -// =================================== - -log_GR_exp_17_ones = r33 -log_GR_exp_16_ones = r34
-log_GR_exp_f8 = r35 -log_GR_signexp_f8 = r36 -log_GR_true_exp_f8 = r37 -log_GR_significand_f8 = r38 -log_GR_index = r39 -log_AD_1 = r40 -log_GR_signexp_w = r41 -log_GR_fff7 = r42 -log_AD_2 = r43 -log_GR_exp_w = r44 - -GR_SAVE_B0 = r45 -GR_SAVE_GP = r46 -GR_SAVE_PFS = r47 - -GR_Parameter_X = r48 -GR_Parameter_Y = r49 -GR_Parameter_RESULT = r50 -log_GR_tag = r51 +GR_TAG = r8 +GR_ad_T = r8 +GR_N = r9 +GR_Exp = r10 +GR_Sig = r11 + +GR_025 = r14 +GR_05 = r15 +GR_A3 = r16 +GR_Ind = r17 +GR_dx = r15 +GR_Ln2 = r19 +GR_de = r20 +GR_x = r21 +GR_xorg = r22 + +GR_SAVE_B0 = r33 +GR_SAVE_PFS = r34 +GR_SAVE_GP = r35 +GR_SAVE_SP = r36 + +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 + + +FR_A2 = f12 +FR_A3 = f13 +FR_A4 = f14 + +FR_RcpX = f33 +FR_r = f34 +FR_r2 = f35 +FR_tmp = f35 +FR_Ln2 = f36 +FR_T = f37 +FR_N = f38 +FR_NxLn2pT = f38 +FR_NormX = f39 +FR_InvLn10 = f40 + + +FR_Y = f1 +FR_X = f10 +FR_RESULT = f8 // Data tables //============================================================== - -#ifdef _LIBC -.rodata -#else -.data -#endif - +RODATA .align 16 - -log_table_1: -ASM_TYPE_DIRECTIVE(log_table_1,@object) -data8 0xbfd0001008f39d59 // p3 -data8 0x3fd5556073e0c45a // p2 -ASM_SIZE_DIRECTIVE(log_table_1) - -log_table_2: -ASM_TYPE_DIRECTIVE(log_table_2,@object) -data8 0xbfdffffffffaea15 // p1 -data8 0x3fdbcb7b1526e50e // 1/ln10 -data8 0x3fe62e42fefa39ef // Log(2) -data8 0x0 // pad - -data8 0x3F60040155D5889E //log(1/frcpa(1+ 0/256) -data8 0x3F78121214586B54 //log(1/frcpa(1+ 1/256) -data8 0x3F841929F96832F0 //log(1/frcpa(1+ 2/256) -data8 0x3F8C317384C75F06 //log(1/frcpa(1+ 3/256) -data8 0x3F91A6B91AC73386 //log(1/frcpa(1+ 4/256) -data8 0x3F95BA9A5D9AC039 //log(1/frcpa(1+ 5/256) -data8 0x3F99D2A8074325F4 //log(1/frcpa(1+ 6/256) -data8 0x3F9D6B2725979802 //log(1/frcpa(1+ 7/256) -data8 0x3FA0C58FA19DFAAA //log(1/frcpa(1+ 8/256) -data8 0x3FA2954C78CBCE1B //log(1/frcpa(1+ 9/256) -data8 0x3FA4A94D2DA96C56 //log(1/frcpa(1+ 10/256) -data8 
0x3FA67C94F2D4BB58 //log(1/frcpa(1+ 11/256) -data8 0x3FA85188B630F068 //log(1/frcpa(1+ 12/256) -data8 0x3FAA6B8ABE73AF4C //log(1/frcpa(1+ 13/256) -data8 0x3FAC441E06F72A9E //log(1/frcpa(1+ 14/256) -data8 0x3FAE1E6713606D07 //log(1/frcpa(1+ 15/256) -data8 0x3FAFFA6911AB9301 //log(1/frcpa(1+ 16/256) -data8 0x3FB0EC139C5DA601 //log(1/frcpa(1+ 17/256) -data8 0x3FB1DBD2643D190B //log(1/frcpa(1+ 18/256) -data8 0x3FB2CC7284FE5F1C //log(1/frcpa(1+ 19/256) -data8 0x3FB3BDF5A7D1EE64 //log(1/frcpa(1+ 20/256) -data8 0x3FB4B05D7AA012E0 //log(1/frcpa(1+ 21/256) -data8 0x3FB580DB7CEB5702 //log(1/frcpa(1+ 22/256) -data8 0x3FB674F089365A7A //log(1/frcpa(1+ 23/256) -data8 0x3FB769EF2C6B568D //log(1/frcpa(1+ 24/256) -data8 0x3FB85FD927506A48 //log(1/frcpa(1+ 25/256) -data8 0x3FB9335E5D594989 //log(1/frcpa(1+ 26/256) -data8 0x3FBA2B0220C8E5F5 //log(1/frcpa(1+ 27/256) -data8 0x3FBB0004AC1A86AC //log(1/frcpa(1+ 28/256) -data8 0x3FBBF968769FCA11 //log(1/frcpa(1+ 29/256) -data8 0x3FBCCFEDBFEE13A8 //log(1/frcpa(1+ 30/256) -data8 0x3FBDA727638446A2 //log(1/frcpa(1+ 31/256) -data8 0x3FBEA3257FE10F7A //log(1/frcpa(1+ 32/256) -data8 0x3FBF7BE9FEDBFDE6 //log(1/frcpa(1+ 33/256) -data8 0x3FC02AB352FF25F4 //log(1/frcpa(1+ 34/256) -data8 0x3FC097CE579D204D //log(1/frcpa(1+ 35/256) -data8 0x3FC1178E8227E47C //log(1/frcpa(1+ 36/256) -data8 0x3FC185747DBECF34 //log(1/frcpa(1+ 37/256) -data8 0x3FC1F3B925F25D41 //log(1/frcpa(1+ 38/256) -data8 0x3FC2625D1E6DDF57 //log(1/frcpa(1+ 39/256) -data8 0x3FC2D1610C86813A //log(1/frcpa(1+ 40/256) -data8 0x3FC340C59741142E //log(1/frcpa(1+ 41/256) -data8 0x3FC3B08B6757F2A9 //log(1/frcpa(1+ 42/256) -data8 0x3FC40DFB08378003 //log(1/frcpa(1+ 43/256) -data8 0x3FC47E74E8CA5F7C //log(1/frcpa(1+ 44/256) -data8 0x3FC4EF51F6466DE4 //log(1/frcpa(1+ 45/256) -data8 0x3FC56092E02BA516 //log(1/frcpa(1+ 46/256) -data8 0x3FC5D23857CD74D5 //log(1/frcpa(1+ 47/256) -data8 0x3FC6313A37335D76 //log(1/frcpa(1+ 48/256) -data8 0x3FC6A399DABBD383 //log(1/frcpa(1+ 49/256) -data8 
0x3FC70337DD3CE41B //log(1/frcpa(1+ 50/256) -data8 0x3FC77654128F6127 //log(1/frcpa(1+ 51/256) -data8 0x3FC7E9D82A0B022D //log(1/frcpa(1+ 52/256) -data8 0x3FC84A6B759F512F //log(1/frcpa(1+ 53/256) -data8 0x3FC8AB47D5F5A310 //log(1/frcpa(1+ 54/256) -data8 0x3FC91FE49096581B //log(1/frcpa(1+ 55/256) -data8 0x3FC981634011AA75 //log(1/frcpa(1+ 56/256) -data8 0x3FC9F6C407089664 //log(1/frcpa(1+ 57/256) -data8 0x3FCA58E729348F43 //log(1/frcpa(1+ 58/256) -data8 0x3FCABB55C31693AD //log(1/frcpa(1+ 59/256) -data8 0x3FCB1E104919EFD0 //log(1/frcpa(1+ 60/256) -data8 0x3FCB94EE93E367CB //log(1/frcpa(1+ 61/256) -data8 0x3FCBF851C067555F //log(1/frcpa(1+ 62/256) -data8 0x3FCC5C0254BF23A6 //log(1/frcpa(1+ 63/256) -data8 0x3FCCC000C9DB3C52 //log(1/frcpa(1+ 64/256) -data8 0x3FCD244D99C85674 //log(1/frcpa(1+ 65/256) -data8 0x3FCD88E93FB2F450 //log(1/frcpa(1+ 66/256) -data8 0x3FCDEDD437EAEF01 //log(1/frcpa(1+ 67/256) -data8 0x3FCE530EFFE71012 //log(1/frcpa(1+ 68/256) -data8 0x3FCEB89A1648B971 //log(1/frcpa(1+ 69/256) -data8 0x3FCF1E75FADF9BDE //log(1/frcpa(1+ 70/256) -data8 0x3FCF84A32EAD7C35 //log(1/frcpa(1+ 71/256) -data8 0x3FCFEB2233EA07CD //log(1/frcpa(1+ 72/256) -data8 0x3FD028F9C7035C1C //log(1/frcpa(1+ 73/256) -data8 0x3FD05C8BE0D9635A //log(1/frcpa(1+ 74/256) -data8 0x3FD085EB8F8AE797 //log(1/frcpa(1+ 75/256) -data8 0x3FD0B9C8E32D1911 //log(1/frcpa(1+ 76/256) -data8 0x3FD0EDD060B78081 //log(1/frcpa(1+ 77/256) -data8 0x3FD122024CF0063F //log(1/frcpa(1+ 78/256) -data8 0x3FD14BE2927AECD4 //log(1/frcpa(1+ 79/256) -data8 0x3FD180618EF18ADF //log(1/frcpa(1+ 80/256) -data8 0x3FD1B50BBE2FC63B //log(1/frcpa(1+ 81/256) -data8 0x3FD1DF4CC7CF242D //log(1/frcpa(1+ 82/256) -data8 0x3FD214456D0EB8D4 //log(1/frcpa(1+ 83/256) -data8 0x3FD23EC5991EBA49 //log(1/frcpa(1+ 84/256) -data8 0x3FD2740D9F870AFB //log(1/frcpa(1+ 85/256) -data8 0x3FD29ECDABCDFA04 //log(1/frcpa(1+ 86/256) -data8 0x3FD2D46602ADCCEE //log(1/frcpa(1+ 87/256) -data8 0x3FD2FF66B04EA9D4 //log(1/frcpa(1+ 88/256) -data8 
0x3FD335504B355A37 //log(1/frcpa(1+ 89/256) -data8 0x3FD360925EC44F5D //log(1/frcpa(1+ 90/256) -data8 0x3FD38BF1C3337E75 //log(1/frcpa(1+ 91/256) -data8 0x3FD3C25277333184 //log(1/frcpa(1+ 92/256) -data8 0x3FD3EDF463C1683E //log(1/frcpa(1+ 93/256) -data8 0x3FD419B423D5E8C7 //log(1/frcpa(1+ 94/256) -data8 0x3FD44591E0539F49 //log(1/frcpa(1+ 95/256) -data8 0x3FD47C9175B6F0AD //log(1/frcpa(1+ 96/256) -data8 0x3FD4A8B341552B09 //log(1/frcpa(1+ 97/256) -data8 0x3FD4D4F3908901A0 //log(1/frcpa(1+ 98/256) -data8 0x3FD501528DA1F968 //log(1/frcpa(1+ 99/256) -data8 0x3FD52DD06347D4F6 //log(1/frcpa(1+ 100/256) -data8 0x3FD55A6D3C7B8A8A //log(1/frcpa(1+ 101/256) -data8 0x3FD5925D2B112A59 //log(1/frcpa(1+ 102/256) -data8 0x3FD5BF406B543DB2 //log(1/frcpa(1+ 103/256) -data8 0x3FD5EC433D5C35AE //log(1/frcpa(1+ 104/256) -data8 0x3FD61965CDB02C1F //log(1/frcpa(1+ 105/256) -data8 0x3FD646A84935B2A2 //log(1/frcpa(1+ 106/256) -data8 0x3FD6740ADD31DE94 //log(1/frcpa(1+ 107/256) -data8 0x3FD6A18DB74A58C5 //log(1/frcpa(1+ 108/256) -data8 0x3FD6CF31058670EC //log(1/frcpa(1+ 109/256) -data8 0x3FD6F180E852F0BA //log(1/frcpa(1+ 110/256) -data8 0x3FD71F5D71B894F0 //log(1/frcpa(1+ 111/256) -data8 0x3FD74D5AEFD66D5C //log(1/frcpa(1+ 112/256) -data8 0x3FD77B79922BD37E //log(1/frcpa(1+ 113/256) -data8 0x3FD7A9B9889F19E2 //log(1/frcpa(1+ 114/256) -data8 0x3FD7D81B037EB6A6 //log(1/frcpa(1+ 115/256) -data8 0x3FD8069E33827231 //log(1/frcpa(1+ 116/256) -data8 0x3FD82996D3EF8BCB //log(1/frcpa(1+ 117/256) -data8 0x3FD85855776DCBFB //log(1/frcpa(1+ 118/256) -data8 0x3FD8873658327CCF //log(1/frcpa(1+ 119/256) -data8 0x3FD8AA75973AB8CF //log(1/frcpa(1+ 120/256) -data8 0x3FD8D992DC8824E5 //log(1/frcpa(1+ 121/256) -data8 0x3FD908D2EA7D9512 //log(1/frcpa(1+ 122/256) -data8 0x3FD92C59E79C0E56 //log(1/frcpa(1+ 123/256) -data8 0x3FD95BD750EE3ED3 //log(1/frcpa(1+ 124/256) -data8 0x3FD98B7811A3EE5B //log(1/frcpa(1+ 125/256) -data8 0x3FD9AF47F33D406C //log(1/frcpa(1+ 126/256) -data8 0x3FD9DF270C1914A8 
//log(1/frcpa(1+ 127/256) -data8 0x3FDA0325ED14FDA4 //log(1/frcpa(1+ 128/256) -data8 0x3FDA33440224FA79 //log(1/frcpa(1+ 129/256) -data8 0x3FDA57725E80C383 //log(1/frcpa(1+ 130/256) -data8 0x3FDA87D0165DD199 //log(1/frcpa(1+ 131/256) -data8 0x3FDAAC2E6C03F896 //log(1/frcpa(1+ 132/256) -data8 0x3FDADCCC6FDF6A81 //log(1/frcpa(1+ 133/256) -data8 0x3FDB015B3EB1E790 //log(1/frcpa(1+ 134/256) -data8 0x3FDB323A3A635948 //log(1/frcpa(1+ 135/256) -data8 0x3FDB56FA04462909 //log(1/frcpa(1+ 136/256) -data8 0x3FDB881AA659BC93 //log(1/frcpa(1+ 137/256) -data8 0x3FDBAD0BEF3DB165 //log(1/frcpa(1+ 138/256) -data8 0x3FDBD21297781C2F //log(1/frcpa(1+ 139/256) -data8 0x3FDC039236F08819 //log(1/frcpa(1+ 140/256) -data8 0x3FDC28CB1E4D32FD //log(1/frcpa(1+ 141/256) -data8 0x3FDC4E19B84723C2 //log(1/frcpa(1+ 142/256) -data8 0x3FDC7FF9C74554C9 //log(1/frcpa(1+ 143/256) -data8 0x3FDCA57B64E9DB05 //log(1/frcpa(1+ 144/256) -data8 0x3FDCCB130A5CEBB0 //log(1/frcpa(1+ 145/256) -data8 0x3FDCF0C0D18F326F //log(1/frcpa(1+ 146/256) -data8 0x3FDD232075B5A201 //log(1/frcpa(1+ 147/256) -data8 0x3FDD490246DEFA6B //log(1/frcpa(1+ 148/256) -data8 0x3FDD6EFA918D25CD //log(1/frcpa(1+ 149/256) -data8 0x3FDD9509707AE52F //log(1/frcpa(1+ 150/256) -data8 0x3FDDBB2EFE92C554 //log(1/frcpa(1+ 151/256) -data8 0x3FDDEE2F3445E4AF //log(1/frcpa(1+ 152/256) -data8 0x3FDE148A1A2726CE //log(1/frcpa(1+ 153/256) -data8 0x3FDE3AFC0A49FF40 //log(1/frcpa(1+ 154/256) -data8 0x3FDE6185206D516E //log(1/frcpa(1+ 155/256) -data8 0x3FDE882578823D52 //log(1/frcpa(1+ 156/256) -data8 0x3FDEAEDD2EAC990C //log(1/frcpa(1+ 157/256) -data8 0x3FDED5AC5F436BE3 //log(1/frcpa(1+ 158/256) -data8 0x3FDEFC9326D16AB9 //log(1/frcpa(1+ 159/256) -data8 0x3FDF2391A2157600 //log(1/frcpa(1+ 160/256) -data8 0x3FDF4AA7EE03192D //log(1/frcpa(1+ 161/256) -data8 0x3FDF71D627C30BB0 //log(1/frcpa(1+ 162/256) -data8 0x3FDF991C6CB3B379 //log(1/frcpa(1+ 163/256) -data8 0x3FDFC07ADA69A910 //log(1/frcpa(1+ 164/256) -data8 0x3FDFE7F18EB03D3E //log(1/frcpa(1+ 
165/256) -data8 0x3FE007C053C5002E //log(1/frcpa(1+ 166/256) -data8 0x3FE01B942198A5A1 //log(1/frcpa(1+ 167/256) -data8 0x3FE02F74400C64EB //log(1/frcpa(1+ 168/256) -data8 0x3FE04360BE7603AD //log(1/frcpa(1+ 169/256) -data8 0x3FE05759AC47FE34 //log(1/frcpa(1+ 170/256) -data8 0x3FE06B5F1911CF52 //log(1/frcpa(1+ 171/256) -data8 0x3FE078BF0533C568 //log(1/frcpa(1+ 172/256) -data8 0x3FE08CD9687E7B0E //log(1/frcpa(1+ 173/256) -data8 0x3FE0A10074CF9019 //log(1/frcpa(1+ 174/256) -data8 0x3FE0B5343A234477 //log(1/frcpa(1+ 175/256) -data8 0x3FE0C974C89431CE //log(1/frcpa(1+ 176/256) -data8 0x3FE0DDC2305B9886 //log(1/frcpa(1+ 177/256) -data8 0x3FE0EB524BAFC918 //log(1/frcpa(1+ 178/256) -data8 0x3FE0FFB54213A476 //log(1/frcpa(1+ 179/256) -data8 0x3FE114253DA97D9F //log(1/frcpa(1+ 180/256) -data8 0x3FE128A24F1D9AFF //log(1/frcpa(1+ 181/256) -data8 0x3FE1365252BF0865 //log(1/frcpa(1+ 182/256) -data8 0x3FE14AE558B4A92D //log(1/frcpa(1+ 183/256) -data8 0x3FE15F85A19C765B //log(1/frcpa(1+ 184/256) -data8 0x3FE16D4D38C119FA //log(1/frcpa(1+ 185/256) -data8 0x3FE18203C20DD133 //log(1/frcpa(1+ 186/256) -data8 0x3FE196C7BC4B1F3B //log(1/frcpa(1+ 187/256) -data8 0x3FE1A4A738B7A33C //log(1/frcpa(1+ 188/256) -data8 0x3FE1B981C0C9653D //log(1/frcpa(1+ 189/256) -data8 0x3FE1CE69E8BB106B //log(1/frcpa(1+ 190/256) -data8 0x3FE1DC619DE06944 //log(1/frcpa(1+ 191/256) -data8 0x3FE1F160A2AD0DA4 //log(1/frcpa(1+ 192/256) -data8 0x3FE2066D7740737E //log(1/frcpa(1+ 193/256) -data8 0x3FE2147DBA47A394 //log(1/frcpa(1+ 194/256) -data8 0x3FE229A1BC5EBAC3 //log(1/frcpa(1+ 195/256) -data8 0x3FE237C1841A502E //log(1/frcpa(1+ 196/256) -data8 0x3FE24CFCE6F80D9A //log(1/frcpa(1+ 197/256) -data8 0x3FE25B2C55CD5762 //log(1/frcpa(1+ 198/256) -data8 0x3FE2707F4D5F7C41 //log(1/frcpa(1+ 199/256) -data8 0x3FE285E0842CA384 //log(1/frcpa(1+ 200/256) -data8 0x3FE294294708B773 //log(1/frcpa(1+ 201/256) -data8 0x3FE2A9A2670AFF0C //log(1/frcpa(1+ 202/256) -data8 0x3FE2B7FB2C8D1CC1 //log(1/frcpa(1+ 203/256) -data8 
0x3FE2C65A6395F5F5 //log(1/frcpa(1+ 204/256) -data8 0x3FE2DBF557B0DF43 //log(1/frcpa(1+ 205/256) -data8 0x3FE2EA64C3F97655 //log(1/frcpa(1+ 206/256) -data8 0x3FE3001823684D73 //log(1/frcpa(1+ 207/256) -data8 0x3FE30E97E9A8B5CD //log(1/frcpa(1+ 208/256) -data8 0x3FE32463EBDD34EA //log(1/frcpa(1+ 209/256) -data8 0x3FE332F4314AD796 //log(1/frcpa(1+ 210/256) -data8 0x3FE348D90E7464D0 //log(1/frcpa(1+ 211/256) -data8 0x3FE35779F8C43D6E //log(1/frcpa(1+ 212/256) -data8 0x3FE36621961A6A99 //log(1/frcpa(1+ 213/256) -data8 0x3FE37C299F3C366A //log(1/frcpa(1+ 214/256) -data8 0x3FE38AE2171976E7 //log(1/frcpa(1+ 215/256) -data8 0x3FE399A157A603E7 //log(1/frcpa(1+ 216/256) -data8 0x3FE3AFCCFE77B9D1 //log(1/frcpa(1+ 217/256) -data8 0x3FE3BE9D503533B5 //log(1/frcpa(1+ 218/256) -data8 0x3FE3CD7480B4A8A3 //log(1/frcpa(1+ 219/256) -data8 0x3FE3E3C43918F76C //log(1/frcpa(1+ 220/256) -data8 0x3FE3F2ACB27ED6C7 //log(1/frcpa(1+ 221/256) -data8 0x3FE4019C2125CA93 //log(1/frcpa(1+ 222/256) -data8 0x3FE4181061389722 //log(1/frcpa(1+ 223/256) -data8 0x3FE42711518DF545 //log(1/frcpa(1+ 224/256) -data8 0x3FE436194E12B6BF //log(1/frcpa(1+ 225/256) -data8 0x3FE445285D68EA69 //log(1/frcpa(1+ 226/256) -data8 0x3FE45BCC464C893A //log(1/frcpa(1+ 227/256) -data8 0x3FE46AED21F117FC //log(1/frcpa(1+ 228/256) -data8 0x3FE47A1527E8A2D3 //log(1/frcpa(1+ 229/256) -data8 0x3FE489445EFFFCCC //log(1/frcpa(1+ 230/256) -data8 0x3FE4A018BCB69835 //log(1/frcpa(1+ 231/256) -data8 0x3FE4AF5A0C9D65D7 //log(1/frcpa(1+ 232/256) -data8 0x3FE4BEA2A5BDBE87 //log(1/frcpa(1+ 233/256) -data8 0x3FE4CDF28F10AC46 //log(1/frcpa(1+ 234/256) -data8 0x3FE4DD49CF994058 //log(1/frcpa(1+ 235/256) -data8 0x3FE4ECA86E64A684 //log(1/frcpa(1+ 236/256) -data8 0x3FE503C43CD8EB68 //log(1/frcpa(1+ 237/256) -data8 0x3FE513356667FC57 //log(1/frcpa(1+ 238/256) -data8 0x3FE522AE0738A3D8 //log(1/frcpa(1+ 239/256) -data8 0x3FE5322E26867857 //log(1/frcpa(1+ 240/256) -data8 0x3FE541B5CB979809 //log(1/frcpa(1+ 241/256) -data8 0x3FE55144FDBCBD62 
//log(1/frcpa(1+ 242/256) -data8 0x3FE560DBC45153C7 //log(1/frcpa(1+ 243/256) -data8 0x3FE5707A26BB8C66 //log(1/frcpa(1+ 244/256) -data8 0x3FE587F60ED5B900 //log(1/frcpa(1+ 245/256) -data8 0x3FE597A7977C8F31 //log(1/frcpa(1+ 246/256) -data8 0x3FE5A760D634BB8B //log(1/frcpa(1+ 247/256) -data8 0x3FE5B721D295F10F //log(1/frcpa(1+ 248/256) -data8 0x3FE5C6EA94431EF9 //log(1/frcpa(1+ 249/256) -data8 0x3FE5D6BB22EA86F6 //log(1/frcpa(1+ 250/256) -data8 0x3FE5E6938645D390 //log(1/frcpa(1+ 251/256) -data8 0x3FE5F673C61A2ED2 //log(1/frcpa(1+ 252/256) -data8 0x3FE6065BEA385926 //log(1/frcpa(1+ 253/256) -data8 0x3FE6164BFA7CC06B //log(1/frcpa(1+ 254/256) -data8 0x3FE62643FECF9743 //log(1/frcpa(1+ 255/256) -ASM_SIZE_DIRECTIVE(log_table_2) - - -.align 32 -.global logf# -.global log10f# - -// log10 has p7 true, p8 false -// log has p8 true, p7 false - +LOCAL_OBJECT_START(logf_data) +data8 0x3FF0000000000000 // 1.0 +// +// ln(1/frcpa(1+i/256)), i=0...255 +data8 0x3F60040155D5889E // 0 +data8 0x3F78121214586B54 // 1 +data8 0x3F841929F96832F0 // 2 +data8 0x3F8C317384C75F06 // 3 +data8 0x3F91A6B91AC73386 // 4 +data8 0x3F95BA9A5D9AC039 // 5 +data8 0x3F99D2A8074325F4 // 6 +data8 0x3F9D6B2725979802 // 7 +data8 0x3FA0C58FA19DFAAA // 8 +data8 0x3FA2954C78CBCE1B // 9 +data8 0x3FA4A94D2DA96C56 // 10 +data8 0x3FA67C94F2D4BB58 // 11 +data8 0x3FA85188B630F068 // 12 +data8 0x3FAA6B8ABE73AF4C // 13 +data8 0x3FAC441E06F72A9E // 14 +data8 0x3FAE1E6713606D07 // 15 +data8 0x3FAFFA6911AB9301 // 16 +data8 0x3FB0EC139C5DA601 // 17 +data8 0x3FB1DBD2643D190B // 18 +data8 0x3FB2CC7284FE5F1C // 19 +data8 0x3FB3BDF5A7D1EE64 // 20 +data8 0x3FB4B05D7AA012E0 // 21 +data8 0x3FB580DB7CEB5702 // 22 +data8 0x3FB674F089365A7A // 23 +data8 0x3FB769EF2C6B568D // 24 +data8 0x3FB85FD927506A48 // 25 +data8 0x3FB9335E5D594989 // 26 +data8 0x3FBA2B0220C8E5F5 // 27 +data8 0x3FBB0004AC1A86AC // 28 +data8 0x3FBBF968769FCA11 // 29 +data8 0x3FBCCFEDBFEE13A8 // 30 +data8 0x3FBDA727638446A2 // 31 +data8 0x3FBEA3257FE10F7A // 32 
+data8 0x3FBF7BE9FEDBFDE6 // 33 +data8 0x3FC02AB352FF25F4 // 34 +data8 0x3FC097CE579D204D // 35 +data8 0x3FC1178E8227E47C // 36 +data8 0x3FC185747DBECF34 // 37 +data8 0x3FC1F3B925F25D41 // 38 +data8 0x3FC2625D1E6DDF57 // 39 +data8 0x3FC2D1610C86813A // 40 +data8 0x3FC340C59741142E // 41 +data8 0x3FC3B08B6757F2A9 // 42 +data8 0x3FC40DFB08378003 // 43 +data8 0x3FC47E74E8CA5F7C // 44 +data8 0x3FC4EF51F6466DE4 // 45 +data8 0x3FC56092E02BA516 // 46 +data8 0x3FC5D23857CD74D5 // 47 +data8 0x3FC6313A37335D76 // 48 +data8 0x3FC6A399DABBD383 // 49 +data8 0x3FC70337DD3CE41B // 50 +data8 0x3FC77654128F6127 // 51 +data8 0x3FC7E9D82A0B022D // 52 +data8 0x3FC84A6B759F512F // 53 +data8 0x3FC8AB47D5F5A310 // 54 +data8 0x3FC91FE49096581B // 55 +data8 0x3FC981634011AA75 // 56 +data8 0x3FC9F6C407089664 // 57 +data8 0x3FCA58E729348F43 // 58 +data8 0x3FCABB55C31693AD // 59 +data8 0x3FCB1E104919EFD0 // 60 +data8 0x3FCB94EE93E367CB // 61 +data8 0x3FCBF851C067555F // 62 +data8 0x3FCC5C0254BF23A6 // 63 +data8 0x3FCCC000C9DB3C52 // 64 +data8 0x3FCD244D99C85674 // 65 +data8 0x3FCD88E93FB2F450 // 66 +data8 0x3FCDEDD437EAEF01 // 67 +data8 0x3FCE530EFFE71012 // 68 +data8 0x3FCEB89A1648B971 // 69 +data8 0x3FCF1E75FADF9BDE // 70 +data8 0x3FCF84A32EAD7C35 // 71 +data8 0x3FCFEB2233EA07CD // 72 +data8 0x3FD028F9C7035C1C // 73 +data8 0x3FD05C8BE0D9635A // 74 +data8 0x3FD085EB8F8AE797 // 75 +data8 0x3FD0B9C8E32D1911 // 76 +data8 0x3FD0EDD060B78081 // 77 +data8 0x3FD122024CF0063F // 78 +data8 0x3FD14BE2927AECD4 // 79 +data8 0x3FD180618EF18ADF // 80 +data8 0x3FD1B50BBE2FC63B // 81 +data8 0x3FD1DF4CC7CF242D // 82 +data8 0x3FD214456D0EB8D4 // 83 +data8 0x3FD23EC5991EBA49 // 84 +data8 0x3FD2740D9F870AFB // 85 +data8 0x3FD29ECDABCDFA04 // 86 +data8 0x3FD2D46602ADCCEE // 87 +data8 0x3FD2FF66B04EA9D4 // 88 +data8 0x3FD335504B355A37 // 89 +data8 0x3FD360925EC44F5D // 90 +data8 0x3FD38BF1C3337E75 // 91 +data8 0x3FD3C25277333184 // 92 +data8 0x3FD3EDF463C1683E // 93 +data8 0x3FD419B423D5E8C7 // 94 +data8 
0x3FD44591E0539F49 // 95 +data8 0x3FD47C9175B6F0AD // 96 +data8 0x3FD4A8B341552B09 // 97 +data8 0x3FD4D4F3908901A0 // 98 +data8 0x3FD501528DA1F968 // 99 +data8 0x3FD52DD06347D4F6 // 100 +data8 0x3FD55A6D3C7B8A8A // 101 +data8 0x3FD5925D2B112A59 // 102 +data8 0x3FD5BF406B543DB2 // 103 +data8 0x3FD5EC433D5C35AE // 104 +data8 0x3FD61965CDB02C1F // 105 +data8 0x3FD646A84935B2A2 // 106 +data8 0x3FD6740ADD31DE94 // 107 +data8 0x3FD6A18DB74A58C5 // 108 +data8 0x3FD6CF31058670EC // 109 +data8 0x3FD6F180E852F0BA // 110 +data8 0x3FD71F5D71B894F0 // 111 +data8 0x3FD74D5AEFD66D5C // 112 +data8 0x3FD77B79922BD37E // 113 +data8 0x3FD7A9B9889F19E2 // 114 +data8 0x3FD7D81B037EB6A6 // 115 +data8 0x3FD8069E33827231 // 116 +data8 0x3FD82996D3EF8BCB // 117 +data8 0x3FD85855776DCBFB // 118 +data8 0x3FD8873658327CCF // 119 +data8 0x3FD8AA75973AB8CF // 120 +data8 0x3FD8D992DC8824E5 // 121 +data8 0x3FD908D2EA7D9512 // 122 +data8 0x3FD92C59E79C0E56 // 123 +data8 0x3FD95BD750EE3ED3 // 124 +data8 0x3FD98B7811A3EE5B // 125 +data8 0x3FD9AF47F33D406C // 126 +data8 0x3FD9DF270C1914A8 // 127 +data8 0x3FDA0325ED14FDA4 // 128 +data8 0x3FDA33440224FA79 // 129 +data8 0x3FDA57725E80C383 // 130 +data8 0x3FDA87D0165DD199 // 131 +data8 0x3FDAAC2E6C03F896 // 132 +data8 0x3FDADCCC6FDF6A81 // 133 +data8 0x3FDB015B3EB1E790 // 134 +data8 0x3FDB323A3A635948 // 135 +data8 0x3FDB56FA04462909 // 136 +data8 0x3FDB881AA659BC93 // 137 +data8 0x3FDBAD0BEF3DB165 // 138 +data8 0x3FDBD21297781C2F // 139 +data8 0x3FDC039236F08819 // 140 +data8 0x3FDC28CB1E4D32FD // 141 +data8 0x3FDC4E19B84723C2 // 142 +data8 0x3FDC7FF9C74554C9 // 143 +data8 0x3FDCA57B64E9DB05 // 144 +data8 0x3FDCCB130A5CEBB0 // 145 +data8 0x3FDCF0C0D18F326F // 146 +data8 0x3FDD232075B5A201 // 147 +data8 0x3FDD490246DEFA6B // 148 +data8 0x3FDD6EFA918D25CD // 149 +data8 0x3FDD9509707AE52F // 150 +data8 0x3FDDBB2EFE92C554 // 151 +data8 0x3FDDEE2F3445E4AF // 152 +data8 0x3FDE148A1A2726CE // 153 +data8 0x3FDE3AFC0A49FF40 // 154 +data8 0x3FDE6185206D516E // 
155 +data8 0x3FDE882578823D52 // 156 +data8 0x3FDEAEDD2EAC990C // 157 +data8 0x3FDED5AC5F436BE3 // 158 +data8 0x3FDEFC9326D16AB9 // 159 +data8 0x3FDF2391A2157600 // 160 +data8 0x3FDF4AA7EE03192D // 161 +data8 0x3FDF71D627C30BB0 // 162 +data8 0x3FDF991C6CB3B379 // 163 +data8 0x3FDFC07ADA69A910 // 164 +data8 0x3FDFE7F18EB03D3E // 165 +data8 0x3FE007C053C5002E // 166 +data8 0x3FE01B942198A5A1 // 167 +data8 0x3FE02F74400C64EB // 168 +data8 0x3FE04360BE7603AD // 169 +data8 0x3FE05759AC47FE34 // 170 +data8 0x3FE06B5F1911CF52 // 171 +data8 0x3FE078BF0533C568 // 172 +data8 0x3FE08CD9687E7B0E // 173 +data8 0x3FE0A10074CF9019 // 174 +data8 0x3FE0B5343A234477 // 175 +data8 0x3FE0C974C89431CE // 176 +data8 0x3FE0DDC2305B9886 // 177 +data8 0x3FE0EB524BAFC918 // 178 +data8 0x3FE0FFB54213A476 // 179 +data8 0x3FE114253DA97D9F // 180 +data8 0x3FE128A24F1D9AFF // 181 +data8 0x3FE1365252BF0865 // 182 +data8 0x3FE14AE558B4A92D // 183 +data8 0x3FE15F85A19C765B // 184 +data8 0x3FE16D4D38C119FA // 185 +data8 0x3FE18203C20DD133 // 186 +data8 0x3FE196C7BC4B1F3B // 187 +data8 0x3FE1A4A738B7A33C // 188 +data8 0x3FE1B981C0C9653D // 189 +data8 0x3FE1CE69E8BB106B // 190 +data8 0x3FE1DC619DE06944 // 191 +data8 0x3FE1F160A2AD0DA4 // 192 +data8 0x3FE2066D7740737E // 193 +data8 0x3FE2147DBA47A394 // 194 +data8 0x3FE229A1BC5EBAC3 // 195 +data8 0x3FE237C1841A502E // 196 +data8 0x3FE24CFCE6F80D9A // 197 +data8 0x3FE25B2C55CD5762 // 198 +data8 0x3FE2707F4D5F7C41 // 199 +data8 0x3FE285E0842CA384 // 200 +data8 0x3FE294294708B773 // 201 +data8 0x3FE2A9A2670AFF0C // 202 +data8 0x3FE2B7FB2C8D1CC1 // 203 +data8 0x3FE2C65A6395F5F5 // 204 +data8 0x3FE2DBF557B0DF43 // 205 +data8 0x3FE2EA64C3F97655 // 206 +data8 0x3FE3001823684D73 // 207 +data8 0x3FE30E97E9A8B5CD // 208 +data8 0x3FE32463EBDD34EA // 209 +data8 0x3FE332F4314AD796 // 210 +data8 0x3FE348D90E7464D0 // 211 +data8 0x3FE35779F8C43D6E // 212 +data8 0x3FE36621961A6A99 // 213 +data8 0x3FE37C299F3C366A // 214 +data8 0x3FE38AE2171976E7 // 215 +data8 
0x3FE399A157A603E7 // 216 +data8 0x3FE3AFCCFE77B9D1 // 217 +data8 0x3FE3BE9D503533B5 // 218 +data8 0x3FE3CD7480B4A8A3 // 219 +data8 0x3FE3E3C43918F76C // 220 +data8 0x3FE3F2ACB27ED6C7 // 221 +data8 0x3FE4019C2125CA93 // 222 +data8 0x3FE4181061389722 // 223 +data8 0x3FE42711518DF545 // 224 +data8 0x3FE436194E12B6BF // 225 +data8 0x3FE445285D68EA69 // 226 +data8 0x3FE45BCC464C893A // 227 +data8 0x3FE46AED21F117FC // 228 +data8 0x3FE47A1527E8A2D3 // 229 +data8 0x3FE489445EFFFCCC // 230 +data8 0x3FE4A018BCB69835 // 231 +data8 0x3FE4AF5A0C9D65D7 // 232 +data8 0x3FE4BEA2A5BDBE87 // 233 +data8 0x3FE4CDF28F10AC46 // 234 +data8 0x3FE4DD49CF994058 // 235 +data8 0x3FE4ECA86E64A684 // 236 +data8 0x3FE503C43CD8EB68 // 237 +data8 0x3FE513356667FC57 // 238 +data8 0x3FE522AE0738A3D8 // 239 +data8 0x3FE5322E26867857 // 240 +data8 0x3FE541B5CB979809 // 241 +data8 0x3FE55144FDBCBD62 // 242 +data8 0x3FE560DBC45153C7 // 243 +data8 0x3FE5707A26BB8C66 // 244 +data8 0x3FE587F60ED5B900 // 245 +data8 0x3FE597A7977C8F31 // 246 +data8 0x3FE5A760D634BB8B // 247 +data8 0x3FE5B721D295F10F // 248 +data8 0x3FE5C6EA94431EF9 // 249 +data8 0x3FE5D6BB22EA86F6 // 250 +data8 0x3FE5E6938645D390 // 251 +data8 0x3FE5F673C61A2ED2 // 252 +data8 0x3FE6065BEA385926 // 253 +data8 0x3FE6164BFA7CC06B // 254 +data8 0x3FE62643FECF9743 // 255 +LOCAL_OBJECT_END(logf_data) + +LOCAL_OBJECT_START(log10f_data) +data8 0x3FDBCB7B1526E50E // 1/ln(10) +// +// ln(1/frcpa(1+i/256))/ln(10), i=0...255 +data8 0x3F4BD27045BFD025 // 0 +data8 0x3F64E84E793A474A // 1 +data8 0x3F7175085AB85FF0 // 2 +data8 0x3F787CFF9D9147A5 // 3 +data8 0x3F7EA9D372B89FC8 // 4 +data8 0x3F82DF9D95DA961C // 5 +data8 0x3F866DF172D6372C // 6 +data8 0x3F898D79EF5EEDF0 // 7 +data8 0x3F8D22ADF3F9579D // 8 +data8 0x3F9024231D30C398 // 9 +data8 0x3F91F23A98897D4A // 10 +data8 0x3F93881A7B818F9E // 11 +data8 0x3F951F6E1E759E35 // 12 +data8 0x3F96F2BCE7ADC5B4 // 13 +data8 0x3F988D362CDF359E // 14 +data8 0x3F9A292BAF010982 // 15 +data8 0x3F9BC6A03117EB97 // 16 
+data8 0x3F9D65967DE3AB09 // 17 +data8 0x3F9F061167FC31E8 // 18 +data8 0x3FA05409E4F7819C // 19 +data8 0x3FA125D0432EA20E // 20 +data8 0x3FA1F85D440D299B // 21 +data8 0x3FA2AD755749617D // 22 +data8 0x3FA381772A00E604 // 23 +data8 0x3FA45643E165A70B // 24 +data8 0x3FA52BDD034475B8 // 25 +data8 0x3FA5E3966B7E9295 // 26 +data8 0x3FA6BAAF47C5B245 // 27 +data8 0x3FA773B3E8C4F3C8 // 28 +data8 0x3FA84C51EBEE8D15 // 29 +data8 0x3FA906A6786FC1CB // 30 +data8 0x3FA9C197ABF00DD7 // 31 +data8 0x3FAA9C78712191F7 // 32 +data8 0x3FAB58C09C8D637C // 33 +data8 0x3FAC15A8BCDD7B7E // 34 +data8 0x3FACD331E2C2967C // 35 +data8 0x3FADB11ED766ABF4 // 36 +data8 0x3FAE70089346A9E6 // 37 +data8 0x3FAF2F96C6754AEE // 38 +data8 0x3FAFEFCA8D451FD6 // 39 +data8 0x3FB0585283764178 // 40 +data8 0x3FB0B913AAC7D3A7 // 41 +data8 0x3FB11A294F2569F6 // 42 +data8 0x3FB16B51A2696891 // 43 +data8 0x3FB1CD03ADACC8BE // 44 +data8 0x3FB22F0BDD7745F5 // 45 +data8 0x3FB2916ACA38D1E8 // 46 +data8 0x3FB2F4210DF7663D // 47 +data8 0x3FB346A6C3C49066 // 48 +data8 0x3FB3A9FEBC60540A // 49 +data8 0x3FB3FD0C10A3AA54 // 50 +data8 0x3FB46107D3540A82 // 51 +data8 0x3FB4C55DD16967FE // 52 +data8 0x3FB51940330C000B // 53 +data8 0x3FB56D620EE7115E // 54 +data8 0x3FB5D2ABCF26178E // 55 +data8 0x3FB6275AA5DEBF81 // 56 +data8 0x3FB68D4EAF26D7EE // 57 +data8 0x3FB6E28C5C54A28D // 58 +data8 0x3FB7380B9665B7C8 // 59 +data8 0x3FB78DCCC278E85B // 60 +data8 0x3FB7F50C2CF2557A // 61 +data8 0x3FB84B5FD5EAEFD8 // 62 +data8 0x3FB8A1F6BAB2B226 // 63 +data8 0x3FB8F8D144557BDF // 64 +data8 0x3FB94FEFDCD61D92 // 65 +data8 0x3FB9A752EF316149 // 66 +data8 0x3FB9FEFAE7611EE0 // 67 +data8 0x3FBA56E8325F5C87 // 68 +data8 0x3FBAAF1B3E297BB4 // 69 +data8 0x3FBB079479C372AD // 70 +data8 0x3FBB6054553B12F7 // 71 +data8 0x3FBBB95B41AB5CE6 // 72 +data8 0x3FBC12A9B13FE079 // 73 +data8 0x3FBC6C4017382BEA // 74 +data8 0x3FBCB41FBA42686D // 75 +data8 0x3FBD0E38CE73393F // 76 +data8 0x3FBD689B2193F133 // 77 +data8 0x3FBDC3472B1D2860 // 78 +data8 
0x3FBE0C06300D528B // 79 +data8 0x3FBE6738190E394C // 80 +data8 0x3FBEC2B50D208D9B // 81 +data8 0x3FBF0C1C2B936828 // 82 +data8 0x3FBF68216C9CC727 // 83 +data8 0x3FBFB1F6381856F4 // 84 +data8 0x3FC00742AF4CE5F8 // 85 +data8 0x3FC02C64906512D2 // 86 +data8 0x3FC05AF1E63E03B4 // 87 +data8 0x3FC0804BEA723AA9 // 88 +data8 0x3FC0AF1FD6711527 // 89 +data8 0x3FC0D4B2A8805A00 // 90 +data8 0x3FC0FA5EF136A06C // 91 +data8 0x3FC1299A4FB3E306 // 92 +data8 0x3FC14F806253C3ED // 93 +data8 0x3FC175805D1587C1 // 94 +data8 0x3FC19B9A637CA295 // 95 +data8 0x3FC1CB5FC26EDE17 // 96 +data8 0x3FC1F1B4E65F2590 // 97 +data8 0x3FC218248B5DC3E5 // 98 +data8 0x3FC23EAED62ADC76 // 99 +data8 0x3FC26553EBD337BD // 100 +data8 0x3FC28C13F1B11900 // 101 +data8 0x3FC2BCAA14381386 // 102 +data8 0x3FC2E3A740B7800F // 103 +data8 0x3FC30ABFD8F333B6 // 104 +data8 0x3FC331F403985097 // 105 +data8 0x3FC35943E7A60690 // 106 +data8 0x3FC380AFAC6E7C07 // 107 +data8 0x3FC3A8377997B9E6 // 108 +data8 0x3FC3CFDB771C9ADB // 109 +data8 0x3FC3EDA90D39A5DF // 110 +data8 0x3FC4157EC09505CD // 111 +data8 0x3FC43D7113FB04C1 // 112 +data8 0x3FC4658030AD1CCF // 113 +data8 0x3FC48DAC404638F6 // 114 +data8 0x3FC4B5F56CBBB869 // 115 +data8 0x3FC4DE5BE05E7583 // 116 +data8 0x3FC4FCBC0776FD85 // 117 +data8 0x3FC525561E9256EE // 118 +data8 0x3FC54E0DF3198865 // 119 +data8 0x3FC56CAB7112BDE2 // 120 +data8 0x3FC59597BA735B15 // 121 +data8 0x3FC5BEA23A506FDA // 122 +data8 0x3FC5DD7E08DE382F // 123 +data8 0x3FC606BDD3F92355 // 124 +data8 0x3FC6301C518A501F // 125 +data8 0x3FC64F3770618916 // 126 +data8 0x3FC678CC14C1E2D8 // 127 +data8 0x3FC6981005ED2947 // 128 +data8 0x3FC6C1DB5F9BB336 // 129 +data8 0x3FC6E1488ECD2881 // 130 +data8 0x3FC70B4B2E7E41B9 // 131 +data8 0x3FC72AE209146BF9 // 132 +data8 0x3FC7551C81BD8DCF // 133 +data8 0x3FC774DD76CC43BE // 134 +data8 0x3FC79F505DB00E88 // 135 +data8 0x3FC7BF3BDE099F30 // 136 +data8 0x3FC7E9E7CAC437F9 // 137 +data8 0x3FC809FE4902D00D // 138 +data8 0x3FC82A2757995CBE // 139 +data8 
0x3FC85525C625E098 // 140 +data8 0x3FC8757A79831887 // 141 +data8 0x3FC895E2058D8E03 // 142 +data8 0x3FC8C13437695532 // 143 +data8 0x3FC8E1C812EF32BE // 144 +data8 0x3FC9026F112197E8 // 145 +data8 0x3FC923294888880B // 146 +data8 0x3FC94EEA4B8334F3 // 147 +data8 0x3FC96FD1B639FC09 // 148 +data8 0x3FC990CCA66229AC // 149 +data8 0x3FC9B1DB33334843 // 150 +data8 0x3FC9D2FD740E6607 // 151 +data8 0x3FC9FF49EEDCB553 // 152 +data8 0x3FCA209A84FBCFF8 // 153 +data8 0x3FCA41FF1E43F02B // 154 +data8 0x3FCA6377D2CE9378 // 155 +data8 0x3FCA8504BAE0D9F6 // 156 +data8 0x3FCAA6A5EEEBEFE3 // 157 +data8 0x3FCAC85B878D7879 // 158 +data8 0x3FCAEA259D8FFA0B // 159 +data8 0x3FCB0C0449EB4B6B // 160 +data8 0x3FCB2DF7A5C50299 // 161 +data8 0x3FCB4FFFCA70E4D1 // 162 +data8 0x3FCB721CD17157E3 // 163 +data8 0x3FCB944ED477D4ED // 164 +data8 0x3FCBB695ED655C7D // 165 +data8 0x3FCBD8F2364AEC0F // 166 +data8 0x3FCBFB63C969F4FF // 167 +data8 0x3FCC1DEAC134D4E9 // 168 +data8 0x3FCC4087384F4F80 // 169 +data8 0x3FCC6339498F09E2 // 170 +data8 0x3FCC86010FFC076C // 171 +data8 0x3FCC9D3D065C5B42 // 172 +data8 0x3FCCC029375BA07A // 173 +data8 0x3FCCE32B66978BA4 // 174 +data8 0x3FCD0643AFD51404 // 175 +data8 0x3FCD29722F0DEA45 // 176 +data8 0x3FCD4CB70070FE44 // 177 +data8 0x3FCD6446AB3F8C96 // 178 +data8 0x3FCD87B0EF71DB45 // 179 +data8 0x3FCDAB31D1FE99A7 // 180 +data8 0x3FCDCEC96FDC888F // 181 +data8 0x3FCDE6908876357A // 182 +data8 0x3FCE0A4E4A25C200 // 183 +data8 0x3FCE2E2315755E33 // 184 +data8 0x3FCE461322D1648A // 185 +data8 0x3FCE6A0E95C7787B // 186 +data8 0x3FCE8E216243DD60 // 187 +data8 0x3FCEA63AF26E007C // 188 +data8 0x3FCECA74ED15E0B7 // 189 +data8 0x3FCEEEC692CCD25A // 190 +data8 0x3FCF070A36B8D9C1 // 191 +data8 0x3FCF2B8393E34A2D // 192 +data8 0x3FCF5014EF538A5B // 193 +data8 0x3FCF68833AF1B180 // 194 +data8 0x3FCF8D3CD9F3F04F // 195 +data8 0x3FCFA5C61ADD93E9 // 196 +data8 0x3FCFCAA8567EBA7A // 197 +data8 0x3FCFE34CC8743DD8 // 198 +data8 0x3FD0042BFD74F519 // 199 +data8 0x3FD016BDF6A18017 
// 200 +data8 0x3FD023262F907322 // 201 +data8 0x3FD035CCED8D32A1 // 202 +data8 0x3FD042430E869FFC // 203 +data8 0x3FD04EBEC842B2E0 // 204 +data8 0x3FD06182E84FD4AC // 205 +data8 0x3FD06E0CB609D383 // 206 +data8 0x3FD080E60BEC8F12 // 207 +data8 0x3FD08D7E0D894735 // 208 +data8 0x3FD0A06CC96A2056 // 209 +data8 0x3FD0AD131F3B3C55 // 210 +data8 0x3FD0C01771E775FB // 211 +data8 0x3FD0CCCC3CAD6F4B // 212 +data8 0x3FD0D986D91A34A9 // 213 +data8 0x3FD0ECA9B8861A2D // 214 +data8 0x3FD0F972F87FF3D6 // 215 +data8 0x3FD106421CF0E5F7 // 216 +data8 0x3FD11983EBE28A9D // 217 +data8 0x3FD12661E35B785A // 218 +data8 0x3FD13345D2779D3B // 219 +data8 0x3FD146A6F597283A // 220 +data8 0x3FD15399E81EA83D // 221 +data8 0x3FD16092E5D3A9A6 // 222 +data8 0x3FD17413C3B7AB5E // 223 +data8 0x3FD1811BF629D6FB // 224 +data8 0x3FD18E2A47B46686 // 225 +data8 0x3FD19B3EBE1A4418 // 226 +data8 0x3FD1AEE9017CB450 // 227 +data8 0x3FD1BC0CED7134E2 // 228 +data8 0x3FD1C93712ABC7FF // 229 +data8 0x3FD1D66777147D3F // 230 +data8 0x3FD1EA3BD1286E1C // 231 +data8 0x3FD1F77BED932C4C // 232 +data8 0x3FD204C25E1B031F // 233 +data8 0x3FD2120F28CE69B1 // 234 +data8 0x3FD21F6253C48D01 // 235 +data8 0x3FD22CBBE51D60AA // 236 +data8 0x3FD240CE4C975444 // 237 +data8 0x3FD24E37F8ECDAE8 // 238 +data8 0x3FD25BA8215AF7FC // 239 +data8 0x3FD2691ECC29F042 // 240 +data8 0x3FD2769BFFAB2E00 // 241 +data8 0x3FD2841FC23952C9 // 242 +data8 0x3FD291AA1A384978 // 243 +data8 0x3FD29F3B0E15584B // 244 +data8 0x3FD2B3A0EE479DF7 // 245 +data8 0x3FD2C142842C09E6 // 246 +data8 0x3FD2CEEACCB7BD6D // 247 +data8 0x3FD2DC99CE82FF21 // 248 +data8 0x3FD2EA4F902FD7DA // 249 +data8 0x3FD2F80C186A25FD // 250 +data8 0x3FD305CF6DE7B0F7 // 251 +data8 0x3FD3139997683CE7 // 252 +data8 0x3FD3216A9BB59E7C // 253 +data8 0x3FD32F4281A3CEFF // 254 +data8 0x3FD33D2150110092 // 255 +LOCAL_OBJECT_END(log10f_data) + + +// Code +//============================================================== .section .text -.proc log10f# -.align 32 -log10f: -#ifdef _LIBC 
-.global __ieee754_log10f -.type __ieee754_log10f,@function -__ieee754_log10f: -#endif -{ .mfi - alloc r32=ar.pfs,1,15,4,0 - frcpa.s1 log_C,p9 = f1,f8 - cmp.eq.unc p7,p8 = r0, r0 -} -{ .mfb - addl log_AD_1 = @ltoff(log_table_1), gp - fnorm.s1 log_NORM_f8 = f8 - br.sptk L(LOG_LOG10_X) -} -;; - -.endp log10f -ASM_SIZE_DIRECTIVE(log10f) -ASM_SIZE_DIRECTIVE(__ieee754_log10f) - - - -.section .text -.proc logf# -.align 32 -logf: -#ifdef _LIBC -.global __ieee754_logf -.type __ieee754_logf,@function -__ieee754_logf: -#endif +// logf has p13 true, p14 false +// log10f has p14 true, p13 false +GLOBAL_IEEE754_ENTRY(log10f) { .mfi - alloc r32=ar.pfs,1,15,4,0 - frcpa.s1 log_C,p9 = f1,f8 - cmp.eq.unc p8,p7 = r0, r0 + getf.exp GR_Exp = f8 // if x is unorm then must recompute + frcpa.s1 FR_RcpX,p0 = f1,f8 + mov GR_05 = 0xFFFE // biased exponent of A2=0.5 } +{ .mlx + addl GR_ad_T = @ltoff(log10f_data),gp + movl GR_A3 = 0x3FD5555555555555 // double precision memory + // representation of A3 +};; { .mfi - addl log_AD_1 = @ltoff(log_table_1), gp - fnorm.s1 log_NORM_f8 = f8 - nop.i 999 -} -;; - -L(LOG_LOG10_X): - -{ .mfi - getf.exp log_GR_signexp_f8 = f8 // If x unorm then must recompute - fclass.m.unc p15,p0 = f8, 0x0b // Test for x=unorm - mov log_GR_fff7 = 0xfff7 + getf.sig GR_Sig = f8 // if x is unorm then must recompute + fclass.m p8,p0 = f8,9 // is x positive unorm? 
+ sub GR_025 = GR_05,r0,1 // biased exponent of A4=0.25 } +{ .mlx + ld8 GR_ad_T = [GR_ad_T] + movl GR_Ln2 = 0x3FD34413509F79FF // double precision memory + // representation of + // log(2)/ln(10) +};; { .mfi - ld8 log_AD_1 = [log_AD_1] - fms.s1 log_w = f8,f1,f1 - mov log_GR_exp_17_ones = 0x1ffff + setf.d FR_A3 = GR_A3 // create A3 + fcmp.eq.s1 p14,p13 = f0,f0 // set p14 to 1 for log10f + dep.z GR_xorg = GR_05,55,8 // 0x7F00000000000000 integer number + // bits of that are + // GR_xorg[63] = last bit of biased + // exponent of 255/256 + // GR_xorg[62-0] = bits from 62 to 0 + // of significand of 255/256 } -;; - -{ .mmi - getf.sig log_GR_significand_f8 = f8 // If x unorm then must recompute - mov log_GR_exp_16_ones = 0xffff - nop.i 999 -} -;; - -{ .mmb - adds log_AD_2 = 0x10, log_AD_1 - and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones -(p15) br.cond.spnt L(LOG_DENORM) -} -;; - -L(LOG_COMMON): -{.mfi - ldfpd log_P3,log_P2 = [log_AD_1],16 - fclass.m.unc p6,p0 = f8, 0xc3 // Test for x=nan - shl log_GR_index = log_GR_significand_f8,1 -} -{.mfi - sub log_GR_true_exp_f8 = log_GR_exp_f8, log_GR_exp_16_ones - nop.f 999 - nop.i 999 -} -;; - +{ .mib + setf.exp FR_A2 = GR_05 // create A2 + sub GR_de = GR_Exp,GR_05 // biased_exponent_of_x - 0xFFFE + // needed for comparison with 0.5 and 2.0 + br.cond.sptk logf_log10f_common +};; +GLOBAL_IEEE754_END(log10f) +GLOBAL_IEEE754_ENTRY(logf) { .mfi - ldfpd log_P1,log_inv_ln10 = [log_AD_2],16 - fclass.m.unc p11,p0 = f8, 0x21 // Test for x=+inf - shr.u log_GR_index = log_GR_index,56 + getf.exp GR_Exp = f8 // if x is unorm then must recompute + frcpa.s1 FR_RcpX,p0 = f1,f8 + mov GR_05 = 0xFFFE // biased exponent of A2=-0.5 } +{ .mlx + addl GR_ad_T = @ltoff(logf_data),gp + movl GR_A3 = 0x3FD5555555555555 // double precision memory + // representation of A3 +};; { .mfi - setf.sig log_int_Nfloat = log_GR_true_exp_f8 - nop.f 999 - nop.i 999 + getf.sig GR_Sig = f8 // if x is unorm then must recompute + fclass.m p8,p0 = f8,9 // is x 
positive unorm? + dep.z GR_xorg = GR_05,55,8 // 0x7F00000000000000 integer number + // bits of that are + // GR_xorg[63] = last bit of biased + // exponent of 255/256 + // GR_xorg[62-0] = bits from 62 to 0 + // of significand of 255/256 } -;; - - { .mfi - ldfd log_log2 = [log_AD_2],16 - fma.s1 log_wsq = log_w, log_w, f0 - nop.i 999 -} -{ .mfb - nop.m 999 -(p6) fma.s.s0 f8 = f8,f1,f0 // quietize nan result if x=nan -(p6) br.ret.spnt b0 // Exit for x=nan -} -;; - - + ld8 GR_ad_T = [GR_ad_T] + nop.f 0 + sub GR_025 = GR_05,r0,1 // biased exponent of A4=0.25 +};; { .mfi - shladd log_AD_2 = log_GR_index,3,log_AD_2 - fcmp.eq.s1 p10,p0 = log_NORM_f8, f1 // Test for x=+1.0 - nop.i 999 + setf.d FR_A3 = GR_A3 // create A3 + fcmp.eq.s1 p13,p14 = f0,f0 // p13 - true for logf + sub GR_de = GR_Exp,GR_05 // biased_exponent_of_x - 0xFFFE + // needed for comparison with 0.5 and 2.0 } -{ .mfb - nop.m 999 - fms.s1 log_r = log_C,f8,f1 -(p11) br.ret.spnt b0 // Exit for x=+inf -} -;; - - -{ .mmf - nop.m 999 - nop.m 999 - fclass.m.unc p6,p0 = f8, 0x07 // Test for x=0 -} -;; - - -{ .mfb - ldfd log_T = [log_AD_2] -(p10) fmerge.s f8 = f0, f0 -(p10) br.ret.spnt b0 // Exit for x=1.0 -;; -} - +{ .mlx + setf.exp FR_A2 = GR_05 // create A2 + movl GR_Ln2 = 0x3FE62E42FEFA39EF // double precision memory + // representation of log(2) +};; +logf_log10f_common: { .mfi - getf.exp log_GR_signexp_w = log_w - fclass.m.unc p12,p0 = f8, 0x3a // Test for x neg norm, unorm, inf - nop.i 999 -} -;; - -{ .mmb - nop.m 999 - nop.m 999 -(p6) br.cond.spnt L(LOG_ZERO_NEG) // Branch if x=0 -;; + setf.exp FR_A4 = GR_025 // create A4=0.25 + fclass.m p9,p0 = f8,0x3A // is x < 0 (including negative unnormals)? + dep GR_x = GR_Exp,GR_Sig,63,1 // produce integer that bits are + // GR_x[63] = GR_Exp[0] + // GR_x[62-0] = GR_Sig[62-0] } - - +{ .mib + sub GR_N = GR_Exp,GR_05,1 // unbiased exponent of x + cmp.gtu p6,p7 = 2,GR_de // is 0.5 <= x < 2.0? 
+(p8) br.cond.spnt logf_positive_unorm +};; +logf_core: { .mfi - and log_GR_exp_w = log_GR_exp_17_ones, log_GR_signexp_w - nop.f 999 - nop.i 999 + setf.sig FR_N = GR_N // copy unbiased exponent of x to the + // significand field of FR_N + fclass.m p10,p0 = f8,0x1E1 // is x NaN, NaT or +Inf? + dep.z GR_dx = GR_05,54,3 // 0x0180000000000000 - difference + // between our integer representations + // of 257/256 and 255/256 } -{ .mfb - nop.m 999 - fma.s1 log_rsq = log_r, log_r, f0 -(p12) br.cond.spnt L(LOG_ZERO_NEG) // Branch if x<0 -;; -} - { .mfi - nop.m 999 - fma.s1 log_rp_p32 = log_P3, log_r, log_P2 - nop.i 999 -} + nop.m 0 + nop.f 0 + sub GR_x = GR_x,GR_xorg // difference between representations + // of x and 255/256 +};; { .mfi - nop.m 999 - fma.s1 log_rp_q32 = log_P3, log_w, log_P2 - nop.i 999 -;; + ldfd FR_InvLn10 = [GR_ad_T],8 + fcmp.eq.s1 p11,p0 = f8,f1 // is x equal to 1.0? + extr.u GR_Ind = GR_Sig,55,8 // get bits from 55 to 62 as index } - +{ .mib + setf.d FR_Ln2 = GR_Ln2 // create log(2) or log10(2) +(p6) cmp.gtu p6,p7 = GR_dx,GR_x // set p6 if 255/256 <= x < 257/256 +(p9) br.cond.spnt logf_negatives // jump if input argument is negative number +};; +// p6 is true if |x-1| < 1/256 +// p7 is true if |x-1| >= 1/256 +.pred.rel "mutex",p6,p7 { .mfi - nop.m 999 - fcvt.xf log_Nfloat = log_int_Nfloat - nop.i 999 ;; + shladd GR_ad_T = GR_Ind,3,GR_ad_T // calculate address of T +(p7) fms.s1 FR_r = FR_RcpX,f8,f1 // range reduction for |x-1|>=1/256 + extr.u GR_Exp = GR_Exp,0,17 // exponent without sign } - +{ .mfb + nop.m 0 +(p6) fms.s1 FR_r = f8,f1,f1 // range reduction for |x-1|<1/256 +(p10) br.cond.spnt logf_nan_nat_pinf // exit for NaN, NaT or +Inf +};; +{ .mfb + ldfd FR_T = [GR_ad_T] // load T +(p11) fma.s.s0 f8 = f0,f0,f0 +(p11) br.ret.spnt b0 // exit for x = 1.0 +};; +{ .mib + nop.m 0 + cmp.eq p12,p0 = r0,GR_Exp // is x +/-0? 
(here it's quite enough + // only to compare exponent with 0 + // because all unnormals already + // have been filtered) +(p12) br.cond.spnt logf_zeroes // Branch if input argument is +/-0 +};; { .mfi - nop.m 999 - fma.s1 log_rp_p10 = log_P1, log_r, f1 - nop.i 999 + nop.m 0 + fnma.s1 FR_A2 = FR_A2,FR_r,f1 // A2*r+1 + nop.i 0 } { .mfi - nop.m 999 - fma.s1 log_rp_q10 = log_P1, log_w, f1 - nop.i 999 -;; -} - -// p13 <== large w log -// p14 <== small w log + nop.m 0 + fma.s1 FR_r2 = FR_r,FR_r,f0 // r^2 + nop.i 0 +};; { .mfi -(p8) cmp.ge.unc p13,p14 = log_GR_exp_w, log_GR_fff7 - fcmp.eq.s0 p6,p0 = f8,f0 // Sets flag on +denormal input - nop.i 999 -;; + nop.m 0 + fcvt.xf FR_N = FR_N // convert integer N in significand of FR_N + // to floating-point representation + nop.i 0 } - -// p10 <== large w log10 -// p11 <== small w log10 { .mfi -(p7) cmp.ge.unc p10,p11 = log_GR_exp_w, log_GR_fff7 - nop.f 999 - nop.i 999 ;; -} - + nop.m 0 + fnma.s1 FR_A3 = FR_A4,FR_r,FR_A3 // A4*r+A3 + nop.i 0 +};; { .mfi - nop.m 999 - fma.s1 log_T_plus_Nlog2 = log_Nfloat,log_log2, log_T - nop.i 999 ;; + nop.m 0 + fma.s1 FR_r = FR_r,FR_InvLn10,f0 // For log10f we have r/log(10) + nop.i 0 } - - { .mfi - nop.m 999 - fma.s1 log_rp_p2 = log_rp_p32, log_rsq, log_rp_p10 - nop.i 999 -} + nop.m 0 + nop.f 0 + nop.i 0 +};; { .mfi - nop.m 999 - fma.s1 log_rp_q2 = log_rp_q32, log_wsq, log_rp_q10 - nop.i 999 -;; + nop.m 0 + fma.s1 FR_A2 = FR_A3,FR_r2,FR_A2 // (A4*r+A3)*r^2+(A2*r+1) + nop.i 0 } - - -// small w, log <== p14 { .mfi - nop.m 999 -(p14) fma.s f8 = log_rp_q2, log_w, f0 - nop.i 999 -} + nop.m 0 + fma.s1 FR_NxLn2pT = FR_N,FR_Ln2,FR_T // N*Ln2+T + nop.i 0 +};; +.pred.rel "mutex",p6,p7 { .mfi - nop.m 999 -(p11) fma.s1 log_Q = log_rp_q2, log_w, f0 - nop.i 999 ;; + nop.m 0 +(p7) fma.s.s0 f8 = FR_A2,FR_r,FR_NxLn2pT // result for |x-1|>=1/256 + nop.i 0 } +{ .mfb + nop.m 0 +(p6) fma.s.s0 f8 = FR_A2,FR_r,f0 // result for |x-1|<1/256 + br.ret.sptk b0 +};; - -// large w, log <== p13 -.pred.rel "mutex",p13,p10 
+.align 32 +logf_positive_unorm: { .mfi - nop.m 999 -(p13) fma.s f8 = log_rp_p2, log_r, log_T_plus_Nlog2 - nop.i 999 -} + nop.m 0 +(p8) fma.s0 f8 = f8,f1,f0 // Normalize & set D-flag + nop.i 0 +};; { .mfi - nop.m 999 -(p10) fma.s1 log_Q = log_rp_p2, log_r, log_T_plus_Nlog2 - nop.i 999 ;; -} - - -// log10 -{ .mfb - nop.m 999 -(p7) fma.s f8 = log_inv_ln10,log_Q,f0 - br.ret.sptk b0 -;; -} - - -L(LOG_DENORM): -{ .mmi - getf.exp log_GR_signexp_f8 = log_NORM_f8 - nop.m 999 - nop.i 999 -} -;; -{ .mmb - getf.sig log_GR_significand_f8 = log_NORM_f8 - and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones - br.cond.sptk L(LOG_COMMON) -} -;; - -L(LOG_ZERO_NEG): - -// qnan snan inf norm unorm 0 -+ -// 0 0 0 0 0 1 11 0x7 -// 0 0 1 1 1 0 10 0x3a - -// Save x (f8) in f10 + getf.exp GR_Exp = f8 // recompute biased exponent + nop.f 0 + cmp.ne p6,p7 = r0,r0 // p6 <- 0, p7 <- 1 because + // in case of unorm we are out + // interval [255/256; 257/256] +};; { .mfi - nop.m 999 - fmerge.s f10 = f8,f8 - nop.i 999 ;; -} - -// p8 p9 means ln(+-0) = -inf -// p7 p10 means log(+-0) = -inf - -// p13 means ln(-) -// p14 means log(-) - + getf.sig GR_Sig = f8 // recompute significand + nop.f 0 + nop.i 0 +};; +{ .mib + sub GR_N = GR_Exp,GR_05,1 // unbiased exponent N + nop.i 0 + br.cond.sptk logf_core // return into main path +};; +.align 32 +logf_nan_nat_pinf: { .mfi - nop.m 999 - fmerge.ns f6 = f1,f1 // Form -1.0 - nop.i 999 ;; + nop.m 0 + fma.s.s0 f8 = f8,f1,f0 // set V-flag + nop.i 0 } +{ .mfb + nop.m 0 + nop.f 0 + br.ret.sptk b0 // exit for NaN, NaT or +Inf +};; -// p9 means ln(+-0) = -inf -// p10 means log(+-0) = -inf -// Log(+-0) = -inf - -{ .mfi - nop.m 999 -(p8) fclass.m.unc p9,p0 = f10, 0x07 - nop.i 999 -} +.align 32 +logf_zeroes: { .mfi - nop.m 999 -(p7) fclass.m.unc p10,p0 = f10, 0x07 - nop.i 999 ;; + nop.m 0 + fmerge.s FR_X = f8,f8 // keep input argument for subsequent + // call of __libm_error_support# + nop.i 0 } - - -// p13 ln(-) -// p14 log(-) - -// Log(-inf, -normal, -unnormal) 
= QNAN indefinite { .mfi - nop.m 999 -(p8) fclass.m.unc p13,p0 = f10, 0x3a - nop.i 999 -} +(p13) mov GR_TAG = 4 // set libm error in case of logf + fms.s1 FR_tmp = f0,f0,f1 // -1.0 + nop.i 0 +};; { .mfi - nop.m 999 -(p7) fclass.m.unc p14,p0 = f10, 0x3a - nop.i 999 ;; + nop.m 0 + frcpa.s0 f8,p0 = FR_tmp,f0 // log(+/-0) should be equal to -INF. + // We can get it using frcpa because it + // sets result to the IEEE-754 mandated + // quotient of FR_tmp/f0. + // As far as FR_tmp is -1 it'll be -INF + nop.i 0 } +{ .mib +(p14) mov GR_TAG = 10 // set libm error in case of log10f + nop.i 0 + br.cond.sptk logf_libm_err +};; - -.pred.rel "mutex",p9,p10 -{ .mfi -(p9) mov log_GR_tag = 4 -(p9) frcpa f8,p11 = f6,f0 - nop.i 999 -} +.align 32 +logf_negatives: { .mfi -(p10) mov log_GR_tag = 10 -(p10) frcpa f8,p12 = f6,f0 - nop.i 999 ;; -} - -.pred.rel "mutex",p13,p14 +(p13) mov GR_TAG = 5 // set libm error in case of logf + fmerge.s FR_X = f8,f8 // keep input argument for subsequent + // call of __libm_error_support# + nop.i 0 +};; { .mfi -(p13) mov log_GR_tag = 5 -(p13) frcpa f8,p11 = f0,f0 - nop.i 999 -} -{ .mfb -(p14) mov log_GR_tag = 11 -(p14) frcpa f8,p12 = f0,f0 - br.cond.sptk __libm_error_region ;; -} -.endp logf -ASM_SIZE_DIRECTIVE(logf) -ASM_SIZE_DIRECTIVE(__ieee754_logf) +(p14) mov GR_TAG = 11 // set libm error in case of log10f + frcpa.s0 f8,p0 = f0,f0 // log(negatives) should be equal to NaN. + // We can get it using frcpa because it + // sets result to the IEEE-754 mandated + // quotient of f0/f0 i.e. NaN. + nop.i 0 +};; +.align 32 +logf_libm_err: +{ .mmi + alloc r32 = ar.pfs,1,4,4,0 + mov GR_Parameter_TAG = GR_TAG + nop.i 0 +};; +GLOBAL_IEEE754_END(logf) // Stack operations when calling error support. 
// (1) (2) (3) (call) (4) @@ -890,70 +1101,56 @@ ASM_SIZE_DIRECTIVE(__ieee754_logf) // save ar.pfs save b0 restore gp // save gp restore ar.pfs - - -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue - -// (1) { .mfi - add GR_Parameter_Y=-32,sp // Parameter 2 value - nop.f 0 -.save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack - nop.f 0 - mov GR_SAVE_GP=gp // Save gp + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp };; - - -// (2) { .mmi - stfs [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + stfs [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; - .body -// (3) { .mib - stfs [GR_Parameter_X] = f10 // STORE Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address - nop.b 0 + stfs [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 } { .mib - stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack - add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + stfs [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function };; - { .mmi - nop.m 0 - nop.m 0 - add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp };; - -// (4) { .mmi - ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp - add sp = 64,sp // 
Restore stack pointer - mov b0 = GR_SAVE_B0 // Restore return address + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address };; { .mib - mov gp = GR_SAVE_GP // Restore gp - mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - br.ret.sptk b0 // Return + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) - +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# + diff --git a/sysdeps/ia64/fpu/e_logl.c b/sysdeps/ia64/fpu/e_logl.c deleted file mode 100644 index 41254ae60a..0000000000 --- a/sysdeps/ia64/fpu/e_logl.c +++ /dev/null @@ -1 +0,0 @@ -/* Not needed. */ diff --git a/sysdeps/ia64/fpu/e_pow.S b/sysdeps/ia64/fpu/e_pow.S index 56f7f078ba..11fae53d72 100644 --- a/sysdeps/ia64/fpu/e_pow.S +++ b/sysdeps/ia64/fpu/e_pow.S @@ -1,10 +1,10 @@ .file "pow.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,30 +35,41 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial version -// 2/03/00 Added p12 to definite over/under path. With odd power we did not +// 02/02/00 Initial version +// 02/03/00 Added p12 to definite over/under path. With odd power we did not // maintain the sign of x in this path. -// 4/04/00 Unwind support added -// 4/19/00 pow(+-1,inf) now returns NaN -// pow(+-val, +-inf) returns 0 or inf, but now does not call error support +// 04/04/00 Unwind support added +// 04/19/00 pow(+-1,inf) now returns NaN +// pow(+-val, +-inf) returns 0 or inf, but now does not call error +// support // Added s1 to fcvt.fx because invalid flag was incorrectly set. -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. -// 9/07/00 Improved performance by eliminating bank conflicts and other stalls, +// 09/07/00 Improved performance by eliminating bank conflicts and other stalls, // and tweaking the critical path -// 9/08/00 Per c99, pow(+-1,inf) now returns 1, and pow(+1,nan) returns 1 -// 9/28/00 Updated NaN**0 path -// 1/20/01 Fixed denormal flag settings. -// 2/12/01 Improved speed. +// 09/08/00 Per c99, pow(+-1,inf) now returns 1, and pow(+1,nan) returns 1 +// 09/28/00 Updated NaN**0 path +// 01/20/01 Fixed denormal flag settings. +// 02/13/01 Improved speed. 
+// 03/19/01 Reordered exp polynomial to improve speed and eliminate monotonicity +// problem in round up, down, and to zero modes. Also corrected +// overflow result when x negative, y odd in round up, down, zero. +// 06/14/01 Added brace missing from bundle +// 12/10/01 Corrected case where x negative, 2^52 <= |y| < 2^53, y odd integer. +// 12/20/01 Fixed monotonity problem in round to nearest. +// 02/08/02 Fixed overflow/underflow cases that were not calling error support. +// 05/20/02 Cleaned up namespace and sf0 syntax +// 08/29/02 Improved Itanium 2 performance +// 09/21/02 Added branch for |y*log(x)|<2^-11 to fix monotonicity problems. +// 02/10/03 Reordered header: .section, .global, .proc, .align // // API //============================================================== -// double pow(double) -// float powf(float) +// double pow(double x, double y) // // Overview of operation //============================================================== @@ -67,51 +78,51 @@ // 1. Log(x) // 2. y Log(x) // 3. exp(y log(x)) -// +// // This means we work with the absolute value of x and merge in the sign later. // Log(x) = G + delta + r -rsq/2 + p // G,delta depend on the exponent of x and table entries. The table entries are // indexed by the exponent of x, called K. -// +// // The G and delta come out of the reduction; r is the reduced x. -// +// // B = frcpa(x) // xB-1 is small means that B is the approximate inverse of x. -// +// // Log(x) = Log( (1/B)(Bx) ) // = Log(1/B) + Log(Bx) // = Log(1/B) + Log( 1 + (Bx-1)) -// +// // x = 2^K 1.x_1x_2.....x_52 -// B= frcpa(x) = 2^-k Cm +// B= frcpa(x) = 2^-k Cm // Log(1/B) = Log(1/(2^-K Cm)) // Log(1/B) = Log((2^K/ Cm)) // Log(1/B) = K Log(2) + Log(1/Cm) -// +// // Log(x) = K Log(2) + Log(1/Cm) + Log( 1 + (Bx-1)) -// +// // If you take the significand of x, set the exponent to true 0, then Cm is // the frcpa. We tabulate the Log(1/Cm) values. There are 256 of them. // The frcpa table is indexed by 8 bits, the x_1 thru x_8. 
// m = x_1x_2...x_8 is an 8-bit index. -// +// // Log(1/Cm) = log(1/frcpa(1+m/256)) where m goes from 0 to 255. -// +// // We tabluate as two doubles, T and t, where T +t is the value itself. -// +// // Log(x) = (K Log(2)_hi + T) + (Log(2)_hi + t) + Log( 1 + (Bx-1)) // Log(x) = G + delta + Log( 1 + (Bx-1)) -// +// // The Log( 1 + (Bx-1)) can be calculated as a series in r = Bx-1. -// +// // Log( 1 + (Bx-1)) = r - rsq/2 + p -// +// // Then, -// +// // yLog(x) = yG + y delta + y(r-rsq/2) + yp // yLog(x) = Z1 + e3 + Z2 + Z3 + (e2 + e3) -// -// +// +// // exp(yLog(x)) = exp(Z1 + Z2 + Z3) exp(e1 + e2 + e3) // // @@ -133,7 +144,7 @@ // exp(r) = exp(Z - N log2/128) // // r = s + d = (Z - N (log2/128)_hi) -N (log2/128)_lo -// = Z - N (log2/128) +// = Z - N (log2/128) // // Z = s+d +N (log2/128) // @@ -149,22 +160,22 @@ // n log2/128 = n_7n_6n_5 log2/8 + n_4n_3n_2n_1 log2/128 // n log2/128 = I2 log2/8 + I1 log2/128 // -// N log2/128 = M log2 + I2 log2/8 + I1 log2/128 +// N log2/128 = M log2 + I2 log2/8 + I1 log2/128 // // exp(Z) = exp(s) (1+d) exp(log(2^M) + log(2^I2/8) + log(2^I1/128)) // exp(Z) = exp(s) (1+d1) (1+d2)(2^M) 2^I2/8 2^I1/128 // exp(Z) = exp(s) f1 f2 (2^M) 2^I2/8 2^I1/128 // // I1, I2 are table indices. Use a series for exp(s). -// Then get exp(Z) +// Then get exp(Z) // // exp(yLog(x)) = exp(Z1 + Z2 + Z3) exp(e1 + e2 + e3) -// exp(yLog(x)) = exp(Z) exp(Z3) f3 -// exp(yLog(x)) = exp(Z)f3 exp(Z3) -// exp(yLog(x)) = A exp(Z3) +// exp(yLog(x)) = exp(Z) exp(Z3) f3 +// exp(yLog(x)) = exp(Z)f3 exp(Z3) +// exp(yLog(x)) = A exp(Z3) // // We actually calculate exp(Z3) -1. -// Then, +// Then, // exp(yLog(x)) = A + A( exp(Z3) -1) // @@ -175,142 +186,146 @@ // ============== // The operation (K*log2_hi) must be exact. K is the true exponent of x. // If we allow gradual underflow (denormals), K can be represented in 12 bits -// (as a two's complement number). We assume 13 bits as an engineering precaution. -// +// (as a two's complement number). 
We assume 13 bits as an engineering +// precaution. +// // +------------+----------------+-+ // | 13 bits | 50 bits | | // +------------+----------------+-+ // 0 1 66 // 2 34 -// +// // So we want the lsb(log2_hi) to be 2^-50 // We get log2 as a quad-extended (15-bit exponent, 128-bit significand) -// +// // 0 fffe b17217f7d1cf79ab c9e3b39803f2f6af (4...) -// +// // Consider numbering the bits left to right, starting at 0 thru 127. // Bit 0 is the 2^-1 bit; bit 49 is the 2^-50 bit. -// +// // ...79ab // 0111 1001 1010 1011 // 44 // 89 -// -// So if we shift off the rightmost 14 bits, then (shift back only +// +// So if we shift off the rightmost 14 bits, then (shift back only // the top half) we get -// +// // 0 fffe b17217f7d1cf4000 e6af278ece600fcb dabc000000000000 -// +// // Put the right 64-bit signficand in an FR register, convert to double; // it is exact. Put the next 128 bits into a quad register and round to double. // The true exponent of the low part is -51. -// +// // hi is 0 fffe b17217f7d1cf4000 // lo is 0 ffcc e6af278ece601000 -// +// // Convert to double memory format and get -// +// // hi is 0x3fe62e42fefa39e8 -// lo is 0x3cccd5e4f1d9cc02 -// +// lo is 0x3cccd5e4f1d9cc02 +// // log2_hi + log2_lo is an accurate value for log2. -// -// +// +// // The T and t values // ================== // A similar method is used to generate the T and t values. -// +// // K * log2_hi + T must be exact. -// +// // Smallest T,t // ---------- -// The smallest T,t is +// The smallest T,t is // T t -// data8 0x3f60040155d58800, 0x3c93bce0ce3ddd81 log(1/frcpa(1+0/256))= +1.95503e-003 -// +// 0x3f60040155d58800, 0x3c93bce0ce3ddd81 log(1/frcpa(1+0/256))= +1.95503e-003 +// // The exponent is 0x3f6 (biased) or -9 (true). // For the smallest T value, what we want is to clip the significand such that -// when it is shifted right by 9, its lsb is in the bit for 2^-51. The 9 is the specific -// for the first entry. In general, it is 0xffff - (biased 15-bit exponent). 
+// when it is shifted right by 9, its lsb is in the bit for 2^-51. The 9 is the +// specific for the first entry. In general, it is 0xffff - (biased 15-bit +// exponent). -// Independently, what we have calculated is the table value as a quad precision number. +// Independently, what we have calculated is the table value as a quad +// precision number. // Table entry 1 is // 0 fff6 80200aaeac44ef38 338f77605fdf8000 -// +// // We store this quad precision number in a data structure that is -// sign: 1 +// sign: 1 // exponent: 15 // signficand_hi: 64 (includes explicit bit) // signficand_lo: 49 // Because the explicit bit is included, the significand is 113 bits. -// +// // Consider significand_hi for table entry 1. -// -// +// +// // +-+--- ... -------+--------------------+ // | | // +-+--- ... -------+--------------------+ // 0 1 4444444455555555556666 // 2345678901234567890123 -// +// // Labeled as above, bit 0 is 2^0, bit 1 is 2^-1, etc. // Bit 42 is 2^-42. If we shift to the right by 9, the bit in // bit 42 goes in 51. -// +// // So what we want to do is shift bits 43 thru 63 into significand_lo. -// This is shifting bit 42 into bit 63, taking care to retain the shifted-off bits. -// Then shifting (just with signficaand_hi) back into bit 42. -// -// The shift_value is 63-42 = 21. In general, this is +// This is shifting bit 42 into bit 63, taking care to retain shifted-off bits. +// Then shifting (just with signficaand_hi) back into bit 42. +// +// The shift_value is 63-42 = 21. In general, this is // 63 - (51 -(0xffff - 0xfff6)) // For this example, it is // 63 - (51 - 9) = 63 - 42 = 21 -// -// This means we are shifting 21 bits into significand_lo. We must maintain more -// that a 128-bit signficand not to lose bits. So before the shift we put the 128-bit -// significand into a 256-bit signficand and then shift. +// +// This means we are shifting 21 bits into significand_lo. We must maintain more +// that a 128-bit signficand not to lose bits. 
So before the shift we put the +// 128-bit significand into a 256-bit signficand and then shift. // The 256-bit significand has four parts: hh, hl, lh, and ll. -// +// // Start off with // hh hl lh ll // <64> <49><15_0> <64_0> <64_0> -// +// // After shift by 21 (then return for significand_hi), // <43><21_0> <21><43> <6><58_0> <64_0> -// +// // Take the hh part and convert to a double. There is no rounding here. -// The conversion is exact. The true exponent of the high part is the same as the -// true exponent of the input quad. -// -// We have some 64 plus significand bits for the low part. In this example, we have -// 70 bits. We want to round this to a double. Put them in a quad and then do a quad fnorm. -// For this example the true exponent of the low part is +// The conversion is exact. The true exponent of the high part is the same as +// the true exponent of the input quad. +// +// We have some 64 plus significand bits for the low part. In this example, we +// have 70 bits. We want to round this to a double. Put them in a quad and then +// do a quad fnorm. 
+// For this example the true exponent of the low part is // true_exponent_of_high - 43 = true_exponent_of_high - (64-21) -// In general, this is -// true_exponent_of_high - (64 - shift_value) -// -// +// In general, this is +// true_exponent_of_high - (64 - shift_value) +// +// // Largest T,t // ---------- // The largest T,t is -// data8 0x3fe62643fecf9742, 0x3c9e3147684bd37d log(1/frcpa(1+255/256))= +6.92171e-001 -// +// 0x3fe62643fecf9742, 0x3c9e3147684bd37d log(1/frcpa(1+255/256))=+6.92171e-001 +// // Table entry 256 is // 0 fffe b1321ff67cba178c 51da12f4df5a0000 -// -// The shift value is +// +// The shift value is // 63 - (51 -(0xffff - 0xfffe)) = 13 -// -// The true exponent of the low part is +// +// The true exponent of the low part is // true_exponent_of_high - (64 - shift_value) // -1 - (64-13) = -52 // Biased as a double, this is 0x3cb -// -// -// +// +// +// // So then lsb(T) must be >= 2^-51 // msb(Klog2_hi) <= 2^12 -// +// // +--------+---------+ // | 51 bits | <== largest T // +--------+---------+ @@ -320,7 +335,6 @@ // +------------+----------------+-+ - // Special Cases //============================================================== @@ -385,63 +399,67 @@ // X any Y =0 +1 -#include "libm_support.h" - // Assembly macros //============================================================== // integer registers used -pow_AD_Tt = r33 -pow_GR_FFF7 = r34 -pow_GR_exp_Y = r34 // duplicate -pow_GR_17ones = r35 - -pow_AD_P = r36 -pow_AD_Q = r37 -pow_AD_tbl1 = r38 -pow_AD_tbl2 = r39 -pow_GR_exp_X = r40 -pow_GR_true_exp_X = r40 // duplicate - -pow_GR_offset = r41 -pow_GR_exp_Xm1 = r42 -pow_GR_sig_X = r43 -pow_GR_signexp_X = r44 - -pow_GR_signexp_Xm1 = r46 -pow_GR_int_W1 = r47 -pow_GR_int_W2 = r48 -pow_GR_int_N = r49 -pow_GR_index1 = r50 - -pow_GR_index2 = r51 -pow_AD_T1 = r52 -pow_AD_T2 = r53 -pow_GR_gt_ln = r53 // duplicate -pow_int_GR_M = r54 -pow_GR_10033 = r55 - -pow_GR_16ones = r56 -pow_GR_sig_int_Y = r57 -pow_GR_sign_Y_Gpr = r58 -pow_GR_17ones_m1 = r59 
-pow_GR_one = r60 -pow_GR_sign_Y = r60 - -pow_GR_signexp_Y_Gpr = r61 -pow_GR_exp_Y_Gpr = r62 -pow_GR_true_exp_Y_Gpr = r63 -pow_GR_signexp_Y = r64 - -GR_SAVE_B0 = r65 -GR_SAVE_GP = r66 -GR_SAVE_PFS = r67 - -GR_Parameter_X = r68 -GR_Parameter_Y = r69 -GR_Parameter_RESULT = r70 -pow_GR_tag = r71 +pow_GR_signexp_X = r14 +pow_GR_17ones = r15 +pow_AD_P = r16 +pow_GR_exp_2tom8 = r17 +pow_GR_sig_X = r18 +pow_GR_10033 = r19 +pow_GR_16ones = r20 + +pow_AD_Tt = r21 +pow_GR_exp_X = r22 +pow_AD_Q = r23 +pow_GR_true_exp_X = r24 +pow_GR_y_zero = r25 + +pow_GR_exp_Y = r26 +pow_AD_tbl1 = r27 +pow_AD_tbl2 = r28 +pow_GR_offset = r29 +pow_GR_exp_Xm1 = r30 +pow_GR_xneg_yodd = r31 + +pow_GR_signexp_Xm1 = r35 +pow_GR_int_W1 = r36 +pow_GR_int_W2 = r37 +pow_GR_int_N = r38 +pow_GR_index1 = r39 +pow_GR_index2 = r40 + +pow_AD_T1 = r41 +pow_AD_T2 = r42 +pow_int_GR_M = r43 +pow_GR_sig_int_Y = r44 +pow_GR_sign_Y_Gpr = r45 + +pow_GR_17ones_m1 = r46 +pow_GR_one = r47 +pow_GR_sign_Y = r48 +pow_GR_signexp_Y_Gpr = r49 +pow_GR_exp_Y_Gpr = r50 + +pow_GR_true_exp_Y_Gpr = r51 +pow_GR_signexp_Y = r52 +pow_GR_x_one = r53 +pow_GR_exp_2toM63 = r54 +pow_GR_big_pos = r55 + +pow_GR_big_neg = r56 + +GR_SAVE_B0 = r50 +GR_SAVE_GP = r51 +GR_SAVE_PFS = r52 + +GR_Parameter_X = r53 +GR_Parameter_Y = r54 +GR_Parameter_RESULT = r55 +pow_GR_tag = r56 // floating point registers used @@ -464,7 +482,8 @@ POW_log2_lo = f43 POW_r = f44 POW_Q0_half = f45 -POW_Q1 = f46 +POW_Q1 = f46 +POW_tmp = f47 POW_log2_hi = f48 POW_Q4 = f49 POW_P1 = f50 @@ -476,6 +495,7 @@ POW_Yrcub = f54 POW_log2_by_128_lo = f55 POW_v6 = f56 +POW_xsq = f57 POW_v4 = f58 POW_v2 = f59 POW_T = f60 @@ -484,6 +504,7 @@ POW_Tt = f61 POW_RSHF = f62 POW_v21ps = f63 POW_s4 = f64 +POW_twoV = f65 POW_U = f66 POW_G = f67 @@ -533,44 +554,45 @@ POW_1ps = f103 POW_A = f104 POW_es = f105 +POW_Xp1 = f106 POW_int_K = f107 POW_K = f108 POW_f123 = f109 POW_Gpr = f110 -POW_Y_Gpr = f111 +POW_Y_Gpr = f111 POW_int_Y = f112 +POW_abs_q = f114 +POW_2toM63 = f115 POW_float_int_Y = 
f116 POW_ftz_urm_f8 = f117 POW_wre_urm_f8 = f118 -POW_abs_A = f119 -POW_gt_pln = f120 +POW_big_neg = f119 +POW_big_pos = f120 -POW_xsq = f121 - -POW_twoV = f122 -POW_Xp1 = f123 +POW_GY_Z2 = f121 +POW_pYrcub_e3 = f122 +POW_d = f123 +POW_d2 = f124 +POW_poly_d_hi = f121 +POW_poly_d_lo = f122 +POW_poly_d = f121 // Data tables //============================================================== -#ifdef _LIBC -.rodata -#else -.data -#endif +RODATA .align 16 -pow_table_P: -ASM_TYPE_DIRECTIVE(pow_table_P,@object) +LOCAL_OBJECT_START(pow_table_P) data8 0x8000F7B249FF332D, 0x0000BFFC // P_5 data8 0xAAAAAAA9E7902C7F, 0x0000BFFC // P_3 data8 0x80000000000018E5, 0x0000BFFD // P_1 data8 0xb8aa3b295c17f0bc, 0x00004006 // inv_ln2_by_128 - - +// +// data8 0x3FA5555555554A9E // Q_2 data8 0x3F8111124F4DD9F9 // Q_3 data8 0x3FE0000000000000 // Q_0 @@ -580,20 +602,18 @@ data8 0x43e8000000000000 // Right shift constant for exp data8 0xc9e3b39803f2f6af, 0x00003fb7 // ln2_by_128_lo data8 0x0000000000000000 // pad to eliminate bank conflicts with pow_table_Q data8 0x0000000000000000 // pad to eliminate bank conflicts with pow_table_Q -ASM_SIZE_DIRECTIVE(pow_table_P) +LOCAL_OBJECT_END(pow_table_P) -pow_table_Q: -ASM_TYPE_DIRECTIVE(pow_table_Q,@object) +LOCAL_OBJECT_START(pow_table_Q) data8 0x9249FE7F0DC423CF, 0x00003FFC // P_4 data8 0xCCCCCCCC4ED2BA7F, 0x00003FFC // P_2 data8 0xAAAAAAAAAAAAB505, 0x00003FFD // P_0 data8 0x3fe62e42fefa39e8, 0x3cccd5e4f1d9cc02 // log2 hi lo = +6.93147e-001 data8 0xb17217f7d1cf79ab, 0x00003ff7 // ln2_by_128_hi -ASM_SIZE_DIRECTIVE(pow_table_Q) +LOCAL_OBJECT_END(pow_table_Q) -pow_Tt: -ASM_TYPE_DIRECTIVE(pow_Tt,@object) +LOCAL_OBJECT_START(pow_Tt) data8 0x3f60040155d58800, 0x3c93bce0ce3ddd81 // log(1/frcpa(1+0/256))= +1.95503e-003 data8 0x3f78121214586a00, 0x3cb540e0a5cfc9bc // log(1/frcpa(1+1/256))= +5.87661e-003 data8 0x3f841929f9683200, 0x3cbdf1d57404da1f // log(1/frcpa(1+2/256))= +9.81362e-003 @@ -850,13 +870,12 @@ data8 0x3fe5f673c61a2ed0, 0x3caa385eef5f2789 // 
log(1/frcpa(1+252/256))= +6.863 data8 0x3fe6065bea385924, 0x3cb11624f165c5b4 // log(1/frcpa(1+253/256))= +6.88276e-001 data8 0x3fe6164bfa7cc068, 0x3cbad884f87073fa // log(1/frcpa(1+254/256))= +6.90222e-001 data8 0x3fe62643fecf9740, 0x3cb78c51da12f4df // log(1/frcpa(1+255/256))= +6.92171e-001 -ASM_SIZE_DIRECTIVE(pow_Tt) +LOCAL_OBJECT_END(pow_Tt) // Table 1 is 2^(index_1/128) where // index_1 goes from 0 to 15 -pow_tbl1: -ASM_TYPE_DIRECTIVE(pow_tbl1,@object) +LOCAL_OBJECT_START(pow_tbl1) data8 0x8000000000000000 , 0x00003FFF data8 0x80B1ED4FD999AB6C , 0x00003FFF data8 0x8164D1F3BC030773 , 0x00003FFF @@ -873,13 +892,12 @@ data8 0x88980E8092DA8527 , 0x00003FFF data8 0x8955EE03618E5FDD , 0x00003FFF data8 0x8A14D575496EFD9A , 0x00003FFF data8 0x8AD4C6452C728924 , 0x00003FFF -ASM_SIZE_DIRECTIVE(pow_tbl1) +LOCAL_OBJECT_END(pow_tbl1) // Table 2 is 2^(index_1/8) where // index_2 goes from 0 to 7 -pow_tbl2: -ASM_TYPE_DIRECTIVE(pow_tbl2,@object) +LOCAL_OBJECT_START(pow_tbl2) data8 0x8000000000000000 , 0x00003FFF data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF data8 0x9837F0518DB8A96F , 0x00003FFF @@ -888,402 +906,319 @@ data8 0xB504F333F9DE6484 , 0x00003FFF data8 0xC5672A115506DADD , 0x00003FFF data8 0xD744FCCAD69D6AF4 , 0x00003FFF data8 0xEAC0C6E7DD24392F , 0x00003FFF -ASM_SIZE_DIRECTIVE(pow_tbl2) - -.global pow +LOCAL_OBJECT_END(pow_tbl2) .section .text -.proc pow -.align 32 - -pow: +GLOBAL_LIBM_ENTRY(pow) +// Get exponent of x. Will be used to calculate K. { .mfi - alloc r32=ar.pfs,1,35,4,0 - fms.s1 POW_Xm1 = f8,f1,f1 // Will be used for r1 if x>0 - mov pow_GR_17ones = 0x1FFFF + getf.exp pow_GR_signexp_X = f8 + fms.s1 POW_Xm1 = f8,f1,f1 // Will be used for r1 if x>0 + mov pow_GR_17ones = 0x1FFFF } { .mfi -(p0) addl pow_AD_P = @ltoff(pow_table_P), gp - fma.s1 POW_Xp1 = f8,f1,f1 // Will be used for r1 if x<0 + addl pow_AD_P = @ltoff(pow_table_P), gp + fma.s1 POW_Xp1 = f8,f1,f1 // Will be used for r1 if x<0 nop.i 999 ;; } - -// Get exponent of x. Will be used to calculate K. 
+// Get significand of x. Will be used to get index to fetch T, Tt. { .mfi - getf.exp pow_GR_signexp_X = f8 - frcpa.s1 POW_B, p6 = f1,f8 + getf.sig pow_GR_sig_X = f8 + frcpa.s1 POW_B, p6 = f1,f8 nop.i 999 } { .mfi ld8 pow_AD_P = [pow_AD_P] - fma.s1 POW_NORM_X = f8,f1,f0 - mov pow_GR_FFF7 = 0xFFF7 + fma.s1 POW_NORM_X = f8,f1,f0 + mov pow_GR_exp_2tom8 = 0xFFF7 } ;; - - -// Get significand of x. Will be used to get index to fetch T, Tt. // p13 = TRUE ==> X is unorm // DOUBLE 0x10033 exponent limit at which y is an integer -// SINGLE 0x10016 { .mfi - getf.sig pow_GR_sig_X = f8 - fclass.m p13,p0 = f8, 0x0b // Test for x unorm - addl pow_GR_10033 = 0x10033, r0 + nop.m 999 + fclass.m p13,p0 = f8, 0x0b // Test for x unorm + addl pow_GR_10033 = 0x10033, r0 } { .mfi mov pow_GR_16ones = 0xFFFF - fma.s1 POW_NORM_Y = f9,f1,f0 + fma.s1 POW_NORM_Y = f9,f1,f0 nop.i 999 } ;; - // p14 = TRUE ==> X is ZERO { .mfi adds pow_AD_Tt = pow_Tt - pow_table_P, pow_AD_P - fclass.m p14,p15 = f8, 0x07 - and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones + fclass.m p14,p0 = f8, 0x07 + and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones } { .mfi - adds pow_AD_Q = pow_table_Q - pow_table_P, pow_AD_P + adds pow_AD_Q = pow_table_Q - pow_table_P, pow_AD_P nop.f 999 nop.i 999 } ;; { .mfi - ldfe POW_P5 = [pow_AD_P], 16 - fcmp.lt.s1 p8,p9 = f8, f0 // Test for x<0 - shl pow_GR_offset = pow_GR_sig_X, 1 + ldfe POW_P5 = [pow_AD_P], 16 + fcmp.lt.s1 p8,p9 = f8, f0 // Test for x<0 + nop.i 999 } { .mib - ldfe POW_P4 = [pow_AD_Q], 16 - sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones -(p13) br.cond.spnt L(POW_X_DENORM) + ldfe POW_P4 = [pow_AD_Q], 16 + sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones +(p13) br.cond.spnt POW_X_DENORM } ;; - // Continue normal and denormal paths here -L(POW_COMMON): +POW_COMMON: // p11 = TRUE ==> Y is a NAN { .mfi - ldfe POW_P3 = [pow_AD_P], 16 - fclass.m.unc p11,p0 = f9, 0xc3 - shr.u pow_GR_offset = pow_GR_offset,56 + ldfe POW_P3 = [pow_AD_P], 16 + fclass.m p11,p0 = f9, 0xc3 + 
nop.i 999 } { .mfi - ldfe POW_P2 = [pow_AD_Q], 16 + ldfe POW_P2 = [pow_AD_Q], 16 nop.f 999 - nop.i 999 + mov pow_GR_y_zero = 0 } ;; - - -// Compute xsq to decide later if |x|=1 -// p11 = TRUE ==> Y is a NaN +// Note POW_Xm1 and POW_r1 are used interchangably { .mfi - setf.sig POW_int_K = pow_GR_true_exp_X -(p15) fms.s1 POW_r = POW_B, POW_NORM_X,f1 - shladd pow_AD_Tt = pow_GR_offset, 4, pow_AD_Tt + alloc r32=ar.pfs,2,19,4,0 + fms.s1 POW_r = POW_B, POW_NORM_X,f1 + nop.i 999 } { .mfi - nop.m 999 -(p8) fnma.s1 POW_Xm1 = POW_Xp1,f1,f0 + setf.sig POW_int_K = pow_GR_true_exp_X +(p8) fnma.s1 POW_Xm1 = POW_Xp1,f1,f0 nop.i 999 } ;; - - -// p12 = TRUE ==> X is ZERO and Y is ZERO +// p12 = TRUE if Y is ZERO +// Compute xsq to decide later if |x|=1 { .mfi - ldfe POW_P1 = [pow_AD_P], 16 -(p14) fclass.m.unc p12,p0 = f9, 0x07 - nop.i 999 + ldfe POW_P1 = [pow_AD_P], 16 + fclass.m p12,p0 = f9, 0x07 + shl pow_GR_offset = pow_GR_sig_X, 1 } { .mfb - ldfe POW_P0 = [pow_AD_Q], 16 + ldfe POW_P0 = [pow_AD_Q], 16 fma.s1 POW_xsq = POW_NORM_X, POW_NORM_X, f0 -(p11) br.cond.spnt L(POW_Y_NAN) +(p11) br.cond.spnt POW_Y_NAN // Branch if y=nan } ;; - -.pred.rel "mutex",p8,p9 // Get exponent of |x|-1 to use in comparison to 2^-8 -{ .mmf -(p8) getf.exp pow_GR_signexp_Xm1 = POW_Xp1 -(p9) getf.exp pow_GR_signexp_Xm1 = POW_Xm1 - fcvt.fx.s1 POW_int_Y = POW_NORM_Y +{ .mfi + getf.exp pow_GR_signexp_Xm1 = POW_Xm1 + fcvt.fx.s1 POW_int_Y = POW_NORM_Y + shr.u pow_GR_offset = pow_GR_offset,56 } ;; - // p11 = TRUE ==> X is a NAN { .mfi ldfpd POW_log2_hi, POW_log2_lo = [pow_AD_Q], 16 - fclass.m.unc p11,p0 = f8, 0xc3 - nop.i 999 -} -{ .mib - ldfpd POW_T, POW_Tt = [pow_AD_Tt], 16 - nop.i 999 -(p12) br.cond.spnt L(POW_X_0_Y_0) + fclass.m p11,p0 = f8, 0xc3 + shladd pow_AD_Tt = pow_GR_offset, 4, pow_AD_Tt } -;; - - -// p14 = TRUE ==> X is zero -// p15 = TRUE ==> X is zero AND Y is negative -// p10 = TRUE ==> X is zero AND Y is >= zero { .mfi ldfe POW_inv_log2_by_128 = [pow_AD_P], 16 -(p14) fcmp.lt.unc.s1 p15, p10 = 
f9,f0 - nop.i 999 + fma.s1 POW_delta = f0,f0,f0 // delta=0 in case |x| near 1 +(p12) mov pow_GR_y_zero = 1 } -{ .mfi - nop.m 999 - nop.f 999 - and pow_GR_exp_Xm1 = pow_GR_signexp_Xm1, pow_GR_17ones -} -;; - - -// Determine if we will use the |x| near 1 path (p6) or normal path (p7) -// p12 = TRUE ==> X is a NAN and Y is a zero -// p13 = TRUE ==> X is a NAN and Y is anything else -{ .mfi - getf.exp pow_GR_signexp_Y = POW_NORM_Y -(p11) fclass.m.unc p12,p13 = f9, 0x07 - cmp.lt.unc p6,p7 = pow_GR_exp_Xm1, pow_GR_FFF7 -} -{ .mfi - ldfpd POW_Q2, POW_Q3 = [pow_AD_P], 16 - fma.s1 POW_rsq = POW_r, POW_r,f0 - nop.i 999 ;; -} -// If on the x near 1 path, assign r1 to r and r1*r1 to rsq { .mfi - ldfpd POW_Q0_half, POW_Q1 = [pow_AD_P], 16 -(p6) fma.s1 POW_r = POW_r1, f1, f0 - nop.i 999 + ldfpd POW_Q2, POW_Q3 = [pow_AD_P], 16 + fma.s1 POW_G = f0,f0,f0 // G=0 in case |x| near 1 + and pow_GR_exp_Xm1 = pow_GR_signexp_Xm1, pow_GR_17ones } -{ .mfi - nop.m 999 -(p6) fma.s1 POW_rsq = POW_r1, POW_r1, f0 - nop.i 999 ;; -} - +// Determine if we will use the |x| near 1 path (p6) or normal path (p7) { .mfi - ldfpd POW_Q4, POW_RSHF = [pow_AD_P], 16 -(p7) fma.s1 POW_v6 = POW_r, POW_P5, POW_P4 - and pow_GR_exp_Y = pow_GR_signexp_Y, pow_GR_17ones + getf.exp pow_GR_signexp_Y = POW_NORM_Y + nop.f 999 + cmp.lt p6,p7 = pow_GR_exp_Xm1, pow_GR_exp_2tom8 } { .mfb - nop.m 999 -(p6) fma.s1 POW_v6 = POW_r1, POW_P5, POW_P4 -(p12) br.cond.spnt L(POW_X_NAN_Y_0) + ldfpd POW_T, POW_Tt = [pow_AD_Tt], 16 + fma.s1 POW_rsq = POW_r, POW_r,f0 +(p11) br.cond.spnt POW_X_NAN // Branch if x=nan and y not nan } ;; - +// If on the x near 1 path, assign r1 to r and r1*r1 to rsq { .mfi - nop.m 999 -(p7) fma.s1 POW_v4 = POW_P3, POW_r, POW_P2 - andcm pow_GR_sign_Y = pow_GR_signexp_Y, pow_GR_17ones + ldfpd POW_Q0_half, POW_Q1 = [pow_AD_P], 16 +(p6) fma.s1 POW_r = POW_r1, f1, f0 + nop.i 999 } { .mfb nop.m 999 -(p6) fma.s1 POW_v4 = POW_P3, POW_r1, POW_P2 -(p12) br.cond.spnt L(POW_X_NAN_Y_0) +(p6) fma.s1 POW_rsq = POW_r1, 
POW_r1, f0 +(p14) br.cond.spnt POW_X_0 // Branch if x zero and y not nan } ;; { .mfi - nop.m 999 - fcvt.xf POW_K = POW_int_K + ldfpd POW_Q4, POW_RSHF = [pow_AD_P], 16 +(p7) fma.s1 POW_v6 = POW_r, POW_P5, POW_P4 nop.i 999 } -{ .mfb - nop.m 999 -(p13) fma.d f8 = f8,f1,f0 -(p13) br.ret.spnt b0 // Exit if x nan, y anything but zero +{ .mfi + mov pow_GR_exp_2toM63 = 0xffc0 // Exponent of 2^-63 +(p6) fma.s1 POW_v6 = POW_r1, POW_P5, POW_P4 + nop.i 999 } ;; - -// p10 = TRUE ==> X is zero AND Y is positive -// p8 = TRUE ==> X is zero AND Y is outside integer range (treat as even int) -// return +0 -// p9 = TRUE ==> X is zero AND Y is within integer range (may not be integer) + { .mfi -(p10) cmp.gt.unc p8,p9 = pow_GR_exp_Y, pow_GR_10033 -(p6) fmerge.s POW_delta = f0,f0 + setf.exp POW_2toM63 = pow_GR_exp_2toM63 // Form 2^-63 for test of q +(p7) fma.s1 POW_v4 = POW_P3, POW_r, POW_P2 nop.i 999 } { .mfi nop.m 999 -(p6) fma.s1 POW_G = f0,f0,f0 +(p6) fma.s1 POW_v4 = POW_P3, POW_r1, POW_P2 nop.i 999 } ;; { .mfi - getf.sig pow_GR_sig_int_Y = POW_int_Y - fnma.s1 POW_twoV = POW_NORM_Y, POW_rsq,f0 - nop.i 999 -} -{ .mfi nop.m 999 - fma.s1 POW_U = POW_NORM_Y,POW_r,f0 + fcvt.xf POW_K = POW_int_K nop.i 999 } ;; { .mfi - ldfe POW_log2_by_128_lo = [pow_AD_P], 16 -(p6) fma.s1 POW_v2 = POW_P1, POW_r1, POW_P0 - nop.i 999 + getf.sig pow_GR_sig_int_Y = POW_int_Y + fnma.s1 POW_twoV = POW_NORM_Y, POW_rsq,f0 + and pow_GR_exp_Y = pow_GR_signexp_Y, pow_GR_17ones } -{ .mfi - ldfe POW_log2_by_128_hi = [pow_AD_Q], 16 -(p7) fma.s1 POW_v2 = POW_P1, POW_r, POW_P0 - nop.i 999 +{ .mfb + andcm pow_GR_sign_Y = pow_GR_signexp_Y, pow_GR_17ones + fma.s1 POW_U = POW_NORM_Y,POW_r,f0 +(p12) br.cond.spnt POW_Y_0 // Branch if y=zero, x not zero or nan } ;; - +// p11 = TRUE ==> X is NEGATIVE but not inf { .mfi - nop.m 999 - fcvt.xf POW_float_int_Y = POW_int_Y + ldfe POW_log2_by_128_lo = [pow_AD_P], 16 + fclass.m p11,p0 = f8, 0x1a nop.i 999 } { .mfi - nop.m 999 - fma.s1 POW_v3 = POW_v6, POW_rsq, POW_v4 - adds 
pow_AD_tbl1 = pow_tbl1 - pow_Tt, pow_AD_Q + ldfe POW_log2_by_128_hi = [pow_AD_Q], 16 + fma.s1 POW_v2 = POW_P1, POW_r, POW_P0 + nop.i 999 } ;; { .mfi nop.m 999 -(p7) fma.s1 POW_delta = POW_K, POW_log2_lo, POW_Tt + fcvt.xf POW_float_int_Y = POW_int_Y nop.i 999 } { .mfi nop.m 999 -(p7) fma.s1 POW_G = POW_K, POW_log2_hi, POW_T - adds pow_AD_tbl2 = pow_tbl2 - pow_tbl1, pow_AD_tbl1 + fma.s1 POW_v3 = POW_v6, POW_rsq, POW_v4 + adds pow_AD_tbl1 = pow_tbl1 - pow_Tt, pow_AD_Q } ;; - { .mfi nop.m 999 - fms.s1 POW_e2 = POW_NORM_Y, POW_r, POW_U +(p7) fma.s1 POW_delta = POW_K, POW_log2_lo, POW_Tt nop.i 999 } { .mfi nop.m 999 - fma.s1 POW_Z2 = POW_twoV, POW_Q0_half, POW_U - nop.i 999 +(p7) fma.s1 POW_G = POW_K, POW_log2_hi, POW_T + adds pow_AD_tbl2 = pow_tbl2 - pow_tbl1, pow_AD_tbl1 } ;; -// p11 = TRUE ==> X is NEGATIVE -// p8 = TRUE ==> X is zero AND Y is outside intger range (treat as even int) -// return +0 { .mfi nop.m 999 - fclass.m.unc p11,p0 = f8, 0x1a + fms.s1 POW_e2 = POW_NORM_Y, POW_r, POW_U nop.i 999 } -{ .mfb +{ .mfi nop.m 999 -(p8) fma.d f8 = f0,f0,f0 -(p8) br.ret.spnt b0 + fma.s1 POW_Z2 = POW_twoV, POW_Q0_half, POW_U + nop.i 999 } ;; -{ .mfi +{ .mfi nop.m 999 - fma.s1 POW_Yrcub = POW_rsq, POW_U, f0 + fma.s1 POW_Yrcub = POW_rsq, POW_U, f0 nop.i 999 } -{ .mfi +{ .mfi nop.m 999 - fma.s1 POW_p = POW_rsq, POW_v3, POW_v2 + fma.s1 POW_p = POW_rsq, POW_v3, POW_v2 nop.i 999 } ;; - -// p11 = TRUE ==> X is NEGATIVE -// p12 = TRUE ==> X is NEGATIVE AND Y already int +// p11 = TRUE ==> X is NEGATIVE but not inf +// p12 = TRUE ==> X is NEGATIVE AND Y already even int // p13 = TRUE ==> X is NEGATIVE AND Y possible int { .mfi nop.m 999 - fma.s1 POW_Z1 = POW_NORM_Y, POW_G, f0 -(p11) cmp.ge.unc p12,p13 = pow_GR_exp_Y, pow_GR_10033 + fma.s1 POW_Z1 = POW_NORM_Y, POW_G, f0 +(p11) cmp.gt.unc p12,p13 = pow_GR_exp_Y, pow_GR_10033 } { .mfi nop.m 999 - fma.s1 POW_e3 = POW_NORM_Y, POW_delta, f0 + fma.s1 POW_Gpr = POW_G, f1, POW_r nop.i 999 } ;; -// p9 = TRUE ==> X is zero AND Y is within 
integer range (may not be integer) -// p6 = TRUE ==> X is zero AND Y is an integer (may be even or odd) -// p7 = TRUE ==> X is zero AND Y is NOT an integer, return +0 +// By adding RSHF (1.1000...*2^63) we put integer part in rightmost significand { .mfi nop.m 999 -(p9) fcmp.eq.unc.s1 p6,p7 = POW_float_int_Y, POW_NORM_Y + fma.s1 POW_W2 = POW_Z2, POW_inv_log2_by_128, POW_RSHF nop.i 999 } -{ .mfi +{ .mfi nop.m 999 - fma.s1 POW_Gpr = POW_G, f1, POW_r + fms.s1 POW_UmZ2 = POW_U, f1, POW_Z2 nop.i 999 } ;; -// By adding RSHF (1.1000...*2^63) we put integer part in rightmost significand { .mfi nop.m 999 - fma.s1 POW_W2 = POW_Z2, POW_inv_log2_by_128, POW_RSHF - nop.i 999 -} -{ .mfi - nop.m 999 - fms.s1 POW_UmZ2 = POW_U, f1, POW_Z2 + fma.s1 POW_e3 = POW_NORM_Y, POW_delta, f0 nop.i 999 } ;; - -// If x=0 and y>0, test y and flag denormal -// p6 = TRUE ==> X is zero AND Y is an integer (may be even or odd) -// p8 = TRUE ==> X is zero AND Y is an odd integer -// p9 = TRUE ==> X is zero AND Y is an even integer { .mfi nop.m 999 -(p10) fcmp.eq.s0 p15,p0 = f9,f0 -(p6) tbit.nz.unc p8,p9 = pow_GR_sig_int_Y,0 + fma.s1 POW_Z3 = POW_p, POW_Yrcub, f0 + nop.i 999 } { .mfi nop.m 999 - fma.s1 POW_Z3 = POW_p, POW_Yrcub, f0 + fma.s1 POW_GY_Z2 = POW_G, POW_NORM_Y, POW_Z2 nop.i 999 } ;; @@ -1291,7 +1226,7 @@ L(POW_COMMON): // By adding RSHF (1.1000...*2^63) we put integer part in rightmost significand { .mfi nop.m 999 - fms.s1 POW_e1 = POW_NORM_Y, POW_G, POW_Z1 + fms.s1 POW_e1 = POW_NORM_Y, POW_G, POW_Z1 nop.i 999 } { .mfi @@ -1301,81 +1236,60 @@ L(POW_COMMON): } ;; +// p13 = TRUE ==> X is NEGATIVE AND Y possible int +// p10 = TRUE ==> X is NEG and Y is an int +// p12 = TRUE ==> X is NEG and Y is not an int { .mfi nop.m 999 -(p7) fma.d f8 = f0,f0,f0 // Result +0 if x zero and y not integer - nop.i 999 +(p13) fcmp.eq.unc.s1 p10,p12 = POW_float_int_Y, POW_NORM_Y + mov pow_GR_xneg_yodd = 0 } -{ .mfb +{ .mfi nop.m 999 - fma.s1 POW_Y_Gpr = POW_NORM_Y, POW_Gpr, f0 -(p8) br.ret.spnt b0 // Exit if x 
zero and y odd integer + fma.s1 POW_Y_Gpr = POW_NORM_Y, POW_Gpr, f0 + nop.i 999 } ;; // By subtracting RSHF we get rounded integer POW_N2float -// p15 = TRUE ==> X_0_Y_NEG { .mfi nop.m 999 fms.s1 POW_N2float = POW_W2, f1, POW_RSHF nop.i 999 } -{ .mfb +{ .mfi nop.m 999 - fma.s1 POW_UmZ2pV = POW_twoV,POW_Q0_half,POW_UmZ2 -(p15) br.cond.spnt L(POW_X_0_Y_NEG) + fma.s1 POW_UmZ2pV = POW_twoV,POW_Q0_half,POW_UmZ2 + nop.i 999 } ;; - - { .mfi nop.m 999 - fma.s1 POW_Z3sq = POW_Z3, POW_Z3, f0 + fma.s1 POW_Z3sq = POW_Z3, POW_Z3, f0 nop.i 999 } -{ .mfb +{ .mfi nop.m 999 - fma.s1 POW_v4 = POW_Z3, POW_Q3, POW_Q2 -(p7) br.ret.spnt b0 // Exit if x zero and y not an integer + fma.s1 POW_v4 = POW_Z3, POW_Q3, POW_Q2 + nop.i 999 } ;; - - // Extract rounded integer from rightmost significand of POW_W2 // By subtracting RSHF we get rounded integer POW_N1float { .mfi - getf.sig pow_GR_int_W2 = POW_W2 + getf.sig pow_GR_int_W2 = POW_W2 fms.s1 POW_N1float = POW_W1, f1, POW_RSHF nop.i 999 } { .mfi nop.m 999 - fma.s1 POW_v2 = POW_Z3, POW_Q1, POW_Q0_half + fma.s1 POW_v2 = POW_Z3, POW_Q1, POW_Q0_half nop.i 999 } ;; - - - -// p13 = TRUE ==> X is NEGATIVE AND Y possible int -// p10 = TRUE ==> X is NEG and Y is an int -// p12 = TRUE ==> X is NEG and Y is not an int -{ .mfi - nop.m 999 -(p13) fcmp.eq.unc.s1 p10,p12 = POW_float_int_Y, POW_NORM_Y - nop.i 999 -} -{ .mfb - nop.m 999 -(p9) fma.d f8 = f0,f0,f0 // Result +0 if x zero and y even integer -(p9) br.ret.spnt b0 // Exit if x zero and y even integer -} -;; - - { .mfi nop.m 999 fnma.s1 POW_s2 = POW_N2float, POW_log2_by_128_hi, POW_Z2 @@ -1383,7 +1297,7 @@ L(POW_COMMON): } { .mfi nop.m 999 - fma.s1 POW_e2 = POW_e2,f1,POW_UmZ2pV + fma.s1 POW_e2 = POW_e2,f1,POW_UmZ2pV nop.i 999 } ;; @@ -1391,278 +1305,283 @@ L(POW_COMMON): // Extract rounded integer from rightmost significand of POW_W1 // Test if x inf { .mfi - getf.sig pow_GR_int_W1 = POW_W1 - fclass.m.unc p15,p0 = POW_NORM_X, 0x23 + getf.sig pow_GR_int_W1 = POW_W1 + fclass.m p15,p0 = POW_NORM_X, 
0x23 nop.i 999 } { .mfb nop.m 999 fnma.s1 POW_f2 = POW_N2float, POW_log2_by_128_lo, f1 -(p12) br.cond.spnt L(POW_X_NEG_Y_NONINT) // Branch if x neg, y not integer +(p12) br.cond.spnt POW_X_NEG_Y_NONINT // Branch if x neg, y not integer } ;; +// p11 = TRUE ==> X is +1.0 // p12 = TRUE ==> X is NEGATIVE AND Y is an odd integer { .mfi - getf.exp pow_GR_signexp_Y_Gpr = POW_Y_Gpr - fma.s1 POW_v3 = POW_Z3sq, POW_Q4, POW_v4 -(p10) tbit.nz.unc p12,p0 = pow_GR_sig_int_Y,0 + getf.exp pow_GR_signexp_Y_Gpr = POW_Y_Gpr + fcmp.eq.s1 p11,p0 = POW_NORM_X, f1 +(p10) tbit.nz.unc p12,p0 = pow_GR_sig_int_Y,0 +} +{ .mfi + nop.m 999 + fma.s1 POW_v3 = POW_Z3sq, POW_Q4, POW_v4 + nop.i 999 } ;; - { .mfi - add pow_GR_int_N = pow_GR_int_W1, pow_GR_int_W2 + nop.m 999 fnma.s1 POW_f1 = POW_N1float, POW_log2_by_128_lo, f1 nop.i 999 } { .mfb nop.m 999 fnma.s1 POW_s1 = POW_N1float, POW_log2_by_128_hi, POW_Z1 -(p15) br.cond.spnt L(POW_X_INF) +(p15) br.cond.spnt POW_X_INF } ;; - // Test x and y and flag denormal { .mfi - and pow_GR_index1 = 0x0f, pow_GR_int_N + nop.m 999 fcmp.eq.s0 p15,p0 = f8,f9 - shr r2 = pow_GR_int_N, 7 + nop.i 999 } { .mfi - and pow_GR_exp_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones - nop.f 999 - and pow_GR_index2 = 0x70, pow_GR_int_N + nop.m 999 + fma.s1 POW_pYrcub_e3 = POW_p, POW_Yrcub, POW_e3 + nop.i 999 } ;; - - { .mfi - shladd pow_AD_T1 = pow_GR_index1, 4, pow_AD_tbl1 + nop.m 999 fcmp.eq.s1 p7,p0 = POW_NORM_Y, f1 // Test for y=1.0 - sub pow_GR_true_exp_Y_Gpr = pow_GR_exp_Y_Gpr, pow_GR_16ones + nop.i 999 } { .mfi - addl pow_int_GR_M = 0xFFFF, r2 - fma.s1 POW_e12 = POW_e1,f1,POW_e2 - add pow_AD_T2 = pow_AD_tbl2, pow_GR_index2 + nop.m 999 + fma.s1 POW_e12 = POW_e1,f1,POW_e2 + nop.i 999 } ;; - -{ .mmi - ldfe POW_T1 = [pow_AD_T1],16 - setf.exp POW_2M = pow_int_GR_M - andcm pow_GR_sign_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones +{ .mfi + add pow_GR_int_N = pow_GR_int_W1, pow_GR_int_W2 +(p11) fma.d.s0 f8 = f1,f1,f0 // If x=1, result is +1 + nop.i 999 +} +{ .mib +(p12) mov 
pow_GR_xneg_yodd = 1 + nop.i 999 +(p11) br.ret.spnt b0 // Early exit if x=1.0, result is +1 } ;; - -{ .mfb - ldfe POW_T2 = [pow_AD_T2],16 - fma.s1 POW_q = POW_Z3sq, POW_v3, POW_v2 +{ .mfi + and pow_GR_index1 = 0x0f, pow_GR_int_N + fma.s1 POW_q = POW_Z3sq, POW_v3, POW_v2 + shr pow_int_GR_M = pow_GR_int_N, 7 // M = N/128 +} +{ .mib + and pow_GR_index2 = 0x70, pow_GR_int_N + cmp.eq p6, p0 = pow_GR_xneg_yodd, r0 (p7) br.ret.spnt b0 // Early exit if y=1.0, result is x } ;; - -// double: p8 TRUE ==> |Y(G + r)| >= 10 -// single: p8 TRUE ==> |Y(G + r)| >= 7 - -// double -// -2^10 -2^9 2^9 2^10 -// -----+-----+----+ ... +-----+-----+----- -// p8 | p9 | p8 -// | | p10 | | -// single -// -2^7 -2^6 2^6 2^7 -// -----+-----+----+ ... +-----+-----+----- -// p8 | p9 | p8 -// | | p10 | | - - { .mfi -(p0) cmp.le.unc p8,p9 = 10, pow_GR_true_exp_Y_Gpr - fma.s1 POW_s = POW_s1, f1, POW_s2 - nop.i 999 + shladd pow_AD_T1 = pow_GR_index1, 4, pow_AD_tbl1 + fma.s1 POW_s = POW_s1, f1, POW_s2 + add pow_int_GR_M = pow_GR_16ones, pow_int_GR_M } { .mfi - nop.m 999 - fma.s1 POW_f12 = POW_f1, POW_f2,f0 - nop.i 999 + add pow_AD_T2 = pow_AD_tbl2, pow_GR_index2 + fma.s1 POW_f12 = POW_f1, POW_f2,f0 + and pow_GR_exp_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones } ;; - -{ .mfi - nop.f 999 -(p9) cmp.le.unc p0,p10 = 9, pow_GR_true_exp_Y_Gpr +{ .mmi + ldfe POW_T1 = [pow_AD_T1] + ldfe POW_T2 = [pow_AD_T2] + sub pow_GR_true_exp_Y_Gpr = pow_GR_exp_Y_Gpr, pow_GR_16ones } ;; - - +{ .mfi + setf.exp POW_2M = pow_int_GR_M + fma.s1 POW_e123 = POW_e12, f1, POW_e3 + nop.i 999 +} { .mfb - nop.m 999 - fma.s1 POW_e123 = POW_e12, f1, POW_e3 -(p8) br.cond.spnt L(POW_OVER_UNDER_X_NOT_INF) +(p6) cmp.gt p6, p0 = -11, pow_GR_true_exp_Y_Gpr + fma.s1 POW_d = POW_GY_Z2, f1, POW_pYrcub_e3 +(p6) br.cond.spnt POW_NEAR_ONE // branch if |y*log(x)| < 2^(-11) } ;; - -{ .mmf - fma.s1 POW_q = POW_Z3sq, POW_q, POW_Z3 +{ .mfi + nop.m 999 + fma.s1 POW_q = POW_Z3sq, POW_q, POW_Z3 + nop.i 999 } ;; +// p8 TRUE ==> |Y(G + r)| >= 10 +// double +// 
-2^10 -2^9 2^9 2^10 +// -----+-----+----+ ... +-----+-----+----- +// p8 | p9 | p8 +// | | p10 | | + +// Form signexp of constants to indicate overflow { .mfi - nop.m 999 - fma.s1 POW_ssq = POW_s, POW_s, f0 - nop.i 999 + mov pow_GR_big_pos = 0x103ff + fma.s1 POW_ssq = POW_s, POW_s, f0 + cmp.le p8,p9 = 10, pow_GR_true_exp_Y_Gpr } { .mfi - nop.m 999 - fma.s1 POW_v4 = POW_s, POW_Q3, POW_Q2 - nop.i 999 + mov pow_GR_big_neg = 0x303ff + fma.s1 POW_v4 = POW_s, POW_Q3, POW_Q2 + andcm pow_GR_sign_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones } ;; +// Form big positive and negative constants to test for possible overflow { .mfi - nop.m 999 - fma.s1 POW_v2 = POW_s, POW_Q1, POW_Q0_half - nop.i 999 + setf.exp POW_big_pos = pow_GR_big_pos + fma.s1 POW_v2 = POW_s, POW_Q1, POW_Q0_half +(p9) cmp.le.unc p0,p10 = 9, pow_GR_true_exp_Y_Gpr } -{ .mfi - nop.m 999 - fma.s1 POW_1ps = f1,f1,POW_s - nop.i 999 +{ .mfb + setf.exp POW_big_neg = pow_GR_big_neg + fma.s1 POW_1ps = f1,f1,POW_s +(p8) br.cond.spnt POW_OVER_UNDER_X_NOT_INF } ;; +// f123 = f12*(e123+1) = f12*e123+f12 { .mfi nop.m 999 - fma.s1 POW_f3 = POW_e123,f1,f1 + fma.s1 POW_f123 = POW_e123,POW_f12,POW_f12 nop.i 999 } ;; { .mfi nop.m 999 - fma.s1 POW_T1T2 = POW_T1, POW_T2, f0 + fma.s1 POW_T1T2 = POW_T1, POW_T2, f0 nop.i 999 } -;; - { .mfi nop.m 999 - fma.s1 POW_v3 = POW_ssq, POW_Q4, POW_v4 - nop.i 999 + fma.s1 POW_v3 = POW_ssq, POW_Q4, POW_v4 + cmp.ne p12,p13 = pow_GR_xneg_yodd, r0 } ;; { .mfi nop.m 999 - fma.s1 POW_v21ps = POW_ssq, POW_v2, POW_1ps + fma.s1 POW_v21ps = POW_ssq, POW_v2, POW_1ps nop.i 999 } { .mfi nop.m 999 - fma.s1 POW_s4 = POW_ssq, POW_ssq, f0 + fma.s1 POW_s4 = POW_ssq, POW_ssq, f0 nop.i 999 } ;; { .mfi nop.m 999 - fma.s1 POW_f123 = POW_f12, POW_f3, f0 +(p12) fnma.s1 POW_A = POW_2M, POW_f123, f0 nop.i 999 } +{ .mfi + nop.m 999 +(p13) fma.s1 POW_A = POW_2M, POW_f123, f0 + cmp.eq p14,p11 = r0,r0 // Initialize p14 on, p11 off +} ;; { .mfi nop.m 999 - fma.s1 POW_A = POW_2M, POW_T1T2, f0 + fmerge.s POW_abs_q = f0, POW_q // 
Form |q| so can test its size nop.i 999 } ;; - - { .mfi - nop.m 999 -(p12) fmerge.s POW_f123 = f8,POW_f123 // if x neg, y odd int +(p10) cmp.eq p0,p14 = r0,r0 // Turn off p14 if no overflow + fma.s1 POW_es = POW_s4, POW_v3, POW_v21ps nop.i 999 } { .mfi nop.m 999 -// fma.s1 POW_es = POW_ssq, POW_v3, POW_v2 + fma.s1 POW_A = POW_A, POW_T1T2, f0 nop.i 999 } ;; { .mfi +// Test for |q| < 2^-63. If so then reverse last two steps of the result +// to avoid monotonicity problems for results near 1.0 in round up/down/zero. +// p11 will be set if need to reverse the order, p14 if not. nop.m 999 - fma.s1 POW_es = POW_s4, POW_v3, POW_v21ps +(p10) fcmp.lt.s0 p11,p14 = POW_abs_q, POW_2toM63 // Test |q| <2^-63 nop.i 999 } ;; - +.pred.rel "mutex",p11,p14 { .mfi nop.m 999 - fma.s1 POW_A = POW_A, POW_f123, f0 +(p14) fma.s1 POW_A = POW_A, POW_es, f0 nop.i 999 } { .mfi nop.m 999 -// fma.s1 POW_es = POW_es, POW_ssq, POW_1ps +(p11) fma.s1 POW_A = POW_A, POW_q, POW_A nop.i 999 } ;; - +// Dummy op to set inexact if |q| < 2^-63 { .mfi nop.m 999 - fma.s1 POW_A = POW_A, POW_es,f0 +(p11) fma.d.s0 POW_tmp = POW_A, POW_q, POW_A nop.i 999 } ;; - - +{ .mfi + nop.m 999 +(p14) fma.d.s0 f8 = POW_A, POW_q, POW_A + nop.i 999 +} { .mfb nop.m 999 -(p10) fma.d f8 = POW_A, POW_q, POW_A -(p10) br.ret.sptk b0 +(p11) fma.d.s0 f8 = POW_A, POW_es, f0 +(p10) br.ret.sptk b0 // Exit main branch if no over/underflow } ;; - - - - // POSSIBLE_OVER_UNDER -// p6 = TRUE ==> Y negative +// p6 = TRUE ==> Y_Gpr negative +// Result is already computed. We just need to know if over/underflow occurred. -{ .mfi - nop.m 999 - fmerge.s POW_abs_A = f0, POW_A - cmp.eq.unc p0,p6 = pow_GR_sign_Y, r0 -} -;; - -{ .mib - nop.m 999 - nop.i 999 -(p6) br.cond.spnt L(POW_POSSIBLE_UNDER) +{ .mfb + cmp.eq p0,p6 = pow_GR_sign_Y_Gpr, r0 + nop.f 999 +(p6) br.cond.spnt POW_POSSIBLE_UNDER } ;; // POSSIBLE_OVER -// We got an answer. +// We got an answer. 
// overflow is a possibility, not a certainty @@ -1692,21 +1611,20 @@ L(POW_COMMON): // RN RN // RZ - // Put in s2 (td set, wre set) { .mfi - mov pow_GR_gt_ln = 0x103ff + nop.m 999 fsetc.s2 0x7F,0x42 - nop.i 999 + nop.i 999 } ;; - { .mfi - setf.exp POW_gt_pln = pow_GR_gt_ln - fma.d.s2 POW_wre_urm_f8 = POW_abs_A, POW_q, POW_abs_A - nop.i 999 ;; + nop.m 999 + fma.d.s2 POW_wre_urm_f8 = POW_A, POW_q, POW_A + nop.i 999 } +;; // Return s2 to default { .mfi @@ -1716,31 +1634,67 @@ L(POW_COMMON): } ;; - // p7 = TRUE ==> yes, we have an overflow { .mfi nop.m 999 - fcmp.ge.unc.s1 p7, p0 = POW_wre_urm_f8, POW_gt_pln + fcmp.ge.s1 p7, p8 = POW_wre_urm_f8, POW_big_pos nop.i 999 } ;; +{ .mfi + nop.m 999 +(p8) fcmp.le.s1 p7, p0 = POW_wre_urm_f8, POW_big_neg + nop.i 999 +} +;; +{ .mbb +(p7) mov pow_GR_tag = 24 +(p7) br.cond.spnt __libm_error_region // Branch if overflow + br.ret.sptk b0 // Exit if did not overflow +} +;; -{ .mfb -(p7) mov pow_GR_tag = 24 - fma.d f8 = POW_A, POW_q, POW_A -(p7) br.cond.spnt __libm_error_region +// Here if |y*log(x)| < 2^(-11) +// pow(x,y) ~ exp(d) ~ 1 + d + 0.5*d^2 + Q1*d^3 + Q2*d^4, where d = y*log(x) +.align 32 +POW_NEAR_ONE: + +{ .mfi + nop.m 999 + fma.s1 POW_d2 = POW_d, POW_d, f0 + nop.i 999 } -{ .mfb - nop.m 999 - nop.f 999 -(p0) br.ret.sptk b0 +;; + +{ .mfi + nop.m 999 + fma.s1 POW_poly_d_hi = POW_d, POW_Q0_half, f1 + nop.i 999 +} +{ .mfi + nop.m 999 + fma.s1 POW_poly_d_lo = POW_d, POW_Q2, POW_Q1 + nop.i 999 } ;; +{ .mfi + nop.m 999 + fma.s1 POW_poly_d = POW_d2, POW_poly_d_lo, POW_poly_d_hi + nop.i 999 +} +;; + +{ .mfb + nop.m 999 + fma.d.s0 f8 = POW_d, POW_poly_d, f1 + br.ret.sptk b0 // exit function for arguments |y*log(x)| < 2^(-11) +} +;; -L(POW_POSSIBLE_UNDER): +POW_POSSIBLE_UNDER: // We got an answer. input was < -2^9 but > -2^10 (double) // We got an answer. 
input was < -2^6 but > -2^7 (float) // underflow is a possibility, not a certainty @@ -1763,124 +1717,250 @@ L(POW_POSSIBLE_UNDER): // 0.1...11 2^-3ffe (biased, 1) // largest dn smallest normal - // Put in s2 (td set, ftz set) { .mfi nop.m 999 fsetc.s2 0x7F,0x41 - nop.i 999 + nop.i 999 } ;; - - { .mfi nop.m 999 - fma.d.s2 POW_ftz_urm_f8 = POW_A, POW_q, POW_A + fma.d.s2 POW_ftz_urm_f8 = POW_A, POW_q, POW_A nop.i 999 } ;; - // Return s2 to default { .mfi nop.m 999 fsetc.s2 0x7F,0x40 - nop.i 999 + nop.i 999 } ;; - // p7 = TRUE ==> yes, we have an underflow { .mfi nop.m 999 - fcmp.eq.unc.s1 p7, p0 = POW_ftz_urm_f8, f0 - nop.i 999 + fcmp.eq.s1 p7, p0 = POW_ftz_urm_f8, f0 + nop.i 999 } ;; +{ .mbb +(p7) mov pow_GR_tag = 25 +(p7) br.cond.spnt __libm_error_region // Branch if underflow + br.ret.sptk b0 // Exit if did not underflow +} +;; + +POW_X_DENORM: +// Here if x unorm. Use the NORM_X for getf instructions, and then back +// to normal path +{ .mfi + getf.exp pow_GR_signexp_X = POW_NORM_X + nop.f 999 + nop.i 999 +} +;; +{ .mmi + getf.sig pow_GR_sig_X = POW_NORM_X +;; + and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones + nop.i 999 +} +;; + +{ .mib + sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones + nop.i 999 + br.cond.sptk POW_COMMON +} +;; +POW_X_0: +// Here if x=0 and y not nan +// +// We have the following cases: +// p6 x=0 and y>0 and is an integer (may be even or odd) +// p7 x=0 and y>0 and is NOT an integer, return +0 +// p8 x=0 and y>0 and so big as to always be an even integer, return +0 +// p9 x=0 and y>0 and may not be integer +// p10 x=0 and y>0 and is an odd integer, return x +// p11 x=0 and y>0 and is an even integer, return +0 +// p12 used in dummy fcmp to set denormal flag if y=unorm +// p13 x=0 and y>0 +// p14 x=0 and y=0, branch to code for calling error handling +// p15 x=0 and y<0, branch to code for calling error handling +// +{ .mfi + getf.sig pow_GR_sig_int_Y = POW_int_Y // Get signif of int_Y + fcmp.lt.s1 p15,p13 = f9, f0 // Test for y<0 + and 
pow_GR_exp_Y = pow_GR_signexp_Y, pow_GR_17ones +} +{ .mfb + cmp.ne p14,p0 = pow_GR_y_zero,r0 // Test for y=0 + fcvt.xf POW_float_int_Y = POW_int_Y +(p14) br.cond.spnt POW_X_0_Y_0 // Branch if x=0 and y=0 +} +;; +// If x=0 and y>0, test y and flag denormal { .mfb -(p7) mov pow_GR_tag = 25 - fma.d f8 = POW_A, POW_q, POW_A -(p7) br.cond.spnt __libm_error_region +(p13) cmp.gt.unc p8,p9 = pow_GR_exp_Y, pow_GR_10033 // Test y +big = even int +(p13) fcmp.eq.s0 p12,p0 = f9,f0 // If x=0, y>0 dummy op to flag denormal +(p15) br.cond.spnt POW_X_0_Y_NEG // Branch if x=0 and y<0 } ;; +// Here if x=0 and y>0 +{ .mfi + nop.m 999 +(p9) fcmp.eq.unc.s1 p6,p7 = POW_float_int_Y, POW_NORM_Y // Test y=int + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.d.s0 f8 = f0,f0,f0 // If x=0, y>0 and large even int, return +0 + nop.i 999 +} +;; +{ .mfi + nop.m 999 +(p7) fma.d.s0 f8 = f0,f0,f0 // Result +0 if x=0 and y>0 and not integer +(p6) tbit.nz.unc p10,p11 = pow_GR_sig_int_Y,0 // If y>0 int, test y even/odd +} +;; + +// Note if x=0, y>0 and odd integer, just return x { .mfb nop.m 999 - nop.f 999 - br.ret.sptk b0 +(p11) fma.d.s0 f8 = f0,f0,f0 // Result +0 if x=0 and y even integer + br.ret.sptk b0 // Exit if x=0 and y>0 } ;; +POW_X_0_Y_0: +// When X is +-0 and Y is +-0, IEEE returns 1.0 +// We call error support with this value -L(POW_X_DENORM): -// Here if x unorm. Use the NORM_X for getf instructions, and the back -// to normal path -{ .mfi - getf.exp pow_GR_signexp_X = POW_NORM_X - nop.f 999 - nop.i 999 +{ .mfb + mov pow_GR_tag = 26 + fma.d.s0 f8 = f1,f1,f0 + br.cond.sptk __libm_error_region } ;; +POW_X_0_Y_NEG: +// When X is +-0 and Y is negative, IEEE returns +// X Y answer +// +0 -odd int +inf +// -0 -odd int -inf + +// +0 !-odd int +inf +// -0 !-odd int +inf + +// p6 == Y is a floating point number outside the integer. +// Hence it is an integer and is even. +// return +inf + +// p7 == Y is a floating point number within the integer range. 
+// p9 == (int_Y = NORM_Y), Y is an integer, which may be odd or even. +// p11 odd +// return (sign_of_x)inf +// p12 even +// return +inf +// p10 == Y is not an integer +// return +inf +// + { .mfi - getf.sig pow_GR_sig_X = POW_NORM_X - nop.f 999 - nop.i 999 + nop.m 999 + nop.f 999 + cmp.gt p6,p7 = pow_GR_exp_Y, pow_GR_10033 } ;; { .mfi - and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones - nop.f 999 + mov pow_GR_tag = 27 +(p7) fcmp.eq.unc.s1 p9,p10 = POW_float_int_Y, POW_NORM_Y + nop.i 999 } ;; -{ .mib - sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones - shl pow_GR_offset = pow_GR_sig_X, 1 - br.cond.sptk L(POW_COMMON) +{ .mfb + nop.m 999 +(p6) frcpa.s0 f8,p13 = f1, f0 +(p6) br.cond.sptk __libm_error_region // x=0, y<0, y large neg int +} +;; + +{ .mfb + nop.m 999 +(p10) frcpa.s0 f8,p13 = f1, f0 +(p10) br.cond.sptk __libm_error_region // x=0, y<0, y not int } ;; +// x=0, y<0, y an int +{ .mib + nop.m 999 +(p9) tbit.nz.unc p11,p12 = pow_GR_sig_int_Y,0 + nop.b 999 +} +;; -L(POW_X_0_Y_0): -// When X is +-0 and Y is +-0, IEEE returns 1.0 -// We call error support with this value +{ .mfi + nop.m 999 +(p12) frcpa.s0 f8,p13 = f1,f0 + nop.i 999 +} +;; { .mfb - mov pow_GR_tag = 26 - fma.d f8 = f1,f1,f0 - br.cond.sptk __libm_error_region + nop.m 999 +(p11) frcpa.s0 f8,p13 = f1,f8 + br.cond.sptk __libm_error_region } ;; +POW_Y_0: +// Here for y zero, x anything but zero and nan +// Set flag if x denormal +// Result is +1.0 +{ .mfi + nop.m 999 + fcmp.eq.s0 p6,p0 = f8,f0 // Sets flag if x denormal + nop.i 999 +} +{ .mfb + nop.m 999 + fma.d.s0 f8 = f1,f1,f0 + br.ret.sptk b0 +} +;; -L(POW_X_INF): -// When X is +-inf and Y is +-, IEEE returns +POW_X_INF: +// Here when X is +-inf -// overflow -// X +inf Y +inf +inf -// X -inf Y +inf +inf +// X +inf Y +inf +inf +// X -inf Y +inf +inf -// X +inf Y >0 +inf +// X +inf Y >0 +inf // X -inf Y >0, !odd integer +inf <== (-inf)^0.5 = +inf !! 
-// X -inf Y >0, odd integer -inf +// X -inf Y >0, odd integer -inf -// underflow -// X +inf Y -inf +0 -// X -inf Y -inf +0 +// X +inf Y -inf +0 +// X -inf Y -inf +0 -// X +inf Y <0 +0 -// X -inf Y <0, !odd integer +0 -// X -inf Y <0, odd integer -0 +// X +inf Y <0 +0 +// X -inf Y <0, !odd integer +0 +// X -inf Y <0, odd integer -0 // X + inf Y=+0 +1 // X + inf Y=-0 +1 @@ -1892,32 +1972,30 @@ L(POW_X_INF): // p6 == Y is a floating point number outside the integer. // Hence it is an integer and is even. -// p13 == (Y negative) +// p13 == (Y negative) // return +inf // p14 == (Y positive) // return +0 - - // p7 == Y is a floating point number within the integer range. // p9 == (int_Y = NORM_Y), Y is an integer, which may be odd or even. // p11 odd -// p13 == (Y negative) +// p13 == (Y negative) // return (sign_of_x)inf -// p14 == (Y positive) +// p14 == (Y positive) // return (sign_of_x)0 -// pxx even -// p13 == (Y negative) -// return +inf +// pxx even +// p13 == (Y negative) +// return +inf // p14 == (Y positive) -// return +0 +// return +0 // pxx == Y is not an integer -// p13 == (Y negative) +// p13 == (Y negative) // return +inf // p14 == (Y positive) // return +0 -// +// // If x=inf, test y and flag denormal { .mfi @@ -1929,207 +2007,131 @@ L(POW_X_INF): { .mfi nop.m 999 - fcmp.lt p13,p14 = POW_NORM_Y,f0 - cmp.gt.unc p6,p7 = pow_GR_exp_Y, pow_GR_10033 + fcmp.lt.s0 p13,p14 = POW_NORM_Y,f0 + cmp.gt p6,p7 = pow_GR_exp_Y, pow_GR_10033 } { .mfi nop.m 999 - fclass.m p12,p0 = f9, 0x23 + fclass.m p12,p0 = f9, 0x23 //@inf nop.i 999 } ;; - { .mfi nop.m 999 - fclass.m p15,p0 = f9, 0x07 //@zero + fclass.m p15,p0 = f9, 0x07 //@zero nop.i 999 } ;; { .mfb nop.m 999 -(p15) fmerge.s f8 = f1,f1 -(p15) br.ret.spnt b0 +(p15) fmerge.s f8 = f1,f1 // Return +1.0 if x=inf, y=0 +(p15) br.ret.spnt b0 // Exit if x=inf, y=0 } ;; - { .mfi -(p13) mov pow_GR_tag = 25 -(p14) frcpa.s1 f8,p10 = f1,f0 + nop.m 999 +(p14) frcpa.s1 f8,p10 = f1,f0 // If x=inf, y>0, assume result +inf nop.i 999 } { 
.mfb -(p14) mov pow_GR_tag = 24 -(p13) fma.s1 f8 = f0,f0,f0 -(p12) br.ret.spnt b0 -} -;; - - - -{ .mfb nop.m 999 -(p7) fcmp.eq.unc.s1 p9,p0 = POW_float_int_Y, POW_NORM_Y - nop.b 999 +(p13) fma.d.s0 f8 = f0,f0,f0 // If x=inf, y<0, assume result +0.0 +(p12) br.ret.spnt b0 // Exit if x=inf, y=inf } ;; +// Here if x=inf, and 0 < |y| < inf. Need to correct results if y odd integer. { .mfi nop.m 999 - nop.f 999 -(p9) tbit.nz.unc p11,p0 = pow_GR_sig_int_Y,0 -} -;; - -{ .mfb - nop.m 999 -(p11) fmerge.s f8 = POW_NORM_X,f8 - br.ret.sptk b0 +(p7) fcmp.eq.unc.s1 p9,p0 = POW_float_int_Y, POW_NORM_Y // Is y integer? + nop.i 999 } ;; - - -L(POW_X_0_Y_NEG): -// When X is +-0 and Y is negative, IEEE returns -// X Y answer -// +0 -odd int +inf -// -0 -odd int -inf - -// +0 !-odd int +inf -// -0 !-odd int +inf - - -// p6 == Y is a floating point number outside the integer. -// Hence it is an integer and is even. -// return +inf - -// p7 == Y is a floating point number within the integer range. -// p9 == (int_Y = NORM_Y), Y is an integer, which may be odd or even. -// p11 odd -// return (sign_of_x)inf -// p12 even -// return +inf -// p10 == Y is not an integer -// return +inf -// -// - { .mfi nop.m 999 nop.f 999 - cmp.gt.unc p6,p7 = pow_GR_exp_Y, pow_GR_10033 -} -;; - - -{ .mfi - mov pow_GR_tag = 27 -(p7) fcmp.eq.unc.s1 p9,p10 = POW_float_int_Y, POW_NORM_Y - nop.i 999 -} -;; - - -{ .mfb - nop.m 999 -(p6) frcpa.s0 f8,p13 = f1, f0 -(p6) br.cond.sptk __libm_error_region +(p9) tbit.nz.unc p11,p0 = pow_GR_sig_int_Y,0 // Test for y odd integer } ;; { .mfb nop.m 999 -(p10) frcpa.s0 f8,p13 = f1, f0 -(p10) br.cond.sptk __libm_error_region +(p11) fmerge.s f8 = POW_NORM_X,f8 // If y odd integer use sign of x + br.ret.sptk b0 // Exit for x=inf, 0 < |y| < inf } ;; +POW_X_NEG_Y_NONINT: +// When X is negative and Y is a non-integer, IEEE +// returns a qnan indefinite. 
+// We call error support with this value -{ .mib - nop.m 999 -(p9) tbit.nz.unc p11,p12 = pow_GR_sig_int_Y,0 - nop.b 999 +{ .mfb + mov pow_GR_tag = 28 + frcpa.s0 f8,p6 = f0,f0 + br.cond.sptk __libm_error_region } ;; - - +POW_X_NAN: +// Here if x=nan, y not nan { .mfi - nop.m 999 -(p12) frcpa.s0 f8,p13 = f1,f0 - nop.i 999 + nop.m 999 + fclass.m p9,p13 = f9, 0x07 // Test y=zero + nop.i 999 } ;; { .mfb - nop.m 999 -(p11) frcpa f8,p13 = f1,f8 - br.cond.sptk __libm_error_region + nop.m 999 +(p13) fma.d.s0 f8 = f8,f1,f0 +(p13) br.ret.sptk b0 // Exit if x nan, y anything but zero or nan } ;; - - - -L(POW_X_NEG_Y_NONINT): -// When X is negative and Y is a non-integer, IEEE -// returns a qnan indefinite. -// We call error support with this value - -{ .mfb - mov pow_GR_tag = 28 - frcpa f8,p6 = f0,f0 - br.cond.sptk __libm_error_region -} -;; - - - - -L(POW_X_NAN_Y_0): +POW_X_NAN_Y_0: // When X is a NAN and Y is zero, IEEE returns 1. // We call error support with this value. - { .mfi - nop.m 0 - fma.d.s0 f10 = f8,f1,f0 - nop.i 0 + nop.m 999 + fcmp.eq.s0 p6,p0 = f8,f0 // Dummy op to set invalid on snan + nop.i 999 } { .mfb - mov pow_GR_tag = 29 - fma.d.s0 f8 = f0,f0,f1 + mov pow_GR_tag = 29 + fma.d.s0 f8 = f0,f0,f1 br.cond.sptk __libm_error_region } ;; -L(POW_OVER_UNDER_X_NOT_INF): +POW_OVER_UNDER_X_NOT_INF: // p8 is TRUE for overflow // p9 is TRUE for underflow // if y is infinity, we should not over/underflow - { .mfi nop.m 999 - fcmp.eq.unc.s1 p14, p13 = POW_xsq,f1 - cmp.eq.unc p8,p9 = pow_GR_sign_Y_Gpr, r0 + fcmp.eq.s1 p14, p13 = POW_xsq,f1 // Test |x|=1 + cmp.eq p8,p9 = pow_GR_sign_Y_Gpr, r0 } ;; { .mfi nop.m 999 -(p14) fclass.m.unc p15, p0 = f9, 0x23 +(p14) fclass.m.unc p15, p0 = f9, 0x23 // If |x|=1, test y=inf nop.i 999 } { .mfi nop.m 999 -(p13) fclass.m.unc p11,p0 = f9, 0x23 +(p13) fclass.m.unc p11,p0 = f9, 0x23 // If |x| not 1, test y=inf nop.i 999 } ;; @@ -2137,31 +2139,33 @@ L(POW_OVER_UNDER_X_NOT_INF): // p15 = TRUE if |x|=1, y=inf, return +1 { .mfb nop.m 999 
-(p15) fma.d f8 = f1,f1,f0 -(p15) br.ret.spnt b0 +(p15) fma.d.s0 f8 = f1,f1,f0 // If |x|=1, y=inf, result +1 +(p15) br.ret.spnt b0 // Exit if |x|=1, y=inf } ;; .pred.rel "mutex",p8,p9 { .mfb -(p8) setf.exp f8 = pow_GR_17ones -(p9) fmerge.s f8 = f0,f0 -(p11) br.ret.sptk b0 +(p8) setf.exp f8 = pow_GR_17ones // If exp(+big), result inf +(p9) fmerge.s f8 = f0,f0 // If exp(-big), result 0 +(p11) br.ret.sptk b0 // Exit if |x| not 1, y=inf } +;; { .mfb nop.m 999 nop.f 999 - br.cond.sptk L(POW_OVER_UNDER_ERROR) + br.cond.sptk POW_OVER_UNDER_ERROR // Branch if y not inf } ;; -L(POW_Y_NAN): -// Is x = +1 then result is +1, else result is quiet Y +POW_Y_NAN: +// Here if y=nan, x anything +// If x = +1 then result is +1, else result is quiet Y { .mfi nop.m 999 - fcmp.eq.s1 p10,p9 = POW_NORM_X, f1 + fcmp.eq.s1 p10,p9 = POW_NORM_X, f1 nop.i 999 } ;; @@ -2175,148 +2179,117 @@ L(POW_Y_NAN): { .mfi nop.m 999 -(p10) fma.d f8 = f1,f1,f0 +(p10) fma.d.s0 f8 = f1,f1,f0 nop.i 999 } { .mfb nop.m 999 -(p9) fma.d f8 = f9,f8,f0 - br.ret.sptk b0 +(p9) fma.d.s0 f8 = f9,f8,f0 + br.ret.sptk b0 // Exit y=nan } ;; -L(POW_OVER_UNDER_ERROR): +POW_OVER_UNDER_ERROR: +// Here if we have overflow or underflow. 
+// Enter with p12 true if x negative and y odd int to force -0 or -inf { .mfi - nop.m 999 - fmerge.s f10 = POW_NORM_X,POW_NORM_X - nop.i 999 -} -{ .mfi - sub pow_GR_17ones_m1 = pow_GR_17ones, r0, 1 - nop.f 999 - mov pow_GR_one = 0x1 + sub pow_GR_17ones_m1 = pow_GR_17ones, r0, 1 + nop.f 999 + mov pow_GR_one = 0x1 } ;; -// overflow +// overflow, force inf with O flag { .mmb -(p8) mov pow_GR_tag = 24 -(p8) setf.exp f11 = pow_GR_17ones_m1 +(p8) mov pow_GR_tag = 24 +(p8) setf.exp POW_tmp = pow_GR_17ones_m1 nop.b 999 } ;; - -// underflow +// underflow, force zero with I, U flags { .mmi -(p9) mov pow_GR_tag = 25 -(p9) setf.exp f11 = pow_GR_one +(p9) mov pow_GR_tag = 25 +(p9) setf.exp POW_tmp = pow_GR_one nop.i 999 } ;; - -// p12 x is negative and y is an odd integer - - { .mfi nop.m 999 - fma.d f8 = f11, f11, f0 + fma.d.s0 f8 = POW_tmp, POW_tmp, f0 nop.i 999 } ;; +// p12 x is negative and y is an odd integer, change sign of result { .mfi nop.m 999 -(p12) fmerge.ns f8 = f8, f8 +(p12) fnma.d.s0 f8 = POW_tmp, POW_tmp, f0 nop.i 999 } ;; +GLOBAL_LIBM_END(pow) -.endp pow -ASM_SIZE_DIRECTIVE(pow) - - -// Stack operations when calling error support. -// (1) (2) (3) (call) (4) -// sp -> + psp -> + psp -> + sp -> + -// | | | | -// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 -// | | | | -// | <-GR_Y Y2->| Y2 ->| <- GR_Y | -// | | | | -// | | <- GR_X X1 ->| | -// | | | | -// sp-64 -> + sp -> + sp -> + + -// save ar.pfs save b0 restore gp -// save gp restore ar.pfs - - +LOCAL_LIBM_ENTRY(__libm_error_region) -.proc __libm_error_region -__libm_error_region: - -// Answer is inf for overflow and 0 for underflow. 
.prologue -// (1) { .mfi - add GR_Parameter_Y=-32,sp // Parameter 2 value + add GR_Parameter_Y=-32,sp // Parameter 2 value nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + add sp=-64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP=gp // Save gp };; - -// (2) { .mmi stfd [GR_Parameter_Y] = POW_NORM_Y,16 // STORE Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; .body -// (3) { .mib - stfd [GR_Parameter_X] = POW_NORM_X // STORE Parameter 1 on stack + stfd [GR_Parameter_X] = POW_NORM_X // STORE Parameter 1 on stack add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address - nop.b 0 + nop.b 0 } { .mib - stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack + stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; + { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; -// (4) { .mmi - ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack + ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp - add sp = 64,sp // Restore stack pointer - mov b0 = GR_SAVE_B0 // Restore return address + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address };; + { .mib - mov gp = GR_SAVE_GP // Restore gp - mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - br.ret.sptk b0 // Return + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) .type 
__libm_error_support#,@function .global __libm_error_support# + diff --git a/sysdeps/ia64/fpu/e_powf.S b/sysdeps/ia64/fpu/e_powf.S index d464058262..275843f1e2 100644 --- a/sysdeps/ia64/fpu/e_powf.S +++ b/sysdeps/ia64/fpu/e_powf.S @@ -1,10 +1,10 @@ .file "powf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,30 +35,39 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial version -// 2/03/00 Added p12 to definite over/under path. With odd power we did not +// 02/02/00 Initial version +// 02/03/00 Added p12 to definite over/under path. With odd power we did not // maintain the sign of x in this path. 
-// 4/04/00 Unwind support added -// 4/19/00 pow(+-1,inf) now returns NaN -// pow(+-val, +-inf) returns 0 or inf, but now does not call error support +// 04/04/00 Unwind support added +// 04/19/00 pow(+-1,inf) now returns NaN +// pow(+-val, +-inf) returns 0 or inf, but now does not call error +// support // Added s1 to fcvt.fx because invalid flag was incorrectly set. -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. -// 9/07/00 Improved performance by eliminating bank conflicts and other stalls, +// 09/07/00 Improved performance by eliminating bank conflicts and other stalls, // and tweaking the critical path -// 9/08/00 Per c99, pow(+-1,inf) now returns 1, and pow(+1,nan) returns 1 -// 9/28/00 Updated NaN**0 path -// 1/20/01 Fixed denormal flag settings. -// 2/12/01 Improved speed. +// 09/08/00 Per c99, pow(+-1,inf) now returns 1, and pow(+1,nan) returns 1 +// 09/28/00 Updated NaN**0 path +// 01/20/01 Fixed denormal flag settings. +// 02/13/01 Improved speed. +// 03/19/01 Reordered exp polynomial to improve speed and eliminate monotonicity +// problem in round up, down, and to zero modes. Also corrected +// overflow result when x negative, y odd in round up, down, zero. +// 06/14/01 Added brace missing from bundle +// 12/10/01 Corrected case where x negative, 2^23 <= |y| < 2^24, y odd integer. +// 02/08/02 Fixed overflow/underflow cases that were not calling error support. +// 05/20/02 Cleaned up namespace and sf0 syntax +// 08/29/02 Improved Itanium 2 performance +// 02/10/03 Reordered header: .section, .global, .proc, .align // // API //============================================================== -// double pow(double) -// float powf(float) +// float powf(float x, float y) // // Overview of operation //============================================================== @@ -67,51 +76,51 @@ // 1. Log(x) // 2. 
y Log(x) // 3. exp(y log(x)) -// +// // This means we work with the absolute value of x and merge in the sign later. // Log(x) = G + delta + r -rsq/2 + p // G,delta depend on the exponent of x and table entries. The table entries are // indexed by the exponent of x, called K. -// +// // The G and delta come out of the reduction; r is the reduced x. -// +// // B = frcpa(x) // xB-1 is small means that B is the approximate inverse of x. -// +// // Log(x) = Log( (1/B)(Bx) ) // = Log(1/B) + Log(Bx) // = Log(1/B) + Log( 1 + (Bx-1)) -// +// // x = 2^K 1.x_1x_2.....x_52 -// B= frcpa(x) = 2^-k Cm +// B= frcpa(x) = 2^-k Cm // Log(1/B) = Log(1/(2^-K Cm)) // Log(1/B) = Log((2^K/ Cm)) // Log(1/B) = K Log(2) + Log(1/Cm) -// +// // Log(x) = K Log(2) + Log(1/Cm) + Log( 1 + (Bx-1)) -// +// // If you take the significand of x, set the exponent to true 0, then Cm is // the frcpa. We tabulate the Log(1/Cm) values. There are 256 of them. // The frcpa table is indexed by 8 bits, the x_1 thru x_8. // m = x_1x_2...x_8 is an 8-bit index. -// +// // Log(1/Cm) = log(1/frcpa(1+m/256)) where m goes from 0 to 255. -// +// // We tabluate as two doubles, T and t, where T +t is the value itself. -// +// // Log(x) = (K Log(2)_hi + T) + (Log(2)_hi + t) + Log( 1 + (Bx-1)) // Log(x) = G + delta + Log( 1 + (Bx-1)) -// +// // The Log( 1 + (Bx-1)) can be calculated as a series in r = Bx-1. 
-// +// // Log( 1 + (Bx-1)) = r - rsq/2 + p -// +// // Then, -// +// // yLog(x) = yG + y delta + y(r-rsq/2) + yp // yLog(x) = Z1 + e3 + Z2 + Z3 + (e2 + e3) -// -// +// +// // exp(yLog(x)) = exp(Z1 + Z2 + Z3) exp(e1 + e2 + e3) // // @@ -133,7 +142,7 @@ // exp(r) = exp(Z - N log2/128) // // r = s + d = (Z - N (log2/128)_hi) -N (log2/128)_lo -// = Z - N (log2/128) +// = Z - N (log2/128) // // Z = s+d +N (log2/128) // @@ -149,22 +158,22 @@ // n log2/128 = n_7n_6n_5 log2/8 + n_4n_3n_2n_1 log2/128 // n log2/128 = I2 log2/8 + I1 log2/128 // -// N log2/128 = M log2 + I2 log2/8 + I1 log2/128 +// N log2/128 = M log2 + I2 log2/8 + I1 log2/128 // // exp(Z) = exp(s) (1+d) exp(log(2^M) + log(2^I2/8) + log(2^I1/128)) // exp(Z) = exp(s) (1+d1) (1+d2)(2^M) 2^I2/8 2^I1/128 // exp(Z) = exp(s) f1 f2 (2^M) 2^I2/8 2^I1/128 // // I1, I2 are table indices. Use a series for exp(s). -// Then get exp(Z) +// Then get exp(Z) // // exp(yLog(x)) = exp(Z1 + Z2 + Z3) exp(e1 + e2 + e3) -// exp(yLog(x)) = exp(Z) exp(Z3) f3 -// exp(yLog(x)) = exp(Z)f3 exp(Z3) -// exp(yLog(x)) = A exp(Z3) +// exp(yLog(x)) = exp(Z) exp(Z3) f3 +// exp(yLog(x)) = exp(Z)f3 exp(Z3) +// exp(yLog(x)) = A exp(Z3) // // We actually calculate exp(Z3) -1. -// Then, +// Then, // exp(yLog(x)) = A + A( exp(Z3) -1) // @@ -175,142 +184,146 @@ // ============== // The operation (K*log2_hi) must be exact. K is the true exponent of x. // If we allow gradual underflow (denormals), K can be represented in 12 bits -// (as a two's complement number). We assume 13 bits as an engineering precaution. -// +// (as a two's complement number). We assume 13 bits as an engineering +// precaution. +// // +------------+----------------+-+ // | 13 bits | 50 bits | | // +------------+----------------+-+ // 0 1 66 // 2 34 -// +// // So we want the lsb(log2_hi) to be 2^-50 // We get log2 as a quad-extended (15-bit exponent, 128-bit significand) -// +// // 0 fffe b17217f7d1cf79ab c9e3b39803f2f6af (4...) 
-// +// // Consider numbering the bits left to right, starting at 0 thru 127. // Bit 0 is the 2^-1 bit; bit 49 is the 2^-50 bit. -// +// // ...79ab // 0111 1001 1010 1011 // 44 // 89 -// -// So if we shift off the rightmost 14 bits, then (shift back only +// +// So if we shift off the rightmost 14 bits, then (shift back only // the top half) we get -// +// // 0 fffe b17217f7d1cf4000 e6af278ece600fcb dabc000000000000 -// +// // Put the right 64-bit signficand in an FR register, convert to double; // it is exact. Put the next 128 bits into a quad register and round to double. // The true exponent of the low part is -51. -// +// // hi is 0 fffe b17217f7d1cf4000 // lo is 0 ffcc e6af278ece601000 -// +// // Convert to double memory format and get -// +// // hi is 0x3fe62e42fefa39e8 -// lo is 0x3cccd5e4f1d9cc02 -// +// lo is 0x3cccd5e4f1d9cc02 +// // log2_hi + log2_lo is an accurate value for log2. -// -// +// +// // The T and t values // ================== // A similar method is used to generate the T and t values. -// +// // K * log2_hi + T must be exact. -// +// // Smallest T,t // ---------- -// The smallest T,t is +// The smallest T,t is // T t -// data8 0x3f60040155d58800, 0x3c93bce0ce3ddd81 log(1/frcpa(1+0/256))= +1.95503e-003 -// +// 0x3f60040155d58800, 0x3c93bce0ce3ddd81 log(1/frcpa(1+0/256))= +1.95503e-003 +// // The exponent is 0x3f6 (biased) or -9 (true). // For the smallest T value, what we want is to clip the significand such that -// when it is shifted right by 9, its lsb is in the bit for 2^-51. The 9 is the specific -// for the first entry. In general, it is 0xffff - (biased 15-bit exponent). +// when it is shifted right by 9, its lsb is in the bit for 2^-51. The 9 is the +// specific for the first entry. In general, it is 0xffff - (biased 15-bit +// exponent). -// Independently, what we have calculated is the table value as a quad precision number. +// Independently, what we have calculated is the table value as a quad +// precision number. 
// Table entry 1 is // 0 fff6 80200aaeac44ef38 338f77605fdf8000 -// +// // We store this quad precision number in a data structure that is -// sign: 1 +// sign: 1 // exponent: 15 // signficand_hi: 64 (includes explicit bit) // signficand_lo: 49 // Because the explicit bit is included, the significand is 113 bits. -// +// // Consider significand_hi for table entry 1. -// -// +// +// // +-+--- ... -------+--------------------+ // | | // +-+--- ... -------+--------------------+ // 0 1 4444444455555555556666 // 2345678901234567890123 -// +// // Labeled as above, bit 0 is 2^0, bit 1 is 2^-1, etc. // Bit 42 is 2^-42. If we shift to the right by 9, the bit in // bit 42 goes in 51. -// +// // So what we want to do is shift bits 43 thru 63 into significand_lo. -// This is shifting bit 42 into bit 63, taking care to retain the shifted-off bits. -// Then shifting (just with signficaand_hi) back into bit 42. -// -// The shift_value is 63-42 = 21. In general, this is +// This is shifting bit 42 into bit 63, taking care to retain shifted-off bits. +// Then shifting (just with signficaand_hi) back into bit 42. +// +// The shift_value is 63-42 = 21. In general, this is // 63 - (51 -(0xffff - 0xfff6)) // For this example, it is // 63 - (51 - 9) = 63 - 42 = 21 -// -// This means we are shifting 21 bits into significand_lo. We must maintain more -// that a 128-bit signficand not to lose bits. So before the shift we put the 128-bit -// significand into a 256-bit signficand and then shift. +// +// This means we are shifting 21 bits into significand_lo. We must maintain more +// that a 128-bit signficand not to lose bits. So before the shift we put the +// 128-bit significand into a 256-bit signficand and then shift. // The 256-bit significand has four parts: hh, hl, lh, and ll. 
-// +// // Start off with // hh hl lh ll // <64> <49><15_0> <64_0> <64_0> -// +// // After shift by 21 (then return for significand_hi), // <43><21_0> <21><43> <6><58_0> <64_0> -// +// // Take the hh part and convert to a double. There is no rounding here. -// The conversion is exact. The true exponent of the high part is the same as the -// true exponent of the input quad. -// -// We have some 64 plus significand bits for the low part. In this example, we have -// 70 bits. We want to round this to a double. Put them in a quad and then do a quad fnorm. -// For this example the true exponent of the low part is +// The conversion is exact. The true exponent of the high part is the same as +// the true exponent of the input quad. +// +// We have some 64 plus significand bits for the low part. In this example, we +// have 70 bits. We want to round this to a double. Put them in a quad and then +// do a quad fnorm. +// For this example the true exponent of the low part is // true_exponent_of_high - 43 = true_exponent_of_high - (64-21) -// In general, this is -// true_exponent_of_high - (64 - shift_value) -// -// +// In general, this is +// true_exponent_of_high - (64 - shift_value) +// +// // Largest T,t // ---------- // The largest T,t is -// data8 0x3fe62643fecf9742, 0x3c9e3147684bd37d log(1/frcpa(1+255/256))= +6.92171e-001 -// +// 0x3fe62643fecf9742, 0x3c9e3147684bd37d log(1/frcpa(1+255/256))=+6.92171e-001 +// // Table entry 256 is // 0 fffe b1321ff67cba178c 51da12f4df5a0000 -// -// The shift value is +// +// The shift value is // 63 - (51 -(0xffff - 0xfffe)) = 13 -// -// The true exponent of the low part is +// +// The true exponent of the low part is // true_exponent_of_high - (64 - shift_value) // -1 - (64-13) = -52 // Biased as a double, this is 0x3cb -// -// -// +// +// +// // So then lsb(T) must be >= 2^-51 // msb(Klog2_hi) <= 2^12 -// +// // +--------+---------+ // | 51 bits | <== largest T // +--------+---------+ @@ -320,7 +333,6 @@ // 
+------------+----------------+-+ - // Special Cases //============================================================== @@ -385,63 +397,66 @@ // X any Y =0 +1 -#include "libm_support.h" - // Assembly macros //============================================================== // integer registers used -pow_AD_Tt = r33 -pow_GR_FFF7 = r34 -pow_GR_exp_Y = r34 // duplicate -pow_GR_17ones = r35 - -pow_AD_P = r36 -pow_AD_Q = r37 -pow_AD_tbl1 = r38 -pow_AD_tbl2 = r39 -pow_GR_exp_X = r40 -pow_GR_true_exp_X = r40 // duplicate - -pow_GR_offset = r41 -pow_GR_exp_Xm1 = r42 -pow_GR_sig_X = r43 -pow_GR_signexp_X = r44 - -pow_GR_signexp_Xm1 = r46 -pow_GR_int_W1 = r47 -pow_GR_int_W2 = r48 -pow_GR_int_N = r49 -pow_GR_index1 = r50 - -pow_GR_index2 = r51 -pow_AD_T1 = r52 -pow_AD_T2 = r53 -pow_GR_gt_ln = r53 // duplicate -pow_int_GR_M = r54 -pow_GR_10033 = r55 - -pow_GR_16ones = r56 -pow_GR_sig_int_Y = r57 -pow_GR_sign_Y_Gpr = r58 -pow_GR_17ones_m1 = r59 -pow_GR_one = r60 -pow_GR_sign_Y = r60 - -pow_GR_signexp_Y_Gpr = r61 -pow_GR_exp_Y_Gpr = r62 -pow_GR_true_exp_Y_Gpr = r63 -pow_GR_signexp_Y = r64 - -GR_SAVE_B0 = r65 -GR_SAVE_GP = r66 -GR_SAVE_PFS = r67 - -GR_Parameter_X = r68 -GR_Parameter_Y = r69 -GR_Parameter_RESULT = r70 -pow_GR_tag = r71 +pow_GR_signexp_X = r14 +pow_GR_17ones = r15 +pow_AD_P = r16 +pow_GR_exp_2tom8 = r17 +pow_GR_sig_X = r18 +pow_GR_10033 = r19 +pow_GR_16ones = r20 + +pow_AD_Tt = r21 +pow_GR_exp_X = r22 +pow_AD_Q = r23 +pow_GR_true_exp_X = r24 +pow_GR_y_zero = r25 + +pow_GR_exp_Y = r26 +pow_AD_tbl1 = r27 +pow_AD_tbl2 = r28 +pow_GR_offset = r29 +pow_GR_exp_Xm1 = r30 +pow_GR_xneg_yodd = r31 + +pow_GR_signexp_Xm1 = r35 +pow_GR_int_W1 = r36 +pow_GR_int_W2 = r37 +pow_GR_int_N = r38 +pow_GR_index1 = r39 +pow_GR_index2 = r40 + +pow_AD_T1 = r41 +pow_AD_T2 = r42 +pow_int_GR_M = r43 +pow_GR_sig_int_Y = r44 +pow_GR_sign_Y_Gpr = r45 + +pow_GR_17ones_m1 = r46 +pow_GR_one = r47 +pow_GR_sign_Y = r48 +pow_GR_signexp_Y_Gpr = r49 +pow_GR_exp_Y_Gpr = r50 + +pow_GR_true_exp_Y_Gpr = r51 
+pow_GR_signexp_Y = r52 +pow_GR_x_one = r53 +pow_GR_big_pos = r55 + +pow_GR_big_neg = r56 + +GR_SAVE_B0 = r50 +GR_SAVE_GP = r51 +GR_SAVE_PFS = r52 + +GR_Parameter_X = r53 +GR_Parameter_Y = r54 +GR_Parameter_RESULT = r55 +pow_GR_tag = r56 // floating point registers used @@ -464,7 +479,8 @@ POW_log2_lo = f43 POW_r = f44 POW_Q0_half = f45 -POW_Q1 = f46 +POW_Q1 = f46 +POW_tmp = f47 POW_log2_hi = f48 POW_Q4 = f49 POW_P1 = f50 @@ -476,6 +492,7 @@ POW_Yrcub = f54 POW_log2_by_128_lo = f55 POW_v6 = f56 +POW_xsq = f57 POW_v4 = f58 POW_v2 = f59 POW_T = f60 @@ -484,6 +501,7 @@ POW_Tt = f61 POW_RSHF = f62 POW_v21ps = f63 POW_s4 = f64 +POW_twoV = f65 POW_U = f66 POW_G = f67 @@ -533,44 +551,36 @@ POW_1ps = f103 POW_A = f104 POW_es = f105 +POW_Xp1 = f106 POW_int_K = f107 POW_K = f108 POW_f123 = f109 POW_Gpr = f110 -POW_Y_Gpr = f111 +POW_Y_Gpr = f111 POW_int_Y = f112 +POW_2Mqp1 = f113 POW_float_int_Y = f116 POW_ftz_urm_f8 = f117 POW_wre_urm_f8 = f118 -POW_abs_A = f119 -POW_gt_pln = f120 - -POW_xsq = f121 - -POW_twoV = f122 -POW_Xp1 = f123 +POW_big_neg = f119 +POW_big_pos = f120 // Data tables //============================================================== -#ifdef _LIBC -.rodata -#else -.data -#endif +RODATA .align 16 -pow_table_P: -ASM_TYPE_DIRECTIVE(pow_table_P,@object) +LOCAL_OBJECT_START(pow_table_P) data8 0x8000F7B249FF332D, 0x0000BFFC // P_5 data8 0xAAAAAAA9E7902C7F, 0x0000BFFC // P_3 data8 0x80000000000018E5, 0x0000BFFD // P_1 data8 0xb8aa3b295c17f0bc, 0x00004006 // inv_ln2_by_128 - - +// +// data8 0x3FA5555555554A9E // Q_2 data8 0x3F8111124F4DD9F9 // Q_3 data8 0x3FE0000000000000 // Q_0 @@ -580,20 +590,18 @@ data8 0x43e8000000000000 // Right shift constant for exp data8 0xc9e3b39803f2f6af, 0x00003fb7 // ln2_by_128_lo data8 0x0000000000000000 // pad to eliminate bank conflicts with pow_table_Q data8 0x0000000000000000 // pad to eliminate bank conflicts with pow_table_Q -ASM_SIZE_DIRECTIVE(pow_table_P) +LOCAL_OBJECT_END(pow_table_P) -pow_table_Q: 
-ASM_TYPE_DIRECTIVE(pow_table_Q,@object) +LOCAL_OBJECT_START(pow_table_Q) data8 0x9249FE7F0DC423CF, 0x00003FFC // P_4 data8 0xCCCCCCCC4ED2BA7F, 0x00003FFC // P_2 data8 0xAAAAAAAAAAAAB505, 0x00003FFD // P_0 data8 0x3fe62e42fefa39e8, 0x3cccd5e4f1d9cc02 // log2 hi lo = +6.93147e-001 data8 0xb17217f7d1cf79ab, 0x00003ff7 // ln2_by_128_hi -ASM_SIZE_DIRECTIVE(pow_table_Q) +LOCAL_OBJECT_END(pow_table_Q) -pow_Tt: -ASM_TYPE_DIRECTIVE(pow_Tt,@object) +LOCAL_OBJECT_START(pow_Tt) data8 0x3f60040155d58800, 0x3c93bce0ce3ddd81 // log(1/frcpa(1+0/256))= +1.95503e-003 data8 0x3f78121214586a00, 0x3cb540e0a5cfc9bc // log(1/frcpa(1+1/256))= +5.87661e-003 data8 0x3f841929f9683200, 0x3cbdf1d57404da1f // log(1/frcpa(1+2/256))= +9.81362e-003 @@ -850,13 +858,12 @@ data8 0x3fe5f673c61a2ed0, 0x3caa385eef5f2789 // log(1/frcpa(1+252/256))= +6.863 data8 0x3fe6065bea385924, 0x3cb11624f165c5b4 // log(1/frcpa(1+253/256))= +6.88276e-001 data8 0x3fe6164bfa7cc068, 0x3cbad884f87073fa // log(1/frcpa(1+254/256))= +6.90222e-001 data8 0x3fe62643fecf9740, 0x3cb78c51da12f4df // log(1/frcpa(1+255/256))= +6.92171e-001 -ASM_SIZE_DIRECTIVE(pow_Tt) +LOCAL_OBJECT_END(pow_Tt) // Table 1 is 2^(index_1/128) where // index_1 goes from 0 to 15 -pow_tbl1: -ASM_TYPE_DIRECTIVE(pow_tbl1,@object) +LOCAL_OBJECT_START(pow_tbl1) data8 0x8000000000000000 , 0x00003FFF data8 0x80B1ED4FD999AB6C , 0x00003FFF data8 0x8164D1F3BC030773 , 0x00003FFF @@ -873,13 +880,12 @@ data8 0x88980E8092DA8527 , 0x00003FFF data8 0x8955EE03618E5FDD , 0x00003FFF data8 0x8A14D575496EFD9A , 0x00003FFF data8 0x8AD4C6452C728924 , 0x00003FFF -ASM_SIZE_DIRECTIVE(pow_tbl1) +LOCAL_OBJECT_END(pow_tbl1) // Table 2 is 2^(index_1/8) where // index_2 goes from 0 to 7 -pow_tbl2: -ASM_TYPE_DIRECTIVE(pow_tbl2,@object) +LOCAL_OBJECT_START(pow_tbl2) data8 0x8000000000000000 , 0x00003FFF data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF data8 0x9837F0518DB8A96F , 0x00003FFF @@ -888,372 +894,287 @@ data8 0xB504F333F9DE6484 , 0x00003FFF data8 0xC5672A115506DADD , 0x00003FFF data8 
0xD744FCCAD69D6AF4 , 0x00003FFF data8 0xEAC0C6E7DD24392F , 0x00003FFF -ASM_SIZE_DIRECTIVE(pow_tbl2) - -.global powf +LOCAL_OBJECT_END(pow_tbl2) .section .text -.proc powf -.align 32 - -powf: +GLOBAL_LIBM_ENTRY(powf) +// Get exponent of x. Will be used to calculate K. { .mfi - alloc r32=ar.pfs,1,35,4,0 - fms.s1 POW_Xm1 = f8,f1,f1 // Will be used for r1 if x>0 - mov pow_GR_17ones = 0x1FFFF + getf.exp pow_GR_signexp_X = f8 + fms.s1 POW_Xm1 = f8,f1,f1 // Will be used for r1 if x>0 + mov pow_GR_17ones = 0x1FFFF } { .mfi -(p0) addl pow_AD_P = @ltoff(pow_table_P), gp - fma.s1 POW_Xp1 = f8,f1,f1 // Will be used for r1 if x<0 + addl pow_AD_P = @ltoff(pow_table_P), gp + fma.s1 POW_Xp1 = f8,f1,f1 // Will be used for r1 if x<0 nop.i 999 ;; } - -// Get exponent of x. Will be used to calculate K. +// Get significand of x. Will be used to get index to fetch T, Tt. { .mfi - getf.exp pow_GR_signexp_X = f8 - frcpa.s1 POW_B, p6 = f1,f8 + getf.sig pow_GR_sig_X = f8 + frcpa.s1 POW_B, p6 = f1,f8 nop.i 999 } { .mfi ld8 pow_AD_P = [pow_AD_P] - fma.s1 POW_NORM_X = f8,f1,f0 - mov pow_GR_FFF7 = 0xFFF7 + fma.s1 POW_NORM_X = f8,f1,f0 + mov pow_GR_exp_2tom8 = 0xFFF7 } ;; - - -// Get significand of x. Will be used to get index to fetch T, Tt. 
// p13 = TRUE ==> X is unorm // DOUBLE 0x10033 exponent limit at which y is an integer -// SINGLE 0x10016 { .mfi - getf.sig pow_GR_sig_X = f8 - fclass.m p13,p0 = f8, 0x0b // Test for x unorm - addl pow_GR_10033 = 0x10033, r0 + nop.m 999 + fclass.m p13,p0 = f8, 0x0b // Test for x unorm + addl pow_GR_10033 = 0x10033, r0 } { .mfi mov pow_GR_16ones = 0xFFFF - fma.s1 POW_NORM_Y = f9,f1,f0 + fma.s1 POW_NORM_Y = f9,f1,f0 nop.i 999 } ;; - // p14 = TRUE ==> X is ZERO { .mfi adds pow_AD_Tt = pow_Tt - pow_table_P, pow_AD_P - fclass.m p14,p15 = f8, 0x07 - and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones + fclass.m p14,p0 = f8, 0x07 + and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones } { .mfi - adds pow_AD_Q = pow_table_Q - pow_table_P, pow_AD_P + adds pow_AD_Q = pow_table_Q - pow_table_P, pow_AD_P nop.f 999 nop.i 999 } ;; { .mfi - ldfe POW_P5 = [pow_AD_P], 16 - fcmp.lt.s1 p8,p9 = f8, f0 // Test for x<0 - shl pow_GR_offset = pow_GR_sig_X, 1 + ldfe POW_P5 = [pow_AD_P], 16 + fcmp.lt.s1 p8,p9 = f8, f0 // Test for x<0 + nop.i 999 } { .mib - ldfe POW_P4 = [pow_AD_Q], 16 - sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones -(p13) br.cond.spnt L(POW_X_DENORM) + ldfe POW_P4 = [pow_AD_Q], 16 + sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones +(p13) br.cond.spnt POW_X_DENORM } ;; - // Continue normal and denormal paths here -L(POW_COMMON): +POW_COMMON: // p11 = TRUE ==> Y is a NAN { .mfi - ldfe POW_P3 = [pow_AD_P], 16 - fclass.m.unc p11,p0 = f9, 0xc3 - shr.u pow_GR_offset = pow_GR_offset,56 + ldfe POW_P3 = [pow_AD_P], 16 + fclass.m p11,p0 = f9, 0xc3 + nop.i 999 } { .mfi - ldfe POW_P2 = [pow_AD_Q], 16 + ldfe POW_P2 = [pow_AD_Q], 16 nop.f 999 - nop.i 999 + mov pow_GR_y_zero = 0 } ;; - - -// Compute xsq to decide later if |x|=1 -// p11 = TRUE ==> Y is a NaN +// Note POW_Xm1 and POW_r1 are used interchangably { .mfi - setf.sig POW_int_K = pow_GR_true_exp_X -(p15) fms.s1 POW_r = POW_B, POW_NORM_X,f1 - shladd pow_AD_Tt = pow_GR_offset, 4, pow_AD_Tt + alloc r32=ar.pfs,2,19,4,0 + fms.s1 
POW_r = POW_B, POW_NORM_X,f1 + nop.i 999 } { .mfi - nop.m 999 -(p8) fnma.s1 POW_Xm1 = POW_Xp1,f1,f0 + setf.sig POW_int_K = pow_GR_true_exp_X +(p8) fnma.s1 POW_Xm1 = POW_Xp1,f1,f0 nop.i 999 } ;; - - -// p12 = TRUE ==> X is ZERO and Y is ZERO +// p12 = TRUE if Y is ZERO +// Compute xsq to decide later if |x|=1 { .mfi - ldfe POW_P1 = [pow_AD_P], 16 -(p14) fclass.m.unc p12,p0 = f9, 0x07 - nop.i 999 + ldfe POW_P1 = [pow_AD_P], 16 + fclass.m p12,p0 = f9, 0x07 + shl pow_GR_offset = pow_GR_sig_X, 1 } { .mfb - ldfe POW_P0 = [pow_AD_Q], 16 + ldfe POW_P0 = [pow_AD_Q], 16 fma.s1 POW_xsq = POW_NORM_X, POW_NORM_X, f0 -(p11) br.cond.spnt L(POW_Y_NAN) +(p11) br.cond.spnt POW_Y_NAN // Branch if y=nan } ;; - -.pred.rel "mutex",p8,p9 // Get exponent of |x|-1 to use in comparison to 2^-8 -{ .mmf -(p8) getf.exp pow_GR_signexp_Xm1 = POW_Xp1 -(p9) getf.exp pow_GR_signexp_Xm1 = POW_Xm1 - fcvt.fx.s1 POW_int_Y = POW_NORM_Y +{ .mfi + getf.exp pow_GR_signexp_Xm1 = POW_Xm1 + fcvt.fx.s1 POW_int_Y = POW_NORM_Y + shr.u pow_GR_offset = pow_GR_offset,56 } ;; - // p11 = TRUE ==> X is a NAN { .mfi ldfpd POW_log2_hi, POW_log2_lo = [pow_AD_Q], 16 - fclass.m.unc p11,p0 = f8, 0xc3 - nop.i 999 + fclass.m p11,p0 = f8, 0xc3 + shladd pow_AD_Tt = pow_GR_offset, 4, pow_AD_Tt } -{ .mib - ldfpd POW_T, POW_Tt = [pow_AD_Tt], 16 - nop.i 999 -(p12) br.cond.spnt L(POW_X_0_Y_0) +{ .mfi + ldfe POW_inv_log2_by_128 = [pow_AD_P], 16 + fma.s1 POW_delta = f0,f0,f0 // delta=0 in case |x| near 1 +(p12) mov pow_GR_y_zero = 1 } ;; - -// p14 = TRUE ==> X is zero -// p15 = TRUE ==> X is zero AND Y is negative -// p10 = TRUE ==> X is zero AND Y is >= zero { .mfi - ldfe POW_inv_log2_by_128 = [pow_AD_P], 16 -(p14) fcmp.lt.unc.s1 p15, p10 = f9,f0 - nop.i 999 + ldfpd POW_Q2, POW_Q3 = [pow_AD_P], 16 + fma.s1 POW_G = f0,f0,f0 // G=0 in case |x| near 1 + and pow_GR_exp_Xm1 = pow_GR_signexp_Xm1, pow_GR_17ones } -{ .mfi - nop.m 999 - nop.f 999 - and pow_GR_exp_Xm1 = pow_GR_signexp_Xm1, pow_GR_17ones -} ;; - // Determine if we will use the 
|x| near 1 path (p6) or normal path (p7) -// p12 = TRUE ==> X is a NAN and Y is a zero -// p13 = TRUE ==> X is a NAN and Y is anything else { .mfi - getf.exp pow_GR_signexp_Y = POW_NORM_Y -(p11) fclass.m.unc p12,p13 = f9, 0x07 - cmp.lt.unc p6,p7 = pow_GR_exp_Xm1, pow_GR_FFF7 + getf.exp pow_GR_signexp_Y = POW_NORM_Y + nop.f 999 + cmp.lt p6,p7 = pow_GR_exp_Xm1, pow_GR_exp_2tom8 } -{ .mfi - ldfpd POW_Q2, POW_Q3 = [pow_AD_P], 16 - fma.s1 POW_rsq = POW_r, POW_r,f0 - nop.i 999 -;; +{ .mfb + ldfpd POW_T, POW_Tt = [pow_AD_Tt], 16 + fma.s1 POW_rsq = POW_r, POW_r,f0 +(p11) br.cond.spnt POW_X_NAN // Branch if x=nan and y not nan } +;; // If on the x near 1 path, assign r1 to r and r1*r1 to rsq { .mfi - ldfpd POW_Q0_half, POW_Q1 = [pow_AD_P], 16 -(p6) fma.s1 POW_r = POW_r1, f1, f0 - nop.i 999 -} -{ .mfi - nop.m 999 -(p6) fma.s1 POW_rsq = POW_r1, POW_r1, f0 + ldfpd POW_Q0_half, POW_Q1 = [pow_AD_P], 16 +(p6) fma.s1 POW_r = POW_r1, f1, f0 nop.i 999 -;; -} - - -{ .mfi - ldfpd POW_Q4, POW_RSHF = [pow_AD_P], 16 -(p7) fma.s1 POW_v6 = POW_r, POW_P5, POW_P4 - and pow_GR_exp_Y = pow_GR_signexp_Y, pow_GR_17ones } { .mfb nop.m 999 -(p6) fma.s1 POW_v6 = POW_r1, POW_P5, POW_P4 -(p12) br.cond.spnt L(POW_X_NAN_Y_0) +(p6) fma.s1 POW_rsq = POW_r1, POW_r1, f0 +(p14) br.cond.spnt POW_X_0 // Branch if x zero and y not nan } ;; - { .mfi - nop.m 999 -(p7) fma.s1 POW_v4 = POW_P3, POW_r, POW_P2 - andcm pow_GR_sign_Y = pow_GR_signexp_Y, pow_GR_17ones + ldfpd POW_Q4, POW_RSHF = [pow_AD_P], 16 +(p7) fma.s1 POW_v6 = POW_r, POW_P5, POW_P4 + nop.i 999 } -{ .mfb +{ .mfi nop.m 999 -(p6) fma.s1 POW_v4 = POW_P3, POW_r1, POW_P2 -(p12) br.cond.spnt L(POW_X_NAN_Y_0) +(p6) fma.s1 POW_v6 = POW_r1, POW_P5, POW_P4 + nop.i 999 } ;; { .mfi nop.m 999 - fcvt.xf POW_K = POW_int_K +(p7) fma.s1 POW_v4 = POW_P3, POW_r, POW_P2 nop.i 999 } -{ .mfb - nop.m 999 -(p13) fma.s f8 = f8,f1,f0 -(p13) br.ret.spnt b0 // Exit if x nan, y anything but zero -} -;; - -// p10 = TRUE ==> X is zero AND Y is positive -// p8 = TRUE ==> X is zero 
AND Y is outside integer range (treat as even int) -// return +0 -// p9 = TRUE ==> X is zero AND Y is within integer range (may not be integer) { .mfi -(p10) cmp.gt.unc p8,p9 = pow_GR_exp_Y, pow_GR_10033 -(p6) fmerge.s POW_delta = f0,f0 + nop.m 999 +(p6) fma.s1 POW_v4 = POW_P3, POW_r1, POW_P2 nop.i 999 } +;; + { .mfi nop.m 999 -(p6) fma.s1 POW_G = f0,f0,f0 + fcvt.xf POW_K = POW_int_K nop.i 999 } ;; { .mfi - getf.sig pow_GR_sig_int_Y = POW_int_Y - fnma.s1 POW_twoV = POW_NORM_Y, POW_rsq,f0 - nop.i 999 + getf.sig pow_GR_sig_int_Y = POW_int_Y + fnma.s1 POW_twoV = POW_NORM_Y, POW_rsq,f0 + and pow_GR_exp_Y = pow_GR_signexp_Y, pow_GR_17ones } -{ .mfi - nop.m 999 - fma.s1 POW_U = POW_NORM_Y,POW_r,f0 - nop.i 999 +{ .mfb + andcm pow_GR_sign_Y = pow_GR_signexp_Y, pow_GR_17ones + fma.s1 POW_U = POW_NORM_Y,POW_r,f0 +(p12) br.cond.spnt POW_Y_0 // Branch if y=zero, x not zero or nan } ;; +// p11 = TRUE ==> X is NEGATIVE but not inf { .mfi - ldfe POW_log2_by_128_lo = [pow_AD_P], 16 -(p6) fma.s1 POW_v2 = POW_P1, POW_r1, POW_P0 + ldfe POW_log2_by_128_lo = [pow_AD_P], 16 + fclass.m p11,p0 = f8, 0x1a nop.i 999 } { .mfi - ldfe POW_log2_by_128_hi = [pow_AD_Q], 16 -(p7) fma.s1 POW_v2 = POW_P1, POW_r, POW_P0 + ldfe POW_log2_by_128_hi = [pow_AD_Q], 16 + fma.s1 POW_v2 = POW_P1, POW_r, POW_P0 nop.i 999 } ;; - { .mfi nop.m 999 - fcvt.xf POW_float_int_Y = POW_int_Y + fcvt.xf POW_float_int_Y = POW_int_Y nop.i 999 } { .mfi nop.m 999 - fma.s1 POW_v3 = POW_v6, POW_rsq, POW_v4 - adds pow_AD_tbl1 = pow_tbl1 - pow_Tt, pow_AD_Q + fma.s1 POW_v3 = POW_v6, POW_rsq, POW_v4 + adds pow_AD_tbl1 = pow_tbl1 - pow_Tt, pow_AD_Q } ;; { .mfi nop.m 999 -(p7) fma.s1 POW_delta = POW_K, POW_log2_lo, POW_Tt +(p7) fma.s1 POW_delta = POW_K, POW_log2_lo, POW_Tt nop.i 999 } { .mfi nop.m 999 -(p7) fma.s1 POW_G = POW_K, POW_log2_hi, POW_T - adds pow_AD_tbl2 = pow_tbl2 - pow_tbl1, pow_AD_tbl1 +(p7) fma.s1 POW_G = POW_K, POW_log2_hi, POW_T + adds pow_AD_tbl2 = pow_tbl2 - pow_tbl1, pow_AD_tbl1 } ;; - { .mfi nop.m 999 - fms.s1 
POW_e2 = POW_NORM_Y, POW_r, POW_U + fms.s1 POW_e2 = POW_NORM_Y, POW_r, POW_U nop.i 999 } { .mfi nop.m 999 - fma.s1 POW_Z2 = POW_twoV, POW_Q0_half, POW_U + fma.s1 POW_Z2 = POW_twoV, POW_Q0_half, POW_U nop.i 999 } ;; -// p11 = TRUE ==> X is NEGATIVE -// p8 = TRUE ==> X is zero AND Y is outside intger range (treat as even int) -// return +0 { .mfi nop.m 999 - fclass.m.unc p11,p0 = f8, 0x1a - nop.i 999 -} -{ .mfb - nop.m 999 -(p8) fma.s f8 = f0,f0,f0 -(p8) br.ret.spnt b0 -} -;; - -{ .mfi - nop.m 999 - fma.s1 POW_Yrcub = POW_rsq, POW_U, f0 + fma.s1 POW_Yrcub = POW_rsq, POW_U, f0 nop.i 999 } -{ .mfi +{ .mfi nop.m 999 - fma.s1 POW_p = POW_rsq, POW_v3, POW_v2 + fma.s1 POW_p = POW_rsq, POW_v3, POW_v2 nop.i 999 } ;; - -// p11 = TRUE ==> X is NEGATIVE -// p12 = TRUE ==> X is NEGATIVE AND Y already int +// p11 = TRUE ==> X is NEGATIVE but not inf +// p12 = TRUE ==> X is NEGATIVE AND Y already even int // p13 = TRUE ==> X is NEGATIVE AND Y possible int { .mfi nop.m 999 - fma.s1 POW_Z1 = POW_NORM_Y, POW_G, f0 -(p11) cmp.ge.unc p12,p13 = pow_GR_exp_Y, pow_GR_10033 + fma.s1 POW_Z1 = POW_NORM_Y, POW_G, f0 +(p11) cmp.gt.unc p12,p13 = pow_GR_exp_Y, pow_GR_10033 } { .mfi nop.m 999 - fma.s1 POW_e3 = POW_NORM_Y, POW_delta, f0 - nop.i 999 -} -;; - -// p9 = TRUE ==> X is zero AND Y is within integer range (may not be integer) -// p6 = TRUE ==> X is zero AND Y is an integer (may be even or odd) -// p7 = TRUE ==> X is zero AND Y is NOT an integer, return +0 -{ .mfi - nop.m 999 -(p9) fcmp.eq.unc.s1 p6,p7 = POW_float_int_Y, POW_NORM_Y - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 POW_Gpr = POW_G, f1, POW_r + fma.s1 POW_Gpr = POW_G, f1, POW_r nop.i 999 } ;; @@ -1266,24 +1187,14 @@ L(POW_COMMON): } { .mfi nop.m 999 - fms.s1 POW_UmZ2 = POW_U, f1, POW_Z2 + fms.s1 POW_UmZ2 = POW_U, f1, POW_Z2 nop.i 999 } ;; - -// If x=0 and y>0, test y and flag denormal -// p6 = TRUE ==> X is zero AND Y is an integer (may be even or odd) -// p8 = TRUE ==> X is zero AND Y is an odd integer -// p9 = TRUE ==> X is zero 
AND Y is an even integer -{ .mfi - nop.m 999 -(p10) fcmp.eq.s0 p15,p0 = f9,f0 -(p6) tbit.nz.unc p8,p9 = pow_GR_sig_int_Y,0 -} { .mfi nop.m 999 - fma.s1 POW_Z3 = POW_p, POW_Yrcub, f0 + fma.s1 POW_Z3 = POW_p, POW_Yrcub, f0 nop.i 999 } ;; @@ -1291,7 +1202,7 @@ L(POW_COMMON): // By adding RSHF (1.1000...*2^63) we put integer part in rightmost significand { .mfi nop.m 999 - fms.s1 POW_e1 = POW_NORM_Y, POW_G, POW_Z1 + fms.s1 POW_e1 = POW_NORM_Y, POW_G, POW_Z1 nop.i 999 } { .mfi @@ -1301,81 +1212,60 @@ L(POW_COMMON): } ;; +// p13 = TRUE ==> X is NEGATIVE AND Y possible int +// p10 = TRUE ==> X is NEG and Y is an int +// p12 = TRUE ==> X is NEG and Y is not an int { .mfi nop.m 999 -(p7) fma.s f8 = f0,f0,f0 // Result +0 if x zero and y not integer - nop.i 999 +(p13) fcmp.eq.unc.s1 p10,p12 = POW_float_int_Y, POW_NORM_Y + mov pow_GR_xneg_yodd = 0 } -{ .mfb +{ .mfi nop.m 999 - fma.s1 POW_Y_Gpr = POW_NORM_Y, POW_Gpr, f0 -(p8) br.ret.spnt b0 // Exit if x zero and y odd integer + fma.s1 POW_Y_Gpr = POW_NORM_Y, POW_Gpr, f0 + nop.i 999 } ;; // By subtracting RSHF we get rounded integer POW_N2float -// p15 = TRUE ==> X_0_Y_NEG { .mfi nop.m 999 fms.s1 POW_N2float = POW_W2, f1, POW_RSHF nop.i 999 } -{ .mfb +{ .mfi nop.m 999 - fma.s1 POW_UmZ2pV = POW_twoV,POW_Q0_half,POW_UmZ2 -(p15) br.cond.spnt L(POW_X_0_Y_NEG) + fma.s1 POW_UmZ2pV = POW_twoV,POW_Q0_half,POW_UmZ2 + nop.i 999 } ;; - - { .mfi nop.m 999 - fma.s1 POW_Z3sq = POW_Z3, POW_Z3, f0 + fma.s1 POW_Z3sq = POW_Z3, POW_Z3, f0 nop.i 999 } -{ .mfb +{ .mfi nop.m 999 - fma.s1 POW_v4 = POW_Z3, POW_Q3, POW_Q2 -(p7) br.ret.spnt b0 // Exit if x zero and y not an integer + fma.s1 POW_v4 = POW_Z3, POW_Q3, POW_Q2 + nop.i 999 } ;; - - // Extract rounded integer from rightmost significand of POW_W2 // By subtracting RSHF we get rounded integer POW_N1float { .mfi - getf.sig pow_GR_int_W2 = POW_W2 + getf.sig pow_GR_int_W2 = POW_W2 fms.s1 POW_N1float = POW_W1, f1, POW_RSHF nop.i 999 } { .mfi nop.m 999 - fma.s1 POW_v2 = POW_Z3, POW_Q1, POW_Q0_half - 
nop.i 999 -} -;; - - - - -// p13 = TRUE ==> X is NEGATIVE AND Y possible int -// p10 = TRUE ==> X is NEG and Y is an int -// p12 = TRUE ==> X is NEG and Y is not an int -{ .mfi - nop.m 999 -(p13) fcmp.eq.unc.s1 p10,p12 = POW_float_int_Y, POW_NORM_Y + fma.s1 POW_v2 = POW_Z3, POW_Q1, POW_Q0_half nop.i 999 } -{ .mfb - nop.m 999 -(p9) fma.s f8 = f0,f0,f0 // Result +0 if x zero and y even integer -(p9) br.ret.spnt b0 // Exit if x zero and y even integer -} ;; - { .mfi nop.m 999 fnma.s1 POW_s2 = POW_N2float, POW_log2_by_128_hi, POW_Z2 @@ -1383,7 +1273,7 @@ L(POW_COMMON): } { .mfi nop.m 999 - fma.s1 POW_e2 = POW_e2,f1,POW_UmZ2pV + fma.s1 POW_e2 = POW_e2,f1,POW_UmZ2pV nop.i 999 } ;; @@ -1391,278 +1281,250 @@ L(POW_COMMON): // Extract rounded integer from rightmost significand of POW_W1 // Test if x inf { .mfi - getf.sig pow_GR_int_W1 = POW_W1 - fclass.m.unc p15,p0 = POW_NORM_X, 0x23 + getf.sig pow_GR_int_W1 = POW_W1 + fclass.m p15,p0 = POW_NORM_X, 0x23 nop.i 999 } { .mfb nop.m 999 fnma.s1 POW_f2 = POW_N2float, POW_log2_by_128_lo, f1 -(p12) br.cond.spnt L(POW_X_NEG_Y_NONINT) // Branch if x neg, y not integer +(p12) br.cond.spnt POW_X_NEG_Y_NONINT // Branch if x neg, y not integer } ;; +// p11 = TRUE ==> X is +1.0 // p12 = TRUE ==> X is NEGATIVE AND Y is an odd integer { .mfi - getf.exp pow_GR_signexp_Y_Gpr = POW_Y_Gpr - fma.s1 POW_v3 = POW_Z3sq, POW_Q4, POW_v4 -(p10) tbit.nz.unc p12,p0 = pow_GR_sig_int_Y,0 + getf.exp pow_GR_signexp_Y_Gpr = POW_Y_Gpr + fcmp.eq.s1 p11,p0 = POW_NORM_X, f1 +(p10) tbit.nz.unc p12,p0 = pow_GR_sig_int_Y,0 +} +{ .mfi + nop.m 999 + fma.s1 POW_v3 = POW_Z3sq, POW_Q4, POW_v4 + nop.i 999 } ;; - { .mfi - add pow_GR_int_N = pow_GR_int_W1, pow_GR_int_W2 + nop.m 999 fnma.s1 POW_f1 = POW_N1float, POW_log2_by_128_lo, f1 nop.i 999 } { .mfb nop.m 999 fnma.s1 POW_s1 = POW_N1float, POW_log2_by_128_hi, POW_Z1 -(p15) br.cond.spnt L(POW_X_INF) +(p15) br.cond.spnt POW_X_INF } ;; - // Test x and y and flag denormal { .mfi - and pow_GR_index1 = 0x0f, pow_GR_int_N + 
nop.m 999 fcmp.eq.s0 p15,p0 = f8,f9 - shr r2 = pow_GR_int_N, 7 + nop.i 999 } { .mfi - and pow_GR_exp_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones - nop.f 999 - and pow_GR_index2 = 0x70, pow_GR_int_N + nop.m 999 + fma.s1 POW_e3 = POW_NORM_Y, POW_delta, f0 + nop.i 999 } ;; - - { .mfi - shladd pow_AD_T1 = pow_GR_index1, 4, pow_AD_tbl1 + nop.m 999 fcmp.eq.s1 p7,p0 = POW_NORM_Y, f1 // Test for y=1.0 - sub pow_GR_true_exp_Y_Gpr = pow_GR_exp_Y_Gpr, pow_GR_16ones + nop.i 999 } { .mfi - addl pow_int_GR_M = 0xFFFF, r2 - fma.s1 POW_e12 = POW_e1,f1,POW_e2 - add pow_AD_T2 = pow_AD_tbl2, pow_GR_index2 + nop.m 999 + fma.s1 POW_e12 = POW_e1,f1,POW_e2 + nop.i 999 } ;; - -{ .mmi - ldfe POW_T1 = [pow_AD_T1],16 - setf.exp POW_2M = pow_int_GR_M - andcm pow_GR_sign_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones +{ .mfi + add pow_GR_int_N = pow_GR_int_W1, pow_GR_int_W2 +(p11) fma.s.s0 f8 = f1,f1,f0 // If x=1, result is +1 + nop.i 999 +} +{ .mib +(p12) mov pow_GR_xneg_yodd = 1 + nop.i 999 +(p11) br.ret.spnt b0 // Early exit if x=1.0, result is +1 } ;; - -{ .mfb - ldfe POW_T2 = [pow_AD_T2],16 - fma.s1 POW_q = POW_Z3sq, POW_v3, POW_v2 +{ .mfi + and pow_GR_index1 = 0x0f, pow_GR_int_N + fma.s1 POW_q = POW_Z3sq, POW_v3, POW_v2 + shr pow_int_GR_M = pow_GR_int_N, 7 // M = N/128 +} +{ .mib + and pow_GR_index2 = 0x70, pow_GR_int_N + nop.i 999 (p7) br.ret.spnt b0 // Early exit if y=1.0, result is x } ;; - -// double: p8 TRUE ==> |Y(G + r)| >= 10 -// single: p8 TRUE ==> |Y(G + r)| >= 7 - -// double -// -2^10 -2^9 2^9 2^10 -// -----+-----+----+ ... +-----+-----+----- -// p8 | p9 | p8 -// | | p10 | | -// single -// -2^7 -2^6 2^6 2^7 -// -----+-----+----+ ... 
+-----+-----+----- -// p8 | p9 | p8 -// | | p10 | | - - { .mfi -(p0) cmp.le.unc p8,p9 = 7, pow_GR_true_exp_Y_Gpr - fma.s1 POW_s = POW_s1, f1, POW_s2 - nop.i 999 + shladd pow_AD_T1 = pow_GR_index1, 4, pow_AD_tbl1 + fma.s1 POW_s = POW_s1, f1, POW_s2 + add pow_int_GR_M = pow_GR_16ones, pow_int_GR_M } { .mfi - nop.m 999 - fma.s1 POW_f12 = POW_f1, POW_f2,f0 + add pow_AD_T2 = pow_AD_tbl2, pow_GR_index2 + fma.s1 POW_f12 = POW_f1, POW_f2,f0 nop.i 999 } ;; - -{ .mfi +{ .mmf + ldfe POW_T1 = [pow_AD_T1] + ldfe POW_T2 = [pow_AD_T2] nop.f 999 -(p9) cmp.le.unc p0,p10 = 6, pow_GR_true_exp_Y_Gpr } ;; - - -{ .mfb - nop.m 999 - fma.s1 POW_e123 = POW_e12, f1, POW_e3 -(p8) br.cond.spnt L(POW_OVER_UNDER_X_NOT_INF) +{ .mfi + setf.exp POW_2M = pow_int_GR_M + fma.s1 POW_e123 = POW_e12, f1, POW_e3 + and pow_GR_exp_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones } ;; - -{ .mmf - fma.s1 POW_q = POW_Z3sq, POW_q, POW_Z3 +{ .mfi + nop.m 999 + fma.s1 POW_q = POW_Z3sq, POW_q, POW_Z3 + sub pow_GR_true_exp_Y_Gpr = pow_GR_exp_Y_Gpr, pow_GR_16ones } ;; +// p8 TRUE ==> |Y(G + r)| >= 7 +// single +// -2^7 -2^6 2^6 2^7 +// -----+-----+----+ ... 
+-----+-----+----- +// p8 | p9 | p8 +// | | p10 | | + +// Form signexp of constants to indicate overflow { .mfi - nop.m 999 - fma.s1 POW_ssq = POW_s, POW_s, f0 - nop.i 999 + mov pow_GR_big_pos = 0x1007f + fma.s1 POW_ssq = POW_s, POW_s, f0 + cmp.le p8,p9 = 7, pow_GR_true_exp_Y_Gpr } { .mfi - nop.m 999 - fma.s1 POW_v4 = POW_s, POW_Q3, POW_Q2 - nop.i 999 + mov pow_GR_big_neg = 0x3007f + fma.s1 POW_v4 = POW_s, POW_Q3, POW_Q2 + andcm pow_GR_sign_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones } ;; +// Form big positive and negative constants to test for possible overflow { .mfi - nop.m 999 - fma.s1 POW_v2 = POW_s, POW_Q1, POW_Q0_half - nop.i 999 + setf.exp POW_big_pos = pow_GR_big_pos + fma.s1 POW_v2 = POW_s, POW_Q1, POW_Q0_half +(p9) cmp.le.unc p0,p10 = 6, pow_GR_true_exp_Y_Gpr } -{ .mfi - nop.m 999 - fma.s1 POW_1ps = f1,f1,POW_s - nop.i 999 +{ .mfb + setf.exp POW_big_neg = pow_GR_big_neg + fma.s1 POW_1ps = f1,f1,POW_s +(p8) br.cond.spnt POW_OVER_UNDER_X_NOT_INF } ;; +// f123 = f12*(e123+1) = f12*e123+f12 { .mfi nop.m 999 - fma.s1 POW_f3 = POW_e123,f1,f1 + fma.s1 POW_f123 = POW_e123,POW_f12,POW_f12 nop.i 999 } ;; { .mfi nop.m 999 - fma.s1 POW_T1T2 = POW_T1, POW_T2, f0 + fma.s1 POW_T1T2 = POW_T1, POW_T2, f0 nop.i 999 } -;; - { .mfi nop.m 999 - fma.s1 POW_v3 = POW_ssq, POW_Q4, POW_v4 - nop.i 999 + fma.s1 POW_v3 = POW_ssq, POW_Q4, POW_v4 + cmp.ne p12,p13 = pow_GR_xneg_yodd, r0 } ;; { .mfi nop.m 999 - fma.s1 POW_v21ps = POW_ssq, POW_v2, POW_1ps - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 POW_s4 = POW_ssq, POW_ssq, f0 + fma.s1 POW_2Mqp1 = POW_2M, POW_q, POW_2M nop.i 999 } ;; { .mfi nop.m 999 - fma.s1 POW_f123 = POW_f12, POW_f3, f0 + fma.s1 POW_v21ps = POW_ssq, POW_v2, POW_1ps nop.i 999 } -;; - { .mfi nop.m 999 - fma.s1 POW_A = POW_2M, POW_T1T2, f0 + fma.s1 POW_s4 = POW_ssq, POW_ssq, f0 nop.i 999 } ;; - - { .mfi nop.m 999 -(p12) fmerge.s POW_f123 = f8,POW_f123 // if x neg, y odd int +(p12) fnma.s1 POW_A = POW_T1T2, POW_f123, f0 nop.i 999 } { .mfi nop.m 999 -// fma.s1 POW_es = 
POW_ssq, POW_v3, POW_v2 +(p13) fma.s1 POW_A = POW_T1T2, POW_f123, f0 nop.i 999 } ;; { .mfi nop.m 999 - fma.s1 POW_es = POW_s4, POW_v3, POW_v21ps + fma.s1 POW_es = POW_s4, POW_v3, POW_v21ps nop.i 999 } -;; - - { .mfi nop.m 999 - fma.s1 POW_A = POW_A, POW_f123, f0 - nop.i 999 -} -{ .mfi - nop.m 999 -// fma.s1 POW_es = POW_es, POW_ssq, POW_1ps + fma.s1 POW_A = POW_A, POW_2Mqp1, f0 nop.i 999 } ;; - +// Dummy op to set inexact { .mfi nop.m 999 - fma.s1 POW_A = POW_A, POW_es,f0 + fma.s0 POW_tmp = POW_2M, POW_q, POW_2M nop.i 999 } ;; - - { .mfb nop.m 999 -(p10) fma.s f8 = POW_A, POW_q, POW_A -(p10) br.ret.sptk b0 + fma.s.s0 f8 = POW_A, POW_es, f0 +(p10) br.ret.sptk b0 // Exit main branch if no over/underflow } ;; - - - - // POSSIBLE_OVER_UNDER -// p6 = TRUE ==> Y negative +// p6 = TRUE ==> Y_Gpr negative +// Result is already computed. We just need to know if over/underflow occurred. -{ .mfi - nop.m 999 - fmerge.s POW_abs_A = f0, POW_A - cmp.eq.unc p0,p6 = pow_GR_sign_Y, r0 -} -;; - -{ .mib - nop.m 999 - nop.i 999 -(p6) br.cond.spnt L(POW_POSSIBLE_UNDER) +{ .mfb + cmp.eq p0,p6 = pow_GR_sign_Y_Gpr, r0 + nop.f 999 +(p6) br.cond.spnt POW_POSSIBLE_UNDER } ;; // POSSIBLE_OVER -// We got an answer. +// We got an answer. 
// overflow is a possibility, not a certainty @@ -1692,21 +1554,20 @@ L(POW_COMMON): // RN RN // RZ - // Put in s2 (td set, wre set) { .mfi - mov pow_GR_gt_ln = 0x1007f + nop.m 999 fsetc.s2 0x7F,0x42 - nop.i 999 + nop.i 999 } ;; - { .mfi - setf.exp POW_gt_pln = pow_GR_gt_ln - fma.s.s2 POW_wre_urm_f8 = POW_abs_A, POW_q, POW_abs_A - nop.i 999 ;; + nop.m 999 + fma.s.s2 POW_wre_urm_f8 = POW_A, POW_es, f0 + nop.i 999 } +;; // Return s2 to default { .mfi @@ -1716,31 +1577,30 @@ L(POW_COMMON): } ;; - // p7 = TRUE ==> yes, we have an overflow { .mfi nop.m 999 - fcmp.ge.unc.s1 p7, p0 = POW_wre_urm_f8, POW_gt_pln + fcmp.ge.s1 p7, p8 = POW_wre_urm_f8, POW_big_pos nop.i 999 } ;; - - -{ .mfb -(p7) mov pow_GR_tag = 30 - fma.s f8 = POW_A, POW_q, POW_A -(p7) br.cond.spnt __libm_error_region +{ .mfi + nop.m 999 +(p8) fcmp.le.s1 p7, p0 = POW_wre_urm_f8, POW_big_neg + nop.i 999 } -{ .mfb - nop.m 999 - nop.f 999 -(p0) br.ret.sptk b0 +;; + +{ .mbb +(p7) mov pow_GR_tag = 30 +(p7) br.cond.spnt __libm_error_region // Branch if overflow + br.ret.sptk b0 // Exit if did not overflow } ;; -L(POW_POSSIBLE_UNDER): +POW_POSSIBLE_UNDER: // We got an answer. input was < -2^9 but > -2^10 (double) // We got an answer. 
input was < -2^6 but > -2^7 (float) // underflow is a possibility, not a certainty @@ -1763,124 +1623,250 @@ L(POW_POSSIBLE_UNDER): // 0.1...11 2^-3ffe (biased, 1) // largest dn smallest normal - // Put in s2 (td set, ftz set) { .mfi nop.m 999 fsetc.s2 0x7F,0x41 - nop.i 999 + nop.i 999 } ;; - - { .mfi nop.m 999 - fma.s.s2 POW_ftz_urm_f8 = POW_A, POW_q, POW_A + fma.s.s2 POW_ftz_urm_f8 = POW_A, POW_es, f0 nop.i 999 } ;; - // Return s2 to default { .mfi nop.m 999 fsetc.s2 0x7F,0x40 - nop.i 999 + nop.i 999 } ;; - // p7 = TRUE ==> yes, we have an underflow { .mfi nop.m 999 - fcmp.eq.unc.s1 p7, p0 = POW_ftz_urm_f8, f0 - nop.i 999 + fcmp.eq.s1 p7, p0 = POW_ftz_urm_f8, f0 + nop.i 999 } ;; +{ .mbb +(p7) mov pow_GR_tag = 31 +(p7) br.cond.spnt __libm_error_region // Branch if underflow + br.ret.sptk b0 // Exit if did not underflow +} +;; +POW_X_DENORM: +// Here if x unorm. Use the NORM_X for getf instructions, and then back +// to normal path +{ .mfi + getf.exp pow_GR_signexp_X = POW_NORM_X + nop.f 999 + nop.i 999 +} +;; +{ .mmi + getf.sig pow_GR_sig_X = POW_NORM_X +;; + and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones + nop.i 999 +} +;; + +{ .mib + sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones + nop.i 999 + br.cond.sptk POW_COMMON +} +;; + +POW_X_0: +// Here if x=0 and y not nan +// +// We have the following cases: +// p6 x=0 and y>0 and is an integer (may be even or odd) +// p7 x=0 and y>0 and is NOT an integer, return +0 +// p8 x=0 and y>0 and so big as to always be an even integer, return +0 +// p9 x=0 and y>0 and may not be integer +// p10 x=0 and y>0 and is an odd integer, return x +// p11 x=0 and y>0 and is an even integer, return +0 +// p12 used in dummy fcmp to set denormal flag if y=unorm +// p13 x=0 and y>0 +// p14 x=0 and y=0, branch to code for calling error handling +// p15 x=0 and y<0, branch to code for calling error handling +// +{ .mfi + getf.sig pow_GR_sig_int_Y = POW_int_Y // Get signif of int_Y + fcmp.lt.s1 p15,p13 = f9, f0 // Test for y<0 + and 
pow_GR_exp_Y = pow_GR_signexp_Y, pow_GR_17ones +} +{ .mfb + cmp.ne p14,p0 = pow_GR_y_zero,r0 // Test for y=0 + fcvt.xf POW_float_int_Y = POW_int_Y +(p14) br.cond.spnt POW_X_0_Y_0 // Branch if x=0 and y=0 +} +;; +// If x=0 and y>0, test y and flag denormal { .mfb -(p7) mov pow_GR_tag = 31 - fma.s f8 = POW_A, POW_q, POW_A -(p7) br.cond.spnt __libm_error_region +(p13) cmp.gt.unc p8,p9 = pow_GR_exp_Y, pow_GR_10033 // Test y +big = even int +(p13) fcmp.eq.s0 p12,p0 = f9,f0 // If x=0, y>0 dummy op to flag denormal +(p15) br.cond.spnt POW_X_0_Y_NEG // Branch if x=0 and y<0 } ;; +// Here if x=0 and y>0 +{ .mfi + nop.m 999 +(p9) fcmp.eq.unc.s1 p6,p7 = POW_float_int_Y, POW_NORM_Y // Test y=int + nop.i 999 +} +{ .mfi + nop.m 999 +(p8) fma.s.s0 f8 = f0,f0,f0 // If x=0, y>0 and large even int, return +0 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 +(p7) fma.s.s0 f8 = f0,f0,f0 // Result +0 if x=0 and y>0 and not integer +(p6) tbit.nz.unc p10,p11 = pow_GR_sig_int_Y,0 // If y>0 int, test y even/odd +} +;; +// Note if x=0, y>0 and odd integer, just return x { .mfb nop.m 999 - nop.f 999 - br.ret.sptk b0 +(p11) fma.s.s0 f8 = f0,f0,f0 // Result +0 if x=0 and y even integer + br.ret.sptk b0 // Exit if x=0 and y>0 } ;; +POW_X_0_Y_0: +// When X is +-0 and Y is +-0, IEEE returns 1.0 +// We call error support with this value -L(POW_X_DENORM): -// Here if x unorm. Use the NORM_X for getf instructions, and the back -// to normal path -{ .mfi - getf.exp pow_GR_signexp_X = POW_NORM_X - nop.f 999 - nop.i 999 +{ .mfb + mov pow_GR_tag = 32 + fma.s.s0 f8 = f1,f1,f0 + br.cond.sptk __libm_error_region } ;; +POW_X_0_Y_NEG: +// When X is +-0 and Y is negative, IEEE returns +// X Y answer +// +0 -odd int +inf +// -0 -odd int -inf + +// +0 !-odd int +inf +// -0 !-odd int +inf + +// p6 == Y is a floating point number outside the integer. +// Hence it is an integer and is even. +// return +inf + +// p7 == Y is a floating point number within the integer range. 
+// p9 == (int_Y = NORM_Y), Y is an integer, which may be odd or even. +// p11 odd +// return (sign_of_x)inf +// p12 even +// return +inf +// p10 == Y is not an integer +// return +inf +// + { .mfi - getf.sig pow_GR_sig_X = POW_NORM_X - nop.f 999 - nop.i 999 + nop.m 999 + nop.f 999 + cmp.gt p6,p7 = pow_GR_exp_Y, pow_GR_10033 } ;; { .mfi - and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones - nop.f 999 + mov pow_GR_tag = 33 +(p7) fcmp.eq.unc.s1 p9,p10 = POW_float_int_Y, POW_NORM_Y + nop.i 999 } ;; -{ .mib - sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones - shl pow_GR_offset = pow_GR_sig_X, 1 - br.cond.sptk L(POW_COMMON) +{ .mfb + nop.m 999 +(p6) frcpa.s0 f8,p13 = f1, f0 +(p6) br.cond.sptk __libm_error_region // x=0, y<0, y large neg int } ;; +{ .mfb + nop.m 999 +(p10) frcpa.s0 f8,p13 = f1, f0 +(p10) br.cond.sptk __libm_error_region // x=0, y<0, y not int +} +;; -L(POW_X_0_Y_0): -// When X is +-0 and Y is +-0, IEEE returns 1.0 -// We call error support with this value +// x=0, y<0, y an int +{ .mib + nop.m 999 +(p9) tbit.nz.unc p11,p12 = pow_GR_sig_int_Y,0 + nop.b 999 +} +;; + +{ .mfi + nop.m 999 +(p12) frcpa.s0 f8,p13 = f1,f0 + nop.i 999 +} +;; { .mfb - mov pow_GR_tag = 32 - fma.s f8 = f1,f1,f0 - br.cond.sptk __libm_error_region + nop.m 999 +(p11) frcpa.s0 f8,p13 = f1,f8 + br.cond.sptk __libm_error_region } ;; +POW_Y_0: +// Here for y zero, x anything but zero and nan +// Set flag if x denormal +// Result is +1.0 +{ .mfi + nop.m 999 + fcmp.eq.s0 p6,p0 = f8,f0 // Sets flag if x denormal + nop.i 999 +} +{ .mfb + nop.m 999 + fma.s.s0 f8 = f1,f1,f0 + br.ret.sptk b0 +} +;; -L(POW_X_INF): -// When X is +-inf and Y is +-, IEEE returns +POW_X_INF: +// Here when X is +-inf -// overflow -// X +inf Y +inf +inf -// X -inf Y +inf +inf +// X +inf Y +inf +inf +// X -inf Y +inf +inf -// X +inf Y >0 +inf +// X +inf Y >0 +inf // X -inf Y >0, !odd integer +inf <== (-inf)^0.5 = +inf !! 
-// X -inf Y >0, odd integer -inf +// X -inf Y >0, odd integer -inf -// underflow -// X +inf Y -inf +0 -// X -inf Y -inf +0 +// X +inf Y -inf +0 +// X -inf Y -inf +0 -// X +inf Y <0 +0 -// X -inf Y <0, !odd integer +0 -// X -inf Y <0, odd integer -0 +// X +inf Y <0 +0 +// X -inf Y <0, !odd integer +0 +// X -inf Y <0, odd integer -0 // X + inf Y=+0 +1 // X + inf Y=-0 +1 @@ -1892,32 +1878,30 @@ L(POW_X_INF): // p6 == Y is a floating point number outside the integer. // Hence it is an integer and is even. -// p13 == (Y negative) +// p13 == (Y negative) // return +inf // p14 == (Y positive) // return +0 - - // p7 == Y is a floating point number within the integer range. // p9 == (int_Y = NORM_Y), Y is an integer, which may be odd or even. // p11 odd -// p13 == (Y negative) +// p13 == (Y negative) // return (sign_of_x)inf -// p14 == (Y positive) +// p14 == (Y positive) // return (sign_of_x)0 -// pxx even -// p13 == (Y negative) -// return +inf +// pxx even +// p13 == (Y negative) +// return +inf // p14 == (Y positive) -// return +0 +// return +0 // pxx == Y is not an integer -// p13 == (Y negative) +// p13 == (Y negative) // return +inf // p14 == (Y positive) // return +0 -// +// // If x=inf, test y and flag denormal { .mfi @@ -1929,207 +1913,131 @@ L(POW_X_INF): { .mfi nop.m 999 - fcmp.lt p13,p14 = POW_NORM_Y,f0 - cmp.gt.unc p6,p7 = pow_GR_exp_Y, pow_GR_10033 + fcmp.lt.s0 p13,p14 = POW_NORM_Y,f0 + cmp.gt p6,p7 = pow_GR_exp_Y, pow_GR_10033 } { .mfi nop.m 999 - fclass.m p12,p0 = f9, 0x23 + fclass.m p12,p0 = f9, 0x23 //@inf nop.i 999 } ;; - { .mfi nop.m 999 - fclass.m p15,p0 = f9, 0x07 //@zero + fclass.m p15,p0 = f9, 0x07 //@zero nop.i 999 } ;; { .mfb nop.m 999 -(p15) fmerge.s f8 = f1,f1 -(p15) br.ret.spnt b0 +(p15) fmerge.s f8 = f1,f1 // Return +1.0 if x=inf, y=0 +(p15) br.ret.spnt b0 // Exit if x=inf, y=0 } ;; - { .mfi -(p13) mov pow_GR_tag = 31 -(p14) frcpa.s1 f8,p10 = f1,f0 + nop.m 999 +(p14) frcpa.s1 f8,p10 = f1,f0 // If x=inf, y>0, assume result +inf nop.i 999 } { 
.mfb -(p14) mov pow_GR_tag = 30 -(p13) fma.s1 f8 = f0,f0,f0 -(p12) br.ret.spnt b0 -} -;; - - - -{ .mfb nop.m 999 -(p7) fcmp.eq.unc.s1 p9,p0 = POW_float_int_Y, POW_NORM_Y - nop.b 999 +(p13) fma.s.s0 f8 = f0,f0,f0 // If x=inf, y<0, assume result +0.0 +(p12) br.ret.spnt b0 // Exit if x=inf, y=inf } ;; +// Here if x=inf, and 0 < |y| < inf. Need to correct results if y odd integer. { .mfi nop.m 999 - nop.f 999 -(p9) tbit.nz.unc p11,p0 = pow_GR_sig_int_Y,0 -} -;; - -{ .mfb - nop.m 999 -(p11) fmerge.s f8 = POW_NORM_X,f8 - br.ret.sptk b0 +(p7) fcmp.eq.unc.s1 p9,p0 = POW_float_int_Y, POW_NORM_Y // Is y integer? + nop.i 999 } ;; - - -L(POW_X_0_Y_NEG): -// When X is +-0 and Y is negative, IEEE returns -// X Y answer -// +0 -odd int +inf -// -0 -odd int -inf - -// +0 !-odd int +inf -// -0 !-odd int +inf - - -// p6 == Y is a floating point number outside the integer. -// Hence it is an integer and is even. -// return +inf - -// p7 == Y is a floating point number within the integer range. -// p9 == (int_Y = NORM_Y), Y is an integer, which may be odd or even. -// p11 odd -// return (sign_of_x)inf -// p12 even -// return +inf -// p10 == Y is not an integer -// return +inf -// -// - { .mfi nop.m 999 nop.f 999 - cmp.gt.unc p6,p7 = pow_GR_exp_Y, pow_GR_10033 -} -;; - - -{ .mfi - mov pow_GR_tag = 33 -(p7) fcmp.eq.unc.s1 p9,p10 = POW_float_int_Y, POW_NORM_Y - nop.i 999 -} -;; - - -{ .mfb - nop.m 999 -(p6) frcpa.s0 f8,p13 = f1, f0 -(p6) br.cond.sptk __libm_error_region +(p9) tbit.nz.unc p11,p0 = pow_GR_sig_int_Y,0 // Test for y odd integer } ;; { .mfb nop.m 999 -(p10) frcpa.s0 f8,p13 = f1, f0 -(p10) br.cond.sptk __libm_error_region +(p11) fmerge.s f8 = POW_NORM_X,f8 // If y odd integer use sign of x + br.ret.sptk b0 // Exit for x=inf, 0 < |y| < inf } ;; +POW_X_NEG_Y_NONINT: +// When X is negative and Y is a non-integer, IEEE +// returns a qnan indefinite. 
+// We call error support with this value -{ .mib - nop.m 999 -(p9) tbit.nz.unc p11,p12 = pow_GR_sig_int_Y,0 - nop.b 999 +{ .mfb + mov pow_GR_tag = 34 + frcpa.s0 f8,p6 = f0,f0 + br.cond.sptk __libm_error_region } ;; - - +POW_X_NAN: +// Here if x=nan, y not nan { .mfi - nop.m 999 -(p12) frcpa.s0 f8,p13 = f1,f0 - nop.i 999 -} -;; - -{ .mfb - nop.m 999 -(p11) frcpa f8,p13 = f1,f8 - br.cond.sptk __libm_error_region + nop.m 999 + fclass.m p9,p13 = f9, 0x07 // Test y=zero + nop.i 999 } ;; - - - -L(POW_X_NEG_Y_NONINT): -// When X is negative and Y is a non-integer, IEEE -// returns a qnan indefinite. -// We call error support with this value - { .mfb - mov pow_GR_tag = 34 - frcpa f8,p6 = f0,f0 - br.cond.sptk __libm_error_region + nop.m 999 +(p13) fma.s.s0 f8 = f8,f1,f0 +(p13) br.ret.sptk b0 // Exit if x nan, y anything but zero or nan } ;; - - - -L(POW_X_NAN_Y_0): +POW_X_NAN_Y_0: // When X is a NAN and Y is zero, IEEE returns 1. // We call error support with this value. - { .mfi - nop.m 0 - fma.s.s0 f10 = f8,f1,f0 - nop.i 0 + nop.m 999 + fcmp.eq.s0 p6,p0 = f8,f0 // Dummy op to set invalid on snan + nop.i 999 } { .mfb - mov pow_GR_tag = 35 - fma.s.s0 f8 = f0,f0,f1 + mov pow_GR_tag = 35 + fma.s.s0 f8 = f0,f0,f1 br.cond.sptk __libm_error_region } ;; -L(POW_OVER_UNDER_X_NOT_INF): +POW_OVER_UNDER_X_NOT_INF: // p8 is TRUE for overflow // p9 is TRUE for underflow // if y is infinity, we should not over/underflow - { .mfi nop.m 999 - fcmp.eq.unc.s1 p14, p13 = POW_xsq,f1 - cmp.eq.unc p8,p9 = pow_GR_sign_Y_Gpr, r0 + fcmp.eq.s1 p14, p13 = POW_xsq,f1 // Test |x|=1 + cmp.eq p8,p9 = pow_GR_sign_Y_Gpr, r0 } ;; { .mfi nop.m 999 -(p14) fclass.m.unc p15, p0 = f9, 0x23 +(p14) fclass.m.unc p15, p0 = f9, 0x23 // If |x|=1, test y=inf nop.i 999 } { .mfi nop.m 999 -(p13) fclass.m.unc p11,p0 = f9, 0x23 +(p13) fclass.m.unc p11,p0 = f9, 0x23 // If |x| not 1, test y=inf nop.i 999 } ;; @@ -2137,31 +2045,33 @@ L(POW_OVER_UNDER_X_NOT_INF): // p15 = TRUE if |x|=1, y=inf, return +1 { .mfb nop.m 999 
-(p15) fma.s f8 = f1,f1,f0 -(p15) br.ret.spnt b0 +(p15) fma.s.s0 f8 = f1,f1,f0 // If |x|=1, y=inf, result +1 +(p15) br.ret.spnt b0 // Exit if |x|=1, y=inf } ;; .pred.rel "mutex",p8,p9 { .mfb -(p8) setf.exp f8 = pow_GR_17ones -(p9) fmerge.s f8 = f0,f0 -(p11) br.ret.sptk b0 +(p8) setf.exp f8 = pow_GR_17ones // If exp(+big), result inf +(p9) fmerge.s f8 = f0,f0 // If exp(-big), result 0 +(p11) br.ret.sptk b0 // Exit if |x| not 1, y=inf } +;; { .mfb nop.m 999 nop.f 999 - br.cond.sptk L(POW_OVER_UNDER_ERROR) + br.cond.sptk POW_OVER_UNDER_ERROR // Branch if y not inf } ;; -L(POW_Y_NAN): -// Is x = +1 then result is +1, else result is quiet Y +POW_Y_NAN: +// Here if y=nan, x anything +// If x = +1 then result is +1, else result is quiet Y { .mfi nop.m 999 - fcmp.eq.s1 p10,p9 = POW_NORM_X, f1 + fcmp.eq.s1 p10,p9 = POW_NORM_X, f1 nop.i 999 } ;; @@ -2175,148 +2085,117 @@ L(POW_Y_NAN): { .mfi nop.m 999 -(p10) fma.s f8 = f1,f1,f0 +(p10) fma.s.s0 f8 = f1,f1,f0 nop.i 999 } { .mfb nop.m 999 -(p9) fma.s f8 = f9,f8,f0 - br.ret.sptk b0 +(p9) fma.s.s0 f8 = f9,f8,f0 + br.ret.sptk b0 // Exit y=nan } ;; -L(POW_OVER_UNDER_ERROR): +POW_OVER_UNDER_ERROR: +// Here if we have overflow or underflow. 
+// Enter with p12 true if x negative and y odd int to force -0 or -inf { .mfi - nop.m 999 - fmerge.s f10 = POW_NORM_X,POW_NORM_X - nop.i 999 -} -{ .mfi - sub pow_GR_17ones_m1 = pow_GR_17ones, r0, 1 - nop.f 999 - mov pow_GR_one = 0x1 + sub pow_GR_17ones_m1 = pow_GR_17ones, r0, 1 + nop.f 999 + mov pow_GR_one = 0x1 } ;; -// overflow +// overflow, force inf with O flag { .mmb -(p8) mov pow_GR_tag = 30 -(p8) setf.exp f11 = pow_GR_17ones_m1 +(p8) mov pow_GR_tag = 30 +(p8) setf.exp POW_tmp = pow_GR_17ones_m1 nop.b 999 } ;; - -// underflow +// underflow, force zero with I, U flags { .mmi -(p9) mov pow_GR_tag = 31 -(p9) setf.exp f11 = pow_GR_one +(p9) mov pow_GR_tag = 31 +(p9) setf.exp POW_tmp = pow_GR_one nop.i 999 } ;; - -// p12 x is negative and y is an odd integer - - { .mfi nop.m 999 - fma.s f8 = f11, f11, f0 + fma.s.s0 f8 = POW_tmp, POW_tmp, f0 nop.i 999 } ;; +// p12 x is negative and y is an odd integer, change sign of result { .mfi nop.m 999 -(p12) fmerge.ns f8 = f8, f8 +(p12) fnma.s.s0 f8 = POW_tmp, POW_tmp, f0 nop.i 999 } ;; +GLOBAL_LIBM_END(powf) -.endp powf -ASM_SIZE_DIRECTIVE(powf) - - -// Stack operations when calling error support. -// (1) (2) (3) (call) (4) -// sp -> + psp -> + psp -> + sp -> + -// | | | | -// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 -// | | | | -// | <-GR_Y Y2->| Y2 ->| <- GR_Y | -// | | | | -// | | <- GR_X X1 ->| | -// | | | | -// sp-64 -> + sp -> + sp -> + + -// save ar.pfs save b0 restore gp -// save gp restore ar.pfs - - +LOCAL_LIBM_ENTRY(__libm_error_region) -.proc __libm_error_region -__libm_error_region: - -// Answer is inf for overflow and 0 for underflow. 
.prologue -// (1) { .mfi - add GR_Parameter_Y=-32,sp // Parameter 2 value + add GR_Parameter_Y=-32,sp // Parameter 2 value nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + add sp=-64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP=gp // Save gp };; - -// (2) { .mmi stfs [GR_Parameter_Y] = POW_NORM_Y,16 // STORE Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; .body -// (3) { .mib - stfs [GR_Parameter_X] = POW_NORM_X // STORE Parameter 1 on stack + stfs [GR_Parameter_X] = POW_NORM_X // STORE Parameter 1 on stack add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address - nop.b 0 + nop.b 0 } { .mib - stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack + stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; + { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; -// (4) { .mmi - ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp - add sp = 64,sp // Restore stack pointer - mov b0 = GR_SAVE_B0 // Restore return address + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address };; + { .mib - mov gp = GR_SAVE_GP // Restore gp - mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - br.ret.sptk b0 // Return + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) .type 
__libm_error_support#,@function .global __libm_error_support# + diff --git a/sysdeps/ia64/fpu/e_powl.S b/sysdeps/ia64/fpu/e_powl.S index d286e9abad..0896c19aac 100644 --- a/sysdeps/ia64/fpu/e_powl.S +++ b/sysdeps/ia64/fpu/e_powl.S @@ -1,10 +1,10 @@ .file "powl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,61 +20,69 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // -// ********************************************************************* +//********************************************************************* // // Function: powl(x,y), where -// y +// y // powl(x,y) = x , for double extended precision x and y values // -// ********************************************************************* +//********************************************************************* // -// History: -// 2/02/00 (Hand Optimized) -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// History: +// 02/02/00 (Hand Optimized) +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. 
-// 1/22/01 Corrected results for powl(1,inf), powl(1,nan), and +// 01/22/01 Corrected results for powl(1,inf), powl(1,nan), and // powl(snan,0) to be 1 per C99, not nan. Fixed many flag settings. -// 2/06/01 Call __libm_error support if over/underflow when y=2. +// 02/06/01 Call __libm_error support if over/underflow when y=2. +// 04/17/01 Support added for y close to 1 and x a non-special value. +// Shared software under/overflow detection for all paths +// 02/07/02 Corrected sf3 setting to disable traps +// 05/13/02 Improved performance of all paths +// 02/10/03 Reordered header: .section, .global, .proc, .align; +// used data8 for long double table values +// 04/17/03 Added missing mutex directive // -// ********************************************************************* +//********************************************************************* // // Resources Used: // -// Floating-Point Registers: -// f8 (Input and Return Value) -// f9-f15,f32-f63,f99 +// Floating-Point Registers: +// f8 (Input x and Return Value) +// f9 (Input y) +// f10-f15,f32-f79 // // General Purpose Registers: -// Locals r32 - r61 +// Locals r14-24,r32-r65 // Parameters to __libm_error_support r62,r63,r64,r65 // // Predicate Registers: p6-p15 // -// ********************************************************************* +//********************************************************************* // // Special Cases and IEEE special conditions: // // Denormal fault raised on denormal inputs -// Overflow exceptions raised when appropriate for pow -// Underflow exceptions raised when appropriate for pow +// Overflow exceptions raised when appropriate for pow +// Underflow exceptions raised when appropriate for pow // (Error Handling Routine called for overflow and Underflow) // Inexact raised when appropriate by algorithm // @@ -102,8 +110,8 @@ // 22. X or Y denorm/unorm and denorm/unorm operand trap is enabled, // generate denorm/unorm fault except if invalid or div_0 raised. 
// -// ********************************************************************* -// +//********************************************************************* +// // Algorithm // ========= // @@ -113,23 +121,23 @@ // If Y = 0.5, return sqrt(X). // // Compute log(X) to extra precision. -// +// // ker_log_80( X, logX_hi, logX_lo, Safe ); // -// ...logX_hi + logX_lo approximates log(X) to roughly 80 +// ...logX_hi + logX_lo approximates log(X) to roughly 80 // ...significant bits of accuracy. // // Compute Y*log(X) to extra precision. // // P_hi := Y * logX_hi -// P_lo := Y * logX_hi - P_hi ...using FMA -// P_lo := Y * logX_lo + P_lo ...using FMA +// P_lo := Y * logX_hi - P_hi ...using FMA +// P_lo := Y * logX_lo + P_lo ...using FMA // // Compute exp(P_hi + P_lo) // -// Flag := 2; +// Flag := 2; // Expo_Range := 2; (assuming double-extended power function) -// ker_exp_64( P_hi, P_lo, Flag, Expo_Range, +// ker_exp_64( P_hi, P_lo, Flag, Expo_Range, // Z_hi, Z_lo, scale, Safe ) // // scale := sgn * scale @@ -138,7 +146,7 @@ // return scale*Z_hi + (scale*Z_lo) // quickly // Else -// take necessary precaution in computing +// take necessary precaution in computing // scale*Z_hi + (scale*Z_lo) // to set possible exceptions correctly. // End If @@ -152,8 +160,8 @@ // If Y is qNaN, return Y without exception. // If X is qNaN, return X without exception. // -// At this point, X is real and Y is +-inf. -// Thus |X| can only be 1, strictly bigger than 1, or +// At this point, X is real and Y is +-inf. +// Thus |X| can only be 1, strictly bigger than 1, or // strictly less than 1. // // If |X| < 1, then @@ -169,8 +177,8 @@ // ...Note that Y is real, finite, non-zero, and not +1. // // If X is qNaN, return X without exception. -// -// If X is +-0, +// +// If X is +-0, // return ( Y > 0 ? +0 : +inf ) // // If X is +inf @@ -180,11 +188,11 @@ // return -0 ** -Y // return ( Y > 0 ? 
+inf : +0 ) // -// Case_Invalid +// Case_Invalid // // Return 0 * inf to generate a quiet NaN together // with an invalid exception. -// +// // Implementation // ============== // @@ -193,15 +201,15 @@ // // STAGE 1 // ------- -// This stage contains two threads. +// This stage contains two threads. // // Stage1.Thread1 // // fclass.m X_excep, X_ok = X, (NatVal or s/qNaN) or -// +-0, +-infinity +// +-0, +-infinity // // fclass.nm X_unsupp, X_supp = X, (NatVal or s/qNaN) or -// +-(0, unnorm, norm, infinity) +// +-(0, unnorm, norm, infinity) // // X_norm := fnorm( X ) with traps disabled // @@ -209,26 +217,26 @@ // If (X_unsupp) goto Filtering (Step 2) // // Stage1.Thread2 -// .............. +// .............. // // fclass.m Y_excep, Y_ok = Y, (NatVal or s/qNaN) or -// +-0, +-infinity +// +-0, +-infinity // // fclass.nm Y_unsupp, Y_supp = Y, (NatVal or s/qNaN) or -// +-(0, unnorm, norm, infinity) +// +-(0, unnorm, norm, infinity) // // Y_norm := fnorm( Y ) with traps disabled // // If (Y_excep) goto Filtering (Step 2) // If (Y_unsupp) goto Filtering (Step 2) // -// +// // STAGE 2 // ------- // This stage contains two threads. // -// Stage2.Thread1 -// .............. +// Stage2.Thread1 +// .............. // // Set X_lt_0 if X < 0 (using fcmp) // sgn := +1.0 @@ -245,14 +253,14 @@ // This stage contains two threads. // // -// Stage3.Thread1 -// .............. +// Stage3.Thread1 +// .............. // // X := fnorm(X) in prevailing traps // // -// Stage3.Thread2 -// .............. +// Stage3.Thread2 +// .............. // // Y := fnorm(Y) in prevailing traps // @@ -262,60 +270,56 @@ // Go to Case_Normal. 
// -#include "libm_support.h" - -#ifdef _LIBC -.rodata -#else -.data -#endif - -// Inv_L, L_hi, L_lo -.align 64 -Constants_exp_64_Arg: -ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object) -data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000 -data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000 -data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg) - -.align 64 -Constants_exp_64_Exponents: -ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object) -data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF -data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF -data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF -data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF -data4 0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF -data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF -ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents) - -.align 64 -Constants_exp_64_A: -ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object) -// Reversed -data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000 -data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000 -data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_exp_64_A) - -.align 64 -Constants_exp_64_P: -ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object) -// Reversed -data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000 -data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000 -data4 0x7474C518,0x88888888,0x00003FF8,0x00000000 -data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000 -data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000 -data4 0x000004C7,0x80000000,0x00003FFE,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_exp_64_P) - -.align 64 -Constants_exp_64_T1: -ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object) -data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29 -data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5 + +// ************* DO NOT CHANGE ORDER OF THESE TABLES ******************** + +// double-extended 1/ln(2) +// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88 +// 3fff b8aa 3b29 5c17 f0bc +// For speed the significand will be loaded directly with a movl and 
setf.sig +// and the exponent will be bias+63 instead of bias+0. Thus subsequent +// computations need to scale appropriately. +// The constant 2^12/ln(2) is needed for the computation of N. This is also +// obtained by scaling the computations. +// +// Two shifting constants are loaded directly with movl and setf.d. +// 1. RSHF_2TO51 = 1.1000..00 * 2^(63-12) +// This constant is added to x*1/ln2 to shift the integer part of +// x*2^12/ln2 into the rightmost bits of the significand. +// The result of this fma is N_signif. +// 2. RSHF = 1.1000..00 * 2^(63) +// This constant is subtracted from N_signif * 2^(-51) to give +// the integer part of N, N_fix, as a floating-point number. +// The result of this fms is float_N. +RODATA + +.align 16 +// L_hi, L_lo +LOCAL_OBJECT_START(Constants_exp_64_Arg) +data8 0xB17217F400000000,0x00003FF2 // L_hi = hi part log(2)/2^12 +data8 0xF473DE6AF278ECE6,0x00003FD4 // L_lo = lo part log(2)/2^12 +LOCAL_OBJECT_END(Constants_exp_64_Arg) + +LOCAL_OBJECT_START(Constants_exp_64_A) +// Reversed +data8 0xAAAAAAABB1B736A0,0x00003FFA +data8 0xAAAAAAAB90CD6327,0x00003FFC +data8 0xFFFFFFFFFFFFFFFF,0x00003FFD +LOCAL_OBJECT_END(Constants_exp_64_A) + +LOCAL_OBJECT_START(Constants_exp_64_P) +// Reversed +data8 0xD00D6C8143914A8A,0x00003FF2 +data8 0xB60BC4AC30304B30,0x00003FF5 +data8 0x888888887474C518,0x00003FF8 +data8 0xAAAAAAAA8DAE729D,0x00003FFA +data8 0xAAAAAAAAAAAAAF61,0x00003FFC +data8 0x80000000000004C7,0x00003FFE +LOCAL_OBJECT_END(Constants_exp_64_P) + +LOCAL_OBJECT_START(Constants_exp_64_T1) +data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29 +data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5 data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA @@ -330,274 +334,263 @@ data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5 data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177 data4 
0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C -ASM_SIZE_DIRECTIVE(Constants_exp_64_T1) - -.align 64 -Constants_exp_64_T2: -ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object) -data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4 -data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7 -data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E -data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349 -data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987 -data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA -data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610 -data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A -data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8 -data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA -data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50 -data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA -data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07 -data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269 -data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE +LOCAL_OBJECT_END(Constants_exp_64_T1) + +LOCAL_OBJECT_START(Constants_exp_64_T2) +data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4 +data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7 +data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E +data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349 +data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987 +data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA +data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610 +data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A +data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8 +data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA +data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50 +data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA +data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07 +data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269 +data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37 -ASM_SIZE_DIRECTIVE(Constants_exp_64_T2) - -.align 64 -Constants_exp_64_W1: -ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object) -data4 
0x00000000,0x00000000,0x171EC4B4,0xBE384454 -data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6 -data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA -data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50 -data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2 -data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE -data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B -data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04 -data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419 -data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376 -data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A -data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB -data4 0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E -data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA -data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08 -data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B -data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75 -data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79 -data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7 -data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087 -data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB -data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643 -data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C -data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D -data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873 -data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F -data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861 -data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0 -data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC -data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB -data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB -data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148 -ASM_SIZE_DIRECTIVE(Constants_exp_64_W1) - -.align 64 -Constants_exp_64_W2: -ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object) -data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25 -data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8 -data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A -data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E -data4 
0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9 -data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2 -data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0 -data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509 -data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33 -data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D -data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87 -data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3 -data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9 -data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F -data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82 -data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4 -data4 0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D -data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030 -data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29 -data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED -data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B -data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893 -data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35 -data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C -data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313 -data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE -data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426 -data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550 -data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4 -data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31 -data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE -data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9 -ASM_SIZE_DIRECTIVE(Constants_exp_64_W2) - -.align 64 -Constants_log_80_P: -ASM_TYPE_DIRECTIVE(Constants_log_80_P,@object) -// 1/2, P_8, P_7, ..., P_1 -data4 0x00000000, 0x80000000, 0x00003FFE, 0x00000000 -data4 0x3B1042BC, 0xCCCE8B88, 0x0000BFFB, 0x00000000 -data4 0xCADC2149, 0xE38997B7, 0x00003FFB, 0x00000000 -data4 0xB1ACB090, 0xFFFFFFFE, 0x0000BFFB, 0x00000000 -data4 0x06481C81, 0x92492498, 0x00003FFC, 0x00000000 -data4 0xAAAAB0EF, 0xAAAAAAAA, 0x0000BFFC, 0x00000000 -data4 0xCCC91416, 0xCCCCCCCC, 0x00003FFC, 0x00000000 -data4 0x00000000, 0x80000000, 0x0000BFFD, 
0x00000000 -data4 0xAAAAAAAB, 0xAAAAAAAA, 0x00003FFD -ASM_SIZE_DIRECTIVE(Constants_log_80_P) - -.align 64 -Constants_log_80_Q: -ASM_TYPE_DIRECTIVE(Constants_log_80_Q,@object) -// log2_hi, log2_lo, Q_6, Q_5, Q_4, Q_3, Q_2, Q_1 -data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 -data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 -data4 0xA51BE0AF,0x92492453,0x00003FFC,0x00000000 -data4 0xA0CFD29F,0xAAAAAB73,0x0000BFFC,0x00000000 -data4 0xCCCE3872,0xCCCCCCCC,0x00003FFC,0x00000000 -data4 0xFFFFB4FB,0xFFFFFFFF,0x0000BFFC,0x00000000 -data4 0xAAAAAAAB,0xAAAAAAAA,0x00003FFD,0x00000000 -data4 0x00000000,0x80000000,0x0000BFFE,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_log_80_Q) - -.align 64 -Constants_log_80_Z_G_H_h1: -ASM_TYPE_DIRECTIVE(Constants_log_80_Z_G_H_h1,@object) -// Z1 - 16 bit fixed, G1 and H1 IEEE single, h1 IEEE double +LOCAL_OBJECT_END(Constants_exp_64_T2) + +LOCAL_OBJECT_START(Constants_exp_64_W1) +data8 0x0000000000000000, 0xBE384454171EC4B4 +data8 0xBE6947414AA72766, 0xBE5D32B6D42518F8 +data8 0x3E68D96D3A319149, 0xBE68F4DA62415F36 +data8 0xBE6DDA2FC9C86A3B, 0x3E6B2E50F49228FE +data8 0xBE49C0C21188B886, 0x3E64BFC21A4C2F1F +data8 0xBE6A2FBB2CB98B54, 0x3E5DC5DE9A55D329 +data8 0x3E69649039A7AACE, 0x3E54728B5C66DBA5 +data8 0xBE62B0DBBA1C7D7D, 0x3E576E0409F1AF5F +data8 0x3E6125001A0DD6A1, 0xBE66A419795FBDEF +data8 0xBE5CDE8CE1BD41FC, 0xBE621376EA54964F +data8 0x3E6370BE476E76EE, 0x3E390D1A3427EB92 +data8 0x3E1336DE2BF82BF8, 0xBE5FF1CBD0F7BD9E +data8 0xBE60A3550CEB09DD, 0xBE5CA37E0980F30D +data8 0xBE5C541B4C082D25, 0xBE5BBECA3B467D29 +data8 0xBE400D8AB9D946C5, 0xBE5E2A0807ED374A +data8 0xBE66CB28365C8B0A, 0x3E3AAD5BD3403BCA +data8 0x3E526055C7EA21E0, 0xBE442C75E72880D6 +data8 0x3E58B2BB85222A43, 0xBE5AAB79522C42BF +data8 0xBE605CB4469DC2BC, 0xBE589FA7A48C40DC +data8 0xBE51C2141AA42614, 0xBE48D087C37293F4 +data8 0x3E367A1CA2D673E0, 0xBE51BEBB114F7A38 +data8 0xBE6348E5661A4B48, 0xBDF526431D3B9962 +data8 0x3E3A3B5E35A78A53, 0xBE46C46C1CECD788 +data8 0xBE60B7EC7857D689, 
0xBE594D3DD14F1AD7 +data8 0xBE4F9C304C9A8F60, 0xBE52187302DFF9D2 +data8 0xBE5E4C8855E6D68F, 0xBE62140F667F3DC4 +data8 0xBE36961B3BF88747, 0x3E602861C96EC6AA +data8 0xBE3B5151D57FD718, 0x3E561CD0FC4A627B +data8 0xBE3A5217CA913FEA, 0x3E40A3CC9A5D193A +data8 0xBE5AB71310A9C312, 0x3E4FDADBC5F57719 +data8 0x3E361428DBDF59D5, 0x3E5DB5DB61B4180D +data8 0xBE42AD5F7408D856, 0x3E2A314831B2B707 +LOCAL_OBJECT_END(Constants_exp_64_W1) + +LOCAL_OBJECT_START(Constants_exp_64_W2) +data8 0x0000000000000000, 0xBE641F2537A3D7A2 +data8 0xBE68DD57AD028C40, 0xBE5C77D8F212B1B6 +data8 0x3E57878F1BA5B070, 0xBE55A36A2ECAE6FE +data8 0xBE620608569DFA3B, 0xBE53B50EA6D300A3 +data8 0x3E5B5EF2223F8F2C, 0xBE56A0D9D6DE0DF4 +data8 0xBE64EEF3EAE28F51, 0xBE5E5AE2367EA80B +data8 0x3E47CB1A5FCBC02D, 0xBE656BA09BDAFEB7 +data8 0x3E6E70C6805AFEE7, 0xBE6E0509A3415EBA +data8 0xBE56856B49BFF529, 0x3E66DD3300508651 +data8 0x3E51165FC114BC13, 0x3E53333DC453290F +data8 0x3E6A072B05539FDA, 0xBE47CD877C0A7696 +data8 0xBE668BF4EB05C6D9, 0xBE67C3E36AE86C93 +data8 0xBE533904D0B3E84B, 0x3E63E8D9556B53CE +data8 0x3E212C8963A98DC8, 0xBE33138F032A7A22 +data8 0x3E530FA9BC584008, 0xBE6ADF82CCB93C97 +data8 0x3E5F91138370EA39, 0x3E5443A4FB6A05D8 +data8 0x3E63DACD181FEE7A, 0xBE62B29DF0F67DEC +data8 0x3E65C4833DDE6307, 0x3E5BF030D40A24C1 +data8 0x3E658B8F14E437BE, 0xBE631C29ED98B6C7 +data8 0x3E6335D204CF7C71, 0x3E529EEDE954A79D +data8 0x3E5D9257F64A2FB8, 0xBE6BED1B854ED06C +data8 0x3E5096F6D71405CB, 0xBE3D4893ACB9FDF5 +data8 0xBDFEB15801B68349, 0x3E628D35C6A463B9 +data8 0xBE559725ADE45917, 0xBE68C29C042FC476 +data8 0xBE67593B01E511FA, 0xBE4A4313398801ED +data8 0x3E699571DA7C3300, 0x3E5349BE08062A9E +data8 0x3E5229C4755BB28E, 0x3E67E42677A1F80D +data8 0xBE52B33F6B69C352, 0xBE6B3550084DA57F +data8 0xBE6DB03FD1D09A20, 0xBE60CBC42161B2C1 +data8 0x3E56ED9C78A2B771, 0xBE508E319D0FA795 +data8 0xBE59482AFD1A54E9, 0xBE2A17CEB07FD23E +data8 0x3E68BF5C17365712, 0x3E3956F9B3785569 +LOCAL_OBJECT_END(Constants_exp_64_W2) + 
+LOCAL_OBJECT_START(Constants_log_80_P) +// P_8, P_7, ..., P_1 +data8 0xCCCE8B883B1042BC, 0x0000BFFB // P_8 +data8 0xE38997B7CADC2149, 0x00003FFB // P_7 +data8 0xFFFFFFFEB1ACB090, 0x0000BFFB // P_6 +data8 0x9249249806481C81, 0x00003FFC // P_5 +data8 0x0000000000000000, 0x00000000 // Pad for bank conflicts +data8 0xAAAAAAAAAAAAB0EF, 0x0000BFFC // P_4 +data8 0xCCCCCCCCCCC91416, 0x00003FFC // P_3 +data8 0x8000000000000000, 0x0000BFFD // P_2 +data8 0xAAAAAAAAAAAAAAAB, 0x00003FFD // P_1 +LOCAL_OBJECT_END(Constants_log_80_P) + +LOCAL_OBJECT_START(Constants_log_80_Q) +// log2_hi, log2_lo, Q_6, Q_5, Q_4, Q_3, Q_2, Q_1 +data8 0xB172180000000000,0x00003FFE +data8 0x82E308654361C4C6,0x0000BFE2 +data8 0x92492453A51BE0AF,0x00003FFC +data8 0xAAAAAB73A0CFD29F,0x0000BFFC +data8 0xCCCCCCCCCCCE3872,0x00003FFC +data8 0xFFFFFFFFFFFFB4FB,0x0000BFFC +data8 0xAAAAAAAAAAAAAAAB,0x00003FFD +data8 0x8000000000000000,0x0000BFFE +LOCAL_OBJECT_END(Constants_log_80_Q) + +LOCAL_OBJECT_START(Constants_log_80_Z_G_H_h1) +// Z1 - 16 bit fixed, G1 and H1 IEEE single, h1 IEEE double data4 0x00008000,0x3F800000,0x00000000,0x00000000 -data4 0x00000000,0x00000000,0x00000000,0x00000000 +data4 0x00000000,0x00000000,0x00000000,0x00000000 data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000 data4 0xEBA0E0D1,0x8B1D330B,0x00003FDA,0x00000000 data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000 data4 0x9EADD553,0xE2AF365E,0x00003FE2,0x00000000 data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000 -data4 0x752F34A2,0xF585FEC3,0x0000BFE3,0x00000000 +data4 0x752F34A2,0xF585FEC3,0x0000BFE3,0x00000000 data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000 -data4 0x893B03F3,0xF3546435,0x00003FE2,0x00000000 -data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000 -data4 0x39CDD2AC,0xBABA62E0,0x00003FE4,0x00000000 -data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000 +data4 0x893B03F3,0xF3546435,0x00003FE2,0x00000000 +data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000 +data4 0x39CDD2AC,0xBABA62E0,0x00003FE4,0x00000000 +data4 
0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000 data4 0x457978A1,0x8718789F,0x00003FE2,0x00000000 -data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000 -data4 0x3185E56A,0x9442DF96,0x0000BFE4,0x00000000 -data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000 -data4 0x2BBE2CBD,0xCBF9A4BF,0x00003FE4,0x00000000 -data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000 -data4 0x852D5935,0xF3537535,0x00003FE3,0x00000000 -data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000 -data4 0x46CDF32F,0xA1F1E699,0x0000BFDF,0x00000000 -data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000 -data4 0xD8484CE3,0x84A61856,0x00003FE4,0x00000000 +data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000 +data4 0x3185E56A,0x9442DF96,0x0000BFE4,0x00000000 +data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000 +data4 0x2BBE2CBD,0xCBF9A4BF,0x00003FE4,0x00000000 +data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000 +data4 0x852D5935,0xF3537535,0x00003FE3,0x00000000 +data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000 +data4 0x46CDF32F,0xA1F1E699,0x0000BFDF,0x00000000 +data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000 +data4 0xD8484CE3,0x84A61856,0x00003FE4,0x00000000 data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000 -data4 0xFF28821B,0xC7DD97E0,0x0000BFE2,0x00000000 -data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000 -data4 0xEF1FD32F,0xD3C4A887,0x00003FE3,0x00000000 -data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000 -data4 0x464C76DA,0x84672BE6,0x00003FE5,0x00000000 +data4 0xFF28821B,0xC7DD97E0,0x0000BFE2,0x00000000 +data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000 +data4 0xEF1FD32F,0xD3C4A887,0x00003FE3,0x00000000 +data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000 +data4 0x464C76DA,0x84672BE6,0x00003FE5,0x00000000 data4 0x00004211,0x3F042108,0x3F29516A,0x00000000 -data4 0x18835FB9,0x9A43A511,0x0000BFE5,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_log_80_Z_G_H_h1) - -.align 64 -Constants_log_80_Z_G_H_h2: -ASM_TYPE_DIRECTIVE(Constants_log_80_Z_G_H_h2,@object) -// Z2 - 16 bit fixed, G2 and H2 IEEE single, h2 IEEE double -data4 
0x00008000,0x3F800000,0x00000000,0x00000000 -data4 0x00000000,0x00000000,0x00000000,0x00000000 -data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000 +data4 0x18835FB9,0x9A43A511,0x0000BFE5,0x00000000 +LOCAL_OBJECT_END(Constants_log_80_Z_G_H_h1) + +LOCAL_OBJECT_START(Constants_log_80_Z_G_H_h2) +// Z2 - 16 bit fixed, G2 and H2 IEEE single, h2 IEEE double +data4 0x00008000,0x3F800000,0x00000000,0x00000000 +data4 0x00000000,0x00000000,0x00000000,0x00000000 +data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000 data4 0x211398BF,0xAD08B116,0x00003FDB,0x00000000 -data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000 -data4 0xC376958E,0xB106790F,0x00003FDE,0x00000000 -data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000 -data4 0x79A7679A,0xFD03F242,0x0000BFDA,0x00000000 -data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000 -data4 0x05E7AE08,0xF03F81C3,0x0000BFDF,0x00000000 -data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000 +data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000 +data4 0xC376958E,0xB106790F,0x00003FDE,0x00000000 +data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000 +data4 0x79A7679A,0xFD03F242,0x0000BFDA,0x00000000 +data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000 +data4 0x05E7AE08,0xF03F81C3,0x0000BFDF,0x00000000 +data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000 data4 0x049EB22F,0xD1B87D3C,0x00003FDE,0x00000000 -data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000 -data4 0x3A9E81E0,0xFABC8B95,0x00003FDF,0x00000000 +data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000 +data4 0x3A9E81E0,0xFABC8B95,0x00003FDF,0x00000000 data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000 -data4 0x7C4B5443,0xF5F3653F,0x00003FDF,0x00000000 -data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000 -data4 0xF65A1773,0xE78AB204,0x00003FE0,0x00000000 -data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000 -data4 0x7B8EF695,0xDB7CBFFF,0x0000BFE0,0x00000000 -data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000 -data4 0xCF773FB3,0xC0241AEA,0x0000BFE0,0x00000000 -data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000 -data4 
0xC9539FDF,0xFC8F4D48,0x00003FE1,0x00000000 -data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000 -data4 0x954665C2,0x9CD035FB,0x0000BFE1,0x00000000 -data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000 -data4 0xDD367A30,0xEC9017C7,0x00003FE1,0x00000000 -data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000 -data4 0xCB11189C,0xEE6625D3,0x0000BFE1,0x00000000 -data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000 +data4 0x7C4B5443,0xF5F3653F,0x00003FDF,0x00000000 +data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000 +data4 0xF65A1773,0xE78AB204,0x00003FE0,0x00000000 +data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000 +data4 0x7B8EF695,0xDB7CBFFF,0x0000BFE0,0x00000000 +data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000 +data4 0xCF773FB3,0xC0241AEA,0x0000BFE0,0x00000000 +data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000 +data4 0xC9539FDF,0xFC8F4D48,0x00003FE1,0x00000000 +data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000 +data4 0x954665C2,0x9CD035FB,0x0000BFE1,0x00000000 +data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000 +data4 0xDD367A30,0xEC9017C7,0x00003FE1,0x00000000 +data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000 +data4 0xCB11189C,0xEE6625D3,0x0000BFE1,0x00000000 +data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000 data4 0xBE11C424,0xA49C8DB5,0x0000BFE0,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_log_80_Z_G_H_h2) - -.align 64 -Constants_log_80_h3_G_H: -ASM_TYPE_DIRECTIVE(Constants_log_80_h3_G_H,@object) -// h3 IEEE double extended, H3 and G3 IEEE single -data4 0x112666B0,0xAAACAAB1,0x00003FD3,0x3F7FFC00 +LOCAL_OBJECT_END(Constants_log_80_Z_G_H_h2) + +LOCAL_OBJECT_START(Constants_log_80_h3_G_H) +// h3 IEEE double extended, H3 and G3 IEEE single +data4 0x112666B0,0xAAACAAB1,0x00003FD3,0x3F7FFC00 data4 0x9B7FAD21,0x90051030,0x00003FD8,0x3F7FF400 -data4 0xF4D783C4,0xA6B46F46,0x00003FDA,0x3F7FEC00 -data4 0x11C6DDCA,0xDA148D88,0x0000BFD8,0x3F7FE400 +data4 0xF4D783C4,0xA6B46F46,0x00003FDA,0x3F7FEC00 +data4 0x11C6DDCA,0xDA148D88,0x0000BFD8,0x3F7FE400 data4 
0xCA964D95,0xCE65C1D8,0x0000BFD8,0x3F7FDC00 -data4 0x23412D13,0x883838EE,0x0000BFDB,0x3F7FD400 -data4 0x983ED687,0xB7E5CFA1,0x00003FDB,0x3F7FCC08 -data4 0xE3C3930B,0xDBE23B16,0x0000BFD9,0x3F7FC408 -data4 0x48AA4DFC,0x9B92F1FC,0x0000BFDC,0x3F7FBC10 -data4 0xCE9C8F7E,0x9A8CEB15,0x0000BFD9,0x3F7FB410 -data4 0x0DECE74A,0x8C220879,0x00003FDC,0x3F7FAC18 +data4 0x23412D13,0x883838EE,0x0000BFDB,0x3F7FD400 +data4 0x983ED687,0xB7E5CFA1,0x00003FDB,0x3F7FCC08 +data4 0xE3C3930B,0xDBE23B16,0x0000BFD9,0x3F7FC408 +data4 0x48AA4DFC,0x9B92F1FC,0x0000BFDC,0x3F7FBC10 +data4 0xCE9C8F7E,0x9A8CEB15,0x0000BFD9,0x3F7FB410 +data4 0x0DECE74A,0x8C220879,0x00003FDC,0x3F7FAC18 data4 0x2F053150,0xB25CA912,0x0000BFDA,0x3F7FA420 -data4 0xD9A5BE20,0xA5876555,0x00003FDB,0x3F7F9C20 -data4 0x2053F087,0xC919BB6E,0x00003FD9,0x3F7F9428 -data4 0x041E9A77,0xB70BDA79,0x00003FDC,0x3F7F8C30 -data4 0xEA1C9C30,0xF18A5C08,0x00003FDA,0x3F7F8438 -data4 0x796D89E5,0xA3790D84,0x0000BFDD,0x3F7F7C40 -data4 0xA2915A3A,0xE1852369,0x0000BFDD,0x3F7F7448 -data4 0xA39ED868,0xD803858F,0x00003FDC,0x3F7F6C50 -data4 0x9417EBB7,0xB2EEE356,0x0000BFDD,0x3F7F6458 -data4 0x9BB0D07F,0xED5C1F8A,0x0000BFDC,0x3F7F5C68 -data4 0xE87C740A,0xD6D201A0,0x0000BFDD,0x3F7F5470 -data4 0x1CA74025,0xE8DEBF5E,0x00003FDC,0x3F7F4C78 +data4 0xD9A5BE20,0xA5876555,0x00003FDB,0x3F7F9C20 +data4 0x2053F087,0xC919BB6E,0x00003FD9,0x3F7F9428 +data4 0x041E9A77,0xB70BDA79,0x00003FDC,0x3F7F8C30 +data4 0xEA1C9C30,0xF18A5C08,0x00003FDA,0x3F7F8438 +data4 0x796D89E5,0xA3790D84,0x0000BFDD,0x3F7F7C40 +data4 0xA2915A3A,0xE1852369,0x0000BFDD,0x3F7F7448 +data4 0xA39ED868,0xD803858F,0x00003FDC,0x3F7F6C50 +data4 0x9417EBB7,0xB2EEE356,0x0000BFDD,0x3F7F6458 +data4 0x9BB0D07F,0xED5C1F8A,0x0000BFDC,0x3F7F5C68 +data4 0xE87C740A,0xD6D201A0,0x0000BFDD,0x3F7F5470 +data4 0x1CA74025,0xE8DEBF5E,0x00003FDC,0x3F7F4C78 data4 0x1F34A7EB,0x9A995A97,0x0000BFDC,0x3F7F4488 -data4 0x359EED97,0x9CB0F742,0x0000BFDA,0x3F7F3C90 -data4 0xBBC6A1C8,0xD6F833C2,0x0000BFDD,0x3F7F34A0 -data4 
0xE71090EC,0xE1F68F2A,0x00003FDC,0x3F7F2CA8 -data4 0xC160A74F,0xD1881CF1,0x0000BFDB,0x3F7F24B8 -data4 0xD78CB5A4,0x9AD05AE2,0x00003FD6,0x3F7F1CC8 -data4 0x9A77DC4B,0xE658CB8E,0x0000BFDD,0x3F7F14D8 -data4 0x6BD6D312,0xBA281296,0x00003FDC,0x3F7F0CE0 -data4 0xF95210D0,0xB478BBEB,0x0000BFDB,0x3F7F04F0 -data4 0x38800100,0x39400480,0x39A00640,0x39E00C41 // H's start here -data4 0x3A100A21,0x3A300F22,0x3A4FF51C,0x3A6FFC1D +data4 0x359EED97,0x9CB0F742,0x0000BFDA,0x3F7F3C90 +data4 0xBBC6A1C8,0xD6F833C2,0x0000BFDD,0x3F7F34A0 +data4 0xE71090EC,0xE1F68F2A,0x00003FDC,0x3F7F2CA8 +data4 0xC160A74F,0xD1881CF1,0x0000BFDB,0x3F7F24B8 +data4 0xD78CB5A4,0x9AD05AE2,0x00003FD6,0x3F7F1CC8 +data4 0x9A77DC4B,0xE658CB8E,0x0000BFDD,0x3F7F14D8 +data4 0x6BD6D312,0xBA281296,0x00003FDC,0x3F7F0CE0 +data4 0xF95210D0,0xB478BBEB,0x0000BFDB,0x3F7F04F0 +data4 0x38800100,0x39400480,0x39A00640,0x39E00C41 // H's start here +data4 0x3A100A21,0x3A300F22,0x3A4FF51C,0x3A6FFC1D data4 0x3A87F20B,0x3A97F68B,0x3AA7EB86,0x3AB7E101 -data4 0x3AC7E701,0x3AD7DD7B,0x3AE7D474,0x3AF7CBED -data4 0x3B03E1F3,0x3B0BDE2F,0x3B13DAAA,0x3B1BD766 -data4 0x3B23CC5C,0x3B2BC997,0x3B33C711,0x3B3BBCC6 -data4 0x3B43BAC0,0x3B4BB0F4,0x3B53AF6D,0x3B5BA620 -data4 0x3B639D12,0x3B6B9444,0x3B7393BC,0x3B7B8B6D -ASM_SIZE_DIRECTIVE(Constants_log_80_h3_G_H) - -.align 64 -Constant_half: -ASM_TYPE_DIRECTIVE(Constant_half,@object) -data4 0x00000000,0x80000000,0x00003FFE -ASM_SIZE_DIRECTIVE(Constant_half) - -GR_Expo_Range = r32 -GR_Flag = r33 +data4 0x3AC7E701,0x3AD7DD7B,0x3AE7D474,0x3AF7CBED +data4 0x3B03E1F3,0x3B0BDE2F,0x3B13DAAA,0x3B1BD766 +data4 0x3B23CC5C,0x3B2BC997,0x3B33C711,0x3B3BBCC6 +data4 0x3B43BAC0,0x3B4BB0F4,0x3B53AF6D,0x3B5BA620 +data4 0x3B639D12,0x3B6B9444,0x3B7393BC,0x3B7B8B6D +LOCAL_OBJECT_END(Constants_log_80_h3_G_H) + +GR_sig_inv_ln2 = r14 +GR_rshf_2to51 = r15 +GR_exp_2tom51 = r16 +GR_rshf = r17 +GR_exp_half = r18 +GR_sign_mask = r19 +GR_exp_square_oflow = r20 +GR_exp_square_uflow = r21 +GR_exp_ynear1_oflow = r22 
+GR_exp_ynear1_uflow = r23 +GR_signif_Z = r24 + +GR_signexp_x = r32 + +GR_exp_x = r33 + GR_Table_Ptr = r34 GR_Table_Ptr1 = r35 -GR_BIAS = r35 GR_Index1 = r36 -GR_sign_mask = r36 GR_Index2 = r37 GR_Expo_X = r37 -GR_signif_Z = r38 GR_M = r38 GR_X_0 = r39 @@ -620,45 +613,49 @@ GR_k = r44 GR_Big_Pos_Exp = r45 +GR_exp_pos_max = r46 -GR_BIAS_p_k = r47 -GR_BIASed_exp_y = r47 +GR_exp_bias_p_k = r47 -GR_Big_Neg_Exp = r48 GR_Index3 = r48 GR_temp = r48 GR_vsm_expo = r49 -GR_y_sign = r49 GR_T1_ptr = r50 +GR_P_ptr1 = r50 GR_T2_ptr = r51 +GR_P_ptr2 = r51 GR_N_fix = r52 GR_exp_y = r53 GR_signif_y = r54 -GR_exp_and_sign_y = r55 +GR_signexp_y = r55 +GR_fraction_y = r55 GR_low_order_bit = r56 -GR_get_exp_mask = r57 -GR_exponent_zero = r58 - -// ** Registers for unwind support +GR_exp_mask = r57 +GR_exp_bias = r58 +GR_y_sign = r59 +GR_table_base = r60 +GR_ptr_exp_Arg = r61 +GR_Delta_Exp = r62 +GR_Special_Exp = r63 +GR_exp_neg_max = r64 +GR_Big_Neg_Exp = r65 + +//** Registers for unwind support GR_SAVE_PFS = r59 GR_SAVE_B0 = r60 GR_SAVE_GP = r61 -GR_Parameter_X = r62 -GR_Parameter_Y = r63 -GR_Parameter_RESULT = r64 -GR_Parameter_TAG = r65 - -FR_X = f8 -FR_Y = f9 -FR_RESULT = f99 +GR_Parameter_X = r62 +GR_Parameter_Y = r63 +GR_Parameter_RESULT = r64 +GR_Parameter_TAG = r65 -// ** +//** FR_Input_X = f8 -FR_Output = f8 +FR_Result = f8 FR_Input_Y = f9 FR_Neg = f10 @@ -671,7 +668,6 @@ FR_poly_hi = f11 FR_Sgn = f12 -FR_Neg_X = f13 FR_half_W = f13 FR_X_cor = f14 @@ -698,13 +694,11 @@ FR_Scale = f36 FR_G_1 = f37 FR_G = f37 FR_Wsq = f37 -FR_L_Inv = f37 FR_temp = f37 FR_H_1 = f38 FR_H = f38 FR_W4 = f38 -FR_float_N = f38 FR_h = f39 FR_h_1 = f39 @@ -720,9 +714,7 @@ FR_L_lo = f41 FR_A_1 = f41 FR_h_2 = f42 -FR_P_6 = f42 -FR_abs_W = f43 FR_W1 = f43 FR_G_3 = f44 @@ -740,7 +732,6 @@ FR_H_3 = f47 FR_float_N = f48 -FR_P_4 = f49 FR_A_2 = f49 FR_Q_4 = f50 @@ -768,7 +759,6 @@ FR_Two = f56 FR_Big = f57 FR_neg_2_mK = f58 -FR_NBig = f58 FR_r = f59 @@ -777,1652 +767,1253 @@ FR_poly_lo = f60 FR_poly = f61 
FR_P_5 = f62 +FR_Result_small = f62 FR_rsq = f63 -FR_Result = f99 -FR_Result_small = f100 -FR_Result_big = f101 +FR_Delta = f64 -.section .text -.proc powl# -.global powl# -.align 64 +FR_save_Input_X = f65 +FR_norm_X = f66 +FR_norm_Y = f67 +FR_Y_lo_2 = f68 -powl: -{ .mfi -alloc GR_Expo_Range = ar.pfs,0,30,4,0 -(p0) fclass.m.unc p7, p13 = FR_Input_Y, 0x1E7 -nop.i 0 -} -{ .mfi -(p0) getf.exp GR_exp_and_sign_y = FR_Input_Y +FR_P_6 = f69 +FR_Result_big = f69 + +FR_RSHF_2TO51 = f70 +FR_INV_LN2_2TO63 = f71 +FR_2TOM51 = f72 +FR_RSHF = f73 +FR_TMP1 = f74 +FR_TMP2 = f75 +FR_TMP3 = f76 +FR_Tscale = f77 +FR_P_4 = f78 +FR_NBig = f79 + + +.section .text +GLOBAL_LIBM_ENTRY(powl) // -// Save State +// Get significand of x. It is the critical path. // -(p0) fclass.m.unc p6, p12 = FR_Input_X, 0x1E7 -nop.i 0 -};; { .mfi -(p0) getf.sig GR_signif_y = FR_Input_Y -(p0) fcmp.eq.unc.s1 p12, p13 = FR_Input_X, f1 -nop.i 0 + getf.sig GR_signif_Z = FR_Input_X // Get significand of x + fclass.m p11, p12 = FR_Input_X, 0x0b // Test x unorm + nop.i 999 } { .mfi - nop.m 999 -// -// Check for y = 1 -// Identify EM unsupporteds. 
-// Load FR_half = .5 -// -(p0) fadd.s1 FR_Two = f1, f1 -// -// Load 1/2 in GP register -// -nop.i 0 + nop.m 999 + fnorm.s1 FR_norm_X = FR_Input_X // Normalize x + mov GR_exp_half = 0xffff - 1 // Exponent for 0.5 } ;; -{ .mmi - nop.m 999 -(p0) addl GR_Table_Ptr = @ltoff(Constant_half#), gp - nop.i 999 +{ .mfi + alloc r32 = ar.pfs,0,30,4,0 + fclass.m p7, p0 = FR_Input_Y, 0x1E7 // Test y natval, nan, inf, zero + mov GR_exp_pos_max = 0x13fff // Max exponent for pos oflow test +} +{ .mfi + addl GR_table_base = @ltoff(Constants_exp_64_Arg#), gp // Ptr to tables + fnorm.s1 FR_norm_Y = FR_Input_Y // Normalize y + mov GR_exp_neg_max = 0x33fff // Max exponent for neg oflow test } ;; -{ .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] - nop.m 999 - nop.i 999 +{ .mfi + getf.exp GR_signexp_y = FR_Input_Y // Get sign and exp of y +(p12) fclass.m p11, p0 = FR_Input_Y, 0x0b // Test y unorm + mov GR_sign_mask = 0x20000 // Sign mask +} +{ .mfi + ld8 GR_table_base = [GR_table_base] // Get base address for tables + fadd.s1 FR_Two = f1, f1 // Form 2.0 for square test + mov GR_exp_mask = 0x1FFFF // Exponent mask } ;; -{ .mlx -(p0) ldfe FR_Half =[GR_Table_Ptr],0 -(p0) movl GR_get_exp_mask = 0x1FFFF ;; +{ .mfi + getf.sig GR_signif_y = FR_Input_Y // Get significand of y + fclass.m p6, p0 = FR_Input_X, 0x1E7 // Test x natval, nan, inf, zero + nop.i 999 } +;; { .mfi - nop.m 999 -(p0) fclass.nm.unc p9, p15 = FR_Input_Y, 0x1FF -// -// Create FR_Two = 2 -// Get exp and significand of Y -// Crate Masks -// sgn = 1 -// -(p0) and GR_exp_y = GR_get_exp_mask,GR_exp_and_sign_y + getf.exp GR_signexp_x = FR_Input_X // Get signexp of x + fmerge.s FR_save_Input_X = FR_Input_X, FR_Input_X + extr.u GR_Index1 = GR_signif_Z, 59, 4 // Extract upper 4 signif bits of x } -{ .mlx - nop.m 999 -(p0) movl GR_exponent_zero = 0xFFFF ;; +{ .mfb + setf.exp FR_Half = GR_exp_half // Load half + nop.f 999 +(p11) br.cond.spnt POWL_DENORM // Branch if x or y denorm/unorm } +;; + +// Return here from POWL_DENORM +POWL_COMMON: { 
.mfi - nop.m 999 -(p0) mov FR_Sgn = f1 - nop.i 999 + setf.exp FR_Big = GR_exp_pos_max // Form big pos value for oflow test + fclass.nm p11, p0 = FR_Input_Y, 0x1FF // Test Y unsupported + shl GR_Index1 = GR_Index1,5 // Adjust index1 pointer x 32 } { .mfi - nop.m 999 -(p0) fcmp.eq.unc.s1 p10, p11 = FR_Input_Y, f1 - nop.i 999 ;; + add GR_Table_Ptr = 0x7c0, GR_table_base // Constants_log_80_Z_G_H_h1 + fma.s1 FR_Sgn = f1,f1,f0 // Assume result positive + mov GR_exp_bias = 0xFFFF // Form exponent bias } -{ .mfb - nop.m 999 +;; + // // Identify NatVals, NaNs, Infs, and Zeros. -// Load Half // -(p0) fclass.nm.unc p8, p14 = FR_Input_X, 0x1FF -// -// Remove sign bit from exponent of y. -// Check for x = 1 -// -(p6) br.cond.spnt L(POWL_64_SPECIAL) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p7) br.cond.spnt L(POWL_64_SPECIAL) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p8) br.cond.spnt L(POWL_64_UNSUPPORT) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.spnt L(POWL_64_UNSUPPORT) ;; -} -{ .mfi -(p0) cmp.lt.unc p9, p0 = GR_exp_y,GR_exponent_zero -(p0) fcmp.lt.unc.s1 p6, p13 = FR_Input_X, f0 // +// Remove sign bit from exponent of y. +// Check for x = 1 // Branch on Infs, Nans, Zeros, and Natvals // Check to see that exponent < 0 // -(p0) sub GR_exp_y = GR_exp_y,GR_exponent_zero -} -// x not zero, is y ==2? 
{ .mfi - nop.m 999 -(p11) fcmp.eq.unc.s1 p7, p14 = FR_Input_Y, FR_Two - nop.i 999 ;; + setf.exp FR_NBig = GR_exp_neg_max // Form big neg value for oflow test + fclass.nm p8, p0 = FR_Input_X, 0x1FF // Test X unsupported + and GR_exp_y = GR_exp_mask,GR_signexp_y // Get biased exponent of y } { .mfb - nop.m 999 -(p9) fcmp.lt.unc.s1 p9, p0 = FR_Input_X, f0 -(p7) br.cond.spnt L(POWL_64_SQUARE) ;; // Branch if x not zero and y=2 + add GR_Index1 = GR_Index1,GR_Table_Ptr + nop.f 999 +(p6) br.cond.spnt POWL_64_SPECIAL // Branch if x natval, nan, inf, zero } -{ .mfi - nop.m 999 -(p6) fmerge.ns FR_Neg_X = FR_Input_X, FR_Input_X - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p10) fmpy.s0 FR_Result = FR_Input_X, f1 -// -// For y = 1, compute result = x -// For x = 1, compute 1 -// When Y is one return X and possible raise -// denormal operand exception. -// Remove exponent BIAS +;; + +// load Z_1 from Index1 + +// There is logic starting here to determine if y is an integer when x < 0. +// If 0 < |y| < 1 then clearly y is not an integer. +// If |y| > 1, then the significand of y is shifted left by the size of +// the exponent of y. This preserves the lsb of the integer part + the +// fractional bits. The lsb of the integer can be tested to determine if +// the integer is even or odd. The fractional bits can be tested. If zero, +// then y is an integer. // -(p6) shl GR_exp_and_sign_y= GR_signif_y,GR_exp_y ;; -} { .mfi -(p9) or GR_exp_and_sign_y = 0xF,GR_signif_y -(p12) fma.s0 FR_Result = FR_Input_Y, f0, f1 - nop.i 999 ;; + ld2 GR_Z_1 =[GR_Index1],4 // Load Z_1 + fmerge.s FR_Z = f0, FR_norm_X // Z = |x| + extr.u GR_X_0 = GR_signif_Z, 49, 15 // Extract X_0 from significand } -{ .mii - nop.m 999 -(p6) extr.u GR_exp_y = GR_exp_and_sign_y,63,1 ;; -(p6) cmp.ne.unc p9, p0 = GR_exp_y, r0 +{ .mfb + cmp.lt p9, p0 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1 + nop.f 999 +(p7) br.cond.spnt POWL_64_SPECIAL // Branch if y natval, nan, inf, zero } -{ .mii - nop.m 999 -// -// Both predicates can be set. 
-// Don't consider y's < 1. -// -(p6) shl GR_signif_y= GR_exp_and_sign_y,1 ;; -// -// Is shift off integer part of y. -// Get y's even or odd bit. -// -(p6) cmp.ne.unc p8, p0 = GR_signif_y, r0 +;; + +{ .mfb + ldfs FR_G_1 = [GR_Index1],4 // Load G_1 + fcmp.eq.s1 p10, p0 = FR_Input_Y, f1 // Test Y = +1.0 +(p8) br.cond.spnt POWL_64_UNSUPPORT // Branch if x unsupported } -{ .mib - nop.m 999 - nop.i 999 +;; + // -// Is the fractional part of the y = 0? -// Is the integer even or odd. +// X_0 = High order 15 bit of Z // -(p10) br.cond.spnt L(POWL_64_RETURN) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p12) br.cond.spnt L(POWL_64_RETURN) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p8) br.cond.spnt L(POWL_64_XNEG) ;; +{ .mfb + ldfs FR_H_1 = [GR_Index1],8 // Load H_1 +(p9) fcmp.lt.unc.s1 p9, p0 = FR_Input_X, f0 // Test x<0, 0 <|y|<1 +(p11) br.cond.spnt POWL_64_UNSUPPORT // Branch if y unsupported } +;; + { .mfi - nop.m 999 -(p9) fmerge.ns FR_Sgn = FR_Sgn, FR_Sgn - nop.i 999 + ldfe FR_h_1 = [GR_Index1] // Load h_1 + fcmp.eq.s1 p7, p0 = FR_Input_Y, FR_Two // Test y = 2.0 + pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // X_1 = X_0 * Z_1 (bits 15-30) + // Wait 4 cycles to use result } { .mfi - nop.m 999 -(p0) fcmp.eq.unc.s0 p11, p0 = FR_Input_Y, FR_Half - nop.i 999 ;; + add GR_Table_Ptr = 0x9c0, GR_table_base // Constants_log_80_Z_G_H_h2 + nop.f 999 + sub GR_exp_y = GR_exp_y,GR_exp_bias // Get true exponent of y } +;; + // -// Raise possible denormal operand exception for both -// X and Y. +// Branch for (x < 0) and Y not an integer. // { .mfb - nop.m 999 -// -// Branch for (x < 0) and Y not an integer. -// -(p0) fcmp.eq.unc.s0 p12, p0 = FR_Input_X, f1 -// -// For x < 0 and y integer, make x positive -// For x < 0 and y odd integer,, set sign = -1. 
-// -(p11) br.cond.spnt L(POWL_64_SQRT) ;; -} -{ .mmf -(p0) cmp.eq.unc p15, p14 = r0, r0 - nop.m 999 -(p13) fnorm.s1 FR_Z = FR_Input_X ;; -} -{ .mfi - nop.m 999 -(p6) fnorm.s1 FR_Z = FR_Neg_X - nop.i 999 + nop.m 999 + fcmp.lt.s1 p6, p0 = FR_Input_X, f0 // Test x < 0 +(p9) br.cond.spnt POWL_64_XNEG // Branch if x < 0, 0 < |y| < 1 } ;; -// -// Branch to embedded sqrt(x) -// -// -// Computes ln( x ) to extra precision -// Input FR 1: FR_X -// Output FR 2: FR_Y_hi -// Output FR 3: FR_Y_lo -// Output PR 1: PR_Safe -// - -{ .mmi +{ .mfi nop.m 999 -(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_Z_G_H_h1#), gp + fcmp.eq.s1 p12, p0 = FR_Input_X, f1 // Test x=+1.0 nop.i 999 } +{ .mfb + nop.m 999 + fsub.s1 FR_W = FR_Z, f1 // W = Z - 1 +(p7) br.cond.spnt POWL_64_SQUARE // Branch if y=2 +} ;; -{ .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] +{ .mfi nop.m 999 - nop.i 999 +(p10) fmpy.s0 FR_Result = FR_Input_X, f1 // If y=+1.0, result=x +(p6) shl GR_fraction_y= GR_signif_y,GR_exp_y // Get lsb of int + fraction + // Wait 4 cycles to use result } ;; - -{ .mlx - nop.m 999 -(p0) movl GR_BIAS = 0x000000000000FFFF ;; -} { .mfi - nop.m 999 -(p0) fsub.s1 FR_W = FR_Z, f1 - nop.i 999 ;; -} -// -// Z = Norm(X) - both + and - case -// Set Safe = True -// -{ .mmb -(p0) getf.sig GR_signif_Z = FR_Z -(p0) getf.exp GR_N = FR_Z - nop.b 999 ;; -} -{ .mii - nop.m 999 -// -// Get significand of Z -// W = Z - 1 -// -(p0) extr.u GR_Index1 = GR_signif_Z, 59, 4 ;; -// -// Index1 = High order 4 bits of Z -// X_0 = High order 15 bit of Z -// -(p0) shl GR_Index1 = GR_Index1,5 ;; -} -{ .mfi - nop.m 999 -// -// Add offset to Index1 ptr. -// -(p0) fabs FR_abs_W = FR_W -// -// BIAS = 0x000...FFFF -// Adjust Index1 ptr ( x 32) . 
-// -(p0) add GR_Index1 = GR_Index1,GR_Table_Ptr + nop.m 999 +(p12) fma.s0 FR_Result = FR_Input_Y, f0, f1 // If x=1.0, result=1, chk denorm + extr.u GR_Index2 = GR_X_1, 6, 4 // Extract index2 } -{ .mmi - nop.m 999 ;; -(p0) ld2 GR_Z_1 =[GR_Index1],4 -(p0) extr.u GR_X_0 = GR_signif_Z, 49, 15 +;; + +// +// N = exponent of Z +// +{ .mib + getf.exp GR_N = FR_Z // Get exponent of Z (also x) + shl GR_Index2=GR_Index2,5 // Index2 x 32 bytes +(p10) br.ret.spnt b0 // Exit if y=+1.0 } ;; -{ .mmi - nop.m 999 -(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_Z_G_H_h2#), gp +{ .mib + add GR_Index2 = GR_Index2, GR_Table_Ptr // Pointer to table 2 nop.i 999 +(p12) br.ret.spnt b0 // Exit if x=+1.0 } ;; { .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] - nop.m 999 + ld2 GR_Z_2 =[GR_Index2],4 // Load Z_2 +;; + ldfs FR_G_2 = [GR_Index2],4 // Load G_2 nop.i 999 } ;; - -{ .mmi -(p0) ldfs FR_G_1 = [GR_Index1],4 ;; -(p0) ldfs FR_H_1 = [GR_Index1],8 - nop.i 999 ;; +{ .mii + ldfs FR_H_2 = [GR_Index2],8 // Load H_2 +(p6) tbit.nz.unc p9, p0 = GR_fraction_y, 63 // Test x<0 and y odd integer + add GR_Table_Ptr = 0xbcc, GR_table_base // Constants_log_80_h3_G_H, G_3 } +;; + // -// Adjust Index2 (x 32). +// For x < 0 and y odd integer,, set sign = -1. 
// { .mfi -(p0) ldfe FR_h_1 = [GR_Index1],0 - nop.f 999 -(p0) pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 ;; -} -{ .mmi - nop.m 999 ;; -// -// load Z_1 from Index1 -// abs_W = |W| -// Point to Table2 -// -(p0) getf.exp GR_M = FR_abs_W -// -// M = M - BIAS -// Load G_1 -// N = exponent of Z -// - nop.i 999;; + getf.exp GR_M = FR_W // Get signexp of W + nop.f 999 + pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // X_2 = X_1 * Z_2 (bits 15-30) } -{ .mmi - nop.m 999 - nop.m 999 - nop.i 999;; +{ .mfi + ldfe FR_h_2 = [GR_Index2] // Load h_2 +(p9) fnma.s1 FR_Sgn = f1, f1, f0 // If x<0, y odd int, result negative + sub GR_N = GR_N, GR_exp_bias // Get true exponent of x = N } -{ .mmi - nop.m 999 - nop.m 999 - nop.i 999;; +;; + +{ .mfi + add GR_Table_Ptr1 = 0xdc0, GR_table_base // Ptr to H_3 + fcmp.eq.s0 p11, p0 = FR_Input_Y, FR_Half // Test y=0.5, also set denorm +(p6) shl GR_fraction_y= GR_fraction_y, 1 // Shift left 1 to get fraction } -{ .mmi - nop.m 999 - nop.m 999 -(p0) extr.u GR_Index2 = GR_X_1, 6, 4 ;; +;; + +{ .mmb + setf.sig FR_float_N = GR_N +(p6) cmp.ne.unc p8, p0 = GR_fraction_y, r0 // Test x<0 and y not integer +(p8) br.cond.spnt POWL_64_XNEG // Branch if x<0 and y not int } -{ .mii - nop.m 999 -// -// Extract Index2 -// Load H_1 -// Is -8 > M ? +;; + // -(p0) shl GR_Index2=GR_Index2,5 ;; -(p0) add GR_Index2 = GR_Index2, GR_Table_Ptr -} +// Raise possible denormal operand exception for both X and Y. 
+// Set pointers in case |x| near 1 +// Branch to embedded sqrt(x) if y=0.5 // -// M = exponent of abs_W -// X_1 = X_0 * Z_1 -// -{ .mii -(p0) sub GR_M = GR_M, GR_BIAS - nop.i 999 ;; -(p0) cmp.gt.unc p7, p14 = -8, GR_M +{ .mfi + add GR_P_ptr1 = 0x6b0, GR_table_base // Constants_log_80_P, P8, NEAR path + fcmp.eq.s0 p12, p0 = FR_Input_X, FR_Input_Y // Dummy to set denormal + add GR_P_ptr2 = 0x700, GR_table_base // Constants_log_80_P, P4, NEAR path } -{ .mib - nop.m 999 - nop.i 999 -(p7) br.cond.spnt L(LOGL80_NEAR) ;; +{ .mfb + cmp.eq p15, p14 = r0, r0 // Assume result safe (no over/under) + fsub.s1 FR_Delta = FR_Input_Y,f1 // Delta = y - 1.0 +(p11) br.cond.spnt POWL_64_SQRT // Branch if y=0.5 } +;; + // -// Load h_1 -// Possible branch out. -// Add offset of table to Index2 +// Computes ln( x ) to extra precision +// Input FR 1: FR_X +// Output FR 2: FR_Y_hi +// Output FR 3: FR_Y_lo +// Output PR 1: PR_Safe // { .mfi -(p0) ld2 GR_Z_2 =[GR_Index2],4 -(p0) fmerge.se FR_S = f1,FR_Z -(p0) sub GR_N = GR_N, GR_BIAS + and GR_M = GR_exp_mask, GR_M // Mask to get exponent of W + nop.f 999 + extr.u GR_Index3 = GR_X_2, 1, 5 // Get index3 } ;; { .mmi - nop.m 999 -(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_h3_G_H#), gp - nop.i 999 + shladd GR_Table_Ptr1 = GR_Index3,2,GR_Table_Ptr1 // Ptr to H_3 + shladd GR_Index3 = GR_Index3,4,GR_Table_Ptr // Ptr to G_3 + sub GR_M = GR_M, GR_exp_bias // Get true exponent of W } ;; -{ .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] - nop.m 999 - nop.i 999 +{ .mib + ldfs FR_G_3 = [GR_Index3],-12 // Load G_3 + cmp.gt p7, p14 = -8, GR_M // Test if |x-1| < 2^-8 +(p7) br.cond.spnt LOGL80_NEAR // Branch if |x-1| < 2^-8 } ;; -// -// load Z_2 -// N - BIAS -// Point to Table 3. -// S = merging of Z and 1.0 -// -{ .mmi -(p0) ldfs FR_G_2 = [GR_Index2],4 -(p0) setf.sig FR_float_N = GR_N -(p0) add GR_Table_Ptr1 = 0x200,GR_Table_Ptr ;; -} -// -// load G_2 -// X_2 = X_1 * Z_2 -// Add offset to Table 2 ptr. 
-// float_N = significand of N -// -{ .mmi -(p0) ldfs FR_H_2 = [GR_Index2],8 ;; -// -// load H_2 -// G = G * G_2 -// -(p0) ldfe FR_h_2 = [GR_Index2],0 -(p0) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 ;; -} -{ .mmi - nop.m 999 - nop.m 999 - nop.i 999;; -} -{ .mmi - nop.m 999 - nop.m 999 - nop.i 999;; -} -{ .mmi - nop.m 999 - nop.m 999 - nop.i 999;; +// Here if |x-1| >= 2^-8 +{ .mmf + ldfs FR_H_3 = [GR_Table_Ptr1] // Load H_3 + nop.m 999 + nop.f 999 } -{ .mii - nop.m 999 - nop.i 999 ;; -(p0) extr.u GR_Index3 = GR_X_2, 1, 5 ;; +;; + +{ .mfi + ldfe FR_h_3 = [GR_Index3] // Load h_3 + fmerge.se FR_S = f1,FR_Z // S = merge of 1.0 and signif(Z) + nop.i 999 } { .mfi -(p0) shladd GR_Table_Ptr1 = GR_Index3,2,GR_Table_Ptr1 - nop.f 999 + add GR_Table_Ptr = 0x740, GR_table_base // Constants_log_80_Q + fmpy.s1 FR_G = FR_G_1, FR_G_2 // G = G_1 * G_2 + nop.i 999 +} +;; + // -// h = h_1 + h_2 -// Adjust Index3 +// Begin Loading Q's - load log2_hi part // -(p0) shladd GR_Index3 = GR_Index3,4,GR_Table_Ptr ;; -} -{ .mmb - nop.m 999 -(p0) ldfe FR_h_3 = [GR_Index3],12 - nop.b 999 ;; -} -{ .mmf -(p0) ldfs FR_H_3 = [GR_Table_Ptr1],0 +{ .mfi + ldfe FR_log2_hi = [GR_Table_Ptr],16 // Load log2_hi + fadd.s1 FR_H = FR_H_1, FR_H_2 // H = H_1 + H_2 + nop.i 999 +};; + // -// float_N = Make N a fp number -// Load h_3 -// Get pointer to Q table. 
+// h = h_1 + h_2 // -(p0) ldfs FR_G_3 = [GR_Index3],0 -(p0) fmpy.s1 FR_G = FR_G_1, FR_G_2 +{ .mfi + ldfe FR_log2_lo = [GR_Table_Ptr],16 // Load log2_lo + fadd.s1 FR_h = FR_h_1, FR_h_2 // h = h_1 + h_2 + nop.i 999 } ;; -{ .mmi - nop.m 999 -(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_Q#), gp +{ .mfi + ldfe FR_Q_6 = [GR_Table_Ptr],16 // Load Q_6 + fcvt.xf FR_float_N = FR_float_N nop.i 999 } ;; -{ .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] - nop.m 999 +{ .mfi + ldfe FR_Q_5 = [GR_Table_Ptr],16 // Load Q_5 + nop.f 999 nop.i 999 } ;; - - -{ .mfi -(p0) ldfe FR_log2_hi = [GR_Table_Ptr],16 -(p0) fadd.s1 FR_H = FR_H_1, FR_H_2 - nop.i 999 ;; -} -{ .mmf - nop.m 999 -// -// G = G_1 * G_2 * G_3 -// -(p0) ldfe FR_log2_lo = [GR_Table_Ptr],16 -// -// load h_2 -// H = H_1 + H_2 -// Get Index3 // -(p0) fadd.s1 FR_h = FR_h_1, FR_h_2 ;; -} -// -// Load log2_lo part -// r = G*S -1 +// G = G_1 * G_2 * G_3 // { .mfi -(p0) ldfe FR_Q_6 = [GR_Table_Ptr],16 -// -// Load H_3 -// -(p0) fcvt.xf FR_float_N = FR_float_N - nop.i 999 ;; + ldfe FR_Q_4 = [GR_Table_Ptr],16 // Load Q_4 + fmpy.s1 FR_G = FR_G, FR_G_3 + nop.i 999 } +;; + // -// Load Q_6 +// H = H_1 + H_2 + H_3 // -{ .mmi -(p0) ldfe FR_Q_5 = [GR_Table_Ptr],16 ;; -(p0) ldfe FR_Q_4 = [GR_Table_Ptr],16 - nop.i 999 ;; -} -{ .mmi -(p0) ldfe FR_Q_3 = [GR_Table_Ptr],16 ;; -(p0) ldfe FR_Q_2 = [GR_Table_Ptr],16 - nop.i 999 ;; +{ .mfi + ldfe FR_Q_3 = [GR_Table_Ptr],16 // Load Q_3 + fadd.s1 FR_H = FR_H, FR_H_3 + nop.i 999 } -{ .mmf - nop.m 999 -// -// poly_lo = Q_5 + r * Q_6 -// Load Q_2 -// rsq = r * r +;; + // -(p0) ldfe FR_Q_1 = [GR_Table_Ptr],16 +// Y_lo = poly + Y_lo // -// h = h_1 + h_2 + h_3 -// H = H_1 + H_2 + H_3 -// Load G_3. 
-// Begin Loading Q's - load log2_hi part +// h = h_1 + h_2 + h_3 // -(p0) fmpy.s1 FR_G = FR_G, FR_G_3 -} { .mfi - nop.m 999 -(p0) fadd.s1 FR_H = FR_H, FR_H_3 - nop.i 999 + ldfe FR_Q_2 = [GR_Table_Ptr],16 // Load Q_2 + fadd.s1 FR_h = FR_h, FR_h_3 + nop.i 999 } ;; // -// Y_lo = poly + Y_lo +// GS_hi = G*S +// r = G*S -1 // - -{ .mmi - nop.m 999 -(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Arg#), gp +{ .mfi + ldfe FR_Q_1 = [GR_Table_Ptr],16 // Load Q_1 + fmpy.s1 FR_GS_hi = FR_G, FR_S nop.i 999 } -;; - -{ .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] +{ .mfi nop.m 999 + fms.s1 FR_r = FR_G, FR_S, f1 nop.i 999 } ;; - -{ .mfi - nop.m 999 -(p0) fadd.s1 FR_h = FR_h, FR_h_3 - nop.i 999 ;; -} -{ .mfi - nop.m 999 // -// Load Q_5 +// poly_lo = Q_5 + r * Q_6 // -(p0) fmpy.s1 FR_GS_hi = FR_G, FR_S - nop.i 999 -} -{ .mfi - nop.m 999 -(p0) fms.s1 FR_r = FR_G, FR_S, f1 - nop.i 999 ;; -} { .mfi - nop.m 999 -(p0) fma.s1 FR_poly_lo = FR_r, FR_Q_6, FR_Q_5 - nop.i 999 + getf.exp GR_Delta_Exp = FR_Delta // Get signexp of y-1 for exp calc + fma.s1 FR_poly_lo = FR_r, FR_Q_6, FR_Q_5 + nop.i 999 } -{ .mfi - nop.m 999 // -// GS_hi = G*S -// Load Q_4 +// r_cor = GS_hi -1 // -(p0) fsub.s1 FR_r_cor = FR_GS_hi, f1 - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p0) fms.s1 FR_GS_lo = FR_G, FR_S, FR_GS_hi - nop.i 999 -} { .mfi - nop.m 999 -(p0) fma.s1 FR_poly = FR_r, FR_Q_2, FR_Q_1 - nop.i 999 ;; + nop.m 999 + fsub.s1 FR_r_cor = FR_GS_hi, f1 + nop.i 999 } -{ .mfi - nop.m 999 +;; + // -// Load Q_3 -// r_cor = GS_hi -1 // GS_lo = G*S - GS_hi // -(p0) fmpy.s1 FR_rsq = FR_r, FR_r - nop.i 999 -} { .mfi - nop.m 999 -(p0) fma.s1 FR_G = FR_float_N, FR_log2_hi, FR_H - nop.i 999 ;; + nop.m 999 + fms.s1 FR_GS_lo = FR_G, FR_S, FR_GS_hi + nop.i 999 } -{ .mfi - nop.m 999 +;; + // -// poly = poly_hi + rsq * poly_lo -// Tbl = float_N*log2_hi + H +// rsq = r * r // -(p0) fma.s1 FR_Y_lo = FR_float_N, FR_log2_lo, FR_h - nop.i 999 ;; -} { .mfi - nop.m 999 -// -// r_cor = r_cor - r -// poly_hi = r * Q_2 + Q_1 -// -(p0) fma.s1 
FR_poly_lo = FR_r, FR_poly_lo, FR_Q_4 - nop.i 999 + nop.m 999 + fmpy.s1 FR_rsq = FR_r, FR_r + nop.i 999 } -{ .mfi - nop.m 999 // -// Load Q_1 +// G = float_N*log2_hi + H // -(p0) fsub.s1 FR_r_cor = FR_r_cor, FR_r - nop.i 999 ;; -} { .mfi - nop.m 999 -// -// Y_lo = float_N*log2_lo + h -// -(p0) fadd.s1 FR_Y_hi = FR_G, FR_r - nop.i 999 ;; + nop.m 999 + fma.s1 FR_G = FR_float_N, FR_log2_hi, FR_H + nop.i 999 } -{ .mfi - nop.m 999 +;; + // -// poly_lo = Q_4 + r * poly_lo;; -// r_cor = r_cor + GS_lo;; +// Y_lo = float_N*log2_lo + h // -(p0) fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_3 - nop.i 999 -} { .mfi - nop.m 999 -(p0) fadd.s1 FR_r_cor = FR_r_cor, FR_GS_lo - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p0) fadd.s1 FR_r_cor = FR_r_cor, FR_Y_lo - nop.i 999 + nop.m 999 + fma.s1 FR_Y_lo = FR_float_N, FR_log2_lo, FR_h + nop.i 999 } -{ .mfi - nop.m 999 +;; + // -// poly_lo = Q_3 + r * poly_lo;; +// poly_lo = Q_4 + r * poly_lo +// r_cor = r_cor - r // -(p0) fma.s1 FR_poly = FR_rsq, FR_poly_lo, FR_poly - nop.i 999 ;; -} { .mfi - nop.m 999 -(p0) fsub.s1 FR_Y_lo = FR_G, FR_Y_hi - nop.i 999 -} -{ .mmi -(p0) ldfe FR_L_Inv = [GR_Table_Ptr],16 ;; -(p0) ldfe FR_L_hi = [GR_Table_Ptr],16 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_4 + nop.i 999 } { .mfi -(p0) ldfe FR_L_lo = [GR_Table_Ptr],16 - nop.f 999 - nop.i 999 ;; + nop.m 999 + fsub.s1 FR_r_cor = FR_r_cor, FR_r + nop.i 999 } -{ .mfi - nop.m 999 +;; + // -// Y_hi = Tbl + r -// r_cor = r_cor + Y_lo +// poly_hi = r * Q_2 + Q_1 +// Y_hi = G + r // -(p0) fma.s1 FR_poly = FR_rsq, FR_poly, FR_r_cor - nop.i 999 ;; -} { .mfi - nop.m 999 -// Y_lo = Tbl - Y_hi -// poly = rsq * poly + r_cor -// -(p0) fadd.s1 FR_Y_lo = FR_Y_lo, FR_r - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly = FR_r, FR_Q_2, FR_Q_1 + nop.i 999 } -{ .mfb - nop.m 999 -// -// Y_lo = Y_lo + r -// -(p0) fadd.s1 FR_Y_lo = FR_Y_lo, FR_poly -// -// Load L_Inv -// Load L_hi -// Load L_lo -// all long before they are needed. 
-// They are used in LOGL_RETURN PATH -// -br.cond.sptk L(LOGL_RETURN) ;; +{ .mfi + nop.m 999 + fadd.s1 FR_Y_hi = FR_G, FR_r + nop.i 999 } -L(LOGL80_NEAR): +;; + // -// Branch LOGL80_NEAR +// poly_lo = Q_3 + r * poly_lo +// r_cor = r_cor + GS_lo // - -{ .mmi +{ .mfi nop.m 999 -(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_P#), gp + fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_3 nop.i 999 } -;; - -{ .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] +{ .mfi nop.m 999 + fadd.s1 FR_r_cor = FR_r_cor, FR_GS_lo nop.i 999 } ;; -{ .mfi - nop.m 999 -(p0) fmpy.s1 FR_Wsq = FR_W, FR_W -(p0) add GR_Table_Ptr1 = 0x50,GR_Table_Ptr -} // -// Adjust ptr to 1/2 -// Adjust Ptr1 to P_4 +// Y_lo = G - Y_hi // -{ .mmi -(p0) ldfe FR_Half = [GR_Table_Ptr],16 ;; -(p0) ldfe FR_P_4 = [GR_Table_Ptr1],16 - nop.i 999 +{ .mfi + nop.m 999 + fsub.s1 FR_Y_lo_2 = FR_G, FR_Y_hi + nop.i 999 } +;; + // -// Load 1/2 +// r_cor = r_cor + Y_lo +// poly = poly_hi + rsq * poly_lo // -{ .mmi -(p0) ldfe FR_P_8 = [GR_Table_Ptr],16 ;; -(p0) ldfe FR_P_3 = [GR_Table_Ptr1],16 - nop.i 999 +{ .mfi + add GR_Table_Ptr = 0x0, GR_table_base // Constants_exp_64_Arg + fadd.s1 FR_r_cor = FR_r_cor, FR_Y_lo + nop.i 999 } -{ .mmi -(p0) ldfe FR_P_7 = [GR_Table_Ptr],16 ;; -(p0) ldfe FR_P_2 = [GR_Table_Ptr1],16 - nop.i 999 +{ .mfi + nop.m 999 + fma.s1 FR_poly = FR_rsq, FR_poly_lo, FR_poly + nop.i 999 } +;; + // -// Load P_7 -// half_W = .5 * W -// Load P_3 -// -{ .mmi -(p0) ldfe FR_P_6 = [GR_Table_Ptr],16 ;; -(p0) ldfe FR_P_1 = [GR_Table_Ptr1],16 - nop.i 999 ;; -} +// Load L_hi +// Load L_lo +// all long before they are needed. 
+// They are used in LOGL_RETURN PATH // -// Load P_6 -// Wsq = w * w -// poly = w*P_4 + P_3 -// Load P_2 +// Y_lo = Y_lo + r +// poly = rsq * poly + r_cor // { .mfi -(p0) ldfe FR_P_5 = [GR_Table_Ptr],16 -// -// Load P_5 -// poly_lo = w * P_8 + P_7 -// Y_hi = w - (1/2)w*w -// Load P_1 -// -(p0) fmpy.s1 FR_W4 = FR_Wsq, FR_Wsq - nop.i 999 + ldfe FR_L_hi = [GR_Table_Ptr],16 // Load L_hi + fadd.s1 FR_Y_lo = FR_Y_lo_2, FR_r + nop.i 999 } { .mfi - nop.m 999 -(p0) fmpy.s1 FR_W3 = FR_Wsq, FR_W - nop.i 999 + nop.m 999 + fma.s1 FR_poly = FR_rsq, FR_poly, FR_r_cor + nop.i 999 } ;; +{ .mfb + ldfe FR_L_lo = [GR_Table_Ptr],16 // Load L_lo + fadd.s1 FR_Y_lo = FR_Y_lo, FR_poly + br.cond.sptk LOGL_RETURN // Branch to common code +} +;; + + +LOGL80_NEAR: +// Here if |x-1| < 2^-8 // -// Y_lo = W3 * poly + Y_lo +// Branch LOGL80_NEAR // +{ .mmf + ldfe FR_P_8 = [GR_P_ptr1],16 // Load P_8 + ldfe FR_P_4 = [GR_P_ptr2],16 // Load P_4 + fmpy.s1 FR_Wsq = FR_W, FR_W +} +;; + { .mmi - nop.m 999 -(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Arg#), gp + ldfe FR_P_7 = [GR_P_ptr1],16 // Load P_7 + ldfe FR_P_3 = [GR_P_ptr2],16 // Load P_3 nop.i 999 } ;; { .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] - nop.m 999 + ldfe FR_P_6 = [GR_P_ptr1],16 // Load P_6 + ldfe FR_P_2 = [GR_P_ptr2],16 // Load P_2 nop.i 999 } ;; - { .mmi -(p0) ldfe FR_L_Inv = [GR_Table_Ptr],16 ;; -(p0) ldfe FR_L_hi = [GR_Table_Ptr],16 - nop.i 999 ;; -} -{ .mfi -(p0) ldfe FR_L_lo = [GR_Table_Ptr],16 -// -// Load P_8 -// Load P_4 -// -(p0) fmpy.s1 FR_half_W = FR_Half, FR_W - nop.i 999 ;; + ldfe FR_P_5 = [GR_P_ptr1],16 // Load P_5 + ldfe FR_P_1 = [GR_P_ptr2],16 // Load P_1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fma.s1 FR_poly_lo = FR_W, FR_P_8,FR_P_7 - nop.i 999 + getf.exp GR_Delta_Exp = FR_Delta // Get signexp of y-1 for exp calc + fmpy.s1 FR_W4 = FR_Wsq, FR_Wsq + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 FR_poly = FR_W, FR_P_4, FR_P_3 - nop.i 999 ;; + add GR_Table_Ptr = 0x0, GR_table_base // Constants_exp_64_Arg + fmpy.s1 
FR_W3 = FR_Wsq, FR_W + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fnma.s1 FR_Y_hi = FR_W, FR_half_W, FR_W - nop.i 999 ;; + nop.m 999 + fmpy.s1 FR_half_W = FR_Half, FR_W + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// W4 = Wsq * Wsq -// poly = w *poly + P_2 -// -(p0) fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_6 - nop.i 999 + ldfe FR_L_hi = [GR_Table_Ptr],16 + fma.s1 FR_poly_lo = FR_W, FR_P_8,FR_P_7 + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 FR_poly = FR_W, FR_poly, FR_P_2 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly = FR_W, FR_P_4, FR_P_3 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fsub.s1 FR_Y_lo = FR_W, FR_Y_hi - nop.i 999 ;; + ldfe FR_L_lo = [GR_Table_Ptr],16 + fnma.s1 FR_Y_hi = FR_W, FR_half_W, FR_W + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// poly = w * poly + P_1 -// w3 = wsq * w -// -(p0) fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_5 - nop.i 999 + nop.m 999 + fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_6 + nop.i 999 } { .mfi - nop.m 999 -// -// poly_lo = w * poly_lo + P_6 -// Y_lo = W - Y_hi -// -(p0) fma.s1 FR_poly = FR_W, FR_poly, FR_P_1 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly = FR_W, FR_poly, FR_P_2 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fnma.s1 FR_Y_lo = FR_W, FR_half_W, FR_Y_lo - nop.i 999 ;; + nop.m 999 + fsub.s1 FR_Y_lo = FR_W, FR_Y_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// poly_lo = w * poly_lo + -// Y_lo = Y_lo - w * (1/2)w -// -(p0) fma.s1 FR_poly = FR_poly_lo, FR_W4, FR_poly - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_5 + nop.i 999 } { .mfi - nop.m 999 -// -// Y_lo = (W-Y_hi) - w * (1/2)w -// poly = W4* poly_lo + poly -// -(p0) fma.s1 FR_Y_lo = FR_poly, FR_W3, FR_Y_lo - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly = FR_W, FR_poly, FR_P_1 + nop.i 999 } -L(LOGL_RETURN): +;; + { .mfi -(p0) add GR_Expo_Range = 0x2,r0 -// -// Load L_Inv -// Load L_hi -// Load L_lo -// all long before they are needed. -// -// -// kernel_log_80 computed ln(X) -// and return logX_hi and logX_lo as results. 
-// PR_pow_Safe set as well. -// -(p0) fmpy.s1 FR_X_lo = FR_Input_Y, FR_logx_lo -// -// Compute Y * (logX_hi + logX_lo) -// P_hi -> X -// P_lo -> X_cor -// (Manipulate names so that inputs are in -// the place kernel_exp expects them) -// Set GR_Flag to 2 -// Set GR_Expo_Range to Double -// -// This function computes exp( x + x_cor) -// Input FR 1: FR_X -// Input FR 2: FR_X_cor -// Input GR 1: GR_Flag -// Input GR 2: GR_Expo_Range -// Output FR 3: FR_Y_hi -// Output FR 4: FR_Y_lo -// Output FR 5: FR_Scale -// Output PR 1: PR_Safe -// -(p0) cmp.eq.unc p15, p0 = r0, r0 + nop.m 999 + fnma.s1 FR_Y_lo = FR_W, FR_half_W, FR_Y_lo + nop.i 999 } ;; -{ .mmi -(p0) addl GR_W1_ptr = @ltoff(Constants_exp_64_W1#), gp -(p0) addl GR_W2_ptr = @ltoff(Constants_exp_64_W2#), gp -(p0) add GR_Flag = 0x2,r0 +{ .mfi + nop.m 999 + fma.s1 FR_poly = FR_poly_lo, FR_W4, FR_poly + nop.i 999 } ;; -{ .mmi - ld8 GR_W1_ptr = [GR_W1_ptr] - ld8 GR_W2_ptr = [GR_W2_ptr] -(p0) cmp.ne.unc p7, p0 = 0x1, GR_Flag +{ .mfi + nop.m 999 + fma.s1 FR_Y_lo = FR_poly, FR_W3, FR_Y_lo + nop.i 999 } ;; -{ .mlx - nop.m 999 -(p0) movl GR_Mask = 0x1FFFF ;; -} +LOGL_RETURN: +// Common code for completion of both logx paths -{ .mlx - nop.m 999 -(p0) movl GR_BIAS = 0x0FFFF ;; -} -{ .mfi - nop.m 999 // -// X_lo = Y * logX_lo +// L_hi, L_lo already loaded. // -(p0) fma.s1 FR_P_hi = FR_Input_Y, FR_logx_hi,FR_X_lo - nop.i 999 ;; -} -{ .mfi - nop.m 999 // -// Set Safe=True -// Flag is always 2 for this routine +// kernel_log_80 computed ln(X) +// and return logX_hi and logX_lo as results. +// PR_pow_Safe set as well. 
// -(p0) fmpy.s1 FR_float_N = FR_X, FR_L_Inv - nop.i 999 -} -{ .mfi - nop.m 999 // -// X_hi = Y * logX_hi + X_lo -// Set GR_Flag = 2 for exp(x + xcor) +// Compute Y * (logX_hi + logX_lo) +// P_hi -> X +// P_lo -> X_cor +// (Manipulate names so that inputs are in +// the place kernel_exp expects them) // -(p0) fms.s1 FR_P_lo= FR_Input_Y, FR_logx_hi, FR_P_hi - nop.i 999 ;; +// This function computes exp( x + x_cor) +// Input FR 1: FR_X +// Input FR 2: FR_X_cor +// Output FR 3: FR_Y_hi +// Output FR 4: FR_Y_lo +// Output FR 5: FR_Scale +// Output PR 1: PR_Safe +// +// P15 is True +// +// Load constants used in computing N using right-shift technique +{ .mlx + mov GR_exp_2tom51 = 0xffff-51 + movl GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2 } -{ .mmi - nop.m 999 ;; -(p0) getf.exp GR_Expo_X = FR_X - nop.i 999 ;; +{ .mlx + add GR_Special_Exp = -50,GR_exp_bias + movl GR_rshf_2to51 = 0x4718000000000000 // 1.10000 2^(63+51) } -{ .mfi -(p0) and GR_Expo_X = GR_Expo_X, GR_Mask +;; + // -// Calculate unBIASed exponent of X // Point to Table of W1s // Point to Table of W2s // -(p0) fcvt.fx.s1 FR_N = FR_float_N - nop.i 999 ;; -} +{ .mmi + add GR_W1_ptr = 0x2b0, GR_table_base // Constants_exp_64_W1 + add GR_W2_ptr = 0x4b0, GR_table_base // Constants_exp_64_W2 + cmp.le p6,p0= GR_Delta_Exp,GR_Special_Exp +};; + +// Form two constants we need +// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128 +// 1.1000..000 * 2^(63+63-12) to right shift int(N) into the significand + { .mfi - nop.m 999 -(p0) fadd.s1 FR_P_lo = FR_P_lo, FR_X_lo -// -// Float_N = X * L_Inv -// Create exponent BIAS -// Get BIASed exponent of X -// -(p0) sub GR_Expo_X = GR_Expo_X, GR_BIAS ;; + setf.sig FR_INV_LN2_2TO63 = GR_sig_inv_ln2 // form 1/ln2 * 2^63 + nop.f 999 + and GR_Delta_Exp=GR_Delta_Exp,GR_exp_mask // Get exponent of y-1 } -{ .mib -(p0) cmp.gt.unc p9, p0 = -6, GR_Expo_X - nop.i 999 -// -// N = fcvt.fx(float_N) -// If -6 > Expo_X, set P9 -// -(p9) br.cond.spnt L(EXPL_SMALL) +{ .mlx + setf.d 
FR_RSHF_2TO51 = GR_rshf_2to51 // Form const 1.1000 * 2^(63+51) + movl GR_rshf = 0x43e8000000000000 // 1.10000 2^63 for right shift } ;; -// -// If expo_X < -6 goto exp_small -// -{ .mmi +{ .mfi nop.m 999 -(p0) addl GR_T1_ptr = @ltoff(Constants_exp_64_T1#), gp -(p0) cmp.lt.unc p10, p0 = 14, GR_Expo_X + fmpy.s1 FR_X_lo = FR_Input_Y, FR_logx_lo // logx_lo is Y_lo + cmp.eq p15, p0= r0, r0 // Set p15, assume safe +};; + +{ .mmi + setf.exp FR_2TOM51 = GR_exp_2tom51 // Form 2^-51 for scaling float_N + setf.d FR_RSHF = GR_rshf // Form right shift const 1.1000 * 2^63 + add GR_Table_Ptr1 = 0x50, GR_table_base // Constants_exp_64_P for + // EXPL_SMALL path } ;; { .mmi - ld8 GR_T1_ptr = [GR_T1_ptr] - nop.m 999 + ldfe FR_P_6 = [GR_Table_Ptr1],16 // Load P_6 for EXPL_SMALL path +;; + ldfe FR_P_5 = [GR_Table_Ptr1],16 // Load P_5 for EXPL_SMALL path nop.i 999 } ;; -{ .mib - nop.m 999 - nop.i 999 -// -// If 14 < Expo_X, set P10 -// Create pointer to T1 table -// -(p10) br.cond.spnt L(EXPL_HUGE) ;; +{ .mfi + ldfe FR_P_4 = [GR_Table_Ptr1],16 // Load P_4 for EXPL_SMALL path + fma.s1 FR_P_hi = FR_Input_Y, FR_logx_hi,FR_X_lo // logx_hi ix Y_hi + nop.i 999 } - +;; { .mmi -(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Exponents#), gp -(p0) addl GR_T2_ptr = @ltoff(Constants_exp_64_T2#), gp + ldfe FR_P_3 = [GR_Table_Ptr1],16 // Load P_3 for EXPL_SMALL path +;; + ldfe FR_P_2 = [GR_Table_Ptr1],16 // Load P_2 for EXPL_SMALL path nop.i 999 } ;; -{ .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] - ld8 GR_T2_ptr = [GR_T2_ptr] +// N = X * Inv_log2_by_2^12 +// By adding 1.10...0*2^63 we shift and get round_int(N_signif) in significand. +// We actually add 1.10...0*2^51 to X * Inv_log2 to do the same thing. 
+{ .mfi + ldfe FR_P_1 = [GR_Table_Ptr1] // Load P_1 for EXPL_SMALL path + fma.s1 FR_N = FR_X, FR_INV_LN2_2TO63, FR_RSHF_2TO51 nop.i 999 } +{ .mfb + nop.m 999 + fms.s1 FR_P_lo= FR_Input_Y, FR_logx_hi, FR_P_hi // P_hi is X +(p6) br.cond.spnt POWL_Y_ALMOST_1 // Branch if |y-1| < 2^-50 +} ;; - { .mmi -(p0) shladd GR_Table_Ptr = GR_Expo_Range,4,GR_Table_Ptr ;; -// -// Adjust T1_ptr by x 4 for single-precision values -// Adjust T2_ptr by x 4 for single-precision values -// -(p0) ld8 GR_Big_Pos_Exp = [GR_Table_Ptr],8 - nop.i 999 ;; -} -// -// Load double W1 -// Load +max exponent -// -{ .mfi -(p0) ld8 GR_Big_Neg_Exp = [GR_Table_Ptr],0 -// -// If 14 < Expo_X, goto exp_huge -// -(p0) fcvt.xf FR_float_N = FR_N - nop.i 999 + getf.exp GR_Expo_X = FR_X + add GR_T1_ptr = 0x0b0, GR_table_base // Constants_exp_64_T1 + add GR_T2_ptr = 0x1b0, GR_table_base // Constants_exp_64_T2 } ;; -// -// Load double W2 -// Load -max exponent -// Load ptr to A's -// +// float_N = round_int(N) +// The signficand of N contains the rounded integer part of X * 2^12/ln2, +// as a twos complement number in the lower bits (that is, it may be negative). +// That twos complement number (called N) is put into GR_N_fix. -{ .mmi -(p0) getf.sig GR_N_fix = FR_N -(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_A#), gp +// Since N is scaled by 2^51, it must be multiplied by 2^-51 +// before the shift constant 1.10000 * 2^63 is subtracted to yield float_N. 
+// Thus, float_N contains the floating point version of N + + +{ .mfi + add GR_Table_Ptr = 0x20, GR_table_base // Constants_exp_64_A + fms.s1 FR_float_N = FR_N, FR_2TOM51, FR_RSHF // Form float_N nop.i 999 } -;; +// Create low part of Y(ln(x)_hi + ln(x)_lo) as P_lo +{ .mfi + mov GR_Big_Pos_Exp = 0x3ffe // 16382, largest safe exponent + fadd.s1 FR_P_lo = FR_P_lo, FR_X_lo + mov GR_Big_Neg_Exp = -0x3ffd // -16381 smallest safe exponent +};; -{ .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] +{ .mfi nop.m 999 - nop.i 999 + fmpy.s1 FR_rsq = FR_X, FR_X // rsq = X*X for EXPL_SMALL path + mov GR_vsm_expo = -70 // Exponent for very small path +} +{ .mfi + nop.m 999 + fma.s1 FR_poly_lo = FR_P_6, FR_X, FR_P_5 // poly_lo for EXPL_SMALL path + add GR_temp = 0x1,r0 // For tiny signif if small path } ;; // -// Load single T1 -// Load single T2 -// W_1_p1 = W_1 + 1 -// -{ .mmi -(p0) ldfe FR_A_3 = [GR_Table_Ptr],16 ;; -// -// Load A_3 -// if k > big_pos_exp, set p14 and Safe=False -// -(p0) ldfe FR_A_2 = [GR_Table_Ptr],16 -(p0) extr.u GR_M1 = GR_N_fix, 6, 6 -} -{ .mmi - nop.m 999 ;; -(p0) shladd GR_W1_ptr = GR_M1,3,GR_W1_ptr -// -// float_N = fcvt.xf(N) -// N_fix = significand of N -// Create pointer to T2 table -// -(p0) extr.u GR_M2 = GR_N_fix, 0, 6 -} -// -// r = r + X_cor -// Adjust W1_ptr by x 8 for double-precision values -// Adjust W2_ptr by x 8 for double-precision values -// Adjust Table_ptr by Expo_Rangex16 +// If expo_X < -6 goto exp_small // { .mmi -(p0) shladd GR_T1_ptr = GR_M1,2,GR_T1_ptr ;; -(p0) ldfd FR_W1 = [GR_W1_ptr],0 -(p0) shladd GR_W2_ptr = GR_M2,3,GR_W2_ptr + getf.sig GR_N_fix = FR_N + ldfe FR_A_3 = [GR_Table_Ptr],16 // Load A_3 + and GR_Expo_X = GR_Expo_X, GR_exp_mask // Get exponent of X } -// -// Load ptr to A's -// +;; + { .mfi -(p0) ldfs FR_T1 = [GR_T1_ptr],0 -(p0) fnma.s1 FR_r = FR_L_hi, FR_float_N, FR_X -(p0) shladd GR_T2_ptr = GR_M2,2,GR_T2_ptr ;; + ldfe FR_A_2 = [GR_Table_Ptr],16 // Load A_2 + nop.f 999 + sub GR_Expo_X = GR_Expo_X, GR_exp_bias // Get true 
exponent of X } -{ .mmi -(p0) ldfd FR_W2 = [GR_W2_ptr],0 -(p0) ldfs FR_T2 = [GR_T2_ptr],0 +;; + // -// r = x - L_hi * float_N -// M2 = extr.u(N_fix,0,6) -// M1 = extr.u(N_fix,6,6) +// If -6 > Expo_X, set P9 and branch // -(p0) extr GR_k = GR_N_fix, 12, 52 ;; +{ .mfb + cmp.gt p9, p0 = -6, GR_Expo_X + fnma.s1 FR_r = FR_L_hi, FR_float_N, FR_X // r = X - L_hi * float_N +(p9) br.cond.spnt EXPL_SMALL // Branch if |X| < 2^-6 } +;; + // -// Load A_1 -// poly = A_3 * r + A_2 -// rsq = r*r +// If 14 <= Expo_X, set P10 // -{ .mii -(p0) add GR_BIAS_p_k = GR_BIAS, GR_k -(p0) cmp.gt.unc p14,p15 = GR_k,GR_Big_Pos_Exp ;; -(p15) cmp.lt p14,p15 = GR_k,GR_Big_Neg_Exp +{ .mib + cmp.le p10, p0 = 14, GR_Expo_X + nop.i 999 +(p10) br.cond.spnt EXPL_HUGE // Branch if |X| >= 2^14 } +;; + // -// BIAS_p_K = BIAS + k -// T = T1 * T2 +// Load single T1 +// Load single T2 +// W_1_p1 = W_1 + 1 // -{ .mfi -(p0) setf.exp FR_Scale = GR_BIAS_p_k - nop.f 999 - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p0) fnma.s1 FR_r = FR_L_lo, FR_float_N, FR_r - nop.i 999 +{ .mmi + nop.m 999 + nop.m 999 + extr.u GR_M1 = GR_N_fix, 6, 6 // Extract index M_1 } +;; + // -// W = W_1_p1 * W2 + W1 +// k = extr.u(N_fix,0,6) // -{ .mfi -(p0) ldfe FR_A_1 = [GR_Table_Ptr],16 - nop.f 999 - nop.i 999 ;; +{ .mmi + shladd GR_W1_ptr = GR_M1,3,GR_W1_ptr // Point to W1 + shladd GR_T1_ptr = GR_M1,2,GR_T1_ptr // Point to T1 + extr.u GR_M2 = GR_N_fix, 0, 6 // Extract index M_2 } -{ .mfi - nop.m 999 -(p0) fadd.s1 FR_W_1_p1 = FR_W1, f1 - nop.i 999 ;; +;; + +// N_fix is only correct up to 50 bits because of our right shift technique. +// Actually in the normal path we will have restricted K to about 14 bits. +// Somewhat arbitrarily we extract 32 bits. +{ .mmi + ldfd FR_W1 = [GR_W1_ptr] + shladd GR_W2_ptr = GR_M2,3,GR_W2_ptr // Point to W2 + extr GR_k = GR_N_fix, 12, 32 // Extract k } +;; + { .mfi - nop.m 999 -// -// k = extr.u(N_fix,0,6) -// r = r - N * L_lo -// Load ptr to Table of exponent thresholds. 
-// -(p0) fadd.s1 FR_r = FR_r, FR_X_cor - nop.i 999 + ldfs FR_T1 = [GR_T1_ptr] + fnma.s1 FR_r = FR_L_lo, FR_float_N, FR_r + shladd GR_T2_ptr = GR_M2,2,GR_T2_ptr // Point to T2 } { .mfi - nop.m 999 -(p0) fmpy.s1 FR_T = FR_T1, FR_T2 - nop.i 999 ;; + add GR_exp_bias_p_k = GR_exp_bias, GR_k + nop.f 999 + cmp.gt p14,p15 = GR_k,GR_Big_Pos_Exp } -{ .mfi - nop.m 999 +;; + // -// if k < big_neg_exp, set p14 and Safe=False -// Load A_2 +// if k < big_neg_exp, set p14 and Safe=False // -(p0) fma.s1 FR_W = FR_W2, FR_W_1_p1, FR_W1 - nop.i 999 ;; +{ .mmi + ldfs FR_T2 = [GR_T2_ptr] +(p15) cmp.lt p14,p15 = GR_k,GR_Big_Neg_Exp + nop.i 999 } -{ .mfi - nop.m 999 -(p0) fma.s1 FR_poly = FR_r, FR_A_3, FR_A_2 - nop.i 999 +;; + +{ .mmi + setf.exp FR_Scale = GR_exp_bias_p_k + ldfd FR_W2 = [GR_W2_ptr] + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fmpy.s1 FR_rsq = FR_r, FR_r - nop.i 999 ;; + ldfe FR_A_1 = [GR_Table_Ptr],16 + fadd.s1 FR_r = FR_r, FR_X_cor + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) mov FR_Y_hi = FR_T - nop.i 999 ;; + nop.m 999 + fadd.s1 FR_W_1_p1 = FR_W1, f1 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Scale = set_exp(BIAS_p_k) -// poly = r * poly + A_1 -// -(p0) fadd.s1 FR_Wp1 = FR_W, f1 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly = FR_r, FR_A_3, FR_A_2 + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 FR_poly = FR_r, FR_poly, FR_A_1 - nop.i 999 ;; + nop.m 999 + fmpy.s1 FR_rsq = FR_r, FR_r + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fma.s1 FR_poly = FR_rsq, FR_poly,FR_r - nop.i 999 ;; + nop.m 999 + fmpy.s1 FR_T = FR_T1, FR_T2 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Wp1 = W + 1 -// poly = rsq * poly + rk -// -(p0) fma.s1 FR_Y_lo = FR_Wp1, FR_poly, FR_W - nop.i 999 ;; -} -{ .mfb - nop.m 999 -// -// Y_lo = poly * Wp1 + W -// Y_hi = T -// -(p0) fmpy.s1 FR_Y_lo = FR_Y_lo, FR_T -// -// Y_lo = T * Y_lo -// -(p0) br.cond.sptk L(EXPL_RETURN) ;; + nop.m 999 + fma.s1 FR_W = FR_W2, FR_W_1_p1, FR_W1 + nop.i 999 } +;; -L(EXPL_SMALL): - -// -// r4 = rsq * rsq -// - -{ .mmi +{ .mfi 
nop.m 999 -(p0) addl GR_Table_Ptr1 = @ltoff(Constants_exp_64_P), gp + fma.s1 FR_TMP1 = FR_Scale, FR_Sgn, f0 nop.i 999 } ;; -{ .mmi - ld8 GR_Table_Ptr1 = [GR_Table_Ptr1] +{ .mfi nop.m 999 + fma.s1 FR_poly = FR_r, FR_poly, FR_A_1 nop.i 999 } ;; -{ .mmf - nop.m 999 -(p0) ldfe FR_P_6 = [GR_Table_Ptr1],16 -// -// Return -// -(p0) fadd.s1 FR_r = FR_X,f0 ;; +{ .mfi + nop.m 999 + fma.s1 FR_TMP2 = FR_T, f1, f0 // TMP2 = Y_hi = T + nop.i 999 } +;; -{ .mmi +{ .mfi nop.m 999 -(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Exponents#), gp + fadd.s1 FR_Wp1 = FR_W, f1 nop.i 999 } ;; -{ .mmi - ld8 GR_Table_Ptr = [GR_Table_Ptr] -(p0) ldfe FR_P_5 = [GR_Table_Ptr1],16 +{ .mfi + nop.m 999 + fma.s1 FR_poly = FR_rsq, FR_poly,FR_r nop.i 999 } ;; -// -// Is input very small? -// Load P_5 -// -{ .mii -(p0) ldfe FR_P_4 = [GR_Table_Ptr1],16 -(p0) add GR_Table_Ptr = 0x040,GR_Table_Ptr ;; -(p0) shladd GR_Table_Ptr = GR_Expo_Range,3,GR_Table_Ptr ;; -} -{ .mmb -(p0) ldfe FR_P_3 = [GR_Table_Ptr1],16 -// -// Adjust ptr. 
-// -(p0) ld8 GR_vsm_expo = [GR_Table_Ptr],0 - nop.b 999 ;; -} { .mfi - nop.m 999 -// -// r = X (don't seem to need X_Cor) -// Load the threshold exponents -// -(p0) fmpy.s1 FR_rsq = FR_r, FR_r - nop.i 999 ;; + nop.m 999 + fma.s1 FR_Tscale = FR_T, FR_TMP1, f0 // Scale * Sgn * T + nop.i 999 } -// -// Load the negative integer -// Load P_5 -// { .mfi -(p0) cmp.lt.unc p12, p0 = GR_Expo_X, GR_vsm_expo - nop.f 999 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_Y_lo = FR_Wp1, FR_poly, FR_W + nop.i 999 } +;; + { .mfb - nop.m 999 -// -// rsq = r * r -// Offset into exponents -// -(p0) fmpy.s1 FR_r4 = FR_rsq, FR_rsq -(p12) br.cond.spnt L(EXPL_VERY_SMALL) ;; + nop.m 999 + fmpy.s1 FR_TMP3 = FR_Y_lo, FR_Tscale + br.cond.sptk POWL_64_SHARED } -{ .mfi -(p0) ldfe FR_P_2 = [GR_Table_Ptr1],16 -// -// Load p4,p3,p2,p1 -// -(p0) fma.s1 FR_poly_lo = FR_P_6, FR_r, FR_P_5 +;; + + +EXPL_SMALL: +// Here if |ylogx| < 2^-6 // -// Y_lo = r4 * poly_lo + poly_hi -// Scale = 1.0 +// Begin creating lsb to perturb final result // -(p0) add GR_temp = 0x1,r0 ;; +{ .mfi + setf.sig FR_temp = GR_temp + fma.s1 FR_poly_lo = FR_poly_lo, FR_X, FR_P_4 + cmp.lt p12, p0 = GR_Expo_X, GR_vsm_expo // Test |ylogx| < 2^-70 } -{ .mmf - nop.m 999 -(p0) ldfe FR_P_1 = [GR_Table_Ptr1],0 -(p0) mov FR_Scale = f1 +{ .mfi + nop.m 999 + fma.s1 FR_poly_hi = FR_P_2, FR_X, FR_P_1 + nop.i 999 } -// -// Begin creating lsb to perturb final result -// +;; + { .mfi -(p0) setf.sig FR_temp = GR_temp -(p0) mov FR_Y_hi = f1 - nop.i 999 ;; + nop.m 999 + fmpy.s1 FR_TMP2 = f1, f1 + nop.i 999 } { .mfi - nop.m 999 -// -// poly_lo = p_5 + p_6 * r -// poly_hi = p_1 + p_2 * r -// -(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_P_4 - nop.i 999 ;; + nop.m 999 + fmpy.s1 FR_TMP1 = FR_Sgn, f1 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// poly_lo = p_4 + poly_lo * r -// poly_hi = r + poly_hi * rsq -// -(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_P_3 - nop.i 999 + nop.m 999 + fmpy.s1 FR_r4 = FR_rsq, FR_rsq +(p12) cmp.eq p15, p0 = r0, r0 // Set safe if 
|ylogx| < 2^-70 } +{ .mfb + nop.m 999 +(p12) fmpy.s1 FR_TMP3 = FR_Sgn, FR_X +(p12) br.cond.spnt POWL_64_SHARED // Branch if |ylogx| < 2^-70 +} +;; + { .mfi - nop.m 999 -(p0) fma.s1 FR_poly_hi = FR_P_2, FR_r, FR_P_1 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly_lo = FR_poly_lo, FR_X, FR_P_3 + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, FR_r - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, FR_X + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// poly_lo = p_3 + poly_lo * r -// Y_hi = 1, always -// -(p0) fma.s1 FR_Y_lo = FR_poly_lo, FR_r4, FR_poly_hi - nop.i 999 ;; + nop.m 999 + fma.s1 FR_Y_lo = FR_poly_lo, FR_r4, FR_poly_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Set lsb in fp register -// -(p0) for FR_temp = FR_Y_lo,FR_temp - nop.i 999 ;; + nop.m 999 + fmpy.s1 FR_TMP3 = FR_Y_lo, FR_TMP1 // Add sign info + nop.i 999 } -{ .mfb - nop.m 999 +;; + // // Toggle on last bit of Y_lo -// -(p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_temp -// // Set lsb of Y_lo to 1 // -(p0) br.cond.sptk L(EXPL_RETURN) ;; -} -L(EXPL_VERY_SMALL): { .mfi - nop.m 999 -(p0) mov FR_Y_lo = FR_r -(p0) cmp.eq.unc p15, p0 = r0, r0 + nop.m 999 + for FR_temp = FR_Y_lo,FR_temp + nop.i 999 } -{ .mfi - nop.m 999 -(p0) mov FR_Scale = f1 - nop.i 999 -};; +;; + { .mfb - nop.m 999 -(p0) mov FR_Y_hi = f1 -// -// If flag_not_1, -// Y_hi = 1.0 -// Y_lo = X + X_cor -// PR_Safe = true -// -(p0) br.cond.sptk L(EXPL_RETURN) ;; + nop.m 999 + fmerge.se FR_TMP3 = FR_TMP3,FR_temp + br.cond.sptk POWL_64_SHARED } -L(EXPL_HUGE): +;; + + +EXPL_HUGE: +// Here if |ylogx| >= 2^14 { .mfi - nop.m 999 -// -// Return for flag=2 -// -(p0) fcmp.gt.unc.s1 p12, p13 = FR_X, f0 -(p0) cmp.eq.unc p14, p15 = r0, r0 ;; + mov GR_temp = 0x0A1DC // If X < 0, exponent -24100 + fcmp.gt.s1 p12, p13 = FR_X, f0 // Test X > 0 + cmp.eq p14, p15 = r0, r0 // Set Safe to false } -{ .mlx - nop.m 999 -// -// Set Safe to false -// Is x > 0 -// -(p12) movl GR_Mask = 0x15DC0 ;; -} -{ .mlx -(p12) setf.exp FR_Y_hi = 
GR_Mask -(p13) movl GR_Mask = 0xA240 ;; +;; + +{ .mmi +(p12) mov GR_Mask = 0x15DC0 // If X > 0, exponent +24000 +(p13) mov GR_Mask = 0x0A240 // If X < 0, exponent -24000 + nop.i 999 } -{ .mlx -(p13) setf.exp FR_Y_hi = GR_Mask -// -// x > 0: Create mask for Y_hi = 2**(24,000) -// x <= 0: Create mask for Y_hi = 2**(-24,000) -// -(p13) movl GR_temp = 0xA1DC ;; +;; + +{ .mmf + setf.exp FR_TMP2 = GR_Mask // Form Y_hi = TMP2 +(p13) setf.exp FR_Y_lo = GR_temp // If X < 0, Y_lo = 2^-24100 +(p12) mov FR_Y_lo = f1 // IF X > 0, Y_lo = 1.0 } +;; + { .mfi -(p13) setf.exp FR_Y_lo = GR_temp -// -// x < =0: Create mask for 2**(-24,100) -// x <= 0: Y_lo = w**(-24,100) -// -(p12) mov FR_Y_lo = f1 - nop.i 999 ;; + nop.m 999 + fmpy.s1 FR_TMP1 = FR_TMP2, FR_Sgn // TMP1 = Y_hi * Sgn + nop.i 999 } -{ .mfi - nop.m 999 -(p12) mov FR_Scale = FR_Y_hi - nop.i 999 ;; +;; + +{ .mfb + nop.m 999 + fmpy.s1 FR_TMP3 = FR_Y_lo,FR_TMP1 // TMP3 = Y_lo * (Y_hi * Sgn) + br.cond.sptk POWL_64_SHARED } -{ .mfi - nop.m 999 +;; + +POWL_Y_ALMOST_1: +// Here if delta = |y-1| < 2^-50 // -// x > 0: Y_lo = 1.0 -// x > 0: Scale = 2**(24,000) +// x**(1 + delta) = x * e (ln(x)*delta) = x ( 1 + ln(x) * delta) // -(p13) mov FR_Scale = FR_Y_hi - nop.i 999 ;; -} -L(EXPL_RETURN): +// Computation will be safe for 2^-16381 <= x < 2^16383 + { .mfi - nop.m 999 -// -// Scale = 2**(24,000) -// -// -// exp(y *ln(x)) almost complete -// FR_Scale is Scale -// f34 is Z_hi -// f35 is Z_lo -// -(p0) fmpy.s1 FR_Sgn = FR_Scale, FR_Sgn - nop.i 999 ;; + mov GR_exp_ynear1_oflow = 0xffff + 16383 + fma.s1 FR_TMP1 = FR_Input_X,FR_Delta,f0 + and GR_exp_x = GR_exp_mask, GR_signexp_x } +;; + { .mfi - nop.m 999 -// -// sgn * scale -// -(p0) fmpy.s1 FR_Y_lo = FR_Y_lo,FR_Sgn - nop.i 999 ;; + cmp.lt p15, p14 = GR_exp_x, GR_exp_ynear1_oflow + fma.s1 FR_TMP2 = FR_logx_hi,f1,FR_X_lo + mov GR_exp_ynear1_uflow = 0xffff - 16381 } +;; + { .mfb - nop.m 999 -// -// Z_lo * (sgn * scale) +(p15) cmp.ge p15, p14 = GR_exp_x, GR_exp_ynear1_uflow + fma.s1 FR_TMP3 
= FR_Input_X,f1,f0 + br.cond.sptk POWL_64_SHARED +};; + +POWL_64_SQUARE: // -(p0) fma.s0 FR_Result = FR_Y_hi, FR_Sgn, FR_Y_lo +// Here if x not zero and y=2. // -// Z_hi * (sgn * scale) + Z_lo +// Setup for multipath code // -(p15) br.cond.sptk L(POWL_64_RETURN) ;; -} { .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x01 - nop.i 999 -} -{ .mlx - nop.m 999 -// -// Z_hi * (sgn * scale) + Z_lo with wre & td -// Z_hi * (sgn * scale) + Z_lo with fz & td -// -(p0) movl GR_T1_ptr = 0x00000000013FFF ;; + mov GR_exp_square_oflow = 0xffff + 8192 // Exponent where x*x overflows + fmerge.se FR_TMP1 = FR_Input_X, FR_Input_X + and GR_exp_x = GR_exp_mask, GR_signexp_x // Get exponent of x } +;; + { .mfi - nop.m 999 -(p0) fma.s3 FR_Result_small = FR_Y_hi, FR_Sgn, FR_Y_lo - nop.i 999 + cmp.lt p15, p14 = GR_exp_x, GR_exp_square_oflow // Decide safe/unsafe + fmerge.se FR_TMP2 = FR_Input_X, FR_Input_X + mov GR_exp_square_uflow = 0xffff - 8191 // Exponent where x*x underflows } +;; + { .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x40 - nop.i 999 ;; +(p15) cmp.ge p15, p14 = GR_exp_x, GR_exp_square_uflow // Decide safe/unsafe + fma.s1 FR_TMP3 = f0,f0,f0 + nop.i 999 } -{ .mfi - nop.m 999 +;; + // -// Return if no danger of over of underflow. +// This is the shared path that will set overflow and underflow. // -(p0) fsetc.s2 0x7F,0x42 - nop.i 999;; -} -{ .mfi - nop.m 999 +POWL_64_SHARED: + // -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// S3 user supplied status + FZ + TD (Underflows) +// Return if no danger of over or underflow. 
// -(p0) fma.s2 FR_Result_big = FR_Y_hi, FR_Sgn, FR_Y_lo - nop.i 999 ;; +{ .mfb + nop.m 999 + fma.s0 FR_Result = FR_TMP1, FR_TMP2, FR_TMP3 +(p15) br.ret.sptk b0 // Main path return if certain no over/underflow } +;; + // -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// S3 user supplied status + FZ + TD (Underflows) +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S2 user supplied status + FZ + TD (Underflows) // // // If (Safe) is true, then @@ -2430,973 +2021,741 @@ L(EXPL_RETURN): // No overflow or underflow here, but perhaps inexact. // Return // Else -// Determine if overflow or underflow was raised. -// Fetch +/- overflow threshold for IEEE single, double, -// double extended -// -{ .mfi -(p0) setf.exp FR_Big = GR_T1_ptr -(p0) fsetc.s2 0x7F,0x40 - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p0) fclass.m.unc p11, p0 = FR_Result_small, 0x00F - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p0) fmerge.ns FR_NBig = FR_Big, FR_Big - nop.i 999 -} -{ .mfi - nop.m 999 -// -// Create largest double exponent + 1. -// Create smallest double exponent - 1. 
-// Identify denormals -// -(p0) fcmp.ge.unc.s1 p8, p0 = FR_Result_big , FR_Big - nop.i 999 ;; -} -{ .mii - nop.m 999 - nop.i 999 ;; -// -// fcmp: resultS2 <= - overflow threshold -// fclass: resultS3 is denorm/unorm/0 -// -(p8) mov GR_Parameter_TAG = 18 ;; -} -{ .mfb - nop.m 999 -// -// fcmp: resultS2 >= + overflow threshold -// -(p0) fcmp.le.unc.s1 p9, p0 = FR_Result_big, FR_NBig -(p8) br.cond.spnt __libm_error_region ;; -} -{ .mii - nop.m 999 - nop.i 999 ;; -(p9) mov GR_Parameter_TAG = 18 -} -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.spnt __libm_error_region ;; -} -// -// Report that pow overflowed - either +Inf, or -Inf -// -{ .mmb -(p11) mov GR_Parameter_TAG = 19 - nop.m 999 -(p11) br.cond.spnt __libm_error_region ;; -} -{ .mib - nop.m 999 - nop.i 999 -// -// Report that pow underflowed -// -(p0) br.cond.sptk L(POWL_64_RETURN) ;; -} - +// Determine if overflow or underflow was raised. +// Fetch +/- overflow threshold for IEEE double extended -L(POWL_64_SQUARE): -// Here if x not zero and y=2. -// Must call __libm_error_support for overflow or underflow -// -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// S3 user supplied status + FZ + TD (Underflows) -// { .mfi - nop.m 999 -(p0) fma.s0 FR_Result = FR_Input_X, FR_Input_X, f0 - nop.i 999 -} -{ .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x01 - nop.i 999 -} -{ .mlx - nop.m 999 -(p0) movl GR_T1_ptr = 0x00000000013FFF ;; -} -{ .mfi - nop.m 999 -(p0) fma.s3 FR_Result_small = FR_Input_X, FR_Input_X, f0 - nop.i 999 -} -{ .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x40 - nop.i 999 ;; + nop.m 999 + fsetc.s2 0x7F,0x41 // For underflow test, set S2=User+TD+FTZ + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Return if no danger of over of underflow. 
-// -(p0) fsetc.s2 0x7F,0x42 - nop.i 999;; + nop.m 999 + fma.s2 FR_Result_small = FR_TMP1, FR_TMP2, FR_TMP3 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fma.s2 FR_Result_big = FR_Input_X, FR_Input_X, f0 - nop.i 999 ;; + nop.m 999 + fsetc.s2 0x7F,0x42 // For overflow test, set S2=User+TD+WRE + nop.i 999 } -// -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// S3 user supplied status + FZ + TD (Underflows) -// -// -// If (Safe) is true, then -// Compute result using user supplied status field. -// No overflow or underflow here, but perhaps inexact. -// Return -// Else -// Determine if overflow or underflow was raised. -// Fetch +/- overflow threshold for IEEE single, double, -// double extended -// +;; + { .mfi -(p0) setf.exp FR_Big = GR_T1_ptr -(p0) fsetc.s2 0x7F,0x40 - nop.i 999 ;; + nop.m 999 + fma.s2 FR_Result_big = FR_TMP1, FR_TMP2,FR_TMP3 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fclass.m.unc p11, p0 = FR_Result_small, 0x00F - nop.i 999 ;; + nop.m 999 + fsetc.s2 0x7F,0x40 // Reset S2=User + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fmerge.ns FR_NBig = FR_Big, FR_Big - nop.i 999 + nop.m 999 + fclass.m p11, p0 = FR_Result_small, 0x00F // Test small result unorm/zero + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Create largest double exponent + 1. -// Create smallest double exponent - 1. 
-// Identify denormals -// -(p0) fcmp.ge.unc.s1 p8, p0 = FR_Result_big , FR_Big - nop.i 999 ;; -} -{ .mii - nop.m 999 - nop.i 999 ;; -// -// fcmp: resultS2 <= - overflow threshold -// fclass: resultS3 is denorm/unorm/0 -// -(p8) mov GR_Parameter_TAG = 18 ;; + nop.m 999 + fcmp.ge.s1 p8, p0 = FR_Result_big , FR_Big // Test >= + oflow threshold + nop.i 999 } +;; + { .mfb - nop.m 999 -// -// fcmp: resultS2 >= + overflow threshold -// -(p0) fcmp.le.unc.s1 p9, p0 = FR_Result_big, FR_NBig -(p8) br.cond.spnt __libm_error_region ;; -} -{ .mii - nop.m 999 - nop.i 999 ;; -(p9) mov GR_Parameter_TAG = 18 -} -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.spnt __libm_error_region ;; -} -// -// Report that pow overflowed - either +Inf, or -Inf -// -{ .mmb -(p11) mov GR_Parameter_TAG = 19 - nop.m 999 -(p11) br.cond.spnt __libm_error_region ;; -} -{ .mib - nop.m 999 - nop.i 999 -// -// Report that pow underflowed -// -(p0) br.cond.sptk L(POWL_64_RETURN) ;; +(p11) mov GR_Parameter_TAG = 19 // Set tag for underflow + fcmp.le.s1 p9, p0 = FR_Result_big, FR_NBig // Test <= - oflow threshold +(p11) br.cond.spnt __libm_error_region // Branch if pow underflowed } +;; +{ .mfb +(p8) mov GR_Parameter_TAG = 18 // Set tag for overflow + nop.f 999 +(p8) br.cond.spnt __libm_error_region // Branch if pow +overflow +} +;; +{ .mbb +(p9) mov GR_Parameter_TAG = 18 // Set tag for overflow +(p9) br.cond.spnt __libm_error_region // Branch if pow -overflow + br.ret.sptk b0 // Branch if result really ok +} +;; -L(POWL_64_SPECIAL): +POWL_64_SPECIAL: +// Here if x or y is NatVal, nan, inf, or zero { .mfi - nop.m 999 -(p0) fcmp.eq.s1 p15, p0 = FR_Input_X, f1 // Is x=+1 - nop.i 999 ;; + nop.m 999 + fcmp.eq.s1 p15, p0 = FR_Input_X, f1 // Test x=+1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fclass.m.unc p14, p0 = FR_Input_Y, 0x023 - nop.i 999 ;; + nop.m 999 + fclass.m p8, p0 = FR_Input_X, 0x143 // Test x natval, snan + nop.i 999 } +;; { .mfi - nop.m 999 -(p15) fcmp.eq.unc.s0 p6,p0 = FR_Input_Y, f0 // If x=1, 
flag invalid if y=SNaN - nop.i 999 + nop.m 999 +(p15) fcmp.eq.unc.s0 p6,p0 = FR_Input_Y, f0 // If x=1, flag invalid if y=SNaN + nop.i 999 } { .mfb - nop.m 999 -(p15) fmpy.s0 FR_Result = f1,f1 // If x=1, result=1 -(p15) br.cond.spnt L(POWL_64_RETURN) ;; // Exit if x=1 + nop.m 999 +(p15) fmpy.s0 FR_Result = f1,f1 // If x=1, result=1 +(p15) br.ret.spnt b0 // Exit if x=1 } +;; { .mfi - nop.m 999 -(p0) fclass.m.unc p13, p0 = FR_Input_X, 0x023 - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p0) fclass.m.unc p8, p0 = FR_Input_X, 0x143 - nop.i 999 + nop.m 999 + fclass.m p6, p0 = FR_Input_Y, 0x007 // Test y zero + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fclass.m.unc p9, p0 = FR_Input_Y, 0x143 - nop.i 999 ;; + nop.m 999 + fclass.m p9, p0 = FR_Input_Y, 0x143 // Test y natval, snan + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fclass.m.unc p10, p0 = FR_Input_X, 0x083 - nop.i 999 + nop.m 999 + fclass.m p10, p0 = FR_Input_X, 0x083 // Test x qnan + nop.i 999 } { .mfi - nop.m 999 -(p0) fclass.m.unc p11, p0 = FR_Input_Y, 0x083 - nop.i 999 ;; + nop.m 999 +(p8) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X // If x=snan, result=qnan +(p6) cmp.ne p8,p0 = r0,r0 // Don't exit if x=snan, y=0 ==> result=+1 } +;; + { .mfi - nop.m 999 -(p0) fclass.m.unc p6, p0 = FR_Input_Y, 0x007 - nop.i 999 + nop.m 999 +(p6) fclass.m.unc p15, p0 = FR_Input_X,0x007 // Test x=0, y=0 + nop.i 999 } -{ .mfi - nop.m 999 -(p0) fcmp.eq.unc.s1 p7, p0 = FR_Input_Y, f1 - nop.i 999 ;; +{ .mfb + nop.m 999 +(p9) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X // If y=snan, result=qnan +(p8) br.ret.spnt b0 // Exit if x=snan, y not 0, + // result=qnan } +;; + { .mfi - nop.m 999 -// -// set p13 if x +/- Inf -// set p14 if y +/- Inf -// set p8 if x Natval or +/-SNaN -// set p9 if y Natval or +/-SNaN -// set p10 if x QNaN -// set p11 if y QNaNs -// set p6 if y is +/-0 -// set p7 if y is 1 -// -(p8) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X -(p6) cmp.ne p8,p0 = r0,r0 ;; // Don't exit if x=snan, y=0 ==> result=+1 -} -{ .mfb - nop.m 999 
-(p9) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X -(p8) br.cond.spnt L(POWL_64_RETURN) ;; + nop.m 999 + fcmp.eq.s1 p7, p0 = FR_Input_Y, f1 // Test y +1.0 + nop.i 999 } { .mfb - nop.m 999 -(p10) fmpy.s0 FR_Result = FR_Input_X, f0 -(p9) br.cond.spnt L(POWL_64_RETURN) ;; -} -{ .mfi - nop.m 999 -// -// Produce result for SNaN and NatVals and return -// -(p6) fclass.m.unc p15, p0 = FR_Input_X,0x007 - nop.i 999 + nop.m 999 +(p10) fmpy.s0 FR_Result = FR_Input_X, f0 // If x=qnan, result=qnan +(p9) br.ret.spnt b0 // Exit if y=snan, result=qnan } +;; + { .mfi - nop.m 999 -// -// If Y +/- 0, set p15 if x +/- 0 -// -(p6) fclass.m.unc p8, p0 = FR_Input_X,0x0C3 - nop.i 999 ;; + nop.m 999 +(p6) fclass.m.unc p8, p0 = FR_Input_X,0x0C3 // Test x=nan, y=0 + nop.i 999 } +;; { .mfi - nop.m 999 -(p6) fcmp.eq.s0 p9,p0 = FR_Input_X, f0 // If y=0, flag if x denormal - nop.i 999 + nop.m 999 +(p6) fcmp.eq.s0 p9,p0 = FR_Input_X, f0 // If y=0, flag if x denormal + nop.i 999 } { .mfi - nop.m 999 -(p6) fadd.s0 FR_Result = f1, f0 - nop.i 999 ;; + nop.m 999 +(p6) fadd.s0 FR_Result = f1, f0 // If y=0, result=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Set p8 if y = +/-0 and X is a QNaN/SNaN -// If y = +/-0, let result = 1.0 -// -(p7) fmpy.s0 FR_Result = FR_Input_X,f1 -// -// If y == 1, result = x * 1 -// -(p15) mov GR_Parameter_TAG = 20 -} -{ .mib - nop.m 999 - nop.i 999 -(p15) br.cond.spnt __libm_error_region ;; -} -{ .mib - nop.m 999 -// -// If x and y are both zero, result = 1.0 and call error -// support. -// -(p8) mov GR_Parameter_TAG = 23 -(p8) br.cond.spnt __libm_error_region ;; + nop.m 999 + fclass.m p11, p0 = FR_Input_Y, 0x083 // Test y qnan + nop.i 999 } -{ .mib - nop.m 999 - nop.i 999 -// -// If y = +/-0 and x is a QNaN, result = 1.0 and call error -// support. 
-// -(p6) br.cond.spnt L(POWL_64_RETURN) ;; +{ .mfb +(p15) mov GR_Parameter_TAG = 20 // Error tag for x=0, y=0 +(p7) fmpy.s0 FR_Result = FR_Input_X,f1 // If y=1, result=x +(p15) br.cond.spnt __libm_error_region // Branch if x=0, y=0, result=1 } +;; -// If x=0, y=-inf, go to the X_IS_ZERO path { .mfb - nop.m 999 -(p14) fcmp.eq.unc.s1 p0,p14 = FR_Input_X,f0 -(p7) br.cond.spnt L(POWL_64_RETURN) ;; +(p8) mov GR_Parameter_TAG = 23 // Error tag for x=nan, y=0 + fclass.m p14, p0 = FR_Input_Y, 0x023 // Test y inf +(p8) br.cond.spnt __libm_error_region // Branch if x=snan, y=0, + // result=1 } +;; -{ .mfi - nop.m 999 -// -// Produce all results for x**0 and x**1 -// Let all the result x ** 0 == 1 and return -// Let all x ** 1 == x and return -// -(p10) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X - nop.i 999 ;; -} { .mfb - nop.m 999 -(p11) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X -(p10) br.cond.spnt L(POWL_64_RETURN) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p11) br.cond.spnt L(POWL_64_RETURN) ;; -} -{ .mib - nop.m 999 - nop.i 999 -// -// Return result for x or y QNaN input with QNaN result -// -(p14) br.cond.spnt L(POWL_64_Y_IS_INF) ;; + nop.m 999 + fclass.m p13, p0 = FR_Input_X, 0x023 // Test x inf +(p6) br.ret.spnt b0 // Exit y=0, x not nan or 0, + // result=1 } -{ .mib - nop.m 999 - nop.i 999 -(p13) br.cond.spnt L(POWL_64_X_IS_INF) ;; +;; + +{ .mfb + nop.m 999 +(p14) fcmp.eq.unc.s1 p0,p14 = FR_Input_X,f0 // Test x not 0, y=inf +(p7) br.ret.spnt b0 // Exit y=1, x not snan, + // result=x } -L(POWL_64_X_IS_ZERO): -{ .mmb -(p0) getf.sig GR_signif_y = FR_Input_Y -(p0) getf.exp GR_BIASed_exp_y = FR_Input_Y - nop.b 999 ;; +;; + +{ .mfb + nop.m 999 +(p10) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X // If x=qnan, y not snan, + // result=qnan +(p10) br.ret.spnt b0 // Exit x=qnan, y not snan, + // result=qnan } -{ .mlx - nop.m 999 -(p0) movl GR_Mask = 0x1FFFF +;; + +{ .mfb + nop.m 999 +(p11) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X // If y=qnan, x not nan or 1, + // result=qnan +(p11) 
br.ret.spnt b0 // Exit y=qnan, x not nan or 1, + // result=qnan } -{ .mlx - nop.m 999 -(p0) movl GR_y_sign = 0x20000 ;; +;; + +{ .mbb + nop.m 999 +(p14) br.cond.spnt POWL_64_Y_IS_INF // Branch if y=inf, x not 1 or nan +(p13) br.cond.spnt POWL_64_X_IS_INF // Branch if x=inf, y not 1 or nan } -// -// Get BIASed exp and significand of y +;; + + +POWL_64_X_IS_ZERO: +// Here if x=0, y not nan or 1 or inf or 0 + +// There is logic starting here to determine if y is an integer when x = 0. +// If 0 < |y| < 1 then clearly y is not an integer. +// If |y| > 1, then the significand of y is shifted left by the size of +// the exponent of y. This preserves the lsb of the integer part + the +// fractional bits. The lsb of the integer can be tested to determine if +// the integer is even or odd. The fractional bits can be tested. If zero, +// then y is an integer. // { .mfi -(p0) and GR_exp_y = GR_Mask,GR_BIASed_exp_y - nop.f 999 -(p0) and GR_y_sign = GR_y_sign,GR_BIASed_exp_y -} -{ .mlx - nop.m 999 -(p0) movl GR_BIAS = 0xFFFF ;; + and GR_exp_y = GR_exp_mask,GR_signexp_y // Get biased exponent of y + nop.f 999 + and GR_y_sign = GR_sign_mask,GR_signexp_y // Get sign of y } -{ .mfi -(p0) cmp.lt.unc p9, p8 = GR_exp_y,GR_BIAS - nop.f 999 +;; + // // Maybe y is < 1 already, so // can never be an integer. -// Remove sign bit from exponent. 
-// -(p0) sub GR_exp_y = GR_exp_y,GR_BIAS ;; -} -{ .mii - nop.m 999 - nop.i 999 ;; -// -// Remove exponent BIAS // -(p8) shl GR_exp_y= GR_signif_y,GR_exp_y ;; -} { .mfi -(p9) or GR_exp_y= 0xF,GR_signif_y - nop.f 999 - nop.i 999 ;; + cmp.lt p9, p8 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1 + nop.f 999 + sub GR_exp_y = GR_exp_y,GR_exp_bias // Get true exponent of y } -{ .mii - nop.m 999 +;; + // // Shift significand of y looking for nonzero bits // For y > 1, shift signif_y exp_y bits to the left -// For y < 1, turn on 4 low order bits of significand of y +// For y < 1, turn on 4 low order bits of significand of y // so that the fraction will always be non-zero // -(p0) shl GR_signif_y= GR_exp_y,1 ;; -(p0) extr.u GR_low_order_bit = GR_exp_y,63,1 +{ .mmi +(p9) or GR_exp_y= 0xF,GR_signif_y // Force nonzero fraction if y<1 +;; + nop.m 999 +(p8) shl GR_exp_y= GR_signif_y,GR_exp_y // Get lsb of int + fraction + // Wait 4 cycles to use result +} +;; + +{ .mmi + nop.m 999 +;; + nop.m 999 + nop.i 999 +} +;; + +{ .mmi + nop.m 999 +;; + nop.m 999 + shl GR_fraction_y= GR_exp_y,1 // Shift left 1 to get fraction } +;; + // // Integer part of y shifted off. // Get y's low even or odd bit - y might not be an int. // { .mii -(p0) cmp.eq.unc p13,p0 = GR_signif_y, r0 -(p0) cmp.eq.unc p8,p9 = GR_y_sign, r0 ;; -// -// Is y an int? -// Is y positive -// -(p13) cmp.ne.unc p13,p0 = GR_low_order_bit, r0 ;; + cmp.eq p13,p0 = GR_fraction_y, r0 // Test for y integer + cmp.eq p8,p0 = GR_y_sign, r0 // Test for y > 0 +;; +(p13) tbit.nz.unc p13,p0 = GR_exp_y, 63 // Test if y an odd integer } +;; + +{ .mfi +(p13) cmp.eq.unc p13,p14 = GR_y_sign, r0 // Test y pos odd integer +(p8) fcmp.eq.s0 p12,p0 = FR_Input_Y, f0 // If x=0 and y>0 flag if y denormal + nop.i 999 +} +;; + // -// Is y and int and odd? 
+// Return +/-0 when x=+/-0 and y is positive odd integer // { .mfb -(p13) cmp.eq.unc p13,p14 = GR_y_sign, r0 -(p8) fcmp.eq.s0 p12,p0 = FR_Input_Y, f0 // If x=0 and y>0 flag if y denormal - nop.b 999 ;; + nop.m 999 +(p13) mov FR_Result = FR_Input_X // If x=0, y pos odd int, result=x +(p13) br.ret.spnt b0 // Exit x=0, y pos odd int, result=x } -{ .mfb - nop.m 999 +;; + // -// Is y and int and odd and positive? +// Return +/-inf when x=+/-0 and y is negative odd int // -(p13) mov FR_Result = FR_Input_X -(p13) br.cond.sptk L(POWL_64_RETURN) ;; +{ .mfb +(p14) mov GR_Parameter_TAG = 21 +(p14) frcpa.s0 FR_Result, p0 = f1, FR_Input_X // Result +-inf, set Z flag +(p14) br.cond.spnt __libm_error_region } -{ .mfi - nop.m 999 +;; + // -// Return +/-0 when x=+/-0 and y is and odd pos. int +// Return +0 when x=+/-0 and y positive and not an odd integer // -(p14) frcpa.s0 FR_Result, p10 = f1, FR_Input_X -(p14) mov GR_Parameter_TAG = 21 -} -{ .mib - nop.m 999 - nop.i 999 -(p14) br.cond.spnt __libm_error_region ;; +{ .mfb + nop.m 999 +(p8) mov FR_Result = f0 // If x=0, y>0 and not odd integer, result=+0 +(p8) br.ret.sptk b0 // Exit x=0, y>0 and not odd integer, result=+0 } +;; -{ .mfb - nop.m 999 // -// Return +/-0 when x=+/-Inf and y is and odd neg int -// and raise dz exception +// Return +inf when x=+/-0 and y is negative and not odd int // -(p8) mov FR_Result = f0 -(p8) br.cond.sptk L(POWL_64_RETURN) ;; +{ .mfb + mov GR_Parameter_TAG = 21 + frcpa.s0 FR_Result, p10 = f1,f0 // Result +inf, raise Z flag + br.cond.sptk __libm_error_region } -{ .mfi - nop.m 999 +;; + + +POWL_64_X_IS_INF: // -// Return +0 when x=+/-0 and y > 0 and not odd. 
+// Here if x=inf, y not 1 or nan // -(p9) frcpa.s0 FR_Result, p10 = f1,f0 -(p9) mov GR_Parameter_TAG = 21 -} -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.sptk __libm_error_region ;; -} -L(POWL_64_X_IS_INF): { .mfi -(p0) getf.exp GR_exp_y = FR_Input_Y -(p0) fclass.m.unc p13, p0 = FR_Input_X,0x022 -(p0) mov GR_Mask = 0x1FFFF ;; + and GR_exp_y = GR_exp_mask,GR_signexp_y // Get biased exponent y + fclass.m p13, p0 = FR_Input_X,0x022 // Test x=-inf + nop.i 999 } +;; { .mfi -(p0) getf.sig GR_signif_y = FR_Input_Y -(p0) fcmp.eq.s0 p9,p0 = FR_Input_Y, f0 // Flag if y denormal - nop.i 999 ;; + and GR_y_sign = GR_sign_mask,GR_signexp_y // Get sign of y + fcmp.eq.s0 p9,p0 = FR_Input_Y, f0 // Dummy to set flag if y denorm + nop.i 999 } +;; // -// Get exp and significand of y -// Create exponent mask and sign mask +// Maybe y is < 1 already, so +// isn't an int. // -{ .mlx -(p0) and GR_low_order_bit = GR_Mask,GR_exp_y -(p0) movl GR_BIAS = 0xFFFF +{ .mfi +(p13) cmp.lt.unc p9, p8 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1 if x=-inf + fclass.m p11, p0 = FR_Input_X,0x021 // Test x=+inf + sub GR_exp_y = GR_exp_y,GR_exp_bias // Get true exponent y } -{ .mmi - nop.m 999 ;; +;; + // -// Remove sign bit from exponent. +// Shift significand of y looking for nonzero bits +// For y > 1, shift signif_y exp_y bits to the left +// For y < 1, turn on 4 low order bits of significand of y +// so that the fraction will always be non-zero // -(p0) cmp.lt.unc p9, p8 = GR_low_order_bit,GR_BIAS +{ .mmi +(p9) or GR_exp_y= 0xF,GR_signif_y // Force nonzero fraction if y<1 +;; +(p11) cmp.eq.unc p14,p12 = GR_y_sign, r0 // Test x=+inf, y>0 +(p8) shl GR_exp_y= GR_signif_y,GR_exp_y // Get lsb of int + fraction + // Wait 4 cycles to use result +} +;; + // -// Maybe y is < 1 already, so -// isn't an int. 
+// Return +inf for x=+inf, y > 0 +// Return +0 for x=+inf, y < 0 // -(p0) sub GR_low_order_bit = GR_low_order_bit,GR_BIAS +{ .mfi + nop.m 999 +(p12) mov FR_Result = f0 // If x=+inf, y<0, result=+0 + nop.i 999 } -{ .mlx - nop.m 999 -(p0) movl GR_sign_mask = 0x20000 ;; +{ .mfb + nop.m 999 +(p14) fma.s0 FR_Result = FR_Input_X,f1,f0 // If x=+inf, y>0, result=+inf +(p11) br.ret.sptk b0 // Exit x=+inf } -{ .mfi -(p0) and GR_sign_mask = GR_sign_mask,GR_exp_y +;; + // -// Return +Inf when x=+/-0 and y < 0 and not odd and raise -// divide-by-zero exception. +// Here only if x=-inf. Wait until can use result of shl... // -(p0) fclass.m.unc p11, p0 = FR_Input_X,0x021 - nop.i 999 ;; -} { .mmi - nop.m 999 ;; -// -// Is shift off integer part of y. -// Get y's even or odd bit - y might not be an int. -// -(p11) cmp.eq.unc p11,p12 = GR_sign_mask, r0 -// -// Remove exponent BIAS -// -(p8) shl GR_exp_y = GR_signif_y,GR_low_order_bit ;; + nop.m 999 +;; + nop.m 999 + nop.i 999 } +;; + { .mfi -(p9) or GR_exp_y = 0xF,GR_signif_y -// -// Is y positive or negative when x is +Inf? -// Is y and int when x = -Inf -// -(p11) mov FR_Result = FR_Input_X - nop.i 999 ;; + cmp.eq p8,p9 = GR_y_sign, r0 // Test y pos + nop.f 999 + shl GR_fraction_y = GR_exp_y,1 // Shift left 1 to get fraction } -{ .mfi - nop.m 999 -(p12) mov FR_Result = f0 - nop.i 999 ;; +;; + +{ .mmi + cmp.eq p13,p0 = GR_fraction_y, r0 // Test y integer +;; + nop.m 999 +(p13) tbit.nz.unc p13,p0 = GR_exp_y, 63 // Test y odd integer } -{ .mii - nop.m 999 +;; + // -// Shift signficand looking for nonzero bits -// For y non-ints, upset the significand. +// Is y even or odd? 
// -(p0) shl GR_signif_y = GR_exp_y,1 ;; -(p13) cmp.eq.unc p13,p0 = GR_signif_y, r0 -} { .mii - nop.m 999 -(p0) extr.u GR_low_order_bit = GR_exp_y,63,1 ;; -(p13) cmp.ne.unc p13,p0 = GR_low_order_bit, r0 -} -{ .mib - nop.m 999 - nop.i 999 -(p11) br.cond.sptk L(POWL_64_RETURN) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p12) br.cond.sptk L(POWL_64_RETURN) ;; +(p13) cmp.eq.unc p14,p10 = GR_y_sign, r0 // Test x=-inf, y pos odd int +(p13) cmp.ne.and p8,p9 = r0,r0 // If y odd int, turn off p8,p9 + nop.i 999 } +;; + // -// Return Inf for y > 0 -// Return +0 for y < 0 -// Is y even or odd? +// Return -0 for x = -inf and y < 0 and odd int. +// Return -Inf for x = -inf and y > 0 and odd int. // -{ .mii -(p13) cmp.eq.unc p13,p10 = GR_sign_mask, r0 -(p0) cmp.eq.unc p8,p9 = GR_sign_mask, r0 ;; - nop.i 999 +{ .mfi + nop.m 999 +(p10) fmerge.ns FR_Result = f0, f0 // If x=-inf, y neg odd int, result=-0 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 +(p14) fmpy.s0 FR_Result = FR_Input_X,f1 // If x=-inf, y pos odd int, result=-inf + nop.i 999 +} +;; + // -// For x = -inf, y is and int, positive -// and odd -// Is y positive in general? +// Return Inf for x = -inf and y > 0 not an odd int. +// Return +0 for x = -inf and y < 0 not an odd int. // -(p13) mov FR_Result = FR_Input_X - nop.i 999 ;; +.pred.rel "mutex",p8,p9 +{ .mfi + nop.m 999 +(p8) fmerge.ns FR_Result = FR_Input_X, FR_Input_X // If x=-inf, y>0 not odd int + // result=+inf + nop.i 999 } { .mfb - nop.m 999 -(p10) fmerge.ns FR_Result = f0, f0 -(p13) br.cond.sptk L(POWL_64_RETURN) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p10) br.cond.sptk L(POWL_64_RETURN) ;; + nop.m 999 +(p9) fmpy.s0 FR_Result = f0,f0 // If x=-inf, y<0 not odd int + // result=+0 + br.ret.sptk b0 // Exit for x=-inf } -{ .mfi - nop.m 999 +;; + + +POWL_64_Y_IS_INF: +// Here if y=inf, x not 1 or nan // -// Return -Inf for x = -inf and y > 0 and odd int. -// Return -0 for x = -inf and y < 0 and odd int. 
+// For y = +Inf and |x| < 1 returns 0 +// For y = +Inf and |x| > 1 returns Inf +// For y = -Inf and |x| < 1 returns Inf +// For y = -Inf and |x| > 1 returns 0 +// For y = Inf and |x| = 1 returns 1 // -(p8) fmerge.ns FR_Result = FR_Input_X, FR_Input_X - nop.i 999 ;; -} -{ .mfb - nop.m 999 -(p9) mov FR_Result = f0 -(p8) br.cond.sptk L(POWL_64_RETURN) ;; +{ .mfi + nop.m 999 + fclass.m p8, p0 = FR_Input_Y, 0x021 // Test y=+inf + nop.i 999 } -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.sptk L(POWL_64_RETURN) ;; +;; + +{ .mfi + nop.m 999 + fclass.m p9, p0 = FR_Input_Y, 0x022 // Test y=-inf + nop.i 999 } -L(POWL_64_Y_IS_INF): +;; + { .mfi - nop.m 999 -// -// Return Inf for x = -inf and y > 0 not an odd int. -// Return +0 for x = -inf and y < 0 and not an odd int. -// -(p0) fclass.m.unc p8, p0 = FR_Input_Y, 0x021 - nop.i 999 + nop.m 999 + fabs FR_X = FR_Input_X // Form |x| + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fclass.m.unc p9, p0 = FR_Input_Y, 0x022 - nop.i 999 ;; + nop.m 999 + fcmp.eq.s0 p10,p0 = FR_Input_X, f0 // flag if x denormal + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fabs FR_X = FR_Input_X - nop.i 999 ;; + nop.m 999 +(p8) fcmp.lt.unc.s1 p6, p0 = FR_X, f1 // Test y=+inf, |x|<1 + nop.i 999 } +;; { .mfi - nop.m 999 -(p0) fcmp.eq.s0 p10,p0 = FR_Input_X, f0 // flag if x denormal - nop.i 999 ;; + nop.m 999 +(p8) fcmp.gt.unc.s1 p7, p0 = FR_X, f1 // Test y=+inf, |x|>1 + nop.i 999 } +;; { .mfi - nop.m 999 -// -// Find y = +/- Inf -// Compute |x| -// -(p8) fcmp.lt.unc.s1 p6, p0 = FR_X, f1 - nop.i 999 + nop.m 999 +(p9) fcmp.lt.unc.s1 p12, p0 = FR_X, f1 // Test y=-inf, |x|<1 + nop.i 999 } { .mfi - nop.m 999 -(p8) fcmp.gt.unc.s1 p7, p0 = FR_X, f1 - nop.i 999 ;; + nop.m 999 +(p6) fmpy.s0 FR_Result = f0,f0 // If y=+inf, |x|<1, result=+0 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p9) fcmp.lt.unc.s1 p12, p0 = FR_X, f1 - nop.i 999 + nop.m 999 +(p9) fcmp.gt.unc.s1 p13, p0 = FR_X, f1 // Test y=-inf, |x|>1 + nop.i 999 } { .mfi - nop.m 999 -(p9) fcmp.gt.unc.s1 p13, p0 = FR_X, 
f1 - nop.i 999 ;; + nop.m 999 +(p7) fmpy.s0 FR_Result = FR_Input_Y, f1 // If y=+inf, |x|>1, result=+inf + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// For y = +Inf and |x| < 1 returns 0 -// For y = +Inf and |x| > 1 returns Inf -// For y = -Inf and |x| < 1 returns Inf -// For y = -Inf and |x| > 1 returns 0 -// -(p6) mov FR_Result = f0 - nop.i 999 ;; + nop.m 999 + fcmp.eq.s1 p14, p0 = FR_X, f1 // Test y=inf, |x|=1 + nop.i 999 } { .mfi - nop.m 999 -(p7) mov FR_Result = FR_Input_Y - nop.i 999 ;; + nop.m 999 +(p12) fnma.s0 FR_Result = FR_Input_Y, f1, f0 // If y=-inf, |x|<1, result=+inf + nop.i 999 } +;; + { .mfi - nop.m 999 -(p12) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_Y - nop.i 999 ;; + nop.m 999 +(p13) mov FR_Result = f0 // If y=-inf, |x|>1, result=+0 + nop.i 999 } +;; + { .mfb - nop.m 999 -(p13) mov FR_Result = f0 -// -// Produce x ** +/- Inf results -// -(p6) br.cond.spnt L(POWL_64_RETURN) ;; + nop.m 999 +(p14) fmpy.s0 FR_Result = f1,f1 // If y=inf, |x|=1, result=+1 + br.ret.sptk b0 // Common return for y=inf } -{ .mib - nop.m 999 - nop.i 999 -(p7) br.cond.spnt L(POWL_64_RETURN) ;; +;; + + +// Here if x or y denorm/unorm +POWL_DENORM: +{ .mmi + getf.sig GR_signif_Z = FR_norm_X // Get significand of x +;; + getf.exp GR_signexp_y = FR_norm_Y // Get sign and exp of y + nop.i 999 } -{ .mib - nop.m 999 - nop.i 999 -(p12) br.cond.spnt L(POWL_64_RETURN) ;; +;; + +{ .mfi + getf.sig GR_signif_y = FR_norm_Y // Get significand of y + nop.f 999 + nop.i 999 } +;; + { .mib - nop.m 999 - nop.i 999 -(p13) br.cond.spnt L(POWL_64_RETURN) ;; + getf.exp GR_signexp_x = FR_norm_X // Get sign and exp of x + extr.u GR_Index1 = GR_signif_Z, 59, 4 // Extract upper 4 signif bits of x + br.cond.sptk POWL_COMMON // Branch back to main path } -{ .mfb - nop.m 999 +;; + + +POWL_64_UNSUPPORT: // -// +/-1 ** +/-Inf, result is +1 +// Raise exceptions for specific +// values - pseudo NaN and +// infinities. 
+// Return NaN and raise invalid // -(p0) fmpy.s0 FR_Result = f1,f1 -(p0) br.cond.sptk L(POWL_64_RETURN) ;; -} -L(POWL_64_UNSUPPORT): { .mfb - nop.m 999 + nop.m 999 + fmpy.s0 FR_Result = FR_Input_X,f0 + br.ret.sptk b0 +} +;; + +POWL_64_XNEG: // -// Return NaN and raise invalid +// Raise invalid for x < 0 and +// y not an integer // -(p0) fmpy.s0 FR_Result = FR_Input_X,f0 -// -// Raise exceptions for specific -// values - pseudo NaN and -// infinities. -// -(p0) br.cond.sptk L(POWL_64_RETURN) ;; -} -L(POWL_64_XNEG): { .mfi - nop.m 999 -(p0) frcpa.s0 FR_Result, p8 = f0, f0 -// -// Raise invalid for x < 0 and -// y not an integer and -// -(p0) mov GR_Parameter_TAG = 22 + nop.m 999 + frcpa.s0 FR_Result, p8 = f0, f0 + mov GR_Parameter_TAG = 22 } { .mib - nop.m 999 - nop.i 999 -(p0) br.cond.sptk __libm_error_region ;; + nop.m 999 + nop.i 999 + br.cond.sptk __libm_error_region } -L(POWL_64_SQRT): +;; + +POWL_64_SQRT: { .mfi - nop.m 999 -(p0) frsqrta.s0 FR_Result,p10 = FR_Input_X - nop.i 999 ;; + nop.m 999 + frsqrta.s0 FR_Result,p10 = FR_save_Input_X + nop.i 999 ;; } { .mfi - nop.m 999 -(p10) fma.s1 f62=FR_Half,FR_Input_X,f0 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 f62=FR_Half,FR_save_Input_X,f0 + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (2) -// h = 1/2 * a in f9 -// -(p10) fma.s1 f63=FR_Result,FR_Result,f0 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 f63=FR_Result,FR_Result,f0 + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (3) -// t1 = y0 * y0 in f10 -// -(p10) fnma.s1 f32=f63,f62,f11 - nop.i 999 ;; + nop.m 999 +(p10) fnma.s1 f32=f63,f62,FR_Half + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (4) -// t2 = 1/2 - t1 * h in f10 -// -(p10) fma.s1 f33=f32,FR_Result,FR_Result - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 f33=f32,FR_Result,FR_Result + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (5) -// y1 = y0 + t2 * y0 in f13 -// -(p10) fma.s1 f34=f33,f62,f0 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 f34=f33,f62,f0 + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (6) -// t3 = 
y1 * h in f10 -// -(p10) fnma.s1 f35=f34,f33,f11 - nop.i 999 ;; + nop.m 999 +(p10) fnma.s1 f35=f34,f33,FR_Half + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (7) -// t4 = 1/2 - t3 * y1 in f10 -// -(p10) fma.s1 f63=f35,f33,f33 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 f63=f35,f33,f33 + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (8) -// y2 = y1 + t4 * y1 in f13 -// -(p10) fma.s1 f32=FR_Input_X,f63,f0 - nop.i 999 + nop.m 999 +(p10) fma.s1 f32=FR_save_Input_X,f63,f0 + nop.i 999 } { .mfi - nop.m 999 -// -// Step (9) -// S = a * y2 in f10 -// -(p10) fma.s1 FR_Result=f63,f62,f0 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 FR_Result=f63,f62,f0 + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (10) -// t5 = y2 * h in f9 -// -(p10) fma.s1 f33=f11,f63,f0 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 f33=f11,f63,f0 + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (11) -// H = 1/2 * y2 in f11 -// -(p10) fnma.s1 f34=f32,f32,f8 - nop.i 999 + nop.m 999 +(p10) fnma.s1 f34=f32,f32,FR_save_Input_X + nop.i 999 } { .mfi - nop.m 999 -// -// Step (12) -// d = a - S * S in f12 -// -(p10) fnma.s1 f35=FR_Result,f63,f11 - nop.i 999 ;; + nop.m 999 +(p10) fnma.s1 f35=FR_Result,f63,FR_Half + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (13) -// t6 = 1/2 - t5 * y2 in f7 -// -(p10) fma.s1 f62=f33,f34,f32 - nop.i 999 + nop.m 999 +(p10) fma.s1 f62=f33,f34,f32 + nop.i 999 } { .mfi - nop.m 999 -// -// Step (14) -// S1 = S + d * H in f13 -// -(p10) fma.s1 f63=f33,f35,f33 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 f63=f33,f35,f33 + nop.i 999 ;; } { .mfi - nop.m 999 -// -// Step (15) -// H1 = H + t6 * h in f7 -// -(p10) fnma.s1 f32=f62,f62,FR_Input_X - nop.i 999 ;; + nop.m 999 +(p10) fnma.s1 f32=f62,f62,FR_save_Input_X + nop.i 999 ;; } { .mfb - nop.m 999 -// -// Step (16) -// d1 = a - S1 * S1 -// -(p10) fma.s0 FR_Result=f32,f63,f62 -// -// Step (17) -// R = S1 + d1 * H1 -// -(p10) br.cond.sptk L(POWL_64_RETURN) ;; -} -{ .mib - nop.m 999 - nop.i 999 -// -// Do the Newton-Raphson iteration from the EAS. 
-// -(p0) br.cond.sptk L(POWL_64_RETURN) ;; + nop.m 999 +(p10) fma.s0 FR_Result=f32,f63,f62 + br.ret.sptk b0 // Exit for x > 0, y = 0.5 } -// -// Take care of the degenerate cases. -// +;; -L(POWL_64_RETURN): -{ .mfb - nop.m 999 -(p0) mov FR_Output = FR_Result -(p0) br.ret.sptk b0 ;; -} -.endp powl -ASM_SIZE_DIRECTIVE(powl) +GLOBAL_LIBM_END(powl) -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value @@ -3411,32 +2770,32 @@ __libm_error_region: mov GR_SAVE_GP=gp // Save gp };; { .mmi - stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack + stfe [GR_Parameter_Y] = FR_Input_Y,16 // Save Parameter 2 on stack add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 mov GR_SAVE_B0=b0 // Save b0 };; .body { .mib - stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + stfe [GR_Parameter_X] = FR_save_Input_X // Store Parameter 1 on stack add GR_Parameter_RESULT = 0,GR_Parameter_Y nop.b 0 // Parameter 3 address } { .mib - stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + stfe [GR_Parameter_Y] = FR_Result // Store Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; { .mmi - ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack + ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp - add sp = 64,sp // Restore stack pointer - mov b0 = GR_SAVE_B0 // Restore return address + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address };; { .mib mov gp = GR_SAVE_GP // Restore gp @@ -3444,7 +2803,6 @@ __libm_error_region: br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +.endp .type 
__libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_remainder.S b/sysdeps/ia64/fpu/e_remainder.S index d8a27722de..2f6e90f994 100644 --- a/sysdeps/ia64/fpu/e_remainder.S +++ b/sysdeps/ia64/fpu/e_remainder.S @@ -1,10 +1,10 @@ - .file "remainder.asm" -// Copyright (C) 2000, 2001, Intel Corporation +.file "remainder.s" + + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, Bob Norin, -// Shane Story, and Ping Tak Peter Tang of the Computational Software Lab, -// Intel Corporation. +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,17 +35,19 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
// // History //==================================================================== -// 2/02/00 Initial version -// 3/02/00 New Algorithm -// 4/04/00 Unwind support added -// 7/21/00 Fixed quotient=2^{24*m+23}*1.q1...q23 1 bug -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial version +// 03/02/00 New Algorithm +// 04/04/00 Unwind support added +// 07/21/00 Fixed quotient=2^{24*m+23}*1.q1...q23 1 bug +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. -//11/29/00 Set FR_Y to f9 +// 11/29/00 Set FR_Y to f9 +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align // // API //==================================================================== @@ -78,16 +80,12 @@ // a=+/- Inf, or b=+/-0: return NaN, call libm_error_support // a=NaN or b=NaN: return NaN -#include "libm_support.h" - // Registers used //==================================================================== // Predicate registers: p6-p14 // General registers: r2,r3,r28,r29,r32 (ar.pfs), r33-r39 // Floating point registers: f6-f15,f32 - .section .text - GR_SAVE_B0 = r33 GR_SAVE_PFS = r34 GR_SAVE_GP = r35 @@ -103,18 +101,9 @@ FR_Y = f9 FR_RESULT = f8 +.section .text +GLOBAL_IEEE754_ENTRY(remainder) - .proc remainder# - .align 32 - .global remainder# - .align 32 - -remainder: -#ifdef _LIBC -.global __remainder -.type __remainder,@function -__remainder: -#endif // inputs in f8, f9 // result in f8 @@ -139,7 +128,7 @@ __remainder: // Y +-NAN, +-inf, +-0? p11 { .mfi setf.exp f32=r28 -(p0) fclass.m.unc p11,p0 = f9, 0xe7 + fclass.m.unc p11,p0 = f9, 0xe7 nop.i 999 } // qnan snan inf norm unorm 0 -+ @@ -148,7 +137,7 @@ __remainder: // X +-NAN, +-inf, ? 
p9 { .mfi nop.m 999 -(p0) fclass.m.unc p9,p0 = f8, 0xe3 + fclass.m.unc p9,p0 = f8, 0xe3 nop.i 999;; } @@ -167,8 +156,8 @@ __remainder: } {.bbb - (p9) br.cond.spnt L(FREM_X_NAN_INF) - (p11) br.cond.spnt L(FREM_Y_NAN_INF_ZERO) + (p9) br.cond.spnt FREM_X_NAN_INF + (p11) br.cond.spnt FREM_Y_NAN_INF_ZERO nop.b 0 } {.mfi nop.m 0 @@ -178,7 +167,7 @@ __remainder: } -L(remloop24): +remloop24: { .mfi nop.m 0 // Step (2) @@ -200,7 +189,7 @@ L(remloop24): {.mfi nop.m 0 // q1=q0*(1+e0) - fma.s1 f15=f12,f7,f12 + (p6) fma.s1 f15=f12,f7,f12 nop.i 0 } { .mfi @@ -331,7 +320,7 @@ L(remloop24): // (p9) set r=r2 (new a, if not last iteration) // (p10) new a =r (p10) mov f13=f6 - (p12) br.cond.sptk L(remloop24);; + (p12) br.cond.sptk remloop24;; } // last iteration @@ -388,7 +377,7 @@ L(remloop24): } -L(FREM_X_NAN_INF): +FREM_X_NAN_INF: // Y zero ? {.mfi @@ -405,19 +394,19 @@ L(FREM_X_NAN_INF): nop.m 0 nop.i 0 // if Y zero - (p11) br.cond.spnt L(FREM_Y_ZERO);; + (p11) br.cond.spnt FREM_Y_ZERO;; } // X infinity? Return QNAN indefinite { .mfi nop.m 999 -(p0) fclass.m.unc p8,p0 = f8, 0x23 + fclass.m.unc p8,p0 = f8, 0x23 nop.i 999 } // X infinity? Return QNAN indefinite { .mfi nop.m 999 -(p0) fclass.m.unc p11,p0 = f8, 0x23 + fclass.m.unc p11,p0 = f8, 0x23 nop.i 999;; } // Y NaN ? @@ -445,14 +434,14 @@ L(FREM_X_NAN_INF): } { .mfi nop.m 999 -(p8) fma.d f8=f8,f1,f0 +(p8) fma.d.s0 f8=f8,f1,f0 nop.i 0 ;; } { .mfb nop.m 999 frcpa.s0 f8,p7=f8,f9 - (p11) br.cond.spnt L(EXP_ERROR_RETURN);; + (p11) br.cond.spnt EXP_ERROR_RETURN;; } { .mib nop.m 0 @@ -461,35 +450,35 @@ L(FREM_X_NAN_INF): } -L(FREM_Y_NAN_INF_ZERO): +FREM_Y_NAN_INF_ZERO: // Y INF { .mfi nop.m 999 -(p0) fclass.m.unc p7,p0 = f9, 0x23 + fclass.m.unc p7,p0 = f9, 0x23 nop.i 999 ;; } { .mfb nop.m 999 -(p7) fma.d f8=f8,f1,f0 +(p7) fma.d.s0 f8=f8,f1,f0 (p7) br.ret.spnt b0 ;; } // Y NAN? 
{ .mfi nop.m 999 -(p0) fclass.m.unc p9,p0 = f9, 0xc3 + fclass.m.unc p9,p0 = f9, 0xc3 nop.i 999 ;; } { .mfb nop.m 999 -(p9) fma.d f8=f9,f1,f0 +(p9) fma.d.s0 f8=f9,f1,f0 (p9) br.ret.spnt b0 ;; } -L(FREM_Y_ZERO): +FREM_Y_ZERO: // Y zero? Must be zero at this point // because it is the only choice left. // Return QNAN indefinite @@ -497,7 +486,7 @@ L(FREM_Y_ZERO): // X NAN? { .mfi nop.m 999 -(p0) fclass.m.unc p9,p10 = f8, 0xc3 + fclass.m.unc p9,p10 = f8, 0xc3 nop.i 999 ;; } { .mfi @@ -508,47 +497,41 @@ L(FREM_Y_ZERO): {.mfi nop.m 999 - (p9) frcpa f11,p7=f8,f0 + (p9) frcpa.s0 f11,p7=f8,f0 nop.i 0;; } { .mfi nop.m 999 -(p10) frcpa f11,p7 = f0,f0 +(p10) frcpa.s0 f11,p7 = f0,f0 nop.i 999;; } { .mfi nop.m 999 -(p0) fmerge.s f10 = f8, f8 + fmerge.s f10 = f8, f8 nop.i 999 } { .mfi nop.m 999 -(p0) fma.d f8=f11,f1,f0 + fma.d.s0 f8=f11,f1,f0 nop.i 999 } -L(EXP_ERROR_RETURN): +EXP_ERROR_RETURN: { .mib -(p0) mov GR_Parameter_TAG = 124 + mov GR_Parameter_TAG = 124 nop.i 999 -(p0) br.sptk __libm_error_region;; + br.sptk __libm_error_region;; } -.endp remainder -ASM_SIZE_DIRECTIVE(remainder) -#ifdef _LIBC -ASM_SIZE_DIRECTIVE(__remainder) -#endif - +GLOBAL_IEEE754_END(remainder) -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value @@ -596,10 +579,11 @@ __libm_error_region: br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# + + diff --git a/sysdeps/ia64/fpu/e_remainderf.S b/sysdeps/ia64/fpu/e_remainderf.S index 40f9b32921..bbb5fd0e0f 100644 --- a/sysdeps/ia64/fpu/e_remainderf.S +++ b/sysdeps/ia64/fpu/e_remainderf.S @@ -1,11 +1,10 @@ - .file "remainderf.asm" -// Copyright (C) 2000, 2001, Intel Corporation +.file "remainderf.s" + + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. 
// -// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Shane Story, and Ping Tak Peter Tang of the Computational -// Software Lab, -// Intel Corporation. +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -21,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -36,17 +35,19 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //==================================================================== -// 2/02/00 Initial version -// 3/02/00 New algorithm -// 4/04/00 Unwind support added -// 7/21/00 Fixed quotient=2^{24*m+23} bug -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial version +// 03/02/00 New algorithm +// 04/04/00 Unwind support added +// 07/21/00 Fixed quotient=2^{24*m+23} bug +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. 
-//11/29/00 Set FR_Y to f9 +// 11/29/00 Set FR_Y to f9 +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align // // API //==================================================================== @@ -78,9 +79,6 @@ //==================================================================== // a=+/- Inf, or b=+/-0: return NaN, call libm_error_support // a=NaN or b=NaN: return NaN - -#include "libm_support.h" - // // Registers used //==================================================================== @@ -89,8 +87,6 @@ // Floating point registers: f6-f15 // -.section .text - GR_SAVE_B0 = r33 GR_SAVE_PFS = r34 GR_SAVE_GP = r35 @@ -106,17 +102,9 @@ FR_Y = f9 FR_RESULT = f8 - .proc remainderf# - .align 32 - .global remainderf# - .align 32 +.section .text +GLOBAL_IEEE754_ENTRY(remainderf) -remainderf: -#ifdef _LIBC -.global __remainderf -.type __remainderf,@function -__remainderf: -#endif // inputs in f8, f9 // result in f8 @@ -141,7 +129,7 @@ __remainderf: // Y +-NAN, +-inf, +-0? p11 { .mfi nop.m 999 -(p0) fclass.m.unc p11,p0 = f9, 0xe7 + fclass.m.unc p11,p0 = f9, 0xe7 nop.i 999 } // qnan snan inf norm unorm 0 -+ @@ -150,7 +138,7 @@ __remainderf: // X +-NAN, +-inf, ? p9 { .mfi nop.m 999 -(p0) fclass.m.unc p9,p0 = f8, 0xe3 + fclass.m.unc p9,p0 = f8, 0xe3 nop.i 999;; } @@ -168,8 +156,8 @@ __remainderf: nop.i 0;; } {.bbb - (p9) br.cond.spnt L(FREM_X_NAN_INF) - (p11) br.cond.spnt L(FREM_Y_NAN_INF_ZERO) + (p9) br.cond.spnt FREM_X_NAN_INF + (p11) br.cond.spnt FREM_Y_NAN_INF_ZERO nop.b 0 } {.mfi nop.m 0 @@ -179,7 +167,7 @@ __remainderf: } .align 32 -L(remloop24): +remloop24: { .mfi // f12=2^{24}-2 setf.s f12=r3 @@ -347,7 +335,7 @@ L(remloop24): // (p9) set r=r2 (new a, if not last iteration) // (p10) new a =r (p10) mov f13=f6 - (p12) br.cond.sptk L(remloop24);; + (p12) br.cond.sptk remloop24;; } // last iteration @@ -408,7 +396,7 @@ L(remloop24): } -L(FREM_X_NAN_INF): +FREM_X_NAN_INF: // Y zero ? 
{.mfi @@ -425,19 +413,19 @@ L(FREM_X_NAN_INF): nop.m 0 nop.i 0 // if Y zero - (p11) br.cond.spnt L(FREM_Y_ZERO);; + (p11) br.cond.spnt FREM_Y_ZERO;; } // X infinity? Return QNAN indefinite { .mfi nop.m 999 -(p0) fclass.m.unc p8,p0 = f8, 0x23 + fclass.m.unc p8,p0 = f8, 0x23 nop.i 999 } // X infinity? Return QNAN indefinite { .mfi nop.m 999 -(p0) fclass.m.unc p11,p0 = f8, 0x23 + fclass.m.unc p11,p0 = f8, 0x23 nop.i 999;; } // Y NaN ? @@ -465,14 +453,14 @@ L(FREM_X_NAN_INF): } { .mfi nop.m 999 -(p8) fma.s f8=f8,f1,f0 +(p8) fma.s.s0 f8=f8,f1,f0 nop.i 0 ;; } { .mfb nop.m 999 frcpa.s0 f8,p7=f8,f9 - (p11) br.cond.spnt L(EXP_ERROR_RETURN);; + (p11) br.cond.spnt EXP_ERROR_RETURN;; } { .mib nop.m 0 @@ -481,35 +469,35 @@ L(FREM_X_NAN_INF): } -L(FREM_Y_NAN_INF_ZERO): +FREM_Y_NAN_INF_ZERO: // Y INF { .mfi nop.m 999 -(p0) fclass.m.unc p7,p0 = f9, 0x23 + fclass.m.unc p7,p0 = f9, 0x23 nop.i 999 ;; } { .mfb nop.m 999 -(p7) fma.s f8=f8,f1,f0 +(p7) fma.s.s0 f8=f8,f1,f0 (p7) br.ret.spnt b0 ;; } // Y NAN? { .mfi nop.m 999 -(p0) fclass.m.unc p9,p0 = f9, 0xc3 + fclass.m.unc p9,p0 = f9, 0xc3 nop.i 999 ;; } { .mfb nop.m 999 -(p9) fma.s f8=f9,f1,f0 +(p9) fma.s.s0 f8=f9,f1,f0 (p9) br.ret.spnt b0 ;; } -L(FREM_Y_ZERO): +FREM_Y_ZERO: // Y zero? Must be zero at this point // because it is the only choice left. // Return QNAN indefinite @@ -517,7 +505,7 @@ L(FREM_Y_ZERO): // X NAN? 
{ .mfi nop.m 999 -(p0) fclass.m.unc p9,p10 = f8, 0xc3 + fclass.m.unc p9,p10 = f8, 0xc3 nop.i 999 ;; } { .mfi @@ -528,47 +516,41 @@ L(FREM_Y_ZERO): {.mfi nop.m 999 - (p9) frcpa f11,p7=f8,f0 + (p9) frcpa.s0 f11,p7=f8,f0 nop.i 0;; } { .mfi nop.m 999 -(p10) frcpa f11,p7 = f0,f0 +(p10) frcpa.s0 f11,p7 = f0,f0 nop.i 999;; } { .mfi nop.m 999 -(p0) fmerge.s f10 = f8, f8 + fmerge.s f10 = f8, f8 nop.i 999 } { .mfi nop.m 999 -(p0) fma.s f8=f11,f1,f0 + fma.s.s0 f8=f11,f1,f0 nop.i 999 } -L(EXP_ERROR_RETURN): +EXP_ERROR_RETURN: { .mib -(p0) mov GR_Parameter_TAG = 125 + mov GR_Parameter_TAG = 125 nop.i 999 -(p0) br.sptk __libm_error_region;; + br.sptk __libm_error_region;; } -.endp remainderf -ASM_SIZE_DIRECTIVE(remainderf) -#ifdef _LIBC -ASM_SIZE_DIRECTIVE(__remainderf) -#endif - +GLOBAL_IEEE754_END(remainderf) -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value @@ -616,9 +598,11 @@ __libm_error_region: br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# + + + diff --git a/sysdeps/ia64/fpu/e_remainderl.S b/sysdeps/ia64/fpu/e_remainderl.S index 5856861442..1c1a3c3072 100644 --- a/sysdeps/ia64/fpu/e_remainderl.S +++ b/sysdeps/ia64/fpu/e_remainderl.S @@ -1,10 +1,10 @@ -.file "remainderl.asm" -// Copyright (C) 2000, 2001, Intel Corporation +.file "remainderl.s" + + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Shane Story, and Ping Tak Peter Tang of the Computational -// Software Lab, Intel Corporation. 
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,17 +35,19 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //==================================================================== -// 2/02/00 Initial version -// 3/02/00 New algorithm -// 4/04/00 Unwind support added -// 7/21/00 Fixed quotient=2^{24*m+23}*1.q1...q23 1 bug -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial version +// 03/02/00 New algorithm +// 04/04/00 Unwind support added +// 07/21/00 Fixed quotient=2^{24*m+23}*1.q1...q23 1 bug +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. 
-//11/29/00 Set FR_Y to f9 +// 11/29/00 Set FR_Y to f9 +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align // // API //==================================================================== @@ -77,9 +79,6 @@ //==================================================================== // a=+/- Inf, or b=+/-0: return NaN, call libm_error_support // a=NaN or b=NaN: return NaN - -#include "libm_support.h" - // // Registers used //==================================================================== @@ -87,8 +86,6 @@ // General registers: r2,r3,r28,r29,r32 (ar.pfs), r33-r39 // Floating point registers: f6-f15,f32 // -.section .text - GR_SAVE_B0 = r33 GR_SAVE_PFS = r34 @@ -105,19 +102,9 @@ FR_Y = f9 FR_RESULT = f8 +.section .text +GLOBAL_IEEE754_ENTRY(remainderl) - - .proc remainderl# - .align 32 - .global remainderl# - .align 32 - -remainderl: -#ifdef _LIBC -.global __remainderl -.type __remainderl,@function -__remainderl: -#endif // inputs in f8, f9 // result in f8 @@ -159,7 +146,7 @@ cmp.eq p11,p10=r29,r0;; // X +-NAN, +-inf, ? p9 { .mfi nop.m 999 -(p0) fclass.m.unc p9,p8 = f8, 0xe3 + fclass.m.unc p9,p8 = f8, 0xe3 nop.i 999;; } @@ -196,8 +183,8 @@ cmp.eq p11,p10=r29,r0;; } {.bbb - (p9) br.cond.spnt L(FREM_X_NAN_INF) - (p11) br.cond.spnt L(FREM_Y_NAN_INF_ZERO) + (p9) br.cond.spnt FREM_X_NAN_INF + (p11) br.cond.spnt FREM_Y_NAN_INF_ZERO nop.b 0 } {.mfi nop.m 0 @@ -206,7 +193,7 @@ cmp.eq p11,p10=r29,r0;; nop.i 0;; } -L(remloop24): +remloop24: { .mfi nop.m 0 // Step (2) @@ -228,7 +215,7 @@ L(remloop24): {.mfi nop.m 0 // q1=q0*(1+e0) - fma.s1 f15=f12,f7,f12 + (p6) fma.s1 f15=f12,f7,f12 nop.i 0 } { .mfi @@ -358,7 +345,7 @@ L(remloop24): // (p9) set r=r2 (new a, if not last iteration) // (p10) new a =r (p10) mov f13=f6 - (p12) br.cond.sptk L(remloop24);; + (p12) br.cond.sptk remloop24;; } // last iteration @@ -416,7 +403,7 @@ L(remloop24): -L(FREM_X_NAN_INF): +FREM_X_NAN_INF: // Y zero ? 
{.mfi @@ -433,19 +420,19 @@ L(FREM_X_NAN_INF): nop.m 0 nop.i 0 // if Y zero - (p11) br.cond.spnt L(FREM_Y_ZERO);; + (p11) br.cond.spnt FREM_Y_ZERO;; } // X infinity? Return QNAN indefinite { .mfi nop.m 999 -(p0) fclass.m.unc p8,p0 = f8, 0x23 + fclass.m.unc p8,p0 = f8, 0x23 nop.i 999 } // X infinity? Return QNAN indefinite { .mfi nop.m 999 -(p0) fclass.m.unc p11,p0 = f8, 0x23 + fclass.m.unc p11,p0 = f8, 0x23 nop.i 999;; } // Y NaN ? @@ -473,14 +460,14 @@ L(FREM_X_NAN_INF): } { .mfi nop.m 999 -(p8) fma f8=f8,f1,f0 +(p8) fma.s0 f8=f8,f1,f0 nop.i 0 ;; } { .mfb nop.m 999 frcpa.s0 f8,p7=f8,f9 - (p11) br.cond.spnt L(EXP_ERROR_RETURN);; + (p11) br.cond.spnt EXP_ERROR_RETURN;; } { .mib nop.m 0 @@ -489,24 +476,24 @@ L(FREM_X_NAN_INF): } -L(FREM_Y_NAN_INF_ZERO): +FREM_Y_NAN_INF_ZERO: // Y INF { .mfi nop.m 999 -(p0) fclass.m.unc p7,p0 = f9, 0x23 + fclass.m.unc p7,p0 = f9, 0x23 nop.i 999 ;; } { .mfb nop.m 999 -(p7) fma f8=f8,f1,f0 +(p7) fma.s0 f8=f8,f1,f0 (p7) br.ret.spnt b0 ;; } // Y NAN? { .mfi nop.m 999 -(p0) fclass.m.unc p9,p10 = f9, 0xc3 + fclass.m.unc p9,p10 = f9, 0xc3 nop.i 999 ;; } { .mfi @@ -517,11 +504,11 @@ L(FREM_Y_NAN_INF_ZERO): { .mfb nop.m 999 -(p9) fma f8=f9,f1,f0 +(p9) fma.s0 f8=f9,f1,f0 (p9) br.ret.spnt b0 ;; } -L(FREM_Y_ZERO): +FREM_Y_ZERO: // Y zero? Must be zero at this point // because it is the only choice left. // Return QNAN indefinite @@ -529,7 +516,7 @@ L(FREM_Y_ZERO): // X NAN? 
{ .mfi nop.m 999 -(p0) fclass.m.unc p9,p10 = f8, 0xc3 + fclass.m.unc p9,p10 = f8, 0xc3 nop.i 999 ;; } { .mfi @@ -540,43 +527,37 @@ L(FREM_Y_ZERO): {.mfi nop.m 999 - (p9) frcpa f11,p7=f8,f0 + (p9) frcpa.s0 f11,p7=f8,f0 nop.i 0;; } { .mfi nop.m 999 -(p10) frcpa f11,p7 = f0,f0 +(p10) frcpa.s0 f11,p7 = f0,f0 nop.i 999;; } { .mfi nop.m 999 -(p0) fmerge.s f10 = f8, f8 + fmerge.s f10 = f8, f8 nop.i 999 } { .mfi nop.m 999 -(p0) fma f8=f11,f1,f0 + fma.s0 f8=f11,f1,f0 nop.i 999;; } -L(EXP_ERROR_RETURN): +EXP_ERROR_RETURN: { .mib -(p0) mov GR_Parameter_TAG = 123 + mov GR_Parameter_TAG = 123 nop.i 999 -(p0) br.sptk __libm_error_region;; + br.sptk __libm_error_region;; } -.endp remainderl -ASM_SIZE_DIRECTIVE(remainderl) -#ifdef _LIBC -ASM_SIZE_DIRECTIVE(__remainderl) -#endif - -.proc __libm_error_region -__libm_error_region: +GLOBAL_IEEE754_END(remainderl) +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value @@ -624,9 +605,12 @@ __libm_error_region: br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# + + + + diff --git a/sysdeps/ia64/fpu/e_scalb.S b/sysdeps/ia64/fpu/e_scalb.S index 7f5b5796de..82e914e259 100644 --- a/sysdeps/ia64/fpu/e_scalb.S +++ b/sysdeps/ia64/fpu/e_scalb.S @@ -1,10 +1,10 @@ .file "scalb.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. 
+// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,12 +35,14 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial version -// 1/26/01 Scalb completely reworked and now standalone version +// 02/02/00 Initial version +// 01/26/01 Scalb completely reworked and now standalone version +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align // // API //============================================================== @@ -53,8 +55,6 @@ // // -#include "libm_support.h" - FR_Floating_X = f8 FR_Result = f8 FR_Floating_N = f9 @@ -84,19 +84,8 @@ GR_Parameter_Y = r36 GR_Parameter_RESULT = r37 GR_Tag = r38 -.align 32 -.global scalb - .section .text -.proc scalb -.align 32 - -scalb: -#ifdef _LIBC -.global __ieee754_scalb -.type __ieee754_scalb,@function -__ieee754_scalb: -#endif +GLOBAL_IEEE754_ENTRY(scalb) // // Is x NAN, INF, ZERO, +-? 
@@ -140,12 +129,12 @@ __ieee754_scalb: { .mib setf.exp FR_Big = GR_Scratch nop.i 0 -(p6) br.cond.spnt L(SCALB_NAN_INF_ZERO) +(p6) br.cond.spnt SCALB_NAN_INF_ZERO } { .mib setf.exp FR_NBig = GR_Scratch1 nop.i 0 -(p7) br.cond.spnt L(SCALB_NAN_INF_ZERO) +(p7) br.cond.spnt SCALB_NAN_INF_ZERO };; // @@ -212,7 +201,7 @@ __ieee754_scalb: } { .mfb nop.m 0 -(p7) frcpa f8,p11 = f0,f0 +(p7) frcpa.s0 f8,p11 = f0,f0 (p7) br.ret.spnt b0 };; @@ -246,7 +235,7 @@ __ieee754_scalb: } { .mlx nop.m 999 -(p0) movl GR_Scratch = 0x00000000000303FF + movl GR_Scratch = 0x00000000000303FF };; { .mfi nop.m 0 @@ -255,7 +244,7 @@ __ieee754_scalb: } { .mlx nop.m 999 -(p0) movl GR_Scratch1= 0x00000000000103FF + movl GR_Scratch1= 0x00000000000103FF };; // Set up necessary status fields @@ -266,12 +255,12 @@ __ieee754_scalb: // { .mfi nop.m 999 -(p0) fsetc.s3 0x7F,0x41 + fsetc.s3 0x7F,0x41 nop.i 999 } { .mfi nop.m 999 -(p0) fsetc.s2 0x7F,0x42 + fsetc.s2 0x7F,0x42 nop.i 999 };; @@ -345,7 +334,7 @@ __ieee754_scalb: { .mfb (p6) addl GR_Tag = 54, r0 (p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig -(p6) br.cond.spnt L(SCALB_UNDERFLOW) +(p6) br.cond.spnt SCALB_UNDERFLOW };; // @@ -353,8 +342,8 @@ __ieee754_scalb: // { .mbb nop.m 0 -(p7) br.cond.spnt L(SCALB_OVERFLOW) -(p9) br.cond.spnt L(SCALB_OVERFLOW) +(p7) br.cond.spnt SCALB_OVERFLOW +(p9) br.cond.spnt SCALB_OVERFLOW };; // @@ -366,7 +355,7 @@ __ieee754_scalb: br.ret.sptk b0;; } -L(SCALB_NAN_INF_ZERO): +SCALB_NAN_INF_ZERO: // // Convert N to a fp integer @@ -471,16 +460,11 @@ L(SCALB_NAN_INF_ZERO): br.ret.sptk b0 };; -.endp scalb -ASM_SIZE_DIRECTIVE(scalb) -#ifdef _LIBC -ASM_SIZE_DIRECTIVE(__ieee754_scalb) -#endif -.proc __libm_error_region +GLOBAL_IEEE754_END(scalb) __libm_error_region: -L(SCALB_OVERFLOW): -L(SCALB_UNDERFLOW): +SCALB_OVERFLOW: +SCALB_UNDERFLOW: // // Get stack address of N @@ -557,8 +541,7 @@ L(SCALB_UNDERFLOW): br.ret.sptk b0 };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) 
+LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_scalbf.S b/sysdeps/ia64/fpu/e_scalbf.S index 40af080d38..07acb3297e 100644 --- a/sysdeps/ia64/fpu/e_scalbf.S +++ b/sysdeps/ia64/fpu/e_scalbf.S @@ -1,10 +1,10 @@ .file "scalbf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,12 +35,14 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
// // History //============================================================== -// 2/02/00 Initial version -// 1/26/01 Scalb completely reworked and now standalone version +// 02/02/00 Initial version +// 01/26/01 Scalb completely reworked and now standalone version +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align // // API //============================================================== @@ -53,8 +55,6 @@ // // -#include "libm_support.h" - FR_Floating_X = f8 FR_Result = f8 FR_Floating_N = f9 @@ -84,19 +84,8 @@ GR_Parameter_Y = r36 GR_Parameter_RESULT = r37 GR_Tag = r38 -.align 32 -.global scalbf - .section .text -.proc scalbf -.align 32 - -scalbf: -#ifdef _LIBC -.global __ieee754_scalbf -.type __ieee754_scalbf,@function -__ieee754_scalbf: -#endif +GLOBAL_IEEE754_ENTRY(scalbf) // // Is x NAN, INF, ZERO, +-? @@ -140,12 +129,12 @@ __ieee754_scalbf: { .mib setf.exp FR_Big = GR_Scratch nop.i 0 -(p6) br.cond.spnt L(SCALBF_NAN_INF_ZERO) +(p6) br.cond.spnt SCALBF_NAN_INF_ZERO } { .mib setf.exp FR_NBig = GR_Scratch1 nop.i 0 -(p7) br.cond.spnt L(SCALBF_NAN_INF_ZERO) +(p7) br.cond.spnt SCALBF_NAN_INF_ZERO };; // @@ -212,7 +201,7 @@ __ieee754_scalbf: } { .mfb nop.m 0 -(p7) frcpa f8,p11 = f0,f0 +(p7) frcpa.s0 f8,p11 = f0,f0 (p7) br.ret.spnt b0 };; @@ -246,7 +235,7 @@ __ieee754_scalbf: } { .mlx nop.m 999 -(p0) movl GR_Scratch = 0x000000000003007F + movl GR_Scratch = 0x000000000003007F };; { .mfi nop.m 0 @@ -255,7 +244,7 @@ __ieee754_scalbf: } { .mlx nop.m 999 -(p0) movl GR_Scratch1= 0x000000000001007F + movl GR_Scratch1= 0x000000000001007F };; // Set up necessary status fields @@ -266,12 +255,12 @@ __ieee754_scalbf: // { .mfi nop.m 999 -(p0) fsetc.s3 0x7F,0x41 + fsetc.s3 0x7F,0x41 nop.i 999 } { .mfi nop.m 999 -(p0) fsetc.s2 0x7F,0x42 + fsetc.s2 0x7F,0x42 nop.i 999 };; @@ -345,7 +334,7 @@ __ieee754_scalbf: { .mfb (p6) addl GR_Tag = 56, r0 (p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig -(p6) br.cond.spnt 
L(SCALBF_UNDERFLOW) +(p6) br.cond.spnt SCALBF_UNDERFLOW };; // @@ -353,8 +342,8 @@ __ieee754_scalbf: // { .mbb nop.m 0 -(p7) br.cond.spnt L(SCALBF_OVERFLOW) -(p9) br.cond.spnt L(SCALBF_OVERFLOW) +(p7) br.cond.spnt SCALBF_OVERFLOW +(p9) br.cond.spnt SCALBF_OVERFLOW };; // @@ -366,7 +355,7 @@ __ieee754_scalbf: br.ret.sptk b0;; } -L(SCALBF_NAN_INF_ZERO): +SCALBF_NAN_INF_ZERO: // // Convert N to a fp integer @@ -471,16 +460,11 @@ L(SCALBF_NAN_INF_ZERO): br.ret.sptk b0 };; -.endp scalbf -ASM_SIZE_DIRECTIVE(scalbf) -#ifdef _LIBC -ASM_SIZE_DIRECTIVE(__ieee754_scalbf) -#endif -.proc __libm_error_region +GLOBAL_IEEE754_END(scalbf) __libm_error_region: -L(SCALBF_OVERFLOW): -L(SCALBF_UNDERFLOW): +SCALBF_OVERFLOW: +SCALBF_UNDERFLOW: // // Get stack address of N @@ -557,8 +541,7 @@ L(SCALBF_UNDERFLOW): br.ret.sptk b0 };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_scalbl.S b/sysdeps/ia64/fpu/e_scalbl.S index 43eac7a2ad..d22d029155 100644 --- a/sysdeps/ia64/fpu/e_scalbl.S +++ b/sysdeps/ia64/fpu/e_scalbl.S @@ -1,10 +1,10 @@ .file "scalbl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,12 +35,14 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial version -// 1/26/01 Scalb completely reworked and now standalone version +// 02/02/00 Initial version +// 01/26/01 Scalb completely reworked and now standalone version +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align // // API //============================================================== @@ -53,8 +55,6 @@ // // -#include "libm_support.h" - FR_Floating_X = f8 FR_Result = f8 FR_Floating_N = f9 @@ -84,19 +84,8 @@ GR_Parameter_Y = r36 GR_Parameter_RESULT = r37 GR_Tag = r38 -.align 32 -.global scalbl - .section .text -.proc scalbl -.align 32 - -scalbl: -#ifdef _LIBC -.global __ieee754_scalbl -.type __ieee754_scalbl,@function -__ieee754_scalbl: -#endif +GLOBAL_IEEE754_ENTRY(scalbl) // // Is x NAN, INF, ZERO, +-? 
@@ -140,12 +129,12 @@ __ieee754_scalbl: { .mib setf.exp FR_Big = GR_Scratch nop.i 0 -(p6) br.cond.spnt L(SCALBL_NAN_INF_ZERO) +(p6) br.cond.spnt SCALBL_NAN_INF_ZERO } { .mib setf.exp FR_NBig = GR_Scratch1 nop.i 0 -(p7) br.cond.spnt L(SCALBL_NAN_INF_ZERO) +(p7) br.cond.spnt SCALBL_NAN_INF_ZERO };; // @@ -212,7 +201,7 @@ __ieee754_scalbl: } { .mfb nop.m 0 -(p7) frcpa f8,p11 = f0,f0 +(p7) frcpa.s0 f8,p11 = f0,f0 (p7) br.ret.spnt b0 };; @@ -246,7 +235,7 @@ __ieee754_scalbl: } { .mlx nop.m 999 -(p0) movl GR_Scratch = 0x0000000000033FFF + movl GR_Scratch = 0x0000000000033FFF };; { .mfi nop.m 0 @@ -255,7 +244,7 @@ __ieee754_scalbl: } { .mlx nop.m 999 -(p0) movl GR_Scratch1= 0x0000000000013FFF + movl GR_Scratch1= 0x0000000000013FFF };; // Set up necessary status fields @@ -266,12 +255,12 @@ __ieee754_scalbl: // { .mfi nop.m 999 -(p0) fsetc.s3 0x7F,0x41 + fsetc.s3 0x7F,0x41 nop.i 999 } { .mfi nop.m 999 -(p0) fsetc.s2 0x7F,0x42 + fsetc.s2 0x7F,0x42 nop.i 999 };; @@ -345,7 +334,7 @@ __ieee754_scalbl: { .mfb (p6) addl GR_Tag = 52, r0 (p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig -(p6) br.cond.spnt L(SCALBL_UNDERFLOW) +(p6) br.cond.spnt SCALBL_UNDERFLOW };; // @@ -353,8 +342,8 @@ __ieee754_scalbl: // { .mbb nop.m 0 -(p7) br.cond.spnt L(SCALBL_OVERFLOW) -(p9) br.cond.spnt L(SCALBL_OVERFLOW) +(p7) br.cond.spnt SCALBL_OVERFLOW +(p9) br.cond.spnt SCALBL_OVERFLOW };; // @@ -366,7 +355,7 @@ __ieee754_scalbl: br.ret.sptk b0;; } -L(SCALBL_NAN_INF_ZERO): +SCALBL_NAN_INF_ZERO: // // Convert N to a fp integer @@ -471,16 +460,11 @@ L(SCALBL_NAN_INF_ZERO): br.ret.sptk b0 };; -.endp scalbl -ASM_SIZE_DIRECTIVE(scalbl) -#ifdef _LIBC -ASM_SIZE_DIRECTIVE(__ieee754_scalbl) -#endif -.proc __libm_error_region +GLOBAL_IEEE754_END(scalbl) __libm_error_region: -L(SCALBL_OVERFLOW): -L(SCALBL_UNDERFLOW): +SCALBL_OVERFLOW: +SCALBL_UNDERFLOW: // // Get stack address of N @@ -557,8 +541,7 @@ L(SCALBL_UNDERFLOW): br.ret.sptk b0 };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) 
+LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_sinh.S b/sysdeps/ia64/fpu/e_sinh.S index 4415dc7524..84c312c2b7 100644 --- a/sysdeps/ia64/fpu/e_sinh.S +++ b/sysdeps/ia64/fpu/e_sinh.S @@ -1,10 +1,10 @@ .file "sinh.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2002, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,1249 +20,838 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial version -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial version +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. // 10/12/00 Update to set denormal operand and underflow flags -// 1/22/01 Fixed to set inexact flag for small args. -// +// 01/22/01 Fixed to set inexact flag for small args. 
+// 05/02/01 Reworked to improve speed of all paths +// 05/20/02 Cleaned up namespace and sf0 syntax +// 11/20/02 Improved speed with new algorithm + // API //============================================================== -// double = sinh(double) -// input floating point f8 -// output floating point f8 -// -// Registers used -//============================================================== -// general registers: -// r32 -> r47 -// predicate registers used: -// p6 p7 p8 p9 -// floating-point registers used: -// f9 -> f15; f32 -> f45; -// f8 has input, then output -// +// double sinh(double) + // Overview of operation //============================================================== -// There are four paths -// 1. |x| < 0.25 SINH_BY_POLY -// 2. |x| < 32 SINH_BY_TBL -// 3. |x| < 2^14 SINH_BY_EXP -// 4. |x_ >= 2^14 SINH_HUGE -// -// For double extended we get infinity for x >= 400c b174 ddc0 31ae c0ea -// >= 1.0110001.... x 2^13 -// >= 11357.2166 -// -// But for double we get infinity for x >= 408633ce8fb9f87e -// >= 1.0110...x 2^9 -// >= +7.10476e+002 -// -// And for single we get infinity for x >= 42b3a496 -// >= 1.0110... 2^6 -// >= 89.8215 +// Case 1: 0 < |x| < 2^-60 +// Result = x, computed by x+sgn(x)*x^2 to handle flags and rounding // -// SAFE: If there is danger of overflow set SAFE to 0 -// NOT implemented: if there is danger of underflow, set SAFE to 0 -// SAFE for all paths listed below +// Case 2: 2^-60 < |x| < 0.25 +// Evaluate sinh(x) by a 13th order polynomial +// Care is taken for the order of multiplication; and A1 is not exactly 1/3!, +// A2 is not exactly 1/5!, etc. +// sinh(x) = x + (A1*x^3 + A2*x^5 + A3*x^7 + A4*x^9 + A5*x^11 + A6*x^13) // -// 1.
SINH_BY_POLY -// =============== -// If |x| is less than the tiny threshold, then clear SAFE -// For double, the tiny threshold is -1022 = -0x3fe => -3fe + ffff = fc01 -// register-biased, this is fc01 -// For single, the tiny threshold is -126 = -7e => -7e + ffff = ff81 -// If |x| < tiny threshold, set SAFE = 0 +// Case 3: 0.25 < |x| < 710.47586 +// Algorithm is based on the identity sinh(x) = ( exp(x) - exp(-x) ) / 2. +// The algorithm for exp is described as below. There are a number of +// economies from evaluating both exp(x) and exp(-x). Although we +// are evaluating both quantities, only where the quantities diverge do we +// duplicate the computations. The basic algorithm for exp(x) is described +// below. // -// 2. SINH_BY_TBL -// ============= -// SAFE: SAFE is always 1 for TBL; -// -// 3. SINH_BY_EXP -// ============== -// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe -// r34 has N-1; 16382 is in register biased form, 0x13ffd -// There is danger of double overflow if N-1 > 0x3fe -// in register biased form, 0x103fd -// Analagously, there is danger of single overflow if N-1 > 0x7e -// in register biased form, 0x1007d -// SAFE: If there is danger of overflow set SAFE to 0 -// -// 4. SINH_HUGE -// ============ -// SAFE: SAFE is always 0 for HUGE +// Take the input x. w is "how many log2/128 in x?" 
+// w = x * 128/log2 +// n = int(w) +// x = n log2/128 + r + delta -#include "libm_support.h" +// n = 128M + index_1 + 2^4 index_2 +// x = M log2 + (log2/128) index_1 + (log2/8) index_2 + r + delta -// -// Assembly macros -//============================================================== -sinh_FR_X = f44 -sinh_FR_X2 = f9 -sinh_FR_X4 = f10 -sinh_FR_SGNX = f40 -sinh_FR_all_ones = f45 -sinh_FR_tmp = f42 - -sinh_FR_Inv_log2by64 = f9 -sinh_FR_log2by64_lo = f11 -sinh_FR_log2by64_hi = f10 - -sinh_FR_A1 = f9 -sinh_FR_A2 = f10 -sinh_FR_A3 = f11 - -sinh_FR_Rcub = f12 -sinh_FR_M_temp = f13 -sinh_FR_R_temp = f13 -sinh_FR_Rsq = f13 -sinh_FR_R = f14 - -sinh_FR_M = f38 - -sinh_FR_B1 = f15 -sinh_FR_B2 = f32 -sinh_FR_B3 = f33 - -sinh_FR_peven_temp1 = f34 -sinh_FR_peven_temp2 = f35 -sinh_FR_peven = f36 - -sinh_FR_podd_temp1 = f34 -sinh_FR_podd_temp2 = f35 -sinh_FR_podd = f37 - -sinh_FR_poly_podd_temp1 = f11 -sinh_FR_poly_podd_temp2 = f13 -sinh_FR_poly_peven_temp1 = f11 -sinh_FR_poly_peven_temp2 = f13 +// exp(x) = 2^M 2^(index_1/128) 2^(index_2/8) exp(r) exp(delta) +// Construct 2^M +// Get 2^(index_1/128) from table_1; +// Get 2^(index_2/8) from table_2; +// Calculate exp(r) by 5th order polynomial +// r = x - n (log2/128)_high +// delta = - n (log2/128)_low +// Calculate exp(delta) as 1 + delta -sinh_FR_J_temp = f9 -sinh_FR_J = f10 -sinh_FR_Mmj = f39 - -sinh_FR_N_temp1 = f11 -sinh_FR_N_temp2 = f12 -sinh_FR_N = f13 - -sinh_FR_spos = f14 -sinh_FR_sneg = f15 - -sinh_FR_Tjhi = f32 -sinh_FR_Tjlo = f33 -sinh_FR_Tmjhi = f34 -sinh_FR_Tmjlo = f35 - -sinh_GR_mJ = r35 -sinh_GR_J = r36 - -sinh_AD_mJ = r38 -sinh_AD_J = r39 -sinh_GR_all_ones = r40 - -sinh_FR_S_hi = f9 -sinh_FR_S_hi_temp = f10 -sinh_FR_S_lo_temp1 = f11 -sinh_FR_S_lo_temp2 = f12 -sinh_FR_S_lo_temp3 = f13 - -sinh_FR_S_lo = f38 -sinh_FR_C_hi = f39 +// Special values +//============================================================== +// sinh(+0) = +0 +// sinh(-0) = -0 -sinh_FR_C_hi_temp1 = f10 -sinh_FR_Y_hi = f11 -sinh_FR_Y_lo_temp = 
f12 -sinh_FR_Y_lo = f13 -sinh_FR_SINH = f9 +// sinh(+qnan) = +qnan +// sinh(-qnan) = -qnan +// sinh(+snan) = +qnan +// sinh(-snan) = -qnan -sinh_FR_P1 = f14 -sinh_FR_P2 = f15 -sinh_FR_P3 = f32 -sinh_FR_P4 = f33 -sinh_FR_P5 = f34 -sinh_FR_P6 = f35 +// sinh(-inf) = -inf +// sinh(+inf) = +inf -sinh_FR_TINY_THRESH = f9 +// Overflow and Underflow +//======================= +// sinh(x) = largest double normal when +// |x| = 710.47586 = 0x408633ce8fb9f87d +// +// Underflow is handled as described in case 1 above -sinh_FR_SINH_temp = f10 -sinh_FR_SCALE = f11 +// Registers used +//============================================================== +// Floating Point registers used: +// f8, input, output +// f6 -> f15, f32 -> f61 -sinh_FR_signed_hi_lo = f10 +// General registers used: +// r14 -> r40 +// Predicate registers used: +// p6 -> p15 -GR_SAVE_PFS = r41 -GR_SAVE_B0 = r42 -GR_SAVE_GP = r43 +// Assembly macros +//============================================================== -GR_Parameter_X = r44 -GR_Parameter_Y = r45 -GR_Parameter_RESULT = r46 +rRshf = r14 +rN_neg = r14 +rAD_TB1 = r15 +rAD_TB2 = r16 +rAD_P = r17 +rN = r18 +rIndex_1 = r19 +rIndex_2_16 = r20 +rM = r21 +rBiased_M = r21 +rSig_inv_ln2 = r22 +rIndex_1_neg = r22 +rExp_bias = r23 +rExp_bias_minus_1 = r23 +rExp_mask = r24 +rTmp = r24 +rGt_ln = r24 +rIndex_2_16_neg = r24 +rM_neg = r25 +rBiased_M_neg = r25 +rRshf_2to56 = r26 +rAD_T1_neg = r26 +rExp_2tom56 = r28 +rAD_T2_neg = r28 +rAD_T1 = r29 +rAD_T2 = r30 +rSignexp_x = r31 +rExp_x = r31 + +GR_SAVE_B0 = r33 +GR_SAVE_PFS = r34 +GR_SAVE_GP = r35 + +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 + + +FR_X = f10 +FR_Y = f1 +FR_RESULT = f8 + +fRSHF_2TO56 = f6 +fINV_LN2_2TO63 = f7 +fW_2TO56_RSH = f9 +f2TOM56 = f11 +fP5 = f12 +fP4 = f13 +fP3 = f14 +fP2 = f15 + +fLn2_by_128_hi = f33 +fLn2_by_128_lo = f34 + +fRSHF = f35 +fNfloat = f36 +fNormX = f37 +fR = f38 +fF = f39 + +fRsq = f40 +f2M = f41 +fS1 = f42 +fT1 = f42 +fS2 = f43 +fT2 
= f43 +fS = f43 +fWre_urm_f8 = f44 +fAbsX = f44 + +fMIN_DBL_OFLOW_ARG = f45 +fMAX_DBL_NORM_ARG = f46 +fXsq = f47 +fX4 = f48 +fGt_pln = f49 +fTmp = f49 + +fP54 = f50 +fP5432 = f50 +fP32 = f51 +fP = f52 +fP54_neg = f53 +fP5432_neg = f53 +fP32_neg = f54 +fP_neg = f55 +fF_neg = f56 + +f2M_neg = f57 +fS1_neg = f58 +fT1_neg = f58 +fS2_neg = f59 +fT2_neg = f59 +fS_neg = f59 +fExp = f60 +fExp_neg = f61 + +fA6 = f50 +fA65 = f50 +fA6543 = f50 +fA654321 = f50 +fA5 = f51 +fA4 = f52 +fA43 = f52 +fA3 = f53 +fA2 = f54 +fA21 = f54 +fA1 = f55 +fX3 = f56 // Data tables //============================================================== -#ifdef _LIBC -.rodata -#else -.data -#endif - +RODATA .align 16 -double_sinh_arg_reduction: -ASM_TYPE_DIRECTIVE(double_sinh_arg_reduction,@object) - data8 0xB8AA3B295C17F0BC, 0x00004005 - data8 0xB17217F7D1000000, 0x00003FF8 - data8 0xCF79ABC9E3B39804, 0x00003FD0 -ASM_SIZE_DIRECTIVE(double_sinh_arg_reduction) - -double_sinh_p_table: -ASM_TYPE_DIRECTIVE(double_sinh_p_table,@object) - data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC - data8 0x8888888888888412, 0x00003FF8 - data8 0xD00D00D00D4D39F2, 0x00003FF2 - data8 0xB8EF1D28926D8891, 0x00003FEC - data8 0xD732377688025BE9, 0x00003FE5 - data8 0xB08AF9AE78C1239F, 0x00003FDE -ASM_SIZE_DIRECTIVE(double_sinh_p_table) - -double_sinh_ab_table: -ASM_TYPE_DIRECTIVE(double_sinh_ab_table,@object) - data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC - data8 0x88888888884ECDD5, 0x00003FF8 - data8 0xD00D0C6DCC26A86B, 0x00003FF2 - data8 0x8000000000000002, 0x00003FFE - data8 0xAAAAAAAAAA402C77, 0x00003FFA - data8 0xB60B6CC96BDB144D, 0x00003FF5 -ASM_SIZE_DIRECTIVE(double_sinh_ab_table) - -double_sinh_j_table: -ASM_TYPE_DIRECTIVE(double_sinh_j_table,@object) - data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000 - data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000 - data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000 - data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000 - data8 0xBD08A39F580C36BF, 0x00003FFE, 
0x9EAEFDC1, 0x00000000 - data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000 - data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000 - data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000 - data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000 - data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000 - data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000 - data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000 - data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000 - data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000 - data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000 - data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000 - data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000 - data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000 - data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000 - data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000 - data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000 - data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000 - data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000 - data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000 - data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000 - data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000 - data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000 - data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000 - data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000 - data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000 - data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000 - data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000 - data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000 - data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000 - data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000 - data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000 - data8 
0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000 - data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000 - data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000 - data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000 - data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000 - data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000 - data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000 - data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000 - data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000 - data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000 - data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000 - data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000 - data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000 - data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000 - data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000 - data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000 - data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000 - data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000 - data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000 - data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000 - data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000 - data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000 - data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000 - data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000 - data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000 - data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000 - data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000 - data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000 - data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000 -ASM_SIZE_DIRECTIVE(double_sinh_j_table) - -.align 32 -.global sinh# -.section .text -.proc sinh# -.align 32 - -sinh: -#ifdef _LIBC -.global __ieee754_sinh -.type 
__ieee754_sinh,@function -__ieee754_sinh: -#endif - -// X infinity or NAN? -// Take invalid fault if enabled +// ************* DO NOT CHANGE ORDER OF THESE TABLES ******************** +// double-extended 1/ln(2) +// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88 +// 3fff b8aa 3b29 5c17 f0bc +// For speed the significand will be loaded directly with a movl and setf.sig +// and the exponent will be bias+63 instead of bias+0. Thus subsequent +// computations need to scale appropriately. +// The constant 128/ln(2) is needed for the computation of w. This is also +// obtained by scaling the computations. +// +// Two shifting constants are loaded directly with movl and setf.d. +// 1. fRSHF_2TO56 = 1.1000..00 * 2^(63-7) +// This constant is added to x*1/ln2 to shift the integer part of +// x*128/ln2 into the rightmost bits of the significand. +// The result of this fma is fW_2TO56_RSH. +// 2. fRSHF = 1.1000..00 * 2^(63) +// This constant is subtracted from fW_2TO56_RSH * 2^(-56) to give +// the integer part of w, n, as a floating-point number. +// The result of this fms is fNfloat. 
+ + +LOCAL_OBJECT_START(exp_table_1) +data8 0x408633ce8fb9f87e // smallest dbl overflow arg +data8 0x408633ce8fb9f87d // largest dbl arg to give normal dbl result +data8 0xb17217f7d1cf79ab , 0x00003ff7 // ln2/128 hi +data8 0xc9e3b39803f2f6af , 0x00003fb7 // ln2/128 lo +// +// Table 1 is 2^(index_1/128) where +// index_1 goes from 0 to 15 +// +data8 0x8000000000000000 , 0x00003FFF +data8 0x80B1ED4FD999AB6C , 0x00003FFF +data8 0x8164D1F3BC030773 , 0x00003FFF +data8 0x8218AF4373FC25EC , 0x00003FFF +data8 0x82CD8698AC2BA1D7 , 0x00003FFF +data8 0x8383594EEFB6EE37 , 0x00003FFF +data8 0x843A28C3ACDE4046 , 0x00003FFF +data8 0x84F1F656379C1A29 , 0x00003FFF +data8 0x85AAC367CC487B15 , 0x00003FFF +data8 0x8664915B923FBA04 , 0x00003FFF +data8 0x871F61969E8D1010 , 0x00003FFF +data8 0x87DB357FF698D792 , 0x00003FFF +data8 0x88980E8092DA8527 , 0x00003FFF +data8 0x8955EE03618E5FDD , 0x00003FFF +data8 0x8A14D575496EFD9A , 0x00003FFF +data8 0x8AD4C6452C728924 , 0x00003FFF +LOCAL_OBJECT_END(exp_table_1) + +// Table 2 is 2^(index_2/8) where +// index_2 goes from 0 to 7 +LOCAL_OBJECT_START(exp_table_2) +data8 0x8000000000000000 , 0x00003FFF +data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF +data8 0x9837F0518DB8A96F , 0x00003FFF +data8 0xA5FED6A9B15138EA , 0x00003FFF +data8 0xB504F333F9DE6484 , 0x00003FFF +data8 0xC5672A115506DADD , 0x00003FFF +data8 0xD744FCCAD69D6AF4 , 0x00003FFF +data8 0xEAC0C6E7DD24392F , 0x00003FFF +LOCAL_OBJECT_END(exp_table_2) + + +LOCAL_OBJECT_START(exp_p_table) +data8 0x3f8111116da21757 //P5 +data8 0x3fa55555d787761c //P4 +data8 0x3fc5555555555414 //P3 +data8 0x3fdffffffffffd6a //P2 +LOCAL_OBJECT_END(exp_p_table) + +LOCAL_OBJECT_START(sinh_p_table) +data8 0xB08AF9AE78C1239F, 0x00003FDE // A6 +data8 0xB8EF1D28926D8891, 0x00003FEC // A4 +data8 0x8888888888888412, 0x00003FF8 // A2 +data8 0xD732377688025BE9, 0x00003FE5 // A5 +data8 0xD00D00D00D4D39F2, 0x00003FF2 // A3 +data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC // A1 +LOCAL_OBJECT_END(sinh_p_table) -{ .mfi - alloc r32 =
ar.pfs,0,12,4,0 -(p0) fclass.m.unc p6,p0 = f8, 0xe3 //@qnan | @snan | @inf - mov sinh_GR_all_ones = -1 -} -;; +.section .text +GLOBAL_IEEE754_ENTRY(sinh) -{ .mfb - nop.m 999 -(p6) fma.d.s0 f8 = f8,f1,f8 -(p6) br.ret.spnt b0 ;; +{ .mlx + getf.exp rSignexp_x = f8 // Must recompute if x unorm + movl rSig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2 } - -// Put 0.25 in f9; p6 true if x < 0.25 -// Make constant that will generate inexact when squared { .mlx - setf.sig sinh_FR_all_ones = sinh_GR_all_ones -(p0) movl r32 = 0x000000000000fffd ;; + addl rAD_TB1 = @ltoff(exp_table_1), gp + movl rRshf_2to56 = 0x4768000000000000 // 1.10000 2^(63+56) } +;; { .mfi -(p0) setf.exp f9 = r32 -(p0) fclass.m.unc p7,p0 = f8, 0x07 //@zero - nop.i 999 ;; + ld8 rAD_TB1 = [rAD_TB1] + fclass.m p6,p0 = f8,0x0b // Test for x=unorm + mov rExp_mask = 0x1ffff } - -{ .mfb - nop.m 999 -(p0) fmerge.s sinh_FR_X = f0,f8 -(p7) br.ret.spnt b0 ;; -} - -// Identify denormal operands. { .mfi - nop.m 999 - fclass.m.unc p10,p0 = f8, 0x09 // + denorm - nop.i 999 -};; -{ .mfi - nop.m 999 - fclass.m.unc p11,p0 = f8, 0x0a // - denorm - nop.i 999 + mov rExp_bias = 0xffff + fnorm.s1 fNormX = f8 + mov rExp_2tom56 = 0xffff-56 } +;; + +// Form two constants we need +// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128 +// 1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand { .mfi - nop.m 999 -(p0) fmerge.s sinh_FR_SGNX = f8,f1 - nop.i 999 ;; + setf.sig fINV_LN2_2TO63 = rSig_inv_ln2 // form 1/ln2 * 2^63 + fclass.m p8,p0 = f8,0x07 // Test for x=0 + nop.i 999 } +{ .mlx + setf.d fRSHF_2TO56 = rRshf_2to56 // Form const 1.100 * 2^(63+56) + movl rRshf = 0x43e8000000000000 // 1.10000 2^63 for right shift +} +;; { .mfi - nop.m 999 -(p0) fcmp.lt.unc.s1 p0,p7 = sinh_FR_X,f9 - nop.i 999 ;; + ldfpd fMIN_DBL_OFLOW_ARG, fMAX_DBL_NORM_ARG = [rAD_TB1],16 + fclass.m p10,p0 = f8,0x1e3 // Test for x=inf, nan, NaT + nop.i 0 } - -{ .mib - nop.m 999 - nop.i 999 -(p7) br.cond.sptk L(SINH_BY_TBL) ;; -} - - 
-L(SINH_BY_POLY): - -// POLY cannot overflow so there is no need to call __libm_error_support -// Set tiny_SAFE (p7) to 1(0) if answer is not tiny -// Currently we do not use tiny_SAFE. So the setting of tiny_SAFE is -// commented out. -//(p0) movl r32 = 0x000000000000fc01 -//(p0) setf.exp f10 = r32 -//(p0) fcmp.lt.unc.s1 p6,p7 = f8,f10 -// Here is essentially the algorithm for SINH_BY_POLY. Care is take for the order -// of multiplication; and P_1 is not exactly 1/3!, P_2 is not exactly 1/5!, etc. -// Note that ax = |x| -// sinh(x) = sign * (series(e^x) - series(e^-x))/2 -// = sign * (ax + ax^3/3! + ax^5/5! + ax^7/7! + ax^9/9! + ax^11/11! + ax^13/13!) -// = sign * (ax + ax * ( ax^2 * (1/3! + ax^4 * (1/7! + ax^4*1/11!)) ) -// + ax * ( ax^4 * (1/5! + ax^4 * (1/9! + ax^4*1/13!)) ) ) -// = sign * (ax + ax*p_odd + (ax*p_even)) -// = sign * (ax + Y_lo) -// sinh(x) = sign * (Y_hi + Y_lo) -// Get the values of P_x from the table { .mfb -(p0) addl r34 = @ltoff(double_sinh_p_table), gp -(p10) fma.d.s0 f8 = f8,f8,f8 -(p10) br.ret.spnt b0 + setf.exp f2TOM56 = rExp_2tom56 // form 2^-56 for scaling Nfloat + nop.f 0 +(p6) br.cond.spnt SINH_UNORM // Branch if x=unorm } ;; +SINH_COMMON: +{ .mfi + ldfe fLn2_by_128_hi = [rAD_TB1],16 + nop.f 0 + nop.i 0 +} { .mfb - ld8 r34 = [r34] -(p11) fnma.d.s0 f8 = f8,f8,f8 -(p11) br.ret.spnt b0 + setf.d fRSHF = rRshf // Form right shift const 1.100 * 2^63 + nop.f 0 +(p8) br.ret.spnt b0 // Exit for x=0, result=x } ;; -// Calculate sinh_FR_X2 = ax*ax and sinh_FR_X4 = ax*ax*ax*ax -{ .mmf - nop.m 999 -(p0) ldfe sinh_FR_P1 = [r34],16 -(p0) fma.s1 sinh_FR_X2 = sinh_FR_X, sinh_FR_X, f0 ;; -} - -{ .mmi -(p0) ldfe sinh_FR_P2 = [r34],16 ;; -(p0) ldfe sinh_FR_P3 = [r34],16 - nop.i 999 ;; +{ .mfi + ldfe fLn2_by_128_lo = [rAD_TB1],16 + nop.f 0 + nop.i 0 } - -{ .mmi -(p0) ldfe sinh_FR_P4 = [r34],16 ;; -(p0) ldfe sinh_FR_P5 = [r34],16 - nop.i 999 ;; +{ .mfb + and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x +(p10) fma.d.s0 f8 = f8,f1,f0 // Result if 
x=inf, nan, NaT +(p10) br.ret.spnt b0 // quick exit for x=inf, nan, NaT } +;; +// After that last load rAD_TB1 points to the beginning of table 1 { .mfi -(p0) ldfe sinh_FR_P6 = [r34],16 -(p0) fma.s1 sinh_FR_X4 = sinh_FR_X2, sinh_FR_X2, f0 - nop.i 999 ;; + nop.m 0 + fcmp.eq.s0 p6,p0 = f8, f0 // Dummy to set D + sub rExp_x = rExp_x, rExp_bias // True exponent of x } +;; -// Calculate sinh_FR_podd = p_odd and sinh_FR_peven = p_even { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_poly_podd_temp1 = sinh_FR_X4, sinh_FR_P5, sinh_FR_P3 - nop.i 999 ;; + nop.m 0 + fmerge.s fAbsX = f0, fNormX // Form |x| + nop.i 0 } - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_poly_podd_temp2 = sinh_FR_X4, sinh_FR_poly_podd_temp1, sinh_FR_P1 - nop.i 999 +{ .mfb + cmp.gt p7, p0 = -2, rExp_x // Test |x| < 2^(-2) + fma.s1 fXsq = fNormX, fNormX, f0 // x*x for small path +(p7) br.cond.spnt SINH_SMALL // Branch if 0 < |x| < 2^-2 } +;; -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_poly_peven_temp1 = sinh_FR_X4, sinh_FR_P6, sinh_FR_P4 - nop.i 999 ;; -} +// W = X * Inv_log2_by_128 +// By adding 1.10...0*2^63 we shift and get round_int(W) in significand. +// We actually add 1.10...0*2^56 to X * Inv_log2 to do the same thing. { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_podd = sinh_FR_X2, sinh_FR_poly_podd_temp2, f0 - nop.i 999 + add rAD_P = 0x180, rAD_TB1 + fma.s1 fW_2TO56_RSH = fNormX, fINV_LN2_2TO63, fRSHF_2TO56 + add rAD_TB2 = 0x100, rAD_TB1 } +;; -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_poly_peven_temp2 = sinh_FR_X4, sinh_FR_poly_peven_temp1, sinh_FR_P2 - nop.i 999 ;; -} +// Divide arguments into the following categories: +// Certain Safe - 0.25 <= |x| <= MAX_DBL_NORM_ARG +// Possible Overflow p14 - MAX_DBL_NORM_ARG < |x| < MIN_DBL_OFLOW_ARG +// Certain Overflow p15 - MIN_DBL_OFLOW_ARG <= |x| < +inf +// +// If the input is really a double arg, then there will never be +// "Possible Overflow" arguments. 
+// { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_peven = sinh_FR_X4, sinh_FR_poly_peven_temp2, f0 - nop.i 999 ;; + ldfpd fP5, fP4 = [rAD_P] ,16 + fcmp.ge.s1 p15,p14 = fAbsX,fMIN_DBL_OFLOW_ARG + nop.i 0 } +;; -// Calculate sinh_FR_Y_lo = ax*p_odd + (ax*p_even) -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_X, sinh_FR_peven, f0 - nop.i 999 ;; -} +// Nfloat = round_int(W) +// The significand of fW_2TO56_RSH contains the rounded integer part of W, +// as a twos complement number in the lower bits (that is, it may be negative). +// That twos complement number (called N) is put into rN. -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_X, sinh_FR_podd, sinh_FR_Y_lo_temp - nop.i 999 ;; -} +// Since fW_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56 +// before the shift constant 1.10000 * 2^63 is subtracted to yield fNfloat. +// Thus, fNfloat contains the floating point version of N -// Calculate sinh_FR_SINH = Y_hi + Y_lo. Note that ax = Y_hi { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_SINH = sinh_FR_X, f1, sinh_FR_Y_lo - nop.i 999 ;; + ldfpd fP3, fP2 = [rAD_P] +(p14) fcmp.gt.unc.s1 p14,p0 = fAbsX,fMAX_DBL_NORM_ARG + nop.i 0 } -// Dummy multiply to generate inexact -{ .mfi - nop.m 999 -(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones - nop.i 999 -} - -// Calculate f8 = sign * (Y_hi + Y_lo) -// Go to return { .mfb - nop.m 999 -(p0) fma.d.s0 f8 = sinh_FR_SGNX,sinh_FR_SINH,f0 -(p0) br.ret.sptk b0 ;; + nop.m 0 + fms.s1 fNfloat = fW_2TO56_RSH, f2TOM56, fRSHF +(p15) br.cond.spnt SINH_CERTAIN_OVERFLOW } +;; - -L(SINH_BY_TBL): - -// Now that we are at TBL; so far all we know is that |x| >= 0.25. -// The first two steps are the same for TBL and EXP, but if we are HUGE -// we want to leave now. 
-// Double-extended: -// Go to HUGE if |x| >= 2^14, 1000d (register-biased) is e = 14 (true) -// Double -// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true) -// Single -// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true) - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000010009 ;; +{ .mfi + getf.sig rN = fW_2TO56_RSH + nop.f 0 + mov rExp_bias_minus_1 = 0xfffe } +;; +// rIndex_1 has index_1 +// rIndex_2_16 has index_2 * 16 +// rBiased_M has M + +// rM has true M +// r = x - Nfloat * ln2_by_128_hi +// f = 1 - Nfloat * ln2_by_128_lo { .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + and rIndex_1 = 0x0f, rN + fnma.s1 fR = fNfloat, fLn2_by_128_hi, fNormX + shr rM = rN, 0x7 } - { .mfi - nop.m 999 -(p0) fcmp.ge.unc.s1 p6,p7 = sinh_FR_X,f9 - nop.i 999 ;; + and rIndex_2_16 = 0x70, rN + fnma.s1 fF = fNfloat, fLn2_by_128_lo, f1 + sub rN_neg = r0, rN } - -{ .mib - nop.m 999 - nop.i 999 -(p6) br.cond.spnt L(SINH_HUGE) ;; -} - -// r32 = 1 -// r34 = N-1 -// r35 = N -// r36 = j -// r37 = N+1 - -// TBL can never overflow -// sinh(x) = sinh(B+R) -// = sinh(B)cosh(R) + cosh(B)sinh(R) -// -// ax = |x| = M*log2/64 + R -// B = M*log2/64 -// M = 64*N + j -// We will calcualte M and get N as (M-j)/64 -// The division is a shift. -// exp(B) = exp(N*log2 + j*log2/64) -// = 2^N * 2^(j*log2/64) -// sinh(B) = 1/2(e^B -e^-B) -// = 1/2(2^N * 2^(j*log2/64) - 2^-N * 2^(-j*log2/64)) -// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64)) -// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64)) -// 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32 -// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit) -// R = ax - M*log2/64 -// R = ax - M*log2_by_64_hi - M*log2_by_64_lo -// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...) 
-// = 1 + p_odd + p_even -// where the p_even uses the A coefficients and the p_even uses the B coefficients -// So sinh(R) = 1 + p_odd + p_even -(1 -p_odd -p_even)/2 = p_odd -// cosh(R) = 1 + p_even -// sinh(B) = S_hi + S_lo -// cosh(B) = C_hi -// sinh(x) = sinh(B)cosh(R) + cosh(B)sinh(R) -// ****************************************************** -// STEP 1 (TBL and EXP) -// ****************************************************** -// Get the following constants. -// f9 = Inv_log2by64 -// f10 = log2by64_hi -// f11 = log2by64_lo +;; { .mmi -(p0) adds r32 = 0x1,r0 -(p0) addl r34 = @ltoff(double_sinh_arg_reduction), gp - nop.i 999 + and rIndex_1_neg = 0x0f, rN_neg + add rBiased_M = rExp_bias_minus_1, rM + shr rM_neg = rN_neg, 0x7 } -;; - { .mmi - ld8 r34 = [r34] - nop.m 999 - nop.i 999 + and rIndex_2_16_neg = 0x70, rN_neg + add rAD_T2 = rAD_TB2, rIndex_2_16 + shladd rAD_T1 = rIndex_1, 4, rAD_TB1 } ;; - -// We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and -// put them in an exponent. -// sinh_FR_spos = 2^(N-1) and sinh_FR_sneg = 2^(-N-1) -// r39 = 0xffff + (N-1) = 0xffff +N -1 -// r40 = 0xffff - (N +1) = 0xffff -N -1 - -{ .mlx - nop.m 999 -(p0) movl r38 = 0x000000000000fffe ;; -} +// rAD_T1 has address of T1 +// rAD_T2 has address if T2 { .mmi -(p0) ldfe sinh_FR_Inv_log2by64 = [r34],16 ;; -(p0) ldfe sinh_FR_log2by64_hi = [r34],16 - nop.i 999 ;; -} - -{ .mbb -(p0) ldfe sinh_FR_log2by64_lo = [r34],16 - nop.b 999 - nop.b 999 ;; + setf.exp f2M = rBiased_M + ldfe fT2 = [rAD_T2] + nop.i 0 } - -// Get the A coefficients -// f9 = A_1 -// f10 = A_2 -// f11 = A_3 - { .mmi - nop.m 999 -(p0) addl r34 = @ltoff(double_sinh_ab_table), gp - nop.i 999 + add rBiased_M_neg = rExp_bias_minus_1, rM_neg + add rAD_T2_neg = rAD_TB2, rIndex_2_16_neg + shladd rAD_T1_neg = rIndex_1_neg, 4, rAD_TB1 } ;; +// Create Scale = 2^M +// Load T1 and T2 { .mmi - ld8 r34 = [r34] - nop.m 999 - nop.i 999 + ldfe fT1 = [rAD_T1] + nop.m 0 + nop.i 0 } -;; - - -// Calculate M and keep it as integer and 
floating point. -// f38 = M = round-to-integer(x*Inv_log2by64) -// sinh_FR_M = M = truncate(ax/(log2/64)) -// Put the significand of M in r35 -// and the floating point representation of M in sinh_FR_M - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_M = sinh_FR_X, sinh_FR_Inv_log2by64, f0 - nop.i 999 -} - -{ .mfi -(p0) ldfe sinh_FR_A1 = [r34],16 - nop.f 999 - nop.i 999 ;; +{ .mmf + setf.exp f2M_neg = rBiased_M_neg + ldfe fT2_neg = [rAD_T2_neg] + fma.s1 fF_neg = fNfloat, fLn2_by_128_lo, f1 } +;; { .mfi - nop.m 999 -(p0) fcvt.fx.s1 sinh_FR_M_temp = sinh_FR_M - nop.i 999 ;; + nop.m 0 + fma.s1 fRsq = fR, fR, f0 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fnorm.s1 sinh_FR_M = sinh_FR_M_temp - nop.i 999 ;; + ldfe fT1_neg = [rAD_T1_neg] + fma.s1 fP54 = fR, fP5, fP4 + nop.i 0 } +;; { .mfi -(p0) getf.sig r35 = sinh_FR_M_temp - nop.f 999 - nop.i 999 ;; + nop.m 0 + fma.s1 fP32 = fR, fP3, fP2 + nop.i 0 } - -// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It -// has a range of -32 thru 31. 
-// r35 = M -// r36 = j - -{ .mii - nop.m 999 - nop.i 999 ;; -(p0) and r36 = 0x3f, r35 ;; -} - -// Calculate R -// f13 = f44 - f12*f10 = ax - M*log2by64_hi -// f14 = f13 - f8*f11 = R = (ax - M*log2by64_hi) - M*log2by64_lo - { .mfi - nop.m 999 -(p0) fnma.s1 sinh_FR_R_temp = sinh_FR_M, sinh_FR_log2by64_hi, sinh_FR_X - nop.i 999 + nop.m 0 + fnma.s1 fP54_neg = fR, fP5, fP4 + nop.i 0 } +;; { .mfi -(p0) ldfe sinh_FR_A2 = [r34],16 - nop.f 999 - nop.i 999 ;; + nop.m 0 + fnma.s1 fP32_neg = fR, fP3, fP2 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fnma.s1 sinh_FR_R = sinh_FR_M, sinh_FR_log2by64_lo, sinh_FR_R_temp - nop.i 999 -} - -// Get the B coefficients -// f15 = B_1 -// f32 = B_2 -// f33 = B_3 - -{ .mmi -(p0) ldfe sinh_FR_A3 = [r34],16 ;; -(p0) ldfe sinh_FR_B1 = [r34],16 - nop.i 999 ;; -} - -{ .mmi -(p0) ldfe sinh_FR_B2 = [r34],16 ;; -(p0) ldfe sinh_FR_B3 = [r34],16 - nop.i 999 ;; -} - -{ .mii - nop.m 999 -(p0) shl r34 = r36, 0x2 ;; -(p0) sxt1 r37 = r34 ;; + nop.m 0 + fma.s1 fP5432 = fRsq, fP54, fP32 + nop.i 0 } - -// ****************************************************** -// STEP 2 (TBL and EXP) -// ****************************************************** -// Calculate Rsquared and Rcubed in preparation for p_even and p_odd -// f12 = R*R*R -// f13 = R*R -// f14 = R <== from above - { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Rsq = sinh_FR_R, sinh_FR_R, f0 -(p0) shr r36 = r37, 0x2 ;; -} - -// r34 = M-j = r35 - r36 -// r35 = N = (M-j)/64 - -{ .mii -(p0) sub r34 = r35, r36 - nop.i 999 ;; -(p0) shr r35 = r34, 0x6 ;; -} - -{ .mii -(p0) sub r40 = r38, r35 -(p0) adds r37 = 0x1, r35 -(p0) add r39 = r38, r35 ;; -} - -// Get the address of the J table, add the offset, -// addresses are sinh_AD_mJ and sinh_AD_J, get the T value -// f32 = T(j)_hi -// f33 = T(j)_lo -// f34 = T(-j)_hi -// f35 = T(-j)_lo - -{ .mmi -(p0) sub r34 = r35, r32 -(p0) addl r37 = @ltoff(double_sinh_j_table), gp - nop.i 999 + nop.m 0 + fma.s1 fS2 = fF,fT2,f0 + nop.i 0 } ;; -{ .mmi - ld8 r37 = [r37] - nop.m 999 - nop.i 
999 -} -;; - - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Rcub = sinh_FR_Rsq, sinh_FR_R, f0 - nop.i 999 -} - -// ****************************************************** -// STEP 3 Now decide if we need to branch to EXP -// ****************************************************** -// Put 32 in f9; p6 true if x < 32 -// Go to EXP if |x| >= 32 - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000010004 ;; -} - -// Calculate p_even -// f34 = B_2 + Rsq *B_3 -// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3) -// f36 = p_even = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3)) - { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_peven_temp1 = sinh_FR_Rsq, sinh_FR_B3, sinh_FR_B2 - nop.i 999 ;; + nop.m 0 + fma.s1 fS1 = f2M,fT1,f0 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_peven_temp2 = sinh_FR_Rsq, sinh_FR_peven_temp1, sinh_FR_B1 - nop.i 999 + nop.m 0 + fma.s1 fP5432_neg = fRsq, fP54_neg, fP32_neg + nop.i 0 } - -// Calculate p_odd -// f34 = A_2 + Rsq *A_3 -// f35 = A_1 + Rsq * (A_2 + Rsq *A_3) -// f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3)) +;; { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_podd_temp1 = sinh_FR_Rsq, sinh_FR_A3, sinh_FR_A2 - nop.i 999 ;; + nop.m 0 + fma.s1 fS1_neg = f2M_neg,fT1_neg,f0 + nop.i 0 } - { .mfi -(p0) setf.exp sinh_FR_N_temp1 = r39 - nop.f 999 - nop.i 999 ;; + nop.m 0 + fma.s1 fS2_neg = fF_neg,fT2_neg,f0 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_peven = sinh_FR_Rsq, sinh_FR_peven_temp2, f0 - nop.i 999 + nop.m 0 + fma.s1 fP = fRsq, fP5432, fR + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_podd_temp2 = sinh_FR_Rsq, sinh_FR_podd_temp1, sinh_FR_A1 - nop.i 999 ;; + nop.m 0 + fma.s1 fS = fS1,fS2,f0 + nop.i 0 } +;; { .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + nop.m 0 + fms.s1 fP_neg = fRsq, fP5432_neg, fR + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_podd = sinh_FR_podd_temp2, sinh_FR_Rcub, sinh_FR_R - nop.i 999 + nop.m 0 + fma.s1 fS_neg = fS1_neg,fS2_neg,f0 + nop.i 0 } +;; -// sinh_GR_mj contains the table 
offset for -j -// sinh_GR_j contains the table offset for +j -// p6 is true when j <= 0 - -{ .mlx -(p0) setf.exp sinh_FR_N_temp2 = r40 -(p0) movl r40 = 0x0000000000000020 ;; +{ .mfb + nop.m 0 + fmpy.s0 fTmp = fLn2_by_128_lo, fLn2_by_128_lo // Force inexact +(p14) br.cond.spnt SINH_POSSIBLE_OVERFLOW } +;; { .mfi -(p0) sub sinh_GR_mJ = r40, r36 -(p0) fmerge.se sinh_FR_spos = sinh_FR_N_temp1, f1 -(p0) adds sinh_GR_J = 0x20, r36 ;; + nop.m 0 + fma.s1 fExp = fS, fP, fS + nop.i 0 } - -{ .mii - nop.m 999 -(p0) shl sinh_GR_mJ = sinh_GR_mJ, 5 ;; -(p0) add sinh_AD_mJ = r37, sinh_GR_mJ ;; -} - -{ .mmi - nop.m 999 -(p0) ldfe sinh_FR_Tmjhi = [sinh_AD_mJ],16 -(p0) shl sinh_GR_J = sinh_GR_J, 5 ;; -} - { .mfi -(p0) ldfs sinh_FR_Tmjlo = [sinh_AD_mJ],16 -(p0) fcmp.lt.unc.s1 p0,p7 = sinh_FR_X,f9 -(p0) add sinh_AD_J = r37, sinh_GR_J ;; -} - -{ .mmi -(p0) ldfe sinh_FR_Tjhi = [sinh_AD_J],16 ;; -(p0) ldfs sinh_FR_Tjlo = [sinh_AD_J],16 - nop.i 999 ;; + nop.m 0 + fma.s1 fExp_neg = fS_neg, fP_neg, fS_neg + nop.i 0 } +;; { .mfb - nop.m 999 -(p0) fmerge.se sinh_FR_sneg = sinh_FR_N_temp2, f1 -(p7) br.cond.spnt L(SINH_BY_EXP) ;; + nop.m 0 + fms.d.s0 f8 = fExp, f1, fExp_neg + br.ret.sptk b0 // Normal path exit } +;; +// Here if 0 < |x| < 0.25 +SINH_SMALL: { .mfi - nop.m 999 - nop.f 999 - nop.i 999 ;; + add rAD_T1 = 0x1a0, rAD_TB1 + fcmp.lt.s1 p7, p8 = fNormX, f0 // Test sign of x + cmp.gt p6, p0 = -60, rExp_x // Test |x| < 2^(-60) } - -// ****************************************************** -// If NOT branch to EXP -// ****************************************************** -// Calculate S_hi and S_lo -// sinh_FR_S_hi_temp = sinh_FR_sneg * sinh_FR_Tmjhi -// sinh_FR_S_hi = sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi_temp -// sinh_FR_S_hi = sinh_FR_spos * sinh_FR_Tjhi - (sinh_FR_sneg * sinh_FR_Tmjlo) - { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_S_hi_temp = sinh_FR_sneg, sinh_FR_Tmjhi, f0 - nop.i 999 ;; + add rAD_T2 = 0x1d0, rAD_TB1 + nop.f 0 + nop.i 0 } +;; -{ .mfi - nop.m 999 -(p0) fms.s1 
sinh_FR_S_hi = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_S_hi_temp - nop.i 999 +{ .mmb + ldfe fA6 = [rAD_T1],16 + ldfe fA5 = [rAD_T2],16 +(p6) br.cond.spnt SINH_VERY_SMALL // Branch if |x| < 2^(-60) } +;; -// Calculate C_hi -// sinh_FR_C_hi_temp1 = sinh_FR_sneg * sinh_FR_Tmjhi -// sinh_FR_C_hi = sinh_FR_spos * sinh_FR_Tjhi + sinh_FR_C_hi_temp1 - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_C_hi_temp1 = sinh_FR_sneg, sinh_FR_Tmjhi, f0 - nop.i 999 ;; +{ .mmi + ldfe fA4 = [rAD_T1],16 + ldfe fA3 = [rAD_T2],16 + nop.i 0 } +;; -// sinh_FR_S_lo_temp1 = sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi -// sinh_FR_S_lo_temp2 = -sinh_FR_sneg * sinh_FR_Tmjlo + (sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi) -// sinh_FR_S_lo_temp2 = -sinh_FR_sneg * sinh_FR_Tmjlo + (sinh_FR_S_lo_temp1 ) - -{ .mfi - nop.m 999 -(p0) fms.s1 sinh_FR_S_lo_temp1 = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_S_hi - nop.i 999 +{ .mmi + ldfe fA2 = [rAD_T1] + ldfe fA1 = [rAD_T2] + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_C_hi = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_C_hi_temp1 - nop.i 999 ;; + nop.m 0 + fma.s1 fX3 = fNormX, fXsq, f0 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fnma.s1 sinh_FR_S_lo_temp2 = sinh_FR_sneg, sinh_FR_Tmjhi, sinh_FR_S_lo_temp1 - nop.i 999 + nop.m 0 + fma.s1 fX4 = fXsq, fXsq, f0 + nop.i 0 } - -// sinh_FR_S_lo_temp1 = sinh_FR_sneg * sinh_FR_Tmjlo -// sinh_FR_S_lo_temp3 = sinh_FR_spos * sinh_FR_Tjlo - sinh_FR_S_lo_temp1 -// sinh_FR_S_lo_temp3 = sinh_FR_spos * sinh_FR_Tjlo -(sinh_FR_sneg * sinh_FR_Tmjlo) -// sinh_FR_S_lo = sinh_FR_S_lo_temp3 + sinh_FR_S_lo_temp2 +;; { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_S_lo_temp1 = sinh_FR_sneg, sinh_FR_Tmjlo, f0 - nop.i 999 ;; + nop.m 0 + fma.s1 fA65 = fXsq, fA6, fA5 + nop.i 0 } - -/////////// BUG FIX fma to fms -TK { .mfi - nop.m 999 -(p0) fms.s1 sinh_FR_S_lo_temp3 = sinh_FR_spos, sinh_FR_Tjlo, sinh_FR_S_lo_temp1 - nop.i 999 ;; + nop.m 0 + fma.s1 fA43 = fXsq, fA4, fA3 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_S_lo = sinh_FR_S_lo_temp3, f1, 
sinh_FR_S_lo_temp2 - nop.i 999 ;; + nop.m 0 + fma.s1 fA21 = fXsq, fA2, fA1 + nop.i 0 } - -// Y_hi = S_hi -// Y_lo = C_hi*p_odd + (S_hi*p_even + S_lo) -// sinh_FR_Y_lo_temp = sinh_FR_S_hi * sinh_FR_peven + sinh_FR_S_lo -// sinh_FR_Y_lo = sinh_FR_C_hi * sinh_FR_podd + sinh_FR_Y_lo_temp +;; { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_S_hi, sinh_FR_peven, sinh_FR_S_lo - nop.i 999 ;; + nop.m 0 + fma.s1 fA6543 = fX4, fA65, fA43 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_C_hi, sinh_FR_podd, sinh_FR_Y_lo_temp - nop.i 999 ;; + nop.m 0 + fma.s1 fA654321 = fX4, fA6543, fA21 + nop.i 0 } - -// sinh_FR_SINH = Y_hi + Y_lo -// f8 = answer = sinh_FR_SGNX * sinh_FR_SINH +;; // Dummy multiply to generate inexact { .mfi - nop.m 999 -(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones - nop.i 999 + nop.m 0 + fmpy.s0 fTmp = fA6, fA6 + nop.i 0 } -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_SINH = sinh_FR_S_hi, f1, sinh_FR_Y_lo - nop.i 999 ;; +{ .mfb + nop.m 0 + fma.d.s0 f8 = fA654321, fX3, fNormX + br.ret.sptk b0 // Exit if 2^-60 < |x| < 0.25 } +;; +SINH_VERY_SMALL: +// Here if 0 < |x| < 2^-60 +// Compute result by x + sgn(x)*x^2 to get properly rounded result +.pred.rel "mutex",p7,p8 +{ .mfi + nop.m 0 +(p7) fnma.d.s0 f8 = fNormX, fNormX, fNormX // If x<0 result ~ x-x^2 + nop.i 0 +} { .mfb - nop.m 999 -(p0) fma.d.s0 f8 = sinh_FR_SGNX, sinh_FR_SINH,f0 -(p0) br.ret.sptk b0 ;; + nop.m 0 +(p8) fma.d.s0 f8 = fNormX, fNormX, fNormX // If x>0 result ~ x+x^2 + br.ret.sptk b0 // Exit if |x| < 2^-60 } +;; -L(SINH_BY_EXP): +SINH_POSSIBLE_OVERFLOW: -// When p7 is true, we know that an overflow is not going to happen -// When p7 is false, we must check for possible overflow -// p7 is the over_SAFE flag -// Y_hi = Tjhi -// Y_lo = Tjhi * (p_odd + p_even) +Tjlo -// Scale = sign * 2^(N-1) -// sinh_FR_Y_lo = sinh_FR_Tjhi * (sinh_FR_peven + sinh_FR_podd) -// sinh_FR_Y_lo = sinh_FR_Tjhi * (sinh_FR_Y_lo_temp ) +// Here if fMAX_DBL_NORM_ARG < |x| < 
fMIN_DBL_OFLOW_ARG +// This cannot happen if input is a double, only if the input has higher precision. +// Overflow is a possibility, not a certainty. -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_peven, f1, sinh_FR_podd - nop.i 999 -} - -// Now we are in EXP. This is the only path where an overflow is possible -// but not for certain. So this is the only path where over_SAFE has any use. -// r34 still has N-1 -// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe -// There is a danger of double overflow if N-1 > 0x3fe = 1022 -{ .mlx - nop.m 999 -(p0) movl r32 = 0x00000000000003fe ;; -} +// Recompute result using status field 2 with user's rounding mode, +// and wre set. If result is larger than largest double, then we have +// overflow { .mfi -(p0) cmp.gt.unc p0,p7 = r34, r32 -(p0) fmerge.s sinh_FR_SCALE = sinh_FR_SGNX, sinh_FR_spos - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_Tjhi, sinh_FR_Y_lo_temp, sinh_FR_Tjlo - nop.i 999 ;; + mov rGt_ln = 0x103ff // Exponent for largest dbl + 1 ulp + fsetc.s2 0x7F,0x42 // Get user's round mode, set wre + nop.i 0 } +;; -// f8 = answer = scale * (Y_hi + Y_lo) { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_SINH_temp = sinh_FR_Y_lo, f1, sinh_FR_Tjhi - nop.i 999 ;; + setf.exp fGt_pln = rGt_ln // Create largest double + 1 ulp + fma.d.s2 fWre_urm_f8 = fS, fP, fS // Result with wre set + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.d.s0 f44 = sinh_FR_SCALE, sinh_FR_SINH_temp, f0 - nop.i 999 ;; + nop.m 0 + fsetc.s2 0x7F,0x40 // Turn off wre in sf2 + nop.i 0 } +;; -// Dummy multiply to generate inexact { .mfi - nop.m 999 -(p7) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones - nop.i 999 ;; + nop.m 0 + fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow + nop.i 0 } +;; -// If over_SAFE is set, return { .mfb - nop.m 999 -(p7) fmerge.s f8 = f44,f44 -(p7) br.ret.sptk b0 ;; + nop.m 0 + nop.f 0 +(p6) br.cond.spnt SINH_CERTAIN_OVERFLOW // Branch if overflow } +;; -// Else see 
if we overflowed -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// If WRE is set then an overflow will not occur in EXP. -// The input value that would cause a register (WRE) value to overflow is about 2^15 -// and this input would go into the HUGE path. -// Answer with WRE is in f43. - -{ .mfi - nop.m 999 -(p0) fsetc.s2 0x7F,0x42 - nop.i 999;; +{ .mfb + nop.m 0 + fma.d.s0 f8 = fS, fP, fS + br.ret.sptk b0 // Exit if really no overflow } +;; +SINH_CERTAIN_OVERFLOW: { .mfi - nop.m 999 -(p0) fma.d.s2 f43 = sinh_FR_SCALE, sinh_FR_SINH_temp, f0 - nop.i 999 ;; -} - -// 103FF => 103FF -FFFF = 400(true) -// 400 + 3FF = 7FF, which is 1 more that the exponent of the largest -// double (7FE). So 0 103FF 8000000000000000 is one ulp more than -// largest double in register bias -// Now set p8 if the answer with WRE is greater than or equal this value -// Also set p9 if the answer with WRE is less than or equal to negative this value - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x000000000103FF ;; + sub rTmp = rExp_mask, r0, 1 + fcmp.lt.s1 p6, p7 = fNormX, f0 // Test for x < 0 + nop.i 0 } +;; { .mmf - nop.m 999 -(p0) setf.exp f41 = r32 -(p0) fsetc.s2 0x7F,0x40 ;; -} - -{ .mfi - nop.m 999 -(p0) fcmp.ge.unc.s1 p8, p0 = f43, f41 - nop.i 999 + alloc r32=ar.pfs,1,4,4,0 + setf.exp fTmp = rTmp + fmerge.s FR_X = f8,f8 } +;; { .mfi - nop.m 999 -(p0) fmerge.ns f42 = f41, f41 - nop.i 999 ;; -} - -// The error tag for overflow is 127 -{ .mii - nop.m 999 - nop.i 999 ;; -(p8) mov r47 = 127 ;; + mov GR_Parameter_TAG = 127 +(p6) fnma.d.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and -INF result + nop.i 0 } - { .mfb - nop.m 999 -(p0) fcmp.le.unc.s1 p9, p0 = f43, f42 -(p8) br.cond.spnt L(SINH_ERROR_SUPPORT) ;; -} - -{ .mii - nop.m 999 - nop.i 999 ;; -(p9) mov r47 = 127 -} - -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.spnt L(SINH_ERROR_SUPPORT) ;; -} - -// Dummy multiply to generate inexact -{ .mfi - nop.m 999 -(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones 
- nop.i 999 ;; + nop.m 0 +(p7) fma.d.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result + br.cond.sptk __libm_error_region } +;; +// Here if x unorm +SINH_UNORM: { .mfb - nop.m 999 -(p0) fmerge.s f8 = f44,f44 -(p0) br.ret.sptk b0 ;; -} - -L(SINH_HUGE): - -// for SINH_HUGE, put 24000 in exponent; take sign from input; add 1 -// SAFE: SAFE is always 0 for HUGE - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000015dbf ;; -} - -{ .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + getf.exp rSignexp_x = fNormX // Must recompute if x unorm + fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag + br.cond.sptk SINH_COMMON } +;; -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_signed_hi_lo = sinh_FR_SGNX, f9, f1 - nop.i 999 ;; -} +GLOBAL_IEEE754_END(sinh) -{ .mfi - nop.m 999 -(p0) fma.d.s0 f44 = sinh_FR_signed_hi_lo, f9, f0 -(p0) mov r47 = 127 -} -.endp sinh -ASM_SIZE_DIRECTIVE(sinh) -#ifdef _LIBC -ASM_SIZE_DIRECTIVE(__ieee754_sinh) -#endif - -// Stack operations when calling error support. -// (1) (2) (3) (call) (4) -// sp -> + psp -> + psp -> + sp -> + -// | | | | -// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 -// | | | | -// | <-GR_Y Y2->| Y2 ->| <- GR_Y | -// | | | | -// | | <- GR_X X1 ->| | -// | | | | -// sp-64 -> + sp -> + sp -> + + -// save ar.pfs save b0 restore gp -// save gp restore ar.pfs - -.proc __libm_error_region -__libm_error_region: -L(SINH_ERROR_SUPPORT): +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue - -// (1) { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value nop.f 0 @@ -1271,39 +860,32 @@ L(SINH_ERROR_SUPPORT): } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + add sp=-64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP=gp // Save gp };; - - -// (2) { .mmi - stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 
// Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; - .body -// (3) { .mib - stfd [GR_Parameter_X] = f8 // STORE Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address - nop.b 0 + stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 } { .mib - stfd [GR_Parameter_Y] = f44 // STORE Parameter 3 on stack + stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; - -// (4) { .mmi ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp @@ -1316,8 +898,6 @@ L(SINH_ERROR_SUPPORT): br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) - +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_sinhf.S b/sysdeps/ia64/fpu/e_sinhf.S index d5aa2dca16..4a407b7f3c 100644 --- a/sysdeps/ia64/fpu/e_sinhf.S +++ b/sysdeps/ia64/fpu/e_sinhf.S @@ -1,10 +1,10 @@ .file "sinhf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2002, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,1305 +20,727 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
+ // History -//============================================================== -// 2/02/00 Initial version -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +//********************************************************************* +// 02/02/00 Initial version +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. // 10/12/00 Update to set denormal operand and underflow flags -// 1/22/01 Fixed to set inexact flag for small args. +// 01/22/01 Fixed to set inexact flag for small args. +// 05/02/01 Reworked to improve speed of all paths +// 05/20/02 Cleaned up namespace and sf0 syntax +// 11/20/02 Improved algorithm based on expf // // API -//============================================================== -// float = sinhf(float) -// input floating point f8 -// output floating point f8 -// -// Registers used -//============================================================== -// general registers: -// r32 -> r47 -// predicate registers used: -// p6 p7 p8 p9 -// floating-point registers used: -// f9 -> f15; f32 -> f45; -// f8 has input, then output +//********************************************************************* +// float sinhf(float) // // Overview of operation -//============================================================== -// There are four paths -// 1. |x| < 0.25 SINH_BY_POLY -// 2. |x| < 32 SINH_BY_TBL -// 3. |x| < 2^14 SINH_BY_EXP -// 4. |x_ >= 2^14 SINH_HUGE -// -// For double extended we get infinity for x >= 400c b174 ddc0 31ae c0ea -// >= 1.0110001.... 
x 2^13 -// >= 11357.2166 +//********************************************************************* +// Case 1: 0 < |x| < 2^-60 +// Result = x, computed by x+sgn(x)*x^2 to handle flags and rounding // -// But for double we get infinity for x >= 408633ce8fb9f87e -// >= 1.0110...x 2^9 -// >= +7.10476e+002 +// Case 2: 2^-60 < |x| < 0.25 +// Evaluate sinh(x) by a 9th order polynomial +// Care is taken for the order of multiplication; and A2 is not exactly 1/5!, +// A3 is not exactly 1/7!, etc. +// sinh(x) = x + (A1*x^3 + A2*x^5 + A3*x^7 + A4*x^9) // -// And for single we get infinity for x >= 42b3a496 -// >= 1.0110... 2^6 -// >= 89.8215 +// Case 3: 0.25 < |x| < 89.41598 +// Algorithm is based on the identity sinh(x) = ( exp(x) - exp(-x) ) / 2. +// The algorithm for exp is described below. There are a number of +// economies from evaluating both exp(x) and exp(-x). Although we +// are evaluating both quantities, only where the quantities diverge do we +// duplicate the computations. The basic algorithm for exp(x) is described +// below. // -// SAFE: If there is danger of overflow set SAFE to 0 -// NOT implemented: if there is danger of underflow, set SAFE to 0 -// SAFE for all paths listed below -// -// 1. SINH_BY_POLY -// =============== -// If |x| is less than the tiny threshold, then clear SAFE -// For double, the tiny threshold is -1022 = -0x3fe => -3fe + ffff = fc01 -// register-biased, this is fc01 -// For single, the tiny threshold is -126 = -7e => -7e + ffff = ff81 -// If |x| < tiny threshold, set SAFE = 0 -// -// 2. SINH_BY_TBL -// ============= -// SAFE: SAFE is always 1 for TBL; +// Take the input x. w is "how many log2/64 in x?" +// w = x * 64/log2 +// NJ = int(w) +// x = NJ*log2/64 + R + +// NJ = 64*n + j +// x = n*log2 + (log2/64)*j + R // -// 3. 
SINH_BY_EXP -// ============== -// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe -// r34 has N-1; 16382 is in register biased form, 0x13ffd -// There is danger of double overflow if N-1 > 0x3fe -// in register biased form, 0x103fd -// Analagously, there is danger of single overflow if N-1 > 0x7e -// in register biased form, 0x1007d -// SAFE: If there is danger of overflow set SAFE to 0 +// So, exp(x) = 2^n * 2^(j/64)* exp(R) // -// 4. SINH_HUGE -// ============ -// SAFE: SAFE is always 0 for HUGE +// T = 2^n * 2^(j/64) +// Construct 2^n +// Get 2^(j/64) table +// actually all the entries of 2^(j/64) table are stored in DP and +// with exponent bits set to 0 -> multiplication on 2^n can be +// performed by doing logical "or" operation with bits presenting 2^n + +// exp(R) = 1 + (exp(R) - 1) +// P = exp(R) - 1 approximated by Taylor series of 3rd degree +// P = A3*R^3 + A2*R^2 + R, A3 = 1/6, A2 = 1/2 // -#include "libm_support.h" - -// Assembly macros -//============================================================== -sinh_FR_X = f44 -sinh_FR_X2 = f9 -sinh_FR_X4 = f10 -sinh_FR_SGNX = f40 -sinh_FR_all_ones = f45 -sinh_FR_tmp = f42 - -sinh_FR_Inv_log2by64 = f9 -sinh_FR_log2by64_lo = f11 -sinh_FR_log2by64_hi = f10 - -sinh_FR_A1 = f9 -sinh_FR_A2 = f10 -sinh_FR_A3 = f11 - -sinh_FR_Rcub = f12 -sinh_FR_M_temp = f13 -sinh_FR_R_temp = f13 -sinh_FR_Rsq = f13 -sinh_FR_R = f14 - -sinh_FR_M = f38 - -sinh_FR_B1 = f15 -sinh_FR_B2 = f32 -sinh_FR_B3 = f33 +// The final result is reconstructed as follows +// exp(x) = T + T*P -sinh_FR_peven_temp1 = f34 -sinh_FR_peven_temp2 = f35 -sinh_FR_peven = f36 +// Special values +//********************************************************************* +// sinhf(+0) = +0 +// sinhf(-0) = -0 -sinh_FR_podd_temp1 = f34 -sinh_FR_podd_temp2 = f35 -sinh_FR_podd = f37 +// sinhf(+qnan) = +qnan +// sinhf(-qnan) = -qnan +// sinhf(+snan) = +qnan +// sinhf(-snan) = -qnan -sinh_FR_poly_podd_temp1 = f11 -sinh_FR_poly_podd_temp2 = f13 
-sinh_FR_poly_peven_temp1 = f11 -sinh_FR_poly_peven_temp2 = f13 +// sinhf(-inf) = -inf +// sinhf(+inf) = +inf -sinh_FR_J_temp = f9 -sinh_FR_J = f10 - -sinh_FR_Mmj = f39 - -sinh_FR_N_temp1 = f11 -sinh_FR_N_temp2 = f12 -sinh_FR_N = f13 - -sinh_FR_spos = f14 -sinh_FR_sneg = f15 - -sinh_FR_Tjhi = f32 -sinh_FR_Tjlo = f33 -sinh_FR_Tmjhi = f34 -sinh_FR_Tmjlo = f35 - -sinh_GR_mJ = r35 -sinh_GR_J = r36 - -sinh_AD_mJ = r38 -sinh_AD_J = r39 -sinh_GR_all_ones = r40 - -sinh_FR_S_hi = f9 -sinh_FR_S_hi_temp = f10 -sinh_FR_S_lo_temp1 = f11 -sinh_FR_S_lo_temp2 = f12 -sinh_FR_S_lo_temp3 = f13 - -sinh_FR_S_lo = f38 -sinh_FR_C_hi = f39 - -sinh_FR_C_hi_temp1 = f10 -sinh_FR_Y_hi = f11 -sinh_FR_Y_lo_temp = f12 -sinh_FR_Y_lo = f13 -sinh_FR_SINH = f9 - -sinh_FR_P1 = f14 -sinh_FR_P2 = f15 -sinh_FR_P3 = f32 -sinh_FR_P4 = f33 -sinh_FR_P5 = f34 -sinh_FR_P6 = f35 - -sinh_FR_TINY_THRESH = f9 - -sinh_FR_SINH_temp = f10 -sinh_FR_SCALE = f11 - -sinh_FR_signed_hi_lo = f10 - - -GR_SAVE_PFS = r41 -GR_SAVE_B0 = r42 -GR_SAVE_GP = r43 +// Overflow and Underflow +//********************************************************************* +// sinhf(x) = largest single normal when +// x = 89.41598 = 0x42b2d4fc +// +// Underflow is handled as described in case 1 above -GR_Parameter_X = r44 -GR_Parameter_Y = r45 -GR_Parameter_RESULT = r46 +// Registers used +//********************************************************************* +// Floating Point registers used: +// f8 input, output +// f6,f7, f9 -> f15, f32 -> f45 -// Data tables -//============================================================== +// General registers used: +// r2, r3, r16 -> r38 -#ifdef _LIBC -.rodata -#else -.data -#endif +// Predicate registers used: +// p6 -> p15 +// Assembly macros +//********************************************************************* +// integer registers used +// scratch +rNJ = r2 +rNJ_neg = r3 + +rJ_neg = r16 +rN_neg = r17 +rSignexp_x = r18 +rExp_x = r18 +rExp_mask = r19 +rExp_bias = r20 +rAd1 = r21 +rAd2 = r22 +rJ = 
r23 +rN = r24 +rTblAddr = r25 +rA3 = r26 +rExpHalf = r27 +rLn2Div64 = r28 +rGt_ln = r29 +r17ones_m1 = r29 +rRightShifter = r30 +rJ_mask = r30 +r64DivLn2 = r31 +rN_mask = r31 +// stacked +GR_SAVE_PFS = r32 +GR_SAVE_B0 = r33 +GR_SAVE_GP = r34 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Parameter_TAG = r38 + +// floating point registers used +FR_X = f10 +FR_Y = f1 +FR_RESULT = f8 +// scratch +fRightShifter = f6 +f64DivLn2 = f7 +fNormX = f9 +fNint = f10 +fN = f11 +fR = f12 +fLn2Div64 = f13 +fA2 = f14 +fA3 = f15 +// stacked +fP = f32 +fT = f33 +fMIN_SGL_OFLOW_ARG = f34 +fMAX_SGL_NORM_ARG = f35 +fRSqr = f36 +fA1 = f37 +fA21 = f37 +fA4 = f38 +fA43 = f38 +fA4321 = f38 +fX4 = f39 +fTmp = f39 +fGt_pln = f39 +fWre_urm_f8 = f40 +fXsq = f40 +fP_neg = f41 +fX3 = f41 +fT_neg = f42 +fExp = f43 +fExp_neg = f44 +fAbsX = f45 + + +RODATA .align 16 -double_sinh_arg_reduction: -ASM_TYPE_DIRECTIVE(double_sinh_arg_reduction,@object) - data8 0xB8AA3B295C17F0BC, 0x00004005 - data8 0xB17217F7D1000000, 0x00003FF8 - data8 0xCF79ABC9E3B39804, 0x00003FD0 -ASM_SIZE_DIRECTIVE(double_sinh_arg_reduction) - -double_sinh_p_table: -ASM_TYPE_DIRECTIVE(double_sinh_p_table,@object) - data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC - data8 0x8888888888888412, 0x00003FF8 - data8 0xD00D00D00D4D39F2, 0x00003FF2 - data8 0xB8EF1D28926D8891, 0x00003FEC - data8 0xD732377688025BE9, 0x00003FE5 - data8 0xB08AF9AE78C1239F, 0x00003FDE -ASM_SIZE_DIRECTIVE(double_sinh_p_table) - -double_sinh_ab_table: -ASM_TYPE_DIRECTIVE(double_sinh_ab_table,@object) - data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC - data8 0x88888888884ECDD5, 0x00003FF8 - data8 0xD00D0C6DCC26A86B, 0x00003FF2 - data8 0x8000000000000002, 0x00003FFE - data8 0xAAAAAAAAAA402C77, 0x00003FFA - data8 0xB60B6CC96BDB144D, 0x00003FF5 -ASM_SIZE_DIRECTIVE(double_sinh_ab_table) - -double_sinh_j_table: -ASM_TYPE_DIRECTIVE(double_sinh_j_table,@object) - data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000 - data8 0xB6FD91E328D17791, 0x00003FFE, 
0x1CE2CBE2, 0x00000000 - data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000 - data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000 - data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000 - data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000 - data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000 - data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000 - data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000 - data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000 - data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000 - data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000 - data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000 - data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000 - data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000 - data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000 - data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000 - data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000 - data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000 - data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000 - data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000 - data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000 - data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000 - data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000 - data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000 - data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000 - data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000 - data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000 - data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000 - data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000 - data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000 - data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000 - data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000 - data8 
0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000 - data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000 - data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000 - data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000 - data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000 - data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000 - data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000 - data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000 - data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000 - data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000 - data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000 - data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000 - data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000 - data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000 - data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000 - data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000 - data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000 - data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000 - data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000 - data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000 - data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000 - data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000 - data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000 - data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000 - data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000 - data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000 - data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000 - data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000 - data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000 - data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000 - data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000 - data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 
0x00000000 -ASM_SIZE_DIRECTIVE(double_sinh_j_table) - -.align 32 -.global sinhf# - -.section .text -.proc sinhf# -.align 32 - -sinhf: -#ifdef _LIBC -.global __ieee754_sinhf -.type __ieee754_sinhf,@function -__ieee754_sinhf: -#endif - -// X infinity or NAN? -// Take invalid fault if enabled - -{ .mfi - alloc r32 = ar.pfs,0,12,4,0 -(p0) fclass.m.unc p6,p0 = f8, 0xe3 //@qnan | @snan | @inf - mov sinh_GR_all_ones = -1 -} -;; +LOCAL_OBJECT_START(_sinhf_table) +data4 0x42b2d4fd // Smallest single arg to overflow single result +data4 0x42b2d4fc // Largest single arg to give normal single result +data4 0x00000000 // pad +data4 0x00000000 // pad +// +// 2^(j/64) table, j goes from 0 to 63 +data8 0x0000000000000000 // 2^(0/64) +data8 0x00002C9A3E778061 // 2^(1/64) +data8 0x000059B0D3158574 // 2^(2/64) +data8 0x0000874518759BC8 // 2^(3/64) +data8 0x0000B5586CF9890F // 2^(4/64) +data8 0x0000E3EC32D3D1A2 // 2^(5/64) +data8 0x00011301D0125B51 // 2^(6/64) +data8 0x0001429AAEA92DE0 // 2^(7/64) +data8 0x000172B83C7D517B // 2^(8/64) +data8 0x0001A35BEB6FCB75 // 2^(9/64) +data8 0x0001D4873168B9AA // 2^(10/64) +data8 0x0002063B88628CD6 // 2^(11/64) +data8 0x0002387A6E756238 // 2^(12/64) +data8 0x00026B4565E27CDD // 2^(13/64) +data8 0x00029E9DF51FDEE1 // 2^(14/64) +data8 0x0002D285A6E4030B // 2^(15/64) +data8 0x000306FE0A31B715 // 2^(16/64) +data8 0x00033C08B26416FF // 2^(17/64) +data8 0x000371A7373AA9CB // 2^(18/64) +data8 0x0003A7DB34E59FF7 // 2^(19/64) +data8 0x0003DEA64C123422 // 2^(20/64) +data8 0x0004160A21F72E2A // 2^(21/64) +data8 0x00044E086061892D // 2^(22/64) +data8 0x000486A2B5C13CD0 // 2^(23/64) +data8 0x0004BFDAD5362A27 // 2^(24/64) +data8 0x0004F9B2769D2CA7 // 2^(25/64) +data8 0x0005342B569D4F82 // 2^(26/64) +data8 0x00056F4736B527DA // 2^(27/64) +data8 0x0005AB07DD485429 // 2^(28/64) +data8 0x0005E76F15AD2148 // 2^(29/64) +data8 0x0006247EB03A5585 // 2^(30/64) +data8 0x0006623882552225 // 2^(31/64) +data8 0x0006A09E667F3BCD // 2^(32/64) +data8 0x0006DFB23C651A2F // 
2^(33/64) +data8 0x00071F75E8EC5F74 // 2^(34/64) +data8 0x00075FEB564267C9 // 2^(35/64) +data8 0x0007A11473EB0187 // 2^(36/64) +data8 0x0007E2F336CF4E62 // 2^(37/64) +data8 0x00082589994CCE13 // 2^(38/64) +data8 0x000868D99B4492ED // 2^(39/64) +data8 0x0008ACE5422AA0DB // 2^(40/64) +data8 0x0008F1AE99157736 // 2^(41/64) +data8 0x00093737B0CDC5E5 // 2^(42/64) +data8 0x00097D829FDE4E50 // 2^(43/64) +data8 0x0009C49182A3F090 // 2^(44/64) +data8 0x000A0C667B5DE565 // 2^(45/64) +data8 0x000A5503B23E255D // 2^(46/64) +data8 0x000A9E6B5579FDBF // 2^(47/64) +data8 0x000AE89F995AD3AD // 2^(48/64) +data8 0x000B33A2B84F15FB // 2^(49/64) +data8 0x000B7F76F2FB5E47 // 2^(50/64) +data8 0x000BCC1E904BC1D2 // 2^(51/64) +data8 0x000C199BDD85529C // 2^(52/64) +data8 0x000C67F12E57D14B // 2^(53/64) +data8 0x000CB720DCEF9069 // 2^(54/64) +data8 0x000D072D4A07897C // 2^(55/64) +data8 0x000D5818DCFBA487 // 2^(56/64) +data8 0x000DA9E603DB3285 // 2^(57/64) +data8 0x000DFC97337B9B5F // 2^(58/64) +data8 0x000E502EE78B3FF6 // 2^(59/64) +data8 0x000EA4AFA2A490DA // 2^(60/64) +data8 0x000EFA1BEE615A27 // 2^(61/64) +data8 0x000F50765B6E4540 // 2^(62/64) +data8 0x000FA7C1819E90D8 // 2^(63/64) +LOCAL_OBJECT_END(_sinhf_table) + +LOCAL_OBJECT_START(sinh_p_table) +data8 0x3ec749d84bc96d7d // A4 +data8 0x3f2a0168d09557cf // A3 +data8 0x3f811111326ed15a // A2 +data8 0x3fc55555552ed1e2 // A1 +LOCAL_OBJECT_END(sinh_p_table) -{ .mfb - nop.m 999 -(p6) fma.s.s0 f8 = f8,f1,f8 -(p6) br.ret.spnt b0 ;; -} +.section .text +GLOBAL_IEEE754_ENTRY(sinhf) -// Put 0.25 in f9; p6 true if x < 0.25 -// Make constant that will generate inexact when squared { .mlx - setf.sig sinh_FR_all_ones = sinh_GR_all_ones -(p0) movl r32 = 0x000000000000fffd ;; -} - -{ .mfi -(p0) setf.exp f9 = r32 -(p0) fclass.m.unc p7,p0 = f8, 0x07 //@zero - nop.i 999 ;; -} - -{ .mfb - nop.m 999 -(p0) fmerge.s sinh_FR_X = f0,f8 -(p7) br.ret.spnt b0 ;; + getf.exp rSignexp_x = f8 // Must recompute if x unorm + movl r64DivLn2 = 0x40571547652B82FE // 
64/ln(2) } - -// Identify denormal operands. -{ .mfi - nop.m 999 - fclass.m.unc p10,p0 = f8, 0x09 // + denorm - nop.i 999 -};; -{ .mfi - nop.m 999 - fclass.m.unc p11,p0 = f8, 0x0a // - denorm - nop.i 999 -} - -{ .mfi - nop.m 999 -(p0) fmerge.s sinh_FR_SGNX = f8,f1 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fcmp.lt.unc.s1 p0,p7 = sinh_FR_X,f9 - nop.i 999 ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p7) br.cond.sptk L(SINH_BY_TBL) ;; -} - - -L(SINH_BY_POLY): - -// POLY cannot overflow so there is no need to call __libm_error_support -// Set tiny_SAFE (p7) to 1(0) if answer is not tiny -// Currently we do not use tiny_SAFE. So the setting of tiny_SAFE is -// commented out. -//(p0) movl r32 = 0x000000000000fc01 -//(p0) setf.exp f10 = r32 -//(p0) fcmp.lt.unc.s1 p6,p7 = f8,f10 -// Here is essentially the algorithm for SINH_BY_POLY. Care is take for the order -// of multiplication; and P_1 is not exactly 1/3!, P_2 is not exactly 1/5!, etc. -// Note that ax = |x| -// sinh(x) = sign * (series(e^x) - series(e^-x))/2 -// = sign * (ax + ax^3/3! + ax^5/5! + ax^7/7! + ax^9/9! + ax^11/11! + ax^13/13!) -// = sign * (ax + ax * ( ax^2 * (1/3! + ax^4 * (1/7! + ax^4*1/11!)) ) -// + ax * ( ax^4 * (1/5! + ax^4 * (1/9! 
+ ax^4*1/13!)) ) ) -// = sign * (ax + ax*p_odd + (ax*p_even)) -// = sign * (ax + Y_lo) -// sinh(x) = sign * (Y_hi + Y_lo) -// Get the values of P_x from the table -{ .mfb -(p0) addl r34 = @ltoff(double_sinh_p_table), gp -(p10) fma.s.s0 f8 = f8,f8,f8 -(p10) br.ret.spnt b0 -} -;; - -{ .mfb - ld8 r34 = [r34] -(p11) fnma.s.s0 f8 = f8,f8,f8 -(p11) br.ret.spnt b0 +{ .mlx + addl rTblAddr = @ltoff(_sinhf_table),gp + movl rRightShifter = 0x43E8000000000000 // DP Right Shifter } ;; -// Calculate sinh_FR_X2 = ax*ax and sinh_FR_X4 = ax*ax*ax*ax -{ .mmf - nop.m 999 -(p0) ldfe sinh_FR_P1 = [r34],16 -(p0) fma.s1 sinh_FR_X2 = sinh_FR_X, sinh_FR_X, f0 ;; -} - -{ .mmi -(p0) ldfe sinh_FR_P2 = [r34],16 ;; -(p0) ldfe sinh_FR_P3 = [r34],16 - nop.i 999 ;; -} - -{ .mmi -(p0) ldfe sinh_FR_P4 = [r34],16 ;; -(p0) ldfe sinh_FR_P5 = [r34],16 - nop.i 999 ;; -} - -{ .mfi -(p0) ldfe sinh_FR_P6 = [r34],16 -(p0) fma.s1 sinh_FR_X4 = sinh_FR_X2, sinh_FR_X2, f0 - nop.i 999 ;; -} - -// Calculate sinh_FR_podd = p_odd and sinh_FR_peven = p_even { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_poly_podd_temp1 = sinh_FR_X4, sinh_FR_P5, sinh_FR_P3 - nop.i 999 ;; + // point to the beginning of the table + ld8 rTblAddr = [rTblAddr] + fclass.m p6, p0 = f8, 0x0b // Test for x=unorm + addl rA3 = 0x3E2AA, r0 // high bits of 1.0/6.0 rounded to SP } - { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_poly_podd_temp2 = sinh_FR_X4, sinh_FR_poly_podd_temp1, sinh_FR_P1 - nop.i 999 + nop.m 0 + fnorm.s1 fNormX = f8 // normalized x + addl rExpHalf = 0xFFFE, r0 // exponent of 1/2 } +;; { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_poly_peven_temp1 = sinh_FR_X4, sinh_FR_P6, sinh_FR_P4 - nop.i 999 ;; + setf.d f64DivLn2 = r64DivLn2 // load 64/ln(2) to FP reg + fclass.m p15, p0 = f8, 0x1e3 // test for NaT,NaN,Inf + nop.i 0 } - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_podd = sinh_FR_X2, sinh_FR_poly_podd_temp2, f0 - nop.i 999 +{ .mlx + // load Right Shifter to FP reg + setf.d fRightShifter = rRightShifter + movl rLn2Div64 = 0x3F862E42FEFA39EF // DP 
ln(2)/64 in GR } +;; { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_poly_peven_temp2 = sinh_FR_X4, sinh_FR_poly_peven_temp1, sinh_FR_P2 - nop.i 999 ;; + mov rExp_mask = 0x1ffff + fcmp.eq.s1 p13, p0 = f0, f8 // test for x = 0.0 + shl rA3 = rA3, 12 // 0x3E2AA000, approx to 1.0/6.0 in SP } - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_peven = sinh_FR_X4, sinh_FR_poly_peven_temp2, f0 - nop.i 999 ;; +{ .mfb + nop.m 0 + nop.f 0 +(p6) br.cond.spnt SINH_UNORM // Branch if x=unorm } +;; -// Calculate sinh_FR_Y_lo = ax*p_odd + (ax*p_even) +SINH_COMMON: { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_X, sinh_FR_peven, f0 - nop.i 999 ;; + setf.exp fA2 = rExpHalf // load A2 to FP reg + nop.f 0 + mov rExp_bias = 0xffff } - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_X, sinh_FR_podd, sinh_FR_Y_lo_temp - nop.i 999 ;; +{ .mfb + setf.d fLn2Div64 = rLn2Div64 // load ln(2)/64 to FP reg +(p15) fma.s.s0 f8 = f8, f1, f0 // result if x = NaT,NaN,Inf +(p15) br.ret.spnt b0 // exit here if x = NaT,NaN,Inf } +;; -// Calculate sinh_FR_SINH = Y_hi + Y_lo. Note that ax = Y_hi { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_SINH = sinh_FR_X, f1, sinh_FR_Y_lo - nop.i 999 ;; + // min overflow and max normal threshold + ldfps fMIN_SGL_OFLOW_ARG, fMAX_SGL_NORM_ARG = [rTblAddr], 8 + nop.f 0 + and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x } -// Dummy multiply to generate inexact -{ .mfi - nop.m 999 -(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones - nop.i 999 -} - -// Calculate f8 = sign * (Y_hi + Y_lo) -// Go to return { .mfb - nop.m 999 -(p0) fma.s.s0 f8 = sinh_FR_SGNX,sinh_FR_SINH,f0 -(p0) br.ret.sptk b0 ;; -} - - -L(SINH_BY_TBL): - -// Now that we are at TBL; so far all we know is that |x| >= 0.25. -// The first two steps are the same for TBL and EXP, but if we are HUGE -// we want to leave now. 
-// Double-extended: -// Go to HUGE if |x| >= 2^14, 1000d (register-biased) is e = 14 (true) -// Double -// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true) -// Single -// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true) - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000010006 ;; + setf.s fA3 = rA3 // load A3 to FP reg + nop.f 0 +(p13) br.ret.spnt b0 // exit here if x=0.0, return x } +;; { .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + sub rExp_x = rExp_x, rExp_bias // True exponent of x + fmerge.s fAbsX = f0, fNormX // Form |x| + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fcmp.ge.unc.s1 p6,p7 = sinh_FR_X,f9 - nop.i 999 ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p6) br.cond.spnt L(SINH_HUGE) ;; -} - -// r32 = 1 -// r34 = N-1 -// r35 = N -// r36 = j -// r37 = N+1 - -// TBL can never overflow -// sinh(x) = sinh(B+R) -// = sinh(B)cosh(R) + cosh(B)sinh(R) -// -// ax = |x| = M*log2/64 + R -// B = M*log2/64 -// M = 64*N + j -// We will calcualte M and get N as (M-j)/64 -// The division is a shift. -// exp(B) = exp(N*log2 + j*log2/64) -// = 2^N * 2^(j*log2/64) -// sinh(B) = 1/2(e^B -e^-B) -// = 1/2(2^N * 2^(j*log2/64) - 2^-N * 2^(-j*log2/64)) -// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64)) -// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64)) -// 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32 -// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit) -// R = ax - M*log2/64 -// R = ax - M*log2_by_64_hi - M*log2_by_64_lo -// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...) 
-// = 1 + p_odd + p_even -// where the p_even uses the A coefficients and the p_even uses the B coefficients -// So sinh(R) = 1 + p_odd + p_even -(1 -p_odd -p_even)/2 = p_odd -// cosh(R) = 1 + p_even -// sinh(B) = S_hi + S_lo -// cosh(B) = C_hi -// sinh(x) = sinh(B)cosh(R) + cosh(B)sinh(R) -// ****************************************************** -// STEP 1 (TBL and EXP) -// ****************************************************** -// Get the following constants. -// f9 = Inv_log2by64 -// f10 = log2by64_hi -// f11 = log2by64_lo - -{ .mmi -(p0) adds r32 = 0x1,r0 -(p0) addl r34 = @ltoff(double_sinh_arg_reduction), gp - nop.i 999 + nop.m 0 + // x*(64/ln(2)) + Right Shifter + fma.s1 fNint = fNormX, f64DivLn2, fRightShifter + add rTblAddr = 8, rTblAddr } -;; - -{ .mmi - ld8 r34 = [r34] - nop.m 999 - nop.i 999 +{ .mfb + cmp.gt p7, p0 = -2, rExp_x // Test |x| < 2^(-2) + fma.s1 fXsq = fNormX, fNormX, f0 // x*x for small path +(p7) br.cond.spnt SINH_SMALL // Branch if 0 < |x| < 2^-2 } ;; - -// We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and -// put them in an exponent. 
-// sinh_FR_spos = 2^(N-1) and sinh_FR_sneg = 2^(-N-1) -// r39 = 0xffff + (N-1) = 0xffff +N -1 -// r40 = 0xffff - (N +1) = 0xffff -N -1 - -{ .mlx - nop.m 999 -(p0) movl r38 = 0x000000000000fffe ;; -} - -{ .mmi -(p0) ldfe sinh_FR_Inv_log2by64 = [r34],16 ;; -(p0) ldfe sinh_FR_log2by64_hi = [r34],16 - nop.i 999 ;; -} - -{ .mbb -(p0) ldfe sinh_FR_log2by64_lo = [r34],16 - nop.b 999 - nop.b 999 ;; -} - -// Get the A coefficients -// f9 = A_1 -// f10 = A_2 -// f11 = A_3 - -{ .mmi - nop.m 999 -(p0) addl r34 = @ltoff(double_sinh_ab_table), gp - nop.i 999 +{ .mfi + nop.m 0 + // check for overflow + fcmp.ge.s1 p12, p13 = fAbsX, fMIN_SGL_OFLOW_ARG + mov rJ_mask = 0x3f // 6-bit mask for J } ;; -{ .mmi - ld8 r34 = [r34] - nop.m 999 - nop.i 999 +{ .mfb + nop.m 0 + fms.s1 fN = fNint, f1, fRightShifter // n in FP register + // branch out if overflow +(p12) br.cond.spnt SINH_CERTAIN_OVERFLOW } ;; - -// Calculate M and keep it as integer and floating point. -// f38 = M = round-to-integer(x*Inv_log2by64) -// sinh_FR_M = M = truncate(ax/(log2/64)) -// Put the significand of M in r35 -// and the floating point representation of M in sinh_FR_M - { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_M = sinh_FR_X, sinh_FR_Inv_log2by64, f0 - nop.i 999 + getf.sig rNJ = fNint // bits of n, j + // check for possible overflow + fcmp.gt.s1 p13, p0 = fAbsX, fMAX_SGL_NORM_ARG + nop.i 0 } +;; { .mfi -(p0) ldfe sinh_FR_A1 = [r34],16 - nop.f 999 - nop.i 999 ;; + addl rN = 0xFFBF - 63, rNJ // biased and shifted n-1,j + fnma.s1 fR = fLn2Div64, fN, fNormX // R = x - N*ln(2)/64 + and rJ = rJ_mask, rNJ // bits of j } - { .mfi - nop.m 999 -(p0) fcvt.fx.s1 sinh_FR_M_temp = sinh_FR_M - nop.i 999 ;; + sub rNJ_neg = r0, rNJ // bits of n, j for -x + nop.f 0 + andcm rN_mask = -1, rJ_mask // 0xff...fc0 to mask N } +;; { .mfi - nop.m 999 -(p0) fnorm.s1 sinh_FR_M = sinh_FR_M_temp - nop.i 999 ;; + shladd rJ = rJ, 3, rTblAddr // address in the 2^(j/64) table + nop.f 0 + and rN = rN_mask, rN // biased, shifted n-1 } - { .mfi -(p0) 
getf.sig r35 = sinh_FR_M_temp - nop.f 999 - nop.i 999 ;; + addl rN_neg = 0xFFBF - 63, rNJ_neg // -x biased, shifted n-1,j + nop.f 0 + and rJ_neg = rJ_mask, rNJ_neg // bits of j for -x } - -// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It -// has a range of -32 thru 31. -// r35 = M -// r36 = j - -{ .mii - nop.m 999 - nop.i 999 ;; -(p0) and r36 = 0x3f, r35 ;; -} - -// Calculate R -// f13 = f44 - f12*f10 = ax - M*log2by64_hi -// f14 = f13 - f8*f11 = R = (ax - M*log2by64_hi) - M*log2by64_lo +;; { .mfi - nop.m 999 -(p0) fnma.s1 sinh_FR_R_temp = sinh_FR_M, sinh_FR_log2by64_hi, sinh_FR_X - nop.i 999 + ld8 rJ = [rJ] // Table value + nop.f 0 + shl rN = rN, 46 // 2^(n-1) bits in DP format } - { .mfi -(p0) ldfe sinh_FR_A2 = [r34],16 - nop.f 999 - nop.i 999 ;; + shladd rJ_neg = rJ_neg, 3, rTblAddr // addr in 2^(j/64) table -x + nop.f 0 + and rN_neg = rN_mask, rN_neg // biased, shifted n-1 for -x } +;; { .mfi - nop.m 999 -(p0) fnma.s1 sinh_FR_R = sinh_FR_M, sinh_FR_log2by64_lo, sinh_FR_R_temp - nop.i 999 + ld8 rJ_neg = [rJ_neg] // Table value for -x + nop.f 0 + shl rN_neg = rN_neg, 46 // 2^(n-1) bits in DP format for -x } - -// Get the B coefficients -// f15 = B_1 -// f32 = B_2 -// f33 = B_3 - -{ .mmi -(p0) ldfe sinh_FR_A3 = [r34],16 ;; -(p0) ldfe sinh_FR_B1 = [r34],16 - nop.i 999 ;; -} - -{ .mmi -(p0) ldfe sinh_FR_B2 = [r34],16 ;; -(p0) ldfe sinh_FR_B3 = [r34],16 - nop.i 999 ;; -} - -{ .mii - nop.m 999 -(p0) shl r34 = r36, 0x2 ;; -(p0) sxt1 r37 = r34 ;; -} - -// ****************************************************** -// STEP 2 (TBL and EXP) -// ****************************************************** -// Calculate Rsquared and Rcubed in preparation for p_even and p_odd -// f12 = R*R*R -// f13 = R*R -// f14 = R <== from above +;; { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Rsq = sinh_FR_R, sinh_FR_R, f0 -(p0) shr r36 = r37, 0x2 ;; -} - -// r34 = M-j = r35 - r36 -// r35 = N = (M-j)/64 - -{ .mii -(p0) sub r34 = r35, r36 - nop.i 999 ;; -(p0) shr r35 = 
r34, 0x6 ;; -} - -{ .mii -(p0) sub r40 = r38, r35 -(p0) adds r37 = 0x1, r35 -(p0) add r39 = r38, r35 ;; -} - -// Get the address of the J table, add the offset, -// addresses are sinh_AD_mJ and sinh_AD_J, get the T value -// f32 = T(j)_hi -// f33 = T(j)_lo -// f34 = T(-j)_hi -// f35 = T(-j)_lo - -{ .mmi -(p0) sub r34 = r35, r32 -(p0) addl r37 = @ltoff(double_sinh_j_table), gp - nop.i 999 + or rN = rN, rJ // bits of 2^n * 2^(j/64) in DP format + nop.f 0 + nop.i 0 } ;; -{ .mmi - ld8 r37 = [r37] - nop.m 999 - nop.i 999 +{ .mmf + setf.d fT = rN // 2^(n-1) * 2^(j/64) + or rN_neg = rN_neg, rJ_neg // -x bits of 2^n * 2^(j/64) in DP + fma.s1 fRSqr = fR, fR, f0 // R^2 } ;; - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Rcub = sinh_FR_Rsq, sinh_FR_R, f0 - nop.i 999 -} - -// ****************************************************** -// STEP 3 Now decide if we need to branch to EXP -// ****************************************************** -// Put 32 in f9; p6 true if x < 32 -// Go to EXP if |x| >= 32 - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000010004 ;; -} - -// Calculate p_even -// f34 = B_2 + Rsq *B_3 -// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3) -// f36 = p_even = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3)) - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_peven_temp1 = sinh_FR_Rsq, sinh_FR_B3, sinh_FR_B2 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_peven_temp2 = sinh_FR_Rsq, sinh_FR_peven_temp1, sinh_FR_B1 - nop.i 999 -} - -// Calculate p_odd -// f34 = A_2 + Rsq *A_3 -// f35 = A_1 + Rsq * (A_2 + Rsq *A_3) -// f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3)) - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_podd_temp1 = sinh_FR_Rsq, sinh_FR_A3, sinh_FR_A2 - nop.i 999 ;; -} - { .mfi -(p0) setf.exp sinh_FR_N_temp1 = r39 - nop.f 999 - nop.i 999 ;; + setf.d fT_neg = rN_neg // 2^(n-1) * 2^(j/64) for -x + fma.s1 fP = fA3, fR, fA2 // A3*R + A2 + nop.i 0 } - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_peven = sinh_FR_Rsq, sinh_FR_peven_temp2, f0 - nop.i 999 -} - 
{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_podd_temp2 = sinh_FR_Rsq, sinh_FR_podd_temp1, sinh_FR_A1 - nop.i 999 ;; + nop.m 0 + fnma.s1 fP_neg = fA3, fR, fA2 // A3*R + A2 for -x + nop.i 0 } +;; { .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + nop.m 0 + fma.s1 fP = fP, fRSqr, fR // P = (A3*R + A2)*R^2 + R + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_podd = sinh_FR_podd_temp2, sinh_FR_Rcub, sinh_FR_R - nop.i 999 -} - -// sinh_GR_mj contains the table offset for -j -// sinh_GR_j contains the table offset for +j -// p6 is true when j <= 0 - -{ .mlx -(p0) setf.exp sinh_FR_N_temp2 = r40 -(p0) movl r40 = 0x0000000000000020 ;; + nop.m 0 + fms.s1 fP_neg = fP_neg, fRSqr, fR // P = (A3*R + A2)*R^2 + R, -x + nop.i 0 } +;; { .mfi -(p0) sub sinh_GR_mJ = r40, r36 -(p0) fmerge.se sinh_FR_spos = sinh_FR_N_temp1, f1 -(p0) adds sinh_GR_J = 0x20, r36 ;; -} - -{ .mii - nop.m 999 -(p0) shl sinh_GR_mJ = sinh_GR_mJ, 5 ;; -(p0) add sinh_AD_mJ = r37, sinh_GR_mJ ;; -} - -{ .mmi - nop.m 999 -(p0) ldfe sinh_FR_Tmjhi = [sinh_AD_mJ],16 -(p0) shl sinh_GR_J = sinh_GR_J, 5 ;; + nop.m 0 + fmpy.s0 fTmp = fLn2Div64, fLn2Div64 // Force inexact + nop.i 0 } +;; { .mfi -(p0) ldfs sinh_FR_Tmjlo = [sinh_AD_mJ],16 -(p0) fcmp.lt.unc.s1 p0,p7 = sinh_FR_X,f9 -(p0) add sinh_AD_J = r37, sinh_GR_J ;; + nop.m 0 + fma.s1 fExp = fP, fT, fT // exp(x)/2 + nop.i 0 } - -{ .mmi -(p0) ldfe sinh_FR_Tjhi = [sinh_AD_J],16 ;; -(p0) ldfs sinh_FR_Tjlo = [sinh_AD_J],16 - nop.i 999 ;; +{ .mfb + nop.m 0 + fma.s1 fExp_neg = fP_neg, fT_neg, fT_neg // exp(-x)/2 + // branch out if possible overflow result +(p13) br.cond.spnt SINH_POSSIBLE_OVERFLOW } +;; { .mfb - nop.m 999 -(p0) fmerge.se sinh_FR_sneg = sinh_FR_N_temp2, f1 -(p7) br.cond.spnt L(SINH_BY_EXP) ;; + nop.m 0 + // final result in the absence of overflow + fms.s.s0 f8 = fExp, f1, fExp_neg // result = (exp(x)-exp(-x))/2 + // exit here in the absence of overflow + br.ret.sptk b0 // Exit main path, 0.25 <= |x| < 89.41598 } +;; +// Here if 0 < |x| < 0.25. 
Evaluate 9th order polynomial. +SINH_SMALL: { .mfi - nop.m 999 - nop.f 999 - nop.i 999 ;; + add rAd1 = 0x200, rTblAddr + fcmp.lt.s1 p7, p8 = fNormX, f0 // Test sign of x + cmp.gt p6, p0 = -60, rExp_x // Test |x| < 2^(-60) } - -// ****************************************************** -// If NOT branch to EXP -// ****************************************************** -// Calculate S_hi and S_lo -// sinh_FR_S_hi_temp = sinh_FR_sneg * sinh_FR_Tmjhi -// sinh_FR_S_hi = sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi_temp -// sinh_FR_S_hi = sinh_FR_spos * sinh_FR_Tjhi - (sinh_FR_sneg * sinh_FR_Tmjlo) - { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_S_hi_temp = sinh_FR_sneg, sinh_FR_Tmjhi, f0 - nop.i 999 ;; + add rAd2 = 0x210, rTblAddr + nop.f 0 + nop.i 0 } +;; -{ .mfi - nop.m 999 -(p0) fms.s1 sinh_FR_S_hi = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_S_hi_temp - nop.i 999 +{ .mmb + ldfpd fA4, fA3 = [rAd1] + ldfpd fA2, fA1 = [rAd2] +(p6) br.cond.spnt SINH_VERY_SMALL // Branch if |x| < 2^(-60) } - -// Calculate C_hi -// sinh_FR_C_hi_temp1 = sinh_FR_sneg * sinh_FR_Tmjhi -// sinh_FR_C_hi = sinh_FR_spos * sinh_FR_Tjhi + sinh_FR_C_hi_temp1 - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_C_hi_temp1 = sinh_FR_sneg, sinh_FR_Tmjhi, f0 - nop.i 999 ;; -} - -// sinh_FR_S_lo_temp1 = sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi -// sinh_FR_S_lo_temp2 = -sinh_FR_sneg * sinh_FR_Tmjlo + (sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi) -// sinh_FR_S_lo_temp2 = -sinh_FR_sneg * sinh_FR_Tmjlo + (sinh_FR_S_lo_temp1 ) +;; { .mfi - nop.m 999 -(p0) fms.s1 sinh_FR_S_lo_temp1 = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_S_hi - nop.i 999 + nop.m 0 + fma.s1 fX3 = fXsq, fNormX, f0 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_C_hi = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_C_hi_temp1 - nop.i 999 ;; + nop.m 0 + fma.s1 fX4 = fXsq, fXsq, f0 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fnma.s1 sinh_FR_S_lo_temp2 = sinh_FR_sneg, sinh_FR_Tmjhi, sinh_FR_S_lo_temp1 - nop.i 999 + nop.m 0 + fma.s1 fA43 = fXsq, fA4, fA3 + nop.i 0 } - -// 
sinh_FR_S_lo_temp1 = sinh_FR_sneg * sinh_FR_Tmjlo -// sinh_FR_S_lo_temp3 = sinh_FR_spos * sinh_FR_Tjlo - sinh_FR_S_lo_temp1 -// sinh_FR_S_lo_temp3 = sinh_FR_spos * sinh_FR_Tjlo -(sinh_FR_sneg * sinh_FR_Tmjlo) -// sinh_FR_S_lo = sinh_FR_S_lo_temp3 + sinh_FR_S_lo_temp2 - { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_S_lo_temp1 = sinh_FR_sneg, sinh_FR_Tmjlo, f0 - nop.i 999 ;; -} - -/////////// BUG FIX fma to fms -TK -{ .mfi - nop.m 999 -(p0) fms.s1 sinh_FR_S_lo_temp3 = sinh_FR_spos, sinh_FR_Tjlo, sinh_FR_S_lo_temp1 - nop.i 999 ;; + nop.m 0 + fma.s1 fA21 = fXsq, fA2, fA1 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_S_lo = sinh_FR_S_lo_temp3, f1, sinh_FR_S_lo_temp2 - nop.i 999 ;; + nop.m 0 + fma.s1 fA4321 = fX4, fA43, fA21 + nop.i 0 } +;; -// Y_hi = S_hi -// Y_lo = C_hi*p_odd + (S_hi*p_even + S_lo) -// sinh_FR_Y_lo_temp = sinh_FR_S_hi * sinh_FR_peven + sinh_FR_S_lo -// sinh_FR_Y_lo = sinh_FR_C_hi * sinh_FR_podd + sinh_FR_Y_lo_temp - +// Dummy multiply to generate inexact { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_S_hi, sinh_FR_peven, sinh_FR_S_lo - nop.i 999 ;; + nop.m 0 + fmpy.s0 fTmp = fA4, fA4 + nop.i 0 } - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_C_hi, sinh_FR_podd, sinh_FR_Y_lo_temp - nop.i 999 ;; +{ .mfb + nop.m 0 + fma.s.s0 f8 = fA4321, fX3, fNormX + br.ret.sptk b0 // Exit if 2^-60 < |x| < 0.25 } +;; -// sinh_FR_SINH = Y_hi + Y_lo -// f8 = answer = sinh_FR_SGNX * sinh_FR_SINH - -// Dummy multiply to generate inexact -{ .mfi - nop.m 999 -(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones - nop.i 999 -} +SINH_VERY_SMALL: +// Here if 0 < |x| < 2^-60 +// Compute result by x + sgn(x)*x^2 to get properly rounded result +.pred.rel "mutex",p7,p8 { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_SINH = sinh_FR_S_hi, f1, sinh_FR_Y_lo - nop.i 999 ;; + nop.m 0 +(p7) fnma.s.s0 f8 = fNormX, fNormX, fNormX // If x<0 result ~ x-x^2 + nop.i 0 } - { .mfb - nop.m 999 -(p0) fma.s.s0 f8 = sinh_FR_SGNX, sinh_FR_SINH,f0 -(p0) br.ret.sptk b0 ;; + 
nop.m 0 +(p8) fma.s.s0 f8 = fNormX, fNormX, fNormX // If x>0 result ~ x+x^2 + br.ret.sptk b0 // Exit if |x| < 2^-60 } +;; +SINH_POSSIBLE_OVERFLOW: -L(SINH_BY_EXP): +// Here if fMAX_SGL_NORM_ARG < x < fMIN_SGL_OFLOW_ARG +// This cannot happen if input is a single, only if input higher precision. +// Overflow is a possibility, not a certainty. -// When p7 is true, we know that an overflow is not going to happen -// When p7 is false, we must check for possible overflow -// p7 is the over_SAFE flag -// Y_hi = Tjhi -// Y_lo = Tjhi * (p_odd + p_even) +Tjlo -// Scale = sign * 2^(N-1) -// sinh_FR_Y_lo = sinh_FR_Tjhi * (sinh_FR_peven + sinh_FR_podd) -// sinh_FR_Y_lo = sinh_FR_Tjhi * (sinh_FR_Y_lo_temp ) +// Recompute result using status field 2 with user's rounding mode, +// and wre set. If result is larger than largest single, then we have +// overflow { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_peven, f1, sinh_FR_podd - nop.i 999 -} - -// Now we are in EXP. This is the only path where an overflow is possible -// but not for certain. So this is the only path where over_SAFE has any use. 
-// r34 still has N-1 -// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe -// There is a danger of double overflow if N-1 > 0x3fe = 1022 -// There is a danger of single overflow if N-1 > 0x7e = 126 -{ .mlx - nop.m 999 -(p0) movl r32 = 0x000000000000007e ;; -} - -{ .mfi -(p0) cmp.gt.unc p0,p7 = r34, r32 -(p0) fmerge.s sinh_FR_SCALE = sinh_FR_SGNX, sinh_FR_spos - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_Tjhi, sinh_FR_Y_lo_temp, sinh_FR_Tjlo - nop.i 999 ;; + mov rGt_ln = 0x1007f // Exponent for largest single + 1 ulp + fsetc.s2 0x7F,0x42 // Get user's round mode, set wre + nop.i 0 } +;; -// f8 = answer = scale * (Y_hi + Y_lo) { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_SINH_temp = sinh_FR_Y_lo, f1, sinh_FR_Tjhi - nop.i 999 ;; + setf.exp fGt_pln = rGt_ln // Create largest single + 1 ulp + fma.s.s2 fWre_urm_f8 = fP, fT, fT // Result with wre set + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s.s0 f44 = sinh_FR_SCALE, sinh_FR_SINH_temp, f0 - nop.i 999 ;; + nop.m 0 + fsetc.s2 0x7F,0x40 // Turn off wre in sf2 + nop.i 0 } +;; -// Dummy multiply to generate inexact { .mfi - nop.m 999 -(p7) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones - nop.i 999 ;; + nop.m 0 + fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow + nop.i 0 } +;; -// If over_SAFE is set, return { .mfb - nop.m 999 -(p7) fmerge.s f8 = f44,f44 -(p7) br.ret.sptk b0 ;; + nop.m 0 + nop.f 0 +(p6) br.cond.spnt SINH_CERTAIN_OVERFLOW // Branch if overflow } +;; -// Else see if we overflowed -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// If WRE is set then an overflow will not occur in EXP. -// The input value that would cause a register (WRE) value to overflow is about 2^15 -// and this input would go into the HUGE path. -// Answer with WRE is in f43. 
- -{ .mfi - nop.m 999 -(p0) fsetc.s2 0x7F,0x42 - nop.i 999;; +{ .mfb + nop.m 0 + fma.s.s0 f8 = fP, fT, fT + br.ret.sptk b0 // Exit if really no overflow } +;; +// here if overflow +SINH_CERTAIN_OVERFLOW: { .mfi - nop.m 999 -(p0) fma.s.s2 f43 = sinh_FR_SCALE, sinh_FR_SINH_temp, f0 - nop.i 999 ;; -} - -// 1007F => 1007F -FFFF = 80(true) -// 80 + 7F = FF, which is 1 more that the exponent of the largest -// double (FE). So 0 1007F 8000000000000000 is one ulp more than -// largest single in register bias -// Now set p8 if the answer with WRE is greater than or equal this value -// Also set p9 if the answer with WRE is less than or equal to negative this value - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000001007F ;; + addl r17ones_m1 = 0x1FFFE, r0 + fcmp.lt.s1 p6, p7 = fNormX, f0 // Test for x < 0 + nop.i 0 } +;; { .mmf - nop.m 999 -(p0) setf.exp f41 = r32 -(p0) fsetc.s2 0x7F,0x40 ;; -} - -{ .mfi - nop.m 999 -(p0) fcmp.ge.unc.s1 p8, p0 = f43, f41 - nop.i 999 + alloc r32 = ar.pfs, 0, 3, 4, 0 // get some registers + setf.exp fTmp = r17ones_m1 + fmerge.s FR_X = f8,f8 } +;; { .mfi - nop.m 999 -(p0) fmerge.ns f42 = f41, f41 - nop.i 999 ;; -} - -// The error tag for overflow is 128 -{ .mii - nop.m 999 - nop.i 999 ;; -(p8) mov r47 = 128 ;; + mov GR_Parameter_TAG = 128 +(p6) fnma.s.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and -INF result + nop.i 0 } - { .mfb - nop.m 999 -(p0) fcmp.le.unc.s1 p9, p0 = f43, f42 -(p8) br.cond.spnt L(SINH_ERROR_SUPPORT) ;; -} - -{ .mii - nop.m 999 - nop.i 999 ;; -(p9) mov r47 = 128 -} - -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.spnt L(SINH_ERROR_SUPPORT) ;; -} - -// Dummy multiply to generate inexact -{ .mfi - nop.m 999 -(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones - nop.i 999 ;; + nop.m 0 +(p7) fma.s.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result + br.cond.sptk __libm_error_region } +;; +// Here if x unorm +SINH_UNORM: { .mfb - nop.m 999 -(p0) fmerge.s f8 = f44,f44 -(p0) br.ret.sptk b0 ;; -} - -L(SINH_HUGE): - -// 
for SINH_HUGE, put 24000 in exponent; take sign from input; add 1 -// SAFE: SAFE is always 0 for HUGE - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000015dbf ;; -} - -{ .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + getf.exp rSignexp_x = fNormX // Must recompute if x unorm + fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag + br.cond.sptk SINH_COMMON // Return to main path } +;; -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_signed_hi_lo = sinh_FR_SGNX, f9, f1 - nop.i 999 ;; -} +GLOBAL_IEEE754_END(sinhf) -{ .mfi - nop.m 999 -(p0) fma.s.s0 f44 = sinh_FR_signed_hi_lo, f9, f0 -(p0) mov r47 = 128 -} -.endp sinhf -ASM_SIZE_DIRECTIVE(sinhf) -#ifdef _LIBC -ASM_SIZE_DIRECTIVE(__ieee754_sinhf) -#endif - -// Stack operations when calling error support. -// (1) (2) (3) (call) (4) -// sp -> + psp -> + psp -> + sp -> + -// | | | | -// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 -// | | | | -// | <-GR_Y Y2->| Y2 ->| <- GR_Y | -// | | | | -// | | <- GR_X X1 ->| | -// | | | | -// sp-64 -> + sp -> + sp -> + + -// save ar.pfs save b0 restore gp -// save gp restore ar.pfs - -.proc __libm_error_region -__libm_error_region: -L(SINH_ERROR_SUPPORT): +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue - -// (1) { .mfi - add GR_Parameter_Y=-32,sp // Parameter 2 value - nop.f 0 + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack - nop.f 0 - mov GR_SAVE_GP=gp // Save gp + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp };; - - -// (2) { .mmi - stfs [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; - .body -// (3) -{ .mib - stfs [GR_Parameter_X] = f8 // 
STORE Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address - nop.b 0 +{ .mfi + stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + nop.f 0 + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address } { .mib - stfs [GR_Parameter_Y] = f44 // STORE Parameter 3 on stack - add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function };; + { .mmi - nop.m 0 - nop.m 0 - add GR_Parameter_RESULT = 48,sp + add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; -// (4) { .mmi - ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp - add sp = 64,sp // Restore stack pointer - mov b0 = GR_SAVE_B0 // Restore return address + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address };; { .mib - mov gp = GR_SAVE_GP // Restore gp - mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - br.ret.sptk b0 // Return + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) + .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_sinhl.S b/sysdeps/ia64/fpu/e_sinhl.S index b880b95b64..ccc996a8cc 100644 --- a/sysdeps/ia64/fpu/e_sinhl.S +++ b/sysdeps/ia64/fpu/e_sinhl.S @@ -1,10 +1,10 @@ .file "sinhl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2002, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. 
+// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,17 +35,20 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial version -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial version +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. // 10/12/00 Update to set denormal operand and underflow flags -// 1/22/01 Fixed to set inexact flag for small args. Fixed incorrect +// 01/22/01 Fixed to set inexact flag for small args. Fixed incorrect // call to __libm_error_support for 710.476 < x < 11357.2166. 
+// 05/02/01 Reworked to improve speed of all paths +// 05/20/02 Cleaned up namespace and sf0 syntax +// 12/04/02 Improved performance // // API //============================================================== @@ -56,1269 +59,1058 @@ // Registers used //============================================================== // general registers: -// r32 -> r47 +// r14 -> r40 // predicate registers used: -// p6 p7 p8 p9 +// p6 -> p11 // floating-point registers used: -// f9 -> f15; f32 -> f45; +// f9 -> f15; f32 -> f90; // f8 has input, then output // // Overview of operation //============================================================== -// There are four paths -// 1. |x| < 0.25 SINH_BY_POLY -// 2. |x| < 32 SINH_BY_TBL -// 3. |x| < 2^14 SINH_BY_EXP -// 4. |x_ >= 2^14 SINH_HUGE -// -// For double extended we get infinity for x >= 400c b174 ddc0 31ae c0ea -// >= 1.0110001.... x 2^13 -// >= 11357.2166 +// There are seven paths +// 1. 0 < |x| < 0.25 SINH_BY_POLY +// 2. 0.25 <=|x| < 32 SINH_BY_TBL +// 3. 32 <= |x| < 11357.21655 SINH_BY_EXP (merged path with SINH_BY_TBL) +// 4. |x| >= 11357.21655 SINH_HUGE +// 5. x=0 Done with early exit +// 6. x=inf,nan Done with early exit +// 7. x=denormal SINH_DENORM // -// But for double we get infinity for x >= 408633ce8fb9f87e -// >= 1.0110...x 2^9 -// >= +7.10476e+002 +// For double extended we get overflow for x >= 400c b174 ddc0 31ae c0ea +// >= 11357.21655 // -// And for single we get infinity for x >= 42b3a496 -// >= 1.0110... 2^6 -// >= 89.8215 // -// SAFE: If there is danger of overflow set SAFE to 0 -// NOT implemented: if there is danger of underflow, set SAFE to 0 -// SAFE for all paths listed below -// -// 1. SINH_BY_POLY +// 1. 
SINH_BY_POLY 0 < |x| < 0.25 // =============== -// If |x| is less than the tiny threshold, then clear SAFE -// For double, the tiny threshold is -1022 = -0x3fe => -3fe + ffff = fc01 -// register-biased, this is fc01 -// For single, the tiny threshold is -126 = -7e => -7e + ffff = ff81 -// If |x| < tiny threshold, set SAFE = 0 +// Evaluate sinh(x) by a 13th order polynomial +// Care is take for the order of multiplication; and P_1 is not exactly 1/3!, +// P_2 is not exactly 1/5!, etc. +// sinh(x) = sign * (series(e^x) - series(e^-x))/2 +// = sign * (ax + ax^3/3! + ax^5/5! + ax^7/7! + ax^9/9! + ax^11/11! +// + ax^13/13!) +// = sign * (ax + ax * ( ax^2 * (1/3! + ax^4 * (1/7! + ax^4*1/11!)) ) +// + ax * ( ax^4 * (1/5! + ax^4 * (1/9! + ax^4*1/13!)) )) +// = sign * (ax + ax*p_odd + (ax*p_even)) +// = sign * (ax + Y_lo) +// sinh(x) = sign * (Y_hi + Y_lo) +// Note that ax = |x| // -// 2. SINH_BY_TBL +// 2. SINH_BY_TBL 0.25 <= |x| < 32.0 // ============= -// SAFE: SAFE is always 1 for TBL; +// sinh(x) = sinh(B+R) +// = sinh(B)cosh(R) + cosh(B)sinh(R) +// +// ax = |x| = M*log2/64 + R +// B = M*log2/64 +// M = 64*N + j +// We will calculate M and get N as (M-j)/64 +// The division is a shift. +// exp(B) = exp(N*log2 + j*log2/64) +// = 2^N * 2^(j*log2/64) +// sinh(B) = 1/2(e^B -e^-B) +// = 1/2(2^N * 2^(j*log2/64) - 2^-N * 2^(-j*log2/64)) +// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64)) +// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64)) +// 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32 +// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit) +// +// R = ax - M*log2/64 +// R = ax - M*log2_by_64_hi - M*log2_by_64_lo +// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...) 
+// = 1 + p_odd + p_even +// where the p_even uses the A coefficients and the p_even uses +// the B coefficients +// +// So sinh(R) = 1 + p_odd + p_even -(1 -p_odd -p_even)/2 = p_odd +// cosh(R) = 1 + p_even +// sinh(B) = S_hi + S_lo +// cosh(B) = C_hi +// sinh(x) = sinh(B)cosh(R) + cosh(B)sinh(R) // -// 3. SINH_BY_EXP +// 3. SINH_BY_EXP 32.0 <= |x| < 11357.21655 ( 400c b174 ddc0 31ae c0ea ) // ============== -// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe -// r34 has N-1; 16382 is in register biased form, 0x13ffd -// There is danger of double overflow if N-1 > 0x3fe -// in register biased form, 0x103fd -// Analagously, there is danger of single overflow if N-1 > 0x7e -// in register biased form, 0x1007d -// SAFE: If there is danger of overflow set SAFE to 0 +// Can approximate result by exp(x)/2 in this region. +// Y_hi = Tjhi +// Y_lo = Tjhi * (p_odd + p_even) + Tjlo +// sinh(x) = Y_hi + Y_lo // -// 4. SINH_HUGE +// 4. SINH_HUGE |x| >= 11357.21655 ( 400c b174 ddc0 31ae c0ea ) // ============ -// SAFE: SAFE is always 0 for HUGE +// Set error tag and call error support +// // - -#include "libm_support.h" - // Assembly macros //============================================================== -sinh_FR_X = f44 -sinh_FR_X2 = f9 -sinh_FR_X4 = f10 -sinh_FR_SGNX = f40 -sinh_FR_all_ones = f45 -sinh_FR_tmp = f42 - -sinh_FR_Inv_log2by64 = f9 -sinh_FR_log2by64_lo = f11 -sinh_FR_log2by64_hi = f10 - -sinh_FR_A1 = f9 -sinh_FR_A2 = f10 -sinh_FR_A3 = f11 - -sinh_FR_Rcub = f12 -sinh_FR_M_temp = f13 -sinh_FR_R_temp = f13 -sinh_FR_Rsq = f13 -sinh_FR_R = f14 - -sinh_FR_M = f38 - -sinh_FR_B1 = f15 -sinh_FR_B2 = f32 -sinh_FR_B3 = f33 +r_ad5 = r14 +r_rshf_2to57 = r15 +r_exp_denorm = r15 +r_ad_mJ_lo = r15 +r_ad_J_lo = r16 +r_2Nm1 = r17 +r_2mNm1 = r18 +r_exp_x = r18 +r_ad_J_hi = r19 +r_ad2o = r19 +r_ad_mJ_hi = r20 +r_mj = r21 +r_ad2e = r22 +r_ad3 = r23 +r_ad1 = r24 +r_Mmj = r24 +r_rshf = r25 +r_M = r25 +r_N = r25 +r_jshf = r26 +r_exp_2tom57 = r26 +r_j = r26 
+r_exp_mask = r27 +r_signexp_x = r28 +r_signexp_sgnx_0_5 = r28 +r_exp_0_25 = r29 +r_sig_inv_ln2 = r30 +r_exp_32 = r30 +r_exp_huge = r30 +r_ad4 = r31 + +GR_SAVE_PFS = r34 +GR_SAVE_B0 = r35 +GR_SAVE_GP = r36 + +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 + + +f_ABS_X = f9 +f_X2 = f10 +f_X4 = f11 +f_tmp = f14 +f_RSHF = f15 + +f_Inv_log2by64 = f32 +f_log2by64_lo = f33 +f_log2by64_hi = f34 +f_A1 = f35 + +f_A2 = f36 +f_A3 = f37 +f_Rcub = f38 +f_M_temp = f39 +f_R_temp = f40 + +f_Rsq = f41 +f_R = f42 +f_M = f43 +f_B1 = f44 +f_B2 = f45 + +f_B3 = f46 +f_peven_temp1 = f47 +f_peven_temp2 = f48 +f_peven = f49 +f_podd_temp1 = f50 + +f_podd_temp2 = f51 +f_podd = f52 +f_poly65 = f53 +f_poly6543 = f53 +f_poly6to1 = f53 +f_poly43 = f54 +f_poly21 = f55 + +f_X3 = f56 +f_INV_LN2_2TO63 = f57 +f_RSHF_2TO57 = f58 +f_2TOM57 = f59 +f_smlst_oflow_input = f60 + +f_pre_result = f61 +f_huge = f62 +f_spos = f63 +f_sneg = f64 +f_Tjhi = f65 + +f_Tjlo = f66 +f_Tmjhi = f67 +f_Tmjlo = f68 +f_S_hi = f69 +f_SC_hi_temp = f70 + +f_S_lo_temp1 = f71 +f_S_lo_temp2 = f72 +f_S_lo_temp3 = f73 +f_S_lo_temp4 = f73 +f_S_lo = f74 +f_C_hi = f75 + +f_Y_hi = f77 +f_Y_lo_temp = f78 +f_Y_lo = f79 +f_NORM_X = f80 + +f_P1 = f81 +f_P2 = f82 +f_P3 = f83 +f_P4 = f84 +f_P5 = f85 + +f_P6 = f86 +f_Tjhi_spos = f87 +f_Tjlo_spos = f88 +f_huge = f89 +f_signed_hi_lo = f90 -sinh_FR_peven_temp1 = f34 -sinh_FR_peven_temp2 = f35 -sinh_FR_peven = f36 - -sinh_FR_podd_temp1 = f34 -sinh_FR_podd_temp2 = f35 -sinh_FR_podd = f37 - -sinh_FR_poly_podd_temp1 = f11 -sinh_FR_poly_podd_temp2 = f13 -sinh_FR_poly_peven_temp1 = f11 -sinh_FR_poly_peven_temp2 = f13 - -sinh_FR_J_temp = f9 -sinh_FR_J = f10 - -sinh_FR_Mmj = f39 - -sinh_FR_N_temp1 = f11 -sinh_FR_N_temp2 = f12 -sinh_FR_N = f13 - -sinh_FR_spos = f14 -sinh_FR_sneg = f15 - -sinh_FR_Tjhi = f32 -sinh_FR_Tjlo = f33 -sinh_FR_Tmjhi = f34 -sinh_FR_Tmjlo = f35 - -sinh_GR_mJ = r35 -sinh_GR_J = r36 - -sinh_AD_mJ = r38 -sinh_AD_J = r39 -sinh_GR_all_ones = 
r40 - -sinh_FR_S_hi = f9 -sinh_FR_S_hi_temp = f10 -sinh_FR_S_lo_temp1 = f11 -sinh_FR_S_lo_temp2 = f12 -sinh_FR_S_lo_temp3 = f13 - -sinh_FR_S_lo = f38 -sinh_FR_C_hi = f39 - -sinh_FR_C_hi_temp1 = f10 -sinh_FR_Y_hi = f11 -sinh_FR_Y_lo_temp = f12 -sinh_FR_Y_lo = f13 -sinh_FR_SINH = f9 - -sinh_FR_P1 = f14 -sinh_FR_P2 = f15 -sinh_FR_P3 = f32 -sinh_FR_P4 = f33 -sinh_FR_P5 = f34 -sinh_FR_P6 = f35 - -sinh_FR_TINY_THRESH = f9 - -sinh_FR_SINH_temp = f10 -sinh_FR_SCALE = f11 - -sinh_FR_signed_hi_lo = f10 - - -GR_SAVE_PFS = r41 -GR_SAVE_B0 = r42 -GR_SAVE_GP = r43 - -GR_Parameter_X = r44 -GR_Parameter_Y = r45 -GR_Parameter_RESULT = r46 // Data tables //============================================================== -#ifdef _LIBC -.rodata -#else -.data -#endif +// DO NOT CHANGE ORDER OF THESE TABLES +RODATA .align 16 -double_sinh_arg_reduction: -ASM_TYPE_DIRECTIVE(double_sinh_arg_reduction,@object) - data8 0xB8AA3B295C17F0BC, 0x00004005 - data8 0xB17217F7D1000000, 0x00003FF8 - data8 0xCF79ABC9E3B39804, 0x00003FD0 -ASM_SIZE_DIRECTIVE(double_sinh_arg_reduction) - -double_sinh_p_table: -ASM_TYPE_DIRECTIVE(double_sinh_p_table,@object) - data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC - data8 0x8888888888888412, 0x00003FF8 - data8 0xD00D00D00D4D39F2, 0x00003FF2 - data8 0xB8EF1D28926D8891, 0x00003FEC - data8 0xD732377688025BE9, 0x00003FE5 - data8 0xB08AF9AE78C1239F, 0x00003FDE -ASM_SIZE_DIRECTIVE(double_sinh_p_table) - -double_sinh_ab_table: -ASM_TYPE_DIRECTIVE(double_sinh_ab_table,@object) - data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC - data8 0x88888888884ECDD5, 0x00003FF8 - data8 0xD00D0C6DCC26A86B, 0x00003FF2 - data8 0x8000000000000002, 0x00003FFE - data8 0xAAAAAAAAAA402C77, 0x00003FFA - data8 0xB60B6CC96BDB144D, 0x00003FF5 -ASM_SIZE_DIRECTIVE(double_sinh_ab_table) - -double_sinh_j_table: -ASM_TYPE_DIRECTIVE(double_sinh_j_table,@object) - data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000 - data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000 - data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 
0x1DDC3CBC, 0x00000000 - data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000 - data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000 - data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000 - data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000 - data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000 - data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000 - data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000 - data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000 - data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000 - data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000 - data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000 - data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000 - data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000 - data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000 - data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000 - data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000 - data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000 - data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000 - data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000 - data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000 - data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000 - data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000 - data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000 - data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000 - data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000 - data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000 - data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000 - data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000 - data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000 - data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000 - data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000 - data8 
0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000 - data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000 - data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000 - data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000 - data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000 - data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000 - data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000 - data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000 - data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000 - data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000 - data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000 - data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000 - data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000 - data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000 - data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000 - data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000 - data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000 - data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000 - data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000 - data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000 - data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000 - data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000 - data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000 - data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000 - data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000 - data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000 - data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000 - data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000 - data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000 - data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000 - data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000 -ASM_SIZE_DIRECTIVE(double_sinh_j_table) - -.align 32 
-.global sinhl# - -.section .text -.proc sinhl# -.align 32 - -sinhl: -#ifdef _LIBC -.global __ieee754_sinhl -.type __ieee754_sinhl,@function -__ieee754_sinhl: -#endif - -// X infinity or NAN? -// Take invalid fault if enabled - +LOCAL_OBJECT_START(sinh_arg_reduction) +// data8 0xB8AA3B295C17F0BC, 0x00004005 // 64/log2 -- signif loaded with setf + data8 0xB17217F7D1000000, 0x00003FF8 // log2/64 high part + data8 0xCF79ABC9E3B39804, 0x00003FD0 // log2/64 low part + data8 0xb174ddc031aec0ea, 0x0000400c // Smallest x to overflow (11357.21655) +LOCAL_OBJECT_END(sinh_arg_reduction) + +LOCAL_OBJECT_START(sinh_p_table) + data8 0xB08AF9AE78C1239F, 0x00003FDE // P6 + data8 0xB8EF1D28926D8891, 0x00003FEC // P4 + data8 0x8888888888888412, 0x00003FF8 // P2 + data8 0xD732377688025BE9, 0x00003FE5 // P5 + data8 0xD00D00D00D4D39F2, 0x00003FF2 // P3 + data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC // P1 +LOCAL_OBJECT_END(sinh_p_table) + +LOCAL_OBJECT_START(sinh_ab_table) + data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC // A1 + data8 0x88888888884ECDD5, 0x00003FF8 // A2 + data8 0xD00D0C6DCC26A86B, 0x00003FF2 // A3 + data8 0x8000000000000002, 0x00003FFE // B1 + data8 0xAAAAAAAAAA402C77, 0x00003FFA // B2 + data8 0xB60B6CC96BDB144D, 0x00003FF5 // B3 +LOCAL_OBJECT_END(sinh_ab_table) + +LOCAL_OBJECT_START(sinh_j_hi_table) + data8 0xB504F333F9DE6484, 0x00003FFE + data8 0xB6FD91E328D17791, 0x00003FFE + data8 0xB8FBAF4762FB9EE9, 0x00003FFE + data8 0xBAFF5AB2133E45FB, 0x00003FFE + data8 0xBD08A39F580C36BF, 0x00003FFE + data8 0xBF1799B67A731083, 0x00003FFE + data8 0xC12C4CCA66709456, 0x00003FFE + data8 0xC346CCDA24976407, 0x00003FFE + data8 0xC5672A115506DADD, 0x00003FFE + data8 0xC78D74C8ABB9B15D, 0x00003FFE + data8 0xC9B9BD866E2F27A3, 0x00003FFE + data8 0xCBEC14FEF2727C5D, 0x00003FFE + data8 0xCE248C151F8480E4, 0x00003FFE + data8 0xD06333DAEF2B2595, 0x00003FFE + data8 0xD2A81D91F12AE45A, 0x00003FFE + data8 0xD4F35AABCFEDFA1F, 0x00003FFE + data8 0xD744FCCAD69D6AF4, 0x00003FFE + data8 0xD99D15C278AFD7B6, 
0x00003FFE + data8 0xDBFBB797DAF23755, 0x00003FFE + data8 0xDE60F4825E0E9124, 0x00003FFE + data8 0xE0CCDEEC2A94E111, 0x00003FFE + data8 0xE33F8972BE8A5A51, 0x00003FFE + data8 0xE5B906E77C8348A8, 0x00003FFE + data8 0xE8396A503C4BDC68, 0x00003FFE + data8 0xEAC0C6E7DD24392F, 0x00003FFE + data8 0xED4F301ED9942B84, 0x00003FFE + data8 0xEFE4B99BDCDAF5CB, 0x00003FFE + data8 0xF281773C59FFB13A, 0x00003FFE + data8 0xF5257D152486CC2C, 0x00003FFE + data8 0xF7D0DF730AD13BB9, 0x00003FFE + data8 0xFA83B2DB722A033A, 0x00003FFE + data8 0xFD3E0C0CF486C175, 0x00003FFE + data8 0x8000000000000000, 0x00003FFF // Center of table + data8 0x8164D1F3BC030773, 0x00003FFF + data8 0x82CD8698AC2BA1D7, 0x00003FFF + data8 0x843A28C3ACDE4046, 0x00003FFF + data8 0x85AAC367CC487B15, 0x00003FFF + data8 0x871F61969E8D1010, 0x00003FFF + data8 0x88980E8092DA8527, 0x00003FFF + data8 0x8A14D575496EFD9A, 0x00003FFF + data8 0x8B95C1E3EA8BD6E7, 0x00003FFF + data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF + data8 0x8EA4398B45CD53C0, 0x00003FFF + data8 0x9031DC431466B1DC, 0x00003FFF + data8 0x91C3D373AB11C336, 0x00003FFF + data8 0x935A2B2F13E6E92C, 0x00003FFF + data8 0x94F4EFA8FEF70961, 0x00003FFF + data8 0x96942D3720185A00, 0x00003FFF + data8 0x9837F0518DB8A96F, 0x00003FFF + data8 0x99E0459320B7FA65, 0x00003FFF + data8 0x9B8D39B9D54E5539, 0x00003FFF + data8 0x9D3ED9A72CFFB751, 0x00003FFF + data8 0x9EF5326091A111AE, 0x00003FFF + data8 0xA0B0510FB9714FC2, 0x00003FFF + data8 0xA27043030C496819, 0x00003FFF + data8 0xA43515AE09E6809E, 0x00003FFF + data8 0xA5FED6A9B15138EA, 0x00003FFF + data8 0xA7CD93B4E965356A, 0x00003FFF + data8 0xA9A15AB4EA7C0EF8, 0x00003FFF + data8 0xAB7A39B5A93ED337, 0x00003FFF + data8 0xAD583EEA42A14AC6, 0x00003FFF + data8 0xAF3B78AD690A4375, 0x00003FFF + data8 0xB123F581D2AC2590, 0x00003FFF + data8 0xB311C412A9112489, 0x00003FFF + data8 0xB504F333F9DE6484, 0x00003FFF +LOCAL_OBJECT_END(sinh_j_hi_table) + +LOCAL_OBJECT_START(sinh_j_lo_table) + data4 0x1EB2FB13 + data4 0x1CE2CBE2 + data4 0x1DDC3CBC + 
data4 0x1EE9AA34 + data4 0x9EAEFDC1 + data4 0x9DBF517B + data4 0x1EF88AFB + data4 0x1E03B216 + data4 0x1E78AB43 + data4 0x9E7B1747 + data4 0x9EFE3C0E + data4 0x9D36F837 + data4 0x9DEE53E4 + data4 0x9E24AE8E + data4 0x1D912473 + data4 0x1EB243BE + data4 0x1E669A2F + data4 0x9BBC610A + data4 0x1E761035 + data4 0x9E0BE175 + data4 0x1CCB12A1 + data4 0x1D1BFE90 + data4 0x1DF2F47A + data4 0x1EF22F22 + data4 0x9E3F4A29 + data4 0x1EC01A5B + data4 0x1E8CAC3A + data4 0x9DBB3FAB + data4 0x1EF73A19 + data4 0x9BB795B5 + data4 0x1EF84B76 + data4 0x9EF5818B + data4 0x00000000 // Center of table + data4 0x1F77CACA + data4 0x1EF8A91D + data4 0x1E57C976 + data4 0x9EE8DA92 + data4 0x1EE85C9F + data4 0x1F3BF1AF + data4 0x1D80CA1E + data4 0x9D0373AF + data4 0x9F167097 + data4 0x1EB70051 + data4 0x1F6EB029 + data4 0x1DFD6D8E + data4 0x9EB319B0 + data4 0x1EBA2BEB + data4 0x1F11D537 + data4 0x1F0D5A46 + data4 0x9E5E7BCA + data4 0x9F3AAFD1 + data4 0x9E86DACC + data4 0x9F3EDDC2 + data4 0x1E496E3D + data4 0x9F490BF6 + data4 0x1DD1DB48 + data4 0x1E65EBFB + data4 0x9F427496 + data4 0x1F283C4A + data4 0x1F4B0047 + data4 0x1F130152 + data4 0x9E8367C0 + data4 0x9F705F90 + data4 0x1EFB3C53 + data4 0x1F32FB13 +LOCAL_OBJECT_END(sinh_j_lo_table) -{ .mfi - alloc r32 = ar.pfs,0,12,4,0 -(p0) fclass.m.unc p6,p0 = f8, 0xe3 //@qnan | @snan | @inf - mov sinh_GR_all_ones = -1 -} -;; +.section .text +GLOBAL_IEEE754_ENTRY(sinhl) -{ .mfb - nop.m 999 -(p6) fma.s0 f8 = f8,f1,f8 -(p6) br.ret.spnt b0 ;; -} - -// Put 0.25 in f9; p6 true if x < 0.25 -// Make constant that will generate inexact when squared { .mlx - setf.sig sinh_FR_all_ones = sinh_GR_all_ones -(p0) movl r32 = 0x000000000000fffd ;; -} - -{ .mfi -(p0) setf.exp f9 = r32 -(p0) fclass.m.unc p7,p0 = f8, 0x07 //@zero - nop.i 999 ;; -} - -{ .mfb - nop.m 999 -(p0) fmerge.s sinh_FR_X = f0,f8 -(p7) br.ret.spnt b0 ;; + getf.exp r_signexp_x = f8 // Get signexp of x, must redo if unorm + movl r_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2 } - -// 
Identify denormal operands. -{ .mfi - nop.m 999 - fclass.m.unc p10,p0 = f8, 0x09 // + denorm - nop.i 999 -};; -{ .mfi - nop.m 999 - fclass.m.unc p11,p0 = f8, 0x0a // - denorm - nop.i 999 +{ .mlx + addl r_ad1 = @ltoff(sinh_arg_reduction), gp + movl r_rshf_2to57 = 0x4778000000000000 // 1.10000 2^(63+57) } +;; { .mfi - nop.m 999 -(p0) fmerge.s sinh_FR_SGNX = f8,f1 - nop.i 999 ;; + ld8 r_ad1 = [r_ad1] + fmerge.s f_ABS_X = f0,f8 + mov r_exp_0_25 = 0x0fffd // Form exponent for 0.25 } - { .mfi - nop.m 999 -(p0) fcmp.lt.unc.s1 p0,p7 = sinh_FR_X,f9 - nop.i 999 ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p7) br.cond.sptk L(SINH_BY_TBL) ;; -} - - -L(SINH_BY_POLY): - -// POLY cannot overflow so there is no need to call __libm_error_support -// Set tiny_SAFE (p7) to 1(0) if answer is not tiny -// Currently we do not use tiny_SAFE. So the setting of tiny_SAFE is -// commented out. -//(p0) movl r32 = 0x000000000000fc01 -//(p0) setf.exp f10 = r32 -//(p0) fcmp.lt.unc.s1 p6,p7 = f8,f10 -// Here is essentially the algorithm for SINH_BY_POLY. Care is take for the order -// of multiplication; and P_1 is not exactly 1/3!, P_2 is not exactly 1/5!, etc. -// Note that ax = |x| -// sinh(x) = sign * (series(e^x) - series(e^-x))/2 -// = sign * (ax + ax^3/3! + ax^5/5! + ax^7/7! + ax^9/9! + ax^11/11! + ax^13/13!) -// = sign * (ax + ax * ( ax^2 * (1/3! + ax^4 * (1/7! + ax^4*1/11!)) ) -// + ax * ( ax^4 * (1/5! + ax^4 * (1/9! 
+ ax^4*1/13!)) ) ) -// = sign * (ax + ax*p_odd + (ax*p_even)) -// = sign * (ax + Y_lo) -// sinh(x) = sign * (Y_hi + Y_lo) -// Get the values of P_x from the table -{ .mfb -(p0) addl r34 = @ltoff(double_sinh_p_table), gp -(p10) fma.s0 f8 = f8,f8,f8 -(p10) br.ret.spnt b0 -} -;; - -{ .mfb - ld8 r34 = [r34] -(p11) fnma.s0 f8 = f8,f8,f8 -(p11) br.ret.spnt b0 + nop.m 0 + fnorm.s1 f_NORM_X = f8 + mov r_exp_2tom57 = 0xffff-57 } ;; -// Calculate sinh_FR_X2 = ax*ax and sinh_FR_X4 = ax*ax*ax*ax -{ .mmf - nop.m 999 -(p0) ldfe sinh_FR_P1 = [r34],16 -(p0) fma.s1 sinh_FR_X2 = sinh_FR_X, sinh_FR_X, f0 ;; -} - -{ .mmi -(p0) ldfe sinh_FR_P2 = [r34],16 ;; -(p0) ldfe sinh_FR_P3 = [r34],16 - nop.i 999 ;; -} - -{ .mmi -(p0) ldfe sinh_FR_P4 = [r34],16 ;; -(p0) ldfe sinh_FR_P5 = [r34],16 - nop.i 999 ;; -} - { .mfi -(p0) ldfe sinh_FR_P6 = [r34],16 -(p0) fma.s1 sinh_FR_X4 = sinh_FR_X2, sinh_FR_X2, f0 - nop.i 999 ;; + setf.d f_RSHF_2TO57 = r_rshf_2to57 // Form const 1.100 * 2^120 + fclass.m p10,p0 = f8, 0x0b // Test for denorm + mov r_exp_mask = 0x1ffff } - -// Calculate sinh_FR_podd = p_odd and sinh_FR_peven = p_even -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_poly_podd_temp1 = sinh_FR_X4, sinh_FR_P5, sinh_FR_P3 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_poly_podd_temp2 = sinh_FR_X4, sinh_FR_poly_podd_temp1, sinh_FR_P1 - nop.i 999 +{ .mlx + setf.sig f_INV_LN2_2TO63 = r_sig_inv_ln2 // Form 1/ln2 * 2^63 + movl r_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift } +;; { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_poly_peven_temp1 = sinh_FR_X4, sinh_FR_P6, sinh_FR_P4 - nop.i 999 ;; + nop.m 0 + fclass.m p7,p0 = f8, 0x07 // Test if x=0 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_podd = sinh_FR_X2, sinh_FR_poly_podd_temp2, f0 - nop.i 999 + setf.exp f_2TOM57 = r_exp_2tom57 // Form 2^-57 for scaling + nop.f 0 + add r_ad3 = 0x90, r_ad1 // Point to ab_table } +;; { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_poly_peven_temp2 = sinh_FR_X4, sinh_FR_poly_peven_temp1, sinh_FR_P2 - 
nop.i 999 ;; + setf.d f_RSHF = r_rshf // Form right shift const 1.100 * 2^63 + fclass.m p6,p0 = f8, 0xe3 // Test if x nan, inf + add r_ad4 = 0x2f0, r_ad1 // Point to j_hi_table midpoint } - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_peven = sinh_FR_X4, sinh_FR_poly_peven_temp2, f0 - nop.i 999 ;; +{ .mib + add r_ad2e = 0x20, r_ad1 // Point to p_table + nop.i 0 +(p10) br.cond.spnt SINH_DENORM // Branch if x denorm } +;; -// Calculate sinh_FR_Y_lo = ax*p_odd + (ax*p_even) +// Common path -- return here from SINH_DENORM if x is unnorm +SINH_COMMON: { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_X, sinh_FR_peven, f0 - nop.i 999 ;; + ldfe f_smlst_oflow_input = [r_ad2e],16 + nop.f 0 + add r_ad5 = 0x580, r_ad1 // Point to j_lo_table midpoint } - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_X, sinh_FR_podd, sinh_FR_Y_lo_temp - nop.i 999 ;; +{ .mib + ldfe f_log2by64_hi = [r_ad1],16 + and r_exp_x = r_exp_mask, r_signexp_x +(p7) br.ret.spnt b0 // Exit if x=0 } +;; -// Calculate sinh_FR_SINH = Y_hi + Y_lo. Note that ax = Y_hi -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_SINH = sinh_FR_X, f1, sinh_FR_Y_lo - nop.i 999 ;; -} -// Dummy multiply to generate inexact +// Get the A coefficients for SINH_BY_TBL { .mfi - nop.m 999 -(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones - nop.i 999 + ldfe f_A1 = [r_ad3],16 + fcmp.lt.s1 p8,p9 = f8,f0 // Test for x<0 + cmp.lt p7,p0 = r_exp_x, r_exp_0_25 // Test x < 0.25 } - -// Calculate f8 = sign * (Y_hi + Y_lo) -// Go to return { .mfb - nop.m 999 -(p0) fma.s0 f8 = sinh_FR_SGNX,sinh_FR_SINH,f0 -(p0) br.ret.sptk b0 ;; -} - - -L(SINH_BY_TBL): - -// Now that we are at TBL; so far all we know is that |x| >= 0.25. -// The first two steps are the same for TBL and EXP, but if we are HUGE -// we want to leave now. 
-// Double-extended: -// Go to HUGE if |x| >= 2^14, 1000d (register-biased) is e = 14 (true) -// Double -// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true) -// Single -// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true) - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x000000000001000d ;; -} - -{ .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + add r_ad2o = 0x30, r_ad2e // Point to p_table odd coeffs +(p6) fma.s0 f8 = f8,f1,f0 // Result for x nan, inf +(p6) br.ret.spnt b0 // Exit for x nan, inf } +;; +// Calculate X2 = ax*ax for SINH_BY_POLY { .mfi - nop.m 999 -(p0) fcmp.ge.unc.s1 p6,p7 = sinh_FR_X,f9 - nop.i 999 ;; + ldfe f_log2by64_lo = [r_ad1],16 + nop.f 0 + nop.i 0 } - -{ .mib - nop.m 999 - nop.i 999 -(p6) br.cond.spnt L(SINH_HUGE) ;; +{ .mfb + ldfe f_A2 = [r_ad3],16 + fma.s1 f_X2 = f_NORM_X, f_NORM_X, f0 +(p7) br.cond.spnt SINH_BY_POLY } +;; -// r32 = 1 -// r34 = N-1 -// r35 = N -// r36 = j -// r37 = N+1 - -// TBL can never overflow -// sinh(x) = sinh(B+R) -// = sinh(B)cosh(R) + cosh(B)sinh(R) -// -// ax = |x| = M*log2/64 + R -// B = M*log2/64 -// M = 64*N + j -// We will calcualte M and get N as (M-j)/64 -// The division is a shift. -// exp(B) = exp(N*log2 + j*log2/64) -// = 2^N * 2^(j*log2/64) -// sinh(B) = 1/2(e^B -e^-B) -// = 1/2(2^N * 2^(j*log2/64) - 2^-N * 2^(-j*log2/64)) -// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64)) -// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64)) -// 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32 -// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit) -// R = ax - M*log2/64 -// R = ax - M*log2_by_64_hi - M*log2_by_64_lo -// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...) 
-// = 1 + p_odd + p_even -// where the p_even uses the A coefficients and the p_even uses the B coefficients -// So sinh(R) = 1 + p_odd + p_even -(1 -p_odd -p_even)/2 = p_odd -// cosh(R) = 1 + p_even -// sinh(B) = S_hi + S_lo -// cosh(B) = C_hi -// sinh(x) = sinh(B)cosh(R) + cosh(B)sinh(R) +// Here if |x| >= 0.25 +SINH_BY_TBL: // ****************************************************** -// STEP 1 (TBL and EXP) +// STEP 1 (TBL and EXP) - Argument reduction // ****************************************************** // Get the following constants. -// f9 = Inv_log2by64 -// f10 = log2by64_hi -// f11 = log2by64_lo - -{ .mmi -(p0) adds r32 = 0x1,r0 -(p0) addl r34 = @ltoff(double_sinh_arg_reduction), gp - nop.i 999 -} -;; - -{ .mmi - ld8 r34 = [r34] - nop.m 999 - nop.i 999 -} -;; +// Inv_log2by64 +// log2by64_hi +// log2by64_lo // We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and // put them in an exponent. -// sinh_FR_spos = 2^(N-1) and sinh_FR_sneg = 2^(-N-1) -// r39 = 0xffff + (N-1) = 0xffff +N -1 -// r40 = 0xffff - (N +1) = 0xffff -N -1 - -{ .mlx - nop.m 999 -(p0) movl r38 = 0x000000000000fffe ;; -} - -{ .mmi -(p0) ldfe sinh_FR_Inv_log2by64 = [r34],16 ;; -(p0) ldfe sinh_FR_log2by64_hi = [r34],16 - nop.i 999 ;; -} - -{ .mbb -(p0) ldfe sinh_FR_log2by64_lo = [r34],16 - nop.b 999 - nop.b 999 ;; -} +// f_spos = 2^(N-1) and f_sneg = 2^(-N-1) +// 0xffff + (N-1) = 0xffff +N -1 +// 0xffff - (N +1) = 0xffff -N -1 -// Get the A coefficients -// f9 = A_1 -// f10 = A_2 -// f11 = A_3 -{ .mmi - nop.m 999 -(p0) addl r34 = @ltoff(double_sinh_ab_table), gp - nop.i 999 -} -;; +// Calculate M and keep it as integer and floating point. 
+// M = round-to-integer(x*Inv_log2by64) +// f_M = M = truncate(ax/(log2/64)) +// Put the integer representation of M in r_M +// and the floating point representation of M in f_M +// Get the remaining A,B coefficients { .mmi - ld8 r34 = [r34] - nop.m 999 - nop.i 999 + ldfe f_A3 = [r_ad3],16 + nop.m 0 + nop.i 0 } ;; - -// Calculate M and keep it as integer and floating point. -// f38 = M = round-to-integer(x*Inv_log2by64) -// sinh_FR_M = M = truncate(ax/(log2/64)) -// Put the significand of M in r35 -// and the floating point representation of M in sinh_FR_M - +.pred.rel "mutex",p8,p9 +// Use constant (1.100*2^(63-6)) to get rounded M into rightmost significand +// |x| * 64 * 1/ln2 * 2^(63-6) + 1.1000 * 2^(63+(63-6)) { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_M = sinh_FR_X, sinh_FR_Inv_log2by64, f0 - nop.i 999 +(p8) mov r_signexp_sgnx_0_5 = 0x2fffe // signexp of -0.5 + fma.s1 f_M_temp = f_ABS_X, f_INV_LN2_2TO63, f_RSHF_2TO57 +(p9) mov r_signexp_sgnx_0_5 = 0x0fffe // signexp of +0.5 } +;; +// Test for |x| >= overflow limit { .mfi -(p0) ldfe sinh_FR_A1 = [r34],16 - nop.f 999 - nop.i 999 ;; + ldfe f_B1 = [r_ad3],16 + fcmp.ge.s1 p6,p0 = f_ABS_X, f_smlst_oflow_input + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fcvt.fx.s1 sinh_FR_M_temp = sinh_FR_M - nop.i 999 ;; + ldfe f_B2 = [r_ad3],16 + nop.f 0 + mov r_exp_32 = 0x10004 } +;; -{ .mfi - nop.m 999 -(p0) fnorm.s1 sinh_FR_M = sinh_FR_M_temp - nop.i 999 ;; +// Subtract RSHF constant to get rounded M as a floating point value +// M_temp * 2^(63-6) - 2^63 +{ .mfb + ldfe f_B3 = [r_ad3],16 + fms.s1 f_M = f_M_temp, f_2TOM57, f_RSHF +(p6) br.cond.spnt SINH_HUGE // Branch if result will overflow } +;; { .mfi -(p0) getf.sig r35 = sinh_FR_M_temp - nop.f 999 - nop.i 999 ;; + getf.sig r_M = f_M_temp + nop.f 0 + cmp.ge p7,p6 = r_exp_x, r_exp_32 // Test if x >= 32 } +;; -// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It +// Calculate j. j is the signed extension of the six lsb of M. 
It // has a range of -32 thru 31. -// r35 = M -// r36 = j - -{ .mii - nop.m 999 - nop.i 999 ;; -(p0) and r36 = 0x3f, r35 ;; -} // Calculate R -// f13 = f44 - f12*f10 = ax - M*log2by64_hi -// f14 = f13 - f8*f11 = R = (ax - M*log2by64_hi) - M*log2by64_lo - -{ .mfi - nop.m 999 -(p0) fnma.s1 sinh_FR_R_temp = sinh_FR_M, sinh_FR_log2by64_hi, sinh_FR_X - nop.i 999 -} +// ax - M*log2by64_hi +// R = (ax - M*log2by64_hi) - M*log2by64_lo { .mfi -(p0) ldfe sinh_FR_A2 = [r34],16 - nop.f 999 - nop.i 999 ;; + nop.m 0 + fnma.s1 f_R_temp = f_M, f_log2by64_hi, f_ABS_X + and r_j = 0x3f, r_M } +;; -{ .mfi - nop.m 999 -(p0) fnma.s1 sinh_FR_R = sinh_FR_M, sinh_FR_log2by64_lo, sinh_FR_R_temp - nop.i 999 +{ .mii + nop.m 0 + shl r_jshf = r_j, 0x2 // Shift j so can sign extend it +;; + sxt1 r_jshf = r_jshf } +;; -// Get the B coefficients -// f15 = B_1 -// f32 = B_2 -// f33 = B_3 - -{ .mmi -(p0) ldfe sinh_FR_A3 = [r34],16 ;; -(p0) ldfe sinh_FR_B1 = [r34],16 - nop.i 999 ;; +{ .mii + nop.m 0 + shr r_j = r_jshf, 0x2 // Now j has range -32 to 31 + nop.i 0 } +;; { .mmi -(p0) ldfe sinh_FR_B2 = [r34],16 ;; -(p0) ldfe sinh_FR_B3 = [r34],16 - nop.i 999 ;; -} - -{ .mii - nop.m 999 -(p0) shl r34 = r36, 0x2 ;; -(p0) sxt1 r37 = r34 ;; + shladd r_ad_J_hi = r_j, 4, r_ad4 // pointer to Tjhi + sub r_Mmj = r_M, r_j // M-j + sub r_mj = r0, r_j // Form -j } +;; -// ****************************************************** -// STEP 2 (TBL and EXP) -// ****************************************************** -// Calculate Rsquared and Rcubed in preparation for p_even and p_odd -// f12 = R*R*R -// f13 = R*R -// f14 = R <== from above - +// The TBL and EXP branches are merged and predicated +// If TBL, p6 true, 0.25 <= |x| < 32 +// If EXP, p7 true, 32 <= |x| < overflow_limit +// +// N = (M-j)/64 { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Rsq = sinh_FR_R, sinh_FR_R, f0 -(p0) shr r36 = r37, 0x2 ;; + ldfe f_Tjhi = [r_ad_J_hi] + fnma.s1 f_R = f_M, f_log2by64_lo, f_R_temp + shr r_N = r_Mmj, 0x6 // N = (M-j)/64 } - -// r34 = 
M-j = r35 - r36 -// r35 = N = (M-j)/64 - -{ .mii -(p0) sub r34 = r35, r36 - nop.i 999 ;; -(p0) shr r35 = r34, 0x6 ;; +{ .mfi + shladd r_ad_mJ_hi = r_mj, 4, r_ad4 // pointer to Tmjhi + nop.f 0 + shladd r_ad_mJ_lo = r_mj, 2, r_ad5 // pointer to Tmjlo } +;; -{ .mii -(p0) sub r40 = r38, r35 -(p0) adds r37 = 0x1, r35 -(p0) add r39 = r38, r35 ;; +{ .mfi + sub r_2mNm1 = r_signexp_sgnx_0_5, r_N // signexp sgnx*2^(-N-1) + nop.f 0 + shladd r_ad_J_lo = r_j, 2, r_ad5 // pointer to Tjlo } - -// Get the address of the J table, add the offset, -// addresses are sinh_AD_mJ and sinh_AD_J, get the T value -// f32 = T(j)_hi -// f33 = T(j)_lo -// f34 = T(-j)_hi -// f35 = T(-j)_lo - -{ .mmi -(p0) sub r34 = r35, r32 -(p0) addl r37 = @ltoff(double_sinh_j_table), gp - nop.i 999 +{ .mfi + ldfe f_Tmjhi = [r_ad_mJ_hi] + nop.f 0 + add r_2Nm1 = r_signexp_sgnx_0_5, r_N // signexp sgnx*2^(N-1) } ;; -{ .mmi - ld8 r37 = [r37] - nop.m 999 - nop.i 999 +{ .mmf + ldfs f_Tmjlo = [r_ad_mJ_lo] + setf.exp f_sneg = r_2mNm1 // Form sgnx * 2^(-N-1) + nop.f 0 } ;; - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Rcub = sinh_FR_Rsq, sinh_FR_R, f0 - nop.i 999 +{ .mmf + ldfs f_Tjlo = [r_ad_J_lo] + setf.exp f_spos = r_2Nm1 // Form sgnx * 2^(N-1) + nop.f 0 } +;; // ****************************************************** -// STEP 3 Now decide if we need to branch to EXP +// STEP 2 (TBL and EXP) // ****************************************************** -// Put 32 in f9; p6 true if x < 32 -// Go to EXP if |x| >= 32 +// Calculate Rsquared and Rcubed in preparation for p_even and p_odd -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000010004 ;; +{ .mmf + nop.m 0 + nop.m 0 + fma.s1 f_Rsq = f_R, f_R, f0 } +;; -// Calculate p_even -// f34 = B_2 + Rsq *B_3 -// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3) -// f36 = p_even = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3)) - -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_peven_temp1 = sinh_FR_Rsq, sinh_FR_B3, sinh_FR_B2 - nop.i 999 ;; -} +// Calculate p_even +// B_2 + Rsq *B_3 +// 
B_1 + Rsq * (B_2 + Rsq *B_3) +// p_even = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3)) { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_peven_temp2 = sinh_FR_Rsq, sinh_FR_peven_temp1, sinh_FR_B1 - nop.i 999 + nop.m 0 + fma.s1 f_peven_temp1 = f_Rsq, f_B3, f_B2 + nop.i 0 } - // Calculate p_odd -// f34 = A_2 + Rsq *A_3 -// f35 = A_1 + Rsq * (A_2 + Rsq *A_3) -// f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3)) - +// A_2 + Rsq *A_3 +// A_1 + Rsq * (A_2 + Rsq *A_3) +// podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3)) { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_podd_temp1 = sinh_FR_Rsq, sinh_FR_A3, sinh_FR_A2 - nop.i 999 ;; + nop.m 0 + fma.s1 f_podd_temp1 = f_Rsq, f_A3, f_A2 + nop.i 0 } +;; { .mfi -(p0) setf.exp sinh_FR_N_temp1 = r39 - nop.f 999 - nop.i 999 ;; + nop.m 0 + fma.s1 f_Rcub = f_Rsq, f_R, f0 + nop.i 0 } +;; -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_peven = sinh_FR_Rsq, sinh_FR_peven_temp2, f0 - nop.i 999 -} +// +// If TBL, +// Calculate S_hi and S_lo, and C_hi +// SC_hi_temp = sneg * Tmjhi +// S_hi = spos * Tjhi - SC_hi_temp +// S_hi = spos * Tjhi - (sneg * Tmjhi) +// C_hi = spos * Tjhi + SC_hi_temp +// C_hi = spos * Tjhi + (sneg * Tmjhi) { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_podd_temp2 = sinh_FR_Rsq, sinh_FR_podd_temp1, sinh_FR_A1 - nop.i 999 ;; + nop.m 0 +(p6) fma.s1 f_SC_hi_temp = f_sneg, f_Tmjhi, f0 + nop.i 0 } +;; +// If TBL, +// S_lo_temp3 = sneg * Tmjlo +// S_lo_temp4 = spos * Tjlo - S_lo_temp3 +// S_lo_temp4 = spos * Tjlo -(sneg * Tmjlo) { .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + nop.m 0 +(p6) fma.s1 f_S_lo_temp3 = f_sneg, f_Tmjlo, f0 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_podd = sinh_FR_podd_temp2, sinh_FR_Rcub, sinh_FR_R - nop.i 999 -} - -// sinh_GR_mj contains the table offset for -j -// sinh_GR_j contains the table offset for +j -// p6 is true when j <= 0 - -{ .mlx -(p0) setf.exp sinh_FR_N_temp2 = r40 -(p0) movl r40 = 0x0000000000000020 ;; + nop.m 0 + fma.s1 f_peven_temp2 = f_Rsq, f_peven_temp1, f_B1 + nop.i 0 } - { .mfi -(p0) 
sub sinh_GR_mJ = r40, r36 -(p0) fmerge.se sinh_FR_spos = sinh_FR_N_temp1, f1 -(p0) adds sinh_GR_J = 0x20, r36 ;; -} - -{ .mii - nop.m 999 -(p0) shl sinh_GR_mJ = sinh_GR_mJ, 5 ;; -(p0) add sinh_AD_mJ = r37, sinh_GR_mJ ;; -} - -{ .mmi - nop.m 999 -(p0) ldfe sinh_FR_Tmjhi = [sinh_AD_mJ],16 -(p0) shl sinh_GR_J = sinh_GR_J, 5 ;; + nop.m 0 + fma.s1 f_podd_temp2 = f_Rsq, f_podd_temp1, f_A1 + nop.i 0 } +;; +// If EXP, +// Compute sgnx * 2^(N-1) * Tjhi and sgnx * 2^(N-1) * Tjlo { .mfi -(p0) ldfs sinh_FR_Tmjlo = [sinh_AD_mJ],16 -(p0) fcmp.lt.unc.s1 p0,p7 = sinh_FR_X,f9 -(p0) add sinh_AD_J = r37, sinh_GR_J ;; -} - -{ .mmi -(p0) ldfe sinh_FR_Tjhi = [sinh_AD_J],16 ;; -(p0) ldfs sinh_FR_Tjlo = [sinh_AD_J],16 - nop.i 999 ;; -} - -{ .mfb - nop.m 999 -(p0) fmerge.se sinh_FR_sneg = sinh_FR_N_temp2, f1 -(p7) br.cond.spnt L(SINH_BY_EXP) ;; + nop.m 0 +(p7) fma.s1 f_Tjhi_spos = f_Tjhi, f_spos, f0 + nop.i 0 } - { .mfi - nop.m 999 - nop.f 999 - nop.i 999 ;; + nop.m 0 +(p7) fma.s1 f_Tjlo_spos = f_Tjlo, f_spos, f0 + nop.i 0 } - -// ****************************************************** -// If NOT branch to EXP -// ****************************************************** -// Calculate S_hi and S_lo -// sinh_FR_S_hi_temp = sinh_FR_sneg * sinh_FR_Tmjhi -// sinh_FR_S_hi = sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi_temp -// sinh_FR_S_hi = sinh_FR_spos * sinh_FR_Tjhi - (sinh_FR_sneg * sinh_FR_Tmjlo) +;; { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_S_hi_temp = sinh_FR_sneg, sinh_FR_Tmjhi, f0 - nop.i 999 ;; + nop.m 0 +(p6) fms.s1 f_S_hi = f_spos, f_Tjhi, f_SC_hi_temp + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fms.s1 sinh_FR_S_hi = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_S_hi_temp - nop.i 999 + nop.m 0 +(p6) fma.s1 f_C_hi = f_spos, f_Tjhi, f_SC_hi_temp + nop.i 0 } - -// Calculate C_hi -// sinh_FR_C_hi_temp1 = sinh_FR_sneg * sinh_FR_Tmjhi -// sinh_FR_C_hi = sinh_FR_spos * sinh_FR_Tjhi + sinh_FR_C_hi_temp1 - { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_C_hi_temp1 = sinh_FR_sneg, sinh_FR_Tmjhi, f0 - nop.i 999 ;; + nop.m 
0 +(p6) fms.s1 f_S_lo_temp4 = f_spos, f_Tjlo, f_S_lo_temp3 + nop.i 0 } - -// sinh_FR_S_lo_temp1 = sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi -// sinh_FR_S_lo_temp2 = -sinh_FR_sneg * sinh_FR_Tmjlo + (sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi) -// sinh_FR_S_lo_temp2 = -sinh_FR_sneg * sinh_FR_Tmjlo + (sinh_FR_S_lo_temp1 ) +;; { .mfi - nop.m 999 -(p0) fms.s1 sinh_FR_S_lo_temp1 = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_S_hi - nop.i 999 + nop.m 0 + fma.s1 f_peven = f_Rsq, f_peven_temp2, f0 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_C_hi = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_C_hi_temp1 - nop.i 999 ;; + nop.m 0 + fma.s1 f_podd = f_podd_temp2, f_Rcub, f_R + nop.i 0 } +;; + +// If TBL, +// S_lo_temp1 = spos * Tjhi - S_hi +// S_lo_temp2 = -sneg * Tmjlo + S_lo_temp1 +// S_lo_temp2 = -sneg * Tmjlo + (spos * Tjhi - S_hi) { .mfi - nop.m 999 -(p0) fnma.s1 sinh_FR_S_lo_temp2 = sinh_FR_sneg, sinh_FR_Tmjhi, sinh_FR_S_lo_temp1 - nop.i 999 + nop.m 0 +(p6) fms.s1 f_S_lo_temp1 = f_spos, f_Tjhi, f_S_hi + nop.i 0 } - -// sinh_FR_S_lo_temp1 = sinh_FR_sneg * sinh_FR_Tmjlo -// sinh_FR_S_lo_temp3 = sinh_FR_spos * sinh_FR_Tjlo - sinh_FR_S_lo_temp1 -// sinh_FR_S_lo_temp3 = sinh_FR_spos * sinh_FR_Tjlo -(sinh_FR_sneg * sinh_FR_Tmjlo) -// sinh_FR_S_lo = sinh_FR_S_lo_temp3 + sinh_FR_S_lo_temp2 +;; { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_S_lo_temp1 = sinh_FR_sneg, sinh_FR_Tmjlo, f0 - nop.i 999 ;; + nop.m 0 +(p6) fnma.s1 f_S_lo_temp2 = f_sneg, f_Tmjhi, f_S_lo_temp1 + nop.i 0 } +;; -/////////// BUG FIX fma to fms -TK +// If EXP, +// Y_hi = sgnx * 2^(N-1) * Tjhi +// Y_lo = sgnx * 2^(N-1) * Tjhi * (p_odd + p_even) + sgnx * 2^(N-1) * Tjlo { .mfi - nop.m 999 -(p0) fms.s1 sinh_FR_S_lo_temp3 = sinh_FR_spos, sinh_FR_Tjlo, sinh_FR_S_lo_temp1 - nop.i 999 ;; + nop.m 0 +(p7) fma.s1 f_Y_lo_temp = f_peven, f1, f_podd + nop.i 0 } +;; +// If TBL, +// S_lo = S_lo_temp4 + S_lo_temp2 { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_S_lo = sinh_FR_S_lo_temp3, f1, sinh_FR_S_lo_temp2 - nop.i 999 ;; + nop.m 0 +(p6) fma.s1 
f_S_lo = f_S_lo_temp4, f1, f_S_lo_temp2 + nop.i 0 } +;; +// If TBL, // Y_hi = S_hi // Y_lo = C_hi*p_odd + (S_hi*p_even + S_lo) -// sinh_FR_Y_lo_temp = sinh_FR_S_hi * sinh_FR_peven + sinh_FR_S_lo -// sinh_FR_Y_lo = sinh_FR_C_hi * sinh_FR_podd + sinh_FR_Y_lo_temp - { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_S_hi, sinh_FR_peven, sinh_FR_S_lo - nop.i 999 ;; + nop.m 0 +(p6) fma.s1 f_Y_lo_temp = f_S_hi, f_peven, f_S_lo + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_C_hi, sinh_FR_podd, sinh_FR_Y_lo_temp - nop.i 999 ;; + nop.m 0 +(p7) fma.s1 f_Y_lo = f_Tjhi_spos, f_Y_lo_temp, f_Tjlo_spos + nop.i 0 } - -// sinh_FR_SINH = Y_hi + Y_lo -// f8 = answer = sinh_FR_SGNX * sinh_FR_SINH +;; // Dummy multiply to generate inexact { .mfi - nop.m 999 -(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones - nop.i 999 + nop.m 0 + fmpy.s0 f_tmp = f_B2, f_B2 + nop.i 0 +} +{ .mfi + nop.m 0 +(p6) fma.s1 f_Y_lo = f_C_hi, f_podd, f_Y_lo_temp + nop.i 0 } +;; + +// f8 = answer = Y_hi + Y_lo { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_SINH = sinh_FR_S_hi, f1, sinh_FR_Y_lo - nop.i 999 ;; + nop.m 0 +(p7) fma.s0 f8 = f_Y_lo, f1, f_Tjhi_spos + nop.i 0 } +;; +// f8 = answer = Y_hi + Y_lo { .mfb - nop.m 999 -(p0) fma.s0 f8 = sinh_FR_SGNX, sinh_FR_SINH,f0 -(p0) br.ret.sptk b0 ;; + nop.m 0 +(p6) fma.s0 f8 = f_Y_lo, f1, f_S_hi + br.ret.sptk b0 // Exit for SINH_BY_TBL and SINH_BY_EXP } +;; -L(SINH_BY_EXP): - -// When p7 is true, we know that an overflow is not going to happen -// When p7 is false, we must check for possible overflow -// p7 is the over_SAFE flag -// Y_hi = Tjhi -// Y_lo = Tjhi * (p_odd + p_even) +Tjlo -// Scale = sign * 2^(N-1) -// sinh_FR_Y_lo = sinh_FR_Tjhi * (sinh_FR_peven + sinh_FR_podd) -// sinh_FR_Y_lo = sinh_FR_Tjhi * (sinh_FR_Y_lo_temp ) +// Here if 0 < |x| < 0.25 +SINH_BY_POLY: +{ .mmf + ldfe f_P6 = [r_ad2e],16 + ldfe f_P5 = [r_ad2o],16 + nop.f 0 +} +;; -{ .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_peven, f1, sinh_FR_podd - 
nop.i 999 +{ .mmi + ldfe f_P4 = [r_ad2e],16 + ldfe f_P3 = [r_ad2o],16 + nop.i 0 } +;; -// Now we are in EXP. This is the only path where an overflow is possible -// but not for certain. So this is the only path where over_SAFE has any use. -// r34 still has N-1 -// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe -// There is a danger of double overflow if N-1 > 0x3fe = 1022 -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000003ffe ;; +{ .mmi + ldfe f_P2 = [r_ad2e],16 + ldfe f_P1 = [r_ad2o],16 + nop.i 0 } +;; { .mfi -(p0) cmp.gt.unc p0,p7 = r34, r32 -(p0) fmerge.s sinh_FR_SCALE = sinh_FR_SGNX, sinh_FR_spos - nop.i 999 ;; + nop.m 0 + fma.s1 f_X3 = f_NORM_X, f_X2, f0 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_Tjhi, sinh_FR_Y_lo_temp, sinh_FR_Tjlo - nop.i 999 ;; + nop.m 0 + fma.s1 f_X4 = f_X2, f_X2, f0 + nop.i 0 } +;; -// f8 = answer = scale * (Y_hi + Y_lo) { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_SINH_temp = sinh_FR_Y_lo, f1, sinh_FR_Tjhi - nop.i 999 ;; + nop.m 0 + fma.s1 f_poly65 = f_X2, f_P6, f_P5 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s0 f44 = sinh_FR_SCALE, sinh_FR_SINH_temp, f0 - nop.i 999 ;; + nop.m 0 + fma.s1 f_poly43 = f_X2, f_P4, f_P3 + nop.i 0 } +;; -// Dummy multiply to generate inexact { .mfi - nop.m 999 -(p7) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones - nop.i 999 ;; + nop.m 0 + fma.s1 f_poly21 = f_X2, f_P2, f_P1 + nop.i 0 } +;; -// If over_SAFE is set, return -{ .mfb - nop.m 999 -(p7) fmerge.s f8 = f44,f44 -(p7) br.ret.sptk b0 ;; +{ .mfi + nop.m 0 + fma.s1 f_poly6543 = f_X4, f_poly65, f_poly43 + nop.i 0 } - -// Else see if we overflowed -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// If WRE is set then an overflow will not occur in EXP. -// The input value that would cause a register (WRE) value to overflow is about 2^15 -// and this input would go into the HUGE path. -// Answer with WRE is in f43. 
+;; { .mfi - nop.m 999 -(p0) fsetc.s2 0x7F,0x42 - nop.i 999;; + nop.m 0 + fma.s1 f_poly6to1 = f_X4, f_poly6543, f_poly21 + nop.i 0 } +;; +// Dummy multiply to generate inexact { .mfi - nop.m 999 -(p0) fma.s2 f43 = sinh_FR_SCALE, sinh_FR_SINH_temp, f0 - nop.i 999 ;; + nop.m 0 + fmpy.s0 f_tmp = f_P6, f_P6 + nop.i 0 } - -// 13FFF => 13FFF -FFFF = 4000(true) -// 4000 + 3FFF = 7FFF, which is 1 more that the exponent of the largest -// long double (7FFE). So 0 13FFF 8000000000000000 is one ulp more than -// largest long double in register bias -// Now set p8 if the answer with WRE is greater than or equal this value -// Also set p9 if the answer with WRE is less than or equal to negative this value - -{ .mlx - nop.m 999 -(p0) movl r32 = 0x00000000013FFF ;; +{ .mfb + nop.m 0 + fma.s0 f8 = f_poly6to1, f_X3, f_NORM_X + br.ret.sptk b0 // Exit SINH_BY_POLY } +;; -{ .mmf - nop.m 999 -(p0) setf.exp f41 = r32 -(p0) fsetc.s2 0x7F,0x40 ;; -} -{ .mfi - nop.m 999 -(p0) fcmp.ge.unc.s1 p8, p0 = f43, f41 - nop.i 999 +// Here if x denorm or unorm +SINH_DENORM: +// Determine if x really a denorm and not a unorm +{ .mmf + getf.exp r_signexp_x = f_NORM_X + mov r_exp_denorm = 0x0c001 // Real denorms have exp < this + fmerge.s f_ABS_X = f0, f_NORM_X } +;; { .mfi - nop.m 999 -(p0) fmerge.ns f42 = f41, f41 - nop.i 999 ;; + nop.m 0 + fcmp.eq.s0 p10,p0 = f8, f0 // Set denorm flag + nop.i 0 } +;; -// The error tag for overflow is 126 -{ .mii - nop.m 999 - nop.i 999 ;; -(p8) mov r47 = 126 ;; +// Set p8 if really a denorm +{ .mmi + and r_exp_x = r_exp_mask, r_signexp_x +;; + cmp.lt p8,p9 = r_exp_x, r_exp_denorm + nop.i 0 } +;; +// Identify denormal operands. 
{ .mfb - nop.m 999 -(p0) fcmp.le.unc.s1 p9, p0 = f43, f42 -(p8) br.cond.spnt L(SINH_ERROR_SUPPORT) ;; -} - -{ .mii - nop.m 999 - nop.i 999 ;; -(p9) mov r47 = 126 -} - -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.spnt L(SINH_ERROR_SUPPORT) ;; + nop.m 0 +(p8) fcmp.ge.unc.s1 p6,p7 = f8, f0 // Test sign of denorm +(p9) br.cond.sptk SINH_COMMON // Return to main path if x unorm } +;; -// Dummy multiply to generate inexact { .mfi - nop.m 999 -(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones - nop.i 999 ;; + nop.m 0 +(p6) fma.s0 f8 = f8,f8,f8 // If x +denorm, result=x+x^2 + nop.i 0 } - { .mfb - nop.m 999 -(p0) fmerge.s f8 = f44,f44 -(p0) br.ret.sptk b0 ;; + nop.m 0 +(p7) fnma.s0 f8 = f8,f8,f8 // If x -denorm, result=x-x^2 + br.ret.sptk b0 // Exit if x denorm } +;; -L(SINH_HUGE): - -// for SINH_HUGE, put 24000 in exponent; take sign from input; add 1 -// SAFE: SAFE is always 0 for HUGE -{ .mlx - nop.m 999 -(p0) movl r32 = 0x0000000000015dbf ;; +// Here if |x| >= overflow limit +SINH_HUGE: +// for SINH_HUGE, put 24000 in exponent; take sign from input +{ .mmi + mov r_exp_huge = 0x15dbf +;; + setf.exp f_huge = r_exp_huge + nop.i 0 } +;; +.pred.rel "mutex",p8,p9 { .mfi -(p0) setf.exp f9 = r32 - nop.f 999 - nop.i 999 ;; + alloc r32 = ar.pfs,0,5,4,0 +(p8) fnma.s1 f_signed_hi_lo = f_huge, f1, f1 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s1 sinh_FR_signed_hi_lo = sinh_FR_SGNX, f9, f1 - nop.i 999 ;; + nop.m 0 +(p9) fma.s1 f_signed_hi_lo = f_huge, f1, f1 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s0 f44 = sinh_FR_signed_hi_lo, f9, f0 -(p0) mov r47 = 126 -} -.endp sinhl -ASM_SIZE_DIRECTIVE(sinhl) -#ifdef _LIBC -ASM_SIZE_DIRECTIVE(__ieee754_sinhl) -#endif - -// Stack operations when calling error support. 
-// (1) (2) (3) (call) (4) -// sp -> + psp -> + psp -> + sp -> + -// | | | | -// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 -// | | | | -// | <-GR_Y Y2->| Y2 ->| <- GR_Y | -// | | | | -// | | <- GR_X X1 ->| | -// | | | | -// sp-64 -> + sp -> + sp -> + + -// save ar.pfs save b0 restore gp -// save gp restore ar.pfs - -.proc __libm_error_region -__libm_error_region: -L(SINH_ERROR_SUPPORT): + nop.m 0 + fma.s0 f_pre_result = f_signed_hi_lo, f_huge, f0 + mov GR_Parameter_TAG = 126 +} +;; + +GLOBAL_IEEE754_END(sinhl) + +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue -// (1) { .mfi - add GR_Parameter_Y=-32,sp // Parameter 2 value + add GR_Parameter_Y=-32,sp // Parameter 2 value nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + add sp=-64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP=gp // Save gp };; - -// (2) { .mmi - stfe [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + stfe [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; .body -// (3) { .mib - stfe [GR_Parameter_X] = f8 // STORE Parameter 1 on stack + stfe [GR_Parameter_X] = f8 // STORE Parameter 1 on stack add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address nop.b 0 } { .mib - stfe [GR_Parameter_Y] = f44 // STORE Parameter 3 on stack + stfe [GR_Parameter_Y] = f_pre_result // STORE Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; + { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; -// (4) { .mmi - ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack + ldfe 
f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp - add sp = 64,sp // Restore stack pointer - mov b0 = GR_SAVE_B0 // Restore return address + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address };; + { .mib - mov gp = GR_SAVE_GP // Restore gp - mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - br.ret.sptk b0 // Return + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) + .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/e_sqrt.S b/sysdeps/ia64/fpu/e_sqrt.S index dd057f58ee..0e208b3de1 100644 --- a/sysdeps/ia64/fpu/e_sqrt.S +++ b/sysdeps/ia64/fpu/e_sqrt.S @@ -1,11 +1,11 @@ .file "sqrt.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. -// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,27 +35,28 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. 
+// http://www.intel.com/software/products/opensource/libraries/num.htm. // -// ******************************************************************** +//******************************************************************** // History -// ******************************************************************** -// 2/02/00 Initial version -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +//******************************************************************** +// 02/02/00 Initial version +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. +// 02/10/03 Reordered header: .section, .global, .proc, .align // -// ******************************************************************** +//******************************************************************** // // Function: Combined sqrt(x), where // _ // sqrt(x) = |x, for double precision x values // -// ******************************************************************** +//******************************************************************** // // Accuracy: Correctly Rounded // -// ******************************************************************** +//******************************************************************** // // Resources Used: // @@ -68,7 +69,7 @@ // // Predicate Registers: p6, p7, p8 // -// ********************************************************************* +//********************************************************************* // // IEEE Special Conditions: // @@ -78,15 +79,13 @@ // sqrt(+/-0) = +/-0 // sqrt(negative) = QNaN and error handling is called // -// ********************************************************************* +//********************************************************************* // // Implementation: // // Modified Newton-Raphson Algorithm // -// ********************************************************************* - -#include 
"libm_support.h" +//********************************************************************* GR_SAVE_PFS = r33 GR_SAVE_B0 = r34 @@ -98,19 +97,7 @@ GR_Parameter_RESULT = r39 .section .text -.proc sqrt# -.global sqrt# -.align 64 - -sqrt: -#ifdef _LIBC -.global __sqrt -.type __sqrt,@function -__sqrt: -.global __ieee754_sqrt -.type __ieee754_sqrt,@function -__ieee754_sqrt: -#endif +GLOBAL_IEEE754_ENTRY(sqrt) { .mfi alloc r32= ar.pfs,0,5,4,0 frsqrta.s0 f7,p6=f8 @@ -255,7 +242,7 @@ __ieee754_sqrt: { .mfb nop.m 0 - (p0) mov f8 = f7 + mov f8 = f7 (p8) br.ret.sptk b0 ;; } { .mfb @@ -264,13 +251,7 @@ __ieee754_sqrt: (p7) br.cond.sptk __libm_error_region ;; } // END DOUBLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM -.endp sqrt# -ASM_SIZE_DIRECTIVE(sqrt) -#ifdef _LIBC -ASM_SIZE_DIRECTIVE(__sqrt) -ASM_SIZE_DIRECTIVE(__ieee754_sqrt) -#endif - +GLOBAL_IEEE754_END(sqrt) // Stack operations when calling error support. // (1) (2) (3) (call) (4) // sp -> + psp -> + psp -> + sp -> + @@ -286,8 +267,7 @@ ASM_SIZE_DIRECTIVE(__ieee754_sqrt) // save gp restore ar.pfs -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) // // This branch includes all those special values that are not negative, @@ -352,8 +332,9 @@ __libm_error_region: br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) + + .type __libm_error_support#,@function diff --git a/sysdeps/ia64/fpu/e_sqrtf.S b/sysdeps/ia64/fpu/e_sqrtf.S index 1799845d6d..bee0df7414 100644 --- a/sysdeps/ia64/fpu/e_sqrtf.S +++ b/sysdeps/ia64/fpu/e_sqrtf.S @@ -1,10 +1,10 @@ .file "sqrtf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. 
+// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,27 +35,29 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // -// ********************************************************************* +//********************************************************************* // History: // -// 2/02/00 Initial version -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial version +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. 
+// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align // -// ********************************************************************* +//********************************************************************* // // Function: Combined sqrtf(x), where // _ // sqrtf(x) = |x, for single precision x values // -// ******************************************************************** +//******************************************************************** // // Accuracy: Correctly Rounded // -// ******************************************************************** +//******************************************************************** // // Resources Used: // @@ -68,7 +70,7 @@ // // Predicate Registers: p6, p7, p8 // -// ******************************************************************** +//******************************************************************** // // IEEE Special Conditions: // @@ -78,15 +80,14 @@ // sqrtf(+/-0) = +/-0 // sqrtf(negative) = QNaN and error handling is called // -// ******************************************************************** +//******************************************************************** // // Implementation: // // Modified Newton-Raphson Algorithm // -// ******************************************************************** +//******************************************************************** -#include "libm_support.h" GR_SAVE_B0 = r34 GR_SAVE_PFS = r33 @@ -102,21 +103,8 @@ FR_Y = f0 FR_RESULT = f8 - .section .text -.proc sqrtf# -.global sqrtf# -.align 64 - -sqrtf: -#ifdef _LIBC -.global __sqrtf -.type __sqrtf,@function -__sqrtf: -.global __ieee754_sqrtf -.type __ieee754_sqrtf,@function -__ieee754_sqrtf: -#endif +GLOBAL_IEEE754_ENTRY(sqrtf) { .mlx // BEGIN SINGLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM alloc r32= ar.pfs,0,5,4,0 @@ -197,7 +185,7 @@ __ieee754_sqrtf: // Step (10) // d1 = a - S1 * S1 in f9 (p6) fnma.s1 f9=f7,f7,f8 - nop.i 0;;; + nop.i 0;; } { .mfb nop.m 0 
// Step (11) @@ -207,27 +195,20 @@ __ieee754_sqrtf: // END SINGLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM } { .mfb nop.m 0 - (p0) mov f8 = f7 + mov f8 = f7 (p8) br.ret.sptk b0 ;; } // // This branch includes all those special values that are not negative, // with the result equal to frcpa(x) // -.endp sqrtf -ASM_SIZE_DIRECTIVE(sqrtf) -#ifdef _LIBC -ASM_SIZE_DIRECTIVE(__sqrtf) -ASM_SIZE_DIRECTIVE(__ieee754_sqrtf) -#endif - +GLOBAL_IEEE754_END(sqrtf) -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mii add GR_Parameter_Y=-32,sp // Parameter 2 value -(p0) mov GR_Parameter_TAG = 50 + mov GR_Parameter_TAG = 50 .save ar.pfs,GR_SAVE_PFS mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } @@ -271,8 +252,7 @@ __libm_error_region: br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function diff --git a/sysdeps/ia64/fpu/e_sqrtl.S b/sysdeps/ia64/fpu/e_sqrtl.S index e41148243a..ec1475626d 100644 --- a/sysdeps/ia64/fpu/e_sqrtl.S +++ b/sysdeps/ia64/fpu/e_sqrtl.S @@ -1,10 +1,10 @@ .file "sqrtl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,23 +35,25 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // -// ******************************************************************** +//******************************************************************** // // History: -// 2/02/00 (hand-optimized) -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 (hand-optimized) +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align // -// ******************************************************************** +//******************************************************************** // // Function: Combined sqrtl(x), where // _ // sqrtl(x) = |x, for double-extended precision x values // -// ******************************************************************** +//******************************************************************** // // Resources Used: // @@ -64,7 +66,7 @@ // // Predicate Registers: p6, p7, p8 // -// ******************************************************************** +//******************************************************************** // // IEEE Special Conditions: // @@ -74,15 +76,13 @@ // sqrtl(+/-0) = +/-0 // sqrtl(negative) = QNaN and error handling is called // -// ******************************************************************** 
+//******************************************************************** // // Implementation: // // Modified Newton-Raphson Algorithm // -// ******************************************************************** - -#include "libm_support.h" +//******************************************************************** GR_SAVE_PFS = r33 GR_SAVE_B0 = r34 @@ -97,19 +97,7 @@ FR_Y = f0 FR_RESULT = f8 .section .text -.proc sqrtl# -.global sqrtl# -.align 64 - -sqrtl: -#ifdef _LIBC -.global __sqrtl -.type __sqrtl,@function -__sqrtl: -.global __ieee754_sqrtl -.type __ieee754_sqrtl,@function -__ieee754_sqrtl: -#endif +GLOBAL_IEEE754_ENTRY(sqrtl) { .mlx alloc r32= ar.pfs,0,5,4,0 // exponent of +1/2 in r2 @@ -151,7 +139,7 @@ alloc r32= ar.pfs,0,5,4,0 } { .mfi nop.m 0 - (p0) mov f15=f8 + mov f15=f8 nop.i 0;; } { .mfi nop.m 0 @@ -221,8 +209,8 @@ alloc r32= ar.pfs,0,5,4,0 (p6) br.ret.sptk b0 ;; } { .mfb - (p0) mov GR_Parameter_TAG = 48 - (p0) mov f8 = f7 + mov GR_Parameter_TAG = 48 + mov f8 = f7 (p8) br.ret.sptk b0 ;; } // @@ -232,15 +220,8 @@ alloc r32= ar.pfs,0,5,4,0 // END DOUBLE EXTENDED PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM -.endp sqrtl# -ASM_SIZE_DIRECTIVE(sqrtl) -#ifdef _LIBC -ASM_SIZE_DIRECTIVE(__sqrtl) -ASM_SIZE_DIRECTIVE(__ieee754_sqrtl) -#endif - -.proc __libm_error_region -__libm_error_region: +GLOBAL_IEEE754_END(sqrtl) +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value @@ -288,7 +269,6 @@ __libm_error_region: br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region#) .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/libm_atan2_reg.S b/sysdeps/ia64/fpu/libm_atan2_reg.S deleted file mode 100644 index 5649670d19..0000000000 --- a/sysdeps/ia64/fpu/libm_atan2_reg.S +++ /dev/null @@ -1,1234 +0,0 @@ -.file "libm_atan2_reg.s" - -// Copyright (C) 2000, 2001, Intel Corporation -// All rights reserved. 
-// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// * The name of Intel Corporation may not be used to endorse or promote -// products derived from this software without specific prior written -// permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. 
-// -// History -//============================================================== -// 2/02/00: Initial version -// 4/04/00 Unwind support added - -#include "libm_support.h" - -.data - -.align 64 -ASM_TYPE_DIRECTIVE(Constants_atan#,@object) -Constants_atan: -data4 0x54442D18, 0x3FF921FB, 0x248D3132, 0x3E000000 -// double pi/2, single lo_pi/2, two**(-3) -data4 0xAAAAAAA3, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // P_1 -data4 0xCCCC54B2, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // P_2 -data4 0x47E4D0C2, 0x92492492, 0x0000BFFC, 0x00000000 // P_3 -data4 0x58870889, 0xE38E38E0, 0x00003FFB, 0x00000000 // P_4 -data4 0x290149F8, 0xBA2E895B, 0x0000BFFB, 0x00000000 // P_5 -data4 0x250F733D, 0x9D88E6D4, 0x00003FFB, 0x00000000 // P_6 -data4 0xFB8745A0, 0x884E51FF, 0x0000BFFB, 0x00000000 // P_7 -data4 0x394396BD, 0xE1C7412B, 0x00003FFA, 0x00000000 // P_8 -data4 0xAAAAA52F, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // Q_1 -data4 0xC75B60D3, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // Q_2 -data4 0x011F1940, 0x924923AD, 0x0000BFFC, 0x00000000 // Q_3 -data4 0x2A5F89BD, 0xE36F716D, 0x00003FFB, 0x00000000 // Q_4 -// Entries Tbl_hi (double precision) -// B = 1+Index/16+1/32 Index = 0 -// Entries Tbl_lo (single precision) -// B = 1+Index/16+1/32 Index = 0 -data4 0xA935BD8E, 0x3FE9A000, 0x23ACA08F, 0x00000000 -// Entries Tbl_hi (double precision) Index = 0,1,...,15 -// B = 2^(-1)*(1+Index/16+1/32) -// Entries Tbl_lo (single precision) -// Index = 0,1,...,15 B = 2^(-1)*(1+Index/16+1/32) -data4 0x7F175A34, 0x3FDE77EB, 0x238729EE, 0x00000000 -data4 0x73C1A40B, 0x3FE0039C, 0x249334DB, 0x00000000 -data4 0x5B5B43DA, 0x3FE0C614, 0x22CBA7D1, 0x00000000 -data4 0x88BE7C13, 0x3FE1835A, 0x246310E7, 0x00000000 -data4 0xE2CC9E6A, 0x3FE23B71, 0x236210E5, 0x00000000 -data4 0x8406CBCA, 0x3FE2EE62, 0x2462EAF5, 0x00000000 -data4 0x1CD41719, 0x3FE39C39, 0x24B73EF3, 0x00000000 -data4 0x5B795B55, 0x3FE44506, 0x24C11260, 0x00000000 -data4 0x5BB6EC04, 0x3FE4E8DE, 0x242519EE, 0x00000000 -data4 0x1F732FBA, 0x3FE587D8, 0x24D4346C, 
0x00000000 -data4 0x115D7B8D, 0x3FE6220D, 0x24ED487B, 0x00000000 -data4 0x920B3D98, 0x3FE6B798, 0x2495FF1E, 0x00000000 -data4 0x8FBA8E0F, 0x3FE74897, 0x223D9531, 0x00000000 -data4 0x289FA093, 0x3FE7D528, 0x242B0411, 0x00000000 -data4 0x576CC2C5, 0x3FE85D69, 0x2335B374, 0x00000000 -data4 0xA99CC05D, 0x3FE8E17A, 0x24C27CFB, 0x00000000 -// -// Entries Tbl_hi (double precision) Index = 0,1,...,15 -// B = 2^(-2)*(1+Index/16+1/32) -// Entries Tbl_lo (single precision) -// Index = 0,1,...,15 B = 2^(-2)*(1+Index/16+1/32) -// -data4 0x510665B5, 0x3FD025FA, 0x24263482, 0x00000000 -data4 0x362431C9, 0x3FD1151A, 0x242C8DC9, 0x00000000 -data4 0x67E47C95, 0x3FD20255, 0x245CF9BA, 0x00000000 -data4 0x7A823CFE, 0x3FD2ED98, 0x235C892C, 0x00000000 -data4 0x29271134, 0x3FD3D6D1, 0x2389BE52, 0x00000000 -data4 0x586890E6, 0x3FD4BDEE, 0x24436471, 0x00000000 -data4 0x175E0F4E, 0x3FD5A2E0, 0x2389DBD4, 0x00000000 -data4 0x9F5FA6FD, 0x3FD68597, 0x2476D43F, 0x00000000 -data4 0x52817501, 0x3FD76607, 0x24711774, 0x00000000 -data4 0xB8DF95D7, 0x3FD84422, 0x23EBB501, 0x00000000 -data4 0x7CD0C662, 0x3FD91FDE, 0x23883A0C, 0x00000000 -data4 0x66168001, 0x3FD9F930, 0x240DF63F, 0x00000000 -data4 0x5422058B, 0x3FDAD00F, 0x23FE261A, 0x00000000 -data4 0x378624A5, 0x3FDBA473, 0x23A8CD0E, 0x00000000 -data4 0x0AAD71F8, 0x3FDC7655, 0x2422D1D0, 0x00000000 -data4 0xC9EC862B, 0x3FDD45AE, 0x2344A109, 0x00000000 -// -// Entries Tbl_hi (double precision) Index = 0,1,...,15 -// B = 2^(-3)*(1+Index/16+1/32) -// Entries Tbl_lo (single precision) -// Index = 0,1,...,15 B = 2^(-3)*(1+Index/16+1/32) -// -data4 0x84212B3D, 0x3FC068D5, 0x239874B6, 0x00000000 -data4 0x41060850, 0x3FC16465, 0x2335E774, 0x00000000 -data4 0x171A535C, 0x3FC25F6E, 0x233E36BE, 0x00000000 -data4 0xEDEB99A3, 0x3FC359E8, 0x239680A3, 0x00000000 -data4 0xC6092A9E, 0x3FC453CE, 0x230FB29E, 0x00000000 -data4 0xBA11570A, 0x3FC54D18, 0x230C1418, 0x00000000 -data4 0xFFB3AA73, 0x3FC645BF, 0x23F0564A, 0x00000000 -data4 0xE8A7D201, 0x3FC73DBD, 0x23D4A5E1, 
0x00000000 -data4 0xE398EBC7, 0x3FC8350B, 0x23D4ADDA, 0x00000000 -data4 0x7D050271, 0x3FC92BA3, 0x23BCB085, 0x00000000 -data4 0x601081A5, 0x3FCA217E, 0x23BC841D, 0x00000000 -data4 0x574D780B, 0x3FCB1696, 0x23CF4A8E, 0x00000000 -data4 0x4D768466, 0x3FCC0AE5, 0x23BECC90, 0x00000000 -data4 0x4E1D5395, 0x3FCCFE65, 0x2323DCD2, 0x00000000 -data4 0x864C9D9D, 0x3FCDF110, 0x23F53F3A, 0x00000000 -data4 0x451D980C, 0x3FCEE2E1, 0x23CCB11F, 0x00000000 -data4 0x54442D18, 0x400921FB, 0x33145C07, 0x3CA1A626 // I two doubles -data4 0x54442D18, 0x3FF921FB, 0x33145C07, 0x3C91A626 // I_by_2 two dbls -data4 0x54442D18, 0x3FE921FB, 0x33145C07, 0x3C81A626 // I_by_4 two dbls -data4 0x7F3321D2, 0x4002D97C, 0x4C9E8A0A, 0x3C9A7939 // 3I_by_4 two dbls -ASM_SIZE_DIRECTIVE(Constants_atan#) -.section .text - -.proc __libm_atan2_reg# -.global __libm_atan2_reg# -.align 64 -__libm_atan2_reg: - - -{ .mfi - alloc r32 = ar.pfs,0,20,4,0 -(p0) mov f32 = f8 - nop.i 0 -} -{ .mmi - nop.m 0 -(p0) addl r39 = @ltoff(Constants_atan#), gp - nop.i 999 -} -;; - -{ .mmi - ld8 r39 = [r39] - nop.m 999 - nop.i 999 -} -;; - -{ .mfi - nop 999 // EMbo added ... -(p0) mov f33 = f9 - nop.i 0 - } { .mfi - nop 999 // EMbo added ... -(p0) fclass.nm.unc p9,p0 = f32 ,0x1FF - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fclass.nm.unc p8,p0 = f33 ,0x1FF - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fclass.m.unc p6,p0 = f33 ,0x103 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fclass.m.unc p7,p0 = f32 ,0x103 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fclass.m.unc p12,p0 = f33 ,0x0C3 - nop 999;; // EMbo added ... - } { .mfb - nop 999 // EMbo added ... -// -// Check for NatVals. -// Check for EM Unsupporteds -// Check for NaNs. -// -(p0) fclass.m.unc p13,p0 = f32 ,0x0C3 -(p6) br.cond.sptk L(ATAN_NATVAL);; - } { .mbb - nop 999 // EMbo added ... 
-(p7) br.cond.sptk L(ATAN_NATVAL) -(p8) br.cond.sptk L(ATAN_UNSUPPORTED);; - } { .mib -(p0) add r40 = 96, r39 - nop 999 // EMbo added ... -(p9) br.cond.sptk L(ATAN_UNSUPPORTED);; - } { .mib -(p0) ldfd f50 = [r39],8 - nop 999 // EMbo added ... -(p12) br.cond.sptk L(ATAN_NAN);; - } { .mfb - nop 999 // EMbo added ... -(p0) fnorm.s1 f33 = f33 -(p13) br.cond.sptk L(ATAN_NAN);; - } { .mfi -(p0) ldfs f51 = [r39],4 -// -// Remove sign bits from exponents -// Load 2**(-3) -// Normalize the input argument. -// -(p0) fnorm.s1 f32 = f32 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) mov f82 = f1 - nop 999;; // EMbo added ... - } { .mmi - nop 999;; // EMbo added ... -(p0) ldfs f78 = [r39],180 - nop 999;; // EMbo added ... - } { .mmi -(p0) getf.exp r36 = f33;; -// -// Get exp and sign of ArgX -// Get exp and sign of ArgY -// Load 2**(-3) and increment ptr to Q_4. -// -(p0) getf.exp r37 = f32 -(p0) shr.u r36 = r36,17;; - } { .mfi - nop 999 // EMbo added ... -(p0) fmerge.s f84 = f1,f32 -(p0) shr.u r37 = r37,17;; - } { .mfi - nop 999 // EMbo added ... -// -// ArgX_abs = |ArgX| -// ArgY_abs = |ArgY| -// sign_X is sign bit of ArgX -// sign_Y is sign bit of ArgY -// -(p0) fmerge.s f83 = f1,f33 -(p0) cmp.eq.unc p8,p9 = 0x00000, r37;; - } { .mfi - nop 999 // EMbo added ... -(p8) fadd.s1 f34 = f0, f1 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p9) fsub.s1 f34 = f0, f1 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fmin.s1 f36 = f83, f84 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fmax.s1 f35 = f83, f84 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// Is ArgX_abs >= ArgY_abs -// Is sign_Y == 0? -// -(p0) fcmp.ge.s1 p6,p7 = f83,f84 - nop 999;; // EMbo added ... 
- } { .mii -(p6) cmp.eq.unc p10, p11 = 0x00000, r36 -(p6) add r38 = r0, r0;; -// -// U = max(ArgX_abs,ArgY_abs) -// V = min(ArgX_abs,ArgY_abs) -// if p6, swap = 0 -// if p7, swap = 1 -// -// -// Let M = 1.0 -// if p8, s_Y = 1.0 -// if p9, s_Y = -1.0 -// -(p7) add r38 = 1,r0;; - } { .mfi - nop 999 // EMbo added ... -(p0) frcpa.s1 f37, p6 = f36, f35 - nop 999;; // EMbo added ... - } { .mfb - nop 999 // EMbo added ... -// -// E = frcpa(V,U) -// -(p10) fsub.s1 f82 = f82, f1 -(p6) br.cond.sptk L(ATAN_STEP2);; - } { .mib - nop 999 // EMbo added ... - nop 999 // EMbo added ... -// /**************************************************/ -// /********************* STEP2 **********************/ -// /**************************************************/ -(p0) br.cond.spnt L(ATAN_SPECIAL_HANDLING);; - } -L(ATAN_STEP2): - { .mlx - nop 999 // EMbo added ... -(p0) movl r47 = 0x8400000000000000 - } { .mlx - nop 999 // EMbo added ... -(p0) movl r48 = 0x0000000000000100;; - } { .mfi - nop 999 // EMbo added ... -(p0) fmpy.s1 f38 = f37, f36 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fcmp.lt.unc.s0 p0,p9 = f9,f1 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fcmp.lt.unc.s0 p0,p8 = f8,f1 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// Q = E * V -// -(p11) fadd.s1 f82 = f82, f1 - nop 999;; // EMbo added ... - } { .mfi -(p0) getf.sig r46 = f38 -(p0) fcmp.lt.unc p6,p7 = f38,f78 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fmpy.s1 f38 = f37, f36 -(p0) extr.u r42 = r46, 59, 4;; - } { .mfi - nop 999 // EMbo added ... -(p0) fmpy.s1 f50 = f82, f50 -(p0) dep r47 = r42, r47, 59, 4 - } { .mfi - nop 999 // EMbo added ... -(p0) fmpy.s1 f51 = f82, f51 - nop 999;; // EMbo added ... - } { .mmi - nop 999;; // EMbo added ... -// -// Is Q < 2**(-3)? -// -// -// Do fcmp to raise any denormal operand -// exceptions. -// -(p0) getf.exp r45 = f38 - nop 999;; // EMbo added ... 
- } { .mib -// -// lookup = b_1 b_2 b_3 B_4 -// -// -// Generate 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0 -// -(p0) andcm r41 = 0x0003, r45 - nop 999 // EMbo added ... -// -// We waited a few extra cycles so P_lo and P_hi could be calculated. -// Load the constant 256 for loading up table entries. -// -// /**************************************************/ -// /********************* STEP3 **********************/ -// /**************************************************/ -(p6) br.cond.spnt L(ATAN_POLY);; - } { .mii -(p0) setf.sig f39 = r47 -(p0) cmp.eq.unc p8, p9 = 0x0000, r41 -// -// z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0 -// point to beginning of Tbl_hi entries - k = 0. -// -(p0) add r40 = 16, r39 - } { .mmi -(p0) ldfe f73 = [r39],-16;; -(p9) sub r41 = r41,r0,1 -(p9) add r40 = 16,r40 - } { .mfi -(p8) ldfd f48 = [r40],8 -(p0) fmpy.s1 f50 = f34, f50 -(p0) xor r38 = r36,r38;; - } { .mmi -(p0) ldfe f71 = [r39],-16;; -(p8) ldfs f49 = [r40],8 -(p9) pmpy2.r r41 = r41,r48;; - } { .mfi -(p0) ldfe f69 = [r39],-16 -// -// Let z_hi have exponent and sign of original Q -// Load the Tbl_hi(0) else, increment pointer. -// -(p0) fmerge.se f39 = f38,f39 -(p9) shladd r42 = r42,0x0004,r41;; - } { .mmi -(p9) add r40 = r40, r42;; -(p9) ldfd f48 = [r40],8 - nop 999;; // EMbo added ... - } { .mmi -(p0) ldfe f67 = [r39],-16;; -(p9) ldfs f49 = [r40],8 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// U_prime_hi = U + V * z_hi -// Load the Tbl_lo(0) -// -(p0) fma.s1 f40 = f36, f39, f35 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fnma.s1 f42 = f35, f39, f36 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) mov f52 = f48 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) frcpa.s1 f43, p6 = f1, f40 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// U_prime_lo = U - U_prime_hi -// k = k * 256 - result can be 0, 256, or 512. 
-// -(p0) fsub.s1 f41 = f35, f40 -(p0) cmp.eq.unc p7, p6 = 0x00000, r38 - } { .mfi - nop 999 // EMbo added ... -(p0) fmpy.s1 f52 = f34, f52 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p7) fadd.s1 f54 = f0, f1 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p6) fsub.s1 f54 = f0, f1 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fnma.s1 f80 = f43, f40, f1 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fadd.s1 f79 = f41, f40 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fma.s1 f41 = f36, f39, f41 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fma.s1 f56 = f54, f52, f50 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fma.s1 f43 = f80, f43, f43 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// U_prime_lo = U - U_hold -// lookup -> lookup * 16 + k -// -// -// V_prime = V - U * z_hi -// U_prime_lo = V * z_hi + U_prime_lo -// -(p0) fsub.s1 f79 = f35, f79 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fnma.s1 f80 = f43, f40, f1 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// C_hi = frcpa(1,U_prime_hi) -// U_prime_lo = U_prime_lo + U_hold -// -// -// C_hi_hold = 1 - C_hi * U_prime_hi (1) -// -// -// C_hi = C_hi + C_hi * C_hi_hold (1) -// -// -// C_hi_hold = 1 - C_hi * U_prime_hi (2) -// -(p0) fadd.s1 f41 = f41, f79 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// C_hi = C_hi + C_hi * C_hi_hold (2) -// -(p0) fma.s1 f43 = f80, f43, f43 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// C_hi_hold = 1 - C_hi * U_prime_hi (3) -// -(p0) fnma.s1 f80 = f43, f40, f1 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// C_hi = C_hi + C_hi * C_hi_hold (3) -// -(p0) fma.s1 f43 = f80, f43, f43 - nop 999;; // EMbo added ... 
- } { .mfi - nop 999 // EMbo added ... -// -// w_hi = V_prime * C_hi -// -(p0) fmpy.s1 f44 = f42, f43 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fmpy.s1 f46 = f44, f44 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// wsq = w_hi * w_hi -// w_lo = = V_prime - w_hi * U_prime_hi -// -(p0) fnma.s1 f45 = f44, f40, f42 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fma.s1 f47 = f46, f73, f71 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// poly = Q_3 + wsq * Q_4 -// w_lo = = w_lo - w_hi * U_prime_lo -// -(p0) fnma.s1 f45 = f44, f41, f45 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fma.s1 f47 = f46, f47, f69 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// poly = Q_2 + wsq * poly -// w_lo = = w_lo * C_hi -// -(p0) fmpy.s1 f45 = f43, f45 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fma.s1 f47 = f46, f47, f67 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// poly = Q_1 + wsq * poly -// A_lo = Tbl_lo + w_lo -// swap = xor(swap,sign_X) -// -(p0) fadd.s1 f53 = f49, f45 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// Is (swap) != 0 ? -// poly = wsq * poly -// A_hi = Tbl_hi -// -(p0) fmpy.s1 f47 = f46, f47 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// poly = wsq * poly -// -// -// if (p6) sigma = -1.0 -// if (p7) sigma = 1.0 -// -(p0) fmpy.s1 f47 = f44, f47 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// P_hi = s_Y * P_hi -// A_lo = A_lo + poly -// -(p0) fadd.s1 f53 = f53, f47 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// A_lo = A_lo + w_hi -// A_hi = s_Y * A_hi -// -(p0) fadd.s1 f53 = f53, f44 - nop 999;; // EMbo added ... - } { .mfb - nop 999 // EMbo added ... 
-// -// result_hi = P_hi + sigma * A_hi -// result_lo = P_lo + sigma * A_lo -// -(p0) fma.s1 f55 = f54, f53, f51 -(p0) br.cond.sptk L(RETURN_ATAN);; -} -// -// result = result_hi + result_lo * s_Y (User Supplied Rounding Mode) -// -// (p0) fma.d.s0 f57 = f55, f34, f56 -// -// /**************************************************/ -// /********************* STEP4 **********************/ -// /**************************************************/ -// -L(ATAN_POLY): -{ .mmi -(p0) xor r38 = r36,r38 -(p0) addl r39 = @ltoff(Constants_atan#), gp - nop.i 999 -} -;; - -{ .mmi - ld8 r39 = [r39] - nop.m 999 - nop.i 999 -} -;; - - -{ .mlx - nop 999 // EMbo added ... -(p0) movl r47 = 0x24005;; - } { .mfi -(p0) add r39 = 128, r39 -(p0) fnma.s1 f81 = f37, f35, f1 -(p0) cmp.eq.unc p7, p6 = 0x00000, r38;; - } { .mmf - nop 999 // EMbo added ... -(p0) ldfe f77 = [r39],-16 -// -// Iterate 3 times E = E + E*(1.0 - E*U) -// Also load P_8, P_7, P_6, P_5, P_4 -// E_hold = 1.0 - E * U (1) -// A_temp = Q -// -(p0) mov f85 = f38;; - } { .mmf - nop 999 // EMbo added ... -(p0) ldfe f76 = [r39],-16 -(p6) fsub.s1 f54 = f0, f1;; - } { .mmf - nop 999 // EMbo added ... -(p0) ldfe f75 = [r39],-16 -// -// E = E + E_hold*E (1) -// Point to P_8. -// -(p0) fma.s1 f37 = f37, f81, f37;; - } { .mmf - nop 999 // EMbo added ... -(p0) ldfe f74 = [r39],-16 -(p0) fnma.s1 f64 = f85, f35, f36;; - } { .mmf - nop 999 // EMbo added ... -(p0) ldfe f72 = [r39],-16 -(p7) fadd.s1 f54 = f0, f1;; - } { .mmf - nop 999 // EMbo added ... -(p0) ldfe f70 = [r39],-16 -// -// E_hold = 1.0 - E * U (2) -// -(p0) fnma.s1 f81 = f37, f35, f1;; - } { .mmf - nop 999 // EMbo added ... -(p0) ldfe f68 = [r39],-16 -(p0) fmpy.s1 f50 = f34, f50;; - } { .mmf - nop 999 // EMbo added ... -(p0) ldfe f66 = [r39],-16 -(p0) fmpy.d.s0 f67 = f67, f67 - } { .mfi - nop 999 // EMbo added ... -// -// E = E + E_hold*E (2) -// -(p0) fma.s1 f37 = f37, f81, f37 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... 
-// -// E_hold = 1.0 - E * U (3) -// -(p0) fnma.s1 f81 = f37, f35, f1 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// E = E + E_hold*E (3) -// At this point E approximates 1/U to roughly working precision -// z = V*E approximates V/U -// -(p0) fma.s1 f37 = f37, f81, f37 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// z = V * E -// -(p0) fmpy.s1 f59 = f36, f37 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fmpy.s1 f64 = f64, f37 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// zsq = z * z -// Also load P_3 -// -(p0) fmpy.s1 f60 = f59, f59 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fadd.s1 f52 = f85, f64 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fma.s1 f62 = f60, f77, f76 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fma.s1 f63 = f60, f70, f68 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// z8 = zsq * zsq -// Also load P_2 -// -(p0) fmpy.s1 f61 = f60, f60 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fsub.s1 f85 = f85, f52 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fmerge.s f65 = f52,f52 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fma.s1 f62 = f60, f62, f75 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fma.s1 f63 = f60, f63, f66 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... 
-// -// z8 = z8 * z8 -// Also load P_1 -// poly1 = _4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8))) -// poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3)) -// -// -// poly1 = P_7 + zsq * P_8 -// poly2 = P_2 + zsq * P_3 -// poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*poly1)) -// poly2 = zsq*(P_1 + zsq*poly2) -// -// -// poly1 = P_6 + zsq * poly1 -// poly2 = P_1 + zsq * poly2 -// poly1 = P_4 + zsq*(P_5 + zsq*poly1) -// poly2 = zsq*poly2 -// -(p0) fmpy.s1 f61 = f61, f61 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fadd.s1 f64 = f85, f64 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fma.s1 f62 = f60, f62, f74 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// poly1 = P_5 + zsq * poly1 -// poly2 = zsq * poly2 -// poly1 = P_4 + zsq*poly1 -// -(p0) fmpy.s1 f63 = f63, f60 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// poly1 = P_4 + zsq * poly1 -// swap = xor(swap,sign_X) -// -(p0) fma.s1 f62 = f60, f62, f72 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// poly = z8*poly1 + poly2 (Typo in writeup) -// Is (swap) != 0 ? -// -// -// z_lo = V - A_temp * U -// if (p7) sigma = 1.0 -// Writeup shows A_temp as A_hi -// -// -// z_lo = z_lo * E -// if (p6) sigma = -1.0 -// z_lo = (V - A_temp * U) *E -// -// -// Fixup added to force inexact later - -// A_hi = A_temp + z_lo -// z_lo = (A_temp - A_hi) + z_lo -// z_lo = A_hi - z_lo -A_hi + z_lo = about 0 -// -(p0) fma.s1 f47 = f61, f62, f63 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// A_lo = z * poly + z_lo -// -(p0) fma.s1 f53 = f59, f47, f64 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fadd.s1 f52 = f65, f53 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fsub.s1 f65 = f65, f52 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fmpy.s1 f52 = f34, f52 - nop 999;; // EMbo added ... 
- } { .mfi - nop 999 // EMbo added ... -(p0) fadd.s1 f53 = f65, f53 - nop 999 // EMbo added ... - } { .mfi -(p0) setf.exp f65 = r47 -(p0) fma.s1 f56 = f54, f52, f50 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fclass.m.unc p6,p0 = f53,0x007 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// P_hi = s_Y * P_hi -// A_hi = s_Y * A_hi -// -// -// result_hi = P_hi + sigma * A_hi -// -(p6) mov f53 = f65 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// tmp = P_hi - result_hi -// -(p0) fsub.s1 f65 = f50, f56 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fma.s1 f65 = f52, f54, f65 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// tmp = sigma * A_hi + tmp -// sigma = A_lo * sigma + P_lo -// -(p0) fma.s1 f54 = f53, f54, f51 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// result_lo = s_Y * sigma + tmp -// -(p0) fma.s1 f55 = f34, f54, f65 - nop 999;; // EMbo added ... - } { .mfb - nop.m 0 - mov f34 = f1 -(p0) br.cond.sptk L(RETURN_ATAN);; -} -// -// result = result_hi + result_lo (User Supplied Rounding Mode) -// -// (p0) fadd.d.s0 f57 = f55, f56 -L(ATAN_UNSUPPORTED): -L(ATAN_NATVAL): - { .mfb - nop 999 // EMbo added ... -// -// Deal with the NatVal and unsupported cases. -// Raise invalid if warrented. -// -(p0) fmpy.d.s0 f57 = f8, f9 -br.cond.sptk L(RETURN_ATAN);; - } -L(ATAN_NAN): - { .mfb - nop 999 // EMbo added ... -// -// If only one NaN, then generate the resulting -// NaN and return - may raise invalid. -// -(p0) fmpy.d.s0 f57 = f8, f9 -(p0) br.cond.sptk L(RETURN_ATAN);; - } -L(ATAN_SPECIAL_HANDLING): - - { .mmf -(p0) addl r39 = @ltoff(Constants_atan#), gp - nop.m 999 -(p0) fcmp.lt.s0 p0,p7 = f8,f1 - } -;; - -// -// Raise denormal operand faults if necessary -// - -{ .mfi - ld8 r39 = [r39] -(p0) fcmp.lt.s0 p0,p6 = f9,f1 - nop 999;; // EMbo added ... -} -;; - - - -{ .mfi - nop 999 // EMbo added ... 
-(p0) fclass.m.unc p6,p7 = f32,0x007 - nop 999;; // EMbo added ... - } { .mlx - nop 999 // EMbo added ... -(p0) movl r47 = 992;; - } { .mib -(p0) add r39 = r39, r47 - nop 999 // EMbo added ... -(p7) br.cond.sptk L(ATAN_ArgY_Not_ZERO);; - } { .mfi - nop 999 // EMbo added ... -(p6) fclass.m.unc p14,p0 = f33,0x035 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p6) fclass.m.unc p15,p0 = f33,0x036 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p6) fclass.m.unc p13,p0 = f33,0x007 - nop 999 // EMbo added ... - } { .mfi -(p0) ldfd f56 = [r39],8 - nop 999 // EMbo added ... - nop 999;; // EMbo added ... - } { .mfi -(p0) ldfd f55 = [r39],-8 -(p14) fmerge.s f56 = f32,f0 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// Return sign_Y * 0 when Y = +/-0 and X > 0 -// -(p14) fmerge.s f55 = f32,f0 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p15) fmerge.s f56 = f32,f56 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// Return sign_Y * PI when X < -0 -// -// -(p15) fmerge.s f55 = f32,f55 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fadd.d.s0 f57 = f56,f55 - nop.i 0 - } { .bbb -// -// Call error support function for atan(0,0) -// - expected value already computed. -// - nop.b 0 - nop.b 0 -(p0) br.cond.sptk L(RETURN_ATAN) - } -L(ATAN_ArgY_Not_ZERO): - { .mfi - nop 999 // EMbo added ... -(p0) fclass.m.unc p9,p10 = f32,0x023 - nop 999;; // EMbo added ... - } { .mfb - nop 999 // EMbo added ... -(p9) fclass.m.unc p6,p0 = f33,0x017 -(p10) br.cond.sptk L(ATAN_ArgY_Not_INF);; - } { .mfi -(p6) add r39 = 16,r39 -(p9) fclass.m.unc p7,p0 = f33,0x021 - nop 999;; // EMbo added ... - } { .mmf - nop 999 // EMbo added ... -(p0) ldfd f56 = [r39],8 -(p9) fclass.m.unc p8,p0 = f33,0x022;; - } { .mbb -(p0) ldfd f55 = [r39],-8 - nop 999 // EMbo added ... - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... 
-(p6) fmerge.s f56 = f32,f56 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p6) fmerge.s f55 = f32,f55 - nop 999;; // EMbo added ... - } { .mfb - nop 999 // EMbo added ... -// -// Load I/2 and adjust its sign. -// Return +I/2 when ArgY = +Inf and ArgX = +/-0,normal -// Return -I/2 when ArgY = -Inf and ArgX = +/-0,normal -// -(p6) fadd.d.s0 f57 = f56, f55 -(p6) br.cond.sptk L(RETURN_ATAN);; - } { .mmi -(p7) add r39 = 32,r39;; -(p7) ldfd f56 = [r39],8 - nop 999;; // EMbo added ... - } { .mmi - nop 999;; // EMbo added ... -(p7) ldfd f55 = [r39],-8 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p7) fmerge.s f56 = f32,f56 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p7) fmerge.s f55 = f32,f55 - nop 999;; // EMbo added ... - } { .mfb - nop 999 // EMbo added ... -// -// Load PI/4 and adjust its sign. -// Return +PI/4 when ArgY = +Inf and ArgX = +Inf -// Return -PI/4 when ArgY = -Inf and ArgX = +Inf -// -(p7) fadd.d.s0 f57 = f56, f55 -(p7) br.cond.sptk L(RETURN_ATAN);; - } { .mmi -(p8) add r39 = 48,r39;; -(p8) ldfd f56 =[r39],8 - nop 999;; // EMbo added ... - } { .mmi - nop 999;; // EMbo added ... -(p8) ldfd f55 =[r39],-8 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p8) fmerge.s f56 = f32,f56 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p8) fmerge.s f55 = f32,f55 - nop 999;; // EMbo added ... - } { .mfb - nop 999 // EMbo added ... -// -// Load I/4 and adjust its sign. -// Return +3I/4 when ArgY = +Inf and ArgX = -Inf -// Return -3I/4 when ArgY = -Inf and ArgX = -Inf -// -(p8) fadd.d.s0 f57 = f56, f55 -(p8) br.cond.sptk L(RETURN_ATAN);; - } -L(ATAN_ArgY_Not_INF): - { .mfi - nop 999 // EMbo added ... -(p0) fclass.m.unc p6,p0 = f33,0x007 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p0) fclass.m.unc p7,p0 = f33,0x021 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... 
-(p0) fclass.m.unc p8,p0 = f33,0x022 -(p6) add r39 = 16,r39;; - } { .mfi -(p6) ldfd f56 =[r39],8 - nop 999 // EMbo added ... - nop 999;; // EMbo added ... - } { .mmi - nop 999;; // EMbo added ... -(p6) ldfd f55 =[r39],-8 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p6) fmerge.s f56 = f32,f56 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p6) fmerge.s f55 = f32,f55 - nop 999;; // EMbo added ... - } { .mfb - nop 999 // EMbo added ... -// -// return = sign_Y * I/2 when ArgX = +/-0 -// -(p6) fadd.d.s0 f57 = f56, f55 -(p6) br.cond.sptk L(RETURN_ATAN);; - } { .mfi - nop 999 // EMbo added ... -(p7) fmerge.s f56 = f32,f0 - nop 999 // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p7) fmerge.s f55 = f32,f0 - nop 999;; // EMbo added ... - } { .mfb - nop 999 // EMbo added ... -// -// return = sign_Y * 0 when ArgX = Inf -// -(p7) fadd.d.s0 f57 = f56, f55 -(p7) br.cond.sptk L(RETURN_ATAN);; - } { .mfi -(p8) ldfd f56 = [r39],8 - nop 999 // EMbo added ... - nop 999;; // EMbo added ... - } { .mmi - nop 999;; // EMbo added ... -(p8) ldfd f55 = [r39],-8 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p8) fmerge.s f56 = f32,f56 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -(p8) fmerge.s f55 = f32,f55 - nop 999;; // EMbo added ... - } { .mfi - nop 999 // EMbo added ... -// -// return = sign_Y * I when ArgX = -Inf -// -(p8) fadd.d.s0 f57 = f56, f55 - nop 999 // EMbo added ... - };; -L(RETURN_ATAN): -// mov f8 = f57 ;; -// The answer is in f57. 
-// But Z_hi is f56 -// Z_lo is f55 -// s_Y is f34 -// W is in f9 and untouched - -{ .mfi - nop 999 -mov f8 = f56 - nop.i 0 -};; - -{ .mfi - nop 999 -mov f10 = f55 - nop.i 999 -} -{ .mfb - nop 999 -mov f11 = f34 -br.ret.sptk b0 -};; - -.endp __libm_atan2_reg -ASM_SIZE_DIRECTIVE(__libm_atan2_reg) diff --git a/sysdeps/ia64/fpu/libm_error.c b/sysdeps/ia64/fpu/libm_error.c index ebbaad02ad..42ca36d98f 100644 --- a/sysdeps/ia64/fpu/libm_error.c +++ b/sysdeps/ia64/fpu/libm_error.c @@ -1,9 +1,10 @@ -// -// Copyright (C) 2000, 2001, Intel Corporation +/* file: libm_error.c */ + + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, James -// Edwards, and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -19,14 +20,15 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. + // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS @@ -34,19 +36,39 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== // 2/02/00: Initial version -// 3/22/00: Updated to support flexible and dynamic error handling. -// 8/16/00: Changed all matherr function-calls to use the pmatherr +// 3/22/00: Updated to support flexible and dynamic error handling. +// 8/16/00: Changed all matherr function-calls to use the pmatherr // function-pointers. // 10/03/00: Corrected a scalb type. // 11/28/00: Changed INPUT_XL to INPUT_XD for scalb_underflow case. // 12/07/00: Added code to make scalbn error support equivalent to ldexp. // 2/07/01: Added __declspec(align(16)) to long double constants to correct // alignment problem. 
+// 4/23/01: Added code for remquo +// 6/07/01: Added code for fdim, lrint, lround, llrint, llround +// Deleted code for remquo +// 8/15/01: Added code for scalbln, nexttoward +// 12/10/01: Added code for erfc +// 12/27/01: Added code for degree argument functions +// 01/02/02: Added code for tand, cotd +// 01/15/02: Corrected SVID/XOPEN code for log1p, pow, and acosh +// 01/25/02: Corrected ISOC for lgamma and gamma to return EDOM for neg ints +// 01/28/02: Corrected SVID/XOPEN stderr message for log2 +// 05/20/02: Added code for cot +// 07/01/02: Added code for sinhcosh +// 10/04/02: Underflow detection in ISOC path redefined to +// be zero rather than tiny and inexact +// 12/06/02: Added code for annuity and compound +// 01/30/03: Corrected test for underflow in ISOC path to not set denormal +// 04/10/03: Corrected ISOC branch for gamma/lgamma to return ERANGE for neg ints. +// Added code for tgamma +// 04/11/03: Corrected POSIX/SVID/XOPEN branches for gamma/lgamma +// to return EDOM for neg ints. 
// #include <errno.h> @@ -54,38 +76,41 @@ #include <stdlib.h> #include "libm_support.h" -#ifndef _LIBC +#ifdef _LIBC +# define pmatherr matherr +# define pmatherrf matherrf +# define pmatherrl matherrl +#else _LIB_VERSION_TYPE #if defined( __POSIX__ ) -_LIB_VERSION = _POSIX_; +_LIB_VERSIONIMF = _POSIX_; #elif defined( __XOPEN__ ) -_LIB_VERSION = _XOPEN_; +_LIB_VERSIONIMF = _XOPEN_; #elif defined( __SVID__ ) -_LIB_VERSION = _SVID_; +_LIB_VERSIONIMF = _SVID_; #elif defined( __IEEE__ ) -_LIB_VERSION = _IEEE_; +_LIB_VERSIONIMF = _IEEE_; #else -_LIB_VERSION = _ISOC_; -#endif +_LIB_VERSIONIMF = _ISOC_; #endif /************************************************************/ /* matherrX function pointers and setusermatherrX functions */ /************************************************************/ -#if 0 int (*pmatherrf)(struct exceptionf*) = MATHERR_F; int (*pmatherr)(struct EXC_DECL_D*) = MATHERR_D; int (*pmatherrl)(struct exceptionl*) = matherrl; void __libm_setusermatherrf( int(*user_merrf)(struct exceptionf*) ) -{ pmatherrf = ( (user_merrf==NULL)? (MATHERR_F) : (user_merrf) ); } +{ pmatherrf = ( (user_merrf==NULL)? (MATHERR_F) : (user_merrf) ); } void __libm_setusermatherr( int(*user_merr)(struct EXC_DECL_D*) ) -{ pmatherr = ( (user_merr==NULL)? (MATHERR_D) : (user_merr) ); } +{ pmatherr = ( (user_merr==NULL)? (MATHERR_D) : (user_merr) ); } void __libm_setusermatherrl( int(*user_merrl)(struct exceptionl*) ) -{ pmatherrl = ( (user_merrl==NULL)? (matherrl) : (user_merrl) ); } -#endif +{ pmatherrl = ( (user_merrl==NULL)? 
(matherrl) : (user_merrl) ); } + +#endif /* !_LIBC */ /***********************************************/ /* error-handling function, libm_error_support */ @@ -93,22 +118,27 @@ void __libm_setusermatherrl( int(*user_merrl)(struct exceptionl*) ) void __libm_error_support(void *arg1,void *arg2,void *retval,error_types input_tag) { - # ifdef __cplusplus struct __exception exc; -# else +# else struct exception exc; -# endif +# endif struct exceptionf excf; struct exceptionl excl; -# if defined opensource || defined _LIBC +# if defined(__GNUC__) +#define ALIGNIT __attribute__ ((__aligned__ (16))) +# elif defined opensource #define ALIGNIT -#define ALIGNATTR __attribute__ ((__aligned__ (16))) # else #define ALIGNIT __declspec(align(16)) -#define ALIGNATTR +# endif + +# ifdef SIZE_LONG_INT_64 +#define __INT_64__ signed long +# else +#define __INT_64__ __int64 # endif const char float_inf[4] = {0x00,0x00,0x80,0x7F}; @@ -118,66 +148,74 @@ const char float_neg_inf[4] = {0x00,0x00,0x80,0xFF}; const char float_neg_huge[4] = {0xFF,0xFF,0x7F,0xFF}; const char float_neg_zero[4] = {0x00,0x00,0x00,0x80}; ALIGNIT -const char double_inf[8] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0xF0,0x7F}; +const char double_inf[8] = {0x00,0x00,0x00,0x00,0x00,0x00,0xF0,0x7F}; +#if 0 /* unused */ ALIGNIT -//const char double_huge[8] ALIGNATTR = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xEF,0x7F}; +const char double_huge[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xEF,0x7F}; +#endif ALIGNIT -const char double_zero[8] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; +const char double_zero[8] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; ALIGNIT -const char double_neg_inf[8] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0xF0,0xFF}; +const char double_neg_inf[8] = {0x00,0x00,0x00,0x00,0x00,0x00,0xF0,0xFF}; +#if 0 /* unused */ ALIGNIT -//const char double_neg_huge[8] ALIGNATTR = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xEF,0xFF}; +const char double_neg_huge[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xEF,0xFF}; +#endif ALIGNIT -const 
char double_neg_zero[8] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80}; +const char double_neg_zero[8] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80}; ALIGNIT -const char long_double_inf[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0xFF,0x7F,0x00,0x00,0x00,0x00,0x00,0x00}; +const char long_double_inf[16] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0xFF,0x7F,0x00,0x00,0x00,0x00,0x00,0x00}; +#if 0 /* unused */ ALIGNIT -//const char long_double_huge[16] ALIGNATTR = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,0x7F,0x00,0x00,0x00,0x00,0x00,0x00}; +const char long_double_huge[16] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,0x7F,0x00,0x00,0x00,0x00,0x00,0x00}; +#endif ALIGNIT -const char long_double_zero[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; +const char long_double_zero[16] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; ALIGNIT -const char long_double_neg_inf[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0xFF,0xFF,0x00,0x00,0x00,0x00,0x00,0x00}; +const char long_double_neg_inf[16] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0xFF,0xFF,0x00,0x00,0x00,0x00,0x00,0x00}; +#if 0 /* unused */ ALIGNIT -//const char long_double_neg_huge[16] ALIGNATTR = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFE,0xFF,0x00,0x00,0x00,0x00,0x00,0x00}; +const char long_double_neg_huge[16] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,0xFF,0x00,0x00,0x00,0x00,0x00,0x00}; +#endif ALIGNIT -const char long_double_neg_zero[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x80,0x00,0x00,0x00,0x00,0x00,0x00}; +const char long_double_neg_zero[16] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,0x00}; -#define RETVAL_HUGE_VALL *(long double *)retval = *(long double *)long_double_inf -#define RETVAL_NEG_HUGE_VALL *(long double *)retval = *(long double *)long_double_neg_inf -#define RETVAL_HUGEL *(long double *)retval = (long 
double)*(float *)float_huge -#define RETVAL_NEG_HUGEL *(long double *)retval =(long double)*(float*)float_neg_huge +#define RETVAL_HUGE_VALL *(long double *)retval = *(long double *)long_double_inf +#define RETVAL_NEG_HUGE_VALL *(long double *)retval = *(long double *)long_double_neg_inf +#define RETVAL_HUGEL *(long double *)retval = (long double)*(float *)float_huge +#define RETVAL_NEG_HUGEL *(long double *)retval =(long double)*(float*)float_neg_huge #define RETVAL_HUGE_VALD *(double *)retval = *(double *) double_inf #define RETVAL_NEG_HUGE_VALD *(double *)retval = *(double *) double_neg_inf #define RETVAL_HUGED *(double *)retval = (double) *(float *)float_huge -#define RETVAL_NEG_HUGED *(double *)retval = (double) *(float *) float_neg_huge +#define RETVAL_NEG_HUGED *(double *)retval = (double) *(float *) float_neg_huge #define RETVAL_HUGE_VALF *(float *)retval = *(float *) float_inf #define RETVAL_NEG_HUGE_VALF *(float *)retval = *(float *) float_neg_inf #define RETVAL_HUGEF *(float *)retval = *(float *) float_huge -#define RETVAL_NEG_HUGEF *(float *)retval = *(float *) float_neg_huge +#define RETVAL_NEG_HUGEF *(float *)retval = *(float *) float_neg_huge -#define RETVAL_ZEROL *(long double *)retval = *(long double *)long_double_zero -#define RETVAL_ZEROD *(double *)retval = *(double *)double_zero -#define RETVAL_ZEROF *(float *)retval = *(float *)float_zero +#define RETVAL_ZEROL *(long double *)retval = *(long double *)long_double_zero +#define RETVAL_ZEROD *(double *)retval = *(double *)double_zero +#define RETVAL_ZEROF *(float *)retval = *(float *)float_zero -#define RETVAL_NEG_ZEROL *(long double *)retval = *(long double *)long_double_neg_zero -#define RETVAL_NEG_ZEROD *(double *)retval = *(double *)double_neg_zero -#define RETVAL_NEG_ZEROF *(float *)retval = *(float *)float_neg_zero +#define RETVAL_NEG_ZEROL *(long double *)retval = *(long double *)long_double_neg_zero +#define RETVAL_NEG_ZEROD *(double *)retval = *(double *)double_neg_zero +#define 
RETVAL_NEG_ZEROF *(float *)retval = *(float *)float_neg_zero -#define RETVAL_ONEL *(long double *)retval = (long double) 1.0 -#define RETVAL_ONED *(double *)retval = 1.0 -#define RETVAL_ONEF *(float *)retval = 1.0f +#define RETVAL_ONEL *(long double *)retval = (long double) 1.0 +#define RETVAL_ONED *(double *)retval = 1.0 +#define RETVAL_ONEF *(float *)retval = 1.0f -#define NOT_MATHERRL excl.arg1=*(long double *)arg1;excl.arg2=*(long double *)arg2;excl.retval=*(long double *)retval;if(!matherrl(&excl)) -#define NOT_MATHERRD exc.arg1=*(double *)arg1;exc.arg2=*(double *)arg2;exc.retval=*(double *)retval;if(!MATHERR_D(&exc)) -#define NOT_MATHERRF excf.arg1=*(float *)arg1;excf.arg2=*(float *)arg2;excf.retval=*(float *)retval;if(!MATHERR_F(&excf)) +#define NOT_MATHERRL excl.arg1=*(long double *)arg1;excl.arg2=*(long double *)arg2;excl.retval=*(long double *)retval;if(!pmatherrl(&excl)) +#define NOT_MATHERRD exc.arg1=*(double *)arg1;exc.arg2=*(double *)arg2;exc.retval=*(double *)retval;if(!pmatherr(&exc)) +#define NOT_MATHERRF excf.arg1=*(float *)arg1;excf.arg2=*(float *)arg2;excf.retval=*(float *)retval;if(!pmatherrf(&excf)) -#define ifSVID if(_LIB_VERSION==_SVID_) +#define ifSVID if(_LIB_VERSIONIMF==_SVID_) -#define NAMEL excl.name -#define NAMED exc.name -#define NAMEF excf.name +#define NAMEL excl.name +#define NAMED exc.name +#define NAMEF excf.name // // These should work OK for MS because they are ints - @@ -192,28 +230,28 @@ const char long_double_neg_zero[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0 #define PLOSS 6 #define SINGL excl.type = SING -#define DOMAINL excl.type = DOMAIN -#define OVERFLOWL excl.type = OVERFLOW -#define UNDERFLOWL excl.type = UNDERFLOW -#define TLOSSL excl.type = TLOSS +#define DOMAINL excl.type = DOMAIN +#define OVERFLOWL excl.type = OVERFLOW +#define UNDERFLOWL excl.type = UNDERFLOW +#define TLOSSL excl.type = TLOSS #define SINGD exc.type = SING -#define DOMAIND exc.type = DOMAIN -#define OVERFLOWD exc.type = OVERFLOW -#define 
UNDERFLOWD exc.type = UNDERFLOW -#define TLOSSD exc.type = TLOSS +#define DOMAIND exc.type = DOMAIN +#define OVERFLOWD exc.type = OVERFLOW +#define UNDERFLOWD exc.type = UNDERFLOW +#define TLOSSD exc.type = TLOSS #define SINGF excf.type = SING -#define DOMAINF excf.type = DOMAIN -#define OVERFLOWF excf.type = OVERFLOW -#define UNDERFLOWF excf.type = UNDERFLOW -#define TLOSSF excf.type = TLOSS +#define DOMAINF excf.type = DOMAIN +#define OVERFLOWF excf.type = OVERFLOW +#define UNDERFLOWF excf.type = UNDERFLOW +#define TLOSSF excf.type = TLOSS #define INPUT_XL (excl.arg1=*(long double*)arg1) #define INPUT_XD (exc.arg1=*(double*)arg1) #define INPUT_XF (excf.arg1=*(float*)arg1) -#define INPUT_YL (excl.arg1=*(long double*)arg2) -#define INPUT_YD (exc.arg1=*(double*)arg2) -#define INPUT_YF (excf.arg1=*(float*)arg2) -#define INPUT_RESL (*(long double *)retval) +#define INPUT_YL (excl.arg2=*(long double*)arg2) +#define INPUT_YD (exc.arg2=*(double*)arg2) +#define INPUT_YF (excf.arg2=*(float*)arg2) +#define INPUT_RESL (*(long double *)retval) #define INPUT_RESD (*(double *)retval) #define INPUT_RESF (*(float *)retval) @@ -248,11 +286,17 @@ const char long_double_neg_zero[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0 #define WRITED_LOG1P_NEGATIVE fputs("log1p: DOMAIN error\n",stderr) #define WRITEF_LOG1P_NEGATIVE fputs("log1pf: DOMAIN error\n",stderr) #define WRITEL_LOG10_ZERO fputs("log10l: SING error\n",stderr) -#define WRITED_LOG10_ZERO fputs("log10: SING error\n",stderr) +#define WRITED_LOG10_ZERO fputs("log10: SING error\n",stderr) #define WRITEF_LOG10_ZERO fputs("log10f: SING error\n",stderr) #define WRITEL_LOG10_NEGATIVE fputs("log10l: DOMAIN error\n",stderr) #define WRITED_LOG10_NEGATIVE fputs("log10: DOMAIN error\n",stderr) #define WRITEF_LOG10_NEGATIVE fputs("log10f: DOMAIN error\n",stderr) +#define WRITEL_LOG2_ZERO fputs("log2l: SING error\n",stderr) +#define WRITED_LOG2_ZERO fputs("log2: SING error\n",stderr) +#define WRITEF_LOG2_ZERO fputs("log2f: SING 
error\n",stderr) +#define WRITEL_LOG2_NEGATIVE fputs("log2l: DOMAIN error\n",stderr) +#define WRITED_LOG2_NEGATIVE fputs("log2: DOMAIN error\n",stderr) +#define WRITEF_LOG2_NEGATIVE fputs("log2f: DOMAIN error\n",stderr) #define WRITEL_POW_ZERO_TO_ZERO fputs("powl(0,0): DOMAIN error\n",stderr) #define WRITED_POW_ZERO_TO_ZERO fputs("pow(0,0): DOMAIN error\n",stderr) #define WRITEF_POW_ZERO_TO_ZERO fputs("powf(0,0): DOMAIN error\n",stderr) @@ -295,6 +339,9 @@ const char long_double_neg_zero[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0 #define WRITEL_GAMMA_NEGATIVE fputs("gammal: SING error\n",stderr) #define WRITED_GAMMA_NEGATIVE fputs("gamma: SING error\n",stderr) #define WRITEF_GAMMA_NEGATIVE fputs("gammaf: SING error\n",stderr) +#define WRITEL_TGAMMA_NEGATIVE fputs("tgammal: DOMAIN error\n",stderr) +#define WRITED_TGAMMA_NEGATIVE fputs("tgamma: DOMAIN error\n",stderr) +#define WRITEF_TGAMMA_NEGATIVE fputs("tgammaf: DOMAIN error\n",stderr) #define WRITEL_J0_TLOSS fputs("j0l: TLOSS error\n",stderr) #define WRITEL_Y0_TLOSS fputs("y0l: TLOSS error\n",stderr) #define WRITEL_J1_TLOSS fputs("j1l: TLOSS error\n",stderr) @@ -313,16 +360,26 @@ const char long_double_neg_zero[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0 #define WRITEF_Y1_TLOSS fputs("y1f: TLOSS error\n",stderr) #define WRITEF_JN_TLOSS fputs("jnf: TLOSS error\n",stderr) #define WRITEF_YN_TLOSS fputs("ynf: TLOSS error\n",stderr) +#define WRITEL_ACOSD fputs("acosdl: DOMAIN error\n",stderr) +#define WRITED_ACOSD fputs("acosd: DOMAIN error\n",stderr) +#define WRITEF_ACOSD fputs("acosdf: DOMAIN error\n",stderr) +#define WRITEL_ASIND fputs("asindl: DOMAIN error\n",stderr) +#define WRITED_ASIND fputs("asind: DOMAIN error\n",stderr) +#define WRITEF_ASIND fputs("asindf: DOMAIN error\n",stderr) +#define WRITEL_ATAN2D_ZERO_BY_ZERO fputs("atan2dl: DOMAIN error\n",stderr) +#define WRITED_ATAN2D_ZERO_BY_ZERO fputs("atan2d: DOMAIN error\n",stderr) +#define WRITEF_ATAN2D_ZERO_BY_ZERO fputs("atan2df: DOMAIN 
error\n",stderr) + /***********************/ /* IEEE Path */ /***********************/ -if(_LIB_VERSION==_IEEE_) return; +if(_LIB_VERSIONIMF==_IEEE_) return; /***********************/ /* C9X Path */ /***********************/ -else if(_LIB_VERSION==_ISOC_) +else if(_LIB_VERSIONIMF==_ISOC_) { switch(input_tag) { @@ -339,80 +396,146 @@ else if(_LIB_VERSION==_ISOC_) case log1p_zero: case log1pf_zero: case powl_overflow: - case pow_overflow: - case powf_overflow: - case powl_underflow: - case pow_underflow: - case powf_underflow: + case pow_overflow: + case powf_overflow: case expl_overflow: - case exp_overflow: - case expf_overflow: - case expl_underflow: - case exp_underflow: - case expf_underflow: + case exp_overflow: + case expf_overflow: case exp2l_overflow: - case exp2_overflow: - case exp2f_overflow: - case exp2l_underflow: - case exp2_underflow: - case exp2f_underflow: + case exp2_overflow: + case exp2f_overflow: case exp10l_overflow: - case exp10_overflow: - case exp10f_overflow: + case exp10_overflow: + case exp10f_overflow: case expm1l_overflow: - case expm1_overflow: - case expm1f_overflow: + case expm1_overflow: + case expm1f_overflow: case hypotl_overflow: case hypot_overflow: case hypotf_overflow: - case sinhl_overflow: - case sinh_overflow: - case sinhf_overflow: - case atanhl_eq_one: - case atanh_eq_one: - case atanhf_eq_one: + case sinhl_overflow: + case sinh_overflow: + case sinhf_overflow: + case atanhl_eq_one: + case atanh_eq_one: + case atanhf_eq_one: case scalbl_overflow: case scalb_overflow: case scalbf_overflow: - case scalbl_underflow: - case scalb_underflow: - case scalbf_underflow: case coshl_overflow: case cosh_overflow: case coshf_overflow: case nextafterl_overflow: case nextafter_overflow: case nextafterf_overflow: + case nexttowardl_overflow: + case nexttoward_overflow: + case nexttowardf_overflow: case scalbnl_overflow: case scalbn_overflow: case scalbnf_overflow: - case scalbnl_underflow: - case scalbn_underflow: - case 
scalbnf_underflow: + case scalblnl_overflow: + case scalbln_overflow: + case scalblnf_overflow: case ldexpl_overflow: case ldexp_overflow: case ldexpf_overflow: - case ldexpl_underflow: - case ldexp_underflow: - case ldexpf_underflow: case lgammal_overflow: case lgamma_overflow: case lgammaf_overflow: - case lgammal_negative: - case lgamma_negative: - case lgammaf_negative: case gammal_overflow: case gamma_overflow: case gammaf_overflow: + case lgammal_negative: + case lgamma_negative: + case lgammaf_negative: case gammal_negative: case gamma_negative: case gammaf_negative: case ilogbl_zero: - case ilogb_zero: + case ilogb_zero: case ilogbf_zero: + case fdiml_overflow: + case fdim_overflow: + case fdimf_overflow: + case llrintl_large: + case llrint_large: + case llrintf_large: + case llroundl_large: + case llround_large: + case llroundf_large: + case lrintl_large: + case lrint_large: + case lrintf_large: + case lroundl_large: + case lround_large: + case lroundf_large: + case tandl_overflow: + case tand_overflow: + case tandf_overflow: + case cotdl_overflow: + case cotd_overflow: + case cotdf_overflow: + case cotl_overflow: + case cot_overflow: + case cotf_overflow: + case sinhcoshl_overflow: + case sinhcosh_overflow: + case sinhcoshf_overflow: + case annuityl_overflow: + case annuity_overflow: + case annuityf_overflow: + case compoundl_overflow: + case compound_overflow: + case compoundf_overflow: + case tgammal_overflow: + case tgamma_overflow: + case tgammaf_overflow: { ERRNO_RANGE; break; } + case powl_underflow: + case expl_underflow: + case exp2l_underflow: + case scalbl_underflow: + case scalbnl_underflow: + case scalblnl_underflow: + case ldexpl_underflow: + case erfcl_underflow: + case annuityl_underflow: + case compoundl_underflow: + { + if ( *(__INT_64__*)retval == 0 ) ERRNO_RANGE; + break; + } + case pow_underflow: + case exp_underflow: + case exp2_underflow: + case scalb_underflow: + case scalbn_underflow: + case scalbln_underflow: + case 
ldexp_underflow: + case erfc_underflow: + case annuity_underflow: + case compound_underflow: + { + if ( ((*(__INT_64__*)retval)<<1) == 0 ) ERRNO_RANGE; + break; + } + case powf_underflow: + case expf_underflow: + case exp2f_underflow: + case scalbf_underflow: + case scalbnf_underflow: + case scalblnf_underflow: + case ldexpf_underflow: + case erfcf_underflow: + case annuityf_underflow: + case compoundf_underflow: + { + if ( ((*(__INT_64__*)retval)<<33) == 0 ) ERRNO_RANGE; + break; + } case logl_negative: case log_negative: case logf_negative: @@ -440,17 +563,17 @@ else if(_LIB_VERSION==_ISOC_) case fmodl_by_zero: case fmod_by_zero: case fmodf_by_zero: - case atanhl_gt_one: - case atanh_gt_one: - case atanhf_gt_one: - case acosl_gt_one: - case acos_gt_one: - case acosf_gt_one: - case asinl_gt_one: - case asin_gt_one: - case asinf_gt_one: + case atanhl_gt_one: + case atanh_gt_one: + case atanhf_gt_one: + case acosl_gt_one: + case acos_gt_one: + case acosf_gt_one: + case asinl_gt_one: + case asin_gt_one: + case asinf_gt_one: case logbl_zero: - case logb_zero: + case logb_zero: case logbf_zero: case acoshl_lt_one: case acosh_lt_one: @@ -473,6 +596,30 @@ else if(_LIB_VERSION==_ISOC_) case ynl_negative: case yn_negative: case ynf_negative: + case acosdl_gt_one: + case acosd_gt_one: + case acosdf_gt_one: + case asindl_gt_one: + case asind_gt_one: + case asindf_gt_one: + case atan2dl_zero: + case atan2d_zero: + case atan2df_zero: + case annuityl_by_zero: + case annuity_by_zero: + case annuityf_by_zero: + case annuityl_less_m1: + case annuity_less_m1: + case annuityf_less_m1: + case compoundl_by_zero: + case compound_by_zero: + case compoundf_by_zero: + case compoundl_less_m1: + case compound_less_m1: + case compoundf_less_m1: + case tgammal_negative: + case tgamma_negative: + case tgammaf_negative: { ERRNO_DOMAIN; break; } @@ -486,31 +633,37 @@ else if(_LIB_VERSION==_ISOC_) /* _POSIX_ Path */ /***********************/ -else if(_LIB_VERSION==_POSIX_) +else 
if(_LIB_VERSIONIMF==_POSIX_) { switch(input_tag) { case gammal_overflow: case lgammal_overflow: + case tgammal_overflow: { RETVAL_HUGE_VALL; ERRNO_RANGE; break; } case gamma_overflow: case lgamma_overflow: + case tgamma_overflow: { RETVAL_HUGE_VALD; ERRNO_RANGE; break; } case gammaf_overflow: case lgammaf_overflow: + case tgammaf_overflow: { RETVAL_HUGE_VALF; ERRNO_RANGE; break; } case gammal_negative: - case gamma_negative: - case gammaf_negative: case lgammal_negative: + case gamma_negative: case lgamma_negative: + case gammaf_negative: case lgammaf_negative: + case tgammal_negative: + case tgamma_negative: + case tgammaf_negative: { ERRNO_DOMAIN; break; } @@ -526,38 +679,56 @@ switch(input_tag) case scalbn_underflow: case scalbnf_overflow: case scalbnf_underflow: + case scalblnl_overflow: + case scalblnl_underflow: + case scalbln_overflow: + case scalbln_underflow: + case scalblnf_overflow: + case scalblnf_underflow: + case tandl_overflow: + case tand_overflow: + case tandf_overflow: + case cotdl_overflow: + case cotd_overflow: + case cotdf_overflow: + case cotl_overflow: + case cot_overflow: + case cotf_overflow: + case sinhcoshl_overflow: + case sinhcosh_overflow: + case sinhcoshf_overflow: { ERRNO_RANGE; break; } - case atanhl_gt_one: - case atanhl_eq_one: + case atanhl_gt_one: + case atanhl_eq_one: /* atanhl(|x| >= 1) */ { ERRNO_DOMAIN; break; } - case atanh_gt_one: - case atanh_eq_one: + case atanh_gt_one: + case atanh_eq_one: /* atanh(|x| >= 1) */ { ERRNO_DOMAIN; break; } - case atanhf_gt_one: - case atanhf_eq_one: + case atanhf_gt_one: + case atanhf_eq_one: /* atanhf(|x| >= 1) */ { ERRNO_DOMAIN; break; } - case sqrtl_negative: + case sqrtl_negative: /* sqrtl(x < 0) */ { ERRNO_DOMAIN; break; } - case sqrt_negative: + case sqrt_negative: /* sqrt(x < 0) */ { ERRNO_DOMAIN; break; } - case sqrtf_negative: + case sqrtf_negative: /* sqrtf(x < 0) */ { ERRNO_DOMAIN; break; @@ -606,7 +777,7 @@ switch(input_tag) /* yn(x < 0) */ { RETVAL_NEG_HUGE_VALD; ERRNO_DOMAIN; 
break; - } + } case y0f_negative: case y1f_negative: case ynf_negative: @@ -615,10 +786,11 @@ switch(input_tag) /* ynf(x < 0) */ { RETVAL_NEG_HUGE_VALF; ERRNO_DOMAIN; break; - } + } case logl_zero: case log1pl_zero: case log10l_zero: + case log2l_zero: /* logl(0) */ /* log1pl(0) */ /* log10l(0) */ @@ -628,7 +800,7 @@ switch(input_tag) case log_zero: case log1p_zero: case log10_zero: - case log2l_zero: + case log2_zero: /* log(0) */ /* log1p(0) */ /* log10(0) */ @@ -638,6 +810,7 @@ switch(input_tag) case logf_zero: case log1pf_zero: case log10f_zero: + case log2f_zero: /* logf(0) */ /* log1pf(0) */ /* log10f(0) */ @@ -652,6 +825,9 @@ switch(input_tag) /* log1pl(x < 0) */ /* log10l(x < 0) */ { +#ifndef _LIBC + RETVAL_NEG_HUGE_VALL; +#endif ERRNO_DOMAIN; break; } case log_negative: @@ -662,8 +838,11 @@ switch(input_tag) /* log1p(x < 0) */ /* log10(x < 0) */ { +#ifndef _LIBC + RETVAL_NEG_HUGE_VALD; +#endif ERRNO_DOMAIN; break; - } + } case logf_negative: case log1pf_negative: case log10f_negative: @@ -672,34 +851,46 @@ switch(input_tag) /* log1pf(x < 0) */ /* log10f(x < 0) */ { +#ifndef _LIBC + RETVAL_NEG_HUGE_VALF; +#endif ERRNO_DOMAIN; break; - } + } case expl_overflow: + case exp2l_overflow: + case exp10l_overflow: /* expl overflow */ { RETVAL_HUGE_VALL; ERRNO_RANGE; break; } case exp_overflow: + case exp2_overflow: + case exp10_overflow: /* exp overflow */ { RETVAL_HUGE_VALD; ERRNO_RANGE; break; } case expf_overflow: + case exp2f_overflow: + case exp10f_overflow: /* expf overflow */ { RETVAL_HUGE_VALF; ERRNO_RANGE; break; } case expl_underflow: + case exp2l_underflow: /* expl underflow */ { RETVAL_ZEROL; ERRNO_RANGE; break; } case exp_underflow: + case exp2_underflow: /* exp underflow */ { RETVAL_ZEROD; ERRNO_RANGE; break; } case expf_underflow: + case exp2f_underflow: /* expf underflow */ { RETVAL_ZEROF; ERRNO_RANGE; break; @@ -750,13 +941,17 @@ switch(input_tag) break; } case powl_overflow: + case annuityl_overflow: + case compoundl_overflow: /* powl(x,y) 
overflow */ { if (INPUT_RESL < 0) RETVAL_NEG_HUGE_VALL; else RETVAL_HUGE_VALL; - ERRNO_RANGE; break; + ERRNO_RANGE; break; } case pow_overflow: + case annuity_overflow: + case compound_overflow: /* pow(x,y) overflow */ { if (INPUT_RESD < 0) RETVAL_NEG_HUGE_VALD; @@ -764,6 +959,8 @@ switch(input_tag) ERRNO_RANGE; break; } case powf_overflow: + case annuityf_overflow: + case compoundf_overflow: /* powf(x,y) overflow */ { if (INPUT_RESF < 0) RETVAL_NEG_HUGE_VALF; @@ -771,20 +968,41 @@ switch(input_tag) ERRNO_RANGE; break; } case powl_underflow: + case annuityl_underflow: + case compoundl_underflow: /* powl(x,y) underflow */ { RETVAL_ZEROL; ERRNO_RANGE; break; } case pow_underflow: + case annuity_underflow: + case compound_underflow: /* pow(x,y) underflow */ { RETVAL_ZEROD; ERRNO_RANGE; break; } - case powf_underflow: + case powf_underflow: + case annuityf_underflow: + case compoundf_underflow: /* powf(x,y) underflow */ { RETVAL_ZEROF; ERRNO_RANGE; break; } + case annuityl_by_zero: + case annuityl_less_m1: + case compoundl_by_zero: + case compoundl_less_m1: + case annuity_by_zero: + case annuity_less_m1: + case compound_by_zero: + case compound_less_m1: + case annuityf_by_zero: + case annuityf_less_m1: + case compoundf_by_zero: + case compoundf_less_m1: + { + ERRNO_DOMAIN; break; + } case powl_zero_to_negative: /* 0**neg */ { @@ -820,7 +1038,7 @@ switch(input_tag) /* Special Error */ { break; - } + } case pow_nan_to_zero: /* pow(NaN,0.0) */ { @@ -832,36 +1050,51 @@ switch(input_tag) break; } case atan2l_zero: - /* atan2l(0,0) */ + case atan2dl_zero: + /* atan2dl(0,0) */ { - /* XXX arg1 and arg2 are switched!!!! */ +#ifndef _LIBC + RETVAL_ZEROL; +#else + /* XXX arg1 and arg2 are switched!!!! 
*/ if (signbit (*(long double *) arg1)) /* y == -0 */ - *(long double *) retval = copysignl (M_PIl, *(long double *) arg2); + *(long double *) retval = __libm_copysignl (M_PIl, *(long double *) arg2); else *(long double *) retval = *(long double *) arg2; +#endif ERRNO_DOMAIN; break; } case atan2_zero: - /* atan2(0,0) */ + case atan2d_zero: + /* atan2d(0,0) */ { - /* XXX arg1 and arg2 are switched!!!! */ +#ifndef _LIBC + RETVAL_ZEROD; +#else + /* XXX arg1 and arg2 are switched!!!! */ if (signbit (*(double *) arg1)) /* y == -0 */ - *(double *) retval = copysign (M_PI, *(double *) arg2); + *(double *) retval = __libm_copysign (M_PI, *(double *) arg2); else *(double *) retval = *(double *) arg2; +#endif ERRNO_DOMAIN; break; } - case - atan2f_zero: + case atan2f_zero: + case atan2df_zero: /* atan2f(0,0) */ + /* atan2df(0,0) */ { +#ifndef _LIBC + RETVAL_ZEROF; +#else if (signbit (*(float *) arg2)) /* y == -0 */ - *(float *) retval = copysignf (M_PI, *(float *) arg1); + *(float *) retval = __libm_copysignf (M_PI, *(float *) arg1); else *(float *) retval = *(float *) arg1; +#endif ERRNO_DOMAIN; break; } case expm1l_overflow: @@ -912,42 +1145,42 @@ switch(input_tag) case scalbl_underflow: /* scalbl underflow */ { - if (INPUT_XL < 0) RETVAL_NEG_ZEROL; + if (INPUT_XL < 0) RETVAL_NEG_ZEROL; else RETVAL_ZEROL; ERRNO_RANGE; break; } case scalb_underflow: /* scalb underflow */ { - if (INPUT_XD < 0) RETVAL_NEG_ZEROD; + if (INPUT_XD < 0) RETVAL_NEG_ZEROD; else RETVAL_ZEROD; ERRNO_RANGE; break; } case scalbf_underflow: /* scalbf underflow */ { - if (INPUT_XF < 0) RETVAL_NEG_ZEROF; + if (INPUT_XF < 0) RETVAL_NEG_ZEROF; else RETVAL_ZEROF; ERRNO_RANGE; break; } case scalbl_overflow: /* scalbl overflow */ { - if (INPUT_XL < 0) RETVAL_NEG_HUGE_VALL; + if (INPUT_XL < 0) RETVAL_NEG_HUGE_VALL; else RETVAL_HUGE_VALL; ERRNO_RANGE; break; } case scalb_overflow: /* scalb overflow */ { - if (INPUT_XD < 0) RETVAL_NEG_HUGE_VALD; + if (INPUT_XD < 0) RETVAL_NEG_HUGE_VALD; else RETVAL_HUGE_VALD; 
ERRNO_RANGE; break; } case scalbf_overflow: /* scalbf overflow */ { - if (INPUT_XF < 0) RETVAL_NEG_HUGE_VALF; + if (INPUT_XF < 0) RETVAL_NEG_HUGE_VALF; else RETVAL_HUGE_VALF; ERRNO_RANGE; break; } @@ -967,33 +1200,62 @@ switch(input_tag) ERRNO_DOMAIN; break; } case acosl_gt_one: + case acosdl_gt_one: /* acosl(x > 1) */ + /* acosdl(x > 1) */ { +#ifndef _LIBC + RETVAL_ZEROL; +#endif ERRNO_DOMAIN; break; } case acos_gt_one: + case acosd_gt_one: /* acos(x > 1) */ + /* acosd(x > 1) */ { - ERRNO_DOMAIN; break; +#ifndef _LIBC + RETVAL_ZEROD; +#endif + ERRNO_DOMAIN; break; } case acosf_gt_one: + case acosdf_gt_one: /* acosf(x > 1) */ + /* acosdf(x > 1) */ { - ERRNO_DOMAIN; break; +#ifndef _LIBC + RETVAL_ZEROF; +#endif + ERRNO_DOMAIN; break; } case asinl_gt_one: + case asindl_gt_one: /* asinl(x > 1) */ + /* asindl(x > 1) */ { +#ifndef _LIBC + RETVAL_ZEROL; +#endif ERRNO_DOMAIN; break; } case asin_gt_one: + case asind_gt_one: /* asin(x > 1) */ + /* asind(x > 1) */ { +#ifndef _LIBC + RETVAL_ZEROD; +#endif ERRNO_DOMAIN; break; } case asinf_gt_one: - /* asinf(x > 1) */ + case asindf_gt_one: + /* asindf(x > 1) */ { +#ifndef _LIBC + RETVAL_ZEROF; +#endif ERRNO_DOMAIN; break; } case remainderl_by_zero: @@ -1029,6 +1291,15 @@ switch(input_tag) { RETVAL_HUGE_VALF; ERRNO_RANGE; break; } + case nextafterl_overflow: + case nextafter_overflow: + case nextafterf_overflow: + case nexttowardl_overflow: + case nexttoward_overflow: + case nexttowardf_overflow: + { + ERRNO_RANGE; break; + } case sinhl_overflow: /* sinhl overflows */ { @@ -1090,7 +1361,7 @@ return; /*******************************/ /* __SVID__ and __XOPEN__ Path */ /*******************************/ -else +else { switch(input_tag) { @@ -1106,15 +1377,57 @@ else case scalbn_underflow: case scalbnf_overflow: case scalbnf_underflow: + case scalblnl_overflow: + case scalblnl_underflow: + case scalbln_overflow: + case scalbln_underflow: + case scalblnf_overflow: + case scalblnf_underflow: + case tandl_overflow: + case tand_overflow: 
+ case tandf_overflow: + case cotdl_overflow: + case cotd_overflow: + case cotdf_overflow: + case cotl_overflow: + case cot_overflow: + case cotf_overflow: + case annuityl_overflow: + case annuityl_underflow: + case annuity_overflow: + case annuity_underflow: + case annuityf_overflow: + case annuityf_underflow: + case compoundl_overflow: + case compoundl_underflow: + case compound_overflow: + case compound_underflow: + case compoundf_overflow: + case compoundf_underflow: { ERRNO_RANGE; break; } - case sqrtl_negative: + case annuityl_by_zero: + case annuityl_less_m1: + case annuity_by_zero: + case annuity_less_m1: + case annuityf_by_zero: + case annuityf_less_m1: + case compoundl_by_zero: + case compoundl_less_m1: + case compound_by_zero: + case compound_less_m1: + case compoundf_by_zero: + case compoundf_less_m1: + { + ERRNO_DOMAIN; break; + } + case sqrtl_negative: /* sqrtl(x < 0) */ { DOMAINL; NAMEL = (char *) "sqrtl"; - ifSVID - { + ifSVID + { RETVAL_ZEROL; NOT_MATHERRL { @@ -1122,22 +1435,22 @@ else ERRNO_DOMAIN; } } - else + else { /* NaN already computed */ NOT_MATHERRL {ERRNO_DOMAIN;} } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } - case sqrt_negative: + case sqrt_negative: /* sqrt(x < 0) */ { DOMAIND; NAMED = (char *) "sqrt"; - ifSVID + ifSVID { - + RETVAL_ZEROD; - NOT_MATHERRD + NOT_MATHERRD { WRITED_SQRT; ERRNO_DOMAIN; @@ -1146,18 +1459,18 @@ else else { /* NaN already computed */ NOT_MATHERRD {ERRNO_DOMAIN;} - } - *(double *)retval = exc.retval; + } + *(double *)retval = exc.retval; break; } - case sqrtf_negative: + case sqrtf_negative: /* sqrtf(x < 0) */ { DOMAINF; NAMEF = (char *) "sqrtf"; - ifSVID + ifSVID { RETVAL_ZEROF; - NOT_MATHERRF + NOT_MATHERRF { WRITEF_SQRT; ERRNO_DOMAIN; @@ -1166,62 +1479,59 @@ else else { NOT_MATHERRF {ERRNO_DOMAIN;} - } - *(float *)retval = excf.retval; + } + *(float *)retval = excf.retval; break; } case logl_zero: - case log2l_zero: /* logl(0) */ { SINGL; NAMEL = (char *) "logl"; 
- ifSVID + ifSVID { RETVAL_NEG_HUGEL; - NOT_MATHERRL + NOT_MATHERRL { WRITEL_LOG_ZERO; ERRNO_DOMAIN; - } + } } else { RETVAL_NEG_HUGE_VALL; NOT_MATHERRL {ERRNO_DOMAIN;} - } - *(long double *)retval = excl.retval; + } + *(long double *)retval = excl.retval; break; } case log_zero: - case log2_zero: /* log(0) */ { SINGD; NAMED = (char *) "log"; - ifSVID + ifSVID { RETVAL_NEG_HUGED; - NOT_MATHERRD + NOT_MATHERRD { WRITED_LOG_ZERO; ERRNO_DOMAIN; - } + } } else { RETVAL_NEG_HUGE_VALD; NOT_MATHERRD {ERRNO_DOMAIN;} } - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; } case logf_zero: - case log2f_zero: /* logf(0) */ { SINGF; NAMEF = (char *) "logf"; - ifSVID + ifSVID { - RETVAL_NEG_HUGEF; + RETVAL_NEG_HUGEF; NOT_MATHERRF { WRITEF_LOG_ZERO; @@ -1230,22 +1540,21 @@ else } else { - RETVAL_NEG_HUGE_VALF; + RETVAL_NEG_HUGE_VALF; NOT_MATHERRF {ERRNO_DOMAIN;} } - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; break; } case logl_negative: - case log2l_negative: /* logl(x < 0) */ { DOMAINL; NAMEL = (char *) "logl"; - ifSVID + ifSVID { RETVAL_NEG_HUGEL; - NOT_MATHERRL + NOT_MATHERRL { WRITEL_LOG_NEGATIVE; ERRNO_DOMAIN; @@ -1253,21 +1562,20 @@ else } else { - RETVAL_NEG_HUGE_VALL; + RETVAL_NEG_HUGE_VALL; NOT_MATHERRL {ERRNO_DOMAIN;} } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case log_negative: - case log2_negative: /* log(x < 0) */ { DOMAIND; NAMED = (char *) "log"; - ifSVID + ifSVID { RETVAL_NEG_HUGED; - NOT_MATHERRD + NOT_MATHERRD { WRITED_LOG_NEGATIVE; ERRNO_DOMAIN; @@ -1275,39 +1583,38 @@ else } else { - RETVAL_NEG_HUGE_VALD; + RETVAL_NEG_HUGE_VALD; NOT_MATHERRD {ERRNO_DOMAIN;} } - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; - } + } case logf_negative: - case log2f_negative: /* logf(x < 0) */ { DOMAINF; NAMEF = (char *) "logf"; - ifSVID + ifSVID { RETVAL_NEG_HUGEF; - NOT_MATHERRF + NOT_MATHERRF { WRITEF_LOG_NEGATIVE; ERRNO_DOMAIN; } - } + } else { - 
RETVAL_NEG_HUGE_VALF; + RETVAL_NEG_HUGE_VALF; NOT_MATHERRF{ERRNO_DOMAIN;} } - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; break; } case log1pl_zero: /* log1pl(-1) */ { SINGL; NAMEL = (char *) "log1pl"; - ifSVID + ifSVID { RETVAL_NEG_HUGEL; NOT_MATHERRL @@ -1328,7 +1635,7 @@ else /* log1p(-1) */ { SINGD; NAMED = (char *) "log1p"; - ifSVID + ifSVID { RETVAL_NEG_HUGED; NOT_MATHERRD @@ -1349,7 +1656,7 @@ else /* log1pf(-1) */ { SINGF; NAMEF = (char *) "log1pf"; - ifSVID + ifSVID { RETVAL_NEG_HUGEF; NOT_MATHERRF @@ -1361,11 +1668,11 @@ else else { RETVAL_NEG_HUGE_VALF; - NOT_MATHERRF {}ERRNO_DOMAIN; + NOT_MATHERRF {ERRNO_DOMAIN;} } *(float *)retval = excf.retval; break; - } + } case log1pl_negative: /* log1pl(x < -1) */ { @@ -1379,7 +1686,7 @@ else ERRNO_DOMAIN; } } - else + else { RETVAL_NEG_HUGE_VALL; NOT_MATHERRL {ERRNO_DOMAIN;} @@ -1400,7 +1707,7 @@ else ERRNO_DOMAIN; } } - else + else { RETVAL_NEG_HUGE_VALD; NOT_MATHERRD {ERRNO_DOMAIN;} @@ -1421,7 +1728,7 @@ else ERRNO_DOMAIN; } } - else + else { RETVAL_NEG_HUGE_VALF; NOT_MATHERRF {ERRNO_DOMAIN;} @@ -1433,7 +1740,7 @@ else /* log10l(0) */ { SINGL; NAMEL = (char *) "log10l"; - ifSVID + ifSVID { RETVAL_NEG_HUGEL; NOT_MATHERRL @@ -1447,14 +1754,14 @@ else RETVAL_NEG_HUGE_VALL; NOT_MATHERRL {ERRNO_DOMAIN;} } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case log10_zero: /* log10(0) */ { SINGD; NAMED = (char *) "log10"; - ifSVID + ifSVID { RETVAL_NEG_HUGED; NOT_MATHERRD @@ -1468,14 +1775,14 @@ else RETVAL_NEG_HUGE_VALD; NOT_MATHERRD {ERRNO_DOMAIN;} } - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; } case log10f_zero: /* log10f(0) */ { SINGF; NAMEF = (char *) "log10f"; - ifSVID + ifSVID { RETVAL_NEG_HUGEF; NOT_MATHERRF @@ -1489,17 +1796,17 @@ else RETVAL_NEG_HUGE_VALF; NOT_MATHERRF {ERRNO_DOMAIN;} } - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; break; } case log10l_negative: /* log10l(x < 0) */ { DOMAINL; 
NAMEL = (char *) "log10l"; - ifSVID + ifSVID { RETVAL_NEG_HUGEL; - NOT_MATHERRL + NOT_MATHERRL { WRITEL_LOG10_NEGATIVE; ERRNO_DOMAIN; @@ -1510,38 +1817,38 @@ else RETVAL_NEG_HUGE_VALL; NOT_MATHERRL {ERRNO_DOMAIN;} } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case log10_negative: /* log10(x < 0) */ { DOMAIND; NAMED = (char *) "log10"; - ifSVID + ifSVID { RETVAL_NEG_HUGED; - NOT_MATHERRD + NOT_MATHERRD { WRITED_LOG10_NEGATIVE; ERRNO_DOMAIN; } - } + } else { RETVAL_NEG_HUGE_VALD; NOT_MATHERRD {ERRNO_DOMAIN;} } - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; } case log10f_negative: /* log10f(x < 0) */ { DOMAINF; NAMEF = (char *) "log10f"; - ifSVID + ifSVID { RETVAL_NEG_HUGEF; - NOT_MATHERRF + NOT_MATHERRF { WRITEF_LOG10_NEGATIVE; ERRNO_DOMAIN; @@ -1552,14 +1859,119 @@ else RETVAL_NEG_HUGE_VALF; NOT_MATHERRF {ERRNO_DOMAIN;} } - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; + break; + } + case log2_zero: + /* log2(0) */ + { + SINGD; NAMED = (char *) "log2"; + ifSVID + { + RETVAL_NEG_HUGED; + NOT_MATHERRD + { + WRITED_LOG2_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALD; + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case log2f_zero: + /* log2f(0) */ + { + SINGF; NAMEF = (char *) "log2f"; + ifSVID + { + RETVAL_NEG_HUGEF; + NOT_MATHERRF + { + WRITEF_LOG2_ZERO; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALF; + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case log2l_negative: + /* log2l(x < 0) */ + { + DOMAINL; NAMEL = (char *) "log2l"; + ifSVID + { + RETVAL_NEG_HUGEL; + NOT_MATHERRL + { + WRITEL_LOG2_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALL; + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case log2_negative: + /* log2(x < 0) */ + { + DOMAIND; NAMED = (char *) "log2"; + ifSVID + { + RETVAL_NEG_HUGED; + NOT_MATHERRD + 
{ + WRITED_LOG2_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALD; + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case log2f_negative: + /* log2f(x < 0) */ + { + DOMAINF; NAMEF = (char *) "log2f"; + ifSVID + { + RETVAL_NEG_HUGEF; + NOT_MATHERRF + { + WRITEF_LOG2_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_NEG_HUGE_VALF; + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; break; } case expl_overflow: /* expl overflow */ { OVERFLOWL; NAMEL = (char *) "expl"; - ifSVID + ifSVID { RETVAL_HUGEL; } @@ -1568,14 +1980,14 @@ else RETVAL_HUGE_VALL; } NOT_MATHERRL {ERRNO_RANGE;} - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case exp_overflow: /* exp overflow */ { OVERFLOWD; NAMED = (char *) "exp"; - ifSVID + ifSVID { RETVAL_HUGED; } @@ -1584,14 +1996,14 @@ else RETVAL_HUGE_VALD; } NOT_MATHERRD {ERRNO_RANGE;} - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; } case expf_overflow: /* expf overflow */ { OVERFLOWF; NAMEF = (char *) "expf"; - ifSVID + ifSVID { RETVAL_HUGEF; } @@ -1600,7 +2012,7 @@ else RETVAL_HUGE_VALF; } NOT_MATHERRF {ERRNO_RANGE;} - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; break; } case expl_underflow: @@ -1608,7 +2020,7 @@ else { UNDERFLOWL; NAMEL = (char *) "expl"; RETVAL_ZEROL; NOT_MATHERRL {ERRNO_RANGE;} - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case exp_underflow: @@ -1616,7 +2028,7 @@ else { UNDERFLOWD; NAMED = (char *) "exp"; RETVAL_ZEROD; NOT_MATHERRD {ERRNO_RANGE;} - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; } case expf_underflow: @@ -1624,22 +2036,22 @@ else { UNDERFLOWF; NAMEF = (char *) "expf"; RETVAL_ZEROF; NOT_MATHERRF {ERRNO_RANGE;} - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; break; } case powl_zero_to_zero: /* powl 0**0 */ { DOMAINL; NAMEL = (char *) "powl"; - ifSVID + ifSVID 
{ RETVAL_ZEROL; - NOT_MATHERRL + NOT_MATHERRL { WRITEL_POW_ZERO_TO_ZERO; - ERRNO_RANGE; + ERRNO_DOMAIN; } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; } else RETVAL_ONEL; break; @@ -1648,15 +2060,15 @@ else /* pow 0**0 */ { DOMAIND; NAMED = (char *) "pow"; - ifSVID + ifSVID { RETVAL_ZEROD; - NOT_MATHERRD + NOT_MATHERRD { WRITED_POW_ZERO_TO_ZERO; - ERRNO_RANGE; + ERRNO_DOMAIN; } - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; } else RETVAL_ONED; break; @@ -1665,15 +2077,15 @@ else /* powf 0**0 */ { DOMAINF; NAMEF = (char *) "powf"; - ifSVID + ifSVID { RETVAL_ZEROF; - NOT_MATHERRF + NOT_MATHERRF { WRITEF_POW_ZERO_TO_ZERO; - ERRNO_RANGE; + ERRNO_DOMAIN; } - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; } else RETVAL_ONEF; break; @@ -1682,54 +2094,54 @@ else /* powl(x,y) overflow */ { OVERFLOWL; NAMEL = (char *) "powl"; - ifSVID + ifSVID { if (INPUT_XL < 0) RETVAL_NEG_HUGEL; else RETVAL_HUGEL; } else - { + { if (INPUT_XL < 0) RETVAL_NEG_HUGE_VALL; else RETVAL_HUGE_VALL; } NOT_MATHERRL {ERRNO_RANGE;} - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case pow_overflow: /* pow(x,y) overflow */ { OVERFLOWD; NAMED = (char *) "pow"; - ifSVID + ifSVID { if (INPUT_XD < 0) RETVAL_NEG_HUGED; else RETVAL_HUGED; } else - { + { if (INPUT_XD < 0) RETVAL_NEG_HUGE_VALD; else RETVAL_HUGE_VALD; } NOT_MATHERRD {ERRNO_RANGE;} - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; } case powf_overflow: /* powf(x,y) overflow */ { OVERFLOWF; NAMEF = (char *) "powf"; - ifSVID + ifSVID { if (INPUT_XF < 0) RETVAL_NEG_HUGEF; - else RETVAL_HUGEF; + else RETVAL_HUGEF; } else - { + { if (INPUT_XF < 0) RETVAL_NEG_HUGE_VALF; else RETVAL_HUGE_VALF; } NOT_MATHERRF {ERRNO_RANGE;} - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; break; } case powl_underflow: @@ -1737,7 +2149,7 @@ else { UNDERFLOWL; NAMEL = (char *) "powl"; RETVAL_ZEROL; NOT_MATHERRL 
{ERRNO_RANGE;} - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case pow_underflow: @@ -1745,7 +2157,7 @@ else { UNDERFLOWD; NAMED = (char *) "pow"; RETVAL_ZEROD; NOT_MATHERRD {ERRNO_RANGE;} - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; } case powf_underflow: @@ -1753,17 +2165,17 @@ else { UNDERFLOWF; NAMEF = (char *) "powf"; RETVAL_ZEROF; NOT_MATHERRF {ERRNO_RANGE;} - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; break; } case powl_zero_to_negative: /* 0 to neg */ { DOMAINL; NAMEL = (char *) "powl"; - ifSVID - { + ifSVID + { RETVAL_ZEROL; - NOT_MATHERRL + NOT_MATHERRL { WRITEL_POW_ZERO_TO_NEGATIVE; ERRNO_DOMAIN; @@ -1774,17 +2186,17 @@ else RETVAL_NEG_HUGE_VALL; NOT_MATHERRL {ERRNO_DOMAIN;} } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case pow_zero_to_negative: /* 0**neg */ { DOMAIND; NAMED = (char *) "pow"; - ifSVID - { + ifSVID + { RETVAL_ZEROD; - NOT_MATHERRD + NOT_MATHERRD { WRITED_POW_ZERO_TO_NEGATIVE; ERRNO_DOMAIN; @@ -1795,7 +2207,7 @@ else RETVAL_NEG_HUGE_VALD; NOT_MATHERRD {ERRNO_DOMAIN;} } - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; } case powf_zero_to_negative: @@ -1803,10 +2215,10 @@ else { DOMAINF; NAMEF = (char *) "powf"; RETVAL_NEG_HUGE_VALF; - ifSVID - { + ifSVID + { RETVAL_ZEROF; - NOT_MATHERRF + NOT_MATHERRF { WRITEF_POW_ZERO_TO_NEGATIVE; ERRNO_DOMAIN; @@ -1817,17 +2229,17 @@ else RETVAL_NEG_HUGE_VALF; NOT_MATHERRF {ERRNO_DOMAIN;} } - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; break; } case powl_neg_to_non_integer: /* neg**non_integral */ { DOMAINL; NAMEL = (char *) "powl"; - ifSVID - { + ifSVID + { RETVAL_ZEROF; - NOT_MATHERRL + NOT_MATHERRL { WRITEL_POW_NEG_TO_NON_INTEGER; ERRNO_DOMAIN; @@ -1837,17 +2249,17 @@ else { NOT_MATHERRL {ERRNO_DOMAIN;} } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case 
pow_neg_to_non_integer: /* neg**non_integral */ { DOMAIND; NAMED = (char *) "pow"; - ifSVID - { + ifSVID + { RETVAL_ZEROD; - NOT_MATHERRD + NOT_MATHERRD { WRITED_POW_NEG_TO_NON_INTEGER; ERRNO_DOMAIN; @@ -1857,17 +2269,17 @@ else { NOT_MATHERRD {ERRNO_DOMAIN;} } - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; } case powf_neg_to_non_integer: /* neg**non-integral */ { DOMAINF; NAMEF = (char *) "powf"; - ifSVID - { + ifSVID + { RETVAL_ZEROF; - NOT_MATHERRF + NOT_MATHERRF { WRITEF_POW_NEG_TO_NON_INTEGER; ERRNO_DOMAIN; @@ -1877,37 +2289,37 @@ else { NOT_MATHERRF {ERRNO_DOMAIN;} } - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; break; } case powl_nan_to_zero: /* pow(NaN,0.0) */ /* Special Error */ { - DOMAINL; NAMEL = (char *) "powl"; INPUT_XL; INPUT_YL; - excl.retval = *(long double *)arg1; + DOMAINL; NAMEL = (char *) "powl"; + *(long double *)retval = *(long double *)arg1; NOT_MATHERRL {ERRNO_DOMAIN;} - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; - } + } case pow_nan_to_zero: /* pow(NaN,0.0) */ /* Special Error */ { - DOMAIND; NAMED = (char *) "pow"; INPUT_XD; INPUT_YD; - exc.retval = *(double *)arg1; + DOMAIND; NAMED = (char *) "pow"; + *(double *)retval = *(double *)arg1; NOT_MATHERRD {ERRNO_DOMAIN;} - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; } case powf_nan_to_zero: /* powf(NaN,0.0) */ /* Special Error */ { - DOMAINF; NAMEF = (char *) "powf"; INPUT_XF; INPUT_YF; - excf.retval = *(float *)arg1; + DOMAINF; NAMEF = (char *) "powf"; + *(float *)retval = *(float *)arg1; NOT_MATHERRF {ERRNO_DOMAIN;} - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; break; } case atan2l_zero: @@ -1915,15 +2327,15 @@ else { DOMAINL; NAMEL = (char *) "atan2l"; RETVAL_ZEROL; - NOT_MATHERRL + NOT_MATHERRL { - ifSVID + ifSVID { WRITEL_ATAN2_ZERO_BY_ZERO; } ERRNO_DOMAIN; } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } 
case atan2_zero: @@ -1931,15 +2343,15 @@ else { DOMAIND; NAMED = (char *) "atan2"; RETVAL_ZEROD; - NOT_MATHERRD + NOT_MATHERRD { - ifSVID - { + ifSVID + { WRITED_ATAN2_ZERO_BY_ZERO; } ERRNO_DOMAIN; } - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; } case atan2f_zero: @@ -1947,13 +2359,59 @@ else { DOMAINF; NAMEF = (char *) "atan2f"; RETVAL_ZEROF; - NOT_MATHERRF - ifSVID + NOT_MATHERRF + ifSVID { WRITEF_ATAN2_ZERO_BY_ZERO; } ERRNO_DOMAIN; - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; + break; + } + case atan2dl_zero: + /* atan2dl(0.0,0.0) */ + { + DOMAINL; NAMEL = (char *) "atan2dl"; + RETVAL_ZEROL; + NOT_MATHERRL + { + ifSVID + { + WRITEL_ATAN2D_ZERO_BY_ZERO; + } + ERRNO_DOMAIN; + } + *(long double *)retval = excl.retval; + break; + } + case atan2d_zero: + /* atan2d(0.0,0.0) */ + { + DOMAIND; NAMED = (char *) "atan2d"; + RETVAL_ZEROD; + NOT_MATHERRD + { + ifSVID + { + WRITED_ATAN2D_ZERO_BY_ZERO; + } + ERRNO_DOMAIN; + } + *(double *)retval = exc.retval; + break; + } + case atan2df_zero: + /* atan2df(0.0,0.0) */ + { + DOMAINF; NAMEF = (char *) "atan2df"; + RETVAL_ZEROF; + NOT_MATHERRF + ifSVID + { + WRITEF_ATAN2D_ZERO_BY_ZERO; + } + ERRNO_DOMAIN; + *(float *)retval = excf.retval; break; } case expm1_overflow: @@ -1990,8 +2448,8 @@ else UNDERFLOWL; NAMEL = (char *) "scalbl"; if (INPUT_XL < 0.0L) RETVAL_NEG_ZEROL; else RETVAL_ZEROL; - NOT_MATHERRL {ERRNO_RANGE;} - *(long double *)retval = excf.retval; + NOT_MATHERRL {ERRNO_RANGE;} + *(long double *)retval = excl.retval; break; } case scalb_underflow: @@ -2000,8 +2458,8 @@ else UNDERFLOWD; NAMED = (char *) "scalb"; if (INPUT_XD < 0.0) RETVAL_NEG_ZEROD; else RETVAL_ZEROD; - NOT_MATHERRD {ERRNO_RANGE;} - *(double *)retval = exc.retval; + NOT_MATHERRD {ERRNO_RANGE;} + *(double *)retval = exc.retval; break; } case scalbf_underflow: @@ -2010,8 +2468,8 @@ else UNDERFLOWF; NAMEF = (char *) "scalbf"; if (INPUT_XF < 0.0) RETVAL_NEG_ZEROF; else RETVAL_ZEROF; - NOT_MATHERRF 
{ERRNO_RANGE;} - *(float *)retval = excf.retval; + NOT_MATHERRF {ERRNO_RANGE;} + *(float *)retval = excf.retval; break; } case scalbl_overflow: @@ -2020,8 +2478,8 @@ else OVERFLOWL; NAMEL = (char *) "scalbl"; if (INPUT_XL < 0) RETVAL_NEG_HUGE_VALL; else RETVAL_HUGE_VALL; - NOT_MATHERRL {ERRNO_RANGE;} - *(long double *)retval = excl.retval; + NOT_MATHERRL {ERRNO_RANGE;} + *(long double *)retval = excl.retval; break; } case scalb_overflow: @@ -2030,8 +2488,8 @@ else OVERFLOWD; NAMED = (char *) "scalb"; if (INPUT_XD < 0) RETVAL_NEG_HUGE_VALD; else RETVAL_HUGE_VALD; - NOT_MATHERRD {ERRNO_RANGE;} - *(double *)retval = exc.retval; + NOT_MATHERRD {ERRNO_RANGE;} + *(double *)retval = exc.retval; break; } case scalbf_overflow: @@ -2040,8 +2498,8 @@ else OVERFLOWF; NAMEF = (char *) "scalbf"; if (INPUT_XF < 0) RETVAL_NEG_HUGE_VALF; else RETVAL_HUGE_VALF; - NOT_MATHERRF {ERRNO_RANGE;} - *(float *)retval = excf.retval; + NOT_MATHERRF {ERRNO_RANGE;} + *(float *)retval = excf.retval; break; } case hypotl_overflow: @@ -2049,7 +2507,7 @@ else { OVERFLOWL; NAMEL = (char *) "hypotl"; ifSVID - { + { RETVAL_HUGEL; } else @@ -2057,7 +2515,7 @@ else RETVAL_HUGE_VALL; } NOT_MATHERRL {ERRNO_RANGE;} - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case hypot_overflow: @@ -2065,7 +2523,7 @@ else { OVERFLOWD; NAMED = (char *) "hypot"; ifSVID - { + { RETVAL_HUGED; } else @@ -2073,14 +2531,14 @@ else RETVAL_HUGE_VALD; } NOT_MATHERRD {ERRNO_RANGE;} - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; } case hypotf_overflow: /* hypotf overflow */ - { + { OVERFLOWF; NAMEF = (char *) "hypotf"; - ifSVID + ifSVID { RETVAL_HUGEF; } @@ -2089,7 +2547,7 @@ else RETVAL_HUGE_VALF; } NOT_MATHERRF {ERRNO_RANGE;} - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; break; } case acosl_gt_one: @@ -2097,7 +2555,7 @@ else { DOMAINL; NAMEL = (char *) "acosl"; RETVAL_ZEROL; - ifSVID + ifSVID { NOT_MATHERRL { @@ -2117,7 +2575,7 @@ else { 
DOMAIND; NAMED = (char *) "acos"; RETVAL_ZEROD; - ifSVID + ifSVID { NOT_MATHERRD { @@ -2137,9 +2595,9 @@ else { DOMAINF; NAMEF = (char *) "acosf"; RETVAL_ZEROF; - ifSVID + ifSVID { - NOT_MATHERRF + NOT_MATHERRF { WRITEF_ACOS; ERRNO_DOMAIN; @@ -2148,8 +2606,8 @@ else else { NOT_MATHERRF {ERRNO_DOMAIN;} - } - *(float *)retval = excf.retval; + } + *(float *)retval = excf.retval; break; } case asinl_gt_one: @@ -2157,7 +2615,7 @@ else { DOMAINL; NAMEL = (char *) "asinl"; RETVAL_ZEROL; - ifSVID + ifSVID { NOT_MATHERRL { @@ -2177,7 +2635,7 @@ else { DOMAIND; NAMED = (char *) "asin"; RETVAL_ZEROD; - ifSVID + ifSVID { NOT_MATHERRD { @@ -2197,9 +2655,9 @@ else { DOMAINF; NAMEF = (char *) "asinf"; RETVAL_ZEROF; - ifSVID + ifSVID { - NOT_MATHERRF + NOT_MATHERRF { WRITEF_ASIN; ERRNO_DOMAIN; @@ -2208,8 +2666,128 @@ else else { NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case acosdl_gt_one: + /* acosdl(x > 1) */ + { + DOMAINL; NAMEL = (char *) "acosdl"; + RETVAL_ZEROL; + ifSVID + { + NOT_MATHERRL + { + WRITEL_ACOSD; + ERRNO_DOMAIN; + } } - *(float *)retval = excf.retval; + else + { + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = excl.retval; + break; + } + case acosd_gt_one: + /* acosd(x > 1) */ + { + DOMAIND; NAMED = (char *) "acosd"; + RETVAL_ZEROD; + ifSVID + { + NOT_MATHERRD + { + WRITED_ACOSD; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case acosdf_gt_one: + /* acosdf(x > 1) */ + { + DOMAINF; NAMEF = (char *) "acosdf"; + RETVAL_ZEROF; + ifSVID + { + NOT_MATHERRF + { + WRITEF_ACOSD; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; + break; + } + case asindl_gt_one: + /* asindl(x > 1) */ + { + DOMAINL; NAMEL = (char *) "asindl"; + RETVAL_ZEROL; + ifSVID + { + NOT_MATHERRL + { + WRITEL_ASIND; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRL {ERRNO_DOMAIN;} + } + *(long double *)retval = 
excl.retval; + break; + } + case asind_gt_one: + /* asind(x > 1) */ + { + DOMAIND; NAMED = (char *) "asind"; + RETVAL_ZEROD; + ifSVID + { + NOT_MATHERRD + { + WRITED_ASIND; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRD {ERRNO_DOMAIN;} + } + *(double *)retval = exc.retval; + break; + } + case asindf_gt_one: + /* asindf(x > 1) */ + { + DOMAINF; NAMEF = (char *) "asindf"; + RETVAL_ZEROF; + ifSVID + { + NOT_MATHERRF + { + WRITEF_ASIND; + ERRNO_DOMAIN; + } + } + else + { + NOT_MATHERRF {ERRNO_DOMAIN;} + } + *(float *)retval = excf.retval; break; } case coshl_overflow: @@ -2220,7 +2798,7 @@ else { RETVAL_HUGEL; } - else + else { RETVAL_HUGE_VALL; } @@ -2236,7 +2814,7 @@ else { RETVAL_HUGED; } - else + else { RETVAL_HUGE_VALD; } @@ -2252,7 +2830,7 @@ else { RETVAL_HUGEF; } - else + else { RETVAL_HUGE_VALF; } @@ -2269,7 +2847,7 @@ else if (INPUT_XL > 0.0) RETVAL_HUGEL; else RETVAL_NEG_HUGEL; } - else + else { if (INPUT_XL > 0.0) RETVAL_HUGE_VALL; else RETVAL_NEG_HUGE_VALL; @@ -2287,7 +2865,7 @@ else if (INPUT_XD > 0.0) RETVAL_HUGED; else RETVAL_NEG_HUGED; } - else + else { if (INPUT_XD > 0.0) RETVAL_HUGE_VALD; else RETVAL_NEG_HUGE_VALD; @@ -2305,7 +2883,7 @@ else if( INPUT_XF > 0.0) RETVAL_HUGEF; else RETVAL_NEG_HUGEF; } - else + else { if (INPUT_XF > 0.0) RETVAL_HUGE_VALF; else RETVAL_NEG_HUGE_VALF; @@ -2318,7 +2896,7 @@ else /* acoshl(x < 1) */ { DOMAINL; NAMEL = (char *) "acoshl"; - ifSVID + ifSVID { NOT_MATHERRL { @@ -2326,7 +2904,10 @@ else ERRNO_DOMAIN; } } - else NOT_MATHERRL {ERRNO_DOMAIN;} + else + { + NOT_MATHERRL {ERRNO_DOMAIN;} + } *(long double *)retval = excl.retval; break; } @@ -2334,7 +2915,7 @@ else /* acosh(x < 1) */ { DOMAIND; NAMED = (char *) "acosh"; - ifSVID + ifSVID { NOT_MATHERRD { @@ -2342,7 +2923,10 @@ else ERRNO_DOMAIN; } } - else NOT_MATHERRD {ERRNO_DOMAIN;} + else + { + NOT_MATHERRD {ERRNO_DOMAIN;} + } *(double *)retval = exc.retval; break; } @@ -2350,7 +2934,7 @@ else /* acoshf(x < 1) */ { DOMAINF; NAMEF = (char *) "acoshf"; - ifSVID + 
ifSVID { NOT_MATHERRF { @@ -2369,7 +2953,7 @@ else /* atanhl(|x| > 1) */ { DOMAINL; NAMEL = (char *) "atanhl"; - ifSVID + ifSVID { NOT_MATHERRL { @@ -2387,7 +2971,7 @@ else /* atanh(|x| > 1) */ { DOMAIND; NAMED = (char *) "atanh"; - ifSVID + ifSVID { NOT_MATHERRD { @@ -2405,7 +2989,7 @@ else /* atanhf(|x| > 1) */ { DOMAINF; NAMEF = (char *) "atanhf"; - ifSVID + ifSVID { NOT_MATHERRF { @@ -2422,8 +3006,8 @@ else case atanhl_eq_one: /* atanhl(|x| == 1) */ { - SINGL; NAMEL = (char *)"atanhl"; - ifSVID + SINGL; NAMEL = (char *) "atanhl"; + ifSVID { NOT_MATHERRL { @@ -2441,7 +3025,7 @@ else /* atanh(|x| == 1) */ { SINGD; NAMED = (char *) "atanh"; - ifSVID + ifSVID { NOT_MATHERRD { @@ -2459,7 +3043,7 @@ else /* atanhf(|x| == 1) */ { SINGF; NAMEF = (char *) "atanhf"; - ifSVID + ifSVID { NOT_MATHERRF { @@ -2477,7 +3061,7 @@ else /* gammal overflow */ { OVERFLOWL; NAMEL = (char *) "gammal"; - ifSVID + ifSVID { RETVAL_HUGEL; } @@ -2485,15 +3069,15 @@ else { RETVAL_HUGE_VALL; } - NOT_MATHERRL {ERRNO_RANGE;} - *(long double *)retval = excl.retval; + NOT_MATHERRL{ERRNO_RANGE;} + *(long double*)retval = excl.retval; break; } case gamma_overflow: /* gamma overflow */ { OVERFLOWD; NAMED = (char *) "gamma"; - ifSVID + ifSVID { RETVAL_HUGED; } @@ -2501,31 +3085,94 @@ else { RETVAL_HUGE_VALD; } - NOT_MATHERRD {ERRNO_RANGE;} - *(double *)retval = exc.retval; + NOT_MATHERRD{ERRNO_RANGE;} + *(double*)retval = exc.retval; break; } case gammaf_overflow: /* gammaf overflow */ { OVERFLOWF; NAMEF = (char *) "gammaf"; + ifSVID + { + RETVAL_HUGEF; + } + else + { + RETVAL_HUGE_VALF; + } + NOT_MATHERRF{ERRNO_RANGE;} + *(float*)retval = excf.retval; + break; + } + case gammal_negative: + /* gammal -int or 0 */ + { + SINGL; NAMEL = (char *) "gammal"; ifSVID { + RETVAL_HUGEL; + NOT_MATHERRL + { + WRITEL_GAMMA_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_HUGE_VALL; + NOT_MATHERRL{ERRNO_DOMAIN;} + } + *(long double*)retval = excl.retval; + break; + } + case gamma_negative: + /* gamma -int or 
0 */ + { + SINGD; NAMED = (char *) "gamma"; + ifSVID + { + RETVAL_HUGED; + NOT_MATHERRD + { + WRITED_GAMMA_NEGATIVE; + ERRNO_DOMAIN; + } + } + else + { + RETVAL_HUGE_VALD; + NOT_MATHERRD{ERRNO_DOMAIN;} + } + *(double*)retval = exc.retval; + break; + } + case gammaf_negative: + /* gammaf -int or 0 */ + { + SINGF; NAMEF = (char *) "gammaf"; + ifSVID + { RETVAL_HUGEF; + NOT_MATHERRF + { + WRITEF_GAMMA_NEGATIVE; + ERRNO_DOMAIN; + } } else { RETVAL_HUGE_VALF; + NOT_MATHERRF{ERRNO_DOMAIN;} } - NOT_MATHERRF {ERRNO_RANGE;} - *(float *)retval = excf.retval; + *(float*)retval = excf.retval; break; } case lgammal_overflow: /* lgammal overflow */ { OVERFLOWL; NAMEL = (char *) "lgammal"; - ifSVID + ifSVID { RETVAL_HUGEL; } @@ -2533,15 +3180,15 @@ else { RETVAL_HUGE_VALL; } - NOT_MATHERRL {ERRNO_RANGE;} - *(long double *)retval = excl.retval; + NOT_MATHERRL{ERRNO_RANGE;} + *(long double*)retval = excl.retval; break; } case lgamma_overflow: /* lgamma overflow */ { OVERFLOWD; NAMED = (char *) "lgamma"; - ifSVID + ifSVID { RETVAL_HUGED; } @@ -2549,15 +3196,15 @@ else { RETVAL_HUGE_VALD; } - NOT_MATHERRD {ERRNO_RANGE;} - *(double *)retval = exc.retval; + NOT_MATHERRD{ERRNO_RANGE;} + *(double*)retval = exc.retval; break; } case lgammaf_overflow: /* lgammaf overflow */ { OVERFLOWF; NAMEF = (char *) "lgammaf"; - ifSVID + ifSVID { RETVAL_HUGEF; } @@ -2565,8 +3212,8 @@ else { RETVAL_HUGE_VALF; } - NOT_MATHERRF {ERRNO_RANGE;} - *(float *)retval = excf.retval; + NOT_MATHERRF{ERRNO_RANGE;} + *(float*)retval = excf.retval; break; } case lgammal_negative: @@ -2578,16 +3225,16 @@ else RETVAL_HUGEL; NOT_MATHERRL { - WRITEL_LGAMMA_NEGATIVE; - ERRNO_DOMAIN; + WRITEL_GAMMA_NEGATIVE; + ERRNO_DOMAIN; } } else { RETVAL_HUGE_VALL; - NOT_MATHERRL {ERRNO_DOMAIN;} + NOT_MATHERRL{ERRNO_DOMAIN;} } - *(long double *)retval = excl.retval; + *(long double*)retval = excl.retval; break; } case lgamma_negative: @@ -2606,16 +3253,16 @@ else else { RETVAL_HUGE_VALD; - NOT_MATHERRD {ERRNO_DOMAIN;} + 
NOT_MATHERRD{ERRNO_DOMAIN;} } - *(double *)retval = exc.retval; + *(double*)retval = exc.retval; break; } case lgammaf_negative: /* lgammaf -int or 0 */ { SINGF; NAMEF = (char *) "lgammaf"; - ifSVID + ifSVID { RETVAL_HUGEF; NOT_MATHERRF @@ -2627,72 +3274,114 @@ else else { RETVAL_HUGE_VALF; - NOT_MATHERRF {ERRNO_DOMAIN;} + NOT_MATHERRF{ERRNO_DOMAIN;} } - *(float *)retval = excf.retval; + *(float*)retval = excf.retval; break; } - case gammal_negative: - /* gammal -int or 0 */ + case tgammal_overflow: + /* tgammal overflow */ { - SINGL; NAMEL = (char *) "gammal"; - ifSVID + OVERFLOWL; NAMEL = (char *) "tgammal"; + ifSVID { RETVAL_HUGEL; + } + else + { + RETVAL_HUGE_VALL; + } + NOT_MATHERRL{ERRNO_RANGE;} + *(long double*)retval = excl.retval; + break; + } + case tgamma_overflow: + /* tgamma overflow */ + { + OVERFLOWD; NAMED = (char *) "tgamma"; + ifSVID + { + RETVAL_HUGED; + } + else + { + RETVAL_HUGE_VALD; + } + NOT_MATHERRD{ERRNO_RANGE;} + *(double*)retval = exc.retval; + break; + } + case tgammaf_overflow: + /* tgammaf overflow */ + { + OVERFLOWF; NAMEF = (char *) "tgammaf"; + ifSVID + { + RETVAL_HUGEF; + } + else + { + RETVAL_HUGE_VALF; + } + NOT_MATHERRF{ERRNO_RANGE;} + *(float*)retval = excf.retval; + break; + } + case tgammal_negative: + /* tgammal -int or 0 */ + { + SINGL; NAMEL = (char *) "tgammal"; + ifSVID + { NOT_MATHERRL { - WRITEL_GAMMA_NEGATIVE; - ERRNO_DOMAIN; + WRITEL_TGAMMA_NEGATIVE; + ERRNO_DOMAIN; } } else { - RETVAL_HUGE_VALL; - NOT_MATHERRL {ERRNO_DOMAIN;} + NOT_MATHERRL{ERRNO_DOMAIN;} } - *(long double *)retval = excl.retval; + *(long double*)retval = excl.retval; break; } - case gamma_negative: - /* gamma -int or 0 */ + case tgamma_negative: + /* tgamma -int or 0 */ { - SINGD; NAMED = (char *) "gamma"; - ifSVID + SINGD; NAMED = (char *) "tgamma"; + ifSVID { - RETVAL_HUGED; NOT_MATHERRD { - WRITED_GAMMA_NEGATIVE; - ERRNO_DOMAIN; + WRITED_TGAMMA_NEGATIVE; + ERRNO_DOMAIN; } } else { - RETVAL_HUGE_VALD; - NOT_MATHERRD {ERRNO_DOMAIN;} + 
NOT_MATHERRD{ERRNO_DOMAIN;} } - *(double *)retval = exc.retval; + *(double*)retval = exc.retval; break; } - case gammaf_negative: - /* gammaf -int or 0 */ + case tgammaf_negative: + /* tgammaf -int or 0 */ { - SINGF; NAMEF = (char *) "gammaf"; - ifSVID + SINGF; NAMEF = (char *) "tgammaf"; + ifSVID { - RETVAL_HUGEF; NOT_MATHERRF { - WRITEF_GAMMA_NEGATIVE; - ERRNO_DOMAIN; + WRITEF_TGAMMA_NEGATIVE; + ERRNO_DOMAIN; } } else { - RETVAL_HUGE_VALF; - NOT_MATHERRF {ERRNO_DOMAIN;} + NOT_MATHERRF{ERRNO_DOMAIN;} } - *(float *)retval = excf.retval; + *(float*)retval = excf.retval; break; } case j0l_gt_loss: @@ -2700,7 +3389,7 @@ else { TLOSSL; NAMEL = (char *) "j0l"; RETVAL_ZEROL; - ifSVID + ifSVID { NOT_MATHERRL { @@ -2712,7 +3401,7 @@ else { NOT_MATHERRL {ERRNO_RANGE;} } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case j0_gt_loss: @@ -2720,7 +3409,7 @@ else { TLOSSD; NAMED = (char *) "j0"; RETVAL_ZEROD; - ifSVID + ifSVID { NOT_MATHERRD { @@ -2732,7 +3421,7 @@ else { NOT_MATHERRD {ERRNO_RANGE;} } - *(double*)retval = exc.retval; + *(double*)retval = exc.retval; break; } case j0f_gt_loss: @@ -2740,7 +3429,7 @@ else { TLOSSF; NAMEF = (char *) "j0f"; RETVAL_ZEROF; - ifSVID + ifSVID { NOT_MATHERRF { @@ -2760,7 +3449,7 @@ else { TLOSSL; NAMEL = (char *) "j1l"; RETVAL_ZEROL; - ifSVID + ifSVID { NOT_MATHERRL { @@ -2772,7 +3461,7 @@ else { NOT_MATHERRL {ERRNO_RANGE;} } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case j1_gt_loss: @@ -2780,7 +3469,7 @@ else { TLOSSD; NAMED = (char *) "j1"; RETVAL_ZEROD; - ifSVID + ifSVID { NOT_MATHERRD { @@ -2792,7 +3481,7 @@ else { NOT_MATHERRD {ERRNO_RANGE;} } - *(double*)retval = exc.retval; + *(double*)retval = exc.retval; break; } case j1f_gt_loss: @@ -2800,7 +3489,7 @@ else { TLOSSF; NAMEF = (char *) "j1f"; RETVAL_ZEROF; - ifSVID + ifSVID { NOT_MATHERRF { @@ -2820,7 +3509,7 @@ else { TLOSSL; NAMEL = (char *) "jnl"; RETVAL_ZEROL; - ifSVID + ifSVID { 
NOT_MATHERRL { @@ -2832,7 +3521,7 @@ else { NOT_MATHERRL {ERRNO_RANGE;} } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case jn_gt_loss: @@ -2840,7 +3529,7 @@ else { TLOSSD; NAMED = (char *) "jn"; RETVAL_ZEROD; - ifSVID + ifSVID { NOT_MATHERRD { @@ -2852,7 +3541,7 @@ else { NOT_MATHERRD {ERRNO_RANGE;} } - *(double*)retval = exc.retval; + *(double*)retval = exc.retval; break; } case jnf_gt_loss: @@ -2860,7 +3549,7 @@ else { TLOSSF; NAMEF = (char *) "jnf"; RETVAL_ZEROF; - ifSVID + ifSVID { NOT_MATHERRF { @@ -2880,7 +3569,7 @@ else { TLOSSL; NAMEL = (char *) "y0l"; RETVAL_ZEROL; - ifSVID + ifSVID { NOT_MATHERRL { @@ -2900,7 +3589,7 @@ else { TLOSSD; NAMED = (char *) "y0"; RETVAL_ZEROD; - ifSVID + ifSVID { NOT_MATHERRD { @@ -2920,7 +3609,7 @@ else { TLOSSF; NAMEF = (char *) "y0f"; RETVAL_ZEROF; - ifSVID + ifSVID { NOT_MATHERRF { @@ -2939,10 +3628,10 @@ else /* y0l(0) */ { DOMAINL; NAMEL = (char *) "y0l"; - ifSVID + ifSVID { RETVAL_NEG_HUGEL; - NOT_MATHERRL + NOT_MATHERRL { WRITEL_Y0_ZERO; ERRNO_DOMAIN; @@ -2950,20 +3639,20 @@ else } else { - RETVAL_NEG_HUGE_VALL; + RETVAL_NEG_HUGE_VALL; NOT_MATHERRL {ERRNO_DOMAIN;} } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case y0_zero: /* y0(0) */ { DOMAIND; NAMED = (char *) "y0"; - ifSVID + ifSVID { RETVAL_NEG_HUGED; - NOT_MATHERRD + NOT_MATHERRD { WRITED_Y0_ZERO; ERRNO_DOMAIN; @@ -2971,20 +3660,20 @@ else } else { - RETVAL_NEG_HUGE_VALD; + RETVAL_NEG_HUGE_VALD; NOT_MATHERRD {ERRNO_DOMAIN;} } - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; } case y0f_zero: /* y0f(0) */ { DOMAINF; NAMEF = (char *) "y0f"; - ifSVID + ifSVID { RETVAL_NEG_HUGEF; - NOT_MATHERRF + NOT_MATHERRF { WRITEF_Y0_ZERO; ERRNO_DOMAIN; @@ -2992,10 +3681,10 @@ else } else { - RETVAL_NEG_HUGE_VALF; + RETVAL_NEG_HUGE_VALF; NOT_MATHERRF {ERRNO_DOMAIN;} } - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; break; } case y1l_gt_loss: @@ 
-3003,7 +3692,7 @@ else { TLOSSL; NAMEL = (char *) "y1l"; RETVAL_ZEROL; - ifSVID + ifSVID { NOT_MATHERRL { @@ -3023,7 +3712,7 @@ else { TLOSSD; NAMED = (char *) "y1"; RETVAL_ZEROD; - ifSVID + ifSVID { NOT_MATHERRD { @@ -3043,7 +3732,7 @@ else { TLOSSF; NAMEF = (char *) "y1f"; RETVAL_ZEROF; - ifSVID + ifSVID { NOT_MATHERRF { @@ -3062,10 +3751,10 @@ else /* y1l(0) */ { DOMAINL; NAMEL = (char *) "y1l"; - ifSVID + ifSVID { RETVAL_NEG_HUGEL; - NOT_MATHERRL + NOT_MATHERRL { WRITEL_Y1_ZERO; ERRNO_DOMAIN; @@ -3073,20 +3762,20 @@ else } else { - RETVAL_NEG_HUGE_VALL; + RETVAL_NEG_HUGE_VALL; NOT_MATHERRL {ERRNO_DOMAIN;} } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case y1_zero: /* y1(0) */ { DOMAIND; NAMED = (char *) "y1"; - ifSVID + ifSVID { RETVAL_NEG_HUGED; - NOT_MATHERRD + NOT_MATHERRD { WRITED_Y1_ZERO; ERRNO_DOMAIN; @@ -3094,30 +3783,30 @@ else } else { - RETVAL_NEG_HUGE_VALD; + RETVAL_NEG_HUGE_VALD; NOT_MATHERRD {ERRNO_DOMAIN;} } - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; } case y1f_zero: /* y1f(0) */ { DOMAINF; NAMEF = (char *) "y1f"; - ifSVID + ifSVID { RETVAL_NEG_HUGEF; - NOT_MATHERRF + NOT_MATHERRF { WRITEF_Y1_ZERO; ERRNO_DOMAIN; } }else { - RETVAL_NEG_HUGE_VALF; + RETVAL_NEG_HUGE_VALF; NOT_MATHERRF {ERRNO_DOMAIN;} } - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; break; } case ynl_gt_loss: @@ -3125,7 +3814,7 @@ else { TLOSSL; NAMEL = (char *) "ynl"; RETVAL_ZEROL; - ifSVID + ifSVID { NOT_MATHERRL { @@ -3145,7 +3834,7 @@ else { TLOSSD; NAMED = (char *) "yn"; RETVAL_ZEROD; - ifSVID + ifSVID { NOT_MATHERRD { @@ -3165,7 +3854,7 @@ else { TLOSSF; NAMEF = (char *) "ynf"; RETVAL_ZEROF; - ifSVID + ifSVID { NOT_MATHERRF { @@ -3184,10 +3873,10 @@ else /* ynl(0) */ { DOMAINL; NAMEL = (char *) "ynl"; - ifSVID + ifSVID { RETVAL_NEG_HUGEL; - NOT_MATHERRL + NOT_MATHERRL { WRITEL_YN_ZERO; ERRNO_DOMAIN; @@ -3195,20 +3884,20 @@ else } else { - RETVAL_NEG_HUGE_VALL; + 
RETVAL_NEG_HUGE_VALL; NOT_MATHERRL {ERRNO_DOMAIN;} } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case yn_zero: /* yn(0) */ { DOMAIND; NAMED = (char *) "yn"; - ifSVID + ifSVID { RETVAL_NEG_HUGED; - NOT_MATHERRD + NOT_MATHERRD { WRITED_YN_ZERO; ERRNO_DOMAIN; @@ -3216,20 +3905,20 @@ else } else { - RETVAL_NEG_HUGE_VALD; + RETVAL_NEG_HUGE_VALD; NOT_MATHERRD {ERRNO_DOMAIN;} } - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; } case ynf_zero: /* ynf(0) */ { DOMAINF; NAMEF = (char *) "ynf"; - ifSVID + ifSVID { RETVAL_NEG_HUGEF; - NOT_MATHERRF + NOT_MATHERRF { WRITEF_YN_ZERO; ERRNO_DOMAIN; @@ -3237,20 +3926,20 @@ else } else { - RETVAL_NEG_HUGE_VALF; + RETVAL_NEG_HUGE_VALF; NOT_MATHERRF {ERRNO_DOMAIN;} } - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; break; } case y0l_negative: /* y0l(x<0) */ { DOMAINL; NAMEL = (char *) "y0l"; - ifSVID + ifSVID { RETVAL_NEG_HUGEL; - NOT_MATHERRL + NOT_MATHERRL { WRITEL_Y0_NEGATIVE; ERRNO_DOMAIN; @@ -3258,20 +3947,20 @@ else } else { - RETVAL_NEG_HUGE_VALL; + RETVAL_NEG_HUGE_VALL; NOT_MATHERRL {ERRNO_DOMAIN;} } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case y0_negative: /* y0(x<0) */ { DOMAIND; NAMED = (char *) "y0"; - ifSVID + ifSVID { RETVAL_NEG_HUGED; - NOT_MATHERRD + NOT_MATHERRD { WRITED_Y0_NEGATIVE; ERRNO_DOMAIN; @@ -3279,20 +3968,20 @@ else } else { - RETVAL_NEG_HUGE_VALD; + RETVAL_NEG_HUGE_VALD; NOT_MATHERRD {ERRNO_DOMAIN;} } - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; } case y0f_negative: /* y0f(x<0) */ { DOMAINF; NAMEF = (char *) "y0f"; - ifSVID + ifSVID { RETVAL_NEG_HUGEF; - NOT_MATHERRF + NOT_MATHERRF { WRITEF_Y0_NEGATIVE; ERRNO_DOMAIN; @@ -3300,20 +3989,20 @@ else } else { - RETVAL_NEG_HUGE_VALF; + RETVAL_NEG_HUGE_VALF; NOT_MATHERRF {ERRNO_DOMAIN;} } - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; break; } case y1l_negative: /* y1l(x<0) */ { 
DOMAINL; NAMEL = (char *) "y1l"; - ifSVID + ifSVID { RETVAL_NEG_HUGEL; - NOT_MATHERRL + NOT_MATHERRL { WRITEL_Y1_NEGATIVE; ERRNO_DOMAIN; @@ -3321,20 +4010,20 @@ else } else { - RETVAL_NEG_HUGE_VALL; + RETVAL_NEG_HUGE_VALL; NOT_MATHERRL {ERRNO_DOMAIN;} } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case y1_negative: /* y1(x<0) */ { DOMAIND; NAMED = (char *) "y1"; - ifSVID + ifSVID { RETVAL_NEG_HUGED; - NOT_MATHERRD + NOT_MATHERRD { WRITED_Y1_NEGATIUE; ERRNO_DOMAIN; @@ -3342,20 +4031,20 @@ else } else { - RETVAL_NEG_HUGE_VALD; + RETVAL_NEG_HUGE_VALD; NOT_MATHERRD {ERRNO_DOMAIN;} } - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; } case y1f_negative: /* y1f(x<0) */ { DOMAINF; NAMEF = (char *) "y1f"; - ifSVID + ifSVID { RETVAL_NEG_HUGEF; - NOT_MATHERRF + NOT_MATHERRF { WRITEF_Y1_NEGATIVE; ERRNO_DOMAIN; @@ -3363,20 +4052,20 @@ else } else { - RETVAL_NEG_HUGE_VALF; + RETVAL_NEG_HUGE_VALF; NOT_MATHERRF {ERRNO_DOMAIN;} } - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; break; } case ynl_negative: /* ynl(x<0) */ { DOMAINL; NAMEL = (char *) "ynl"; - ifSVID + ifSVID { RETVAL_NEG_HUGEL; - NOT_MATHERRL + NOT_MATHERRL { WRITEL_YN_NEGATIVE; ERRNO_DOMAIN; @@ -3384,20 +4073,20 @@ else } else { - RETVAL_NEG_HUGE_VALL; + RETVAL_NEG_HUGE_VALL; NOT_MATHERRL {ERRNO_DOMAIN;} } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } case yn_negative: /* yn(x<0) */ { DOMAIND; NAMED = (char *) "yn"; - ifSVID + ifSVID { RETVAL_NEG_HUGED; - NOT_MATHERRD + NOT_MATHERRD { WRITED_YN_NEGATIVE; ERRNO_DOMAIN; @@ -3405,20 +4094,20 @@ else } else { - RETVAL_NEG_HUGE_VALD; + RETVAL_NEG_HUGE_VALD; NOT_MATHERRD {ERRNO_DOMAIN;} } - *(double *)retval = exc.retval; + *(double *)retval = exc.retval; break; } case ynf_negative: /* ynf(x<0) */ { DOMAINF; NAMEF = (char *) "ynf"; - ifSVID + ifSVID { RETVAL_NEG_HUGEF; - NOT_MATHERRF + NOT_MATHERRF { WRITEF_YN_NEGATIVE; ERRNO_DOMAIN; @@ 
-3426,18 +4115,18 @@ else } else { - RETVAL_NEG_HUGE_VALF; + RETVAL_NEG_HUGE_VALF; NOT_MATHERRF {ERRNO_DOMAIN;} } - *(float *)retval = excf.retval; + *(float *)retval = excf.retval; break; } - case fmodl_by_zero: + case fmodl_by_zero: /* fmodl(x,0) */ { DOMAINL; NAMEL = (char *) "fmodl"; - ifSVID - { + ifSVID + { *(long double *)retval = *(long double *)arg1; NOT_MATHERRL { @@ -3445,21 +4134,21 @@ else ERRNO_DOMAIN; } } - else + else { /* NaN already computed */ NOT_MATHERRL {ERRNO_DOMAIN;} } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } - case fmod_by_zero: + case fmod_by_zero: /* fmod(x,0) */ { DOMAIND; NAMED = (char *) "fmod"; - ifSVID + ifSVID { *(double *)retval = *(double *)arg1; - NOT_MATHERRD + NOT_MATHERRD { WRITED_FMOD; ERRNO_DOMAIN; @@ -3468,18 +4157,18 @@ else else { /* NaN already computed */ NOT_MATHERRD {ERRNO_DOMAIN;} - } - *(double *)retval = exc.retval; + } + *(double *)retval = exc.retval; break; } - case fmodf_by_zero: + case fmodf_by_zero: /* fmodf(x,0) */ { DOMAINF; NAMEF = (char *) "fmodf"; - ifSVID + ifSVID { *(float *)retval = *(float *)arg1; - NOT_MATHERRF + NOT_MATHERRF { WRITEF_FMOD; ERRNO_DOMAIN; @@ -3488,36 +4177,36 @@ else else { NOT_MATHERRF {ERRNO_DOMAIN;} - } - *(float *)retval = excf.retval; + } + *(float *)retval = excf.retval; break; } - case remainderl_by_zero: + case remainderl_by_zero: /* remainderl(x,0) */ { DOMAINL; NAMEL = (char *) "remainderl"; - ifSVID - { + ifSVID + { NOT_MATHERRL { WRITEL_REM; ERRNO_DOMAIN; } } - else + else { /* NaN already computed */ NOT_MATHERRL {ERRNO_DOMAIN;} } - *(long double *)retval = excl.retval; + *(long double *)retval = excl.retval; break; } - case remainder_by_zero: + case remainder_by_zero: /* remainder(x,0) */ { DOMAIND; NAMED = (char *) "remainder"; - ifSVID + ifSVID { - NOT_MATHERRD + NOT_MATHERRD { WRITED_REM; ERRNO_DOMAIN; @@ -3526,17 +4215,17 @@ else else { /* NaN already computed */ NOT_MATHERRD {ERRNO_DOMAIN;} - } - *(double *)retval = 
exc.retval; + } + *(double *)retval = exc.retval; break; } - case remainderf_by_zero: + case remainderf_by_zero: /* remainderf(x,0) */ { DOMAINF; NAMEF = (char *) "remainderf"; - ifSVID + ifSVID { - NOT_MATHERRF + NOT_MATHERRF { WRITEF_REM; ERRNO_DOMAIN; @@ -3545,12 +4234,14 @@ else else { NOT_MATHERRF {ERRNO_DOMAIN;} - } - *(float *)retval = excf.retval; + } + *(float *)retval = excf.retval; break; } default: - abort(); + /* We don't want to abort () since SVID doesn't cover all math + library functions. */ + break; } return; } diff --git a/sysdeps/ia64/fpu/libm_reduce.S b/sysdeps/ia64/fpu/libm_reduce.S index 1c7f4e1e88..8bdf91d6de 100644 --- a/sysdeps/ia64/fpu/libm_reduce.S +++ b/sysdeps/ia64/fpu/libm_reduce.S @@ -1,10 +1,10 @@ .file "libm_reduce.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,304 +20,310 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // -// History: 02/02/00 Initial Version +// History: +// 02/02/00 Initial Version +// 05/13/02 Rescheduled for speed, changed interface to pass +// parameters in fp registers +// 02/10/03 Reordered header: .section, .global, .proc, .align; +// used data8 for long double data storage // -// ********************************************************************* -// ********************************************************************* +//********************************************************************* +//********************************************************************* // // Function: __libm_pi_by_two_reduce(x) return r, c, and N where // x = N * pi/4 + (r+c) , where |r+c| <= pi/4. // This function is not designed to be used by the // general user. 
// -// ********************************************************************* +//********************************************************************* // // Accuracy: Returns double-precision values // -// ********************************************************************* +//********************************************************************* // // Resources Used: // -// Floating-Point Registers: f32-f70 +// Floating-Point Registers: +// f8 = Input x, return value r +// f9 = return value c +// f32-f70 // // General Purpose Registers: // r8 = return value N -// r32 = Address of x -// r33 = Address of where to place r and then c // r34-r64 // // Predicate Registers: p6-p14 // -// ********************************************************************* +//********************************************************************* // // IEEE Special Conditions: // -// No condions should be raised. +// No condions should be raised. // -// ********************************************************************* +//********************************************************************* // // I. Introduction // =============== // // For the forward trigonometric functions sin, cos, sincos, and -// tan, the original algorithms for IA 64 handle arguments up to +// tan, the original algorithms for IA 64 handle arguments up to // 1 ulp less than 2^63 in magnitude. For double-extended arguments x, -// |x| >= 2^63, this routine returns CASE, N and r_hi, r_lo where -// +// |x| >= 2^63, this routine returns N and r_hi, r_lo where +// // x is accurately approximated by // 2*K*pi + N * pi/2 + r_hi + r_lo, |r_hi+r_lo| <= pi/4. // CASE = 1 or 2. // CASE is 1 unless |r_hi + r_lo| < 2^(-33). -// +// // The exact value of K is not determined, but that information is // not required in trigonometric function computations. -// -// We first assume the argument x in question satisfies x >= 2^(63). +// +// We first assume the argument x in question satisfies x >= 2^(63). 
// In particular, it is positive. Negative x can be handled by symmetry: -// +// // -x is accurately approximated by // -2*K*pi + (-N) * pi/2 - (r_hi + r_lo), |r_hi+r_lo| <= pi/4. -// +// // The idea of the reduction is that -// -// x * 2/pi = N_big + N + f, |f| <= 1/2 -// +// +// x * 2/pi = N_big + N + f, |f| <= 1/2 +// // Moreover, for double extended x, |f| >= 2^(-75). (This is an // non-obvious fact found by enumeration using a special algorithm -// involving continued fraction.) The algorithm described below +// involving continued fraction.) The algorithm described below // calculates N and an accurate approximation of f. -// -// Roughly speaking, an appropriate 256-bit (4 X 64) portion of +// +// Roughly speaking, an appropriate 256-bit (4 X 64) portion of // 2/pi is multiplied with x to give the desired information. -// +// // II. Representation of 2/PI // ========================== -// +// // The value of 2/pi in binary fixed-point is -// +// // .101000101111100110...... -// +// // We store 2/pi in a table, starting at the position corresponding -// to bit position 63 -// +// to bit position 63 +// // bit position 63 62 ... 0 -1 -2 -3 -4 -5 -6 -7 .... -16576 -// -// 0 0 ... 0 . 1 0 1 0 1 0 1 .... X -// +// +// 0 0 ... 0 . 1 0 1 0 1 0 1 .... X +// // ^ -// |__ implied binary pt -// +// |__ implied binary pt +// // III. Algorithm // ============== -// +// // This describes the algorithm in the most natural way using -// unsigned interger multiplication. The implementation section +// unsigned interger multiplication. The implementation section // describes how the integer arithmetic is simulated. -// +// // STEP 0. Initialization // ---------------------- -// -// Let the input argument x be -// +// +// Let the input argument x be +// // x = 2^m * ( 1. b_1 b_2 b_3 ... b_63 ), 63 <= m <= 16383. -// -// The first crucial step is to fetch four 64-bit portions of 2/pi. +// +// The first crucial step is to fetch four 64-bit portions of 2/pi. 
// To fulfill this goal, we calculate the bit position L of the // beginning of this 256-bit quantity by -// +// // L := 62 - m. -// -// Note that -16321 <= L <= -1 because 63 <= m <= 16383; and that +// +// Note that -16321 <= L <= -1 because 63 <= m <= 16383; and that // the storage of 2/pi is adequate. -// +// // Fetch P_1, P_2, P_3, P_4 beginning at bit position L thus: -// +// // bit position L L-1 L-2 ... L-63 -// +// // P_1 = b b b ... b -// +// // each b can be 0 or 1. Also, let P_0 be the two bits corresponding to // bit positions L+2 and L+1. So, when each of the P_j is interpreted // with appropriate scaling, we have // // 2/pi = P_big + P_0 + (P_1 + P_2 + P_3 + P_4) + P_small -// +// // Note that P_big and P_small can be ignored. The reasons are as follows. // First, consider P_big. If P_big = 0, we can certainly ignore it. -// Otherwise, P_big >= 2^(L+3). Now, -// +// Otherwise, P_big >= 2^(L+3). Now, +// // P_big * ulp(x) >= 2^(L+3) * 2^(m-63) -// >= 2^(65-m + m-63 ) -// >= 2^2 -// +// >= 2^(65-m + m-63 ) +// >= 2^2 +// // Thus, P_big * x is an integer of the form 4*K. So -// -// x = 4*K * (pi/2) + x*(P_0 + P_1 + P_2 + P_3 + P_4)*(pi/2) +// +// x = 4*K * (pi/2) + x*(P_0 + P_1 + P_2 + P_3 + P_4)*(pi/2) // + x*P_small*(pi/2). -// +// // Hence, P_big*x corresponds to information that can be ignored for // trigonometric function evaluation. -// +// // Next, we must estimate the effect of ignoring P_small. The absolute // error made by ignoring P_small is bounded by -// +// // |P_small * x| <= ulp(P_4) * x -// <= 2^(L-255) * 2^(m+1) -// <= 2^(62-m-255 + m + 1) -// <= 2^(-192) -// -// Since for double-extended precision, x * 2/pi = integer + f, +// <= 2^(L-255) * 2^(m+1) +// <= 2^(62-m-255 + m + 1) +// <= 2^(-192) +// +// Since for double-extended precision, x * 2/pi = integer + f, // 0.5 >= |f| >= 2^(-75), the relative error introduced by ignoring // P_small is bounded by 2^(-192+75) <= 2^(-117), which is acceptable.
-// +// // Further note that if x is split into x_hi + x_lo where x_lo is the // two bits corresponding to bit positions 2^(m-62) and 2^(m-63); then -// -// P_0 * x_hi -// +// +// P_0 * x_hi +// // is also an integer of the form 4*K; and thus can also be ignored. // Let M := P_0 * x_lo which is a small integer. The main part of the // calculation is really the multiplication of x with the four pieces // P_1, P_2, P_3, and P_4. -// +// // Unless the reduced argument is extremely small in magnitude, it // suffices to carry out the multiplication of x with P_1, P_2, and -// P_3. x*P_4 will be carried out and added on as a correction only +// P_3. x*P_4 will be carried out and added on as a correction only // when it is found to be needed. Note also that x*P_4 need not be // computed exactly. A straightforward multiplication suffices since // the rounding error thus produced would be bounded by 2^(-3*64), // that is 2^(-192) which is small enough as the reduced argument // is bounded from below by 2^(-75). -// +// // Now that we have four 64-bit data representing 2/pi and a // 64-bit x. We first need to calculate a highly accurate product // of x and P_1, P_2, P_3. This is best understood as integer // multiplication. -// -// +// +// // STEP 1. 
Multiplication // ---------------------- -// -// +// +// // --------- --------- --------- -// | P_1 | | P_2 | | P_3 | -// --------- --------- --------- -// +// | P_1 | | P_2 | | P_3 | +// --------- --------- --------- +// +// --------- +// X | X | // --------- -// X | X | -// --------- // ---------------------------------------------------- // // --------- --------- -// | A_hi | | A_lo | -// --------- --------- +// | A_hi | | A_lo | +// --------- --------- // // // --------- --------- -// | B_hi | | B_lo | -// --------- --------- +// | B_hi | | B_lo | +// --------- --------- // // -// --------- --------- -// | C_hi | | C_lo | -// --------- --------- +// --------- --------- +// | C_hi | | C_lo | +// --------- --------- // // ==================================================== // --------- --------- --------- --------- -// | S_0 | | S_1 | | S_2 | | S_3 | -// --------- --------- --------- --------- +// | S_0 | | S_1 | | S_2 | | S_3 | +// --------- --------- --------- --------- // // // // STEP 2. Get N and f // ------------------- -// +// // Conceptually, after the individual pieces S_0, S_1, ..., are obtained, // we have to sum them and obtain an integer part, N, and a fraction, f. // Here, |f| <= 1/2, and N is an integer. Note also that N need only // be known modulo 2^k, k >= 2. In the case when |f| is small enough, // we would need to add in the value x*P_4. -// -// +// +// // STEP 3. Get reduced argument // ---------------------------- -// +// // The value f is not yet the reduced argument that we seek. The // equation -// -// x * 2/pi = 4K + N + f -// +// +// x * 2/pi = 4K + N + f +// // says that -// +// // x = 2*K*pi + N * pi/2 + f * (pi/2). -// +// // Thus, the reduced argument is given by -// -// reduced argument = f * pi/2. -// +// +// reduced argument = f * pi/2. +// // This multiplication must be performed to extra precision. -// +// // IV. Implementation // ================== -// +// // Step 0.
Initialization // ---------------------- -// +// // Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x. -// +// // In memory, 2/pi is stored contiguously as -// +// // 0x00000000 0x00000000 0xA2F.... // ^ // |__ implied binary bit -// +// // Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m. Thus // -16321 <= L <= -1. We fetch from memory 5 integer pieces of data. -// +// // P_0 is the two bits corresponding to bit positions L+2 and L+1 // P_1 is the 64-bit starting at bit position L // P_2 is the 64-bit starting at bit position L-64 // P_3 is the 64-bit starting at bit position L-128 // P_4 is the 64-bit starting at bit position L-192 -// +// // For example, if m = 63, P_0 would be 0 and P_1 would look like // 0xA2F... -// +// // If m = 65, P_0 would be the two msb of 0xA, thus, P_0 is 10 in binary. -// P_1 in binary would be 1 0 0 0 1 0 1 1 1 1 .... -// +// P_1 in binary would be 1 0 0 0 1 0 1 1 1 1 .... +// // Step 1. Multiplication // ---------------------- -// +// // At this point, P_1, P_2, P_3, P_4 are integers. They are // supposed to be interpreted as -// +// // 2^(L-63) * P_1; // 2^(L-63-64) * P_2; // 2^(L-63-128) * P_3; // 2^(L-63-192) * P_4; -// +// // Since each of them needs to be multiplied to x, we would scale // both x and the P_j's by some convenient factors: scale each // of P_j's up by 2^(63-L), and scale x down by 2^(L-63).
-// +// p_1 := fcvt.xf ( P_1 ) // p_2 := fcvt.xf ( P_2 ) * 2^(-64) // p_3 := fcvt.xf ( P_3 ) * 2^(-128) @@ -325,30 +331,30 @@ // x := replace exponent of x by -1 // because 2^m * 1.xxxx...xxx * 2^(L-63) // is 2^(-1) * 1.xxxx...xxx -// +// // We are now faced with the task of computing the following -// +// // --------- --------- --------- -// | P_1 | | P_2 | | P_3 | -// --------- --------- --------- -// +// | P_1 | | P_2 | | P_3 | +// --------- --------- --------- +// // --------- -// X | X | -// --------- +// X | X | +// --------- // ---------------------------------------------------- -// +// // --------- --------- -// | A_hi | | A_lo | -// --------- --------- -// +// | A_hi | | A_lo | +// --------- --------- +// // --------- --------- -// | B_hi | | B_lo | -// --------- --------- -// -// --------- --------- -// | C_hi | | C_lo | -// --------- --------- -// +// | B_hi | | B_lo | +// --------- --------- +// +// --------- --------- +// | C_hi | | C_lo | +// --------- --------- +// // ==================================================== // ----------- --------- --------- --------- // | S_0 | | S_1 | | S_2 | | S_3 | @@ -357,108 +363,108 @@ // | |___ binary point // | // |___ possibly one more bit -// +// // Let FPSR3 be set to round towards zero with widest precision -// and exponent range. Unless an explicit FPSR is given, +// and exponent range. Unless an explicit FPSR is given, // round-to-nearest with widest precision and exponent range is // used. -// +// // Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65).
-// +// // Tmp_C := fmpy.fpsr3( x, p_1 ); // If Tmp_C >= sigma_C then // C_hi := Tmp_C; // C_lo := x*p_1 - C_hi ...fma, exact // Else // C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C -// ...subtraction is exact, regardless -// ...of rounding direction +// ...subtraction is exact, regardless +// ...of rounding direction // C_lo := x*p_1 - C_hi ...fma, exact // End If -// +// // Tmp_B := fmpy.fpsr3( x, p_2 ); // If Tmp_B >= sigma_B then // B_hi := Tmp_B; // B_lo := x*p_2 - B_hi ...fma, exact // Else // B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B -// ...subtraction is exact, regardless -// ...of rounding direction +// ...subtraction is exact, regardless +// ...of rounding direction // B_lo := x*p_2 - B_hi ...fma, exact // End If -// +// // Tmp_A := fmpy.fpsr3( x, p_3 ); // If Tmp_A >= sigma_A then // A_hi := Tmp_A; // A_lo := x*p_3 - A_hi ...fma, exact // Else // A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A -// ...subtraction is exact, regardless -// ...of rounding direction +// ...subtraction is exact, regardless +// ...of rounding direction // A_lo := x*p_3 - A_hi ...fma, exact // End If -// +// // ...Note that C_hi is of integer value. We need only the -// ...last few bits. Thus we can ensure C_hi is never a big +// ...last few bits. Thus we can ensure C_hi is never a big // ...integer, freeing us from overflow worry. -// +// // Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70); // ...Tmp_C is the upper portion of C_hi // C_hi := C_hi - Tmp_C // ...0 <= C_hi < 2^7 -// +// // Step 2. Get N and f // ------------------- -// -// At this point, we have all the components to obtain +// +// At this point, we have all the components to obtain // S_0, S_1, S_2, S_3 and thus N and f. We start by adding // C_lo and B_hi. This sum together with C_hi gives a good -// estimation of N and f. -// +// estimation of N and f. +// // A := fadd.fpsr3( B_hi, C_lo ) // B := max( B_hi, C_lo ) // b := min( B_hi, C_lo ) -// -// a := (B - A) + b ...exact. 
Note that a is either 0 -// ...or 2^(-64). -// +// +// a := (B - A) + b ...exact. Note that a is either 0 +// ...or 2^(-64). +// // N := round_to_nearest_integer_value( A ); -// f := A - N; ...exact because lsb(A) >= 2^(-64) -// ...and |f| <= 1/2. -// -// f := f + a ...exact because a is 0 or 2^(-64); -// ...the msb of the sum is <= 1/2 -// ...lsb >= 2^(-64). -// +// f := A - N; ...exact because lsb(A) >= 2^(-64) +// ...and |f| <= 1/2. +// +// f := f + a ...exact because a is 0 or 2^(-64); +// ...the msb of the sum is <= 1/2 +// ...lsb >= 2^(-64). +// // N := convert to integer format( C_hi + N ); // M := P_0 * x_lo; // N := N + M; -// +// // If sgn_x == 1 (that is original x was negative) // N := 2^10 - N // ...this maintains N to be non-negative, but still // ...equivalent to the (negated N) mod 4. // End If -// +// // If |f| >= 2^(-33) -// +// // ...Case 1 // CASE := 1 // g := A_hi + B_lo; // s_hi := f + g; // s_lo := (f - s_hi) + g; -// +// // Else -// +// // ...Case 2 // CASE := 2 // A := fadd.fpsr3( A_hi, B_lo ) // B := max( A_hi, B_lo ) // b := min( A_hi, B_lo ) -// -// a := (B - A) + b ...exact. Note that a is either 0 -// ...or 2^(-128). -// +// +// a := (B - A) + b ...exact. Note that a is either 0 +// ...or 2^(-128). +// // f_hi := A + f; // f_lo := (f - f_hi) + A; // ...this is exact. @@ -468,9 +474,9 @@ // ...If f = 2^(-64), f-f_hi involves cancellation and is // ...exact. If f = -2^(-64), then A + f is exact. Hence // ...f-f_hi is -A exactly, giving f_lo = 0. -// +// // f_lo := f_lo + a; -// +// // If |f| >= 2^(-50) then // s_hi := f_hi; // s_lo := f_lo; @@ -479,117 +485,111 @@ // s_hi := f_hi + f_lo // s_lo := (f_hi - s_hi) + f_lo // End If -// +// // End If -// +// // Step 3. 
Get reduced argument // ---------------------------- -// +// // If sgn_x == 0 (that is original x is positive) -// +// // D_hi := Pi_by_2_hi // D_lo := Pi_by_2_lo // ...load from table -// +// // Else -// +// // D_hi := neg_Pi_by_2_hi // D_lo := neg_Pi_by_2_lo // ...load from table // End If -// +// // r_hi := s_hi*D_hi -// r_lo := s_hi*D_hi - r_hi ...fma +// r_lo := s_hi*D_hi - r_hi ...fma // r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi -// -// Return CASE, N, r_hi, r_lo -// - -#include "libm_support.h" - -FR_X = f32 -FR_N = f33 -FR_p_1 = f34 -FR_TWOM33 = f35 -FR_TWOM50 = f36 -FR_g = f37 -FR_p_2 = f38 -FR_f = f39 -FR_s_lo = f40 -FR_p_3 = f41 -FR_f_abs = f42 -FR_D_lo = f43 -FR_p_4 = f44 -FR_D_hi = f45 -FR_Tmp2_C = f46 -FR_s_hi = f47 -FR_sigma_A = f48 -FR_A = f49 -FR_sigma_B = f50 -FR_B = f51 -FR_sigma_C = f52 -FR_b = f53 -FR_ScaleP2 = f54 -FR_ScaleP3 = f55 -FR_ScaleP4 = f56 -FR_Tmp_A = f57 -FR_Tmp_B = f58 -FR_Tmp_C = f59 -FR_A_hi = f60 -FR_f_hi = f61 -FR_r_hi = f62 -FR_A_lo = f63 -FR_B_hi = f64 -FR_a = f65 -FR_B_lo = f66 +// +// Return N, r_hi, r_lo +// +FR_input_X = f8 +FR_r_hi = f8 +FR_r_lo = f9 + +FR_X = f32 +FR_N = f33 +FR_p_1 = f34 +FR_TWOM33 = f35 +FR_TWOM50 = f36 +FR_g = f37 +FR_p_2 = f38 +FR_f = f39 +FR_s_lo = f40 +FR_p_3 = f41 +FR_f_abs = f42 +FR_D_lo = f43 +FR_p_4 = f44 +FR_D_hi = f45 +FR_Tmp2_C = f46 +FR_s_hi = f47 +FR_sigma_A = f48 +FR_A = f49 +FR_sigma_B = f50 +FR_B = f51 +FR_sigma_C = f52 +FR_b = f53 +FR_ScaleP2 = f54 +FR_ScaleP3 = f55 +FR_ScaleP4 = f56 +FR_Tmp_A = f57 +FR_Tmp_B = f58 +FR_Tmp_C = f59 +FR_A_hi = f60 +FR_f_hi = f61 +FR_RSHF = f62 +FR_A_lo = f63 +FR_B_hi = f64 +FR_a = f65 +FR_B_lo = f66 FR_f_lo = f67 -FR_r_lo = f68 -FR_C_hi = f69 -FR_C_lo = f70 +FR_N_fix = f68 +FR_C_hi = f69 +FR_C_lo = f70 GR_N = r8 -GR_Address_of_Input = r32 -GR_Address_of_Outputs = r33 -GR_Exp_x = r36 -GR_Temp = r37 -GR_BIASL63 = r38 +GR_Exp_x = r36 +GR_Temp = r37 +GR_BIASL63 = r38 GR_CASE = r39 -GR_x_lo = r40 -GR_sgn_x = r41 +GR_x_lo = r40 +GR_sgn_x = r41 GR_M = r42 GR_BASE 
= r43 GR_LENGTH1 = r44 GR_LENGTH2 = r45 GR_ASUB = r46 GR_P_0 = r47 -GR_P_1 = r48 -GR_P_2 = r49 -GR_P_3 = r50 -GR_P_4 = r51 +GR_P_1 = r48 +GR_P_2 = r49 +GR_P_3 = r50 +GR_P_4 = r51 GR_START = r52 GR_SEGMENT = r53 GR_A = r54 -GR_B = r55 +GR_B = r55 GR_C = r56 GR_D = r57 GR_E = r58 -GR_TEMP1 = r59 -GR_TEMP2 = r60 -GR_TEMP3 = r61 -GR_TEMP4 = r62 +GR_TEMP1 = r59 +GR_TEMP2 = r60 +GR_TEMP3 = r61 +GR_TEMP4 = r62 GR_TEMP5 = r63 GR_TEMP6 = r64 +GR_rshf = r64 +RODATA .align 64 -#ifdef _LIBC -.rodata -#else -.data -#endif - -Constants_Bits_of_2_by_pi: -ASM_TYPE_DIRECTIVE(Constants_Bits_of_2_by_pi,@object) +LOCAL_OBJECT_START(Constants_Bits_of_2_by_pi) data8 0x0000000000000000,0xA2F9836E4E441529 data8 0xFC2757D1F534DDC0,0xDB6295993C439041 data8 0xFE5163ABDEBBC561,0xB7246E3A424DD2E0 @@ -721,34 +721,33 @@ data8 0xB5D6DF8261DD9602,0x36169F3AC4A1A283 data8 0x6DED727A8D39A9B8,0x825C326B5B2746ED data8 0x34007700D255F4FC,0x4D59018071E0E13F data8 0x89B295F364A8F1AE,0xA74B38FC4CEAB2BB -ASM_SIZE_DIRECTIVE(Constants_Bits_of_2_by_pi) +LOCAL_OBJECT_END(Constants_Bits_of_2_by_pi) -Constants_Bits_of_pi_by_2: -ASM_TYPE_DIRECTIVE(Constants_Bits_of_pi_by_2,@object) -data4 0x2168C234,0xC90FDAA2,0x00003FFF,0x00000000 -data4 0x80DC1CD1,0xC4C6628B,0x00003FBF,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_Bits_of_pi_by_2) +LOCAL_OBJECT_START(Constants_Bits_of_pi_by_2) +data8 0xC90FDAA22168C234,0x00003FFF +data8 0xC4C6628B80DC1CD1,0x00003FBF +LOCAL_OBJECT_END(Constants_Bits_of_pi_by_2) .section .text -.proc __libm_pi_by_2_reduce# .global __libm_pi_by_2_reduce# -.align 64 +.proc __libm_pi_by_2_reduce# +.align 32 -__libm_pi_by_2_reduce: +__libm_pi_by_2_reduce: -// X is at the address in Address_of_Input -// Place the two-piece result at the address in Address_of_Outputs -// r followed by c -// N is returned +// X is in f8 +// Place the two-piece result r (r_hi) in f8 and c (r_lo) in f9 +// N is returned in r8 -{ .mmf -alloc r34 = ar.pfs,2,34,0,0 -(p0) ldfe FR_X = [GR_Address_of_Input] -(p0) fsetc.s3 0x00,0x7F 
;; +{ .mfi + alloc r34 = ar.pfs,2,34,0,0 + fsetc.s3 0x00,0x7F // Set sf3 to round to zero, 82-bit prec, td, ftz + nop.i 999 } -{ .mlx - nop.m 999 -(p0) movl GR_BIASL63 = 0x1003E +{ .mfi + addl GR_BASE = @ltoff(Constants_Bits_of_2_by_pi#), gp + nop.f 999 + mov GR_BIASL63 = 0x1003E } ;; @@ -765,73 +764,61 @@ alloc r34 = ar.pfs,2,34,0,0 // Address_BASE = shladd(SEGMENT,3) + BASE - { .mmi - nop.m 999 -(p0) addl GR_BASE = @ltoff(Constants_Bits_of_2_by_pi#), gp - nop.i 999 + getf.exp GR_Exp_x = FR_input_X + ld8 GR_BASE = [GR_BASE] + mov GR_TEMP5 = 0x0FFFE } ;; +// Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65). { .mmi - ld8 GR_BASE = [GR_BASE] - nop.m 999 + getf.sig GR_x_lo = FR_input_X + mov GR_TEMP6 = 0x0FFBE nop.i 999 } ;; - -{ .mlx - nop.m 999 -(p0) movl GR_TEMP5 = 0x000000000000FFFE -} -{ .mmi - nop.m 999 ;; -(p0) setf.exp FR_sigma_B = GR_TEMP5 - nop.i 999 -} -{ .mlx - nop.m 999 -(p0) movl GR_TEMP6 = 0x000000000000FFBE ;; -} -// Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65). -{ .mfi -(p0) setf.exp FR_sigma_A = GR_TEMP6 - nop.f 999 - nop.i 999 ;; -} -// Special Code for testing DE arguments -// (p0) movl GR_BIASL63 = 0x0000000000013FFE -// (p0) movl GR_x_lo = 0xFFFFFFFFFFFFFFFF -// (p0) setf.exp FR_X = GR_BIASL63 -// (p0) setf.sig FR_ScaleP3 = GR_x_lo -// (p0) fmerge.se FR_X = FR_X,FR_ScaleP3 +// Special Code for testing DE arguments +// movl GR_BIASL63 = 0x0000000000013FFE +// movl GR_x_lo = 0xFFFFFFFFFFFFFFFF +// setf.exp FR_X = GR_BIASL63 +// setf.sig FR_ScaleP3 = GR_x_lo +// fmerge.se FR_X = FR_X,FR_ScaleP3 // Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x. // 2/pi is stored contiguously as // 0x00000000 0x00000000.0xA2F.... // M = EXP - BIAS ( M >= 63) // Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m. // Thus -16321 <= L <= -1.
-{ .mmf -(p0) getf.exp GR_Exp_x = FR_X -(p0) getf.sig GR_x_lo = FR_X -(p0) fabs FR_X = FR_X ;; +{ .mmi + setf.exp FR_sigma_B = GR_TEMP5 + setf.exp FR_sigma_A = GR_TEMP6 + extr.u GR_M = GR_Exp_x,0,17 } +;; + { .mii -(p0) and GR_x_lo = 0x03,GR_x_lo -(p0) extr.u GR_M = GR_Exp_x,0,17 ;; -(p0) sub GR_START = GR_M,GR_BIASL63 + and GR_x_lo = 0x03,GR_x_lo + sub GR_START = GR_M,GR_BIASL63 + add GR_BASE = 8,GR_BASE // To effectively add 1 to SEGMENT } -{ .mmi - nop.m 999 ;; -(p0) and GR_LENGTH1 = 0x3F,GR_START -(p0) shr.u GR_SEGMENT = GR_START,6 +;; + +{ .mii + and GR_LENGTH1 = 0x3F,GR_START + shr.u GR_SEGMENT = GR_START,6 + nop.i 999 } +;; + { .mmi - nop.m 999 ;; -(p0) add GR_SEGMENT = 0x1,GR_SEGMENT -(p0) sub GR_LENGTH2 = 0x40,GR_LENGTH1 + shladd GR_BASE = GR_SEGMENT,3,GR_BASE + sub GR_LENGTH2 = 0x40,GR_LENGTH1 + cmp.le p6,p7 = 0x2,GR_LENGTH1 } +;; + // P_0 is the two bits corresponding to bit positions L+2 and L+1 // P_1 is the 64-bit starting at bit position L // P_2 is the 64-bit starting at bit position L-64 @@ -849,13 +836,13 @@ alloc r34 = ar.pfs,2,34,0,0 // P_4 is made up of Clo and Dhi // P_4 = deposit Dlo, position 0, length2 into P_4, position length1 // deposit Ehi, position length2, length1 into P_4, position 0 -{ .mmi -(p0) cmp.le.unc p6,p7 = 0x2,GR_LENGTH1 ;; -(p0) shladd GR_BASE = GR_SEGMENT,3,GR_BASE -(p7) cmp.eq.unc p8,p9 = 0x1,GR_LENGTH1 ;; +{ .mfi + ld8 GR_A = [GR_BASE],8 + fabs FR_X = FR_input_X +(p7) cmp.eq.unc p8,p9 = 0x1,GR_LENGTH1 } -{ .mmi - nop.m 999 +;; + // ld_64 A at Base and increment Base by 8 // ld_64 B at Base and increment Base by 8 // ld_64 C at Base and increment Base by 8 @@ -866,31 +853,35 @@ alloc r34 = ar.pfs,2,34,0,0 // A, B, C, D, and E look like | length1 | length2 | // --------------------- // hi lo -(p0) ld8 GR_A = [GR_BASE],8 -(p0) extr.u GR_sgn_x = GR_Exp_x,17,1 ;; -} -{ .mmf - nop.m 999 -(p0) ld8 GR_B = [GR_BASE],8 -(p0) fmerge.se FR_X = FR_sigma_B,FR_X ;; +{ .mlx + ld8 GR_B = [GR_BASE],8 + movl GR_rshf = 0x43e8000000000000 
// 1.10000 2^63 for right shift N_fix } -{ .mii -(p0) ld8 GR_C = [GR_BASE],8 -(p8) extr.u GR_Temp = GR_A,63,1 ;; -(p0) shl GR_TEMP1 = GR_A,GR_LENGTH1 +;; + +{ .mmi + ld8 GR_C = [GR_BASE],8 + nop.m 999 +(p8) extr.u GR_Temp = GR_A,63,1 } -{ .mii -(p0) ld8 GR_D = [GR_BASE],8 +;; + // If length1 >= 2, // P_0 = deposit Ahi, position length2, 2 bit into P_0 at position 0. -(p6) shr.u GR_P_0 = GR_A,GR_LENGTH2 ;; -(p0) shl GR_TEMP2 = GR_B,GR_LENGTH1 +{ .mii + ld8 GR_D = [GR_BASE],8 + shl GR_TEMP1 = GR_A,GR_LENGTH1 // MM instruction +(p6) shr.u GR_P_0 = GR_A,GR_LENGTH2 // MM instruction } +;; + { .mii -(p0) ld8 GR_E = [GR_BASE],-40 -(p0) shr.u GR_P_1 = GR_B,GR_LENGTH2 ;; -(p0) shr.u GR_P_2 = GR_C,GR_LENGTH2 + ld8 GR_E = [GR_BASE],-40 + shl GR_TEMP2 = GR_B,GR_LENGTH1 // MM instruction + shr.u GR_P_1 = GR_B,GR_LENGTH2 // MM instruction } +;; + // Else // Load 16 bit of ASUB from (Base_Address_of_A - 2) // P_0 = ASUB & 0x3 @@ -900,43 +891,56 @@ alloc r34 = ar.pfs,2,34,0,0 // Deposit element 63 from Ahi and place in element 0 of P_0. 
// Endif // Endif + { .mii (p7) ld2 GR_ASUB = [GR_BASE],8 -(p0) shl GR_TEMP3 = GR_C,GR_LENGTH1 ;; -(p0) shl GR_TEMP4 = GR_D,GR_LENGTH1 + shl GR_TEMP3 = GR_C,GR_LENGTH1 // MM instruction + shr.u GR_P_2 = GR_C,GR_LENGTH2 // MM instruction } +;; + { .mii - nop.m 999 -(p0) shr.u GR_P_3 = GR_D,GR_LENGTH2 ;; -(p0) shr.u GR_P_4 = GR_E,GR_LENGTH2 + setf.d FR_RSHF = GR_rshf // Form right shift const 1.100 * 2^63 + shl GR_TEMP4 = GR_D,GR_LENGTH1 // MM instruction + shr.u GR_P_3 = GR_D,GR_LENGTH2 // MM instruction } -{ .mii +;; + +{ .mmi (p7) and GR_P_0 = 0x03,GR_ASUB -(p6) and GR_P_0 = 0x03,GR_P_0 ;; -(p0) or GR_P_1 = GR_P_1,GR_TEMP1 +(p6) and GR_P_0 = 0x03,GR_P_0 + shr.u GR_P_4 = GR_E,GR_LENGTH2 // MM instruction } +;; + { .mmi -(p8) and GR_P_0 = 0x1,GR_P_0 ;; -(p0) or GR_P_2 = GR_P_2,GR_TEMP2 -(p8) shl GR_P_0 = GR_P_0,0x1 ;; -} -{ .mii - nop.m 999 -(p0) or GR_P_3 = GR_P_3,GR_TEMP3 -(p8) or GR_P_0 = GR_P_0,GR_Temp + nop.m 999 + or GR_P_1 = GR_P_1,GR_TEMP1 +(p8) and GR_P_0 = 0x1,GR_P_0 } +;; + { .mmi -(p0) setf.sig FR_p_1 = GR_P_1 ;; -(p0) setf.sig FR_p_2 = GR_P_2 -(p0) or GR_P_4 = GR_P_4,GR_TEMP4 ;; + setf.sig FR_p_1 = GR_P_1 + or GR_P_2 = GR_P_2,GR_TEMP2 +(p8) shladd GR_P_0 = GR_P_0,1,GR_Temp } +;; + +{ .mmf + setf.sig FR_p_2 = GR_P_2 + or GR_P_3 = GR_P_3,GR_TEMP3 + fmerge.se FR_X = FR_sigma_B,FR_X +} +;; + { .mmi - nop.m 999 ;; -(p0) setf.sig FR_p_3 = GR_P_3 -(p0) pmpy2.r GR_M = GR_P_0,GR_x_lo + setf.sig FR_p_3 = GR_P_3 + or GR_P_4 = GR_P_4,GR_TEMP4 + pmpy2.r GR_M = GR_P_0,GR_x_lo } -{ .mlx -(p0) setf.sig FR_p_4 = GR_P_4 +;; + // P_1, P_2, P_3, P_4 are integers. 
They should be // 2^(L-63) * P_1; // 2^(L-63-64) * P_2; @@ -954,18 +958,18 @@ alloc r34 = ar.pfs,2,34,0,0 // | P_1 | | P_2 | | P_3 | // --------- --------- --------- // --------- -// X | X | -// --------- +// X | X | +// --------- // ---------------------------------------------------- // --------- --------- -// | A_hi | | A_lo | -// --------- --------- +// | A_hi | | A_lo | +// --------- --------- // --------- --------- -// | B_hi | | B_lo | -// --------- --------- +// | B_hi | | B_lo | +// --------- --------- +// --------- --------- +// | C_hi | | C_lo | // --------- --------- -// | C_hi | | C_lo | -// --------- --------- // ==================================================== // ----------- --------- --------- --------- // | S_0 | | S_1 | | S_2 | | S_3 | @@ -977,52 +981,55 @@ alloc r34 = ar.pfs,2,34,0,0 // and exponent range. Unless an explicit FPSR is given, // round-to-nearest with widest precision and exponent range is // used. -(p0) movl GR_TEMP1 = 0x000000000000FFBF -} { .mmi - nop.m 999 ;; -(p0) setf.exp FR_ScaleP2 = GR_TEMP1 - nop.i 999 -} -{ .mlx - nop.m 999 -(p0) movl GR_TEMP4 = 0x000000000001003E + setf.sig FR_p_4 = GR_P_4 + mov GR_TEMP1 = 0x0FFBF + nop.i 999 } +;; + { .mmi - nop.m 999 ;; -(p0) setf.exp FR_sigma_C = GR_TEMP4 - nop.i 999 + setf.exp FR_ScaleP2 = GR_TEMP1 + mov GR_TEMP2 = 0x0FF7F + nop.i 999 } -{ .mlx - nop.m 999 -(p0) movl GR_TEMP2 = 0x000000000000FF7F ;; +;; + +{ .mmi + setf.exp FR_ScaleP3 = GR_TEMP2 + mov GR_TEMP4 = 0x1003E + nop.i 999 } +;; + { .mmf - nop.m 999 -(p0) setf.exp FR_ScaleP3 = GR_TEMP2 -(p0) fcvt.xuf.s1 FR_p_1 = FR_p_1 ;; + setf.exp FR_sigma_C = GR_TEMP4 + mov GR_Temp = 0x0FFDE + fcvt.xuf.s1 FR_p_1 = FR_p_1 } +;; + { .mfi - nop.m 999 -(p0) fcvt.xuf.s1 FR_p_2 = FR_p_2 - nop.i 999 -} -{ .mlx - nop.m 999 -(p0) movl GR_Temp = 0x000000000000FFDE ;; -} -{ .mmf - nop.m 999 -(p0) setf.exp FR_TWOM33 = GR_Temp -(p0) fcvt.xuf.s1 FR_p_3 = FR_p_3 ;; + setf.exp FR_TWOM33 = GR_Temp + fcvt.xuf.s1 FR_p_2 = FR_p_2 + nop.i 999 } +;; + { .mfi 
- nop.m 999 -(p0) fcvt.xuf.s1 FR_p_4 = FR_p_4 - nop.i 999 ;; + nop.m 999 + fcvt.xuf.s1 FR_p_3 = FR_p_3 + nop.i 999 } +;; + { .mfi - nop.m 999 + nop.m 999 + fcvt.xuf.s1 FR_p_4 = FR_p_4 + nop.i 999 +} +;; + // Tmp_C := fmpy.fpsr3( x, p_1 ); // Tmp_B := fmpy.fpsr3( x, p_2 ); // Tmp_A := fmpy.fpsr3( x, p_3 ); @@ -1048,55 +1055,62 @@ alloc r34 = ar.pfs,2,34,0,0 // Exact, regardless ...of rounding direction // A_lo := x*p_3 - A_hi ...fma, exact // Endif -(p0) fmpy.s3 FR_Tmp_C = FR_X,FR_p_1 - nop.i 999 ;; -} { .mfi - nop.m 999 -(p0) fmpy.s1 FR_p_2 = FR_p_2,FR_ScaleP2 - nop.i 999 -} -{ .mlx - nop.m 999 -(p0) movl GR_Temp = 0x0000000000000400 + nop.m 999 + fmpy.s3 FR_Tmp_C = FR_X,FR_p_1 + nop.i 999 } -{ .mlx - nop.m 999 -(p0) movl GR_TEMP3 = 0x000000000000FF3F ;; +;; + +{ .mfi + mov GR_TEMP3 = 0x0FF3F + fmpy.s1 FR_p_2 = FR_p_2,FR_ScaleP2 + nop.i 999 } +;; + { .mmf - nop.m 999 -(p0) setf.exp FR_ScaleP4 = GR_TEMP3 -(p0) fmpy.s1 FR_p_3 = FR_p_3,FR_ScaleP3 ;; + setf.exp FR_ScaleP4 = GR_TEMP3 + mov GR_TEMP4 = 0x10045 + fmpy.s1 FR_p_3 = FR_p_3,FR_ScaleP3 } -{ .mlx - nop.m 999 -(p0) movl GR_TEMP4 = 0x0000000000010045 ;; +;; + +{ .mfi + nop.m 999 + fadd.s3 FR_C_hi = FR_sigma_C,FR_Tmp_C // For Tmp_C < sigma_C case + nop.i 999 } +;; + { .mmf - nop.m 999 -(p0) setf.exp FR_Tmp2_C = GR_TEMP4 -(p0) fmpy.s3 FR_Tmp_B = FR_X,FR_p_2 ;; + setf.exp FR_Tmp2_C = GR_TEMP4 + nop.m 999 + fmpy.s3 FR_Tmp_B = FR_X,FR_p_2 } +;; + { .mfi - nop.m 999 -(p0) fcmp.ge.unc.s1 p12, p9 = FR_Tmp_C,FR_sigma_C - nop.i 999 ;; + addl GR_BASE = @ltoff(Constants_Bits_of_pi_by_2#), gp + fcmp.ge.s1 p12, p9 = FR_Tmp_C,FR_sigma_C + nop.i 999 } { .mfi - nop.m 999 -(p0) fmpy.s3 FR_Tmp_A = FR_X,FR_p_3 - nop.i 999 ;; + nop.m 999 + fmpy.s3 FR_Tmp_A = FR_X,FR_p_3 + nop.i 99 } +;; + { .mfi - nop.m 999 + ld8 GR_BASE = [GR_BASE] (p12) mov FR_C_hi = FR_Tmp_C - nop.i 999 ;; + nop.i 999 } { .mfi -(p0) addl GR_BASE = @ltoff(Constants_Bits_of_pi_by_2#), gp -(p9) fadd.s3 FR_C_hi = FR_sigma_C,FR_Tmp_C - nop.i 999 + nop.m 999 +(p9) 
fsub.s1 FR_C_hi = FR_C_hi,FR_sigma_C + nop.i 999 } ;; @@ -1114,97 +1128,106 @@ alloc r34 = ar.pfs,2,34,0,0 // Load from table // End If - -{ .mmi - ld8 GR_BASE = [GR_BASE] +{ .mfi nop.m 999 + fmpy.s1 FR_p_4 = FR_p_4,FR_ScaleP4 nop.i 999 } -;; - - { .mfi -(p0) ldfe FR_D_hi = [GR_BASE],16 -(p0) fmpy.s1 FR_p_4 = FR_p_4,FR_ScaleP4 - nop.i 999 ;; + nop.m 999 + fadd.s3 FR_B_hi = FR_sigma_B,FR_Tmp_B // For Tmp_B < sigma_B case + nop.i 999 } +;; + { .mfi -(p0) ldfe FR_D_lo = [GR_BASE],0 -(p0) fcmp.ge.unc.s1 p13, p10 = FR_Tmp_B,FR_sigma_B - nop.i 999 ;; + nop.m 999 + fadd.s3 FR_A_hi = FR_sigma_A,FR_Tmp_A // For Tmp_A < sigma_A case + nop.i 999 } +;; + { .mfi - nop.m 999 -(p13) mov FR_B_hi = FR_Tmp_B - nop.i 999 + nop.m 999 + fcmp.ge.s1 p13, p10 = FR_Tmp_B,FR_sigma_B + nop.i 999 } { .mfi - nop.m 999 -(p12) fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi - nop.i 999 ;; + nop.m 999 + fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fadd.s3 FR_B_hi = FR_sigma_B,FR_Tmp_B - nop.i 999 + ldfe FR_D_hi = [GR_BASE],16 + fcmp.ge.s1 p14, p11 = FR_Tmp_A,FR_sigma_A + nop.i 999 } +;; + { .mfi - nop.m 999 -(p9) fsub.s1 FR_C_hi = FR_C_hi,FR_sigma_C - nop.i 999 ;; + ldfe FR_D_lo = [GR_BASE] +(p13) mov FR_B_hi = FR_Tmp_B + nop.i 999 } { .mfi - nop.m 999 -(p0) fcmp.ge.unc.s1 p14, p11 = FR_Tmp_A,FR_sigma_A - nop.i 999 ;; + nop.m 999 +(p10) fsub.s1 FR_B_hi = FR_B_hi,FR_sigma_B + nop.i 999 } +;; + { .mfi - nop.m 999 + nop.m 999 (p14) mov FR_A_hi = FR_Tmp_A - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p11) fadd.s3 FR_A_hi = FR_sigma_A,FR_Tmp_A - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p9) fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi -(p0) cmp.eq.unc p12,p9 = 0x1,GR_sgn_x -} -{ .mfi - nop.m 999 -(p13) fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi - nop.i 999 ;; + nop.i 999 } { .mfi - nop.m 999 -(p10) fsub.s1 FR_B_hi = FR_B_hi,FR_sigma_B - nop.i 999 + nop.m 999 +(p11) fsub.s1 FR_A_hi = FR_A_hi,FR_sigma_A + nop.i 999 } -{ .mfi - nop.m 999 +;; + // Note that C_hi is of integer value. 
We need only the // last few bits. Thus we can ensure C_hi is never a big // integer, freeing us from overflow worry. // Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70); // Tmp_C is the upper portion of C_hi -(p0) fadd.s3 FR_Tmp_C = FR_C_hi,FR_Tmp2_C - nop.i 999 ;; +{ .mfi + nop.m 999 + fadd.s3 FR_Tmp_C = FR_C_hi,FR_Tmp2_C + tbit.z p12,p9 = GR_Exp_x, 17 } +;; + { .mfi - nop.m 999 -(p14) fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi - nop.i 999 + nop.m 999 + fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi + nop.i 999 } { .mfi - nop.m 999 -(p11) fsub.s1 FR_A_hi = FR_A_hi,FR_sigma_A - nop.i 999 ;; + nop.m 999 + fadd.s3 FR_A = FR_B_hi,FR_C_lo + nop.i 999 } +;; + +{ .mfi + nop.m 999 + fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi + nop.i 999 +} +;; + { .mfi - nop.m 999 + nop.m 999 + fsub.s1 FR_Tmp_C = FR_Tmp_C,FR_Tmp2_C + nop.i 999 +} +;; + // ******************* // Step 2. Get N and f // ******************* @@ -1215,168 +1238,213 @@ alloc r34 = ar.pfs,2,34,0,0 // A := fadd.fpsr3( B_hi, C_lo ) // B := max( B_hi, C_lo ) // b := min( B_hi, C_lo ) -(p0) fadd.s3 FR_A = FR_B_hi,FR_C_lo - nop.i 999 -} { .mfi - nop.m 999 -(p10) fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi - nop.i 999 ;; + nop.m 999 + fmax.s1 FR_B = FR_B_hi,FR_C_lo + nop.i 999 } +;; + +// We use a right-shift trick to get the integer part of A into the rightmost +// bits of the significand by adding 1.1000..00 * 2^63. This operation is good +// if |A| < 2^61, which it is in this case. We are doing this to save a few +// cycles over using fcvt.fx followed by fnorm. The second step of the trick +// is to subtract the same constant to float the rounded integer into a fp reg. 
+ { .mfi - nop.m 999 -(p0) fsub.s1 FR_Tmp_C = FR_Tmp_C,FR_Tmp2_C - nop.i 999 ;; + nop.m 999 +// N := round_to_nearest_integer_value( A ); + fma.s1 FR_N_fix = FR_A, f1, FR_RSHF + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fmax.s1 FR_B = FR_B_hi,FR_C_lo - nop.i 999 ;; + nop.m 999 + fmin.s1 FR_b = FR_B_hi,FR_C_lo + nop.i 999 } { .mfi - nop.m 999 -(p0) fmin.s1 FR_b = FR_B_hi,FR_C_lo - nop.i 999 + nop.m 999 +// C_hi := C_hi - Tmp_C ...0 <= C_hi < 2^7 + fsub.s1 FR_C_hi = FR_C_hi,FR_Tmp_C + nop.i 999 } +;; + { .mfi - nop.m 999 -(p11) fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi - nop.i 999 ;; + nop.m 999 +// a := (B - A) + b: Exact - note that a is either 0 or 2^(-64). + fsub.s1 FR_a = FR_B,FR_A + nop.i 999 } +;; + { .mfi - nop.m 999 -// N := round_to_nearest_integer_value( A ); -(p0) fcvt.fx.s1 FR_N = FR_A - nop.i 999 ;; + nop.m 999 + fms.s1 FR_N = FR_N_fix, f1, FR_RSHF + nop.i 999 } +;; + { .mfi - nop.m 999 -// C_hi := C_hi - Tmp_C ...0 <= C_hi < 2^7 -(p0) fsub.s1 FR_C_hi = FR_C_hi,FR_Tmp_C - nop.i 999 ;; + nop.m 999 + fadd.s1 FR_a = FR_a,FR_b + nop.i 999 } +;; + +// f := A - N; Exact because lsb(A) >= 2^(-64) and |f| <= 1/2. +// N := convert to integer format( C_hi + N ); +// M := P_0 * x_lo; +// N := N + M; { .mfi - nop.m 999 -// a := (B - A) + b: Exact - note that a is either 0 or 2^(-64). -(p0) fsub.s1 FR_a = FR_B,FR_A - nop.i 999 ;; + nop.m 999 + fsub.s1 FR_f = FR_A,FR_N + nop.i 999 } { .mfi - nop.m 999 -// f := A - N; Exact because lsb(A) >= 2^(-64) and |f| <= 1/2. 
-(p0) fnorm.s1 FR_N = FR_N - nop.i 999 + nop.m 999 + fadd.s1 FR_N = FR_N,FR_C_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fadd.s1 FR_a = FR_a,FR_b - nop.i 999 ;; + nop.m 999 +(p9) fsub.s1 FR_D_hi = f0, FR_D_hi + nop.i 999 } { .mfi - nop.m 999 -(p0) fsub.s1 FR_f = FR_A,FR_N - nop.i 999 + nop.m 999 +(p9) fsub.s1 FR_D_lo = f0, FR_D_lo + nop.i 999 } +;; + { .mfi - nop.m 999 -// N := convert to integer format( C_hi + N ); -// M := P_0 * x_lo; -// N := N + M; -(p0) fadd.s1 FR_N = FR_N,FR_C_hi - nop.i 999 ;; + nop.m 999 + fadd.s1 FR_g = FR_A_hi,FR_B_lo // For Case 1, g=A_hi+B_lo + nop.i 999 } { .mfi - nop.m 999 -// f = f + a Exact because a is 0 or 2^(-64); -// the msb of the sum is <= 1/2 and lsb >= 2^(-64). -(p0) fadd.s1 FR_f = FR_f,FR_a - nop.i 999 + nop.m 999 + fadd.s3 FR_A = FR_A_hi,FR_B_lo // For Case 2, A=A_hi+B_lo w/ sf3 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Create 2**(-33) -// -(p0) fcvt.fx.s1 FR_N = FR_N - nop.i 999 ;; + mov GR_Temp = 0x0FFCD // For Case 2, exponent of 2^-50 + fmax.s1 FR_B = FR_A_hi,FR_B_lo // For Case 2, B=max(A_hi,B_lo) + nop.i 999 } +;; + +// f = f + a Exact because a is 0 or 2^(-64); +// the msb of the sum is <= 1/2 and lsb >= 2^(-64). { .mfi - nop.m 999 -(p0) fabs FR_f_abs = FR_f - nop.i 999 ;; + setf.exp FR_TWOM50 = GR_Temp // For Case 2, form 2^-50 + fcvt.fx.s1 FR_N = FR_N + nop.i 999 } { .mfi -(p0) getf.sig GR_N = FR_N - nop.f 999 - nop.i 999 ;; + nop.m 999 + fadd.s1 FR_f = FR_f,FR_a + nop.i 999 } -{ .mii - nop.m 999 - nop.i 999 ;; -(p0) add GR_N = GR_N,GR_M ;; +;; + +{ .mfi + nop.m 999 + fmin.s1 FR_b = FR_A_hi,FR_B_lo // For Case 2, b=min(A_hi,B_lo) + nop.i 999 } -// If sgn_x == 1 (that is original x was negative) -// N := 2^10 - N -// this maintains N to be non-negative, but still -// equivalent to the (negated N) mod 4. 
-// End If -{ .mii -(p12) sub GR_N = GR_Temp,GR_N -(p0) cmp.eq.unc p12,p9 = 0x0,GR_sgn_x ;; - nop.i 999 +;; + +{ .mfi + nop.m 999 + fsub.s1 FR_a = FR_B,FR_A // For Case 2, a=B-A + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fcmp.ge.unc.s1 p13, p10 = FR_f_abs,FR_TWOM33 - nop.i 999 ;; + nop.m 999 + fadd.s1 FR_s_hi = FR_f,FR_g // For Case 1, s_hi=f+g + nop.i 999 } { .mfi - nop.m 999 -(p9) fsub.s1 FR_D_hi = f0, FR_D_hi - nop.i 999 ;; + nop.m 999 + fadd.s1 FR_f_hi = FR_A,FR_f // For Case 2, f_hi=A+f + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fadd.s3 FR_A = FR_A_hi,FR_B_lo - nop.i 999 + nop.m 999 + fabs FR_f_abs = FR_f + nop.i 999 } +;; + { .mfi - nop.m 999 -(p13) fadd.s1 FR_g = FR_A_hi,FR_B_lo - nop.i 999 ;; + getf.sig GR_N = FR_N + fsetc.s3 0x7F,0x40 // Reset sf3 to user settings + td + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fmax.s1 FR_B = FR_A_hi,FR_B_lo - nop.i 999 + nop.m 999 + fsub.s1 FR_s_lo = FR_f,FR_s_hi // For Case 1, s_lo=f-s_hi + nop.i 999 } { .mfi - nop.m 999 -(p9) fsub.s1 FR_D_lo = f0, FR_D_lo - nop.i 999 ;; + nop.m 999 + fsub.s1 FR_f_lo = FR_f,FR_f_hi // For Case 2, f_lo=f-f_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fmin.s1 FR_b = FR_A_hi,FR_B_lo - nop.i 999 ;; + nop.m 999 + fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi // For Case 1, r_hi=s_hi*D_hi + nop.i 999 } { .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x40 - nop.i 999 + nop.m 999 + fadd.s1 FR_a = FR_a,FR_b // For Case 2, a=a+b + nop.i 999 } -{ .mlx - nop.m 999 -(p10) movl GR_Temp = 0x000000000000FFCD ;; +;; + + +// If sgn_x == 1 (that is original x was negative) +// N := 2^10 - N +// this maintains N to be non-negative, but still +// equivalent to the (negated N) mod 4. 
+// End If +{ .mfi + add GR_N = GR_N,GR_M + fcmp.ge.s1 p13, p10 = FR_f_abs,FR_TWOM33 + mov GR_Temp = 0x00400 } -{ .mmf - nop.m 999 -(p10) setf.exp FR_TWOM50 = GR_Temp -(p10) fadd.s1 FR_f_hi = FR_A,FR_f ;; +;; + +{ .mfi +(p9) sub GR_N = GR_Temp,GR_N + fadd.s1 FR_s_lo = FR_s_lo,FR_g // For Case 1, s_lo=s_lo+g + nop.i 999 } { .mfi - nop.m 999 -// a := (B - A) + b Exact. + nop.m 999 + fadd.s1 FR_f_lo = FR_f_lo,FR_A // For Case 2, f_lo=f_lo+A + nop.i 999 +} +;; + +// a := (B - A) + b Exact. // Note that a is either 0 or 2^(-128). // f_hi := A + f; // f_lo := (f - f_hi) + A @@ -1387,68 +1455,32 @@ alloc r34 = ar.pfs,2,34,0,0 // exact. If f = -2^(-64), then A + f is exact. Hence // f-f_hi is -A exactly, giving f_lo = 0. // f_lo := f_lo + a; -(p10) fsub.s1 FR_a = FR_B,FR_A - nop.i 999 -} -{ .mfi - nop.m 999 -(p13) fadd.s1 FR_s_hi = FR_f,FR_g - nop.i 999 ;; -} -{ .mlx - nop.m 999 + // If |f| >= 2^(-33) // Case 1 // CASE := 1 // g := A_hi + B_lo; // s_hi := f + g; // s_lo := (f - s_hi) + g; -(p13) movl GR_CASE = 0x1 ;; -} -{ .mlx - nop.m 999 // Else // Case 2 // CASE := 2 // A := fadd.fpsr3( A_hi, B_lo ) // B := max( A_hi, B_lo ) // b := min( A_hi, B_lo ) -(p10) movl GR_CASE = 0x2 -} -{ .mfi - nop.m 999 -(p10) fsub.s1 FR_f_lo = FR_f,FR_f_hi - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p10) fadd.s1 FR_a = FR_a,FR_b - nop.i 999 -} -{ .mfi - nop.m 999 -(p13) fsub.s1 FR_s_lo = FR_f,FR_s_hi - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p13) fadd.s1 FR_s_lo = FR_s_lo,FR_g - nop.i 999 ;; -} + { .mfi - nop.m 999 -(p10) fcmp.ge.unc.s1 p14, p11 = FR_f_abs,FR_TWOM50 - nop.i 999 ;; + nop.m 999 +(p10) fcmp.ge.unc.s1 p14, p11 = FR_f_abs,FR_TWOM50 + nop.i 999 } { .mfi - nop.m 999 -// -// Create 2**(-50) -(p10) fadd.s1 FR_f_lo = FR_f_lo,FR_A - nop.i 999 ;; + nop.m 999 +(p13) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi //For Case 1, r_lo=s_hi*D_hi-r_hi + nop.i 999 } -{ .mfi - nop.m 999 +;; + // If |f| >= 2^(-50) then // s_hi := f_hi; // s_lo := f_lo; @@ -1457,84 +1489,90 @@ alloc r34 = ar.pfs,2,34,0,0 //
s_hi := f_hi + f_lo // s_lo := (f_hi - s_hi) + f_lo // End If -(p14) mov FR_s_hi = FR_f_hi - nop.i 999 ;; +{ .mfi + nop.m 999 +(p14) mov FR_s_hi = FR_f_hi + nop.i 999 } { .mfi - nop.m 999 -(p10) fadd.s1 FR_f_lo = FR_f_lo,FR_a - nop.i 999 ;; + nop.m 999 +(p10) fadd.s1 FR_f_lo = FR_f_lo,FR_a + nop.i 999 } +;; + { .mfi - nop.m 999 -(p14) mov FR_s_lo = FR_f_lo - nop.i 999 + nop.m 999 +(p14) mov FR_s_lo = FR_f_lo + nop.i 999 } { .mfi - nop.m 999 -(p11) fadd.s1 FR_f_lo = FR_f_lo,FR_A_lo - nop.i 999 ;; + nop.m 999 +(p11) fadd.s1 FR_f_lo = FR_f_lo,FR_A_lo + nop.i 999 } +;; + { .mfi - nop.m 999 -(p11) fma.s1 FR_f_lo = FR_X,FR_p_4,FR_f_lo - nop.i 999 ;; + nop.m 999 +(p11) fma.s1 FR_f_lo = FR_X,FR_p_4,FR_f_lo + nop.i 999 } +;; + { .mfi - nop.m 999 -(p11) fadd.s1 FR_s_hi = FR_f_hi,FR_f_lo - nop.i 999 ;; + nop.m 999 +(p13) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo //For Case 1, r_lo=s_hi*D_lo+r_lo + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 +(p11) fadd.s1 FR_s_hi = FR_f_hi,FR_f_lo + nop.i 999 +} +;; + // r_hi := s_hi*D_hi // r_lo := s_hi*D_hi - r_hi with fma // r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi -(p0) fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi - nop.i 999 -} { .mfi - nop.m 999 -(p11) fsub.s1 FR_s_lo = FR_f_hi,FR_s_hi - nop.i 999 ;; + nop.m 999 +(p10) fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi + nop.i 999 } { .mfi - nop.m 999 -(p0) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi - nop.i 999 + nop.m 999 +(p11) fsub.s1 FR_s_lo = FR_f_hi,FR_s_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -(p11) fadd.s1 FR_s_lo = FR_s_lo,FR_f_lo - nop.i 999 ;; -} -{ .mmi - nop.m 999 ;; -// Return N, r_hi, r_lo -// We do not return CASE -(p0) stfe [GR_Address_of_Outputs] = FR_r_hi,16 - nop.i 999 ;; + nop.m 999 +(p10) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo - nop.i 999 ;; + nop.m 999 +(p11) fadd.s1 FR_s_lo = FR_s_lo,FR_f_lo + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fma.s1 FR_r_lo = FR_s_lo,FR_D_hi,FR_r_lo - nop.i 999 ;; -} -{ .mmi - nop.m 999 
;; -(p0) stfe [GR_Address_of_Outputs] = FR_r_lo,-16 - nop.i 999 + nop.m 999 +(p10) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo + nop.i 999 } -{ .mib - nop.m 999 - nop.i 999 -(p0) br.ret.sptk b0 ;; +;; + +// Return N, r_hi, r_lo +// We do not return CASE +{ .mfb + nop.m 999 + fma.s1 FR_r_lo = FR_s_lo,FR_D_hi,FR_r_lo + br.ret.sptk b0 } +;; -.endp __libm_pi_by_2_reduce -ASM_SIZE_DIRECTIVE(__libm_pi_by_2_reduce) +.endp __libm_pi_by_2_reduce# diff --git a/sysdeps/ia64/fpu/libm_support.h b/sysdeps/ia64/fpu/libm_support.h index 5d3498dfc9..50dac33133 100644 --- a/sysdeps/ia64/fpu/libm_support.h +++ b/sysdeps/ia64/fpu/libm_support.h @@ -1,9 +1,10 @@ -// -// Copyright (C) 2000, 2001, Intel Corporation +/* file: libm_support.h */ + + +// Copyright (c) 2000 - 2002, Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -19,14 +20,14 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS @@ -34,45 +35,51 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // -// History: 02/02/2000 Initial version +// History: 02/02/2000 Initial version // 2/28/2000 added tags for logb and nextafter -// 3/22/2000 Changes to support _LIB_VERSION variable -// and filled some enum gaps. Added support for C99. +// 3/22/2000 Changes to support _LIB_VERSIONIMF variable +// and filled some enum gaps. Added support for C99. // 5/31/2000 added prototypes for __libm_frexp_4l/8l -// 8/10/2000 Changed declaration of _LIB_VERSION to work for library +// 8/10/2000 Changed declaration of _LIB_VERSIONIMF to work for library // builds and other application builds (precompiler directives). // 8/11/2000 Added pointers-to-matherr-functions declarations to allow // for user-defined matherr functions in the dll build. // 12/07/2000 Added scalbn error_types values. +// 5/01/2001 Added error_types values for C99 nearest integer +// functions. +// 6/07/2001 Added error_types values for fdim. +// 6/18/2001 Added include of complex_support.h. +// 8/03/2001 Added error_types values for nexttoward, scalbln. +// 8/23/2001 Corrected tag numbers from 186 and higher. +// 8/27/2001 Added check for long int and long long int definitions. 
+// 12/10/2001 Added error_types for erfc. +// 12/27/2001 Added error_types for degree argument functions. +// 01/02/2002 Added error_types for tand, cotd. +// 01/04/2002 Delete include of complex_support.h +// 01/23/2002 Deleted prototypes for __libm_frexp*. Added check for +// multiple int, long int, and long long int definitions. +// 05/20/2002 Added error_types for cot. +// 06/27/2002 Added error_types for sinhcosh. +// 12/05/2002 Added error_types for annuity and compound +// 04/10/2003 Added error_types for tgammal/tgamma/tgammaf // -#ifndef __ASSEMBLER__ -#include <math.h> - -float __libm_frexp_4f( float x, int* exp); -float _GI___libm_frexp_4f( float x, int* exp); -float __libm_frexp_8f( float x, int* exp); -double __libm_frexp_4( double x, int* exp); -double _GI___libm_frexp_4( double x, int* exp); -double __libm_frexp_8( double x, int* exp); -long double __libm_frexp_4l( long double x, int* exp); -long double _GI___libm_frexp_4l( long double x, int* exp); -long double __libm_frexp_8l( long double x, int* exp); void __libm_sincos_pi4(double,double*,double*,int); void __libm_y0y1(double , double *, double *); void __libm_j0j1(double , double *, double *); -double __libm_lgamma_kernel(double,int*,int,int); double __libm_j0(double); double __libm_j1(double); double __libm_jn(int,double); double __libm_y0(double); double __libm_y1(double); double __libm_yn(int,double); +double __libm_copysign (double, double); +float __libm_copysignf (float, float); +long double __libm_copysignl (long double, long double); -extern double rint(double); extern double sqrt(double); extern double fabs(double); extern double log(double); @@ -112,24 +119,31 @@ extern long double log1pl(long double); extern long double logl(long double); extern long double sqrtl(long double); extern long double expl(long double); - -extern long lround(double); -extern long lroundf(float); -extern long lroundl(long double); +extern long double fabsl(long double); #if !(defined(SIZE_INT_32) || 
defined(SIZE_INT_64)) - #error integer size not established; define SIZE_INT_32 or SIZE_INT_64 +#error integer size not established; define SIZE_INT_32 or SIZE_INT_64 #endif -struct fp64 { /*/ sign:1 exponent:11 significand:52 (implied leading 1)*/ - unsigned lo_significand:32; - unsigned hi_significand:20; - unsigned exponent:11; - unsigned sign:1; -}; +#if (defined(SIZE_INT_32) && defined(SIZE_INT_64)) +#error multiple integer size definitions; define SIZE_INT_32 or SIZE_INT_64 +#endif -#define HI_SIGNIFICAND_LESS(X, HI) ((X)->hi_significand < 0x ## HI) -#define f64abs(x) ((x) < 0.0 ? -(x) : (x)) +#if !(defined(SIZE_LONG_INT_32) || defined(SIZE_LONG_INT_64)) +#error long int size not established; define SIZE_LONG_INT_32 or SIZE_LONG_INT_64 +#endif + +#if (defined(SIZE_LONG_INT_32) && defined(SIZE_LONG_INT_64)) +#error multiple long int size definitions; define SIZE_LONG_INT_32 or SIZE_LONG_INT_64 +#endif + +#if !(defined(SIZE_LONG_LONG_INT_32) || defined(SIZE_LONG_LONG_INT_64)) +#error long long int size not established; define SIZE_LONG_LONG_INT_32 or SIZE_LONG_LONG_INT_64 +#endif + +#if (defined(SIZE_LONG_LONG_INT_32) && defined(SIZE_LONG_LONG_INT_64)) +#error multiple long long int size definitions; define SIZE_LONG_LONG_INT_32 or SIZE_LONG_LONG_INT_64 +#endif typedef enum { @@ -148,14 +162,14 @@ typedef enum powl_neg_to_non_integer, /* 22 */ powl_nan_to_zero, /* 23 */ pow_overflow, pow_underflow, /* 24, 25 */ - pow_zero_to_zero, /* 26 */ + pow_zero_to_zero, /* 26 */ pow_zero_to_negative, /* 27 */ pow_neg_to_non_integer, /* 28 */ pow_nan_to_zero, /* 29 */ powf_overflow, powf_underflow, /* 30, 31 */ powf_zero_to_zero, /* 32 */ - powf_zero_to_negative, /* 33 */ - powf_neg_to_non_integer, /* 34 */ + powf_zero_to_negative, /* 33 */ + powf_neg_to_non_integer, /* 34 */ powf_nan_to_zero, /* 35 */ atan2l_zero, /* 36 */ atan2_zero, /* 37 */ @@ -181,13 +195,13 @@ typedef enum y0l_zero, y0l_negative,y0l_gt_loss, /* 66, 67, 68 */ y0_zero, y0_negative,y0_gt_loss, /* 69, 
70, 71 */ y0f_zero, y0f_negative,y0f_gt_loss, /* 72, 73, 74 */ - y1l_zero, y1l_negative,y1l_gt_loss, /* 75, 76, 77 */ - y1_zero, y1_negative,y1_gt_loss, /* 78, 79, 80 */ - y1f_zero, y1f_negative,y1f_gt_loss, /* 81, 82, 83 */ + y1l_zero, y1l_negative,y1l_gt_loss, /* 75, 76, 77 */ + y1_zero, y1_negative,y1_gt_loss, /* 78, 79, 80 */ + y1f_zero, y1f_negative,y1f_gt_loss, /* 81, 82, 83 */ ynl_zero, ynl_negative,ynl_gt_loss, /* 84, 85, 86 */ yn_zero, yn_negative,yn_gt_loss, /* 87, 88, 89 */ ynf_zero, ynf_negative,ynf_gt_loss, /* 90, 91, 92 */ - j0l_gt_loss, /* 93 */ + j0l_gt_loss, /* 93 */ j0_gt_loss, /* 94 */ j0f_gt_loss, /* 95 */ j1l_gt_loss, /* 96 */ @@ -201,7 +215,7 @@ typedef enum lgammaf_overflow, lgammaf_negative, lgammaf_reserve,/* 108, 109, 110 */ gammal_overflow,gammal_negative, gammal_reserve, /* 111, 112, 113 */ gamma_overflow, gamma_negative, gamma_reserve, /* 114, 115, 116 */ - gammaf_overflow,gammaf_negative,gammaf_reserve, /* 117, 118, 119 */ + gammaf_overflow,gammaf_negative,gammaf_reserve, /* 117, 118, 119 */ fmodl_by_zero, /* 120 */ fmod_by_zero, /* 121 */ fmodf_by_zero, /* 122 */ @@ -222,7 +236,7 @@ typedef enum ldexp_overflow, ldexp_underflow, /* 146, 147 */ ldexpf_overflow, ldexpf_underflow, /* 148, 149 */ logbl_zero, logb_zero, logbf_zero, /* 150, 151, 152 */ - nextafterl_overflow, nextafter_overflow, + nextafterl_overflow, nextafter_overflow, nextafterf_overflow, /* 153, 154, 155 */ ilogbl_zero, ilogb_zero, ilogbf_zero, /* 156, 157, 158 */ exp2l_overflow, exp2l_underflow, /* 159, 160 */ @@ -235,18 +249,406 @@ typedef enum log2f_zero, log2f_negative, /* 172, 173 */ scalbnl_overflow, scalbnl_underflow, /* 174, 175 */ scalbn_overflow, scalbn_underflow, /* 176, 177 */ - scalbnf_overflow, scalbnf_underflow /* 178, 179 */ + scalbnf_overflow, scalbnf_underflow, /* 178, 179 */ + remquol_by_zero, /* 180 */ + remquo_by_zero, /* 181 */ + remquof_by_zero, /* 182 */ + lrintl_large, lrint_large, lrintf_large, /* 183, 184, 185 */ + llrintl_large, llrint_large, 
llrintf_large, /* 186, 187, 188 */ + lroundl_large, lround_large, lroundf_large, /* 189, 190, 191 */ + llroundl_large, llround_large, llroundf_large, /* 192, 193, 194 */ + fdiml_overflow, fdim_overflow, fdimf_overflow, /* 195, 196, 197 */ + nexttowardl_overflow, nexttoward_overflow, + nexttowardf_overflow, /* 198, 199, 200 */ + scalblnl_overflow, scalblnl_underflow, /* 201, 202 */ + scalbln_overflow, scalbln_underflow, /* 203, 204 */ + scalblnf_overflow, scalblnf_underflow, /* 205, 206 */ + erfcl_underflow, erfc_underflow, erfcf_underflow, /* 207, 208, 209 */ + acosdl_gt_one, acosd_gt_one, acosdf_gt_one, /* 210, 211, 212 */ + asindl_gt_one, asind_gt_one, asindf_gt_one, /* 213, 214, 215 */ + atan2dl_zero, atan2d_zero, atan2df_zero, /* 216, 217, 218 */ + tandl_overflow, tand_overflow, tandf_overflow, /* 219, 220, 221 */ + cotdl_overflow, cotd_overflow, cotdf_overflow, /* 222, 223, 224 */ + cotl_overflow, cot_overflow, cotf_overflow, /* 225, 226, 227 */ + sinhcoshl_overflow, sinhcosh_overflow, sinhcoshf_overflow, /* 228, 229, 230 */ + annuityl_by_zero, annuity_by_zero, annuityf_by_zero, /* 231, 232, 233 */ + annuityl_less_m1, annuity_less_m1, annuityf_less_m1, /* 234, 235, 236 */ + annuityl_overflow, annuity_overflow, annuityf_overflow, /* 237, 238, 239 */ + annuityl_underflow, annuity_underflow, annuityf_underflow, /* 240, 241, 242 */ + compoundl_by_zero, compound_by_zero, compoundf_by_zero, /* 243, 244, 245 */ + compoundl_less_m1, compound_less_m1, compoundf_less_m1, /* 246, 247, 248 */ + compoundl_overflow, compound_overflow, compoundf_overflow, /* 249, 250, 251 */ + compoundl_underflow, compound_underflow, compoundf_underflow, /* 252, 253, 254 */ + tgammal_overflow, tgammal_negative, tgammal_reserve, /* 255, 256, 257 */ + tgamma_overflow, tgamma_negative, tgamma_reserve, /* 258, 259, 260 */ + tgammaf_overflow, tgammaf_negative, tgammaf_reserve, /* 261, 262, 263 */ } error_types; void __libm_error_support(void*,void*,void*,error_types); +#ifdef _LIBC 
libc_hidden_proto(__libm_error_support) +#endif + +#define HI_SIGNIFICAND_LESS(X, HI) ((X)->hi_significand < 0x ## HI) +#define f64abs(x) ((x) < 0.0 ? -(x) : (x)) + +#if !defined(__USE_EXTERNAL_FPMEMTYP_H__) + +#define BIAS_32 0x007F +#define BIAS_64 0x03FF +#define BIAS_80 0x3FFF + +#define MAXEXP_32 0x00FE +#define MAXEXP_64 0x07FE +#define MAXEXP_80 0x7FFE + +#define EXPINF_32 0x00FF +#define EXPINF_64 0x07FF +#define EXPINF_80 0x7FFF + +struct fp32 { /*// sign:1 exponent:8 significand:23 (implied leading 1)*/ +#if defined(SIZE_INT_32) + unsigned significand:23; + unsigned exponent:8; + unsigned sign:1; +#elif defined(SIZE_INT_64) + unsigned significand:23; + unsigned exponent:8; + unsigned sign:1; +#endif +}; + +struct fp64 { /*/ sign:1 exponent:11 significand:52 (implied leading 1)*/ +#if defined(SIZE_INT_32) + unsigned lo_significand:32; + unsigned hi_significand:20; + unsigned exponent:11; + unsigned sign:1; +#elif defined(SIZE_INT_64) + unsigned significand:52; + unsigned exponent:11; + unsigned sign:1; +#endif +}; + +struct fp80 { /*/ sign:1 exponent:15 significand:64 (NO implied bits) */ +#if defined(SIZE_INT_32) + unsigned lo_significand; + unsigned hi_significand; + unsigned exponent:15; + unsigned sign:1; +#elif defined(SIZE_INT_64) + unsigned significand; + unsigned exponent:15; + unsigned sign:1; +#endif +}; + +#endif /*__USE_EXTERNAL_FPMEMTYP_H__*/ + +/* macros to form a double value in hex representation (unsigned int type) */ + +#define DOUBLE_HEX(hi,lo) 0x##lo,0x##hi /*LITTLE_ENDIAN*/ + +/* macros to form a long double value in hex representation (unsigned short type) */ + +#if defined(_WIN32) || defined(_WIN64) +#define LDOUBLE_ALIGN 16 +#else +#define LDOUBLE_ALIGN 12 +#endif + +#if (LDOUBLE_ALIGN == 16) +#define _XPD_ ,0x0000,0x0000,0x0000 +#else /*12*/ +#define _XPD_ ,0x0000 +#endif + +#define LDOUBLE_HEX(w4,w3,w2,w1,w0) 0x##w0,0x##w1,0x##w2,0x##w3,0x##w4 _XPD_ /*LITTLE_ENDIAN*/ + +/* macros to sign-expand low 'num' bits of 'val' to native 
integer */ -#define BIAS_64 1023 -#define EXPINF_64 2047 +#if defined(SIZE_INT_32) +# define SIGN_EXPAND(val,num) (((int)(val) << (32-(num))) >> (32-(num))) /* sign expand of 'num' LSBs; outer parentheses keep both shifts grouped inside larger expressions */ +#elif defined(SIZE_INT_64) +# define SIGN_EXPAND(val,num) (((int)(val) << (64-(num))) >> (64-(num))) /* sign expand of 'num' LSBs; outer parentheses keep both shifts grouped inside larger expressions */ +#endif + +/* macros to form pointers to FP number on-the-fly */ + +#define FP32(f) ((struct fp32 *)&(f)) +#define FP64(d) ((struct fp64 *)&(d)) +#define FP80(ld) ((struct fp80 *)&(ld)) + +/* macros to extract signed low and high doubleword of long double */ + +#if defined(SIZE_INT_32) +# define HI_DWORD_80(ld) ((((FP80(ld)->sign << 15) | FP80(ld)->exponent) << 16) | \ + ((FP80(ld)->hi_significand >> 16) & 0xFFFF)) +# define LO_DWORD_80(ld) SIGN_EXPAND(FP80(ld)->lo_significand, 32) +#elif defined(SIZE_INT_64) +# define HI_DWORD_80(ld) ((((FP80(ld)->sign << 15) | FP80(ld)->exponent) << 16) | \ + ((FP80(ld)->significand >> 48) & 0xFFFF)) +# define LO_DWORD_80(ld) SIGN_EXPAND(FP80(ld)->significand, 32) +#endif + +/* macros to extract hi bits of significand. + * note that the explicit high bit does not count (it is returned as is) + */ + +#if defined(SIZE_INT_32) +# define HI_SIGNIFICAND_80(X,NBITS) ((X)->hi_significand >> (31 - (NBITS))) +#elif defined(SIZE_INT_64) +# define HI_SIGNIFICAND_80(X,NBITS) ((X)->significand >> (63 - (NBITS))) +#endif + +/* macros to check whether the significand bits are all zero, or some of them are non-zero. 
+ * note that SIGNIFICAND_ZERO_80 tests high bit also, but SIGNIFICAND_NONZERO_80 does not + */ + +#define SIGNIFICAND_ZERO_32(X) ((X)->significand == 0) +#define SIGNIFICAND_NONZERO_32(X) ((X)->significand != 0) -#define DOUBLE_HEX(HI, LO) 0x ## LO, 0x ## HI +#if defined(SIZE_INT_32) +# define SIGNIFICAND_ZERO_64(X) (((X)->hi_significand == 0) && ((X)->lo_significand == 0)) +# define SIGNIFICAND_NONZERO_64(X) (((X)->hi_significand != 0) || ((X)->lo_significand != 0)) +#elif defined(SIZE_INT_64) +# define SIGNIFICAND_ZERO_64(X) ((X)->significand == 0) +# define SIGNIFICAND_NONZERO_64(X) ((X)->significand != 0) +#endif + +#if defined(SIZE_INT_32) +# define SIGNIFICAND_ZERO_80(X) (((X)->hi_significand == 0x00000000) && ((X)->lo_significand == 0)) +# define SIGNIFICAND_NONZERO_80(X) (((X)->hi_significand != 0x80000000) || ((X)->lo_significand != 0)) +#elif defined(SIZE_INT_64) +# define SIGNIFICAND_ZERO_80(X) ((X)->significand == 0x0000000000000000) +# define SIGNIFICAND_NONZERO_80(X) ((X)->significand != 0x8000000000000000) +#endif + +/* macros to compare long double with constant value, represented as hex */ + +#define SIGNIFICAND_EQ_HEX_32(X,BITS) ((X)->significand == 0x ## BITS) +#define SIGNIFICAND_GT_HEX_32(X,BITS) ((X)->significand > 0x ## BITS) +#define SIGNIFICAND_GE_HEX_32(X,BITS) ((X)->significand >= 0x ## BITS) +#define SIGNIFICAND_LT_HEX_32(X,BITS) ((X)->significand < 0x ## BITS) +#define SIGNIFICAND_LE_HEX_32(X,BITS) ((X)->significand <= 0x ## BITS) + +#if defined(SIZE_INT_32) +# define SIGNIFICAND_EQ_HEX_64(X,HI,LO) \ + (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand == 0x ## LO)) +# define SIGNIFICAND_GT_HEX_64(X,HI,LO) (((X)->hi_significand > 0x ## HI) || \ + (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand > 0x ## LO))) +# define SIGNIFICAND_GE_HEX_64(X,HI,LO) (((X)->hi_significand > 0x ## HI) || \ + (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand >= 0x ## LO))) +# define SIGNIFICAND_LT_HEX_64(X,HI,LO) 
(((X)->hi_significand < 0x ## HI) || \ + (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand < 0x ## LO))) +# define SIGNIFICAND_LE_HEX_64(X,HI,LO) (((X)->hi_significand < 0x ## HI) || \ + (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand <= 0x ## LO))) +#elif defined(SIZE_INT_64) +# define SIGNIFICAND_EQ_HEX_64(X,HI,LO) ((X)->significand == 0x ## HI ## LO) +# define SIGNIFICAND_GT_HEX_64(X,HI,LO) ((X)->significand > 0x ## HI ## LO) +# define SIGNIFICAND_GE_HEX_64(X,HI,LO) ((X)->significand >= 0x ## HI ## LO) +# define SIGNIFICAND_LT_HEX_64(X,HI,LO) ((X)->significand < 0x ## HI ## LO) +# define SIGNIFICAND_LE_HEX_64(X,HI,LO) ((X)->significand <= 0x ## HI ## LO) +#endif + +#if defined(SIZE_INT_32) +# define SIGNIFICAND_EQ_HEX_80(X,HI,LO) \ + (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand == 0x ## LO)) +# define SIGNIFICAND_GT_HEX_80(X,HI,LO) (((X)->hi_significand > 0x ## HI) || \ + (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand > 0x ## LO))) +# define SIGNIFICAND_GE_HEX_80(X,HI,LO) (((X)->hi_significand > 0x ## HI) || \ + (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand >= 0x ## LO))) +# define SIGNIFICAND_LT_HEX_80(X,HI,LO) (((X)->hi_significand < 0x ## HI) || \ + (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand < 0x ## LO))) +# define SIGNIFICAND_LE_HEX_80(X,HI,LO) (((X)->hi_significand < 0x ## HI) || \ + (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand <= 0x ## LO))) +#elif defined(SIZE_INT_64) +# define SIGNIFICAND_EQ_HEX_80(X,HI,LO) ((X)->significand == 0x ## HI ## LO) +# define SIGNIFICAND_GT_HEX_80(X,HI,LO) ((X)->significand > 0x ## HI ## LO) +# define SIGNIFICAND_GE_HEX_80(X,HI,LO) ((X)->significand >= 0x ## HI ## LO) +# define SIGNIFICAND_LT_HEX_80(X,HI,LO) ((X)->significand < 0x ## HI ## LO) +# define SIGNIFICAND_LE_HEX_80(X,HI,LO) ((X)->significand <= 0x ## HI ## LO) +#endif + +#define VALUE_EQ_HEX_32(X,EXP,BITS) \ + (((X)->exponent == (EXP)) && (SIGNIFICAND_EQ_HEX_32(X, BITS))) 
+#define VALUE_GT_HEX_32(X,EXP,BITS) (((X)->exponent > (EXP)) || \ + (((X)->exponent == (EXP)) && (SIGNIFICAND_GT_HEX_32(X, BITS)))) +#define VALUE_GE_HEX_32(X,EXP,BITS) (((X)->exponent > (EXP)) || \ + (((X)->exponent == (EXP)) && (SIGNIFICAND_GE_HEX_32(X, BITS)))) +#define VALUE_LT_HEX_32(X,EXP,BITS) (((X)->exponent < (EXP)) || \ + (((X)->exponent == (EXP)) && (SIGNIFICAND_LT_HEX_32(X, BITS)))) +#define VALUE_LE_HEX_32(X,EXP,BITS) (((X)->exponent < (EXP)) || \ + (((X)->exponent == (EXP)) && (SIGNIFICAND_LE_HEX_32(X, BITS)))) + +#define VALUE_EQ_HEX_64(X,EXP,HI,LO) \ + (((X)->exponent == (EXP)) && (SIGNIFICAND_EQ_HEX_64(X, HI, LO))) +#define VALUE_GT_HEX_64(X,EXP,HI,LO) (((X)->exponent > (EXP)) || \ + (((X)->exponent == (EXP)) && (SIGNIFICAND_GT_HEX_64(X, HI, LO)))) +#define VALUE_GE_HEX_64(X,EXP,HI,LO) (((X)->exponent > (EXP)) || \ + (((X)->exponent == (EXP)) && (SIGNIFICAND_GE_HEX_64(X, HI, LO)))) +#define VALUE_LT_HEX_64(X,EXP,HI,LO) (((X)->exponent < (EXP)) || \ + (((X)->exponent == (EXP)) && (SIGNIFICAND_LT_HEX_64(X, HI, LO)))) +#define VALUE_LE_HEX_64(X,EXP,HI,LO) (((X)->exponent < (EXP)) || \ + (((X)->exponent == (EXP)) && (SIGNIFICAND_LE_HEX_64(X, HI, LO)))) + +#define VALUE_EQ_HEX_80(X,EXP,HI,LO) \ + (((X)->exponent == (EXP)) && (SIGNIFICAND_EQ_HEX_80(X, HI, LO))) +#define VALUE_GT_HEX_80(X,EXP,HI,LO) (((X)->exponent > (EXP)) || \ + (((X)->exponent == (EXP)) && (SIGNIFICAND_GT_HEX_80(X, HI, LO)))) +#define VALUE_GE_HEX_80(X,EXP,HI,LO) (((X)->exponent > (EXP)) || \ + (((X)->exponent == (EXP)) && (SIGNIFICAND_GE_HEX_80(X, HI, LO)))) +#define VALUE_LT_HEX_80(X,EXP,HI,LO) (((X)->exponent < (EXP)) || \ + (((X)->exponent == (EXP)) && (SIGNIFICAND_LT_HEX_80(X, HI, LO)))) +#define VALUE_LE_HEX_80(X,EXP,HI,LO) (((X)->exponent < (EXP)) || \ + (((X)->exponent == (EXP)) && (SIGNIFICAND_LE_HEX_80(X, HI, LO)))) + +/* macros to compare two long doubles */ + +#define SIGNIFICAND_EQ_32(X,Y) ((X)->significand == (Y)->significand) +#define SIGNIFICAND_GT_32(X,Y) 
((X)->significand > (Y)->significand) +#define SIGNIFICAND_GE_32(X,Y) ((X)->significand >= (Y)->significand) +#define SIGNIFICAND_LT_32(X,Y) ((X)->significand < (Y)->significand) +#define SIGNIFICAND_LE_32(X,Y) ((X)->significand <= (Y)->significand) + +#if defined(SIZE_INT_32) +# define SIGNIFICAND_EQ_64(X,Y) \ + (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand == (Y)->lo_significand)) +# define SIGNIFICAND_GT_64(X,Y) (((X)->hi_significand > (Y)->hi_significand) || \ + (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand > (Y)->lo_significand))) +# define SIGNIFICAND_GE_64(X,Y) (((X)->hi_significand > (Y)->hi_significand) || \ + (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand >= (Y)->lo_significand))) +# define SIGNIFICAND_LT_64(X,Y) (((X)->hi_significand < (Y)->hi_significand) || \ + (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand < (Y)->lo_significand))) +# define SIGNIFICAND_LE_64(X,Y) (((X)->hi_significand < (Y)->hi_significand) || \ + (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand <= (Y)->lo_significand))) +#elif defined(SIZE_INT_64) +# define SIGNIFICAND_EQ_64(X,Y) ((X)->significand == (Y)->significand) +# define SIGNIFICAND_GT_64(X,Y) ((X)->significand > (Y)->significand) +# define SIGNIFICAND_GE_64(X,Y) ((X)->significand >= (Y)->significand) +# define SIGNIFICAND_LT_64(X,Y) ((X)->significand < (Y)->significand) +# define SIGNIFICAND_LE_64(X,Y) ((X)->significand <= (Y)->significand) +#endif + +#if defined(SIZE_INT_32) +# define SIGNIFICAND_EQ_80(X,Y) \ + (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand == (Y)->lo_significand)) +# define SIGNIFICAND_GT_80(X,Y) (((X)->hi_significand > (Y)->hi_significand) || \ + (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand > (Y)->lo_significand))) +# define SIGNIFICAND_GE_80(X,Y) (((X)->hi_significand > (Y)->hi_significand) || \ + (((X)->hi_significand == (Y)->hi_significand) && 
((X)->lo_significand >= (Y)->lo_significand))) +# define SIGNIFICAND_LT_80(X,Y) (((X)->hi_significand < (Y)->hi_significand) || \ + (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand < (Y)->lo_significand))) +# define SIGNIFICAND_LE_80(X,Y) (((X)->hi_significand < (Y)->hi_significand) || \ + (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand <= (Y)->lo_significand))) +#elif defined(SIZE_INT_64) +# define SIGNIFICAND_EQ_80(X,Y) ((X)->significand == (Y)->significand) +# define SIGNIFICAND_GT_80(X,Y) ((X)->significand > (Y)->significand) +# define SIGNIFICAND_GE_80(X,Y) ((X)->significand >= (Y)->significand) +# define SIGNIFICAND_LT_80(X,Y) ((X)->significand < (Y)->significand) +# define SIGNIFICAND_LE_80(X,Y) ((X)->significand <= (Y)->significand) +#endif + +#define VALUE_EQ_32(X,Y) \ + (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_EQ_32(X, Y))) +#define VALUE_GT_32(X,Y) (((X)->exponent > (Y)->exponent) || \ + (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_GT_32(X, Y)))) +#define VALUE_GE_32(X,Y) (((X)->exponent > (Y)->exponent) || \ + (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_GE_32(X, Y)))) +#define VALUE_LT_32(X,Y) (((X)->exponent < (Y)->exponent) || \ + (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_LT_32(X, Y)))) +#define VALUE_LE_32(X,Y) (((X)->exponent < (Y)->exponent) || \ + (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_LE_32(X, Y)))) + +#define VALUE_EQ_64(X,Y) \ + (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_EQ_64(X, Y))) +#define VALUE_GT_64(X,Y) (((X)->exponent > (Y)->exponent) || \ + (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_GT_64(X, Y)))) +#define VALUE_GE_64(X,Y) (((X)->exponent > (Y)->exponent) || \ + (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_GE_64(X, Y)))) +#define VALUE_LT_64(X,Y) (((X)->exponent < (Y)->exponent) || \ + (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_LT_64(X, Y)))) +#define VALUE_LE_64(X,Y) (((X)->exponent < (Y)->exponent) || \ + (((X)->exponent == 
(Y)->exponent) && (SIGNIFICAND_LE_64(X, Y)))) + +#define VALUE_EQ_80(X,Y) \ + (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_EQ_80(X, Y))) +#define VALUE_GT_80(X,Y) (((X)->exponent > (Y)->exponent) || \ + (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_GT_80(X, Y)))) +#define VALUE_GE_80(X,Y) (((X)->exponent > (Y)->exponent) || \ + (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_GE_80(X, Y)))) +#define VALUE_LT_80(X,Y) (((X)->exponent < (Y)->exponent) || \ + (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_LT_80(X, Y)))) +#define VALUE_LE_80(X,Y) (((X)->exponent < (Y)->exponent) || \ + (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_LE_80(X, Y)))) + +/* add/subtract 1 ulp macros */ + +#if defined(SIZE_INT_32) +# define ADD_ULP_80(X) \ + if ((++(X)->lo_significand == 0) && \ + (++(X)->hi_significand == (((X)->exponent == 0) ? 0x80000000 : 0))) \ + { \ + (X)->hi_significand |= 0x80000000; \ + ++(X)->exponent; \ + } +# define SUB_ULP_80(X) \ + if (--(X)->lo_significand == 0xFFFFFFFF) { \ + --(X)->hi_significand; \ + if (((X)->exponent != 0) && \ + ((X)->hi_significand == 0x7FFFFFFF) && \ + (--(X)->exponent != 0)) \ + { \ + (X)->hi_significand |= 0x80000000; \ + } \ + } +#elif defined(SIZE_INT_64) +# define ADD_ULP_80(X) \ + if (++(X)->significand == (((X)->exponent == 0) ? 
0x8000000000000000 : 0)) { /* significand wrapped: restore the explicit integer bit and bump the exponent */ \ + (X)->significand |= 0x8000000000000000; \ + ++(X)->exponent; \ + } +# define SUB_ULP_80(X) \ + { \ + --(X)->significand; \ + if (((X)->exponent != 0) && \ + ((X)->significand == 0x7FFFFFFFFFFFFFFF) && \ + (--(X)->exponent != 0)) \ + { \ + (X)->significand |= 0x8000000000000000; \ + } \ + } +#endif + + + +#if (defined(_WIN32) && !defined(_WIN64)) + +#define FP80_DECLARE() +#define _FPC_64 0x0300 +static unsigned short __wControlWord, __wNewControlWord; +#define FP80_SET() { \ + __asm { fnstcw word ptr [__wControlWord] } \ + __wNewControlWord = __wControlWord | _FPC_64; \ + __asm { fldcw word ptr [__wNewControlWord] } \ + } +#define FP80_RESET() { \ + __asm { fldcw word ptr [__wControlWord] } \ + } +#else /* defined(_WIN32) && !defined(_WIN64) */ + +#define FP80_DECLARE() +#define FP80_SET() +#define FP80_RESET() + +#endif /* defined(_WIN32) && !defined(_WIN64) */ + + +#ifdef _LIBC +# include <math.h> +#else -#if 0 static const unsigned INF[] = { DOUBLE_HEX(7ff00000, 00000000), DOUBLE_HEX(fff00000, 00000000) @@ -255,12 +657,12 @@ static const unsigned INF[] = { static const double _zeroo = 0.0; static const double _bigg = 1.0e300; static const double _ponee = 1.0; -static const double _nonee = -1.0; +static const double _nonee = -1.0; #define INVALID (_zeroo * *((double*)&INF[0])) -#define PINF *((double*)&INF[0]) -#define NINF -PINF -#define PINF_DZ (_ponee/_zeroo) +#define PINF *((double*)&INF[0]) +#define NINF -PINF +#define PINF_DZ (_ponee/_zeroo) #define X_TLOSS 1.41484755040568800000e+16 #endif @@ -278,7 +680,7 @@ struct __exception char *name; double arg1, arg2, retval; }; -# else +# else # ifndef _LIBC struct exception @@ -300,18 +702,18 @@ struct exceptionl }; #ifdef _MS_ -#define MATHERR_F _matherrf -#define MATHERR_D _matherr +#define MATHERR_F _matherrf +#define MATHERR_D _matherr #else -#define MATHERR_F matherrf -#define MATHERR_D matherr +#define MATHERR_F matherrf +#define MATHERR_D matherr #endif # ifdef __cplusplus 
-#define EXC_DECL_D __exception +#define EXC_DECL_D __exception #else // exception is a reserved name in C++ -#define EXC_DECL_D exception +#define EXC_DECL_D exception #endif extern int MATHERR_F(struct exceptionf*); @@ -324,7 +726,7 @@ extern int matherrl(struct exceptionl*); #define ERRNO_DOMAIN errno = EDOM -// Add code to support _LIB_VERSION +// Add code to support _LIB_VERSIONIMF #ifndef _LIBC typedef enum { @@ -335,29 +737,19 @@ typedef enum _ISOC_ // ISO C9X } _LIB_VERSION_TYPE; -extern _LIB_VERSION_TYPE _LIB_VERSION; -#endif -// This is a run-time variable and may effect -// floating point behavior of the libm functions - -#elif defined _LIBC - -# if !defined NOT_IN_libc && defined SHARED && defined DO_VERSIONING \ - && !defined HAVE_BROKEN_ALIAS_ATTRIBUTE && !defined NO_HIDDEN -# define __libm_error_support __GI___libm_error_support -# endif - -#endif /* __ASSEMBLER__ */ - -/* Support for compatible assembler handling. */ -#if !defined L && defined _LIBC -#define L(name) .L##name -#endif -#ifdef __ELF__ -#define ASM_SIZE_DIRECTIVE(name) .size name,.-name -#define ASM_TYPE_DIRECTIVE(name,T) .type name,T +#if !defined( LIBM_BUILD ) +#if defined( _DLL ) +extern _LIB_VERSION_TYPE __declspec(dllimport) _LIB_VERSIONIMF; +#else +extern _LIB_VERSION_TYPE _LIB_VERSIONIMF; +#endif /* _DLL */ #else -#define ASM_SIZE_DIRECTIVE(name) -#define ASM_TYPE_DIRECTIVE(name,T) +extern int (*pmatherrf)(struct exceptionf*); +extern int (*pmatherr)(struct EXC_DECL_D*); +extern int (*pmatherrl)(struct exceptionl*); +#endif /* LIBM_BUILD */ + +// This is a run-time variable and may affect +// floating point behavior of the libm functions #endif diff --git a/sysdeps/ia64/fpu/s_atan.S b/sysdeps/ia64/fpu/s_atan.S index c0daabd3d7..720ecad28a 100644 --- a/sysdeps/ia64/fpu/s_atan.S +++ b/sysdeps/ia64/fpu/s_atan.S @@ -1,10 +1,10 @@ .file "atan.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. 
-// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,947 +20,734 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00: Initial version -// 4/13/00: Improved speed -// 4/19/00: Removed the qualifying predicate from the fmerge.s that -// takes the absolute value. -// 6/16/00: Reassigned FP registers to eliminate stalls on loads -// 8/30/00: Saved 5 cycles in main path by rearranging large argument logic -// and delaying use of result of fcmp in load by 1 group +// 02/02/00 Initial version +// 04/13/00 Improved speed +// 04/19/00 Removed the qualifying predicate from the fmerge.s that +// takes the absolute value. +// 06/16/00 Reassigned FP registers to eliminate stalls on loads +// 08/30/00 Saved 5 cycles in main path by rearranging large argument logic +// and delaying use of result of fcmp in load by 1 group +// 05/20/02 Cleaned up namespace and sf0 syntax +// 08/20/02 Use atan2 algorithm with x=1 for better accuracy +// 02/06/03 Reordered header: .section, .global, .proc, .align // // API //============================================================== -// double atan( double x); +// double atan(double Y) // // Overview of operation //============================================================== -// atan(x) = sign(X)pi/2 - atan(1/x) // -// We have two paths: |x| > 1 and |x| <= 1 +// The atan function returns values in the interval [-pi/2,+pi/2]. // -// |x| > 1 -// ========================================== +// The algorithm used is the atan2(Y,X) algorithm where we fix X=1.0. // -// c = frcpa(x) which is approximately 1/x +// There are two basic paths: swap true and swap false. +// atan2(Y,X) ==> atan2(V/U) where U >= V. If Y > X, we must swap. 
// -// xc = 1- B -// B = 1-xc +// p6 swap True |Y| > |X| +// p7 swap False |Y| <= |X| // -// Approximate 1/(1-B)^k by a polynomial in B, poly(B) -// k is 45. // -// poly(B) = 1 + r1 B + r2 B^2 + ...+ r10 B^10 +// Simple trigonometric identities show +// Region 1 +// |Y|<=1.0, V=Y, U=1.0 atan2(Y,X) = sgnY * (0 + atan(V/U)) // -// c^k = (1-B)^k/x^k -// c^k/(1-B)^k = 1/x^k -// c^k poly(B) = 1/x^k - -// poly(x) = series(atan(1/x)) = 1/x - 1/3x^3 + 1/5x^5 - 1/7x^7 .... + 1/45 x^45 -// = 1/x^45 ( x^44 - x^42/3 + x^40/5 - x^38/7 ... +1) -// = 1/x^45 ( y^22 - y^21/3 + y^20/5 - y^19/7 ... +1) -// -// = c^45 poly(B) poly(x) -// = c^45 r(B) q(y) - -// q(y) = q0 + q1 y + q2 y^2 + ... + q22 y^22 -// where q22 is 1.0 - -// atan(x) = sign(X)pi/2 - c^45 r(B) q(y) - -// |x| <= 1 -// ========================================== -// poly(x) = series(atan(x)) = x - x^3/3 + x^5/5 + ..... -// poly(x) = series(atan(x)) = x + x^3(- 1/3 + x^2/5 + ..... +x^47/47) -// poly(x) = series(atan(x)) = x + x^3(p0 + x^2/5 + ..... + x^44/47) -// poly(x) = series(atan(x)) = x + x^3(p0 + y/5 + ..... + y^22/47) - -// where p0 is about -1/3. - -// atan(x) = poly(x) - -#include "libm_support.h" +// Region 2 +// |Y|>1.0, V=1.0, U=Y atan2(Y,X) = sgnY * (pi/2 - atan(V/U)) +// +// +// We compute atan(V/U) from the identity +// atan(z) + atan([(V/U)-z] / [1+(V/U)z]) +// where z is a limited precision approximation (16 bits) to V/U +// +// z is calculated with the assistance of the frcpa instruction. +// +// atan(z) is calculated by a polynomial z + z^3 * p(w), w=z^2 +// where p(w) = P0+P1*w+...+P22*w^22 +// +// Let d = [(V/U)-z] / [1+(V/U)z]) = (V-U*z)/(U+V*z) +// +// Approximate atan(d) by d + P0*d^3 +// Let F = 1/(U+V*z) * (1-a), where |a|< 2^-8.8. +// Compute q(a) = 1 + a + ... + a^5. +// Then F*q(a) approximates the reciprocal to more than 50 bits. 
-// Special Values +// Special values //============================================================== // atan(QNAN) = QNAN // atan(SNAN) = quieted SNAN -// atan(+-inf) = +- pi/2 +// atan(+-inf) = +- pi/2 // atan(+-0) = +-0 - - // Registers used //============================================================== -// predicate registers used: -// p6 -> p11 +// predicate registers used: +// p6 -> p15 -// floating-point registers used: -// f32 -> f127 +// floating-point registers used: +// f8, input +// f32 -> f116 // general registers used -// r32 -> r37 +// r14 -> r16 // Assembly macros //============================================================== -atan_Pi_by_2 = f32 -atan_S_PI = f33 -atan_ABS_f8 = f34 - -atan_R0 = f35 -atan_R1 = f36 -atan_R2 = f37 -atan_R3 = f38 -atan_R4 = f39 -atan_R5 = f40 -atan_R6 = f41 -atan_R7 = f42 -atan_R8 = f43 -atan_R9 = f44 -atan_R10 = f45 - -atan_Q0 = f46 - -atan_Q1 = f47 -atan_Q2 = f48 -atan_Q3 = f49 -atan_Q4 = f50 -atan_Q5 = f51 -atan_Q6 = f52 -atan_Q7 = f53 -atan_Q8 = f54 -atan_Q9 = f55 -atan_Q10 = f56 - -atan_Q11 = f57 -atan_Q12 = f58 -atan_Q13 = f59 -atan_Q14 = f60 -atan_Q15 = f61 -atan_Q16 = f62 -atan_Q17 = f63 -atan_Q18 = f64 -atan_Q19 = f65 -atan_Q20 = f66 -atan_Q21 = f67 -atan_Q22 = f68 - -// P and Q constants are mutually exclusive -// so they can share macro definitions -atan_P0 = f46 - -atan_P1 = f47 -atan_P2 = f48 -atan_P3 = f49 -atan_P4 = f10 -atan_P5 = f11 -atan_P6 = f12 -atan_P7 = f13 -atan_P10 = f103 - -atan_P11 = f114 -atan_P12 = f58 -atan_P13 = f59 -atan_P14 = f60 -atan_P15 = f61 -atan_P16 = f62 -atan_P17 = f63 -atan_P18 = f64 -atan_P19 = f65 -atan_P20 = f14 -atan_P21 = f99 -atan_P22 = f68 -// end of P constant macros - -atan_C = f69 -atan_Y = f70 -atan_B = f71 -atan_Z = f72 -atan_V11 = f73 -atan_V12 = f74 - -atan_V7 = f75 -atan_V8 = f76 - -atan_W13 = f77 -atan_W11 = f78 - -atan_V3 = f79 -atan_V4 = f80 - -atan_G11 = f81 -atan_G12 = f82 -atan_G7 = f83 -atan_G8 = f84 - -atan_Z1 = f85 -atan_W7 = f86 - -atan_G3 = f87 
-atan_W8 = f88 -atan_V9 = f89 -atan_V10 = f90 - -atan_G10 = f91 -atan_W3 = f92 -atan_G4 = f93 -atan_G9 = f94 - -atan_G6 = f95 -atan_W4 = f96 -atan_Z2 = f97 -atan_V6 = f98 - -atan_V2 = f99 -atan_W6 = f100 -atan_W10 = f101 -atan_Y3 = f102 - -atan_G2 = f103 - -atan_Y8 = f104 - -atan_G5 = f105 -atan_Z3 = f106 -atan_Z4 = f107 -atan_W2 = f108 -atan_V5 = f109 - -atan_W5 = f110 -atan_G1 = f111 -atan_Y11 = f112 - -atan_Z5 = f113 -atan_Z6 = f114 -atan_V1 = f115 -atan_W1 = f116 - -atan_Z7 = f117 -atan_Q = f118 -atan_Z = f119 -atan_abs_f8 = f120 - -atan_V13 = f121 -atan_Xcub = f122 -atan_Y12 = f123 -atan_P = f124 - -atan_NORM_f8 = f125 - -atan_P8 = f126 -atan_P9 = f127 - - - - -atan_GR_AD_R = r14 -atan_GR_AD_Q = r15 -atan_GR_AD_P = r16 -atan_GR_10172 = r17 -atan_GR_exp_f8 = r18 -atan_GR_signexp_f8 = r19 -atan_GR_exp_mask = r20 - - +EXP_AD_P1 = r14 +EXP_AD_P2 = r15 +rsig_near_one = r16 + +atan2_Y = f8 +atan2_X = f1 + +atan2_u1_X = f32 +atan2_u1_Y = f33 +atan2_z2_X = f34 + +atan2_two = f36 +atan2_B1sq_Y = f37 +atan2_z1_X = f38 +atan2_B1X = f40 + +atan2_B1Y = f41 +atan2_wp_X = f42 +atan2_B1sq_X = f43 +atan2_z = f44 +atan2_w = f45 + +atan2_P0 = f46 +atan2_P1 = f47 +atan2_P2 = f48 +atan2_P3 = f49 +atan2_P4 = f50 + +atan2_P5 = f51 +atan2_P6 = f52 +atan2_P7 = f53 +atan2_P8 = f54 +atan2_P9 = f55 + +atan2_P10 = f56 +atan2_P11 = f57 +atan2_P12 = f58 +atan2_P13 = f59 +atan2_P14 = f60 + +atan2_P15 = f61 +atan2_P16 = f62 +atan2_P17 = f63 +atan2_P18 = f64 +atan2_P19 = f65 + +atan2_P20 = f66 +atan2_P21 = f67 +atan2_P22 = f68 +atan2_pi_by_2 = f69 +atan2_sgn_pi_by_2 = f69 +atan2_V13 = f70 + +atan2_W11 = f71 +atan2_E = f72 +atan2_wp_Y = f73 +atan2_V11 = f74 +atan2_V12 = f75 + +atan2_V7 = f76 +atan2_V8 = f77 +atan2_W7 = f78 +atan2_W8 = f79 +atan2_W3 = f80 + +atan2_W4 = f81 +atan2_V3 = f82 +atan2_V4 = f83 +atan2_F = f84 +atan2_gV = f85 + +atan2_V10 = f86 +atan2_zcub = f87 +atan2_V6 = f88 +atan2_V9 = f89 +atan2_W10 = f90 + +atan2_W6 = f91 +atan2_W2 = f92 +atan2_V2 = f93 +atan2_alpha = f94 
+atan2_alpha_1 = f95 + +atan2_gVF = f96 +atan2_V5 = f97 +atan2_W12 = f98 +atan2_W5 = f99 +atan2_alpha_sq = f100 + +atan2_Cp = f101 +atan2_V1 = f102 +atan2_ysq = f103 +atan2_W1 = f104 +atan2_alpha_cub = f105 + +atan2_C = f106 +atan2_d = f108 +atan2_A_hi = f109 +atan2_dsq = f110 + +atan2_pd = f111 +atan2_A_lo = f112 +atan2_A = f113 +atan2_Pp = f114 +atan2_sgnY = f115 + +atan2_sig_near_one = f116 +atan2_near_one = f116 ///////////////////////////////////////////////////////////// -#ifdef _LIBC -.rodata -#else -.data -#endif +RODATA .align 16 -double_atan_constants_R: -ASM_TYPE_DIRECTIVE(double_atan_constants_R,@object) - data8 0xB36B46B9C5443CED, 0x0000401C //R8 - data8 0x842633E0D126261F, 0x0000401F //R9 - data8 0xBE04FFFFFFFF46E0, 0x00004010 //R4 - data8 0xE8C62000244D66E2, 0x00004013 //R5 - data8 0xF2790C001E3789B3, 0x00004016 //R6 - data8 0xDCD2CCF97D7C764F, 0x00004019 //R7 - data8 0xB40000000000000B, 0x00004004 //R1 - data8 0xB265F3D38F5EE28F, 0x00004021 //R10 - data8 0x8160000000000001, 0x00004009 //R2 - data8 0xFD5BFFFFFFFE55CD, 0x0000400C //R3 - data8 0xC90FDAA22168C235, 0x00003FFF // pi/2 -ASM_SIZE_DIRECTIVE(double_atan_constants_R) - -double_atan_constants_Q: -ASM_TYPE_DIRECTIVE(double_atan_constants_Q,@object) - data8 0xEBD602FA7761BC33, 0x00003FF9 //Q8 - data8 0x8CB1CABD6A91913C, 0x0000BFFA //Q9 - data8 0x84C665C37D623CD2, 0x00003FF7 //Q4 - data8 0x8DE0D1673DAEA9BC, 0x0000BFF8 //Q5 - data8 0xF658ADBE2C6E6FCC, 0x00003FF8 //Q6 - - data8 0xB56307BE1DD3FFB6, 0x0000BFF9 //Q7 - data8 0xAAAAAAAAAAAA8000, 0x0000BFFD //Q21 - data8 0x8000000000000000, 0x00003FFF //Q22 - data8 0x924924923A9D710C, 0x0000BFFC //Q19 - data8 0xCCCCCCCCCC9380E7, 0x00003FFC //Q20 - - data8 0xA644DC250EFA2800, 0x00003FED //Q0 - data8 0x83DEAE24EEBF5E44, 0x0000BFF1 //Q1 - data8 0xC758CCC64793D4EC, 0x00003FF3 //Q2 - data8 0xBFDC0B54E7C89DCE, 0x0000BFF5 //Q3 - data8 0x888855199D1290AF, 0x0000BFFB //Q15 - - data8 0x9D89D3BE514B0178, 0x00003FFB //Q16 - data8 0xBA2E8B4DEC70282A, 0x0000BFFB //Q17 
- data8 0xE38E38DF9E9FC83B, 0x00003FFB //Q18 - data8 0x9F8781CC990029D9, 0x00003FFA //Q10 - data8 0xB0B39472DEBA3C79, 0x0000BFFA //Q11 - - data8 0xC2AFAEF8C85B0BC6, 0x00003FFA //Q12 - data8 0xD780E539797525DD, 0x0000BFFA //Q13 - data8 0xF0EDC449AC786DF9, 0x00003FFA //Q14 -ASM_SIZE_DIRECTIVE(double_atan_constants_Q) - - - -double_atan_constants_P: -ASM_TYPE_DIRECTIVE(double_atan_constants_P,@object) - data8 0xB1899EC590CDB8DF, 0x0000BFFA //P10 - data8 0xA1E79850A67D59B0, 0x00003FFA //P11 - data8 0x911D8B30C2A96E6D, 0x0000BFF3 //P20 - data8 0xB87233C68A640706, 0x00003FF0 //P21 - data8 0xD78E4B82F3C29D7A, 0x0000BFFA //P8 - - data8 0xC2EBE37AF932C14F, 0x00003FFA //P9 - data8 0xBA2E8B94AA104DD6, 0x0000BFFB //P4 - data8 0x9D89D7A640B71D38, 0x00003FFB //P5 - data8 0x88887CA2CE9B2A40, 0x0000BFFB //P6 - data8 0xF0F017D57A919C1E, 0x00003FFA //P7 - - data8 0xD0D635F230C80E06, 0x0000BFF8 //P16 - data8 0xE847BECA7209B479, 0x00003FF7 //P17 - data8 0xD14C6A2AAE0D5B07, 0x0000BFF6 //P18 - data8 0x915F612A5C469117, 0x00003FF5 //P19 - data8 0x921EDE5FD0DBBBE2, 0x0000BFFA //P12 - - data8 0xFFD303C2C8535445, 0x00003FF9 //P13 - data8 0xD30DF50E295386F7, 0x0000BFF9 //P14 - data8 0x9E81F2B1BBD210A8, 0x00003FF9 //P15 - data8 0xAAAAAAAAAAAAA800, 0x0000BFFD //P0 - data8 0xCCCCCCCCCCC7D476, 0x00003FFC //P1 - - data8 0x9249249247838066, 0x0000BFFC //P2 - data8 0xE38E38E302290D68, 0x00003FFB //P3 - data8 0xDF7F0A816F7E5025, 0x0000BFEC //P22 -ASM_SIZE_DIRECTIVE(double_atan_constants_P) - - -.align 32 -.global atan# - -//////////////////////////////////////////////////////// - +LOCAL_OBJECT_START(atan2_tb1) +data8 0xA21922DC45605EA1 , 0x00003FFA // P11 +data8 0xB199DD6D2675C40F , 0x0000BFFA // P10 +data8 0xC2F01E5DDD100DBE , 0x00003FFA // P9 +data8 0xD78F28FC2A592781 , 0x0000BFFA // P8 +data8 0xF0F03ADB3FC930D3 , 0x00003FFA // P7 +data8 0x88887EBB209E3543 , 0x0000BFFB // P6 +data8 0x9D89D7D55C3287A5 , 0x00003FFB // P5 +data8 0xBA2E8B9793955C77 , 0x0000BFFB // P4 +data8 0xE38E38E320A8A098 , 
0x00003FFB // P3 +data8 0x9249249247E37913 , 0x0000BFFC // P2 +data8 0xCCCCCCCCCCC906CD , 0x00003FFC // P1 +data8 0xAAAAAAAAAAAAA8A9 , 0x0000BFFD // P0 +data8 0x0000000000000000 , 0x00000000 // pad to avoid bank conflict +LOCAL_OBJECT_END(atan2_tb1) + +LOCAL_OBJECT_START(atan2_tb2) +data8 0xCE585A259BD8374C , 0x00003FF0 // P21 +data8 0x9F90FB984D8E39D0 , 0x0000BFF3 // P20 +data8 0x9D3436AABE218776 , 0x00003FF5 // P19 +data8 0xDEC343E068A6D2A8 , 0x0000BFF6 // P18 +data8 0xF396268151CFB11C , 0x00003FF7 // P17 +data8 0xD818B4BB43D84BF2 , 0x0000BFF8 // P16 +data8 0xA2270D30A90AA220 , 0x00003FF9 // P15 +data8 0xD5F4F2182E7A8725 , 0x0000BFF9 // P14 +data8 0x80D601879218B53A , 0x00003FFA // P13 +data8 0x9297B23CCFFB291F , 0x0000BFFA // P12 +data8 0xFE7E52D2A89995B3 , 0x0000BFEC // P22 +data8 0xC90FDAA22168C235 , 0x00003FFF // pi/2 +LOCAL_OBJECT_END(atan2_tb2) -.section .text -.proc atan# -.align 32 -atan: - -{ .mmf -(p0) addl atan_GR_AD_P = @ltoff(double_atan_constants_P), gp -(p0) addl atan_GR_AD_Q = @ltoff(double_atan_constants_Q), gp -(p0) fmerge.s atan_ABS_f8 = f0,f8 -} -;; -{ .mmf - ld8 atan_GR_AD_P = [atan_GR_AD_P] - ld8 atan_GR_AD_Q = [atan_GR_AD_Q] -(p0) frcpa.s1 atan_C,p8 = f1,f8 -} -;; +.section .text +GLOBAL_LIBM_ENTRY(atan) -{ .mmf -(p0) addl atan_GR_AD_R = @ltoff(double_atan_constants_R), gp -(p0) addl atan_GR_exp_mask = 0x1ffff, r0 -(p0) fma.s1 atan_Y = f8,f8,f0 +{ .mfi + nop.m 999 + frcpa.s1 atan2_u1_Y,p7 = f1,atan2_Y + nop.i 999 } +{ .mfi + addl EXP_AD_P1 = @ltoff(atan2_tb1), gp + fma.s1 atan2_two = f1,f1,f1 + nop.i 999 ;; - -// This fnorm takes faults or sets fault flags -{ .mmf -(p0) mov atan_GR_10172 = 0x10172 - ld8 atan_GR_AD_R = [atan_GR_AD_R] -(p0) fnorm atan_NORM_f8 = f8 } -;; - - -// qnan snan inf norm unorm 0 -+ -// 1 1 0 0 0 1 11 -// c 7 - -// p9 set if we have a NAN or +-0 -{ .mmf -(p0) ldfe atan_Q8 = [atan_GR_AD_Q],16 -(p0) ldfe atan_P10 = [atan_GR_AD_P],16 -(p0) fclass.m.unc p9, p0 = f8, 0xc7 +{ .mfi + ld8 EXP_AD_P1 = [EXP_AD_P1] + frcpa.s1 
atan2_u1_X,p6 = f1,atan2_X + nop.i 999 } -;; - - -{ .mmi -(p0) ldfe atan_Q9 = [atan_GR_AD_Q],16 -(p0) ldfe atan_P11 = [atan_GR_AD_P],16 - nop.i 999 +{ .mfi + nop.m 999 + fma.s1 atan2_ysq = atan2_Y,atan2_Y,f0 + nop.i 999 } ;; - -{ .mmf -(p0) ldfe atan_Q4 = [atan_GR_AD_Q],16 -(p0) ldfe atan_P20 = [atan_GR_AD_P],16 -(p9) fma.d.s0 f8 = f8,f1,f0 -;; -} - -// Exit if we have a NAN or +-0 -{ .mmb -(p0) ldfe atan_Q5 = [atan_GR_AD_Q],16 -(p0) ldfe atan_P21 = [atan_GR_AD_P],16 -(p9) br.ret.spnt b0 -;; +{ .mfi + add EXP_AD_P2 = 0xd0,EXP_AD_P1 + fmerge.s atan2_sgnY = atan2_Y,f1 + nop.i 999 } - - -// p6 is TRUE if |x| <= 1 -// p7 is TRUE if |x| > 1 -{ .mmf -(p0) ldfe atan_Q6 = [atan_GR_AD_Q],16 -(p0) ldfe atan_P8 = [atan_GR_AD_P],16 -(p0) fcmp.le.unc p6,p7 = atan_ABS_f8, f1 ;; -} { .mfi -(p0) ldfe atan_Q7 = [atan_GR_AD_Q],16 -(p0) fma.s1 atan_Z = atan_C, atan_C, f0 - nop.i 999 + ldfe atan2_P11 = [EXP_AD_P1],16 + fclass.m p10,p0 = atan2_Y, 0xc3 // Test for y=nan + nop.i 999 } { .mfi -(p0) ldfe atan_P9 = [atan_GR_AD_P],16 -(p0) fnma.s1 atan_B = atan_C,f8, f1 - nop.i 999 ;; + ldfe atan2_P21 = [EXP_AD_P2],16 + nop.f 999 + nop.i 999 +;; } { .mfi -(p0) ldfe atan_Q21 = [atan_GR_AD_Q],16 -(p0) fma.s1 atan_V12 = atan_Y, atan_Y, f0 - nop.i 999 + ldfe atan2_P10 = [EXP_AD_P1],16 + fnma.s1 atan2_B1Y = atan2_u1_Y, atan2_Y, atan2_two + nop.i 999 } { .mfi -(p0) ldfe atan_P4 = [atan_GR_AD_P],16 -(p0) fma.s1 atan_Xcub = f8, atan_Y , f0 - nop.i 999 -;; -} - - -{ .mmi -(p7) ldfe atan_Q22 = [atan_GR_AD_Q],16 -(p6) ldfe atan_P5 = [atan_GR_AD_P],16 -(p6) cmp.eq.unc p8,p0 = r0,r0 -;; -} - - -{ .mmi -(p7) ldfe atan_Q19 = [atan_GR_AD_Q],16 -(p6) ldfe atan_P6 = [atan_GR_AD_P],16 -(p7) cmp.eq.unc p9,p0 = r0,r0 -;; -} - - -{ .mmi -(p7) ldfe atan_Q20 = [atan_GR_AD_Q],16 -(p6) ldfe atan_P7 = [atan_GR_AD_P],16 - nop.i 999 + ldfe atan2_P20 = [EXP_AD_P2],16 + fma.s1 atan2_wp_Y = atan2_u1_Y, atan2_u1_Y, f0 + nop.i 999 ;; } { .mfi -(p7) ldfe atan_Q0 = [atan_GR_AD_Q],16 -(p6) fma.s1 atan_V13 = atan_Y, atan_P11, 
atan_P10 - nop.i 999 + ldfe atan2_P9 = [EXP_AD_P1],16 + fma.s1 atan2_z1_X = atan2_u1_X, atan2_Y, f0 + nop.i 999 } { .mfi -(p6) ldfe atan_P16 = [atan_GR_AD_P],16 -(p7) fma.s1 atan_V11 = atan_Y, atan_Q9, atan_Q8 - nop.i 999 ;; + ldfe atan2_P19 = [EXP_AD_P2],16 + fnma.s1 atan2_B1X = atan2_u1_X, atan2_X, atan2_two + nop.i 999 } - +;; { .mfi -(p7) ldfe atan_Q1 = [atan_GR_AD_Q],16 -(p7) fma.s1 atan_G12 = atan_B, atan_B, f0 - nop.i 999 + ldfe atan2_P8 = [EXP_AD_P1],16 + fma.s1 atan2_z2_X = atan2_u1_X, atan2_ysq, f0 + nop.i 999 } -{ .mfi -(p6) ldfe atan_P17 = [atan_GR_AD_P],16 -(p0) fma.s1 atan_V9 = atan_V12, atan_V12, f0 - nop.i 999 ;; +{ .mfb + ldfe atan2_P18 = [EXP_AD_P2],16 +(p10) fma.d.s0 f8 = atan2_Y,atan2_X,f0 // If y=nan, result quietized y +(p10) br.ret.spnt b0 // Exit if y=nan } +;; - +// p6 true if swap, means |y| > 1.0 or ysq > 1.0 +// p7 true if no swap, means 1.0 >= |y| or 1.0 >= ysq { .mfi -(p7) ldfe atan_Q2 = [atan_GR_AD_Q],16 -(p6) fma.s1 atan_W11 = atan_Y, atan_P21, atan_P20 - nop.i 999 + ldfe atan2_P7 = [EXP_AD_P1],16 + fcmp.ge.s1 p7,p6 = f1, atan2_ysq + nop.i 999 } -{ .mfi -(p6) ldfe atan_P18 = [atan_GR_AD_P],16 -(p7) fma.s1 atan_V7 = atan_Y, atan_Q5, atan_Q4 - nop.i 999 ;; +{ .mmf + ldfe atan2_P17 = [EXP_AD_P2],16 + nop.m 999 + nop.f 999 } +;; { .mfi -(p7) ldfe atan_Q3 = [atan_GR_AD_Q],16 -(p7) fma.s1 atan_Z1 = atan_Z, atan_Z, f0 - nop.i 999 + ldfe atan2_P6 = [EXP_AD_P1],16 + fma.s1 atan2_E = atan2_u1_Y, atan2_B1Y, atan2_Y + nop.i 999 } { .mfi -(p6) ldfe atan_P19 = [atan_GR_AD_P],16 -(p7) fma.s1 atan_Y3 = atan_Y , atan_V12, f0 - nop.i 999 ;; + ldfe atan2_P16 = [EXP_AD_P2],16 + fma.s1 atan2_B1sq_Y = atan2_B1Y, atan2_B1Y, f0 + nop.i 999 +;; } { .mfi -(p7) ldfe atan_R8 = [atan_GR_AD_R],16 -(p6) fma.s1 atan_V11 = atan_Y, atan_P9, atan_P8 - nop.i 999 + ldfe atan2_P5 = [EXP_AD_P1],16 +(p7) fma.s1 atan2_wp_X = atan2_z1_X, atan2_z1_X, f0 + nop.i 999 } { .mfi -(p6) ldfe atan_P12 = [atan_GR_AD_P],16 -(p7) fma.s1 atan_V8 = atan_Y, atan_Q7, atan_Q6 - nop.i 999 ;; 
-} - -{ .mmi -(p7) ldfe atan_R9 = [atan_GR_AD_R],16 -(p6) ldfe atan_P13 = [atan_GR_AD_P],16 - nop.i 999 + ldfe atan2_P15 = [EXP_AD_P2],16 +(p7) fma.s1 atan2_B1sq_X = atan2_B1X, atan2_B1X, f0 + nop.i 999 ;; } { .mfi -(p7) ldfe atan_R4 = [atan_GR_AD_R],16 -(p6) fma.s1 atan_V7 = atan_Y, atan_P5, atan_P4 - nop.i 999 + ldfe atan2_P4 = [EXP_AD_P1],16 +(p6) fma.s1 atan2_z = atan2_u1_Y, atan2_B1Y, f0 + nop.i 999 } { .mfi -(p6) ldfe atan_P14 = [atan_GR_AD_P],16 -(p7) fma.s1 atan_W13 = atan_Y, atan_Q22, atan_Q21 - nop.i 999 ;; + ldfe atan2_P14 = [EXP_AD_P2],16 +(p7) fma.s1 atan2_E = atan2_z2_X, atan2_B1X, atan2_X + nop.i 999 +;; } { .mfi -(p7) ldfe atan_R5 = [atan_GR_AD_R],16 -(p6) fma.s1 atan_Y12 = atan_V9 , atan_V9 , f0 - nop.i 999 + ldfe atan2_P3 = [EXP_AD_P1],16 + fcmp.eq.s0 p14,p15=atan2_X,atan2_Y // Dummy for denorm and invalid + nop.i 999 } -{ .mfi -(p6) ldfe atan_P15 = [atan_GR_AD_P],16 -(p7) fma.s1 atan_Y8 = atan_V9 , atan_V9 , f0 - nop.i 999 ;; +{ .mmf + ldfe atan2_P13 = [EXP_AD_P2],16 + nop.m 999 +(p7) fma.s1 atan2_z = atan2_z1_X, atan2_B1X, f0 +;; } - { .mfi -(p7) ldfe atan_R6 = [atan_GR_AD_R],16 -(p6) fma.s1 atan_V8 = atan_Y, atan_P7, atan_P6 - nop.i 999 + ldfe atan2_P2 = [EXP_AD_P1],16 +(p6) fma.s1 atan2_w = atan2_wp_Y, atan2_B1sq_Y,f0 + nop.i 999 } -{ .mfi -(p6) ldfe atan_P0 = [atan_GR_AD_P],16 -(p7) fma.s1 atan_W11 = atan_Y, atan_Q20, atan_Q19 - nop.i 999 ;; +{ .mlx + ldfe atan2_P12 = [EXP_AD_P2],16 + movl rsig_near_one = 0x8000000000000001 // signif near 1.0 +;; } - { .mfi -(p7) ldfe atan_R7 = [atan_GR_AD_R],16 -(p7) fma.s1 atan_Z2 = atan_Z1 , atan_Z1, f0 - nop.i 999 + ldfe atan2_P1 = [EXP_AD_P1],16 + fclass.m p9,p0 = atan2_Y, 0x23 // test if y inf + nop.i 999 } { .mfi -(p6) ldfe atan_P1 = [atan_GR_AD_P],16 -(p6) fma.s1 atan_V10 = atan_V12, atan_V13, atan_V11 - nop.i 999 ;; + ldfe atan2_P22 = [EXP_AD_P2],16 +(p7) fma.s1 atan2_w = atan2_wp_X, atan2_B1sq_X,f0 + nop.i 999 +;; } { .mfi -(p7) ldfe atan_Q15 = [atan_GR_AD_Q],16 -(p6) fma.s1 atan_W7 = atan_Y, 
atan_P17, atan_P16 - nop.i 999 + ldfe atan2_P0 = [EXP_AD_P1],16 + frcpa.s1 atan2_F,p0 = f1, atan2_E + nop.i 999 } { .mfi -(p6) ldfe atan_P2 = [atan_GR_AD_P],16 -(p7) fma.s1 atan_V3 = atan_Y, atan_Q1 , atan_Q0 - nop.i 999 ;; + ldfe atan2_pi_by_2 = [EXP_AD_P2],16 +(p6) fnma.s1 atan2_gV = atan2_Y, atan2_z, atan2_X + nop.i 999 +;; } { .mfi -(p7) ldfe atan_Q16 = [atan_GR_AD_Q],16 -(p7) fma.s1 atan_G9 = atan_G12, atan_G12, f0 - nop.i 999 + setf.sig atan2_sig_near_one = rsig_near_one +(p7) fnma.s1 atan2_gV = atan2_X, atan2_z, atan2_Y + nop.i 999 } -{ .mfi -(p6) ldfe atan_P3 = [atan_GR_AD_P],16 -(p7) fma.s1 atan_V6 = atan_V12, atan_V8, atan_V7 - nop.i 999 ;; +{ .mfb + nop.m 999 +(p9) fma.d.s0 f8 = atan2_sgnY, atan2_pi_by_2, f0 // +-pi/2 if y inf +(p9) br.ret.spnt b0 // exit if y inf, result is +-pi/2 +;; } - { .mfi -(p7) ldfe atan_R1 = [atan_GR_AD_R],16 -(p6) fma.s1 atan_W8 = atan_Y, atan_P19, atan_P18 - nop.i 999 + nop.m 999 + fma.s1 atan2_V13 = atan2_w, atan2_P11, atan2_P10 + nop.i 999 } { .mfi -(p6) ldfe atan_P22 = [atan_GR_AD_P],16 -(p7) fma.s1 atan_V4 = atan_Y, atan_Q3 , atan_Q2 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_W11 = atan2_w, atan2_P21, atan2_P20 + nop.i 999 +;; } - { .mfi - getf.exp atan_GR_signexp_f8 = atan_NORM_f8 -(p7) fma.s1 atan_Y11 = atan_Y3 , atan_Y8 , f0 - nop.i 999 + nop.m 999 + fma.s1 atan2_V11 = atan2_w, atan2_P9, atan2_P8 + nop.i 999 } { .mfi -(p7) ldfe atan_Q17 = [atan_GR_AD_Q],16 -(p6) fma.s1 atan_V6 = atan_V12, atan_V8, atan_V7 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_V12 = atan2_w, atan2_w, f0 + nop.i 999 +;; } - { .mfi -(p7) ldfe atan_Q18 = [atan_GR_AD_Q],16 -(p6) fma.s1 atan_W3 = atan_Y, atan_P13, atan_P12 - nop.i 999 + nop.m 999 + fma.s1 atan2_V8 = atan2_w, atan2_P7 , atan2_P6 + nop.i 999 } { .mfi -(p7) ldfe atan_R10 = [atan_GR_AD_R],16 -(p7) fma.s1 atan_G11 = atan_B, atan_R9 , atan_R8 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_W8 = atan2_w, atan2_P19, atan2_P18 + nop.i 999 +;; } - { .mfi -(p7) ldfe atan_Q10 = [atan_GR_AD_Q],16 -(p7) 
fma.s1 atan_Z3 = atan_Z1 , atan_Z2 , f0 -(p0) and atan_GR_exp_f8 = atan_GR_signexp_f8,atan_GR_exp_mask + nop.m 999 + fnma.s1 atan2_alpha = atan2_E, atan2_F, f1 + nop.i 999 } { .mfi -(p7) ldfe atan_R2 = [atan_GR_AD_R],16 -(p7) fma.s1 atan_Z4 = atan_Z2 , atan_Z2 , f0 - nop.i 999 ;; + nop.m 999 + fnma.s1 atan2_alpha_1 = atan2_E, atan2_F, atan2_two + nop.i 999 +;; } { .mfi -(p7) ldfe atan_Q11 = [atan_GR_AD_Q],16 -(p6) fma.s1 atan_W4 = atan_Y, atan_P15, atan_P14 - nop.i 999 + nop.m 999 + fma.s1 atan2_V7 = atan2_w, atan2_P5 , atan2_P4 + nop.i 999 } { .mfi -(p7) ldfe atan_R3 = [atan_GR_AD_R],16 -(p7) fma.s1 atan_G7 = atan_B, atan_R5 , atan_R4 -(p0) cmp.le.unc p11,p0 = atan_GR_10172,atan_GR_exp_f8 -;; -} - - -{ .mmf -(p9) ldfe atan_Q12 = [atan_GR_AD_Q],16 -(p0) ldfe atan_S_PI = [atan_GR_AD_R],16 -(p8) fma.s1 atan_W6 = atan_V12, atan_W8, atan_W7 + nop.m 999 + fma.s1 atan2_W7 = atan2_w, atan2_P17, atan2_P16 + nop.i 999 ;; } - - { .mfi -(p9) ldfe atan_Q13 = [atan_GR_AD_Q],16 -(p8) fma.s1 atan_V3 = atan_Y, atan_P1 , atan_P0 -(p11) cmp.ne.and p6,p7 = r0,r0 + nop.m 999 + fma.s1 atan2_V4 = atan2_w, atan2_P3 , atan2_P2 + nop.i 999 } { .mfi - nop.m 999 -(p8) fma.s1 atan_V5 = atan_V9 , atan_V10, atan_V6 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_W4 = atan2_w, atan2_P15, atan2_P14 + nop.i 999 +;; } - -.pred.rel "mutex",p6,p7,p11 { .mfi -(p7) ldfe atan_Q14 = [atan_GR_AD_Q],16 -(p6) fma.s1 atan_Y12 = atan_V9 , atan_Y12, f0 - nop.i 999 + nop.m 999 + fma.s1 atan2_V3 = atan2_w, atan2_P1 , atan2_P0 + nop.i 999 } { .mfi - nop.m 999 -(p7) fma.s1 atan_G8 = atan_B, atan_R7 , atan_R6 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_W3 = atan2_w, atan2_P13, atan2_P12 + nop.i 999 +;; } - { .mfi - nop.m 999 -(p6) fma.s1 atan_V4 = atan_Y, atan_P3 , atan_P2 - nop.i 999 + nop.m 999 + fma.s1 atan2_V10 = atan2_V12, atan2_V13, atan2_V11 + nop.i 999 } { .mfi - nop.m 999 -(p7) fma.s1 atan_W7 = atan_Y, atan_Q16, atan_Q15 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_gVF = atan2_gV, atan2_F, f0 + nop.i 999 +;; } - { 
.mfi - nop.m 999 -(p6) fma.s1 atan_W10 = atan_V12, atan_P22, atan_W11 - nop.i 999 + nop.m 999 + fma.s1 atan2_alpha_sq = atan2_alpha, atan2_alpha, f0 + nop.i 999 } { .mfi - nop.m 999 -(p7) fma.s1 atan_G3 = atan_B, atan_R1 , f1 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_Cp = atan2_alpha, atan2_alpha_1, f1 + nop.i 999 +;; } - { .mfi - nop.m 999 -(p6) fma.s1 atan_W2 = atan_V12, atan_W4 , atan_W3 - nop.i 999 + nop.m 999 + fma.s1 atan2_V9 = atan2_V12, atan2_V12, f0 + nop.i 999 } { .mfi - nop.m 999 -(p7) fma.s1 atan_V2 = atan_V12, atan_V4 , atan_V3 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_W10 = atan2_V12, atan2_P22 , atan2_W11 + nop.i 999 +;; } { .mfi - nop.m 999 -(p7) fma.s1 atan_W8 = atan_Y, atan_Q18, atan_Q17 - nop.i 999 + nop.m 999 + fma.s1 atan2_V6 = atan2_V12, atan2_V8 , atan2_V7 + nop.i 999 } { .mfi - nop.m 999 -(p7) fma.s1 atan_G10 = atan_G12, atan_R10, atan_G11 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_W6 = atan2_V12, atan2_W8 , atan2_W7 + nop.i 999 +;; } { .mfi - nop.m 999 -(p7) fma.s1 atan_V10 = atan_V12, atan_Q10, atan_V11 - nop.i 999 + nop.m 999 + fma.s1 atan2_V2 = atan2_V12, atan2_V4 , atan2_V3 + nop.i 999 } { .mfi - nop.m 999 -(p7) fma.s1 atan_G6 = atan_G12, atan_G8 , atan_G7 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_W2 = atan2_V12, atan2_W4 , atan2_W3 + nop.i 999 +;; } - { .mfi - nop.m 999 -(p6) fma.s1 atan_V2 = atan_V12, atan_V4, atan_V3 - nop.i 999 + nop.m 999 + fma.s1 atan2_alpha_cub = atan2_alpha, atan2_alpha_sq, f0 + nop.i 999 } { .mfi - nop.m 999 -(p7) fma.s1 atan_G4 = atan_B , atan_R3 , atan_R2 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_C = atan2_gVF, atan2_Cp, f0 + nop.i 999 +;; } - { .mfi - nop.m 999 -(p6) fma.s1 atan_W5 = atan_V9 , atan_W10, atan_W6 - nop.i 999 -} -{ .mfi - nop.m 999 -(p7) fma.s1 atan_W3 = atan_Y , atan_Q12, atan_Q11 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_W12 = atan2_V9, atan2_V9, f0 + nop.i 999 +;; } - { .mfi - nop.m 999 -(p7) fma.s1 atan_Z5 = atan_Z3 , atan_Z4 , f0 - nop.i 999 + nop.m 999 + fma.s1 atan2_V5 = atan2_V9, 
atan2_V10, atan2_V6 + nop.i 999 } { .mfi - nop.m 999 -(p7) fma.s1 atan_W10 = atan_V12, atan_W13, atan_W11 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_W5 = atan2_V9, atan2_W10, atan2_W6 + nop.i 999 +;; } - { .mfi - nop.m 999 -(p7) fma.s1 atan_W4 = atan_Y , atan_Q14, atan_Q13 - nop.i 999 + nop.m 999 + fclass.m p8,p0 = atan2_Y, 0x07 // Test for y=0 + nop.i 999 } { .mfi - nop.m 999 -(p7) fma.s1 atan_W6 = atan_V12, atan_W8, atan_W7 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_d = atan2_alpha_cub, atan2_C, atan2_C + nop.i 999 } +;; { .mfi - nop.m 999 -(p7) fma.s1 atan_V5 = atan_V9 , atan_V10, atan_V6 - nop.i 999 + nop.m 999 + fma.s1 atan2_W12 = atan2_V9, atan2_W12, f0 + nop.i 999 } -{ .mfi - nop.m 999 -(p7) fma.s1 atan_G5 = atan_G9 , atan_G10, atan_G6 - nop.i 999 ;; -} - +;; { .mfi - nop.m 999 -(p6) fma.s1 atan_V1 = atan_V9 , atan_V5 , atan_V2 - nop.i 999 + nop.m 999 + fma.s1 atan2_V1 = atan2_V9, atan2_V5, atan2_V2 + nop.i 999 } { .mfi - nop.m 999 -(p7) fma.s1 atan_G2 = atan_G12, atan_G4 , atan_G3 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_W1 = atan2_V9, atan2_W5, atan2_W2 + nop.i 999 +;; } - -{ .mfi - nop.m 999 -(p6) fma.s1 atan_W1 = atan_V9 , atan_W5 , atan_W2 - nop.i 999 -} { .mfi - nop.m 999 -(p7) fma.s1 atan_Z6 = atan_Z4 , atan_C , f0 - nop.i 999 ;; + nop.m 999 +(p8) fmerge.s f8 = atan2_sgnY, f0 // +-0 if y=0 + nop.i 999 } - -{ .mfi - nop.m 999 -(p0) fmerge.s atan_S_PI = f8, atan_S_PI - nop.i 999 ;; +{ .mfb + nop.m 999 + fma.s1 atan2_zcub = atan2_z, atan2_w, f0 +(p8) br.ret.spnt b0 // Exit if y=0 +;; } - { .mfi - nop.m 999 -(p7) fma.s1 atan_W5 = atan_V9 , atan_W10, atan_W6 - nop.i 999 + nop.m 999 + fma.s1 atan2_pd = atan2_P0, atan2_d, f0 + nop.i 999 } { .mfi - nop.m 999 -(p7) fma.s1 atan_W2 = atan_V12, atan_W4 , atan_W3 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_dsq = atan2_d, atan2_d, f0 + nop.i 999 +;; } { .mfi - nop.m 999 -(p7) fma.s1 atan_G1 = atan_G9 , atan_G5 , atan_G2 - nop.i 999 + nop.m 999 + fmerge.se atan2_near_one = f1, atan2_sig_near_one // Const ~1.0 + nop.i 
999 } { .mfi - nop.m 999 -(p7) fma.s1 atan_V1 = atan_V9 , atan_V5 , atan_V2 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_Pp = atan2_W12, atan2_W1, atan2_V1 + nop.i 999 +;; } - { .mfi - nop.m 999 -(p6) fma.s1 atan_P = atan_Y12, atan_W1 , atan_V1 - nop.i 999 + nop.m 999 + fma.s1 atan2_sgn_pi_by_2 = atan2_pi_by_2, atan2_sgnY, f0 + nop.i 999 } { .mfi - nop.m 999 -(p7) fma.s1 atan_Z7 = atan_Z5 , atan_Z6 , f0 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_A_lo = atan2_pd, atan2_dsq, atan2_d + nop.i 999 +;; } { .mfi - nop.m 999 -(p7) fma.s1 atan_W1 = atan_V9 , atan_W5 , atan_W2 - nop.i 999 ;; + nop.m 999 + fma.s1 atan2_A_hi = atan2_zcub, atan2_Pp, atan2_z + nop.i 999 +;; } { .mfi - nop.m 999 -(p11) fma.d.s0 f8 = atan_S_PI,f1,f0 - nop.i 999 + nop.m 999 +(p6) fma.s1 atan2_A = atan2_A_hi, f1, atan2_A_lo + nop.i 999 } +// For |Y| <= |X| and X > 0, result is A_hi + A_lo { .mfi - nop.m 999 -(p7) fma.s1 atan_Z = atan_G1 , atan_Z7 , f0 - nop.i 999 ;; -} - - -{ .mfi - nop.m 999 -(p7) fma.s1 atan_Q = atan_Y11, atan_W1 , atan_V1 - nop.i 999 ;; + nop.m 999 +(p7) fma.d.s0 f8 = atan2_A_hi, f1, atan2_A_lo + nop.i 999 +;; } - -{ .mfi - nop.m 999 -(p6) fma.d.s0 f8 = atan_P , atan_Xcub , f8 - nop.i 999 -} +// For |Y| > |X|, result is +- pi/2 - (A_hi + A_lo) +// We perturb A by multiplying by 1.0+1ulp as we produce the result +// in order to get symmetrically rounded results in directed rounding modes. +// If we don't do this, there are a few cases where the trailing 11 bits of +// the significand of the result, before converting to double, are zero. These +// cases do not round symmetrically in round to +infinity or round to -infinity. 
{ .mfb - nop.m 999 -(p7) fnma.d.s0 f8 = atan_Z , atan_Q , atan_S_PI -(p0) br.ret.sptk b0 ;; + nop.m 999 +(p6) fnma.d.s0 f8 = atan2_A, atan2_near_one, atan2_sgn_pi_by_2 + br.ret.sptk b0 +;; } -.endp atan -ASM_SIZE_DIRECTIVE(atan) +GLOBAL_LIBM_END(atan) diff --git a/sysdeps/ia64/fpu/s_atanf.S b/sysdeps/ia64/fpu/s_atanf.S index b0a68737aa..fb7f4a307e 100644 --- a/sysdeps/ia64/fpu/s_atanf.S +++ b/sysdeps/ia64/fpu/s_atanf.S @@ -1,12 +1,10 @@ .file "atanf.s" -// THIS IS NOT OPTIMIZED AND NOT OFFICIAL -// Copyright (C) 2000, 2001, Intel Corporation +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -22,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -37,16 +35,18 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // History //============================================================== -// ?/??/00 Initial revision -// 8/17/00 Changed predicate register macro-usage to direct predicate +// 02/20/00 Initial version +// 08/17/00 Changed predicate register macro-usage to direct predicate // names due to an assembler bug. 
- -#include "libm_support.h" +// 02/06/02 Corrected .section statement +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/06/03 Reordered header: .section, .global, .proc, .align; +// added missing bundling // // Assembly macros @@ -140,16 +140,11 @@ atanf_answer = f8 //atanf_pred_GT1 = p7 -#ifdef _LIBC -.rodata -#else -.data -#endif +RODATA .align 16 -atanf_coeff_1_table: -ASM_TYPE_DIRECTIVE(atanf_coeff_1_table,@object) +LOCAL_OBJECT_START(atanf_coeff_1_table) data8 0x40c4c241be751ff2 // r4 data8 0x40e9f300c2f3070b // r5 data8 0x409babffef772075 // r3 @@ -164,12 +159,11 @@ data8 0xbfc2473c5145ee38 // p3 data8 0x3fbc4f512b1865f5 // p4 data8 0x3fc9997e7afbff4e // p2 data8 0x3ff921fb54442d18 // pi/2 -ASM_SIZE_DIRECTIVE(atanf_coeff_1_table) +LOCAL_OBJECT_END(atanf_coeff_1_table) -atanf_coeff_2_table: -ASM_TYPE_DIRECTIVE(atanf_coeff_2_table,@object) +LOCAL_OBJECT_START(atanf_coeff_2_table) data8 0x4035000000004284 // r1 data8 0x406cdffff336a59b // r2 data8 0x3fbc4f512b1865f5 // p4 = q6 @@ -182,18 +176,12 @@ data8 0xbfa6e10ba401393f // p7 data8 0x3f97105b4160f86b // p8 data8 0xbf7deaadaa336451 // p9 data8 0x3f522e5d33bc9baa // p10 -ASM_SIZE_DIRECTIVE(atanf_coeff_2_table) - - +LOCAL_OBJECT_END(atanf_coeff_2_table) -.global atanf -.text -.proc atanf - -.align 32 -atanf: +.section .text +GLOBAL_LIBM_ENTRY(atanf) { .mfi alloc r32 = ar.pfs,1,2,0,0 @@ -325,7 +313,7 @@ atanf: { .mfb nop.m 999 fma.s1 atanf_x5 = atanf_t,atanf_xcub,f0 -(p8) br.cond.spnt L(ATANF_X_INF_NAN_ZERO) +(p8) br.cond.spnt ATANF_X_INF_NAN_ZERO } ;; @@ -487,7 +475,7 @@ atanf: { .mfi nop.m 999 - fma atanf_sgnx_piby2 = atanf_sgn_x,atanf_piby2,f0 + fma.s0 atanf_sgnx_piby2 = atanf_sgn_x,atanf_piby2,f0 nop.i 999 } { .mfi @@ -530,27 +518,38 @@ atanf: { .mfi nop.m 999 //(atanf_pred_GT1) fnma.s atanf_answer = atanf_poly_q,atanf_z21_poly_r,atanf_sgnx_piby2 -(p7) fnma.s atanf_answer = atanf_poly_q,atanf_z21_poly_r,atanf_sgnx_piby2 +(p7) fnma.s.s0 atanf_answer = atanf_poly_q,atanf_z21_poly_r,atanf_sgnx_piby2 nop.i 
999;; } { .mfb nop.m 999 //(atanf_pred_LE1) fma.s atanf_answer = atanf_x11,atanf_poly_p1,atanf_poly_p4 -(p6) fma.s atanf_answer = atanf_x11,atanf_poly_p1,atanf_poly_p4 +(p6) fma.s.s0 atanf_answer = atanf_x11,atanf_poly_p1,atanf_poly_p4 br.ret.sptk b0 } -L(ATANF_X_INF_NAN_ZERO): +ATANF_X_INF_NAN_ZERO: - fclass.m p8,p9 = f8,0x23 // @inf +{ .mfi + nop.m 0 + fclass.m p8,p9 = f8,0x23 // @inf + nop.i 0 +} ;; +{ .mfi + nop.m 0 (p8) fmerge.s f8 = f8, atanf_piby2 + nop.i 0 +} ;; - fnorm.s f8 = f8 +{ .mfb + nop.m 0 + fnorm.s.s0 f8 = f8 br.ret.sptk b0 +} +;; -.endp atanf -ASM_SIZE_DIRECTIVE(atanf) +GLOBAL_LIBM_END(atanf) diff --git a/sysdeps/ia64/fpu/s_atanl.S b/sysdeps/ia64/fpu/s_atanl.S index 28d44c1850..bfd9f458f4 100644 --- a/sysdeps/ia64/fpu/s_atanl.S +++ b/sysdeps/ia64/fpu/s_atanl.S @@ -1,10 +1,10 @@ .file "atanl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,41 +35,52 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. 
+// http://www.intel.com/software/products/opensource/libraries/num.htm. // // -// ********************************************************************* +//********************************************************************* // // History -// 2/02/00 (hand-optimized) -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 (hand-optimized) +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. +// 03/13/01 Fixed flags when denormal raised on intermediate result +// 01/08/02 Improved speed. +// 02/06/02 Corrected .section statement +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align; +// used data8 for long double table values // -// ********************************************************************* +//********************************************************************* // // Function: atanl(x) = inverse tangent(x), for double extended x values -// Function: atan2l(y,x) = atan(y/x), for double extended x values +// Function: atan2l(y,x) = atan(y/x), for double extended y, x values +// +// API +// +// long double atanl (long double x) +// long double atan2l (long double y, long double x) // -// ********************************************************************* +//********************************************************************* // // Resources Used: // // Floating-Point Registers: f8 (Input and Return Value) -// f9-f15 -// f32-f79 +// f9 (Input for atan2l) +// f10-f15, f32-f83 // // General Purpose Registers: -// r32-r48 -// r49,r50,r51,r52 (Arguments to error support for 0,0 case) +// r32-r51 +// r49-r52 (Arguments to error support for 0,0 case) // // Predicate Registers: p6-p15 // -// ********************************************************************* +//********************************************************************* 
// // IEEE Special Conditions: // -// Denormal fault raised on denormal inputs +// Denormal fault raised on denormal inputs // Underflow exceptions may occur // Special error handling for the y=0 and x=0 case // Inexact raised when appropriate by algorithm @@ -92,7 +103,7 @@ // atan2l(+/-Inf, Inf) = +/-pi/4 // atan2l(+/-Inf, -Inf) = +/-3pi/4 // -// ********************************************************************* +//********************************************************************* // // Mathematical Description // --------------------------- @@ -108,16 +119,16 @@ // // // (Arg_X, Arg_Y) x -// \ -// \ -// \ -// \ +// \ +// \ +// \ +// \ // \ angle between is ATANL(Arg_Y,Arg_X) -// \ +// \ // ------------------> X-axis // Origin @@ -232,14 +243,14 @@ // z_hi = 2^k * 1.b_1 b_2 b_3 b_4 1 // // then -// / \ +// / \ // | (V/U) - z_hi | // arctan(V/U) = arctan(z_hi) + acrtan| -------------- | // | 1 + (V/U)*z_hi | // \ / // -// / \ +// / \ // | V - z_hi*U | // = arctan(z_hi) + acrtan| -------------- | @@ -295,7 +306,7 @@ // U := max( |Arg_X|, |Arg_Y| ) // V := min( |Arg_X|, |Arg_Y| ) // -// execute: frcap E, pred, V, U +// execute: frcpa E, pred, V, U // If pred is 0, go to Step 5 for special cases handling. // // Step 2. Decide on branch. @@ -399,7 +410,7 @@ // // z := V * E ...z approximates V/U to roughly working precision // zsq := z * z -// z8 := zsq * zsq; z8 := z8 * z8 +// z4 := zsq * zsq; z8 := z4 * z4 // // poly1 := P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8))) // poly2 := zsq*(P_1 + zsq*(P_2 + zsq*P_3)) @@ -438,12 +449,11 @@ // // Step 5. Special Cases // -// If pred is 0 where pred is obtained in -// frcap E, pred, V, U +// These are detected early in the function by fclass instructions. 
// -// we are in one of those special cases of 0,+-inf or NaN +// We are in one of those special cases when X or Y is 0,+-inf or NaN // -// If one of U and V is NaN, return U+V (which will generate +// If one of X and Y is NaN, return X+Y (which will generate // invalid in case one is a signaling NaN). Otherwise, // return the Result as described in the table // @@ -469,8 +479,6 @@ // // -#include "libm_support.h" - ArgY_orig = f8 Result = f8 FR_RESULT = f8 @@ -504,6 +512,7 @@ Res_hi = f49 Res_lo = f50 Z = f52 zsq = f53 +z4 = f54 z8 = f54 poly1 = f55 poly2 = f56 @@ -521,8 +530,8 @@ P_5 = f67 P_6 = f68 P_7 = f69 P_8 = f70 -TWO_TO_NEG3 = f71 -U_hold = f72 +U_hold = f71 +TWO_TO_NEG3 = f72 C_hi_hold = f73 E_hold = f74 M = f75 @@ -530,6 +539,11 @@ ArgX_abs = f76 ArgY_abs = f77 Result_lo = f78 A_temp = f79 +FR_temp = f80 +Xsq = f81 +Ysq = f82 +tmp_small = f83 + GR_SAVE_PFS = r33 GR_SAVE_B0 = r34 GR_SAVE_GP = r35 @@ -545,1415 +559,1399 @@ exp_ArgY = r44 exponent_Q = r45 significand_Q = r46 special = r47 -special1 = r48 +sp_exp_Q = r48 +sp_exp_4sig_Q = r49 +table_base = r50 +int_temp = r51 + GR_Parameter_X = r49 GR_Parameter_Y = r50 GR_Parameter_RESULT = r51 GR_Parameter_TAG = r52 -int_temp = r52 - -#ifdef _LIBC -.rodata -#else -.data -#endif -.align 64 - -Constants_atan: -ASM_TYPE_DIRECTIVE(Constants_atan,@object) -data4 0x54442D18, 0x3FF921FB, 0x248D3132, 0x3E000000 -// double pi/2, single lo_pi/2, two**(-3) -data4 0xAAAAAAA3, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // P_1 -data4 0xCCCC54B2, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // P_2 -data4 0x47E4D0C2, 0x92492492, 0x0000BFFC, 0x00000000 // P_3 -data4 0x58870889, 0xE38E38E0, 0x00003FFB, 0x00000000 // P_4 -data4 0x290149F8, 0xBA2E895B, 0x0000BFFB, 0x00000000 // P_5 -data4 0x250F733D, 0x9D88E6D4, 0x00003FFB, 0x00000000 // P_6 -data4 0xFB8745A0, 0x884E51FF, 0x0000BFFB, 0x00000000 // P_7 -data4 0x394396BD, 0xE1C7412B, 0x00003FFA, 0x00000000 // P_8 -data4 0xAAAAA52F, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // Q_1 -data4 0xC75B60D3, 
0xCCCCCCCC, 0x00003FFC, 0x00000000 // Q_2 -data4 0x011F1940, 0x924923AD, 0x0000BFFC, 0x00000000 // Q_3 -data4 0x2A5F89BD, 0xE36F716D, 0x00003FFB, 0x00000000 // Q_4 +GR_temp = r52 + +RODATA +.align 16 + +LOCAL_OBJECT_START(Constants_atan) +// double pi/2 +data8 0x3FF921FB54442D18 +// single lo_pi/2, two**(-3) +data4 0x248D3132, 0x3E000000 +data8 0xAAAAAAAAAAAAAAA3, 0xBFFD // P_1 +data8 0xCCCCCCCCCCCC54B2, 0x3FFC // P_2 +data8 0x9249249247E4D0C2, 0xBFFC // P_3 +data8 0xE38E38E058870889, 0x3FFB // P_4 +data8 0xBA2E895B290149F8, 0xBFFB // P_5 +data8 0x9D88E6D4250F733D, 0x3FFB // P_6 +data8 0x884E51FFFB8745A0, 0xBFFB // P_7 +data8 0xE1C7412B394396BD, 0x3FFA // P_8 +data8 0xAAAAAAAAAAAAA52F, 0xBFFD // Q_1 +data8 0xCCCCCCCCC75B60D3, 0x3FFC // Q_2 +data8 0x924923AD011F1940, 0xBFFC // Q_3 +data8 0xE36F716D2A5F89BD, 0x3FFB // Q_4 // // Entries Tbl_hi (double precision) // B = 1+Index/16+1/32 Index = 0 // Entries Tbl_lo (single precision) // B = 1+Index/16+1/32 Index = 0 // -data4 0xA935BD8E, 0x3FE9A000, 0x23ACA08F, 0x00000000 +data8 0x3FE9A000A935BD8E +data4 0x23ACA08F, 0x00000000 // // Entries Tbl_hi (double precision) Index = 0,1,...,15 // B = 2^(-1)*(1+Index/16+1/32) // Entries Tbl_lo (single precision) // Index = 0,1,...,15 B = 2^(-1)*(1+Index/16+1/32) // -data4 0x7F175A34, 0x3FDE77EB, 0x238729EE, 0x00000000 -data4 0x73C1A40B, 0x3FE0039C, 0x249334DB, 0x00000000 -data4 0x5B5B43DA, 0x3FE0C614, 0x22CBA7D1, 0x00000000 -data4 0x88BE7C13, 0x3FE1835A, 0x246310E7, 0x00000000 -data4 0xE2CC9E6A, 0x3FE23B71, 0x236210E5, 0x00000000 -data4 0x8406CBCA, 0x3FE2EE62, 0x2462EAF5, 0x00000000 -data4 0x1CD41719, 0x3FE39C39, 0x24B73EF3, 0x00000000 -data4 0x5B795B55, 0x3FE44506, 0x24C11260, 0x00000000 -data4 0x5BB6EC04, 0x3FE4E8DE, 0x242519EE, 0x00000000 -data4 0x1F732FBA, 0x3FE587D8, 0x24D4346C, 0x00000000 -data4 0x115D7B8D, 0x3FE6220D, 0x24ED487B, 0x00000000 -data4 0x920B3D98, 0x3FE6B798, 0x2495FF1E, 0x00000000 -data4 0x8FBA8E0F, 0x3FE74897, 0x223D9531, 0x00000000 -data4 0x289FA093, 
0x3FE7D528, 0x242B0411, 0x00000000 -data4 0x576CC2C5, 0x3FE85D69, 0x2335B374, 0x00000000 -data4 0xA99CC05D, 0x3FE8E17A, 0x24C27CFB, 0x00000000 +data8 0x3FDE77EB7F175A34 +data4 0x238729EE, 0x00000000 +data8 0x3FE0039C73C1A40B +data4 0x249334DB, 0x00000000 +data8 0x3FE0C6145B5B43DA +data4 0x22CBA7D1, 0x00000000 +data8 0x3FE1835A88BE7C13 +data4 0x246310E7, 0x00000000 +data8 0x3FE23B71E2CC9E6A +data4 0x236210E5, 0x00000000 +data8 0x3FE2EE628406CBCA +data4 0x2462EAF5, 0x00000000 +data8 0x3FE39C391CD41719 +data4 0x24B73EF3, 0x00000000 +data8 0x3FE445065B795B55 +data4 0x24C11260, 0x00000000 +data8 0x3FE4E8DE5BB6EC04 +data4 0x242519EE, 0x00000000 +data8 0x3FE587D81F732FBA +data4 0x24D4346C, 0x00000000 +data8 0x3FE6220D115D7B8D +data4 0x24ED487B, 0x00000000 +data8 0x3FE6B798920B3D98 +data4 0x2495FF1E, 0x00000000 +data8 0x3FE748978FBA8E0F +data4 0x223D9531, 0x00000000 +data8 0x3FE7D528289FA093 +data4 0x242B0411, 0x00000000 +data8 0x3FE85D69576CC2C5 +data4 0x2335B374, 0x00000000 +data8 0x3FE8E17AA99CC05D +data4 0x24C27CFB, 0x00000000 // // Entries Tbl_hi (double precision) Index = 0,1,...,15 // B = 2^(-2)*(1+Index/16+1/32) // Entries Tbl_lo (single precision) // Index = 0,1,...,15 B = 2^(-2)*(1+Index/16+1/32) // -data4 0x510665B5, 0x3FD025FA, 0x24263482, 0x00000000 -data4 0x362431C9, 0x3FD1151A, 0x242C8DC9, 0x00000000 -data4 0x67E47C95, 0x3FD20255, 0x245CF9BA, 0x00000000 -data4 0x7A823CFE, 0x3FD2ED98, 0x235C892C, 0x00000000 -data4 0x29271134, 0x3FD3D6D1, 0x2389BE52, 0x00000000 -data4 0x586890E6, 0x3FD4BDEE, 0x24436471, 0x00000000 -data4 0x175E0F4E, 0x3FD5A2E0, 0x2389DBD4, 0x00000000 -data4 0x9F5FA6FD, 0x3FD68597, 0x2476D43F, 0x00000000 -data4 0x52817501, 0x3FD76607, 0x24711774, 0x00000000 -data4 0xB8DF95D7, 0x3FD84422, 0x23EBB501, 0x00000000 -data4 0x7CD0C662, 0x3FD91FDE, 0x23883A0C, 0x00000000 -data4 0x66168001, 0x3FD9F930, 0x240DF63F, 0x00000000 -data4 0x5422058B, 0x3FDAD00F, 0x23FE261A, 0x00000000 -data4 0x378624A5, 0x3FDBA473, 0x23A8CD0E, 0x00000000 -data4 0x0AAD71F8, 
0x3FDC7655, 0x2422D1D0, 0x00000000 -data4 0xC9EC862B, 0x3FDD45AE, 0x2344A109, 0x00000000 +data8 0x3FD025FA510665B5 +data4 0x24263482, 0x00000000 +data8 0x3FD1151A362431C9 +data4 0x242C8DC9, 0x00000000 +data8 0x3FD2025567E47C95 +data4 0x245CF9BA, 0x00000000 +data8 0x3FD2ED987A823CFE +data4 0x235C892C, 0x00000000 +data8 0x3FD3D6D129271134 +data4 0x2389BE52, 0x00000000 +data8 0x3FD4BDEE586890E6 +data4 0x24436471, 0x00000000 +data8 0x3FD5A2E0175E0F4E +data4 0x2389DBD4, 0x00000000 +data8 0x3FD685979F5FA6FD +data4 0x2476D43F, 0x00000000 +data8 0x3FD7660752817501 +data4 0x24711774, 0x00000000 +data8 0x3FD84422B8DF95D7 +data4 0x23EBB501, 0x00000000 +data8 0x3FD91FDE7CD0C662 +data4 0x23883A0C, 0x00000000 +data8 0x3FD9F93066168001 +data4 0x240DF63F, 0x00000000 +data8 0x3FDAD00F5422058B +data4 0x23FE261A, 0x00000000 +data8 0x3FDBA473378624A5 +data4 0x23A8CD0E, 0x00000000 +data8 0x3FDC76550AAD71F8 +data4 0x2422D1D0, 0x00000000 +data8 0x3FDD45AEC9EC862B +data4 0x2344A109, 0x00000000 // // Entries Tbl_hi (double precision) Index = 0,1,...,15 // B = 2^(-3)*(1+Index/16+1/32) // Entries Tbl_lo (single precision) // Index = 0,1,...,15 B = 2^(-3)*(1+Index/16+1/32) // -data4 0x84212B3D, 0x3FC068D5, 0x239874B6, 0x00000000 -data4 0x41060850, 0x3FC16465, 0x2335E774, 0x00000000 -data4 0x171A535C, 0x3FC25F6E, 0x233E36BE, 0x00000000 -data4 0xEDEB99A3, 0x3FC359E8, 0x239680A3, 0x00000000 -data4 0xC6092A9E, 0x3FC453CE, 0x230FB29E, 0x00000000 -data4 0xBA11570A, 0x3FC54D18, 0x230C1418, 0x00000000 -data4 0xFFB3AA73, 0x3FC645BF, 0x23F0564A, 0x00000000 -data4 0xE8A7D201, 0x3FC73DBD, 0x23D4A5E1, 0x00000000 -data4 0xE398EBC7, 0x3FC8350B, 0x23D4ADDA, 0x00000000 -data4 0x7D050271, 0x3FC92BA3, 0x23BCB085, 0x00000000 -data4 0x601081A5, 0x3FCA217E, 0x23BC841D, 0x00000000 -data4 0x574D780B, 0x3FCB1696, 0x23CF4A8E, 0x00000000 -data4 0x4D768466, 0x3FCC0AE5, 0x23BECC90, 0x00000000 -data4 0x4E1D5395, 0x3FCCFE65, 0x2323DCD2, 0x00000000 -data4 0x864C9D9D, 0x3FCDF110, 0x23F53F3A, 0x00000000 -data4 0x451D980C, 
0x3FCEE2E1, 0x23CCB11F, 0x00000000 - -data4 0x54442D18, 0x400921FB, 0x33145C07, 0x3CA1A626 // PI two doubles -data4 0x54442D18, 0x3FF921FB, 0x33145C07, 0x3C91A626 // PI_by_2 two dbles -data4 0x54442D18, 0x3FE921FB, 0x33145C07, 0x3C81A626 // PI_by_4 two dbles -data4 0x7F3321D2, 0x4002D97C, 0x4C9E8A0A, 0x3C9A7939 // 3PI_by_4 two dbles -ASM_SIZE_DIRECTIVE(Constants_atan) - - -.text -.proc atanl# -.global atanl# -.align 64 - -atanl: -{ .mfb - nop.m 999 -(p0) mov ArgX_orig = f1 -(p0) br.cond.sptk atan2l ;; -} -.endp atanl -ASM_SIZE_DIRECTIVE(atanl) - -.text -.proc atan2l# -.global atan2l# -#ifdef _LIBC -.proc __atan2l# -.global __atan2l# -.proc __ieee754_atan2l# -.global __ieee754_atan2l# -#endif -.align 64 - - -atan2l: -#ifdef _LIBC -__atan2l: -__ieee754_atan2l: -#endif -{ .mfi -alloc r32 = ar.pfs, 0, 17 , 4, 0 -(p0) mov ArgY = ArgY_orig -} -{ .mfi - nop.m 999 -(p0) mov ArgX = ArgX_orig - nop.i 999 -};; +data8 0x3FC068D584212B3D +data4 0x239874B6, 0x00000000 +data8 0x3FC1646541060850 +data4 0x2335E774, 0x00000000 +data8 0x3FC25F6E171A535C +data4 0x233E36BE, 0x00000000 +data8 0x3FC359E8EDEB99A3 +data4 0x239680A3, 0x00000000 +data8 0x3FC453CEC6092A9E +data4 0x230FB29E, 0x00000000 +data8 0x3FC54D18BA11570A +data4 0x230C1418, 0x00000000 +data8 0x3FC645BFFFB3AA73 +data4 0x23F0564A, 0x00000000 +data8 0x3FC73DBDE8A7D201 +data4 0x23D4A5E1, 0x00000000 +data8 0x3FC8350BE398EBC7 +data4 0x23D4ADDA, 0x00000000 +data8 0x3FC92BA37D050271 +data4 0x23BCB085, 0x00000000 +data8 0x3FCA217E601081A5 +data4 0x23BC841D, 0x00000000 +data8 0x3FCB1696574D780B +data4 0x23CF4A8E, 0x00000000 +data8 0x3FCC0AE54D768466 +data4 0x23BECC90, 0x00000000 +data8 0x3FCCFE654E1D5395 +data4 0x2323DCD2, 0x00000000 +data8 0x3FCDF110864C9D9D +data4 0x23F53F3A, 0x00000000 +data8 0x3FCEE2E1451D980C +data4 0x23CCB11F, 0x00000000 +// +data8 0x400921FB54442D18, 0x3CA1A62633145C07 // PI two doubles +data8 0x3FF921FB54442D18, 0x3C91A62633145C07 // PI_by_2 two dbles +data8 0x3FE921FB54442D18, 0x3C81A62633145C07 // 
PI_by_4 two dbles +data8 0x4002D97C7F3321D2, 0x3C9A79394C9E8A0A // 3PI_by_4 two dbles +LOCAL_OBJECT_END(Constants_atan) + + +.section .text +GLOBAL_IEEE754_ENTRY(atanl) + +// Use common code with atan2l after setting x=1.0 { .mfi - nop.m 999 -(p0) fclass.m.unc p7,p0 = ArgY_orig, 0x103 - nop.i 999 + alloc r32 = ar.pfs, 0, 17, 4, 0 + fma.s1 Ysq = ArgY_orig, ArgY_orig, f0 // Form y*y + nop.i 999 } { .mfi - nop.m 999 -// -// -// Save original input args and load table ptr. -// -(p0) fclass.m.unc p6,p0 = ArgX_orig, 0x103 - nop.i 999 -};; + addl table_ptr1 = @ltoff(Constants_atan#), gp // Address of table pointer + fma.s1 Xsq = f1, f1, f0 // Form x*x + nop.i 999 +} +;; + { .mfi -(p0) addl table_ptr1 = @ltoff(Constants_atan#), gp -(p0) fclass.m.unc p0,p9 = ArgY_orig, 0x1FF - nop.i 999 ;; + ld8 table_ptr1 = [table_ptr1] // Get table pointer + fnorm.s1 ArgY = ArgY_orig + nop.i 999 } { .mfi - ld8 table_ptr1 = [table_ptr1] -(p0) fclass.m.unc p0,p8 = ArgX_orig, 0x1FF - nop.i 999 + nop.m 999 + fnorm.s1 ArgX = f1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fclass.m.unc p13,p0 = ArgY_orig, 0x0C3 - nop.i 999 ;; + getf.exp sign_X = f1 // Get signexp of x + fmerge.s ArgX_abs = f0, f1 // Form |x| + nop.i 999 } { .mfi -(p0) fclass.m.unc p12,p0 = ArgX_orig, 0x0C3 - nop.i 999 + nop.m 999 + fnorm.s1 ArgX_orig = f1 + nop.i 999 } +;; +{ .mfi + getf.exp sign_Y = ArgY_orig // Get signexp of y + fmerge.s ArgY_abs = f0, ArgY_orig // Form |y| + mov table_base = table_ptr1 // Save base pointer to tables +} +;; -// -// Check for NatVals. -// Check for everything - if false, then must be pseudo-zero -// or pseudo-nan (IA unsupporteds). 
-// -{ .mib - nop.m 999 - nop.i 999 -(p6) br.cond.spnt L(ATANL_NATVAL) ;; +{ .mfi + ldfd P_hi = [table_ptr1],8 // Load double precision hi part of pi + fclass.m p8,p0 = ArgY_orig, 0x1e7 // Test y natval, nan, inf, zero + nop.i 999 } +;; -{ .mib - nop.m 999 - nop.i 999 -(p7) br.cond.spnt L(ATANL_NATVAL) ;; +{ .mfi + ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3 + nop.f 999 + nop.i 999 } -{ .mib -(p0) ldfd P_hi = [table_ptr1],8 - nop.i 999 -(p8) br.cond.spnt L(ATANL_UNSUPPORTED) ;; +{ .mfi + nop.m 999 + fma.s1 M = f1, f1, f0 // Set M = 1.0 + nop.i 999 } -{ .mbb -(p0) add table_ptr2 = 96, table_ptr1 -(p9) br.cond.spnt L(ATANL_UNSUPPORTED) +;; + // -// Load double precision high-order part of pi +// Check for everything - if false, then must be pseudo-zero +// or pseudo-nan (IA unsupporteds). // -(p12) br.cond.spnt L(ATANL_NAN) ;; -} { .mfb - nop.m 999 -(p0) fnorm.s1 ArgX = ArgX -(p13) br.cond.spnt L(ATANL_NAN) ;; -} -// -// Normalize the input argument. -// Branch out if NaN inputs -// -{ .mmf -(p0) ldfs P_lo = [table_ptr1], 4 - nop.m 999 -(p0) fnorm.s1 ArgY = ArgY ;; + nop.m 999 + fclass.m p0,p12 = f1, 0x1FF // Test x unsupported +(p8) br.cond.spnt ATANL_Y_SPECIAL // Branch if y natval, nan, inf, zero } -{ .mmf - nop.m 999 -(p0) ldfs TWO_TO_NEG3 = [table_ptr1], 180 -// +;; + // U = max(ArgX_abs,ArgY_abs) // V = min(ArgX_abs,ArgY_abs) -// if PR1, swap = 0 -// if PR2, swap = 1 -// -(p0) mov M = f1 ;; -} { .mfi - nop.m 999 -// -// Get exp and sign of ArgX -// Get exp and sign of ArgY -// Load 2**(-3) and increment ptr to Q_4. 
-// -(p0) fmerge.s ArgX_abs = f1, ArgX - nop.i 999 ;; + nop.m 999 + fcmp.ge.s1 p6,p7 = Xsq, Ysq // Test for |x| >= |y| using squares + nop.i 999 } -// -// load single precision low-order part of pi = P_lo -// +{ .mfb + nop.m 999 + fma.s1 V = ArgX_abs, f1, f0 // Set V assuming |x| < |y| + br.cond.sptk ATANL_COMMON // Branch to common code +} +;; + +GLOBAL_IEEE754_END(atanl) +GLOBAL_IEEE754_ENTRY(atan2l) + { .mfi -(p0) getf.exp sign_X = ArgX -(p0) fmerge.s ArgY_abs = f1, ArgY - nop.i 999 ;; + alloc r32 = ar.pfs, 0, 17, 4, 0 + fma.s1 Ysq = ArgY_orig, ArgY_orig, f0 // Form y*y + nop.i 999 } -{ .mii -(p0) getf.exp sign_Y = ArgY - nop.i 999 ;; -(p0) shr sign_X = sign_X, 17 ;; +{ .mfi + addl table_ptr1 = @ltoff(Constants_atan#), gp // Address of table pointer + fma.s1 Xsq = ArgX_orig, ArgX_orig, f0 // Form x*x + nop.i 999 } -{ .mii - nop.m 999 -(p0) shr sign_Y = sign_Y, 17 ;; -(p0) cmp.eq.unc p8, p9 = 0x00000, sign_Y ;; +;; + +{ .mfi + ld8 table_ptr1 = [table_ptr1] // Get table pointer + fnorm.s1 ArgY = ArgY_orig + nop.i 999 } { .mfi - nop.m 999 -// -// Is ArgX_abs >= ArgY_abs -// Is sign_Y == 0? 
-// -(p0) fmax.s1 U = ArgX_abs, ArgY_abs - nop.i 999 + nop.m 999 + fnorm.s1 ArgX = ArgX_orig + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// ArgX_abs = |ArgX| -// ArgY_abs = |ArgY| -// sign_X is sign bit of ArgX -// sign_Y is sign bit of ArgY -// -(p0) fcmp.ge.s1 p6, p7 = ArgX_abs, ArgY_abs - nop.i 999 ;; + getf.exp sign_X = ArgX_orig // Get signexp of x + fmerge.s ArgX_abs = f0, ArgX_orig // Form |x| + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fmin.s1 V = ArgX_abs, ArgY_abs - nop.i 999 ;; + getf.exp sign_Y = ArgY_orig // Get signexp of y + fmerge.s ArgY_abs = f0, ArgY_orig // Form |y| + mov table_base = table_ptr1 // Save base pointer to tables } +;; + { .mfi - nop.m 999 -(p8) fadd.s1 s_Y = f0, f1 -(p6) cmp.eq.unc p10, p11 = 0x00000, sign_X + ldfd P_hi = [table_ptr1],8 // Load double precision hi part of pi + fclass.m p8,p0 = ArgY_orig, 0x1e7 // Test y natval, nan, inf, zero + nop.i 999 } -{ .mii -(p6) add swap = r0, r0 - nop.i 999 ;; -(p7) add swap = 1, r0 +;; + +{ .mfi + ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3 + fclass.m p9,p0 = ArgX_orig, 0x1e7 // Test x natval, nan, inf, zero + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 + fma.s1 M = f1, f1, f0 // Set M = 1.0 + nop.i 999 +} +;; + // -// Let M = 1.0 -// if p8, s_Y = 1.0 -// if p9, s_Y = -1.0 +// Check for everything - if false, then must be pseudo-zero +// or pseudo-nan (IA unsupporteds). 
// -(p10) fsub.s1 M = M, f1 - nop.i 999 ;; +{ .mfb + nop.m 999 + fclass.m p0,p12 = ArgX_orig, 0x1FF // Test x unsupported +(p8) br.cond.spnt ATANL_Y_SPECIAL // Branch if y natval, nan, inf, zero } +;; + +// U = max(ArgX_abs,ArgY_abs) +// V = min(ArgX_abs,ArgY_abs) { .mfi - nop.m 999 -(p9) fsub.s1 s_Y = f0, f1 - nop.i 999 ;; + nop.m 999 + fcmp.ge.s1 p6,p7 = Xsq, Ysq // Test for |x| >= |y| using squares + nop.i 999 } +{ .mfb + nop.m 999 + fma.s1 V = ArgX_abs, f1, f0 // Set V assuming |x| < |y| +(p9) br.cond.spnt ATANL_X_SPECIAL // Branch if x natval, nan, inf, zero +} +;; + +// Now common code for atanl and atan2l +ATANL_COMMON: { .mfi - nop.m 999 -(p0) frcpa.s1 E, p6 = V, U - nop.i 999 ;; + nop.m 999 + fclass.m p0,p13 = ArgY_orig, 0x1FF // Test y unsupported + shr sign_X = sign_X, 17 // Get sign bit of x +} +{ .mfi + nop.m 999 + fma.s1 U = ArgY_abs, f1, f0 // Set U assuming |x| < |y| + adds table_ptr1 = 176, table_ptr1 // Point to Q4 } -{ .mbb - nop.m 999 +;; + +{ .mfi +(p6) add swap = r0, r0 // Set swap=0 if |x| >= |y| +(p6) frcpa.s1 E, p0 = ArgY_abs, ArgX_abs // Compute E if |x| >= |y| + shr sign_Y = sign_Y, 17 // Get sign bit of y +} +{ .mfb + nop.m 999 +(p6) fma.s1 V = ArgY_abs, f1, f0 // Set V if |x| >= |y| +(p12) br.cond.spnt ATANL_UNSUPPORTED // Branch if x unsupported +} +;; + +// Set p8 if y >=0 +// Set p9 if y < 0 +// Set p10 if |x| >= |y| and x >=0 +// Set p11 if |x| >= |y| and x < 0 +{ .mfi + cmp.eq p8, p9 = 0, sign_Y // Test for y >= 0 +(p7) frcpa.s1 E, p0 = ArgX_abs, ArgY_abs // Compute E if |x| < |y| +(p7) add swap = 1, r0 // Set swap=1 if |x| < |y| +} +{ .mfb +(p6) cmp.eq.unc p10, p11 = 0, sign_X // If |x| >= |y|, test for x >= 0 +(p6) fma.s1 U = ArgX_abs, f1, f0 // Set U if |x| >= |y| +(p13) br.cond.spnt ATANL_UNSUPPORTED // Branch if y unsupported +} +;; + // -// E = frcpa(V,U) +// if p8, s_Y = 1.0 +// if p9, s_Y = -1.0 // -(p6) br.cond.sptk L(ATANL_STEP2) -(p0) br.cond.spnt L(ATANL_SPECIAL_HANDLING) ;; +.pred.rel "mutex",p8,p9 +{ .mfi + nop.m 999 
+(p8) fadd.s1 s_Y = f0, f1 // If y >= 0 set s_Y = 1.0 + nop.i 999 } -L(ATANL_STEP2): { .mfi - nop.m 999 -(p0) fmpy.s1 Q = E, V - nop.i 999 + nop.m 999 +(p9) fsub.s1 s_Y = f0, f1 // If y < 0 set s_Y = -1.0 + nop.i 999 } +;; + +.pred.rel "mutex",p10,p11 { .mfi - nop.m 999 -(p0) fcmp.eq.s0 p0, p9 = f1, ArgY_orig - nop.i 999 ;; + nop.m 999 +(p10) fsub.s1 M = M, f1 // If |x| >= |y| and x >=0, set M=0 + nop.i 999 } { .mfi - nop.m 999 -// -// Is Q < 2**(-3)? -// -(p0) fcmp.eq.s0 p0, p8 = f1, ArgX_orig - nop.i 999 + nop.m 999 +(p11) fadd.s1 M = M, f1 // If |x| >= |y| and x < 0, set M=2.0 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p11) fadd.s1 M = M, f1 - nop.i 999 ;; + nop.m 999 + fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig // Dummy to set denormal flag + nop.i 999 } -{ .mlx - nop.m 999 // ************************************************* // ********************* STEP2 ********************* // ************************************************* -(p0) movl special = 0x8400000000000000 -} -{ .mlx - nop.m 999 // -// lookup = b_1 b_2 b_3 B_4 +// Q = E * V // -(p0) movl special1 = 0x0000000000000100 ;; +{ .mfi + nop.m 999 + fmpy.s1 Q = E, V + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Do fnorms to raise any denormal operand -// exceptions. 
-// -(p0) fmpy.s1 P_hi = M, P_hi - nop.i 999 + nop.m 999 + fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (1) if POLY path + nop.i 999 } +;; + +// Create a single precision representation of the signexp of Q with the +// 4 most significant bits of the significand followed by a 1 and then 18 0's { .mfi - nop.m 999 -(p0) fmpy.s1 P_lo = M, P_lo - nop.i 999 ;; + nop.m 999 + fmpy.s1 P_hi = M, P_hi + dep.z special = 0x1, 18, 1 // Form 0x0000000000040000 } { .mfi - nop.m 999 -// -// Q = E * V -// -(p0) fcmp.lt.unc.s1 p6, p7 = Q, TWO_TO_NEG3 - nop.i 999 ;; + nop.m 999 + fmpy.s1 P_lo = M, P_lo + add table_ptr2 = 32, table_ptr1 } -{ .mmb -(p0) getf.sig significand_Q = Q -(p0) getf.exp exponent_Q = Q - nop.b 999 ;; +;; + +{ .mfi + nop.m 999 + fma.s1 A_temp = Q, f1, f0 // Set A_temp if POLY path + nop.i 999 } -{ .mmi - nop.m 999 ;; -(p0) andcm k = 0x0003, exponent_Q -(p0) extr.u lookup = significand_Q, 59, 4 ;; +{ .mfi + nop.m 999 + fma.s1 E = E, E_hold, E // E = E + E*E_hold (1) if POLY path + nop.i 999 } -{ .mib - nop.m 999 -(p0) dep special = lookup, special, 59, 4 +;; + // -// Generate 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0 +// Is Q < 2**(-3)? +// swap = xor(swap,sign_X) // -(p6) br.cond.spnt L(ATANL_POLY) ;; -} { .mfi -(p0) cmp.eq.unc p8, p9 = 0x0000, k -(p0) fmpy.s1 P_hi = s_Y, P_hi + nop.m 999 + fcmp.lt.s1 p9, p0 = Q, TWO_TO_NEG3 // Test Q < 2^-3 + xor swap = sign_X, swap +} +;; + +// P_hi = s_Y * P_hi +{ .mmf + getf.exp exponent_Q = Q // Get signexp of Q + cmp.eq.unc p7, p6 = 0x00000, swap + fmpy.s1 P_hi = s_Y, P_hi +} +;; + // -// We waited a few extra cycles so P_lo and P_hi could be calculated. -// Load the constant 256 for loading up table entries. 
+// if (PR_1) sigma = -1.0 +// if (PR_2) sigma = 1.0 +// +{ .mfi + getf.sig significand_Q = Q // Get significand of Q +(p6) fsub.s1 sigma = f0, f1 + nop.i 999 +} +{ .mfb +(p9) add table_ptr1 = 128, table_base // Point to P8 if POLY path +(p7) fadd.s1 sigma = f0, f1 +(p9) br.cond.spnt ATANL_POLY // Branch to POLY if 0 < Q < 2^-3 +} +;; + // // ************************************************* // ******************** STEP3 ********************** // ************************************************* -(p0) add table_ptr2 = 16, table_ptr1 -} // -// Let z_hi have exponent and sign of original Q -// Load the Tbl_hi(0) else, increment pointer. +// lookup = b_1 b_2 b_3 B_4 // -{ .mii -(p0) ldfe Q_4 = [table_ptr1], -16 -(p0) xor swap = sign_X, swap ;; -(p9) sub k = k, r0, 1 -} { .mmi -(p0) setf.sig z_hi = special -(p0) ldfe Q_3 = [table_ptr1], -16 -(p9) add table_ptr2 = 16, table_ptr2 ;; + nop.m 999 + nop.m 999 + andcm k = 0x0003, exponent_Q // k=0,1,2,3 for exp_Q=0,-1,-2,-3 } +;; + // -// U_hold = U - U_prime_hi -// k = k * 256 - Result can be 0, 256, or 512. +// Generate sign_exp_Q b_1 b_2 b_3 b_4 1 0 0 0 ... 0 in single precision +// representation. Note sign of Q is always 0. 
// -{ .mmb -(p0) ldfe Q_2 = [table_ptr1], -16 -(p8) ldfd Tbl_hi = [table_ptr2], 8 - nop.b 999 ;; +{ .mfi + cmp.eq p8, p9 = 0x0000, k // Test k=0 + nop.f 999 + extr.u lookup = significand_Q, 59, 4 // Extract b_1 b_2 b_3 b_4 for index } -// -// U_prime_lo = U_hold + V * z_hi -// lookup -> lookup * 16 + k -// -{ .mmi -(p0) ldfe Q_1 = [table_ptr1], -16 ;; -(p8) ldfs Tbl_lo = [table_ptr2], 8 -// -// U_prime_hi = U + V * z_hi -// Load the Tbl_lo(0) -// -(p9) pmpy2.r k = k, special1 ;; +{ .mfi + sub sp_exp_Q = 0x7f, k // Form single prec biased exp of Q + nop.f 999 + sub k = k, r0, 1 // Decrement k } -{ .mii - nop.m 999 - nop.i 999 - nop.i 999 ;; +;; + +// Form pointer to B index table +{ .mfi + ldfe Q_4 = [table_ptr1], -16 // Load Q_4 + nop.f 999 +(p9) shl k = k, 8 // k = 0, 256, or 512 } -{ .mii - nop.m 999 - nop.i 999 - nop.i 999 ;; +{ .mfi +(p9) shladd table_ptr2 = lookup, 4, table_ptr2 + nop.f 999 + shladd sp_exp_4sig_Q = sp_exp_Q, 4, lookup // Shift and add in 4 high bits } -{ .mii - nop.m 999 - nop.i 999 - nop.i 999 ;; +;; + +{ .mmi +(p8) add table_ptr2 = -16, table_ptr2 // Pointer if original k was 0 +(p9) add table_ptr2 = k, table_ptr2 // Pointer if k was 1, 2, 3 + dep special = sp_exp_4sig_Q, special, 19, 13 // Form z_hi as single prec } -{ .mii - nop.m 999 - nop.i 999 ;; -(p9) shladd lookup = lookup, 0x0004, k ;; +;; + +// z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 
0 +{ .mmi + ldfd Tbl_hi = [table_ptr2], 8 // Load Tbl_hi from index table +;; + setf.s z_hi = special // Form z_hi + nop.i 999 } { .mmi -(p9) add table_ptr2 = table_ptr2, lookup ;; -// -// V_prime = V - U * z_hi -// -(p9) ldfd Tbl_hi = [table_ptr2], 8 - nop.i 999 ;; + ldfs Tbl_lo = [table_ptr2], 8 // Load Tbl_lo from index table +;; + ldfe Q_3 = [table_ptr1], -16 // Load Q_3 + nop.i 999 } +;; + +{ .mmi + ldfe Q_2 = [table_ptr1], -16 // Load Q_2 + nop.m 999 + nop.i 999 +} +;; + { .mmf - nop.m 999 -// -// C_hi = frcpa(1,U_prime_hi) -// -(p9) ldfs Tbl_lo = [table_ptr2], 8 -// -// z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0 -// Point to beginning of Tbl_hi entries - k = 0. -// -(p0) fmerge.se z_hi = Q, z_hi ;; + ldfe Q_1 = [table_ptr1], -16 // Load Q_1 + nop.m 999 + nop.f 999 } +;; + { .mfi - nop.m 999 -(p0) fma.s1 U_prime_hi = V, z_hi, U - nop.i 999 + nop.m 999 + fma.s1 U_prime_hi = V, z_hi, U // U_prime_hi = U + V * z_hi + nop.i 999 } { .mfi - nop.m 999 -(p0) fnma.s1 V_prime = U, z_hi, V - nop.i 999 ;; + nop.m 999 + fnma.s1 V_prime = U, z_hi, V // V_prime = V - U * z_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) mov A_hi = Tbl_hi - nop.i 999 ;; + nop.m 999 + mov A_hi = Tbl_hi // Start with A_hi = Tbl_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fsub.s1 U_hold = U, U_prime_hi - nop.i 999 ;; + nop.m 999 + fsub.s1 U_hold = U, U_prime_hi // U_hold = U - U_prime_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) frcpa.s1 C_hi, p6 = f1, U_prime_hi - nop.i 999 ;; + nop.m 999 + frcpa.s1 C_hi, p0 = f1, U_prime_hi // C_hi = frcpa(1,U_prime_hi) + nop.i 999 } +;; + { .mfi -(p0) cmp.eq.unc p7, p6 = 0x00000, swap -(p0) fmpy.s1 A_hi = s_Y, A_hi - nop.i 999 ;; + nop.m 999 + fmpy.s1 A_hi = s_Y, A_hi // A_hi = s_Y * A_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// poly = wsq * poly -// -(p7) fadd.s1 sigma = f0, f1 - nop.i 999 ;; + nop.m 999 + fma.s1 U_prime_lo = z_hi, V, U_hold // U_prime_lo = U_hold + V * z_hi + nop.i 999 } +;; + +// C_hi_hold = 1 - C_hi * U_prime_hi (1) { .mfi 
- nop.m 999 -(p0) fma.s1 U_prime_lo = z_hi, V, U_hold - nop.i 999 + nop.m 999 + fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p6) fsub.s1 sigma = f0, f1 - nop.i 999 ;; + nop.m 999 + fma.s1 Res_hi = sigma, A_hi, P_hi // Res_hi = P_hi + sigma * A_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 - nop.i 999 ;; + nop.m 999 + fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (1) + nop.i 999 } +;; + +// C_hi_hold = 1 - C_hi * U_prime_hi (2) { .mfi - nop.m 999 -// -// A_lo = A_lo + w_hi -// A_hi = s_Y * A_hi -// -(p0) fma.s1 Res_hi = sigma, A_hi, P_hi - nop.i 999 ;; + nop.m 999 + fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// C_hi_hold = 1 - C_hi * U_prime_hi (1) -// -(p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi - nop.i 999 ;; + nop.m 999 + fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (2) + nop.i 999 } +;; + +// C_hi_hold = 1 - C_hi * U_prime_hi (3) { .mfi - nop.m 999 -// -// C_hi = C_hi + C_hi * C_hi_hold (1) -// -(p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 - nop.i 999 ;; + nop.m 999 + fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// C_hi_hold = 1 - C_hi * U_prime_hi (2) -// -(p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi - nop.i 999 ;; + nop.m 999 + fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (3) + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// C_hi = C_hi + C_hi * C_hi_hold (2) -// -(p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 - nop.i 999 ;; + nop.m 999 + fmpy.s1 w_hi = V_prime, C_hi // w_hi = V_prime * C_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// C_hi_hold = 1 - C_hi * U_prime_hi (3) -// -(p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi - nop.i 999 ;; + nop.m 999 + fmpy.s1 wsq = w_hi, w_hi // wsq = w_hi * w_hi + nop.i 999 } { .mfi - nop.m 999 -// -// C_hi = C_hi + C_hi * C_hi_hold (3) -// -(p0) fmpy.s1 w_hi = V_prime, C_hi - nop.i 999 ;; + nop.m 999 + 
fnma.s1 w_lo = w_hi, U_prime_hi, V_prime // w_lo = V_prime-w_hi*U_prime_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// w_hi = V_prime * C_hi -// -(p0) fmpy.s1 wsq = w_hi, w_hi - nop.i 999 + nop.m 999 + fma.s1 poly = wsq, Q_4, Q_3 // poly = Q_3 + wsq * Q_4 + nop.i 999 } { .mfi - nop.m 999 -(p0) fnma.s1 w_lo = w_hi, U_prime_hi, V_prime - nop.i 999 ;; + nop.m 999 + fnma.s1 w_lo = w_hi, U_prime_lo, w_lo // w_lo = w_lo - w_hi * U_prime_lo + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// wsq = w_hi * w_hi -// w_lo = = V_prime - w_hi * U_prime_hi -// -(p0) fma.s1 poly = wsq, Q_4, Q_3 - nop.i 999 + nop.m 999 + fma.s1 poly = wsq, poly, Q_2 // poly = Q_2 + wsq * poly + nop.i 999 } { .mfi - nop.m 999 -(p0) fnma.s1 w_lo = w_hi, U_prime_lo, w_lo - nop.i 999 ;; + nop.m 999 + fmpy.s1 w_lo = C_hi, w_lo // w_lo = = w_lo * C_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// poly = Q_3 + wsq * Q_4 -// w_lo = = w_lo - w_hi * U_prime_lo -// -(p0) fma.s1 poly = wsq, poly, Q_2 - nop.i 999 + nop.m 999 + fma.s1 poly = wsq, poly, Q_1 // poly = Q_1 + wsq * poly + nop.i 999 } { .mfi - nop.m 999 -(p0) fmpy.s1 w_lo = C_hi, w_lo - nop.i 999 ;; + nop.m 999 + fadd.s1 A_lo = Tbl_lo, w_lo // A_lo = Tbl_lo + w_lo + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// poly = Q_2 + wsq * poly -// w_lo = = w_lo * C_hi -// -(p0) fma.s1 poly = wsq, poly, Q_1 - nop.i 999 + nop.m 999 + fmpy.s0 Q_1 = Q_1, Q_1 // Dummy operation to raise inexact + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fadd.s1 A_lo = Tbl_lo, w_lo - nop.i 999 ;; + nop.m 999 + fmpy.s1 poly = wsq, poly // poly = wsq * poly + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Result = Res_hi + Res_lo * s_Y (User Supplied Rounding Mode) -// -(p0) fmpy.s0 Q_1 = Q_1, Q_1 - nop.i 999 ;; + nop.m 999 + fmpy.s1 poly = w_hi, poly // poly = w_hi * poly + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// poly = Q_1 + wsq * poly -// A_lo = Tbl_lo + w_lo -// swap = xor(swap,sign_X) -// -(p0) fmpy.s1 poly = wsq, poly - nop.i 999 ;; + nop.m 999 + fadd.s1 A_lo = A_lo, poly 
// A_lo = A_lo + poly + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Is (swap) != 0 ? -// poly = wsq * poly -// A_hi = Tbl_hi -// -(p0) fmpy.s1 poly = w_hi, poly - nop.i 999 ;; + nop.m 999 + fadd.s1 A_lo = A_lo, w_hi // A_lo = A_lo + w_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// if (PR_1) sigma = -1.0 -// if (PR_2) sigma = 1.0 -// -(p0) fadd.s1 A_lo = A_lo, poly - nop.i 999 ;; + nop.m 999 + fma.s1 Res_lo = sigma, A_lo, P_lo // Res_lo = P_lo + sigma * A_lo + nop.i 999 } -{ .mfi - nop.m 999 +;; + // -// P_hi = s_Y * P_hi -// A_lo = A_lo + poly +// Result = Res_hi + Res_lo * s_Y (User Supplied Rounding Mode) // -(p0) fadd.s1 A_lo = A_lo, w_hi - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p0) fma.s1 Res_lo = sigma, A_lo, P_lo - nop.i 999 ;; -} { .mfb - nop.m 999 -// -// Res_hi = P_hi + sigma * A_hi -// Res_lo = P_lo + sigma * A_lo -// -(p0) fma.s0 Result = Res_lo, s_Y, Res_hi -// -// Raise inexact. -// -br.ret.sptk b0 ;; -} -// -// poly1 = P_5 + zsq * poly1 -// poly2 = zsq * poly2 -// -L(ATANL_POLY): -{ .mmf -(p0) xor swap = sign_X, swap - nop.m 999 -(p0) fnma.s1 E_hold = E, U, f1 ;; + nop.m 999 + fma.s0 Result = Res_lo, s_Y, Res_hi + br.ret.sptk b0 // Exit table path 2^-3 <= V/U < 1 } -{ .mfi - nop.m 999 -(p0) mov A_temp = Q +;; + + +ATANL_POLY: +// Here if 0 < V/U < 2^-3 // -// poly1 = P_4 + zsq * poly1 -// swap = xor(swap,sign_X) +// *********************************************** +// ******************** STEP4 ******************** +// *********************************************** + // -// sign_X gr_002 -// swap gr_004 -// poly1 = poly1 <== Done with poly1 -// poly1 = P_4 + zsq * poly1 -// swap = xor(swap,sign_X) +// Following: +// Iterate 3 times E = E + E*(1.0 - E*U) +// Also load P_8, P_7, P_6, P_5, P_4 // -(p0) cmp.eq.unc p7, p6 = 0x00000, swap -} -{ .mfi - nop.m 999 -(p0) fmpy.s1 P_hi = s_Y, P_hi - nop.i 999 ;; -} { .mfi - nop.m 999 -(p6) fsub.s1 sigma = f0, f1 - nop.i 999 + ldfe P_8 = [table_ptr1], -16 // Load P_8 + fnma.s1 z_lo = A_temp, U, V // z_lo = V - 
A_temp * U + nop.i 999 } { .mfi - nop.m 999 -(p7) fadd.s1 sigma = f0, f1 - nop.i 999 ;; -} - -// *********************************************** -// ******************** STEP4 ******************** -// *********************************************** - -{ .mmi nop.m 999 -(p0) addl table_ptr1 = @ltoff(Constants_atan#), gp + fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (2) nop.i 999 } ;; { .mmi - ld8 table_ptr1 = [table_ptr1] - nop.m 999 + ldfe P_7 = [table_ptr1], -16 // Load P_7 +;; + ldfe P_6 = [table_ptr1], -16 // Load P_6 nop.i 999 } ;; - { .mfi - nop.m 999 -(p0) fma.s1 E = E, E_hold, E -// -// Following: -// Iterate 3 times E = E + E*(1.0 - E*U) -// Also load P_8, P_7, P_6, P_5, P_4 -// E_hold = 1.0 - E * U (1) -// A_temp = Q -// -(p0) add table_ptr1 = 128, table_ptr1 ;; -} -{ .mmf - nop.m 999 -// -// E = E + E_hold*E (1) -// Point to P_8. -// -(p0) ldfe P_8 = [table_ptr1], -16 -// -// poly = z8*poly1 + poly2 (Typo in writeup) -// Is (swap) != 0 ? -// -(p0) fnma.s1 z_lo = A_temp, U, V ;; + ldfe P_5 = [table_ptr1], -16 // Load P_5 + fma.s1 E = E, E_hold, E // E = E + E_hold*E (2) + nop.i 999 } -{ .mmb - nop.m 999 -// -// E_hold = 1.0 - E * U (2) -// -(p0) ldfe P_7 = [table_ptr1], -16 - nop.b 999 ;; +;; + +{ .mmi + ldfe P_4 = [table_ptr1], -16 // Load P_4 +;; + ldfe P_3 = [table_ptr1], -16 // Load P_3 + nop.i 999 } -{ .mmb - nop.m 999 -// -// E = E + E_hold*E (2) -// -(p0) ldfe P_6 = [table_ptr1], -16 - nop.b 999 ;; +;; + +{ .mfi + ldfe P_2 = [table_ptr1], -16 // Load P_2 + fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (3) + nop.i 999 } -{ .mmb - nop.m 999 -// -// E_hold = 1.0 - E * U (3) -// -(p0) ldfe P_5 = [table_ptr1], -16 - nop.b 999 ;; +{ .mlx + nop.m 999 + movl int_temp = 0x24005 // Signexp for small neg number } +;; + { .mmf - nop.m 999 -// -// E = E + E_hold*E (3) + ldfe P_1 = [table_ptr1], -16 // Load P_1 + setf.exp tmp_small = int_temp // Form small neg number + fma.s1 E = E, E_hold, E // E = E + E_hold*E (3) +} +;; + // // // At this point E 
approximates 1/U to roughly working precision -// z = V*E approximates V/U +// Z = V*E approximates V/U // -(p0) ldfe P_4 = [table_ptr1], -16 -(p0) fnma.s1 E_hold = E, U, f1 ;; +{ .mfi + nop.m 999 + fmpy.s1 Z = V, E // Z = V * E + nop.i 999 } -{ .mmb - nop.m 999 -// -// Z = V * E -// -(p0) ldfe P_3 = [table_ptr1], -16 - nop.b 999 ;; +{ .mfi + nop.m 999 + fmpy.s1 z_lo = z_lo, E // z_lo = z_lo * E + nop.i 999 } -{ .mmb - nop.m 999 +;; + // -// zsq = Z * Z +// Now what we want to do is +// poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8))) +// poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3)) // -(p0) ldfe P_2 = [table_ptr1], -16 - nop.b 999 ;; -} -{ .mmb - nop.m 999 // -// z8 = zsq * zsq +// Fixup added to force inexact later - +// A_hi = A_temp + z_lo +// z_lo = (A_temp - A_hi) + z_lo // -(p0) ldfe P_1 = [table_ptr1], -16 - nop.b 999 ;; -} -{ .mlx - nop.m 999 -(p0) movl int_temp = 0x24005 -} { .mfi - nop.m 999 -(p0) fma.s1 E = E, E_hold, E - nop.i 999 ;; + nop.m 999 + fmpy.s1 zsq = Z, Z // zsq = Z * Z + nop.i 999 } { .mfi - nop.m 999 -(p0) fnma.s1 E_hold = E, U, f1 - nop.i 999 ;; + nop.m 999 + fadd.s1 A_hi = A_temp, z_lo // A_hi = A_temp + z_lo + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fma.s1 E = E, E_hold, E - nop.i 999 ;; + nop.m 999 + fma.s1 poly1 = zsq, P_8, P_7 // poly1 = P_7 + zsq * P_8 + nop.i 999 } { .mfi - nop.m 999 -(p0) fmpy.s1 Z = V, E - nop.i 999 + nop.m 999 + fma.s1 poly2 = zsq, P_3, P_2 // poly2 = P_2 + zsq * P_3 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// z_lo = V - A_temp * U -// if (PR_2) sigma = 1.0 -// -(p0) fmpy.s1 z_lo = z_lo, E - nop.i 999 ;; + nop.m 999 + fmpy.s1 z4 = zsq, zsq // z4 = zsq * zsq + nop.i 999 } { .mfi - nop.m 999 -(p0) fmpy.s1 zsq = Z, Z - nop.i 999 + nop.m 999 + fsub.s1 A_temp = A_temp, A_hi // A_temp = A_temp - A_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// z_lo = z_lo * E -// if (PR_1) sigma = -1.0 -// -(p0) fadd.s1 A_hi = A_temp, z_lo - nop.i 999 ;; + nop.m 999 + fmerge.s tmp = A_hi, A_hi // Copy tmp = A_hi + nop.i 
999 } +;; + { .mfi - nop.m 999 -// -// z8 = z8 * z8 -// -// -// Now what we want to do is -// poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8))) -// poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3)) -// -(p0) fma.s1 poly1 = zsq, P_8, P_7 - nop.i 999 + nop.m 999 + fma.s1 poly1 = zsq, poly1, P_6 // poly1 = P_6 + zsq * poly1 + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 poly2 = zsq, P_3, P_2 - nop.i 999 ;; + nop.m 999 + fma.s1 poly2 = zsq, poly2, P_1 // poly2 = P_1 + zsq * poly2 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fmpy.s1 z8 = zsq, zsq - nop.i 999 + nop.m 999 + fmpy.s1 z8 = z4, z4 // z8 = z4 * z4 + nop.i 999 } { .mfi - nop.m 999 -(p0) fsub.s1 A_temp = A_temp, A_hi - nop.i 999 ;; + nop.m 999 + fadd.s1 z_lo = A_temp, z_lo // z_lo = (A_temp - A_hi) + z_lo + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// A_lo = Z * poly + z_lo -// -(p0) fmerge.s tmp = A_hi, A_hi - nop.i 999 ;; + nop.m 999 + fma.s1 poly1 = zsq, poly1, P_5 // poly1 = P_5 + zsq * poly1 + nop.i 999 } { .mfi - nop.m 999 -// -// poly1 = P_7 + zsq * P_8 -// poly2 = P_2 + zsq * P_3 -// -(p0) fma.s1 poly1 = zsq, poly1, P_6 - nop.i 999 + nop.m 999 + fmpy.s1 poly2 = poly2, zsq // poly2 = zsq * poly2 + nop.i 999 } +;; + +// Create small GR double in case need to raise underflow { .mfi - nop.m 999 -(p0) fma.s1 poly2 = zsq, poly2, P_1 - nop.i 999 ;; + nop.m 999 + fma.s1 poly1 = zsq, poly1, P_4 // poly1 = P_4 + zsq * poly1 + dep GR_temp = -1,r0,0,53 } +;; + +// Create small double in case need to raise underflow { .mfi - nop.m 999 -(p0) fmpy.s1 z8 = z8, z8 - nop.i 999 + setf.d FR_temp = GR_temp + fma.s1 poly = z8, poly1, poly2 // poly = poly2 + z8 * poly1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fadd.s1 z_lo = A_temp, z_lo - nop.i 999 ;; + nop.m 999 + fma.s1 A_lo = Z, poly, z_lo // A_lo = z_lo + Z * poly + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// poly1 = P_6 + zsq * poly1 -// poly2 = P_2 + zsq * poly2 -// -(p0) fma.s1 poly1 = zsq, poly1, P_5 - nop.i 999 + nop.m 999 + fadd.s1 A_hi = tmp, A_lo // A_hi = tmp + 
A_lo + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fmpy.s1 poly2 = poly2, zsq - nop.i 999 ;; + nop.m 999 + fsub.s1 tmp = tmp, A_hi // tmp = tmp - A_hi + nop.i 999 } { .mfi - nop.m 999 -// -// Result = Res_hi + Res_lo (User Supplied Rounding Mode) -// -(p0) fmpy.s1 P_5 = P_5, P_5 - nop.i 999 ;; + nop.m 999 + fmpy.s1 A_hi = s_Y, A_hi // A_hi = s_Y * A_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fma.s1 poly1 = zsq, poly1, P_4 - nop.i 999 ;; + nop.m 999 + fadd.s1 A_lo = tmp, A_lo // A_lo = tmp + A_lo + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 poly = z8, poly1, poly2 - nop.i 999 ;; + nop.m 999 + fma.s1 Res_hi = sigma, A_hi, P_hi // Res_hi = P_hi + sigma * A_hi + nop.i 999 } +;; + { .mfi - nop.m 999 + nop.m 999 + fsub.s1 tmp = P_hi, Res_hi // tmp = P_hi - Res_hi + nop.i 999 +} +;; + // -// Fixup added to force inexact later - -// A_hi = A_temp + z_lo -// z_lo = (A_temp - A_hi) + z_lo +// Test if A_lo is zero // -(p0) fma.s1 A_lo = Z, poly, z_lo - nop.i 999 ;; -} { .mfi - nop.m 999 -(p0) fadd.s1 A_hi = tmp, A_lo - nop.i 999 ;; + nop.m 999 + fclass.m p6,p0 = A_lo, 0x007 // Test A_lo = 0 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fsub.s1 tmp = tmp, A_hi - nop.i 999 + nop.m 999 +(p6) mov A_lo = tmp_small // If A_lo zero, make very small + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fmpy.s1 A_hi = s_Y, A_hi - nop.i 999 ;; + nop.m 999 + fma.s1 tmp = A_hi, sigma, tmp // tmp = sigma * A_hi + tmp + nop.i 999 } { .mfi - nop.m 999 -(p0) fadd.s1 A_lo = tmp, A_lo - nop.i 999 + nop.m 999 + fma.s1 sigma = A_lo, sigma, P_lo // sigma = A_lo * sigma + P_lo + nop.i 999 } +;; + { .mfi -(p0) setf.exp tmp = int_temp + nop.m 999 + fma.s1 Res_lo = s_Y, sigma, tmp // Res_lo = s_Y * sigma + tmp + nop.i 999 +} +;; + // -// P_hi = s_Y * P_hi -// A_hi = s_Y * A_hi +// Test if Res_lo is denormal // -(p0) fma.s1 Res_hi = sigma, A_hi, P_hi - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p0) fclass.m.unc p6,p0 = A_lo, 0x007 - nop.i 999 ;; -} { .mfi - nop.m 999 -(p6) mov A_lo = tmp - nop.i 999 + nop.m 999 
+ fclass.m p14, p15 = Res_lo, 0x0b + nop.i 999 } -{ .mfi - nop.m 999 +;; + // -// Res_hi = P_hi + sigma * A_hi +// Compute Result = Res_lo + Res_hi. Use s3 if Res_lo is denormal. // -(p0) fsub.s1 tmp = P_hi, Res_hi - nop.i 999 ;; -} { .mfi - nop.m 999 -// -// tmp = P_hi - Res_hi -// -(p0) fma.s1 tmp = A_hi, sigma, tmp - nop.i 999 + nop.m 999 +(p14) fadd.s3 Result = Res_lo, Res_hi // Result for Res_lo denormal + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 sigma = A_lo, sigma, P_lo - nop.i 999 ;; + nop.m 999 +(p15) fadd.s0 Result = Res_lo, Res_hi // Result for Res_lo normal + nop.i 999 } +;; + +// +// If Res_lo is denormal test if Result equals zero +// { .mfi - nop.m 999 -// -// tmp = sigma * A_hi + tmp -// sigma = A_lo * sigma + P_lo -// -(p0) fma.s1 Res_lo = s_Y, sigma, tmp - nop.i 999 ;; + nop.m 999 +(p14) fclass.m.unc p14, p0 = Result, 0x07 + nop.i 999 } -{ .mfb - nop.m 999 +;; + // -// Res_lo = s_Y * sigma + tmp +// If Res_lo is denormal and Result equals zero, raise inexact, underflow +// by squaring small double // -(p0) fadd.s0 Result = Res_lo, Res_hi -br.ret.sptk b0 ;; +{ .mfb + nop.m 999 +(p14) fmpy.d.s0 FR_temp = FR_temp, FR_temp + br.ret.sptk b0 // Exit POLY path, 0 < Q < 2^-3 } -L(ATANL_NATVAL): -L(ATANL_UNSUPPORTED): -L(ATANL_NAN): +;; + + +ATANL_UNSUPPORTED: { .mfb - nop.m 999 -(p0) fmpy.s0 Result = ArgX,ArgY -(p0) br.ret.sptk b0 ;; + nop.m 999 + fmpy.s0 Result = ArgX,ArgY + br.ret.sptk b0 } -L(ATANL_SPECIAL_HANDLING): +;; + +// Here if y natval, nan, inf, zero +ATANL_Y_SPECIAL: +// Here if x natval, nan, inf, zero +ATANL_X_SPECIAL: { .mfi - nop.m 999 -(p0) fcmp.eq.s0 p0, p6 = f1, ArgY_orig - nop.i 999 + nop.m 999 + fclass.m p13,p12 = ArgY_orig, 0x0c3 // Test y nan + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fcmp.eq.s0 p0, p5 = f1, ArgX_orig - nop.i 999 ;; + nop.m 999 + fclass.m p15,p14 = ArgY_orig, 0x103 // Test y natval + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fclass.m.unc p6, p7 = ArgY, 0x007 - nop.i 999 -} -{ .mlx - nop.m 999 -(p0) movl 
special = 992 + nop.m 999 +(p12) fclass.m p13,p0 = ArgX_orig, 0x0c3 // Test x nan + nop.i 999 } ;; - -{ .mmi +{ .mfi nop.m 999 -(p0) addl table_ptr1 = @ltoff(Constants_atan#), gp +(p14) fclass.m p15,p0 = ArgX_orig, 0x103 // Test x natval nop.i 999 } ;; -{ .mmi - ld8 table_ptr1 = [table_ptr1] +{ .mfb nop.m 999 - nop.i 999 +(p13) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result nan if x or y nan +(p13) br.ret.spnt b0 // Exit if x or y nan +} +;; + +{ .mfb + nop.m 999 +(p15) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result natval if x or y natval +(p15) br.ret.spnt b0 // Exit if x or y natval } ;; -{ .mib -(p0) add table_ptr1 = table_ptr1, special - nop.i 999 -(p7) br.cond.spnt L(ATANL_ArgY_Not_ZERO) ;; +// Here if x or y inf or zero +ATANL_SPECIAL_HANDLING: +{ .mfi + nop.m 999 + fclass.m p6, p7 = ArgY_orig, 0x007 // Test y zero + mov special = 992 // Offset to table } +;; + +{ .mfb + add table_ptr1 = table_base, special // Point to 3pi/4 + fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig // Dummy to set denormal flag +(p7) br.cond.spnt ATANL_ArgY_Not_ZERO // Branch if y not zero +} +;; + +// Here if y zero { .mmf -(p0) ldfd Result = [table_ptr1], 8 - nop.m 999 -(p6) fclass.m.unc p14, p0 = ArgX, 0x035 ;; + ldfd Result = [table_ptr1], 8 // Get pi high + nop.m 999 + fclass.m p14, p0 = ArgX, 0x035 // Test for x>=+0 } +;; + { .mmf - nop.m 999 -(p0) ldfd Result_lo = [table_ptr1], -8 -(p6) fclass.m.unc p15, p0 = ArgX, 0x036 ;; + nop.m 999 + ldfd Result_lo = [table_ptr1], -8 // Get pi lo + fclass.m p15, p0 = ArgX, 0x036 // Test for x<=-0 } +;; + +// +// Return sign_Y * 0 when ArgX > +0 +// { .mfi - nop.m 999 -(p14) fmerge.s Result = ArgY, f0 - nop.i 999 + nop.m 999 +(p14) fmerge.s Result = ArgY, f0 // If x>=+0, y=0, hi sgn(y)*0 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p6) fclass.m.unc p13, p0 = ArgX, 0x007 - nop.i 999 ;; + nop.m 999 + fclass.m p13, p0 = ArgX, 0x007 // Test for x=0 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p14) fmerge.s Result_lo = ArgY, f0 - nop.i 999 ;; + nop.m 999 
+(p14) fmerge.s Result_lo = ArgY, f0 // If x>=+0, y=0, lo sgn(y)*0 + nop.i 999 } +;; + { .mfi -(p13) mov GR_Parameter_TAG = 36 - nop.f 999 - nop.i 999 ;; +(p13) mov GR_Parameter_TAG = 36 // Error tag for x=0, y=0 + nop.f 999 + nop.i 999 } -{ .mfi - nop.m 999 +;; + // -// Return sign_Y * 0 when ArgX > +0 +// Return sign_Y * pi when ArgX < -0 // -(p15) fmerge.s Result = ArgY, Result - nop.i 999 ;; -} { .mfi - nop.m 999 -(p15) fmerge.s Result_lo = ArgY, Result_lo - nop.i 999 ;; + nop.m 999 +(p15) fmerge.s Result = ArgY, Result // If x<0, y=0, hi=sgn(y)*pi + nop.i 999 } -{ .mfb - nop.m 999 -// -// Return sign_Y * 0 when ArgX < -0 -// -(p0) fadd.s0 Result = Result, Result_lo -(p13) br.cond.spnt __libm_error_region ;; +;; + +{ .mfi + nop.m 999 +(p15) fmerge.s Result_lo = ArgY, Result_lo // If x<0, y=0, lo=sgn(y)*pi + nop.i 999 } -{ .mib - nop.m 999 - nop.i 999 +;; + // -// Call error support funciton for atan(0,0) +// Call error support function for atan(0,0) // -(p0) br.ret.sptk b0 ;; -} -L(ATANL_ArgY_Not_ZERO): -{ .mfi - nop.m 999 -(p0) fclass.m.unc p9, p10 = ArgY, 0x023 - nop.i 999 ;; +{ .mfb + nop.m 999 + fadd.s0 Result = Result, Result_lo +(p13) br.cond.spnt __libm_error_region // Branch if atan(0,0) } +;; + { .mib - nop.m 999 - nop.i 999 -(p10) br.cond.spnt L(ATANL_ArgY_Not_INF) ;; -} -{ .mfi - nop.m 999 -(p9) fclass.m.unc p6, p0 = ArgX, 0x017 - nop.i 999 -} -{ .mfi - nop.m 999 -(p9) fclass.m.unc p7, p0 = ArgX, 0x021 - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p9) fclass.m.unc p8, p0 = ArgX, 0x022 - nop.i 999 ;; -} -{ .mmi -(p6) add table_ptr1 = 16, table_ptr1 ;; -(p0) ldfd Result = [table_ptr1], 8 - nop.i 999 ;; -} -{ .mfi -(p0) ldfd Result_lo = [table_ptr1], -8 - nop.f 999 - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p6) fmerge.s Result = ArgY, Result - nop.i 999 ;; + nop.m 999 + nop.i 999 + br.ret.sptk b0 // Exit for y=0, x not 0 } +;; + +// Here if y not zero +ATANL_ArgY_Not_ZERO: { .mfi - nop.m 999 -(p6) fmerge.s Result_lo = ArgY, Result_lo - nop.i 999 ;; + nop.m 999 + 
fclass.m p0, p10 = ArgY, 0x023 // Test y inf + nop.i 999 } +;; + { .mfb - nop.m 999 -(p6) fadd.s0 Result = Result, Result_lo -(p6) br.ret.sptk b0 ;; + nop.m 999 + fclass.m p6, p0 = ArgX, 0x017 // Test for 0 <= |x| < inf +(p10) br.cond.spnt ATANL_ArgY_Not_INF // Branch if 0 < |y| < inf } +;; + +// Here if y=inf // -// Load PI/2 and adjust its sign. // Return +PI/2 when ArgY = +Inf and ArgX = +/-0 or normal // Return -PI/2 when ArgY = -Inf and ArgX = +/-0 or normal +// Return +PI/4 when ArgY = +Inf and ArgX = +Inf +// Return -PI/4 when ArgY = -Inf and ArgX = +Inf +// Return +3PI/4 when ArgY = +Inf and ArgX = -Inf +// Return -3PI/4 when ArgY = -Inf and ArgX = -Inf // -{ .mmi -(p7) add table_ptr1 = 32, table_ptr1 ;; -(p7) ldfd Result = [table_ptr1], 8 - nop.i 999 ;; -} { .mfi -(p7) ldfd Result_lo = [table_ptr1], -8 - nop.f 999 - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p7) fmerge.s Result = ArgY, Result - nop.i 999 ;; + nop.m 999 + fclass.m p7, p0 = ArgX, 0x021 // Test for x=+inf + nop.i 999 } +;; + { .mfi - nop.m 999 -(p7) fmerge.s Result_lo = ArgY, Result_lo - nop.i 999 ;; -} -{ .mfb - nop.m 999 -(p7) fadd.s0 Result = Result, Result_lo -(p7) br.ret.sptk b0 ;; +(p6) add table_ptr1 = 16, table_ptr1 // Point to pi/2, if x finite + fclass.m p8, p0 = ArgX, 0x022 // Test for x=-inf + nop.i 999 } -// -// Load PI/4 and adjust its sign. 
-// Return +PI/4 when ArgY = +Inf and ArgX = +Inf -// Return -PI/4 when ArgY = -Inf and ArgX = +Inf -// +;; + { .mmi -(p8) add table_ptr1 = 48, table_ptr1 ;; -(p8) ldfd Result = [table_ptr1], 8 - nop.i 999 ;; +(p7) add table_ptr1 = 32, table_ptr1 // Point to pi/4 if x=+inf +;; +(p8) add table_ptr1 = 48, table_ptr1 // Point to 3pi/4 if x=-inf + + nop.i 999 } -{ .mfi -(p8) ldfd Result_lo = [table_ptr1], -8 - nop.f 999 - nop.i 999 ;; +;; + +{ .mmi + ldfd Result = [table_ptr1], 8 // Load pi/2, pi/4, or 3pi/4 hi +;; + ldfd Result_lo = [table_ptr1], -8 // Load pi/2, pi/4, or 3pi/4 lo + nop.i 999 } +;; + { .mfi - nop.m 999 -(p8) fmerge.s Result = ArgY, Result - nop.i 999 ;; + nop.m 999 + fmerge.s Result = ArgY, Result // Merge sgn(y) in hi + nop.i 999 } +;; + { .mfi - nop.m 999 -(p8) fmerge.s Result_lo = ArgY, Result_lo - nop.i 999 ;; + nop.m 999 + fmerge.s Result_lo = ArgY, Result_lo // Merge sgn(y) in lo + nop.i 999 } +;; + { .mfb - nop.m 999 -(p8) fadd.s0 Result = Result, Result_lo -(p8) br.ret.sptk b0 ;; + nop.m 999 + fadd.s0 Result = Result, Result_lo // Compute complete result + br.ret.sptk b0 // Exit for y=inf } -L(ATANL_ArgY_Not_INF): -{ .mfi - nop.m 999 +;; + +// Here if y not INF, and x=0 or INF +ATANL_ArgY_Not_INF: // -// Load PI/4 and adjust its sign. 
-// Return +3PI/4 when ArgY = +Inf and ArgX = -Inf -// Return -3PI/4 when ArgY = -Inf and ArgX = -Inf +// Return +PI/2 when ArgY NOT Inf, ArgY > 0 and ArgX = +/-0 +// Return -PI/2 when ArgY NOT Inf, ArgY < 0 and ArgX = +/-0 +// Return +0 when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf +// Return -0 when ArgY NOT Inf, ArgY < 0 and ArgX = +Inf +// Return +PI when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf +// Return -PI when ArgY NOT Inf, ArgY < 0 and ArgX = -Inf // -(p0) fclass.m.unc p6, p0 = ArgX, 0x007 - nop.i 999 -} -{ .mfi - nop.m 999 -(p0) fclass.m.unc p7, p0 = ArgX, 0x021 - nop.i 999 ;; -} { .mfi - nop.m 999 -(p0) fclass.m.unc p8, p0 = ArgX, 0x022 - nop.i 999 ;; -} -{ .mmi -(p6) add table_ptr1 = 16, table_ptr1 ;; -(p6) ldfd Result = [table_ptr1], 8 - nop.i 999 ;; + nop.m 999 + fclass.m p7, p9 = ArgX, 0x021 // Test for x=+inf + nop.i 999 } +;; + { .mfi -(p6) ldfd Result_lo = [table_ptr1], -8 - nop.f 999 - nop.i 999 ;; + nop.m 999 + fclass.m p6, p0 = ArgX, 0x007 // Test for x=0 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p6) fmerge.s Result = ArgY, Result - nop.i 999 ;; +(p6) add table_ptr1 = 16, table_ptr1 // Point to pi/2 + fclass.m p8, p0 = ArgX, 0x022 // Test for x=-inf + nop.i 999 } +;; + +.pred.rel "mutex",p7,p9 { .mfi - nop.m 999 -(p6) fmerge.s Result_lo = ArgY, Result_lo - nop.i 999 ;; -} -{ .mfb - nop.m 999 -(p6) fadd.s0 Result = Result, Result_lo -(p6) br.ret.spnt b0 ;; +(p9) ldfd Result = [table_ptr1], 8 // Load pi or pi/2 hi +(p7) fmerge.s Result = ArgY, f0 // If y not inf, x=+inf, sgn(y)*0 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// return = sign_Y * PI/2 when ArgX = 0 -// -(p7) fmerge.s Result = ArgY, f0 - nop.i 999 ;; -} -{ .mfb - nop.m 999 -(p7) fnorm.s0 Result = Result -(p7) br.ret.spnt b0 ;; -} -// -// return = sign_Y * 0 when ArgX = Inf -// -{ .mmi -(p8) ldfd Result = [table_ptr1], 8 ;; -(p8) ldfd Result_lo = [table_ptr1], -8 - nop.i 999 ;; +(p9) ldfd Result_lo = [table_ptr1], -8 // Load pi or pi/2 lo +(p7) fnorm.s0 Result = Result // If y not inf, 
x=+inf normalize + nop.i 999 } +;; + { .mfi - nop.m 999 -(p8) fmerge.s Result = ArgY, Result - nop.i 999 ;; + nop.m 999 +(p9) fmerge.s Result = ArgY, Result // Merge sgn(y) in hi + nop.i 999 } +;; + { .mfi - nop.m 999 -(p8) fmerge.s Result_lo = ArgY, Result_lo - nop.i 999 ;; + nop.m 999 +(p9) fmerge.s Result_lo = ArgY, Result_lo // Merge sgn(y) in lo + nop.i 999 } +;; + { .mfb - nop.m 999 -(p8) fadd.s0 Result = Result, Result_lo -(p8) br.ret.sptk b0 ;; + nop.m 999 +(p9) fadd.s0 Result = Result, Result_lo // Compute complete result + br.ret.spnt b0 // Exit for y not inf, x=0,inf } -// -// return = sign_Y * PI when ArgX = -Inf -// -.endp atan2l -ASM_SIZE_DIRECTIVE(atan2l) -ASM_SIZE_DIRECTIVE(__atan2l) -ASM_SIZE_DIRECTIVE(__ieee754_atan2l) - -.proc __libm_error_region -__libm_error_region: +;; + +GLOBAL_IEEE754_END(atan2l) +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value @@ -2001,7 +1999,6 @@ __libm_error_region: br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region#) .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_cbrt.S b/sysdeps/ia64/fpu/s_cbrt.S index 1e23b6024d..b7a827d1da 100644 --- a/sysdeps/ia64/fpu/s_cbrt.S +++ b/sysdeps/ia64/fpu/s_cbrt.S @@ -1,11 +1,10 @@ -.file "cbrt.asm" +.file "cbrt.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Shane Story, and Ping Tak Peter Tang -// of the Computational Software Lab, Intel Corporation. 
+// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -21,27 +20,30 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. 
+// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00: Initial version -// 5/19/00: New version (modified algorithm) +// 02/02/00 Initial version +// 05/19/00 New version (modified algorithm) +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/28/03 Updated polynomial coefficients (changed to Remez coefficients), +// to slightly improve accuracy // // API //============================================================== @@ -53,637 +55,713 @@ // // Implementation // -// cbrt(a) = cbrt(a y) / cbrt(y) -// = cbrt(1 - (1 - a y)) * 1/cbrt(y) +// Let y= frcpa(a), where a is the argument +// +// cbrt(a)= cbrt(a*y)/cbrt(y) = cbrt(1 - (1-a*y)) * (1/cbrt(y)) +// +// For all values of y, the 3 possible significands of 1/cbrt(y) +// are stored in a table (T0) to 64 bits of accuracy. (There are +// 3 possible significands because the exponent of y modulo 3 +// can be 0, 1, or 2.) // -// where y = frcpa(a). // -// * cbrt(1 - (1 - a y)) is approximated by a degree-5 polynomial -// -// 1 - (1/3)*r - (1/9)*r^2 - (5/81)*r^3 - (10/243)*r^4 - (22/729)*r^5 -// -// in r = 1 - a y. +// * cbrt(1 - (1-a*y)) is approximated by a degree-5 polynomial ~ +// +// ~ 1 - (1/3)*r - (1/9)*r^2 - (5/81)*r^3 - (10/243)*r^4 - (22/729)*r^5 +// +// in r = 1-a*y. // -// * The values 1/cbrt(y) are stored in a table of constants T0 -// to 64 bits of accuracy // // The table values are stored for three exponent values and are // then multiplied by e/3 where e is the exponent of the input number. 
// This computation is carried out in parallel with the polynomial // evaluation: // -// T = 2^(e/3) * T0 +// T= 2^(e/3) * T0 //=============== -// input = x -// C = frcpa(x) -// r = 1 - C * x +// input= x +// C= frcpa(x) +// r= 1 - C * x // -// Special values +// Special values //============================================================== // Registers used //============================================================== -// f6-f15 -// r2, r23-r26, r28-r30 -// p6,p7,p8,p12 +// f6-f15 +// GR_GP, r23-r26, r28-r30 +// p6, p7, p8, p12 + + FR_R = f6 + FR_COEFF1 = f7 + FR_COEFF2 = f9 + FR_COEFF3 = f10 + FR_COEFF4 = f11 + FR_COEFF5 = f12 + FR_R2 = f13 + FR_ARG = f14 + FR_P23 = f15 + FR_P25 = f32 + FR_P15 = f33 + FR_P1 = f34 + FR_P45 = f35 + FR_2EXP = f36 + FR_TMP63 = f37 + + GR_GP = r2 + GR_ADDR = r2 + GR_CONST1 = r3 + GR_I1 = r8 + GR_EXP = r9 + GR_ADDR2 = r10 + GR_IT1 = r11 + GR_TMP2 = r11 + GR_EXPON = r15 + GR_TMP1 = r16 + GR_TMP6 = r16 + GR_ITB1 = r17 + GR_TMP3 = r18 + GR_TMP4 = r19 + GR_TMP63 = r19 + GR_TMP5 = r20 + GR_EXP_BY_3 = r20 + GR_CONST4 = r21 + GR_TMP6 = r22 + GR_INDEX = r23 + GR_EBIAS = r24 + GR_SIGNIF = r25 + GR_SIGNIF2 = r25 + GR_TEST = r25 + GR_ARGEXP = r26 + GR_CONST2 = r27 + GR_SIGN = r28 + GR_REM = r29 + GR_CONST3 = r30 + GR_SEXP = r31 + + + -#include "libm_support.h" // Data tables //============================================================== -#ifdef _LIBC -.rodata -#else -.data -#endif +RODATA .align 16 -poly_coeffs: -ASM_TYPE_DIRECTIVE(poly_coeffs,@object) -data8 0xaaaaaaaaaaaaaaab, 0x00003ffd // 1/3 -data8 0x3fbc71c71c71c71d, 0x3faf9add3c0ca459 -data8 0x3fa511e8d2b3183b, 0x3f9ee7113506ac13 -ASM_SIZE_DIRECTIVE(poly_coeffs) - -T_table: -ASM_TYPE_DIRECTIVE(T_table,@object) - -data8 0x80155c748c374836, 0xa160019ed37fb4ae -data8 0xcb51ddcb9e93095e, 0x8040404b0879f7f9 -data8 0xa1960b5966da4608, 0xcb95f333968ad59b -data8 0x806b5dce4b405c10, 0xa1cc5dbe6dc2aab4 -data8 0xcbda64292d3ffd97, 0x8096b586974669b1 -data8 0xa202f97995b69c0d, 0xcc1f3184af961596 
-data8 0x80bcd273d952a028, 0xa232fe6eb0c0577d -data8 0xcc5bb1ac954d33e2, 0x80e898c52813f2f3 -data8 0xa26a2582012f6e17, 0xcca12e9831fc6402 -data8 0x81149add67c2d208, 0xa2a197e5d10465cb -data8 0xcce70a67b64f24ad, 0x813b4e2c856b6e9a -data8 0xa2d25a532efefbc8, 0xcd24794726477ea5 -data8 0x8167c1dde03de7aa, 0xa30a5bd6e49e4ab8 -data8 0xcd6b096a0b70ee87, 0x818ed973b811135e -data8 0xa33b9c9b59879e24, 0xcda9177738b15a90 -data8 0x81bbc0c33e13ec98, 0xa3742fca6a3c1f21 -data8 0xcdf05f2247dffab9, 0x81e33e69fbe7504a -data8 0xa3a5f1273887bf22, 0xce2f0f347f96f906 -data8 0x820aec524e3c23e9, 0xa3d7ef508ff11574 -data8 0xce6e0be0cd551a61, 0x823880f78e70b805 -data8 0xa4115ce30548bc15, 0xceb666b2c347d1de -data8 0x826097a62a8e5200, 0xa443df0e53df577a -data8 0xcef609b0cb874f00, 0x8288dfe00e9b5eaf -data8 0xa4769fa5913c0ec3, 0xcf35fb5447e5c765 -data8 0x82b15a10c5371624, 0xa4a99f303bc7def5 -data8 0xcf763c47ee869f00, 0x82da06a527b18937 -data8 0xa4dcde37779adf4b, 0xcfb6cd3888d71785 -data8 0x8302e60b635ab394, 0xa5105d46152c938a -data8 0xcff7aed4fbfbb447, 0x832bf8b2feec2f0e -data8 0xa5441ce89825cb8d, 0xd038e1ce5167e3c6 -data8 0x83553f0ce00e276b, 0xa5781dad3e54d899 -data8 0xd07a66d7bfa0ebba, 0x837eb98b50f8322a -data8 0xa5ac602406c4e68c, 0xd0bc3ea6b32d1b21 -data8 0x83a270f44c84f699, 0xa5d9601d95c2c0bc -data8 0xd0f4f0e8f36c1bf8, 0x83cc4d7cfcfac5ca -data8 0xa60e1e1a2de14745, 0xd1376458e34b037e -data8 0x83f65f78a8872b4c, 0xa6431f6e3fbd9658 -data8 0xd17a2ca133f78572, 0x8420a75f2f7b53c8 -data8 0xa67864b0d432fda4, 0xd1bd4a80301c5715 -data8 0x844510461ff14209, 0xa6a6444aa0243c0b -data8 0xd1f71682b2fa4575, 0x846fbd91b930bed2 -data8 0xa6dc094d10f25792, 0xd23ad555f773f059 -data8 0x84947e18234f3294, 0xa70a574cc02bba69 -data8 0xd2752c7039a5bf73, 0x84bf92755825045a -data8 0xa7409e2af9549084, 0xd2b98ee008c06b59 -data8 0x84e4ac0ee112ba51, 0xa76f5c64ca2cf13b -data8 0xd2f4735ffd700280, 0x8509ef44b86f20be -data8 0xa79e4f0babab5dc0, 0xd32f99ed6d9ac0e1 -data8 0x85359d5d91768427, 0xa7d5579ae5164b85 -data8 
0xd374f0666c75d51c, 0x855b3bd5b7384357 -data8 0xa804bd3c6fe61cc8, 0xd3b0a7d13618e4a1 -data8 0x858104f0c415f79a, 0xa8345895e5250a5a -data8 0xd3eca2ea53bcec0c, 0x85a6f90390d29864 -data8 0xa8642a122b44ef0b, 0xd428e23874f13a17 -data8 0x85d3772fcd56a1dd, 0xa89c38ca18f6108b -data8 0xd46f82fe293bc6d3, 0x85f9c982fcc002f3 -data8 0xa8cc81063b6e87ca, 0xd4ac57e9b7186420 -data8 0x862047e0e7ea554b, 0xa8fd00bfa409285e -data8 0xd4e972becb04e8b8, 0x8646f2a26f7f5852 -data8 0xa92db8664d5516da, 0xd526d40a7a9b43a3 -data8 0x866dca21754096b5, 0xa95ea86b75cc2c20 -data8 0xd5647c5b73917370, 0x8694ceb8dfd17a37 -data8 0xa98fd141a4992deb, 0xd5a26c4201bd6d13 -data8 0x86bc00c49e9307e8, 0xa9c1335cae7446ba -data8 0xd5e0a45015350a7e, 0x86dccd74fce79610 -data8 0xa9ea8686f556f645, 0xd614b539c6194104 -data8 0x870453c845acf90f, 0xaa1c52d17906bb19 -data8 0xd6537310e224283f, 0x872c089a1e90342c -data8 0xaa4e59b046dab887, 0xd6927ab62244c917 -data8 0x8753ec4a92d16c5e, 0xaa809b9c60d1890b -data8 0xd6d1ccc1fc4ef4b7, 0x877bff3aca19f6b4 -data8 0xaab319102f3f9b33, 0xd71169cea98fdded -data8 0x879d88b6fe1c324c, 0xaadd5a18c1e21274 -data8 0xd746a66a5bc9f6d9, 0x87c5f346dbf98c3a -data8 0xab1045f2ac31bdf5, 0xd786ce8f0fae5317 -data8 0x87e7c653efacef2c, 0xab3ae3ab2df7231e -data8 0xd7bc7ff214c4e75a, 0x881089d4e73ffefc -data8 0xab6e3f945d1e96fc, 0xd7fd35467a517ed1 -data8 0x88397e6a366f2a8a, 0xaba1d953a08fa94e -data8 0xd83e38838648d815, 0x885bc559e5e1c081 -data8 0xabcd090db7ef4c3f, 0xd874a1db598b8951 -data8 0x887e2ee392bb7a93, 0xabf864602d7c323d -data8 0xd8ab42205b80edaf, 0x88a7a8587e404257 -data8 0xac2ca5886ccf9b57, 0xd8ed1849d202f965 -data8 0x88ca5eda67594784, 0xac5861d4aa441f0f -data8 0xd92432bd5a173685, 0x88f4356166bd590e -data8 0xac8d183fe3a2fbed, 0xd9669ca45b03c23e -data8 0x89173a0acf5ce026, 0xacb93703ff51571e -data8 0xd99e3327cf89574e, 0x893a62a098b6a57b -data8 0xace5830ad0c3f14b, 0xd9d602b19b100466 -data8 0x895daf637236ae2c, 0xad11fca5d78b3ff2 -data8 0xda0e0ba86c096841, 0x89883b9d1c2fa9c5 -data8 0xad4797fddf91a798, 
0xda5195fcdb1c3dce -data8 0x89abd8dd374a5d7b, 0xad747701e559ebcb -data8 0xda8a1eb87a491f6c, 0x89cf9b1dcd197fa0 -data8 0xada184a47e9c7613, 0xdac2e230b91c3f84 -data8 0x89f382a258ea79de, 0xadcec13ab0dda8ff -data8 0xdafbe0d0b66aea30, 0x8a178faf06648f29 -data8 0xadfc2d1a5fd21ba8, 0xdb351b04a8fafced -data8 0x8a3bc288b3e1d18a, 0xae29c89a5053c33a -data8 0xdb6e9139e33cdd8e, 0x8a601b74f4d1f835 -data8 0xae5794122b638df9, 0xdba843ded7151ea1 -data8 0x8a849aba14274764, 0xae858fda8137ae0a -data8 0xdbe2336319b61fc8, 0x8aa9409f16cdbc9b -data8 0xaeb3bc4ccc56d3d1, 0xdc1c60376789fa68 -data8 0x8ace0d6bbe2cb316, 0xaee219c374c09920 -data8 0xdc56cacda82d0cd5, 0x8af301688ab33558 -data8 0xaf10a899d3235fe7, 0xdc917398f2797814 -data8 0x8b181cdebe6f3206, 0xaf3f692c341fe8b4 -data8 0xdccc5b0d90a3e628, 0x8b3d60185fafcb7c -data8 0xaf6e5bd7db9ae6c2, 0xdd0781a10469f0f2 -data8 0x8b62cb603bb2fad0, 0xaf9d80fb081cd91b -data8 0xdd42e7ca0b52838f, 0x8b80d7d6bc4104de -data8 0xafc35ce063eb3787, 0xdd729ad01c69114d -data8 0x8ba68bf73ac74f39, 0xaff2ddcb5f28f03d -data8 0xddae749c001fbf5e, 0x8bcc68fb9f9f7335 -data8 0xb022923b148e05c5, 0xddea8f50a51c69b1 -data8 0x8bf26f31c534fca2, 0xb0527a919adbf58b -data8 0xde26eb69a0f0f111, 0x8c10f86e13a1a1f9 -data8 0xb078f3ab1d701c65, 0xde576480262399bc -data8 0x8c3749916cc6abb5, 0xb0a93a6870649f31 -data8 0xde943789645933c8, 0x8c5dc4c4f7706032 -data8 0xb0d9b624d62ec856, 0xded14d58139a28af -data8 0x8c7cac3a8c42e3e0, 0xb100a5f53fb3c8e1 -data8 0xdf025c00bbf2b5c7, 0x8ca373f1b7bf2716 -data8 0xb131821882f5540a, 0xdf3feb44d723a713 -data8 0x8cc29907fb951294, 0xb158bf8e4cb04055 -data8 0xdf715bc16c159be0, 0x8ce9ae4e9492aac8 -data8 0xb189fd69d56b238f, 0xdfaf66240e29cda8 -data8 0x8d0911dddbfdad0e, 0xb1b189958e8108e4 -data8 0xdfe139cbf6e19bdc, 0x8d3075c4f20f04ee -data8 0xb1e32a8165b09832, 0xe01fc0fe94d9fc52 -data8 0x8d5018a9d4de77d5, 0xb20b0678fc271eec -data8 0xe051f92ffcc0bd60, 0x8d77cc47dd143515 -data8 0xb23d0bd3f7592b6e, 0xe090feec9c9a06ac -data8 0x8d97af6352739cb7, 0xb26538b2db8420dc 
-data8 0xe0c39d0c9ff862d6, 0x8db7af523167800f -data8 0xb28d89e339ceca14, 0xe0f668eeb99f188d -data8 0x8ddfd80bc68c32ff, 0xb2c022ca12e55a16 -data8 0xe1362890eb663139, 0x8e00197e1e7c88fe -data8 0xb2e8c6852c6b03f1, 0xe1695c7212aecbaa -data8 0x8e207859f77e20e7, 0xb3118f4eda9fe40f -data8 0xe19cbf0391bbbbe9, 0x8e40f4ce60c9f8e2 -data8 0xb33a7d6268109ebe, 0xe1d050901c531e85 -data8 0x8e69ba46cf2fde4d, 0xb36ddbc5ea70ec55 -data8 0xe2110903b4f4047a, 0x8e8a7a00bd7ae63e -data8 0xb3971e9b39264023, 0xe2450559b4d80b6d -data8 0x8eab57ef1cf2f529, 0xb3c0877ecc18e24a -data8 0xe27931a231554ef3, 0x8ecc5442cffb1dad -data8 0xb3ea16ae3a6c905f, 0xe2ad8e2ac3c5b04b -data8 0x8eed6f2d2a4acbfe, 0xb413cc67aa0e4d2d -data8 0xe2e21b41b9694cce, 0x8f0ea8dff24441ff -data8 0xb43da8e9d163e1af, 0xe316d93615862714 -data8 0x8f385c95d696b817, 0xb47233773b84d425 -data8 0xe3590bd86a0d30f9, 0x8f59dc43edd930f3 -data8 0xb49c6825430fe730, 0xe38e38e38e38e38e -data8 0x8f7b7b5f5ffad1c4, 0xb4c6c46bcdb27dcf -data8 0xe3c397d1e6db7839, 0x8f9d3a1bea165f38 -data8 0xb4f1488c0b35d26f, 0xe3f928f5953feb9e -data8 0x8fbf18adc34b66da, 0xb51bf4c7c51f0168 -data8 0xe42eeca17c62886c, 0x8fe117499e356095 -data8 0xb546c9616087ab9c, 0xe464e32943446305 -data8 0x90033624aa685f8d, 0xb571c69bdffd9a70 -data8 0xe49b0ce15747a8a2, 0x9025757495f36b86 -data8 0xb59cecbae56984c3, 0xe4d16a1eee94e9d4 -data8 0x903f3a5dcc091203, 0xb5bd64512bb14bb7 -data8 0xe4fa52107353f67d, 0x9061b2fceb2bdbab -data8 0xb5e8d2a4bf5ba416, 0xe5310a471f4d2dc3 -data8 0x90844ca7211032a7, 0xb6146a9a1bc47819 -data8 0xe567f6f1c2b9c224, 0x90a7079403e6a15d -data8 0xb6402c7749d621c0, 0xe59f18689a9e4c9a -data8 0x90c9e3fbafd63799, 0xb66c1882fb435ea2 -data8 0xe5d66f04b8a68ecf, 0x90ece216c8a16ee4 -data8 0xb6982f048c999a56, 0xe60dfb2005c192e9 -data8 0x9110021e7b516f0a, 0xb6c47044075b4142 -data8 0xe645bd1544c7ea51, 0x912a708a39be9075 -data8 0xb6e5bd6bfd02bafd, 0xe66fb21b505b20a0 -data8 0x914dcc7b31146370, 0xb7124a2736ff8ef2 -data8 0xe6a7d32af4a7c59a, 0x91714af8cfe984d5 -data8 
0xb73f026a01e94177, 0xe6e02b129c6a5ae4 -data8 0x918c00a6f3795e97, 0xb760a959f1d0a7a7 -data8 0xe70a9136a7403039, 0x91afbc299ed0295d -data8 0xb78dae7e06868ab0, 0xe74349fb2d92a589 -data8 0x91d39add3e958db0, 0xb7badff8ad9e4e02 -data8 0xe77c3a9c86ed7d42, 0x91ee9920a8974d92 -data8 0xb7dce25b8e17ae9f, 0xe7a713f88151518a -data8 0x9212b5fcac537c19, 0xb80a6226904045e2 -data8 0xe7e067453317ed2b, 0x9236f6b256923fcf -data8 0xb8380f1cafd73c1c, 0xe819f37a81871bb5 -data8 0x92523ee6f90dcfc3, 0xb85a6ea8e321b4d8 -data8 0xe8454236bfaeca14, 0x9276bef031e6eb79 -data8 0xb8886b684ae7d2fa, 0xe87f32f24c3fc90e -data8 0x929236ec237a24ad, 0xb8ab0726fa00cf5d -data8 0xe8aacd8688892ba6, 0x92b6f70b7efe9dc3 -data8 0xb8d954a4d13b7cb1, 0xe8e523fd32f606f7 -data8 0x92d29f61eec7dc2b, 0xb8fc2d4f6cd9f04a -data8 0xe9110b5311407927, 0x92f7a05d5b8ba92f -data8 0xb92acc851476b1ab, 0xe94bc8bf0c108fa3 -data8 0x931379a403be5c16, 0xb94de2d841a184c2 -data8 0xe977fdc439c2ca3c, 0x9338bc44de2e3f34 -data8 0xb97cd4c36c92693c, 0xe9b3236528fc349e -data8 0x9354c71412c69486, 0xb9a0297f172665e3 -data8 0xe9dfa70b745ac1b4, 0x937a4c273907e262 -data8 0xb9cf6f21e36c3924, 0xea1b36268d0eaa38 -data8 0x93968919f6e7975d, 0xb9f3030951267208 -data8 0xea480963fd394197, 0x93bc516fdd4680c9 -data8 0xba229d6a618e7c59, 0xea84034425f27484 -data8 0x93d8c123d9be59b2, 0xba467144459f9855 -data8 0xeab12713138dd1cc, 0x93f546c955e60076 -data8 0xba6a60c3c48f1a4b, 0xeade6db73a5e503b -data8 0x941b70a65879079f, 0xba9a76056b67ee7a -data8 0xeb1b0268343b121b, 0x943829f337410591 -data8 0xbabea699563ada6e, 0xeb489b0b2bdb5f14 -data8 0x9454f995765bc4d2, 0xbae2f350b262cc4b -data8 0xeb765721e85f03d0, 0x947b86b57f5842ed -data8 0xbb1385a23be24e57, 0xebb389645f222f62 -data8 0x94988aeb23470f86, 0xbb3814975e17c680 -data8 0xebe198f090607e0c, 0x94b5a5dc9695f42a -data8 0xbb5cc031009bf467, 0xec0fcc9321024509 -data8 0x94d2d7a9170d8b42, 0xbb81889680024764 -data8 0xec3e247da8b82f61, 0x94f9e87dd78bf019 -data8 0xbbb2c0d8703ae95d, 0xec7c27d21321c9f7 -data8 0x95175019a503d89e, 
0xbbd7cd09ba3c5463 -data8 0xecaad5278824e453, 0x9534cefa625fcb3a -data8 0xbbfcf68c4977718f, 0xecd9a76d097d4e77 -data8 0x955265405c491a25, 0xbc223d88cfc88eee -data8 0xed089ed5dcd99446, 0x9570130c1f9bb857 -data8 0xbc47a2284fee4ff8, 0xed37bb95add09a1c -data8 0x9597ca4119525184, 0xbc79ac0916ed7b8a -data8 0xed76c70508f904b6, 0x95b5af6fb5aa4d3c -data8 0xbc9f5670d1a13030, 0xeda63bb05e7f93c6 -data8 0x95d3ac9273aafd7a, 0xbcc51f068cb95c1d -data8 0xedd5d661daed2dc4, 0x95f1c1cafdfd3684 -data8 0xbceb05f4b30a9bc0, 0xee05974eef86b903 -data8 0x960fef3b430b8d5f, 0xbd110b6604c7d306 -data8 0xee357ead791fc670, 0x962e350575b409c5 -data8 0xbd372f8598620f19, 0xee658cb3c134a463 -data8 0x964c934c0dfc1708, 0xbd5d727edb6b3c7e -data8 0xee95c1987f080211, 0x966b0a31c9c6bc7d -data8 0xbd83d47d937bbc6d, 0xeec61d92d8c4314f -data8 0x968999d9ad8d264e, 0xbdaa55addf1ae47d -data8 0xeef6a0da64a014ac, 0x96a8426705198795 -data8 0xbdd0f63c36aa73f0, 0xef274ba72a07c811 -data8 0x96c703fd64445ee5, 0xbdf7b6556d550a15 -data8 0xef581e31a2c91260, 0x96e5dec0a7b4268d -data8 0xbe1e9626b1ffa96b, 0xef8918b2bc43aec6 -data8 0x9704d2d4f59f79f3, 0xbe4595dd903e5371 -data8 0xefba3b63d89d7cbf, 0x9723e05ebe91b9b0 -data8 0xbe6cb5a7f14bc935, 0xefeb867ecffaa607 -data8 0x97430782be323831, 0xbe93f5b41d047cf7 -data8 0xf01cfa3df1b9c9fa, 0x97624865fc0df8bf -data8 0xbebb5630bae4c15f, 0xf04e96dc05b43e2d -data8 0x9781a32dcc640b2a, 0xbee2d74cd30a430c -data8 0xf0805c944d827454, 0x97a117ffd0f48e46 -data8 0xbf0a7937cf38d981, 0xf0b24ba285c495cb -data8 0x97c0a701f9d263c9, 0xbf323c217be2bc8c -data8 0xf0e46442e76f6569, 0x97e0505a8637a036 -data8 0xbf5a203a09342bbb, 0xf116a6b2291d7896 -data8 0x97f57a9fb0b08c6e, 0xbf74cad1c14ebfc4 -data8 0xf1383fa9e9b5b381, 0x9815503365914a9d -data8 0xbf9ce6a497a89f78, 0xf16ac84f90083b9b -data8 0x98354085054fd204, 0xbfc52428bec6e72f -data8 0xf19d7b686dcb03d7, 0x98554bbbf8a77902 -data8 0xbfed838fddab024b, 0xf1d0593311db1757 -data8 0x987571fffb7f94f6, 0xc016050c0420981a -data8 0xf20361ee8f1c711e, 0x9895b3791dd03c23 
-data8 0xc03ea8cfabddc330, 0xf23695da7de51d3f -data8 0x98ab43a5fc65d0c8, 0xc059d3cbd65ddbce -data8 0xf258d095e465cc35, 0x98cbb2d196bd713d -data8 0xc082b122a3c78c9d, 0xf28c4d0bfc982b34 -data8 0x98ec3d9ec7b6f21a, 0xc0abb1499ae736c4 -data8 0xf2bff55eb3f0ea71, 0x990ce436db5e8344 -data8 0xc0d4d474c3aedaaf, 0xf2f3c9cf9884636e -data8 0x9922b8218160967a, 0xc0f054ca33eb3437 -data8 0xf31670135ab9cc0f, 0x99438d686f75779d -data8 0xc119b2c67e600ed0, 0xf34a8e9f0b54cdfb -data8 0x99647eea131fa20b, 0xc1433453de2033ff -data8 0xf37ed9fa6b8add3f, 0x997a85045a47c6d0 -data8 0xc15ef3e44e10032d, 0xf3a1cfe884ef6bb6 -data8 0x999ba5f14f8add02, 0xc188b130431d80e6 -data8 0xf3d66689dcc8e8d3, 0x99bce38b5465ecae -data8 0xc1b2929d6067730e, 0xf40b2ab069d5c96a -data8 0x99d31ca0887f30f9, 0xc1ce9268f31cc734 -data8 0xf42e718b90c8bc16, 0x99f48a669c74c09e -data8 0xc1f8b0877c1b0c08, 0xf463822a0a3b4b00 -data8 0x9a16154eb445c873, 0xc222f35a87b415ba -data8 0xf498c1076015faf8, 0x9a2c822ec198d667 -data8 0xc23f3467349e5c88, 0xf4bc5a19a33990b5 -data8 0x9a4e3e080cd91b78, 0xc269b4e40e088c01 -data8 0xf4f1e6a7d6f5425f, 0x9a70177afe52322e -data8 0xc2945aac24daaf6e, 0xf527a232cf6be334 -data8 0x9a86b8fa94eebe10, 0xc2b0de05e43c1d66 -data8 0xf54b8ecdcda90851, 0x9aa8c42866ae2958 -data8 0xc2dbc275e1229d09, 0xf5819949c7ad87b4 -data8 0x9abf86f9e12fc45e, 0xc2f86fca9d80eeff -data8 0xf5a5bac9213b48a9, 0x9ae1c462fc05f49d -data8 0xc323938449a2587e, 0xf5dc1501f324a812 -data8 0x9af8a8dc936b84d0, 0xc3406b40a538ed20 -data8 0xf6006bee86b5589e, 0x9b1b19033be35730 -data8 0xc36bcee8211d15e0, 0xf63716b2fa067fa4 -data8 0x9b3da7daf04c2892, 0xc397593adf2ba366 -data8 0xf66df22fb6132b9c, 0x9b54c2e4c8a9012b -data8 0xc3b475b6206155d5, 0xf6929fb98225deb1 -data8 0x9b77854e6c661200, 0xc3e0410243b97383 -data8 0xf6c9cd13021e3fea, 0x9b8ec2e678d56d2f -data8 0xc3fd890709833d37, 0xf6eeb177472cedae -data8 0x9ba60e6a5ca133b6, 0xc41ae295f7e7fa06 -data8 0xf713abf4cb0b3afb, 0x9bc919ea66a151a4 -data8 0xc44709f7bb8a4dd2, 0xf74b4d5333684ef1 -data8 
0x9be0887c09ef82bb, 0xc4648fb0e0bec4c1 -data8 0xf7707f75a72f8e94, 0x9c03c8d5fffc3503 -data8 0xc490f9a94695ba14, 0xf7a874b97927af44 -data8 0x9c1b5ad21a81cbb9, 0xc4aeac0173b7d390 -data8 0xf7cddf140aedf1d8, 0x9c3ed09216e9ca02 -data8 0xc4db5941007aa853, 0xf806291bacb7f7a9 -data8 0x9c568656c0423def, 0xc4f938aec206291a -data8 0xf82bcc43b92eafef, 0x9c7a320af242ce60 -data8 0xc52629e899dfd622, 0xf8646bf0defb759e -data8 0x9c920bf7a8c01dc2, 0xc54436e44043b965 -data8 0xf88a487dfc3ff5f7, 0x9ca9f475d98b159c -data8 0xc562563abf9ea07f, 0xf8b03c2b46cdc17f -data8 0x9ccdeca60e80b5f8, 0xc58fa7d1dc42921c -data8 0xf8e95541c152ae7a, 0x9ce5f9d4653d4902 -data8 0xc5adf561b91e110a, 0xf90f832c2700c160 -data8 0x9cfe15cb38bfdd8e, 0xc5cc5591bdbd82fa -data8 0xf935c88e0c7f419b, 0x9d225b983f6c1f96 -data8 0xc5fa08f1ff20593c, 0xf96f5cd84fd86873 -data8 0x9d3a9cca32261ed7, 0xc618980a79ce6862 -data8 0xf995dd53ebdd9d6d, 0x9d52ecfccebe1768 -data8 0xc6373a09e34b50fa, 0xf9bc75a034436a41 -data8 0x9d77818d95b82f86, 0xc66550a6e0baaf35 -data8 0xf9f686f26d5518de, 0x9d8ff7893fa4706c -data8 0xc6842241926342c9, 0xfa1d5b39b910a8c5 -data8 0x9da87cbef36f2a5e, 0xc6a3070b7c93bb9e -data8 0xfa4447acc4ecbfd2, 0x9dcd6140b4a35aeb -data8 0xc6d18260bb84081b, 0xfa7ed7e51e6fdfb4 -data8 0x9de60cd06dc6e2d4, 0xc6f0977c9416828b -data8 0xfaa601394d49a1a0, 0x9dfec7d4cc43b76f -data8 0xc70fc0117c641630, 0xfacd431644ce0e40 -data8 0x9e17925ec9fccc4a, 0xc72efc34d7e615be -data8 0xfaf49d96f7a75909, 0x9e3cdf6db57dc075 -data8 0xc75dfb441594141e, 0xfb2fd3c65e562fd5 -data8 0x9e55d110b63637a8, 0xc77d68aa019bda4c -data8 0xfb576c5762024805, 0x9e6ed27594550d2e -data8 0xc79ce9ea478dbc4f, 0xfb7f1debc22c4040 -data8 0x9e87e3adc385d393, 0xc7bc7f1ae453219d -data8 0xfba6e89f32d0190a, 0x9ead9b54b37a1055 -data8 0xc7ec0476e15e141a, 0xfbe2c803a0894893 -data8 0x9ec6d46a3d7de215, 0xc80bcbe16f1d540f -data8 0xfc0ad1ff0ed9ecf0, 0x9ee01d9108be3154 -data8 0xc82ba78a5d349735, 0xfc32f57bdfbcbe7f -data8 0x9ef976db07288d04, 0xc84b978847a06b87 -data8 0xfc5b32968f99b21c, 
0x9f12e05a4759ec25 -data8 0xc86b9bf1ee817bc6, 0xfc83896bc861ab08 -data8 0x9f2c5a20f4da6668, 0xc88bb4de3667cdf4 -data8 0xfcabfa1861ed4815, 0x9f52af78ed1733ca -data8 0xc8bc00e7fe9e23a3, 0xfce8d3cea7d3163e -data8 0x9f6c52426a39d003, 0xc8dc4d7ff2d25232 -data8 0xfd118595143ee273, 0x9f860593d42fd7f3 -data8 0xc8fcaeebcb40eb47, 0xfd3a519943d4865a -data8 0x9f9fc97fdb96bd51, 0xc91d25431426a663 -data8 0xfd6337f8e1ae5a4b, 0x9fb99e194f4a7037 -data8 0xc93db09d7fdb2949, 0xfd8c38d1c8e927eb -data8 0x9fd383731ca51db9, 0xc95e5112e721582a -data8 0xfdb5544205095a53, 0x9fed79a04fbf9423 -data8 0xc97f06bb49787677, 0xfdde8a67d2613531 -data8 0xa00780b413b24ee8, 0xc99fd1aecd6e1b06 -data8 0xfe07db619e781611, 0xa02eab2c4474b0cd -data8 0xc9d12a3e27bb1625, 0xfe460768d80bf758 -data8 0xa048dcd51ccfd142, 0xc9f22ad82ba3d5f0 -data8 0xfe6f9bfb06cd32f6, 0xa0631fa894b11b8d -data8 0xca134113105e67b2, 0xfe994bcd3d14fcc2 -data8 0xa07d73ba65e680af, 0xca346d07b045a876 -data8 0xfec316fecaf3f2ab, 0xa097d91e6aaf71b0 -data8 0xca55aecf0e94bb88, 0xfeecfdaf33fadb80 -data8 0xa0b24fe89e02602f, 0xca77068257be9bab -data8 0xff16fffe2fa8fad6, 0xa0ccd82d1bd2f68b -data8 0xca98743ae1c693a8, 0xff411e0ba9db886d -data8 0xa0e77200215909e6, 0xcab9f8122c99a101 -data8 0xff6b57f7c33e4e9a, 0xa1021d760d584855 -data8 0xcadb9221e268c3b5, 0xff95ade2d1bd7358 -data8 0xa11cdaa36068a57d, 0xcafd4283d8043dfd -data8 0xffc01fed60f86fb5, 0xa137a99cbd3f880b -data8 0xcb1f09520d37c6fb, 0xffeaae3832b63956 -ASM_SIZE_DIRECTIVE(T_table) - - - - - - -.align 32 -.global cbrt# +LOCAL_OBJECT_START(poly_coeffs) + + data8 0xaaaaaaaaaaaaaab4, 0x0000bffd // ~ 1/3 + data8 0xbfbc71c71c718e45, 0xbfaf9add3c0bbb43 + data8 0xbfa511edb93dc98d, 0xbf9ee71c45f0dfbc +LOCAL_OBJECT_END(poly_coeffs) + + +// For every entry B in the frcpa table, this table contains +// the significands of cbrt(1/B), cbrt(2/B), cbrt(4/B). +// The index to this table is the same as the frcpa index. 
+ +LOCAL_OBJECT_START(T_table) + + + data8 0x80155c748c374836, 0xa160019ed37fb4ae + data8 0xcb51ddcb9e93095e, 0x8040404b0879f7f9 + data8 0xa1960b5966da4608, 0xcb95f333968ad59b + data8 0x806b5dce4b405c10, 0xa1cc5dbe6dc2aab4 + data8 0xcbda64292d3ffd97, 0x8096b586974669b1 + data8 0xa202f97995b69c0d, 0xcc1f3184af961596 + data8 0x80bcd273d952a028, 0xa232fe6eb0c0577d + data8 0xcc5bb1ac954d33e2, 0x80e898c52813f2f3 + data8 0xa26a2582012f6e17, 0xcca12e9831fc6402 + data8 0x81149add67c2d208, 0xa2a197e5d10465cb + data8 0xcce70a67b64f24ad, 0x813b4e2c856b6e9a + data8 0xa2d25a532efefbc8, 0xcd24794726477ea5 + data8 0x8167c1dde03de7aa, 0xa30a5bd6e49e4ab8 + data8 0xcd6b096a0b70ee87, 0x818ed973b811135e + data8 0xa33b9c9b59879e24, 0xcda9177738b15a90 + data8 0x81bbc0c33e13ec98, 0xa3742fca6a3c1f21 + data8 0xcdf05f2247dffab9, 0x81e33e69fbe7504a + data8 0xa3a5f1273887bf22, 0xce2f0f347f96f906 + data8 0x820aec524e3c23e9, 0xa3d7ef508ff11574 + data8 0xce6e0be0cd551a61, 0x823880f78e70b805 + data8 0xa4115ce30548bc15, 0xceb666b2c347d1de + data8 0x826097a62a8e5200, 0xa443df0e53df577a + data8 0xcef609b0cb874f00, 0x8288dfe00e9b5eaf + data8 0xa4769fa5913c0ec3, 0xcf35fb5447e5c765 + data8 0x82b15a10c5371624, 0xa4a99f303bc7def5 + data8 0xcf763c47ee869f00, 0x82da06a527b18937 + data8 0xa4dcde37779adf4b, 0xcfb6cd3888d71785 + data8 0x8302e60b635ab394, 0xa5105d46152c938a + data8 0xcff7aed4fbfbb447, 0x832bf8b2feec2f0e + data8 0xa5441ce89825cb8d, 0xd038e1ce5167e3c6 + data8 0x83553f0ce00e276b, 0xa5781dad3e54d899 + data8 0xd07a66d7bfa0ebba, 0x837eb98b50f8322a + data8 0xa5ac602406c4e68c, 0xd0bc3ea6b32d1b21 + data8 0x83a270f44c84f699, 0xa5d9601d95c2c0bc + data8 0xd0f4f0e8f36c1bf8, 0x83cc4d7cfcfac5ca + data8 0xa60e1e1a2de14745, 0xd1376458e34b037e + data8 0x83f65f78a8872b4c, 0xa6431f6e3fbd9658 + data8 0xd17a2ca133f78572, 0x8420a75f2f7b53c8 + data8 0xa67864b0d432fda4, 0xd1bd4a80301c5715 + data8 0x844510461ff14209, 0xa6a6444aa0243c0b + data8 0xd1f71682b2fa4575, 0x846fbd91b930bed2 + data8 0xa6dc094d10f25792, 
0xd23ad555f773f059 + data8 0x84947e18234f3294, 0xa70a574cc02bba69 + data8 0xd2752c7039a5bf73, 0x84bf92755825045a + data8 0xa7409e2af9549084, 0xd2b98ee008c06b59 + data8 0x84e4ac0ee112ba51, 0xa76f5c64ca2cf13b + data8 0xd2f4735ffd700280, 0x8509ef44b86f20be + data8 0xa79e4f0babab5dc0, 0xd32f99ed6d9ac0e1 + data8 0x85359d5d91768427, 0xa7d5579ae5164b85 + data8 0xd374f0666c75d51c, 0x855b3bd5b7384357 + data8 0xa804bd3c6fe61cc8, 0xd3b0a7d13618e4a1 + data8 0x858104f0c415f79a, 0xa8345895e5250a5a + data8 0xd3eca2ea53bcec0c, 0x85a6f90390d29864 + data8 0xa8642a122b44ef0b, 0xd428e23874f13a17 + data8 0x85d3772fcd56a1dd, 0xa89c38ca18f6108b + data8 0xd46f82fe293bc6d3, 0x85f9c982fcc002f3 + data8 0xa8cc81063b6e87ca, 0xd4ac57e9b7186420 + data8 0x862047e0e7ea554b, 0xa8fd00bfa409285e + data8 0xd4e972becb04e8b8, 0x8646f2a26f7f5852 + data8 0xa92db8664d5516da, 0xd526d40a7a9b43a3 + data8 0x866dca21754096b5, 0xa95ea86b75cc2c20 + data8 0xd5647c5b73917370, 0x8694ceb8dfd17a37 + data8 0xa98fd141a4992deb, 0xd5a26c4201bd6d13 + data8 0x86bc00c49e9307e8, 0xa9c1335cae7446ba + data8 0xd5e0a45015350a7e, 0x86dccd74fce79610 + data8 0xa9ea8686f556f645, 0xd614b539c6194104 + data8 0x870453c845acf90f, 0xaa1c52d17906bb19 + data8 0xd6537310e224283f, 0x872c089a1e90342c + data8 0xaa4e59b046dab887, 0xd6927ab62244c917 + data8 0x8753ec4a92d16c5e, 0xaa809b9c60d1890b + data8 0xd6d1ccc1fc4ef4b7, 0x877bff3aca19f6b4 + data8 0xaab319102f3f9b33, 0xd71169cea98fdded + data8 0x879d88b6fe1c324c, 0xaadd5a18c1e21274 + data8 0xd746a66a5bc9f6d9, 0x87c5f346dbf98c3a + data8 0xab1045f2ac31bdf5, 0xd786ce8f0fae5317 + data8 0x87e7c653efacef2c, 0xab3ae3ab2df7231e + data8 0xd7bc7ff214c4e75a, 0x881089d4e73ffefc + data8 0xab6e3f945d1e96fc, 0xd7fd35467a517ed1 + data8 0x88397e6a366f2a8a, 0xaba1d953a08fa94e + data8 0xd83e38838648d815, 0x885bc559e5e1c081 + data8 0xabcd090db7ef4c3f, 0xd874a1db598b8951 + data8 0x887e2ee392bb7a93, 0xabf864602d7c323d + data8 0xd8ab42205b80edaf, 0x88a7a8587e404257 + data8 0xac2ca5886ccf9b57, 0xd8ed1849d202f965 + 
data8 0x88ca5eda67594784, 0xac5861d4aa441f0f + data8 0xd92432bd5a173685, 0x88f4356166bd590e + data8 0xac8d183fe3a2fbed, 0xd9669ca45b03c23e + data8 0x89173a0acf5ce026, 0xacb93703ff51571e + data8 0xd99e3327cf89574e, 0x893a62a098b6a57b + data8 0xace5830ad0c3f14b, 0xd9d602b19b100466 + data8 0x895daf637236ae2c, 0xad11fca5d78b3ff2 + data8 0xda0e0ba86c096841, 0x89883b9d1c2fa9c5 + data8 0xad4797fddf91a798, 0xda5195fcdb1c3dce + data8 0x89abd8dd374a5d7b, 0xad747701e559ebcb + data8 0xda8a1eb87a491f6c, 0x89cf9b1dcd197fa0 + data8 0xada184a47e9c7613, 0xdac2e230b91c3f84 + data8 0x89f382a258ea79de, 0xadcec13ab0dda8ff + data8 0xdafbe0d0b66aea30, 0x8a178faf06648f29 + data8 0xadfc2d1a5fd21ba8, 0xdb351b04a8fafced + data8 0x8a3bc288b3e1d18a, 0xae29c89a5053c33a + data8 0xdb6e9139e33cdd8e, 0x8a601b74f4d1f835 + data8 0xae5794122b638df9, 0xdba843ded7151ea1 + data8 0x8a849aba14274764, 0xae858fda8137ae0a + data8 0xdbe2336319b61fc8, 0x8aa9409f16cdbc9b + data8 0xaeb3bc4ccc56d3d1, 0xdc1c60376789fa68 + data8 0x8ace0d6bbe2cb316, 0xaee219c374c09920 + data8 0xdc56cacda82d0cd5, 0x8af301688ab33558 + data8 0xaf10a899d3235fe7, 0xdc917398f2797814 + data8 0x8b181cdebe6f3206, 0xaf3f692c341fe8b4 + data8 0xdccc5b0d90a3e628, 0x8b3d60185fafcb7c + data8 0xaf6e5bd7db9ae6c2, 0xdd0781a10469f0f2 + data8 0x8b62cb603bb2fad0, 0xaf9d80fb081cd91b + data8 0xdd42e7ca0b52838f, 0x8b80d7d6bc4104de + data8 0xafc35ce063eb3787, 0xdd729ad01c69114d + data8 0x8ba68bf73ac74f39, 0xaff2ddcb5f28f03d + data8 0xddae749c001fbf5e, 0x8bcc68fb9f9f7335 + data8 0xb022923b148e05c5, 0xddea8f50a51c69b1 + data8 0x8bf26f31c534fca2, 0xb0527a919adbf58b + data8 0xde26eb69a0f0f111, 0x8c10f86e13a1a1f9 + data8 0xb078f3ab1d701c65, 0xde576480262399bc + data8 0x8c3749916cc6abb5, 0xb0a93a6870649f31 + data8 0xde943789645933c8, 0x8c5dc4c4f7706032 + data8 0xb0d9b624d62ec856, 0xded14d58139a28af + data8 0x8c7cac3a8c42e3e0, 0xb100a5f53fb3c8e1 + data8 0xdf025c00bbf2b5c7, 0x8ca373f1b7bf2716 + data8 0xb131821882f5540a, 0xdf3feb44d723a713 + data8 0x8cc29907fb951294, 
0xb158bf8e4cb04055 + data8 0xdf715bc16c159be0, 0x8ce9ae4e9492aac8 + data8 0xb189fd69d56b238f, 0xdfaf66240e29cda8 + data8 0x8d0911dddbfdad0e, 0xb1b189958e8108e4 + data8 0xdfe139cbf6e19bdc, 0x8d3075c4f20f04ee + data8 0xb1e32a8165b09832, 0xe01fc0fe94d9fc52 + data8 0x8d5018a9d4de77d5, 0xb20b0678fc271eec + data8 0xe051f92ffcc0bd60, 0x8d77cc47dd143515 + data8 0xb23d0bd3f7592b6e, 0xe090feec9c9a06ac + data8 0x8d97af6352739cb7, 0xb26538b2db8420dc + data8 0xe0c39d0c9ff862d6, 0x8db7af523167800f + data8 0xb28d89e339ceca14, 0xe0f668eeb99f188d + data8 0x8ddfd80bc68c32ff, 0xb2c022ca12e55a16 + data8 0xe1362890eb663139, 0x8e00197e1e7c88fe + data8 0xb2e8c6852c6b03f1, 0xe1695c7212aecbaa + data8 0x8e207859f77e20e7, 0xb3118f4eda9fe40f + data8 0xe19cbf0391bbbbe9, 0x8e40f4ce60c9f8e2 + data8 0xb33a7d6268109ebe, 0xe1d050901c531e85 + data8 0x8e69ba46cf2fde4d, 0xb36ddbc5ea70ec55 + data8 0xe2110903b4f4047a, 0x8e8a7a00bd7ae63e + data8 0xb3971e9b39264023, 0xe2450559b4d80b6d + data8 0x8eab57ef1cf2f529, 0xb3c0877ecc18e24a + data8 0xe27931a231554ef3, 0x8ecc5442cffb1dad + data8 0xb3ea16ae3a6c905f, 0xe2ad8e2ac3c5b04b + data8 0x8eed6f2d2a4acbfe, 0xb413cc67aa0e4d2d + data8 0xe2e21b41b9694cce, 0x8f0ea8dff24441ff + data8 0xb43da8e9d163e1af, 0xe316d93615862714 + data8 0x8f385c95d696b817, 0xb47233773b84d425 + data8 0xe3590bd86a0d30f9, 0x8f59dc43edd930f3 + data8 0xb49c6825430fe730, 0xe38e38e38e38e38e + data8 0x8f7b7b5f5ffad1c4, 0xb4c6c46bcdb27dcf + data8 0xe3c397d1e6db7839, 0x8f9d3a1bea165f38 + data8 0xb4f1488c0b35d26f, 0xe3f928f5953feb9e + data8 0x8fbf18adc34b66da, 0xb51bf4c7c51f0168 + data8 0xe42eeca17c62886c, 0x8fe117499e356095 + data8 0xb546c9616087ab9c, 0xe464e32943446305 + data8 0x90033624aa685f8d, 0xb571c69bdffd9a70 + data8 0xe49b0ce15747a8a2, 0x9025757495f36b86 + data8 0xb59cecbae56984c3, 0xe4d16a1eee94e9d4 + data8 0x903f3a5dcc091203, 0xb5bd64512bb14bb7 + data8 0xe4fa52107353f67d, 0x9061b2fceb2bdbab + data8 0xb5e8d2a4bf5ba416, 0xe5310a471f4d2dc3 + data8 0x90844ca7211032a7, 0xb6146a9a1bc47819 + 
data8 0xe567f6f1c2b9c224, 0x90a7079403e6a15d + data8 0xb6402c7749d621c0, 0xe59f18689a9e4c9a + data8 0x90c9e3fbafd63799, 0xb66c1882fb435ea2 + data8 0xe5d66f04b8a68ecf, 0x90ece216c8a16ee4 + data8 0xb6982f048c999a56, 0xe60dfb2005c192e9 + data8 0x9110021e7b516f0a, 0xb6c47044075b4142 + data8 0xe645bd1544c7ea51, 0x912a708a39be9075 + data8 0xb6e5bd6bfd02bafd, 0xe66fb21b505b20a0 + data8 0x914dcc7b31146370, 0xb7124a2736ff8ef2 + data8 0xe6a7d32af4a7c59a, 0x91714af8cfe984d5 + data8 0xb73f026a01e94177, 0xe6e02b129c6a5ae4 + data8 0x918c00a6f3795e97, 0xb760a959f1d0a7a7 + data8 0xe70a9136a7403039, 0x91afbc299ed0295d + data8 0xb78dae7e06868ab0, 0xe74349fb2d92a589 + data8 0x91d39add3e958db0, 0xb7badff8ad9e4e02 + data8 0xe77c3a9c86ed7d42, 0x91ee9920a8974d92 + data8 0xb7dce25b8e17ae9f, 0xe7a713f88151518a + data8 0x9212b5fcac537c19, 0xb80a6226904045e2 + data8 0xe7e067453317ed2b, 0x9236f6b256923fcf + data8 0xb8380f1cafd73c1c, 0xe819f37a81871bb5 + data8 0x92523ee6f90dcfc3, 0xb85a6ea8e321b4d8 + data8 0xe8454236bfaeca14, 0x9276bef031e6eb79 + data8 0xb8886b684ae7d2fa, 0xe87f32f24c3fc90e + data8 0x929236ec237a24ad, 0xb8ab0726fa00cf5d + data8 0xe8aacd8688892ba6, 0x92b6f70b7efe9dc3 + data8 0xb8d954a4d13b7cb1, 0xe8e523fd32f606f7 + data8 0x92d29f61eec7dc2b, 0xb8fc2d4f6cd9f04a + data8 0xe9110b5311407927, 0x92f7a05d5b8ba92f + data8 0xb92acc851476b1ab, 0xe94bc8bf0c108fa3 + data8 0x931379a403be5c16, 0xb94de2d841a184c2 + data8 0xe977fdc439c2ca3c, 0x9338bc44de2e3f34 + data8 0xb97cd4c36c92693c, 0xe9b3236528fc349e + data8 0x9354c71412c69486, 0xb9a0297f172665e3 + data8 0xe9dfa70b745ac1b4, 0x937a4c273907e262 + data8 0xb9cf6f21e36c3924, 0xea1b36268d0eaa38 + data8 0x93968919f6e7975d, 0xb9f3030951267208 + data8 0xea480963fd394197, 0x93bc516fdd4680c9 + data8 0xba229d6a618e7c59, 0xea84034425f27484 + data8 0x93d8c123d9be59b2, 0xba467144459f9855 + data8 0xeab12713138dd1cc, 0x93f546c955e60076 + data8 0xba6a60c3c48f1a4b, 0xeade6db73a5e503b + data8 0x941b70a65879079f, 0xba9a76056b67ee7a + data8 0xeb1b0268343b121b, 
0x943829f337410591 + data8 0xbabea699563ada6e, 0xeb489b0b2bdb5f14 + data8 0x9454f995765bc4d2, 0xbae2f350b262cc4b + data8 0xeb765721e85f03d0, 0x947b86b57f5842ed + data8 0xbb1385a23be24e57, 0xebb389645f222f62 + data8 0x94988aeb23470f86, 0xbb3814975e17c680 + data8 0xebe198f090607e0c, 0x94b5a5dc9695f42a + data8 0xbb5cc031009bf467, 0xec0fcc9321024509 + data8 0x94d2d7a9170d8b42, 0xbb81889680024764 + data8 0xec3e247da8b82f61, 0x94f9e87dd78bf019 + data8 0xbbb2c0d8703ae95d, 0xec7c27d21321c9f7 + data8 0x95175019a503d89e, 0xbbd7cd09ba3c5463 + data8 0xecaad5278824e453, 0x9534cefa625fcb3a + data8 0xbbfcf68c4977718f, 0xecd9a76d097d4e77 + data8 0x955265405c491a25, 0xbc223d88cfc88eee + data8 0xed089ed5dcd99446, 0x9570130c1f9bb857 + data8 0xbc47a2284fee4ff8, 0xed37bb95add09a1c + data8 0x9597ca4119525184, 0xbc79ac0916ed7b8a + data8 0xed76c70508f904b6, 0x95b5af6fb5aa4d3c + data8 0xbc9f5670d1a13030, 0xeda63bb05e7f93c6 + data8 0x95d3ac9273aafd7a, 0xbcc51f068cb95c1d + data8 0xedd5d661daed2dc4, 0x95f1c1cafdfd3684 + data8 0xbceb05f4b30a9bc0, 0xee05974eef86b903 + data8 0x960fef3b430b8d5f, 0xbd110b6604c7d306 + data8 0xee357ead791fc670, 0x962e350575b409c5 + data8 0xbd372f8598620f19, 0xee658cb3c134a463 + data8 0x964c934c0dfc1708, 0xbd5d727edb6b3c7e + data8 0xee95c1987f080211, 0x966b0a31c9c6bc7d + data8 0xbd83d47d937bbc6d, 0xeec61d92d8c4314f + data8 0x968999d9ad8d264e, 0xbdaa55addf1ae47d + data8 0xeef6a0da64a014ac, 0x96a8426705198795 + data8 0xbdd0f63c36aa73f0, 0xef274ba72a07c811 + data8 0x96c703fd64445ee5, 0xbdf7b6556d550a15 + data8 0xef581e31a2c91260, 0x96e5dec0a7b4268d + data8 0xbe1e9626b1ffa96b, 0xef8918b2bc43aec6 + data8 0x9704d2d4f59f79f3, 0xbe4595dd903e5371 + data8 0xefba3b63d89d7cbf, 0x9723e05ebe91b9b0 + data8 0xbe6cb5a7f14bc935, 0xefeb867ecffaa607 + data8 0x97430782be323831, 0xbe93f5b41d047cf7 + data8 0xf01cfa3df1b9c9fa, 0x97624865fc0df8bf + data8 0xbebb5630bae4c15f, 0xf04e96dc05b43e2d + data8 0x9781a32dcc640b2a, 0xbee2d74cd30a430c + data8 0xf0805c944d827454, 0x97a117ffd0f48e46 + 
data8 0xbf0a7937cf38d981, 0xf0b24ba285c495cb + data8 0x97c0a701f9d263c9, 0xbf323c217be2bc8c + data8 0xf0e46442e76f6569, 0x97e0505a8637a036 + data8 0xbf5a203a09342bbb, 0xf116a6b2291d7896 + data8 0x97f57a9fb0b08c6e, 0xbf74cad1c14ebfc4 + data8 0xf1383fa9e9b5b381, 0x9815503365914a9d + data8 0xbf9ce6a497a89f78, 0xf16ac84f90083b9b + data8 0x98354085054fd204, 0xbfc52428bec6e72f + data8 0xf19d7b686dcb03d7, 0x98554bbbf8a77902 + data8 0xbfed838fddab024b, 0xf1d0593311db1757 + data8 0x987571fffb7f94f6, 0xc016050c0420981a + data8 0xf20361ee8f1c711e, 0x9895b3791dd03c23 + data8 0xc03ea8cfabddc330, 0xf23695da7de51d3f + data8 0x98ab43a5fc65d0c8, 0xc059d3cbd65ddbce + data8 0xf258d095e465cc35, 0x98cbb2d196bd713d + data8 0xc082b122a3c78c9d, 0xf28c4d0bfc982b34 + data8 0x98ec3d9ec7b6f21a, 0xc0abb1499ae736c4 + data8 0xf2bff55eb3f0ea71, 0x990ce436db5e8344 + data8 0xc0d4d474c3aedaaf, 0xf2f3c9cf9884636e + data8 0x9922b8218160967a, 0xc0f054ca33eb3437 + data8 0xf31670135ab9cc0f, 0x99438d686f75779d + data8 0xc119b2c67e600ed0, 0xf34a8e9f0b54cdfb + data8 0x99647eea131fa20b, 0xc1433453de2033ff + data8 0xf37ed9fa6b8add3f, 0x997a85045a47c6d0 + data8 0xc15ef3e44e10032d, 0xf3a1cfe884ef6bb6 + data8 0x999ba5f14f8add02, 0xc188b130431d80e6 + data8 0xf3d66689dcc8e8d3, 0x99bce38b5465ecae + data8 0xc1b2929d6067730e, 0xf40b2ab069d5c96a + data8 0x99d31ca0887f30f9, 0xc1ce9268f31cc734 + data8 0xf42e718b90c8bc16, 0x99f48a669c74c09e + data8 0xc1f8b0877c1b0c08, 0xf463822a0a3b4b00 + data8 0x9a16154eb445c873, 0xc222f35a87b415ba + data8 0xf498c1076015faf8, 0x9a2c822ec198d667 + data8 0xc23f3467349e5c88, 0xf4bc5a19a33990b5 + data8 0x9a4e3e080cd91b78, 0xc269b4e40e088c01 + data8 0xf4f1e6a7d6f5425f, 0x9a70177afe52322e + data8 0xc2945aac24daaf6e, 0xf527a232cf6be334 + data8 0x9a86b8fa94eebe10, 0xc2b0de05e43c1d66 + data8 0xf54b8ecdcda90851, 0x9aa8c42866ae2958 + data8 0xc2dbc275e1229d09, 0xf5819949c7ad87b4 + data8 0x9abf86f9e12fc45e, 0xc2f86fca9d80eeff + data8 0xf5a5bac9213b48a9, 0x9ae1c462fc05f49d + data8 0xc323938449a2587e, 
0xf5dc1501f324a812 + data8 0x9af8a8dc936b84d0, 0xc3406b40a538ed20 + data8 0xf6006bee86b5589e, 0x9b1b19033be35730 + data8 0xc36bcee8211d15e0, 0xf63716b2fa067fa4 + data8 0x9b3da7daf04c2892, 0xc397593adf2ba366 + data8 0xf66df22fb6132b9c, 0x9b54c2e4c8a9012b + data8 0xc3b475b6206155d5, 0xf6929fb98225deb1 + data8 0x9b77854e6c661200, 0xc3e0410243b97383 + data8 0xf6c9cd13021e3fea, 0x9b8ec2e678d56d2f + data8 0xc3fd890709833d37, 0xf6eeb177472cedae + data8 0x9ba60e6a5ca133b6, 0xc41ae295f7e7fa06 + data8 0xf713abf4cb0b3afb, 0x9bc919ea66a151a4 + data8 0xc44709f7bb8a4dd2, 0xf74b4d5333684ef1 + data8 0x9be0887c09ef82bb, 0xc4648fb0e0bec4c1 + data8 0xf7707f75a72f8e94, 0x9c03c8d5fffc3503 + data8 0xc490f9a94695ba14, 0xf7a874b97927af44 + data8 0x9c1b5ad21a81cbb9, 0xc4aeac0173b7d390 + data8 0xf7cddf140aedf1d8, 0x9c3ed09216e9ca02 + data8 0xc4db5941007aa853, 0xf806291bacb7f7a9 + data8 0x9c568656c0423def, 0xc4f938aec206291a + data8 0xf82bcc43b92eafef, 0x9c7a320af242ce60 + data8 0xc52629e899dfd622, 0xf8646bf0defb759e + data8 0x9c920bf7a8c01dc2, 0xc54436e44043b965 + data8 0xf88a487dfc3ff5f7, 0x9ca9f475d98b159c + data8 0xc562563abf9ea07f, 0xf8b03c2b46cdc17f + data8 0x9ccdeca60e80b5f8, 0xc58fa7d1dc42921c + data8 0xf8e95541c152ae7a, 0x9ce5f9d4653d4902 + data8 0xc5adf561b91e110a, 0xf90f832c2700c160 + data8 0x9cfe15cb38bfdd8e, 0xc5cc5591bdbd82fa + data8 0xf935c88e0c7f419b, 0x9d225b983f6c1f96 + data8 0xc5fa08f1ff20593c, 0xf96f5cd84fd86873 + data8 0x9d3a9cca32261ed7, 0xc618980a79ce6862 + data8 0xf995dd53ebdd9d6d, 0x9d52ecfccebe1768 + data8 0xc6373a09e34b50fa, 0xf9bc75a034436a41 + data8 0x9d77818d95b82f86, 0xc66550a6e0baaf35 + data8 0xf9f686f26d5518de, 0x9d8ff7893fa4706c + data8 0xc6842241926342c9, 0xfa1d5b39b910a8c5 + data8 0x9da87cbef36f2a5e, 0xc6a3070b7c93bb9e + data8 0xfa4447acc4ecbfd2, 0x9dcd6140b4a35aeb + data8 0xc6d18260bb84081b, 0xfa7ed7e51e6fdfb4 + data8 0x9de60cd06dc6e2d4, 0xc6f0977c9416828b + data8 0xfaa601394d49a1a0, 0x9dfec7d4cc43b76f + data8 0xc70fc0117c641630, 0xfacd431644ce0e40 + 
data8 0x9e17925ec9fccc4a, 0xc72efc34d7e615be + data8 0xfaf49d96f7a75909, 0x9e3cdf6db57dc075 + data8 0xc75dfb441594141e, 0xfb2fd3c65e562fd5 + data8 0x9e55d110b63637a8, 0xc77d68aa019bda4c + data8 0xfb576c5762024805, 0x9e6ed27594550d2e + data8 0xc79ce9ea478dbc4f, 0xfb7f1debc22c4040 + data8 0x9e87e3adc385d393, 0xc7bc7f1ae453219d + data8 0xfba6e89f32d0190a, 0x9ead9b54b37a1055 + data8 0xc7ec0476e15e141a, 0xfbe2c803a0894893 + data8 0x9ec6d46a3d7de215, 0xc80bcbe16f1d540f + data8 0xfc0ad1ff0ed9ecf0, 0x9ee01d9108be3154 + data8 0xc82ba78a5d349735, 0xfc32f57bdfbcbe7f + data8 0x9ef976db07288d04, 0xc84b978847a06b87 + data8 0xfc5b32968f99b21c, 0x9f12e05a4759ec25 + data8 0xc86b9bf1ee817bc6, 0xfc83896bc861ab08 + data8 0x9f2c5a20f4da6668, 0xc88bb4de3667cdf4 + data8 0xfcabfa1861ed4815, 0x9f52af78ed1733ca + data8 0xc8bc00e7fe9e23a3, 0xfce8d3cea7d3163e + data8 0x9f6c52426a39d003, 0xc8dc4d7ff2d25232 + data8 0xfd118595143ee273, 0x9f860593d42fd7f3 + data8 0xc8fcaeebcb40eb47, 0xfd3a519943d4865a + data8 0x9f9fc97fdb96bd51, 0xc91d25431426a663 + data8 0xfd6337f8e1ae5a4b, 0x9fb99e194f4a7037 + data8 0xc93db09d7fdb2949, 0xfd8c38d1c8e927eb + data8 0x9fd383731ca51db9, 0xc95e5112e721582a + data8 0xfdb5544205095a53, 0x9fed79a04fbf9423 + data8 0xc97f06bb49787677, 0xfdde8a67d2613531 + data8 0xa00780b413b24ee8, 0xc99fd1aecd6e1b06 + data8 0xfe07db619e781611, 0xa02eab2c4474b0cd + data8 0xc9d12a3e27bb1625, 0xfe460768d80bf758 + data8 0xa048dcd51ccfd142, 0xc9f22ad82ba3d5f0 + data8 0xfe6f9bfb06cd32f6, 0xa0631fa894b11b8d + data8 0xca134113105e67b2, 0xfe994bcd3d14fcc2 + data8 0xa07d73ba65e680af, 0xca346d07b045a876 + data8 0xfec316fecaf3f2ab, 0xa097d91e6aaf71b0 + data8 0xca55aecf0e94bb88, 0xfeecfdaf33fadb80 + data8 0xa0b24fe89e02602f, 0xca77068257be9bab + data8 0xff16fffe2fa8fad6, 0xa0ccd82d1bd2f68b + data8 0xca98743ae1c693a8, 0xff411e0ba9db886d + data8 0xa0e77200215909e6, 0xcab9f8122c99a101 + data8 0xff6b57f7c33e4e9a, 0xa1021d760d584855 + data8 0xcadb9221e268c3b5, 0xff95ade2d1bd7358 + data8 0xa11cdaa36068a57d, 
0xcafd4283d8043dfd + data8 0xffc01fed60f86fb5, 0xa137a99cbd3f880b + data8 0xcb1f09520d37c6fb, 0xffeaae3832b63956 +LOCAL_OBJECT_END(T_table) + + + + + + .section .text -.proc cbrt# -.align 32 -cbrt: - - -{ .mfi - // get significand - getf.sig r23=f8 - // will continue only for normal/denormal numbers - (p0) fclass.nm.unc p12,p0 = f8, 0x1b - // r2 = pointer to C_1,...,C_5 followed by T_table - addl r2 = @ltoff(poly_coeffs), gp +GLOBAL_LIBM_ENTRY(cbrt) + + +{.mfi + // get significand + getf.sig GR_SIGNIF = f8 + // normalize a + fma.s1 FR_ARG = f8, f1, f0 + // GR_GP = pointer to C_1,..., C_5 followed by T_table + addl GR_GP = @ltoff(poly_coeffs), gp ;; } + {.mfi - // get exponent - getf.exp r24=f8 - // normalize a - fma.s1 f14=f8,f1,f0 - // r29=bias-((2^{12}-1)/3) -63=0xffff-0x555-0x3f=0xfa6b - mov r29=0xfa6b;; + // get exponent + getf.exp GR_ARGEXP = f8 + // will continue only for normal/denormal numbers + fclass.m.unc p12, p13 = f8, 0x1e7 + // GR_CONST4 = bias-((2^{12}-1)/3)-63 = 0xffff-0x555-0x3f = 0xfa6b + mov GR_CONST4 = 0xfa6b ;; } + {.mlx - mov r25=0x20000 - // r28=2^52 - movl r28=0x8000000000000000;; -} -{.mfb - // load start address for C_1,...,C_5 followed by T_table - ld8 r3=[r2] - (p12) fma.d.s0 f8=f8,f1,f0 - (p12) br.ret.spnt b0 + mov GR_CONST2 = 0x20000 + // GR_CONST3 = 2^63 + movl GR_CONST3 = 0x8000000000000000 ;; } + +.pred.rel "mutex", p12, p13 {.mfi - nop.m 0 - // y=frcpa(a) - frcpa.s0 f8,p6=f1,f8 - // p7=1 if denormal input - cmp.gtu p7,p0=r28,r23;; + // load start address for C_1,..., C_5 followed by T_table + ld8 GR_ADDR = [ GR_GP ] + // y = frcpa(a) + (p13) frcpa.s0 f8, p0 = f1, f8 + // p7 = 1 if denormal input + cmp.gtu p7, p0 = GR_CONST3, GR_SIGNIF +} +{.mfb + nop.m 0 + // if argument is 0, +/-Infinity, NaN, or NaTVal, then return + (p12) fma.d.s0 f8 = f8, f1, f0 + (p12) br.ret.spnt b0 ;; } + {.mmi - // get exponent - (p7) getf.exp r24=f14 - // get normalized significand - (p7) getf.sig r23=f14 - // r28=bias-(2^{12}-1) - mov r28=0xf000;; + //
get exponent (for denormal input) + (p7) getf.exp GR_ARGEXP = FR_ARG + // get normalized significand (for denormal input) + (p7) getf.sig GR_SIGNIF = FR_ARG + // GR_CONST1 = bias-(2^{12}-1) + mov GR_CONST1 = 0xf000 ;; } + {.mii - // get r26=sign - and r26=r24,r25 - // eliminate leading 1 from r23=1st table index - shl r23=r23,1 - // eliminate sign from exponent (r25) - andcm r25=r24,r25;; + // get GR_SIGN = sign + and GR_SIGN = GR_ARGEXP, GR_CONST2 + // eliminate leading 1 from GR_I1 = 1st table index + shl GR_I1 = GR_SIGNIF, 1 + // eliminate sign from exponent + andcm GR_EXP = GR_ARGEXP, GR_CONST2 ;; } + {.mib - add r2=32,r3 - // r23=1st table index (y_index,8 bits) - shr.u r23=r23,56 - nop.b 0 + add GR_ADDR2 = 32, GR_ADDR + // GR_IT1 = 1st table index (y_index, 8 bits) + shr.u GR_IT1 = GR_I1, 56 + nop.b 0 } {.mib - // load C_1 - ldfe f7=[r3],16 - // subtract bias from r25=exponent - sub r25=r25,r28 - nop.b 0;; + // load C_1 + ldfe FR_COEFF1 = [ GR_ADDR ], 16 + // subtract bias from GR_EXPON = exponent + sub GR_EXPON = GR_EXP, GR_CONST1 + nop.b 0 ;; } + {.mib - // load C_2, C_3 - ldfpd f9,f10=[r3] - // 1: exponent*=5; // (2^{16}-1)/3=0x5555 - shladd r24=r25,2,r25 - nop.b 0 + // load C_2, C_3 + ldfpd FR_COEFF2, FR_COEFF3 = [ GR_ADDR ] + // 1: GR_TMP1 = 5*exponent; // (2^{16}-1)/3 = 0x5555 + shladd GR_TMP1 = GR_EXPON, 2, GR_EXPON + nop.b 0 } {.mib - // load C_4, C_5 - ldfpd f11,f12=[r2],16 - // r23=3*y_index - shladd r23=r23,1,r23 - nop.b 0;; + // load C_4, C_5 + ldfpd FR_COEFF4, FR_COEFF5 = [ GR_ADDR2 ], 16 + // GR_TMP2 = 3*y_index + shladd GR_TMP2 = GR_IT1, 1, GR_IT1 + nop.b 0 ;; } {.mfi - // r30=(5*expon)*16+5*expon=(0x55)*expon - shladd r30=r24,4,r24 - // r=1-a*y - (p6) fnma.s1 f6=f8,f14,f1 - // adjust T_table pointer by 1st index - shladd r2=r23,3,r2;; + // GR_TMP6 = (5*expon)*16+5*expon = (0x55)*expon + shladd GR_TMP6 = GR_TMP1, 4, GR_TMP1 + // r = 1-a*y + fnma.s1 FR_R = f8, FR_ARG, f1 + // adjust T_table pointer by 1st index + shladd GR_ITB1 = GR_TMP2, 3, GR_ADDR2
;; } {.mii - nop.m 0 - // r24=(0x5500)*expon - shl r24=r30,8;; - // r24=(0x5555)*expon - add r24=r24,r30;; + // eliminate leading 1 from significand + add GR_SIGNIF2 = GR_SIGNIF, GR_SIGNIF + // GR_TMP3 = (0x5500)*expon + shl GR_TMP3 = GR_TMP6, 8 ;; + // GR_TMP4 = (0x5555)*expon + add GR_TMP4 = GR_TMP3, GR_TMP6 ;; } + {.mii - // r24=(0x5556)*expon // 0x5556=(2^{16}+2)/3 - add r24=r24,r25 - nop.i 0;; - // r24=floor(expon/3) - shr r24=r24,16;; + // GR_TMP5 = (0x5556)*expon // 0x5556 = (2^{16}+2)/3 + add GR_TMP5 = GR_TMP4, GR_EXPON + nop.i 0 ;; + // GR_EXP_BY_3 = floor(expon/3) + shr GR_EXP_BY_3 = GR_TMP5, 16 ;; } + {.mfi - // r28=3*exponent - shladd r28=r24,1,r24 - // r2=r*r - (p6) fma.s1 f13=f6,f6,f0 - // bias exponent - add r24=r29,r24;; + // GR_TMP6 = 3*floor(expon/3) + shladd GR_TMP6 = GR_EXP_BY_3, 1, GR_EXP_BY_3 + // FR_R2 = r*r + fma.s1 FR_R2 = FR_R, FR_R, f0 + // bias exponent + add GR_EBIAS = GR_CONST4, GR_EXP_BY_3 ;; } + {.mfi - // get remainder of exponent/3 : r25-r28 - sub r25=r25,r28 - // c2+c3*r - (p6) fma.s1 f9=f10,f6,f9 - // add sign to exponent - or r24=r24,r26 + // get remainder of exponent/3 + sub GR_REM = GR_EXPON, GR_TMP6 + // c2+c3*r + fma.s1 FR_P23 = FR_COEFF3, FR_R, FR_COEFF2 + nop.i 0 } {.mfi - nop.m 0 - // c4+c5*r - (p6) fma.s1 f11=f12,f6,f11 - nop.i 0;; + // add sign to exponent + or GR_SEXP = GR_EBIAS, GR_SIGN + // c4+c5*r + fma.s1 FR_P45 = FR_COEFF5, FR_R, FR_COEFF4 + mov GR_TMP63 = 63+0xffff ;; } + {.mmi - // f14=sign*2^{exponent/3} - (p6) setf.exp f14=r24 - // adjust T_table pointer by 2nd index - shladd r2=r25,3,r2 - nop.i 0;; + // FR_2EXP = sign*2^{exponent/3} + setf.exp FR_2EXP = GR_SEXP + // adjust T_table pointer by 2nd index + shladd GR_INDEX = GR_REM, 3, GR_ITB1 + // is the argument of the form 2^(3*k) ?
+ // get (significand - leading 1) | (exponent mod 3) + or GR_TEST = GR_REM, GR_SIGNIF2 ;; } + {.mmi - // load T - (p6) ldf8 f8=[r2] - nop.m 0 - nop.i 0;; + // FR_TMP63 = 2^63 + setf.exp FR_TMP63 = GR_TMP63 + // load T + ldf8 f8 = [ GR_INDEX ] + // is the argument of the form 2^(3*k) ? + cmp.eq p14, p0 = GR_TEST, r0 ;; } {.mfi - nop.m 0 - // (c2+c3*r)+r^2*(c4+c5*r) - (p6) fma.s1 f9=f11,f13,f9 - nop.i 0 + nop.m 0 + // (c2+c3*r)+r^2*(c4+c5*r) + fma.s1 FR_P25 = FR_P45, FR_R2, FR_P23 + nop.i 0 } {.mfi - nop.m 0 - // c1*r - (p6) fma.s1 f7=f7,f6,f0 - nop.i 0;; + nop.m 0 + // c1*r + fma.s1 FR_P1 = FR_COEFF1, FR_R, f0 + nop.i 0 ;; +} + +{.mfb + nop.m 0 + (p14) fma.d.s0 f8 = FR_2EXP, FR_TMP63, f0 + (p14) br.ret.spnt b0 ;; } {.mfi - nop.m 0 - // P=c1*r+r^2*[(c2+c3*r)+r^2*(c4+c5*r)] - (p6) fma.s1 f9=f9,f13,f7 - nop.i 0 + nop.m 0 + // P = c1*r+r^2* [ (c2+c3*r)+r^2*(c4+c5*r) ] + fma.s1 FR_P15 = FR_P25, FR_R2, FR_P1 + nop.i 0 } {.mfi - nop.m 0 - // T'=T*(2^exp) - (p6) fma.s1 f8=f8,f14,f0 - nop.i 0;; + nop.m 0 + // T' = T*(2^exp) + fma.s1 f8 = f8, FR_2EXP, f0 + nop.i 0 ;; } + {.mfb - nop.m 0 - // result = T'-T'*P - (p6) fnma.d.s0 f8=f8,f9,f8 - br.ret.sptk b0;; + nop.m 0 + // result = T'+T'*P + fma.d.s0 f8 = f8, FR_P15, f8 + br.ret.sptk b0 ;; } -.endp cbrt -ASM_SIZE_DIRECTIVE(cbrt) + + +GLOBAL_LIBM_END(cbrt) diff --git a/sysdeps/ia64/fpu/s_cbrtf.S b/sysdeps/ia64/fpu/s_cbrtf.S index 20167797b8..c8c6500b25 100644 --- a/sysdeps/ia64/fpu/s_cbrtf.S +++ b/sysdeps/ia64/fpu/s_cbrtf.S @@ -1,11 +1,10 @@ -.file "cbrtf.asm" +.file "cbrtf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Shane Story, and Ping Tak Peter Tang -// of the Computational Software Lab, Intel Corporation.
+// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -21,27 +20,30 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. 
+// problem reports or change requests be submitted to it directly at +// http: //www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00: Initial version -// 5/18/00: New version (modified algorithm) +// 02/02/00 Initial version +// 05/18/00 New version (modified algorithm) +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/28/03 Rescheduled some instructions for better performance +// on Itanium 2, and reformatted // // API //============================================================== @@ -53,616 +55,710 @@ // // Implementation // -// cbrt(a) = cbrt(a y) / cbrt(y) -// = cbrt(1 - (1 - a y)) * 1/cbrt(y) +// Let y= frcpa(a), where a is the argument // -// where y = frcpa(a). +// cbrt(a)= cbrt(a*y)/cbrt(y) = cbrt(1 - (1-a*y)) * (1/cbrt(y)) // -// * cbrt(1 - (1 - a y)) is approximated by a degree-2 polynomial -// -// 1 - (1/3)*r - (1/9)*r^2 -// -// in r = 1 - a y. +// For all values of y, the 3 possible significands of 1/cbrt(y) +// are stored in a table (T0) to 64 bits of accuracy. (There are +// 3 possible significands because the exponent of y modulo 3 +// can be 0, 1, or 2.) // -// * The values 1/cbrt(y) are stored in a table of constants T0 -// to 64 bits of accuracy +// +// * cbrt(1 - (1-a*y)) is approximated by a degree-2 polynomial +// +// 1 - (1/3)*r - (1/9)*r^2 +// +// in r = 1-a*y. // // The table values are stored for three exponent values and are -// then multiplied by e/3 where e is the exponent of the input number. +// then multiplied by 2^(e/3) where e is the exponent of the input number. 
// This computation is carried out in parallel with the polynomial // evaluation: // -// T = 2^(e/3) * T0 +// T= 2^(e/3) * T0 //=============== -// input = x -// C = frcpa(x) -// r = 1 - C * x +// input= x +// C= frcpa(x) +// r= 1 - C * x // -// Special values +// Special values //============================================================== // Registers used //============================================================== -// f6-f15 -// r2, r23-r26, r28-r30 -// p6,p7,p8,p12 +// p6, p7, p8, p12 + + FR_R = f6 + FR_COEFF1 = f7 + FR_COEFF2 = f9 + FR_T0 = f10 + FR_T1 = f11 + FR_T2 = f12 + FR_2M63 = f13 + FR_ARG = f14 + FR_Y = f15 + + GR_GP = r2 + GR_ADDR = r2 + GR_TMP5 = r3 + GR_CONST = r8 + GR_TMP63 = r8 + GR_SIGN = r9 + GR_CT2 = r10 + GR_CT3 = r11 + GR_TMP4 = r14 + GR_EBIAS3 = r15 + GR_REM = r16 + GR_SEXP = r17 + GR_2P63 = r18 + GR_SIGNIF = r19 + GR_I1 = r20 + GR_EBIAS = r21 + GR_EXP = r22 + GR_IT1 = r23 + GR_E5 = r24 + GR_IT1_3 = r25 + GR_TP1 = r26 + GR_TMP = r27 + GR_TMP2 = r28 + GR_TMP3 = r29 + GR_EXP3 = r30 + GR_ARGEXP = r31 + + -#include "libm_support.h" // Data tables //============================================================== -#ifdef _LIBC -.rodata -#else -.data -#endif +RODATA .align 16 -poly_coeffs: -ASM_TYPE_DIRECTIVE(poly_coeffs,@object) -data8 0xaaaaaaaaaaaaaaab, 0x00003ffd // 1/3 -data8 0xe38e38e38e38e38e, 0x00003ffb // 1/9 -ASM_SIZE_DIRECTIVE(poly_coeffs) - - -T_table: -ASM_TYPE_DIRECTIVE(T_table,@object) - -data8 0x80155c748c374836, 0xa160019ed37fb4ae -data8 0xcb51ddcb9e93095e, 0x8040404b0879f7f9 -data8 0xa1960b5966da4608, 0xcb95f333968ad59b -data8 0x806b5dce4b405c10, 0xa1cc5dbe6dc2aab4 -data8 0xcbda64292d3ffd97, 0x8096b586974669b1 -data8 0xa202f97995b69c0d, 0xcc1f3184af961596 -data8 0x80bcd273d952a028, 0xa232fe6eb0c0577d -data8 0xcc5bb1ac954d33e2, 0x80e898c52813f2f3 -data8 0xa26a2582012f6e17, 0xcca12e9831fc6402 -data8 0x81149add67c2d208, 0xa2a197e5d10465cb -data8 0xcce70a67b64f24ad, 0x813b4e2c856b6e9a -data8 0xa2d25a532efefbc8, 
0xcd24794726477ea5 -data8 0x8167c1dde03de7aa, 0xa30a5bd6e49e4ab8 -data8 0xcd6b096a0b70ee87, 0x818ed973b811135e -data8 0xa33b9c9b59879e24, 0xcda9177738b15a90 -data8 0x81bbc0c33e13ec98, 0xa3742fca6a3c1f21 -data8 0xcdf05f2247dffab9, 0x81e33e69fbe7504a -data8 0xa3a5f1273887bf22, 0xce2f0f347f96f906 -data8 0x820aec524e3c23e9, 0xa3d7ef508ff11574 -data8 0xce6e0be0cd551a61, 0x823880f78e70b805 -data8 0xa4115ce30548bc15, 0xceb666b2c347d1de -data8 0x826097a62a8e5200, 0xa443df0e53df577a -data8 0xcef609b0cb874f00, 0x8288dfe00e9b5eaf -data8 0xa4769fa5913c0ec3, 0xcf35fb5447e5c765 -data8 0x82b15a10c5371624, 0xa4a99f303bc7def5 -data8 0xcf763c47ee869f00, 0x82da06a527b18937 -data8 0xa4dcde37779adf4b, 0xcfb6cd3888d71785 -data8 0x8302e60b635ab394, 0xa5105d46152c938a -data8 0xcff7aed4fbfbb447, 0x832bf8b2feec2f0e -data8 0xa5441ce89825cb8d, 0xd038e1ce5167e3c6 -data8 0x83553f0ce00e276b, 0xa5781dad3e54d899 -data8 0xd07a66d7bfa0ebba, 0x837eb98b50f8322a -data8 0xa5ac602406c4e68c, 0xd0bc3ea6b32d1b21 -data8 0x83a270f44c84f699, 0xa5d9601d95c2c0bc -data8 0xd0f4f0e8f36c1bf8, 0x83cc4d7cfcfac5ca -data8 0xa60e1e1a2de14745, 0xd1376458e34b037e -data8 0x83f65f78a8872b4c, 0xa6431f6e3fbd9658 -data8 0xd17a2ca133f78572, 0x8420a75f2f7b53c8 -data8 0xa67864b0d432fda4, 0xd1bd4a80301c5715 -data8 0x844510461ff14209, 0xa6a6444aa0243c0b -data8 0xd1f71682b2fa4575, 0x846fbd91b930bed2 -data8 0xa6dc094d10f25792, 0xd23ad555f773f059 -data8 0x84947e18234f3294, 0xa70a574cc02bba69 -data8 0xd2752c7039a5bf73, 0x84bf92755825045a -data8 0xa7409e2af9549084, 0xd2b98ee008c06b59 -data8 0x84e4ac0ee112ba51, 0xa76f5c64ca2cf13b -data8 0xd2f4735ffd700280, 0x8509ef44b86f20be -data8 0xa79e4f0babab5dc0, 0xd32f99ed6d9ac0e1 -data8 0x85359d5d91768427, 0xa7d5579ae5164b85 -data8 0xd374f0666c75d51c, 0x855b3bd5b7384357 -data8 0xa804bd3c6fe61cc8, 0xd3b0a7d13618e4a1 -data8 0x858104f0c415f79a, 0xa8345895e5250a5a -data8 0xd3eca2ea53bcec0c, 0x85a6f90390d29864 -data8 0xa8642a122b44ef0b, 0xd428e23874f13a17 -data8 0x85d3772fcd56a1dd, 0xa89c38ca18f6108b 
-data8 0xd46f82fe293bc6d3, 0x85f9c982fcc002f3 -data8 0xa8cc81063b6e87ca, 0xd4ac57e9b7186420 -data8 0x862047e0e7ea554b, 0xa8fd00bfa409285e -data8 0xd4e972becb04e8b8, 0x8646f2a26f7f5852 -data8 0xa92db8664d5516da, 0xd526d40a7a9b43a3 -data8 0x866dca21754096b5, 0xa95ea86b75cc2c20 -data8 0xd5647c5b73917370, 0x8694ceb8dfd17a37 -data8 0xa98fd141a4992deb, 0xd5a26c4201bd6d13 -data8 0x86bc00c49e9307e8, 0xa9c1335cae7446ba -data8 0xd5e0a45015350a7e, 0x86dccd74fce79610 -data8 0xa9ea8686f556f645, 0xd614b539c6194104 -data8 0x870453c845acf90f, 0xaa1c52d17906bb19 -data8 0xd6537310e224283f, 0x872c089a1e90342c -data8 0xaa4e59b046dab887, 0xd6927ab62244c917 -data8 0x8753ec4a92d16c5e, 0xaa809b9c60d1890b -data8 0xd6d1ccc1fc4ef4b7, 0x877bff3aca19f6b4 -data8 0xaab319102f3f9b33, 0xd71169cea98fdded -data8 0x879d88b6fe1c324c, 0xaadd5a18c1e21274 -data8 0xd746a66a5bc9f6d9, 0x87c5f346dbf98c3a -data8 0xab1045f2ac31bdf5, 0xd786ce8f0fae5317 -data8 0x87e7c653efacef2c, 0xab3ae3ab2df7231e -data8 0xd7bc7ff214c4e75a, 0x881089d4e73ffefc -data8 0xab6e3f945d1e96fc, 0xd7fd35467a517ed1 -data8 0x88397e6a366f2a8a, 0xaba1d953a08fa94e -data8 0xd83e38838648d815, 0x885bc559e5e1c081 -data8 0xabcd090db7ef4c3f, 0xd874a1db598b8951 -data8 0x887e2ee392bb7a93, 0xabf864602d7c323d -data8 0xd8ab42205b80edaf, 0x88a7a8587e404257 -data8 0xac2ca5886ccf9b57, 0xd8ed1849d202f965 -data8 0x88ca5eda67594784, 0xac5861d4aa441f0f -data8 0xd92432bd5a173685, 0x88f4356166bd590e -data8 0xac8d183fe3a2fbed, 0xd9669ca45b03c23e -data8 0x89173a0acf5ce026, 0xacb93703ff51571e -data8 0xd99e3327cf89574e, 0x893a62a098b6a57b -data8 0xace5830ad0c3f14b, 0xd9d602b19b100466 -data8 0x895daf637236ae2c, 0xad11fca5d78b3ff2 -data8 0xda0e0ba86c096841, 0x89883b9d1c2fa9c5 -data8 0xad4797fddf91a798, 0xda5195fcdb1c3dce -data8 0x89abd8dd374a5d7b, 0xad747701e559ebcb -data8 0xda8a1eb87a491f6c, 0x89cf9b1dcd197fa0 -data8 0xada184a47e9c7613, 0xdac2e230b91c3f84 -data8 0x89f382a258ea79de, 0xadcec13ab0dda8ff -data8 0xdafbe0d0b66aea30, 0x8a178faf06648f29 -data8 
0xadfc2d1a5fd21ba8, 0xdb351b04a8fafced -data8 0x8a3bc288b3e1d18a, 0xae29c89a5053c33a -data8 0xdb6e9139e33cdd8e, 0x8a601b74f4d1f835 -data8 0xae5794122b638df9, 0xdba843ded7151ea1 -data8 0x8a849aba14274764, 0xae858fda8137ae0a -data8 0xdbe2336319b61fc8, 0x8aa9409f16cdbc9b -data8 0xaeb3bc4ccc56d3d1, 0xdc1c60376789fa68 -data8 0x8ace0d6bbe2cb316, 0xaee219c374c09920 -data8 0xdc56cacda82d0cd5, 0x8af301688ab33558 -data8 0xaf10a899d3235fe7, 0xdc917398f2797814 -data8 0x8b181cdebe6f3206, 0xaf3f692c341fe8b4 -data8 0xdccc5b0d90a3e628, 0x8b3d60185fafcb7c -data8 0xaf6e5bd7db9ae6c2, 0xdd0781a10469f0f2 -data8 0x8b62cb603bb2fad0, 0xaf9d80fb081cd91b -data8 0xdd42e7ca0b52838f, 0x8b80d7d6bc4104de -data8 0xafc35ce063eb3787, 0xdd729ad01c69114d -data8 0x8ba68bf73ac74f39, 0xaff2ddcb5f28f03d -data8 0xddae749c001fbf5e, 0x8bcc68fb9f9f7335 -data8 0xb022923b148e05c5, 0xddea8f50a51c69b1 -data8 0x8bf26f31c534fca2, 0xb0527a919adbf58b -data8 0xde26eb69a0f0f111, 0x8c10f86e13a1a1f9 -data8 0xb078f3ab1d701c65, 0xde576480262399bc -data8 0x8c3749916cc6abb5, 0xb0a93a6870649f31 -data8 0xde943789645933c8, 0x8c5dc4c4f7706032 -data8 0xb0d9b624d62ec856, 0xded14d58139a28af -data8 0x8c7cac3a8c42e3e0, 0xb100a5f53fb3c8e1 -data8 0xdf025c00bbf2b5c7, 0x8ca373f1b7bf2716 -data8 0xb131821882f5540a, 0xdf3feb44d723a713 -data8 0x8cc29907fb951294, 0xb158bf8e4cb04055 -data8 0xdf715bc16c159be0, 0x8ce9ae4e9492aac8 -data8 0xb189fd69d56b238f, 0xdfaf66240e29cda8 -data8 0x8d0911dddbfdad0e, 0xb1b189958e8108e4 -data8 0xdfe139cbf6e19bdc, 0x8d3075c4f20f04ee -data8 0xb1e32a8165b09832, 0xe01fc0fe94d9fc52 -data8 0x8d5018a9d4de77d5, 0xb20b0678fc271eec -data8 0xe051f92ffcc0bd60, 0x8d77cc47dd143515 -data8 0xb23d0bd3f7592b6e, 0xe090feec9c9a06ac -data8 0x8d97af6352739cb7, 0xb26538b2db8420dc -data8 0xe0c39d0c9ff862d6, 0x8db7af523167800f -data8 0xb28d89e339ceca14, 0xe0f668eeb99f188d -data8 0x8ddfd80bc68c32ff, 0xb2c022ca12e55a16 -data8 0xe1362890eb663139, 0x8e00197e1e7c88fe -data8 0xb2e8c6852c6b03f1, 0xe1695c7212aecbaa -data8 0x8e207859f77e20e7, 
0xb3118f4eda9fe40f -data8 0xe19cbf0391bbbbe9, 0x8e40f4ce60c9f8e2 -data8 0xb33a7d6268109ebe, 0xe1d050901c531e85 -data8 0x8e69ba46cf2fde4d, 0xb36ddbc5ea70ec55 -data8 0xe2110903b4f4047a, 0x8e8a7a00bd7ae63e -data8 0xb3971e9b39264023, 0xe2450559b4d80b6d -data8 0x8eab57ef1cf2f529, 0xb3c0877ecc18e24a -data8 0xe27931a231554ef3, 0x8ecc5442cffb1dad -data8 0xb3ea16ae3a6c905f, 0xe2ad8e2ac3c5b04b -data8 0x8eed6f2d2a4acbfe, 0xb413cc67aa0e4d2d -data8 0xe2e21b41b9694cce, 0x8f0ea8dff24441ff -data8 0xb43da8e9d163e1af, 0xe316d93615862714 -data8 0x8f385c95d696b817, 0xb47233773b84d425 -data8 0xe3590bd86a0d30f9, 0x8f59dc43edd930f3 -data8 0xb49c6825430fe730, 0xe38e38e38e38e38e -data8 0x8f7b7b5f5ffad1c4, 0xb4c6c46bcdb27dcf -data8 0xe3c397d1e6db7839, 0x8f9d3a1bea165f38 -data8 0xb4f1488c0b35d26f, 0xe3f928f5953feb9e -data8 0x8fbf18adc34b66da, 0xb51bf4c7c51f0168 -data8 0xe42eeca17c62886c, 0x8fe117499e356095 -data8 0xb546c9616087ab9c, 0xe464e32943446305 -data8 0x90033624aa685f8d, 0xb571c69bdffd9a70 -data8 0xe49b0ce15747a8a2, 0x9025757495f36b86 -data8 0xb59cecbae56984c3, 0xe4d16a1eee94e9d4 -data8 0x903f3a5dcc091203, 0xb5bd64512bb14bb7 -data8 0xe4fa52107353f67d, 0x9061b2fceb2bdbab -data8 0xb5e8d2a4bf5ba416, 0xe5310a471f4d2dc3 -data8 0x90844ca7211032a7, 0xb6146a9a1bc47819 -data8 0xe567f6f1c2b9c224, 0x90a7079403e6a15d -data8 0xb6402c7749d621c0, 0xe59f18689a9e4c9a -data8 0x90c9e3fbafd63799, 0xb66c1882fb435ea2 -data8 0xe5d66f04b8a68ecf, 0x90ece216c8a16ee4 -data8 0xb6982f048c999a56, 0xe60dfb2005c192e9 -data8 0x9110021e7b516f0a, 0xb6c47044075b4142 -data8 0xe645bd1544c7ea51, 0x912a708a39be9075 -data8 0xb6e5bd6bfd02bafd, 0xe66fb21b505b20a0 -data8 0x914dcc7b31146370, 0xb7124a2736ff8ef2 -data8 0xe6a7d32af4a7c59a, 0x91714af8cfe984d5 -data8 0xb73f026a01e94177, 0xe6e02b129c6a5ae4 -data8 0x918c00a6f3795e97, 0xb760a959f1d0a7a7 -data8 0xe70a9136a7403039, 0x91afbc299ed0295d -data8 0xb78dae7e06868ab0, 0xe74349fb2d92a589 -data8 0x91d39add3e958db0, 0xb7badff8ad9e4e02 -data8 0xe77c3a9c86ed7d42, 0x91ee9920a8974d92 
-data8 0xb7dce25b8e17ae9f, 0xe7a713f88151518a -data8 0x9212b5fcac537c19, 0xb80a6226904045e2 -data8 0xe7e067453317ed2b, 0x9236f6b256923fcf -data8 0xb8380f1cafd73c1c, 0xe819f37a81871bb5 -data8 0x92523ee6f90dcfc3, 0xb85a6ea8e321b4d8 -data8 0xe8454236bfaeca14, 0x9276bef031e6eb79 -data8 0xb8886b684ae7d2fa, 0xe87f32f24c3fc90e -data8 0x929236ec237a24ad, 0xb8ab0726fa00cf5d -data8 0xe8aacd8688892ba6, 0x92b6f70b7efe9dc3 -data8 0xb8d954a4d13b7cb1, 0xe8e523fd32f606f7 -data8 0x92d29f61eec7dc2b, 0xb8fc2d4f6cd9f04a -data8 0xe9110b5311407927, 0x92f7a05d5b8ba92f -data8 0xb92acc851476b1ab, 0xe94bc8bf0c108fa3 -data8 0x931379a403be5c16, 0xb94de2d841a184c2 -data8 0xe977fdc439c2ca3c, 0x9338bc44de2e3f34 -data8 0xb97cd4c36c92693c, 0xe9b3236528fc349e -data8 0x9354c71412c69486, 0xb9a0297f172665e3 -data8 0xe9dfa70b745ac1b4, 0x937a4c273907e262 -data8 0xb9cf6f21e36c3924, 0xea1b36268d0eaa38 -data8 0x93968919f6e7975d, 0xb9f3030951267208 -data8 0xea480963fd394197, 0x93bc516fdd4680c9 -data8 0xba229d6a618e7c59, 0xea84034425f27484 -data8 0x93d8c123d9be59b2, 0xba467144459f9855 -data8 0xeab12713138dd1cc, 0x93f546c955e60076 -data8 0xba6a60c3c48f1a4b, 0xeade6db73a5e503b -data8 0x941b70a65879079f, 0xba9a76056b67ee7a -data8 0xeb1b0268343b121b, 0x943829f337410591 -data8 0xbabea699563ada6e, 0xeb489b0b2bdb5f14 -data8 0x9454f995765bc4d2, 0xbae2f350b262cc4b -data8 0xeb765721e85f03d0, 0x947b86b57f5842ed -data8 0xbb1385a23be24e57, 0xebb389645f222f62 -data8 0x94988aeb23470f86, 0xbb3814975e17c680 -data8 0xebe198f090607e0c, 0x94b5a5dc9695f42a -data8 0xbb5cc031009bf467, 0xec0fcc9321024509 -data8 0x94d2d7a9170d8b42, 0xbb81889680024764 -data8 0xec3e247da8b82f61, 0x94f9e87dd78bf019 -data8 0xbbb2c0d8703ae95d, 0xec7c27d21321c9f7 -data8 0x95175019a503d89e, 0xbbd7cd09ba3c5463 -data8 0xecaad5278824e453, 0x9534cefa625fcb3a -data8 0xbbfcf68c4977718f, 0xecd9a76d097d4e77 -data8 0x955265405c491a25, 0xbc223d88cfc88eee -data8 0xed089ed5dcd99446, 0x9570130c1f9bb857 -data8 0xbc47a2284fee4ff8, 0xed37bb95add09a1c -data8 
0x9597ca4119525184, 0xbc79ac0916ed7b8a -data8 0xed76c70508f904b6, 0x95b5af6fb5aa4d3c -data8 0xbc9f5670d1a13030, 0xeda63bb05e7f93c6 -data8 0x95d3ac9273aafd7a, 0xbcc51f068cb95c1d -data8 0xedd5d661daed2dc4, 0x95f1c1cafdfd3684 -data8 0xbceb05f4b30a9bc0, 0xee05974eef86b903 -data8 0x960fef3b430b8d5f, 0xbd110b6604c7d306 -data8 0xee357ead791fc670, 0x962e350575b409c5 -data8 0xbd372f8598620f19, 0xee658cb3c134a463 -data8 0x964c934c0dfc1708, 0xbd5d727edb6b3c7e -data8 0xee95c1987f080211, 0x966b0a31c9c6bc7d -data8 0xbd83d47d937bbc6d, 0xeec61d92d8c4314f -data8 0x968999d9ad8d264e, 0xbdaa55addf1ae47d -data8 0xeef6a0da64a014ac, 0x96a8426705198795 -data8 0xbdd0f63c36aa73f0, 0xef274ba72a07c811 -data8 0x96c703fd64445ee5, 0xbdf7b6556d550a15 -data8 0xef581e31a2c91260, 0x96e5dec0a7b4268d -data8 0xbe1e9626b1ffa96b, 0xef8918b2bc43aec6 -data8 0x9704d2d4f59f79f3, 0xbe4595dd903e5371 -data8 0xefba3b63d89d7cbf, 0x9723e05ebe91b9b0 -data8 0xbe6cb5a7f14bc935, 0xefeb867ecffaa607 -data8 0x97430782be323831, 0xbe93f5b41d047cf7 -data8 0xf01cfa3df1b9c9fa, 0x97624865fc0df8bf -data8 0xbebb5630bae4c15f, 0xf04e96dc05b43e2d -data8 0x9781a32dcc640b2a, 0xbee2d74cd30a430c -data8 0xf0805c944d827454, 0x97a117ffd0f48e46 -data8 0xbf0a7937cf38d981, 0xf0b24ba285c495cb -data8 0x97c0a701f9d263c9, 0xbf323c217be2bc8c -data8 0xf0e46442e76f6569, 0x97e0505a8637a036 -data8 0xbf5a203a09342bbb, 0xf116a6b2291d7896 -data8 0x97f57a9fb0b08c6e, 0xbf74cad1c14ebfc4 -data8 0xf1383fa9e9b5b381, 0x9815503365914a9d -data8 0xbf9ce6a497a89f78, 0xf16ac84f90083b9b -data8 0x98354085054fd204, 0xbfc52428bec6e72f -data8 0xf19d7b686dcb03d7, 0x98554bbbf8a77902 -data8 0xbfed838fddab024b, 0xf1d0593311db1757 -data8 0x987571fffb7f94f6, 0xc016050c0420981a -data8 0xf20361ee8f1c711e, 0x9895b3791dd03c23 -data8 0xc03ea8cfabddc330, 0xf23695da7de51d3f -data8 0x98ab43a5fc65d0c8, 0xc059d3cbd65ddbce -data8 0xf258d095e465cc35, 0x98cbb2d196bd713d -data8 0xc082b122a3c78c9d, 0xf28c4d0bfc982b34 -data8 0x98ec3d9ec7b6f21a, 0xc0abb1499ae736c4 -data8 0xf2bff55eb3f0ea71, 
0x990ce436db5e8344 -data8 0xc0d4d474c3aedaaf, 0xf2f3c9cf9884636e -data8 0x9922b8218160967a, 0xc0f054ca33eb3437 -data8 0xf31670135ab9cc0f, 0x99438d686f75779d -data8 0xc119b2c67e600ed0, 0xf34a8e9f0b54cdfb -data8 0x99647eea131fa20b, 0xc1433453de2033ff -data8 0xf37ed9fa6b8add3f, 0x997a85045a47c6d0 -data8 0xc15ef3e44e10032d, 0xf3a1cfe884ef6bb6 -data8 0x999ba5f14f8add02, 0xc188b130431d80e6 -data8 0xf3d66689dcc8e8d3, 0x99bce38b5465ecae -data8 0xc1b2929d6067730e, 0xf40b2ab069d5c96a -data8 0x99d31ca0887f30f9, 0xc1ce9268f31cc734 -data8 0xf42e718b90c8bc16, 0x99f48a669c74c09e -data8 0xc1f8b0877c1b0c08, 0xf463822a0a3b4b00 -data8 0x9a16154eb445c873, 0xc222f35a87b415ba -data8 0xf498c1076015faf8, 0x9a2c822ec198d667 -data8 0xc23f3467349e5c88, 0xf4bc5a19a33990b5 -data8 0x9a4e3e080cd91b78, 0xc269b4e40e088c01 -data8 0xf4f1e6a7d6f5425f, 0x9a70177afe52322e -data8 0xc2945aac24daaf6e, 0xf527a232cf6be334 -data8 0x9a86b8fa94eebe10, 0xc2b0de05e43c1d66 -data8 0xf54b8ecdcda90851, 0x9aa8c42866ae2958 -data8 0xc2dbc275e1229d09, 0xf5819949c7ad87b4 -data8 0x9abf86f9e12fc45e, 0xc2f86fca9d80eeff -data8 0xf5a5bac9213b48a9, 0x9ae1c462fc05f49d -data8 0xc323938449a2587e, 0xf5dc1501f324a812 -data8 0x9af8a8dc936b84d0, 0xc3406b40a538ed20 -data8 0xf6006bee86b5589e, 0x9b1b19033be35730 -data8 0xc36bcee8211d15e0, 0xf63716b2fa067fa4 -data8 0x9b3da7daf04c2892, 0xc397593adf2ba366 -data8 0xf66df22fb6132b9c, 0x9b54c2e4c8a9012b -data8 0xc3b475b6206155d5, 0xf6929fb98225deb1 -data8 0x9b77854e6c661200, 0xc3e0410243b97383 -data8 0xf6c9cd13021e3fea, 0x9b8ec2e678d56d2f -data8 0xc3fd890709833d37, 0xf6eeb177472cedae -data8 0x9ba60e6a5ca133b6, 0xc41ae295f7e7fa06 -data8 0xf713abf4cb0b3afb, 0x9bc919ea66a151a4 -data8 0xc44709f7bb8a4dd2, 0xf74b4d5333684ef1 -data8 0x9be0887c09ef82bb, 0xc4648fb0e0bec4c1 -data8 0xf7707f75a72f8e94, 0x9c03c8d5fffc3503 -data8 0xc490f9a94695ba14, 0xf7a874b97927af44 -data8 0x9c1b5ad21a81cbb9, 0xc4aeac0173b7d390 -data8 0xf7cddf140aedf1d8, 0x9c3ed09216e9ca02 -data8 0xc4db5941007aa853, 0xf806291bacb7f7a9 
-data8 0x9c568656c0423def, 0xc4f938aec206291a -data8 0xf82bcc43b92eafef, 0x9c7a320af242ce60 -data8 0xc52629e899dfd622, 0xf8646bf0defb759e -data8 0x9c920bf7a8c01dc2, 0xc54436e44043b965 -data8 0xf88a487dfc3ff5f7, 0x9ca9f475d98b159c -data8 0xc562563abf9ea07f, 0xf8b03c2b46cdc17f -data8 0x9ccdeca60e80b5f8, 0xc58fa7d1dc42921c -data8 0xf8e95541c152ae7a, 0x9ce5f9d4653d4902 -data8 0xc5adf561b91e110a, 0xf90f832c2700c160 -data8 0x9cfe15cb38bfdd8e, 0xc5cc5591bdbd82fa -data8 0xf935c88e0c7f419b, 0x9d225b983f6c1f96 -data8 0xc5fa08f1ff20593c, 0xf96f5cd84fd86873 -data8 0x9d3a9cca32261ed7, 0xc618980a79ce6862 -data8 0xf995dd53ebdd9d6d, 0x9d52ecfccebe1768 -data8 0xc6373a09e34b50fa, 0xf9bc75a034436a41 -data8 0x9d77818d95b82f86, 0xc66550a6e0baaf35 -data8 0xf9f686f26d5518de, 0x9d8ff7893fa4706c -data8 0xc6842241926342c9, 0xfa1d5b39b910a8c5 -data8 0x9da87cbef36f2a5e, 0xc6a3070b7c93bb9e -data8 0xfa4447acc4ecbfd2, 0x9dcd6140b4a35aeb -data8 0xc6d18260bb84081b, 0xfa7ed7e51e6fdfb4 -data8 0x9de60cd06dc6e2d4, 0xc6f0977c9416828b -data8 0xfaa601394d49a1a0, 0x9dfec7d4cc43b76f -data8 0xc70fc0117c641630, 0xfacd431644ce0e40 -data8 0x9e17925ec9fccc4a, 0xc72efc34d7e615be -data8 0xfaf49d96f7a75909, 0x9e3cdf6db57dc075 -data8 0xc75dfb441594141e, 0xfb2fd3c65e562fd5 -data8 0x9e55d110b63637a8, 0xc77d68aa019bda4c -data8 0xfb576c5762024805, 0x9e6ed27594550d2e -data8 0xc79ce9ea478dbc4f, 0xfb7f1debc22c4040 -data8 0x9e87e3adc385d393, 0xc7bc7f1ae453219d -data8 0xfba6e89f32d0190a, 0x9ead9b54b37a1055 -data8 0xc7ec0476e15e141a, 0xfbe2c803a0894893 -data8 0x9ec6d46a3d7de215, 0xc80bcbe16f1d540f -data8 0xfc0ad1ff0ed9ecf0, 0x9ee01d9108be3154 -data8 0xc82ba78a5d349735, 0xfc32f57bdfbcbe7f -data8 0x9ef976db07288d04, 0xc84b978847a06b87 -data8 0xfc5b32968f99b21c, 0x9f12e05a4759ec25 -data8 0xc86b9bf1ee817bc6, 0xfc83896bc861ab08 -data8 0x9f2c5a20f4da6668, 0xc88bb4de3667cdf4 -data8 0xfcabfa1861ed4815, 0x9f52af78ed1733ca -data8 0xc8bc00e7fe9e23a3, 0xfce8d3cea7d3163e -data8 0x9f6c52426a39d003, 0xc8dc4d7ff2d25232 -data8 
0xfd118595143ee273, 0x9f860593d42fd7f3 -data8 0xc8fcaeebcb40eb47, 0xfd3a519943d4865a -data8 0x9f9fc97fdb96bd51, 0xc91d25431426a663 -data8 0xfd6337f8e1ae5a4b, 0x9fb99e194f4a7037 -data8 0xc93db09d7fdb2949, 0xfd8c38d1c8e927eb -data8 0x9fd383731ca51db9, 0xc95e5112e721582a -data8 0xfdb5544205095a53, 0x9fed79a04fbf9423 -data8 0xc97f06bb49787677, 0xfdde8a67d2613531 -data8 0xa00780b413b24ee8, 0xc99fd1aecd6e1b06 -data8 0xfe07db619e781611, 0xa02eab2c4474b0cd -data8 0xc9d12a3e27bb1625, 0xfe460768d80bf758 -data8 0xa048dcd51ccfd142, 0xc9f22ad82ba3d5f0 -data8 0xfe6f9bfb06cd32f6, 0xa0631fa894b11b8d -data8 0xca134113105e67b2, 0xfe994bcd3d14fcc2 -data8 0xa07d73ba65e680af, 0xca346d07b045a876 -data8 0xfec316fecaf3f2ab, 0xa097d91e6aaf71b0 -data8 0xca55aecf0e94bb88, 0xfeecfdaf33fadb80 -data8 0xa0b24fe89e02602f, 0xca77068257be9bab -data8 0xff16fffe2fa8fad6, 0xa0ccd82d1bd2f68b -data8 0xca98743ae1c693a8, 0xff411e0ba9db886d -data8 0xa0e77200215909e6, 0xcab9f8122c99a101 -data8 0xff6b57f7c33e4e9a, 0xa1021d760d584855 -data8 0xcadb9221e268c3b5, 0xff95ade2d1bd7358 -data8 0xa11cdaa36068a57d, 0xcafd4283d8043dfd -data8 0xffc01fed60f86fb5, 0xa137a99cbd3f880b -data8 0xcb1f09520d37c6fb, 0xffeaae3832b63956 -ASM_SIZE_DIRECTIVE(T_table) - - - - - - -.align 32 -.global cbrtf# +LOCAL_OBJECT_START(poly_coeffs) + + data8 0xaaaab19b7e1f5ef9, 0x00003ffd // ~ 1/3 + data8 0xe38e5192a5a8e56c, 0x00003ffb // ~ 1/9 +LOCAL_OBJECT_END(poly_coeffs) + +// For every entry B in the frcpa table, this table contains +// the significands of cbrt(1/B), cbrt(2/B), cbrt(4/B). +// The index to this table is the same as the frcpa index. 
+ +LOCAL_OBJECT_START(T_table) + + data8 0x80155c748c374836, 0xa160019ed37fb4ae + data8 0xcb51ddcb9e93095e, 0x8040404b0879f7f9 + data8 0xa1960b5966da4608, 0xcb95f333968ad59b + data8 0x806b5dce4b405c10, 0xa1cc5dbe6dc2aab4 + data8 0xcbda64292d3ffd97, 0x8096b586974669b1 + data8 0xa202f97995b69c0d, 0xcc1f3184af961596 + data8 0x80bcd273d952a028, 0xa232fe6eb0c0577d + data8 0xcc5bb1ac954d33e2, 0x80e898c52813f2f3 + data8 0xa26a2582012f6e17, 0xcca12e9831fc6402 + data8 0x81149add67c2d208, 0xa2a197e5d10465cb + data8 0xcce70a67b64f24ad, 0x813b4e2c856b6e9a + data8 0xa2d25a532efefbc8, 0xcd24794726477ea5 + data8 0x8167c1dde03de7aa, 0xa30a5bd6e49e4ab8 + data8 0xcd6b096a0b70ee87, 0x818ed973b811135e + data8 0xa33b9c9b59879e24, 0xcda9177738b15a90 + data8 0x81bbc0c33e13ec98, 0xa3742fca6a3c1f21 + data8 0xcdf05f2247dffab9, 0x81e33e69fbe7504a + data8 0xa3a5f1273887bf22, 0xce2f0f347f96f906 + data8 0x820aec524e3c23e9, 0xa3d7ef508ff11574 + data8 0xce6e0be0cd551a61, 0x823880f78e70b805 + data8 0xa4115ce30548bc15, 0xceb666b2c347d1de + data8 0x826097a62a8e5200, 0xa443df0e53df577a + data8 0xcef609b0cb874f00, 0x8288dfe00e9b5eaf + data8 0xa4769fa5913c0ec3, 0xcf35fb5447e5c765 + data8 0x82b15a10c5371624, 0xa4a99f303bc7def5 + data8 0xcf763c47ee869f00, 0x82da06a527b18937 + data8 0xa4dcde37779adf4b, 0xcfb6cd3888d71785 + data8 0x8302e60b635ab394, 0xa5105d46152c938a + data8 0xcff7aed4fbfbb447, 0x832bf8b2feec2f0e + data8 0xa5441ce89825cb8d, 0xd038e1ce5167e3c6 + data8 0x83553f0ce00e276b, 0xa5781dad3e54d899 + data8 0xd07a66d7bfa0ebba, 0x837eb98b50f8322a + data8 0xa5ac602406c4e68c, 0xd0bc3ea6b32d1b21 + data8 0x83a270f44c84f699, 0xa5d9601d95c2c0bc + data8 0xd0f4f0e8f36c1bf8, 0x83cc4d7cfcfac5ca + data8 0xa60e1e1a2de14745, 0xd1376458e34b037e + data8 0x83f65f78a8872b4c, 0xa6431f6e3fbd9658 + data8 0xd17a2ca133f78572, 0x8420a75f2f7b53c8 + data8 0xa67864b0d432fda4, 0xd1bd4a80301c5715 + data8 0x844510461ff14209, 0xa6a6444aa0243c0b + data8 0xd1f71682b2fa4575, 0x846fbd91b930bed2 + data8 0xa6dc094d10f25792, 
0xd23ad555f773f059 + data8 0x84947e18234f3294, 0xa70a574cc02bba69 + data8 0xd2752c7039a5bf73, 0x84bf92755825045a + data8 0xa7409e2af9549084, 0xd2b98ee008c06b59 + data8 0x84e4ac0ee112ba51, 0xa76f5c64ca2cf13b + data8 0xd2f4735ffd700280, 0x8509ef44b86f20be + data8 0xa79e4f0babab5dc0, 0xd32f99ed6d9ac0e1 + data8 0x85359d5d91768427, 0xa7d5579ae5164b85 + data8 0xd374f0666c75d51c, 0x855b3bd5b7384357 + data8 0xa804bd3c6fe61cc8, 0xd3b0a7d13618e4a1 + data8 0x858104f0c415f79a, 0xa8345895e5250a5a + data8 0xd3eca2ea53bcec0c, 0x85a6f90390d29864 + data8 0xa8642a122b44ef0b, 0xd428e23874f13a17 + data8 0x85d3772fcd56a1dd, 0xa89c38ca18f6108b + data8 0xd46f82fe293bc6d3, 0x85f9c982fcc002f3 + data8 0xa8cc81063b6e87ca, 0xd4ac57e9b7186420 + data8 0x862047e0e7ea554b, 0xa8fd00bfa409285e + data8 0xd4e972becb04e8b8, 0x8646f2a26f7f5852 + data8 0xa92db8664d5516da, 0xd526d40a7a9b43a3 + data8 0x866dca21754096b5, 0xa95ea86b75cc2c20 + data8 0xd5647c5b73917370, 0x8694ceb8dfd17a37 + data8 0xa98fd141a4992deb, 0xd5a26c4201bd6d13 + data8 0x86bc00c49e9307e8, 0xa9c1335cae7446ba + data8 0xd5e0a45015350a7e, 0x86dccd74fce79610 + data8 0xa9ea8686f556f645, 0xd614b539c6194104 + data8 0x870453c845acf90f, 0xaa1c52d17906bb19 + data8 0xd6537310e224283f, 0x872c089a1e90342c + data8 0xaa4e59b046dab887, 0xd6927ab62244c917 + data8 0x8753ec4a92d16c5e, 0xaa809b9c60d1890b + data8 0xd6d1ccc1fc4ef4b7, 0x877bff3aca19f6b4 + data8 0xaab319102f3f9b33, 0xd71169cea98fdded + data8 0x879d88b6fe1c324c, 0xaadd5a18c1e21274 + data8 0xd746a66a5bc9f6d9, 0x87c5f346dbf98c3a + data8 0xab1045f2ac31bdf5, 0xd786ce8f0fae5317 + data8 0x87e7c653efacef2c, 0xab3ae3ab2df7231e + data8 0xd7bc7ff214c4e75a, 0x881089d4e73ffefc + data8 0xab6e3f945d1e96fc, 0xd7fd35467a517ed1 + data8 0x88397e6a366f2a8a, 0xaba1d953a08fa94e + data8 0xd83e38838648d815, 0x885bc559e5e1c081 + data8 0xabcd090db7ef4c3f, 0xd874a1db598b8951 + data8 0x887e2ee392bb7a93, 0xabf864602d7c323d + data8 0xd8ab42205b80edaf, 0x88a7a8587e404257 + data8 0xac2ca5886ccf9b57, 0xd8ed1849d202f965 + 
data8 0x88ca5eda67594784, 0xac5861d4aa441f0f + data8 0xd92432bd5a173685, 0x88f4356166bd590e + data8 0xac8d183fe3a2fbed, 0xd9669ca45b03c23e + data8 0x89173a0acf5ce026, 0xacb93703ff51571e + data8 0xd99e3327cf89574e, 0x893a62a098b6a57b + data8 0xace5830ad0c3f14b, 0xd9d602b19b100466 + data8 0x895daf637236ae2c, 0xad11fca5d78b3ff2 + data8 0xda0e0ba86c096841, 0x89883b9d1c2fa9c5 + data8 0xad4797fddf91a798, 0xda5195fcdb1c3dce + data8 0x89abd8dd374a5d7b, 0xad747701e559ebcb + data8 0xda8a1eb87a491f6c, 0x89cf9b1dcd197fa0 + data8 0xada184a47e9c7613, 0xdac2e230b91c3f84 + data8 0x89f382a258ea79de, 0xadcec13ab0dda8ff + data8 0xdafbe0d0b66aea30, 0x8a178faf06648f29 + data8 0xadfc2d1a5fd21ba8, 0xdb351b04a8fafced + data8 0x8a3bc288b3e1d18a, 0xae29c89a5053c33a + data8 0xdb6e9139e33cdd8e, 0x8a601b74f4d1f835 + data8 0xae5794122b638df9, 0xdba843ded7151ea1 + data8 0x8a849aba14274764, 0xae858fda8137ae0a + data8 0xdbe2336319b61fc8, 0x8aa9409f16cdbc9b + data8 0xaeb3bc4ccc56d3d1, 0xdc1c60376789fa68 + data8 0x8ace0d6bbe2cb316, 0xaee219c374c09920 + data8 0xdc56cacda82d0cd5, 0x8af301688ab33558 + data8 0xaf10a899d3235fe7, 0xdc917398f2797814 + data8 0x8b181cdebe6f3206, 0xaf3f692c341fe8b4 + data8 0xdccc5b0d90a3e628, 0x8b3d60185fafcb7c + data8 0xaf6e5bd7db9ae6c2, 0xdd0781a10469f0f2 + data8 0x8b62cb603bb2fad0, 0xaf9d80fb081cd91b + data8 0xdd42e7ca0b52838f, 0x8b80d7d6bc4104de + data8 0xafc35ce063eb3787, 0xdd729ad01c69114d + data8 0x8ba68bf73ac74f39, 0xaff2ddcb5f28f03d + data8 0xddae749c001fbf5e, 0x8bcc68fb9f9f7335 + data8 0xb022923b148e05c5, 0xddea8f50a51c69b1 + data8 0x8bf26f31c534fca2, 0xb0527a919adbf58b + data8 0xde26eb69a0f0f111, 0x8c10f86e13a1a1f9 + data8 0xb078f3ab1d701c65, 0xde576480262399bc + data8 0x8c3749916cc6abb5, 0xb0a93a6870649f31 + data8 0xde943789645933c8, 0x8c5dc4c4f7706032 + data8 0xb0d9b624d62ec856, 0xded14d58139a28af + data8 0x8c7cac3a8c42e3e0, 0xb100a5f53fb3c8e1 + data8 0xdf025c00bbf2b5c7, 0x8ca373f1b7bf2716 + data8 0xb131821882f5540a, 0xdf3feb44d723a713 + data8 0x8cc29907fb951294, 
0xb158bf8e4cb04055 + data8 0xdf715bc16c159be0, 0x8ce9ae4e9492aac8 + data8 0xb189fd69d56b238f, 0xdfaf66240e29cda8 + data8 0x8d0911dddbfdad0e, 0xb1b189958e8108e4 + data8 0xdfe139cbf6e19bdc, 0x8d3075c4f20f04ee + data8 0xb1e32a8165b09832, 0xe01fc0fe94d9fc52 + data8 0x8d5018a9d4de77d5, 0xb20b0678fc271eec + data8 0xe051f92ffcc0bd60, 0x8d77cc47dd143515 + data8 0xb23d0bd3f7592b6e, 0xe090feec9c9a06ac + data8 0x8d97af6352739cb7, 0xb26538b2db8420dc + data8 0xe0c39d0c9ff862d6, 0x8db7af523167800f + data8 0xb28d89e339ceca14, 0xe0f668eeb99f188d + data8 0x8ddfd80bc68c32ff, 0xb2c022ca12e55a16 + data8 0xe1362890eb663139, 0x8e00197e1e7c88fe + data8 0xb2e8c6852c6b03f1, 0xe1695c7212aecbaa + data8 0x8e207859f77e20e7, 0xb3118f4eda9fe40f + data8 0xe19cbf0391bbbbe9, 0x8e40f4ce60c9f8e2 + data8 0xb33a7d6268109ebe, 0xe1d050901c531e85 + data8 0x8e69ba46cf2fde4d, 0xb36ddbc5ea70ec55 + data8 0xe2110903b4f4047a, 0x8e8a7a00bd7ae63e + data8 0xb3971e9b39264023, 0xe2450559b4d80b6d + data8 0x8eab57ef1cf2f529, 0xb3c0877ecc18e24a + data8 0xe27931a231554ef3, 0x8ecc5442cffb1dad + data8 0xb3ea16ae3a6c905f, 0xe2ad8e2ac3c5b04b + data8 0x8eed6f2d2a4acbfe, 0xb413cc67aa0e4d2d + data8 0xe2e21b41b9694cce, 0x8f0ea8dff24441ff + data8 0xb43da8e9d163e1af, 0xe316d93615862714 + data8 0x8f385c95d696b817, 0xb47233773b84d425 + data8 0xe3590bd86a0d30f9, 0x8f59dc43edd930f3 + data8 0xb49c6825430fe730, 0xe38e38e38e38e38e + data8 0x8f7b7b5f5ffad1c4, 0xb4c6c46bcdb27dcf + data8 0xe3c397d1e6db7839, 0x8f9d3a1bea165f38 + data8 0xb4f1488c0b35d26f, 0xe3f928f5953feb9e + data8 0x8fbf18adc34b66da, 0xb51bf4c7c51f0168 + data8 0xe42eeca17c62886c, 0x8fe117499e356095 + data8 0xb546c9616087ab9c, 0xe464e32943446305 + data8 0x90033624aa685f8d, 0xb571c69bdffd9a70 + data8 0xe49b0ce15747a8a2, 0x9025757495f36b86 + data8 0xb59cecbae56984c3, 0xe4d16a1eee94e9d4 + data8 0x903f3a5dcc091203, 0xb5bd64512bb14bb7 + data8 0xe4fa52107353f67d, 0x9061b2fceb2bdbab + data8 0xb5e8d2a4bf5ba416, 0xe5310a471f4d2dc3 + data8 0x90844ca7211032a7, 0xb6146a9a1bc47819 + 
data8 0xe567f6f1c2b9c224, 0x90a7079403e6a15d + data8 0xb6402c7749d621c0, 0xe59f18689a9e4c9a + data8 0x90c9e3fbafd63799, 0xb66c1882fb435ea2 + data8 0xe5d66f04b8a68ecf, 0x90ece216c8a16ee4 + data8 0xb6982f048c999a56, 0xe60dfb2005c192e9 + data8 0x9110021e7b516f0a, 0xb6c47044075b4142 + data8 0xe645bd1544c7ea51, 0x912a708a39be9075 + data8 0xb6e5bd6bfd02bafd, 0xe66fb21b505b20a0 + data8 0x914dcc7b31146370, 0xb7124a2736ff8ef2 + data8 0xe6a7d32af4a7c59a, 0x91714af8cfe984d5 + data8 0xb73f026a01e94177, 0xe6e02b129c6a5ae4 + data8 0x918c00a6f3795e97, 0xb760a959f1d0a7a7 + data8 0xe70a9136a7403039, 0x91afbc299ed0295d + data8 0xb78dae7e06868ab0, 0xe74349fb2d92a589 + data8 0x91d39add3e958db0, 0xb7badff8ad9e4e02 + data8 0xe77c3a9c86ed7d42, 0x91ee9920a8974d92 + data8 0xb7dce25b8e17ae9f, 0xe7a713f88151518a + data8 0x9212b5fcac537c19, 0xb80a6226904045e2 + data8 0xe7e067453317ed2b, 0x9236f6b256923fcf + data8 0xb8380f1cafd73c1c, 0xe819f37a81871bb5 + data8 0x92523ee6f90dcfc3, 0xb85a6ea8e321b4d8 + data8 0xe8454236bfaeca14, 0x9276bef031e6eb79 + data8 0xb8886b684ae7d2fa, 0xe87f32f24c3fc90e + data8 0x929236ec237a24ad, 0xb8ab0726fa00cf5d + data8 0xe8aacd8688892ba6, 0x92b6f70b7efe9dc3 + data8 0xb8d954a4d13b7cb1, 0xe8e523fd32f606f7 + data8 0x92d29f61eec7dc2b, 0xb8fc2d4f6cd9f04a + data8 0xe9110b5311407927, 0x92f7a05d5b8ba92f + data8 0xb92acc851476b1ab, 0xe94bc8bf0c108fa3 + data8 0x931379a403be5c16, 0xb94de2d841a184c2 + data8 0xe977fdc439c2ca3c, 0x9338bc44de2e3f34 + data8 0xb97cd4c36c92693c, 0xe9b3236528fc349e + data8 0x9354c71412c69486, 0xb9a0297f172665e3 + data8 0xe9dfa70b745ac1b4, 0x937a4c273907e262 + data8 0xb9cf6f21e36c3924, 0xea1b36268d0eaa38 + data8 0x93968919f6e7975d, 0xb9f3030951267208 + data8 0xea480963fd394197, 0x93bc516fdd4680c9 + data8 0xba229d6a618e7c59, 0xea84034425f27484 + data8 0x93d8c123d9be59b2, 0xba467144459f9855 + data8 0xeab12713138dd1cc, 0x93f546c955e60076 + data8 0xba6a60c3c48f1a4b, 0xeade6db73a5e503b + data8 0x941b70a65879079f, 0xba9a76056b67ee7a + data8 0xeb1b0268343b121b, 
0x943829f337410591 + data8 0xbabea699563ada6e, 0xeb489b0b2bdb5f14 + data8 0x9454f995765bc4d2, 0xbae2f350b262cc4b + data8 0xeb765721e85f03d0, 0x947b86b57f5842ed + data8 0xbb1385a23be24e57, 0xebb389645f222f62 + data8 0x94988aeb23470f86, 0xbb3814975e17c680 + data8 0xebe198f090607e0c, 0x94b5a5dc9695f42a + data8 0xbb5cc031009bf467, 0xec0fcc9321024509 + data8 0x94d2d7a9170d8b42, 0xbb81889680024764 + data8 0xec3e247da8b82f61, 0x94f9e87dd78bf019 + data8 0xbbb2c0d8703ae95d, 0xec7c27d21321c9f7 + data8 0x95175019a503d89e, 0xbbd7cd09ba3c5463 + data8 0xecaad5278824e453, 0x9534cefa625fcb3a + data8 0xbbfcf68c4977718f, 0xecd9a76d097d4e77 + data8 0x955265405c491a25, 0xbc223d88cfc88eee + data8 0xed089ed5dcd99446, 0x9570130c1f9bb857 + data8 0xbc47a2284fee4ff8, 0xed37bb95add09a1c + data8 0x9597ca4119525184, 0xbc79ac0916ed7b8a + data8 0xed76c70508f904b6, 0x95b5af6fb5aa4d3c + data8 0xbc9f5670d1a13030, 0xeda63bb05e7f93c6 + data8 0x95d3ac9273aafd7a, 0xbcc51f068cb95c1d + data8 0xedd5d661daed2dc4, 0x95f1c1cafdfd3684 + data8 0xbceb05f4b30a9bc0, 0xee05974eef86b903 + data8 0x960fef3b430b8d5f, 0xbd110b6604c7d306 + data8 0xee357ead791fc670, 0x962e350575b409c5 + data8 0xbd372f8598620f19, 0xee658cb3c134a463 + data8 0x964c934c0dfc1708, 0xbd5d727edb6b3c7e + data8 0xee95c1987f080211, 0x966b0a31c9c6bc7d + data8 0xbd83d47d937bbc6d, 0xeec61d92d8c4314f + data8 0x968999d9ad8d264e, 0xbdaa55addf1ae47d + data8 0xeef6a0da64a014ac, 0x96a8426705198795 + data8 0xbdd0f63c36aa73f0, 0xef274ba72a07c811 + data8 0x96c703fd64445ee5, 0xbdf7b6556d550a15 + data8 0xef581e31a2c91260, 0x96e5dec0a7b4268d + data8 0xbe1e9626b1ffa96b, 0xef8918b2bc43aec6 + data8 0x9704d2d4f59f79f3, 0xbe4595dd903e5371 + data8 0xefba3b63d89d7cbf, 0x9723e05ebe91b9b0 + data8 0xbe6cb5a7f14bc935, 0xefeb867ecffaa607 + data8 0x97430782be323831, 0xbe93f5b41d047cf7 + data8 0xf01cfa3df1b9c9fa, 0x97624865fc0df8bf + data8 0xbebb5630bae4c15f, 0xf04e96dc05b43e2d + data8 0x9781a32dcc640b2a, 0xbee2d74cd30a430c + data8 0xf0805c944d827454, 0x97a117ffd0f48e46 + 
data8 0xbf0a7937cf38d981, 0xf0b24ba285c495cb + data8 0x97c0a701f9d263c9, 0xbf323c217be2bc8c + data8 0xf0e46442e76f6569, 0x97e0505a8637a036 + data8 0xbf5a203a09342bbb, 0xf116a6b2291d7896 + data8 0x97f57a9fb0b08c6e, 0xbf74cad1c14ebfc4 + data8 0xf1383fa9e9b5b381, 0x9815503365914a9d + data8 0xbf9ce6a497a89f78, 0xf16ac84f90083b9b + data8 0x98354085054fd204, 0xbfc52428bec6e72f + data8 0xf19d7b686dcb03d7, 0x98554bbbf8a77902 + data8 0xbfed838fddab024b, 0xf1d0593311db1757 + data8 0x987571fffb7f94f6, 0xc016050c0420981a + data8 0xf20361ee8f1c711e, 0x9895b3791dd03c23 + data8 0xc03ea8cfabddc330, 0xf23695da7de51d3f + data8 0x98ab43a5fc65d0c8, 0xc059d3cbd65ddbce + data8 0xf258d095e465cc35, 0x98cbb2d196bd713d + data8 0xc082b122a3c78c9d, 0xf28c4d0bfc982b34 + data8 0x98ec3d9ec7b6f21a, 0xc0abb1499ae736c4 + data8 0xf2bff55eb3f0ea71, 0x990ce436db5e8344 + data8 0xc0d4d474c3aedaaf, 0xf2f3c9cf9884636e + data8 0x9922b8218160967a, 0xc0f054ca33eb3437 + data8 0xf31670135ab9cc0f, 0x99438d686f75779d + data8 0xc119b2c67e600ed0, 0xf34a8e9f0b54cdfb + data8 0x99647eea131fa20b, 0xc1433453de2033ff + data8 0xf37ed9fa6b8add3f, 0x997a85045a47c6d0 + data8 0xc15ef3e44e10032d, 0xf3a1cfe884ef6bb6 + data8 0x999ba5f14f8add02, 0xc188b130431d80e6 + data8 0xf3d66689dcc8e8d3, 0x99bce38b5465ecae + data8 0xc1b2929d6067730e, 0xf40b2ab069d5c96a + data8 0x99d31ca0887f30f9, 0xc1ce9268f31cc734 + data8 0xf42e718b90c8bc16, 0x99f48a669c74c09e + data8 0xc1f8b0877c1b0c08, 0xf463822a0a3b4b00 + data8 0x9a16154eb445c873, 0xc222f35a87b415ba + data8 0xf498c1076015faf8, 0x9a2c822ec198d667 + data8 0xc23f3467349e5c88, 0xf4bc5a19a33990b5 + data8 0x9a4e3e080cd91b78, 0xc269b4e40e088c01 + data8 0xf4f1e6a7d6f5425f, 0x9a70177afe52322e + data8 0xc2945aac24daaf6e, 0xf527a232cf6be334 + data8 0x9a86b8fa94eebe10, 0xc2b0de05e43c1d66 + data8 0xf54b8ecdcda90851, 0x9aa8c42866ae2958 + data8 0xc2dbc275e1229d09, 0xf5819949c7ad87b4 + data8 0x9abf86f9e12fc45e, 0xc2f86fca9d80eeff + data8 0xf5a5bac9213b48a9, 0x9ae1c462fc05f49d + data8 0xc323938449a2587e, 
0xf5dc1501f324a812 + data8 0x9af8a8dc936b84d0, 0xc3406b40a538ed20 + data8 0xf6006bee86b5589e, 0x9b1b19033be35730 + data8 0xc36bcee8211d15e0, 0xf63716b2fa067fa4 + data8 0x9b3da7daf04c2892, 0xc397593adf2ba366 + data8 0xf66df22fb6132b9c, 0x9b54c2e4c8a9012b + data8 0xc3b475b6206155d5, 0xf6929fb98225deb1 + data8 0x9b77854e6c661200, 0xc3e0410243b97383 + data8 0xf6c9cd13021e3fea, 0x9b8ec2e678d56d2f + data8 0xc3fd890709833d37, 0xf6eeb177472cedae + data8 0x9ba60e6a5ca133b6, 0xc41ae295f7e7fa06 + data8 0xf713abf4cb0b3afb, 0x9bc919ea66a151a4 + data8 0xc44709f7bb8a4dd2, 0xf74b4d5333684ef1 + data8 0x9be0887c09ef82bb, 0xc4648fb0e0bec4c1 + data8 0xf7707f75a72f8e94, 0x9c03c8d5fffc3503 + data8 0xc490f9a94695ba14, 0xf7a874b97927af44 + data8 0x9c1b5ad21a81cbb9, 0xc4aeac0173b7d390 + data8 0xf7cddf140aedf1d8, 0x9c3ed09216e9ca02 + data8 0xc4db5941007aa853, 0xf806291bacb7f7a9 + data8 0x9c568656c0423def, 0xc4f938aec206291a + data8 0xf82bcc43b92eafef, 0x9c7a320af242ce60 + data8 0xc52629e899dfd622, 0xf8646bf0defb759e + data8 0x9c920bf7a8c01dc2, 0xc54436e44043b965 + data8 0xf88a487dfc3ff5f7, 0x9ca9f475d98b159c + data8 0xc562563abf9ea07f, 0xf8b03c2b46cdc17f + data8 0x9ccdeca60e80b5f8, 0xc58fa7d1dc42921c + data8 0xf8e95541c152ae7a, 0x9ce5f9d4653d4902 + data8 0xc5adf561b91e110a, 0xf90f832c2700c160 + data8 0x9cfe15cb38bfdd8e, 0xc5cc5591bdbd82fa + data8 0xf935c88e0c7f419b, 0x9d225b983f6c1f96 + data8 0xc5fa08f1ff20593c, 0xf96f5cd84fd86873 + data8 0x9d3a9cca32261ed7, 0xc618980a79ce6862 + data8 0xf995dd53ebdd9d6d, 0x9d52ecfccebe1768 + data8 0xc6373a09e34b50fa, 0xf9bc75a034436a41 + data8 0x9d77818d95b82f86, 0xc66550a6e0baaf35 + data8 0xf9f686f26d5518de, 0x9d8ff7893fa4706c + data8 0xc6842241926342c9, 0xfa1d5b39b910a8c5 + data8 0x9da87cbef36f2a5e, 0xc6a3070b7c93bb9e + data8 0xfa4447acc4ecbfd2, 0x9dcd6140b4a35aeb + data8 0xc6d18260bb84081b, 0xfa7ed7e51e6fdfb4 + data8 0x9de60cd06dc6e2d4, 0xc6f0977c9416828b + data8 0xfaa601394d49a1a0, 0x9dfec7d4cc43b76f + data8 0xc70fc0117c641630, 0xfacd431644ce0e40 + 
data8 0x9e17925ec9fccc4a, 0xc72efc34d7e615be + data8 0xfaf49d96f7a75909, 0x9e3cdf6db57dc075 + data8 0xc75dfb441594141e, 0xfb2fd3c65e562fd5 + data8 0x9e55d110b63637a8, 0xc77d68aa019bda4c + data8 0xfb576c5762024805, 0x9e6ed27594550d2e + data8 0xc79ce9ea478dbc4f, 0xfb7f1debc22c4040 + data8 0x9e87e3adc385d393, 0xc7bc7f1ae453219d + data8 0xfba6e89f32d0190a, 0x9ead9b54b37a1055 + data8 0xc7ec0476e15e141a, 0xfbe2c803a0894893 + data8 0x9ec6d46a3d7de215, 0xc80bcbe16f1d540f + data8 0xfc0ad1ff0ed9ecf0, 0x9ee01d9108be3154 + data8 0xc82ba78a5d349735, 0xfc32f57bdfbcbe7f + data8 0x9ef976db07288d04, 0xc84b978847a06b87 + data8 0xfc5b32968f99b21c, 0x9f12e05a4759ec25 + data8 0xc86b9bf1ee817bc6, 0xfc83896bc861ab08 + data8 0x9f2c5a20f4da6668, 0xc88bb4de3667cdf4 + data8 0xfcabfa1861ed4815, 0x9f52af78ed1733ca + data8 0xc8bc00e7fe9e23a3, 0xfce8d3cea7d3163e + data8 0x9f6c52426a39d003, 0xc8dc4d7ff2d25232 + data8 0xfd118595143ee273, 0x9f860593d42fd7f3 + data8 0xc8fcaeebcb40eb47, 0xfd3a519943d4865a + data8 0x9f9fc97fdb96bd51, 0xc91d25431426a663 + data8 0xfd6337f8e1ae5a4b, 0x9fb99e194f4a7037 + data8 0xc93db09d7fdb2949, 0xfd8c38d1c8e927eb + data8 0x9fd383731ca51db9, 0xc95e5112e721582a + data8 0xfdb5544205095a53, 0x9fed79a04fbf9423 + data8 0xc97f06bb49787677, 0xfdde8a67d2613531 + data8 0xa00780b413b24ee8, 0xc99fd1aecd6e1b06 + data8 0xfe07db619e781611, 0xa02eab2c4474b0cd + data8 0xc9d12a3e27bb1625, 0xfe460768d80bf758 + data8 0xa048dcd51ccfd142, 0xc9f22ad82ba3d5f0 + data8 0xfe6f9bfb06cd32f6, 0xa0631fa894b11b8d + data8 0xca134113105e67b2, 0xfe994bcd3d14fcc2 + data8 0xa07d73ba65e680af, 0xca346d07b045a876 + data8 0xfec316fecaf3f2ab, 0xa097d91e6aaf71b0 + data8 0xca55aecf0e94bb88, 0xfeecfdaf33fadb80 + data8 0xa0b24fe89e02602f, 0xca77068257be9bab + data8 0xff16fffe2fa8fad6, 0xa0ccd82d1bd2f68b + data8 0xca98743ae1c693a8, 0xff411e0ba9db886d + data8 0xa0e77200215909e6, 0xcab9f8122c99a101 + data8 0xff6b57f7c33e4e9a, 0xa1021d760d584855 + data8 0xcadb9221e268c3b5, 0xff95ade2d1bd7358 + data8 0xa11cdaa36068a57d, 
0xcafd4283d8043dfd + data8 0xffc01fed60f86fb5, 0xa137a99cbd3f880b + data8 0xcb1f09520d37c6fb, 0xffeaae3832b63956 +LOCAL_OBJECT_END(T_table) + + + + + + .section .text -.proc cbrtf# -.align 32 -cbrtf: +GLOBAL_LIBM_ENTRY(cbrtf) -{ .mfi - getf.sig r28=f8 - // will continue only for normal/denormal numbers -(p0) fclass.nm.unc p12,p7 = f8, 0x1b - // r2 = pointer to C_1,C_2 followed by T_table - addl r2 = @ltoff(poly_coeffs), gp +{.mfi + getf.sig GR_SIGNIF = f8 + // will continue only for normal/denormal numbers + fclass.nm.unc p12, p7 = f8, 0x1b + // GR_GP = pointer to C_1, C_2 followed by T_table + nop.i 0 } {.mfi - // r29=bias-((2^8-1)/3) -63=0xffff-0x55-0x3f=0xff6b - mov r29=0xff6b - // normalize a - fma.s1 f14=f8,f1,f0 - nop.i 0;; + addl GR_GP = @ltoff(poly_coeffs), gp + // normalize a + fma.s1 FR_ARG = f8, f1, f0 + // GR_CT3 = bias-((2^8-1)/3) -63 = 0xffff-0x55-0x3f = 0xff6b + mov GR_CT3 = 0xff6b ;; } -{.mib - nop.m 0 - (p7) cmp.eq p12,p0=r28,r0 - nop.b 0;; + +{.mmi + // get exponent + getf.exp GR_ARGEXP = f8 + // load start address for C_1, C_2 followed by T_table + ld8 GR_ADDR = [ GR_GP ] + nop.i 0 ;; } -{.mfb - // load start address for C_1,C_2 followed by T_table - ld8 r2=[r2] - (p12) fma.s.s0 f8=f8,f1,f0 - (p12) br.ret.spnt b0;; + +{.mlx + // check if input significand is 0 + (p7) cmp.eq p12, p7 = GR_SIGNIF, r0 + // GR_2P63 = 2^63 + movl GR_2P63 = 0x8000000000000000 ;; +} + +{.mfi + nop.m 0 + // y = frcpa(a) + // p7 = 1 for normal and denormal (but non-zero) arguments + (p7) frcpa.s0 FR_Y, p0 = f1, f8 + // p9 = 1 if denormal input + cmp.gtu p9, p0 = GR_2P63, GR_SIGNIF } -{.mmf - // load C_1 - ldfe f7=[r2],16 - nop.m 0 - // y=frcpa(a) - frcpa.s0 f8,p6=f1,f8;; +{.mfb + // load C_1 + ldfe FR_COEFF1 = [ GR_ADDR ], 16 + // if argument is 0, +/-Infinity, or NaN, return + (p12) fma.s.s0 f8 = f8, f1, f0 + (p12) br.ret.spnt b0 ;; } + {.mmi - // load C_2 - ldfe f9=[r2],16 - // r28=bias-(2^8-1) - mov r28=0xff00 - nop.i 0;; + // get normalized significand (for denormal 
inputs only) + (p9) getf.sig GR_SIGNIF = FR_ARG + // load C_2 + ldfe FR_COEFF2 = [ GR_ADDR ], 16 + // GR_CT2 = bias-(2^8-1) + mov GR_CT2 = 0xff00 } -{.mmi - // get normalized significand - getf.sig r23=f14 - // get exponent - getf.exp r24=f14 - mov r25=0x20000;; + +{.mii + // get exponent (for denormal inputs only) + (p9) getf.exp GR_ARGEXP = FR_ARG + nop.i 0 + mov GR_CONST = 0x20000 ;; } + + {.mii - // get r26=sign - and r26=r24,r25 - // eliminate leading 1 from r23=1st table index - shl r23=r23,1 - // eliminate sign from exponent (r25) - andcm r25=r24,r25;; + // get GR_SIGN = sign + and GR_SIGN = GR_ARGEXP, GR_CONST + // eliminate leading 1 from GR_I1 = 1st table index + shl GR_I1 = GR_SIGNIF, 1 + // eliminate sign from exponent + andcm GR_EBIAS = GR_ARGEXP, GR_CONST ;; } + + {.mfi - // subtract bias from r25=exponent - sub r25=r25,r28 - // r=1-a*y - (p6) fnma.s1 f6=f8,f14,f1 - // r23=1st table index (y_index8 bits) - shr.u r23=r23,56;; + // subtract bias from GR_EXP = exponent + sub GR_EXP = GR_EBIAS, GR_CT2 + // r = 1-a*y + fnma.s1 FR_R = FR_Y, FR_ARG, f1 + // GR_IT1 = 1st table index (y_index8 bits) + shr.u GR_IT1 = GR_I1, 56 ;; } + + {.mii - // 1: exponent*=5; // (2^{16}-1)/3=0x5555 - shladd r24=r25,2,r25 - // r23=3*y_index - shladd r23=r23,1,r23;; - // r30=(5*expon)*16+5*expon=(0x55)*expon - shladd r30=r24,4,r24;; + // 1: exponent* = 5; // (2^{16}-1)/3 = 0x5555 + shladd GR_E5 = GR_EXP, 2, GR_EXP + // GR_IT1_3 = 3*y_index + shladd GR_IT1_3 = GR_IT1, 1, GR_IT1 + nop.i 0 ;; } + + +{.mmi + // GR_TMP5 = (5*expon)*16+5*expon = (0x55)*expon + shladd GR_TMP5 = GR_E5, 4, GR_E5 + // adjust T_table pointer by 1st index + shladd GR_TP1 = GR_IT1_3, 3, GR_ADDR + nop.i 0 ;; +} + + {.mmi - // adjust T_table pointer by 1st index - shladd r2=r23,3,r2;; - // f10=T[0][y] - (p6) ldf8 f10=[r2],8 - // r24=(0x5500)*expon - shl r24=r30,8;; + // FR_T0 = T [ 0 ] [ y ] + ldf8 FR_T0 = [ GR_TP1 ], 8 + // get 2^{-63} + mov GR_TMP63 = 0xffff + 63 + // GR_TMP = (0x5500)*expon + shl GR_TMP = 
GR_TMP5, 8 ;; } + + {.mfi - // f11=T[1][y] - (p6) ldf8 f11=[r2],8 - // P_1=C_1+C_2*r - (p6) fma.s1 f7=f9,f6,f7 - // r24=(0x5555)*expon - add r24=r24,r30;; + // FR_T1 = T [ 1 ] [ y ] + ldf8 FR_T1 = [ GR_TP1 ], 8 + // P_1 = C_1+C_2*r + fma.s1 FR_COEFF1 = FR_COEFF2, FR_R, FR_COEFF1 + // GR_TMP2 = (0x5555)*expon + add GR_TMP2 = GR_TMP, GR_TMP5 ;; } + + {.mmi - // r24=(0x5556)*expon // 0x5556=(2^{16}+2)/3 - add r24=r24,r25;; - // f8=T[2][y] - (p6) ldf8 f8=[r2] - // r24=floor(expon/3) - shr r24=r24,16;; + // GR_TMP3 = (0x5556)*expon // 0x5556 = (2^{16}+2)/3 + add GR_TMP3 = GR_TMP2, GR_EXP ;; + // FR_T2 = T [ 2 ] [ y ] + ldf8 FR_T2 = [ GR_TP1 ] + // GR_EXP3 = floor(expon/3) + shr GR_EXP3 = GR_TMP3, 16 ;; } + + {.mmi - nop.m 0 - // r28=3*exponent - shladd r28=r24,1,r24 - // bias exponent - add r24=r29,r24;; + setf.exp FR_2M63 = GR_TMP63 + // GR_TMP4 = 3*exponent + shladd GR_TMP4 = GR_EXP3, 1, GR_EXP3 + // bias exponent + add GR_EBIAS3 = GR_CT3, GR_EXP3 ;; +} + + +{.mmf + // get remainder of exponent/3 + sub GR_REM = GR_EXP, GR_TMP4 + // add sign to exponent + or GR_SEXP = GR_EBIAS3, GR_SIGN + // P_2 = -r*P_1 + fnma.s1 FR_R = FR_COEFF1, FR_R, f0 ;; } + + + {.mmi - // get remainder of exponent/3 - sub r25=r25,r28 - // add sign to exponent - or r24=r24,r26 - nop.i 0;; -} -{.mfi - nop.m 0 - // P_2=-r*P_1 - (p6) fnma.s1 f6=f7,f6,f0 - // remainder=0 ? - (p6) cmp.eq.unc p7,p8=r0,r25;; + // FR_ARG = sign*2^{exponent/3} + setf.exp FR_ARG = GR_SEXP + nop.m 0 + // remainder = 0 ? + // p7=1 if input exponent is 3*j (remainder is 0) + cmp.eq.unc p7, p8 = r0, GR_REM ;; } + + {.mfi - // f14=sign*2^{exponent/3} - (p6) setf.exp f14=r24 - nop.f 0 - // remainder = 1 ? - (p8) cmp.eq.unc p8,p12=1,r25;; + // remainder = 1 ? + // p8=1 if input exponent is 3*j+1 (remainder is 1) + // p12=1 if input exponent is 3*j+2 (remainder is 2) + (p8) cmp.eq.unc p8, p12 = 1, GR_REM + // p7=1 -> remainder = 0 -> use T = FR_T0 + (p7) fma.s1 f8 = FR_T0, FR_R, FR_T0 + // argument is of the form 2^(3*k) ? 
+ // ( GR_I1 holds significand bits, without the leading 1) + or GR_I1 = GR_I1, GR_REM ;; } -.pred.rel "mutex",p7,p8 + + +.pred.rel "mutex", p12, p8 {.mfi - nop.m 0 - // remainder=0 -> use T=f10 - (p7) fma.s1 f8=f10,f6,f10 - nop.i 0 + nop.m 0 + // p8=1 -> remainder = 1 -> use FR_T1 + (p8) fma.s1 f8 = FR_T1, FR_R, FR_T1 + // argument is of the form 2^(3*k) ? + cmp.eq p14, p7 = GR_I1, r0 } + + {.mfi - nop.m 0 - // remainder =1 -> use f11 - (p8) fma.s1 f8=f11,f6,f11 - nop.i 0;; + nop.m 0 + // p12=1 -> remainder=2 -> result = T+T*P_2 + (p12) fma.s1 f8 = FR_T2, FR_R, FR_T2 + nop.i 0 ;; } + + +.pred.rel "mutex", p14, p7 {.mfi - nop.m 0 - // result=T+T*P_2 - (p12) fma.s.s0 f8=f8,f6,f8 - nop.i 0;; + nop.m 0 + // if argument is sgn*2^{3*(expon/3)} + (p14) fma.s.s0 f8 = FR_2M63, FR_ARG, f0 + nop.i 0 } {.mfb - nop.m 0 - // T*=sgn*2^{expon/3} - (p6) fma.s.s0 f8=f8,f14,f0 - br.ret.sptk b0;; + nop.m 0 + // T* = sgn*2^{expon/3} + (p7) fma.s.s0 f8 = f8, FR_ARG, f0 + br.ret.sptk b0 ;; } -.endp cbrtf -ASM_SIZE_DIRECTIVE(cbrtf) + + +GLOBAL_LIBM_END(cbrtf) + + + diff --git a/sysdeps/ia64/fpu/s_cbrtl.S b/sysdeps/ia64/fpu/s_cbrtl.S index d4bbf8fdbf..3e621e2c12 100644 --- a/sysdeps/ia64/fpu/s_cbrtl.S +++ b/sysdeps/ia64/fpu/s_cbrtl.S @@ -1,11 +1,10 @@ -.file "cbrtl.asm" +.file "cbrtl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Shane Story, and Ping Tak Peter Tang -// of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -21,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -36,11 +35,13 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 4/28/00: Initial version +// 04/28/00 Initial version +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/06/03 Reordered header: .section, .global, .proc, .align // // API //============================================================== @@ -95,29 +96,26 @@ // r2-r3, r23-r30 // p6,p7,p12 -#include "libm_support.h" + // Data tables //============================================================== -#ifdef _LIBC -.rodata -#else -.data -#endif +RODATA .align 16 -poly_coeffs: -ASM_TYPE_DIRECTIVE(poly_coeffs,@object) +LOCAL_OBJECT_START(poly_coeffs) + data8 0xaaaaaaaaaaaaaab1, 0x00003ffd // C_1 data8 0xe38e38e38e38e3e0, 0x00003ffb // C_2 data8 0x3faf9add3c0be9a6, 0x3fa511e8d2b1f749 // C_3, C_4 data8 0x3f9ee71b2c6ebe99, 0x3f9809180fd0340c // C_5, C_6 -ASM_SIZE_DIRECTIVE(poly_coeffs) +LOCAL_OBJECT_END(poly_coeffs) + + +LOCAL_OBJECT_START(T_table) -T_table: -ASM_TYPE_DIRECTIVE(T_table,@object) data8 0x80155c748c374836, 0x8040404b0879f7f9 data8 0x806b5dce4b405c10, 0x8096b586974669b1 @@ -503,14 +501,15 @@ data8 0xfec316fecaf3f2ab, 0xfeecfdaf33fadb80 data8 0xff16fffe2fa8fad6, 0xff411e0ba9db886d data8 0xff6b57f7c33e4e9a, 0xff95ade2d1bd7358 data8 0xffc01fed60f86fb5, 0xffeaae3832b63956 -ASM_SIZE_DIRECTIVE(T_table) +LOCAL_OBJECT_END(T_table) -D_table: -ASM_TYPE_DIRECTIVE(D_table,@object) + +LOCAL_OBJECT_START(D_table) + data4 0x1e50f488, 0x1ebdc559, 0x1e649ec1, 0x9eed9b2c data4 0x9e511c44, 0x9ec6d551, 
0x9eefe248, 0x9e313854 data4 0x9f54ff18, 0x9d231411, 0x1ee5d63c, 0x9edf6b95 @@ -703,25 +702,16 @@ data4 0x9eafd508, 0x9ef0e9fc, 0x1d1307ac, 0x1eecee20 data4 0x1cf60c6f, 0x9d556216, 0x9eaed175, 0x9ec919f4 data4 0x1ec2c988, 0x1cd82772, 0x9dc99456, 0x1eab0467 data4 0x1e89b36f, 0x1c757944, 0x1eef9abd, 0x9e98664d -ASM_SIZE_DIRECTIVE(D_table) - - +LOCAL_OBJECT_END(D_table) - -.align 32 -.global cbrtl# - .section .text -.proc cbrtl# -.align 32 -cbrtl: - +GLOBAL_LIBM_ENTRY(cbrtl) { .mfi getf.sig r3=f8 // will continue only for normal/denormal numbers -(p0) fclass.nm.unc p12,p7 = f8, 0x1b + fclass.nm.unc p12,p7 = f8, 0x1b // r2 = pointer to C_1...C_6 followed by T_table addl r2 = @ltoff(poly_coeffs), gp;; } @@ -898,5 +888,5 @@ cbrtl: (p6) fma.s0 f8=f8,f6,f8 br.ret.sptk b0;; } -.endp cbrtl -ASM_SIZE_DIRECTIVE(cbrtl) +GLOBAL_LIBM_END(cbrtl) + diff --git a/sysdeps/ia64/fpu/s_ceil.S b/sysdeps/ia64/fpu/s_ceil.S index f7e6d2cfa6..d1d2980618 100644 --- a/sysdeps/ia64/fpu/s_ceil.S +++ b/sysdeps/ia64/fpu/s_ceil.S @@ -1,10 +1,10 @@ .file "ceil.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,90 +20,67 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
// - -#include "libm_support.h" - -.align 32 -.global ceil# - -.section .text -.proc ceil# -.align 32 - // History //============================================================== -// 2/02/00: Initial version -// 6/13/00: Improved speed -// 6/27/00: Eliminated incorrect invalid flag setting +// 02/02/00 Initial version +// 06/13/00 Improved speed +// 06/27/00 Eliminated incorrect invalid flag setting +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/28/03 Improved performance +//============================================================== // API //============================================================== // double ceil(double x) +//============================================================== -// general input registers: - -ceil_GR_FFFF = r14 -ceil_GR_signexp = r15 -ceil_GR_exponent = r16 -ceil_GR_expmask = r17 -ceil_GR_bigexp = r18 - - -// predicate registers used: +// general input registers: +// r14 - r19 -// p6 ==> Input is NaN, infinity, zero -// p7 ==> Input is denormal -// p8 ==> Input is <0 -// p9 ==> Input is >=0 -// p10 ==> Input is already an integer (bigger than largest integer) -// p11 ==> Input is not a large integer -// p12 ==> Input is a smaller integer -// p13 ==> Input is not an even integer, so inexact must be set -// p14 ==> Input is between -1 and 0, so result will be -0 and inexact +rSignexp = r14 +rExp = r15 +rExpMask = r16 +rBigexp = r17 +rM1 = r18 +rSignexpM1 = r19 +// floating-point registers: +// f8 - f13 -// floating-point registers used: +fXInt = f9 +fNormX = f10 +fTmp = f11 +fAdj = f12 +fPreResult = f13 -CEIL_SIGNED_ZERO = f7 -CEIL_NORM_f8 = f9 -CEIL_FFFF = f10 -CEIL_INEXACT = f11 -CEIL_FLOAT_INT_f8 = f12 -CEIL_INT_f8 = f13 -CEIL_adj = f14 -CEIL_MINUS_ONE = f15 +// predicate registers used: +// p6 - p10 // Overview of operation //============================================================== - // double ceil(double x) -// Return an integer value (represented as a double) that is the smallest +// Return an integer value 
(represented as a double) that is the smallest // value not less than x // This is x rounded toward +infinity to an integral value. // Inexact is set if x != ceil(x) -// ************************************************************************** - -// Set denormal flag for denormal input and -// and take denormal fault if necessary. - -// Is the input an integer value already? +//============================================================== // double_extended // if the exponent is > 1003e => 3F(true) = 63(decimal) @@ -124,139 +101,124 @@ CEIL_MINUS_ONE = f15 // If we multiply by 2^23, we no longer have a fractional part // So input is an integer value already. -// If x is NAN, ZERO, or INFINITY, then return - -// qnan snan inf norm unorm 0 -+ -// 1 1 1 0 0 1 11 0xe7 - -ceil: +.section .text +GLOBAL_LIBM_ENTRY(ceil) { .mfi - getf.exp ceil_GR_signexp = f8 - fcvt.fx.trunc.s1 CEIL_INT_f8 = f8 - addl ceil_GR_bigexp = 0x10033, r0 + getf.exp rSignexp = f8 // Get signexp, recompute if unorm + fclass.m p7,p0 = f8, 0x0b // Test x unorm + addl rBigexp = 0x10033, r0 // Set exponent at which is integer } { .mfi - addl ceil_GR_FFFF = -1,r0 - fcmp.lt.s1 p8,p9 = f8,f0 - mov ceil_GR_expmask = 0x1FFFF ;; + mov rM1 = -1 // Set all ones + fcvt.fx.trunc.s1 fXInt = f8 // Convert to int in significand + mov rExpMask = 0x1FFFF // Form exponent mask } +;; -// p7 ==> denorm { .mfi - setf.sig CEIL_FFFF = ceil_GR_FFFF - fclass.m p7,p0 = f8, 0x0b - nop.i 999 + mov rSignexpM1 = 0x2FFFF // Form signexp of -1 + fcmp.lt.s1 p8,p9 = f8, f0 // Test x < 0 + nop.i 0 } -{ .mfi - nop.m 999 - fnorm CEIL_NORM_f8 = f8 - nop.i 999 ;; +{ .mfb + setf.sig fTmp = rM1 // Make const for setting inexact + fnorm.s1 fNormX = f8 // Normalize input +(p7) br.cond.spnt CEIL_UNORM // Branch if x unorm } +;; -// Form 0 with sign of input in case negative zero is needed -{ .mfi - nop.m 999 - fmerge.s CEIL_SIGNED_ZERO = f8, f0 - nop.i 999 -} +CEIL_COMMON: +// Return here from CEIL_UNORM { .mfi - nop.m 999 - fsub.s1 
CEIL_MINUS_ONE = f0, f1 - nop.i 999 ;; -} - -// p6 ==> NAN, INF, ZERO -{ .mfb - nop.m 999 - fclass.m p6,p10 = f8, 0xe7 -(p7) br.cond.spnt L(CEIL_DENORM) ;; + nop.m 0 + fclass.m p6,p0 = f8, 0x1e7 // Test x natval, nan, inf, 0 + nop.i 0 } +;; -L(CEIL_COMMON): .pred.rel "mutex",p8,p9 -// Set adjustment to add to trunc(x) for result -// If x>0, adjustment is 1.0 -// If x<=0, adjustment is 0.0 { .mfi - and ceil_GR_exponent = ceil_GR_signexp, ceil_GR_expmask -(p9) fadd.s1 CEIL_adj = f1,f0 - nop.i 999 + nop.m 0 +(p8) fma.s1 fAdj = f0, f0, f0 // If x < 0, adjustment is 0 + nop.i 0 } { .mfi - nop.m 999 -(p8) fadd.s1 CEIL_adj = f0,f0 - nop.i 999 ;; + nop.m 0 +(p9) fma.s1 fAdj = f1, f1, f0 // If x > 0, adjustment is +1 + nop.i 0 } +;; { .mfi -(p10) cmp.ge.unc p10,p11 = ceil_GR_exponent, ceil_GR_bigexp -(p6) fnorm.d f8 = f8 - nop.i 999 ;; + nop.m 0 + fcvt.xf fPreResult = fXInt // trunc(x) + nop.i 0 } - -{ .mfi - nop.m 999 -(p11) fcvt.xf CEIL_FLOAT_INT_f8 = CEIL_INT_f8 - nop.i 999 ;; +{ .mfb + nop.m 0 +(p6) fma.d.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf, 0 +(p6) br.ret.spnt b0 // Exit if x natval, nan, inf, 0 } +;; -{ .mfi - nop.m 999 -(p10) fnorm.d f8 = CEIL_NORM_f8 - nop.i 999 ;; +{ .mmi + and rExp = rSignexp, rExpMask // Get biased exponent +;; + cmp.ge p7,p6 = rExp, rBigexp // Is |x| >= 2^52? +(p8) cmp.lt.unc p10,p0 = rSignexp, rSignexpM1 // Is -1 < x < 0? } +;; -// Is -1 < x < 0? If so, result will be -0. Special case it with p14 set. 
+// If -1 < x < 0, we turn off p6 and compute result as -0 { .mfi - nop.m 999 -(p8) fcmp.gt.unc.s1 p14,p0 = CEIL_NORM_f8, CEIL_MINUS_ONE - nop.i 999 ;; +(p10) cmp.ne p6,p0 = r0,r0 +(p10) fmerge.s f8 = fNormX, f0 + nop.i 0 } +;; +.pred.rel "mutex",p6,p7 { .mfi -(p14) cmp.ne p11,p0 = r0,r0 -(p14) fnorm.d f8 = CEIL_SIGNED_ZERO - nop.i 999 + nop.m 0 +(p6) fma.d.s0 f8 = fPreResult, f1, fAdj // Result if !int, |x| < 2^52 + nop.i 0 } { .mfi - nop.m 999 -(p14) fmpy.s0 CEIL_INEXACT = CEIL_FFFF,CEIL_FFFF - nop.i 999 ;; + nop.m 0 +(p7) fma.d.s0 f8 = fNormX, f1, f0 // Result, if |x| >= 2^52 +(p10) cmp.eq p6,p0 = r0,r0 // If -1 < x < 0, turn on p6 again } +;; { .mfi - nop.m 999 -(p11) fadd.d f8 = CEIL_FLOAT_INT_f8,CEIL_adj - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p11) fcmp.eq.unc.s1 p12,p13 = CEIL_FLOAT_INT_f8, CEIL_NORM_f8 - nop.i 999 ;; + nop.m 0 +(p6) fcmp.eq.unc.s1 p8, p9 = fPreResult, fNormX // Is trunc(x) = x ? + nop.i 0 } +;; -// Set inexact if result not equal to input { .mfi - nop.m 999 -(p13) fmpy.s0 CEIL_INEXACT = CEIL_FFFF,CEIL_FFFF - nop.i 999 + nop.m 0 +(p9) fmpy.s0 fTmp = fTmp, fTmp // Dummy to set inexact + nop.i 0 } -// Set result to input if integer { .mfb - nop.m 999 -(p12) fnorm.d f8 = CEIL_NORM_f8 - br.ret.sptk b0 ;; + nop.m 0 +(p8) fma.d.s0 f8 = fNormX, f1, f0 // If x int, result normalized x + br.ret.sptk b0 // Exit main path, 0 < |x| < 2^52 } +;; + -// Here if input denorm -L(CEIL_DENORM): +CEIL_UNORM: +// Here if x unorm { .mfb - getf.exp ceil_GR_signexp = CEIL_NORM_f8 - fcvt.fx.trunc.s1 CEIL_INT_f8 = CEIL_NORM_f8 - br.cond.sptk L(CEIL_COMMON) ;; + getf.exp rSignexp = fNormX // Get signexp, recompute if unorm + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + br.cond.sptk CEIL_COMMON // Return to main path } +;; -.endp ceil -ASM_SIZE_DIRECTIVE(ceil) +GLOBAL_LIBM_END(ceil) diff --git a/sysdeps/ia64/fpu/s_ceilf.S b/sysdeps/ia64/fpu/s_ceilf.S index d1011052e8..051534a202 100644 --- a/sysdeps/ia64/fpu/s_ceilf.S +++ b/sysdeps/ia64/fpu/s_ceilf.S @@ 
-1,10 +1,10 @@ .file "ceilf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,90 +20,67 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // - -#include "libm_support.h" - -.align 32 -.global ceilf# - -.section .text -.proc ceilf# -.align 32 - // History //============================================================== -// 2/02/00: Initial version -// 6/13/00: Improved speed -// 6/27/00: Eliminated incorrect invalid flag setting +// 02/02/00 Initial version +// 06/13/00 Improved speed +// 06/27/00 Eliminated incorrect invalid flag setting +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/28/03 Improved performance +//============================================================== // API //============================================================== // float ceilf(float x) +//============================================================== -// general input registers: - -ceil_GR_FFFF = r14 -ceil_GR_signexp = r15 -ceil_GR_exponent = r16 -ceil_GR_expmask = r17 -ceil_GR_bigexp = r18 - - -// predicate registers used: +// general input registers: +// r14 - r19 -// p6 ==> Input is NaN, infinity, zero -// p7 ==> Input is denormal -// p8 ==> Input is <0 -// p9 ==> Input is >=0 -// p10 ==> Input is already an integer (bigger than largest integer) -// p11 ==> Input is not a large integer -// p12 ==> Input is a smaller integer -// p13 ==> Input is not an even integer, so inexact must be set -// p14 ==> Input is between -1 and 0, so result will be -0 and inexact +rSignexp = r14 +rExp = r15 +rExpMask = r16 +rBigexp = r17 +rM1 = r18 +rSignexpM1 = r19 +// floating-point registers: +// f8 - f13 -// floating-point registers used: +fXInt = f9 +fNormX = 
f10 +fTmp = f11 +fAdj = f12 +fPreResult = f13 -CEIL_SIGNED_ZERO = f7 -CEIL_NORM_f8 = f9 -CEIL_FFFF = f10 -CEIL_INEXACT = f11 -CEIL_FLOAT_INT_f8 = f12 -CEIL_INT_f8 = f13 -CEIL_adj = f14 -CEIL_MINUS_ONE = f15 +// predicate registers used: +// p6 - p10 // Overview of operation //============================================================== - // float ceilf(float x) -// Return an integer value (represented as a float) that is the smallest +// Return an integer value (represented as a float) that is the smallest // value not less than x // This is x rounded toward +infinity to an integral value. // Inexact is set if x != ceilf(x) -// ************************************************************************** - -// Set denormal flag for denormal input and -// and take denormal fault if necessary. - -// Is the input an integer value already? +//============================================================== // double_extended // if the exponent is > 1003e => 3F(true) = 63(decimal) @@ -124,139 +101,124 @@ CEIL_MINUS_ONE = f15 // If we multiply by 2^23, we no longer have a fractional part // So input is an integer value already. 
-// If x is NAN, ZERO, or INFINITY, then return - -// qnan snan inf norm unorm 0 -+ -// 1 1 1 0 0 1 11 0xe7 - -ceilf: +.section .text +GLOBAL_LIBM_ENTRY(ceilf) { .mfi - getf.exp ceil_GR_signexp = f8 - fcvt.fx.trunc.s1 CEIL_INT_f8 = f8 - addl ceil_GR_bigexp = 0x10016, r0 + getf.exp rSignexp = f8 // Get signexp, recompute if unorm + fclass.m p7,p0 = f8, 0x0b // Test x unorm + addl rBigexp = 0x10016, r0 // Set exponent at which is integer } { .mfi - addl ceil_GR_FFFF = -1,r0 - fcmp.lt.s1 p8,p9 = f8,f0 - mov ceil_GR_expmask = 0x1FFFF ;; + mov rM1 = -1 // Set all ones + fcvt.fx.trunc.s1 fXInt = f8 // Convert to int in significand + mov rExpMask = 0x1FFFF // Form exponent mask } +;; -// p7 ==> denorm { .mfi - setf.sig CEIL_FFFF = ceil_GR_FFFF - fclass.m p7,p0 = f8, 0x0b - nop.i 999 + mov rSignexpM1 = 0x2FFFF // Form signexp of -1 + fcmp.lt.s1 p8,p9 = f8, f0 // Test x < 0 + nop.i 0 } -{ .mfi - nop.m 999 - fnorm CEIL_NORM_f8 = f8 - nop.i 999 ;; +{ .mfb + setf.sig fTmp = rM1 // Make const for setting inexact + fnorm.s1 fNormX = f8 // Normalize input +(p7) br.cond.spnt CEIL_UNORM // Branch if x unorm } +;; -// Form 0 with sign of input in case negative zero is needed -{ .mfi - nop.m 999 - fmerge.s CEIL_SIGNED_ZERO = f8, f0 - nop.i 999 -} +CEIL_COMMON: +// Return here from CEIL_UNORM { .mfi - nop.m 999 - fsub.s1 CEIL_MINUS_ONE = f0, f1 - nop.i 999 ;; -} - -// p6 ==> NAN, INF, ZERO -{ .mfb - nop.m 999 - fclass.m p6,p10 = f8, 0xe7 -(p7) br.cond.spnt L(CEIL_DENORM) ;; + nop.m 0 + fclass.m p6,p0 = f8, 0x1e7 // Test x natval, nan, inf, 0 + nop.i 0 } +;; -L(CEIL_COMMON): .pred.rel "mutex",p8,p9 -// Set adjustment to add to trunc(x) for result -// If x>0, adjustment is 1.0 -// If x<=0, adjustment is 0.0 { .mfi - and ceil_GR_exponent = ceil_GR_signexp, ceil_GR_expmask -(p9) fadd.s1 CEIL_adj = f1,f0 - nop.i 999 + nop.m 0 +(p8) fma.s1 fAdj = f0, f0, f0 // If x < 0, adjustment is 0 + nop.i 0 } { .mfi - nop.m 999 -(p8) fadd.s1 CEIL_adj = f0,f0 - nop.i 999 ;; + nop.m 0 +(p9) fma.s1 fAdj = 
f1, f1, f0 // If x > 0, adjustment is +1 + nop.i 0 } +;; { .mfi -(p10) cmp.ge.unc p10,p11 = ceil_GR_exponent, ceil_GR_bigexp -(p6) fnorm.s f8 = f8 - nop.i 999 ;; + nop.m 0 + fcvt.xf fPreResult = fXInt // trunc(x) + nop.i 0 } - -{ .mfi - nop.m 999 -(p11) fcvt.xf CEIL_FLOAT_INT_f8 = CEIL_INT_f8 - nop.i 999 ;; +{ .mfb + nop.m 0 +(p6) fma.s.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf, 0 +(p6) br.ret.spnt b0 // Exit if x natval, nan, inf, 0 } +;; -{ .mfi - nop.m 999 -(p10) fnorm.s f8 = CEIL_NORM_f8 - nop.i 999 ;; +{ .mmi + and rExp = rSignexp, rExpMask // Get biased exponent +;; + cmp.ge p7,p6 = rExp, rBigexp // Is |x| >= 2^23? +(p8) cmp.lt.unc p10,p0 = rSignexp, rSignexpM1 // Is -1 < x < 0? } +;; -// Is -1 < x < 0? If so, result will be -0. Special case it with p14 set. +// If -1 < x < 0, we turn off p6 and compute result as -0 { .mfi - nop.m 999 -(p8) fcmp.gt.unc.s1 p14,p0 = CEIL_NORM_f8, CEIL_MINUS_ONE - nop.i 999 ;; +(p10) cmp.ne p6,p0 = r0,r0 +(p10) fmerge.s f8 = fNormX, f0 + nop.i 0 } +;; +.pred.rel "mutex",p6,p7 { .mfi -(p14) cmp.ne p11,p0 = r0,r0 -(p14) fnorm.s f8 = CEIL_SIGNED_ZERO - nop.i 999 + nop.m 0 +(p6) fma.s.s0 f8 = fPreResult, f1, fAdj // Result if !int, |x| < 2^23 + nop.i 0 } { .mfi - nop.m 999 -(p14) fmpy.s0 CEIL_INEXACT = CEIL_FFFF,CEIL_FFFF - nop.i 999 ;; + nop.m 0 +(p7) fma.s.s0 f8 = fNormX, f1, f0 // Result, if |x| >= 2^23 +(p10) cmp.eq p6,p0 = r0,r0 // If -1 < x < 0, turn on p6 again } +;; { .mfi - nop.m 999 -(p11) fadd.s f8 = CEIL_FLOAT_INT_f8,CEIL_adj - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p11) fcmp.eq.unc.s1 p12,p13 = CEIL_FLOAT_INT_f8, CEIL_NORM_f8 - nop.i 999 ;; + nop.m 0 +(p6) fcmp.eq.unc.s1 p8, p9 = fPreResult, fNormX // Is trunc(x) = x ? 
+ nop.i 0 } +;; -// Set inexact if result not equal to input { .mfi - nop.m 999 -(p13) fmpy.s0 CEIL_INEXACT = CEIL_FFFF,CEIL_FFFF - nop.i 999 + nop.m 0 +(p9) fmpy.s0 fTmp = fTmp, fTmp // Dummy to set inexact + nop.i 0 } -// Set result to input if integer { .mfb - nop.m 999 -(p12) fnorm.s f8 = CEIL_NORM_f8 - br.ret.sptk b0 ;; + nop.m 0 +(p8) fma.s.s0 f8 = fNormX, f1, f0 // If x int, result normalized x + br.ret.sptk b0 // Exit main path, 0 < |x| < 2^23 } +;; + -// Here if input denorm -L(CEIL_DENORM): +CEIL_UNORM: +// Here if x unorm { .mfb - getf.exp ceil_GR_signexp = CEIL_NORM_f8 - fcvt.fx.trunc.s1 CEIL_INT_f8 = CEIL_NORM_f8 - br.cond.sptk L(CEIL_COMMON) ;; + getf.exp rSignexp = fNormX // Get signexp, recompute if unorm + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + br.cond.sptk CEIL_COMMON // Return to main path } +;; -.endp ceilf -ASM_SIZE_DIRECTIVE(ceilf) +GLOBAL_LIBM_END(ceilf) diff --git a/sysdeps/ia64/fpu/s_ceill.S b/sysdeps/ia64/fpu/s_ceill.S index d3d8719584..71cb01d3fa 100644 --- a/sysdeps/ia64/fpu/s_ceill.S +++ b/sysdeps/ia64/fpu/s_ceill.S @@ -1,10 +1,10 @@ .file "ceill.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,90 +20,67 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
// - -#include "libm_support.h" - -.align 32 -.global ceill# - -.section .text -.proc ceill# -.align 32 - // History //============================================================== -// 2/02/00: Initial version -// 6/13/00: Improved speed -// 6/27/00: Eliminated incorrect invalid flag setting +// 02/02/00 Initial version +// 06/13/00 Improved speed +// 06/27/00 Eliminated incorrect invalid flag setting +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/28/03 Improved performance +//============================================================== // API //============================================================== -// double ceill(double x) - -// general input registers: - -ceil_GR_FFFF = r14 -ceil_GR_signexp = r15 -ceil_GR_exponent = r16 -ceil_GR_expmask = r17 -ceil_GR_bigexp = r18 - +// long double ceill(long double x) +//============================================================== -// predicate registers used: +// general input registers: +// r14 - r19 -// p6 ==> Input is NaN, infinity, zero -// p7 ==> Input is denormal -// p8 ==> Input is <0 -// p9 ==> Input is >=0 -// p10 ==> Input is already an integer (bigger than largest integer) -// p11 ==> Input is not a large integer -// p12 ==> Input is a smaller integer -// p13 ==> Input is not an even integer, so inexact must be set -// p14 ==> Input is between -1 and 0, so result will be -0 and inexact +rSignexp = r14 +rExp = r15 +rExpMask = r16 +rBigexp = r17 +rM1 = r18 +rSignexpM1 = r19 +// floating-point registers: +// f8 - f13 -// floating-point registers used: +fXInt = f9 +fNormX = f10 +fTmp = f11 +fAdj = f12 +fPreResult = f13 -CEIL_SIGNED_ZERO = f7 -CEIL_NORM_f8 = f9 -CEIL_FFFF = f10 -CEIL_INEXACT = f11 -CEIL_FLOAT_INT_f8 = f12 -CEIL_INT_f8 = f13 -CEIL_adj = f14 -CEIL_MINUS_ONE = f15 +// predicate registers used: +// p6 - p10 // Overview of operation //============================================================== - // long double ceill(long double x) -// Return an integer value (represented as a long 
double) that is the smallest +// Return an integer value (represented as a long double) that is the smallest // value not less than x // This is x rounded toward +infinity to an integral value. // Inexact is set if x != ceill(x) -// ************************************************************************** - -// Set denormal flag for denormal input and -// and take denormal fault if necessary. - -// Is the input an integer value already? +//============================================================== // double_extended // if the exponent is > 1003e => 3F(true) = 63(decimal) @@ -124,139 +101,124 @@ CEIL_MINUS_ONE = f15 // If we multiply by 2^23, we no longer have a fractional part // So input is an integer value already. -// If x is NAN, ZERO, or INFINITY, then return - -// qnan snan inf norm unorm 0 -+ -// 1 1 1 0 0 1 11 0xe7 - -ceill: +.section .text +GLOBAL_LIBM_ENTRY(ceill) { .mfi - getf.exp ceil_GR_signexp = f8 - fcvt.fx.trunc.s1 CEIL_INT_f8 = f8 - addl ceil_GR_bigexp = 0x1003e, r0 + getf.exp rSignexp = f8 // Get signexp, recompute if unorm + fclass.m p7,p0 = f8, 0x0b // Test x unorm + addl rBigexp = 0x1003e, r0 // Set exponent at which is integer } { .mfi - addl ceil_GR_FFFF = -1,r0 - fcmp.lt.s1 p8,p9 = f8,f0 - mov ceil_GR_expmask = 0x1FFFF ;; + mov rM1 = -1 // Set all ones + fcvt.fx.trunc.s1 fXInt = f8 // Convert to int in significand + mov rExpMask = 0x1FFFF // Form exponent mask } +;; -// p7 ==> denorm { .mfi - setf.sig CEIL_FFFF = ceil_GR_FFFF - fclass.m p7,p0 = f8, 0x0b - nop.i 999 + mov rSignexpM1 = 0x2FFFF // Form signexp of -1 + fcmp.lt.s1 p8,p9 = f8, f0 // Test x < 0 + nop.i 0 } -{ .mfi - nop.m 999 - fnorm CEIL_NORM_f8 = f8 - nop.i 999 ;; +{ .mfb + setf.sig fTmp = rM1 // Make const for setting inexact + fnorm.s1 fNormX = f8 // Normalize input +(p7) br.cond.spnt CEIL_UNORM // Branch if x unorm } +;; -// Form 0 with sign of input in case negative zero is needed -{ .mfi - nop.m 999 - fmerge.s CEIL_SIGNED_ZERO = f8, f0 - nop.i 999 -} +CEIL_COMMON: +// 
Return here from CEIL_UNORM { .mfi - nop.m 999 - fsub.s1 CEIL_MINUS_ONE = f0, f1 - nop.i 999 ;; -} - -// p6 ==> NAN, INF, ZERO -{ .mfb - nop.m 999 - fclass.m p6,p10 = f8, 0xe7 -(p7) br.cond.spnt L(CEIL_DENORM) ;; + nop.m 0 + fclass.m p6,p0 = f8, 0x1e7 // Test x natval, nan, inf, 0 + nop.i 0 } +;; -L(CEIL_COMMON): .pred.rel "mutex",p8,p9 -// Set adjustment to add to trunc(x) for result -// If x>0, adjustment is 1.0 -// If x<=0, adjustment is 0.0 { .mfi - and ceil_GR_exponent = ceil_GR_signexp, ceil_GR_expmask -(p9) fadd.s1 CEIL_adj = f1,f0 - nop.i 999 + nop.m 0 +(p8) fma.s1 fAdj = f0, f0, f0 // If x < 0, adjustment is 0 + nop.i 0 } { .mfi - nop.m 999 -(p8) fadd.s1 CEIL_adj = f0,f0 - nop.i 999 ;; + nop.m 0 +(p9) fma.s1 fAdj = f1, f1, f0 // If x > 0, adjustment is +1 + nop.i 0 } +;; { .mfi -(p10) cmp.ge.unc p10,p11 = ceil_GR_exponent, ceil_GR_bigexp -(p6) fnorm f8 = f8 - nop.i 999 ;; + nop.m 0 + fcvt.xf fPreResult = fXInt // trunc(x) + nop.i 0 } - -{ .mfi - nop.m 999 -(p11) fcvt.xf CEIL_FLOAT_INT_f8 = CEIL_INT_f8 - nop.i 999 ;; +{ .mfb + nop.m 0 +(p6) fma.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf, 0 +(p6) br.ret.spnt b0 // Exit if x natval, nan, inf, 0 } +;; -{ .mfi - nop.m 999 -(p10) fnorm f8 = CEIL_NORM_f8 - nop.i 999 ;; +{ .mmi + and rExp = rSignexp, rExpMask // Get biased exponent +;; + cmp.ge p7,p6 = rExp, rBigexp // Is |x| >= 2^63? +(p8) cmp.lt.unc p10,p0 = rSignexp, rSignexpM1 // Is -1 < x < 0? } +;; -// Is -1 < x < 0? If so, result will be -0. Special case it with p14 set. 
+// If -1 < x < 0, we turn off p6 and compute result as -0 { .mfi - nop.m 999 -(p8) fcmp.gt.unc.s1 p14,p0 = CEIL_NORM_f8, CEIL_MINUS_ONE - nop.i 999 ;; +(p10) cmp.ne p6,p0 = r0,r0 +(p10) fmerge.s f8 = fNormX, f0 + nop.i 0 } +;; +.pred.rel "mutex",p6,p7 { .mfi -(p14) cmp.ne p11,p0 = r0,r0 -(p14) fnorm f8 = CEIL_SIGNED_ZERO - nop.i 999 + nop.m 0 +(p6) fma.s0 f8 = fPreResult, f1, fAdj // Result if !int, |x| < 2^63 + nop.i 0 } { .mfi - nop.m 999 -(p14) fmpy.s0 CEIL_INEXACT = CEIL_FFFF,CEIL_FFFF - nop.i 999 ;; + nop.m 0 +(p7) fma.s0 f8 = fNormX, f1, f0 // Result, if |x| >= 2^63 +(p10) cmp.eq p6,p0 = r0,r0 // If -1 < x < 0, turn on p6 again } +;; { .mfi - nop.m 999 -(p11) fadd f8 = CEIL_FLOAT_INT_f8,CEIL_adj - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p11) fcmp.eq.unc.s1 p12,p13 = CEIL_FLOAT_INT_f8, CEIL_NORM_f8 - nop.i 999 ;; + nop.m 0 +(p6) fcmp.eq.unc.s1 p8, p9 = fPreResult, fNormX // Is trunc(x) = x ? + nop.i 0 } +;; -// Set inexact if result not equal to input { .mfi - nop.m 999 -(p13) fmpy.s0 CEIL_INEXACT = CEIL_FFFF,CEIL_FFFF - nop.i 999 + nop.m 0 +(p9) fmpy.s0 fTmp = fTmp, fTmp // Dummy to set inexact + nop.i 0 } -// Set result to input if integer { .mfb - nop.m 999 -(p12) fnorm f8 = CEIL_NORM_f8 - br.ret.sptk b0 ;; + nop.m 0 +(p8) fma.s0 f8 = fNormX, f1, f0 // If x int, result normalized x + br.ret.sptk b0 // Exit main path, 0 < |x| < 2^63 } +;; + -// Here if input denorm -L(CEIL_DENORM): +CEIL_UNORM: +// Here if x unorm { .mfb - getf.exp ceil_GR_signexp = CEIL_NORM_f8 - fcvt.fx.trunc.s1 CEIL_INT_f8 = CEIL_NORM_f8 - br.cond.sptk L(CEIL_COMMON) ;; + getf.exp rSignexp = fNormX // Get signexp, recompute if unorm + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + br.cond.sptk CEIL_COMMON // Return to main path } +;; -.endp ceill -ASM_SIZE_DIRECTIVE(ceill) +GLOBAL_LIBM_END(ceill) diff --git a/sysdeps/ia64/fpu/s_copysign.S b/sysdeps/ia64/fpu/s_copysign.S index e0d08cb721..0903565ff3 100644 --- a/sysdeps/ia64/fpu/s_copysign.S +++ b/sysdeps/ia64/fpu/s_copysign.S 
@@ -23,12 +23,16 @@ ENTRY (__copysign) { fmerge.s fret0 = farg1, farg0 br.ret.sptk.many rp -} +} END (__copysign) strong_alias (__copysign, __copysignf) strong_alias (__copysign, __copysignl) +strong_alias (__copysign, __libm_copysign) +strong_alias (__copysign, __libm_copysignf) +strong_alias (__copysign, __libm_copysignl) + weak_alias (__copysign, copysign) weak_alias (__copysignf, copysignf) weak_alias (__copysignl, copysignl) diff --git a/sysdeps/ia64/fpu/s_cos.S b/sysdeps/ia64/fpu/s_cos.S index 6540aec724..84c177abab 100644 --- a/sysdeps/ia64/fpu/s_cos.S +++ b/sysdeps/ia64/fpu/s_cos.S @@ -1,10 +1,10 @@ .file "sincos.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,17 +35,22 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial revision -// 4/02/00 Unwind support added. 
-// 6/16/00 Updated tables to enforce symmetry -// 8/31/00 Saved 2 cycles in main path, and 9 in other paths. -// 9/20/00 The updated tables regressed to an old version, so reinstated them +// 02/02/00 Initial version +// 04/02/00 Unwind support added. +// 06/16/00 Updated tables to enforce symmetry +// 08/31/00 Saved 2 cycles in main path, and 9 in other paths. +// 09/20/00 The updated tables regressed to an old version, so reinstated them // 10/18/00 Changed one table entry to ensure symmetry -// 1/03/01 Improved speed, fixed flag settings for small arguments. +// 01/03/01 Improved speed, fixed flag settings for small arguments. +// 02/18/02 Large arguments processing routine excluded +// 05/20/02 Cleaned up namespace and sf0 syntax +// 06/03/02 Ensure inexact flag set for large arg result +// 09/05/02 Work range is widened by strengthened reduction (3 parts of Pi/16) +// 02/10/03 Reordered header: .section, .global, .proc, .align // API //============================================================== @@ -63,9 +68,13 @@ // nfloat = Round result to integer (round-to-nearest) // // r = x - nfloat * pi/2^k -// Do this as (x - nfloat * HIGH(pi/2^k)) - nfloat * LOW(pi/2^k) for increased accuracy. +// Do this as (((x - nfloat * HIGH(pi/2^k)) - +// nfloat * LOW(pi/2^k)) - +// nfloat * LOWEST(pi/2^k)) for increased accuracy. // pi/2^k is stored as two numbers that when added make pi/2^k. // pi/2^k = HIGH(pi/2^k) + LOW(pi/2^k) +// HIGH and LOW parts are rounded toward zero, +// and LOWEST is rounded to nearest.
// // x = (nfloat * pi/2^k) + r // r is small enough that we can use a polynomial approximation @@ -121,7 +130,7 @@ // // as follows // -// Sm = Sin(Mpi/2^k) and Cm = Cos(Mpi/2^k) +// S[m] = Sin(Mpi/2^k) and C[m] = Cos(Mpi/2^k) // rsq = r*r // // @@ -141,23 +150,22 @@ // // P = r + rcub * P // -// Answer = Sm Cos(r) + Cm P +// Answer = S[m] Cos(r) + C[m] P // // Cos(r) = 1 + rsq Q // Cos(r) = 1 + r^2 Q // Cos(r) = 1 + r^2 (q1 + r^2q2 + r^4q3 + r^6q4) // Cos(r) = 1 + r^2q1 + r^4q2 + r^6q3 + r^8q4 + ... // -// Sm Cos(r) = Sm(1 + rsq Q) -// Sm Cos(r) = Sm + Sm rsq Q -// Sm Cos(r) = Sm + s_rsq Q -// Q = Sm + s_rsq Q +// S[m] Cos(r) = S[m](1 + rsq Q) +// S[m] Cos(r) = S[m] + S[m] rsq Q +// S[m] Cos(r) = S[m] + s_rsq Q +// Q = S[m] + s_rsq Q // // Then, // -// Answer = Q + Cm P +// Answer = Q + C[m] P -#include "libm_support.h" // Registers used //============================================================== @@ -174,99 +182,97 @@ // Assembly macros //============================================================== -sind_NORM_f8 = f9 -sind_W = f10 -sind_int_Nfloat = f11 -sind_Nfloat = f12 +sincos_NORM_f8 = f9 +sincos_W = f10 +sincos_int_Nfloat = f11 +sincos_Nfloat = f12 -sind_r = f13 -sind_rsq = f14 -sind_rcub = f15 +sincos_r = f13 +sincos_rsq = f14 +sincos_rcub = f15 +sincos_save_tmp = f15 -sind_Inv_Pi_by_16 = f32 -sind_Pi_by_16_hi = f33 -sind_Pi_by_16_lo = f34 +sincos_Inv_Pi_by_16 = f32 +sincos_Pi_by_16_1 = f33 +sincos_Pi_by_16_2 = f34 -sind_Inv_Pi_by_64 = f35 -sind_Pi_by_64_hi = f36 -sind_Pi_by_64_lo = f37 +sincos_Inv_Pi_by_64 = f35 -sind_Sm = f38 -sind_Cm = f39 +sincos_Pi_by_16_3 = f36 -sind_P1 = f40 -sind_Q1 = f41 -sind_P2 = f42 -sind_Q2 = f43 -sind_P3 = f44 -sind_Q3 = f45 -sind_P4 = f46 -sind_Q4 = f47 +sincos_r_exact = f37 -sind_P_temp1 = f48 -sind_P_temp2 = f49 +sincos_Sm = f38 +sincos_Cm = f39 -sind_Q_temp1 = f50 -sind_Q_temp2 = f51 +sincos_P1 = f40 +sincos_Q1 = f41 +sincos_P2 = f42 +sincos_Q2 = f43 +sincos_P3 = f44 +sincos_Q3 = f45 +sincos_P4 = f46 +sincos_Q4 = f47
-sind_P = f52 -sind_Q = f53 +sincos_P_temp1 = f48 +sincos_P_temp2 = f49 -sind_srsq = f54 +sincos_Q_temp1 = f50 +sincos_Q_temp2 = f51 -sind_SIG_INV_PI_BY_16_2TO61 = f55 -sind_RSHF_2TO61 = f56 -sind_RSHF = f57 -sind_2TOM61 = f58 -sind_NFLOAT = f59 -sind_W_2TO61_RSH = f60 +sincos_P = f52 +sincos_Q = f53 -fp_tmp = f61 +sincos_srsq = f54 + +sincos_SIG_INV_PI_BY_16_2TO61 = f55 +sincos_RSHF_2TO61 = f56 +sincos_RSHF = f57 +sincos_2TOM61 = f58 +sincos_NFLOAT = f59 +sincos_W_2TO61_RSH = f60 + +fp_tmp = f61 ///////////////////////////////////////////////////////////// -sind_AD_1 = r33 -sind_AD_2 = r34 -sind_exp_limit = r35 -sind_r_signexp = r36 -sind_AD_beta_table = r37 -sind_r_sincos = r38 +sincos_AD_1 = r33 +sincos_AD_2 = r34 +sincos_exp_limit = r35 +sincos_r_signexp = r36 +sincos_AD_beta_table = r37 +sincos_r_sincos = r38 -sind_r_exp = r39 -sind_r_17_ones = r40 +sincos_r_exp = r39 +sincos_r_17_ones = r40 -sind_GR_sig_inv_pi_by_16 = r14 -sind_GR_rshf_2to61 = r15 -sind_GR_rshf = r16 -sind_GR_exp_2tom61 = r17 -sind_GR_n = r18 -sind_GR_m = r19 -sind_GR_32m = r19 +sincos_GR_sig_inv_pi_by_16 = r14 +sincos_GR_rshf_2to61 = r15 +sincos_GR_rshf = r16 +sincos_GR_exp_2tom61 = r17 +sincos_GR_n = r18 +sincos_GR_m = r19 +sincos_GR_32m = r19 +sincos_GR_all_ones = r19 -gr_tmp = r41 -GR_SAVE_PFS = r41 -GR_SAVE_B0 = r42 -GR_SAVE_GP = r43 +gr_tmp = r41 +GR_SAVE_PFS = r41 +GR_SAVE_B0 = r42 +GR_SAVE_GP = r43 -#ifdef _LIBC -.rodata -#else -.data -#endif +RODATA +// Pi/16 parts .align 16 -double_sind_pi: -ASM_TYPE_DIRECTIVE(double_sind_pi,@object) -// data8 0xA2F9836E4E44152A, 0x00004001 // 16/pi (significand loaded w/ setf) -// c90fdaa22168c234 - data8 0xC90FDAA22168C234, 0x00003FFC // pi/16 hi -// c4c6628b80dc1cd1 29024e088a - data8 0xC4C6628B80DC1CD1, 0x00003FBC // pi/16 lo -ASM_SIZE_DIRECTIVE(double_sind_pi) - -double_sind_pq_k4: -ASM_TYPE_DIRECTIVE(double_sind_pq_k4,@object) +LOCAL_OBJECT_START(double_sincos_pi) + data8 0xC90FDAA22168C234, 0x00003FFC // pi/16 1st part + data8 
0xC4C6628B80DC1CD1, 0x00003FBC // pi/16 2nd part + data8 0xA4093822299F31D0, 0x00003F7A // pi/16 3rd part +LOCAL_OBJECT_END(double_sincos_pi) + +// Coefficients for polynomials +LOCAL_OBJECT_START(double_sincos_pq_k4) data8 0x3EC71C963717C63A // P4 data8 0x3EF9FFBA8F191AE6 // Q4 data8 0xBF2A01A00F4E11A8 // P3 @@ -275,125 +281,119 @@ ASM_TYPE_DIRECTIVE(double_sind_pq_k4,@object) data8 0x3FA555555554DD45 // Q2 data8 0xBFC5555555555555 // P1 data8 0xBFDFFFFFFFFFFFFC // Q1 -ASM_SIZE_DIRECTIVE(double_sind_pq_k4) +LOCAL_OBJECT_END(double_sincos_pq_k4) +// Sincos table (S[m], C[m]) +LOCAL_OBJECT_START(double_sin_cos_beta_k4) -double_sin_cos_beta_k4: -ASM_TYPE_DIRECTIVE(double_sin_cos_beta_k4,@object) data8 0x0000000000000000 , 0x00000000 // sin( 0 pi/16) S0 data8 0x8000000000000000 , 0x00003fff // cos( 0 pi/16) C0 - +// data8 0xc7c5c1e34d3055b3 , 0x00003ffc // sin( 1 pi/16) S1 data8 0xfb14be7fbae58157 , 0x00003ffe // cos( 1 pi/16) C1 - +// data8 0xc3ef1535754b168e , 0x00003ffd // sin( 2 pi/16) S2 data8 0xec835e79946a3146 , 0x00003ffe // cos( 2 pi/16) C2 - +// data8 0x8e39d9cd73464364 , 0x00003ffe // sin( 3 pi/16) S3 data8 0xd4db3148750d181a , 0x00003ffe // cos( 3 pi/16) C3 - +// data8 0xb504f333f9de6484 , 0x00003ffe // sin( 4 pi/16) S4 data8 0xb504f333f9de6484 , 0x00003ffe // cos( 4 pi/16) C4 - - +// +// data8 0xd4db3148750d181a , 0x00003ffe // sin( 5 pi/16) C3 data8 0x8e39d9cd73464364 , 0x00003ffe // cos( 5 pi/16) S3 - +// data8 0xec835e79946a3146 , 0x00003ffe // sin( 6 pi/16) C2 data8 0xc3ef1535754b168e , 0x00003ffd // cos( 6 pi/16) S2 - +// data8 0xfb14be7fbae58157 , 0x00003ffe // sin( 7 pi/16) C1 data8 0xc7c5c1e34d3055b3 , 0x00003ffc // cos( 7 pi/16) S1 - +// data8 0x8000000000000000 , 0x00003fff // sin( 8 pi/16) C0 data8 0x0000000000000000 , 0x00000000 // cos( 8 pi/16) S0 - - +// +// data8 0xfb14be7fbae58157 , 0x00003ffe // sin( 9 pi/16) C1 data8 0xc7c5c1e34d3055b3 , 0x0000bffc // cos( 9 pi/16) -S1 - +// data8 0xec835e79946a3146 , 0x00003ffe // sin(10 pi/16) C2 data8 
0xc3ef1535754b168e , 0x0000bffd // cos(10 pi/16) -S2 - +// data8 0xd4db3148750d181a , 0x00003ffe // sin(11 pi/16) C3 data8 0x8e39d9cd73464364 , 0x0000bffe // cos(11 pi/16) -S3 - +// data8 0xb504f333f9de6484 , 0x00003ffe // sin(12 pi/16) S4 data8 0xb504f333f9de6484 , 0x0000bffe // cos(12 pi/16) -S4 - - +// +// data8 0x8e39d9cd73464364 , 0x00003ffe // sin(13 pi/16) S3 data8 0xd4db3148750d181a , 0x0000bffe // cos(13 pi/16) -C3 - +// data8 0xc3ef1535754b168e , 0x00003ffd // sin(14 pi/16) S2 data8 0xec835e79946a3146 , 0x0000bffe // cos(14 pi/16) -C2 - +// data8 0xc7c5c1e34d3055b3 , 0x00003ffc // sin(15 pi/16) S1 data8 0xfb14be7fbae58157 , 0x0000bffe // cos(15 pi/16) -C1 - +// data8 0x0000000000000000 , 0x00000000 // sin(16 pi/16) S0 data8 0x8000000000000000 , 0x0000bfff // cos(16 pi/16) -C0 - - +// +// data8 0xc7c5c1e34d3055b3 , 0x0000bffc // sin(17 pi/16) -S1 data8 0xfb14be7fbae58157 , 0x0000bffe // cos(17 pi/16) -C1 - +// data8 0xc3ef1535754b168e , 0x0000bffd // sin(18 pi/16) -S2 data8 0xec835e79946a3146 , 0x0000bffe // cos(18 pi/16) -C2 - +// data8 0x8e39d9cd73464364 , 0x0000bffe // sin(19 pi/16) -S3 data8 0xd4db3148750d181a , 0x0000bffe // cos(19 pi/16) -C3 - +// data8 0xb504f333f9de6484 , 0x0000bffe // sin(20 pi/16) -S4 data8 0xb504f333f9de6484 , 0x0000bffe // cos(20 pi/16) -S4 - - +// +// data8 0xd4db3148750d181a , 0x0000bffe // sin(21 pi/16) -C3 data8 0x8e39d9cd73464364 , 0x0000bffe // cos(21 pi/16) -S3 - +// data8 0xec835e79946a3146 , 0x0000bffe // sin(22 pi/16) -C2 data8 0xc3ef1535754b168e , 0x0000bffd // cos(22 pi/16) -S2 - +// data8 0xfb14be7fbae58157 , 0x0000bffe // sin(23 pi/16) -C1 data8 0xc7c5c1e34d3055b3 , 0x0000bffc // cos(23 pi/16) -S1 - +// data8 0x8000000000000000 , 0x0000bfff // sin(24 pi/16) -C0 data8 0x0000000000000000 , 0x00000000 // cos(24 pi/16) S0 - - +// +// data8 0xfb14be7fbae58157 , 0x0000bffe // sin(25 pi/16) -C1 data8 0xc7c5c1e34d3055b3 , 0x00003ffc // cos(25 pi/16) S1 - +// data8 0xec835e79946a3146 , 0x0000bffe // sin(26 pi/16) -C2 data8 
0xc3ef1535754b168e , 0x00003ffd // cos(26 pi/16) S2 - +// data8 0xd4db3148750d181a , 0x0000bffe // sin(27 pi/16) -C3 data8 0x8e39d9cd73464364 , 0x00003ffe // cos(27 pi/16) S3 - +// data8 0xb504f333f9de6484 , 0x0000bffe // sin(28 pi/16) -S4 data8 0xb504f333f9de6484 , 0x00003ffe // cos(28 pi/16) S4 - - +// +// data8 0x8e39d9cd73464364 , 0x0000bffe // sin(29 pi/16) -S3 data8 0xd4db3148750d181a , 0x00003ffe // cos(29 pi/16) C3 - +// data8 0xc3ef1535754b168e , 0x0000bffd // sin(30 pi/16) -S2 data8 0xec835e79946a3146 , 0x00003ffe // cos(30 pi/16) C2 - +// data8 0xc7c5c1e34d3055b3 , 0x0000bffc // sin(31 pi/16) -S1 data8 0xfb14be7fbae58157 , 0x00003ffe // cos(31 pi/16) C1 - +// data8 0x0000000000000000 , 0x00000000 // sin(32 pi/16) S0 data8 0x8000000000000000 , 0x00003fff // cos(32 pi/16) C0 -ASM_SIZE_DIRECTIVE(double_sin_cos_beta_k4) +LOCAL_OBJECT_END(double_sin_cos_beta_k4) -.align 32 -.global sin# -.global cos# -#ifdef _LIBC -.global __sin# -.global __cos# -#endif +.section .text //////////////////////////////////////////////////////// // There are two entry points: sin and cos @@ -402,85 +402,63 @@ ASM_SIZE_DIRECTIVE(double_sin_cos_beta_k4) // If from sin, p8 is true // If from cos, p9 is true -.section .text -.proc sin# -#ifdef _LIBC -.proc __sin# -#endif -.align 32 - -sin: -#ifdef _LIBC -__sin: -#endif +GLOBAL_IEEE754_ENTRY(sin) { .mlx - alloc r32=ar.pfs,1,13,0,0 - movl sind_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A // significand of 16/pi + alloc r32 = ar.pfs, 1, 13, 0, 0 + movl sincos_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A // signd of 16/pi } { .mlx - addl sind_AD_1 = @ltoff(double_sind_pi), gp - movl sind_GR_rshf_2to61 = 0x47b8000000000000 // 1.1000 2^(63+63-2) + addl sincos_AD_1 = @ltoff(double_sincos_pi), gp + movl sincos_GR_rshf_2to61 = 0x47b8000000000000 // 1.1 2^(63+63-2) } ;; { .mfi - ld8 sind_AD_1 = [sind_AD_1] - fnorm sind_NORM_f8 = f8 - cmp.eq p8,p9 = r0, r0 + ld8 sincos_AD_1 = [sincos_AD_1] + fnorm.s0 sincos_NORM_f8 = f8 // Normalize argument + cmp.eq 
p8,p9 = r0, r0 // set p8 (clear p9) for sin } { .mib - mov sind_GR_exp_2tom61 = 0xffff-61 // exponent of scaling factor 2^-61 - mov sind_r_sincos = 0x0 - br.cond.sptk L(SIND_SINCOS) + mov sincos_GR_exp_2tom61 = 0xffff-61 // exponent of scale 2^-61 + mov sincos_r_sincos = 0x0 // sincos_r_sincos = 0 for sin + br.cond.sptk _SINCOS_COMMON // go to common part } ;; -.endp sin -ASM_SIZE_DIRECTIVE(sin) - - -.section .text -.proc cos# -#ifdef _LIBC -.proc __cos# -#endif -.align 32 -cos: -#ifdef _LIBC -__cos: -#endif +GLOBAL_IEEE754_END(sin) +GLOBAL_IEEE754_ENTRY(cos) { .mlx - alloc r32=ar.pfs,1,13,0,0 - movl sind_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A // significand of 16/pi + alloc r32 = ar.pfs, 1, 13, 0, 0 + movl sincos_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A // signd of 16/pi } { .mlx - addl sind_AD_1 = @ltoff(double_sind_pi), gp - movl sind_GR_rshf_2to61 = 0x47b8000000000000 // 1.1000 2^(63+63-2) + addl sincos_AD_1 = @ltoff(double_sincos_pi), gp + movl sincos_GR_rshf_2to61 = 0x47b8000000000000 // 1.1 2^(63+63-2) } ;; { .mfi - ld8 sind_AD_1 = [sind_AD_1] - fnorm.s1 sind_NORM_f8 = f8 - cmp.eq p9,p8 = r0, r0 + ld8 sincos_AD_1 = [sincos_AD_1] + fnorm.s1 sincos_NORM_f8 = f8 // Normalize argument + cmp.eq p9,p8 = r0, r0 // set p9 (clear p8) for cos } { .mib - mov sind_GR_exp_2tom61 = 0xffff-61 // exponent of scaling factor 2^-61 - mov sind_r_sincos = 0x8 - br.cond.sptk L(SIND_SINCOS) + mov sincos_GR_exp_2tom61 = 0xffff-61 // exp of scale 2^-61 + mov sincos_r_sincos = 0x8 // sincos_r_sincos = 8 for cos + nop.b 999 } ;; - //////////////////////////////////////////////////////// // All entry points end up here. 
-// If from sin, sind_r_sincos is 0 and p8 is true -// If from cos, sind_r_sincos is 8 = 2^(k-1) and p9 is true -// We add sind_r_sincos to N +// If from sin, sincos_r_sincos is 0 and p8 is true +// If from cos, sincos_r_sincos is 8 = 2^(k-1) and p9 is true +// We add sincos_r_sincos to N -L(SIND_SINCOS): +///////////// Common sin and cos part ////////////////// +_SINCOS_COMMON: // Form two constants we need @@ -488,3014 +466,320 @@ L(SIND_SINCOS): // 1.1000...000 * 2^(63+63-2) to right shift int(W) into the low significand // fcmp used to set denormal, and invalid on snans { .mfi - setf.sig sind_SIG_INV_PI_BY_16_2TO61 = sind_GR_sig_inv_pi_by_16 - fcmp.eq.s0 p12,p0=f8,f0 - mov sind_r_17_ones = 0x1ffff + setf.sig sincos_SIG_INV_PI_BY_16_2TO61 = sincos_GR_sig_inv_pi_by_16 + fclass.m p6,p0 = f8, 0xe7 // if x = 0,inf,nan + mov sincos_exp_limit = 0x1001a } { .mlx - setf.d sind_RSHF_2TO61 = sind_GR_rshf_2to61 - movl sind_GR_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift -} + setf.d sincos_RSHF_2TO61 = sincos_GR_rshf_2to61 + movl sincos_GR_rshf = 0x43e8000000000000 // 1.1 2^63 +} // Right shift ;; // Form another constant // 2^-61 for scaling Nfloat -// 0x10009 is register_bias + 10. -// So if f8 > 2^10 = Gamma, go to DBX -{ .mfi - setf.exp sind_2TOM61 = sind_GR_exp_2tom61 - fclass.m p13,p0 = f8, 0x23 // Test for x inf - mov sind_exp_limit = 0x10009 +// 0x1001a is register_bias + 27. 
+// So if f8 >= 2^27, go to large argument routines +{ .mmi + getf.exp sincos_r_signexp = f8 + setf.exp sincos_2TOM61 = sincos_GR_exp_2tom61 + addl gr_tmp = -1,r0 // For "inexect" constant create } ;; // Load the two pieces of pi/16 // Form another constant // 1.1000...000 * 2^63, the right shift constant -{ .mmf - ldfe sind_Pi_by_16_hi = [sind_AD_1],16 - setf.d sind_RSHF = sind_GR_rshf - fclass.m p14,p0 = f8, 0xc3 // Test for x nan -} -;; - -{ .mfi - ldfe sind_Pi_by_16_lo = [sind_AD_1],16 -(p13) frcpa.s0 f8,p12=f0,f0 // force qnan indef for x=inf - addl gr_tmp = -1,r0 -} -{ .mfb - addl sind_AD_beta_table = @ltoff(double_sin_cos_beta_k4), gp - nop.f 999 -(p13) br.ret.spnt b0 ;; // Exit for x=inf -} - -// Start loading P, Q coefficients -// SIN(0) -{ .mfi - ldfpd sind_P4,sind_Q4 = [sind_AD_1],16 -(p8) fclass.m.unc p6,p0 = f8, 0x07 // Test for sin(0) - nop.i 999 -} -{ .mfb - addl sind_AD_beta_table = @ltoff(double_sin_cos_beta_k4), gp -(p14) fma.d f8=f8,f1,f0 // qnan for x=nan -(p14) br.ret.spnt b0 ;; // Exit for x=nan -} - - -// COS(0) -{ .mfi - getf.exp sind_r_signexp = f8 -(p9) fclass.m.unc p7,p0 = f8, 0x07 // Test for sin(0) - nop.i 999 -} -{ .mfi - ld8 sind_AD_beta_table = [sind_AD_beta_table] - nop.f 999 - nop.i 999 ;; -} - { .mmb - ldfpd sind_P3,sind_Q3 = [sind_AD_1],16 - setf.sig fp_tmp = gr_tmp // Create constant such that fmpy sets inexact -(p6) br.ret.spnt b0 ;; -} - -{ .mfb - and sind_r_exp = sind_r_17_ones, sind_r_signexp -(p7) fmerge.s f8 = f1,f1 -(p7) br.ret.spnt b0 ;; -} - -// p10 is true if we must call routines to handle larger arguments -// p10 is true if f8 exp is > 0x10009 - -{ .mfi - ldfpd sind_P2,sind_Q2 = [sind_AD_1],16 - nop.f 999 - cmp.ge p10,p0 = sind_r_exp,sind_exp_limit + ldfe sincos_Pi_by_16_1 = [sincos_AD_1],16 + setf.d sincos_RSHF = sincos_GR_rshf +(p6) br.cond.spnt _SINCOS_SPECIAL_ARGS } ;; -// sind_W = x * sind_Inv_Pi_by_16 -// Multiply x by scaled 16/pi and add large const to shift integer part of W to -// rightmost bits of 
significand -{ .mfi - ldfpd sind_P1,sind_Q1 = [sind_AD_1] - fma.s1 sind_W_2TO61_RSH = sind_NORM_f8,sind_SIG_INV_PI_BY_16_2TO61,sind_RSHF_2TO61 - nop.i 999 -} -{ .mbb -(p10) cmp.ne.unc p11,p12=sind_r_sincos,r0 // p11 call __libm_cos_double_dbx - // p12 call __libm_sin_double_dbx -(p11) br.cond.spnt L(COSD_DBX) -(p12) br.cond.spnt L(SIND_DBX) -} -;; - - -// sind_NFLOAT = Round_Int_Nearest(sind_W) -// This is done by scaling back by 2^-61 and subtracting the shift constant -{ .mfi - nop.m 999 - fms.s1 sind_NFLOAT = sind_W_2TO61_RSH,sind_2TOM61,sind_RSHF - nop.i 999 ;; -} - - -// get N = (int)sind_int_Nfloat -{ .mfi - getf.sig sind_GR_n = sind_W_2TO61_RSH - nop.f 999 - nop.i 999 ;; -} - -// Add 2^(k-1) (which is in sind_r_sincos) to N -// sind_r = -sind_Nfloat * sind_Pi_by_16_hi + x -// sind_r = sind_r -sind_Nfloat * sind_Pi_by_16_lo -{ .mfi - add sind_GR_n = sind_GR_n, sind_r_sincos - fnma.s1 sind_r = sind_NFLOAT, sind_Pi_by_16_hi, sind_NORM_f8 - nop.i 999 ;; -} - - -// Get M (least k+1 bits of N) { .mmi - and sind_GR_m = 0x1f,sind_GR_n ;; - nop.m 999 - shl sind_GR_32m = sind_GR_m,5 ;; -} - -// Add 32*M to address of sin_cos_beta table -{ .mmi - add sind_AD_2 = sind_GR_32m, sind_AD_beta_table - nop.m 999 - nop.i 999 ;; -} - -{ .mfi - ldfe sind_Sm = [sind_AD_2],16 -(p8) fclass.m.unc p10,p0=f8,0x0b // If sin, note denormal input to set uflow - nop.i 999 ;; -} - -{ .mfi - ldfe sind_Cm = [sind_AD_2] - fnma.s1 sind_r = sind_NFLOAT, sind_Pi_by_16_lo, sind_r - nop.i 999 ;; -} - -// get rsq -{ .mfi - nop.m 999 - fma.s1 sind_rsq = sind_r, sind_r, f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fmpy.s0 fp_tmp = fp_tmp,fp_tmp // fmpy forces inexact flag - nop.i 999 ;; -} - -// form P and Q series -{ .mfi - nop.m 999 - fma.s1 sind_P_temp1 = sind_rsq, sind_P4, sind_P3 - nop.i 999 -} - -{ .mfi - nop.m 999 - fma.s1 sind_Q_temp1 = sind_rsq, sind_Q4, sind_Q3 - nop.i 999 ;; -} - -// get rcube and sm*rsq -{ .mfi - nop.m 999 - fmpy.s1 sind_srsq = sind_Sm,sind_rsq - nop.i 999 -} - -{ .mfi - nop.m 
999 - fmpy.s1 sind_rcub = sind_r, sind_rsq - nop.i 999 ;; -} - -{ .mfi - nop.m 999 - fma.s1 sind_Q_temp2 = sind_rsq, sind_Q_temp1, sind_Q2 - nop.i 999 -} - -{ .mfi - nop.m 999 - fma.s1 sind_P_temp2 = sind_rsq, sind_P_temp1, sind_P2 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 - fma.s1 sind_Q = sind_rsq, sind_Q_temp2, sind_Q1 - nop.i 999 -} - -{ .mfi - nop.m 999 - fma.s1 sind_P = sind_rsq, sind_P_temp2, sind_P1 - nop.i 999 ;; -} - -// Get final P and Q -{ .mfi - nop.m 999 - fma.s1 sind_Q = sind_srsq,sind_Q, sind_Sm - nop.i 999 -} - -{ .mfi - nop.m 999 - fma.s1 sind_P = sind_rcub,sind_P, sind_r - nop.i 999 ;; -} - -// If sin(denormal), force inexact to be set -{ .mfi - nop.m 999 -(p10) fmpy.d.s0 fp_tmp = f8,f8 - nop.i 999 ;; -} - -// Final calculation -{ .mfb - nop.m 999 - fma.d f8 = sind_Cm, sind_P, sind_Q - br.ret.sptk b0 ;; -} -.endp cos# -ASM_SIZE_DIRECTIVE(cos#) - - - -.proc __libm_callout_1s -__libm_callout_1s: -L(SIND_DBX): -.prologue -{ .mfi - nop.m 0 - nop.f 0 -.save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs -} -;; - -{ .mfi - mov GR_SAVE_GP=gp - nop.f 0 -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 -} - -.body -{ .mib - nop.m 999 - nop.i 999 - br.call.sptk.many b0=__libm_sin_double_dbx# ;; -} -;; - - -{ .mfi - mov gp = GR_SAVE_GP - nop.f 999 - mov b0 = GR_SAVE_B0 -} -;; - -{ .mib - nop.m 999 - mov ar.pfs = GR_SAVE_PFS - br.ret.sptk b0 ;; -} -.endp __libm_callout_1s -ASM_SIZE_DIRECTIVE(__libm_callout_1s) - - -.proc __libm_callout_1c -__libm_callout_1c: -L(COSD_DBX): -.prologue -{ .mfi - nop.m 0 - nop.f 0 -.save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs -} -;; - -{ .mfi - mov GR_SAVE_GP=gp - nop.f 0 -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 -} - -.body -{ .mib - nop.m 999 - nop.i 999 - br.call.sptk.many b0=__libm_cos_double_dbx# ;; -} -;; - + ldfe sincos_Pi_by_16_2 = [sincos_AD_1],16 + setf.sig fp_tmp = gr_tmp // constant for inexact set + nop.i 999 +};; { .mfi - mov gp = GR_SAVE_GP - nop.f 999 - mov b0 = GR_SAVE_B0 -} -;; - -{ .mib - nop.m 999 - mov ar.pfs = 
GR_SAVE_PFS - br.ret.sptk b0 ;; -} -.endp __libm_callout_1c -ASM_SIZE_DIRECTIVE(__libm_callout_1c) - - -// ==================================================================== -// ==================================================================== - -// These functions calculate the sin and cos for inputs -// greater than 2^10 -// __libm_sin_double_dbx# and __libm_cos_double_dbx# - -// ********************************************************************* -// ********************************************************************* -// -// Function: Combined sin(x) and cos(x), where -// -// sin(x) = sine(x), for double precision x values -// cos(x) = cosine(x), for double precision x values -// -// ********************************************************************* -// -// Accuracy: Within .7 ulps for 80-bit floating point values -// Very accurate for double precision values -// -// ********************************************************************* -// -// Resources Used: -// -// Floating-Point Registers: f8 (Input and Return Value) -// f32-f99 -// -// General Purpose Registers: -// r32-r43 -// r44-r45 (Used to pass arguments to pi_by_2 reduce routine) -// -// Predicate Registers: p6-p13 -// -// ********************************************************************* -// -// IEEE Special Conditions: -// -// Denormal fault raised on denormal inputs -// Overflow exceptions do not occur -// Underflow exceptions raised when appropriate for sin -// (No specialized error handling for this routine) -// Inexact raised when appropriate by algorithm -// -// sin(SNaN) = QNaN -// sin(QNaN) = QNaN -// sin(inf) = QNaN -// sin(+/-0) = +/-0 -// cos(inf) = QNaN -// cos(SNaN) = QNaN -// cos(QNaN) = QNaN -// cos(0) = 1 -// -// ********************************************************************* -// -// Mathematical Description -// ======================== -// -// The computation of FSIN and FCOS is best handled in one piece of -// code. 
The main reason is that given any argument Arg, computation -// of trigonometric functions first calculate N and an approximation -// to alpha where -// -// Arg = N pi/2 + alpha, |alpha| <= pi/4. -// -// Since -// -// cos( Arg ) = sin( (N+1) pi/2 + alpha ), -// -// therefore, the code for computing sine will produce cosine as long -// as 1 is added to N immediately after the argument reduction -// process. -// -// Let M = N if sine -// N+1 if cosine. -// -// Now, given -// -// Arg = M pi/2 + alpha, |alpha| <= pi/4, -// -// let I = M mod 4, or I be the two lsb of M when M is represented -// as 2's complement. I = [i_0 i_1]. Then -// -// sin( Arg ) = (-1)^i_0 sin( alpha ) if i_1 = 0, -// = (-1)^i_0 cos( alpha ) if i_1 = 1. -// -// For example: -// if M = -1, I = 11 -// sin ((-pi/2 + alpha) = (-1) cos (alpha) -// if M = 0, I = 00 -// sin (alpha) = sin (alpha) -// if M = 1, I = 01 -// sin (pi/2 + alpha) = cos (alpha) -// if M = 2, I = 10 -// sin (pi + alpha) = (-1) sin (alpha) -// if M = 3, I = 11 -// sin ((3/2)pi + alpha) = (-1) cos (alpha) -// -// The value of alpha is obtained by argument reduction and -// represented by two working precision numbers r and c where -// -// alpha = r + c accurately. -// -// The reduction method is described in a previous write up. -// The argument reduction scheme identifies 4 cases. For Cases 2 -// and 4, because |alpha| is small, sin(r+c) and cos(r+c) can be -// computed very easily by 2 or 3 terms of the Taylor series -// expansion as follows: -// -// Case 2: -// ------- -// -// sin(r + c) = r + c - r^3/6 accurately -// cos(r + c) = 1 - 2^(-67) accurately -// -// Case 4: -// ------- -// -// sin(r + c) = r + c - r^3/6 + r^5/120 accurately -// cos(r + c) = 1 - r^2/2 + r^4/24 accurately -// -// The only cases left are Cases 1 and 3 of the argument reduction -// procedure. 
These two cases will be merged since after the -// argument is reduced in either cases, we have the reduced argument -// represented as r + c and that the magnitude |r + c| is not small -// enough to allow the usage of a very short approximation. -// -// The required calculation is either -// -// sin(r + c) = sin(r) + correction, or -// cos(r + c) = cos(r) + correction. -// -// Specifically, -// -// sin(r + c) = sin(r) + c sin'(r) + O(c^2) -// = sin(r) + c cos (r) + O(c^2) -// = sin(r) + c(1 - r^2/2) accurately. -// Similarly, -// -// cos(r + c) = cos(r) - c sin(r) + O(c^2) -// = cos(r) - c(r - r^3/6) accurately. -// -// We therefore concentrate on accurately calculating sin(r) and -// cos(r) for a working-precision number r, |r| <= pi/4 to within -// 0.1% or so. -// -// The greatest challenge of this task is that the second terms of -// the Taylor series -// -// r - r^3/3! + r^r/5! - ... -// -// and -// -// 1 - r^2/2! + r^4/4! - ... -// -// are not very small when |r| is close to pi/4 and the rounding -// errors will be a concern if simple polynomial accumulation is -// used. When |r| < 2^-3, however, the second terms will be small -// enough (6 bits or so of right shift) that a normal Horner -// recurrence suffices. Hence there are two cases that we consider -// in the accurate computation of sin(r) and cos(r), |r| <= pi/4. -// -// Case small_r: |r| < 2^(-3) -// -------------------------- -// -// Since Arg = M pi/4 + r + c accurately, and M mod 4 is [i_0 i_1], -// we have -// -// sin(Arg) = (-1)^i_0 * sin(r + c) if i_1 = 0 -// = (-1)^i_0 * cos(r + c) if i_1 = 1 -// -// can be accurately approximated by -// -// sin(Arg) = (-1)^i_0 * [sin(r) + c] if i_1 = 0 -// = (-1)^i_0 * [cos(r) - c*r] if i_1 = 1 -// -// because |r| is small and thus the second terms in the correction -// are unneccessary. -// -// Finally, sin(r) and cos(r) are approximated by polynomials of -// moderate lengths. -// -// sin(r) = r + S_1 r^3 + S_2 r^5 + ... 
+ S_5 r^11 -// cos(r) = 1 + C_1 r^2 + C_2 r^4 + ... + C_5 r^10 -// -// We can make use of predicates to selectively calculate -// sin(r) or cos(r) based on i_1. -// -// Case normal_r: 2^(-3) <= |r| <= pi/4 -// ------------------------------------ -// -// This case is more likely than the previous one if one considers -// r to be uniformly distributed in [-pi/4 pi/4]. Again, -// -// sin(Arg) = (-1)^i_0 * sin(r + c) if i_1 = 0 -// = (-1)^i_0 * cos(r + c) if i_1 = 1. -// -// Because |r| is now larger, we need one extra term in the -// correction. sin(Arg) can be accurately approximated by -// -// sin(Arg) = (-1)^i_0 * [sin(r) + c(1-r^2/2)] if i_1 = 0 -// = (-1)^i_0 * [cos(r) - c*r*(1 - r^2/6)] i_1 = 1. -// -// Finally, sin(r) and cos(r) are approximated by polynomials of -// moderate lengths. -// -// sin(r) = r + PP_1_hi r^3 + PP_1_lo r^3 + -// PP_2 r^5 + ... + PP_8 r^17 -// -// cos(r) = 1 + QQ_1 r^2 + QQ_2 r^4 + ... + QQ_8 r^16 -// -// where PP_1_hi is only about 16 bits long and QQ_1 is -1/2. -// The crux in accurate computation is to calculate -// -// r + PP_1_hi r^3 or 1 + QQ_1 r^2 -// -// accurately as two pieces: U_hi and U_lo. The way to achieve this -// is to obtain r_hi as a 10 sig. bit number that approximates r to -// roughly 8 bits or so of accuracy. (One convenient way is -// -// r_hi := frcpa( frcpa( r ) ).) -// -// This way, -// -// r + PP_1_hi r^3 = r + PP_1_hi r_hi^3 + -// PP_1_hi (r^3 - r_hi^3) -// = [r + PP_1_hi r_hi^3] + -// [PP_1_hi (r - r_hi) -// (r^2 + r_hi r + r_hi^2) ] -// = U_hi + U_lo -// -// Since r_hi is only 10 bit long and PP_1_hi is only 16 bit long, -// PP_1_hi * r_hi^3 is only at most 46 bit long and thus computed -// exactly. Furthermore, r and PP_1_hi r_hi^3 are of opposite sign -// and that there is no more than 8 bit shift off between r and -// PP_1_hi * r_hi^3. Hence the sum, U_hi, is representable and thus -// calculated without any error. 
Finally, the fact that -// -// |U_lo| <= 2^(-8) |U_hi| -// -// says that U_hi + U_lo is approximating r + PP_1_hi r^3 to roughly -// 8 extra bits of accuracy. -// -// Similarly, -// -// 1 + QQ_1 r^2 = [1 + QQ_1 r_hi^2] + -// [QQ_1 (r - r_hi)(r + r_hi)] -// = U_hi + U_lo. -// -// Summarizing, we calculate r_hi = frcpa( frcpa( r ) ). -// -// If i_1 = 0, then -// -// U_hi := r + PP_1_hi * r_hi^3 -// U_lo := PP_1_hi * (r - r_hi) * (r^2 + r*r_hi + r_hi^2) -// poly := PP_1_lo r^3 + PP_2 r^5 + ... + PP_8 r^17 -// correction := c * ( 1 + C_1 r^2 ) -// -// Else ...i_1 = 1 -// -// U_hi := 1 + QQ_1 * r_hi * r_hi -// U_lo := QQ_1 * (r - r_hi) * (r + r_hi) -// poly := QQ_2 * r^4 + QQ_3 * r^6 + ... + QQ_8 r^16 -// correction := -c * r * (1 + S_1 * r^2) -// -// End -// -// Finally, -// -// V := poly + ( U_lo + correction ) -// -// / U_hi + V if i_0 = 0 -// result := | -// \ (-U_hi) - V if i_0 = 1 -// -// It is important that in the last step, negation of U_hi is -// performed prior to the subtraction which is to be performed in -// the user-set rounding mode. -// -// -// Algorithmic Description -// ======================= -// -// The argument reduction algorithm is tightly integrated into FSIN -// and FCOS which share the same code. The following is complete and -// self-contained. The argument reduction description given -// previously is repeated below. -// -// -// Step 0. Initialization. -// -// If FSIN is invoked, set N_inc := 0; else if FCOS is invoked, -// set N_inc := 1. -// -// Step 1. Check for exceptional and special cases. -// -// * If Arg is +-0, +-inf, NaN, NaT, go to Step 10 for special -// handling. -// * If |Arg| < 2^24, go to Step 2 for reduction of moderate -// arguments. This is the most likely case. -// * If |Arg| < 2^63, go to Step 8 for pre-reduction of large -// arguments. -// * If |Arg| >= 2^63, go to Step 10 for special handling. -// -// Step 2. Reduction of moderate arguments. 
-// -// If |Arg| < pi/4 ...quick branch -// N_fix := N_inc (integer) -// r := Arg -// c := 0.0 -// Branch to Step 4, Case_1_complete -// Else ...cf. argument reduction -// N := Arg * two_by_PI (fp) -// N_fix := fcvt.fx( N ) (int) -// N := fcvt.xf( N_fix ) -// N_fix := N_fix + N_inc -// s := Arg - N * P_1 (first piece of pi/2) -// w := -N * P_2 (second piece of pi/2) -// -// If |s| >= 2^(-33) -// go to Step 3, Case_1_reduce -// Else -// go to Step 7, Case_2_reduce -// Endif -// Endif -// -// Step 3. Case_1_reduce. -// -// r := s + w -// c := (s - r) + w ...observe order -// -// Step 4. Case_1_complete -// -// ...At this point, the reduced argument alpha is -// ...accurately represented as r + c. -// If |r| < 2^(-3), go to Step 6, small_r. -// -// Step 5. Normal_r. -// -// Let [i_0 i_1] by the 2 lsb of N_fix. -// FR_rsq := r * r -// r_hi := frcpa( frcpa( r ) ) -// r_lo := r - r_hi -// -// If i_1 = 0, then -// poly := r*FR_rsq*(PP_1_lo + FR_rsq*(PP_2 + ... FR_rsq*PP_8)) -// U_hi := r + PP_1_hi*r_hi*r_hi*r_hi ...any order -// U_lo := PP_1_hi*r_lo*(r*r + r*r_hi + r_hi*r_hi) -// correction := c + c*C_1*FR_rsq ...any order -// Else -// poly := FR_rsq*FR_rsq*(QQ_2 + FR_rsq*(QQ_3 + ... + FR_rsq*QQ_8)) -// U_hi := 1 + QQ_1 * r_hi * r_hi ...any order -// U_lo := QQ_1 * r_lo * (r + r_hi) -// correction := -c*(r + S_1*FR_rsq*r) ...any order -// Endif -// -// V := poly + (U_lo + correction) ...observe order -// -// result := (i_0 == 0? 1.0 : -1.0) -// -// Last instruction in user-set rounding mode -// -// result := (i_0 == 0? result*U_hi + V : -// result*U_hi - V) -// -// Return -// -// Step 6. Small_r. -// -// ...Use flush to zero mode without causing exception -// Let [i_0 i_1] be the two lsb of N_fix. 
-// -// FR_rsq := r * r -// -// If i_1 = 0 then -// z := FR_rsq*FR_rsq; z := FR_rsq*z *r -// poly_lo := S_3 + FR_rsq*(S_4 + FR_rsq*S_5) -// poly_hi := r*FR_rsq*(S_1 + FR_rsq*S_2) -// correction := c -// result := r -// Else -// z := FR_rsq*FR_rsq; z := FR_rsq*z -// poly_lo := C_3 + FR_rsq*(C_4 + FR_rsq*C_5) -// poly_hi := FR_rsq*(C_1 + FR_rsq*C_2) -// correction := -c*r -// result := 1 -// Endif -// -// poly := poly_hi + (z * poly_lo + correction) -// -// If i_0 = 1, result := -result -// -// Last operation. Perform in user-set rounding mode -// -// result := (i_0 == 0? result + poly : -// result - poly ) -// Return -// -// Step 7. Case_2_reduce. -// -// ...Refer to the write up for argument reduction for -// ...rationale. The reduction algorithm below is taken from -// ...argument reduction description and integrated this. -// -// w := N*P_3 -// U_1 := N*P_2 + w ...FMA -// U_2 := (N*P_2 - U_1) + w ...2 FMA -// ...U_1 + U_2 is N*(P_2+P_3) accurately -// -// r := s - U_1 -// c := ( (s - r) - U_1 ) - U_2 -// -// ...The mathematical sum r + c approximates the reduced -// ...argument accurately. Note that although compared to -// ...Case 1, this case requires much more work to reduce -// ...the argument, the subsequent calculation needed for -// ...any of the trigonometric function is very little because -// ...|alpha| < 1.01*2^(-33) and thus two terms of the -// ...Taylor series expansion suffices. -// -// If i_1 = 0 then -// poly := c + S_1 * r * r * r ...any order -// result := r -// Else -// poly := -2^(-67) -// result := 1.0 -// Endif -// -// If i_0 = 1, result := -result -// -// Last operation. Perform in user-set rounding mode -// -// result := (i_0 == 0? result + poly : -// result - poly ) -// -// Return -// -// -// Step 8. Pre-reduction of large arguments. -// -// ...Again, the following reduction procedure was described -// ...in the separate write up for argument reduction, which -// ...is tightly integrated here. 
- -// N_0 := Arg * Inv_P_0 -// N_0_fix := fcvt.fx( N_0 ) -// N_0 := fcvt.xf( N_0_fix) - -// Arg' := Arg - N_0 * P_0 -// w := N_0 * d_1 -// N := Arg' * two_by_PI -// N_fix := fcvt.fx( N ) -// N := fcvt.xf( N_fix ) -// N_fix := N_fix + N_inc -// -// s := Arg' - N * P_1 -// w := w - N * P_2 -// -// If |s| >= 2^(-14) -// go to Step 3 -// Else -// go to Step 9 -// Endif -// -// Step 9. Case_4_reduce. -// -// ...first obtain N_0*d_1 and -N*P_2 accurately -// U_hi := N_0 * d_1 V_hi := -N*P_2 -// U_lo := N_0 * d_1 - U_hi V_lo := -N*P_2 - U_hi ...FMAs -// -// ...compute the contribution from N_0*d_1 and -N*P_3 -// w := -N*P_3 -// w := w + N_0*d_2 -// t := U_lo + V_lo + w ...any order -// -// ...at this point, the mathematical value -// ...s + U_hi + V_hi + t approximates the true reduced argument -// ...accurately. Just need to compute this accurately. -// -// ...Calculate U_hi + V_hi accurately: -// A := U_hi + V_hi -// if |U_hi| >= |V_hi| then -// a := (U_hi - A) + V_hi -// else -// a := (V_hi - A) + U_hi -// endif -// ...order in computing "a" must be observed. This branch is -// ...best implemented by predicates. -// ...A + a is U_hi + V_hi accurately. Moreover, "a" is -// ...much smaller than A: |a| <= (1/2)ulp(A). -// -// ...Just need to calculate s + A + a + t -// C_hi := s + A t := t + a -// C_lo := (s - C_hi) + A -// C_lo := C_lo + t -// -// ...Final steps for reduction -// r := C_hi + C_lo -// c := (C_hi - r) + C_lo -// -// ...At this point, we have r and c -// ...And all we need is a couple of terms of the corresponding -// ...Taylor series. -// -// If i_1 = 0 -// poly := c + r*FR_rsq*(S_1 + FR_rsq*S_2) -// result := r -// Else -// poly := FR_rsq*(C_1 + FR_rsq*C_2) -// result := 1 -// Endif -// -// If i_0 = 1, result := -result -// -// Last operation. Perform in user-set rounding mode -// -// result := (i_0 == 0? 
result + poly : -// result - poly ) -// Return -// -// Large Arguments: For arguments above 2**63, a Payne-Hanek -// style argument reduction is used and pi_by_2 reduce is called. -// - - -#ifdef _LIBC -.rodata -#else -.data -#endif -.align 64 - -FSINCOS_CONSTANTS: -ASM_TYPE_DIRECTIVE(FSINCOS_CONSTANTS,@object) -data4 0x4B800000, 0xCB800000, 0x00000000,0x00000000 // two**24, -two**24 -data4 0x4E44152A, 0xA2F9836E, 0x00003FFE,0x00000000 // Inv_pi_by_2 -data4 0xCE81B9F1, 0xC84D32B0, 0x00004016,0x00000000 // P_0 -data4 0x2168C235, 0xC90FDAA2, 0x00003FFF,0x00000000 // P_1 -data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD,0x00000000 // P_2 -data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C,0x00000000 // P_3 -data4 0x5F000000, 0xDF000000, 0x00000000,0x00000000 // two_to_63, -two_to_63 -data4 0x6EC6B45A, 0xA397E504, 0x00003FE7,0x00000000 // Inv_P_0 -data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF,0x00000000 // d_1 -data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C,0x00000000 // d_2 -data4 0x2168C234, 0xC90FDAA2, 0x00003FFE,0x00000000 // pi_by_4 -data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE,0x00000000 // neg_pi_by_4 -data4 0x3E000000, 0xBE000000, 0x00000000,0x00000000 // two**-3, -two**-3 -data4 0x2F000000, 0xAF000000, 0x9E000000,0x00000000 // two**-33, -two**-33, -two**-67 -data4 0xA21C0BC9, 0xCC8ABEBC, 0x00003FCE,0x00000000 // PP_8 -data4 0x720221DA, 0xD7468A05, 0x0000BFD6,0x00000000 // PP_7 -data4 0x640AD517, 0xB092382F, 0x00003FDE,0x00000000 // PP_6 -data4 0xD1EB75A4, 0xD7322B47, 0x0000BFE5,0x00000000 // PP_5 -data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1 -data4 0x00000000, 0xAAAA0000, 0x0000BFFC,0x00000000 // PP_1_hi -data4 0xBAF69EEA, 0xB8EF1D2A, 0x00003FEC,0x00000000 // PP_4 -data4 0x0D03BB69, 0xD00D00D0, 0x0000BFF2,0x00000000 // PP_3 -data4 0x88888962, 0x88888888, 0x00003FF8,0x00000000 // PP_2 -data4 0xAAAB0000, 0xAAAAAAAA, 0x0000BFEC,0x00000000 // PP_1_lo -data4 0xC2B0FE52, 0xD56232EF, 0x00003FD2,0x00000000 // QQ_8 -data4 0x2B48DCA6, 0xC9C99ABA, 0x0000BFDA,0x00000000 // QQ_7 -data4 
0x9C716658, 0x8F76C650, 0x00003FE2,0x00000000 // QQ_6 -data4 0xFDA8D0FC, 0x93F27DBA, 0x0000BFE9,0x00000000 // QQ_5 -data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1 -data4 0x00000000, 0x80000000, 0x0000BFFE,0x00000000 // QQ_1 -data4 0x0C6E5041, 0xD00D00D0, 0x00003FEF,0x00000000 // QQ_4 -data4 0x0B607F60, 0xB60B60B6, 0x0000BFF5,0x00000000 // QQ_3 -data4 0xAAAAAA9B, 0xAAAAAAAA, 0x00003FFA,0x00000000 // QQ_2 -data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1 -data4 0xAAAA719F, 0xAAAAAAAA, 0x00003FFA,0x00000000 // C_2 -data4 0x0356F994, 0xB60B60B6, 0x0000BFF5,0x00000000 // C_3 -data4 0xB2385EA9, 0xD00CFFD5, 0x00003FEF,0x00000000 // C_4 -data4 0x292A14CD, 0x93E4BD18, 0x0000BFE9,0x00000000 // C_5 -data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1 -data4 0x888868DB, 0x88888888, 0x00003FF8,0x00000000 // S_2 -data4 0x055EFD4B, 0xD00D00D0, 0x0000BFF2,0x00000000 // S_3 -data4 0x839730B9, 0xB8EF1C5D, 0x00003FEC,0x00000000 // S_4 -data4 0xE5B3F492, 0xD71EA3A4, 0x0000BFE5,0x00000000 // S_5 -data4 0x38800000, 0xB8800000, 0x00000000 // two**-14, -two**-14 -ASM_SIZE_DIRECTIVE(FSINCOS_CONSTANTS) - -FR_Input_X = f8 -FR_Neg_Two_to_M3 = f32 -FR_Two_to_63 = f32 -FR_Two_to_24 = f33 -FR_Pi_by_4 = f33 -FR_Two_to_M14 = f34 -FR_Two_to_M33 = f35 -FR_Neg_Two_to_24 = f36 -FR_Neg_Pi_by_4 = f36 -FR_Neg_Two_to_M14 = f37 -FR_Neg_Two_to_M33 = f38 -FR_Neg_Two_to_M67 = f39 -FR_Inv_pi_by_2 = f40 -FR_N_float = f41 -FR_N_fix = f42 -FR_P_1 = f43 -FR_P_2 = f44 -FR_P_3 = f45 -FR_s = f46 -FR_w = f47 -FR_c = f48 -FR_r = f49 -FR_Z = f50 -FR_A = f51 -FR_a = f52 -FR_t = f53 -FR_U_1 = f54 -FR_U_2 = f55 -FR_C_1 = f56 -FR_C_2 = f57 -FR_C_3 = f58 -FR_C_4 = f59 -FR_C_5 = f60 -FR_S_1 = f61 -FR_S_2 = f62 -FR_S_3 = f63 -FR_S_4 = f64 -FR_S_5 = f65 -FR_poly_hi = f66 -FR_poly_lo = f67 -FR_r_hi = f68 -FR_r_lo = f69 -FR_rsq = f70 -FR_r_cubed = f71 -FR_C_hi = f72 -FR_N_0 = f73 -FR_d_1 = f74 -FR_V = f75 -FR_V_hi = f75 -FR_V_lo = f76 -FR_U_hi = f77 -FR_U_lo = f78 -FR_U_hiabs = f79 -FR_V_hiabs = f80 
-FR_PP_8 = f81 -FR_QQ_8 = f81 -FR_PP_7 = f82 -FR_QQ_7 = f82 -FR_PP_6 = f83 -FR_QQ_6 = f83 -FR_PP_5 = f84 -FR_QQ_5 = f84 -FR_PP_4 = f85 -FR_QQ_4 = f85 -FR_PP_3 = f86 -FR_QQ_3 = f86 -FR_PP_2 = f87 -FR_QQ_2 = f87 -FR_QQ_1 = f88 -FR_N_0_fix = f89 -FR_Inv_P_0 = f90 -FR_corr = f91 -FR_poly = f92 -FR_d_2 = f93 -FR_Two_to_M3 = f94 -FR_Neg_Two_to_63 = f94 -FR_P_0 = f95 -FR_C_lo = f96 -FR_PP_1 = f97 -FR_PP_1_lo = f98 -FR_ArgPrime = f99 - -GR_Table_Base = r32 -GR_Table_Base1 = r33 -GR_i_0 = r34 -GR_i_1 = r35 -GR_N_Inc = r36 -GR_Sin_or_Cos = r37 - -GR_SAVE_B0 = r39 -GR_SAVE_GP = r40 -GR_SAVE_PFS = r41 - -.section .text -.proc __libm_sin_double_dbx# -.align 64 -__libm_sin_double_dbx: - -{ .mlx -alloc GR_Table_Base = ar.pfs,0,12,2,0 - movl GR_Sin_or_Cos = 0x0 ;; -} - -{ .mmi - nop.m 999 - addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp - nop.i 999 -} -;; - -{ .mmi - ld8 GR_Table_Base = [GR_Table_Base] - nop.m 999 - nop.i 999 -} -;; - - -{ .mib - nop.m 999 - nop.i 999 - br.cond.sptk L(SINCOS_CONTINUE) ;; -} - -.endp __libm_sin_double_dbx# -ASM_SIZE_DIRECTIVE(__libm_sin_double_dbx) - -.section .text -.proc __libm_cos_double_dbx# -__libm_cos_double_dbx: - -{ .mlx -alloc GR_Table_Base= ar.pfs,0,12,2,0 - movl GR_Sin_or_Cos = 0x1 ;; -} - -{ .mmi - nop.m 999 - addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp - nop.i 999 -} -;; + ldfe sincos_Pi_by_16_3 = [sincos_AD_1],16 + nop.f 999 + nop.i 999 +};; +// Polynomial coefficients (Q4, P4, Q3, P3, Q2, Q1, P2, P1) loading { .mmi - ld8 GR_Table_Base = [GR_Table_Base] - nop.m 999 - nop.i 999 -} -;; - -// -// Load Table Address -// -L(SINCOS_CONTINUE): + ldfpd sincos_P4,sincos_Q4 = [sincos_AD_1],16 + nop.m 999 + nop.i 999 +};; +// Select exponent (17 lsb) { .mmi - add GR_Table_Base1 = 96, GR_Table_Base - ldfs FR_Two_to_24 = [GR_Table_Base], 4 - nop.i 999 + ldfpd sincos_P3,sincos_Q3 = [sincos_AD_1],16 + nop.m 999 + dep.z sincos_r_exp = sincos_r_signexp, 0, 17 } ;; -{ .mmi - nop.m 999 -// -// Load 2**24, load 2**63. 
-// - ldfs FR_Neg_Two_to_24 = [GR_Table_Base], 12 - mov r41 = ar.pfs ;; -} - -{ .mfi - ldfs FR_Two_to_63 = [GR_Table_Base1], 4 -// -// Check for unnormals - unsupported operands. We do not want -// to generate denormal exception -// Check for NatVals, QNaNs, SNaNs, +/-Infs -// Check for EM unsupporteds -// Check for Zero -// - fclass.m.unc p6, p8 = FR_Input_X, 0x1E3 - mov r40 = gp ;; -} - -{ .mfi - nop.m 999 - fclass.nm.unc p8, p0 = FR_Input_X, 0x1FF -// GR_Sin_or_Cos denotes - mov r39 = b0 -} - -{ .mfb - ldfs FR_Neg_Two_to_63 = [GR_Table_Base1], 12 - fclass.m.unc p10, p0 = FR_Input_X, 0x007 -(p6) br.cond.spnt L(SINCOS_SPECIAL) ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p8) br.cond.spnt L(SINCOS_SPECIAL) ;; -} - -{ .mib - nop.m 999 - nop.i 999 -// -// Branch if +/- NaN, Inf. -// Load -2**24, load -2**63. -// -(p10) br.cond.spnt L(SINCOS_ZERO) ;; -} - -{ .mmb - ldfe FR_Inv_pi_by_2 = [GR_Table_Base], 16 - ldfe FR_Inv_P_0 = [GR_Table_Base1], 16 - nop.b 999 ;; -} - -{ .mmb - nop.m 999 - ldfe FR_d_1 = [GR_Table_Base1], 16 - nop.b 999 ;; -} -// -// Raise possible denormal operand flag with useful fcmp -// Is x <= -2**63 -// Load Inv_P_0 for pre-reduction -// Load Inv_pi_by_2 -// - +// p10 is true if we must call routines to handle larger arguments +// p10 is true if f8 exp is >= 0x1001a (2^27) { .mmb - ldfe FR_P_0 = [GR_Table_Base], 16 - ldfe FR_d_2 = [GR_Table_Base1], 16 - nop.b 999 ;; -} -// -// Load P_0 -// Load d_1 -// Is x >= 2**63 -// Is x <= -2**24? -// - -{ .mmi - ldfe FR_P_1 = [GR_Table_Base], 16 ;; -// -// Load P_1 -// Load d_2 -// Is x >= 2**24? -// - ldfe FR_P_2 = [GR_Table_Base], 16 - nop.i 999 ;; -} - -{ .mmf - nop.m 999 - ldfe FR_P_3 = [GR_Table_Base], 16 - fcmp.le.unc.s1 p7, p8 = FR_Input_X, FR_Neg_Two_to_24 -} - -{ .mfi - nop.m 999 -// -// Branch if +/- zero. 
-// Decide about the paths to take: -// If -2**24 < FR_Input_X < 2**24 - CASE 1 OR 2 -// OTHERWISE - CASE 3 OR 4 -// - fcmp.le.unc.s0 p10, p11 = FR_Input_X, FR_Neg_Two_to_63 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p8) fcmp.ge.s1 p7, p0 = FR_Input_X, FR_Two_to_24 - nop.i 999 -} - -{ .mfi - ldfe FR_Pi_by_4 = [GR_Table_Base1], 16 -(p11) fcmp.ge.s1 p10, p0 = FR_Input_X, FR_Two_to_63 - nop.i 999 ;; -} - -{ .mmi - ldfe FR_Neg_Pi_by_4 = [GR_Table_Base1], 16 ;; - ldfs FR_Two_to_M3 = [GR_Table_Base1], 4 - nop.i 999 ;; -} - -{ .mib - ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1], 12 - nop.i 999 -// -// Load P_2 -// Load P_3 -// Load pi_by_4 -// Load neg_pi_by_4 -// Load 2**(-3) -// Load -2**(-3). -// -(p10) br.cond.spnt L(SINCOS_ARG_TOO_LARGE) ;; -} - -{ .mib - nop.m 999 - nop.i 999 -// -// Branch out if x >= 2**63. Use Payne-Hanek Reduction -// -(p7) br.cond.spnt L(SINCOS_LARGER_ARG) ;; -} - -{ .mfi - nop.m 999 -// -// Branch if Arg <= -2**24 or Arg >= 2**24 and use pre-reduction. -// - fma.s1 FR_N_float = FR_Input_X, FR_Inv_pi_by_2, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 - fcmp.lt.unc.s1 p6, p7 = FR_Input_X, FR_Pi_by_4 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// Select the case when |Arg| < pi/4 -// Else Select the case when |Arg| >= pi/4 -// - fcvt.fx.s1 FR_N_fix = FR_N_float - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// N = Arg * 2/pi -// Check if Arg < pi/4 -// -(p6) fcmp.gt.s1 p6, p7 = FR_Input_X, FR_Neg_Pi_by_4 - nop.i 999 ;; -} -// -// Case 2: Convert integer N_fix back to normalized floating-point value. -// Case 1: p8 is only affected when p6 is set -// - -{ .mfi -(p7) ldfs FR_Two_to_M33 = [GR_Table_Base1], 4 -// -// Grab the integer part of N and call it N_fix -// -(p6) fmerge.se FR_r = FR_Input_X, FR_Input_X -// If |x| < pi/4, r = x and c = 0 -// lf |x| < pi/4, is x < 2**(-3). 
-// r = Arg -// c = 0 -(p6) mov GR_N_Inc = GR_Sin_or_Cos ;; -} - -{ .mmf - nop.m 999 -(p7) ldfs FR_Neg_Two_to_M33 = [GR_Table_Base1], 4 -(p6) fmerge.se FR_c = f0, f0 -} - -{ .mfi - nop.m 999 -(p6) fcmp.lt.unc.s1 p8, p9 = FR_Input_X, FR_Two_to_M3 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// lf |x| < pi/4, is -2**(-3)< x < 2**(-3) - set p8. -// If |x| >= pi/4, -// Create the right N for |x| < pi/4 and otherwise -// Case 2: Place integer part of N in GP register -// -(p7) fcvt.xf FR_N_float = FR_N_fix - nop.i 999 ;; -} - -{ .mmf - nop.m 999 -(p7) getf.sig GR_N_Inc = FR_N_fix -(p8) fcmp.gt.s1 p8, p0 = FR_Input_X, FR_Neg_Two_to_M3 ;; -} - -{ .mib - nop.m 999 - nop.i 999 -// -// Load 2**(-33), -2**(-33) -// -(p8) br.cond.spnt L(SINCOS_SMALL_R) ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p6) br.cond.sptk L(SINCOS_NORMAL_R) ;; -} -// -// if |x| < pi/4, branch based on |x| < 2**(-3) or otherwise. -// -// -// In this branch, |x| >= pi/4. -// - -{ .mfi - ldfs FR_Neg_Two_to_M67 = [GR_Table_Base1], 8 -// -// Load -2**(-67) -// - fnma.s1 FR_s = FR_N_float, FR_P_1, FR_Input_X -// -// w = N * P_2 -// s = -N * P_1 + Arg -// - add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos -} - -{ .mfi - nop.m 999 - fma.s1 FR_w = FR_N_float, FR_P_2, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// Adjust N_fix by N_inc to determine whether sine or -// cosine is being calculated -// - fcmp.lt.unc.s1 p7, p6 = FR_s, FR_Two_to_M33 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p7) fcmp.gt.s1 p7, p6 = FR_s, FR_Neg_Two_to_M33 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// Remember x >= pi/4. 
-// Is s <= -2**(-33) or s >= 2**(-33) (p6) -// or -2**(-33) < s < 2**(-33) (p7) -(p6) fms.s1 FR_r = FR_s, f1, FR_w - nop.i 999 -} - -{ .mfi - nop.m 999 -(p7) fma.s1 FR_w = FR_N_float, FR_P_3, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p7) fma.s1 FR_U_1 = FR_N_float, FR_P_2, FR_w - nop.i 999 -} - -{ .mfi - nop.m 999 -(p6) fms.s1 FR_c = FR_s, f1, FR_r - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// For big s: r = s - w: No futher reduction is necessary -// For small s: w = N * P_3 (change sign) More reduction -// -(p6) fcmp.lt.unc.s1 p8, p9 = FR_r, FR_Two_to_M3 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p8) fcmp.gt.s1 p8, p9 = FR_r, FR_Neg_Two_to_M3 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p7) fms.s1 FR_r = FR_s, f1, FR_U_1 - nop.i 999 -} - -{ .mfb - nop.m 999 -// -// For big s: Is |r| < 2**(-3)? -// For big s: c = S - r -// For small s: U_1 = N * P_2 + w -// -// If p8 is set, prepare to branch to Small_R. -// If p9 is set, prepare to branch to Normal_R. -// For big s, r is complete here. -// -(p6) fms.s1 FR_c = FR_c, f1, FR_w -// -// For big s: c = c + w (w has not been negated.) -// For small s: r = S - U_1 -// -(p8) br.cond.spnt L(SINCOS_SMALL_R) ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.sptk L(SINCOS_NORMAL_R) ;; -} - -{ .mfi -(p7) add GR_Table_Base1 = 224, GR_Table_Base1 -// -// Branch to SINCOS_SMALL_R or SINCOS_NORMAL_R -// -(p7) fms.s1 FR_U_2 = FR_N_float, FR_P_2, FR_U_1 -// -// c = S - U_1 -// r = S_1 * r -// -// -(p7) extr.u GR_i_1 = GR_N_Inc, 0, 1 -} - -{ .mmi - nop.m 999 ;; -// -// Get [i_0,i_1] - two lsb of N_fix_gr. -// Do dummy fmpy so inexact is always set. -// -(p7) cmp.eq.unc p9, p10 = 0x0, GR_i_1 -(p7) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; -} -// -// For small s: U_2 = N * P_2 - U_1 -// S_1 stored constant - grab the one stored with the -// coefficients. 
-// - -{ .mfi -(p7) ldfe FR_S_1 = [GR_Table_Base1], 16 -// -// Check if i_1 and i_0 != 0 -// -(p10) fma.s1 FR_poly = f0, f1, FR_Neg_Two_to_M67 -(p7) cmp.eq.unc p11, p12 = 0x0, GR_i_0 ;; -} - -{ .mfi - nop.m 999 -(p7) fms.s1 FR_s = FR_s, f1, FR_r - nop.i 999 -} - -{ .mfi - nop.m 999 -// -// S = S - r -// U_2 = U_2 + w -// load S_1 -// -(p7) fma.s1 FR_rsq = FR_r, FR_r, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p7) fma.s1 FR_U_2 = FR_U_2, f1, FR_w - nop.i 999 -} - -{ .mfi - nop.m 999 -(p7) fmerge.se FR_Input_X = FR_r, FR_r - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p10) fma.s1 FR_Input_X = f0, f1, f1 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// FR_rsq = r * r -// Save r as the result. -// -(p7) fms.s1 FR_c = FR_s, f1, FR_U_1 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// if ( i_1 ==0) poly = c + S_1*r*r*r -// else Result = 1 -// -(p12) fnma.s1 FR_Input_X = FR_Input_X, f1, f0 - nop.i 999 -} - -{ .mfi - nop.m 999 -(p7) fma.s1 FR_r = FR_S_1, FR_r, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p7) fma.d.s0 FR_S_1 = FR_S_1, FR_S_1, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// If i_1 != 0, poly = 2**(-67) -// -(p7) fms.s1 FR_c = FR_c, f1, FR_U_2 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// c = c - U_2 -// -(p9) fma.s1 FR_poly = FR_r, FR_rsq, FR_c - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// i_0 != 0, so Result = -Result -// -(p11) fma.d.s0 FR_Input_X = FR_Input_X, f1, FR_poly - nop.i 999 ;; -} - -{ .mfb - nop.m 999 -(p12) fms.d.s0 FR_Input_X = FR_Input_X, f1, FR_poly -// -// if (i_0 == 0), Result = Result + poly -// else Result = Result - poly -// - br.ret.sptk b0 ;; -} -L(SINCOS_LARGER_ARG): - -{ .mfi - nop.m 999 - fma.s1 FR_N_0 = FR_Input_X, FR_Inv_P_0, f0 - nop.i 999 -} -;; - -// This path for argument > 2*24 -// Adjust table_ptr1 to beginning of table. 
-// - -{ .mmi - nop.m 999 - addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp - nop.i 999 -} -;; - -{ .mmi - ld8 GR_Table_Base = [GR_Table_Base] - nop.m 999 - nop.i 999 -} -;; - - -// -// Point to 2*-14 -// N_0 = Arg * Inv_P_0 -// - -{ .mmi - add GR_Table_Base = 688, GR_Table_Base ;; - ldfs FR_Two_to_M14 = [GR_Table_Base], 4 - nop.i 999 ;; -} - -{ .mfi - ldfs FR_Neg_Two_to_M14 = [GR_Table_Base], 0 - nop.f 999 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// Load values 2**(-14) and -2**(-14) -// - fcvt.fx.s1 FR_N_0_fix = FR_N_0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// N_0_fix = integer part of N_0 -// - fcvt.xf FR_N_0 = FR_N_0_fix - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// Make N_0 the integer part -// - fnma.s1 FR_ArgPrime = FR_N_0, FR_P_0, FR_Input_X - nop.i 999 -} - -{ .mfi - nop.m 999 - fma.s1 FR_w = FR_N_0, FR_d_1, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// Arg' = -N_0 * P_0 + Arg -// w = N_0 * d_1 -// - fma.s1 FR_N_float = FR_ArgPrime, FR_Inv_pi_by_2, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// N = A' * 2/pi -// - fcvt.fx.s1 FR_N_fix = FR_N_float - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// N_fix is the integer part -// - fcvt.xf FR_N_float = FR_N_fix - nop.i 999 ;; -} - -{ .mfi - getf.sig GR_N_Inc = FR_N_fix - nop.f 999 - nop.i 999 ;; -} - -{ .mii - nop.m 999 - nop.i 999 ;; - add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos ;; -} - -{ .mfi - nop.m 999 -// -// N is the integer part of the reduced-reduced argument. 
-// Put the integer in a GP register -// - fnma.s1 FR_s = FR_N_float, FR_P_1, FR_ArgPrime - nop.i 999 -} - -{ .mfi - nop.m 999 - fnma.s1 FR_w = FR_N_float, FR_P_2, FR_w - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// s = -N*P_1 + Arg' -// w = -N*P_2 + w -// N_fix_gr = N_fix_gr + N_inc -// - fcmp.lt.unc.s1 p9, p8 = FR_s, FR_Two_to_M14 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p9) fcmp.gt.s1 p9, p8 = FR_s, FR_Neg_Two_to_M14 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// For |s| > 2**(-14) r = S + w (r complete) -// Else U_hi = N_0 * d_1 -// -(p9) fma.s1 FR_V_hi = FR_N_float, FR_P_2, f0 - nop.i 999 -} - -{ .mfi - nop.m 999 -(p9) fma.s1 FR_U_hi = FR_N_0, FR_d_1, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// Either S <= -2**(-14) or S >= 2**(-14) -// or -2**(-14) < s < 2**(-14) -// -(p8) fma.s1 FR_r = FR_s, f1, FR_w - nop.i 999 -} - -{ .mfi - nop.m 999 -(p9) fma.s1 FR_w = FR_N_float, FR_P_3, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// We need abs of both U_hi and V_hi - don't -// worry about switched sign of V_hi. -// -(p9) fms.s1 FR_A = FR_U_hi, f1, FR_V_hi - nop.i 999 -} - -{ .mfi - nop.m 999 -// -// Big s: finish up c = (S - r) + w (c complete) -// Case 4: A = U_hi + V_hi -// Note: Worry about switched sign of V_hi, so subtract instead of add. -// -(p9) fnma.s1 FR_V_lo = FR_N_float, FR_P_2, FR_V_hi - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p9) fms.s1 FR_U_lo = FR_N_0, FR_d_1, FR_U_hi - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p9) fmerge.s FR_V_hiabs = f0, FR_V_hi - nop.i 999 -} - -{ .mfi - nop.m 999 -// For big s: c = S - r -// For small s do more work: U_lo = N_0 * d_1 - U_hi -// -(p9) fmerge.s FR_U_hiabs = f0, FR_U_hi - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// For big s: Is |r| < 2**(-3) -// For big s: if p12 set, prepare to branch to Small_R. -// For big s: If p13 set, prepare to branch to Normal_R. 
-// -(p8) fms.s1 FR_c = FR_s, f1, FR_r - nop.i 999 -} - -{ .mfi - nop.m 999 -// -// For small S: V_hi = N * P_2 -// w = N * P_3 -// Note the product does not include the (-) as in the writeup -// so (-) missing for V_hi and w. -// -(p8) fcmp.lt.unc.s1 p12, p13 = FR_r, FR_Two_to_M3 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p12) fcmp.gt.s1 p12, p13 = FR_r, FR_Neg_Two_to_M3 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p8) fma.s1 FR_c = FR_c, f1, FR_w - nop.i 999 -} - -{ .mfb - nop.m 999 -(p9) fms.s1 FR_w = FR_N_0, FR_d_2, FR_w -(p12) br.cond.spnt L(SINCOS_SMALL_R) ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p13) br.cond.sptk L(SINCOS_NORMAL_R) ;; -} + ldfpd sincos_P2,sincos_Q2 = [sincos_AD_1],16 + cmp.ge p10,p0 = sincos_r_exp,sincos_exp_limit +(p10) br.cond.spnt _SINCOS_LARGE_ARGS // Go to "large args" routine +};; +// sincos_W = x * sincos_Inv_Pi_by_16 +// Multiply x by scaled 16/pi and add large const to shift integer part of W to +// rightmost bits of significand { .mfi - nop.m 999 -// -// Big s: Vector off when |r| < 2**(-3). Recall that p8 will be true. -// The remaining stuff is for Case 4. -// Small s: V_lo = N * P_2 + U_hi (U_hi is in place of V_hi in writeup) -// Note: the (-) is still missing for V_lo. -// Small s: w = w + N_0 * d_2 -// Note: the (-) is now incorporated in w. 
-// -(p9) fcmp.ge.unc.s1 p10, p11 = FR_U_hiabs, FR_V_hiabs - extr.u GR_i_1 = GR_N_Inc, 0, 1 ;; -} + ldfpd sincos_P1,sincos_Q1 = [sincos_AD_1],16 + fma.s1 sincos_W_2TO61_RSH = sincos_NORM_f8,sincos_SIG_INV_PI_BY_16_2TO61,sincos_RSHF_2TO61 + nop.i 999 +};; +// sincos_NFLOAT = Round_Int_Nearest(sincos_W) +// This is done by scaling back by 2^-61 and subtracting the shift constant { .mfi - nop.m 999 -// -// C_hi = S + A -// -(p9) fma.s1 FR_t = FR_U_lo, f1, FR_V_lo - extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; -} + nop.m 999 + fms.s1 sincos_NFLOAT = sincos_W_2TO61_RSH,sincos_2TOM61,sincos_RSHF + nop.i 999 +};; -{ .mfi - nop.m 999 -// -// t = U_lo + V_lo -// -// -(p10) fms.s1 FR_a = FR_U_hi, f1, FR_A - nop.i 999 ;; -} +// get N = (int)sincos_int_Nfloat { .mfi - nop.m 999 -(p11) fma.s1 FR_a = FR_V_hi, f1, FR_A - nop.i 999 -} -;; - -{ .mmi - nop.m 999 - addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp - nop.i 999 -} -;; - -{ .mmi - ld8 GR_Table_Base = [GR_Table_Base] - nop.m 999 - nop.i 999 -} -;; - + getf.sig sincos_GR_n = sincos_W_2TO61_RSH + nop.f 999 + nop.i 999 +};; +// Add 2^(k-1) (which is in sincos_r_sincos) to N +// sincos_r = -sincos_Nfloat * sincos_Pi_by_16_1 + x { .mfi - add GR_Table_Base = 528, GR_Table_Base -// -// Is U_hiabs >= V_hiabs? -// -(p9) fma.s1 FR_C_hi = FR_s, f1, FR_A - nop.i 999 ;; -} + add sincos_GR_n = sincos_GR_n, sincos_r_sincos + fnma.s1 sincos_r = sincos_NFLOAT, sincos_Pi_by_16_1, sincos_NORM_f8 + nop.i 999 +};; +// Get M (least k+1 bits of N) { .mmi - ldfe FR_C_1 = [GR_Table_Base], 16 ;; - ldfe FR_C_2 = [GR_Table_Base], 64 - nop.i 999 ;; -} - -{ .mmf - nop.m 999 -// -// c = c + C_lo finished. -// Load C_2 -// - ldfe FR_S_1 = [GR_Table_Base], 16 -// -// C_lo = S - C_hi -// - fma.s1 FR_t = FR_t, f1, FR_w ;; -} -// -// r and c have been computed. -// Make sure ftz mode is set - should be automatic when using wre -// |r| < 2**(-3) -// Get [i_0,i_1] - two lsb of N_fix. 
-// Load S_1 -// + and sincos_GR_m = 0x1f,sincos_GR_n;; + nop.m 999 + shl sincos_GR_32m = sincos_GR_m,5 +};; +// Add 32*M to address of sin_cos_beta table { .mfi - ldfe FR_S_2 = [GR_Table_Base], 64 -// -// t = t + w -// -(p10) fms.s1 FR_a = FR_a, f1, FR_V_hi - cmp.eq.unc p9, p10 = 0x0, GR_i_0 -} + add sincos_AD_2 = sincos_GR_32m, sincos_AD_1 +(p8) fclass.m.unc p10,p0 = f8,0x0b // For sin denorm. - set uflow + nop.i 999 +};; +// Load Sin and Cos table value using obtained index m (sincosf_AD_2) { .mfi - nop.m 999 -// -// For larger u than v: a = U_hi - A -// Else a = V_hi - A (do an add to account for missing (-) on V_hi -// - fms.s1 FR_C_lo = FR_s, f1, FR_C_hi - nop.i 999 ;; -} + ldfe sincos_Sm = [sincos_AD_2],16 +(p9) fclass.m.unc p11,p0 = f8,0x0b // For cos denorm - set denorm + nop.i 999 +};; +// sincos_r = sincos_r -sincos_Nfloat * sincos_Pi_by_16_2 { .mfi - nop.m 999 -(p11) fms.s1 FR_a = FR_U_hi, f1, FR_a - cmp.eq.unc p11, p12 = 0x0, GR_i_1 -} + ldfe sincos_Cm = [sincos_AD_2] + fnma.s1 sincos_r = sincos_NFLOAT, sincos_Pi_by_16_2, sincos_r + nop.i 999 +};; +// get rsq = r*r { .mfi - nop.m 999 -// -// If u > v: a = (U_hi - A) + V_hi -// Else a = (V_hi - A) + U_hi -// In each case account for negative missing from V_hi. 
-// - fma.s1 FR_C_lo = FR_C_lo, f1, FR_A - nop.i 999 ;; + nop.m 999 + fma.s1 sincos_rsq = sincos_r, sincos_r, f0 // r^2 = r*r + nop.i 999 } - { .mfi - nop.m 999 -// -// C_lo = (S - C_hi) + A -// - fma.s1 FR_t = FR_t, f1, FR_a - nop.i 999 ;; -} + nop.m 999 + fmpy.s0 fp_tmp = fp_tmp,fp_tmp // forces inexact flag + nop.i 999 +};; +// sincos_r_exact = sincos_r -sincos_Nfloat * sincos_Pi_by_16_3 { .mfi - nop.m 999 -// -// t = t + a -// - fma.s1 FR_C_lo = FR_C_lo, f1, FR_t - nop.i 999 ;; -} + nop.m 999 + fnma.s1 sincos_r_exact = sincos_NFLOAT, sincos_Pi_by_16_3, sincos_r + nop.i 999 +};; +// Polynomials calculation +// P_1 = P4*r^2 + P3 +// Q_2 = Q4*r^2 + Q3 { .mfi - nop.m 999 -// -// C_lo = C_lo + t -// Adjust Table_Base to beginning of table -// - fma.s1 FR_r = FR_C_hi, f1, FR_C_lo - nop.i 999 ;; + nop.m 999 + fma.s1 sincos_P_temp1 = sincos_rsq, sincos_P4, sincos_P3 + nop.i 999 } - { .mfi - nop.m 999 -// -// Load S_2 -// - fma.s1 FR_rsq = FR_r, FR_r, f0 - nop.i 999 -} + nop.m 999 + fma.s1 sincos_Q_temp1 = sincos_rsq, sincos_Q4, sincos_Q3 + nop.i 999 +};; +// get rcube = r^3 and S[m]*r^2 { .mfi - nop.m 999 -// -// Table_Base points to C_1 -// r = C_hi + C_lo -// - fms.s1 FR_c = FR_C_hi, f1, FR_r - nop.i 999 ;; + nop.m 999 + fmpy.s1 sincos_srsq = sincos_Sm,sincos_rsq + nop.i 999 } - { .mfi - nop.m 999 -// -// if i_1 ==0: poly = S_2 * FR_rsq + S_1 -// else poly = C_2 * FR_rsq + C_1 -// -(p11) fma.s1 FR_Input_X = f0, f1, FR_r - nop.i 999 ;; -} + nop.m 999 + fmpy.s1 sincos_rcub = sincos_r_exact, sincos_rsq + nop.i 999 +};; +// Polynomials calculation +// Q_2 = Q_1*r^2 + Q2 +// P_1 = P_1*r^2 + P2 { .mfi - nop.m 999 -(p12) fma.s1 FR_Input_X = f0, f1, f1 - nop.i 999 ;; + nop.m 999 + fma.s1 sincos_Q_temp2 = sincos_rsq, sincos_Q_temp1, sincos_Q2 + nop.i 999 } - { .mfi - nop.m 999 -// -// Compute r_cube = FR_rsq * r -// -(p11) fma.s1 FR_poly = FR_rsq, FR_S_2, FR_S_1 - nop.i 999 ;; -} + nop.m 999 + fma.s1 sincos_P_temp2 = sincos_rsq, sincos_P_temp1, sincos_P2 + nop.i 999 +};; +// 
Polynomials calculation +// Q = Q_2*r^2 + Q1 +// P = P_2*r^2 + P1 { .mfi - nop.m 999 -(p12) fma.s1 FR_poly = FR_rsq, FR_C_2, FR_C_1 - nop.i 999 + nop.m 999 + fma.s1 sincos_Q = sincos_rsq, sincos_Q_temp2, sincos_Q1 + nop.i 999 } { .mfi - nop.m 999 -// -// Compute FR_rsq = r * r -// Is i_1 == 0 ? -// - fma.s1 FR_r_cubed = FR_rsq, FR_r, f0 - nop.i 999 ;; -} + nop.m 999 + fma.s1 sincos_P = sincos_rsq, sincos_P_temp2, sincos_P1 + nop.i 999 +};; +// Get final P and Q +// Q = Q*S[m]*r^2 + S[m] +// P = P*r^3 + r { .mfi - nop.m 999 -// -// c = C_hi - r -// Load C_1 -// - fma.s1 FR_c = FR_c, f1, FR_C_lo - nop.i 999 + nop.m 999 + fma.s1 sincos_Q = sincos_srsq,sincos_Q, sincos_Sm + nop.i 999 } { .mfi - nop.m 999 -// -// if i_1 ==0: poly = r_cube * poly + c -// else poly = FR_rsq * poly -// -(p10) fms.s1 FR_Input_X = f0, f1, FR_Input_X - nop.i 999 ;; -} + nop.m 999 + fma.s1 sincos_P = sincos_rcub,sincos_P, sincos_r_exact + nop.i 999 +};; +// If sin(denormal), force underflow to be set +.pred.rel "mutex",p10,p11 { .mfi - nop.m 999 -// -// if i_1 ==0: Result = r -// else Result = 1.0 -// -(p11) fma.s1 FR_poly = FR_r_cubed, FR_poly, FR_c - nop.i 999 ;; + nop.m 999 +(p10) fmpy.d.s0 fp_tmp = f8,f8 // forces underflow flag + nop.i 999 // for denormal sine args } - { .mfi - nop.m 999 -(p12) fma.s1 FR_poly = FR_rsq, FR_poly, f0 - nop.i 999 ;; -} + nop.m 999 +(p11) fma.d.s0 fp_tmp = f8,f1, f8 // forces denormal flag + nop.i 999 // for denormal cosine args +};; -{ .mfi - nop.m 999 -// -// if i_0 !=0: Result = -Result -// -(p9) fma.d.s0 FR_Input_X = FR_Input_X, f1, FR_poly - nop.i 999 ;; -} +// Final calculation +// result = C[m]*P + Q { .mfb - nop.m 999 -(p10) fms.d.s0 FR_Input_X = FR_Input_X, f1, FR_poly -// -// if i_0 == 0: Result = Result + poly -// else Result = Result - poly -// - br.ret.sptk b0 ;; -} -L(SINCOS_SMALL_R): - -{ .mii - nop.m 999 - extr.u GR_i_1 = GR_N_Inc, 0, 1 ;; -// -// -// Compare both i_1 and i_0 with 0. -// if i_1 == 0, set p9. -// if i_0 == 0, set p11. 
-// - cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;; -} - -{ .mfi - nop.m 999 - fma.s1 FR_rsq = FR_r, FR_r, f0 - extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; -} - -{ .mfi - nop.m 999 -// -// Z = Z * FR_rsq -// -(p10) fnma.s1 FR_c = FR_c, FR_r, f0 - cmp.eq.unc p11, p12 = 0x0, GR_i_0 -} -;; - -// ****************************************************************** -// ****************************************************************** -// ****************************************************************** -// r and c have been computed. -// We know whether this is the sine or cosine routine. -// Make sure ftz mode is set - should be automatic when using wre -// |r| < 2**(-3) -// -// Set table_ptr1 to beginning of constant table. -// Get [i_0,i_1] - two lsb of N_fix_gr. -// - -{ .mmi - nop.m 999 - addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp - nop.i 999 -} -;; - -{ .mmi - ld8 GR_Table_Base = [GR_Table_Base] - nop.m 999 - nop.i 999 -} -;; - - -// -// Set table_ptr1 to point to S_5. -// Set table_ptr1 to point to C_5. -// Compute FR_rsq = r * r -// - -{ .mfi -(p9) add GR_Table_Base = 672, GR_Table_Base -(p10) fmerge.s FR_r = f1, f1 -(p10) add GR_Table_Base = 592, GR_Table_Base ;; -} -// -// Set table_ptr1 to point to S_5. -// Set table_ptr1 to point to C_5. 
-// - -{ .mmi -(p9) ldfe FR_S_5 = [GR_Table_Base], -16 ;; -// -// if (i_1 == 0) load S_5 -// if (i_1 != 0) load C_5 -// -(p9) ldfe FR_S_4 = [GR_Table_Base], -16 - nop.i 999 ;; -} - -{ .mmf -(p10) ldfe FR_C_5 = [GR_Table_Base], -16 -// -// Z = FR_rsq * FR_rsq -// -(p9) ldfe FR_S_3 = [GR_Table_Base], -16 -// -// Compute FR_rsq = r * r -// if (i_1 == 0) load S_4 -// if (i_1 != 0) load C_4 -// - fma.s1 FR_Z = FR_rsq, FR_rsq, f0 ;; -} -// -// if (i_1 == 0) load S_3 -// if (i_1 != 0) load C_3 -// - -{ .mmi -(p9) ldfe FR_S_2 = [GR_Table_Base], -16 ;; -// -// if (i_1 == 0) load S_2 -// if (i_1 != 0) load C_2 -// -(p9) ldfe FR_S_1 = [GR_Table_Base], -16 - nop.i 999 -} - -{ .mmi -(p10) ldfe FR_C_4 = [GR_Table_Base], -16 ;; -(p10) ldfe FR_C_3 = [GR_Table_Base], -16 - nop.i 999 ;; -} - -{ .mmi -(p10) ldfe FR_C_2 = [GR_Table_Base], -16 ;; -(p10) ldfe FR_C_1 = [GR_Table_Base], -16 - nop.i 999 -} - -{ .mfi - nop.m 999 -// -// if (i_1 != 0): -// poly_lo = FR_rsq * C_5 + C_4 -// poly_hi = FR_rsq * C_2 + C_1 -// -(p9) fma.s1 FR_Z = FR_Z, FR_r, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// if (i_1 == 0) load S_1 -// if (i_1 != 0) load C_1 -// -(p9) fma.s1 FR_poly_lo = FR_rsq, FR_S_5, FR_S_4 - nop.i 999 -} - -{ .mfi - nop.m 999 -// -// c = -c * r -// dummy fmpy's to flag inexact. -// -(p9) fma.d.s0 FR_S_4 = FR_S_4, FR_S_4, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// poly_lo = FR_rsq * poly_lo + C_3 -// poly_hi = FR_rsq * poly_hi -// - fma.s1 FR_Z = FR_Z, FR_rsq, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p9) fma.s1 FR_poly_hi = FR_rsq, FR_S_2, FR_S_1 - nop.i 999 -} - -{ .mfi - nop.m 999 -// -// if (i_1 == 0): -// poly_lo = FR_rsq * S_5 + S_4 -// poly_hi = FR_rsq * S_2 + S_1 -// -(p10) fma.s1 FR_poly_lo = FR_rsq, FR_C_5, FR_C_4 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// if (i_1 == 0): -// Z = Z * r for only one of the small r cases - not there -// in original implementation notes. 
-// -(p9) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_S_3 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p10) fma.s1 FR_poly_hi = FR_rsq, FR_C_2, FR_C_1 - nop.i 999 -} - -{ .mfi - nop.m 999 -(p10) fma.d.s0 FR_C_1 = FR_C_1, FR_C_1, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p9) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 - nop.i 999 -} - -{ .mfi - nop.m 999 -// -// poly_lo = FR_rsq * poly_lo + S_3 -// poly_hi = FR_rsq * poly_hi -// -(p10) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_C_3 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p10) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// if (i_1 == 0): dummy fmpy's to flag inexact -// r = 1 -// -(p9) fma.s1 FR_poly_hi = FR_r, FR_poly_hi, f0 - nop.i 999 -} - -{ .mfi - nop.m 999 -// -// poly_hi = r * poly_hi -// - fma.s1 FR_poly = FR_Z, FR_poly_lo, FR_c - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p12) fms.s1 FR_r = f0, f1, FR_r - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// poly_hi = Z * poly_lo + c -// if i_0 == 1: r = -r -// - fma.s1 FR_poly = FR_poly, f1, FR_poly_hi - nop.i 999 ;; -} + nop.m 999 + fma.d.s0 f8 = sincos_Cm, sincos_P, sincos_Q + br.ret.sptk b0 // Exit for common path +};; +////////// x = 0/Inf/NaN path ////////////////// +_SINCOS_SPECIAL_ARGS: +.pred.rel "mutex",p8,p9 +// sin(+/-0) = +/-0 +// sin(Inf) = NaN +// sin(NaN) = NaN { .mfi - nop.m 999 -(p12) fms.d.s0 FR_Input_X = FR_r, f1, FR_poly - nop.i 999 + nop.m 999 +(p8) fma.d.s0 f8 = f8, f0, f0 // sin(+/-0,NaN,Inf) + nop.i 999 } - +// cos(+/-0) = 1.0 +// cos(Inf) = NaN +// cos(NaN) = NaN { .mfb - nop.m 999 -// -// poly = poly + poly_hi -// -(p11) fma.d.s0 FR_Input_X = FR_r, f1, FR_poly -// -// if (i_0 == 0) Result = r + poly -// if (i_0 != 0) Result = r - poly -// - br.ret.sptk b0 ;; -} -L(SINCOS_NORMAL_R): - -{ .mii - nop.m 999 - extr.u GR_i_1 = GR_N_Inc, 0, 1 ;; -// -// Set table_ptr1 and table_ptr2 to base address of -// constant table. 
- cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;; -} - -{ .mfi - nop.m 999 - fma.s1 FR_rsq = FR_r, FR_r, f0 - extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; -} + nop.m 999 +(p9) fma.d.s0 f8 = f8, f0, f1 // cos(+/-0,NaN,Inf) + br.ret.sptk b0 // Exit for x = 0/Inf/NaN path +};; +GLOBAL_IEEE754_END(cos) +//////////// x >= 2^27 - large arguments routine call //////////// +LOCAL_LIBM_ENTRY(__libm_callout_sincos) +_SINCOS_LARGE_ARGS: +.prologue { .mfi - nop.m 999 - frcpa.s1 FR_r_hi, p6 = f1, FR_r - cmp.eq.unc p11, p12 = 0x0, GR_i_0 -} -;; - -// ****************************************************************** -// ****************************************************************** -// ****************************************************************** -// -// r and c have been computed. -// We known whether this is the sine or cosine routine. -// Make sure ftz mode is set - should be automatic when using wre -// Get [i_0,i_1] - two lsb of N_fix_gr alone. -// - -{ .mmi - nop.m 999 - addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp - nop.i 999 + mov sincos_GR_all_ones = -1 // 0xffffffff + nop.f 999 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS = ar.pfs } ;; -{ .mmi - ld8 GR_Table_Base = [GR_Table_Base] - nop.m 999 - nop.i 999 -} -;; - - -{ .mfi -(p10) add GR_Table_Base = 384, GR_Table_Base -(p12) fms.s1 FR_Input_X = f0, f1, f1 -(p9) add GR_Table_Base = 224, GR_Table_Base ;; -} - -{ .mmf - nop.m 999 -(p10) ldfe FR_QQ_8 = [GR_Table_Base], 16 -// -// if (i_1==0) poly = poly * FR_rsq + PP_1_lo -// else poly = FR_rsq * poly -// -(p11) fma.s1 FR_Input_X = f0, f1, f1 ;; -} - -{ .mmf -(p10) ldfe FR_QQ_7 = [GR_Table_Base], 16 -// -// Adjust table pointers based on i_0 -// Compute rsq = r * r -// -(p9) ldfe FR_PP_8 = [GR_Table_Base], 16 - fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 ;; -} - -{ .mmf -(p9) ldfe FR_PP_7 = [GR_Table_Base], 16 -(p10) ldfe FR_QQ_6 = [GR_Table_Base], 16 -// -// Load PP_8 and QQ_8; PP_7 and QQ_7 -// - frcpa.s1 FR_r_hi, p6 = f1, FR_r_hi ;; -} -// -// if (i_1==0) poly = PP_7 + FR_rsq * PP_8. 
-// else poly = QQ_7 + FR_rsq * QQ_8. -// - -{ .mmb -(p9) ldfe FR_PP_6 = [GR_Table_Base], 16 -(p10) ldfe FR_QQ_5 = [GR_Table_Base], 16 - nop.b 999 ;; -} - -{ .mmb -(p9) ldfe FR_PP_5 = [GR_Table_Base], 16 -(p10) ldfe FR_S_1 = [GR_Table_Base], 16 - nop.b 999 ;; -} - -{ .mmb -(p10) ldfe FR_QQ_1 = [GR_Table_Base], 16 -(p9) ldfe FR_C_1 = [GR_Table_Base], 16 - nop.b 999 ;; -} - -{ .mmi -(p10) ldfe FR_QQ_4 = [GR_Table_Base], 16 ;; -(p9) ldfe FR_PP_1 = [GR_Table_Base], 16 - nop.i 999 ;; -} - -{ .mmf -(p10) ldfe FR_QQ_3 = [GR_Table_Base], 16 -// -// if (i_1=0) corr = corr + c*c -// else corr = corr * c -// -(p9) ldfe FR_PP_4 = [GR_Table_Base], 16 -(p10) fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7 ;; -} -// -// if (i_1=0) poly = rsq * poly + PP_5 -// else poly = rsq * poly + QQ_5 -// Load PP_4 or QQ_4 -// - -{ .mmf -(p9) ldfe FR_PP_3 = [GR_Table_Base], 16 -(p10) ldfe FR_QQ_2 = [GR_Table_Base], 16 -// -// r_hi = frcpa(frcpa(r)). -// r_cube = r * FR_rsq. -// -(p9) fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7 ;; -} -// -// Do dummy multiplies so inexact is always set. 
-// - -{ .mfi -(p9) ldfe FR_PP_2 = [GR_Table_Base], 16 -// -// r_lo = r - r_hi -// -(p9) fma.s1 FR_U_lo = FR_r_hi, FR_r_hi, f0 - nop.i 999 ;; -} - -{ .mmf - nop.m 999 -(p9) ldfe FR_PP_1_lo = [GR_Table_Base], 16 -(p10) fma.s1 FR_corr = FR_S_1, FR_r_cubed, FR_r -} - -{ .mfi - nop.m 999 -(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_6 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// if (i_1=0) U_lo = r_hi * r_hi -// else U_lo = r_hi + r -// -(p9) fma.s1 FR_corr = FR_C_1, FR_rsq, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// if (i_1=0) corr = C_1 * rsq -// else corr = S_1 * r_cubed + r -// -(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_6 - nop.i 999 -} - -{ .mfi - nop.m 999 -(p10) fma.s1 FR_U_lo = FR_r_hi, f1, FR_r - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// if (i_1=0) U_hi = r_hi + U_hi -// else U_hi = QQ_1 * U_hi + 1 -// -(p9) fma.s1 FR_U_lo = FR_r, FR_r_hi, FR_U_lo - nop.i 999 -} - -{ .mfi - nop.m 999 -// -// U_hi = r_hi * r_hi -// - fms.s1 FR_r_lo = FR_r, f1, FR_r_hi - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// Load PP_1, PP_6, PP_5, and C_1 -// Load QQ_1, QQ_6, QQ_5, and S_1 -// - fma.s1 FR_U_hi = FR_r_hi, FR_r_hi, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_5 - nop.i 999 -} - -{ .mfi - nop.m 999 -(p10) fnma.s1 FR_corr = FR_corr, FR_c, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// if (i_1=0) U_lo = r * r_hi + U_lo -// else U_lo = r_lo * U_lo -// -(p9) fma.s1 FR_corr = FR_corr, FR_c, FR_c - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_5 - nop.i 999 -} - -{ .mfi - nop.m 999 -// -// if (i_1 =0) U_hi = r + U_hi -// if (i_1 =0) U_lo = r_lo * U_lo -// -// -(p9) fma.d.s0 FR_PP_5 = FR_PP_5, FR_PP_4, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p9) fma.s1 FR_U_lo = FR_r, FR_r, FR_U_lo - nop.i 999 -} - -{ .mfi - nop.m 999 -(p10) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// if (i_1=0) poly = poly * rsq + PP_6 -// else poly = poly * rsq 
+ QQ_6 -// -(p9) fma.s1 FR_U_hi = FR_r_hi, FR_U_hi, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_4 - nop.i 999 -} - -{ .mfi - nop.m 999 -(p10) fma.s1 FR_U_hi = FR_QQ_1, FR_U_hi, f1 - nop.i 999 ;; -} - { .mfi - nop.m 999 -(p10) fma.d.s0 FR_QQ_5 = FR_QQ_5, FR_QQ_5, f0 - nop.i 999 ;; + mov GR_SAVE_GP = gp + nop.f 999 +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0 = b0 } -{ .mfi - nop.m 999 -// -// if (i_1!=0) U_hi = PP_1 * U_hi -// if (i_1!=0) U_lo = r * r + U_lo -// Load PP_3 or QQ_3 -// -(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_4 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p9) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 - nop.i 999 -} - -{ .mfi - nop.m 999 -(p10) fma.s1 FR_U_lo = FR_QQ_1,FR_U_lo, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p9) fma.s1 FR_U_hi = FR_PP_1, FR_U_hi, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_3 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// Load PP_2, QQ_2 -// -(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_3 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// if (i_1==0) poly = FR_rsq * poly + PP_3 -// else poly = FR_rsq * poly + QQ_3 -// Load PP_1_lo -// -(p9) fma.s1 FR_U_lo = FR_PP_1, FR_U_lo, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// if (i_1 =0) poly = poly * rsq + pp_r4 -// else poly = poly * rsq + qq_r4 -// -(p9) fma.s1 FR_U_hi = FR_r, f1, FR_U_hi - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_2 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// if (i_1==0) U_lo = PP_1_hi * U_lo -// else U_lo = QQ_1 * U_lo -// -(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_2 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// if (i_0==0) Result = 1 -// else Result = -1 -// - fma.s1 FR_V = FR_U_lo, f1, FR_corr - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// if (i_1==0) poly = FR_rsq * poly + PP_2 -// else poly = FR_rsq * poly + QQ_2 -// -(p9) fma.s1 
FR_poly = FR_rsq, FR_poly, FR_PP_1_lo - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// V = U_lo + corr -// -(p9) fma.s1 FR_poly = FR_r_cubed, FR_poly, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// if (i_1==0) poly = r_cube * poly -// else poly = FR_rsq * poly -// - fma.s1 FR_V = FR_poly, f1, FR_V - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p12) fms.d.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V - nop.i 999 -} - -{ .mfb - nop.m 999 -// -// V = V + poly -// -(p11) fma.d.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V -// -// if (i_0==0) Result = Result * U_hi + V -// else Result = Result * U_hi - V -// - br.ret.sptk b0 ;; -} - -// -// If cosine, FR_Input_X = 1 -// If sine, FR_Input_X = +/-Zero (Input FR_Input_X) -// Results are exact, no exceptions -// -L(SINCOS_ZERO): - -{ .mmb - cmp.eq.unc p6, p7 = 0x1, GR_Sin_or_Cos - nop.m 999 - nop.b 999 ;; -} - -{ .mfi - nop.m 999 -(p7) fmerge.s FR_Input_X = FR_Input_X, FR_Input_X - nop.i 999 -} - -{ .mfb - nop.m 999 -(p6) fmerge.s FR_Input_X = f1, f1 - br.ret.sptk b0 ;; -} - -L(SINCOS_SPECIAL): - -// -// Path for Arg = +/- QNaN, SNaN, Inf -// Invalid can be raised. 
SNaNs -// become QNaNs -// - -{ .mfb - nop.m 999 - fmpy.d.s0 FR_Input_X = FR_Input_X, f0 - br.ret.sptk b0 ;; -} -.endp __libm_cos_double_dbx# -ASM_SIZE_DIRECTIVE(__libm_cos_double_dbx#) - - - -// -// Call int pi_by_2_reduce(double* x, double *y) -// for |arguments| >= 2**63 -// Address to save r and c as double -// -// -// psp sp+64 -// sp+48 -> f0 c -// r45 sp+32 -> f0 r -// r44 -> sp+16 -> InputX -// sp sp -> scratch provided to callee - +.body +{ .mbb + setf.sig sincos_save_tmp = sincos_GR_all_ones// inexact set + nop.b 999 +(p8) br.call.sptk.many b0 = __libm_sin_large# // sin(large_X) +};; -.proc __libm_callout_2 -__libm_callout_2: -L(SINCOS_ARG_TOO_LARGE): +{ .mbb + cmp.ne p9,p0 = sincos_r_sincos, r0 // set p9 if cos + nop.b 999 +(p9) br.call.sptk.many b0 = __libm_cos_large# // cos(large_X) +};; -.prologue { .mfi - add r45=-32,sp // Parameter: r address - nop.f 0 -.save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov gp = GR_SAVE_GP + fma.d.s0 f8 = f8, f1, f0 // Round result to double + mov b0 = GR_SAVE_B0 } +// Force inexact set { .mfi -.fframe 64 - add sp=-64,sp // Create new stack - nop.f 0 - mov GR_SAVE_GP=gp // Save gp -};; -{ .mmi - stfe [r45] = f0,16 // Clear Parameter r on stack - add r44 = 16,sp // Parameter x address -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 -};; -.body -{ .mib - stfe [r45] = f0,-16 // Clear Parameter c on stack - nop.i 0 - nop.b 0 -} -{ .mib - stfe [r44] = FR_Input_X // Store Parameter x on stack - nop.i 0 - br.call.sptk b0=__libm_pi_by_2_reduce# ;; + nop.m 999 + fmpy.s0 sincos_save_tmp = sincos_save_tmp, sincos_save_tmp + nop.i 999 };; - -{ .mii - ldfe FR_Input_X =[r44],16 -// -// Get r and c off stack -// - adds GR_Table_Base1 = -16, GR_Table_Base1 -// -// Get r and c off stack -// - add GR_N_Inc = GR_Sin_or_Cos,r8 ;; -} -{ .mmb - ldfe FR_r =[r45],16 -// -// Get X off the stack -// Readjust Table ptr -// - ldfs FR_Two_to_M3 = [GR_Table_Base1],4 - nop.b 999 ;; -} -{ .mmb - ldfs FR_Neg_Two_to_M3 = 
[GR_Table_Base1],0 - ldfe FR_c =[r45] - nop.b 999 ;; -} - -{ .mfi -.restore sp - add sp = 64,sp // Restore stack pointer - fcmp.lt.unc.s1 p6, p0 = FR_r, FR_Two_to_M3 - mov b0 = GR_SAVE_B0 // Restore return address -};; { .mib - mov gp = GR_SAVE_GP // Restore gp - mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - nop.b 0 + nop.m 999 + mov ar.pfs = GR_SAVE_PFS + br.ret.sptk b0 // Exit for large arguments routine call };; +LOCAL_LIBM_END(__libm_callout_sincos) -{ .mfi - nop.m 999 -(p6) fcmp.gt.unc.s1 p6, p0 = FR_r, FR_Neg_Two_to_M3 - nop.i 999 ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p6) br.cond.spnt L(SINCOS_SMALL_R) ;; -} - -{ .mib - nop.m 999 - nop.i 999 - br.cond.sptk L(SINCOS_NORMAL_R) ;; -} - -.endp __libm_callout_2 -ASM_SIZE_DIRECTIVE(__libm_callout_2) - -.type __libm_pi_by_2_reduce#,@function -.global __libm_pi_by_2_reduce# - +.type __libm_sin_large#,@function +.global __libm_sin_large# +.type __libm_cos_large#,@function +.global __libm_cos_large# -.type __libm_sin_double_dbx#,@function -.global __libm_sin_double_dbx# -.type __libm_cos_double_dbx#,@function -.global __libm_cos_double_dbx# diff --git a/sysdeps/ia64/fpu/s_cosf.S b/sysdeps/ia64/fpu/s_cosf.S index 0e47255b3f..89cf82372d 100644 --- a/sysdeps/ia64/fpu/s_cosf.S +++ b/sysdeps/ia64/fpu/s_cosf.S @@ -1,12 +1,10 @@ - .file "sincosf.s" -// Copyright (C) 2000, 2001, Intel Corporation +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -22,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -37,663 +35,680 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. - - +// http://www.intel.com/software/products/opensource/libraries/num.htm. +// // History //============================================================== -// 2/02/00 Initial revision -// 4/02/00 Unwind support added. -// 5/10/00 Improved speed with new algorithm. -// 8/08/00 Improved speed by avoiding SIR flush. -// 8/17/00 Changed predicate register macro-usage to direct predicate -// names due to an assembler bug. -// 8/30/00 Put sin_of_r before sin_tbl_S_cos_of_r to gain a cycle -// 1/02/00 Fixed flag settings, improved speed. +// 02/02/00 Initial version +// 04/02/00 Unwind support added. +// 06/16/00 Updated tables to enforce symmetry +// 08/31/00 Saved 2 cycles in main path, and 9 in other paths. +// 09/20/00 The updated tables regressed to an old version, so reinstated them +// 10/18/00 Changed one table entry to ensure symmetry +// 01/03/01 Improved speed, fixed flag settings for small arguments. +// 02/18/02 Large arguments processing routine excluded +// 05/20/02 Cleaned up namespace and sf0 syntax +// 06/03/02 Insure inexact flag set for large arg result +// 09/05/02 Single precision version is made using double precision one as base +// 02/10/03 Reordered header: .section, .global, .proc, .align // // API //============================================================== // float sinf( float x); // float cosf( float x); // +// Overview of operation +//============================================================== +// +// Step 1 +// ====== +// Reduce x to region -1/2*pi/2^k ===== 0 ===== +1/2*pi/2^k where k=4 +// divide x by pi/2^k. 
+// Multiply by 2^k/pi. +// nfloat = Round result to integer (round-to-nearest) +// +// r = x - nfloat * pi/2^k +// Do this as (x - nfloat * HIGH(pi/2^k)) - nfloat * LOW(pi/2^k) -#include "libm_support.h" - -// Assembly macros +// for increased accuracy. +// pi/2^k is stored as two numbers that when added make pi/2^k. +// pi/2^k = HIGH(pi/2^k) + LOW(pi/2^k) +// HIGH part is rounded to zero, LOW - to nearest +// +// x = (nfloat * pi/2^k) + r +// r is small enough that we can use a polynomial approximation +// and is referred to as the reduced argument. +// +// Step 3 +// ====== +// Take the unreduced part and remove the multiples of 2pi. +// So nfloat = nfloat (with lower k+1 bits cleared) + lower k+1 bits +// +// nfloat (with lower k+1 bits cleared) is a multiple of 2^(k+1) +// N * 2^(k+1) +// nfloat * pi/2^k = N * 2^(k+1) * pi/2^k + (lower k+1 bits) * pi/2^k +// nfloat * pi/2^k = N * 2 * pi + (lower k+1 bits) * pi/2^k +// nfloat * pi/2^k = N2pi + M * pi/2^k +// +// +// Sin(x) = Sin((nfloat * pi/2^k) + r) +// = Sin(nfloat * pi/2^k) * Cos(r) + Cos(nfloat * pi/2^k) * Sin(r) +// +// Sin(nfloat * pi/2^k) = Sin(N2pi + Mpi/2^k) +// = Sin(N2pi)Cos(Mpi/2^k) + Cos(N2pi)Sin(Mpi/2^k) +// = Sin(Mpi/2^k) +// +// Cos(nfloat * pi/2^k) = Cos(N2pi + Mpi/2^k) +// = Cos(N2pi)Cos(Mpi/2^k) + Sin(N2pi)Sin(Mpi/2^k) +// = Cos(Mpi/2^k) +// +// Sin(x) = Sin(Mpi/2^k) Cos(r) + Cos(Mpi/2^k) Sin(r) +// +// +// Step 4 +// ====== +// 0 <= M < 2^(k+1) +// There are 2^(k+1) Sin entries in a table. +// There are 2^(k+1) Cos entries in a table. +// +// Get Sin(Mpi/2^k) and Cos(Mpi/2^k) by table lookup. +// +// +// Step 5 +// ====== +// Calculate Cos(r) and Sin(r) by polynomial approximation. 
+// +// Cos(r) = 1 + r^2 q1 + r^4 q2 = Series for Cos +// Sin(r) = r + r^3 p1 + r^5 p2 = Series for Sin +// +// and the coefficients q1, q2 and p1, p2 are stored in a table +// +// +// Calculate +// Sin(x) = Sin(Mpi/2^k) Cos(r) + Cos(Mpi/2^k) Sin(r) +// +// as follows +// +// S[m] = Sin(Mpi/2^k) and C[m] = Cos(Mpi/2^k) +// rsq = r*r +// +// +// P = P1 + r^2*P2 +// Q = Q1 + r^2*Q2 +// +// rcub = r * rsq +// Sin(r) = r + rcub * P +// = r + r^3p1 + r^5p2 = Sin(r) +// +// The coefficients are not exactly these values, but almost. +// +// p1 = -1/6 = -1/3! +// p2 = 1/120 = 1/5! +// p3 = -1/5040 = -1/7! +// p4 = 1/362880 = 1/9! +// +// P = r + r^3 * P +// +// Answer = S[m] Cos(r) + C[m] P +// +// Cos(r) = 1 + rsq Q +// Cos(r) = 1 + r^2 Q +// Cos(r) = 1 + r^2 (q1 + r^2q2) +// Cos(r) = 1 + r^2q1 + r^4q2 +// +// S[m] Cos(r) = S[m](1 + rsq Q) +// S[m] Cos(r) = S[m] + S[m] rsq Q +// S[m] Cos(r) = S[m] + s_rsq Q +// Q = S[m] + s_rsq Q +// +// Then, +// +// Answer = Q + C[m] P + + +// Registers used //============================================================== +// general input registers: +// r14 -> r19 +// r32 -> r45 -// SIN_Sin_Flag = p6 -// SIN_Cos_Flag = p7 - -// integer registers used - - SIN_AD_PQ_1 = r33 - SIN_AD_PQ_2 = r33 - sin_GR_sincos_flag = r34 - sin_GR_Mint = r35 - - sin_GR_index = r36 - gr_tmp = r37 - - GR_SAVE_B0 = r37 - GR_SAVE_GP = r38 - GR_SAVE_PFS = r39 - - -// floating point registers used - - sin_coeff_P1 = f32 - sin_coeff_P2 = f33 - sin_coeff_Q1 = f34 - sin_coeff_Q2 = f35 - sin_coeff_P4 = f36 - sin_coeff_P5 = f37 - sin_coeff_Q3 = f38 - sin_coeff_Q4 = f39 - sin_Mx = f40 - sin_Mfloat = f41 - sin_tbl_S = f42 - sin_tbl_C = f43 - sin_r = f44 - sin_rcube = f45 - sin_tsq = f46 - sin_r7 = f47 - sin_t = f48 - sin_poly_p2 = f49 - sin_poly_p1 = f50 - fp_tmp = f51 - sin_poly_p3 = f52 - sin_poly_p4 = f53 - sin_of_r = f54 - sin_S_t = f55 - sin_poly_q2 = f56 - sin_poly_q1 = f57 - sin_S_tcube = f58 - sin_poly_q3 = f59 - sin_poly_q4 = f60 - sin_tbl_S_tcube = f61 - 
sin_tbl_S_cos_of_r = f62 - - sin_coeff_Q5 = f63 - sin_coeff_Q6 = f64 - sin_coeff_P3 = f65 - - sin_poly_q5 = f66 - sin_poly_q12 = f67 - sin_poly_q3456 = f68 - fp_tmp2 = f69 - SIN_NORM_f8 = f70 - - -#ifdef _LIBC -.rodata -#else -.data -#endif +// predicate registers used: +// p6 -> p14 -.align 16 +// floating-point registers used +// f9 -> f15 +// f32 -> f61 -sin_coeff_1_table: -ASM_TYPE_DIRECTIVE(sin_coeff_1_table,@object) -data8 0xBF56C16C16BF6462 // q3 -data8 0x3EFA01A0128B9EBC // q4 -data8 0xBE927E42FDF33FFE // q5 -data8 0x3E21DA5C72A446F3 // q6 -data8 0x3EC71DD1D5E421A4 // p4 -data8 0xBE5AC5C9D0ACF95A // p5 -data8 0xBFC55555555554CA // p1 -data8 0x3F811111110F2395 // p2 -data8 0xBFE0000000000000 // q1 -data8 0x3FA55555555554EF // q2 -data8 0xBF2A01A011232913 // p3 -data8 0x0000000000000000 // pad - - -///////////////////////////////////////// - -data8 0xBFE1A54991426566 //sin(-32) -data8 0x3FEAB1F5305DE8E5 //cos(-32) -data8 0x3FD9DBC0B640FC81 //sin(-31) -data8 0x3FED4591C3E12A20 //cos(-31) -data8 0x3FEF9DF47F1C903D //sin(-30) -data8 0x3FC3BE82F2505A52 //cos(-30) -data8 0x3FE53C7D20A6C9E7 //sin(-29) -data8 0xBFE7F01658314E47 //cos(-29) -data8 0xBFD156853B4514D6 //sin(-28) -data8 0xBFEECDAAD1582500 //cos(-28) -data8 0xBFEE9AA1B0E5BA30 //sin(-27) -data8 0xBFD2B266F959DED5 //cos(-27) -data8 0xBFE866E0FAC32583 //sin(-26) -data8 0x3FE4B3902691A9ED //cos(-26) -data8 0x3FC0F0E6F31E809D //sin(-25) -data8 0x3FEFB7EEF59504FF //cos(-25) -data8 0x3FECFA7F7919140F //sin(-24) -data8 0x3FDB25BFB50A609A //cos(-24) -data8 0x3FEB143CD0247D02 //sin(-23) -data8 0xBFE10CF7D591F272 //cos(-23) -data8 0x3F8220A29F6EB9F4 //sin(-22) -data8 0xBFEFFFADD8D4ACDA //cos(-22) -data8 0xBFEAC5E20BB0D7ED //sin(-21) -data8 0xBFE186FF83773759 //cos(-21) -data8 0xBFED36D8F55D3CE0 //sin(-20) -data8 0x3FDA1E043964A83F //cos(-20) -data8 0xBFC32F2D28F584CF //sin(-19) -data8 0x3FEFA377DE108258 //cos(-19) -data8 0x3FE8081668131E26 //sin(-18) -data8 0x3FE52150815D2470 //cos(-18) -data8 0x3FEEC3C4AC42882B 
//sin(-17) -data8 0xBFD19C46B07F58E7 //cos(-17) -data8 0x3FD26D02085F20F8 //sin(-16) -data8 0xBFEEA5257E962F74 //cos(-16) -data8 0xBFE4CF2871CEC2E8 //sin(-15) -data8 0xBFE84F5D069CA4F3 //cos(-15) -data8 0xBFEFB30E327C5E45 //sin(-14) -data8 0x3FC1809AEC2CA0ED //cos(-14) -data8 0xBFDAE4044881C506 //sin(-13) -data8 0x3FED09CDD5260CB7 //cos(-13) -data8 0x3FE12B9AF7D765A5 //sin(-12) -data8 0x3FEB00DA046B65E3 //cos(-12) -data8 0x3FEFFFEB762E93EB //sin(-11) -data8 0x3F7220AE41EE2FDF //cos(-11) -data8 0x3FE1689EF5F34F52 //sin(-10) -data8 0xBFEAD9AC890C6B1F //cos(-10) -data8 0xBFDA6026360C2F91 //sin( -9) -data8 0xBFED27FAA6A6196B //cos( -9) -data8 0xBFEFA8D2A028CF7B //sin( -8) -data8 0xBFC29FBEBF632F94 //cos( -8) -data8 0xBFE50608C26D0A08 //sin( -7) -data8 0x3FE81FF79ED92017 //cos( -7) -data8 0x3FD1E1F18AB0A2C0 //sin( -6) -data8 0x3FEEB9B7097822F5 //cos( -6) -data8 0x3FEEAF81F5E09933 //sin( -5) -data8 0x3FD22785706B4AD9 //cos( -5) -data8 0x3FE837B9DDDC1EAE //sin( -4) -data8 0xBFE4EAA606DB24C1 //cos( -4) -data8 0xBFC210386DB6D55B //sin( -3) -data8 0xBFEFAE04BE85E5D2 //cos( -3) -data8 0xBFED18F6EAD1B446 //sin( -2) -data8 0xBFDAA22657537205 //cos( -2) -data8 0xBFEAED548F090CEE //sin( -1) -data8 0x3FE14A280FB5068C //cos( -1) -data8 0x0000000000000000 //sin( 0) -data8 0x3FF0000000000000 //cos( 0) -data8 0x3FEAED548F090CEE //sin( 1) -data8 0x3FE14A280FB5068C //cos( 1) -data8 0x3FED18F6EAD1B446 //sin( 2) -data8 0xBFDAA22657537205 //cos( 2) -data8 0x3FC210386DB6D55B //sin( 3) -data8 0xBFEFAE04BE85E5D2 //cos( 3) -data8 0xBFE837B9DDDC1EAE //sin( 4) -data8 0xBFE4EAA606DB24C1 //cos( 4) -data8 0xBFEEAF81F5E09933 //sin( 5) -data8 0x3FD22785706B4AD9 //cos( 5) -data8 0xBFD1E1F18AB0A2C0 //sin( 6) -data8 0x3FEEB9B7097822F5 //cos( 6) -data8 0x3FE50608C26D0A08 //sin( 7) -data8 0x3FE81FF79ED92017 //cos( 7) -data8 0x3FEFA8D2A028CF7B //sin( 8) -data8 0xBFC29FBEBF632F94 //cos( 8) -data8 0x3FDA6026360C2F91 //sin( 9) -data8 0xBFED27FAA6A6196B //cos( 9) -data8 0xBFE1689EF5F34F52 //sin( 10) -data8 
0xBFEAD9AC890C6B1F //cos( 10) -data8 0xBFEFFFEB762E93EB //sin( 11) -data8 0x3F7220AE41EE2FDF //cos( 11) -data8 0xBFE12B9AF7D765A5 //sin( 12) -data8 0x3FEB00DA046B65E3 //cos( 12) -data8 0x3FDAE4044881C506 //sin( 13) -data8 0x3FED09CDD5260CB7 //cos( 13) -data8 0x3FEFB30E327C5E45 //sin( 14) -data8 0x3FC1809AEC2CA0ED //cos( 14) -data8 0x3FE4CF2871CEC2E8 //sin( 15) -data8 0xBFE84F5D069CA4F3 //cos( 15) -data8 0xBFD26D02085F20F8 //sin( 16) -data8 0xBFEEA5257E962F74 //cos( 16) -data8 0xBFEEC3C4AC42882B //sin( 17) -data8 0xBFD19C46B07F58E7 //cos( 17) -data8 0xBFE8081668131E26 //sin( 18) -data8 0x3FE52150815D2470 //cos( 18) -data8 0x3FC32F2D28F584CF //sin( 19) -data8 0x3FEFA377DE108258 //cos( 19) -data8 0x3FED36D8F55D3CE0 //sin( 20) -data8 0x3FDA1E043964A83F //cos( 20) -data8 0x3FEAC5E20BB0D7ED //sin( 21) -data8 0xBFE186FF83773759 //cos( 21) -data8 0xBF8220A29F6EB9F4 //sin( 22) -data8 0xBFEFFFADD8D4ACDA //cos( 22) -data8 0xBFEB143CD0247D02 //sin( 23) -data8 0xBFE10CF7D591F272 //cos( 23) -data8 0xBFECFA7F7919140F //sin( 24) -data8 0x3FDB25BFB50A609A //cos( 24) -data8 0xBFC0F0E6F31E809D //sin( 25) -data8 0x3FEFB7EEF59504FF //cos( 25) -data8 0x3FE866E0FAC32583 //sin( 26) -data8 0x3FE4B3902691A9ED //cos( 26) -data8 0x3FEE9AA1B0E5BA30 //sin( 27) -data8 0xBFD2B266F959DED5 //cos( 27) -data8 0x3FD156853B4514D6 //sin( 28) -data8 0xBFEECDAAD1582500 //cos( 28) -data8 0xBFE53C7D20A6C9E7 //sin( 29) -data8 0xBFE7F01658314E47 //cos( 29) -data8 0xBFEF9DF47F1C903D //sin( 30) -data8 0x3FC3BE82F2505A52 //cos( 30) -data8 0xBFD9DBC0B640FC81 //sin( 31) -data8 0x3FED4591C3E12A20 //cos( 31) -data8 0x3FE1A54991426566 //sin( 32) -data8 0x3FEAB1F5305DE8E5 //cos( 32) -ASM_SIZE_DIRECTIVE(sin_coeff_1_table) - -////////////////////////////////////////// - - -.global sinf -.global cosf -#ifdef _LIBC -.global __sinf -.global __cosf -#endif - -.text -.proc cosf -#ifdef _LIBC -.proc __cosf -#endif -.align 32 - - -cosf: -#ifdef _LIBC -__cosf: -#endif -{ .mfi - alloc r32 = ar.pfs,1,7,0,0 - fcvt.fx.s1 sin_Mx = 
f8 - cmp.ne p6,p7 = r0,r0 // p7 set if cos -} -{ .mfi - addl SIN_AD_PQ_1 = @ltoff(sin_coeff_1_table),gp - fnorm.s0 SIN_NORM_f8 = f8 // Sets denormal or invalid - mov sin_GR_sincos_flag = 0x0 -} -;; +// Assembly macros +//============================================================== +sincosf_NORM_f8 = f9 +sincosf_W = f10 +sincosf_int_Nfloat = f11 +sincosf_Nfloat = f12 -{ .mfi - ld8 SIN_AD_PQ_1 = [SIN_AD_PQ_1] - fclass.m.unc p9,p0 = f8, 0x07 - cmp.ne p8,p0 = r0,r0 -} -{ .mfb - nop.m 999 - nop.f 999 - br.sptk L(SINCOSF_COMMON) -} -;; +sincosf_r = f13 +sincosf_rsq = f14 +sincosf_rcub = f15 +sincosf_save_tmp = f15 -.endp cosf -ASM_SIZE_DIRECTIVE(cosf) +sincosf_Inv_Pi_by_16 = f32 +sincosf_Pi_by_16_1 = f33 +sincosf_Pi_by_16_2 = f34 +sincosf_Inv_Pi_by_64 = f35 -.text -.proc sinf -#ifdef _LIBC -.proc __sinf -#endif -.align 32 +sincosf_Pi_by_16_3 = f36 -sinf: -#ifdef _LIBC -__sinf: -#endif -{ .mfi - alloc r32 = ar.pfs,1,7,0,0 - fcvt.fx.s1 sin_Mx = f8 - cmp.eq p6,p7 = r0,r0 // p6 set if sin -} -{ .mfi - addl SIN_AD_PQ_1 = @ltoff(sin_coeff_1_table),gp - fnorm.s0 SIN_NORM_f8 = f8 // Sets denormal or invalid - mov sin_GR_sincos_flag = 0x1 -} -;; +sincosf_r_exact = f37 -{ .mfi - ld8 SIN_AD_PQ_1 = [SIN_AD_PQ_1] - fclass.m.unc p8,p0 = f8, 0x07 - cmp.ne p9,p0 = r0,r0 -} -{ .mfb - nop.m 999 - nop.f 999 - br.sptk L(SINCOSF_COMMON) -} -;; +sincosf_Sm = f38 +sincosf_Cm = f39 +sincosf_P1 = f40 +sincosf_Q1 = f41 +sincosf_P2 = f42 +sincosf_Q2 = f43 +sincosf_P3 = f44 +sincosf_Q3 = f45 +sincosf_P4 = f46 +sincosf_Q4 = f47 -L(SINCOSF_COMMON): +sincosf_P_temp1 = f48 +sincosf_P_temp2 = f49 -// Here with p6 if sin, p7 if cos, p8 if sin(0), p9 if cos(0) +sincosf_Q_temp1 = f50 +sincosf_Q_temp2 = f51 +sincosf_P = f52 +sincosf_Q = f53 -{ .mmf - ldfpd sin_coeff_Q3, sin_coeff_Q4 = [SIN_AD_PQ_1], 16 - nop.m 999 - fclass.m.unc p11,p0 = f8, 0x23 // Test for x=inf -} -;; +sincosf_srsq = f54 -{ .mfb - ldfpd sin_coeff_Q5, sin_coeff_Q6 = [SIN_AD_PQ_1], 16 - fclass.m.unc p10,p0 = f8, 0xc3 // Test for x=nan 
-(p8) br.ret.spnt b0 // Exit for sin(0) -} -{ .mfb - nop.m 999 -(p9) fma.s f8 = f1,f1,f0 -(p9) br.ret.spnt b0 // Exit for cos(0) -} -;; +sincosf_SIG_INV_PI_BY_16_2TO61 = f55 +sincosf_RSHF_2TO61 = f56 +sincosf_RSHF = f57 +sincosf_2TOM61 = f58 +sincosf_NFLOAT = f59 +sincosf_W_2TO61_RSH = f60 -{ .mmf - ldfpd sin_coeff_P4, sin_coeff_P5 = [SIN_AD_PQ_1], 16 - addl gr_tmp = -1,r0 - fcvt.xf sin_Mfloat = sin_Mx -} -;; +fp_tmp = f61 -{ .mfi - getf.sig sin_GR_Mint = sin_Mx -(p11) frcpa.s0 f8,p13 = f0,f0 // qnan indef if x=inf - nop.i 999 -} -{ .mfb - ldfpd sin_coeff_P1, sin_coeff_P2 = [SIN_AD_PQ_1], 16 - nop.f 999 -(p11) br.ret.spnt b0 // Exit for x=inf -} -;; +///////////////////////////////////////////////////////////// -{ .mfi - ldfpd sin_coeff_Q1, sin_coeff_Q2 = [SIN_AD_PQ_1], 16 - nop.f 999 - cmp.ge p8,p9 = -33,sin_GR_Mint -} -{ .mfb - add sin_GR_index = 32,sin_GR_Mint -(p10) fma.s f8 = f8,f1,f0 // Force qnan if x=nan -(p10) br.ret.spnt b0 // Exit for x=nan -} -;; +sincosf_AD_1 = r33 +sincosf_AD_2 = r34 +sincosf_exp_limit = r35 +sincosf_r_signexp = r36 +sincosf_AD_beta_table = r37 +sincosf_r_sincos = r38 -{ .mmi - ldfd sin_coeff_P3 = [SIN_AD_PQ_1], 16 -(p9) cmp.le p8,p0 = 33, sin_GR_Mint - shl sin_GR_index = sin_GR_index,4 -} -;; +sincosf_r_exp = r39 +sincosf_r_17_ones = r40 +sincosf_GR_sig_inv_pi_by_16 = r14 +sincosf_GR_rshf_2to61 = r15 +sincosf_GR_rshf = r16 +sincosf_GR_exp_2tom61 = r17 +sincosf_GR_n = r18 +sincosf_GR_m = r19 +sincosf_GR_32m = r19 +sincosf_GR_all_ones = r19 -{ .mfi - setf.sig fp_tmp = gr_tmp // Create constant such that fmpy sets inexact - fnma.s1 sin_r = f1,sin_Mfloat,SIN_NORM_f8 -(p8) cmp.eq.unc p11,p12=sin_GR_sincos_flag,r0 // p11 if must call dbl cos - // p12 if must call dbl sin -} -{ .mbb - add SIN_AD_PQ_2 = sin_GR_index,SIN_AD_PQ_1 -(p11) br.cond.spnt COS_DOUBLE -(p12) br.cond.spnt SIN_DOUBLE -} -;; +gr_tmp = r41 +GR_SAVE_PFS = r41 +GR_SAVE_B0 = r42 +GR_SAVE_GP = r43 -.pred.rel "mutex",p6,p7 //SIN_Sin_Flag, SIN_Cos_Flag -{ .mmi -(p6) ldfpd 
sin_tbl_S,sin_tbl_C = [SIN_AD_PQ_2] -(p7) ldfpd sin_tbl_C,sin_tbl_S = [SIN_AD_PQ_2] - nop.i 999 -} -;; +RODATA +.align 16 -{ .mfi - nop.m 999 -(p6) fclass.m.unc p8,p0 = f8, 0x0b // If sin, note denormal input to set uflow - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 sin_t = sin_r,sin_r,f0 - nop.i 999 -} -;; +// Pi/16 parts +LOCAL_OBJECT_START(double_sincosf_pi) + data8 0xC90FDAA22168C234, 0x00003FFC // pi/16 1st part + data8 0xC4C6628B80DC1CD1, 0x00003FBC // pi/16 2nd part +LOCAL_OBJECT_END(double_sincosf_pi) + +// Coefficients for polynomials +LOCAL_OBJECT_START(double_sincosf_pq_k4) + data8 0x3F810FABB668E9A2 // P2 + data8 0x3FA552E3D6DE75C9 // Q2 + data8 0xBFC555554447BC7F // P1 + data8 0xBFDFFFFFC447610A // Q1 +LOCAL_OBJECT_END(double_sincosf_pq_k4) + +// Sincos table (S[m], C[m]) +LOCAL_OBJECT_START(double_sin_cos_beta_k4) + data8 0x0000000000000000 // sin ( 0 Pi / 16 ) + data8 0x3FF0000000000000 // cos ( 0 Pi / 16 ) +// + data8 0x3FC8F8B83C69A60B // sin ( 1 Pi / 16 ) + data8 0x3FEF6297CFF75CB0 // cos ( 1 Pi / 16 ) +// + data8 0x3FD87DE2A6AEA963 // sin ( 2 Pi / 16 ) + data8 0x3FED906BCF328D46 // cos ( 2 Pi / 16 ) +// + data8 0x3FE1C73B39AE68C8 // sin ( 3 Pi / 16 ) + data8 0x3FEA9B66290EA1A3 // cos ( 3 Pi / 16 ) +// + data8 0x3FE6A09E667F3BCD // sin ( 4 Pi / 16 ) + data8 0x3FE6A09E667F3BCD // cos ( 4 Pi / 16 ) +// + data8 0x3FEA9B66290EA1A3 // sin ( 5 Pi / 16 ) + data8 0x3FE1C73B39AE68C8 // cos ( 5 Pi / 16 ) +// + data8 0x3FED906BCF328D46 // sin ( 6 Pi / 16 ) + data8 0x3FD87DE2A6AEA963 // cos ( 6 Pi / 16 ) +// + data8 0x3FEF6297CFF75CB0 // sin ( 7 Pi / 16 ) + data8 0x3FC8F8B83C69A60B // cos ( 7 Pi / 16 ) +// + data8 0x3FF0000000000000 // sin ( 8 Pi / 16 ) + data8 0x0000000000000000 // cos ( 8 Pi / 16 ) +// + data8 0x3FEF6297CFF75CB0 // sin ( 9 Pi / 16 ) + data8 0xBFC8F8B83C69A60B // cos ( 9 Pi / 16 ) +// + data8 0x3FED906BCF328D46 // sin ( 10 Pi / 16 ) + data8 0xBFD87DE2A6AEA963 // cos ( 10 Pi / 16 ) +// + data8 0x3FEA9B66290EA1A3 // sin ( 11 Pi / 16 ) + data8 
0xBFE1C73B39AE68C8 // cos ( 11 Pi / 16 ) +// + data8 0x3FE6A09E667F3BCD // sin ( 12 Pi / 16 ) + data8 0xBFE6A09E667F3BCD // cos ( 12 Pi / 16 ) +// + data8 0x3FE1C73B39AE68C8 // sin ( 13 Pi / 16 ) + data8 0xBFEA9B66290EA1A3 // cos ( 13 Pi / 16 ) +// + data8 0x3FD87DE2A6AEA963 // sin ( 14 Pi / 16 ) + data8 0xBFED906BCF328D46 // cos ( 14 Pi / 16 ) +// + data8 0x3FC8F8B83C69A60B // sin ( 15 Pi / 16 ) + data8 0xBFEF6297CFF75CB0 // cos ( 15 Pi / 16 ) +// + data8 0x0000000000000000 // sin ( 16 Pi / 16 ) + data8 0xBFF0000000000000 // cos ( 16 Pi / 16 ) +// + data8 0xBFC8F8B83C69A60B // sin ( 17 Pi / 16 ) + data8 0xBFEF6297CFF75CB0 // cos ( 17 Pi / 16 ) +// + data8 0xBFD87DE2A6AEA963 // sin ( 18 Pi / 16 ) + data8 0xBFED906BCF328D46 // cos ( 18 Pi / 16 ) +// + data8 0xBFE1C73B39AE68C8 // sin ( 19 Pi / 16 ) + data8 0xBFEA9B66290EA1A3 // cos ( 19 Pi / 16 ) +// + data8 0xBFE6A09E667F3BCD // sin ( 20 Pi / 16 ) + data8 0xBFE6A09E667F3BCD // cos ( 20 Pi / 16 ) +// + data8 0xBFEA9B66290EA1A3 // sin ( 21 Pi / 16 ) + data8 0xBFE1C73B39AE68C8 // cos ( 21 Pi / 16 ) +// + data8 0xBFED906BCF328D46 // sin ( 22 Pi / 16 ) + data8 0xBFD87DE2A6AEA963 // cos ( 22 Pi / 16 ) +// + data8 0xBFEF6297CFF75CB0 // sin ( 23 Pi / 16 ) + data8 0xBFC8F8B83C69A60B // cos ( 23 Pi / 16 ) +// + data8 0xBFF0000000000000 // sin ( 24 Pi / 16 ) + data8 0x0000000000000000 // cos ( 24 Pi / 16 ) +// + data8 0xBFEF6297CFF75CB0 // sin ( 25 Pi / 16 ) + data8 0x3FC8F8B83C69A60B // cos ( 25 Pi / 16 ) +// + data8 0xBFED906BCF328D46 // sin ( 26 Pi / 16 ) + data8 0x3FD87DE2A6AEA963 // cos ( 26 Pi / 16 ) +// + data8 0xBFEA9B66290EA1A3 // sin ( 27 Pi / 16 ) + data8 0x3FE1C73B39AE68C8 // cos ( 27 Pi / 16 ) +// + data8 0xBFE6A09E667F3BCD // sin ( 28 Pi / 16 ) + data8 0x3FE6A09E667F3BCD // cos ( 28 Pi / 16 ) +// + data8 0xBFE1C73B39AE68C8 // sin ( 29 Pi / 16 ) + data8 0x3FEA9B66290EA1A3 // cos ( 29 Pi / 16 ) +// + data8 0xBFD87DE2A6AEA963 // sin ( 30 Pi / 16 ) + data8 0x3FED906BCF328D46 // cos ( 30 Pi / 16 ) +// + data8 
0xBFC8F8B83C69A60B // sin ( 31 Pi / 16 ) + data8 0x3FEF6297CFF75CB0 // cos ( 31 Pi / 16 ) +// + data8 0x0000000000000000 // sin ( 32 Pi / 16 ) + data8 0x3FF0000000000000 // cos ( 32 Pi / 16 ) +LOCAL_OBJECT_END(double_sin_cos_beta_k4) -{ .mfi - nop.m 999 - fma.s1 sin_rcube = sin_t,sin_r,f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 sin_tsq = sin_t,sin_t,f0 - nop.i 999 -} -;; +.section .text -{ .mfi - nop.m 999 - fma.s1 sin_poly_q3 = sin_t,sin_coeff_Q4,sin_coeff_Q3 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 sin_poly_q5 = sin_t,sin_coeff_Q6,sin_coeff_Q5 - nop.i 999 -} -;; +//////////////////////////////////////////////////////// +// There are two entry points: sin and cos +// If from sin, p8 is true +// If from cos, p9 is true -{ .mfi - nop.m 999 - fma.s1 sin_poly_p1 = sin_t,sin_coeff_P5,sin_coeff_P4 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 sin_poly_p2 = sin_t,sin_coeff_P2,sin_coeff_P1 - nop.i 999 -} -;; +GLOBAL_IEEE754_ENTRY(sinf) -{ .mfi - nop.m 999 - fma.s1 sin_poly_q1 = sin_t,sin_coeff_Q2,sin_coeff_Q1 - nop.i 999 +{ .mlx + alloc r32 = ar.pfs,1,13,0,0 + movl sincosf_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A //signd of 16/pi } -{ .mfi - nop.m 999 - fma.s1 sin_S_t = sin_t,sin_tbl_S,f0 - nop.i 999 -} -;; +{ .mlx + addl sincosf_AD_1 = @ltoff(double_sincosf_pi), gp + movl sincosf_GR_rshf_2to61 = 0x47b8000000000000 // 1.1 2^(63+63-2) +};; -{ .mfi - nop.m 999 -(p8) fmpy.s.s0 fp_tmp2 = f8,f8 // Dummy mult to set underflow if sin(denormal) - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 sin_r7 = sin_rcube,sin_tsq,f0 - nop.i 999 +{ .mfi + ld8 sincosf_AD_1 = [sincosf_AD_1] + fnorm.s1 sincosf_NORM_f8 = f8 // Normalize argument + cmp.eq p8,p9 = r0, r0 // set p8 (clear p9) for sin } -;; +{ .mib + mov sincosf_GR_exp_2tom61 = 0xffff-61 // exponent of scale 2^-61 + mov sincosf_r_sincos = 0x0 // 0 for sin + br.cond.sptk _SINCOSF_COMMON // go to common part +};; -{ .mfi - nop.m 999 - fma.s1 sin_poly_q3456 = sin_tsq,sin_poly_q5,sin_poly_q3 - nop.i 999 -} -;; +GLOBAL_IEEE754_END(sinf) 
+GLOBAL_IEEE754_ENTRY(cosf) -{ .mfi - nop.m 999 - fma.s1 sin_poly_p3 = sin_t,sin_poly_p1,sin_coeff_P3 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 sin_poly_p4 = sin_rcube,sin_poly_p2,sin_r - nop.i 999 +{ .mlx + alloc r32 = ar.pfs,1,13,0,0 + movl sincosf_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A //signd of 16/pi } -;; +{ .mlx + addl sincosf_AD_1 = @ltoff(double_sincosf_pi), gp + movl sincosf_GR_rshf_2to61 = 0x47b8000000000000 // 1.1 2^(63+63-2) +};; -{ .mfi - nop.m 999 - fma.s1 sin_tbl_S_tcube = sin_S_t,sin_tsq,f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 sin_poly_q12 = sin_S_t,sin_poly_q1,sin_tbl_S - nop.i 999 +{ .mfi + ld8 sincosf_AD_1 = [sincosf_AD_1] + fnorm.s1 sincosf_NORM_f8 = f8 // Normalize argument + cmp.eq p9,p8 = r0, r0 // set p9 (clear p8) for cos } -;; +{ .mib + mov sincosf_GR_exp_2tom61 = 0xffff-61 // exponent of scale 2^-61 + mov sincosf_r_sincos = 0x8 // 8 for cos + nop.b 999 +};; + +//////////////////////////////////////////////////////// +// All entry points end up here. +// If from sin, sincosf_r_sincos is 0 and p8 is true +// If from cos, sincosf_r_sincos is 8 = 2^(k-1) and p9 is true +// We add sincosf_r_sincos to N + +///////////// Common sin and cos part ////////////////// +_SINCOSF_COMMON: + +// Form two constants we need +// 16/pi * 2^-2 * 2^63, scaled by 2^61 since we just loaded the significand +// 1.1000...000 * 2^(63+63-2) to right shift int(W) into the low significand +// fcmp used to set denormal, and invalid on snans +{ .mfi + setf.sig sincosf_SIG_INV_PI_BY_16_2TO61 = sincosf_GR_sig_inv_pi_by_16 + fclass.m p6,p0 = f8, 0xe7 // if x=0,inf,nan + mov sincosf_exp_limit = 0x10017 +} +{ .mlx + setf.d sincosf_RSHF_2TO61 = sincosf_GR_rshf_2to61 + movl sincosf_GR_rshf = 0x43e8000000000000 // 1.1000 2^63 +};; // Right shift + +// Form another constant +// 2^-61 for scaling Nfloat +// 0x10017 is register_bias + 24. 
+// So if f8 >= 2^24, go to large argument routines +{ .mmi + getf.exp sincosf_r_signexp = f8 + setf.exp sincosf_2TOM61 = sincosf_GR_exp_2tom61 + addl gr_tmp = -1,r0 // For "inexact" constant creation +};; + +// Load the two pieces of pi/16 +// Form another constant +// 1.1000...000 * 2^63, the right shift constant +{ .mmb + ldfe sincosf_Pi_by_16_1 = [sincosf_AD_1],16 + setf.d sincosf_RSHF = sincosf_GR_rshf +(p6) br.cond.spnt _SINCOSF_SPECIAL_ARGS +};; -{ .mfi - nop.m 999 - fma.d.s1 sin_of_r = sin_r7,sin_poly_p3,sin_poly_p4 - nop.i 999 -} -;; +// Getting argument's exp for "large arguments" filtering +{ .mmi + ldfe sincosf_Pi_by_16_2 = [sincosf_AD_1],16 + setf.sig fp_tmp = gr_tmp // constant for inexact set + nop.i 999 +};; -{ .mfi - nop.m 999 - fma.d.s1 sin_tbl_S_cos_of_r = sin_tbl_S_tcube,sin_poly_q3456,sin_poly_q12 - nop.i 999 -} -{ .mfi - nop.m 999 - fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact - nop.i 999 -} -;; +// Polynomial coefficients (Q2, Q1, P2, P1) loading +{ .mmi + ldfpd sincosf_P2,sincosf_Q2 = [sincosf_AD_1],16 + nop.m 999 + nop.i 999 +};; +// Select exponent (17 lsb) +{ .mmi + ldfpd sincosf_P1,sincosf_Q1 = [sincosf_AD_1],16 + nop.m 999 + dep.z sincosf_r_exp = sincosf_r_signexp, 0, 17 +};; -.pred.rel "mutex",p6,p7 //SIN_Sin_Flag, SIN_Cos_Flag -{ .mfi - nop.m 999 -//(SIN_Sin_Flag) fma.s f8 = sin_tbl_C,sin_of_r,sin_tbl_S_cos_of_r -(p6) fma.s f8 = sin_tbl_C,sin_of_r,sin_tbl_S_cos_of_r - nop.i 999 -} -{ .mfb - nop.m 999 -//(SIN_Cos_Flag) fnma.s f8 = sin_tbl_C,sin_of_r,sin_tbl_S_cos_of_r -(p7) fnma.s f8 = sin_tbl_C,sin_of_r,sin_tbl_S_cos_of_r - br.ret.sptk b0 -} +// p10 is true if we must call routines to handle larger arguments +// p10 is true if f8 exp is >= 0x10017 (2^24) +{ .mfb + cmp.ge p10,p0 = sincosf_r_exp,sincosf_exp_limit + nop.f 999 +(p10) br.cond.spnt _SINCOSF_LARGE_ARGS // Go to "large args" routine +};; + +// sincosf_W = x * sincosf_Inv_Pi_by_16 +// Multiply x by scaled 16/pi and add large const to shift integer part of W to 
+// rightmost bits of significand +{ .mfi + nop.m 999 + fma.s1 sincosf_W_2TO61_RSH = sincosf_NORM_f8, sincosf_SIG_INV_PI_BY_16_2TO61, sincosf_RSHF_2TO61 + nop.i 999 +};; -.endp sinf -ASM_SIZE_DIRECTIVE(sinf) +// sincosf_NFLOAT = Round_Int_Nearest(sincosf_W) +// This is done by scaling back by 2^-61 and subtracting the shift constant +{ .mfi + nop.m 999 + fms.s1 sincosf_NFLOAT = sincosf_W_2TO61_RSH,sincosf_2TOM61,sincosf_RSHF + nop.i 999 +};; +// get N = (int)sincosf_int_Nfloat +{ .mfi + getf.sig sincosf_GR_n = sincosf_W_2TO61_RSH // integer N value + nop.f 999 + nop.i 999 +};; -.proc SIN_DOUBLE -SIN_DOUBLE: -.prologue +// Add 2^(k-1) (which is in sincosf_r_sincos=8) to N +// sincosf_r = -sincosf_Nfloat * sincosf_Pi_by_16_1 + x { .mfi - nop.m 0 - nop.f 0 -.save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs -} -;; + add sincosf_GR_n = sincosf_GR_n, sincosf_r_sincos + fnma.s1 sincosf_r = sincosf_NFLOAT, sincosf_Pi_by_16_1, sincosf_NORM_f8 + nop.i 999 +};; +// Get M (least k+1 bits of N) +{ .mmi + and sincosf_GR_m = 0x1f,sincosf_GR_n // Put mask 0x1F - + nop.m 999 // - select k+1 bits + nop.i 999 +};; + +// Add 16*M to address of sin_cos_beta table { .mfi - mov GR_SAVE_GP=gp - nop.f 0 -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 -} + shladd sincosf_AD_2 = sincosf_GR_32m, 4, sincosf_AD_1 +(p8) fclass.m.unc p10,p0 = f8,0x0b // If sin denormal input - + nop.i 999 +};; -.body -{ .mmb - nop.m 999 - nop.m 999 - br.call.sptk.many b0=sin +// Load Sin and Cos table value using obtained index m (sincosf_AD_2) +{ .mfi + ldfd sincosf_Sm = [sincosf_AD_2],8 // Sin value S[m] +(p9) fclass.m.unc p11,p0 = f8,0x0b // If cos denormal input - + nop.i 999 // - set denormal +};; + +// sincosf_r = sincosf_r -sincosf_Nfloat * sincosf_Pi_by_16_2 +{ .mfi + ldfd sincosf_Cm = [sincosf_AD_2] // Cos table value C[m] + fnma.s1 sincosf_r_exact = sincosf_NFLOAT, sincosf_Pi_by_16_2, sincosf_r + nop.i 999 } -;; +// get rsq = r*r +{ .mfi + nop.m 999 + fma.s1 sincosf_rsq = sincosf_r, sincosf_r, f0 // r^2 = r*r 
+ nop.i 999 +};; { .mfi - mov gp = GR_SAVE_GP - nop.f 999 - mov b0 = GR_SAVE_B0 + nop.m 999 + fmpy.s0 fp_tmp = fp_tmp, fp_tmp // forces inexact flag + nop.i 999 +};; + +// Polynomials calculation +// Q = Q2*r^2 + Q1 +// P = P2*r^2 + P1 +{ .mfi + nop.m 999 + fma.s1 sincosf_Q = sincosf_rsq, sincosf_Q2, sincosf_Q1 + nop.i 999 } -;; +{ .mfi + nop.m 999 + fma.s1 sincosf_P = sincosf_rsq, sincosf_P2, sincosf_P1 + nop.i 999 +};; +// get rcube and S[m]*r^2 { .mfi - nop.m 999 - fma.s f8 = f8,f1,f0 -(p0) mov ar.pfs = GR_SAVE_PFS + nop.m 999 + fmpy.s1 sincosf_srsq = sincosf_Sm,sincosf_rsq // r^2*S[m] + nop.i 999 } -{ .mib - nop.m 999 - nop.i 999 -(p0) br.ret.sptk b0 +{ .mfi + nop.m 999 + fmpy.s1 sincosf_rcub = sincosf_r_exact, sincosf_rsq + nop.i 999 +};; + +// Get final P and Q +// Q = Q*S[m]*r^2 + S[m] +// P = P*r^3 + r +{ .mfi + nop.m 999 + fma.s1 sincosf_Q = sincosf_srsq,sincosf_Q, sincosf_Sm + nop.i 999 } -;; +{ .mfi + nop.m 999 + fma.s1 sincosf_P = sincosf_rcub,sincosf_P,sincosf_r_exact + nop.i 999 +};; -.endp SIN_DOUBLE -ASM_SIZE_DIRECTIVE(SIN_DOUBLE) +// If sinf(denormal) - force underflow to be set +.pred.rel "mutex",p10,p11 +{ .mfi + nop.m 999 +(p10) fmpy.s.s0 fp_tmp = f8,f8 // forces underflow flag + nop.i 999 // for denormal sine args +} +// If cosf(denormal) - force denormal to be set +{ .mfi + nop.m 999 +(p11) fma.s.s0 fp_tmp = f8, f1, f8 // forces denormal flag + nop.i 999 // for denormal cosine args +};; -.proc COS_DOUBLE -COS_DOUBLE: +// Final calculation +// result = C[m]*P + Q +{ .mfb + nop.m 999 + fma.s.s0 f8 = sincosf_Cm, sincosf_P, sincosf_Q + br.ret.sptk b0 // Exit for common path +};; + +////////// x = 0/Inf/NaN path ////////////////// +_SINCOSF_SPECIAL_ARGS: +.pred.rel "mutex",p8,p9 +// sinf(+/-0) = +/-0 +// sinf(Inf) = NaN +// sinf(NaN) = NaN +{ .mfi + nop.m 999 +(p8) fma.s.s0 f8 = f8, f0, f0 // sinf(+/-0,NaN,Inf) + nop.i 999 +} +// cosf(+/-0) = 1.0 +// cosf(Inf) = NaN +// cosf(NaN) = NaN +{ .mfb + nop.m 999 +(p9) fma.s.s0 f8 = f8, f0, f1 // 
cosf(+/-0,NaN,Inf) + br.ret.sptk b0 // Exit for x = 0/Inf/NaN path +};; + +GLOBAL_IEEE754_END(cosf) +//////////// x >= 2^24 - large arguments routine call //////////// +LOCAL_LIBM_ENTRY(__libm_callout_sincosf) +_SINCOSF_LARGE_ARGS: .prologue { .mfi - nop.m 0 - nop.f 0 -.save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs + mov sincosf_GR_all_ones = -1 // 0xffffffff + nop.f 999 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS = ar.pfs } ;; { .mfi - mov GR_SAVE_GP=gp - nop.f 0 -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 + mov GR_SAVE_GP = gp + nop.f 999 +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0 = b0 } - .body -{ .mmb - nop.m 999 - nop.m 999 - br.call.sptk.many b0=cos -} -;; -{ .mfi - mov gp = GR_SAVE_GP - nop.f 999 - mov b0 = GR_SAVE_B0 -} -;; +{ .mbb + setf.sig sincosf_save_tmp = sincosf_GR_all_ones // inexact set + nop.b 999 +(p8) br.call.sptk.many b0 = __libm_sin_large# // sinf(large_X) +};; + +{ .mbb + cmp.ne p9,p0 = sincosf_r_sincos, r0 // set p9 if cos + nop.b 999 +(p9) br.call.sptk.many b0 = __libm_cos_large# // cosf(large_X) +};; { .mfi - nop.m 999 - fma.s f8 = f8,f1,f0 -(p0) mov ar.pfs = GR_SAVE_PFS -} -{ .mib - nop.m 999 - nop.i 999 -(p0) br.ret.sptk b0 + mov gp = GR_SAVE_GP + fma.s.s0 f8 = f8, f1, f0 // Round result to single + mov b0 = GR_SAVE_B0 } -;; - -.endp COS_DOUBLE -ASM_SIZE_DIRECTIVE(COS_DOUBLE) +{ .mfi // force inexact set + nop.m 999 + fmpy.s0 sincosf_save_tmp = sincosf_save_tmp, sincosf_save_tmp + nop.i 999 +};; +{ .mib + nop.m 999 + mov ar.pfs = GR_SAVE_PFS + br.ret.sptk b0 // Exit for large arguments routine call +};; +LOCAL_LIBM_END(__libm_callout_sincosf) +.type __libm_sin_large#, @function +.global __libm_sin_large# +.type __libm_cos_large#, @function +.global __libm_cos_large# -.type sin,@function -.global sin -.type cos,@function -.global cos diff --git a/sysdeps/ia64/fpu/s_cosl.S b/sysdeps/ia64/fpu/s_cosl.S index 2755580c0d..374e822256 100644 --- a/sysdeps/ia64/fpu/s_cosl.S +++ b/sysdeps/ia64/fpu/s_cosl.S @@ -1,10 +1,10 @@ .file "sincosl.s" -// 
Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,76 +20,81 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // -// ********************************************************************* +//********************************************************************* // -// History: -// 2/02/2000 (hand-optimized) -// 4/04/00 Unwind support added +// History: +// 02/02/00 (hand-optimized) +// 04/04/00 Unwind support added +// 07/30/01 Improved speed on all paths +// 08/20/01 Fixed bundling typo +// 05/13/02 Changed interface to __libm_pi_by_2_reduce +// 02/10/03 Reordered header: .section, .global, .proc, .align; +// used data8 for long double table values // -// ********************************************************************* +//********************************************************************* // // Function: Combined sinl(x) and cosl(x), where // // sinl(x) = sine(x), for double-extended precision x values // cosl(x) = cosine(x), for double-extended precision x values // -// ********************************************************************* +//********************************************************************* // // Resources Used: // -// Floating-Point Registers: f8 (Input and Return Value) +// Floating-Point Registers: f8 (Input and Return Value) // f32-f99 // // General Purpose Registers: -// r32-r43 +// r32-r43 // r44-r45 (Used to pass arguments to pi_by_2 reduce routine) // // Predicate Registers: p6-p13 // -// ********************************************************************* +//********************************************************************* // // IEEE Special Conditions: // // Denormal 
fault raised on denormal inputs // Overflow exceptions do not occur -// Underflow exceptions raised when appropriate for sin +// Underflow exceptions raised when appropriate for sin // (No specialized error handling for this routine) // Inexact raised when appropriate by algorithm // // sinl(SNaN) = QNaN // sinl(QNaN) = QNaN -// sinl(inf) = QNaN +// sinl(inf) = QNaN // sinl(+/-0) = +/-0 -// cosl(inf) = QNaN +// cosl(inf) = QNaN // cosl(SNaN) = QNaN // cosl(QNaN) = QNaN // cosl(0) = 1 -// -// ********************************************************************* +// +//********************************************************************* // // Mathematical Description // ======================== // -// The computation of FSIN and FCOS is best handled in one piece of -// code. The main reason is that given any argument Arg, computation -// of trigonometric functions first calculate N and an approximation +// The computation of FSIN and FCOS is best handled in one piece of +// code. The main reason is that given any argument Arg, computation +// of trigonometric functions first calculate N and an approximation // to alpha where // // Arg = N pi/2 + alpha, |alpha| <= pi/4. @@ -98,62 +103,62 @@ // // cosl( Arg ) = sinl( (N+1) pi/2 + alpha ), // -// therefore, the code for computing sine will produce cosine as long -// as 1 is added to N immediately after the argument reduction +// therefore, the code for computing sine will produce cosine as long +// as 1 is added to N immediately after the argument reduction // process. // // Let M = N if sine -// N+1 if cosine. +// N+1 if cosine. // // Now, given // // Arg = M pi/2 + alpha, |alpha| <= pi/4, // -// let I = M mod 4, or I be the two lsb of M when M is represented +// let I = M mod 4, or I be the two lsb of M when M is represented // as 2's complement. I = [i_0 i_1]. Then // -// sinl( Arg ) = (-1)^i_0 sinl( alpha ) if i_1 = 0, +// sinl( Arg ) = (-1)^i_0 sinl( alpha ) if i_1 = 0, // = (-1)^i_0 cosl( alpha ) if i_1 = 1. 
// // For example: -// if M = -1, I = 11 +// if M = -1, I = 11 // sin ((-pi/2 + alpha) = (-1) cos (alpha) -// if M = 0, I = 00 +// if M = 0, I = 00 // sin (alpha) = sin (alpha) -// if M = 1, I = 01 +// if M = 1, I = 01 // sin (pi/2 + alpha) = cos (alpha) -// if M = 2, I = 10 +// if M = 2, I = 10 // sin (pi + alpha) = (-1) sin (alpha) -// if M = 3, I = 11 +// if M = 3, I = 11 // sin ((3/2)pi + alpha) = (-1) cos (alpha) // -// The value of alpha is obtained by argument reduction and +// The value of alpha is obtained by argument reduction and // represented by two working precision numbers r and c where // // alpha = r + c accurately. // // The reduction method is described in a previous write up. -// The argument reduction scheme identifies 4 cases. For Cases 2 -// and 4, because |alpha| is small, sinl(r+c) and cosl(r+c) can be -// computed very easily by 2 or 3 terms of the Taylor series +// The argument reduction scheme identifies 4 cases. For Cases 2 +// and 4, because |alpha| is small, sinl(r+c) and cosl(r+c) can be +// computed very easily by 2 or 3 terms of the Taylor series // expansion as follows: // // Case 2: // ------- // -// sinl(r + c) = r + c - r^3/6 accurately -// cosl(r + c) = 1 - 2^(-67) accurately +// sinl(r + c) = r + c - r^3/6 accurately +// cosl(r + c) = 1 - 2^(-67) accurately // // Case 4: // ------- // -// sinl(r + c) = r + c - r^3/6 + r^5/120 accurately -// cosl(r + c) = 1 - r^2/2 + r^4/24 accurately +// sinl(r + c) = r + c - r^3/6 + r^5/120 accurately +// cosl(r + c) = 1 - r^2/2 + r^4/24 accurately // -// The only cases left are Cases 1 and 3 of the argument reduction -// procedure. These two cases will be merged since after the -// argument is reduced in either cases, we have the reduced argument -// represented as r + c and that the magnitude |r + c| is not small +// The only cases left are Cases 1 and 3 of the argument reduction +// procedure. 
These two cases will be merged since after the +// argument is reduced in either cases, we have the reduced argument +// represented as r + c and that the magnitude |r + c| is not small // enough to allow the usage of a very short approximation. // // The required calculation is either @@ -163,32 +168,32 @@ // // Specifically, // -// sinl(r + c) = sinl(r) + c sin'(r) + O(c^2) -// = sinl(r) + c cos (r) + O(c^2) -// = sinl(r) + c(1 - r^2/2) accurately. +// sinl(r + c) = sinl(r) + c sin'(r) + O(c^2) +// = sinl(r) + c cos (r) + O(c^2) +// = sinl(r) + c(1 - r^2/2) accurately. // Similarly, // -// cosl(r + c) = cosl(r) - c sinl(r) + O(c^2) -// = cosl(r) - c(r - r^3/6) accurately. +// cosl(r + c) = cosl(r) - c sinl(r) + O(c^2) +// = cosl(r) - c(r - r^3/6) accurately. // -// We therefore concentrate on accurately calculating sinl(r) and +// We therefore concentrate on accurately calculating sinl(r) and // cosl(r) for a working-precision number r, |r| <= pi/4 to within // 0.1% or so. // -// The greatest challenge of this task is that the second terms of +// The greatest challenge of this task is that the second terms of // the Taylor series -// -// r - r^3/3! + r^r/5! - ... +// +// r - r^3/3! + r^r/5! - ... // // and // -// 1 - r^2/2! + r^4/4! - ... +// 1 - r^2/2! + r^4/4! - ... // -// are not very small when |r| is close to pi/4 and the rounding -// errors will be a concern if simple polynomial accumulation is -// used. When |r| < 2^-3, however, the second terms will be small -// enough (6 bits or so of right shift) that a normal Horner -// recurrence suffices. Hence there are two cases that we consider +// are not very small when |r| is close to pi/4 and the rounding +// errors will be a concern if simple polynomial accumulation is +// used. When |r| < 2^-3, however, the second terms will be small +// enough (6 bits or so of right shift) that a normal Horner +// recurrence suffices. 
Hence there are two cases that we consider // in the accurate computation of sinl(r) and cosl(r), |r| <= pi/4. // // Case small_r: |r| < 2^(-3) @@ -197,88 +202,88 @@ // Since Arg = M pi/4 + r + c accurately, and M mod 4 is [i_0 i_1], // we have // -// sinl(Arg) = (-1)^i_0 * sinl(r + c) if i_1 = 0 -// = (-1)^i_0 * cosl(r + c) if i_1 = 1 +// sinl(Arg) = (-1)^i_0 * sinl(r + c) if i_1 = 0 +// = (-1)^i_0 * cosl(r + c) if i_1 = 1 // // can be accurately approximated by // -// sinl(Arg) = (-1)^i_0 * [sinl(r) + c] if i_1 = 0 +// sinl(Arg) = (-1)^i_0 * [sinl(r) + c] if i_1 = 0 // = (-1)^i_0 * [cosl(r) - c*r] if i_1 = 1 // -// because |r| is small and thus the second terms in the correction +// because |r| is small and thus the second terms in the correction // are unneccessary. // -// Finally, sinl(r) and cosl(r) are approximated by polynomials of +// Finally, sinl(r) and cosl(r) are approximated by polynomials of // moderate lengths. // // sinl(r) = r + S_1 r^3 + S_2 r^5 + ... + S_5 r^11 // cosl(r) = 1 + C_1 r^2 + C_2 r^4 + ... + C_5 r^10 // -// We can make use of predicates to selectively calculate -// sinl(r) or cosl(r) based on i_1. +// We can make use of predicates to selectively calculate +// sinl(r) or cosl(r) based on i_1. // // Case normal_r: 2^(-3) <= |r| <= pi/4 // ------------------------------------ // // This case is more likely than the previous one if one considers // r to be uniformly distributed in [-pi/4 pi/4]. Again, -// -// sinl(Arg) = (-1)^i_0 * sinl(r + c) if i_1 = 0 -// = (-1)^i_0 * cosl(r + c) if i_1 = 1. // -// Because |r| is now larger, we need one extra term in the +// sinl(Arg) = (-1)^i_0 * sinl(r + c) if i_1 = 0 +// = (-1)^i_0 * cosl(r + c) if i_1 = 1. +// +// Because |r| is now larger, we need one extra term in the // correction. sinl(Arg) can be accurately approximated by // // sinl(Arg) = (-1)^i_0 * [sinl(r) + c(1-r^2/2)] if i_1 = 0 // = (-1)^i_0 * [cosl(r) - c*r*(1 - r^2/6)] i_1 = 1. 
// -// Finally, sinl(r) and cosl(r) are approximated by polynomials of +// Finally, sinl(r) and cosl(r) are approximated by polynomials of // moderate lengths. // -// sinl(r) = r + PP_1_hi r^3 + PP_1_lo r^3 + -// PP_2 r^5 + ... + PP_8 r^17 +// sinl(r) = r + PP_1_hi r^3 + PP_1_lo r^3 + +// PP_2 r^5 + ... + PP_8 r^17 // -// cosl(r) = 1 + QQ_1 r^2 + QQ_2 r^4 + ... + QQ_8 r^16 +// cosl(r) = 1 + QQ_1 r^2 + QQ_2 r^4 + ... + QQ_8 r^16 // -// where PP_1_hi is only about 16 bits long and QQ_1 is -1/2. -// The crux in accurate computation is to calculate +// where PP_1_hi is only about 16 bits long and QQ_1 is -1/2. +// The crux in accurate computation is to calculate // // r + PP_1_hi r^3 or 1 + QQ_1 r^2 // -// accurately as two pieces: U_hi and U_lo. The way to achieve this -// is to obtain r_hi as a 10 sig. bit number that approximates r to +// accurately as two pieces: U_hi and U_lo. The way to achieve this +// is to obtain r_hi as a 10 sig. bit number that approximates r to // roughly 8 bits or so of accuracy. (One convenient way is // // r_hi := frcpa( frcpa( r ) ).) // // This way, // -// r + PP_1_hi r^3 = r + PP_1_hi r_hi^3 + -// PP_1_hi (r^3 - r_hi^3) -// = [r + PP_1_hi r_hi^3] + -// [PP_1_hi (r - r_hi) -// (r^2 + r_hi r + r_hi^2) ] -// = U_hi + U_lo +// r + PP_1_hi r^3 = r + PP_1_hi r_hi^3 + +// PP_1_hi (r^3 - r_hi^3) +// = [r + PP_1_hi r_hi^3] + +// [PP_1_hi (r - r_hi) +// (r^2 + r_hi r + r_hi^2) ] +// = U_hi + U_lo // // Since r_hi is only 10 bit long and PP_1_hi is only 16 bit long, -// PP_1_hi * r_hi^3 is only at most 46 bit long and thus computed -// exactly. Furthermore, r and PP_1_hi r_hi^3 are of opposite sign -// and that there is no more than 8 bit shift off between r and -// PP_1_hi * r_hi^3. Hence the sum, U_hi, is representable and thus -// calculated without any error. Finally, the fact that +// PP_1_hi * r_hi^3 is only at most 46 bit long and thus computed +// exactly. 
Furthermore, r and PP_1_hi r_hi^3 are of opposite sign +// and that there is no more than 8 bit shift off between r and +// PP_1_hi * r_hi^3. Hence the sum, U_hi, is representable and thus +// calculated without any error. Finally, the fact that // -// |U_lo| <= 2^(-8) |U_hi| +// |U_lo| <= 2^(-8) |U_hi| // -// says that U_hi + U_lo is approximating r + PP_1_hi r^3 to roughly +// says that U_hi + U_lo is approximating r + PP_1_hi r^3 to roughly // 8 extra bits of accuracy. // // Similarly, // -// 1 + QQ_1 r^2 = [1 + QQ_1 r_hi^2] + -// [QQ_1 (r - r_hi)(r + r_hi)] -// = U_hi + U_lo. -// -// Summarizing, we calculate r_hi = frcpa( frcpa( r ) ). +// 1 + QQ_1 r^2 = [1 + QQ_1 r_hi^2] + +// [QQ_1 (r - r_hi)(r + r_hi)] +// = U_hi + U_lo. +// +// Summarizing, we calculate r_hi = frcpa( frcpa( r ) ). // // If i_1 = 0, then // @@ -297,35 +302,35 @@ // End // // Finally, -// -// V := poly + ( U_lo + correction ) +// +// V := poly + ( U_lo + correction ) // // / U_hi + V if i_0 = 0 -// result := | +// result := | // \ (-U_hi) - V if i_0 = 1 // -// It is important that in the last step, negation of U_hi is -// performed prior to the subtraction which is to be performed in -// the user-set rounding mode. +// It is important that in the last step, negation of U_hi is +// performed prior to the subtraction which is to be performed in +// the user-set rounding mode. // // // Algorithmic Description // ======================= // -// The argument reduction algorithm is tightly integrated into FSIN -// and FCOS which share the same code. The following is complete and -// self-contained. The argument reduction description given +// The argument reduction algorithm is tightly integrated into FSIN +// and FCOS which share the same code. The following is complete and +// self-contained. The argument reduction description given // previously is repeated below. // // -// Step 0. Initialization. +// Step 0. Initialization. 
// // If FSIN is invoked, set N_inc := 0; else if FCOS is invoked, // set N_inc := 1. // // Step 1. Check for exceptional and special cases. // -// * If Arg is +-0, +-inf, NaN, NaT, go to Step 10 for special +// * If Arg is +-0, +-inf, NaN, NaT, go to Step 10 for special // handling. // * If |Arg| < 2^24, go to Step 2 for reduction of moderate // arguments. This is the most likely case. @@ -335,18 +340,18 @@ // // Step 2. Reduction of moderate arguments. // -// If |Arg| < pi/4 ...quick branch -// N_fix := N_inc (integer) +// If |Arg| < pi/4 ...quick branch +// N_fix := N_inc (integer) // r := Arg // c := 0.0 // Branch to Step 4, Case_1_complete -// Else ...cf. argument reduction -// N := Arg * two_by_PI (fp) -// N_fix := fcvt.fx( N ) (int) +// Else ...cf. argument reduction +// N := Arg * two_by_PI (fp) +// N_fix := fcvt.fx( N ) (int) // N := fcvt.xf( N_fix ) // N_fix := N_fix + N_inc -// s := Arg - N * P_1 (first piece of pi/2) -// w := -N * P_2 (second piece of pi/2) +// s := Arg - N * P_1 (first piece of pi/2) +// w := -N * P_2 (second piece of pi/2) // // If |s| >= 2^(-33) // go to Step 3, Case_1_reduce @@ -358,8 +363,8 @@ // Step 3. Case_1_reduce. // // r := s + w -// c := (s - r) + w ...observe order -// +// c := (s - r) + w ...observe order +// // Step 4. Case_1_complete // // ...At this point, the reduced argument alpha is @@ -375,17 +380,17 @@ // // If i_1 = 0, then // poly := r*FR_rsq*(PP_1_lo + FR_rsq*(PP_2 + ... FR_rsq*PP_8)) -// U_hi := r + PP_1_hi*r_hi*r_hi*r_hi ...any order +// U_hi := r + PP_1_hi*r_hi*r_hi*r_hi ...any order // U_lo := PP_1_hi*r_lo*(r*r + r*r_hi + r_hi*r_hi) -// correction := c + c*C_1*FR_rsq ...any order +// correction := c + c*C_1*FR_rsq ...any order // Else // poly := FR_rsq*FR_rsq*(QQ_2 + FR_rsq*(QQ_3 + ... 
+ FR_rsq*QQ_8)) -// U_hi := 1 + QQ_1 * r_hi * r_hi ...any order +// U_hi := 1 + QQ_1 * r_hi * r_hi ...any order // U_lo := QQ_1 * r_lo * (r + r_hi) -// correction := -c*(r + S_1*FR_rsq*r) ...any order +// correction := -c*(r + S_1*FR_rsq*r) ...any order // Endif // -// V := poly + (U_lo + correction) ...observe order +// V := poly + (U_lo + correction) ...observe order // // result := (i_0 == 0? 1.0 : -1.0) // @@ -397,7 +402,7 @@ // Return // // Step 6. Small_r. -// +// // ...Use flush to zero mode without causing exception // Let [i_0 i_1] be the two lsb of N_fix. // @@ -412,7 +417,7 @@ // Else // z := FR_rsq*FR_rsq; z := FR_rsq*z // poly_lo := C_3 + FR_rsq*(C_4 + FR_rsq*C_5) -// poly_hi := FR_rsq*(C_1 + FR_rsq*C_2) +// poly_hi := FR_rsq*(C_1 + FR_rsq*C_2) // correction := -c*r // result := 1 // Endif @@ -429,15 +434,15 @@ // // Step 7. Case_2_reduce. // -// ...Refer to the write up for argument reduction for +// ...Refer to the write up for argument reduction for // ...rationale. The reduction algorithm below is taken from // ...argument reduction description and integrated this. // // w := N*P_3 -// U_1 := N*P_2 + w ...FMA -// U_2 := (N*P_2 - U_1) + w ...2 FMA +// U_1 := N*P_2 + w ...FMA +// U_2 := (N*P_2 - U_1) + w ...2 FMA // ...U_1 + U_2 is N*(P_2+P_3) accurately -// +// // r := s - U_1 // c := ( (s - r) - U_1 ) - U_2 // @@ -446,29 +451,29 @@ // ...Case 1, this case requires much more work to reduce // ...the argument, the subsequent calculation needed for // ...any of the trigonometric function is very little because -// ...|alpha| < 1.01*2^(-33) and thus two terms of the +// ...|alpha| < 1.01*2^(-33) and thus two terms of the // ...Taylor series expansion suffices. // // If i_1 = 0 then -// poly := c + S_1 * r * r * r ...any order +// poly := c + S_1 * r * r * r ...any order // result := r // Else // poly := -2^(-67) // result := 1.0 // Endif -// +// // If i_0 = 1, result := -result // // Last operation. 
Perform in user-set rounding mode // // result := (i_0 == 0? result + poly : // result - poly ) -// +// // Return // -// +// // Step 8. Pre-reduction of large arguments. -// +// // ...Again, the following reduction procedure was described // ...in the separate write up for argument reduction, which // ...is tightly integrated here. @@ -476,13 +481,13 @@ // N_0 := Arg * Inv_P_0 // N_0_fix := fcvt.fx( N_0 ) // N_0 := fcvt.xf( N_0_fix) - + // Arg' := Arg - N_0 * P_0 // w := N_0 * d_1 // N := Arg' * two_by_PI // N_fix := fcvt.fx( N ) // N := fcvt.xf( N_fix ) -// N_fix := N_fix + N_inc +// N_fix := N_fix + N_inc // // s := Arg' - N * P_1 // w := w - N * P_2 @@ -494,15 +499,15 @@ // Endif // // Step 9. Case_4_reduce. -// +// // ...first obtain N_0*d_1 and -N*P_2 accurately -// U_hi := N_0 * d_1 V_hi := -N*P_2 -// U_lo := N_0 * d_1 - U_hi V_lo := -N*P_2 - U_hi ...FMAs +// U_hi := N_0 * d_1 V_hi := -N*P_2 +// U_lo := N_0 * d_1 - U_hi V_lo := -N*P_2 - U_hi ...FMAs // // ...compute the contribution from N_0*d_1 and -N*P_3 // w := -N*P_3 // w := w + N_0*d_2 -// t := U_lo + V_lo + w ...any order +// t := U_lo + V_lo + w ...any order // // ...at this point, the mathematical value // ...s + U_hi + V_hi + t approximates the true reduced argument @@ -517,12 +522,12 @@ // endif // ...order in computing "a" must be observed. This branch is // ...best implemented by predicates. -// ...A + a is U_hi + V_hi accurately. Moreover, "a" is +// ...A + a is U_hi + V_hi accurately. Moreover, "a" is // ...much smaller than A: |a| <= (1/2)ulp(A). // // ...Just need to calculate s + A + a + t -// C_hi := s + A t := t + a -// C_lo := (s - C_hi) + A +// C_hi := s + A t := t + a +// C_lo := (s - C_hi) + A // C_lo := C_lo + t // // ...Final steps for reduction @@ -548,156 +553,191 @@ // result := (i_0 == 0? result + poly : // result - poly ) // Return -// +// // Large Arguments: For arguments above 2**63, a Payne-Hanek // style argument reduction is used and pi_by_2 reduce is called. 
// -#include "libm_support.h" - -#ifdef _LIBC -.rodata -#else -.data -#endif -.align 64 - -FSINCOSL_CONSTANTS: -ASM_TYPE_DIRECTIVE(FSINCOSL_CONSTANTS,@object) -data4 0x4B800000, 0xCB800000, 0x00000000,0x00000000 // two**24, -two**24 -data4 0x4E44152A, 0xA2F9836E, 0x00003FFE,0x00000000 // Inv_pi_by_2 -data4 0xCE81B9F1, 0xC84D32B0, 0x00004016,0x00000000 // P_0 -data4 0x2168C235, 0xC90FDAA2, 0x00003FFF,0x00000000 // P_1 -data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD,0x00000000 // P_2 -data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C,0x00000000 // P_3 -data4 0x5F000000, 0xDF000000, 0x00000000,0x00000000 // two_to_63, -two_to_63 -data4 0x6EC6B45A, 0xA397E504, 0x00003FE7,0x00000000 // Inv_P_0 -data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF,0x00000000 // d_1 -data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C,0x00000000 // d_2 -data4 0x2168C234, 0xC90FDAA2, 0x00003FFE,0x00000000 // pi_by_4 -data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE,0x00000000 // neg_pi_by_4 -data4 0x3E000000, 0xBE000000, 0x00000000,0x00000000 // two**-3, -two**-3 -data4 0x2F000000, 0xAF000000, 0x9E000000,0x00000000 // two**-33, -two**-33, -two**-67 -data4 0xA21C0BC9, 0xCC8ABEBC, 0x00003FCE,0x00000000 // PP_8 -data4 0x720221DA, 0xD7468A05, 0x0000BFD6,0x00000000 // PP_7 -data4 0x640AD517, 0xB092382F, 0x00003FDE,0x00000000 // PP_6 -data4 0xD1EB75A4, 0xD7322B47, 0x0000BFE5,0x00000000 // PP_5 -data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1 -data4 0x00000000, 0xAAAA0000, 0x0000BFFC,0x00000000 // PP_1_hi -data4 0xBAF69EEA, 0xB8EF1D2A, 0x00003FEC,0x00000000 // PP_4 -data4 0x0D03BB69, 0xD00D00D0, 0x0000BFF2,0x00000000 // PP_3 -data4 0x88888962, 0x88888888, 0x00003FF8,0x00000000 // PP_2 -data4 0xAAAB0000, 0xAAAAAAAA, 0x0000BFEC,0x00000000 // PP_1_lo -data4 0xC2B0FE52, 0xD56232EF, 0x00003FD2,0x00000000 // QQ_8 -data4 0x2B48DCA6, 0xC9C99ABA, 0x0000BFDA,0x00000000 // QQ_7 -data4 0x9C716658, 0x8F76C650, 0x00003FE2,0x00000000 // QQ_6 -data4 0xFDA8D0FC, 0x93F27DBA, 0x0000BFE9,0x00000000 // QQ_5 -data4 0xAAAAAAAA, 0xAAAAAAAA, 
0x0000BFFC,0x00000000 // S_1 -data4 0x00000000, 0x80000000, 0x0000BFFE,0x00000000 // QQ_1 -data4 0x0C6E5041, 0xD00D00D0, 0x00003FEF,0x00000000 // QQ_4 -data4 0x0B607F60, 0xB60B60B6, 0x0000BFF5,0x00000000 // QQ_3 -data4 0xAAAAAA9B, 0xAAAAAAAA, 0x00003FFA,0x00000000 // QQ_2 -data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1 -data4 0xAAAA719F, 0xAAAAAAAA, 0x00003FFA,0x00000000 // C_2 -data4 0x0356F994, 0xB60B60B6, 0x0000BFF5,0x00000000 // C_3 -data4 0xB2385EA9, 0xD00CFFD5, 0x00003FEF,0x00000000 // C_4 -data4 0x292A14CD, 0x93E4BD18, 0x0000BFE9,0x00000000 // C_5 -data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1 -data4 0x888868DB, 0x88888888, 0x00003FF8,0x00000000 // S_2 -data4 0x055EFD4B, 0xD00D00D0, 0x0000BFF2,0x00000000 // S_3 -data4 0x839730B9, 0xB8EF1C5D, 0x00003FEC,0x00000000 // S_4 -data4 0xE5B3F492, 0xD71EA3A4, 0x0000BFE5,0x00000000 // S_5 -data4 0x38800000, 0xB8800000, 0x00000000 // two**-14, -two**-14 -ASM_SIZE_DIRECTIVE(FSINCOSL_CONSTANTS) - -FR_Input_X = f8 -FR_Neg_Two_to_M3 = f32 -FR_Two_to_63 = f32 -FR_Two_to_24 = f33 -FR_Pi_by_4 = f33 -FR_Two_to_M14 = f34 -FR_Two_to_M33 = f35 -FR_Neg_Two_to_24 = f36 -FR_Neg_Pi_by_4 = f36 -FR_Neg_Two_to_M14 = f37 -FR_Neg_Two_to_M33 = f38 -FR_Neg_Two_to_M67 = f39 -FR_Inv_pi_by_2 = f40 -FR_N_float = f41 -FR_N_fix = f42 -FR_P_1 = f43 -FR_P_2 = f44 -FR_P_3 = f45 -FR_s = f46 -FR_w = f47 -FR_c = f48 -FR_r = f49 -FR_Z = f50 -FR_A = f51 -FR_a = f52 -FR_t = f53 -FR_U_1 = f54 -FR_U_2 = f55 -FR_C_1 = f56 -FR_C_2 = f57 -FR_C_3 = f58 -FR_C_4 = f59 -FR_C_5 = f60 -FR_S_1 = f61 -FR_S_2 = f62 -FR_S_3 = f63 -FR_S_4 = f64 -FR_S_5 = f65 -FR_poly_hi = f66 -FR_poly_lo = f67 -FR_r_hi = f68 -FR_r_lo = f69 -FR_rsq = f70 -FR_r_cubed = f71 -FR_C_hi = f72 -FR_N_0 = f73 -FR_d_1 = f74 -FR_V = f75 -FR_V_hi = f75 -FR_V_lo = f76 -FR_U_hi = f77 -FR_U_lo = f78 -FR_U_hiabs = f79 -FR_V_hiabs = f80 -FR_PP_8 = f81 -FR_QQ_8 = f81 -FR_PP_7 = f82 -FR_QQ_7 = f82 -FR_PP_6 = f83 -FR_QQ_6 = f83 -FR_PP_5 = f84 -FR_QQ_5 = f84 -FR_PP_4 = f85 -FR_QQ_4 = 
f85 -FR_PP_3 = f86 -FR_QQ_3 = f86 -FR_PP_2 = f87 -FR_QQ_2 = f87 -FR_QQ_1 = f88 -FR_N_0_fix = f89 -FR_Inv_P_0 = f90 -FR_corr = f91 -FR_poly = f92 -FR_d_2 = f93 -FR_Two_to_M3 = f94 -FR_Neg_Two_to_63 = f94 -FR_P_0 = f95 -FR_C_lo = f96 -FR_PP_1 = f97 -FR_PP_1_lo = f98 -FR_ArgPrime = f99 - -GR_Table_Base = r32 -GR_Table_Base1 = r33 -GR_i_0 = r34 -GR_i_1 = r35 -GR_N_Inc = r36 -GR_Sin_or_Cos = r37 + +RODATA +.align 16 + +LOCAL_OBJECT_START(FSINCOSL_CONSTANTS) + +sincosl_table_p: +data8 0xA2F9836E4E44152A, 0x00003FFE // Inv_pi_by_2 +data8 0xC84D32B0CE81B9F1, 0x00004016 // P_0 +data8 0xC90FDAA22168C235, 0x00003FFF // P_1 +data8 0xECE675D1FC8F8CBB, 0x0000BFBD // P_2 +data8 0xB7ED8FBBACC19C60, 0x0000BF7C // P_3 +data8 0x8D848E89DBD171A1, 0x0000BFBF // d_1 +data8 0xD5394C3618A66F8E, 0x0000BF7C // d_2 +LOCAL_OBJECT_END(FSINCOSL_CONSTANTS) + +LOCAL_OBJECT_START(sincosl_table_d) +data8 0xC90FDAA22168C234, 0x00003FFE // pi_by_4 +data8 0xA397E5046EC6B45A, 0x00003FE7 // Inv_P_0 +data4 0x3E000000, 0xBE000000 // 2^-3 and -2^-3 +data4 0x2F000000, 0xAF000000 // 2^-33 and -2^-33 +data4 0x9E000000, 0x00000000 // -2^-67 +data4 0x00000000, 0x00000000 // pad +LOCAL_OBJECT_END(sincosl_table_d) + +LOCAL_OBJECT_START(sincosl_table_pp) +data8 0xCC8ABEBCA21C0BC9, 0x00003FCE // PP_8 +data8 0xD7468A05720221DA, 0x0000BFD6 // PP_7 +data8 0xB092382F640AD517, 0x00003FDE // PP_6 +data8 0xD7322B47D1EB75A4, 0x0000BFE5 // PP_5 +data8 0xFFFFFFFFFFFFFFFE, 0x0000BFFD // C_1 +data8 0xAAAA000000000000, 0x0000BFFC // PP_1_hi +data8 0xB8EF1D2ABAF69EEA, 0x00003FEC // PP_4 +data8 0xD00D00D00D03BB69, 0x0000BFF2 // PP_3 +data8 0x8888888888888962, 0x00003FF8 // PP_2 +data8 0xAAAAAAAAAAAB0000, 0x0000BFEC // PP_1_lo +LOCAL_OBJECT_END(sincosl_table_pp) + +LOCAL_OBJECT_START(sincosl_table_qq) +data8 0xD56232EFC2B0FE52, 0x00003FD2 // QQ_8 +data8 0xC9C99ABA2B48DCA6, 0x0000BFDA // QQ_7 +data8 0x8F76C6509C716658, 0x00003FE2 // QQ_6 +data8 0x93F27DBAFDA8D0FC, 0x0000BFE9 // QQ_5 +data8 0xAAAAAAAAAAAAAAAA, 0x0000BFFC // S_1 
+data8 0x8000000000000000, 0x0000BFFE // QQ_1 +data8 0xD00D00D00C6E5041, 0x00003FEF // QQ_4 +data8 0xB60B60B60B607F60, 0x0000BFF5 // QQ_3 +data8 0xAAAAAAAAAAAAAA9B, 0x00003FFA // QQ_2 +LOCAL_OBJECT_END(sincosl_table_qq) + +LOCAL_OBJECT_START(sincosl_table_c) +data8 0xFFFFFFFFFFFFFFFE, 0x0000BFFD // C_1 +data8 0xAAAAAAAAAAAA719F, 0x00003FFA // C_2 +data8 0xB60B60B60356F994, 0x0000BFF5 // C_3 +data8 0xD00CFFD5B2385EA9, 0x00003FEF // C_4 +data8 0x93E4BD18292A14CD, 0x0000BFE9 // C_5 +LOCAL_OBJECT_END(sincosl_table_c) + +LOCAL_OBJECT_START(sincosl_table_s) +data8 0xAAAAAAAAAAAAAAAA, 0x0000BFFC // S_1 +data8 0x88888888888868DB, 0x00003FF8 // S_2 +data8 0xD00D00D0055EFD4B, 0x0000BFF2 // S_3 +data8 0xB8EF1C5D839730B9, 0x00003FEC // S_4 +data8 0xD71EA3A4E5B3F492, 0x0000BFE5 // S_5 +data4 0x38800000, 0xB8800000 // two**-14 and -two**-14 +LOCAL_OBJECT_END(sincosl_table_s) + +FR_Input_X = f8 +FR_Result = f8 + +FR_r = f8 +FR_c = f9 + +FR_norm_x = f9 +FR_inv_pi_2to63 = f10 +FR_rshf_2to64 = f11 +FR_2tom64 = f12 +FR_rshf = f13 +FR_N_float_signif = f14 +FR_abs_x = f15 +FR_Pi_by_4 = f34 +FR_Two_to_M14 = f35 +FR_Neg_Two_to_M14 = f36 +FR_Two_to_M33 = f37 +FR_Neg_Two_to_M33 = f38 +FR_Neg_Two_to_M67 = f39 +FR_Inv_pi_by_2 = f40 +FR_N_float = f41 +FR_N_fix = f42 +FR_P_1 = f43 +FR_P_2 = f44 +FR_P_3 = f45 +FR_s = f46 +FR_w = f47 +FR_d_2 = f48 +FR_tmp_result = f49 +FR_Z = f50 +FR_A = f51 +FR_a = f52 +FR_t = f53 +FR_U_1 = f54 +FR_U_2 = f55 +FR_C_1 = f56 +FR_C_2 = f57 +FR_C_3 = f58 +FR_C_4 = f59 +FR_C_5 = f60 +FR_S_1 = f61 +FR_S_2 = f62 +FR_S_3 = f63 +FR_S_4 = f64 +FR_S_5 = f65 +FR_poly_hi = f66 +FR_poly_lo = f67 +FR_r_hi = f68 +FR_r_lo = f69 +FR_rsq = f70 +FR_r_cubed = f71 +FR_C_hi = f72 +FR_N_0 = f73 +FR_d_1 = f74 +FR_V = f75 +FR_V_hi = f75 +FR_V_lo = f76 +FR_U_hi = f77 +FR_U_lo = f78 +FR_U_hiabs = f79 +FR_V_hiabs = f80 +FR_PP_8 = f81 +FR_QQ_8 = f101 +FR_PP_7 = f82 +FR_QQ_7 = f102 +FR_PP_6 = f83 +FR_QQ_6 = f103 +FR_PP_5 = f84 +FR_QQ_5 = f104 +FR_PP_4 = f85 +FR_QQ_4 = f105 +FR_PP_3 = f86 
+FR_QQ_3 = f106 +FR_PP_2 = f87 +FR_QQ_2 = f107 +FR_QQ_1 = f108 +FR_r_hi_sq = f88 +FR_N_0_fix = f89 +FR_Inv_P_0 = f90 +FR_corr = f91 +FR_poly = f92 +FR_Neg_Two_to_M3 = f93 +FR_Two_to_M3 = f94 +FR_P_0 = f95 +FR_C_lo = f96 +FR_PP_1 = f97 +FR_PP_1_lo = f98 +FR_ArgPrime = f99 +FR_inexact = f100 + +GR_sig_inv_pi = r14 +GR_rshf_2to64 = r15 +GR_exp_2tom64 = r16 +GR_rshf = r17 +GR_ad_p = r18 +GR_ad_d = r19 +GR_ad_pp = r20 +GR_ad_qq = r21 +GR_ad_c = r22 +GR_ad_s = r23 +GR_ad_ce = r24 +GR_ad_se = r25 +GR_ad_m14 = r26 +GR_ad_s1 = r27 +GR_exp_m2_to_m3= r36 +GR_N_Inc = r37 +GR_Sin_or_Cos = r38 +GR_signexp_x = r40 +GR_exp_x = r40 +GR_exp_mask = r41 +GR_exp_2_to_63 = r42 +GR_exp_2_to_m3 = r43 +GR_exp_2_to_24 = r44 // Added for unwind support @@ -706,386 +746,376 @@ GR_SAVE_GP = r40 GR_SAVE_PFS = r41 -.global sinl# -.global cosl# -#ifdef _LIBC -.global __sinl# -.global __cosl# -#endif - .section .text -.proc sinl# -#ifdef _LIBC -.proc __sinl# -#endif -.align 64 -sinl: -#ifdef _LIBC -__sinl: -#endif + +GLOBAL_IEEE754_ENTRY(sinl) { .mlx -alloc GR_Table_Base = ar.pfs,0,12,2,0 -(p0) movl GR_Sin_or_Cos = 0x0 ;; + alloc r32 = ar.pfs,0,12,2,0 + movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi } - -{ .mmi - nop.m 999 -(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp - nop.i 999 +{ .mlx + mov GR_Sin_or_Cos = 0x0 + movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64) } ;; -{ .mmb - ld8 GR_Table_Base = [GR_Table_Base] +{ .mfi + addl GR_ad_p = @ltoff(FSINCOSL_CONSTANTS#), gp + fclass.m p6, p0 = FR_Input_X, 0x1E3 // Test x natval, nan, inf + mov GR_exp_2_to_m3 = 0xffff - 3 // Exponent of 2^-3 +} +{ .mfb nop.m 999 -(p0) br.cond.sptk L(SINCOSL_CONTINUE) ;; + fnorm.s1 FR_norm_x = FR_Input_X // Normalize x + br.cond.sptk SINCOSL_CONTINUE } ;; - -.endp sinl# -ASM_SIZE_DIRECTIVE(sinl#) - -.section .text -.proc cosl# -cosl: -#ifdef _LIBC -.proc __cosl# -__cosl: -#endif +GLOBAL_IEEE754_END(sinl) +GLOBAL_IEEE754_ENTRY(cosl) +{ .mlx + alloc r32 = ar.pfs,0,12,2,0 + movl 
GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi +} { .mlx -alloc GR_Table_Base= ar.pfs,0,12,2,0 -(p0) movl GR_Sin_or_Cos = 0x1 ;; + mov GR_Sin_or_Cos = 0x1 + movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64) } ;; -{ .mmi +{ .mfi + addl GR_ad_p = @ltoff(FSINCOSL_CONSTANTS#), gp + fclass.m p6, p0 = FR_Input_X, 0x1E3 // Test x natval, nan, inf + mov GR_exp_2_to_m3 = 0xffff - 3 // Exponent of 2^-3 +} +{ .mfi nop.m 999 -(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp + fnorm.s1 FR_norm_x = FR_Input_X // Normalize x nop.i 999 } ;; -{ .mmb - ld8 GR_Table_Base = [GR_Table_Base] - nop.m 999 - nop.b 999 +SINCOSL_CONTINUE: +{ .mfi + setf.sig FR_inv_pi_2to63 = GR_sig_inv_pi // Form 1/pi * 2^63 + nop.f 999 + mov GR_exp_2tom64 = 0xffff - 64 // Scaling constant to compute N +} +{ .mlx + setf.d FR_rshf_2to64 = GR_rshf_2to64 // Form const 1.1000 * 2^(63+64) + movl GR_rshf = 0x43e8000000000000 // Form const 1.1000 * 2^63 } ;; +{ .mfi + ld8 GR_ad_p = [GR_ad_p] // Point to Inv_pi_by_2 + fclass.m p7, p0 = FR_Input_X, 0x0b // Test x denormal + nop.i 999 +} +;; - -// -// Load Table Address -// - -L(SINCOSL_CONTINUE): -{ .mmi -(p0) add GR_Table_Base1 = 96, GR_Table_Base -(p0) ldfs FR_Two_to_24 = [GR_Table_Base], 4 -// GR_Sin_or_Cos denotes -(p0) mov r39 = b0 ;; +{ .mfi + getf.exp GR_signexp_x = FR_Input_X // Get sign and exponent of x + fclass.m p10, p0 = FR_Input_X, 0x007 // Test x zero + nop.i 999 } -{ .mmi - nop.m 0 -// -// Load 2**24, load 2**63. -// -(p0) ldfs FR_Neg_Two_to_24 = [GR_Table_Base], 12 - nop.i 0 +{ .mib + mov GR_exp_mask = 0x1ffff // Exponent mask + nop.i 999 +(p6) br.cond.spnt SINCOSL_SPECIAL // Branch if x natval, nan, inf } +;; + { .mfi -(p0) ldfs FR_Two_to_63 = [GR_Table_Base1], 4 -// -// Check for unnormals - unsupported operands. 
We do not want -// to generate denormal exception -// Check for NatVals, QNaNs, SNaNs, +/-Infs -// Check for EM unsupporteds -// Check for Zero -// -(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3 - nop.i 0 -};; -{ .mmf - nop.m 999 -(p0) ldfs FR_Neg_Two_to_63 = [GR_Table_Base1], 12 -(p0) fclass.nm.unc p8, p0 = FR_Input_X, 0x1FF -} -{ .mfb - nop.m 999 -(p0) fclass.m.unc p10, p0 = FR_Input_X, 0x007 -(p6) br.cond.spnt L(SINCOSL_SPECIAL) ;; + setf.exp FR_2tom64 = GR_exp_2tom64 // Form 2^-64 for scaling N_float + nop.f 0 + add GR_ad_d = 0x70, GR_ad_p // Point to constant table d } { .mib - nop.m 999 - nop.i 999 -(p8) br.cond.spnt L(SINCOSL_SPECIAL) ;; + setf.d FR_rshf = GR_rshf // Form right shift const 1.1000 * 2^63 + mov GR_exp_m2_to_m3 = 0x2fffc // Form -(2^-3) +(p7) br.cond.spnt SINCOSL_DENORMAL // Branch if x denormal } -{ .mib - nop.m 999 - nop.i 999 -// -// Branch if +/- NaN, Inf. -// Load -2**24, load -2**63. -// -(p10) br.cond.spnt L(SINCOSL_ZERO) ;; +;; + +SINCOSL_COMMON: +{ .mfi + and GR_exp_x = GR_exp_mask, GR_signexp_x // Get exponent of x + fclass.nm p8, p0 = FR_Input_X, 0x1FF // Test x unsupported type + mov GR_exp_2_to_63 = 0xffff + 63 // Exponent of 2^63 } -{ .mmb -(p0) ldfe FR_Inv_pi_by_2 = [GR_Table_Base], 16 -(p0) ldfe FR_Inv_P_0 = [GR_Table_Base1], 16 - nop.b 999 ;; +{ .mib + add GR_ad_pp = 0x40, GR_ad_d // Point to constant table pp + mov GR_exp_2_to_24 = 0xffff + 24 // Exponent of 2^24 +(p10) br.cond.spnt SINCOSL_ZERO // Branch if x zero } -{ .mmb -(p0) ldfe FR_d_1 = [GR_Table_Base1], 16 -// -// Raise possible denormal operand flag with useful fcmp -// Is x <= -2**63 -// Load Inv_P_0 for pre-reduction -// Load Inv_pi_by_2 -// -(p0) ldfe FR_P_0 = [GR_Table_Base], 16 - nop.b 999 ;; +;; + +{ .mfi + ldfe FR_Inv_pi_by_2 = [GR_ad_p], 16 // Load 2/pi + fcmp.eq.s0 p15, p0 = FR_Input_X, f0 // Dummy to set denormal + add GR_ad_qq = 0xa0, GR_ad_pp // Point to constant table qq } -{ .mmb -(p0) ldfe FR_d_2 = [GR_Table_Base1], 16 -// -// Load P_0 -// Load d_1 -// 
Is x >= 2**63 -// Is x <= -2**24? -// -(p0) ldfe FR_P_1 = [GR_Table_Base], 16 - nop.b 999 ;; +{ .mfi + ldfe FR_Pi_by_4 = [GR_ad_d], 16 // Load pi/4 for range test + nop.f 999 + cmp.ge p10,p0 = GR_exp_x, GR_exp_2_to_63 // Is |x| >= 2^63 } -// -// Load P_1 -// Load d_2 -// Is x >= 2**24? -// +;; + { .mfi -(p0) ldfe FR_P_2 = [GR_Table_Base], 16 -(p0) fcmp.le.unc.s1 p7, p8 = FR_Input_X, FR_Neg_Two_to_24 - nop.i 999 ;; + ldfe FR_P_0 = [GR_ad_p], 16 // Load P_0 for pi/4 <= |x| < 2^63 + fmerge.s FR_abs_x = f1, FR_norm_x // |x| + add GR_ad_c = 0x90, GR_ad_qq // Point to constant table c } -{ .mbb -(p0) ldfe FR_P_3 = [GR_Table_Base], 16 - nop.b 999 - nop.b 999 ;; +{ .mfi + ldfe FR_Inv_P_0 = [GR_ad_d], 16 // Load 1/P_0 for pi/4 <= |x| < 2^63 + nop.f 999 + cmp.ge p7,p0 = GR_exp_x, GR_exp_2_to_24 // Is |x| >= 2^24 } +;; + { .mfi - nop.m 999 -(p8) fcmp.ge.s1 p7, p0 = FR_Input_X, FR_Two_to_24 - nop.i 999 + ldfe FR_P_1 = [GR_ad_p], 16 // Load P_1 for pi/4 <= |x| < 2^63 + nop.f 999 + add GR_ad_s = 0x50, GR_ad_c // Point to constant table s } { .mfi -(p0) ldfe FR_Pi_by_4 = [GR_Table_Base1], 16 -// -// Branch if +/- zero. 
-// Decide about the paths to take: -// If -2**24 < FR_Input_X < 2**24 - CASE 1 OR 2 -// OTHERWISE - CASE 3 OR 4 -// -(p0) fcmp.le.unc.s0 p10, p11 = FR_Input_X, FR_Neg_Two_to_63 - nop.i 999 ;; + ldfe FR_PP_8 = [GR_ad_pp], 16 // Load PP_8 for 2^-3 < |r| < pi/4 + nop.f 999 + nop.i 999 } -{ .mmi -(p0) ldfe FR_Neg_Pi_by_4 = [GR_Table_Base1], 16 ;; -(p0) ldfs FR_Two_to_M3 = [GR_Table_Base1], 4 - nop.i 999 +;; + +{ .mfi + ldfe FR_P_2 = [GR_ad_p], 16 // Load P_2 for pi/4 <= |x| < 2^63 + nop.f 999 + add GR_ad_ce = 0x40, GR_ad_c // Point to end of constant table c } { .mfi - nop.m 999 -(p11) fcmp.ge.s1 p10, p0 = FR_Input_X, FR_Two_to_63 - nop.i 999 ;; + ldfe FR_QQ_8 = [GR_ad_qq], 16 // Load QQ_8 for 2^-3 < |r| < pi/4 + nop.f 999 + nop.i 999 } -{ .mib -(p0) ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1], 12 - nop.i 999 -// -// Load P_2 -// Load P_3 -// Load pi_by_4 -// Load neg_pi_by_4 -// Load 2**(-3) -// Load -2**(-3). -// -(p10) br.cond.spnt L(SINCOSL_ARG_TOO_LARGE) ;; +;; + +{ .mfi + ldfe FR_QQ_7 = [GR_ad_qq], 16 // Load QQ_7 for 2^-3 < |r| < pi/4 + fma.s1 FR_N_float_signif = FR_Input_X, FR_inv_pi_2to63, FR_rshf_2to64 + add GR_ad_se = 0x40, GR_ad_s // Point to end of constant table s } { .mib - nop.m 999 - nop.i 999 -// -// Branch out if x >= 2**63. Use Payne-Hanek Reduction -// -(p7) br.cond.spnt L(SINCOSL_LARGER_ARG) ;; + ldfe FR_PP_7 = [GR_ad_pp], 16 // Load PP_7 for 2^-3 < |r| < pi/4 + mov GR_ad_s1 = GR_ad_s // Save pointer to S_1 +(p10) br.cond.spnt SINCOSL_ARG_TOO_LARGE // Branch if |x| >= 2^63 + // Use Payne-Hanek Reduction } +;; + { .mfi - nop.m 999 -// -// Branch if Arg <= -2**24 or Arg >= 2**24 and use pre-reduction. 
-// -(p0) fma.s1 FR_N_float = FR_Input_X, FR_Inv_pi_by_2, f0 - nop.i 999 ;; + ldfe FR_P_3 = [GR_ad_p], 16 // Load P_3 for pi/4 <= |x| < 2^63 + fmerge.se FR_r = FR_norm_x, FR_norm_x // r = x, in case |x| < pi/4 + add GR_ad_m14 = 0x50, GR_ad_s // Point to constant table m14 } -{ .mfi - nop.m 999 -(p0) fcmp.lt.unc.s1 p6, p7 = FR_Input_X, FR_Pi_by_4 - nop.i 999 ;; +{ .mfb + ldfps FR_Two_to_M3, FR_Neg_Two_to_M3 = [GR_ad_d], 8 + fma.s1 FR_rsq = FR_norm_x, FR_norm_x, f0 // rsq = x*x, in case |x| < pi/4 +(p7) br.cond.spnt SINCOSL_LARGER_ARG // Branch if 2^24 <= |x| < 2^63 + // Use pre-reduction +} +;; + +{ .mmf + ldfe FR_PP_6 = [GR_ad_pp], 16 // Load PP_6 for normal path + ldfe FR_QQ_6 = [GR_ad_qq], 16 // Load QQ_6 for normal path + fmerge.se FR_c = f0, f0 // c = 0 in case |x| < pi/4 } +;; + +{ .mmf + ldfe FR_PP_5 = [GR_ad_pp], 16 // Load PP_5 for normal path + ldfe FR_QQ_5 = [GR_ad_qq], 16 // Load QQ_5 for normal path + nop.f 999 +} +;; + +// Here if 0 < |x| < 2^24 { .mfi - nop.m 999 -// -// Select the case when |Arg| < pi/4 -// Else Select the case when |Arg| >= pi/4 -// -(p0) fcvt.fx.s1 FR_N_fix = FR_N_float - nop.i 999 ;; + ldfe FR_S_5 = [GR_ad_se], -16 // Load S_5 if i_1=0 + fcmp.lt.s1 p6, p7 = FR_abs_x, FR_Pi_by_4 // Test |x| < pi/4 + nop.i 999 } { .mfi - nop.m 999 + ldfe FR_C_5 = [GR_ad_ce], -16 // Load C_5 if i_1=1 + fms.s1 FR_N_float = FR_N_float_signif, FR_2tom64, FR_rshf + nop.i 999 +} +;; + +{ .mmi + ldfe FR_S_4 = [GR_ad_se], -16 // Load S_4 if i_1=0 + ldfe FR_C_4 = [GR_ad_ce], -16 // Load C_4 if i_1=1 + nop.i 999 +} +;; + // // N = Arg * 2/pi // Check if Arg < pi/4 // -(p6) fcmp.gt.s1 p6, p7 = FR_Input_X, FR_Neg_Pi_by_4 - nop.i 999 ;; -} // // Case 2: Convert integer N_fix back to normalized floating-point value. 
// Case 1: p8 is only affected when p6 is set // -{ .mfi -(p7) ldfs FR_Two_to_M33 = [GR_Table_Base1], 4 // // Grab the integer part of N and call it N_fix // -(p6) fmerge.se FR_r = FR_Input_X, FR_Input_X -// If |x| < pi/4, r = x and c = 0 +{ .mfi +(p7) ldfps FR_Two_to_M33, FR_Neg_Two_to_M33 = [GR_ad_d], 8 +(p6) fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 // r^3 if |x| < pi/4 +(p6) mov GR_N_Inc = GR_Sin_or_Cos // N_Inc if |x| < pi/4 +} +;; + +// If |x| < pi/4, r = x and c = 0 // lf |x| < pi/4, is x < 2**(-3). -// r = Arg +// r = Arg // c = 0 -(p6) mov GR_N_Inc = GR_Sin_or_Cos ;; -} -{ .mmf - nop.m 999 -(p7) ldfs FR_Neg_Two_to_M33 = [GR_Table_Base1], 4 -(p6) fmerge.se FR_c = f0, f0 -} -{ .mfi - nop.m 999 -(p6) fcmp.lt.unc.s1 p8, p9 = FR_Input_X, FR_Two_to_M3 - nop.i 999 ;; +{ .mmi +(p7) getf.sig GR_N_Inc = FR_N_float_signif +(p6) cmp.lt.unc p8,p0 = GR_exp_x, GR_exp_2_to_m3 // Is |x| < 2^-3 +(p6) tbit.z p9,p10 = GR_N_Inc, 0 // p9 if i_1=0, N mod 4 = 0,1 + // p10 if i_1=1, N mod 4 = 2,3 } -{ .mfi - nop.m 999 +;; + // // lf |x| < pi/4, is -2**(-3)< x < 2**(-3) - set p8. 
-// If |x| >= pi/4, -// Create the right N for |x| < pi/4 and otherwise +// If |x| >= pi/4, +// Create the right N for |x| < pi/4 and otherwise // Case 2: Place integer part of N in GP register // -(p7) fcvt.xf FR_N_float = FR_N_fix - nop.i 999 ;; -} -{ .mmf - nop.m 999 -(p7) getf.sig GR_N_Inc = FR_N_fix -(p8) fcmp.gt.s1 p8, p0 = FR_Input_X, FR_Neg_Two_to_M3 ;; -} -{ .mib - nop.m 999 - nop.i 999 -// -// Load 2**(-33), -2**(-33) -// -(p8) br.cond.spnt L(SINCOSL_SMALL_R) ;; + + +{ .mbb + nop.m 999 +(p8) br.cond.spnt SINCOSL_SMALL_R_0 // Branch if 0 < |x| < 2^-3 +(p6) br.cond.spnt SINCOSL_NORMAL_R_0 // Branch if 2^-3 <= |x| < pi/4 } -{ .mib - nop.m 999 - nop.i 999 -(p6) br.cond.sptk L(SINCOSL_NORMAL_R) ;; +;; + +// Here if pi/4 <= |x| < 2^24 +{ .mfi + ldfs FR_Neg_Two_to_M67 = [GR_ad_d], 8 // Load -2^-67 + fnma.s1 FR_s = FR_N_float, FR_P_1, FR_Input_X // s = -N * P_1 + Arg + add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos // Adjust N_Inc for sin/cos } -// -// if |x| < pi/4, branch based on |x| < 2**(-3) or otherwise. -// -// -// In this branch, |x| >= pi/4. 
-// { .mfi -(p0) ldfs FR_Neg_Two_to_M67 = [GR_Table_Base1], 8 -// -// Load -2**(-67) -// -(p0) fnma.s1 FR_s = FR_N_float, FR_P_1, FR_Input_X -// -// w = N * P_2 -// s = -N * P_1 + Arg -// -(p0) add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos + nop.m 999 + fma.s1 FR_w = FR_N_float, FR_P_2, f0 // w = N * P_2 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fma.s1 FR_w = FR_N_float, FR_P_2, f0 - nop.i 999 ;; + nop.m 999 + fms.s1 FR_r = FR_s, f1, FR_w // r = s - w, assume |s| >= 2^-33 + tbit.z p9,p10 = GR_N_Inc, 0 // p9 if i_1=0, N mod 4 = 0,1 + // p10 if i_1=1, N mod 4 = 2,3 } +;; + { .mfi - nop.m 999 -// -// Adjust N_fix by N_inc to determine whether sine or -// cosine is being calculated -// -(p0) fcmp.lt.unc.s1 p7, p6 = FR_s, FR_Two_to_M33 - nop.i 999 ;; + nop.m 999 + fcmp.lt.s1 p7, p6 = FR_s, FR_Two_to_M33 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p7) fcmp.gt.s1 p7, p6 = FR_s, FR_Neg_Two_to_M33 - nop.i 999 ;; + nop.m 999 +(p7) fcmp.gt.s1 p7, p6 = FR_s, FR_Neg_Two_to_M33 // p6 if |s| >= 2^-33, else p7 + nop.i 999 } +;; + { .mfi - nop.m 999 -// Remember x >= pi/4. 
-// Is s <= -2**(-33) or s >= 2**(-33) (p6) -// or -2**(-33) < s < 2**(-33) (p7) -(p6) fms.s1 FR_r = FR_s, f1, FR_w - nop.i 999 + nop.m 999 + fms.s1 FR_c = FR_s, f1, FR_r // c = s - r, for |s| >= 2^-33 + nop.i 999 } { .mfi - nop.m 999 -(p7) fma.s1 FR_w = FR_N_float, FR_P_3, f0 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_rsq = FR_r, FR_r, f0 // rsq = r * r, for |s| >= 2^-33 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p7) fma.s1 FR_U_1 = FR_N_float, FR_P_2, FR_w - nop.i 999 + nop.m 999 +(p7) fma.s1 FR_w = FR_N_float, FR_P_3, f0 + nop.i 999 } +;; + +{ .mmf +(p9) ldfe FR_C_1 = [GR_ad_pp], 16 // Load C_1 if i_1=0 +(p10) ldfe FR_S_1 = [GR_ad_qq], 16 // Load S_1 if i_1=1 + frcpa.s1 FR_r_hi, p15 = f1, FR_r // r_hi = frcpa(r) +} +;; + { .mfi - nop.m 999 -(p6) fms.s1 FR_c = FR_s, f1, FR_r - nop.i 999 ;; + nop.m 999 +(p6) fcmp.lt.unc.s1 p8, p13 = FR_r, FR_Two_to_M3 // If big s, test r with 2^-3 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// For big s: r = s - w: No futher reduction is necessary + nop.m 999 +(p7) fma.s1 FR_U_1 = FR_N_float, FR_P_2, FR_w + nop.i 999 +} +;; + +// +// For big s: r = s - w: No futher reduction is necessary // For small s: w = N * P_3 (change sign) More reduction // -(p6) fcmp.lt.unc.s1 p8, p9 = FR_r, FR_Two_to_M3 - nop.i 999 ;; +{ .mfi + nop.m 999 +(p8) fcmp.gt.s1 p8, p13 = FR_r, FR_Neg_Two_to_M3 // If big s, p8 if |r| < 2^-3 + nop.i 999 ;; } + { .mfi - nop.m 999 -(p8) fcmp.gt.s1 p8, p9 = FR_r, FR_Neg_Two_to_M3 - nop.i 999 ;; + nop.m 999 +(p9) fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7 // poly = rsq*PP_8+PP_7 if i_1=0 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7 // poly = rsq*QQ_8+QQ_7 if i_1=1 + nop.i 999 +} +;; + +{ .mfi + nop.m 999 (p7) fms.s1 FR_r = FR_s, f1, FR_U_1 - nop.i 999 + nop.i 999 } -{ .mfb - nop.m 999 +;; + +{ .mfi + nop.m 999 +(p6) fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 // rcubed = r * rsq + nop.i 999 +} +;; + +{ .mfi // // For big s: Is |r| < 2**(-3)? 
// For big s: c = S - r @@ -1095,355 +1125,356 @@ L(SINCOSL_CONTINUE): // If p9 is set, prepare to branch to Normal_R. // For big s, r is complete here. // -(p6) fms.s1 FR_c = FR_c, f1, FR_w -// +// // For big s: c = c + w (w has not been negated.) // For small s: r = S - U_1 // -(p8) br.cond.spnt L(SINCOSL_SMALL_R) ;; + nop.m 999 +(p6) fms.s1 FR_c = FR_c, f1, FR_w + nop.i 999 } -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.sptk L(SINCOSL_NORMAL_R) ;; +{ .mbb + nop.m 999 +(p8) br.cond.spnt SINCOSL_SMALL_R_1 // Branch if |s|>=2^-33, |r| < 2^-3, + // and pi/4 <= |x| < 2^24 +(p13) br.cond.sptk SINCOSL_NORMAL_R_1 // Branch if |s|>=2^-33, |r| >= 2^-3, + // and pi/4 <= |x| < 2^24 } -{ .mfi -(p7) add GR_Table_Base1 = 224, GR_Table_Base1 +;; + +SINCOSL_S_TINY: +// +// Here if |s| < 2^-33, and pi/4 <= |x| < 2^24 // -// Branch to SINCOSL_SMALL_R or SINCOSL_NORMAL_R +{ .mfi + fms.s1 FR_U_2 = FR_N_float, FR_P_2, FR_U_1 // -(p7) fms.s1 FR_U_2 = FR_N_float, FR_P_2, FR_U_1 -// // c = S - U_1 // r = S_1 * r // // -(p7) extr.u GR_i_1 = GR_N_Inc, 0, 1 ;; } +;; + { .mmi - nop.m 999 + nop.m 999 // // Get [i_0,i_1] - two lsb of N_fix_gr. // Do dummy fmpy so inexact is always set. // -(p7) cmp.eq.unc p9, p10 = 0x0, GR_i_1 -(p7) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; + tbit.z p9,p10 = GR_N_Inc, 0 // p9 if i_1=0, N mod 4 = 0,1 + // p10 if i_1=1, N mod 4 = 2,3 } -// +;; + +// // For small s: U_2 = N * P_2 - U_1 // S_1 stored constant - grab the one stored with the // coefficients. 
-// +// { .mfi -(p7) ldfe FR_S_1 = [GR_Table_Base1], 16 + ldfe FR_S_1 = [GR_ad_s1], 16 // // Check if i_1 and i_0 != 0 // -(p10) fma.s1 FR_poly = f0, f1, FR_Neg_Two_to_M67 -(p7) cmp.eq.unc p11, p12 = 0x0, GR_i_0 ;; +(p10) fma.s1 FR_poly = f0, f1, FR_Neg_Two_to_M67 + tbit.z p11,p12 = GR_N_Inc, 1 // p11 if i_0=0, N mod 4 = 0,2 + // p12 if i_0=1, N mod 4 = 1,3 } +;; + { .mfi - nop.m 999 -(p7) fms.s1 FR_s = FR_s, f1, FR_r - nop.i 999 + nop.m 999 + fms.s1 FR_s = FR_s, f1, FR_r + nop.i 999 } { .mfi - nop.m 999 -// + nop.m 999 +// // S = S - r // U_2 = U_2 + w // load S_1 // -(p7) fma.s1 FR_rsq = FR_r, FR_r, f0 - nop.i 999 ;; + fma.s1 FR_rsq = FR_r, FR_r, f0 + nop.i 999 ;; } { .mfi - nop.m 999 -(p7) fma.s1 FR_U_2 = FR_U_2, f1, FR_w - nop.i 999 + nop.m 999 + fma.s1 FR_U_2 = FR_U_2, f1, FR_w + nop.i 999 } { .mfi - nop.m 999 -(p7) fmerge.se FR_Input_X = FR_r, FR_r - nop.i 999 ;; + nop.m 999 + fmerge.se FR_tmp_result = FR_r, FR_r + nop.i 999 ;; } { .mfi - nop.m 999 -(p10) fma.s1 FR_Input_X = f0, f1, f1 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 FR_tmp_result = f0, f1, f1 + nop.i 999 ;; } { .mfi - nop.m 999 -// + nop.m 999 +// // FR_rsq = r * r // Save r as the result. 
// -(p7) fms.s1 FR_c = FR_s, f1, FR_U_1 - nop.i 999 ;; + fms.s1 FR_c = FR_s, f1, FR_U_1 + nop.i 999 ;; } { .mfi - nop.m 999 -// + nop.m 999 +// // if ( i_1 ==0) poly = c + S_1*r*r*r // else Result = 1 // -(p12) fnma.s1 FR_Input_X = FR_Input_X, f1, f0 - nop.i 999 +(p12) fnma.s1 FR_tmp_result = FR_tmp_result, f1, f0 + nop.i 999 } { .mfi - nop.m 999 -(p7) fma.s1 FR_r = FR_S_1, FR_r, f0 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_r = FR_S_1, FR_r, f0 + nop.i 999 ;; } { .mfi - nop.m 999 -(p7) fma.s0 FR_S_1 = FR_S_1, FR_S_1, f0 - nop.i 999 ;; + nop.m 999 + fma.s0 FR_S_1 = FR_S_1, FR_S_1, f0 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // If i_1 != 0, poly = 2**(-67) // -(p7) fms.s1 FR_c = FR_c, f1, FR_U_2 - nop.i 999 ;; + fms.s1 FR_c = FR_c, f1, FR_U_2 + nop.i 999 ;; } { .mfi - nop.m 999 -// + nop.m 999 +// // c = c - U_2 -// +// (p9) fma.s1 FR_poly = FR_r, FR_rsq, FR_c - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // i_0 != 0, so Result = -Result // -(p11) fma.s0 FR_Input_X = FR_Input_X, f1, FR_poly - nop.i 999 ;; +(p11) fma.s0 FR_Result = FR_tmp_result, f1, FR_poly + nop.i 999 ;; } { .mfb - nop.m 999 -(p12) fms.s0 FR_Input_X = FR_Input_X, f1, FR_poly + nop.m 999 +(p12) fms.s0 FR_Result = FR_tmp_result, f1, FR_poly // // if (i_0 == 0), Result = Result + poly // else Result = Result - poly // -(p0) br.ret.sptk b0 ;; -} -L(SINCOSL_LARGER_ARG): -{ .mfi - nop.m 999 -(p0) fma.s1 FR_N_0 = FR_Input_X, FR_Inv_P_0, f0 - nop.i 999 + br.ret.sptk b0 // Exit if |s| < 2^-33, and pi/4 <= |x| < 2^24 } ;; -// This path for argument > 2*24 -// Adjust table_ptr1 to beginning of table. 
+SINCOSL_LARGER_ARG: // - -{ .mmi - nop.m 999 -(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp - nop.i 999 -} -;; - -{ .mmi - ld8 GR_Table_Base = [GR_Table_Base] - nop.m 999 - nop.i 999 +// Here if 2^24 <= |x| < 2^63 +// +{ .mfi + ldfe FR_d_1 = [GR_ad_p], 16 // Load d_1 for |x| >= 2^24 path + fma.s1 FR_N_0 = FR_Input_X, FR_Inv_P_0, f0 + nop.i 999 } ;; - -// -// Point to 2*-14 +// // N_0 = Arg * Inv_P_0 // +// Load values 2**(-14) and -2**(-14) { .mmi -(p0) add GR_Table_Base = 688, GR_Table_Base ;; -(p0) ldfs FR_Two_to_M14 = [GR_Table_Base], 4 - nop.i 999 ;; + ldfps FR_Two_to_M14, FR_Neg_Two_to_M14 = [GR_ad_m14] + nop.i 999 ;; } { .mfi -(p0) ldfs FR_Neg_Two_to_M14 = [GR_Table_Base], 0 - nop.f 999 - nop.i 999 ;; + ldfe FR_d_2 = [GR_ad_p], 16 // Load d_2 for |x| >= 2^24 path + nop.f 999 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // -// Load values 2**(-14) and -2**(-14) // -(p0) fcvt.fx.s1 FR_N_0_fix = FR_N_0 - nop.i 999 ;; + fcvt.fx.s1 FR_N_0_fix = FR_N_0 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N_0_fix = integer part of N_0 // -(p0) fcvt.xf FR_N_0 = FR_N_0_fix - nop.i 999 ;; + fcvt.xf FR_N_0 = FR_N_0_fix + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // Make N_0 the integer part // -(p0) fnma.s1 FR_ArgPrime = FR_N_0, FR_P_0, FR_Input_X - nop.i 999 + fnma.s1 FR_ArgPrime = FR_N_0, FR_P_0, FR_Input_X + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 FR_w = FR_N_0, FR_d_1, f0 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_w = FR_N_0, FR_d_1, f0 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // Arg' = -N_0 * P_0 + Arg // w = N_0 * d_1 // -(p0) fma.s1 FR_N_float = FR_ArgPrime, FR_Inv_pi_by_2, f0 - nop.i 999 ;; + fma.s1 FR_N_float = FR_ArgPrime, FR_Inv_pi_by_2, f0 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // -// N = A' * 2/pi +// N = A' * 2/pi // -(p0) fcvt.fx.s1 FR_N_fix = FR_N_float - nop.i 999 ;; + fcvt.fx.s1 FR_N_fix = FR_N_float + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // -// N_fix is the integer part +// N_fix is the integer part 
// -(p0) fcvt.xf FR_N_float = FR_N_fix - nop.i 999 ;; + fcvt.xf FR_N_float = FR_N_fix + nop.i 999 ;; } { .mfi -(p0) getf.sig GR_N_Inc = FR_N_fix - nop.f 999 - nop.i 999 ;; + getf.sig GR_N_Inc = FR_N_fix + nop.f 999 + nop.i 999 ;; } { .mii - nop.m 999 - nop.i 999 ;; -(p0) add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos ;; + nop.m 999 + nop.i 999 ;; + add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos ;; } { .mfi - nop.m 999 + nop.m 999 // // N is the integer part of the reduced-reduced argument. // Put the integer in a GP register // -(p0) fnma.s1 FR_s = FR_N_float, FR_P_1, FR_ArgPrime - nop.i 999 + fnma.s1 FR_s = FR_N_float, FR_P_1, FR_ArgPrime + nop.i 999 } { .mfi - nop.m 999 -(p0) fnma.s1 FR_w = FR_N_float, FR_P_2, FR_w - nop.i 999 ;; + nop.m 999 + fnma.s1 FR_w = FR_N_float, FR_P_2, FR_w + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // s = -N*P_1 + Arg' // w = -N*P_2 + w // N_fix_gr = N_fix_gr + N_inc // -(p0) fcmp.lt.unc.s1 p9, p8 = FR_s, FR_Two_to_M14 - nop.i 999 ;; + fcmp.lt.unc.s1 p9, p8 = FR_s, FR_Two_to_M14 + nop.i 999 ;; } { .mfi - nop.m 999 -(p9) fcmp.gt.s1 p9, p8 = FR_s, FR_Neg_Two_to_M14 - nop.i 999 ;; + nop.m 999 +(p9) fcmp.gt.s1 p9, p8 = FR_s, FR_Neg_Two_to_M14 // p9 if |s| < 2^-14 + nop.i 999 ;; } + { .mfi - nop.m 999 + nop.m 999 // // For |s| > 2**(-14) r = S + w (r complete) // Else U_hi = N_0 * d_1 // (p9) fma.s1 FR_V_hi = FR_N_float, FR_P_2, f0 - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p9) fma.s1 FR_U_hi = FR_N_0, FR_d_1, f0 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // Either S <= -2**(-14) or S >= 2**(-14) // or -2**(-14) < s < 2**(-14) // (p8) fma.s1 FR_r = FR_s, f1, FR_w - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p9) fma.s1 FR_w = FR_N_float, FR_P_3, f0 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // We need abs of both U_hi and V_hi - don't // worry about switched sign of V_hi. 
// (p9) fms.s1 FR_A = FR_U_hi, f1, FR_V_hi - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // -// Big s: finish up c = (S - r) + w (c complete) +// Big s: finish up c = (S - r) + w (c complete) // Case 4: A = U_hi + V_hi // Note: Worry about switched sign of V_hi, so subtract instead of add. // (p9) fnma.s1 FR_V_lo = FR_N_float, FR_P_2, FR_V_hi - nop.i 999 ;; + nop.i 999 ;; } { .mmf - nop.m 999 - nop.m 999 + nop.m 999 + nop.m 999 (p9) fms.s1 FR_U_lo = FR_N_0, FR_d_1, FR_U_hi } { .mfi - nop.m 999 + nop.m 999 (p9) fmerge.s FR_V_hiabs = f0, FR_V_hi - nop.i 999 ;; + nop.i 999 ;; } +//{ .mfb +//(p9) fmerge.s f8= FR_V_lo,FR_V_lo +//(p9) br.ret.sptk b0 +//} +//;; { .mfi - nop.m 999 + nop.m 999 // For big s: c = S - r // For small s do more work: U_lo = N_0 * d_1 - U_hi // (p9) fmerge.s FR_U_hiabs = f0, FR_U_hi - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // -// For big s: Is |r| < 2**(-3) +// For big s: Is |r| < 2**(-3) // For big s: if p12 set, prepare to branch to Small_R. // For big s: If p13 set, prepare to branch to Normal_R. // -(p8) fms.s1 FR_c = FR_s, f1, FR_r - nop.i 999 ;; +(p8) fms.s1 FR_c = FR_s, f1, FR_r + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // For small S: V_hi = N * P_2 // w = N * P_3 @@ -1451,104 +1482,99 @@ L(SINCOSL_LARGER_ARG): // so (-) missing for V_hi and w. 
// (p8) fcmp.lt.unc.s1 p12, p13 = FR_r, FR_Two_to_M3 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fcmp.gt.s1 p12, p13 = FR_r, FR_Neg_Two_to_M3 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p8) fma.s1 FR_c = FR_c, f1, FR_w - nop.i 999 + nop.i 999 } { .mfb - nop.m 999 + nop.m 999 (p9) fms.s1 FR_w = FR_N_0, FR_d_2, FR_w -(p12) br.cond.spnt L(SINCOSL_SMALL_R) ;; +(p12) br.cond.spnt SINCOSL_SMALL_R // Branch if |r| < 2^-3 + // and 2^24 <= |x| < 2^63 } +;; + { .mib - nop.m 999 - nop.i 999 -(p13) br.cond.sptk L(SINCOSL_NORMAL_R) ;; + nop.m 999 + nop.i 999 +(p13) br.cond.sptk SINCOSL_NORMAL_R // Branch if |r| >= 2^-3 + // and 2^24 <= |x| < 2^63 } +;; + +SINCOSL_LARGER_S_TINY: +// +// Here if |s| < 2^-14, and 2^24 <= |x| < 2^63 +// { .mfi - nop.m 999 -// -// Big s: Vector off when |r| < 2**(-3). Recall that p8 will be true. + nop.m 999 +// +// Big s: Vector off when |r| < 2**(-3). Recall that p8 will be true. // The remaining stuff is for Case 4. // Small s: V_lo = N * P_2 + U_hi (U_hi is in place of V_hi in writeup) // Note: the (-) is still missing for V_lo. // Small s: w = w + N_0 * d_2 // Note: the (-) is now incorporated in w. 
// -(p9) fcmp.ge.unc.s1 p10, p11 = FR_U_hiabs, FR_V_hiabs -(p0) extr.u GR_i_1 = GR_N_Inc, 0, 1 + fcmp.ge.unc.s1 p7, p8 = FR_U_hiabs, FR_V_hiabs } { .mfi - nop.m 999 + nop.m 999 // // C_hi = S + A // -(p9) fma.s1 FR_t = FR_U_lo, f1, FR_V_lo -(p0) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; + fma.s1 FR_t = FR_U_lo, f1, FR_V_lo } +;; + { .mfi - nop.m 999 + nop.m 999 // -// t = U_lo + V_lo +// t = U_lo + V_lo // // -(p10) fms.s1 FR_a = FR_U_hi, f1, FR_A - nop.i 999 ;; +(p7) fms.s1 FR_a = FR_U_hi, f1, FR_A + nop.i 999 ;; } { .mfi - nop.m 999 -(p11) fma.s1 FR_a = FR_V_hi, f1, FR_A - nop.i 999 -} -;; - -{ .mmi - nop.m 999 -(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp - nop.i 999 -} -;; - -{ .mmi - ld8 GR_Table_Base = [GR_Table_Base] - nop.m 999 - nop.i 999 + nop.m 999 +(p8) fma.s1 FR_a = FR_V_hi, f1, FR_A + nop.i 999 } ;; - { .mfi -(p0) add GR_Table_Base = 528, GR_Table_Base // // Is U_hiabs >= V_hiabs? // -(p9) fma.s1 FR_C_hi = FR_s, f1, FR_A - nop.i 999 ;; + nop.m 999 + fma.s1 FR_C_hi = FR_s, f1, FR_A + nop.i 999 ;; } { .mmi -(p0) ldfe FR_C_1 = [GR_Table_Base], 16 ;; -(p0) ldfe FR_C_2 = [GR_Table_Base], 64 - nop.i 999 ;; + ldfe FR_C_1 = [GR_ad_c], 16 ;; + ldfe FR_C_2 = [GR_ad_c], 64 + nop.i 999 ;; } // // c = c + C_lo finished. // Load C_2 // { .mfi -(p0) ldfe FR_S_1 = [GR_Table_Base], 16 + ldfe FR_S_1 = [GR_ad_s], 16 // -// C_lo = S - C_hi +// C_lo = S - C_hi // -(p0) fma.s1 FR_t = FR_t, f1, FR_w - nop.i 999 ;; + fma.s1 FR_t = FR_t, f1, FR_w + nop.i 999 ;; } // // r and c have been computed. 
@@ -1558,855 +1584,695 @@ L(SINCOSL_LARGER_ARG): // Load S_1 // { .mfi -(p0) ldfe FR_S_2 = [GR_Table_Base], 64 + ldfe FR_S_2 = [GR_ad_s], 64 // -// t = t + w +// t = t + w // -(p10) fms.s1 FR_a = FR_a, f1, FR_V_hi -(p0) cmp.eq.unc p9, p10 = 0x0, GR_i_0 ;; +(p7) fms.s1 FR_a = FR_a, f1, FR_V_hi + tbit.z p9,p10 = GR_N_Inc, 0 // p9 if i_1=0, N mod 4 = 0,1 + // p10 if i_1=1, N mod 4 = 2,3 } +;; { .mfi - nop.m 999 + nop.m 999 // // For larger u than v: a = U_hi - A // Else a = V_hi - A (do an add to account for missing (-) on V_hi // -(p0) fms.s1 FR_C_lo = FR_s, f1, FR_C_hi - nop.i 999 ;; + fms.s1 FR_C_lo = FR_s, f1, FR_C_hi + nop.i 999 ;; } { .mfi - nop.m 999 -(p11) fms.s1 FR_a = FR_U_hi, f1, FR_a -(p0) cmp.eq.unc p11, p12 = 0x0, GR_i_1 ;; + nop.m 999 +(p8) fms.s1 FR_a = FR_U_hi, f1, FR_a + tbit.z p11,p12 = GR_N_Inc, 1 // p11 if i_0=0, N mod 4 = 0,2 + // p12 if i_0=1, N mod 4 = 1,3 } +;; + { .mfi - nop.m 999 + nop.m 999 // // If u > v: a = (U_hi - A) + V_hi // Else a = (V_hi - A) + U_hi // In each case account for negative missing from V_hi. 
// -(p0) fma.s1 FR_C_lo = FR_C_lo, f1, FR_A - nop.i 999 ;; + fma.s1 FR_C_lo = FR_C_lo, f1, FR_A + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // -// C_lo = (S - C_hi) + A +// C_lo = (S - C_hi) + A // -(p0) fma.s1 FR_t = FR_t, f1, FR_a - nop.i 999 ;; + fma.s1 FR_t = FR_t, f1, FR_a + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // -// t = t + a +// t = t + a // -(p0) fma.s1 FR_C_lo = FR_C_lo, f1, FR_t - nop.i 999 ;; + fma.s1 FR_C_lo = FR_C_lo, f1, FR_t + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // C_lo = C_lo + t -// Adjust Table_Base to beginning of table // -(p0) fma.s1 FR_r = FR_C_hi, f1, FR_C_lo - nop.i 999 ;; + fma.s1 FR_r = FR_C_hi, f1, FR_C_lo + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // Load S_2 // -(p0) fma.s1 FR_rsq = FR_r, FR_r, f0 - nop.i 999 + fma.s1 FR_rsq = FR_r, FR_r, f0 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // -// Table_Base points to C_1 // r = C_hi + C_lo // -(p0) fms.s1 FR_c = FR_C_hi, f1, FR_r - nop.i 999 ;; + fms.s1 FR_c = FR_C_hi, f1, FR_r + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // if i_1 ==0: poly = S_2 * FR_rsq + S_1 // else poly = C_2 * FR_rsq + C_1 // -(p11) fma.s1 FR_Input_X = f0, f1, FR_r - nop.i 999 ;; +(p9) fma.s1 FR_tmp_result = f0, f1, FR_r + nop.i 999 ;; } { .mfi - nop.m 999 -(p12) fma.s1 FR_Input_X = f0, f1, f1 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 FR_tmp_result = f0, f1, f1 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // -// Compute r_cube = FR_rsq * r +// Compute r_cube = FR_rsq * r // -(p11) fma.s1 FR_poly = FR_rsq, FR_S_2, FR_S_1 - nop.i 999 ;; +(p9) fma.s1 FR_poly = FR_rsq, FR_S_2, FR_S_1 + nop.i 999 ;; } { .mfi - nop.m 999 -(p12) fma.s1 FR_poly = FR_rsq, FR_C_2, FR_C_1 - nop.i 999 + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_C_2, FR_C_1 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // // Compute FR_rsq = r * r // Is i_1 == 0 ? 
// -(p0) fma.s1 FR_r_cubed = FR_rsq, FR_r, f0 - nop.i 999 ;; + fma.s1 FR_r_cubed = FR_rsq, FR_r, f0 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // c = C_hi - r // Load C_1 // -(p0) fma.s1 FR_c = FR_c, f1, FR_C_lo - nop.i 999 + fma.s1 FR_c = FR_c, f1, FR_C_lo + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // // if i_1 ==0: poly = r_cube * poly + c // else poly = FR_rsq * poly // -(p10) fms.s1 FR_Input_X = f0, f1, FR_Input_X - nop.i 999 ;; +(p12) fms.s1 FR_tmp_result = f0, f1, FR_tmp_result + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // if i_1 ==0: Result = r // else Result = 1.0 // -(p11) fma.s1 FR_poly = FR_r_cubed, FR_poly, FR_c - nop.i 999 ;; +(p9) fma.s1 FR_poly = FR_r_cubed, FR_poly, FR_c + nop.i 999 ;; } { .mfi - nop.m 999 -(p12) fma.s1 FR_poly = FR_rsq, FR_poly, f0 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // -// if i_0 !=0: Result = -Result +// if i_0 !=0: Result = -Result // -(p9) fma.s0 FR_Input_X = FR_Input_X, f1, FR_poly - nop.i 999 ;; +(p11) fma.s0 FR_Result = FR_tmp_result, f1, FR_poly + nop.i 999 ;; } { .mfb - nop.m 999 -(p10) fms.s0 FR_Input_X = FR_Input_X, f1, FR_poly + nop.m 999 +(p12) fms.s0 FR_Result = FR_tmp_result, f1, FR_poly // // if i_0 == 0: Result = Result + poly // else Result = Result - poly // -(p0) br.ret.sptk b0 ;; + br.ret.sptk b0 // Exit for |s| < 2^-14, and 2^24 <= |x| < 2^63 } -L(SINCOSL_SMALL_R): -{ .mii - nop.m 999 -(p0) extr.u GR_i_1 = GR_N_Inc, 0, 1 ;; +;; + + +SINCOSL_SMALL_R: // +// Here if |r| < 2^-3 +// +// Enter with r, c, and N_Inc computed // // Compare both i_1 and i_0 with 0. // if i_1 == 0, set p9. // if i_0 == 0, set p11. 
// -(p0) cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;; -} -{ .mfi - nop.m 999 -(p0) fma.s1 FR_rsq = FR_r, FR_r, f0 -(p0) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; -} + { .mfi - nop.m 999 -// -// Z = Z * FR_rsq -// -(p10) fnma.s1 FR_c = FR_c, FR_r, f0 -(p0) cmp.eq.unc p11, p12 = 0x0, GR_i_0 + nop.m 999 + fma.s1 FR_rsq = FR_r, FR_r, f0 // rsq = r * r + tbit.z p9,p10 = GR_N_Inc, 0 // p9 if i_1=0, N mod 4 = 0,2 + // p10 if i_1=1, N mod 4 = 1,3 } ;; -// ****************************************************************** -// ****************************************************************** -// ****************************************************************** -// r and c have been computed. -// We know whether this is the sine or cosine routine. -// Make sure ftz mode is set - should be automatic when using wre -// |r| < 2**(-3) -// -// Set table_ptr1 to beginning of constant table. -// Get [i_0,i_1] - two lsb of N_fix_gr. -// - { .mmi - nop.m 999 -(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp +(p9) ldfe FR_S_5 = [GR_ad_se], -16 // Load S_5 if i_1=0 +(p10) ldfe FR_C_5 = [GR_ad_ce], -16 // Load C_5 if i_1=1 nop.i 999 } ;; { .mmi - ld8 GR_Table_Base = [GR_Table_Base] - nop.m 999 +(p9) ldfe FR_S_4 = [GR_ad_se], -16 // Load S_4 if i_1=0 +(p10) ldfe FR_C_4 = [GR_ad_ce], -16 // Load C_4 if i_1=1 nop.i 999 } ;; - -// -// Set table_ptr1 to point to S_5. -// Set table_ptr1 to point to C_5. -// Compute FR_rsq = r * r -// -{ .mfi -(p9) add GR_Table_Base = 672, GR_Table_Base -(p10) fmerge.s FR_r = f1, f1 -(p10) add GR_Table_Base = 592, GR_Table_Base ;; +SINCOSL_SMALL_R_0: +// Entry point for 2^-3 < |x| < pi/4 +.pred.rel "mutex",p9,p10 +SINCOSL_SMALL_R_1: +// Entry point for pi/4 < |x| < 2^24 and |r| < 2^-3 +.pred.rel "mutex",p9,p10 +{ .mfi +(p9) ldfe FR_S_3 = [GR_ad_se], -16 // Load S_3 if i_1=0 + fma.s1 FR_Z = FR_rsq, FR_rsq, f0 // Z = rsq * rsq + nop.i 999 } -// -// Set table_ptr1 to point to S_5. -// Set table_ptr1 to point to C_5.
-// -{ .mmi -(p9) ldfe FR_S_5 = [GR_Table_Base], -16 ;; -// -// if (i_1 == 0) load S_5 -// if (i_1 != 0) load C_5 -// -(p9) ldfe FR_S_4 = [GR_Table_Base], -16 - nop.i 999 ;; +{ .mfi +(p10) ldfe FR_C_3 = [GR_ad_ce], -16 // Load C_3 if i_1=1 +(p10) fnma.s1 FR_c = FR_c, FR_r, f0 // c = -c * r if i_1=1 + nop.i 999 } +;; + { .mmf -(p10) ldfe FR_C_5 = [GR_Table_Base], -16 -// -// Z = FR_rsq * FR_rsq -// -(p9) ldfe FR_S_3 = [GR_Table_Base], -16 -// -// Compute FR_rsq = r * r -// if (i_1 == 0) load S_4 -// if (i_1 != 0) load C_4 -// -(p0) fma.s1 FR_Z = FR_rsq, FR_rsq, f0 ;; -} -// -// if (i_1 == 0) load S_3 -// if (i_1 != 0) load C_3 -// -{ .mmi -(p9) ldfe FR_S_2 = [GR_Table_Base], -16 ;; -// -// if (i_1 == 0) load S_2 -// if (i_1 != 0) load C_2 -// -(p9) ldfe FR_S_1 = [GR_Table_Base], -16 - nop.i 999 -} -{ .mmi -(p10) ldfe FR_C_4 = [GR_Table_Base], -16 ;; -(p10) ldfe FR_C_3 = [GR_Table_Base], -16 - nop.i 999 ;; +(p9) ldfe FR_S_2 = [GR_ad_se], -16 // Load S_2 if i_1=0 +(p10) ldfe FR_C_2 = [GR_ad_ce], -16 // Load C_2 if i_1=1 +(p10) fmerge.s FR_r = f1, f1 } +;; + { .mmi -(p10) ldfe FR_C_2 = [GR_Table_Base], -16 ;; -(p10) ldfe FR_C_1 = [GR_Table_Base], -16 - nop.i 999 -} -{ .mfi - nop.m 999 -// -// if (i_1 != 0): -// poly_lo = FR_rsq * C_5 + C_4 -// poly_hi = FR_rsq * C_2 + C_1 -// -(p9) fma.s1 FR_Z = FR_Z, FR_r, f0 - nop.i 999 ;; +(p9) ldfe FR_S_1 = [GR_ad_se], -16 // Load S_1 if i_1=0 +(p10) ldfe FR_C_1 = [GR_ad_ce], -16 // Load C_1 if i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// if (i_1 == 0) load S_1 -// if (i_1 != 0) load C_1 -// -(p9) fma.s1 FR_poly_lo = FR_rsq, FR_S_5, FR_S_4 - nop.i 999 + nop.m 999 +(p9) fma.s1 FR_Z = FR_Z, FR_r, f0 // Z = Z * r if i_1=0 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// c = -c * r -// dummy fmpy's to flag inexact.
-// -(p9) fma.s0 FR_S_4 = FR_S_4, FR_S_4, f0 - nop.i 999 ;; + nop.m 999 +(p9) fma.s1 FR_poly_lo = FR_rsq, FR_S_5, FR_S_4 // poly_lo=rsq*S_5+S_4 if i_1=0 + nop.i 999 } { .mfi - nop.m 999 -// -// poly_lo = FR_rsq * poly_lo + C_3 -// poly_hi = FR_rsq * poly_hi -// -(p0) fma.s1 FR_Z = FR_Z, FR_rsq, f0 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 FR_poly_lo = FR_rsq, FR_C_5, FR_C_4 // poly_lo=rsq*C_5+C_4 if i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p9) fma.s1 FR_poly_hi = FR_rsq, FR_S_2, FR_S_1 - nop.i 999 + nop.m 999 +(p9) fma.s1 FR_poly_hi = FR_rsq, FR_S_2, FR_S_1 // poly_hi=rsq*S_2+S_1 if i_1=0 + nop.i 999 } { .mfi - nop.m 999 -// -// if (i_1 == 0): -// poly_lo = FR_rsq * S_5 + S_4 -// poly_hi = FR_rsq * S_2 + S_1 -// -(p10) fma.s1 FR_poly_lo = FR_rsq, FR_C_5, FR_C_4 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 FR_poly_hi = FR_rsq, FR_C_2, FR_C_1 // poly_hi=rsq*C_2+C_1 if i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// if (i_1 == 0): -// Z = Z * r for only one of the small r cases - not there -// in original implementation notes. 
-// -(p9) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_S_3 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_Z = FR_Z, FR_rsq, f0 // Z = Z * rsq + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fma.s1 FR_poly_hi = FR_rsq, FR_C_2, FR_C_1 - nop.i 999 + nop.m 999 +(p9) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_S_3 // p_lo=p_lo*rsq+S_3, i_1=0 + nop.i 999 } { .mfi - nop.m 999 -(p10) fma.s0 FR_C_1 = FR_C_1, FR_C_1, f0 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_C_3 // p_lo=p_lo*rsq+C_3, i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p9) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 - nop.i 999 + nop.m 999 +(p9) fma.s0 FR_inexact = FR_S_4, FR_S_4, f0 // Dummy op to set inexact + tbit.z p11,p12 = GR_N_Inc, 1 // p11 if i_0=0, N mod 4 = 0,1 + // p12 if i_0=1, N mod 4 = 2,3 } { .mfi - nop.m 999 -// -// poly_lo = FR_rsq * poly_lo + S_3 -// poly_hi = FR_rsq * poly_hi -// -(p10) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_C_3 - nop.i 999 ;; + nop.m 999 +(p10) fma.s0 FR_inexact = FR_C_1, FR_C_1, f0 // Dummy op to set inexact + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 - nop.i 999 ;; + nop.m 999 +(p9) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 // p_hi=p_hi*rsq if i_1=0 + nop.i 999 } { .mfi - nop.m 999 -// -// if (i_1 == 0): dummy fmpy's to flag inexact -// r = 1 -// -(p9) fma.s1 FR_poly_hi = FR_r, FR_poly_hi, f0 - nop.i 999 + nop.m 999 +(p10) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 // p_hi=p_hi*rsq if i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// poly_hi = r * poly_hi -// -(p0) fma.s1 FR_poly = FR_Z, FR_poly_lo, FR_c - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly = FR_Z, FR_poly_lo, FR_c // poly=Z*poly_lo+c + nop.i 999 } +;; + { .mfi - nop.m 999 -(p12) fms.s1 FR_r = f0, f1, FR_r - nop.i 999 ;; + nop.m 999 +(p9) fma.s1 FR_poly_hi = FR_r, FR_poly_hi, f0 // p_hi=r*p_hi if i_1=0 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// poly_hi = Z * poly_lo + c -// if i_0 == 1: r = -r -// -(p0) fma.s1 FR_poly = FR_poly, f1,
FR_poly_hi - nop.i 999 ;; + nop.m 999 +(p12) fms.s1 FR_r = f0, f1, FR_r // r = -r if i_0=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p12) fms.s0 FR_Input_X = FR_r, f1, FR_poly - nop.i 999 + nop.m 999 + fma.s1 FR_poly = FR_poly, f1, FR_poly_hi // poly=poly+poly_hi + nop.i 999 } -{ .mfb - nop.m 999 -// -// poly = poly + poly_hi -// -(p11) fma.s0 FR_Input_X = FR_r, f1, FR_poly +;; + // // if (i_0 == 0) Result = r + poly // if (i_0 != 0) Result = r - poly // -(p0) br.ret.sptk b0 ;; -} -L(SINCOSL_NORMAL_R): -{ .mii - nop.m 999 -(p0) extr.u GR_i_1 = GR_N_Inc, 0, 1 ;; -// -// Set table_ptr1 and table_ptr2 to base address of -// constant table. -(p0) cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;; -} { .mfi - nop.m 999 -(p0) fma.s1 FR_rsq = FR_r, FR_r, f0 -(p0) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;; + nop.m 999 +(p11) fma.s0 FR_Result = FR_r, f1, FR_poly + nop.i 999 } -{ .mfi - nop.m 999 -(p0) frcpa.s1 FR_r_hi, p6 = f1, FR_r -(p0) cmp.eq.unc p11, p12 = 0x0, GR_i_0 +{ .mfb + nop.m 999 +(p12) fms.s0 FR_Result = FR_r, f1, FR_poly + br.ret.sptk b0 // Exit for |r| < 2^-3 } ;; -// ****************************************************************** -// ****************************************************************** -// ****************************************************************** + +SINCOSL_NORMAL_R: // -// r and c have been computed. -// We known whether this is the sine or cosine routine. -// Make sure ftz mode is set - should be automatic when using wre -// Get [i_0,i_1] - two lsb of N_fix_gr alone. 
+// Here if 2^-3 <= |r| < pi/4 +// THIS IS THE MAIN PATH // - -{ .mmi - nop.m 999 -(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp +// Enter with r, c, and N_Inc having been computed +// +{ .mfi + ldfe FR_PP_6 = [GR_ad_pp], 16 // Load PP_6 + fma.s1 FR_rsq = FR_r, FR_r, f0 // rsq = r * r + tbit.z p9,p10 = GR_N_Inc, 0 // p9 if i_1=0, N mod 4 = 0,2 + // p10 if i_1=1, N mod 4 = 1,3 +} +{ .mfi + ldfe FR_QQ_6 = [GR_ad_qq], 16 // Load QQ_6 + nop.f 999 nop.i 999 } ;; { .mmi - ld8 GR_Table_Base = [GR_Table_Base] - nop.m 999 +(p9) ldfe FR_PP_5 = [GR_ad_pp], 16 // Load PP_5 if i_1=0 +(p10) ldfe FR_QQ_5 = [GR_ad_qq], 16 // Load QQ_5 if i_1=1 nop.i 999 } ;; +SINCOSL_NORMAL_R_0: +// Entry for 2^-3 < |x| < pi/4 +.pred.rel "mutex",p9,p10 +{ .mmf +(p9) ldfe FR_C_1 = [GR_ad_pp], 16 // Load C_1 if i_1=0 +(p10) ldfe FR_S_1 = [GR_ad_qq], 16 // Load S_1 if i_1=1 + frcpa.s1 FR_r_hi, p6 = f1, FR_r // r_hi = frcpa(r) +} +;; { .mfi -(p10) add GR_Table_Base = 384, GR_Table_Base -(p12) fms.s1 FR_Input_X = f0, f1, f1 -(p9) add GR_Table_Base = 224, GR_Table_Base ;; + nop.m 999 +(p9) fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7 // poly = rsq*PP_8+PP_7 if i_1=0 + nop.i 999 } { .mfi -(p10) ldfe FR_QQ_8 = [GR_Table_Base], 16 -// -// if (i_1==0) poly = poly * FR_rsq + PP_1_lo -// else poly = FR_rsq * poly -// -(p11) fma.s1 FR_Input_X = f0, f1, f1 - nop.i 999 ;; -} -{ .mmb -(p10) ldfe FR_QQ_7 = [GR_Table_Base], 16 -// -// Adjust table pointers based on i_0 -// Compute rsq = r * r -// -(p9) ldfe FR_PP_8 = [GR_Table_Base], 16 - nop.b 999 ;; + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7 // poly = rsq*QQ_8+QQ_7 if i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 // rcubed = r * rsq + nop.i 999 } +;; + + +SINCOSL_NORMAL_R_1: +// Entry for pi/4 <= |x| < 2^24 +.pred.rel "mutex",p9,p10 { .mmf -(p9) ldfe FR_PP_7 = [GR_Table_Base], 16 -(p10) ldfe FR_QQ_6 = [GR_Table_Base], 16 -// -//
Load PP_8 and QQ_8; PP_7 and QQ_7 -// -(p0) frcpa.s1 FR_r_hi, p6 = f1, FR_r_hi ;; -} -// -// if (i_1==0) poly = PP_7 + FR_rsq * PP_8. -// else poly = QQ_7 + FR_rsq * QQ_8. -// -{ .mmb -(p9) ldfe FR_PP_6 = [GR_Table_Base], 16 -(p10) ldfe FR_QQ_5 = [GR_Table_Base], 16 - nop.b 999 ;; -} -{ .mmb -(p9) ldfe FR_PP_5 = [GR_Table_Base], 16 -(p10) ldfe FR_S_1 = [GR_Table_Base], 16 - nop.b 999 ;; -} -{ .mmb -(p10) ldfe FR_QQ_1 = [GR_Table_Base], 16 -(p9) ldfe FR_C_1 = [GR_Table_Base], 16 - nop.b 999 ;; -} -{ .mmb -(p10) ldfe FR_QQ_4 = [GR_Table_Base], 16 -(p9) ldfe FR_PP_1 = [GR_Table_Base], 16 - nop.b 999 ;; -} -{ .mmb -(p10) ldfe FR_QQ_3 = [GR_Table_Base], 16 -// -// if (i_1=0) corr = corr + c*c -// else corr = corr * c -// -(p9) ldfe FR_PP_4 = [GR_Table_Base], 16 - nop.b 999 ;; -} -{ .mfi - nop.m 999 -(p10) fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7 - nop.i 999 ;; -} -// -// if (i_1=0) poly = rsq * poly + PP_5 -// else poly = rsq * poly + QQ_5 -// Load PP_4 or QQ_4 -// -{ .mmi -(p9) ldfe FR_PP_3 = [GR_Table_Base], 16 ;; -(p10) ldfe FR_QQ_2 = [GR_Table_Base], 16 - nop.i 999 +(p9) ldfe FR_PP_1 = [GR_ad_pp], 16 // Load PP_1_hi if i_1=0 +(p10) ldfe FR_QQ_1 = [GR_ad_qq], 16 // Load QQ_1 if i_1=1 + frcpa.s1 FR_r_hi, p6 = f1, FR_r_hi // r_hi = frcpa(frcpa(r)) } +;; + { .mfi - nop.m 999 -// -// r_hi = frcpa(frcpa(r)). -// r_cube = r * FR_rsq. -// -(p9) fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7 - nop.i 999 ;; +(p9) ldfe FR_PP_4 = [GR_ad_pp], 16 // Load PP_4 if i_1=0 +(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_6 // poly = rsq*poly+PP_6 if i_1=0 + nop.i 999 } -// -// Do dummy multiplies so inexact is always set.
-// { .mfi -(p9) ldfe FR_PP_2 = [GR_Table_Base], 16 -// -// r_lo = r - r_hi -// -(p9) fma.s1 FR_U_lo = FR_r_hi, FR_r_hi, f0 - nop.i 999 ;; -} -{ .mbb -(p9) ldfe FR_PP_1_lo = [GR_Table_Base], 16 - nop.b 999 - nop.b 999 ;; +(p10) ldfe FR_QQ_4 = [GR_ad_qq], 16 // Load QQ_4 if i_1=1 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_6 // poly = rsq*poly+QQ_6 if i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fma.s1 FR_corr = FR_S_1, FR_r_cubed, FR_r - nop.i 999 + nop.m 999 +(p9) fma.s1 FR_corr = FR_C_1, FR_rsq, f0 // corr = C_1 * rsq if i_1=0 + nop.i 999 } { .mfi - nop.m 999 -(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_6 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 FR_corr = FR_S_1, FR_r_cubed, FR_r // corr = S_1 * r^3 + r if i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// if (i_1=0) U_lo = r_hi * r_hi -// else U_lo = r_hi + r -// -(p9) fma.s1 FR_corr = FR_C_1, FR_rsq, f0 - nop.i 999 ;; +(p9) ldfe FR_PP_3 = [GR_ad_pp], 16 // Load PP_3 if i_1=0 + fma.s1 FR_r_hi_sq = FR_r_hi, FR_r_hi, f0 // r_hi_sq = r_hi * r_hi + nop.i 999 } { .mfi - nop.m 999 -// -// if (i_1=0) corr = C_1 * rsq -// else corr = S_1 * r_cubed + r -// -(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_6 - nop.i 999 ;; +(p10) ldfe FR_QQ_3 = [GR_ad_qq], 16 // Load QQ_3 if i_1=1 + fms.s1 FR_r_lo = FR_r, f1, FR_r_hi // r_lo = r - r_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fma.s1 FR_U_lo = FR_r_hi, f1, FR_r - nop.i 999 +(p9) ldfe FR_PP_2 = [GR_ad_pp], 16 // Load PP_2 if i_1=0 +(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_5 // poly = rsq*poly+PP_5 if i_1=0 + nop.i 999 } { .mfi - nop.m 999 -// -// if (i_1=0) U_hi = r_hi + U_hi -// else U_hi = QQ_1 * U_hi + 1 -// -(p9) fma.s1 FR_U_lo = FR_r, FR_r_hi, FR_U_lo - nop.i 999 ;; +(p10) ldfe FR_QQ_2 = [GR_ad_qq], 16 // Load QQ_2 if i_1=1 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_5 // poly = rsq*poly+QQ_5 if i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// U_hi = r_hi * r_hi -// -(p0) fms.s1 FR_r_lo = FR_r, f1, FR_r_hi - nop.i 999 +(p9) ldfe FR_PP_1_lo = 
[GR_ad_pp], 16 // Load PP_1_lo if i_1=0 +(p9) fma.s1 FR_corr = FR_corr, FR_c, FR_c // corr = corr * c + c if i_1=0 + nop.i 999 } { .mfi - nop.m 999 -// -// Load PP_1, PP_6, PP_5, and C_1 -// Load QQ_1, QQ_6, QQ_5, and S_1 -// -(p0) fma.s1 FR_U_hi = FR_r_hi, FR_r_hi, f0 - nop.i 999 ;; + nop.m 999 +(p10) fnma.s1 FR_corr = FR_corr, FR_c, f0 // corr = -corr * c if i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_5 - nop.i 999 + nop.m 999 +(p9) fma.s1 FR_U_lo = FR_r, FR_r_hi, FR_r_hi_sq // U_lo = r*r_hi+r_hi_sq, i_1=0 + nop.i 999 } { .mfi - nop.m 999 -(p10) fnma.s1 FR_corr = FR_corr, FR_c, f0 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 FR_U_lo = FR_r_hi, f1, FR_r // U_lo = r_hi + r if i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// if (i_1=0) U_lo = r * r_hi + U_lo -// else U_lo = r_lo * U_lo -// -(p9) fma.s1 FR_corr = FR_corr, FR_c, FR_c - nop.i 999 ;; + nop.m 999 +(p9) fma.s1 FR_U_hi = FR_r_hi, FR_r_hi_sq, f0 // U_hi = r_hi*r_hi_sq if i_1=0 + nop.i 999 } { .mfi - nop.m 999 -(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_5 - nop.i 999 + nop.m 999 +(p10) fma.s1 FR_U_hi = FR_QQ_1, FR_r_hi_sq, f1 // U_hi = QQ_1*r_hi_sq+1, i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// if (i_1 =0) U_hi = r + U_hi -// if (i_1 =0) U_lo = r_lo * U_lo -// -// -(p9) fma.s0 FR_PP_5 = FR_PP_5, FR_PP_4, f0 - nop.i 999 ;; + nop.m 999 +(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_4 // poly = poly*rsq+PP_4 if i_1=0 + nop.i 999 } { .mfi - nop.m 999 -(p9) fma.s1 FR_U_lo = FR_r, FR_r, FR_U_lo - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_4 // poly = poly*rsq+QQ_4 if i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 - nop.i 999 ;; + nop.m 999 +(p9) fma.s1 FR_U_lo = FR_r, FR_r, FR_U_lo // U_lo = r * r + U_lo if i_1=0 + nop.i 999 } { .mfi - nop.m 999 -// -// if (i_1=0) poly = poly * rsq + PP_6 -// else poly = poly * rsq + QQ_6 -// -(p9) fma.s1 FR_U_hi = FR_r_hi, FR_U_hi, f0 - nop.i 999 + 
nop.m 999 +(p10) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 // U_lo = r_lo * U_lo if i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_4 - nop.i 999 ;; + nop.m 999 +(p9) fma.s1 FR_U_hi = FR_PP_1, FR_U_hi, f0 // U_hi = PP_1 * U_hi if i_1=0 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fma.s1 FR_U_hi = FR_QQ_1, FR_U_hi, f1 - nop.i 999 + nop.m 999 +(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_3 // poly = poly*rsq+PP_3 if i_1=0 + nop.i 999 } { .mfi - nop.m 999 -(p10) fma.s0 FR_QQ_5 = FR_QQ_5, FR_QQ_5, f0 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_3 // poly = poly*rsq+QQ_3 if i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// if (i_1!=0) U_hi = PP_1 * U_hi -// if (i_1!=0) U_lo = r * r + U_lo -// Load PP_3 or QQ_3 -// -(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_4 - nop.i 999 ;; + nop.m 999 +(p9) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 // U_lo = r_lo * U_lo if i_1=0 + nop.i 999 } { .mfi - nop.m 999 -(p9) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 FR_U_lo = FR_QQ_1,FR_U_lo, f0 // U_lo = QQ_1 * U_lo if i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fma.s1 FR_U_lo = FR_QQ_1,FR_U_lo, f0 - nop.i 999 ;; + nop.m 999 +(p9) fma.s1 FR_U_hi = FR_r, f1, FR_U_hi // U_hi = r + U_hi if i_1=0 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p9) fma.s1 FR_U_hi = FR_PP_1, FR_U_hi, f0 - nop.i 999 + nop.m 999 +(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_2 // poly = poly*rsq+PP_2 if i_1=0 + nop.i 999 } { .mfi - nop.m 999 -(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_3 - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_2 // poly = poly*rsq+QQ_2 if i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Load PP_2, QQ_2 -// -(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_3 - nop.i 999 ;; + nop.m 999 +(p9) fma.s1 FR_U_lo = FR_PP_1, FR_U_lo, f0 // U_lo = PP_1 * U_lo if i_1=0 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// if (i_1==0) poly = FR_rsq * poly + PP_3 -// else poly = 
FR_rsq * poly + QQ_3 -// Load PP_1_lo -// -(p9) fma.s1 FR_U_lo = FR_PP_1, FR_U_lo, f0 - nop.i 999 ;; + nop.m 999 +(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_1_lo // poly =poly*rsq+PP1lo i_1=0 + nop.i 999 } { .mfi - nop.m 999 -// -// if (i_1 =0) poly = poly * rsq + pp_r4 -// else poly = poly * rsq + qq_r4 -// -(p9) fma.s1 FR_U_hi = FR_r, f1, FR_U_hi - nop.i 999 + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 // poly = poly*rsq if i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_2 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_V = FR_U_lo, f1, FR_corr // V = U_lo + corr + tbit.z p11,p12 = GR_N_Inc, 1 // p11 if i_0=0, N mod 4 = 0,1 + // p12 if i_0=1, N mod 4 = 2,3 } +;; + { .mfi - nop.m 999 -// -// if (i_1==0) U_lo = PP_1_hi * U_lo -// else U_lo = QQ_1 * U_lo -// -(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_2 - nop.i 999 ;; + nop.m 999 +(p9) fma.s0 FR_inexact = FR_PP_5, FR_PP_4, f0 // Dummy op to set inexact + nop.i 999 } { .mfi - nop.m 999 -// -// if (i_0==0) Result = 1 -// else Result = -1 -// -(p0) fma.s1 FR_V = FR_U_lo, f1, FR_corr - nop.i 999 ;; + nop.m 999 +(p10) fma.s0 FR_inexact = FR_QQ_5, FR_QQ_5, f0 // Dummy op to set inexact + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 - nop.i 999 ;; + nop.m 999 +(p9) fma.s1 FR_poly = FR_r_cubed, FR_poly, f0 // poly = poly*r^3 if i_1=0 + nop.i 999 } { .mfi - nop.m 999 -// -// if (i_1==0) poly = FR_rsq * poly + PP_2 -// else poly = FR_rsq * poly + QQ_2 -// -(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_1_lo - nop.i 999 ;; + nop.m 999 +(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 // poly = poly*rsq if i_1=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 - nop.i 999 ;; + nop.m 999 +(p11) fma.s1 FR_tmp_result = f0, f1, f1// tmp_result=+1.0 if i_0=0 + nop.i 999 } { .mfi - nop.m 999 -// -// V = U_lo + corr -// -(p9) fma.s1 FR_poly = FR_r_cubed, FR_poly, f0 - nop.i 999 ;; + nop.m 999 +(p12) fms.s1 FR_tmp_result =
f0, f1, f1// tmp_result=-1.0 if i_0=1 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// if (i_1==0) poly = r_cube * poly -// else poly = FR_rsq * poly -// -(p0) fma.s1 FR_V = FR_poly, f1, FR_V - nop.i 999 ;; + nop.m 999 + fma.s1 FR_V = FR_poly, f1, FR_V // V = poly + V + nop.i 999 } +;; + +// If i_0 = 0 Result = U_hi + V +// If i_0 = 1 Result = -U_hi - V { .mfi - nop.m 999 -(p12) fms.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V - nop.i 999 + nop.m 999 +(p11) fma.s0 FR_Result = FR_tmp_result, FR_U_hi, FR_V + nop.i 999 } { .mfb - nop.m 999 -// -// V = V + poly -// -(p11) fma.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V -// -// if (i_0==0) Result = Result * U_hi + V -// else Result = Result * U_hi - V -// -(p0) br.ret.sptk b0 -};; - -// -// If cosine, FR_Input_X = 1 -// If sine, FR_Input_X = +/-Zero (Input FR_Input_X) -// Results are exact, no exceptions -// + nop.m 999 +(p12) fms.s0 FR_Result = FR_tmp_result, FR_U_hi, FR_V + br.ret.sptk b0 // Exit for 2^-3 <= |r| < pi/4 +} +;; -L(SINCOSL_ZERO): -{ .mbb -(p0) cmp.eq.unc p6, p7 = 0x1, GR_Sin_or_Cos - nop.b 999 - nop.b 999 ;; +SINCOSL_ZERO: +// Here if x = 0 +{ .mfi + cmp.eq.unc p6, p7 = 0x1, GR_Sin_or_Cos + nop.f 999 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p7) fmerge.s FR_Input_X = FR_Input_X, FR_Input_X - nop.i 999 + nop.m 999 +(p7) fmerge.s FR_Result = FR_Input_X, FR_Input_X // If sin, result = input + nop.i 999 } { .mfb - nop.m 999 -(p6) fmerge.s FR_Input_X = f1, f1 -(p0) br.ret.sptk b0 ;; + nop.m 999 +(p6) fma.s0 FR_Result = f1, f1, f0 // If cos, result=1.0 + br.ret.sptk b0 // Exit for x=0 } -L(SINCOSL_SPECIAL): +;; + + +SINCOSL_DENORMAL: +{ .mmb + getf.exp GR_signexp_x = FR_norm_x // Get sign and exponent of x + nop.m 999 + br.cond.sptk SINCOSL_COMMON // Return to common code +} +;; + +SINCOSL_SPECIAL: { .mfb nop.m 999 // @@ -2414,106 +2280,82 @@ L(SINCOSL_SPECIAL): // Invalid can be raised. 
SNaNs // become QNaNs // -(p0) fmpy.s0 FR_Input_X = FR_Input_X, f0 -(p0) br.ret.sptk b0 ;; + fmpy.s0 FR_Result = FR_Input_X, f0 + br.ret.sptk b0 ;; } -.endp cosl# -ASM_SIZE_DIRECTIVE(cosl#) -// Call int pi_by_2_reduce(double* x, double *y) -// for |arguments| >= 2**63 -// Address to save r and c as double -// -// sp+32 -> f0 -// r45 sp+16 -> f0 -// r44 -> sp -> InputX -// +GLOBAL_IEEE754_END(cosl) +// ******************************************************************* +// ******************************************************************* +// ******************************************************************* +// +// Special Code to handle very large argument case. +// Call int __libm_pi_by_2_reduce(x,r,c) for |arguments| >= 2**63 +// The interface is custom: +// On input: +// (Arg or x) is in f8 +// On output: +// r is in f8 +// c is in f9 +// N is in r8 +// Be sure to allocate at least 2 GP registers as output registers for +// __libm_pi_by_2_reduce. This routine uses r49-50. These are used as +// scratch registers within the __libm_pi_by_2_reduce routine (for speed). +// +// We know also that __libm_pi_by_2_reduce preserves f10-15, f71-127. We +// use this to eliminate save/restore of key fp registers in this calling +// function. 
+// +// ******************************************************************* +// ******************************************************************* +// ******************************************************************* -.proc __libm_callout -__libm_callout: -L(SINCOSL_ARG_TOO_LARGE): +LOCAL_LIBM_ENTRY(__libm_callout) +SINCOSL_ARG_TOO_LARGE: .prologue { .mfi - add r45=-32,sp // Parameter: r address nop.f 0 .save ar.pfs,GR_SAVE_PFS mov GR_SAVE_PFS=ar.pfs // Save ar.pfs -} -{ .mfi -.fframe 64 - add sp=-64,sp // Create new stack - nop.f 0 - mov GR_SAVE_GP=gp // Save gp };; + { .mmi - stfe [r45] = f0,16 // Clear Parameter r on stack - add r44 = 16,sp // Parameter x address + setf.exp FR_Two_to_M3 = GR_exp_2_to_m3 // Form 2^-3 + mov GR_SAVE_GP=gp // Save gp .save b0, GR_SAVE_B0 mov GR_SAVE_B0=b0 // Save b0 };; + .body +// +// Call argument reduction with x in f8 +// Returns with N in r8, r in f8, c in f9 +// Assumes f71-127 are preserved across the call +// { .mib - stfe [r45] = f0,-16 // Clear Parameter c on stack - nop.i 0 - nop.b 0 -} -{ .mib - stfe [r44] = FR_Input_X // Store Parameter x on stack + setf.exp FR_Neg_Two_to_M3 = GR_exp_m2_to_m3 // Form -(2^-3) nop.i 0 -(p0) br.call.sptk b0=__libm_pi_by_2_reduce# ;; + br.call.sptk b0=__libm_pi_by_2_reduce# };; -{ .mii -(p0) ldfe FR_Input_X =[r44],16 -// -// Get r and c off stack -// -(p0) adds GR_Table_Base1 = -16, GR_Table_Base1 -// -// Get r and c off stack -// -(p0) add GR_N_Inc = GR_Sin_or_Cos,r8 ;; -} -{ .mmb -(p0) ldfe FR_r =[r45],16 -// -// Get X off the stack -// Readjust Table ptr -// -(p0) ldfs FR_Two_to_M3 = [GR_Table_Base1],4 - nop.b 999 ;; -} -{ .mmb -(p0) ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1],0 -(p0) ldfe FR_c =[r45] - nop.b 999 ;; -} + { .mfi -.restore sp - add sp = 64,sp // Restore stack pointer -(p0) fcmp.lt.unc.s1 p6, p0 = FR_r, FR_Two_to_M3 + add GR_N_Inc = GR_Sin_or_Cos,r8 + fcmp.lt.unc.s1 p6, p0 = FR_r, FR_Two_to_M3 mov b0 = GR_SAVE_B0 // Restore return address };; -{ .mib + +{ .mfi mov gp = 
GR_SAVE_GP // Restore gp +(p6) fcmp.gt.unc.s1 p6, p0 = FR_r, FR_Neg_Two_to_M3 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - nop.b 0 };; -{ .mfi - nop.m 999 -(p6) fcmp.gt.unc.s1 p6, p0 = FR_r, FR_Neg_Two_to_M3 - nop.i 999 ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p6) br.cond.spnt L(SINCOSL_SMALL_R) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p0) br.cond.sptk L(SINCOSL_NORMAL_R) ;; -} -.endp __libm_callout -ASM_SIZE_DIRECTIVE(__libm_callout) + +{ .mbb + nop.m 999 +(p6) br.cond.spnt SINCOSL_SMALL_R // Branch if |r|< 2^-3 for |x| >= 2^63 + br.cond.sptk SINCOSL_NORMAL_R // Branch if |r|>=2^-3 for |x| >= 2^63 +};; + +.endp .type __libm_pi_by_2_reduce#,@function .global __libm_pi_by_2_reduce# diff --git a/sysdeps/ia64/fpu/s_expm1.S b/sysdeps/ia64/fpu/s_expm1.S index 19a237990c..41b9954ee8 100644 --- a/sysdeps/ia64/fpu/s_expm1.S +++ b/sysdeps/ia64/fpu/s_expm1.S @@ -1,10 +1,10 @@ .file "exp_m1.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2002, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,1694 +20,819 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// HISTORY -// 2/02/00 Initial Version -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. +// +// History +//============================================================== +// 02/02/00 Initial Version +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. 
+// 07/07/01 Improved speed of all paths +// 05/20/02 Cleaned up namespace and sf0 syntax +// 11/20/02 Improved speed, algorithm based on exp + +// API +//============================================================== +// double expm1(double) + +// Overview of operation +//============================================================== +// 1. Inputs of Nan, Inf, Zero, NatVal handled with special paths +// +// 2. |x| < 2^-60 +// Result = x, computed by x + x*x to handle appropriate flags and rounding // -// ********************************************************************* -// -// Function: Combined exp(x) and expm1(x), where -// x -// exp(x) = e , for double precision x values -// x -// expm1(x) = e - 1 for double precision x values -// -// ********************************************************************* -// -// Accuracy: Within .7 ulps for 80-bit floating point values -// Very accurate for double precision values -// -// ********************************************************************* -// -// Resources Used: -// -// Floating-Point Registers: f8 (Input and Return Value) -// f9,f32-f61, f99-f102 -// -// General Purpose Registers: -// r32-r61 -// r62-r65 (Used to pass arguments to error handling routine) -// -// Predicate Registers: p6-p15 -// -// ********************************************************************* -// -// IEEE Special Conditions: -// -// Denormal fault raised on denormal inputs -// Overflow exceptions raised when appropriate for exp and expm1 -// Underflow exceptions raised when appropriate for exp and expm1 -// (Error Handling Routine called for overflow and Underflow) -// Inexact raised when appropriate by algorithm -// -// exp(inf) = inf -// exp(-inf) = +0 -// exp(SNaN) = QNaN -// exp(QNaN) = QNaN -// exp(0) = 1 -// exp(EM_special Values) = QNaN -// exp(inf) = inf -// expm1(-inf) = -1 -// expm1(SNaN) = QNaN -// expm1(QNaN) = QNaN -// expm1(0) = 0 -// expm1(EM_special Values) = QNaN -// -// 
********************************************************************* -// -// Implementation and Algorithm Notes: -// -// ker_exp_64( in_FR : X, -// in_GR : Flag, -// in_GR : Expo_Range -// out_FR : Y_hi, -// out_FR : Y_lo, -// out_FR : scale, -// out_PR : Safe ) -// -// On input, X is in register format and -// Flag = 0 for exp, -// Flag = 1 for expm1, -// -// On output, provided X and X_cor are real numbers, then -// -// scale*(Y_hi + Y_lo) approximates exp(X) if Flag is 0 -// scale*(Y_hi + Y_lo) approximates exp(X)-1 if Flag is 1 -// -// The accuracy is sufficient for a highly accurate 64 sig. -// bit implementation. Safe is set if there is no danger of -// overflow/underflow when the result is composed from scale, -// Y_hi and Y_lo. Thus, we can have a fast return if Safe is set. -// Otherwise, one must prepare to handle the possible exception -// appropriately. Note that SAFE not set (false) does not mean -// that overflow/underflow will occur; only the setting of SAFE -// guarantees the opposite. -// -// **** High Level Overview **** -// -// The method consists of three cases. -// -// If |X| < Tiny use case exp_tiny; -// else if |X| < 2^(-6) use case exp_small; -// else use case exp_regular; -// -// Case exp_tiny: -// -// 1 + X can be used to approximate exp(X) or exp(X+X_cor); -// X + X^2/2 can be used to approximate exp(X) - 1 -// -// Case exp_small: -// -// Here, exp(X), exp(X+X_cor), and exp(X) - 1 can all be -// appproximated by a relatively simple polynomial. -// -// This polynomial resembles the truncated Taylor series -// -// exp(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n! -// -// Case exp_regular: -// -// Here we use a table lookup method. The basic idea is that in -// order to compute exp(X), we accurately decompose X into -// -// X = N * log(2)/(2^12) + r, |r| <= log(2)/2^13. -// -// Hence -// -// exp(X) = 2^( N / 2^12 ) * exp(r). 
-// -// The value 2^( N / 2^12 ) is obtained by simple combinations -// of values calculated beforehand and stored in table; exp(r) -// is approximated by a short polynomial because |r| is small. -// -// We elaborate this method in 4 steps. -// -// Step 1: Reduction -// -// The value 2^12/log(2) is stored as a double-extended number -// L_Inv. -// -// N := round_to_nearest_integer( X * L_Inv ) -// -// The value log(2)/2^12 is stored as two numbers L_hi and L_lo so -// that r can be computed accurately via -// -// r := (X - N*L_hi) - N*L_lo -// -// We pick L_hi such that N*L_hi is representable in 64 sig. bits -// and thus the FMA X - N*L_hi is error free. So r is the -// 1 rounding error from an exact reduction with respect to -// -// L_hi + L_lo. -// -// In particular, L_hi has 30 significant bit and can be stored -// as a double-precision number; L_lo has 64 significant bits and -// stored as a double-extended number. -// -// In the case Flag = 2, we further modify r by -// -// r := r + X_cor. -// -// Step 2: Approximation -// -// exp(r) - 1 is approximated by a short polynomial of the form -// -// r + A_1 r^2 + A_2 r^3 + A_3 r^4 . -// -// Step 3: Composition from Table Values -// -// The value 2^( N / 2^12 ) can be composed from a couple of tables -// of precalculated values. First, express N as three integers -// K, M_1, and M_2 as -// -// N = K * 2^12 + M_1 * 2^6 + M_2 -// -// Where 0 <= M_1, M_2 < 2^6; and K can be positive or negative. -// When N is represented in 2's complement, M_2 is simply the 6 -// lsb's, M_1 is the next 6, and K is simply N shifted right -// arithmetically (sign extended) by 12 bits. -// -// Now, 2^( N / 2^12 ) is simply -// -// 2^K * 2^( M_1 / 2^6 ) * 2^( M_2 / 2^12 ) -// -// Clearly, 2^K needs no tabulation. The other two values are less -// trivial because if we store each accurately to more than working -// precision, than its product is too expensive to calculate. We -// use the following method. 
-// -// Define two mathematical values, delta_1 and delta_2, implicitly -// such that -// -// T_1 = exp( [M_1 log(2)/2^6] - delta_1 ) -// T_2 = exp( [M_2 log(2)/2^12] - delta_2 ) -// -// are representable as 24 significant bits. To illustrate the idea, -// we show how we define delta_1: -// -// T_1 := round_to_24_bits( exp( M_1 log(2)/2^6 ) ) -// delta_1 = (M_1 log(2)/2^6) - log( T_1 ) -// -// The last equality means mathematical equality. We then tabulate -// -// W_1 := exp(delta_1) - 1 -// W_2 := exp(delta_2) - 1 -// -// Both in double precision. -// -// From the tabulated values T_1, T_2, W_1, W_2, we compose the values -// T and W via +// 3. 2^-60 <= |x| < 2^-2 +// Result determined by 13th order Taylor series polynomial +// expm1(x) = x + Q2*x^2 + ... + Q13*x^13 // -// T := T_1 * T_2 ...exactly -// W := W_1 + (1 + W_1)*W_2 +// 4. x < -48.0 +// Here we know result is essentially -1 + eps, where eps only affects +// rounded result. Set I. // -// W approximates exp( delta ) - 1 where delta = delta_1 + delta_2. -// The mathematical product of T and (W+1) is an accurate representation -// of 2^(M_1/2^6) * 2^(M_2/2^12). +// 5. x >= 709.7827 +// Result overflows. Set I, O, and call error support // -// Step 4. Reconstruction -// -// Finally, we can reconstruct exp(X), exp(X) - 1. -// Because -// -// X = K * log(2) + (M_1*log(2)/2^6 - delta_1) -// + (M_2*log(2)/2^12 - delta_2) -// + delta_1 + delta_2 + r ...accurately -// We have -// -// exp(X) ~=~ 2^K * ( T + T*[exp(delta_1+delta_2+r) - 1] ) -// ~=~ 2^K * ( T + T*[exp(delta + r) - 1] ) -// ~=~ 2^K * ( T + T*[(exp(delta)-1) -// + exp(delta)*(exp(r)-1)] ) -// ~=~ 2^K * ( T + T*( W + (1+W)*poly(r) ) ) -// ~=~ 2^K * ( Y_hi + Y_lo ) -// -// where Y_hi = T and Y_lo = T*(W + (1+W)*poly(r)) -// -// For exp(X)-1, we have -// -// exp(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1 -// ~=~ 2^K * ( Y_hi + Y_lo - 2^(-K) ) -// -// and we combine Y_hi + Y_lo - 2^(-N) into the form of two -// numbers Y_hi + Y_lo carefully. 
-// -// **** Algorithm Details **** -// -// A careful algorithm must be used to realize the mathematical ideas -// accurately. We describe each of the three cases. We assume SAFE -// is preset to be TRUE. -// -// Case exp_tiny: -// -// The important points are to ensure an accurate result under -// different rounding directions and a correct setting of the SAFE -// flag. -// -// If Flag is 1, then -// SAFE := False ...possibility of underflow -// Scale := 1.0 -// Y_hi := X -// Y_lo := 2^(-17000) -// Else -// Scale := 1.0 -// Y_hi := 1.0 -// Y_lo := X ...for different rounding modes -// Endif -// -// Case exp_small: -// -// Here we compute a simple polynomial. To exploit parallelism, we split -// the polynomial into several portions. -// -// Let r = X -// -// If Flag is not 1 ...i.e. exp( argument ) -// -// rsq := r * r; -// r4 := rsq*rsq -// poly_lo := P_3 + r*(P_4 + r*(P_5 + r*P_6)) -// poly_hi := r + rsq*(P_1 + r*P_2) -// Y_lo := poly_hi + r4 * poly_lo -// set lsb(Y_lo) to 1 -// Y_hi := 1.0 -// Scale := 1.0 -// -// Else ...i.e. exp( argument ) - 1 -// -// rsq := r * r -// r4 := rsq * rsq -// r6 := rsq * r4 -// poly_lo := r6*(Q_5 + r*(Q_6 + r*Q_7)) -// poly_hi := Q_1 + r*(Q_2 + r*(Q_3 + r*Q_4)) -// Y_lo := rsq*poly_hi + poly_lo -// set lsb(Y_lo) to 1 -// Y_hi := X -// Scale := 1.0 -// -// Endif -// -// Case exp_regular: -// -// The previous description contain enough information except the -// computation of poly and the final Y_hi and Y_lo in the case for -// exp(X)-1. -// -// The computation of poly for Step 2: -// -// rsq := r*r -// poly := r + rsq*(A_1 + r*(A_2 + r*A_3)) -// -// For the case exp(X) - 1, we need to incorporate 2^(-K) into -// Y_hi and Y_lo at the end of Step 4. 
-// -// If K > 10 then -// Y_lo := Y_lo - 2^(-K) -// Else -// If K < -10 then -// Y_lo := Y_hi + Y_lo -// Y_hi := -2^(-K) -// Else -// Y_hi := Y_hi - 2^(-K) -// End If -// End If -// - -#include "libm_support.h" - -GR_SAVE_PFS = r59 -GR_SAVE_B0 = r60 -GR_SAVE_GP = r61 - -GR_Parameter_X = r62 -GR_Parameter_Y = r63 -GR_Parameter_RESULT = r64 - -FR_X = f9 -FR_Y = f1 -FR_RESULT = f99 - -#ifdef _LIBC -.rodata -#else -.data -#endif - -.align 64 -Constants_exp_64_Arg: -ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object) -data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000 -data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000 -data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000 -// /* Inv_L, L_hi, L_lo */ -ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg) - -.align 64 -Constants_exp_64_Exponents: -ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object) -data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF -data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF -data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF -data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF -data4 0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF -data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF -ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents) - -.align 64 -Constants_exp_64_A: -ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object) -data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000 -data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000 -data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000 -// /* Reversed */ -ASM_SIZE_DIRECTIVE(Constants_exp_64_A) - -.align 64 -Constants_exp_64_P: -ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object) -data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000 -data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000 -data4 0x7474C518,0x88888888,0x00003FF8,0x00000000 -data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000 -data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000 -data4 0x000004C7,0x80000000,0x00003FFE,0x00000000 -// /* Reversed */ -ASM_SIZE_DIRECTIVE(Constants_exp_64_P) - -.align 64 -Constants_exp_64_Q: 
-ASM_TYPE_DIRECTIVE(Constants_exp_64_Q,@object) -data4 0xA49EF6CA,0xD00D56F7,0x00003FEF,0x00000000 -data4 0x1C63493D,0xD00D59AB,0x00003FF2,0x00000000 -data4 0xFB50CDD2,0xB60B60B5,0x00003FF5,0x00000000 -data4 0x7BA68DC8,0x88888888,0x00003FF8,0x00000000 -data4 0xAAAAAC8D,0xAAAAAAAA,0x00003FFA,0x00000000 -data4 0xAAAAACCA,0xAAAAAAAA,0x00003FFC,0x00000000 -data4 0x00000000,0x80000000,0x00003FFE,0x00000000 -// /* Reversed */ -ASM_SIZE_DIRECTIVE(Constants_exp_64_Q) - -.align 64 -Constants_exp_64_T1: -ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object) -data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29 -data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5 -data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC -data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D -data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA -data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516 -data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A -data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4 -data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B -data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD -data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15 -data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B -data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5 -data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A -data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177 -data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C -ASM_SIZE_DIRECTIVE(Constants_exp_64_T1) - -.align 64 -Constants_exp_64_T2: -ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object) -data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4 -data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7 -data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E -data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349 -data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987 -data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA -data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610 -data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A -data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8 -data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA 
-data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50 -data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA -data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07 -data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269 -data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE -data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37 -ASM_SIZE_DIRECTIVE(Constants_exp_64_T2) - -.align 64 -Constants_exp_64_W1: -ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object) -data4 0x00000000,0x00000000,0x171EC4B4,0xBE384454 -data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6 -data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA -data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50 -data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2 -data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE -data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B -data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04 -data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419 -data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376 -data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A -data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB -data4 0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E -data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA -data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08 -data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B -data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75 -data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79 -data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7 -data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087 -data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB -data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643 -data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C -data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D -data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873 -data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F -data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861 -data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0 -data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC -data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB -data4 
0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB -data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148 -ASM_SIZE_DIRECTIVE(Constants_exp_64_W1) - -.align 64 -Constants_exp_64_W2: -ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object) -data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25 -data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8 -data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A -data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E -data4 0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9 -data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2 -data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0 -data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509 -data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33 -data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D -data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87 -data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3 -data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9 -data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F -data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82 -data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4 -data4 0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D -data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030 -data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29 -data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED -data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B -data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893 -data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35 -data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C -data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313 -data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE -data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426 -data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550 -data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4 -data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31 -data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE -data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9 -ASM_SIZE_DIRECTIVE(Constants_exp_64_W2) +// 6. 2^-2 <= x < 709.7827 or -48.0 <= x < -2^-2 +// This is the main path. 
The algorithm is described below: -.section .text -.proc expm1# -.global expm1# -.align 64 - -expm1: -#ifdef _LIBC -.global __expm1# -__expm1: -#endif - - -{ .mii - alloc r32 = ar.pfs,0,30,4,0 -(p0) add r33 = 1, r0 -(p0) cmp.eq.unc p7, p0 = r0, r0 -} -;; - - -// -// Set p7 true for expm1 -// Set Flag = r33 = 1 for expm1 -// These are really no longer necesary, but are a remnant -// when this file had multiple entry points. -// They should be carefully removed +// Take the input x. w is "how many log2/128 in x?" +// w = x * 128/log2 +// n = int(w) +// x = n log2/128 + r + delta + +// n = 128M + index_1 + 2^4 index_2 +// x = M log2 + (log2/128) index_1 + (log2/8) index_2 + r + delta + +// exp(x) = 2^M 2^(index_1/128) 2^(index_2/8) exp(r) exp(delta) +// Construct 2^M +// Get 2^(index_1/128) from table_1; +// Get 2^(index_2/8) from table_2; +// Calculate exp(r) by series by 5th order polynomial +// r = x - n (log2/128)_high +// delta = - n (log2/128)_low +// Calculate exp(delta) as 1 + delta + + +// Special values +//============================================================== +// expm1(+0) = +0.0 +// expm1(-0) = -0.0 + +// expm1(+qnan) = +qnan +// expm1(-qnan) = -qnan +// expm1(+snan) = +qnan +// expm1(-snan) = -qnan + +// expm1(-inf) = -1.0 +// expm1(+inf) = +inf + +// Overflow and Underflow +//======================= +// expm1(x) = largest double normal when +// x = 709.7827 = 40862e42fefa39ef +// +// Underflow is handled as described in case 2 above. 
+ + +// Registers used +//============================================================== +// Floating Point registers used: +// f8, input +// f9 -> f15, f32 -> f75 + +// General registers used: +// r14 -> r40 + +// Predicate registers used: +// p6 -> p15 + +// Assembly macros +//============================================================== + +rRshf = r14 +rAD_TB1 = r15 +rAD_T1 = r15 +rAD_TB2 = r16 +rAD_T2 = r16 +rAD_Ln2_lo = r17 +rAD_P = r17 + +rN = r18 +rIndex_1 = r19 +rIndex_2_16 = r20 + +rM = r21 +rBiased_M = r21 +rIndex_1_16 = r22 +rSignexp_x = r23 +rExp_x = r24 +rSig_inv_ln2 = r25 + +rAD_Q1 = r26 +rAD_Q2 = r27 +rTmp = r27 +rExp_bias = r28 +rExp_mask = r29 +rRshf_2to56 = r30 + +rGt_ln = r31 +rExp_2tom56 = r31 + + +GR_SAVE_B0 = r33 +GR_SAVE_PFS = r34 +GR_SAVE_GP = r35 +GR_SAVE_SP = r36 + +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 + + +FR_X = f10 +FR_Y = f1 +FR_RESULT = f8 + +fRSHF_2TO56 = f6 +fINV_LN2_2TO63 = f7 +fW_2TO56_RSH = f9 +f2TOM56 = f11 +fP5 = f12 +fP54 = f50 +fP5432 = f50 +fP4 = f13 +fP3 = f14 +fP32 = f14 +fP2 = f15 + +fLn2_by_128_hi = f33 +fLn2_by_128_lo = f34 + +fRSHF = f35 +fNfloat = f36 +fW = f37 +fR = f38 +fF = f39 + +fRsq = f40 +fRcube = f41 + +f2M = f42 +fS1 = f43 +fT1 = f44 + +fMIN_DBL_OFLOW_ARG = f45 +fMAX_DBL_MINUS_1_ARG = f46 +fMAX_DBL_NORM_ARG = f47 +fP_lo = f51 +fP_hi = f52 +fP = f53 +fS = f54 + +fNormX = f56 + +fWre_urm_f8 = f57 + +fGt_pln = f58 +fTmp = f58 + +fS2 = f59 +fT2 = f60 +fSm1 = f61 + +fXsq = f62 +fX6 = f63 +fX4 = f63 +fQ7 = f64 +fQ76 = f64 +fQ7654 = f64 +fQ765432 = f64 +fQ6 = f65 +fQ5 = f66 +fQ54 = f66 +fQ4 = f67 +fQ3 = f68 +fQ32 = f68 +fQ2 = f69 +fQD = f70 +fQDC = f70 +fQDCBA = f70 +fQDCBA98 = f70 +fQDCBA98765432 = f70 +fQC = f71 +fQB = f72 +fQBA = f72 +fQA = f73 +fQ9 = f74 +fQ98 = f74 +fQ8 = f75 + +// Data tables +//============================================================== + +RODATA +.align 16 + +// ************* DO NOT CHANGE ORDER OF THESE TABLES ******************** 
+ +// double-extended 1/ln(2) +// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88 +// 3fff b8aa 3b29 5c17 f0bc +// For speed the significand will be loaded directly with a movl and setf.sig +// and the exponent will be bias+63 instead of bias+0. Thus subsequent +// computations need to scale appropriately. +// The constant 128/ln(2) is needed for the computation of w. This is also +// obtained by scaling the computations. +// +// Two shifting constants are loaded directly with movl and setf.d. +// 1. fRSHF_2TO56 = 1.1000..00 * 2^(63-7) +// This constant is added to x*1/ln2 to shift the integer part of +// x*128/ln2 into the rightmost bits of the significand. +// The result of this fma is fW_2TO56_RSH. +// 2. fRSHF = 1.1000..00 * 2^(63) +// This constant is subtracted from fW_2TO56_RSH * 2^(-56) to give +// the integer part of w, n, as a floating-point number. +// The result of this fms is fNfloat. + + +LOCAL_OBJECT_START(exp_Table_1) +data8 0x40862e42fefa39f0 // smallest dbl overflow arg +data8 0xc048000000000000 // approx largest arg for minus one result +data8 0x40862e42fefa39ef // largest dbl arg to give normal dbl result +data8 0x0 // pad +data8 0xb17217f7d1cf79ab , 0x00003ff7 // ln2/128 hi +data8 0xc9e3b39803f2f6af , 0x00003fb7 // ln2/128 lo +// +// Table 1 is 2^(index_1/128) where +// index_1 goes from 0 to 15 +// +data8 0x8000000000000000 , 0x00003FFF +data8 0x80B1ED4FD999AB6C , 0x00003FFF +data8 0x8164D1F3BC030773 , 0x00003FFF +data8 0x8218AF4373FC25EC , 0x00003FFF +data8 0x82CD8698AC2BA1D7 , 0x00003FFF +data8 0x8383594EEFB6EE37 , 0x00003FFF +data8 0x843A28C3ACDE4046 , 0x00003FFF +data8 0x84F1F656379C1A29 , 0x00003FFF +data8 0x85AAC367CC487B15 , 0x00003FFF +data8 0x8664915B923FBA04 , 0x00003FFF +data8 0x871F61969E8D1010 , 0x00003FFF +data8 0x87DB357FF698D792 , 0x00003FFF +data8 0x88980E8092DA8527 , 0x00003FFF +data8 0x8955EE03618E5FDD , 0x00003FFF +data8 0x8A14D575496EFD9A , 0x00003FFF +data8 0x8AD4C6452C728924 , 0x00003FFF +LOCAL_OBJECT_END(exp_Table_1) + +// 
Table 2 is 2^(index_2/8) where +// index_2 goes from 0 to 7 +LOCAL_OBJECT_START(exp_Table_2) +data8 0x8000000000000000 , 0x00003FFF +data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF +data8 0x9837F0518DB8A96F , 0x00003FFF +data8 0xA5FED6A9B15138EA , 0x00003FFF +data8 0xB504F333F9DE6484 , 0x00003FFF +data8 0xC5672A115506DADD , 0x00003FFF +data8 0xD744FCCAD69D6AF4 , 0x00003FFF +data8 0xEAC0C6E7DD24392F , 0x00003FFF +LOCAL_OBJECT_END(exp_Table_2) + + +LOCAL_OBJECT_START(exp_p_table) +data8 0x3f8111116da21757 //P5 +data8 0x3fa55555d787761c //P4 +data8 0x3fc5555555555414 //P3 +data8 0x3fdffffffffffd6a //P2 +LOCAL_OBJECT_END(exp_p_table) + +LOCAL_OBJECT_START(exp_Q1_table) +data8 0x3de6124613a86d09 // QD = 1/13! +data8 0x3e21eed8eff8d898 // QC = 1/12! +data8 0x3ec71de3a556c734 // Q9 = 1/9! +data8 0x3efa01a01a01a01a // Q8 = 1/8! +data8 0x8888888888888889,0x3ff8 // Q5 = 1/5! +data8 0xaaaaaaaaaaaaaaab,0x3ffc // Q3 = 1/3! +data8 0x0,0x0 // Pad to avoid bank conflicts +LOCAL_OBJECT_END(exp_Q1_table) + +LOCAL_OBJECT_START(exp_Q2_table) +data8 0x3e5ae64567f544e4 // QB = 1/11! +data8 0x3e927e4fb7789f5c // QA = 1/10! +data8 0x3f2a01a01a01a01a // Q7 = 1/7! +data8 0x3f56c16c16c16c17 // Q6 = 1/6! +data8 0xaaaaaaaaaaaaaaab,0x3ffa // Q4 = 1/4! +data8 0x8000000000000000,0x3ffe // Q2 = 1/2! +LOCAL_OBJECT_END(exp_Q2_table) +.section .text +GLOBAL_IEEE754_ENTRY(expm1) -{ .mfi -(p0) add r32 = 1,r0 -(p0) fnorm.s1 f9 = f8 - nop.i 999 +{ .mlx + getf.exp rSignexp_x = f8 // Must recompute if x unorm + movl rSig_inv_ln2 = 0xb8aa3b295c17f0bc // signif of 1/ln2 } - - -{ .mfi - nop.m 999 -(p0) fclass.m.unc p6, p8 = f8, 0x1E7 - nop.i 999 +{ .mlx + addl rAD_TB1 = @ltoff(exp_Table_1), gp + movl rRshf_2to56 = 0x4768000000000000 // 1.10000 2^(63+56) } +;; +// We do this fnorm right at the beginning to normalize +// any input unnormals so that SWA is not taken. 
{ .mfi - nop.m 999 -(p0) fclass.nm.unc p9, p0 = f8, 0x1FF - nop.i 999 + ld8 rAD_TB1 = [rAD_TB1] + fclass.m p6,p0 = f8,0x0b // Test for x=unorm + mov rExp_mask = 0x1ffff } - { .mfi - nop.m 999 -(p0) mov f36 = f1 - nop.i 999 ;; -} - -// -// Identify NatVals, NaNs, Infs, and Zeros. -// Identify EM unsupporteds. -// Save special input registers -// -// Create FR_X_cor = 0.0 -// GR_Flag = 0 -// GR_Expo_Range = 1 -// FR_Scale = 1.0 -// - -{ .mfb - nop.m 999 -(p0) mov f32 = f0 -(p6) br.cond.spnt EXP_64_SPECIAL ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.spnt EXP_64_UNSUPPORTED ;; -} - -// -// Branch out for special input values -// - -{ .mfi -(p0) cmp.ne.unc p12, p13 = 0x01, r33 -(p0) fcmp.lt.unc.s0 p9,p0 = f8, f0 -(p0) cmp.eq.unc p15, p0 = r0, r0 -} - -// -// Raise possible denormal operand exception -// Normalize x -// -// This function computes exp( x + x_cor) -// Input FR 1: FR_X -// Input FR 2: FR_X_cor -// Input GR 1: GR_Flag -// Input GR 2: GR_Expo_Range -// Output FR 3: FR_Y_hi -// Output FR 4: FR_Y_lo -// Output FR 5: FR_Scale -// Output PR 1: PR_Safe - -// -// Prepare to load constants -// Set Safe = True -// - -{ .mmi -(p0) addl r34 = @ltoff(Constants_exp_64_Arg#), gp -(p0) addl r40 = @ltoff(Constants_exp_64_W1#), gp -(p0) addl r41 = @ltoff(Constants_exp_64_W2#), gp -} -;; - -{ .mmi - ld8 r34 = [r34] - ld8 r40 = [r40] -(p0) addl r50 = @ltoff(Constants_exp_64_T1#), gp -} -;; - - -{ .mmi - ld8 r41 = [r41] -(p0) ldfe f37 = [r34],16 -(p0) addl r51 = @ltoff(Constants_exp_64_T2#), gp -} -;; - -// -// N = fcvt.fx(float_N) -// Set p14 if -6 > expo_X -// - - -// -// Bias = 0x0FFFF -// expo_X = expo_X and Mask -// - -// -// Load L_lo -// Set p10 if 14 < expo_X -// - -{ .mmi - ld8 r50 = [r50] -(p0) ldfe f40 = [r34],16 - nop.i 999 + mov rExp_bias = 0xffff + fnorm.s1 fNormX = f8 + mov rExp_2tom56 = 0xffff-56 } ;; -{ .mlx - nop.m 999 -(p0) movl r58 = 0x0FFFF -} -;; - -// -// Load W2_ptr -// Branch to SMALL is expo_X < -6 -// +// Form two constants we need +// 1/ln2 
* 2^63 to compute w = x * 1/ln2 * 128 +// 1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand -// -// float_N = X * L_Inv -// expo_X = exponent of X -// Mask = 0x1FFFF -// - -{ .mmi - ld8 r51 = [r51] -(p0) ldfe f41 = [r34],16 +{ .mfi + setf.sig fINV_LN2_2TO63 = rSig_inv_ln2 // form 1/ln2 * 2^63 + fclass.m p8,p0 = f8,0x07 // Test for x=0 + nop.i 0 } -;; - { .mlx -(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp -(p0) movl r39 = 0x1FFFF -} -;; - -{ .mmi - ld8 r34 = [r34] -(p0) getf.exp r37 = f9 - nop.i 999 + setf.d fRSHF_2TO56 = rRshf_2to56 // Form 1.100 * 2^(63+56) + movl rRshf = 0x43e8000000000000 // 1.10000 2^63 for rshift } ;; -{ .mii - nop.m 999 - nop.i 999 -(p0) and r37 = r37, r39 ;; -} - -{ .mmi -(p0) sub r37 = r37, r58 ;; -(p0) cmp.gt.unc p14, p0 = -6, r37 -(p0) cmp.lt.unc p10, p0 = 14, r37 ;; -} - { .mfi - nop.m 999 -// -// Load L_inv -// Set p12 true for Flag = 0 (exp) -// Set p13 true for Flag = 1 (expm1) -// -(p0) fmpy.s1 f38 = f9, f37 - nop.i 999 ;; + setf.exp f2TOM56 = rExp_2tom56 // form 2^-56 for scaling Nfloat + fclass.m p9,p0 = f8,0x22 // Test for x=-inf + add rAD_TB2 = 0x140, rAD_TB1 // Point to Table 2 } - -{ .mfb - nop.m 999 -// -// Load L_hi -// expo_X = expo_X - Bias -// get W1_ptr -// -(p0) fcvt.fx.s1 f39 = f38 -(p14) br.cond.spnt EXP_SMALL ;; -} - { .mib - nop.m 999 - nop.i 999 -(p10) br.cond.spnt EXP_HUGE ;; -} - -{ .mmi -(p0) shladd r34 = r32,4,r34 -(p0) addl r35 = @ltoff(Constants_exp_64_A#), gp - nop.i 999 + add rAD_Q1 = 0x1e0, rAD_TB1 // Point to Q table for small path + add rAD_Ln2_lo = 0x30, rAD_TB1 // Point to ln2_by_128_lo +(p6) br.cond.spnt EXPM1_UNORM // Branch if x unorm } ;; -{ .mmi - ld8 r35 = [r35] - nop.m 999 - nop.i 999 -} -;; - -// -// Load T_1,T_2 -// - -{ .mmb -(p0) ldfe f51 = [r35],16 -(p0) ld8 r45 = [r34],8 - nop.b 999 ;; -} -// -// Set Safe = True if k >= big_expo_neg -// Set Safe = False if k < big_expo_neg -// - -{ .mmb -(p0) ldfe f49 = [r35],16 -(p0) ld8 r48 = [r34],0 - nop.b 999 ;; -} - -{ 
.mfi - nop.m 999 -// -// Branch to HUGE is expo_X > 14 -// -(p0) fcvt.xf f38 = f39 - nop.i 999 ;; -} - +EXPM1_COMMON: { .mfi -(p0) getf.sig r52 = f39 - nop.f 999 - nop.i 999 ;; -} - -{ .mii - nop.m 999 -(p0) extr.u r43 = r52, 6, 6 ;; -// -// r = r - float_N * L_lo -// K = extr(N_fix,12,52) -// -(p0) shladd r40 = r43,3,r40 ;; -} - -{ .mfi -(p0) shladd r50 = r43,2,r50 -(p0) fnma.s1 f42 = f40, f38, f9 -// -// float_N = float(N) -// N_fix = signficand N -// -(p0) extr.u r42 = r52, 0, 6 -} - -{ .mmi -(p0) ldfd f43 = [r40],0 ;; -(p0) shladd r41 = r42,3,r41 -(p0) shladd r51 = r42,2,r51 -} -// -// W_1_p1 = 1 + W_1 -// - -{ .mmi -(p0) ldfs f44 = [r50],0 ;; -(p0) ldfd f45 = [r41],0 -// -// M_2 = extr(N_fix,0,6) -// M_1 = extr(N_fix,6,6) -// r = X - float_N * L_hi -// -(p0) extr r44 = r52, 12, 52 -} - -{ .mmi -(p0) ldfs f46 = [r51],0 ;; -(p0) sub r46 = r58, r44 -(p0) cmp.gt.unc p8, p15 = r44, r45 -} -// -// W = W_1 + W_1_p1*W_2 -// Load A_2 -// Bias_m_K = Bias - K -// - -{ .mii -(p0) ldfe f40 = [r35],16 -// -// load A_1 -// poly = A_2 + r*A_3 -// rsq = r * r -// neg_2_mK = exponent of Bias_m_k -// -(p0) add r47 = r58, r44 ;; -// -// Set Safe = True if k <= big_expo_pos -// Set Safe = False if k > big_expo_pos -// Load A_3 -// -(p15) cmp.lt p8,p15 = r44,r48 ;; -} - -{ .mmf -(p0) setf.exp f61 = r46 -// -// Bias_p + K = Bias + K -// T = T_1 * T_2 -// -(p0) setf.exp f36 = r47 -(p0) fnma.s1 f42 = f41, f38, f42 ;; + ldfpd fMIN_DBL_OFLOW_ARG, fMAX_DBL_MINUS_1_ARG = [rAD_TB1],16 + fclass.m p10,p0 = f8,0x1e1 // Test for x=+inf, NaN, NaT + add rAD_Q2 = 0x50, rAD_Q1 // Point to Q table for small path } - -{ .mfi - nop.m 999 -// -// Load W_1,W_2 -// Load big_exp_pos, load big_exp_neg -// -(p0) fadd.s1 f47 = f43, f1 - nop.i 999 ;; +{ .mfb + nop.m 0 + nop.f 0 +(p8) br.ret.spnt b0 // Exit for x=0, return x } +;; { .mfi - nop.m 999 -(p0) fma.s1 f52 = f42, f51, f49 - nop.i 999 + ldfd fMAX_DBL_NORM_ARG = [rAD_TB1],16 + nop.f 0 + and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x } - -{ 
.mfi - nop.m 999 -(p0) fmpy.s1 f48 = f42, f42 - nop.i 999 ;; +{ .mfb + setf.d fRSHF = rRshf // Form right shift const 1.100 * 2^63 +(p9) fms.d.s0 f8 = f0,f0,f1 // quick exit for x=-inf +(p9) br.ret.spnt b0 } +;; { .mfi - nop.m 999 -(p0) fmpy.s1 f53 = f44, f46 - nop.i 999 ;; + ldfpd fQD, fQC = [rAD_Q1], 16 // Load coeff for small path + nop.f 0 + sub rExp_x = rExp_x, rExp_bias // True exponent of x } - -{ .mfi - nop.m 999 -(p0) fma.s1 f54 = f45, f47, f43 - nop.i 999 +{ .mfb + ldfpd fQB, fQA = [rAD_Q2], 16 // Load coeff for small path +(p10) fma.d.s0 f8 = f8, f1, f0 // For x=+inf, NaN, NaT +(p10) br.ret.spnt b0 // Exit for x=+inf, NaN, NaT } +;; { .mfi - nop.m 999 -(p0) fneg f61 = f61 - nop.i 999 ;; + ldfpd fQ9, fQ8 = [rAD_Q1], 16 // Load coeff for small path + fma.s1 fXsq = fNormX, fNormX, f0 // x*x for small path + cmp.gt p7, p8 = -2, rExp_x // Test |x| < 2^(-2) } - { .mfi - nop.m 999 -(p0) fma.s1 f52 = f42, f52, f40 - nop.i 999 ;; + ldfpd fQ7, fQ6 = [rAD_Q2], 16 // Load coeff for small path + nop.f 0 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fadd.s1 f55 = f54, f1 - nop.i 999 + ldfe fQ5 = [rAD_Q1], 16 // Load coeff for small path + nop.f 0 + nop.i 0 } - -{ .mfi - nop.m 999 -// -// W + Wp1 * poly -// -(p0) mov f34 = f53 - nop.i 999 ;; +{ .mib + ldfe fQ4 = [rAD_Q2], 16 // Load coeff for small path +(p7) cmp.gt.unc p6, p7 = -60, rExp_x // Test |x| < 2^(-60) +(p7) br.cond.spnt EXPM1_SMALL // Branch if 2^-60 <= |x| < 2^-2 } +;; -{ .mfi - nop.m 999 -// -// A_1 + r * poly -// Scale = setf_exp(Bias_p_k) -// -(p0) fma.s1 f52 = f48, f52, f42 - nop.i 999 ;; -} +// W = X * Inv_log2_by_128 +// By adding 1.10...0*2^63 we shift and get round_int(W) in significand. +// We actually add 1.10...0*2^56 to X * Inv_log2 to do the same thing. 
{ .mfi - nop.m 999 -// -// poly = r + rsq(A_1 + r*poly) -// Wp1 = 1 + W -// neg_2_mK = -neg_2_mK -// -(p0) fma.s1 f35 = f55, f52, f54 - nop.i 999 ;; + ldfe fLn2_by_128_hi = [rAD_TB1],32 + fma.s1 fW_2TO56_RSH = fNormX, fINV_LN2_2TO63, fRSHF_2TO56 + nop.i 0 } - { .mfb - nop.m 999 -(p0) fmpy.s1 f35 = f35, f53 -// -// Y_hi = T -// Y_lo = T * (W + Wp1*poly) -// -(p12) br.cond.sptk EXP_MAIN ;; + ldfe fLn2_by_128_lo = [rAD_Ln2_lo] +(p6) fma.d.s0 f8 = f8, f8, f8 // If x < 2^-60, result=x+x*x +(p6) br.ret.spnt b0 // Exit if x < 2^-60 } -// -// Branch if exp(x) -// Continue for exp(x-1) -// +;; -{ .mii -(p0) cmp.lt.unc p12, p13 = 10, r44 - nop.i 999 ;; -// -// Set p12 if 10 < K, Else p13 -// -(p13) cmp.gt.unc p13, p14 = -10, r44 ;; -} +// Divide arguments into the following categories: +// Certain minus one p11 - -inf < x <= MAX_DBL_MINUS_1_ARG +// Possible Overflow p14 - MAX_DBL_NORM_ARG < x < MIN_DBL_OFLOW_ARG +// Certain Overflow p15 - MIN_DBL_OFLOW_ARG <= x < +inf // -// K > 10: Y_lo = Y_lo + neg_2_mK -// K <=10: Set p13 if -10 > K, Else set p14 +// If the input is really a double arg, then there will never be "Possible +// Overflow" arguments. 
// -{ .mfi -(p13) cmp.eq p15, p0 = r0, r0 -(p14) fadd.s1 f34 = f61, f34 - nop.i 999 ;; -} +// After that last load, rAD_TB1 points to the beginning of table 1 { .mfi - nop.m 999 -(p12) fadd.s1 f35 = f35, f61 - nop.i 999 ;; + nop.m 0 + fcmp.ge.s1 p15,p14 = fNormX,fMIN_DBL_OFLOW_ARG + nop.i 0 } +;; { .mfi - nop.m 999 -(p13) fadd.s1 f35 = f35, f34 - nop.i 999 + add rAD_P = 0x80, rAD_TB2 + fcmp.le.s1 p11,p0 = fNormX,fMAX_DBL_MINUS_1_ARG + nop.i 0 } +;; { .mfb - nop.m 999 -// -// K <= 10 and K < -10, Set Safe = True -// K <= 10 and K < 10, Y_lo = Y_hi + Y_lo -// K <= 10 and K > =-10, Y_hi = Y_hi + neg_2_mk -// -(p13) mov f34 = f61 -(p0) br.cond.sptk EXP_MAIN ;; -} -EXP_SMALL: - -{ .mmi -(p12) addl r35 = @ltoff(Constants_exp_64_P#), gp -(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp - nop.i 999 + ldfpd fP5, fP4 = [rAD_P] ,16 +(p14) fcmp.gt.unc.s1 p14,p0 = fNormX,fMAX_DBL_NORM_ARG +(p15) br.cond.spnt EXPM1_CERTAIN_OVERFLOW } ;; -{ .mmi -(p12) ld8 r35 = [r35] - ld8 r34 = [r34] - nop.i 999 -} -;; +// Nfloat = round_int(W) +// The signficand of fW_2TO56_RSH contains the rounded integer part of W, +// as a twos complement number in the lower bits (that is, it may be negative). +// That twos complement number (called N) is put into rN. +// Since fW_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56 +// before the shift constant 1.10000 * 2^63 is subtracted to yield fNfloat. 
+// Thus, fNfloat contains the floating point version of N -{ .mmi -(p13) addl r35 = @ltoff(Constants_exp_64_Q#), gp - nop.m 999 - nop.i 999 +{ .mfb + ldfpd fP3, fP2 = [rAD_P] + fms.s1 fNfloat = fW_2TO56_RSH, f2TOM56, fRSHF +(p11) br.cond.spnt EXPM1_CERTAIN_MINUS_ONE } ;; - -// -// Return -// K <= 10 and K < 10, Y_hi = neg_2_mk -// -// /*******************************************************/ -// /*********** Branch EXP_SMALL *************************/ -// /*******************************************************/ - { .mfi -(p13) ld8 r35 = [r35] -(p0) mov f42 = f9 -(p0) add r34 = 0x48,r34 + getf.sig rN = fW_2TO56_RSH + nop.f 0 + nop.i 0 } ;; -// -// Flag = 0 -// r4 = rsq * rsq -// +// rIndex_1 has index_1 +// rIndex_2_16 has index_2 * 16 +// rBiased_M has M +// rIndex_1_16 has index_1 * 16 +// r = x - Nfloat * ln2_by_128_hi +// f = 1 - Nfloat * ln2_by_128_lo { .mfi -(p0) ld8 r49 =[r34],0 - nop.f 999 - nop.i 999 ;; -} - -{ .mii - nop.m 999 - nop.i 999 ;; -// -// Flag = 1 -// -(p0) cmp.lt.unc p14, p0 = r37, r49 ;; -} - -{ .mfi - nop.m 999 -// -// r = X -// -(p0) fmpy.s1 f48 = f42, f42 - nop.i 999 ;; -} - -{ .mfb - nop.m 999 -// -// rsq = r * r -// -(p0) fmpy.s1 f50 = f48, f48 -// -// Is input very small? 
-// -(p14) br.cond.spnt EXP_VERY_SMALL ;; -} -// -// Flag_not1: Y_hi = 1.0 -// Flag is 1: r6 = rsq * r4 -// - -{ .mfi -(p12) ldfe f52 = [r35],16 -(p12) mov f34 = f1 -(p0) add r53 = 0x1,r0 ;; -} - -{ .mfi -(p13) ldfe f51 = [r35],16 -// -// Flag_not_1: Y_lo = poly_hi + r4 * poly_lo -// -(p13) mov f34 = f9 - nop.i 999 ;; -} - -{ .mmf -(p12) ldfe f53 = [r35],16 -// -// For Flag_not_1, Y_hi = X -// Scale = 1 -// Create 0x000...01 -// -(p0) setf.sig f37 = r53 -(p0) mov f36 = f1 ;; -} - -{ .mmi -(p13) ldfe f52 = [r35],16 ;; -(p12) ldfe f54 = [r35],16 - nop.i 999 ;; + and rIndex_1 = 0x0f, rN + fnma.s1 fR = fNfloat, fLn2_by_128_hi, fNormX + shr rM = rN, 0x7 } - { .mfi -(p13) ldfe f53 = [r35],16 -(p13) fmpy.s1 f58 = f48, f50 - nop.i 999 ;; + and rIndex_2_16 = 0x70, rN + fnma.s1 fF = fNfloat, fLn2_by_128_lo, f1 + nop.i 0 } -// -// Flag_not1: poly_lo = P_5 + r*P_6 -// Flag_1: poly_lo = Q_6 + r*Q_7 -// +;; -{ .mmi -(p13) ldfe f54 = [r35],16 ;; -(p12) ldfe f55 = [r35],16 - nop.i 999 ;; -} +// rAD_T1 has address of T1 +// rAD_T2 has address if T2 { .mmi -(p12) ldfe f56 = [r35],16 ;; -(p13) ldfe f55 = [r35],16 - nop.i 999 ;; + add rBiased_M = rExp_bias, rM + add rAD_T2 = rAD_TB2, rIndex_2_16 + shladd rAD_T1 = rIndex_1, 4, rAD_TB1 } +;; +// Create Scale = 2^M +// Load T1 and T2 { .mmi -(p12) ldfe f57 = [r35],0 ;; -(p13) ldfe f56 = [r35],16 - nop.i 999 ;; -} - -{ .mfi -(p13) ldfe f57 = [r35],0 - nop.f 999 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// For Flag_not_1, load p5,p6,p1,p2 -// Else load p5,p6,p1,p2 -// -(p12) fma.s1 f60 = f52, f42, f53 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p13) fma.s1 f60 = f51, f42, f52 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p12) fma.s1 f60 = f60, f42, f54 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p12) fma.s1 f59 = f56, f42, f57 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p13) fma.s1 f60 = f42, f60, f53 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p12) fma.s1 f59 = f59, f48, f42 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// Flag_1: poly_lo = Q_5 + 
r*(Q_6 + r*Q_7) -// Flag_not1: poly_lo = P_4 + r*(P_5 + r*P_6) -// Flag_not1: poly_hi = (P_1 + r*P_2) -// -(p13) fmpy.s1 f60 = f60, f58 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p12) fma.s1 f60 = f60, f42, f55 - nop.i 999 ;; + setf.exp f2M = rBiased_M + ldfe fT2 = [rAD_T2] + nop.i 0 } +;; { .mfi - nop.m 999 -// -// Flag_1: poly_lo = r6 *(Q_5 + ....) -// Flag_not1: poly_hi = r + rsq *(P_1 + r*P_2) -// -(p12) fma.s1 f35 = f60, f50, f59 - nop.i 999 + ldfe fT1 = [rAD_T1] + fmpy.s0 fTmp = fLn2_by_128_lo, fLn2_by_128_lo // Force inexact + nop.i 0 } +;; { .mfi - nop.m 999 -(p13) fma.s1 f59 = f54, f42, f55 - nop.i 999 ;; + nop.m 0 + fma.s1 fP54 = fR, fP5, fP4 + nop.i 0 } - { .mfi - nop.m 999 -// -// Flag_not1: Y_lo = rsq* poly_hi + poly_lo -// Flag_1: poly_lo = rsq* poly_hi + poly_lo -// -(p13) fma.s1 f59 = f59, f42, f56 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// Flag_not_1: (P_1 + r*P_2) -// -(p13) fma.s1 f59 = f59, f42, f57 - nop.i 999 ;; + nop.m 0 + fma.s1 fP32 = fR, fP3, fP2 + nop.i 0 } +;; { .mfi - nop.m 999 -// -// Flag_not_1: poly_hi = r + rsq * (P_1 + r*P_2) -// -(p13) fma.s1 f35 = f59, f48, f60 - nop.i 999 ;; + nop.m 0 + fma.s1 fRsq = fR, fR, f0 + nop.i 0 } +;; { .mfi - nop.m 999 -// -// Create 0.000...01 -// -(p0) for f37 = f35, f37 - nop.i 999 ;; -} - -{ .mfb - nop.m 999 -// -// Set lsb of Y_lo to 1 -// -(p0) fmerge.se f35 = f35,f37 -(p0) br.cond.sptk EXP_MAIN ;; -} -EXP_VERY_SMALL: - -{ .mmi - nop.m 999 -(p13) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp - nop.i 999;; + nop.m 0 + fma.s1 fP5432 = fRsq, fP54, fP32 + nop.i 0 } +;; { .mfi -(p13) ld8 r34 = [r34]; -(p12) mov f35 = f9 - nop.i 999 ;; -} - -{ .mfb - nop.m 999 -(p12) mov f34 = f1 -(p12) br.cond.sptk EXP_MAIN ;; -} - -{ .mlx -(p13) add r34 = 8,r34 -(p13) movl r39 = 0x0FFFE ;; + nop.m 0 + fma.s1 fS2 = fF,fT2,f0 + nop.i 0 } -// -// Load big_exp_neg -// Create 1/2's exponent -// - -{ .mii -(p13) setf.exp f56 = r39 -(p13) shladd r34 = r32,4,r34 ;; - nop.i 999 -} -// -// Negative exponents are stored 
after positive -// - { .mfi -(p13) ld8 r45 = [r34],0 -// -// Y_hi = x -// Scale = 1 -// -(p13) fmpy.s1 f35 = f9, f9 - nop.i 999 ;; + nop.m 0 + fma.s1 fS1 = f2M,fT1,f0 + nop.i 0 } +;; { .mfi - nop.m 999 -// -// Reset Safe if necessary -// Create 1/2 -// -(p13) mov f34 = f9 - nop.i 999 ;; + nop.m 0 + fma.s1 fP = fRsq, fP5432, fR + nop.i 0 } +;; { .mfi -(p13) cmp.lt.unc p0, p15 = r37, r45 -(p13) mov f36 = f1 - nop.i 999 ;; + nop.m 0 + fms.s1 fSm1 = fS1,fS2,f1 // S - 1.0 + nop.i 0 } - { .mfb - nop.m 999 -// -// Y_lo = x * x -// -(p13) fmpy.s1 f35 = f35, f56 -// -// Y_lo = x*x/2 -// -(p13) br.cond.sptk EXP_MAIN ;; -} -EXP_HUGE: - -{ .mfi - nop.m 999 -(p0) fcmp.gt.unc.s1 p14, p0 = f9, f0 - nop.i 999 -} - -{ .mlx - nop.m 999 -(p0) movl r39 = 0x15DC0 ;; -} - -{ .mfi -(p14) setf.exp f34 = r39 -(p14) mov f35 = f1 -(p14) cmp.eq p0, p15 = r0, r0 ;; + nop.m 0 + fma.s1 fS = fS1,fS2,f0 +(p14) br.cond.spnt EXPM1_POSSIBLE_OVERFLOW } +;; { .mfb - nop.m 999 -(p14) mov f36 = f34 -// -// If x > 0, Set Safe = False -// If x > 0, Y_hi = 2**(24,000) -// If x > 0, Y_lo = 1.0 -// If x > 0, Scale = 2**(24,000) -// -(p14) br.cond.sptk EXP_MAIN ;; -} - -{ .mlx - nop.m 999 -(p12) movl r39 = 0xA240 -} - -{ .mlx - nop.m 999 -(p12) movl r38 = 0xA1DC ;; -} - -{ .mmb -(p13) cmp.eq p15, p14 = r0, r0 -(p12) setf.exp f34 = r39 - nop.b 999 ;; -} - -{ .mlx -(p12) setf.exp f35 = r38 -(p13) movl r39 = 0xFF9C + nop.m 0 + fma.d.s0 f8 = fS, fP, fSm1 + br.ret.sptk b0 // Normal path exit } +;; -{ .mfi - nop.m 999 -(p13) fsub.s1 f34 = f0, f1 - nop.i 999 ;; +// Here if 2^-60 <= |x| <2^-2 +// Compute 13th order polynomial +EXPM1_SMALL: +{ .mmf + ldfe fQ3 = [rAD_Q1], 16 + ldfe fQ2 = [rAD_Q2], 16 + fma.s1 fX4 = fXsq, fXsq, f0 } +;; { .mfi - nop.m 999 -(p12) mov f36 = f34 -(p12) cmp.eq p0, p15 = r0, r0 ;; + nop.m 0 + fma.s1 fQDC = fQD, fNormX, fQC + nop.i 0 } - { .mfi -(p13) setf.exp f35 = r39 -(p13) mov f36 = f1 - nop.i 999 ;; + nop.m 0 + fma.s1 fQBA = fQB, fNormX, fQA + nop.i 0 } -EXP_MAIN: +;; { .mfi -(p0) 
cmp.ne.unc p12, p0 = 0x01, r33 -(p0) fmpy.s1 f101 = f36, f35 - nop.i 999 ;; + nop.m 0 + fma.s1 fQ98 = fQ9, fNormX, fQ8 + nop.i 0 } - -{ .mfb - nop.m 999 -(p0) fma.d.s0 f99 = f34, f36, f101 -(p15) br.cond.sptk EXP_64_RETURN;; -} - { .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x01 - nop.i 999 -} - -{ .mlx - nop.m 999 -(p0) movl r50 = 0x000000000103FF ;; -} -// -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// S3 user supplied status + RZ + TD (Underflows) -// -// -// If (Safe) is true, then -// Compute result using user supplied status field. -// No overflow or underflow here, but perhaps inexact. -// Return -// Else -// Determine if overflow or underflow was raised. -// Fetch +/- overflow threshold for IEEE single, double, -// double extended -// - -{ .mfi -(p0) setf.exp f60 = r50 -(p0) fma.d.s3 f102 = f34, f36, f101 - nop.i 999 + nop.m 0 + fma.s1 fQ76= fQ7, fNormX, fQ6 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x40 - nop.i 999 ;; + nop.m 0 + fma.s1 fQ54 = fQ5, fNormX, fQ4 + nop.i 0 } +;; { .mfi - nop.m 999 -// -// For Safe, no need to check for over/under. -// For expm1, handle errors like exp. -// -(p0) fsetc.s2 0x7F,0x42 - nop.i 999;; + nop.m 0 + fma.s1 fX6 = fX4, fXsq, f0 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.d.s2 f100 = f34, f36, f101 - nop.i 999 ;; + nop.m 0 + fma.s1 fQ32= fQ3, fNormX, fQ2 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fsetc.s2 0x7F,0x40 - nop.i 999 ;; + nop.m 0 + fma.s1 fQDCBA = fQDC, fXsq, fQBA + nop.i 0 } - { .mfi - nop.m 999 -(p7) fclass.m.unc p12, p0 = f102, 0x00F - nop.i 999 + nop.m 0 + fma.s1 fQ7654 = fQ76, fXsq, fQ54 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fclass.m.unc p11, p0 = f102, 0x00F - nop.i 999 ;; + nop.m 0 + fma.s1 fQDCBA98 = fQDCBA, fXsq, fQ98 + nop.i 0 } - { .mfi - nop.m 999 -(p7) fcmp.ge.unc.s1 p10, p0 = f100, f60 - nop.i 999 + nop.m 0 + fma.s1 fQ765432 = fQ7654, fXsq, fQ32 + nop.i 0 } +;; { .mfi - nop.m 999 -// -// Create largest double exponent + 1. 
-// Create smallest double exponent - 1. -// -(p0) fcmp.ge.unc.s1 p8, p0 = f100, f60 - nop.i 999 ;; -} -// -// fcmp: resultS2 >= + overflow threshold -> set (a) if true -// fcmp: resultS2 <= - overflow threshold -> set (b) if true -// fclass: resultS3 is denorm/unorm/0 -> set (d) if true -// - -{ .mib -(p10) mov r65 = 41 - nop.i 999 -(p10) br.cond.sptk __libm_error_region ;; -} - -{ .mib -(p8) mov r65 = 14 - nop.i 999 -(p8) br.cond.sptk __libm_error_region ;; + nop.m 0 + fma.s1 fQDCBA98765432 = fQDCBA98, fX6, fQ765432 + nop.i 0 } -// -// Report that exp overflowed -// +;; -{ .mib -(p12) mov r65 = 42 - nop.i 999 -(p12) br.cond.sptk __libm_error_region ;; +{ .mfb + nop.m 0 + fma.d.s0 f8 = fQDCBA98765432, fXsq, fNormX + br.ret.sptk b0 // Exit small branch } +;; -{ .mib -(p11) mov r65 = 15 - nop.i 999 -(p11) br.cond.sptk __libm_error_region ;; -} -{ .mib - nop.m 999 - nop.i 999 -// -// Report that exp underflowed -// -(p0) br.cond.sptk EXP_64_RETURN;; -} -EXP_64_SPECIAL: +EXPM1_POSSIBLE_OVERFLOW: -{ .mfi - nop.m 999 -(p0) fclass.m.unc p6, p0 = f8, 0x0c3 - nop.i 999 -} +// Here if fMAX_DBL_NORM_ARG < x < fMIN_DBL_OFLOW_ARG +// This cannot happen if input is a double, only if input higher precision. +// Overflow is a possibility, not a certainty. -{ .mfi - nop.m 999 -(p0) fclass.m.unc p13, p8 = f8, 0x007 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p7) fclass.m.unc p14, p0 = f8, 0x007 - nop.i 999 -} +// Recompute result using status field 2 with user's rounding mode, +// and wre set. 
If result is larger than largest double, then we have +// overflow { .mfi - nop.m 999 -(p0) fclass.m.unc p12, p9 = f8, 0x021 - nop.i 999 ;; + mov rGt_ln = 0x103ff // Exponent for largest dbl + 1 ulp + fsetc.s2 0x7F,0x42 // Get user's round mode, set wre + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fclass.m.unc p11, p0 = f8, 0x022 - nop.i 999 + setf.exp fGt_pln = rGt_ln // Create largest double + 1 ulp + fma.d.s2 fWre_urm_f8 = fS, fP, fSm1 // Result with wre set + nop.i 0 } +;; { .mfi - nop.m 999 -(p7) fclass.m.unc p10, p0 = f8, 0x022 - nop.i 999 ;; + nop.m 0 + fsetc.s2 0x7F,0x40 // Turn off wre in sf2 + nop.i 0 } +;; { .mfi - nop.m 999 -// -// Identify +/- 0, Inf, or -Inf -// Generate the right kind of NaN. -// -(p13) fadd.d.s0 f99 = f0, f1 - nop.i 999 ;; + nop.m 0 + fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow + nop.i 0 } +;; -{ .mfi - nop.m 999 -(p14) mov f99 = f8 - nop.i 999 ;; +{ .mfb + nop.m 0 + nop.f 0 +(p6) br.cond.spnt EXPM1_CERTAIN_OVERFLOW // Branch if overflow } +;; { .mfb - nop.m 999 -(p6) fadd.d.s0 f99 = f8, f1 -// -// exp(+/-0) = 1 -// expm1(+/-0) = +/-0 -// No exceptions raised -// -(p6) br.cond.sptk EXP_64_RETURN;; + nop.m 0 + fma.d.s0 f8 = fS, fP, fSm1 + br.ret.sptk b0 // Exit if really no overflow } +;; -{ .mib - nop.m 999 - nop.i 999 -(p14) br.cond.sptk EXP_64_RETURN;; +EXPM1_CERTAIN_OVERFLOW: +{ .mmi + sub rTmp = rExp_mask, r0, 1 +;; + setf.exp fTmp = rTmp + nop.i 0 } +;; { .mfi - nop.m 999 -(p11) mov f99 = f0 - nop.i 999 ;; + alloc r32=ar.pfs,1,4,4,0 + fmerge.s FR_X = f8,f8 + nop.i 0 } - { .mfb - nop.m 999 -(p10) fsub.d.s1 f99 = f0, f1 -// -// exp(-Inf) = 0 -// expm1(-Inf) = -1 -// No exceptions raised. -// -(p10) br.cond.sptk EXP_64_RETURN;; + mov GR_Parameter_TAG = 41 + fma.d.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result + br.cond.sptk __libm_error_region } +;; +// Here if x unorm +EXPM1_UNORM: { .mfb - nop.m 999 -(p12) fmpy.d.s1 f99 = f8, f1 -// -// exp(+Inf) = Inf -// No exceptions raised. 
-// -(p0) br.cond.sptk EXP_64_RETURN;; + getf.exp rSignexp_x = fNormX // Must recompute if x unorm + fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag + br.cond.sptk EXPM1_COMMON } +;; - -EXP_64_UNSUPPORTED: - -{ .mfb - nop.m 999 -(p0) fmpy.d.s0 f99 = f8, f0 - nop.b 0;; +// here if result will be -1 and inexact, x <= -48.0 +EXPM1_CERTAIN_MINUS_ONE: +{ .mmi + mov rTmp = 1 +;; + setf.exp fTmp = rTmp + nop.i 0 } +;; -EXP_64_RETURN: { .mfb - nop.m 999 -(p0) mov f8 = f99 -(p0) br.ret.sptk b0 + nop.m 0 + fms.d.s0 FR_RESULT = fTmp, fTmp, f1 // Set I, rounded -1+eps result + br.ret.sptk b0 } -.endp expm1 -ASM_SIZE_DIRECTIVE(expm1) +;; -.proc __libm_error_region -__libm_error_region: +GLOBAL_IEEE754_END(expm1) + +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue -// (1) { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value nop.f 0 @@ -1716,38 +841,32 @@ __libm_error_region: } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + add sp=-64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP=gp // Save gp };; - -// (2) { .mmi stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; - .body -// (3) { .mib - stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack + stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address - nop.b 0 + nop.b 0 } { .mib - stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack + stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; - -// (4) { .mmi ldfd f8 = [GR_Parameter_RESULT] // 
Get return result off stack .restore sp @@ -1760,9 +879,6 @@ __libm_error_region: br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) - - +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_expm1f.S b/sysdeps/ia64/fpu/s_expm1f.S index cc2c537ba2..0c5f2e67a8 100644 --- a/sysdeps/ia64/fpu/s_expm1f.S +++ b/sysdeps/ia64/fpu/s_expm1f.S @@ -1,10 +1,10 @@ -.file "exp_m1f.s" +.file "expf_m1.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2002, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,1735 +20,649 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// HISTORY -// 2/02/00 Initial Version -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. + +// History +//********************************************************************* +// 02/02/00 Initial Version +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. 
+// 07/07/01 Improved speed of all paths +// 05/20/02 Cleaned up namespace and sf0 syntax +// 11/20/02 Improved speed, algorithm based on expf // -// ********************************************************************* -// -// Function: Combined expf(x) and expm1f(x), where -// x -// expf(x) = e , for single precision x values -// x -// expm1f(x) = e - 1 for single precision x values -// -// ********************************************************************* -// -// Accuracy: Within .7 ulps for 80-bit floating point values -// Very accurate for single precision values -// -// ********************************************************************* -// -// Resources Used: -// -// Floating-Point Registers: f8 (Input and Return Value) -// f9,f32-f61, f99-f102 -// -// General Purpose Registers: -// r32-r61 -// r62-r65 (Used to pass arguments to error handling routine) -// -// Predicate Registers: p6-p15 -// -// ********************************************************************* -// -// IEEE Special Conditions: -// -// Denormal fault raised on denormal inputs -// Overflow exceptions raised when appropriate for exp and expm1 -// Underflow exceptions raised when appropriate for exp and expm1 -// (Error Handling Routine called for overflow and Underflow) -// Inexact raised when appropriate by algorithm -// -// expf(inf) = inf -// expf(-inf) = +0 -// expf(SNaN) = QNaN -// expf(QNaN) = QNaN -// expf(0) = 1 -// expf(EM_special Values) = QNaN -// expf(inf) = inf -// expm1f(-inf) = -1 -// expm1f(SNaN) = QNaN -// expm1f(QNaN) = QNaN -// expm1f(0) = 0 -// expm1f(EM_special Values) = QNaN -// -// ********************************************************************* -// -// Implementation and Algorithm Notes: -// -// ker_exp_64( in_FR : X, -// in_GR : Flag, -// in_GR : Expo_Range -// out_FR : Y_hi, -// out_FR : Y_lo, -// out_FR : scale, -// out_PR : Safe ) -// -// On input, X is in register format and -// Flag = 0 for exp, -// Flag = 1 for expm1, -// -// On output, provided X and 
X_cor are real numbers, then -// -// scale*(Y_hi + Y_lo) approximates expf(X) if Flag is 0 -// scale*(Y_hi + Y_lo) approximates expf(X)-1 if Flag is 1 -// -// The accuracy is sufficient for a highly accurate 64 sig. -// bit implementation. Safe is set if there is no danger of -// overflow/underflow when the result is composed from scale, -// Y_hi and Y_lo. Thus, we can have a fast return if Safe is set. -// Otherwise, one must prepare to handle the possible exception -// appropriately. Note that SAFE not set (false) does not mean -// that overflow/underflow will occur; only the setting of SAFE -// guarantees the opposite. -// -// **** High Level Overview **** -// -// The method consists of three cases. -// -// If |X| < Tiny use case exp_tiny; -// else if |X| < 2^(-6) use case exp_small; -// else use case exp_regular; -// -// Case exp_tiny: -// -// 1 + X can be used to approximate expf(X) or expf(X+X_cor); -// X + X^2/2 can be used to approximate expf(X) - 1 -// -// Case exp_small: -// -// Here, expf(X), expf(X+X_cor), and expf(X) - 1 can all be -// appproximated by a relatively simple polynomial. -// -// This polynomial resembles the truncated Taylor series -// -// expf(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n! -// -// Case exp_regular: -// -// Here we use a table lookup method. The basic idea is that in -// order to compute expf(X), we accurately decompose X into -// -// X = N * log(2)/(2^12) + r, |r| <= log(2)/2^13. -// -// Hence -// -// expf(X) = 2^( N / 2^12 ) * expf(r). -// -// The value 2^( N / 2^12 ) is obtained by simple combinations -// of values calculated beforehand and stored in table; expf(r) -// is approximated by a short polynomial because |r| is small. -// -// We elaborate this method in 4 steps. -// -// Step 1: Reduction -// -// The value 2^12/log(2) is stored as a double-extended number -// L_Inv. 
-// -// N := round_to_nearest_integer( X * L_Inv ) -// -// The value log(2)/2^12 is stored as two numbers L_hi and L_lo so -// that r can be computed accurately via -// -// r := (X - N*L_hi) - N*L_lo -// -// We pick L_hi such that N*L_hi is representable in 64 sig. bits -// and thus the FMA X - N*L_hi is error free. So r is the -// 1 rounding error from an exact reduction with respect to -// -// L_hi + L_lo. -// -// In particular, L_hi has 30 significant bit and can be stored -// as a double-precision number; L_lo has 64 significant bits and -// stored as a double-extended number. -// -// In the case Flag = 2, we further modify r by -// -// r := r + X_cor. -// -// Step 2: Approximation -// -// expf(r) - 1 is approximated by a short polynomial of the form -// -// r + A_1 r^2 + A_2 r^3 + A_3 r^4 . -// -// Step 3: Composition from Table Values -// -// The value 2^( N / 2^12 ) can be composed from a couple of tables -// of precalculated values. First, express N as three integers -// K, M_1, and M_2 as -// -// N = K * 2^12 + M_1 * 2^6 + M_2 -// -// Where 0 <= M_1, M_2 < 2^6; and K can be positive or negative. -// When N is represented in 2's complement, M_2 is simply the 6 -// lsb's, M_1 is the next 6, and K is simply N shifted right -// arithmetically (sign extended) by 12 bits. -// -// Now, 2^( N / 2^12 ) is simply -// -// 2^K * 2^( M_1 / 2^6 ) * 2^( M_2 / 2^12 ) -// -// Clearly, 2^K needs no tabulation. The other two values are less -// trivial because if we store each accurately to more than working -// precision, than its product is too expensive to calculate. We -// use the following method. -// -// Define two mathematical values, delta_1 and delta_2, implicitly -// such that -// -// T_1 = expf( [M_1 log(2)/2^6] - delta_1 ) -// T_2 = expf( [M_2 log(2)/2^12] - delta_2 ) -// -// are representable as 24 significant bits. 
To illustrate the idea, -// we show how we define delta_1: -// -// T_1 := round_to_24_bits( expf( M_1 log(2)/2^6 ) ) -// delta_1 = (M_1 log(2)/2^6) - log( T_1 ) -// -// The last equality means mathematical equality. We then tabulate -// -// W_1 := expf(delta_1) - 1 -// W_2 := expf(delta_2) - 1 -// -// Both in double precision. -// -// From the tabulated values T_1, T_2, W_1, W_2, we compose the values -// T and W via -// -// T := T_1 * T_2 ...exactly -// W := W_1 + (1 + W_1)*W_2 -// -// W approximates expf( delta ) - 1 where delta = delta_1 + delta_2. -// The mathematical product of T and (W+1) is an accurate representation -// of 2^(M_1/2^6) * 2^(M_2/2^12). -// -// Step 4. Reconstruction -// -// Finally, we can reconstruct expf(X), expf(X) - 1. -// Because -// -// X = K * log(2) + (M_1*log(2)/2^6 - delta_1) -// + (M_2*log(2)/2^12 - delta_2) -// + delta_1 + delta_2 + r ...accurately -// We have -// -// expf(X) ~=~ 2^K * ( T + T*[expf(delta_1+delta_2+r) - 1] ) -// ~=~ 2^K * ( T + T*[expf(delta + r) - 1] ) -// ~=~ 2^K * ( T + T*[(expf(delta)-1) -// + expf(delta)*(expf(r)-1)] ) -// ~=~ 2^K * ( T + T*( W + (1+W)*poly(r) ) ) -// ~=~ 2^K * ( Y_hi + Y_lo ) -// -// where Y_hi = T and Y_lo = T*(W + (1+W)*poly(r)) -// -// For expf(X)-1, we have -// -// expf(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1 -// ~=~ 2^K * ( Y_hi + Y_lo - 2^(-K) ) -// -// and we combine Y_hi + Y_lo - 2^(-N) into the form of two -// numbers Y_hi + Y_lo carefully. -// -// **** Algorithm Details **** -// -// A careful algorithm must be used to realize the mathematical ideas -// accurately. We describe each of the three cases. We assume SAFE -// is preset to be TRUE. -// -// Case exp_tiny: -// -// The important points are to ensure an accurate result under -// different rounding directions and a correct setting of the SAFE -// flag. 
-// -// If Flag is 1, then -// SAFE := False ...possibility of underflow -// Scale := 1.0 -// Y_hi := X -// Y_lo := 2^(-17000) -// Else -// Scale := 1.0 -// Y_hi := 1.0 -// Y_lo := X ...for different rounding modes -// Endif -// -// Case exp_small: -// -// Here we compute a simple polynomial. To exploit parallelism, we split -// the polynomial into several portions. -// -// Let r = X -// -// If Flag is not 1 ...i.e. expf( argument ) -// -// rsq := r * r; -// r4 := rsq*rsq -// poly_lo := P_3 + r*(P_4 + r*(P_5 + r*P_6)) -// poly_hi := r + rsq*(P_1 + r*P_2) -// Y_lo := poly_hi + r4 * poly_lo -// set lsb(Y_lo) to 1 -// Y_hi := 1.0 -// Scale := 1.0 -// -// Else ...i.e. expf( argument ) - 1 -// -// rsq := r * r -// r4 := rsq * rsq -// r6 := rsq * r4 -// poly_lo := r6*(Q_5 + r*(Q_6 + r*Q_7)) -// poly_hi := Q_1 + r*(Q_2 + r*(Q_3 + r*Q_4)) -// Y_lo := rsq*poly_hi + poly_lo -// set lsb(Y_lo) to 1 -// Y_hi := X -// Scale := 1.0 -// -// Endif -// -// Case exp_regular: -// -// The previous description contain enough information except the -// computation of poly and the final Y_hi and Y_lo in the case for -// expf(X)-1. -// -// The computation of poly for Step 2: -// -// rsq := r*r -// poly := r + rsq*(A_1 + r*(A_2 + r*A_3)) -// -// For the case expf(X) - 1, we need to incorporate 2^(-K) into -// Y_hi and Y_lo at the end of Step 4. -// -// If K > 10 then -// Y_lo := Y_lo - 2^(-K) -// Else -// If K < -10 then -// Y_lo := Y_hi + Y_lo -// Y_hi := -2^(-K) -// Else -// Y_hi := Y_hi - 2^(-K) -// End If -// End If // +// API +//********************************************************************* +// float expm1f(float) +// +// Overview of operation +//********************************************************************* +// 1. Inputs of Nan, Inf, Zero, NatVal handled with special paths +// +// 2. |x| < 2^-40 +// Result = x, computed by x + x*x to handle appropriate flags and rounding +// +// 3. 
2^-40 <= |x| < 2^-2 +// Result determined by 8th order Taylor series polynomial +// expm1f(x) = x + A2*x^2 + ... + A8*x^8 +// +// 4. x < -24.0 +// Here we know result is essentially -1 + eps, where eps only affects +// rounded result. Set I. +// +// 5. x >= 88.7228 +// Result overflows. Set I, O, and call error support +// +// 6. 2^-2 <= x < 88.7228 or -24.0 <= x < -2^-2 +// This is the main path. The algorithm is described below: + +// Take the input x. w is "how many log2/128 in x?" +// w = x * 64/log2 +// NJ = int(w) +// x = NJ*log2/64 + R + +// NJ = 64*n + j +// x = n*log2 + (log2/64)*j + R +// +// So, exp(x) = 2^n * 2^(j/64)* exp(R) +// +// T = 2^n * 2^(j/64) +// Construct 2^n +// Get 2^(j/64) table +// actually all the entries of 2^(j/64) table are stored in DP and +// with exponent bits set to 0 -> multiplication on 2^n can be +// performed by doing logical "or" operation with bits presenting 2^n + +// exp(R) = 1 + (exp(R) - 1) +// P = exp(R) - 1 approximated by Taylor series of 3rd degree +// P = A3*R^3 + A2*R^2 + R, A3 = 1/6, A2 = 1/2 +// + +// The final result is reconstructed as follows +// expm1f(x) = T*P + (T - 1.0) + +// Special values +//********************************************************************* +// expm1f(+0) = +0.0 +// expm1f(-0) = -0.0 + +// expm1f(+qnan) = +qnan +// expm1f(-qnan) = -qnan +// expm1f(+snan) = +qnan +// expm1f(-snan) = -qnan + +// expm1f(-inf) = -1.0 +// expm1f(+inf) = +inf + +// Overflow and Underflow +//********************************************************************* +// expm1f(x) = largest single normal when +// x = 88.7228 = 0x42b17217 +// +// Underflow is handled as described in case 2 above. 
+ + +// Registers used +//********************************************************************* +// Floating Point registers used: +// f8, input +// f6,f7, f9 -> f15, f32 -> f45 + +// General registers used: +// r3, r20 -> r38 + +// Predicate registers used: +// p9 -> p15 + +// Assembly macros +//********************************************************************* +// integer registers used +// scratch +rNJ = r3 + +rExp_half = r20 +rSignexp_x = r21 +rExp_x = r22 +rExp_mask = r23 +rExp_bias = r24 +rTmp = r25 +rM1_lim = r25 +rGt_ln = r25 +rJ = r26 +rN = r27 +rTblAddr = r28 +rLn2Div64 = r29 +rRightShifter = r30 +r64DivLn2 = r31 +// stacked +GR_SAVE_PFS = r32 +GR_SAVE_B0 = r33 +GR_SAVE_GP = r34 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Parameter_TAG = r38 + +// floating point registers used +FR_X = f10 +FR_Y = f1 +FR_RESULT = f8 +// scratch +fRightShifter = f6 +f64DivLn2 = f7 +fNormX = f9 +fNint = f10 +fN = f11 +fR = f12 +fLn2Div64 = f13 +fA2 = f14 +fA3 = f15 +// stacked +fP = f32 +fX3 = f33 +fT = f34 +fMIN_SGL_OFLOW_ARG = f35 +fMAX_SGL_NORM_ARG = f36 +fMAX_SGL_MINUS_1_ARG = f37 +fA4 = f38 +fA43 = f38 +fA432 = f38 +fRSqr = f39 +fA5 = f40 +fTmp = f41 +fGt_pln = f41 +fXsq = f41 +fA7 = f42 +fA6 = f43 +fA65 = f43 +fTm1 = f44 +fA8 = f45 +fA87 = f45 +fA8765 = f45 +fA8765432 = f45 +fWre_urm_f8 = f45 + +RODATA +.align 16 +LOCAL_OBJECT_START(_expf_table) +data8 0x3efa01a01a01a01a // A8 = 1/8! +data8 0x3f2a01a01a01a01a // A7 = 1/7! +data8 0x3f56c16c16c16c17 // A6 = 1/6! +data8 0x3f81111111111111 // A5 = 1/5! +data8 0x3fa5555555555555 // A4 = 1/4! +data8 0x3fc5555555555555 // A3 = 1/3! 
+// +data4 0x42b17218 // Smallest sgl arg to overflow sgl result +data4 0x42b17217 // Largest sgl arg to give sgl result +// +// 2^(j/64) table, j goes from 0 to 63 +data8 0x0000000000000000 // 2^(0/64) +data8 0x00002C9A3E778061 // 2^(1/64) +data8 0x000059B0D3158574 // 2^(2/64) +data8 0x0000874518759BC8 // 2^(3/64) +data8 0x0000B5586CF9890F // 2^(4/64) +data8 0x0000E3EC32D3D1A2 // 2^(5/64) +data8 0x00011301D0125B51 // 2^(6/64) +data8 0x0001429AAEA92DE0 // 2^(7/64) +data8 0x000172B83C7D517B // 2^(8/64) +data8 0x0001A35BEB6FCB75 // 2^(9/64) +data8 0x0001D4873168B9AA // 2^(10/64) +data8 0x0002063B88628CD6 // 2^(11/64) +data8 0x0002387A6E756238 // 2^(12/64) +data8 0x00026B4565E27CDD // 2^(13/64) +data8 0x00029E9DF51FDEE1 // 2^(14/64) +data8 0x0002D285A6E4030B // 2^(15/64) +data8 0x000306FE0A31B715 // 2^(16/64) +data8 0x00033C08B26416FF // 2^(17/64) +data8 0x000371A7373AA9CB // 2^(18/64) +data8 0x0003A7DB34E59FF7 // 2^(19/64) +data8 0x0003DEA64C123422 // 2^(20/64) +data8 0x0004160A21F72E2A // 2^(21/64) +data8 0x00044E086061892D // 2^(22/64) +data8 0x000486A2B5C13CD0 // 2^(23/64) +data8 0x0004BFDAD5362A27 // 2^(24/64) +data8 0x0004F9B2769D2CA7 // 2^(25/64) +data8 0x0005342B569D4F82 // 2^(26/64) +data8 0x00056F4736B527DA // 2^(27/64) +data8 0x0005AB07DD485429 // 2^(28/64) +data8 0x0005E76F15AD2148 // 2^(29/64) +data8 0x0006247EB03A5585 // 2^(30/64) +data8 0x0006623882552225 // 2^(31/64) +data8 0x0006A09E667F3BCD // 2^(32/64) +data8 0x0006DFB23C651A2F // 2^(33/64) +data8 0x00071F75E8EC5F74 // 2^(34/64) +data8 0x00075FEB564267C9 // 2^(35/64) +data8 0x0007A11473EB0187 // 2^(36/64) +data8 0x0007E2F336CF4E62 // 2^(37/64) +data8 0x00082589994CCE13 // 2^(38/64) +data8 0x000868D99B4492ED // 2^(39/64) +data8 0x0008ACE5422AA0DB // 2^(40/64) +data8 0x0008F1AE99157736 // 2^(41/64) +data8 0x00093737B0CDC5E5 // 2^(42/64) +data8 0x00097D829FDE4E50 // 2^(43/64) +data8 0x0009C49182A3F090 // 2^(44/64) +data8 0x000A0C667B5DE565 // 2^(45/64) +data8 0x000A5503B23E255D // 2^(46/64) +data8 
0x000A9E6B5579FDBF // 2^(47/64) +data8 0x000AE89F995AD3AD // 2^(48/64) +data8 0x000B33A2B84F15FB // 2^(49/64) +data8 0x000B7F76F2FB5E47 // 2^(50/64) +data8 0x000BCC1E904BC1D2 // 2^(51/64) +data8 0x000C199BDD85529C // 2^(52/64) +data8 0x000C67F12E57D14B // 2^(53/64) +data8 0x000CB720DCEF9069 // 2^(54/64) +data8 0x000D072D4A07897C // 2^(55/64) +data8 0x000D5818DCFBA487 // 2^(56/64) +data8 0x000DA9E603DB3285 // 2^(57/64) +data8 0x000DFC97337B9B5F // 2^(58/64) +data8 0x000E502EE78B3FF6 // 2^(59/64) +data8 0x000EA4AFA2A490DA // 2^(60/64) +data8 0x000EFA1BEE615A27 // 2^(61/64) +data8 0x000F50765B6E4540 // 2^(62/64) +data8 0x000FA7C1819E90D8 // 2^(63/64) +LOCAL_OBJECT_END(_expf_table) -#include "libm_support.h" - - -GR_SAVE_B0 = r60 -GR_SAVE_PFS = r59 -GR_SAVE_GP = r61 - -GR_Parameter_X = r62 -GR_Parameter_Y = r63 -GR_Parameter_RESULT = r64 -GR_Parameter_TAG = r65 - -FR_X = f9 -FR_Y = f1 -FR_RESULT = f99 - - -#ifdef _LIBC -.rodata -#else -.data -#endif - -.align 64 -Constants_exp_64_Arg: -ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object) -data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000 -data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000 -data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000 -// /* Inv_L, L_hi, L_lo */ -ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg) - -.align 64 -Constants_exp_64_Exponents: -ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object) -data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF -data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF -data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF -data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF -data4 0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF -data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF -ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents) - -.align 64 -Constants_exp_64_A: -ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object) -data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000 -data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000 -data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000 -// /* Reversed */ 
-ASM_SIZE_DIRECTIVE(Constants_exp_64_A) - -.align 64 -Constants_exp_64_P: -ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object) -data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000 -data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000 -data4 0x7474C518,0x88888888,0x00003FF8,0x00000000 -data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000 -data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000 -data4 0x000004C7,0x80000000,0x00003FFE,0x00000000 -// /* Reversed */ -ASM_SIZE_DIRECTIVE(Constants_exp_64_P) - -.align 64 -Constants_exp_64_Q: -ASM_TYPE_DIRECTIVE(Constants_exp_64_Q,@object) -data4 0xA49EF6CA,0xD00D56F7,0x00003FEF,0x00000000 -data4 0x1C63493D,0xD00D59AB,0x00003FF2,0x00000000 -data4 0xFB50CDD2,0xB60B60B5,0x00003FF5,0x00000000 -data4 0x7BA68DC8,0x88888888,0x00003FF8,0x00000000 -data4 0xAAAAAC8D,0xAAAAAAAA,0x00003FFA,0x00000000 -data4 0xAAAAACCA,0xAAAAAAAA,0x00003FFC,0x00000000 -data4 0x00000000,0x80000000,0x00003FFE,0x00000000 -// /* Reversed */ -ASM_SIZE_DIRECTIVE(Constants_exp_64_Q) - -.align 64 -Constants_exp_64_T1: -ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object) -data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29 -data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5 -data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC -data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D -data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA -data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516 -data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A -data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4 -data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B -data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD -data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15 -data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B -data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5 -data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A -data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177 -data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C -ASM_SIZE_DIRECTIVE(Constants_exp_64_T1) - -.align 64 -Constants_exp_64_T2: 
-ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object) -data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4 -data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7 -data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E -data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349 -data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987 -data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA -data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610 -data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A -data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8 -data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA -data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50 -data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA -data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07 -data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269 -data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE -data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37 -ASM_SIZE_DIRECTIVE(Constants_exp_64_T2) - -.align 64 -Constants_exp_64_W1: -ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object) -data4 0x00000000,0x00000000,0x171EC4B4,0xBE384454 -data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6 -data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA -data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50 -data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2 -data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE -data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B -data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04 -data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419 -data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376 -data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A -data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB -data4 0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E -data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA -data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08 -data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B -data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75 -data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79 -data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7 -data4 
0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087 -data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB -data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643 -data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C -data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D -data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873 -data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F -data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861 -data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0 -data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC -data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB -data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB -data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148 -ASM_SIZE_DIRECTIVE(Constants_exp_64_W1) - -.align 64 -Constants_exp_64_W2: -ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object) -data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25 -data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8 -data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A -data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E -data4 0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9 -data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2 -data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0 -data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509 -data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33 -data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D -data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87 -data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3 -data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9 -data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F -data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82 -data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4 -data4 0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D -data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030 -data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29 -data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED -data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B -data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893 -data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35 -data4 
0xADE45917,0xBE559725,0x042FC476,0xBE68C29C -data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313 -data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE -data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426 -data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550 -data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4 -data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31 -data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE -data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9 -ASM_SIZE_DIRECTIVE(Constants_exp_64_W2) .section .text -.proc expm1f# -.global expm1f# -.align 64 - -expm1f: -#ifdef _LIBC -.global __expm1f# -__expm1f: -#endif - +GLOBAL_IEEE754_ENTRY(expm1f) -{ .mii - alloc r32 = ar.pfs,0,30,4,0 -(p0) add r33 = 1, r0 -(p0) cmp.eq.unc p7, p0 = r0, r0 -} -;; - -// -// Set p7 true for expm1 -// Set Flag = r33 = 1 for expm1 -// These are really no longer necesary, but are a remnant -// when this file had multiple entry points. -// They should be carefully removed - - -{ .mfi -(p0) add r32 = 0,r0 -(p0) fnorm.s1 f9 = f8 - nop.i 0 -} - -{ .mfi - nop.m 0 -// -// Set p7 false for exp -// Set Flag = r33 = 0 for exp -// -(p0) fclass.m.unc p6, p8 = f8, 0x1E7 - nop.i 0 ;; +{ .mlx + getf.exp rSignexp_x = f8 // Must recompute if x unorm + movl r64DivLn2 = 0x40571547652B82FE // 64/ln(2) } - -{ .mfi - nop.m 999 -(p0) fclass.nm.unc p9, p0 = f8, 0x1FF - nop.i 0 +{ .mlx + addl rTblAddr = @ltoff(_expf_table),gp + movl rRightShifter = 0x43E8000000000000 // DP Right Shifter } +;; { .mfi - nop.m 999 -(p0) mov f36 = f1 - nop.i 999 ;; -} - -// -// Identify NatVals, NaNs, Infs, and Zeros. -// Identify EM unsupporteds. 
-// Save special input registers -// -// Create FR_X_cor = 0.0 -// GR_Flag = 0 -// GR_Expo_Range = 0 (r32) for single precision -// FR_Scale = 1.0 -// - -{ .mfb - nop.m 999 -(p0) mov f32 = f0 -(p6) br.cond.spnt EXPF_64_SPECIAL ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p9) br.cond.spnt EXPF_64_UNSUPPORTED ;; + // point to the beginning of the table + ld8 rTblAddr = [rTblAddr] + fclass.m p14, p0 = f8 , 0x22 // test for -INF + mov rExp_mask = 0x1ffff // Exponent mask } - -// -// Branch out for special input values -// - { .mfi -(p0) cmp.ne.unc p12, p13 = 0x01, r33 -(p0) fcmp.lt.unc.s0 p9,p0 = f8, f0 -(p0) cmp.eq.unc p15, p0 = r0, r0 -} - -// -// Raise possible denormal operand exception -// Normalize x -// -// This function computes expf( x + x_cor) -// Input FR 1: FR_X -// Input FR 2: FR_X_cor -// Input GR 1: GR_Flag -// Input GR 2: GR_Expo_Range -// Output FR 3: FR_Y_hi -// Output FR 4: FR_Y_lo -// Output FR 5: FR_Scale -// Output PR 1: PR_Safe - -// -// Prepare to load constants -// Set Safe = True -// - -{ .mmi -(p0) addl r34 = @ltoff(Constants_exp_64_Arg#),gp -(p0) addl r40 = @ltoff(Constants_exp_64_W1#),gp -(p0) addl r41 = @ltoff(Constants_exp_64_W2#),gp -};; - -{ .mmi - ld8 r34 = [r34] - ld8 r40 = [r40] -(p0) addl r50 = @ltoff(Constants_exp_64_T1#), gp + nop.m 0 + fnorm.s1 fNormX = f8 // normalized x + nop.i 0 } ;; -{ .mmi - ld8 r41 = [r41] -(p0) ldfe f37 = [r34],16 -(p0) addl r51 = @ltoff(Constants_exp_64_T2#), gp -} -;; -// -// N = fcvt.fx(float_N) -// Set p14 if -6 > expo_X -// -// -// Bias = 0x0FFFF -// expo_X = expo_X and Mask -// -{ .mmi - ld8 r50 = [r50] -(p0) ldfe f40 = [r34],16 - nop.i 999 +{ .mfi + setf.d f64DivLn2 = r64DivLn2 // load 64/ln(2) to FP reg + fclass.m p9, p0 = f8 , 0x0b // test for x unorm + mov rExp_bias = 0xffff // Exponent bias } -;; - -{ .mlx - nop.m 999 -(p0) movl r58 = 0x0FFFF -};; - -// -// Load W2_ptr -// Branch to SMALL is expo_X < -6 -// -// -// float_N = X * L_Inv -// expo_X = exponent of X -// Mask = 0x1FFFF -// - -{ .mmi - 
ld8 r51 = [r51] -(p0) ldfe f41 = [r34],16 -// -// float_N = X * L_Inv -// expo_X = exponent of X -// Mask = 0x1FFFF -// - nop.i 0 -};; - { .mlx -(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp -(p0) movl r39 = 0x1FFFF + // load Right Shifter to FP reg + setf.d fRightShifter = rRightShifter + movl rLn2Div64 = 0x3F862E42FEFA39EF // DP ln(2)/64 in GR } ;; -{ .mmi - ld8 r34 = [r34] -(p0) getf.exp r37 = f9 - nop.i 999 -} -;; - -{ .mii - nop.m 999 - nop.i 999 -(p0) and r37 = r37, r39 ;; -} - -{ .mmi -(p0) sub r37 = r37, r58 ;; -(p0) cmp.gt.unc p14, p0 = -6, r37 -(p0) cmp.lt.unc p10, p0 = 14, r37 ;; -} - { .mfi - nop.m 999 -// -// Load L_inv -// Set p12 true for Flag = 0 (exp) -// Set p13 true for Flag = 1 (expm1) -// -(p0) fmpy.s1 f38 = f9, f37 - nop.i 999 ;; + ldfpd fA8, fA7 = [rTblAddr], 16 + fcmp.eq.s1 p13, p0 = f0, f8 // test for x = 0.0 + mov rExp_half = 0xfffe } - { .mfb - nop.m 999 -// -// Load L_hi -// expo_X = expo_X - Bias -// get W1_ptr -// -(p0) fcvt.fx.s1 f39 = f38 -(p14) br.cond.spnt EXPF_SMALL ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p10) br.cond.spnt EXPF_HUGE ;; -} - -{ .mmi -(p0) shladd r34 = r32,4,r34 -(p0) addl r35 = @ltoff(Constants_exp_64_A#),gp - nop.i 999 + setf.d fLn2Div64 = rLn2Div64 // load ln(2)/64 to FP reg + nop.f 0 +(p9) br.cond.spnt EXPM1_UNORM // Branch if x unorm } ;; -{ .mmi - ld8 r35 = [r35] - nop.m 999 - nop.i 999 +EXPM1_COMMON: +{ .mfb + ldfpd fA6, fA5 = [rTblAddr], 16 +(p14) fms.s.s0 f8 = f0, f0, f1 // result if x = -inf +(p14) br.ret.spnt b0 // exit here if x = -inf } ;; -// -// Load T_1,T_2 -// - -{ .mmb -(p0) ldfe f51 = [r35],16 -(p0) ld8 r45 = [r34],8 - nop.b 999 ;; -} -// -// Set Safe = True if k >= big_expo_neg -// Set Safe = False if k < big_expo_neg -// - -{ .mmb -(p0) ldfe f49 = [r35],16 -(p0) ld8 r48 = [r34],0 - nop.b 999 ;; -} - -{ .mfi - nop.m 999 -// -// Branch to HUGE is expo_X > 14 -// -(p0) fcvt.xf f38 = f39 - nop.i 999 ;; -} - -{ .mfi -(p0) getf.sig r52 = f39 - nop.f 999 - nop.i 999 ;; -} - -{ .mii - nop.m 
999 -(p0) extr.u r43 = r52, 6, 6 ;; -// -// r = r - float_N * L_lo -// K = extr(N_fix,12,52) -// -(p0) shladd r40 = r43,3,r40 ;; -} - -{ .mfi -(p0) shladd r50 = r43,2,r50 -(p0) fnma.s1 f42 = f40, f38, f9 -// -// float_N = float(N) -// N_fix = signficand N -// -(p0) extr.u r42 = r52, 0, 6 -} - -{ .mmi -(p0) ldfd f43 = [r40],0 ;; -(p0) shladd r41 = r42,3,r41 -(p0) shladd r51 = r42,2,r51 -} -// -// W_1_p1 = 1 + W_1 -// - -{ .mmi -(p0) ldfs f44 = [r50],0 ;; -(p0) ldfd f45 = [r41],0 -// -// M_2 = extr(N_fix,0,6) -// M_1 = extr(N_fix,6,6) -// r = X - float_N * L_hi -// -(p0) extr r44 = r52, 12, 52 -} - -{ .mmi -(p0) ldfs f46 = [r51],0 ;; -(p0) sub r46 = r58, r44 -(p0) cmp.gt.unc p8, p15 = r44, r45 -} -// -// W = W_1 + W_1_p1*W_2 -// Load A_2 -// Bias_m_K = Bias - K -// - -{ .mii -(p0) ldfe f40 = [r35],16 -// -// load A_1 -// poly = A_2 + r*A_3 -// rsq = r * r -// neg_2_mK = exponent of Bias_m_k -// -(p0) add r47 = r58, r44 ;; -// -// Set Safe = True if k <= big_expo_pos -// Set Safe = False if k > big_expo_pos -// Load A_3 -// -(p15) cmp.lt p8,p15 = r44,r48 ;; -} - -{ .mmf -(p0) setf.exp f61 = r46 -// -// Bias_p + K = Bias + K -// T = T_1 * T_2 -// -(p0) setf.exp f36 = r47 -(p0) fnma.s1 f42 = f41, f38, f42 ;; -} - -{ .mfi - nop.m 999 -// -// Load W_1,W_2 -// Load big_exp_pos, load big_exp_neg -// -(p0) fadd.s1 f47 = f43, f1 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 f52 = f42, f51, f49 - nop.i 999 -} - -{ .mfi - nop.m 999 -(p0) fmpy.s1 f48 = f42, f42 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fmpy.s1 f53 = f44, f46 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 f54 = f45, f47, f43 - nop.i 999 -} - -{ .mfi - nop.m 999 -(p0) fneg f61 = f61 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 f52 = f42, f52, f40 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fadd.s1 f55 = f54, f1 - nop.i 999 -} - -{ .mfi - nop.m 999 -// -// W + Wp1 * poly -// -(p0) mov f34 = f53 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// A_1 + r * poly -// Scale = setf_expf(Bias_p_k) 
-// -(p0) fma.s1 f52 = f48, f52, f42 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// poly = r + rsq(A_1 + r*poly) -// Wp1 = 1 + W -// neg_2_mK = -neg_2_mK -// -(p0) fma.s1 f35 = f55, f52, f54 - nop.i 999 ;; -} - { .mfb - nop.m 999 -(p0) fmpy.s1 f35 = f35, f53 -// -// Y_hi = T -// Y_lo = T * (W + Wp1*poly) -// -(p12) br.cond.sptk EXPF_MAIN ;; -} -// -// Branch if expf(x) -// Continue for expf(x-1) -// - -{ .mii -(p0) cmp.lt.unc p12, p13 = 10, r44 - nop.i 999 ;; -// -// Set p12 if 10 < K, Else p13 -// -(p13) cmp.gt.unc p13, p14 = -10, r44 ;; + ldfpd fA4, fA3 = [rTblAddr], 16 + fclass.m p15, p0 = f8 , 0x1e1 // test for NaT,NaN,+Inf +(p13) br.ret.spnt b0 // exit here if x =0.0, result is x } -// -// K > 10: Y_lo = Y_lo + neg_2_mK -// K <=10: Set p13 if -10 > K, Else set p14 -// +;; { .mfi -(p13) cmp.eq p15, p0 = r0, r0 -(p14) fadd.s1 f34 = f61, f34 - nop.i 999 ;; + // overflow thresholds + ldfps fMIN_SGL_OFLOW_ARG, fMAX_SGL_NORM_ARG = [rTblAddr], 8 + fma.s1 fXsq = fNormX, fNormX, f0 // x^2 for small path + and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x } - -{ .mfi - nop.m 999 -(p12) fadd.s1 f35 = f35, f61 - nop.i 999 ;; +{ .mlx + nop.m 0 + movl rM1_lim = 0xc1c00000 // Minus -1 limit (-24.0), SP } +;; { .mfi - nop.m 999 -(p13) fadd.s1 f35 = f35, f34 - nop.i 999 + setf.exp fA2 = rExp_half + // x*(64/ln(2)) + Right Shifter + fma.s1 fNint = fNormX, f64DivLn2, fRightShifter + sub rExp_x = rExp_x, rExp_bias // True exponent of x } - { .mfb - nop.m 999 -// -// K <= 10 and K < -10, Set Safe = True -// K <= 10 and K < 10, Y_lo = Y_hi + Y_lo -// K <= 10 and K > =-10, Y_hi = Y_hi + neg_2_mk -// -(p13) mov f34 = f61 -(p0) br.cond.sptk EXPF_MAIN ;; -} -EXPF_SMALL: -{ .mmi -(p12) addl r35 = @ltoff(Constants_exp_64_P#), gp -(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp - nop.i 999 -} -;; - -{ .mmi -(p12) ld8 r35 = [r35] - ld8 r34 = [r34] - nop.i 999 + nop.m 0 +(p15) fma.s.s0 f8 = f8, f1, f0 // result if x = NaT,NaN,+Inf +(p15) br.ret.spnt b0 // exit here if x = 
NaT,NaN,+Inf } ;; - -{ .mmi -(p13) addl r35 = @ltoff(Constants_exp_64_Q#), gp - nop.m 999 - nop.i 999 -} -;; - - -// -// Return -// K <= 10 and K < 10, Y_hi = neg_2_mk -// -// /*******************************************************/ -// /*********** Branch EXP_SMALL *************************/ -// /*******************************************************/ - { .mfi -(p13) ld8 r35 = [r35] -(p0) mov f42 = f9 -(p0) add r34 = 0x48,r34 + setf.s fMAX_SGL_MINUS_1_ARG = rM1_lim // -1 threshold, -24.0 + nop.f 0 + cmp.gt p7, p8 = -2, rExp_x // Test |x| < 2^(-2) } ;; -// -// Flag = 0 -// r4 = rsq * rsq -// - { .mfi -(p0) ld8 r49 =[r34],0 - nop.f 999 - nop.i 999 ;; -} - -{ .mii - nop.m 999 - nop.i 999 ;; -// -// Flag = 1 -// -(p0) cmp.lt.unc p14, p0 = r37, r49 ;; +(p7) cmp.gt.unc p6, p7 = -40, rExp_x // Test |x| < 2^(-40) + fma.s1 fA87 = fA8, fNormX, fA7 // Small path, A8*x+A7 + nop.i 0 } - { .mfi - nop.m 999 -// -// r = X -// -(p0) fmpy.s1 f48 = f42, f42 - nop.i 999 ;; + nop.m 0 + fma.s1 fA65 = fA6, fNormX, fA5 // Small path, A6*x+A5 + nop.i 0 } +;; { .mfb - nop.m 999 -// -// rsq = r * r -// -(p0) fmpy.s1 f50 = f48, f48 -// -// Is input very small? 
-// -(p14) br.cond.spnt EXPF_VERY_SMALL ;; -} -// -// Flag_not1: Y_hi = 1.0 -// Flag is 1: r6 = rsq * r4 -// - -{ .mfi -(p12) ldfe f52 = [r35],16 -(p12) mov f34 = f1 -(p0) add r53 = 0x1,r0 ;; -} - -{ .mfi -(p13) ldfe f51 = [r35],16 -// -// Flag_not_1: Y_lo = poly_hi + r4 * poly_lo -// -(p13) mov f34 = f9 - nop.i 999 ;; -} - -{ .mmf -(p12) ldfe f53 = [r35],16 -// -// For Flag_not_1, Y_hi = X -// Scale = 1 -// Create 0x000...01 -// -(p0) setf.sig f37 = r53 -(p0) mov f36 = f1 ;; -} - -{ .mmi -(p13) ldfe f52 = [r35],16 ;; -(p12) ldfe f54 = [r35],16 - nop.i 999 ;; -} - -{ .mfi -(p13) ldfe f53 = [r35],16 -(p13) fmpy.s1 f58 = f48, f50 - nop.i 999 ;; -} -// -// Flag_not1: poly_lo = P_5 + r*P_6 -// Flag_1: poly_lo = Q_6 + r*Q_7 -// - -{ .mmi -(p13) ldfe f54 = [r35],16 ;; -(p12) ldfe f55 = [r35],16 - nop.i 999 ;; -} - -{ .mmi -(p12) ldfe f56 = [r35],16 ;; -(p13) ldfe f55 = [r35],16 - nop.i 999 ;; -} - -{ .mmi -(p12) ldfe f57 = [r35],0 ;; -(p13) ldfe f56 = [r35],16 - nop.i 999 ;; -} - -{ .mfi -(p13) ldfe f57 = [r35],0 - nop.f 999 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// For Flag_not_1, load p5,p6,p1,p2 -// Else load p5,p6,p1,p2 -// -(p12) fma.s1 f60 = f52, f42, f53 - nop.i 999 ;; + nop.m 0 +(p6) fma.s.s0 f8 = f8, f8, f8 // If x < 2^-40, result=x+x*x +(p6) br.ret.spnt b0 // Exit if x < 2^-40 } +;; { .mfi - nop.m 999 -(p13) fma.s1 f60 = f51, f42, f52 - nop.i 999 ;; + nop.m 0 + // check for overflow + fcmp.gt.s1 p15, p14 = fNormX, fMIN_SGL_OFLOW_ARG + nop.i 0 } - { .mfi - nop.m 999 -(p12) fma.s1 f60 = f60, f42, f54 - nop.i 999 ;; + nop.m 0 + fms.s1 fN = fNint, f1, fRightShifter // n in FP register + nop.i 0 } +;; { .mfi - nop.m 999 -(p12) fma.s1 f59 = f56, f42, f57 - nop.i 999 ;; + nop.m 0 +(p7) fma.s1 fA43 = fA4, fNormX, fA3 // Small path, A4*x+A3 + nop.i 0 } +;; { .mfi - nop.m 999 -(p13) fma.s1 f60 = f42, f60, f53 - nop.i 999 ;; + getf.sig rNJ = fNint // bits of n, j +(p7) fma.s1 fA8765 = fA87, fXsq, fA65 // Small path, A87*xsq+A65 + nop.i 0 } - -{ .mfi - nop.m 999 
-(p12) fma.s1 f59 = f59, f48, f42 - nop.i 999 ;; +{ .mfb + nop.m 0 +(p7) fma.s1 fX3 = fXsq, fNormX, f0 // Small path, x^3 + // branch out if overflow +(p15) br.cond.spnt EXPM1_CERTAIN_OVERFLOW } +;; { .mfi - nop.m 999 -// -// Flag_1: poly_lo = Q_5 + r*(Q_6 + r*Q_7) -// Flag_not1: poly_lo = P_4 + r*(P_5 + r*P_6) -// Flag_not1: poly_hi = (P_1 + r*P_2) -// -(p13) fmpy.s1 f60 = f60, f58 - nop.i 999 ;; + addl rN = 0xffff-63, rNJ // biased and shifted n + fnma.s1 fR = fLn2Div64, fN, fNormX // R = x - N*ln(2)/64 + extr.u rJ = rNJ , 0 , 6 // bits of j } +;; { .mfi - nop.m 999 -(p12) fma.s1 f60 = f60, f42, f55 - nop.i 999 ;; + shladd rJ = rJ, 3, rTblAddr // address in the 2^(j/64) table + // check for certain -1 + fcmp.le.s1 p13, p0 = fNormX, fMAX_SGL_MINUS_1_ARG + shr rN = rN, 6 // biased n } - { .mfi - nop.m 999 -// -// Flag_1: poly_lo = r6 *(Q_5 + ....) -// Flag_not1: poly_hi = r + rsq *(P_1 + r*P_2) -// -(p12) fma.s1 f35 = f60, f50, f59 - nop.i 999 + nop.m 0 +(p7) fma.s1 fA432 = fA43, fNormX, fA2 // Small path, A43*x+A2 + nop.i 0 } +;; { .mfi - nop.m 999 -(p13) fma.s1 f59 = f54, f42, f55 - nop.i 999 ;; + ld8 rJ = [rJ] + nop.f 0 + shl rN = rN , 52 // 2^n bits in DP format } +;; -{ .mfi - nop.m 999 -// -// Flag_not1: Y_lo = rsq* poly_hi + poly_lo -// Flag_1: poly_lo = rsq* poly_hi + poly_lo -// -(p13) fma.s1 f59 = f59, f42, f56 - nop.i 999 ;; +{ .mmi + or rN = rN, rJ // bits of 2^n * 2^(j/64) in DP format +(p13) mov rTmp = 1 // Make small value for -1 path + nop.i 0 } +;; { .mfi - nop.m 999 -// -// Flag_not_1: (P_1 + r*P_2) -// -(p13) fma.s1 f59 = f59, f42, f57 - nop.i 999 ;; + setf.d fT = rN // 2^n + // check for possible overflow (only happens if input higher precision) +(p14) fcmp.gt.s1 p14, p0 = fNormX, fMAX_SGL_NORM_ARG + nop.i 0 } - { .mfi - nop.m 999 -// -// Flag_not_1: poly_hi = r + rsq * (P_1 + r*P_2) -// -(p13) fma.s1 f35 = f59, f48, f60 - nop.i 999 ;; + nop.m 0 +(p7) fma.s1 fA8765432 = fA8765, fX3, fA432 // A8765*x^3+A432 + nop.i 0 } +;; { .mfi - nop.m 999 -// 
-// Create 0.000...01 -// -(p0) for f37 = f35, f37 - nop.i 999 ;; +(p13) setf.exp fTmp = rTmp // Make small value for -1 path + fma.s1 fP = fA3, fR, fA2 // A3*R + A2 + nop.i 0 } - { .mfb - nop.m 999 -// -// Set lsb of Y_lo to 1 -// -(p0) fmerge.se f35 = f35,f37 -(p0) br.cond.sptk EXPF_MAIN ;; -} -EXPF_VERY_SMALL: - -{ .mmi - nop.m 999 -(p13) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp - nop.i 999;; -} - -{ .mfi -(p13) ld8 r34 = [r34]; -(p12) mov f35 = f9 - nop.i 999 ;; + nop.m 0 + fma.s1 fRSqr = fR, fR, f0 // R^2 +(p13) br.cond.spnt EXPM1_CERTAIN_MINUS_ONE // Branch if x < -24.0 } +;; { .mfb - nop.m 999 -(p12) mov f34 = f1 -(p12) br.cond.sptk EXPF_MAIN ;; -} - -{ .mlx -(p13) add r34 = 8,r34 -(p13) movl r39 = 0x0FFFE ;; -} -// -// Load big_exp_neg -// Create 1/2's exponent -// - -{ .mii -(p13) setf.exp f56 = r39 -(p13) shladd r34 = r32,4,r34 ;; - nop.i 999 -} -// -// Negative exponents are stored after positive -// - -{ .mfi -(p13) ld8 r45 = [r34],0 -// -// Y_hi = x -// Scale = 1 -// -(p13) fmpy.s1 f35 = f9, f9 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// Reset Safe if necessary -// Create 1/2 -// -(p13) mov f34 = f9 - nop.i 999 ;; + nop.m 0 +(p7) fma.s.s0 f8 = fA8765432, fXsq, fNormX // Small path, + // result=xsq*A8765432+x +(p7) br.ret.spnt b0 // Exit if 2^-40 <= |x| < 2^-2 } +;; { .mfi -(p13) cmp.lt.unc p0, p15 = r37, r45 -(p13) mov f36 = f1 - nop.i 999 ;; + nop.m 0 + fma.s1 fP = fP, fRSqr, fR // P = (A3*R + A2)*Rsqr + R + nop.i 0 } +;; { .mfb - nop.m 999 -// -// Y_lo = x * x -// -(p13) fmpy.s1 f35 = f35, f56 -// -// Y_lo = x*x/2 -// -(p13) br.cond.sptk EXPF_MAIN ;; -} -EXPF_HUGE: - -{ .mfi - nop.m 999 -(p0) fcmp.gt.unc.s1 p14, p0 = f9, f0 - nop.i 999 -} - -{ .mlx - nop.m 999 -(p0) movl r39 = 0x15DC0 ;; -} - -{ .mfi -(p14) setf.exp f34 = r39 -(p14) mov f35 = f1 -(p14) cmp.eq p0, p15 = r0, r0 ;; + nop.m 0 + fms.s1 fTm1 = fT, f1, f1 // T - 1.0 +(p14) br.cond.spnt EXPM1_POSSIBLE_OVERFLOW } +;; { .mfb - nop.m 999 -(p14) mov f36 = f34 -// -// If x > 0, Set 
Safe = False -// If x > 0, Y_hi = 2**(24,000) -// If x > 0, Y_lo = 1.0 -// If x > 0, Scale = 2**(24,000) -// -(p14) br.cond.sptk EXPF_MAIN ;; -} - -{ .mlx - nop.m 999 -(p12) movl r39 = 0xA240 -} - -{ .mlx - nop.m 999 -(p12) movl r38 = 0xA1DC ;; -} - -{ .mmb -(p13) cmp.eq p15, p14 = r0, r0 -(p12) setf.exp f34 = r39 - nop.b 999 ;; -} - -{ .mlx -(p12) setf.exp f35 = r38 -(p13) movl r39 = 0xFF9C -} - -{ .mfi - nop.m 999 -(p13) fsub.s1 f34 = f0, f1 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p12) mov f36 = f34 -(p12) cmp.eq p0, p15 = r0, r0 ;; -} - -{ .mfi -(p13) setf.exp f35 = r39 -(p13) mov f36 = f1 - nop.i 999 ;; -} -EXPF_MAIN: - -{ .mfi -(p0) cmp.ne.unc p12, p0 = 0x01, r33 -(p0) fmpy.s1 f101 = f36, f35 - nop.i 999 ;; + nop.m 0 + fma.s.s0 f8 = fP, fT, fTm1 + br.ret.sptk b0 // Result for main path + // minus_one_limit < x < -2^-2 + // and +2^-2 <= x < overflow_limit } +;; +// Here if x unorm +EXPM1_UNORM: { .mfb - nop.m 999 -(p0) fma.s.s0 f99 = f34, f36, f101 -(p15) br.cond.sptk EXPF_64_RETURN ;; -} - -{ .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x01 - nop.i 999 -} - -{ .mlx - nop.m 999 -(p0) movl r50 = 0x0000000001007F ;; -} -// -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// S3 user supplied status + RZ + TD (Underflows) -// -// -// If (Safe) is true, then -// Compute result using user supplied status field. -// No overflow or underflow here, but perhaps inexact. -// Return -// Else -// Determine if overflow or underflow was raised. -// Fetch +/- overflow threshold for IEEE single, double, -// double extended -// - -{ .mfi -(p0) setf.exp f60 = r50 -(p0) fma.s.s3 f102 = f34, f36, f101 - nop.i 999 -} - -{ .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x40 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// For Safe, no need to check for over/under. -// For expm1, handle errors like exp. 
-// -(p0) fsetc.s2 0x7F,0x42 - nop.i 999;; -} - -{ .mfi - nop.m 999 -(p0) fma.s.s2 f100 = f34, f36, f101 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fsetc.s2 0x7F,0x40 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p7) fclass.m.unc p12, p0 = f102, 0x00F - nop.i 999 -} - -{ .mfi - nop.m 999 -(p0) fclass.m.unc p11, p0 = f102, 0x00F - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p7) fcmp.ge.unc.s1 p10, p0 = f100, f60 - nop.i 999 -} - -{ .mfi - nop.m 999 -// -// Create largest double exponent + 1. -// Create smallest double exponent - 1. -// -(p0) fcmp.ge.unc.s1 p8, p0 = f100, f60 - nop.i 999 ;; -} -// -// fcmp: resultS2 >= + overflow threshold -> set (a) if true -// fcmp: resultS2 <= - overflow threshold -> set (b) if true -// fclass: resultS3 is denorm/unorm/0 -> set (d) if true -// - -{ .mib -(p10) mov GR_Parameter_TAG = 43 - nop.i 999 -(p10) br.cond.sptk __libm_error_region ;; -} - -{ .mib -(p8) mov GR_Parameter_TAG = 16 - nop.i 999 -(p8) br.cond.sptk __libm_error_region ;; + getf.exp rSignexp_x = fNormX // Must recompute if x unorm + fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag + br.cond.sptk EXPM1_COMMON } -// -// Report that exp overflowed -// - -{ .mib -(p12) mov GR_Parameter_TAG = 44 - nop.i 999 -(p12) br.cond.sptk __libm_error_region ;; -} - -{ .mib -(p11) mov GR_Parameter_TAG = 17 - nop.i 999 -(p11) br.cond.sptk __libm_error_region ;; -} - -{ .mib - nop.m 999 - nop.i 999 -// -// Report that exp underflowed -// -(p0) br.cond.sptk EXPF_64_RETURN ;; -} -EXPF_64_SPECIAL: +;; -{ .mfi - nop.m 999 -(p0) fclass.m.unc p6, p0 = f8, 0x0c3 - nop.i 999 +// here if result will be -1 and inexact, x <= -24.0 +EXPM1_CERTAIN_MINUS_ONE: +{ .mfb + nop.m 0 + fms.s.s0 f8 = fTmp, fTmp, f1 // Result -1, and Inexact set + br.ret.sptk b0 } +;; -{ .mfi - nop.m 999 -(p0) fclass.m.unc p13, p8 = f8, 0x007 - nop.i 999 ;; -} +EXPM1_POSSIBLE_OVERFLOW: -{ .mfi - nop.m 999 -(p7) fclass.m.unc p14, p0 = f8, 0x007 - nop.i 999 -} +// Here if fMAX_SGL_NORM_ARG < x < fMIN_SGL_OFLOW_ARG +// This cannot happen 
if input is a single, only if input higher precision. +// Overflow is a possibility, not a certainty. -{ .mfi - nop.m 999 -(p0) fclass.m.unc p12, p9 = f8, 0x021 - nop.i 999 ;; -} +// Recompute result using status field 2 with user's rounding mode, +// and wre set. If result is larger than largest single, then we have +// overflow { .mfi - nop.m 999 -(p0) fclass.m.unc p11, p0 = f8, 0x022 - nop.i 999 + mov rGt_ln = 0x1007f // Exponent for largest sgl + 1 ulp + fsetc.s2 0x7F,0x42 // Get user's round mode, set wre + nop.i 0 } +;; { .mfi - nop.m 999 -(p7) fclass.m.unc p10, p0 = f8, 0x022 - nop.i 999 ;; + setf.exp fGt_pln = rGt_ln // Create largest single + 1 ulp + fma.s.s2 fWre_urm_f8 = fP, fT, fTm1 // Result with wre set + nop.i 0 } +;; { .mfi - nop.m 999 -// -// Identify +/- 0, Inf, or -Inf -// Generate the right kind of NaN. -// -(p13) fadd.s.s0 f99 = f0, f1 - nop.i 999 ;; + nop.m 0 + fsetc.s2 0x7F,0x40 // Turn off wre in sf2 + nop.i 0 } +;; { .mfi - nop.m 999 -(p14) mov f99 = f8 - nop.i 999 ;; + nop.m 0 + fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow + nop.i 0 } +;; { .mfb - nop.m 999 -(p6) fadd.s.s0 f99 = f8, f1 -// -// expf(+/-0) = 1 -// expm1f(+/-0) = +/-0 -// No exceptions raised -// -(p6) br.cond.sptk EXPF_64_RETURN ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p14) br.cond.sptk EXPF_64_RETURN ;; -} - -{ .mfi - nop.m 999 -(p11) mov f99 = f0 - nop.i 999 ;; + nop.m 0 + nop.f 0 +(p6) br.cond.spnt EXPM1_CERTAIN_OVERFLOW // Branch if overflow } +;; { .mfb - nop.m 999 -(p10) fsub.s.s1 f99 = f0, f1 -// -// expf(-Inf) = 0 -// expm1f(-Inf) = -1 -// No exceptions raised. -// -(p10) br.cond.sptk EXPF_64_RETURN ;; + nop.m 0 + fma.s.s0 f8 = fP, fT, fTm1 + br.ret.sptk b0 // Exit if really no overflow } +;; -{ .mfb - nop.m 999 -(p12) fmpy.s.s1 f99 = f8, f1 -// -// expf(+Inf) = Inf -// No exceptions raised. 
-// -(p0) br.cond.sptk EXPF_64_RETURN ;; +// here if overflow +EXPM1_CERTAIN_OVERFLOW: +{ .mmi + addl rTmp = 0x1FFFE, r0;; + setf.exp fTmp = rTmp + nop.i 999 } -EXPF_64_UNSUPPORTED: +;; -{ .mfb - nop.m 999 -(p0) fmpy.s.s0 f99 = f8, f0 - nop.b 0;; +{ .mfi + alloc r32 = ar.pfs, 0, 3, 4, 0 // get some registers + fmerge.s FR_X = fNormX,fNormX + nop.i 0 } - -EXPF_64_RETURN: { .mfb - nop.m 999 -(p0) mov f8 = f99 -(p0) br.ret.sptk b0 + mov GR_Parameter_TAG = 43 + fma.s.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result + br.cond.sptk __libm_error_region } -.endp expm1f -ASM_SIZE_DIRECTIVE(expm1f) +;; +GLOBAL_IEEE754_END(expm1f) -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi - add GR_Parameter_Y=-32,sp // Parameter 2 value - nop.f 0 + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 999 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack - nop.f 0 - mov GR_SAVE_GP=gp // Save gp + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp };; { .mmi - stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; .body -{ .mib - stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y - nop.b 0 // Parameter 3 address +{ .mfi + stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack + nop.f 0 + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address } { .mib - stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack - add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + stfs [GR_Parameter_Y] = FR_RESULT // 
Store Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function };; + { .mmi - nop.m 0 - nop.m 0 - add GR_Parameter_RESULT = 48,sp + add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; + { .mmi - ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp - add sp = 64,sp // Restore stack pointer - mov b0 = GR_SAVE_B0 // Restore return address + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address };; { .mib - mov gp = GR_SAVE_GP // Restore gp - mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - br.ret.sptk b0 // Return -};; + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function diff --git a/sysdeps/ia64/fpu/s_expm1l.S b/sysdeps/ia64/fpu/s_expm1l.S index e53d3c8d7c..069856d244 100644 --- a/sysdeps/ia64/fpu/s_expm1l.S +++ b/sysdeps/ia64/fpu/s_expm1l.S @@ -1,10 +1,10 @@ -.file "exp_m1l.s" +.file "expl_m1.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,15 +35,22 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial Version +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. +// 07/07/01 Improved speed of all paths +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align; +// used data8 for long double table values +// 03/11/03 Improved accuracy and performance, corrected missing inexact flags +// 04/17/03 Eliminated misplaced and unused data label // -// ********************************************************************* +//********************************************************************* // // Function: Combined expl(x) and expm1l(x), where // x @@ -51,20 +58,20 @@ // x // expm1l(x) = e - 1 for double-extended precision x values // -// ********************************************************************* +//********************************************************************* // // Resources Used: // // Floating-Point Registers: f8 (Input and Return Value) -// f9,f32-f61, f99-f102 +// f9-f15,f32-f77 // // General Purpose Registers: -// r32-r61 -// r62-r65 (Used to pass arguments to error handling routine) +// r14-r38 +// r35-r38 (Used to pass arguments to error handling routine) // // Predicate Registers: p6-p15 // 
-// ********************************************************************* +//********************************************************************* // // IEEE Special Conditions: // @@ -74,39 +81,37 @@ // (Error Handling Routine called for overflow and Underflow) // Inexact raised when appropriate by algorithm // -// expl(inf) = inf -// expl(-inf) = +0 -// expl(SNaN) = QNaN -// expl(QNaN) = QNaN -// expl(0) = 1 -// expl(EM_special Values) = QNaN -// expl(inf) = inf -// expm1l(-inf) = -1 -// expm1l(SNaN) = QNaN -// expm1l(QNaN) = QNaN -// expm1l(0) = 0 -// expm1l(EM_special Values) = QNaN +// exp(inf) = inf +// exp(-inf) = +0 +// exp(SNaN) = QNaN +// exp(QNaN) = QNaN +// exp(0) = 1 +// exp(EM_special Values) = QNaN +// expm1(inf) = inf +// expm1(-inf) = -1 +// expm1(SNaN) = QNaN +// expm1(QNaN) = QNaN +// expm1(0) = 0 +// expm1(EM_special Values) = QNaN // -// ********************************************************************* +//********************************************************************* // // Implementation and Algorithm Notes: // // ker_exp_64( in_FR : X, -// in_GR : Flag, -// in_GR : Expo_Range // out_FR : Y_hi, // out_FR : Y_lo, // out_FR : scale, // out_PR : Safe ) // -// On input, X is in register format and -// Flag = 0 for exp, -// Flag = 1 for expm1, +// On input, X is in register format +// p6 for exp, +// p7 for expm1, // -// On output, provided X and X_cor are real numbers, then +// On output, // -// scale*(Y_hi + Y_lo) approximates expl(X) if Flag is 0 -// scale*(Y_hi + Y_lo) approximates expl(X)-1 if Flag is 1 +// scale*(Y_hi + Y_lo) approximates exp(X) if exp +// scale*(Y_hi + Y_lo) approximates exp(X)-1 if expm1 // // The accuracy is sufficient for a highly accurate 64 sig. // bit implementation. Safe is set if there is no danger of @@ -122,36 +127,36 @@ // The method consists of three cases. 
// // If |X| < Tiny use case exp_tiny; -// else if |X| < 2^(-6) use case exp_small; +// else if |X| < 2^(-m) use case exp_small; m=12 for exp, m=7 for expm1 // else use case exp_regular; // // Case exp_tiny: // -// 1 + X can be used to approximate expl(X) or expl(X+X_cor); -// X + X^2/2 can be used to approximate expl(X) - 1 +// 1 + X can be used to approximate exp(X) +// X + X^2/2 can be used to approximate exp(X) - 1 // // Case exp_small: // -// Here, expl(X), expl(X+X_cor), and expl(X) - 1 can all be +// Here, exp(X) and exp(X) - 1 can both be // approximated by a relatively simple polynomial. // // This polynomial resembles the truncated Taylor series // -// expl(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n! +// exp(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n! // // Case exp_regular: // // Here we use a table lookup method. The basic idea is that in -// order to compute expl(X), we accurately decompose X into +// order to compute exp(X), we accurately decompose X into // // X = N * log(2)/(2^12) + r, |r| <= log(2)/2^13. // // Hence // -// expl(X) = 2^( N / 2^12 ) * expl(r). +// exp(X) = 2^( N / 2^12 ) * exp(r). // // The value 2^( N / 2^12 ) is obtained by simple combinations -// of values calculated beforehand and stored in table; expl(r) +// of values calculated beforehand and stored in table; exp(r) // is approximated by a short polynomial because |r| is small. // // We elaborate this method in 4 steps. @@ -178,13 +183,9 @@ // as a double-precision number; L_lo has 64 significant bits and // stored as a double-extended number. // -// In the case Flag = 2, we further modify r by -// -// r := r + X_cor. -// // Step 2: Approximation // -// expl(r) - 1 is approximated by a short polynomial of the form +// exp(r) - 1 is approximated by a short polynomial of the form // // r + A_1 r^2 + A_2 r^3 + A_3 r^4 . 
// @@ -213,19 +214,19 @@ // Define two mathematical values, delta_1 and delta_2, implicitly // such that // -// T_1 = expl( [M_1 log(2)/2^6] - delta_1 ) -// T_2 = expl( [M_2 log(2)/2^12] - delta_2 ) +// T_1 = exp( [M_1 log(2)/2^6] - delta_1 ) +// T_2 = exp( [M_2 log(2)/2^12] - delta_2 ) // // are representable as 24 significant bits. To illustrate the idea, // we show how we define delta_1: // -// T_1 := round_to_24_bits( expl( M_1 log(2)/2^6 ) ) +// T_1 := round_to_24_bits( exp( M_1 log(2)/2^6 ) ) // delta_1 = (M_1 log(2)/2^6) - log( T_1 ) // // The last equality means mathematical equality. We then tabulate // -// W_1 := expl(delta_1) - 1 -// W_2 := expl(delta_2) - 1 +// W_1 := exp(delta_1) - 1 +// W_2 := exp(delta_2) - 1 // // Both in double precision. // @@ -235,13 +236,13 @@ // T := T_1 * T_2 ...exactly // W := W_1 + (1 + W_1)*W_2 // -// W approximates expl( delta ) - 1 where delta = delta_1 + delta_2. +// W approximates exp( delta ) - 1 where delta = delta_1 + delta_2. // The mathematical product of T and (W+1) is an accurate representation // of 2^(M_1/2^6) * 2^(M_2/2^12). // // Step 4. Reconstruction // -// Finally, we can reconstruct expl(X), expl(X) - 1. +// Finally, we can reconstruct exp(X), exp(X) - 1. 
// Because // // X = K * log(2) + (M_1*log(2)/2^6 - delta_1) @@ -249,18 +250,18 @@ // + delta_1 + delta_2 + r ...accurately // We have // -// expl(X) ~=~ 2^K * ( T + T*[expl(delta_1+delta_2+r) - 1] ) -// ~=~ 2^K * ( T + T*[expl(delta + r) - 1] ) -// ~=~ 2^K * ( T + T*[(expl(delta)-1) -// + expl(delta)*(expl(r)-1)] ) +// exp(X) ~=~ 2^K * ( T + T*[exp(delta_1+delta_2+r) - 1] ) +// ~=~ 2^K * ( T + T*[exp(delta + r) - 1] ) +// ~=~ 2^K * ( T + T*[(exp(delta)-1) +// + exp(delta)*(exp(r)-1)] ) // ~=~ 2^K * ( T + T*( W + (1+W)*poly(r) ) ) // ~=~ 2^K * ( Y_hi + Y_lo ) // // where Y_hi = T and Y_lo = T*(W + (1+W)*poly(r)) // -// For expl(X)-1, we have +// For exp(X)-1, we have // -// expl(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1 +// exp(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1 // ~=~ 2^K * ( Y_hi + Y_lo - 2^(-K) ) // // and we combine Y_hi + Y_lo - 2^(-N) into the form of two @@ -278,7 +279,7 @@ // different rounding directions and a correct setting of the SAFE // flag. // -// If Flag is 1, then +// If expm1 is 1, then // SAFE := False ...possibility of underflow // Scale := 1.0 // Y_hi := X @@ -296,26 +297,25 @@ // // Let r = X // -// If Flag is not 1 ...i.e. expl( argument ) +// If exp ...i.e. exp( argument ) // // rsq := r * r; // r4 := rsq*rsq // poly_lo := P_3 + r*(P_4 + r*(P_5 + r*P_6)) // poly_hi := r + rsq*(P_1 + r*P_2) // Y_lo := poly_hi + r4 * poly_lo -// set lsb(Y_lo) to 1 // Y_hi := 1.0 // Scale := 1.0 // -// Else ...i.e. expl( argument ) - 1 +// Else ...i.e. 
exp( argument ) - 1 // // rsq := r * r // r4 := rsq * rsq -// r6 := rsq * r4 -// poly_lo := r6*(Q_5 + r*(Q_6 + r*Q_7)) -// poly_hi := Q_1 + r*(Q_2 + r*(Q_3 + r*Q_4)) -// Y_lo := rsq*poly_hi + poly_lo -// set lsb(Y_lo) to 1 +// poly_lo := Q_7 + r*(Q_8 + r*Q_9) +// poly_med:= Q_3 + r*Q_4 + rsq*(Q_5 + r*Q_6) +// poly_med:= poly_med + r4*poly_lo +// poly_hi := Q_1 + r*Q_2 +// Y_lo := rsq*(poly_hi + rsq*poly_med) // Y_hi := X // Scale := 1.0 // @@ -325,14 +325,14 @@ // // The previous description contains enough information except the // computation of poly and the final Y_hi and Y_lo in the case for -// expl(X)-1. +// exp(X)-1. // // The computation of poly for Step 2: // // rsq := r*r // poly := r + rsq*(A_1 + r*(A_2 + r*A_3)) // -// For the case expl(X) - 1, we need to incorporate 2^(-K) into +// For the case exp(X) - 1, we need to incorporate 2^(-K) into // Y_hi and Y_lo at the end of Step 4. // // If K > 10 then @@ -346,72 +346,197 @@ // End If // End If // +//======================================================= +// General Purpose Registers +// +GR_ad_Arg = r14 +GR_ad_A = r15 +GR_sig_inv_ln2 = r15 +GR_rshf_2to51 = r16 +GR_ad_PQ = r16 +GR_ad_Q = r16 +GR_signexp_x = r17 +GR_exp_x = r17 +GR_small_exp = r18 +GR_rshf = r18 +GR_exp_mask = r19 +GR_ad_W1 = r20 +GR_exp_2tom51 = r20 +GR_ad_W2 = r21 +GR_exp_underflow = r21 +GR_M2 = r22 +GR_huge_exp = r22 +GR_M1 = r23 +GR_huge_signif = r23 +GR_K = r24 +GR_one = r24 +GR_minus_one = r24 +GR_exp_bias = r25 +GR_ad_Limits = r26 +GR_N_fix = r26 +GR_exp_2_mk = r26 +GR_ad_P = r27 +GR_exp_2_k = r27 +GR_big_expo_neg = r28 +GR_very_small_exp = r29 +GR_exp_half = r29 +GR_ad_T1 = r30 +GR_ad_T2 = r31 -#include "libm_support.h" +GR_SAVE_PFS = r32 +GR_SAVE_B0 = r33 +GR_SAVE_GP = r34 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Parameter_TAG = r38 -#ifdef _LIBC -.rodata -#else -.data -#endif +// Floating Point Registers +// +FR_norm_x = f9 +FR_RSHF_2TO51 = f10 +FR_INV_LN2_2TO63 = f11 +FR_W_2TO51_RSH = f12 
+FR_2TOM51 = f13 +FR_RSHF = f14 +FR_Y_hi = f34 +FR_Y_lo = f35 +FR_scale = f36 +FR_tmp = f37 +FR_float_N = f38 +FR_N_signif = f39 +FR_L_hi = f40 +FR_L_lo = f41 +FR_r = f42 +FR_W1 = f43 +FR_T1 = f44 +FR_W2 = f45 +FR_T2 = f46 +FR_W1_p1 = f47 +FR_rsq = f48 +FR_A2 = f49 +FR_r4 = f50 +FR_A3 = f51 +FR_poly = f52 +FR_T = f53 +FR_W = f54 +FR_Wp1 = f55 +FR_p21 = f59 +FR_p210 = f59 +FR_p65 = f60 +FR_p654 = f60 +FR_p6543 = f60 +FR_2_mk = f61 +FR_P4Q7 = f61 +FR_P4 = f61 +FR_Q7 = f61 +FR_P3Q6 = f62 +FR_P3 = f62 +FR_Q6 = f62 +FR_q65 = f62 +FR_q6543 = f62 +FR_P2Q5 = f63 +FR_P2 = f63 +FR_Q5 = f63 +FR_P1Q4 = f64 +FR_P1 = f64 +FR_Q4 = f64 +FR_q43 = f64 +FR_Q3 = f65 +FR_Q2 = f66 +FR_q21 = f66 +FR_Q1 = f67 +FR_A1 = f68 +FR_P6Q9 = f68 +FR_P6 = f68 +FR_Q9 = f68 +FR_P5Q8 = f69 +FR_P5 = f69 +FR_Q8 = f69 +FR_q987 = f69 +FR_q98 = f69 +FR_q9876543 = f69 +FR_min_oflow_x = f70 +FR_huge_exp = f70 +FR_zero_uflow_x = f71 +FR_huge_signif = f71 +FR_huge = f72 +FR_small = f72 +FR_half = f73 +FR_T_scale = f74 +FR_result_lo = f75 +FR_W_T_scale = f76 +FR_Wp1_T_scale = f77 +FR_ftz = f77 +FR_half_x = f77 +// -.align 64 -Constants_exp_64_Arg: -ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object) -data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000 -data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000 -data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000 -// /* Inv_L, L_hi, L_lo */ -ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg) +FR_X = f9 +FR_Y = f0 +FR_RESULT = f15 -.align 64 -Constants_exp_64_Exponents: -ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object) -data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF -data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF -data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF -data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF -data4 0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF -data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF -ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents) +// ************* DO NOT CHANGE ORDER OF THESE TABLES ******************** -.align 64 -Constants_exp_64_A: 
-ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object) -data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000 -data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000 -data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000 -// /* Reversed */ -ASM_SIZE_DIRECTIVE(Constants_exp_64_A) +// double-extended 1/ln(2) +// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88 +// 3fff b8aa 3b29 5c17 f0bc +// For speed the significand will be loaded directly with a movl and setf.sig +// and the exponent will be bias+63 instead of bias+0. Thus subsequent +// computations need to scale appropriately. +// The constant 2^12/ln(2) is needed for the computation of N. This is also +// obtained by scaling the computations. +// +// Two shifting constants are loaded directly with movl and setf.d. +// 1. RSHF_2TO51 = 1.1000..00 * 2^(63-12) +// This constant is added to x*1/ln2 to shift the integer part of +// x*2^12/ln2 into the rightmost bits of the significand. +// The result of this fma is N_signif. +// 2. RSHF = 1.1000..00 * 2^(63) +// This constant is subtracted from N_signif * 2^(-51) to give +// the integer part of N, N_fix, as a floating-point number. +// The result of this fms is float_N. 
+RODATA .align 64 -Constants_exp_64_P: -ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object) -data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000 -data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000 -data4 0x7474C518,0x88888888,0x00003FF8,0x00000000 -data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000 -data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000 -data4 0x000004C7,0x80000000,0x00003FFE,0x00000000 -// /* Reversed */ -ASM_SIZE_DIRECTIVE(Constants_exp_64_P) +LOCAL_OBJECT_START(Constants_exp_64_Arg) +//data8 0xB8AA3B295C17F0BC,0x0000400B // Inv_L = 2^12/log(2) +data8 0xB17217F400000000,0x00003FF2 // L_hi = hi part log(2)/2^12 +data8 0xF473DE6AF278ECE6,0x00003FD4 // L_lo = lo part log(2)/2^12 +LOCAL_OBJECT_END(Constants_exp_64_Arg) -.align 64 -Constants_exp_64_Q: -ASM_TYPE_DIRECTIVE(Constants_exp_64_Q,@object) -data4 0xA49EF6CA,0xD00D56F7,0x00003FEF,0x00000000 -data4 0x1C63493D,0xD00D59AB,0x00003FF2,0x00000000 -data4 0xFB50CDD2,0xB60B60B5,0x00003FF5,0x00000000 -data4 0x7BA68DC8,0x88888888,0x00003FF8,0x00000000 -data4 0xAAAAAC8D,0xAAAAAAAA,0x00003FFA,0x00000000 -data4 0xAAAAACCA,0xAAAAAAAA,0x00003FFC,0x00000000 -data4 0x00000000,0x80000000,0x00003FFE,0x00000000 -// /* Reversed */ -ASM_SIZE_DIRECTIVE(Constants_exp_64_Q) +LOCAL_OBJECT_START(Constants_exp_64_Limits) +data8 0xb17217f7d1cf79ac,0x0000400c // Smallest long dbl oflow x +data8 0xb220000000000000,0x0000c00c // Small long dbl uflow zero x +LOCAL_OBJECT_END(Constants_exp_64_Limits) -.align 64 -Constants_exp_64_T1: -ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object) +LOCAL_OBJECT_START(Constants_exp_64_A) +data8 0xAAAAAAABB1B736A0,0x00003FFA // A3 +data8 0xAAAAAAAB90CD6327,0x00003FFC // A2 +data8 0xFFFFFFFFFFFFFFFF,0x00003FFD // A1 +LOCAL_OBJECT_END(Constants_exp_64_A) + +LOCAL_OBJECT_START(Constants_exp_64_P) +data8 0xD00D6C8143914A8A,0x00003FF2 // P6 +data8 0xB60BC4AC30304B30,0x00003FF5 // P5 +data8 0x888888887474C518,0x00003FF8 // P4 +data8 0xAAAAAAAA8DAE729D,0x00003FFA // P3 +data8 0xAAAAAAAAAAAAAF61,0x00003FFC // P2 
+data8 0x80000000000004C7,0x00003FFE // P1 +LOCAL_OBJECT_END(Constants_exp_64_P) + +LOCAL_OBJECT_START(Constants_exp_64_Q) +data8 0x93F2AC5F7471F32E, 0x00003FE9 // Q9 +data8 0xB8DA0F3550B3E764, 0x00003FEC // Q8 +data8 0xD00D00D0028E89C4, 0x00003FEF // Q7 +data8 0xD00D00DAEB8C4E91, 0x00003FF2 // Q6 +data8 0xB60B60B60B60B6F5, 0x00003FF5 // Q5 +data8 0x888888888886CC23, 0x00003FF8 // Q4 +data8 0xAAAAAAAAAAAAAAAB, 0x00003FFA // Q3 +data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC // Q2 +data8 0x8000000000000000, 0x00003FFE // Q1 +LOCAL_OBJECT_END(Constants_exp_64_Q) + +LOCAL_OBJECT_START(Constants_exp_64_T1) data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29 data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5 data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC @@ -428,11 +553,9 @@ data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5 data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177 data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C -ASM_SIZE_DIRECTIVE(Constants_exp_64_T1) +LOCAL_OBJECT_END(Constants_exp_64_T1) -.align 64 -Constants_exp_64_T2: -ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object) +LOCAL_OBJECT_START(Constants_exp_64_T2) data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4 data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7 data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E @@ -449,1124 +572,824 @@ data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07 data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269 data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37 -ASM_SIZE_DIRECTIVE(Constants_exp_64_T2) +LOCAL_OBJECT_END(Constants_exp_64_T2) -.align 64 -Constants_exp_64_W1: -ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object) -data4 0x00000000,0x00000000,0x171EC4B4,0xBE384454 -data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6 -data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA -data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50 -data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2 -data4 
0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE -data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B -data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04 -data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419 -data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376 -data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A -data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB -data4 0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E -data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA -data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08 -data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B -data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75 -data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79 -data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7 -data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087 -data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB -data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643 -data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C -data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D -data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873 -data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F -data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861 -data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0 -data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC -data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB -data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB -data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148 -ASM_SIZE_DIRECTIVE(Constants_exp_64_W1) +LOCAL_OBJECT_START(Constants_exp_64_W1) +data8 0x0000000000000000, 0xBE384454171EC4B4 +data8 0xBE6947414AA72766, 0xBE5D32B6D42518F8 +data8 0x3E68D96D3A319149, 0xBE68F4DA62415F36 +data8 0xBE6DDA2FC9C86A3B, 0x3E6B2E50F49228FE +data8 0xBE49C0C21188B886, 0x3E64BFC21A4C2F1F +data8 0xBE6A2FBB2CB98B54, 0x3E5DC5DE9A55D329 +data8 0x3E69649039A7AACE, 0x3E54728B5C66DBA5 +data8 0xBE62B0DBBA1C7D7D, 0x3E576E0409F1AF5F +data8 0x3E6125001A0DD6A1, 0xBE66A419795FBDEF +data8 0xBE5CDE8CE1BD41FC, 0xBE621376EA54964F +data8 0x3E6370BE476E76EE, 0x3E390D1A3427EB92 +data8 0x3E1336DE2BF82BF8, 
0xBE5FF1CBD0F7BD9E +data8 0xBE60A3550CEB09DD, 0xBE5CA37E0980F30D +data8 0xBE5C541B4C082D25, 0xBE5BBECA3B467D29 +data8 0xBE400D8AB9D946C5, 0xBE5E2A0807ED374A +data8 0xBE66CB28365C8B0A, 0x3E3AAD5BD3403BCA +data8 0x3E526055C7EA21E0, 0xBE442C75E72880D6 +data8 0x3E58B2BB85222A43, 0xBE5AAB79522C42BF +data8 0xBE605CB4469DC2BC, 0xBE589FA7A48C40DC +data8 0xBE51C2141AA42614, 0xBE48D087C37293F4 +data8 0x3E367A1CA2D673E0, 0xBE51BEBB114F7A38 +data8 0xBE6348E5661A4B48, 0xBDF526431D3B9962 +data8 0x3E3A3B5E35A78A53, 0xBE46C46C1CECD788 +data8 0xBE60B7EC7857D689, 0xBE594D3DD14F1AD7 +data8 0xBE4F9C304C9A8F60, 0xBE52187302DFF9D2 +data8 0xBE5E4C8855E6D68F, 0xBE62140F667F3DC4 +data8 0xBE36961B3BF88747, 0x3E602861C96EC6AA +data8 0xBE3B5151D57FD718, 0x3E561CD0FC4A627B +data8 0xBE3A5217CA913FEA, 0x3E40A3CC9A5D193A +data8 0xBE5AB71310A9C312, 0x3E4FDADBC5F57719 +data8 0x3E361428DBDF59D5, 0x3E5DB5DB61B4180D +data8 0xBE42AD5F7408D856, 0x3E2A314831B2B707 +LOCAL_OBJECT_END(Constants_exp_64_W1) -.align 64 -Constants_exp_64_W2: -ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object) -data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25 -data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8 -data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A -data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E -data4 0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9 -data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2 -data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0 -data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509 -data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33 -data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D -data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87 -data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3 -data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9 -data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F -data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82 -data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4 -data4 0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D -data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030 -data4 
0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29 -data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED -data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B -data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893 -data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35 -data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C -data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313 -data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE -data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426 -data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550 -data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4 -data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31 -data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE -data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9 -ASM_SIZE_DIRECTIVE(Constants_exp_64_W2) - -GR_SAVE_PFS = r59 -GR_SAVE_B0 = r60 -GR_SAVE_GP = r61 -GR_Parameter_X = r62 -GR_Parameter_Y = r63 -GR_Parameter_RESULT = r64 -GR_Parameter_TAG = r65 +LOCAL_OBJECT_START(Constants_exp_64_W2) +data8 0x0000000000000000, 0xBE641F2537A3D7A2 +data8 0xBE68DD57AD028C40, 0xBE5C77D8F212B1B6 +data8 0x3E57878F1BA5B070, 0xBE55A36A2ECAE6FE +data8 0xBE620608569DFA3B, 0xBE53B50EA6D300A3 +data8 0x3E5B5EF2223F8F2C, 0xBE56A0D9D6DE0DF4 +data8 0xBE64EEF3EAE28F51, 0xBE5E5AE2367EA80B +data8 0x3E47CB1A5FCBC02D, 0xBE656BA09BDAFEB7 +data8 0x3E6E70C6805AFEE7, 0xBE6E0509A3415EBA +data8 0xBE56856B49BFF529, 0x3E66DD3300508651 +data8 0x3E51165FC114BC13, 0x3E53333DC453290F +data8 0x3E6A072B05539FDA, 0xBE47CD877C0A7696 +data8 0xBE668BF4EB05C6D9, 0xBE67C3E36AE86C93 +data8 0xBE533904D0B3E84B, 0x3E63E8D9556B53CE +data8 0x3E212C8963A98DC8, 0xBE33138F032A7A22 +data8 0x3E530FA9BC584008, 0xBE6ADF82CCB93C97 +data8 0x3E5F91138370EA39, 0x3E5443A4FB6A05D8 +data8 0x3E63DACD181FEE7A, 0xBE62B29DF0F67DEC +data8 0x3E65C4833DDE6307, 0x3E5BF030D40A24C1 +data8 0x3E658B8F14E437BE, 0xBE631C29ED98B6C7 +data8 0x3E6335D204CF7C71, 0x3E529EEDE954A79D +data8 0x3E5D9257F64A2FB8, 0xBE6BED1B854ED06C +data8 0x3E5096F6D71405CB, 0xBE3D4893ACB9FDF5 +data8 0xBDFEB15801B68349, 0x3E628D35C6A463B9 
+data8 0xBE559725ADE45917, 0xBE68C29C042FC476 +data8 0xBE67593B01E511FA, 0xBE4A4313398801ED +data8 0x3E699571DA7C3300, 0x3E5349BE08062A9E +data8 0x3E5229C4755BB28E, 0x3E67E42677A1F80D +data8 0xBE52B33F6B69C352, 0xBE6B3550084DA57F +data8 0xBE6DB03FD1D09A20, 0xBE60CBC42161B2C1 +data8 0x3E56ED9C78A2B771, 0xBE508E319D0FA795 +data8 0xBE59482AFD1A54E9, 0xBE2A17CEB07FD23E +data8 0x3E68BF5C17365712, 0x3E3956F9B3785569 +LOCAL_OBJECT_END(Constants_exp_64_W2) -FR_X = f9 -FR_Y = f9 -FR_RESULT = f99 .section .text -.proc expm1l# -.global expm1l# -.align 64 -expm1l: -#ifdef _LIBC -.global __expm1l# -__expm1l: -#endif -{ .mii -alloc r32 = ar.pfs,0,30,4,0 -(p0) add r33 = 1, r0 -(p0) cmp.eq.unc p7, p0 = r0, r0 -} -{ .mbb - nop.m 999 -(p0) br.cond.sptk exp_continue - nop.b 999 ;; -} + +GLOBAL_IEEE754_ENTRY(expm1l) // -// Set p7 true for expm1 -// Set Flag = r33 = 1 for expm1 +// Set p7 true for expm1, p6 false // -.endp expm1l -ASM_SIZE_DIRECTIVE(expm1l) - -#ifdef _LIBC -libm_hidden_def (__expm1l) -#endif - -.section .text -.proc expl# -.global expl# -.align 64 -expl: -#ifdef _LIBC -.global __ieee754_expl# -__ieee754_expl: -#endif -{ .mii -alloc r32 = ar.pfs,0,30,4,0 -(p0) add r33 = r0, r0 -(p0) cmp.eq.unc p0, p7 = r0, r0 ;; +{ .mlx + getf.exp GR_signexp_x = f8 // Get sign and exponent of x, redo if unorm + movl GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2 } -exp_continue: -{ .mfi -(p0) add r32 = 2,r0 -(p0) fnorm.s1 f9 = f8 - nop.i 0 +{ .mlx + addl GR_ad_Arg = @ltoff(Constants_exp_64_Arg#),gp + movl GR_rshf_2to51 = 0x4718000000000000 // 1.10000 2^(63+51) } +;; + { .mfi -(p0) nop.m 0 + ld8 GR_ad_Arg = [GR_ad_Arg] // Point to Arg table + fclass.m p8, p0 = f8, 0x1E7 // Test x for natval, nan, inf, zero + cmp.eq p7, p6 = r0, r0 +} +{ .mfb + mov GR_exp_half = 0x0FFFE // Exponent of 0.5, for very small path + fnorm.s1 FR_norm_x = f8 // Normalize x + br.cond.sptk exp_continue +} +;; + +GLOBAL_IEEE754_END(expm1l) + +GLOBAL_IEEE754_ENTRY(expl) // -// Set p7 false for exp -// 
Set Flag = r33 = 0 for exp +// Set p7 false for exp, p6 true // -(p0) fclass.m.unc p6, p8 = f8, 0x1E7 - nop.i 0;; +{ .mlx + getf.exp GR_signexp_x = f8 // Get sign and exponent of x, redo if unorm + movl GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2 } +{ .mlx + addl GR_ad_Arg = @ltoff(Constants_exp_64_Arg#),gp + movl GR_rshf_2to51 = 0x4718000000000000 // 1.10000 2^(63+51) +} +;; + { .mfi - nop.m 999 -(p0) fclass.nm.unc p9, p0 = f8, 0x1FF - nop.i 0 + ld8 GR_ad_Arg = [GR_ad_Arg] // Point to Arg table + fclass.m p8, p0 = f8, 0x1E7 // Test x for natval, nan, inf, zero + cmp.eq p6, p7 = r0, r0 } { .mfi - nop.m 999 -(p0) mov f36 = f1 - nop.i 999 ;; + mov GR_exp_half = 0x0FFFE // Exponent of 0.5, for very small path + fnorm.s1 FR_norm_x = f8 // Normalize x + nop.i 999 } -{ .mfb - nop.m 999 -// -// Identify NatVals, NaNs, Infs, and Zeros. -// Identify EM unsupporteds. -// Save special input registers -(p0) mov f32 = f0 -// -// Create FR_X_cor = 0.0 -// GR_Flag = 0 -// GR_Expo_Range = 2 (r32) for double-extended precision -// FR_Scale = 1.0 -// -(p6) br.cond.spnt EXPL_64_SPECIAL ;; +;; + +exp_continue: +// Form two constants we need +// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128 +// 1.1000..000 * 2^(63+63-12) to right shift int(N) into the significand + +{ .mfi + setf.sig FR_INV_LN2_2TO63 = GR_sig_inv_ln2 // form 1/ln2 * 2^63 + fclass.nm.unc p9, p0 = f8, 0x1FF // Test x for unsupported + mov GR_exp_2tom51 = 0xffff-51 +} +{ .mlx + setf.d FR_RSHF_2TO51 = GR_rshf_2to51 // Form const 1.1000 * 2^(63+51) + movl GR_rshf = 0x43e8000000000000 // 1.10000 2^63 for right shift +} +;; + +{ .mfi + setf.exp FR_half = GR_exp_half // Form 0.5 for very small path + fma.s1 FR_scale = f1,f1,f0 // Scale = 1.0 + mov GR_exp_bias = 0x0FFFF // Set exponent bias } { .mib - nop.m 999 - nop.i 999 -(p9) br.cond.spnt EXPL_64_UNSUPPORTED ;; + add GR_ad_Limits = 0x20, GR_ad_Arg // Point to Limits table + mov GR_exp_mask = 0x1FFFF // Form exponent mask +(p8) br.cond.spnt EXP_64_SPECIAL // 
Branch if natval, nan, inf, zero } +;; + { .mfi -(p0) cmp.ne.unc p12, p13 = 0x01, r33 -// -// Branch out for special input values -// -(p0) fcmp.lt.unc.s0 p9,p0 = f8, f0 -(p0) cmp.eq.unc p15, p0 = r0, r0 + setf.exp FR_2TOM51 = GR_exp_2tom51 // Form 2^-51 for scaling float_N + nop.f 999 + add GR_ad_A = 0x40, GR_ad_Arg // Point to A table } -{ .mmi - nop.m 999 -// -// Raise possible denormal operand exception -// Normalize x -// -// This function computes expl( x + x_cor) -// Input FR 1: FR_X -// Input FR 2: FR_X_cor -// Input GR 1: GR_Flag -// Input GR 2: GR_Expo_Range -// Output FR 3: FR_Y_hi -// Output FR 4: FR_Y_lo -// Output FR 5: FR_Scale -// Output PR 1: PR_Safe -(p0) addl r34 = @ltoff(Constants_exp_64_Arg#),gp -(p0) addl r40 = @ltoff(Constants_exp_64_W1#),gp -};; -// -// Prepare to load constants -// Set Safe = True -// +{ .mib + setf.d FR_RSHF = GR_rshf // Form right shift const 1.1000 * 2^63 + add GR_ad_T1 = 0x160, GR_ad_Arg // Point to T1 table +(p9) br.cond.spnt EXP_64_UNSUPPORTED // Branch if unsupported +} +;; -{ .mmi - ld8 r34 = [r34] - ld8 r40 = [r40] -(p0) addl r41 = @ltoff(Constants_exp_64_W2#),gp +.pred.rel "mutex",p6,p7 +{ .mfi + ldfe FR_L_hi = [GR_ad_Arg],16 // Get L_hi + fcmp.eq.s0 p9,p0 = f8, f0 // Dummy op to flag denormals +(p6) add GR_ad_PQ = 0x30, GR_ad_A // Point to P table for exp +} +{ .mfi + ldfe FR_min_oflow_x = [GR_ad_Limits],16 // Get min x to cause overflow + fmpy.s1 FR_rsq = f8, f8 // rsq = x * x for small path +(p7) add GR_ad_PQ = 0x90, GR_ad_A // Point to Q table for expm1 };; { .mmi -(p0) ldfe f37 = [r34],16 -(p0) ld8 r41 = [r41] ;; + ldfe FR_L_lo = [GR_ad_Arg],16 // Get L_lo + ldfe FR_zero_uflow_x = [GR_ad_Limits],16 // Get x for zero uflow result + add GR_ad_W1 = 0x200, GR_ad_T1 // Point to W1 table } +;; -// -// N = fcvt.fx(float_N) -// Set p14 if -6 > expo_X -// -// -// Bias = 0x0FFFF -// expo_X = expo_X and Mask -// - -{ .mmi -(p0) ldfe f40 = [r34],16 - nop.m 999 -// -// Load L_lo -// Set p10 if 14 < expo_X -// -(p0) addl 
r50 = @ltoff(Constants_exp_64_T1#),gp +{ .mfi + ldfe FR_P6Q9 = [GR_ad_PQ],16 // P6(exp) or Q9(expm1) for small path + mov FR_r = FR_norm_x // r = X for small path + mov GR_very_small_exp = -60 // Exponent of x for very small path } -{ .mmi - nop.m 999 - nop.m 999 -(p0) addl r51 = @ltoff(Constants_exp_64_T2#),gp ;; +{ .mfi + add GR_ad_W2 = 0x400, GR_ad_T1 // Point to W2 table + nop.f 999 +(p7) mov GR_small_exp = -7 // Exponent of x for small path expm1 } -// -// Load W2_ptr -// Branch to SMALL is expo_X < -6 -// +;; -{.mmi -(p0) ld8 r50 = [r50] -(p0) ld8 r51 = [r51] -};; +{ .mmi + ldfe FR_P5Q8 = [GR_ad_PQ],16 // P5(exp) or Q8(expm1) for small path + and GR_exp_x = GR_signexp_x, GR_exp_mask +(p6) mov GR_small_exp = -12 // Exponent of x for small path exp +} +;; -{ .mlx -(p0) ldfe f41 = [r34],16 -// -// float_N = X * L_Inv -// expo_X = exponent of X -// Mask = 0x1FFFF -// -(p0) movl r58 = 0x0FFFF +// N_signif = X * Inv_log2_by_2^12 +// By adding 1.10...0*2^63 we shift and get round_int(N_signif) in significand. +// We actually add 1.10...0*2^51 to X * Inv_log2 to do the same thing. +{ .mfi + ldfe FR_P4Q7 = [GR_ad_PQ],16 // P4(exp) or Q7(expm1) for small path + fma.s1 FR_N_signif = FR_norm_x, FR_INV_LN2_2TO63, FR_RSHF_2TO51 + nop.i 999 } -{ .mlx - nop.m 999 -(p0) movl r39 = 0x1FFFF ;; +{ .mfi + sub GR_exp_x = GR_exp_x, GR_exp_bias // Get exponent + fmpy.s1 FR_r4 = FR_rsq, FR_rsq // Form r4 for small path + cmp.eq.unc p15, p0 = r0, r0 // Set Safe as default } +;; + { .mmi -(p0) getf.exp r37 = f9 - nop.m 999 -(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp ;; + ldfe FR_P3Q6 = [GR_ad_PQ],16 // P3(exp) or Q6(expm1) for small path + cmp.lt p14, p0 = GR_exp_x, GR_very_small_exp // Is |x| < 2^-60? 
+ nop.i 999 } -{ .mii -(p0) ld8 r34 = [r34] - nop.i 999 -(p0) and r37 = r37, r39 ;; +;; + +{ .mfi + ldfe FR_P2Q5 = [GR_ad_PQ],16 // P2(exp) or Q5(expm1) for small path + fmpy.s1 FR_half_x = FR_half, FR_norm_x // 0.5 * x for very small path + cmp.lt p13, p0 = GR_exp_x, GR_small_exp // Is |x| < 2^-m? } -{ .mmi -(p0) sub r37 = r37, r58 ;; -(p0) cmp.gt.unc p14, p0 = -6, r37 -(p0) cmp.lt.unc p10, p0 = 14, r37 ;; +{ .mib + nop.m 999 + nop.i 999 +(p14) br.cond.spnt EXP_VERY_SMALL // Branch if |x| < 2^-60 } +;; + { .mfi -(p0) nop.m 0 -// -// Load L_inv -// Set p12 true for Flag = 0 (exp) -// Set p13 true for Flag = 1 (expm1) -// -(p0) fmpy.s1 f38 = f9, f37 - nop.i 999 ;; + ldfe FR_A3 = [GR_ad_A],16 // Get A3 for normal path + fcmp.ge.s1 p10,p0 = FR_norm_x, FR_min_oflow_x // Will result overflow? + mov GR_big_expo_neg = -16381 // -0x3ffd } { .mfb - nop.m 999 -// -// Load L_hi -// expo_X = expo_X - Bias -// get W1_ptr -// -(p0) fcvt.fx.s1 f39 = f38 -(p14) br.cond.spnt EXPL_SMALL ;; + ldfe FR_P1Q4 = [GR_ad_PQ],16 // P1(exp) or Q4(expm1) for small path + nop.f 999 +(p13) br.cond.spnt EXP_SMALL // Branch if |x| < 2^-m + // m=12 for exp, m=7 for expm1 } -{ .mib - nop.m 999 - nop.i 999 -(p10) br.cond.spnt EXPL_HUGE ;; +;; + +// Now we are on the main path for |x| >= 2^-m, m=12 for exp, m=7 for expm1 +// +// float_N = round_int(N_signif) +// The signficand of N_signif contains the rounded integer part of X * 2^12/ln2, +// as a twos complement number in the lower bits (that is, it may be negative). +// That twos complement number (called N) is put into GR_N. + +// Since N_signif is scaled by 2^51, it must be multiplied by 2^-51 +// before the shift constant 1.10000 * 2^63 is subtracted to yield float_N. +// Thus, float_N contains the floating point version of N + + +{ .mfi + ldfe FR_A2 = [GR_ad_A],16 // Get A2 for main path + fcmp.lt.s1 p11,p0 = FR_norm_x, FR_zero_uflow_x // Certain zero, uflow? 
+ add GR_ad_T2 = 0x100, GR_ad_T1 // Point to T2 table } -{ .mmi -(p0) shladd r34 = r32,4,r34 +{ .mfi nop.m 999 -(p0) addl r35 = @ltoff(Constants_exp_64_A#),gp ;; -} -// -// Load T_1,T_2 -// -{ .mmi - nop.m 999 - ld8 r35 =[r35] - nop.i 99 -};; -{ .mmb -(p0) ldfe f51 = [r35],16 -(p0) ld8 r45 = [r34],8 - nop.b 999 ;; + fms.s1 FR_float_N = FR_N_signif, FR_2TOM51, FR_RSHF // Form float_N + nop.i 999 } -// -// Set Safe = True if k >= big_expo_neg -// Set Safe = False if k < big_expo_neg -// -{ .mmb -(p0) ldfe f49 = [r35],16 -(p0) ld8 r48 = [r34],0 - nop.b 999 ;; +;; + +{ .mbb + getf.sig GR_N_fix = FR_N_signif // Get N from significand +(p10) br.cond.spnt EXP_OVERFLOW // Branch if result will overflow +(p11) br.cond.spnt EXP_CERTAIN_UNDERFLOW_ZERO // Branch if certain zero, uflow } +;; + { .mfi - nop.m 999 -// -// Branch to HUGE is expo_X > 14 -// -(p0) fcvt.xf f38 = f39 - nop.i 999 ;; + ldfe FR_A1 = [GR_ad_A],16 // Get A1 for main path + fnma.s1 FR_r = FR_L_hi, FR_float_N, FR_norm_x // r = -L_hi * float_N + x + extr.u GR_M1 = GR_N_fix, 6, 6 // Extract index M_1 } { .mfi -(p0) getf.sig r52 = f39 - nop.f 999 - nop.i 999 ;; + and GR_M2 = 0x3f, GR_N_fix // Extract index M_2 + nop.f 999 + nop.i 999 } -{ .mii - nop.m 999 -(p0) extr.u r43 = r52, 6, 6 ;; -// -// r = r - float_N * L_lo -// K = extr(N_fix,12,52) -// -(p0) shladd r40 = r43,3,r40 ;; +;; + +// N_fix is only correct up to 50 bits because of our right shift technique. +// Actually in the normal path we will have restricted K to about 14 bits. +// Somewhat arbitrarily we extract 32 bits. 
+{ .mfi + shladd GR_ad_W1 = GR_M1,3,GR_ad_W1 // Point to W1 + nop.f 999 + extr GR_K = GR_N_fix, 12, 32 // Extract limited range K } { .mfi -(p0) shladd r50 = r43,2,r50 -(p0) fnma.s1 f42 = f40, f38, f9 -// -// float_N = float(N) -// N_fix = signficand N -// -(p0) extr.u r42 = r52, 0, 6 + shladd GR_ad_T1 = GR_M1,2,GR_ad_T1 // Point to T1 + nop.f 999 + shladd GR_ad_T2 = GR_M2,2,GR_ad_T2 // Point to T2 } +;; + { .mmi -(p0) ldfd f43 = [r40],0 ;; -(p0) shladd r41 = r42,3,r41 -(p0) shladd r51 = r42,2,r51 -} -// -// W_1_p1 = 1 + W_1 -// -{ .mmi -(p0) ldfs f44 = [r50],0 ;; -(p0) ldfd f45 = [r41],0 -// -// M_2 = extr(N_fix,0,6) -// M_1 = extr(N_fix,6,6) -// r = X - float_N * L_hi -// -(p0) extr r44 = r52, 12, 52 + ldfs FR_T1 = [GR_ad_T1],0 // Get T1 + ldfd FR_W1 = [GR_ad_W1],0 // Get W1 + add GR_exp_2_k = GR_exp_bias, GR_K // Form exponent of 2^k } +;; + { .mmi -(p0) ldfs f46 = [r51],0 ;; -(p0) sub r46 = r58, r44 -(p0) cmp.gt.unc p8, p15 = r44, r45 -} -// -// W = W_1 + W_1_p1*W_2 -// Load A_2 -// Bias_m_K = Bias - K -// -{ .mii -(p0) ldfe f40 = [r35],16 -// -// load A_1 -// poly = A_2 + r*A_3 -// rsq = r * r -// neg_2_mK = exponent of Bias_m_k -// -(p0) add r47 = r58, r44 ;; -// -// Set Safe = True if k <= big_expo_pos -// Set Safe = False if k > big_expo_pos -// Load A_3 -// -(p15) cmp.lt p8,p15 = r44,r48 ;; + ldfs FR_T2 = [GR_ad_T2],0 // Get T2 + shladd GR_ad_W2 = GR_M2,3,GR_ad_W2 // Point to W2 + sub GR_exp_2_mk = GR_exp_bias, GR_K // Form exponent of 2^-k } +;; + { .mmf -(p0) setf.exp f61 = r46 -// -// Bias_p + K = Bias + K -// T = T_1 * T_2 -// -(p0) setf.exp f36 = r47 -(p0) fnma.s1 f42 = f41, f38, f42 ;; + ldfd FR_W2 = [GR_ad_W2],0 // Get W2 + setf.exp FR_scale = GR_exp_2_k // Set scale = 2^k + fnma.s1 FR_r = FR_L_lo, FR_float_N, FR_r // r = -L_lo * float_N + r } +;; + { .mfi - nop.m 999 -// -// Load W_1,W_2 -// Load big_exp_pos, load big_exp_neg -// -(p0) fadd.s1 f47 = f43, f1 - nop.i 999 ;; + setf.exp FR_2_mk = GR_exp_2_mk // Form 2^-k + fma.s1 FR_poly = FR_r, FR_A3, 
FR_A2 // poly = r * A3 + A2 + cmp.lt p8,p15 = GR_K,GR_big_expo_neg // Set Safe if K > big_expo_neg } { .mfi - nop.m 999 -(p0) fma.s1 f52 = f42, f51, f49 - nop.i 999 + nop.m 999 + fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fmpy.s1 f48 = f42, f42 - nop.i 999 ;; + nop.m 999 + fmpy.s1 FR_T = FR_T1, FR_T2 // T = T1 * T2 + nop.i 999 } { .mfi - nop.m 999 -(p0) fmpy.s1 f53 = f44, f46 - nop.i 999 ;; + nop.m 999 + fadd.s1 FR_W1_p1 = FR_W1, f1 // W1_p1 = W1 + 1.0 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fma.s1 f54 = f45, f47, f43 - nop.i 999 +(p7) cmp.lt.unc p8, p9 = 10, GR_K // If expm1, set p8 if K > 10 + fma.s1 FR_poly = FR_r, FR_poly, FR_A1 // poly = r * poly + A1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fneg f61 = f61 - nop.i 999 ;; +(p7) cmp.eq p15, p0 = r0, r0 // If expm1, set Safe flag + fma.s1 FR_T_scale = FR_T, FR_scale, f0 // T_scale = T * scale +(p9) cmp.gt.unc p9, p10 = -10, GR_K // If expm1, set p9 if K < -10 + // If expm1, set p10 if -10<=K<=10 } { .mfi - nop.m 999 -(p0) fma.s1 f52 = f42, f52, f40 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_W = FR_W2, FR_W1_p1, FR_W1 // W = W2 * (W1+1.0) + W1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fadd.s1 f55 = f54, f1 - nop.i 999 + nop.m 999 + mov FR_Y_hi = FR_T // Assume Y_hi = T + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// W + Wp1 * poly -// -(p0) mov f34 = f53 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_poly = FR_rsq, FR_poly, FR_r // poly = rsq * poly + r + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// A_1 + r * poly -// Scale = setf_expl(Bias_p_k) -// -(p0) fma.s1 f52 = f48, f52, f42 - nop.i 999 ;; + nop.m 999 + fma.s1 FR_Wp1_T_scale = FR_W, FR_T_scale, FR_T_scale // (W+1)*T*scale + nop.i 999 } { .mfi - nop.m 999 -// -// poly = r + rsq(A_1 + r*poly) -// Wp1 = 1 + W -// neg_2_mK = -neg_2_mK -// -(p0) fma.s1 f35 = f55, f52, f54 - nop.i 999 ;; -} -{ .mfb - nop.m 999 -(p0) fmpy.s1 f35 = f35, f53 -// -// Y_hi = T -// Y_lo = T * (W + Wp1*poly) -// -(p12) br.cond.sptk EXPL_MAIN 
;; -} -// -// Branch if expl(x) -// Continue for expl(x-1) -// -{ .mii -(p0) cmp.lt.unc p12, p13 = 10, r44 - nop.i 999 ;; -// -// Set p12 if 10 < K, Else p13 -// -(p13) cmp.gt.unc p13, p14 = -10, r44 ;; + nop.m 999 + fma.s1 FR_W_T_scale = FR_W, FR_T_scale, f0 // W*T*scale + nop.i 999 } -// -// K > 10: Y_lo = Y_lo + neg_2_mK -// K <=10: Set p13 if -10 > K, Else set p14 -// +;; + { .mfi -(p13) cmp.eq p15, p0 = r0, r0 -(p14) fadd.s1 f34 = f61, f34 - nop.i 999 ;; + nop.m 999 +(p9) fsub.s1 FR_Y_hi = f0, FR_2_mk // If expm1, if K < -10 set Y_hi + nop.i 999 } { .mfi - nop.m 999 -(p12) fadd.s1 f35 = f35, f61 - nop.i 999 ;; + nop.m 999 +(p10) fsub.s1 FR_Y_hi = FR_T, FR_2_mk // If expm1, if |K|<=10 set Y_hi + nop.i 999 } +;; + { .mfi - nop.m 999 -(p13) fadd.s1 f35 = f35, f34 - nop.i 999 -} -{ .mfb - nop.m 999 -// -// K <= 10 and K < -10, Set Safe = True -// K <= 10 and K < 10, Y_lo = Y_hi + Y_lo -// K <= 10 and K > =-10, Y_hi = Y_hi + neg_2_mk -// -(p13) mov f34 = f61 -(p0) br.cond.sptk EXPL_MAIN ;; -} -EXPL_SMALL: -{ .mmi nop.m 999 -(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp -(p12) addl r35 = @ltoff(Constants_exp_64_P#),gp ;; + fma.s1 FR_result_lo = FR_Wp1_T_scale, FR_poly, FR_W_T_scale + nop.i 999 } -.pred.rel "mutex",p12,p13 -{ .mmi -(p12) ld8 r35=[r35] -nop.m 999 -(p13) addl r35 = @ltoff(Constants_exp_64_Q#),gp -};; -{ .mmi -(p13) ld8 r35=[r35] -(p0) ld8 r34=[r34] -nop.i 999 -};; +;; + +.pred.rel "mutex",p8,p9 +// If K > 10 adjust result_lo = result_lo - scale * 2^-k +// If |K| <= 10 adjust result_lo = result_lo + scale * T { .mfi -(p0) add r34 = 0x48,r34 -// -// Return -// K <= 10 and K < 10, Y_hi = neg_2_mk -// -// /*******************************************************/ -// /*********** Branch EXPL_SMALL ************************/ -// /*******************************************************/ -(p0) mov f42 = f9 - nop.i 999 ;; + nop.m 999 +(p8) fnma.s1 FR_result_lo = FR_scale, FR_2_mk, FR_result_lo // If K > 10 + nop.i 999 } -// -// Flag = 0 -// r4 = rsq * 
rsq -// { .mfi -(p0) ld8 r49 =[r34],0 - nop.f 999 - nop.i 999 ;; -} -{ .mii - nop.m 999 - nop.i 999 ;; -// -// Flag = 1 -// -(p0) cmp.lt.unc p14, p0 = r37, r49 ;; + nop.m 999 +(p9) fma.s1 FR_result_lo = FR_T_scale, f1, FR_result_lo // If |K| <= 10 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// r = X -// -(p0) fmpy.s1 f48 = f42, f42 - nop.i 999 ;; + nop.m 999 + fmpy.s0 FR_tmp = FR_A1, FR_A1 // Dummy op to set inexact + nop.i 999 } { .mfb - nop.m 999 -// -// rsq = r * r -// -(p0) fmpy.s1 f50 = f48, f48 -// -// Is input very small? -// -(p14) br.cond.spnt EXPL_VERY_SMALL ;; -} -// -// Flag_not1: Y_hi = 1.0 -// Flag is 1: r6 = rsq * r4 -// -{ .mfi -(p12) ldfe f52 = [r35],16 -(p12) mov f34 = f1 -(p0) add r53 = 0x1,r0 ;; -} -{ .mfi -(p13) ldfe f51 = [r35],16 -// -// Flag_not_1: Y_lo = poly_hi + r4 * poly_lo -// -(p13) mov f34 = f9 - nop.i 999 ;; -} -{ .mmf -(p12) ldfe f53 = [r35],16 -// -// For Flag_not_1, Y_hi = X -// Scale = 1 -// Create 0x000...01 -// -(p0) setf.sig f37 = r53 -(p0) mov f36 = f1 ;; + nop.m 999 +(p15) fma.s0 f8 = FR_Y_hi, FR_scale, FR_result_lo // Safe result +(p15) br.ret.sptk b0 // Safe exit for normal path } -{ .mmi -(p13) ldfe f52 = [r35],16 ;; -(p12) ldfe f54 = [r35],16 - nop.i 999 ;; +;; + +// Here if unsafe, will only be here for exp with K < big_expo_neg +{ .mfb + nop.m 999 + fma.s0 FR_RESULT = FR_Y_hi, FR_scale, FR_result_lo // Prelim result + br.cond.sptk EXP_POSSIBLE_UNDERFLOW // Branch to unsafe code } +;; + + +EXP_SMALL: +// Here if 2^-60 < |x| < 2^-m, m=12 for exp, m=7 for expm1 { .mfi -(p13) ldfe f53 = [r35],16 -(p13) fmpy.s1 f58 = f48, f50 - nop.i 999 ;; -} -// -// Flag_not1: poly_lo = P_5 + r*P_6 -// Flag_1: poly_lo = Q_6 + r*Q_7 -// -{ .mmi -(p13) ldfe f54 = [r35],16 ;; -(p12) ldfe f55 = [r35],16 - nop.i 999 ;; -} -{ .mmi -(p12) ldfe f56 = [r35],16 ;; -(p13) ldfe f55 = [r35],16 - nop.i 999 ;; -} -{ .mmi -(p12) ldfe f57 = [r35],0 ;; -(p13) ldfe f56 = [r35],16 - nop.i 999 ;; +(p7) ldfe FR_Q3 = [GR_ad_Q],16 // Get Q3 for small path, if 
expm1 +(p6) fma.s1 FR_p65 = FR_P6, FR_r, FR_P5 // If exp, p65 = P6 * r + P5 + nop.i 999 } { .mfi -(p13) ldfe f57 = [r35],0 - nop.f 999 - nop.i 999 ;; -} -{ .mfi - nop.m 999 -// -// For Flag_not_1, load p5,p6,p1,p2 -// Else load p5,p6,p1,p2 -// -(p12) fma.s1 f60 = f52, f42, f53 - nop.i 999 ;; + mov GR_minus_one = -1 +(p7) fma.s1 FR_q98 = FR_Q9, FR_r, FR_Q8 // If expm1, q98 = Q9 * r + Q8 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p13) fma.s1 f60 = f51, f42, f52 - nop.i 999 ;; +(p7) ldfe FR_Q2 = [GR_ad_Q],16 // Get Q2 for small path, if expm1 +(p7) fma.s1 FR_q65 = FR_Q6, FR_r, FR_Q5 // If expm1, q65 = Q6 * r + Q5 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p12) fma.s1 f60 = f60, f42, f54 - nop.i 999 ;; + setf.sig FR_tmp = GR_minus_one // Create value to force inexact +(p6) fma.s1 FR_p21 = FR_P2, FR_r, FR_P1 // If exp, p21 = P2 * r + P1 + nop.i 999 } { .mfi - nop.m 999 -(p12) fma.s1 f59 = f56, f42, f57 - nop.i 999 ;; +(p7) ldfe FR_Q1 = [GR_ad_Q],16 // Get Q1 for small path, if expm1 +(p7) fma.s1 FR_q43 = FR_Q4, FR_r, FR_Q3 // If expm1, q43 = Q4 * r + Q3 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p13) fma.s1 f60 = f42, f60, f53 - nop.i 999 ;; + nop.m 999 +(p6) fma.s1 FR_p654 = FR_p65, FR_r, FR_P4 // If exp, p654 = p65 * r + P4 + nop.i 999 } { .mfi - nop.m 999 -(p12) fma.s1 f59 = f59, f48, f42 - nop.i 999 ;; + nop.m 999 +(p7) fma.s1 FR_q987 = FR_q98, FR_r, FR_Q7 // If expm1, q987 = q98 * r + Q7 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Flag_1: poly_lo = Q_5 + r*(Q_6 + r*Q_7) -// Flag_not1: poly_lo = P_4 + r*(P_5 + r*P_6) -// Flag_not1: poly_hi = (P_1 + r*P_2) -// -(p13) fmpy.s1 f60 = f60, f58 - nop.i 999 ;; + nop.m 999 +(p7) fma.s1 FR_q21 = FR_Q2, FR_r, FR_Q1 // If expm1, q21 = Q2 * r + Q1 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p12) fma.s1 f60 = f60, f42, f55 - nop.i 999 ;; + nop.m 999 +(p6) fma.s1 FR_p210 = FR_p21, FR_rsq, FR_r // If exp, p210 = p21 * r + P0 + nop.i 999 } { .mfi - nop.m 999 -// -// Flag_1: poly_lo = r6 *(Q_5 + ....) 
-// Flag_not1: poly_hi = r + rsq *(P_1 + r*P_2) -// -(p12) fma.s1 f35 = f60, f50, f59 - nop.i 999 + nop.m 999 +(p7) fma.s1 FR_q6543 = FR_q65, FR_rsq, FR_q43 // If expm1, q6543 = q65*r2+q43 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p13) fma.s1 f59 = f54, f42, f55 - nop.i 999 ;; + nop.m 999 +(p6) fma.s1 FR_p6543 = FR_p654, FR_r, FR_P3 // If exp, p6543 = p654 * r + P3 + nop.i 999 } { .mfi - nop.m 999 -// -// Flag_not1: Y_lo = rsq* poly_hi + poly_lo -// Flag_1: poly_lo = rsq* poly_hi + poly_lo -// -(p13) fma.s1 f59 = f59, f42, f56 - nop.i 999 ;; + nop.m 999 +(p7) fma.s1 FR_q9876543 = FR_q987, FR_r4, FR_q6543 // If expm1, q9876543 = ... + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Flag_not_1: (P_1 + r*P_2) -// -(p13) fma.s1 f59 = f59, f42, f57 - nop.i 999 ;; + nop.m 999 +(p6) fma.s1 FR_Y_lo = FR_p6543, FR_r4, FR_p210 // If exp, form Y_lo + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Flag_not_1: poly_hi = r + rsq * (P_1 + r*P_2) -// -(p13) fma.s1 f35 = f59, f48, f60 - nop.i 999 ;; + nop.m 999 +(p7) fma.s1 FR_Y_lo = FR_q9876543, FR_rsq, FR_q21 // If expm1, form Y_lo + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Create 0.000...01 -// -(p0) for f37 = f35, f37 - nop.i 999 ;; -} -{ .mfb - nop.m 999 -// -// Set lsb of Y_lo to 1 -// -(p0) fmerge.se f35 = f35,f37 -(p0) br.cond.sptk EXPL_MAIN ;; -} -EXPL_VERY_SMALL: -{ .mmi - nop.m 999 - nop.m 999 -(p13) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp + nop.m 999 + fmpy.s0 FR_tmp = FR_tmp, FR_tmp // Dummy op to set inexact + nop.i 999 } +;; + +.pred.rel "mutex",p6,p7 { .mfi - nop.m 999 -(p12) mov f35 = f9 - nop.i 999 ;; + nop.m 999 +(p6) fma.s0 f8 = FR_Y_lo, f1, f1 // If exp, result = 1 + Y_lo + nop.i 999 } { .mfb -(p13) ld8 r34 = [r34] -(p12) mov f34 = f1 -(p12) br.cond.sptk EXPL_MAIN ;; -} -{ .mlx -(p13) add r34 = 8,r34 -(p13) movl r39 = 0x0FFFE ;; -} -// -// Load big_exp_neg -// Create 1/2's exponent -// -{ .mii -(p13) setf.exp f56 = r39 -(p13) shladd r34 = r32,4,r34 ;; - nop.i 999 + nop.m 999 +(p7) fma.s0 f8 = FR_Y_lo, 
FR_rsq, FR_norm_x // If expm1, result = Y_lo*r2+x + br.ret.sptk b0 // Exit for 2^-60 <= |x| < 2^-m + // m=12 for exp, m=7 for expm1 } +;; + + +EXP_VERY_SMALL: // -// Negative exponents are stored after positive +// Here if 0 < |x| < 2^-60 +// If exp, result = 1.0 + x +// If expm1, result = x +x*x/2, but have to check for possible underflow // + { .mfi -(p13) ld8 r45 = [r34],0 -// -// Y_hi = x -// Scale = 1 -// -(p13) fmpy.s1 f35 = f9, f9 - nop.i 999 ;; +(p7) mov GR_exp_underflow = -16381 // Exponent for possible underflow +(p6) fadd.s0 f8 = f1, FR_norm_x // If exp, result = 1+x + nop.i 999 } { .mfi - nop.m 999 -// -// Reset Safe if necessary -// Create 1/2 -// -(p13) mov f34 = f9 - nop.i 999 ;; + nop.m 999 +(p7) fmpy.s1 FR_result_lo = FR_half_x, FR_norm_x // If expm1 result_lo = x*x/2 + nop.i 999 } +;; + { .mfi -(p13) cmp.lt.unc p0, p15 = r37, r45 -(p13) mov f36 = f1 - nop.i 999 ;; +(p7) cmp.lt.unc p0, p8 = GR_exp_x, GR_exp_underflow // Unsafe if expm1 x small +(p7) mov FR_Y_hi = FR_norm_x // If expm1, Y_hi = x +(p7) cmp.lt p0, p15 = GR_exp_x, GR_exp_underflow // Unsafe if expm1 x small } +;; + { .mfb - nop.m 999 -// -// Y_lo = x * x -// -(p13) fmpy.s1 f35 = f35, f56 -// -// Y_lo = x*x/2 -// -(p13) br.cond.sptk EXPL_MAIN ;; -} -EXPL_HUGE: -{ .mfi - nop.m 999 -(p0) fcmp.gt.unc.s1 p14, p0 = f9, f0 - nop.i 999 -} -{ .mlx - nop.m 999 -(p0) movl r39 = 0x15DC0 ;; -} -{ .mfi -(p14) setf.exp f34 = r39 -(p14) mov f35 = f1 -(p14) cmp.eq p0, p15 = r0, r0 ;; + nop.m 999 +(p8) fma.s0 f8 = FR_norm_x, f1, FR_result_lo // If expm1, result=x+x*x/2 +(p15) br.ret.sptk b0 // If Safe, exit } +;; + +// Here if expm1 and 0 < |x| < 2^-16381; may be possible underflow { .mfb - nop.m 999 -(p14) mov f36 = f34 -// -// If x > 0, Set Safe = False -// If x > 0, Y_hi = 2**(24,000) -// If x > 0, Y_lo = 1.0 -// If x > 0, Scale = 2**(24,000) -// -(p14) br.cond.sptk EXPL_MAIN ;; -} -{ .mlx - nop.m 999 -(p12) movl r39 = 0xA240 -} -{ .mlx - nop.m 999 -(p12) movl r38 = 0xA1DC ;; -} -{ .mmb -(p13) cmp.eq 
p15, p14 = r0, r0 -(p12) setf.exp f34 = r39 - nop.b 999 ;; -} -{ .mlx -(p12) setf.exp f35 = r38 -(p13) movl r39 = 0xFF9C -} -{ .mfi - nop.m 999 -(p13) fsub.s1 f34 = f0, f1 - nop.i 999 ;; + nop.m 999 + fma.s0 FR_RESULT = FR_Y_hi, FR_scale, FR_result_lo // Prelim result + br.cond.sptk EXP_POSSIBLE_UNDERFLOW // Branch to unsafe code } -{ .mfi - nop.m 999 -(p12) mov f36 = f34 -(p12) cmp.eq p0, p15 = r0, r0 ;; +;; + +EXP_CERTAIN_UNDERFLOW_ZERO: +// Here if x < zero_uflow_x +// For exp, set result to tiny+0.0 and set I, U, and branch to error handling +// For expm1, set result to tiny-1.0 and set I, and exit +{ .mmi + alloc GR_SAVE_PFS = ar.pfs,0,3,4,0 + nop.m 999 + mov GR_one = 1 } -{ .mfi -(p13) setf.exp f35 = r39 -(p13) mov f36 = f1 - nop.i 999 ;; +;; + +{ .mmi + setf.exp FR_small = GR_one // Form small value + nop.m 999 +(p6) mov GR_Parameter_TAG = 13 // Error tag for exp underflow } -EXPL_MAIN: +;; + { .mfi -(p0) cmp.ne.unc p12, p0 = 0x01, r33 -(p0) fmpy.s1 f101 = f36, f35 - nop.i 999 ;; + nop.m 999 + fmerge.s FR_X = f8,f8 // Save x for error call + nop.i 999 } +;; + +.pred.rel "mutex",p6,p7 { .mfb - nop.m 999 -(p0) fma.s0 f99 = f34, f36, f101 -(p15) br.cond.sptk EXPL_64_RETURN ;; -} -{ .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x01 - nop.i 999 + nop.m 999 +(p6) fma.s0 FR_RESULT = FR_small, FR_small, f0 // If exp, set I,U, tiny result +(p6) br.cond.sptk __libm_error_region // If exp, go to error handling } -{ .mlx - nop.m 999 -(p0) movl r50 = 0x00000000013FFF ;; +{ .mfb + nop.m 999 +(p7) fms.s0 f8 = FR_small, FR_small, f1 // If expm1, set I, result -1.0 +(p7) br.ret.sptk b0 // If expm1, exit +} +;; + + +EXP_OVERFLOW: +// Here if x >= min_oflow_x +{ .mmi + alloc GR_SAVE_PFS = ar.pfs,0,3,4,0 + mov GR_huge_exp = 0x1fffe + nop.i 999 } -// -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// S3 user supplied status + RZ + TD (Underflows) -// -// -// If (Safe) is true, then -// Compute result using user supplied status field. 
-// No overflow or underflow here, but perhaps inexact. -// Return -// Else -// Determine if overflow or underflow was raised. -// Fetch +/- overflow threshold for IEEE single, double, -// double extended -// { .mfi -(p0) setf.exp f60 = r50 -(p0) fma.s3 f102 = f34, f36, f101 - nop.i 999 + mov GR_huge_signif = -0x1 + nop.f 999 +(p6) mov GR_Parameter_TAG = 12 // Error tag for exp overflow } -{ .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x40 - nop.i 999 ;; +;; + +{ .mmf + setf.exp FR_huge_exp = GR_huge_exp // Create huge value + setf.sig FR_huge_signif = GR_huge_signif // Create huge value + fmerge.s FR_X = f8,f8 // Save x for error call } +;; + { .mfi - nop.m 999 -// -// For Safe, no need to check for over/under. -// For expm1, handle errors like exp. -// -(p0) fsetc.s2 0x7F,0x42 - nop.i 999;; + nop.m 999 + fmerge.se FR_huge = FR_huge_exp, FR_huge_signif +(p7) mov GR_Parameter_TAG = 39 // Error tag for expm1 overflow } -{ .mfi - nop.m 999 -(p0) fma.s2 f100 = f34, f36, f101 - nop.i 999 ;; +;; + +{ .mfb + nop.m 999 + fma.s0 FR_RESULT = FR_huge, FR_huge, FR_huge // Force I, O, and Inf + br.cond.sptk __libm_error_region // Branch to error handling } +;; + + + +EXP_POSSIBLE_UNDERFLOW: +// Here if exp and zero_uflow_x < x < about -11356 [where k < -16381] +// Here if expm1 and |x| < 2^-16381 { .mfi - nop.m 999 -(p0) fsetc.s2 0x7F,0x40 - nop.i 999 ;; + alloc GR_SAVE_PFS = ar.pfs,0,3,4,0 + fsetc.s2 0x7F,0x41 // Set FTZ and disable traps + nop.i 999 } +;; + { .mfi - nop.m 999 -(p7) fclass.m.unc p12, p0 = f102, 0x00F - nop.i 999 + nop.m 999 + fma.s2 FR_ftz = FR_Y_hi, FR_scale, FR_result_lo // Result with FTZ + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fclass.m.unc p11, p0 = f102, 0x00F - nop.i 999 ;; + nop.m 999 + fsetc.s2 0x7F,0x40 // Disable traps (set s2 default) + nop.i 999 } +;; + { .mfi - nop.m 999 -(p7) fcmp.ge.unc.s1 p10, p0 = f100, f60 - nop.i 999 + nop.m 999 +(p7) fclass.m.unc p12, p0 = FR_ftz, 0x00F // If expm1, FTZ result denorm, zero? 
+ nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Create largest double exponent + 1. -// Create smallest double exponent - 1. -// -(p0) fcmp.ge.unc.s1 p8, p0 = f100, f60 - nop.i 999 ;; -} -// -// fcmp: resultS2 >= + overflow threshold -> set (a) if true -// fcmp: resultS2 <= - overflow threshold -> set (b) if true -// fclass: resultS3 is denorm/unorm/0 -> set (d) if true -// -{ .mib -(p10) mov GR_Parameter_TAG = 39 - nop.i 999 -(p10) br.cond.sptk __libm_error_region ;; -} -{ .mib -(p8) mov GR_Parameter_TAG = 12 - nop.i 999 -(p8) br.cond.sptk __libm_error_region ;; -} -// -// Report that exp overflowed -// -{ .mib -(p12) mov GR_Parameter_TAG = 40 - nop.i 999 -(p12) br.cond.sptk __libm_error_region ;; + nop.m 999 +(p6) fclass.m.unc p11, p0 = FR_ftz, 0x00F // If exp, FTZ result denorm or zero? + nop.i 999 } -{ .mib -(p11) mov GR_Parameter_TAG = 13 - nop.i 999 -(p11) br.cond.sptk __libm_error_region ;; +;; + +{ .mfb +(p12) mov GR_Parameter_TAG = 40 // expm1 underflow + fmerge.s FR_X = f8,f8 // Save x for error call +(p12) br.cond.spnt __libm_error_region // Branch on expm1 underflow } +;; + { .mib - nop.m 999 - nop.i 999 -// -// Report that exp underflowed -// -(p0) br.cond.sptk EXPL_64_RETURN ;; +(p11) mov GR_Parameter_TAG = 13 // exp underflow + nop.i 999 +(p11) br.cond.spnt __libm_error_region // Branch on exp underflow } -EXPL_64_SPECIAL: -{ .mfi - nop.m 999 -(p0) fclass.m.unc p6, p0 = f8, 0x0c3 - nop.i 999 -} -{ .mfi - nop.m 999 -(p0) fclass.m.unc p13, p8 = f8, 0x007 - nop.i 999 ;; +;; + +{ .mfb + nop.m 999 + mov f8 = FR_RESULT // Was safe after all + br.ret.sptk b0 } +;; + + +EXP_64_SPECIAL: +// Here if x natval, nan, inf, zero +// If x natval, +inf, or if expm1 and x zero, just return x. +// The other cases must be tested for, and results set. +// These cases do not generate exceptions. { .mfi - nop.m 999 -(p7) fclass.m.unc p14, p0 = f8, 0x007 - nop.i 999 + nop.m 999 + fclass.m p8, p0 = f8, 0x0c3 // Is x nan? 
+ nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fclass.m.unc p12, p9 = f8, 0x021 - nop.i 999 ;; + nop.m 999 +(p6) fclass.m.unc p13, p0 = f8, 0x007 // If exp, is x zero? + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fclass.m.unc p11, p0 = f8, 0x022 - nop.i 999 + nop.m 999 +(p6) fclass.m.unc p11, p0 = f8, 0x022 // If exp, is x -inf? + nop.i 999 } { .mfi - nop.m 999 -(p7) fclass.m.unc p10, p0 = f8, 0x022 - nop.i 999 ;; + nop.m 999 +(p8) fadd.s0 f8 = f8, f1 // If x nan, result quietized x + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Identify +/- 0, Inf, or -Inf -// Generate the right kind of NaN. -// -(p13) fadd.s0 f99 = f0, f1 - nop.i 999 ;; + nop.m 999 +(p7) fclass.m.unc p10, p0 = f8, 0x022 // If expm1, is x -inf? + nop.i 999 } { .mfi - nop.m 999 -(p14) mov f99 = f8 - nop.i 999 ;; -} -{ .mfb - nop.m 999 -(p6) fadd.s0 f99 = f8, f1 -// -// expl(+/-0) = 1 -// expm1l(+/-0) = +/-0 -// No exceptions raised -// -(p6) br.cond.sptk EXPL_64_RETURN ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p14) br.cond.sptk EXPL_64_RETURN ;; + nop.m 999 +(p13) fadd.s0 f8 = f0, f1 // If exp and x zero, result 1.0 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p11) mov f99 = f0 - nop.i 999 ;; -} -{ .mfb - nop.m 999 -(p10) fsub.s1 f99 = f0, f1 -// -// expl(-Inf) = 0 -// expm1l(-Inf) = -1 -// No exceptions raised. -// -(p10) br.cond.sptk EXPL_64_RETURN ;; -} -{ .mfb - nop.m 999 -(p12) fmpy.s1 f99 = f8, f1 -// -// expl(+Inf) = Inf -// No exceptions raised. 
-// -(p0) br.cond.sptk EXPL_64_RETURN ;; + nop.m 999 +(p11) mov f8 = f0 // If exp and x -inf, result 0 + nop.i 999 } -EXPL_64_UNSUPPORTED: +;; + { .mfb - nop.m 999 -(p0) fmpy.s0 f99 = f8, f0 -(p0) br.cond.sptk EXPL_64_RETURN ;; + nop.m 999 +(p10) fsub.s1 f8 = f0, f1 // If expm1, x -inf, result -1.0 + br.ret.sptk b0 // Exit special cases } -EXPL_64_RETURN: +;; + + +EXP_64_UNSUPPORTED: +// Here if x unsupported type { .mfb nop.m 999 -(p0) mov f8 = f99 -(p0) br.ret.sptk b0 + fmpy.s0 f8 = f8, f0 // Return nan + br.ret.sptk b0 } -.endp -ASM_SIZE_DIRECTIVE(expl) +;; -.proc __libm_error_region -__libm_error_region: +GLOBAL_IEEE754_END(expl) +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value @@ -1598,9 +1421,9 @@ __libm_error_region: br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; { .mmi ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack @@ -1613,8 +1436,7 @@ __libm_error_region: mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region#) .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_fabs.S b/sysdeps/ia64/fpu/s_fabs.S index ea3908dbc3..3434389a3c 100644 --- a/sysdeps/ia64/fpu/s_fabs.S +++ b/sysdeps/ia64/fpu/s_fabs.S @@ -1,34 +1,82 @@ -/* Copyright (C) 2000 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <sysdep.h> -#undef ret - -ENTRY (__fabs) -{ - fabs fret0 = farg0 - br.ret.sptk.many rp -} -END (__fabs) - -strong_alias (__fabs, __fabsf) -strong_alias (__fabs, __fabsl) - -weak_alias (__fabs, fabs) -weak_alias (__fabsf, fabsf) -weak_alias (__fabsl, fabsl) +.file "fabs.s" + + +// Copyright (c) 2000 - 2003, Intel Corporation +// All rights reserved. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// * The name of Intel Corporation may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. +// +// History +//============================================================== +// 02/02/00 Initial version +// 02/07/02 Added __libm_fabs entry point to test in case compiler inlines +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align +// +// API +//============================================================== +// double fabs (double x) +// +// Overview of operation +//============================================================== +// returns absolute value of x + +// floating-point registers used: 1 +// f8, input + +.section .text +.global __libm_fabs# + +.proc __libm_fabs# +__libm_fabs: +.endp __libm_fabs# + +GLOBAL_IEEE754_ENTRY(fabs) + +// set invalid or denormal flags and take fault if +// necessary + +{ .mfi + nop.m 999 + fcmp.eq.unc.s0 p6,p7 = f8,f1 + nop.i 999 ;; +} + +{ .mfb + nop.m 999 + fmerge.s f8 = f0,f8 + br.ret.sptk b0 ;; +} + +GLOBAL_IEEE754_END(fabs) diff --git a/sysdeps/ia64/fpu/s_fabsf.S b/sysdeps/ia64/fpu/s_fabsf.S index 7e5abde625..71bb6da882 100644 --- a/sysdeps/ia64/fpu/s_fabsf.S +++ b/sysdeps/ia64/fpu/s_fabsf.S @@ -1 +1,82 @@ -/* __fabsf is in s_fabs.S. 
*/ +.file "fabsf.s" + + +// Copyright (c) 2000 - 2003, Intel Corporation +// All rights reserved. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// * The name of Intel Corporation may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
+// +// History +//============================================================== +// 02/02/00 Initial version +// 02/07/02 Added __libm_fabsf entry point to test in case compiler inlines +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align +// +// API +//============================================================== +// float fabsf (float x) +// +// Overview of operation +//============================================================== +// returns absolute value of x + +// floating-point registers used: 1 +// f8, input and result + +.section .text +.global __libm_fabsf# + +.proc __libm_fabsf# +__libm_fabsf: +.endp __libm_fabsf# + +GLOBAL_IEEE754_ENTRY(fabsf) + +// set invalid or denormal flags and take fault if +// necessary + +{ .mfi + nop.m 999 + fcmp.eq.unc.s0 p6,p7 = f8,f1 // Dummy compare: only its s0 flag/fault side effects matter; p6,p7 are not read + nop.i 999 ;; +} + +{ .mfb + nop.m 999 + fmerge.s f8 = f0,f8 // f8 = |x|: sign taken from f0 (+0.0), exponent and significand from x + br.ret.sptk b0 ;; +} + +GLOBAL_IEEE754_END(fabsf) diff --git a/sysdeps/ia64/fpu/s_fabsl.S b/sysdeps/ia64/fpu/s_fabsl.S index 3d7a41fe2b..a048949147 100644 --- a/sysdeps/ia64/fpu/s_fabsl.S +++ b/sysdeps/ia64/fpu/s_fabsl.S @@ -1 +1,82 @@ -/* __fabsl is in s_fabs.S. */ +.file "fabsl.s" + + +// Copyright (c) 2000 - 2003, Intel Corporation +// All rights reserved. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution.
+// +// * The name of Intel Corporation may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Intel Corporation is the author of this code, and requests that all +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
+// +// History +//============================================================== +// 02/02/00 Initial version +// 02/07/02 Added __libm_fabsl entry point to test in case compiler inlines +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align +// +// API +//============================================================== +// long double fabsl (long double x) +// +// Overview of operation +//============================================================== +// returns absolute value of x + +// floating-point registers used: 1 +// f8, input and result + +.section .text +.global __libm_fabsl# + +.proc __libm_fabsl# +__libm_fabsl: +.endp __libm_fabsl# + +GLOBAL_IEEE754_ENTRY(fabsl) + +// set invalid or denormal flags and take fault if +// necessary + +{ .mfi + nop.m 999 + fcmp.eq.unc.s0 p6,p7 = f8,f1 // Dummy compare: only its s0 flag/fault side effects matter; p6,p7 are not read + nop.i 999 ;; +} + +{ .mfb + nop.m 999 + fmerge.s f8 = f0,f8 // f8 = |x|: sign taken from f0 (+0.0), exponent and significand from x + br.ret.sptk b0 ;; +} + +GLOBAL_IEEE754_END(fabsl) diff --git a/sysdeps/ia64/fpu/s_floor.S b/sysdeps/ia64/fpu/s_floor.S index 438b0fa867..9ed9d6dcdb 100644 --- a/sysdeps/ia64/fpu/s_floor.S +++ b/sysdeps/ia64/fpu/s_floor.S @@ -1,10 +1,10 @@ .file "floor.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,86 +20,68 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission.
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
// -.align 32 -.global floor# - -.section .text -.proc floor# -.align 32 - // History //============================================================== -// 2/02/00: Initial version -// 3/22/00: Updated to improve performance -// 6/13/00: Improved speed, fixed setting of inexact flag -// 6/27/00: Eliminated incorrect invalid flag setting -// 2/07/01: Corrected sign of zero result in round to -inf mode +// 02/02/00 Initial version +// 03/22/00 Updated to improve performance +// 06/13/00 Improved speed, fixed setting of inexact flag +// 06/27/00 Eliminated incorrect invalid flag setting +// 02/07/01 Corrected sign of zero result in round to -inf mode +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/28/03 Improved performance +//============================================================== // API //============================================================== // double floor(double x) +//============================================================== -// general input registers: - -floor_GR_FFFF = r14 -floor_GR_signexp = r15 -floor_GR_exponent = r16 -floor_GR_expmask = r17 -floor_GR_bigexp = r18 - - -// predicate registers used: +// general input registers: +// r14 - r18 -// p6 ==> Input is NaN, infinity, zero -// p7 ==> Input is denormal -// p8 ==> Input is <0 -// p9 ==> Input is >=0 -// p10 ==> Input is already an integer (bigger than largest integer) -// p11 ==> Input is not a large integer -// p12 ==> Input is a smaller integer -// p13 ==> Input is not an even integer, so inexact must be set +rSignexp = r14 // sign and biased exponent of x, from getf.exp +rExp = r15 // biased exponent of x (rSignexp masked with rExpMask) +rExpMask = r16 // 0x1FFFF exponent-field mask +rBigexp = r17 // biased exponent of 2^52, the already-an-integer threshold +rM1 = r18 // -1 (all ones), source for fTmp +// floating-point registers: +// f8 - f13 -// floating-point registers used: +fXInt = f9 // x truncated to an integer, held in the significand +fNormX = f10 // normalized copy of x +fTmp = f11 // all-ones significand, squared only to raise inexact +fAdj = f12 // -1.0 when x < 0, otherwise 0.0 +fPreResult = f13 // trunc(x) converted back to floating point -FLOOR_NORM_f8 = f9 -FLOOR_FFFF = f10 -FLOOR_INEXACT = f11 -FLOOR_FLOAT_INT_f8 = f12 -FLOOR_INT_f8 = f13 -FLOOR_adj = f14 +// predicate registers used: +// p6 - p9 // Overview of operation //============================================================== - //
double floor(double x) -// Return an integer value (represented as a double) that is the largest +// Return an integer value (represented as a double) that is the largest // value not greater than x // This is x rounded toward -infinity to an integral value. // Inexact is set if x != floor(x) -// ************************************************************************** - -// Set denormal flag for denormal input and -// and take denormal fault if necessary. - -// Is the input an integer value already? +//============================================================== // double_extended // if the exponent is > 1003e => 3F(true) = 63(decimal) @@ -120,121 +102,115 @@ FLOOR_adj = f14 // If we multiply by 2^23, we no longer have a fractional part // So input is an integer value already. -// If x is NAN, ZERO, or INFINITY, then return - -// qnan snan inf norm unorm 0 -+ -// 1 1 1 0 0 1 11 0xe7 - -#include "libm_support.h" -floor: -#ifdef _LIBC -.global __floor -__floor: -#endif +.section .text +GLOBAL_IEEE754_ENTRY(floor) { .mfi - getf.exp floor_GR_signexp = f8 - fcvt.fx.trunc.s1 FLOOR_INT_f8 = f8 - addl floor_GR_bigexp = 0x10033, r0 + getf.exp rSignexp = f8 // Get signexp, recompute if unorm + fclass.m p7,p0 = f8, 0x0b // Test x unorm + addl rBigexp = 0x10033, r0 // Biased exponent of 2^52: x is already an integer at or above it } { .mfi - addl floor_GR_FFFF = -1,r0 - fcmp.lt.s1 p8,p9 = f8,f0 - mov floor_GR_expmask = 0x1FFFF ;; + mov rM1 = -1 // Set all ones + fcvt.fx.trunc.s1 fXInt = f8 // Convert to int in significand + mov rExpMask = 0x1FFFF // Form exponent mask } +;; -// p7 ==> denorm { .mfi - setf.sig FLOOR_FFFF = floor_GR_FFFF - fclass.m p7,p0 = f8, 0x0b - nop.i 999 + nop.m 0 + fcmp.lt.s1 p8,p9 = f8, f0 // Test x < 0 + nop.i 0 } -{ .mfi - nop.m 999 - fnorm.s1 FLOOR_NORM_f8 = f8 - nop.i 999 ;; +{ .mfb + setf.sig fTmp = rM1 // Make const for setting inexact + fnorm.s1 fNormX = f8 // Normalize input +(p7) br.cond.spnt FLOOR_UNORM // Branch if x unorm } +;; -// p6 ==> NAN, INF, ZERO -{ .mfb - nop.m
999 - fclass.m p6,p10 = f8, 0xe7 -(p7) br.cond.spnt L(FLOOR_DENORM) ;; +FLOOR_COMMON: +// Return here from FLOOR_UNORM +{ .mfi + nop.m 0 + fclass.m p6,p0 = f8, 0x1e7 // Test x natval, nan, inf, 0 + nop.i 0 } +;; -L(FLOOR_COMMON): .pred.rel "mutex",p8,p9 -// Set adjustment to subtract from trunc(x) for result -// If x<0, adjustment is -1.0 -// If x>=0, adjustment is 0.0 { .mfi - and floor_GR_exponent = floor_GR_signexp, floor_GR_expmask -(p8) fnma.s1 FLOOR_adj = f1,f1,f0 - nop.i 999 + nop.m 0 +(p8) fnma.s1 fAdj = f1, f1, f0 // If x < 0, adjustment is -1 + nop.i 0 } { .mfi - nop.m 999 -(p9) fadd.s1 FLOOR_adj = f0,f0 - nop.i 999 ;; + nop.m 0 +(p9) fma.s1 fAdj = f0, f0, f0 // If x >= 0, adjustment is 0 + nop.i 0 } +;; { .mfi - nop.m 999 - fcmp.eq.s0 p12,p0 = f8,f0 // Dummy op to set denormal and invalid flag - nop.i 999 + nop.m 0 + fcvt.xf fPreResult = fXInt // trunc(x) + nop.i 0 } -{ .mfi -(p10) cmp.ge.unc p10,p11 = floor_GR_exponent, floor_GR_bigexp -(p6) fnorm.d f8 = f8 - nop.i 999 ;; +{ .mfb + nop.m 0 +(p6) fma.d.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf, 0 +(p6) br.ret.spnt b0 // Exit if x natval, nan, inf, 0 } +;; -{ .mfi - nop.m 999 -(p11) fcvt.xf FLOOR_FLOAT_INT_f8 = FLOOR_INT_f8 - nop.i 999 ;; +{ .mmi + and rExp = rSignexp, rExpMask // Get biased exponent +;; + cmp.ge p7,p6 = rExp, rBigexp // Is |x| >= 2^52? + nop.i 0 } +;; { .mfi - nop.m 999 -(p10) fnorm.d f8 = FLOOR_NORM_f8 - nop.i 999 ;; + nop.m 0 +(p6) fma.d.s0 f8 = fPreResult, f1, fAdj // Result if !int, |x| < 2^52 + nop.i 0 } - - { .mfi - nop.m 999 -(p11) fadd.d f8 = FLOOR_FLOAT_INT_f8,FLOOR_adj - nop.i 999 ;; + nop.m 0 +(p7) fma.d.s0 f8 = fNormX, f1, f0 // Result, if |x| >= 2^52 + nop.i 0 } +;; { .mfi - nop.m 999 -(p11) fcmp.eq.unc.s1 p12,p13 = FLOOR_FLOAT_INT_f8, FLOOR_NORM_f8 - nop.i 999 ;; + nop.m 0 +(p6) fcmp.eq.unc.s1 p8, p9 = fPreResult, fNormX // Is trunc(x) = x ?
+ nop.i 0 } +;; -// Set inexact if result not equal to input { .mfi - nop.m 999 -(p13) fmpy.s0 FLOOR_INEXACT = FLOOR_FFFF,FLOOR_FFFF - nop.i 999 + nop.m 0 +(p9) fmpy.s0 fTmp = fTmp, fTmp // Dummy to set inexact + nop.i 0 } -// Set result to input if integer { .mfb - nop.m 999 -(p12) fnorm.d f8 = FLOOR_NORM_f8 - br.ret.sptk b0 ;; + nop.m 0 +(p8) fma.d.s0 f8 = fNormX, f1, f0 // If x int, result normalized x + br.ret.sptk b0 // Exit main path, 0 < |x| < 2^52 } +;; + -// Here if input denorm -L(FLOOR_DENORM): +FLOOR_UNORM: +// Here if x unorm { .mfb - getf.exp floor_GR_signexp = FLOOR_NORM_f8 - fcvt.fx.trunc.s1 FLOOR_INT_f8 = FLOOR_NORM_f8 - br.cond.sptk L(FLOOR_COMMON) ;; + getf.exp rSignexp = fNormX // Recompute signexp from the normalized x + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + br.cond.sptk FLOOR_COMMON // Return to main path } +;; -.endp floor -ASM_SIZE_DIRECTIVE(floor) +GLOBAL_IEEE754_END(floor) diff --git a/sysdeps/ia64/fpu/s_floorf.S b/sysdeps/ia64/fpu/s_floorf.S index 15b2bbd31d..a3f2095931 100644 --- a/sysdeps/ia64/fpu/s_floorf.S +++ b/sysdeps/ia64/fpu/s_floorf.S @@ -1,10 +1,10 @@ .file "floorf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,85 +20,67 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission.
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
// -.align 32 -.global floorf# - -.section .text -.proc floorf# -.align 32 - // History //============================================================== -// 2/02/00: Initial version -// 6/13/00: Improved speed -// 6/27/00: Eliminated incorrect invalid flag setting -// 2/07/01: Corrected sign of zero result in round to -inf mode +// 02/02/00 Initial version +// 06/13/00 Improved speed +// 06/27/00 Eliminated incorrect invalid flag setting +// 02/07/01 Corrected sign of zero result in round to -inf mode +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/28/03 Improved performance +//============================================================== // API //============================================================== // float floorf(float x) +//============================================================== -// general input registers: - -floor_GR_FFFF = r14 -floor_GR_signexp = r15 -floor_GR_exponent = r16 -floor_GR_expmask = r17 -floor_GR_bigexp = r18 - - -// predicate registers used: +// general input registers: +// r14 - r18 -// p6 ==> Input is NaN, infinity, zero -// p7 ==> Input is denormal -// p8 ==> Input is <0 -// p9 ==> Input is >=0 -// p10 ==> Input is already an integer (bigger than largest integer) -// p11 ==> Input is not a large integer -// p12 ==> Input is a smaller integer -// p13 ==> Input is not an even integer, so inexact must be set +rSignexp = r14 // sign and biased exponent of x, from getf.exp +rExp = r15 // biased exponent of x (rSignexp masked with rExpMask) +rExpMask = r16 // 0x1FFFF exponent-field mask +rBigexp = r17 // biased exponent of 2^23, the already-an-integer threshold +rM1 = r18 // -1 (all ones), source for fTmp +// floating-point registers: +// f8 - f13 -// floating-point registers used: +fXInt = f9 // x truncated to an integer, held in the significand +fNormX = f10 // normalized copy of x +fTmp = f11 // all-ones significand, squared only to raise inexact +fAdj = f12 // -1.0 when x < 0, otherwise 0.0 +fPreResult = f13 // trunc(x) converted back to floating point -FLOOR_NORM_f8 = f9 -FLOOR_FFFF = f10 -FLOOR_INEXACT = f11 -FLOOR_FLOAT_INT_f8 = f12 -FLOOR_INT_f8 = f13 -FLOOR_adj = f14 +// predicate registers used: +// p6 - p9 // Overview of operation //============================================================== - // float floorf(float x) -// Return an integer value (represented as a float) that is the largest +// Return an integer value (represented as a float)
that is the largest // value not greater than x // This is x rounded toward -infinity to an integral value. // Inexact is set if x != floorf(x) -// ************************************************************************** - -// Set denormal flag for denormal input and -// and take denormal fault if necessary. - -// Is the input an integer value already? +//============================================================== // double_extended // if the exponent is > 1003e => 3F(true) = 63(decimal) @@ -119,119 +101,115 @@ FLOOR_adj = f14 // If we multiply by 2^23, we no longer have a fractional part // So input is an integer value already. -// If x is NAN, ZERO, or INFINITY, then return - -// qnan snan inf norm unorm 0 -+ -// 1 1 1 0 0 1 11 0xe7 - -#include "libm_support.h" -floorf: -#ifdef _LIBC -.global __floorf -__floorf: -#endif +.section .text +GLOBAL_IEEE754_ENTRY(floorf) { .mfi - getf.exp floor_GR_signexp = f8 - fcvt.fx.trunc.s1 FLOOR_INT_f8 = f8 - addl floor_GR_bigexp = 0x10016, r0 + getf.exp rSignexp = f8 // Get signexp, recompute if unorm + fclass.m p7,p0 = f8, 0x0b // Test x unorm + addl rBigexp = 0x10016, r0 // Biased exponent of 2^23: x is already an integer at or above it } { .mfi - addl floor_GR_FFFF = -1,r0 - fcmp.lt.s1 p8,p9 = f8,f0 - mov floor_GR_expmask = 0x1FFFF ;; + mov rM1 = -1 // Set all ones + fcvt.fx.trunc.s1 fXInt = f8 // Convert to int in significand + mov rExpMask = 0x1FFFF // Form exponent mask } +;; -// p7 ==> denorm { .mfi - setf.sig FLOOR_FFFF = floor_GR_FFFF - fclass.m p7,p0 = f8, 0x0b - nop.i 999 + nop.m 0 + fcmp.lt.s1 p8,p9 = f8, f0 // Test x < 0 + nop.i 0 } -{ .mfi - nop.m 999 - fnorm.s1 FLOOR_NORM_f8 = f8 - nop.i 999 ;; +{ .mfb + setf.sig fTmp = rM1 // Make const for setting inexact + fnorm.s1 fNormX = f8 // Normalize input +(p7) br.cond.spnt FLOOR_UNORM // Branch if x unorm } +;; -// p6 ==> NAN, INF, ZERO -{ .mfb - nop.m 999 - fclass.m p6,p10 = f8, 0xe7 -(p7) br.cond.spnt L(FLOOR_DENORM) ;; +FLOOR_COMMON: +// Return here from FLOOR_UNORM +{ .mfi + nop.m 0 +
fclass.m p6,p0 = f8, 0x1e7 // Test x natval, nan, inf, 0 + nop.i 0 } +;; -L(FLOOR_COMMON): .pred.rel "mutex",p8,p9 -// Set adjustment to subtract from trunc(x) for result -// If x<0, adjustment is -1.0 -// If x>=0, adjustment is 0.0 { .mfi - and floor_GR_exponent = floor_GR_signexp, floor_GR_expmask -(p8) fnma.s1 FLOOR_adj = f1,f1,f0 - nop.i 999 + nop.m 0 +(p8) fnma.s1 fAdj = f1, f1, f0 // If x < 0, adjustment is -1 + nop.i 0 } { .mfi - nop.m 999 -(p9) fadd.s1 FLOOR_adj = f0,f0 - nop.i 999 ;; + nop.m 0 +(p9) fma.s1 fAdj = f0, f0, f0 // If x >= 0, adjustment is 0 + nop.i 0 } +;; { .mfi - nop.m 999 - fcmp.eq.s0 p12,p0 = f8,f0 // Dummy op to set denormal and invalid flag - nop.i 999 + nop.m 0 + fcvt.xf fPreResult = fXInt // trunc(x) + nop.i 0 } -{ .mfi -(p10) cmp.ge.unc p10,p11 = floor_GR_exponent, floor_GR_bigexp -(p6) fnorm.s f8 = f8 - nop.i 999 ;; +{ .mfb + nop.m 0 +(p6) fma.s.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf, 0 +(p6) br.ret.spnt b0 // Exit if x natval, nan, inf, 0 } +;; -{ .mfi - nop.m 999 -(p11) fcvt.xf FLOOR_FLOAT_INT_f8 = FLOOR_INT_f8 - nop.i 999 ;; +{ .mmi + and rExp = rSignexp, rExpMask // Get biased exponent +;; + cmp.ge p7,p6 = rExp, rBigexp // Is |x| >= 2^23? + nop.i 0 } +;; { .mfi - nop.m 999 -(p10) fnorm.s f8 = FLOOR_NORM_f8 - nop.i 999 ;; + nop.m 0 +(p6) fma.s.s0 f8 = fPreResult, f1, fAdj // Result if !int, |x| < 2^23 + nop.i 0 } - { .mfi - nop.m 999 -(p11) fadd.s f8 = FLOOR_FLOAT_INT_f8,FLOOR_adj - nop.i 999 ;; + nop.m 0 +(p7) fma.s.s0 f8 = fNormX, f1, f0 // Result, if |x| >= 2^23 + nop.i 0 } +;; + { .mfi - nop.m 999 -(p11) fcmp.eq.unc.s1 p12,p13 = FLOOR_FLOAT_INT_f8, FLOOR_NORM_f8 - nop.i 999 ;; + nop.m 0 +(p6) fcmp.eq.unc.s1 p8, p9 = fPreResult, fNormX // Is trunc(x) = x ?
+ nop.i 0 } +;; -// Set inexact if result not equal to input { .mfi - nop.m 999 -(p13) fmpy.s0 FLOOR_INEXACT = FLOOR_FFFF,FLOOR_FFFF - nop.i 999 + nop.m 0 +(p9) fmpy.s0 fTmp = fTmp, fTmp // Dummy to set inexact + nop.i 0 } -// Set result to input if integer { .mfb - nop.m 999 -(p12) fnorm.s f8 = FLOOR_NORM_f8 - br.ret.sptk b0 ;; + nop.m 0 +(p8) fma.s.s0 f8 = fNormX, f1, f0 // If x int, result normalized x + br.ret.sptk b0 // Exit main path, 0 < |x| < 2^23 } +;; + -// Here if input denorm -L(FLOOR_DENORM): +FLOOR_UNORM: +// Here if x unorm { .mfb - getf.exp floor_GR_signexp = FLOOR_NORM_f8 - fcvt.fx.trunc.s1 FLOOR_INT_f8 = FLOOR_NORM_f8 - br.cond.sptk L(FLOOR_COMMON) ;; + getf.exp rSignexp = fNormX // Recompute signexp from the normalized x + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + br.cond.sptk FLOOR_COMMON // Return to main path } +;; -.endp floorf -ASM_SIZE_DIRECTIVE(floorf) +GLOBAL_IEEE754_END(floorf) diff --git a/sysdeps/ia64/fpu/s_floorl.S b/sysdeps/ia64/fpu/s_floorl.S index 294578e1a7..345c4f30dd 100644 --- a/sysdeps/ia64/fpu/s_floorl.S +++ b/sysdeps/ia64/fpu/s_floorl.S @@ -1,10 +1,10 @@ .file "floorl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,85 +20,67 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission.
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
// -.align 32 -.global floorl# - -.section .text -.proc floorl# -.align 32 - // History //============================================================== -// 2/02/00: Initial version -// 6/13/00: Improved speed -// 6/27/00: Eliminated incorrect invalid flag setting -// 2/07/01: Corrected sign of zero result in round to -inf mode +// 02/02/00 Initial version +// 06/13/00 Improved speed +// 06/27/00 Eliminated incorrect invalid flag setting +// 02/07/01 Corrected sign of zero result in round to -inf mode +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/28/03 Improved performance +//============================================================== // API //============================================================== // long double floorl(long double x) +//============================================================== -// general input registers: - -floor_GR_FFFF = r14 -floor_GR_signexp = r15 -floor_GR_exponent = r16 -floor_GR_expmask = r17 -floor_GR_bigexp = r18 - - -// predicate registers used: +// general input registers: +// r14 - r18 -// p6 ==> Input is NaN, infinity, zero -// p7 ==> Input is denormal -// p8 ==> Input is <0 -// p9 ==> Input is >=0 -// p10 ==> Input is already an integer (bigger than largest integer) -// p11 ==> Input is not a large integer -// p12 ==> Input is a smaller integer -// p13 ==> Input is not an even integer, so inexact must be set +rSignexp = r14 +rExp = r15 +rExpMask = r16 +rBigexp = r17 +rM1 = r18 +// floating-point registers: +// f8 - f13 -// floating-point registers used: +fXInt = f9 +fNormX = f10 +fTmp = f11 +fAdj = f12 +fPreResult = f13 -FLOOR_NORM_f8 = f9 -FLOOR_FFFF = f10 -FLOOR_INEXACT = f11 -FLOOR_FLOAT_INT_f8 = f12 -FLOOR_INT_f8 = f13 -FLOOR_adj = f14 +// predicate registers used: +// p6 - p9 // Overview of operation //============================================================== - // long double floorl(long double x) -// Return an integer value (represented as a long double) that is the largest +// Return an integer 
value (represented as a long double) that is the largest // value not greater than x // This is x rounded toward -infinity to an integral value. // Inexact is set if x != floorl(x) -// ************************************************************************** - -// Set denormal flag for denormal input and -// and take denormal fault if necessary. - -// Is the input an integer value already? +//============================================================== // double_extended // if the exponent is > 1003e => 3F(true) = 63(decimal) @@ -119,119 +101,115 @@ FLOOR_adj = f14 // If we multiply by 2^23, we no longer have a fractional part // So input is an integer value already. -// If x is NAN, ZERO, or INFINITY, then return - -// qnan snan inf norm unorm 0 -+ -// 1 1 1 0 0 1 11 0xe7 - -#include "libm_support.h" -floorl: -#ifdef _LIBC -.global __floorl -__floorl: -#endif +.section .text +GLOBAL_IEEE754_ENTRY(floorl) { .mfi - getf.exp floor_GR_signexp = f8 - fcvt.fx.trunc.s1 FLOOR_INT_f8 = f8 - addl floor_GR_bigexp = 0x1003e, r0 + getf.exp rSignexp = f8 // Get signexp, recompute if unorm + fclass.m p7,p0 = f8, 0x0b // Test x unorm + addl rBigexp = 0x1003e, r0 // Set exponent at which is integer } { .mfi - addl floor_GR_FFFF = -1,r0 - fcmp.lt.s1 p8,p9 = f8,f0 - mov floor_GR_expmask = 0x1FFFF ;; + mov rM1 = -1 // Set all ones + fcvt.fx.trunc.s1 fXInt = f8 // Convert to int in significand + mov rExpMask = 0x1FFFF // Form exponent mask } +;; -// p7 ==> denorm { .mfi - setf.sig FLOOR_FFFF = floor_GR_FFFF - fclass.m p7,p0 = f8, 0x0b - nop.i 999 + nop.m 0 + fcmp.lt.s1 p8,p9 = f8, f0 // Test x < 0 + nop.i 0 } -{ .mfi - nop.m 999 - fnorm.s1 FLOOR_NORM_f8 = f8 - nop.i 999 ;; +{ .mfb + setf.sig fTmp = rM1 // Make const for setting inexact + fnorm.s1 fNormX = f8 // Normalize input +(p7) br.cond.spnt FLOOR_UNORM // Branch if x unorm } +;; -// p6 ==> NAN, INF, ZERO -{ .mfb - nop.m 999 - fclass.m p6,p10 = f8, 0xe7 -(p7) br.cond.spnt L(FLOOR_DENORM) ;; +FLOOR_COMMON: +// Return here from 
FLOOR_UNORM +{ .mfi + nop.m 0 + fclass.m p6,p0 = f8, 0x1e7 // Test x natval, nan, inf, 0 + nop.i 0 } +;; -L(FLOOR_COMMON): .pred.rel "mutex",p8,p9 -// Set adjustment to subtract from trunc(x) for result -// If x<0, adjustment is -1.0 -// If x>=0, adjustment is 0.0 { .mfi - and floor_GR_exponent = floor_GR_signexp, floor_GR_expmask -(p8) fnma.s1 FLOOR_adj = f1,f1,f0 - nop.i 999 + nop.m 0 +(p8) fnma.s1 fAdj = f1, f1, f0 // If x < 0, adjustment is -1 + nop.i 0 } { .mfi - nop.m 999 -(p9) fadd.s1 FLOOR_adj = f0,f0 - nop.i 999 ;; + nop.m 0 +(p9) fma.s1 fAdj = f0, f0, f0 // If x > 0, adjustment is 0 + nop.i 0 } +;; { .mfi - nop.m 999 - fcmp.eq.s0 p12,p0 = f8,f0 // Dummy op to set denormal and invalid flag - nop.i 999 + nop.m 0 + fcvt.xf fPreResult = fXInt // trunc(x) + nop.i 0 } -{ .mfi -(p10) cmp.ge.unc p10,p11 = floor_GR_exponent, floor_GR_bigexp -(p6) fnorm f8 = f8 - nop.i 999 ;; +{ .mfb + nop.m 0 +(p6) fma.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf, 0 +(p6) br.ret.spnt b0 // Exit if x natval, nan, inf, 0 } +;; -{ .mfi - nop.m 999 -(p11) fcvt.xf FLOOR_FLOAT_INT_f8 = FLOOR_INT_f8 - nop.i 999 ;; +{ .mmi + and rExp = rSignexp, rExpMask // Get biased exponent +;; + cmp.ge p7,p6 = rExp, rBigexp // Is |x| >= 2^63? + nop.i 0 } +;; { .mfi - nop.m 999 -(p10) fnorm f8 = FLOOR_NORM_f8 - nop.i 999 ;; + nop.m 0 +(p6) fma.s0 f8 = fPreResult, f1, fAdj // Result if !int, |x| < 2^63 + nop.i 0 } - { .mfi - nop.m 999 -(p11) fadd f8 = FLOOR_FLOAT_INT_f8,FLOOR_adj - nop.i 999 ;; + nop.m 0 +(p7) fma.s0 f8 = fNormX, f1, f0 // Result, if |x| >= 2^63 + nop.i 0 } +;; + { .mfi - nop.m 999 -(p11) fcmp.eq.unc.s1 p12,p13 = FLOOR_FLOAT_INT_f8, FLOOR_NORM_f8 - nop.i 999 ;; + nop.m 0 +(p6) fcmp.eq.unc.s1 p8, p9 = fPreResult, fNormX // Is trunc(x) = x ? 
+ nop.i 0 } +;; -// Set inexact if result not equal to input { .mfi - nop.m 999 -(p13) fmpy.s0 FLOOR_INEXACT = FLOOR_FFFF,FLOOR_FFFF - nop.i 999 + nop.m 0 +(p9) fmpy.s0 fTmp = fTmp, fTmp // Dummy to set inexact + nop.i 0 } -// Set result to input if integer { .mfb - nop.m 999 -(p12) fnorm f8 = FLOOR_NORM_f8 - br.ret.sptk b0 ;; + nop.m 0 +(p8) fma.s0 f8 = fNormX, f1, f0 // If x int, result normalized x + br.ret.sptk b0 // Exit main path, 0 < |x| < 2^63 } +;; + -// Here if input denorm -L(FLOOR_DENORM): +FLOOR_UNORM: +// Here if x unorm { .mfb - getf.exp floor_GR_signexp = FLOOR_NORM_f8 - fcvt.fx.trunc.s1 FLOOR_INT_f8 = FLOOR_NORM_f8 - br.cond.sptk L(FLOOR_COMMON) ;; + getf.exp rSignexp = fNormX // Get signexp, recompute if unorm + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + br.cond.sptk FLOOR_COMMON // Return to main path } +;; -.endp floorl -ASM_SIZE_DIRECTIVE(floorl) +GLOBAL_IEEE754_END(floorl) diff --git a/sysdeps/ia64/fpu/s_frexp.c b/sysdeps/ia64/fpu/s_frexp.c index 98349bca47..c67500695f 100644 --- a/sysdeps/ia64/fpu/s_frexp.c +++ b/sysdeps/ia64/fpu/s_frexp.c @@ -1,8 +1,10 @@ -// -// Copyright (C) 2000, 2001, Intel Corporation +/* file: frexp.c */ + + +// Copyright (c) 2000-2002, Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. // // Redistribution and use in source and binary forms, with or without @@ -19,14 +21,15 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
+ // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS @@ -34,22 +37,30 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
+// // +// History +//===================================================================== +// 2/02/00 Initial version +// 1/23/02 Calls kernel with parameter to specify 32- or 64-bit int // +//===================================================================== #include "libm_support.h" +double __libm_frexp(double, int*, int); + double frexp(double x, int *y) { -#ifdef SIZE_INT_64 - return( __libm_frexp_8(x, y) ); +#ifdef SIZE_INT_64 + return( __libm_frexp(x, y, 1) ); #else -#ifdef SIZE_INT_32 - return( _GI___libm_frexp_4(x, y) ); +#ifdef SIZE_INT_32 + return( __libm_frexp(x, y, 0) ); #endif #endif diff --git a/sysdeps/ia64/fpu/s_frexpf.c b/sysdeps/ia64/fpu/s_frexpf.c index f666304147..c21a21dfba 100644 --- a/sysdeps/ia64/fpu/s_frexpf.c +++ b/sysdeps/ia64/fpu/s_frexpf.c @@ -1,8 +1,10 @@ -// -// Copyright (C) 2000, 2001, Intel Corporation +/* file: frexpf.c */ + + +// Copyright (c) 2000-2002, Intel Corporation // All rights reserved. // -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. // // Redistribution and use in source and binary forms, with or without @@ -19,14 +21,15 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. + // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS @@ -34,22 +37,30 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. +// // +// History +//===================================================================== +// 2/02/00 Initial version +// 1/23/02 Calls kernel with parameter to specify 32- or 64-bit int // +//===================================================================== #include "libm_support.h" +float __libm_frexpf(float, int*, int); + float frexpf(float x, int *y) { -#ifdef SIZE_INT_64 - return( __libm_frexp_8f(x, y) ); +#ifdef SIZE_INT_64 + return( __libm_frexpf(x, y, 1) ); #else -#ifdef SIZE_INT_32 - return( _GI___libm_frexp_4f(x, y) ); +#ifdef SIZE_INT_32 + return( __libm_frexpf(x, y, 0) ); #endif #endif diff --git a/sysdeps/ia64/fpu/s_frexpl.c b/sysdeps/ia64/fpu/s_frexpl.c index 3edc971e3f..13d44ab8b5 100644 --- a/sysdeps/ia64/fpu/s_frexpl.c +++ b/sysdeps/ia64/fpu/s_frexpl.c @@ -1,8 +1,10 @@ -// -// Copyright (C) 2000, 2001, Intel Corporation +/* file: frexpl.c */ + + +// Copyright (c) 2000-2002, Intel Corporation // All rights reserved. 
// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. // // Redistribution and use in source and binary forms, with or without @@ -19,14 +21,15 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. + // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS @@ -34,22 +37,30 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
+// // +// History +//===================================================================== +// 2/02/00 Initial version +// 1/23/02 Calls kernel with parameter to specify 32- or 64-bit int // +//===================================================================== #include "libm_support.h" +long double __libm_frexpl(long double, int*, int); + long double frexpl(long double x, int *y) { -#ifdef SIZE_INT_64 - return( __libm_frexp_8l(x, y) ); +#ifdef SIZE_INT_64 + return( __libm_frexpl(x, y, 1) ); #else -#ifdef SIZE_INT_32 - return( _GI___libm_frexp_4l(x, y) ); +#ifdef SIZE_INT_32 + return( __libm_frexpl(x, y, 0) ); #endif #endif diff --git a/sysdeps/ia64/fpu/s_ilogb.S b/sysdeps/ia64/fpu/s_ilogb.S index 61975dd941..3f2733cabd 100644 --- a/sysdeps/ia64/fpu/s_ilogb.S +++ b/sysdeps/ia64/fpu/s_ilogb.S @@ -1,10 +1,10 @@ .file "ilogb.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,234 +20,248 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/03/00 Initial version -// 5/26/00 Fix bug when x a double-extended denormal; +// 02/03/00 Initial version +// 05/26/00 Fix bug when x a double-extended denormal; // if x=0 call error routine, per C9X -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. -// 1/20/01 Fixed result for x=0, corrected error tag value. - -.align 32 -.global ilogb# - -.section .text -.proc ilogb# -.align 32 - +// 01/20/01 Fixed result for x=0, corrected error tag value. 
+// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/20/03 Improved performance +// // API //============================================================== -// int = ilogb(double) - +// int ilogb( double x ); +// // Overview of operation //============================================================== -// ilogb computes log2(x) as an int +// The ilogb function extracts the exponent of x as an integer // and returns it in r8 - -// ilogb is similar to logb but differs in the following ways: +// +// ilogb is similar to logb but differs in the following ways: // +-inf // ilogb: returns INT_MAX // logb: returns +inf -// Nan returns FP_ILOGBNAN (which is either INT_MAX or INT_MIN) +// Nan returns FP_ILOGBNAN (which is either INT_MAX or INT_MIN) // ilogb: returns INT_MAX (7fffffff) -// logb: returns QNAN (quieted SNAN) +// logb: returns QNAN (quietized SNAN) // 0 returns FP_ILOGB0 (which is either INT_MIN or -INT_MAX) -// ilogb: returns INT_MIN (80000000) -// logb: returns -inf - +// ilogb: returns INT_MIN (80000000) +// logb: returns -inf, raises the divide-by-zero exception, +// and calls libm_error_support to set domain error +// // Registers used //============================================================== +// general registers used: +// r26 -> r39 +// r36 -> r39 used as parameters to error path +// +// predicate registers used: +// p6 -> p10 +// floating-point registers used: +// f9, f10, f11 +// f8, input -// general local registers: -// ar.pfs r32 -// r33 -> r37 -// r38 -> r41 used as parameters to error path - -// predicate registers used: -// p6 - x nan, inf -// p7 - x 0 -// p8 - x norm, unorm -// p9 - x unorm - -// floating-point registers used: -// f8 - f10 - -#include "libm_support.h" +rExpBias = r26 +rExpMask = r27 +rSignexp_x = r28 +rExp_x = r29 +rIntMax = r30 +rExp_2to64 = r31 GR_SAVE_PFS = r32 +rTrialResult = r33 GR_SAVE_B0 = r34 GR_SAVE_GP = r35 -GR_Parameter_X = r38 -GR_Parameter_Y = r39 -GR_Parameter_RESULT = r40 -GR_Parameter_TAG = r41 -FR_X =
f8 -FR_Y = f0 -FR_RESULT = f0 +GR_Parameter_X = r36 +GR_Parameter_Y = r37 +GR_Parameter_RESULT = r38 +GR_Parameter_TAG = r39 +fTmp = f9 +fNorm_x = f10 +f2to64 = f11 -ilogb: +.section .text +GLOBAL_LIBM_ENTRY(ilogb) -// Form signexp of 2^64 in case need to scale denormal -{ .mmf - alloc r32=ar.pfs,1,5,4,0 -(p0) mov r37 = 0x1003f -(p0) fnorm f9 = f8 ;; +// X NORMAL +// TrueExp_x = exp(f8) - 0xffff +// r8 = TrueExp_x +{ .mfi + getf.exp rSignexp_x = f8 + fclass.m p8,p0 = f8, 0x0b // Test for x unorm + mov rExpBias = 0xffff // Exponent bias } - -// Form 2^64 in case need to scale denormal { .mfi -(p0) setf.exp f10 = r37 -(p0) fclass.m.unc p7, p8 = f8, 0xe3 -(p0) mov r34 = 0xffff ;; + nop.m 0 + fnorm.s1 fNorm_x = f8 + mov rExpMask = 0x1ffff // Exponent mask } +;; -// qnan snan inf norm unorm 0 -+ -// 1 1 1 0 0 0 11 -// e 3 -// X ZERO, returns INT_MIN -// X INF or NAN, returns INT_MAX +// Form signexp of 2^64 in case need to scale denormal +{ .mfb + mov rExp_2to64 = 0x1003f + fclass.m p6,p9 = f8, 0x1e3 // Test x natval, nan, inf +(p8) br.cond.spnt ILOGB_DENORM // Branch if x unorm +} +;; +ILOGB_COMMON: +// Return here from ILOGB_DENORM { .mfi -(p0) mov r35 = 0x1ffff -(p8) fclass.m.unc p6, p8 = f8, 0x07 - nop.i 999 ;; + and rExp_x = rSignexp_x, rExpMask // Get biased exponent + fclass.m p7,p10 = f8, 0x07 // Test x zero + nop.i 0 } { .mlx - nop.m 999 -(p7) movl r8 = 0x000000007fffffff ;; + nop.m 0 + movl rIntMax = 0x000000007fffffff // Form INT_MAX } +;; -{ .mib - nop.m 999 - nop.i 999 -(p6) br.cond.spnt L(ILOGB_ZERO) ;; -} - -// Test for denormal +.pred.rel "mutex",p6,p9 { .mfi - nop.m 999 -(p8) fclass.m.unc p9, p0 = f9, 0x0b - nop.i 999 ;; +(p9) sub r8 = rExp_x, rExpBias // Get true exponent for normal path +(p6) fma.s0 fTmp = f8, f8, f0 // Dummy to set Invalid flag +(p6) mov r8 = rIntMax // If nan, inf, return INT_MAX +} +{ .mbb + nop.m 0 +(p7) br.cond.spnt ILOGB_ZERO // Branch if x zero +(p10) br.ret.sptk b0 // Exit if x not zero } +;; -L(ILOGB_COMMON): -// X NORMAL 
returns true exponent -{ .mmi - nop.m 999 -(p8) getf.exp r33 = f9 - nop.i 999 ;; + +ILOGB_DENORM: +// Form 2^64 in case need to scale denormal +// Check to see if double-extended denormal +{ .mfi + setf.exp f2to64 = rExp_2to64 + fclass.m p8,p0 = fNorm_x, 0x0b + nop.i 0 } +;; -// If denormal add 64 to exponent bias for scaling -{ .mfb -(p9) add r34 = 64, r34 - nop.f 999 -(p9) br.cond.spnt L(ILOGB_DENORM) ;; +{ .mfi + nop.m 0 + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + nop.i 0 } +;; -{ .mmi -(p8) and r36 = r35, r33 - nop.m 999 - nop.i 999 ;; +// If double-extended denormal add 64 to exponent bias for scaling +// If double-extended denormal form x * 2^64 which is normal +{ .mfi +(p8) add rExpBias = 64, rExpBias +(p8) fmpy.s1 fNorm_x = fNorm_x, f2to64 + nop.i 0 } +;; +// Logic is the same as normal path but use normalized input { .mib -(p8) sub r8 = r36, r34 - nop.i 999 -(p0) br.ret.sptk b0 ;; + getf.exp rSignexp_x = fNorm_x + nop.i 0 + br.cond.sptk ILOGB_COMMON // Return to main path } +;; -L(ILOGB_DENORM): -// Here if x denormal -// Form x * 2^64 which is normal -// Return to common code -{ .mfb - cmp.eq p8,p9 = r0,r0 - fmpy f9 = f9, f10 - br.cond.sptk L(ILOGB_COMMON) ;; +ILOGB_ZERO: +// Here if x zero +// Return INT_MIN, call error support + +{ .mlx + alloc r32=ar.pfs,1,3,4,0 + movl rTrialResult = 0x0000000080000000 +} +{ .mib + mov GR_Parameter_TAG = 157 // Error code + nop.i 0 + br.cond.sptk __libm_error_region // Call error support } +;; -// X ZERO -// return INT_MIN, call error support -L(ILOGB_ZERO): -{.mlx - mov GR_Parameter_TAG = 157 -(p6) movl r33 = 0x0000000080000000 ;; -};; -.endp ilogb -ASM_SIZE_DIRECTIVE(ilogb) +GLOBAL_LIBM_END(ilogb) -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue + { .mfi - add GR_Parameter_Y=-32,sp // Parameter 2 value + add GR_Parameter_Y=-32,sp // Parameter 2 value nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // 
Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + add sp=-64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP=gp // Save gp };; + { .mmi - stfd [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; + .body { .mib - stfd [GR_Parameter_X] = FR_X // Store Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y - nop.b 0 // Parameter 3 address + stfd [GR_Parameter_X] = f8 // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 } { .mib - stfd [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + stfd [GR_Parameter_Y] = f9 // Store Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; + { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; + { .mmi - mov r8 = r33 // Store result + mov r8 = rTrialResult .restore sp add sp = 64,sp // Restore stack pointer mov b0 = GR_SAVE_B0 // Restore return address };; + { .mib mov gp = GR_SAVE_GP // Restore gp mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - br.ret.sptk b0 // Return + br.ret.sptk b0 };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) + .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_ilogbf.S b/sysdeps/ia64/fpu/s_ilogbf.S index ffa6d3b672..1b6ade6148 100644 --- a/sysdeps/ia64/fpu/s_ilogbf.S +++ b/sysdeps/ia64/fpu/s_ilogbf.S @@ -1,10 +1,10 @@ .file "ilogbf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights 
reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,234 +20,248 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/03/00 Initial version -// 5/26/00 Fix bug when x a double-extended denormal; +// 02/03/00 Initial version +// 05/26/00 Fix bug when x a double-extended denormal; // if x=0 call error routine, per C9X -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. -// 1/20/01 Fixed result for x=0 - -.align 32 -.global ilogbf# - -.section .text -.proc ilogbf# -.align 32 - +// 01/20/01 Fixed result for x=0 +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/20/03 Improved performance +// // API //============================================================== -// int = ilogbf(float) - +// int ilogbf( float x ); +// // Overview of operation //============================================================== -// ilogbf computes log2(x) as an int +// The ilogbf function extracts the exponent of x as an integer // and returns it in r8 - -// ilogbf is similar to logbf but differs in the following ways: +// +// ilogbf is similar to logbf but differs in the following ways: // +-inf // ilogbf: returns INT_MAX // logbf: returns +inf -// Nan returns FP_ILOGBNAN (which is either INT_MAX or INT_MIN) +// Nan returns FP_ILOGBNAN (which is either INT_MAX or INT_MIN) // ilogbf: returns INT_MAX (7fffffff) -// logbf: returns QNAN (quieted SNAN) +// logbf: returns QNAN (quietized SNAN) // 0 returns FP_ILOGB0 (which is either INT_MIN or -INT_MAX) -// ilogbf: returns INT_MIN (80000000) -// logbf: returns -inf - +// ilogbf: returns INT_MIN
(80000000) +// logbf: returns -inf, raises the divide-by-zero exception, +// and calls libm_error_support to set domain error +// // Registers used //============================================================== +// general registers used: +// r26 -> r39 +// r36 -> r39 used as parameters to error path +// +// predicate registers used: +// p6 -> p10 +// floating-point registers used: +// f9, f10, f11 +// f8, input -// general local registers: -// ar.pfs r32 -// r33 -> r37 -// r38 -> r41 used as parameters to error path - -// predicate registers used: -// p6 - x nan, inf -// p7 - x 0 -// p8 - x norm, unorm -// p9 - x unorm - -// floating-point registers used: -// f8 - f10 - -#include "libm_support.h" +rExpBias = r26 +rExpMask = r27 +rSignexp_x = r28 +rExp_x = r29 +rIntMax = r30 +rExp_2to64 = r31 GR_SAVE_PFS = r32 +rTrialResult = r33 GR_SAVE_B0 = r34 GR_SAVE_GP = r35 -GR_Parameter_X = r38 -GR_Parameter_Y = r39 -GR_Parameter_RESULT = r40 -GR_Parameter_TAG = r41 -FR_X = f8 -FR_Y = f0 -FR_RESULT = f0 +GR_Parameter_X = r36 +GR_Parameter_Y = r37 +GR_Parameter_RESULT = r38 +GR_Parameter_TAG = r39 +fTmp = f9 +fNorm_x = f10 +f2to64 = f11 -ilogbf: +.section .text +GLOBAL_LIBM_ENTRY(ilogbf) -// Form signexp of 2^64 in case need to scale denormal -{ .mmf - alloc r32=ar.pfs,1,5,4,0 -(p0) mov r37 = 0x1003f -(p0) fnorm f9 = f8 ;; +// X NORMAL +// TrueExp_x = exp(f8) - 0xffff +// r8 = TrueExp_x +{ .mfi + getf.exp rSignexp_x = f8 + fclass.m p8,p0 = f8, 0x0b // Test for x unorm + mov rExpBias = 0xffff // Exponent bias } - -// Form 2^64 in case need to scale denormal { .mfi -(p0) setf.exp f10 = r37 -(p0) fclass.m.unc p7, p8 = f8, 0xe3 -(p0) mov r34 = 0xffff ;; + nop.m 0 + fnorm.s1 fNorm_x = f8 + mov rExpMask = 0x1ffff // Exponent mask } +;; -// qnan snan inf norm unorm 0 -+ -// 1 1 1 0 0 0 11 -// e 3 -// X ZERO, returns INT_MIN -// X INF or NAN, returns INT_MAX +// Form signexp of 2^64 in case need to scale denormal +{ .mfb + mov rExp_2to64 = 0x1003f + fclass.m p6,p9 = f8, 0x1e3 //
Test x natval, nan, inf +(p8) br.cond.spnt ILOGB_DENORM // Branch if x unorm +} +;; +ILOGB_COMMON: +// Return here from ILOGB_DENORM { .mfi -(p0) mov r35 = 0x1ffff -(p8) fclass.m.unc p6, p8 = f8, 0x07 - nop.i 999 ;; + and rExp_x = rSignexp_x, rExpMask // Get biased exponent + fclass.m p7,p10 = f8, 0x07 // Test x zero + nop.i 0 } { .mlx - nop.m 999 -(p7) movl r8 = 0x000000007fffffff ;; + nop.m 0 + movl rIntMax = 0x000000007fffffff // Form INT_MAX } +;; -{ .mib - nop.m 999 - nop.i 999 -(p6) br.cond.spnt L(ILOGB_ZERO) ;; -} - -// Test for denormal +.pred.rel "mutex",p6,p9 { .mfi - nop.m 999 -(p8) fclass.m.unc p9, p0 = f9, 0x0b - nop.i 999 ;; +(p9) sub r8 = rExp_x, rExpBias // Get true exponent for normal path +(p6) fma.s0 fTmp = f8, f8, f0 // Dummy to set Invalid flag +(p6) mov r8 = rIntMax // If nan, inf, return INT_MAX +} +{ .mbb + nop.m 0 +(p7) br.cond.spnt ILOGB_ZERO // Branch if x zero +(p10) br.ret.sptk b0 // Exit if x not zero } +;; -L(ILOGB_COMMON): -// X NORMAL returns true exponent -{ .mmi - nop.m 999 -(p8) getf.exp r33 = f9 - nop.i 999 ;; + +ILOGB_DENORM: +// Form 2^64 in case need to scale denormal +// Check to see if double-extended denormal +{ .mfi + setf.exp f2to64 = rExp_2to64 + fclass.m p8,p0 = fNorm_x, 0x0b + nop.i 0 } +;; -// If denormal add 64 to exponent bias for scaling -{ .mfb -(p9) add r34 = 64, r34 - nop.f 999 -(p9) br.cond.spnt L(ILOGB_DENORM) ;; +{ .mfi + nop.m 0 + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + nop.i 0 } +;; -{ .mmi -(p8) and r36 = r35, r33 - nop.m 999 - nop.i 999 ;; +// If double-extended denormal add 64 to exponent bias for scaling +// If double-extended denormal form x * 2^64 which is normal +{ .mfi +(p8) add rExpBias = 64, rExpBias +(p8) fmpy.s1 fNorm_x = fNorm_x, f2to64 + nop.i 0 } +;; +// Logic is the same as normal path but use normalized input { .mib -(p8) sub r8 = r36, r34 - nop.i 999 -(p0) br.ret.sptk b0 ;; + getf.exp rSignexp_x = fNorm_x + nop.i 0 + br.cond.sptk ILOGB_COMMON // Return to main path } 
+;; -L(ILOGB_DENORM): -// Here if x denormal -// Form x * 2^64 which is normal -// Return to common code -{ .mfb - cmp.eq p8,p9 = r0,r0 - fmpy f9 = f9, f10 - br.cond.sptk L(ILOGB_COMMON) ;; +ILOGB_ZERO: +// Here if x zero +// Return INT_MIN, call error support + +{ .mlx + alloc r32=ar.pfs,1,3,4,0 + movl rTrialResult = 0x0000000080000000 +} +{ .mib + mov GR_Parameter_TAG = 158 // Error code + nop.i 0 + br.cond.sptk __libm_error_region // Call error support } +;; -// X ZERO -// return INT_MIN, call error support -L(ILOGB_ZERO): -{.mlx - mov GR_Parameter_TAG = 158 -(p6) movl r33 = 0x0000000080000000 ;; -};; -.endp ilogbf -ASM_SIZE_DIRECTIVE(ilogbf) +GLOBAL_LIBM_END(ilogbf) -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue + { .mfi - add GR_Parameter_Y=-32,sp // Parameter 2 value + add GR_Parameter_Y=-32,sp // Parameter 2 value nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + add sp=-64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP=gp // Save gp };; + { .mmi - stfs [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + stfs [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; + .body { .mib - stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y - nop.b 0 // Parameter 3 address + stfs [GR_Parameter_X] = f8 // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 } { .mib - stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + stfs [GR_Parameter_Y] = f9 // Store Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // 
Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; + { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; + { .mmi - mov r8 = r33 // Store result + mov r8 = rTrialResult .restore sp add sp = 64,sp // Restore stack pointer mov b0 = GR_SAVE_B0 // Restore return address };; + { .mib mov gp = GR_SAVE_GP // Restore gp mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - br.ret.sptk b0 // Return + br.ret.sptk b0 };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) + .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_ilogbl.S b/sysdeps/ia64/fpu/s_ilogbl.S index 240da060bf..e462fb706e 100644 --- a/sysdeps/ia64/fpu/s_ilogbl.S +++ b/sysdeps/ia64/fpu/s_ilogbl.S @@ -1,10 +1,10 @@ .file "ilogbl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,234 +20,248 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/03/00 Initial version -// 5/26/00 Fix bug when x a double-extended denormal; +// 02/03/00 Initial version +// 05/26/00 Fix bug when x a double-extended denormal; // if x=0 call error routine, per C9X -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. 
-// 1/20/01 Fixed result for x=0 - -.align 32 -.global ilogbl# - -.section .text -.proc ilogbl# -.align 32 - +// 01/20/01 Fixed result for x=0 +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/20/03 Improved performance +// // API //============================================================== -// int = ilogbl(double_extended) - +// int ilogbl( long double x ); +// // Overview of operation //============================================================== -// ilogbl computes log2(x) as an int +// The ilogbl function extracts the exponent of x as an integer // and returns it in r8 - -// ilogbl is similar to logbl but differs in the following ways: +// +// ilogbl is similar to logbl but differs in the following ways: // +-inf // ilogbl: returns INT_MAX // logbl: returns +inf -// Nan returns FP_ILOGBNAN (which is either INT_MAX or INT_MIN) +// Nan returns FP_ILOGBNAN (which is either INT_MAX or INT_MIN) // ilogbl: returns INT_MAX (7fffffff) -// logbl: returns QNAN (quieted SNAN) +// logbl: returns QNAN (quietized SNAN) // 0 returns FP_ILOGB0 (which is either INT_MIN or -INT_MAX) -// ilogbl: returns INT_MIN (80000000) -// logbl: returns -inf - +// ilogbl: returns INT_MIN (80000000) +// logbl: returns -inf, raises the divide-by-zero exception, +// and calls libm_error_support to set domain error +// // Registers used //============================================================== +// general registers used: +// r26 -> r39 +// r36 -> r39 used as parameters to error path +// +// predicate registers used: +// p6 -> p10 +// floating-point registers used: +// f9, f10, f11 +// f8, input -// general local registers: -// ar.pfs r32 -// r33 -> r37 -// r38 -> r41 used as parameters to error path - -// predicate registers used: -// p6 - x nan, inf -// p7 - x 0 -// p8 - x norm, unorm -// p9 - x unorm - -// floating-point registers used: -// f8 - f10 - -#include "libm_support.h" +rExpBias = r26 +rExpMask = r27 +rSignexp_x = r28 +rExp_x = r29 +rIntMax = r30 +rExp_2to64 = r31
GR_SAVE_PFS = r32 +rTrialResult = r33 GR_SAVE_B0 = r34 GR_SAVE_GP = r35 -GR_Parameter_X = r38 -GR_Parameter_Y = r39 -GR_Parameter_RESULT = r40 -GR_Parameter_TAG = r41 -FR_X = f8 -FR_Y = f0 -FR_RESULT = f0 +GR_Parameter_X = r36 +GR_Parameter_Y = r37 +GR_Parameter_RESULT = r38 +GR_Parameter_TAG = r39 +fTmp = f9 +fNorm_x = f10 +f2to64 = f11 -ilogbl: +.section .text +GLOBAL_LIBM_ENTRY(ilogbl) -// Form signexp of 2^64 in case need to scale denormal -{ .mmf - alloc r32=ar.pfs,1,5,4,0 -(p0) mov r37 = 0x1003f -(p0) fnorm f9 = f8 ;; +// X NORMAL +// TrueExp_x = exp(f8) - 0xffff +// r8 = TrueExp_x +{ .mfi + getf.exp rSignexp_x = f8 + fclass.m p8,p0 = f8, 0x0b // Test for x unorm + mov rExpBias = 0xffff // Exponent bias } - -// Form 2^64 in case need to scale denormal { .mfi -(p0) setf.exp f10 = r37 -(p0) fclass.m.unc p7, p8 = f8, 0xe3 -(p0) mov r34 = 0xffff ;; + nop.m 0 + fnorm.s1 fNorm_x = f8 + mov rExpMask = 0x1ffff // Exponent mask } +;; -// qnan snan inf norm unorm 0 -+ -// 1 1 1 0 0 0 11 -// e 3 -// X ZERO, returns INT_MIN -// X INF or NAN, returns INT_MAX +// Form signexp of 2^64 in case need to scale denormal +{ .mfb + mov rExp_2to64 = 0x1003f + fclass.m p6,p9 = f8, 0x1e3 // Test x natval, nan, inf +(p8) br.cond.spnt ILOGB_DENORM // Branch if x unorm +} +;; +ILOGB_COMMON: +// Return here from ILOGB_DENORM { .mfi -(p0) mov r35 = 0x1ffff -(p8) fclass.m.unc p6, p8 = f8, 0x07 - nop.i 999 ;; + and rExp_x = rSignexp_x, rExpMask // Get biased exponent + fclass.m p7,p10 = f8, 0x07 // Test x zero + nop.i 0 } { .mlx - nop.m 999 -(p7) movl r8 = 0x000000007fffffff ;; + nop.m 0 + movl rIntMax = 0x000000007fffffff // Form INT_MAX } +;; -{ .mib - nop.m 999 - nop.i 999 -(p6) br.cond.spnt L(ILOGB_ZERO) ;; -} - -// Test for denormal +.pred.rel "mutex",p6,p9 { .mfi - nop.m 999 -(p8) fclass.m.unc p9, p0 = f9, 0x0b - nop.i 999 ;; +(p9) sub r8 = rExp_x, rExpBias // Get true exponent for normal path +(p6) fma.s0 fTmp = f8, f8, f0 // Dummy to set Invalid flag +(p6) mov r8 = rIntMax // If 
nan, inf, return INT_MAX +} +{ .mbb + nop.m 0 +(p7) br.cond.spnt ILOGB_ZERO // Branch if x zero +(p10) br.ret.sptk b0 // Exit if x not zero } +;; -L(ILOGB_COMMON): -// X NORMAL returns true exponent -{ .mmi - nop.m 999 -(p8) getf.exp r33 = f9 - nop.i 999 ;; + +ILOGB_DENORM: +// Form 2^64 in case need to scale denormal +// Check to see if double-extended denormal +{ .mfi + setf.exp f2to64 = rExp_2to64 + fclass.m p8,p0 = fNorm_x, 0x0b + nop.i 0 } +;; -// If denormal add 64 to exponent bias for scaling -{ .mfb -(p9) add r34 = 64, r34 - nop.f 999 -(p9) br.cond.spnt L(ILOGB_DENORM) ;; +{ .mfi + nop.m 0 + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + nop.i 0 } +;; -{ .mmi -(p8) and r36 = r35, r33 - nop.m 999 - nop.i 999 ;; +// If double-extended denormal add 64 to exponent bias for scaling +// If double-extended denormal form x * 2^64 which is normal +{ .mfi +(p8) add rExpBias = 64, rExpBias +(p8) fmpy.s1 fNorm_x = fNorm_x, f2to64 + nop.i 0 } +;; +// Logic is the same as normal path but use normalized input { .mib -(p8) sub r8 = r36, r34 - nop.i 999 -(p0) br.ret.sptk b0 ;; + getf.exp rSignexp_x = fNorm_x + nop.i 0 + br.cond.sptk ILOGB_COMMON // Return to main path } +;; -L(ILOGB_DENORM): -// Here if x denormal -// Form x * 2^64 which is normal -// Return to common code -{ .mfb - cmp.eq p8,p9 = r0,r0 - fmpy f9 = f9, f10 - br.cond.sptk L(ILOGB_COMMON) ;; +ILOGB_ZERO: +// Here if x zero +// Return INT_MIN, call error support + +{ .mlx + alloc r32=ar.pfs,1,3,4,0 + movl rTrialResult = 0x0000000080000000 +} +{ .mib + mov GR_Parameter_TAG = 156 // Error code + nop.i 0 + br.cond.sptk __libm_error_region // Call error support } +;; -// X ZERO -// return INT_MIN, call error support -L(ILOGB_ZERO): -{.mlx - mov GR_Parameter_TAG = 156 -(p6) movl r33 = 0x0000000080000000 ;; -};; -.endp ilogbl -ASM_SIZE_DIRECTIVE(ilogbl) +GLOBAL_LIBM_END(ilogbl) -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue + { .mfi - add 
GR_Parameter_Y=-32,sp // Parameter 2 value + add GR_Parameter_Y=-32,sp // Parameter 2 value nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + add sp=-64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP=gp // Save gp };; + { .mmi - stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + stfe [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; + .body { .mib - stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y - nop.b 0 // Parameter 3 address + stfe [GR_Parameter_X] = f8 // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 } { .mib - stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + stfe [GR_Parameter_Y] = f9 // Store Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; + { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; + { .mmi - mov r8 = r33 // Store result + mov r8 = rTrialResult .restore sp add sp = 64,sp // Restore stack pointer mov b0 = GR_SAVE_B0 // Restore return address };; + { .mib mov gp = GR_SAVE_GP // Restore gp mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - br.ret.sptk b0 // Return + br.ret.sptk b0 };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) + .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_ldexp.S b/sysdeps/ia64/fpu/s_ldexp.S deleted file mode 100644 index 4dcd671c9f..0000000000 --- 
a/sysdeps/ia64/fpu/s_ldexp.S +++ /dev/null @@ -1,380 +0,0 @@ -.file "ldexp.s" - -// Copyright (C) 2000, 2001, Intel Corporation -// All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// * The name of Intel Corporation may not be used to endorse or promote -// products derived from this software without specific prior written -// permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. 
-// -// History -//============================================================== -// 2/02/00 Initial version -// 1/26/01 ldex pcompletely reworked and now standalone version -// -// API -//============================================================== -// double = ldexp (double x, int n) -// input floating point f8 and int n (r33) -// output floating point f8 -// -// Returns x* 2**n using an fma and detects overflow -// and underflow. -// -// - -#include "libm_support.h" - -FR_Big = f6 -FR_NBig = f7 -FR_Floating_X = f8 -FR_Result = f8 -FR_Result2 = f9 -FR_Result3 = f11 -FR_Norm_X = f12 -FR_Two_N = f14 -FR_Two_to_Big = f15 - -GR_N_Biased = r15 -GR_Big = r16 -GR_NBig = r17 -GR_Scratch = r18 -GR_Scratch1 = r19 -GR_Bias = r20 -GR_N_as_int = r21 - -GR_SAVE_B0 = r32 -GR_SAVE_GP = r33 -GR_SAVE_PFS = r34 -GR_Parameter_X = r35 -GR_Parameter_Y = r36 -GR_Parameter_RESULT = r37 -GR_Tag = r38 - -.align 32 -.global ldexp - -.section .text -.proc ldexp -.align 32 - -ldexp: - -// -// Is x NAN, INF, ZERO, +-? -// Build the exponent Bias -// -{ .mfi - alloc r32=ar.pfs,1,2,4,0 - fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero - addl GR_Bias = 0x0FFFF,r0 -} - -// -// Sign extend input -// Is N zero? -// Normalize x -// -{ .mfi - cmp.eq.unc p6,p0 = r33,r0 - fnorm.s1 FR_Norm_X = FR_Floating_X - sxt4 GR_N_as_int = r33 -} -;; - -// -// Normalize x -// Branch and return special values. 
-// Create -35000 -// Create 35000 -// -{ .mfi - addl GR_Big = 35000,r0 - nop.f 0 - add GR_N_Biased = GR_Bias,GR_N_as_int -} -{ .mfb - addl GR_NBig = -35000,r0 -(p7) fma.d.s0 FR_Result = FR_Floating_X,f1, f0 -(p7) br.ret.spnt b0 -};; - -// -// Build the exponent Bias -// Return x when N = 0 -// -{ .mfi - setf.exp FR_Two_N = GR_N_Biased - nop.f 0 - addl GR_Scratch1 = 0x063BF,r0 -} -{ .mfb - addl GR_Scratch = 0x019C3F,r0 -(p6) fma.d.s0 FR_Result = FR_Floating_X,f1, f0 -(p6) br.ret.spnt b0 -};; - -// -// Create 2*big -// Create 2**-big -// Is N > 35000 -// Is N < -35000 -// Raise Denormal operand flag with compare -// Main path, create 2**N -// -{ .mfi - setf.exp FR_NBig = GR_Scratch1 - nop.f 0 - cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big -} -{ .mfi - setf.exp FR_Big = GR_Scratch - fcmp.ge.s0 p0,p11 = FR_Floating_X,f0 - cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig -};; - -// -// Adjust 2**N if N was very small or very large -// -{ .mfi - nop.m 0 -(p6) fma.s1 FR_Two_N = FR_Big,f1,f0 - nop.i 0 -} -{ .mlx - nop.m 999 -(p0) movl GR_Scratch = 0x00000000000303FF -};; - - -{ .mfi - nop.m 0 -(p8) fma.s1 FR_Two_N = FR_NBig,f1,f0 - nop.i 0 -} -{ .mlx - nop.m 999 -(p0) movl GR_Scratch1= 0x00000000000103FF -};; - -// Set up necessary status fields -// -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// S3 user supplied status + FZ + TD (Underflows) -// -{ .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x41 - nop.i 999 -} -{ .mfi - nop.m 999 -(p0) fsetc.s2 0x7F,0x42 - nop.i 999 -};; - -// -// Do final operation -// -{ .mfi - setf.exp FR_NBig = GR_Scratch - fma.d.s0 FR_Result = FR_Two_N,FR_Norm_X,f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.d.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0 - nop.i 999 -};; -{ .mfi - setf.exp FR_Big = GR_Scratch1 - fma.d.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0 - nop.i 999 -};; - -// -// Check for overflow or underflow. 
-// Restore s3 -// Restore s2 -// -{ .mfi - nop.m 0 - fsetc.s3 0x7F,0x40 - nop.i 999 -} -{ .mfi - nop.m 0 - fsetc.s2 0x7F,0x40 - nop.i 999 -};; - -// -// Is the result zero? -// -{ .mfi - nop.m 999 - fclass.m.unc p6, p0 = FR_Result3, 0x007 - nop.i 999 -} -{ .mfi - addl GR_Tag = 146, r0 - fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big - nop.i 0 -};; - -// -// Detect masked underflow - Tiny + Inexact Only -// -{ .mfi - nop.m 999 -(p6) fcmp.neq.unc.s1 p6, p0 = FR_Result , FR_Result2 - nop.i 999 -};; - -// -// Is result bigger the allowed range? -// Branch out for underflow -// -{ .mfb -(p6) addl GR_Tag = 147, r0 -(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig -(p6) br.cond.spnt L(LDEXP_UNDERFLOW) -};; - -// -// Branch out for overflow -// -{ .mbb - nop.m 0 -(p7) br.cond.spnt L(LDEXP_OVERFLOW) -(p9) br.cond.spnt L(LDEXP_OVERFLOW) -};; - -// -// Return from main path. -// -{ .mfb - nop.m 999 - nop.f 0 - br.ret.sptk b0;; -} - -.endp ldexp -ASM_SIZE_DIRECTIVE(ldexp) -.proc __libm_error_region -__libm_error_region: - -L(LDEXP_OVERFLOW): -L(LDEXP_UNDERFLOW): - -// -// Get stack address of N -// -.prologue -{ .mfi - add GR_Parameter_Y=-32,sp - nop.f 0 -.save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs -} -// -// Adjust sp -// -{ .mfi -.fframe 64 - add sp=-64,sp - nop.f 0 - mov GR_SAVE_GP=gp -};; - -// -// Store N on stack in correct position -// Locate the address of x on stack -// -{ .mmi - st8 [GR_Parameter_Y] = GR_N_as_int,16 - add GR_Parameter_X = 16,sp -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 -};; - -// -// Store x on the stack. -// Get address for result on stack. 
-// -.body -{ .mib - stfd [GR_Parameter_X] = FR_Norm_X - add GR_Parameter_RESULT = 0,GR_Parameter_Y - nop.b 0 -} -{ .mib - stfd [GR_Parameter_Y] = FR_Result - add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# -};; - -// -// Get location of result on stack -// -{ .mmi - nop.m 0 - nop.m 0 - add GR_Parameter_RESULT = 48,sp -};; - -// -// Get the new result -// -{ .mmi - ldfd FR_Result = [GR_Parameter_RESULT] -.restore sp - add sp = 64,sp - mov b0 = GR_SAVE_B0 -};; - -// -// Restore gp, ar.pfs and return -// -{ .mib - mov gp = GR_SAVE_GP - mov ar.pfs = GR_SAVE_PFS - br.ret.sptk b0 -};; - -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) - -.type __libm_error_support#,@function -.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_ldexpf.S b/sysdeps/ia64/fpu/s_ldexpf.S deleted file mode 100644 index 36f0111fe1..0000000000 --- a/sysdeps/ia64/fpu/s_ldexpf.S +++ /dev/null @@ -1,379 +0,0 @@ -//.file "ldexpf.s" - -// Copyright (C) 2000, 2001, Intel Corporation -// All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// * The name of Intel Corporation may not be used to endorse or promote -// products derived from this software without specific prior written -// permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. -// -// History -//============================================================== -// 2/02/00 Initial version -// 1/26/01 ldexpf completely reworked and now standalone version -// -// API -//============================================================== -// float = ldexpf (float x, int n) -// input floating point f8 and int n (r33) -// output floating point f8 -// -// Returns x* 2**n using an fma and detects overflow -// and underflow. -// -// - -#include "libm_support.h" - -FR_Big = f6 -FR_NBig = f7 -FR_Floating_X = f8 -FR_Result = f8 -FR_Result2 = f9 -FR_Result3 = f11 -FR_Norm_X = f12 -FR_Two_N = f14 -FR_Two_to_Big = f15 - -GR_N_Biased = r15 -GR_Big = r16 -GR_NBig = r17 -GR_Scratch = r18 -GR_Scratch1 = r19 -GR_Bias = r20 -GR_N_as_int = r21 - -GR_SAVE_B0 = r32 -GR_SAVE_GP = r33 -GR_SAVE_PFS = r34 -GR_Parameter_X = r35 -GR_Parameter_Y = r36 -GR_Parameter_RESULT = r37 -GR_Tag = r38 - -.align 32 -.global ldexpf - -.section .text -.proc ldexpf -.align 32 - -ldexpf: - -// -// Is x NAN, INF, ZERO, +-? 
-// Build the exponent Bias -// -{ .mfi - alloc r32=ar.pfs,1,2,4,0 - fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero - addl GR_Bias = 0x0FFFF,r0 -} - -// -// Sign extend input -// Is N zero? -// Normalize x -// -{ .mfi - cmp.eq.unc p6,p0 = r33,r0 - fnorm.s1 FR_Norm_X = FR_Floating_X - sxt4 GR_N_as_int = r33 -} -;; - -// -// Normalize x -// Branch and return special values. -// Create -35000 -// Create 35000 -// -{ .mfi - addl GR_Big = 35000,r0 - nop.f 0 - add GR_N_Biased = GR_Bias,GR_N_as_int -} -{ .mfb - addl GR_NBig = -35000,r0 -(p7) fma.s.s0 FR_Result = FR_Floating_X,f1, f0 -(p7) br.ret.spnt b0 -};; - -// -// Build the exponent Bias -// Return x when N = 0 -// -{ .mfi - setf.exp FR_Two_N = GR_N_Biased - nop.f 0 - addl GR_Scratch1 = 0x063BF,r0 -} -{ .mfb - addl GR_Scratch = 0x019C3F,r0 -(p6) fma.s.s0 FR_Result = FR_Floating_X,f1, f0 -(p6) br.ret.spnt b0 -};; - -// -// Create 2*big -// Create 2**-big -// Is N > 35000 -// Is N < -35000 -// Raise Denormal operand flag with compare -// Main path, create 2**N -// -{ .mfi - setf.exp FR_NBig = GR_Scratch1 - nop.f 0 - cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big -} -{ .mfi - setf.exp FR_Big = GR_Scratch - fcmp.ge.s0 p0,p11 = FR_Floating_X,f0 - cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig -};; - -// -// Adjust 2**N if N was very small or very large -// -{ .mfi - nop.m 0 -(p6) fma.s1 FR_Two_N = FR_Big,f1,f0 - nop.i 0 -} -{ .mlx - nop.m 999 -(p0) movl GR_Scratch = 0x000000000003007F -};; - - -{ .mfi - nop.m 0 -(p8) fma.s1 FR_Two_N = FR_NBig,f1,f0 - nop.i 0 -} -{ .mlx - nop.m 999 -(p0) movl GR_Scratch1= 0x000000000001007F -};; - -// Set up necessary status fields -// -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// S3 user supplied status + FZ + TD (Underflows) -// -{ .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x41 - nop.i 999 -} -{ .mfi - nop.m 999 -(p0) fsetc.s2 0x7F,0x42 - nop.i 999 -};; - -// -// Do final operation -// -{ .mfi - setf.exp FR_NBig = GR_Scratch - fma.s.s0 FR_Result 
= FR_Two_N,FR_Norm_X,f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0 - nop.i 999 -};; -{ .mfi - setf.exp FR_Big = GR_Scratch1 - fma.s.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0 - nop.i 999 -};; - -// Check for overflow or underflow. -// Restore s3 -// Restore s2 -// -{ .mfi - nop.m 0 - fsetc.s3 0x7F,0x40 - nop.i 999 -} -{ .mfi - nop.m 0 - fsetc.s2 0x7F,0x40 - nop.i 999 -};; - -// -// Is the result zero? -// -{ .mfi - nop.m 999 - fclass.m.unc p6, p0 = FR_Result3, 0x007 - nop.i 999 -} -{ .mfi - addl GR_Tag = 148, r0 - fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big - nop.i 0 -};; - -// -// Detect masked underflow - Tiny + Inexact Only -// -{ .mfi - nop.m 999 -(p6) fcmp.neq.unc.s1 p6, p0 = FR_Result , FR_Result2 - nop.i 999 -};; - -// -// Is result bigger the allowed range? -// Branch out for underflow -// -{ .mfb -(p6) addl GR_Tag = 149, r0 -(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig -(p6) br.cond.spnt L(ldexpf_UNDERFLOW) -};; - -// -// Branch out for overflow -// -{ .mbb - nop.m 0 -(p7) br.cond.spnt L(ldexpf_OVERFLOW) -(p9) br.cond.spnt L(ldexpf_OVERFLOW) -};; - -// -// Return from main path. -// -{ .mfb - nop.m 999 - nop.f 0 - br.ret.sptk b0;; -} - -.endp ldexpf -ASM_SIZE_DIRECTIVE(ldexpf) -.proc __libm_error_region -__libm_error_region: - -L(ldexpf_OVERFLOW): -L(ldexpf_UNDERFLOW): - -// -// Get stack address of N -// -.prologue -{ .mfi - add GR_Parameter_Y=-32,sp - nop.f 0 -.save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs -} -// -// Adjust sp -// -{ .mfi -.fframe 64 - add sp=-64,sp - nop.f 0 - mov GR_SAVE_GP=gp -};; - -// -// Store N on stack in correct position -// Locate the address of x on stack -// -{ .mmi - st8 [GR_Parameter_Y] = GR_N_as_int,16 - add GR_Parameter_X = 16,sp -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 -};; - -// -// Store x on the stack. -// Get address for result on stack. 
-// -.body -{ .mib - stfs [GR_Parameter_X] = FR_Norm_X - add GR_Parameter_RESULT = 0,GR_Parameter_Y - nop.b 0 -} -{ .mib - stfs [GR_Parameter_Y] = FR_Result - add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# -};; - -// -// Get location of result on stack -// -{ .mmi - nop.m 0 - nop.m 0 - add GR_Parameter_RESULT = 48,sp -};; - -// -// Get the new result -// -{ .mmi - ldfs FR_Result = [GR_Parameter_RESULT] -.restore sp - add sp = 64,sp - mov b0 = GR_SAVE_B0 -};; - -// -// Restore gp, ar.pfs and return -// -{ .mib - mov gp = GR_SAVE_GP - mov ar.pfs = GR_SAVE_PFS - br.ret.sptk b0 -};; - -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) - -.type __libm_error_support#,@function -.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_ldexpl.S b/sysdeps/ia64/fpu/s_ldexpl.S deleted file mode 100644 index fb5d3fd452..0000000000 --- a/sysdeps/ia64/fpu/s_ldexpl.S +++ /dev/null @@ -1,379 +0,0 @@ -//.file "ldexpl.s" - -// Copyright (C) 2000, 2001, Intel Corporation -// All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// * The name of Intel Corporation may not be used to endorse or promote -// products derived from this software without specific prior written -// permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. -// -// History -//============================================================== -// 2/02/00 Initial version -// 1/26/01 ldexpl completely reworked and now standalone version -// -// API -//============================================================== -// double-extended = ldexpl (double-extended x, int n) -// input floating point f8 and int n (r34) -// output floating point f8 -// -// Returns x* 2**n using an fma and detects overflow -// and underflow. 
-// -// - -#include "libm_support.h" - -FR_Big = f6 -FR_NBig = f7 -FR_Floating_X = f8 -FR_Result = f8 -FR_Result2 = f9 -FR_Result3 = f11 -FR_Norm_X = f12 -FR_Two_N = f14 -FR_Two_to_Big = f15 - -GR_N_Biased = r15 -GR_Big = r16 -GR_NBig = r17 -GR_Scratch = r18 -GR_Scratch1 = r19 -GR_Bias = r20 -GR_N_as_int = r21 - -GR_SAVE_B0 = r32 -GR_SAVE_GP = r33 -GR_SAVE_PFS = r34 -GR_Parameter_X = r35 -GR_Parameter_Y = r36 -GR_Parameter_RESULT = r37 -GR_Tag = r38 - -.align 32 -.global ldexpl - -.section .text -.proc ldexpl -.align 32 - -ldexpl: - -// -// Is x NAN, INF, ZERO, +-? -// Build the exponent Bias -// -{ .mfi - alloc r32=ar.pfs,2,1,4,0 - fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero - addl GR_Bias = 0x0FFFF,r0 -} - -// -// Sign extend input -// Is N zero? -// Normalize x -// -{ .mfi - cmp.eq.unc p6,p0 = r34,r0 - fnorm.s1 FR_Norm_X = FR_Floating_X - sxt4 GR_N_as_int = r34 -} -;; - -// -// Normalize x -// Branch and return special values. -// Create -35000 -// Create 35000 -// -{ .mfi - addl GR_Big = 35000,r0 - nop.f 0 - add GR_N_Biased = GR_Bias,GR_N_as_int -} -{ .mfb - addl GR_NBig = -35000,r0 -(p7) fma.s0 FR_Result = FR_Floating_X,f1, f0 -(p7) br.ret.spnt b0 -};; - -// -// Build the exponent Bias -// Return x when N = 0 -// -{ .mfi - setf.exp FR_Two_N = GR_N_Biased - nop.f 0 - addl GR_Scratch1 = 0x063BF,r0 -} -{ .mfb - addl GR_Scratch = 0x019C3F,r0 -(p6) fma.s0 FR_Result = FR_Floating_X,f1, f0 -(p6) br.ret.spnt b0 -};; - -// -// Create 2*big -// Create 2**-big -// Is N > 35000 -// Is N < -35000 -// Raise Denormal operand flag with compare -// Main path, create 2**N -// -{ .mfi - setf.exp FR_NBig = GR_Scratch1 - nop.f 0 - cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big -} -{ .mfi - setf.exp FR_Big = GR_Scratch - fcmp.ge.s0 p0,p11 = FR_Floating_X,f0 - cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig -};; - -// -// Adjust 2**N if N was very small or very large -// -{ .mfi - nop.m 0 -(p6) fma.s1 FR_Two_N = FR_Big,f1,f0 - nop.i 0 -} -{ .mlx - nop.m 999 -(p0) movl 
GR_Scratch = 0x0000000000033FFF -};; - - -{ .mfi - nop.m 0 -(p8) fma.s1 FR_Two_N = FR_NBig,f1,f0 - nop.i 0 -} -{ .mlx - nop.m 999 -(p0) movl GR_Scratch1= 0x0000000000013FFF -};; - -// Set up necessary status fields -// -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// S3 user supplied status + FZ + TD (Underflows) -// -{ .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x41 - nop.i 999 -} -{ .mfi - nop.m 999 -(p0) fsetc.s2 0x7F,0x42 - nop.i 999 -};; - -// -// Do final operation -// -{ .mfi - setf.exp FR_NBig = GR_Scratch - fma.s0 FR_Result = FR_Two_N,FR_Norm_X,f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0 - nop.i 999 -};; -{ .mfi - setf.exp FR_Big = GR_Scratch1 - fma.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0 - nop.i 999 -};; - -// Check for overflow or underflow. -// Restore s3 -// Restore s2 -// -{ .mfi - nop.m 0 - fsetc.s3 0x7F,0x40 - nop.i 999 -} -{ .mfi - nop.m 0 - fsetc.s2 0x7F,0x40 - nop.i 999 -};; - -// -// Is the result zero? -// -{ .mfi - nop.m 999 - fclass.m.unc p6, p0 = FR_Result3, 0x007 - nop.i 999 -} -{ .mfi - addl GR_Tag = 144, r0 - fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big - nop.i 0 -};; - -// -// Detect masked underflow - Tiny + Inexact Only -// -{ .mfi - nop.m 999 -(p6) fcmp.neq.unc.s1 p6, p0 = FR_Result , FR_Result2 - nop.i 999 -};; - -// -// Is result bigger the allowed range? -// Branch out for underflow -// -{ .mfb -(p6) addl GR_Tag = 145, r0 -(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig -(p6) br.cond.spnt L(ldexpl_UNDERFLOW) -};; - -// -// Branch out for overflow -// -{ .mbb - nop.m 0 -(p7) br.cond.spnt L(ldexpl_OVERFLOW) -(p9) br.cond.spnt L(ldexpl_OVERFLOW) -};; - -// -// Return from main path. 
-// -{ .mfb - nop.m 999 - nop.f 0 - br.ret.sptk b0;; -} - -.endp ldexpl -ASM_SIZE_DIRECTIVE(ldexpl) -.proc __libm_error_region -__libm_error_region: - -L(ldexpl_OVERFLOW): -L(ldexpl_UNDERFLOW): - -// -// Get stack address of N -// -.prologue -{ .mfi - add GR_Parameter_Y=-32,sp - nop.f 0 -.save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs -} -// -// Adjust sp -// -{ .mfi -.fframe 64 - add sp=-64,sp - nop.f 0 - mov GR_SAVE_GP=gp -};; - -// -// Store N on stack in correct position -// Locate the address of x on stack -// -{ .mmi - st8 [GR_Parameter_Y] = GR_N_as_int,16 - add GR_Parameter_X = 16,sp -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 -};; - -// -// Store x on the stack. -// Get address for result on stack. -// -.body -{ .mib - stfe [GR_Parameter_X] = FR_Norm_X - add GR_Parameter_RESULT = 0,GR_Parameter_Y - nop.b 0 -} -{ .mib - stfe [GR_Parameter_Y] = FR_Result - add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# -};; - -// -// Get location of result on stack -// -{ .mmi - nop.m 0 - nop.m 0 - add GR_Parameter_RESULT = 48,sp -};; - -// -// Get the new result -// -{ .mmi - ldfe FR_Result = [GR_Parameter_RESULT] -.restore sp - add sp = 64,sp - mov b0 = GR_SAVE_B0 -};; - -// -// Restore gp, ar.pfs and return -// -{ .mib - mov gp = GR_SAVE_GP - mov ar.pfs = GR_SAVE_PFS - br.ret.sptk b0 -};; - -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) - -.type __libm_error_support#,@function -.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_log1p.S b/sysdeps/ia64/fpu/s_log1p.S index 0d96c14a55..cd3551984a 100644 --- a/sysdeps/ia64/fpu/s_log1p.S +++ b/sysdeps/ia64/fpu/s_log1p.S @@ -1,10 +1,10 @@ -.file "log1p.s" +.file "log1p.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. 
+// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,1608 +20,1082 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. 
+// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial version -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial version +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. +// 06/29/01 Improved speed of all paths +// 05/20/02 Cleaned up namespace and sf0 syntax +// 10/02/02 Improved performance by basing on log algorithm +// 02/10/03 Reordered header: .section, .global, .proc, .align +// 04/18/03 Eliminate possible WAW dependency warning // -// ********************************************************************* -// -// Function: log1p(x) = ln(x+1), for double precision x values -// -// ********************************************************************* -// -// Accuracy: Very accurate for double precision values -// -// ********************************************************************* -// -// Resources Used: -// -// Floating-Point Registers: f8 (Input and Return Value) -// f9,f33-f55,f99 -// -// General Purpose Registers: -// r32-r53 -// r54-r57 (Used to pass arguments to error handling routine) -// -// Predicate Registers: p6-p15 -// -// ********************************************************************* -// -// IEEE Special Conditions: -// -// Denormal fault raised on denormal inputs -// Overflow exceptions cannot occur -// Underflow exceptions raised when appropriate for log1p -// (Error Handling Routine called for underflow) -// Inexact raised when appropriate by algorithm -// -// log1p(inf) = inf -// log1p(-inf) = QNaN -// log1p(+/-0) = +/-0 -// log1p(-1) = -inf -// log1p(SNaN) = QNaN -// log1p(QNaN) = QNaN -// log1p(EM_special Values) = QNaN -// -// 
********************************************************************* -// -// Computation is based on the following kernel. -// -// ker_log_64( in_FR : X, -// in_FR : E, -// in_FR : Em1, -// in_GR : Expo_Range, -// out_FR : Y_hi, -// out_FR : Y_lo, -// out_FR : Scale, -// out_PR : Safe ) -// -// Overview -// -// The method consists of three cases. -// -// If |X+Em1| < 2^(-80) use case log1p_small; -// elseif |X+Em1| < 2^(-7) use case log_near1; -// else use case log_regular; -// -// Case log1p_small: -// -// log( 1 + (X+Em1) ) can be approximated by (X+Em1). -// -// Case log_near1: -// -// log( 1 + (X+Em1) ) can be approximated by a simple polynomial -// in W = X+Em1. This polynomial resembles the truncated Taylor -// series W - W^/2 + W^3/3 - ... -// -// Case log_regular: -// -// Here we use a table lookup method. The basic idea is that in -// order to compute log(Arg) for an argument Arg in [1,2), we -// construct a value G such that G*Arg is close to 1 and that -// log(1/G) is obtainable easily from a table of values calculated -// beforehand. Thus -// -// log(Arg) = log(1/G) + log(G*Arg) -// = log(1/G) + log(1 + (G*Arg - 1)) -// -// Because |G*Arg - 1| is small, the second term on the right hand -// side can be approximated by a short polynomial. We elaborate -// this method in four steps. -// -// Step 0: Initialization -// -// We need to calculate log( E + X ). Obtain N, S_hi, S_lo such that -// -// E + X = 2^N * ( S_hi + S_lo ) exactly -// -// where S_hi in [1,2) and S_lo is a correction to S_hi in the sense -// that |S_lo| <= ulp(S_hi). -// -// Step 1: Argument Reduction -// -// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate -// -// G := G_1 * G_2 * G_3 -// r := (G * S_hi - 1) + G * S_lo -// -// These G_j's have the property that the product is exactly -// representable and that |r| < 2^(-12) as a result. -// -// Step 2: Approximation -// -// -// log(1 + r) is approximated by a short polynomial poly(r). 
-// -// Step 3: Reconstruction -// -// -// Finally, log( E + X ) is given by -// -// log( E + X ) = log( 2^N * (S_hi + S_lo) ) -// ~=~ N*log(2) + log(1/G) + log(1 + r) -// ~=~ N*log(2) + log(1/G) + poly(r). -// -// **** Algorithm **** -// -// Case log1p_small: -// -// Although log(1 + (X+Em1)) is basically X+Em1, we would like to -// preserve the inexactness nature as well as consistent behavior -// under different rounding modes. Note that this case can only be -// taken if E is set to be 1.0. In this case, Em1 is zero, and that -// X can be very tiny and thus the final result can possibly underflow. -// Thus, we compare X against a threshold that is dependent on the -// input Expo_Range. If |X| is smaller than this threshold, we set -// SAFE to be FALSE. -// -// The result is returned as Y_hi, Y_lo, and in the case of SAFE -// is FALSE, an additional value Scale is also returned. -// -// W := X + Em1 -// Threshold := Threshold_Table( Expo_Range ) -// Tiny := Tiny_Table( Expo_Range ) -// -// If ( |W| > Threshold ) then -// Y_hi := W -// Y_lo := -W*W -// Else -// Y_hi := W -// Y_lo := -Tiny -// Scale := 2^(-100) -// Safe := FALSE -// EndIf -// -// -// One may think that Y_lo should be -W*W/2; however, it does not matter -// as Y_lo will be rounded off completely except for the correct effect in -// directed rounding. Clearly -W*W is simplier to compute. Moreover, -// because of the difference in exponent value, Y_hi + Y_lo or -// Y_hi + Scale*Y_lo is always inexact. -// -// Case log_near1: -// -// Here we compute a simple polynomial. To exploit parallelism, we split -// the polynomial into two portions. -// -// W := X + Em1 -// Wsq := W * W -// W4 := Wsq*Wsq -// W6 := W4*Wsq -// Y_hi := W + Wsq*(P_1 + W*(P_2 + W*(P_3 + W*P_4)) -// Y_lo := W6*(P_5 + W*(P_6 + W*(P_7 + W*P_8))) -// set lsb(Y_lo) to be 1 -// -// Case log_regular: -// -// We present the algorithm in four steps. -// -// Step 0. 
Initialization -// ---------------------- -// -// Z := X + E -// N := unbaised exponent of Z -// S_hi := 2^(-N) * Z -// S_lo := 2^(-N) * { (max(X,E)-Z) + min(X,E) } -// -// Note that S_lo is always 0 for the case E = 0. -// -// Step 1. Argument Reduction -// -------------------------- -// -// Let -// -// Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63 -// -// We obtain G_1, G_2, G_3 by the following steps. -// +// API +//============================================================== +// double log1p(double) // -// Define X_0 := 1.d_1 d_2 ... d_14. This is extracted -// from S_hi. +// log1p(x) = log(x+1) // -// Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated -// to lsb = 2^(-4). +// Overview of operation +//============================================================== +// Background +// ---------- // -// Define index_1 := [ d_1 d_2 d_3 d_4 ]. +// This algorithm is based on fact that +// log1p(x) = log(1+x) and +// log(a b) = log(a) + log(b). +// In our case we have 1+x = 2^N f, where 1 <= f < 2. +// So +// log(1+x) = log(2^N f) = log(2^N) + log(f) = n*log(2) + log(f) // -// Fetch Z_1 := (1/A_1) rounded UP in fixed point with -// fixed point lsb = 2^(-15). -// Z_1 looks like z_0.z_1 z_2 ... z_15 -// Note that the fetching is done using index_1. -// A_1 is actually not needed in the implementation -// and is used here only to explain how is the value -// Z_1 defined. +// To calculate log(f) we do following +// log(f) = log(f * frcpa(f) / frcpa(f)) = +// = log(f * frcpa(f)) + log(1/frcpa(f)) // -// Fetch G_1 := (1/A_1) truncated to 21 sig. bits. -// floating pt. Again, fetching is done using index_1. A_1 -// explains how G_1 is defined. +// According to definition of IA-64's frcpa instruction it's a +// floating point that approximates 1/f using a lookup on the +// top of 8 bits of the input number's + 1 significand with relative +// error < 2^(-8.886). So we have following // -// Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14) -// = 1.0 0 0 0 d_5 ... 
d_14 -// This is accomplised by integer multiplication. -// It is proved that X_1 indeed always begin -// with 1.0000 in fixed point. +// |(1/f - frcpa(f)) / (1/f))| = |1 - f*frcpa(f)| < 1/256 // +// and // -// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1 -// truncated to lsb = 2^(-8). Similar to A_1, -// A_2 is not needed in actual implementation. It -// helps explain how some of the values are defined. +// log(f) = log(f * frcpa(f)) + log(1/frcpa(f)) = +// = log(1 + r) + T // -// Define index_2 := [ d_5 d_6 d_7 d_8 ]. +// The first value can be computed by polynomial P(r) approximating +// log(1 + r) on |r| < 1/256 and the second is precomputed tabular +// value defined by top 8 bit of f. // -// Fetch Z_2 := (1/A_2) rounded UP in fixed point with -// fixed point lsb = 2^(-15). Fetch done using index_2. -// Z_2 looks like z_0.z_1 z_2 ... z_15 +// Finally we have that log(1+x) ~ (N*log(2) + T) + P(r) // -// Fetch G_2 := (1/A_2) truncated to 21 sig. bits. -// floating pt. +// Note that if input argument is close to 0.0 (in our case it means +// that |x| < 1/256) we can use just polynomial approximation +// because 1+x = 2^0 * f = f = 1 + r and +// log(1+x) = log(1 + r) ~ P(r) // -// Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14) -// = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14 -// This is accomplised by integer multiplication. -// It is proved that X_2 indeed always begin -// with 1.00000000 in fixed point. // +// Implementation +// -------------- // -// Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1. -// This is 2^(-14) + X_2 truncated to lsb = 2^(-13). +// 1. |x| >= 2^(-8), and x > -1 +// InvX = frcpa(x+1) +// r = InvX*(x+1) - 1 +// P(r) = r*((r*A3 - A2) + r^4*((A4 + r*A5) + r^2*(A6 + r*A7)), +// all coefficients are calcutated in quad and rounded to double +// precision. A7,A6,A5,A4 are stored in memory whereas A3 and A2 +// created with setf. // -// Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ]. 
+// N = float(n) where n is true unbiased exponent of x // -// Fetch G_3 := (1/A_3) truncated to 21 sig. bits. -// floating pt. Fetch is done using index_3. +// T is tabular value of log(1/frcpa(x)) calculated in quad precision +// and represented by two floating-point numbers 64-bit Thi and 32-bit Tlo. +// To load Thi,Tlo we get bits from 55 to 62 of register format significand +// as index and calculate two addresses +// ad_Thi = Thi_table_base_addr + 8 * index +// ad_Tlo = Tlo_table_base_addr + 4 * index // -// Compute G := G_1 * G_2 * G_3. +// L1 (log(2)) is calculated in quad +// precision and represented by two floating-point 64-bit numbers L1hi,L1lo +// stored in memory. // -// This is done exactly since each of G_j only has 21 sig. bits. +// And final result = ((L1hi*N + Thi) + (N*L1lo + Tlo)) + P(r) // -// Compute // -// r := (G*S_hi - 1) + G*S_lo using 2 FMA operations. +// 2. 2^(-80) <= |x| < 2^(-8) +// r = x +// P(r) = r*((r*A3 - A2) + r^4*((A4 + r*A5) + r^2*(A6 + r*A7)), +// A7,A6,A5,A4,A3,A2 are the same as in case |x| >= 1/256 // -// thus, r approximates G*(S_hi+S_lo) - 1 to within a couple of -// rounding errors. +// And final results +// log(1+x) = P(r) // +// 3. 0 < |x| < 2^(-80) +// Although log1p(x) is basically x, we would like to preserve the inexactness +// nature as well as consistent behavior under different rounding modes. +// We can do this by computing the result as // -// Step 2. Approximation -// --------------------- +// log1p(x) = x - x*x // -// This step computes an approximation to log( 1 + r ) where r is the -// reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13); -// thus log(1+r) can be approximated by a short polynomial: // -// log(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5 +// Note: NaT, any NaNs, +/-INF, +/-0, negatives and unnormalized numbers are +// filtered and processed on special branches. // + // -// Step 3. 
Reconstruction -// ---------------------- +// Special values +//============================================================== // -// This step computes the desired result of log(X+E): +// log1p(-1) = -inf // Call error support // -// log(X+E) = log( 2^N * (S_hi + S_lo) ) -// = N*log(2) + log( S_hi + S_lo ) -// = N*log(2) + log(1/G) + -// log(1 + C*(S_hi+S_lo) - 1 ) +// log1p(+qnan) = +qnan +// log1p(-qnan) = -qnan +// log1p(+snan) = +qnan +// log1p(-snan) = -qnan // -// log(2), log(1/G_j) are stored as pairs of (single,double) numbers: -// log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are -// single-precision numbers and the low parts are double precision -// numbers. These have the property that +// log1p(x),x<-1= QNAN Indefinite // Call error support +// log1p(-inf) = QNAN Indefinite +// log1p(+inf) = +inf +// log1p(+/-0) = +/-0 // -// N*log2_hi + SUM ( log1byGj_hi ) // -// is computable exactly in double-extended precision (64 sig. bits). -// Finally +// Registers used +//============================================================== +// Floating Point registers used: +// f8, input +// f7 -> f15, f32 -> f40 // -// Y_hi := N*log2_hi + SUM ( log1byGj_hi ) -// Y_lo := poly_hi + [ poly_lo + -// ( SUM ( log1byGj_lo ) + N*log2_lo ) ] -// set lsb(Y_lo) to be 1 +// General registers used: +// r8 -> r11 +// r14 -> r20 // +// Predicate registers used: +// p6 -> p12 -#include "libm_support.h" - -#ifdef _LIBC -.rodata -#else -.data -#endif +// Assembly macros +//============================================================== +GR_TAG = r8 +GR_ad_1 = r8 +GR_ad_2 = r9 +GR_Exp = r10 +GR_N = r11 -// P_7, P_6, P_5, P_4, P_3, P_2, and P_1 +GR_signexp_x = r14 +GR_exp_mask = r15 +GR_exp_bias = r16 +GR_05 = r17 +GR_A3 = r18 +GR_Sig = r19 +GR_Ind = r19 +GR_exp_x = r20 -.align 64 -Constants_P: -ASM_TYPE_DIRECTIVE(Constants_P,@object) -data4 0xEFD62B15,0xE3936754,0x00003FFB,0x00000000 -data4 0xA5E56381,0x8003B271,0x0000BFFC,0x00000000 -data4 
0x73282DB0,0x9249248C,0x00003FFC,0x00000000 -data4 0x47305052,0xAAAAAA9F,0x0000BFFC,0x00000000 -data4 0xCCD17FC9,0xCCCCCCCC,0x00003FFC,0x00000000 -data4 0x00067ED5,0x80000000,0x0000BFFD,0x00000000 -data4 0xAAAAAAAA,0xAAAAAAAA,0x00003FFD,0x00000000 -data4 0xFFFFFFFE,0xFFFFFFFF,0x0000BFFD,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_P) - -// log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1 -.align 64 -Constants_Q: -ASM_TYPE_DIRECTIVE(Constants_Q,@object) -data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 -data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 -data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 -data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 -data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 -data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_Q) - -// Z1 - 16 bit fixed, G1 and H1 - IEEE single - -.align 64 -Constants_Z_G_H_h1: -ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h1,@object) -data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000 -data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000,0x617D741C,0x3DA163A6 -data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000,0xCBD3D5BB,0x3E2C55E6 -data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000,0xD86EA5E7,0xBE3EB0BF -data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000,0x86B12760,0x3E2E6A8C -data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000,0x5C0739BA,0x3E47574C -data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000,0x13E8AF2F,0x3E20E30F -data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000,0xF2C630BD,0xBE42885B -data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000,0x97E577C6,0x3E497F34 -data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000,0xA6B0A5AB,0x3E3E6A6E -data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000,0xD328D9BE,0xBDF43E3C -data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000,0x0ADB090A,0x3E4094C3 -data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000,0xFC1FE510,0xBE28FBB2 -data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000,0x10FDE3FA,0x3E3A7895 -data4 
0x00004445,0x3F088888,0x3F20EC80,0x00000000,0x7CC8C98F,0x3E508CE5 -data4 0x00004211,0x3F042108,0x3F29516A,0x00000000,0xA223106C,0xBE534874 -ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h1) - -// Z2 - 16 bit fixed, G2 and H2 - IEEE single +GR_SAVE_B0 = r33 +GR_SAVE_PFS = r34 +GR_SAVE_GP = r35 +GR_SAVE_SP = r36 -.align 64 -Constants_Z_G_H_h2: -ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h2,@object) -data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000 -data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000,0x22C42273,0x3DB5A116 -data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000,0x21F86ED3,0x3DE620CF -data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000,0x484F34ED,0xBDAFA07E -data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000,0x3860BCF6,0xBDFE07F0 -data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000,0xA78093D6,0x3DEA370F -data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000,0x72A753D0,0x3DFF5791 -data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000,0xA7EF896B,0x3DFEBE6C -data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000,0x409ECB43,0x3E0CF156 -data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000,0xFFEF71DF,0xBE0B6F97 -data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000,0x5D59EEE8,0xBE080483 -data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000,0xA9192A74,0x3E1F91E9 -data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000,0xBF72A8CD,0xBE139A06 -data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000,0xF8FBA6CF,0x3E1D9202 -data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000,0xBA796223,0xBE1DCCC4 -data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000,0xB6B7C239,0xBE049391 -ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h2) - -// G3 and H3 - IEEE single and h3 -IEEE double +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 -.align 64 -Constants_Z_G_H_h3: -ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h3,@object) -data4 0x3F7FFC00,0x38800100,0x562224CD,0x3D355595 -data4 0x3F7FF400,0x39400480,0x06136FF6,0x3D8200A2 -data4 0x3F7FEC00,0x39A00640,0xE8DE9AF0,0x3DA4D68D -data4 
0x3F7FE400,0x39E00C41,0xB10238DC,0xBD8B4291 -data4 0x3F7FDC00,0x3A100A21,0x3B1952CA,0xBD89CCB8 -data4 0x3F7FD400,0x3A300F22,0x1DC46826,0xBDB10707 -data4 0x3F7FCC08,0x3A4FF51C,0xF43307DB,0x3DB6FCB9 -data4 0x3F7FC408,0x3A6FFC1D,0x62DC7872,0xBD9B7C47 -data4 0x3F7FBC10,0x3A87F20B,0x3F89154A,0xBDC3725E -data4 0x3F7FB410,0x3A97F68B,0x62B9D392,0xBD93519D -data4 0x3F7FAC18,0x3AA7EB86,0x0F21BD9D,0x3DC18441 -data4 0x3F7FA420,0x3AB7E101,0x2245E0A6,0xBDA64B95 -data4 0x3F7F9C20,0x3AC7E701,0xAABB34B8,0x3DB4B0EC -data4 0x3F7F9428,0x3AD7DD7B,0x6DC40A7E,0x3D992337 -data4 0x3F7F8C30,0x3AE7D474,0x4F2083D3,0x3DC6E17B -data4 0x3F7F8438,0x3AF7CBED,0x811D4394,0x3DAE314B -data4 0x3F7F7C40,0x3B03E1F3,0xB08F2DB1,0xBDD46F21 -data4 0x3F7F7448,0x3B0BDE2F,0x6D34522B,0xBDDC30A4 -data4 0x3F7F6C50,0x3B13DAAA,0xB1F473DB,0x3DCB0070 -data4 0x3F7F6458,0x3B1BD766,0x6AD282FD,0xBDD65DDC -data4 0x3F7F5C68,0x3B23CC5C,0xF153761A,0xBDCDAB83 -data4 0x3F7F5470,0x3B2BC997,0x341D0F8F,0xBDDADA40 -data4 0x3F7F4C78,0x3B33C711,0xEBC394E8,0x3DCD1BD7 -data4 0x3F7F4488,0x3B3BBCC6,0x52E3E695,0xBDC3532B -data4 0x3F7F3C90,0x3B43BAC0,0xE846B3DE,0xBDA3961E -data4 0x3F7F34A0,0x3B4BB0F4,0x785778D4,0xBDDADF06 -data4 0x3F7F2CA8,0x3B53AF6D,0xE55CE212,0x3DCC3ED1 -data4 0x3F7F24B8,0x3B5BA620,0x9E382C15,0xBDBA3103 -data4 0x3F7F1CC8,0x3B639D12,0x5C5AF197,0x3D635A0B -data4 0x3F7F14D8,0x3B6B9444,0x71D34EFC,0xBDDCCB19 -data4 0x3F7F0CE0,0x3B7393BC,0x52CD7ADA,0x3DC74502 -data4 0x3F7F04F0,0x3B7B8B6D,0x7D7F2A42,0xBDB68F17 -ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h3) - -// -// Exponent Thresholds and Tiny Thresholds -// for 8, 11, 15, and 17 bit exponents -// -// Expo_Range Value -// -// 0 (8 bits) 2^(-126) -// 1 (11 bits) 2^(-1022) -// 2 (15 bits) 2^(-16382) -// 3 (17 bits) 2^(-16382) -// -// Tiny_Table -// ---------- -// Expo_Range Value -// -// 0 (8 bits) 2^(-16382) -// 1 (11 bits) 2^(-16382) -// 2 (15 bits) 2^(-16382) -// 3 (17 bits) 2^(-16382) -// -.align 64 -Constants_Threshold: -ASM_TYPE_DIRECTIVE(Constants_Threshold,@object) -data4 
0x00000000,0x80000000,0x00003F81,0x00000000 -data4 0x00000000,0x80000000,0x00000001,0x00000000 -data4 0x00000000,0x80000000,0x00003C01,0x00000000 -data4 0x00000000,0x80000000,0x00000001,0x00000000 -data4 0x00000000,0x80000000,0x00000001,0x00000000 -data4 0x00000000,0x80000000,0x00000001,0x00000000 -data4 0x00000000,0x80000000,0x00000001,0x00000000 -data4 0x00000000,0x80000000,0x00000001,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_Threshold) -.align 64 -Constants_1_by_LN10: -ASM_TYPE_DIRECTIVE(Constants_1_by_LN10,@object) -data4 0x37287195,0xDE5BD8A9,0x00003FFD,0x00000000 -data4 0xACCF70C8,0xD56EAABE,0x00003FBD,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_1_by_LN10) +FR_NormX = f7 +FR_RcpX = f9 +FR_r = f10 +FR_r2 = f11 +FR_r4 = f12 +FR_N = f13 +FR_Ln2hi = f14 +FR_Ln2lo = f15 -FR_Input_X = f8 -FR_Neg_One = f9 -FR_E = f33 -FR_Em1 = f34 -FR_Y_hi = f34 -// Shared with Em1 -FR_Y_lo = f35 -FR_Scale = f36 -FR_X_Prime = f37 -FR_Z = f38 -FR_S_hi = f38 -// Shared with Z -FR_W = f39 -FR_G = f40 -FR_wsq = f40 -// Shared with G -FR_H = f41 -FR_w4 = f41 -// Shared with H -FR_h = f42 -FR_w6 = f42 -// Shared with h -FR_G_tmp = f43 -FR_poly_lo = f43 -// Shared with G_tmp -FR_P8 = f43 -// Shared with G_tmp -FR_H_tmp = f44 -FR_poly_hi = f44 - // Shared with H_tmp -FR_P7 = f44 -// Shared with H_tmp -FR_h_tmp = f45 -FR_rsq = f45 -// Shared with h_tmp -FR_P6 = f45 -// Shared with h_tmp -FR_abs_W = f46 -FR_r = f46 -// Shared with abs_W -FR_AA = f47 -FR_log2_hi = f47 -// Shared with AA -FR_BB = f48 -FR_log2_lo = f48 -// Shared with BB -FR_S_lo = f49 -FR_two_negN = f50 -FR_float_N = f51 -FR_Q4 = f52 -FR_dummy = f52 -// Shared with Q4 -FR_P4 = f52 -// Shared with Q4 -FR_Threshold = f52 -// Shared with Q4 -FR_Q3 = f53 -FR_P3 = f53 -// Shared with Q3 -FR_Tiny = f53 -// Shared with Q3 -FR_Q2 = f54 -FR_P2 = f54 -// Shared with Q2 -FR_1LN10_hi = f54 -// Shared with Q2 -FR_Q1 = f55 -FR_P1 = f55 -// Shared with Q1 -FR_1LN10_lo = f55 -// Shared with Q1 -FR_P5 = f98 -FR_SCALE = f98 -FR_Output_X_tmp = f99 
+FR_A7 = f32 +FR_A6 = f33 +FR_A5 = f34 +FR_A4 = f35 +FR_A3 = f36 +FR_A2 = f37 -GR_Expo_Range = r32 -GR_Table_Base = r34 -GR_Table_Base1 = r35 -GR_Table_ptr = r36 -GR_Index2 = r37 -GR_signif = r38 -GR_X_0 = r39 -GR_X_1 = r40 -GR_X_2 = r41 -GR_Z_1 = r42 -GR_Z_2 = r43 -GR_N = r44 -GR_Bias = r45 -GR_M = r46 -GR_ScaleN = r47 -GR_Index3 = r48 -GR_Perturb = r49 -GR_Table_Scale = r50 +FR_Thi = f38 +FR_NxLn2hipThi = f38 +FR_NxLn2pT = f38 +FR_Tlo = f39 +FR_NxLn2lopTlo = f39 +FR_Xp1 = f40 -GR_SAVE_PFS = r51 -GR_SAVE_B0 = r52 -GR_SAVE_GP = r53 -GR_Parameter_X = r54 -GR_Parameter_Y = r55 -GR_Parameter_RESULT = r56 +FR_Y = f1 +FR_X = f10 +FR_RESULT = f8 -GR_Parameter_TAG = r57 +// Data +//============================================================== +RODATA +.align 16 + +LOCAL_OBJECT_START(log_data) +// coefficients of polynomial approximation +data8 0x3FC2494104381A8E // A7 +data8 0xBFC5556D556BBB69 // A6 +data8 0x3FC999999988B5E9 // A5 +data8 0xBFCFFFFFFFF6FFF5 // A4 +// +// hi parts of ln(1/frcpa(1+i/256)), i=0...255 +data8 0x3F60040155D5889D // 0 +data8 0x3F78121214586B54 // 1 +data8 0x3F841929F96832EF // 2 +data8 0x3F8C317384C75F06 // 3 +data8 0x3F91A6B91AC73386 // 4 +data8 0x3F95BA9A5D9AC039 // 5 +data8 0x3F99D2A8074325F3 // 6 +data8 0x3F9D6B2725979802 // 7 +data8 0x3FA0C58FA19DFAA9 // 8 +data8 0x3FA2954C78CBCE1A // 9 +data8 0x3FA4A94D2DA96C56 // 10 +data8 0x3FA67C94F2D4BB58 // 11 +data8 0x3FA85188B630F068 // 12 +data8 0x3FAA6B8ABE73AF4C // 13 +data8 0x3FAC441E06F72A9E // 14 +data8 0x3FAE1E6713606D06 // 15 +data8 0x3FAFFA6911AB9300 // 16 +data8 0x3FB0EC139C5DA600 // 17 +data8 0x3FB1DBD2643D190B // 18 +data8 0x3FB2CC7284FE5F1C // 19 +data8 0x3FB3BDF5A7D1EE64 // 20 +data8 0x3FB4B05D7AA012E0 // 21 +data8 0x3FB580DB7CEB5701 // 22 +data8 0x3FB674F089365A79 // 23 +data8 0x3FB769EF2C6B568D // 24 +data8 0x3FB85FD927506A47 // 25 +data8 0x3FB9335E5D594988 // 26 +data8 0x3FBA2B0220C8E5F4 // 27 +data8 0x3FBB0004AC1A86AB // 28 +data8 0x3FBBF968769FCA10 // 29 +data8 0x3FBCCFEDBFEE13A8 
// 30 +data8 0x3FBDA727638446A2 // 31 +data8 0x3FBEA3257FE10F79 // 32 +data8 0x3FBF7BE9FEDBFDE5 // 33 +data8 0x3FC02AB352FF25F3 // 34 +data8 0x3FC097CE579D204C // 35 +data8 0x3FC1178E8227E47B // 36 +data8 0x3FC185747DBECF33 // 37 +data8 0x3FC1F3B925F25D41 // 38 +data8 0x3FC2625D1E6DDF56 // 39 +data8 0x3FC2D1610C868139 // 40 +data8 0x3FC340C59741142E // 41 +data8 0x3FC3B08B6757F2A9 // 42 +data8 0x3FC40DFB08378003 // 43 +data8 0x3FC47E74E8CA5F7C // 44 +data8 0x3FC4EF51F6466DE4 // 45 +data8 0x3FC56092E02BA516 // 46 +data8 0x3FC5D23857CD74D4 // 47 +data8 0x3FC6313A37335D76 // 48 +data8 0x3FC6A399DABBD383 // 49 +data8 0x3FC70337DD3CE41A // 50 +data8 0x3FC77654128F6127 // 51 +data8 0x3FC7E9D82A0B022D // 52 +data8 0x3FC84A6B759F512E // 53 +data8 0x3FC8AB47D5F5A30F // 54 +data8 0x3FC91FE49096581B // 55 +data8 0x3FC981634011AA75 // 56 +data8 0x3FC9F6C407089664 // 57 +data8 0x3FCA58E729348F43 // 58 +data8 0x3FCABB55C31693AC // 59 +data8 0x3FCB1E104919EFD0 // 60 +data8 0x3FCB94EE93E367CA // 61 +data8 0x3FCBF851C067555E // 62 +data8 0x3FCC5C0254BF23A5 // 63 +data8 0x3FCCC000C9DB3C52 // 64 +data8 0x3FCD244D99C85673 // 65 +data8 0x3FCD88E93FB2F450 // 66 +data8 0x3FCDEDD437EAEF00 // 67 +data8 0x3FCE530EFFE71012 // 68 +data8 0x3FCEB89A1648B971 // 69 +data8 0x3FCF1E75FADF9BDE // 70 +data8 0x3FCF84A32EAD7C35 // 71 +data8 0x3FCFEB2233EA07CD // 72 +data8 0x3FD028F9C7035C1C // 73 +data8 0x3FD05C8BE0D9635A // 74 +data8 0x3FD085EB8F8AE797 // 75 +data8 0x3FD0B9C8E32D1911 // 76 +data8 0x3FD0EDD060B78080 // 77 +data8 0x3FD122024CF0063F // 78 +data8 0x3FD14BE2927AECD4 // 79 +data8 0x3FD180618EF18ADF // 80 +data8 0x3FD1B50BBE2FC63B // 81 +data8 0x3FD1DF4CC7CF242D // 82 +data8 0x3FD214456D0EB8D4 // 83 +data8 0x3FD23EC5991EBA49 // 84 +data8 0x3FD2740D9F870AFB // 85 +data8 0x3FD29ECDABCDFA03 // 86 +data8 0x3FD2D46602ADCCEE // 87 +data8 0x3FD2FF66B04EA9D4 // 88 +data8 0x3FD335504B355A37 // 89 +data8 0x3FD360925EC44F5C // 90 +data8 0x3FD38BF1C3337E74 // 91 +data8 0x3FD3C25277333183 // 92 +data8 
0x3FD3EDF463C1683E // 93 +data8 0x3FD419B423D5E8C7 // 94 +data8 0x3FD44591E0539F48 // 95 +data8 0x3FD47C9175B6F0AD // 96 +data8 0x3FD4A8B341552B09 // 97 +data8 0x3FD4D4F39089019F // 98 +data8 0x3FD501528DA1F967 // 99 +data8 0x3FD52DD06347D4F6 // 100 +data8 0x3FD55A6D3C7B8A89 // 101 +data8 0x3FD5925D2B112A59 // 102 +data8 0x3FD5BF406B543DB1 // 103 +data8 0x3FD5EC433D5C35AD // 104 +data8 0x3FD61965CDB02C1E // 105 +data8 0x3FD646A84935B2A1 // 106 +data8 0x3FD6740ADD31DE94 // 107 +data8 0x3FD6A18DB74A58C5 // 108 +data8 0x3FD6CF31058670EC // 109 +data8 0x3FD6F180E852F0B9 // 110 +data8 0x3FD71F5D71B894EF // 111 +data8 0x3FD74D5AEFD66D5C // 112 +data8 0x3FD77B79922BD37D // 113 +data8 0x3FD7A9B9889F19E2 // 114 +data8 0x3FD7D81B037EB6A6 // 115 +data8 0x3FD8069E33827230 // 116 +data8 0x3FD82996D3EF8BCA // 117 +data8 0x3FD85855776DCBFA // 118 +data8 0x3FD8873658327CCE // 119 +data8 0x3FD8AA75973AB8CE // 120 +data8 0x3FD8D992DC8824E4 // 121 +data8 0x3FD908D2EA7D9511 // 122 +data8 0x3FD92C59E79C0E56 // 123 +data8 0x3FD95BD750EE3ED2 // 124 +data8 0x3FD98B7811A3EE5B // 125 +data8 0x3FD9AF47F33D406B // 126 +data8 0x3FD9DF270C1914A7 // 127 +data8 0x3FDA0325ED14FDA4 // 128 +data8 0x3FDA33440224FA78 // 129 +data8 0x3FDA57725E80C382 // 130 +data8 0x3FDA87D0165DD199 // 131 +data8 0x3FDAAC2E6C03F895 // 132 +data8 0x3FDADCCC6FDF6A81 // 133 +data8 0x3FDB015B3EB1E790 // 134 +data8 0x3FDB323A3A635948 // 135 +data8 0x3FDB56FA04462909 // 136 +data8 0x3FDB881AA659BC93 // 137 +data8 0x3FDBAD0BEF3DB164 // 138 +data8 0x3FDBD21297781C2F // 139 +data8 0x3FDC039236F08818 // 140 +data8 0x3FDC28CB1E4D32FC // 141 +data8 0x3FDC4E19B84723C1 // 142 +data8 0x3FDC7FF9C74554C9 // 143 +data8 0x3FDCA57B64E9DB05 // 144 +data8 0x3FDCCB130A5CEBAF // 145 +data8 0x3FDCF0C0D18F326F // 146 +data8 0x3FDD232075B5A201 // 147 +data8 0x3FDD490246DEFA6B // 148 +data8 0x3FDD6EFA918D25CD // 149 +data8 0x3FDD9509707AE52F // 150 +data8 0x3FDDBB2EFE92C554 // 151 +data8 0x3FDDEE2F3445E4AE // 152 +data8 0x3FDE148A1A2726CD // 153 
+data8 0x3FDE3AFC0A49FF3F // 154 +data8 0x3FDE6185206D516D // 155 +data8 0x3FDE882578823D51 // 156 +data8 0x3FDEAEDD2EAC990C // 157 +data8 0x3FDED5AC5F436BE2 // 158 +data8 0x3FDEFC9326D16AB8 // 159 +data8 0x3FDF2391A21575FF // 160 +data8 0x3FDF4AA7EE03192C // 161 +data8 0x3FDF71D627C30BB0 // 162 +data8 0x3FDF991C6CB3B379 // 163 +data8 0x3FDFC07ADA69A90F // 164 +data8 0x3FDFE7F18EB03D3E // 165 +data8 0x3FE007C053C5002E // 166 +data8 0x3FE01B942198A5A0 // 167 +data8 0x3FE02F74400C64EA // 168 +data8 0x3FE04360BE7603AC // 169 +data8 0x3FE05759AC47FE33 // 170 +data8 0x3FE06B5F1911CF51 // 171 +data8 0x3FE078BF0533C568 // 172 +data8 0x3FE08CD9687E7B0E // 173 +data8 0x3FE0A10074CF9019 // 174 +data8 0x3FE0B5343A234476 // 175 +data8 0x3FE0C974C89431CD // 176 +data8 0x3FE0DDC2305B9886 // 177 +data8 0x3FE0EB524BAFC918 // 178 +data8 0x3FE0FFB54213A475 // 179 +data8 0x3FE114253DA97D9F // 180 +data8 0x3FE128A24F1D9AFF // 181 +data8 0x3FE1365252BF0864 // 182 +data8 0x3FE14AE558B4A92D // 183 +data8 0x3FE15F85A19C765B // 184 +data8 0x3FE16D4D38C119FA // 185 +data8 0x3FE18203C20DD133 // 186 +data8 0x3FE196C7BC4B1F3A // 187 +data8 0x3FE1A4A738B7A33C // 188 +data8 0x3FE1B981C0C9653C // 189 +data8 0x3FE1CE69E8BB106A // 190 +data8 0x3FE1DC619DE06944 // 191 +data8 0x3FE1F160A2AD0DA3 // 192 +data8 0x3FE2066D7740737E // 193 +data8 0x3FE2147DBA47A393 // 194 +data8 0x3FE229A1BC5EBAC3 // 195 +data8 0x3FE237C1841A502E // 196 +data8 0x3FE24CFCE6F80D9A // 197 +data8 0x3FE25B2C55CD5762 // 198 +data8 0x3FE2707F4D5F7C40 // 199 +data8 0x3FE285E0842CA383 // 200 +data8 0x3FE294294708B773 // 201 +data8 0x3FE2A9A2670AFF0C // 202 +data8 0x3FE2B7FB2C8D1CC0 // 203 +data8 0x3FE2C65A6395F5F5 // 204 +data8 0x3FE2DBF557B0DF42 // 205 +data8 0x3FE2EA64C3F97654 // 206 +data8 0x3FE3001823684D73 // 207 +data8 0x3FE30E97E9A8B5CC // 208 +data8 0x3FE32463EBDD34E9 // 209 +data8 0x3FE332F4314AD795 // 210 +data8 0x3FE348D90E7464CF // 211 +data8 0x3FE35779F8C43D6D // 212 +data8 0x3FE36621961A6A99 // 213 +data8 
0x3FE37C299F3C366A // 214 +data8 0x3FE38AE2171976E7 // 215 +data8 0x3FE399A157A603E7 // 216 +data8 0x3FE3AFCCFE77B9D1 // 217 +data8 0x3FE3BE9D503533B5 // 218 +data8 0x3FE3CD7480B4A8A2 // 219 +data8 0x3FE3E3C43918F76C // 220 +data8 0x3FE3F2ACB27ED6C6 // 221 +data8 0x3FE4019C2125CA93 // 222 +data8 0x3FE4181061389722 // 223 +data8 0x3FE42711518DF545 // 224 +data8 0x3FE436194E12B6BF // 225 +data8 0x3FE445285D68EA69 // 226 +data8 0x3FE45BCC464C893A // 227 +data8 0x3FE46AED21F117FC // 228 +data8 0x3FE47A1527E8A2D3 // 229 +data8 0x3FE489445EFFFCCB // 230 +data8 0x3FE4A018BCB69835 // 231 +data8 0x3FE4AF5A0C9D65D7 // 232 +data8 0x3FE4BEA2A5BDBE87 // 233 +data8 0x3FE4CDF28F10AC46 // 234 +data8 0x3FE4DD49CF994058 // 235 +data8 0x3FE4ECA86E64A683 // 236 +data8 0x3FE503C43CD8EB68 // 237 +data8 0x3FE513356667FC57 // 238 +data8 0x3FE522AE0738A3D7 // 239 +data8 0x3FE5322E26867857 // 240 +data8 0x3FE541B5CB979809 // 241 +data8 0x3FE55144FDBCBD62 // 242 +data8 0x3FE560DBC45153C6 // 243 +data8 0x3FE5707A26BB8C66 // 244 +data8 0x3FE587F60ED5B8FF // 245 +data8 0x3FE597A7977C8F31 // 246 +data8 0x3FE5A760D634BB8A // 247 +data8 0x3FE5B721D295F10E // 248 +data8 0x3FE5C6EA94431EF9 // 249 +data8 0x3FE5D6BB22EA86F5 // 250 +data8 0x3FE5E6938645D38F // 251 +data8 0x3FE5F673C61A2ED1 // 252 +data8 0x3FE6065BEA385926 // 253 +data8 0x3FE6164BFA7CC06B // 254 +data8 0x3FE62643FECF9742 // 255 +// +// two parts of ln(2) +data8 0x3FE62E42FEF00000,0x3DD473DE6AF278ED +// +// lo parts of ln(1/frcpa(1+i/256)), i=0...255 +data4 0x20E70672 // 0 +data4 0x1F60A5D0 // 1 +data4 0x218EABA0 // 2 +data4 0x21403104 // 3 +data4 0x20E9B54E // 4 +data4 0x21EE1382 // 5 +data4 0x226014E3 // 6 +data4 0x2095E5C9 // 7 +data4 0x228BA9D4 // 8 +data4 0x22932B86 // 9 +data4 0x22608A57 // 10 +data4 0x220209F3 // 11 +data4 0x212882CC // 12 +data4 0x220D46E2 // 13 +data4 0x21FA4C28 // 14 +data4 0x229E5BD9 // 15 +data4 0x228C9838 // 16 +data4 0x2311F954 // 17 +data4 0x221365DF // 18 +data4 0x22BD0CB3 // 19 +data4 0x223D4BB7 // 20 
+data4 0x22A71BBE // 21 +data4 0x237DB2FA // 22 +data4 0x23194C9D // 23 +data4 0x22EC639E // 24 +data4 0x2367E669 // 25 +data4 0x232E1D5F // 26 +data4 0x234A639B // 27 +data4 0x2365C0E0 // 28 +data4 0x234646C1 // 29 +data4 0x220CBF9C // 30 +data4 0x22A00FD4 // 31 +data4 0x2306A3F2 // 32 +data4 0x23745A9B // 33 +data4 0x2398D756 // 34 +data4 0x23DD0B6A // 35 +data4 0x23DE338B // 36 +data4 0x23A222DF // 37 +data4 0x223164F8 // 38 +data4 0x23B4E87B // 39 +data4 0x23D6CCB8 // 40 +data4 0x220C2099 // 41 +data4 0x21B86B67 // 42 +data4 0x236D14F1 // 43 +data4 0x225A923F // 44 +data4 0x22748723 // 45 +data4 0x22200D13 // 46 +data4 0x23C296EA // 47 +data4 0x2302AC38 // 48 +data4 0x234B1996 // 49 +data4 0x2385E298 // 50 +data4 0x23175BE5 // 51 +data4 0x2193F482 // 52 +data4 0x23BFEA90 // 53 +data4 0x23D70A0C // 54 +data4 0x231CF30A // 55 +data4 0x235D9E90 // 56 +data4 0x221AD0CB // 57 +data4 0x22FAA08B // 58 +data4 0x23D29A87 // 59 +data4 0x20C4B2FE // 60 +data4 0x2381B8B7 // 61 +data4 0x23F8D9FC // 62 +data4 0x23EAAE7B // 63 +data4 0x2329E8AA // 64 +data4 0x23EC0322 // 65 +data4 0x2357FDCB // 66 +data4 0x2392A9AD // 67 +data4 0x22113B02 // 68 +data4 0x22DEE901 // 69 +data4 0x236A6D14 // 70 +data4 0x2371D33E // 71 +data4 0x2146F005 // 72 +data4 0x23230B06 // 73 +data4 0x22F1C77D // 74 +data4 0x23A89FA3 // 75 +data4 0x231D1241 // 76 +data4 0x244DA96C // 77 +data4 0x23ECBB7D // 78 +data4 0x223E42B4 // 79 +data4 0x23801BC9 // 80 +data4 0x23573263 // 81 +data4 0x227C1158 // 82 +data4 0x237BD749 // 83 +data4 0x21DDBAE9 // 84 +data4 0x23401735 // 85 +data4 0x241D9DEE // 86 +data4 0x23BC88CB // 87 +data4 0x2396D5F1 // 88 +data4 0x23FC89CF // 89 +data4 0x2414F9A2 // 90 +data4 0x2474A0F5 // 91 +data4 0x24354B60 // 92 +data4 0x23C1EB40 // 93 +data4 0x2306DD92 // 94 +data4 0x24353B6B // 95 +data4 0x23CD1701 // 96 +data4 0x237C7A1C // 97 +data4 0x245793AA // 98 +data4 0x24563695 // 99 +data4 0x23C51467 // 100 +data4 0x24476B68 // 101 +data4 0x212585A9 // 102 +data4 0x247B8293 // 103 
+data4 0x2446848A // 104 +data4 0x246A53F8 // 105 +data4 0x246E496D // 106 +data4 0x23ED1D36 // 107 +data4 0x2314C258 // 108 +data4 0x233244A7 // 109 +data4 0x245B7AF0 // 110 +data4 0x24247130 // 111 +data4 0x22D67B38 // 112 +data4 0x2449F620 // 113 +data4 0x23BBC8B8 // 114 +data4 0x237D3BA0 // 115 +data4 0x245E8F13 // 116 +data4 0x2435573F // 117 +data4 0x242DE666 // 118 +data4 0x2463BC10 // 119 +data4 0x2466587D // 120 +data4 0x2408144B // 121 +data4 0x2405F0E5 // 122 +data4 0x22381CFF // 123 +data4 0x24154F9B // 124 +data4 0x23A4E96E // 125 +data4 0x24052967 // 126 +data4 0x2406963F // 127 +data4 0x23F7D3CB // 128 +data4 0x2448AFF4 // 129 +data4 0x24657A21 // 130 +data4 0x22FBC230 // 131 +data4 0x243C8DEA // 132 +data4 0x225DC4B7 // 133 +data4 0x23496EBF // 134 +data4 0x237C2B2B // 135 +data4 0x23A4A5B1 // 136 +data4 0x2394E9D1 // 137 +data4 0x244BC950 // 138 +data4 0x23C7448F // 139 +data4 0x2404A1AD // 140 +data4 0x246511D5 // 141 +data4 0x24246526 // 142 +data4 0x23111F57 // 143 +data4 0x22868951 // 144 +data4 0x243EB77F // 145 +data4 0x239F3DFF // 146 +data4 0x23089666 // 147 +data4 0x23EBFA6A // 148 +data4 0x23C51312 // 149 +data4 0x23E1DD5E // 150 +data4 0x232C0944 // 151 +data4 0x246A741F // 152 +data4 0x2414DF8D // 153 +data4 0x247B5546 // 154 +data4 0x2415C980 // 155 +data4 0x24324ABD // 156 +data4 0x234EB5E5 // 157 +data4 0x2465E43E // 158 +data4 0x242840D1 // 159 +data4 0x24444057 // 160 +data4 0x245E56F0 // 161 +data4 0x21AE30F8 // 162 +data4 0x23FB3283 // 163 +data4 0x247A4D07 // 164 +data4 0x22AE314D // 165 +data4 0x246B7727 // 166 +data4 0x24EAD526 // 167 +data4 0x24B41DC9 // 168 +data4 0x24EE8062 // 169 +data4 0x24A0C7C4 // 170 +data4 0x24E8DA67 // 171 +data4 0x231120F7 // 172 +data4 0x24401FFB // 173 +data4 0x2412DD09 // 174 +data4 0x248C131A // 175 +data4 0x24C0A7CE // 176 +data4 0x243DD4C8 // 177 +data4 0x24457FEB // 178 +data4 0x24DEEFBB // 179 +data4 0x243C70AE // 180 +data4 0x23E7A6FA // 181 +data4 0x24C2D311 // 182 +data4 0x23026255 // 183 
+data4 0x2437C9B9 // 184 +data4 0x246BA847 // 185 +data4 0x2420B448 // 186 +data4 0x24C4CF5A // 187 +data4 0x242C4981 // 188 +data4 0x24DE1525 // 189 +data4 0x24F5CC33 // 190 +data4 0x235A85DA // 191 +data4 0x24A0B64F // 192 +data4 0x244BA0A4 // 193 +data4 0x24AAF30A // 194 +data4 0x244C86F9 // 195 +data4 0x246D5B82 // 196 +data4 0x24529347 // 197 +data4 0x240DD008 // 198 +data4 0x24E98790 // 199 +data4 0x2489B0CE // 200 +data4 0x22BC29AC // 201 +data4 0x23F37C7A // 202 +data4 0x24987FE8 // 203 +data4 0x22AFE20B // 204 +data4 0x24C8D7C2 // 205 +data4 0x24B28B7D // 206 +data4 0x23B6B271 // 207 +data4 0x24C77CB6 // 208 +data4 0x24EF1DCA // 209 +data4 0x24A4F0AC // 210 +data4 0x24CF113E // 211 +data4 0x2496BBAB // 212 +data4 0x23C7CC8A // 213 +data4 0x23AE3961 // 214 +data4 0x2410A895 // 215 +data4 0x23CE3114 // 216 +data4 0x2308247D // 217 +data4 0x240045E9 // 218 +data4 0x24974F60 // 219 +data4 0x242CB39F // 220 +data4 0x24AB8D69 // 221 +data4 0x23436788 // 222 +data4 0x24305E9E // 223 +data4 0x243E71A9 // 224 +data4 0x23C2A6B3 // 225 +data4 0x23FFE6CF // 226 +data4 0x2322D801 // 227 +data4 0x24515F21 // 228 +data4 0x2412A0D6 // 229 +data4 0x24E60D44 // 230 +data4 0x240D9251 // 231 +data4 0x247076E2 // 232 +data4 0x229B101B // 233 +data4 0x247B12DE // 234 +data4 0x244B9127 // 235 +data4 0x2499EC42 // 236 +data4 0x21FC3963 // 237 +data4 0x23E53266 // 238 +data4 0x24CE102D // 239 +data4 0x23CC45D2 // 240 +data4 0x2333171D // 241 +data4 0x246B3533 // 242 +data4 0x24931129 // 243 +data4 0x24405FFA // 244 +data4 0x24CF464D // 245 +data4 0x237095CD // 246 +data4 0x24F86CBD // 247 +data4 0x24E2D84B // 248 +data4 0x21ACBB44 // 249 +data4 0x24F43A8C // 250 +data4 0x249DB931 // 251 +data4 0x24A385EF // 252 +data4 0x238B1279 // 253 +data4 0x2436213E // 254 +data4 0x24F18A3B // 255 +LOCAL_OBJECT_END(log_data) + + +// Code +//============================================================== .section .text -.proc log1p# -.global log1p# -.align 64 -log1p: -#ifdef _LIBC -.global 
__log1p -__log1p: -#endif - +GLOBAL_IEEE754_ENTRY(log1p) { .mfi -alloc r32 = ar.pfs,0,22,4,0 -(p0) fsub.s1 FR_Neg_One = f0,f1 -(p0) cmp.eq.unc p7, p0 = r0, r0 + getf.exp GR_signexp_x = f8 // if x is unorm then must recompute + fadd.s1 FR_Xp1 = f8, f1 // Form 1+x + mov GR_05 = 0xfffe } - -{ .mfi -(p0) cmp.ne.unc p14, p0 = r0, r0 -(p0) fnorm.s1 FR_X_Prime = FR_Input_X -(p0) cmp.eq.unc p15, p0 = r0, r0 ;; -} - -{ .mfi - nop.m 999 -(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3 - nop.i 999 -} -;; - -{ .mfi - nop.m 999 -(p0) fclass.nm.unc p10, p0 = FR_Input_X, 0x1FF - nop.i 999 +{ .mlx + addl GR_ad_1 = @ltoff(log_data),gp + movl GR_A3 = 0x3fd5555555555557 // double precision memory + // representation of A3 } ;; { .mfi - nop.m 999 -(p0) fcmp.eq.unc.s1 p9, p0 = FR_Input_X, f0 - nop.i 999 -} - -{ .mfi - nop.m 999 -(p0) fadd FR_Em1 = f0,f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fadd FR_E = f0,f1 - nop.i 999 ;; + ld8 GR_ad_1 = [GR_ad_1] + fclass.m p8,p0 = f8,0xb // Is x unorm? + mov GR_exp_mask = 0x1ffff } - { .mfi - nop.m 999 -(p0) fcmp.eq.unc.s1 p8, p0 = FR_Input_X, FR_Neg_One - nop.i 999 + nop.m 0 + fnorm.s1 FR_NormX = f8 // Normalize x + mov GR_exp_bias = 0xffff } +;; { .mfi - nop.m 999 -(p0) fcmp.lt.unc.s1 p13, p0 = FR_Input_X, FR_Neg_One - nop.i 999 -} - - -L(LOG_BEGIN): - -{ .mfi - nop.m 999 -(p0) fadd.s1 FR_Z = FR_X_Prime, FR_E - nop.i 999 + setf.exp FR_A2 = GR_05 // create A2 = 0.5 + fclass.m p9,p0 = f8,0x1E1 // is x NaN, NaT or +Inf? + nop.i 0 } - -{ .mlx - nop.m 999 -(p0) movl GR_Table_Scale = 0x0000000000000018 ;; -} - -{ .mmi - nop.m 999 -// -// Create E = 1 and Em1 = 0 -// Check for X == 0, meaning log(1+0) -// Check for X < -1, meaning log(negative) -// Check for X == -1, meaning log(0) -// Normalize x -// Identify NatVals, NaNs, Infs. -// Identify EM unsupporteds. 
-// Identify Negative values - us S1 so as -// not to raise denormal operand exception -// Set p15 to true for log1p -// Set p14 to false for log1p -// Set p7 true for log and log1p -// -(p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h1#),gp - nop.i 999 -} - -{ .mfi - nop.m 999 -(p0) fmax.s1 FR_AA = FR_X_Prime, FR_E - nop.i 999 ;; +{ .mib + setf.d FR_A3 = GR_A3 // create A3 + add GR_ad_2 = 16,GR_ad_1 // address of A5,A4 +(p8) br.cond.spnt log1p_unorm // Branch if x=unorm } +;; +log1p_common: { .mfi - ld8 GR_Table_Base = [GR_Table_Base] -(p0) fmin.s1 FR_BB = FR_X_Prime, FR_E - nop.i 999 + nop.m 0 + frcpa.s1 FR_RcpX,p0 = f1,FR_Xp1 + nop.i 0 } - { .mfb - nop.m 999 -(p0) fadd.s1 FR_W = FR_X_Prime, FR_Em1 -// -// Begin load of constants base -// FR_Z = Z = |x| + E -// FR_W = W = |x| + Em1 -// AA = fmax(|x|,E) -// BB = fmin(|x|,E) -// -(p6) br.cond.spnt L(LOG_64_special) ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p10) br.cond.spnt L(LOG_64_unsupported) ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p13) br.cond.spnt L(LOG_64_negative) ;; -} - -{ .mib -(p0) getf.sig GR_signif = FR_Z - nop.i 999 -(p9) br.cond.spnt L(LOG_64_one) ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p8) br.cond.spnt L(LOG_64_zero) ;; + nop.m 0 +(p9) fma.d.s0 f8 = f8,f1,f0 // set V-flag +(p9) br.ret.spnt b0 // exit for NaN, NaT and +Inf } +;; { .mfi -(p0) getf.exp GR_N = FR_Z -// -// Raise possible denormal operand exception -// Create Bias -// -// This function computes ln( x + e ) -// Input FR 1: FR_X = FR_Input_X -// Input FR 2: FR_E = FR_E -// Input FR 3: FR_Em1 = FR_Em1 -// Input GR 1: GR_Expo_Range = GR_Expo_Range = 1 -// Output FR 4: FR_Y_hi -// Output FR 5: FR_Y_lo -// Output FR 6: FR_Scale -// Output PR 7: PR_Safe -// -(p0) fsub.s1 FR_S_lo = FR_AA, FR_Z -// -// signif = getf.sig(Z) -// abs_W = fabs(w) -// -(p0) extr.u GR_Table_ptr = GR_signif, 59, 4 ;; + getf.exp GR_Exp = FR_Xp1 // signexp of x+1 + fclass.m p10,p0 = FR_Xp1,0x3A // is 1+x < 0? 
+ and GR_exp_x = GR_exp_mask, GR_signexp_x // biased exponent of x } - { .mfi - nop.m 999 -(p0) fmerge.se FR_S_hi = f1,FR_Z -(p0) extr.u GR_X_0 = GR_signif, 49, 15 -} - -{ .mmi - nop.m 999 -(p0) addl GR_Table_Base1 = @ltoff(Constants_Z_G_H_h2#),gp - nop.i 999 + ldfpd FR_A7,FR_A6 = [GR_ad_1] + nop.f 0 + nop.i 0 } ;; -{ .mlx - ld8 GR_Table_Base1 = [GR_Table_Base1] -(p0) movl GR_Bias = 0x000000000000FFFF ;; -} - -{ .mfi - nop.m 999 -(p0) fabs FR_abs_W = FR_W -(p0) pmpyshr2.u GR_Table_ptr = GR_Table_ptr,GR_Table_Scale,0 -} - { .mfi - nop.m 999 -// -// Branch out for special input values -// -(p0) fcmp.lt.unc.s0 p8, p0 = FR_Input_X, f0 - nop.i 999 ;; + getf.sig GR_Sig = FR_Xp1 // get significand to calculate index + // for Thi,Tlo if |x| >= 2^-8 + fcmp.eq.s1 p12,p0 = f8,f0 // is x equal to 0? + sub GR_exp_x = GR_exp_x, GR_exp_bias // true exponent of x } +;; { .mfi - nop.m 999 -// -// X_0 = extr.u(signif,49,15) -// Index1 = extr.u(signif,59,4) -// -(p0) fadd.s1 FR_S_lo = FR_S_lo, FR_BB - nop.i 999 ;; -} - -{ .mii - nop.m 999 - nop.i 999 ;; -// -// Offset_to_Z1 = 24 * Index1 -// For performance, don't use result -// for 3 or 4 cycles. -// -(p0) add GR_Table_ptr = GR_Table_ptr, GR_Table_Base ;; + sub GR_N = GR_Exp,GR_exp_bias // true exponent of x+1 + fcmp.eq.s1 p11,p0 = FR_Xp1,f0 // is x = -1? 
+ cmp.gt p6,p7 = -8, GR_exp_x // Is |x| < 2^-8 } -// -// Add Base to Offset for Z1 -// Create Bias - -{ .mmi -(p0) ld4 GR_Z_1 = [GR_Table_ptr],4 ;; -(p0) ldfs FR_G = [GR_Table_ptr],4 - nop.i 999 ;; -} - -{ .mmi -(p0) ldfs FR_H = [GR_Table_ptr],8 ;; -(p0) ldfd FR_h = [GR_Table_ptr],0 -(p0) pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 +{ .mfb + ldfpd FR_A5,FR_A4 = [GR_ad_2],16 + nop.f 0 +(p10) br.cond.spnt log1p_lt_minus_1 // jump if x < -1 } -// -// Load Z_1 -// Get Base of Table2 -// +;; +// p6 is true if |x| < 1/256 +// p7 is true if |x| >= 1/256 +.pred.rel "mutex",p6,p7 { .mfi -(p0) getf.exp GR_M = FR_abs_W - nop.f 999 - nop.i 999 ;; -} - -{ .mii - nop.m 999 - nop.i 999 ;; -// -// M = getf.exp(abs_W) -// S_lo = AA - Z -// X_1 = pmpyshr2(X_0,Z_1,15) -// -(p0) sub GR_M = GR_M, GR_Bias ;; +(p7) add GR_ad_1 = 0x820,GR_ad_1 // address of log(2) parts +(p6) fms.s1 FR_r = f8,f1,f0 // range reduction for |x|<1/256 +(p6) cmp.gt.unc p10,p0 = -80, GR_exp_x // Is |x| < 2^-80 } -// -// M = M - Bias -// Load G1 -// N = getf.exp(Z) -// - -{ .mii -(p0) cmp.gt.unc p11, p0 = -80, GR_M -(p0) cmp.gt.unc p12, p0 = -7, GR_M ;; -(p0) extr.u GR_Index2 = GR_X_1, 6, 4 ;; -} - -{ .mib - nop.m 999 -// -// if -80 > M, set p11 -// Index2 = extr.u(X_1,6,4) -// if -7 > M, set p12 -// Load H1 -// -(p0) pmpyshr2.u GR_Index2 = GR_Index2,GR_Table_Scale,0 -(p11) br.cond.spnt L(log1p_small) ;; +{ .mfb +(p7) setf.sig FR_N = GR_N // copy unbiased exponent of x to the + // significand field of FR_N +(p7) fms.s1 FR_r = FR_RcpX,FR_Xp1,f1 // range reduction for |x|>=1/256 +(p12) br.ret.spnt b0 // exit for x=0, return x } +;; { .mib - nop.m 999 - nop.i 999 -(p12) br.cond.spnt L(log1p_near) ;; -} - -{ .mii -(p0) sub GR_N = GR_N, GR_Bias -// -// poly_lo = r * poly_lo -// -(p0) add GR_Perturb = 0x1, r0 ;; -(p0) sub GR_ScaleN = GR_Bias, GR_N -} - -{ .mii -(p0) setf.sig FR_float_N = GR_N - nop.i 999 ;; -// -// Prepare Index2 - pmpyshr2.u(X_1,Z_2,15) -// Load h1 -// S_lo = S_lo + BB -// Branch for -80 > M -// -(p0) add 
GR_Index2 = GR_Index2, GR_Table_Base1 -} - -{ .mmi -(p0) setf.exp FR_two_negN = GR_ScaleN - nop.m 999 -(p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h3#),gp -};; - -// -// Index2 points to Z2 -// Branch for -7 > M -// - -{ .mmb -(p0) ld4 GR_Z_2 = [GR_Index2],4 - ld8 GR_Table_Base = [GR_Table_Base] - nop.b 999 ;; -} -(p0) nop.i 999 -// -// Load Z_2 -// N = N - Bias -// Tablebase points to Table3 -// - -{ .mmi -(p0) ldfs FR_G_tmp = [GR_Index2],4 ;; -// -// Load G_2 -// pmpyshr2 X_2= (X_1,Z_2,15) -// float_N = setf.sig(N) -// ScaleN = Bias - N -// -(p0) ldfs FR_H_tmp = [GR_Index2],8 - nop.i 999 ;; -} -// -// Load H_2 -// two_negN = setf.exp(scaleN) -// G = G_1 * G_2 -// - -{ .mfi -(p0) ldfd FR_h_tmp = [GR_Index2],0 - nop.f 999 -(p0) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 ;; -} - -{ .mii - nop.m 999 -(p0) extr.u GR_Index3 = GR_X_2, 1, 5 ;; -// -// Load h_2 -// H = H_1 + H_2 -// h = h_1 + h_2 -// Index3 = extr.u(X_2,1,5) -// -(p0) shladd GR_Index3 = GR_Index3,4,GR_Table_Base +(p7) ldfpd FR_Ln2hi,FR_Ln2lo = [GR_ad_1],16 +(p7) extr.u GR_Ind = GR_Sig,55,8 // get bits from 55 to 62 as index +(p11) br.cond.spnt log1p_eq_minus_1 // jump if x = -1 } - -{ .mmi - nop.m 999 - nop.m 999 -// -// float_N = fcvt.xf(float_N) -// load G3 -// -(p0) addl GR_Table_Base = @ltoff(Constants_Q#),gp ;; -} - -{ .mfi -ld8 GR_Table_Base = [GR_Table_Base] -nop.f 999 -nop.i 999 -} ;; - -{ .mfi -(p0) ldfe FR_log2_hi = [GR_Table_Base],16 -(p0) fmpy.s1 FR_S_lo = FR_S_lo, FR_two_negN - nop.i 999 ;; -} - -{ .mmf - nop.m 999 -// -// G = G3 * G -// Load h3 -// Load log2_hi -// H = H + H3 -// -(p0) ldfe FR_log2_lo = [GR_Table_Base],16 -(p0) fmpy.s1 FR_G = FR_G, FR_G_tmp ;; -} - -{ .mmf -(p0) ldfs FR_G_tmp = [GR_Index3],4 -// -// h = h + h3 -// r = G * S_hi + 1 -// Load log2_lo -// -(p0) ldfe FR_Q4 = [GR_Table_Base],16 -(p0) fadd.s1 FR_h = FR_h, FR_h_tmp ;; -} - -{ .mfi -(p0) ldfe FR_Q3 = [GR_Table_Base],16 -(p0) fadd.s1 FR_H = FR_H, FR_H_tmp - nop.i 999 ;; -} - -{ .mmf -(p0) ldfs FR_H_tmp = [GR_Index3],4 
-(p0) ldfe FR_Q2 = [GR_Table_Base],16 -// -// Comput Index for Table3 -// S_lo = S_lo * two_negN -// -(p0) fcvt.xf FR_float_N = FR_float_N ;; -} -// -// If S_lo == 0, set p8 false -// Load H3 -// Load ptr to table of polynomial coeff. -// +;; { .mmf -(p0) ldfd FR_h_tmp = [GR_Index3],0 -(p0) ldfe FR_Q1 = [GR_Table_Base],0 -(p0) fcmp.eq.unc.s1 p0, p8 = FR_S_lo, f0 ;; -} - -{ .mfi - nop.m 999 -(p0) fmpy.s1 FR_G = FR_G, FR_G_tmp - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fadd.s1 FR_H = FR_H, FR_H_tmp - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fms.s1 FR_r = FR_G, FR_S_hi, f1 - nop.i 999 -} - -{ .mfi - nop.m 999 -(p0) fadd.s1 FR_h = FR_h, FR_h_tmp - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// Load Q4 -// Load Q3 -// Load Q2 -// Load Q1 -// -(p8) fma.s1 FR_r = FR_G, FR_S_lo, FR_r - nop.i 999 -} - -{ .mfi - nop.m 999 -// -// poly_lo = r * Q4 + Q3 -// rsq = r* r -// -(p0) fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// If (S_lo!=0) r = s_lo * G + r -// -(p0) fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 - nop.i 999 -} -// -// Create a 0x00000....01 -// poly_lo = poly_lo * rsq + h -// - -{ .mfi -(p0) setf.sig FR_dummy = GR_Perturb -(p0) fmpy.s1 FR_rsq = FR_r, FR_r - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// h = N * log2_lo + h -// Y_hi = n * log2_hi + H -// -(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 - nop.i 999 -} - -{ .mfi - nop.m 999 -(p0) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// poly_lo = r * poly_o + Q2 -// poly_hi = Q1 * rsq + r -// -(p0) fmpy.s1 FR_poly_lo = FR_poly_lo, FR_r - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_rsq, FR_h - nop.i 999 ;; -} - -{ .mfb - nop.m 999 -(p0) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo -// -// Create the FR for a binary "or" -// Y_lo = poly_hi + poly_lo -// -// (p0) for FR_dummy = FR_Y_lo,FR_dummy ;; -// -// Turn 
the lsb of Y_lo ON -// -// (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;; -// -// Merge the new lsb into Y_lo, for alone doesn't -// -(p0) br.cond.sptk L(LOG_main) ;; -} - - -L(log1p_near): - -{ .mmi - nop.m 999 - nop.m 999 -// /*******************************************************/ -// /*********** Branch log1p_near ************************/ -// /*******************************************************/ -(p0) addl GR_Table_Base = @ltoff(Constants_P#),gp ;; -} -// -// Load base address of poly. coeff. -// -{.mmi - nop.m 999 - ld8 GR_Table_Base = [GR_Table_Base] - nop.i 999 -};; - -{ .mmb -(p0) add GR_Table_ptr = 0x40,GR_Table_Base -// -// Address tables with separate pointers -// -(p0) ldfe FR_P8 = [GR_Table_Base],16 - nop.b 999 ;; +(p7) shladd GR_ad_2 = GR_Ind,3,GR_ad_2 // address of Thi +(p7) shladd GR_ad_1 = GR_Ind,2,GR_ad_1 // address of Tlo +(p10) fnma.d.s0 f8 = f8,f8,f8 // If |x| very small, result=x-x*x } +;; { .mmb -(p0) ldfe FR_P4 = [GR_Table_ptr],16 -// -// Load P4 -// Load P8 -// -(p0) ldfe FR_P7 = [GR_Table_Base],16 - nop.b 999 ;; -} - -{ .mmf -(p0) ldfe FR_P3 = [GR_Table_ptr],16 -// -// Load P3 -// Load P7 -// -(p0) ldfe FR_P6 = [GR_Table_Base],16 -(p0) fmpy.s1 FR_wsq = FR_W, FR_W ;; -} - -{ .mfi -(p0) ldfe FR_P2 = [GR_Table_ptr],16 - nop.f 999 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 FR_Y_hi = FR_W, FR_P4, FR_P3 - nop.i 999 -} -// -// Load P2 -// Load P6 -// Wsq = w * w -// Y_hi = p4 * w + p3 -// - -{ .mfi -(p0) ldfe FR_P5 = [GR_Table_Base],16 -(p0) fma.s1 FR_Y_lo = FR_W, FR_P8, FR_P7 - nop.i 999 ;; -} - -{ .mfi -(p0) ldfe FR_P1 = [GR_Table_ptr],16 -// -// Load P1 -// Load P5 -// Y_lo = p8 * w + P7 -// -(p0) fmpy.s1 FR_w4 = FR_wsq, FR_wsq - nop.i 999 ;; +(p7) ldfd FR_Thi = [GR_ad_2] +(p7) ldfs FR_Tlo = [GR_ad_1] +(p10) br.ret.spnt b0 // Exit if |x| < 2^(-80) } +;; { .mfi - nop.m 999 -(p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P2 - nop.i 999 + nop.m 0 + fma.s1 FR_r2 = FR_r,FR_r,f0 // r^2 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s1 FR_Y_lo = 
FR_W, FR_Y_lo, FR_P6 -(p0) add GR_Perturb = 0x1, r0 ;; + nop.m 0 + fms.s1 FR_A2 = FR_A3,FR_r,FR_A2 // A3*r+A2 + nop.i 0 } +;; { .mfi - nop.m 999 -// -// w4 = w2 * w2 -// Y_hi = y_hi * w + p2 -// Y_lo = y_lo * w + p6 -// Create perturbation bit -// -(p0) fmpy.s1 FR_w6 = FR_w4, FR_wsq - nop.i 999 ;; + nop.m 0 + fma.s1 FR_A6 = FR_A7,FR_r,FR_A6 // A7*r+A6 + nop.i 0 } - { .mfi - nop.m 999 -(p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P1 - nop.i 999 + nop.m 0 + fma.s1 FR_A4 = FR_A5,FR_r,FR_A4 // A5*r+A4 + nop.i 0 } -// -// Y_hi = y_hi * w + p1 -// w6 = w4 * w2 -// +;; { .mfi -(p0) setf.sig FR_Q4 = GR_Perturb -(p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P5 - nop.i 999 ;; + nop.m 0 +(p7) fcvt.xf FR_N = FR_N + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fma.s1 FR_Y_hi = FR_wsq,FR_Y_hi, FR_W - nop.i 999 -} - -{ .mfb - nop.m 999 -// -// Y_hi = y_hi * wsq + w -// Y_lo = y_lo * w + p5 -// -(p0) fmpy.s1 FR_Y_lo = FR_w6, FR_Y_lo -// -// Y_lo = y_lo * w6 -// -// (p0) for FR_dummy = FR_Y_lo,FR_dummy ;; -// -// Set lsb on: Taken out to improve performance -// -// (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;; -// -// Make sure it's on in Y_lo also. 
Taken out to improve -// performance -// -(p0) br.cond.sptk L(LOG_main) ;; -} - - -L(log1p_small): - -{ .mmi - nop.m 999 - nop.m 999 -// /*******************************************************/ -// /*********** Branch log1p_small ***********************/ -// /*******************************************************/ -(p0) addl GR_Table_Base = @ltoff(Constants_Threshold#),gp + nop.m 0 + fma.s1 FR_r4 = FR_r2,FR_r2,f0 // r^4 + nop.i 0 } - { .mfi - nop.m 999 -(p0) mov FR_Em1 = FR_W -(p0) cmp.eq.unc p7, p0 = r0, r0 ;; -} - -{ .mlx - ld8 GR_Table_Base = [GR_Table_Base] -(p0) movl GR_Expo_Range = 0x0000000000000002 ;; -} -// -// Set Safe to true -// Set Expo_Range = 0 for single -// Set Expo_Range = 2 for double -// Set Expo_Range = 4 for double-extended -// - -{ .mmi -(p0) shladd GR_Table_Base = GR_Expo_Range,4,GR_Table_Base ;; -(p0) ldfe FR_Threshold = [GR_Table_Base],16 - nop.i 999 -} - -{ .mlx - nop.m 999 -(p0) movl GR_Bias = 0x000000000000FF9B ;; + nop.m 0 + // (A3*r+A2)*r^2+r + fma.s1 FR_A2 = FR_A2,FR_r2,FR_r + nop.i 0 } +;; { .mfi -(p0) ldfe FR_Tiny = [GR_Table_Base],0 - nop.f 999 - nop.i 999 ;; + nop.m 0 + // (A7*r+A6)*r^2+(A5*r+A4) + fma.s1 FR_A4 = FR_A6,FR_r2,FR_A4 + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fcmp.gt.unc.s1 p13, p12 = FR_abs_W, FR_Threshold - nop.i 999 ;; + nop.m 0 + // N*Ln2hi+Thi +(p7) fma.s1 FR_NxLn2hipThi = FR_N,FR_Ln2hi,FR_Thi + nop.i 0 } - { .mfi - nop.m 999 -(p13) fnmpy.s1 FR_Y_lo = FR_W, FR_W - nop.i 999 + nop.m 0 + // N*Ln2lo+Tlo +(p7) fma.s1 FR_NxLn2lopTlo = FR_N,FR_Ln2lo,FR_Tlo + nop.i 0 } +;; { .mfi - nop.m 999 -(p13) fadd FR_SCALE = f0, f1 - nop.i 999 ;; + nop.m 0 +(p7) fma.s1 f8 = FR_A4,FR_r4,FR_A2 // P(r) if |x| >= 1/256 + nop.i 0 } - { .mfi - nop.m 999 -(p12) fsub.s1 FR_Y_lo = f0, FR_Tiny -(p12) cmp.ne.unc p7, p0 = r0, r0 + nop.m 0 + // (N*Ln2hi+Thi) + (N*Ln2lo+Tlo) +(p7) fma.s1 FR_NxLn2pT = FR_NxLn2hipThi,f1,FR_NxLn2lopTlo + nop.i 0 } +;; +.pred.rel "mutex",p6,p7 { .mfi -(p12) setf.exp FR_SCALE = GR_Bias - nop.f 999 - nop.i 999 ;; + 
nop.m 0 +(p6) fma.d.s0 f8 = FR_A4,FR_r4,FR_A2 // result if 2^(-80) <= |x| < 1/256 + nop.i 0 } - -// -// Set p7 to SAFE = FALSE -// Set Scale = 2^-100 -// { .mfb - nop.m 999 -(p0) fma.d.s0 FR_Input_X = FR_Y_lo,FR_SCALE,FR_Y_hi -(p0) br.ret.sptk b0 + nop.m 0 +(p7) fma.d.s0 f8 = f8,f1,FR_NxLn2pT // result if |x| >= 1/256 + br.ret.sptk b0 // Exit if |x| >= 2^(-80) } ;; -L(LOG_64_one): - +.align 32 +log1p_unorm: +// Here if x=unorm { .mfb - nop.m 999 -(p0) fmpy.d.s0 FR_Input_X = FR_Input_X, f0 -(p0) br.ret.sptk b0 + getf.exp GR_signexp_x = FR_NormX // recompute biased exponent + nop.f 0 + br.cond.sptk log1p_common } ;; -// -// Raise divide by zero for +/-0 input. -// -L(LOG_64_zero): - +.align 32 +log1p_eq_minus_1: +// Here if x=-1 { .mfi -(p0) mov GR_Parameter_TAG = 140 -// -// If we have log1p(0), return -Inf. -// -(p0) fsub.s0 FR_Output_X_tmp = f0, f1 - nop.i 999 ;; + nop.m 0 + fmerge.s FR_X = f8,f8 // keep input argument for subsequent + // call of __libm_error_support# + nop.i 0 } -{ .mfb - nop.m 999 -(p0) frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0 -(p0) br.cond.sptk L(LOG_ERROR_Support) ;; -} - -L(LOG_64_special): +;; { .mfi - nop.m 999 -// -// Return -Inf or value from handler. -// -(p0) fclass.m.unc p7, p0 = FR_Input_X, 0x1E1 - nop.i 999 ;; + mov GR_TAG = 140 // set libm error in case of log1p(-1). + frcpa.s0 f8,p0 = f8,f0 // log1p(-1) should be equal to -INF. + // We can get it using frcpa because it + // sets result to the IEEE-754 mandated + // quotient of f8/f0. + nop.i 0 } -{ .mfb - nop.m 999 -// -// Check for Natval, QNan, SNaN, +Inf -// -(p7) fmpy.d.s0 f8 = FR_Input_X, f1 -// -// For SNaN raise invalid and return QNaN. -// For QNaN raise invalid and return QNaN. -// For +Inf return +Inf. -// -(p7) br.ret.sptk b0 +{ .mib + nop.m 0 + nop.i 0 + br.cond.sptk log_libm_err } ;; -// -// For -Inf raise invalid and return QNaN. 
-// - -{ .mfb -(p0) mov GR_Parameter_TAG = 141 -(p0) fmpy.d.s0 FR_Output_X_tmp = FR_Input_X, f0 -(p0) br.cond.sptk L(LOG_ERROR_Support) ;; +.align 32 +log1p_lt_minus_1: +// Here if x < -1 +{ .mfi + nop.m 0 + fmerge.s FR_X = f8,f8 + nop.i 0 } +;; -// -// Report that log1p(-Inf) computed -// - -L(LOG_64_unsupported): - -// -// Return generated NaN or other value . -// - -{ .mfb - nop.m 999 -(p0) fmpy.d.s0 FR_Input_X = FR_Input_X, f0 -(p0) br.ret.sptk b0 ;; +{ .mfi + mov GR_TAG = 141 // set libm error in case of x < -1. + frcpa.s0 f8,p0 = f0,f0 // log1p(x) x < -1 should be equal to NaN. + // We can get it using frcpa because it + // sets result to the IEEE-754 mandated + // quotient of f0/f0 i.e. NaN. + nop.i 0 } +;; -L(LOG_64_negative): - -{ .mfi - nop.m 999 -// -// Deal with x < 0 in a special way -// -(p0) frcpa.s0 FR_Output_X_tmp, p8 = f0, f0 -// -// Deal with x < 0 in a special way - raise -// invalid and produce QNaN indefinite. -// -(p0) mov GR_Parameter_TAG = 141 +.align 32 +log_libm_err: +{ .mmi + alloc r32 = ar.pfs,1,4,4,0 + mov GR_Parameter_TAG = GR_TAG + nop.i 0 } +;; -.endp log1p# -ASM_SIZE_DIRECTIVE(log1p) +GLOBAL_IEEE754_END(log1p) -.proc __libm_error_region -__libm_error_region: -L(LOG_ERROR_Support): +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue - -// (1) { .mfi - add GR_Parameter_Y=-32,sp // Parameter 2 value + add GR_Parameter_Y = -32,sp // Parameter 2 value nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS = ar.pfs // Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + add sp = -64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP = gp // Save gp };; - - -// (2) { .mmi - stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack + stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0 = b0 // Save b0 };; - .body -// (3) { .mib - stfd 
[GR_Parameter_X] =FR_Input_X // STORE Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address - nop.b 0 + stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 } { .mib - stfd [GR_Parameter_Y] = FR_Output_X_tmp // STORE Parameter 3 on stack + stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; - -// (4) { .mmi - ldfd FR_Input_X = [GR_Parameter_RESULT] // Get return result off stack + ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp - add sp = 64,sp // Restore stack pointer - mov b0 = GR_SAVE_B0 // Restore return address + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address };; { .mib - mov gp = GR_SAVE_GP // Restore gp - mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - br.ret.sptk b0 + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return };; - -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) - -.proc __libm_LOG_main -__libm_LOG_main: -L(LOG_main): - -// -// kernel_log_64 computes ln(X + E) -// - -{ .mfi - nop.m 999 -(p7) fadd.d.s0 FR_Input_X = FR_Y_lo,FR_Y_hi - nop.i 999 -} - -{ .mmi - nop.m 999 - nop.m 999 -(p14) addl GR_Table_Base = @ltoff(Constants_1_by_LN10#),gp ;; -} - -{ .mmi - nop.m 999 -(p14) ld8 GR_Table_Base = [GR_Table_Base] - nop.i 999 -};; - -{ .mmi -(p14) ldfe FR_1LN10_hi = [GR_Table_Base],16 ;; -(p14) ldfe FR_1LN10_lo = [GR_Table_Base] - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p14) fmpy.s1 FR_Output_X_tmp = FR_Y_lo,FR_1LN10_hi - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p14) fma.s1 FR_Output_X_tmp = 
FR_Y_hi,FR_1LN10_lo,FR_Output_X_tmp - nop.i 999 ;; -} - -{ .mfb - nop.m 999 -(p14) fma.d.s0 FR_Input_X = FR_Y_hi,FR_1LN10_hi,FR_Output_X_tmp -(p0) br.ret.sptk b0 ;; -} -.endp __libm_LOG_main -ASM_SIZE_DIRECTIVE(__libm_LOG_main) - +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# + diff --git a/sysdeps/ia64/fpu/s_log1pf.S b/sysdeps/ia64/fpu/s_log1pf.S index 8aff9b895a..a148d4b272 100644 --- a/sysdeps/ia64/fpu/s_log1pf.S +++ b/sysdeps/ia64/fpu/s_log1pf.S @@ -1,10 +1,10 @@ -.file "log1pf.s" +.file "log1pf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,1610 +20,768 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial version -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial version +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. 
+// 06/29/01 Improved speed of all paths +// 05/20/02 Cleaned up namespace and sf0 syntax +// 10/02/02 Improved performance by basing on log algorithm +// 02/10/03 Reordered header: .section, .global, .proc, .align +// 04/18/03 Eliminate possible WAW dependency warning // -// ********************************************************************* -// -// Function: log1pf(x) = ln(x+1), for single precision values -// -// ********************************************************************* -// -// Accuracy: Very accurate for single precision values -// -// ********************************************************************* -// -// Resources Used: -// -// Floating-Point Registers: f8 (Input and Return Value) -// f9,f33-f55,f99 -// -// General Purpose Registers: -// r32-r53 -// r54-r57 (Used to pass arguments to error handling routine) -// -// Predicate Registers: p6-p15 -// -// ********************************************************************* -// -// IEEE Special Conditions: -// -// Denormal fault raised on denormal inputs -// Overflow exceptions cannot occur -// Underflow exceptions raised when appropriate for log1pf -// (Error Handling Routine called for underflow) -// Inexact raised when appropriate by algorithm -// -// log1pf(inf) = inf -// log1pf(-inf) = QNaN -// log1pf(+/-0) = +/-0 -// log1pf(-1) = -inf -// log1pf(SNaN) = QNaN -// log1pf(QNaN) = QNaN -// log1pf(EM_special Values) = QNaN -// -// ********************************************************************* -// -// Computation is based on the following kernel. -// -// ker_log_64( in_FR : X, -// in_FR : E, -// in_FR : Em1, -// in_GR : Expo_Range, -// out_FR : Y_hi, -// out_FR : Y_lo, -// out_FR : Scale, -// out_PR : Safe ) -// -// Overview -// -// The method consists of three cases. -// -// If |X+Em1| < 2^(-80) use case log1pf_small; -// elseif |X+Em1| < 2^(-7) use case log_near1; -// else use case log_regular; -// -// Case log1pf_small: -// -// log( 1 + (X+Em1) ) can be approximated by (X+Em1). 
-// -// Case log_near1: -// -// log( 1 + (X+Em1) ) can be approximated by a simple polynomial -// in W = X+Em1. This polynomial resembles the truncated Taylor -// series W - W^/2 + W^3/3 - ... -// -// Case log_regular: -// -// Here we use a table lookup method. The basic idea is that in -// order to compute log(Arg) for an argument Arg in [1,2), we -// construct a value G such that G*Arg is close to 1 and that -// log(1/G) is obtainable easily from a table of values calculated -// beforehand. Thus -// -// log(Arg) = log(1/G) + log(G*Arg) -// = log(1/G) + log(1 + (G*Arg - 1)) -// -// Because |G*Arg - 1| is small, the second term on the right hand -// side can be approximated by a short polynomial. We elaborate -// this method in four steps. -// -// Step 0: Initialization -// -// We need to calculate log( E + X ). Obtain N, S_hi, S_lo such that -// -// E + X = 2^N * ( S_hi + S_lo ) exactly -// -// where S_hi in [1,2) and S_lo is a correction to S_hi in the sense -// that |S_lo| <= ulp(S_hi). -// -// Step 1: Argument Reduction -// -// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate -// -// G := G_1 * G_2 * G_3 -// r := (G * S_hi - 1) + G * S_lo -// -// These G_j's have the property that the product is exactly -// representable and that |r| < 2^(-12) as a result. -// -// Step 2: Approximation -// -// -// log(1 + r) is approximated by a short polynomial poly(r). -// -// Step 3: Reconstruction -// -// -// Finally, log( E + X ) is given by -// -// log( E + X ) = log( 2^N * (S_hi + S_lo) ) -// ~=~ N*log(2) + log(1/G) + log(1 + r) -// ~=~ N*log(2) + log(1/G) + poly(r). -// -// **** Algorithm **** -// -// Case log1pf_small: -// -// Although log(1 + (X+Em1)) is basically X+Em1, we would like to -// preserve the inexactness nature as well as consistent behavior -// under different rounding modes. Note that this case can only be -// taken if E is set to be 1.0. 
In this case, Em1 is zero, and that -// X can be very tiny and thus the final result can possibly underflow. -// Thus, we compare X against a threshold that is dependent on the -// input Expo_Range. If |X| is smaller than this threshold, we set -// SAFE to be FALSE. -// -// The result is returned as Y_hi, Y_lo, and in the case of SAFE -// is FALSE, an additional value Scale is also returned. -// -// W := X + Em1 -// Threshold := Threshold_Table( Expo_Range ) -// Tiny := Tiny_Table( Expo_Range ) -// -// If ( |W| > Threshold ) then -// Y_hi := W -// Y_lo := -W*W -// Else -// Y_hi := W -// Y_lo := -Tiny -// Scale := 2^(-100) -// Safe := FALSE -// EndIf -// -// -// One may think that Y_lo should be -W*W/2; however, it does not matter -// as Y_lo will be rounded off completely except for the correct effect in -// directed rounding. Clearly -W*W is simplier to compute. Moreover, -// because of the difference in exponent value, Y_hi + Y_lo or -// Y_hi + Scale*Y_lo is always inexact. -// -// Case log_near1: -// -// Here we compute a simple polynomial. To exploit parallelism, we split -// the polynomial into two portions. -// -// W := X + Em1 -// Wsq := W * W -// W4 := Wsq*Wsq -// W6 := W4*Wsq -// Y_hi := W + Wsq*(P_1 + W*(P_2 + W*(P_3 + W*P_4)) -// Y_lo := W6*(P_5 + W*(P_6 + W*(P_7 + W*P_8))) -// set lsb(Y_lo) to be 1 -// -// Case log_regular: -// -// We present the algorithm in four steps. -// -// Step 0. Initialization -// ---------------------- -// -// Z := X + E -// N := unbaised exponent of Z -// S_hi := 2^(-N) * Z -// S_lo := 2^(-N) * { (max(X,E)-Z) + min(X,E) } -// -// Note that S_lo is always 0 for the case E = 0. -// -// Step 1. Argument Reduction -// -------------------------- -// -// Let -// -// Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63 -// -// We obtain G_1, G_2, G_3 by the following steps. -// +// API +//============================================================== +// float log1pf(float) // -// Define X_0 := 1.d_1 d_2 ... d_14. 
This is extracted -// from S_hi. +// log1p(x) = log(x+1) // -// Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated -// to lsb = 2^(-4). +// Overview of operation +//============================================================== +// Background +// ---------- // -// Define index_1 := [ d_1 d_2 d_3 d_4 ]. +// This algorithm is based on fact that +// log1p(x) = log(1+x) and +// log(a b) = log(a) + log(b). +// In our case we have 1+x = 2^N f, where 1 <= f < 2. +// So +// log(1+x) = log(2^N f) = log(2^N) + log(f) = n*log(2) + log(f) // -// Fetch Z_1 := (1/A_1) rounded UP in fixed point with -// fixed point lsb = 2^(-15). -// Z_1 looks like z_0.z_1 z_2 ... z_15 -// Note that the fetching is done using index_1. -// A_1 is actually not needed in the implementation -// and is used here only to explain how is the value -// Z_1 defined. +// To calculate log(f) we do following +// log(f) = log(f * frcpa(f) / frcpa(f)) = +// = log(f * frcpa(f)) + log(1/frcpa(f)) // -// Fetch G_1 := (1/A_1) truncated to 21 sig. bits. -// floating pt. Again, fetching is done using index_1. A_1 -// explains how G_1 is defined. +// According to definition of IA-64's frcpa instruction it's a +// floating point that approximates 1/f using a lookup on the +// top of 8 bits of the input number's + 1 significand with relative +// error < 2^(-8.886). So we have following // -// Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14) -// = 1.0 0 0 0 d_5 ... d_14 -// This is accomplised by integer multiplication. -// It is proved that X_1 indeed always begin -// with 1.0000 in fixed point. +// |(1/f - frcpa(f)) / (1/f))| = |1 - f*frcpa(f)| < 1/256 // +// and // -// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1 -// truncated to lsb = 2^(-8). Similar to A_1, -// A_2 is not needed in actual implementation. It -// helps explain how some of the values are defined. +// log(f) = log(f * frcpa(f)) + log(1/frcpa(f)) = +// = log(1 + r) + T // -// Define index_2 := [ d_5 d_6 d_7 d_8 ]. 
+// The first value can be computed by polynomial P(r) approximating +// log(1 + r) on |r| < 1/256 and the second is precomputed tabular +// value defined by top 8 bits of f. // -// Fetch Z_2 := (1/A_2) rounded UP in fixed point with -// fixed point lsb = 2^(-15). Fetch done using index_2. -// Z_2 looks like z_0.z_1 z_2 ... z_15 +// Finally we have that log(1+x) ~ (N*log(2) + T) + P(r) // -// Fetch G_2 := (1/A_2) truncated to 21 sig. bits. -// floating pt. +// Note that if input argument is close to 0.0 (in our case it means +// that |x| < 1/256) we can use just polynomial approximation +// because 1+x = 2^0 * f = f = 1 + r and +// log(1+x) = log(1 + r) ~ P(r) // -// Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14) -// = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14 -// This is accomplised by integer multiplication. -// It is proved that X_2 indeed always begin -// with 1.00000000 in fixed point. // +// Implementation +// -------------- // -// Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1. -// This is 2^(-14) + X_2 truncated to lsb = 2^(-13). +// 1. |x| >= 2^(-8), and x > -1 +// InvX = frcpa(x+1) +// r = InvX*(x+1) - 1 +// P(r) = r*((1 - A2*r) + r^2*(A3 - A4*r)) = r*P2(r), +// A4,A3,A2 are created with setf instruction. +// We use Taylor series and so A4 = 1/4, A3 = 1/3, +// A2 = 1/2 rounded to double. // -// Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ]. +// N = float(n) where n is true unbiased exponent of 1+x // -// Fetch G_3 := (1/A_3) truncated to 21 sig. bits. -// floating pt. Fetch is done using index_3. +// T is tabular value of log(1/frcpa(x)) calculated in quad precision +// and rounded to double. To load T we get bits from 55 to 62 of register +// format significand as index and calculate address +// ad_T = table_base_addr + 8 * index // -// Compute G := G_1 * G_2 * G_3. +// L1 (log(2)) is calculated in quad precision and rounded to double; +// it's created with setf // -// This is done exactly since each of G_j only has 21 sig. bits. 
+// And final result = P2(r)*r + (T + N*L1) // // -// Compute // -// r := (G*S_hi - 1) + G*S_lo using 2 FMA operations. +// 2. 2^(-40) <= |x| < 2^(-8) +// r = x +// P(r) = r*((1 - A2*r) + r^2*(A3 - A4*r)) = r*P2(r), +// A4,A3,A2 are the same as in case |x| >= 1/256 // -// thus, r approximates G*(S_hi+S_lo) - 1 to within a couple of -// rounding errors. +// And final result = P2(r)*r // +// 3. 0 < |x| < 2^(-40) +// Although log1p(x) is basically x, we would like to preserve the inexactness +// nature as well as consistent behavior under different rounding modes. +// We can do this by computing the result as // -// Step 2. Approximation -// --------------------- +// log1p(x) = x - x*x // -// This step computes an approximation to log( 1 + r ) where r is the -// reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13); -// thus log(1+r) can be approximated by a short polynomial: // -// log(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5 +// Note: NaT, any NaNs, +/-INF, +/-0, negatives and unnormalized numbers are +// filtered and processed on special branches. // + // -// Step 3. Reconstruction -// ---------------------- +// Special values +//============================================================== // -// This step computes the desired result of log(X+E): +// log1p(-1) = -inf // Call error support // -// log(X+E) = log( 2^N * (S_hi + S_lo) ) -// = N*log(2) + log( S_hi + S_lo ) -// = N*log(2) + log(1/G) + -// log(1 + C*(S_hi+S_lo) - 1 ) +// log1p(+qnan) = +qnan +// log1p(-qnan) = -qnan +// log1p(+snan) = +qnan +// log1p(-snan) = -qnan // -// log(2), log(1/G_j) are stored as pairs of (single,double) numbers: -// log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are -// single-precision numbers and the low parts are double precision -// numbers. 
These have the property that +// log1p(x),x<-1= QNAN Indefinite // Call error support +// log1p(-inf) = QNAN Indefinite +// log1p(+inf) = +inf +// log1p(+/-0) = +/-0 // -// N*log2_hi + SUM ( log1byGj_hi ) // -// is computable exactly in double-extended precision (64 sig. bits). -// Finally +// Registers used +//============================================================== +// Floating Point registers used: +// f8, input +// f7 -> f15, f32 -> f36 // -// Y_hi := N*log2_hi + SUM ( log1byGj_hi ) -// Y_lo := poly_hi + [ poly_lo + -// ( SUM ( log1byGj_lo ) + N*log2_lo ) ] -// set lsb(Y_lo) to be 1 +// General registers used: +// r8 -> r11 +// r14 -> r22 // +// Predicate registers used: +// p6 -> p12 -#include "libm_support.h" - -#ifdef _LIBC -.rodata -#else -.data -#endif +// Assembly macros +//============================================================== +GR_TAG = r8 +GR_ad_T = r9 +GR_Exp = r10 +GR_N = r11 -// P_7, P_6, P_5, P_4, P_3, P_2, and P_1 +GR_signexp_x = r14 +GR_exp_mask = r15 +GR_exp_bias = r16 +GR_05 = r17 +GR_A3 = r18 +GR_Sig = r19 +GR_Ind = r19 +GR_exp_x = r20 +GR_Ln2 = r21 +GR_025 = r22 -.align 64 -Constants_P: -ASM_TYPE_DIRECTIVE(Constants_P,@object) -data4 0xEFD62B15,0xE3936754,0x00003FFB,0x00000000 -data4 0xA5E56381,0x8003B271,0x0000BFFC,0x00000000 -data4 0x73282DB0,0x9249248C,0x00003FFC,0x00000000 -data4 0x47305052,0xAAAAAA9F,0x0000BFFC,0x00000000 -data4 0xCCD17FC9,0xCCCCCCCC,0x00003FFC,0x00000000 -data4 0x00067ED5,0x80000000,0x0000BFFD,0x00000000 -data4 0xAAAAAAAA,0xAAAAAAAA,0x00003FFD,0x00000000 -data4 0xFFFFFFFE,0xFFFFFFFF,0x0000BFFD,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_P) - -// log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1 -.align 64 -Constants_Q: -ASM_TYPE_DIRECTIVE(Constants_Q,@object) -data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 -data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 -data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 -data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 -data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 -data4 
0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_Q) - -// Z1 - 16 bit fixed, G1 and H1 - IEEE single - -.align 64 -Constants_Z_G_H_h1: -ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h1,@object) -data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000 -data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000,0x617D741C,0x3DA163A6 -data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000,0xCBD3D5BB,0x3E2C55E6 -data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000,0xD86EA5E7,0xBE3EB0BF -data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000,0x86B12760,0x3E2E6A8C -data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000,0x5C0739BA,0x3E47574C -data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000,0x13E8AF2F,0x3E20E30F -data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000,0xF2C630BD,0xBE42885B -data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000,0x97E577C6,0x3E497F34 -data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000,0xA6B0A5AB,0x3E3E6A6E -data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000,0xD328D9BE,0xBDF43E3C -data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000,0x0ADB090A,0x3E4094C3 -data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000,0xFC1FE510,0xBE28FBB2 -data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000,0x10FDE3FA,0x3E3A7895 -data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000,0x7CC8C98F,0x3E508CE5 -data4 0x00004211,0x3F042108,0x3F29516A,0x00000000,0xA223106C,0xBE534874 -ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h1) - -// Z2 - 16 bit fixed, G2 and H2 - IEEE single +GR_SAVE_B0 = r33 +GR_SAVE_PFS = r34 +GR_SAVE_GP = r35 +GR_SAVE_SP = r36 -.align 64 -Constants_Z_G_H_h2: -ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h2,@object) -data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000 -data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000,0x22C42273,0x3DB5A116 -data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000,0x21F86ED3,0x3DE620CF -data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000,0x484F34ED,0xBDAFA07E -data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000,0x3860BCF6,0xBDFE07F0 -data4 
0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000,0xA78093D6,0x3DEA370F -data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000,0x72A753D0,0x3DFF5791 -data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000,0xA7EF896B,0x3DFEBE6C -data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000,0x409ECB43,0x3E0CF156 -data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000,0xFFEF71DF,0xBE0B6F97 -data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000,0x5D59EEE8,0xBE080483 -data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000,0xA9192A74,0x3E1F91E9 -data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000,0xBF72A8CD,0xBE139A06 -data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000,0xF8FBA6CF,0x3E1D9202 -data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000,0xBA796223,0xBE1DCCC4 -data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000,0xB6B7C239,0xBE049391 -ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h2) - -// G3 and H3 - IEEE single and h3 -IEEE double +GR_Parameter_X = r37 +GR_Parameter_Y = r38 +GR_Parameter_RESULT = r39 +GR_Parameter_TAG = r40 -.align 64 -Constants_Z_G_H_h3: -ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h3,@object) -data4 0x3F7FFC00,0x38800100,0x562224CD,0x3D355595 -data4 0x3F7FF400,0x39400480,0x06136FF6,0x3D8200A2 -data4 0x3F7FEC00,0x39A00640,0xE8DE9AF0,0x3DA4D68D -data4 0x3F7FE400,0x39E00C41,0xB10238DC,0xBD8B4291 -data4 0x3F7FDC00,0x3A100A21,0x3B1952CA,0xBD89CCB8 -data4 0x3F7FD400,0x3A300F22,0x1DC46826,0xBDB10707 -data4 0x3F7FCC08,0x3A4FF51C,0xF43307DB,0x3DB6FCB9 -data4 0x3F7FC408,0x3A6FFC1D,0x62DC7872,0xBD9B7C47 -data4 0x3F7FBC10,0x3A87F20B,0x3F89154A,0xBDC3725E -data4 0x3F7FB410,0x3A97F68B,0x62B9D392,0xBD93519D -data4 0x3F7FAC18,0x3AA7EB86,0x0F21BD9D,0x3DC18441 -data4 0x3F7FA420,0x3AB7E101,0x2245E0A6,0xBDA64B95 -data4 0x3F7F9C20,0x3AC7E701,0xAABB34B8,0x3DB4B0EC -data4 0x3F7F9428,0x3AD7DD7B,0x6DC40A7E,0x3D992337 -data4 0x3F7F8C30,0x3AE7D474,0x4F2083D3,0x3DC6E17B -data4 0x3F7F8438,0x3AF7CBED,0x811D4394,0x3DAE314B -data4 0x3F7F7C40,0x3B03E1F3,0xB08F2DB1,0xBDD46F21 -data4 0x3F7F7448,0x3B0BDE2F,0x6D34522B,0xBDDC30A4 -data4 
0x3F7F6C50,0x3B13DAAA,0xB1F473DB,0x3DCB0070 -data4 0x3F7F6458,0x3B1BD766,0x6AD282FD,0xBDD65DDC -data4 0x3F7F5C68,0x3B23CC5C,0xF153761A,0xBDCDAB83 -data4 0x3F7F5470,0x3B2BC997,0x341D0F8F,0xBDDADA40 -data4 0x3F7F4C78,0x3B33C711,0xEBC394E8,0x3DCD1BD7 -data4 0x3F7F4488,0x3B3BBCC6,0x52E3E695,0xBDC3532B -data4 0x3F7F3C90,0x3B43BAC0,0xE846B3DE,0xBDA3961E -data4 0x3F7F34A0,0x3B4BB0F4,0x785778D4,0xBDDADF06 -data4 0x3F7F2CA8,0x3B53AF6D,0xE55CE212,0x3DCC3ED1 -data4 0x3F7F24B8,0x3B5BA620,0x9E382C15,0xBDBA3103 -data4 0x3F7F1CC8,0x3B639D12,0x5C5AF197,0x3D635A0B -data4 0x3F7F14D8,0x3B6B9444,0x71D34EFC,0xBDDCCB19 -data4 0x3F7F0CE0,0x3B7393BC,0x52CD7ADA,0x3DC74502 -data4 0x3F7F04F0,0x3B7B8B6D,0x7D7F2A42,0xBDB68F17 -ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h3) - -// -// Exponent Thresholds and Tiny Thresholds -// for 8, 11, 15, and 17 bit exponents -// -// Expo_Range Value -// -// 0 (8 bits) 2^(-126) -// 1 (11 bits) 2^(-1022) -// 2 (15 bits) 2^(-16382) -// 3 (17 bits) 2^(-16382) -// -// Tiny_Table -// ---------- -// Expo_Range Value -// -// 0 (8 bits) 2^(-16382) -// 1 (11 bits) 2^(-16382) -// 2 (15 bits) 2^(-16382) -// 3 (17 bits) 2^(-16382) -// -.align 64 -Constants_Threshold: -ASM_TYPE_DIRECTIVE(Constants_Threshold,@object) -data4 0x00000000,0x80000000,0x00003F81,0x00000000 -data4 0x00000000,0x80000000,0x00000001,0x00000000 -data4 0x00000000,0x80000000,0x00003C01,0x00000000 -data4 0x00000000,0x80000000,0x00000001,0x00000000 -data4 0x00000000,0x80000000,0x00000001,0x00000000 -data4 0x00000000,0x80000000,0x00000001,0x00000000 -data4 0x00000000,0x80000000,0x00000001,0x00000000 -data4 0x00000000,0x80000000,0x00000001,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_Threshold) -.align 64 -Constants_1_by_LN10: -ASM_TYPE_DIRECTIVE(Constants_1_by_LN10,@object) -data4 0x37287195,0xDE5BD8A9,0x00003FFD,0x00000000 -data4 0xACCF70C8,0xD56EAABE,0x00003FBD,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_1_by_LN10) +FR_NormX = f7 +FR_RcpX = f9 +FR_r = f10 +FR_r2 = f11 +FR_r4 = f12 +FR_N = f13 +FR_Ln2 = f14 +FR_Xp1 = 
f15 -FR_Input_X = f8 -FR_Neg_One = f9 -FR_E = f33 -FR_Em1 = f34 -FR_Y_hi = f34 -// Shared with Em1 -FR_Y_lo = f35 -FR_Scale = f36 -FR_X_Prime = f37 -FR_Z = f38 -FR_S_hi = f38 -// Shared with Z -FR_W = f39 -FR_G = f40 -FR_wsq = f40 -// Shared with G -FR_H = f41 -FR_w4 = f41 -// Shared with H -FR_h = f42 -FR_w6 = f42 -// Shared with h -FR_G_tmp = f43 -FR_poly_lo = f43 -// Shared with G_tmp -FR_P8 = f43 -// Shared with G_tmp -FR_H_tmp = f44 -FR_poly_hi = f44 - // Shared with H_tmp -FR_P7 = f44 -// Shared with H_tmp -FR_h_tmp = f45 -FR_rsq = f45 -// Shared with h_tmp -FR_P6 = f45 -// Shared with h_tmp -FR_abs_W = f46 -FR_r = f46 -// Shared with abs_W -FR_AA = f47 -FR_log2_hi = f47 -// Shared with AA -FR_BB = f48 -FR_log2_lo = f48 -// Shared with BB -FR_S_lo = f49 -FR_two_negN = f50 -FR_float_N = f51 -FR_Q4 = f52 -FR_dummy = f52 -// Shared with Q4 -FR_P4 = f52 -// Shared with Q4 -FR_Threshold = f52 -// Shared with Q4 -FR_Q3 = f53 -FR_P3 = f53 -// Shared with Q3 -FR_Tiny = f53 -// Shared with Q3 -FR_Q2 = f54 -FR_P2 = f54 -// Shared with Q2 -FR_1LN10_hi = f54 -// Shared with Q2 -FR_Q1 = f55 -FR_P1 = f55 -// Shared with Q1 -FR_1LN10_lo = f55 -// Shared with Q1 -FR_P5 = f98 -FR_SCALE = f98 -FR_Output_X_tmp = f99 +FR_A4 = f33 +FR_A3 = f34 +FR_A2 = f35 -GR_Expo_Range = r32 -GR_Table_Base = r34 -GR_Table_Base1 = r35 -GR_Table_ptr = r36 -GR_Index2 = r37 -GR_signif = r38 -GR_X_0 = r39 -GR_X_1 = r40 -GR_X_2 = r41 -GR_Z_1 = r42 -GR_Z_2 = r43 -GR_N = r44 -GR_Bias = r45 -GR_M = r46 -GR_ScaleN = r47 -GR_Index3 = r48 -GR_Perturb = r49 -GR_Table_Scale = r50 +FR_T = f36 +FR_NxLn2pT = f36 -GR_SAVE_PFS = r51 -GR_SAVE_B0 = r52 -GR_SAVE_GP = r53 -GR_Parameter_X = r54 -GR_Parameter_Y = r55 -GR_Parameter_RESULT = r56 +FR_Y = f1 +FR_X = f10 +FR_RESULT = f8 -GR_Parameter_TAG = r57 +// Data +//============================================================== +RODATA +.align 16 + +LOCAL_OBJECT_START(log_data) +// ln(1/frcpa(1+i/256)), i=0...255 +data8 0x3F60040155D5889E // 0 +data8 
0x3F78121214586B54 // 1 +data8 0x3F841929F96832F0 // 2 +data8 0x3F8C317384C75F06 // 3 +data8 0x3F91A6B91AC73386 // 4 +data8 0x3F95BA9A5D9AC039 // 5 +data8 0x3F99D2A8074325F4 // 6 +data8 0x3F9D6B2725979802 // 7 +data8 0x3FA0C58FA19DFAAA // 8 +data8 0x3FA2954C78CBCE1B // 9 +data8 0x3FA4A94D2DA96C56 // 10 +data8 0x3FA67C94F2D4BB58 // 11 +data8 0x3FA85188B630F068 // 12 +data8 0x3FAA6B8ABE73AF4C // 13 +data8 0x3FAC441E06F72A9E // 14 +data8 0x3FAE1E6713606D07 // 15 +data8 0x3FAFFA6911AB9301 // 16 +data8 0x3FB0EC139C5DA601 // 17 +data8 0x3FB1DBD2643D190B // 18 +data8 0x3FB2CC7284FE5F1C // 19 +data8 0x3FB3BDF5A7D1EE64 // 20 +data8 0x3FB4B05D7AA012E0 // 21 +data8 0x3FB580DB7CEB5702 // 22 +data8 0x3FB674F089365A7A // 23 +data8 0x3FB769EF2C6B568D // 24 +data8 0x3FB85FD927506A48 // 25 +data8 0x3FB9335E5D594989 // 26 +data8 0x3FBA2B0220C8E5F5 // 27 +data8 0x3FBB0004AC1A86AC // 28 +data8 0x3FBBF968769FCA11 // 29 +data8 0x3FBCCFEDBFEE13A8 // 30 +data8 0x3FBDA727638446A2 // 31 +data8 0x3FBEA3257FE10F7A // 32 +data8 0x3FBF7BE9FEDBFDE6 // 33 +data8 0x3FC02AB352FF25F4 // 34 +data8 0x3FC097CE579D204D // 35 +data8 0x3FC1178E8227E47C // 36 +data8 0x3FC185747DBECF34 // 37 +data8 0x3FC1F3B925F25D41 // 38 +data8 0x3FC2625D1E6DDF57 // 39 +data8 0x3FC2D1610C86813A // 40 +data8 0x3FC340C59741142E // 41 +data8 0x3FC3B08B6757F2A9 // 42 +data8 0x3FC40DFB08378003 // 43 +data8 0x3FC47E74E8CA5F7C // 44 +data8 0x3FC4EF51F6466DE4 // 45 +data8 0x3FC56092E02BA516 // 46 +data8 0x3FC5D23857CD74D5 // 47 +data8 0x3FC6313A37335D76 // 48 +data8 0x3FC6A399DABBD383 // 49 +data8 0x3FC70337DD3CE41B // 50 +data8 0x3FC77654128F6127 // 51 +data8 0x3FC7E9D82A0B022D // 52 +data8 0x3FC84A6B759F512F // 53 +data8 0x3FC8AB47D5F5A310 // 54 +data8 0x3FC91FE49096581B // 55 +data8 0x3FC981634011AA75 // 56 +data8 0x3FC9F6C407089664 // 57 +data8 0x3FCA58E729348F43 // 58 +data8 0x3FCABB55C31693AD // 59 +data8 0x3FCB1E104919EFD0 // 60 +data8 0x3FCB94EE93E367CB // 61 +data8 0x3FCBF851C067555F // 62 +data8 0x3FCC5C0254BF23A6 // 63 
+data8 0x3FCCC000C9DB3C52 // 64 +data8 0x3FCD244D99C85674 // 65 +data8 0x3FCD88E93FB2F450 // 66 +data8 0x3FCDEDD437EAEF01 // 67 +data8 0x3FCE530EFFE71012 // 68 +data8 0x3FCEB89A1648B971 // 69 +data8 0x3FCF1E75FADF9BDE // 70 +data8 0x3FCF84A32EAD7C35 // 71 +data8 0x3FCFEB2233EA07CD // 72 +data8 0x3FD028F9C7035C1C // 73 +data8 0x3FD05C8BE0D9635A // 74 +data8 0x3FD085EB8F8AE797 // 75 +data8 0x3FD0B9C8E32D1911 // 76 +data8 0x3FD0EDD060B78081 // 77 +data8 0x3FD122024CF0063F // 78 +data8 0x3FD14BE2927AECD4 // 79 +data8 0x3FD180618EF18ADF // 80 +data8 0x3FD1B50BBE2FC63B // 81 +data8 0x3FD1DF4CC7CF242D // 82 +data8 0x3FD214456D0EB8D4 // 83 +data8 0x3FD23EC5991EBA49 // 84 +data8 0x3FD2740D9F870AFB // 85 +data8 0x3FD29ECDABCDFA04 // 86 +data8 0x3FD2D46602ADCCEE // 87 +data8 0x3FD2FF66B04EA9D4 // 88 +data8 0x3FD335504B355A37 // 89 +data8 0x3FD360925EC44F5D // 90 +data8 0x3FD38BF1C3337E75 // 91 +data8 0x3FD3C25277333184 // 92 +data8 0x3FD3EDF463C1683E // 93 +data8 0x3FD419B423D5E8C7 // 94 +data8 0x3FD44591E0539F49 // 95 +data8 0x3FD47C9175B6F0AD // 96 +data8 0x3FD4A8B341552B09 // 97 +data8 0x3FD4D4F3908901A0 // 98 +data8 0x3FD501528DA1F968 // 99 +data8 0x3FD52DD06347D4F6 // 100 +data8 0x3FD55A6D3C7B8A8A // 101 +data8 0x3FD5925D2B112A59 // 102 +data8 0x3FD5BF406B543DB2 // 103 +data8 0x3FD5EC433D5C35AE // 104 +data8 0x3FD61965CDB02C1F // 105 +data8 0x3FD646A84935B2A2 // 106 +data8 0x3FD6740ADD31DE94 // 107 +data8 0x3FD6A18DB74A58C5 // 108 +data8 0x3FD6CF31058670EC // 109 +data8 0x3FD6F180E852F0BA // 110 +data8 0x3FD71F5D71B894F0 // 111 +data8 0x3FD74D5AEFD66D5C // 112 +data8 0x3FD77B79922BD37E // 113 +data8 0x3FD7A9B9889F19E2 // 114 +data8 0x3FD7D81B037EB6A6 // 115 +data8 0x3FD8069E33827231 // 116 +data8 0x3FD82996D3EF8BCB // 117 +data8 0x3FD85855776DCBFB // 118 +data8 0x3FD8873658327CCF // 119 +data8 0x3FD8AA75973AB8CF // 120 +data8 0x3FD8D992DC8824E5 // 121 +data8 0x3FD908D2EA7D9512 // 122 +data8 0x3FD92C59E79C0E56 // 123 +data8 0x3FD95BD750EE3ED3 // 124 +data8 
0x3FD98B7811A3EE5B // 125 +data8 0x3FD9AF47F33D406C // 126 +data8 0x3FD9DF270C1914A8 // 127 +data8 0x3FDA0325ED14FDA4 // 128 +data8 0x3FDA33440224FA79 // 129 +data8 0x3FDA57725E80C383 // 130 +data8 0x3FDA87D0165DD199 // 131 +data8 0x3FDAAC2E6C03F896 // 132 +data8 0x3FDADCCC6FDF6A81 // 133 +data8 0x3FDB015B3EB1E790 // 134 +data8 0x3FDB323A3A635948 // 135 +data8 0x3FDB56FA04462909 // 136 +data8 0x3FDB881AA659BC93 // 137 +data8 0x3FDBAD0BEF3DB165 // 138 +data8 0x3FDBD21297781C2F // 139 +data8 0x3FDC039236F08819 // 140 +data8 0x3FDC28CB1E4D32FD // 141 +data8 0x3FDC4E19B84723C2 // 142 +data8 0x3FDC7FF9C74554C9 // 143 +data8 0x3FDCA57B64E9DB05 // 144 +data8 0x3FDCCB130A5CEBB0 // 145 +data8 0x3FDCF0C0D18F326F // 146 +data8 0x3FDD232075B5A201 // 147 +data8 0x3FDD490246DEFA6B // 148 +data8 0x3FDD6EFA918D25CD // 149 +data8 0x3FDD9509707AE52F // 150 +data8 0x3FDDBB2EFE92C554 // 151 +data8 0x3FDDEE2F3445E4AF // 152 +data8 0x3FDE148A1A2726CE // 153 +data8 0x3FDE3AFC0A49FF40 // 154 +data8 0x3FDE6185206D516E // 155 +data8 0x3FDE882578823D52 // 156 +data8 0x3FDEAEDD2EAC990C // 157 +data8 0x3FDED5AC5F436BE3 // 158 +data8 0x3FDEFC9326D16AB9 // 159 +data8 0x3FDF2391A2157600 // 160 +data8 0x3FDF4AA7EE03192D // 161 +data8 0x3FDF71D627C30BB0 // 162 +data8 0x3FDF991C6CB3B379 // 163 +data8 0x3FDFC07ADA69A910 // 164 +data8 0x3FDFE7F18EB03D3E // 165 +data8 0x3FE007C053C5002E // 166 +data8 0x3FE01B942198A5A1 // 167 +data8 0x3FE02F74400C64EB // 168 +data8 0x3FE04360BE7603AD // 169 +data8 0x3FE05759AC47FE34 // 170 +data8 0x3FE06B5F1911CF52 // 171 +data8 0x3FE078BF0533C568 // 172 +data8 0x3FE08CD9687E7B0E // 173 +data8 0x3FE0A10074CF9019 // 174 +data8 0x3FE0B5343A234477 // 175 +data8 0x3FE0C974C89431CE // 176 +data8 0x3FE0DDC2305B9886 // 177 +data8 0x3FE0EB524BAFC918 // 178 +data8 0x3FE0FFB54213A476 // 179 +data8 0x3FE114253DA97D9F // 180 +data8 0x3FE128A24F1D9AFF // 181 +data8 0x3FE1365252BF0865 // 182 +data8 0x3FE14AE558B4A92D // 183 +data8 0x3FE15F85A19C765B // 184 +data8 0x3FE16D4D38C119FA 
// 185 +data8 0x3FE18203C20DD133 // 186 +data8 0x3FE196C7BC4B1F3B // 187 +data8 0x3FE1A4A738B7A33C // 188 +data8 0x3FE1B981C0C9653D // 189 +data8 0x3FE1CE69E8BB106B // 190 +data8 0x3FE1DC619DE06944 // 191 +data8 0x3FE1F160A2AD0DA4 // 192 +data8 0x3FE2066D7740737E // 193 +data8 0x3FE2147DBA47A394 // 194 +data8 0x3FE229A1BC5EBAC3 // 195 +data8 0x3FE237C1841A502E // 196 +data8 0x3FE24CFCE6F80D9A // 197 +data8 0x3FE25B2C55CD5762 // 198 +data8 0x3FE2707F4D5F7C41 // 199 +data8 0x3FE285E0842CA384 // 200 +data8 0x3FE294294708B773 // 201 +data8 0x3FE2A9A2670AFF0C // 202 +data8 0x3FE2B7FB2C8D1CC1 // 203 +data8 0x3FE2C65A6395F5F5 // 204 +data8 0x3FE2DBF557B0DF43 // 205 +data8 0x3FE2EA64C3F97655 // 206 +data8 0x3FE3001823684D73 // 207 +data8 0x3FE30E97E9A8B5CD // 208 +data8 0x3FE32463EBDD34EA // 209 +data8 0x3FE332F4314AD796 // 210 +data8 0x3FE348D90E7464D0 // 211 +data8 0x3FE35779F8C43D6E // 212 +data8 0x3FE36621961A6A99 // 213 +data8 0x3FE37C299F3C366A // 214 +data8 0x3FE38AE2171976E7 // 215 +data8 0x3FE399A157A603E7 // 216 +data8 0x3FE3AFCCFE77B9D1 // 217 +data8 0x3FE3BE9D503533B5 // 218 +data8 0x3FE3CD7480B4A8A3 // 219 +data8 0x3FE3E3C43918F76C // 220 +data8 0x3FE3F2ACB27ED6C7 // 221 +data8 0x3FE4019C2125CA93 // 222 +data8 0x3FE4181061389722 // 223 +data8 0x3FE42711518DF545 // 224 +data8 0x3FE436194E12B6BF // 225 +data8 0x3FE445285D68EA69 // 226 +data8 0x3FE45BCC464C893A // 227 +data8 0x3FE46AED21F117FC // 228 +data8 0x3FE47A1527E8A2D3 // 229 +data8 0x3FE489445EFFFCCC // 230 +data8 0x3FE4A018BCB69835 // 231 +data8 0x3FE4AF5A0C9D65D7 // 232 +data8 0x3FE4BEA2A5BDBE87 // 233 +data8 0x3FE4CDF28F10AC46 // 234 +data8 0x3FE4DD49CF994058 // 235 +data8 0x3FE4ECA86E64A684 // 236 +data8 0x3FE503C43CD8EB68 // 237 +data8 0x3FE513356667FC57 // 238 +data8 0x3FE522AE0738A3D8 // 239 +data8 0x3FE5322E26867857 // 240 +data8 0x3FE541B5CB979809 // 241 +data8 0x3FE55144FDBCBD62 // 242 +data8 0x3FE560DBC45153C7 // 243 +data8 0x3FE5707A26BB8C66 // 244 +data8 0x3FE587F60ED5B900 // 245 +data8 
0x3FE597A7977C8F31 // 246 +data8 0x3FE5A760D634BB8B // 247 +data8 0x3FE5B721D295F10F // 248 +data8 0x3FE5C6EA94431EF9 // 249 +data8 0x3FE5D6BB22EA86F6 // 250 +data8 0x3FE5E6938645D390 // 251 +data8 0x3FE5F673C61A2ED2 // 252 +data8 0x3FE6065BEA385926 // 253 +data8 0x3FE6164BFA7CC06B // 254 +data8 0x3FE62643FECF9743 // 255 +LOCAL_OBJECT_END(log_data) + + +// Code +//============================================================== .section .text -.proc log1pf# -.global log1pf# -.align 64 -log1pf: -#ifdef _LIBC -.global __log1pf -__log1pf: -#endif - -{ .mfi -alloc r32 = ar.pfs,0,22,4,0 -(p0) fsub.s1 FR_Neg_One = f0,f1 -(p0) cmp.eq.unc p7, p0 = r0, r0 -} - +GLOBAL_IEEE754_ENTRY(log1pf) { .mfi -(p0) cmp.ne.unc p14, p0 = r0, r0 -(p0) fnorm.s1 FR_X_Prime = FR_Input_X -(p0) cmp.eq.unc p15, p0 = r0, r0 ;; + getf.exp GR_signexp_x = f8 // if x is unorm then must recompute + fadd.s1 FR_Xp1 = f8, f1 // Form 1+x + mov GR_05 = 0xfffe } - -{ .mfi - nop.m 999 -(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3 - nop.i 999 +{ .mlx + addl GR_ad_T = @ltoff(log_data),gp + movl GR_A3 = 0x3fd5555555555555 // double precision memory + // representation of A3 } ;; { .mfi - nop.m 999 -(p0) fclass.nm.unc p10, p0 = FR_Input_X, 0x1FF - nop.i 999 + ld8 GR_ad_T = [GR_ad_T] + fclass.m p8,p0 = f8,0xb // Is x unorm? + mov GR_exp_mask = 0x1ffff } -;; - { .mfi - nop.m 999 -(p0) fcmp.eq.unc.s1 p9, p0 = FR_Input_X, f0 - nop.i 999 + mov GR_025 = 0xfffd // Exponent of 0.25 + fnorm.s1 FR_NormX = f8 // Normalize x + mov GR_exp_bias = 0xffff } +;; { .mfi - nop.m 999 -(p0) fadd FR_Em1 = f0,f0 - nop.i 999 ;; + setf.exp FR_A2 = GR_05 // create A2 = 0.5 + fclass.m p9,p0 = f8,0x1E1 // is x NaN, NaT or +Inf? 
+ nop.i 0 } - -{ .mfi - nop.m 999 -(p0) fadd FR_E = f0,f1 - nop.i 999 ;; +{ .mib + setf.d FR_A3 = GR_A3 // create A3 + nop.i 0 +(p8) br.cond.spnt log1p_unorm // Branch if x=unorm } +;; +log1p_common: { .mfi - nop.m 999 -(p0) fcmp.eq.unc.s1 p8, p0 = FR_Input_X, FR_Neg_One - nop.i 999 + setf.exp FR_A4 = GR_025 // create A4 = 0.25 + frcpa.s1 FR_RcpX,p0 = f1,FR_Xp1 + nop.i 0 } - -{ .mfi - nop.m 999 -(p0) fcmp.lt.unc.s1 p13, p0 = FR_Input_X, FR_Neg_One - nop.i 999 +{ .mfb + nop.m 0 +(p9) fma.s.s0 f8 = f8,f1,f0 // set V-flag +(p9) br.ret.spnt b0 // exit for NaN, NaT and +Inf } - - -L(LOG_BEGIN): +;; { .mfi - nop.m 999 -(p0) fadd.s1 FR_Z = FR_X_Prime, FR_E - nop.i 999 + getf.exp GR_Exp = FR_Xp1 // signexp of x+1 + fclass.m p10,p0 = FR_Xp1,0x3A // is 1+x < 0? + and GR_exp_x = GR_exp_mask, GR_signexp_x // biased exponent of x } - { .mlx - nop.m 999 -(p0) movl GR_Table_Scale = 0x0000000000000018 ;; -} - -{ .mmi - nop.m 999 -// -// Create E = 1 and Em1 = 0 -// Check for X == 0, meaning log(1+0) -// Check for X < -1, meaning log(negative) -// Check for X == -1, meaning log(0) -// Normalize x -// Identify NatVals, NaNs, Infs. -// Identify EM unsupporteds. -// Identify Negative values - us S1 so as -// not to raise denormal operand exception -// Set p15 to true for log1pf -// Set p14 to false for log1pf -// Set p7 true for log and log1pf -// -(p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h1#),gp - nop.i 999 + nop.m 0 + movl GR_Ln2 = 0x3FE62E42FEFA39EF // double precision memory + // representation of log(2) } +;; { .mfi - nop.m 999 -(p0) fmax.s1 FR_AA = FR_X_Prime, FR_E - nop.i 999 ;; + getf.sig GR_Sig = FR_Xp1 // get significand to calculate index + // for T if |x| >= 2^-8 + fcmp.eq.s1 p12,p0 = f8,f0 // is x equal to 0? 
+ sub GR_exp_x = GR_exp_x, GR_exp_bias // true exponent of x } +;; { .mfi - ld8 GR_Table_Base = [GR_Table_Base] -(p0) fmin.s1 FR_BB = FR_X_Prime, FR_E - nop.i 999 + sub GR_N = GR_Exp,GR_exp_bias // true exponent of x+1 + fcmp.eq.s1 p11,p0 = FR_Xp1,f0 // is x = -1? + cmp.gt p6,p7 = -8, GR_exp_x // Is |x| < 2^-8 } - { .mfb - nop.m 999 -(p0) fadd.s1 FR_W = FR_X_Prime, FR_Em1 -// -// Begin load of constants base -// FR_Z = Z = |x| + E -// FR_W = W = |x| + Em1 -// AA = fmax(|x|,E) -// BB = fmin(|x|,E) -// -(p6) br.cond.spnt L(LOG_64_special) ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p10) br.cond.spnt L(LOG_64_unsupported) ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p13) br.cond.spnt L(LOG_64_negative) ;; -} - -{ .mib -(p0) getf.sig GR_signif = FR_Z - nop.i 999 -(p9) br.cond.spnt L(LOG_64_one) ;; -} - -{ .mib - nop.m 999 - nop.i 999 -(p8) br.cond.spnt L(LOG_64_zero) ;; -} - -{ .mfi -(p0) getf.exp GR_N = FR_Z -// -// Raise possible denormal operand exception -// Create Bias -// -// This function computes ln( x + e ) -// Input FR 1: FR_X = FR_Input_X -// Input FR 2: FR_E = FR_E -// Input FR 3: FR_Em1 = FR_Em1 -// Input GR 1: GR_Expo_Range = GR_Expo_Range = 1 -// Output FR 4: FR_Y_hi -// Output FR 5: FR_Y_lo -// Output FR 6: FR_Scale -// Output PR 7: PR_Safe -// -(p0) fsub.s1 FR_S_lo = FR_AA, FR_Z -// -// signif = getf.sig(Z) -// abs_W = fabs(w) -// -(p0) extr.u GR_Table_ptr = GR_signif, 59, 4 ;; -} - -{ .mfi - nop.m 999 -(p0) fmerge.se FR_S_hi = f1,FR_Z -(p0) extr.u GR_X_0 = GR_signif, 49, 15 -} - -{ .mmi - nop.m 999 -(p0) addl GR_Table_Base1 = @ltoff(Constants_Z_G_H_h2#),gp - nop.i 999 + nop.m 0 + nop.f 0 +(p10) br.cond.spnt log1p_lt_minus_1 // jump if x < -1 } ;; -{ .mlx - ld8 GR_Table_Base1 = [GR_Table_Base1] -(p0) movl GR_Bias = 0x000000000000FFFF ;; -} - -{ .mfi - nop.m 999 -(p0) fabs FR_abs_W = FR_W -(p0) pmpyshr2.u GR_Table_ptr = GR_Table_ptr,GR_Table_Scale,0 -} - -{ .mfi - nop.m 999 -// -// Branch out for special input values -// -(p0) fcmp.lt.unc.s0 p8, p0 = 
FR_Input_X, f0 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// X_0 = extr.u(signif,49,15) -// Index1 = extr.u(signif,59,4) -// -(p0) fadd.s1 FR_S_lo = FR_S_lo, FR_BB - nop.i 999 ;; -} - -{ .mii - nop.m 999 - nop.i 999 ;; -// -// Offset_to_Z1 = 24 * Index1 -// For performance, don't use result -// for 3 or 4 cycles. -// -(p0) add GR_Table_ptr = GR_Table_ptr, GR_Table_Base ;; -} -// -// Add Base to Offset for Z1 -// Create Bias - -{ .mmi -(p0) ld4 GR_Z_1 = [GR_Table_ptr],4 ;; -(p0) ldfs FR_G = [GR_Table_ptr],4 - nop.i 999 ;; -} - -{ .mmi -(p0) ldfs FR_H = [GR_Table_ptr],8 ;; -(p0) ldfd FR_h = [GR_Table_ptr],0 -(p0) pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 -} -// -// Load Z_1 -// Get Base of Table2 -// - +// p6 is true if |x| < 1/256 +// p7 is true if |x| >= 1/256 +.pred.rel "mutex",p6,p7 { .mfi -(p0) getf.exp GR_M = FR_abs_W - nop.f 999 - nop.i 999 ;; -} - -{ .mii - nop.m 999 - nop.i 999 ;; -// -// M = getf.exp(abs_W) -// S_lo = AA - Z -// X_1 = pmpyshr2(X_0,Z_1,15) -// -(p0) sub GR_M = GR_M, GR_Bias ;; + nop.m 0 +(p6) fms.s1 FR_r = f8,f1,f0 // range reduction for |x|<1/256 +(p6) cmp.gt.unc p10,p0 = -40, GR_exp_x // Is |x| < 2^-40 } -// -// M = M - Bias -// Load G1 -// N = getf.exp(Z) -// - -{ .mii -(p0) cmp.gt.unc p11, p0 = -80, GR_M -(p0) cmp.gt.unc p12, p0 = -7, GR_M ;; -(p0) extr.u GR_Index2 = GR_X_1, 6, 4 ;; -} - -{ .mib - nop.m 999 -// -// if -80 > M, set p11 -// Index2 = extr.u(X_1,6,4) -// if -7 > M, set p12 -// Load H1 -// -(p0) pmpyshr2.u GR_Index2 = GR_Index2,GR_Table_Scale,0 -(p11) br.cond.spnt L(log1pf_small) ;; +{ .mfb +(p7) setf.sig FR_N = GR_N // copy unbiased exponent of x to the + // significand field of FR_N +(p7) fms.s1 FR_r = FR_RcpX,FR_Xp1,f1 // range reduction for |x|>=1/256 +(p12) br.ret.spnt b0 // exit for x=0, return x } +;; { .mib - nop.m 999 - nop.i 999 -(p12) br.cond.spnt L(log1pf_near) ;; -} - -{ .mii -(p0) sub GR_N = GR_N, GR_Bias -// -// poly_lo = r * poly_lo -// -(p0) add GR_Perturb = 0x1, r0 ;; -(p0) sub GR_ScaleN = GR_Bias, GR_N -} - -{ 
.mii -(p0) setf.sig FR_float_N = GR_N - nop.i 999 ;; -// -// Prepare Index2 - pmpyshr2.u(X_1,Z_2,15) -// Load h1 -// S_lo = S_lo + BB -// Branch for -80 > M -// -(p0) add GR_Index2 = GR_Index2, GR_Table_Base1 -} - -{ .mmi -(p0) setf.exp FR_two_negN = GR_ScaleN - nop.m 999 -(p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h3#),gp -};; - -// -// Index2 points to Z2 -// Branch for -7 > M -// - -{ .mmb -(p0) ld4 GR_Z_2 = [GR_Index2],4 - ld8 GR_Table_Base = [GR_Table_Base] - nop.b 999 ;; -} -(p0) nop.i 999 -// -// Load Z_2 -// N = N - Bias -// Tablebase points to Table3 -// - -{ .mmi -(p0) ldfs FR_G_tmp = [GR_Index2],4 ;; -// -// Load G_2 -// pmpyshr2 X_2= (X_1,Z_2,15) -// float_N = setf.sig(N) -// ScaleN = Bias - N -// -(p0) ldfs FR_H_tmp = [GR_Index2],8 - nop.i 999 ;; -} -// -// Load H_2 -// two_negN = setf.exp(scaleN) -// G = G_1 * G_2 -// - -{ .mfi -(p0) ldfd FR_h_tmp = [GR_Index2],0 - nop.f 999 -(p0) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 ;; -} - -{ .mii - nop.m 999 -(p0) extr.u GR_Index3 = GR_X_2, 1, 5 ;; -// -// Load h_2 -// H = H_1 + H_2 -// h = h_1 + h_2 -// Index3 = extr.u(X_2,1,5) -// -(p0) shladd GR_Index3 = GR_Index3,4,GR_Table_Base -} - -{ .mmi - nop.m 999 - nop.m 999 -// -// float_N = fcvt.xf(float_N) -// load G3 -// -(p0) addl GR_Table_Base = @ltoff(Constants_Q#),gp ;; -} - -{ .mfi -ld8 GR_Table_Base = [GR_Table_Base] -nop.f 999 -nop.i 999 -} ;; - -{ .mfi -(p0) ldfe FR_log2_hi = [GR_Table_Base],16 -(p0) fmpy.s1 FR_S_lo = FR_S_lo, FR_two_negN - nop.i 999 ;; -} - -{ .mmf - nop.m 999 -// -// G = G3 * G -// Load h3 -// Load log2_hi -// H = H + H3 -// -(p0) ldfe FR_log2_lo = [GR_Table_Base],16 -(p0) fmpy.s1 FR_G = FR_G, FR_G_tmp ;; -} - -{ .mmf -(p0) ldfs FR_G_tmp = [GR_Index3],4 -// -// h = h + h3 -// r = G * S_hi + 1 -// Load log2_lo -// -(p0) ldfe FR_Q4 = [GR_Table_Base],16 -(p0) fadd.s1 FR_h = FR_h, FR_h_tmp ;; -} - -{ .mfi -(p0) ldfe FR_Q3 = [GR_Table_Base],16 -(p0) fadd.s1 FR_H = FR_H, FR_H_tmp - nop.i 999 ;; -} - -{ .mmf -(p0) ldfs FR_H_tmp = 
[GR_Index3],4 -(p0) ldfe FR_Q2 = [GR_Table_Base],16 -// -// Comput Index for Table3 -// S_lo = S_lo * two_negN -// -(p0) fcvt.xf FR_float_N = FR_float_N ;; + setf.d FR_Ln2 = GR_Ln2 // create log(2) +(p7) extr.u GR_Ind = GR_Sig,55,8 // get bits from 55 to 62 as index +(p11) br.cond.spnt log1p_eq_minus_1 // jump if x = -1 } -// -// If S_lo == 0, set p8 false -// Load H3 -// Load ptr to table of polynomial coeff. -// +;; { .mmf -(p0) ldfd FR_h_tmp = [GR_Index3],0 -(p0) ldfe FR_Q1 = [GR_Table_Base],0 -(p0) fcmp.eq.unc.s1 p0, p8 = FR_S_lo, f0 ;; -} - -{ .mfi - nop.m 999 -(p0) fmpy.s1 FR_G = FR_G, FR_G_tmp - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fadd.s1 FR_H = FR_H, FR_H_tmp - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fms.s1 FR_r = FR_G, FR_S_hi, f1 - nop.i 999 -} - -{ .mfi - nop.m 999 -(p0) fadd.s1 FR_h = FR_h, FR_h_tmp - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// Load Q4 -// Load Q3 -// Load Q2 -// Load Q1 -// -(p8) fma.s1 FR_r = FR_G, FR_S_lo, FR_r - nop.i 999 -} - -{ .mfi - nop.m 999 -// -// poly_lo = r * Q4 + Q3 -// rsq = r* r -// -(p0) fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// If (S_lo!=0) r = s_lo * G + r -// -(p0) fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 - nop.i 999 -} -// -// Create a 0x00000....01 -// poly_lo = poly_lo * rsq + h -// - -{ .mfi -(p0) setf.sig FR_dummy = GR_Perturb -(p0) fmpy.s1 FR_rsq = FR_r, FR_r - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// h = N * log2_lo + h -// Y_hi = n * log2_hi + H -// -(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 - nop.i 999 -} - -{ .mfi - nop.m 999 -(p0) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -// -// poly_lo = r * poly_o + Q2 -// poly_hi = Q1 * rsq + r -// -(p0) fmpy.s1 FR_poly_lo = FR_poly_lo, FR_r - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_rsq, FR_h - nop.i 999 ;; -} - -{ .mfb - nop.m 999 
-(p0) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo -// -// Create the FR for a binary "or" -// Y_lo = poly_hi + poly_lo -// -// (p0) for FR_dummy = FR_Y_lo,FR_dummy ;; -// -// Turn the lsb of Y_lo ON -// -// (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;; -// -// Merge the new lsb into Y_lo, for alone doesn't -// -(p0) br.cond.sptk L(LOG_main) ;; -} - - -L(log1pf_near): - -{ .mmi - nop.m 999 - nop.m 999 -// /*******************************************************/ -// /*********** Branch log1pf_near ************************/ -// /*******************************************************/ -(p0) addl GR_Table_Base = @ltoff(Constants_P#),gp ;; -} -// -// Load base address of poly. coeff. -// -{.mmi - nop.m 999 - ld8 GR_Table_Base = [GR_Table_Base] - nop.i 999 -};; - -{ .mmb -(p0) add GR_Table_ptr = 0x40,GR_Table_Base -// -// Address tables with separate pointers -// -(p0) ldfe FR_P8 = [GR_Table_Base],16 - nop.b 999 ;; +(p7) shladd GR_ad_T = GR_Ind,3,GR_ad_T // address of T + nop.m 0 +(p10) fnma.s.s0 f8 = f8,f8,f8 // If |x| very small, result=x-x*x } +;; { .mmb -(p0) ldfe FR_P4 = [GR_Table_ptr],16 -// -// Load P4 -// Load P8 -// -(p0) ldfe FR_P7 = [GR_Table_Base],16 - nop.b 999 ;; -} - -{ .mmf -(p0) ldfe FR_P3 = [GR_Table_ptr],16 -// -// Load P3 -// Load P7 -// -(p0) ldfe FR_P6 = [GR_Table_Base],16 -(p0) fmpy.s1 FR_wsq = FR_W, FR_W ;; -} - -{ .mfi -(p0) ldfe FR_P2 = [GR_Table_ptr],16 - nop.f 999 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 FR_Y_hi = FR_W, FR_P4, FR_P3 - nop.i 999 -} -// -// Load P2 -// Load P6 -// Wsq = w * w -// Y_hi = p4 * w + p3 -// - -{ .mfi -(p0) ldfe FR_P5 = [GR_Table_Base],16 -(p0) fma.s1 FR_Y_lo = FR_W, FR_P8, FR_P7 - nop.i 999 ;; -} - -{ .mfi -(p0) ldfe FR_P1 = [GR_Table_ptr],16 -// -// Load P1 -// Load P5 -// Y_lo = p8 * w + P7 -// -(p0) fmpy.s1 FR_w4 = FR_wsq, FR_wsq - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P2 - nop.i 999 -} - -{ .mfi - nop.m 999 -(p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P6 -(p0) add 
GR_Perturb = 0x1, r0 ;; -} - -{ .mfi - nop.m 999 -// -// w4 = w2 * w2 -// Y_hi = y_hi * w + p2 -// Y_lo = y_lo * w + p6 -// Create perturbation bit -// -(p0) fmpy.s1 FR_w6 = FR_w4, FR_wsq - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P1 - nop.i 999 -} -// -// Y_hi = y_hi * w + p1 -// w6 = w4 * w2 -// - -{ .mfi -(p0) setf.sig FR_Q4 = GR_Perturb -(p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P5 - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p0) fma.s1 FR_Y_hi = FR_wsq,FR_Y_hi, FR_W - nop.i 999 -} - -{ .mfb - nop.m 999 -// -// Y_hi = y_hi * wsq + w -// Y_lo = y_lo * w + p5 -// -(p0) fmpy.s1 FR_Y_lo = FR_w6, FR_Y_lo -// -// Y_lo = y_lo * w6 -// -// (p0) for FR_dummy = FR_Y_lo,FR_dummy ;; -// -// Set lsb on: Taken out to improve performance -// -// (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;; -// -// Make sure it's on in Y_lo also. Taken out to improve -// performance -// -(p0) br.cond.sptk L(LOG_main) ;; -} - - -L(log1pf_small): - -{ .mmi - nop.m 999 - nop.m 999 -// /*******************************************************/ -// /*********** Branch log1pf_small ***********************/ -// /*******************************************************/ -(p0) addl GR_Table_Base = @ltoff(Constants_Threshold#),gp +(p7) ldfd FR_T = [GR_ad_T] + nop.m 0 +(p10) br.ret.spnt b0 // Exit if |x| < 2^-40 } +;; { .mfi - nop.m 999 -(p0) mov FR_Em1 = FR_W -(p0) cmp.eq.unc p7, p0 = r0, r0 ;; -} - -{ .mlx - ld8 GR_Table_Base = [GR_Table_Base] -(p0) movl GR_Expo_Range = 0x0000000000000002 ;; -} -// -// Set Safe to true -// Set Expo_Range = 0 for single -// Set Expo_Range = 2 for double -// Set Expo_Range = 4 for double-extended -// - -{ .mmi -(p0) shladd GR_Table_Base = GR_Expo_Range,4,GR_Table_Base ;; -(p0) ldfe FR_Threshold = [GR_Table_Base],16 - nop.i 999 + nop.m 0 + fma.s1 FR_r2 = FR_r,FR_r,f0 // r^2 + nop.i 0 } - -{ .mlx - nop.m 999 -(p0) movl GR_Bias = 0x000000000000FF9B ;; -} - { .mfi -(p0) ldfe FR_Tiny = [GR_Table_Base],0 - nop.f 999 - nop.i 999 ;; + nop.m 0 + 
fnma.s1 FR_A2 = FR_A2,FR_r,f1 // 1.0 - A2*r + nop.i 0 } +;; { .mfi - nop.m 999 -(p0) fcmp.gt.unc.s1 p13, p12 = FR_abs_W, FR_Threshold - nop.i 999 ;; + nop.m 0 + fnma.s1 FR_A3 = FR_A4,FR_r,FR_A3 // A3 - A4*r + nop.i 0 } +;; { .mfi - nop.m 999 -(p13) fnmpy.s1 FR_Y_lo = FR_W, FR_W - nop.i 999 + nop.m 0 +(p7) fcvt.xf FR_N = FR_N + nop.i 0 } +;; { .mfi - nop.m 999 -(p13) fadd FR_SCALE = f0, f1 - nop.i 999 ;; + nop.m 0 + // (A3*r+A2)*r^2+r + fma.s1 FR_A2 = FR_A3,FR_r2,FR_A2 // (A4*r+A3)*r^2+(A2*r+1) + nop.i 0 } +;; { .mfi - nop.m 999 -(p12) fsub.s1 FR_Y_lo = f0, FR_Tiny -(p12) cmp.ne.unc p7, p0 = r0, r0 + nop.m 0 + // N*Ln2hi+T +(p7) fma.s1 FR_NxLn2pT = FR_N,FR_Ln2,FR_T + nop.i 0 } +;; +.pred.rel "mutex",p6,p7 { .mfi -(p12) setf.exp FR_SCALE = GR_Bias - nop.f 999 - nop.i 999 ;; + nop.m 0 +(p6) fma.s.s0 f8 = FR_A2,FR_r,f0 // result if 2^(-40) <= |x| < 1/256 + nop.i 0 } - -// -// Set p7 to SAFE = FALSE -// Set Scale = 2^-100 -// { .mfb - nop.m 999 -(p0) fma.s.s0 FR_Input_X = FR_Y_lo,FR_SCALE,FR_Y_hi -(p0) br.ret.sptk b0 + nop.m 0 +(p7) fma.s.s0 f8 = FR_A2,FR_r,FR_NxLn2pT // result if |x| >= 1/256 + br.ret.sptk b0 // Exit if |x| >= 2^(-40) } ;; -L(LOG_64_one): - +.align 32 +log1p_unorm: +// Here if x=unorm { .mfb - nop.m 999 -(p0) fmpy.s.s0 FR_Input_X = FR_Input_X, f0 -(p0) br.ret.sptk b0 + getf.exp GR_signexp_x = FR_NormX // recompute biased exponent + nop.f 0 + br.cond.sptk log1p_common } ;; -// -// Raise divide by zero for +/-0 input. -// - -L(LOG_64_zero): +.align 32 +log1p_eq_minus_1: +// Here if x=-1 { .mfi -(p0) mov GR_Parameter_TAG = 142 -// -// If we have log1pf(0), return -Inf. -// -(p0) fsub.s0 FR_Output_X_tmp = f0, f1 - nop.i 999 ;; + nop.m 0 + fmerge.s FR_X = f8,f8 // keep input argument for subsequent + // call of __libm_error_support# + nop.i 0 } -{ .mfb - nop.m 999 -(p0) frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0 -(p0) br.cond.sptk L(LOG_ERROR_Support) ;; -} - -L(LOG_64_special): +;; { .mfi - nop.m 999 -// -// Return -Inf or value from handler. 
-// -(p0) fclass.m.unc p7, p0 = FR_Input_X, 0x1E1 - nop.i 999 ;; + mov GR_TAG = 142 // set libm error in case of log1p(-1). + frcpa.s0 f8,p0 = f8,f0 // log1p(-1) should be equal to -INF. + // We can get it using frcpa because it + // sets result to the IEEE-754 mandated + // quotient of f8/f0. + nop.i 0 } - -{ .mfb - nop.m 999 -// -// Check for Natval, QNan, SNaN, +Inf -// -(p7) fmpy.s.s0 f8 = FR_Input_X, f1 -// -// For SNaN raise invalid and return QNaN. -// For QNaN raise invalid and return QNaN. -// For +Inf return +Inf. -// -(p7) br.ret.sptk b0 +{ .mib + nop.m 0 + nop.i 0 + br.cond.sptk log_libm_err } ;; -// -// For -Inf raise invalid and return QNaN. -// - -{ .mfb -(p0) mov GR_Parameter_TAG = 143 -(p0) fmpy.s.s0 FR_Output_X_tmp = FR_Input_X, f0 -(p0) br.cond.sptk L(LOG_ERROR_Support) ;; +.align 32 +log1p_lt_minus_1: +// Here if x < -1 +{ .mfi + nop.m 0 + fmerge.s FR_X = f8,f8 + nop.i 0 } +;; -// -// Report that log1pf(-Inf) computed -// - -L(LOG_64_unsupported): - -// -// Return generated NaN or other value . -// - -{ .mfb - nop.m 999 -(p0) fmpy.s.s0 FR_Input_X = FR_Input_X, f0 -(p0) br.ret.sptk b0 ;; +{ .mfi + mov GR_TAG = 143 // set libm error in case of x < -1. + frcpa.s0 f8,p0 = f0,f0 // log1p(x) x < -1 should be equal to NaN. + // We can get it using frcpa because it + // sets result to the IEEE-754 mandated + // quotient of f0/f0 i.e. NaN. + nop.i 0 } +;; -L(LOG_64_negative): - -{ .mfi - nop.m 999 -// -// Deal with x < 0 in a special way -// -(p0) frcpa.s0 FR_Output_X_tmp, p8 = f0, f0 -// -// Deal with x < 0 in a special way - raise -// invalid and produce QNaN indefinite. 
-// -(p0) mov GR_Parameter_TAG = 143;; +.align 32 +log_libm_err: +{ .mmi + alloc r32 = ar.pfs,1,4,4,0 + mov GR_Parameter_TAG = GR_TAG + nop.i 0 } +;; -.endp log1pf# -ASM_SIZE_DIRECTIVE(log1pf) +GLOBAL_IEEE754_END(log1pf) -.proc __libm_error_region -__libm_error_region: -L(LOG_ERROR_Support): +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue - -// (1) { .mfi - add GR_Parameter_Y=-32,sp // Parameter 2 value + add GR_Parameter_Y = -32,sp // Parameter 2 value nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS = ar.pfs // Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + add sp = -64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP = gp // Save gp };; - - -// (2) { .mmi - stfs [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack + stfs [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0 = b0 // Save b0 };; - .body -// (3) { .mib - stfs [GR_Parameter_X] =FR_Input_X // STORE Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address - nop.b 0 + stfs [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 } { .mib - stfs [GR_Parameter_Y] = FR_Output_X_tmp // STORE Parameter 3 on stack + stfs [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; - -// (4) { .mmi - ldfs FR_Input_X = [GR_Parameter_RESULT] // Get return result off stack + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp - add sp = 64,sp // Restore stack pointer - mov b0 = GR_SAVE_B0 // Restore 
return address + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address };; { .mib - mov gp = GR_SAVE_GP // Restore gp - mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - br.ret.sptk b0 + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return };; - -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) - - -.proc __libm_LOG_main -__libm_LOG_main: -L(LOG_main): - -// -// kernel_log_64 computes ln(X + E) -// - -{ .mfi - nop.m 999 -(p7) fadd.s.s0 FR_Input_X = FR_Y_lo,FR_Y_hi - nop.i 999 -} - -{ .mmi - nop.m 999 - nop.m 999 -(p14) addl GR_Table_Base = @ltoff(Constants_1_by_LN10#),gp ;; -} - -{ .mmi - nop.m 999 -(p14) ld8 GR_Table_Base = [GR_Table_Base] - nop.i 999 -};; - -{ .mmi -(p14) ldfe FR_1LN10_hi = [GR_Table_Base],16 ;; -(p14) ldfe FR_1LN10_lo = [GR_Table_Base] - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p14) fmpy.s1 FR_Output_X_tmp = FR_Y_lo,FR_1LN10_hi - nop.i 999 ;; -} - -{ .mfi - nop.m 999 -(p14) fma.s1 FR_Output_X_tmp = FR_Y_hi,FR_1LN10_lo,FR_Output_X_tmp - nop.i 999 ;; -} - -{ .mfb - nop.m 999 -(p14) fma.s.s0 FR_Input_X = FR_Y_hi,FR_1LN10_hi,FR_Output_X_tmp -(p0) br.ret.sptk b0 ;; -} -.endp __libm_LOG_main -ASM_SIZE_DIRECTIVE(__libm_LOG_main) - +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# + diff --git a/sysdeps/ia64/fpu/s_log1pl.S b/sysdeps/ia64/fpu/s_log1pl.S index 7cd3f7834c..d392a58edf 100644 --- a/sysdeps/ia64/fpu/s_log1pl.S +++ b/sysdeps/ia64/fpu/s_log1pl.S @@ -1,10 +1,10 @@ .file "log1pl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. 
+// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,55 +35,49 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // -// ********************************************************************* +//********************************************************************* // // History: -// 2/02/00 hand-optimized -// 4/04/00 Unwind support added -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial version +// 04/04/00 Unwind support added +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. 
+// 05/21/01 Removed logl and log10l, putting them in a separate file +// 06/29/01 Improved speed of all paths +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align; +// used data8 for long double table values // -// ********************************************************************* +//********************************************************************* // -// ********************************************************************* +//********************************************************************* // -// Function: Combined logl(x), log1pl(x), and log10l(x) where -// logl(x) = ln(x), for double-extended precision x values -// log1pl(x) = ln(x+1), for double-extended precision x values -// log10l(x) = log (x), for double-extended precision x values -// 10 +// Function: log1pl(x) = ln(x+1), for double-extended precision x values // -// ********************************************************************* +//********************************************************************* // // Resources Used: // // Floating-Point Registers: f8 (Input and Return Value) -// f9,f33-f55,f99 +// f34-f82 // // General Purpose Registers: -// r32-r53 -// r54-r57 (Used to pass arguments to error handling routine) +// r32-r56 +// r53-r56 (Used to pass arguments to error handling routine) // -// Predicate Registers: p6-p15 +// Predicate Registers: p6-p13 // -// ********************************************************************* +//********************************************************************* // // IEEE Special Conditions: // -// Denormal fault raised on denormal inputs +// Denormal fault raised on denormal inputs // Overflow exceptions cannot occur // Underflow exceptions raised when appropriate for log1p -// (Error Handling Routine called for underflow) // Inexact raised when appropriate by algorithm // -// logl(inf) = inf -// logl(-inf) = QNaN -// logl(+/-0) = -inf -// logl(SNaN) = QNaN -// logl(QNaN) = QNaN -// 
logl(EM_special Values) = QNaN // log1pl(inf) = inf // log1pl(-inf) = QNaN // log1pl(+/-0) = +/-0 @@ -91,54 +85,37 @@ // log1pl(SNaN) = QNaN // log1pl(QNaN) = QNaN // log1pl(EM_special Values) = QNaN -// log10l(inf) = inf -// log10l(-inf) = QNaN -// log10l(+/-0) = -inf -// log10l(SNaN) = QNaN -// log10l(QNaN) = QNaN -// log10l(EM_special Values) = QNaN -// -// ********************************************************************* -// -// Computation is based on the following kernel. -// -// ker_log_64( in_FR : X, -// in_FR : E, -// in_FR : Em1, -// in_GR : Expo_Range, -// out_FR : Y_hi, -// out_FR : Y_lo, -// out_FR : Scale, -// out_PR : Safe ) -// +// +//********************************************************************* +// // Overview // // The method consists of three cases. // -// If |X+Em1| < 2^(-80) use case log1pl_small; -// elseif |X+Em1| < 2^(-7) use case log_near1; -// else use case log_regular; +// If |X| < 2^(-80) use case log1p_small; +// else |X| < 2^(-7) use case log_near1; +// else use case log_regular; // -// Case log1pl_small: +// Case log1p_small: // -// logl( 1 + (X+Em1) ) can be approximated by (X+Em1). +// log1pl( X ) = logl( X+1 ) can be approximated by X // // Case log_near1: // -// logl( 1 + (X+Em1) ) can be approximated by a simple polynomial -// in W = X+Em1. This polynomial resembles the truncated Taylor +// log1pl( X ) = log( X+1 ) can be approximated by a simple polynomial +// in W = X. This polynomial resembles the truncated Taylor // series W - W^/2 + W^3/3 - ... // // Case log_regular: // // Here we use a table lookup method. The basic idea is that in -// order to compute logl(Arg) for an argument Arg in [1,2), we -// construct a value G such that G*Arg is close to 1 and that +// order to compute logl(Arg) = log1pl (Arg-1) for an argument Arg in [1,2), +// we construct a value G such that G*Arg is close to 1 and that // logl(1/G) is obtainable easily from a table of values calculated // beforehand. 
Thus // -// logl(Arg) = logl(1/G) + logl(G*Arg) -// = logl(1/G) + logl(1 + (G*Arg - 1)) +// logl(Arg) = logl(1/G) + logl(G*Arg) +// = logl(1/G) + logl(1 + (G*Arg - 1)) // // Because |G*Arg - 1| is small, the second term on the right hand // side can be approximated by a short polynomial. We elaborate @@ -146,9 +123,9 @@ // // Step 0: Initialization // -// We need to calculate logl( E + X ). Obtain N, S_hi, S_lo such that +// We need to calculate logl( X+1 ). Obtain N, S_hi such that // -// E + X = 2^N * ( S_hi + S_lo ) exactly +// X+1 = 2^N * ( S_hi + S_lo ) exactly // // where S_hi in [1,2) and S_lo is a correction to S_hi in the sense // that |S_lo| <= ulp(S_hi). @@ -157,8 +134,8 @@ // // Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate // -// G := G_1 * G_2 * G_3 -// r := (G * S_hi - 1) + G * S_lo +// G := G_1 * G_2 * G_3 +// r := (G * S_hi - 1) + G * S_lo // // These G_j's have the property that the product is exactly // representable and that |r| < 2^(-12) as a result. @@ -171,61 +148,34 @@ // Step 3: Reconstruction // // -// Finally, logl( E + X ) is given by +// Finally, log1pl( X ) = logl( X+1 ) is given by // -// logl( E + X ) = logl( 2^N * (S_hi + S_lo) ) +// logl( X+1 ) = logl( 2^N * (S_hi + S_lo) ) // ~=~ N*logl(2) + logl(1/G) + logl(1 + r) // ~=~ N*logl(2) + logl(1/G) + poly(r). // // **** Algorithm **** // -// Case log1pl_small: -// -// Although logl(1 + (X+Em1)) is basically X+Em1, we would like to -// preserve the inexactness nature as well as consistent behavior -// under different rounding modes. Note that this case can only be -// taken if E is set to be 1.0. In this case, Em1 is zero, and that -// X can be very tiny and thus the final result can possibly underflow. -// Thus, we compare X against a threshold that is dependent on the -// input Expo_Range. If |X| is smaller than this threshold, we set -// SAFE to be FALSE. 
-// -// The result is returned as Y_hi, Y_lo, and in the case of SAFE -// is FALSE, an additional value Scale is also returned. -// -// W := X + Em1 -// Threshold := Threshold_Table( Expo_Range ) -// Tiny := Tiny_Table( Expo_Range ) -// -// If ( |W| > Threshold ) then -// Y_hi := W -// Y_lo := -W*W -// Else -// Y_hi := W -// Y_lo := -Tiny -// Scale := 2^(-100) -// Safe := FALSE -// EndIf -// -// -// One may think that Y_lo should be -W*W/2; however, it does not matter -// as Y_lo will be rounded off completely except for the correct effect in -// directed rounding. Clearly -W*W is simplier to compute. Moreover, -// because of the difference in exponent value, Y_hi + Y_lo or -// Y_hi + Scale*Y_lo is always inexact. +// Case log1p_small: +// +// Although log1pl(X) is basically X, we would like to preserve the inexactness +// nature as well as consistent behavior under different rounding modes. +// We can do this by computing the result as +// +// log1pl(X) = X - X*X +// // // Case log_near1: // // Here we compute a simple polynomial. To exploit parallelism, we split // the polynomial into two portions. // -// W := X + Em1 -// Wsq := W * W -// W4 := Wsq*Wsq -// W6 := W4*Wsq -// Y_hi := W + Wsq*(P_1 + W*(P_2 + W*(P_3 + W*P_4)) -// Y_lo := W6*(P_5 + W*(P_6 + W*(P_7 + W*P_8))) -// set lsb(Y_lo) to be 1 +// W := X +// Wsq := W * W +// W4 := Wsq*Wsq +// W6 := W4*Wsq +// Y_hi := W + Wsq*(P_1 + W*(P_2 + W*(P_3 + W*P_4)) +// Y_lo := W6*(P_5 + W*(P_6 + W*(P_7 + W*P_8))) // // Case log_regular: // @@ -234,89 +184,87 @@ // Step 0. Initialization // ---------------------- // -// Z := X + E +// Z := X + 1 // N := unbaised exponent of Z // S_hi := 2^(-N) * Z -// S_lo := 2^(-N) * { (max(X,E)-Z) + min(X,E) } -// -// Note that S_lo is always 0 for the case E = 0. +// S_lo := 2^(-N) * { (max(X,1)-Z) + min(X,1) } // // Step 1. Argument Reduction // -------------------------- // // Let // -// Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63 +// Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... 
d_63 // // We obtain G_1, G_2, G_3 by the following steps. // // -// Define X_0 := 1.d_1 d_2 ... d_14. This is extracted -// from S_hi. +// Define X_0 := 1.d_1 d_2 ... d_14. This is extracted +// from S_hi. // -// Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated -// to lsb = 2^(-4). +// Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated +// to lsb = 2^(-4). // -// Define index_1 := [ d_1 d_2 d_3 d_4 ]. +// Define index_1 := [ d_1 d_2 d_3 d_4 ]. // -// Fetch Z_1 := (1/A_1) rounded UP in fixed point with -// fixed point lsb = 2^(-15). -// Z_1 looks like z_0.z_1 z_2 ... z_15 -// Note that the fetching is done using index_1. -// A_1 is actually not needed in the implementation -// and is used here only to explain how is the value -// Z_1 defined. +// Fetch Z_1 := (1/A_1) rounded UP in fixed point with +// fixed point lsb = 2^(-15). +// Z_1 looks like z_0.z_1 z_2 ... z_15 +// Note that the fetching is done using index_1. +// A_1 is actually not needed in the implementation +// and is used here only to explain how is the value +// Z_1 defined. // -// Fetch G_1 := (1/A_1) truncated to 21 sig. bits. -// floating pt. Again, fetching is done using index_1. A_1 -// explains how G_1 is defined. +// Fetch G_1 := (1/A_1) truncated to 21 sig. bits. +// floating pt. Again, fetching is done using index_1. A_1 +// explains how G_1 is defined. // -// Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14) -// = 1.0 0 0 0 d_5 ... d_14 -// This is accomplised by integer multiplication. -// It is proved that X_1 indeed always begin -// with 1.0000 in fixed point. +// Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14) +// = 1.0 0 0 0 d_5 ... d_14 +// This is accomplised by integer multiplication. +// It is proved that X_1 indeed always begin +// with 1.0000 in fixed point. // // -// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1 -// truncated to lsb = 2^(-8). Similar to A_1, -// A_2 is not needed in actual implementation. 
It -// helps explain how some of the values are defined. +// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1 +// truncated to lsb = 2^(-8). Similar to A_1, +// A_2 is not needed in actual implementation. It +// helps explain how some of the values are defined. // -// Define index_2 := [ d_5 d_6 d_7 d_8 ]. +// Define index_2 := [ d_5 d_6 d_7 d_8 ]. // -// Fetch Z_2 := (1/A_2) rounded UP in fixed point with -// fixed point lsb = 2^(-15). Fetch done using index_2. -// Z_2 looks like z_0.z_1 z_2 ... z_15 +// Fetch Z_2 := (1/A_2) rounded UP in fixed point with +// fixed point lsb = 2^(-15). Fetch done using index_2. +// Z_2 looks like z_0.z_1 z_2 ... z_15 // -// Fetch G_2 := (1/A_2) truncated to 21 sig. bits. -// floating pt. +// Fetch G_2 := (1/A_2) truncated to 21 sig. bits. +// floating pt. // -// Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14) -// = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14 -// This is accomplised by integer multiplication. -// It is proved that X_2 indeed always begin -// with 1.00000000 in fixed point. +// Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14) +// = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14 +// This is accomplised by integer multiplication. +// It is proved that X_2 indeed always begin +// with 1.00000000 in fixed point. // // -// Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1. -// This is 2^(-14) + X_2 truncated to lsb = 2^(-13). +// Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1. +// This is 2^(-14) + X_2 truncated to lsb = 2^(-13). // -// Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ]. +// Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ]. // -// Fetch G_3 := (1/A_3) truncated to 21 sig. bits. -// floating pt. Fetch is done using index_3. +// Fetch G_3 := (1/A_3) truncated to 21 sig. bits. +// floating pt. Fetch is done using index_3. // -// Compute G := G_1 * G_2 * G_3. +// Compute G := G_1 * G_2 * G_3. // -// This is done exactly since each of G_j only has 21 sig. bits. 
+// This is done exactly since each of G_j only has 21 sig. bits. // -// Compute +// Compute // -// r := (G*S_hi - 1) + G*S_lo using 2 FMA operations. +// r := (G*S_hi - 1) + G*S_lo using 2 FMA operations. // -// thus, r approximates G*(S_hi+S_lo) - 1 to within a couple of -// rounding errors. +// Thus r approximates G*(S_hi + S_lo) - 1 to within a couple of +// rounding errors. // // // Step 2. Approximation @@ -326,1258 +274,878 @@ // reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13); // thus logl(1+r) can be approximated by a short polynomial: // -// logl(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5 +// logl(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5 // // // Step 3. Reconstruction // ---------------------- // -// This step computes the desired result of logl(X+E): +// This step computes the desired result of logl(X+1): // -// logl(X+E) = logl( 2^N * (S_hi + S_lo) ) -// = N*logl(2) + logl( S_hi + S_lo ) -// = N*logl(2) + logl(1/G) + -// logl(1 + C*(S_hi+S_lo) - 1 ) +// logl(X+1) = logl( 2^N * (S_hi + S_lo) ) +// = N*logl(2) + logl( S_hi + S_lo) ) +// = N*logl(2) + logl(1/G) + +// logl(1 + G * ( S_hi + S_lo ) - 1 ) // // logl(2), logl(1/G_j) are stored as pairs of (single,double) numbers: // log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are // single-precision numbers and the low parts are double precision // numbers. These have the property that // -// N*log2_hi + SUM ( log1byGj_hi ) +// N*log2_hi + SUM ( log1byGj_hi ) // // is computable exactly in double-extended precision (64 sig. bits). 
// Finally // -// Y_hi := N*log2_hi + SUM ( log1byGj_hi ) -// Y_lo := poly_hi + [ poly_lo + -// ( SUM ( log1byGj_lo ) + N*log2_lo ) ] -// set lsb(Y_lo) to be 1 +// Y_hi := N*log2_hi + SUM ( log1byGj_hi ) +// Y_lo := poly_hi + [ poly_lo + +// ( SUM ( log1byGj_lo ) + N*log2_lo ) ] // -#include "libm_support.h" +RODATA +.align 64 -#ifdef _LIBC -.rodata -#else -.data -#endif +// ************* DO NOT CHANGE THE ORDER OF THESE TABLES ************* -// P_7, P_6, P_5, P_4, P_3, P_2, and P_1 +// P_8, P_7, P_6, P_5, P_4, P_3, P_2, and P_1 + +LOCAL_OBJECT_START(Constants_P) +//data4 0xEFD62B15,0xE3936754,0x00003FFB,0x00000000 +//data4 0xA5E56381,0x8003B271,0x0000BFFC,0x00000000 +//data4 0x73282DB0,0x9249248C,0x00003FFC,0x00000000 +//data4 0x47305052,0xAAAAAA9F,0x0000BFFC,0x00000000 +//data4 0xCCD17FC9,0xCCCCCCCC,0x00003FFC,0x00000000 +//data4 0x00067ED5,0x80000000,0x0000BFFD,0x00000000 +//data4 0xAAAAAAAA,0xAAAAAAAA,0x00003FFD,0x00000000 +//data4 0xFFFFFFFE,0xFFFFFFFF,0x0000BFFD,0x00000000 +data8 0xE3936754EFD62B15,0x00003FFB +data8 0x8003B271A5E56381,0x0000BFFC +data8 0x9249248C73282DB0,0x00003FFC +data8 0xAAAAAA9F47305052,0x0000BFFC +data8 0xCCCCCCCCCCD17FC9,0x00003FFC +data8 0x8000000000067ED5,0x0000BFFD +data8 0xAAAAAAAAAAAAAAAA,0x00003FFD +data8 0xFFFFFFFFFFFFFFFE,0x0000BFFD +LOCAL_OBJECT_END(Constants_P) -.align 64 -Constants_P: -ASM_TYPE_DIRECTIVE(Constants_P,@object) -data4 0xEFD62B15,0xE3936754,0x00003FFB,0x00000000 -data4 0xA5E56381,0x8003B271,0x0000BFFC,0x00000000 -data4 0x73282DB0,0x9249248C,0x00003FFC,0x00000000 -data4 0x47305052,0xAAAAAA9F,0x0000BFFC,0x00000000 -data4 0xCCD17FC9,0xCCCCCCCC,0x00003FFC,0x00000000 -data4 0x00067ED5,0x80000000,0x0000BFFD,0x00000000 -data4 0xAAAAAAAA,0xAAAAAAAA,0x00003FFD,0x00000000 -data4 0xFFFFFFFE,0xFFFFFFFF,0x0000BFFD,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_P) - // log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1 -.align 64 -Constants_Q: -ASM_TYPE_DIRECTIVE(Constants_Q,@object) -data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 -data4 
0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 -data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 -data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 -data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 -data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_Q) - -// Z1 - 16 bit fixed, G1 and H1 - IEEE single - -.align 64 -Constants_Z_G_H_h1: -ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h1,@object) -data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000 -data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000,0x617D741C,0x3DA163A6 -data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000,0xCBD3D5BB,0x3E2C55E6 -data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000,0xD86EA5E7,0xBE3EB0BF -data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000,0x86B12760,0x3E2E6A8C -data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000,0x5C0739BA,0x3E47574C -data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000,0x13E8AF2F,0x3E20E30F -data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000,0xF2C630BD,0xBE42885B -data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000,0x97E577C6,0x3E497F34 -data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000,0xA6B0A5AB,0x3E3E6A6E -data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000,0xD328D9BE,0xBDF43E3C -data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000,0x0ADB090A,0x3E4094C3 -data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000,0xFC1FE510,0xBE28FBB2 -data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000,0x10FDE3FA,0x3E3A7895 -data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000,0x7CC8C98F,0x3E508CE5 -data4 0x00004211,0x3F042108,0x3F29516A,0x00000000,0xA223106C,0xBE534874 -ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h1) - -// Z2 - 16 bit fixed, G2 and H2 - IEEE single - -.align 64 -Constants_Z_G_H_h2: -ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h2,@object) -data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000 -data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000,0x22C42273,0x3DB5A116 -data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000,0x21F86ED3,0x3DE620CF -data4 
0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000,0x484F34ED,0xBDAFA07E -data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000,0x3860BCF6,0xBDFE07F0 -data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000,0xA78093D6,0x3DEA370F -data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000,0x72A753D0,0x3DFF5791 -data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000,0xA7EF896B,0x3DFEBE6C -data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000,0x409ECB43,0x3E0CF156 -data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000,0xFFEF71DF,0xBE0B6F97 -data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000,0x5D59EEE8,0xBE080483 -data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000,0xA9192A74,0x3E1F91E9 -data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000,0xBF72A8CD,0xBE139A06 -data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000,0xF8FBA6CF,0x3E1D9202 -data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000,0xBA796223,0xBE1DCCC4 -data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000,0xB6B7C239,0xBE049391 -ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h2) - -// G3 and H3 - IEEE single and h3 -IEEE double - -.align 64 -Constants_Z_G_H_h3: -ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h3,@object) -data4 0x3F7FFC00,0x38800100,0x562224CD,0x3D355595 -data4 0x3F7FF400,0x39400480,0x06136FF6,0x3D8200A2 -data4 0x3F7FEC00,0x39A00640,0xE8DE9AF0,0x3DA4D68D -data4 0x3F7FE400,0x39E00C41,0xB10238DC,0xBD8B4291 -data4 0x3F7FDC00,0x3A100A21,0x3B1952CA,0xBD89CCB8 -data4 0x3F7FD400,0x3A300F22,0x1DC46826,0xBDB10707 -data4 0x3F7FCC08,0x3A4FF51C,0xF43307DB,0x3DB6FCB9 -data4 0x3F7FC408,0x3A6FFC1D,0x62DC7872,0xBD9B7C47 -data4 0x3F7FBC10,0x3A87F20B,0x3F89154A,0xBDC3725E -data4 0x3F7FB410,0x3A97F68B,0x62B9D392,0xBD93519D -data4 0x3F7FAC18,0x3AA7EB86,0x0F21BD9D,0x3DC18441 -data4 0x3F7FA420,0x3AB7E101,0x2245E0A6,0xBDA64B95 -data4 0x3F7F9C20,0x3AC7E701,0xAABB34B8,0x3DB4B0EC -data4 0x3F7F9428,0x3AD7DD7B,0x6DC40A7E,0x3D992337 -data4 0x3F7F8C30,0x3AE7D474,0x4F2083D3,0x3DC6E17B -data4 0x3F7F8438,0x3AF7CBED,0x811D4394,0x3DAE314B -data4 0x3F7F7C40,0x3B03E1F3,0xB08F2DB1,0xBDD46F21 -data4 
0x3F7F7448,0x3B0BDE2F,0x6D34522B,0xBDDC30A4 -data4 0x3F7F6C50,0x3B13DAAA,0xB1F473DB,0x3DCB0070 -data4 0x3F7F6458,0x3B1BD766,0x6AD282FD,0xBDD65DDC -data4 0x3F7F5C68,0x3B23CC5C,0xF153761A,0xBDCDAB83 -data4 0x3F7F5470,0x3B2BC997,0x341D0F8F,0xBDDADA40 -data4 0x3F7F4C78,0x3B33C711,0xEBC394E8,0x3DCD1BD7 -data4 0x3F7F4488,0x3B3BBCC6,0x52E3E695,0xBDC3532B -data4 0x3F7F3C90,0x3B43BAC0,0xE846B3DE,0xBDA3961E -data4 0x3F7F34A0,0x3B4BB0F4,0x785778D4,0xBDDADF06 -data4 0x3F7F2CA8,0x3B53AF6D,0xE55CE212,0x3DCC3ED1 -data4 0x3F7F24B8,0x3B5BA620,0x9E382C15,0xBDBA3103 -data4 0x3F7F1CC8,0x3B639D12,0x5C5AF197,0x3D635A0B -data4 0x3F7F14D8,0x3B6B9444,0x71D34EFC,0xBDDCCB19 -data4 0x3F7F0CE0,0x3B7393BC,0x52CD7ADA,0x3DC74502 -data4 0x3F7F04F0,0x3B7B8B6D,0x7D7F2A42,0xBDB68F17 -ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h3) +LOCAL_OBJECT_START(Constants_Q) +//data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 +//data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 +//data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 +//data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 +//data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 +//data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 +data8 0xB172180000000000,0x00003FFE +data8 0x82E308654361C4C6,0x0000BFE2 +data8 0xCCCCCAF2328833CB,0x00003FFC +data8 0x80000077A9D4BAFB,0x0000BFFD +data8 0xAAAAAAAAAAABE3D2,0x00003FFD +data8 0xFFFFFFFFFFFFDAB7,0x0000BFFD +LOCAL_OBJECT_END(Constants_Q) + +// 1/ln10_hi, 1/ln10_lo + +LOCAL_OBJECT_START(Constants_1_by_LN10) +//data4 0x37287195,0xDE5BD8A9,0x00003FFD,0x00000000 +//data4 0xACCF70C8,0xD56EAABE,0x00003FBB,0x00000000 +data8 0xDE5BD8A937287195,0x00003FFD +data8 0xD56EAABEACCF70C8,0x00003FBB +LOCAL_OBJECT_END(Constants_1_by_LN10) + + +// Z1 - 16 bit fixed -// -// Exponent Thresholds and Tiny Thresholds -// for 8, 11, 15, and 17 bit exponents -// -// Expo_Range Value -// -// 0 (8 bits) 2^(-126) -// 1 (11 bits) 2^(-1022) -// 2 (15 bits) 2^(-16382) -// 3 (17 bits) 2^(-16382) -// -// Tiny_Table -// ---------- -// Expo_Range Value -// 
-// 0 (8 bits) 2^(-16382) -// 1 (11 bits) 2^(-16382) -// 2 (15 bits) 2^(-16382) -// 3 (17 bits) 2^(-16382) -// +LOCAL_OBJECT_START(Constants_Z_1) +data4 0x00008000 +data4 0x00007879 +data4 0x000071C8 +data4 0x00006BCB +data4 0x00006667 +data4 0x00006187 +data4 0x00005D18 +data4 0x0000590C +data4 0x00005556 +data4 0x000051EC +data4 0x00004EC5 +data4 0x00004BDB +data4 0x00004925 +data4 0x0000469F +data4 0x00004445 +data4 0x00004211 +LOCAL_OBJECT_END(Constants_Z_1) -.align 64 -Constants_Threshold: -ASM_TYPE_DIRECTIVE(Constants_Threshold,@object) -data4 0x00000000,0x80000000,0x00003F81,0x00000000 -data4 0x00000000,0x80000000,0x00000001,0x00000000 -data4 0x00000000,0x80000000,0x00003C01,0x00000000 -data4 0x00000000,0x80000000,0x00000001,0x00000000 -data4 0x00000000,0x80000000,0x00000001,0x00000000 -data4 0x00000000,0x80000000,0x00000001,0x00000000 -data4 0x00000000,0x80000000,0x00000001,0x00000000 -data4 0x00000000,0x80000000,0x00000001,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_Threshold) +// G1 and H1 - IEEE single and h1 - IEEE double -.align 64 -Constants_1_by_LN10: -ASM_TYPE_DIRECTIVE(Constants_1_by_LN10,@object) -data4 0x37287195,0xDE5BD8A9,0x00003FFD,0x00000000 -data4 0xACCF70C8,0xD56EAABE,0x00003FBB,0x00000000 -ASM_SIZE_DIRECTIVE(Constants_1_by_LN10) - -FR_Input_X = f8 -FR_Neg_One = f9 -FR_E = f33 -FR_Em1 = f34 -FR_Y_hi = f34 -// Shared with Em1 -FR_Y_lo = f35 -FR_Scale = f36 -FR_X_Prime = f37 -FR_Z = f38 -FR_S_hi = f38 -// Shared with Z -FR_W = f39 -FR_G = f40 -FR_wsq = f40 -// Shared with G -FR_H = f41 -FR_w4 = f41 -// Shared with H -FR_h = f42 -FR_w6 = f42 -// Shared with h -FR_G_tmp = f43 -FR_poly_lo = f43 -// Shared with G_tmp -FR_P8 = f43 -// Shared with G_tmp -FR_H_tmp = f44 -FR_poly_hi = f44 - // Shared with H_tmp -FR_P7 = f44 -// Shared with H_tmp -FR_h_tmp = f45 -FR_rsq = f45 -// Shared with h_tmp -FR_P6 = f45 -// Shared with h_tmp -FR_abs_W = f46 -FR_r = f46 -// Shared with abs_W -FR_AA = f47 -FR_log2_hi = f47 -// Shared with AA -FR_BB = f48 -FR_log2_lo 
= f48 -// Shared with BB -FR_S_lo = f49 -FR_two_negN = f50 -FR_float_N = f51 -FR_Q4 = f52 -FR_dummy = f52 -// Shared with Q4 -FR_P4 = f52 -// Shared with Q4 -FR_Threshold = f52 -// Shared with Q4 -FR_Q3 = f53 -FR_P3 = f53 -// Shared with Q3 -FR_Tiny = f53 -// Shared with Q3 -FR_Q2 = f54 -FR_P2 = f54 -// Shared with Q2 -FR_1LN10_hi = f54 -// Shared with Q2 -FR_Q1 = f55 -FR_P1 = f55 -// Shared with Q1 -FR_1LN10_lo = f55 -// Shared with Q1 -FR_P5 = f98 -FR_SCALE = f98 -FR_Output_X_tmp = f99 - -GR_Expo_Range = r32 -GR_Table_Base = r34 -GR_Table_Base1 = r35 -GR_Table_ptr = r36 -GR_Index2 = r37 -GR_signif = r38 -GR_X_0 = r39 -GR_X_1 = r40 -GR_X_2 = r41 -GR_Z_1 = r42 -GR_Z_2 = r43 -GR_N = r44 -GR_Bias = r45 -GR_M = r46 -GR_ScaleN = r47 -GR_Index3 = r48 -GR_Perturb = r49 -GR_Table_Scale = r50 +LOCAL_OBJECT_START(Constants_G_H_h1) +data4 0x3F800000,0x00000000 +data8 0x0000000000000000 +data4 0x3F70F0F0,0x3D785196 +data8 0x3DA163A6617D741C +data4 0x3F638E38,0x3DF13843 +data8 0x3E2C55E6CBD3D5BB +data4 0x3F579430,0x3E2FF9A0 +data8 0xBE3EB0BFD86EA5E7 +data4 0x3F4CCCC8,0x3E647FD6 +data8 0x3E2E6A8C86B12760 +data4 0x3F430C30,0x3E8B3AE7 +data8 0x3E47574C5C0739BA +data4 0x3F3A2E88,0x3EA30C68 +data8 0x3E20E30F13E8AF2F +data4 0x3F321640,0x3EB9CEC8 +data8 0xBE42885BF2C630BD +data4 0x3F2AAAA8,0x3ECF9927 +data8 0x3E497F3497E577C6 +data4 0x3F23D708,0x3EE47FC5 +data8 0x3E3E6A6EA6B0A5AB +data4 0x3F1D89D8,0x3EF8947D +data8 0xBDF43E3CD328D9BE +data4 0x3F17B420,0x3F05F3A1 +data8 0x3E4094C30ADB090A +data4 0x3F124920,0x3F0F4303 +data8 0xBE28FBB2FC1FE510 +data4 0x3F0D3DC8,0x3F183EBF +data8 0x3E3A789510FDE3FA +data4 0x3F088888,0x3F20EC80 +data8 0x3E508CE57CC8C98F +data4 0x3F042108,0x3F29516A +data8 0xBE534874A223106C +LOCAL_OBJECT_END(Constants_G_H_h1) -// -// Added for unwind support -// +// Z2 - 16 bit fixed + +LOCAL_OBJECT_START(Constants_Z_2) +data4 0x00008000 +data4 0x00007F81 +data4 0x00007F02 +data4 0x00007E85 +data4 0x00007E08 +data4 0x00007D8D +data4 0x00007D12 +data4 0x00007C98 +data4 
0x00007C20 +data4 0x00007BA8 +data4 0x00007B31 +data4 0x00007ABB +data4 0x00007A45 +data4 0x000079D1 +data4 0x0000795D +data4 0x000078EB +LOCAL_OBJECT_END(Constants_Z_2) + +// G2 and H2 - IEEE single and h2 - IEEE double + +LOCAL_OBJECT_START(Constants_G_H_h2) +data4 0x3F800000,0x00000000 +data8 0x0000000000000000 +data4 0x3F7F00F8,0x3B7F875D +data8 0x3DB5A11622C42273 +data4 0x3F7E03F8,0x3BFF015B +data8 0x3DE620CF21F86ED3 +data4 0x3F7D08E0,0x3C3EE393 +data8 0xBDAFA07E484F34ED +data4 0x3F7C0FC0,0x3C7E0586 +data8 0xBDFE07F03860BCF6 +data4 0x3F7B1880,0x3C9E75D2 +data8 0x3DEA370FA78093D6 +data4 0x3F7A2328,0x3CBDC97A +data8 0x3DFF579172A753D0 +data4 0x3F792FB0,0x3CDCFE47 +data8 0x3DFEBE6CA7EF896B +data4 0x3F783E08,0x3CFC15D0 +data8 0x3E0CF156409ECB43 +data4 0x3F774E38,0x3D0D874D +data8 0xBE0B6F97FFEF71DF +data4 0x3F766038,0x3D1CF49B +data8 0xBE0804835D59EEE8 +data4 0x3F757400,0x3D2C531D +data8 0x3E1F91E9A9192A74 +data4 0x3F748988,0x3D3BA322 +data8 0xBE139A06BF72A8CD +data4 0x3F73A0D0,0x3D4AE46F +data8 0x3E1D9202F8FBA6CF +data4 0x3F72B9D0,0x3D5A1756 +data8 0xBE1DCCC4BA796223 +data4 0x3F71D488,0x3D693B9D +data8 0xBE049391B6B7C239 +LOCAL_OBJECT_END(Constants_G_H_h2) + +// G3 and H3 - IEEE single and h3 - IEEE double + +LOCAL_OBJECT_START(Constants_G_H_h3) +data4 0x3F7FFC00,0x38800100 +data8 0x3D355595562224CD +data4 0x3F7FF400,0x39400480 +data8 0x3D8200A206136FF6 +data4 0x3F7FEC00,0x39A00640 +data8 0x3DA4D68DE8DE9AF0 +data4 0x3F7FE400,0x39E00C41 +data8 0xBD8B4291B10238DC +data4 0x3F7FDC00,0x3A100A21 +data8 0xBD89CCB83B1952CA +data4 0x3F7FD400,0x3A300F22 +data8 0xBDB107071DC46826 +data4 0x3F7FCC08,0x3A4FF51C +data8 0x3DB6FCB9F43307DB +data4 0x3F7FC408,0x3A6FFC1D +data8 0xBD9B7C4762DC7872 +data4 0x3F7FBC10,0x3A87F20B +data8 0xBDC3725E3F89154A +data4 0x3F7FB410,0x3A97F68B +data8 0xBD93519D62B9D392 +data4 0x3F7FAC18,0x3AA7EB86 +data8 0x3DC184410F21BD9D +data4 0x3F7FA420,0x3AB7E101 +data8 0xBDA64B952245E0A6 +data4 0x3F7F9C20,0x3AC7E701 +data8 0x3DB4B0ECAABB34B8 +data4 
0x3F7F9428,0x3AD7DD7B +data8 0x3D9923376DC40A7E +data4 0x3F7F8C30,0x3AE7D474 +data8 0x3DC6E17B4F2083D3 +data4 0x3F7F8438,0x3AF7CBED +data8 0x3DAE314B811D4394 +data4 0x3F7F7C40,0x3B03E1F3 +data8 0xBDD46F21B08F2DB1 +data4 0x3F7F7448,0x3B0BDE2F +data8 0xBDDC30A46D34522B +data4 0x3F7F6C50,0x3B13DAAA +data8 0x3DCB0070B1F473DB +data4 0x3F7F6458,0x3B1BD766 +data8 0xBDD65DDC6AD282FD +data4 0x3F7F5C68,0x3B23CC5C +data8 0xBDCDAB83F153761A +data4 0x3F7F5470,0x3B2BC997 +data8 0xBDDADA40341D0F8F +data4 0x3F7F4C78,0x3B33C711 +data8 0x3DCD1BD7EBC394E8 +data4 0x3F7F4488,0x3B3BBCC6 +data8 0xBDC3532B52E3E695 +data4 0x3F7F3C90,0x3B43BAC0 +data8 0xBDA3961EE846B3DE +data4 0x3F7F34A0,0x3B4BB0F4 +data8 0xBDDADF06785778D4 +data4 0x3F7F2CA8,0x3B53AF6D +data8 0x3DCC3ED1E55CE212 +data4 0x3F7F24B8,0x3B5BA620 +data8 0xBDBA31039E382C15 +data4 0x3F7F1CC8,0x3B639D12 +data8 0x3D635A0B5C5AF197 +data4 0x3F7F14D8,0x3B6B9444 +data8 0xBDDCCB1971D34EFC +data4 0x3F7F0CE0,0x3B7393BC +data8 0x3DC7450252CD7ADA +data4 0x3F7F04F0,0x3B7B8B6D +data8 0xBDB68F177D7F2A42 +LOCAL_OBJECT_END(Constants_G_H_h3) -GR_SAVE_PFS = r51 -GR_SAVE_B0 = r52 -GR_SAVE_GP = r53 -GR_Parameter_X = r54 -GR_Parameter_Y = r55 -GR_Parameter_RESULT = r56 -GR_Parameter_TAG = r57 + +// Floating Point Registers + +FR_Input_X = f8 + +FR_Y_hi = f34 +FR_Y_lo = f35 + +FR_Scale = f36 +FR_X_Prime = f37 +FR_S_hi = f38 +FR_W = f39 +FR_G = f40 + +FR_H = f41 +FR_wsq = f42 +FR_w4 = f43 +FR_h = f44 +FR_w6 = f45 + +FR_G2 = f46 +FR_H2 = f47 +FR_poly_lo = f48 +FR_P8 = f49 +FR_poly_hi = f50 + +FR_P7 = f51 +FR_h2 = f52 +FR_rsq = f53 +FR_P6 = f54 +FR_r = f55 + +FR_log2_hi = f56 +FR_log2_lo = f57 +FR_p87 = f58 +FR_p876 = f58 +FR_p8765 = f58 +FR_float_N = f59 +FR_Q4 = f60 + +FR_p43 = f61 +FR_p432 = f61 +FR_p4321 = f61 +FR_P4 = f62 +FR_G3 = f63 +FR_H3 = f64 +FR_h3 = f65 + +FR_Q3 = f66 +FR_P3 = f67 +FR_Q2 = f68 +FR_P2 = f69 +FR_1LN10_hi = f70 + +FR_Q1 = f71 +FR_P1 = f72 +FR_1LN10_lo = f73 +FR_P5 = f74 +FR_rcub = f75 + +FR_Output_X_tmp = f76 +FR_Neg_One = f77 
+FR_Z = f78 +FR_AA = f79 +FR_BB = f80 +FR_S_lo = f81 +FR_2_to_minus_N = f82 FR_X = f8 FR_Y = f0 -FR_RESULT = f99 +FR_RESULT = f76 -.section .text -.proc logl# -.global logl# -.align 64 -logl: -#ifdef _LIBC -.global __ieee754_logl -__ieee754_logl: -#endif -{ .mfi -alloc r32 = ar.pfs,0,22,4,0 -(p0) fnorm.s1 FR_X_Prime = FR_Input_X -(p0) cmp.eq.unc p7, p0 = r0, r0 -} -{ .mfi -(p0) cmp.ne.unc p14, p0 = r0, r0 -(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3 -(p0) cmp.ne.unc p15, p0 = r0, r0 ;; -} -{ .mfi - nop.m 0 -(p0) fclass.nm.unc p10, p0 = FR_Input_X, 0x1FF - nop.i 0 -} -{ .mfi -nop.m 999 -(p0) fcmp.eq.unc.s1 p8, p0 = FR_Input_X, f0 - nop.i 0 -} -{ .mfi - nop.m 999 -(p0) fcmp.lt.unc.s1 p13, p0 = FR_Input_X, f0 - nop.i 0 -} -{ .mfi - nop.m 999 -(p0) fcmp.eq.unc.s1 p9, p0 = FR_Input_X, f1 - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p0) fsub.s1 FR_Em1 = f0,f1 - nop.i 999 -} -{ .mfb - nop.m 999 -(p0) fadd FR_E = f0,f0 -// -// Create E = 0 and Em1 = -1 -// Check for X == 1, meaning logl(1) -// Check for X < 0, meaning logl(negative) -// Check for X == 0, meaning logl(0) -// Identify NatVals, NaNs, Infs. -// Identify EM unsupporteds. 
-// Identify Negative values - us S1 so as -// not to raise denormal operand exception -// Set p15 to false for log -// Set p14 to false for log -// Set p7 true for log and log1p -// -(p0) br.cond.sptk L(LOGL_BEGIN) ;; -} -.endp logl -ASM_SIZE_DIRECTIVE(logl) +// General Purpose Registers -.section .text -.proc log10l# -.global log10l# -.align 64 -log10l: -#ifdef _LIBC -.global __ieee754_log10l -__ieee754_log10l: -#endif -{ .mfi -alloc r32 = ar.pfs,0,22,4,0 -(p0) fadd FR_E = f0,f0 - nop.i 0 -} -{ .mfi - nop.m 0 -(p0) fsub.s1 FR_Em1 = f0,f1 - nop.i 0 -} -{ .mfi -(p0) cmp.ne.unc p15, p0 = r0, r0 -(p0) fcmp.eq.unc.s1 p9, p0 = FR_Input_X, f1 - nop.i 0 -} -{ .mfi -(p0) cmp.eq.unc p14, p0 = r0, r0 -(p0) fcmp.lt.unc.s1 p13, p0 = FR_Input_X, f0 -(p0) cmp.ne.unc p7, p0 = r0, r0 ;; -} -{ .mfi - nop.m 999 -(p0) fcmp.eq.unc.s1 p8, p0 = FR_Input_X, f0 - nop.i 999 -} -{ .mfi - nop.m 999 -(p0) fclass.nm.unc p10, p0 = FR_Input_X, 0x1FF - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3 - nop.i 999 -} -{ .mfb - nop.m 999 -(p0) fnorm.s1 FR_X_Prime = FR_Input_X -// -// Create E = 0 and Em1 = -1 -// Check for X == 1, meaning logl(1) -// Check for X < 0, meaning logl(negative) -// Check for X == 0, meaning logl(0) -// Identify NatVals, NaNs, Infs. -// Identify EM unsupporteds. 
-// Identify Negative values - us S1 so as -// Identify Negative values - us S1 so as -// not to raise denormal operand exception -// Set p15 to false for log10 -// Set p14 to true for log10 -// Set p7 to false for log10 -// -(p0) br.cond.sptk L(LOGL_BEGIN) ;; -} +GR_ad_p = r33 +GR_Index1 = r34 +GR_Index2 = r35 +GR_signif = r36 +GR_X_0 = r37 +GR_X_1 = r38 +GR_X_2 = r39 +GR_minus_N = r39 +GR_Z_1 = r40 +GR_Z_2 = r41 +GR_N = r42 +GR_Bias = r43 +GR_M = r44 +GR_Index3 = r45 +GR_exp_2tom80 = r45 +GR_ad_p2 = r46 +GR_exp_mask = r47 +GR_exp_2tom7 = r48 +GR_ad_ln10 = r49 +GR_ad_tbl_1 = r50 +GR_ad_tbl_2 = r51 +GR_ad_tbl_3 = r52 +GR_ad_q = r53 +GR_ad_z_1 = r54 +GR_ad_z_2 = r55 +GR_ad_z_3 = r56 +GR_minus_N = r39 + +// +// Added for unwind support +// -.endp log10l -ASM_SIZE_DIRECTIVE(log10l) +GR_SAVE_PFS = r50 +GR_SAVE_B0 = r51 +GR_SAVE_GP = r52 +GR_Parameter_X = r53 +GR_Parameter_Y = r54 +GR_Parameter_RESULT = r55 +GR_Parameter_TAG = r56 .section .text -.proc log1pl# -.global log1pl# -.align 64 -log1pl: -#ifdef _LIBC -.global __log1pl -__log1pl: -#endif +GLOBAL_IEEE754_ENTRY(log1pl) { .mfi -alloc r32 = ar.pfs,0,22,4,0 -(p0) fsub.s1 FR_Neg_One = f0,f1 -(p0) cmp.eq.unc p7, p0 = r0, r0 -} -{ .mfi -(p0) cmp.ne.unc p14, p0 = r0, r0 -(p0) fnorm.s1 FR_X_Prime = FR_Input_X -(p0) cmp.eq.unc p15, p0 = r0, r0 ;; + alloc r32 = ar.pfs,0,21,4,0 + fclass.m p6, p0 = FR_Input_X, 0x1E3 // Test for natval, nan, inf + nop.i 999 } { .mfi - nop.m 0 -(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3 - nop.i 0 + addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp + fma.s1 FR_Z = FR_Input_X, f1, f1 // x+1 + nop.i 999 } +;; + { .mfi nop.m 999 -(p0) fclass.nm.unc p10, p0 = FR_Input_X, 0x1FF - nop.i 0 + fmerge.ns FR_Neg_One = f1, f1 // Form -1.0 + nop.i 999 } { .mfi nop.m 999 -(p0) fcmp.eq.unc.s1 p9, p0 = FR_Input_X, f0 - nop.i 0 + fnorm.s1 FR_X_Prime = FR_Input_X // Normalize x + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fadd FR_Em1 = f0,f0 - nop.i 999 ;; + ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1 
+ nop.f 999 + mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7 } -{ .mfi - nop.m 999 -(p0) fadd FR_E = f0,f1 - nop.i 999 ;; +;; + +{ .mfb + getf.sig GR_signif = FR_Z // Get significand of x+1 + fcmp.eq.s1 p9, p0 = FR_Input_X, f0 // Test for x=0 +(p6) br.cond.spnt LOG1P_special // Branch for nan, inf, natval } +;; + { .mfi - nop.m 999 -(p0) fcmp.eq.unc.s1 p8, p0 = FR_Input_X, FR_Neg_One - nop.i 999 + add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1 + fcmp.lt.s1 p13, p0 = FR_X_Prime, FR_Neg_One // Test for x<-1 + add GR_ad_p = -0x100, GR_ad_z_1 // Point to Constants_P } { .mfi - nop.m 999 -(p0) fcmp.lt.unc.s1 p13, p0 = FR_Input_X, FR_Neg_One - nop.i 999 + add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2 + nop.f 999 + add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2 } -L(LOGL_BEGIN): +;; + { .mfi - nop.m 999 -(p0) fadd.s1 FR_Z = FR_X_Prime, FR_E - nop.i 999 -} -{ .mlx - nop.m 999 -(p0) movl GR_Table_Scale = 0x0000000000000018 ;; + add GR_ad_q = 0x080, GR_ad_p // Point to Constants_Q + fcmp.eq.s1 p8, p0 = FR_X_Prime, FR_Neg_One // Test for x=-1 + extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif } -{ .mmi - nop.m 999 - nop.m 999 -// -// Create E = 1 and Em1 = 0 -// Check for X == 0, meaning logl(1+0) -// Check for X < -1, meaning logl(negative) -// Check for X == -1, meaning logl(0) -// Normalize x -// Identify NatVals, NaNs, Infs. -// Identify EM unsupporteds. 
-// Identify Negative values - us S1 so as -// not to raise denormal operand exception -// Set p15 to true for log1p -// Set p14 to false for log1p -// Set p7 true for log and log1p -// -(p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h1#),gp +{ .mfb + add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3 + nop.f 999 +(p9) br.ret.spnt b0 // Exit if x=0, return input } +;; + { .mfi - nop.m 999 -(p0) fmax.s1 FR_AA = FR_X_Prime, FR_E - nop.i 999 ;; + shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1 + fclass.nm p10, p0 = FR_Input_X, 0x1FF // Test for unsupported + extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of significand } { .mfi - ld8 GR_Table_Base = [GR_Table_Base] -(p0) fmin.s1 FR_BB = FR_X_Prime, FR_E - nop.i 999 -} -{ .mfb - nop.m 999 -(p0) fadd.s1 FR_W = FR_X_Prime, FR_Em1 -// -// Begin load of constants base -// FR_Z = Z = |x| + E -// FR_W = W = |x| + Em1 -// AA = fmax(|x|,E) -// BB = fmin(|x|,E) -// -(p6) br.cond.spnt L(LOGL_64_special) ;; + ldfe FR_P8 = [GR_ad_p],16 // Load P_8 for near1 path + fsub.s1 FR_W = FR_X_Prime, f0 // W = x + add GR_ad_ln10 = 0x060, GR_ad_q // Point to Constants_1_by_LN10 } -{ .mib - nop.m 999 - nop.i 999 -(p10) br.cond.spnt L(LOGL_64_unsupported) ;; +;; + +{ .mfi + ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1 + fmax.s1 FR_AA = FR_X_Prime, f1 // For S_lo, form AA = max(X,1.0) + mov GR_exp_mask = 0x1FFFF // Create exponent mask } { .mib - nop.m 999 - nop.i 999 -(p13) br.cond.spnt L(LOGL_64_negative) ;; + shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1 + mov GR_Bias = 0x0FFFF // Create exponent bias +(p13) br.cond.spnt LOG1P_LT_Minus_1 // Branch if x<-1 } -{ .mib -(p0) getf.sig GR_signif = FR_Z - nop.i 999 -(p9) br.cond.spnt L(LOGL_64_one) ;; +;; + +{ .mfb + ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1 + fmerge.se FR_S_hi = f1,FR_Z // Form |x+1| +(p8) br.cond.spnt LOG1P_EQ_Minus_1 // Branch if x=-1 } -{ .mib - nop.m 999 - nop.i 999 -(p8) br.cond.spnt L(LOGL_64_zero) ;; +;; + +{ .mmb + 
getf.exp GR_N = FR_Z // Get N = exponent of x+1 + ldfd FR_h = [GR_ad_tbl_1] // Load h_1 +(p10) br.cond.spnt LOG1P_unsupported // Branch for unsupported type } +;; + { .mfi -(p0) getf.exp GR_N = FR_Z -// -// Raise possible denormal operand exception -// Create Bias -// -// This function computes ln( x + e ) -// Input FR 1: FR_X = FR_Input_X -// Input FR 2: FR_E = FR_E -// Input FR 3: FR_Em1 = FR_Em1 -// Input GR 1: GR_Expo_Range = GR_Expo_Range = 1 -// Output FR 4: FR_Y_hi -// Output FR 5: FR_Y_lo -// Output FR 6: FR_Scale -// Output PR 7: PR_Safe -// -(p0) fsub.s1 FR_S_lo = FR_AA, FR_Z + ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi + fcmp.eq.s0 p8, p0 = FR_Input_X, f0 // Dummy op to flag denormals + pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1 +} +;; + // -// signif = getf.sig(Z) -// abs_W = fabs(w) +// For performance, don't use result of pmpyshr2.u for 4 cycles. // -(p0) extr.u GR_Table_ptr = GR_signif, 59, 4 ;; -} -{ .mfi - nop.m 999 -(p0) fmerge.se FR_S_hi = f1,FR_Z -(p0) extr.u GR_X_0 = GR_signif, 49, 15 -} { .mmi - nop.m 999 - nop.m 999 -(p0) addl GR_Table_Base1 = @ltoff(Constants_Z_G_H_h2#),gp ;; -} -{ .mlx - ld8 GR_Table_Base1 = [GR_Table_Base1] -(p0) movl GR_Bias = 0x000000000000FFFF ;; -} -{ .mfi - nop.m 999 -(p0) fabs FR_abs_W = FR_W -(p0) pmpyshr2.u GR_Table_ptr = GR_Table_ptr,GR_Table_Scale,0 -} -{ .mfi - nop.m 999 -// -// Branch out for special input values -// -(p0) fcmp.lt.unc.s0 p8, p0 = FR_Input_X, f0 - nop.i 999 ;; + ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo + sub GR_N = GR_N, GR_Bias + mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80 } +;; + { .mfi - nop.m 999 -// -// X_0 = extr.u(signif,49,15) -// Index1 = extr.u(signif,59,4) -// -(p0) fadd.s1 FR_S_lo = FR_S_lo, FR_BB - nop.i 999 ;; + ldfe FR_Q4 = [GR_ad_q],16 // Load Q4 + fms.s1 FR_S_lo = FR_AA, f1, FR_Z // Form S_lo = AA - Z + sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N) } -{ .mii - nop.m 999 - nop.i 999 ;; -// -// Offset_to_Z1 = 24 * Index1 -// For 
performance, don't use result -// for 3 or 4 cycles. -// -(p0) add GR_Table_ptr = GR_Table_ptr, GR_Table_Base ;; +;; + +{ .mmf + ldfe FR_Q3 = [GR_ad_q],16 // Load Q3 + setf.sig FR_float_N = GR_N // Put integer N into rightmost significand + fmin.s1 FR_BB = FR_X_Prime, f1 // For S_lo, form BB = min(X,1.0) } -// -// Add Base to Offset for Z1 -// Create Bias +;; + { .mmi -(p0) ld4 GR_Z_1 = [GR_Table_ptr],4 ;; -(p0) ldfs FR_G = [GR_Table_ptr],4 - nop.i 999 ;; + getf.exp GR_M = FR_W // Get signexp of w = x + ldfe FR_Q2 = [GR_ad_q],16 // Load Q2 + extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1 } +;; + { .mmi -(p0) ldfs FR_H = [GR_Table_ptr],8 ;; -(p0) ldfd FR_h = [GR_Table_ptr],0 -(p0) pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 -} -// -// Load Z_1 -// Get Base of Table2 -// -{ .mfi -(p0) getf.exp GR_M = FR_abs_W - nop.f 999 - nop.i 999 ;; -} -{ .mii - nop.m 999 - nop.i 999 ;; -// -// M = getf.exp(abs_W) -// S_lo = AA - Z -// X_1 = pmpyshr2(X_0,Z_1,15) -// -(p0) sub GR_M = GR_M, GR_Bias ;; -} -// -// M = M - Bias -// Load G1 -// N = getf.exp(Z) -// -{ .mii -(p0) cmp.gt.unc p11, p0 = -80, GR_M -(p0) cmp.gt.unc p12, p0 = -7, GR_M ;; -(p0) extr.u GR_Index2 = GR_X_1, 6, 4 ;; -} -{ .mib - nop.m 999 -// -// if -80 > M, set p11 -// Index2 = extr.u(X_1,6,4) -// if -7 > M, set p12 -// Load H1 -// -(p0) pmpyshr2.u GR_Index2 = GR_Index2,GR_Table_Scale,0 -(p11) br.cond.spnt L(log1pl_small) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p12) br.cond.spnt L(log1pl_near) ;; -} -{ .mii -(p0) sub GR_N = GR_N, GR_Bias -// -// poly_lo = r * poly_lo -// -(p0) add GR_Perturb = 0x1, r0 ;; -(p0) sub GR_ScaleN = GR_Bias, GR_N -} -{ .mii -(p0) setf.sig FR_float_N = GR_N - nop.i 999 ;; -// -// Prepare Index2 - pmpyshr2.u(X_1,Z_2,15) -// Load h1 -// S_lo = S_lo + BB -// Branch for -80 > M -// -(p0) add GR_Index2 = GR_Index2, GR_Table_Base1 + ldfe FR_Q1 = [GR_ad_q] // Load Q1 + shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2 + add GR_ad_p2 = 0x30,GR_ad_p // Point to P_4 } +;; + { .mmi 
-(p0) setf.exp FR_two_negN = GR_ScaleN - nop.m 999 -(p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h3#),gp ;; + ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2 + shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2 + and GR_M = GR_exp_mask, GR_M // Get exponent of w = x } -// -// Index2 points to Z2 -// Branch for -7 > M -// -{ .mmb -(p0) ld4 GR_Z_2 = [GR_Index2],4 -(p0) ld8 GR_Table_Base = [GR_Table_Base] - nop.b 999 ;; -} -(p0) nop.i 999 -// -// Load Z_2 -// N = N - Bias -// Tablebase points to Table3 -// +;; + { .mmi -(p0) ldfs FR_G_tmp = [GR_Index2],4 ;; -// -// Load G_2 -// pmpyshr2 X_2= (X_1,Z_2,15) -// float_N = setf.sig(N) -// ScaleN = Bias - N -// -(p0) ldfs FR_H_tmp = [GR_Index2],8 - nop.i 999 ;; + ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2 + cmp.lt p8, p9 = GR_M, GR_exp_2tom7 // Test |x| < 2^-7 + cmp.lt p7, p0 = GR_M, GR_exp_2tom80 // Test |x| < 2^-80 } -// -// Load H_2 -// two_negN = setf.exp(scaleN) -// G = G_1 * G_2 -// +;; + +// Small path is separate code +// p7 is for the small path: |x| < 2^-80 +// near1 and regular paths are merged. 
+// p8 is for the near1 path: |x| < 2^-7 +// p9 is for regular path: |x| >= 2^-7 + { .mfi -(p0) ldfd FR_h_tmp = [GR_Index2],0 - nop.f 999 -(p0) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 ;; + ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2 + nop.f 999 + nop.i 999 } -{ .mii - nop.m 999 -(p0) extr.u GR_Index3 = GR_X_2, 1, 5 ;; -// -// Load h_2 -// H = H_1 + H_2 -// h = h_1 + h_2 -// Index3 = extr.u(X_2,1,5) -// -(p0) shladd GR_Index3 = GR_Index3,4,GR_Table_Base +{ .mfb +(p9) setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N) +(p7) fnma.s0 f8 = FR_X_Prime, FR_X_Prime, FR_X_Prime // Result x - x*x +(p7) br.ret.spnt b0 // Branch if |x| < 2^-80 } +;; + { .mmi - nop.m 999 - nop.m 999 -// -// float_N = fcvt.xf(float_N) -// load G3 -// -(p0) addl GR_Table_Base = @ltoff(Constants_Q#),gp ;; +(p8) ldfe FR_P7 = [GR_ad_p],16 // Load P_7 for near1 path +(p8) ldfe FR_P4 = [GR_ad_p2],16 // Load P_4 for near1 path +(p9) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2 } -{ .mmi - nop.m 999 - ld8 GR_Table_Base = [GR_Table_Base] - nop.i 999 -};; +;; -{ .mfi -(p0) ldfe FR_log2_hi = [GR_Table_Base],16 -(p0) fmpy.s1 FR_S_lo = FR_S_lo, FR_two_negN - nop.i 999 ;; -} -{ .mmf - nop.m 999 // -// G = G3 * G -// Load h3 -// Load log2_hi -// H = H + H3 +// For performance, don't use result of pmpyshr2.u for 4 cycles. 
// -(p0) ldfe FR_log2_lo = [GR_Table_Base],16 -(p0) fmpy.s1 FR_G = FR_G, FR_G_tmp ;; -} { .mmf -(p0) ldfs FR_G_tmp = [GR_Index3],4 -// -// h = h + h3 -// r = G * S_hi + 1 -// Load log2_lo -// -(p0) ldfe FR_Q4 = [GR_Table_Base],16 -(p0) fadd.s1 FR_h = FR_h, FR_h_tmp ;; -} -{ .mfi -(p0) ldfe FR_Q3 = [GR_Table_Base],16 -(p0) fadd.s1 FR_H = FR_H, FR_H_tmp - nop.i 999 ;; +(p8) ldfe FR_P6 = [GR_ad_p],16 // Load P_6 for near1 path +(p8) ldfe FR_P3 = [GR_ad_p2],16 // Load P_3 for near1 path +(p9) fma.s1 FR_S_lo = FR_S_lo, f1, FR_BB // S_lo = S_lo + BB } +;; + { .mmf -(p0) ldfs FR_H_tmp = [GR_Index3],4 -(p0) ldfe FR_Q2 = [GR_Table_Base],16 -// -// Comput Index for Table3 -// S_lo = S_lo * two_negN -// -(p0) fcvt.xf FR_float_N = FR_float_N ;; +(p8) ldfe FR_P5 = [GR_ad_p],16 // Load P_5 for near1 path +(p8) ldfe FR_P2 = [GR_ad_p2],16 // Load P_2 for near1 path +(p8) fmpy.s1 FR_wsq = FR_W, FR_W // wsq = w * w for near1 path } -// -// If S_lo == 0, set p8 false -// Load H3 -// Load ptr to table of polynomial coeff. 
-// -{ .mmf -(p0) ldfd FR_h_tmp = [GR_Index3],0 -(p0) ldfe FR_Q1 = [GR_Table_Base],0 -(p0) fcmp.eq.unc.s1 p0, p8 = FR_S_lo, f0 ;; +;; + +{ .mmi +(p8) ldfe FR_P1 = [GR_ad_p2],16 ;; // Load P_1 for near1 path + nop.m 999 +(p9) extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2 } +;; + { .mfi - nop.m 999 -(p0) fmpy.s1 FR_G = FR_G, FR_G_tmp - nop.i 999 ;; +(p9) shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3 +(p9) fcvt.xf FR_float_N = FR_float_N + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fadd.s1 FR_H = FR_H, FR_H_tmp - nop.i 999 ;; +(p9) ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3 + nop.f 999 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fms.s1 FR_r = FR_G, FR_S_hi, f1 - nop.i 999 +(p9) ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3 +(p9) fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2 + nop.i 999 } { .mfi - nop.m 999 -(p0) fadd.s1 FR_h = FR_h, FR_h_tmp - nop.i 999 ;; + nop.m 999 +(p9) fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2 + nop.i 999 } -{ .mfi - nop.m 999 -(p0) fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H - nop.i 999 ;; +;; + +{ .mmf + nop.m 999 + nop.m 999 +(p9) fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2 } +;; + { .mfi - nop.m 999 -// -// Load Q4 -// Load Q3 -// Load Q2 -// Load Q1 -// -(p8) fma.s1 FR_r = FR_G, FR_S_lo, FR_r - nop.i 999 + nop.m 999 +(p8) fmpy.s1 FR_w4 = FR_wsq, FR_wsq // w4 = w^4 for near1 path + nop.i 999 } { .mfi - nop.m 999 -// -// poly_lo = r * Q4 + Q3 -// rsq = r* r -// -(p0) fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h - nop.i 999 ;; + nop.m 999 +(p8) fma.s1 FR_p87 = FR_W, FR_P8, FR_P7 // p87 = w * P8 + P7 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// If (S_lo!=0) r = s_lo * G + r -// -(p0) fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 - nop.i 999 + nop.m 999 +(p9) fma.s1 FR_S_lo = FR_S_lo, FR_2_to_minus_N, f0 // S_lo = S_lo * 2^(-N) + nop.i 999 } -// -// Create a 0x00000....01 -// poly_lo = poly_lo * rsq + h -// { .mfi -(p0) setf.sig FR_dummy = GR_Perturb -(p0) fmpy.s1 FR_rsq = FR_r, FR_r - nop.i 999 ;; + nop.m 999 +(p8) 
fma.s1 FR_p43 = FR_W, FR_P4, FR_P3 // p43 = w * P4 + P3 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// h = N * log2_lo + h -// Y_hi = n * log2_hi + H -// -(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 - nop.i 999 + nop.m 999 +(p9) fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3 + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r - nop.i 999 ;; + nop.m 999 +(p9) fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// poly_lo = r * poly_o + Q2 -// poly_hi = Q1 * rsq + r -// -(p0) fmpy.s1 FR_poly_lo = FR_poly_lo, FR_r - nop.i 999 ;; + nop.m 999 +(p9) fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3 + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_rsq, FR_h - nop.i 999 ;; -} -{ .mfb - nop.m 999 -(p0) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo -// -// Create the FR for a binary "or" -// Y_lo = poly_hi + poly_lo -// -// (p0) for FR_dummy = FR_Y_lo,FR_dummy ;; -// -// Turn the lsb of Y_lo ON -// -// (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;; -// -// Merge the new lsb into Y_lo, for alone doesn't -// -(p0) br.cond.sptk LOGL_main ;; -} -L(log1pl_near): -{ .mmi - nop.m 999 - nop.m 999 -// /*******************************************************/ -// /*********** Branch log1pl_near ************************/ -// /*******************************************************/ -(p0) addl GR_Table_Base = @ltoff(Constants_P#),gp ;; -} -{ .mmi nop.m 999 - ld8 GR_Table_Base = [GR_Table_Base] +(p8) fmpy.s1 FR_w6 = FR_w4, FR_wsq // w6 = w^6 for near1 path nop.i 999 -};; -// -// Load base address of poly. coeff. 
-// -{ .mmb -(p0) add GR_Table_ptr = 0x40,GR_Table_Base -// -// Address tables with separate pointers -// -(p0) ldfe FR_P8 = [GR_Table_Base],16 - nop.b 999 ;; -} -{ .mmb -(p0) ldfe FR_P4 = [GR_Table_ptr],16 -// -// Load P4 -// Load P8 -// -(p0) ldfe FR_P7 = [GR_Table_Base],16 - nop.b 999 ;; -} -{ .mmf -(p0) ldfe FR_P3 = [GR_Table_ptr],16 -// -// Load P3 -// Load P7 -// -(p0) ldfe FR_P6 = [GR_Table_Base],16 -(p0) fmpy.s1 FR_wsq = FR_W, FR_W ;; } +;; + { .mfi -(p0) ldfe FR_P2 = [GR_Table_ptr],16 - nop.f 999 - nop.i 999 ;; + nop.m 999 +(p8) fma.s1 FR_p432 = FR_W, FR_p43, FR_P2 // p432 = w * p43 + P2 + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 FR_Y_hi = FR_W, FR_P4, FR_P3 - nop.i 999 + nop.m 999 +(p8) fma.s1 FR_p876 = FR_W, FR_p87, FR_P6 // p876 = w * p87 + P6 + nop.i 999 } -// -// Load P2 -// Load P6 -// Wsq = w * w -// Y_hi = p4 * w + p3 -// +;; + { .mfi -(p0) ldfe FR_P5 = [GR_Table_Base],16 -(p0) fma.s1 FR_Y_lo = FR_W, FR_P8, FR_P7 - nop.i 999 ;; + nop.m 999 +(p9) fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1 + nop.i 999 } { .mfi -(p0) ldfe FR_P1 = [GR_Table_ptr],16 -// -// Load P1 -// Load P5 -// Y_lo = p8 * w + P7 -// -(p0) fmpy.s1 FR_w4 = FR_wsq, FR_wsq - nop.i 999 ;; + nop.m 999 +(p9) fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi = N * log2_hi + H + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P2 - nop.i 999 + nop.m 999 +(p9) fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h = N * log2_lo + h + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P6 -(p0) add GR_Perturb = 0x1, r0 ;; + nop.m 999 +(p9) fma.s1 FR_r = FR_G, FR_S_lo, FR_r // r = G * S_lo + (G * S_hi - 1) + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// w4 = w2 * w2 -// Y_hi = y_hi * w + p2 -// Y_lo = y_lo * w + p6 -// Create perturbation bit -// -(p0) fmpy.s1 FR_w6 = FR_w4, FR_wsq - nop.i 999 ;; + nop.m 999 +(p8) fma.s1 FR_p4321 = FR_W, FR_p432, FR_P1 // p4321 = w * p432 + P1 + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 FR_Y_hi = 
FR_W, FR_Y_hi, FR_P1 - nop.i 999 + nop.m 999 +(p8) fma.s1 FR_p8765 = FR_W, FR_p876, FR_P5 // p8765 = w * p876 + P5 + nop.i 999 } -// -// Y_hi = y_hi * w + p1 -// w6 = w4 * w2 -// +;; + { .mfi -(p0) setf.sig FR_Q4 = GR_Perturb -(p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P5 - nop.i 999 ;; + nop.m 999 +(p9) fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3 + nop.i 999 } { .mfi - nop.m 999 -(p0) fma.s1 FR_dummy = FR_wsq,FR_Y_hi, f0 - nop.i 999 + nop.m 999 +(p9) fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fma.s1 FR_Y_hi = FR_W,f1,f0 - nop.i 999 -};; -{ .mfb - nop.m 999 -// -// Y_hi = w -// Y_lo = y_lo * w + p5 -// -(p0) fma.s1 FR_Y_lo = FR_w6, FR_Y_lo,FR_dummy -// -// Y_lo = y_lo * w6 + y_high order part. -// -// performance -// -(p0) br.cond.sptk LOGL_main ;; -} -L(log1pl_small): -{ .mmi - nop.m 999 -// /*******************************************************/ -// /*********** Branch log1pl_small ***********************/ -// /*******************************************************/ -(p0) addl GR_Table_Base = @ltoff(Constants_Threshold#),gp + nop.m 999 +(p8) fma.s1 FR_Y_lo = FR_wsq, FR_p4321, f0 // Y_lo = wsq * p4321 + nop.i 999 } { .mfi nop.m 999 -(p0) mov FR_Em1 = FR_W -(p0) cmp.eq.unc p7, p0 = r0, r0 ;; -} -{ .mlx - ld8 GR_Table_Base = [GR_Table_Base] -(p0) movl GR_Expo_Range = 0x0000000000000004 ;; -} -// -// Set Safe to true -// Set Expo_Range = 0 for single -// Set Expo_Range = 2 for double -// Set Expo_Range = 4 for double-extended -// -{ .mmi -(p0) shladd GR_Table_Base = GR_Expo_Range,4,GR_Table_Base ;; -(p0) ldfe FR_Threshold = [GR_Table_Base],16 - nop.i 999 -} -{ .mlx - nop.m 999 -(p0) movl GR_Bias = 0x000000000000FF9B ;; +(p8) fma.s1 FR_Y_hi = FR_W, f1, f0 // Y_hi = w for near1 path + nop.i 999 } +;; + { .mfi -(p0) ldfe FR_Tiny = [GR_Table_Base],0 - nop.f 999 - nop.i 999 ;; + nop.m 999 +(p9) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo = poly_lo * r + Q2 + nop.i 999 } { .mfi - nop.m 999 -(p0) 
fcmp.gt.unc.s1 p13, p12 = FR_abs_W, FR_Threshold - nop.i 999 ;; + nop.m 999 +(p9) fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p13) fnmpy.s1 FR_Y_lo = FR_W, FR_W - nop.i 999 + nop.m 999 +(p8) fma.s1 FR_Y_lo = FR_w6, FR_p8765,FR_Y_lo // Y_lo = w6 * p8765 + w2 * p4321 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p13) fadd FR_SCALE = f0, f1 - nop.i 999 ;; + nop.m 999 +(p9) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1 * rsq + r + nop.i 999 } +;; + { .mfi - nop.m 999 -(p12) fsub.s1 FR_Y_lo = f0, FR_Tiny -(p12) cmp.ne.unc p7, p0 = r0, r0 + nop.m 999 +(p9) fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h // poly_lo = poly_lo*r^3 + h + nop.i 999 } +;; + { .mfi -(p12) setf.exp FR_SCALE = GR_Bias - nop.f 999 - nop.i 999 ;; -} -{ .mfb - nop.m 999 -// -// Set p7 to SAFE = FALSE -// Set Scale = 2^-100 -// -(p0) fma.s0 f8 = FR_Y_lo,FR_SCALE,FR_Y_hi -(p0) br.ret.sptk b0 ;; + nop.m 999 +(p9) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo = poly_hi + poly_lo + nop.i 999 } -L(LOGL_64_one): +;; + +// Remainder of code is common for near1 and regular paths { .mfb - nop.m 999 -(p0) fmpy.s0 f8 = FR_Input_X, f0 -(p0) br.ret.sptk b0 ;; + nop.m 999 + fadd.s0 f8 = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi + br.ret.sptk b0 // Common exit for 2^-80 < x < inf } -// -// Raise divide by zero for +/-0 input. -// -L(LOGL_64_zero): -{ .mfi -(p0) mov GR_Parameter_TAG = 0 +;; + + +// Here if x=-1 +LOG1P_EQ_Minus_1: // -// If we have logl(1), log10l(1) or log1pl(0), return 0. 
+// If x=-1 raise divide by zero and return -inf // -(p0) fsub.s0 FR_Output_X_tmp = f0, f1 - nop.i 999 ;; -} -{ .mii -(p14) mov GR_Parameter_TAG = 6 - nop.i 999 ;; -(p15) mov GR_Parameter_TAG = 138 ;; -} -{ .mfb - nop.m 999 -(p0) frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0 -(p0) br.cond.sptk __libm_error_region ;; +{ .mfi + mov GR_Parameter_TAG = 138 + fsub.s1 FR_Output_X_tmp = f0, f1 + nop.i 999 } +;; + { .mfb - nop.m 999 -// -// Report that logl(0) computed -// { .mfb -(p0) mov FR_Input_X = FR_Output_X_tmp -(p0) br.ret.sptk b0 ;; + nop.m 999 + frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0 + br.cond.sptk __libm_error_region } +;; -L(LOGL_64_special): +LOG1P_special: { .mfi - nop.m 999 -// -// Return -Inf or value from handler. -// -(p0) fclass.m.unc p7, p0 = FR_Input_X, 0x1E1 - nop.i 999 ;; + nop.m 999 + fclass.m.unc p8, p0 = FR_Input_X, 0x1E1 // Test for natval, nan, +inf + nop.i 999 } -{ .mfb - nop.m 999 -// -// Check for Natval, QNan, SNaN, +Inf -// -(p7) fmpy.s0 f8 = FR_Input_X, f1 +;; + // // For SNaN raise invalid and return QNaN. // For QNaN raise invalid and return QNaN. // For +Inf return +Inf. // -(p7) br.ret.sptk b0 ;; +{ .mfb + nop.m 999 +(p8) fmpy.s0 f8 = FR_Input_X, f1 +(p8) br.ret.sptk b0 // Return for natval, nan, +inf } +;; + // // For -Inf raise invalid and return QNaN. 
// -{ .mii -(p0) mov GR_Parameter_TAG = 1 - nop.i 999 ;; -(p14) mov GR_Parameter_TAG = 7 ;; -} -{ .mfi -(p15) mov GR_Parameter_TAG = 139 - nop.f 999 - nop.i 999 ;; -} { .mfb - nop.m 999 -(p0) fmpy.s0 FR_Output_X_tmp = FR_Input_X, f0 -(p0) br.cond.sptk __libm_error_region ;; + mov GR_Parameter_TAG = 139 + fmpy.s0 FR_Output_X_tmp = FR_Input_X, f0 + br.cond.sptk __libm_error_region } -// -// Report that logl(-Inf) computed -// Report that log10l(-Inf) computed -// Report that log1p(-Inf) computed -// -{ .mfb - nop.m 0 -(p0) mov FR_Input_X = FR_Output_X_tmp -(p0) br.ret.sptk b0 ;; -} -L(LOGL_64_unsupported): -{ .mfb - nop.m 999 +;; + + +LOG1P_unsupported: // -// Return generated NaN or other value . +// Return generated NaN or other value. // -(p0) fmpy.s0 f8 = FR_Input_X, f0 -(p0) br.ret.sptk b0 ;; +{ .mfb + nop.m 999 + fmpy.s0 f8 = FR_Input_X, f0 + br.ret.sptk b0 } -L(LOGL_64_negative): -{ .mfi - nop.m 999 -// -// Deal with x < 0 in a special way -// -(p0) frcpa.s0 FR_Output_X_tmp, p8 = f0, f0 +;; + +// Here if -inf < x < -1 +LOG1P_LT_Minus_1: // -// Deal with x < 0 in a special way - raise +// Deal with x < -1 in a special way - raise // invalid and produce QNaN indefinite. 
// -(p0) mov GR_Parameter_TAG = 1 ;; -} -{ .mii -(p14) mov GR_Parameter_TAG = 7 - nop.i 999 ;; -(p15) mov GR_Parameter_TAG = 139 +{ .mfb + mov GR_Parameter_TAG = 139 + frcpa.s0 FR_Output_X_tmp, p8 = f0, f0 + br.cond.sptk __libm_error_region } -.endp log1pl -ASM_SIZE_DIRECTIVE(log1pl) +;; + -.proc __libm_error_region -__libm_error_region: +GLOBAL_IEEE754_END(log1pl) +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value @@ -1609,8 +1177,8 @@ __libm_error_region: br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi - nop.m 0 - nop.m 0 + nop.m 999 + nop.m 999 add GR_Parameter_RESULT = 48,sp };; { .mmi @@ -1625,52 +1193,7 @@ __libm_error_region: br.ret.sptk b0 // Return };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) - -.proc LOGL_main -LOGL_main: -{ .mfi - nop.m 999 -// -// kernel_log_64 computes ln(X + E) -// -(p7) fadd.s0 FR_Input_X = FR_Y_lo,FR_Y_hi - nop.i 0 -} -{ .mmi - nop.m 999 - nop.m 999 -(p14) addl GR_Table_Base = @ltoff(Constants_1_by_LN10#),gp ;; -} -{ .mmi - nop.m 999 -(p14) ld8 GR_Table_Base = [GR_Table_Base] - nop.i 999 -};; - -{ .mmi -(p14) ldfe FR_1LN10_hi = [GR_Table_Base],16 ;; -(p14) ldfe FR_1LN10_lo = [GR_Table_Base] - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p14) fmpy.s1 FR_Output_X_tmp = FR_Y_lo,FR_1LN10_hi - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p14) fma.s1 FR_Output_X_tmp = FR_Y_hi,FR_1LN10_lo,FR_Output_X_tmp - nop.i 999 ;; -} -{ .mfb - nop.m 999 -(p14) fma.s0 FR_Input_X = FR_Y_hi,FR_1LN10_hi,FR_Output_X_tmp -(p0) br.ret.sptk b0 ;; -} -.endp LOGL_main -ASM_SIZE_DIRECTIVE(LOGL_main) +LOCAL_LIBM_END(__libm_error_region#) .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_logb.S b/sysdeps/ia64/fpu/s_logb.S index 76c4fe778e..dfe581a826 100644 --- a/sysdeps/ia64/fpu/s_logb.S +++ b/sysdeps/ia64/fpu/s_logb.S @@ -1,10 +1,10 @@ .file "logb.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 
2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,41 +20,43 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial version -// 2/16/00 Modified to conform to C9X -// 3/16/00 Improved speed -// 4/04/00 Unwind support added -// 5/30/00 Fixed bug when x double-extended denormal -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial version +// 02/16/00 Modified to conform to C9X +// 03/16/00 Improved speed +// 04/04/00 Unwind support added +// 05/30/00 Fixed bug when x double-extended denormal +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/20/03 Improved performance // // API //============================================================== -// double logb( double x); +// double logb( double x ); // // Overview of operation //============================================================== -// The logb function extracts the exponent of x as an integer in -// floating-point format. +// The logb function extracts the exponent of x as an integer in +// floating-point format. 
// logb computes log2 of x as a double // // logb is similar to ilogb but differs in the following ways: @@ -71,217 +73,168 @@ // // Registers used //============================================================== -// general registers used: -// ar.pfs r32 -// r33 -> r37 -// r38 -> r41 used as parameters to error path +// general registers used: +// r26 -> r38 +// r35 -> r38 used as parameters to error path // -// predicate registers used: +// predicate registers used: // p6, p7, p8 -// floating-point registers used: +// floating-point registers used: // f9, f10, f11 // f8, input -#include "libm_support.h" +rExpBias = r26 +rExpMask = r27 +rSignexp_x = r28 +rExp_x = r29 +rTrueExp_x = r30 +rExp_2to64 = r31 -GR_SAVE_B0 = r34 -GR_SAVE_GP = r35 GR_SAVE_PFS = r32 +GR_SAVE_B0 = r33 +GR_SAVE_GP = r34 -GR_Parameter_X = r38 -GR_Parameter_Y = r39 -GR_Parameter_RESULT = r40 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Parameter_TAG = r38 -.align 32 -.global logb# +fExp_in_signif = f9 +fNorm_x = f10 +fFloat_Exp = f10 +f2to64 = f11 .section .text -.proc logb# -.align 32 +GLOBAL_LIBM_ENTRY(logb) - -logb: - -// qnan snan inf norm unorm 0 -+ -// 0 0 0 0 1 0 11 -// 0 b -{ .mfi - alloc r32=ar.pfs,1,5,4,0 -(p0) fclass.m.unc p8,p0 = f8, 0x0b - nop.i 999 -} // X NORMAL -// r37 = exp(f8) - - 0xffff -// sig(f8) = r37 +// TrueExp_x = exp(f8) - 0xffff +// sig = TrueExp_x // f8 = convert_to_fp (sig)) { .mfi -(p0) getf.exp r35 = f8 -(p0) fnorm f10=f8 - nop.i 999 ;; + getf.exp rSignexp_x = f8 + fclass.m p8,p0 = f8, 0x0b // Test for x unorm + mov rExpBias = 0xffff // Exponent bias } - -// qnan snan inf norm unorm 0 -+ -// 1 1 1 0 0 0 11 -// e 3 -{ .mmf -(p0) mov r33 = 0xffff -(p0) mov r34 = 0x1ffff -(p0) fclass.m.unc p6,p0 = f8, 0xe3 ;; +{ .mfi + nop.m 0 + fnorm.s1 fNorm_x = f8 + mov rExpMask = 0x1ffff // Exponent mask } +;; +// Form signexp of 2^64 in case need to scale denormal { .mfb -(p0) and r36 = r35, r34 -(p0) fclass.m.unc p7,p0 = f8, 0x07 -(p8) br.cond.spnt 
L(LOGB_DENORM) ;; + mov rExp_2to64 = 0x1003f + fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf +(p8) br.cond.spnt LOGB_DENORM // Branch if x unorm } +;; -{ .mib -(p0) sub r37 = r36, r33 - nop.i 999 -(p6) br.cond.spnt L(LOGB_NAN_INF) ;; +LOGB_COMMON: +// Return here from LOGB_DENORM +{ .mfi + and rExp_x = rSignexp_x, rExpMask // Get biased exponent + fclass.m p7,p0 = f8, 0x07 // Test x zero + nop.i 0 } +;; -{ .mib -(p0) setf.sig f9 = r37 - nop.i 999 -(p7) br.cond.spnt L(LOGB_ZERO) ;; +// X NAN or INFINITY, return f8 * f8 +{ .mfb + sub rTrueExp_x = rExp_x, rExpBias // Get true exponent +(p6) fma.d.s0 f8= f8,f8,f0 // Result if x natval, nan, inf +(p6) br.ret.spnt b0 // Exit if x natval, nan, inf } +;; -{ .mfi - nop.m 999 -(p0) fcvt.xf f10 = f9 - nop.i 999 ;; +{ .mib + setf.sig fExp_in_signif = rTrueExp_x // Exponent as integer in fp + nop.i 999 +(p7) br.cond.spnt LOGB_ZERO } +;; +// Result can be represented in less than 24 bits, so no precision completer +// is needed. { .mfb - nop.m 999 -(p0) fnorm.d f8 = f10 -(p0) br.ret.sptk b0 ;; + nop.m 0 + fcvt.xf f8 = fExp_in_signif + br.ret.sptk b0 // Exit main path, 0 < |x| < inf } +;; -L(LOGB_DENORM): -// Form signexp of 2^64 in case need to scale denormal +LOGB_DENORM: +// Form 2^64 in case need to scale denormal // Check to see if double-extended denormal { .mfi -(p0) mov r38 = 0x1003f -(p0) fclass.m.unc p8,p0 = f10, 0x0b - nop.i 999 ;; + setf.exp f2to64 = rExp_2to64 + fclass.m p8,p0 = fNorm_x, 0x0b + nop.i 0 } +;; -// Form 2^64 in case need to scale denormal { .mfi -(p0) setf.exp f11 = r38 - nop.f 999 - nop.i 999 ;; + nop.m 0 + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + nop.i 0 } +;; // If double-extended denormal add 64 to exponent bias for scaling // If double-extended denormal form x * 2^64 which is normal { .mfi -(p8) add r33 = 64, r33 -(p8) fmpy f10 = f10, f11 - nop.i 999 ;; +(p8) add rExpBias = 64, rExpBias +(p8) fmpy.s1 fNorm_x = fNorm_x, f2to64 + nop.i 0 } +;; // Logic is the same as 
normal path but use normalized input -{ .mmi -(p0) getf.exp r35 = f10 ;; - nop.m 999 - nop.i 999 ;; -} - -{ .mmi -(p0) and r36 = r35, r34 ;; -(p0) sub r37 = r36, r33 - nop.i 999 ;; -} - -{ .mmi -(p0) setf.sig f9 = r37 - nop.m 999 - nop.i 999 ;; +{ .mib + getf.exp rSignexp_x = fNorm_x + nop.i 0 + br.cond.sptk LOGB_COMMON // Return to main path } +;; -{ .mfi - nop.m 999 -(p0) fcvt.xf f10 = f9 - nop.i 999 ;; -} +LOGB_ZERO: +// Here if x zero +// f10 = -|f8| +// f9 = 1.0/f10 = -1.0/|f8| = -inf -{ .mfb - nop.m 999 -(p0) fnorm.d f8 = f10 -(p0) br.ret.sptk b0 ;; +{ .mmf + alloc r32=ar.pfs,1,2,4,0 + mov GR_Parameter_TAG = 151 // Error code + fmerge.ns f10 = f0,f8 } +;; -L(LOGB_NAN_INF): - -// X NAN or INFINITY, return f8 * f8 { .mfb - nop.m 999 -(p0) fma.d f8= f8,f8,f0 -(p0) br.ret.sptk b0 ;; -} - -.endp logb# -ASM_SIZE_DIRECTIVE(logb) - -// Stack operations when calling error support. -// (1) (2) (3) (call) (4) -// sp -> + psp -> + psp -> + sp -> + -// | | | | -// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8 -// | | | | -// | <-GR_Y Y2->| Y2 ->| <- GR_Y | -// | | | | -// | | <- GR_X X1 ->| | -// | | | | -// sp-64 -> + sp -> + sp -> + + -// save ar.pfs save b0 restore gp -// save gp restore ar.pfs - - - -.proc __libm_error_region -__libm_error_region: -L(LOGB_ZERO): -.prologue - -// f9 = |f8| -// f10 = -f9 = -|f8| -// f9 = 1.0/f10 = -1.0/-|f8| - -{ .mfi - mov r41 = 151 // Error code -(p0) fmerge.s f9 = f0,f8 - nop.i 999 + nop.m 0 + frcpa.s0 f9,p6 = f1,f10 // Produce -inf, Z flag + br.cond.sptk __libm_error_region // Call error support } ;; +GLOBAL_LIBM_END(logb) -{ .mfi - nop.m 999 - fmerge.ns f10 = f0,f9 - nop.i 999 -} -;; +LOCAL_LIBM_ENTRY(__libm_error_region) +.prologue -// (1) { .mfi - add GR_Parameter_Y=-32,sp // Parameter 2 value - frcpa f9,p6 = f1,f10 + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + 
add sp=-64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP=gp // Save gp };; - -// (2) { .mmi stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack add GR_Parameter_X = 16,sp // Parameter 1 address @@ -290,38 +243,38 @@ L(LOGB_ZERO): };; .body -// (3) { .mib stfd [GR_Parameter_X] = f8 // STORE Parameter 1 on stack add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address - nop.b 0 + nop.b 0 } { .mib stfd [GR_Parameter_Y] = f9 // Store Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y br.call.sptk b0=__libm_error_support# // Call error handling function };; + { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; -// (4) { .mmi ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp add sp = 64,sp // Restore stack pointer mov b0 = GR_SAVE_B0 // Restore return address };; + { .mib mov gp = GR_SAVE_GP // Restore gp mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs br.ret.sptk b0 };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) + .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_logbf.S b/sysdeps/ia64/fpu/s_logbf.S index f2f671f892..1d605cd97c 100644 --- a/sysdeps/ia64/fpu/s_logbf.S +++ b/sysdeps/ia64/fpu/s_logbf.S @@ -1,10 +1,10 @@ .file "logbf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. 
+// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,44 +20,46 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. 
+// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial version -// 2/16/00 Modified to conform to C9X -// 3/16/00 Improved speed -// 4/04/00 Unwind support added -// 5/30/00 Fixed bug when x double-extended denormal -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial version +// 02/16/00 Modified to conform to C9X +// 03/16/00 Improved speed +// 04/04/00 Unwind support added +// 05/30/00 Fixed bug when x double-extended denormal +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/20/03 Improved performance // // API //============================================================== -// float logbf( float x); +// float logbf( float x ); // // Overview of operation //============================================================== -// The logbf function extracts the exponent of x as an integer in -// floating-point format. +// The logbf function extracts the exponent of x as an integer in +// floating-point format. 
// logbf computes log2 of x as a float - -// logbf is similar to ilogbf but differs in the following ways: +// +// logbf is similar to ilogbf but differs in the following ways: // +-inf // ilogbf: returns INT_MAX // logbf: returns +inf @@ -71,243 +73,207 @@ // // Registers used //============================================================== -// general registers used: -// ar.pfs r32 -// r33 -> r37 -// r38 -> r41 used as parameters to error path +// general registers used: +// r26 -> r38 +// r35 -> r38 used as parameters to error path // -// predicate registers used: +// predicate registers used: // p6, p7, p8 -// -// floating-point registers used: +// floating-point registers used: // f9, f10, f11 // f8, input -#include "libm_support.h" - -GR_SAVE_B0 = r34 -// r40 is address of table of coefficients -GR_SAVE_PFS = r32 -GR_SAVE_GP = r35 +rExpBias = r26 +rExpMask = r27 +rSignexp_x = r28 +rExp_x = r29 +rTrueExp_x = r30 +rExp_2to64 = r31 -GR_Parameter_X = r38 -GR_Parameter_Y = r39 -GR_Parameter_RESULT = r40 -GR_Parameter_TAG = r41 +GR_SAVE_PFS = r32 +GR_SAVE_B0 = r33 +GR_SAVE_GP = r34 -FR_X = f8 -FR_Y = f0 -FR_RESULT = f10 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Parameter_TAG = r38 - -.align 32 -.global logbf# +fExp_in_signif = f9 +fNorm_x = f10 +fFloat_Exp = f10 +f2to64 = f11 .section .text -.proc logbf# -.align 32 - +GLOBAL_LIBM_ENTRY(logbf) -logbf: - -// qnan snan inf norm unorm 0 -+ -// 0 0 0 0 1 0 11 -// 0 b -{ .mfi - alloc r32=ar.pfs,1,5,4,0 -(p0) fclass.m.unc p8,p0 = f8, 0x0b - nop.i 999 -} // X NORMAL -// r37 = exp(f8) - - 0xffff -// sig(f8) = r37 +// TrueExp_x = exp(f8) - 0xffff +// sig = TrueExp_x // f8 = convert_to_fp (sig)) { .mfi -(p0) getf.exp r35 = f8 -(p0) fnorm f10=f8 - nop.i 999 ;; + getf.exp rSignexp_x = f8 + fclass.m p8,p0 = f8, 0x0b // Test for x unorm + mov rExpBias = 0xffff // Exponent bias } - -// qnan snan inf norm unorm 0 -+ -// 1 1 1 0 0 0 11 -// e 3 -{ .mmf -(p0) mov r33 = 0xffff -(p0) mov r34 = 0x1ffff 
-(p0) fclass.m.unc p6,p0 = f8, 0xe3 ;; +{ .mfi + nop.m 0 + fnorm.s1 fNorm_x = f8 + mov rExpMask = 0x1ffff // Exponent mask } +;; +// Form signexp of 2^64 in case need to scale denormal { .mfb -(p0) and r36 = r35, r34 -(p0) fclass.m.unc p7,p0 = f8, 0x07 -(p8) br.cond.spnt L(LOGB_DENORM) ;; + mov rExp_2to64 = 0x1003f + fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf +(p8) br.cond.spnt LOGB_DENORM // Branch if x unorm } +;; -{ .mib -(p0) sub r37 = r36, r33 - nop.i 999 -(p6) br.cond.spnt L(LOGB_NAN_INF) ;; +LOGB_COMMON: +// Return here from LOGB_DENORM +{ .mfi + and rExp_x = rSignexp_x, rExpMask // Get biased exponent + fclass.m p7,p0 = f8, 0x07 // Test x zero + nop.i 0 } +;; -{ .mib -(p0) setf.sig f9 = r37 - nop.i 999 -(p7) br.cond.spnt L(LOGB_ZERO) ;; +// X NAN or INFINITY, return f8 * f8 +{ .mfb + sub rTrueExp_x = rExp_x, rExpBias // Get true exponent +(p6) fma.s.s0 f8= f8,f8,f0 // Result if x natval, nan, inf +(p6) br.ret.spnt b0 // Exit if x natval, nan, inf } +;; -{ .mfi - nop.m 999 -(p0) fcvt.xf f10 = f9 - nop.i 999 ;; +{ .mib + setf.sig fExp_in_signif = rTrueExp_x // Exponent as integer in fp + nop.i 999 +(p7) br.cond.spnt LOGB_ZERO } +;; +// Result can be represented in less than 24 bits, so no precision completer +// is needed. 
{ .mfb - nop.m 999 -(p0) fnorm.s f8 = f10 -(p0) br.ret.sptk b0 ;; + nop.m 0 + fcvt.xf f8 = fExp_in_signif + br.ret.sptk b0 // Exit main path, 0 < |x| < inf } +;; -L(LOGB_DENORM): -// Form signexp of 2^64 in case need to scale denormal +LOGB_DENORM: +// Form 2^64 in case need to scale denormal // Check to see if double-extended denormal { .mfi -(p0) mov r38 = 0x1003f -(p0) fclass.m.unc p8,p0 = f10, 0x0b - nop.i 999 ;; + setf.exp f2to64 = rExp_2to64 + fclass.m p8,p0 = fNorm_x, 0x0b + nop.i 0 } +;; -// Form 2^64 in case need to scale denormal { .mfi -(p0) setf.exp f11 = r38 - nop.f 999 - nop.i 999 ;; + nop.m 0 + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + nop.i 0 } +;; // If double-extended denormal add 64 to exponent bias for scaling // If double-extended denormal form x * 2^64 which is normal { .mfi -(p8) add r33 = 64, r33 -(p8) fmpy f10 = f10, f11 - nop.i 999 ;; +(p8) add rExpBias = 64, rExpBias +(p8) fmpy.s1 fNorm_x = fNorm_x, f2to64 + nop.i 0 } +;; // Logic is the same as normal path but use normalized input -{ .mmi -(p0) getf.exp r35 = f10 ;; - nop.m 999 - nop.i 999 ;; -} - -{ .mmi -(p0) and r36 = r35, r34 ;; -(p0) sub r37 = r36, r33 - nop.i 999 ;; -} - -{ .mmi -(p0) setf.sig f9 = r37 - nop.m 999 - nop.i 999 ;; +{ .mib + getf.exp rSignexp_x = fNorm_x + nop.i 0 + br.cond.sptk LOGB_COMMON // Return to main path } +;; -{ .mfi - nop.m 999 -(p0) fcvt.xf f10 = f9 - nop.i 999 ;; -} +LOGB_ZERO: +// Here if x zero +// f10 = -|f8| +// f9 = 1.0/f10 = -1.0/|f8| = -inf -{ .mfb - nop.m 999 -(p0) fnorm.s f8 = f10 -(p0) br.ret.sptk b0 ;; +{ .mmf + alloc r32=ar.pfs,1,2,4,0 + mov GR_Parameter_TAG = 152 // Error code + fmerge.ns f10 = f0,f8 } +;; -L(LOGB_NAN_INF): - -// X NAN or INFINITY, return f8 * f8 { .mfb - nop.m 999 -(p0) fma.s f8= f8,f8,f0 -(p0) br.ret.sptk b0 ;; + nop.m 0 + frcpa.s0 f9,p6 = f1,f10 // Produce -inf, Z flag + br.cond.sptk __libm_error_region // Call error support } +;; -L(LOGB_ZERO): - -// X ZERO -// return -1.0/fabs(f8)=-inf, set 
divide-by-zero flag, call error support -{ .mfi - nop.m 999 -(p0) fmerge.s f9 = f0,f8 - nop.i 999 ;; -} +GLOBAL_LIBM_END(logbf) -{ .mfi - nop.m 999 -(p0) fmerge.ns f10 = f0,f9 - nop.i 999 ;; -} +LOCAL_LIBM_ENTRY(__libm_error_region) +.prologue { .mfi - nop.m 999 -(p0) frcpa f10,p6 = f1,f10 - nop.i 999 ;; -} - -.endp logbf -ASM_SIZE_DIRECTIVE(logbf) - - -.proc __libm_error_region -__libm_error_region: -.prologue -{ .mii - add GR_Parameter_Y=-32,sp // Parameter 2 value -(p0) mov GR_Parameter_TAG = 152 + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + add sp=-64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP=gp // Save gp };; + { .mmi - stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + stfs [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; + .body { .mib - stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y - nop.b 0 // Parameter 3 address + stfs [GR_Parameter_X] = f8 // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 } { .mib - stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + stfs [GR_Parameter_Y] = f9 // Store Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; + { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; + { .mmi ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp add sp = 64,sp // Restore stack pointer mov b0 = 
GR_SAVE_B0 // Restore return address };; + { .mib - mov gp = GR_SAVE_GP // Restore gp + mov gp = GR_SAVE_GP // Restore gp mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - br.ret.sptk b0 // Return -};; + br.ret.sptk b0 +};; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) .type __libm_error_support#,@function diff --git a/sysdeps/ia64/fpu/s_logbl.S b/sysdeps/ia64/fpu/s_logbl.S index 38b131f3aa..6a08e94201 100644 --- a/sysdeps/ia64/fpu/s_logbl.S +++ b/sysdeps/ia64/fpu/s_logbl.S @@ -1,10 +1,10 @@ .file "logbl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,44 +20,46 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00 Initial version -// 2/16/00 Modified to conform to C9X -// 3/16/00 Improved speed -// 4/04/00 Unwind support added -// 5/30/00 Fixed bug when x double-extended denormal -// 8/15/00 Bundle added after call to __libm_error_support to properly +// 02/02/00 Initial version +// 02/16/00 Modified to conform to C9X +// 03/16/00 Improved speed +// 04/04/00 Unwind support added +// 05/30/00 Fixed bug when x double-extended denormal +// 08/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. 
+// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/20/03 Improved performance // // API //============================================================== -// long double logbl( long double x); +// long double logbl( long double x ); // // Overview of operation //============================================================== -// The logbl function extracts the exponent of x as an integer in -// floating-point format. +// The logbl function extracts the exponent of x as an integer in +// floating-point format. // logbl computes log2 of x as a long double // -// logbl is similar to ilogbl but differs in the following ways: +// logbl is similar to ilogbl but differs in the following ways: // +-inf // ilogbl: returns INT_MAX // logbl: returns +inf @@ -71,229 +73,208 @@ // // Registers used //============================================================== -// general registers used: -// ar.pfs r32 -// r33 -> r37 -// r38 -> r41 used as parameters to error path +// general registers used: +// r26 -> r38 +// r35 -> r38 used as parameters to error path // -// predicate registers used: +// predicate registers used: // p6, p7, p8 -// -// floating-point registers used: +// floating-point registers used: // f9, f10, f11 // f8, input -#include "libm_support.h" +rExpBias = r26 +rExpMask = r27 +rSignexp_x = r28 +rExp_x = r29 +rTrueExp_x = r30 +rExp_2to64 = r31 GR_SAVE_PFS = r32 -GR_SAVE_B0 = r34 -GR_SAVE_GP = r35 -GR_Parameter_X = r38 -GR_Parameter_Y = r39 -GR_Parameter_RESULT = r40 -GR_Parameter_TAG = r41 +GR_SAVE_B0 = r33 +GR_SAVE_GP = r34 -FR_X = f8 -FR_Y = f0 -FR_RESULT = f10 +GR_Parameter_X = r35 +GR_Parameter_Y = r36 +GR_Parameter_RESULT = r37 +GR_Parameter_TAG = r38 -.align 32 -.global logbl# +fExp_in_signif = f9 +fNorm_x = f10 +fFloat_Exp = f10 +f2to64 = f11 .section .text -.proc logbl# -.align 32 - +GLOBAL_LIBM_ENTRY(logbl) -logbl: - -// qnan snan inf norm unorm 0 -+ -// 0 0 0 0 1 0 11 -// 0 b -{ .mfi - alloc r32=ar.pfs,1,5,4,0 -(p0) fclass.m.unc p8,p0 = f8, 0x0b - 
nop.i 999 -} // X NORMAL -// r37 = exp(f8) - - 0xffff -// sig(f8) = r37 +// TrueExp_x = exp(f8) - 0xffff +// sig = TrueExp_x // f8 = convert_to_fp (sig)) { .mfi -(p0) getf.exp r35 = f8 -(p0) fnorm f10=f8 - nop.i 999 ;; + getf.exp rSignexp_x = f8 + fclass.m p8,p0 = f8, 0x0b // Test for x unorm + mov rExpBias = 0xffff // Exponent bias } - -// qnan snan inf norm unorm 0 -+ -// 1 1 1 0 0 0 11 -// e 3 -{ .mmf -(p0) mov r33 = 0xffff -(p0) mov r34 = 0x1ffff -(p0) fclass.m.unc p6,p0 = f8, 0xe3 ;; +{ .mfi + nop.m 0 + fnorm.s1 fNorm_x = f8 + mov rExpMask = 0x1ffff // Exponent mask } +;; +// Form signexp of 2^64 in case need to scale denormal { .mfb -(p0) and r36 = r35, r34 -(p0) fclass.m.unc p7,p0 = f8, 0x07 -(p8) br.cond.spnt L(LOGB_DENORM) ;; + mov rExp_2to64 = 0x1003f + fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf +(p8) br.cond.spnt LOGB_DENORM // Branch if x unorm } +;; -{ .mib -(p0) sub r37 = r36, r33 - nop.i 999 -(p6) br.cond.spnt L(LOGB_NAN_INF) ;; +LOGB_COMMON: +// Return here from LOGB_DENORM +{ .mfi + and rExp_x = rSignexp_x, rExpMask // Get biased exponent + fclass.m p7,p0 = f8, 0x07 // Test x zero + nop.i 0 } +;; + +// X NAN or INFINITY, return f8 * f8 +{ .mfb + sub rTrueExp_x = rExp_x, rExpBias // Get true exponent +(p6) fma.s0 f8= f8,f8,f0 // Result if x natval, nan, inf +(p6) br.ret.spnt b0 // Exit if x natval, nan, inf +} +;; { .mib -(p0) setf.sig f9 = r37 + setf.sig fExp_in_signif = rTrueExp_x // Exponent as integer in fp nop.i 999 -(p7) br.cond.spnt L(LOGB_ZERO) ;; -} -{ .mfi - nop.m 999 -(p0) fcvt.xf f10 = f9 - nop.i 999 ;; +(p7) br.cond.spnt LOGB_ZERO } +;; +// Result can be represented in less than 24 bits, so no precision completer +// is needed. 
{ .mfb - nop.m 999 -(p0) fnorm f8 = f10 -(p0) br.ret.sptk b0 ;; + nop.m 0 + fcvt.xf f8 = fExp_in_signif + br.ret.sptk b0 // Exit main path, 0 < |x| < inf } +;; -L(LOGB_DENORM): -// Form signexp of 2^64 in case need to scale denormal +LOGB_DENORM: +// Form 2^64 in case need to scale denormal // Check to see if double-extended denormal { .mfi -(p0) mov r38 = 0x1003f -(p0) fclass.m.unc p8,p0 = f10, 0x0b - nop.i 999 ;; + setf.exp f2to64 = rExp_2to64 + fclass.m p8,p0 = fNorm_x, 0x0b + nop.i 0 } +;; -// Form 2^64 in case need to scale denormal { .mfi -(p0) setf.exp f11 = r38 - nop.f 999 - nop.i 999 ;; + nop.m 0 + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + nop.i 0 } +;; // If double-extended denormal add 64 to exponent bias for scaling // If double-extended denormal form x * 2^64 which is normal { .mfi -(p8) add r33 = 64, r33 -(p8) fmpy f10 = f10, f11 - nop.i 999 ;; +(p8) add rExpBias = 64, rExpBias +(p8) fmpy.s1 fNorm_x = fNorm_x, f2to64 + nop.i 0 } +;; // Logic is the same as normal path but use normalized input -{ .mmi -(p0) getf.exp r35 = f10 ;; - nop.m 999 - nop.i 999 ;; -} - -{ .mmi -(p0) and r36 = r35, r34 ;; -(p0) sub r37 = r36, r33 - nop.i 999 ;; -} - -{ .mmi -(p0) setf.sig f9 = r37 - nop.m 999 - nop.i 999 ;; +{ .mib + getf.exp rSignexp_x = fNorm_x + nop.i 0 + br.cond.sptk LOGB_COMMON // Return to main path } +;; -{ .mfi - nop.m 999 -(p0) fcvt.xf f10 = f9 - nop.i 999 ;; -} +LOGB_ZERO: +// Here if x zero +// f10 = -|f8| +// f9 = 1.0/f10 = -1.0/|f8| = -inf -{ .mfb - nop.m 999 -(p0) fnorm f8 = f10 -(p0) br.ret.sptk b0 ;; +{ .mmf + alloc r32=ar.pfs,1,2,4,0 + mov GR_Parameter_TAG = 150 // Error code + fmerge.ns f10 = f0,f8 } +;; -L(LOGB_NAN_INF): - -// X NAN or INFINITY, return f8 * f8 { .mfb - nop.m 999 -(p0) fma f8= f8,f8,f0 -(p0) br.ret.sptk b0 ;; + nop.m 0 + frcpa.s0 f9,p6 = f1,f10 // Produce -inf, Z flag + br.cond.sptk __libm_error_region // Call error support } +;; -L(LOGB_ZERO): -{.mfi - nop.m 0 -(p0) frcpa.s0 f10,p6 = f1,f0 - nop.i 0 -};; 
-{.mfi - mov GR_Parameter_TAG = 150 -(p0) fms.s1 f10 = f0,f0,f10 - nop.i 0 -};; -// X ZERO -// return -1.0/fabs(f8)=-inf, set divide-by-zero flag, call error support -.endp logbl -ASM_SIZE_DIRECTIVE(logbl) +GLOBAL_LIBM_END(logbl) -.proc __libm_error_region -__libm_error_region: +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue + { .mfi - add GR_Parameter_Y=-32,sp // Parameter 2 value + add GR_Parameter_Y=-32,sp // Parameter 2 value nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack + add sp=-64,sp // Create new stack nop.f 0 - mov GR_SAVE_GP=gp // Save gp + mov GR_SAVE_GP=gp // Save gp };; + { .mmi - stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack - add GR_Parameter_X = 16,sp // Parameter 1 address + stfe [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; + .body { .mib - stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack - add GR_Parameter_RESULT = 0,GR_Parameter_Y - nop.b 0 // Parameter 3 address + stfe [GR_Parameter_X] = f8 // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 } { .mib - stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack + stfe [GR_Parameter_Y] = f9 // Store Parameter 3 on stack add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# // Call error handling function + br.call.sptk b0=__libm_error_support# // Call error handling function };; + { .mmi - nop.m 0 - nop.m 0 add GR_Parameter_RESULT = 48,sp + nop.m 0 + nop.i 0 };; + { .mmi ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack .restore sp add sp = 64,sp // Restore stack pointer mov b0 = GR_SAVE_B0 // Restore return address };; + { .mib mov gp = GR_SAVE_GP // Restore gp mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs - 
br.ret.sptk b0 // Return + br.ret.sptk b0 };; -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) +LOCAL_LIBM_END(__libm_error_region) + .type __libm_error_support#,@function .global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_modf.S b/sysdeps/ia64/fpu/s_modf.S index e8e672adfe..2008bbfc5c 100644 --- a/sysdeps/ia64/fpu/s_modf.S +++ b/sysdeps/ia64/fpu/s_modf.S @@ -1,10 +1,10 @@ .file "modf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,14 +35,16 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00: Initial version -// 4/04/00: Improved speed, corrected result for NaN input +// 02/02/00 Initial version +// 04/04/00 Improved speed, corrected result for NaN input // 12/22/00 Fixed so inexact flag is never set, and invalid is not set for // qnans nor for inputs larger than 2^63. 
+// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align // // API //============================================================== @@ -97,8 +99,6 @@ // p13 --------------------------------------------------->| // -#include "libm_support.h" - // floating-point registers used: MODF_NORM_F8 = f9 MODF_FRACTION_PART = f10 @@ -115,23 +115,17 @@ modf_exp = r18 // r33 = iptr -.align 32 -.global modf# - .section .text -.proc modf# -.align 32 - +GLOBAL_LIBM_ENTRY(modf) // Main path is p9, p11, p8 FALSE and p12 TRUE // Assume input is normalized and get signexp // Normalize input just in case // Form exponent bias -modf: { .mfi getf.exp modf_signexp = f8 - fnorm MODF_NORM_F8 = f8 + fnorm.s0 MODF_NORM_F8 = f8 addl modf_GR_FFFF = 0xffff, r0 } // Get integer part of input @@ -176,10 +170,10 @@ modf: { .mfb (p10) cmp.ge.unc p9,p12 = modf_exp, modf_GR_no_frac (p6) fclass.m.unc p6,p7 = f8, 0x23 -(p8) br.cond.spnt L(MODF_DENORM) ;; +(p8) br.cond.spnt MODF_DENORM ;; } -L(MODF_COMMON): +MODF_COMMON: // For HUGE set fraction to signed 0 { .mfi nop.m 999 @@ -189,7 +183,7 @@ L(MODF_COMMON): // For HUGE set integer part to normalized input { .mfi nop.m 999 -(p9) fnorm.d MODF_INTEGER_PART = MODF_NORM_F8 +(p9) fnorm.d.s0 MODF_INTEGER_PART = MODF_NORM_F8 nop.i 999 ;; } @@ -201,7 +195,7 @@ L(MODF_COMMON): } { .mfi nop.m 999 -(p11) fnorm.d f8 = MODF_NORM_F8 +(p11) fnorm.d.s0 f8 = MODF_NORM_F8 nop.i 999 ;; } @@ -242,7 +236,7 @@ L(MODF_COMMON): // For NORMAL test if fraction part is zero; if so append correct sign { .mfi nop.m 999 -(p12) fcmp.eq.unc p7,p0 = MODF_NORM_F8, MODF_INTEGER_PART +(p12) fcmp.eq.unc.s0 p7,p0 = MODF_NORM_F8, MODF_INTEGER_PART nop.i 999 ;; } @@ -259,7 +253,7 @@ L(MODF_COMMON): br.ret.sptk b0 ;; } -L(MODF_DENORM): +MODF_DENORM: // If x unorm get signexp from normalized input // If x unorm get integer part from normalized input { .mfi @@ -278,8 +272,7 @@ L(MODF_DENORM): { .mfb (p10) cmp.ge.unc p9,p12 = modf_exp, 
modf_GR_no_frac nop.f 999 - br.cond.spnt L(MODF_COMMON) ;; + br.cond.spnt MODF_COMMON ;; } -.endp modf -ASM_SIZE_DIRECTIVE(modf) +GLOBAL_LIBM_END(modf) diff --git a/sysdeps/ia64/fpu/s_modff.S b/sysdeps/ia64/fpu/s_modff.S index 6aa43c884d..edc1120971 100644 --- a/sysdeps/ia64/fpu/s_modff.S +++ b/sysdeps/ia64/fpu/s_modff.S @@ -1,10 +1,10 @@ .file "modff.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,14 +35,16 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00: Initial version -// 4/04/00: Improved speed, corrected result for NaN input +// 02/02/00 Initial version +// 04/04/00 Improved speed, corrected result for NaN input // 12/22/00 Fixed so inexact flag is never set, and invalid is not set for // qnans nor for inputs larger than 2^63. 
+// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align // // API //============================================================== @@ -97,8 +99,6 @@ // p13 --------------------------------------------------->| // -#include "libm_support.h" - // floating-point registers used: MODF_NORM_F8 = f9 MODF_FRACTION_PART = f10 @@ -115,23 +115,17 @@ modf_exp = r18 // r33 = iptr -.align 32 -.global modff# - .section .text -.proc modff# -.align 32 - +GLOBAL_LIBM_ENTRY(modff) // Main path is p9, p11, p8 FALSE and p12 TRUE // Assume input is normalized and get signexp // Normalize input just in case // Form exponent bias -modff: { .mfi getf.exp modf_signexp = f8 - fnorm MODF_NORM_F8 = f8 + fnorm.s0 MODF_NORM_F8 = f8 addl modf_GR_FFFF = 0xffff, r0 } // Get integer part of input @@ -176,10 +170,10 @@ modff: { .mfb (p10) cmp.ge.unc p9,p12 = modf_exp, modf_GR_no_frac (p6) fclass.m.unc p6,p7 = f8, 0x23 -(p8) br.cond.spnt L(MODF_DENORM) ;; +(p8) br.cond.spnt MODF_DENORM ;; } -L(MODF_COMMON): +MODF_COMMON: // For HUGE set fraction to signed 0 { .mfi nop.m 999 @@ -189,7 +183,7 @@ L(MODF_COMMON): // For HUGE set integer part to normalized input { .mfi nop.m 999 -(p9) fnorm.s MODF_INTEGER_PART = MODF_NORM_F8 +(p9) fnorm.s.s0 MODF_INTEGER_PART = MODF_NORM_F8 nop.i 999 ;; } @@ -201,7 +195,7 @@ L(MODF_COMMON): } { .mfi nop.m 999 -(p11) fnorm.s f8 = MODF_NORM_F8 +(p11) fnorm.s.s0 f8 = MODF_NORM_F8 nop.i 999 ;; } @@ -242,7 +236,7 @@ L(MODF_COMMON): // For NORMAL test if fraction part is zero; if so append correct sign { .mfi nop.m 999 -(p12) fcmp.eq.unc p7,p0 = MODF_NORM_F8, MODF_INTEGER_PART +(p12) fcmp.eq.unc.s0 p7,p0 = MODF_NORM_F8, MODF_INTEGER_PART nop.i 999 ;; } @@ -259,7 +253,7 @@ L(MODF_COMMON): br.ret.sptk b0 ;; } -L(MODF_DENORM): +MODF_DENORM: // If x unorm get signexp from normalized input // If x unorm get integer part from normalized input { .mfi @@ -278,8 +272,7 @@ L(MODF_DENORM): { .mfb (p10) cmp.ge.unc p9,p12 = modf_exp, 
modf_GR_no_frac nop.f 999 - br.cond.spnt L(MODF_COMMON) ;; + br.cond.spnt MODF_COMMON ;; } -.endp modff -ASM_SIZE_DIRECTIVE(modff) +GLOBAL_LIBM_END(modff) diff --git a/sysdeps/ia64/fpu/s_modfl.S b/sysdeps/ia64/fpu/s_modfl.S index b5eb509adf..eaf410cb6c 100644 --- a/sysdeps/ia64/fpu/s_modfl.S +++ b/sysdeps/ia64/fpu/s_modfl.S @@ -1,10 +1,10 @@ .file "modfl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,15 +35,17 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
// // History //============================================================== -// 2/02/00: Initial version -// 4/04/00: Improved speed, corrected result for NaN input -// 5/30/00 Fixed bug for exponent 0x1003e +// 02/02/00 Initial version +// 04/04/00 Improved speed, corrected result for NaN input +// 05/30/00 Fixed bug for exponent 0x1003e // 12/22/00 Fixed so inexact flag is never set, and invalid is not set for // qnans nor for inputs larger than 2^63. +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align // // API //============================================================== @@ -92,8 +94,6 @@ // p13 --------------------------------------------------->| // -#include "libm_support.h" - // floating-point registers used: MODF_NORM_F8 = f9 MODF_FRACTION_PART = f10 @@ -110,23 +110,17 @@ modf_exp = r18 // r34 = iptr -.align 32 -.global modfl# - .section .text -.proc modfl# -.align 32 - +GLOBAL_LIBM_ENTRY(modfl) // Main path is p9, p11, p8 FALSE and p12 TRUE // Assume input is normalized and get signexp // Normalize input just in case // Form exponent bias -modfl: { .mfi getf.exp modf_signexp = f8 - fnorm MODF_NORM_F8 = f8 + fnorm.s0 MODF_NORM_F8 = f8 addl modf_GR_FFFF = 0xffff, r0 } // Get integer part of input @@ -171,10 +165,10 @@ modfl: { .mfb (p10) cmp.ge.unc p9,p12 = modf_exp, modf_GR_no_frac (p6) fclass.m.unc p6,p7 = f8, 0x23 -(p8) br.cond.spnt L(MODF_DENORM) ;; +(p8) br.cond.spnt MODF_DENORM ;; } -L(MODF_COMMON): +MODF_COMMON: // For HUGE set fraction to signed 0 { .mfi nop.m 999 @@ -184,7 +178,7 @@ L(MODF_COMMON): // For HUGE set integer part to normalized input { .mfi nop.m 999 -(p9) fnorm MODF_INTEGER_PART = MODF_NORM_F8 +(p9) fnorm.s0 MODF_INTEGER_PART = MODF_NORM_F8 nop.i 999 ;; } @@ -196,7 +190,7 @@ L(MODF_COMMON): } { .mfi nop.m 999 -(p11) fnorm f8 = MODF_NORM_F8 +(p11) fnorm.s0 f8 = MODF_NORM_F8 nop.i 999 ;; } @@ -237,7 +231,7 @@ L(MODF_COMMON): // For NORMAL test if fraction part is zero; 
if so append correct sign { .mfi nop.m 999 -(p12) fcmp.eq.unc p7,p0 = MODF_NORM_F8, MODF_INTEGER_PART +(p12) fcmp.eq.unc.s0 p7,p0 = MODF_NORM_F8, MODF_INTEGER_PART nop.i 999 ;; } @@ -254,7 +248,7 @@ L(MODF_COMMON): br.ret.sptk b0 ;; } -L(MODF_DENORM): +MODF_DENORM: // If x unorm get signexp from normalized input // If x unorm get integer part from normalized input { .mfi @@ -273,8 +267,7 @@ L(MODF_DENORM): { .mfb (p10) cmp.ge.unc p9,p12 = modf_exp, modf_GR_no_frac nop.f 999 - br.cond.spnt L(MODF_COMMON) ;; + br.cond.spnt MODF_COMMON ;; } -.endp modfl -ASM_SIZE_DIRECTIVE(modfl) +GLOBAL_LIBM_END(modfl) diff --git a/sysdeps/ia64/fpu/s_nearbyint.S b/sysdeps/ia64/fpu/s_nearbyint.S index 6ee01ea260..cba74e61d3 100644 --- a/sysdeps/ia64/fpu/s_nearbyint.S +++ b/sysdeps/ia64/fpu/s_nearbyint.S @@ -1,11 +1,10 @@ .file "nearbyint.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 10/19/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Tom Rowan, Shane Story, and Ping Tak Peter Tang of the -// Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -21,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -36,20 +35,19 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. 
+// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 10/19/2000: Created -// 2/08/01 Corrected behavior for all rounding modes. +// 10/19/00 Created +// 02/08/01 Corrected behavior for all rounding modes. +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align //============================================================== // // API //============================================================== // double nearbyint(double x) - -#include "libm_support.h" - // // general registers used: // @@ -110,15 +108,8 @@ NEARBYINT_INT_f8 = f11 // 1 1 1 0 0 1 11 0xe7 -.align 32 -.global nearbyint# - .section .text -.proc nearbyint# -.align 32 - - -nearbyint: +GLOBAL_LIBM_ENTRY(nearbyint) { .mfi mov nearbyint_GR_fpsr = ar40 // Read the fpsr--need to check rc.s0 @@ -141,7 +132,7 @@ nearbyint: { .mfb nop.m 999 -(p6) fnorm.d f8 = f8 +(p6) fnorm.d.s0 f8 = f8 (p6) br.ret.spnt b0 // Exit if x nan, inf, zero ;; } @@ -177,11 +168,11 @@ nearbyint: // Check to see if s0 rounding mode is round to nearest. If not then set s2 // rounding mode to that of s0 and repeat conversions. 
-L(NEARBYINT_COMMON): +NEARBYINT_COMMON: { .mfb cmp.ne p11,p0 = nearbyint_GR_rcs0, r0 (p6) fclass.m.unc p9,p10 = NEARBYINT_FLOAT_INT_f8, 0x07 // Test for result=0 -(p11) br.cond.spnt L(NEARBYINT_NOT_ROUND_NEAREST) // Branch if not round to nearest +(p11) br.cond.spnt NEARBYINT_NOT_ROUND_NEAREST // Branch if not round to nearest ;; } @@ -200,13 +191,13 @@ L(NEARBYINT_COMMON): } { .mfb nop.m 999 -(p10) fnorm.d f8 = NEARBYINT_FLOAT_INT_f8 +(p10) fnorm.d.s0 f8 = NEARBYINT_FLOAT_INT_f8 br.ret.sptk b0 ;; } -L(NEARBYINT_NOT_ROUND_NEAREST): +NEARBYINT_NOT_ROUND_NEAREST: // Set rounding mode of s2 to that of s0 { .mfi mov nearbyint_GR_rcs0 = r0 // Clear so we don't come back here @@ -225,10 +216,9 @@ L(NEARBYINT_NOT_ROUND_NEAREST): { .mfb nop.m 999 fcvt.xf NEARBYINT_FLOAT_INT_f8 = NEARBYINT_INT_f8 - br.cond.sptk L(NEARBYINT_COMMON) + br.cond.sptk NEARBYINT_COMMON ;; } -.endp nearbyint -ASM_SIZE_DIRECTIVE(nearbyint) +GLOBAL_LIBM_END(nearbyint) diff --git a/sysdeps/ia64/fpu/s_nearbyintf.S b/sysdeps/ia64/fpu/s_nearbyintf.S index 7050ddc52c..6471232513 100644 --- a/sysdeps/ia64/fpu/s_nearbyintf.S +++ b/sysdeps/ia64/fpu/s_nearbyintf.S @@ -1,11 +1,10 @@ .file "nearbyintf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 10/19/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Tom Rowan, Shane Story, and Ping Tak Peter Tang of the -// Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -21,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -36,20 +35,19 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 10/19/2000: Created -// 2/08/01 Corrected behavior for all rounding modes. +// 10/19/00 Created +// 02/08/01 Corrected behavior for all rounding modes. +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align //============================================================== // // API //============================================================== // float nearbyintf(float x) - -#include "libm_support.h" - // // general registers used: // @@ -110,15 +108,8 @@ NEARBYINT_INT_f8 = f11 // 1 1 1 0 0 1 11 0xe7 -.align 32 -.global nearbyintf# - .section .text -.proc nearbyintf# -.align 32 - - -nearbyintf: +GLOBAL_LIBM_ENTRY(nearbyintf) { .mfi mov nearbyint_GR_fpsr = ar40 // Read the fpsr--need to check rc.s0 @@ -141,7 +132,7 @@ nearbyintf: { .mfb nop.m 999 -(p6) fnorm.s f8 = f8 +(p6) fnorm.s.s0 f8 = f8 (p6) br.ret.spnt b0 // Exit if x nan, inf, zero ;; } @@ -177,11 +168,11 @@ nearbyintf: // Check to see if s0 rounding mode is round to nearest. If not then set s2 // rounding mode to that of s0 and repeat conversions. 
-L(NEARBYINT_COMMON): +NEARBYINT_COMMON: { .mfb cmp.ne p11,p0 = nearbyint_GR_rcs0, r0 (p6) fclass.m.unc p9,p10 = NEARBYINT_FLOAT_INT_f8, 0x07 // Test for result=0 -(p11) br.cond.spnt L(NEARBYINT_NOT_ROUND_NEAREST) // Branch if not round to nearest +(p11) br.cond.spnt NEARBYINT_NOT_ROUND_NEAREST // Branch if not round to nearest ;; } @@ -200,13 +191,13 @@ L(NEARBYINT_COMMON): } { .mfb nop.m 999 -(p10) fnorm.s f8 = NEARBYINT_FLOAT_INT_f8 +(p10) fnorm.s.s0 f8 = NEARBYINT_FLOAT_INT_f8 br.ret.sptk b0 ;; } -L(NEARBYINT_NOT_ROUND_NEAREST): +NEARBYINT_NOT_ROUND_NEAREST: // Set rounding mode of s2 to that of s0 { .mfi mov nearbyint_GR_rcs0 = r0 // Clear so we don't come back here @@ -225,10 +216,9 @@ L(NEARBYINT_NOT_ROUND_NEAREST): { .mfb nop.m 999 fcvt.xf NEARBYINT_FLOAT_INT_f8 = NEARBYINT_INT_f8 - br.cond.sptk L(NEARBYINT_COMMON) + br.cond.sptk NEARBYINT_COMMON ;; } -.endp nearbyintf -ASM_SIZE_DIRECTIVE(nearbyintf) +GLOBAL_LIBM_END(nearbyintf) diff --git a/sysdeps/ia64/fpu/s_nearbyintl.S b/sysdeps/ia64/fpu/s_nearbyintl.S index 95ba6ab260..9c4c2e4f16 100644 --- a/sysdeps/ia64/fpu/s_nearbyintl.S +++ b/sysdeps/ia64/fpu/s_nearbyintl.S @@ -1,11 +1,10 @@ .file "nearbyintl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 10/19/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Tom Rowan, Shane Story, and Ping Tak Peter Tang of the -// Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -21,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -36,20 +35,19 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 10/19/2000: Created -// 2/08/01 Corrected behavior for all rounding modes. +// 10/19/00 Created +// 02/08/01 Corrected behavior for all rounding modes. +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align //============================================================== // // API //============================================================== // long double nearbyintl(long double x) - -#include "libm_support.h" - // // general registers used: // @@ -111,15 +109,8 @@ NEARBYINT_SIGNED_FLOAT_INT_f8 = f12 // 1 1 1 0 0 1 11 0xe7 -.align 32 -.global nearbyintl# - .section .text -.proc nearbyintl# -.align 32 - - -nearbyintl: +GLOBAL_LIBM_ENTRY(nearbyintl) { .mfi mov nearbyint_GR_fpsr = ar40 // Read the fpsr--need to check rc.s0 @@ -142,7 +133,7 @@ nearbyintl: { .mfb nop.m 999 -(p6) fnorm f8 = f8 +(p6) fnorm.s0 f8 = f8 (p6) br.ret.spnt b0 // Exit if x nan, inf, zero ;; } @@ -180,11 +171,11 @@ nearbyintl: // rounding mode to that of s0 and repeat conversions. 
// Must merge the original sign for cases where the result is zero or the input // is the largest that still has a fraction (0x1007dfffffffffff) -L(NEARBYINT_COMMON): +NEARBYINT_COMMON: { .mfb cmp.ne p11,p0 = nearbyint_GR_rcs0, r0 (p6) fmerge.s NEARBYINT_SIGNED_FLOAT_INT_f8 = f8, NEARBYINT_FLOAT_INT_f8 -(p11) br.cond.spnt L(NEARBYINT_NOT_ROUND_NEAREST) // Branch if not round to nearest +(p11) br.cond.spnt NEARBYINT_NOT_ROUND_NEAREST // Branch if not round to nearest ;; } @@ -197,13 +188,13 @@ L(NEARBYINT_COMMON): { .mfb nop.m 999 -(p6) fnorm f8 = NEARBYINT_SIGNED_FLOAT_INT_f8 +(p6) fnorm.s0 f8 = NEARBYINT_SIGNED_FLOAT_INT_f8 br.ret.sptk b0 ;; } -L(NEARBYINT_NOT_ROUND_NEAREST): +NEARBYINT_NOT_ROUND_NEAREST: // Set rounding mode of s2 to that of s0 { .mfi mov nearbyint_GR_rcs0 = r0 // Clear so we don't come back here @@ -222,10 +213,9 @@ L(NEARBYINT_NOT_ROUND_NEAREST): { .mfb nop.m 999 fcvt.xf NEARBYINT_FLOAT_INT_f8 = NEARBYINT_INT_f8 - br.cond.sptk L(NEARBYINT_COMMON) + br.cond.sptk NEARBYINT_COMMON ;; } -.endp nearbyintl -ASM_SIZE_DIRECTIVE(nearbyintl) +GLOBAL_LIBM_END(nearbyintl) diff --git a/sysdeps/ia64/fpu/s_nextafterl.c b/sysdeps/ia64/fpu/s_nextafterl.c deleted file mode 100644 index f59f16848f..0000000000 --- a/sysdeps/ia64/fpu/s_nextafterl.c +++ /dev/null @@ -1 +0,0 @@ -#include <sysdeps/i386/fpu/s_nextafterl.c> diff --git a/sysdeps/ia64/fpu/s_nexttoward.c b/sysdeps/ia64/fpu/s_nexttoward.c deleted file mode 100644 index aee2bb5895..0000000000 --- a/sysdeps/ia64/fpu/s_nexttoward.c +++ /dev/null @@ -1 +0,0 @@ -#include <sysdeps/i386/fpu/s_nexttoward.c> diff --git a/sysdeps/ia64/fpu/s_nexttowardf.c b/sysdeps/ia64/fpu/s_nexttowardf.c deleted file mode 100644 index 55e95f6916..0000000000 --- a/sysdeps/ia64/fpu/s_nexttowardf.c +++ /dev/null @@ -1 +0,0 @@ -#include <sysdeps/i386/fpu/s_nexttowardf.c> diff --git a/sysdeps/ia64/fpu/s_rint.S b/sysdeps/ia64/fpu/s_rint.S index d04f06a31f..1735d9b498 100644 --- a/sysdeps/ia64/fpu/s_rint.S +++ b/sysdeps/ia64/fpu/s_rint.S 
@@ -1,10 +1,10 @@ .file "rint.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,74 +20,68 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00: Initial version -// 2/08/01 Corrected behavior for all rounding modes. -// +// 02/02/00 Initial version +// 02/08/01 Corrected behavior for all rounding modes. +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/20/03 Improved performance +//============================================================== + // API //============================================================== // double rint(double x) +//============================================================== -#include "libm_support.h" - -// -// general registers used: -// -rint_GR_FFFF = r14 -rint_GR_signexp = r15 -rint_GR_exponent = r16 -rint_GR_17ones = r17 -rint_GR_10033 = r18 -rint_GR_fpsr = r19 -rint_GR_rcs0 = r20 -rint_GR_rcs0_mask = r21 +// general input registers: +// r14 - r21 +rSignexp = r14 +rExp = r15 +rExpMask = r16 +rBigexp = r17 +rM1 = r18 +rFpsr = r19 +rRcs0 = r20 +rRcs0Mask = r21 -// predicate registers used: -// p6-11 +// floating-point registers: +// f8 - f11 -// floating-point registers used: +fXInt = f9 +fNormX = f10 +fTmp = f11 -RINT_NORM_f8 = f9 -RINT_FFFF = f10 -RINT_INEXACT = f11 -RINT_FLOAT_INT_f8 = f12 -RINT_INT_f8 = f13 +// predicate registers used: +// p6 - p10 // Overview of operation //============================================================== - // double rint(double x) -// Return an integer value (represented as a double) that is x rounded to integer in current -// rounding mode +// Return an integer value 
(represented as a double) that is x +// rounded to integer in current rounding mode // Inexact is set if x != rint(x) -// ******************************************************************************* - -// Set denormal flag for denormal input and -// and take denormal fault if necessary. - -// Is the input an integer value already? +//============================================================== // double_extended -// if the exponent is >= 1003e => 3F(true) = 63(decimal) +// if the exponent is > 1003e => 3F(true) = 63(decimal) // we have a significand of 64 bits 1.63-bits. // If we multiply by 2^63, we no longer have a fractional part // So input is an integer value already. @@ -100,155 +94,136 @@ RINT_INT_f8 = f13 // So input is an integer value already. // single -// if the exponent is >= 10016 => 17(true) = 23(decimal) -// we have a significand of 53 bits 1.52-bits. (implicit 1) -// If we multiply by 2^52, we no longer have a fractional part +// if the exponent is > 10016 => 17(true) = 23(decimal) +// we have a significand of 24 bits 1.23-bits. (implicit 1) +// If we multiply by 2^23, we no longer have a fractional part // So input is an integer value already. 
-// If x is NAN, ZERO, or INFINITY, then return - -// qnan snan inf norm unorm 0 -+ -// 1 1 1 0 0 1 11 0xe7 - - -.align 32 -.global rint# - .section .text -.proc rint# -.align 32 - - -rint: -#ifdef _LIBC -.global __rint -.type __rint,@function -__rint: -#endif +GLOBAL_IEEE754_ENTRY(rint) { .mfi - mov rint_GR_fpsr = ar40 // Read the fpsr--need to check rc.s0 - fcvt.fx.s1 RINT_INT_f8 = f8 - addl rint_GR_10033 = 0x10033, r0 + getf.exp rSignexp = f8 // Get signexp, recompute if unorm + fclass.m p7,p0 = f8, 0x0b // Test x unorm + addl rBigexp = 0x10033, r0 // Set exponent at which is integer } { .mfi - mov rint_GR_FFFF = -1 - fnorm.s1 RINT_NORM_f8 = f8 - mov rint_GR_17ones = 0x1FFFF -;; + mov rM1 = -1 // Set all ones + fcvt.fx.s1 fXInt = f8 // Convert to int in significand + mov rExpMask = 0x1FFFF // Form exponent mask } +;; { .mfi - setf.sig RINT_FFFF = rint_GR_FFFF - fclass.m.unc p6,p0 = f8, 0xe7 - mov rint_GR_rcs0_mask = 0x0c00 -;; + mov rFpsr = ar40 // Read fpsr -- check rc.s0 + fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf + nop.i 0 } - { .mfb - nop.m 999 -(p6) fnorm.d f8 = f8 -(p6) br.ret.spnt b0 // Exit if x nan, inf, zero -;; + setf.sig fTmp = rM1 // Make const for setting inexact + fnorm.s1 fNormX = f8 // Normalize input +(p7) br.cond.spnt RINT_UNORM // Branch if x unorm } - -{ .mfi - nop.m 999 - fcvt.xf RINT_FLOAT_INT_f8 = RINT_INT_f8 - nop.i 999 ;; + + +RINT_COMMON: +// Return here from RINT_UNORM +{ .mfb + and rExp = rSignexp, rExpMask // Get biased exponent +(p6) fma.d.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf +(p6) br.ret.spnt b0 // Exit if x natval, nan, inf } +;; { .mfi - getf.exp rint_GR_signexp = RINT_NORM_f8 - fcmp.eq.s0 p8,p0 = f8,f0 // Dummy op to set denormal - nop.i 999 -;; + mov rRcs0Mask = 0x0c00 // Mask for rc.s0 + fcvt.xf f8 = fXInt // Result assume |x| < 2^52 + cmp.ge p7,p8 = rExp, rBigexp // Is |x| >= 2^52? 
} - - -{ .mii - nop.m 999 - nop.i 999 - and rint_GR_exponent = rint_GR_signexp, rint_GR_17ones ;; -} -{ .mmi - cmp.ge.unc p7,p6 = rint_GR_exponent, rint_GR_10033 - and rint_GR_rcs0 = rint_GR_rcs0_mask, rint_GR_fpsr - nop.i 999 -;; +// We must correct result if |x| >= 2^52 +{ .mfi + nop.m 0 +(p7) fma.d.s0 f8 = fNormX, f1, f0 // If |x| >= 2^52, result x + nop.i 0 } - -// Check to see if s0 rounding mode is round to nearest. If not then set s2 -// rounding mode to that of s0 and repeat conversions. -L(RINT_COMMON): -{ .mfb - cmp.ne p11,p0 = rint_GR_rcs0, r0 -(p6) fclass.m.unc p9,p10 = RINT_FLOAT_INT_f8, 0x07 // Test for result=0 -(p11) br.cond.spnt L(RINT_NOT_ROUND_NEAREST) // Branch if not round to nearest ;; -} { .mfi - nop.m 999 -(p6) fcmp.eq.unc.s1 p0,p8 = RINT_FLOAT_INT_f8, RINT_NORM_f8 - nop.i 999 + nop.m 0 + fcmp.eq.unc.s1 p0, p9 = f8, fNormX // Is result = x ? + nop.i 0 } { .mfi - nop.m 999 -(p7) fnorm.d.s0 f8 = f8 - nop.i 999 -;; + nop.m 0 +(p8) fmerge.s f8 = fNormX, f8 // Make sure sign rint(x) = sign x + nop.i 0 } +;; -// If result is zero, merge sign of input { .mfi - nop.m 999 -(p9) fmerge.s f8 = f8, RINT_FLOAT_INT_f8 - nop.i 999 +(p8) and rRcs0 = rFpsr, rRcs0Mask // Get rounding mode for sf0 + nop.f 0 + nop.i 0 } -{ .mfi - nop.m 999 -(p10) fnorm.d f8 = RINT_FLOAT_INT_f8 - nop.i 999 ;; + +// If |x| < 2^52 we must test for other rounding modes +{ .mfi +(p8) cmp.ne.unc p10,p0 = rRcs0, r0 // Test for other rounding modes +(p9) fmpy.s0 fTmp = fTmp, fTmp // Dummy to set inexact + nop.i 0 +} +{ .mbb + nop.m 0 +(p10) br.cond.spnt RINT_NOT_ROUND_NEAREST // Branch if not round nearest + br.ret.sptk b0 // Exit main path if round nearest } +;; + + +RINT_UNORM: +// Here if x unorm { .mfb - nop.m 999 -(p8) fmpy.s0 RINT_INEXACT = RINT_FFFF,RINT_FFFF // Dummy to set inexact - br.ret.sptk b0 -;; + getf.exp rSignexp = fNormX // Get signexp, recompute if unorm + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + br.cond.sptk RINT_COMMON // Return to main path } 
+;; -L(RINT_NOT_ROUND_NEAREST): -// Set rounding mode of s2 to that of s0 +RINT_NOT_ROUND_NEAREST: +// Here if not round to nearest, and |x| < 2^52 +// Set rounding mode of s2 to that of s0, and repeat the conversion using s2 { .mfi - mov rint_GR_rcs0 = r0 // Clear so we don't come back here - fsetc.s2 0x7f, 0x40 - nop.i 999 -;; + nop.m 0 + fsetc.s2 0x7f, 0x40 + nop.i 0 } +;; { .mfi - nop.m 999 - fcvt.fx.s2 RINT_INT_f8 = f8 - nop.i 999 + nop.m 0 + fcvt.fx.s2 fXInt = fNormX // Convert to int in significand + nop.i 0 +} ;; + +{ .mfi + nop.m 0 + fcvt.xf f8 = fXInt // Expected result + nop.i 0 } +;; +// Be sure sign of result = sign of input. Fixes cases where result is 0. { .mfb - nop.m 999 - fcvt.xf RINT_FLOAT_INT_f8 = RINT_INT_f8 - br.cond.sptk L(RINT_COMMON) -;; + nop.m 0 + fmerge.s f8 = fNormX, f8 + br.ret.sptk b0 // Exit main path } +;; - -.endp rint -ASM_SIZE_DIRECTIVE(rint) -#ifdef _LIBC -ASM_SIZE_DIRECTIVE(__rint) -#endif +GLOBAL_IEEE754_END(rint) diff --git a/sysdeps/ia64/fpu/s_rintf.S b/sysdeps/ia64/fpu/s_rintf.S index 73cb98a048..05d6b411f2 100644 --- a/sysdeps/ia64/fpu/s_rintf.S +++ b/sysdeps/ia64/fpu/s_rintf.S @@ -1,10 +1,10 @@ .file "rintf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,74 +20,68 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00: Initial version -// 2/08/01 Corrected behavior for all rounding modes. -// +// 02/02/00 Initial version +// 02/08/01 Corrected behavior for all rounding modes. 
+// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/20/03 Improved performance +//============================================================== + // API //============================================================== // float rintf(float x) +//============================================================== -#include "libm_support.h" - -// -// general registers used: -// -rint_GR_FFFF = r14 -rint_GR_signexp = r15 -rint_GR_exponent = r16 -rint_GR_17ones = r17 -rint_GR_10033 = r18 -rint_GR_fpsr = r19 -rint_GR_rcs0 = r20 -rint_GR_rcs0_mask = r21 +// general input registers: +// r14 - r21 +rSignexp = r14 +rExp = r15 +rExpMask = r16 +rBigexp = r17 +rM1 = r18 +rFpsr = r19 +rRcs0 = r20 +rRcs0Mask = r21 -// predicate registers used: -// p6-11 +// floating-point registers: +// f8 - f11 -// floating-point registers used: +fXInt = f9 +fNormX = f10 +fTmp = f11 -RINT_NORM_f8 = f9 -RINT_FFFF = f10 -RINT_INEXACT = f11 -RINT_FLOAT_INT_f8 = f12 -RINT_INT_f8 = f13 +// predicate registers used: +// p6 - p10 // Overview of operation //============================================================== - // float rintf(float x) -// Return an integer value (represented as a float) that is x rounded to integer in current -// rounding mode -// Inexact is set if x != rintf(x) -// ******************************************************************************* - -// Set denormal flag for denormal input and -// and take denormal fault if necessary. - -// Is the input an integer value already? +// Return an integer value (represented as a float) that is x +// rounded to integer in current rounding mode +// Inexact is set if x != rint(x) +//============================================================== // double_extended -// if the exponent is >= 1003e => 3F(true) = 63(decimal) +// if the exponent is > 1003e => 3F(true) = 63(decimal) // we have a significand of 64 bits 1.63-bits. // If we multiply by 2^63, we no longer have a fractional part // So input is an integer value already. 
@@ -100,155 +94,136 @@ RINT_INT_f8 = f13 // So input is an integer value already. // single -// if the exponent is >= 10016 => 17(true) = 23(decimal) -// we have a significand of 53 bits 1.52-bits. (implicit 1) -// If we multiply by 2^52, we no longer have a fractional part +// if the exponent is > 10016 => 17(true) = 23(decimal) +// we have a significand of 24 bits 1.23-bits. (implicit 1) +// If we multiply by 2^23, we no longer have a fractional part // So input is an integer value already. -// If x is NAN, ZERO, or INFINITY, then return - -// qnan snan inf norm unorm 0 -+ -// 1 1 1 0 0 1 11 0xe7 - - -.align 32 -.global rintf# - .section .text -.proc rintf# -.align 32 - - -rintf: -#ifdef _LIBC -.global __rintf -.type __rintf,@function -__rintf: -#endif +GLOBAL_IEEE754_ENTRY(rintf) { .mfi - mov rint_GR_fpsr = ar40 // Read the fpsr--need to check rc.s0 - fcvt.fx.s1 RINT_INT_f8 = f8 - addl rint_GR_10033 = 0x10016, r0 + getf.exp rSignexp = f8 // Get signexp, recompute if unorm + fclass.m p7,p0 = f8, 0x0b // Test x unorm + addl rBigexp = 0x10016, r0 // Set exponent at which is integer } { .mfi - mov rint_GR_FFFF = -1 - fnorm.s1 RINT_NORM_f8 = f8 - mov rint_GR_17ones = 0x1FFFF -;; + mov rM1 = -1 // Set all ones + fcvt.fx.s1 fXInt = f8 // Convert to int in significand + mov rExpMask = 0x1FFFF // Form exponent mask } +;; { .mfi - setf.sig RINT_FFFF = rint_GR_FFFF - fclass.m.unc p6,p0 = f8, 0xe7 - mov rint_GR_rcs0_mask = 0x0c00 -;; + mov rFpsr = ar40 // Read fpsr -- check rc.s0 + fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf + nop.i 0 } - { .mfb - nop.m 999 -(p6) fnorm.s f8 = f8 -(p6) br.ret.spnt b0 // Exit if x nan, inf, zero -;; + setf.sig fTmp = rM1 // Make const for setting inexact + fnorm.s1 fNormX = f8 // Normalize input +(p7) br.cond.spnt RINT_UNORM // Branch if x unorm } - -{ .mfi - nop.m 999 - fcvt.xf RINT_FLOAT_INT_f8 = RINT_INT_f8 - nop.i 999 ;; + + +RINT_COMMON: +// Return here from RINT_UNORM +{ .mfb + and rExp = rSignexp, rExpMask // Get biased 
exponent +(p6) fma.s.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf +(p6) br.ret.spnt b0 // Exit if x natval, nan, inf } +;; { .mfi - getf.exp rint_GR_signexp = RINT_NORM_f8 - fcmp.eq.s0 p8,p0 = f8,f0 // Dummy op to set denormal - nop.i 999 -;; + mov rRcs0Mask = 0x0c00 // Mask for rc.s0 + fcvt.xf f8 = fXInt // Result assume |x| < 2^23 + cmp.ge p7,p8 = rExp, rBigexp // Is |x| >= 2^23? } - - -{ .mii - nop.m 999 - nop.i 999 - and rint_GR_exponent = rint_GR_signexp, rint_GR_17ones ;; -} -{ .mmi - cmp.ge.unc p7,p6 = rint_GR_exponent, rint_GR_10033 - and rint_GR_rcs0 = rint_GR_rcs0_mask, rint_GR_fpsr - nop.i 999 -;; +// We must correct result if |x| >= 2^23 +{ .mfi + nop.m 0 +(p7) fma.s.s0 f8 = fNormX, f1, f0 // If |x| >= 2^23, result x + nop.i 0 } - -// Check to see if s0 rounding mode is round to nearest. If not then set s2 -// rounding mode to that of s0 and repeat conversions. -L(RINT_COMMON): -{ .mfb - cmp.ne p11,p0 = rint_GR_rcs0, r0 -(p6) fclass.m.unc p9,p10 = RINT_FLOAT_INT_f8, 0x07 // Test for result=0 -(p11) br.cond.spnt L(RINT_NOT_ROUND_NEAREST) // Branch if not round to nearest ;; -} { .mfi - nop.m 999 -(p6) fcmp.eq.unc.s1 p0,p8 = RINT_FLOAT_INT_f8, RINT_NORM_f8 - nop.i 999 + nop.m 0 + fcmp.eq.unc.s1 p0, p9 = f8, fNormX // Is result = x ? 
+ nop.i 0 } { .mfi - nop.m 999 -(p7) fnorm.s.s0 f8 = f8 - nop.i 999 -;; + nop.m 0 +(p8) fmerge.s f8 = fNormX, f8 // Make sure sign rint(x) = sign x + nop.i 0 } +;; -// If result is zero, merge sign of input { .mfi - nop.m 999 -(p9) fmerge.s f8 = f8, RINT_FLOAT_INT_f8 - nop.i 999 +(p8) and rRcs0 = rFpsr, rRcs0Mask // Get rounding mode for sf0 + nop.f 0 + nop.i 0 } -{ .mfi - nop.m 999 -(p10) fnorm.s f8 = RINT_FLOAT_INT_f8 - nop.i 999 ;; + +// If |x| < 2^23 we must test for other rounding modes +{ .mfi +(p8) cmp.ne.unc p10,p0 = rRcs0, r0 // Test for other rounding modes +(p9) fmpy.s0 fTmp = fTmp, fTmp // Dummy to set inexact + nop.i 0 +} +{ .mbb + nop.m 0 +(p10) br.cond.spnt RINT_NOT_ROUND_NEAREST // Branch if not round nearest + br.ret.sptk b0 // Exit main path if round nearest } +;; + + +RINT_UNORM: +// Here if x unorm { .mfb - nop.m 999 -(p8) fmpy.s0 RINT_INEXACT = RINT_FFFF,RINT_FFFF // Dummy to set inexact - br.ret.sptk b0 -;; + getf.exp rSignexp = fNormX // Get signexp, recompute if unorm + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + br.cond.sptk RINT_COMMON // Return to main path } +;; -L(RINT_NOT_ROUND_NEAREST): -// Set rounding mode of s2 to that of s0 +RINT_NOT_ROUND_NEAREST: +// Here if not round to nearest, and |x| < 2^23 +// Set rounding mode of s2 to that of s0, and repeat the conversion using s2 { .mfi - mov rint_GR_rcs0 = r0 // Clear so we don't come back here - fsetc.s2 0x7f, 0x40 - nop.i 999 -;; + nop.m 0 + fsetc.s2 0x7f, 0x40 + nop.i 0 } +;; { .mfi - nop.m 999 - fcvt.fx.s2 RINT_INT_f8 = f8 - nop.i 999 + nop.m 0 + fcvt.fx.s2 fXInt = fNormX // Convert to int in significand + nop.i 0 +} ;; + +{ .mfi + nop.m 0 + fcvt.xf f8 = fXInt // Expected result + nop.i 0 } +;; +// Be sure sign of result = sign of input. Fixes cases where result is 0. 
{ .mfb - nop.m 999 - fcvt.xf RINT_FLOAT_INT_f8 = RINT_INT_f8 - br.cond.sptk L(RINT_COMMON) -;; + nop.m 0 + fmerge.s f8 = fNormX, f8 + br.ret.sptk b0 // Exit main path } +;; - -.endp rintf -ASM_SIZE_DIRECTIVE(rintf) -#ifdef _LIBC -ASM_SIZE_DIRECTIVE(__rintf) -#endif +GLOBAL_IEEE754_END(rintf) diff --git a/sysdeps/ia64/fpu/s_rintl.S b/sysdeps/ia64/fpu/s_rintl.S index 857e8d5208..b5402149ec 100644 --- a/sysdeps/ia64/fpu/s_rintl.S +++ b/sysdeps/ia64/fpu/s_rintl.S @@ -1,10 +1,10 @@ .file "rintl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,76 +20,68 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00: Initial version -// 5/24/00 Fixed case of 2^63 - 1 + 0.5 (0x1007dffffffffffffffff) -// 2/08/01 Corrected behavior for all rounding modes. -// +// 02/02/00 Initial version +// 02/08/01 Corrected behavior for all rounding modes. 
+// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/20/03 Improved performance +//============================================================== + // API //============================================================== // long double rintl(long double x) +//============================================================== -#include "libm_support.h" - -// -// general registers used: -// -rint_GR_FFFF = r14 -rint_GR_signexp = r15 -rint_GR_exponent = r16 -rint_GR_17ones = r17 -rint_GR_10033 = r18 -rint_GR_fpsr = r19 -rint_GR_rcs0 = r20 -rint_GR_rcs0_mask = r21 +// general input registers: +// r14 - r21 +rSignexp = r14 +rExp = r15 +rExpMask = r16 +rBigexp = r17 +rM1 = r18 +rFpsr = r19 +rRcs0 = r20 +rRcs0Mask = r21 -// predicate registers used: -// p6-11 +// floating-point registers: +// f8 - f11 -// floating-point registers used: +fXInt = f9 +fNormX = f10 +fTmp = f11 -RINT_NORM_f8 = f9 -RINT_FFFF = f10 -RINT_INEXACT = f11 -RINT_FLOAT_INT_f8 = f12 -RINT_INT_f8 = f13 -RINT_SIGNED_FLOAT_INT_f8 = f14 +// predicate registers used: +// p6 - p10 // Overview of operation //============================================================== - // long double rintl(long double x) -// Return an integer value (represented as a long double) that is x rounded to integer in current -// rounding mode -// Inexact is set if x != rintl(x) -// ******************************************************************************* - -// Set denormal flag for denormal input and -// and take denormal fault if necessary. - -// Is the input an integer value already? +// Return an integer value (represented as a long double) that is x +// rounded to integer in current rounding mode +// Inexact is set if x != rint(x) +//============================================================== // double_extended -// if the exponent is >= 1003e => 3F(true) = 63(decimal) +// if the exponent is > 1003e => 3F(true) = 63(decimal) // we have a significand of 64 bits 1.63-bits. 
// If we multiply by 2^63, we no longer have a fractional part // So input is an integer value already. @@ -102,151 +94,136 @@ RINT_SIGNED_FLOAT_INT_f8 = f14 // So input is an integer value already. // single -// if the exponent is >= 10016 => 17(true) = 23(decimal) -// we have a significand of 53 bits 1.52-bits. (implicit 1) -// If we multiply by 2^52, we no longer have a fractional part +// if the exponent is > 10016 => 17(true) = 23(decimal) +// we have a significand of 24 bits 1.23-bits. (implicit 1) +// If we multiply by 2^23, we no longer have a fractional part // So input is an integer value already. -// If x is NAN, ZERO, or INFINITY, then return - -// qnan snan inf norm unorm 0 -+ -// 1 1 1 0 0 1 11 0xe7 - - -.align 32 -.global rintl# - .section .text -.proc rintl# -.align 32 - - -rintl: -#ifdef _LIBC -.global __rintl -.type __rintl,@function -__rintl: -#endif +GLOBAL_IEEE754_ENTRY(rintl) { .mfi - mov rint_GR_fpsr = ar40 // Read the fpsr--need to check rc.s0 - fcvt.fx.s1 RINT_INT_f8 = f8 - addl rint_GR_10033 = 0x1003e, r0 + getf.exp rSignexp = f8 // Get signexp, recompute if unorm + fclass.m p7,p0 = f8, 0x0b // Test x unorm + addl rBigexp = 0x1003e, r0 // Set exponent at which is integer } { .mfi - mov rint_GR_FFFF = -1 - fnorm.s1 RINT_NORM_f8 = f8 - mov rint_GR_17ones = 0x1FFFF -;; + mov rM1 = -1 // Set all ones + fcvt.fx.s1 fXInt = f8 // Convert to int in significand + mov rExpMask = 0x1FFFF // Form exponent mask } +;; { .mfi - setf.sig RINT_FFFF = rint_GR_FFFF - fclass.m.unc p6,p0 = f8, 0xe7 - mov rint_GR_rcs0_mask = 0x0c00 -;; + mov rFpsr = ar40 // Read fpsr -- check rc.s0 + fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf + nop.i 0 } - { .mfb - nop.m 999 -(p6) fnorm f8 = f8 -(p6) br.ret.spnt b0 // Exit if x nan, inf, zero -;; + setf.sig fTmp = rM1 // Make const for setting inexact + fnorm.s1 fNormX = f8 // Normalize input +(p7) br.cond.spnt RINT_UNORM // Branch if x unorm } - -{ .mfi - nop.m 999 - fcvt.xf RINT_FLOAT_INT_f8 = RINT_INT_f8 - nop.i 
999 ;; + + +RINT_COMMON: +// Return here from RINT_UNORM +{ .mfb + and rExp = rSignexp, rExpMask // Get biased exponent +(p6) fma.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf +(p6) br.ret.spnt b0 // Exit if x natval, nan, inf } +;; { .mfi - getf.exp rint_GR_signexp = RINT_NORM_f8 - fcmp.eq.s0 p8,p0 = f8,f0 // Dummy op to set denormal - nop.i 999 -;; + mov rRcs0Mask = 0x0c00 // Mask for rc.s0 + fcvt.xf f8 = fXInt // Result assume |x| < 2^63 + cmp.ge p7,p8 = rExp, rBigexp // Is |x| >= 2^63? } - - -{ .mii - nop.m 999 - nop.i 999 - and rint_GR_exponent = rint_GR_signexp, rint_GR_17ones ;; -} -{ .mmi - cmp.ge.unc p7,p6 = rint_GR_exponent, rint_GR_10033 - and rint_GR_rcs0 = rint_GR_rcs0_mask, rint_GR_fpsr - nop.i 999 -;; +// We must correct result if |x| >= 2^63 +{ .mfi + nop.m 0 +(p7) fma.s0 f8 = fNormX, f1, f0 // If |x| >= 2^63, result x + nop.i 0 } - -// Check to see if s0 rounding mode is round to nearest. If not then set s2 -// rounding mode to that of s0 and repeat conversions. -// Must merge the original sign for cases where the result is zero or the input -// is the largest that still has a fraction (0x1007dfffffffffff) -L(RINT_COMMON): -{ .mfb - cmp.ne p11,p0 = rint_GR_rcs0, r0 -(p6) fmerge.s RINT_SIGNED_FLOAT_INT_f8 = f8, RINT_FLOAT_INT_f8 -(p11) br.cond.spnt L(RINT_NOT_ROUND_NEAREST) // Branch if not round to nearest ;; -} { .mfi - nop.m 999 -(p6) fcmp.eq.unc.s1 p0,p8 = RINT_FLOAT_INT_f8, RINT_NORM_f8 - nop.i 999 + nop.m 0 + fcmp.eq.unc.s1 p0, p9 = f8, fNormX // Is result = x ? 
+ nop.i 0 } { .mfi - nop.m 999 -(p7) fnorm.s0 f8 = f8 - nop.i 999 -;; + nop.m 0 +(p8) fmerge.s f8 = fNormX, f8 // Make sure sign rint(x) = sign x + nop.i 0 } +;; { .mfi - nop.m 999 -(p6) fnorm f8 = RINT_SIGNED_FLOAT_INT_f8 - nop.i 999 +(p8) and rRcs0 = rFpsr, rRcs0Mask // Get rounding mode for sf0 + nop.f 0 + nop.i 0 +} ;; + +// If |x| < 2^63 we must test for other rounding modes +{ .mfi +(p8) cmp.ne.unc p10,p0 = rRcs0, r0 // Test for other rounding modes +(p9) fmpy.s0 fTmp = fTmp, fTmp // Dummy to set inexact + nop.i 0 +} +{ .mbb + nop.m 0 +(p10) br.cond.spnt RINT_NOT_ROUND_NEAREST // Branch if not round nearest + br.ret.sptk b0 // Exit main path if round nearest } +;; + + +RINT_UNORM: +// Here if x unorm { .mfb - nop.m 999 -(p8) fmpy.s0 RINT_INEXACT = RINT_FFFF,RINT_FFFF // Dummy to set inexact - br.ret.sptk b0 -;; + getf.exp rSignexp = fNormX // Get signexp, recompute if unorm + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + br.cond.sptk RINT_COMMON // Return to main path } +;; -L(RINT_NOT_ROUND_NEAREST): -// Set rounding mode of s2 to that of s0 +RINT_NOT_ROUND_NEAREST: +// Here if not round to nearest, and |x| < 2^63 +// Set rounding mode of s2 to that of s0, and repeat the conversion using s2 { .mfi - mov rint_GR_rcs0 = r0 // Clear so we don't come back here - fsetc.s2 0x7f, 0x40 - nop.i 999 -;; + nop.m 0 + fsetc.s2 0x7f, 0x40 + nop.i 0 } +;; { .mfi - nop.m 999 - fcvt.fx.s2 RINT_INT_f8 = f8 - nop.i 999 + nop.m 0 + fcvt.fx.s2 fXInt = fNormX // Convert to int in significand + nop.i 0 +} ;; + +{ .mfi + nop.m 0 + fcvt.xf f8 = fXInt // Expected result + nop.i 0 } +;; +// Be sure sign of result = sign of input. Fixes cases where result is 0. 
{ .mfb - nop.m 999 - fcvt.xf RINT_FLOAT_INT_f8 = RINT_INT_f8 - br.cond.sptk L(RINT_COMMON) -;; + nop.m 0 + fmerge.s f8 = fNormX, f8 + br.ret.sptk b0 // Exit main path } +;; - -.endp rintl -ASM_SIZE_DIRECTIVE(rintl) -#ifdef _LIBC -ASM_SIZE_DIRECTIVE(__rintl) -#endif +GLOBAL_IEEE754_END(rintl) diff --git a/sysdeps/ia64/fpu/s_round.S b/sysdeps/ia64/fpu/s_round.S index b08ede1740..04033b4aa2 100644 --- a/sysdeps/ia64/fpu/s_round.S +++ b/sysdeps/ia64/fpu/s_round.S @@ -1,11 +1,10 @@ .file "round.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 10/25/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Tom Rowan, Shane Story, and Ping Tak Peter Tang of the -// Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -21,229 +20,202 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
// // History //============================================================== -// 10/25/2000: Created +// 10/25/00 Initial version +// 06/14/01 Changed cmp to an equivalent form +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/20/03 Improved performance and reduced code size +// 04/18/03 Eliminate possible WAW dependency warning //============================================================== -// + // API //============================================================== // double round(double x) -// +//============================================================== -#include "libm_support.h" +// general input registers: +// r14 - r19 -// general input registers: -// -round_GR_half = r14 -round_GR_big = r15 -round_GR_expmask = r16 -round_GR_signexp = r17 -round_GR_exp = r18 -round_GR_expdiff = r19 - -// predicate registers used: -// p6 - p10 +rSignexp = r14 +rExp = r15 +rExpMask = r16 +rBigexp = r17 +rExpHalf = r18 +rExpMHalf = r19 + +// floating-point registers: +// f8 - f13 -// floating-point registers used: +fXtruncInt = f9 +fNormX = f10 +fHalf = f11 +fMHalf = f12 +fRem = f13 -ROUND_NORM_f8 = f9 -ROUND_TRUNC_f8 = f10 -ROUND_RINT_f8 = f11 -ROUND_FLOAT_TRUNC_f8 = f12 -ROUND_FLOAT_RINT_f8 = f13 -ROUND_REMAINDER = f14 -ROUND_HALF = f15 +// predicate registers used: +// p6 - p10 // Overview of operation //============================================================== - // double round(double x) -// Return an integer value (represented as a double) that is x -// rounded to nearest integer, halfway cases rounded away from -// zero. +// Return an integer value (represented as a double) that is x +// rounded to nearest integer, halfway cases rounded away from +// zero. // if x>0 result = trunc(x+0.5) // if x<0 result = trunc(x-0.5) -// ******************************************************************************* - -// Set denormal flag for denormal input and -// and take denormal fault if necessary. 
+// +//============================================================== -// If x is NAN, ZERO, INFINITY, or >= 2^52 then return +// double_extended +// if the exponent is > 1003e => 3F(true) = 63(decimal) +// we have a significand of 64 bits 1.63-bits. +// If we multiply by 2^63, we no longer have a fractional part +// So input is an integer value already. -// qnan snan inf norm unorm 0 -+ -// 1 1 1 0 0 1 11 0xe7 +// double +// if the exponent is >= 10033 => 34(true) = 52(decimal) +// 34 + 3ff = 433 +// we have a significand of 53 bits 1.52-bits. (implicit 1) +// If we multiply by 2^52, we no longer have a fractional part +// So input is an integer value already. +// single +// if the exponent is > 10016 => 17(true) = 23(decimal) +// we have a significand of 24 bits 1.23-bits. (implicit 1) +// If we multiply by 2^23, we no longer have a fractional part +// So input is an integer value already. -.align 32 -.global round# .section .text -.proc round# -.align 32 - +GLOBAL_LIBM_ENTRY(round) -round: - -// Get exponent for +0.5 -// Truncate x to integer { .mfi - addl round_GR_half = 0x0fffe, r0 - fcvt.fx.trunc.s1 ROUND_TRUNC_f8 = f8 - nop.i 999 -} - -// Get signexp of x -// Normalize input -// Form exponent mask -{ .mfi - getf.exp round_GR_signexp = f8 - fnorm ROUND_NORM_f8 = f8 - addl round_GR_expmask = 0x1ffff, r0 ;; + getf.exp rSignexp = f8 // Get signexp, recompute if unorm + fcvt.fx.trunc.s1 fXtruncInt = f8 // Convert to int in significand + addl rBigexp = 0x10033, r0 // Set exponent at which is integer } - -// Form +0.5 -// Round x to integer { .mfi - setf.exp ROUND_HALF = round_GR_half - fcvt.fx.s1 ROUND_RINT_f8 = f8 - nop.i 999 ;; + mov rExpHalf = 0x0FFFE // Form sign and exponent of 0.5 + fnorm.s1 fNormX = f8 // Normalize input + mov rExpMask = 0x1FFFF // Form exponent mask } -// Get exp of x -// Test for NAN, INF, ZERO -// Get exponent at which input has no fractional part -{ .mfi - and round_GR_exp = round_GR_expmask, round_GR_signexp - fclass.m p8,p9 = f8,0xe7 
- addl round_GR_big = 0x10033, r0 ;; -} - -// Get exp-bigexp -// If exp is so big there is no fractional part, then turn on p8, off p9 -{ .mmi - sub round_GR_expdiff = round_GR_exp, round_GR_big ;; -#ifdef _LIBC -(p9) cmp.lt.or.andcm p8,p9 = r0, round_GR_expdiff -#else -(p9) cmp.ge.or.andcm p8,p9 = round_GR_expdiff, r0 -#endif - nop.i 999 ;; -} - -// Set p6 if x<0, else set p7 -{ .mfi - nop.m 999 -(p9) fcmp.lt.unc p6,p7 = f8,f0 - nop.i 999 +;; + +{ .mmf + setf.exp fHalf = rExpHalf // Form 0.5 + mov rExpMHalf = 0x2FFFE // Form sign and exponent of -0.5 + fclass.m p7,p0 = f8, 0x0b // Test x unorm } - -// If NAN, INF, ZERO, or no fractional part, result is just normalized input -{ .mfi - nop.m 999 -(p8) fnorm.d.s0 f8 = f8 - nop.i 999 ;; +;; + +{ .mfb + setf.exp fMHalf = rExpMHalf // Form -0.5 + fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf +(p7) br.cond.spnt ROUND_UNORM // Branch if x unorm } +;; -// Float the truncated integer +ROUND_COMMON: +// Return here from ROUND_UNORM { .mfi - nop.m 999 -(p9) fcvt.xf ROUND_FLOAT_TRUNC_f8 = ROUND_TRUNC_f8 - nop.i 999 ;; + nop.m 0 + fcmp.lt.s1 p8,p9 = f8, f0 // Test if x < 0 + nop.i 0 +} +{ .mfb + and rExp = rSignexp, rExpMask // Get biased exponent +(p6) fma.d.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf +(p6) br.ret.spnt b0 // Exit if x natval, nan, inf } +;; -// Float the rounded integer to get preliminary result { .mfi - nop.m 999 -(p9) fcvt.xf ROUND_FLOAT_RINT_f8 = ROUND_RINT_f8 - nop.i 999 ;; -} - -// If x<0 and the difference of the truncated input minus the input is 0.5 -// then result = truncated input - 1.0 -// Else if x>0 and the difference of the input minus truncated input is 0.5 -// then result = truncated input + 1.0 -// Else -// result = rounded input -// Endif -{ .mfi - nop.m 999 -(p6) fsub.s1 ROUND_REMAINDER = ROUND_FLOAT_TRUNC_f8, ROUND_NORM_f8 - nop.i 999 + cmp.lt p6,p0 = rExp, rExpHalf // Is |x| < 0.5? 
+ fcvt.xf f8 = fXtruncInt // Pre-Result if 0.5 <= |x| < 2^52 + cmp.ge p7,p0 = rExp, rBigexp // Is |x| >= 2^52? } - { .mfi - nop.m 999 -(p7) fsub.s1 ROUND_REMAINDER = ROUND_NORM_f8, ROUND_FLOAT_TRUNC_f8 - nop.i 999 ;; + cmp.lt p10,p0 = rExp, rExpHalf // Is |x| < 0.5? + nop.f 0 + nop.i 0 } +;; -// Assume preliminary result is rounded integer +// We must correct result if |x| < 0.5, or |x| >= 2^52 +.pred.rel "mutex",p6,p7 { .mfi - nop.m 999 -(p9) fnorm.d.s0 f8 = ROUND_FLOAT_RINT_f8 - nop.i 999 + nop.m 0 +(p6) fmerge.s f8 = fNormX, f0 // If |x| < 0.5, result sgn(x)*0 + nop.i 0 } - -// If x<0, test if result=0 -{ .mfi - nop.m 999 -(p6) fcmp.eq.unc p10,p0 = ROUND_FLOAT_RINT_f8,f0 - nop.i 999 ;; +{ .mfb +(p7) cmp.eq p10,p0 = r0, r0 // Also turn on p10 if |x| >= 2^52 +(p7) fma.d.s0 f8 = fNormX, f1, f0 // If |x| >= 2^52, result x +(p10) br.ret.spnt b0 // Exit |x| < 0.5 or |x| >= 2^52 } +;; -// If x<0 and result=0, set result=-0 +// Here if 0.5 <= |x| < 2^52 { .mfi - nop.m 999 -(p10) fmerge.ns f8 = f1,f8 - nop.i 999 + nop.m 0 + fms.s1 fRem = fNormX, f1, f8 // Get remainder = x - trunc(x) + nop.i 0 } - -// If x<0, test if remainder=0.5 +;; + { .mfi - nop.m 999 -(p6) fcmp.eq.unc p6,p0 = ROUND_REMAINDER, ROUND_HALF - nop.i 999 ;; + nop.m 0 +(p8) fcmp.le.s1 p8,p0 = fRem, fMHalf + nop.i 0 } - -// If x>0, test if remainder=0.5 { .mfi - nop.m 999 -(p7) fcmp.eq.unc p7,p0 = ROUND_REMAINDER, ROUND_HALF - nop.i 999 ;; + nop.m 0 +(p9) fcmp.ge.s1 p9,p0 = fRem, fHalf + nop.i 0 } +;; -// If x<0 and remainder=0.5, result=truncated-1.0 -// If x>0 and remainder=0.5, result=truncated+1.0 -// Exit -.pred.rel "mutex",p6,p7 +// If x < 0 and remainder <= -0.5, then subtract 1 from result +// If x > 0 and remainder >= +0.5, then add 1 to result +.pred.rel "mutex",p8,p9 { .mfi - nop.m 999 -(p6) fsub.d.s0 f8 = ROUND_FLOAT_TRUNC_f8,f1 - nop.i 999 + nop.m 0 +(p8) fms.d.s0 f8 = f8, f1, f1 + nop.i 0 } - { .mfb - nop.m 999 -(p7) fadd.d.s0 f8 = ROUND_FLOAT_TRUNC_f8,f1 - br.ret.sptk b0 ;; + nop.m 0 +(p9) 
fma.d.s0 f8 = f8, f1, f1 + br.ret.sptk b0 +} +;; + + +ROUND_UNORM: +// Here if x unorm +{ .mfb + getf.exp rSignexp = fNormX // Get signexp, recompute if unorm + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + br.cond.sptk ROUND_COMMON // Return to main path } +;; -.endp round -ASM_SIZE_DIRECTIVE(round) +GLOBAL_LIBM_END(round) diff --git a/sysdeps/ia64/fpu/s_roundf.S b/sysdeps/ia64/fpu/s_roundf.S index 42ee60b218..1e8dc78777 100644 --- a/sysdeps/ia64/fpu/s_roundf.S +++ b/sysdeps/ia64/fpu/s_roundf.S @@ -1,11 +1,10 @@ .file "roundf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 10/25/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Tom Rowan, Shane Story, and Ping Tak Peter Tang of the -// Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -21,229 +20,202 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
// // History //============================================================== -// 10/25/2000: Created +// 10/25/00 Initial version +// 06/14/01 Changed cmp to an equivalent form +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/20/03 Improved performance and reduced code size +// 04/18/03 Eliminate possible WAW dependency warning //============================================================== -// + // API //============================================================== // float roundf(float x) -// +//============================================================== -#include "libm_support.h" +// general input registers: +// r14 - r19 -// general input registers: -// -roundf_GR_half = r14 -roundf_GR_big = r15 -roundf_GR_expmask = r16 -roundf_GR_signexp = r17 -roundf_GR_exp = r18 -roundf_GR_expdiff = r19 - -// predicate registers used: -// p6 - p10 +rSignexp = r14 +rExp = r15 +rExpMask = r16 +rBigexp = r17 +rExpHalf = r18 +rExpMHalf = r19 + +// floating-point registers: +// f8 - f13 -// floating-point registers used: +fXtruncInt = f9 +fNormX = f10 +fHalf = f11 +fMHalf = f12 +fRem = f13 -ROUNDF_NORM_f8 = f9 -ROUNDF_TRUNC_f8 = f10 -ROUNDF_RINT_f8 = f11 -ROUNDF_FLOAT_TRUNC_f8 = f12 -ROUNDF_FLOAT_RINT_f8 = f13 -ROUNDF_REMAINDER = f14 -ROUNDF_HALF = f15 +// predicate registers used: +// p6 - p10 // Overview of operation //============================================================== - // float roundf(float x) -// Return an integer value (represented as a float) that is x -// rounded to nearest integer, halfway cases rounded away from -// zero. +// Return an integer value (represented as a float) that is x +// rounded to nearest integer, halfway cases rounded away from +// zero. // if x>0 result = trunc(x+0.5) // if x<0 result = trunc(x-0.5) -// ******************************************************************************* - -// Set denormal flag for denormal input and -// and take denormal fault if necessary. 
+// +//============================================================== -// If x is NAN, ZERO, INFINITY, or >= 2^23 then return +// double_extended +// if the exponent is > 1003e => 3F(true) = 63(decimal) +// we have a significand of 64 bits 1.63-bits. +// If we multiply by 2^63, we no longer have a fractional part +// So input is an integer value already. -// qnan snan inf norm unorm 0 -+ -// 1 1 1 0 0 1 11 0xe7 +// double +// if the exponent is >= 10033 => 34(true) = 52(decimal) +// 34 + 3ff = 433 +// we have a significand of 53 bits 1.52-bits. (implicit 1) +// If we multiply by 2^52, we no longer have a fractional part +// So input is an integer value already. +// single +// if the exponent is > 10016 => 17(true) = 23(decimal) +// we have a significand of 24 bits 1.23-bits. (implicit 1) +// If we multiply by 2^23, we no longer have a fractional part +// So input is an integer value already. -.align 32 -.global roundf# .section .text -.proc roundf# -.align 32 - +GLOBAL_LIBM_ENTRY(roundf) -roundf: - -// Get exponent for +0.5 -// Truncate x to integer { .mfi - addl roundf_GR_half = 0x0fffe, r0 - fcvt.fx.trunc.s1 ROUNDF_TRUNC_f8 = f8 - nop.i 999 -} - -// Get signexp of x -// Normalize input -// Form exponent mask -{ .mfi - getf.exp roundf_GR_signexp = f8 - fnorm ROUNDF_NORM_f8 = f8 - addl roundf_GR_expmask = 0x1ffff, r0 ;; + getf.exp rSignexp = f8 // Get signexp, recompute if unorm + fcvt.fx.trunc.s1 fXtruncInt = f8 // Convert to int in significand + addl rBigexp = 0x10016, r0 // Set exponent at which is integer } - -// Form +0.5 -// Round x to integer { .mfi - setf.exp ROUNDF_HALF = roundf_GR_half - fcvt.fx.s1 ROUNDF_RINT_f8 = f8 - nop.i 999 ;; + mov rExpHalf = 0x0FFFE // Form sign and exponent of 0.5 + fnorm.s1 fNormX = f8 // Normalize input + mov rExpMask = 0x1FFFF // Form exponent mask } -// Get exp of x -// Test for NAN, INF, ZERO -// Get exponent at which input has no fractional part -{ .mfi - and roundf_GR_exp = roundf_GR_expmask, roundf_GR_signexp - fclass.m 
p8,p9 = f8,0xe7 - addl roundf_GR_big = 0x10016, r0 ;; -} - -// Get exp-bigexp -// If exp is so big there is no fractional part, then turn on p8, off p9 -{ .mmi - sub roundf_GR_expdiff = roundf_GR_exp, roundf_GR_big ;; -#ifdef _LIBC -(p9) cmp.lt.or.andcm p8,p9 = r0, roundf_GR_expdiff -#else -(p9) cmp.ge.or.andcm p8,p9 = roundf_GR_expdiff, r0 -#endif - nop.i 999 ;; -} - -// Set p6 if x<0, else set p7 -{ .mfi - nop.m 999 -(p9) fcmp.lt.unc p6,p7 = f8,f0 - nop.i 999 +;; + +{ .mmf + setf.exp fHalf = rExpHalf // Form 0.5 + mov rExpMHalf = 0x2FFFE // Form sign and exponent of -0.5 + fclass.m p7,p0 = f8, 0x0b // Test x unorm } - -// If NAN, INF, ZERO, or no fractional part, result is just normalized input -{ .mfi - nop.m 999 -(p8) fnorm.s.s0 f8 = f8 - nop.i 999 ;; +;; + +{ .mfb + setf.exp fMHalf = rExpMHalf // Form -0.5 + fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf +(p7) br.cond.spnt ROUND_UNORM // Branch if x unorm } +;; -// Float the truncated integer +ROUND_COMMON: +// Return here from ROUND_UNORM { .mfi - nop.m 999 -(p9) fcvt.xf ROUNDF_FLOAT_TRUNC_f8 = ROUNDF_TRUNC_f8 - nop.i 999 ;; + nop.m 0 + fcmp.lt.s1 p8,p9 = f8, f0 // Test if x < 0 + nop.i 0 +} +{ .mfb + and rExp = rSignexp, rExpMask // Get biased exponent +(p6) fma.s.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf +(p6) br.ret.spnt b0 // Exit if x natval, nan, inf } +;; -// Float the rounded integer to get preliminary result { .mfi - nop.m 999 -(p9) fcvt.xf ROUNDF_FLOAT_RINT_f8 = ROUNDF_RINT_f8 - nop.i 999 ;; -} - -// If x<0 and the difference of the truncated input minus the input is 0.5 -// then result = truncated input - 1.0 -// Else if x>0 and the difference of the input minus truncated input is 0.5 -// then result = truncated input + 1.0 -// Else -// result = rounded input -// Endif -{ .mfi - nop.m 999 -(p6) fsub.s1 ROUNDF_REMAINDER = ROUNDF_FLOAT_TRUNC_f8, ROUNDF_NORM_f8 - nop.i 999 + cmp.lt p6,p0 = rExp, rExpHalf // Is |x| < 0.5? 
+ fcvt.xf f8 = fXtruncInt // Pre-Result if 0.5 <= |x| < 2^23 + cmp.ge p7,p0 = rExp, rBigexp // Is |x| >= 2^23? } - { .mfi - nop.m 999 -(p7) fsub.s1 ROUNDF_REMAINDER = ROUNDF_NORM_f8, ROUNDF_FLOAT_TRUNC_f8 - nop.i 999 ;; + cmp.lt p10,p0 = rExp, rExpHalf // Is |x| < 0.5? + nop.f 0 + nop.i 0 } +;; -// Assume preliminary result is rounded integer +// We must correct result if |x| < 0.5, or |x| >= 2^23 +.pred.rel "mutex",p6,p7 { .mfi - nop.m 999 -(p9) fnorm.s.s0 f8 = ROUNDF_FLOAT_RINT_f8 - nop.i 999 + nop.m 0 +(p6) fmerge.s f8 = fNormX, f0 // If |x| < 0.5, result sgn(x)*0 + nop.i 0 } - -// If x<0, test if result=0 -{ .mfi - nop.m 999 -(p6) fcmp.eq.unc p10,p0 = ROUNDF_FLOAT_RINT_f8,f0 - nop.i 999 ;; +{ .mfb +(p7) cmp.eq p10,p0 = r0, r0 // Also turn on p10 if |x| >= 2^23 +(p7) fma.s.s0 f8 = fNormX, f1, f0 // If |x| >= 2^23, result x +(p10) br.ret.spnt b0 // Exit |x| < 0.5 or |x| >= 2^23 } +;; -// If x<0 and result=0, set result=-0 +// Here if 0.5 <= |x| < 2^23 { .mfi - nop.m 999 -(p10) fmerge.ns f8 = f1,f8 - nop.i 999 + nop.m 0 + fms.s1 fRem = fNormX, f1, f8 // Get remainder = x - trunc(x) + nop.i 0 } - -// If x<0, test if remainder=0.5 +;; + { .mfi - nop.m 999 -(p6) fcmp.eq.unc p6,p0 = ROUNDF_REMAINDER, ROUNDF_HALF - nop.i 999 ;; + nop.m 0 +(p8) fcmp.le.s1 p8,p0 = fRem, fMHalf + nop.i 0 } - -// If x>0, test if remainder=0.5 { .mfi - nop.m 999 -(p7) fcmp.eq.unc p7,p0 = ROUNDF_REMAINDER, ROUNDF_HALF - nop.i 999 ;; + nop.m 0 +(p9) fcmp.ge.s1 p9,p0 = fRem, fHalf + nop.i 0 } +;; -// If x<0 and remainder=0.5, result=truncated-1.0 -// If x>0 and remainder=0.5, result=truncated+1.0 -// Exit -.pred.rel "mutex",p6,p7 +// If x < 0 and remainder <= -0.5, then subtract 1 from result +// If x > 0 and remainder >= +0.5, then add 1 to result +.pred.rel "mutex",p8,p9 { .mfi - nop.m 999 -(p6) fsub.s.s0 f8 = ROUNDF_FLOAT_TRUNC_f8,f1 - nop.i 999 + nop.m 0 +(p8) fms.s.s0 f8 = f8, f1, f1 + nop.i 0 } - { .mfb - nop.m 999 -(p7) fadd.s.s0 f8 = ROUNDF_FLOAT_TRUNC_f8,f1 - br.ret.sptk b0 ;; + nop.m 
0 +(p9) fma.s.s0 f8 = f8, f1, f1 + br.ret.sptk b0 +} +;; + + +ROUND_UNORM: +// Here if x unorm +{ .mfb + getf.exp rSignexp = fNormX // Get signexp, recompute if unorm + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + br.cond.sptk ROUND_COMMON // Return to main path } +;; -.endp roundf -ASM_SIZE_DIRECTIVE(roundf) +GLOBAL_LIBM_END(roundf) diff --git a/sysdeps/ia64/fpu/s_roundl.S b/sysdeps/ia64/fpu/s_roundl.S index b30f590917..79dff00c06 100644 --- a/sysdeps/ia64/fpu/s_roundl.S +++ b/sysdeps/ia64/fpu/s_roundl.S @@ -1,11 +1,10 @@ .file "roundl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 10/25/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Tom Rowan, Shane Story, and Ping Tak Peter Tang of the -// Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -21,229 +20,202 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
// // History //============================================================== -// 10/25/2000: Created +// 10/25/00 Initial version +// 06/14/01 Changed cmp to an equivalent form +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/20/03 Improved performance and reduced code size +// 04/18/03 Eliminate possible WAW dependency warning //============================================================== -// + // API //============================================================== // long double roundl(long double x) -// +//============================================================== -#include "libm_support.h" +// general input registers: +// r14 - r19 -// general input registers: -// -roundl_GR_half = r14 -roundl_GR_big = r15 -roundl_GR_expmask = r16 -roundl_GR_signexp = r17 -roundl_GR_exp = r18 -roundl_GR_expdiff = r19 - -// predicate registers used: -// p6 - p10 +rSignexp = r14 +rExp = r15 +rExpMask = r16 +rBigexp = r17 +rExpHalf = r18 +rExpMHalf = r19 + +// floating-point registers: +// f8 - f13 -// floating-point registers used: +fXtruncInt = f9 +fNormX = f10 +fHalf = f11 +fMHalf = f12 +fRem = f13 -ROUNDL_NORM_f8 = f9 -ROUNDL_TRUNC_f8 = f10 -ROUNDL_RINT_f8 = f11 -ROUNDL_FLOAT_TRUNC_f8 = f12 -ROUNDL_FLOAT_RINT_f8 = f13 -ROUNDL_REMAINDER = f14 -ROUNDL_HALF = f15 +// predicate registers used: +// p6 - p10 // Overview of operation //============================================================== - // long double roundl(long double x) -// Return an integer value (represented as a long double) that is x -// rounded to nearest integer, halfway cases rounded away from -// zero. +// Return an integer value (represented as a long double) that is x +// rounded to nearest integer, halfway cases rounded away from +// zero. // if x>0 result = trunc(x+0.5) // if x<0 result = trunc(x-0.5) -// ******************************************************************************* - -// Set denormal flag for denormal input and -// and take denormal fault if necessary. 
+// +//============================================================== -// If x is NAN, ZERO, INFINITY, or >= 2^63 then return +// double_extended +// if the exponent is > 1003e => 3F(true) = 63(decimal) +// we have a significand of 64 bits 1.63-bits. +// If we multiply by 2^63, we no longer have a fractional part +// So input is an integer value already. -// qnan snan inf norm unorm 0 -+ -// 1 1 1 0 0 1 11 0xe7 +// double +// if the exponent is >= 10033 => 34(true) = 52(decimal) +// 34 + 3ff = 433 +// we have a significand of 53 bits 1.52-bits. (implicit 1) +// If we multiply by 2^52, we no longer have a fractional part +// So input is an integer value already. +// single +// if the exponent is > 10016 => 17(true) = 23(decimal) +// we have a significand of 24 bits 1.23-bits. (implicit 1) +// If we multiply by 2^23, we no longer have a fractional part +// So input is an integer value already. -.align 32 -.global roundl# .section .text -.proc roundl# -.align 32 - +GLOBAL_LIBM_ENTRY(roundl) -roundl: - -// Get exponent for +0.5 -// Truncate x to integer { .mfi - addl roundl_GR_half = 0x0fffe, r0 - fcvt.fx.trunc.s1 ROUNDL_TRUNC_f8 = f8 - nop.i 999 -} - -// Get signexp of x -// Normalize input -// Form exponent mask -{ .mfi - getf.exp roundl_GR_signexp = f8 - fnorm ROUNDL_NORM_f8 = f8 - addl roundl_GR_expmask = 0x1ffff, r0 ;; + getf.exp rSignexp = f8 // Get signexp, recompute if unorm + fcvt.fx.trunc.s1 fXtruncInt = f8 // Convert to int in significand + addl rBigexp = 0x1003e, r0 // Set exponent at which is integer } - -// Form +0.5 -// Round x to integer { .mfi - setf.exp ROUNDL_HALF = roundl_GR_half - fcvt.fx.s1 ROUNDL_RINT_f8 = f8 - nop.i 999 ;; + mov rExpHalf = 0x0FFFE // Form sign and exponent of 0.5 + fnorm.s1 fNormX = f8 // Normalize input + mov rExpMask = 0x1FFFF // Form exponent mask } -// Get exp of x -// Test for NAN, INF, ZERO -// Get exponent at which input has no fractional part -{ .mfi - and roundl_GR_exp = roundl_GR_expmask, roundl_GR_signexp - fclass.m 
p8,p9 = f8,0xe7 - addl roundl_GR_big = 0x1003e, r0 ;; -} - -// Get exp-bigexp -// If exp is so big there is no fractional part, then turn on p8, off p9 -{ .mmi - sub roundl_GR_expdiff = roundl_GR_exp, roundl_GR_big ;; -#ifdef _LIBC -(p9) cmp.lt.or.andcm p8,p9 = r0, roundl_GR_expdiff -#else -(p9) cmp.ge.or.andcm p8,p9 = roundl_GR_expdiff, r0 -#endif - nop.i 999 ;; -} - -// Set p6 if x<0, else set p7 -{ .mfi - nop.m 999 -(p9) fcmp.lt.unc p6,p7 = f8,f0 - nop.i 999 +;; + +{ .mmf + setf.exp fHalf = rExpHalf // Form 0.5 + mov rExpMHalf = 0x2FFFE // Form sign and exponent of -0.5 + fclass.m p7,p0 = f8, 0x0b // Test x unorm } - -// If NAN, INF, ZERO, or no fractional part, result is just normalized input -{ .mfi - nop.m 999 -(p8) fnorm.s0 f8 = f8 - nop.i 999 ;; +;; + +{ .mfb + setf.exp fMHalf = rExpMHalf // Form -0.5 + fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf +(p7) br.cond.spnt ROUND_UNORM // Branch if x unorm } +;; -// Float the truncated integer +ROUND_COMMON: +// Return here from ROUND_UNORM { .mfi - nop.m 999 -(p9) fcvt.xf ROUNDL_FLOAT_TRUNC_f8 = ROUNDL_TRUNC_f8 - nop.i 999 ;; + nop.m 0 + fcmp.lt.s1 p8,p9 = f8, f0 // Test if x < 0 + nop.i 0 +} +{ .mfb + and rExp = rSignexp, rExpMask // Get biased exponent +(p6) fma.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf +(p6) br.ret.spnt b0 // Exit if x natval, nan, inf } +;; -// Float the rounded integer to get preliminary result { .mfi - nop.m 999 -(p9) fcvt.xf ROUNDL_FLOAT_RINT_f8 = ROUNDL_RINT_f8 - nop.i 999 ;; -} - -// If x<0 and the difference of the truncated input minus the input is 0.5 -// then result = truncated input - 1.0 -// Else if x>0 and the difference of the input minus truncated input is 0.5 -// then result = truncated input + 1.0 -// Else -// result = rounded input -// Endif -{ .mfi - nop.m 999 -(p6) fsub.s1 ROUNDL_REMAINDER = ROUNDL_FLOAT_TRUNC_f8, ROUNDL_NORM_f8 - nop.i 999 + cmp.lt p6,p0 = rExp, rExpHalf // Is |x| < 0.5? 
+ fcvt.xf f8 = fXtruncInt // Pre-Result if 0.5 <= |x| < 2^63 + cmp.ge p7,p0 = rExp, rBigexp // Is |x| >= 2^63? } - { .mfi - nop.m 999 -(p7) fsub.s1 ROUNDL_REMAINDER = ROUNDL_NORM_f8, ROUNDL_FLOAT_TRUNC_f8 - nop.i 999 ;; + cmp.lt p10,p0 = rExp, rExpHalf // Is |x| < 0.5? + nop.f 0 + nop.i 0 } +;; -// Assume preliminary result is rounded integer +// We must correct result if |x| < 0.5, or |x| >= 2^63 +.pred.rel "mutex",p6,p7 { .mfi - nop.m 999 -(p9) fnorm.s0 f8 = ROUNDL_FLOAT_RINT_f8 - nop.i 999 + nop.m 0 +(p6) fmerge.s f8 = fNormX, f0 // If |x| < 0.5, result sgn(x)*0 + nop.i 0 } - -// If x<0, test if result=0 -{ .mfi - nop.m 999 -(p6) fcmp.eq.unc p10,p0 = ROUNDL_FLOAT_RINT_f8,f0 - nop.i 999 ;; +{ .mfb +(p7) cmp.eq p10,p0 = r0, r0 // Also turn on p10 if |x| >= 2^63 +(p7) fma.s0 f8 = fNormX, f1, f0 // If |x| >= 2^63, result x +(p10) br.ret.spnt b0 // Exit |x| < 0.5 or |x| >= 2^63 } +;; -// If x<0 and result=0, set result=-0 +// Here if 0.5 <= |x| < 2^63 { .mfi - nop.m 999 -(p10) fmerge.ns f8 = f1,f8 - nop.i 999 + nop.m 0 + fms.s1 fRem = fNormX, f1, f8 // Get remainder = x - trunc(x) + nop.i 0 } - -// If x<0, test if remainder=0.5 +;; + { .mfi - nop.m 999 -(p6) fcmp.eq.unc p6,p0 = ROUNDL_REMAINDER, ROUNDL_HALF - nop.i 999 ;; + nop.m 0 +(p8) fcmp.le.s1 p8,p0 = fRem, fMHalf + nop.i 0 } - -// If x>0, test if remainder=0.5 { .mfi - nop.m 999 -(p7) fcmp.eq.unc p7,p0 = ROUNDL_REMAINDER, ROUNDL_HALF - nop.i 999 ;; + nop.m 0 +(p9) fcmp.ge.s1 p9,p0 = fRem, fHalf + nop.i 0 } +;; -// If x<0 and remainder=0.5, result=truncated-1.0 -// If x>0 and remainder=0.5, result=truncated+1.0 -// Exit -.pred.rel "mutex",p6,p7 +// If x < 0 and remainder <= -0.5, then subtract 1 from result +// If x > 0 and remainder >= +0.5, then add 1 to result +.pred.rel "mutex",p8,p9 { .mfi - nop.m 999 -(p6) fsub.s0 f8 = ROUNDL_FLOAT_TRUNC_f8,f1 - nop.i 999 + nop.m 0 +(p8) fms.s0 f8 = f8, f1, f1 + nop.i 0 } - { .mfb - nop.m 999 -(p7) fadd.s0 f8 = ROUNDL_FLOAT_TRUNC_f8,f1 - br.ret.sptk b0 ;; + nop.m 0 +(p9) 
fma.s0 f8 = f8, f1, f1 + br.ret.sptk b0 +} +;; + + +ROUND_UNORM: +// Here if x unorm +{ .mfb + getf.exp rSignexp = fNormX // Get signexp, recompute if unorm + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + br.cond.sptk ROUND_COMMON // Return to main path } +;; -.endp roundl -ASM_SIZE_DIRECTIVE(roundl) +GLOBAL_LIBM_END(roundl) diff --git a/sysdeps/ia64/fpu/s_scalbn.S b/sysdeps/ia64/fpu/s_scalbn.S deleted file mode 100644 index 50d14b4e30..0000000000 --- a/sysdeps/ia64/fpu/s_scalbn.S +++ /dev/null @@ -1,379 +0,0 @@ -.file "scalbn.s" - -// Copyright (C) 2000, 2001, Intel Corporation -// All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// * The name of Intel Corporation may not be used to endorse or promote -// products derived from this software without specific prior written -// permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. -// -// History -//============================================================== -// 2/02/00 Initial version -// 1/26/01 Scalbn completely reworked and now standalone version -// -// API -//============================================================== -// double = scalbn (double x, int n) -// input floating point f8 and int n (r33) -// output floating point f8 -// -// Returns x* 2**n using an fma and detects overflow -// and underflow. -// -// - -#include "libm_support.h" - -FR_Big = f6 -FR_NBig = f7 -FR_Floating_X = f8 -FR_Result = f8 -FR_Result2 = f9 -FR_Result3 = f11 -FR_Norm_X = f12 -FR_Two_N = f14 -FR_Two_to_Big = f15 - -GR_N_Biased = r15 -GR_Big = r16 -GR_NBig = r17 -GR_Scratch = r18 -GR_Scratch1 = r19 -GR_Bias = r20 -GR_N_as_int = r21 - -GR_SAVE_B0 = r32 -GR_SAVE_GP = r33 -GR_SAVE_PFS = r34 -GR_Parameter_X = r35 -GR_Parameter_Y = r36 -GR_Parameter_RESULT = r37 -GR_Tag = r38 - -.align 32 -.global scalbn - -.section .text -.proc scalbn -.align 32 - -scalbn: - -// -// Is x NAN, INF, ZERO, +-? -// Build the exponent Bias -// -{ .mfi - alloc r32=ar.pfs,1,2,4,0 - fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero - addl GR_Bias = 0x0FFFF,r0 -} - -// -// Sign extend input -// Is N zero? 
-// Normalize x -// -{ .mfi - cmp.eq.unc p6,p0 = r33,r0 - fnorm.s1 FR_Norm_X = FR_Floating_X - sxt4 GR_N_as_int = r33 -} -;; - -// -// Normalize x -// Branch and return special values. -// Create -35000 -// Create 35000 -// -{ .mfi - addl GR_Big = 35000,r0 - nop.f 0 - add GR_N_Biased = GR_Bias,GR_N_as_int -} -{ .mfb - addl GR_NBig = -35000,r0 -(p7) fma.d.s0 FR_Result = FR_Floating_X,f1, f0 -(p7) br.ret.spnt b0 -};; - -// -// Build the exponent Bias -// Return x when N = 0 -// -{ .mfi - setf.exp FR_Two_N = GR_N_Biased - nop.f 0 - addl GR_Scratch1 = 0x063BF,r0 -} -{ .mfb - addl GR_Scratch = 0x019C3F,r0 -(p6) fma.d.s0 FR_Result = FR_Floating_X,f1, f0 -(p6) br.ret.spnt b0 -};; - -// -// Create 2*big -// Create 2**-big -// Is N > 35000 -// Is N < -35000 -// Raise Denormal operand flag with compare -// Main path, create 2**N -// -{ .mfi - setf.exp FR_NBig = GR_Scratch1 - nop.f 0 - cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big -} -{ .mfi - setf.exp FR_Big = GR_Scratch - fcmp.ge.s0 p0,p11 = FR_Floating_X,f0 - cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig -};; - -// -// Adjust 2**N if N was very small or very large -// -{ .mfi - nop.m 0 -(p6) fma.s1 FR_Two_N = FR_Big,f1,f0 - nop.i 0 -} -{ .mlx - nop.m 999 -(p0) movl GR_Scratch = 0x00000000000303FF -};; - - -{ .mfi - nop.m 0 -(p8) fma.s1 FR_Two_N = FR_NBig,f1,f0 - nop.i 0 -} -{ .mlx - nop.m 999 -(p0) movl GR_Scratch1= 0x00000000000103FF -};; - -// Set up necessary status fields -// -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// S3 user supplied status + FZ + TD (Underflows) -// -{ .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x41 - nop.i 999 -} -{ .mfi - nop.m 999 -(p0) fsetc.s2 0x7F,0x42 - nop.i 999 -};; - -// -// Do final operation -// -{ .mfi - setf.exp FR_NBig = GR_Scratch - fma.d.s0 FR_Result = FR_Two_N,FR_Norm_X,f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.d.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0 - nop.i 999 -};; -{ .mfi - setf.exp FR_Big = GR_Scratch1 - fma.d.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0 - 
nop.i 999 -};; - -// Check for overflow or underflow. -// Restore s3 -// Restore s2 -// -{ .mfi - nop.m 0 - fsetc.s3 0x7F,0x40 - nop.i 999 -} -{ .mfi - nop.m 0 - fsetc.s2 0x7F,0x40 - nop.i 999 -};; - -// -// Is the result zero? -// -{ .mfi - nop.m 999 - fclass.m.unc p6, p0 = FR_Result3, 0x007 - nop.i 999 -} -{ .mfi - addl GR_Tag = 176, r0 - fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big - nop.i 0 -};; - -// -// Detect masked underflow - Tiny + Inexact Only -// -{ .mfi - nop.m 999 -(p6) fcmp.neq.unc.s1 p6, p0 = FR_Result , FR_Result2 - nop.i 999 -};; - -// -// Is result bigger the allowed range? -// Branch out for underflow -// -{ .mfb -(p6) addl GR_Tag = 177, r0 -(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig -(p6) br.cond.spnt L(SCALBN_UNDERFLOW) -};; - -// -// Branch out for overflow -// -{ .mbb - nop.m 0 -(p7) br.cond.spnt L(SCALBN_OVERFLOW) -(p9) br.cond.spnt L(SCALBN_OVERFLOW) -};; - -// -// Return from main path. -// -{ .mfb - nop.m 999 - nop.f 0 - br.ret.sptk b0;; -} - -.endp scalbn -ASM_SIZE_DIRECTIVE(scalbn) -.proc __libm_error_region -__libm_error_region: - -L(SCALBN_OVERFLOW): -L(SCALBN_UNDERFLOW): - -// -// Get stack address of N -// -.prologue -{ .mfi - add GR_Parameter_Y=-32,sp - nop.f 0 -.save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs -} -// -// Adjust sp -// -{ .mfi -.fframe 64 - add sp=-64,sp - nop.f 0 - mov GR_SAVE_GP=gp -};; - -// -// Store N on stack in correct position -// Locate the address of x on stack -// -{ .mmi - st8 [GR_Parameter_Y] = GR_N_as_int,16 - add GR_Parameter_X = 16,sp -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 -};; - -// -// Store x on the stack. -// Get address for result on stack. 
-// -.body -{ .mib - stfd [GR_Parameter_X] = FR_Norm_X - add GR_Parameter_RESULT = 0,GR_Parameter_Y - nop.b 0 -} -{ .mib - stfd [GR_Parameter_Y] = FR_Result - add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# -};; - -// -// Get location of result on stack -// -{ .mmi - nop.m 0 - nop.m 0 - add GR_Parameter_RESULT = 48,sp -};; - -// -// Get the new result -// -{ .mmi - ldfd FR_Result = [GR_Parameter_RESULT] -.restore sp - add sp = 64,sp - mov b0 = GR_SAVE_B0 -};; - -// -// Restore gp, ar.pfs and return -// -{ .mib - mov gp = GR_SAVE_GP - mov ar.pfs = GR_SAVE_PFS - br.ret.sptk b0 -};; - -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(scalbn) - -.type __libm_error_support#,@function -.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_scalbnf.S b/sysdeps/ia64/fpu/s_scalbnf.S deleted file mode 100644 index ff7d1ca637..0000000000 --- a/sysdeps/ia64/fpu/s_scalbnf.S +++ /dev/null @@ -1,379 +0,0 @@ -//.file "scalbnf.s" - -// Copyright (C) 2000, 2001, Intel Corporation -// All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// * The name of Intel Corporation may not be used to endorse or promote -// products derived from this software without specific prior written -// permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. -// -// History -//============================================================== -// 2/02/00 Initial version -// 1/26/01 scalbnf completely reworked and now standalone version -// -// API -//============================================================== -// float = scalbnf (float x, int n) -// input floating point f8 and int n (r33) -// output floating point f8 -// -// Returns x* 2**n using an fma and detects overflow -// and underflow. -// -// - -#include "libm_support.h" - -FR_Big = f6 -FR_NBig = f7 -FR_Floating_X = f8 -FR_Result = f8 -FR_Result2 = f9 -FR_Result3 = f11 -FR_Norm_X = f12 -FR_Two_N = f14 -FR_Two_to_Big = f15 - -GR_N_Biased = r15 -GR_Big = r16 -GR_NBig = r17 -GR_Scratch = r18 -GR_Scratch1 = r19 -GR_Bias = r20 -GR_N_as_int = r21 - -GR_SAVE_B0 = r32 -GR_SAVE_GP = r33 -GR_SAVE_PFS = r34 -GR_Parameter_X = r35 -GR_Parameter_Y = r36 -GR_Parameter_RESULT = r37 -GR_Tag = r38 - -.align 32 -.global scalbnf - -.section .text -.proc scalbnf -.align 32 - -scalbnf: - -// -// Is x NAN, INF, ZERO, +-? 
-// Build the exponent Bias -// -{ .mfi - alloc r32=ar.pfs,1,2,4,0 - fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero - addl GR_Bias = 0x0FFFF,r0 -} - -// -// Sign extend input -// Is N zero? -// Normalize x -// -{ .mfi - cmp.eq.unc p6,p0 = r33,r0 - fnorm.s1 FR_Norm_X = FR_Floating_X - sxt4 GR_N_as_int = r33 -} -;; - -// -// Normalize x -// Branch and return special values. -// Create -35000 -// Create 35000 -// -{ .mfi - addl GR_Big = 35000,r0 - nop.f 0 - add GR_N_Biased = GR_Bias,GR_N_as_int -} -{ .mfb - addl GR_NBig = -35000,r0 -(p7) fma.s.s0 FR_Result = FR_Floating_X,f1, f0 -(p7) br.ret.spnt b0 -};; - -// -// Build the exponent Bias -// Return x when N = 0 -// -{ .mfi - setf.exp FR_Two_N = GR_N_Biased - nop.f 0 - addl GR_Scratch1 = 0x063BF,r0 -} -{ .mfb - addl GR_Scratch = 0x019C3F,r0 -(p6) fma.s.s0 FR_Result = FR_Floating_X,f1, f0 -(p6) br.ret.spnt b0 -};; - -// -// Create 2*big -// Create 2**-big -// Is N > 35000 -// Is N < -35000 -// Raise Denormal operand flag with compare -// Main path, create 2**N -// -{ .mfi - setf.exp FR_NBig = GR_Scratch1 - nop.f 0 - cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big -} -{ .mfi - setf.exp FR_Big = GR_Scratch - fcmp.ge.s0 p0,p11 = FR_Floating_X,f0 - cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig -};; - -// -// Adjust 2**N if N was very small or very large -// -{ .mfi - nop.m 0 -(p6) fma.s1 FR_Two_N = FR_Big,f1,f0 - nop.i 0 -} -{ .mlx - nop.m 999 -(p0) movl GR_Scratch = 0x000000000003007F -};; - - -{ .mfi - nop.m 0 -(p8) fma.s1 FR_Two_N = FR_NBig,f1,f0 - nop.i 0 -} -{ .mlx - nop.m 999 -(p0) movl GR_Scratch1= 0x000000000001007F -};; - -// Set up necessary status fields -// -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// S3 user supplied status + FZ + TD (Underflows) -// -{ .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x41 - nop.i 999 -} -{ .mfi - nop.m 999 -(p0) fsetc.s2 0x7F,0x42 - nop.i 999 -};; - -// -// Do final operation -// -{ .mfi - setf.exp FR_NBig = GR_Scratch - fma.s.s0 FR_Result 
= FR_Two_N,FR_Norm_X,f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0 - nop.i 999 -};; -{ .mfi - setf.exp FR_Big = GR_Scratch1 - fma.s.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0 - nop.i 999 -};; - -// Check for overflow or underflow. -// Restore s3 -// Restore s2 -// -{ .mfi - nop.m 0 - fsetc.s3 0x7F,0x40 - nop.i 999 -} -{ .mfi - nop.m 0 - fsetc.s2 0x7F,0x40 - nop.i 999 -};; - -// -// Is the result zero? -// -{ .mfi - nop.m 999 - fclass.m.unc p6, p0 = FR_Result3, 0x007 - nop.i 999 -} -{ .mfi - addl GR_Tag = 178, r0 - fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big - nop.i 0 -};; - -// -// Detect masked underflow - Tiny + Inexact Only -// -{ .mfi - nop.m 999 -(p6) fcmp.neq.unc.s1 p6, p0 = FR_Result , FR_Result2 - nop.i 999 -};; - -// -// Is result bigger the allowed range? -// Branch out for underflow -// -{ .mfb -(p6) addl GR_Tag = 179, r0 -(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig -(p6) br.cond.spnt L(scalbnf_UNDERFLOW) -};; - -// -// Branch out for overflow -// -{ .mbb - nop.m 0 -(p7) br.cond.spnt L(scalbnf_OVERFLOW) -(p9) br.cond.spnt L(scalbnf_OVERFLOW) -};; - -// -// Return from main path. -// -{ .mfb - nop.m 999 - nop.f 0 - br.ret.sptk b0;; -} - -.endp scalbnf -ASM_SIZE_DIRECTIVE(scalbnf) -.proc __libm_error_region -__libm_error_region: - -L(scalbnf_OVERFLOW): -L(scalbnf_UNDERFLOW): - -// -// Get stack address of N -// -.prologue -{ .mfi - add GR_Parameter_Y=-32,sp - nop.f 0 -.save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs -} -// -// Adjust sp -// -{ .mfi -.fframe 64 - add sp=-64,sp - nop.f 0 - mov GR_SAVE_GP=gp -};; - -// -// Store N on stack in correct position -// Locate the address of x on stack -// -{ .mmi - st8 [GR_Parameter_Y] = GR_N_as_int,16 - add GR_Parameter_X = 16,sp -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 -};; - -// -// Store x on the stack. -// Get address for result on stack. 
-// -.body -{ .mib - stfs [GR_Parameter_X] = FR_Norm_X - add GR_Parameter_RESULT = 0,GR_Parameter_Y - nop.b 0 -} -{ .mib - stfs [GR_Parameter_Y] = FR_Result - add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# -};; - -// -// Get location of result on stack -// -{ .mmi - nop.m 0 - nop.m 0 - add GR_Parameter_RESULT = 48,sp -};; - -// -// Get the new result -// -{ .mmi - ldfs FR_Result = [GR_Parameter_RESULT] -.restore sp - add sp = 64,sp - mov b0 = GR_SAVE_B0 -};; - -// -// Restore gp, ar.pfs and return -// -{ .mib - mov gp = GR_SAVE_GP - mov ar.pfs = GR_SAVE_PFS - br.ret.sptk b0 -};; - -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) - -.type __libm_error_support#,@function -.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_scalbnl.S b/sysdeps/ia64/fpu/s_scalbnl.S deleted file mode 100644 index 9e54a2ec0a..0000000000 --- a/sysdeps/ia64/fpu/s_scalbnl.S +++ /dev/null @@ -1,379 +0,0 @@ -//.file "scalbnl.s" - -// Copyright (C) 2000, 2001, Intel Corporation -// All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// * The name of Intel Corporation may not be used to endorse or promote -// products derived from this software without specific prior written -// permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. -// -// History -//============================================================== -// 2/02/00 Initial version -// 1/26/01 scalbnl completely reworked and now standalone version -// -// API -//============================================================== -// double-extended = scalbnl (double-extended x, int n) -// input floating point f8 and int n (r34) -// output floating point f8 -// -// Returns x* 2**n using an fma and detects overflow -// and underflow. 
-// -// - -#include "libm_support.h" - -FR_Big = f6 -FR_NBig = f7 -FR_Floating_X = f8 -FR_Result = f8 -FR_Result2 = f9 -FR_Result3 = f11 -FR_Norm_X = f12 -FR_Two_N = f14 -FR_Two_to_Big = f15 - -GR_N_Biased = r15 -GR_Big = r16 -GR_NBig = r17 -GR_Scratch = r18 -GR_Scratch1 = r19 -GR_Bias = r20 -GR_N_as_int = r21 - -GR_SAVE_B0 = r32 -GR_SAVE_GP = r33 -GR_SAVE_PFS = r34 -GR_Parameter_X = r35 -GR_Parameter_Y = r36 -GR_Parameter_RESULT = r37 -GR_Tag = r38 - -.align 32 -.global scalbnl - -.section .text -.proc scalbnl -.align 32 - -scalbnl: - -// -// Is x NAN, INF, ZERO, +-? -// Build the exponent Bias -// -{ .mfi - alloc r32=ar.pfs,2,1,4,0 - fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero - addl GR_Bias = 0x0FFFF,r0 -} - -// -// Sign extend input -// Is N zero? -// Normalize x -// -{ .mfi - cmp.eq.unc p6,p0 = r34,r0 - fnorm.s1 FR_Norm_X = FR_Floating_X - sxt4 GR_N_as_int = r34 -} -;; - -// -// Normalize x -// Branch and return special values. -// Create -35000 -// Create 35000 -// -{ .mfi - addl GR_Big = 35000,r0 - nop.f 0 - add GR_N_Biased = GR_Bias,GR_N_as_int -} -{ .mfb - addl GR_NBig = -35000,r0 -(p7) fma.s0 FR_Result = FR_Floating_X,f1, f0 -(p7) br.ret.spnt b0 -};; - -// -// Build the exponent Bias -// Return x when N = 0 -// -{ .mfi - setf.exp FR_Two_N = GR_N_Biased - nop.f 0 - addl GR_Scratch1 = 0x063BF,r0 -} -{ .mfb - addl GR_Scratch = 0x019C3F,r0 -(p6) fma.s0 FR_Result = FR_Floating_X,f1, f0 -(p6) br.ret.spnt b0 -};; - -// -// Create 2*big -// Create 2**-big -// Is N > 35000 -// Is N < -35000 -// Raise Denormal operand flag with compare -// Main path, create 2**N -// -{ .mfi - setf.exp FR_NBig = GR_Scratch1 - nop.f 0 - cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big -} -{ .mfi - setf.exp FR_Big = GR_Scratch - fcmp.ge.s0 p0,p11 = FR_Floating_X,f0 - cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig -};; - -// -// Adjust 2**N if N was very small or very large -// -{ .mfi - nop.m 0 -(p6) fma.s1 FR_Two_N = FR_Big,f1,f0 - nop.i 0 -} -{ .mlx - nop.m 999 -(p0) 
movl GR_Scratch = 0x0000000000033FFF -};; - - -{ .mfi - nop.m 0 -(p8) fma.s1 FR_Two_N = FR_NBig,f1,f0 - nop.i 0 -} -{ .mlx - nop.m 999 -(p0) movl GR_Scratch1= 0x0000000000013FFF -};; - -// Set up necessary status fields -// -// S0 user supplied status -// S2 user supplied status + WRE + TD (Overflows) -// S3 user supplied status + FZ + TD (Underflows) -// -{ .mfi - nop.m 999 -(p0) fsetc.s3 0x7F,0x41 - nop.i 999 -} -{ .mfi - nop.m 999 -(p0) fsetc.s2 0x7F,0x42 - nop.i 999 -};; - -// -// Do final operation -// -{ .mfi - setf.exp FR_NBig = GR_Scratch - fma.s0 FR_Result = FR_Two_N,FR_Norm_X,f0 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0 - nop.i 999 -};; -{ .mfi - setf.exp FR_Big = GR_Scratch1 - fma.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0 - nop.i 999 -};; - -// Check for overflow or underflow. -// Restore s3 -// Restore s2 -// -{ .mfi - nop.m 0 - fsetc.s3 0x7F,0x40 - nop.i 999 -} -{ .mfi - nop.m 0 - fsetc.s2 0x7F,0x40 - nop.i 999 -};; - -// -// Is the result zero? -// -{ .mfi - nop.m 999 - fclass.m.unc p6, p0 = FR_Result3, 0x007 - nop.i 999 -} -{ .mfi - addl GR_Tag = 174, r0 - fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big - nop.i 0 -};; - -// -// Detect masked underflow - Tiny + Inexact Only -// -{ .mfi - nop.m 999 -(p6) fcmp.neq.unc.s1 p6, p0 = FR_Result , FR_Result2 - nop.i 999 -};; - -// -// Is result bigger the allowed range? -// Branch out for underflow -// -{ .mfb -(p6) addl GR_Tag = 175, r0 -(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig -(p6) br.cond.spnt L(scalbnl_UNDERFLOW) -};; - -// -// Branch out for overflow -// -{ .mbb - nop.m 0 -(p7) br.cond.spnt L(scalbnl_OVERFLOW) -(p9) br.cond.spnt L(scalbnl_OVERFLOW) -};; - -// -// Return from main path. 
-// -{ .mfb - nop.m 999 - nop.f 0 - br.ret.sptk b0;; -} - -.endp scalbnl -ASM_SIZE_DIRECTIVE(scalbnl) -.proc __libm_error_region -__libm_error_region: - -L(scalbnl_OVERFLOW): -L(scalbnl_UNDERFLOW): - -// -// Get stack address of N -// -.prologue -{ .mfi - add GR_Parameter_Y=-32,sp - nop.f 0 -.save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs -} -// -// Adjust sp -// -{ .mfi -.fframe 64 - add sp=-64,sp - nop.f 0 - mov GR_SAVE_GP=gp -};; - -// -// Store N on stack in correct position -// Locate the address of x on stack -// -{ .mmi - st8 [GR_Parameter_Y] = GR_N_as_int,16 - add GR_Parameter_X = 16,sp -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 -};; - -// -// Store x on the stack. -// Get address for result on stack. -// -.body -{ .mib - stfe [GR_Parameter_X] = FR_Norm_X - add GR_Parameter_RESULT = 0,GR_Parameter_Y - nop.b 0 -} -{ .mib - stfe [GR_Parameter_Y] = FR_Result - add GR_Parameter_Y = -16,GR_Parameter_Y - br.call.sptk b0=__libm_error_support# -};; - -// -// Get location of result on stack -// -{ .mmi - nop.m 0 - nop.m 0 - add GR_Parameter_RESULT = 48,sp -};; - -// -// Get the new result -// -{ .mmi - ldfe FR_Result = [GR_Parameter_RESULT] -.restore sp - add sp = 64,sp - mov b0 = GR_SAVE_B0 -};; - -// -// Restore gp, ar.pfs and return -// -{ .mib - mov gp = GR_SAVE_GP - mov ar.pfs = GR_SAVE_PFS - br.ret.sptk b0 -};; - -.endp __libm_error_region -ASM_SIZE_DIRECTIVE(__libm_error_region) - -.type __libm_error_support#,@function -.global __libm_error_support# diff --git a/sysdeps/ia64/fpu/s_significand.S b/sysdeps/ia64/fpu/s_significand.S index 84141daf4d..720e043e5c 100644 --- a/sysdeps/ia64/fpu/s_significand.S +++ b/sysdeps/ia64/fpu/s_significand.S @@ -1,10 +1,10 @@ .file "significand.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. 
-// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,13 +35,15 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
// // History //============================================================== -// 2/02/00: Initial version -// 4/04/00 Unwind support added -// 5/31/00: Fixed bug when x a double-extended denormal +// 02/02/00 Initial version +// 04/04/00 Unwind support added +// 05/31/00 Fixed bug when x a double-extended denormal +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align // // API //============================================================== @@ -56,18 +58,10 @@ // p6, p7 // // floating-point registers used: -// f8, f9, f10 - -#include "libm_support.h" - -.align 32 -.global significand# +// f8, f9, f10 .section .text -.proc significand# -.align 32 - -significand: +GLOBAL_LIBM_ENTRY(significand) // qnan snan inf norm unorm 0 -+ // 1 1 1 0 0 1 11 @@ -75,19 +69,19 @@ significand: // f10 gets f8(sign) with f1(exp,significand) { .mfi nop.m 999 -(p0) fmerge.s f10 = f8,f1 + fmerge.s f10 = f8,f1 nop.i 999 } { .mfi nop.m 999 -(p0) fnorm f9 = f8 + fnorm.s0 f9 = f8 nop.i 999 ;; } // Test for denormal input { .mfi nop.m 999 -(p0) fclass.m.unc p7,p0 = f8, 0x0b + fclass.m.unc p7,p0 = f8, 0x0b nop.i 999 ;; } @@ -97,14 +91,14 @@ significand: // return sign(f8) exp(f8) significand(f8), normalized. 
{ .mfi nop.m 999 -(p0) fclass.m.unc p0,p6 = f8, 0xe7 + fclass.m.unc p0,p6 = f8, 0xe7 nop.i 999 ;; } { .mmb nop.m 999 nop.m 999 -(p7) br.cond.spnt L(SIGNIFICAND_DENORM) ;; // Branch if x denormal +(p7) br.cond.spnt SIGNIFICAND_DENORM ;; // Branch if x denormal } { .mfi @@ -115,29 +109,29 @@ significand: { .mfb nop.m 999 -(p0) fnorm.d f8 = f8 -(p0) br.ret.sptk b0 ;; + fnorm.d.s0 f8 = f8 + br.ret.sptk b0 ;; } -L(SIGNIFICAND_DENORM): +SIGNIFICAND_DENORM: // Here if x denorm { .mfi nop.m 999 -(p0) fmerge.se f8 = f10,f9 + fmerge.se f8 = f10,f9 nop.i 999 ;; } // Check if fnorm(x) still denormal, means x double-extended denormal { .mfi nop.m 999 -(p0) fclass.m.unc p7,p0 = f9, 0x0b + fclass.m.unc p7,p0 = f9, 0x0b nop.i 999 ;; } // This will be the final result unless x double-extended denormal { .mfi nop.m 999 -(p0) fnorm.d f8 = f8 + fnorm.d.s0 f8 = f8 nop.i 999 ;; } @@ -152,9 +146,8 @@ L(SIGNIFICAND_DENORM): // Final normalization if x double-extended denorm { .mfb nop.m 999 -(p7) fnorm.d f8 = f8 -(p0) br.ret.sptk b0 ;; +(p7) fnorm.d.s0 f8 = f8 + br.ret.sptk b0 ;; } -.endp significand -ASM_SIZE_DIRECTIVE(significand) +GLOBAL_LIBM_END(significand) diff --git a/sysdeps/ia64/fpu/s_significandf.S b/sysdeps/ia64/fpu/s_significandf.S index d8cdc159f6..5c8299b944 100644 --- a/sysdeps/ia64/fpu/s_significandf.S +++ b/sysdeps/ia64/fpu/s_significandf.S @@ -1,10 +1,10 @@ .file "significandf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. 
+// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,13 +35,15 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00: Initial version -// 2/03/00: Modified to improve speed -// 5/31/00: Fixed bug when x a double-extended denormal +// 02/02/00 Initial version +// 02/03/00 Modified to improve speed +// 05/31/00 Fixed bug when x a double-extended denormal +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align // // API //============================================================== @@ -55,18 +57,10 @@ // p6, p7 // // floating-point registers used: -// f8, f9, f10 - -#include "libm_support.h" - -.align 32 -.global significandf# +// f8, f9, f10 .section .text -.proc significandf# -.align 32 - -significandf: +GLOBAL_LIBM_ENTRY(significandf) // qnan snan inf norm unorm 0 -+ // 1 1 1 0 0 1 11 @@ -74,19 +68,19 @@ significandf: // f10 gets f8(sign) with f1(exp,significand) { .mfi nop.m 999 -(p0) fmerge.s f10 = f8,f1 + fmerge.s f10 = f8,f1 nop.i 999 } { .mfi nop.m 999 -(p0) fnorm f9 = f8 + fnorm.s0 f9 = f8 nop.i 999 ;; } // Test for denormal input { .mfi nop.m 999 
-(p0) fclass.m.unc p7,p0 = f8, 0x0b + fclass.m.unc p7,p0 = f8, 0x0b nop.i 999 ;; } @@ -96,14 +90,14 @@ significandf: // return sign(f8) exp(f8) significand(f8), normalized. { .mfi nop.m 999 -(p0) fclass.m.unc p0,p6 = f8, 0xe7 + fclass.m.unc p0,p6 = f8, 0xe7 nop.i 999 ;; } { .mmb nop.m 999 nop.m 999 -(p7) br.cond.spnt L(SIGNIFICAND_DENORM) ;; // Branch if x denormal +(p7) br.cond.spnt SIGNIFICAND_DENORM ;; // Branch if x denormal } { .mfi @@ -114,29 +108,29 @@ significandf: { .mfb nop.m 999 -(p0) fnorm.s f8 = f8 -(p0) br.ret.sptk b0 ;; + fnorm.s.s0 f8 = f8 + br.ret.sptk b0 ;; } -L(SIGNIFICAND_DENORM): +SIGNIFICAND_DENORM: // Here if x denorm { .mfi nop.m 999 -(p0) fmerge.se f8 = f10,f9 + fmerge.se f8 = f10,f9 nop.i 999 ;; } // Check if fnorm(x) still denormal, means x double-extended denormal { .mfi nop.m 999 -(p0) fclass.m.unc p7,p0 = f9, 0x0b + fclass.m.unc p7,p0 = f9, 0x0b nop.i 999 ;; } // This will be the final result unless x double-extended denormal { .mfi nop.m 999 -(p0) fnorm.s f8 = f8 + fnorm.s.s0 f8 = f8 nop.i 999 ;; } @@ -151,9 +145,8 @@ L(SIGNIFICAND_DENORM): // Final normalization if x double-extended denorm { .mfb nop.m 999 -(p7) fnorm.s f8 = f8 -(p0) br.ret.sptk b0 ;; +(p7) fnorm.s.s0 f8 = f8 + br.ret.sptk b0 ;; } -.endp significandf -ASM_SIZE_DIRECTIVE(significandf) +GLOBAL_LIBM_END(significandf) diff --git a/sysdeps/ia64/fpu/s_significandl.S b/sysdeps/ia64/fpu/s_significandl.S index 268d3567d0..f62df4310c 100644 --- a/sysdeps/ia64/fpu/s_significandl.S +++ b/sysdeps/ia64/fpu/s_significandl.S @@ -1,10 +1,10 @@ .file "significandl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. 
+// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,13 +35,15 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00: Initial version -// 2/03/00: Modified to improve speed -// 5/31/00: Fixed bug when x a double-extended denormal +// 02/02/00 Initial version +// 02/03/00 Modified to improve speed +// 05/31/00 Fixed bug when x a double-extended denormal +// 05/20/02 Cleaned up namespace and sf0 syntax +// 02/10/03 Reordered header: .section, .global, .proc, .align // // API //============================================================== @@ -56,18 +58,10 @@ // p6, p7 // // floating-point registers used: -// f8, f9, f10 - -#include "libm_support.h" - -.align 32 -.global significandl# +// f8, f9, f10 .section .text -.proc significandl# -.align 32 - -significandl: +GLOBAL_LIBM_ENTRY(significandl) // qnan snan inf norm unorm 0 -+ // 1 1 1 0 0 1 11 @@ -75,19 +69,19 @@ significandl: // f10 gets f8(sign) with f1(exp,significand) { .mfi nop.m 999 -(p0) fmerge.s f10 = f8,f1 + fmerge.s f10 = f8,f1 nop.i 999 } { .mfi nop.m 999 -(p0) fnorm f9 = f8 + fnorm.s0 f9 = f8 nop.i 999 ;; } // Test for denormal input { .mfi nop.m 999 
-(p0) fclass.m.unc p7,p0 = f8, 0x0b + fclass.m.unc p7,p0 = f8, 0x0b nop.i 999 ;; } @@ -97,14 +91,14 @@ significandl: // return sign(f8) exp(f8) significand(f8), normalized. { .mfi nop.m 999 -(p0) fclass.m.unc p0,p6 = f8, 0xe7 + fclass.m.unc p0,p6 = f8, 0xe7 nop.i 999 ;; } { .mmb nop.m 999 nop.m 999 -(p7) br.cond.spnt L(SIGNIFICAND_DENORM) ;; // Branch if x denormal +(p7) br.cond.spnt SIGNIFICAND_DENORM ;; // Branch if x denormal } { .mfi @@ -115,29 +109,29 @@ significandl: { .mfb nop.m 999 -(p0) fnorm f8 = f8 -(p0) br.ret.sptk b0 ;; + fnorm.s0 f8 = f8 + br.ret.sptk b0 ;; } -L(SIGNIFICAND_DENORM): +SIGNIFICAND_DENORM: // Here if x denorm { .mfi nop.m 999 -(p0) fmerge.se f8 = f10,f9 + fmerge.se f8 = f10,f9 nop.i 999 ;; } // Check if fnorm(x) still denormal, means x double-extended denormal { .mfi nop.m 999 -(p0) fclass.m.unc p7,p0 = f9, 0x0b + fclass.m.unc p7,p0 = f9, 0x0b nop.i 999 ;; } // This will be the final result unless x double-extended denormal { .mfi nop.m 999 -(p0) fnorm f8 = f8 + fnorm.s0 f8 = f8 nop.i 999 ;; } @@ -152,9 +146,8 @@ L(SIGNIFICAND_DENORM): // Final normalization if x double-extended denorm { .mfb nop.m 999 -(p7) fnorm f8 = f8 -(p0) br.ret.sptk b0 ;; +(p7) fnorm.s0 f8 = f8 + br.ret.sptk b0 ;; } -.endp significandl -ASM_SIZE_DIRECTIVE(significandl) +GLOBAL_LIBM_END(significandl) diff --git a/sysdeps/ia64/fpu/s_sincos.c b/sysdeps/ia64/fpu/s_sincos.c index 1ddbc2122a..41254ae60a 100644 --- a/sysdeps/ia64/fpu/s_sincos.c +++ b/sysdeps/ia64/fpu/s_sincos.c @@ -1,9 +1 @@ -#include <math.h> - -void -__sincos (double x, double *s, double *c) -{ - *s = sin (x); - *c = cos (x); -} -weak_alias (__sincos, sincos) +/* Not needed. 
*/ diff --git a/sysdeps/ia64/fpu/s_sincosf.c b/sysdeps/ia64/fpu/s_sincosf.c index efd0fe3038..41254ae60a 100644 --- a/sysdeps/ia64/fpu/s_sincosf.c +++ b/sysdeps/ia64/fpu/s_sincosf.c @@ -1,9 +1 @@ -#include <math.h> - -void -__sincosf (float x, float *s, float *c) -{ - *s = sinf (x); - *c = cosf (x); -} -weak_alias (__sincosf, sincosf) +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/s_sincosl.c b/sysdeps/ia64/fpu/s_sincosl.c index a835b772e2..41254ae60a 100644 --- a/sysdeps/ia64/fpu/s_sincosl.c +++ b/sysdeps/ia64/fpu/s_sincosl.c @@ -1,9 +1 @@ -#include <math.h> - -void -__sincosl (long double x, long double *s, long double *c) -{ - *s = sinl (x); - *c = cosl (x); -} -weak_alias (__sincosl, sincosl) +/* Not needed. */ diff --git a/sysdeps/ia64/fpu/s_tan.S b/sysdeps/ia64/fpu/s_tan.S index 3a497fcf4c..3000f5ee06 100644 --- a/sysdeps/ia64/fpu/s_tan.S +++ b/sysdeps/ia64/fpu/s_tan.S @@ -1,10 +1,10 @@ -.file "tan.s" +.file "tancot.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. 
-// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -32,20 +32,24 @@ // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00: Initial version -// 4/04/00 Unwind support added +// 02/02/00 Initial version +// 04/04/00 Unwind support added // 12/27/00 Improved speed +// 02/21/01 Updated to call tanl +// 05/30/02 Added cot +// 02/10/03 Reordered header: .section, .global, .proc, .align // // API //============================================================== -// double tan( double x); +// double tan(double x); +// double cot(double x); // // Overview of operation //============================================================== @@ -61,11 +65,14 @@ // Nfloat = round_int(tan_W) // // tan_r = x - Nfloat * (pi/2)_hi -// tan_r = tan_r - Nfloat * (pi/2)_lo +// a) tan_r = tan_r - Nfloat * (pi/2)_lo (for tan) +// b) tan_r = Nfloat * (pi/2)_lo - tan_r (for cot) // // We have two paths: p8, when Nfloat is even and p9. when Nfloat is odd. -// p8: tan(X) = tan(r) -// p9: tan(X) = -cot(r) +// a) for tan: p8: tan(X) = tan(r) +// p9: tan(X) = -cot(r) +// b) for cot: p9: cot(X) = cot(r) +// p8: cot(X) = -tan(r) // // Each is evaluated as a series. The p9 path requires 1/r. 
// @@ -75,19 +82,16 @@ // Registers used //============================================================== // -// predicate registers used: -// p6-10 +// predicate registers used: +// p6-12 // -// floating-point registers used: -// f10-15, f32-105 +// floating-point registers used: +// f10-15, f32-106 // f8, input // // general registers used -// r14-18, r32-43 +// r14-26, r32-39 // - -#include "libm_support.h" - // Assembly macros //============================================================== TAN_INV_PI_BY_2_2TO64 = f10 @@ -105,28 +109,28 @@ tan_Pi_by_2_lo = f34 tan_P0 = f35 tan_P1 = f36 tan_P2 = f37 -tan_P3 = f38 -tan_P4 = f39 -tan_P5 = f40 +tan_P3 = f38 +tan_P4 = f39 +tan_P5 = f40 tan_P6 = f41 tan_P7 = f42 -tan_P8 = f43 -tan_P9 = f44 -tan_P10 = f45 +tan_P8 = f43 +tan_P9 = f44 +tan_P10 = f45 tan_P11 = f46 -tan_P12 = f47 +tan_P12 = f47 tan_P13 = f48 tan_P14 = f49 tan_P15 = f50 -tan_Q0 = f51 -tan_Q1 = f52 -tan_Q2 = f53 -tan_Q3 = f54 -tan_Q4 = f55 -tan_Q5 = f56 -tan_Q6 = f57 -tan_Q7 = f58 +tan_Q0 = f51 +tan_Q1 = f52 +tan_Q2 = f53 +tan_Q3 = f54 +tan_Q4 = f55 +tan_Q5 = f56 +tan_Q6 = f57 +tan_Q7 = f58 tan_Q8 = f59 tan_Q9 = f60 tan_Q10 = f61 @@ -153,19 +157,19 @@ tan_v10 = f79 tan_v2 = f80 tan_v9 = f81 tan_v1 = f82 -tan_int_Nfloat = f83 -tan_Nfloat = f84 +tan_int_Nfloat = f83 +tan_Nfloat = f84 -tan_NORM_f8 = f85 +tan_NORM_f8 = f85 tan_W = f86 tan_y0 = f87 -tan_d = f88 -tan_y1 = f89 -tan_dsq = f90 -tan_y2 = f91 -tan_d4 = f92 -tan_inv_r = f93 +tan_d = f88 +tan_y1 = f89 +tan_dsq = f90 +tan_y2 = f91 +tan_d4 = f92 +tan_inv_r = f93 tan_z1 = f94 tan_z2 = f95 @@ -180,6 +184,7 @@ tan_z10 = f103 tan_z11 = f104 tan_z12 = f105 +arg_copy = f106 ///////////////////////////////////////////////////////////// @@ -188,37 +193,33 @@ tan_GR_rshf_2to64 = r15 tan_GR_exp_2tom64 = r16 tan_GR_n = r17 tan_GR_rshf = r18 - -tan_AD = r33 -tan_GR_10009 = r34 -tan_GR_17_ones = r35 -tan_GR_N_odd_even = r36 -tan_GR_N = r37 -tan_signexp = r38 -tan_exp = r39 -tan_ADQ = r40 - -GR_SAVE_PFS = r41 
-GR_SAVE_B0 = r42 -GR_SAVE_GP = r43 - - -#ifdef _LIBC -.rodata -#else -.data -#endif +tan_AD = r19 +tan_GR_10009 = r20 +tan_GR_17_ones = r21 +tan_GR_N_odd_even = r22 +tan_GR_N = r23 +tan_signexp = r24 +tan_exp = r25 +tan_ADQ = r26 + +GR_SAVE_B0 = r33 +GR_SAVE_PFS = r34 +GR_SAVE_GP = r35 +GR_Parameter_X = r36 +GR_Parameter_Y = r37 +GR_Parameter_RESULT = r38 +GR_Parameter_Tag = r39 + + +RODATA .align 16 -double_tan_constants: -ASM_TYPE_DIRECTIVE(double_tan_constants,@object) -// data8 0xA2F9836E4E44152A, 0x00003FFE // 2/pi +LOCAL_OBJECT_START(double_tan_constants) data8 0xC90FDAA22168C234, 0x00003FFF // pi/2 hi - - data8 0xBEEA54580DDEA0E1 // P14 + data8 0xBEEA54580DDEA0E1 // P14 data8 0x3ED3021ACE749A59 // P15 - data8 0xBEF312BD91DC8DA1 // P12 + data8 0xBEF312BD91DC8DA1 // P12 data8 0x3EFAE9AFC14C5119 // P13 data8 0x3F2F342BF411E769 // P8 data8 0x3F1A60FC9F3B0227 // P9 @@ -232,10 +233,9 @@ ASM_TYPE_DIRECTIVE(double_tan_constants,@object) data8 0x3FC11111111111C2 // P1 data8 0x3FABA1BA1BA0E850 // P2 data8 0x3F9664F4886725A7 // P3 -ASM_SIZE_DIRECTIVE(double_tan_constants) +LOCAL_OBJECT_END(double_tan_constants) -double_Q_tan_constants: -ASM_TYPE_DIRECTIVE(double_Q_tan_constants,@object) +LOCAL_OBJECT_START(double_Q_tan_constants) data8 0xC4C6628B80DC1CD1, 0x00003FBF // pi/2 lo data8 0x3E223A73BA576E48 // Q8 data8 0x3DF54AD8D1F2CA43 // Q9 @@ -248,35 +248,19 @@ ASM_TYPE_DIRECTIVE(double_Q_tan_constants,@object) data8 0x3F61566ABBFFB489 // Q2 data8 0x3F2BBD77945C1733 // Q3 data8 0x3D927FB33E2B0E04 // Q10 -ASM_SIZE_DIRECTIVE(double_Q_tan_constants) +LOCAL_OBJECT_END(double_Q_tan_constants) - -.align 32 -.global tan# -#ifdef _LIBC -.global __tan# -#endif +.section .text //////////////////////////////////////////////////////// - - -.section .text -.proc tan# -#ifdef _LIBC -.proc __tan# -#endif -.align 32 -tan: -#ifdef _LIBC -__tan: -#endif +LOCAL_LIBM_ENTRY(cot) // The initial fnorm will take any unmasked faults and // normalize any single/double unorms { .mlx - alloc 
r32=ar.pfs,1,11,0,0 + cmp.eq p12, p11 = r0, r0 // set p12=1, p11=0 for cot movl tan_GR_sig_inv_pi_by_2 = 0xA2F9836E4E44152A // significand of 2/pi } { .mlx @@ -285,18 +269,47 @@ __tan: } ;; -{ .mfi - ld8 tan_AD = [tan_AD] - fnorm tan_NORM_f8 = f8 +{ .mlx mov tan_GR_exp_2tom64 = 0xffff-64 // exponent of scaling factor 2^-64 + movl tan_GR_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift } +{ .mfb + ld8 tan_AD = [tan_AD] + fnorm.s0 tan_NORM_f8 = f8 + br.cond.sptk COMMON_PATH +} +;; + +LOCAL_LIBM_END(cot) + +GLOBAL_IEEE754_ENTRY(tan) +// The initial fnorm will take any unmasked faults and +// normalize any single/double unorms + { .mlx - nop.m 999 + cmp.eq p11, p12 = r0, r0 // set p11=1, p12=0 for tan + movl tan_GR_sig_inv_pi_by_2 = 0xA2F9836E4E44152A // significand of 2/pi +} +{ .mlx + addl tan_AD = @ltoff(double_tan_constants), gp + movl tan_GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+63+1) +} +;; + +{ .mlx + mov tan_GR_exp_2tom64 = 0xffff-64 // exponent of scaling factor 2^-64 movl tan_GR_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift } +{ .mfi + ld8 tan_AD = [tan_AD] + fnorm.s0 tan_NORM_f8 = f8 + nop.i 0 +} ;; +// Common path for both tan and cot +COMMON_PATH: // Form two constants we need // 2/pi * 2^1 * 2^63, scaled by 2^64 since we just loaded the significand // 1.1000...000 * 2^(63+63+1) to right shift int(W) into the significand @@ -313,7 +326,7 @@ __tan: { .mmf setf.exp TAN_2TOM64 = tan_GR_exp_2tom64 adds tan_ADQ = double_Q_tan_constants - double_tan_constants, tan_AD - fclass.m.unc p6,p0 = f8, 0x07 // Test for x=0 +(p11) fclass.m.unc p6,p0 = f8, 0x07 // Test for x=0 (tan) } ;; @@ -323,79 +336,79 @@ __tan: // 1.1000...000 * 2^63, the right shift constant { .mmf setf.d TAN_RSHF = tan_GR_rshf - ldfe tan_Pi_by_2_hi = [tan_AD],16 + ldfe tan_Pi_by_2_hi = [tan_AD],16 fclass.m.unc p7,p0 = f8, 0x23 // Test for x=inf } ;; { .mfb - ldfe tan_Pi_by_2_lo = [tan_ADQ],16 + ldfe tan_Pi_by_2_lo = [tan_ADQ],16 fclass.m.unc p8,p0 = f8, 0xc3 // Test for 
x=nan -(p6) br.ret.spnt b0 ;; // Exit for x=0 +(p6) br.ret.spnt b0 ;; // Exit for x=0 (tan only) } { .mfi - ldfpd tan_P14,tan_P15 = [tan_AD],16 + ldfpd tan_P14,tan_P15 = [tan_AD],16 (p7) frcpa.s0 f8,p9=f0,f0 // Set qnan indef if x=inf mov tan_GR_10009 = 0x10009 } { .mib - ldfpd tan_Q8,tan_Q9 = [tan_ADQ],16 + ldfpd tan_Q8,tan_Q9 = [tan_ADQ],16 nop.i 999 (p7) br.ret.spnt b0 ;; // Exit for x=inf } { .mfi - ldfpd tan_P12,tan_P13 = [tan_AD],16 -(p8) fma.d f8=f8,f1,f8 // Set qnan if x=nan + ldfpd tan_P12,tan_P13 = [tan_AD],16 +(p12) fclass.m.unc p6,p0 = f8, 0x07 // Test for x=0 (cot) nop.i 999 } -{ .mib - ldfpd tan_Q4,tan_Q5 = [tan_ADQ],16 - nop.i 999 +{ .mfb + ldfpd tan_Q4,tan_Q5 = [tan_ADQ],16 +(p8) fma.d.s0 f8=f8,f1,f8 // Set qnan if x=nan (p8) br.ret.spnt b0 ;; // Exit for x=nan } -{ .mmi - getf.exp tan_signexp = tan_NORM_f8 - ldfpd tan_P8,tan_P9 = [tan_AD],16 - nop.i 999 ;; +{ .mmf + getf.exp tan_signexp = tan_NORM_f8 + ldfpd tan_P8,tan_P9 = [tan_AD],16 + fmerge.s arg_copy = f8, f8 ;; // Save input for error call } -// Multiply x by scaled 2/pi and add large const to shift integer part of W to +// Multiply x by scaled 2/pi and add large const to shift integer part of W to // rightmost bits of significand -{ .mfi +{ .mmf + alloc r32=ar.pfs,0,4,4,0 ldfpd tan_Q6,tan_Q7 = [tan_ADQ],16 fma.s1 TAN_W_2TO64_RSH = tan_NORM_f8,TAN_INV_PI_BY_2_2TO64,TAN_RSHF_2TO64 - nop.i 999 ;; -} +};; -{ .mmi - ldfpd tan_P10,tan_P11 = [tan_AD],16 - nop.m 999 - and tan_exp = tan_GR_17_ones, tan_signexp ;; +{ .mmf + ldfpd tan_P10,tan_P11 = [tan_AD],16 + and tan_exp = tan_GR_17_ones, tan_signexp +(p6) frcpa.s0 f8, p0 = f1, f8 ;; // cot(+-0) = +-Inf } // p7 is true if we must call DBX TAN // p7 is true if f8 exp is > 0x10009 (which includes all ones // NAN or inf) -{ .mmi - ldfpd tan_Q0,tan_Q1 = [tan_ADQ],16 - cmp.ge.unc p7,p0 = tan_exp,tan_GR_10009 - nop.i 999 ;; +{ .mmb + ldfpd tan_Q0,tan_Q1 = [tan_ADQ],16 + cmp.ge.unc p7,p0 = tan_exp,tan_GR_10009 +(p7) br.cond.spnt TAN_DBX ;; } { .mmb - ldfpd 
tan_P4,tan_P5 = [tan_AD],16 - nop.m 999 -(p7) br.cond.spnt L(TAN_DBX) ;; + ldfpd tan_P4,tan_P5 = [tan_AD],16 +(p6) mov GR_Parameter_Tag = 226 // (cot) +(p6) br.cond.spnt __libm_error_region ;; // call error support if cot(+-0) } { .mmi - ldfpd tan_Q2,tan_Q3 = [tan_ADQ],16 + ldfpd tan_Q2,tan_Q3 = [tan_ADQ],16 nop.m 999 nop.i 999 ;; } @@ -404,8 +417,8 @@ __tan: // TAN_NFLOAT = Round_Int_Nearest(tan_W) { .mfi - ldfpd tan_P6,tan_P7 = [tan_AD],16 - fms.s1 TAN_NFLOAT = TAN_W_2TO64_RSH,TAN_2TOM64,TAN_RSHF + ldfpd tan_P6,tan_P7 = [tan_AD],16 + fms.s1 TAN_NFLOAT = TAN_W_2TO64_RSH,TAN_2TOM64,TAN_RSHF nop.i 999 ;; } @@ -418,22 +431,22 @@ __tan: { .mfi - ldfpd tan_P0,tan_P1 = [tan_AD],16 + ldfpd tan_P0,tan_P1 = [tan_AD],16 nop.f 999 nop.i 999 ;; } -{ .mfi +{ .mmi getf.sig tan_GR_n = TAN_W_2TO64_RSH - nop.f 999 + ldfpd tan_P2,tan_P3 = [tan_AD] nop.i 999 ;; } // tan_r = -tan_Nfloat * tan_Pi_by_2_hi + x { .mfi - ldfpd tan_P2,tan_P3 = [tan_AD] - fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_hi, tan_NORM_f8 +(p12) add tan_GR_n = 0x1, tan_GR_n // N = N + 1 (for cot) + fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_hi, tan_NORM_f8 nop.i 999 ;; } @@ -441,42 +454,49 @@ __tan: // p8 ==> even // p9 ==> odd { .mmi - and tan_GR_N_odd_even = 0x1, tan_GR_n ;; + and tan_GR_N_odd_even = 0x1, tan_GR_n ;; nop.m 999 cmp.eq.unc p8,p9 = tan_GR_N_odd_even, r0 ;; } -// tan_r = tan_r -tan_Nfloat * tan_Pi_by_2_lo +.pred.rel "mutex", p11, p12 +// tan_r = tan_r -tan_Nfloat * tan_Pi_by_2_lo (tan) { .mfi nop.m 999 - fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_lo, tan_r +(p11) fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_lo, tan_r + nop.i 999 +} +// tan_r = -(tan_r -tan_Nfloat * tan_Pi_by_2_lo) (cot) +{ .mfi + nop.m 999 +(p12) fms.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_lo, tan_r nop.i 999 ;; } { .mfi nop.m 999 - fma.s1 tan_rsq = tan_r, tan_r, f0 + fma.s1 tan_rsq = tan_r, tan_r, f0 nop.i 999 ;; } { .mfi nop.m 999 -(p9) frcpa.s1 tan_y0, p10 = f1,tan_r +(p9) frcpa.s1 tan_y0, p0 = f1,tan_r nop.i 999 ;; } { .mfi nop.m 999 -(p8) fma.s1 
tan_v18 = tan_rsq, tan_P15, tan_P14 +(p8) fma.s1 tan_v18 = tan_rsq, tan_P15, tan_P14 nop.i 999 } { .mfi nop.m 999 -(p8) fma.s1 tan_v4 = tan_rsq, tan_P1, tan_P0 +(p8) fma.s1 tan_v4 = tan_rsq, tan_P1, tan_P0 nop.i 999 ;; } @@ -484,12 +504,12 @@ __tan: { .mfi nop.m 999 -(p8) fma.s1 tan_v16 = tan_rsq, tan_P13, tan_P12 - nop.i 999 +(p8) fma.s1 tan_v16 = tan_rsq, tan_P13, tan_P12 + nop.i 999 } { .mfi nop.m 999 -(p8) fma.s1 tan_v17 = tan_rsq, tan_rsq, f0 +(p8) fma.s1 tan_v17 = tan_rsq, tan_rsq, f0 nop.i 999 ;; } @@ -497,12 +517,12 @@ __tan: { .mfi nop.m 999 -(p8) fma.s1 tan_v12 = tan_rsq, tan_P9, tan_P8 - nop.i 999 +(p8) fma.s1 tan_v12 = tan_rsq, tan_P9, tan_P8 + nop.i 999 } { .mfi nop.m 999 -(p8) fma.s1 tan_v13 = tan_rsq, tan_P11, tan_P10 +(p8) fma.s1 tan_v13 = tan_rsq, tan_P11, tan_P10 nop.i 999 ;; } @@ -510,12 +530,12 @@ __tan: { .mfi nop.m 999 -(p8) fma.s1 tan_v7 = tan_rsq, tan_P5, tan_P4 - nop.i 999 +(p8) fma.s1 tan_v7 = tan_rsq, tan_P5, tan_P4 + nop.i 999 } { .mfi nop.m 999 -(p8) fma.s1 tan_v8 = tan_rsq, tan_P7, tan_P6 +(p8) fma.s1 tan_v8 = tan_rsq, tan_P7, tan_P6 nop.i 999 ;; } @@ -523,12 +543,12 @@ __tan: { .mfi nop.m 999 -(p9) fnma.s1 tan_d = tan_r, tan_y0, f1 - nop.i 999 +(p9) fnma.s1 tan_d = tan_r, tan_y0, f1 + nop.i 999 } { .mfi nop.m 999 -(p8) fma.s1 tan_v5 = tan_rsq, tan_P3, tan_P2 +(p8) fma.s1 tan_v5 = tan_rsq, tan_P3, tan_P2 nop.i 999 ;; } @@ -536,36 +556,36 @@ __tan: { .mfi nop.m 999 -(p9) fma.s1 tan_z11 = tan_rsq, tan_Q9, tan_Q8 +(p9) fma.s1 tan_z11 = tan_rsq, tan_Q9, tan_Q8 nop.i 999 } { .mfi nop.m 999 -(p9) fma.s1 tan_z12 = tan_rsq, tan_rsq, f0 +(p9) fma.s1 tan_z12 = tan_rsq, tan_rsq, f0 nop.i 999 ;; } { .mfi nop.m 999 -(p8) fma.s1 tan_v15 = tan_v17, tan_v18, tan_v16 - nop.i 999 +(p8) fma.s1 tan_v15 = tan_v17, tan_v18, tan_v16 + nop.i 999 } { .mfi nop.m 999 -(p9) fma.s1 tan_z7 = tan_rsq, tan_Q5, tan_Q4 +(p9) fma.s1 tan_z7 = tan_rsq, tan_Q5, tan_Q4 nop.i 999 ;; } { .mfi nop.m 999 -(p8) fma.s1 tan_v11 = tan_v17, tan_v13, tan_v12 +(p8) fma.s1 tan_v11 = 
tan_v17, tan_v13, tan_v12 nop.i 999 } { .mfi nop.m 999 -(p9) fma.s1 tan_z8 = tan_rsq, tan_Q7, tan_Q6 +(p9) fma.s1 tan_z8 = tan_rsq, tan_Q7, tan_Q6 nop.i 999 ;; } @@ -573,13 +593,13 @@ __tan: { .mfi nop.m 999 -(p8) fma.s1 tan_v14 = tan_v17, tan_v17, f0 - nop.i 999 +(p8) fma.s1 tan_v14 = tan_v17, tan_v17, f0 + nop.i 999 } { .mfi nop.m 999 -(p9) fma.s1 tan_z3 = tan_rsq, tan_Q1, tan_Q0 - nop.i 999 ;; +(p9) fma.s1 tan_z3 = tan_rsq, tan_Q1, tan_Q0 + nop.i 999 ;; } @@ -587,12 +607,12 @@ __tan: { .mfi nop.m 999 -(p8) fma.s1 tan_v3 = tan_v17, tan_v5, tan_v4 +(p8) fma.s1 tan_v3 = tan_v17, tan_v5, tan_v4 nop.i 999 } { .mfi nop.m 999 -(p8) fma.s1 tan_v6 = tan_v17, tan_v8, tan_v7 +(p8) fma.s1 tan_v6 = tan_v17, tan_v8, tan_v7 nop.i 999 ;; } @@ -600,89 +620,89 @@ __tan: { .mfi nop.m 999 -(p9) fma.s1 tan_y1 = tan_y0, tan_d, tan_y0 - nop.i 999 +(p9) fma.s1 tan_y1 = tan_y0, tan_d, tan_y0 + nop.i 999 } { .mfi nop.m 999 -(p9) fma.s1 tan_dsq = tan_d, tan_d, f0 - nop.i 999 ;; +(p9) fma.s1 tan_dsq = tan_d, tan_d, f0 + nop.i 999 ;; } { .mfi nop.m 999 -(p9) fma.s1 tan_z10 = tan_z12, tan_Q10, tan_z11 - nop.i 999 +(p9) fma.s1 tan_z10 = tan_z12, tan_Q10, tan_z11 + nop.i 999 } { .mfi nop.m 999 -(p9) fma.s1 tan_z9 = tan_z12, tan_z12,f0 +(p9) fma.s1 tan_z9 = tan_z12, tan_z12,f0 nop.i 999 ;; } { .mfi nop.m 999 -(p9) fma.s1 tan_z4 = tan_rsq, tan_Q3, tan_Q2 - nop.i 999 +(p9) fma.s1 tan_z4 = tan_rsq, tan_Q3, tan_Q2 + nop.i 999 } { .mfi nop.m 999 -(p9) fma.s1 tan_z6 = tan_z12, tan_z8, tan_z7 - nop.i 999 ;; +(p9) fma.s1 tan_z6 = tan_z12, tan_z8, tan_z7 + nop.i 999 ;; } { .mfi nop.m 999 -(p8) fma.s1 tan_v10 = tan_v14, tan_v15, tan_v11 - nop.i 999 ;; +(p8) fma.s1 tan_v10 = tan_v14, tan_v15, tan_v11 + nop.i 999 ;; } { .mfi nop.m 999 -(p9) fma.s1 tan_y2 = tan_y1, tan_d, tan_y0 - nop.i 999 +(p9) fma.s1 tan_y2 = tan_y1, tan_d, tan_y0 + nop.i 999 } { .mfi nop.m 999 -(p9) fma.s1 tan_d4 = tan_dsq, tan_dsq, tan_d +(p9) fma.s1 tan_d4 = tan_dsq, tan_dsq, tan_d nop.i 999 ;; } { .mfi nop.m 999 -(p8) fma.s1 tan_v2 = 
tan_v14, tan_v6, tan_v3 +(p8) fma.s1 tan_v2 = tan_v14, tan_v6, tan_v3 nop.i 999 } { .mfi nop.m 999 -(p8) fma.s1 tan_v9 = tan_v14, tan_v14, f0 +(p8) fma.s1 tan_v9 = tan_v14, tan_v14, f0 nop.i 999 ;; } { .mfi nop.m 999 -(p9) fma.s1 tan_z2 = tan_z12, tan_z4, tan_z3 - nop.i 999 +(p9) fma.s1 tan_z2 = tan_z12, tan_z4, tan_z3 + nop.i 999 } { .mfi nop.m 999 -(p9) fma.s1 tan_z5 = tan_z9, tan_z10, tan_z6 +(p9) fma.s1 tan_z5 = tan_z9, tan_z10, tan_z6 nop.i 999 ;; } { .mfi nop.m 999 -(p9) fma.s1 tan_inv_r = tan_d4, tan_y2, tan_y0 - nop.i 999 +(p9) fma.s1 tan_inv_r = tan_d4, tan_y2, tan_y0 + nop.i 999 } { .mfi nop.m 999 @@ -694,12 +714,12 @@ __tan: { .mfi nop.m 999 -(p8) fma.s1 tan_v1 = tan_v9, tan_v10, tan_v2 - nop.i 999 +(p8) fma.s1 tan_v1 = tan_v9, tan_v10, tan_v2 + nop.i 999 } { .mfi nop.m 999 -(p9) fma.s1 tan_z1 = tan_z9, tan_z5, tan_z2 +(p9) fma.s1 tan_z1 = tan_z9, tan_z5, tan_z2 nop.i 999 ;; } @@ -707,64 +727,150 @@ __tan: { .mfi nop.m 999 -(p8) fma.d.s0 f8 = tan_v1, tan_rcube, tan_r - nop.i 999 +(p8) fma.d.s0 f8 = tan_v1, tan_rcube, tan_r + nop.i 999 } { .mfb nop.m 999 -(p9) fms.d.s0 f8 = tan_r, tan_z1, tan_inv_r - br.ret.sptk b0 ;; +(p9) fms.d.s0 f8 = tan_r, tan_z1, tan_inv_r + br.ret.sptk b0 ;; } -.endp tan# -ASM_SIZE_DIRECTIVE(tan) - +GLOBAL_IEEE754_END(tan) -.proc __libm_callout -__libm_callout: -L(TAN_DBX): +LOCAL_LIBM_ENTRY(__libm_callout) +TAN_DBX: .prologue { .mfi - nop.m 0 - fmerge.s f9 = f0,f0 -.save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs + nop.m 0 + fmerge.s f9 = f0,f0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs } ;; { .mfi - mov GR_SAVE_GP=gp - nop.f 0 -.save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 + mov GR_SAVE_GP=gp + nop.f 0 +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 } .body -{ .mfb +{ .mmb nop.m 999 - nop.f 999 - br.call.sptk.many b0=__libm_tan# ;; + nop.m 999 +(p11) br.cond.sptk.many call_tanl ;; } +// Here if we should call cotl +{ .mmb + nop.m 999 + nop.m 999 + br.call.sptk.many b0=__libm_cotl# ;; +} { .mfi - mov gp = GR_SAVE_GP - fnorm.d f8 = 
f8 - mov b0 = GR_SAVE_B0 + mov gp = GR_SAVE_GP + fnorm.d.s0 f8 = f8 + mov b0 = GR_SAVE_B0 } ;; +{ .mib + nop.m 999 + mov ar.pfs = GR_SAVE_PFS + br.ret.sptk b0 +;; +} + +// Here if we should call tanl +call_tanl: +{ .mmb + nop.m 999 + nop.m 999 + br.call.sptk.many b0=__libm_tanl# ;; +} + +{ .mfi + mov gp = GR_SAVE_GP + fnorm.d.s0 f8 = f8 + mov b0 = GR_SAVE_B0 +} +;; { .mib - nop.m 999 + nop.m 999 mov ar.pfs = GR_SAVE_PFS br.ret.sptk b0 ;; } +LOCAL_LIBM_END(__libm_callout) + +.type __libm_tanl#,@function +.global __libm_tanl# +.type __libm_cotl#,@function +.global __libm_cotl# + +LOCAL_LIBM_ENTRY(__libm_error_region) +.prologue + +// (1) +{ .mfi + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +{ .mfi +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; + +// (2) +{ .mmi + stfd [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body +// (3) +{ .mib + stfd [GR_Parameter_X] = arg_copy // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 +} +{ .mib + stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// (4) +{ .mmi + ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +LOCAL_LIBM_END(__libm_error_region) -.endp __libm_callout -ASM_SIZE_DIRECTIVE(__libm_callout) +.type __libm_error_support#,@function +.global __libm_error_support# -.type 
__libm_tan#,@function -.global __libm_tan# diff --git a/sysdeps/ia64/fpu/s_tanf.S b/sysdeps/ia64/fpu/s_tanf.S index a84009e2fe..48f82345f9 100644 --- a/sysdeps/ia64/fpu/s_tanf.S +++ b/sysdeps/ia64/fpu/s_tanf.S @@ -1,10 +1,10 @@ -.file "tanf.s" +.file "tancotf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -32,739 +32,658 @@ // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. // // History //============================================================== -// 2/02/00: Initial version -// 4/04/00 Unwind support added +// 02/02/00 Initial version +// 04/04/00 Unwind support added // 12/27/00 Improved speed +// 02/21/01 Updated to call tanl +// 05/30/02 Improved speed, added cotf. 
+// 11/25/02 Added explicit completer on fnorm +// 02/10/03 Reordered header: .section, .global, .proc, .align +// 04/17/03 Eliminated redundant stop bits // -// API +// APIs //============================================================== -// float tan( float x); +// float tanf(float) +// float cotf(float) // -// Overview of operation +// Algorithm Description for tanf //============================================================== -// If the input value in radians is |x| >= 1.xxxxx 2^10 call the -// older slower version. +// The tanf function computes the principal value of the tangent of x, +// where x is the argument in radians. // -// The new algorithm is used when |x| <= 1.xxxxx 2^9. +// There are 5 paths: +// 1. x = +/-0.0 +// Return tanf(x) = +/-0.0 // -// Represent the input X as Nfloat * pi/2 + r -// where r can be negative and |r| <= pi/4 +// 2. x = [S,Q]NaN +// Return tanf(x) = QNaN // -// tan_W = x * 2/pi -// Nfloat = round_int(tan_W) +// 3. x = +/-Inf +// Return tanf(x) = QNaN // -// tan_r = x - Nfloat * (pi/2)_hi -// tan_r = tan_r - Nfloat * (pi/2)_lo +// 4. x = r + (Pi/2)*N, N = RoundInt(x*(2/Pi)), N is even, |r|<Pi/4 +// Return tanf(x) = P19(r) = A1*r + A3*r^3 + A5*r^5 + ... + A19*r^19 = +// = r*(A1 + A3*t + A5*t^2 + ... + A19*t^9) = r*P9(t), where t = r^2 // -// We have two paths: p8, when Nfloat is even and p9. when Nfloat is odd. -// p8: tan(X) = tan(r) -// p9: tan(X) = -cot(r) +// 5. x = r + (Pi/2)*N, N = RoundInt(x*(2/Pi)), N is odd, |r|<Pi/4 +// Return tanf(x) = -1/r + P11(r) = -1/r + B1*r + B3*r^3 + ... + B11*r^11 = +// = -1/r + r*(B1 + B3*t + B5*t^2 + ... + B11*t^5) = -1/r + r*P5(t), +// where t = r^2 // -// Each is evaluated as a series. The p9 path requires 1/r. +// Algorithm Description for cotf +//============================================================== +// The cotf function computes the principal value of the cotangent of x, +// where x is the argument in radians. 
// -// The coefficients used in the series are stored in a table as -// are the pi constants. +// There are 5 paths: +// 1. x = +/-0.0 +// Return cotf(x) = +/-Inf and error handling is called // -// Registers used -//============================================================== +// 2. x = [S,Q]NaN +// Return cotf(x) = QNaN // -// predicate registers used: -// p6-10 +// 3. x = +/-Inf +// Return cotf(x) = QNaN // -// floating-point registers used: -// f10-15, f32-105 +// 4. x = r + (Pi/2)*N, N = RoundInt(x*(2/Pi)), N is odd, |r|<Pi/4 +// Return cotf(x) = P19(-r) = A1*(-r) + A3*(-r^3) + ... + A19*(-r^19) = +// = -r*(A1 + A3*t + A5*t^2 + ... + A19*t^9) = -r*P9(t), where t = r^2 +// +// 5. x = r + (Pi/2)*N, N = RoundInt(x*(2/Pi)), N is even, |r|<Pi/4 +// Return cotf(x) = 1/r + P11(-r) = 1/r + B1*(-r) + ... + B11*(-r^11) = +// = 1/r - r*(B1 + B3*t + B5*t^2 + ... + B11*t^5) = 1/r - r*P11(t), +// where t = r^2 +// +// We set p10 and clear p11 if computing tanf, vice versa for cotf. +// +// +// Registers used +//============================================================== +// Floating Point registers used: // f8, input +// f32 -> f80 // -// general registers used -// r14-18, r32-43 +// General registers used: +// r14 -> r23, r32 -> r39 +// +// Predicate registers used: +// p6 -> p13 // - -#include "libm_support.h" - // Assembly macros //============================================================== -TAN_INV_PI_BY_2_2TO64 = f10 -TAN_RSHF_2TO64 = f11 -TAN_2TOM64 = f12 -TAN_RSHF = f13 -TAN_W_2TO64_RSH = f14 -TAN_NFLOAT = f15 - -tan_Inv_Pi_by_2 = f32 -tan_Pi_by_2_hi = f33 -tan_Pi_by_2_lo = f34 - - -tan_P0 = f35 -tan_P1 = f36 -tan_P2 = f37 -tan_P3 = f38 -tan_P4 = f39 -tan_P5 = f40 -tan_P6 = f41 -tan_P7 = f42 -tan_P8 = f43 -tan_P9 = f44 -tan_P10 = f45 -tan_P11 = f46 -tan_P12 = f47 -tan_P13 = f48 -tan_P14 = f49 -tan_P15 = f50 - -tan_Q0 = f51 -tan_Q1 = f52 -tan_Q2 = f53 -tan_Q3 = f54 -tan_Q4 = f55 -tan_Q5 = f56 -tan_Q6 = f57 -tan_Q7 = f58 -tan_Q8 = f59 -tan_Q9 = f60 -tan_Q10 = 
f61 - -tan_r = f62 -tan_rsq = f63 -tan_rcube = f64 - -tan_v18 = f65 -tan_v16 = f66 -tan_v17 = f67 -tan_v12 = f68 -tan_v13 = f69 -tan_v7 = f70 -tan_v8 = f71 -tan_v4 = f72 -tan_v5 = f73 -tan_v15 = f74 -tan_v11 = f75 -tan_v14 = f76 -tan_v3 = f77 -tan_v6 = f78 -tan_v10 = f79 -tan_v2 = f80 -tan_v9 = f81 -tan_v1 = f82 -tan_int_Nfloat = f83 -tan_Nfloat = f84 - -tan_NORM_f8 = f85 -tan_W = f86 - -tan_y0 = f87 -tan_d = f88 -tan_y1 = f89 -tan_dsq = f90 -tan_y2 = f91 -tan_d4 = f92 -tan_inv_r = f93 - -tan_z1 = f94 -tan_z2 = f95 -tan_z3 = f96 -tan_z4 = f97 -tan_z5 = f98 -tan_z6 = f99 -tan_z7 = f100 -tan_z8 = f101 -tan_z9 = f102 -tan_z10 = f103 -tan_z11 = f104 -tan_z12 = f105 - - -///////////////////////////////////////////////////////////// - -tan_GR_sig_inv_pi_by_2 = r14 -tan_GR_rshf_2to64 = r15 -tan_GR_exp_2tom64 = r16 -tan_GR_n = r17 -tan_GR_rshf = r18 - -tan_AD = r33 -tan_GR_10009 = r34 -tan_GR_17_ones = r35 -tan_GR_N_odd_even = r36 -tan_GR_N = r37 -tan_signexp = r38 -tan_exp = r39 -tan_ADQ = r40 - -GR_SAVE_PFS = r41 -GR_SAVE_B0 = r42 -GR_SAVE_GP = r43 - - -#ifdef _LIBC -.rodata -#else -.data -#endif +// integer registers +rExp = r14 +rSignMask = r15 +rRshf = r16 +rScFctrExp = r17 +rIntN = r18 +rSigRcpPiby2 = r19 +rScRshf = r20 +rCoeffA = r21 +rCoeffB = r22 +rExpCut = r23 + +GR_SAVE_B0 = r33 +GR_SAVE_PFS = r34 +GR_SAVE_GP = r35 +GR_Parameter_X = r36 +GR_Parameter_Y = r37 +GR_Parameter_RESULT = r38 +GR_Parameter_Tag = r39 + +//============================================================== +// floating point registers +fScRcpPiby2 = f32 +fScRshf = f33 +fNormArg = f34 +fScFctr = f35 +fRshf = f36 +fShiftedN = f37 +fN = f38 +fR = f39 +fA01 = f40 +fA03 = f41 +fA05 = f42 +fA07 = f43 +fA09 = f44 +fA11 = f45 +fA13 = f46 +fA15 = f47 +fA17 = f48 +fA19 = f49 +fB01 = f50 +fB03 = f51 +fB05 = f52 +fB07 = f53 +fB09 = f54 +fB11 = f55 +fA03_01 = f56 +fA07_05 = f57 +fA11_09 = f58 +fA15_13 = f59 +fA19_17 = f60 +fA11_05 = f61 +fA19_13 = f62 +fA19_05 = f63 +fRbyA03_01 = f64 +fB03_01 = f65 
+fB07_05 = f66 +fB11_09 = f67 +fB11_05 = f68 +fRbyB03_01 = f69 +fRbyB11_01 = f70 +fRp2 = f71 +fRp4 = f72 +fRp8 = f73 +fRp5 = f74 +fY0 = f75 +fY1 = f76 +fD = f77 +fDp2 = f78 +fInvR = f79 +fPiby2 = f80 +//============================================================== -.align 16 -double_tan_constants: -ASM_TYPE_DIRECTIVE(double_tan_constants,@object) -// data8 0xA2F9836E4E44152A, 0x00003FFE // 2/pi - data8 0xC90FDAA22168C234, 0x00003FFF // pi/2 hi - - data8 0xBEEA54580DDEA0E1 // P14 - data8 0x3ED3021ACE749A59 // P15 - data8 0xBEF312BD91DC8DA1 // P12 - data8 0x3EFAE9AFC14C5119 // P13 - data8 0x3F2F342BF411E769 // P8 - data8 0x3F1A60FC9F3B0227 // P9 - data8 0x3EFF246E78E5E45B // P10 - data8 0x3F01D9D2E782875C // P11 - data8 0x3F8226E34C4499B6 // P4 - data8 0x3F6D6D3F12C236AC // P5 - data8 0x3F57DA1146DCFD8B // P6 - data8 0x3F43576410FE3D75 // P7 - data8 0x3FD5555555555555 // P0 - data8 0x3FC11111111111C2 // P1 - data8 0x3FABA1BA1BA0E850 // P2 - data8 0x3F9664F4886725A7 // P3 -ASM_SIZE_DIRECTIVE(double_tan_constants) - -double_Q_tan_constants: -ASM_TYPE_DIRECTIVE(double_Q_tan_constants,@object) - data8 0xC4C6628B80DC1CD1, 0x00003FBF // pi/2 lo - data8 0x3E223A73BA576E48 // Q8 - data8 0x3DF54AD8D1F2CA43 // Q9 - data8 0x3EF66A8EE529A6AA // Q4 - data8 0x3EC2281050410EE6 // Q5 - data8 0x3E8D6BB992CC3CF5 // Q6 - data8 0x3E57F88DE34832E4 // Q7 - data8 0x3FD5555555555555 // Q0 - data8 0x3F96C16C16C16DB8 // Q1 - data8 0x3F61566ABBFFB489 // Q2 - data8 0x3F2BBD77945C1733 // Q3 - data8 0x3D927FB33E2B0E04 // Q10 -ASM_SIZE_DIRECTIVE(double_Q_tan_constants) - - - -.align 32 -.global tanf# -#ifdef _LIBC -.global __tanf# -#endif - -//////////////////////////////////////////////////////// +RODATA +.align 16 +LOCAL_OBJECT_START(coeff_A) +data8 0x3FF0000000000000 // A1 = 1.00000000000000000000e+00 +data8 0x3FD5555556BCE758 // A3 = 3.33333334641442641606e-01 +data8 0x3FC111105C2DAE48 // A5 = 1.33333249100689099175e-01 +data8 0x3FABA1F876341060 // A7 = 5.39701122561673229739e-02 +data8 
0x3F965FB86D12A38D // A9 = 2.18495194027670719750e-02 +data8 0x3F8265F62415F9D6 // A11 = 8.98353860497717439465e-03 +data8 0x3F69E3AE64CCF58D // A13 = 3.16032468108912746342e-03 +data8 0x3F63920D09D0E6F6 // A15 = 2.38897844840557235331e-03 +LOCAL_OBJECT_END(coeff_A) + +LOCAL_OBJECT_START(coeff_B) +data8 0xC90FDAA22168C235, 0x3FFF // pi/2 +data8 0x3FD55555555358DB // B1 = 3.33333333326107426583e-01 +data8 0x3F96C16C252F643F // B3 = 2.22222230621336129239e-02 +data8 0x3F61566243AB3C60 // B5 = 2.11638633968606896785e-03 +data8 0x3F2BC1169BD4438B // B7 = 2.11748132564551094391e-04 +data8 0x3EF611B4CEA056A1 // B9 = 2.10467959860990200942e-05 +data8 0x3EC600F9E32194BF // B11 = 2.62305891234274186608e-06 +data8 0xBF42BA7BCC177616 // A17 =-5.71546981685324877205e-04 +data8 0x3F4F2614BC6D3BB8 // A19 = 9.50584530849832782542e-04 +LOCAL_OBJECT_END(coeff_B) .section .text -.proc tanf# -#ifdef _LIBC -.proc __tanf# -#endif -.align 32 -tanf: -#ifdef _LIBC -__tanf: -#endif -// The initial fnorm will take any unmasked faults and -// normalize any single/double unorms + +LOCAL_LIBM_ENTRY(cotf) { .mlx - alloc r32=ar.pfs,1,11,0,0 - movl tan_GR_sig_inv_pi_by_2 = 0xA2F9836E4E44152A // significand of 2/pi + getf.exp rExp = f8 // ***** Get 2^17 * s + E + movl rSigRcpPiby2= 0xA2F9836E4E44152A // significand of 2/Pi } { .mlx - addl tan_AD = @ltoff(double_tan_constants), gp - movl tan_GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+63+1) + addl rCoeffA = @ltoff(coeff_A), gp + movl rScRshf = 0x47e8000000000000 // 1.5*2^(63+63+1) } ;; { .mfi - ld8 tan_AD = [tan_AD] - fnorm tan_NORM_f8 = f8 - mov tan_GR_exp_2tom64 = 0xffff-64 // exponent of scaling factor 2^-64 + alloc r32 = ar.pfs, 0, 4, 4, 0 + fclass.m p9, p0 = f8, 0xc3 // Test for x=nan + cmp.eq p11, p10 = r0, r0 // if p11=1 we compute cotf } -{ .mlx - nop.m 999 - movl tan_GR_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift +{ .mib + ld8 rCoeffA = [rCoeffA] + mov rExpCut = 0x10009 // cutoff for exponent + br.cond.sptk Common_Path 
} ;; +LOCAL_LIBM_END(cotf) -// Form two constants we need -// 2/pi * 2^1 * 2^63, scaled by 2^64 since we just loaded the significand -// 1.1000...000 * 2^(63+63+1) to right shift int(W) into the significand -{ .mmi - setf.sig TAN_INV_PI_BY_2_2TO64 = tan_GR_sig_inv_pi_by_2 - setf.d TAN_RSHF_2TO64 = tan_GR_rshf_2to64 - mov tan_GR_17_ones = 0x1ffff ;; -} - +GLOBAL_IEEE754_ENTRY(tanf) -// Form another constant -// 2^-64 for scaling Nfloat -// 1.1000...000 * 2^63, the right shift constant -{ .mmf - setf.exp TAN_2TOM64 = tan_GR_exp_2tom64 - adds tan_ADQ = double_Q_tan_constants - double_tan_constants, tan_AD - fclass.m.unc p6,p0 = f8, 0x07 // Test for x=0 +{ .mlx + getf.exp rExp = f8 // ***** Get 2^17 * s + E + movl rSigRcpPiby2= 0xA2F9836E4E44152A // significand of 2/Pi } -;; - - -// Form another constant -// 2^-64 for scaling Nfloat -// 1.1000...000 * 2^63, the right shift constant -{ .mmf - setf.d TAN_RSHF = tan_GR_rshf - ldfe tan_Pi_by_2_hi = [tan_AD],16 - fclass.m.unc p7,p0 = f8, 0x23 // Test for x=inf +{ .mlx + addl rCoeffA = @ltoff(coeff_A), gp + movl rScRshf = 0x47e8000000000000 // 1.5*2^(63+63+1) } ;; -{ .mfb - ldfe tan_Pi_by_2_lo = [tan_ADQ],16 - fclass.m.unc p8,p0 = f8, 0xc3 // Test for x=nan -(p6) br.ret.spnt b0 ;; // Exit for x=0 -} - { .mfi - ldfpd tan_P14,tan_P15 = [tan_AD],16 -(p7) frcpa.s0 f8,p9=f0,f0 // Set qnan indef if x=inf - mov tan_GR_10009 = 0x10009 + alloc r32 = ar.pfs, 0, 4, 4, 0 + fclass.m p9, p0 = f8, 0xc3 // Test for x=nan + cmp.eq p10, p11 = r0, r0 // if p10=1 we compute tanf } { .mib - ldfpd tan_Q8,tan_Q9 = [tan_ADQ],16 - nop.i 999 -(p7) br.ret.spnt b0 ;; // Exit for x=inf + ld8 rCoeffA = [rCoeffA] + mov rExpCut = 0x10009 // cutoff for exponent + nop.b 0 } +;; +// Below is common path for both tanf and cotf +Common_Path: { .mfi - ldfpd tan_P12,tan_P13 = [tan_AD],16 -(p8) fma.s f8=f8,f1,f8 // Set qnan if x=nan - nop.i 999 + setf.sig fScRcpPiby2 = rSigRcpPiby2 // 2^(63+1)*(2/Pi) + fclass.m p8, p0 = f8, 0x23 // Test for x=inf + mov 
rSignMask = 0x1ffff // mask for sign bit } -{ .mib - ldfpd tan_Q4,tan_Q5 = [tan_ADQ],16 - nop.i 999 -(p8) br.ret.spnt b0 ;; // Exit for x=nan +{ .mlx + setf.d fScRshf = rScRshf // 1.5*2^(63+63+1) + movl rRshf = 0x43e8000000000000 // 1.5 2^63 for right shift } +;; -{ .mmi - getf.exp tan_signexp = tan_NORM_f8 - ldfpd tan_P8,tan_P9 = [tan_AD],16 - nop.i 999 ;; +{ .mfi + and rSignMask = rSignMask, rExp // clear sign bit +(p10) fclass.m.unc p7, p0 = f8, 0x07 // Test for x=0 (for tanf) + mov rScFctrExp = 0xffff-64 // exp of scaling factor +} +{ .mfb + adds rCoeffB = coeff_B - coeff_A, rCoeffA +(p9) fma.s.s0 f8 = f8, f1, f8 // Set qnan if x=nan +(p9) br.ret.spnt b0 // Exit for x=nan } +;; -// Multiply x by scaled 2/pi and add large const to shift integer part of W to -// rightmost bits of significand { .mfi - ldfpd tan_Q6,tan_Q7 = [tan_ADQ],16 - fma.s1 TAN_W_2TO64_RSH = tan_NORM_f8,TAN_INV_PI_BY_2_2TO64,TAN_RSHF_2TO64 - nop.i 999 ;; + cmp.ge p6, p0 = rSignMask, rExpCut // p6 = (E >= 0x10009) +(p8) frcpa.s0 f8, p0 = f0, f0 // Set qnan indef if x=inf + mov GR_Parameter_Tag = 227 // (cotf) } - -{ .mmi - ldfpd tan_P10,tan_P11 = [tan_AD],16 - nop.m 999 - and tan_exp = tan_GR_17_ones, tan_signexp ;; +{ .mbb + ldfe fPiby2 = [rCoeffB], 16 +(p8) br.ret.spnt b0 // Exit for x=inf +(p6) br.cond.spnt Huge_Argument // Branch if |x|>=2^10 } +;; +{ .mfi + nop.m 0 +(p11) fclass.m.unc p6, p0 = f8, 0x07 // Test for x=0 (for cotf) + nop.i 0 +} +{ .mfb + nop.m 0 + fnorm.s0 fNormArg = f8 +(p7) br.ret.spnt b0 // Exit for x=0 (for tanf) +} +;; -// p7 is true if we must call DBX TAN -// p7 is true if f8 exp is > 0x10009 (which includes all ones -// NAN or inf) -{ .mmi - ldfpd tan_Q0,tan_Q1 = [tan_ADQ],16 - cmp.ge.unc p7,p0 = tan_exp,tan_GR_10009 - nop.i 999 ;; +{ .mmf + ldfpd fA01, fA03 = [rCoeffA], 16 + ldfpd fB01, fB03 = [rCoeffB], 16 + fmerge.s f10 = f8, f8 // Save input for error call } +;; +{ .mmf + setf.exp fScFctr = rScFctrExp // get as real + setf.d fRshf = rRshf // get right shifter as 
real +(p6) frcpa.s0 f8, p0 = f1, f8 // cotf(+-0) = +-Inf +} +;; { .mmb - ldfpd tan_P4,tan_P5 = [tan_AD],16 - nop.m 999 -(p7) br.cond.spnt L(TAN_DBX) ;; + ldfpd fA05, fA07 = [rCoeffA], 16 + ldfpd fB05, fB07 = [rCoeffB], 16 +(p6) br.cond.spnt __libm_error_region // call error support if cotf(+-0) } - +;; { .mmi - ldfpd tan_Q2,tan_Q3 = [tan_ADQ],16 - nop.m 999 - nop.i 999 ;; -} - - - -// TAN_NFLOAT = Round_Int_Nearest(tan_W) -{ .mfi - ldfpd tan_P6,tan_P7 = [tan_AD],16 - fms.s1 TAN_NFLOAT = TAN_W_2TO64_RSH,TAN_2TOM64,TAN_RSHF - nop.i 999 ;; + ldfpd fA09, fA11 = [rCoeffA], 16 + ldfpd fB09, fB11 = [rCoeffB], 16 + nop.i 0 } - +;; { .mfi - ldfd tan_Q10 = [tan_ADQ] - nop.f 999 - nop.i 999 ;; + nop.m 0 + fma.s1 fShiftedN = fNormArg,fScRcpPiby2,fScRshf // x*2^70*(2/Pi)+ScRshf + nop.i 0 } - +;; { .mfi - ldfpd tan_P0,tan_P1 = [tan_AD],16 - nop.f 999 - nop.i 999 ;; + nop.m 0 + fms.s1 fN = fShiftedN, fScFctr, fRshf // N = Y*2^(-70) - Rshf + nop.i 0 } +;; - +.pred.rel "mutex", p10, p11 { .mfi - getf.sig tan_GR_n = TAN_W_2TO64_RSH - nop.f 999 - nop.i 999 ;; + getf.sig rIntN = fShiftedN // get N as integer +(p10) fnma.s1 fR = fN, fPiby2, fNormArg // R = x - (Pi/2)*N (tanf) + nop.i 0 } - -// tan_r = -tan_Nfloat * tan_Pi_by_2_hi + x { .mfi - ldfpd tan_P2,tan_P3 = [tan_AD] - fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_hi, tan_NORM_f8 - nop.i 999 ;; + nop.m 0 +(p11) fms.s1 fR = fN, fPiby2, fNormArg // R = (Pi/2)*N - x (cotf) + nop.i 0 } +;; - -// p8 ==> even -// p9 ==> odd { .mmi - and tan_GR_N_odd_even = 0x1, tan_GR_n ;; - nop.m 999 - cmp.eq.unc p8,p9 = tan_GR_N_odd_even, r0 ;; + ldfpd fA13, fA15 = [rCoeffA], 16 + ldfpd fA17, fA19 = [rCoeffB], 16 + nop.i 0 } +;; - -// tan_r = tan_r -tan_Nfloat * tan_Pi_by_2_lo -{ .mfi - nop.m 999 - fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_lo, tan_r - nop.i 999 ;; -} - - +Return_From_Huges: { .mfi - nop.m 999 - fma.s1 tan_rsq = tan_r, tan_r, f0 - nop.i 999 ;; + nop.m 0 + fma.s1 fRp2 = fR, fR, f0 // R^2 +(p11) add rIntN = 0x1, rIntN // N = N + 1 (cotf) } - 
+;; { .mfi - nop.m 999 -(p9) frcpa.s1 tan_y0, p10 = f1,tan_r - nop.i 999 ;; + nop.m 0 + frcpa.s1 fY0, p0 = f1, fR // Y0 ~ 1/R + tbit.z p8, p9 = rIntN, 0 // p8=1 if N is even } +;; - +// Below are mixed polynomial calculations (mixed for even and odd N) { .mfi - nop.m 999 -(p8) fma.s1 tan_v18 = tan_rsq, tan_P15, tan_P14 - nop.i 999 + nop.m 0 +(p9) fma.s1 fB03_01 = fRp2, fB03, fB01 // R^2*B3 + B1 + nop.i 0 } { .mfi - nop.m 999 -(p8) fma.s1 tan_v4 = tan_rsq, tan_P1, tan_P0 - nop.i 999 ;; + nop.m 0 + fma.s1 fRp4 = fRp2, fRp2, f0 // R^4 + nop.i 0 } - - +;; { .mfi - nop.m 999 -(p8) fma.s1 tan_v16 = tan_rsq, tan_P13, tan_P12 - nop.i 999 + nop.m 0 +(p8) fma.s1 fA15_13 = fRp2, fA15, fA13 // R^2*A15 + A13 + nop.i 0 } { .mfi - nop.m 999 -(p8) fma.s1 tan_v17 = tan_rsq, tan_rsq, f0 - nop.i 999 ;; + nop.m 0 +(p8) fma.s1 fA19_17 = fRp2, fA19, fA17 // R^2*A19 + A17 + nop.i 0 } - - +;; { .mfi - nop.m 999 -(p8) fma.s1 tan_v12 = tan_rsq, tan_P9, tan_P8 - nop.i 999 + nop.m 0 +(p8) fma.s1 fA07_05 = fRp2, fA07, fA05 // R^2*A7 + A5 + nop.i 0 } { .mfi - nop.m 999 -(p8) fma.s1 tan_v13 = tan_rsq, tan_P11, tan_P10 - nop.i 999 ;; + nop.m 0 +(p8) fma.s1 fA11_09 = fRp2, fA11, fA09 // R^2*A11 + A9 + nop.i 0 } - - +;; { .mfi - nop.m 999 -(p8) fma.s1 tan_v7 = tan_rsq, tan_P5, tan_P4 - nop.i 999 + nop.m 0 +(p9) fma.s1 fB07_05 = fRp2, fB07, fB05 // R^2*B7 + B5 + nop.i 0 } { .mfi - nop.m 999 -(p8) fma.s1 tan_v8 = tan_rsq, tan_P7, tan_P6 - nop.i 999 ;; + nop.m 0 +(p9) fma.s1 fB11_09 = fRp2, fB11, fB09 // R^2*B11 + B9 + nop.i 0 } - - +;; { .mfi - nop.m 999 -(p9) fnma.s1 tan_d = tan_r, tan_y0, f1 - nop.i 999 + nop.m 0 +(p9) fnma.s1 fD = fR, fY0, f1 // D = 1 - R*Y0 + nop.i 0 } { .mfi - nop.m 999 -(p8) fma.s1 tan_v5 = tan_rsq, tan_P3, tan_P2 - nop.i 999 ;; + nop.m 0 +(p8) fma.s1 fA03_01 = fRp2, fA03, fA01 // R^2*A3 + A1 + nop.i 0 } - - +;; { .mfi - nop.m 999 -(p9) fma.s1 tan_z11 = tan_rsq, tan_Q9, tan_Q8 - nop.i 999 + nop.m 0 + fma.s1 fRp8 = fRp4, fRp4, f0 // R^8 + nop.i 0 } { .mfi - nop.m 999 -(p9) 
fma.s1 tan_z12 = tan_rsq, tan_rsq, f0 - nop.i 999 ;; + nop.m 0 + fma.s1 fRp5 = fR, fRp4, f0 // R^5 + nop.i 0 } - +;; { .mfi - nop.m 999 -(p8) fma.s1 tan_v15 = tan_v17, tan_v18, tan_v16 - nop.i 999 + nop.m 0 +(p8) fma.s1 fA11_05 = fRp4, fA11_09, fA07_05 // R^4*(R^2*A11 + A9) + ... + nop.i 0 } { .mfi - nop.m 999 -(p9) fma.s1 tan_z7 = tan_rsq, tan_Q5, tan_Q4 - nop.i 999 ;; + nop.m 0 +(p8) fma.s1 fA19_13 = fRp4, fA19_17, fA15_13 // R^4*(R^2*A19 + A17) + .. + nop.i 0 } - +;; { .mfi - nop.m 999 -(p8) fma.s1 tan_v11 = tan_v17, tan_v13, tan_v12 - nop.i 999 + nop.m 0 +(p9) fma.s1 fB11_05 = fRp4, fB11_09, fB07_05 // R^4*(R^2*B11 + B9) + ... + nop.i 0 } { .mfi - nop.m 999 -(p9) fma.s1 tan_z8 = tan_rsq, tan_Q7, tan_Q6 - nop.i 999 ;; + nop.m 0 +(p9) fma.s1 fRbyB03_01 = fR, fB03_01, f0 // R*(R^2*B3 + B1) + nop.i 0 } - - +;; { .mfi - nop.m 999 -(p8) fma.s1 tan_v14 = tan_v17, tan_v17, f0 - nop.i 999 + nop.m 0 +(p9) fma.s1 fY1 = fY0, fD, fY0 // Y1 = Y0*D + Y0 + nop.i 0 } { .mfi - nop.m 999 -(p9) fma.s1 tan_z3 = tan_rsq, tan_Q1, tan_Q0 - nop.i 999 ;; + nop.m 0 +(p9) fma.s1 fDp2 = fD, fD, f0 // D^2 + nop.i 0 } - - - +;; { .mfi - nop.m 999 -(p8) fma.s1 tan_v3 = tan_v17, tan_v5, tan_v4 - nop.i 999 + nop.m 0 + // R^8*(R^6*A19 + R^4*A17 + R^2*A15 + A13) + R^6*A11 + R^4*A9 + R^2*A7 + A5 +(p8) fma.d.s1 fA19_05 = fRp8, fA19_13, fA11_05 + nop.i 0 } { .mfi - nop.m 999 -(p8) fma.s1 tan_v6 = tan_v17, tan_v8, tan_v7 - nop.i 999 ;; + nop.m 0 +(p8) fma.d.s1 fRbyA03_01 = fR, fA03_01, f0 // R*(R^2*A3 + A1) + nop.i 0 } - - +;; { .mfi - nop.m 999 -(p9) fma.s1 tan_y1 = tan_y0, tan_d, tan_y0 - nop.i 999 + nop.m 0 +(p9) fma.d.s1 fInvR = fY1, fDp2, fY1 // 1/R = Y1*D^2 + Y1 + nop.i 0 } { .mfi - nop.m 999 -(p9) fma.s1 tan_dsq = tan_d, tan_d, f0 - nop.i 999 ;; + nop.m 0 + // R^5*(R^6*B11 + R^4*B9 + R^2*B7 + B5) + R^3*B3 + R*B1 +(p9) fma.d.s1 fRbyB11_01 = fRp5, fB11_05, fRbyB03_01 + nop.i 0 } +;; - +.pred.rel "mutex", p8, p9 { .mfi - nop.m 999 -(p9) fma.s1 tan_z10 = tan_z12, tan_Q10, tan_z11 - nop.i 999 + 
nop.m 0 + // Result = R^5*(R^14*A19 + R^12*A17 + R^10*A15 + ...) + R^3*A3 + R*A1 +(p8) fma.s.s0 f8 = fRp5, fA19_05, fRbyA03_01 + nop.i 0 } -{ .mfi - nop.m 999 -(p9) fma.s1 tan_z9 = tan_z12, tan_z12,f0 - nop.i 999 ;; +{ .mfb + nop.m 0 + // Result = -1/R + R^11*B11 + R^9*B9 + R^7*B7 + R^5*B5 + R^3*B3 + R*B1 +(p9) fnma.s.s0 f8 = f1, fInvR, fRbyB11_01 + br.ret.sptk b0 // exit for main path } +;; +GLOBAL_IEEE754_END(tanf) + +LOCAL_LIBM_ENTRY(__libm_callout) +Huge_Argument: +.prologue { .mfi - nop.m 999 -(p9) fma.s1 tan_z4 = tan_rsq, tan_Q3, tan_Q2 - nop.i 999 -} -{ .mfi - nop.m 999 -(p9) fma.s1 tan_z6 = tan_z12, tan_z8, tan_z7 - nop.i 999 ;; + nop.m 0 + fmerge.s f9 = f0,f0 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs } - - +;; { .mfi - nop.m 999 -(p8) fma.s1 tan_v10 = tan_v14, tan_v15, tan_v11 - nop.i 999 ;; + mov GR_SAVE_GP=gp + nop.f 0 +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 } - - -{ .mfi +.body +{ .mmb nop.m 999 -(p9) fma.s1 tan_y2 = tan_y1, tan_d, tan_y0 - nop.i 999 -} -{ .mfi nop.m 999 -(p9) fma.s1 tan_d4 = tan_dsq, tan_dsq, tan_d - nop.i 999 ;; +(p10) br.cond.sptk.many call_tanl ;; } - -{ .mfi +// Here if we should call cotl (p10=0, p11=1) +{ .mmb nop.m 999 -(p8) fma.s1 tan_v2 = tan_v14, tan_v6, tan_v3 - nop.i 999 -} -{ .mfi nop.m 999 -(p8) fma.s1 tan_v9 = tan_v14, tan_v14, f0 - nop.i 999 ;; + br.call.sptk.many b0=__libm_cotl# ;; } - { .mfi - nop.m 999 -(p9) fma.s1 tan_z2 = tan_z12, tan_z4, tan_z3 - nop.i 999 + mov gp = GR_SAVE_GP + fnorm.s.s0 f8 = f8 + mov b0 = GR_SAVE_B0 } -{ .mfi +;; + +{ .mib nop.m 999 -(p9) fma.s1 tan_z5 = tan_z9, tan_z10, tan_z6 - nop.i 999 ;; + mov ar.pfs = GR_SAVE_PFS + br.ret.sptk b0 +;; } - -{ .mfi +// Here if we should call tanl (p10=1, p11=0) +call_tanl: +{ .mmb nop.m 999 -(p9) fma.s1 tan_inv_r = tan_d4, tan_y2, tan_y0 - nop.i 999 -} -{ .mfi nop.m 999 -(p8) fma.s1 tan_rcube = tan_rsq, tan_r, f0 - nop.i 999 ;; + br.call.sptk.many b0=__libm_tanl# ;; } - - { .mfi - nop.m 999 -(p8) fma.s1 tan_v1 = tan_v9, tan_v10, tan_v2 - nop.i 
999 + mov gp = GR_SAVE_GP + fnorm.s.s0 f8 = f8 + mov b0 = GR_SAVE_B0 } -{ .mfi +;; + +{ .mib nop.m 999 -(p9) fma.s1 tan_z1 = tan_z9, tan_z5, tan_z2 - nop.i 999 ;; + mov ar.pfs = GR_SAVE_PFS + br.ret.sptk b0 +;; } +LOCAL_LIBM_END(__libm_callout) - -{ .mfi - nop.m 999 -(p8) fma.s.s0 f8 = tan_v1, tan_rcube, tan_r - nop.i 999 -} -{ .mfb - nop.m 999 -(p9) fms.s.s0 f8 = tan_r, tan_z1, tan_inv_r - br.ret.sptk b0 ;; -} -.endp tanf# -ASM_SIZE_DIRECTIVE(tanf#) +.type __libm_tanl#,@function +.global __libm_tanl# +.type __libm_cotl#,@function +.global __libm_cotl# -.proc __libm_callout -__libm_callout: -L(TAN_DBX): +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue +// (1) { .mfi - nop.m 0 - fmerge.s f9 = f0,f0 + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } -;; - { .mfi - mov GR_SAVE_GP=gp - nop.f 0 +.fframe 64 + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp +};; + +// (2) +{ .mmi + stfs [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 -} + mov GR_SAVE_B0=b0 // Save b0 +};; .body -{ .mfb - nop.m 999 - nop.f 999 - br.call.sptk.many b0=__libm_tan# ;; -} - - -{ .mfi - mov gp = GR_SAVE_GP - fnorm.s f8 = f8 - mov b0 = GR_SAVE_B0 +// (3) +{ .mib + stfs [GR_Parameter_X] = f10 // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 } -;; - +{ .mib + stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; +// (4) +{ .mmi + ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address +};; { .mib - nop.m 999 - mov ar.pfs = 
GR_SAVE_PFS - br.ret.sptk b0 -;; -} + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; +LOCAL_LIBM_END(__libm_error_region) -.endp __libm_callout -ASM_SIZE_DIRECTIVE(__libm_callout) +.type __libm_error_support#,@function +.global __libm_error_support# -.type __libm_tan#,@function -.global __libm_tan# diff --git a/sysdeps/ia64/fpu/s_tanl.S b/sysdeps/ia64/fpu/s_tanl.S index e13e6c6cbd..345a059c5f 100644 --- a/sysdeps/ia64/fpu/s_tanl.S +++ b/sysdeps/ia64/fpu/s_tanl.S @@ -1,10 +1,10 @@ -.file "tanl.s" +.file "tancotl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, -// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,50 +35,77 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
// -// ********************************************************************* +//********************************************************************* // // History: // -// 2/02/2000 (hand-optimized) -// 4/04/00 Unwind support added +// 02/02/00 (hand-optimized) +// 04/04/00 Unwind support added // 12/28/00 Fixed false invalid flags +// 02/06/02 Improved speed +// 05/07/02 Changed interface to __libm_pi_by_2_reduce +// 05/30/02 Added cotl +// 02/10/03 Reordered header: .section, .global, .proc, .align; +// used data8 for long double table values +// 05/15/03 Reformatted data tables // -// ********************************************************************* +//********************************************************************* // -// Function: tanl(x) = tangent(x), for double-extended precision x values +// Functions: tanl(x) = tangent(x), for double-extended precision x values +// cotl(x) = cotangent(x), for double-extended precision x values // -// ********************************************************************* +//********************************************************************* // // Resources Used: // // Floating-Point Registers: f8 (Input and Return Value) // f9-f15 -// f32-f112 +// f32-f121 // // General Purpose Registers: -// r32-r48 -// r49-r50 (Used to pass arguments to pi_by_2 reduce routine) +// r14-r26,r32-r57 // // Predicate Registers: p6-p15 // -// ********************************************************************* +//********************************************************************* // -// IEEE Special Conditions: +// IEEE Special Conditions for tanl: // // Denormal fault raised on denormal inputs // Overflow exceptions do not occur -// Underflow exceptions raised when appropriate for tan +// Underflow exceptions raised when appropriate for tan // (No specialized error handling for this routine) // Inexact raised when appropriate by algorithm // -// tan(SNaN) = QNaN -// tan(QNaN) = QNaN -// tan(inf) = QNaN -// tan(+/-0) = +/-0 +// 
tanl(SNaN) = QNaN +// tanl(QNaN) = QNaN +// tanl(inf) = QNaN +// tanl(+/-0) = +/-0 +// +//********************************************************************* +// +// IEEE Special Conditions for cotl: +// +// Denormal fault raised on denormal inputs +// Overflow exceptions occur at zero and near zero +// Underflow exceptions do not occur +// Inexact raised when appropriate by algorithm +// +// cotl(SNaN) = QNaN +// cotl(QNaN) = QNaN +// cotl(inf) = QNaN +// cotl(+/-0) = +/-Inf and error handling is called +// +//********************************************************************* // -// ********************************************************************* +// Below are mathematical and algorithmic descriptions for tanl. +// For cotl we use next identity cot(x) = -tan(x + Pi/2). +// So, to compute cot(x) we just need to increment N (N = N + 1) +// and invert sign of the computed result. +// +//********************************************************************* // // Mathematical Description // @@ -106,13 +133,13 @@ // ------- // // tan(r + c) = r + c + r^3/3 ...accurately -// -cot(r + c) = -1/(r+c) + r/3 ...accurately +// -cot(r + c) = -1/(r+c) + r/3 ...accurately // // Case 4: // ------- // // tan(r + c) = r + c + r^3/3 + 2r^5/15 ...accurately -// -cot(r + c) = -1/(r+c) + r/3 + r^3/45 ...accurately +// -cot(r + c) = -1/(r+c) + r/3 + r^3/45 ...accurately // // // The only cases left are Cases 1 and 3 of the argument reduction @@ -143,13 +170,13 @@ // Since Arg = N pi/4 + r + c accurately, we have // // tan(Arg) = tan(r+c) for N even, -// = -cot(r+c) otherwise. +// = -cot(r+c) otherwise. // // Here for this case, both tan(r) and -cot(r) can be approximated // by simple polynomials: // // tan(r) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19 -// -cot(r) = -1/r + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13 +// -cot(r) = -1/r + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13 // // accurately. 
Since |r| is relatively small, tan(r+c) and // -cot(r+c) can be accurately approximated by replacing r with @@ -178,21 +205,21 @@ // The required calculation is either // // tan(r + c) = tan(r) + correction, or -// -cot(r + c) = -cot(r) + correction. +// -cot(r + c) = -cot(r) + correction. // // Specifically, // // tan(r + c) = tan(r) + c tan'(r) + O(c^2) -// = tan(r) + c sec^2(r) + O(c^2) -// = tan(r) + c SEC_sq ...accurately +// = tan(r) + c sec^2(r) + O(c^2) +// = tan(r) + c SEC_sq ...accurately // as long as SEC_sq approximates sec^2(r) // to, say, 5 bits or so. // // Similarly, // -// -cot(r + c) = -cot(r) - c cot'(r) + O(c^2) -// = -cot(r) + c csc^2(r) + O(c^2) -// = -cot(r) + c CSC_sq ...accurately +// -cot(r + c) = -cot(r) - c cot'(r) + O(c^2) +// = -cot(r) + c csc^2(r) + O(c^2) +// = -cot(r) + c CSC_sq ...accurately // as long as CSC_sq approximates csc^2(r) // to, say, 5 bits or so. // @@ -208,14 +235,14 @@ // where // // B = 2^k * 1.b_1 b_2 ... b_5 1 -// x = |r| - B +// x = |r| - B // // Now, // tan(B) + tan(x) // tan( B + x ) = ------------------------ // 1 - tan(B)*tan(x) // -// / \ +// / \ // | tan(B) + tan(x) | // = tan(B) + | ------------------------ - tan(B) | @@ -248,7 +275,7 @@ // cot( B + x ) = ------------------------ // tan(B) + tan(x) // -// / \ +// / \ // | 1 - tan(B)*tan(x) | // = cot(B) + | ----------------------- - cot(B) | @@ -273,7 +300,7 @@ // Arg = N * pi/2 + r + c ...accurately // // tan(Arg) = tan(r) + correction if N is even; -// = -cot(r) + correction otherwise. +// = -cot(r) + correction otherwise. // // For Cases 2 and 4, // @@ -292,8 +319,8 @@ // tan(Arg) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19 // + c*(1 + r^2) N even // -// = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13 -// + Q1_1*c N odd +// = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... 
+ Q1_7 r^13 +// + Q1_1*c N odd // // Case normal_r: 2^(-2) <= |r| <= pi/4 // @@ -304,15 +331,15 @@ // // tan(Arg) = tan(r) + c*sec^2(r) // = tan( sgn_r * (B+x) ) + c * sec^2(|r|) -// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(|r|) ) -// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(B) ) +// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(|r|) ) +// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(B) ) // // since B approximates |r| to 2^(-6) in relative accuracy. // // / (1/[sin(B)*cos(B)]) * tan(x) // tan(Arg) = sgn_r * | tan(B) + -------------------------------- // \ cot(B) - tan(x) -// \ +// \ // + CORR | // / @@ -324,15 +351,15 @@ // // tan(Arg) = -cot(r) + c*csc^2(r) // = -cot( sgn_r * (B+x) ) + c * csc^2(|r|) -// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(|r|) ) -// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(B) ) +// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(|r|) ) +// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(B) ) // // since B approximates |r| to 2^(-6) in relative accuracy. // // / (1/[sin(B)*cos(B)]) * tan(x) // tan(Arg) = sgn_r * | -cot(B) + -------------------------------- // \ tan(B) + tan(x) -// \ +// \ // + CORR | // / @@ -356,8 +383,8 @@ // For N even, // // rsq := r * r -// Result := c + r * rsq * P1_1 -// Result := r + Result ...in user-defined rounding +// Poly := c + r * rsq * P1_1 +// Result := r + Poly ...in user-defined rounding // // For N odd, // S_hi := -frcpa(r) ...8 bits @@ -375,8 +402,8 @@ // For N even, // // rsq := r * r -// Result := c + r * rsq * (P1_1 + rsq * P1_2) -// Result := r + Result ...in user-defined rounding +// Poly := c + r * rsq * (P1_1 + rsq * P1_2) +// Result := r + Poly ...in user-defined rounding // // For N odd, // S_hi := -frcpa(r) ...8 bits @@ -414,8 +441,8 @@ // Poly2 := P1_4 + rsq*(P1_5 + rsq*(P1_6 + ... 
rsq*P1_9)) // CORR := c * ( 1 + rsq ) // Poly := Poly1 + r_to_the_8*Poly2 -// Result := r*Poly + CORR -// Result := r + Result ...in user-defined rounding +// Poly := r*Poly + CORR +// Result := r + Poly ...in user-defined rounding // ...note that Poly1 and r_to_the_8 can be computed in parallel // ...with Poly2 (Poly1 is intentionally set to be much // ...shorter than Poly2 so that r_to_the_8 and CORR can be hidden) @@ -434,8 +461,8 @@ // rsq := r*r // P := Q1_1 + rsq*(Q1_2 + rsq*(Q1_3 + ... + rsq*Q1_7)) // -// Result := r*P + S_lo -// Result := S_hi + Result ...in user-defined rounding +// Poly := r*P + S_lo +// Result := S_hi + Poly ...in user-defined rounding // // // Algorithm for the case of normal_r @@ -454,7 +481,7 @@ // / (1/[sin(B)*cos(B)]) * tan(x) // sgn_r * | tan(B) + -------------------------------- + // \ cot(B) - tan(x) -// \ +// \ // CORR | // / @@ -463,7 +490,7 @@ // calculated beforehand and stored in a table. Specifically, // the table values are // -// tan(B) as T_hi + T_lo; +// tan(B) as T_hi + T_lo; // cot(B) as C_hi + C_lo; // 1/[sin(B)*cos(B)] as SC_inv // @@ -559,7 +586,7 @@ // / (1/[sin(B)*cos(B)]) * tan(x) // sgn_r * | -cot(B) + -------------------------------- + // \ tan(B) + tan(x) -// \ +// \ // CORR | // / @@ -568,7 +595,7 @@ // calculated beforehand and stored in a table. 
Specifically, // the table values are // -// tan(B) as T_hi + T_lo; +// tan(B) as T_hi + T_lo; // cot(B) as C_hi + C_lo; // 1/[sin(B)*cos(B)] as SC_inv // @@ -675,254 +702,382 @@ // // -#include "libm_support.h" - -#ifdef _LIBC -.rodata -#else -.data -#endif -.align 128 - -TANL_BASE_CONSTANTS: -ASM_TYPE_DIRECTIVE(TANL_BASE_CONSTANTS,@object) -data4 0x4B800000, 0xCB800000, 0x38800000, 0xB8800000 // two**24, -two**24 - // two**-14, -two**-14 -data4 0x4E44152A, 0xA2F9836E, 0x00003FFE, 0x00000000 // two_by_pi -data4 0xCE81B9F1, 0xC84D32B0, 0x00004016, 0x00000000 // P_0 -data4 0x2168C235, 0xC90FDAA2, 0x00003FFF, 0x00000000 // P_1 -data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD, 0x00000000 // P_2 -data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C, 0x00000000 // P_3 -data4 0x5F000000, 0xDF000000, 0x00000000, 0x00000000 // two_to_63, -two_to_63 -data4 0x6EC6B45A, 0xA397E504, 0x00003FE7, 0x00000000 // Inv_P_0 -data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF, 0x00000000 // d_1 -data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C, 0x00000000 // d_2 -data4 0x2168C234, 0xC90FDAA2, 0x00003FFE, 0x00000000 // PI_BY_4 -data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE, 0x00000000 // MPI_BY_4 -data4 0x3E800000, 0xBE800000, 0x00000000, 0x00000000 // two**-2, -two**-2 -data4 0x2F000000, 0xAF000000, 0x00000000, 0x00000000 // two**-33, -two**-33 -data4 0xAAAAAABD, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P1_1 -data4 0x88882E6A, 0x88888888, 0x00003FFC, 0x00000000 // P1_2 -data4 0x0F0177B6, 0xDD0DD0DD, 0x00003FFA, 0x00000000 // P1_3 -data4 0x646B8C6D, 0xB327A440, 0x00003FF9, 0x00000000 // P1_4 -data4 0x1D5F7D20, 0x91371B25, 0x00003FF8, 0x00000000 // P1_5 -data4 0x61C67914, 0xEB69A5F1, 0x00003FF6, 0x00000000 // P1_6 -data4 0x019318D2, 0xBEDD37BE, 0x00003FF5, 0x00000000 // P1_7 -data4 0x3C794015, 0x9979B146, 0x00003FF4, 0x00000000 // P1_8 -data4 0x8C6EB58A, 0x8EBD21A3, 0x00003FF3, 0x00000000 // P1_9 -data4 0xAAAAAAB4, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // Q1_1 -data4 0x0B5FC93E, 0xB60B60B6, 0x00003FF9, 0x00000000 // Q1_2 -data4 
0x0C9BBFBF, 0x8AB355E0, 0x00003FF6, 0x00000000 // Q1_3 -data4 0xCBEE3D4C, 0xDDEBBC89, 0x00003FF2, 0x00000000 // Q1_4 -data4 0x5F80BBB6, 0xB3548A68, 0x00003FEF, 0x00000000 // Q1_5 -data4 0x4CED5BF1, 0x91362560, 0x00003FEC, 0x00000000 // Q1_6 -data4 0x8EE92A83, 0xF189D95A, 0x00003FE8, 0x00000000 // Q1_7 -data4 0xAAAB362F, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P2_1 -data4 0xE97A6097, 0x88888886, 0x00003FFC, 0x00000000 // P2_2 -data4 0x25E716A1, 0xDD108EE0, 0x00003FFA, 0x00000000 // P2_3 +RODATA +.align 16 + +LOCAL_OBJECT_START(TANL_BASE_CONSTANTS) + +tanl_table_1: +data8 0xA2F9836E4E44152A, 0x00003FFE // two_by_pi +data8 0xC84D32B0CE81B9F1, 0x00004016 // P_0 +data8 0xC90FDAA22168C235, 0x00003FFF // P_1 +data8 0xECE675D1FC8F8CBB, 0x0000BFBD // P_2 +data8 0xB7ED8FBBACC19C60, 0x0000BF7C // P_3 +LOCAL_OBJECT_END(TANL_BASE_CONSTANTS) + +LOCAL_OBJECT_START(tanl_table_2) +data8 0xC90FDAA22168C234, 0x00003FFE // PI_BY_4 +data8 0xA397E5046EC6B45A, 0x00003FE7 // Inv_P_0 +data8 0x8D848E89DBD171A1, 0x0000BFBF // d_1 +data8 0xD5394C3618A66F8E, 0x0000BF7C // d_2 +data4 0x3E800000 // two**-2 +data4 0xBE800000 // -two**-2 +data4 0x00000000 // pad +data4 0x00000000 // pad +LOCAL_OBJECT_END(tanl_table_2) + +LOCAL_OBJECT_START(tanl_table_p1) +data8 0xAAAAAAAAAAAAAABD, 0x00003FFD // P1_1 +data8 0x8888888888882E6A, 0x00003FFC // P1_2 +data8 0xDD0DD0DD0F0177B6, 0x00003FFA // P1_3 +data8 0xB327A440646B8C6D, 0x00003FF9 // P1_4 +data8 0x91371B251D5F7D20, 0x00003FF8 // P1_5 +data8 0xEB69A5F161C67914, 0x00003FF6 // P1_6 +data8 0xBEDD37BE019318D2, 0x00003FF5 // P1_7 +data8 0x9979B1463C794015, 0x00003FF4 // P1_8 +data8 0x8EBD21A38C6EB58A, 0x00003FF3 // P1_9 +LOCAL_OBJECT_END(tanl_table_p1) + +LOCAL_OBJECT_START(tanl_table_q1) +data8 0xAAAAAAAAAAAAAAB4, 0x00003FFD // Q1_1 +data8 0xB60B60B60B5FC93E, 0x00003FF9 // Q1_2 +data8 0x8AB355E00C9BBFBF, 0x00003FF6 // Q1_3 +data8 0xDDEBBC89CBEE3D4C, 0x00003FF2 // Q1_4 +data8 0xB3548A685F80BBB6, 0x00003FEF // Q1_5 +data8 0x913625604CED5BF1, 0x00003FEC // Q1_6 
+data8 0xF189D95A8EE92A83, 0x00003FE8 // Q1_7 +LOCAL_OBJECT_END(tanl_table_q1) + +LOCAL_OBJECT_START(tanl_table_p2) +data8 0xAAAAAAAAAAAB362F, 0x00003FFD // P2_1 +data8 0x88888886E97A6097, 0x00003FFC // P2_2 +data8 0xDD108EE025E716A1, 0x00003FFA // P2_3 +LOCAL_OBJECT_END(tanl_table_p2) + +LOCAL_OBJECT_START(tanl_table_tm2) // // Entries T_hi double-precision memory format // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) // Entries T_lo single-precision memory format // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) // -data4 0x62400794, 0x3FD09BC3, 0x23A05C32, 0x00000000 -data4 0xDFFBC074, 0x3FD124A9, 0x240078B2, 0x00000000 -data4 0x5BD4920F, 0x3FD1AE23, 0x23826B8E, 0x00000000 -data4 0x15E2701D, 0x3FD23835, 0x22D31154, 0x00000000 -data4 0x63739C2D, 0x3FD2C2E4, 0x2265C9E2, 0x00000000 -data4 0xAFEEA48B, 0x3FD34E36, 0x245C05EB, 0x00000000 -data4 0x7DBB35D1, 0x3FD3DA31, 0x24749F2D, 0x00000000 -data4 0x67321619, 0x3FD466DA, 0x2462CECE, 0x00000000 -data4 0x1F94A4D5, 0x3FD4F437, 0x246D0DF1, 0x00000000 -data4 0x740C3E6D, 0x3FD5824D, 0x240A85B5, 0x00000000 -data4 0x4CB1E73D, 0x3FD61123, 0x23F96E33, 0x00000000 -data4 0xAD9EA64B, 0x3FD6A0BE, 0x247C5393, 0x00000000 -data4 0xB804FD01, 0x3FD73125, 0x241F3B29, 0x00000000 -data4 0xAB53EE83, 0x3FD7C25E, 0x2479989B, 0x00000000 -data4 0xE6640EED, 0x3FD8546F, 0x23B343BC, 0x00000000 -data4 0xE8AF1892, 0x3FD8E75F, 0x241454D1, 0x00000000 -data4 0x53928BDA, 0x3FD97B35, 0x238613D9, 0x00000000 -data4 0xEB9DE4DE, 0x3FDA0FF6, 0x22859FA7, 0x00000000 -data4 0x99ECF92D, 0x3FDAA5AB, 0x237A6D06, 0x00000000 -data4 0x6D8F1796, 0x3FDB3C5A, 0x23952F6C, 0x00000000 -data4 0x9CFB8BE4, 0x3FDBD40A, 0x2280FC95, 0x00000000 -data4 0x87943100, 0x3FDC6CC3, 0x245D2EC0, 0x00000000 -data4 0xB736C500, 0x3FDD068C, 0x23C4AD7D, 0x00000000 -data4 0xE1DDBC31, 0x3FDDA16D, 0x23D076E6, 0x00000000 -data4 0xEB515A93, 0x3FDE3D6E, 0x244809A6, 0x00000000 -data4 0xE6E9E5F1, 0x3FDEDA97, 0x220856C8, 0x00000000 -data4 0x1963CE69, 0x3FDF78F1, 0x244BE993, 0x00000000 -data4 
0x7D635BCE, 0x3FE00C41, 0x23D21799, 0x00000000 -data4 0x1C302CD3, 0x3FE05CAB, 0x248A1B1D, 0x00000000 -data4 0xDB6A1FA0, 0x3FE0ADB9, 0x23D53E33, 0x00000000 -data4 0x4A20BA81, 0x3FE0FF72, 0x24DB9ED5, 0x00000000 -data4 0x153FA6F5, 0x3FE151D9, 0x24E9E451, 0x00000000 +data8 0x3FD09BC362400794 +data4 0x23A05C32, 0x00000000 +data8 0x3FD124A9DFFBC074 +data4 0x240078B2, 0x00000000 +data8 0x3FD1AE235BD4920F +data4 0x23826B8E, 0x00000000 +data8 0x3FD2383515E2701D +data4 0x22D31154, 0x00000000 +data8 0x3FD2C2E463739C2D +data4 0x2265C9E2, 0x00000000 +data8 0x3FD34E36AFEEA48B +data4 0x245C05EB, 0x00000000 +data8 0x3FD3DA317DBB35D1 +data4 0x24749F2D, 0x00000000 +data8 0x3FD466DA67321619 +data4 0x2462CECE, 0x00000000 +data8 0x3FD4F4371F94A4D5 +data4 0x246D0DF1, 0x00000000 +data8 0x3FD5824D740C3E6D +data4 0x240A85B5, 0x00000000 +data8 0x3FD611234CB1E73D +data4 0x23F96E33, 0x00000000 +data8 0x3FD6A0BEAD9EA64B +data4 0x247C5393, 0x00000000 +data8 0x3FD73125B804FD01 +data4 0x241F3B29, 0x00000000 +data8 0x3FD7C25EAB53EE83 +data4 0x2479989B, 0x00000000 +data8 0x3FD8546FE6640EED +data4 0x23B343BC, 0x00000000 +data8 0x3FD8E75FE8AF1892 +data4 0x241454D1, 0x00000000 +data8 0x3FD97B3553928BDA +data4 0x238613D9, 0x00000000 +data8 0x3FDA0FF6EB9DE4DE +data4 0x22859FA7, 0x00000000 +data8 0x3FDAA5AB99ECF92D +data4 0x237A6D06, 0x00000000 +data8 0x3FDB3C5A6D8F1796 +data4 0x23952F6C, 0x00000000 +data8 0x3FDBD40A9CFB8BE4 +data4 0x2280FC95, 0x00000000 +data8 0x3FDC6CC387943100 +data4 0x245D2EC0, 0x00000000 +data8 0x3FDD068CB736C500 +data4 0x23C4AD7D, 0x00000000 +data8 0x3FDDA16DE1DDBC31 +data4 0x23D076E6, 0x00000000 +data8 0x3FDE3D6EEB515A93 +data4 0x244809A6, 0x00000000 +data8 0x3FDEDA97E6E9E5F1 +data4 0x220856C8, 0x00000000 +data8 0x3FDF78F11963CE69 +data4 0x244BE993, 0x00000000 +data8 0x3FE00C417D635BCE +data4 0x23D21799, 0x00000000 +data8 0x3FE05CAB1C302CD3 +data4 0x248A1B1D, 0x00000000 +data8 0x3FE0ADB9DB6A1FA0 +data4 0x23D53E33, 0x00000000 +data8 0x3FE0FF724A20BA81 +data4 0x24DB9ED5, 0x00000000 
+data8 0x3FE151D9153FA6F5 +data4 0x24E9E451, 0x00000000 +LOCAL_OBJECT_END(tanl_table_tm2) + +LOCAL_OBJECT_START(tanl_table_tm1) // // Entries T_hi double-precision memory format // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) // Entries T_lo single-precision memory format // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) // -data4 0xBA1BE39E, 0x3FE1CEC4, 0x24B60F9E, 0x00000000 -data4 0x5ABD9B2D, 0x3FE277E4, 0x248C2474, 0x00000000 -data4 0x0272B110, 0x3FE32418, 0x247B8311, 0x00000000 -data4 0x890E2DF0, 0x3FE3D38B, 0x24C55751, 0x00000000 -data4 0x46236871, 0x3FE4866D, 0x24E5BC34, 0x00000000 -data4 0x45E044B0, 0x3FE53CEE, 0x24001BA4, 0x00000000 -data4 0x82EC06E4, 0x3FE5F742, 0x24B973DC, 0x00000000 -data4 0x25DF43F9, 0x3FE6B5A1, 0x24895440, 0x00000000 -data4 0xCAFD348C, 0x3FE77844, 0x240021CA, 0x00000000 -data4 0xCEED6B92, 0x3FE83F6B, 0x24C45372, 0x00000000 -data4 0xA34F3665, 0x3FE90B58, 0x240DAD33, 0x00000000 -data4 0x2C1E56B4, 0x3FE9DC52, 0x24F846CE, 0x00000000 -data4 0x27041578, 0x3FEAB2A4, 0x2323FB6E, 0x00000000 -data4 0x9DD8C373, 0x3FEB8E9F, 0x24B3090B, 0x00000000 -data4 0x65C9AA7B, 0x3FEC709B, 0x2449F611, 0x00000000 -data4 0xACCF8435, 0x3FED58F4, 0x23616A7E, 0x00000000 -data4 0x97635082, 0x3FEE480F, 0x24C2FEAE, 0x00000000 -data4 0xF0ACC544, 0x3FEF3E57, 0x242CE964, 0x00000000 -data4 0xF7E06E4B, 0x3FF01E20, 0x2480D3EE, 0x00000000 -data4 0x8A798A69, 0x3FF0A125, 0x24DB8967, 0x00000000 +data8 0x3FE1CEC4BA1BE39E +data4 0x24B60F9E, 0x00000000 +data8 0x3FE277E45ABD9B2D +data4 0x248C2474, 0x00000000 +data8 0x3FE324180272B110 +data4 0x247B8311, 0x00000000 +data8 0x3FE3D38B890E2DF0 +data4 0x24C55751, 0x00000000 +data8 0x3FE4866D46236871 +data4 0x24E5BC34, 0x00000000 +data8 0x3FE53CEE45E044B0 +data4 0x24001BA4, 0x00000000 +data8 0x3FE5F74282EC06E4 +data4 0x24B973DC, 0x00000000 +data8 0x3FE6B5A125DF43F9 +data4 0x24895440, 0x00000000 +data8 0x3FE77844CAFD348C +data4 0x240021CA, 0x00000000 +data8 0x3FE83F6BCEED6B92 +data4 0x24C45372, 0x00000000 +data8 0x3FE90B58A34F3665 
+data4 0x240DAD33, 0x00000000 +data8 0x3FE9DC522C1E56B4 +data4 0x24F846CE, 0x00000000 +data8 0x3FEAB2A427041578 +data4 0x2323FB6E, 0x00000000 +data8 0x3FEB8E9F9DD8C373 +data4 0x24B3090B, 0x00000000 +data8 0x3FEC709B65C9AA7B +data4 0x2449F611, 0x00000000 +data8 0x3FED58F4ACCF8435 +data4 0x23616A7E, 0x00000000 +data8 0x3FEE480F97635082 +data4 0x24C2FEAE, 0x00000000 +data8 0x3FEF3E57F0ACC544 +data4 0x242CE964, 0x00000000 +data8 0x3FF01E20F7E06E4B +data4 0x2480D3EE, 0x00000000 +data8 0x3FF0A1258A798A69 +data4 0x24DB8967, 0x00000000 +LOCAL_OBJECT_END(tanl_table_tm1) + +LOCAL_OBJECT_START(tanl_table_cm2) // // Entries C_hi double-precision memory format // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) // Entries C_lo single-precision memory format // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) // -data4 0xE63EFBD0, 0x400ED3E2, 0x259D94D4, 0x00000000 -data4 0xC515DAB5, 0x400DDDB4, 0x245F0537, 0x00000000 -data4 0xBE19A79F, 0x400CF57A, 0x25D4EA9F, 0x00000000 -data4 0xD15298ED, 0x400C1A06, 0x24AE40A0, 0x00000000 -data4 0x164B2708, 0x400B4A4C, 0x25A5AAB6, 0x00000000 -data4 0x5285B068, 0x400A855A, 0x25524F18, 0x00000000 -data4 0x3FFA549F, 0x4009CA5A, 0x24C999C0, 0x00000000 -data4 0x646AF623, 0x4009188A, 0x254FD801, 0x00000000 -data4 0x6084D0E7, 0x40086F3C, 0x2560F5FD, 0x00000000 -data4 0xA29A76EE, 0x4007CDD2, 0x255B9D19, 0x00000000 -data4 0x6C8ECA95, 0x400733BE, 0x25CB021B, 0x00000000 -data4 0x1F8DDC52, 0x4006A07E, 0x24AB4722, 0x00000000 -data4 0xC298AD58, 0x4006139B, 0x252764E2, 0x00000000 -data4 0xBAD7164B, 0x40058CAB, 0x24DAF5DB, 0x00000000 -data4 0xAE31A5D3, 0x40050B4B, 0x25EA20F4, 0x00000000 -data4 0x89F85A8A, 0x40048F21, 0x2583A3E8, 0x00000000 -data4 0xA862380D, 0x400417DA, 0x25DCC4CC, 0x00000000 -data4 0x1088FCFE, 0x4003A52B, 0x2430A492, 0x00000000 -data4 0xCD3527D5, 0x400336CC, 0x255F77CF, 0x00000000 -data4 0x5760766D, 0x4002CC7F, 0x25DA0BDA, 0x00000000 -data4 0x11CE02E3, 0x40026607, 0x256FF4A2, 0x00000000 -data4 0xD37BBE04, 0x4002032C, 0x25208AED, 0x00000000 
-data4 0x7F050775, 0x4001A3BD, 0x24B72DD6, 0x00000000 -data4 0xA554848A, 0x40014789, 0x24AB4DAA, 0x00000000 -data4 0x323E81B7, 0x4000EE65, 0x2584C440, 0x00000000 -data4 0x21CF1293, 0x40009827, 0x25C9428D, 0x00000000 -data4 0x3D415EEB, 0x400044A9, 0x25DC8482, 0x00000000 -data4 0xBD72C577, 0x3FFFE78F, 0x257F5070, 0x00000000 -data4 0x75EFD28E, 0x3FFF4AC3, 0x23EBBF7A, 0x00000000 -data4 0x60B52DDE, 0x3FFEB2AF, 0x22EECA07, 0x00000000 -data4 0x35204180, 0x3FFE1F19, 0x24191079, 0x00000000 -data4 0x54F7E60A, 0x3FFD8FCA, 0x248D3058, 0x00000000 +data8 0x400ED3E2E63EFBD0 +data4 0x259D94D4, 0x00000000 +data8 0x400DDDB4C515DAB5 +data4 0x245F0537, 0x00000000 +data8 0x400CF57ABE19A79F +data4 0x25D4EA9F, 0x00000000 +data8 0x400C1A06D15298ED +data4 0x24AE40A0, 0x00000000 +data8 0x400B4A4C164B2708 +data4 0x25A5AAB6, 0x00000000 +data8 0x400A855A5285B068 +data4 0x25524F18, 0x00000000 +data8 0x4009CA5A3FFA549F +data4 0x24C999C0, 0x00000000 +data8 0x4009188A646AF623 +data4 0x254FD801, 0x00000000 +data8 0x40086F3C6084D0E7 +data4 0x2560F5FD, 0x00000000 +data8 0x4007CDD2A29A76EE +data4 0x255B9D19, 0x00000000 +data8 0x400733BE6C8ECA95 +data4 0x25CB021B, 0x00000000 +data8 0x4006A07E1F8DDC52 +data4 0x24AB4722, 0x00000000 +data8 0x4006139BC298AD58 +data4 0x252764E2, 0x00000000 +data8 0x40058CABBAD7164B +data4 0x24DAF5DB, 0x00000000 +data8 0x40050B4BAE31A5D3 +data4 0x25EA20F4, 0x00000000 +data8 0x40048F2189F85A8A +data4 0x2583A3E8, 0x00000000 +data8 0x400417DAA862380D +data4 0x25DCC4CC, 0x00000000 +data8 0x4003A52B1088FCFE +data4 0x2430A492, 0x00000000 +data8 0x400336CCCD3527D5 +data4 0x255F77CF, 0x00000000 +data8 0x4002CC7F5760766D +data4 0x25DA0BDA, 0x00000000 +data8 0x4002660711CE02E3 +data4 0x256FF4A2, 0x00000000 +data8 0x4002032CD37BBE04 +data4 0x25208AED, 0x00000000 +data8 0x4001A3BD7F050775 +data4 0x24B72DD6, 0x00000000 +data8 0x40014789A554848A +data4 0x24AB4DAA, 0x00000000 +data8 0x4000EE65323E81B7 +data4 0x2584C440, 0x00000000 +data8 0x4000982721CF1293 +data4 0x25C9428D, 0x00000000 
+data8 0x400044A93D415EEB +data4 0x25DC8482, 0x00000000 +data8 0x3FFFE78FBD72C577 +data4 0x257F5070, 0x00000000 +data8 0x3FFF4AC375EFD28E +data4 0x23EBBF7A, 0x00000000 +data8 0x3FFEB2AF60B52DDE +data4 0x22EECA07, 0x00000000 +data8 0x3FFE1F1935204180 +data4 0x24191079, 0x00000000 +data8 0x3FFD8FCA54F7E60A +data4 0x248D3058, 0x00000000 +LOCAL_OBJECT_END(tanl_table_cm2) + +LOCAL_OBJECT_START(tanl_table_cm1) // // Entries C_hi double-precision memory format // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) // Entries C_lo single-precision memory format // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) // -data4 0x79F6FADE, 0x3FFCC06A, 0x239C7886, 0x00000000 -data4 0x891662A6, 0x3FFBB91F, 0x250BD191, 0x00000000 -data4 0x529F155D, 0x3FFABFB6, 0x256CC3E6, 0x00000000 -data4 0x2E964AE9, 0x3FF9D300, 0x250843E3, 0x00000000 -data4 0x89DCB383, 0x3FF8F1EF, 0x2277C87E, 0x00000000 -data4 0x7C87DBD6, 0x3FF81B93, 0x256DA6CF, 0x00000000 -data4 0x1042EDE4, 0x3FF74F14, 0x2573D28A, 0x00000000 -data4 0x1784B360, 0x3FF68BAF, 0x242E489A, 0x00000000 -data4 0x7C923C4C, 0x3FF5D0B5, 0x2532D940, 0x00000000 -data4 0xF418EF20, 0x3FF51D88, 0x253C7DD6, 0x00000000 -data4 0x02F88DAE, 0x3FF4719A, 0x23DB59BF, 0x00000000 -data4 0x49DA0788, 0x3FF3CC66, 0x252B4756, 0x00000000 -data4 0x0B980DB8, 0x3FF32D77, 0x23FE585F, 0x00000000 -data4 0xE56C987A, 0x3FF2945F, 0x25378A63, 0x00000000 -data4 0xB16523F6, 0x3FF200BD, 0x247BB2E0, 0x00000000 -data4 0x8CE27778, 0x3FF17235, 0x24446538, 0x00000000 -data4 0xFDEFE692, 0x3FF0E873, 0x2514638F, 0x00000000 -data4 0x33154062, 0x3FF0632C, 0x24A7FC27, 0x00000000 -data4 0xB3EF115F, 0x3FEFC42E, 0x248FD0FE, 0x00000000 -data4 0x135D26F6, 0x3FEEC9E8, 0x2385C719, 0x00000000 +data8 0x3FFCC06A79F6FADE +data4 0x239C7886, 0x00000000 +data8 0x3FFBB91F891662A6 +data4 0x250BD191, 0x00000000 +data8 0x3FFABFB6529F155D +data4 0x256CC3E6, 0x00000000 +data8 0x3FF9D3002E964AE9 +data4 0x250843E3, 0x00000000 +data8 0x3FF8F1EF89DCB383 +data4 0x2277C87E, 0x00000000 +data8 0x3FF81B937C87DBD6 
+data4 0x256DA6CF, 0x00000000 +data8 0x3FF74F141042EDE4 +data4 0x2573D28A, 0x00000000 +data8 0x3FF68BAF1784B360 +data4 0x242E489A, 0x00000000 +data8 0x3FF5D0B57C923C4C +data4 0x2532D940, 0x00000000 +data8 0x3FF51D88F418EF20 +data4 0x253C7DD6, 0x00000000 +data8 0x3FF4719A02F88DAE +data4 0x23DB59BF, 0x00000000 +data8 0x3FF3CC6649DA0788 +data4 0x252B4756, 0x00000000 +data8 0x3FF32D770B980DB8 +data4 0x23FE585F, 0x00000000 +data8 0x3FF2945FE56C987A +data4 0x25378A63, 0x00000000 +data8 0x3FF200BDB16523F6 +data4 0x247BB2E0, 0x00000000 +data8 0x3FF172358CE27778 +data4 0x24446538, 0x00000000 +data8 0x3FF0E873FDEFE692 +data4 0x2514638F, 0x00000000 +data8 0x3FF0632C33154062 +data4 0x24A7FC27, 0x00000000 +data8 0x3FEFC42EB3EF115F +data4 0x248FD0FE, 0x00000000 +data8 0x3FEEC9E8135D26F6 +data4 0x2385C719, 0x00000000 +LOCAL_OBJECT_END(tanl_table_cm1) + +LOCAL_OBJECT_START(tanl_table_scim2) // // Entries SC_inv in Swapped IEEE format (extended) // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64) // -data4 0x1BF30C9E, 0x839D6D4A, 0x00004001, 0x00000000 -data4 0x554B0EB0, 0x80092804, 0x00004001, 0x00000000 -data4 0xA1CF0DE9, 0xF959F94C, 0x00004000, 0x00000000 -data4 0x77378677, 0xF3086BA0, 0x00004000, 0x00000000 -data4 0xCCD4723C, 0xED154515, 0x00004000, 0x00000000 -data4 0x1C27CF25, 0xE7790944, 0x00004000, 0x00000000 -data4 0x8DDACB88, 0xE22D037D, 0x00004000, 0x00000000 -data4 0x89C73522, 0xDD2B2D8A, 0x00004000, 0x00000000 -data4 0xBB2C1171, 0xD86E1A23, 0x00004000, 0x00000000 -data4 0xDFF5E0F9, 0xD3F0E288, 0x00004000, 0x00000000 -data4 0x283BEBD5, 0xCFAF16B1, 0x00004000, 0x00000000 -data4 0x0D88DD53, 0xCBA4AFAA, 0x00004000, 0x00000000 -data4 0xCA67C43D, 0xC7CE03CC, 0x00004000, 0x00000000 -data4 0x0CA0DDB0, 0xC427BC82, 0x00004000, 0x00000000 -data4 0xF13D8CAB, 0xC0AECD57, 0x00004000, 0x00000000 -data4 0x71ECE6B1, 0xBD606C38, 0x00004000, 0x00000000 -data4 0xA44C4929, 0xBA3A0A96, 0x00004000, 0x00000000 -data4 0xE5CCCEC1, 0xB7394F6F, 0x00004000, 0x00000000 -data4 0x9637D8BC, 0xB45C1203, 
0x00004000, 0x00000000 -data4 0x92CB051B, 0xB1A05528, 0x00004000, 0x00000000 -data4 0x6BA2FFD0, 0xAF04432B, 0x00004000, 0x00000000 -data4 0x7221235F, 0xAC862A23, 0x00004000, 0x00000000 -data4 0x5F00A9D1, 0xAA2478AF, 0x00004000, 0x00000000 -data4 0x81E082BF, 0xA7DDBB0C, 0x00004000, 0x00000000 -data4 0x45684FEE, 0xA5B0987D, 0x00004000, 0x00000000 -data4 0x627A8F53, 0xA39BD0F5, 0x00004000, 0x00000000 -data4 0x6EC5C8B0, 0xA19E3B03, 0x00004000, 0x00000000 -data4 0x91CD7C66, 0x9FB6C1F0, 0x00004000, 0x00000000 -data4 0x1FA3DF8A, 0x9DE46410, 0x00004000, 0x00000000 -data4 0xA8F6B888, 0x9C263139, 0x00004000, 0x00000000 -data4 0xC27B0450, 0x9A7B4968, 0x00004000, 0x00000000 -data4 0x5EE614EE, 0x98E2DB7E, 0x00004000, 0x00000000 +data8 0x839D6D4A1BF30C9E, 0x00004001 +data8 0x80092804554B0EB0, 0x00004001 +data8 0xF959F94CA1CF0DE9, 0x00004000 +data8 0xF3086BA077378677, 0x00004000 +data8 0xED154515CCD4723C, 0x00004000 +data8 0xE77909441C27CF25, 0x00004000 +data8 0xE22D037D8DDACB88, 0x00004000 +data8 0xDD2B2D8A89C73522, 0x00004000 +data8 0xD86E1A23BB2C1171, 0x00004000 +data8 0xD3F0E288DFF5E0F9, 0x00004000 +data8 0xCFAF16B1283BEBD5, 0x00004000 +data8 0xCBA4AFAA0D88DD53, 0x00004000 +data8 0xC7CE03CCCA67C43D, 0x00004000 +data8 0xC427BC820CA0DDB0, 0x00004000 +data8 0xC0AECD57F13D8CAB, 0x00004000 +data8 0xBD606C3871ECE6B1, 0x00004000 +data8 0xBA3A0A96A44C4929, 0x00004000 +data8 0xB7394F6FE5CCCEC1, 0x00004000 +data8 0xB45C12039637D8BC, 0x00004000 +data8 0xB1A0552892CB051B, 0x00004000 +data8 0xAF04432B6BA2FFD0, 0x00004000 +data8 0xAC862A237221235F, 0x00004000 +data8 0xAA2478AF5F00A9D1, 0x00004000 +data8 0xA7DDBB0C81E082BF, 0x00004000 +data8 0xA5B0987D45684FEE, 0x00004000 +data8 0xA39BD0F5627A8F53, 0x00004000 +data8 0xA19E3B036EC5C8B0, 0x00004000 +data8 0x9FB6C1F091CD7C66, 0x00004000 +data8 0x9DE464101FA3DF8A, 0x00004000 +data8 0x9C263139A8F6B888, 0x00004000 +data8 0x9A7B4968C27B0450, 0x00004000 +data8 0x98E2DB7E5EE614EE, 0x00004000 +LOCAL_OBJECT_END(tanl_table_scim2) + 
+LOCAL_OBJECT_START(tanl_table_scim1) // // Entries SC_inv in Swapped IEEE format (extended) // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64) // -data4 0x13B2B5BA, 0x969F335C, 0x00004000, 0x00000000 -data4 0xD4C0F548, 0x93D446D9, 0x00004000, 0x00000000 -data4 0x61B798AF, 0x9147094F, 0x00004000, 0x00000000 -data4 0x758787AC, 0x8EF317CC, 0x00004000, 0x00000000 -data4 0xB99EEFDB, 0x8CD498B3, 0x00004000, 0x00000000 -data4 0xDFF8BC37, 0x8AE82A7D, 0x00004000, 0x00000000 -data4 0xE3C55D42, 0x892AD546, 0x00004000, 0x00000000 -data4 0xD15573C1, 0x8799FEA9, 0x00004000, 0x00000000 -data4 0x435A4B4C, 0x86335F88, 0x00004000, 0x00000000 -data4 0x3E93A87B, 0x84F4FB6E, 0x00004000, 0x00000000 -data4 0x80A382FB, 0x83DD1952, 0x00004000, 0x00000000 -data4 0xA4CB8C9E, 0x82EA3D7F, 0x00004000, 0x00000000 -data4 0x6861D0A8, 0x821B247C, 0x00004000, 0x00000000 -data4 0x63E8D244, 0x816EBED1, 0x00004000, 0x00000000 -data4 0x27E4CFC6, 0x80E42D91, 0x00004000, 0x00000000 -data4 0x28E64AFD, 0x807ABF8D, 0x00004000, 0x00000000 -data4 0x863B4FD8, 0x8031EF26, 0x00004000, 0x00000000 -data4 0xAE8C11FD, 0x800960AD, 0x00004000, 0x00000000 -data4 0x5FDBEC21, 0x8000E147, 0x00004000, 0x00000000 -data4 0xA07791FA, 0x80186650, 0x00004000, 0x00000000 -ASM_SIZE_DIRECTIVE(TANL_BASE_CONSTANTS) - -Arg = f8 +data8 0x969F335C13B2B5BA, 0x00004000 +data8 0x93D446D9D4C0F548, 0x00004000 +data8 0x9147094F61B798AF, 0x00004000 +data8 0x8EF317CC758787AC, 0x00004000 +data8 0x8CD498B3B99EEFDB, 0x00004000 +data8 0x8AE82A7DDFF8BC37, 0x00004000 +data8 0x892AD546E3C55D42, 0x00004000 +data8 0x8799FEA9D15573C1, 0x00004000 +data8 0x86335F88435A4B4C, 0x00004000 +data8 0x84F4FB6E3E93A87B, 0x00004000 +data8 0x83DD195280A382FB, 0x00004000 +data8 0x82EA3D7FA4CB8C9E, 0x00004000 +data8 0x821B247C6861D0A8, 0x00004000 +data8 0x816EBED163E8D244, 0x00004000 +data8 0x80E42D9127E4CFC6, 0x00004000 +data8 0x807ABF8D28E64AFD, 0x00004000 +data8 0x8031EF26863B4FD8, 0x00004000 +data8 0x800960ADAE8C11FD, 0x00004000 +data8 0x8000E1475FDBEC21, 
0x00004000 +data8 0x80186650A07791FA, 0x00004000 +LOCAL_OBJECT_END(tanl_table_scim1) + +Arg = f8 +Save_Norm_Arg = f8 // For input to reduction routine Result = f8 -fp_tmp = f9 +r = f8 // For output from reduction routine +c = f9 // For output from reduction routine U_2 = f10 -rsq = f11 +rsq = f11 C_hi = f12 C_lo = f13 T_hi = f14 T_lo = f15 -N_0 = f32 d_1 = f33 -MPI_BY_4 = f34 +N_0 = f34 tail = f35 tanx = f36 Cx = f37 @@ -949,8 +1104,6 @@ P1_7 = f51 P1_8 = f52 P1_9 = f53 -TWO_TO_63 = f54 -NEGTWO_TO_63 = f55 x = f56 xsq = f57 Tx = f58 @@ -966,12 +1119,10 @@ B = f67 SC_inv = f68 Pos_r = f69 N_0_fix = f70 -PI_BY_4 = f71 -NEGTWO_TO_NEG2 = f72 -TWO_TO_24 = f73 +d_2 = f71 +PI_BY_4 = f72 TWO_TO_NEG14 = f74 TWO_TO_NEG33 = f75 -NEGTWO_TO_24 = f76 NEGTWO_TO_NEG14 = f76 NEGTWO_TO_NEG33 = f77 two_by_PI = f78 @@ -982,13 +1133,14 @@ P_2 = f82 P_3 = f83 s_val = f84 w = f85 -c = f86 -r = f87 +B_mask1 = f86 +B_mask2 = f87 +w2 = f88 A = f89 a = f90 t = f91 U_1 = f92 -d_2 = f93 +NEGTWO_TO_NEG2 = f93 TWO_TO_NEG2 = f94 Q1_1 = f95 Q1_2 = f96 @@ -1009,609 +1161,641 @@ V_hiabs = f110 V = f111 Inv_P_0 = f112 +FR_inv_pi_2to63 = f113 +FR_rshf_2to64 = f114 +FR_2tom64 = f115 +FR_rshf = f116 +Norm_Arg = f117 +Abs_Arg = f118 +TWO_TO_NEG65 = f119 +fp_tmp = f120 +mOne = f121 + +GR_sig_inv_pi = r14 +GR_rshf_2to64 = r15 +GR_exp_2tom64 = r16 +GR_rshf = r17 +GR_exp_2_to_63 = r18 +GR_exp_2_to_24 = r19 +GR_signexp_x = r20 +GR_exp_x = r21 +GR_exp_mask = r22 +GR_exp_2tom14 = r23 +GR_exp_m2tom14 = r24 +GR_exp_2tom33 = r25 +GR_exp_m2tom33 = r26 + GR_SAVE_B0 = r33 GR_SAVE_GP = r34 GR_SAVE_PFS = r35 -delta1 = r36 +table_base = r36 table_ptr1 = r37 table_ptr2 = r38 -i_0 = r39 -i_1 = r40 -N_fix_gr = r41 -N_inc = r42 -exp_Arg = r43 -exp_r = r44 -sig_r = r45 -lookup = r46 -table_offset = r47 -Create_B = r48 +table_ptr3 = r39 +lookup = r40 +N_fix_gr = r41 +GR_exp_2tom2 = r42 +GR_exp_2tom65 = r43 +exp_r = r44 +sig_r = r45 +bmask1 = r46 +table_offset = r47 +bmask2 = r48 gr_tmp = r49 +cot_flag = r50 + +GR_SAVE_B0 = 
r51 +GR_SAVE_PFS = r52 +GR_SAVE_GP = r53 +GR_Parameter_X = r54 +GR_Parameter_Y = r55 +GR_Parameter_RESULT = r56 +GR_Parameter_Tag = r57 + .section .text -.global tanl -.proc tanl -tanl: -#ifdef _LIBC -.global __tanl -.proc __tanl -__tanl: -#endif -{ .mfi -alloc r32 = ar.pfs, 0,17,2,0 -(p0) fclass.m.unc p6,p0 = Arg, 0x1E7 - addl gr_tmp = -1,r0 -} -{ .mfi - nop.m 0 -(p0) fclass.nm.unc p7,p0 = Arg, 0x1FF - nop.i 0 +.global __libm_tanl# +.global __libm_cotl# + +.proc __libm_cotl# +__libm_cotl: +.endp __libm_cotl# +LOCAL_LIBM_ENTRY(cotl) + +{ .mlx + alloc r32 = ar.pfs, 0,22,4,0 + movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi +} +{ .mlx + mov GR_exp_mask = 0x1ffff // Exponent mask + movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64) +} +;; + +// Check for NatVals, Infs , NaNs, and Zeros +{ .mfi + getf.exp GR_signexp_x = Arg // Get sign and exponent of x + fclass.m p6,p0 = Arg, 0x1E7 // Test for natval, nan, inf, zero + mov cot_flag = 0x1 +} +{ .mfb + addl table_base = @ltoff(TANL_BASE_CONSTANTS), gp // Pointer to table ptr + fnorm.s1 Norm_Arg = Arg // Normalize x + br.cond.sptk COMMON_PATH };; +LOCAL_LIBM_END(cotl) + +.proc __libm_tanl# +__libm_tanl: +.endp __libm_tanl# +GLOBAL_IEEE754_ENTRY(tanl) + +{ .mlx + alloc r32 = ar.pfs, 0,22,4,0 + movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi +} +{ .mlx + mov GR_exp_mask = 0x1ffff // Exponent mask + movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64) +} +;; + +// Check for NatVals, Infs , NaNs, and Zeros { .mfi -(p0) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp - nop.f 999 + getf.exp GR_signexp_x = Arg // Get sign and exponent of x + fclass.m p6,p0 = Arg, 0x1E7 // Test for natval, nan, inf, zero + mov cot_flag = 0x0 +} +{ .mfi + addl table_base = @ltoff(TANL_BASE_CONSTANTS), gp // Pointer to table ptr + fnorm.s1 Norm_Arg = Arg // Normalize x nop.i 0 +};; + +// Common path for both tanl and cotl +COMMON_PATH: +{ .mfi + setf.sig FR_inv_pi_2to63 = GR_sig_inv_pi // Form 1/pi * 
2^63 + fclass.m p9, p0 = Arg, 0x0b // Test x denormal + mov GR_exp_2tom64 = 0xffff - 64 // Scaling constant to compute N +} +{ .mlx + setf.d FR_rshf_2to64 = GR_rshf_2to64 // Form const 1.1000 * 2^(63+64) + movl GR_rshf = 0x43e8000000000000 // Form const 1.1000 * 2^63 } ;; -{ .mmi -(p0) ld8 table_ptr1 = [table_ptr1] - setf.sig fp_tmp = gr_tmp // Make a constant so fmpy produces inexact - nop.i 999 + +// Check for everything - if false, then must be pseudo-zero or pseudo-nan. +// Branch out to deal with special values. +{ .mfi + addl gr_tmp = -1,r0 + fclass.nm p7,p0 = Arg, 0x1FF // Test x unsupported + mov GR_exp_2_to_63 = 0xffff + 63 // Exponent of 2^63 +} +{ .mfb + ld8 table_base = [table_base] // Get pointer to constant table + fms.s1 mOne = f0, f0, f1 +(p6) br.cond.spnt TANL_SPECIAL // Branch if x natval, nan, inf, zero } ;; -// -// Check for NatVals, Infs , NaNs, and Zeros -// Check for everything - if false, then must be pseudo-zero -// or pseudo-nan. -// Local table pointer -// -{ .mbb -(p0) add table_ptr2 = 96, table_ptr1 -(p6) br.cond.spnt L(TANL_SPECIAL) -(p7) br.cond.spnt L(TANL_SPECIAL) ;; +{ .mmb + setf.sig fp_tmp = gr_tmp // Make a constant so fmpy produces inexact + mov GR_exp_2_to_24 = 0xffff + 24 // Exponent of 2^24 +(p9) br.cond.spnt TANL_DENORMAL // Branch if x denormal } +;; + +TANL_COMMON: +// Return to here if x denormal // -// Point to Inv_P_0 -// Branch out to deal with unsupporteds and special values. -// -{ .mmf -(p0) ldfs TWO_TO_24 = [table_ptr1],4 -(p0) ldfs TWO_TO_63 = [table_ptr2],4 -// -// Load -2**24, load -2**63. -// -(p0) fcmp.eq.s0 p0, p6 = Arg, f1 ;; -} +// Do fcmp to generate Denormal exception +// - can't do FNORM (will generate Underflow when U is unmasked!) +// Branch out to deal with unsupporteds values. 
{ .mfi -(p0) ldfs NEGTWO_TO_63 = [table_ptr2],12 -(p0) fnorm.s1 Arg = Arg - nop.i 999 + setf.exp FR_2tom64 = GR_exp_2tom64 // Form 2^-64 for scaling N_float + fcmp.eq.s0 p0, p6 = Arg, f1 // Dummy to flag denormals + add table_ptr1 = 0, table_base // Point to tanl_table_1 } -// -// Load 2**24, Load 2**63. -// -{ .mmi -(p0) ldfs NEGTWO_TO_24 = [table_ptr1],12 ;; -// -// Do fcmp to generate Denormal exception -// - can't do FNORM (will generate Underflow when U is unmasked!) -// Normalize input argument. -// -(p0) ldfe two_by_PI = [table_ptr1],16 - nop.i 999 +{ .mib + setf.d FR_rshf = GR_rshf // Form right shift const 1.1000 * 2^63 + add table_ptr2 = 80, table_base // Point to tanl_table_2 +(p7) br.cond.spnt TANL_UNSUPPORTED // Branch if x unsupported type } -{ .mmi -(p0) ldfe Inv_P_0 = [table_ptr2],16 ;; -(p0) ldfe d_1 = [table_ptr2],16 - nop.i 999 +;; + +{ .mfi + and GR_exp_x = GR_exp_mask, GR_signexp_x // Get exponent of x + fmpy.s1 Save_Norm_Arg = Norm_Arg, f1 // Save x if large arg reduction + dep.z bmask1 = 0x7c, 56, 8 // Form mask to get 5 msb of r + // bmask1 = 0x7c00000000000000 } +;; + // // Decide about the paths to take: -// PR_1 and PR_3 set if -2**24 < Arg < 2**24 - CASE 1 OR 2 -// OTHERWISE - CASE 3 OR 4 -// Load inverse of P_0 . -// Set PR_6 if Arg <= -2**63 -// Are there any Infs, NaNs, or zeros? +// Set PR_6 if |Arg| >= 2**63 +// Set PR_9 if |Arg| < 2**24 - CASE 1 OR 2 +// OTHERWISE Set PR_8 - CASE 3 OR 4 // -{ .mmi -(p0) ldfe P_0 = [table_ptr1],16 ;; -(p0) ldfe d_2 = [table_ptr2],16 - nop.i 999 +// Branch out if the magnitude of the input argument is >= 2^63 +// - do this branch before the next. 
+{ .mfi + ldfe two_by_PI = [table_ptr1],16 // Load 2/pi + nop.f 999 + dep.z bmask2 = 0x41, 57, 7 // Form mask to OR to produce B + // bmask2 = 0x8200000000000000 } -// -// Set PR_8 if Arg <= -2**24 -// Set PR_6 if Arg >= 2**63 -// -{ .mmi -(p0) ldfe P_1 = [table_ptr1],16 ;; -(p0) ldfe PI_BY_4 = [table_ptr2],16 - nop.i 999 +{ .mib + ldfe PI_BY_4 = [table_ptr2],16 // Load pi/4 + cmp.ge p6,p0 = GR_exp_x, GR_exp_2_to_63 // Is |x| >= 2^63 +(p6) br.cond.spnt TANL_ARG_TOO_LARGE // Branch if |x| >= 2^63 } -// -// Set PR_8 if Arg >= 2**24 -// +;; + { .mmi -(p0) ldfe P_2 = [table_ptr1],16 ;; -(p0) ldfe MPI_BY_4 = [table_ptr2],16 - nop.i 999 -} -// -// Load P_2 and PI_BY_4 -// -{ .mfi -(p0) ldfe P_3 = [table_ptr1],16 - nop.f 999 - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p0) fcmp.le.unc.s1 p6,p7 = Arg,NEGTWO_TO_63 - nop.i 999 + ldfe P_0 = [table_ptr1],16 // Load P_0 + ldfe Inv_P_0 = [table_ptr2],16 // Load Inv_P_0 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p0) fcmp.le.unc.s1 p8,p9 = Arg,NEGTWO_TO_24 - nop.i 999 ;; + ldfe P_1 = [table_ptr1],16 // Load P_1 + fmerge.s Abs_Arg = f0, Norm_Arg // Get |x| + mov GR_exp_m2tom33 = 0x2ffff - 33 // Form signexp of -2^-33 } { .mfi - nop.m 999 -(p7) fcmp.ge.s1 p6,p0 = Arg,TWO_TO_63 - nop.i 999 + ldfe d_1 = [table_ptr2],16 // Load d_1 for 2^24 <= |x| < 2^63 + nop.f 999 + mov GR_exp_2tom33 = 0xffff - 33 // Form signexp of 2^-33 } -{ .mfi - nop.m 999 -(p9) fcmp.ge.s1 p8,p0 = Arg,TWO_TO_24 - nop.i 999 ;; +;; + +{ .mmi + ldfe P_2 = [table_ptr1],16 // Load P_2 + ldfe d_2 = [table_ptr2],16 // Load d_2 for 2^24 <= |x| < 2^63 + cmp.ge p8,p0 = GR_exp_x, GR_exp_2_to_24 // Is |x| >= 2^24 } -{ .mib - nop.m 999 - nop.i 999 -// -// Load P_3 and -PI_BY_4 -// -(p6) br.cond.spnt L(TANL_ARG_TOO_LARGE) ;; +;; + +// Use special scaling to right shift so N=Arg * 2/pi is in rightmost bits +// Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24 +{ .mfb + ldfe P_3 = [table_ptr1],16 // Load P_3 + fma.s1 N_fix = Norm_Arg, FR_inv_pi_2to63, FR_rshf_2to64 +(p8) 
br.cond.spnt TANL_LARGER_ARG // Branch if 2^24 <= |x| < 2^63 } -{ .mib - nop.m 999 - nop.i 999 -// -// Load 2**(-2). -// Load -2**(-2). -// Branch out if we have a special argument. -// Branch out if the magnitude of the input argument is too large -// - do this branch before the next. +;; + +// Here if 0 < |x| < 2^24 +// ARGUMENT REDUCTION CODE - CASE 1 and 2 // -(p8) br.cond.spnt L(TANL_LARGER_ARG) ;; +{ .mmf + setf.exp TWO_TO_NEG33 = GR_exp_2tom33 // Form 2^-33 + setf.exp NEGTWO_TO_NEG33 = GR_exp_m2tom33 // Form -2^-33 + fmerge.s r = Norm_Arg,Norm_Arg // Assume r=x, ok if |x| < pi/4 } +;; + // -// Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24 +// If |Arg| < pi/4, set PR_8, else pi/4 <=|Arg| < 2^24 - set PR_9. // +// Case 2: Convert integer N_fix back to normalized floating-point value. { .mfi -(p0) ldfs TWO_TO_NEG2 = [table_ptr2],4 -// ARGUMENT REDUCTION CODE - CASE 1 and 2 -// Load 2**(-2). -// Load -2**(-2). -(p0) fmpy.s1 N = Arg,two_by_PI - nop.i 999 ;; + getf.sig sig_r = Norm_Arg // Get sig_r if 1/4 <= |x| < pi/4 + fcmp.lt.s1 p8,p9= Abs_Arg,PI_BY_4 // Test |x| < pi/4 + mov GR_exp_2tom2 = 0xffff - 2 // Form signexp of 2^-2 } { .mfi -(p0) ldfs NEGTWO_TO_NEG2 = [table_ptr2],12 -// -// N = Arg * 2/pi -// -(p0) fcmp.lt.unc.s1 p8,p9= Arg,PI_BY_4 - nop.i 999 ;; -} -{ .mfi - nop.m 999 -// -// if Arg < pi/4, set PR_8. -// -(p8) fcmp.gt.s1 p8,p9= Arg,MPI_BY_4 - nop.i 999 ;; + ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2] // Load 2^-2, -2^-2 + fms.s1 N = N_fix, FR_2tom64, FR_rshf // Use scaling to get N floated + mov N_fix_gr = r0 // Assume N=0, ok if |x| < pi/4 } +;; + // // Case 1: Is |r| < 2**(-2). // Arg is the same as r in this case. // r = Arg // c = 0 // +// Case 2: Place integer part of N in GP register. { .mfi -(p8) mov N_fix_gr = r0 -// -// if Arg > -pi/4, reset PR_8. -// Select the case when |Arg| < pi/4 - set PR[8] = true. -// Else Select the case when |Arg| >= pi/4 - set PR[9] = true. 
-// -(p0) fcvt.fx.s1 N_fix = N - nop.i 999 ;; -} -{ .mfi - nop.m 999 -// -// Grab the integer part of N . -// -(p8) mov r = Arg - nop.i 999 -} -{ .mfi - nop.m 999 -(p8) mov c = f0 - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p8) fcmp.lt.unc.s1 p10, p11 = Arg, TWO_TO_NEG2 - nop.i 999 ;; +(p9) getf.sig N_fix_gr = N_fix + fmerge.s c = f0, f0 // Assume c=0, ok if |x| < pi/4 + cmp.lt p10, p0 = GR_exp_x, GR_exp_2tom2 // Test if |x| < 1/4 } +;; + { .mfi - nop.m 999 -(p10) fcmp.gt.s1 p10,p0 = Arg, NEGTWO_TO_NEG2 - nop.i 999 ;; + setf.sig B_mask1 = bmask1 // Form mask to get 5 msb of r + nop.f 999 + mov exp_r = GR_exp_x // Get exp_r if 1/4 <= |x| < pi/4 } -{ .mfi - nop.m 999 -// -// Case 2: Place integer part of N in GP register. -// -(p9) fcvt.xf N = N_fix - nop.i 999 ;; -} -{ .mib -(p9) getf.sig N_fix_gr = N_fix - nop.i 999 -// -// Case 2: Convert integer N_fix back to normalized floating-point value. -// -(p10) br.cond.spnt L(TANL_SMALL_R) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p8) br.cond.sptk L(TANL_NORMAL_R) ;; +{ .mbb + setf.sig B_mask2 = bmask2 // Form mask to form B from r +(p10) br.cond.spnt TANL_SMALL_R // Branch if 0 < |x| < 1/4 +(p8) br.cond.spnt TANL_NORMAL_R // Branch if 1/4 <= |x| < pi/4 } +;; + +// Here if pi/4 <= |x| < 2^24 // // Case 1: PR_3 is only affected when PR_1 is set. // -{ .mmi -(p9) ldfs TWO_TO_NEG33 = [table_ptr2], 4 ;; // -// Case 2: Load 2**(-33). +// Case 2: w = N * P_2 +// Case 2: s_val = -N * P_1 + Arg // -(p9) ldfs NEGTWO_TO_NEG33 = [table_ptr2], 4 - nop.i 999 ;; + +{ .mfi + nop.m 999 + fnma.s1 s_val = N, P_1, Norm_Arg + nop.i 999 } { .mfi - nop.m 999 -// -// Case 2: Load -2**(-33). 
-// -(p9) fnma.s1 s_val = N, P_1, Arg - nop.i 999 + nop.m 999 + fmpy.s1 w = N, P_2 // w = N * P_2 for |s| >= 2^-33 + nop.i 999 } +;; + +// Case 2_reduce: w = N * P_3 (change sign) { .mfi - nop.m 999 -(p9) fmpy.s1 w = N, P_2 - nop.i 999 ;; + nop.m 999 + fmpy.s1 w2 = N, P_3 // w = N * P_3 for |s| < 2^-33 + nop.i 999 } +;; + +// Case 1_reduce: r = s + w (change sign) { .mfi - nop.m 999 -// -// Case 2: w = N * P_2 -// Case 2: s_val = -N * P_1 + Arg -// -(p0) fcmp.lt.unc.s1 p9,p8 = s_val, TWO_TO_NEG33 - nop.i 999 ;; + nop.m 999 + fsub.s1 r = s_val, w // r = s_val - w for |s| >= 2^-33 + nop.i 999 } +;; + +// Case 2_reduce: U_1 = N * P_2 + w { .mfi - nop.m 999 + nop.m 999 + fma.s1 U_1 = N, P_2, w2 // U_1 = N * P_2 + w for |s| < 2^-33 + nop.i 999 +} +;; + // // Decide between case_1 and case_2 reduce: +// Case 1_reduce: |s| >= 2**(-33) +// Case 2_reduce: |s| < 2**(-33) // -(p9) fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33 - nop.i 999 ;; +{ .mfi + nop.m 999 + fcmp.lt.s1 p9, p8 = s_val, TWO_TO_NEG33 + nop.i 999 } +;; + { .mfi - nop.m 999 -// -// Case 1_reduce: s <= -2**(-33) or s >= 2**(-33) -// Case 2_reduce: -2**(-33) < s < 2**(-33) -// -(p8) fsub.s1 r = s_val, w - nop.i 999 + nop.m 999 +(p9) fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33 + nop.i 999 } +;; + +// Case 1_reduce: c = s - r { .mfi - nop.m 999 -(p9) fmpy.s1 w = N, P_3 - nop.i 999 ;; + nop.m 999 + fsub.s1 c = s_val, r // c = s_val - r for |s| >= 2^-33 + nop.i 999 } +;; + +// Case 2_reduce: r is complete here - continue to calculate c . +// r = s - U_1 { .mfi - nop.m 999 -(p9) fma.s1 U_1 = N, P_2, w - nop.i 999 + nop.m 999 +(p9) fsub.s1 r = s_val, U_1 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 +(p9) fms.s1 U_2 = N, P_2, U_1 + nop.i 999 +} +;; + // // Case 1_reduce: Is |r| < 2**(-2), if so set PR_10 -// else set PR_11. +// else set PR_13. 
// -(p8) fsub.s1 c = s_val, r - nop.i 999 ;; -} + { .mfi - nop.m 999 -// -// Case 1_reduce: r = s + w (change sign) -// Case 2_reduce: w = N * P_3 (change sign) -// -(p8) fcmp.lt.unc.s1 p10, p11 = r, TWO_TO_NEG2 - nop.i 999 ;; + nop.m 999 + fand B = B_mask1, r + nop.i 999 } { .mfi - nop.m 999 -(p10) fcmp.gt.s1 p10, p11 = r, NEGTWO_TO_NEG2 - nop.i 999 ;; + nop.m 999 +(p8) fcmp.lt.unc.s1 p10, p13 = r, TWO_TO_NEG2 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p9) fsub.s1 r = s_val, U_1 - nop.i 999 +(p8) getf.sig sig_r = r // Get signif of r if |s| >= 2^-33 + nop.f 999 + nop.i 999 } +;; + { .mfi - nop.m 999 -// +(p8) getf.exp exp_r = r // Extract signexp of r if |s| >= 2^-33 +(p10) fcmp.gt.s1 p10, p13 = r, NEGTWO_TO_NEG2 + nop.i 999 +} +;; + // Case 1_reduce: c is complete here. +// Case 1: Branch to SMALL_R or NORMAL_R. // c = c + w (w has not been negated.) -// Case 2_reduce: r is complete here - continue to calculate c . -// r = s - U_1 -// -(p9) fms.s1 U_2 = N, P_2, U_1 - nop.i 999 ;; -} { .mfi - nop.m 999 + nop.m 999 +(p8) fsub.s1 c = c, w // c = c - w for |s| >= 2^-33 + nop.i 999 +} +{ .mbb + nop.m 999 +(p10) br.cond.spnt TANL_SMALL_R // Branch if pi/4 < |x| < 2^24 and |r|<1/4 +(p13) br.cond.sptk TANL_NORMAL_R_A // Branch if pi/4 < |x| < 2^24 and |r|>=1/4 +} +;; + + +// Here if pi/4 < |x| < 2^24 and |s| < 2^-33 // -// Case 1_reduce: c = s - r -// Case 2_reduce: U_1 = N * P_2 + w +// Is i_1 = lsb of N_fix_gr even or odd? +// if i_1 == 0, set p11, else set p12. // -(p8) fsub.s1 c = c, w - nop.i 999 ;; -} { .mfi - nop.m 999 -(p9) fsub.s1 s_val = s_val, r - nop.i 999 + nop.m 999 + fsub.s1 s_val = s_val, r + add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl) } -{ .mfb - nop.m 999 +{ .mfi + nop.m 999 // // Case 2_reduce: // U_2 = N * P_2 - U_1 // Not needed until later. 
// -(p9) fadd.s1 U_2 = U_2, w + fadd.s1 U_2 = U_2, w2 // // Case 2_reduce: // s = s - r // U_2 = U_2 + w // -(p10) br.cond.spnt L(TANL_SMALL_R) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p11) br.cond.sptk L(TANL_NORMAL_R) ;; + nop.i 999 } -{ .mii - nop.m 999 +;; + // // Case 2_reduce: // c = c - U_2 // c is complete here // Argument reduction ends here. // -(p9) extr.u i_1 = N_fix_gr, 0, 1 ;; -(p9) cmp.eq.unc p11, p12 = 0x0000,i_1 ;; -} -{ .mfi - nop.m 999 -// -// Is i_1 even or odd? -// if i_1 == 0, set p11, else set p12. -// -(p11) fmpy.s1 rsq = r, r - nop.i 999 ;; -} { .mfi - nop.m 999 -(p12) frcpa.s1 S_hi,p0 = f1, r - nop.i 999 + nop.m 999 + fmpy.s1 rsq = r, r + tbit.z p11, p12 = N_fix_gr, 0 ;; // Set p11 if N even, p12 if odd } - - -// -// Case 1: Branch to SMALL_R or NORMAL_R. -// Case 1 is done now. -// - { .mfi -(p9) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp -(p9) fsub.s1 c = s_val, U_1 - nop.i 999 ;; + nop.m 999 +(p12) frcpa.s1 S_hi,p0 = f1, r + nop.i 999 } -;; - -{ .mmi -(p9) ld8 table_ptr1 = [table_ptr1] +{ .mfi nop.m 999 + fsub.s1 c = s_val, U_1 nop.i 999 } ;; - { .mmi -(p9) add table_ptr1 = 224, table_ptr1 ;; -(p9) ldfe P1_1 = [table_ptr1],144 - nop.i 999 ;; + add table_ptr1 = 160, table_base ;; // Point to tanl_table_p1 + ldfe P1_1 = [table_ptr1],144 + nop.i 999 ;; } // -// Get [i_1] - lsb of N_fix_gr . // Load P1_1 and point to Q1_1 . 
// { .mfi -(p9) ldfe Q1_1 = [table_ptr1] , 0 + ldfe Q1_1 = [table_ptr1] // // N even: rsq = r * Z // N odd: S_hi = frcpa(r) // -(p12) fmerge.ns S_hi = S_hi, S_hi - nop.i 999 +(p12) fmerge.ns S_hi = S_hi, S_hi + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // // Case 2_reduce: // c = s - U_1 // -(p9) fsub.s1 c = c, U_2 - nop.i 999 ;; +(p9) fsub.s1 c = c, U_2 + nop.i 999 ;; } { .mfi - nop.m 999 -(p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 ;; + nop.m 999 +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: Change sign of S_hi // -(p11) fmpy.s1 rsq = rsq, P1_1 - nop.i 999 ;; +(p11) fmpy.s1 rsq = rsq, P1_1 + nop.i 999 ;; } { .mfi - nop.m 999 -(p12) fma.s1 S_hi = S_hi, poly1, S_hi - nop.i 999 ;; + nop.m 999 +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: rsq = rsq * P1_1 // N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary // -(p11) fma.s1 Result = r, rsq, c - nop.i 999 ;; +(p11) fma.s1 Poly = r, rsq, c + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // -// N even: Result = c + r * rsq +// N even: Poly = c + r * rsq // N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary // -(p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 ;; +(p12) fma.s1 poly1 = S_hi, r, f1 +(p11) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl } { .mfi - nop.m 999 + nop.m 999 // -// N even: Result = Result + r +// N even: Result = Poly + r // N odd: poly1 = 1.0 + S_hi * r 32 bits partial // -(p11) fadd.s0 Result = r, Result - nop.i 999 ;; +(p14) fadd.s0 Result = r, Poly // for tanl + nop.i 999 +} +{ .mfi + nop.m 999 +(p15) fms.s0 Result = r, mOne, Poly // for cotl + nop.i 999 } +;; + { .mfi - nop.m 999 -(p12) fma.s1 S_hi = S_hi, poly1, S_hi - nop.i 999 ;; + nop.m 999 +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: Result1 = Result + r // N odd: S_hi = S_hi * poly1 + S_hi 32 bits // -(p12) fma.s1 poly1 = S_hi, r, 
f1 - nop.i 999 ;; +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: poly1 = S_hi * r + 1.0 64 bits partial // -(p12) fma.s1 S_hi = S_hi, poly1, S_hi - nop.i 999 ;; +(p12) fma.s1 S_hi = S_hi, poly1, S_hi + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: poly1 = S_hi * poly + 1.0 64 bits // -(p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 ;; +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: poly1 = S_hi * r + 1.0 // -(p12) fma.s1 poly1 = S_hi, c, poly1 - nop.i 999 ;; +(p12) fma.s1 poly1 = S_hi, c, poly1 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: poly1 = S_hi * c + poly1 // -(p12) fmpy.s1 S_lo = S_hi, poly1 - nop.i 999 ;; +(p12) fmpy.s1 S_lo = S_hi, poly1 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: S_lo = S_hi * poly1 // -(p12) fma.s1 S_lo = Q1_1, r, S_lo - nop.i 999 +(p12) fma.s1 S_lo = Q1_1, r, S_lo +(p12) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl } { .mfi - nop.m 999 + nop.m 999 // // N odd: Result = S_hi + S_lo // -(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact - nop.i 999 ;; + fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact + nop.i 999 ;; } -{ .mfb - nop.m 999 +{ .mfi + nop.m 999 // // N odd: S_lo = S_lo + Q1_1 * r // -(p12) fadd.s0 Result = S_hi, S_lo -(p0) br.ret.sptk b0 ;; +(p14) fadd.s0 Result = S_hi, S_lo // for tanl + nop.i 999 +} +{ .mfb + nop.m 999 +(p15) fms.s0 Result = S_hi, mOne, S_lo // for cotl + br.ret.sptk b0 ;; // Exit for pi/4 <= |x| < 2^24 and |s| < 2^-33 } -L(TANL_LARGER_ARG): - +TANL_LARGER_ARG: +// Here if 2^24 <= |x| < 2^63 // // ARGUMENT REDUCTION CODE - CASE 3 and 4 // -{ .mfi -(p0) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp -(p0) fmpy.s1 N_0 = Arg, Inv_P_0 - nop.i 999 +{ .mmf + mov GR_exp_2tom14 = 0xffff - 14 // Form signexp of 2^-14 + mov GR_exp_m2tom14 = 0x2ffff - 14 // Form signexp of -2^-14 + fmpy.s1 N_0 = Norm_Arg, Inv_P_0 } ;; { .mmi 
-(p0) ld8 table_ptr1 = [table_ptr1] - nop.m 999 + setf.exp TWO_TO_NEG14 = GR_exp_2tom14 // Form 2^-14 + setf.exp NEGTWO_TO_NEG14 = GR_exp_m2tom14// Form -2^-14 nop.i 999 } ;; @@ -1622,661 +1806,605 @@ L(TANL_LARGER_ARG): // N_0 = Arg * Inv_P_0 // { .mmi -(p0) add table_ptr1 = 8, table_ptr1 ;; -// -// Point to 2*-14 -// -(p0) ldfs TWO_TO_NEG14 = [table_ptr1], 4 - nop.i 999 ;; + add table_ptr2 = 144, table_base ;; // Point to 2^-2 + ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2] + nop.i 999 } -// -// Load 2**(-14). -// -{ .mmi -(p0) ldfs NEGTWO_TO_NEG14 = [table_ptr1], 180 ;; +;; + // // N_0_fix = integer part of N_0 . -// Adjust table_ptr1 to beginning of table. // -(p0) ldfs TWO_TO_NEG2 = [table_ptr1], 4 - nop.i 999 ;; -} // // Make N_0 the integer part. // { .mfi -(p0) ldfs NEGTWO_TO_NEG2 = [table_ptr1] -// -// Load -2**(-14). -// -(p0) fcvt.fx.s1 N_0_fix = N_0 - nop.i 999 ;; + nop.m 999 + fcvt.fx.s1 N_0_fix = N_0 + nop.i 999 ;; } { .mfi - nop.m 999 -(p0) fcvt.xf N_0 = N_0_fix - nop.i 999 ;; + setf.sig B_mask1 = bmask1 // Form mask to get 5 msb of r + fcvt.xf N_0 = N_0_fix + nop.i 999 ;; } { .mfi - nop.m 999 -(p0) fnma.s1 ArgPrime = N_0, P_0, Arg - nop.i 999 + setf.sig B_mask2 = bmask2 // Form mask to form B from r + fnma.s1 ArgPrime = N_0, P_0, Norm_Arg + nop.i 999 } { .mfi - nop.m 999 -(p0) fmpy.s1 w = N_0, d_1 - nop.i 999 ;; + nop.m 999 + fmpy.s1 w = N_0, d_1 + nop.i 999 ;; } -{ .mfi - nop.m 999 // // ArgPrime = -N_0 * P_0 + Arg // w = N_0 * d_1 // -(p0) fmpy.s1 N = ArgPrime, two_by_PI - nop.i 999 ;; -} -{ .mfi - nop.m 999 // // N = ArgPrime * 2/pi // -(p0) fcvt.fx.s1 N_fix = N - nop.i 999 ;; -} +// fcvt.fx.s1 N_fix = N +// Use special scaling to right shift so N=Arg * 2/pi is in rightmost bits +// Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24 { .mfi - nop.m 999 -// -// N_fix is the integer part. 
-// -(p0) fcvt.xf N = N_fix - nop.i 999 ;; + nop.m 999 + fma.s1 N_fix = ArgPrime, FR_inv_pi_2to63, FR_rshf_2to64 + + nop.i 999 ;; } +// Convert integer N_fix back to normalized floating-point value. { .mfi -(p0) getf.sig N_fix_gr = N_fix - nop.f 999 - nop.i 999 ;; + nop.m 999 + fms.s1 N = N_fix, FR_2tom64, FR_rshf // Use scaling to get N floated + nop.i 999 } -{ .mfi - nop.m 999 +;; + // // N is the integer part of the reduced-reduced argument. // Put the integer in a GP register. // -(p0) fnma.s1 s_val = N, P_1, ArgPrime - nop.i 999 -} { .mfi - nop.m 999 -(p0) fnma.s1 w = N, P_2, w - nop.i 999 ;; + getf.sig N_fix_gr = N_fix + nop.f 999 + nop.i 999 } -{ .mfi - nop.m 999 +;; + // // s_val = -N*P_1 + ArgPrime // w = -N*P_2 + w // -(p0) fcmp.lt.unc.s1 p11, p10 = s_val, TWO_TO_NEG14 - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14 - nop.i 999 ;; -} { .mfi - nop.m 999 -// -// Case 3: r = s_val + w (Z complete) -// Case 4: U_hi = N_0 * d_1 -// -(p10) fmpy.s1 V_hi = N, P_2 - nop.i 999 + nop.m 999 + fnma.s1 s_val = N, P_1, ArgPrime + nop.i 999 } { .mfi - nop.m 999 -(p11) fmpy.s1 U_hi = N_0, d_1 - nop.i 999 ;; + nop.m 999 + fnma.s1 w = N, P_2, w + nop.i 999 } -{ .mfi - nop.m 999 -// -// Case 3: r = s_val + w (Z complete) +;; + +// Case 4: V_hi = N * P_2 // Case 4: U_hi = N_0 * d_1 -// -(p11) fmpy.s1 V_hi = N, P_2 - nop.i 999 -} { .mfi - nop.m 999 -(p11) fmpy.s1 U_hi = N_0, d_1 - nop.i 999 ;; + nop.m 999 + fmpy.s1 V_hi = N, P_2 // V_hi = N * P_2 for |s| < 2^-14 + nop.i 999 } { .mfi - nop.m 999 -// -// Decide between case 3 and 4: -// Case 3: s <= -2**(-14) or s >= 2**(-14) -// Case 4: -2**(-14) < s < 2**(-14) -// -(p10) fadd.s1 r = s_val, w - nop.i 999 + nop.m 999 + fmpy.s1 U_hi = N_0, d_1 // U_hi = N_0 * d_1 for |s| < 2^-14 + nop.i 999 } +;; + +// Case 3: r = s_val + w (Z complete) +// Case 4: w = N * P_3 { .mfi - nop.m 999 -(p11) fmpy.s1 w = N, P_3 - nop.i 999 ;; + nop.m 999 + fadd.s1 r = s_val, w // r = s_val + w for |s| >= 2^-14 + 
nop.i 999 } { .mfi - nop.m 999 -// -// Case 4: We need abs of both U_hi and V_hi - dont -// worry about switched sign of V_hi . -// -(p11) fsub.s1 A = U_hi, V_hi - nop.i 999 + nop.m 999 + fmpy.s1 w2 = N, P_3 // w = N * P_3 for |s| < 2^-14 + nop.i 999 } -{ .mfi - nop.m 999 -// +;; + // Case 4: A = U_hi + V_hi // Note: Worry about switched sign of V_hi, so subtract instead of add. -// -(p11) fnma.s1 V_lo = N, P_2, V_hi - nop.i 999 ;; +// Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup) +// Note: the (-) is still missing for V_hi. +{ .mfi + nop.m 999 + fsub.s1 A = U_hi, V_hi // A = U_hi - V_hi for |s| < 2^-14 + nop.i 999 } { .mfi - nop.m 999 -(p11) fms.s1 U_lo = N_0, d_1, U_hi - nop.i 999 ;; + nop.m 999 + fnma.s1 V_lo = N, P_2, V_hi // V_lo = V_hi - N * P_2 for |s| < 2^-14 + nop.i 999 } +;; + +// Decide between case 3 and 4: +// Case 3: |s| >= 2**(-14) Set p10 +// Case 4: |s| < 2**(-14) Set p11 +// +// Case 4: U_lo = N_0 * d_1 - U_hi { .mfi - nop.m 999 -(p11) fabs V_hiabs = V_hi - nop.i 999 + nop.m 999 + fms.s1 U_lo = N_0, d_1, U_hi // U_lo = N_0*d_1 - U_hi for |s| < 2^-14 + nop.i 999 } { .mfi - nop.m 999 -// -// Case 4: V_hi = N * P_2 -// w = N * P_3 -// Note the product does not include the (-) as in the writeup -// so (-) missing for V_hi and w . -(p10) fadd.s1 r = s_val, w - nop.i 999 ;; + nop.m 999 + fcmp.lt.s1 p11, p10 = s_val, TWO_TO_NEG14 + nop.i 999 } +;; + +// Case 4: We need abs of both U_hi and V_hi - dont +// worry about switched sign of V_hi. 
{ .mfi - nop.m 999 -// -// Case 3: c = s_val - r -// Case 4: U_lo = N_0 * d_1 - U_hi -// -(p11) fabs U_hiabs = U_hi - nop.i 999 + nop.m 999 + fabs V_hiabs = V_hi // |V_hi| for |s| < 2^-14 + nop.i 999 } { .mfi - nop.m 999 -(p11) fmpy.s1 w = N, P_3 - nop.i 999 ;; + nop.m 999 +(p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14 + nop.i 999 } +;; + +// Case 3: c = s_val - r { .mfi - nop.m 999 -// -// Case 4: Set P_12 if U_hiabs >= V_hiabs -// -(p11) fadd.s1 C_hi = s_val, A - nop.i 999 ;; + nop.m 999 + fabs U_hiabs = U_hi // |U_hi| for |s| < 2^-14 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 + fsub.s1 c = s_val, r // c = s_val - r for |s| >= 2^-14 + nop.i 999 +} +;; + +// For Case 3, |s| >= 2^-14, determine if |r| < 1/4 // // Case 4: C_hi = s_val + A // -(p11) fadd.s1 t = U_lo, V_lo - nop.i 999 ;; -} { .mfi - nop.m 999 -// -// Case 3: Is |r| < 2**(-2), if so set PR_7 -// else set PR_8. -// Case 3: If PR_7 is set, prepare to branch to Small_R. -// Case 3: If PR_8 is set, prepare to branch to Normal_R. -// -(p10) fsub.s1 c = s_val, r - nop.i 999 ;; + nop.m 999 +(p11) fadd.s1 C_hi = s_val, A // C_hi = s_val + A for |s| < 2^-14 + nop.i 999 } { .mfi - nop.m 999 -// -// Case 3: c = (s - r) + w (c complete) -// -(p11) fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs - nop.i 999 + nop.m 999 +(p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2 + nop.i 999 } +;; + { .mfi - nop.m 999 -(p11) fms.s1 w = N_0, d_2, w - nop.i 999 ;; + getf.sig sig_r = r // Get signif of r if |s| >= 2^-33 + fand B = B_mask1, r + nop.i 999 } +;; + +// Case 4: t = U_lo + V_lo { .mfi - nop.m 999 -// -// Case 4: V_hi = N * P_2 -// w = N * P_3 -// Note the product does not include the (-) as in the writeup -// so (-) missing for V_hi and w . 
-// -(p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2 - nop.i 999 ;; + getf.exp exp_r = r // Extract signexp of r if |s| >= 2^-33 +(p11) fadd.s1 t = U_lo, V_lo // t = U_lo + V_lo for |s| < 2^-14 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2 - nop.i 999 ;; + nop.i 999 } -{ .mfb - nop.m 999 +;; + +// Case 3: c = (s - r) + w (c complete) +{ .mfi + nop.m 999 +(p10) fadd.s1 c = c, w // c = c + w for |s| >= 2^-14 + nop.i 999 +} +{ .mbb + nop.m 999 +(p14) br.cond.spnt TANL_SMALL_R // Branch if 2^24 <= |x| < 2^63 and |r|< 1/4 +(p15) br.cond.sptk TANL_NORMAL_R_A // Branch if 2^24 <= |x| < 2^63 and |r|>=1/4 +} +;; + + +// Here if 2^24 <= |x| < 2^63 and |s| < 2^-14 >>>>>>> Case 4. // -// Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup) -// Note: the (-) is still missing for V_hi . +// Case 4: Set P_12 if U_hiabs >= V_hiabs // Case 4: w = w + N_0 * d_2 // Note: the (-) is now incorporated in w . -// -(p10) fadd.s1 c = c, w -// -// Case 4: t = U_lo + V_lo -// Note: remember V_lo should be (-), subtract instead of add. NO -// -(p14) br.cond.spnt L(TANL_SMALL_R) ;; -} -{ .mib - nop.m 999 - nop.i 999 -(p15) br.cond.spnt L(TANL_NORMAL_R) ;; -} { .mfi - nop.m 999 -// -// Case 3: Vector off when |r| < 2**(-2). Recall that PR_3 will be true. -// The remaining stuff is for Case 4. 
-// -(p12) fsub.s1 a = U_hi, A -(p11) extr.u i_1 = N_fix_gr, 0, 1 ;; + add table_ptr1 = 160, table_base // Point to tanl_table_p1 + fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs + nop.i 999 } { .mfi - nop.m 999 -// -// Case 4: C_lo = s_val - C_hi -// -(p11) fadd.s1 t = t, w - nop.i 999 + nop.m 999 + fms.s1 w2 = N_0, d_2, w2 + nop.i 999 } +;; + +// Case 4: C_lo = s_val - C_hi { .mfi - nop.m 999 -(p13) fadd.s1 a = V_hi, A - nop.i 999 ;; + ldfe P1_1 = [table_ptr1], 16 // Load P1_1 + fsub.s1 C_lo = s_val, C_hi + nop.i 999 } - - +;; // // Case 4: a = U_hi - A // a = V_hi - A (do an add to account for missing (-) on V_hi // - { .mfi -(p11) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp -(p11) fsub.s1 C_lo = s_val, C_hi - nop.i 999 + ldfe P1_2 = [table_ptr1], 128 // Load P1_2 +(p12) fsub.s1 a = U_hi, A + nop.i 999 +} +{ .mfi + nop.m 999 +(p13) fadd.s1 a = V_hi, A + nop.i 999 } ;; +// Case 4: t = U_lo + V_lo + w +{ .mfi + ldfe Q1_1 = [table_ptr1], 16 // Load Q1_1 + fadd.s1 t = t, w2 + nop.i 999 +} +;; - -// // Case 4: a = (U_hi - A) + V_hi // a = (V_hi - A) + U_hi // In each case account for negative missing form V_hi . // - - -{ .mmi -(p11) ld8 table_ptr1 = [table_ptr1] +{ .mfi + ldfe Q1_2 = [table_ptr1], 16 // Load Q1_2 +(p12) fsub.s1 a = a, V_hi + nop.i 999 +} +{ .mfi nop.m 999 +(p13) fsub.s1 a = U_hi, a nop.i 999 } ;; - // // Case 4: C_lo = (s_val - C_hi) + A // -{ .mmi -(p11) add table_ptr1 = 224, table_ptr1 ;; -(p11) ldfe P1_1 = [table_ptr1], 16 - nop.i 999 ;; -} { .mfi -(p11) ldfe P1_2 = [table_ptr1], 128 -// -// Case 4: w = U_lo + V_lo + w -// -(p12) fsub.s1 a = a, V_hi - nop.i 999 ;; -} -// -// Case 4: r = C_hi + C_lo -// -{ .mfi -(p11) ldfe Q1_1 = [table_ptr1], 16 -(p11) fadd.s1 C_lo = C_lo, A - nop.i 999 ;; + nop.m 999 + fadd.s1 C_lo = C_lo, A + nop.i 999 ;; } // -// Case 4: c = C_hi - r -// Get [i_1] - lsb of N_fix_gr. 
+// Case 4: t = t + a // { .mfi -(p11) ldfe Q1_2 = [table_ptr1], 16 - nop.f 999 - nop.i 999 ;; + nop.m 999 + fadd.s1 t = t, a + nop.i 999 } +;; + +// Case 4: C_lo = C_lo + t +// Case 4: r = C_hi + C_lo { .mfi - nop.m 999 -(p13) fsub.s1 a = U_hi, a - nop.i 999 ;; + nop.m 999 + fadd.s1 C_lo = C_lo, t + nop.i 999 } +;; + { .mfi - nop.m 999 -(p11) fadd.s1 t = t, a - nop.i 999 ;; + nop.m 999 + fadd.s1 r = C_hi, C_lo + nop.i 999 } -{ .mfi - nop.m 999 +;; + // -// Case 4: t = t + a +// Case 4: c = C_hi - r // -(p11) fadd.s1 C_lo = C_lo, t - nop.i 999 ;; -} { .mfi - nop.m 999 -// -// Case 4: C_lo = C_lo + t -// -(p11) fadd.s1 r = C_hi, C_lo - nop.i 999 ;; + nop.m 999 + fsub.s1 c = C_hi, r + nop.i 999 } { .mfi - nop.m 999 -(p11) fsub.s1 c = C_hi, r - nop.i 999 + nop.m 999 + fmpy.s1 rsq = r, r + add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl) } -{ .mfi - nop.m 999 -// +;; + // Case 4: c = c + C_lo finished. -// Is i_1 even or odd? -// if i_1 == 0, set PR_4, else set PR_5. // -// r and c have been computed. -// We known whether this is the sine or cosine routine. -// Make sure ftz mode is set - should be automatic when using wre -(p0) fmpy.s1 rsq = r, r - nop.i 999 ;; -} +// Is i_1 = lsb of N_fix_gr even or odd? +// if i_1 == 0, set PR_11, else set PR_12. +// { .mfi - nop.m 999 -(p11) fadd.s1 c = c , C_lo -(p11) cmp.eq.unc p11, p12 = 0x0000, i_1 ;; + nop.m 999 + fadd.s1 c = c , C_lo + tbit.z p11, p12 = N_fix_gr, 0 } +;; + +// r and c have been computed. 
{ .mfi - nop.m 999 + nop.m 999 (p12) frcpa.s1 S_hi, p0 = f1, r - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // // N odd: Change sign of S_hi // -(p11) fma.s1 Result = rsq, P1_2, P1_1 - nop.i 999 ;; +(p11) fma.s1 Poly = rsq, P1_2, P1_1 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 P = rsq, Q1_2, Q1_1 - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // // N odd: Result = S_hi + S_lo (User supplied rounding mode for C1) // -(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact - nop.i 999 ;; + fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: rsq = r * r // N odd: S_hi = frcpa(r) // (p12) fmerge.ns S_hi = S_hi, S_hi - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // // N even: rsq = rsq * P1_2 + P1_1 // N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary // -(p11) fmpy.s1 Result = rsq, Result - nop.i 999 ;; +(p11) fmpy.s1 Poly = rsq, Poly + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 poly1 = S_hi, r,f1 - nop.i 999 +(p11) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl } { .mfi - nop.m 999 + nop.m 999 // -// N even: Result = Result * rsq +// N even: Poly = Poly * rsq // N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary // -(p11) fma.s1 Result = r, Result, c - nop.i 999 ;; +(p11) fma.s1 Poly = r, Poly, c + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 S_hi = S_hi, poly1, S_hi - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // // N odd: S_hi = S_hi * poly1 + S_hi 32 bits // -(p11) fadd.s0 Result= r, Result - nop.i 999 ;; +(p14) fadd.s0 Result = r, Poly // for tanl + nop.i 999 ;; } + +.pred.rel "mutex",p15,p12 { .mfi - nop.m 999 + nop.m 999 +(p15) fms.s0 Result = r, mOne, Poly // for cotl + nop.i 999 +} +{ .mfi + nop.m 999 (p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // -// N even: Result = Result * r + c +// N 
even: Poly = Poly * r + c // N odd: poly1 = 1.0 + S_hi * r 32 bits partial // (p12) fma.s1 S_hi = S_hi, poly1, S_hi - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // -// N even: Result1 = Result + r (Rounding mode S0) +// N even: Result = Poly + r (Rounding mode S0) // N odd: poly1 = S_hi * r + 1.0 64 bits partial // (p12) fma.s1 S_hi = S_hi, poly1, S_hi - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: poly1 = S_hi * poly + S_hi 64 bits // (p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: poly1 = S_hi * r + 1.0 // (p12) fma.s1 poly1 = S_hi, c, poly1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: poly1 = S_hi * c + poly1 // (p12) fmpy.s1 S_lo = S_hi, poly1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: S_lo = S_hi * poly1 // (p12) fma.s1 S_lo = P, r, S_lo - nop.i 999 ;; +(p12) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl +} + +{ .mfi + nop.m 999 +(p14) fadd.s0 Result = S_hi, S_lo // for tanl + nop.i 999 } { .mfb - nop.m 999 + nop.m 999 // // N odd: S_lo = S_lo + r * P // -(p12) fadd.s0 Result = S_hi, S_lo -(p0) br.ret.sptk b0 ;; +(p15) fms.s0 Result = S_hi, mOne, S_lo // for cotl + br.ret.sptk b0 ;; // Exit for 2^24 <= |x| < 2^63 and |s| < 2^-14 } -L(TANL_SMALL_R): -{ .mii - nop.m 999 -(p0) extr.u i_1 = N_fix_gr, 0, 1 ;; -(p0) cmp.eq.unc p11, p12 = 0x0000, i_1 -} +TANL_SMALL_R: +// Here if |r| < 1/4 +// r and c have been computed. +// ***************************************************************** +// ***************************************************************** +// ***************************************************************** +// N odd: S_hi = frcpa(r) +// Get [i_1] - lsb of N_fix_gr. Set p11 if N even, p12 if N odd. 
+// N even: rsq = r * r { .mfi - nop.m 999 -(p0) fmpy.s1 rsq = r, r - nop.i 999 ;; + add table_ptr1 = 160, table_base // Point to tanl_table_p1 + frcpa.s1 S_hi, p0 = f1, r // S_hi for N odd + add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl) } { .mfi -(p0) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp -(p12) frcpa.s1 S_hi, p0 = f1, r - nop.i 999 + add table_ptr2 = 400, table_base // Point to Q1_7 + fmpy.s1 rsq = r, r + nop.i 999 } ;; - { .mmi -(p0) ld8 table_ptr1 = [table_ptr1] - nop.m 999 - nop.i 999 + ldfe P1_1 = [table_ptr1], 16 +;; + ldfe P1_2 = [table_ptr1], 16 + tbit.z p11, p12 = N_fix_gr, 0 } ;; -// ***************************************************************** -// ***************************************************************** -// ***************************************************************** - -{ .mmi -(p0) add table_ptr1 = 224, table_ptr1 ;; -(p0) ldfe P1_1 = [table_ptr1], 16 - nop.i 999 ;; -} -// r and c have been computed. -// We known whether this is the sine or cosine routine. -// Make sure ftz mode is set - should be automatic when using wre -// |r| < 2**(-2) { .mfi -(p0) ldfe P1_2 = [table_ptr1], 16 -(p11) fmpy.s1 r_to_the_8 = rsq, rsq - nop.i 999 ;; + ldfe P1_3 = [table_ptr1], 96 + nop.f 999 + nop.i 999 } -// -// Set table_ptr1 to beginning of constant table. -// Get [i_1] - lsb of N_fix_gr. -// +;; + { .mfi -(p0) ldfe P1_3 = [table_ptr1], 96 -// -// N even: rsq = r * r -// N odd: S_hi = frcpa(r) -// +(p11) ldfe P1_9 = [table_ptr1], -16 (p12) fmerge.ns S_hi = S_hi, S_hi - nop.i 999 ;; + nop.i 999 } -// -// Is i_1 even or odd? -// if i_1 == 0, set PR_11. -// if i_1 != 0, set PR_12. 
-// { .mfi -(p11) ldfe P1_9 = [table_ptr1], -16 + nop.m 999 +(p11) fmpy.s1 r_to_the_8 = rsq, rsq + nop.i 999 +} +;; + // // N even: Poly2 = P1_7 + Poly2 * rsq // N odd: poly2 = Q1_5 + poly2 * rsq // +{ .mfi +(p11) ldfe P1_8 = [table_ptr1], -16 (p11) fadd.s1 CORR = rsq, f1 - nop.i 999 ;; + nop.i 999 } -{ .mmi -(p11) ldfe P1_8 = [table_ptr1], -16 ;; +;; + // // N even: Poly1 = P1_2 + P1_3 * rsq -// N odd: poly1 = 1.0 + S_hi * r +// N odd: poly1 = 1.0 + S_hi * r // 16 bits partial account for necessary (-1) // +{ .mmi (p11) ldfe P1_7 = [table_ptr1], -16 - nop.i 999 ;; +;; +(p11) ldfe P1_6 = [table_ptr1], -16 + nop.i 999 } +;; + // // N even: Poly1 = P1_1 + Poly1 * rsq // N odd: S_hi = S_hi + S_hi * poly1) 16 bits account for necessary // -{ .mfi -(p11) ldfe P1_6 = [table_ptr1], -16 // // N even: Poly2 = P1_5 + Poly2 * rsq // N odd: poly2 = Q1_3 + poly2 * rsq // +{ .mfi +(p11) ldfe P1_5 = [table_ptr1], -16 (p11) fmpy.s1 r_to_the_8 = r_to_the_8, r_to_the_8 - nop.i 999 ;; + nop.i 999 } +{ .mfi + nop.m 999 +(p12) fma.s1 poly1 = S_hi, r, f1 + nop.i 999 +} +;; + // // N even: Poly1 = Poly1 * rsq // N odd: poly1 = 1.0 + S_hi * r 32 bits partial // -{ .mfi -(p11) ldfe P1_5 = [table_ptr1], -16 -(p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 ;; -} // // N even: CORR = CORR * c @@ -2290,44 +2418,30 @@ L(TANL_SMALL_R): { .mmf (p11) ldfe P1_4 = [table_ptr1], -16 -(p0) addl table_ptr2 = @ltoff(TANL_BASE_CONSTANTS), gp -(p11) fmpy.s1 CORR = CORR, c -} -;; - - -{ .mmi -(p0) ld8 table_ptr2 = [table_ptr2] nop.m 999 - nop.i 999 +(p11) fmpy.s1 CORR = CORR, c } ;; - -{ .mii -(p0) add table_ptr2 = 464, table_ptr2 - nop.i 999 ;; - nop.i 999 -} { .mfi - nop.m 999 + nop.m 999 (p11) fma.s1 Poly1 = P1_3, rsq, P1_2 - nop.i 999 ;; + nop.i 999 ;; } { .mfi -(p0) ldfe Q1_7 = [table_ptr2], -16 +(p12) ldfe Q1_7 = [table_ptr2], -16 (p12) fma.s1 S_hi = S_hi, poly1, S_hi - nop.i 999 ;; + nop.i 999 ;; } { .mfi -(p0) ldfe Q1_6 = [table_ptr2], -16 +(p12) ldfe Q1_6 = [table_ptr2], -16 (p11) fma.s1 Poly2 = 
P1_9, rsq, P1_8 - nop.i 999 ;; + nop.i 999 ;; } { .mmi -(p0) ldfe Q1_5 = [table_ptr2], -16 ;; +(p12) ldfe Q1_5 = [table_ptr2], -16 ;; (p12) ldfe Q1_4 = [table_ptr2], -16 - nop.i 999 ;; + nop.i 999 ;; } { .mfi (p12) ldfe Q1_3 = [table_ptr2], -16 @@ -2336,735 +2450,795 @@ L(TANL_SMALL_R): // N odd: poly2 = Q1_6 + Q1_7 * rsq // (p11) fma.s1 Poly1 = Poly1, rsq, P1_1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi (p12) ldfe Q1_2 = [table_ptr2], -16 (p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi (p12) ldfe Q1_1 = [table_ptr2], -16 (p11) fma.s1 Poly2 = Poly2, rsq, P1_7 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: CORR = rsq + 1 // N even: r_to_the_8 = rsq * rsq // (p11) fmpy.s1 Poly1 = Poly1, rsq - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 S_hi = S_hi, poly1, S_hi - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 poly2 = Q1_7, rsq, Q1_6 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p11) fma.s1 Poly2 = Poly2, rsq, P1_6 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 poly2 = poly2, rsq, Q1_5 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p11) fma.s1 Poly2= Poly2, rsq, P1_5 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 S_hi = S_hi, poly1, S_hi - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 poly2 = poly2, rsq, Q1_4 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: r_to_the_8 = r_to_the_8 * r_to_the_8 // N odd: poly1 = S_hi * r + 1.0 64 bits partial // (p11) fma.s1 Poly2 = Poly2, rsq, P1_4 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // -// N even: Result = CORR + Poly * r +// N even: Poly = CORR + Poly * r // N odd: P = Q1_1 + poly2 * rsq // (p12) fma.s1 poly1 = S_hi, r, f1 - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 
(p12) fma.s1 poly2 = poly2, rsq, Q1_3 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: Poly2 = P1_4 + Poly2 * rsq // N odd: poly2 = Q1_2 + poly2 * rsq // (p11) fma.s1 Poly = Poly2, r_to_the_8, Poly1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 poly1 = S_hi, c, poly1 - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 poly2 = poly2, rsq, Q1_2 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: Poly = Poly1 + Poly2 * r_to_the_8 // N odd: S_hi = S_hi * poly1 + S_hi 64 bits // -(p11) fma.s1 Result = Poly, r, CORR - nop.i 999 ;; +(p11) fma.s1 Poly = Poly, r, CORR + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // -// N even: Result = r + Result (User supplied rounding mode) +// N even: Result = r + Poly (User supplied rounding mode) // N odd: poly1 = S_hi * c + poly1 // (p12) fmpy.s1 S_lo = S_hi, poly1 - nop.i 999 +(p11) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl } { .mfi - nop.m 999 + nop.m 999 (p12) fma.s1 P = poly2, rsq, Q1_1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: poly1 = S_hi * r + 1.0 // // // N odd: S_lo = S_hi * poly1 // -(p11) fadd.s0 Result = Result, r - nop.i 999 ;; +(p14) fadd.s0 Result = Poly, r // for tanl + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 +(p15) fms.s0 Result = Poly, mOne, r // for cotl + nop.i 999 ;; +} + +{ .mfi + nop.m 999 // // N odd: S_lo = Q1_1 * c + S_lo // (p12) fma.s1 S_lo = Q1_1, c, S_lo - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 -(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact - nop.i 999 ;; + nop.m 999 + fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: Result = S_lo + r * P // (p12) fma.s1 Result = P, r, S_lo - nop.i 999 ;; +(p12) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl } -{ .mfb - nop.m 999 + // // N odd: Result = Result + S_hi (user supplied 
rounding mode) // -(p12) fadd.s0 Result = Result, S_hi -(p0) br.ret.sptk b0 ;; +{ .mfi + nop.m 999 +(p14) fadd.s0 Result = Result, S_hi // for tanl + nop.i 999 +} +{ .mfb + nop.m 999 +(p15) fms.s0 Result = Result, mOne, S_hi // for cotl + br.ret.sptk b0 ;; // Exit |r| < 1/4 path } -L(TANL_NORMAL_R): -{ .mfi -(p0) getf.sig sig_r = r +TANL_NORMAL_R: +// Here if 1/4 <= |x| < pi/4 or if |x| >= 2^63 and |r| >= 1/4 // ******************************************************************* // ******************************************************************* // ******************************************************************* // // r and c have been computed. -// Make sure ftz mode is set - should be automatic when using wre -// // -// Get [i_1] - lsb of N_fix_gr alone. -// -(p0) fmerge.s Pos_r = f1, r -(p0) extr.u i_1 = N_fix_gr, 0, 1 ;; -} -{ .mfi - nop.m 999 -(p0) fmerge.s sgn_r = r, f1 -(p0) cmp.eq.unc p11, p12 = 0x0000, i_1 ;; -} -{ .mfi - nop.m 999 - nop.f 999 -(p0) extr.u lookup = sig_r, 58, 5 -} -{ .mlx - nop.m 999 -(p0) movl Create_B = 0x8200000000000000 ;; -} { .mfi -(p0) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp - nop.f 999 -(p0) dep Create_B = lookup, Create_B, 58, 5 -} -;; - - -// -// Get [i_1] - lsb of N_fix_gr alone. -// Pos_r = abs (r) -// - - -{ .mmi -(p0) ld8 table_ptr1 = [table_ptr1] nop.m 999 + fand B = B_mask1, r nop.i 999 } ;; - +TANL_NORMAL_R_A: +// Enter here if pi/4 <= |x| < 2^63 and |r| >= 1/4 +// Get the 5 bits or r for the lookup. 1.xxxxx .... { .mmi - nop.m 999 -(p0) setf.sig B = Create_B -// -// Set table_ptr1 and table_ptr2 to base address of -// constant table. -// -(p0) add table_ptr1 = 480, table_ptr1 ;; -} -{ .mmb - nop.m 999 -// -// Is i_1 or i_0 == 0 ? -// Create the constant 1 00000 1000000000000000000000... 
-// -(p0) ldfe P2_1 = [table_ptr1], 16 - nop.b 999 + add table_ptr1 = 416, table_base // Point to tanl_table_p2 + mov GR_exp_2tom65 = 0xffff - 65 // Scaling constant for B + extr.u lookup = sig_r, 58, 5 } +;; + { .mmi - nop.m 999 ;; -(p0) getf.exp exp_r = Pos_r - nop.i 999 + ldfe P2_1 = [table_ptr1], 16 + setf.exp TWO_TO_NEG65 = GR_exp_2tom65 // 2^-65 for scaling B if exp_r=-2 + add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl) } -// -// Get r's exponent -// Get r's significand -// -{ .mmi -(p0) ldfe P2_2 = [table_ptr1], 16 ;; -// -// Get the 5 bits or r for the lookup. 1.xxxxx .... -// from sig_r. -// Grab lsb of exp of B -// -(p0) ldfe P2_3 = [table_ptr1], 16 - nop.i 999 ;; +;; + +.pred.rel "mutex",p11,p12 +// B = 2^63 * 1.xxxxx 100...0 +{ .mfi + ldfe P2_2 = [table_ptr1], 16 + for B = B_mask2, B + mov table_offset = 512 // Assume table offset is 512 } -{ .mii - nop.m 999 -(p0) andcm table_offset = 0x0001, exp_r ;; -(p0) shl table_offset = table_offset, 9 ;; +;; + +{ .mfi + ldfe P2_3 = [table_ptr1], 16 + fmerge.s Pos_r = f1, r + tbit.nz p8,p9 = exp_r, 0 } -{ .mii - nop.m 999 -// -// Deposit 0 00000 1000000000000000000000... on -// 1 xxxxx yyyyyyyyyyyyyyyyyyyyyy..., -// getting rid of the ys. +;; + // Is B = 2** -2 or B= 2** -1? If 2**-1, then // we want an offset of 512 for table addressing. -// -(p0) shladd table_offset = lookup, 4, table_offset ;; -// -// B = ........ 1xxxxx 1000000000000000000... -// -(p0) add table_ptr1 = table_ptr1, table_offset ;; -} -{ .mmb - nop.m 999 -// -// B = ........ 1xxxxx 1000000000000000000... 
-// Convert B so it has the same exponent as Pos_r -// -(p0) ldfd T_hi = [table_ptr1], 8 - nop.b 999 ;; +{ .mii + add table_ptr2 = 1296, table_base // Point to tanl_table_cm2 +(p9) shladd table_offset = lookup, 4, table_offset +(p8) shladd table_offset = lookup, 4, r0 } +;; +{ .mmi + add table_ptr1 = table_ptr1, table_offset // Point to T_hi + add table_ptr2 = table_ptr2, table_offset // Point to C_hi + add table_ptr3 = 2128, table_base // Point to tanl_table_scim2 +} +;; +{ .mmi + ldfd T_hi = [table_ptr1], 8 // Load T_hi +;; + ldfd C_hi = [table_ptr2], 8 // Load C_hi + add table_ptr3 = table_ptr3, table_offset // Point to SC_inv +} +;; // // x = |r| - B -// Load T_hi. -// Load C_hi. // - -{ .mmf -(p0) addl table_ptr2 = @ltoff(TANL_BASE_CONSTANTS), gp -(p0) ldfs T_lo = [table_ptr1] -(p0) fmerge.se B = Pos_r, B +// Convert B so it has the same exponent as Pos_r before subtracting +{ .mfi + ldfs T_lo = [table_ptr1] // Load T_lo +(p9) fnma.s1 x = B, FR_2tom64, Pos_r + nop.i 999 } -;; - - -{ .mmi -(p0) ld8 table_ptr2 = [table_ptr2] +{ .mfi nop.m 999 +(p8) fnma.s1 x = B, TWO_TO_NEG65, Pos_r nop.i 999 } ;; - -{ .mii -(p0) add table_ptr2 = 1360, table_ptr2 - nop.i 999 ;; -(p0) add table_ptr2 = table_ptr2, table_offset ;; +{ .mfi + ldfs C_lo = [table_ptr2] // Load C_lo + nop.f 999 + nop.i 999 } +;; + { .mfi -(p0) ldfd C_hi = [table_ptr2], 8 -(p0) fsub.s1 x = Pos_r, B - nop.i 999 ;; + ldfe SC_inv = [table_ptr3] // Load SC_inv + fmerge.s sgn_r = r, f1 + tbit.z p11, p12 = N_fix_gr, 0 // p11 if N even, p12 if odd + } -{ .mii -(p0) ldfs C_lo = [table_ptr2],255 - nop.i 999 ;; +;; + // // xsq = x * x // N even: Tx = T_hi * x -// Load T_lo. -// Load C_lo - increment pointer to get SC_inv -// - cant get all the way, do an add later. 
-// -(p0) add table_ptr2 = 569, table_ptr2 ;; -} // // N even: Tx1 = Tx + 1 // N odd: Cx1 = 1 - Cx // + { .mfi -(p0) ldfe SC_inv = [table_ptr2], 0 - nop.f 999 - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p0) fmpy.s1 xsq = x, x - nop.i 999 + nop.m 999 + fmpy.s1 xsq = x, x + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p11) fmpy.s1 Tx = T_hi, x - nop.i 999 ;; -} -{ .mfi - nop.m 999 -(p12) fmpy.s1 Cx = C_hi, x - nop.i 999 ;; + nop.i 999 } -{ .mfi - nop.m 999 +;; + // // N odd: Cx = C_hi * x // -(p0) fma.s1 P = P2_3, xsq, P2_2 - nop.i 999 -} { .mfi - nop.m 999 + nop.m 999 +(p12) fmpy.s1 Cx = C_hi, x + nop.i 999 +} +;; // // N even and odd: P = P2_3 + P2_2 * xsq // +{ .mfi + nop.m 999 + fma.s1 P = P2_3, xsq, P2_2 + nop.i 999 +} +{ .mfi + nop.m 999 (p11) fadd.s1 Tx1 = Tx, f1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: D = C_hi - tanx // N odd: D = T_hi + tanx // (p11) fmpy.s1 CORR = SC_inv, T_hi - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 -(p0) fmpy.s1 Sx = SC_inv, x - nop.i 999 ;; + nop.m 999 + fmpy.s1 Sx = SC_inv, x + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fmpy.s1 CORR = SC_inv, C_hi - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fsub.s1 V_hi = f1, Cx - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 -(p0) fma.s1 P = P, xsq, P2_1 - nop.i 999 + nop.m 999 + fma.s1 P = P, xsq, P2_1 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // // N even and odd: P = P2_1 + P * xsq // (p11) fma.s1 V_hi = Tx, Tx1, f1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: Result = sgn_r * tail + T_hi (user rounding mode for C1) // N odd: Result = sgn_r * tail + C_hi (user rounding mode for C1) // -(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact - nop.i 999 ;; + fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact + nop.i 999 ;; } { .mfi - nop.m 999 -(p0) fmpy.s1 CORR = CORR, c - nop.i 999 ;; + nop.m 999 + fmpy.s1 CORR = CORR, c + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 
(p12) fnma.s1 V_hi = Cx,V_hi,f1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: V_hi = Tx * Tx1 + 1 // N odd: Cx1 = 1 - Cx * Cx1 // -(p0) fmpy.s1 P = P, xsq - nop.i 999 + fmpy.s1 P = P, xsq + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 // // N even and odd: P = P * xsq // (p11) fmpy.s1 V_hi = V_hi, T_hi - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even and odd: tail = P * tail + V_lo // (p11) fmpy.s1 T_hi = sgn_r, T_hi - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 -(p0) fmpy.s1 CORR = CORR, sgn_r - nop.i 999 ;; + nop.m 999 + fmpy.s1 CORR = CORR, sgn_r + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 (p12) fmpy.s1 V_hi = V_hi,C_hi - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: V_hi = T_hi * V_hi // N odd: V_hi = C_hi * V_hi // -(p0) fma.s1 tanx = P, x, x - nop.i 999 + fma.s1 tanx = P, x, x + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fnmpy.s1 C_hi = sgn_r, C_hi - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: V_lo = 1 - V_hi + C_hi // N odd: V_lo = 1 - V_hi + T_hi // (p11) fadd.s1 CORR = CORR, T_lo - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fsub.s1 CORR = CORR, C_lo - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even and odd: tanx = x + x * P // N even and odd: Sx = SC_inv * x // (p11) fsub.s1 D = C_hi, tanx - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fadd.s1 D = T_hi, tanx - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N odd: CORR = SC_inv * C_hi // N even: CORR = SC_inv * T_hi // -(p0) fnma.s1 D = V_hi, D, f1 - nop.i 999 ;; + fnma.s1 D = V_hi, D, f1 + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even and odd: D = 1 - V_hi * D // N even and odd: CORR = CORR * c // -(p0) fma.s1 V_hi = V_hi, D, V_hi - nop.i 999 ;; + fma.s1 V_hi = V_hi, D, V_hi + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even and odd: V_hi = V_hi + V_hi * D // N even and odd: 
CORR = sgn_r * CORR // (p11) fnma.s1 V_lo = V_hi, C_hi, f1 - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fnma.s1 V_lo = V_hi, T_hi, f1 - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: CORR = COOR + T_lo // N odd: CORR = CORR - C_lo // (p11) fma.s1 V_lo = tanx, V_hi, V_lo - nop.i 999 + tbit.nz p15, p0 = cot_flag, 0 // p15=1 if we compute cotl } { .mfi - nop.m 999 + nop.m 999 (p12) fnma.s1 V_lo = tanx, V_hi, V_lo - nop.i 999 ;; + nop.i 999 ;; } + { .mfi - nop.m 999 + nop.m 999 +(p15) fms.s1 T_hi = f0, f0, T_hi // to correct result's sign for cotl + nop.i 999 +} +{ .mfi + nop.m 999 +(p15) fms.s1 C_hi = f0, f0, C_hi // to correct result's sign for cotl + nop.i 999 +};; + +{ .mfi + nop.m 999 +(p15) fms.s1 sgn_r = f0, f0, sgn_r // to correct result's sign for cotl + nop.i 999 +};; + +{ .mfi + nop.m 999 // // N even: V_lo = V_lo + V_hi * tanx // N odd: V_lo = V_lo - V_hi * tanx // (p11) fnma.s1 V_lo = C_lo, V_hi, V_lo - nop.i 999 + nop.i 999 } { .mfi - nop.m 999 + nop.m 999 (p12) fnma.s1 V_lo = T_lo, V_hi, V_lo - nop.i 999 ;; + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: V_lo = V_lo - V_hi * C_lo // N odd: V_lo = V_lo - V_hi * T_lo // -(p0) fmpy.s1 V_lo = V_hi, V_lo - nop.i 999 ;; + fmpy.s1 V_lo = V_hi, V_lo + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even and odd: V_lo = V_lo * V_hi // -(p0) fadd.s1 tail = V_hi, V_lo - nop.i 999 ;; + fadd.s1 tail = V_hi, V_lo + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even and odd: tail = V_hi + V_lo // -(p0) fma.s1 tail = tail, P, V_lo - nop.i 999 ;; + fma.s1 tail = tail, P, V_lo + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even: T_hi = sgn_r * T_hi // N odd : C_hi = -sgn_r * C_hi // -(p0) fma.s1 tail = tail, Sx, CORR - nop.i 999 ;; + fma.s1 tail = tail, Sx, CORR + nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even and odd: tail = Sx * tail + CORR // -(p0) fma.s1 tail = V_hi, Sx, tail - nop.i 999 ;; + fma.s1 tail = V_hi, Sx, tail + 
nop.i 999 ;; } { .mfi - nop.m 999 + nop.m 999 // // N even an odd: tail = Sx * V_hi + tail // (p11) fma.s0 Result = sgn_r, tail, T_hi - nop.i 999 + nop.i 999 } { .mfb - nop.m 999 + nop.m 999 (p12) fma.s0 Result = sgn_r, tail, C_hi -(p0) br.ret.sptk b0 ;; + br.ret.sptk b0 ;; // Exit for 1/4 <= |r| < pi/4 } -L(TANL_SPECIAL): +TANL_DENORMAL: +// Here if x denormal { .mfb - nop.m 999 -(p0) fmpy.s0 Arg = Arg, f0 -(p0) br.ret.sptk b0 ;; + getf.exp GR_signexp_x = Norm_Arg // Get sign and exponent of x + nop.f 999 + br.cond.sptk TANL_COMMON // Return to common code } +;; + + +TANL_SPECIAL: +TANL_UNSUPPORTED: // // Code for NaNs, Unsupporteds, Infs, or +/- zero ? // Invalid raised for Infs and SNaNs. // -.endp tanl -ASM_SIZE_DIRECTIVE(tanl) +{ .mfi + nop.m 999 + fmerge.s f10 = f8, f8 // Save input for error call + tbit.nz p6, p7 = cot_flag, 0 // p6=1 if we compute cotl +} +;; -// ******************************************************************* -// ******************************************************************* -// ******************************************************************* -// -// Special Code to handle very large argument case. 
-// Call int pi_by_2_reduce(&x,&r,&c) -// for |arguments| >= 2**63 -// (Arg or x) is in f8 -// Address to save r and c as double -// ******************************************************************* -// ******************************************************************* -// ******************************************************************* +{ .mfi + nop.m 999 +(p6) fclass.m p6, p7 = f8, 0x7 // Test for zero (cotl only) + nop.i 999 +} +;; + +.pred.rel "mutex", p6, p7 +{ .mfi +(p6) mov GR_Parameter_Tag = 225 // (cotl) +(p6) frcpa.s0 f8, p0 = f1, f8 // cotl(+-0) = +-Inf + nop.i 999 +} +{ .mfb + nop.m 999 +(p7) fmpy.s0 f8 = f8, f0 +(p7) br.ret.sptk b0 +} +;; + +GLOBAL_IEEE754_END(tanl) -.proc __libm_callout -__libm_callout: -L(TANL_ARG_TOO_LARGE): +LOCAL_LIBM_ENTRY(__libm_error_region) .prologue + +// (1) { .mfi - add r50=-32,sp // Parameter: r address - nop.f 0 + add GR_Parameter_Y=-32,sp // Parameter 2 value + nop.f 0 .save ar.pfs,GR_SAVE_PFS - mov GR_SAVE_PFS=ar.pfs // Save ar.pfs + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs } { .mfi .fframe 64 - add sp=-64,sp // Create new stack - nop.f 0 - mov GR_SAVE_GP=gp // Save gp + add sp=-64,sp // Create new stack + nop.f 0 + mov GR_SAVE_GP=gp // Save gp };; + +// (2) { .mmi - stfe [r50] = f0,16 // Clear Parameter r on stack - add r49 = 16,sp // Parameter x address + stfe [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack + add GR_Parameter_X = 16,sp // Parameter 1 address .save b0, GR_SAVE_B0 - mov GR_SAVE_B0=b0 // Save b0 + mov GR_SAVE_B0=b0 // Save b0 };; + .body +// (3) { .mib - stfe [r50] = f0,-16 // Clear Parameter c on stack - nop.i 0 - nop.b 0 + stfe [GR_Parameter_X] = f10 // STORE Parameter 1 on stack + add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address + nop.b 0 } { .mib - stfe [r49] = Arg // Store Parameter x on stack - nop.i 0 -(p0) br.call.sptk b0=__libm_pi_by_2_reduce# ;; + stfe [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack + add GR_Parameter_Y = -16,GR_Parameter_Y + br.call.sptk 
b0=__libm_error_support# // Call error handling function +};; +{ .mmi + nop.m 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp +};; + +// (4) +{ .mmi + ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack +.restore sp + add sp = 64,sp // Restore stack pointer + mov b0 = GR_SAVE_B0 // Restore return address };; +{ .mib + mov gp = GR_SAVE_GP // Restore gp + mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs + br.ret.sptk b0 // Return +};; + +LOCAL_LIBM_END(__libm_error_region) + +.type __libm_error_support#,@function +.global __libm_error_support# + + +// ******************************************************************* +// ******************************************************************* +// ******************************************************************* // -// Load 2^-2 +// Special Code to handle very large argument case. +// Call int __libm_pi_by_2_reduce(x,r,c) for |arguments| >= 2**63 +// The interface is custom: +// On input: +// (Arg or x) is in f8 +// On output: +// r is in f8 +// c is in f9 +// N is in r8 +// We know also that __libm_pi_by_2_reduce preserves f10-15, f71-127. We +// use this to eliminate save/restore of key fp registers in this calling +// function. 
// +// ******************************************************************* +// ******************************************************************* +// ******************************************************************* + +LOCAL_LIBM_ENTRY(__libm_callout) +TANL_ARG_TOO_LARGE: +.prologue +{ .mfi + add table_ptr2 = 144, table_base // Point to 2^-2 + nop.f 999 +.save ar.pfs,GR_SAVE_PFS + mov GR_SAVE_PFS=ar.pfs // Save ar.pfs +} +;; + +// Load 2^-2, -2^-2 { .mmi -(p0) ldfe Arg =[r49],16 + ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2] + setf.sig B_mask1 = bmask1 // Form mask to get 5 msb of r +.save b0, GR_SAVE_B0 + mov GR_SAVE_B0=b0 // Save b0 +};; + +.body // -// Call argument reduction +// Call argument reduction with x in f8 +// Returns with N in r8, r in f8, c in f9 +// Assumes f71-127 are preserved across the call // -(p0) ldfs TWO_TO_NEG2 = [table_ptr2],4 -// Get Arg off stack -// Get r off stack - hi order part -// Get c off stack - lo order part -(p0) mov N_fix_gr = r8 ;; -} -{ .mmb -(p0) ldfe r =[r50],16 -(p0) ldfs NEGTWO_TO_NEG2 = [table_ptr2],4 - nop.b 999 ;; +{ .mib + setf.sig B_mask2 = bmask2 // Form mask to form B from r + mov GR_SAVE_GP=gp // Save gp + br.call.sptk b0=__libm_pi_by_2_reduce# } +;; + +// +// Is |r| < 2**(-2) +// { .mfi -(p0) ldfe c =[r50],-32 - nop.f 999 - nop.i 999 ;; + getf.sig sig_r = r // Extract significand of r + fcmp.lt.s1 p6, p0 = r, TWO_TO_NEG2 + mov gp = GR_SAVE_GP // Restore gp } +;; + { .mfi -.restore sp - add sp = 64,sp // Restore stack pointer + getf.exp exp_r = r // Extract signexp of r + nop.f 999 + mov b0 = GR_SAVE_B0 // Restore return address +} +;; + // -// Is |r| < 2**(-2) +// Get N_fix_gr // -(p0) fcmp.lt.unc.s1 p6, p0 = r, TWO_TO_NEG2 -mov b0 = GR_SAVE_B0 // Restore return address -};; { .mfi - mov gp = GR_SAVE_GP // Restore gp -(p6) fcmp.gt.unc.s1 p6, p0 = r, NEGTWO_TO_NEG2 - mov ar.pfs = GR_SAVE_PFS // Restore gp -};; + mov N_fix_gr = r8 +(p6) fcmp.gt.unc.s1 p6, p0 = r, NEGTWO_TO_NEG2 + mov ar.pfs = GR_SAVE_PFS // 
Restore pfs +} +;; + { .mbb - nop.m 999 -(p6) br.cond.spnt L(TANL_SMALL_R) -(p0) br.cond.sptk L(TANL_NORMAL_R) ;; + nop.m 999 +(p6) br.cond.spnt TANL_SMALL_R // Branch if |r| < 1/4 + br.cond.sptk TANL_NORMAL_R // Branch if 1/4 <= |r| < pi/4 } +;; -.endp __libm_callout -ASM_SIZE_DIRECTIVE(__libm_callout) +LOCAL_LIBM_END(__libm_callout) .type __libm_pi_by_2_reduce#,@function .global __libm_pi_by_2_reduce# diff --git a/sysdeps/ia64/fpu/s_trunc.S b/sysdeps/ia64/fpu/s_trunc.S index 0be91200e3..b9ad03b5a8 100644 --- a/sysdeps/ia64/fpu/s_trunc.S +++ b/sysdeps/ia64/fpu/s_trunc.S @@ -1,11 +1,10 @@ .file "trunc.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 7/7/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Shane Story, and Ping Tak Peter Tang of the -// Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -21,33 +20,28 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
// -.align 32 -.global trunc# - -.section .text -.proc trunc# -.align 32 - // History //============================================================== -// 7/7/00: Created +// 07/07/00 Created +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/20/03 Improved performance and reduced code size //============================================================== // API @@ -55,25 +49,28 @@ // double trunc(double x) //============================================================== -#include "libm_support.h" +// general input registers: +// r14 - r18 -// general input registers: -TRUNC_GR_FFFF = r14 -TRUNC_GR_signexp = r15 -TRUNC_GR_exponent = r16 -TRUNC_GR_expmask = r17 -TRUNC_GR_bigexp = r18 +rExpBias = r14 +rSignexp = r15 +rExp = r16 +rExpMask = r17 +rBigexp = r18 // floating-point registers: -// f8, f9, f11, f12 +// f8 - f10 + +fXtruncInt = f9 +fNormX = f10 -// predicate registers used: -// p6, p7, p8, p9, p10, p11 +// predicate registers used: +// p6, p7 // Overview of operation //============================================================== // double trunc(double x) -// Return an integer value (represented as a double) less than or +// Return an integer value (represented as a double) less than or // equal to x in magnitude. // This is x rounded toward zero to an integral value. //============================================================== @@ -97,105 +94,73 @@ TRUNC_GR_bigexp = r18 // If we multiply by 2^23, we no longer have a fractional part // So input is an integer value already. 
-trunc: +.section .text +GLOBAL_LIBM_ENTRY(trunc) { .mfi - getf.exp TRUNC_GR_signexp = f8 - fcvt.fx.trunc.s1 f9 = f8 - addl TRUNC_GR_bigexp = 0x10033, r0 + getf.exp rSignexp = f8 // Get signexp, recompute if unorm + fcvt.fx.trunc.s1 fXtruncInt = f8 // Convert to int in significand + addl rBigexp = 0x10033, r0 // Set exponent at which is integer } { .mfi - mov TRUNC_GR_FFFF = 0x0FFFF - fnorm.d f11 = f8 - mov TRUNC_GR_expmask = 0x1FFFF -};; -// get the exponent of x -// convert x to integer in signficand of f9 -// Normalize x - this will raise invalid on SNaNs, the -// denormal operand flag - and possibly a spurious U flag -// get exponent only mask (will exclude sign bit) + mov rExpBias = 0x0FFFF // Form exponent bias + fnorm.s1 fNormX = f8 // Normalize input + mov rExpMask = 0x1FFFF // Form exponent mask +} +;; { .mfi nop.m 0 - fclass.m p7,p8 = f8, 0x0b + fclass.m p7,p0 = f8, 0x0b // Test x unorm nop.i 0 } -{ .mfi - nop.m 0 - fcmp.eq.unc.s1 p9,p0 = f8,f0 - nop.i 0 -};; -// fclass to set p7 if unnorm -{ .mmi - and TRUNC_GR_exponent = TRUNC_GR_signexp, TRUNC_GR_expmask ;; -(p8) cmp.ge.unc p10,p11 = TRUNC_GR_exponent, TRUNC_GR_bigexp -(p8) cmp.ne.unc p6,p0 = TRUNC_GR_exponent, TRUNC_GR_signexp -};; -// Get the exponent of x -// Test if exponent such that result already an integer -// Test if x < 0 -{ .mmi -(p9) cmp.eq.andcm p10,p11 = r0, r0 -(p6) cmp.lt.unc p6,p0 = TRUNC_GR_exponent, TRUNC_GR_FFFF - nop.i 0 -};; -// If -1 < x < 0, set p6, turn off p10 and p11, and set result to -0.0 -{ .mfb -(p6) cmp.eq.andcm p10,p11 = r0, r0 -(p6) fmerge.s f8 = f8, f0 - nop.b 0 -};; -// If not a unnorm, set p10 if x already is a big int, nan, or inf? -// If not a unnorm, set p10 if x already is a big int, nan, or inf? 
-.pred.rel "mutex",p10,p11 +;; + { .mfb nop.m 0 -(p11) fcvt.xf f8 = f9 - nop.b 0 + fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf +(p7) br.cond.spnt TRUNC_UNORM // Branch if x unorm } +;; + +TRUNC_COMMON: +// Return here from TRUNC_UNORM { .mfb + and rExp = rSignexp, rExpMask // Get biased exponent +(p6) fma.d.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf +(p6) br.ret.spnt b0 // Exit if x natval, nan, inf +} +;; + +{ .mfi + cmp.lt p6,p0 = rExp, rExpBias // Is |x| < 1? + fcvt.xf f8 = fXtruncInt // Result, assume 1 <= |x| < 2^52 + cmp.ge p7,p0 = rExp, rBigexp // Is |x| >= 2^52? +} +;; + +// We must correct result if |x| < 1, or |x| >= 2^52 +.pred.rel "mutex",p6,p7 +{ .mfi nop.m 0 -(p10) fma.d.s1 f8 = f11,f1,f0 -(p8) br.ret.sptk b0 -};; -// If not a unnorm and not an big int, nan,or +/-inf convert signficand -// back to f8. -// If not a unorm and a big int, nan, or +/-inf, return fnorm'd x -// If not a unorm, Return -// If unnorm, get the exponent again - perhaps it wasn't a denorm. -{ .mfb -(p7) getf.exp TRUNC_GR_signexp = f11 -(p7) fcvt.fx.trunc.s1 f12 = f11 - nop.b 0 -};; -{ .mfb - and TRUNC_GR_exponent = TRUNC_GR_signexp, TRUNC_GR_expmask - fcmp.lt.unc.s1 p9,p0 = f8,f0 - nop.b 0 -};; -{ .mfb - cmp.ge.unc p10,p11 = TRUNC_GR_exponent, TRUNC_GR_bigexp - nop.f 0 - nop.b 0 -};; -// If a unnorm, check to see if value is already a big int. +(p6) fmerge.s f8 = fNormX, f0 // If |x| < 1, result sgn(x)*0 + nop.i 0 +} { .mfb - nop.m 0 -(p11) fcvt.xf f8 = f12 - nop.b 0 + nop.m 0 +(p7) fma.d.s0 f8 = fNormX, f1, f0 // If |x| >= 2^52, result x + br.ret.sptk b0 // Exit main path } -{ .mfi - nop.m 0 -(p10) fma.d.s1 f8 = f11,f1,f0 - nop.i 0 -};; +;; + + +TRUNC_UNORM: +// Here if x unorm { .mfb - nop.m 0 -(p9) fmerge.ns f8 = f1,f8 - br.ret.sptk b0 -};; -// If so return it. Otherwise, return (fcvt.xf(fcvt.fx.trunc(x))) -// Make sure the result is negative if it should be - that is -// negative(denormal) -> -0. 
-.endp trunc -ASM_SIZE_DIRECTIVE(trunc) + getf.exp rSignexp = fNormX // Get signexp, recompute if unorm + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + br.cond.sptk TRUNC_COMMON // Return to main path +} +;; + +GLOBAL_LIBM_END(trunc) diff --git a/sysdeps/ia64/fpu/s_truncf.S b/sysdeps/ia64/fpu/s_truncf.S index 0ac4181209..ff40bc7101 100644 --- a/sysdeps/ia64/fpu/s_truncf.S +++ b/sysdeps/ia64/fpu/s_truncf.S @@ -1,11 +1,10 @@ .file "truncf.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 7/7/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Shane Story, and Ping Tak Peter Tang of the -// Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -21,33 +20,28 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
// -.align 32 -.global truncf# - -.section .text -.proc truncf# -.align 32 - // History //============================================================== -// 7/7/00: Created +// 07/07/00 Created +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/20/03 Improved performance and reduced code size //============================================================== // API @@ -55,25 +49,28 @@ // float truncf(float x) //============================================================== -#include "libm_support.h" +// general input registers: +// r14 - r18 -// general input registers: -TRUNC_GR_FFFF = r14 -TRUNC_GR_signexp = r15 -TRUNC_GR_exponent = r16 -TRUNC_GR_expmask = r17 -TRUNC_GR_bigexp = r18 +rExpBias = r14 +rSignexp = r15 +rExp = r16 +rExpMask = r17 +rBigexp = r18 // floating-point registers: -// f8, f9, f11, f12 +// f8 - f10 + +fXtruncInt = f9 +fNormX = f10 -// predicate registers used: -// p6, p7, p8, p9, p10, p11 +// predicate registers used: +// p6, p7 // Overview of operation //============================================================== // float truncf(float x) -// Return an integer value (represented as a float) less than or +// Return an integer value (represented as a float) less than or // equal to x in magnitude. // This is x rounded toward zero to an integral value. //============================================================== @@ -97,105 +94,73 @@ TRUNC_GR_bigexp = r18 // If we multiply by 2^23, we no longer have a fractional part // So input is an integer value already. 
-truncf: +.section .text +GLOBAL_LIBM_ENTRY(truncf) { .mfi - getf.exp TRUNC_GR_signexp = f8 - fcvt.fx.trunc.s1 f9 = f8 - addl TRUNC_GR_bigexp = 0x10016, r0 + getf.exp rSignexp = f8 // Get signexp, recompute if unorm + fcvt.fx.trunc.s1 fXtruncInt = f8 // Convert to int in significand + addl rBigexp = 0x10016, r0 // Set exponent at which is integer } { .mfi - mov TRUNC_GR_FFFF = 0x0FFFF - fnorm.s f11 = f8 - mov TRUNC_GR_expmask = 0x1FFFF -};; -// get the exponent of x -// convert x to integer in signficand of f9 -// Normalize x - this will raise invalid on SNaNs, the -// denormal operand flag - and possibly a spurious U flag -// get exponent only mask (will exclude sign bit) + mov rExpBias = 0x0FFFF // Form exponent bias + fnorm.s1 fNormX = f8 // Normalize input + mov rExpMask = 0x1FFFF // Form exponent mask +} +;; { .mfi nop.m 0 - fclass.m p7,p8 = f8, 0x0b + fclass.m p7,p0 = f8, 0x0b // Test x unorm nop.i 0 } -{ .mfi - nop.m 0 - fcmp.eq.unc.s1 p9,p0 = f8,f0 - nop.i 0 -};; -// fclass to set p7 if unnorm -{ .mmi - and TRUNC_GR_exponent = TRUNC_GR_signexp, TRUNC_GR_expmask ;; -(p8) cmp.ge.unc p10,p11 = TRUNC_GR_exponent, TRUNC_GR_bigexp -(p8) cmp.ne.unc p6,p0 = TRUNC_GR_exponent, TRUNC_GR_signexp -};; -// Get the exponent of x -// Test if exponent such that result already an integer -// Test if x < 0 -{ .mmi -(p9) cmp.eq.andcm p10,p11 = r0, r0 -(p6) cmp.lt.unc p6,p0 = TRUNC_GR_exponent, TRUNC_GR_FFFF - nop.i 0 -};; -// If -1 < x < 0, set p6, turn off p10 and p11, and set result to -0.0 -{ .mfb -(p6) cmp.eq.andcm p10,p11 = r0, r0 -(p6) fmerge.s f8 = f8, f0 - nop.b 0 -};; -// If not a unnorm, set p10 if x already is a big int, nan, or inf? -// If not a unnorm, set p10 if x already is a big int, nan, or inf? 
-.pred.rel "mutex",p10,p11 +;; + { .mfb nop.m 0 -(p11) fcvt.xf f8 = f9 - nop.b 0 + fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf +(p7) br.cond.spnt TRUNC_UNORM // Branch if x unorm } +;; + +TRUNC_COMMON: +// Return here from TRUNC_UNORM { .mfb + and rExp = rSignexp, rExpMask // Get biased exponent +(p6) fma.s.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf +(p6) br.ret.spnt b0 // Exit if x natval, nan, inf +} +;; + +{ .mfi + cmp.lt p6,p0 = rExp, rExpBias // Is |x| < 1? + fcvt.xf f8 = fXtruncInt // Result, assume 1 <= |x| < 2^23 + cmp.ge p7,p0 = rExp, rBigexp // Is |x| >= 2^23? +} +;; + +// We must correct result if |x| < 1, or |x| >= 2^23 +.pred.rel "mutex",p6,p7 +{ .mfi nop.m 0 -(p10) fma.s.s1 f8 = f11,f1,f0 -(p8) br.ret.sptk b0 -};; -// If not a unnorm and not an big int, nan,or +/-inf convert signficand -// back to f8. -// If not a unorm and a big int, nan, or +/-inf, return fnorm'd x -// If not a unorm, Return -// If unnorm, get the exponent again - perhaps it wasn't a denorm. -{ .mfb -(p7) getf.exp TRUNC_GR_signexp = f11 -(p7) fcvt.fx.trunc.s1 f12 = f11 - nop.b 0 -};; -{ .mfb - and TRUNC_GR_exponent = TRUNC_GR_signexp, TRUNC_GR_expmask - fcmp.lt.unc.s1 p9,p0 = f8,f0 - nop.b 0 -};; -{ .mfb - cmp.ge.unc p10,p11 = TRUNC_GR_exponent, TRUNC_GR_bigexp - nop.f 0 - nop.b 0 -};; -// If a unnorm, check to see if value is already a big int. +(p6) fmerge.s f8 = fNormX, f0 // If |x| < 1, result sgn(x)*0 + nop.i 0 +} { .mfb - nop.m 0 -(p11) fcvt.xf f8 = f12 - nop.b 0 + nop.m 0 +(p7) fma.s.s0 f8 = fNormX, f1, f0 // If |x| >= 2^23, result x + br.ret.sptk b0 // Exit main path } -{ .mfi - nop.m 0 -(p10) fma.s.s1 f8 = f11,f1,f0 - nop.i 0 -};; +;; + + +TRUNC_UNORM: +// Here if x unorm { .mfb - nop.m 0 -(p9) fmerge.ns f8 = f1,f8 - br.ret.sptk b0 -};; -// If so return it. Otherwise, return (fcvt.xf(fcvt.fx.trunc(x))) -// Make sure the result is negative if it should be - that is -// negative(denormal) -> -0. 
-.endp truncf -ASM_SIZE_DIRECTIVE(truncf) + getf.exp rSignexp = fNormX // Get signexp, recompute if unorm + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + br.cond.sptk TRUNC_COMMON // Return to main path +} +;; + +GLOBAL_LIBM_END(truncf) diff --git a/sysdeps/ia64/fpu/s_truncl.S b/sysdeps/ia64/fpu/s_truncl.S index 91bf96ce90..1afa19ba2b 100644 --- a/sysdeps/ia64/fpu/s_truncl.S +++ b/sysdeps/ia64/fpu/s_truncl.S @@ -1,11 +1,10 @@ .file "truncl.s" -// Copyright (C) 2000, 2001, Intel Corporation + +// Copyright (c) 2000 - 2003, Intel Corporation // All rights reserved. -// -// Contributed 7/7/2000 by John Harrison, Cristina Iordache, Ted Kubaska, -// Bob Norin, Shane Story, and Ping Tak Peter Tang of the -// Computational Software Lab, Intel Corporation. +// +// Contributed 2000 by the Intel Numerics Group, Intel Corporation // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -21,59 +20,57 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// // Intel Corporation is the author of this code, and requests that all -// problem reports or change requests be submitted to it directly at -// http://developer.intel.com/opensource. +// problem reports or change requests be submitted to it directly at +// http://www.intel.com/software/products/opensource/libraries/num.htm. 
// -.align 32 -.global truncl# - -.section .text -.proc truncl# -.align 32 - // History //============================================================== -// 7/7/00: Created +// 07/07/00 Created +// 05/20/02 Cleaned up namespace and sf0 syntax +// 01/20/03 Improved performance and reduced code size //============================================================== // API //============================================================== -// long double truncl(float x) +// long double truncl(long double x) //============================================================== -#include "libm_support.h" +// general input registers: +// r14 - r18 -// general input registers: -TRUNC_GR_FFFF = r14 -TRUNC_GR_signexp = r15 -TRUNC_GR_exponent = r16 -TRUNC_GR_expmask = r17 -TRUNC_GR_bigexp = r18 +rExpBias = r14 +rSignexp = r15 +rExp = r16 +rExpMask = r17 +rBigexp = r18 // floating-point registers: -// f8, f9, f11, f12 +// f8 - f10 -// predicate registers used: -// p6, p7, p8, p9, p10, p11 +fXtruncInt = f9 +fNormX = f10 + +// predicate registers used: +// p6, p7 // Overview of operation //============================================================== // long double truncl(long double x) -// Return an integer value (represented as a long double) less than or +// Return an integer value (represented as a long double) less than or // equal to x in magnitude. // This is x rounded toward zero to an integral value. //============================================================== @@ -97,105 +94,73 @@ TRUNC_GR_bigexp = r18 // If we multiply by 2^23, we no longer have a fractional part // So input is an integer value already. 
-truncl: +.section .text +GLOBAL_LIBM_ENTRY(truncl) { .mfi - getf.exp TRUNC_GR_signexp = f8 - fcvt.fx.trunc.s1 f9 = f8 - addl TRUNC_GR_bigexp = 0x1003e, r0 + getf.exp rSignexp = f8 // Get signexp, recompute if unorm + fcvt.fx.trunc.s1 fXtruncInt = f8 // Convert to int in significand + addl rBigexp = 0x1003e, r0 // Set exponent at which is integer } { .mfi - mov TRUNC_GR_FFFF = 0x0FFFF - fnorm f11 = f8 - mov TRUNC_GR_expmask = 0x1FFFF -};; -// get the exponent of x -// convert x to integer in signficand of f9 -// Normalize x - this will raise invalid on SNaNs, the -// denormal operand flag - and possibly a spurious U flag -// get exponent only mask (will exclude sign bit) + mov rExpBias = 0x0FFFF // Form exponent bias + fnorm.s1 fNormX = f8 // Normalize input + mov rExpMask = 0x1FFFF // Form exponent mask +} +;; { .mfi nop.m 0 - fclass.m p7,p8 = f8, 0x0b + fclass.m p7,p0 = f8, 0x0b // Test x unorm nop.i 0 } -{ .mfi - nop.m 0 - fcmp.eq.unc.s1 p9,p0 = f8,f0 - nop.i 0 -};; -// fclass to set p7 if unnorm -{ .mmi - and TRUNC_GR_exponent = TRUNC_GR_signexp, TRUNC_GR_expmask ;; -(p8) cmp.ge.unc p10,p11 = TRUNC_GR_exponent, TRUNC_GR_bigexp -(p8) cmp.ne.unc p6,p0 = TRUNC_GR_exponent, TRUNC_GR_signexp -};; -// Get the exponent of x -// Test if exponent such that result already an integer -// Test if x < 0 -{ .mmi -(p9) cmp.eq.andcm p10,p11 = r0, r0 -(p6) cmp.lt.unc p6,p0 = TRUNC_GR_exponent, TRUNC_GR_FFFF - nop.i 0 -};; -// If -1 < x < 0, set p6, turn off p10 and p11, and set result to -0.0 -{ .mfb -(p6) cmp.eq.andcm p10,p11 = r0, r0 -(p6) fmerge.s f8 = f8, f0 - nop.b 0 -};; -// If not a unnorm, set p10 if x already is a big int, nan, or inf? -// If not a unnorm, set p10 if x already is a big int, nan, or inf? 
-.pred.rel "mutex",p10,p11 +;; + { .mfb nop.m 0 -(p11) fcvt.xf f8 = f9 - nop.b 0 + fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf +(p7) br.cond.spnt TRUNC_UNORM // Branch if x unorm } +;; + +TRUNC_COMMON: +// Return here from TRUNC_UNORM { .mfb + and rExp = rSignexp, rExpMask // Get biased exponent +(p6) fma.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf +(p6) br.ret.spnt b0 // Exit if x natval, nan, inf +} +;; + +{ .mfi + cmp.lt p6,p0 = rExp, rExpBias // Is |x| < 1? + fcvt.xf f8 = fXtruncInt // Result, assume 1 <= |x| < 2^63 + cmp.ge p7,p0 = rExp, rBigexp // Is |x| >= 2^63? +} +;; + +// We must correct result if |x| < 1, or |x| >= 2^63 +.pred.rel "mutex",p6,p7 +{ .mfi nop.m 0 -(p10) fma.s1 f8 = f11,f1,f0 -(p8) br.ret.sptk b0 -};; -// If not a unnorm and not an big int, nan,or +/-inf convert signficand -// back to f8. -// If not a unorm and a big int, nan, or +/-inf, return fnorm'd x -// If not a unorm, Return -// If unnorm, get the exponent again - perhaps it wasn't a denorm. -{ .mfb -(p7) getf.exp TRUNC_GR_signexp = f11 -(p7) fcvt.fx.trunc.s1 f12 = f11 - nop.b 0 -};; -{ .mfb - and TRUNC_GR_exponent = TRUNC_GR_signexp, TRUNC_GR_expmask - fcmp.lt.unc.s1 p9,p0 = f8,f0 - nop.b 0 -};; -{ .mfb - cmp.ge.unc p10,p11 = TRUNC_GR_exponent, TRUNC_GR_bigexp - nop.f 0 - nop.b 0 -};; -// If a unnorm, check to see if value is already a big int. +(p6) fmerge.s f8 = fNormX, f0 // If |x| < 1, result sgn(x)*0 + nop.i 0 +} { .mfb - nop.m 0 -(p11) fcvt.xf f8 = f12 - nop.b 0 + nop.m 0 +(p7) fma.s0 f8 = fNormX, f1, f0 // If |x| >= 2^63, result x + br.ret.sptk b0 // Exit main path } -{ .mfi - nop.m 0 -(p10) fma.s1 f8 = f11,f1,f0 - nop.i 0 -};; +;; + + +TRUNC_UNORM: +// Here if x unorm { .mfb - nop.m 0 -(p9) fmerge.ns f8 = f1,f8 - br.ret.sptk b0 -};; -// If so return it. Otherwise, return (fcvt.xf(fcvt.fx.trunc(x))) -// Make sure the result is negative if it should be - that is -// negative(denormal) -> -0. 
-.endp truncl -ASM_SIZE_DIRECTIVE(truncl) + getf.exp rSignexp = fNormX // Get signexp, recompute if unorm + fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag + br.cond.sptk TRUNC_COMMON // Return to main path +} +;; + +GLOBAL_LIBM_END(truncl) |